v3.9.5.4

v3.9.5.3
v3.9.5.2
2025-09-17 23:44:27 +00:00 · 2019-07-15 17:00:26 -04:00 · 2019-07-12 10:42:38 -04:00 · 2019-07-04 12:12:11 -04:00 · 2019-07-02 15:10:38 -04:00 · 2019-06-26 14:16:01 -04:00
241 changed files with 8423 additions and 7782 deletions
--- a/3
+++ b/3
@@ -42,9 +42,6 @@ openssl 1.1.0e or higher. Add one of the following, depending on the
 compiler version, to CFLAGS:
 "-march=native" or "-march=znver1" or "-msha".

-Due to poor AVX2 performance on Ryzen users should add -DRYZEN_ to CFLAGS
-to override multiway AVX2 on algos with sha256, and use SHA instead.
-
 Additional instructions for static compilalation can be found here:
 https://lxadm.com/Static_compilation_of_cpuminer
 Static builds should only considered in a homogeneous HW and SW environment.
--- a/Makefile.am
+++ b/Makefile.am
@@ -131,22 +131,24 @@ cpuminer_SOURCES = \
  algo/lyra2/lyra2h-4way.c \
  algo/lyra2/allium-4way.c \
  algo/lyra2/allium.c \
+  algo/lyra2/phi2-4way.c \
  algo/lyra2/phi2.c \
  algo/m7m.c \
-  algo/neoscrypt/neoscrypt.c \
  algo/nist5/nist5-gate.c \
  algo/nist5/nist5-4way.c \
  algo/nist5/nist5.c \
  algo/nist5/zr5.c \
  algo/panama/sph_panama.c \
  algo/radiogatun/sph_radiogatun.c \
-  algo/pluck.c \
  algo/quark/quark-gate.c \
  algo/quark/quark.c \
  algo/quark/quark-4way.c \
  algo/quark/anime-gate.c \
  algo/quark/anime.c \
  algo/quark/anime-4way.c \
+  algo/quark/hmq1725-gate.c \
+  algo/quark/hmq1725-4way.c \
+  algo/quark/hmq1725.c \
  algo/qubit/qubit-gate.c \
  algo/qubit/qubit.c \
  algo/qubit/qubit-2way.c \
@@ -158,7 +160,9 @@ cpuminer_SOURCES = \
  algo/ripemd/lbry-gate.c \
  algo/ripemd/lbry.c \
  algo/ripemd/lbry-4way.c \
-  algo/scrypt.c \
+  algo/scrypt/scrypt.c \
+  algo/scrypt/neoscrypt.c \
+  algo/scrypt/pluck.c \
  algo/scryptjane/scrypt-jane.c \
  algo/sha/sph_sha2.c \
  algo/sha/sph_sha2big.c \
@@ -194,7 +198,6 @@ cpuminer_SOURCES = \
  algo/whirlpool/sph_whirlpool.c \
  algo/whirlpool/whirlpool-hash-4way.c \
  algo/whirlpool/whirlpool-gate.c \
-  algo/whirlpool/whirlpool-4way.c \
  algo/whirlpool/whirlpool.c \
  algo/whirlpool/whirlpoolx.c \
  algo/x11/x11-gate.c \
@@ -257,7 +260,6 @@ cpuminer_SOURCES = \
  algo/x17/xevan-gate.c \
  algo/x17/xevan.c \
  algo/x17/xevan-4way.c \
-  algo/x17/hmq1725.c \
  algo/x17/sonoa-gate.c \
  algo/x17/sonoa-4way.c \
  algo/x17/sonoa.c \
--- a/README.md
+++ b/README.md
@@ -59,9 +59,6 @@ Supported Algorithms
                          blake2s       Blake-2 S
                          bmw           BMW 256
                          c11           Chaincoin
-                          cryptolight   Cryptonight-light
-                          cryptonight  
-                          cryptonightv7 Monero (XMR)
                          decred
                          deep          Deepcoin (DCN)
                          dmd-gr        Diamond-Groestl
@@ -78,9 +75,9 @@ Supported Algorithms
                          luffa         Luffa
                          lyra2h        Hppcoin
                          lyra2re       lyra2
-                          lyra2rev2     lyra2v2, Vertcoin
+                          lyra2rev2     lyra2v2
                          lyra2rev3     lyrav2v3, Vertcoin
-                          lyra2z        Zcoin (XZC)
+                          lyra2z        
                          lyra2z330     Lyra2 330 rows, Zoin (ZOI)
                          m7m           Magi (XMG)
                          myr-gr        Myriad-Groestl
@@ -97,6 +94,7 @@ Supported Algorithms
                          scrypt:N      scrypt(N, 1, 1)
                          scryptjane:nf
                          sha256d       Double SHA-256
+                          sha256q       Quad SHA-256, Pyrite (PYE)
                          sha256t       Triple SHA-256, Onecoin (OC)
                          shavite3      Shavite3
                          skein         Skein+Sha (Skeincoin)
--- a/README.txt
+++ b/README.txt
@@ -29,7 +29,7 @@ cpuminer-sse2.exe      "-msse2"                  Core2, Nehalem
 cpuminer-aes-sse42.exe "-march=westmere"         Westmere
 cpuminer-avx.exe       "-march=corei7-avx"       Sandy-Ivybridge
 cpuminer-avx2.exe      "-march=core-avx2"        Haswell, Sky-Kaby-Coffeelake
-cpuminer-zen           "-march=znver1 -DRYZEN_"  Ryzen
+cpuminer-zen           "-march=znver1"           AMD Ryzen, Threadripper

 If you like this software feel free to donate:

--- a/68
+++ b/68
@@ -38,6 +38,74 @@ supported.
 Change Log
 ----------

+v3.9.5.4
+
+Fixed sha256q AVX2 poor performance.
+Fixed skein2 buffer overflow and restored bswap-interleave optimization.
+More restructuring.
+
+v3.9.5.3
+
+Fix crash mining hodl with aes-sse42.
+More restructuring and share report tweaks.
+
+v3.9.5.2
+
+Reverted the bswap-interleave optimization because it caused crashes on Windows.
+
+v3.9.5.1
+
+Fixed skein2 crash on Windows.
+
+Fixed CPU temperature reading on Ubuntu 19.04.
+
+Realigned log message colours, blue is used to report normal activity and
+yellow is only used to report abnormal activity.
+
+Changed stats colours, yellow now means below average, white is average
+range. Tweaked colour thresholds.
+
+Changed colour of stratum difficulty change messages to blue to match other
+normal protocol messages. Blue messages (block, stratum, submit) will no
+longer be displayed when using -q option.
+
+Added job id to new block, share submit, and share result messages and added
+a new message when a new job is received for an existing block. This allows
+better troubleshooting of invalid job id rejects seen at zergpool.
+
+Some more restructuring.
+
+v3.9.5
+
+New share reporting information includes calculation of equivalent hashrate
+based on share difficulty, network latency, and a 5 minute summary.
+Per-thread hash rate reports are disabled by default.
+New command line option --hash-meter added to enable per-thread hash rates.
+
+
+v3.9.4
+
+Faster AVX2 for lyra2v3, quark, anime.
+Fixed skein AVX2 regression (invalid shares since v3.9.0) and made it faster.
+Faster skein2 with 4way AVX2 enabled.
+Automatic SHA override on Ryzen CPUs, no need for -DRYZEN_ compile flag.
+Ongoing restructuring.
+
+v3.9.3.1
+
+Skipped v3.9.3 due to misidentification of v3.9.2.5 as v3.9.3.
+Fixed x16r algo 25% invalid share reject rate. The bug may have also
+affected other algos.
+
+v3.9.2.5
+
+Fixed 2 regressions: hodl AES detection, x16r invalid shares with AVX2.
+More restructuring.
+
+v3.9.2.4
+
+Yet another affinity fix. Hopefully the last one.
+
 v3.9.2.3

 Another cpu-affinity fix.
--- a/algo-gate-api.c
+++ b/algo-gate-api.c
@@ -71,7 +71,6 @@ bool return_false () { return false; }
 void *return_null () { return NULL;  }
 void call_error   () { printf("ERR: Uninitialized function pointer\n"); }

-
 void algo_not_tested()
 {
  applog( LOG_WARNING,"Algo %s has not been tested live. It may not work",
@@ -149,111 +148,110 @@ void init_algo_gate( algo_gate_t* gate )
 // called by each thread that uses the gate
 bool register_algo_gate( int algo, algo_gate_t *gate )
 {
-   if ( NULL == gate )
-   {
-     applog(LOG_ERR,"FAIL: algo_gate registration failed, NULL gate\n");
-     return false;
-   }
+  if ( NULL == gate )
+  {
+    applog(LOG_ERR,"FAIL: algo_gate registration failed, NULL gate\n");
+    return false;
+  }

-   init_algo_gate( gate );
+  init_algo_gate( gate );

-   switch (algo)
-   {
-     case ALGO_ALLIUM:       register_allium_algo       ( gate ); break;
-     case ALGO_ANIME:        register_anime_algo        ( gate ); break;
-     case ALGO_ARGON2:       register_argon2_algo       ( gate ); break;
-     case ALGO_ARGON2D250:   register_argon2d_crds_algo ( gate ); break;
-     case ALGO_ARGON2D500:   register_argon2d_dyn_algo  ( gate ); break;
-     case ALGO_ARGON2D4096:  register_argon2d4096_algo  ( gate ); break;
-     case ALGO_AXIOM:        register_axiom_algo        ( gate ); break;
-     case ALGO_BASTION:      register_bastion_algo      ( gate ); break;
-     case ALGO_BLAKE:        register_blake_algo        ( gate ); break;
-     case ALGO_BLAKECOIN:    register_blakecoin_algo    ( gate ); break;
-//     case ALGO_BLAKE2B:      register_blake2b_algo    ( gate ); break;
-     case ALGO_BLAKE2S:      register_blake2s_algo      ( gate ); break;
-     case ALGO_C11:          register_c11_algo          ( gate ); break;
-     case ALGO_CRYPTOLIGHT:  register_cryptolight_algo  ( gate ); break;
-     case ALGO_CRYPTONIGHT:  register_cryptonight_algo  ( gate ); break;
-     case ALGO_CRYPTONIGHTV7:register_cryptonightv7_algo( gate ); break;
-     case ALGO_DECRED:       register_decred_algo       ( gate ); break;
-     case ALGO_DEEP:         register_deep_algo         ( gate ); break;
-     case ALGO_DMD_GR:       register_dmd_gr_algo       ( gate ); break;
-     case ALGO_DROP:         register_drop_algo         ( gate ); break;
-     case ALGO_FRESH:        register_fresh_algo        ( gate ); break;
-     case ALGO_GROESTL:      register_groestl_algo      ( gate ); break;
-     case ALGO_HEAVY:        register_heavy_algo        ( gate ); break;
-     case ALGO_HMQ1725:      register_hmq1725_algo      ( gate ); break;
-     case ALGO_HODL:         register_hodl_algo         ( gate ); break;
-     case ALGO_JHA:          register_jha_algo          ( gate ); break;
-     case ALGO_KECCAK:       register_keccak_algo       ( gate ); break;
-     case ALGO_KECCAKC:      register_keccakc_algo      ( gate ); break;
-     case ALGO_LBRY:         register_lbry_algo         ( gate ); break;
-     case ALGO_LUFFA:        register_luffa_algo        ( gate ); break;
-     case ALGO_LYRA2H:       register_lyra2h_algo       ( gate ); break;
-     case ALGO_LYRA2RE:      register_lyra2re_algo      ( gate ); break;
-     case ALGO_LYRA2REV2:    register_lyra2rev2_algo    ( gate ); break;
-     case ALGO_LYRA2REV3:    register_lyra2rev3_algo    ( gate ); break;
-     case ALGO_LYRA2Z:       register_lyra2z_algo       ( gate ); break;
-     case ALGO_LYRA2Z330:    register_lyra2z330_algo    ( gate ); break;
-     case ALGO_M7M:          register_m7m_algo          ( gate ); break;
-     case ALGO_MYR_GR:       register_myriad_algo       ( gate ); break;
-     case ALGO_NEOSCRYPT:    register_neoscrypt_algo    ( gate ); break;
-     case ALGO_NIST5:        register_nist5_algo        ( gate ); break;
-     case ALGO_PENTABLAKE:   register_pentablake_algo   ( gate ); break;
-     case ALGO_PHI1612:      register_phi1612_algo      ( gate ); break;
-     case ALGO_PHI2:         register_phi2_algo         ( gate ); break;
-     case ALGO_PLUCK:        register_pluck_algo        ( gate ); break;
-     case ALGO_POLYTIMOS:    register_polytimos_algo    ( gate ); break;
-     case ALGO_QUARK:        register_quark_algo        ( gate ); break;
-     case ALGO_QUBIT:        register_qubit_algo        ( gate ); break;
-     case ALGO_SCRYPT:       register_scrypt_algo       ( gate ); break;
-     case ALGO_SCRYPTJANE:   register_scryptjane_algo   ( gate ); break;
-     case ALGO_SHA256D:      register_sha256d_algo      ( gate ); break;
-     case ALGO_SHA256T:      register_sha256t_algo      ( gate ); break;
-     case ALGO_SHA256Q:      register_sha256q_algo      ( gate ); break;
-     case ALGO_SHAVITE3:     register_shavite_algo      ( gate ); break;
-     case ALGO_SKEIN:        register_skein_algo        ( gate ); break;
-     case ALGO_SKEIN2:       register_skein2_algo       ( gate ); break;
-     case ALGO_SKUNK:        register_skunk_algo        ( gate ); break;
-     case ALGO_SONOA:        register_sonoa_algo        ( gate ); break;
-     case ALGO_TIMETRAVEL:   register_timetravel_algo   ( gate ); break;
-     case ALGO_TIMETRAVEL10: register_timetravel10_algo ( gate ); break;
-     case ALGO_TRIBUS:       register_tribus_algo       ( gate ); break;
-     case ALGO_VANILLA:      register_vanilla_algo      ( gate ); break;
-     case ALGO_VELTOR:       register_veltor_algo       ( gate ); break;
-     case ALGO_WHIRLPOOL:    register_whirlpool_algo    ( gate ); break;
-     case ALGO_WHIRLPOOLX:   register_whirlpoolx_algo   ( gate ); break;
-     case ALGO_X11:          register_x11_algo          ( gate ); break;
-     case ALGO_X11EVO:       register_x11evo_algo       ( gate ); break;
-     case ALGO_X11GOST:      register_x11gost_algo      ( gate ); break;
-     case ALGO_X12:          register_x12_algo          ( gate ); break;
-     case ALGO_X13:          register_x13_algo          ( gate ); break;
-     case ALGO_X13SM3:       register_x13sm3_algo       ( gate ); break;
-     case ALGO_X14:          register_x14_algo          ( gate ); break;
-     case ALGO_X15:          register_x15_algo          ( gate ); break;
-     case ALGO_X16R:         register_x16r_algo         ( gate ); break;
-     case ALGO_X16S:         register_x16s_algo         ( gate ); break;
-     case ALGO_X17:          register_x17_algo          ( gate ); break;
-     case ALGO_XEVAN:        register_xevan_algo        ( gate ); break;
+  switch (algo)
+  {
+    case ALGO_ALLIUM:        register_allium_algo        ( gate ); break;
+    case ALGO_ANIME:         register_anime_algo         ( gate ); break;
+    case ALGO_ARGON2:        register_argon2_algo        ( gate ); break;
+    case ALGO_ARGON2D250:    register_argon2d_crds_algo  ( gate ); break;
+    case ALGO_ARGON2D500:    register_argon2d_dyn_algo   ( gate ); break;
+    case ALGO_ARGON2D4096:   register_argon2d4096_algo   ( gate ); break;
+    case ALGO_AXIOM:         register_axiom_algo         ( gate ); break;
+    case ALGO_BASTION:       register_bastion_algo       ( gate ); break;
+    case ALGO_BLAKE:         register_blake_algo         ( gate ); break;
+    case ALGO_BLAKECOIN:     register_blakecoin_algo     ( gate ); break;
+//    case ALGO_BLAKE2B:      register_blake2b_algo     ( gate ); break;
+    case ALGO_BLAKE2S:       register_blake2s_algo       ( gate ); break;
+    case ALGO_C11:           register_c11_algo           ( gate ); break;
+    case ALGO_CRYPTOLIGHT:   register_cryptolight_algo   ( gate ); break;
+    case ALGO_CRYPTONIGHT:   register_cryptonight_algo   ( gate ); break;
+    case ALGO_CRYPTONIGHTV7: register_cryptonightv7_algo ( gate ); break;
+    case ALGO_DECRED:        register_decred_algo        ( gate ); break;
+    case ALGO_DEEP:          register_deep_algo          ( gate ); break;
+    case ALGO_DMD_GR:        register_dmd_gr_algo        ( gate ); break;
+    case ALGO_DROP:          register_drop_algo          ( gate ); break;
+    case ALGO_FRESH:         register_fresh_algo         ( gate ); break;
+    case ALGO_GROESTL:       register_groestl_algo       ( gate ); break;
+    case ALGO_HEAVY:         register_heavy_algo         ( gate ); break;
+    case ALGO_HMQ1725:       register_hmq1725_algo       ( gate ); break;
+    case ALGO_HODL:          register_hodl_algo          ( gate ); break;
+    case ALGO_JHA:           register_jha_algo           ( gate ); break;
+    case ALGO_KECCAK:        register_keccak_algo        ( gate ); break;
+    case ALGO_KECCAKC:       register_keccakc_algo       ( gate ); break;
+    case ALGO_LBRY:          register_lbry_algo          ( gate ); break;
+    case ALGO_LUFFA:         register_luffa_algo         ( gate ); break;
+    case ALGO_LYRA2H:        register_lyra2h_algo        ( gate ); break;
+    case ALGO_LYRA2RE:       register_lyra2re_algo       ( gate ); break;
+    case ALGO_LYRA2REV2:     register_lyra2rev2_algo     ( gate ); break;
+    case ALGO_LYRA2REV3:     register_lyra2rev3_algo     ( gate ); break;
+    case ALGO_LYRA2Z:        register_lyra2z_algo        ( gate ); break;
+    case ALGO_LYRA2Z330:     register_lyra2z330_algo     ( gate ); break;
+    case ALGO_M7M:           register_m7m_algo           ( gate ); break;
+    case ALGO_MYR_GR:        register_myriad_algo        ( gate ); break;
+    case ALGO_NEOSCRYPT:     register_neoscrypt_algo     ( gate ); break;
+    case ALGO_NIST5:         register_nist5_algo         ( gate ); break;
+    case ALGO_PENTABLAKE:    register_pentablake_algo    ( gate ); break;
+    case ALGO_PHI1612:       register_phi1612_algo       ( gate ); break;
+    case ALGO_PHI2:          register_phi2_algo          ( gate ); break;
+    case ALGO_PLUCK:         register_pluck_algo         ( gate ); break;
+    case ALGO_POLYTIMOS:     register_polytimos_algo     ( gate ); break;
+    case ALGO_QUARK:         register_quark_algo         ( gate ); break;
+    case ALGO_QUBIT:         register_qubit_algo         ( gate ); break;
+    case ALGO_SCRYPT:        register_scrypt_algo        ( gate ); break;
+    case ALGO_SCRYPTJANE:    register_scryptjane_algo    ( gate ); break;
+    case ALGO_SHA256D:       register_sha256d_algo       ( gate ); break;
+    case ALGO_SHA256Q:       register_sha256q_algo       ( gate ); break;
+    case ALGO_SHA256T:       register_sha256t_algo       ( gate ); break;
+    case ALGO_SHAVITE3:      register_shavite_algo       ( gate ); break;
+    case ALGO_SKEIN:         register_skein_algo         ( gate ); break;
+    case ALGO_SKEIN2:        register_skein2_algo        ( gate ); break;
+    case ALGO_SKUNK:         register_skunk_algo         ( gate ); break;
+    case ALGO_SONOA:         register_sonoa_algo         ( gate ); break;
+    case ALGO_TIMETRAVEL:    register_timetravel_algo    ( gate ); break;
+    case ALGO_TIMETRAVEL10:  register_timetravel10_algo  ( gate ); break;
+    case ALGO_TRIBUS:        register_tribus_algo        ( gate ); break;
+    case ALGO_VANILLA:       register_vanilla_algo       ( gate ); break;
+    case ALGO_VELTOR:        register_veltor_algo        ( gate ); break;
+    case ALGO_WHIRLPOOL:     register_whirlpool_algo     ( gate ); break;
+    case ALGO_WHIRLPOOLX:    register_whirlpoolx_algo    ( gate ); break;
+    case ALGO_X11:           register_x11_algo           ( gate ); break;
+    case ALGO_X11EVO:        register_x11evo_algo        ( gate ); break;
+    case ALGO_X11GOST:       register_x11gost_algo       ( gate ); break;
+    case ALGO_X12:           register_x12_algo           ( gate ); break;
+    case ALGO_X13:           register_x13_algo           ( gate ); break;
+    case ALGO_X13SM3:        register_x13sm3_algo        ( gate ); break;
+    case ALGO_X14:           register_x14_algo           ( gate ); break;
+    case ALGO_X15:           register_x15_algo           ( gate ); break;
+    case ALGO_X16R:          register_x16r_algo          ( gate ); break;
+    case ALGO_X16S:          register_x16s_algo          ( gate ); break;
+    case ALGO_X17:           register_x17_algo           ( gate ); break;
+    case ALGO_XEVAN:         register_xevan_algo         ( gate ); break;
 /*    case ALGO_YESCRYPT:     register_yescrypt_05_algo     ( gate ); break;
     case ALGO_YESCRYPTR8:   register_yescryptr8_05_algo   ( gate ); break;
     case ALGO_YESCRYPTR16:  register_yescryptr16_05_algo  ( gate ); break;
     case ALGO_YESCRYPTR32:  register_yescryptr32_05_algo  ( gate ); break;
 */
-     case ALGO_YESCRYPT:     register_yescrypt_algo     ( gate ); break;
-     case ALGO_YESCRYPTR8:   register_yescryptr8_algo   ( gate ); break;
-     case ALGO_YESCRYPTR16:  register_yescryptr16_algo  ( gate ); break;
-     case ALGO_YESCRYPTR32:  register_yescryptr32_algo  ( gate ); break;
+    case ALGO_YESCRYPT:      register_yescrypt_algo      ( gate ); break;
+    case ALGO_YESCRYPTR8:    register_yescryptr8_algo    ( gate ); break;
+    case ALGO_YESCRYPTR16:   register_yescryptr16_algo   ( gate ); break;
+    case ALGO_YESCRYPTR32:   register_yescryptr32_algo   ( gate ); break;
+    case ALGO_YESPOWER:      register_yespower_algo      ( gate ); break;
+    case ALGO_YESPOWERR16:   register_yespowerr16_algo   ( gate ); break;
+    case ALGO_ZR5:           register_zr5_algo           ( gate ); break;
+   default:
+      applog(LOG_ERR,"FAIL: algo_gate registration failed, unknown algo %s.\n", algo_names[opt_algo] );
+      return false;
+  } // switch

-     case ALGO_YESPOWER:     register_yespower_algo     ( gate ); break;
-     case ALGO_YESPOWERR16:  register_yespowerr16_algo  ( gate ); break;
-     case ALGO_ZR5:          register_zr5_algo          ( gate ); break;
-    default:
-        applog(LOG_ERR,"FAIL: algo_gate registration failed, unknown algo %s.\n", algo_names[opt_algo] );
-        return false;
-   } // switch
-
-  // ensure required functions were defined.
+ // ensure required functions were defined.
  if (  gate->scanhash == (void*)&null_scanhash )
  {
    applog(LOG_ERR, "FAIL: Required algo_gate functions undefined\n");
@@ -364,14 +362,15 @@ void get_algo_alias( char** algo_or_alias )
 #undef PROPER

 bool submit_solution( struct work *work, void *hash,
-                      struct thr_info *thr, int lane )
+                      struct thr_info *thr )
 {
     work_set_target_ratio( work, hash );
     if ( submit_work( thr, work ) )
     {
-         applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
-                 accepted_share_count + rejected_share_count + 1,
-                 thr->id, lane );
+         if ( !opt_quiet )
+            applog( LOG_BLUE, "Share %d submitted by thread %d, job %s.",
+                    accepted_share_count + rejected_share_count + 1,
+                    thr->id, work->job_id );
         return true;
     }
     else
@@ -379,4 +378,23 @@ bool submit_solution( struct work *work, void *hash,
     return false;
 }

+bool submit_lane_solution( struct work *work, void *hash,
+                           struct thr_info *thr, int lane )
+{
+     work_set_target_ratio( work, hash );
+     if ( submit_work( thr, work ) )
+     {
+         if ( !opt_quiet )
+//            applog( LOG_BLUE, "Share %d submitted by thread %d, lane %d.",
+//                    accepted_share_count + rejected_share_count + 1,
+//                    thr->id, lane );
+            applog( LOG_BLUE, "Share %d submitted by thread %d, lane %d, job %s.",
+                    accepted_share_count + rejected_share_count + 1, thr->id,
+                    lane, work->job_id );
+         return true;
+     }
+     else
+          applog( LOG_WARNING, "Failed to submit share." );
+     return false;
+}

--- a/algo-gate-api.h
+++ b/algo-gate-api.h
@@ -2,8 +2,7 @@
 #include <stdbool.h>
 #include <stdint.h>
 #include "miner.h"
-#include "avxdefs.h"
-#include "interleave.h"
+#include "simd-utils.h"

 /////////////////////////////
 ////
@@ -117,7 +116,7 @@ typedef struct
 // Added a 5th arg for the thread_info structure to replace the int thr id
 // in the first arg. Both will co-exist during the trasition.
 //int ( *scanhash ) ( int, struct work*, uint32_t, uint64_t* );
-int ( *scanhash ) ( int, struct work*, uint32_t, uint64_t*, struct thr_info* );
+int ( *scanhash ) ( struct work*, uint32_t, uint64_t*, struct thr_info* );

 // optional unsafe, must be overwritten if algo uses function
 void ( *hash )     ( void*, const void*, uint32_t ) ;
@@ -154,7 +153,6 @@ int  ntime_index;
 int  nbits_index;
 int  nonce_index;            // use with caution, see warning below
 int  work_cmp_size;
-
 } algo_gate_t;

 extern algo_gate_t algo_gate;
@@ -195,9 +193,12 @@ void four_way_not_tested();
 // allways returns failure
 int null_scanhash();

-// The one and only, a callback for scanhash.
+// Allow algos to submit from scanhash loop.
 bool submit_solution( struct work *work, void *hash,
-                      struct thr_info *thr, int lane );
+                      struct thr_info *thr );
+bool submit_lane_solution( struct work *work, void *hash,
+                          struct thr_info *thr, int lane );
+
 
 bool submit_work( struct thr_info *thr, const struct work *work_in );

--- a/algo/argon2/argon2a/argon2a.c
+++ b/algo/argon2/argon2a/argon2a.c
@@ -42,12 +42,14 @@ void argon2hash(void *output, const void *input)
 		(unsigned char *)output);
 }

-int scanhash_argon2(int thr_id, struct work* work, uint32_t max_nonce, uint64_t *hashes_done)
+int scanhash_argon2( struct work* work, uint32_t max_nonce,
+                     uint64_t *hashes_done, struct thr_info *mythr )
 {
 	uint32_t _ALIGN(64) endiandata[20];
 	uint32_t _ALIGN(64) hash[8];
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
+   int thr_id = mythr->id;  // thr_id arg is deprecated

 	const uint32_t first_nonce = pdata[19];
 	const uint32_t Htarg = ptarget[7];
--- a/algo/argon2/argon2d/argon2d-gate.c
+++ b/algo/argon2/argon2d/argon2d-gate.c
@@ -33,13 +33,14 @@ void argon2d_crds_hash( void *output, const void *input )
 	argon2_ctx( &context, Argon2_d );
 }

-int scanhash_argon2d_crds( int thr_id, struct work *work, uint32_t max_nonce,
-                      uint64_t *hashes_done )
+int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t _ALIGN(64) endiandata[20];
        uint32_t _ALIGN(64) hash[8];
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;
+        int thr_id = mythr->id;  // thr_id arg is deprecated

        const uint32_t first_nonce = pdata[19];
        const uint32_t Htarg = ptarget[7];
@@ -103,13 +104,14 @@ void argon2d_dyn_hash( void *output, const void *input )
    argon2_ctx( &context, Argon2_d );
 }

-int scanhash_argon2d_dyn( int thr_id, struct work *work, uint32_t max_nonce,
-                      uint64_t *hashes_done )
+int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t _ALIGN(64) endiandata[20];
        uint32_t _ALIGN(64) hash[8];
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;
+        int thr_id = mythr->id;  // thr_id arg is deprecated

        const uint32_t first_nonce = pdata[19];
        const uint32_t Htarg = ptarget[7];
@@ -147,8 +149,8 @@ bool register_argon2d_dyn_algo( algo_gate_t* gate )

 // Unitus

-int scanhash_argon2d4096( int thr_id, struct work *work, uint32_t max_nonce,
-                           uint64_t *hashes_done)
+int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
+                           uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t _ALIGN(64) vhash[8];
   uint32_t _ALIGN(64) endiandata[20];
@@ -157,7 +159,7 @@ int scanhash_argon2d4096( int thr_id, struct work *work, uint32_t max_nonce,
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
-    
+   int thr_id = mythr->id;  // thr_id arg is deprecated
   uint32_t t_cost = 1; // 1 iteration
   uint32_t m_cost = 4096; // use 4MB
   uint32_t parallelism = 1; // 1 thread, 2 lanes
--- a/algo/argon2/argon2d/argon2d-gate.h
+++ b/algo/argon2/argon2d/argon2d-gate.h
@@ -9,23 +9,23 @@ bool register_argon2d_crds_algo( algo_gate_t* gate );

 void argon2d_crds_hash( void *state, const void *input );

-int scanhash_argon2d_crds( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done );
+int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done, struct thr_info *mythr );

 // Dynamic: version = 0x10, m_cost = 500.
 bool register_argon2d_dyn_algo( algo_gate_t* gate );

 void argon2d_dyn_hash( void *state, const void *input );

-int scanhash_argon2d_dyn( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done );
+int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done, struct thr_info *mythr );


 // Unitus: version = 0x13, m_cost = 4096.
 bool register_argon2d4096_algo( algo_gate_t* gate );

-int scanhash_argon2d4096( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done );
+int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done, struct thr_info *mythr );

 #endif

--- a/algo/argon2/argon2d/argon2d/core.c
+++ b/algo/argon2/argon2d/argon2d/core.c
@@ -112,7 +112,7 @@ int allocate_memory(const argon2_context *context, uint8_t **memory,
 void free_memory(const argon2_context *context, uint8_t *memory,
                 size_t num, size_t size) {
    size_t memory_size = num*size;
-    clear_internal_memory(memory, memory_size);
+//    clear_internal_memory(memory, memory_size);
    if (context->free_cbk) {
        (context->free_cbk)(memory, memory_size);
    } else {
@@ -137,7 +137,7 @@ void NOT_OPTIMIZED secure_wipe_memory(void *v, size_t n) {
 int FLAG_clear_internal_memory = 0;
 void clear_internal_memory(void *v, size_t n) {
  if (FLAG_clear_internal_memory && v) {
-    secure_wipe_memory(v, n);
+//    secure_wipe_memory(v, n);
  }
 }

@@ -559,7 +559,7 @@ void initial_hash(uint8_t *blockhash, argon2_context *context,
                       context->pwdlen);

        if (context->flags & ARGON2_FLAG_CLEAR_PASSWORD) {
-            secure_wipe_memory(context->pwd, context->pwdlen);
+//            secure_wipe_memory(context->pwd, context->pwdlen);
            context->pwdlen = 0;
        }
    }
@@ -580,7 +580,7 @@ void initial_hash(uint8_t *blockhash, argon2_context *context,
                       context->secretlen);

        if (context->flags & ARGON2_FLAG_CLEAR_SECRET) {
-            secure_wipe_memory(context->secret, context->secretlen);
+//            secure_wipe_memory(context->secret, context->secretlen);
            context->secretlen = 0;
        }
    }
--- a/algo/blake/blake-4way.c
+++ b/algo/blake/blake-4way.c
@@ -15,11 +15,11 @@ void blakehash_4way(void *state, const void *input)
     memcpy( &ctx, &blake_4w_ctx, sizeof ctx );
     blake256r14_4way( &ctx, input + (64<<2), 16 );
     blake256r14_4way_close( &ctx, vhash );
-     mm128_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
+     dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
 }

-int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done )
+int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
   uint32_t hash[8*4] __attribute__ ((aligned (32)));
@@ -27,43 +27,34 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   uint32_t HTarget = ptarget[7];
-   uint32_t _ALIGN(32) edata[20];
+   __m128i  *noncev = (__m128i*)vdata + 19;   // aligned
   uint32_t n = first_nonce;
-   uint32_t *nonces = work->nonces;
-   int num_found = 0;
+   int thr_id = mythr->id;  // thr_id arg is deprecated

   if (opt_benchmark)
      HTarget = 0x7f;

-   // we need big endian data...
-   swab32_array( edata, pdata, 20 );
-   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
+   mm128_bswap32_intrlv80_4x32( vdata, pdata );
   blake256r14_4way_init( &blake_4w_ctx );
   blake256r14_4way( &blake_4w_ctx, vdata, 64 );

-   uint32_t *noncep = vdata + 76;   // 19*4
   do {
-      be32enc( noncep,    n   );
-      be32enc( noncep +1, n+1 );
-      be32enc( noncep +2, n+2 );
-      be32enc( noncep +3, n+3 );
+      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );

      blakehash_4way( hash, vdata );

      for ( int i = 0; i < 4; i++ )
-      if (  (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) )
+      if ( (hash+(i<<3))[7] <= HTarget )
+      if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
      {
          pdata[19] = n+i;
-          nonces[ num_found++ ] = n+i;
-          work_set_target_ratio( work, hash+(i<<3) );
+          submit_lane_solution( work, hash+(i<<3), mythr, i );
      }
      n += 4;

-   } while ( (num_found == 0) && (n < max_nonce) 
-             && !work_restart[thr_id].restart );
-
+   } while ( (n < max_nonce) && !work_restart[thr_id].restart );
   *hashes_done = n - first_nonce + 1;
-   return num_found;
+   return 0;
 }

 #endif
@@ -79,13 +70,13 @@ void blakehash_8way( void *state, const void *input )
     memcpy( &ctx, &blake_8w_ctx, sizeof ctx );
     blake256r14_8way( &ctx, input + (64<<3), 16 );
     blake256r14_8way_close( &ctx, vhash );
-     mm256_deinterleave_8x32( state,     state+ 32, state+ 64, state+ 96,
-                              state+128, state+160, state+192, state+224,
-                              vhash, 256 );
+     dintrlv_8x32( state,     state+ 32, state+ 64, state+ 96,
+                   state+128, state+160, state+192, state+224,
+                   vhash, 256 );
 }

-int scanhash_blake_8way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done )
+int scanhash_blake_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
   uint32_t hash[8*8] __attribute__ ((aligned (32)));
@@ -93,33 +84,21 @@ int scanhash_blake_8way( int thr_id, struct work *work, uint32_t max_nonce,
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   uint32_t HTarget = ptarget[7];
-   uint32_t _ALIGN(32) edata[20];
   uint32_t n = first_nonce;
-   uint32_t *nonces = work->nonces;
-   int num_found = 0;
+   __m256i  *noncev = (__m256i*)vdata + 19;   // aligned
+   int thr_id = mythr->id;  // thr_id arg is deprecated

   if (opt_benchmark)
      HTarget = 0x7f;

-   // we need big endian data...
-   swab32_array( edata, pdata, 20 );
-
-   mm256_interleave_8x32( vdata, edata, edata, edata, edata,
-                                 edata, edata, edata, edata, 640 );
+   mm256_bswap32_intrlv80_8x32( vdata, pdata );

   blake256r14_8way_init( &blake_8w_ctx );
   blake256r14_8way( &blake_8w_ctx, vdata, 64 );

-   uint32_t *noncep = vdata + 152;   // 19*8
   do {
-      be32enc( noncep,    n   );
-      be32enc( noncep +1, n+1 );
-      be32enc( noncep +2, n+2 );
-      be32enc( noncep +3, n+3 );
-      be32enc( noncep +4, n+4 );
-      be32enc( noncep +5, n+5 );
-      be32enc( noncep +6, n+6 );
-      be32enc( noncep +7, n+7 );
+      *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
+                                                  n+3, n+2, n+1, n ) );
      pdata[19] = n;

      blakehash_8way( hash, vdata );
@@ -128,17 +107,14 @@ int scanhash_blake_8way( int thr_id, struct work *work, uint32_t max_nonce,
      if ( (hash+i)[7] <= HTarget && fulltest( hash+i, ptarget ) )
      {
          pdata[19] = n+i;
-          num_found++;
-          nonces[i] = n+i;
-          work_set_target_ratio( work, hash+1 );
+          submit_lane_solution( work, hash+(i<<3), mythr, i );
      }
      n += 8;

-   } while ( (num_found == 0) && (n < max_nonce)
-             && !work_restart[thr_id].restart );
+   } while ( (n < max_nonce) && !work_restart[thr_id].restart );

   *hashes_done = n - first_nonce + 1;
-   return num_found;
+   return 0;
 }

 #endif
--- a/algo/blake/blake-gate.h
+++ b/algo/blake/blake-gate.h
@@ -10,12 +10,12 @@

 #if defined (BLAKE_4WAY)
 void blakehash_4way(void *state, const void *input);
-int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
+int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
 #endif

 void blakehash( void *state, const void *input );
-int scanhash_blake( int thr_id, struct work *work, uint32_t max_nonce,
-                      uint64_t *hashes_done );
+int scanhash_blake( struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done, struct thr_info *mythr );

 #endif
--- a/algo/blake/blake-hash-4way.h
+++ b/algo/blake/blake-hash-4way.h
@@ -45,7 +45,7 @@ extern "C"{

 #include <stddef.h>
 #include "algo/sha/sph_types.h"
-#include "avxdefs.h"
+#include "simd-utils.h"

 #define SPH_SIZE_blake256   256

--- a/algo/blake/blake.c
+++ b/algo/blake/blake.c
@@ -39,8 +39,8 @@ void blakehash(void *state, const void *input)

 }

-int scanhash_blake( int thr_id, struct work *work, uint32_t max_nonce,
-                      uint64_t *hashes_done )
+int scanhash_blake( struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;
@@ -49,6 +49,7 @@ int scanhash_blake( int thr_id, struct work *work, uint32_t max_nonce,
 	uint32_t _ALIGN(32) hash64[8];
 	uint32_t _ALIGN(32) endiandata[20];
 	uint32_t n = first_nonce;
+   int thr_id = mythr->id;  // thr_id arg is deprecated

 	ctx_midstate_done = false;

--- a/algo/blake/blake256-hash-4way.c
+++ b/algo/blake/blake256-hash-4way.c
@@ -412,34 +412,16 @@ do { \
 	V5 = H5; \
 	V6 = H6; \
 	V7 = H7; \
-        V8 = _mm_xor_si128( S0, _mm_set_epi32( CS0, CS0, CS0, CS0 ) ); \
-        V9 = _mm_xor_si128( S1, _mm_set_epi32( CS1, CS1, CS1, CS1 ) ); \
-        VA = _mm_xor_si128( S2, _mm_set_epi32( CS2, CS2, CS2, CS2 ) ); \
-        VB = _mm_xor_si128( S3, _mm_set_epi32( CS3, CS3, CS3, CS3 ) ); \
-        VC = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
-                            _mm_set_epi32( CS4, CS4, CS4, CS4 ) ); \
-        VD = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
-                            _mm_set_epi32( CS5, CS5, CS5, CS5 ) ); \
-        VE = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ) \
-                          , _mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \
-        VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
-                            _mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \
-	M[0x0] = mm128_bswap_32( *(buf +  0) ); \
-	M[0x1] = mm128_bswap_32( *(buf +  1) ); \
-	M[0x2] = mm128_bswap_32( *(buf +  2) ); \
-	M[0x3] = mm128_bswap_32( *(buf +  3) ); \
-	M[0x4] = mm128_bswap_32( *(buf +  4) ); \
-	M[0x5] = mm128_bswap_32( *(buf +  5) ); \
-	M[0x6] = mm128_bswap_32( *(buf +  6) ); \
-	M[0x7] = mm128_bswap_32( *(buf +  7) ); \
-	M[0x8] = mm128_bswap_32( *(buf +  8) ); \
-	M[0x9] = mm128_bswap_32( *(buf +  9) ); \
-	M[0xA] = mm128_bswap_32( *(buf + 10) ); \
-	M[0xB] = mm128_bswap_32( *(buf + 11) ); \
-	M[0xC] = mm128_bswap_32( *(buf + 12) ); \
-	M[0xD] = mm128_bswap_32( *(buf + 13) ); \
-	M[0xE] = mm128_bswap_32( *(buf + 14) ); \
-	M[0xF] = mm128_bswap_32( *(buf + 15) ); \
+   V8 = _mm_xor_si128( S0, _mm_set1_epi32( CS0 ) ); \
+   V9 = _mm_xor_si128( S1, _mm_set1_epi32( CS1 ) ); \
+   VA = _mm_xor_si128( S2, _mm_set1_epi32( CS2 ) ); \
+   VB = _mm_xor_si128( S3, _mm_set1_epi32( CS3 ) ); \
+   VC = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS4 ) ); \
+   VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \
+   VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \
+   VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \
+   mm128_block_bswap_32( M, buf ); \
+   mm128_block_bswap_32( M+8, buf+8 ); \
 	for (r = 0; r < rounds; r ++) \
 		ROUND_S_4WAY(r); \
        H0 = _mm_xor_si128( _mm_xor_si128( \
@@ -464,6 +446,54 @@ do { \

 // current impl

+#if defined(__SSSE3__)
+
+#define BLAKE256_4WAY_BLOCK_BSWAP32 do \
+{ \
+   __m128i shuf_bswap32 = _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
+                                          0x0405060700010203 ); \
+   M0 = _mm_shuffle_epi8( buf[ 0], shuf_bswap32 ); \
+   M1 = _mm_shuffle_epi8( buf[ 1], shuf_bswap32 ); \
+   M2 = _mm_shuffle_epi8( buf[ 2], shuf_bswap32 ); \
+   M3 = _mm_shuffle_epi8( buf[ 3], shuf_bswap32 ); \
+   M4 = _mm_shuffle_epi8( buf[ 4], shuf_bswap32 ); \
+   M5 = _mm_shuffle_epi8( buf[ 5], shuf_bswap32 ); \
+   M6 = _mm_shuffle_epi8( buf[ 6], shuf_bswap32 ); \
+   M7 = _mm_shuffle_epi8( buf[ 7], shuf_bswap32 ); \
+   M8 = _mm_shuffle_epi8( buf[ 8], shuf_bswap32 ); \
+   M9 = _mm_shuffle_epi8( buf[ 9], shuf_bswap32 ); \
+   MA = _mm_shuffle_epi8( buf[10], shuf_bswap32 ); \
+   MB = _mm_shuffle_epi8( buf[11], shuf_bswap32 ); \
+   MC = _mm_shuffle_epi8( buf[12], shuf_bswap32 ); \
+   MD = _mm_shuffle_epi8( buf[13], shuf_bswap32 ); \
+   ME = _mm_shuffle_epi8( buf[14], shuf_bswap32 ); \
+   MF = _mm_shuffle_epi8( buf[15], shuf_bswap32 ); \
+} while(0)
+
+#else  // SSE2
+
+#define BLAKE256_4WAY_BLOCK_BSWAP32 do \
+{ \
+   M0 = mm128_bswap_32( buf[0] ); \
+   M1 = mm128_bswap_32( buf[1] ); \
+   M2 = mm128_bswap_32( buf[2] ); \
+   M3 = mm128_bswap_32( buf[3] ); \
+   M4 = mm128_bswap_32( buf[4] ); \
+   M5 = mm128_bswap_32( buf[5] ); \
+   M6 = mm128_bswap_32( buf[6] ); \
+   M7 = mm128_bswap_32( buf[7] ); \
+   M8 = mm128_bswap_32( buf[8] ); \
+   M9 = mm128_bswap_32( buf[9] ); \
+   MA = mm128_bswap_32( buf[10] ); \
+   MB = mm128_bswap_32( buf[11] ); \
+   MC = mm128_bswap_32( buf[12] ); \
+   MD = mm128_bswap_32( buf[13] ); \
+   ME = mm128_bswap_32( buf[14] ); \
+   MF = mm128_bswap_32( buf[15] ); \
+} while(0)
+
+#endif  // SSSE3 else SSE2
+
 #define COMPRESS32_4WAY( rounds ) \
 do { \
   __m128i M0, M1, M2, M3, M4, M5, M6, M7; \
@@ -486,22 +516,7 @@ do { \
   VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \
   VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \
   VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \
-   M0 = mm128_bswap_32( buf[ 0] ); \
-   M1 = mm128_bswap_32( buf[ 1] ); \
-   M2 = mm128_bswap_32( buf[ 2] ); \
-   M3 = mm128_bswap_32( buf[ 3] ); \
-   M4 = mm128_bswap_32( buf[ 4] ); \
-   M5 = mm128_bswap_32( buf[ 5] ); \
-   M6 = mm128_bswap_32( buf[ 6] ); \
-   M7 = mm128_bswap_32( buf[ 7] ); \
-   M8 = mm128_bswap_32( buf[ 8] ); \
-   M9 = mm128_bswap_32( buf[ 9] ); \
-   MA = mm128_bswap_32( buf[10] ); \
-   MB = mm128_bswap_32( buf[11] ); \
-   MC = mm128_bswap_32( buf[12] ); \
-   MD = mm128_bswap_32( buf[13] ); \
-   ME = mm128_bswap_32( buf[14] ); \
-   MF = mm128_bswap_32( buf[15] ); \
+   BLAKE256_4WAY_BLOCK_BSWAP32; \
   ROUND_S_4WAY(0); \
   ROUND_S_4WAY(1); \
   ROUND_S_4WAY(2); \
@@ -519,14 +534,14 @@ do { \
      ROUND_S_4WAY(2); \
      ROUND_S_4WAY(3); \
   } \
-   H0 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( V8, V0 ), S0 ), H0 ); \
-   H1 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( V9, V1 ), S1 ), H1 ); \
-   H2 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VA, V2 ), S2 ), H2 ); \
-   H3 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VB, V3 ), S3 ), H3 ); \
-   H4 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VC, V4 ), S0 ), H4 ); \
-   H5 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VD, V5 ), S1 ), H5 ); \
-   H6 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VE, V6 ), S2 ), H6 ); \
-   H7 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VF, V7 ), S3 ), H7 ); \
+   H0 = mm128_xor4( V8, V0, S0, H0 ); \
+   H1 = mm128_xor4( V9, V1, S1, H1 ); \
+   H2 = mm128_xor4( VA, V2, S2, H2 ); \
+   H3 = mm128_xor4( VB, V3, S3, H3 ); \
+   H4 = mm128_xor4( VC, V4, S0, H4 ); \
+   H5 = mm128_xor4( VD, V5, S1, H5 ); \
+   H6 = mm128_xor4( VE, V6, S2, H6 ); \
+   H7 = mm128_xor4( VF, V7, S3, H7 ); \
 } while (0)

 #endif
@@ -607,6 +622,7 @@ do { \
   __m256i M8, M9, MA, MB, MC, MD, ME, MF; \
   __m256i V0, V1, V2, V3, V4, V5, V6, V7; \
   __m256i V8, V9, VA, VB, VC, VD, VE, VF; \
+   __m256i shuf_bswap32; \
   V0 = H0; \
   V1 = H1; \
   V2 = H2; \
@@ -623,22 +639,24 @@ do { \
   VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ), _mm256_set1_epi32( CS5 ) ); \
   VE = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS6 ) ); \
   VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS7 ) ); \
-   M0 = mm256_bswap_32( * buf ); \
-   M1 = mm256_bswap_32( *(buf+1) ); \
-   M2 = mm256_bswap_32( *(buf+2) ); \
-   M3 = mm256_bswap_32( *(buf+3) ); \
-   M4 = mm256_bswap_32( *(buf+4) ); \
-   M5 = mm256_bswap_32( *(buf+5) ); \
-   M6 = mm256_bswap_32( *(buf+6) ); \
-   M7 = mm256_bswap_32( *(buf+7) ); \
-   M8 = mm256_bswap_32( *(buf+8) ); \
-   M9 = mm256_bswap_32( *(buf+9) ); \
-   MA = mm256_bswap_32( *(buf+10) ); \
-   MB = mm256_bswap_32( *(buf+11) ); \
-   MC = mm256_bswap_32( *(buf+12) ); \
-   MD = mm256_bswap_32( *(buf+13) ); \
-   ME = mm256_bswap_32( *(buf+14) ); \
-   MF = mm256_bswap_32( *(buf+15) ); \
+   shuf_bswap32 = _mm256_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203, \
+                                     0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
+   M0 = _mm256_shuffle_epi8( * buf    , shuf_bswap32 ); \
+   M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
+   M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \
+   M3 = _mm256_shuffle_epi8( *(buf+ 3), shuf_bswap32 ); \
+   M4 = _mm256_shuffle_epi8( *(buf+ 4), shuf_bswap32 ); \
+   M5 = _mm256_shuffle_epi8( *(buf+ 5), shuf_bswap32 ); \
+   M6 = _mm256_shuffle_epi8( *(buf+ 6), shuf_bswap32 ); \
+   M7 = _mm256_shuffle_epi8( *(buf+ 7), shuf_bswap32 ); \
+   M8 = _mm256_shuffle_epi8( *(buf+ 8), shuf_bswap32 ); \
+   M9 = _mm256_shuffle_epi8( *(buf+ 9), shuf_bswap32 ); \
+   MA = _mm256_shuffle_epi8( *(buf+10), shuf_bswap32 ); \
+   MB = _mm256_shuffle_epi8( *(buf+11), shuf_bswap32 ); \
+   MC = _mm256_shuffle_epi8( *(buf+12), shuf_bswap32 ); \
+   MD = _mm256_shuffle_epi8( *(buf+13), shuf_bswap32 ); \
+   ME = _mm256_shuffle_epi8( *(buf+14), shuf_bswap32 ); \
+   MF = _mm256_shuffle_epi8( *(buf+15), shuf_bswap32 ); \
   ROUND_S_8WAY(0); \
   ROUND_S_8WAY(1); \
   ROUND_S_8WAY(2); \
@@ -656,22 +674,14 @@ do { \
      ROUND_S_8WAY(2); \
      ROUND_S_8WAY(3); \
   } \
-   H0 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( V8, V0 ), \
-                                                              S0 ), H0 ); \
-   H1 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( V9, V1 ), \
-                                                              S1 ), H1 ); \
-   H2 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VA, V2 ), \
-                                                              S2 ), H2 ); \
-   H3 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VB, V3 ), \
-                                                              S3 ), H3 ); \
-   H4 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VC, V4 ), \
-                                                              S0 ), H4 ); \
-   H5 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VD, V5 ), \
-                                                              S1 ), H5 ); \
-   H6 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VE, V6 ), \
-                                                              S2 ), H6 ); \
-   H7 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VF, V7 ), \
-                                                              S3 ), H7 ); \
+   H0 = mm256_xor4( V8, V0, S0, H0 ); \
+   H1 = mm256_xor4( V9, V1, S1, H1 ); \
+   H2 = mm256_xor4( VA, V2, S2, H2 ); \
+   H3 = mm256_xor4( VB, V3, S3, H3 ); \
+   H4 = mm256_xor4( VC, V4, S0, H4 ); \
+   H5 = mm256_xor4( VD, V5, S1, H5 ); \
+   H6 = mm256_xor4( VE, V6, S2, H6 ); \
+   H7 = mm256_xor4( VF, V7, S3, H7 ); \
 } while (0)


@@ -685,6 +695,7 @@ static void
 blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
                   const uint32_t *salt, int rounds )
 {
+   __m128i zero = m128_zero;
   casti_m128i( ctx->H, 0 ) = _mm_set1_epi32( iv[0] );
   casti_m128i( ctx->H, 1 ) = _mm_set1_epi32( iv[1] );
   casti_m128i( ctx->H, 2 ) = _mm_set1_epi32( iv[2] );
@@ -694,16 +705,10 @@ blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
   casti_m128i( ctx->H, 6 ) = _mm_set1_epi32( iv[6] );
   casti_m128i( ctx->H, 7 ) = _mm_set1_epi32( iv[7] );

-   casti_m128i( ctx->S, 0 ) = m128_zero;
-   casti_m128i( ctx->S, 1 ) = m128_zero;
-   casti_m128i( ctx->S, 2 ) = m128_zero;
-   casti_m128i( ctx->S, 3 ) = m128_zero;
-/*
-   sc->S[0] = _mm_set1_epi32( salt[0] );
-   sc->S[1] = _mm_set1_epi32( salt[1] );
-   sc->S[2] = _mm_set1_epi32( salt[2] );
-   sc->S[3] = _mm_set1_epi32( salt[3] );
-*/
+   casti_m128i( ctx->S, 0 ) = zero;
+   casti_m128i( ctx->S, 1 ) = zero;
+   casti_m128i( ctx->S, 2 ) = zero;
+   casti_m128i( ctx->S, 3 ) = zero;
   ctx->T0 = ctx->T1 = 0;
   ctx->ptr = 0;
   ctx->rounds = rounds;
@@ -796,14 +801,7 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
      blake32_4way( ctx, buf, 64 );
   }

-   casti_m128i( dst, 0 ) = mm128_bswap_32( casti_m128i( ctx->H, 0 ) );
-   casti_m128i( dst, 1 ) = mm128_bswap_32( casti_m128i( ctx->H, 1 ) );
-   casti_m128i( dst, 2 ) = mm128_bswap_32( casti_m128i( ctx->H, 2 ) );
-   casti_m128i( dst, 3 ) = mm128_bswap_32( casti_m128i( ctx->H, 3 ) );
-   casti_m128i( dst, 4 ) = mm128_bswap_32( casti_m128i( ctx->H, 4 ) );
-   casti_m128i( dst, 5 ) = mm128_bswap_32( casti_m128i( ctx->H, 5 ) );
-   casti_m128i( dst, 6 ) = mm128_bswap_32( casti_m128i( ctx->H, 6 ) );
-   casti_m128i( dst, 7 ) = mm128_bswap_32( casti_m128i( ctx->H, 7 ) );
+   mm128_block_bswap_32( (__m128i*)dst, (__m128i*)ctx->H );
 }

 #if defined (__AVX2__)
@@ -816,11 +814,21 @@ static void
 blake32_8way_init( blake_8way_small_context *sc, const sph_u32 *iv,
                   const sph_u32 *salt, int rounds )
 {
-   int i;
-   for ( i = 0; i < 8; i++ )
-      sc->H[i] = _mm256_set1_epi32( iv[i] );
-   for ( i = 0; i < 4; i++ )
-      sc->S[i] = _mm256_set1_epi32( salt[i] );
+   __m256i zero = m256_zero;
+   casti_m256i( sc->H, 0 ) = _mm256_set1_epi32( iv[0] );
+   casti_m256i( sc->H, 1 ) = _mm256_set1_epi32( iv[1] );
+   casti_m256i( sc->H, 2 ) = _mm256_set1_epi32( iv[2] );
+   casti_m256i( sc->H, 3 ) = _mm256_set1_epi32( iv[3] );
+   casti_m256i( sc->H, 4 ) = _mm256_set1_epi32( iv[4] );
+   casti_m256i( sc->H, 5 ) = _mm256_set1_epi32( iv[5] );
+   casti_m256i( sc->H, 6 ) = _mm256_set1_epi32( iv[6] );
+   casti_m256i( sc->H, 7 ) = _mm256_set1_epi32( iv[7] );
+
+   casti_m256i( sc->S, 0 ) = zero;
+   casti_m256i( sc->S, 1 ) = zero;
+   casti_m256i( sc->S, 2 ) = zero;
+   casti_m256i( sc->S, 3 ) = zero;
+
   sc->T0 = sc->T1 = 0;
   sc->ptr = 0;
   sc->rounds = rounds;
@@ -872,14 +880,10 @@ static void
 blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
                    void *dst, size_t out_size_w32 )
 {
-//   union {
-        __m256i buf[16];
-//        sph_u32 dummy;
-//   } u;
-   size_t ptr, k;
+   __m256i buf[16];
+   size_t ptr;
   unsigned bit_len;
   sph_u32 th, tl;
-   __m256i *out;

   ptr = sc->ptr;
   bit_len = ((unsigned)ptr << 3);
@@ -923,9 +927,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
        *(buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
        blake32_8way( sc, buf, 64 );
   }
-   out = (__m256i*)dst;
-   for ( k = 0; k < out_size_w32; k++ )
-        out[k] = mm256_bswap_32( sc->H[k] );
+   mm256_block_bswap_32( (__m256i*)dst, (__m256i*)sc->H );
 }

 #endif
--- a/algo/blake/blake2b.c
+++ b/algo/blake/blake2b.c
@@ -35,13 +35,14 @@ static void blake2b_hash_end(uint32_t *output, const uint32_t *input)
 }
 */

-int scanhash_blake2b( int thr_id, struct work *work, uint32_t max_nonce,
-                      uint64_t *hashes_done )
+int scanhash_blake2b( struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done, struct thr_info *mythr )
 {
 	uint32_t _ALIGN(A) vhashcpu[8];
 	uint32_t _ALIGN(A) endiandata[20];
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
+   int thr_id = mythr->id;  // thr_id arg is deprecated

 	const uint32_t Htarg = ptarget[7];
 	const uint32_t first_nonce = pdata[8];
--- a/algo/blake/blake2s-4way.c
+++ b/algo/blake/blake2s-4way.c
@@ -16,60 +16,49 @@ void blake2s_8way_hash( void *output, const void *input )
   blake2s_8way_update( &ctx, input + (64<<3), 16 );
   blake2s_8way_final( &ctx, vhash, BLAKE2S_OUTBYTES );

-   mm256_deinterleave_8x32( output,     output+ 32, output+ 64, output+ 96,
-                            output+128, output+160, output+192, output+224,
-                            vhash, 256 );
+   dintrlv_8x32( output,     output+ 32, output+ 64, output+ 96,
+                 output+128, output+160, output+192, output+224,
+                 vhash, 256 );
 }

-int scanhash_blake2s_8way( int thr_id, struct work *work, uint32_t max_nonce,
-                      uint64_t *hashes_done )
+int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
   uint32_t hash[8*8] __attribute__ ((aligned (32)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
-   uint32_t _ALIGN(64) edata[20];
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
+   __m256i  *noncev = (__m256i*)vdata + 19;   // aligned
   uint32_t n = first_nonce;
-   uint32_t *nonces = work->nonces;
-   int num_found = 0;
-   uint32_t *noncep = vdata + 152;   // 19*8
+   int thr_id = mythr->id;  // thr_id arg is deprecated

-   swab32_array( edata, pdata, 20 );
-   mm256_interleave_8x32( vdata, edata, edata, edata, edata,
-                                 edata, edata, edata, edata, 640 );
+   mm256_bswap32_intrlv80_8x32( vdata, pdata );
   blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
   blake2s_8way_update( &blake2s_8w_ctx, vdata, 64 );

   do {
-      be32enc( noncep,    n   );
-      be32enc( noncep +1, n+1 );
-      be32enc( noncep +2, n+2 );
-      be32enc( noncep +3, n+3 );
-      be32enc( noncep +4, n+4 );
-      be32enc( noncep +5, n+5 );
-      be32enc( noncep +6, n+6 );
-      be32enc( noncep +7, n+7 );
+      *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
+                                                  n+3, n+2, n+1, n ) );
      pdata[19] = n;

      blake2s_8way_hash( hash, vdata );


      for ( int i = 0; i < 8; i++ )
-      if (  (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
+      if (  (hash+(i<<3))[7] <= Htarg )
+      if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
      {
          pdata[19] = n+i;
-          nonces[ num_found++ ] = n+i;
-          work_set_target_ratio( work, hash+(i<<3) );
+          submit_lane_solution( work, hash+(i<<3), mythr, i );
      }
      n += 8;

-   } while ( (num_found == 0) && (n < max_nonce)
-             && !work_restart[thr_id].restart );
+   } while ( (n < max_nonce) && !work_restart[thr_id].restart );

   *hashes_done = n - first_nonce + 1;
-   return num_found;
+   return 0;
 }

 #elif defined(BLAKE2S_4WAY)
@@ -85,53 +74,46 @@ void blake2s_4way_hash( void *output, const void *input )
   blake2s_4way_update( &ctx, input + (64<<2), 16 );
   blake2s_4way_final( &ctx, vhash, BLAKE2S_OUTBYTES );

-   mm128_deinterleave_4x32( output, output+32, output+64, output+96,
+   dintrlv_4x32( output, output+32, output+64, output+96,
 		            vhash, 256 );
 }

-int scanhash_blake2s_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                      uint64_t *hashes_done )
+int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
   uint32_t hash[8*4] __attribute__ ((aligned (32)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
-   uint32_t _ALIGN(64) edata[20];
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
+   __m128i  *noncev = (__m128i*)vdata + 19;   // aligned
   uint32_t n = first_nonce;
-   uint32_t *nonces = work->nonces;
-   int num_found = 0;
-   uint32_t *noncep = vdata + 76;   // 19*4
+   int thr_id = mythr->id;  // thr_id arg is deprecated

-   swab32_array( edata, pdata, 20 );
-   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
+   mm128_bswap32_intrlv80_4x32( vdata, pdata );
   blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
   blake2s_4way_update( &blake2s_4w_ctx, vdata, 64 );

   do {
-      be32enc( noncep,    n   );
-      be32enc( noncep +1, n+1 );
-      be32enc( noncep +2, n+2 );
-      be32enc( noncep +3, n+3 );
+      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
      pdata[19] = n;

      blake2s_4way_hash( hash, vdata );

      for ( int i = 0; i < 4; i++ )
-      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
+      if ( (hash+(i<<3))[7] <= Htarg )
+      if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
      {
          pdata[19] = n+i;
-          nonces[ num_found++ ] = n+i;
-          work_set_target_ratio( work, hash+(i<<3) );
+          submit_lane_solution( work, hash+(i<<3), mythr, i );
      }
      n += 4;

-   } while ( (num_found == 0) && (n < max_nonce)
-             && !work_restart[thr_id].restart );
+   } while ( (n < max_nonce) && !work_restart[thr_id].restart );

   *hashes_done = n - first_nonce + 1;
-   return num_found;
+   return 0;
 }

 #endif
--- a/algo/blake/blake2s-gate.h
+++ b/algo/blake/blake2s-gate.h
@@ -16,19 +16,19 @@ bool register_blake2s_algo( algo_gate_t* gate );
 #if defined(BLAKE2S_8WAY)

 void blake2s_8way_hash( void *state, const void *input );
-int scanhash_blake2s_8way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
+int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );

 #elif defined (BLAKE2S_4WAY)

 void blake2s_4way_hash( void *state, const void *input );
-int scanhash_blake2s_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
+int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
 #else

 void blake2s_hash( void *state, const void *input );
-int scanhash_blake2s( int thr_id, struct work *work, uint32_t max_nonce,
-                      uint64_t *hashes_done );
+int scanhash_blake2s( struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done, struct thr_info *mythr );

 #endif

--- a/algo/blake/blake2s-hash-4way.h
+++ b/algo/blake/blake2s-hash-4way.h
@@ -16,7 +16,7 @@

 #if defined(__SSE4_2__)

-#include "avxdefs.h"
+#include "simd-utils.h"

 #include <stddef.h>
 #include <stdint.h>
--- a/algo/blake/blake2s.c
+++ b/algo/blake/blake2s.c
@@ -32,14 +32,15 @@ static void blake2s_hash_end(uint32_t *output, const uint32_t *input)
 	blake2s_final(&s_ctx, (uint8_t*) output, BLAKE2S_OUTBYTES);
 }
 */
-int scanhash_blake2s(int thr_id, struct work *work,
-	uint32_t max_nonce, uint64_t *hashes_done)
+int scanhash_blake2s( struct work *work,
+	uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;

 	uint32_t _ALIGN(64) hash64[8];
 	uint32_t _ALIGN(64) endiandata[20];
+   int thr_id = mythr->id;  // thr_id arg is deprecated

 	const uint32_t Htarg = ptarget[7];
 	const uint32_t first_nonce = pdata[19];
--- a/algo/blake/blake512-hash-4way.c
+++ b/algo/blake/blake512-hash-4way.c
@@ -412,18 +412,18 @@ static const sph_u64 CB[16] = {
 	V5 = H5; \
 	V6 = H6; \
 	V7 = H7; \
-        V8 = _mm256_xor_si256( S0, _mm256_set_epi64x( CB0, CB0, CB0, CB0 ) ); \
-        V9 = _mm256_xor_si256( S1, _mm256_set_epi64x( CB1, CB1, CB1, CB1 ) ); \
-        VA = _mm256_xor_si256( S2, _mm256_set_epi64x( CB2, CB2, CB2, CB2 ) ); \
-        VB = _mm256_xor_si256( S3, _mm256_set_epi64x( CB3, CB3, CB3, CB3 ) ); \
-        VC = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
-                               _mm256_set_epi64x( CB4, CB4, CB4, CB4 ) ); \
-        VD = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
-                               _mm256_set_epi64x( CB5, CB5, CB5, CB5 ) ); \
-        VE = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
-                               _mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
-        VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
-                               _mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
+   V8 = _mm256_xor_si256( S0, _mm256_set_epi64x( CB0, CB0, CB0, CB0 ) ); \
+   V9 = _mm256_xor_si256( S1, _mm256_set_epi64x( CB1, CB1, CB1, CB1 ) ); \
+   VA = _mm256_xor_si256( S2, _mm256_set_epi64x( CB2, CB2, CB2, CB2 ) ); \
+   VB = _mm256_xor_si256( S3, _mm256_set_epi64x( CB3, CB3, CB3, CB3 ) ); \
+   VC = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
+                          _mm256_set_epi64x( CB4, CB4, CB4, CB4 ) ); \
+   VD = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
+                          _mm256_set_epi64x( CB5, CB5, CB5, CB5 ) ); \
+   VE = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
+                          _mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
+   VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
+                          _mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
 	M[0x0] = mm256_bswap_64( *(buf+0) ); \
 	M[0x1] = mm256_bswap_64( *(buf+1) ); \
 	M[0x2] = mm256_bswap_64( *(buf+2) ); \
@@ -464,80 +464,76 @@ static const sph_u64 CB[16] = {

 //current impl

-#define COMPRESS64_4WAY   do { \
-     __m256i M0, M1, M2, M3, M4, M5, M6, M7; \
-     __m256i M8, M9, MA, MB, MC, MD, ME, MF; \
-     __m256i V0, V1, V2, V3, V4, V5, V6, V7; \
-     __m256i V8, V9, VA, VB, VC, VD, VE, VF; \
-     V0 = H0; \
-     V1 = H1; \
-     V2 = H2; \
-     V3 = H3; \
-     V4 = H4; \
-     V5 = H5; \
-     V6 = H6; \
-     V7 = H7; \
-     V8 = _mm256_xor_si256( S0, _mm256_set_epi64x( CB0, CB0, CB0, CB0 ) );  \
-     V9 = _mm256_xor_si256( S1, _mm256_set_epi64x( CB1, CB1, CB1, CB1 ) );  \
-     VA = _mm256_xor_si256( S2, _mm256_set_epi64x( CB2, CB2, CB2, CB2 ) );  \
-     VB = _mm256_xor_si256( S3, _mm256_set_epi64x( CB3, CB3, CB3, CB3 ) );  \
-     VC = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
-                            _mm256_set_epi64x( CB4, CB4, CB4, CB4 ) );  \
-     VD = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
-                            _mm256_set_epi64x( CB5, CB5, CB5, CB5 ) );  \
-     VE = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
-                            _mm256_set_epi64x( CB6, CB6, CB6, CB6 ) );  \
-     VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
-                            _mm256_set_epi64x( CB7, CB7, CB7, CB7 ) );  \
-     M0 = mm256_bswap_64( *(buf + 0) ); \
-     M1 = mm256_bswap_64( *(buf + 1) ); \
-     M2 = mm256_bswap_64( *(buf + 2) ); \
-     M3 = mm256_bswap_64( *(buf + 3) ); \
-     M4 = mm256_bswap_64( *(buf + 4) ); \
-     M5 = mm256_bswap_64( *(buf + 5) ); \
-     M6 = mm256_bswap_64( *(buf + 6) ); \
-     M7 = mm256_bswap_64( *(buf + 7) ); \
-     M8 = mm256_bswap_64( *(buf + 8) ); \
-     M9 = mm256_bswap_64( *(buf + 9) ); \
-     MA = mm256_bswap_64( *(buf + 10) ); \
-     MB = mm256_bswap_64( *(buf + 11) ); \
-     MC = mm256_bswap_64( *(buf + 12) ); \
-     MD = mm256_bswap_64( *(buf + 13) ); \
-     ME = mm256_bswap_64( *(buf + 14) ); \
-     MF = mm256_bswap_64( *(buf + 15) ); \
-     ROUND_B_4WAY(0); \
-     ROUND_B_4WAY(1); \
-     ROUND_B_4WAY(2); \
-     ROUND_B_4WAY(3); \
-     ROUND_B_4WAY(4); \
-     ROUND_B_4WAY(5); \
-     ROUND_B_4WAY(6); \
-     ROUND_B_4WAY(7); \
-     ROUND_B_4WAY(8); \
-     ROUND_B_4WAY(9); \
-     ROUND_B_4WAY(0); \
-     ROUND_B_4WAY(1); \
-     ROUND_B_4WAY(2); \
-     ROUND_B_4WAY(3); \
-     ROUND_B_4WAY(4); \
-     ROUND_B_4WAY(5); \
-     H0 = _mm256_xor_si256( _mm256_xor_si256( \
-                            _mm256_xor_si256( S0, V0 ), V8 ), H0 ); \
-     H1 = _mm256_xor_si256( _mm256_xor_si256( \
-                            _mm256_xor_si256( S1, V1 ), V9 ), H1 ); \
-     H2 = _mm256_xor_si256( _mm256_xor_si256( \
-                            _mm256_xor_si256( S2, V2 ), VA ), H2 ); \
-     H3 = _mm256_xor_si256( _mm256_xor_si256( \
-                            _mm256_xor_si256( S3, V3 ), VB ), H3 ); \
-     H4 = _mm256_xor_si256( _mm256_xor_si256( \
-                            _mm256_xor_si256( S0, V4 ), VC ), H4 ); \
-     H5 = _mm256_xor_si256( _mm256_xor_si256( \
-                            _mm256_xor_si256( S1, V5 ), VD ), H5 ); \
-     H6 = _mm256_xor_si256( _mm256_xor_si256( \
-                            _mm256_xor_si256( S2, V6 ), VE ), H6 ); \
-     H7 = _mm256_xor_si256( _mm256_xor_si256( \
-                            _mm256_xor_si256( S3, V7 ), VF ), H7 ); \
-	} while (0)
+#define COMPRESS64_4WAY   do \
+{ \
+  __m256i M0, M1, M2, M3, M4, M5, M6, M7; \
+  __m256i M8, M9, MA, MB, MC, MD, ME, MF; \
+  __m256i V0, V1, V2, V3, V4, V5, V6, V7; \
+  __m256i V8, V9, VA, VB, VC, VD, VE, VF; \
+  __m256i shuf_bswap64; \
+  V0 = H0; \
+  V1 = H1; \
+  V2 = H2; \
+  V3 = H3; \
+  V4 = H4; \
+  V5 = H5; \
+  V6 = H6; \
+  V7 = H7; \
+  V8 = _mm256_xor_si256( S0, _mm256_set1_epi64x( CB0 ) );  \
+  V9 = _mm256_xor_si256( S1, _mm256_set1_epi64x( CB1 ) );  \
+  VA = _mm256_xor_si256( S2, _mm256_set1_epi64x( CB2 ) );  \
+  VB = _mm256_xor_si256( S3, _mm256_set1_epi64x( CB3 ) );  \
+  VC = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
+                         _mm256_set1_epi64x( CB4 ) );  \
+  VD = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
+                         _mm256_set1_epi64x( CB5 ) );  \
+  VE = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
+                         _mm256_set1_epi64x( CB6 ) );  \
+  VF = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
+                         _mm256_set1_epi64x( CB7 ) );  \
+  shuf_bswap64 = _mm256_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607, \
+                                    0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
+  M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
+  M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
+  M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
+  M3 = _mm256_shuffle_epi8( *(buf+ 3), shuf_bswap64 ); \
+  M4 = _mm256_shuffle_epi8( *(buf+ 4), shuf_bswap64 ); \
+  M5 = _mm256_shuffle_epi8( *(buf+ 5), shuf_bswap64 ); \
+  M6 = _mm256_shuffle_epi8( *(buf+ 6), shuf_bswap64 ); \
+  M7 = _mm256_shuffle_epi8( *(buf+ 7), shuf_bswap64 ); \
+  M8 = _mm256_shuffle_epi8( *(buf+ 8), shuf_bswap64 ); \
+  M9 = _mm256_shuffle_epi8( *(buf+ 9), shuf_bswap64 ); \
+  MA = _mm256_shuffle_epi8( *(buf+10), shuf_bswap64 ); \
+  MB = _mm256_shuffle_epi8( *(buf+11), shuf_bswap64 ); \
+  MC = _mm256_shuffle_epi8( *(buf+12), shuf_bswap64 ); \
+  MD = _mm256_shuffle_epi8( *(buf+13), shuf_bswap64 ); \
+  ME = _mm256_shuffle_epi8( *(buf+14), shuf_bswap64 ); \
+  MF = _mm256_shuffle_epi8( *(buf+15), shuf_bswap64 ); \
+  ROUND_B_4WAY(0); \
+  ROUND_B_4WAY(1); \
+  ROUND_B_4WAY(2); \
+  ROUND_B_4WAY(3); \
+  ROUND_B_4WAY(4); \
+  ROUND_B_4WAY(5); \
+  ROUND_B_4WAY(6); \
+  ROUND_B_4WAY(7); \
+  ROUND_B_4WAY(8); \
+  ROUND_B_4WAY(9); \
+  ROUND_B_4WAY(0); \
+  ROUND_B_4WAY(1); \
+  ROUND_B_4WAY(2); \
+  ROUND_B_4WAY(3); \
+  ROUND_B_4WAY(4); \
+  ROUND_B_4WAY(5); \
+  H0 = mm256_xor4( V8, V0, S0, H0 ); \
+  H1 = mm256_xor4( V9, V1, S1, H1 ); \
+  H2 = mm256_xor4( VA, V2, S2, H2 ); \
+  H3 = mm256_xor4( VB, V3, S3, H3 ); \
+  H4 = mm256_xor4( VC, V4, S0, H4 ); \
+  H5 = mm256_xor4( VD, V5, S1, H5 ); \
+  H6 = mm256_xor4( VE, V6, S2, H6 ); \
+  H7 = mm256_xor4( VF, V7, S3, H7 ); \
+} while (0)

 #endif

@@ -547,13 +543,23 @@ static void
 blake64_4way_init( blake_4way_big_context *sc, const sph_u64 *iv,
              const sph_u64 *salt )
 {
-        int i;
-        for ( i = 0; i < 8; i++ )
-           sc->H[i] = _mm256_set1_epi64x( iv[i] );
-        for ( i = 0; i < 4; i++ )
-           sc->S[i] = _mm256_set1_epi64x( salt[i] );
-        sc->T0 = sc->T1 = 0;
-        sc->ptr = 0;
+   // Broadcast each IV word and each salt word to all four 64-bit lanes.
+   casti_m256i( sc->H, 0 ) = _mm256_set1_epi64x( iv[0] );
+   casti_m256i( sc->H, 1 ) = _mm256_set1_epi64x( iv[1] );
+   casti_m256i( sc->H, 2 ) = _mm256_set1_epi64x( iv[2] );
+   casti_m256i( sc->H, 3 ) = _mm256_set1_epi64x( iv[3] );
+   casti_m256i( sc->H, 4 ) = _mm256_set1_epi64x( iv[4] );
+   casti_m256i( sc->H, 5 ) = _mm256_set1_epi64x( iv[5] );
+   casti_m256i( sc->H, 6 ) = _mm256_set1_epi64x( iv[6] );
+   casti_m256i( sc->H, 7 ) = _mm256_set1_epi64x( iv[7] );
+
+   casti_m256i( sc->S, 0 ) = _mm256_set1_epi64x( salt[0] );
+   casti_m256i( sc->S, 1 ) = _mm256_set1_epi64x( salt[1] );
+   casti_m256i( sc->S, 2 ) = _mm256_set1_epi64x( salt[2] );
+   casti_m256i( sc->S, 3 ) = _mm256_set1_epi64x( salt[3] );
+
+   sc->T0 = sc->T1 = 0;
+   sc->ptr = 0;
 }

 static void
@@ -604,15 +610,11 @@ static void
 blake64_4way_close( blake_4way_big_context *sc,
 	unsigned ub, unsigned n, void *dst, size_t out_size_w64)
 {
-//   union {
-      __m256i buf[16];
-//      sph_u64 dummy;
-//   } u;
-   size_t ptr, k;
+   __m256i buf[16];
+   size_t ptr;
   unsigned bit_len;
   uint64_t z, zz;
   sph_u64 th, tl;
-   __m256i *out;

   ptr = sc->ptr;
   bit_len = ((unsigned)ptr << 3);
@@ -665,9 +667,7 @@ blake64_4way_close( blake_4way_big_context *sc,

       blake64_4way( sc, buf, 128 );
   }
-   out = (__m256i*)dst;
-   for ( k = 0; k < out_size_w64; k++ )
-       out[k] = mm256_bswap_64( sc->H[k] );
+   mm256_block_bswap_64( (__m256i*)dst, sc->H );
 }

 void
--- a/algo/blake/blakecoin-4way.c
+++ b/algo/blake/blakecoin-4way.c
@@ -17,11 +17,11 @@ void blakecoin_4way_hash(void *state, const void *input)
     blake256r8_4way( &ctx, input + (64<<2), 16 );
     blake256r8_4way_close( &ctx, vhash );

-     mm128_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
+     dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
 }

-int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done )
+int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
   uint32_t hash[8*4] __attribute__ ((aligned (32)));
@@ -29,41 +29,34 @@ int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   uint32_t HTarget = ptarget[7];
-   uint32_t _ALIGN(32) edata[20];
   uint32_t n = first_nonce;
-   uint32_t *nonces = work->nonces;
-   int num_found = 0;
+   __m128i  *noncev = (__m128i*)vdata + 19;   // aligned
+   int thr_id = mythr->id;  // thr_id arg is deprecated
   if ( opt_benchmark )
      HTarget = 0x7f;

-   swab32_array( edata, pdata, 20 );
-   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
+   mm128_bswap32_intrlv80_4x32( vdata, pdata );
   blake256r8_4way_init( &blakecoin_4w_ctx );
   blake256r8_4way( &blakecoin_4w_ctx, vdata, 64 );

-   uint32_t *noncep = vdata + 76;   // 19*4
   do {
-      be32enc( noncep,    n   );
-      be32enc( noncep +1, n+1 );
-      be32enc( noncep +2, n+2 );
-      be32enc( noncep +3, n+3 );
+      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
      pdata[19] = n;
      blakecoin_4way_hash( hash, vdata );

      for ( int i = 0; i < 4; i++ )
-      if (  (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) )
+      if (  (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget )
+           && !opt_benchmark )
      {
          pdata[19] = n+i;
-          nonces[ num_found++ ] = n+i;
-          work_set_target_ratio( work, hash+(i<<3) );
+          submit_lane_solution( work, hash+(i<<3), mythr, i );
      }
      n += 4;

-   } while ( (num_found == 0) && (n < max_nonce) 
-             && !work_restart[thr_id].restart );
+   } while ( (n < max_nonce) && !work_restart[thr_id].restart );

   *hashes_done = n - first_nonce + 1;
-   return num_found;
+   return 0;
 }

 #endif
@@ -81,13 +74,12 @@ void blakecoin_8way_hash( void *state, const void *input )
     blake256r8_8way( &ctx, input + (64<<3), 16 );
     blake256r8_8way_close( &ctx, vhash );

-     mm256_deinterleave_8x32( state,     state+ 32, state+ 64, state+ 96,
-                              state+128, state+160, state+192, state+224,
-                              vhash, 256 );
+     dintrlv_8x32( state,     state+ 32, state+ 64, state+ 96, state+128,
+                   state+160, state+192, state+224, vhash, 256 );
 }

-int scanhash_blakecoin_8way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done )
+int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
   uint32_t hash[8*8] __attribute__ ((aligned (32)));
@@ -95,46 +87,34 @@ int scanhash_blakecoin_8way( int thr_id, struct work *work, uint32_t max_nonce,
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   uint32_t HTarget = ptarget[7];
-   uint32_t _ALIGN(32) edata[20];
   uint32_t n = first_nonce;
-   uint32_t *nonces = work->nonces;
-   uint32_t *noncep = vdata + 152;   // 19*8
-   int num_found = 0;
+   __m256i  *noncev = (__m256i*)vdata + 19;   // aligned
+   int thr_id = mythr->id;  // thr_id arg is deprecated
   if ( opt_benchmark )
      HTarget = 0x7f;

-   // we need big endian data...
-   swab32_array( edata, pdata, 20 );
-   mm256_interleave_8x32( vdata, edata, edata, edata, edata,
-                                 edata, edata, edata, edata, 640 );
+   mm256_bswap32_intrlv80_8x32( vdata, pdata );
   blake256r8_8way_init( &blakecoin_8w_ctx );
   blake256r8_8way( &blakecoin_8w_ctx, vdata, 64 );

   do {
-      be32enc( noncep,    n   );
-      be32enc( noncep +1, n+1 );
-      be32enc( noncep +2, n+2 );
-      be32enc( noncep +3, n+3 );
-      be32enc( noncep +4, n+4 );
-      be32enc( noncep +5, n+5 );
-      be32enc( noncep +6, n+6 );
-      be32enc( noncep +7, n+7 );
+      *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
+                                                  n+3, n+2, n+1, n ) );
      pdata[19] = n;
      blakecoin_8way_hash( hash, vdata );

      for ( int i = 0; i < 8; i++ )
-      if (  (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) )
+      if (  (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget )
+          && !opt_benchmark )
      {
          pdata[19] = n+i;
-          nonces[ num_found++ ] = n+i;
-          work_set_target_ratio( work, hash+(i<<3) );
+          submit_lane_solution( work, hash+(i<<3), mythr, i );
      }
      n += 8;
-   } while ( (num_found == 0) && (n < max_nonce)
-             && !work_restart[thr_id].restart );
+   } while ( (n < max_nonce) && !work_restart[thr_id].restart );

   *hashes_done = n - first_nonce + 1;
-   return num_found;
+   return 0;
 }

 #endif
--- a/algo/blake/blakecoin-gate.h
+++ b/algo/blake/blakecoin-gate.h
@@ -13,18 +13,18 @@

 #if defined (BLAKECOIN_8WAY)
 void blakecoin_8way_hash(void *state, const void *input);
-int scanhash_blakecoin_8way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
+int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
 #endif

 #if defined (BLAKECOIN_4WAY)
 void blakecoin_4way_hash(void *state, const void *input);
-int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
+int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
 #endif

 void blakecoinhash( void *state, const void *input );
-int scanhash_blakecoin( int thr_id, struct work *work, uint32_t max_nonce,
-                      uint64_t *hashes_done );
+int scanhash_blakecoin( struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done, struct thr_info *mythr );

 #endif
--- a/algo/blake/blakecoin.c
+++ b/algo/blake/blakecoin.c
@@ -39,13 +39,14 @@ void blakecoinhash( void *state, const void *input )
 	memcpy( state, hash, 32 );
 }

-int scanhash_blakecoin( int thr_id, struct work *work, uint32_t max_nonce,
-                        uint64_t *hashes_done )
+int scanhash_blakecoin( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;
 	const uint32_t first_nonce = pdata[19];
 	uint32_t HTarget = ptarget[7];
+   int thr_id = mythr->id;  // thr_id arg is deprecated

 	uint32_t _ALIGN(32) hash64[8];
 	uint32_t _ALIGN(32) endiandata[20];
--- a/algo/blake/decred-4way.c
+++ b/algo/blake/decred-4way.c
@@ -23,11 +23,11 @@ void decred_hash_4way( void *state, const void *input )
     memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
     blake256_4way( &ctx, tail, tail_len );
     blake256_4way_close( &ctx, vhash );
-     mm128_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
+     dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
 }

-int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                          uint64_t *hashes_done)
+int scanhash_decred_4way( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t vdata[48*4] __attribute__ ((aligned (64)));
   uint32_t hash[8*4] __attribute__ ((aligned (32)));
@@ -37,14 +37,13 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
   const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
   uint32_t n = first_nonce;
   const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
-   uint32_t *nonces = work->nonces;
-   int num_found = 0;
+   int thr_id = mythr->id;  // thr_id arg is deprecated

   // copy to buffer guaranteed to be aligned.
   memcpy( edata, pdata, 180 );

   // use the old way until  new way updated for size.
-   mm128_interleave_4x32x( vdata, edata, edata, edata, edata, 180*8 );
+   mm128_intrlv_4x32x( vdata, edata, edata, edata, edata, 180*8 );

   blake256_4way_init( &blake_mid );
   blake256_4way( &blake_mid, vdata, DECRED_MIDSTATE_LEN );
@@ -59,18 +58,17 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
      decred_hash_4way( hash, vdata );

      for ( int i = 0; i < 4; i++ )
-      if (  (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) )
+      if (  (hash+(i<<3))[7] <= HTarget )
+      if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
      {
          pdata[DECRED_NONCE_INDEX] = n+i;
-          nonces[ num_found++ ] = n+i;
-          work_set_target_ratio( work, hash+(i<<3) );
+          submit_lane_solution( work, hash+(i<<3), mythr, i );
      }
      n += 4;
-  } while ( (num_found == 0) && (n < max_nonce) 
-            && !work_restart[thr_id].restart );
+  } while ( (n < max_nonce) && !work_restart[thr_id].restart );

  *hashes_done = n - first_nonce + 1;
-  return num_found;
+  return 0;
 }

 #endif
--- a/algo/blake/decred-gate.h
+++ b/algo/blake/decred-gate.h
@@ -14,7 +14,7 @@

 #if defined (__AVX2__) 
 //void blakehash_84way(void *state, const void *input);
-//int scanhash_blake_8way( int thr_id, struct work *work, uint32_t max_nonce,
+//int scanhash_blake_8way( struct work *work, uint32_t max_nonce,
 //                         uint64_t *hashes_done );
 #endif

@@ -24,13 +24,13 @@

 #if defined (DECRED_4WAY)
 void decred_hash_4way(void *state, const void *input);
-int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                          uint64_t *hashes_done );
+int scanhash_decred_4way( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr );
 #endif

 void decred_hash( void *state, const void *input );
-int scanhash_decred( int thr_id, struct work *work, uint32_t max_nonce,
-                     uint64_t *hashes_done );
+int scanhash_decred( struct work *work, uint32_t max_nonce,
+                     uint64_t *hashes_done, struct thr_info *mythr );

 #endif

--- a/algo/blake/decred.c
+++ b/algo/blake/decred.c
@@ -52,12 +52,14 @@ void decred_hash_simple(void *state, const void *input)
        sph_blake256_close(&ctx, state);
 }

-int scanhash_decred(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
+int scanhash_decred( struct work *work, uint32_t max_nonce,
+               uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t _ALIGN(64) endiandata[48];
        uint32_t _ALIGN(64) hash32[8];
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;
+   int thr_id = mythr->id;  // thr_id arg is deprecated

 //        #define DCR_NONCE_OFT32 35

--- a/algo/blake/pentablake-4way.c
+++ b/algo/blake/pentablake-4way.c
@@ -10,13 +10,8 @@
 #include "blake-hash-4way.h"
 #include "sph_blake.h"

-//#define DEBUG_ALGO
-
 extern void pentablakehash_4way( void *output, const void *input )
 {
-	unsigned char _ALIGN(32) hash[128];
-//	// same as uint32_t hashA[16], hashB[16];
-//	#define hashB hash+64

     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
@@ -30,21 +25,6 @@ extern void pentablakehash_4way( void *output, const void *input )
     blake512_4way( &ctx, input, 80 );
     blake512_4way_close( &ctx, vhash );

-uint64_t sin0[10], sin1[10], sin2[10], sin3[10];
-mm256_deinterleave_4x64( sin0, sin1, sin2, sin3, input, 640 );
-sph_blake512_context ctx2_blake;
-sph_blake512_init(&ctx2_blake);
-sph_blake512(&ctx2_blake, sin0, 80);
-sph_blake512_close(&ctx2_blake, (void*) hash);
-
-mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
-uint64_t* hash64 = (uint64_t*)hash;
-for( int i = 0; i < 8; i++ )
-{
-   if ( hash0[i] != hash64[i] )
-      printf("hash mismatch %u\n",i);
-}
-
     blake512_4way_init( &ctx );
     blake512_4way( &ctx, vhash, 64 );
     blake512_4way_close( &ctx, vhash );
@@ -61,46 +41,16 @@ for( int i = 0; i < 8; i++ )
     blake512_4way( &ctx, vhash, 64 );
     blake512_4way_close( &ctx, vhash );
 
-     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     // Split vhash into hash0..hash3, which the memcpy calls below read.
+     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     memcpy( output,    hash0, 32 );
     memcpy( output+32, hash1, 32 );
     memcpy( output+64, hash2, 32 );
     memcpy( output+96, hash3, 32 );
-
-/*
-     uint64_t sin0[10] __attribute__ ((aligned (64)));
-     uint64_t sin1[10] __attribute__ ((aligned (64)));
-     uint64_t sin2[10] __attribute__ ((aligned (64)));
-     uint64_t sin3[10] __attribute__ ((aligned (64)));
-
-	sph_blake512_context     ctx_blake;
-
-	sph_blake512_init(&ctx_blake);
-	sph_blake512(&ctx_blake, input, 80);
-	sph_blake512_close(&ctx_blake, hash);
-
-        sph_blake512_init(&ctx_blake);
-	sph_blake512(&ctx_blake, hash, 64);
-	sph_blake512_close(&ctx_blake, hash);
-
-        sph_blake512_init(&ctx_blake);
-	sph_blake512(&ctx_blake, hash, 64);
-	sph_blake512_close(&ctx_blake, hash);
-
-        sph_blake512_init(&ctx_blake);
-	sph_blake512(&ctx_blake, hash, 64);
-	sph_blake512_close(&ctx_blake, hash);
-
-        sph_blake512_init(&ctx_blake);
-	sph_blake512(&ctx_blake, hash, 64);
-	sph_blake512_close(&ctx_blake, hash);
-
-	memcpy(output, hash, 32);
-*/
 }

-int scanhash_pentablake_4way( int thr_id, struct work *work,
-                              uint32_t max_nonce, uint64_t *hashes_done )
+int scanhash_pentablake_4way( struct work *work,
+      uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
 {
    uint32_t hash[4*8] __attribute__ ((aligned (64)));
    uint32_t vdata[20*4] __attribute__ ((aligned (64)));
@@ -110,9 +58,8 @@ int scanhash_pentablake_4way( int thr_id, struct work *work,
    uint32_t n = pdata[19] - 1;
    const uint32_t first_nonce = pdata[19];
    const uint32_t Htarg = ptarget[7];
-    uint32_t *nonces = work->nonces;
-    int num_found = 0;
    uint32_t *noncep = vdata + 73;   // 9*8 + 1
+    int thr_id = mythr->id;  // thr_id arg is deprecated

 //    uint32_t _ALIGN(32) hash64[8];
 //    uint32_t _ALIGN(32) endiandata[32];
@@ -138,7 +85,7 @@ int scanhash_pentablake_4way( int thr_id, struct work *work,
    swab32_array( endiandata, pdata, 20 );

    uint64_t *edata = (uint64_t*)endiandata;
-    mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+    intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );

    for ( int m=0; m < 6; m++ )
    {
@@ -155,10 +102,10 @@ int scanhash_pentablake_4way( int thr_id, struct work *work,

              for ( int i = 0; i < 4; i++ )
              if ( !( (hash+(i<<3))[7] & mask )
-                  && fulltest( hash+(i<<3), ptarget ) )
+                  && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
              {
-                 nonces[ num_found++ ] = n+i;
-                 work_set_target_ratio( work, hash+(i<<3) );
+                 pdata[19] = n + i;
+                 submit_lane_solution( work, hash+(i<<3), mythr, i );
              }
              n += 4;

--- a/algo/blake/pentablake-gate.h
+++ b/algo/blake/pentablake-gate.h
@@ -10,12 +10,12 @@

 #if defined(PENTABLAKE_4WAY)
 void pentablakehash_4way( void *state, const void *input );
-int scanhash_pentablake_4way( int thr_id, struct work *work,
-                              uint32_t max_nonce, uint64_t *hashes_done );
+int scanhash_pentablake_4way( struct work *work,
+           uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr );
 #endif

 void pentablakehash( void *state, const void *input );
-int scanhash_pentablake( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
+int scanhash_pentablake( struct work *work, uint32_t max_nonce,
+            uint64_t *hashes_done, struct thr_info *mythr );
 #endif

--- a/algo/blake/pentablake.c
+++ b/algo/blake/pentablake.c
@@ -40,8 +40,8 @@ extern void pentablakehash(void *output, const void *input)

 }

-int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_nonce,
-      uint64_t *hashes_done)
+int scanhash_pentablake( struct work *work, uint32_t max_nonce,
+      uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;
@@ -49,6 +49,7 @@ int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_nonce,
 	uint32_t n = pdata[19] - 1;
 	const uint32_t first_nonce = pdata[19];
 	const uint32_t Htarg = ptarget[7];
+   int thr_id = mythr->id;  // thr_id arg is deprecated

 	uint32_t _ALIGN(32) hash64[8];
 	uint32_t _ALIGN(32) endiandata[32];
--- a/algo/bmw/bmw-hash-4way.h
+++ b/algo/bmw/bmw-hash-4way.h
@@ -43,7 +43,7 @@ extern "C"{
 #include <stddef.h>

 #include "algo/sha/sph_types.h"
-#include "avxdefs.h"
+#include "simd-utils.h"

 #define SPH_SIZE_bmw256   256

--- a/algo/bmw/bmw256-hash-4way.c
+++ b/algo/bmw/bmw256-hash-4way.c
@@ -113,50 +113,27 @@ static const uint32_t IV256[] = {


 #define expand1s( qt, M, H, i ) \
-   _mm_add_epi32( \
-      _mm_add_epi32( \
-         _mm_add_epi32( \
-             _mm_add_epi32( \
-                _mm_add_epi32( ss1( qt[ (i)-16 ] ), \
-                               ss2( qt[ (i)-15 ] ) ), \
-                _mm_add_epi32( ss3( qt[ (i)-14 ] ), \
-                               ss0( qt[ (i)-13 ] ) ) ), \
-             _mm_add_epi32( \
-                _mm_add_epi32( ss1( qt[ (i)-12 ] ), \
-                               ss2( qt[ (i)-11 ] ) ), \
-                _mm_add_epi32( ss3( qt[ (i)-10 ] ), \
-                               ss0( qt[ (i)- 9 ] ) ) ) ), \
-         _mm_add_epi32( \
-             _mm_add_epi32( \
-                _mm_add_epi32( ss1( qt[ (i)- 8 ] ), \
-                               ss2( qt[ (i)- 7 ] ) ), \
-                _mm_add_epi32( ss3( qt[ (i)- 6 ] ), \
-                               ss0( qt[ (i)- 5 ] ) ) ), \
-             _mm_add_epi32( \
-                _mm_add_epi32( ss1( qt[ (i)- 4 ] ), \
-                               ss2( qt[ (i)- 3 ] ) ), \
-                _mm_add_epi32( ss3( qt[ (i)- 2 ] ), \
-                               ss0( qt[ (i)- 1 ] ) ) ) ) ), \
+   _mm_add_epi32(  mm128_add4_32( \
+            mm128_add4_32( ss1( qt[ (i)-16 ] ), ss2( qt[ (i)-15 ] ), \
+                           ss3( qt[ (i)-14 ] ), ss0( qt[ (i)-13 ] ) ), \
+            mm128_add4_32( ss1( qt[ (i)-12 ] ), ss2( qt[ (i)-11 ] ), \
+                           ss3( qt[ (i)-10 ] ), ss0( qt[ (i)- 9 ] ) ), \
+            mm128_add4_32( ss1( qt[ (i)- 8 ] ), ss2( qt[ (i)- 7 ] ), \
+                           ss3( qt[ (i)- 6 ] ), ss0( qt[ (i)- 5 ] ) ),  \
+            mm128_add4_32( ss1( qt[ (i)- 4 ] ), ss2( qt[ (i)- 3 ] ), \
+                           ss3( qt[ (i)- 2 ] ), ss0( qt[ (i)- 1 ] ) ) ), \
      add_elt_s( M, H, (i)-16 ) )

 #define expand2s( qt, M, H, i) \
-   _mm_add_epi32( \
-      _mm_add_epi32( \
-         _mm_add_epi32( \
-             _mm_add_epi32( \
-                _mm_add_epi32( qt[ (i)-16 ], rs1( qt[ (i)-15 ] ) ), \
-                _mm_add_epi32( qt[ (i)-14 ], rs2( qt[ (i)-13 ] ) ) ), \
-             _mm_add_epi32( \
-                _mm_add_epi32( qt[ (i)-12 ], rs3( qt[ (i)-11 ] ) ), \
-                _mm_add_epi32( qt[ (i)-10 ], rs4( qt[ (i)- 9 ] ) ) ) ), \
-         _mm_add_epi32( \
-             _mm_add_epi32( \
-                _mm_add_epi32( qt[ (i)- 8 ], rs5( qt[ (i)- 7 ] ) ), \
-                _mm_add_epi32( qt[ (i)- 6 ], rs6( qt[ (i)- 5 ] ) ) ), \
-             _mm_add_epi32( \
-                _mm_add_epi32( qt[ (i)- 4 ], rs7( qt[ (i)- 3 ] ) ), \
-                _mm_add_epi32( ss4( qt[ (i)- 2 ] ), \
-                               ss5( qt[ (i)- 1 ] ) ) ) ) ), \
+   _mm_add_epi32( mm128_add4_32( \
+            mm128_add4_32( qt[ (i)-16 ], rs1( qt[ (i)-15 ] ), \
+                           qt[ (i)-14 ], rs2( qt[ (i)-13 ] ) ), \
+            mm128_add4_32( qt[ (i)-12 ], rs3( qt[ (i)-11 ] ), \
+                           qt[ (i)-10 ], rs4( qt[ (i)- 9 ] ) ), \
+            mm128_add4_32( qt[ (i)- 8 ], rs5( qt[ (i)- 7 ] ), \
+                           qt[ (i)- 6 ], rs6( qt[ (i)- 5 ] ) ), \
+            mm128_add4_32( qt[ (i)- 4 ], rs7( qt[ (i)- 3 ] ), \
+                           ss4( qt[ (i)- 2 ] ), ss5( qt[ (i)- 1 ] ) ) ), \
      add_elt_s( M, H, (i)-16 ) )

 #define Ws0 \
@@ -357,17 +334,11 @@ void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
   qt[30] = expand2s( qt, M, H, 30 );
   qt[31] = expand2s( qt, M, H, 31 );

-   xl = _mm_xor_si128(
-              _mm_xor_si128( _mm_xor_si128( qt[16], qt[17] ),
-                             _mm_xor_si128( qt[18], qt[19] ) ),
-              _mm_xor_si128( _mm_xor_si128( qt[20], qt[21] ),
-                             _mm_xor_si128( qt[22], qt[23] ) ) );
-   xh = _mm_xor_si128( xl,
-             _mm_xor_si128(
-                 _mm_xor_si128( _mm_xor_si128( qt[24], qt[25] ),
-                                   _mm_xor_si128( qt[26], qt[27] ) ),
-                 _mm_xor_si128( _mm_xor_si128( qt[28], qt[29] ),
-                                   _mm_xor_si128( qt[30], qt[31] ) )));
+   xl = _mm_xor_si128( mm128_xor4( qt[16], qt[17], qt[18], qt[19] ),
+                       mm128_xor4( qt[20], qt[21], qt[22], qt[23] ) );
+   xh = _mm_xor_si128( xl, _mm_xor_si128(
+                             mm128_xor4( qt[24], qt[25], qt[26], qt[27] ),
+                             mm128_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );

   dH[ 0] = _mm_add_epi32(
                 _mm_xor_si128( M[0],
@@ -537,6 +508,8 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
      }
   }
   sc->ptr = ptr;
+
+
   if ( h1 != sc->H )
        memcpy_128( sc->H, h1, 16 );
 }
@@ -571,6 +544,7 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,

   for ( u = 0; u < 16; u ++ )
      buf[u] = h2[u];
+
   compress_small( buf, (__m128i*)final_s, h1 );

   for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
@@ -692,22 +666,15 @@ bmw256_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)

 #define expand2s8( qt, M, H, i) \
   _mm256_add_epi32( \
-      _mm256_add_epi32( \
-         _mm256_add_epi32( \
-             _mm256_add_epi32( \
-                _mm256_add_epi32( qt[ (i)-16 ], r8s1( qt[ (i)-15 ] ) ), \
-                _mm256_add_epi32( qt[ (i)-14 ], r8s2( qt[ (i)-13 ] ) ) ), \
-             _mm256_add_epi32( \
-                _mm256_add_epi32( qt[ (i)-12 ], r8s3( qt[ (i)-11 ] ) ), \
-                _mm256_add_epi32( qt[ (i)-10 ], r8s4( qt[ (i)- 9 ] ) ) ) ), \
-         _mm256_add_epi32( \
-             _mm256_add_epi32( \
-                _mm256_add_epi32( qt[ (i)- 8 ], r8s5( qt[ (i)- 7 ] ) ), \
-                _mm256_add_epi32( qt[ (i)- 6 ], r8s6( qt[ (i)- 5 ] ) ) ), \
-             _mm256_add_epi32( \
-                _mm256_add_epi32( qt[ (i)- 4 ], r8s7( qt[ (i)- 3 ] ) ), \
-                _mm256_add_epi32( s8s4( qt[ (i)- 2 ] ), \
-                                  s8s5( qt[ (i)- 1 ] ) ) ) ) ), \
+      mm256_add4_32( \
+          mm256_add4_32( qt[ (i)-16 ], r8s1( qt[ (i)-15 ] ), \
+                         qt[ (i)-14 ], r8s2( qt[ (i)-13 ] ) ), \
+          mm256_add4_32( qt[ (i)-12 ], r8s3( qt[ (i)-11 ] ), \
+                         qt[ (i)-10 ], r8s4( qt[ (i)- 9 ] ) ), \
+          mm256_add4_32( qt[ (i)- 8 ], r8s5( qt[ (i)- 7 ] ), \
+                         qt[ (i)- 6 ], r8s6( qt[ (i)- 5 ] ) ), \
+          mm256_add4_32( qt[ (i)- 4 ], r8s7( qt[ (i)- 3 ] ), \
+                         s8s4( qt[ (i)- 2 ] ), s8s5( qt[ (i)- 1 ] ) ) ), \
      add_elt_s8( M, H, (i)-16 ) )


@@ -910,16 +877,11 @@ void compress_small_8way( const __m256i *M, const __m256i H[16],
   qt[31] = expand2s8( qt, M, H, 31 );

   xl = _mm256_xor_si256(
-              _mm256_xor_si256( _mm256_xor_si256( qt[16], qt[17] ),
-                                _mm256_xor_si256( qt[18], qt[19] ) ),
-              _mm256_xor_si256( _mm256_xor_si256( qt[20], qt[21] ),
-                                _mm256_xor_si256( qt[22], qt[23] ) ) );
-   xh = _mm256_xor_si256( xl,
-             _mm256_xor_si256(
-                 _mm256_xor_si256( _mm256_xor_si256( qt[24], qt[25] ),
-                                   _mm256_xor_si256( qt[26], qt[27] ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( qt[28], qt[29] ),
-                                   _mm256_xor_si256( qt[30], qt[31] ) )));
+              mm256_xor4( qt[16], qt[17], qt[18], qt[19] ),
+              mm256_xor4( qt[20], qt[21], qt[22], qt[23] ) );
+   xh = _mm256_xor_si256( xl,  _mm256_xor_si256(
+                 mm256_xor4( qt[24], qt[25], qt[26], qt[27] ),
+                 mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );

   dH[ 0] = _mm256_add_epi32(
                 _mm256_xor_si256( M[0],
@@ -1041,22 +1003,22 @@ static const __m256i final_s8[16] =

 void bmw256_8way_init( bmw256_8way_context *ctx )
 {
-   ctx->H[ 0] = _mm256_set1_epi64x( IV256[ 0] );
-   ctx->H[ 1] = _mm256_set1_epi64x( IV256[ 1] );
-   ctx->H[ 2] = _mm256_set1_epi64x( IV256[ 2] );
-   ctx->H[ 3] = _mm256_set1_epi64x( IV256[ 3] );
-   ctx->H[ 4] = _mm256_set1_epi64x( IV256[ 4] );
-   ctx->H[ 5] = _mm256_set1_epi64x( IV256[ 5] );
-   ctx->H[ 6] = _mm256_set1_epi64x( IV256[ 6] );
-   ctx->H[ 7] = _mm256_set1_epi64x( IV256[ 7] );
-   ctx->H[ 8] = _mm256_set1_epi64x( IV256[ 8] );
-   ctx->H[ 9] = _mm256_set1_epi64x( IV256[ 9] );
-   ctx->H[10] = _mm256_set1_epi64x( IV256[10] );
-   ctx->H[11] = _mm256_set1_epi64x( IV256[11] );
-   ctx->H[12] = _mm256_set1_epi64x( IV256[12] );
-   ctx->H[13] = _mm256_set1_epi64x( IV256[13] );
-   ctx->H[14] = _mm256_set1_epi64x( IV256[14] );
-   ctx->H[15] = _mm256_set1_epi64x( IV256[15] );
+   ctx->H[ 0] = _mm256_set1_epi32( IV256[ 0] );
+   ctx->H[ 1] = _mm256_set1_epi32( IV256[ 1] );
+   ctx->H[ 2] = _mm256_set1_epi32( IV256[ 2] );
+   ctx->H[ 3] = _mm256_set1_epi32( IV256[ 3] );
+   ctx->H[ 4] = _mm256_set1_epi32( IV256[ 4] );
+   ctx->H[ 5] = _mm256_set1_epi32( IV256[ 5] );
+   ctx->H[ 6] = _mm256_set1_epi32( IV256[ 6] );
+   ctx->H[ 7] = _mm256_set1_epi32( IV256[ 7] );
+   ctx->H[ 8] = _mm256_set1_epi32( IV256[ 8] );
+   ctx->H[ 9] = _mm256_set1_epi32( IV256[ 9] );
+   ctx->H[10] = _mm256_set1_epi32( IV256[10] );
+   ctx->H[11] = _mm256_set1_epi32( IV256[11] );
+   ctx->H[12] = _mm256_set1_epi32( IV256[12] );
+   ctx->H[13] = _mm256_set1_epi32( IV256[13] );
+   ctx->H[14] = _mm256_set1_epi32( IV256[14] );
+   ctx->H[15] = _mm256_set1_epi32( IV256[15] );
   ctx->ptr       = 0;
   ctx->bit_count = 0;

@@ -1076,14 +1038,15 @@ void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len )
   ptr = ctx->ptr;
   h1 = ctx->H;
   h2 = htmp;
+
   while ( len > 0 )
   {
      size_t clen;
      clen = buf_size - ptr;
      if ( clen > len )
         clen = len;
-      memcpy_256( buf + (ptr>>3), vdata, clen >> 3 );
-      vdata = vdata + (clen>>3);
+      memcpy_256( buf + (ptr>>2), vdata, clen >> 2 );
+      vdata = vdata + (clen>>2);
      len -= clen;
      ptr += clen;
      if ( ptr == buf_size )
@@ -1097,6 +1060,7 @@ void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len )
      }
   }
   ctx->ptr = ptr;
+
   if ( h1 != ctx->H )
        memcpy_256( ctx->H, h1, 16 );
 }
@@ -1106,24 +1070,26 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst )
   __m256i *buf;
   __m256i h1[16], h2[16], *h;
   size_t ptr, u, v;
-//   unsigned z;
   const int buf_size = 64;  // bytes of one lane, compatible with len

   buf = ctx->buf;
   ptr = ctx->ptr;
-   buf[ ptr>>3 ] = _mm256_set1_epi32( 0x80 );
-   ptr += 8;
+   buf[ ptr>>2 ] = _mm256_set1_epi32( 0x80 );
+   ptr += 4;
   h = ctx->H;

-   if (  ptr > (buf_size - 8) )
+   if (  ptr > (buf_size - 4) )
   {
-      memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
+      memset_zero_256( buf + (ptr>>2), (buf_size - ptr) >> 2 );
      compress_small_8way( buf, h, h1 );
      ptr = 0;
      h = h1;
   }
-   memset_zero_256( buf + (ptr>>3), (buf_size - 8 - ptr) >> 3 );
-   buf[ (buf_size - 8) >> 3 ] = _mm256_set1_epi64x( ctx->bit_count );
+   memset_zero_256( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 );
+   buf[ (buf_size - 8) >> 2 ] = _mm256_set1_epi32( ctx->bit_count );
+   buf[ (buf_size - 4) >> 2 ] = m256_zero;
+
+
   compress_small_8way( buf, h, h2 );

   for ( u = 0; u < 16; u ++ )
--- a/algo/bmw/bmw256.c
+++ b/algo/bmw/bmw256.c
@@ -19,14 +19,15 @@ void bmwhash(void *output, const void *input)
 */
 }

-int scanhash_bmw(int thr_id, struct work *work,
-	uint32_t max_nonce, uint64_t *hashes_done)
+int scanhash_bmw( struct work *work, uint32_t max_nonce,
+                  uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;

 	uint32_t _ALIGN(64) hash64[8];
 	uint32_t _ALIGN(64) endiandata[20];
+   int thr_id = mythr->id;

 	const uint32_t Htarg = ptarget[7];
 	const uint32_t first_nonce = pdata[19];
--- a/algo/bmw/bmw512-hash-4way.c
+++ b/algo/bmw/bmw512-hash-4way.c
@@ -569,28 +569,20 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )


 #define sb0(x) \
-   _mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 1), \
-                                       _mm256_slli_epi64( (x), 3) ), \
-                     _mm256_xor_si256( mm256_rol_64( (x),  4), \
-                                       mm256_rol_64( (x), 37) ) )
+   mm256_xor4( _mm256_srli_epi64( (x), 1), _mm256_slli_epi64( (x), 3), \
+                mm256_rol_64(     (x), 4),  mm256_rol_64(     (x),37) )

 #define sb1(x) \
-   _mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 1), \
-                                       _mm256_slli_epi64( (x), 2) ), \
-                     _mm256_xor_si256( mm256_rol_64( (x), 13), \
-                                       mm256_rol_64( (x), 43) ) )
+   mm256_xor4( _mm256_srli_epi64( (x), 1), _mm256_slli_epi64( (x), 2), \
+                mm256_rol_64(     (x),13),  mm256_rol_64(     (x),43) )

 #define sb2(x) \
-   _mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 2), \
-                                       _mm256_slli_epi64( (x), 1) ), \
-                     _mm256_xor_si256( mm256_rol_64( (x), 19), \
-                                       mm256_rol_64( (x), 53) ) )
+   mm256_xor4( _mm256_srli_epi64( (x), 2), _mm256_slli_epi64( (x), 1), \
+                mm256_rol_64(     (x),19),  mm256_rol_64(     (x),53) )

 #define sb3(x) \
-   _mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 2), \
-                                       _mm256_slli_epi64( (x), 2) ), \
-                     _mm256_xor_si256( mm256_rol_64( (x), 28), \
-                                       mm256_rol_64( (x), 59) ) )
+   mm256_xor4( _mm256_srli_epi64( (x), 2), _mm256_slli_epi64( (x), 2), \
+                mm256_rol_64(     (x),28),  mm256_rol_64(     (x),59) )

 #define sb4(x) \
  _mm256_xor_si256( (x), _mm256_srli_epi64( (x), 1 ) )
@@ -618,55 +610,32 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
                             rol_off_64( M, j, 10 ) ), \
            _mm256_set1_epi64x( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \
       H[ ( (j)+7 ) & 0xF ] )
-          
+
+
 #define expand1b( qt, M, H, i ) \
-   _mm256_add_epi64( \
-      _mm256_add_epi64( \
-         _mm256_add_epi64( \
-             _mm256_add_epi64( \
-                _mm256_add_epi64( sb1( qt[ (i)-16 ] ), \
-                                  sb2( qt[ (i)-15 ] ) ), \
-                _mm256_add_epi64( sb3( qt[ (i)-14 ] ), \
-                                  sb0( qt[ (i)-13 ] ) ) ), \
-             _mm256_add_epi64( \
-                _mm256_add_epi64( sb1( qt[ (i)-12 ] ), \
-                                  sb2( qt[ (i)-11 ] ) ), \
-                _mm256_add_epi64( sb3( qt[ (i)-10 ] ), \
-                                  sb0( qt[ (i)- 9 ] ) ) ) ), \
-         _mm256_add_epi64( \
-             _mm256_add_epi64( \
-                _mm256_add_epi64( sb1( qt[ (i)- 8 ] ), \
-                                  sb2( qt[ (i)- 7 ] ) ), \
-                _mm256_add_epi64( sb3( qt[ (i)- 6 ] ), \
-                                  sb0( qt[ (i)- 5 ] ) ) ), \
-             _mm256_add_epi64( \
-                _mm256_add_epi64( sb1( qt[ (i)- 4 ] ), \
-                                  sb2( qt[ (i)- 3 ] ) ), \
-                _mm256_add_epi64( sb3( qt[ (i)- 2 ] ), \
-                                  sb0( qt[ (i)- 1 ] ) ) ) ) ), \
+   _mm256_add_epi64( mm256_add4_64( \
+      mm256_add4_64( sb1( qt[ (i)-16 ] ), sb2( qt[ (i)-15 ] ), \
+                     sb3( qt[ (i)-14 ] ), sb0( qt[ (i)-13 ] )), \
+      mm256_add4_64( sb1( qt[ (i)-12 ] ), sb2( qt[ (i)-11 ] ), \
+                     sb3( qt[ (i)-10 ] ), sb0( qt[ (i)- 9 ] )), \
+      mm256_add4_64( sb1( qt[ (i)- 8 ] ), sb2( qt[ (i)- 7 ] ), \
+                     sb3( qt[ (i)- 6 ] ), sb0( qt[ (i)- 5 ] )), \
+      mm256_add4_64( sb1( qt[ (i)- 4 ] ), sb2( qt[ (i)- 3 ] ), \
+                     sb3( qt[ (i)- 2 ] ), sb0( qt[ (i)- 1 ] ) ) ), \
      add_elt_b( M, H, (i)-16 ) )

 #define expand2b( qt, M, H, i) \
-   _mm256_add_epi64( \
-      _mm256_add_epi64( \
-         _mm256_add_epi64( \
-             _mm256_add_epi64( \
-                _mm256_add_epi64( qt[ (i)-16 ], rb1( qt[ (i)-15 ] ) ), \
-                _mm256_add_epi64( qt[ (i)-14 ], rb2( qt[ (i)-13 ] ) ) ), \
-             _mm256_add_epi64( \
-                _mm256_add_epi64( qt[ (i)-12 ], rb3( qt[ (i)-11 ] ) ), \
-                _mm256_add_epi64( qt[ (i)-10 ], rb4( qt[ (i)- 9 ] ) ) ) ), \
-         _mm256_add_epi64( \
-             _mm256_add_epi64( \
-                _mm256_add_epi64( qt[ (i)- 8 ], rb5( qt[ (i)- 7 ] ) ), \
-                _mm256_add_epi64( qt[ (i)- 6 ], rb6( qt[ (i)- 5 ] ) ) ), \
-             _mm256_add_epi64( \
-                _mm256_add_epi64( qt[ (i)- 4 ], rb7( qt[ (i)- 3 ] ) ), \
-                _mm256_add_epi64( sb4( qt[ (i)- 2 ] ), \
-                                  sb5( qt[ (i)- 1 ] ) ) ) ) ), \
+   _mm256_add_epi64( mm256_add4_64( \
+      mm256_add4_64( qt[ (i)-16 ], rb1( qt[ (i)-15 ] ), \
+                     qt[ (i)-14 ], rb2( qt[ (i)-13 ] ) ), \
+      mm256_add4_64( qt[ (i)-12 ], rb3( qt[ (i)-11 ] ), \
+                     qt[ (i)-10 ], rb4( qt[ (i)- 9 ] ) ), \
+      mm256_add4_64( qt[ (i)- 8 ], rb5( qt[ (i)- 7 ] ), \
+                     qt[ (i)- 6 ], rb6( qt[ (i)- 5 ] ) ), \
+      mm256_add4_64( qt[ (i)- 4 ], rb7( qt[ (i)- 3 ] ), \
+                     sb4( qt[ (i)- 2 ] ), sb5( qt[ (i)- 1 ] ) ) ), \
      add_elt_b( M, H, (i)-16 ) )

-
 #define Wb0 \
   _mm256_add_epi64( \
       _mm256_add_epi64( \
@@ -864,95 +833,90 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
   qt[30] = expand2b( qt, M, H, 30 ); 
   qt[31] = expand2b( qt, M, H, 31 ); 

-   xl = _mm256_xor_si256( 
-              _mm256_xor_si256( _mm256_xor_si256( qt[16], qt[17] ), 
-                                _mm256_xor_si256( qt[18], qt[19] ) ), 
-              _mm256_xor_si256( _mm256_xor_si256( qt[20], qt[21] ), 
-                                _mm256_xor_si256( qt[22], qt[23] ) ) ); 
-   xh = _mm256_xor_si256( xl, 
-             _mm256_xor_si256( 
-                 _mm256_xor_si256( _mm256_xor_si256( qt[24], qt[25] ),
-                                   _mm256_xor_si256( qt[26], qt[27] ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( qt[28], qt[29] ),
-                                   _mm256_xor_si256( qt[30], qt[31] ) )));
+   xl = _mm256_xor_si256(
+           mm256_xor4( qt[16], qt[17], qt[18], qt[19] ), 
+           mm256_xor4( qt[20], qt[21], qt[22], qt[23] ) ); 
+   xh = _mm256_xor_si256( xl, _mm256_xor_si256( 
+           mm256_xor4( qt[24], qt[25], qt[26], qt[27] ),
+           mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );

   dH[ 0] = _mm256_add_epi64(
-                 _mm256_xor_si256( M[0],
-                      _mm256_xor_si256( _mm256_slli_epi64( xh, 5 ),
-                                        _mm256_srli_epi64( qt[16], 5 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[24] ), qt[ 0] ));
+               _mm256_xor_si256( M[0],
+                  _mm256_xor_si256( _mm256_slli_epi64( xh, 5 ),
+                                    _mm256_srli_epi64( qt[16], 5 ) ) ),
+               _mm256_xor_si256( _mm256_xor_si256( xl, qt[24] ), qt[ 0] ) );
   dH[ 1] = _mm256_add_epi64(
-                 _mm256_xor_si256( M[1],
-                      _mm256_xor_si256( _mm256_srli_epi64( xh, 7 ),
-                                        _mm256_slli_epi64( qt[17], 8 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[25] ), qt[ 1] ));
+               _mm256_xor_si256( M[1],
+                  _mm256_xor_si256( _mm256_srli_epi64( xh, 7 ),
+                                    _mm256_slli_epi64( qt[17], 8 ) ) ),
+               _mm256_xor_si256( _mm256_xor_si256( xl, qt[25] ), qt[ 1] ) );
   dH[ 2] = _mm256_add_epi64(
-                 _mm256_xor_si256( M[2],
-                      _mm256_xor_si256( _mm256_srli_epi64( xh, 5 ),
-                                        _mm256_slli_epi64( qt[18], 5 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[26] ), qt[ 2] ));
+               _mm256_xor_si256( M[2],
+                  _mm256_xor_si256( _mm256_srli_epi64( xh, 5 ),
+                                    _mm256_slli_epi64( qt[18], 5 ) ) ),
+               _mm256_xor_si256( _mm256_xor_si256( xl, qt[26] ), qt[ 2] ) );
   dH[ 3] = _mm256_add_epi64(
-                 _mm256_xor_si256( M[3],
-                      _mm256_xor_si256( _mm256_srli_epi64( xh, 1 ),
-                                        _mm256_slli_epi64( qt[19], 5 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[27] ), qt[ 3] ));
+               _mm256_xor_si256( M[3],
+                  _mm256_xor_si256( _mm256_srli_epi64( xh, 1 ),
+                                    _mm256_slli_epi64( qt[19], 5 ) ) ),
+               _mm256_xor_si256( _mm256_xor_si256( xl, qt[27] ), qt[ 3] ) );
   dH[ 4] = _mm256_add_epi64(
-                 _mm256_xor_si256( M[4],
-                      _mm256_xor_si256( _mm256_srli_epi64( xh, 3 ),
-                                        _mm256_slli_epi64( qt[20], 0 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[28] ), qt[ 4] ));
+               _mm256_xor_si256( M[4],
+                  _mm256_xor_si256( _mm256_srli_epi64( xh, 3 ),
+                                    _mm256_slli_epi64( qt[20], 0 ) ) ),
+               _mm256_xor_si256( _mm256_xor_si256( xl, qt[28] ), qt[ 4] ) );
   dH[ 5] = _mm256_add_epi64(
-                 _mm256_xor_si256( M[5],
-                      _mm256_xor_si256( _mm256_slli_epi64( xh, 6 ),
-                                        _mm256_srli_epi64( qt[21], 6 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[29] ), qt[ 5] ));
+               _mm256_xor_si256( M[5],
+                  _mm256_xor_si256( _mm256_slli_epi64( xh, 6 ),
+                                    _mm256_srli_epi64( qt[21], 6 ) ) ),
+               _mm256_xor_si256( _mm256_xor_si256( xl, qt[29] ), qt[ 5] ) );
   dH[ 6] = _mm256_add_epi64(
-                 _mm256_xor_si256( M[6],
-                      _mm256_xor_si256( _mm256_srli_epi64( xh, 4 ),
-                                        _mm256_slli_epi64( qt[22], 6 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[30] ), qt[ 6] ));
+               _mm256_xor_si256( M[6],
+                  _mm256_xor_si256( _mm256_srli_epi64( xh, 4 ),
+                                    _mm256_slli_epi64( qt[22], 6 ) ) ),
+               _mm256_xor_si256( _mm256_xor_si256( xl, qt[30] ), qt[ 6] ) );
   dH[ 7] = _mm256_add_epi64(
-                 _mm256_xor_si256( M[7],
-                      _mm256_xor_si256( _mm256_srli_epi64( xh, 11 ),
-                                        _mm256_slli_epi64( qt[23], 2 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[31] ), qt[ 7] ));
+               _mm256_xor_si256( M[7],
+                  _mm256_xor_si256( _mm256_srli_epi64( xh, 11 ),
+                                    _mm256_slli_epi64( qt[23], 2 ) ) ),
+               _mm256_xor_si256( _mm256_xor_si256( xl, qt[31] ), qt[ 7] ) );
   dH[ 8] = _mm256_add_epi64( _mm256_add_epi64(
-                 mm256_rol_64( dH[4], 9 ),
+              mm256_rol_64( dH[4], 9 ),
                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[24] ), M[ 8] )),
                 _mm256_xor_si256( _mm256_slli_epi64( xl, 8 ),
                                   _mm256_xor_si256( qt[23], qt[ 8] ) ) );
   dH[ 9] = _mm256_add_epi64( _mm256_add_epi64(
-                 mm256_rol_64( dH[5], 10 ),
+              mm256_rol_64( dH[5], 10 ),
                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[25] ), M[ 9] )),
                 _mm256_xor_si256( _mm256_srli_epi64( xl, 6 ),
                                   _mm256_xor_si256( qt[16], qt[ 9] ) ) );
   dH[10] = _mm256_add_epi64( _mm256_add_epi64(
-                 mm256_rol_64( dH[6], 11 ),
+              mm256_rol_64( dH[6], 11 ),
                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[26] ), M[10] )),
                 _mm256_xor_si256( _mm256_slli_epi64( xl, 6 ),
                                   _mm256_xor_si256( qt[17], qt[10] ) ) );
   dH[11] = _mm256_add_epi64( _mm256_add_epi64(
-                 mm256_rol_64( dH[7], 12 ),
+              mm256_rol_64( dH[7], 12 ),
                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[27] ), M[11] )),
                 _mm256_xor_si256( _mm256_slli_epi64( xl, 4 ),
                                   _mm256_xor_si256( qt[18], qt[11] ) ) );
   dH[12] = _mm256_add_epi64( _mm256_add_epi64(
-                 mm256_rol_64( dH[0], 13 ),
+              mm256_rol_64( dH[0], 13 ),
                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[28] ), M[12] )),
                 _mm256_xor_si256( _mm256_srli_epi64( xl, 3 ),
                                   _mm256_xor_si256( qt[19], qt[12] ) ) );
   dH[13] = _mm256_add_epi64( _mm256_add_epi64(
-                 mm256_rol_64( dH[1], 14 ),
+              mm256_rol_64( dH[1], 14 ),
                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[29] ), M[13] )),
                 _mm256_xor_si256( _mm256_srli_epi64( xl, 4 ),
                                   _mm256_xor_si256( qt[20], qt[13] ) ) );
   dH[14] = _mm256_add_epi64( _mm256_add_epi64(
-                 mm256_rol_64( dH[2], 15 ),
+              mm256_rol_64( dH[2], 15 ),
                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[30] ), M[14] )),
                 _mm256_xor_si256( _mm256_srli_epi64( xl, 7 ),
                                   _mm256_xor_si256( qt[21], qt[14] ) ) );
   dH[15] = _mm256_add_epi64( _mm256_add_epi64(
-                 mm256_rol_64( dH[3], 16 ),
+              mm256_rol_64( dH[3], 16 ),
                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )),
                 _mm256_xor_si256( _mm256_srli_epi64( xl, 2 ),
                                   _mm256_xor_si256( qt[22], qt[15] ) ) );
--- a/algo/cryptonight/cryptolight.c
+++ b/algo/cryptonight/cryptolight.c
@@ -242,6 +242,8 @@ void cryptolight_hash(void* output, const void* input, int len) {
 	free(ctx);
 }

+#if defined(__AES__)
+
 static void cryptolight_hash_ctx_aes_ni(void* output, const void* input,
                       int len, struct cryptonight_ctx* ctx)
 {
@@ -312,8 +314,10 @@ static void cryptolight_hash_ctx_aes_ni(void* output, const void* input,
 	oaes_free((OAES_CTX **) &ctx->aes_ctx);
 }

-int scanhash_cryptolight(int thr_id, struct work *work,
-		uint32_t max_nonce, uint64_t *hashes_done)
+#endif
+
+int scanhash_cryptolight( struct work *work,
+		uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr)
 {
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;
@@ -322,6 +326,7 @@ int scanhash_cryptolight(int thr_id, struct work *work,
 	const uint32_t first_nonce = n + 1;
 	//const uint32_t Htarg = ptarget[7];
 	uint32_t _ALIGN(32) hash[HASH_SIZE / 4];
+   int thr_id = mythr->id;

 	struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx));

--- a/algo/cryptonight/cryptonight-common.c
+++ b/algo/cryptonight/cryptonight-common.c
@@ -70,11 +70,12 @@ void cryptonight_hash_suw( void *restrict output, const void *input )

 bool cryptonightV7 = false;

-int scanhash_cryptonight( int thr_id, struct work *work, uint32_t max_nonce,
-                   uint64_t *hashes_done )
+int scanhash_cryptonight( struct work *work, uint32_t max_nonce,
+                   uint64_t *hashes_done, struct thr_info *mythr )
 {
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
+    int thr_id = mythr->id;

    uint32_t *nonceptr = (uint32_t*) (((char*)pdata) + 39);
    uint32_t n = *nonceptr - 1;
--- a/algo/cryptonight/cryptonight.h
+++ b/algo/cryptonight/cryptonight.h
@@ -40,8 +40,8 @@ void cryptonight_hash_ctx(void* output, const void* input, int len);
 void keccakf(uint64_t st[25], int rounds);
 extern void (* const extra_hashes[4])(const void *, size_t, char *);

-int scanhash_cryptonight( int thr_id, struct work *work, uint32_t max_nonce,
-                           uint64_t *hashes_done );
+int scanhash_cryptonight( struct work *work, uint32_t max_nonce,
+                           uint64_t *hashes_done, struct thr_info *mythr );

 void cryptonight_hash_aes( void *restrict output, const void *input, int len );

--- a/algo/cubehash/cube-hash-2way.h
+++ b/algo/cubehash/cube-hash-2way.h
@@ -4,7 +4,7 @@
 #if defined(__AVX2__)

 #include <stdint.h>
-#include "avxdefs.h"
+#include "simd-utils.h"

 // 2x128, 2 way parallel SSE2

--- a/algo/cubehash/cubehash_sse2.c
+++ b/algo/cubehash/cubehash_sse2.c
@@ -13,7 +13,7 @@
 #include <stdbool.h>
 #include <unistd.h>
 #include <memory.h>
-#include "avxdefs.h"
+#include "simd-utils.h"
 #include <stdio.h>

 // The result of hashing 10 rounds of initial data which is params and 
--- a/algo/fugue/sph_fugue.c
+++ b/algo/fugue/sph_fugue.c
@@ -11,6 +11,8 @@ extern "C"{
 #pragma warning (disable: 4146)
 #endif

+#define SPH_FUGUE_NOCOPY 1
+
 static const sph_u32 IV224[] = {
 	SPH_C32(0xf4c9120d), SPH_C32(0x6286f757), SPH_C32(0xee39e01c),
 	SPH_C32(0xe074e3cb), SPH_C32(0xa1127c62), SPH_C32(0x9a43d215),
--- a/algo/groestl/aes_ni/brg_endian.h
+++ b/algo/groestl/aes_ni/brg_endian.h
@@ -43,7 +43,7 @@
 #  if !defined( __MINGW32__ ) && !defined( _AIX )
 #    include <endian.h>
 #    if !defined( __BEOS__ )
-#      include <byteswap.h>
+//#      include <byteswap.h>
 #    endif
 #  endif
 #endif
--- a/algo/groestl/aes_ni/hash-groestl.c
+++ b/algo/groestl/aes_ni/hash-groestl.c
@@ -12,7 +12,7 @@
 #include <memory.h>
 #include "hash-groestl.h"
 #include "miner.h"
-#include "avxdefs.h"
+#include "simd-utils.h"

 #ifndef NO_AES_NI

--- a/algo/groestl/aes_ni/hash-groestl256.c
+++ b/algo/groestl/aes_ni/hash-groestl256.c
@@ -9,7 +9,7 @@
 #include <memory.h>
 #include "hash-groestl256.h"
 #include "miner.h"
-#include "avxdefs.h"
+#include "simd-utils.h"

 #ifndef NO_AES_NI

--- a/algo/groestl/groestl.c
+++ b/algo/groestl/groestl.c
@@ -56,14 +56,15 @@ void groestlhash( void *output, const void *input )
     memcpy(output, hash, 32);
 }

-int scanhash_groestl( int thr_id, struct work *work, uint32_t max_nonce,
-                      uint64_t *hashes_done )
+int scanhash_groestl( struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;
        uint32_t endiandata[20] __attribute__ ((aligned (64)));
 	const uint32_t first_nonce = pdata[19];
 	uint32_t nonce = first_nonce;
+   int thr_id = mythr->id;  // thr_id arg is deprecated

 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x0000ff;
--- a/algo/groestl/myr-groestl.c
+++ b/algo/groestl/myr-groestl.c
@@ -54,8 +54,8 @@ void myriad_hash(void *output, const void *input)
 	memcpy(output, hash, 32);
 }

-int scanhash_myriad(int thr_id, struct work *work,
-	uint32_t max_nonce, uint64_t *hashes_done)
+int scanhash_myriad( struct work *work,
+	uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr)
 {
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;
@@ -63,6 +63,7 @@ int scanhash_myriad(int thr_id, struct work *work,
 	uint32_t _ALIGN(64) endiandata[20];
 	const uint32_t first_nonce = pdata[19];
 	uint32_t nonce = first_nonce;
+   int thr_id = mythr->id;  // thr_id arg is deprecated

 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x0000ff;
--- a/algo/groestl/myrgr-4way.c
+++ b/algo/groestl/myrgr-4way.c
@@ -33,7 +33,7 @@ void myriad_4way_hash( void *output, const void *input )
     myrgr_4way_ctx_holder ctx;
     memcpy( &ctx, &myrgr_4way_ctx, sizeof(myrgr_4way_ctx) );

-     mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, input, 640 );
+     dintrlv_4x32( hash0, hash1, hash2, hash3, input, 640 );

     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 640 );
     memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
@@ -43,66 +43,52 @@ void myriad_4way_hash( void *output, const void *input )
     memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 640 );

-     mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );

     sha256_4way( &ctx.sha, vhash, 64 );
-     sha256_4way_close( &ctx.sha, vhash );
-
-     mm128_deinterleave_4x32( output, output+32, output+64, output+96,
-                           vhash, 256 );
+     sha256_4way_close( &ctx.sha, output );
 }

-int scanhash_myriad_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                          uint64_t *hashes_done )
+int scanhash_myriad_4way( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t hash[8*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-   uint32_t _ALIGN(64) edata[20];
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+   uint32_t *hash7 = &(hash[7<<2]);
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
-   uint32_t *nonces = work->nonces;
-   int num_found = 0;
-   uint32_t *noncep = vdata + 76; // 19*4
+   __m128i  *noncev = (__m128i*)vdata + 19;   // aligned
+   int thr_id = mythr->id;  // thr_id arg is deprecated

-/*
-        uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
-
-	uint32_t _ALIGN(64) endiandata[20];
-	const uint32_t first_nonce = pdata[19];
-	uint32_t nonce = first_nonce;
-*/
   if ( opt_benchmark )
      ( (uint32_t*)ptarget )[7] = 0x0000ff;

-   swab32_array( edata, pdata, 20 );
-   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
-
+   mm128_bswap32_intrlv80_4x32( vdata, pdata );
   do {
-      be32enc( noncep,   n   );
-      be32enc( noncep+1, n+1 );
-      be32enc( noncep+2, n+2 );
-      be32enc( noncep+3, n+3 );
+      *noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );

      myriad_4way_hash( hash, vdata );
      pdata[19] = n;

-      for ( int i = 0; i < 4; i++ )
-      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
+      for ( int lane = 0; lane < 4; lane++ )
+      if ( hash7[ lane ] <= Htarg )
      {
-          pdata[19] = n+i;
-          nonces[ num_found++ ] = n+i;
-          work_set_target_ratio( work, hash+(i<<3) );
+         extr_lane_4x32( lane_hash, hash, lane, 256 );
+         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+         {
+            pdata[19] = n + lane;
+            submit_lane_solution( work, lane_hash, mythr, lane );
+         }
      }
      n += 4;
-   } while ( (num_found == 0) && (n < max_nonce-4)
-                   && !work_restart[thr_id].restart);
+   } while ( (n < max_nonce-4) && !work_restart[thr_id].restart);

   *hashes_done = n - first_nonce + 1;
-   return num_found;
+   return 0;
 }

 #endif
--- a/algo/groestl/myrgr-gate.h
+++ b/algo/groestl/myrgr-gate.h
@@ -12,8 +12,8 @@

 void myriad_4way_hash( void *state, const void *input );

-int scanhash_myriad_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
+int scanhash_myriad_4way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );

 void init_myrgr_4way_ctx();

@@ -21,8 +21,8 @@ void init_myrgr_4way_ctx();

 void myriad_hash( void *state, const void *input );

-int scanhash_myriad( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done );
+int scanhash_myriad( struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done, struct thr_info *mythr );

 void init_myrgr_ctx();

--- a/algo/hamsi/hamsi-hash-4way.c
+++ b/algo/hamsi/hamsi-hash-4way.c
@@ -531,16 +531,17 @@ static const sph_u32 T512[64][16] = {

 #define INPUT_BIG \
 do { \
+  const __m256i zero = _mm256_setzero_si256(); \
  __m256i db = *buf; \
  const sph_u32 *tp = &T512[0][0]; \
-  m0 = m256_zero; \
-  m1 = m256_zero; \
-  m2 = m256_zero; \
-  m3 = m256_zero; \
-  m4 = m256_zero; \
-  m5 = m256_zero; \
-  m6 = m256_zero; \
-  m7 = m256_zero; \
+  m0 = zero; \
+  m1 = zero; \
+  m2 = zero; \
+  m3 = zero; \
+  m4 = zero; \
+  m5 = zero; \
+  m6 = zero; \
+  m7 = zero; \
  for ( int u = 0; u < 64; u++ ) \
  { \
     __m256i dm = _mm256_and_si256( db, m256_one_64 ) ; \
@@ -913,9 +914,7 @@ void hamsi512_4way( hamsi_4way_big_context *sc, const void *data, size_t len )

 void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
 {
-   __m256i *out = (__m256i*)dst;
   __m256i pad[1];
-   size_t u;
   int ch, cl;

   sph_enc32be( &ch, sc->count_high );
@@ -925,8 +924,8 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
                                  0UL, 0x80UL, 0UL, 0x80UL );
   hamsi_big( sc, sc->buf, 1 );
   hamsi_big_final( sc, pad );
-   for ( u = 0; u < 8; u ++ )
-      out[u] = mm256_bswap_32( sc->h[u] );
+
+   mm256_block_bswap_32( (__m256i*)dst, sc->h );
 }

 #ifdef __cplusplus
--- a/algo/hamsi/hamsi-hash-4way.h
+++ b/algo/hamsi/hamsi-hash-4way.h
@@ -40,7 +40,7 @@

 #if defined (__AVX2__)

-#include "avxdefs.h"
+#include "simd-utils.h"

 #ifdef __cplusplus
 extern "C"{
--- a/algo/haval/haval-hash-4way.h
+++ b/algo/haval/haval-hash-4way.h
@@ -69,7 +69,7 @@ extern "C"{

 #include <stddef.h>
 #include "algo/sha/sph_types.h"
-#include "avxdefs.h"
+#include "simd-utils.h"

 #define SPH_SIZE_haval256_5   256

--- a/algo/heavy/bastion.c
+++ b/algo/heavy/bastion.c
@@ -131,12 +131,14 @@ void bastionhash(void *output, const void *input)
 	memcpy(output, hash, 32);
 }

-int scanhash_bastion(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
+int scanhash_bastion( struct work *work, uint32_t max_nonce,
+      uint64_t *hashes_done, struct thr_info *mythr)
 {
 	uint32_t _ALIGN(64) hash32[8];
 	uint32_t _ALIGN(64) endiandata[20];
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
+   int thr_id = mythr->id;  // thr_id arg is deprecated

 	const uint32_t Htarg = ptarget[7];
 	const uint32_t first_nonce = pdata[19];
--- a/algo/heavy/heavy.c
+++ b/algo/heavy/heavy.c
@@ -79,11 +79,12 @@ extern void heavyhash(unsigned char* output, const unsigned char* input, int len

 }

-int scanhash_heavy(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
-                    uint32_t max_nonce, uint64_t *hashes_done)
+int scanhash_heavy( uint32_t *pdata, const uint32_t *ptarget,
+            uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr)
 {
    uint32_t hash[8];
    uint32_t start_nonce = pdata[19];
+    int thr_id = mythr->id;  // thr_id arg is deprecated
    
    do {
        heavyhash((unsigned char *)hash, (unsigned char *)pdata, 80);
--- a/algo/hodl/aes.c
+++ b/algo/hodl/aes.c
@@ -83,7 +83,7 @@ void ExpandAESKey256(__m128i *keys, const __m128i *KeyBuf)
    keys[14] = tmp1;
 }

-#ifdef __SSE4_2__
+#if defined(__SSE4_2__)
 //#ifdef __AVX__

 #define AESENC(i,j) \
@@ -151,7 +151,7 @@ void AES256CBC(__m128i** data, const __m128i** next, __m128i ExpandedKey[][16],
    }
 }

-#else    // NO SSE4.2
+#else    // no SSE4.2 (the __AVX__ guard above is still commented out)

 static inline __m128i AES256Core(__m128i State, const __m128i *ExpandedKey)
 {
--- a/algo/hodl/hodl-gate.c
+++ b/algo/hodl/hodl-gate.c
@@ -143,20 +143,20 @@ bool hodl_do_this_thread( int thr_id )
  return ( thr_id == 0 );
 }

-int hodl_scanhash( int thr_id, struct work* work, uint32_t max_nonce,
-                   uint64_t *hashes_done )
+int hodl_scanhash( struct work* work, uint32_t max_nonce,
+                   uint64_t *hashes_done, struct thr_info *mythr )
 {
 #if defined(__AES__)
-  GenRandomGarbage( (CacheEntry*)hodl_scratchbuf, work->data, thr_id );
+  GenRandomGarbage( (CacheEntry*)hodl_scratchbuf, work->data, mythr->id );  // thread index now comes from mythr
  pthread_barrier_wait( &hodl_barrier );
-  return scanhash_hodl_wolf( thr_id, work, max_nonce, hashes_done );
+  return scanhash_hodl_wolf( work, max_nonce, hashes_done, mythr );  // forward the caller's thread handle
 #endif
  return false;
 }

 bool register_hodl_algo( algo_gate_t* gate )
 {
-#if defined(__AES__)
+#if !defined(__AES__)
  applog( LOG_ERR, "Only CPUs with AES are supported, use legacy version.");
  return false;
 #endif
@@ -166,7 +166,7 @@ bool register_hodl_algo( algo_gate_t* gate )
 //     return false;
 //  }
  pthread_barrier_init( &hodl_barrier, NULL, opt_n_threads );
-  gate->optimizations         = AES_OPT | SSE42_OPT | AVX2_OPT;
+  gate->optimizations         = AES_OPT | AVX_OPT | AVX2_OPT;
  gate->scanhash              = (void*)&hodl_scanhash;
  gate->get_new_work          = (void*)&hodl_get_new_work;
  gate->longpoll_rpc_call     = (void*)&hodl_longpoll_rpc_call;
--- a/algo/hodl/hodl-wolf.c
+++ b/algo/hodl/hodl-wolf.c
@@ -17,7 +17,7 @@ void GenerateGarbageCore( CacheEntry *Garbage, int ThreadID, int ThreadCount,
    const uint32_t StartChunk = ThreadID * Chunk;
    const uint32_t EndChunk   = StartChunk + Chunk;

-#ifdef __SSE4_2__
+#if defined(__SSE4_2__)
 //#ifdef __AVX__
    uint64_t* TempBufs[ SHA512_PARALLEL_N ] ;
    uint64_t* desination[ SHA512_PARALLEL_N ];
@@ -61,13 +61,14 @@ void Rev256(uint32_t *Dest, const uint32_t *Src)
 }
 */

-int scanhash_hodl_wolf( int threadNumber, struct work* work, uint32_t max_nonce,
-                        uint64_t *hashes_done )
+int scanhash_hodl_wolf( struct work* work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr )
 {
-#ifdef __SSE4_2__
+#if defined(__SSE4_2__)
 //#ifdef __AVX__
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
+    int threadNumber = mythr->id;
    CacheEntry *Garbage = (CacheEntry*)hodl_scratchbuf;
    CacheEntry Cache[AES_PARALLEL_N];
    __m128i* data[AES_PARALLEL_N];
@@ -139,7 +140,7 @@ int scanhash_hodl_wolf( int threadNumber, struct work* work, uint32_t max_nonce,
    return(0);


-#else  // no SSE4.2
+#else  // no SSE4.2 (the __AVX__ guard above is still commented out)

    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
@@ -147,6 +148,7 @@ int scanhash_hodl_wolf( int threadNumber, struct work* work, uint32_t max_nonce,
    CacheEntry *Garbage = (CacheEntry*)hodl_scratchbuf;
    CacheEntry Cache;
    uint32_t CollisionCount = 0;
+    int threadNumber = mythr->id;

    swab32_array( BlockHdr, pdata, 20 );
        // Search for pattern in psuedorandom data      
@@ -204,7 +206,7 @@ int scanhash_hodl_wolf( int threadNumber, struct work* work, uint32_t max_nonce,
    *hashes_done = CollisionCount;
    return(0);

-#endif  // SSE4.2 else
+#endif  // __SSE4_2__ else

 }

--- a/algo/hodl/hodl-wolf.h
+++ b/algo/hodl/hodl-wolf.h
@@ -19,8 +19,8 @@ typedef union _CacheEntry
 	__m128i dqwords[GARBAGE_SLICE_SIZE >> 4] __attribute__((aligned(16)));
 } CacheEntry;

-int scanhash_hodl_wolf( int thr_id, struct work* work, uint32_t max_nonce,
-                   uint64_t *hashes_done );
+int scanhash_hodl_wolf( struct work* work, uint32_t max_nonce,
+                   uint64_t *hashes_done, struct thr_info *mythr );

 void GenRandomGarbage( CacheEntry *Garbage, uint32_t *pdata, int thr_id);

--- a/algo/hodl/sha512-avx.h
+++ b/algo/hodl/sha512-avx.h
@@ -23,6 +23,7 @@ typedef struct
   __m256i h[8];
   __m256i w[80];
 #elif defined(__SSE4_2__)
+//#elif defined(__AVX__)
   __m128i h[8];
   __m128i w[80];
 #else
@@ -32,7 +33,8 @@ typedef struct

 #ifdef __AVX2__
 #define SHA512_PARALLEL_N 8
-#elif defined(__SSE$_2__)
+#elif defined(__SSE4_2__)
+//#elif defined(__AVX__)
 #define SHA512_PARALLEL_N 4
 #else
 #define SHA512_PARALLEL_N 1   // dummy value
--- a/algo/hodl/sha512_avx.c
+++ b/algo/hodl/sha512_avx.c
@@ -1,6 +1,6 @@
 #ifndef __AVX2__

-#ifdef __SSE4_2__
+#if defined(__SSE4_2__)
 //#ifdef __AVX__

 //Dependencies
--- a/algo/hodl/wolf-aes.h
+++ b/algo/hodl/wolf-aes.h
@@ -6,7 +6,7 @@

 void ExpandAESKey256(__m128i *keys, const __m128i *KeyBuf);

-#ifdef __SSE4_2__
+#if defined(__SSE4_2__)
 //#ifdef __AVX__

 #define AES_PARALLEL_N 8
--- a/algo/jh/jh-hash-4way.h
+++ b/algo/jh/jh-hash-4way.h
@@ -44,7 +44,7 @@ extern "C"{

 #include <stddef.h>
 #include "algo/sha/sph_types.h"
-#include "avxdefs.h"
+#include "simd-utils.h"

 #define SPH_SIZE_jh256   256

--- a/algo/jh/jha-4way.c
+++ b/algo/jh/jha-4way.c
@@ -3,7 +3,6 @@
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
-//#include "avxdefs.h"

 #if defined(JHA_4WAY)

@@ -13,9 +12,6 @@
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/groestl/aes_ni/hash-groestl.h"

-//static __thread keccak512_4way_context jha_kec_mid
-//                                   __attribute__ ((aligned (64)));
-
 void jha_hash_4way( void *out, const void *input )
 {
    uint64_t hash0[8] __attribute__ ((aligned (64)));
@@ -46,7 +42,7 @@ void jha_hash_4way( void *out, const void *input )
       vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256(
               vh[0], _mm256_set1_epi64x( 1 ) ), m256_zero );

-       mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+       dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
       init_groestl( &ctx_groestl, 64 );
       update_and_final_groestl( &ctx_groestl, (char*)hash0,
                                               (char*)hash0, 512 );
@@ -59,7 +55,7 @@ void jha_hash_4way( void *out, const void *input )
       init_groestl( &ctx_groestl, 64 );
       update_and_final_groestl( &ctx_groestl, (char*)hash3,
                                               (char*)hash3, 512 );
-       mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
+       intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );

       skein512_4way_init( &ctx_skein );
       skein512_4way( &ctx_skein, vhash, 64 );
@@ -77,26 +73,24 @@ void jha_hash_4way( void *out, const void *input )
       jh512_4way_close( &ctx_jh, vhashB );

       for ( int i = 0; i < 8; i++ )
-          vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
+          casti_m256i( out, i ) = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
    }
-
-    mm256_deinterleave_4x64( out, out+32, out+64, out+96, vhash, 256 );
 }

-int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                       uint64_t *hashes_done )
+int scanhash_jha_4way( struct work *work, uint32_t max_nonce,
+                       uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t hash[8*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-   uint32_t endiandata[20] __attribute__((aligned(64)));
+   uint32_t *hash7 = &(hash[25]);
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   const uint32_t Htarg = ptarget[7];
   uint32_t n = pdata[19];
-   uint32_t *nonces = work->nonces;
-   int num_found = 0;
-   uint32_t *noncep = vdata + 73;   // 9*8 + 1
+    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
+   int thr_id = mythr->id;  // thr_id arg is deprecated

   uint64_t htmax[] = {
 		0,
@@ -115,11 +109,7 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
 		0
 	};

-   for ( int i=0; i < 19; i++ )
-      be32enc( &endiandata[i], pdata[i] );
-
-   uint64_t *edata = (uint64_t*)endiandata;
-   mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+   mm256_bswap32_intrlv80_4x64( vdata, pdata );

   for ( int m = 0; m < 6; m++ )
   {
@@ -127,29 +117,27 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
      {
         uint32_t mask = masks[m];
         do {
-              be32enc( noncep,   n   );
-              be32enc( noncep+2, n+1 );
-              be32enc( noncep+4, n+2 );
-              be32enc( noncep+6, n+3 );
+              *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );

              jha_hash_4way( hash, vdata );
              pdata[19] = n;

-              for ( int i = 0; i < 4; i++ )
-              if ( ( !( (hash+(i<<3))[7] & mask ) == 0 )
-                  && fulltest( hash+(i<<3), ptarget ) )
+              for ( int i = 0; i < 4; i++ ) if ( !( (hash7[i] & mask ) == 0 ) )
              {
-                 pdata[19] = n;
-                 nonces[ num_found++ ] = n+i;
-                 work_set_target_ratio( work, hash+(i<<3) );
+                 extr_lane_4x64( lane_hash, hash, i, 256 );
+                 if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
+                 {
+                    pdata[19] = n+i;
+                    submit_lane_solution( work, lane_hash, mythr, i );
+                 }
              }
              n += 4;
-         } while ( ( num_found == 0 ) && ( n < max_nonce )
-                     && !work_restart[thr_id].restart );
+         } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
         break;
      }
   }
   *hashes_done = n - first_nonce + 1;
-   return num_found;
+   return 0;
 }
 #endif
--- a/algo/jh/jha-gate.h
+++ b/algo/jh/jha-gate.h
@@ -12,14 +12,14 @@
 #if defined JHA_4WAY
 void jha_hash_4way( void *state, const void *input );

-int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                       uint64_t *hashes_done );
+int scanhash_jha_4way( struct work *work, uint32_t max_nonce,
+                       uint64_t *hashes_done, struct thr_info *mythr );
 #endif

 void jha_hash( void *state, const void *input );

-int scanhash_jha( int thr_id, struct work *work, uint32_t max_nonce,
-                     uint64_t *hashes_done );
+int scanhash_jha( struct work *work, uint32_t max_nonce,
+                     uint64_t *hashes_done, struct thr_info *mythr );

 #endif

--- a/algo/jh/jha.c
+++ b/algo/jh/jha.c
@@ -81,7 +81,8 @@ void jha_hash(void *output, const void *input)
 	memcpy(output, hash, 32);
 }

-int scanhash_jha(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
+int scanhash_jha( struct work *work, uint32_t max_nonce,
+                  uint64_t *hashes_done, struct thr_info *mythr )
 {
 	uint32_t _ALIGN(128) hash32[8];
 	uint32_t _ALIGN(128) endiandata[20];
@@ -89,7 +90,8 @@ int scanhash_jha(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *ha
 	uint32_t *ptarget = work->target;
 	const uint32_t first_nonce = pdata[19];
 	const uint32_t Htarg = ptarget[7];
-	uint32_t n = pdata[19] - 1;
+   uint32_t n = pdata[19] - 1;
+   int thr_id = mythr->id;  // thr_id arg is deprecated

 	uint64_t htmax[] = {
 		0,
--- a/algo/keccak/keccak-4way.c
+++ b/algo/keccak/keccak-4way.c
@@ -16,55 +16,44 @@ void keccakhash_4way(void *state, const void *input)
    keccak256_4way_close( &ctx, state );
 }

-int scanhash_keccak_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                          uint64_t *hashes_done)
+int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
-   uint32_t hash[8*4] __attribute__ ((aligned (32)));
+   uint32_t hash[16*4] __attribute__ ((aligned (32)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *hash7 = &(hash[25]);   // 3*8+1
-   uint32_t lane_hash[8];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
+   __m256i  *noncev = (__m256i*)vdata + 9;   // 256-bit row 9 of vdata; the lane nonces occupy its odd 32-bit words
 //   const uint32_t Htarg = ptarget[7];
-   uint32_t endiandata[20];
-   uint32_t *nonces = work->nonces;
-   int num_found = 0;
-   uint32_t *noncep = vdata + 73;   // 9*8 + 1
-
-   for ( int i=0; i < 19; i++ ) 
-      be32enc( &endiandata[i], pdata[i] );
-
-   uint64_t *edata = (uint64_t*)endiandata;
-   mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+    int thr_id = mythr->id;  // thread index, used below to poll work_restart[]

+   mm256_bswap32_intrlv80_4x64( vdata, pdata );
   do {
-      be32enc( noncep,   n   );
-      be32enc( noncep+2, n+1 );
-      be32enc( noncep+4, n+2 );
-      be32enc( noncep+6, n+3 );
+       *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
 	
      keccakhash_4way( hash, vdata );

      for ( int lane = 0; lane < 4; lane++ )
      if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
      {
-          mm256_extract_lane_4x64( lane_hash, hash, lane, 256 );
+          extr_lane_4x64( lane_hash, hash, lane, 256 );
          if ( fulltest( lane_hash, ptarget ) )
          {
              pdata[19] = n + lane;
-              nonces[ num_found++ ] = n + lane;
-              work_set_target_ratio( work, lane_hash );
+              submit_lane_solution( work, lane_hash, mythr, lane );
          }
      }
      n += 4;

-   } while ( (num_found == 0) && (n < max_nonce-4)
-                   && !work_restart[thr_id].restart);
+   } while ( (n < max_nonce-4) && !work_restart[thr_id].restart);

   *hashes_done = n - first_nonce + 1;
-   return num_found;
+   return 0;   // always 0: hits are handed to submit_lane_solution() inside the loop
 }

 #endif
--- a/algo/keccak/keccak-gate.h
+++ b/algo/keccak/keccak-gate.h
@@ -11,13 +11,13 @@
 #if defined(KECCAK_4WAY)

 void keccakhash_4way( void *state, const void *input );
-int scanhash_keccak_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
+int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );

 #endif

 void keccakhash( void *state, const void *input );
-int scanhash_keccak( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done );
+int scanhash_keccak( struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done, struct thr_info *mythr );

 #endif
--- a/algo/keccak/keccak-hash-4way.h
+++ b/algo/keccak/keccak-hash-4way.h
@@ -44,7 +44,7 @@ extern "C"{

 #include <stddef.h>
 #include "algo/sha/sph_types.h"
-#include "avxdefs.h"
+#include "simd-utils.h"

 #define SPH_SIZE_keccak256   256

--- a/algo/keccak/keccak.c
+++ b/algo/keccak/keccak.c
@@ -18,14 +18,15 @@ void keccakhash(void *state, const void *input)
 	memcpy(state, hash, 32);
 }

-int scanhash_keccak(int thr_id, struct work *work,
-	uint32_t max_nonce, uint64_t *hashes_done)
+int scanhash_keccak( struct work *work,
+	uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;
 	uint32_t n = pdata[19] - 1;
 	const uint32_t first_nonce = pdata[19];
 	//const uint32_t Htarg = ptarget[7];
+   int thr_id = mythr->id;  // thr_id arg is deprecated

 	uint32_t _ALIGN(32) hash64[8];
 	uint32_t endiandata[32];
--- a/algo/luffa/luffa-hash-2way.c
+++ b/algo/luffa/luffa-hash-2way.c
@@ -24,7 +24,7 @@

 #if defined(__AVX2__)

-#include "avxdefs.h"
+#include "simd-utils.h"

 #define MASK _mm256_set_epi32( 0UL, 0UL, 0UL, 0xffffffffUL, \
                               0UL, 0UL, 0UL, 0xffffffffUL )
--- a/algo/luffa/luffa-hash-2way.h
+++ b/algo/luffa/luffa-hash-2way.h
@@ -24,7 +24,7 @@

 #include <immintrin.h>
 #include "algo/sha/sha3-defs.h"
-#include "avxdefs.h"
+#include "simd-utils.h"

 /* The length of digests*/
 #define DIGEST_BIT_LEN_224 224
--- a/algo/luffa/luffa_for_sse2.c
+++ b/algo/luffa/luffa_for_sse2.c
@@ -20,7 +20,7 @@

 #include <string.h>
 #include <emmintrin.h>
-#include "avxdefs.h"
+#include "simd-utils.h"
 #include "luffa_for_sse2.h"

 #define MULT2(a0,a1) do \
--- a/algo/luffa/sph_luffa.c
+++ b/algo/luffa/sph_luffa.c
@@ -77,6 +77,24 @@ static const sph_u32 V_INIT[5][8] = {
 	}
 };

+#if SPH_LUFFA_PARALLEL
+
+static const sph_u64 RCW010[8] = {   // SPH_LUFFA_PARALLEL build only; low 32 bits of the leading entries equal RC00[] (high halves presumably RC10[] - TODO confirm)
+   SPH_C64(0xb6de10ed303994a6), SPH_C64(0x70f47aaec0e65299),
+   SPH_C64(0x0707a3d46cc33a12), SPH_C64(0x1c1e8f51dc56983e),
+   SPH_C64(0x707a3d451e00108f), SPH_C64(0xaeb285627800423d),
+   SPH_C64(0xbaca15898f5b7882), SPH_C64(0x40a46f3e96e1db12)
+};
+
+static const sph_u64 RCW014[8] = {   // SPH_LUFFA_PARALLEL build only; high 32 bits of the last two entries equal the tail of RC14[] (low halves presumably RC04[] - TODO confirm)
+   SPH_C64(0x01685f3de0337818), SPH_C64(0x05a17cf4441ba90d),
+   SPH_C64(0xbd09caca7f34d442), SPH_C64(0xf4272b289389217f),
+   SPH_C64(0x144ae5cce5a8bce6), SPH_C64(0xfaa7ae2b5274baf4),
+   SPH_C64(0x2e48f1c126889ba7), SPH_C64(0xb923c7049a226e9d)
+};
+
+#else
+
 static const sph_u32 RC00[8] = {
 	SPH_C32(0x303994a6), SPH_C32(0xc0e65299),
 	SPH_C32(0x6cc33a12), SPH_C32(0xdc56983e),
@@ -105,20 +123,18 @@ static const sph_u32 RC14[8] = {
 	SPH_C32(0x2e48f1c1), SPH_C32(0xb923c704)
 };

-#if SPH_LUFFA_PARALLEL
-
-static const sph_u64 RCW010[8] = {
-	SPH_C64(0xb6de10ed303994a6), SPH_C64(0x70f47aaec0e65299),
-	SPH_C64(0x0707a3d46cc33a12), SPH_C64(0x1c1e8f51dc56983e),
-	SPH_C64(0x707a3d451e00108f), SPH_C64(0xaeb285627800423d),
-	SPH_C64(0xbaca15898f5b7882), SPH_C64(0x40a46f3e96e1db12)
+static const sph_u32 RC30[8] = {   // moved up so it sits in the #else (non-SPH_LUFFA_PARALLEL) branch
+   SPH_C32(0xb213afa5), SPH_C32(0xc84ebe95),
+   SPH_C32(0x4e608a22), SPH_C32(0x56d858fe),
+   SPH_C32(0x343b138f), SPH_C32(0xd0ec4e3d),
+   SPH_C32(0x2ceb4882), SPH_C32(0xb3ad2208)
 };

-static const sph_u64 RCW014[8] = {
-	SPH_C64(0x01685f3de0337818), SPH_C64(0x05a17cf4441ba90d),
-	SPH_C64(0xbd09caca7f34d442), SPH_C64(0xf4272b289389217f),
-	SPH_C64(0x144ae5cce5a8bce6), SPH_C64(0xfaa7ae2b5274baf4),
-	SPH_C64(0x2e48f1c126889ba7), SPH_C64(0xb923c7049a226e9d)
+static const sph_u32 RC34[8] = {   // moved up so it sits in the #else (non-SPH_LUFFA_PARALLEL) branch
+   SPH_C32(0xe028c9bf), SPH_C32(0x44756f91),
+   SPH_C32(0x7e8fce32), SPH_C32(0x956548be),
+   SPH_C32(0xfe191be2), SPH_C32(0x3cb226e5),
+   SPH_C32(0x5944a28e), SPH_C32(0xa1c4c355)
 };

 #endif
@@ -137,19 +153,6 @@ static const sph_u32 RC24[8] = {
 	SPH_C32(0x36eda57f), SPH_C32(0x703aace7)
 };

-static const sph_u32 RC30[8] = {
-	SPH_C32(0xb213afa5), SPH_C32(0xc84ebe95),
-	SPH_C32(0x4e608a22), SPH_C32(0x56d858fe),
-	SPH_C32(0x343b138f), SPH_C32(0xd0ec4e3d),
-	SPH_C32(0x2ceb4882), SPH_C32(0xb3ad2208)
-};
-
-static const sph_u32 RC34[8] = {
-	SPH_C32(0xe028c9bf), SPH_C32(0x44756f91),
-	SPH_C32(0x7e8fce32), SPH_C32(0x956548be),
-	SPH_C32(0xfe191be2), SPH_C32(0x3cb226e5),
-	SPH_C32(0x5944a28e), SPH_C32(0xa1c4c355)
-};

 #if SPH_LUFFA_PARALLEL

--- a/algo/lyra2/allium-4way.c
+++ b/algo/lyra2/allium-4way.c
@@ -44,10 +44,11 @@ void allium_4way_hash( void *state, const void *input )
   blake256_4way( &ctx.blake, input + (64<<2), 16 );
   blake256_4way_close( &ctx.blake, vhash32 );

-   mm256_reinterleave_4x64( vhash64, vhash32, 256 );
+   rintrlv_4x32_4x64( vhash64, vhash32, 256 );
   keccak256_4way( &ctx.keccak, vhash64, 32 );
   keccak256_4way_close( &ctx.keccak, vhash64 );
-   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
+
+   dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );

   LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
   LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
@@ -67,52 +68,42 @@ void allium_4way_hash( void *state, const void *input )
   LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
   LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );

-   mm256_interleave_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
+   intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
+
   skein256_4way( &ctx.skein, vhash64, 32 );
   skein256_4way_close( &ctx.skein, vhash64 );
-   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );

-   update_and_final_groestl256( &ctx.groestl, hash0, hash0, 256 );
-   memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, hash1, hash1, 256 );
-   memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, hash2, hash2, 256 );
-   memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, hash3, hash3, 256 );
+   dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );

-   memcpy( state,    hash0, 32 );
-   memcpy( state+32, hash1, 32 );
-   memcpy( state+64, hash2, 32 );
-   memcpy( state+96, hash3, 32 );
+   update_and_final_groestl256( &ctx.groestl, state, hash0, 256 );
+   memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+32, hash1, 256 );
+   memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+64, hash2, 256 );
+   memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+96, hash3, 256 );
 }

-int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_allium_4way( struct work *work, uint32_t max_nonce,
                             uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t hash[8*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-   uint32_t _ALIGN(64) edata[20];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   const uint32_t Htarg = ptarget[7];
   __m128i  *noncev = (__m128i*)vdata + 19;   // aligned
-   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
+   int thr_id = mythr->id;  // thr_id arg is deprecated

   if ( opt_benchmark )
      ( (uint32_t*)ptarget )[7] = 0x0000ff;

-   casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
-   casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
-   casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
-   casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
-   casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
-
-   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
+   mm128_bswap32_intrlv80_4x32( vdata, pdata );
   blake256_4way_init( &allium_4way_ctx.blake );
   blake256_4way( &allium_4way_ctx.blake, vdata, 64 );

@@ -124,16 +115,10 @@ int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,

     for ( int lane = 0; lane < 4; lane++ ) if ( (hash+(lane<<3))[7] <= Htarg )
     {
-        if ( fulltest( hash+(lane<<3), ptarget ) )
+        if ( fulltest( hash+(lane<<3), ptarget ) && !opt_benchmark )
        {
           pdata[19] = n + lane;
-           work_set_target_ratio( work, hash+(lane<<3) );
-           if ( submit_work( mythr, work ) )
-               applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
-                             accepted_share_count + rejected_share_count + 1,
-                             thr_id, lane );
-           else
-               applog( LOG_WARNING, "Failed to submit share." );
+           submit_lane_solution( work, hash+(lane<<3), mythr, lane );
         }
     }
     n += 4;
--- a/algo/lyra2/allium.c
+++ b/algo/lyra2/allium.c
@@ -69,7 +69,7 @@ void allium_hash(void *state, const void *input)
    memcpy(state, hash, 32);
 }

-int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_allium( struct work *work, uint32_t max_nonce,
                     uint64_t *hashes_done, struct thr_info *mythr )
 {
    uint32_t _ALIGN(128) hash[8];
@@ -80,7 +80,7 @@ int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
    const uint32_t Htarg = ptarget[7];
    const uint32_t first_nonce = pdata[19];
    uint32_t nonce = first_nonce;
-   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
+    int thr_id = mythr->id;  // thr_id arg is deprecated

    if ( opt_benchmark )
        ptarget[7] = 0x3ffff;
@@ -94,18 +94,14 @@ int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
    do {
        be32enc( &endiandata[19], nonce );
        allium_hash( hash, endiandata );
-
-        if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
+        if ( hash[7] <= Htarg )
+        if ( fulltest( hash, ptarget ) && !opt_benchmark )
        {
-            work_set_target_ratio( work, hash );
            pdata[19] = nonce;
-            *hashes_done = pdata[19] - first_nonce;
-            return 1;
+            submit_solution( work, hash, mythr );
        }
        nonce++;
-
-    } while (nonce < max_nonce && !work_restart[thr_id].restart);
-
+    } while ( nonce < max_nonce && !work_restart[thr_id].restart );
    pdata[19] = nonce;
    *hashes_done = pdata[19] - first_nonce + 1;
    return 0;
--- a/algo/lyra2/lyra2-gate.c
+++ b/algo/lyra2/lyra2-gate.c
@@ -27,11 +27,15 @@
 // Convert algos that don't yet do so to use dynamic alllocation.
 // Alloc huge pages globally. If ok each thread will create a pointer to
 // its chunk. If fail each thread will use use _mm_alloc for itself. 
+// BLOCK_LEN_BYTES is 768.

 #define LYRA2REV3_NROWS 4
 #define LYRA2REV3_NCOLS 4
-//#define LYRA2REV3_MATRIX_SIZE ((BLOCK_LEN_BYTES)*(LYRA2REV3_NCOLS)* \
-//                                                 (LYRA2REV3_NROWS)*8)
+/*
+#define LYRA2REV3_MATRIX_SIZE ((BLOCK_LEN_BYTES)*(LYRA2REV3_NCOLS)* \
+                                                 (LYRA2REV3_NROWS)*8)
+*/
+
 #define LYRA2REV3_MATRIX_SIZE ((BLOCK_LEN_BYTES)<<4)

 __thread uint64_t* l2v3_wholeMatrix;
@@ -43,7 +47,9 @@ bool lyra2rev3_thread_init()

   int size = (int64_t)ROW_LEN_BYTES * 4; // nRows;
   l2v3_wholeMatrix = _mm_malloc( size, 64 );
-#if defined (LYRA2REV3_4WAY)
+#if defined (LYRA2REV3_8WAY)
+   init_lyra2rev3_8way_ctx();;
+#elif defined (LYRA2REV3_4WAY)
   init_lyra2rev3_4way_ctx();;
 #else
   init_lyra2rev3_ctx();
@@ -53,7 +59,10 @@ bool lyra2rev3_thread_init()

 bool register_lyra2rev3_algo( algo_gate_t* gate )
 {
-#if defined (LYRA2REV3_4WAY)
+#if defined (LYRA2REV3_8WAY)
+  gate->scanhash  = (void*)&scanhash_lyra2rev3_8way;
+  gate->hash      = (void*)&lyra2rev3_8way_hash;
+#elif defined (LYRA2REV3_4WAY)
  gate->scanhash  = (void*)&scanhash_lyra2rev3_4way;
  gate->hash      = (void*)&lyra2rev3_4way_hash;
 #else
@@ -199,13 +208,18 @@ void phi2_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )

 bool register_phi2_algo( algo_gate_t* gate )
 {
-   init_phi2_ctx();
+// init_phi2_ctx() is now called only in the non-PHI2_4WAY branch below.
   gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
   gate->get_work_data_size = (void*)&phi2_get_work_data_size;
   gate->decode_extra_data  = (void*)&phi2_decode_extra_data;
   gate->build_extraheader  = (void*)&phi2_build_extraheader;
   gate->set_target         = (void*)&alt_set_target; 
   gate->get_max64          = (void*)&get_max64_0xffffLL;
+#if defined(PHI2_4WAY)
+   gate->scanhash           = (void*)&scanhash_phi2_4way;
+#else
+   init_phi2_ctx();
   gate->scanhash           = (void*)&scanhash_phi2;
+#endif
   return true;
 }
--- a/algo/lyra2/lyra2-gate.h
+++ b/algo/lyra2/lyra2-gate.h
@@ -6,24 +6,34 @@
 #include "lyra2.h"

 #if defined(__AVX2__)
+  #define LYRA2REV3_8WAY
+#endif
+
+#if defined(__SSE2__)
  #define LYRA2REV3_4WAY
 #endif

 extern __thread uint64_t* l2v3_wholeMatrix;

 bool register_lyra2rev3_algo( algo_gate_t* gate );
+#if defined(LYRA2REV3_8WAY)

-#if defined(LYRA2REV3_4WAY)
+void lyra2rev3_8way_hash( void *state, const void *input );
+int scanhash_lyra2rev3_8way( struct work *work, uint32_t max_nonce,
+                             uint64_t *hashes_done, struct thr_info *mythr );
+bool init_lyra2rev3_8way_ctx();
+
+#elif defined(LYRA2REV3_4WAY)

 void lyra2rev3_4way_hash( void *state, const void *input );
-int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_lyra2rev3_4way( struct work *work, uint32_t max_nonce,
                             uint64_t *hashes_done, struct thr_info *mythr );
 bool init_lyra2rev3_4way_ctx();

 #else

 void lyra2rev3_hash( void *state, const void *input );
-int scanhash_lyra2rev3( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_lyra2rev3( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );
 bool init_lyra2rev3_ctx();

@@ -42,14 +52,14 @@ bool register_lyra2rev2_algo( algo_gate_t* gate );
 #if defined(LYRA2REV2_4WAY)

 void lyra2rev2_4way_hash( void *state, const void *input );
-int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
                             uint64_t *hashes_done, struct thr_info *mythr );
 bool init_lyra2rev2_4way_ctx();

 #else

 void lyra2rev2_hash( void *state, const void *input );
-int scanhash_lyra2rev2( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_lyra2rev2( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );
 bool init_lyra2rev2_ctx();

@@ -70,21 +80,21 @@ bool init_lyra2rev2_ctx();
 #if defined(LYRA2Z_8WAY)

 void lyra2z_8way_hash( void *state, const void *input );
-int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );
 bool lyra2z_8way_thread_init();

 #elif defined(LYRA2Z_4WAY)

 void lyra2z_4way_hash( void *state, const void *input );
-int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_lyra2z_4way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );
 bool lyra2z_4way_thread_init();

 #else

 void lyra2z_hash( void *state, const void *input );
-int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_lyra2z( struct work *work, uint32_t max_nonce,
                     uint64_t *hashes_done, struct thr_info *mythr );
 bool lyra2z_thread_init();

@@ -101,14 +111,14 @@ bool lyra2z_thread_init();
 #if defined(LYRA2H_4WAY)

 void lyra2h_4way_hash( void *state, const void *input );
-int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_lyra2h_4way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );
 bool lyra2h_4way_thread_init();

 #else

 void lyra2h_hash( void *state, const void *input );
-int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_lyra2h( struct work *work, uint32_t max_nonce,
                     uint64_t *hashes_done, struct thr_info *mythr );
 bool lyra2h_thread_init();

@@ -125,14 +135,14 @@ bool register_allium_algo( algo_gate_t* gate );
 #if defined(ALLIUM_4WAY)

 void allium_4way_hash( void *state, const void *input );
-int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_allium_4way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );
 bool init_allium_4way_ctx();

 #else

 void allium_hash( void *state, const void *input );
-int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_allium( struct work *work, uint32_t max_nonce,
                     uint64_t *hashes_done, struct thr_info *mythr );
 bool init_allium_ctx();

@@ -140,15 +150,29 @@ bool init_allium_ctx();

 /////////////////////////////////////////

+#if defined(__AVX2__) && defined(__AES__)
+//  #define PHI2_4WAY
+#endif
+
 bool phi2_has_roots;

 bool register_phi2_algo( algo_gate_t* gate );
+#if defined(PHI2_4WAY)
+
+void phi2_hash_4way( void *state, const void *input );
+int scanhash_phi2_4way( struct work *work, uint32_t max_nonce,
+                     uint64_t *hashes_done, struct thr_info *mythr );
+//void init_phi2_ctx();
+
+#else

 void phi2_hash( void *state, const void *input );
-int scanhash_phi2( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_phi2( struct work *work, uint32_t max_nonce,
                     uint64_t *hashes_done, struct thr_info *mythr );
 void init_phi2_ctx();

+#endif
+
 #endif  // LYRA2_GATE_H__


--- a/algo/lyra2/lyra2.c
+++ b/algo/lyra2/lyra2.c
@@ -236,7 +236,7 @@ int LYRA2REV3( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
   //Tries to allocate enough space for the whole memory matrix

   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
-   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
+//   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
   const int64_t BLOCK_LEN = BLOCK_LEN_BLAKE2_SAFE_INT64;
 /*
   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
@@ -566,7 +566,7 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,

 #if defined(__AVX2__)
   memset_zero_256( (__m256i*)wholeMatrix, i>>5 );
-#elif defined(__SSE4_2__)
+#elif defined(__SSE2__)
   memset_zero_128( (__m128i*)wholeMatrix, i>>4 );   
 #else
   memset( wholeMatrix, 0, i );
--- a/algo/lyra2/lyra2h-4way.c
+++ b/algo/lyra2/lyra2h-4way.c
@@ -5,7 +5,7 @@
 #include <memory.h>
 #include <mm_malloc.h>
 #include "lyra2.h"
-#include "algo/blake/sph_blake.h"
+//#include "algo/blake/sph_blake.h"
 #include "algo/blake/blake-hash-4way.h"

 __thread uint64_t* lyra2h_4way_matrix;
@@ -36,67 +36,53 @@ void lyra2h_4way_hash( void *state, const void *input )
     blake256_4way( &ctx_blake, input + (64*4), 16 );
     blake256_4way_close( &ctx_blake, vhash );

-     mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
+     dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );

-     LYRA2Z( lyra2h_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 16, 16, 16 );
-     LYRA2Z( lyra2h_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 16, 16, 16 );
-     LYRA2Z( lyra2h_4way_matrix, hash2, 32, hash2, 32, hash2, 32, 16, 16, 16 );
-     LYRA2Z( lyra2h_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 16, 16, 16 );
-
-     memcpy( state,    hash0, 32 );
-     memcpy( state+32, hash1, 32 );
-     memcpy( state+64, hash2, 32 );
-     memcpy( state+96, hash3, 32 );
+     LYRA2Z( lyra2h_4way_matrix, state, 32, hash0, 32, hash0, 32,
+             16, 16, 16 );
+     LYRA2Z( lyra2h_4way_matrix, state+32, 32, hash1, 32, hash1,
+             32, 16, 16, 16 );
+     LYRA2Z( lyra2h_4way_matrix, state+64, 32, hash2, 32, hash2,
+             32, 16, 16, 16 );
+     LYRA2Z( lyra2h_4way_matrix, state+96, 32, hash3, 32, hash3,
+             32, 16, 16, 16 );
 }

-int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_lyra2h_4way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t hash[8*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-   uint32_t _ALIGN(64) edata[20];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
-   uint32_t *nonces = work->nonces;
-   int num_found = 0;
-   uint32_t *noncep= vdata + 76; // 19*4
-   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
+   __m128i  *noncev = (__m128i*)vdata + 19;   // aligned
+   int thr_id = mythr->id;  // thr_id is no longer a parameter; read it from mythr

   if ( opt_benchmark )
      ptarget[7] = 0x0000ff;

-   for ( int i=0; i < 20; i++ )
-      be32enc( &edata[i], pdata[i] );
-
-   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
-
+   mm128_bswap32_intrlv80_4x32( vdata, pdata );
   lyra2h_4way_midstate( vdata );

   do {
-      be32enc( noncep,   n   );
-      be32enc( noncep+1, n+1 );
-      be32enc( noncep+2, n+2 );
-      be32enc( noncep+3, n+3 );
-
-      be32enc( &edata[19], n );
+     *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
      lyra2h_4way_hash( hash, vdata );

      for ( int i = 0; i < 4; i++ )
-      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
+      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
+           && !opt_benchmark )
      {
          pdata[19] = n+i;         
-          nonces[ num_found++ ] = n+i;
-          work_set_target_ratio( work, hash+(i<<3) );
+          submit_lane_solution( work, hash+(i<<3), mythr, i );
      }
      n += 4;
-   } while ( (num_found == 0) && (n < max_nonce-4)
-                   && !work_restart[thr_id].restart);
+   } while (  (n < max_nonce-4) && !work_restart[thr_id].restart);

   *hashes_done = n - first_nonce + 1;
-   return num_found;
+   return 0;
 }

 #endif
--- a/algo/lyra2/lyra2h.c
+++ b/algo/lyra2/lyra2h.c
@@ -35,7 +35,7 @@ void lyra2h_hash( void *state, const void *input )
    memcpy(state, hash, 32);
 }

-int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_lyra2h( struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done, struct thr_info *mythr )
 {
 	uint32_t _ALIGN(64) hash[8];
@@ -45,7 +45,7 @@ int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
 	const uint32_t Htarg = ptarget[7];
 	const uint32_t first_nonce = pdata[19];
 	uint32_t nonce = first_nonce;
-   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
+   int thr_id = mythr->id;  // thr_id is no longer a parameter; read it from mythr

 	if (opt_benchmark)
 		ptarget[7] = 0x0000ff;
@@ -54,22 +54,19 @@ int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
 		be32enc(&endiandata[i], pdata[i]);
 	}

-        lyra2h_midstate( endiandata );
-
+   lyra2h_midstate( endiandata );
 	do {
 		be32enc(&endiandata[19], nonce);
                lyra2h_hash( hash, endiandata );

-		if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
-			work_set_target_ratio(work, hash);
+		if ( hash[7] <= Htarg )
+      if ( fulltest( hash, ptarget ) && !opt_benchmark )
+      {
 			pdata[19] = nonce;
-			*hashes_done = pdata[19] - first_nonce;
-			return 1;
-		}
+         submit_solution( work, hash, mythr );
+      }
 		nonce++;
-
 	} while (nonce < max_nonce && !work_restart[thr_id].restart);
-
 	pdata[19] = nonce;
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
--- a/algo/lyra2/lyra2re.c
+++ b/algo/lyra2/lyra2re.c
@@ -6,7 +6,7 @@
 #include "algo/keccak/sph_keccak.h"
 #include "lyra2.h"
 #include "algo-gate-api.h"
-#include "avxdefs.h"
+#include "simd-utils.h"
 #if defined(__AES__)
  #include "algo/groestl/aes_ni/hash-groestl256.h"
 #endif
@@ -81,7 +81,7 @@ void lyra2re_hash(void *state, const void *input)
 	memcpy(state, hashA, 32);
 }

-int scanhash_lyra2re( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_lyra2re( struct work *work, uint32_t max_nonce,
 	              uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t *pdata = work->data;
@@ -91,7 +91,7 @@ int scanhash_lyra2re( int thr_id, struct work *work, uint32_t max_nonce,
 	const uint32_t first_nonce = pdata[19];
 	uint32_t nonce = first_nonce;
        const uint32_t Htarg = ptarget[7];
-   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
+   int thr_id = mythr->id;  // thr_id is no longer a parameter; read it from mythr

        swab32_array( endiandata, pdata, 20 );

@@ -100,20 +100,14 @@ int scanhash_lyra2re( int thr_id, struct work *work, uint32_t max_nonce,
 	do {
 		be32enc(&endiandata[19], nonce);
 		lyra2re_hash(hash, endiandata);
-		if (hash[7] <= Htarg )
-                {
-                   if ( fulltest(hash, ptarget) )
-                   {
+		if ( hash[7] <= Htarg )
+      if ( fulltest(hash, ptarget) && !opt_benchmark )
+      {
 			pdata[19] = nonce;
-			*hashes_done = pdata[19] - first_nonce;
-                        work_set_target_ratio( work, hash );
-			return 1;
-                   }
-		}
+         submit_solution( work, hash, mythr );
+      }
 		nonce++;
-
 	} while (nonce < max_nonce && !work_restart[thr_id].restart);
-
 	pdata[19] = nonce;
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
--- a/algo/lyra2/lyra2rev2-4way.c
+++ b/algo/lyra2/lyra2rev2-4way.c
@@ -42,10 +42,12 @@ void lyra2rev2_4way_hash( void *state, const void *input )
   blake256_4way( &ctx.blake, input + (64<<2), 16 );
   blake256_4way_close( &ctx.blake, vhash );

-   mm256_reinterleave_4x64( vhash64, vhash, 256 );
+   rintrlv_4x32_4x64( vhash64, vhash, 256 );
+
   keccak256_4way( &ctx.keccak, vhash64, 32 );
   keccak256_4way_close( &ctx.keccak, vhash64 );
-   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
+
+   dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );

   cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
   cubehashInit( &ctx.cube, 256, 16, 32 );
@@ -60,10 +62,12 @@ void lyra2rev2_4way_hash( void *state, const void *input )
   LYRA2REV2( l2v2_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
   LYRA2REV2( l2v2_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );

-   mm256_interleave_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
+   intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
+
   skein256_4way( &ctx.skein, vhash64, 32 );
   skein256_4way_close( &ctx.skein, vhash64 );
-   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
+
+   dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );

   cubehashInit( &ctx.cube, 256, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
@@ -74,61 +78,55 @@ void lyra2rev2_4way_hash( void *state, const void *input )
   cubehashInit( &ctx.cube, 256, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );

-   mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
-   bmw256_4way( &ctx.bmw, vhash, 32 );
-   bmw256_4way_close( &ctx.bmw, vhash );
+   intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );

-   mm128_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
+   bmw256_4way( &ctx.bmw, vhash, 32 );
+   bmw256_4way_close( &ctx.bmw, state );
 }

-int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
                             uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t hash[8*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-   uint32_t _ALIGN(64) edata[20];
+   uint32_t *hash7 = &(hash[7<<2]);
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   const uint32_t Htarg = ptarget[7];
-   uint32_t *nonces = work->nonces;
-   int num_found = 0;
-   uint32_t *noncep = vdata + 76; // 19*4
-   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
+   __m128i *noncev = (__m128i*)vdata + 19;   // aligned
+   int thr_id = mythr->id;  // thr_id is no longer a parameter; read it from mythr

   if ( opt_benchmark )
      ( (uint32_t*)ptarget )[7] = 0x0000ff;

-   swab32_array( edata, pdata, 20 );
-
-   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
+   mm128_bswap32_intrlv80_4x32( vdata, pdata );

   blake256_4way_init( &l2v2_4way_ctx.blake );
   blake256_4way( &l2v2_4way_ctx.blake, vdata, 64 );

-   do {
-      be32enc( noncep,   n   );
-      be32enc( noncep+1, n+1 );
-      be32enc( noncep+2, n+2 );
-      be32enc( noncep+3, n+3 );
+   do
+   {
+      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );

      lyra2rev2_4way_hash( hash, vdata );
      pdata[19] = n;

-      for ( int i = 0; i < 4; i++ )
-      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
+      for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
      {
-          pdata[19] = n+i;         
-          nonces[ num_found++ ] = n+i;
-          work_set_target_ratio( work, hash+(i<<3) );
+         extr_lane_4x32( lane_hash, hash, lane, 256 );
+         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+         {
+            pdata[19] = n + lane;         
+            submit_lane_solution( work, lane_hash, mythr, lane );
+         }
      }
      n += 4;
-   } while ( (num_found == 0) && (n < max_nonce-4)
-                   && !work_restart[thr_id].restart);
-
+   } while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
   *hashes_done = n - first_nonce + 1;
-   return num_found;
+   return 0;
 }

 #endif
--- a/algo/lyra2/lyra2rev2.c
+++ b/algo/lyra2/lyra2rev2.c
@@ -40,31 +40,31 @@ void l2v2_blake256_midstate( const void* input )

 void lyra2rev2_hash( void *state, const void *input )
 {
-        lyra2v2_ctx_holder ctx __attribute__ ((aligned (64))); 
-        memcpy( &ctx, &lyra2v2_ctx, sizeof(lyra2v2_ctx) );
-        uint8_t hash[128] __attribute__ ((aligned (64)));
-        #define hashA hash
-        #define hashB hash+64
-        const int midlen = 64;            // bytes
-        const int tail   = 80 - midlen;   // 16
+   lyra2v2_ctx_holder ctx __attribute__ ((aligned (64))); 
+   memcpy( &ctx, &lyra2v2_ctx, sizeof(lyra2v2_ctx) );
+   uint8_t hash[128] __attribute__ ((aligned (64)));
+   #define hashA hash
+   #define hashB hash+64
+   const int midlen = 64;            // bytes
+   const int tail   = 80 - midlen;   // 16

-        memcpy( &ctx.blake, &l2v2_blake_mid, sizeof l2v2_blake_mid );
+   memcpy( &ctx.blake, &l2v2_blake_mid, sizeof l2v2_blake_mid );
 	sph_blake256( &ctx.blake, (uint8_t*)input + midlen, tail );
 	sph_blake256_close( &ctx.blake, hashA );

 	sph_keccak256( &ctx.keccak, hashA, 32 );
 	sph_keccak256_close(&ctx.keccak, hashB);

-        cubehashUpdateDigest( &ctx.cube1, (byte*) hashA,
-                              (const byte*) hashB, 32 );
+   cubehashUpdateDigest( &ctx.cube1, (byte*) hashA,
+                               (const byte*) hashB, 32 );

 	LYRA2REV2( l2v2_wholeMatrix, hashA, 32, hashA, 32, hashA, 32, 1, 4, 4 );

 	sph_skein256( &ctx.skein, hashA, 32 );
 	sph_skein256_close( &ctx.skein, hashB );

-        cubehashUpdateDigest( &ctx.cube2, (byte*) hashA, 
-                              (const byte*) hashB, 32 );
+   cubehashUpdateDigest( &ctx.cube2, (byte*) hashA, 
+                               (const byte*) hashB, 32 );

 	sph_bmw256( &ctx.bmw, hashA, 32 );
 	sph_bmw256_close( &ctx.bmw, hashB );
@@ -72,43 +72,37 @@ void lyra2rev2_hash( void *state, const void *input )
 	memcpy( state, hashB, 32 );
 }

-int scanhash_lyra2rev2(int thr_id, struct work *work,
+int scanhash_lyra2rev2( struct work *work,
 	uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr)
 {
-        uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
 	uint32_t endiandata[20] __attribute__ ((aligned (64)));
-        uint32_t hash[8] __attribute__((aligned(64)));
+   uint32_t hash[8] __attribute__((aligned(64)));
 	const uint32_t first_nonce = pdata[19];
 	uint32_t nonce = first_nonce;
        const uint32_t Htarg = ptarget[7];
-   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
+   int thr_id = mythr->id;  // thr_id is no longer a parameter; read it from mythr

 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x0000ff;

-        swab32_array( endiandata, pdata, 20 );
+   swab32_array( endiandata, pdata, 20 );

-        l2v2_blake256_midstate( endiandata );
+   l2v2_blake256_midstate( endiandata );

 	do {
 		be32enc(&endiandata[19], nonce);
 		lyra2rev2_hash(hash, endiandata);

 		if (hash[7] <= Htarg )
-                {
-                   if( fulltest(hash, ptarget) )
-                   {
+      if( fulltest( hash, ptarget ) && !opt_benchmark )
+      {
 			pdata[19] = nonce;
-                        work_set_target_ratio( work, hash );
-			*hashes_done = pdata[19] - first_nonce;
-		   	return 1;
-		   }
-                }
+         submit_solution( work, hash, mythr );
+      }
 		nonce++;
-
-	} while (nonce < max_nonce && !work_restart[thr_id].restart);
-
+	} while ( nonce < max_nonce && !work_restart[thr_id].restart );
 	pdata[19] = nonce;
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
--- a/algo/lyra2/lyra2rev3-4way.c
+++ b/algo/lyra2/lyra2rev3-4way.c
@@ -1,12 +1,138 @@
 #include "lyra2-gate.h"
 #include <memory.h>

-#if defined (LYRA2REV3_4WAY)	
-
 #include "algo/blake/blake-hash-4way.h"
 #include "algo/bmw/bmw-hash-4way.h"
 #include "algo/cubehash/cubehash_sse2.h" 

+
+#if defined (LYRA2REV3_8WAY)
+
+typedef struct {   // hash contexts used by lyra2rev3_8way_hash
+   blake256_8way_context     blake;   // 8-way blake256 stage
+   cubehashParam             cube;    // one cubehash state, re-initialised and reused for each lane
+   bmw256_8way_context       bmw;     // 8-way bmw256 stage
+} lyra2v3_8way_ctx_holder;
+
+static lyra2v3_8way_ctx_holder l2v3_8way_ctx;   // template filled by init_lyra2rev3_8way_ctx(); each hash call memcpy's it into a local
+
+bool init_lyra2rev3_8way_ctx()   // initialise the file-static template context l2v3_8way_ctx
+{
+   blake256_8way_init( &l2v3_8way_ctx.blake );
+   cubehashInit( &l2v3_8way_ctx.cube, 256, 16, 32 );   // same arguments lyra2rev3_8way_hash uses when it re-inits cube between lanes
+   bmw256_8way_init( &l2v3_8way_ctx.bmw );
+   return true;   // no failure path: always reports success
+}
+
+void lyra2rev3_8way_hash( void *state, const void *input )   // 8 lanes: blake256 -> LYRA2REV3 -> cubehash -> LYRA2REV3 -> bmw256
+{
+   uint32_t vhash[8*8] __attribute__ ((aligned (64)));   // 8 words x 8 lanes, used by the 8-way stages
+   uint32_t hash0[8] __attribute__ ((aligned (64)));     // hash0..hash7: one 32-byte buffer per lane for the per-lane stages
+   uint32_t hash1[8] __attribute__ ((aligned (32)));
+   uint32_t hash2[8] __attribute__ ((aligned (32)));
+   uint32_t hash3[8] __attribute__ ((aligned (32)));
+   uint32_t hash4[8] __attribute__ ((aligned (32)));
+   uint32_t hash5[8] __attribute__ ((aligned (32)));
+   uint32_t hash6[8] __attribute__ ((aligned (32)));
+   uint32_t hash7[8] __attribute__ ((aligned (32)));
+   lyra2v3_8way_ctx_holder ctx __attribute__ ((aligned (64)));
+   memcpy( &ctx, &l2v3_8way_ctx, sizeof(l2v3_8way_ctx) );   // start from the template set up by init_lyra2rev3_8way_ctx()
+
+   blake256_8way( &ctx.blake, input, 80 );   // presumably 80 bytes per lane of interleaved input -- TODO confirm
+   blake256_8way_close( &ctx.blake, vhash );
+
+   dintrlv_8x32( hash0, hash1, hash2, hash3,   // split vhash into the per-lane buffers; the next three stages run one lane at a time
+                       hash4, hash5, hash6, hash7, vhash, 256 );
+
+   LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );   // in place: the same buffer is passed as output and as both inputs
+   LYRA2REV3( l2v3_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );   // every lane reuses l2v3_wholeMatrix
+   LYRA2REV3( l2v3_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
+   LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
+   LYRA2REV3( l2v3_wholeMatrix, hash4, 32, hash4, 32, hash4, 32, 1, 4, 4 );
+   LYRA2REV3( l2v3_wholeMatrix, hash5, 32, hash5, 32, hash5, 32, 1, 4, 4 );
+   LYRA2REV3( l2v3_wholeMatrix, hash6, 32, hash6, 32, hash6, 32, 1, 4, 4 );
+   LYRA2REV3( l2v3_wholeMatrix, hash7, 32, hash7, 32, hash7, 32, 1, 4, 4 );
+
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );   // lane 0 uses the cube state copied from the template
+   cubehashInit( &ctx.cube, 256, 16, 32 );   // re-init the single cube state before each following lane
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
+   cubehashInit( &ctx.cube, 256, 16, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
+   cubehashInit( &ctx.cube, 256, 16, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
+   cubehashInit( &ctx.cube, 256, 16, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash4, (const byte*) hash4, 32 );
+   cubehashInit( &ctx.cube, 256, 16, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash5, (const byte*) hash5, 32 );
+   cubehashInit( &ctx.cube, 256, 16, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash6, (const byte*) hash6, 32 );
+   cubehashInit( &ctx.cube, 256, 16, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash7, (const byte*) hash7, 32 );
+
+   LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );   // second LYRA2REV3 pass, again in place per lane
+   LYRA2REV3( l2v3_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );
+   LYRA2REV3( l2v3_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
+   LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
+   LYRA2REV3( l2v3_wholeMatrix, hash4, 32, hash4, 32, hash4, 32, 1, 4, 4 );
+   LYRA2REV3( l2v3_wholeMatrix, hash5, 32, hash5, 32, hash5, 32, 1, 4, 4 );
+   LYRA2REV3( l2v3_wholeMatrix, hash6, 32, hash6, 32, hash6, 32, 1, 4, 4 );
+   LYRA2REV3( l2v3_wholeMatrix, hash7, 32, hash7, 32, hash7, 32, 1, 4, 4 );
+
+   intrlv_8x32( vhash, hash0, hash1, hash2, hash3,   // merge the per-lane buffers back into vhash for the 8-way bmw256 stage
+                             hash4, hash5, hash6, hash7, 256 );
+
+   bmw256_8way( &ctx.bmw, vhash, 32 );
+   bmw256_8way_close( &ctx.bmw, state );   // written straight to state with no de-interleave; the scan loop pulls lanes out with extr_lane_8x32
+
+   }
+
+int scanhash_lyra2rev3_8way( struct work *work, uint32_t max_nonce,   // scans nonces 8 at a time starting at work->data[19]; always returns 0, hits go out via submit_lane_solution
+                             uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t hash[8*8] __attribute__ ((aligned (64)));   // output of lyra2rev3_8way_hash for all 8 lanes
+   uint32_t vdata[20*8] __attribute__ ((aligned (64)));   // 20 header words x 8 lanes
+   uint32_t *hash7 = &(hash[7<<3]);   // hash7[lane] is used below as word 7 of that lane's digest
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));   // one lane's digest, extracted for fulltest
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   const uint32_t Htarg = ptarget[7];   // NOTE(review): captured before the benchmark override below, so the pre-filter keeps the original value
+   __m256i  *noncev = (__m256i*)vdata + 19;   // aligned; 256-bit row 19 of vdata, written with the 8 lane nonces each iteration
+   int thr_id = mythr->id;  // thr_id is no longer a parameter; read it from mythr
+
+   if ( opt_benchmark )
+      ( (uint32_t*)ptarget )[7] = 0x0000ff;
+
+   mm256_bswap32_intrlv80_8x32( vdata, pdata );   // presumably byte-swaps the 80-byte header and interleaves 8 copies into vdata -- TODO confirm
+   do
+   {
+      *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,   // lane i gets nonce n+i (set_epi32 takes the highest element first)
+                                                  n+3, n+2, n+1, n ) );
+
+      lyra2rev3_8way_hash( hash, vdata );
+      pdata[19] = n;
+
+      for ( int lane = 0; lane < 8; lane++ ) if ( hash7[lane] <= Htarg )   // one-word pre-filter before the full target compare
+      {
+         extr_lane_8x32( lane_hash, hash, lane, 256 );
+         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )   // nothing is submitted in benchmark mode
+         {
+              pdata[19] = n + lane;
+              submit_lane_solution( work, lane_hash, mythr, lane );
+         }
+      }
+      n += 8;
+   } while ( (n < max_nonce-8) && !work_restart[thr_id].restart);   // NOTE(review): max_nonce-8 is uint32_t arithmetic and wraps when max_nonce < 8
+   *hashes_done = n - first_nonce + 1;   // NOTE(review): n - first_nonce nonces were hashed; the +1 looks like an over-count -- confirm intent
+   return 0;
+}
+
+#endif
+
+#if defined (LYRA2REV3_4WAY)  
+
+
 typedef struct {
   blake256_4way_context     blake;
   cubehashParam             cube;
@@ -35,7 +161,7 @@ void lyra2rev3_4way_hash( void *state, const void *input )

   blake256_4way( &ctx.blake, input, 80 );
   blake256_4way_close( &ctx.blake, vhash );
-   mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
+   dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );

   LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
   LYRA2REV3( l2v3_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );
@@ -55,40 +181,30 @@ void lyra2rev3_4way_hash( void *state, const void *input )
   LYRA2REV3( l2v3_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
   LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );

-   mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
+   intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
   bmw256_4way( &ctx.bmw, vhash, 32 );
   bmw256_4way_close( &ctx.bmw, state );
-
 }

-int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_lyra2rev3_4way( struct work *work, uint32_t max_nonce,
                             uint64_t *hashes_done, struct thr_info *mythr ) 
 {
   uint32_t hash[8*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-   uint32_t edata[20] __attribute__ ((aligned (64)));
   uint32_t *hash7 = &(hash[7<<2]);
-   uint32_t lane_hash[8];
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   const uint32_t Htarg = ptarget[7];
   __m128i  *noncev = (__m128i*)vdata + 19;   // aligned
-   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
+   int thr_id = mythr->id;  // thr_id is no longer a parameter; read it from mythr
   
   if ( opt_benchmark )
      ( (uint32_t*)ptarget )[7] = 0x0000ff;

-   // Need big endian data
-   casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
-   casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
-   casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
-   casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
-   casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
-
-   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
-
+   mm128_bswap32_intrlv80_4x32( vdata, pdata );
   do
   {
      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
@@ -98,23 +214,15 @@ int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,

      for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
      {
-         mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
-
-         if ( fulltest( lane_hash, ptarget ) )
+         extr_lane_4x32( lane_hash, hash, lane, 256 );
+         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
         {
              pdata[19] = n + lane;    
-              work_set_target_ratio( work, lane_hash );
-              if ( submit_work( mythr, work ) )
-                applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
-		             accepted_share_count + rejected_share_count + 1,
-			     thr_id, lane );
-              else
-                applog( LOG_WARNING, "Failed to submit share." );
-	 }
+              submit_lane_solution( work, lane_hash, mythr, lane );
+	      }
      }
      n += 4;
   } while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
-
   *hashes_done = n - first_nonce + 1;
   return 0;
 }
--- a/algo/lyra2/lyra2rev3.c
+++ b/algo/lyra2/lyra2rev3.c
@@ -57,7 +57,7 @@ void lyra2rev3_hash( void *state, const void *input )
 	memcpy( state, hash, 32 );
 }

-int scanhash_lyra2rev3( int thr_id, struct work *work,
+int scanhash_lyra2rev3( struct work *work,
 	uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t *pdata = work->data;
@@ -67,7 +67,7 @@ int scanhash_lyra2rev3( int thr_id, struct work *work,
   const uint32_t first_nonce = pdata[19];
   uint32_t nonce = first_nonce;
   const uint32_t Htarg = ptarget[7];
-   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
+   int thr_id = mythr->id;  // thr_id is no longer a parameter; read it from mythr

   if (opt_benchmark)
 	((uint32_t*)ptarget)[7] = 0x0000ff;
@@ -78,28 +78,20 @@ int scanhash_lyra2rev3( int thr_id, struct work *work,
   casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
   casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
   casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
-
   l2v3_blake256_midstate( endiandata );
-
   do
   {
 	be32enc(&endiandata[19], nonce);
 	lyra2rev3_hash(hash, endiandata);

-	if (hash[7] <= Htarg )
-        {
-            if( fulltest(hash, ptarget) )
-            {
-		pdata[19] = nonce;
-                work_set_target_ratio( work, hash );
-                *hashes_done = pdata[19] - first_nonce;
-		return 1;
-	    }
-         }
-         nonce++;
-
-   } while (nonce < max_nonce && !work_restart[thr_id].restart);
-
+      if (hash[7] <= Htarg )
+      if( fulltest( hash, ptarget ) && !opt_benchmark )
+      {
+          pdata[19] = nonce;
+          submit_solution( work, hash, mythr );
+      }
+      nonce++;
+   } while ( nonce < max_nonce && !work_restart[thr_id].restart );
   pdata[19] = nonce;
   *hashes_done = pdata[19] - first_nonce + 1;
   return 0;
--- a/algo/lyra2/lyra2z-4way.c
+++ b/algo/lyra2/lyra2z-4way.c
@@ -36,43 +36,31 @@ void lyra2z_4way_hash( void *state, const void *input )
     blake256_4way( &ctx_blake, input + (64*4), 16 );
     blake256_4way_close( &ctx_blake, vhash );

-     mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
+     dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );

-     LYRA2Z( lyra2z_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
-     LYRA2Z( lyra2z_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
-     LYRA2Z( lyra2z_4way_matrix, hash2, 32, hash2, 32, hash2, 32, 8, 8, 8 );
-     LYRA2Z( lyra2z_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );
-
-     memcpy( state,    hash0, 32 );
-     memcpy( state+32, hash1, 32 );
-     memcpy( state+64, hash2, 32 );
-     memcpy( state+96, hash3, 32 );
+     LYRA2Z( lyra2z_4way_matrix, state   , 32, hash0, 32, hash0, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_4way_matrix, state+32, 32, hash1, 32, hash1, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_4way_matrix, state+64, 32, hash2, 32, hash2, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_4way_matrix, state+96, 32, hash3, 32, hash3, 32, 8, 8, 8 );
 }

-int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_lyra2z_4way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t hash[8*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-   uint32_t _ALIGN(64) edata[20];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   __m128i  *noncev = (__m128i*)vdata + 19;   // aligned
-   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
+   int thr_id = mythr->id;  // thr_id is no longer a parameter; read it from mythr

   if ( opt_benchmark )
      ptarget[7] = 0x0000ff;

-   casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
-   casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
-   casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
-   casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
-   casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
-   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
-
+   mm128_bswap32_intrlv80_4x32( vdata, pdata );
   lyra2z_4way_midstate( vdata );

   do {
@@ -82,16 +70,11 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
      pdata[19] = n;

      for ( int i = 0; i < 4; i++ )
-      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
+      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
+           && !opt_benchmark )
      {
          pdata[19] = n+i;         
-          work_set_target_ratio( work, hash+(i<<3) );
-          if ( submit_work( mythr, work ) )
-              applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
-                             accepted_share_count + rejected_share_count + 1,
-                             thr_id, i );
-          else
-              applog( LOG_WARNING, "Failed to submit share." );
+          submit_lane_solution( work, hash+(i<<3), mythr, i );
      }
      n += 4;
   } while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
@@ -136,8 +119,8 @@ void lyra2z_8way_hash( void *state, const void *input )
     blake256_8way( &ctx_blake, input + (64*8), 16 );
     blake256_8way_close( &ctx_blake, vhash );

-     mm256_deinterleave_8x32( hash0, hash1, hash2, hash3,
-                              hash4, hash5, hash6, hash7, vhash, 256 );
+     dintrlv_8x32( hash0, hash1, hash2, hash3,
+                   hash4, hash5, hash6, hash7, vhash, 256 );

     LYRA2Z( lyra2z_8way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
     LYRA2Z( lyra2z_8way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
@@ -158,30 +141,23 @@ void lyra2z_8way_hash( void *state, const void *input )
     memcpy( state+224, hash7, 32 );
 }

-int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t hash[8*8] __attribute__ ((aligned (64)));
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
-   uint32_t _ALIGN(64) edata[20];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   __m256i  *noncev = (__m256i*)vdata + 19;   // aligned
-   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
+   int thr_id = mythr->id;  // thr_id arg is deprecated

   if ( opt_benchmark )
      ptarget[7] = 0x0000ff;

-   casti_m256i( edata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
-   casti_m256i( edata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
-   casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
-
-   mm256_interleave_8x32( vdata, edata, edata, edata, edata,
-                                 edata, edata, edata, edata, 640 );
-
+   mm256_bswap32_intrlv80_8x32( vdata, pdata );
   lyra2z_8way_midstate( vdata );

   do {
@@ -191,16 +167,11 @@ int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
      pdata[19] = n;

      for ( int i = 0; i < 8; i++ )
-      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
+      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
+           && !opt_benchmark )
      {
          pdata[19] = n+i;         
-          work_set_target_ratio( work, hash+(i<<3) );
-          if ( submit_work( mythr, work ) )
-              applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
-                             accepted_share_count + rejected_share_count + 1,
-                             thr_id, i );
-          else
-              applog( LOG_WARNING, "Failed to submit share." );
+          submit_lane_solution( work, hash+(i<<3), mythr, i );
      }
      n += 8;
   } while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
--- a/algo/lyra2/lyra2z.c
+++ b/algo/lyra2/lyra2z.c
@@ -3,7 +3,7 @@
 #include "lyra2-gate.h"
 #include "lyra2.h"
 #include "algo/blake/sph_blake.h"
-#include "avxdefs.h"
+#include "simd-utils.h"

 __thread uint64_t* lyra2z_matrix;

@@ -43,7 +43,7 @@ void lyra2z_hash( void *state, const void *input )
    memcpy(state, hash, 32);
 }

-int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_lyra2z( struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done, struct thr_info *mythr )
 {
 	uint32_t _ALIGN(64) hash[8];
@@ -53,7 +53,7 @@ int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
 	const uint32_t Htarg = ptarget[7];
 	const uint32_t first_nonce = pdata[19];
 	uint32_t nonce = first_nonce;
-   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
+   int thr_id = mythr->id;  // thr_id arg is deprecated

 	if (opt_benchmark)
 		ptarget[7] = 0x0000ff;
@@ -68,16 +68,14 @@ int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
 		be32enc(&endiandata[19], nonce);
                lyra2z_hash( hash, endiandata );

-		if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
-			work_set_target_ratio(work, hash);
+      if ( hash[7] <= Htarg )
+      if ( fulltest( hash, ptarget ) && !opt_benchmark )
+      {
 			pdata[19] = nonce;
-			*hashes_done = pdata[19] - first_nonce;
-			return 1;
-		}
+			submit_solution( work, hash, mythr );
+	   }
 		nonce++;
-
-	} while (nonce < max_nonce && !work_restart[thr_id].restart);
-
+	} while ( nonce < max_nonce && !work_restart[thr_id].restart );
 	pdata[19] = nonce;
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
--- a/algo/lyra2/lyra2z330.c
+++ b/algo/lyra2/lyra2z330.c
@@ -1,7 +1,7 @@
 #include <memory.h>
 #include "algo-gate-api.h"
 #include "lyra2.h"
-#include "avxdefs.h"
+#include "simd-utils.h"

 __thread uint64_t* lyra2z330_wholeMatrix;

@@ -15,41 +15,42 @@ void lyra2z330_hash(void *state, const void *input, uint32_t height)
 	memcpy(state, hash, 32);
 }

-int scanhash_lyra2z330( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_lyra2z330( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr )
 {
-	uint32_t hash[8] __attribute__ ((aligned (64))); 
-	uint32_t endiandata[20] __attribute__ ((aligned (64)));
-	uint32_t *pdata = work->data;
-	uint32_t *ptarget = work->target;
-	const uint32_t Htarg = ptarget[7];
-	const uint32_t first_nonce = pdata[19];
-	uint32_t nonce = first_nonce;
-   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
-	if (opt_benchmark)
-		ptarget[7] = 0x0000ff;
+   uint32_t hash[8] __attribute__ ((aligned (64))); 
+   uint32_t endiandata[20] __attribute__ ((aligned (64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t nonce = first_nonce;
+   int thr_id = mythr->id;  // thr_id arg is deprecated

-	for (int i=0; i < 19; i++) {
-		be32enc(&endiandata[i], pdata[i]);
-	}
+   if (opt_benchmark)
+	ptarget[7] = 0x0000ff;

-	do {
-		be32enc(&endiandata[19], nonce);
-		lyra2z330_hash( hash, endiandata, work->height );
-
-		if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
-			work_set_target_ratio(work, hash);
-			pdata[19] = nonce;
-			*hashes_done = pdata[19] - first_nonce;
-			return 1;
-		}
-		nonce++;
-
-	} while (nonce < max_nonce && !work_restart[thr_id].restart);
-
-	pdata[19] = nonce;
-	*hashes_done = pdata[19] - first_nonce + 1;
-	return 0;
+   casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
+   casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
+   casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
+   casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
+   casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
+   
+   do
+   {
+      be32enc( &endiandata[19], nonce );
+      lyra2z330_hash( hash, endiandata, work->height );
+      if ( hash[7] <= Htarg )
+      if ( fulltest( hash, ptarget ) && !opt_benchmark )
+      {
+         pdata[19] = nonce;
+         submit_solution( work, hash, mythr );
+      }
+      nonce++;
+   } while ( nonce < max_nonce && !work_restart[thr_id].restart );
+   pdata[19] = nonce;
+   *hashes_done = pdata[19] - first_nonce + 1;
+   return 0;
 }

 void lyra2z330_set_target( struct work* work, double job_diff )
--- a/algo/lyra2/phi2-4way.c
+++ b/algo/lyra2/phi2-4way.c
@@ -0,0 +1,233 @@
+/**
+ * Phi-2 algo Implementation
+ */
+
+#include "lyra2-gate.h"
+
+#if defined(PHI2_4WAY)
+
+#include "algo/skein/skein-hash-4way.h"
+#include "algo/jh/jh-hash-4way.h"
+#include "algo/gost/sph_gost.h"
+#include "algo/cubehash/cubehash_sse2.h"
+#include "algo/echo/aes_ni/hash_api.h"
+
+// Hash contexts used by phi2_hash_4way(); one instance serves all lanes.
+typedef struct {
+     cubehashParam           cube;
+     jh512_4way_context      jh;
+     hashState_echo          echo;
+     sph_gost512_context     gost;
+     skein512_4way_context   skein;
+} phi2_ctx_holder;
+/*
+phi2_ctx_holder phi2_ctx;
+
+void init_phi2_ctx()
+{
+   cubehashInit( &phi2_ctx.cube, 512, 16, 32 );
+   sph_jh512_init(&phi2_ctx.jh);
+   init_echo( &phi2_ctx.echo1, 512 );
+   init_echo( &phi2_ctx.echo2, 512 );
+   sph_gost512_init(&phi2_ctx.gost);
+   sph_skein512_init(&phi2_ctx.skein);
+};
+*/
+// Compute the phi2 hash of four independent inputs.
+//
+// input - four consecutive 144-byte records, one per lane (not
+//         interleaved); the first 144 bytes of each are hashed when
+//         phi2_has_roots is set, otherwise the first 80.
+// state - receives 128 bytes: four 256-bit results, 4x64 interleaved.
+void phi2_hash_4way( void *state, const void *input )
+{
+   uint32_t hash[4][16] __attribute__ ((aligned (64)));
+   uint32_t hashA[4][16] __attribute__ ((aligned (64)));
+   uint32_t hashB[4][16] __attribute__ ((aligned (64)));
+   uint32_t vhash[4*16] __attribute__ ((aligned (64)));
+
+   phi2_ctx_holder ctx __attribute__ ((aligned (64)));
+
+   // Cubehash-512 per lane; the single context is re-initialised each time.
+   cubehashInit( &ctx.cube, 512, 16, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*)hashB[0], (const byte*)input,
+                        phi2_has_roots ? 144 : 80 );
+   cubehashInit( &ctx.cube, 512, 16, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*)hashB[1], (const byte*)input+144,
+                        phi2_has_roots ? 144 : 80 );
+   cubehashInit( &ctx.cube, 512, 16, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*)hashB[2], (const byte*)input+288,
+                        phi2_has_roots ? 144 : 80 );
+   cubehashInit( &ctx.cube, 512, 16, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*)hashB[3], (const byte*)input+432,
+                        phi2_has_roots ? 144 : 80 );
+
+   // LYRA2RE on both 32-byte halves of every lane's cubehash output.
+   LYRA2RE( &hashA[0][0], 32, &hashB[0][0], 32, &hashB[0][0], 32, 1, 8, 8 );
+   LYRA2RE( &hashA[0][8], 32, &hashB[0][8], 32, &hashB[0][8], 32, 1, 8, 8 );
+   LYRA2RE( &hashA[1][0], 32, &hashB[1][0], 32, &hashB[1][0], 32, 1, 8, 8 );
+   LYRA2RE( &hashA[1][8], 32, &hashB[1][8], 32, &hashB[1][8], 32, 1, 8, 8 );
+   LYRA2RE( &hashA[2][0], 32, &hashB[2][0], 32, &hashB[2][0], 32, 1, 8, 8 );
+   LYRA2RE( &hashA[2][8], 32, &hashB[2][8], 32, &hashB[2][8], 32, 1, 8, 8 );
+   LYRA2RE( &hashA[3][0], 32, &hashB[3][0], 32, &hashB[3][0], 32, 1, 8, 8 );
+   LYRA2RE( &hashA[3][8], 32, &hashB[3][8], 32, &hashB[3][8], 32, 1, 8, 8 );
+
+   intrlv_4x64( vhash, hashA[0], hashA[1], hashA[2], hashA[3], 512 );
+
+   jh512_4way_init( &ctx.jh );
+   jh512_4way( &ctx.jh, vhash, 64 );
+   jh512_4way_close( &ctx.jh, vhash );
+
+   dintrlv_4x64( hash[0], hash[1], hash[2], hash[3], vhash, 512 );
+
+   // Per lane: gost-512 when bit 0 of the JH output is set, otherwise
+   // two chained echo-512 passes.
+   if ( hash[0][0] & 1 )
+   {
+      sph_gost512_init( &ctx.gost );
+      sph_gost512( &ctx.gost, (const void*)hash[0], 64 );
+      sph_gost512_close( &ctx.gost, (void*)hash[0] );
+   }
+   else
+   {
+      init_echo( &ctx.echo, 512 );
+      update_final_echo ( &ctx.echo, (BitSequence *)hash[0],
+                          (const BitSequence *)hash[0], 512 );
+      init_echo( &ctx.echo, 512 );
+      update_final_echo ( &ctx.echo, (BitSequence *)hash[0],
+                          (const BitSequence *)hash[0], 512 );
+   }
+
+   if ( hash[1][0] & 1 )
+   {
+      sph_gost512_init( &ctx.gost );
+      sph_gost512( &ctx.gost, (const void*)hash[1], 64 );
+      sph_gost512_close( &ctx.gost, (void*)hash[1] );
+   }
+   else
+   {
+      init_echo( &ctx.echo, 512 );
+      update_final_echo ( &ctx.echo, (BitSequence *)hash[1],
+                          (const BitSequence *)hash[1], 512 );
+      init_echo( &ctx.echo, 512 );
+      update_final_echo ( &ctx.echo, (BitSequence *)hash[1],
+                          (const BitSequence *)hash[1], 512 );
+   }
+
+   if ( hash[2][0] & 1 )
+   {
+      sph_gost512_init( &ctx.gost );
+      sph_gost512( &ctx.gost, (const void*)hash[2], 64 );
+      sph_gost512_close( &ctx.gost, (void*)hash[2] );
+   }
+   else
+   {
+      init_echo( &ctx.echo, 512 );
+      update_final_echo ( &ctx.echo, (BitSequence *)hash[2],
+                          (const BitSequence *)hash[2], 512 );
+      init_echo( &ctx.echo, 512 );
+      update_final_echo ( &ctx.echo, (BitSequence *)hash[2],
+                          (const BitSequence *)hash[2], 512 );
+   }
+
+   if ( hash[3][0] & 1 )
+   {
+      sph_gost512_init( &ctx.gost );
+      sph_gost512( &ctx.gost, (const void*)hash[3], 64 );
+      sph_gost512_close( &ctx.gost, (void*)hash[3] );
+   }
+   else
+   {
+      init_echo( &ctx.echo, 512 );
+      update_final_echo ( &ctx.echo, (BitSequence *)hash[3],
+                          (const BitSequence *)hash[3], 512 );
+      init_echo( &ctx.echo, 512 );
+      update_final_echo ( &ctx.echo, (BitSequence *)hash[3],
+                          (const BitSequence *)hash[3], 512 );
+   }
+
+   intrlv_4x64( vhash, hash[0], hash[1], hash[2], hash[3], 512 );
+
+   skein512_4way_init( &ctx.skein );
+   skein512_4way( &ctx.skein, vhash, 64 );
+   skein512_4way_close( &ctx.skein, vhash );
+
+   // Fold each lane from 512 to 256 bits: word[w] ^= word[w+4], w = 0..3.
+   // Word w of lane l is element 4*w + l of the 4x64 layout, so the
+   // partner word is always 16 elements further on.
+   for ( int i = 0; i < 16; i++ )
+      ( (uint64_t*)vhash )[i] ^= ( (uint64_t*)vhash )[i+16];
+
+   memcpy( state, vhash, 128 );
+}
+
+// Scan nonces four at a time for phi2, submitting every lane that passes.
+//
+// work        - job; work->data gives 36 input words, work->target the target.
+// max_nonce   - upper bound of the nonce range to scan.
+// hashes_done - out: set to n - first_nonce + 1 when the scan stops.
+// mythr       - calling thread; its id selects the work_restart flag.
+//
+// Always returns 0; solutions go through submit_lane_solution().
+int scanhash_phi2_4way( struct work *work, uint32_t max_nonce,
+	                     uint64_t *hashes_done, struct thr_info *mythr )
+{
+   // phi2_hash_4way() writes 128 bytes (4 lanes x 256 bits, 4x64
+   // interleaved), so 4*8 words are needed here, not 8.
+   uint32_t _ALIGN(128) hash[8*4];
+   uint32_t _ALIGN(128) edata[36];
+   uint32_t vdata[4][36] __attribute__ ((aligned (64)));
+   // 32-bit word 7 of lane l is hash[25 + 2*l] in the 4x64 layout.
+   uint32_t *hash7 = &(hash[25]);
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   int thr_id = mythr->id;  // thr_id arg is deprecated
+
+   if(opt_benchmark){
+   	ptarget[7] = 0x00ff;
+   }
+
+   // Data is not interleaved, but hash is.
+   // Any non-zero data at index 20 or above sets roots true.
+   phi2_has_roots = false;
+   for ( int i=0; i < 36; i++ )
+   {
+      be32enc(&edata[i], pdata[i]);
+      if (i >= 20 && pdata[i]) phi2_has_roots = true;
+   }
+
+   // One private 144-byte copy of the input per lane; only word 19
+   // (the nonce) is rewritten inside the loop.
+   memcpy( vdata[0], edata, 144 );
+   memcpy( vdata[1], edata, 144 );
+   memcpy( vdata[2], edata, 144 );
+   memcpy( vdata[3], edata, 144 );
+
+   do {
+      be32enc( &vdata[0][19], n );
+      be32enc( &vdata[1][19], n+1 );
+      be32enc( &vdata[2][19], n+2 );
+      be32enc( &vdata[3][19], n+3 );
+
+      phi2_hash_4way( hash, vdata );
+
+      for ( int lane = 0; lane < 4; lane++ ) if (  hash7[ lane<<1 ] < Htarg )
+      {
+          extr_lane_4x64( lane_hash, hash, lane, 256 );
+          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+          {
+              pdata[19] = n + lane;
+              submit_lane_solution( work, lane_hash, mythr, lane );
+          }
+       }
+       n += 4;
+    } while ( ( n < max_nonce - 4 ) && !work_restart[thr_id].restart );
+    *hashes_done = n - first_nonce + 1;
+    return 0;
+}
+   
+#endif  // PHI2_4WAY
--- a/algo/lyra2/phi2.c
+++ b/algo/lyra2/phi2.c
@@ -50,11 +50,11 @@ void phi2_hash(void *state, const void *input)
 	unsigned char _ALIGN(128) hashA[64];
 	unsigned char _ALIGN(128) hashB[64];

-        phi2_ctx_holder ctx __attribute__ ((aligned (64)));
-        memcpy( &ctx, &phi2_ctx, sizeof(phi2_ctx) );
+  phi2_ctx_holder ctx __attribute__ ((aligned (64)));
+  memcpy( &ctx, &phi2_ctx, sizeof(phi2_ctx) );

-        cubehashUpdateDigest( &ctx.cube, (byte*)hashB, (const byte*)input,
-		              phi2_has_roots ? 144 : 80 );
+  cubehashUpdateDigest( &ctx.cube, (byte*)hashB, (const byte*)input,
+                        phi2_has_roots ? 144 : 80 );

 	LYRA2RE( &hashA[ 0], 32, &hashB[ 0], 32, &hashB[ 0], 32, 1, 8, 8 );
 	LYRA2RE( &hashA[32], 32, &hashB[32], 32, &hashB[32], 32, 1, 8, 8 );
@@ -63,17 +63,17 @@ void phi2_hash(void *state, const void *input)
 	sph_jh512_close( &ctx.jh, (void*)hash );

 	if ( hash[0] & 1 )
-       	{
-           sph_gost512( &ctx.gost, (const void*)hash, 64 );
+  	{
+      sph_gost512( &ctx.gost, (const void*)hash, 64 );
 	   sph_gost512_close( &ctx.gost, (void*)hash );
 	}
-       	else
-       	{
+  	else
+  	{
 #if defined(__AES__)
-           update_final_echo ( &ctx.echo1, (BitSequence *)hash,
-                               (const BitSequence *)hash, 512 );
-           update_final_echo ( &ctx.echo2, (BitSequence *)hash,
-                               (const BitSequence *)hash, 512 );
+      update_final_echo ( &ctx.echo1, (BitSequence *)hash,
+                          (const BitSequence *)hash, 512 );
+      update_final_echo ( &ctx.echo2, (BitSequence *)hash,
+                          (const BitSequence *)hash, 512 );
 #else
 	   sph_echo512( &ctx.echo1, (const void*)hash, 64 );
 	   sph_echo512_close( &ctx.echo1, (void*)hash );
@@ -92,7 +92,7 @@ void phi2_hash(void *state, const void *input)
 	memcpy(state, hash, 32);
 }

-int scanhash_phi2( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_phi2( struct work *work, uint32_t max_nonce,
 	           uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t _ALIGN(128) hash[8];
@@ -102,7 +102,7 @@ int scanhash_phi2( int thr_id, struct work *work, uint32_t max_nonce,
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
-   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
+   int thr_id = mythr->id;  // thr_id arg is deprecated

   if(opt_benchmark){
   	ptarget[7] = 0x00ff;
@@ -111,30 +111,21 @@ int scanhash_phi2( int thr_id, struct work *work, uint32_t max_nonce,
   phi2_has_roots = false;
   for ( int i=0; i < 36; i++ )
   {
-	be32enc(&endiandata[i], pdata[i]);
-	if (i >= 20 && pdata[i]) phi2_has_roots = true;
+	   be32enc(&endiandata[i], pdata[i]);
+      if ( i >= 20 && pdata[i] ) phi2_has_roots = true;
   }

   do {
 	be32enc( &endiandata[19], n );
 	phi2_hash( hash, endiandata );
-
-	if ( hash[7] < Htarg && fulltest( hash, ptarget ) )
-       	{
-           pdata[19] = n;
-           work_set_target_ratio( work, hash );
-           if ( submit_work( mythr, work ) )
-               applog( LOG_NOTICE, "Share %d submitted by thread %d.",
-                            accepted_share_count + rejected_share_count + 1,
-                            thr_id );
-           else
-               applog( LOG_WARNING, "Failed to submit share." );
-			*hashes_done = n - first_nonce + 1;
-	}
+	if ( hash[7] < Htarg )
+   if ( fulltest( hash, ptarget ) && !opt_benchmark )
+  	{
+       pdata[19] = n;
+       submit_solution( work, hash, mythr );
+   }
 	n++;
-
   } while ( n < max_nonce && !work_restart[thr_id].restart );
-
   *hashes_done = n - first_nonce + 1;
   pdata[19] = n;
   return 0;
--- a/algo/lyra2/sponge.c
+++ b/algo/lyra2/sponge.c
@@ -51,7 +51,7 @@ inline void initState( uint64_t State[/*16*/] )
  state[3] = _mm256_set_epi64x( blake2b_IV[7], blake2b_IV[6],
                                blake2b_IV[5], blake2b_IV[4] );

-#elif defined (__SSE4_2__)
+#elif defined (__SSE2__)

  __m128i* state = (__m128i*)State;

@@ -137,7 +137,7 @@ inline void squeeze( uint64_t *State, byte *Out, unsigned int len )
    //Squeezes remaining bytes
    memcpy_256( out, state, ( len_m256i % BLOCK_LEN_M256I ) );

-#elif defined (__SSE4_2__)
+#elif defined (__SSE2__)

    const int len_m128i = len / 16;
    const int fullBlocks = len_m128i / BLOCK_LEN_M128I;
@@ -205,7 +205,7 @@ inline void absorbBlock( uint64_t *State, const uint64_t *In )
    _mm256_store_si256( (__m256i*)State + 2, state2 );
    _mm256_store_si256( (__m256i*)State + 3, state3 );

-#elif defined (__SSE4_2__)
+#elif defined (__SSE2__)

    __m128i* state = (__m128i*)State;
    __m128i* in    = (__m128i*)In;
@@ -273,7 +273,7 @@ inline void absorbBlockBlake2Safe( uint64_t *State, const uint64_t *In )
    _mm256_store_si256( (__m256i*)State + 2, state2 );
    _mm256_store_si256( (__m256i*)State + 3, state3 );

-#elif defined (__SSE4_2__)
+#elif defined (__SSE2__)

    __m128i* state = (__m128i*)State;
    __m128i* in    = (__m128i*)In;
@@ -355,7 +355,7 @@ inline void reducedSqueezeRow0( uint64_t* State, uint64_t* rowOut,
    _mm256_store_si256( (__m256i*)State + 2, state2 );
    _mm256_store_si256( (__m256i*)State + 3, state3 );

-#elif defined (__SSE4_2__)
+#elif defined (__SSE2__)

    __m128i* state = (__m128i*)State;
    __m128i  state0 = _mm_load_si128(  state    );
@@ -494,7 +494,7 @@ inline void reducedDuplexRow1( uint64_t *State, uint64_t *rowIn,
    _mm256_store_si256( (__m256i*)State + 2, state2 );
    _mm256_store_si256( (__m256i*)State + 3, state3 );

-#elif defined (__SSE4_2__)
+#elif defined (__SSE2__)

    __m128i* state = (__m128i*)State;
    __m128i  state0 = _mm_load_si128(  state    );
@@ -694,7 +694,7 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
    _mm256_store_si256( (__m256i*)State + 2, state2 );
    _mm256_store_si256( (__m256i*)State + 3, state3 );

-#elif defined (__SSE4_2__)
+#elif defined (__SSE2__)

    __m128i* in    = (__m128i*)rowIn;
    __m128i* inout = (__m128i*)rowInOut;
@@ -713,9 +713,9 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
    __m128i* state = (__m128i*)State;

    // For the last round in this function not optimized for AVX
-    uint64_t* ptrWordIn = rowIn;        //In Lyra2: pointer to prev
-    uint64_t* ptrWordInOut = rowInOut;  //In Lyra2: pointer to row*
-    uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
+//    uint64_t* ptrWordIn = rowIn;        //In Lyra2: pointer to prev
+//    uint64_t* ptrWordInOut = rowInOut;  //In Lyra2: pointer to row*
+//    uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row

    for ( i = 0; i < nCols; i++ )
    {
@@ -750,6 +750,28 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
        out[4] = _mm_xor_si128( state[4], in[4] );
        out[5] = _mm_xor_si128( state[5], in[5] );

+
+       __m128i t0, t1;
+       t0 = _mm_srli_si128( state[0], 8 );
+       t1 = _mm_srli_si128( state[1], 8 );
+       inout[0] = _mm_xor_si128( inout[0],
+                              _mm_or_si128( _mm_slli_si128( state[0], 8 ),
+                                            _mm_srli_si128( state[5], 8 ) ) );
+       inout[1] = _mm_xor_si128( inout[1],
+                        _mm_or_si128( _mm_slli_si128( state[1], 8 ), t0 ) );
+       t0 = _mm_srli_si128( state[2], 8 );
+       inout[2] = _mm_xor_si128( inout[2],
+                        _mm_or_si128( _mm_slli_si128( state[2], 8 ), t1 ) );
+       t1 = _mm_srli_si128( state[3], 8 );
+       inout[3] = _mm_xor_si128( inout[3],
+                        _mm_or_si128( _mm_slli_si128( state[3], 8 ), t0 ) );
+       t0 = _mm_srli_si128( state[4], 8 );
+       inout[4] = _mm_xor_si128( inout[4],
+                        _mm_or_si128( _mm_slli_si128( state[4], 8 ), t1 ) );
+       inout[5] = _mm_xor_si128( inout[5],
+                        _mm_or_si128( _mm_slli_si128( state[5], 8 ), t0 ) );
+
+/*
        ptrWordInOut[0]  ^= State[11];
        ptrWordInOut[1]  ^= State[0];
        ptrWordInOut[2]  ^= State[1];
@@ -768,7 +790,7 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
        ptrWordIn += BLOCK_LEN_INT64;
        //Output: goes to previous column
        ptrWordOut -= BLOCK_LEN_INT64;
-
+*/
        inout += BLOCK_LEN_M128I;
        in    += BLOCK_LEN_M128I;
        out   -= BLOCK_LEN_M128I;
@@ -930,7 +952,7 @@ inline void reducedDuplexRow( uint64_t *State, uint64_t *rowIn,
   _mm256_store_si256( (__m256i*)State + 2, state2 );
   _mm256_store_si256( (__m256i*)State + 3, state3 );

-#elif defined(__SSE4_2__)
+#elif defined (__SSE2__)

    __m128i* state = (__m128i*)State;
    __m128i* in    = (__m128i*)rowIn;
--- a/algo/lyra2/sponge.h
+++ b/algo/lyra2/sponge.h
@@ -23,7 +23,7 @@
 #define SPONGE_H_

 #include <stdint.h>
-#include "avxdefs.h"
+#include "simd-utils.h"

 #if defined(__GNUC__)
 #define ALIGN __attribute__ ((aligned(32)))
@@ -59,7 +59,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
 // returns void, updates all args
 #define G_4X64(a,b,c,d) \
   a = _mm256_add_epi64( a, b ); \
-   d = mm256_ror_64( _mm256_xor_si256( d, a), 32 ); \
+   d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
   c = _mm256_add_epi64( c, d ); \
   b = mm256_ror_64( _mm256_xor_si256( b, c ), 24 ); \
   a = _mm256_add_epi64( a, b ); \
--- a/algo/m7m.c
+++ b/algo/m7m.c
@@ -144,8 +144,8 @@ void init_m7m_ctx()
 #define NM7M 5
 #define SW_DIVS 5
 #define M7_MIDSTATE_LEN 76
-int scanhash_m7m_hash( int thr_id, struct work* work,
-                       uint64_t max_nonce, unsigned long *hashes_done )
+int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
+                       unsigned long *hashes_done, struct thr_info *mythr )
 {
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
@@ -154,6 +154,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
    uint32_t hash[8] __attribute__((aligned(64)));
    uint8_t bhash[7][64] __attribute__((aligned(64)));
    uint32_t n = pdata[19] - 1;
+    int thr_id = mythr->id;  // thr_id arg is deprecated
    uint32_t usw_, mpzscale;
    const uint32_t first_nonce = pdata[19];
    char data_str[161], hash_str[65], target_str[65];
@@ -206,6 +207,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,

        SHA512_Update(  &ctx2.sha512, data_p64, 80 - M7_MIDSTATE_LEN );
        SHA512_Final( (unsigned char*) (bhash[1]), &ctx2.sha512 );
+
        sph_keccak512( &ctx2.keccak, data_p64, 80 - M7_MIDSTATE_LEN );
        sph_keccak512_close( &ctx2.keccak, (void*)(bhash[2]) );

@@ -221,18 +223,18 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
        sph_ripemd160( &ctx2.ripemd, data_p64, 80 - M7_MIDSTATE_LEN );
        sph_ripemd160_close( &ctx2.ripemd, (void*)(bhash[6]) );

-	mpz_import(bns0, a, -1, p, -1, 0, bhash[0]);
+        mpz_import(bns0, a, -1, p, -1, 0, bhash[0]);
        mpz_set(bns1, bns0);
-	mpz_set(product, bns0);
-	for ( i=1; i < 7; i++ )
+	     mpz_set(product, bns0);
+	     for ( i=1; i < 7; i++ )
        {
-	    mpz_import(bns0, a, -1, p, -1, 0, bhash[i]);
-	    mpz_add(bns1, bns1, bns0);
-            mpz_mul(product, product, bns0);
+	        mpz_import(bns0, a, -1, p, -1, 0, bhash[i]);
+	        mpz_add(bns1, bns1, bns0);
+           mpz_mul(product, product, bns0);
        }
        mpz_mul(product, product, bns1);

-	mpz_mul(product, product, product);
+        mpz_mul(product, product, product);
        bytes = mpz_sizeinbase(product, 256);
        mpz_export((void *)bdata, NULL, -1, 1, 0, 0, product);

@@ -242,27 +244,27 @@ int scanhash_m7m_hash( int thr_id, struct work* work,

        digits=(int)((sqrt((double)(n/2))*(1.+EPS))/9000+75);
        mp_bitcnt_t prec = (long int)(digits*BITS_PER_DIGIT+16);
-	mpf_set_prec_raw(magifpi, prec);
-	mpf_set_prec_raw(mptmp, prec);
-	mpf_set_prec_raw(mpt1, prec);
-	mpf_set_prec_raw(mpt2, prec);
+        mpf_set_prec_raw(magifpi, prec);
+        mpf_set_prec_raw(mptmp, prec);
+        mpf_set_prec_raw(mpt1, prec);
+        mpf_set_prec_raw(mpt2, prec);

        usw_ = sw2_(n/2);
-	mpzscale = 1;
+	     mpzscale = 1;
        mpz_set_ui(magisw, usw_);
 	    
        for ( i = 0; i < 5; i++ )
        {	
            mpf_set_d(mpt1, 0.25*mpzscale);
-	    mpf_sub(mpt1, mpt1, mpt2);
+	         mpf_sub(mpt1, mpt1, mpt2);
            mpf_abs(mpt1, mpt1);
            mpf_div(magifpi, magifpi0, mpt1);
            mpf_pow_ui(mptmp, mpten, digits >> 1);
            mpf_mul(magifpi, magifpi, mptmp);
-	    mpz_set_f(magipi, magifpi);
+	         mpz_set_f(magipi, magifpi);
            mpz_add(magipi,magipi,magisw);
            mpz_add(product,product,magipi);
-	    mpz_import(bns0, b, -1, p, -1, 0, (void*)(hash));
+	         mpz_import(bns0, b, -1, p, -1, 0, (void*)(hash));
            mpz_add(bns1, bns1, bns0);
            mpz_mul(product,product,bns1);
            mpz_cdiv_q (product, product, bns0);
@@ -274,18 +276,18 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
            SHA256_Init( &ctxf_sha256 );
            SHA256_Update(  &ctxf_sha256, bdata, bytes );
            SHA256_Final( (unsigned char*) hash, &ctxf_sha256 );
-	}
+        }

-	const unsigned char *hash_ = (const unsigned char *)hash;
-	const unsigned char *target_ = (const unsigned char *)ptarget;
-	for ( i = 31; i >= 0; i-- )
+        const unsigned char *hash_ = (const unsigned char *)hash;
+        const unsigned char *target_ = (const unsigned char *)ptarget;
+        for ( i = 31; i >= 0; i-- )
        {
-	      if ( hash_[i] != target_[i] )
-              {
-		rc = hash_[i] < target_[i];
-		break;
-	      }
-	}
+	        if ( hash_[i] != target_[i] )
+           {
+		        rc = hash_[i] < target_[i];
+		        break;
+	        }
+        }
        if ( unlikely(rc) )
        {
            if ( opt_debug )
@@ -298,15 +300,15 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
                    hash_str,
                    target_str);
            }
-            work_set_target_ratio( work, hash );
            pdata[19] = data[19];
-            goto out;
-	  }
+            submit_solution( work, hash, mythr );
+        }
    } while (n < max_nonce && !work_restart[thr_id].restart);

     pdata[19] = n;

-out:
+// can this be skipped after finding a share? Seems to work ok.
+//out:
     mpf_set_prec_raw(magifpi, prec0);
     mpf_set_prec_raw(magifpi0, prec0);
     mpf_set_prec_raw(mptmp, prec0);
--- a/algo/nist5/nist5-4way.c
+++ b/algo/nist5/nist5-4way.c
@@ -12,9 +12,6 @@
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/groestl/aes_ni/hash-groestl.h"

-// no improvement with midstate
-//static __thread blake512_4way_context ctx_mid;
-
 void nist5hash_4way( void *out, const void *input )
 {
     uint64_t hash0[8] __attribute__ ((aligned (64)));
@@ -28,14 +25,11 @@ void nist5hash_4way( void *out, const void *input )
     skein512_4way_context  ctx_skein;
     keccak512_4way_context ctx_keccak;

-//     memcpy( &ctx_blake, &ctx_mid, sizeof(ctx_mid) );
-//     blake512_4way( &ctx_blake, input + (64<<2), 16 );
-
     blake512_4way_init( &ctx_blake );
     blake512_4way( &ctx_blake, input, 80 );
     blake512_4way_close( &ctx_blake, vhash );

-     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

     init_groestl( &ctx_groestl, 64 );
     update_and_final_groestl( &ctx_groestl, (char*)hash0,
@@ -50,7 +44,7 @@ void nist5hash_4way( void *out, const void *input )
     update_and_final_groestl( &ctx_groestl, (char*)hash3,
                               (const char*)hash3, 512 );

-     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );

     jh512_4way_init( &ctx_jh );
     jh512_4way( &ctx_jh, vhash, 64 );
@@ -65,22 +59,20 @@ void nist5hash_4way( void *out, const void *input )
     skein512_4way_close( &ctx_skein, out );
 }

-int scanhash_nist5_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done)
+int scanhash_nist5_4way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr )
 {
     uint32_t hash[4*16] __attribute__ ((aligned (64)));
     uint32_t *hash7 = &(hash[25]);
-     uint32_t lane_hash[8];
+     uint32_t lane_hash[8] __attribute__ ((aligned (32)));
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
-     uint32_t endiandata[20] __attribute__((aligned(64)));
     uint32_t *pdata = work->data;
     uint32_t *ptarget = work->target;
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
     const uint32_t Htarg = ptarget[7];
-     uint32_t *nonces = work->nonces;
-     int num_found = 0;
-     uint32_t *noncep = vdata + 73;   // 9*8 + 1
+     __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
+     int thr_id = mythr->id;  // thr_id arg is deprecated

     uint64_t htmax[] = {          0,
                                 0xF,
@@ -96,15 +88,7 @@ int scanhash_nist5_4way( int thr_id, struct work *work, uint32_t max_nonce,
                          0xFFFF0000,
                                   0 };

-     // we need bigendian data...
-     swab32_array( endiandata, pdata, 20 );
-
-     uint64_t *edata = (uint64_t*)endiandata;
-     mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
-
-     // precalc midstate
-//     blake512_4way_init( &ctx_mid );
-//     blake512_4way( &ctx_mid, vdata, 64 );
+     mm256_bswap32_intrlv80_4x64( vdata, pdata );

     for ( int m=0; m < 6; m++ )
     {
@@ -113,33 +97,28 @@ int scanhash_nist5_4way( int thr_id, struct work *work, uint32_t max_nonce,
           uint32_t mask = masks[m];

           do {
-              be32enc( noncep,   n   );
-              be32enc( noncep+2, n+1 );
-              be32enc( noncep+4, n+2 );
-              be32enc( noncep+6, n+3 );
+              *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+                 _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );

              nist5hash_4way( hash, vdata );

              for ( int lane = 0; lane < 4; lane++ )
              if ( ( hash7[ lane ] & mask ) == 0 )
              {
-                 mm256_extract_lane_4x64( lane_hash, hash, lane, 256 );
-                 if ( fulltest( lane_hash, ptarget ) )
+                 extr_lane_4x64( lane_hash, hash, lane, 256 );
+                 if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
                 {
                    pdata[19] = n + lane;
-                    nonces[ num_found++ ] = n + lane;
-                    work_set_target_ratio( work, lane_hash );
+                    submit_lane_solution( work, lane_hash, mythr, lane );
                 }
              }
              n += 4;
-           } while ( ( num_found == 0 ) && ( n < max_nonce )
-                     && !work_restart[thr_id].restart );
+           } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
           break;
        }
     }
-
     *hashes_done = n - first_nonce + 1;
-     return num_found;
+     return 0;
 }

 #endif
--- a/algo/nist5/nist5-gate.h
+++ b/algo/nist5/nist5-gate.h
@@ -12,15 +12,15 @@

 void nist5hash_4way( void *state, const void *input );

-int scanhash_nist5_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
+int scanhash_nist5_4way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );

 #else

 void nist5hash( void *state, const void *input );

-int scanhash_nist5( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done );
+int scanhash_nist5( struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done, struct thr_info *mythr );
 void init_nist5_ctx();
 #endif

--- a/algo/nist5/nist5.c
+++ b/algo/nist5/nist5.c
@@ -81,8 +81,8 @@ void nist5hash(void *output, const void *input)
     memcpy(output, hash, 32);
 }

-int scanhash_nist5(int thr_id, struct work *work,
-				uint32_t max_nonce, uint64_t *hashes_done)
+int scanhash_nist5( struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done, struct thr_info *mythr)
 {
        uint32_t endiandata[20] __attribute__((aligned(64)));
        uint32_t hash64[8] __attribute__((aligned(32)));
@@ -90,6 +90,7 @@ int scanhash_nist5(int thr_id, struct work *work,
        uint32_t *ptarget = work->target;
 	uint32_t n = pdata[19] - 1;
 	const uint32_t first_nonce = pdata[19];
+   int thr_id = mythr->id;  // thr_id arg is deprecated
 	const uint32_t Htarg = ptarget[7];

 	uint64_t htmax[] = {
--- a/algo/nist5/zr5.c
+++ b/algo/nist5/zr5.c
@@ -144,8 +144,8 @@ static const int arrOrder[][4] =
 	memcpy(state, hash, 32);
 }

-int scanhash_zr5( int thr_id, struct work *work,
-                   uint32_t max_nonce, unsigned long *hashes_done)
+int scanhash_zr5( struct work *work, uint32_t max_nonce,
+                  unsigned long *hashes_done, struct thr_info *mythr )
 {
  uint32_t *pdata = work->data;
  uint32_t *ptarget = work->target;
@@ -154,6 +154,7 @@ int scanhash_zr5( int thr_id, struct work *work,
  const uint32_t version = pdata[0] & (~POK_DATA_MASK);
  const uint32_t first_nonce = pdata[19];
  uint32_t nonce = first_nonce;
+  int thr_id = mythr->id;  // thr_id arg is deprecated

  memcpy(tmpdata, pdata, 80);

--- a/algo/quark/anime-4way.c
+++ b/algo/quark/anime-4way.c
@@ -48,8 +48,9 @@ void anime_4way_hash( void *state, const void *input )
    __m256i* vhA = (__m256i*)vhashA;
    __m256i* vhB = (__m256i*)vhashB;
    __m256i vh_mask;
+    const uint32_t mask = 8;
    const __m256i bit3_mask = _mm256_set1_epi64x( 8 );
-    int i;
+    const __m256i zero = _mm256_setzero_si256();
    anime_4way_ctx_holder ctx;
    memcpy( &ctx, &anime_4way_ctx, sizeof(anime_4way_ctx) );

@@ -59,30 +60,46 @@ void anime_4way_hash( void *state, const void *input )
    blake512_4way( &ctx.blake, vhash, 64 );
    blake512_4way_close( &ctx.blake, vhash );

-    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
-                                  m256_zero );
+    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );

-    mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
-    update_and_final_groestl( &ctx.groestl, (char*)hash0,
-                                            (char*)hash0, 512 );
-    reinit_groestl( &ctx.groestl );
-    update_and_final_groestl( &ctx.groestl, (char*)hash1,
-                                            (char*)hash1, 512 );
-    reinit_groestl( &ctx.groestl );
-    update_and_final_groestl( &ctx.groestl, (char*)hash2,
-                                            (char*)hash2, 512 );
-    reinit_groestl( &ctx.groestl );
-    update_and_final_groestl( &ctx.groestl, (char*)hash3,
-                                            (char*)hash3, 512 );
-    mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
+    dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

-    skein512_4way( &ctx.skein, vhash, 64 );
-    skein512_4way_close( &ctx.skein, vhashB );
+    if ( hash0[0] & mask )
+    {
+       update_and_final_groestl( &ctx.groestl, (char*)hash0,
+                                               (char*)hash0, 512 );
+    }
+    if ( hash1[0] & mask )
+    {
+       reinit_groestl( &ctx.groestl );
+       update_and_final_groestl( &ctx.groestl, (char*)hash1,
+                                               (char*)hash1, 512 );
+    }
+    if ( hash2[0] & mask )
+    {
+       reinit_groestl( &ctx.groestl );
+       update_and_final_groestl( &ctx.groestl, (char*)hash2,
+                                               (char*)hash2, 512 );
+    }
+    if ( hash3[0] & mask )
+    {
+       reinit_groestl( &ctx.groestl );
+       update_and_final_groestl( &ctx.groestl, (char*)hash3,
+                                               (char*)hash3, 512 );
+    }

-    for ( i = 0; i < 8; i++ )
-       vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
+    intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
+
+    if ( mm256_anybits0( vh_mask ) )
+    {
+       skein512_4way( &ctx.skein, vhash, 64 );
+       skein512_4way_close( &ctx.skein, vhashB );
+    }
+
+    mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
+
+    dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

-    mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
    reinit_groestl( &ctx.groestl );
    update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
    reinit_groestl( &ctx.groestl );
@@ -91,24 +108,28 @@ void anime_4way_hash( void *state, const void *input )
    update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
    reinit_groestl( &ctx.groestl );
    update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
-    mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+
+    intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );

    jh512_4way( &ctx.jh, vhash, 64 );
    jh512_4way_close( &ctx.jh, vhash );

-    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
-                                  m256_zero );
+    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );

+    if ( mm256_anybits1( vh_mask ) )
+    {
       blake512_4way_init( &ctx.blake );
       blake512_4way( &ctx.blake, vhash, 64 );
       blake512_4way_close( &ctx.blake, vhashA );
-
+    }
+    if ( mm256_anybits0( vh_mask ) )
+    {
       bmw512_4way_init( &ctx.bmw );
       bmw512_4way( &ctx.bmw, vhash, 64 );
       bmw512_4way_close( &ctx.bmw, vhashB );
+    }

-    for ( i = 0; i < 8; i++ )
-       vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
+    mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );

    keccak512_4way( &ctx.keccak, vhash, 64 );
    keccak512_4way_close( &ctx.keccak, vhash );
@@ -117,36 +138,37 @@ void anime_4way_hash( void *state, const void *input )
    skein512_4way( &ctx.skein, vhash, 64 );
    skein512_4way_close( &ctx.skein, vhash );

-    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
-                                  m256_zero );
+    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );

-    keccak512_4way_init( &ctx.keccak );
-    keccak512_4way( &ctx.keccak, vhash, 64 );
-    keccak512_4way_close( &ctx.keccak, vhashA );
+    if ( mm256_anybits1( vh_mask ) )
+    {
+       keccak512_4way_init( &ctx.keccak );
+       keccak512_4way( &ctx.keccak, vhash, 64 );
+       keccak512_4way_close( &ctx.keccak, vhashA );
+    }
+    if ( mm256_anybits0( vh_mask ) )
+    {
+       jh512_4way_init( &ctx.jh );
+       jh512_4way( &ctx.jh, vhash, 64 );
+       jh512_4way_close( &ctx.jh, vhashB );
+    }

-    jh512_4way_init( &ctx.jh );
-    jh512_4way( &ctx.jh, vhash, 64 );
-    jh512_4way_close( &ctx.jh, vhashB );
+    mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );

-    for ( i = 0; i < 8; i++ )
-       vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
-
-    mm256_deinterleave_4x64( state, state+32, state+64, state+96, vhash, 256 );
+    dintrlv_4x64( state, state+32, state+64, state+96, vhash, 256 );
 }

-int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done)
+int scanhash_anime_4way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr )
 {
    uint32_t hash[4*8] __attribute__ ((aligned (64)));
    uint32_t vdata[24*4] __attribute__ ((aligned (64)));
-    uint32_t endiandata[20] __attribute__((aligned(64)));
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
    uint32_t n = pdata[19];
    const uint32_t first_nonce = pdata[19];
-    uint32_t *nonces = work->nonces;
-    int num_found = 0;
-    uint32_t *noncep = vdata + 73;   // 9*8 + 1
+    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
+    int thr_id = mythr->id;  // thr_id arg is deprecated
    const uint32_t Htarg = ptarget[7];
    uint64_t htmax[] = {
                0,
@@ -165,10 +187,7 @@ int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce,
                0
        };

-    swab32_array( endiandata, pdata, 20 );
-
-    uint64_t *edata = (uint64_t*)endiandata;
-    mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+    mm256_bswap32_intrlv80_4x64( vdata, pdata );

    for (int m=0; m < 6; m++)
       if (Htarg <= htmax[m])
@@ -177,30 +196,26 @@ int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce,

          do
          {
-             be32enc( noncep,   n   );
-             be32enc( noncep+2, n+1 );
-             be32enc( noncep+4, n+2 );
-             be32enc( noncep+6, n+3 );
+             *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );

             anime_4way_hash( hash, vdata );
             pdata[19] = n;

             for ( int i = 0; i < 4; i++ )
             if ( ( ( (hash+(i<<3))[7] & mask ) == 0 )
-                && fulltest( hash+(i<<3), ptarget ) )
+                && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
             {
                pdata[19] = n+i;
-                nonces[ num_found++ ] = n+i;
-                work_set_target_ratio( work, hash+(i<<3) );
+                submit_lane_solution( work, hash+(i<<3), mythr, i );
             }
             n += 4;
-          } while ( ( num_found == 0 ) && ( n < max_nonce )
-              && !work_restart[thr_id].restart );
+          } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
          break;
       }

    *hashes_done = n - first_nonce + 1;
-    return num_found;
+    return 0;
 }

 #endif
--- a/algo/quark/anime-gate.h
+++ b/algo/quark/anime-gate.h
@@ -13,19 +13,15 @@ bool register_anime_algo( algo_gate_t* gate );
 #if defined(ANIME_4WAY)

 void anime_4way_hash( void *state, const void *input );
-
-int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
-
+int scanhash_anime_4way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
 void init_anime_4way_ctx();

 #endif

 void anime_hash( void *state, const void *input );
-
-int scanhash_anime( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done );
-
+int scanhash_anime( struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done, struct thr_info *mythr );
 void init_anime_ctx();

 #endif
--- a/algo/quark/anime.c
+++ b/algo/quark/anime.c
@@ -46,20 +46,6 @@ void init_anime_ctx()
 void anime_hash( void *state, const void *input )
 {
    unsigned char hash[128] __attribute__ ((aligned (32)));
-/*
-    uint64_t hash0[8] __attribute__ ((aligned (64)));
-    uint64_t hash1[8] __attribute__ ((aligned (64)));
-    uint64_t hash2[8] __attribute__ ((aligned (64)));
-    uint64_t hash3[8] __attribute__ ((aligned (64)));
-    uint64_t vhash[8*4] __attribute__ ((aligned (64)));
-    uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
-    uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
-    __m256i* vh  = (__m256i*)vhash;
-    __m256i* vhA = (__m256i*)vhashA;
-    __m256i* vhB = (__m256i*)vhashB;
-    __m256i vh_mask;
-    __m256i bit3_mask; bit3_mask = _mm256_set1_epi64x( 8 );
-*/
    uint32_t mask = 8;
    anime_ctx_holder ctx;
    memcpy( &ctx, &anime_ctx, sizeof(anime_ctx) );
@@ -133,8 +119,8 @@ void anime_hash( void *state, const void *input )
   memcpy( state, hash, 32 );
 }

-int scanhash_anime( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done)
+int scanhash_anime( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr)
 {
    uint32_t hash[8] __attribute__ ((aligned (64)));
    uint32_t endiandata[20] __attribute__((aligned(64)));
@@ -142,6 +128,7 @@ int scanhash_anime( int thr_id, struct work *work, uint32_t max_nonce,
    uint32_t *ptarget = work->target;
    uint32_t n = pdata[19];
    const uint32_t first_nonce = pdata[19];
+    int thr_id = mythr->id;  // thr_id arg is deprecated
    const uint32_t Htarg = ptarget[7];
    uint64_t htmax[] = {
                0,
--- a/algo/quark/hmq1725-4way.c
+++ b/algo/quark/hmq1725-4way.c
@@ -0,0 +1,618 @@
+#include "hmq1725-gate.h"
+
+#if defined(HMQ1725_4WAY)
+
+#include <string.h>
+#include <stdint.h>
+#include "algo/blake/blake-hash-4way.h"
+#include "algo/bmw/bmw-hash-4way.h"
+#include "algo/groestl/aes_ni/hash-groestl.h"
+#include "algo/skein/skein-hash-4way.h"
+#include "algo/jh/jh-hash-4way.h"
+#include "algo/keccak/keccak-hash-4way.h"
+#include "algo/luffa/luffa_for_sse2.h"
+#include "algo/cubehash/cubehash_sse2.h"
+#include "algo/simd/nist.h"
+#include "algo/shavite/sph_shavite.h"
+#include "algo/simd/simd-hash-2way.h"
+#include "algo/echo/aes_ni/hash_api.h"
+#include "algo/hamsi/hamsi-hash-4way.h"
+#include "algo/fugue/sph_fugue.h"
+#include "algo/shabal/shabal-hash-4way.h"
+#include "algo/whirlpool/sph_whirlpool.h"
+#include "algo/haval/haval-hash-4way.h"
+#include "algo/sha/sha2-hash-4way.h"
+
+union _hmq1725_4way_context_overlay   // all hash contexts overlaid in one storage area
+{                                     // stages seen in hmq1725_4way_hash call their init before each use
+    blake512_4way_context   blake;      // used via blake512_4way_* on interleaved vhash
+    bmw512_4way_context     bmw;        // used via bmw512_4way_* on interleaved input/vhash
+    hashState_groestl       groestl;    // init_groestl + update_and_final_groestl, once per lane
+    skein512_4way_context   skein;      // used via skein512_4way_* on interleaved vhash
+    jh512_4way_context      jh;         // used via jh512_4way_* on interleaved vhash
+    keccak512_4way_context  keccak;     // used via keccak512_4way_* on interleaved vhash
+    hashState_luffa         luffa;      // init_luffa + update_and_final_luffa, once per lane
+    cubehashParam           cube;       // cubehashInit + cubehashUpdateDigest, once per lane
+    sph_shavite512_context  shavite;    // sph_shavite512_*, once per lane
+    hashState_sd            simd;       // init_sd + update_final_sd, once per lane
+    hashState_echo          echo;       // init_echo + update_final_echo, once per lane
+    hamsi512_4way_context   hamsi;      // used via hamsi512_4way_* on interleaved vhash
+    sph_fugue512_context    fugue;      // sph_fugue512_*, once per lane
+    shabal512_4way_context  shabal;     // used via shabal512_4way_* after intrlv_4x32
+    sph_whirlpool_context   whirlpool;  // sph_whirlpool_*, once per lane
+    sha512_4way_context     sha512;     // used via sha512_4way_* on interleaved vhash
+    haval256_5_4way_context haval;      // used via haval256_5_4way_* on interleaved vhash
+};
+typedef union _hmq1725_4way_context_overlay hmq1725_4way_context_overlay;   // convenience alias for the union
+
+extern void hmq1725_4way_hash(void *state, const void *input)
+{
+// why so big? only really need 16.
+     uint32_t hash0 [32]    __attribute__ ((aligned (64)));
+     uint32_t hash1 [32]    __attribute__ ((aligned (64)));
+     uint32_t hash2 [32]    __attribute__ ((aligned (64)));
+     uint32_t hash3 [32]    __attribute__ ((aligned (64)));
+     uint32_t vhash [32<<2] __attribute__ ((aligned (64)));
+     uint32_t vhashA[32<<2] __attribute__ ((aligned (64)));
+     uint32_t vhashB[32<<2] __attribute__ ((aligned (64)));
+     hmq1725_4way_context_overlay ctx __attribute__ ((aligned (64)));
+     __m256i vh_mask;     
+     const __m256i vmask = _mm256_set1_epi64x( 24 );
+     const uint32_t mask = 24;
+     __m256i* vh  = (__m256i*)vhash;
+     __m256i* vhA = (__m256i*)vhashA;
+     __m256i* vhB = (__m256i*)vhashB;
+
+     bmw512_4way_init( &ctx.bmw );
+     bmw512_4way( &ctx.bmw, input, 80 );
+     bmw512_4way_close( &ctx.bmw, vhash );
+
+     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+
+     sph_whirlpool_init( &ctx.whirlpool );
+     sph_whirlpool( &ctx.whirlpool, hash0, 64 );
+     sph_whirlpool_close( &ctx.whirlpool, hash0 );
+     sph_whirlpool_init( &ctx.whirlpool );
+     sph_whirlpool( &ctx.whirlpool, hash1, 64 );
+     sph_whirlpool_close( &ctx.whirlpool, hash1 );
+     sph_whirlpool_init( &ctx.whirlpool );
+     sph_whirlpool( &ctx.whirlpool, hash2, 64 );
+     sph_whirlpool_close( &ctx.whirlpool, hash2 );
+     sph_whirlpool_init( &ctx.whirlpool );
+     sph_whirlpool( &ctx.whirlpool, hash3, 64 );
+     sph_whirlpool_close( &ctx.whirlpool, hash3 );
+
+// first fork, A is groestl serial, B is skein parallel.
+
+     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+
+     vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ),
+                                   m256_zero );
+
+// A
+
+//     if ( hash0[0] & mask )
+//     {
+       init_groestl( &ctx.groestl, 64 );
+       update_and_final_groestl( &ctx.groestl, (char*)hash0,
+                                               (char*)hash0, 512 );
+//     }
+//     if ( hash1[0] & mask )
+//     {
+       init_groestl( &ctx.groestl, 64 );
+       update_and_final_groestl( &ctx.groestl, (char*)hash1,
+                                               (char*)hash1, 512 );
+//     }
+//     if ( hash2[0] & mask )
+//     {
+       init_groestl( &ctx.groestl, 64 );
+       update_and_final_groestl( &ctx.groestl, (char*)hash2,
+                                               (char*)hash2, 512 );
+//     }
+//     if ( hash3[0] & mask )
+//     {
+       init_groestl( &ctx.groestl, 64 );
+       update_and_final_groestl( &ctx.groestl, (char*)hash3,
+                                               (char*)hash3, 512 );
+//     }
+
+     intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
+
+// B
+
+//     if ( mm256_any_clr_256( vh_mask ) )
+//     {
+       skein512_4way_init( &ctx.skein );
+       skein512_4way( &ctx.skein, vhash, 64 );
+       skein512_4way_close( &ctx.skein, vhashB );
+//     }
+
+     mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
+
+     jh512_4way_init( &ctx.jh );
+     jh512_4way( &ctx.jh, vhash, 64 );
+     jh512_4way_close( &ctx.jh, vhash );
+
+     keccak512_4way_init( &ctx.keccak );
+     keccak512_4way( &ctx.keccak, vhash, 64 );
+     keccak512_4way_close( &ctx.keccak, vhash );
+
+// second fork, A = blake parallel, B= bmw parallel.
+    
+     vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ),
+                                   m256_zero );
+
+//     if ( mm256_any_set_256( vh_mask ) )
+//     {
+       blake512_4way_init( &ctx.blake );
+       blake512_4way( &ctx.blake, vhash, 64 );
+       blake512_4way_close( &ctx.blake, vhashA );
+//     }
+
+//     if ( mm256_any_clr_256( vh_mask ) )
+//     {
+       bmw512_4way_init( &ctx.bmw );
+       bmw512_4way( &ctx.bmw, vhash, 64 );
+       bmw512_4way_close( &ctx.bmw, vhashB );
+//     }
+
+     mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
+    
+     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+
+     init_luffa( &ctx.luffa, 512 );
+     update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
+                                   (const BitSequence*)hash0, 64 );
+     init_luffa( &ctx.luffa, 512 );
+     update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
+                                   (const BitSequence*)hash1, 64 );
+     init_luffa( &ctx.luffa, 512 );
+     update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
+                                   (const BitSequence*)hash2, 64 );
+     init_luffa( &ctx.luffa, 512 );
+     update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
+                                   (const BitSequence*)hash3, 64 );
+
+     cubehashInit( &ctx.cube, 512, 16, 32 );
+     cubehashUpdateDigest( &ctx.cube, (BitSequence *)hash0,
+                                (const BitSequence *)hash0, 64 );
+     cubehashInit( &ctx.cube, 512, 16, 32 );
+     cubehashUpdateDigest( &ctx.cube, (BitSequence *)hash1,
+                                (const BitSequence *)hash1, 64 );
+     cubehashInit( &ctx.cube, 512, 16, 32 );
+     cubehashUpdateDigest( &ctx.cube, (BitSequence *)hash2,
+                                (const BitSequence *)hash2, 64 );
+     cubehashInit( &ctx.cube, 512, 16, 32 );
+     cubehashUpdateDigest( &ctx.cube, (BitSequence *)hash3,
+                                (const BitSequence *)hash3, 64 );
+
+     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+
+// A= keccak parallel, B= jh parallel
+    
+     vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ),
+                                  m256_zero );
+
+//     if ( mm256_any_set_256( vh_mask ) )
+//     {
+        keccak512_4way_init( &ctx.keccak );
+        keccak512_4way( &ctx.keccak, vhash, 64 );
+        keccak512_4way_close( &ctx.keccak, vhashA );
+//     }
+
+//     if ( mm256_any_clr_256( vh_mask ) )
+//     {
+        jh512_4way_init( &ctx.jh );
+        jh512_4way( &ctx.jh, vhash, 64 );
+        jh512_4way_close( &ctx.jh, vhashB );
+//     }
+
+     mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
+
+     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+
+     sph_shavite512_init( &ctx.shavite );
+     sph_shavite512 ( &ctx.shavite, hash0, 64 );
+     sph_shavite512_close( &ctx.shavite, hash0 );
+     sph_shavite512_init( &ctx.shavite );
+     sph_shavite512 ( &ctx.shavite, hash1, 64 );
+     sph_shavite512_close( &ctx.shavite, hash1 );
+     sph_shavite512_init( &ctx.shavite );
+     sph_shavite512 ( &ctx.shavite, hash2, 64 );
+     sph_shavite512_close( &ctx.shavite, hash2 );
+     sph_shavite512_init( &ctx.shavite );
+     sph_shavite512 ( &ctx.shavite, hash3, 64 );
+     sph_shavite512_close( &ctx.shavite, hash3 );
+
+     init_sd( &ctx.simd, 512 );
+     update_final_sd( &ctx.simd, (BitSequence *)hash0,
+                           (const BitSequence *)hash0, 512 );
+     init_sd( &ctx.simd, 512 );
+     update_final_sd( &ctx.simd, (BitSequence *)hash1,
+                           (const BitSequence *)hash1, 512 );
+     init_sd( &ctx.simd, 512 );
+     update_final_sd( &ctx.simd, (BitSequence *)hash2,
+                           (const BitSequence *)hash2, 512 );
+     init_sd( &ctx.simd, 512 );
+     update_final_sd( &ctx.simd, (BitSequence *)hash3,
+                           (const BitSequence *)hash3, 512 );
+
+// A is whirlpool serial, B is haval parallel.
+    
+
+     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+
+     vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ),
+                                   m256_zero );
+     // A
+    
+//     if ( hash0[0] & mask )
+//     {
+        sph_whirlpool_init( &ctx.whirlpool );
+        sph_whirlpool( &ctx.whirlpool, hash0, 64 );
+        sph_whirlpool_close( &ctx.whirlpool, hash0 );
+//     }
+//     if ( hash1[0] & mask )
+//     {
+        sph_whirlpool_init( &ctx.whirlpool );
+        sph_whirlpool( &ctx.whirlpool, hash1, 64 );
+        sph_whirlpool_close( &ctx.whirlpool, hash1 );
+//     }
+//     if ( hash2[0] & mask )
+//     {
+        sph_whirlpool_init( &ctx.whirlpool );
+        sph_whirlpool( &ctx.whirlpool, hash2, 64 );
+        sph_whirlpool_close( &ctx.whirlpool, hash2 );
+//     }
+//     if ( hash3[0] & mask )
+//     {
+        sph_whirlpool_init( &ctx.whirlpool );
+        sph_whirlpool( &ctx.whirlpool, hash3, 64 );
+        sph_whirlpool_close( &ctx.whirlpool, hash3 );
+//     }
+
+     intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
+
+// B
+
+//     if ( mm256_any_clr_256( vh_mask ) )
+//     {
+        haval256_5_4way_init( &ctx.haval );
+        haval256_5_4way( &ctx.haval, vhash, 64 );
+        haval256_5_4way_close( &ctx.haval, vhashB );
+        memset( &vhashB[8<<2], 0, 32<<2);
+//     }
+
+     mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
+
+     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+    
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash0,
+                             (const BitSequence *)hash0, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash1,
+                             (const BitSequence *)hash1, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash2,
+                             (const BitSequence *)hash2, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash3,
+                             (const BitSequence *)hash3, 512 );
+
+     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+     
+     blake512_4way_init( &ctx.blake );
+     blake512_4way( &ctx.blake, vhash, 64 );
+     blake512_4way_close( &ctx.blake, vhash );
+
+     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+
+// shavite & luffa, both serial, select individually.
+
+   if ( hash0[0] & mask )
+   {
+      sph_shavite512_init( &ctx.shavite );
+      sph_shavite512( &ctx.shavite, hash0, 64 ); //
+      sph_shavite512_close( &ctx.shavite, hash0 ); //8
+   }
+   else
+   {
+      init_luffa( &ctx.luffa, 512 );
+      update_and_final_luffa( &ctx.luffa, (BitSequence *)hash0,
+                                    (const BitSequence *)hash0, 64 );
+   }
+
+   if ( hash1[0] & mask )
+   {
+      sph_shavite512_init( &ctx.shavite );
+      sph_shavite512( &ctx.shavite, hash1, 64 ); //
+      sph_shavite512_close( &ctx.shavite, hash1 ); //8
+   }
+   else
+   {
+      init_luffa( &ctx.luffa, 512 );
+      update_and_final_luffa( &ctx.luffa, (BitSequence *)hash1,
+                                    (const BitSequence *)hash1, 64 );
+   }
+
+   if ( hash2[0] & mask )
+   {
+      sph_shavite512_init( &ctx.shavite );
+      sph_shavite512( &ctx.shavite, hash2, 64 ); //
+      sph_shavite512_close( &ctx.shavite, hash2 ); //8
+   }
+   else
+   {
+      init_luffa( &ctx.luffa, 512 );
+      update_and_final_luffa( &ctx.luffa, (BitSequence *)hash2,
+                                    (const BitSequence *)hash2, 64 );
+   }
+
+   if ( hash3[0] & mask )
+   {
+      sph_shavite512_init( &ctx.shavite );
+      sph_shavite512( &ctx.shavite, hash3, 64 ); //
+      sph_shavite512_close( &ctx.shavite, hash3 ); //8
+   }
+   else
+   {
+      init_luffa( &ctx.luffa, 512 );
+      update_and_final_luffa( &ctx.luffa, (BitSequence *)hash3,
+                                    (const BitSequence *)hash3, 64 );
+   }
+
+   intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+
+   hamsi512_4way_init( &ctx.hamsi );
+   hamsi512_4way( &ctx.hamsi, vhash, 64 );
+   hamsi512_4way_close( &ctx.hamsi, vhash );
+
+   dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+
+   sph_fugue512_init( &ctx.fugue );
+   sph_fugue512( &ctx.fugue, hash0, 64 );
+   sph_fugue512_close( &ctx.fugue, hash0 );
+   sph_fugue512_init( &ctx.fugue );
+   sph_fugue512( &ctx.fugue, hash1, 64 );
+   sph_fugue512_close( &ctx.fugue, hash1 );
+   sph_fugue512_init( &ctx.fugue );
+   sph_fugue512( &ctx.fugue, hash2, 64 );
+   sph_fugue512_close( &ctx.fugue, hash2 );
+   sph_fugue512_init( &ctx.fugue );
+   sph_fugue512( &ctx.fugue, hash3, 64 );
+   sph_fugue512_close( &ctx.fugue, hash3 );
+
+
+//  A echo, B sd both serial
+   
+   if ( hash0[0] & mask ) //4
+   {
+       init_echo( &ctx.echo, 512 );
+       update_final_echo( &ctx.echo, (BitSequence *)hash0,
+                               (const BitSequence *)hash0, 512 );
+   }
+   else
+   {
+       init_sd( &ctx.simd, 512 );
+       update_final_sd( &ctx.simd, (BitSequence *)hash0,
+                             (const BitSequence *)hash0, 512 );
+   }
+
+   if ( hash1[0] & mask ) //4
+   {
+       init_echo( &ctx.echo, 512 );
+       update_final_echo( &ctx.echo, (BitSequence *)hash1,
+                               (const BitSequence *)hash1, 512 );
+   }
+   else
+   {
+       init_sd( &ctx.simd, 512 );
+       update_final_sd( &ctx.simd, (BitSequence *)hash1,
+                             (const BitSequence *)hash1, 512 );
+   }
+
+   if ( hash2[0] & mask ) //4
+   {
+       init_echo( &ctx.echo, 512 );
+       update_final_echo( &ctx.echo, (BitSequence *)hash2,
+                               (const BitSequence *)hash2, 512 );
+   }
+   else
+   {
+       init_sd( &ctx.simd, 512 );
+       update_final_sd( &ctx.simd, (BitSequence *)hash2,
+                             (const BitSequence *)hash2, 512 );
+   }
+
+   if ( hash3[0] & mask ) //4
+   {
+       init_echo( &ctx.echo, 512 );
+       update_final_echo( &ctx.echo, (BitSequence *)hash3,
+                               (const BitSequence *)hash3, 512 );
+   }
+   else
+   {
+       init_sd( &ctx.simd, 512 );
+       update_final_sd( &ctx.simd, (BitSequence *)hash3,
+                             (const BitSequence *)hash3, 512 );
+   }
+
+   intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+
+   shabal512_4way_init( &ctx.shabal );
+   shabal512_4way( &ctx.shabal, vhash, 64 );
+   shabal512_4way_close( &ctx.shabal, vhash );
+
+   dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+
+   sph_whirlpool_init( &ctx.whirlpool );
+   sph_whirlpool( &ctx.whirlpool, hash0, 64 );
+   sph_whirlpool_close( &ctx.whirlpool, hash0 );
+   sph_whirlpool_init( &ctx.whirlpool );
+   sph_whirlpool( &ctx.whirlpool, hash1, 64 );
+   sph_whirlpool_close( &ctx.whirlpool, hash1 );
+   sph_whirlpool_init( &ctx.whirlpool );
+   sph_whirlpool( &ctx.whirlpool, hash2, 64 );
+   sph_whirlpool_close( &ctx.whirlpool, hash2 );
+   sph_whirlpool_init( &ctx.whirlpool );
+   sph_whirlpool( &ctx.whirlpool, hash3, 64 );
+   sph_whirlpool_close( &ctx.whirlpool, hash3 );
+
+// A = fugue serial, B = sha512 parallel
+   
+   intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+
+   vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ),
+                                 m256_zero );
+
+//   if ( hash0[0] & mask ) 
+//   {
+      sph_fugue512_init( &ctx.fugue );
+      sph_fugue512( &ctx.fugue, hash0, 64 );
+      sph_fugue512_close( &ctx.fugue, hash0 );
+//   }
+//   if ( hash1[0] & mask ) 
+//   {
+      sph_fugue512_init( &ctx.fugue );
+      sph_fugue512( &ctx.fugue, hash1, 64 );
+      sph_fugue512_close( &ctx.fugue, hash1 );
+//   }
+//   if ( hash2[0] & mask ) 
+//   {
+      sph_fugue512_init( &ctx.fugue );
+      sph_fugue512( &ctx.fugue, hash2, 64 );
+      sph_fugue512_close( &ctx.fugue, hash2 );
+//   }
+//   if ( hash3[0] & mask ) 
+//   {
+      sph_fugue512_init( &ctx.fugue );
+      sph_fugue512( &ctx.fugue, hash3, 64 );
+      sph_fugue512_close( &ctx.fugue, hash3 );
+//   }
+
+   intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
+
+//   if ( mm256_any_clr_256( vh_mask ) )
+//   {
+      sha512_4way_init( &ctx.sha512 );
+      sha512_4way( &ctx.sha512, vhash, 64 );
+      sha512_4way_close( &ctx.sha512, vhashB );
+//   }
+
+   mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
+
+   dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+
+   init_groestl( &ctx.groestl, 64 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+   init_groestl( &ctx.groestl, 64 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+   init_groestl( &ctx.groestl, 64 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+   init_groestl( &ctx.groestl, 64 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+
+   intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+
+   sha512_4way_init( &ctx.sha512 ); 
+   sha512_4way( &ctx.sha512, vhash, 64 );
+   sha512_4way_close( &ctx.sha512, vhash ); 
+
+// A = haval parallel, B = Whirlpool serial
+
+   vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], vmask ),
+                                 m256_zero );
+
+   dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     
+//   if ( mm256_any_set_256( vh_mask ) ) //4
+//   {
+      haval256_5_4way_init( &ctx.haval );
+      haval256_5_4way( &ctx.haval, vhash, 64 );
+      haval256_5_4way_close( &ctx.haval, vhashA );
+      memset( &vhashA[8<<2], 0, 32<<2 );
+//   }
+
+//   if ( !( hash0[0] & mask ) )
+//   {
+      sph_whirlpool_init( &ctx.whirlpool );
+      sph_whirlpool( &ctx.whirlpool, hash0, 64 );
+      sph_whirlpool_close( &ctx.whirlpool, hash0 );
+//   }
+//   if ( !( hash1[0] & mask ) )
+//   {
+      sph_whirlpool_init( &ctx.whirlpool );
+      sph_whirlpool( &ctx.whirlpool, hash1, 64 );
+      sph_whirlpool_close( &ctx.whirlpool, hash1 );
+//   }
+//   if ( !( hash2[0] & mask ) )
+//   {
+      sph_whirlpool_init( &ctx.whirlpool );
+      sph_whirlpool( &ctx.whirlpool, hash2, 64 );
+      sph_whirlpool_close( &ctx.whirlpool, hash2 );
+//   }
+//   if ( !( hash3[0] & mask ) )
+//   {
+      sph_whirlpool_init( &ctx.whirlpool );
+      sph_whirlpool( &ctx.whirlpool, hash3, 64 );
+      sph_whirlpool_close( &ctx.whirlpool, hash3 );
+//   }
+
+   intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, 512 );
+
+   mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
+
+   bmw512_4way_init( &ctx.bmw );
+   bmw512_4way( &ctx.bmw, vhash, 64 );
+   bmw512_4way_close( &ctx.bmw, vhash );
+
+ 	memcpy(state, vhash, 32<<2 );
+}
+
+// Scan nonces four at a time for hmq1725 using the 4-way AVX2 hash.
+//
+// work:        supplies data[] (header words, nonce at index 19) and target[].
+// max_nonce:   upper bound of the scan; the loop stops once the base nonce of
+//              the next batch is no longer below max_nonce-4.
+// hashes_done: receives the number of nonces hashed by this call.
+// mythr:       supplies the thread id used to poll work_restart[] and is
+//              passed on to submit_lane_solution().
+//
+// Always returns 0. Candidates that pass fulltest() are reported through
+// submit_lane_solution() with pdata[19] set to the matching nonce.
+int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce,
+                           uint64_t *hashes_done, struct thr_info *mythr )
+{
+   // hmq1725_4way_hash() copies out the first 128 bytes of its 4x64
+   // interleaved result, so 64-bit word k of lane i occupies the 32-bit
+   // indexes 8*k + 2*i and 8*k + 2*i + 1 of hash[].
+   uint32_t hash[4*8] __attribute__ ((aligned (64)));
+   // 32-bit word 7 of lane i is therefore hash[ 25 + 2*i ] = hash7[ i<<1 ].
+   uint32_t *hash7 = &(hash[25]);
+   // De-interleaved 256-bit hash of the lane currently being checked.
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   // n starts one below pdata[19], so the first batch hashes first_nonce-1
+   // and "n - first_nonce + 1" equals four times the number of batches.
+   // NOTE(review): confirm that scanning first_nonce-1 is intended.
+   uint32_t n = pdata[19] - 1;
+   const uint32_t first_nonce = pdata[19];
+   // Row 9 of the 4x64 interleaved data: header words 18 and 19 of each lane.
+   __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
+   int thr_id = mythr->id;  // thr_id arg is deprecated
+   const uint32_t Htarg = ptarget[7];
+   // Pre-filter table: the first entry with Htarg <= htmax[m] selects
+   // masks[m], a cheap test on hash word 7 that runs before fulltest().
+   const uint64_t htmax[] = {          0,        0xF,       0xFF,
+                                   0xFFF,     0xFFFF, 0x10000000  };
+   const uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
+                              0xFFFFF000, 0xFFFF0000,          0  };
+
+   mm256_bswap32_intrlv80_4x64( vdata, pdata );
+   for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
+   {
+      const uint32_t mask = masks[ m ];
+      do
+      {
+         // Nonces n..n+3 go into the odd 32-bit elements (header word 19 of
+         // lanes 0..3); presumably the blend keeps the even elements of
+         // *noncev -- verify against mm256_intrlv_blend_32.
+         *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+                 _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
+         hmq1725_4way_hash( hash, vdata );
+         for ( int i = 0; i < 4; i++ )
+         if ( ( hash7[ i<<1 ] & mask ) == 0 )
+         {
+            // Gather lane i out of the interleaved buffer before the full
+            // target comparison; indexing hash[] as four contiguous hashes
+            // would mix words from different lanes.
+            for ( int k = 0; k < 4; k++ )
+            {
+               lane_hash[ 2*k     ] = hash[ 8*k + 2*i     ];
+               lane_hash[ 2*k + 1 ] = hash[ 8*k + 2*i + 1 ];
+            }
+            if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+            {
+               pdata[19] = n + i;
+               submit_lane_solution( work, lane_hash, mythr, i );
+            }
+         }
+         n += 4;
+      // max_nonce >= 4 keeps the unsigned subtraction from wrapping around.
+      } while ( ( max_nonce >= 4 ) && ( n < max_nonce-4 )
+                && !work_restart[thr_id].restart );
+      break;
+   }
+   // NOTE(review): pdata[19] is only written when a solution is found;
+   // confirm whether the caller expects it to be advanced to n here.
+   *hashes_done = n - first_nonce + 1;
+   return 0;
+}
+
+#endif // HMQ1725_4WAY
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Jay D Dee	e2d5762ef2	v3.9.5.4	2019-07-15 17:00:26 -04:00
Jay D Dee	e625ed5420	v3.9.5.3	2019-07-12 10:42:38 -04:00
Jay D Dee	9abc19a30a	v3.9.5.2	2019-07-04 12:12:11 -04:00
Jay D Dee	0d769ee0fe	v3.9.5.1	2019-07-02 15:10:38 -04:00
Jay D Dee	0d48d573ce	v3.9.5	2019-06-26 14:16:01 -04:00
Jay D Dee	d6e8d7a46e	v3.9.4	2019-06-18 13:15:45 -04:00
Jay D Dee	71d6b97ee8	v3.9.3.1	2019-06-13 21:15:58 -04:00
Jay D Dee	b2331375a3	v3.9.2.5	2019-06-13 11:20:27 -04:00
Jay D Dee	7fec680835	v3.9.2.4	2019-06-07 23:30:38 -04:00