v3.9.8.1

v3.9.8
v3.9.7
2025-09-17 23:44:27 +00:00 · 2019-10-01 14:17:36 -04:00 · 2019-09-26 22:37:26 -04:00 · 2019-08-03 10:39:54 -04:00 · 2019-07-30 10:16:43 -04:00 · 2019-07-18 19:46:57 -04:00
282 changed files with 13656 additions and 10251 deletions
--- a/3
+++ b/3
@@ -42,9 +42,6 @@ openssl 1.1.0e or higher. Add one of the following, depending on the
 compiler version, to CFLAGS:
 "-march=native" or "-march=znver1" or "-msha".

-Due to poor AVX2 performance on Ryzen users should add -DRYZEN_ to CFLAGS
-to override multiway AVX2 on algos with sha256, and use SHA instead.
-
 Additional instructions for static compilalation can be found here:
 https://lxadm.com/Static_compilation_of_cpuminer
 Static builds should only considered in a homogeneous HW and SW environment.
--- a/Makefile.am
+++ b/Makefile.am
@@ -18,7 +18,6 @@ dist_man_MANS	= cpuminer.1
 cpuminer_SOURCES = \
  cpu-miner.c \
  util.c \
-  uint256.cpp \
  api.c \
  sysinfos.c \
  algo-gate-api.c\
@@ -51,12 +50,15 @@ cpuminer_SOURCES = \
  algo/blake/blake.c \
  algo/blake/blake-4way.c \
  algo/blake/sph_blake2b.c \
-  algo/blake/blake2b.c \
  algo/blake/sph-blake2s.c \
  algo/blake/blake2s-hash-4way.c \
  algo/blake/blake2s.c \
  algo/blake/blake2s-gate.c \
  algo/blake/blake2s-4way.c \
+  algo/blake/blake2b-hash-4way.c \
+  algo/blake/blake2b.c \
+  algo/blake/blake2b-gate.c \
+  algo/blake/blake2b-4way.c \
  algo/blake/blakecoin-gate.c \
  algo/blake/mod_blakecoin.c \
  algo/blake/blakecoin.c \
@@ -71,6 +73,9 @@ cpuminer_SOURCES = \
  algo/bmw/bmw256-hash-4way.c \
  algo/bmw/bmw512-hash-4way.c \
  algo/bmw/bmw256.c \
+  algo/bmw/bmw512-gate.c \
+  algo/bmw/bmw512.c \
+  algo/bmw/bmw512-4way.c \
  algo/cryptonight/cryptolight.c \
  algo/cryptonight/cryptonight-common.c\
  algo/cryptonight/cryptonight-aesni.c\
@@ -131,22 +136,24 @@ cpuminer_SOURCES = \
  algo/lyra2/lyra2h-4way.c \
  algo/lyra2/allium-4way.c \
  algo/lyra2/allium.c \
+  algo/lyra2/phi2-4way.c \
  algo/lyra2/phi2.c \
  algo/m7m.c \
-  algo/neoscrypt/neoscrypt.c \
  algo/nist5/nist5-gate.c \
  algo/nist5/nist5-4way.c \
  algo/nist5/nist5.c \
  algo/nist5/zr5.c \
  algo/panama/sph_panama.c \
  algo/radiogatun/sph_radiogatun.c \
-  algo/pluck.c \
  algo/quark/quark-gate.c \
  algo/quark/quark.c \
  algo/quark/quark-4way.c \
  algo/quark/anime-gate.c \
  algo/quark/anime.c \
  algo/quark/anime-4way.c \
+  algo/quark/hmq1725-gate.c \
+  algo/quark/hmq1725-4way.c \
+  algo/quark/hmq1725.c \
  algo/qubit/qubit-gate.c \
  algo/qubit/qubit.c \
  algo/qubit/qubit-2way.c \
@@ -158,11 +165,14 @@ cpuminer_SOURCES = \
  algo/ripemd/lbry-gate.c \
  algo/ripemd/lbry.c \
  algo/ripemd/lbry-4way.c \
-  algo/scrypt.c \
+  algo/scrypt/scrypt.c \
+  algo/scrypt/neoscrypt.c \
+  algo/scrypt/pluck.c \
  algo/scryptjane/scrypt-jane.c \
  algo/sha/sph_sha2.c \
  algo/sha/sph_sha2big.c \
-  algo/sha/sha2-hash-4way.c \
+  algo/sha/sha256-hash-4way.c \
+  algo/sha/sha512-hash-4way.c \
  algo/sha/sha256_hash_11way.c \
  algo/sha/sha2.c \
  algo/sha/sha256t-gate.c \
@@ -194,7 +204,6 @@ cpuminer_SOURCES = \
  algo/whirlpool/sph_whirlpool.c \
  algo/whirlpool/whirlpool-hash-4way.c \
  algo/whirlpool/whirlpool-gate.c \
-  algo/whirlpool/whirlpool-4way.c \
  algo/whirlpool/whirlpool.c \
  algo/whirlpool/whirlpoolx.c \
  algo/x11/x11-gate.c \
@@ -235,6 +244,8 @@ cpuminer_SOURCES = \
  algo/x13/skunk-4way.c \
  algo/x13/skunk.c \
  algo/x13/drop.c \
+  algo/x13/x13bcd-4way.c \
+  algo/x13/x13bcd.c \
  algo/x14/x14-gate.c \
  algo/x14/x14.c \
  algo/x14/x14-4way.c \
@@ -251,13 +262,19 @@ cpuminer_SOURCES = \
  algo/x16/x16r-gate.c \
  algo/x16/x16r.c \
  algo/x16/x16r-4way.c \
+  algo/x16/x16rv2.c \
+  algo/x16/x16rv2-4way.c \
+  algo/x16/x16rt.c \
+  algo/x16/x16rt-4way.c \
+  algo/x16/hex.c \
+  algo/x16/x21s-4way.c \
+  algo/x16/x21s.c \
  algo/x17/x17-gate.c \
  algo/x17/x17.c \
  algo/x17/x17-4way.c \
  algo/x17/xevan-gate.c \
  algo/x17/xevan.c \
  algo/x17/xevan-4way.c \
-  algo/x17/hmq1725.c \
  algo/x17/sonoa-gate.c \
  algo/x17/sonoa-4way.c \
  algo/x17/sonoa.c \
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@ Requirements

 1. A x86_64 architecture CPU with a minimum of SSE2 support. This includes
 Intel Core2 and newer and AMD equivalents. In order to take advantage of AES_NI
-optimizations a CPU with AES_NI is required. This includes Intel Westbridge
+optimizations a CPU with AES_NI is required. This includes Intel Westmere
 and newer and AMD equivalents. Further optimizations are available on some
 algoritms for CPUs with AVX and AVX2, Sandybridge and Haswell respectively.

@@ -55,13 +55,12 @@ Supported Algorithms
                          axiom         Shabal-256 MemoHash
                          bastion
                          blake         Blake-256 (SFR)
-                          blakecoin     blake256r8
+                          blake2b       Blake2b 256
                          blake2s       Blake-2 S
+                          blakecoin     blake256r8
                          bmw           BMW 256
+                          bmw512        BMW 512
                          c11           Chaincoin
-                          cryptolight   Cryptonight-light
-                          cryptonight  
-                          cryptonightv7 Monero (XMR)
                          decred
                          deep          Deepcoin (DCN)
                          dmd-gr        Diamond-Groestl
@@ -69,6 +68,7 @@ Supported Algorithms
                          fresh         Fresh
                          groestl       Groestl coin
                          heavy         Heavy
+                          hex           x16r-hex
                          hmq1725       Espers
                          hodl          Hodlcoin
                          jha           Jackpotcoin
@@ -78,17 +78,18 @@ Supported Algorithms
                          luffa         Luffa
                          lyra2h        Hppcoin
                          lyra2re       lyra2
-                          lyra2rev2     lyra2v2, Vertcoin
+                          lyra2rev2     lyra2v2
                          lyra2rev3     lyrav2v3, Vertcoin
-                          lyra2z        Zcoin (XZC)
+                          lyra2z        
                          lyra2z330     Lyra2 330 rows, Zoin (ZOI)
                          m7m           Magi (XMG)
                          myr-gr        Myriad-Groestl
                          neoscrypt     NeoScrypt(128, 2, 1)
                          nist5         Nist5
                          pentablake    Pentablake
-                          phi1612       phi, LUX coin (original algo)
-                          phi2          LUX coin (new algo)
+                          phi1612       phi
+                          phi2          Luxcoin (LUX)
+                          phi2-lux      identical to phi2
                          pluck         Pluck:128 (Supcoin)
                          polytimos     Ninja
                          quark         Quark
@@ -97,6 +98,7 @@ Supported Algorithms
                          scrypt:N      scrypt(N, 1, 1)
                          scryptjane:nf
                          sha256d       Double SHA-256
+                          sha256q       Quad SHA-256, Pyrite (PYE)
                          sha256t       Triple SHA-256, Onecoin (OC)
                          shavite3      Shavite3
                          skein         Skein+Sha (Skeincoin)
@@ -115,12 +117,17 @@ Supported Algorithms
                          x11gost       sib (SibCoin)
                          x12           Galaxie Cash (GCH)
                          x13           X13
+                          x13bcd        bcd
                          x13sm3        hsr (Hshare)
                          x14           X14
                          x15           X15
-                          x16r          Ravencoin (RVN)
-                          x16s          pigeoncoin (PGN)
+                          x16r          Ravencoin (RVN) (original algo)
+                          x16rv2        Ravencoin (RVN) (new algo)
+                          x16rt         Gincoin (GIN)
+                          x16rt-veil    Veil (VEIL)
+                          x16s          Pigeoncoin (PGN)
                          x17
+                          x21s
                          xevan         Bitsend (BSD)
                          yescrypt      Globalboost-Y (BSTY)
                          yescryptr8    BitZeny (ZNY)
@@ -150,14 +157,15 @@ Benchmark testing does not work for x11evo.
 Bugs
 ----

-Users are encouraged to post their bug reports on the Bitcoin Talk
-forum at:
+Users are encouraged to post their bug reports using git issues or on the
+Bitcoin Talk forum at:

 https://bitcointalk.org/index.php?topic=1326803.0

-All problem reports must be accompanied by a proper definition.
+All problem reports must be accompanied by a proper problem definition.
 This should include how the problem occurred, the command line and
-output from the miner showing the startup and any errors.
+output from the miner showing the startup messages and any errors.
+A history is also useful, i.e. did it work before.

 Donations
 ---------
--- a/README.txt
+++ b/README.txt
@@ -29,7 +29,7 @@ cpuminer-sse2.exe      "-msse2"                  Core2, Nehalem
 cpuminer-aes-sse42.exe "-march=westmere"         Westmere
 cpuminer-avx.exe       "-march=corei7-avx"       Sandy-Ivybridge
 cpuminer-avx2.exe      "-march=core-avx2"        Haswell, Sky-Kaby-Coffeelake
-cpuminer-zen           "-march=znver1 -DRYZEN_"  Ryzen
+cpuminer-zen           "-march=znver1"           AMD Ryzen, Threadripper

 If you like this software feel free to donate:

--- a/119
+++ b/119
@@ -38,6 +38,125 @@ supported.
 Change Log
 ----------

+v3.9.8.1
+
+Summary log report will be generated on stratum diff change or after 5 minutes,
+whichever comes first, to prevent incorrect data in the report.
+
+Removed phi2-lux alias (introduced in v3.9.8) due to Luxcoin's planned fork
+to a new algo. The new Luxcoin algo is not supported by cpuminer-opt.
+Until the fork Luxcoin can be mined using phi2 algo.
+
+--hide-diff option is deprecated and has no effect. It will be removed in a
+future release.
+
+v3.9.8
+
+Changes to log output to provide data more relevant to actual mining
+performance.
+phi2 can now handle pools with a mix of coins that use and don't use roots.
+phi2-lux added as an alias for phi2 as they are identical except for roots.
+Add x16rv2 algo for Ravencoin fork.
+
+v3.9.7
+
+Command line option changes:
+
+"-R" is no longer used as a shortcut for "--retry-pause", users must
+use the long option.
+
+New options:
+
+-N, --param-n: set the N parameter for yescrypt, yespower or scrypt algos
+-R, --param-r: set the R parameter for yescrypt or yespower algos, scrypt is
+     hardcoded with R=1
+-K, --param-key: set the client key/pers parameter for yescrypt/yespower algos.
+
+These options can be used to mine yescrypt or yespower variations using
+the generic yescrypt or yespower algo name and specifying the parameters
+manually. They can even be used to mine variations that aren't formally
+supported by a unique algo name. Existing algos can continue to be mined
+using their original name without parameters.
+
+v3.9.6.2
+
+New algo blake2b.
+Faster myr-gr on Ryzen using SHA.
+Faster blake2s SSE2.
+Small speedup of around 1% for several other algos.
+
+v3.9.6.1
+
+New algos: x21s, hex (alias x16r-hex).
+
+v3.9.6
+
+New algos: bmw512, x16rt, x16rt-veil (alias veil), x13bcd (alias bcd).
+
+v3.9.5.4
+
+Fixed sha256q AVX2 poor performance.
+Fixed skein2 buffer overflow and restored bswap-interleave optimization.
+More restructuring.
+
+v3.9.5.3
+
+Fix crash mining hodl with aes-sse42.
+More restructuring and share report tweaks.
+
+v3.9.5.2
+
+Revert bswap-interleave optimization for causing crashes on Windows.
+
+v3.9.5.1
+
+Fixed skein2 crash on Windows.
+
+Fixed CPU temperature reading on Ubuntu 19.04.
+
+Realigned log message colours, blue is used to report normal activity and
+yellow is only used to report abnormal activity.
+
+Changed stats colours, yellow now means below average, white is average
+range. Tweaked colour thresholds.
+
+Changed colour of stratum difficulty change messages to blue to match other
+normal protocol messages. Blue messages (block, stratum, submit) will no
+longer be displayed when using -q option.
+
+Added job id to new block, share submit, and share result messages and added
+new message when a new job is received for an existing block. This allows for
+better troubleshooting of invalid job id rejects seen at zergpool.
+
+Some more restructuring.
+
+v3.9.5
+
+New share reporting information includes calculation of equivalent hashrate
+based on share difficulty, network latency, 5 minute summary.
+Per-thread hash rate reports are disabled by default.
+New command line option --hash-meter added to enable per-thread hash rates.
+
+
+v3.9.4
+
+Faster AVX2 for lyra2v3, quark, anime.
+Fixed skein AVX2 regression (invalid shares since v3.9.0) and faster.
+Faster skein2 with 4way AVX2 enabled.
+Automatic SHA override on Ryzen CPUs, no need for -DRYZEN_ compile flag.
+Ongoing restructuring.
+
+v3.9.3.1
+
+Skipped v3.9.3 due to misidentification of v3.9.2.5 as v3.9.3.
+Fixed x16r algo 25% invalid share reject rate. The bug may have also
+affected other algos.
+
+v3.9.2.5
+
+Fixed 2 regressions: hodl AES detection, x16r invalid shares with AVX2.
+More restructuring.
+
 v3.9.2.4

 Yet another affinity fix. Hopefully the last one.
--- a/algo-gate-api.c
+++ b/algo-gate-api.c
@@ -71,7 +71,6 @@ bool return_false () { return false; }
 void *return_null () { return NULL;  }
 void call_error   () { printf("ERR: Uninitialized function pointer\n"); }

-
 void algo_not_tested()
 {
  applog( LOG_WARNING,"Algo %s has not been tested live. It may not work",
@@ -123,7 +122,6 @@ void init_algo_gate( algo_gate_t* gate )
   gate->stratum_gen_work        = (void*)&std_stratum_gen_work;
   gate->build_stratum_request   = (void*)&std_le_build_stratum_request;
   gate->malloc_txs_request      = (void*)&std_malloc_txs_request;
-   gate->set_target              = (void*)&std_set_target;
   gate->submit_getwork_result   = (void*)&std_le_submit_getwork_result;
   gate->build_block_header      = (void*)&std_build_block_header;
   gate->build_extraheader       = (void*)&std_build_extraheader;
@@ -149,111 +147,117 @@ void init_algo_gate( algo_gate_t* gate )
 // called by each thread that uses the gate
 bool register_algo_gate( int algo, algo_gate_t *gate )
 {
-   if ( NULL == gate )
-   {
-     applog(LOG_ERR,"FAIL: algo_gate registration failed, NULL gate\n");
-     return false;
-   }
+  if ( NULL == gate )
+  {
+    applog(LOG_ERR,"FAIL: algo_gate registration failed, NULL gate\n");
+    return false;
+  }

-   init_algo_gate( gate );
+  init_algo_gate( gate );

-   switch (algo)
-   {
-     case ALGO_ALLIUM:       register_allium_algo       ( gate ); break;
-     case ALGO_ANIME:        register_anime_algo        ( gate ); break;
-     case ALGO_ARGON2:       register_argon2_algo       ( gate ); break;
-     case ALGO_ARGON2D250:   register_argon2d_crds_algo ( gate ); break;
-     case ALGO_ARGON2D500:   register_argon2d_dyn_algo  ( gate ); break;
-     case ALGO_ARGON2D4096:  register_argon2d4096_algo  ( gate ); break;
-     case ALGO_AXIOM:        register_axiom_algo        ( gate ); break;
-     case ALGO_BASTION:      register_bastion_algo      ( gate ); break;
-     case ALGO_BLAKE:        register_blake_algo        ( gate ); break;
-     case ALGO_BLAKECOIN:    register_blakecoin_algo    ( gate ); break;
-//     case ALGO_BLAKE2B:      register_blake2b_algo    ( gate ); break;
-     case ALGO_BLAKE2S:      register_blake2s_algo      ( gate ); break;
-     case ALGO_C11:          register_c11_algo          ( gate ); break;
-     case ALGO_CRYPTOLIGHT:  register_cryptolight_algo  ( gate ); break;
-     case ALGO_CRYPTONIGHT:  register_cryptonight_algo  ( gate ); break;
-     case ALGO_CRYPTONIGHTV7:register_cryptonightv7_algo( gate ); break;
-     case ALGO_DECRED:       register_decred_algo       ( gate ); break;
-     case ALGO_DEEP:         register_deep_algo         ( gate ); break;
-     case ALGO_DMD_GR:       register_dmd_gr_algo       ( gate ); break;
-     case ALGO_DROP:         register_drop_algo         ( gate ); break;
-     case ALGO_FRESH:        register_fresh_algo        ( gate ); break;
-     case ALGO_GROESTL:      register_groestl_algo      ( gate ); break;
-     case ALGO_HEAVY:        register_heavy_algo        ( gate ); break;
-     case ALGO_HMQ1725:      register_hmq1725_algo      ( gate ); break;
-     case ALGO_HODL:         register_hodl_algo         ( gate ); break;
-     case ALGO_JHA:          register_jha_algo          ( gate ); break;
-     case ALGO_KECCAK:       register_keccak_algo       ( gate ); break;
-     case ALGO_KECCAKC:      register_keccakc_algo      ( gate ); break;
-     case ALGO_LBRY:         register_lbry_algo         ( gate ); break;
-     case ALGO_LUFFA:        register_luffa_algo        ( gate ); break;
-     case ALGO_LYRA2H:       register_lyra2h_algo       ( gate ); break;
-     case ALGO_LYRA2RE:      register_lyra2re_algo      ( gate ); break;
-     case ALGO_LYRA2REV2:    register_lyra2rev2_algo    ( gate ); break;
-     case ALGO_LYRA2REV3:    register_lyra2rev3_algo    ( gate ); break;
-     case ALGO_LYRA2Z:       register_lyra2z_algo       ( gate ); break;
-     case ALGO_LYRA2Z330:    register_lyra2z330_algo    ( gate ); break;
-     case ALGO_M7M:          register_m7m_algo          ( gate ); break;
-     case ALGO_MYR_GR:       register_myriad_algo       ( gate ); break;
-     case ALGO_NEOSCRYPT:    register_neoscrypt_algo    ( gate ); break;
-     case ALGO_NIST5:        register_nist5_algo        ( gate ); break;
-     case ALGO_PENTABLAKE:   register_pentablake_algo   ( gate ); break;
-     case ALGO_PHI1612:      register_phi1612_algo      ( gate ); break;
-     case ALGO_PHI2:         register_phi2_algo         ( gate ); break;
-     case ALGO_PLUCK:        register_pluck_algo        ( gate ); break;
-     case ALGO_POLYTIMOS:    register_polytimos_algo    ( gate ); break;
-     case ALGO_QUARK:        register_quark_algo        ( gate ); break;
-     case ALGO_QUBIT:        register_qubit_algo        ( gate ); break;
-     case ALGO_SCRYPT:       register_scrypt_algo       ( gate ); break;
-     case ALGO_SCRYPTJANE:   register_scryptjane_algo   ( gate ); break;
-     case ALGO_SHA256D:      register_sha256d_algo      ( gate ); break;
-     case ALGO_SHA256T:      register_sha256t_algo      ( gate ); break;
-     case ALGO_SHA256Q:      register_sha256q_algo      ( gate ); break;
-     case ALGO_SHAVITE3:     register_shavite_algo      ( gate ); break;
-     case ALGO_SKEIN:        register_skein_algo        ( gate ); break;
-     case ALGO_SKEIN2:       register_skein2_algo       ( gate ); break;
-     case ALGO_SKUNK:        register_skunk_algo        ( gate ); break;
-     case ALGO_SONOA:        register_sonoa_algo        ( gate ); break;
-     case ALGO_TIMETRAVEL:   register_timetravel_algo   ( gate ); break;
-     case ALGO_TIMETRAVEL10: register_timetravel10_algo ( gate ); break;
-     case ALGO_TRIBUS:       register_tribus_algo       ( gate ); break;
-     case ALGO_VANILLA:      register_vanilla_algo      ( gate ); break;
-     case ALGO_VELTOR:       register_veltor_algo       ( gate ); break;
-     case ALGO_WHIRLPOOL:    register_whirlpool_algo    ( gate ); break;
-     case ALGO_WHIRLPOOLX:   register_whirlpoolx_algo   ( gate ); break;
-     case ALGO_X11:          register_x11_algo          ( gate ); break;
-     case ALGO_X11EVO:       register_x11evo_algo       ( gate ); break;
-     case ALGO_X11GOST:      register_x11gost_algo      ( gate ); break;
-     case ALGO_X12:          register_x12_algo          ( gate ); break;
-     case ALGO_X13:          register_x13_algo          ( gate ); break;
-     case ALGO_X13SM3:       register_x13sm3_algo       ( gate ); break;
-     case ALGO_X14:          register_x14_algo          ( gate ); break;
-     case ALGO_X15:          register_x15_algo          ( gate ); break;
-     case ALGO_X16R:         register_x16r_algo         ( gate ); break;
-     case ALGO_X16S:         register_x16s_algo         ( gate ); break;
-     case ALGO_X17:          register_x17_algo          ( gate ); break;
-     case ALGO_XEVAN:        register_xevan_algo        ( gate ); break;
+  switch (algo)
+  {
+    case ALGO_ALLIUM:        register_allium_algo        ( gate ); break;
+    case ALGO_ANIME:         register_anime_algo         ( gate ); break;
+    case ALGO_ARGON2:        register_argon2_algo        ( gate ); break;
+    case ALGO_ARGON2D250:    register_argon2d_crds_algo  ( gate ); break;
+    case ALGO_ARGON2D500:    register_argon2d_dyn_algo   ( gate ); break;
+    case ALGO_ARGON2D4096:   register_argon2d4096_algo   ( gate ); break;
+    case ALGO_AXIOM:         register_axiom_algo         ( gate ); break;
+    case ALGO_BASTION:       register_bastion_algo       ( gate ); break;
+    case ALGO_BLAKE:         register_blake_algo         ( gate ); break;
+    case ALGO_BLAKE2B:       register_blake2b_algo       ( gate ); break;
+    case ALGO_BLAKE2S:       register_blake2s_algo       ( gate ); break;
+    case ALGO_BLAKECOIN:     register_blakecoin_algo     ( gate ); break;
+    case ALGO_BMW512:        register_bmw512_algo        ( gate ); break;
+    case ALGO_C11:           register_c11_algo           ( gate ); break;
+    case ALGO_CRYPTOLIGHT:   register_cryptolight_algo   ( gate ); break;
+    case ALGO_CRYPTONIGHT:   register_cryptonight_algo   ( gate ); break;
+    case ALGO_CRYPTONIGHTV7: register_cryptonightv7_algo ( gate ); break;
+    case ALGO_DECRED:        register_decred_algo        ( gate ); break;
+    case ALGO_DEEP:          register_deep_algo          ( gate ); break;
+    case ALGO_DMD_GR:        register_dmd_gr_algo        ( gate ); break;
+    case ALGO_DROP:          register_drop_algo          ( gate ); break;
+    case ALGO_FRESH:         register_fresh_algo         ( gate ); break;
+    case ALGO_GROESTL:       register_groestl_algo       ( gate ); break;
+    case ALGO_HEAVY:         register_heavy_algo         ( gate ); break;
+    case ALGO_HEX:           register_hex_algo           ( gate ); break;
+    case ALGO_HMQ1725:       register_hmq1725_algo       ( gate ); break;
+    case ALGO_HODL:          register_hodl_algo          ( gate ); break;
+    case ALGO_JHA:           register_jha_algo           ( gate ); break;
+    case ALGO_KECCAK:        register_keccak_algo        ( gate ); break;
+    case ALGO_KECCAKC:       register_keccakc_algo       ( gate ); break;
+    case ALGO_LBRY:          register_lbry_algo          ( gate ); break;
+    case ALGO_LUFFA:         register_luffa_algo         ( gate ); break;
+    case ALGO_LYRA2H:        register_lyra2h_algo        ( gate ); break;
+    case ALGO_LYRA2RE:       register_lyra2re_algo       ( gate ); break;
+    case ALGO_LYRA2REV2:     register_lyra2rev2_algo     ( gate ); break;
+    case ALGO_LYRA2REV3:     register_lyra2rev3_algo     ( gate ); break;
+    case ALGO_LYRA2Z:        register_lyra2z_algo        ( gate ); break;
+    case ALGO_LYRA2Z330:     register_lyra2z330_algo     ( gate ); break;
+    case ALGO_M7M:           register_m7m_algo           ( gate ); break;
+    case ALGO_MYR_GR:        register_myriad_algo        ( gate ); break;
+    case ALGO_NEOSCRYPT:     register_neoscrypt_algo     ( gate ); break;
+    case ALGO_NIST5:         register_nist5_algo         ( gate ); break;
+    case ALGO_PENTABLAKE:    register_pentablake_algo    ( gate ); break;
+    case ALGO_PHI1612:       register_phi1612_algo       ( gate ); break;
+    case ALGO_PHI2:          register_phi2_algo          ( gate ); break;
+    case ALGO_PLUCK:         register_pluck_algo         ( gate ); break;
+    case ALGO_POLYTIMOS:     register_polytimos_algo     ( gate ); break;
+    case ALGO_QUARK:         register_quark_algo         ( gate ); break;
+    case ALGO_QUBIT:         register_qubit_algo         ( gate ); break;
+    case ALGO_SCRYPT:        register_scrypt_algo        ( gate ); break;
+    case ALGO_SCRYPTJANE:    register_scryptjane_algo    ( gate ); break;
+    case ALGO_SHA256D:       register_sha256d_algo       ( gate ); break;
+    case ALGO_SHA256Q:       register_sha256q_algo       ( gate ); break;
+    case ALGO_SHA256T:       register_sha256t_algo       ( gate ); break;
+    case ALGO_SHAVITE3:      register_shavite_algo       ( gate ); break;
+    case ALGO_SKEIN:         register_skein_algo         ( gate ); break;
+    case ALGO_SKEIN2:        register_skein2_algo        ( gate ); break;
+    case ALGO_SKUNK:         register_skunk_algo         ( gate ); break;
+    case ALGO_SONOA:         register_sonoa_algo         ( gate ); break;
+    case ALGO_TIMETRAVEL:    register_timetravel_algo    ( gate ); break;
+    case ALGO_TIMETRAVEL10:  register_timetravel10_algo  ( gate ); break;
+    case ALGO_TRIBUS:        register_tribus_algo        ( gate ); break;
+    case ALGO_VANILLA:       register_vanilla_algo       ( gate ); break;
+    case ALGO_VELTOR:        register_veltor_algo        ( gate ); break;
+    case ALGO_WHIRLPOOL:     register_whirlpool_algo     ( gate ); break;
+    case ALGO_WHIRLPOOLX:    register_whirlpoolx_algo    ( gate ); break;
+    case ALGO_X11:           register_x11_algo           ( gate ); break;
+    case ALGO_X11EVO:        register_x11evo_algo        ( gate ); break;
+    case ALGO_X11GOST:       register_x11gost_algo       ( gate ); break;
+    case ALGO_X12:           register_x12_algo           ( gate ); break;
+    case ALGO_X13:           register_x13_algo           ( gate ); break;
+    case ALGO_X13BCD:        register_x13bcd_algo        ( gate ); break;
+    case ALGO_X13SM3:        register_x13sm3_algo        ( gate ); break;
+    case ALGO_X14:           register_x14_algo           ( gate ); break;
+    case ALGO_X15:           register_x15_algo           ( gate ); break;
+    case ALGO_X16R:          register_x16r_algo          ( gate ); break;
+    case ALGO_X16RV2:        register_x16rv2_algo        ( gate ); break;
+    case ALGO_X16RT:         register_x16rt_algo         ( gate ); break;
+    case ALGO_X16RT_VEIL:    register_x16rt_veil_algo    ( gate ); break;
+    case ALGO_X16S:          register_x16s_algo          ( gate ); break;
+    case ALGO_X17:           register_x17_algo           ( gate ); break;
+    case ALGO_X21S:          register_x21s_algo          ( gate ); break;
+    case ALGO_XEVAN:         register_xevan_algo         ( gate ); break;
 /*    case ALGO_YESCRYPT:     register_yescrypt_05_algo     ( gate ); break;
     case ALGO_YESCRYPTR8:   register_yescryptr8_05_algo   ( gate ); break;
     case ALGO_YESCRYPTR16:  register_yescryptr16_05_algo  ( gate ); break;
     case ALGO_YESCRYPTR32:  register_yescryptr32_05_algo  ( gate ); break;
 */
-     case ALGO_YESCRYPT:     register_yescrypt_algo     ( gate ); break;
-     case ALGO_YESCRYPTR8:   register_yescryptr8_algo   ( gate ); break;
-     case ALGO_YESCRYPTR16:  register_yescryptr16_algo  ( gate ); break;
-     case ALGO_YESCRYPTR32:  register_yescryptr32_algo  ( gate ); break;
+    case ALGO_YESCRYPT:      register_yescrypt_algo      ( gate ); break;
+    case ALGO_YESCRYPTR8:    register_yescryptr8_algo    ( gate ); break;
+    case ALGO_YESCRYPTR16:   register_yescryptr16_algo   ( gate ); break;
+    case ALGO_YESCRYPTR32:   register_yescryptr32_algo   ( gate ); break;
+    case ALGO_YESPOWER:      register_yespower_algo      ( gate ); break;
+    case ALGO_YESPOWERR16:   register_yespowerr16_algo   ( gate ); break;
+    case ALGO_ZR5:           register_zr5_algo           ( gate ); break;
+   default:
+      applog(LOG_ERR,"FAIL: algo_gate registration failed, unknown algo %s.\n", algo_names[opt_algo] );
+      return false;
+  } // switch

-     case ALGO_YESPOWER:     register_yespower_algo     ( gate ); break;
-     case ALGO_YESPOWERR16:  register_yespowerr16_algo  ( gate ); break;
-     case ALGO_ZR5:          register_zr5_algo          ( gate ); break;
-    default:
-        applog(LOG_ERR,"FAIL: algo_gate registration failed, unknown algo %s.\n", algo_names[opt_algo] );
-        return false;
-   } // switch
-
-  // ensure required functions were defined.
+ // ensure required functions were defined.
  if (  gate->scanhash == (void*)&null_scanhash )
  {
    applog(LOG_ERR, "FAIL: Required algo_gate functions undefined\n");
@@ -329,19 +333,16 @@ const char* const algo_alias_map[][2] =
  { "lyra2",             "lyra2re"      },
  { "lyra2v2",           "lyra2rev2"    },
  { "lyra2v3",           "lyra2rev3"    },
-  { "lyra2zoin",         "lyra2z330"    },
  { "myrgr",             "myr-gr"       },
  { "myriad",            "myr-gr"       },
  { "neo",               "neoscrypt"    },
  { "phi",               "phi1612"      },
-//  { "sia",               "blake2b"      },
  { "sib",               "x11gost"      },
  { "timetravel8",       "timetravel"   },
-  { "ziftr",             "zr5"          },
+  { "veil",              "x16rt-veil"   },
+  { "x16r-hex",          "hex"          },
  { "yenten",            "yescryptr16"  },
-  { "yescryptr8k",       "yescrypt"     },
-  { "zcoin",             "lyra2z"       },
-  { "zoin",              "lyra2z330"    },
+  { "ziftr",             "zr5"          },
  { NULL,                NULL           }   
 };

@@ -363,21 +364,3 @@ void get_algo_alias( char** algo_or_alias )
 #undef ALIAS
 #undef PROPER

-// only for parallel when there are lanes.
-bool submit_solution( struct work *work, void *hash,
-                      struct thr_info *thr, int lane )
-{
-     work_set_target_ratio( work, hash );
-     if ( submit_work( thr, work ) )
-     {
-         applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
-                 accepted_share_count + rejected_share_count + 1,
-                 thr->id, lane );
-         return true;
-     }
-     else
-          applog( LOG_WARNING, "Failed to submit share." );
-     return false;
-}
-
-
--- a/algo-gate-api.h
+++ b/algo-gate-api.h
@@ -2,8 +2,7 @@
 #include <stdbool.h>
 #include <stdint.h>
 #include "miner.h"
-#include "avxdefs.h"
-#include "interleave.h"
+#include "simd-utils.h"

 /////////////////////////////
 ////
@@ -117,7 +116,7 @@ typedef struct
 // Added a 5th arg for the thread_info structure to replace the int thr id
 // in the first arg. Both will co-exist during the trasition.
 //int ( *scanhash ) ( int, struct work*, uint32_t, uint64_t* );
-int ( *scanhash ) ( int, struct work*, uint32_t, uint64_t*, struct thr_info* );
+int ( *scanhash ) ( struct work*, uint32_t, uint64_t*, struct thr_info* );

 // optional unsafe, must be overwritten if algo uses function
 void ( *hash )     ( void*, const void*, uint32_t ) ;
@@ -133,7 +132,6 @@ void ( *decode_extra_data )      ( struct work*, uint64_t* );
 void ( *wait_for_diff )          ( struct stratum_ctx* );
 int64_t ( *get_max64 )           ();
 bool ( *work_decode )            ( const json_t*, struct work* );
-void ( *set_target)              ( struct work*, double );
 bool ( *submit_getwork_result )  ( CURL*, struct work* );
 void ( *gen_merkle_root )        ( char*, struct stratum_ctx* );
 void ( *build_extraheader )      ( struct work*, struct stratum_ctx* );
@@ -154,7 +152,6 @@ int  ntime_index;
 int  nbits_index;
 int  nonce_index;            // use with caution, see warning below
 int  work_cmp_size;
-
 } algo_gate_t;

 extern algo_gate_t algo_gate;
@@ -195,12 +192,6 @@ void four_way_not_tested();
 // allways returns failure
 int null_scanhash();

-// The one and only, a callback for scanhash.
-bool submit_solution( struct work *work, void *hash,
-                      struct thr_info *thr, int lane );
- 
-bool submit_work( struct thr_info *thr, const struct work *work_in );
-
 // displays warning
 void null_hash    ();
 void null_hash_suw();
@@ -231,10 +222,6 @@ int64_t get_max64_0x3fffffLL();
 int64_t get_max64_0x1ffff();
 int64_t get_max64_0xffffLL();

-void std_set_target(    struct work *work, double job_diff );
-void alt_set_target(    struct work* work, double job_diff );
-void scrypt_set_target( struct work *work, double job_diff );
-
 bool std_le_work_decode( const json_t *val, struct work *work );
 bool std_be_work_decode( const json_t *val, struct work *work );
 bool jr2_work_decode( const json_t *val, struct work *work );
--- a/algo/argon2/argon2a/argon2a.c
+++ b/algo/argon2/argon2a/argon2a.c
@@ -42,12 +42,14 @@ void argon2hash(void *output, const void *input)
 		(unsigned char *)output);
 }

-int scanhash_argon2(int thr_id, struct work* work, uint32_t max_nonce, uint64_t *hashes_done)
+int scanhash_argon2( struct work* work, uint32_t max_nonce,
+                     uint64_t *hashes_done, struct thr_info *mythr )
 {
 	uint32_t _ALIGN(64) endiandata[20];
 	uint32_t _ALIGN(64) hash[8];
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
+   int thr_id = mythr->id;  // thr_id arg is deprecated

 	const uint32_t first_nonce = pdata[19];
 	const uint32_t Htarg = ptarget[7];
@@ -83,8 +85,9 @@ bool register_argon2_algo( algo_gate_t* gate )
  gate->scanhash        = (void*)&scanhash_argon2;
  gate->hash            = (void*)&argon2hash;
  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
-  gate->set_target      = (void*)&scrypt_set_target;
  gate->get_max64       = (void*)&argon2_get_max64;
+  opt_target_factor = 65536.0;
+
  return true;
 };

--- a/algo/argon2/argon2d/argon2d-gate.c
+++ b/algo/argon2/argon2d/argon2d-gate.c
@@ -33,45 +33,42 @@ void argon2d_crds_hash( void *output, const void *input )
 	argon2_ctx( &context, Argon2_d );
 }

-int scanhash_argon2d_crds( int thr_id, struct work *work, uint32_t max_nonce,
-                      uint64_t *hashes_done )
+int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done, struct thr_info *mythr )
 {
-        uint32_t _ALIGN(64) endiandata[20];
-        uint32_t _ALIGN(64) hash[8];
-        uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
+   uint32_t _ALIGN(64) endiandata[20];
+   uint32_t _ALIGN(64) hash[8];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   int thr_id = mythr->id;  // thr_id arg is deprecated
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t Htarg = ptarget[7];
+   uint32_t nonce = first_nonce;

-        const uint32_t first_nonce = pdata[19];
-        const uint32_t Htarg = ptarget[7];
+   swab32_array( endiandata, pdata, 20 );

-        uint32_t nonce = first_nonce;
+   do {
+      be32enc(&endiandata[19], nonce);
+      argon2d_crds_hash( hash, endiandata );
+      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) && !opt_benchmark )
+      {
+          pdata[19] = nonce;
+          submit_solution( work, hash, mythr );
+      }
+      nonce++;
+   } while (nonce < max_nonce && !work_restart[thr_id].restart);

-        swab32_array( endiandata, pdata, 20 );
-
-        do {
-                be32enc(&endiandata[19], nonce);
-                argon2d_crds_hash( hash, endiandata );
-                if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
-                {
-                        pdata[19] = nonce;
-                        *hashes_done = pdata[19] - first_nonce;
-                        work_set_target_ratio(work, hash);
-                        return 1;
-                }
-                nonce++;
-        } while (nonce < max_nonce && !work_restart[thr_id].restart);
-
-        pdata[19] = nonce;
-        *hashes_done = pdata[19] - first_nonce + 1;
-        return 0;
+   pdata[19] = nonce;
+   *hashes_done = pdata[19] - first_nonce + 1;
+   return 0;
 }

 bool register_argon2d_crds_algo( algo_gate_t* gate )
 {
        gate->scanhash = (void*)&scanhash_argon2d_crds;
        gate->hash = (void*)&argon2d_crds_hash;
-        gate->set_target = (void*)&scrypt_set_target;
        gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
+        opt_target_factor = 65536.0;
        return true;
 }

@@ -103,52 +100,50 @@ void argon2d_dyn_hash( void *output, const void *input )
    argon2_ctx( &context, Argon2_d );
 }

-int scanhash_argon2d_dyn( int thr_id, struct work *work, uint32_t max_nonce,
-                      uint64_t *hashes_done )
+int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done, struct thr_info *mythr )
 {
-        uint32_t _ALIGN(64) endiandata[20];
-        uint32_t _ALIGN(64) hash[8];
-        uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
+   uint32_t _ALIGN(64) endiandata[20];
+   uint32_t _ALIGN(64) hash[8];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   int thr_id = mythr->id;  // thr_id arg is deprecated
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t Htarg = ptarget[7];
+   uint32_t nonce = first_nonce;

-        const uint32_t first_nonce = pdata[19];
-        const uint32_t Htarg = ptarget[7];
+   swab32_array( endiandata, pdata, 20 );

-        uint32_t nonce = first_nonce;
+   do
+   {
+      be32enc(&endiandata[19], nonce);
+      argon2d_dyn_hash( hash, endiandata );
+      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) && !opt_benchmark )
+      {
+          pdata[19] = nonce;
+          submit_solution( work, hash, mythr );
+      }
+      nonce++;
+  } while (nonce < max_nonce && !work_restart[thr_id].restart);

-        swab32_array( endiandata, pdata, 20 );
-
-        do {
-                be32enc(&endiandata[19], nonce);
-                argon2d_dyn_hash( hash, endiandata );
-                if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
-                {
-                        pdata[19] = nonce;
-                        *hashes_done = pdata[19] - first_nonce;
-                        work_set_target_ratio(work, hash);
-                        return 1;
-                }
-                nonce++;
-        } while (nonce < max_nonce && !work_restart[thr_id].restart);
-
-        pdata[19] = nonce;
-        *hashes_done = pdata[19] - first_nonce + 1;
-        return 0;
+   pdata[19] = nonce;
+   *hashes_done = pdata[19] - first_nonce + 1;
+   return 0;
 }

 bool register_argon2d_dyn_algo( algo_gate_t* gate )
 {
        gate->scanhash = (void*)&scanhash_argon2d_dyn;
        gate->hash = (void*)&argon2d_dyn_hash;
-        gate->set_target = (void*)&scrypt_set_target;
        gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
+        opt_target_factor = 65536.0;
        return true;
 }

 // Unitus

-int scanhash_argon2d4096( int thr_id, struct work *work, uint32_t max_nonce,
-                           uint64_t *hashes_done)
+int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
+                           uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t _ALIGN(64) vhash[8];
   uint32_t _ALIGN(64) endiandata[20];
@@ -157,7 +152,7 @@ int scanhash_argon2d4096( int thr_id, struct work *work, uint32_t max_nonce,
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
-    
+   int thr_id = mythr->id;  // thr_id arg is deprecated
   uint32_t t_cost = 1; // 1 iteration
   uint32_t m_cost = 4096; // use 4MB
   uint32_t parallelism = 1; // 1 thread, 2 lanes
@@ -169,11 +164,10 @@ int scanhash_argon2d4096( int thr_id, struct work *work, uint32_t max_nonce,
      be32enc( &endiandata[19], n );
      argon2d_hash_raw( t_cost, m_cost, parallelism, (char*) endiandata, 80,
                 (char*) endiandata, 80, (char*) vhash, 32, ARGON2_VERSION_13 );
-      if ( vhash[7] < Htarg && fulltest( vhash, ptarget ) )
+      if ( vhash[7] < Htarg && fulltest( vhash, ptarget ) && !opt_benchmark )
      {
-         *hashes_done = n - first_nonce + 1;
         pdata[19] = n;
-         return true;
+         submit_solution( work, vhash, mythr );
      }
      n++;

@@ -190,9 +184,9 @@ int64_t get_max64_0x1ff() { return 0x1ff; }
 bool register_argon2d4096_algo( algo_gate_t* gate )
 {
        gate->scanhash = (void*)&scanhash_argon2d4096;
-        gate->set_target = (void*)&scrypt_set_target;
        gate->get_max64  = (void*)&get_max64_0x1ff;
        gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
+        opt_target_factor = 65536.0;
        return true;
 }

--- a/algo/argon2/argon2d/argon2d-gate.h
+++ b/algo/argon2/argon2d/argon2d-gate.h
@@ -9,23 +9,23 @@ bool register_argon2d_crds_algo( algo_gate_t* gate );

 void argon2d_crds_hash( void *state, const void *input );

-int scanhash_argon2d_crds( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done );
+int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done, struct thr_info *mythr );

 // Dynamic: version = 0x10, m_cost = 500.
 bool register_argon2d_dyn_algo( algo_gate_t* gate );

 void argon2d_dyn_hash( void *state, const void *input );

-int scanhash_argon2d_dyn( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done );
+int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done, struct thr_info *mythr );


 // Unitus: version = 0x13, m_cost = 4096.
 bool register_argon2d4096_algo( algo_gate_t* gate );

-int scanhash_argon2d4096( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done );
+int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done, struct thr_info *mythr );

 #endif

--- a/algo/argon2/argon2d/argon2d/core.c
+++ b/algo/argon2/argon2d/argon2d/core.c
@@ -28,6 +28,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <mm_malloc.h>

 #include "core.h"
 #include "argon2d_thread.h"
@@ -99,7 +100,8 @@ int allocate_memory(const argon2_context *context, uint8_t **memory,
    if (context->allocate_cbk) {
        (context->allocate_cbk)(memory, memory_size);
    } else {
-        *memory = malloc(memory_size);
+        *memory = _mm_malloc( memory_size, 64 );
+//        *memory = malloc(memory_size);
    }

    if (*memory == NULL) {
@@ -112,11 +114,12 @@ int allocate_memory(const argon2_context *context, uint8_t **memory,
 void free_memory(const argon2_context *context, uint8_t *memory,
                 size_t num, size_t size) {
    size_t memory_size = num*size;
-    clear_internal_memory(memory, memory_size);
+//    clear_internal_memory(memory, memory_size);
    if (context->free_cbk) {
        (context->free_cbk)(memory, memory_size);
    } else {
-        free(memory);
+//        free(memory);
+        _mm_free( memory );
    }
 }

@@ -137,7 +140,7 @@ void NOT_OPTIMIZED secure_wipe_memory(void *v, size_t n) {
 int FLAG_clear_internal_memory = 0;
 void clear_internal_memory(void *v, size_t n) {
  if (FLAG_clear_internal_memory && v) {
-    secure_wipe_memory(v, n);
+//    secure_wipe_memory(v, n);
  }
 }

@@ -559,7 +562,7 @@ void initial_hash(uint8_t *blockhash, argon2_context *context,
                       context->pwdlen);

        if (context->flags & ARGON2_FLAG_CLEAR_PASSWORD) {
-            secure_wipe_memory(context->pwd, context->pwdlen);
+//            secure_wipe_memory(context->pwd, context->pwdlen);
            context->pwdlen = 0;
        }
    }
@@ -580,7 +583,7 @@ void initial_hash(uint8_t *blockhash, argon2_context *context,
                       context->secretlen);

        if (context->flags & ARGON2_FLAG_CLEAR_SECRET) {
-            secure_wipe_memory(context->secret, context->secretlen);
+//            secure_wipe_memory(context->secret, context->secretlen);
            context->secretlen = 0;
        }
    }
--- a/algo/argon2/argon2d/argon2d/opt.c
+++ b/algo/argon2/argon2d/argon2d/opt.c
@@ -96,14 +96,14 @@ static void fill_block(__m256i *state, const block *ref_block,
    if (with_xor) {
        for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
            state[i] = _mm256_xor_si256(
-                state[i], _mm256_loadu_si256((const __m256i *)ref_block->v + i));
+                state[i], _mm256_load_si256((const __m256i *)ref_block->v + i));
            block_XY[i] = _mm256_xor_si256(
-                state[i], _mm256_loadu_si256((const __m256i *)next_block->v + i));
+                state[i], _mm256_load_si256((const __m256i *)next_block->v + i));
        }
    } else {
        for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
            block_XY[i] = state[i] = _mm256_xor_si256(
-                state[i], _mm256_loadu_si256((const __m256i *)ref_block->v + i));
+                state[i], _mm256_load_si256((const __m256i *)ref_block->v + i));
        }
    }

@@ -139,7 +139,7 @@ static void fill_block(__m256i *state, const block *ref_block,

    for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
        state[i] = _mm256_xor_si256(state[i], block_XY[i]);
-        _mm256_storeu_si256((__m256i *)next_block->v + i, state[i]);
+        _mm256_store_si256((__m256i *)next_block->v + i, state[i]);
    }
 }

--- a/algo/argon2/argon2d/blake2/blamka-round-opt.h
+++ b/algo/argon2/argon2d/blake2/blamka-round-opt.h
@@ -29,6 +29,8 @@
 #include <x86intrin.h>
 #endif

+#include "simd-utils.h"
+
 #if !defined(__AVX512F__)
 #if !defined(__AVX2__)
 #if !defined(__XOP__)
@@ -182,64 +184,63 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {

 #include <immintrin.h>

-#define rotr32(x)   _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
-#define rotr24(x)   _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
-#define rotr16(x)   _mm256_shuffle_epi8(x, _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
-#define rotr63(x)   _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x)))
+#define  rotr32  mm256_swap32_64
+#define  rotr24  mm256_ror3x8_64
+#define  rotr16  mm256_ror1x16_64
+#define  rotr63( x ) mm256_rol_64( x, 1 )
+
+//#define rotr32(x)   _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
+//#define rotr24(x)   _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
+//#define rotr16(x)   _mm256_shuffle_epi8(x, _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
+//#define rotr63(x)   _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x)))

 #define G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
-        __m256i ml = _mm256_mul_epu32(A0, B0); \
-        ml = _mm256_add_epi64(ml, ml); \
-        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
+        __m256i ml0, ml1; \
+        ml0 = _mm256_mul_epu32(A0, B0); \
+        ml1 = _mm256_mul_epu32(A1, B1); \
+        ml0 = _mm256_add_epi64(ml0, ml0); \
+        ml1 = _mm256_add_epi64(ml1, ml1); \
+        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml0)); \
+        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml1)); \
        D0 = _mm256_xor_si256(D0, A0); \
-        D0 = rotr32(D0); \
-        \
-        ml = _mm256_mul_epu32(C0, D0); \
-        ml = _mm256_add_epi64(ml, ml); \
-        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
-        \
-        B0 = _mm256_xor_si256(B0, C0); \
-        B0 = rotr24(B0); \
-        \
-        ml = _mm256_mul_epu32(A1, B1); \
-        ml = _mm256_add_epi64(ml, ml); \
-        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
        D1 = _mm256_xor_si256(D1, A1); \
+        D0 = rotr32(D0); \
        D1 = rotr32(D1); \
-        \
-        ml = _mm256_mul_epu32(C1, D1); \
-        ml = _mm256_add_epi64(ml, ml); \
-        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
-        \
+        ml0 = _mm256_mul_epu32(C0, D0); \
+        ml1 = _mm256_mul_epu32(C1, D1); \
+        ml0 = _mm256_add_epi64(ml0, ml0); \
+        ml1 = _mm256_add_epi64(ml1, ml1); \
+        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml0)); \
+        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml1)); \
+        B0 = _mm256_xor_si256(B0, C0); \
        B1 = _mm256_xor_si256(B1, C1); \
+        B0 = rotr24(B0); \
        B1 = rotr24(B1); \
    } while((void)0, 0);

 #define G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
-        __m256i ml = _mm256_mul_epu32(A0, B0); \
-        ml = _mm256_add_epi64(ml, ml); \
-        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
+        __m256i ml0, ml1; \
+        ml0 = _mm256_mul_epu32(A0, B0); \
+        ml1 = _mm256_mul_epu32(A1, B1); \
+        ml0 = _mm256_add_epi64(ml0, ml0); \
+        ml1 = _mm256_add_epi64(ml1, ml1); \
+        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml0)); \
+        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml1)); \
        D0 = _mm256_xor_si256(D0, A0); \
-        D0 = rotr16(D0); \
-        \
-        ml = _mm256_mul_epu32(C0, D0); \
-        ml = _mm256_add_epi64(ml, ml); \
-        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
-        B0 = _mm256_xor_si256(B0, C0); \
-        B0 = rotr63(B0); \
-        \
-        ml = _mm256_mul_epu32(A1, B1); \
-        ml = _mm256_add_epi64(ml, ml); \
-        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
        D1 = _mm256_xor_si256(D1, A1); \
+        D0 = rotr16(D0); \
        D1 = rotr16(D1); \
-        \
-        ml = _mm256_mul_epu32(C1, D1); \
-        ml = _mm256_add_epi64(ml, ml); \
-        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
+        ml0 = _mm256_mul_epu32(C0, D0); \
+        ml1 = _mm256_mul_epu32(C1, D1); \
+        ml0 = _mm256_add_epi64(ml0, ml0); \
+        ml1 = _mm256_add_epi64(ml1, ml1); \
+        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml0)); \
+        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml1)); \
+        B0 = _mm256_xor_si256(B0, C0); \
        B1 = _mm256_xor_si256(B1, C1); \
+        B0 = rotr63(B0); \
        B1 = rotr63(B1); \
    } while((void)0, 0);

@@ -259,16 +260,14 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
        __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
        __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
        B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
-        B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
-        \
        tmp1 = C0; \
+        B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
        C0 = C1; \
-        C1 = tmp1; \
-        \
-        tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
        tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
-        D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
+        C1 = tmp1; \
+        tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
        D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
+        D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
    } while(0);

 #define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
@@ -287,16 +286,14 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
        __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
        __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
        B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
-        B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
-        \
        tmp1 = C0; \
+        B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
        C0 = C1; \
-        C1 = tmp1; \
-        \
-        tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
        tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \
-        D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
+        C1 = tmp1; \
+        tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
        D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
+        D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
    } while((void)0, 0);

 #define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \
--- a/algo/blake/blake-4way.c
+++ b/algo/blake/blake-4way.c
@@ -15,11 +15,11 @@ void blakehash_4way(void *state, const void *input)
     memcpy( &ctx, &blake_4w_ctx, sizeof ctx );
     blake256r14_4way( &ctx, input + (64<<2), 16 );
     blake256r14_4way_close( &ctx, vhash );
-     mm128_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
+     dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
 }

-int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done )
+int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
   uint32_t hash[8*4] __attribute__ ((aligned (32)));
@@ -27,43 +27,34 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   uint32_t HTarget = ptarget[7];
-   uint32_t _ALIGN(32) edata[20];
+   __m128i  *noncev = (__m128i*)vdata + 19;   // aligned
   uint32_t n = first_nonce;
-   uint32_t *nonces = work->nonces;
-   int num_found = 0;
+   int thr_id = mythr->id;  // thr_id arg is deprecated

   if (opt_benchmark)
      HTarget = 0x7f;

-   // we need big endian data...
-   swab32_array( edata, pdata, 20 );
-   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
+   mm128_bswap32_intrlv80_4x32( vdata, pdata );
   blake256r14_4way_init( &blake_4w_ctx );
   blake256r14_4way( &blake_4w_ctx, vdata, 64 );

-   uint32_t *noncep = vdata + 76;   // 19*4
   do {
-      be32enc( noncep,    n   );
-      be32enc( noncep +1, n+1 );
-      be32enc( noncep +2, n+2 );
-      be32enc( noncep +3, n+3 );
+      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );

      blakehash_4way( hash, vdata );

      for ( int i = 0; i < 4; i++ )
-      if (  (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) )
+      if ( (hash+(i<<3))[7] <= HTarget )
+      if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
      {
          pdata[19] = n+i;
-          nonces[ num_found++ ] = n+i;
-          work_set_target_ratio( work, hash+(i<<3) );
+          submit_lane_solution( work, hash+(i<<3), mythr, i );
      }
      n += 4;

-   } while ( (num_found == 0) && (n < max_nonce) 
-             && !work_restart[thr_id].restart );
-
+   } while ( (n < max_nonce) && !work_restart[thr_id].restart );
   *hashes_done = n - first_nonce + 1;
-   return num_found;
+   return 0;
 }

 #endif
@@ -79,13 +70,13 @@ void blakehash_8way( void *state, const void *input )
     memcpy( &ctx, &blake_8w_ctx, sizeof ctx );
     blake256r14_8way( &ctx, input + (64<<3), 16 );
     blake256r14_8way_close( &ctx, vhash );
-     mm256_deinterleave_8x32( state,     state+ 32, state+ 64, state+ 96,
-                              state+128, state+160, state+192, state+224,
-                              vhash, 256 );
+     _dintrlv_8x32( state,     state+ 32, state+ 64, state+ 96,
+                    state+128, state+160, state+192, state+224,
+                    vhash, 256 );
 }

-int scanhash_blake_8way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done )
+int scanhash_blake_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
   uint32_t hash[8*8] __attribute__ ((aligned (32)));
@@ -93,33 +84,21 @@ int scanhash_blake_8way( int thr_id, struct work *work, uint32_t max_nonce,
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   uint32_t HTarget = ptarget[7];
-   uint32_t _ALIGN(32) edata[20];
   uint32_t n = first_nonce;
-   uint32_t *nonces = work->nonces;
-   int num_found = 0;
+   __m256i  *noncev = (__m256i*)vdata + 19;   // aligned
+   int thr_id = mythr->id;  // thr_id arg is deprecated

   if (opt_benchmark)
      HTarget = 0x7f;

-   // we need big endian data...
-   swab32_array( edata, pdata, 20 );
-
-   mm256_interleave_8x32( vdata, edata, edata, edata, edata,
-                                 edata, edata, edata, edata, 640 );
+   mm256_bswap32_intrlv80_8x32( vdata, pdata );

   blake256r14_8way_init( &blake_8w_ctx );
   blake256r14_8way( &blake_8w_ctx, vdata, 64 );

-   uint32_t *noncep = vdata + 152;   // 19*8
   do {
-      be32enc( noncep,    n   );
-      be32enc( noncep +1, n+1 );
-      be32enc( noncep +2, n+2 );
-      be32enc( noncep +3, n+3 );
-      be32enc( noncep +4, n+4 );
-      be32enc( noncep +5, n+5 );
-      be32enc( noncep +6, n+6 );
-      be32enc( noncep +7, n+7 );
+      *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
+                                                  n+3, n+2, n+1, n ) );
      pdata[19] = n;

      blakehash_8way( hash, vdata );
@@ -128,17 +107,14 @@ int scanhash_blake_8way( int thr_id, struct work *work, uint32_t max_nonce,
      if ( (hash+i)[7] <= HTarget && fulltest( hash+i, ptarget ) )
      {
          pdata[19] = n+i;
-          num_found++;
-          nonces[i] = n+i;
-          work_set_target_ratio( work, hash+1 );
+          submit_lane_solution( work, hash+(i<<3), mythr, i );
      }
      n += 8;

-   } while ( (num_found == 0) && (n < max_nonce)
-             && !work_restart[thr_id].restart );
+   } while ( (n < max_nonce) && !work_restart[thr_id].restart );

   *hashes_done = n - first_nonce + 1;
-   return num_found;
+   return 0;
 }

 #endif
--- a/algo/blake/blake-gate.h
+++ b/algo/blake/blake-gate.h
@@ -10,12 +10,12 @@

 #if defined (BLAKE_4WAY)
 void blakehash_4way(void *state, const void *input);
-int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
+int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
 #endif

 void blakehash( void *state, const void *input );
-int scanhash_blake( int thr_id, struct work *work, uint32_t max_nonce,
-                      uint64_t *hashes_done );
+int scanhash_blake( struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done, struct thr_info *mythr );

 #endif
--- a/algo/blake/blake-hash-4way.h
+++ b/algo/blake/blake-hash-4way.h
@@ -45,7 +45,7 @@ extern "C"{

 #include <stddef.h>
 #include "algo/sha/sph_types.h"
-#include "avxdefs.h"
+#include "simd-utils.h"

 #define SPH_SIZE_blake256   256

--- a/algo/blake/blake.c
+++ b/algo/blake/blake.c
@@ -39,8 +39,8 @@ void blakehash(void *state, const void *input)

 }

-int scanhash_blake( int thr_id, struct work *work, uint32_t max_nonce,
-                      uint64_t *hashes_done )
+int scanhash_blake( struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;
@@ -49,6 +49,7 @@ int scanhash_blake( int thr_id, struct work *work, uint32_t max_nonce,
 	uint32_t _ALIGN(32) hash64[8];
 	uint32_t _ALIGN(32) endiandata[20];
 	uint32_t n = first_nonce;
+   int thr_id = mythr->id;  // thr_id arg is deprecated

 	ctx_midstate_done = false;

--- a/algo/blake/blake256-hash-4way.c
+++ b/algo/blake/blake256-hash-4way.c
@@ -308,12 +308,12 @@ static const sph_u32 CS[16] = {
 #define GS_4WAY( m0, m1, c0, c1, a, b, c, d ) \
 do { \
   a = _mm_add_epi32( _mm_add_epi32( _mm_xor_si128( \
-                 _mm_set_epi32( c1, c1, c1, c1 ), m0 ), b ), a ); \
+                                   _mm_set1_epi32( c1 ), m0 ), b ), a ); \
   d = mm128_ror_32( _mm_xor_si128( d, a ), 16 ); \
   c = _mm_add_epi32( c, d ); \
   b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \
   a = _mm_add_epi32( _mm_add_epi32( _mm_xor_si128( \
-                 _mm_set_epi32( c0, c0, c0, c0 ), m1 ), b ), a ); \
+                                   _mm_set1_epi32( c0 ), m1 ), b ), a ); \
   d = mm128_ror_32( _mm_xor_si128( d, a ), 8 ); \
   c = _mm_add_epi32( c, d ); \
   b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \
@@ -412,34 +412,16 @@ do { \
 	V5 = H5; \
 	V6 = H6; \
 	V7 = H7; \
-        V8 = _mm_xor_si128( S0, _mm_set_epi32( CS0, CS0, CS0, CS0 ) ); \
-        V9 = _mm_xor_si128( S1, _mm_set_epi32( CS1, CS1, CS1, CS1 ) ); \
-        VA = _mm_xor_si128( S2, _mm_set_epi32( CS2, CS2, CS2, CS2 ) ); \
-        VB = _mm_xor_si128( S3, _mm_set_epi32( CS3, CS3, CS3, CS3 ) ); \
-        VC = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
-                            _mm_set_epi32( CS4, CS4, CS4, CS4 ) ); \
-        VD = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
-                            _mm_set_epi32( CS5, CS5, CS5, CS5 ) ); \
-        VE = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ) \
-                          , _mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \
-        VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
-                            _mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \
-	M[0x0] = mm128_bswap_32( *(buf +  0) ); \
-	M[0x1] = mm128_bswap_32( *(buf +  1) ); \
-	M[0x2] = mm128_bswap_32( *(buf +  2) ); \
-	M[0x3] = mm128_bswap_32( *(buf +  3) ); \
-	M[0x4] = mm128_bswap_32( *(buf +  4) ); \
-	M[0x5] = mm128_bswap_32( *(buf +  5) ); \
-	M[0x6] = mm128_bswap_32( *(buf +  6) ); \
-	M[0x7] = mm128_bswap_32( *(buf +  7) ); \
-	M[0x8] = mm128_bswap_32( *(buf +  8) ); \
-	M[0x9] = mm128_bswap_32( *(buf +  9) ); \
-	M[0xA] = mm128_bswap_32( *(buf + 10) ); \
-	M[0xB] = mm128_bswap_32( *(buf + 11) ); \
-	M[0xC] = mm128_bswap_32( *(buf + 12) ); \
-	M[0xD] = mm128_bswap_32( *(buf + 13) ); \
-	M[0xE] = mm128_bswap_32( *(buf + 14) ); \
-	M[0xF] = mm128_bswap_32( *(buf + 15) ); \
+   V8 = _mm_xor_si128( S0, _mm_set1_epi32( CS0 ) ); \
+   V9 = _mm_xor_si128( S1, _mm_set1_epi32( CS1 ) ); \
+   VA = _mm_xor_si128( S2, _mm_set1_epi32( CS2 ) ); \
+   VB = _mm_xor_si128( S3, _mm_set1_epi32( CS3 ) ); \
+   VC = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS4 ) ); \
+   VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \
+   VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \
+   VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \
+   mm128_block_bswap_32( M, buf ); \
+   mm128_block_bswap_32( M+8, buf+8 ); \
 	for (r = 0; r < rounds; r ++) \
 		ROUND_S_4WAY(r); \
        H0 = _mm_xor_si128( _mm_xor_si128( \
@@ -464,6 +446,54 @@ do { \

 // current impl

+#if defined(__SSSE3__)
+
+#define BLAKE256_4WAY_BLOCK_BSWAP32 do \
+{ \
+   __m128i shuf_bswap32 = _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
+                                          0x0405060700010203 ); \
+   M0 = _mm_shuffle_epi8( buf[ 0], shuf_bswap32 ); \
+   M1 = _mm_shuffle_epi8( buf[ 1], shuf_bswap32 ); \
+   M2 = _mm_shuffle_epi8( buf[ 2], shuf_bswap32 ); \
+   M3 = _mm_shuffle_epi8( buf[ 3], shuf_bswap32 ); \
+   M4 = _mm_shuffle_epi8( buf[ 4], shuf_bswap32 ); \
+   M5 = _mm_shuffle_epi8( buf[ 5], shuf_bswap32 ); \
+   M6 = _mm_shuffle_epi8( buf[ 6], shuf_bswap32 ); \
+   M7 = _mm_shuffle_epi8( buf[ 7], shuf_bswap32 ); \
+   M8 = _mm_shuffle_epi8( buf[ 8], shuf_bswap32 ); \
+   M9 = _mm_shuffle_epi8( buf[ 9], shuf_bswap32 ); \
+   MA = _mm_shuffle_epi8( buf[10], shuf_bswap32 ); \
+   MB = _mm_shuffle_epi8( buf[11], shuf_bswap32 ); \
+   MC = _mm_shuffle_epi8( buf[12], shuf_bswap32 ); \
+   MD = _mm_shuffle_epi8( buf[13], shuf_bswap32 ); \
+   ME = _mm_shuffle_epi8( buf[14], shuf_bswap32 ); \
+   MF = _mm_shuffle_epi8( buf[15], shuf_bswap32 ); \
+} while(0)
+
+#else  // SSE2
+
+#define BLAKE256_4WAY_BLOCK_BSWAP32 do \
+{ \
+   M0 = mm128_bswap_32( buf[0] ); \
+   M1 = mm128_bswap_32( buf[1] ); \
+   M2 = mm128_bswap_32( buf[2] ); \
+   M3 = mm128_bswap_32( buf[3] ); \
+   M4 = mm128_bswap_32( buf[4] ); \
+   M5 = mm128_bswap_32( buf[5] ); \
+   M6 = mm128_bswap_32( buf[6] ); \
+   M7 = mm128_bswap_32( buf[7] ); \
+   M8 = mm128_bswap_32( buf[8] ); \
+   M9 = mm128_bswap_32( buf[9] ); \
+   MA = mm128_bswap_32( buf[10] ); \
+   MB = mm128_bswap_32( buf[11] ); \
+   MC = mm128_bswap_32( buf[12] ); \
+   MD = mm128_bswap_32( buf[13] ); \
+   ME = mm128_bswap_32( buf[14] ); \
+   MF = mm128_bswap_32( buf[15] ); \
+} while(0)
+
+#endif  // SSSE3 else SSE2
+
 #define COMPRESS32_4WAY( rounds ) \
 do { \
   __m128i M0, M1, M2, M3, M4, M5, M6, M7; \
@@ -478,30 +508,19 @@ do { \
   V5 = H5; \
   V6 = H6; \
   V7 = H7; \
-   V8 = _mm_xor_si128( S0, _mm_set1_epi32( CS0 ) ); \
-   V9 = _mm_xor_si128( S1, _mm_set1_epi32( CS1 ) ); \
-   VA = _mm_xor_si128( S2, _mm_set1_epi32( CS2 ) ); \
-   VB = _mm_xor_si128( S3, _mm_set1_epi32( CS3 ) ); \
-   VC = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS4 ) ); \
-   VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \
-   VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \
-   VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \
-   M0 = mm128_bswap_32( buf[ 0] ); \
-   M1 = mm128_bswap_32( buf[ 1] ); \
-   M2 = mm128_bswap_32( buf[ 2] ); \
-   M3 = mm128_bswap_32( buf[ 3] ); \
-   M4 = mm128_bswap_32( buf[ 4] ); \
-   M5 = mm128_bswap_32( buf[ 5] ); \
-   M6 = mm128_bswap_32( buf[ 6] ); \
-   M7 = mm128_bswap_32( buf[ 7] ); \
-   M8 = mm128_bswap_32( buf[ 8] ); \
-   M9 = mm128_bswap_32( buf[ 9] ); \
-   MA = mm128_bswap_32( buf[10] ); \
-   MB = mm128_bswap_32( buf[11] ); \
-   MC = mm128_bswap_32( buf[12] ); \
-   MD = mm128_bswap_32( buf[13] ); \
-   ME = mm128_bswap_32( buf[14] ); \
-   MF = mm128_bswap_32( buf[15] ); \
+   V8 = _mm_xor_si128( S0, m128_const1_64( 0x243F6A88243F6A88 ) ); \
+   V9 = _mm_xor_si128( S1, m128_const1_64( 0x85A308D385A308D3 ) ); \
+   VA = _mm_xor_si128( S2, m128_const1_64( 0x13198A2E13198A2E ) ); \
+   VB = _mm_xor_si128( S3, m128_const1_64( 0x0370734403707344 ) ); \
+   VC = _mm_xor_si128( _mm_set1_epi32( T0 ), \
+                           m128_const1_64( 0xA4093822A4093822 ) ); \
+   VD = _mm_xor_si128( _mm_set1_epi32( T0 ), \
+                           m128_const1_64( 0x299F31D0299F31D0 ) ); \
+   VE = _mm_xor_si128( _mm_set1_epi32( T1 ), \
+                           m128_const1_64( 0x082EFA98082EFA98 ) ); \
+   VF = _mm_xor_si128( _mm_set1_epi32( T1 ), \
+                           m128_const1_64( 0xEC4E6C89EC4E6C89 ) ); \
+   BLAKE256_4WAY_BLOCK_BSWAP32; \
   ROUND_S_4WAY(0); \
   ROUND_S_4WAY(1); \
   ROUND_S_4WAY(2); \
@@ -519,14 +538,14 @@ do { \
      ROUND_S_4WAY(2); \
      ROUND_S_4WAY(3); \
   } \
-   H0 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( V8, V0 ), S0 ), H0 ); \
-   H1 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( V9, V1 ), S1 ), H1 ); \
-   H2 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VA, V2 ), S2 ), H2 ); \
-   H3 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VB, V3 ), S3 ), H3 ); \
-   H4 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VC, V4 ), S0 ), H4 ); \
-   H5 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VD, V5 ), S1 ), H5 ); \
-   H6 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VE, V6 ), S2 ), H6 ); \
-   H7 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VF, V7 ), S3 ), H7 ); \
+   H0 = mm128_xor4( V8, V0, S0, H0 ); \
+   H1 = mm128_xor4( V9, V1, S1, H1 ); \
+   H2 = mm128_xor4( VA, V2, S2, H2 ); \
+   H3 = mm128_xor4( VB, V3, S3, H3 ); \
+   H4 = mm128_xor4( VC, V4, S0, H4 ); \
+   H5 = mm128_xor4( VD, V5, S1, H5 ); \
+   H6 = mm128_xor4( VE, V6, S2, H6 ); \
+   H7 = mm128_xor4( VF, V7, S3, H7 ); \
 } while (0)

 #endif
@@ -607,6 +626,7 @@ do { \
   __m256i M8, M9, MA, MB, MC, MD, ME, MF; \
   __m256i V0, V1, V2, V3, V4, V5, V6, V7; \
   __m256i V8, V9, VA, VB, VC, VD, VE, VF; \
+   __m256i shuf_bswap32; \
   V0 = H0; \
   V1 = H1; \
   V2 = H2; \
@@ -615,30 +635,36 @@ do { \
   V5 = H5; \
   V6 = H6; \
   V7 = H7; \
-   V8 = _mm256_xor_si256( S0, _mm256_set1_epi32( CS0 ) ); \
-   V9 = _mm256_xor_si256( S1, _mm256_set1_epi32( CS1 ) ); \
-   VA = _mm256_xor_si256( S2, _mm256_set1_epi32( CS2 ) ); \
-   VB = _mm256_xor_si256( S3, _mm256_set1_epi32( CS3 ) ); \
-   VC = _mm256_xor_si256( _mm256_set1_epi32( T0 ), _mm256_set1_epi32( CS4 ) ); \
-   VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ), _mm256_set1_epi32( CS5 ) ); \
-   VE = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS6 ) ); \
-   VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS7 ) ); \
-   M0 = mm256_bswap_32( * buf ); \
-   M1 = mm256_bswap_32( *(buf+1) ); \
-   M2 = mm256_bswap_32( *(buf+2) ); \
-   M3 = mm256_bswap_32( *(buf+3) ); \
-   M4 = mm256_bswap_32( *(buf+4) ); \
-   M5 = mm256_bswap_32( *(buf+5) ); \
-   M6 = mm256_bswap_32( *(buf+6) ); \
-   M7 = mm256_bswap_32( *(buf+7) ); \
-   M8 = mm256_bswap_32( *(buf+8) ); \
-   M9 = mm256_bswap_32( *(buf+9) ); \
-   MA = mm256_bswap_32( *(buf+10) ); \
-   MB = mm256_bswap_32( *(buf+11) ); \
-   MC = mm256_bswap_32( *(buf+12) ); \
-   MD = mm256_bswap_32( *(buf+13) ); \
-   ME = mm256_bswap_32( *(buf+14) ); \
-   MF = mm256_bswap_32( *(buf+15) ); \
+   V8 = _mm256_xor_si256( S0, m256_const1_64( 0x243F6A88243F6A88 ) ); \
+   V9 = _mm256_xor_si256( S1, m256_const1_64( 0x85A308D385A308D3 ) ); \
+   VA = _mm256_xor_si256( S2, m256_const1_64( 0x13198A2E13198A2E ) ); \
+   VB = _mm256_xor_si256( S3, m256_const1_64( 0x0370734403707344 ) ); \
+   VC = _mm256_xor_si256( _mm256_set1_epi32( T0 ),\
+                              m256_const1_64( 0xA4093822A4093822 ) ); \
+   VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ),\
+                              m256_const1_64( 0x299F31D0299F31D0 ) ); \
+   VE = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \
+                              m256_const1_64( 0x082EFA98082EFA98 ) ); \
+   VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \
+                              m256_const1_64( 0xEC4E6C89EC4E6C89 ) ); \
+   shuf_bswap32 = m256_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203, \
+                                 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
+   M0 = _mm256_shuffle_epi8( * buf    , shuf_bswap32 ); \
+   M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
+   M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \
+   M3 = _mm256_shuffle_epi8( *(buf+ 3), shuf_bswap32 ); \
+   M4 = _mm256_shuffle_epi8( *(buf+ 4), shuf_bswap32 ); \
+   M5 = _mm256_shuffle_epi8( *(buf+ 5), shuf_bswap32 ); \
+   M6 = _mm256_shuffle_epi8( *(buf+ 6), shuf_bswap32 ); \
+   M7 = _mm256_shuffle_epi8( *(buf+ 7), shuf_bswap32 ); \
+   M8 = _mm256_shuffle_epi8( *(buf+ 8), shuf_bswap32 ); \
+   M9 = _mm256_shuffle_epi8( *(buf+ 9), shuf_bswap32 ); \
+   MA = _mm256_shuffle_epi8( *(buf+10), shuf_bswap32 ); \
+   MB = _mm256_shuffle_epi8( *(buf+11), shuf_bswap32 ); \
+   MC = _mm256_shuffle_epi8( *(buf+12), shuf_bswap32 ); \
+   MD = _mm256_shuffle_epi8( *(buf+13), shuf_bswap32 ); \
+   ME = _mm256_shuffle_epi8( *(buf+14), shuf_bswap32 ); \
+   MF = _mm256_shuffle_epi8( *(buf+15), shuf_bswap32 ); \
   ROUND_S_8WAY(0); \
   ROUND_S_8WAY(1); \
   ROUND_S_8WAY(2); \
@@ -656,22 +682,14 @@ do { \
      ROUND_S_8WAY(2); \
      ROUND_S_8WAY(3); \
   } \
-   H0 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( V8, V0 ), \
-                                                              S0 ), H0 ); \
-   H1 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( V9, V1 ), \
-                                                              S1 ), H1 ); \
-   H2 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VA, V2 ), \
-                                                              S2 ), H2 ); \
-   H3 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VB, V3 ), \
-                                                              S3 ), H3 ); \
-   H4 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VC, V4 ), \
-                                                              S0 ), H4 ); \
-   H5 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VD, V5 ), \
-                                                              S1 ), H5 ); \
-   H6 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VE, V6 ), \
-                                                              S2 ), H6 ); \
-   H7 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VF, V7 ), \
-                                                              S3 ), H7 ); \
+   H0 = mm256_xor4( V8, V0, S0, H0 ); \
+   H1 = mm256_xor4( V9, V1, S1, H1 ); \
+   H2 = mm256_xor4( VA, V2, S2, H2 ); \
+   H3 = mm256_xor4( VB, V3, S3, H3 ); \
+   H4 = mm256_xor4( VC, V4, S0, H4 ); \
+   H5 = mm256_xor4( VD, V5, S1, H5 ); \
+   H6 = mm256_xor4( VE, V6, S2, H6 ); \
+   H7 = mm256_xor4( VF, V7, S3, H7 ); \
 } while (0)


@@ -685,25 +703,20 @@ static void
 blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
                   const uint32_t *salt, int rounds )
 {
-   casti_m128i( ctx->H, 0 ) = _mm_set1_epi32( iv[0] );
-   casti_m128i( ctx->H, 1 ) = _mm_set1_epi32( iv[1] );
-   casti_m128i( ctx->H, 2 ) = _mm_set1_epi32( iv[2] );
-   casti_m128i( ctx->H, 3 ) = _mm_set1_epi32( iv[3] );
-   casti_m128i( ctx->H, 4 ) = _mm_set1_epi32( iv[4] );
-   casti_m128i( ctx->H, 5 ) = _mm_set1_epi32( iv[5] );
-   casti_m128i( ctx->H, 6 ) = _mm_set1_epi32( iv[6] );
-   casti_m128i( ctx->H, 7 ) = _mm_set1_epi32( iv[7] );
+   __m128i zero = m128_zero;   // NOTE(review): iv is no longer read here
+   casti_m128i( ctx->H, 0 ) = m128_const1_64( 0x6A09E6676A09E667 );
+   casti_m128i( ctx->H, 1 ) = m128_const1_64( 0xBB67AE85BB67AE85 );
+   casti_m128i( ctx->H, 2 ) = m128_const1_64( 0x3C6EF3723C6EF372 );
+   casti_m128i( ctx->H, 3 ) = m128_const1_64( 0xA54FF53AA54FF53A );
+   casti_m128i( ctx->H, 4 ) = m128_const1_64( 0x510E527F510E527F );
+   casti_m128i( ctx->H, 5 ) = m128_const1_64( 0x9B05688C9B05688C );
+   casti_m128i( ctx->H, 6 ) = m128_const1_64( 0x1F83D9AB1F83D9AB );
+   casti_m128i( ctx->H, 7 ) = m128_const1_64( 0x5BE0CD195BE0CD19 );

-   casti_m128i( ctx->S, 0 ) = m128_zero;
-   casti_m128i( ctx->S, 1 ) = m128_zero;
-   casti_m128i( ctx->S, 2 ) = m128_zero;
-   casti_m128i( ctx->S, 3 ) = m128_zero;
-/*
-   sc->S[0] = _mm_set1_epi32( salt[0] );
-   sc->S[1] = _mm_set1_epi32( salt[1] );
-   sc->S[2] = _mm_set1_epi32( salt[2] );
-   sc->S[3] = _mm_set1_epi32( salt[3] );
-*/
+   casti_m128i( ctx->S, 0 ) = zero;
+   casti_m128i( ctx->S, 1 ) = zero;
+   casti_m128i( ctx->S, 2 ) = zero;
+   casti_m128i( ctx->S, 3 ) = zero;
   ctx->T0 = ctx->T1 = 0;
   ctx->ptr = 0;
   ctx->rounds = rounds;
@@ -773,12 +786,13 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
   else
      ctx->T0 -= 512 - bit_len;

-   buf[vptr] = _mm_set1_epi32( 0x80 );
+   buf[vptr] = m128_const1_64( 0x0000008000000080 );

   if ( vptr < 12 )
   {
      memset_zero_128( buf + vptr + 1, 13 - vptr  );
-      buf[ 13 ] = _mm_or_si128( buf[ 13 ], _mm_set1_epi32( 0x01000000UL ) );
+      buf[ 13 ] = _mm_or_si128( buf[ 13 ],
+                                m128_const1_64( 0x0100000001000000ULL ) );
      buf[ 14 ] = mm128_bswap_32( _mm_set1_epi32( th ) );
      buf[ 15 ] = mm128_bswap_32( _mm_set1_epi32( tl ) );
      blake32_4way( ctx, buf + vptr, 64 - ptr );
@@ -790,20 +804,14 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
      ctx->T0 = 0xFFFFFE00UL;
      ctx->T1 = 0xFFFFFFFFUL;
      memset_zero_128( buf, 56>>2 );
-      buf[ 13 ] = _mm_or_si128( buf[ 13 ], _mm_set1_epi32( 0x01000000UL ) );
+      buf[ 13 ] = _mm_or_si128( buf[ 13 ],
+                                m128_const1_64( 0x0100000001000000ULL ) );
      buf[ 14 ] = mm128_bswap_32( _mm_set1_epi32( th ) );
      buf[ 15 ] = mm128_bswap_32( _mm_set1_epi32( tl ) );
      blake32_4way( ctx, buf, 64 );
   }

-   casti_m128i( dst, 0 ) = mm128_bswap_32( casti_m128i( ctx->H, 0 ) );
-   casti_m128i( dst, 1 ) = mm128_bswap_32( casti_m128i( ctx->H, 1 ) );
-   casti_m128i( dst, 2 ) = mm128_bswap_32( casti_m128i( ctx->H, 2 ) );
-   casti_m128i( dst, 3 ) = mm128_bswap_32( casti_m128i( ctx->H, 3 ) );
-   casti_m128i( dst, 4 ) = mm128_bswap_32( casti_m128i( ctx->H, 4 ) );
-   casti_m128i( dst, 5 ) = mm128_bswap_32( casti_m128i( ctx->H, 5 ) );
-   casti_m128i( dst, 6 ) = mm128_bswap_32( casti_m128i( ctx->H, 6 ) );
-   casti_m128i( dst, 7 ) = mm128_bswap_32( casti_m128i( ctx->H, 7 ) );
+   mm128_block_bswap_32( (__m128i*)dst, (__m128i*)ctx->H );
 }

 #if defined (__AVX2__)
@@ -816,11 +824,19 @@ static void
 blake32_8way_init( blake_8way_small_context *sc, const sph_u32 *iv,
                   const sph_u32 *salt, int rounds )
 {
-   int i;
-   for ( i = 0; i < 8; i++ )
-      sc->H[i] = _mm256_set1_epi32( iv[i] );
-   for ( i = 0; i < 4; i++ )
-      sc->S[i] = _mm256_set1_epi32( salt[i] );
+   __m256i zero = m256_zero;   // NOTE(review): iv and salt are no longer read
+   casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E6676A09E667 );
+   casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE85BB67AE85 );
+   casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF3723C6EF372 );
+   casti_m256i( sc->H, 3 ) = m256_const1_64( 0xA54FF53AA54FF53A );
+   casti_m256i( sc->H, 4 ) = m256_const1_64( 0x510E527F510E527F );
+   casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C9B05688C );
+   casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9AB1F83D9AB );
+   casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD195BE0CD19 );
+   casti_m256i( sc->S, 0 ) = zero;
+   casti_m256i( sc->S, 1 ) = zero;
+   casti_m256i( sc->S, 2 ) = zero;
+   casti_m256i( sc->S, 3 ) = zero;
   sc->T0 = sc->T1 = 0;
   sc->ptr = 0;
   sc->rounds = rounds;
@@ -872,18 +888,14 @@ static void
 blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
                    void *dst, size_t out_size_w32 )
 {
-//   union {
-        __m256i buf[16];
-//        sph_u32 dummy;
-//   } u;
-   size_t ptr, k;
+   __m256i buf[16];
+   size_t ptr;
   unsigned bit_len;
   sph_u32 th, tl;
-   __m256i *out;

   ptr = sc->ptr;
   bit_len = ((unsigned)ptr << 3);
-   buf[ptr>>2] = _mm256_set1_epi32( 0x80 );
+   buf[ptr>>2] = m256_const1_64( 0x0000008000000080ULL );
   tl = sc->T0 + bit_len;
   th = sc->T1;

@@ -905,7 +917,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
       memset_zero_256( buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
       if ( out_size_w32 == 8 )
           buf[52>>2] = _mm256_or_si256( buf[52>>2],
-                                           _mm256_set1_epi32( 0x01000000UL ) );
+                                m256_const1_64( 0x0100000001000000ULL ) );
       *(buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
       *(buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
       blake32_8way( sc, buf + (ptr>>2), 64 - ptr );
@@ -918,14 +930,12 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
        sc->T1 = SPH_C32(0xFFFFFFFFUL);
        memset_zero_256( buf, 56>>2 );
       if ( out_size_w32 == 8 )
-           buf[52>>2] = _mm256_set1_epi32( 0x01000000UL );
+           buf[52>>2] = m256_const1_64( 0x0100000001000000ULL );
        *(buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
        *(buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
        blake32_8way( sc, buf, 64 );
   }
-   out = (__m256i*)dst;
-   for ( k = 0; k < out_size_w32; k++ )
-        out[k] = mm256_bswap_32( sc->H[k] );
+   mm256_block_bswap_32( (__m256i*)dst, (__m256i*)sc->H );   // NOTE(review): out_size_w32 is not passed here; confirm dst holds everything this writes
 }

 #endif
--- a/algo/blake/blake256-hash-4way.c.new
+++ b/algo/blake/blake256-hash-4way.c.new
@@ -1,322 +0,0 @@
-// convert blake256 32 bit to use 64 bit with serial vectoring
-//
-//  cut calls to GS in half
-//
-// combine V
-// v0 = {V0,V1}
-// v1 = {V2,V3}
-// v2 = {V4,V5}
-// v3 = {V6,V7}
-// v4 = {V8,V9}
-// v5 = {VA,VB}
-// v6 = {VC,VD}
-// v7 = {CE,VF}
-//
-// v6x = {VD,VC}      swap(VC,VD)   swap(v6)
-// v7x = {VF,VE}      swap(VE,VF)   swap(v7)
-//
-// V0 = v1v0
-// V1 = v3v2
-// V2 = v5v4
-// V3 = v7v6
-// V4 = v9v8
-// V5 = vbva
-// V6 = vdvc
-// V7 = vfve
-//
-// The rotate in ROUND is to effect straddle and unstraddle for the third
-// and 4th iteration of GS.
-// It concatenates 2 contiguous 256 bit vectors and extracts the middle
-// 256 bits. After the transform they must be restored with only the
-// chosen bits modified in the original 2 vectors.
-// ror1x128 achieves this by putting the chosen bits in arg1, the "low"
-// 256 bit vector and saves the untouched bits temporailly in arg0, the
-// "high" 256 bit vector. Simply reverse the process to restore data back
-// to original positions.
-
-// Use standard 4way when AVX2 is not available use x2 mode with AVX2.
-//
-// Data is organised the same as 32 bit 4 way, in effect serial vectoring
-// on top of parallel vectoring. Same data in the same place just taking
-// two chunks at a time.
-//
-// Transparent to user, x2 mode used when AVX2 detected.
-// Use existing 4way context but revert to scalar types.
-// Same interleave function (128 bit) or x2 with 256 bit?
-// User trsnaparency would have to apply to interleave as well.
-//
-// Use common 4way update and close
-
-/*
-typedef struct {
-   unsigned char buf[64<<2];
-   uint32_t H[8<<2];
-   uint32_t S[4<<2];
-   size_t ptr;
-   uint32_t T0, T1;
-   int rounds;   // 14 for blake, 8 for blakecoin & vanilla
-} blakex2_4way_small_context __attribute__ ((aligned (64)));
-*/
-
-static void
-blake32x2_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
-                   const uint32_t *salt, int rounds )
-{
-   casti_m128i( ctx->H, 0 ) = _mm_set1_epi32( iv[0] );
-   casti_m128i( ctx->H, 1 ) = _mm_set1_epi32( iv[1] );
-   casti_m128i( ctx->H, 2 ) = _mm_set1_epi32( iv[2] );
-   casti_m128i( ctx->H, 3 ) = _mm_set1_epi32( iv[3] );
-   casti_m128i( ctx->H, 4 ) = _mm_set1_epi32( iv[4] );
-   casti_m128i( ctx->H, 5 ) = _mm_set1_epi32( iv[5] );
-   casti_m128i( ctx->H, 6 ) = _mm_set1_epi32( iv[6] );
-   casti_m128i( ctx->H, 7 ) = _mm_set1_epi32( iv[7] );
-
-   casti_m128i( ctx->S, 0 ) = m128_zero;
-   casti_m128i( ctx->S, 1 ) = m128_zero;
-   casti_m128i( ctx->S, 2 ) = m128_zero;
-   casti_m128i( ctx->S, 3 ) = m128_zero;
-/*
-   sc->S[0] = _mm_set1_epi32( salt[0] );
-   sc->S[1] = _mm_set1_epi32( salt[1] );
-   sc->S[2] = _mm_set1_epi32( salt[2] );
-   sc->S[3] = _mm_set1_epi32( salt[3] );
-*/
-   ctx->T0 = ctx->T1 = 0;
-   ctx->ptr = 0;
-   ctx->rounds = rounds;
-}
-
-static void
-blake32x2( blake_4way_small_context *ctx, const void *data, size_t len )
-{
-   __m128i *buf = (__m256i*)ctx->buf;
-   size_t  bptr = ctx->ptr << 2;
-   size_t  vptr = ctx->ptr >> 3;
-   size_t  blen = len << 2;
-//    unsigned char *buf = ctx->buf;
-//    size_t ptr         = ctx->ptr<<4;  // repurposed
-    DECL_STATE32x2
-
-//    buf = sc->buf;
-//    ptr = sc->ptr;
-
-// adjust len for use with ptr, clen, all absolute bytes.
-//    int blen = len<<2;
-
-    if ( blen < (sizeof ctx->buf) - bptr )
-    {
-        memcpy( buf + vptr, data, blen );
-        ptr += blen;
-        ctx->ptr = bptr >> 2;;
-        return;
-    }
-
-    READ_STATE32( ctx );
-    while ( blen > 0 )
-    {
-        size_t clen;
-
-        clen = ( sizeof sc->buf ) - ptr;
-        if ( clen > blen )
-            clen = blen;
-        memcpy( buf + vptr, data, clen );
-        bptr += clen;
-        vptr = bptr >> 5;
-	data = (const unsigned char *)data + clen;
-        blen -= clen;
-        if ( bptr == sizeof ctx->buf )
-       	{
-           if ( ( T0 = T0 + 512 ) < 512 ) // not needed, will never rollover
-               T1 += 1;
-           COMPRESS32x2_4WAY( ctx->rounds );
-           ptr = 0;
-        }
-    }
-    WRITE_STATE32x2( ctx );
-    ctx->ptr = bptr >> 2;
-}
-
-static void
-blake32x2_4way_close( blake_4way_small_context *ctx, void *dst )
-{
-   __m256i buf[8] __attribute__ ((aligned (64)));
-   size_t   ptr     = ctx->ptr;
-   size_t   vptr    = ctx->ptr>>2;
-   unsigned bit_len = ( (unsigned)ptr << 3 );  // one lane
-   uint32_t th      = ctx->T1;
-   uint32_t tl      = ctx->T0 + bit_len;
-
-   if ( ptr == 0 )
-   {
-        ctx->T0 = 0xFFFFFE00UL;
-        ctx->T1 = 0xFFFFFFFFUL;
-   }
-   else if ( ctx->T0 == 0 )
-   {
-      ctx->T0 = 0xFFFFFE00UL + bit_len;
-      ctx->T1 -= 1;
-   }
-   else
-      ctx->T0 -= 512 - bit_len;
-
-   // memset doesn't do ints
-   buf[ vptr ] = _mm256_set_epi32( 0,0,0,0, 0x80, 0x80, 0x80, 0x80 );
-
-   if ( vptr < 5 )
-   {
-       memset_zero_256( buf + vptr + 1, 6 - vptr  );
-       buf[ 6 ] = _mm256_or_si256( vbuf[ 6 ], _mm256_set_epi32(
-             0x01000000UL,0x01000000UL,0x01000000UL,0x01000000UL, 0,0,0,0 ) ); 
-       buf[ 7 ] = mm256_bswap_32( _mm256_set_epi32( tl,tl,tl,tl,
-			                            th,th,th,th ) );
-       blake32x2_4way( ctx, buf + vptr, 64 - ptr );
-   }
-   else
-   {
-       memset_zero_256( vbuf + vptr + 1, 7 - vptr );
-       blake32x2_4way( ctx,  vbuf + ptr, 64 - ptr );
-       ctx->T0 = 0xFFFFFE00UL;
-       ctx->T1 = 0xFFFFFFFFUL;
-       buf[ 6 ] = mm256_zero;
-       buf[ 6 ] = _mm256_set_epi32( 0,0,0,0,
-		         0x01000000UL,0x01000000UL,0x01000000UL,0x01000000UL );
-       buf[ 7 ] = mm256_bswap_32( _mm256_set_epi32( tl, tl, tl, tl,
-                                                    th, th, th, th );
-       blake32x2_4way( ctx, buf, 64 );
-   }
-
-   casti_m256i( dst, 0 ) = mm256_bswap_32( casti_m256i( ctx->H, 0 ) );
-   casti_m256i( dst, 1 ) = mm256_bswap_32( casti_m256i( ctx->H, 1 ) );
-   casti_m256i( dst, 2 ) = mm256_bswap_32( casti_m256i( ctx->H, 2 ) );
-   casti_m256i( dst, 3 ) = mm256_bswap_32( casti_m256i( ctx->H, 3 ) );
-}
-
-
-
-
-#define DECL_STATE32x2_4WAY \
-   __m256i H0, H1, H2, H3; \
-   __m256i S0, S1; \
-   uint32_t T0, T1;
-
-#define READ_STATE32x2_4WAY(state)  do \
-{ \
-   H0 = casti_m256i( state->H, 0 ); \
-   H1 = casti_m256i( state->H, 1 ); \
-   H2 = casti_m256i( state->H, 2 ); \
-   H3 = casti_m256i( state->H, 3 ); \
-   S0 = casti_m256i( state->S, 0 ); \
-   S1 = casti_m256i( state->S, 1 ); \
-   T0 = state->T0; \
-   T1 = state->T1; \
-
-#define WRITE_STATE32x2_4WAY(state)   do { \
-   casti_m256i( state->H, 0 ) = H0; \
-   casti_m256i( state->H, 1 ) = H1; \
-   casti_m256i( state->H, 2 ) = H2; \
-   casti_m256i( state->H, 3 ) = H3; \
-   casti_m256i( state->S, 0 ) = S0; \
-   casti_m256i( state->S, 1 ) = S1; \
-   state->T0 = T0; \
-   state->T1 = T1; \
-} while (0)
-
-
-#define GSx2_4WAY( m0m2, m1m3, c0c2, c1c3, a, b, c, d ) do \
-{ \
-   a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
-          _mm256_set_epi32( c1,c3, c1,c3, c1,c3, c1,c3 ), \
-	  _mm256_set_epi32( m0,m2, m0,m2, m0,m2, m0,m2 ) ), b ), a ); \
-   d = mm256_ror_32( _mm_xor_si128( d, a ), 16 ); \
-   c = _mm256_add_epi32( c, d ); \
-   b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
-   a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
-          _mm256_set_epi32( c0,c2, c0,c2, c0,c2, c0,c2 ), \
-	  _mm256_set_epi32( m1,m3, m1,m3, m1,m3, m1,m3 ) ), b ), a ); \
-   d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \
-   c = _mm256_add_epi32( c, d ); \
-   b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
-} while (0)
-
-#define ROUND_Sx2_4WAY(r)   do \
-{ \
-  GS2_4WAY( Mx(r, 0),  Mx(r, 1),  Mx(r, 2),  Mx(r, 3), \
-           CSx(r, 0), CSx(r, 1), CSx(r, 2), CSx(r, 3), V0, V2, V4, V6 ); \
-  GS2_4WAY( Mx(r, 4),  Mx(r, 5),  Mx(r, 6),  Mx(r, 7), \
-           CSx(r, 4), CSx(r, 5), CSx(r, 6), CSx(r, 7), V1, V3, V5, V7 ); \
-  mm256_ror1x128_512( V3, V2 ); \
-  mm256_ror1x128_512( V6, V7 ); \
-  GS2_4WAY( Mx(r, 8),  Mx(r, 9),  Mx(r, A),  Mx(r, B), \
-           CSx(r, 8), CSx(r, 9), CSx(r, A), CSx(r, B), V0, V2, V5, V7 ); \
-  GS2_4WAY( Mx(r, C),  Mx(r, D),  Mx(r, C),  Mx(r, D), \
-           CSx(r, C), CSx(r, D), CSx(r, C), CSx(r, D), V1, V3, V4, V6 ); \
-  mm256_rol1x128_512( V2, V3 ); \
-  mm256_rol1x128_512( V7, V6 ); 
-
-#define COMPRESS32x2_4WAY( rounds ) do \
-{ \
-   __m256i M0, M1, M2, M3, M4, M5, M6, M7; \
-   __m256i V0, V1, V2, V3, V4, V5, V6, V7; \
-   unsigned r; \
-   V0 = H0; \
-   V1 = H1; \
-   V2 = H2; \
-   V3 = H3; \
-   V4 = _mm256_xor_si256( S0, _mm256_set_epi32( CS1, CS1, CS1, CS1, \
-			                        CS0, CS0, CS0, CS0 ) ); \
-   V5 = _mm256_xor_si256( S1, _mm256_set_epi32( CS3, CS3, CS3, CS3, \
-                                                CS2, CS2, CS2, CS2 ) ); \
-   V6 = _mm256_xor_si256( _mm256_set1_epi32( T0 ), \
-                              _mm256_set_epi32( CS5, CS5, CS5, CS5, \
-		                                CS4, CS4, CS4, CS4 ) ); \
-   V7 = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \
-                              _mm256_set_epi32( CS7, CS7, CS7, CS7, \
-                                                CS6, CS6, CS6, CS6 ) ); \
-   M0 = mm256_bswap_32( buf[ 0] ); \
-   M1 = mm256_bswap_32( buf[ 1] ); \
-   M2 = mm256_bswap_32( buf[ 2] ); \
-   M3 = mm256_bswap_32( buf[ 3] ); \
-   M4 = mm256_bswap_32( buf[ 4] ); \
-   M5 = mm256_bswap_32( buf[ 5] ); \
-   M6 = mm256_bswap_32( buf[ 6] ); \
-   M7 = mm256_bswap_32( buf[ 7] ); \
-   ROUND_Sx2_4WAY(0); \
-   ROUND_Sx2_4WAY(1); \
-   ROUND_Sx2_4WAY(2); \
-   ROUND_Sx2_4WAY(3); \
-   ROUND_Sx2_4WAY(4); \
-   ROUND_Sx2_4WAY(5); \
-   ROUND_Sx2_4WAY(6); \
-   ROUND_Sx2_4WAY(7); \
-   if (rounds == 14) \
-   { \
-      ROUND_Sx2_4WAY(8); \
-      ROUND_Sx2_4WAY(9); \
-      ROUND_Sx2_4WAY(0); \
-      ROUND_Sx2_4WAY(1); \
-      ROUND_Sx2_4WAY(2); \
-      ROUND_Sx2_4WAY(3); \
-   } \
-   H0 = _mm256_xor_si256( _mm256_xor_si256( \
-			           _mm256_xor_si256( V8, V0 ), S0 ), H0 ); \
-   H1 = _mm256_xor_si256( _mm256_xor_si256( \
-			           _mm256_xor_si256( V9, V1 ), S1 ), H1 ); \
-   H2 = _mm256_xor_si256( _mm256_xor_si256( \
-			           _mm256_xor_si256( VA, V2 ), S2 ), H2 ); \
-   H3 = _mm256_xor_si256( _mm256_xor_si256( \
-			           _mm256_xor_si256( VB, V3 ), S3 ), H3 ); \
-} while (0)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
--- a/algo/blake/blake2b-4way.c
+++ b/algo/blake/blake2b-4way.c
@@ -0,0 +1,67 @@
+/**
+ * Blake2-B Implementation
+ * tpruvot@github 2015-2016
+ */
+
+#include "blake2b-gate.h"
+
+#if defined(BLAKE2B_4WAY)
+
+#include <string.h>
+#include <stdint.h>
+#include "blake2b-hash-4way.h"
+
+// Registered as gate->hash; scanhash_blake2b_4way inlines these steps instead of calling it.
+void blake2b_4way_hash(void *output, const void *input)
+{
+    blake2b_4way_ctx ctx;
+    blake2b_4way_init( &ctx );
+    blake2b_4way_update( &ctx, input, 80 );   // 80: presumably the input length in bytes
+    blake2b_4way_final( &ctx, output );
+}
+
+// Hash the 80-byte work data for nonces n..n+3 in one pass (4x64 interleave),
+// starting at pdata[19].  A lane that passes fulltest() is submitted unless
+// opt_benchmark is set.  Stores the nonce count in *hashes_done; returns 0.
+int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
+                           uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t hash[8*4] __attribute__ ((aligned (64)));
+   uint32_t vdata[20*4] __attribute__ ((aligned (32)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   blake2b_4way_ctx ctx __attribute__ ((aligned (32)));
+   uint32_t *hash7 = &(hash[25]);   // 3*8+1
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   int thr_id = mythr->id;
+   __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+
+   mm256_bswap32_intrlv80_4x64( vdata, pdata );
+
+   do {
+      *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
+      blake2b_4way_init( &ctx );
+      blake2b_4way_update( &ctx, vdata, 80 );
+      blake2b_4way_final( &ctx, hash );
+
+      for ( int lane = 0; lane < 4; lane++ )
+      if ( hash7[ lane<<1 ] < Htarg )
+      {
+          extr_lane_4x64( lane_hash, hash, lane, 256 );
+          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+          {
+              pdata[19] = n + lane;
+              submit_lane_solution( work, lane_hash, mythr, lane );
+          }
+      }
+      n += 4;
+   } while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
+   *hashes_done = n - first_nonce;   // n advances by exactly 4 per pass
+   return 0;
+}
+
+#endif
--- a/algo/blake/blake2b-gate.c
+++ b/algo/blake/blake2b-gate.c
@@ -0,0 +1,25 @@
+#include "blake2b-gate.h"
+
+// Register the Blake2b entry points in the algo gate.
+//
+// The 4-way AVX2 functions are installed when BLAKE2B_4WAY is defined
+// (see blake2b-gate.h), the scalar ones otherwise.  Always returns true.
+//
+// get_max64 is not set here: the override copied from blake2s (it returned
+// 0x7ffffLL; cpuminer-multi-decred changed it to get_max64_0x3fffffLL) is
+// left disabled.
+bool register_blake2b_algo( algo_gate_t* gate )
+{
+#if defined(BLAKE2B_4WAY)
+  gate->scanhash  = (void*)&scanhash_blake2b_4way;
+  gate->hash      = (void*)&blake2b_4way_hash;
+#else
+  gate->scanhash  = (void*)&scanhash_blake2b;
+  gate->hash      = (void*)&blake2b_hash;
+#endif
+  // No get_max64 override is installed for Blake2b.
+  gate->optimizations =  AVX2_OPT;
+  return true;
+}
+
+
--- a/algo/blake/blake2b-gate.h
+++ b/algo/blake/blake2b-gate.h
@@ -0,0 +1,26 @@
+#ifndef __BLAKE2B_GATE_H__
+#define __BLAKE2B_GATE_H__ 1
+// Compile-time selection and prototypes for the Blake2b algo gate.
+#include <stdint.h>
+#include "algo-gate-api.h"
+
+#if defined(__AVX2__)   // the 4-way Blake2b code is built only with AVX2
+  #define BLAKE2B_4WAY
+#endif
+
+bool register_blake2b_algo( algo_gate_t* gate );
+
+#if defined(BLAKE2B_4WAY)
+// Prototypes for the 4-way implementation (blake2b-4way.c).
+void blake2b_4way_hash( void *state, const void *input );
+int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+#else   // !BLAKE2B_4WAY
+// Prototypes for the scalar implementation.
+void blake2b_hash( void *state, const void *input );
+int scanhash_blake2b( struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done, struct thr_info *mythr );
+
+#endif  // BLAKE2B_4WAY
+
+#endif  // __BLAKE2B_GATE_H__
--- a/algo/blake/blake2b-hash-4way.c
+++ b/algo/blake/blake2b-hash-4way.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright 2009 Colin Percival, 2014 savale
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file was originally written by Colin Percival as part of the Tarsnap
+ * online backup system.
+ */
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "blake2b-hash-4way.h"
+
+#if defined(__AVX2__)
+
+// G mixing function: folds message words x and y into v[a], v[b], v[c], v[d]
+// of the caller's local v[].  Expands to a single statement.
+#define B2B_G(a, b, c, d, x, y) \
+do { \
+   v[a] = _mm256_add_epi64( _mm256_add_epi64( v[a], v[b] ), (x) ); \
+   v[d] = mm256_ror_64( _mm256_xor_si256( v[d], v[a] ), 32 ); \
+   v[c] = _mm256_add_epi64( v[c], v[d] ); \
+   v[b] = mm256_ror_64( _mm256_xor_si256( v[b], v[c] ), 24 ); \
+   v[a] = _mm256_add_epi64( _mm256_add_epi64( v[a], v[b] ), (y) ); \
+   v[d] = mm256_ror_64( _mm256_xor_si256( v[d], v[a] ), 16 ); \
+   v[c] = _mm256_add_epi64( v[c], v[d] ); \
+   v[b] = mm256_ror_64( _mm256_xor_si256( v[b], v[c] ), 63 ); \
+} while (0)
+
+// Initialization Vector.
+/*
+static const uint64_t blake2b_iv[8] = {
+	0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
+	0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
+	0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
+	0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
+};
+*/
+
+/*
+ * Compress the buffered block ctx->b into the chain value ctx->h for all
+ * four lanes at once.
+ *
+ *   ctx  - context; b[] is the message block, t[] the byte counter that is
+ *          mixed into the state, h[] the chain value updated in place.
+ *   last - non-zero marks the final block by inverting v[14].
+ */
+static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
+{
+   // Message word permutation, one row per round.  static so the table is
+   // not rebuilt on the stack for every compressed block.
+   static const uint8_t sigma[12][16] = {
+      { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+      { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+      { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+      { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+      { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+      { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
+      { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
+      { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
+      { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
+      { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
+      { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+      { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
+   };
+   int i;
+   __m256i v[16], m[16];
+
+   // Working vector: chain value in v[0..7], IV in v[8..15].
+   v[ 0] = ctx->h[0];
+   v[ 1] = ctx->h[1];
+   v[ 2] = ctx->h[2];
+   v[ 3] = ctx->h[3];
+   v[ 4] = ctx->h[4];
+   v[ 5] = ctx->h[5];
+   v[ 6] = ctx->h[6];
+   v[ 7] = ctx->h[7];
+   v[ 8] = m256_const1_64( 0x6A09E667F3BCC908 );
+   v[ 9] = m256_const1_64( 0xBB67AE8584CAA73B );
+   v[10] = m256_const1_64( 0x3C6EF372FE94F82B );
+   v[11] = m256_const1_64( 0xA54FF53A5F1D36F1 );
+   v[12] = m256_const1_64( 0x510E527FADE682D1 );
+   v[13] = m256_const1_64( 0x9B05688C2B3E6C1F );
+   v[14] = m256_const1_64( 0x1F83D9ABFB41BD6B );
+   v[15] = m256_const1_64( 0x5BE0CD19137E2179 );
+
+   // Mix the byte counter, broadcast to every lane, into v[12] and v[13].
+   v[12] = _mm256_xor_si256( v[12], _mm256_set1_epi64x( ctx->t[0] ) );
+   v[13] = _mm256_xor_si256( v[13], _mm256_set1_epi64x( ctx->t[1] ) );
+
+   if ( last )
+      v[14] = mm256_not( v[14] );
+
+   for ( i = 0; i < 16; i++ )
+      m[i] = ctx->b[i];
+
+   // 12 rounds: four column steps, then four diagonal steps.
+   for ( i = 0; i < 12; i++ )
+   {
+      B2B_G( 0, 4,  8, 12, m[ sigma[i][ 0] ], m[ sigma[i][ 1] ] );
+      B2B_G( 1, 5,  9, 13, m[ sigma[i][ 2] ], m[ sigma[i][ 3] ] );
+      B2B_G( 2, 6, 10, 14, m[ sigma[i][ 4] ], m[ sigma[i][ 5] ] );
+      B2B_G( 3, 7, 11, 15, m[ sigma[i][ 6] ], m[ sigma[i][ 7] ] );
+      B2B_G( 0, 5, 10, 15, m[ sigma[i][ 8] ], m[ sigma[i][ 9] ] );
+      B2B_G( 1, 6, 11, 12, m[ sigma[i][10] ], m[ sigma[i][11] ] );
+      B2B_G( 2, 7,  8, 13, m[ sigma[i][12] ], m[ sigma[i][13] ] );
+      B2B_G( 3, 4,  9, 14, m[ sigma[i][14] ], m[ sigma[i][15] ] );
+   }
+
+   // Feed forward: h[i] ^= v[i] ^ v[i+8].
+   ctx->h[0] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[0], v[0] ), v[ 8] );
+   ctx->h[1] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[1], v[1] ), v[ 9] );
+   ctx->h[2] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[2], v[2] ), v[10] );
+   ctx->h[3] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[3], v[3] ), v[11] );
+   ctx->h[4] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[4], v[4] ), v[12] );
+   ctx->h[5] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[5], v[5] ), v[13] );
+   ctx->h[6] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[6], v[6] ), v[14] );
+   ctx->h[7] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[7], v[7] ), v[15] );
+}
+
+// Reset ctx for a new hash with a 32-byte digest per lane.  Returns 0.
+int blake2b_4way_init( blake2b_4way_ctx *ctx )
+{
+   // Start with an empty message buffer and zeroed counters.
+   for ( size_t word = 0; word < 16; word++ )
+      ctx->b[word] = m256_zero;
+   ctx->c = 0;
+   ctx->t[0] = ctx->t[1] = 0;
+   ctx->outlen = 32;
+
+   // Chain value = IV, with the parameter word 0x01010020 xored into h[0].
+   // NOTE(review): 0x01010020 reads as digest length 0x20, no key, fanout 1,
+   // depth 1 in the BLAKE2 parameter block -- confirm against the spec.
+   ctx->h[0] = _mm256_xor_si256( m256_const1_64( 0x6A09E667F3BCC908 ),
+                                 m256_const1_64( 0x01010020 ) );
+   ctx->h[1] = m256_const1_64( 0xBB67AE8584CAA73B );
+   ctx->h[2] = m256_const1_64( 0x3C6EF372FE94F82B );
+   ctx->h[3] = m256_const1_64( 0xA54FF53A5F1D36F1 );
+   ctx->h[4] = m256_const1_64( 0x510E527FADE682D1 );
+   ctx->h[5] = m256_const1_64( 0x9B05688C2B3E6C1F );
+   ctx->h[6] = m256_const1_64( 0x1F83D9ABFB41BD6B );
+   ctx->h[7] = m256_const1_64( 0x5BE0CD19137E2179 );
+
+   return 0;
+}
+
+// Absorb input, read as __m256i words (one 64-bit message word of each
+// lane per element).  inlen is bytes per lane; a partial last word is ignored.
+void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
+                          size_t inlen )
+{
+   const __m256i *in = (const __m256i*)input;
+
+   for ( size_t i = 0; i < (inlen >> 3); i++ )
+   {
+      if ( ctx->c == 128 )
+      {
+         ctx->t[0] += ctx->c;
+         if ( ctx->t[0] < ctx->c )
+            ctx->t[1]++;
+         blake2b_4way_compress( ctx, 0 );
+         ctx->c = 0;
+      }
+      // Index from ctx->c: the write position must restart after a compress.
+      ctx->b[ ctx->c >> 3 ] = in[i];
+      ctx->c += 8;
+   }
+}
+
+// Finish the hash.  The pending block is zero-padded and compressed with
+// the final-block flag, then h[0..3] is stored to out through casti_m256i
+// (32 bytes per lane, still interleaved by lane).
+void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
+{
+   // Add the pending byte count to the 128-bit counter, with carry.
+   ctx->t[0] += ctx->c;
+   ctx->t[1] += ( ctx->t[0] < ctx->c );
+
+   // Clear the unused tail of b[], advancing ctx->c up to the block size.
+   for ( size_t word = ctx->c >> 3; ctx->c < 128; ctx->c += 8 )
+      ctx->b[ word++ ] = m256_zero;
+
+   // last = 1 selects the final-block flag.
+   blake2b_4way_compress( ctx, 1 );
+
+   // The first four chain words form the digest.
+   casti_m256i( out, 0 ) = ctx->h[0];
+   casti_m256i( out, 1 ) = ctx->h[1];
+   casti_m256i( out, 2 ) = ctx->h[2];
+   casti_m256i( out, 3 ) = ctx->h[3];
+}
+
+#endif
--- a/algo/blake/blake2b-hash-4way.h
+++ b/algo/blake/blake2b-hash-4way.h
@@ -0,0 +1,35 @@
+#pragma once
+#ifndef __BLAKE2B_HASH_4WAY_H__
+#define __BLAKE2B_HASH_4WAY_H__
+
+#if defined(__AVX2__)
+
+#include "simd-utils.h"
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(_MSC_VER)
+#include <inttypes.h>
+#define inline __inline
+#define ALIGN(x) __declspec(align(x))
+#else
+#define ALIGN(x) __attribute__((aligned(x)))
+#endif
+
+// 4-lane hashing state: every __m256i holds one 64-bit word of each lane.
+ALIGN(64) typedef struct {
+	__m256i b[16]; // message block buffer, 16 words
+	__m256i h[8];  // chain value
+	uint64_t t[2];  // byte counter: t[0] low word, t[1] carry/high word
+	size_t c;       // bytes buffered in b[] per lane, 0..128
+	size_t outlen;  // digest size in bytes; init sets 32
+} blake2b_4way_ctx __attribute__((aligned(64)));
+
+int blake2b_4way_init( blake2b_4way_ctx *ctx );   // always returns 0
+void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
+                          size_t inlen );   // inlen: bytes per lane
+void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out ); // stores h[0..3]
+
+#endif
+
+#endif
--- a/algo/blake/blake2b.c
+++ b/algo/blake/blake2b.c
@@ -3,13 +3,11 @@
 * tpruvot@github 2015-2016
 */

-#include "algo-gate-api.h"
+#include "blake2b-gate.h"
 #include <string.h>
 #include <stdint.h>
 #include "algo/blake/sph_blake2b.h"

-//static __thread sph_blake2b_ctx s_midstate;
-//static __thread sph_blake2b_ctx s_ctx;
 #define MIDLEN 76
 #define A 64

@@ -25,26 +23,17 @@ void blake2b_hash(void *output, const void *input)
 	memcpy(output, hash, 32);
 }

-/*
-static void blake2b_hash_end(uint32_t *output, const uint32_t *input)
-{
-	s_ctx.outlen = MIDLEN;
-	memcpy(&s_ctx, &s_midstate, 32 + 16 + MIDLEN);
-	sph_blake2b_update(&s_ctx, (uint8_t*) &input[MIDLEN/4], 80 - MIDLEN);
-	sph_blake2b_final(&s_ctx, (uint8_t*) output);
-}
-*/
-
-int scanhash_blake2b( int thr_id, struct work *work, uint32_t max_nonce,
-                      uint64_t *hashes_done )
+int scanhash_blake2b( struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done, struct thr_info *mythr )
 {
 	uint32_t _ALIGN(A) vhashcpu[8];
 	uint32_t _ALIGN(A) endiandata[20];
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
+   int thr_id = mythr->id;  // thr_id arg is deprecated

 	const uint32_t Htarg = ptarget[7];
-	const uint32_t first_nonce = pdata[8];
+	const uint32_t first_nonce = pdata[19];

 	uint32_t n = first_nonce;

@@ -52,179 +41,23 @@ int scanhash_blake2b( int thr_id, struct work *work, uint32_t max_nonce,
 		be32enc(&endiandata[i], pdata[i]);
 	}

-	// midstate (untested yet)
-	//blake2b_init(&s_midstate, 32, NULL, 0);
-	//blake2b_update(&s_midstate, (uint8_t*) endiandata, MIDLEN);
-	//memcpy(&s_ctx, &s_midstate, sizeof(blake2b_ctx));
-
 	do {
-		be32enc(&endiandata[8], n);
+		be32enc(&endiandata[19], n);
 		//blake2b_hash_end(vhashcpu, endiandata);
 		blake2b_hash(vhashcpu, endiandata);

 		if (vhashcpu[7] < Htarg && fulltest(vhashcpu, ptarget)) {
 			work_set_target_ratio(work, vhashcpu);
 			*hashes_done = n - first_nonce + 1;
-			pdata[8] = n;
+			pdata[19] = n;
 			return 1;
 		}
 		n++;

 	} while (n < max_nonce && !work_restart[thr_id].restart);
 	*hashes_done = n - first_nonce + 1;
-	pdata[8] = n;
+	pdata[19] = n;

 	return 0;
 }

-static inline void swab256(void *dest_p, const void *src_p)
-{
-	uint32_t *dest = (uint32_t *)dest_p;
-	const uint32_t *src = (uint32_t *)src_p;
-
-	dest[0] = swab32(src[7]);
-	dest[1] = swab32(src[6]);
-	dest[2] = swab32(src[5]);
-	dest[3] = swab32(src[4]);
-	dest[4] = swab32(src[3]);
-	dest[5] = swab32(src[2]);
-	dest[6] = swab32(src[1]);
-	dest[7] = swab32(src[0]);
-}
-
-/* compute nbits to get the network diff */
-void blake2b_calc_network_diff(struct work *work)
-{
-        // sample for diff 43.281 : 1c05ea29
-        uint32_t nbits = work->data[11]; // unsure if correct
-        uint32_t bits = (nbits & 0xffffff);
-        int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28
-
-        double d = (double)0x0000ffff / (double)bits;
-        for (int m=shift; m < 29; m++) d *= 256.0;
-        for (int m=29; m < shift; m++) d /= 256.0;
-        if (opt_debug_diff)
-                applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits);
-        net_diff = d;
-}
-
-void blake2b_be_build_stratum_request( char *req, struct work *work )
-{
-   unsigned char *xnonce2str;
-   uint32_t ntime,       nonce;
-   char     ntimestr[9], noncestr[9];
-   be32enc( &ntime, work->data[ algo_gate.ntime_index ] );
-   be32enc( &nonce, work->data[ algo_gate.nonce_index ] );
-   bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
-   bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
-   uint16_t high_nonce = swab32(work->data[9]) >> 16;
-   xnonce2str = abin2hex((unsigned char*)(&high_nonce), 2);
-   snprintf( req, JSON_BUF_LEN,
-        "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
-         rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
-   free( xnonce2str );
-}
-
-#define min(a,b) (a>b ? (b) :(a))
-
-// merkle root handled here, no need for gen_merkle_root gate target
-void blake2b_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
-{
-    uchar merkle_root[64] = { 0 };
-    uint32_t extraheader[32] = { 0 };
-    int headersize = 0;
-    size_t t;
-    int i;
-
-    // merkle root
-    memcpy( merkle_root, sctx->job.coinbase, 32 );
-    headersize = min( (int)sctx->job.coinbase_size - 32, sizeof(extraheader) );
-    memcpy( extraheader, &sctx->job.coinbase[32], headersize );
-    // Increment extranonce2 
-    for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
-    // Assemble block header 
-    memset( g_work->data, 0, sizeof(g_work->data) );
-//    g_work->data[0] = le32dec( sctx->job.version );
-//    for ( i = 0; i < 8; i++ )
-//       g_work->data[1 + i] = le32dec( (uint32_t *) sctx->job.prevhash + i );
-    for ( i = 0; i < 8; i++ )
-       g_work->data[i] = ((uint32_t*)sctx->job.prevhash)[7-i];
-//    for ( i = 0; i < 8; i++ )
-//       g_work->data[9 + i] = be32dec( (uint32_t *) merkle_root + i );
-    g_work->data[8]  = 0; // nonce
-    g_work->data[9]  = swab32( extraheader[0] ) | ( rand() & 0xf0 );
-    g_work->data[10] = be32dec( sctx->job.ntime );
-    g_work->data[11] = be32dec( sctx->job.nbits );
-    for ( i = 0; i < 8; i++ )
-       g_work->data[12+i] = ( (uint32_t*)merkle_root )[i];
-}
-
-#undef min
-
-void blake2b_get_new_work( struct work* work, struct work* g_work, int thr_id,
-                           uint32_t* end_nonce_ptr, bool clean_job )
-{
-   const int wkcmp_sz = 32;  // bytes
-   const int wkcmp_off = 32 + 16; 
-   uint32_t *nonceptr = algo_gate.get_nonceptr( work->data );
-
-   if ( memcmp( &work->data[ wkcmp_off ], &g_work->data[ wkcmp_off ], wkcmp_sz )
-      && ( clean_job || ( *nonceptr >= *end_nonce_ptr ) 
-      || strcmp( work->job_id, g_work->job_id ) ) )
-   {
-      work_free( work );
-      work_copy( work, g_work );
-      *nonceptr = ( 0xffffffffU / opt_n_threads ) * thr_id;
-      if ( opt_randomize )
-         *nonceptr += ( (rand() *4 ) & UINT32_MAX ) / opt_n_threads;
-      *end_nonce_ptr = ( 0xffffffffU / opt_n_threads ) * (thr_id+1) - 0x20;
-   }
-   else
-       ++(*nonceptr);
-
-   // suprnova job_id check without data/target/height change...
-   // we just may have copied new g_wwork to work so why this test here?
-//   if (  have_stratum && strcmp( work->job_id, g_work->job_id ) )
-      // exit thread loop
-//      continue;
-//   else
-//   {
-//      nonceptr[1] += 0x10;
-//      nonceptr[1] |= thr_id;
-//   }
-}
-
-bool blake2b_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
-                           int thr_id )
-{
-   if ( have_stratum && strcmp( stratum->job.job_id, work->job_id ) )
-      // need to regen g_work..
-      return false;
-   // extradata: prevent duplicates
-   work->data[ 8     ] += 0x10;
-   work->data[ 8 + 1 ] |= thr_id;
-   return true;
-}
-
-double blake2b_get_max64() { return 0x1fffffLL; }
-
-bool register_blake2b_algo( algo_gate_t* gate )
-{
-  algo_not_tested();
-  gate->ntime_index   = 10;
-  gate->nbits_index   = 11;
-  gate->nonce_index   =  8;
-  gate->work_cmp_size = 32;
-  gate->scanhash              = (void*)&scanhash_blake2b;
-  gate->hash                  = (void*)&blake2b_hash;
-  gate->calc_network_diff     = (void*)&blake2b_calc_network_diff;
-  gate->build_stratum_request = (void*)&blake2b_be_build_stratum_request;
-  gate->work_decode           = (void*)&std_be_work_decode;
-  gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
-  gate->build_extraheader     = (void*)&blake2b_build_extraheader;
-  gate->get_new_work          = (void*)&blake2b_get_new_work;
-  gate->get_max64             = (void*)&blake2b_get_max64;
-  gate->ready_to_mine         = (void*)&blake2b_ready_to_mine;
-  have_gbt = false;
-  return true;
-}
--- a/algo/blake/blake2s-4way.c
+++ b/algo/blake/blake2s-4way.c
@@ -16,60 +16,49 @@ void blake2s_8way_hash( void *output, const void *input )
   blake2s_8way_update( &ctx, input + (64<<3), 16 );
   blake2s_8way_final( &ctx, vhash, BLAKE2S_OUTBYTES );

-   mm256_deinterleave_8x32( output,     output+ 32, output+ 64, output+ 96,
-                            output+128, output+160, output+192, output+224,
-                            vhash, 256 );
+   dintrlv_8x32( output,     output+ 32, output+ 64, output+ 96,
+                 output+128, output+160, output+192, output+224,
+                 vhash, 256 );
 }

-int scanhash_blake2s_8way( int thr_id, struct work *work, uint32_t max_nonce,
-                      uint64_t *hashes_done )
+int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
   uint32_t hash[8*8] __attribute__ ((aligned (32)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
-   uint32_t _ALIGN(64) edata[20];
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
+   __m256i  *noncev = (__m256i*)vdata + 19;   // aligned
   uint32_t n = first_nonce;
-   uint32_t *nonces = work->nonces;
-   int num_found = 0;
-   uint32_t *noncep = vdata + 152;   // 19*8
+   int thr_id = mythr->id;  // thr_id arg is deprecated

-   swab32_array( edata, pdata, 20 );
-   mm256_interleave_8x32( vdata, edata, edata, edata, edata,
-                                 edata, edata, edata, edata, 640 );
+   mm256_bswap32_intrlv80_8x32( vdata, pdata );
   blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
   blake2s_8way_update( &blake2s_8w_ctx, vdata, 64 );

   do {
-      be32enc( noncep,    n   );
-      be32enc( noncep +1, n+1 );
-      be32enc( noncep +2, n+2 );
-      be32enc( noncep +3, n+3 );
-      be32enc( noncep +4, n+4 );
-      be32enc( noncep +5, n+5 );
-      be32enc( noncep +6, n+6 );
-      be32enc( noncep +7, n+7 );
+      *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
+                                                  n+3, n+2, n+1, n ) );
      pdata[19] = n;

      blake2s_8way_hash( hash, vdata );


      for ( int i = 0; i < 8; i++ )
-      if (  (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
+      if (  (hash+(i<<3))[7] <= Htarg )
+      if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
      {
          pdata[19] = n+i;
-          nonces[ num_found++ ] = n+i;
-          work_set_target_ratio( work, hash+(i<<3) );
+          submit_lane_solution( work, hash+(i<<3), mythr, i );
      }
      n += 8;

-   } while ( (num_found == 0) && (n < max_nonce)
-             && !work_restart[thr_id].restart );
+   } while ( (n < max_nonce) && !work_restart[thr_id].restart );

   *hashes_done = n - first_nonce + 1;
-   return num_found;
+   return 0;
 }

 #elif defined(BLAKE2S_4WAY)
@@ -85,53 +74,46 @@ void blake2s_4way_hash( void *output, const void *input )
   blake2s_4way_update( &ctx, input + (64<<2), 16 );
   blake2s_4way_final( &ctx, vhash, BLAKE2S_OUTBYTES );

-   mm128_deinterleave_4x32( output, output+32, output+64, output+96,
+   dintrlv_4x32( output, output+32, output+64, output+96,
 		            vhash, 256 );
 }

-int scanhash_blake2s_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                      uint64_t *hashes_done )
+int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
   uint32_t hash[8*4] __attribute__ ((aligned (32)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
-   uint32_t _ALIGN(64) edata[20];
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
+   __m128i  *noncev = (__m128i*)vdata + 19;   // aligned
   uint32_t n = first_nonce;
-   uint32_t *nonces = work->nonces;
-   int num_found = 0;
-   uint32_t *noncep = vdata + 76;   // 19*4
+   int thr_id = mythr->id;  // thr_id arg is deprecated

-   swab32_array( edata, pdata, 20 );
-   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
+   mm128_bswap32_intrlv80_4x32( vdata, pdata );
   blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
   blake2s_4way_update( &blake2s_4w_ctx, vdata, 64 );

   do {
-      be32enc( noncep,    n   );
-      be32enc( noncep +1, n+1 );
-      be32enc( noncep +2, n+2 );
-      be32enc( noncep +3, n+3 );
+      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
      pdata[19] = n;

      blake2s_4way_hash( hash, vdata );

      for ( int i = 0; i < 4; i++ )
-      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
+      if ( (hash+(i<<3))[7] <= Htarg )
+      if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
      {
          pdata[19] = n+i;
-          nonces[ num_found++ ] = n+i;
-          work_set_target_ratio( work, hash+(i<<3) );
+          submit_lane_solution( work, hash+(i<<3), mythr, i );
      }
      n += 4;

-   } while ( (num_found == 0) && (n < max_nonce)
-             && !work_restart[thr_id].restart );
+   } while ( (n < max_nonce) && !work_restart[thr_id].restart );

   *hashes_done = n - first_nonce + 1;
-   return num_found;
+   return 0;
 }

 #endif
--- a/algo/blake/blake2s-gate.c
+++ b/algo/blake/blake2s-gate.c
@@ -20,7 +20,7 @@ bool register_blake2s_algo( algo_gate_t* gate )
  gate->hash      = (void*)&blake2s_hash;
 #endif
  gate->get_max64 = (void*)&blake2s_get_max64;
-  gate->optimizations = SSE42_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AVX2_OPT;
  return true;
 };

--- a/algo/blake/blake2s-gate.h
+++ b/algo/blake/blake2s-gate.h
@@ -4,7 +4,8 @@
 #include <stdint.h>
 #include "algo-gate-api.h"

-#if defined(__SSE4_2__)
+//#if defined(__SSE4_2__)
+#if defined(__SSE2__)
  #define BLAKE2S_4WAY
 #endif
 #if defined(__AVX2__)
@@ -16,19 +17,19 @@ bool register_blake2s_algo( algo_gate_t* gate );
 #if defined(BLAKE2S_8WAY)

 void blake2s_8way_hash( void *state, const void *input );
-int scanhash_blake2s_8way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
+int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );

 #elif defined (BLAKE2S_4WAY)

 void blake2s_4way_hash( void *state, const void *input );
-int scanhash_blake2s_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
+int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
 #else

 void blake2s_hash( void *state, const void *input );
-int scanhash_blake2s( int thr_id, struct work *work, uint32_t max_nonce,
-                      uint64_t *hashes_done );
+int scanhash_blake2s( struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done, struct thr_info *mythr );

 #endif

--- a/algo/blake/blake2s-hash-4way.c
+++ b/algo/blake/blake2s-hash-4way.c
@@ -17,7 +17,9 @@
 #include <string.h>
 #include <stdio.h>

-#if defined(__SSE4_2__)
+//#if defined(__SSE4_2__)
+#if defined(__SSE2__)
+

 static const uint32_t blake2s_IV[8] =
 {
@@ -57,8 +59,18 @@ int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen )
   memset( P->personal, 0, sizeof( P->personal ) );

   memset( S, 0, sizeof( blake2s_4way_state ) );
-   for( int i = 0; i < 8; ++i )
-      S->h[i] = _mm_set1_epi32( blake2s_IV[i] );
+
+   S->h[0] = m128_const1_64( 0x6A09E6676A09E667ULL );
+   S->h[1] = m128_const1_64( 0xBB67AE85BB67AE85ULL );
+   S->h[2] = m128_const1_64( 0x3C6EF3723C6EF372ULL );
+   S->h[3] = m128_const1_64( 0xA54FF53AA54FF53AULL );
+   S->h[4] = m128_const1_64( 0x510E527F510E527FULL );
+   S->h[5] = m128_const1_64( 0x9B05688C9B05688CULL );
+   S->h[6] = m128_const1_64( 0x1F83D9AB1F83D9ABULL );
+   S->h[7] = m128_const1_64( 0x5BE0CD195BE0CD19ULL );
+   
+//   for( int i = 0; i < 8; ++i )
+//      S->h[i] = _mm_set1_epi32( blake2s_IV[i] );

   uint32_t *p = ( uint32_t * )( P );

@@ -267,8 +279,18 @@ int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen )
   memset( P->personal, 0, sizeof( P->personal ) );

   memset( S, 0, sizeof( blake2s_8way_state ) );
-   for( int i = 0; i < 8; ++i )
-      S->h[i] = _mm256_set1_epi32( blake2s_IV[i] );
+   S->h[0] = m256_const1_64( 0x6A09E6676A09E667ULL );
+   S->h[1] = m256_const1_64( 0xBB67AE85BB67AE85ULL );
+   S->h[2] = m256_const1_64( 0x3C6EF3723C6EF372ULL );
+   S->h[3] = m256_const1_64( 0xA54FF53AA54FF53AULL );
+   S->h[4] = m256_const1_64( 0x510E527F510E527FULL );
+   S->h[5] = m256_const1_64( 0x9B05688C9B05688CULL );
+   S->h[6] = m256_const1_64( 0x1F83D9AB1F83D9ABULL );
+   S->h[7] = m256_const1_64( 0x5BE0CD195BE0CD19ULL );
+
+
+//   for( int i = 0; i < 8; ++i )
+//      S->h[i] = _mm256_set1_epi32( blake2s_IV[i] );

   uint32_t *p = ( uint32_t * )( P );

--- a/algo/blake/blake2s-hash-4way.h
+++ b/algo/blake/blake2s-hash-4way.h
@@ -14,9 +14,10 @@
 #ifndef __BLAKE2S_HASH_4WAY_H__
 #define __BLAKE2S_HASH_4WAY_H__ 1

-#if defined(__SSE4_2__)
+//#if defined(__SSE4_2__)
+#if defined(__SSE2__)

-#include "avxdefs.h"
+#include "simd-utils.h"

 #include <stddef.h>
 #include <stdint.h>
--- a/algo/blake/blake2s.c
+++ b/algo/blake/blake2s.c
@@ -32,14 +32,15 @@ static void blake2s_hash_end(uint32_t *output, const uint32_t *input)
 	blake2s_final(&s_ctx, (uint8_t*) output, BLAKE2S_OUTBYTES);
 }
 */
-int scanhash_blake2s(int thr_id, struct work *work,
-	uint32_t max_nonce, uint64_t *hashes_done)
+int scanhash_blake2s( struct work *work,
+	uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;

 	uint32_t _ALIGN(64) hash64[8];
 	uint32_t _ALIGN(64) endiandata[20];
+   int thr_id = mythr->id;  // thr_id arg is deprecated

 	const uint32_t Htarg = ptarget[7];
 	const uint32_t first_nonce = pdata[19];
--- a/algo/blake/blake512-hash-4way.c
+++ b/algo/blake/blake512-hash-4way.c
@@ -307,12 +307,12 @@ static const sph_u64 CB[16] = {

 #define GB_4WAY(m0, m1, c0, c1, a, b, c, d)   do { \
   a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
-                 _mm256_set_epi64x( c1, c1, c1, c1 ), m0 ), b ), a ); \
+                 _mm256_set1_epi64x( c1 ), m0 ), b ), a ); \
   d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
   c = _mm256_add_epi64( c, d ); \
   b = mm256_ror_64( _mm256_xor_si256( b, c ), 25 ); \
   a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
-                 _mm256_set_epi64x( c0, c0, c0, c0 ), m1 ), b ), a ); \
+                 _mm256_set1_epi64x( c0 ), m1 ), b ), a ); \
   d = mm256_ror_64( _mm256_xor_si256( d, a ), 16 ); \
   c = _mm256_add_epi64( c, d ); \
   b = mm256_ror_64( _mm256_xor_si256( b, c ), 11 ); \
@@ -412,18 +412,18 @@ static const sph_u64 CB[16] = {
 	V5 = H5; \
 	V6 = H6; \
 	V7 = H7; \
-        V8 = _mm256_xor_si256( S0, _mm256_set_epi64x( CB0, CB0, CB0, CB0 ) ); \
-        V9 = _mm256_xor_si256( S1, _mm256_set_epi64x( CB1, CB1, CB1, CB1 ) ); \
-        VA = _mm256_xor_si256( S2, _mm256_set_epi64x( CB2, CB2, CB2, CB2 ) ); \
-        VB = _mm256_xor_si256( S3, _mm256_set_epi64x( CB3, CB3, CB3, CB3 ) ); \
-        VC = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
-                               _mm256_set_epi64x( CB4, CB4, CB4, CB4 ) ); \
-        VD = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
-                               _mm256_set_epi64x( CB5, CB5, CB5, CB5 ) ); \
-        VE = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
-                               _mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
-        VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
-                               _mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
+   V8 = _mm256_xor_si256( S0, _mm256_set_epi64x( CB0, CB0, CB0, CB0 ) ); \
+   V9 = _mm256_xor_si256( S1, _mm256_set_epi64x( CB1, CB1, CB1, CB1 ) ); \
+   VA = _mm256_xor_si256( S2, _mm256_set_epi64x( CB2, CB2, CB2, CB2 ) ); \
+   VB = _mm256_xor_si256( S3, _mm256_set_epi64x( CB3, CB3, CB3, CB3 ) ); \
+   VC = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
+                          _mm256_set_epi64x( CB4, CB4, CB4, CB4 ) ); \
+   VD = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
+                          _mm256_set_epi64x( CB5, CB5, CB5, CB5 ) ); \
+   VE = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
+                          _mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
+   VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
+                          _mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
 	M[0x0] = mm256_bswap_64( *(buf+0) ); \
 	M[0x1] = mm256_bswap_64( *(buf+1) ); \
 	M[0x2] = mm256_bswap_64( *(buf+2) ); \
@@ -464,80 +464,76 @@ static const sph_u64 CB[16] = {

 //current impl

-#define COMPRESS64_4WAY   do { \
-     __m256i M0, M1, M2, M3, M4, M5, M6, M7; \
-     __m256i M8, M9, MA, MB, MC, MD, ME, MF; \
-     __m256i V0, V1, V2, V3, V4, V5, V6, V7; \
-     __m256i V8, V9, VA, VB, VC, VD, VE, VF; \
-     V0 = H0; \
-     V1 = H1; \
-     V2 = H2; \
-     V3 = H3; \
-     V4 = H4; \
-     V5 = H5; \
-     V6 = H6; \
-     V7 = H7; \
-     V8 = _mm256_xor_si256( S0, _mm256_set_epi64x( CB0, CB0, CB0, CB0 ) );  \
-     V9 = _mm256_xor_si256( S1, _mm256_set_epi64x( CB1, CB1, CB1, CB1 ) );  \
-     VA = _mm256_xor_si256( S2, _mm256_set_epi64x( CB2, CB2, CB2, CB2 ) );  \
-     VB = _mm256_xor_si256( S3, _mm256_set_epi64x( CB3, CB3, CB3, CB3 ) );  \
-     VC = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
-                            _mm256_set_epi64x( CB4, CB4, CB4, CB4 ) );  \
-     VD = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
-                            _mm256_set_epi64x( CB5, CB5, CB5, CB5 ) );  \
-     VE = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
-                            _mm256_set_epi64x( CB6, CB6, CB6, CB6 ) );  \
-     VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
-                            _mm256_set_epi64x( CB7, CB7, CB7, CB7 ) );  \
-     M0 = mm256_bswap_64( *(buf + 0) ); \
-     M1 = mm256_bswap_64( *(buf + 1) ); \
-     M2 = mm256_bswap_64( *(buf + 2) ); \
-     M3 = mm256_bswap_64( *(buf + 3) ); \
-     M4 = mm256_bswap_64( *(buf + 4) ); \
-     M5 = mm256_bswap_64( *(buf + 5) ); \
-     M6 = mm256_bswap_64( *(buf + 6) ); \
-     M7 = mm256_bswap_64( *(buf + 7) ); \
-     M8 = mm256_bswap_64( *(buf + 8) ); \
-     M9 = mm256_bswap_64( *(buf + 9) ); \
-     MA = mm256_bswap_64( *(buf + 10) ); \
-     MB = mm256_bswap_64( *(buf + 11) ); \
-     MC = mm256_bswap_64( *(buf + 12) ); \
-     MD = mm256_bswap_64( *(buf + 13) ); \
-     ME = mm256_bswap_64( *(buf + 14) ); \
-     MF = mm256_bswap_64( *(buf + 15) ); \
-     ROUND_B_4WAY(0); \
-     ROUND_B_4WAY(1); \
-     ROUND_B_4WAY(2); \
-     ROUND_B_4WAY(3); \
-     ROUND_B_4WAY(4); \
-     ROUND_B_4WAY(5); \
-     ROUND_B_4WAY(6); \
-     ROUND_B_4WAY(7); \
-     ROUND_B_4WAY(8); \
-     ROUND_B_4WAY(9); \
-     ROUND_B_4WAY(0); \
-     ROUND_B_4WAY(1); \
-     ROUND_B_4WAY(2); \
-     ROUND_B_4WAY(3); \
-     ROUND_B_4WAY(4); \
-     ROUND_B_4WAY(5); \
-     H0 = _mm256_xor_si256( _mm256_xor_si256( \
-                            _mm256_xor_si256( S0, V0 ), V8 ), H0 ); \
-     H1 = _mm256_xor_si256( _mm256_xor_si256( \
-                            _mm256_xor_si256( S1, V1 ), V9 ), H1 ); \
-     H2 = _mm256_xor_si256( _mm256_xor_si256( \
-                            _mm256_xor_si256( S2, V2 ), VA ), H2 ); \
-     H3 = _mm256_xor_si256( _mm256_xor_si256( \
-                            _mm256_xor_si256( S3, V3 ), VB ), H3 ); \
-     H4 = _mm256_xor_si256( _mm256_xor_si256( \
-                            _mm256_xor_si256( S0, V4 ), VC ), H4 ); \
-     H5 = _mm256_xor_si256( _mm256_xor_si256( \
-                            _mm256_xor_si256( S1, V5 ), VD ), H5 ); \
-     H6 = _mm256_xor_si256( _mm256_xor_si256( \
-                            _mm256_xor_si256( S2, V6 ), VE ), H6 ); \
-     H7 = _mm256_xor_si256( _mm256_xor_si256( \
-                            _mm256_xor_si256( S3, V7 ), VF ), H7 ); \
-	} while (0)
+#define COMPRESS64_4WAY   do \
+{ \
+  __m256i M0, M1, M2, M3, M4, M5, M6, M7; \
+  __m256i M8, M9, MA, MB, MC, MD, ME, MF; \
+  __m256i V0, V1, V2, V3, V4, V5, V6, V7; \
+  __m256i V8, V9, VA, VB, VC, VD, VE, VF; \
+  __m256i shuf_bswap64; \
+  V0 = H0; \
+  V1 = H1; \
+  V2 = H2; \
+  V3 = H3; \
+  V4 = H4; \
+  V5 = H5; \
+  V6 = H6; \
+  V7 = H7; \
+  V8 = _mm256_xor_si256( S0, m256_const1_64( CB0 ) );  \
+  V9 = _mm256_xor_si256( S1, m256_const1_64( CB1 ) );  \
+  VA = _mm256_xor_si256( S2, m256_const1_64( CB2 ) );  \
+  VB = _mm256_xor_si256( S3, m256_const1_64( CB3 ) );  \
+  VC = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
+                         m256_const1_64( CB4 ) );  \
+  VD = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
+                         m256_const1_64( CB5 ) );  \
+  VE = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
+                         m256_const1_64( CB6 ) );  \
+  VF = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
+                         m256_const1_64( CB7 ) );  \
+  shuf_bswap64 = m256_const_64( 0x08090a0b0c0d0e0f, 0x0001020304050607, \
+                                0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
+  M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
+  M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
+  M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
+  M3 = _mm256_shuffle_epi8( *(buf+ 3), shuf_bswap64 ); \
+  M4 = _mm256_shuffle_epi8( *(buf+ 4), shuf_bswap64 ); \
+  M5 = _mm256_shuffle_epi8( *(buf+ 5), shuf_bswap64 ); \
+  M6 = _mm256_shuffle_epi8( *(buf+ 6), shuf_bswap64 ); \
+  M7 = _mm256_shuffle_epi8( *(buf+ 7), shuf_bswap64 ); \
+  M8 = _mm256_shuffle_epi8( *(buf+ 8), shuf_bswap64 ); \
+  M9 = _mm256_shuffle_epi8( *(buf+ 9), shuf_bswap64 ); \
+  MA = _mm256_shuffle_epi8( *(buf+10), shuf_bswap64 ); \
+  MB = _mm256_shuffle_epi8( *(buf+11), shuf_bswap64 ); \
+  MC = _mm256_shuffle_epi8( *(buf+12), shuf_bswap64 ); \
+  MD = _mm256_shuffle_epi8( *(buf+13), shuf_bswap64 ); \
+  ME = _mm256_shuffle_epi8( *(buf+14), shuf_bswap64 ); \
+  MF = _mm256_shuffle_epi8( *(buf+15), shuf_bswap64 ); \
+  ROUND_B_4WAY(0); \
+  ROUND_B_4WAY(1); \
+  ROUND_B_4WAY(2); \
+  ROUND_B_4WAY(3); \
+  ROUND_B_4WAY(4); \
+  ROUND_B_4WAY(5); \
+  ROUND_B_4WAY(6); \
+  ROUND_B_4WAY(7); \
+  ROUND_B_4WAY(8); \
+  ROUND_B_4WAY(9); \
+  ROUND_B_4WAY(0); \
+  ROUND_B_4WAY(1); \
+  ROUND_B_4WAY(2); \
+  ROUND_B_4WAY(3); \
+  ROUND_B_4WAY(4); \
+  ROUND_B_4WAY(5); \
+  H0 = mm256_xor4( V8, V0, S0, H0 ); \
+  H1 = mm256_xor4( V9, V1, S1, H1 ); \
+  H2 = mm256_xor4( VA, V2, S2, H2 ); \
+  H3 = mm256_xor4( VB, V3, S3, H3 ); \
+  H4 = mm256_xor4( VC, V4, S0, H4 ); \
+  H5 = mm256_xor4( VD, V5, S1, H5 ); \
+  H6 = mm256_xor4( VE, V6, S2, H6 ); \
+  H7 = mm256_xor4( VF, V7, S3, H7 ); \
+} while (0)

 #endif

@@ -547,13 +543,23 @@ static void
 blake64_4way_init( blake_4way_big_context *sc, const sph_u64 *iv,
              const sph_u64 *salt )
 {
-        int i;
-        for ( i = 0; i < 8; i++ )
-           sc->H[i] = _mm256_set1_epi64x( iv[i] );
-        for ( i = 0; i < 4; i++ )
-           sc->S[i] = _mm256_set1_epi64x( salt[i] );
-        sc->T0 = sc->T1 = 0;
-        sc->ptr = 0;
+   __m256i zero = m256_zero;
+   casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
+   casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE8584CAA73B );
+   casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF372FE94F82B );
+   casti_m256i( sc->H, 3 ) = m256_const1_64( 0xA54FF53A5F1D36F1 );
+   casti_m256i( sc->H, 4 ) = m256_const1_64( 0x510E527FADE682D1 );
+   casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F );
+   casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
+   casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
+
+   casti_m256i( sc->S, 0 ) = zero;
+   casti_m256i( sc->S, 1 ) = zero;
+   casti_m256i( sc->S, 2 ) = zero;
+   casti_m256i( sc->S, 3 ) = zero;
+
+   sc->T0 = sc->T1 = 0;
+   sc->ptr = 0;
 }

 static void
@@ -604,15 +610,11 @@ static void
 blake64_4way_close( blake_4way_big_context *sc,
 	unsigned ub, unsigned n, void *dst, size_t out_size_w64)
 {
-//   union {
-      __m256i buf[16];
-//      sph_u64 dummy;
-//   } u;
-   size_t ptr, k;
+   __m256i buf[16];
+   size_t ptr;
   unsigned bit_len;
   uint64_t z, zz;
   sph_u64 th, tl;
-   __m256i *out;

   ptr = sc->ptr;
   bit_len = ((unsigned)ptr << 3);
@@ -640,11 +642,9 @@ blake64_4way_close( blake_4way_big_context *sc,
       memset_zero_256( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
       if ( out_size_w64 == 8 )
          buf[(104>>3)] = _mm256_or_si256( buf[(104>>3)],
-                                 _mm256_set1_epi64x( 0x0100000000000000ULL ) );
-       *(buf+(112>>3)) = mm256_bswap_64(
-                                    _mm256_set_epi64x( th, th, th, th ) );
-       *(buf+(120>>3)) = mm256_bswap_64(
-                                    _mm256_set_epi64x( tl, tl, tl, tl ) );
+                                 m256_const1_64( 0x0100000000000000ULL ) );
+       *(buf+(112>>3)) = _mm256_set1_epi64x( bswap_64( th ) );
+       *(buf+(120>>3)) = _mm256_set1_epi64x( bswap_64( tl ) );

       blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
   }
@@ -657,17 +657,13 @@ blake64_4way_close( blake_4way_big_context *sc,
       sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
       memset_zero_256( buf, 112>>3 ); 
       if ( out_size_w64 == 8 )
-           buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
-       *(buf+(112>>3)) = mm256_bswap_64(
-                                    _mm256_set_epi64x( th, th, th, th ) );
-       *(buf+(120>>3)) = mm256_bswap_64(
-                                    _mm256_set_epi64x( tl, tl, tl, tl ) );
+           buf[104>>3] = m256_const1_64( 0x0100000000000000ULL );
+       *(buf+(112>>3)) = _mm256_set1_epi64x( bswap_64( th ) );
+       *(buf+(120>>3)) = _mm256_set1_epi64x( bswap_64( tl ) );

       blake64_4way( sc, buf, 128 );
   }
-   out = (__m256i*)dst;
-   for ( k = 0; k < out_size_w64; k++ )
-       out[k] = mm256_bswap_64( sc->H[k] );
+   mm256_block_bswap_64( (__m256i*)dst, sc->H );
 }

 void
--- a/algo/blake/blakecoin-4way.c
+++ b/algo/blake/blakecoin-4way.c
@@ -17,11 +17,11 @@ void blakecoin_4way_hash(void *state, const void *input)
     blake256r8_4way( &ctx, input + (64<<2), 16 );
     blake256r8_4way_close( &ctx, vhash );

-     mm128_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
+     dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
 }

-int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done )
+int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
   uint32_t hash[8*4] __attribute__ ((aligned (32)));
@@ -29,41 +29,34 @@ int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   uint32_t HTarget = ptarget[7];
-   uint32_t _ALIGN(32) edata[20];
   uint32_t n = first_nonce;
-   uint32_t *nonces = work->nonces;
-   int num_found = 0;
+   __m128i  *noncev = (__m128i*)vdata + 19;   // aligned
+   int thr_id = mythr->id;  // thr_id arg is deprecated
   if ( opt_benchmark )
      HTarget = 0x7f;

-   swab32_array( edata, pdata, 20 );
-   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
+   mm128_bswap32_intrlv80_4x32( vdata, pdata );
   blake256r8_4way_init( &blakecoin_4w_ctx );
   blake256r8_4way( &blakecoin_4w_ctx, vdata, 64 );

-   uint32_t *noncep = vdata + 76;   // 19*4
   do {
-      be32enc( noncep,    n   );
-      be32enc( noncep +1, n+1 );
-      be32enc( noncep +2, n+2 );
-      be32enc( noncep +3, n+3 );
+      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
      pdata[19] = n;
      blakecoin_4way_hash( hash, vdata );

      for ( int i = 0; i < 4; i++ )
-      if (  (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) )
+      if (  (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget )
+           && !opt_benchmark )
      {
          pdata[19] = n+i;
-          nonces[ num_found++ ] = n+i;
-          work_set_target_ratio( work, hash+(i<<3) );
+          submit_lane_solution( work, hash+(i<<3), mythr, i );
      }
      n += 4;

-   } while ( (num_found == 0) && (n < max_nonce) 
-             && !work_restart[thr_id].restart );
+   } while ( (n < max_nonce) && !work_restart[thr_id].restart );

   *hashes_done = n - first_nonce + 1;
-   return num_found;
+   return 0;
 }

 #endif
@@ -81,13 +74,12 @@ void blakecoin_8way_hash( void *state, const void *input )
     blake256r8_8way( &ctx, input + (64<<3), 16 );
     blake256r8_8way_close( &ctx, vhash );

-     mm256_deinterleave_8x32( state,     state+ 32, state+ 64, state+ 96,
-                              state+128, state+160, state+192, state+224,
-                              vhash, 256 );
+     dintrlv_8x32( state,     state+ 32, state+ 64, state+ 96, state+128,
+                   state+160, state+192, state+224, vhash, 256 );
 }

-int scanhash_blakecoin_8way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done )
+int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
   uint32_t hash[8*8] __attribute__ ((aligned (32)));
@@ -95,46 +87,34 @@ int scanhash_blakecoin_8way( int thr_id, struct work *work, uint32_t max_nonce,
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   uint32_t HTarget = ptarget[7];
-   uint32_t _ALIGN(32) edata[20];
   uint32_t n = first_nonce;
-   uint32_t *nonces = work->nonces;
-   uint32_t *noncep = vdata + 152;   // 19*8
-   int num_found = 0;
+   __m256i  *noncev = (__m256i*)vdata + 19;   // aligned
+   int thr_id = mythr->id;  // thr_id arg is deprecated
   if ( opt_benchmark )
      HTarget = 0x7f;

-   // we need big endian data...
-   swab32_array( edata, pdata, 20 );
-   mm256_interleave_8x32( vdata, edata, edata, edata, edata,
-                                 edata, edata, edata, edata, 640 );
+   mm256_bswap32_intrlv80_8x32( vdata, pdata );
   blake256r8_8way_init( &blakecoin_8w_ctx );
   blake256r8_8way( &blakecoin_8w_ctx, vdata, 64 );

   do {
-      be32enc( noncep,    n   );
-      be32enc( noncep +1, n+1 );
-      be32enc( noncep +2, n+2 );
-      be32enc( noncep +3, n+3 );
-      be32enc( noncep +4, n+4 );
-      be32enc( noncep +5, n+5 );
-      be32enc( noncep +6, n+6 );
-      be32enc( noncep +7, n+7 );
+      *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
+                                                  n+3, n+2, n+1, n ) );
      pdata[19] = n;
      blakecoin_8way_hash( hash, vdata );

      for ( int i = 0; i < 8; i++ )
-      if (  (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) )
+      if (  (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget )
+          && !opt_benchmark )
      {
          pdata[19] = n+i;
-          nonces[ num_found++ ] = n+i;
-          work_set_target_ratio( work, hash+(i<<3) );
+          submit_lane_solution( work, hash+(i<<3), mythr, i );
      }
      n += 8;
-   } while ( (num_found == 0) && (n < max_nonce)
-             && !work_restart[thr_id].restart );
+   } while ( (n < max_nonce) && !work_restart[thr_id].restart );

   *hashes_done = n - first_nonce + 1;
-   return num_found;
+   return 0;
 }

 #endif
--- a/algo/blake/blakecoin-gate.h
+++ b/algo/blake/blakecoin-gate.h
@@ -13,18 +13,18 @@

 #if defined (BLAKECOIN_8WAY)
 void blakecoin_8way_hash(void *state, const void *input);
-int scanhash_blakecoin_8way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
+int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
 #endif

 #if defined (BLAKECOIN_4WAY)
 void blakecoin_4way_hash(void *state, const void *input);
-int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
+int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
 #endif

 void blakecoinhash( void *state, const void *input );
-int scanhash_blakecoin( int thr_id, struct work *work, uint32_t max_nonce,
-                      uint64_t *hashes_done );
+int scanhash_blakecoin( struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done, struct thr_info *mythr );

 #endif
--- a/algo/blake/blakecoin.c
+++ b/algo/blake/blakecoin.c
@@ -39,13 +39,14 @@ void blakecoinhash( void *state, const void *input )
 	memcpy( state, hash, 32 );
 }

-int scanhash_blakecoin( int thr_id, struct work *work, uint32_t max_nonce,
-                        uint64_t *hashes_done )
+int scanhash_blakecoin( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;
 	const uint32_t first_nonce = pdata[19];
 	uint32_t HTarget = ptarget[7];
+   int thr_id = mythr->id;  // thr_id arg is deprecated

 	uint32_t _ALIGN(32) hash64[8];
 	uint32_t _ALIGN(32) endiandata[20];
--- a/algo/blake/decred-4way.c
+++ b/algo/blake/decred-4way.c
@@ -23,11 +23,11 @@ void decred_hash_4way( void *state, const void *input )
     memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
     blake256_4way( &ctx, tail, tail_len );
     blake256_4way_close( &ctx, vhash );
-     mm128_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
+     dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
 }

-int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                          uint64_t *hashes_done)
+int scanhash_decred_4way( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t vdata[48*4] __attribute__ ((aligned (64)));
   uint32_t hash[8*4] __attribute__ ((aligned (32)));
@@ -37,14 +37,13 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
   const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
   uint32_t n = first_nonce;
   const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
-   uint32_t *nonces = work->nonces;
-   int num_found = 0;
+   int thr_id = mythr->id;  // thr_id arg is deprecated

   // copy to buffer guaranteed to be aligned.
   memcpy( edata, pdata, 180 );

   // use the old way until  new way updated for size.
-   mm128_interleave_4x32x( vdata, edata, edata, edata, edata, 180*8 );
+   mm128_intrlv_4x32x( vdata, edata, edata, edata, edata, 180*8 );

   blake256_4way_init( &blake_mid );
   blake256_4way( &blake_mid, vdata, DECRED_MIDSTATE_LEN );
@@ -59,18 +58,17 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
      decred_hash_4way( hash, vdata );

      for ( int i = 0; i < 4; i++ )
-      if (  (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) )
+      if (  (hash+(i<<3))[7] <= HTarget )
+      if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
      {
          pdata[DECRED_NONCE_INDEX] = n+i;
-          nonces[ num_found++ ] = n+i;
-          work_set_target_ratio( work, hash+(i<<3) );
+          submit_lane_solution( work, hash+(i<<3), mythr, i );
      }
      n += 4;
-  } while ( (num_found == 0) && (n < max_nonce) 
-            && !work_restart[thr_id].restart );
+  } while ( (n < max_nonce) && !work_restart[thr_id].restart );

  *hashes_done = n - first_nonce + 1;
-  return num_found;
+  return 0;
 }

 #endif
--- a/algo/blake/decred-gate.h
+++ b/algo/blake/decred-gate.h
@@ -14,7 +14,7 @@

 #if defined (__AVX2__) 
 //void blakehash_84way(void *state, const void *input);
-//int scanhash_blake_8way( int thr_id, struct work *work, uint32_t max_nonce,
+//int scanhash_blake_8way( struct work *work, uint32_t max_nonce,
 //                         uint64_t *hashes_done );
 #endif

@@ -24,13 +24,13 @@

 #if defined (DECRED_4WAY)
 void decred_hash_4way(void *state, const void *input);
-int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                          uint64_t *hashes_done );
+int scanhash_decred_4way( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr );
 #endif

 void decred_hash( void *state, const void *input );
-int scanhash_decred( int thr_id, struct work *work, uint32_t max_nonce,
-                     uint64_t *hashes_done );
+int scanhash_decred( struct work *work, uint32_t max_nonce,
+                     uint64_t *hashes_done, struct thr_info *mythr );

 #endif

--- a/algo/blake/decred.c
+++ b/algo/blake/decred.c
@@ -52,12 +52,14 @@ void decred_hash_simple(void *state, const void *input)
        sph_blake256_close(&ctx, state);
 }

-int scanhash_decred(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
+int scanhash_decred( struct work *work, uint32_t max_nonce,
+               uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t _ALIGN(64) endiandata[48];
        uint32_t _ALIGN(64) hash32[8];
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;
+   int thr_id = mythr->id;  // thr_id arg is deprecated

 //        #define DCR_NONCE_OFT32 35

--- a/algo/blake/pentablake-4way.c
+++ b/algo/blake/pentablake-4way.c
@@ -10,13 +10,8 @@
 #include "blake-hash-4way.h"
 #include "sph_blake.h"

-//#define DEBUG_ALGO
-
 extern void pentablakehash_4way( void *output, const void *input )
 {
-	unsigned char _ALIGN(32) hash[128];
-//	// same as uint32_t hashA[16], hashB[16];
-//	#define hashB hash+64

     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
@@ -30,21 +25,6 @@ extern void pentablakehash_4way( void *output, const void *input )
     blake512_4way( &ctx, input, 80 );
     blake512_4way_close( &ctx, vhash );

-uint64_t sin0[10], sin1[10], sin2[10], sin3[10];
-mm256_deinterleave_4x64( sin0, sin1, sin2, sin3, input, 640 );
-sph_blake512_context ctx2_blake;
-sph_blake512_init(&ctx2_blake);
-sph_blake512(&ctx2_blake, sin0, 80);
-sph_blake512_close(&ctx2_blake, (void*) hash);
-
-mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
-uint64_t* hash64 = (uint64_t*)hash;
-for( int i = 0; i < 8; i++ )
-{
-   if ( hash0[i] != hash64[i] )
-      printf("hash mismatch %u\n",i);
-}
-
     blake512_4way_init( &ctx );
     blake512_4way( &ctx, vhash, 64 );
     blake512_4way_close( &ctx, vhash );
@@ -61,46 +41,19 @@ for( int i = 0; i < 8; i++ )
      blake512_4way( &ctx, vhash, 64 );
      blake512_4way_close( &ctx, vhash );

-     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     // Split the 4x64 interleaved result into one hash per lane. Without this
+     // the memcpy calls below copy hash0..hash3 before they are ever written.
+     // NOTE(review): dintrlv_4x64 is assumed to be the renamed
+     // mm256_deinterleave_4x64 (cf. intrlv_4x64, dintrlv_4x32) - confirm.
+     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     memcpy( output,    hash0, 32 );
     memcpy( output+32, hash1, 32 );
     memcpy( output+64, hash2, 32 );
     memcpy( output+96, hash3, 32 );
-
-/*
-     uint64_t sin0[10] __attribute__ ((aligned (64)));
-     uint64_t sin1[10] __attribute__ ((aligned (64)));
-     uint64_t sin2[10] __attribute__ ((aligned (64)));
-     uint64_t sin3[10] __attribute__ ((aligned (64)));
-
-	sph_blake512_context     ctx_blake;
-
-	sph_blake512_init(&ctx_blake);
-	sph_blake512(&ctx_blake, input, 80);
-	sph_blake512_close(&ctx_blake, hash);
-
-        sph_blake512_init(&ctx_blake);
-	sph_blake512(&ctx_blake, hash, 64);
-	sph_blake512_close(&ctx_blake, hash);
-
-        sph_blake512_init(&ctx_blake);
-	sph_blake512(&ctx_blake, hash, 64);
-	sph_blake512_close(&ctx_blake, hash);
-
-        sph_blake512_init(&ctx_blake);
-	sph_blake512(&ctx_blake, hash, 64);
-	sph_blake512_close(&ctx_blake, hash);
-
-        sph_blake512_init(&ctx_blake);
-	sph_blake512(&ctx_blake, hash, 64);
-	sph_blake512_close(&ctx_blake, hash);
-
-	memcpy(output, hash, 32);
-*/
 }

-int scanhash_pentablake_4way( int thr_id, struct work *work,
-                              uint32_t max_nonce, uint64_t *hashes_done )
+int scanhash_pentablake_4way( struct work *work,
+      uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
 {
    uint32_t hash[4*8] __attribute__ ((aligned (64)));
    uint32_t vdata[20*4] __attribute__ ((aligned (64)));
@@ -110,9 +58,8 @@ int scanhash_pentablake_4way( int thr_id, struct work *work,
    uint32_t n = pdata[19] - 1;
    const uint32_t first_nonce = pdata[19];
    const uint32_t Htarg = ptarget[7];
-    uint32_t *nonces = work->nonces;
-    int num_found = 0;
    uint32_t *noncep = vdata + 73;   // 9*8 + 1
+    int thr_id = mythr->id;  // thr_id arg is deprecated

 //    uint32_t _ALIGN(32) hash64[8];
 //    uint32_t _ALIGN(32) endiandata[32];
@@ -138,7 +85,7 @@ int scanhash_pentablake_4way( int thr_id, struct work *work,
    swab32_array( endiandata, pdata, 20 );

    uint64_t *edata = (uint64_t*)endiandata;
-    mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+    intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );

    for ( int m=0; m < 6; m++ )
    {
@@ -155,10 +102,10 @@ int scanhash_pentablake_4way( int thr_id, struct work *work,

              for ( int i = 0; i < 4; i++ )
              if ( !( (hash+(i<<3))[7] & mask )
-                  && fulltest( hash+(i<<3), ptarget ) )
+                  && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
              {
-                 nonces[ num_found++ ] = n+i;
-                 work_set_target_ratio( work, hash+(i<<3) );
+                 pdata[19] = n + i;
+                 submit_lane_solution( work, hash+(i<<3), mythr, i );
              }
              n += 4;

--- a/algo/blake/pentablake-gate.h
+++ b/algo/blake/pentablake-gate.h
@@ -10,12 +10,12 @@

 #if defined(PENTABLAKE_4WAY)
 void pentablakehash_4way( void *state, const void *input );
-int scanhash_pentablake_4way( int thr_id, struct work *work,
-                              uint32_t max_nonce, uint64_t *hashes_done );
+int scanhash_pentablake_4way( struct work *work,
+           uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr );
 #endif

 void pentablakehash( void *state, const void *input );
-int scanhash_pentablake( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
+int scanhash_pentablake( struct work *work, uint32_t max_nonce,
+            uint64_t *hashes_done, struct thr_info *mythr );
 #endif

--- a/algo/blake/pentablake.c
+++ b/algo/blake/pentablake.c
@@ -40,8 +40,8 @@ extern void pentablakehash(void *output, const void *input)

 }

-int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_nonce,
-      uint64_t *hashes_done)
+int scanhash_pentablake( struct work *work, uint32_t max_nonce,
+      uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;
@@ -49,6 +49,7 @@ int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_nonce,
 	uint32_t n = pdata[19] - 1;
 	const uint32_t first_nonce = pdata[19];
 	const uint32_t Htarg = ptarget[7];
+   int thr_id = mythr->id;  // thr_id arg is deprecated

 	uint32_t _ALIGN(32) hash64[8];
 	uint32_t _ALIGN(32) endiandata[32];
--- a/algo/blake/sph_blake2b.c
+++ b/algo/blake/sph_blake2b.c
@@ -103,7 +103,6 @@ static void blake2b_compress( sph_blake2b_ctx *ctx, int last )
 	v[13] ^= ctx->t[1];                 // high 64 bits
 	if (last)                           // last block flag set ?
 		v[14] = ~v[14];
-
 	for (i = 0; i < 16; i++)            // get little-endian words
 		m[i] = B2B_GET64(&ctx->b[8 * i]);

@@ -184,7 +183,8 @@ void sph_blake2b_final( sph_blake2b_ctx *ctx, void *out )

 	while (ctx->c < 128)                // fill up with zeros
 		ctx->b[ctx->c++] = 0;
-	blake2b_compress(ctx, 1);           // final block flag = 1
+
+   blake2b_compress(ctx, 1);           // final block flag = 1

 	// little endian convert and store
 	for (i = 0; i < ctx->outlen; i++) {
--- a/algo/bmw/bmw-hash-4way.h
+++ b/algo/bmw/bmw-hash-4way.h
@@ -43,7 +43,7 @@ extern "C"{
 #include <stddef.h>

 #include "algo/sha/sph_types.h"
-#include "avxdefs.h"
+#include "simd-utils.h"

 #define SPH_SIZE_bmw256   256

@@ -62,7 +62,7 @@ typedef struct {

 typedef bmw_4way_small_context bmw256_4way_context;

-void bmw256_4way_init(void *cc);
+void bmw256_4way_init( bmw256_4way_context *ctx );

 void bmw256_4way(void *cc, const void *data, size_t len);

--- a/algo/bmw/bmw256-hash-4way.c
+++ b/algo/bmw/bmw256-hash-4way.c
@@ -48,7 +48,7 @@ extern "C"{
 #if defined(__SSE2__)

 // BMW-256 4 way 32
-
+/*
 static const uint32_t IV256[] = {
 	0x40414243, 0x44454647,
 	0x48494A4B, 0x4C4D4E4F,
@@ -59,6 +59,7 @@ static const uint32_t IV256[] = {
 	0x70717273, 0x74757677,
 	0x78797A7B, 0x7C7D7E7F
 };
+*/

 #define ss0(x) \
   _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \
@@ -113,50 +114,27 @@ static const uint32_t IV256[] = {


 #define expand1s( qt, M, H, i ) \
-   _mm_add_epi32( \
-      _mm_add_epi32( \
-         _mm_add_epi32( \
-             _mm_add_epi32( \
-                _mm_add_epi32( ss1( qt[ (i)-16 ] ), \
-                               ss2( qt[ (i)-15 ] ) ), \
-                _mm_add_epi32( ss3( qt[ (i)-14 ] ), \
-                               ss0( qt[ (i)-13 ] ) ) ), \
-             _mm_add_epi32( \
-                _mm_add_epi32( ss1( qt[ (i)-12 ] ), \
-                               ss2( qt[ (i)-11 ] ) ), \
-                _mm_add_epi32( ss3( qt[ (i)-10 ] ), \
-                               ss0( qt[ (i)- 9 ] ) ) ) ), \
-         _mm_add_epi32( \
-             _mm_add_epi32( \
-                _mm_add_epi32( ss1( qt[ (i)- 8 ] ), \
-                               ss2( qt[ (i)- 7 ] ) ), \
-                _mm_add_epi32( ss3( qt[ (i)- 6 ] ), \
-                               ss0( qt[ (i)- 5 ] ) ) ), \
-             _mm_add_epi32( \
-                _mm_add_epi32( ss1( qt[ (i)- 4 ] ), \
-                               ss2( qt[ (i)- 3 ] ) ), \
-                _mm_add_epi32( ss3( qt[ (i)- 2 ] ), \
-                               ss0( qt[ (i)- 1 ] ) ) ) ) ), \
+   _mm_add_epi32(  mm128_add4_32( \
+            mm128_add4_32( ss1( qt[ (i)-16 ] ), ss2( qt[ (i)-15 ] ), \
+                           ss3( qt[ (i)-14 ] ), ss0( qt[ (i)-13 ] ) ), \
+            mm128_add4_32( ss1( qt[ (i)-12 ] ), ss2( qt[ (i)-11 ] ), \
+                           ss3( qt[ (i)-10 ] ), ss0( qt[ (i)- 9 ] ) ), \
+            mm128_add4_32( ss1( qt[ (i)- 8 ] ), ss2( qt[ (i)- 7 ] ), \
+                           ss3( qt[ (i)- 6 ] ), ss0( qt[ (i)- 5 ] ) ),  \
+            mm128_add4_32( ss1( qt[ (i)- 4 ] ), ss2( qt[ (i)- 3 ] ), \
+                           ss3( qt[ (i)- 2 ] ), ss0( qt[ (i)- 1 ] ) ) ), \
      add_elt_s( M, H, (i)-16 ) )

 #define expand2s( qt, M, H, i) \
-   _mm_add_epi32( \
-      _mm_add_epi32( \
-         _mm_add_epi32( \
-             _mm_add_epi32( \
-                _mm_add_epi32( qt[ (i)-16 ], rs1( qt[ (i)-15 ] ) ), \
-                _mm_add_epi32( qt[ (i)-14 ], rs2( qt[ (i)-13 ] ) ) ), \
-             _mm_add_epi32( \
-                _mm_add_epi32( qt[ (i)-12 ], rs3( qt[ (i)-11 ] ) ), \
-                _mm_add_epi32( qt[ (i)-10 ], rs4( qt[ (i)- 9 ] ) ) ) ), \
-         _mm_add_epi32( \
-             _mm_add_epi32( \
-                _mm_add_epi32( qt[ (i)- 8 ], rs5( qt[ (i)- 7 ] ) ), \
-                _mm_add_epi32( qt[ (i)- 6 ], rs6( qt[ (i)- 5 ] ) ) ), \
-             _mm_add_epi32( \
-                _mm_add_epi32( qt[ (i)- 4 ], rs7( qt[ (i)- 3 ] ) ), \
-                _mm_add_epi32( ss4( qt[ (i)- 2 ] ), \
-                               ss5( qt[ (i)- 1 ] ) ) ) ) ), \
+   _mm_add_epi32( mm128_add4_32( \
+            mm128_add4_32( qt[ (i)-16 ], rs1( qt[ (i)-15 ] ), \
+                           qt[ (i)-14 ], rs2( qt[ (i)-13 ] ) ), \
+            mm128_add4_32( qt[ (i)-12 ], rs3( qt[ (i)-11 ] ), \
+                           qt[ (i)-10 ], rs4( qt[ (i)- 9 ] ) ), \
+            mm128_add4_32( qt[ (i)- 8 ], rs5( qt[ (i)- 7 ] ), \
+                           qt[ (i)- 6 ], rs6( qt[ (i)- 5 ] ) ), \
+            mm128_add4_32( qt[ (i)- 4 ], rs7( qt[ (i)- 3 ] ), \
+                           ss4( qt[ (i)- 2 ] ), ss5( qt[ (i)- 1 ] ) ) ), \
      add_elt_s( M, H, (i)-16 ) )

 #define Ws0 \
@@ -357,17 +335,11 @@ void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
   qt[30] = expand2s( qt, M, H, 30 );
   qt[31] = expand2s( qt, M, H, 31 );

-   xl = _mm_xor_si128(
-              _mm_xor_si128( _mm_xor_si128( qt[16], qt[17] ),
-                             _mm_xor_si128( qt[18], qt[19] ) ),
-              _mm_xor_si128( _mm_xor_si128( qt[20], qt[21] ),
-                             _mm_xor_si128( qt[22], qt[23] ) ) );
-   xh = _mm_xor_si128( xl,
-             _mm_xor_si128(
-                 _mm_xor_si128( _mm_xor_si128( qt[24], qt[25] ),
-                                   _mm_xor_si128( qt[26], qt[27] ) ),
-                 _mm_xor_si128( _mm_xor_si128( qt[28], qt[29] ),
-                                   _mm_xor_si128( qt[30], qt[31] ) )));
+   xl = _mm_xor_si128( mm128_xor4( qt[16], qt[17], qt[18], qt[19] ),
+                       mm128_xor4( qt[20], qt[21], qt[22], qt[23] ) );
+   xh = _mm_xor_si128( xl, _mm_xor_si128(
+                             mm128_xor4( qt[24], qt[25], qt[26], qt[27] ),
+                             mm128_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );

   dH[ 0] = _mm_add_epi32(
                 _mm_xor_si128( M[0],
@@ -491,13 +463,30 @@ static const __m128i final_s[16] =
   { 0xaaaaaaafaaaaaaaf, 0xaaaaaaafaaaaaaaf }
 };
 */
-static void
-bmw32_4way_init(bmw_4way_small_context *sc, const sph_u32 *iv)
+void bmw256_4way_init( bmw256_4way_context *ctx )
 {
-   for ( int i = 0; i < 16; i++ )
-      sc->H[i] = _mm_set1_epi32( iv[i] );
-   sc->ptr = 0;
-   sc->bit_count = 0;
+   ctx->H[ 0] = m128_const1_64( 0x4041424340414243 );
+   ctx->H[ 1] = m128_const1_64( 0x4445464744454647 );
+   ctx->H[ 2] = m128_const1_64( 0x48494A4B48494A4B );
+   ctx->H[ 3] = m128_const1_64( 0x4C4D4E4F4C4D4E4F );
+   ctx->H[ 4] = m128_const1_64( 0x5051525350515253 );
+   ctx->H[ 5] = m128_const1_64( 0x5455565754555657 );
+   ctx->H[ 6] = m128_const1_64( 0x58595A5B58595A5B );
+   ctx->H[ 7] = m128_const1_64( 0x5C5D5E5F5C5D5E5F );
+   ctx->H[ 8] = m128_const1_64( 0x6061626360616263 );
+   ctx->H[ 9] = m128_const1_64( 0x6465666764656667 );
+   ctx->H[10] = m128_const1_64( 0x68696A6B68696A6B );
+   ctx->H[11] = m128_const1_64( 0x6C6D6E6F6C6D6E6F );
+   ctx->H[12] = m128_const1_64( 0x7071727370717273 );
+   ctx->H[13] = m128_const1_64( 0x7475767774757677 );
+   ctx->H[14] = m128_const1_64( 0x78797A7B78797A7B );
+   ctx->H[15] = m128_const1_64( 0x7C7D7E7F7C7D7E7F );
+
+
+//   for ( int i = 0; i < 16; i++ )
+//      sc->H[i] = _mm_set1_epi32( iv[i] );
+   ctx->ptr = 0;
+   ctx->bit_count = 0;
 }

 static void
@@ -537,6 +526,8 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
      }
   }
   sc->ptr = ptr;
+
+
   if ( h1 != sc->H )
        memcpy_128( sc->H, h1, 16 );
 }
@@ -552,7 +543,7 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,

   buf = sc->buf;
   ptr = sc->ptr;
-   buf[ ptr>>2 ] = _mm_set1_epi32( 0x80 );
+   buf[ ptr>>2 ] = m128_const1_64( 0x0000008000000080 );
   ptr += 4;
   h = sc->H;

@@ -571,17 +562,20 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,

   for ( u = 0; u < 16; u ++ )
      buf[u] = h2[u];
+
   compress_small( buf, (__m128i*)final_s, h1 );

   for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
      casti_m128i( dst, u ) = h1[v];
 }

+/*
 void
 bmw256_4way_init(void *cc)
 {
 	bmw32_4way_init(cc, IV256);
 }
+*/

 void
 bmw256_4way(void *cc, const void *data, size_t len)
@@ -692,22 +686,15 @@ bmw256_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)

 #define expand2s8( qt, M, H, i) \
   _mm256_add_epi32( \
-      _mm256_add_epi32( \
-         _mm256_add_epi32( \
-             _mm256_add_epi32( \
-                _mm256_add_epi32( qt[ (i)-16 ], r8s1( qt[ (i)-15 ] ) ), \
-                _mm256_add_epi32( qt[ (i)-14 ], r8s2( qt[ (i)-13 ] ) ) ), \
-             _mm256_add_epi32( \
-                _mm256_add_epi32( qt[ (i)-12 ], r8s3( qt[ (i)-11 ] ) ), \
-                _mm256_add_epi32( qt[ (i)-10 ], r8s4( qt[ (i)- 9 ] ) ) ) ), \
-         _mm256_add_epi32( \
-             _mm256_add_epi32( \
-                _mm256_add_epi32( qt[ (i)- 8 ], r8s5( qt[ (i)- 7 ] ) ), \
-                _mm256_add_epi32( qt[ (i)- 6 ], r8s6( qt[ (i)- 5 ] ) ) ), \
-             _mm256_add_epi32( \
-                _mm256_add_epi32( qt[ (i)- 4 ], r8s7( qt[ (i)- 3 ] ) ), \
-                _mm256_add_epi32( s8s4( qt[ (i)- 2 ] ), \
-                                  s8s5( qt[ (i)- 1 ] ) ) ) ) ), \
+      mm256_add4_32( \
+          mm256_add4_32( qt[ (i)-16 ], r8s1( qt[ (i)-15 ] ), \
+                         qt[ (i)-14 ], r8s2( qt[ (i)-13 ] ) ), \
+          mm256_add4_32( qt[ (i)-12 ], r8s3( qt[ (i)-11 ] ), \
+                         qt[ (i)-10 ], r8s4( qt[ (i)- 9 ] ) ), \
+          mm256_add4_32( qt[ (i)- 8 ], r8s5( qt[ (i)- 7 ] ), \
+                         qt[ (i)- 6 ], r8s6( qt[ (i)- 5 ] ) ), \
+          mm256_add4_32( qt[ (i)- 4 ], r8s7( qt[ (i)- 3 ] ), \
+                         s8s4( qt[ (i)- 2 ] ), s8s5( qt[ (i)- 1 ] ) ) ), \
      add_elt_s8( M, H, (i)-16 ) )


@@ -910,16 +897,11 @@ void compress_small_8way( const __m256i *M, const __m256i H[16],
   qt[31] = expand2s8( qt, M, H, 31 );

   xl = _mm256_xor_si256(
-              _mm256_xor_si256( _mm256_xor_si256( qt[16], qt[17] ),
-                                _mm256_xor_si256( qt[18], qt[19] ) ),
-              _mm256_xor_si256( _mm256_xor_si256( qt[20], qt[21] ),
-                                _mm256_xor_si256( qt[22], qt[23] ) ) );
-   xh = _mm256_xor_si256( xl,
-             _mm256_xor_si256(
-                 _mm256_xor_si256( _mm256_xor_si256( qt[24], qt[25] ),
-                                   _mm256_xor_si256( qt[26], qt[27] ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( qt[28], qt[29] ),
-                                   _mm256_xor_si256( qt[30], qt[31] ) )));
+              mm256_xor4( qt[16], qt[17], qt[18], qt[19] ),
+              mm256_xor4( qt[20], qt[21], qt[22], qt[23] ) );
+   xh = _mm256_xor_si256( xl,  _mm256_xor_si256(
+                 mm256_xor4( qt[24], qt[25], qt[26], qt[27] ),
+                 mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );

   dH[ 0] = _mm256_add_epi32(
                 _mm256_xor_si256( M[0],
@@ -1041,25 +1023,24 @@ static const __m256i final_s8[16] =

 void bmw256_8way_init( bmw256_8way_context *ctx )
 {
-   ctx->H[ 0] = _mm256_set1_epi64x( IV256[ 0] );
-   ctx->H[ 1] = _mm256_set1_epi64x( IV256[ 1] );
-   ctx->H[ 2] = _mm256_set1_epi64x( IV256[ 2] );
-   ctx->H[ 3] = _mm256_set1_epi64x( IV256[ 3] );
-   ctx->H[ 4] = _mm256_set1_epi64x( IV256[ 4] );
-   ctx->H[ 5] = _mm256_set1_epi64x( IV256[ 5] );
-   ctx->H[ 6] = _mm256_set1_epi64x( IV256[ 6] );
-   ctx->H[ 7] = _mm256_set1_epi64x( IV256[ 7] );
-   ctx->H[ 8] = _mm256_set1_epi64x( IV256[ 8] );
-   ctx->H[ 9] = _mm256_set1_epi64x( IV256[ 9] );
-   ctx->H[10] = _mm256_set1_epi64x( IV256[10] );
-   ctx->H[11] = _mm256_set1_epi64x( IV256[11] );
-   ctx->H[12] = _mm256_set1_epi64x( IV256[12] );
-   ctx->H[13] = _mm256_set1_epi64x( IV256[13] );
-   ctx->H[14] = _mm256_set1_epi64x( IV256[14] );
-   ctx->H[15] = _mm256_set1_epi64x( IV256[15] );
+   ctx->H[ 0] = m256_const1_64( 0x4041424340414243 );
+   ctx->H[ 1] = m256_const1_64( 0x4445464744454647 );
+   ctx->H[ 2] = m256_const1_64( 0x48494A4B48494A4B );
+   ctx->H[ 3] = m256_const1_64( 0x4C4D4E4F4C4D4E4F );
+   ctx->H[ 4] = m256_const1_64( 0x5051525350515253 );
+   ctx->H[ 5] = m256_const1_64( 0x5455565754555657 );
+   ctx->H[ 6] = m256_const1_64( 0x58595A5B58595A5B );
+   ctx->H[ 7] = m256_const1_64( 0x5C5D5E5F5C5D5E5F );
+   ctx->H[ 8] = m256_const1_64( 0x6061626360616263 );
+   ctx->H[ 9] = m256_const1_64( 0x6465666764656667 );
+   ctx->H[10] = m256_const1_64( 0x68696A6B68696A6B );
+   ctx->H[11] = m256_const1_64( 0x6C6D6E6F6C6D6E6F );
+   ctx->H[12] = m256_const1_64( 0x7071727370717273 );
+   ctx->H[13] = m256_const1_64( 0x7475767774757677 );
+   ctx->H[14] = m256_const1_64( 0x78797A7B78797A7B );
+   ctx->H[15] = m256_const1_64( 0x7C7D7E7F7C7D7E7F );
   ctx->ptr       = 0;
   ctx->bit_count = 0;
-
 }

 void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len )
@@ -1076,14 +1057,15 @@ void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len )
   ptr = ctx->ptr;
   h1 = ctx->H;
   h2 = htmp;
+
   while ( len > 0 )
   {
      size_t clen;
      clen = buf_size - ptr;
      if ( clen > len )
         clen = len;
-      memcpy_256( buf + (ptr>>3), vdata, clen >> 3 );
-      vdata = vdata + (clen>>3);
+      memcpy_256( buf + (ptr>>2), vdata, clen >> 2 );
+      vdata = vdata + (clen>>2);
      len -= clen;
      ptr += clen;
      if ( ptr == buf_size )
@@ -1097,6 +1079,7 @@ void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len )
      }
   }
   ctx->ptr = ptr;
+
   if ( h1 != ctx->H )
        memcpy_256( ctx->H, h1, 16 );
 }
@@ -1106,24 +1089,25 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst )
   __m256i *buf;
   __m256i h1[16], h2[16], *h;
   size_t ptr, u, v;
-//   unsigned z;
   const int buf_size = 64;  // bytes of one lane, compatible with len

   buf = ctx->buf;
   ptr = ctx->ptr;
-   buf[ ptr>>3 ] = _mm256_set1_epi32( 0x80 );
-   ptr += 8;
+   buf[ ptr>>2 ] = m256_const1_64( 0x0000008000000080 );
+   ptr += 4;
   h = ctx->H;

-   if (  ptr > (buf_size - 8) )
+   if (  ptr > (buf_size - 4) )
   {
-      memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
+      memset_zero_256( buf + (ptr>>2), (buf_size - ptr) >> 2 );
      compress_small_8way( buf, h, h1 );
      ptr = 0;
      h = h1;
   }
-   memset_zero_256( buf + (ptr>>3), (buf_size - 8 - ptr) >> 3 );
-   buf[ (buf_size - 8) >> 3 ] = _mm256_set1_epi64x( ctx->bit_count );
+   memset_zero_256( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 );
+   buf[ (buf_size - 8) >> 2 ] = _mm256_set1_epi32( ctx->bit_count );
+   buf[ (buf_size - 4) >> 2 ] = m256_zero;
+
   compress_small_8way( buf, h, h2 );

   for ( u = 0; u < 16; u ++ )
--- a/algo/bmw/bmw256.c
+++ b/algo/bmw/bmw256.c
@@ -19,14 +19,15 @@ void bmwhash(void *output, const void *input)
 */
 }

-int scanhash_bmw(int thr_id, struct work *work,
-	uint32_t max_nonce, uint64_t *hashes_done)
+int scanhash_bmw( struct work *work, uint32_t max_nonce,
+                  uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;

 	uint32_t _ALIGN(64) hash64[8];
 	uint32_t _ALIGN(64) endiandata[20];
+   int thr_id = mythr->id;

 	const uint32_t Htarg = ptarget[7];
 	const uint32_t first_nonce = pdata[19];
--- a/algo/bmw/bmw512-4way.c
+++ b/algo/bmw/bmw512-4way.c
@@ -0,0 +1,59 @@
+#include "bmw512-gate.h"
+
+#ifdef BMW512_4WAY
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+//#include "sph_keccak.h"
+#include "bmw-hash-4way.h"
+
+void bmw512hash_4way(void *state, const void *input)
+{
+    bmw512_4way_context ctx;
+    bmw512_4way_init( &ctx );
+    bmw512_4way( &ctx, input, 80 );
+    bmw512_4way_close( &ctx, state );
+}
+
+int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+   uint32_t hash[16*4] __attribute__ ((aligned (32)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   uint32_t *hash7 = &(hash[25]);   // 3*8+1
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t n = pdata[19];
+   const uint32_t first_nonce = pdata[19];
+   __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
+//   const uint32_t Htarg = ptarget[7];
+    int thr_id = mythr->id;  // thr_id arg is deprecated
+
+   mm256_bswap32_intrlv80_4x64( vdata, pdata );
+   do {
+       *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
+	
+      bmw512hash_4way( hash, vdata );
+
+      for ( int lane = 0; lane < 4; lane++ )
+      if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
+      {
+          extr_lane_4x64( lane_hash, hash, lane, 256 );
+          if ( fulltest( lane_hash, ptarget ) )
+          {
+              pdata[19] = n + lane;
+              submit_lane_solution( work, lane_hash, mythr, lane );
+          }
+      }
+      n += 4;
+
+   } while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
+
+   *hashes_done = n - first_nonce + 1;
+   return 0;
+}
+
+#endif
--- a/algo/bmw/bmw512-gate.c
+++ b/algo/bmw/bmw512-gate.c
@@ -0,0 +1,20 @@
+#include "bmw512-gate.h"
+
+int64_t bmw512_get_max64() { return 0x7ffffLL; }
+
+bool register_bmw512_algo( algo_gate_t* gate )
+{
+  gate->optimizations = AVX2_OPT;
+  gate->get_max64       = (void*)&bmw512_get_max64;
+  opt_target_factor = 256.0;
+#if defined (BMW512_4WAY)
+  gate->scanhash  = (void*)&scanhash_bmw512_4way;
+  gate->hash      = (void*)&bmw512hash_4way;
+#else
+  gate->scanhash        = (void*)&scanhash_bmw512;
+  gate->hash            = (void*)&bmw512hash;
+#endif
+  return true;
+};
+
+
--- a/algo/bmw/bmw512-gate.h
+++ b/algo/bmw/bmw512-gate.h
@@ -0,0 +1,23 @@
+#ifndef BMW512_GATE_H__
+#define BMW512_GATE_H__
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+#if defined(__AVX2__)
+  #define BMW512_4WAY 1
+#endif
+
+#if defined(BMW512_4WAY)
+
+void bmw512hash_4way( void *state, const void *input );
+int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+
+#endif
+
+void bmw512hash( void *state, const void *input );
+int scanhash_bmw512( struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done, struct thr_info *mythr );
+
+#endif
--- a/algo/bmw/bmw512-hash-4way.c
+++ b/algo/bmw/bmw512-hash-4way.c
@@ -569,28 +569,20 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )


 #define sb0(x) \
-   _mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 1), \
-                                       _mm256_slli_epi64( (x), 3) ), \
-                     _mm256_xor_si256( mm256_rol_64( (x),  4), \
-                                       mm256_rol_64( (x), 37) ) )
+   mm256_xor4( _mm256_srli_epi64( (x), 1), _mm256_slli_epi64( (x), 3), \
+                mm256_rol_64(     (x), 4),  mm256_rol_64(     (x),37) )

 #define sb1(x) \
-   _mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 1), \
-                                       _mm256_slli_epi64( (x), 2) ), \
-                     _mm256_xor_si256( mm256_rol_64( (x), 13), \
-                                       mm256_rol_64( (x), 43) ) )
+   mm256_xor4( _mm256_srli_epi64( (x), 1), _mm256_slli_epi64( (x), 2), \
+                mm256_rol_64(     (x),13),  mm256_rol_64(     (x),43) )

 #define sb2(x) \
-   _mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 2), \
-                                       _mm256_slli_epi64( (x), 1) ), \
-                     _mm256_xor_si256( mm256_rol_64( (x), 19), \
-                                       mm256_rol_64( (x), 53) ) )
+   mm256_xor4( _mm256_srli_epi64( (x), 2), _mm256_slli_epi64( (x), 1), \
+                mm256_rol_64(     (x),19),  mm256_rol_64(     (x),53) )

 #define sb3(x) \
-   _mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 2), \
-                                       _mm256_slli_epi64( (x), 2) ), \
-                     _mm256_xor_si256( mm256_rol_64( (x), 28), \
-                                       mm256_rol_64( (x), 59) ) )
+   mm256_xor4( _mm256_srli_epi64( (x), 2), _mm256_slli_epi64( (x), 2), \
+                mm256_rol_64(     (x),28),  mm256_rol_64(     (x),59) )

 #define sb4(x) \
  _mm256_xor_si256( (x), _mm256_srli_epi64( (x), 1 ) )
@@ -618,55 +610,32 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
                             rol_off_64( M, j, 10 ) ), \
            _mm256_set1_epi64x( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \
       H[ ( (j)+7 ) & 0xF ] )
-          
+
+
 #define expand1b( qt, M, H, i ) \
-   _mm256_add_epi64( \
-      _mm256_add_epi64( \
-         _mm256_add_epi64( \
-             _mm256_add_epi64( \
-                _mm256_add_epi64( sb1( qt[ (i)-16 ] ), \
-                                  sb2( qt[ (i)-15 ] ) ), \
-                _mm256_add_epi64( sb3( qt[ (i)-14 ] ), \
-                                  sb0( qt[ (i)-13 ] ) ) ), \
-             _mm256_add_epi64( \
-                _mm256_add_epi64( sb1( qt[ (i)-12 ] ), \
-                                  sb2( qt[ (i)-11 ] ) ), \
-                _mm256_add_epi64( sb3( qt[ (i)-10 ] ), \
-                                  sb0( qt[ (i)- 9 ] ) ) ) ), \
-         _mm256_add_epi64( \
-             _mm256_add_epi64( \
-                _mm256_add_epi64( sb1( qt[ (i)- 8 ] ), \
-                                  sb2( qt[ (i)- 7 ] ) ), \
-                _mm256_add_epi64( sb3( qt[ (i)- 6 ] ), \
-                                  sb0( qt[ (i)- 5 ] ) ) ), \
-             _mm256_add_epi64( \
-                _mm256_add_epi64( sb1( qt[ (i)- 4 ] ), \
-                                  sb2( qt[ (i)- 3 ] ) ), \
-                _mm256_add_epi64( sb3( qt[ (i)- 2 ] ), \
-                                  sb0( qt[ (i)- 1 ] ) ) ) ) ), \
+   _mm256_add_epi64( mm256_add4_64( \
+      mm256_add4_64( sb1( qt[ (i)-16 ] ), sb2( qt[ (i)-15 ] ), \
+                     sb3( qt[ (i)-14 ] ), sb0( qt[ (i)-13 ] )), \
+      mm256_add4_64( sb1( qt[ (i)-12 ] ), sb2( qt[ (i)-11 ] ), \
+                     sb3( qt[ (i)-10 ] ), sb0( qt[ (i)- 9 ] )), \
+      mm256_add4_64( sb1( qt[ (i)- 8 ] ), sb2( qt[ (i)- 7 ] ), \
+                     sb3( qt[ (i)- 6 ] ), sb0( qt[ (i)- 5 ] )), \
+      mm256_add4_64( sb1( qt[ (i)- 4 ] ), sb2( qt[ (i)- 3 ] ), \
+                     sb3( qt[ (i)- 2 ] ), sb0( qt[ (i)- 1 ] ) ) ), \
      add_elt_b( M, H, (i)-16 ) )

 #define expand2b( qt, M, H, i) \
-   _mm256_add_epi64( \
-      _mm256_add_epi64( \
-         _mm256_add_epi64( \
-             _mm256_add_epi64( \
-                _mm256_add_epi64( qt[ (i)-16 ], rb1( qt[ (i)-15 ] ) ), \
-                _mm256_add_epi64( qt[ (i)-14 ], rb2( qt[ (i)-13 ] ) ) ), \
-             _mm256_add_epi64( \
-                _mm256_add_epi64( qt[ (i)-12 ], rb3( qt[ (i)-11 ] ) ), \
-                _mm256_add_epi64( qt[ (i)-10 ], rb4( qt[ (i)- 9 ] ) ) ) ), \
-         _mm256_add_epi64( \
-             _mm256_add_epi64( \
-                _mm256_add_epi64( qt[ (i)- 8 ], rb5( qt[ (i)- 7 ] ) ), \
-                _mm256_add_epi64( qt[ (i)- 6 ], rb6( qt[ (i)- 5 ] ) ) ), \
-             _mm256_add_epi64( \
-                _mm256_add_epi64( qt[ (i)- 4 ], rb7( qt[ (i)- 3 ] ) ), \
-                _mm256_add_epi64( sb4( qt[ (i)- 2 ] ), \
-                                  sb5( qt[ (i)- 1 ] ) ) ) ) ), \
+   _mm256_add_epi64( mm256_add4_64( \
+      mm256_add4_64( qt[ (i)-16 ], rb1( qt[ (i)-15 ] ), \
+                     qt[ (i)-14 ], rb2( qt[ (i)-13 ] ) ), \
+      mm256_add4_64( qt[ (i)-12 ], rb3( qt[ (i)-11 ] ), \
+                     qt[ (i)-10 ], rb4( qt[ (i)- 9 ] ) ), \
+      mm256_add4_64( qt[ (i)- 8 ], rb5( qt[ (i)- 7 ] ), \
+                     qt[ (i)- 6 ], rb6( qt[ (i)- 5 ] ) ), \
+      mm256_add4_64( qt[ (i)- 4 ], rb7( qt[ (i)- 3 ] ), \
+                     sb4( qt[ (i)- 2 ] ), sb5( qt[ (i)- 1 ] ) ) ), \
      add_elt_b( M, H, (i)-16 ) )

-
 #define Wb0 \
   _mm256_add_epi64( \
       _mm256_add_epi64( \
@@ -864,95 +833,90 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
   qt[30] = expand2b( qt, M, H, 30 ); 
   qt[31] = expand2b( qt, M, H, 31 ); 

-   xl = _mm256_xor_si256( 
-              _mm256_xor_si256( _mm256_xor_si256( qt[16], qt[17] ), 
-                                _mm256_xor_si256( qt[18], qt[19] ) ), 
-              _mm256_xor_si256( _mm256_xor_si256( qt[20], qt[21] ), 
-                                _mm256_xor_si256( qt[22], qt[23] ) ) ); 
-   xh = _mm256_xor_si256( xl, 
-             _mm256_xor_si256( 
-                 _mm256_xor_si256( _mm256_xor_si256( qt[24], qt[25] ),
-                                   _mm256_xor_si256( qt[26], qt[27] ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( qt[28], qt[29] ),
-                                   _mm256_xor_si256( qt[30], qt[31] ) )));
+   xl = _mm256_xor_si256(
+           mm256_xor4( qt[16], qt[17], qt[18], qt[19] ), 
+           mm256_xor4( qt[20], qt[21], qt[22], qt[23] ) ); 
+   xh = _mm256_xor_si256( xl, _mm256_xor_si256( 
+           mm256_xor4( qt[24], qt[25], qt[26], qt[27] ),
+           mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );

   dH[ 0] = _mm256_add_epi64(
-                 _mm256_xor_si256( M[0],
-                      _mm256_xor_si256( _mm256_slli_epi64( xh, 5 ),
-                                        _mm256_srli_epi64( qt[16], 5 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[24] ), qt[ 0] ));
+               _mm256_xor_si256( M[0],
+                  _mm256_xor_si256( _mm256_slli_epi64( xh, 5 ),
+                                    _mm256_srli_epi64( qt[16], 5 ) ) ),
+               _mm256_xor_si256( _mm256_xor_si256( xl, qt[24] ), qt[ 0] ) );
   dH[ 1] = _mm256_add_epi64(
-                 _mm256_xor_si256( M[1],
-                      _mm256_xor_si256( _mm256_srli_epi64( xh, 7 ),
-                                        _mm256_slli_epi64( qt[17], 8 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[25] ), qt[ 1] ));
+               _mm256_xor_si256( M[1],
+                  _mm256_xor_si256( _mm256_srli_epi64( xh, 7 ),
+                                    _mm256_slli_epi64( qt[17], 8 ) ) ),
+               _mm256_xor_si256( _mm256_xor_si256( xl, qt[25] ), qt[ 1] ) );
   dH[ 2] = _mm256_add_epi64(
-                 _mm256_xor_si256( M[2],
-                      _mm256_xor_si256( _mm256_srli_epi64( xh, 5 ),
-                                        _mm256_slli_epi64( qt[18], 5 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[26] ), qt[ 2] ));
+               _mm256_xor_si256( M[2],
+                  _mm256_xor_si256( _mm256_srli_epi64( xh, 5 ),
+                                    _mm256_slli_epi64( qt[18], 5 ) ) ),
+               _mm256_xor_si256( _mm256_xor_si256( xl, qt[26] ), qt[ 2] ) );
   dH[ 3] = _mm256_add_epi64(
-                 _mm256_xor_si256( M[3],
-                      _mm256_xor_si256( _mm256_srli_epi64( xh, 1 ),
-                                        _mm256_slli_epi64( qt[19], 5 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[27] ), qt[ 3] ));
+               _mm256_xor_si256( M[3],
+                  _mm256_xor_si256( _mm256_srli_epi64( xh, 1 ),
+                                    _mm256_slli_epi64( qt[19], 5 ) ) ),
+               _mm256_xor_si256( _mm256_xor_si256( xl, qt[27] ), qt[ 3] ) );
   dH[ 4] = _mm256_add_epi64(
-                 _mm256_xor_si256( M[4],
-                      _mm256_xor_si256( _mm256_srli_epi64( xh, 3 ),
-                                        _mm256_slli_epi64( qt[20], 0 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[28] ), qt[ 4] ));
+               _mm256_xor_si256( M[4],
+                  _mm256_xor_si256( _mm256_srli_epi64( xh, 3 ),
+                                    _mm256_slli_epi64( qt[20], 0 ) ) ),
+               _mm256_xor_si256( _mm256_xor_si256( xl, qt[28] ), qt[ 4] ) );
   dH[ 5] = _mm256_add_epi64(
-                 _mm256_xor_si256( M[5],
-                      _mm256_xor_si256( _mm256_slli_epi64( xh, 6 ),
-                                        _mm256_srli_epi64( qt[21], 6 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[29] ), qt[ 5] ));
+               _mm256_xor_si256( M[5],
+                  _mm256_xor_si256( _mm256_slli_epi64( xh, 6 ),
+                                    _mm256_srli_epi64( qt[21], 6 ) ) ),
+               _mm256_xor_si256( _mm256_xor_si256( xl, qt[29] ), qt[ 5] ) );
   dH[ 6] = _mm256_add_epi64(
-                 _mm256_xor_si256( M[6],
-                      _mm256_xor_si256( _mm256_srli_epi64( xh, 4 ),
-                                        _mm256_slli_epi64( qt[22], 6 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[30] ), qt[ 6] ));
+               _mm256_xor_si256( M[6],
+                  _mm256_xor_si256( _mm256_srli_epi64( xh, 4 ),
+                                    _mm256_slli_epi64( qt[22], 6 ) ) ),
+               _mm256_xor_si256( _mm256_xor_si256( xl, qt[30] ), qt[ 6] ) );
   dH[ 7] = _mm256_add_epi64(
-                 _mm256_xor_si256( M[7],
-                      _mm256_xor_si256( _mm256_srli_epi64( xh, 11 ),
-                                        _mm256_slli_epi64( qt[23], 2 ) ) ),
-                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[31] ), qt[ 7] ));
+               _mm256_xor_si256( M[7],
+                  _mm256_xor_si256( _mm256_srli_epi64( xh, 11 ),
+                                    _mm256_slli_epi64( qt[23], 2 ) ) ),
+               _mm256_xor_si256( _mm256_xor_si256( xl, qt[31] ), qt[ 7] ) );
   dH[ 8] = _mm256_add_epi64( _mm256_add_epi64(
-                 mm256_rol_64( dH[4], 9 ),
+              mm256_rol_64( dH[4], 9 ),
                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[24] ), M[ 8] )),
                 _mm256_xor_si256( _mm256_slli_epi64( xl, 8 ),
                                   _mm256_xor_si256( qt[23], qt[ 8] ) ) );
   dH[ 9] = _mm256_add_epi64( _mm256_add_epi64(
-                 mm256_rol_64( dH[5], 10 ),
+              mm256_rol_64( dH[5], 10 ),
                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[25] ), M[ 9] )),
                 _mm256_xor_si256( _mm256_srli_epi64( xl, 6 ),
                                   _mm256_xor_si256( qt[16], qt[ 9] ) ) );
   dH[10] = _mm256_add_epi64( _mm256_add_epi64(
-                 mm256_rol_64( dH[6], 11 ),
+              mm256_rol_64( dH[6], 11 ),
                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[26] ), M[10] )),
                 _mm256_xor_si256( _mm256_slli_epi64( xl, 6 ),
                                   _mm256_xor_si256( qt[17], qt[10] ) ) );
   dH[11] = _mm256_add_epi64( _mm256_add_epi64(
-                 mm256_rol_64( dH[7], 12 ),
+              mm256_rol_64( dH[7], 12 ),
                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[27] ), M[11] )),
                 _mm256_xor_si256( _mm256_slli_epi64( xl, 4 ),
                                   _mm256_xor_si256( qt[18], qt[11] ) ) );
   dH[12] = _mm256_add_epi64( _mm256_add_epi64(
-                 mm256_rol_64( dH[0], 13 ),
+              mm256_rol_64( dH[0], 13 ),
                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[28] ), M[12] )),
                 _mm256_xor_si256( _mm256_srli_epi64( xl, 3 ),
                                   _mm256_xor_si256( qt[19], qt[12] ) ) );
   dH[13] = _mm256_add_epi64( _mm256_add_epi64(
-                 mm256_rol_64( dH[1], 14 ),
+              mm256_rol_64( dH[1], 14 ),
                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[29] ), M[13] )),
                 _mm256_xor_si256( _mm256_srli_epi64( xl, 4 ),
                                   _mm256_xor_si256( qt[20], qt[13] ) ) );
   dH[14] = _mm256_add_epi64( _mm256_add_epi64(
-                 mm256_rol_64( dH[2], 15 ),
+              mm256_rol_64( dH[2], 15 ),
                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[30] ), M[14] )),
                 _mm256_xor_si256( _mm256_srli_epi64( xl, 7 ),
                                   _mm256_xor_si256( qt[21], qt[14] ) ) );
   dH[15] = _mm256_add_epi64( _mm256_add_epi64(
-                 mm256_rol_64( dH[3], 16 ),
+              mm256_rol_64( dH[3], 16 ),
                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )),
                 _mm256_xor_si256( _mm256_srli_epi64( xl, 2 ),
                                   _mm256_xor_si256( qt[22], qt[15] ) ) );
@@ -997,8 +961,22 @@ static const __m256i final_b[16] =
 static void
 bmw64_4way_init( bmw_4way_big_context *sc, const sph_u64 *iv )
 {
-   for ( int i = 0; i < 16; i++ )
-      sc->H[i] = _mm256_set1_epi64x( iv[i] );
+   sc->H[ 0] = m256_const1_64( 0x8081828384858687 );
+   sc->H[ 1] = m256_const1_64( 0x88898A8B8C8D8E8F );
+   sc->H[ 2] = m256_const1_64( 0x9091929394959697 );
+   sc->H[ 3] = m256_const1_64( 0x98999A9B9C9D9E9F );
+   sc->H[ 4] = m256_const1_64( 0xA0A1A2A3A4A5A6A7 );
+   sc->H[ 5] = m256_const1_64( 0xA8A9AAABACADAEAF );
+   sc->H[ 6] = m256_const1_64( 0xB0B1B2B3B4B5B6B7 );
+   sc->H[ 7] = m256_const1_64( 0xB8B9BABBBCBDBEBF );
+   sc->H[ 8] = m256_const1_64( 0xC0C1C2C3C4C5C6C7 );
+   sc->H[ 9] = m256_const1_64( 0xC8C9CACBCCCDCECF );
+   sc->H[10] = m256_const1_64( 0xD0D1D2D3D4D5D6D7 );
+   sc->H[11] = m256_const1_64( 0xD8D9DADBDCDDDEDF );
+   sc->H[12] = m256_const1_64( 0xE0E1E2E3E4E5E6E7 );
+   sc->H[13] = m256_const1_64( 0xE8E9EAEBECEDEEEF );
+   sc->H[14] = m256_const1_64( 0xF0F1F2F3F4F5F6F7 );
+   sc->H[15] = m256_const1_64( 0xF8F9FAFBFCFDFEFF );
   sc->ptr = 0;
   sc->bit_count = 0;
 }
@@ -1050,13 +1028,11 @@ bmw64_4way_close(bmw_4way_big_context *sc, unsigned ub, unsigned n,
   __m256i *buf;
   __m256i h1[16], h2[16], *h;
   size_t ptr, u, v;
-   unsigned z;
   const int buf_size = 128;  // bytes of one lane, compatible with len

   buf = sc->buf;
   ptr = sc->ptr;
-   z = 0x80 >> n;
-   buf[ ptr>>3 ] = _mm256_set1_epi64x( z );
+   buf[ ptr>>3 ] = m256_const1_64( 0x80 );
   ptr += 8;
   h = sc->H;

--- a/algo/bmw/bmw512.c
+++ b/algo/bmw/bmw512.c
@@ -0,0 +1,53 @@
+#include "algo-gate-api.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "sph_bmw.h"
+
+void bmw512hash(void *state, const void *input)
+{
+    sph_bmw512_context ctx;
+    uint32_t hash[32];	
+   
+    sph_bmw512_init( &ctx );
+    sph_bmw512( &ctx,input, 80 );
+    sph_bmw512_close( &ctx, hash );
+
+    memcpy( state, hash, 32 );
+}
+
+int scanhash_bmw512( struct work *work, uint32_t max_nonce,
+                     uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+	uint32_t n = pdata[19] - 1;
+	const uint32_t first_nonce = pdata[19];
+	//const uint32_t Htarg = ptarget[7];
+   int thr_id = mythr->id;  // thr_id arg is deprecated
+
+	uint32_t _ALIGN(32) hash64[8];
+	uint32_t endiandata[32];
+
+   for (int i=0; i < 19; i++) 
+           be32enc(&endiandata[i], pdata[i]);
+
+	do {
+	
+		pdata[19] = ++n;
+		be32enc(&endiandata[19], n); 
+		bmw512hash(hash64, endiandata);
+        if (((hash64[7]&0xFFFFFF00)==0) && 
+				fulltest(hash64, ptarget)) {
+            *hashes_done = n - first_nonce + 1;
+			return true;
+		}
+	} while (n < max_nonce && !work_restart[thr_id].restart);
+	
+	*hashes_done = n - first_nonce + 1;
+	pdata[19] = n;
+	return 0;
+}
+
--- a/algo/cryptonight/cryptolight.c
+++ b/algo/cryptonight/cryptolight.c
@@ -242,6 +242,8 @@ void cryptolight_hash(void* output, const void* input, int len) {
 	free(ctx);
 }

+#if defined(__AES__)
+
 static void cryptolight_hash_ctx_aes_ni(void* output, const void* input,
                       int len, struct cryptonight_ctx* ctx)
 {
@@ -312,8 +314,10 @@ static void cryptolight_hash_ctx_aes_ni(void* output, const void* input,
 	oaes_free((OAES_CTX **) &ctx->aes_ctx);
 }

-int scanhash_cryptolight(int thr_id, struct work *work,
-		uint32_t max_nonce, uint64_t *hashes_done)
+#endif
+
+int scanhash_cryptolight( struct work *work,
+		uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr)
 {
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;
@@ -322,6 +326,7 @@ int scanhash_cryptolight(int thr_id, struct work *work,
 	const uint32_t first_nonce = n + 1;
 	//const uint32_t Htarg = ptarget[7];
 	uint32_t _ALIGN(32) hash[HASH_SIZE / 4];
+   int thr_id = mythr->id;

 	struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx));

--- a/algo/cryptonight/cryptonight-common.c
+++ b/algo/cryptonight/cryptonight-common.c
@@ -70,11 +70,12 @@ void cryptonight_hash_suw( void *restrict output, const void *input )

 bool cryptonightV7 = false;

-int scanhash_cryptonight( int thr_id, struct work *work, uint32_t max_nonce,
-                   uint64_t *hashes_done )
+int scanhash_cryptonight( struct work *work, uint32_t max_nonce,
+                   uint64_t *hashes_done, struct thr_info *mythr )
 {
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
+    int thr_id = mythr->id;

    uint32_t *nonceptr = (uint32_t*) (((char*)pdata) + 39);
    uint32_t n = *nonceptr - 1;
--- a/algo/cryptonight/cryptonight.h
+++ b/algo/cryptonight/cryptonight.h
@@ -40,8 +40,8 @@ void cryptonight_hash_ctx(void* output, const void* input, int len);
 void keccakf(uint64_t st[25], int rounds);
 extern void (* const extra_hashes[4])(const void *, size_t, char *);

-int scanhash_cryptonight( int thr_id, struct work *work, uint32_t max_nonce,
-                           uint64_t *hashes_done );
+int scanhash_cryptonight( struct work *work, uint32_t max_nonce,
+                           uint64_t *hashes_done, struct thr_info *mythr );

 void cryptonight_hash_aes( void *restrict output, const void *input, int len );

--- a/algo/cubehash/cube-hash-2way.c
+++ b/algo/cubehash/cube-hash-2way.c
@@ -7,6 +7,7 @@

 // 2x128

+/*
 // The result of hashing 10 rounds of initial data which consists of params
 // zero padded.
 static const uint64_t IV256[] =
@@ -24,13 +25,14 @@ static const uint64_t IV512[] =
 0x148FE485FCD398D9, 0xB64445321B017BEF, 0x2FF5781C6A536159, 0x0DBADEA991FA7934,
 0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
 };
+*/

 static void transform_2way( cube_2way_context *sp )
 {
    int r;
    const int rounds = sp->rounds;

-    __m256i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3;
+    __m256i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1;

    x0 = _mm256_load_si256( (__m256i*)sp->h     );
    x1 = _mm256_load_si256( (__m256i*)sp->h + 1 );
@@ -47,18 +49,12 @@ static void transform_2way( cube_2way_context *sp )
        x5 = _mm256_add_epi32( x1, x5 );
        x6 = _mm256_add_epi32( x2, x6 );
        x7 = _mm256_add_epi32( x3, x7 );
-        y0 = x2;
-        y1 = x3;
-        y2 = x0;
-        y3 = x1;
-        x0 = _mm256_xor_si256( _mm256_slli_epi32( y0,  7 ),
-                               _mm256_srli_epi32( y0, 25 ) );
-        x1 = _mm256_xor_si256( _mm256_slli_epi32( y1,  7 ),
-                               _mm256_srli_epi32( y1, 25 ) );
-        x2 = _mm256_xor_si256( _mm256_slli_epi32( y2,  7 ),
-                               _mm256_srli_epi32( y2, 25 ) );
-        x3 = _mm256_xor_si256( _mm256_slli_epi32( y3,  7 ),
-                               _mm256_srli_epi32( y3, 25 ) );
+        y0 = x0;                      // keep x0: it is overwritten before it is rotated into x2
+        y1 = x1;                      // keep x1: it is overwritten before it is rotated into x3
+        x0 = mm256_rol_32( x2, 7 );   // net effect: (x0,x1,x2,x3) = rol 7 of old (x2,x3,x0,x1)
+        x1 = mm256_rol_32( x3, 7 );   // mm256_rol_32: presumably per-32-bit-element rotate left (simd-utils.h) - confirm
+        x2 = mm256_rol_32( y0, 7 );   // old x0
+        x3 = mm256_rol_32( y1, 7 );   // old x1
        x0 = _mm256_xor_si256( x0, x4 );
        x1 = _mm256_xor_si256( x1, x5 );
        x2 = _mm256_xor_si256( x2, x6 );
@@ -71,18 +67,12 @@ static void transform_2way( cube_2way_context *sp )
        x5 = _mm256_add_epi32( x1, x5 );
        x6 = _mm256_add_epi32( x2, x6 );
        x7 = _mm256_add_epi32( x3, x7 );
-        y0 = x1;
-        y1 = x0;
-        y2 = x3;
-        y3 = x2;
-        x0 = _mm256_xor_si256( _mm256_slli_epi32( y0, 11 ),
-                               _mm256_srli_epi32( y0, 21 ) );
-        x1 = _mm256_xor_si256( _mm256_slli_epi32( y1, 11 ),
-                               _mm256_srli_epi32( y1, 21 ) );
-        x2 = _mm256_xor_si256( _mm256_slli_epi32( y2, 11 ),
-                               _mm256_srli_epi32( y2, 21 ) );
-        x3 = _mm256_xor_si256( _mm256_slli_epi32( y3, 11 ),
-                               _mm256_srli_epi32( y3, 21 ) );
+        y0 = x0;                       // keep x0: it is overwritten before it is rotated into x1
+        y1 = x2;                       // keep x2: it is overwritten before it is rotated into x3
+        x0 = mm256_rol_32( x1, 11 );   // net effect: (x0,x1,x2,x3) = rol 11 of old (x1,x0,x3,x2)
+        x1 = mm256_rol_32( y0, 11 );   // old x0
+        x2 = mm256_rol_32( x3, 11 );
+        x3 = mm256_rol_32( y1, 11 );   // old x2
        x0 = _mm256_xor_si256( x0, x4 );
        x1 = _mm256_xor_si256( x1, x5 );
        x2 = _mm256_xor_si256( x2, x6 );
@@ -107,23 +97,40 @@ static void transform_2way( cube_2way_context *sp )
 int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
                    int blockbytes )
 {
-    const uint64_t* iv = hashbitlen == 512 ? IV512 : IV256;
+    __m128i* h = (__m128i*)sp->h;
    sp->hashlen   = hashbitlen/128;
    sp->blocksize = blockbytes/16;
    sp->rounds    = rounds;
    sp->pos       = 0;

-    __m256i* h = (__m256i*)sp->h;
-
-    h[0] = _mm256_set_epi64x( iv[ 1], iv[ 0], iv[ 1], iv[ 0] );
-    h[1] = _mm256_set_epi64x( iv[ 3], iv[ 2], iv[ 3], iv[ 2] );
-    h[2] = _mm256_set_epi64x( iv[ 5], iv[ 4], iv[ 5], iv[ 4] );
-    h[3] = _mm256_set_epi64x( iv[ 7], iv[ 6], iv[ 7], iv[ 6] );
-    h[4] = _mm256_set_epi64x( iv[ 9], iv[ 8], iv[ 9], iv[ 8] );
-    h[5] = _mm256_set_epi64x( iv[11], iv[10], iv[11], iv[10] );
-    h[6] = _mm256_set_epi64x( iv[13], iv[12], iv[13], iv[12] );
-    h[7] = _mm256_set_epi64x( iv[15], iv[14], iv[15], iv[14] );
+    if ( hashbitlen == 512 )   // 512-bit digest: constants are the values of the commented-out IV512 table
+    {
+
+       h[ 0] = m128_const_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );   // each call takes a table pair as ( IV[2i+1], IV[2i] )
+       h[ 2] = m128_const_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
+       h[ 4] = m128_const_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
+       h[ 6] = m128_const_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
+       h[ 8] = m128_const_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
+       h[10] = m128_const_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
+       h[12] = m128_const_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
+       h[14] = m128_const_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
+       h[1] = h[ 0];  h[ 3] = h[ 2]; h[ 5] = h[ 4]; h[ 7] = h[ 6];   // odd 128-bit entries copy the even ones, so both
+       h[9] = h[ 8];  h[11] = h[10]; h[13] = h[12]; h[15] = h[14];   // halves of every 256-bit word start from the same IV
+    }
+    else   // any other hashbitlen takes this branch; presumably the former IV256 values - confirm
+    {
+       h[ 0] = m128_const_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
+       h[ 2] = m128_const_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
+       h[ 4] = m128_const_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
+       h[ 6] = m128_const_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
+       h[ 8] = m128_const_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
+       h[10] = m128_const_64( 0x93CB628565C892FD, 0x5FA2560309392549 );
+       h[12] = m128_const_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
+       h[14] = m128_const_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
+       h[1] = h[ 0];  h[ 3] = h[ 2]; h[ 5] = h[ 4]; h[ 7] = h[ 6];   // same duplication of even entries into odd entries
+       h[9] = h[ 8];  h[11] = h[10]; h[13] = h[12]; h[15] = h[14];
+    }
+    
    return 0;
 }

@@ -165,7 +172,7 @@ int cube_2way_close( cube_2way_context *sp, void *output )

    for ( i = 0; i < 10; ++i )           transform_2way( sp );

-    for ( i = 0; i < sp->hashlen; i++ )  hash[i] = sp->h[i];
+    memcpy( hash, sp->h, sp->hashlen<<5 );
    return 0;
 }

@@ -198,7 +205,7 @@ int cube_2way_update_close( cube_2way_context *sp, void *output,

    for ( i = 0; i < 10; ++i )            transform_2way( sp );

-    for ( i = 0; i < sp->hashlen; i++ )   hash[i] = sp->h[i];
+    memcpy( hash, sp->h, sp->hashlen<<5 );
    return 0;
 }

--- a/algo/cubehash/cube-hash-2way.h
+++ b/algo/cubehash/cube-hash-2way.h
@@ -4,7 +4,7 @@
 #if defined(__AVX2__)

 #include <stdint.h>
-#include "avxdefs.h"
+#include "simd-utils.h"

 // 2x128, 2 way parallel SSE2

--- a/algo/cubehash/cubehash_sse2.c
+++ b/algo/cubehash/cubehash_sse2.c
@@ -13,27 +13,9 @@
 #include <stdbool.h>
 #include <unistd.h>
 #include <memory.h>
-#include "avxdefs.h"
+#include "simd-utils.h"
 #include <stdio.h>

-// The result of hashing 10 rounds of initial data which is params and 
-// mostly zeros.
-static const uint64_t IV256[] =
-{
-0xCCD6F29FEA2BD4B4, 0x35481EAE63117E71, 0xE5D94E6322512D5B, 0xF4CC12BE7E624131,
-0x42AF2070C2D0B696, 0x3361DA8CD0720C35, 0x8EF8AD8328CCECA4, 0x40E5FBAB4680AC00,
-0x6107FBD5D89041C3, 0xF0B266796C859D41, 0x5FA2560309392549, 0x93CB628565C892FD,
-0x9E4B4E602AF2B5AE, 0x85254725774ABFDD, 0x4AB6AAD615815AEB, 0xD6032C0A9CDAF8AF
-};
-
-static const uint64_t IV512[] =
-{
-0x50F494D42AEA2A61, 0x4167D83E2D538B8B, 0xC701CF8C3FEE2313, 0x50AC5695CC39968E,
-0xA647A8B34D42C787, 0x825B453797CF0BEF, 0xF22090C4EEF864D2, 0xA23911AED0E5CD33,
-0x148FE485FCD398D9, 0xB64445321B017BEF, 0x2FF5781C6A536159, 0x0DBADEA991FA7934,
-0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
-};
-
 static void transform( cubehashParam *sp )
 {
    int r;
@@ -53,26 +35,22 @@ static void transform( cubehashParam *sp )
        x2 = _mm256_add_epi32( x0, x2 );
        x3 = _mm256_add_epi32( x1, x3 );
        y0 = x0;
-        x0 = _mm256_xor_si256( _mm256_slli_epi32( x1, 7 ),
-                               _mm256_srli_epi32( x1, 25 ) );
-        x1 = _mm256_xor_si256( _mm256_slli_epi32( y0, 7 ),
-                               _mm256_srli_epi32( y0, 25 ) );
+        x0 = mm256_rol_32( x1, 7 );
+        x1 = mm256_rol_32( y0, 7 );
        x0 = _mm256_xor_si256( x0, x2 );
        x1 = _mm256_xor_si256( x1, x3 );
-        x2 = _mm256_shuffle_epi32( x2, 0x4e );
-        x3 = _mm256_shuffle_epi32( x3, 0x4e );
+        x2 = mm256_swap64_128( x2 );
+        x3 = mm256_swap64_128( x3 );
        x2 = _mm256_add_epi32( x0, x2 );
        x3 = _mm256_add_epi32( x1, x3 );
-        y0 = _mm256_permute4x64_epi64( x0, 0x4e );
-        y1 = _mm256_permute4x64_epi64( x1, 0x4e );
-        x0 = _mm256_xor_si256( _mm256_slli_epi32( y0, 11 ),
-                               _mm256_srli_epi32( y0, 21 ) );
-        x1 = _mm256_xor_si256( _mm256_slli_epi32( y1, 11 ), 
-                               _mm256_srli_epi32( y1, 21 ) );
+        y0 = mm256_swap_128( x0 );     // presumably swaps the two 128-bit halves (helper from simd-utils.h) - confirm
+        y1 = mm256_swap_128( x1 );
+        x0 = mm256_rol_32( y0, 11 );   // presumably rotates each 32-bit element left by 11 - confirm
+        x1 = mm256_rol_32( y1, 11 );
        x0 = _mm256_xor_si256( x0, x2 );
        x1 = _mm256_xor_si256( x1, x3 );
-        x2 = _mm256_shuffle_epi32( x2, 0xb1 );
-        x3 = _mm256_shuffle_epi32( x3, 0xb1 );
+        x2 = mm256_swap32_64( x2 );
+        x3 = mm256_swap32_64( x3 );
    }

    _mm256_store_si256( (__m256i*)sp->x,     x0 );
@@ -147,37 +125,58 @@ static void transform( cubehashParam *sp )
 #endif
 }  // transform

+/*
+// The result of hashing 10 rounds of initial data which is params and
+// mostly zeros.
+static const uint64_t IV256[] =
+{
+0xCCD6F29FEA2BD4B4, 0x35481EAE63117E71, 0xE5D94E6322512D5B, 0xF4CC12BE7E624131,
+0x42AF2070C2D0B696, 0x3361DA8CD0720C35, 0x8EF8AD8328CCECA4, 0x40E5FBAB4680AC00,
+0x6107FBD5D89041C3, 0xF0B266796C859D41, 0x5FA2560309392549, 0x93CB628565C892FD,
+0x9E4B4E602AF2B5AE, 0x85254725774ABFDD, 0x4AB6AAD615815AEB, 0xD6032C0A9CDAF8AF
+};
+
+static const uint64_t IV512[] =
+{
+0x50F494D42AEA2A61, 0x4167D83E2D538B8B, 0xC701CF8C3FEE2313, 0x50AC5695CC39968E,
+0xA647A8B34D42C787, 0x825B453797CF0BEF, 0xF22090C4EEF864D2, 0xA23911AED0E5CD33,
+0x148FE485FCD398D9, 0xB64445321B017BEF, 0x2FF5781C6A536159, 0x0DBADEA991FA7934,
+0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
+};
+*/
+
 int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes)
 {
-    const uint64_t* iv = hashbitlen == 512 ? IV512 : IV256;
+    __m128i *x = (__m128i*)sp->x;
    sp->hashlen   = hashbitlen/128;
    sp->blocksize = blockbytes/16;
    sp->rounds    = rounds;
    sp->pos       = 0;
-    
-#if defined(__AVX2__)

-    __m256i* x = (__m256i*)sp->x;
+    if ( hashbitlen == 512 )
+    {

-    x[0] = _mm256_set_epi64x( iv[ 3], iv[ 2], iv[ 1], iv[ 0] );
-    x[1] = _mm256_set_epi64x( iv[ 7], iv[ 6], iv[ 5], iv[ 4] );
-    x[2] = _mm256_set_epi64x( iv[11], iv[10], iv[ 9], iv[ 8] );
-    x[3] = _mm256_set_epi64x( iv[15], iv[14], iv[13], iv[12] );
+       x[0] = m128_const_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );   // 512-bit IV: x[i] is built from ( IV512[2i+1], IV512[2i] )
+       x[1] = m128_const_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );   // of the commented-out IV512 table
+       x[2] = m128_const_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
+       x[3] = m128_const_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
+       x[4] = m128_const_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
+       x[5] = m128_const_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
+       x[6] = m128_const_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
+       x[7] = m128_const_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
+    }
+    else   // every hashbitlen other than 512 gets the 256-bit IV
+    {
+       x[0] = m128_const_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );   // 256-bit IV: x[i] is built from ( IV256[2i+1], IV256[2i] )
+       x[1] = m128_const_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );   // of the commented-out IV256 table
+       x[2] = m128_const_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
+       x[3] = m128_const_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
+       x[4] = m128_const_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
+       x[5] = m128_const_64( 0x93CB628565C892FD, 0x5FA2560309392549 );
+       x[6] = m128_const_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
+       x[7] = m128_const_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
+    }   

-#else
-
-    __m128i* x = (__m128i*)sp->x;
-
-     x[0] = _mm_set_epi64x( iv[ 1], iv[ 0] );
-     x[1] = _mm_set_epi64x( iv[ 3], iv[ 2] );
-     x[2] = _mm_set_epi64x( iv[ 5], iv[ 4] );
-     x[3] = _mm_set_epi64x( iv[ 7], iv[ 6] );
-     x[4] = _mm_set_epi64x( iv[ 9], iv[ 8] );
-     x[5] = _mm_set_epi64x( iv[11], iv[10] );
-     x[6] = _mm_set_epi64x( iv[13], iv[12] );
-     x[7] = _mm_set_epi64x( iv[15], iv[14] );
-
-#endif
    return SUCCESS;
 }

--- a/algo/groestl/aes_ni/brg_endian.h
+++ b/algo/groestl/aes_ni/brg_endian.h
@@ -43,7 +43,7 @@
 #  if !defined( __MINGW32__ ) && !defined( _AIX )
 #    include <endian.h>
 #    if !defined( __BEOS__ )
-#      include <byteswap.h>
+//#      include <byteswap.h>
 #    endif
 #  endif
 #endif
--- a/algo/groestl/aes_ni/hash-groestl.c
+++ b/algo/groestl/aes_ni/hash-groestl.c
@@ -12,7 +12,7 @@
 #include <memory.h>
 #include "hash-groestl.h"
 #include "miner.h"
-#include "avxdefs.h"
+#include "simd-utils.h"

 #ifndef NO_AES_NI

--- a/algo/groestl/aes_ni/hash-groestl256.c
+++ b/algo/groestl/aes_ni/hash-groestl256.c
@@ -9,7 +9,7 @@
 #include <memory.h>
 #include "hash-groestl256.h"
 #include "miner.h"
-#include "avxdefs.h"
+#include "simd-utils.h"

 #ifndef NO_AES_NI

--- a/algo/groestl/groestl.c
+++ b/algo/groestl/groestl.c
@@ -56,14 +56,15 @@ void groestlhash( void *output, const void *input )
     memcpy(output, hash, 32);
 }

-int scanhash_groestl( int thr_id, struct work *work, uint32_t max_nonce,
-                      uint64_t *hashes_done )
+int scanhash_groestl( struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;
        uint32_t endiandata[20] __attribute__ ((aligned (64)));
 	const uint32_t first_nonce = pdata[19];
 	uint32_t nonce = first_nonce;
+   int thr_id = mythr->id;  // thr_id arg is deprecated

 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x0000ff;
@@ -93,19 +94,14 @@ int scanhash_groestl( int thr_id, struct work *work, uint32_t max_nonce,
 	return 0;
 }

-void groestl_set_target( struct work* work, double job_diff )
-{
- work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
-}
-
 bool register_dmd_gr_algo( algo_gate_t* gate )
 {
    init_groestl_ctx();
    gate->optimizations   = SSE2_OPT | AES_OPT;
    gate->scanhash        = (void*)&scanhash_groestl;
    gate->hash            = (void*)&groestlhash;
-    gate->set_target      = (void*)&groestl_set_target;
    gate->get_max64       = (void*)&get_max64_0x3ffff;
+    opt_target_factor = 256.0;
    return true;
 };

--- a/algo/groestl/myr-groestl.c
+++ b/algo/groestl/myr-groestl.c
@@ -10,7 +10,7 @@
 #else
  #include "aes_ni/hash-groestl.h"
 #endif
-#include "algo/sha/sph_sha2.h"
+#include <openssl/sha.h>

 typedef struct {
 #ifdef NO_AES_NI
@@ -18,7 +18,7 @@ typedef struct {
 #else
    hashState_groestl       groestl;
 #endif
-    sph_sha256_context sha;
+    SHA256_CTX              sha;
 } myrgr_ctx_holder;

 myrgr_ctx_holder myrgr_ctx;
@@ -28,15 +28,15 @@ void init_myrgr_ctx()
 #ifdef NO_AES_NI
     sph_groestl512_init( &myrgr_ctx.groestl );
 #else
-     init_groestl (&myrgr_ctx.groestl, 64 );
+     init_groestl ( &myrgr_ctx.groestl, 64 );
 #endif
-     sph_sha256_init(&myrgr_ctx.sha);
+     SHA256_Init( &myrgr_ctx.sha );
 }

 void myriad_hash(void *output, const void *input)
 {
-        myrgr_ctx_holder ctx;
-        memcpy( &ctx, &myrgr_ctx, sizeof(myrgr_ctx) );
+   myrgr_ctx_holder ctx;
+   memcpy( &ctx, &myrgr_ctx, sizeof(myrgr_ctx) );

 	uint32_t _ALIGN(32) hash[16];

@@ -44,25 +44,25 @@ void myriad_hash(void *output, const void *input)
 	sph_groestl512(&ctx.groestl, input, 80);
 	sph_groestl512_close(&ctx.groestl, hash);
 #else
-        update_groestl( &ctx.groestl, (char*)input, 640 );
-        final_groestl( &ctx.groestl, (char*)hash);
+   update_groestl( &ctx.groestl, (char*)input, 640 );
+   final_groestl( &ctx.groestl, (char*)hash);
 #endif

-	sph_sha256(&ctx.sha, hash, 64);
-	sph_sha256_close(&ctx.sha, hash);
+   SHA256_Update( &ctx.sha, (unsigned char*)hash, 64 );
+   SHA256_Final( (unsigned char*)hash, &ctx.sha );

 	memcpy(output, hash, 32);
 }

-int scanhash_myriad(int thr_id, struct work *work,
-	uint32_t max_nonce, uint64_t *hashes_done)
+int scanhash_myriad( struct work *work, uint32_t max_nonce,
+                     uint64_t *hashes_done, struct thr_info *mythr )
 {
-        uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
-
 	uint32_t _ALIGN(64) endiandata[20];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
 	const uint32_t first_nonce = pdata[19];
 	uint32_t nonce = first_nonce;
+   int thr_id = mythr->id;  // thr_id arg is deprecated

 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x0000ff;
--- a/algo/groestl/myrgr-4way.c
+++ b/algo/groestl/myrgr-4way.c
@@ -8,7 +8,7 @@
 #include <string.h>

 #include "aes_ni/hash-groestl.h"
-#include "algo/sha/sha2-hash-4way.h"
+#include "algo/sha/sha-hash-4way.h"

 typedef struct {
    hashState_groestl       groestl;
@@ -33,7 +33,7 @@ void myriad_4way_hash( void *output, const void *input )
     myrgr_4way_ctx_holder ctx;
     memcpy( &ctx, &myrgr_4way_ctx, sizeof(myrgr_4way_ctx) );

-     mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, input, 640 );
+     dintrlv_4x32( hash0, hash1, hash2, hash3, input, 640 );

     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 640 );
     memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
@@ -43,66 +43,52 @@ void myriad_4way_hash( void *output, const void *input )
     memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 640 );

-     mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );

     sha256_4way( &ctx.sha, vhash, 64 );
-     sha256_4way_close( &ctx.sha, vhash );
-
-     mm128_deinterleave_4x32( output, output+32, output+64, output+96,
-                           vhash, 256 );
+     sha256_4way_close( &ctx.sha, output );
 }

-int scanhash_myriad_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                          uint64_t *hashes_done )
+int scanhash_myriad_4way( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t hash[8*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-   uint32_t _ALIGN(64) edata[20];
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+   uint32_t *hash7 = &(hash[7<<2]);
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
-   uint32_t *nonces = work->nonces;
-   int num_found = 0;
-   uint32_t *noncep = vdata + 76; // 19*4
+   __m128i  *noncev = (__m128i*)vdata + 19;   // aligned
+   int thr_id = mythr->id;  // thr_id arg is deprecated

-/*
-        uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
-
-	uint32_t _ALIGN(64) endiandata[20];
-	const uint32_t first_nonce = pdata[19];
-	uint32_t nonce = first_nonce;
-*/
   if ( opt_benchmark )
      ( (uint32_t*)ptarget )[7] = 0x0000ff;

-   swab32_array( edata, pdata, 20 );
-   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
-
+   mm128_bswap32_intrlv80_4x32( vdata, pdata );
   do {
-      be32enc( noncep,   n   );
-      be32enc( noncep+1, n+1 );
-      be32enc( noncep+2, n+2 );
-      be32enc( noncep+3, n+3 );
+      *noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );

      myriad_4way_hash( hash, vdata );
      pdata[19] = n;

-      for ( int i = 0; i < 4; i++ )
-      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
+      for ( int lane = 0; lane < 4; lane++ )   // one candidate nonce per lane: n + lane
+      if ( hash7[ lane ] <= Htarg )   // hash7 = &hash[7<<2]; presumably word 7 of this lane in a 4x32 interleaved hash - confirm
       {
-          pdata[19] = n+i;
-          nonces[ num_found++ ] = n+i;
-          work_set_target_ratio( work, hash+(i<<3) );
+         extr_lane_4x32( lane_hash, hash, lane, 256 );   // presumably copies this lane's 256-bit hash into lane_hash - confirm
+         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )   // nothing is submitted in benchmark mode
+         {
+            pdata[19] = n + lane;   // record the nonce belonging to this lane
+            submit_lane_solution( work, lane_hash, mythr, lane );
+         }
      }
      n += 4;
-   } while ( (num_found == 0) && (n < max_nonce-4)
-                   && !work_restart[thr_id].restart);
+   } while ( (n < max_nonce-4) && !work_restart[thr_id].restart);

   *hashes_done = n - first_nonce + 1;
-   return num_found;
+   return 0;
 }

 #endif
--- a/algo/groestl/myrgr-gate.h
+++ b/algo/groestl/myrgr-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__) && defined(__AES__)
+#if defined(__AVX2__) && defined(__AES__) && !defined(__SHA__)
  #define MYRGR_4WAY
 #endif

@@ -12,8 +12,8 @@

 void myriad_4way_hash( void *state, const void *input );

-int scanhash_myriad_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
+int scanhash_myriad_4way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );

 void init_myrgr_4way_ctx();

@@ -21,8 +21,8 @@ void init_myrgr_4way_ctx();

 void myriad_hash( void *state, const void *input );

-int scanhash_myriad( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done );
+int scanhash_myriad( struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done, struct thr_info *mythr );

 void init_myrgr_ctx();

--- a/algo/hamsi/hamsi-hash-4way.c
+++ b/algo/hamsi/hamsi-hash-4way.c
@@ -531,16 +531,17 @@ static const sph_u32 T512[64][16] = {

 #define INPUT_BIG \
 do { \
+  const __m256i zero = _mm256_setzero_si256(); \
  __m256i db = *buf; \
  const sph_u32 *tp = &T512[0][0]; \
-  m0 = m256_zero; \
-  m1 = m256_zero; \
-  m2 = m256_zero; \
-  m3 = m256_zero; \
-  m4 = m256_zero; \
-  m5 = m256_zero; \
-  m6 = m256_zero; \
-  m7 = m256_zero; \
+  m0 = zero; \
+  m1 = zero; \
+  m2 = zero; \
+  m3 = zero; \
+  m4 = zero; \
+  m5 = zero; \
+  m6 = zero; \
+  m7 = zero; \
  for ( int u = 0; u < 64; u++ ) \
  { \
     __m256i dm = _mm256_and_si256( db, m256_one_64 ) ; \
@@ -913,9 +914,7 @@ void hamsi512_4way( hamsi_4way_big_context *sc, const void *data, size_t len )

 void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
 {
-   __m256i *out = (__m256i*)dst;
   __m256i pad[1];
-   size_t u;
   int ch, cl;

   sph_enc32be( &ch, sc->count_high );
@@ -925,8 +924,8 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
                                  0UL, 0x80UL, 0UL, 0x80UL );
   hamsi_big( sc, sc->buf, 1 );
   hamsi_big_final( sc, pad );
-   for ( u = 0; u < 8; u ++ )
-      out[u] = mm256_bswap_32( sc->h[u] );
+
+   mm256_block_bswap_32( (__m256i*)dst, sc->h );
 }

 #ifdef __cplusplus
--- a/algo/hamsi/hamsi-hash-4way.h
+++ b/algo/hamsi/hamsi-hash-4way.h
@@ -40,7 +40,7 @@

 #if defined (__AVX2__)

-#include "avxdefs.h"
+#include "simd-utils.h"

 #ifdef __cplusplus
 extern "C"{
--- a/algo/haval/haval-hash-4way.h
+++ b/algo/haval/haval-hash-4way.h
@@ -69,7 +69,7 @@ extern "C"{

 #include <stddef.h>
 #include "algo/sha/sph_types.h"
-#include "avxdefs.h"
+#include "simd-utils.h"

 #define SPH_SIZE_haval256_5   256

--- a/algo/heavy/bastion.c
+++ b/algo/heavy/bastion.c
@@ -131,12 +131,14 @@ void bastionhash(void *output, const void *input)
 	memcpy(output, hash, 32);
 }

-int scanhash_bastion(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
+int scanhash_bastion( struct work *work, uint32_t max_nonce,
+      uint64_t *hashes_done, struct thr_info *mythr)
 {
 	uint32_t _ALIGN(64) hash32[8];
 	uint32_t _ALIGN(64) endiandata[20];
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
+   int thr_id = mythr->id;  // thr_id arg is deprecated

 	const uint32_t Htarg = ptarget[7];
 	const uint32_t first_nonce = pdata[19];
--- a/algo/heavy/heavy.c
+++ b/algo/heavy/heavy.c
@@ -79,11 +79,12 @@ extern void heavyhash(unsigned char* output, const unsigned char* input, int len

 }

-int scanhash_heavy(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
-                    uint32_t max_nonce, uint64_t *hashes_done)
+int scanhash_heavy( uint32_t *pdata, const uint32_t *ptarget,
+            uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr)
 {
    uint32_t hash[8];
    uint32_t start_nonce = pdata[19];
+    int thr_id = mythr->id;  // thr_id arg is deprecated
    
    do {
        heavyhash((unsigned char *)hash, (unsigned char *)pdata, 80);
--- a/algo/hodl/aes.c
+++ b/algo/hodl/aes.c
@@ -83,7 +83,7 @@ void ExpandAESKey256(__m128i *keys, const __m128i *KeyBuf)
    keys[14] = tmp1;
 }

-#ifdef __SSE4_2__
+#if defined(__SSE4_2__)
 //#ifdef __AVX__

 #define AESENC(i,j) \
@@ -151,7 +151,7 @@ void AES256CBC(__m128i** data, const __m128i** next, __m128i ExpandedKey[][16],
    }
 }

-#else    // NO SSE4.2
+#else    // NO AVX

 static inline __m128i AES256Core(__m128i State, const __m128i *ExpandedKey)
 {
--- a/algo/hodl/hodl-gate.c
+++ b/algo/hodl/hodl-gate.c
@@ -15,11 +15,6 @@ pthread_barrier_t hodl_barrier;
 // need to be passed.
 unsigned char *hodl_scratchbuf = NULL;

-void hodl_set_target( struct work* work, double diff )
-{
-     diff_to_target(work->target, diff / 8388608.0 );
-}
-
 void hodl_le_build_stratum_request( char* req, struct work* work,
                                    struct stratum_ctx *sctx ) 
 {
@@ -143,20 +138,20 @@ bool hodl_do_this_thread( int thr_id )
  return ( thr_id == 0 );
 }

-int hodl_scanhash( int thr_id, struct work* work, uint32_t max_nonce,
-                   uint64_t *hashes_done )
+int hodl_scanhash( struct work* work, uint32_t max_nonce,
+                   uint64_t *hashes_done, struct thr_info *mythr )
 {
 #if defined(__AES__)
-  GenRandomGarbage( (CacheEntry*)hodl_scratchbuf, work->data, thr_id );
+  GenRandomGarbage( (CacheEntry*)hodl_scratchbuf, work->data, mythr->id );
   pthread_barrier_wait( &hodl_barrier );
-  return scanhash_hodl_wolf( thr_id, work, max_nonce, hashes_done );
+  return scanhash_hodl_wolf( work, max_nonce, hashes_done, mythr );  // forward this thread's context
 #endif
   return false;
 }

 bool register_hodl_algo( algo_gate_t* gate )
 {
-#if defined(__AES__)
+#if !defined(__AES__)
  applog( LOG_ERR, "Only CPUs with AES are supported, use legacy version.");
  return false;
 #endif
@@ -166,11 +161,10 @@ bool register_hodl_algo( algo_gate_t* gate )
 //     return false;
 //  }
  pthread_barrier_init( &hodl_barrier, NULL, opt_n_threads );
-  gate->optimizations         = AES_OPT | SSE42_OPT | AVX2_OPT;
+  gate->optimizations         = AES_OPT | AVX_OPT | AVX2_OPT;
  gate->scanhash              = (void*)&hodl_scanhash;
  gate->get_new_work          = (void*)&hodl_get_new_work;
  gate->longpoll_rpc_call     = (void*)&hodl_longpoll_rpc_call;
-  gate->set_target            = (void*)&hodl_set_target;
  gate->build_stratum_request = (void*)&hodl_le_build_stratum_request;
  gate->malloc_txs_request    = (void*)&hodl_malloc_txs_request;
  gate->build_block_header    = (void*)&hodl_build_block_header;
@@ -179,6 +173,7 @@ bool register_hodl_algo( algo_gate_t* gate )
  gate->work_cmp_size         = 76;
  hodl_scratchbuf = (unsigned char*)malloc( 1 << 30 );
  allow_getwork = false;
+  opt_target_factor = 8388608.0;
  return ( hodl_scratchbuf != NULL );
 }

--- a/algo/hodl/hodl-wolf.c
+++ b/algo/hodl/hodl-wolf.c
@@ -17,7 +17,7 @@ void GenerateGarbageCore( CacheEntry *Garbage, int ThreadID, int ThreadCount,
    const uint32_t StartChunk = ThreadID * Chunk;
    const uint32_t EndChunk   = StartChunk + Chunk;

-#ifdef __SSE4_2__
+#if defined(__SSE4_2__)
 //#ifdef __AVX__
    uint64_t* TempBufs[ SHA512_PARALLEL_N ] ;
    uint64_t* desination[ SHA512_PARALLEL_N ];
@@ -61,13 +61,14 @@ void Rev256(uint32_t *Dest, const uint32_t *Src)
 }
 */

-int scanhash_hodl_wolf( int threadNumber, struct work* work, uint32_t max_nonce,
-                        uint64_t *hashes_done )
+int scanhash_hodl_wolf( struct work* work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr )
 {
-#ifdef __SSE4_2__
+#if defined(__SSE4_2__)
 //#ifdef __AVX__
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
+    int threadNumber = mythr->id;
    CacheEntry *Garbage = (CacheEntry*)hodl_scratchbuf;
    CacheEntry Cache[AES_PARALLEL_N];
    __m128i* data[AES_PARALLEL_N];
@@ -139,7 +140,7 @@ int scanhash_hodl_wolf( int threadNumber, struct work* work, uint32_t max_nonce,
    return(0);


-#else  // no SSE4.2
+#else  // no AVX

    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
@@ -147,6 +148,7 @@ int scanhash_hodl_wolf( int threadNumber, struct work* work, uint32_t max_nonce,
    CacheEntry *Garbage = (CacheEntry*)hodl_scratchbuf;
    CacheEntry Cache;
    uint32_t CollisionCount = 0;
+    int threadNumber = mythr->id;

    swab32_array( BlockHdr, pdata, 20 );
        // Search for pattern in psuedorandom data      
@@ -204,7 +206,7 @@ int scanhash_hodl_wolf( int threadNumber, struct work* work, uint32_t max_nonce,
    *hashes_done = CollisionCount;
    return(0);

-#endif  // SSE4.2 else
+#endif  // AVX else

 }

--- a/algo/hodl/hodl-wolf.h
+++ b/algo/hodl/hodl-wolf.h
@@ -19,8 +19,8 @@ typedef union _CacheEntry
 	__m128i dqwords[GARBAGE_SLICE_SIZE >> 4] __attribute__((aligned(16)));
 } CacheEntry;

-int scanhash_hodl_wolf( int thr_id, struct work* work, uint32_t max_nonce,
-                   uint64_t *hashes_done );
+int scanhash_hodl_wolf( struct work* work, uint32_t max_nonce,
+                   uint64_t *hashes_done, struct thr_info *mythr );

 void GenRandomGarbage( CacheEntry *Garbage, uint32_t *pdata, int thr_id);

--- a/algo/hodl/sha512-avx.h
+++ b/algo/hodl/sha512-avx.h
@@ -23,6 +23,7 @@ typedef struct
   __m256i h[8];
   __m256i w[80];
 #elif defined(__SSE4_2__)
+//#elif defined(__AVX__)
   __m128i h[8];
   __m128i w[80];
 #else
@@ -32,7 +33,8 @@ typedef struct

 #ifdef __AVX2__
 #define SHA512_PARALLEL_N 8
-#elif defined(__SSE$_2__)
+#elif defined(__SSE4_2__)
+//#elif defined(__AVX__)
 #define SHA512_PARALLEL_N 4
 #else
 #define SHA512_PARALLEL_N 1   // dummy value
--- a/algo/hodl/sha512_avx.c
+++ b/algo/hodl/sha512_avx.c
@@ -1,6 +1,6 @@
 #ifndef __AVX2__

-#ifdef __SSE4_2__
+#if defined(__SSE4_2__)
 //#ifdef __AVX__

 //Dependencies
--- a/algo/hodl/wolf-aes.h
+++ b/algo/hodl/wolf-aes.h
@@ -6,7 +6,7 @@

 void ExpandAESKey256(__m128i *keys, const __m128i *KeyBuf);

-#ifdef __SSE4_2__
+#if defined(__SSE4_2__)
 //#ifdef __AVX__

 #define AES_PARALLEL_N 8
--- a/algo/jh/jh-hash-4way.c
+++ b/algo/jh/jh-hash-4way.c
@@ -246,18 +246,12 @@ do { \
 	} while (0)
 */

-#define W0(x)   Wz(x, _mm256_set_epi64x( 0x5555555555555555, \
-       0x5555555555555555, 0x5555555555555555, 0x5555555555555555 ), 1 )
-#define W1(x)   Wz(x, _mm256_set_epi64x( 0x3333333333333333, \
-       0x3333333333333333, 0x3333333333333333, 0x3333333333333333 ), 2 )
-#define W2(x)   Wz(x, _mm256_set_epi64x( 0x0F0F0F0F0F0F0F0F, \
-       0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F ), 4 )
-#define W3(x)   Wz(x, _mm256_set_epi64x( 0x00FF00FF00FF00FF, \
-       0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF ), 8 ) 
-#define W4(x)   Wz(x, _mm256_set_epi64x( 0x0000FFFF0000FFFF, \
-       0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF ), 16 )
-#define W5(x)   Wz(x, _mm256_set_epi64x( 0x00000000FFFFFFFF, \
-       0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF ), 32 )
+#define W0(x)   Wz(x, m256_const1_64( 0x5555555555555555 ),  1 )
+#define W1(x)   Wz(x, m256_const1_64( 0x3333333333333333 ),  2 )
+#define W2(x)   Wz(x, m256_const1_64( 0x0F0F0F0F0F0F0F0F ),  4 )
+#define W3(x)   Wz(x, m256_const1_64( 0x00FF00FF00FF00FF ),  8 ) 
+#define W4(x)   Wz(x, m256_const1_64( 0x0000FFFF0000FFFF ), 16 )
+#define W5(x)   Wz(x, m256_const1_64( 0x00000000FFFFFFFF ), 32 )
 #define W6(x) \
 do { \
   __m256i t = x ## h; \
@@ -331,14 +325,14 @@ do { \
 	__m256i m2l = buf[5]; \
 	__m256i m3h = buf[6]; \
 	__m256i m3l = buf[7]; \
-        h0h = _mm256_xor_si256( h0h, m0h ); \
-        h0l = _mm256_xor_si256( h0l, m0l ); \
-        h1h = _mm256_xor_si256( h1h, m1h ); \
-        h1l = _mm256_xor_si256( h1l, m1l ); \
-        h2h = _mm256_xor_si256( h2h, m2h ); \
-        h2l = _mm256_xor_si256( h2l, m2l ); \
-        h3h = _mm256_xor_si256( h3h, m3h ); \
-        h3l = _mm256_xor_si256( h3l, m3l ); \
+   h0h = _mm256_xor_si256( h0h, m0h ); \
+   h0l = _mm256_xor_si256( h0l, m0l ); \
+   h1h = _mm256_xor_si256( h1h, m1h ); \
+   h1l = _mm256_xor_si256( h1l, m1l ); \
+   h2h = _mm256_xor_si256( h2h, m2h ); \
+   h2l = _mm256_xor_si256( h2l, m2l ); \
+   h3h = _mm256_xor_si256( h3h, m3h ); \
+   h3l = _mm256_xor_si256( h3l, m3l ); \

 #define INPUT_BUF2 \
   h4h = _mm256_xor_si256( h4h, m0h ); \
@@ -477,13 +471,48 @@ static const sph_u64 IV512[] = {

 #endif

-static void
-jh_4way_init( jh_4way_context *sc, const void *iv )
+void jh256_4way_init( jh_4way_context *sc )
 {
-    uint64_t *v = (uint64_t*)iv;
-    
-    for ( int i = 0; i < 16; i++ )
-        sc->H[i] = _mm256_set_epi64x( v[i], v[i], v[i], v[i] );
+    // bswapped IV256
+    sc->H[ 0] = m256_const1_64( 0xebd3202c41a398eb );
+    sc->H[ 1] = m256_const1_64( 0xc145b29c7bbecd92 );
+    sc->H[ 2] = m256_const1_64( 0xfac7d4609151931c );
+    sc->H[ 3] = m256_const1_64( 0x038a507ed6820026 );
+    sc->H[ 4] = m256_const1_64( 0x45b92677269e23a4 );
+    sc->H[ 5] = m256_const1_64( 0x77941ad4481afbe0 );
+    sc->H[ 6] = m256_const1_64( 0x7a176b0226abb5cd );
+    sc->H[ 7] = m256_const1_64( 0xa82fff0f4224f056 );
+    sc->H[ 8] = m256_const1_64( 0x754d2e7f8996a371 );
+    sc->H[ 9] = m256_const1_64( 0x62e27df70849141d );
+    sc->H[10] = m256_const1_64( 0x948f2476f7957627 );
+    sc->H[11] = m256_const1_64( 0x6c29804757b6d587 );
+    sc->H[12] = m256_const1_64( 0x6c0d8eac2d275e5c );
+    sc->H[13] = m256_const1_64( 0x0f7a0557c6508451 );
+    sc->H[14] = m256_const1_64( 0xea12247067d3e47b );
+    sc->H[15] = m256_const1_64( 0x69d71cd313abe389 );
+    sc->ptr = 0;
+    sc->block_count = 0;
+}
+
+void jh512_4way_init( jh_4way_context *sc )
+{
+    // bswapped IV512
+    sc->H[ 0] = m256_const1_64( 0x17aa003e964bd16f );
+    sc->H[ 1] = m256_const1_64( 0x43d5157a052e6a63 );
+    sc->H[ 2] = m256_const1_64( 0x0bef970c8d5e228a );
+    sc->H[ 3] = m256_const1_64( 0x61c3b3f2591234e9 );
+    sc->H[ 4] = m256_const1_64( 0x1e806f53c1a01d89 );
+    sc->H[ 5] = m256_const1_64( 0x806d2bea6b05a92a );
+    sc->H[ 6] = m256_const1_64( 0xa6ba7520dbcc8e58 );
+    sc->H[ 7] = m256_const1_64( 0xf73bf8ba763a0fa9 );
+    sc->H[ 8] = m256_const1_64( 0x694ae34105e66901 );
+    sc->H[ 9] = m256_const1_64( 0x5ae66f2e8e8ab546 );
+    sc->H[10] = m256_const1_64( 0x243c84c1d0a74710 );
+    sc->H[11] = m256_const1_64( 0x99c15a2db1716e3b );
+    sc->H[12] = m256_const1_64( 0x56f8b19decf657cf );
+    sc->H[13] = m256_const1_64( 0x56b116577c8806a7 );
+    sc->H[14] = m256_const1_64( 0xfb1785e6dffcc2e3 );
+    sc->H[15] = m256_const1_64( 0x4bdd8ccc78465a54 );
    sc->ptr = 0;
    sc->block_count = 0;
 }
@@ -542,7 +571,7 @@ jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,
   size_t numz, u;
   sph_u64 l0, l1, l0e, l1e;

-   buf[0] = _mm256_set_epi64x( 0x80, 0x80, 0x80, 0x80 );
+   buf[0] = m256_const1_64( 0x80ULL );

   if ( sc->ptr == 0 )
       numz = 48;
@@ -555,8 +584,8 @@ jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,
   l1 = SPH_T64(sc->block_count >> 55);
   sph_enc64be( &l0e, l0 );
   sph_enc64be( &l1e, l1 );
-   *(buf + (numz>>3)    ) = _mm256_set_epi64x( l1e, l1e, l1e, l1e );
-   *(buf + (numz>>3) + 1) = _mm256_set_epi64x( l0e, l0e, l0e, l0e ); 
+   *(buf + (numz>>3)    ) = _mm256_set1_epi64x( l1e );
+   *(buf + (numz>>3) + 1) = _mm256_set1_epi64x( l0e ); 

   jh_4way_core( sc, buf, numz + 16 );

@@ -566,11 +595,13 @@ jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,
    memcpy_256( dst256, buf, 8 );
 }

+/*
 void
 jh256_4way_init(void *cc)
 {
-	jh_4way_init(cc, IV256);
+	jhs_4way_init(cc, IV256);
 }
+*/

 void
 jh256_4way(void *cc, const void *data, size_t len)
@@ -584,11 +615,13 @@ jh256_4way_close(void *cc, void *dst)
 	jh_4way_close(cc, 0, 0, dst, 8, IV256);
 }

+/*
 void
 jh512_4way_init(void *cc)
 {
-	jh_4way_init(cc, IV512);
+	jhb_4way_init(cc, IV512);
 }
+*/

 void
 jh512_4way(void *cc, const void *data, size_t len)
--- a/algo/jh/jh-hash-4way.h
+++ b/algo/jh/jh-hash-4way.h
@@ -44,7 +44,7 @@ extern "C"{

 #include <stddef.h>
 #include "algo/sha/sph_types.h"
-#include "avxdefs.h"
+#include "simd-utils.h"

 #define SPH_SIZE_jh256   256

@@ -79,13 +79,13 @@ typedef jh_4way_context jh256_4way_context;

 typedef jh_4way_context jh512_4way_context;

-void jh256_4way_init(void *cc);
+void jh256_4way_init( jh_4way_context *sc);

 void jh256_4way(void *cc, const void *data, size_t len);

 void jh256_4way_close(void *cc, void *dst);

-void jh512_4way_init(void *cc);
+void jh512_4way_init( jh_4way_context *sc );

 void jh512_4way(void *cc, const void *data, size_t len);

--- a/algo/jh/jha-4way.c
+++ b/algo/jh/jha-4way.c
@@ -3,7 +3,6 @@
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
-//#include "avxdefs.h"

 #if defined(JHA_4WAY)

@@ -13,9 +12,6 @@
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/groestl/aes_ni/hash-groestl.h"

-//static __thread keccak512_4way_context jha_kec_mid
-//                                   __attribute__ ((aligned (64)));
-
 void jha_hash_4way( void *out, const void *input )
 {
    uint64_t hash0[8] __attribute__ ((aligned (64)));
@@ -46,7 +42,7 @@ void jha_hash_4way( void *out, const void *input )
       vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256(
               vh[0], _mm256_set1_epi64x( 1 ) ), m256_zero );

-       mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+       dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
       init_groestl( &ctx_groestl, 64 );
       update_and_final_groestl( &ctx_groestl, (char*)hash0,
                                               (char*)hash0, 512 );
@@ -59,7 +55,7 @@ void jha_hash_4way( void *out, const void *input )
       init_groestl( &ctx_groestl, 64 );
       update_and_final_groestl( &ctx_groestl, (char*)hash3,
                                               (char*)hash3, 512 );
-       mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
+       intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );

       skein512_4way_init( &ctx_skein );
       skein512_4way( &ctx_skein, vhash, 64 );
@@ -77,26 +73,24 @@ void jha_hash_4way( void *out, const void *input )
       jh512_4way_close( &ctx_jh, vhashB );

       for ( int i = 0; i < 8; i++ )
-          vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
+          casti_m256i( out, i ) = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
    }
-
-    mm256_deinterleave_4x64( out, out+32, out+64, out+96, vhash, 256 );
 }

-int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                       uint64_t *hashes_done )
+int scanhash_jha_4way( struct work *work, uint32_t max_nonce,
+                       uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t hash[8*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-   uint32_t endiandata[20] __attribute__((aligned(64)));
+   uint32_t *hash7 = &(hash[25]);
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   const uint32_t Htarg = ptarget[7];
   uint32_t n = pdata[19];
-   uint32_t *nonces = work->nonces;
-   int num_found = 0;
-   uint32_t *noncep = vdata + 73;   // 9*8 + 1
+    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
+   int thr_id = mythr->id;  // thr_id arg is deprecated

   uint64_t htmax[] = {
 		0,
@@ -115,11 +109,7 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
 		0
 	};

-   for ( int i=0; i < 19; i++ )
-      be32enc( &endiandata[i], pdata[i] );
-
-   uint64_t *edata = (uint64_t*)endiandata;
-   mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+   mm256_bswap32_intrlv80_4x64( vdata, pdata );

   for ( int m = 0; m < 6; m++ )
   {
@@ -127,29 +117,27 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
      {
         uint32_t mask = masks[m];
         do {
-              be32enc( noncep,   n   );
-              be32enc( noncep+2, n+1 );
-              be32enc( noncep+4, n+2 );
-              be32enc( noncep+6, n+3 );
+              *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );

              jha_hash_4way( hash, vdata );
              pdata[19] = n;

-              for ( int i = 0; i < 4; i++ )
-              if ( ( !( (hash+(i<<3))[7] & mask ) == 0 )
-                  && fulltest( hash+(i<<3), ptarget ) )
+              for ( int i = 0; i < 4; i++ ) if ( !( (hash7[i<<1] & mask ) == 0 ) )  // 4x64 interleave: lanes are 2 words apart
              {
-                 pdata[19] = n;
-                 nonces[ num_found++ ] = n+i;
-                 work_set_target_ratio( work, hash+(i<<3) );
+                 extr_lane_4x64( lane_hash, hash, i, 256 );
+                 if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )  // test the extracted lane, not the interleaved buffer
+                 {
+                    pdata[19] = n+i;
+                    submit_lane_solution( work, lane_hash, mythr, i );
+                 }
              }
              n += 4;
-         } while ( ( num_found == 0 ) && ( n < max_nonce )
-                     && !work_restart[thr_id].restart );
+         } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
         break;
      }
   }
   *hashes_done = n - first_nonce + 1;
-   return num_found;
+   return 0;
 }
 #endif
--- a/algo/jh/jha-gate.c
+++ b/algo/jh/jha-gate.c
@@ -12,7 +12,7 @@ bool register_jha_algo( algo_gate_t* gate )
  gate->hash             = (void*)&jha_hash;
 #endif
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
-  gate->set_target       = (void*)&scrypt_set_target;
+  opt_target_factor = 65536.0;
  return true;
 };

--- a/algo/jh/jha-gate.h
+++ b/algo/jh/jha-gate.h
@@ -12,14 +12,14 @@
 #if defined JHA_4WAY
 void jha_hash_4way( void *state, const void *input );

-int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                       uint64_t *hashes_done );
+int scanhash_jha_4way( struct work *work, uint32_t max_nonce,
+                       uint64_t *hashes_done, struct thr_info *mythr );
 #endif

 void jha_hash( void *state, const void *input );

-int scanhash_jha( int thr_id, struct work *work, uint32_t max_nonce,
-                     uint64_t *hashes_done );
+int scanhash_jha( struct work *work, uint32_t max_nonce,
+                     uint64_t *hashes_done, struct thr_info *mythr );

 #endif

--- a/algo/jh/jha.c
+++ b/algo/jh/jha.c
@@ -81,7 +81,8 @@ void jha_hash(void *output, const void *input)
 	memcpy(output, hash, 32);
 }

-int scanhash_jha(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
+int scanhash_jha( struct work *work, uint32_t max_nonce,
+                  uint64_t *hashes_done, struct thr_info *mythr )
 {
 	uint32_t _ALIGN(128) hash32[8];
 	uint32_t _ALIGN(128) endiandata[20];
@@ -89,7 +90,8 @@ int scanhash_jha(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *ha
 	uint32_t *ptarget = work->target;
 	const uint32_t first_nonce = pdata[19];
 	const uint32_t Htarg = ptarget[7];
-	uint32_t n = pdata[19] - 1;
+   uint32_t n = pdata[19] - 1;
+   int thr_id = mythr->id;  // thr_id arg is deprecated

 	uint64_t htmax[] = {
 		0,
--- a/algo/keccak/keccak-4way.c
+++ b/algo/keccak/keccak-4way.c
@@ -16,55 +16,44 @@ void keccakhash_4way(void *state, const void *input)
    keccak256_4way_close( &ctx, state );
 }

-int scanhash_keccak_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                          uint64_t *hashes_done)
+int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
-   uint32_t hash[8*4] __attribute__ ((aligned (32)));
+   uint32_t hash[16*4] __attribute__ ((aligned (32)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *hash7 = &(hash[25]);   // 3*8+1
-   uint32_t lane_hash[8];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
+   __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
 //   const uint32_t Htarg = ptarget[7];
-   uint32_t endiandata[20];
-   uint32_t *nonces = work->nonces;
-   int num_found = 0;
-   uint32_t *noncep = vdata + 73;   // 9*8 + 1
-
-   for ( int i=0; i < 19; i++ ) 
-      be32enc( &endiandata[i], pdata[i] );
-
-   uint64_t *edata = (uint64_t*)endiandata;
-   mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+    int thr_id = mythr->id;  // thr_id arg is deprecated

+   mm256_bswap32_intrlv80_4x64( vdata, pdata );
   do {
-      be32enc( noncep,   n   );
-      be32enc( noncep+2, n+1 );
-      be32enc( noncep+4, n+2 );
-      be32enc( noncep+6, n+3 );
+       *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
 	
      keccakhash_4way( hash, vdata );

      for ( int lane = 0; lane < 4; lane++ )
-      if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
+      if ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 )
      {
-          mm256_extract_lane_4x64( lane_hash, hash, lane, 256 );
-          if ( fulltest( lane_hash, ptarget ) )
+          extr_lane_4x64( lane_hash, hash, lane, 256 );
+          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
          {
              pdata[19] = n + lane;
-              nonces[ num_found++ ] = n + lane;
-              work_set_target_ratio( work, lane_hash );
+              submit_lane_solution( work, lane_hash, mythr, lane );
          }
      }
      n += 4;

-   } while ( (num_found == 0) && (n < max_nonce-4)
-                   && !work_restart[thr_id].restart);
+   } while ( (n < max_nonce-4) && !work_restart[thr_id].restart);

   *hashes_done = n - first_nonce + 1;
-   return num_found;
+   return 0;
 }

 #endif
--- a/algo/keccak/keccak-gate.c
+++ b/algo/keccak/keccak-gate.c
@@ -1,18 +1,13 @@
 #include "keccak-gate.h"

-void keccak_set_target( struct work* work, double job_diff )
-{
-  work_set_target( work, job_diff / (128.0 * opt_diff_factor) );
-}
-
 int64_t keccak_get_max64() { return 0x7ffffLL; }

 bool register_keccak_algo( algo_gate_t* gate )
 {
  gate->optimizations = AVX2_OPT;
  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
-  gate->set_target      = (void*)&keccak_set_target;
  gate->get_max64       = (void*)&keccak_get_max64;
+  opt_target_factor = 128.0;
 #if defined (KECCAK_4WAY)
  gate->scanhash  = (void*)&scanhash_keccak_4way;
  gate->hash      = (void*)&keccakhash_4way;
@@ -23,17 +18,12 @@ bool register_keccak_algo( algo_gate_t* gate )
  return true;
 };

-void keccakc_set_target( struct work* work, double job_diff )
-{
-  work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
-}
-
 bool register_keccakc_algo( algo_gate_t* gate )
 {
  gate->optimizations = AVX2_OPT;
  gate->gen_merkle_root = (void*)&sha256d_gen_merkle_root;
-  gate->set_target      = (void*)&keccakc_set_target;
  gate->get_max64       = (void*)&keccak_get_max64;
+  opt_target_factor = 256.0;
 #if defined (KECCAK_4WAY)
  gate->scanhash  = (void*)&scanhash_keccak_4way;
  gate->hash      = (void*)&keccakhash_4way;
--- a/algo/keccak/keccak-gate.h
+++ b/algo/keccak/keccak-gate.h
@@ -11,13 +11,13 @@
 #if defined(KECCAK_4WAY)

 void keccakhash_4way( void *state, const void *input );
-int scanhash_keccak_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
+int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );

 #endif

 void keccakhash( void *state, const void *input );
-int scanhash_keccak( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done );
+int scanhash_keccak( struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done, struct thr_info *mythr );

 #endif
--- a/algo/keccak/keccak-hash-4way.c
+++ b/algo/keccak/keccak-hash-4way.c
@@ -370,18 +370,23 @@ static const sph_u64 RC[] = {

 static void keccak64_init( keccak64_ctx_m256i *kc, unsigned out_size )
 {
-   int i;
-   for (i = 0; i < 25; i ++)
-          kc->w[i] = _mm256_setzero_si256();
+   __m256i zero = m256_zero;
+   __m256i neg1 = m256_neg1;

   // Initialization for the "lane complement".
-   kc->w[ 1] = m256_neg1;
-   kc->w[ 2] = m256_neg1;
-   kc->w[ 8] = m256_neg1;
-   kc->w[12] = m256_neg1;
-   kc->w[17] = m256_neg1;
-   kc->w[20] = m256_neg1;
-   kc->ptr = 0;
+   kc->w[ 0] = zero;   kc->w[ 1] = neg1;
+   kc->w[ 2] = neg1;   kc->w[ 3] = zero;
+   kc->w[ 4] = zero;   kc->w[ 5] = zero;
+   kc->w[ 6] = zero;   kc->w[ 7] = zero;
+   kc->w[ 8] = neg1;   kc->w[ 9] = zero;
+   kc->w[10] = zero;   kc->w[11] = zero;
+   kc->w[12] = neg1;   kc->w[13] = zero;
+   kc->w[14] = zero;   kc->w[15] = zero;
+   kc->w[16] = zero;   kc->w[17] = neg1;
+   kc->w[18] = zero;   kc->w[19] = zero;
+   kc->w[20] = neg1;   kc->w[21] = zero;
+   kc->w[22] = zero;   kc->w[23] = zero;
+   kc->w[24] = zero;   kc->ptr = 0;
   kc->lim = 200 - (out_size >> 2);
 }

@@ -441,8 +446,8 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
    eb = 0x100  >> 8;
    if ( kc->ptr == (lim - 8) )
    {
-        uint64_t t = eb | 0x8000000000000000;
-        u.tmp[0] = _mm256_set_epi64x( t, t, t, t );
+        const uint64_t t = eb | 0x8000000000000000;
+        u.tmp[0] = m256_const1_64( t );
        j = 8;
    }
    else
@@ -450,8 +455,7 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
        j = lim - kc->ptr;
        u.tmp[0] = _mm256_set_epi64x( eb, eb, eb, eb );
        memset_zero_256( u.tmp + 1, (j>>3) - 2 );
-        u.tmp[ (j>>3) - 1] = _mm256_set_epi64x( 0x8000000000000000,
-                0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
+        u.tmp[ (j>>3) - 1] = m256_const1_64( 0x8000000000000000 );
    }
    keccak64_core( kc, u.tmp, j, lim );
    /* Finalize the "lane complement" */
@@ -461,9 +465,7 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
    NOT64( kc->w[12], kc->w[12] );
    NOT64( kc->w[17], kc->w[17] );
    NOT64( kc->w[20], kc->w[20] );
-    for ( j = 0; j < m256_len; j++ )
-         u.tmp[j] =  kc->w[j]; 
-    memcpy_256( dst, u.tmp, m256_len );
+    memcpy_256( dst, kc->w, m256_len );
 }

 void keccak256_4way_init( void *kc )
--- a/algo/keccak/keccak-hash-4way.h
+++ b/algo/keccak/keccak-hash-4way.h
@@ -44,7 +44,7 @@ extern "C"{

 #include <stddef.h>
 #include "algo/sha/sph_types.h"
-#include "avxdefs.h"
+#include "simd-utils.h"

 #define SPH_SIZE_keccak256   256

--- a/algo/keccak/keccak.c
+++ b/algo/keccak/keccak.c
@@ -18,14 +18,15 @@ void keccakhash(void *state, const void *input)
 	memcpy(state, hash, 32);
 }

-int scanhash_keccak(int thr_id, struct work *work,
-	uint32_t max_nonce, uint64_t *hashes_done)
+int scanhash_keccak( struct work *work,
+	uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;
 	uint32_t n = pdata[19] - 1;
 	const uint32_t first_nonce = pdata[19];
 	//const uint32_t Htarg = ptarget[7];
+   int thr_id = mythr->id;  // thr_id arg is deprecated

 	uint32_t _ALIGN(32) hash64[8];
 	uint32_t endiandata[32];
--- a/algo/luffa/luffa-hash-2way.c
+++ b/algo/luffa/luffa-hash-2way.c
@@ -24,7 +24,7 @@

 #if defined(__AVX2__)

-#include "avxdefs.h"
+#include "simd-utils.h"

 #define MASK _mm256_set_epi32( 0UL, 0UL, 0UL, 0xffffffffUL, \
                               0UL, 0UL, 0UL, 0xffffffffUL )
--- a/algo/luffa/luffa-hash-2way.h
+++ b/algo/luffa/luffa-hash-2way.h
@@ -24,7 +24,7 @@

 #include <immintrin.h>
 #include "algo/sha/sha3-defs.h"
-#include "avxdefs.h"
+#include "simd-utils.h"

 /* The length of digests*/
 #define DIGEST_BIT_LEN_224 224
--- a/algo/luffa/luffa_for_sse2.c
+++ b/algo/luffa/luffa_for_sse2.c
@@ -20,7 +20,7 @@

 #include <string.h>
 #include <emmintrin.h>
-#include "avxdefs.h"
+#include "simd-utils.h"
 #include "luffa_for_sse2.h"

 #define MULT2(a0,a1) do \
--- a/algo/luffa/sph_luffa.c
+++ b/algo/luffa/sph_luffa.c
@@ -77,6 +77,24 @@ static const sph_u32 V_INIT[5][8] = {
 	}
 };

+#if SPH_LUFFA_PARALLEL
+
+static const sph_u64 RCW010[8] = {
+   SPH_C64(0xb6de10ed303994a6), SPH_C64(0x70f47aaec0e65299),
+   SPH_C64(0x0707a3d46cc33a12), SPH_C64(0x1c1e8f51dc56983e),
+   SPH_C64(0x707a3d451e00108f), SPH_C64(0xaeb285627800423d),
+   SPH_C64(0xbaca15898f5b7882), SPH_C64(0x40a46f3e96e1db12)
+};
+
+static const sph_u64 RCW014[8] = {
+   SPH_C64(0x01685f3de0337818), SPH_C64(0x05a17cf4441ba90d),
+   SPH_C64(0xbd09caca7f34d442), SPH_C64(0xf4272b289389217f),
+   SPH_C64(0x144ae5cce5a8bce6), SPH_C64(0xfaa7ae2b5274baf4),
+   SPH_C64(0x2e48f1c126889ba7), SPH_C64(0xb923c7049a226e9d)
+};
+
+#else
+
 static const sph_u32 RC00[8] = {
 	SPH_C32(0x303994a6), SPH_C32(0xc0e65299),
 	SPH_C32(0x6cc33a12), SPH_C32(0xdc56983e),
@@ -105,20 +123,18 @@ static const sph_u32 RC14[8] = {
 	SPH_C32(0x2e48f1c1), SPH_C32(0xb923c704)
 };

-#if SPH_LUFFA_PARALLEL
-
-static const sph_u64 RCW010[8] = {
-	SPH_C64(0xb6de10ed303994a6), SPH_C64(0x70f47aaec0e65299),
-	SPH_C64(0x0707a3d46cc33a12), SPH_C64(0x1c1e8f51dc56983e),
-	SPH_C64(0x707a3d451e00108f), SPH_C64(0xaeb285627800423d),
-	SPH_C64(0xbaca15898f5b7882), SPH_C64(0x40a46f3e96e1db12)
+static const sph_u32 RC30[8] = {
+   SPH_C32(0xb213afa5), SPH_C32(0xc84ebe95),
+   SPH_C32(0x4e608a22), SPH_C32(0x56d858fe),
+   SPH_C32(0x343b138f), SPH_C32(0xd0ec4e3d),
+   SPH_C32(0x2ceb4882), SPH_C32(0xb3ad2208)
 };

-static const sph_u64 RCW014[8] = {
-	SPH_C64(0x01685f3de0337818), SPH_C64(0x05a17cf4441ba90d),
-	SPH_C64(0xbd09caca7f34d442), SPH_C64(0xf4272b289389217f),
-	SPH_C64(0x144ae5cce5a8bce6), SPH_C64(0xfaa7ae2b5274baf4),
-	SPH_C64(0x2e48f1c126889ba7), SPH_C64(0xb923c7049a226e9d)
+static const sph_u32 RC34[8] = {
+   SPH_C32(0xe028c9bf), SPH_C32(0x44756f91),
+   SPH_C32(0x7e8fce32), SPH_C32(0x956548be),
+   SPH_C32(0xfe191be2), SPH_C32(0x3cb226e5),
+   SPH_C32(0x5944a28e), SPH_C32(0xa1c4c355)
 };

 #endif
@@ -137,19 +153,6 @@ static const sph_u32 RC24[8] = {
 	SPH_C32(0x36eda57f), SPH_C32(0x703aace7)
 };

-static const sph_u32 RC30[8] = {
-	SPH_C32(0xb213afa5), SPH_C32(0xc84ebe95),
-	SPH_C32(0x4e608a22), SPH_C32(0x56d858fe),
-	SPH_C32(0x343b138f), SPH_C32(0xd0ec4e3d),
-	SPH_C32(0x2ceb4882), SPH_C32(0xb3ad2208)
-};
-
-static const sph_u32 RC34[8] = {
-	SPH_C32(0xe028c9bf), SPH_C32(0x44756f91),
-	SPH_C32(0x7e8fce32), SPH_C32(0x956548be),
-	SPH_C32(0xfe191be2), SPH_C32(0x3cb226e5),
-	SPH_C32(0x5944a28e), SPH_C32(0xa1c4c355)
-};

 #if SPH_LUFFA_PARALLEL

--- a/algo/lyra2/allium-4way.c
+++ b/algo/lyra2/allium-4way.c
@@ -44,10 +44,11 @@ void allium_4way_hash( void *state, const void *input )
   blake256_4way( &ctx.blake, input + (64<<2), 16 );
   blake256_4way_close( &ctx.blake, vhash32 );

-   mm256_reinterleave_4x64( vhash64, vhash32, 256 );
+   rintrlv_4x32_4x64( vhash64, vhash32, 256 );
   keccak256_4way( &ctx.keccak, vhash64, 32 );
   keccak256_4way_close( &ctx.keccak, vhash64 );
-   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
+
+   dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );

   LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
   LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
@@ -67,52 +68,42 @@ void allium_4way_hash( void *state, const void *input )
   LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
   LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );

-   mm256_interleave_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
+   intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
+
   skein256_4way( &ctx.skein, vhash64, 32 );
   skein256_4way_close( &ctx.skein, vhash64 );
-   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );

-   update_and_final_groestl256( &ctx.groestl, hash0, hash0, 256 );
-   memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, hash1, hash1, 256 );
-   memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, hash2, hash2, 256 );
-   memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, hash3, hash3, 256 );
+   dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );

-   memcpy( state,    hash0, 32 );
-   memcpy( state+32, hash1, 32 );
-   memcpy( state+64, hash2, 32 );
-   memcpy( state+96, hash3, 32 );
+   update_and_final_groestl256( &ctx.groestl, state, hash0, 256 );
+   memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+32, hash1, 256 );
+   memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+64, hash2, 256 );
+   memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+96, hash3, 256 );
 }

-int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_allium_4way( struct work *work, uint32_t max_nonce,
                             uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t hash[8*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-   uint32_t _ALIGN(64) edata[20];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   const uint32_t Htarg = ptarget[7];
   __m128i  *noncev = (__m128i*)vdata + 19;   // aligned
-   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
+   int thr_id = mythr->id;  // thr_id arg is deprecated

   if ( opt_benchmark )
      ( (uint32_t*)ptarget )[7] = 0x0000ff;

-   casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
-   casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
-   casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
-   casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
-   casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
-
-   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
+   mm128_bswap32_intrlv80_4x32( vdata, pdata );
   blake256_4way_init( &allium_4way_ctx.blake );
   blake256_4way( &allium_4way_ctx.blake, vdata, 64 );

@@ -124,10 +115,10 @@ int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,

     for ( int lane = 0; lane < 4; lane++ ) if ( (hash+(lane<<3))[7] <= Htarg )
     {
-        if ( fulltest( hash+(lane<<3), ptarget ) )
+        if ( fulltest( hash+(lane<<3), ptarget ) && !opt_benchmark )
        {
           pdata[19] = n + lane;
-           submit_solution( work, hash+(lane<<3), mythr, lane );
+           submit_lane_solution( work, hash+(lane<<3), mythr, lane );
         }
     }
     n += 4;
--- a/algo/lyra2/allium.c
+++ b/algo/lyra2/allium.c
@@ -69,7 +69,7 @@ void allium_hash(void *state, const void *input)
    memcpy(state, hash, 32);
 }

-int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_allium( struct work *work, uint32_t max_nonce,
                     uint64_t *hashes_done, struct thr_info *mythr )
 {
    uint32_t _ALIGN(128) hash[8];
@@ -80,7 +80,7 @@ int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
    const uint32_t Htarg = ptarget[7];
    const uint32_t first_nonce = pdata[19];
    uint32_t nonce = first_nonce;
-   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
+    int thr_id = mythr->id;  // thr_id arg is deprecated

    if ( opt_benchmark )
        ptarget[7] = 0x3ffff;
@@ -94,18 +94,14 @@ int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
    do {
        be32enc( &endiandata[19], nonce );
        allium_hash( hash, endiandata );
-
-        if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
+        if ( hash[7] <= Htarg )
+        if ( fulltest( hash, ptarget ) && !opt_benchmark )
        {
-            work_set_target_ratio( work, hash );
            pdata[19] = nonce;
-            *hashes_done = pdata[19] - first_nonce;
-            return 1;
+            submit_solution( work, hash, mythr );
        }
        nonce++;
-
-    } while (nonce < max_nonce && !work_restart[thr_id].restart);
-
+    } while ( nonce < max_nonce && !work_restart[thr_id].restart );
    pdata[19] = nonce;
    *hashes_done = pdata[19] - first_nonce + 1;
    return 0;
--- a/algo/lyra2/lyra2-gate.c
+++ b/algo/lyra2/lyra2-gate.c
@@ -47,7 +47,9 @@ bool lyra2rev3_thread_init()

   int size = (int64_t)ROW_LEN_BYTES * 4; // nRows;
   l2v3_wholeMatrix = _mm_malloc( size, 64 );
-#if defined (LYRA2REV3_4WAY)
+#if defined (LYRA2REV3_8WAY)
+   init_lyra2rev3_8way_ctx();;
+#elif defined (LYRA2REV3_4WAY)
   init_lyra2rev3_4way_ctx();;
 #else
   init_lyra2rev3_ctx();
@@ -57,7 +59,10 @@ bool lyra2rev3_thread_init()

 bool register_lyra2rev3_algo( algo_gate_t* gate )
 {
-#if defined (LYRA2REV3_4WAY)
+#if defined (LYRA2REV3_8WAY)
+  gate->scanhash  = (void*)&scanhash_lyra2rev3_8way;
+  gate->hash      = (void*)&lyra2rev3_8way_hash;
+#elif defined (LYRA2REV3_4WAY)
  gate->scanhash  = (void*)&scanhash_lyra2rev3_4way;
  gate->hash      = (void*)&lyra2rev3_4way_hash;
 #else
@@ -66,7 +71,7 @@ bool register_lyra2rev3_algo( algo_gate_t* gate )
 #endif
  gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT;
  gate->miner_thread_init = (void*)&lyra2rev3_thread_init;
-  gate->set_target        = (void*)&alt_set_target;
+  opt_target_factor = 256.0;
  return true;
 };

@@ -100,7 +105,7 @@ bool register_lyra2rev2_algo( algo_gate_t* gate )
 #endif
  gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
  gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
-  gate->set_target        = (void*)&alt_set_target;
+  opt_target_factor = 256.0;
  return true;
 };

@@ -123,7 +128,7 @@ bool register_lyra2z_algo( algo_gate_t* gate )
 #endif
  gate->optimizations = SSE42_OPT | AVX2_OPT;
  gate->get_max64  = (void*)&get_max64_0xffffLL;
-  gate->set_target = (void*)&alt_set_target;
+  opt_target_factor = 256.0;
  return true;
 };

@@ -143,7 +148,7 @@ bool register_lyra2h_algo( algo_gate_t* gate )
 #endif
  gate->optimizations = SSE42_OPT | AVX2_OPT;
  gate->get_max64  = (void*)&get_max64_0xffffLL;
-  gate->set_target = (void*)&alt_set_target;
+  opt_target_factor = 256.0;
  return true;
 };

@@ -163,8 +168,8 @@ bool register_allium_algo( algo_gate_t* gate )
  gate->hash      = (void*)&allium_hash;
 #endif
  gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
-  gate->set_target        = (void*)&alt_set_target;
  gate->get_max64         = (void*)&allium_get_max64_0xFFFFLL;
+  opt_target_factor = 256.0;
  return true;
 };

@@ -177,6 +182,7 @@ int phi2_get_work_data_size() { return phi2_use_roots ? 144 : 128; }

 void phi2_decode_extra_data( struct work *work )
 {
+   phi2_use_roots = false;
   if ( work->data[0] & ( 1<<30 ) ) phi2_use_roots = true;
   else for ( int i = 20; i < 36; i++ )
   {
@@ -203,13 +209,18 @@ void phi2_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )

 bool register_phi2_algo( algo_gate_t* gate )
 {
-   init_phi2_ctx();
+//   init_phi2_ctx();
   gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
   gate->get_work_data_size = (void*)&phi2_get_work_data_size;
   gate->decode_extra_data  = (void*)&phi2_decode_extra_data;
   gate->build_extraheader  = (void*)&phi2_build_extraheader;
-   gate->set_target         = (void*)&alt_set_target; 
   gate->get_max64          = (void*)&get_max64_0xffffLL;
+   opt_target_factor = 256.0;
+#if defined(PHI2_4WAY)
+   gate->scanhash           = (void*)&scanhash_phi2_4way;
+#else
+   init_phi2_ctx();
   gate->scanhash           = (void*)&scanhash_phi2;
+#endif
   return true;
 }
--- a/algo/lyra2/lyra2-gate.h
+++ b/algo/lyra2/lyra2-gate.h
@@ -6,24 +6,34 @@
 #include "lyra2.h"

 #if defined(__AVX2__)
+  #define LYRA2REV3_8WAY
+#endif
+
+#if defined(__SSE2__)
  #define LYRA2REV3_4WAY
 #endif

 extern __thread uint64_t* l2v3_wholeMatrix;

 bool register_lyra2rev3_algo( algo_gate_t* gate );
+#if defined(LYRA2REV3_8WAY)

-#if defined(LYRA2REV3_4WAY)
+void lyra2rev3_8way_hash( void *state, const void *input );
+int scanhash_lyra2rev3_8way( struct work *work, uint32_t max_nonce,
+                             uint64_t *hashes_done, struct thr_info *mythr );
+bool init_lyra2rev3_8way_ctx();
+
+#elif defined(LYRA2REV3_4WAY)

 void lyra2rev3_4way_hash( void *state, const void *input );
-int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_lyra2rev3_4way( struct work *work, uint32_t max_nonce,
                             uint64_t *hashes_done, struct thr_info *mythr );
 bool init_lyra2rev3_4way_ctx();

 #else

 void lyra2rev3_hash( void *state, const void *input );
-int scanhash_lyra2rev3( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_lyra2rev3( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );
 bool init_lyra2rev3_ctx();

@@ -42,14 +52,14 @@ bool register_lyra2rev2_algo( algo_gate_t* gate );
 #if defined(LYRA2REV2_4WAY)

 void lyra2rev2_4way_hash( void *state, const void *input );
-int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
                             uint64_t *hashes_done, struct thr_info *mythr );
 bool init_lyra2rev2_4way_ctx();

 #else

 void lyra2rev2_hash( void *state, const void *input );
-int scanhash_lyra2rev2( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_lyra2rev2( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );
 bool init_lyra2rev2_ctx();

@@ -70,21 +80,21 @@ bool init_lyra2rev2_ctx();
 #if defined(LYRA2Z_8WAY)

 void lyra2z_8way_hash( void *state, const void *input );
-int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );
 bool lyra2z_8way_thread_init();

 #elif defined(LYRA2Z_4WAY)

 void lyra2z_4way_hash( void *state, const void *input );
-int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_lyra2z_4way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );
 bool lyra2z_4way_thread_init();

 #else

 void lyra2z_hash( void *state, const void *input );
-int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_lyra2z( struct work *work, uint32_t max_nonce,
                     uint64_t *hashes_done, struct thr_info *mythr );
 bool lyra2z_thread_init();

@@ -101,14 +111,14 @@ bool lyra2z_thread_init();
 #if defined(LYRA2H_4WAY)

 void lyra2h_4way_hash( void *state, const void *input );
-int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_lyra2h_4way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );
 bool lyra2h_4way_thread_init();

 #else

 void lyra2h_hash( void *state, const void *input );
-int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_lyra2h( struct work *work, uint32_t max_nonce,
                     uint64_t *hashes_done, struct thr_info *mythr );
 bool lyra2h_thread_init();

@@ -125,14 +135,14 @@ bool register_allium_algo( algo_gate_t* gate );
 #if defined(ALLIUM_4WAY)

 void allium_4way_hash( void *state, const void *input );
-int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_allium_4way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );
 bool init_allium_4way_ctx();

 #else

 void allium_hash( void *state, const void *input );
-int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_allium( struct work *work, uint32_t max_nonce,
                     uint64_t *hashes_done, struct thr_info *mythr );
 bool init_allium_ctx();

@@ -140,15 +150,29 @@ bool init_allium_ctx();

 /////////////////////////////////////////

+#if defined(__AVX2__) && defined(__AES__)
+//  #define PHI2_4WAY
+#endif
+
 bool phi2_has_roots;

 bool register_phi2_algo( algo_gate_t* gate );
+#if defined(PHI2_4WAY)
+
+void phi2_hash_4way( void *state, const void *input );
+int scanhash_phi2_4way( struct work *work, uint32_t max_nonce,
+                     uint64_t *hashes_done, struct thr_info *mythr );
+//void init_phi2_ctx();
+
+#else

 void phi2_hash( void *state, const void *input );
-int scanhash_phi2( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_phi2( struct work *work, uint32_t max_nonce,
                     uint64_t *hashes_done, struct thr_info *mythr );
 void init_phi2_ctx();

+#endif
+
 #endif  // LYRA2_GATE_H__


--- a/algo/lyra2/lyra2.c
+++ b/algo/lyra2/lyra2.c
@@ -60,7 +60,7 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
   int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
   int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
   int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
-   int64_t i; //auxiliary iteration counter
+//   int64_t i; //auxiliary iteration counter
   int64_t v64; // 64bit var for memcpy
   //====================================================================/

@@ -128,17 +128,22 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
   //================= Initializing the Sponge State ====================//
   //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)

-   initState( state );
+//   initState( state );

   //========================= Setup Phase =============================//
   //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
   
   ptrWord = wholeMatrix;
+
+   absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN );
+/*
   for (i = 0; i < nBlocksInput; i++)
   {
       absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
       ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
   }
+*/
+
   //Initializes M[0] and M[1]
   reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here

@@ -227,7 +232,7 @@ int LYRA2REV3( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
   int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
   int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
   int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
-   int64_t i; //auxiliary iteration counter
+//   int64_t i; //auxiliary iteration counter
   int64_t v64; // 64bit var for memcpy
   uint64_t instance = 0;
   //====================================================================/
@@ -302,17 +307,21 @@ int LYRA2REV3( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
   //================= Initializing the Sponge State ====================//
   //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)

-   initState( state );
+//   initState( state );

   //========================= Setup Phase =============================//
   //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits

   ptrWord = wholeMatrix;
+
+   absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN );
+/*
   for (i = 0; i < nBlocksInput; i++)
   {
       absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
       ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
   }
+*/
   //Initializes M[0] and M[1]
   reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here

@@ -405,7 +414,7 @@ int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
    int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
    int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
    int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
-    int64_t i; //auxiliary iteration counter
+//    int64_t i; //auxiliary iteration counter
    //=======================================================================/

    //======= Initializing the Memory Matrix and pointers to it =============//
@@ -459,17 +468,21 @@ int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
 //        if (state == NULL) {
 //                return -1;
 //        }
-    initState( state );
+//    initState( state );

    //============================== Setup Phase =============================//
    //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
-        uint64_t *ptrWord = wholeMatrix;
+    uint64_t *ptrWord = wholeMatrix;
+
+    absorbBlockBlake2Safe( state, ptrWord, nBlocksInput,
+                           BLOCK_LEN_BLAKE2_SAFE_INT64 );
+/*
    for ( i = 0; i < nBlocksInput; i++ )
    {
      absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
      ptrWord += BLOCK_LEN_BLAKE2_SAFE_INT64; //goes to next block of pad(pwd || salt || basil)
    }
-
+*/
    //Initializes M[0] and M[1]
        reducedSqueezeRow0(state, &wholeMatrix[0], nCols); //The locally copied password is most likely overwritten here
        reducedDuplexRow1(state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64], nCols);
@@ -566,7 +579,7 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,

 #if defined(__AVX2__)
   memset_zero_256( (__m256i*)wholeMatrix, i>>5 );
-#elif defined(__SSE4_2__)
+#elif defined(__SSE2__)
   memset_zero_128( (__m128i*)wholeMatrix, i>>4 );   
 #else
   memset( wholeMatrix, 0, i );
@@ -623,17 +636,21 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
   //================= Initializing the Sponge State ====================//
   //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)

-   initState( state );
+//   initState( state );

   //========================= Setup Phase =============================//
   //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits

   ptrWord = wholeMatrix;
+
+   absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN );
+/*
   for (i = 0; i < nBlocksInput; i++)
   {
       absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
       ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
   }
+*/
   //Initializes M[0] and M[1]
   reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here

--- a/algo/lyra2/lyra2h-4way.c
+++ b/algo/lyra2/lyra2h-4way.c
@@ -5,7 +5,7 @@
 #include <memory.h>
 #include <mm_malloc.h>
 #include "lyra2.h"
-#include "algo/blake/sph_blake.h"
+//#include "algo/blake/sph_blake.h"
 #include "algo/blake/blake-hash-4way.h"

 __thread uint64_t* lyra2h_4way_matrix;
@@ -36,67 +36,53 @@ void lyra2h_4way_hash( void *state, const void *input )
     blake256_4way( &ctx_blake, input + (64*4), 16 );
     blake256_4way_close( &ctx_blake, vhash );

-     mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
+     dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );

-     LYRA2Z( lyra2h_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 16, 16, 16 );
-     LYRA2Z( lyra2h_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 16, 16, 16 );
-     LYRA2Z( lyra2h_4way_matrix, hash2, 32, hash2, 32, hash2, 32, 16, 16, 16 );
-     LYRA2Z( lyra2h_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 16, 16, 16 );
-
-     memcpy( state,    hash0, 32 );
-     memcpy( state+32, hash1, 32 );
-     memcpy( state+64, hash2, 32 );
-     memcpy( state+96, hash3, 32 );
+     LYRA2Z( lyra2h_4way_matrix, state, 32, hash0, 32, hash0, 32,
+             16, 16, 16 );
+     LYRA2Z( lyra2h_4way_matrix, state+32, 32, hash1, 32, hash1,
+             32, 16, 16, 16 );
+     LYRA2Z( lyra2h_4way_matrix, state+64, 32, hash2, 32, hash2,
+             32, 16, 16, 16 );
+     LYRA2Z( lyra2h_4way_matrix, state+96, 32, hash3, 32, hash3,
+             32, 16, 16, 16 );
 }

-int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_lyra2h_4way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t hash[8*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-   uint32_t _ALIGN(64) edata[20];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
-   uint32_t *nonces = work->nonces;
-   int num_found = 0;
-   uint32_t *noncep= vdata + 76; // 19*4
-   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
+   __m128i  *noncev = (__m128i*)vdata + 19;   // aligned
+   int thr_id = mythr->id;  // thr_id arg is deprecated

   if ( opt_benchmark )
      ptarget[7] = 0x0000ff;

-   for ( int i=0; i < 20; i++ )
-      be32enc( &edata[i], pdata[i] );
-
-   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
-
+   mm128_bswap32_intrlv80_4x32( vdata, pdata );
   lyra2h_4way_midstate( vdata );

   do {
-      be32enc( noncep,   n   );
-      be32enc( noncep+1, n+1 );
-      be32enc( noncep+2, n+2 );
-      be32enc( noncep+3, n+3 );
-
-      be32enc( &edata[19], n );
+     *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
      lyra2h_4way_hash( hash, vdata );

      for ( int i = 0; i < 4; i++ )
-      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
+      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
+           && !opt_benchmark )
      {
          pdata[19] = n+i;         
-          nonces[ num_found++ ] = n+i;
-          work_set_target_ratio( work, hash+(i<<3) );
+          submit_lane_solution( work, hash+(i<<3), mythr, i );
      }
      n += 4;
-   } while ( (num_found == 0) && (n < max_nonce-4)
-                   && !work_restart[thr_id].restart);
+   } while (  (n < max_nonce-4) && !work_restart[thr_id].restart);

   *hashes_done = n - first_nonce + 1;
-   return num_found;
+   return 0;
 }

 #endif
--- a/algo/lyra2/lyra2h.c
+++ b/algo/lyra2/lyra2h.c
@@ -35,7 +35,7 @@ void lyra2h_hash( void *state, const void *input )
    memcpy(state, hash, 32);
 }

-int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_lyra2h( struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done, struct thr_info *mythr )
 {
 	uint32_t _ALIGN(64) hash[8];
@@ -45,7 +45,7 @@ int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
 	const uint32_t Htarg = ptarget[7];
 	const uint32_t first_nonce = pdata[19];
 	uint32_t nonce = first_nonce;
-   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
+   int thr_id = mythr->id;  // thr_id arg is deprecated

 	if (opt_benchmark)
 		ptarget[7] = 0x0000ff;
@@ -54,22 +54,19 @@ int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
 		be32enc(&endiandata[i], pdata[i]);
 	}

-        lyra2h_midstate( endiandata );
-
+   lyra2h_midstate( endiandata );
 	do {
 		be32enc(&endiandata[19], nonce);
                lyra2h_hash( hash, endiandata );

-		if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
-			work_set_target_ratio(work, hash);
+		if ( hash[7] <= Htarg )
+      if ( fulltest( hash, ptarget ) && !opt_benchmark )
+      {
 			pdata[19] = nonce;
-			*hashes_done = pdata[19] - first_nonce;
-			return 1;
-		}
+         submit_solution( work, hash, mythr );
+      }
 		nonce++;
-
 	} while (nonce < max_nonce && !work_restart[thr_id].restart);
-
 	pdata[19] = nonce;
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
--- a/algo/lyra2/lyra2re.c
+++ b/algo/lyra2/lyra2re.c
@@ -6,7 +6,7 @@
 #include "algo/keccak/sph_keccak.h"
 #include "lyra2.h"
 #include "algo-gate-api.h"
-#include "avxdefs.h"
+#include "simd-utils.h"
 #if defined(__AES__)
  #include "algo/groestl/aes_ni/hash-groestl256.h"
 #endif
@@ -81,7 +81,7 @@ void lyra2re_hash(void *state, const void *input)
 	memcpy(state, hashA, 32);
 }

-int scanhash_lyra2re( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_lyra2re( struct work *work, uint32_t max_nonce,
 	              uint64_t *hashes_done, struct thr_info *mythr )
 {
        uint32_t *pdata = work->data;
@@ -91,7 +91,7 @@ int scanhash_lyra2re( int thr_id, struct work *work, uint32_t max_nonce,
 	const uint32_t first_nonce = pdata[19];
 	uint32_t nonce = first_nonce;
        const uint32_t Htarg = ptarget[7];
-   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
+   int thr_id = mythr->id;  // thr_id arg is deprecated

        swab32_array( endiandata, pdata, 20 );

@@ -100,20 +100,14 @@ int scanhash_lyra2re( int thr_id, struct work *work, uint32_t max_nonce,
 	do {
 		be32enc(&endiandata[19], nonce);
 		lyra2re_hash(hash, endiandata);
-		if (hash[7] <= Htarg )
-                {
-                   if ( fulltest(hash, ptarget) )
-                   {
+		if ( hash[7] <= Htarg )
+      if ( fulltest(hash, ptarget) && !opt_benchmark )
+      {
 			pdata[19] = nonce;
-			*hashes_done = pdata[19] - first_nonce;
-                        work_set_target_ratio( work, hash );
-			return 1;
-                   }
-		}
+         submit_solution( work, hash, mythr );
+      }
 		nonce++;
-
 	} while (nonce < max_nonce && !work_restart[thr_id].restart);
-
 	pdata[19] = nonce;
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
@@ -124,11 +118,6 @@ int64_t lyra2re_get_max64 ()
  return 0xffffLL;
 }

-void lyra2re_set_target ( struct work* work, double job_diff )
-{
-   work_set_target(work, job_diff / (128.0 * opt_diff_factor) );
-}
-
 bool register_lyra2re_algo( algo_gate_t* gate )
 {
  init_lyra2re_ctx();
@@ -136,7 +125,7 @@ bool register_lyra2re_algo( algo_gate_t* gate )
  gate->scanhash   = (void*)&scanhash_lyra2re;
  gate->hash       = (void*)&lyra2re_hash;
  gate->get_max64  = (void*)&lyra2re_get_max64;
-  gate->set_target = (void*)&lyra2re_set_target;
+  opt_target_factor = 128.0;
  return true;
 };

--- a/algo/lyra2/lyra2rev2-4way.c
+++ b/algo/lyra2/lyra2rev2-4way.c
@@ -42,10 +42,12 @@ void lyra2rev2_4way_hash( void *state, const void *input )
   blake256_4way( &ctx.blake, input + (64<<2), 16 );
   blake256_4way_close( &ctx.blake, vhash );

-   mm256_reinterleave_4x64( vhash64, vhash, 256 );
+   rintrlv_4x32_4x64( vhash64, vhash, 256 );
+
   keccak256_4way( &ctx.keccak, vhash64, 32 );
   keccak256_4way_close( &ctx.keccak, vhash64 );
-   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
+
+   dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );

   cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
   cubehashInit( &ctx.cube, 256, 16, 32 );
@@ -60,10 +62,12 @@ void lyra2rev2_4way_hash( void *state, const void *input )
   LYRA2REV2( l2v2_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
   LYRA2REV2( l2v2_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );

-   mm256_interleave_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
+   intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
+
   skein256_4way( &ctx.skein, vhash64, 32 );
   skein256_4way_close( &ctx.skein, vhash64 );
-   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
+
+   dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );

   cubehashInit( &ctx.cube, 256, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
@@ -74,61 +78,55 @@ void lyra2rev2_4way_hash( void *state, const void *input )
   cubehashInit( &ctx.cube, 256, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );

-   mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
-   bmw256_4way( &ctx.bmw, vhash, 32 );
-   bmw256_4way_close( &ctx.bmw, vhash );
+   intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );

-   mm128_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
+   bmw256_4way( &ctx.bmw, vhash, 32 );
+   bmw256_4way_close( &ctx.bmw, state );
 }

-int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
                             uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t hash[8*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-   uint32_t _ALIGN(64) edata[20];
+   uint32_t *hash7 = &(hash[7<<2]);
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   const uint32_t Htarg = ptarget[7];
-   uint32_t *nonces = work->nonces;
-   int num_found = 0;
-   uint32_t *noncep = vdata + 76; // 19*4
-   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
+   __m128i *noncev = (__m128i*)vdata + 19;   // aligned
+   int thr_id = mythr->id;  // thr_id arg is deprecated

   if ( opt_benchmark )
      ( (uint32_t*)ptarget )[7] = 0x0000ff;

-   swab32_array( edata, pdata, 20 );
-
-   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
+   mm128_bswap32_intrlv80_4x32( vdata, pdata );

   blake256_4way_init( &l2v2_4way_ctx.blake );
   blake256_4way( &l2v2_4way_ctx.blake, vdata, 64 );

-   do {
-      be32enc( noncep,   n   );
-      be32enc( noncep+1, n+1 );
-      be32enc( noncep+2, n+2 );
-      be32enc( noncep+3, n+3 );
+   do
+   {
+      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );

      lyra2rev2_4way_hash( hash, vdata );
      pdata[19] = n;

-      for ( int i = 0; i < 4; i++ )
-      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
+      for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
      {
-          pdata[19] = n+i;         
-          nonces[ num_found++ ] = n+i;
-          work_set_target_ratio( work, hash+(i<<3) );
+         extr_lane_4x32( lane_hash, hash, lane, 256 );
+         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+         {
+            pdata[19] = n + lane;         
+            submit_lane_solution( work, lane_hash, mythr, lane );
+         }
      }
      n += 4;
-   } while ( (num_found == 0) && (n < max_nonce-4)
-                   && !work_restart[thr_id].restart);
-
+   } while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
   *hashes_done = n - first_nonce + 1;
-   return num_found;
+   return 0;
 }

 #endif
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Jay D Dee	789c8b70bc	v3.9.8.1	2019-10-01 14:17:36 -04:00
Jay D Dee	01550d94a2	v3.9.8	2019-09-26 22:37:26 -04:00
Jay D Dee	a042fb7612	v3.9.7	2019-08-03 10:39:54 -04:00
Jay D Dee	9d49e0be7a	v3.9.6.2	2019-07-30 10:16:43 -04:00
Jay D Dee	a51f59086b	v3.9.6.1	2019-07-18 19:46:57 -04:00
Jay D Dee	6f49ba09b7	v3.9.6	2019-07-17 17:54:38 -04:00
Jay D Dee	e2d5762ef2	v3.9.5.4	2019-07-15 17:00:26 -04:00
Jay D Dee	e625ed5420	v3.9.5.3	2019-07-12 10:42:38 -04:00
Jay D Dee	9abc19a30a	v3.9.5.2	2019-07-04 12:12:11 -04:00
Jay D Dee	0d769ee0fe	v3.9.5.1	2019-07-02 15:10:38 -04:00
Jay D Dee	0d48d573ce	v3.9.5	2019-06-26 14:16:01 -04:00
Jay D Dee	d6e8d7a46e	v3.9.4	2019-06-18 13:15:45 -04:00
Jay D Dee	71d6b97ee8	v3.9.3.1	2019-06-13 21:15:58 -04:00
Jay D Dee	b2331375a3	v3.9.2.5	2019-06-13 11:20:27 -04:00