v3.12.8.1

v3.12.8
v3.12.7
2025-09-17 23:44:27 +00:00 · 2020-04-17 16:12:45 -04:00 · 2020-04-09 12:56:18 -04:00 · 2020-03-20 16:30:12 -04:00 · 2020-03-07 14:11:06 -05:00 · 2020-03-05 18:43:20 -05:00
282 changed files with 14237 additions and 30645 deletions
--- a/.RELEASE_NOTES.swp
+++ b/.RELEASE_NOTES.swp
--- a/3
+++ b/3
@@ -33,3 +33,6 @@ Jay D Dee
 xcouiz@gmail.com

 Cryply
+
+Colin Percival
+Alexander Peslyak
--- a/Makefile.am
+++ b/Makefile.am
@@ -21,15 +21,6 @@ cpuminer_SOURCES = \
  api.c \
  sysinfos.c \
  algo-gate-api.c\
-  crypto/oaes_lib.c \
-  crypto/c_keccak.c \
-  crypto/c_groestl.c \
-  crypto/c_blake256.c \
-  crypto/c_jh.c \
-  crypto/c_skein.c \
-  crypto/hash.c \
-  crypto/aesb.c \
-  crypto/magimath.cpp \
  algo/argon2/argon2a/argon2a.c \
  algo/argon2/argon2a/ar2/argon2.c \
  algo/argon2/argon2a/ar2/opt.c \
@@ -76,11 +67,6 @@ cpuminer_SOURCES = \
  algo/bmw/bmw512-gate.c \
  algo/bmw/bmw512.c \
  algo/bmw/bmw512-4way.c \
-  algo/cryptonight/cryptolight.c \
-  algo/cryptonight/cryptonight-common.c\
-  algo/cryptonight/cryptonight-aesni.c\
-  algo/cryptonight/cryptonight.c\
-  algo/cubehash/sph_cubehash.c \
  algo/cubehash/cubehash_sse2.c\
  algo/cubehash/cube-hash-2way.c \
  algo/echo/sph_echo.c \
@@ -89,6 +75,7 @@ cpuminer_SOURCES = \
  algo/gost/sph_gost.c \
  algo/groestl/groestl-gate.c \
  algo/groestl/groestl512-hash-4way.c \
+  algo/groestl/groestl256-hash-4way.c \
  algo/groestl/sph_groestl.c \
  algo/groestl/groestl.c \
  algo/groestl/groestl-4way.c \
@@ -102,9 +89,6 @@ cpuminer_SOURCES = \
  algo/hamsi/hamsi-hash-4way.c \
  algo/haval/haval.c \
  algo/haval/haval-hash-4way.c \
-  algo/heavy/sph_hefty1.c \
-  algo/heavy/heavy.c \
-  algo/heavy/bastion.c \
  algo/hodl/aes.c \
  algo/hodl/hodl-gate.c \
  algo/hodl/hodl-wolf.c \
@@ -120,9 +104,9 @@ cpuminer_SOURCES = \
  algo/keccak/keccak-hash-4way.c \
  algo/keccak/keccak-4way.c\
  algo/keccak/keccak-gate.c \
+  algo/keccak/sha3d-4way.c \
+  algo/keccak/sha3d.c \
  algo/lanehash/lane.c \
-  algo/luffa/sph_luffa.c \
-  algo/luffa/luffa.c \
  algo/luffa/luffa_for_sse2.c \
  algo/luffa/luffa-hash-2way.c \
  algo/lyra2/lyra2.c \
@@ -144,14 +128,14 @@ cpuminer_SOURCES = \
  algo/lyra2/allium.c \
  algo/lyra2/phi2-4way.c \
  algo/lyra2/phi2.c \
-  algo/m7m.c \
+  algo//m7m/m7m.c \
+  algo/m7m/magimath.cpp \
  algo/nist5/nist5-gate.c \
  algo/nist5/nist5-4way.c \
  algo/nist5/nist5.c \
  algo/nist5/zr5.c \
  algo/panama/panama-hash-4way.c \
  algo/panama/sph_panama.c \
-  algo/radiogatun/sph_radiogatun.c \
  algo/quark/quark-gate.c \
  algo/quark/quark.c \
  algo/quark/quark-4way.c \
@@ -174,11 +158,12 @@ cpuminer_SOURCES = \
  algo/ripemd/lbry-4way.c \
  algo/scrypt/scrypt.c \
  algo/scrypt/neoscrypt.c \
-  algo/scrypt/pluck.c \
  algo/sha/sph_sha2.c \
  algo/sha/sph_sha2big.c \
  algo/sha/sha256-hash-4way.c \
  algo/sha/sha512-hash-4way.c \
+  algo/sha/hmac-sha256-hash.c \
+  algo/sha/hmac-sha256-hash-4way.c \
  algo/sha/sha2.c \
  algo/sha/sha256t-gate.c \
  algo/sha/sha256t-4way.c \
@@ -192,7 +177,6 @@ cpuminer_SOURCES = \
  algo/shavite/shavite-hash-2way.c \
  algo/shavite/shavite-hash-4way.c \
  algo/shavite/shavite.c \
-  algo/simd/sph_simd.c \
  algo/simd/nist.c \
  algo/simd/vector.c \
  algo/simd/simd-hash-2way.c \
@@ -230,7 +214,6 @@ cpuminer_SOURCES = \
  algo/x11/timetravel10-gate.c \
  algo/x11/timetravel10.c \
  algo/x11/timetravel10-4way.c \
-  algo/x11/fresh.c \
  algo/x11/x11evo.c \
  algo/x11/x11evo-4way.c \
  algo/x11/x11evo-gate.c \
@@ -249,7 +232,6 @@ cpuminer_SOURCES = \
  algo/x13/skunk-gate.c \
  algo/x13/skunk-4way.c \
  algo/x13/skunk.c \
-  algo/x13/drop.c \
  algo/x13/x13bcd-4way.c \
  algo/x13/x13bcd.c \
  algo/x14/x14-gate.c \
@@ -284,19 +266,17 @@ cpuminer_SOURCES = \
  algo/x17/sonoa-gate.c \
  algo/x17/sonoa-4way.c \
  algo/x17/sonoa.c \
-  algo/x20/x20r.c \
  algo/x22/x22i-4way.c \
  algo/x22/x22i.c \
  algo/x22/x22i-gate.c \
  algo/x22/x25x.c \
  algo/x22/x25x-4way.c \
  algo/yescrypt/yescrypt.c \
-  algo/yescrypt/sha256_Y.c \
  algo/yescrypt/yescrypt-best.c \
  algo/yespower/yespower-gate.c \
  algo/yespower/yespower-blake2b.c \
  algo/yespower/crypto/blake2b-yp.c \
-  algo/yespower/sha256_p.c \
+  algo/yespower/yescrypt-r8g.c \
  algo/yespower/yespower-opt.c

 disable_flags =
--- a/README.md
+++ b/README.md
@@ -12,10 +12,24 @@ a false positive, they are flagged simply because they are cryptocurrency
 miners. The source code is open for anyone to inspect. If you don't trust 
 the software, don't use it.

+
+New thread:
+
+https://bitcointalk.org/index.php?topic=5226770.msg53865575#msg53865575
+
+Old thread:
+
 https://bitcointalk.org/index.php?topic=1326803.0

 mailto://jayddee246@gmail.com

+This note is to confirm that bitcointalk users JayDDee and joblo are the
+same person.
+
+I created a new BCT user JayDDee to match my github user id.
+The old thread has been locked but still contains useful information for
+reading.
+
 See file RELEASE_NOTES for change log and INSTALL_LINUX or INSTALL_WINDOWS
 for compile instructions.

@@ -23,25 +37,25 @@ Requirements
 ------------

 1. A x86_64 architecture CPU with a minimum of SSE2 support. This includes
-Intel Core2 and newer and AMD equivalents. In order to take advantage of AES_NI
-optimizations a CPU with AES_NI is required. This includes Intel Westmere
-and newer and AMD equivalents. Further optimizations are available on some
-algoritms for CPUs with AVX and AVX2, Sandybridge and Haswell respectively.
+Intel Core2 and newer and AMD equivalents. Further optimizations are available
+on some algorithms for CPUs with AES, AVX, AVX2, SHA, AVX512 and VAES.

 Older CPUs are supported by cpuminer-multi by TPruvot but at reduced
 performance.

-ARM CPUs are not supported.
+ARM and Aarch64 CPUs are not supported.

-2. 64 bit Linux OS. Ubuntu and Fedora based distributions, including Mint and
-Centos, are known to work and have all dependencies in their repositories.
-Others may work but may require more effort. Older versions such as Centos 6
-don't work due to missing features. 
+2. 64 bit Linux or Windows OS. Ubuntu and Fedora based distributions,
+including Mint and Centos, are known to work and have all dependencies
+in their repositories. Others may work but may require more effort. Older
+versions such as Centos 6 don't work due to missing features. 
 64 bit Windows OS is supported with mingw_w64 and msys or pre-built binaries.

 MacOS, OSx and Android are not supported.

-3. Stratum pool. Some algos may work wallet mining using getwork or GBT. YMMV.
+3. Stratum pool supporting stratum+tcp:// or stratum+ssl:// protocols or
+RPC getwork using http:// or https://.
+GBT is YMMV.

 Supported Algorithms
 --------------------
@@ -53,7 +67,6 @@ Supported Algorithms
                          argon2d500    argon2d-dyn,  Dynamic (DYN)
                          argon2d4096   argon2d-uis, Unitus, (UIS)
                          axiom         Shabal-256 MemoHash
-                          bastion
                          blake         Blake-256 (SFR)
                          blake2b       Blake2b 256
                          blake2s       Blake-2 S
@@ -64,10 +77,7 @@ Supported Algorithms
                          decred
                          deep          Deepcoin (DCN)
                          dmd-gr        Diamond-Groestl
-                          drop          Dropcoin
-                          fresh         Fresh
                          groestl       Groestl coin
-                          heavy         Heavy
                          hex           x16r-hex
                          hmq1725       Espers
                          hodl          Hodlcoin
@@ -97,10 +107,10 @@ Supported Algorithms
                          qubit         Qubit
                          scrypt        scrypt(1024, 1, 1) (default)
                          scrypt:N      scrypt(N, 1, 1)
-                          scryptjane:nf
                          sha256d       Double SHA-256
                          sha256q       Quad SHA-256, Pyrite (PYE)
                          sha256t       Triple SHA-256, Onecoin (OC)
+                          sha3d         Double keccak256 (BSHA3)
                          shavite3      Shavite3
                          skein         Skein+Sha (Skeincoin)
                          skein2        Double Skein (Woodcoin)
@@ -134,6 +144,7 @@ Supported Algorithms
                          xevan         Bitsend (BSD)
                          yescrypt      Globalboost-Y (BSTY)
                          yescryptr8    BitZeny (ZNY)
+                          yescryptr8g   Koto (KOTO)
                          yescryptr16   Eli
                          yescryptr32   WAVI
                          yespower      Cryply
@@ -141,6 +152,27 @@ Supported Algorithms
                          yespower-b2b  generic yespower + blake2b
                          zr5           Ziftr

+Many variations of scrypt based algos can be mined by specifying their
+parameters:
+
+scryptn2: --algo scrypt --param-n 1048576
+
+cpupower: --algo yespower --param-key "CPUpower: The number of CPU working or available for proof-of-work mining"
+
+power2b: --algo yespower-b2b --param-n 2048 --param-r 32 --param-key "Now I am become Death, the destroyer of worlds"
+
+sugarchain: --algo yespower --param-n 2048 --param-r 32 --param-key "Satoshi Nakamoto 31/Oct/2008 Proof-of-work is essentially one-CPU-one-vote"
+
+yespoweriots: --algo yespower --param-n 2048 --param-key "Iots is committed to the development of IOT"
+
+yespowerlitb: --algo yespower --param-n 2048 --param-r 32 --param-key "LITBpower: The number of LITB working or available for proof-of-work mini"
+
+yespoweric: --algo yespower --param-n 2048 --param-r 32 --param-key "IsotopeC" 
+
+yespowerurx: --algo yespower --param-n 2048 --param-r 32 --param-key "UraniumX"
+
+yespowerltncg: --algo yespower --param-n 2048 --param-r 32 --param-key "LTNCGYES"
+
 Errata
 ------

--- a/286
+++ b/286
@@ -33,9 +33,295 @@ supported.
 64 bit Linux or Windows operating system. Apple, Android and Raspberry Pi
 are not supported. FreeBSD YMMV.

+Reporting bugs
+--------------
+
+Bugs can be reported by sending an email to JayDDee246@gmail.com or opening
+an issue in git: https://github.com/JayDDee/cpuminer-opt/issues
+
+Please include the following information:
+
+1. CPU model, operating system, cpuminer-opt version (must be latest),
+   binary file for Windows, changes to default build procedure for Linux.
+
+2. Exact command line (except user and pw) and initial output showing
+   the above requested info.
+
+3. Additional program output showing any error messages or other
+   pertinent data.
+
+4. A clear description of the problem including history, scope,
+   persistence or intermittence, and reproducibility.
+
+In simpler terms:
+
+What is it doing?
+What should it be doing instead?
+Did it work in a previous release?
+Does it happen for all algos? All pools? All options? Solo?
+Does it happen all the time?
+If not what makes it happen or not happen? 
+
 Change Log
 ----------

+v3.12.8.1
+
+Issue #261: Fixed yescryptr8g invalid shares.
+
+v3.12.8
+
+Yespower sha256 prehash made thread safe.
+
+Rewrote diff conversion functions from scratch to be simpler and use 
+long double (float80) and int128 arithmetic for improved accuracy and
+precision.
+
+Some code cleanup and assorted small changes.
+
+v3.12.7
+
+Issue #257: fixed a file descriptor leak which caused the CPU temperature
+and frequency query to report zeros after mining for a couple of hours.
+
+Issue #253: stale share reduction for yescrypt, sonoa.
+
+v3.12.6.1
+
+Issue #252: Fixed SSL mining (stratum+tcps://)
+
+Issue #254 Fixed benchmark.
+
+Issue #253: Implemented stale share reduction for yespower, x25x, x22i, x21s,
+x16*, scryptn2, more to come.
+
+v3.12.6
+
+Issue #246: improved stale share detection for getwork.
+
+Improved precision of target_to_diff conversion from 4 digits to 20+.
+
+Display hash and target debug data for all rejected shares.
+
+A graphical representation of CPU affinity is displayed when using --threads.
+
+Added highest and lowest accepted share to summary log.
+
+Other small changes to logs to improve consistency and clarity.
+
+v3.12.5
+
+Issues #246 & #251: fixed incorrect share diff for stratum and getwork,
+fixed incorrect target diff for getwork. Stats should now be correct for
+getwork as well as stratum.
+
+Issue #252: Fixed stratum+tcps not using curl ssl.
+
+Getwork: reduce stale blocks, faster response to new work.
+
+Added ntime to new job/work logs.
+
+README.md now lists the parameters for yespower variations that don't have
+a specific algo name.
+
+v3.12.4.6
+
+Issue #246: fixed getwork repeated new block logs with same height. New work
+for the same block is now reported as "New work" instead of "New block".
+Also added a check that work is new before generating "New work" log.
+
+Added target diff to getwork new block log.
+
+Changed share ratio in share result log to simple fraction, no longer %.
+
+Added debug log to display mininginfo, use -D.
+
+v3.12.4.5
+
+Issue #246: better stale share detection for getwork, and enhanced logging
+of stale shares for stratum & getwork.
+
+Issue #251: fixed incorrect share difficulty and share ratio in share
+result log.
+
+Changed submit log to include share diff and block height.
+
+Small cosmetic changes to logs. 
+
+v3.12.4.4
+
+Issue #246: Fixed net hashrate in getwork block log,
+            removed duplicate getwork block log, 
+            other small tweaks to stats logs for getwork.
+
+Issue #248: Fixed chronic stale shares with scrypt:1048576 (scryptn2). 
+
+v3.12.4.3
+
+Fixed segfault in new block log for getwork.
+
+Disabled silent discarding of stale work after the submit is logged.
+
+v3.12.4.2
+
+Issue #245: fixed getwork stale shares, solo mining with getwork now works.
+
+Issue #246: implemented block and summary logs for getwork.
+
+v3.12.4.1
+
+Issue #245: fix scantime when mining solo with getwork.
+
+Added debug logs for creation of stratum and longpoll threads, use -D to
+enable.
+
+v3.12.4
+
+Issue #244: Change longpoll to ignore job id.
+
+Lyra2rev2 AVX2 +3%, AVX512 +6%.
+
+v3.12.3.1
+
+Issue #241: Fixed regression that broke coinbase address in v3.11.7.
+
+v3.12.3
+
+Issue #238: Fixed skunk AVX2.
+
+Issue #239: Faster AVX2 & AVX512 for skein +44%, skein2 +30%, plus marginal
+increases for skunk, x16r, x16rv2, x16rt, x16rt-veil, x16s, x21s.
+
+Faster anime VAES +57%, AVX512 +21%, AVX2 +3%.
+
+Redesigned code responsible for #236.
+
+v3.12.2
+
+Fixed xevan, skein, skein2 AVX2, #238.
+
+Reversed polarity of AVX2 vector bit test utilities, and all users, to be
+logically and semantically correct. Follow up to issue #236. 
+
+v3.12.1
+
+Fixed anime AVX2 low difficulty shares, git issue #236.
+
+Periodic summary now reports lost hash rate due to rejected and stale shares,
+displayed only when non-zero.
+
+v3.12.0.1
+
+Fixed hodl rejects, git issue #237.
+
+Fixed debug code added in v3.12.0 to work with AVX2 to be enabled only
+after low difficulty shares have been seen to avoid unnecessarily excessive
+log output.
+
+Added more digits of precision to diff in log output to help diagnose
+low difficulty shares.
+
+v3.12.0
+
+Faster phi2 AVX2 +62%, AVX512 +150% on Intel CPUs. AMD Ryzen AVX2 is
+YMMV due to its inferior AVX2 implementation.
+
+Fixed Hodl stats, rejects are still an issue since v3.9.5, git issue #237.
+
+API can now be enabled with "-b port" or "--api-bind port".
+It will use the default address 127.0.0.1.
+
+Editorial: Short form options should only be used on the command line to save
+typing. Configuration files and scripts should always use the long form
+"--api-bind addr:port" without relying on any defaults. This is a general
+recommendation that applies to all options for any application.
+
+Removed obsolete cryptonight, all variants, and supporting code for more
+size reduction and faster compiling.
+
+Tweaked the timing of the CPU temperature and frequency log (Linux only).
+
+Added some debug code to collect more info about low difficulty rejects,
+git issue #236.
+
+v3.11.9
+
+Fixed x16r invalid shares when Luffa was first in hash order.
+
+API is disabled by default.
+
+New startup message for status of stratum connection, API & extranonce.
+
+New log report for CPU temperature, frequency of fastest and slowest cores.
+
+Compile time is a little shorter and binary file size a little smaller
+using conditional compilation.
+
+Removed code for Bastion, Drop, Heavy, Luffa and Pluck algos and other unused
+code.
+
+v3.11.8
+
+Fixed network hashrate showing incorrect data, should be close now.
+
+Fixed compile errors when using GCC 10 with default flag -fno-common.
+
+Faster x16r, x16rv2, x16rt, x16s, x21s, veil, hex with midstate prehash.
+
+Decoupled sapling usage from block version 5 in yescryptr8g.
+
+More detailed data reporting for low difficulty rejected shares.
+
+v3.11.7
+
+Added yescryptr8g algo for KOTO, including support for block version 5.
+
+Added sha3d algo for BSHA3.
+
+Removed memcmp and clean_job checks from get_new_work, now only check job_id.
+
+Small improvement to sha512 and sha256 parallel implementations that don't
+use SHA.
+
+v3.11.6
+
+Fixed CPU temperature regression from v3.11.5.
+
+More improvements to share log. More compact, highlight incremented counter,
+block height when solved, job id when stale.
+
+v3.11.5
+
+Fixed AVX512 detection that could cause compilation errors on CPUs
+without AVX512.
+
+Fixed "BLOCK SOLVED" log incorrectly displaying "Accepted" when a block
+is solved.
+Added share counter to share submitted & accepted logs.
+Added job id to share submitted log.
+Share submitted log is no longer highlighted blue, there was too much blue.
+
+Another CPU temperature fix for Linux.
+
+Added bug reporting tips to RELEASE NOTES.
+
+v3.11.4
+
+Fixed scrypt segfault since v3.9.9.1.
+
+Stale shares counted and reported separately from other rejected shares.
+
+Display of counters for solved blocks, rejects, stale shares suppressed in
+periodic summary when zero.
+
+v3.11.3
+
+Fixed x12 AVX2 again.
+
+More speed for allium: AVX2 +4%, AVX512 +6%, VAES +14%.
+
+Restored lost speed for x22i & x25x.
+
 v3.11.2

 Fixed x11gost (sib) AVX2 invalid shares.
--- a/algo-gate-api.c
+++ b/algo-gate-api.c
@@ -97,13 +97,10 @@ int null_scanhash()
   return 0;
 }

-void null_hash()
+int null_hash()
 {
   applog(LOG_WARNING,"SWERR: null_hash unsafe null function");
-};
-void null_hash_suw()
-{
-  applog(LOG_WARNING,"SWERR: null_hash_suw unsafe null function");
+   return 0;
 };

 void init_algo_gate( algo_gate_t* gate )
@@ -111,13 +108,10 @@ void init_algo_gate( algo_gate_t* gate )
   gate->miner_thread_init       = (void*)&return_true;
   gate->scanhash                = (void*)&null_scanhash;
   gate->hash                    = (void*)&null_hash;
-   gate->hash_suw                = (void*)&null_hash_suw;
   gate->get_new_work            = (void*)&std_get_new_work;
-   gate->get_nonceptr            = (void*)&std_get_nonceptr;
   gate->work_decode             = (void*)&std_le_work_decode;
   gate->decode_extra_data       = (void*)&do_nothing;
   gate->gen_merkle_root         = (void*)&sha256d_gen_merkle_root;
-   gate->stratum_gen_work        = (void*)&std_stratum_gen_work;
   gate->build_stratum_request   = (void*)&std_le_build_stratum_request;
   gate->malloc_txs_request      = (void*)&std_malloc_txs_request;
   gate->submit_getwork_result   = (void*)&std_le_submit_getwork_result;
@@ -129,7 +123,6 @@ void init_algo_gate( algo_gate_t* gate )
   gate->resync_threads          = (void*)&do_nothing;
   gate->do_this_thread          = (void*)&return_true;
   gate->longpoll_rpc_call       = (void*)&std_longpoll_rpc_call;
-   gate->stratum_handle_response = (void*)&std_stratum_handle_response;
   gate->get_work_data_size      = (void*)&std_get_work_data_size;
   gate->optimizations           = EMPTY_SET;
   gate->ntime_index             = STD_NTIME_INDEX;
@@ -162,23 +155,16 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
    case ALGO_ARGON2D500:    register_argon2d_dyn_algo   ( gate ); break;
    case ALGO_ARGON2D4096:   register_argon2d4096_algo   ( gate ); break;
    case ALGO_AXIOM:         register_axiom_algo         ( gate ); break;
-    case ALGO_BASTION:       register_bastion_algo       ( gate ); break;
    case ALGO_BLAKE:         register_blake_algo         ( gate ); break;
    case ALGO_BLAKE2B:       register_blake2b_algo       ( gate ); break;
    case ALGO_BLAKE2S:       register_blake2s_algo       ( gate ); break;
    case ALGO_BLAKECOIN:     register_blakecoin_algo     ( gate ); break;
    case ALGO_BMW512:        register_bmw512_algo        ( gate ); break;
    case ALGO_C11:           register_c11_algo           ( gate ); break;
-    case ALGO_CRYPTOLIGHT:   register_cryptolight_algo   ( gate ); break;
-    case ALGO_CRYPTONIGHT:   register_cryptonight_algo   ( gate ); break;
-    case ALGO_CRYPTONIGHTV7: register_cryptonightv7_algo ( gate ); break;
    case ALGO_DECRED:        register_decred_algo        ( gate ); break;
    case ALGO_DEEP:          register_deep_algo          ( gate ); break;
    case ALGO_DMD_GR:        register_dmd_gr_algo        ( gate ); break;
-    case ALGO_DROP:          register_drop_algo          ( gate ); break;
-    case ALGO_FRESH:         register_fresh_algo         ( gate ); break;
    case ALGO_GROESTL:       register_groestl_algo       ( gate ); break;
-    case ALGO_HEAVY:         register_heavy_algo         ( gate ); break;
    case ALGO_HEX:           register_hex_algo           ( gate ); break;
    case ALGO_HMQ1725:       register_hmq1725_algo       ( gate ); break;
    case ALGO_HODL:          register_hodl_algo          ( gate ); break;
@@ -186,7 +172,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
    case ALGO_KECCAK:        register_keccak_algo        ( gate ); break;
    case ALGO_KECCAKC:       register_keccakc_algo       ( gate ); break;
    case ALGO_LBRY:          register_lbry_algo          ( gate ); break;
-    case ALGO_LUFFA:         register_luffa_algo         ( gate ); break;
    case ALGO_LYRA2H:        register_lyra2h_algo        ( gate ); break;
    case ALGO_LYRA2RE:       register_lyra2re_algo       ( gate ); break;
    case ALGO_LYRA2REV2:     register_lyra2rev2_algo     ( gate ); break;
@@ -200,7 +185,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
    case ALGO_PENTABLAKE:    register_pentablake_algo    ( gate ); break;
    case ALGO_PHI1612:       register_phi1612_algo       ( gate ); break;
    case ALGO_PHI2:          register_phi2_algo          ( gate ); break;
-    case ALGO_PLUCK:         register_pluck_algo         ( gate ); break;
    case ALGO_POLYTIMOS:     register_polytimos_algo     ( gate ); break;
    case ALGO_POWER2B:       register_power2b_algo       ( gate ); break;
    case ALGO_QUARK:         register_quark_algo         ( gate ); break;
@@ -209,6 +193,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
    case ALGO_SHA256D:       register_sha256d_algo       ( gate ); break;
    case ALGO_SHA256Q:       register_sha256q_algo       ( gate ); break;
    case ALGO_SHA256T:       register_sha256t_algo       ( gate ); break;
+    case ALGO_SHA3D:         register_sha3d_algo         ( gate ); break;
    case ALGO_SHAVITE3:      register_shavite_algo       ( gate ); break;
    case ALGO_SKEIN:         register_skein_algo         ( gate ); break;
    case ALGO_SKEIN2:        register_skein2_algo        ( gate ); break;
@@ -240,13 +225,9 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
    case ALGO_X22I:          register_x22i_algo          ( gate ); break;
    case ALGO_X25X:          register_x25x_algo          ( gate ); break;
    case ALGO_XEVAN:         register_xevan_algo         ( gate ); break;
-/*    case ALGO_YESCRYPT:     register_yescrypt_05_algo     ( gate ); break;
-     case ALGO_YESCRYPTR8:   register_yescryptr8_05_algo   ( gate ); break;
-     case ALGO_YESCRYPTR16:  register_yescryptr16_05_algo  ( gate ); break;
-     case ALGO_YESCRYPTR32:  register_yescryptr32_05_algo  ( gate ); break;
-*/
    case ALGO_YESCRYPT:      register_yescrypt_algo      ( gate ); break;
    case ALGO_YESCRYPTR8:    register_yescryptr8_algo    ( gate ); break;
+    case ALGO_YESCRYPTR8G:   register_yescryptr8g_algo   ( gate ); break;
    case ALGO_YESCRYPTR16:   register_yescryptr16_algo   ( gate ); break;
    case ALGO_YESCRYPTR32:   register_yescryptr32_algo   ( gate ); break;
    case ALGO_YESPOWER:      register_yespower_algo      ( gate ); break;
@@ -270,29 +251,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
 // restore warnings
 #pragma GCC diagnostic pop

-// override std defaults with jr2 defaults
-bool register_json_rpc2( algo_gate_t *gate )
-{
-  applog(LOG_WARNING,"\nCryptonight algorithm and variants are no longer");
-  applog(LOG_WARNING,"supported by cpuminer-opt. Shares submitted will");
-  applog(LOG_WARNING,"likely be rejected. Proceed at your own risk.\n");
-
-//  gate->wait_for_diff           = (void*)&do_nothing;
-  gate->get_new_work            = (void*)&jr2_get_new_work;
-  gate->get_nonceptr            = (void*)&jr2_get_nonceptr;
-  gate->stratum_gen_work        = (void*)&jr2_stratum_gen_work;
-  gate->build_stratum_request   = (void*)&jr2_build_stratum_request;
-  gate->submit_getwork_result   = (void*)&jr2_submit_getwork_result;
-  gate->longpoll_rpc_call       = (void*)&jr2_longpoll_rpc_call;
-  gate->work_decode             = (void*)&jr2_work_decode;
-  gate->stratum_handle_response = (void*)&jr2_stratum_handle_response;
-  gate->nonce_index             = JR2_NONCE_INDEX;
-  jsonrpc_2 = true;   // still needed
-  opt_extranonce = false;
-//  have_gbt = false;
-  return true;
- }
-
 // run the alternate hash function for a specific algo
 void exec_hash_function( int algo, void *output, const void *pdata )
 {
@@ -313,39 +271,37 @@ void exec_hash_function( int algo, void *output, const void *pdata )
 const char* const algo_alias_map[][2] =
 {
 //   alias                proper
-  { "argon2d-crds",      "argon2d250"   },
-  { "argon2d-dyn",       "argon2d500"   },
-  { "argon2d-uis",       "argon2d4096"  },
-  { "bcd",               "x13bcd"       },
-  { "bitcore",           "timetravel10" },
-  { "bitzeny",           "yescryptr8"   },
-  { "blake256r8",        "blakecoin"    },
-  { "blake256r8vnl",     "vanilla"      },
-  { "blake256r14",       "blake"        },
-  { "blake256r14dcr",    "decred"       },
-  { "cryptonote",        "cryptonight"  },
-  { "cryptonight-light", "cryptolight"  },
-  { "diamond",           "dmd-gr"       },
-  { "droplp",            "drop"         },
-  { "espers",            "hmq1725"      },
-  { "flax",              "c11"          },
-  { "hsr",               "x13sm3"       },
-  { "jackpot",           "jha"          },
-  { "jane",              "scryptjane"   }, 
-  { "lyra2",             "lyra2re"      },
-  { "lyra2v2",           "lyra2rev2"    },
-  { "lyra2v3",           "lyra2rev3"    },
-  { "myrgr",             "myr-gr"       },
-  { "myriad",            "myr-gr"       },
-  { "neo",               "neoscrypt"    },
-  { "phi",               "phi1612"      },
-  { "sib",               "x11gost"      },
-  { "timetravel8",       "timetravel"   },
-  { "veil",              "x16rt-veil"   },
-  { "x16r-hex",          "hex"          },
-  { "yenten",            "yescryptr16"  },
-  { "ziftr",             "zr5"          },
-  { NULL,                NULL           }   
+  { "argon2d-crds",      "argon2d250"     },
+  { "argon2d-dyn",       "argon2d500"     },
+  { "argon2d-uis",       "argon2d4096"    },
+  { "bcd",               "x13bcd"         },
+  { "bitcore",           "timetravel10"   },
+  { "bitzeny",           "yescryptr8"     },
+  { "blake256r8",        "blakecoin"      },
+  { "blake256r8vnl",     "vanilla"        },
+  { "blake256r14",       "blake"          },
+  { "blake256r14dcr",    "decred"         },
+  { "diamond",           "dmd-gr"         },
+  { "espers",            "hmq1725"        },
+  { "flax",              "c11"            },
+  { "hsr",               "x13sm3"         },
+  { "jackpot",           "jha"            },
+  { "jane",              "scryptjane"     }, 
+  { "lyra2",             "lyra2re"        },
+  { "lyra2v2",           "lyra2rev2"      },
+  { "lyra2v3",           "lyra2rev3"      },
+  { "myrgr",             "myr-gr"         },
+  { "myriad",            "myr-gr"         },
+  { "neo",               "neoscrypt"      },
+  { "phi",               "phi1612"        },
+  { "scryptn2",          "scrypt:1048576" },
+  { "sib",               "x11gost"        },
+  { "timetravel8",       "timetravel"     },
+  { "veil",              "x16rt-veil"     },
+  { "x16r-hex",          "hex"            },
+  { "yenten",            "yescryptr16"    },
+  { "ziftr",             "zr5"            },
+  { NULL,                NULL             }   
 };

 // if arg is a valid alias for a known algo it is updated with the proper
@@ -358,7 +314,7 @@ void get_algo_alias( char** algo_or_alias )
    if ( !strcasecmp( *algo_or_alias, algo_alias_map[i][ ALIAS ] ) )
    {
      // found valid alias, return proper name
-      *algo_or_alias = (char* const)( algo_alias_map[i][ PROPER ] );
+      *algo_or_alias = (char*)( algo_alias_map[i][ PROPER ] );
      return;
    }
 }
--- a/algo-gate-api.h
+++ b/algo-gate-api.h
@@ -75,7 +75,7 @@

 // my hack at creating a set data type using bit masks. Set inclusion,
 // exclusion union and intersection operations are provided for convenience. In // some cases it may be desireable to use boolean algebra directly on the
-// data to perfomr set operations. Sets can be represented as single
+// data to perform set operations. Sets can be represented as single
 // elements, a bitwise OR of multiple elements, a bitwise OR of multiple
 // set variables or constants, or combinations of the above.
 // Examples:
@@ -110,65 +110,60 @@ inline bool set_excl ( set_t a, set_t b ) { return (a & b) == 0; }

 typedef struct
 {
-// mandatory functions, must be overwritten
+// mandatory function, must be overwritten
 int ( *scanhash ) ( struct work*, uint32_t, uint64_t*, struct thr_info* );

-// optional unsafe, must be overwritten if algo uses function
-void ( *hash )     ( void*, const void*, uint32_t ) ;
-void ( *hash_suw ) ( void*, const void* );
+// Deprecated, will be removed
+int ( *hash )     ( void*, const void*, uint32_t ) ;

 //optional, safe to use default in most cases

 // Allocate thread local buffers and other initialization specific to miner
 // threads.
-bool ( *miner_thread_init )      ( int );
-
-// Generate global blockheader from stratum data.
-void ( *stratum_gen_work )       ( struct stratum_ctx*, struct work* );
+bool ( *miner_thread_init )     ( int );

 // Get thread local copy of blockheader with unique nonce.
-void ( *get_new_work )           ( struct work*, struct work*, int, uint32_t*,
-                                   bool );
-
-// Return pointer to nonce in blockheader.
-uint32_t *( *get_nonceptr )      ( uint32_t* );
+void ( *get_new_work )          ( struct work*, struct work*, int, uint32_t* );

 // Decode getwork blockheader
-bool ( *work_decode )            ( const json_t*, struct work* );
+bool ( *work_decode )           ( const json_t*, struct work* );

 // Extra getwork data
-void ( *decode_extra_data )      ( struct work*, uint64_t* );
+void ( *decode_extra_data )     ( struct work*, uint64_t* );

-bool ( *submit_getwork_result )  ( CURL*, struct work* );
+bool ( *submit_getwork_result ) ( CURL*, struct work* );

-void ( *gen_merkle_root )        ( char*, struct stratum_ctx* );
+void ( *gen_merkle_root )       ( char*, struct stratum_ctx* );

 // Increment extranonce
-void ( *build_extraheader )      ( struct work*, struct stratum_ctx* );
+void ( *build_extraheader )     ( struct work*, struct stratum_ctx* );
+
+void ( *build_block_header )    ( struct work*, uint32_t, uint32_t*,
+	                                uint32_t*, uint32_t, uint32_t,
+                                   unsigned char* );

-void ( *build_block_header )     ( struct work*, uint32_t, uint32_t*,
-	                                uint32_t*, uint32_t, uint32_t );
 // Build mining.submit message
-void ( *build_stratum_request )  ( char*, struct work*, struct stratum_ctx* );
+void ( *build_stratum_request ) ( char*, struct work*, struct stratum_ctx* );

-char* ( *malloc_txs_request )    ( struct work* );
+char* ( *malloc_txs_request )   ( struct work* );

 // Big or little
-void ( *set_work_data_endian )   ( struct work* );
+void ( *set_work_data_endian )  ( struct work* );

-double ( *calc_network_diff )    ( struct work* );
+double ( *calc_network_diff )   ( struct work* );

 // Wait for first work
-bool ( *ready_to_mine )          ( struct work*, struct stratum_ctx*, int );
+bool ( *ready_to_mine )         ( struct work*, struct stratum_ctx*, int );

 // Diverge mining threads
-bool ( *do_this_thread )         ( int );
+bool ( *do_this_thread )        ( int );

 // After do_this_thread
-void ( *resync_threads )         ( struct work* );
+void ( *resync_threads )        ( struct work* );
+
+// No longer needed
+json_t* (*longpoll_rpc_call)      ( CURL*, int*, char* );

-json_t* (*longpoll_rpc_call)     ( CURL*, int*, char* );
-bool ( *stratum_handle_response )( json_t* );
 set_t optimizations;
 int  ( *get_work_data_size )     ();
 int  ntime_index;
@@ -216,36 +211,24 @@ void four_way_not_tested();
 int null_scanhash();

 // displays warning
-void null_hash    ();
-void null_hash_suw();
+int null_hash    ();

 // optional safe targets, default listed first unless noted.

-uint32_t *std_get_nonceptr( uint32_t *work_data );
-uint32_t *jr2_get_nonceptr( uint32_t *work_data );
-
 void std_get_new_work( struct work *work, struct work *g_work, int thr_id,
-                       uint32_t* end_nonce_ptr, bool clean_job );
-void jr2_get_new_work( struct work *work, struct work *g_work, int thr_id,
                       uint32_t* end_nonce_ptr );

-void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *work );
-void jr2_stratum_gen_work( struct stratum_ctx *sctx, struct work *work );
-
 void sha256d_gen_merkle_root( char *merkle_root, struct stratum_ctx *sctx );
 void SHA256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );

 bool std_le_work_decode( const json_t *val, struct work *work );
 bool std_be_work_decode( const json_t *val, struct work *work );
-bool jr2_work_decode(    const json_t *val, struct work *work );

 bool std_le_submit_getwork_result( CURL *curl, struct work *work );
 bool std_be_submit_getwork_result( CURL *curl, struct work *work );
-bool jr2_submit_getwork_result(    CURL *curl, struct work *work );

 void std_le_build_stratum_request( char *req, struct work *work );
 void std_be_build_stratum_request( char *req, struct work *work );
-void jr2_build_stratum_request   ( char *req, struct work *work );

 char* std_malloc_txs_request( struct work *work );

@@ -256,15 +239,12 @@ double std_calc_network_diff( struct work *work );

 void std_build_block_header( struct work* g_work, uint32_t version,
 	                          uint32_t *prevhash,  uint32_t *merkle_root,
-   	                       uint32_t ntime,      uint32_t nbits );
+   	                       uint32_t ntime,      uint32_t nbits,
+                             unsigned char *final_sapling_hash );

 void std_build_extraheader( struct work *work, struct stratum_ctx *sctx );

 json_t* std_longpoll_rpc_call( CURL *curl, int *err, char *lp_url );
-json_t* jr2_longpoll_rpc_call( CURL *curl, int *err );
-
-bool std_stratum_handle_response( json_t *val );
-bool jr2_stratum_handle_response( json_t *val );

 bool std_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
                        int thr_id );
@@ -283,11 +263,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate );
 // compiler warnings but that's just more work for devs adding new algos.
 bool register_algo( algo_gate_t *gate );

-// Overrides a common set of functions used by RPC2 and other RPC2-specific
-// init. Called by algo's register function before initializing algo-specific
-// functions and data.
-bool register_json_rpc2( algo_gate_t *gate );
-
 // use this to call the hash function of an algo directly, ie util.c test.
 void exec_hash_function( int algo, void *output, const void *pdata );

--- a/algo/argon2/argon2d/argon2d-gate.c
+++ b/algo/argon2/argon2d/argon2d-gate.c
@@ -1,4 +1,5 @@
 #include "argon2d-gate.h"
+#include "simd-utils.h"
 #include "argon2d/argon2.h"

 static const size_t INPUT_BYTES = 80;  // Lenth of a block header in bytes. Input Length = Salt Length (salt = input)
@@ -36,7 +37,7 @@ void argon2d_crds_hash( void *output, const void *input )
 int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t _ALIGN(64) endiandata[20];
+   uint32_t _ALIGN(64) edata[20];
   uint32_t _ALIGN(64) hash[8];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
@@ -45,11 +46,11 @@ int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
   const uint32_t Htarg = ptarget[7];
   uint32_t nonce = first_nonce;

-   swab32_array( endiandata, pdata, 20 );
+   swab32_array( edata, pdata, 20 );

   do {
-      be32enc(&endiandata[19], nonce);
-      argon2d_crds_hash( hash, endiandata );
+      be32enc(&edata[19], nonce);
+      argon2d_crds_hash( hash, edata );
      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) && !opt_benchmark )
      {
          pdata[19] = nonce;
@@ -103,31 +104,32 @@ void argon2d_dyn_hash( void *output, const void *input )
 int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t _ALIGN(64) endiandata[20];
+   uint32_t _ALIGN(64) edata[20];   // byte-swapped copy of the 80 byte header that gets hashed
    uint32_t _ALIGN(64) hash[8];
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
-   int thr_id = mythr->id;  // thr_id arg is deprecated
-   const uint32_t first_nonce = pdata[19];
-   const uint32_t Htarg = ptarget[7];
+   const int thr_id = mythr->id;   // index into work_restart[]
+   const uint32_t first_nonce = (const uint32_t)pdata[19];
+   const uint32_t last_nonce = (const uint32_t)max_nonce;   // exclusive upper bound of the scan
    uint32_t nonce = first_nonce;
+   const bool bench = opt_benchmark;   // read once; solutions are not submitted when set
 
-   swab32_array( endiandata, pdata, 20 );
-
+   mm128_bswap32_80( edata, pdata );   // presumably byte-swaps all 20 words of pdata into edata - TODO confirm
    do
    {
-      be32enc(&endiandata[19], nonce);
-      argon2d_dyn_hash( hash, endiandata );
-      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) && !opt_benchmark )
+      edata[19] = nonce;   // nonce is stored un-swapped in the hashed copy
+      argon2d_dyn_hash( hash, edata );
+      if ( unlikely( valid_hash( (uint64_t*)hash, (uint64_t*)ptarget )
+           && !bench ) )
       {
-          pdata[19] = nonce;
+          pdata[19] = bswap_32( nonce );   // work data carries the nonce byte-swapped relative to edata
           submit_solution( work, hash, mythr );
       }
       nonce++;
-  } while (nonce < max_nonce && !work_restart[thr_id].restart);
+  } while ( likely( nonce < last_nonce && !work_restart[thr_id].restart ) );
 
    pdata[19] = nonce;
-   *hashes_done = pdata[19] - first_nonce + 1;
+   *hashes_done = pdata[19] - first_nonce;   // nonce was already advanced past the last hash, so no +1
    return 0;
 }

@@ -146,36 +148,34 @@ int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t _ALIGN(64) vhash[8];
-   uint32_t _ALIGN(64) endiandata[20];
+   uint32_t _ALIGN(64) edata[20];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
-   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = (const uint32_t)max_nonce;
   uint32_t n = first_nonce;
-   int thr_id = mythr->id;  // thr_id arg is deprecated
+   const int thr_id = mythr->id;  // thr_id arg is deprecated
   uint32_t t_cost = 1; // 1 iteration
   uint32_t m_cost = 4096; // use 4MB
   uint32_t parallelism = 1; // 1 thread, 2 lanes
+   const bool bench = opt_benchmark;

-   for ( int i = 0; i < 19; i++ )
-      be32enc( &endiandata[i], pdata[i] );
+   mm128_bswap32_80( edata, pdata );

   do {
-      be32enc( &endiandata[19], n );
-      argon2d_hash_raw( t_cost, m_cost, parallelism, (char*) endiandata, 80,
-                 (char*) endiandata, 80, (char*) vhash, 32, ARGON2_VERSION_13 );
-      if ( vhash[7] < Htarg && fulltest( vhash, ptarget ) && !opt_benchmark )
+      edata[19] = n;
+      argon2d_hash_raw( t_cost, m_cost, parallelism, (char*) edata, 80,
+                 (char*) edata, 80, (char*) vhash, 32, ARGON2_VERSION_13 );
+      if ( unlikely( valid_hash( vhash, ptarget ) && !bench ) )
      {
-         pdata[19] = n;
+         be32enc( &pdata[19], n );
         submit_solution( work, vhash, mythr );
      }
      n++;
+   } while ( likely( n < last_nonce && !work_restart[thr_id].restart ) );

-   } while (n < max_nonce && !work_restart[thr_id].restart);
-
-   *hashes_done = n - first_nonce + 1;
+   *hashes_done = n - first_nonce;
   pdata[19] = n;
-
   return 0;
 }

--- a/algo/blake/blake-4way.c
+++ b/algo/blake/blake-4way.c
@@ -13,7 +13,7 @@ void blakehash_4way(void *state, const void *input)
     uint32_t vhash[8*4] __attribute__ ((aligned (64)));
     blake256r14_4way_context ctx;
     memcpy( &ctx, &blake_4w_ctx, sizeof ctx );
-     blake256r14_4way( &ctx, input + (64<<2), 16 );
+     blake256r14_4way_update( &ctx, input + (64<<2), 16 );
     blake256r14_4way_close( &ctx, vhash );
     dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
 }
@@ -36,7 +36,7 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,

   mm128_bswap32_intrlv80_4x32( vdata, pdata );
   blake256r14_4way_init( &blake_4w_ctx );
-   blake256r14_4way( &blake_4w_ctx, vdata, 64 );
+   blake256r14_4way_update( &blake_4w_ctx, vdata, 64 );

   do {
      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
@@ -48,7 +48,7 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
      if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
      {
          pdata[19] = n+i;
-          submit_lane_solution( work, hash+(i<<3), mythr, i );
+          submit_solution( work, hash+(i<<3), mythr );
      }
      n += 4;

@@ -107,7 +107,7 @@ int scanhash_blake_8way( struct work *work, uint32_t max_nonce,
      if ( (hash+i)[7] <= HTarget && fulltest( hash+i, ptarget ) )
      {
          pdata[19] = n+i;
-          submit_lane_solution( work, hash+(i<<3), mythr, i );
+          submit_solution( work, hash+(i<<3), mythr );
      }
      n += 8;

--- a/algo/blake/blake-hash-4way.h
+++ b/algo/blake/blake-hash-4way.h
@@ -37,8 +37,6 @@
 #ifndef __BLAKE_HASH_4WAY__
 #define __BLAKE_HASH_4WAY__ 1

-//#ifdef __SSE4_2__
-
 #ifdef __cplusplus
 extern "C"{
 #endif
@@ -51,46 +49,41 @@ extern "C"{

 #define SPH_SIZE_blake512   512

-// With SSE4.2 only Blake-256 4 way is available.
-// With AVX2 Blake-256 8way & Blake-512 4 way are also available.
-
-// Blake-256 4 way
+//////////////////////////
+//
+//   Blake-256 4 way SSE2

 typedef struct {
   unsigned char buf[64<<2];
   uint32_t H[8<<2];
-//   __m128i buf[16] __attribute__ ((aligned (64)));
-//   __m128i H[8];
-//   __m128i S[4];    
   size_t ptr;
   uint32_t T0, T1;
   int rounds;   // 14 for blake, 8 for blakecoin & vanilla
 } blake_4way_small_context __attribute__ ((aligned (64)));

-// Default 14 rounds
+// Default, 14 rounds, blake, decred
 typedef blake_4way_small_context blake256_4way_context;
 void blake256_4way_init(void *ctx);
 void blake256_4way_update(void *ctx, const void *data, size_t len);
-#define blake256_4way blake256_4way_update
 void blake256_4way_close(void *ctx, void *dst);

 // 14 rounds, blake, decred
 typedef blake_4way_small_context blake256r14_4way_context;
 void blake256r14_4way_init(void *cc);
 void blake256r14_4way_update(void *cc, const void *data, size_t len);
-#define blake256r14_4way blake256r14_4way_update
 void blake256r14_4way_close(void *cc, void *dst);

 // 8 rounds, blakecoin, vanilla
 typedef blake_4way_small_context blake256r8_4way_context;
 void blake256r8_4way_init(void *cc);
 void blake256r8_4way_update(void *cc, const void *data, size_t len);
-#define blake256r8_4way blake256r8_4way_update
 void blake256r8_4way_close(void *cc, void *dst);

 #ifdef __AVX2__

-// Blake-256 8 way
+//////////////////////////
+//
+//   Blake-256 8 way AVX2

 typedef struct {
   __m256i buf[16] __attribute__ ((aligned (64)));
@@ -104,7 +97,6 @@ typedef struct {
 typedef blake_8way_small_context blake256_8way_context;
 void blake256_8way_init(void *cc);
 void blake256_8way_update(void *cc, const void *data, size_t len);
-//#define blake256_8way blake256_8way_update
 void blake256_8way_close(void *cc, void *dst);

 // 14 rounds, blake, decred
@@ -117,10 +109,9 @@ void blake256r14_8way_close(void *cc, void *dst);
 typedef blake_8way_small_context blake256r8_8way_context;
 void blake256r8_8way_init(void *cc);
 void blake256r8_8way_update(void *cc, const void *data, size_t len);
-#define blake256r8_8way blake256r8_8way_update
 void blake256r8_8way_close(void *cc, void *dst);

-// Blake-512 4 way
+// Blake-512 4 way AVX2

 typedef struct {
   __m256i buf[16];
@@ -134,14 +125,15 @@ typedef blake_4way_big_context blake512_4way_context;

 void blake512_4way_init( blake_4way_big_context *sc );
 void blake512_4way_update( void *cc, const void *data, size_t len );
-#define blake512_4way blake512_4way_update
 void blake512_4way_close( void *cc, void *dst );
-void blake512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
-                                      void *dst );
+void blake512_4way_full( blake_4way_big_context *sc, void * dst,
+                         const void *data, size_t len );

 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

-//Blake-256 16 way
+////////////////////////////
+//
+//   Blake-256 16 way AVX512

 typedef struct {
   __m512i buf[16];
@@ -169,8 +161,9 @@ void blake256r8_16way_init(void *cc);
 void blake256r8_16way_update(void *cc, const void *data, size_t len);
 void blake256r8_16way_close(void *cc, void *dst);

-
-// Blake-512 8 way
+////////////////////////////
+//
+//   Blake-512 8 way AVX512

 typedef struct {
   __m512i buf[16];
@@ -185,12 +178,10 @@ typedef blake_8way_big_context blake512_8way_context;
 void blake512_8way_init( blake_8way_big_context *sc );
 void blake512_8way_update( void *cc, const void *data, size_t len );
 void blake512_8way_close( void *cc, void *dst );
-void blake512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
-                                      void *dst );
+void blake512_8way_full( blake_8way_big_context *sc, void * dst,
+                        const void *data, size_t len );

 #endif  // AVX512
-
-
 #endif  // AVX2

 #ifdef __cplusplus
--- a/algo/blake/blake2b-4way.c
+++ b/algo/blake/blake2b-4way.c
@@ -39,13 +39,13 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
      blake2b_8way_final( &ctx, hash );

      for ( int lane = 0; lane < 8; lane++ )
-      if ( hash7[ lane<<1 ] < Htarg )
+      if ( hash7[ lane<<1 ] <= Htarg )
      {
          extr_lane_8x64( lane_hash, hash, lane, 256 );
          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
          {
              pdata[19] = n + lane;
-              submit_lane_solution( work, lane_hash, mythr, lane );
+              submit_solution( work, lane_hash, mythr );
          }
      }
      n += 8;
@@ -94,13 +94,13 @@ int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
      blake2b_4way_final( &ctx, hash );

      for ( int lane = 0; lane < 4; lane++ )
-      if ( hash7[ lane<<1 ] < Htarg )
+      if ( hash7[ lane<<1 ] <= Htarg )
      {
          extr_lane_4x64( lane_hash, hash, lane, 256 );
          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
          {
              pdata[19] = n + lane;
-              submit_lane_solution( work, lane_hash, mythr, lane );
+              submit_solution( work, lane_hash, mythr );
          }
      }
      n += 4;
--- a/algo/blake/blake2b-hash-4way.c
+++ b/algo/blake/blake2b-hash-4way.c
@@ -33,6 +33,8 @@

 #include "blake2b-hash-4way.h"

+#if defined(__AVX2__)
+
 static const uint8_t sigma[12][16] =
 {
      { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
@@ -203,9 +205,9 @@ void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
   casti_m512i( out, 3 ) = ctx->h[3];
 }

-#endif
+#endif   // AVX512

-#if defined(__AVX2__)
+// AVX2

 // G Mixing function.

@@ -369,4 +371,4 @@ void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
   casti_m256i( out, 3 ) = ctx->h[3];
 }

-#endif
+#endif  // AVX2
--- a/algo/blake/blake2b.c
+++ b/algo/blake/blake2b.c
@@ -4,6 +4,9 @@
 */

 #include "blake2b-gate.h"
+
+#if !defined(BLAKE2B_8WAY) && !defined(BLAKE2B_4WAY)
+
 #include <string.h>
 #include <stdint.h>
 #include "algo/blake/sph_blake2b.h"
@@ -45,7 +48,7 @@ int scanhash_blake2b( struct work *work, uint32_t max_nonce,
 		be32enc(&endiandata[19], n);
 		blake2b_hash(vhashcpu, endiandata);

-		if (vhashcpu[7] < Htarg && fulltest(vhashcpu, ptarget))
+		if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget))
      {
 			pdata[19] = n;
         submit_solution( work, vhashcpu, mythr );
@@ -58,3 +61,4 @@ int scanhash_blake2b( struct work *work, uint32_t max_nonce,
 	return 0;
 }

+#endif
--- a/algo/blake/blake2s-4way.c
+++ b/algo/blake/blake2s-4way.c
@@ -49,7 +49,7 @@ int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
         if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
         {
              pdata[19] = n + lane;
-              submit_lane_solution( work, lane_hash, mythr, lane );
+              submit_solution( work, lane_hash, mythr );
         }
      }
      n += 16;
@@ -104,7 +104,7 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
         if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
         {
              pdata[19] = n + lane;
-              submit_lane_solution( work, lane_hash, mythr, lane );
+              submit_solution( work, lane_hash, mythr );
         }
      }
      n += 8;
@@ -157,7 +157,7 @@ int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
         {
              pdata[19] = n + lane;
-              submit_lane_solution( work, lane_hash, mythr, lane );
+              submit_solution( work, lane_hash, mythr );
              }
      }
      n += 4;
--- a/algo/blake/blake2s.c
+++ b/algo/blake/blake2s.c
@@ -1,5 +1,7 @@
 #include "blake2s-gate.h"

+#if  !defined(BLAKE2S_16WAY) && !defined(BLAKE2S_8WAY) && !defined(BLAKE2S)
+
 #include <string.h>
 #include <stdint.h>

@@ -56,7 +58,7 @@ int scanhash_blake2s( struct work *work,
 	do {
 		be32enc(&endiandata[19], n);
 		blake2s_hash( hash64, endiandata );
-		if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
+		if (hash64[7] <= Htarg && fulltest(hash64, ptarget)) {
 			*hashes_done = n - first_nonce + 1;
 			pdata[19] = n;
 			return true;
@@ -70,3 +72,4 @@ int scanhash_blake2s( struct work *work,

 	return 0;
 }
+#endif
--- a/algo/blake/blake512-hash-4way.c
+++ b/algo/blake/blake512-hash-4way.c
@@ -267,22 +267,22 @@ static const sph_u64 CB[16] = {
 #define CBx_(n)     CBx__(n)
 #define CBx__(n)    CB ## n

-#define CB0   SPH_C64(0x243F6A8885A308D3)
-#define CB1   SPH_C64(0x13198A2E03707344)
-#define CB2   SPH_C64(0xA4093822299F31D0)
-#define CB3   SPH_C64(0x082EFA98EC4E6C89)
-#define CB4   SPH_C64(0x452821E638D01377)
-#define CB5   SPH_C64(0xBE5466CF34E90C6C)
-#define CB6   SPH_C64(0xC0AC29B7C97C50DD)
-#define CB7   SPH_C64(0x3F84D5B5B5470917)
-#define CB8   SPH_C64(0x9216D5D98979FB1B)
-#define CB9   SPH_C64(0xD1310BA698DFB5AC)
-#define CBA   SPH_C64(0x2FFD72DBD01ADFB7)
-#define CBB   SPH_C64(0xB8E1AFED6A267E96)
-#define CBC   SPH_C64(0xBA7C9045F12C7F99)
-#define CBD   SPH_C64(0x24A19947B3916CF7)
-#define CBE   SPH_C64(0x0801F2E2858EFC16)
-#define CBF   SPH_C64(0x636920D871574E69)
+#define CB0   0x243F6A8885A308D3
+#define CB1   0x13198A2E03707344
+#define CB2   0xA4093822299F31D0
+#define CB3   0x082EFA98EC4E6C89
+#define CB4   0x452821E638D01377
+#define CB5   0xBE5466CF34E90C6C
+#define CB6   0xC0AC29B7C97C50DD
+#define CB7   0x3F84D5B5B5470917
+#define CB8   0x9216D5D98979FB1B
+#define CB9   0xD1310BA698DFB5AC
+#define CBA   0x2FFD72DBD01ADFB7
+#define CBB   0xB8E1AFED6A267E96
+#define CBC   0xBA7C9045F12C7F99
+#define CBD   0x24A19947B3916CF7
+#define CBE   0x0801F2E2858EFC16
+#define CBF   0x636920D871574E69

 #define READ_STATE64(state)   do { \
      H0 = (state)->H[0]; \
@@ -349,9 +349,9 @@ static const sph_u64 CB[16] = {
 #define DECL_STATE64_8WAY \
   __m512i H0, H1, H2, H3, H4, H5, H6, H7; \
        __m512i S0, S1, S2, S3; \
-   sph_u64 T0, T1;
+   uint64_t T0, T1;

-#define COMPRESS64_8WAY   do \
+#define COMPRESS64_8WAY( buf )   do \
 { \
  __m512i M0, M1, M2, M3, M4, M5, M6, M7; \
  __m512i M8, M9, MA, MB, MC, MD, ME, MF; \
@@ -424,6 +424,84 @@ static const sph_u64 CB[16] = {
  H7 = mm512_xor4( VF, V7, S3, H7 ); \
 } while (0)

+void blake512_8way_compress( blake_8way_big_context *sc )
+{ // Runs one compression over the 16 vectors in sc->buf, using sc->S, sc->T0 and sc->T1, and updates sc->H in place.
+  __m512i M0, M1, M2, M3, M4, M5, M6, M7;   // message words 0-7 (each __m512i holds eight 64-bit elements)
+  __m512i M8, M9, MA, MB, MC, MD, ME, MF;   // message words 8-15
+  __m512i V0, V1, V2, V3, V4, V5, V6, V7;   // working state, first half
+  __m512i V8, V9, VA, VB, VC, VD, VE, VF;   // working state, second half
+  __m512i shuf_bswap64;
+  // First half of the working state is a copy of the chaining value H.
+  V0 = sc->H[0];
+  V1 = sc->H[1];
+  V2 = sc->H[2];
+  V3 = sc->H[3];
+  V4 = sc->H[4];
+  V5 = sc->H[5];
+  V6 = sc->H[6];
+  V7 = sc->H[7];
+  V8 = _mm512_xor_si512( sc->S[0], m512_const1_64( CB0 ) );   // second half: S[0..3] xor CB0..CB3
+  V9 = _mm512_xor_si512( sc->S[1], m512_const1_64( CB1 ) );
+  VA = _mm512_xor_si512( sc->S[2], m512_const1_64( CB2 ) );
+  VB = _mm512_xor_si512( sc->S[3], m512_const1_64( CB3 ) );
+  VC = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ),   // T0 is mixed into both VC and VD
+                            m512_const1_64( CB4 ) );
+  VD = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ),
+                            m512_const1_64( CB5 ) );
+  VE = _mm512_xor_si512( _mm512_set1_epi64( sc->T1 ),   // T1 is mixed into both VE and VF
+                            m512_const1_64( CB6 ) );
+  VF = _mm512_xor_si512( _mm512_set1_epi64( sc->T1 ),
+                            m512_const1_64( CB7 ) );
+  // Shuffle control reversing the bytes of every 64-bit element (assumes m512_const_64 lists elements high to low - TODO confirm).
+  shuf_bswap64 = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637,
+                                0x28292a2b2c2d2e2f, 0x2021222324252627,
+                                0x18191a1b1c1d1e1f, 0x1011121314151617,
+                                0x08090a0b0c0d0e0f, 0x0001020304050607 );
+  // Load the 16 buffered message vectors through the byte-swap shuffle.
+  M0 = _mm512_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );
+  M1 = _mm512_shuffle_epi8( sc->buf[ 1], shuf_bswap64 );
+  M2 = _mm512_shuffle_epi8( sc->buf[ 2], shuf_bswap64 );
+  M3 = _mm512_shuffle_epi8( sc->buf[ 3], shuf_bswap64 );
+  M4 = _mm512_shuffle_epi8( sc->buf[ 4], shuf_bswap64 );
+  M5 = _mm512_shuffle_epi8( sc->buf[ 5], shuf_bswap64 );
+  M6 = _mm512_shuffle_epi8( sc->buf[ 6], shuf_bswap64 );
+  M7 = _mm512_shuffle_epi8( sc->buf[ 7], shuf_bswap64 );
+  M8 = _mm512_shuffle_epi8( sc->buf[ 8], shuf_bswap64 );
+  M9 = _mm512_shuffle_epi8( sc->buf[ 9], shuf_bswap64 );
+  MA = _mm512_shuffle_epi8( sc->buf[10], shuf_bswap64 );
+  MB = _mm512_shuffle_epi8( sc->buf[11], shuf_bswap64 );
+  MC = _mm512_shuffle_epi8( sc->buf[12], shuf_bswap64 );
+  MD = _mm512_shuffle_epi8( sc->buf[13], shuf_bswap64 );
+  ME = _mm512_shuffle_epi8( sc->buf[14], shuf_bswap64 );
+  MF = _mm512_shuffle_epi8( sc->buf[15], shuf_bswap64 );
+  // 16 invocations of ROUND_B_8WAY: argument 0-9, then 0-5 again. The macro presumably uses the M*/V* locals by name - do not rename them.
+  ROUND_B_8WAY(0);
+  ROUND_B_8WAY(1);
+  ROUND_B_8WAY(2);
+  ROUND_B_8WAY(3);
+  ROUND_B_8WAY(4);
+  ROUND_B_8WAY(5);
+  ROUND_B_8WAY(6);
+  ROUND_B_8WAY(7);
+  ROUND_B_8WAY(8);
+  ROUND_B_8WAY(9);
+  ROUND_B_8WAY(0);
+  ROUND_B_8WAY(1);
+  ROUND_B_8WAY(2);
+  ROUND_B_8WAY(3);
+  ROUND_B_8WAY(4);
+  ROUND_B_8WAY(5);
+  // Feed-forward: H[i] is combined with V[i], V[i+8] and S[i mod 4] through mm512_xor4.
+  sc->H[0] = mm512_xor4( V8, V0, sc->S[0], sc->H[0] );
+  sc->H[1] = mm512_xor4( V9, V1, sc->S[1], sc->H[1] );
+  sc->H[2] = mm512_xor4( VA, V2, sc->S[2], sc->H[2] );
+  sc->H[3] = mm512_xor4( VB, V3, sc->S[3], sc->H[3] );
+  sc->H[4] = mm512_xor4( VC, V4, sc->S[0], sc->H[4] );
+  sc->H[5] = mm512_xor4( VD, V5, sc->S[1], sc->H[5] );
+  sc->H[6] = mm512_xor4( VE, V6, sc->S[2], sc->H[6] );
+  sc->H[7] = mm512_xor4( VF, V7, sc->S[3], sc->H[7] );
+}
+
 void blake512_8way_init( blake_8way_big_context *sc )
 {
   __m512i zero = m512_zero;
@@ -455,39 +533,43 @@ blake64_8way( blake_8way_big_context *sc, const void *data, size_t len )

   const int buf_size = 128;  //  sizeof/8

+// 64, 80 bytes: 1st pass copy data. 2nd pass copy padding and compress.   
+// 128 bytes: 1st pass copy data, compress. 2nd pass copy padding, compress.
+   
   buf = sc->buf;
   ptr = sc->ptr;
   if ( len < (buf_size - ptr) )
   {
-   memcpy_512( buf + (ptr>>3), vdata, len>>3 );
-   ptr += len;
-   sc->ptr = ptr;
-   return;
+      memcpy_512( buf + (ptr>>3), vdata, len>>3 );
+      ptr += len;
+      sc->ptr = ptr;
+      return;
   }

   READ_STATE64(sc);
   while ( len > 0 )
   {
-   size_t clen;
+      size_t clen;

-   clen = buf_size - ptr;
-   if ( clen > len )
+      clen = buf_size - ptr;
+      if ( clen > len )
      clen = len;
-   memcpy_512( buf + (ptr>>3), vdata, clen>>3 );
-   ptr += clen;
-   vdata = vdata + (clen>>3);
-   len -= clen;
-   if ( ptr == buf_size )
-        {
-      if ( ( T0 = SPH_T64(T0 + 1024) ) < 1024 )
-         T1 = SPH_T64(T1 + 1);
-      COMPRESS64_8WAY;
-      ptr = 0;
-   }
+      memcpy_512( buf + (ptr>>3), vdata, clen>>3 );
+      ptr += clen;
+      vdata = vdata + (clen>>3);
+      len -= clen;
+      if ( ptr == buf_size )
+      {
+         if ( ( T0 = T0 + 1024 ) < 1024 )
+            T1 = T1 + 1;
+         COMPRESS64_8WAY( buf );
+         ptr = 0;
+      }
   }
   WRITE_STATE64(sc);
   sc->ptr = ptr;
-}
+
+   }

 static void
 blake64_8way_close( blake_8way_big_context *sc, void *dst )
@@ -495,26 +577,22 @@ blake64_8way_close( blake_8way_big_context *sc, void *dst )
   __m512i buf[16];
   size_t ptr;
   unsigned bit_len;
-//   uint64_t z, zz;
-   sph_u64 th, tl;
+   uint64_t th, tl;

   ptr = sc->ptr;
   bit_len = ((unsigned)ptr << 3);
-//   z = 0x80 >> n;
-//   zz = ((ub & -z) | z) & 0xFF;
-//   buf[ptr>>3] = _mm512_set1_epi64( zz );
   buf[ptr>>3] = m512_const1_64( 0x80 );
   tl = sc->T0 + bit_len;
   th = sc->T1;
   if (ptr == 0 )
   {
-   sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
-   sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
+   sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
+   sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
   }
   else if ( sc->T0 == 0 )
   {
-   sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL) + bit_len;
-   sc->T1 = SPH_T64(sc->T1 - 1);
+   sc->T0 = 0xFFFFFFFFFFFFFC00ULL + bit_len;
+   sc->T1 = sc->T1 - 1;
   }
   else
   {
@@ -535,8 +613,8 @@ blake64_8way_close( blake_8way_big_context *sc, void *dst )
       memset_zero_512( buf + (ptr>>3) + 1, (120 - ptr) >> 3 );

       blake64_8way( sc, buf + (ptr>>3), 128 - ptr );
-       sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
-       sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
+       sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
+       sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
       memset_zero_512( buf, 112>>3 );
       buf[104>>3] = m512_const1_64( 0x0100000000000000ULL );
       buf[112>>3] = m512_const1_64( bswap_64( th ) );
@@ -547,6 +625,79 @@ blake64_8way_close( blake_8way_big_context *sc, void *dst )
   mm512_block_bswap_64( (__m512i*)dst, sc->H );
 }

+// One-shot Blake-512 8 way hash: init, update & close fused into a single call.
+void blake512_8way_full( blake_8way_big_context *sc, void * dst, 
+                        const void *data, size_t len )
+{  // sc is fully re-initialised here; data/len is the input; dst receives the final H via mm512_block_bswap_64.
+   // NOTE(review): the padding code below only looks valid for len == 128 or len>>3 <= 12; other lengths are not checked - confirm callers.
+// init
+   // H starts from eight fixed 64-bit constants (m512_const1_64 presumably broadcasts them to every element).
+   casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
+   casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
+   casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
+   casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53A5F1D36F1 );
+   casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527FADE682D1 );
+   casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C2B3E6C1F );
+   casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
+   casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
+   // S is all zero.
+   casti_m512i( sc->S, 0 ) = m512_zero;
+   casti_m512i( sc->S, 1 ) = m512_zero;
+   casti_m512i( sc->S, 2 ) = m512_zero;
+   casti_m512i( sc->S, 3 ) = m512_zero;
+   // Counters and buffer fill level start at zero.
+   sc->T0 = sc->T1 = 0;
+   sc->ptr = 0;
+
+// update
+   // Single copy into buf, no loop: len>>3 is passed as the count (presumably a number of __m512i vectors - TODO confirm).
+   memcpy_512( sc->buf, (__m512i*)data, len>>3 );
+   sc->ptr = len;
+   if ( len == 128 )   // buffer completely full: compress it now, padding goes into a second block
+   {
+      if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )   // add 1024 to T0, carry into T1 on wrap
+            sc->T1 = sc->T1 + 1;
+      blake512_8way_compress( sc );
+      sc->ptr = 0;
+   }
+
+// close
+   // ptr64 indexes sc->buf: each buffer vector accounts for 8 of the len units.
+   size_t ptr64 = sc->ptr >> 3;
+   unsigned bit_len;
+   uint64_t th, tl;
+
+   bit_len = sc->ptr << 3;
+   sc->buf[ptr64] = m512_const1_64( 0x80 );   // padding starts right after the data
+   tl = sc->T0 + bit_len;   // th:tl is the value stored in the last two buffer vectors below
+   th = sc->T1;
+   // Pre-bias T0/T1 so the unconditional += 1024 further down yields 0 (no data in this block) or T0 + bit_len.
+   if ( ptr64 == 0 )
+   {
+   sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
+   sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
+   }
+   else if ( sc->T0 == 0 )
+   {
+   sc->T0 = 0xFFFFFFFFFFFFFC00ULL + bit_len;
+   sc->T1 = sc->T1 - 1;
+   }
+   else   // looks unreachable here: T0 is non-zero only after the len == 128 path, which also resets ptr to 0
+      sc->T0 -= 1024 - bit_len;
+   // Zero the gap after the 0x80 marker, then fill the fixed tail: buf[13], and th/tl byte-swapped in buf[14], buf[15].
+   memset_zero_512( sc->buf + ptr64 + 1, 13 - ptr64 );   // NOTE(review): count goes negative if ptr64 > 13
+   sc->buf[13] = m512_const1_64( 0x0100000000000000ULL );   // NOTE(review): overwrites the 0x80 marker if ptr64 == 13
+   sc->buf[14] = m512_const1_64( bswap_64( th ) );
+   sc->buf[15] = m512_const1_64( bswap_64( tl ) );
+
+   if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
+       sc->T1 = sc->T1 + 1;
+
+   blake512_8way_compress( sc );
+   // Output: sc->H is handed to mm512_block_bswap_64 with dst as destination (presumably a 64-bit byte-swapped copy).
+   mm512_block_bswap_64( (__m512i*)dst, sc->H );
+}
+   
 void
 blake512_8way_update(void *cc, const void *data, size_t len)
 {
@@ -555,12 +706,6 @@ blake512_8way_update(void *cc, const void *data, size_t len)

 void
 blake512_8way_close(void *cc, void *dst)
-{
-   blake512_8way_addbits_and_close(cc, 0, 0, dst);
-}
-
-void
-blake512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 {
   blake64_8way_close(cc, dst);
 }
@@ -596,7 +741,7 @@ blake512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 #define DECL_STATE64_4WAY \
 	__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
        __m256i S0, S1, S2, S3; \
-	sph_u64 T0, T1;
+	uint64_t T0, T1;

 #define COMPRESS64_4WAY   do \
 { \
@@ -670,6 +815,81 @@ blake512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 } while (0)


+void blake512_4way_compress( blake_4way_big_context *sc )
+{  // Runs one compression over the 16 vectors in sc->buf, using sc->S, sc->T0 and sc->T1, and updates sc->H in place.
+  __m256i M0, M1, M2, M3, M4, M5, M6, M7;   // message words 0-7 (each __m256i holds four 64-bit elements)
+  __m256i M8, M9, MA, MB, MC, MD, ME, MF;   // message words 8-15
+  __m256i V0, V1, V2, V3, V4, V5, V6, V7;   // working state, first half
+  __m256i V8, V9, VA, VB, VC, VD, VE, VF;   // working state, second half
+  __m256i shuf_bswap64;
+  // First half of the working state is a copy of the chaining value H.
+  V0 = sc->H[0];
+  V1 = sc->H[1];
+  V2 = sc->H[2];
+  V3 = sc->H[3];
+  V4 = sc->H[4];
+  V5 = sc->H[5];
+  V6 = sc->H[6];
+  V7 = sc->H[7];
+  V8 = _mm256_xor_si256( sc->S[0], m256_const1_64( CB0 ) );   // second half: S[0..3] xor CB0..CB3
+  V9 = _mm256_xor_si256( sc->S[1], m256_const1_64( CB1 ) );
+  VA = _mm256_xor_si256( sc->S[2], m256_const1_64( CB2 ) );
+  VB = _mm256_xor_si256( sc->S[3], m256_const1_64( CB3 ) );
+  VC = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ),   // T0 is mixed into both VC and VD
+                             m256_const1_64( CB4 ) );
+  VD = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ),
+                             m256_const1_64( CB5 ) );
+  VE = _mm256_xor_si256( _mm256_set1_epi64x( sc->T1 ),   // T1 is mixed into both VE and VF
+                             m256_const1_64( CB6 ) );
+  VF = _mm256_xor_si256( _mm256_set1_epi64x( sc->T1 ),
+                             m256_const1_64( CB7 ) );
+  shuf_bswap64 = m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617,   // reverses the bytes of every 64-bit element
+                                0x08090a0b0c0d0e0f, 0x0001020304050607 ); // (assumes elements are listed high to low - TODO confirm)
+  // Load the 16 buffered message vectors through the byte-swap shuffle.
+  M0 = _mm256_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );
+  M1 = _mm256_shuffle_epi8( sc->buf[ 1], shuf_bswap64 );
+  M2 = _mm256_shuffle_epi8( sc->buf[ 2], shuf_bswap64 );
+  M3 = _mm256_shuffle_epi8( sc->buf[ 3], shuf_bswap64 );
+  M4 = _mm256_shuffle_epi8( sc->buf[ 4], shuf_bswap64 );
+  M5 = _mm256_shuffle_epi8( sc->buf[ 5], shuf_bswap64 );
+  M6 = _mm256_shuffle_epi8( sc->buf[ 6], shuf_bswap64 );
+  M7 = _mm256_shuffle_epi8( sc->buf[ 7], shuf_bswap64 );
+  M8 = _mm256_shuffle_epi8( sc->buf[ 8], shuf_bswap64 );
+  M9 = _mm256_shuffle_epi8( sc->buf[ 9], shuf_bswap64 );
+  MA = _mm256_shuffle_epi8( sc->buf[10], shuf_bswap64 );
+  MB = _mm256_shuffle_epi8( sc->buf[11], shuf_bswap64 );
+  MC = _mm256_shuffle_epi8( sc->buf[12], shuf_bswap64 );
+  MD = _mm256_shuffle_epi8( sc->buf[13], shuf_bswap64 );
+  ME = _mm256_shuffle_epi8( sc->buf[14], shuf_bswap64 );
+  MF = _mm256_shuffle_epi8( sc->buf[15], shuf_bswap64 );
+  // 16 invocations of ROUND_B_4WAY: argument 0-9, then 0-5 again. The macro presumably uses the M*/V* locals by name - do not rename them.
+  ROUND_B_4WAY(0);
+  ROUND_B_4WAY(1);
+  ROUND_B_4WAY(2);
+  ROUND_B_4WAY(3);
+  ROUND_B_4WAY(4);
+  ROUND_B_4WAY(5);
+  ROUND_B_4WAY(6);
+  ROUND_B_4WAY(7);
+  ROUND_B_4WAY(8);
+  ROUND_B_4WAY(9);
+  ROUND_B_4WAY(0);
+  ROUND_B_4WAY(1);
+  ROUND_B_4WAY(2);
+  ROUND_B_4WAY(3);
+  ROUND_B_4WAY(4);
+  ROUND_B_4WAY(5);
+  // Feed-forward: H[i] is combined with V[i], V[i+8] and S[i mod 4] through mm256_xor4.
+  sc->H[0] = mm256_xor4( V8, V0, sc->S[0], sc->H[0] );
+  sc->H[1] = mm256_xor4( V9, V1, sc->S[1], sc->H[1] );
+  sc->H[2] = mm256_xor4( VA, V2, sc->S[2], sc->H[2] );
+  sc->H[3] = mm256_xor4( VB, V3, sc->S[3], sc->H[3] );
+  sc->H[4] = mm256_xor4( VC, V4, sc->S[0], sc->H[4] );
+  sc->H[5] = mm256_xor4( VD, V5, sc->S[1], sc->H[5] );
+  sc->H[6] = mm256_xor4( VE, V6, sc->S[2], sc->H[6] );
+  sc->H[7] = mm256_xor4( VF, V7, sc->S[3], sc->H[7] );
+}
+
 void blake512_4way_init( blake_4way_big_context *sc )
 {
   __m256i zero = m256_zero;
@@ -681,10 +901,12 @@ void blake512_4way_init( blake_4way_big_context *sc )
   casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F );
   casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
   casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
+
   casti_m256i( sc->S, 0 ) = zero;
   casti_m256i( sc->S, 1 ) = zero;
   casti_m256i( sc->S, 2 ) = zero;
   casti_m256i( sc->S, 3 ) = zero;
+
   sc->T0 = sc->T1 = 0;
   sc->ptr = 0;
 }
@@ -703,31 +925,31 @@ blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
   ptr = sc->ptr;
   if ( len < (buf_size - ptr) )
   {
-	memcpy_256( buf + (ptr>>3), vdata, len>>3 );
-	ptr += len;
-	sc->ptr = ptr;
-	return;
+   	memcpy_256( buf + (ptr>>3), vdata, len>>3 );
+	   ptr += len;
+	   sc->ptr = ptr;
+	   return;
   }

   READ_STATE64(sc);
   while ( len > 0 )
   {
-	size_t clen;
+   	size_t clen;

-	clen = buf_size - ptr;
-	if ( clen > len )
-		clen = len;
-	memcpy_256( buf + (ptr>>3), vdata, clen>>3 );
-	ptr += clen;
-	vdata = vdata + (clen>>3);
-	len -= clen;
-	if (ptr == buf_size )
-        {
-		if ((T0 = SPH_T64(T0 + 1024)) < 1024)
-			T1 = SPH_T64(T1 + 1);
-		COMPRESS64_4WAY;
-		ptr = 0;
-	}
+	   clen = buf_size - ptr;
+	   if ( clen > len )
+		   clen = len;
+   	memcpy_256( buf + (ptr>>3), vdata, clen>>3 );
+	   ptr += clen;
+	   vdata = vdata + (clen>>3);
+	   len -= clen;
+	   if ( ptr == buf_size )
+      {
+		   if ( (T0 = T0 + 1024 ) < 1024 )
+			   T1 = SPH_T64(T1 + 1);
+	   	COMPRESS64_4WAY;
+		   ptr = 0;
+	   }
   }
   WRITE_STATE64(sc);
   sc->ptr = ptr;
@@ -739,7 +961,7 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )
   __m256i buf[16];
   size_t ptr;
   unsigned bit_len;
-   sph_u64 th, tl;
+   uint64_t th, tl;

   ptr = sc->ptr;
   bit_len = ((unsigned)ptr << 3);
@@ -748,13 +970,13 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )
   th = sc->T1;
   if (ptr == 0 )
   {
-	sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
-	sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
+	sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
+	sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
   }
   else if ( sc->T0 == 0 )
   {
-	sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL) + bit_len;
-	sc->T1 = SPH_T64(sc->T1 - 1);
+	sc->T0 = 0xFFFFFFFFFFFFFC00ULL + bit_len;
+	sc->T1 = sc->T1 - 1;
   } 
   else
   {
@@ -788,13 +1010,77 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )
   mm256_block_bswap_64( (__m256i*)dst, sc->H );
 }

-/*
-void
-blake512_4way_init(void *cc)
+// Init, update & close in a single call, for one short message.
+// NOTE(review): only len <= 104 or len == 128 looks supported, a longer
+// tail would need a second padding block that is never built here; len is
+// also presumably a multiple of 8 (len>>3 vectors are copied). Confirm.
+void blake512_4way_full( blake_4way_big_context *sc, void * dst,
+                         const void *data, size_t len )
 {
-	blake64_4way_init(cc, IV512, salt_zero_big);
+// init
+   casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
+   casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE8584CAA73B );
+   casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF372FE94F82B );
+   casti_m256i( sc->H, 3 ) = m256_const1_64( 0xA54FF53A5F1D36F1 );
+   casti_m256i( sc->H, 4 ) = m256_const1_64( 0x510E527FADE682D1 );
+   casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F );
+   casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
+   casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
+
+   casti_m256i( sc->S, 0 ) = m256_zero;
+   casti_m256i( sc->S, 1 ) = m256_zero;
+   casti_m256i( sc->S, 2 ) = m256_zero;
+   casti_m256i( sc->S, 3 ) = m256_zero;
+
+   sc->T0 = sc->T1 = 0;
+   sc->ptr = 0;
+
+// update: a full 128 byte block is compressed right away, anything shorter
+// stays in sc->buf and is finished by the close step below.
+   memcpy_256( sc->buf, (__m256i*)data, len>>3 );
+   sc->ptr += len;
+   if ( len == 128 )
+   {
+      if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
+         sc->T1 =  sc->T1 + 1;
+      blake512_4way_compress( sc );
+      sc->ptr = 0;
+   }
+
+// close
+
+   size_t ptr64 = sc->ptr >> 3;
+   unsigned bit_len;
+   uint64_t th, tl;
+
+   bit_len = sc->ptr << 3;
+   sc->buf[ptr64] = m256_const1_64( 0x80 );
+   tl = sc->T0 + bit_len;
+   th = sc->T1;
+   if ( sc->ptr == 0 )
+   {
+      sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
+      sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
+   }
+   else if ( sc->T0 == 0 )
+   {
+      sc->T0 = 0xFFFFFFFFFFFFFC00ULL + bit_len;
+      sc->T1 = sc->T1 - 1;
+   }
+   else
+        sc->T0 -= 1024 - bit_len;
+
+   memset_zero_256( sc->buf + ptr64 + 1, 13 - ptr64 );
+   // OR, not assign: when ptr64 == 13 buf[13] already holds the 0x80 marker.
+   sc->buf[13] = _mm256_or_si256( sc->buf[13],
+                           m256_const1_64( 0x0100000000000000ULL ) );
+   sc->buf[14] = m256_const1_64( bswap_64( th ) );
+   sc->buf[15] = m256_const1_64( bswap_64( tl ) );
+   if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
+       sc->T1 = sc->T1 + 1;
+   blake512_4way_compress( sc );
+   mm256_block_bswap_64( (__m256i*)dst, sc->H );
 }
-*/

 void
 blake512_4way_update(void *cc, const void *data, size_t len)
@@ -806,17 +1092,8 @@ void
 blake512_4way_close(void *cc, void *dst)
 {
   blake64_4way_close( cc, dst );
-
-//   blake512_4way_addbits_and_close(cc, dst);
 }

-/*
-void
-blake512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
-{
-	blake64_4way_close(cc, ub, n, dst, 8);
-}
-*/
 #ifdef __cplusplus
 }
 #endif
--- a/algo/blake/blakecoin-4way.c
+++ b/algo/blake/blakecoin-4way.c
@@ -14,7 +14,7 @@ void blakecoin_4way_hash(void *state, const void *input)
     blake256r8_4way_context ctx;

     memcpy( &ctx, &blakecoin_4w_ctx, sizeof ctx );
-     blake256r8_4way( &ctx, input + (64<<2), 16 );
+     blake256r8_4way_update( &ctx, input + (64<<2), 16 );
     blake256r8_4way_close( &ctx, vhash );

     dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
@@ -37,7 +37,7 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,

   mm128_bswap32_intrlv80_4x32( vdata, pdata );
   blake256r8_4way_init( &blakecoin_4w_ctx );
-   blake256r8_4way( &blakecoin_4w_ctx, vdata, 64 );
+   blake256r8_4way_update( &blakecoin_4w_ctx, vdata, 64 );

   do {
      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
@@ -49,7 +49,7 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
           && !opt_benchmark )
      {
          pdata[19] = n+i;
-          submit_lane_solution( work, hash+(i<<3), mythr, i );
+          submit_solution( work, hash+(i<<3), mythr );
      }
      n += 4;

@@ -71,7 +71,7 @@ void blakecoin_8way_hash( void *state, const void *input )
     blake256r8_8way_context ctx;

     memcpy( &ctx, &blakecoin_8w_ctx, sizeof ctx );
-     blake256r8_8way( &ctx, input + (64<<3), 16 );
+     blake256r8_8way_update( &ctx, input + (64<<3), 16 );
     blake256r8_8way_close( &ctx, vhash );

     dintrlv_8x32( state,     state+ 32, state+ 64, state+ 96, state+128,
@@ -95,7 +95,7 @@ int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,

   mm256_bswap32_intrlv80_8x32( vdata, pdata );
   blake256r8_8way_init( &blakecoin_8w_ctx );
-   blake256r8_8way( &blakecoin_8w_ctx, vdata, 64 );
+   blake256r8_8way_update( &blakecoin_8w_ctx, vdata, 64 );

   do {
      *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
@@ -108,7 +108,7 @@ int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
          && !opt_benchmark )
      {
          pdata[19] = n+i;
-          submit_lane_solution( work, hash+(i<<3), mythr, i );
+          submit_solution( work, hash+(i<<3), mythr );
      }
      n += 8;
   } while ( (n < max_nonce) && !work_restart[thr_id].restart );
--- a/algo/blake/blakecoin.c
+++ b/algo/blake/blakecoin.c
@@ -1,4 +1,7 @@
 #include "blakecoin-gate.h"
+
+#if !defined(BLAKECOIN_8WAY) && !defined(BLAKECOIN_4WAY)
+
 #define BLAKE32_ROUNDS 8
 #include "sph_blake.h"

@@ -93,3 +96,4 @@ int scanhash_blakecoin( struct work *work, uint32_t max_nonce,
 	return 0;
 }

+#endif
--- a/algo/blake/decred-4way.c
+++ b/algo/blake/decred-4way.c
@@ -21,7 +21,7 @@ void decred_hash_4way( void *state, const void *input )
     blake256_4way_context ctx __attribute__ ((aligned (64)));

     memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
-     blake256_4way( &ctx, tail, tail_len );
+     blake256_4way_update( &ctx, tail, tail_len );
     blake256_4way_close( &ctx, vhash );
     dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
 }
@@ -46,7 +46,7 @@ int scanhash_decred_4way( struct work *work, uint32_t max_nonce,
   mm128_intrlv_4x32x( vdata, edata, edata, edata, edata, 180*8 );

   blake256_4way_init( &blake_mid );
-   blake256_4way( &blake_mid, vdata, DECRED_MIDSTATE_LEN );
+   blake256_4way_update( &blake_mid, vdata, DECRED_MIDSTATE_LEN );

   uint32_t *noncep = vdata + DECRED_NONCE_INDEX * 4;
   do {
@@ -62,7 +62,7 @@ int scanhash_decred_4way( struct work *work, uint32_t max_nonce,
      if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
      {
          pdata[DECRED_NONCE_INDEX] = n+i;
-          submit_lane_solution( work, hash+(i<<3), mythr, i );
+          submit_solution( work, hash+(i<<3), mythr );
      }
      n += 4;
  } while ( (n < max_nonce) && !work_restart[thr_id].restart );
--- a/algo/blake/decred-gate.c
+++ b/algo/blake/decred-gate.c
@@ -153,7 +153,7 @@ bool register_decred_algo( algo_gate_t* gate )
  gate->hash      = (void*)&decred_hash;
 #endif
  gate->optimizations = AVX2_OPT;
-  gate->get_nonceptr          = (void*)&decred_get_nonceptr;
+//  gate->get_nonceptr          = (void*)&decred_get_nonceptr;
  gate->decode_extra_data     = (void*)&decred_decode_extradata;
  gate->build_stratum_request = (void*)&decred_be_build_stratum_request;
  gate->work_decode           = (void*)&std_be_work_decode;
--- a/algo/blake/decred.c
+++ b/algo/blake/decred.c
@@ -1,4 +1,7 @@
 #include "decred-gate.h"
+
+#if !defined(DECRED_8WAY) && !defined(DECRED_4WAY)
+
 #include "sph_blake.h"

 #include <string.h>
@@ -275,3 +278,5 @@ bool register_decred_algo( algo_gate_t* gate )
  return true;
 }
 */
+
+#endif
--- a/algo/blake/pentablake-4way.c
+++ b/algo/blake/pentablake-4way.c
@@ -22,23 +22,23 @@ extern void pentablakehash_4way( void *output, const void *input )


     blake512_4way_init( &ctx );
-     blake512_4way( &ctx, input, 80 );
+     blake512_4way_update( &ctx, input, 80 );
     blake512_4way_close( &ctx, vhash );

     blake512_4way_init( &ctx );
-     blake512_4way( &ctx, vhash, 64 );
+     blake512_4way_update( &ctx, vhash, 64 );
     blake512_4way_close( &ctx, vhash );

     blake512_4way_init( &ctx );
-     blake512_4way( &ctx, vhash, 64 );
+     blake512_4way_update( &ctx, vhash, 64 );
     blake512_4way_close( &ctx, vhash );

     blake512_4way_init( &ctx );
-     blake512_4way( &ctx, vhash, 64 );
+     blake512_4way_update( &ctx, vhash, 64 );
     blake512_4way_close( &ctx, vhash );

     blake512_4way_init( &ctx );
-     blake512_4way( &ctx, vhash, 64 );
+     blake512_4way_update( &ctx, vhash, 64 );
     blake512_4way_close( &ctx, vhash );

     memcpy( output,    hash0, 32 );
@@ -105,7 +105,7 @@ int scanhash_pentablake_4way( struct work *work,
                  && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
              {
                 pdata[19] = n + i;
-                 submit_lane_solution( work, hash+(i<<3), mythr, i );
+                 submit_solution( work, hash+(i<<3), mythr );
              }
              n += 4;

--- a/algo/blake/pentablake.c
+++ b/algo/blake/pentablake.c
@@ -1,4 +1,7 @@
 #include "pentablake-gate.h"
+
+#if !defined(PENTABLAKE_8WAY) && !defined(PENTABLAKE_4WAY)
+
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -111,3 +114,4 @@ int scanhash_pentablake( struct work *work, uint32_t max_nonce,
 	return 0;
 } 

+#endif
--- a/algo/bmw/bmw-hash-4way.h
+++ b/algo/bmw/bmw-hash-4way.h
@@ -138,7 +138,7 @@ void bmw512_2way_close( bmw512_2way_context *ctx, void *dst );

 #if defined(__AVX2__)

-// BMW-512 4 way 64
+// BMW-512 64 bit 4 way

 typedef struct {
   __m256i buf[16];
@@ -149,7 +149,6 @@ typedef struct {

 typedef bmw_4way_big_context bmw512_4way_context;

-
 void bmw512_4way_init(void *cc);

 void bmw512_4way_update(void *cc, const void *data, size_t len);
@@ -164,6 +163,7 @@ void bmw512_4way_addbits_and_close(

 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

+// BMW-512 64 bit 8 way
 typedef struct {
   __m512i buf[16];
   __m512i H[16];
@@ -171,6 +171,8 @@ typedef struct {
   uint64_t bit_count;
 } bmw512_8way_context __attribute__((aligned(128)));

+void bmw512_8way_full( bmw512_8way_context *ctx, void *out, const void *data,
+                         size_t len );
 void bmw512_8way_init( bmw512_8way_context *ctx );
 void bmw512_8way_update( bmw512_8way_context *ctx, const void *data,
                         size_t len );
--- a/algo/bmw/bmw512-4way.c
+++ b/algo/bmw/bmw512-4way.c
@@ -40,13 +40,13 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
      bmw512hash_8way( hash, vdata );

      for ( int lane = 0; lane < 8; lane++ )
-      if ( unlikely( hash7[ lane<<1 ] < Htarg ) )
+      if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
      {
          extr_lane_8x64( lane_hash, hash, lane, 256 );
          if ( fulltest( lane_hash, ptarget ) )
          {
              pdata[19] = n + lane;
-              submit_lane_solution( work, lane_hash, mythr, lane );
+              submit_solution( work, lane_hash, mythr );
          }
      }
      n += 8;
@@ -93,14 +93,13 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
      bmw512hash_4way( hash, vdata );

      for ( int lane = 0; lane < 4; lane++ )
-      if ( unlikely( hash7[ lane<<1 ] < Htarg ) )
-//      if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
+      if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
      {
          extr_lane_4x64( lane_hash, hash, lane, 256 );
          if ( fulltest( lane_hash, ptarget ) )
          {
              pdata[19] = n + lane;
-              submit_lane_solution( work, lane_hash, mythr, lane );
+              submit_solution( work, lane_hash, mythr );
          }
      }
      n += 4;
--- a/algo/bmw/bmw512-hash-4way.c
+++ b/algo/bmw/bmw512-hash-4way.c
@@ -1507,6 +1507,93 @@ void bmw512_8way_close( bmw512_8way_context *ctx, void *dst )
      casti_m512i( dst, u ) = h1[ v ];
 }

+// One-shot BMW-512 8 way: init, update and close in a single call.
+// The digest written to out is the upper half, h1[8..15], of the final state.
+// NOTE(review): len is presumably a multiple of 8, data is copied in whole
+// vectors (clen >> 3). Confirm with callers.
+void bmw512_8way_full( bmw512_8way_context *ctx, void *out, const void *data,
+                                size_t len )
+{
+   __m512i *vdata = (__m512i*)data;
+   __m512i *buf = ctx->buf;
+   __m512i htmp[16];
+   __m512i *H = ctx->H;
+   __m512i *h2 = htmp;
+   uint64_t bit_count = len * 8;
+   size_t ptr = 0;
+   const int buf_size = 128;  // bytes of one lane, compatible with len
+
+// Init: set H to its starting constants.
+   H[ 0] = m512_const1_64( 0x8081828384858687 );
+   H[ 1] = m512_const1_64( 0x88898A8B8C8D8E8F );
+   H[ 2] = m512_const1_64( 0x9091929394959697 );
+   H[ 3] = m512_const1_64( 0x98999A9B9C9D9E9F );
+   H[ 4] = m512_const1_64( 0xA0A1A2A3A4A5A6A7 );
+   H[ 5] = m512_const1_64( 0xA8A9AAABACADAEAF );
+   H[ 6] = m512_const1_64( 0xB0B1B2B3B4B5B6B7 );
+   H[ 7] = m512_const1_64( 0xB8B9BABBBCBDBEBF );
+   H[ 8] = m512_const1_64( 0xC0C1C2C3C4C5C6C7 );
+   H[ 9] = m512_const1_64( 0xC8C9CACBCCCDCECF );
+   H[10] = m512_const1_64( 0xD0D1D2D3D4D5D6D7 );
+   H[11] = m512_const1_64( 0xD8D9DADBDCDDDEDF );
+   H[12] = m512_const1_64( 0xE0E1E2E3E4E5E6E7 );
+   H[13] = m512_const1_64( 0xE8E9EAEBECEDEEEF );
+   H[14] = m512_const1_64( 0xF0F1F2F3F4F5F6F7 );
+   H[15] = m512_const1_64( 0xF8F9FAFBFCFDFEFF );
+
+// Update: copy the input through buf, compressing every full 128 byte block.
+// H and h2 swap roles after each block so the new state is the next input.
+   while ( len > 0 )
+   {
+      size_t clen;
+      clen = buf_size - ptr;
+      if ( clen > len )
+         clen = len;
+      memcpy_512( buf + (ptr>>3), vdata, clen >> 3 );
+      vdata = vdata + (clen>>3);
+      len -= clen;
+      ptr += clen;
+      if ( ptr == buf_size )
+      {
+         __m512i *ht;
+         compress_big_8way( buf, H, h2 );
+         ht = H;
+         H = h2;
+         h2 = ht;
+         ptr = 0;
+      }
+   }
+   if ( H != ctx->H )
+      memcpy_512( ctx->H, H, 16 );
+
+// Close: pad, add the bit count, compress, then compress again with final_b8.
+{
+   __m512i h1[16], h2[16];   // this h2 array shadows the outer h2 pointer
+   size_t u, v;
+   buf[ ptr>>3 ] = m512_const1_64( 0x80 );
+   ptr += 8;
+   // No room left for the 8 byte bit count: flush an extra padding block.
+   if (  ptr > (buf_size - 8) )
+   {
+      memset_zero_512( buf + (ptr>>3), (buf_size - ptr) >> 3 );
+      compress_big_8way( buf, H, h1 );
+      ptr = 0;
+      H = h1;
+   }
+   memset_zero_512( buf + (ptr>>3), (buf_size - 8 - ptr) >> 3 );
+   buf[ (buf_size - 8) >> 3 ] = _mm512_set1_epi64( bit_count );
+   compress_big_8way( buf, H, h2 );
+   for ( u = 0; u < 16; u ++ )
+      buf[ u ] = h2[ u ];
+   compress_big_8way( buf, final_b8, h1 );
+   for (u = 0, v = 8; u < 8; u ++, v ++)
+      casti_m512i( out, u ) = h1[ v ];
+}
+}   
+
+
+
 #endif // AVX512

 #ifdef __cplusplus
--- a/algo/bmw/bmw512.c
+++ b/algo/bmw/bmw512.c
@@ -1,5 +1,7 @@
 #include "algo-gate-api.h"

+#if !defined(BMW512_8WAY) && !defined(BMW512_4WAY)
+
 #include <stdlib.h>
 #include <string.h>
 #include <stdint.h>
@@ -50,4 +52,4 @@ int scanhash_bmw512( struct work *work, uint32_t max_nonce,
 	pdata[19] = n;
 	return 0;
 }
-
+#endif
--- a/algo/bmw/sph_bmw.c
+++ b/algo/bmw/sph_bmw.c
@@ -48,6 +48,8 @@ extern "C"{
 #pragma warning (disable: 4146)
 #endif

+#if !defined(__AVX2__)
+
 static const sph_u32 IV224[] = {
 	SPH_C32(0x00010203), SPH_C32(0x04050607),
 	SPH_C32(0x08090A0B), SPH_C32(0x0C0D0E0F),
@@ -70,6 +72,8 @@ static const sph_u32 IV256[] = {
 	SPH_C32(0x78797A7B), SPH_C32(0x7C7D7E7F)
 };

+#endif // !AVX2
+
 #if SPH_64

 static const sph_u64 IV384[] = {
@@ -135,6 +139,8 @@ static const sph_u64 IV512[] = {
 #define M16_30   14, 15,  1,  2,  5,  8,  9
 #define M16_31   15, 16,  2,  3,  6,  9, 10

+#if !defined(__AVX2__)
+
 #define ss0(x)    (((x) >> 1) ^ SPH_T32((x) << 3) \
                  ^ SPH_ROTL32(x,  4) ^ SPH_ROTL32(x, 19))
 #define ss1(x)    (((x) >> 1) ^ SPH_T32((x) << 2) \
@@ -189,6 +195,8 @@ static const sph_u64 IV512[] = {
 #define expand2s_(qf, mf, hf, i16, ix, iy) \
 	expand2s_inner LPAR qf, mf, hf, i16, ix, iy)

+#endif // !AVX2
+
 #if SPH_64

 #define sb0(x)    (((x) >> 1) ^ SPH_T64((x) << 3) \
@@ -291,6 +299,8 @@ static const sph_u64 Kb_tab[] = {
 	tt((M(i0) ^ H(i0)) op01 (M(i1) ^ H(i1)) op12 (M(i2) ^ H(i2)) \
 	op23 (M(i3) ^ H(i3)) op34 (M(i4) ^ H(i4)))

+#if !defined(__AVX2__)
+
 #define Ws0    MAKE_W(SPH_T32,  5, -,  7, +, 10, +, 13, +, 14)
 #define Ws1    MAKE_W(SPH_T32,  6, -,  8, +, 11, +, 14, -, 15)
 #define Ws2    MAKE_W(SPH_T32,  0, +,  7, +,  9, -, 12, +, 15)
@@ -407,6 +417,8 @@ static const sph_u64 Kb_tab[] = {

 #define Qs(j)   (qt[j])

+#endif  // !AVX2
+
 #if SPH_64

 #define Wb0    MAKE_W(SPH_T64,  5, -,  7, +, 10, +, 13, +, 14)
@@ -557,7 +569,6 @@ static const sph_u64 Kb_tab[] = {
 			+ ((xl >> 2) ^ qf(22) ^ qf(15))); \
 	} while (0)

-#define FOLDs   FOLD(sph_u32, MAKE_Qs, SPH_T32, SPH_ROTL32, M, Qs, dH)

 #if SPH_64

@@ -565,6 +576,10 @@ static const sph_u64 Kb_tab[] = {

 #endif

+#if !defined(__AVX2__)
+
+#define FOLDs   FOLD(sph_u32, MAKE_Qs, SPH_T32, SPH_ROTL32, M, Qs, dH)
+
 static void
 compress_small(const unsigned char *data, const sph_u32 h[16], sph_u32 dh[16])
 {
@@ -711,6 +726,8 @@ bmw32_close(sph_bmw_small_context *sc, unsigned ub, unsigned n,
 		sph_enc32le(out + 4 * u, h1[v]);
 }

+#endif // !AVX2
+
 #if SPH_64

 static void
@@ -840,6 +857,8 @@ bmw64_close(sph_bmw_big_context *sc, unsigned ub, unsigned n,

 #endif

+#if !defined(__AVX2__)
+
 /* see sph_bmw.h */
 void
 sph_bmw224_init(void *cc)
@@ -898,6 +917,8 @@ sph_bmw256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 //	sph_bmw256_init(cc);
 }

+#endif // !AVX2
+
 #if SPH_64

 /* see sph_bmw.h */
--- a/algo/bmw/sph_bmw.h
+++ b/algo/bmw/sph_bmw.h
@@ -77,6 +77,9 @@ extern "C"{
 * computation can be cloned by copying the context (e.g. with a simple
 * <code>memcpy()</code>).
 */
+
+#if !defined(__AVX2__)
+
 typedef struct {
 #ifndef DOXYGEN_IGNORE
 	unsigned char buf[64];    /* first field, for alignment */
@@ -102,6 +105,8 @@ typedef sph_bmw_small_context sph_bmw224_context;
 */
 typedef sph_bmw_small_context sph_bmw256_context;

+#endif // !AVX2
+
 #if SPH_64

 /**
@@ -137,6 +142,8 @@ typedef sph_bmw_big_context sph_bmw512_context;

 #endif

+#if !defined(__AVX2__)
+
 /**
 * Initialize a BMW-224 context. This process performs no memory allocation.
 *
@@ -227,6 +234,8 @@ void sph_bmw256_close(void *cc, void *dst);
 void sph_bmw256_addbits_and_close(
 	void *cc, unsigned ub, unsigned n, void *dst);

+#endif // !AVX2
+
 #if SPH_64

 /**
--- a/algo/cryptonight/cryptolight.c
+++ b/algo/cryptonight/cryptolight.c
@@ -1,368 +0,0 @@
-// Copyright (c) 2012-2013 The Cryptonote developers
-// Distributed under the MIT/X11 software license, see the accompanying
-// file COPYING or http://www.opensource.org/licenses/mit-license.php.
-
-#include "algo-gate-api.h"
-
-#if defined(__arm__) || defined(_MSC_VER)
-#ifndef NOASM
-#define NOASM
-#endif
-#endif
-
-#include "crypto/oaes_lib.h"
-#include "crypto/c_keccak.h"
-#include "crypto/c_groestl.h"
-#include "crypto/c_blake256.h"
-#include "crypto/c_jh.h"
-#include "crypto/c_skein.h"
-#include "crypto/int-util.h"
-#include "crypto/hash-ops.h"
-
-#if USE_INT128
-
-#if __GNUC__ == 4 && __GNUC_MINOR__ >= 4 && __GNUC_MINOR__ < 6
-typedef unsigned int uint128_t __attribute__ ((__mode__ (TI)));
-#elif defined (_MSC_VER)
-/* only for mingw64 on windows */
-#undef  USE_INT128
-#define USE_INT128 (0)
-#else
-typedef __uint128_t uint128_t;
-#endif
-
-#endif
-
-#define LITE 1
-#if LITE /* cryptonight-light */
-#define MEMORY (1 << 20)
-#define ITER   (1 << 19)
-#else
-#define MEMORY (1 << 21) /* 2 MiB */
-#define ITER   (1 << 20)
-#endif
-
-#define AES_BLOCK_SIZE  16
-#define AES_KEY_SIZE    32 /*16*/
-#define INIT_SIZE_BLK   8
-#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE)
-
-#pragma pack(push, 1)
-union cn_slow_hash_state {
-	union hash_state hs;
-	struct {
-		uint8_t k[64];
-		uint8_t init[INIT_SIZE_BYTE];
-	};
-};
-#pragma pack(pop)
-
-static void do_blake_hash(const void* input, size_t len, char* output) {
-	blake256_hash((uint8_t*)output, input, len);
-}
-
-static void do_groestl_hash(const void* input, size_t len, char* output) {
-	groestl(input, len * 8, (uint8_t*)output);
-}
-
-static void do_jh_hash(const void* input, size_t len, char* output) {
-	int r = jh_hash(HASH_SIZE * 8, input, 8 * len, (uint8_t*)output);
-	assert(likely(SUCCESS == r));
-}
-
-static void do_skein_hash(const void* input, size_t len, char* output) {
-	int r = skein_hash(8 * HASH_SIZE, input, 8 * len, (uint8_t*)output);
-	assert(likely(SKEIN_SUCCESS == r));
-}
-
-extern int aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey);
-extern int aesb_pseudo_round_mut(uint8_t *val, uint8_t *expandedKey);
-#if !defined(_MSC_VER) && !defined(NOASM)
-extern int fast_aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey);
-extern int fast_aesb_pseudo_round_mut(uint8_t *val, uint8_t *expandedKey);
-#else
-#define fast_aesb_single_round     aesb_single_round
-#define fast_aesb_pseudo_round_mut aesb_pseudo_round_mut
-#endif
-
-#if defined(NOASM) || !defined(__x86_64__)
-static uint64_t mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi) {
-	// multiplier   = ab = a * 2^32 + b
-	// multiplicand = cd = c * 2^32 + d
-	// ab * cd = a * c * 2^64 + (a * d + b * c) * 2^32 + b * d
-	uint64_t a = hi_dword(multiplier);
-	uint64_t b = lo_dword(multiplier);
-	uint64_t c = hi_dword(multiplicand);
-	uint64_t d = lo_dword(multiplicand);
-
-	uint64_t ac = a * c;
-	uint64_t ad = a * d;
-	uint64_t bc = b * c;
-	uint64_t bd = b * d;
-
-	uint64_t adbc = ad + bc;
-	uint64_t adbc_carry = adbc < ad ? 1 : 0;
-
-	// multiplier * multiplicand = product_hi * 2^64 + product_lo
-	uint64_t product_lo = bd + (adbc << 32);
-	uint64_t product_lo_carry = product_lo < bd ? 1 : 0;
-	*product_hi = ac + (adbc >> 32) + (adbc_carry << 32) + product_lo_carry;
-	assert(ac <= *product_hi);
-
-	return product_lo;
-}
-#else
-extern uint64_t mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi);
-#endif
-
-static void (* const extra_hashes[4])(const void *, size_t, char *) = {
-		do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash
-};
-
-
-static inline size_t e2i(const uint8_t* a) {
-#if !LITE
-	return ((uint32_t *)a)[0] & 0x1FFFF0;
-#else
-	return ((uint32_t *)a)[0] & 0xFFFF0;
-#endif
-}
-
-static inline void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst) {
-	uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1];
-	hi += ((uint64_t*) c)[0];
-
-	((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi;
-	((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo;
-	((uint64_t*) dst)[0] = hi;
-	((uint64_t*) dst)[1] = lo;
-}
-
-static inline void xor_blocks(uint8_t* a, const uint8_t* b) {
-#if USE_INT128
-	*((uint128_t*) a) ^= *((uint128_t*) b);
-#else
-	((uint64_t*) a)[0] ^= ((uint64_t*) b)[0];
-	((uint64_t*) a)[1] ^= ((uint64_t*) b)[1];
-#endif
-}
-
-static inline void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) {
-#if USE_INT128
-	*((uint128_t*) dst) = *((uint128_t*) a) ^ *((uint128_t*) b);
-#else
-	((uint64_t*) dst)[0] = ((uint64_t*) a)[0] ^ ((uint64_t*) b)[0];
-	((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1];
-#endif
-}
-
-struct cryptonight_ctx {
-	uint8_t _ALIGN(16) long_state[MEMORY];
-	union cn_slow_hash_state state;
-	uint8_t _ALIGN(16) text[INIT_SIZE_BYTE];
-	uint8_t _ALIGN(16) a[AES_BLOCK_SIZE];
-	uint8_t _ALIGN(16) b[AES_BLOCK_SIZE];
-	uint8_t _ALIGN(16) c[AES_BLOCK_SIZE];
-	oaes_ctx* aes_ctx;
-};
-
-static void cryptolight_hash_ctx(void* output, const void* input, int len, struct cryptonight_ctx* ctx)
-{
-        len = 76;
-	hash_process(&ctx->state.hs, (const uint8_t*) input, len);
-	ctx->aes_ctx = (oaes_ctx*) oaes_alloc();
-	size_t i, j;
-	memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
-
-	oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE);
-	for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
-		aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 0], ctx->aes_ctx->key->exp_data);
-		aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 1], ctx->aes_ctx->key->exp_data);
-		aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 2], ctx->aes_ctx->key->exp_data);
-		aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 3], ctx->aes_ctx->key->exp_data);
-		aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 4], ctx->aes_ctx->key->exp_data);
-		aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 5], ctx->aes_ctx->key->exp_data);
-		aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 6], ctx->aes_ctx->key->exp_data);
-		aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 7], ctx->aes_ctx->key->exp_data);
-		memcpy(&ctx->long_state[i], ctx->text, INIT_SIZE_BYTE);
-	}
-
-	xor_blocks_dst(&ctx->state.k[0], &ctx->state.k[32], ctx->a);
-	xor_blocks_dst(&ctx->state.k[16], &ctx->state.k[48], ctx->b);
-
-	for (i = 0; likely(i < ITER / 4); ++i) {
-		/* Dependency chain: address -> read value ------+
-		 * written value <-+ hard function (AES or MUL) <+
-		 * next address  <-+
-		 */
-		/* Iteration 1 */
-		j = e2i(ctx->a);
-		aesb_single_round(&ctx->long_state[j], ctx->c, ctx->a);
-		xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j]);
-		/* Iteration 2 */
-		mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c)]);
-		/* Iteration 3 */
-		j = e2i(ctx->a);
-		aesb_single_round(&ctx->long_state[j], ctx->b, ctx->a);
-		xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[j]);
-		/* Iteration 4 */
-		mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b)]);
-	}
-
-	memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
-	oaes_key_import_data(ctx->aes_ctx, &ctx->state.hs.b[32], AES_KEY_SIZE);
-	for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
-		xor_blocks(&ctx->text[0 * AES_BLOCK_SIZE], &ctx->long_state[i + 0 * AES_BLOCK_SIZE]);
-		aesb_pseudo_round_mut(&ctx->text[0 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
-		xor_blocks(&ctx->text[1 * AES_BLOCK_SIZE], &ctx->long_state[i + 1 * AES_BLOCK_SIZE]);
-		aesb_pseudo_round_mut(&ctx->text[1 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
-		xor_blocks(&ctx->text[2 * AES_BLOCK_SIZE], &ctx->long_state[i + 2 * AES_BLOCK_SIZE]);
-		aesb_pseudo_round_mut(&ctx->text[2 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
-		xor_blocks(&ctx->text[3 * AES_BLOCK_SIZE], &ctx->long_state[i + 3 * AES_BLOCK_SIZE]);
-		aesb_pseudo_round_mut(&ctx->text[3 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
-		xor_blocks(&ctx->text[4 * AES_BLOCK_SIZE], &ctx->long_state[i + 4 * AES_BLOCK_SIZE]);
-		aesb_pseudo_round_mut(&ctx->text[4 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
-		xor_blocks(&ctx->text[5 * AES_BLOCK_SIZE], &ctx->long_state[i + 5 * AES_BLOCK_SIZE]);
-		aesb_pseudo_round_mut(&ctx->text[5 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
-		xor_blocks(&ctx->text[6 * AES_BLOCK_SIZE], &ctx->long_state[i + 6 * AES_BLOCK_SIZE]);
-		aesb_pseudo_round_mut(&ctx->text[6 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
-		xor_blocks(&ctx->text[7 * AES_BLOCK_SIZE], &ctx->long_state[i + 7 * AES_BLOCK_SIZE]);
-		aesb_pseudo_round_mut(&ctx->text[7 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
-	}
-	memcpy(ctx->state.init, ctx->text, INIT_SIZE_BYTE);
-	hash_permutation(&ctx->state.hs);
-	/*memcpy(hash, &state, 32);*/
-	extra_hashes[ctx->state.hs.b[0] & 3](&ctx->state, 200, output);
-	oaes_free((OAES_CTX **) &ctx->aes_ctx);
-}
-
-void cryptolight_hash(void* output, const void* input, int len) {
-	struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx));
-	cryptolight_hash_ctx(output, input, len, ctx);
-	free(ctx);
-}
-
-#if defined(__AES__)
-
-static void cryptolight_hash_ctx_aes_ni(void* output, const void* input,
-                       int len, struct cryptonight_ctx* ctx)
-{
-	hash_process(&ctx->state.hs, (const uint8_t*)input, len);
-	ctx->aes_ctx = (oaes_ctx*) oaes_alloc();
-	size_t i, j;
-	memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
-
-	oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE);
-	for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
-		fast_aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 0], ctx->aes_ctx->key->exp_data);
-		fast_aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 1], ctx->aes_ctx->key->exp_data);
-		fast_aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 2], ctx->aes_ctx->key->exp_data);
-		fast_aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 3], ctx->aes_ctx->key->exp_data);
-		fast_aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 4], ctx->aes_ctx->key->exp_data);
-		fast_aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 5], ctx->aes_ctx->key->exp_data);
-		fast_aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 6], ctx->aes_ctx->key->exp_data);
-		fast_aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 7], ctx->aes_ctx->key->exp_data);
-		memcpy(&ctx->long_state[i], ctx->text, INIT_SIZE_BYTE);
-	}
-
-	xor_blocks_dst(&ctx->state.k[0], &ctx->state.k[32], ctx->a);
-	xor_blocks_dst(&ctx->state.k[16], &ctx->state.k[48], ctx->b);
-
-	for (i = 0; likely(i < ITER / 4); ++i) {
-		/* Dependency chain: address -> read value ------+
-		 * written value <-+ hard function (AES or MUL) <+
-		 * next address  <-+
-		 */
-		/* Iteration 1 */
-		j = e2i(ctx->a);
-		fast_aesb_single_round(&ctx->long_state[j], ctx->c, ctx->a);
-		xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j]);
-		/* Iteration 2 */
-		mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c)]);
-		/* Iteration 3 */
-		j = e2i(ctx->a);
-		fast_aesb_single_round(&ctx->long_state[j], ctx->b, ctx->a);
-		xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[j]);
-		/* Iteration 4 */
-		mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b)]);
-	}
-
-	memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
-	oaes_key_import_data(ctx->aes_ctx, &ctx->state.hs.b[32], AES_KEY_SIZE);
-	for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
-		xor_blocks(&ctx->text[0 * AES_BLOCK_SIZE], &ctx->long_state[i + 0 * AES_BLOCK_SIZE]);
-		fast_aesb_pseudo_round_mut(&ctx->text[0 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
-		xor_blocks(&ctx->text[1 * AES_BLOCK_SIZE], &ctx->long_state[i + 1 * AES_BLOCK_SIZE]);
-		fast_aesb_pseudo_round_mut(&ctx->text[1 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
-		xor_blocks(&ctx->text[2 * AES_BLOCK_SIZE], &ctx->long_state[i + 2 * AES_BLOCK_SIZE]);
-		fast_aesb_pseudo_round_mut(&ctx->text[2 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
-		xor_blocks(&ctx->text[3 * AES_BLOCK_SIZE], &ctx->long_state[i + 3 * AES_BLOCK_SIZE]);
-		fast_aesb_pseudo_round_mut(&ctx->text[3 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
-		xor_blocks(&ctx->text[4 * AES_BLOCK_SIZE], &ctx->long_state[i + 4 * AES_BLOCK_SIZE]);
-		fast_aesb_pseudo_round_mut(&ctx->text[4 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
-		xor_blocks(&ctx->text[5 * AES_BLOCK_SIZE], &ctx->long_state[i + 5 * AES_BLOCK_SIZE]);
-		fast_aesb_pseudo_round_mut(&ctx->text[5 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
-		xor_blocks(&ctx->text[6 * AES_BLOCK_SIZE], &ctx->long_state[i + 6 * AES_BLOCK_SIZE]);
-		fast_aesb_pseudo_round_mut(&ctx->text[6 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
-		xor_blocks(&ctx->text[7 * AES_BLOCK_SIZE], &ctx->long_state[i + 7 * AES_BLOCK_SIZE]);
-		fast_aesb_pseudo_round_mut(&ctx->text[7 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
-	}
-	memcpy(ctx->state.init, ctx->text, INIT_SIZE_BYTE);
-	hash_permutation(&ctx->state.hs);
-	/*memcpy(hash, &state, 32);*/
-	extra_hashes[ctx->state.hs.b[0] & 3](&ctx->state, 200, output);
-	oaes_free((OAES_CTX **) &ctx->aes_ctx);
-}
-
-#endif
-
-int scanhash_cryptolight( struct work *work,
-		uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr)
-{
-        uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
-	uint32_t *nonceptr = (uint32_t*) (((char*)pdata) + 39);
-	uint32_t n = *nonceptr - 1;
-	const uint32_t first_nonce = n + 1;
-	//const uint32_t Htarg = ptarget[7];
-	uint32_t _ALIGN(32) hash[HASH_SIZE / 4];
-   int thr_id = mythr->id;
-
-	struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx));
-
-#if defined(__AES__)
-		do {
-			*nonceptr = ++n;
-			cryptolight_hash_ctx_aes_ni(hash, pdata, 76, ctx);
-			if (unlikely(hash[7] < ptarget[7])) {
-				*hashes_done = n - first_nonce + 1;
-				free(ctx);
-				return true;
-			}
-		} while (likely((n <= max_nonce && !work_restart[thr_id].restart)));
-#else
-		do {
-			*nonceptr = ++n;
-			cryptolight_hash_ctx(hash, pdata, 76, ctx);
-			if (unlikely(hash[7] < ptarget[7])) {
-				*hashes_done = n - first_nonce + 1;
-				free(ctx);
-				return true;
-			}
-		} while (likely((n <= max_nonce && !work_restart[thr_id].restart)));
-#endif
-	free(ctx);
-	*hashes_done = n - first_nonce + 1;
-	return 0;
-}
-
-bool register_cryptolight_algo( algo_gate_t* gate )
-{
-  register_json_rpc2( gate );
-  gate->optimizations = SSE2_OPT | AES_OPT;
-  gate->scanhash  = (void*)&scanhash_cryptolight;
-  gate->hash      = (void*)&cryptolight_hash;
-  gate->hash_suw  = (void*)&cryptolight_hash; 
-  return true;
-};
-
--- a/algo/cryptonight/cryptonight-aesni.c
+++ b/algo/cryptonight/cryptonight-aesni.c
@@ -1,357 +0,0 @@
-#if defined(__AES__)
-
-#include <x86intrin.h>
-#include <memory.h>
-#include "cryptonight.h"
-#include "miner.h"
-#include "crypto/c_keccak.h"
-#include <immintrin.h>
-
-static inline void ExpandAESKey256_sub1(__m128i *tmp1, __m128i *tmp2)
-{
-	__m128i tmp4;
-	*tmp2 = _mm_shuffle_epi32(*tmp2, 0xFF);
-	tmp4 = _mm_slli_si128(*tmp1, 0x04);
-	*tmp1 = _mm_xor_si128(*tmp1, tmp4);
-	tmp4 = _mm_slli_si128(tmp4, 0x04);
-	*tmp1 = _mm_xor_si128(*tmp1, tmp4);
-	tmp4 = _mm_slli_si128(tmp4, 0x04);
-	*tmp1 = _mm_xor_si128(*tmp1, tmp4);
-	*tmp1 = _mm_xor_si128(*tmp1, *tmp2);
-}
-
-static inline void ExpandAESKey256_sub2(__m128i *tmp1, __m128i *tmp3)
-{
-	__m128i tmp2, tmp4;
-	
-	tmp4 = _mm_aeskeygenassist_si128(*tmp1, 0x00);
-	tmp2 = _mm_shuffle_epi32(tmp4, 0xAA);
-	tmp4 = _mm_slli_si128(*tmp3, 0x04);
-	*tmp3 = _mm_xor_si128(*tmp3, tmp4);
-	tmp4 = _mm_slli_si128(tmp4, 0x04);
-	*tmp3 = _mm_xor_si128(*tmp3, tmp4);
-	tmp4 = _mm_slli_si128(tmp4, 0x04);
-	*tmp3 = _mm_xor_si128(*tmp3, tmp4);
-	*tmp3 = _mm_xor_si128(*tmp3, tmp2);
-}
-
-// Special thanks to Intel for helping me
-// with ExpandAESKey256() and its subroutines
-static inline void ExpandAESKey256(char *keybuf)
-{
-	__m128i tmp1, tmp2, tmp3, *keys;
-	
-	keys = (__m128i *)keybuf;
-	
-	tmp1 = _mm_load_si128((__m128i *)keybuf);
-	tmp3 = _mm_load_si128((__m128i *)(keybuf+0x10));
-	
-	tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x01);
-	ExpandAESKey256_sub1(&tmp1, &tmp2);
-	keys[2] = tmp1;
-	ExpandAESKey256_sub2(&tmp1, &tmp3);
-	keys[3] = tmp3;
-	
-	tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x02);
-	ExpandAESKey256_sub1(&tmp1, &tmp2);
-	keys[4] = tmp1;
-	ExpandAESKey256_sub2(&tmp1, &tmp3);
-	keys[5] = tmp3;
-	
-	tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x04);
-	ExpandAESKey256_sub1(&tmp1, &tmp2);
-	keys[6] = tmp1;
-	ExpandAESKey256_sub2(&tmp1, &tmp3);
-	keys[7] = tmp3;
-	
-	tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x08);
-	ExpandAESKey256_sub1(&tmp1, &tmp2);
-	keys[8] = tmp1;
-	ExpandAESKey256_sub2(&tmp1, &tmp3);
-	keys[9] = tmp3;
-	
-	tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x10);
-	ExpandAESKey256_sub1(&tmp1, &tmp2);
-	keys[10] = tmp1;
-	ExpandAESKey256_sub2(&tmp1, &tmp3);
-	keys[11] = tmp3;
-	
-	tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x20);
-	ExpandAESKey256_sub1(&tmp1, &tmp2);
-	keys[12] = tmp1;
-	ExpandAESKey256_sub2(&tmp1, &tmp3);
-	keys[13] = tmp3;
-	
-	tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x40);
-	ExpandAESKey256_sub1(&tmp1, &tmp2);
-	keys[14] = tmp1;
-}
-
-// align to 64 byte cache line
-typedef struct 
-{
-    uint8_t long_state[MEMORY] __attribute((aligned(64)));
-    union cn_slow_hash_state state;
-    uint8_t text[INIT_SIZE_BYTE] __attribute((aligned(64)));
-    uint64_t a[AES_BLOCK_SIZE >> 3] __attribute__((aligned(64)));
-    uint64_t b[AES_BLOCK_SIZE >> 3] __attribute__((aligned(64)));
-    uint8_t c[AES_BLOCK_SIZE] __attribute__((aligned(64)));
-} cryptonight_ctx;
-
-static __thread cryptonight_ctx ctx;
-
-void cryptonight_hash_aes( void *restrict output, const void *input, int len )
-{
-    uint8_t ExpandedKey[256] __attribute__((aligned(64)));
-    __m128i *longoutput, *expkey, *xmminput;
-    size_t i, j;
-    
-    keccak( (const uint8_t*)input, 76, (char*)&ctx.state.hs.b, 200 );
-
-    if ( cryptonightV7 && len < 43 )
-      return;
-
-    const uint64_t tweak = cryptonightV7 
-                         ? *((const uint64_t*) (((const uint8_t*)input) + 35))
-                           ^ ctx.state.hs.w[24] : 0; 
-
-    memcpy( ExpandedKey, ctx.state.hs.b, AES_KEY_SIZE );
-    ExpandAESKey256( ExpandedKey );
-    memcpy( ctx.text, ctx.state.init, INIT_SIZE_BYTE );
-    
-    longoutput = (__m128i*)ctx.long_state;
-    xmminput   = (__m128i*)ctx.text;
-    expkey     = (__m128i*)ExpandedKey;
-    
-    // prefetch expkey, xmminput and enough longoutput for 4 iterations
-    _mm_prefetch( xmminput,     _MM_HINT_T0 );
-    _mm_prefetch( xmminput + 4, _MM_HINT_T0 );
-    _mm_prefetch( expkey,     _MM_HINT_T0 );
-    _mm_prefetch( expkey + 4, _MM_HINT_T0 );
-    _mm_prefetch( expkey + 8, _MM_HINT_T0 );
-    for ( i = 0; i < 64; i += 16 )
-    {
-        __builtin_prefetch( longoutput + i,      1, 0 );
-        __builtin_prefetch( longoutput + i +  4, 1, 0 );
-        __builtin_prefetch( longoutput + i +  8, 1, 0 );
-        __builtin_prefetch( longoutput + i + 12, 1, 0 );
-    }
-
-    // n-4 iterations
-    for ( i = 0; likely( i < MEMORY_M128I - 4*INIT_SIZE_M128I );
-                         i += INIT_SIZE_M128I )
-    {
-        // prefetch 4 iterations ahead.
-        __builtin_prefetch( longoutput + i + 64, 1, 0 );
-        __builtin_prefetch( longoutput + i + 68, 1, 0 );
-
-	for ( j = 0; j < 10; j++ )
-	{
-		xmminput[0] = _mm_aesenc_si128( xmminput[0], expkey[j] );
-		xmminput[1] = _mm_aesenc_si128( xmminput[1], expkey[j] );
-		xmminput[2] = _mm_aesenc_si128( xmminput[2], expkey[j] );
-		xmminput[3] = _mm_aesenc_si128( xmminput[3], expkey[j] );
-		xmminput[4] = _mm_aesenc_si128( xmminput[4], expkey[j] );
-		xmminput[5] = _mm_aesenc_si128( xmminput[5], expkey[j] );
-		xmminput[6] = _mm_aesenc_si128( xmminput[6], expkey[j] );
-		xmminput[7] = _mm_aesenc_si128( xmminput[7], expkey[j] );
-	}
-	_mm_store_si128( &( longoutput[i  ] ), xmminput[0] );
-	_mm_store_si128( &( longoutput[i+1] ), xmminput[1] );
-	_mm_store_si128( &( longoutput[i+2] ), xmminput[2] );
-	_mm_store_si128( &( longoutput[i+3] ), xmminput[3] );
-	_mm_store_si128( &( longoutput[i+4] ), xmminput[4] );
-	_mm_store_si128( &( longoutput[i+5] ), xmminput[5] );
-	_mm_store_si128( &( longoutput[i+6] ), xmminput[6] );
-	_mm_store_si128( &( longoutput[i+7] ), xmminput[7] );
-    }
-    // last 4 iterations
-    for ( ; likely( i < MEMORY_M128I ); i += INIT_SIZE_M128I )
-    {
-        for ( j = 0; j < 10; j++ )
-        {
-                xmminput[0] = _mm_aesenc_si128( xmminput[0], expkey[j] );
-                xmminput[1] = _mm_aesenc_si128( xmminput[1], expkey[j] );
-                xmminput[2] = _mm_aesenc_si128( xmminput[2], expkey[j] );
-                xmminput[3] = _mm_aesenc_si128( xmminput[3], expkey[j] );
-                xmminput[4] = _mm_aesenc_si128( xmminput[4], expkey[j] );
-                xmminput[5] = _mm_aesenc_si128( xmminput[5], expkey[j] );
-                xmminput[6] = _mm_aesenc_si128( xmminput[6], expkey[j] );
-                xmminput[7] = _mm_aesenc_si128( xmminput[7], expkey[j] );
-        }
-        _mm_store_si128( &( longoutput[i  ] ), xmminput[0] );
-        _mm_store_si128( &( longoutput[i+1] ), xmminput[1] );
-        _mm_store_si128( &( longoutput[i+2] ), xmminput[2] );
-        _mm_store_si128( &( longoutput[i+3] ), xmminput[3] );
-        _mm_store_si128( &( longoutput[i+4] ), xmminput[4] );
-        _mm_store_si128( &( longoutput[i+5] ), xmminput[5] );
-        _mm_store_si128( &( longoutput[i+6] ), xmminput[6] );
-        _mm_store_si128( &( longoutput[i+7] ), xmminput[7] );
-    }
-
-    ctx.a[0] = ((uint64_t *)ctx.state.k)[0] ^ ((uint64_t *)ctx.state.k)[4];
-    ctx.b[0] = ((uint64_t *)ctx.state.k)[2] ^ ((uint64_t *)ctx.state.k)[6];
-    ctx.a[1] = ((uint64_t *)ctx.state.k)[1] ^ ((uint64_t *)ctx.state.k)[5];
-    ctx.b[1] = ((uint64_t *)ctx.state.k)[3] ^ ((uint64_t *)ctx.state.k)[7];
-
-    uint64_t a[2] __attribute((aligned(16))),
-             b[2] __attribute((aligned(16))),
-             c[2] __attribute((aligned(16)));
-    a[0] = ctx.a[0];
-    a[1] = ctx.a[1];
-    __m128i b_x = _mm_load_si128( (__m128i*)ctx.b );
-    __m128i a_x = _mm_load_si128( (__m128i*)a );
-    __m128i* lsa = (__m128i*)&ctx.long_state[ a[0] & 0x1FFFF0 ];
-    __m128i c_x = _mm_load_si128( lsa );
-    uint64_t *nextblock;
-    uint64_t hi, lo;
-
-    // n-1 iterations
-    for( i = 0; __builtin_expect( i < 0x7ffff, 1 ); i++ )
-    {	  
-	c_x = _mm_aesenc_si128( c_x, a_x );
-	_mm_store_si128( (__m128i*)c, c_x );
-        b_x = _mm_xor_si128( b_x, c_x );
-        nextblock = (uint64_t *)&ctx.long_state[c[0] & 0x1FFFF0];
-        _mm_store_si128( lsa, b_x );
-
-        if ( cryptonightV7 )
-        {
-           const uint8_t tmp = ( (const uint8_t*)(lsa) )[11];
-           const uint8_t index = ( ( (tmp >> 3) & 6 ) | (tmp & 1) ) << 1;
-           ((uint8_t*)(lsa))[11] = tmp ^ ( ( 0x75310 >> index) & 0x30 );
-        } 
-
-	b[0] = nextblock[0];
-	b[1] = nextblock[1];
-
-        // hi,lo = 64bit x 64bit multiply of c[0] and b[0]
-	__asm__( "mulq %3\n\t"
-	         : "=d" ( hi ),
-	           "=a" ( lo )
-	         : "%a" ( c[0] ),
-	           "rm" ( b[0] )
-		 : "cc" );
-
-        b_x = c_x;
-
-        a[0] += hi;
-        a[1] += lo;
-        nextblock[0] = a[0];
-        nextblock[1] = cryptonightV7 ? a[1] ^ tweak : a[1];
-        a[0] ^= b[0];
-        a[1] ^= b[1];
-
-        lsa = (__m128i*)&ctx.long_state[ a[0] & 0x1FFFF0 ];
-        a_x = _mm_load_si128( (__m128i*)a );
-        c_x = _mm_load_si128( lsa );
-    }
-    // abreviated nth iteration
-    c_x = _mm_aesenc_si128( c_x, a_x );
-    _mm_store_si128( (__m128i*)c, c_x );
-    b_x = _mm_xor_si128( b_x, c_x );
-    nextblock = (uint64_t *)&ctx.long_state[c[0] & 0x1FFFF0];
-    _mm_store_si128( lsa, b_x );
-
-    if ( cryptonightV7 )
-    {
-       const uint8_t tmp = ( (const uint8_t*)(lsa) )[11];
-       const uint8_t index = ( ( (tmp >> 3) & 6 ) | (tmp & 1) ) << 1;
-       ((uint8_t*)(lsa))[11] = tmp ^ ( ( 0x75310 >> index) & 0x30 );
-    }
-
-    b[0] = nextblock[0];
-    b[1] = nextblock[1];
-
-    __asm__( "mulq %3\n\t"
-             : "=d" ( hi ),
-               "=a" ( lo )
-             : "%a" ( c[0] ),
-               "rm" ( b[0] )
-             : "cc" );
-
-    a[0] += hi;
-    a[1] += lo;
-    nextblock[0] = a[0];
-    nextblock[1] = cryptonightV7 ? a[1] ^ tweak : a[1];
-    a[0] ^= b[0];
-    a[1] ^= b[1];
-
-    memcpy( ExpandedKey, &ctx.state.hs.b[32], AES_KEY_SIZE );
-    ExpandAESKey256( ExpandedKey );
-    memcpy( ctx.text, ctx.state.init, INIT_SIZE_BYTE );
-    
-    // prefetch expkey, all of xmminput and enough longoutput for 4 loops
-    _mm_prefetch( xmminput,     _MM_HINT_T0 );
-    _mm_prefetch( xmminput + 4, _MM_HINT_T0 );
-    for ( i = 0; i < 64; i += 16 )
-    {
-       _mm_prefetch( longoutput + i,      _MM_HINT_T0 );
-       _mm_prefetch( longoutput + i +  4, _MM_HINT_T0 );
-       _mm_prefetch( longoutput + i +  8, _MM_HINT_T0 );
-       _mm_prefetch( longoutput + i + 12, _MM_HINT_T0 );
-    }
-    _mm_prefetch( expkey,     _MM_HINT_T0 );
-    _mm_prefetch( expkey + 4, _MM_HINT_T0 );
-    _mm_prefetch( expkey + 8, _MM_HINT_T0 );
-
-    // n-4 iterations
-    for ( i = 0; likely( i < MEMORY_M128I - 4*INIT_SIZE_M128I );
-                         i += INIT_SIZE_M128I )
-    {
-        // stay 4 iterations ahead.
-        _mm_prefetch( longoutput + i + 64, _MM_HINT_T0 );
-        _mm_prefetch( longoutput + i + 68, _MM_HINT_T0 );
-
-        xmminput[0] = _mm_xor_si128( longoutput[i  ], xmminput[0] );
-        xmminput[1] = _mm_xor_si128( longoutput[i+1], xmminput[1] );
-        xmminput[2] = _mm_xor_si128( longoutput[i+2], xmminput[2] );
-        xmminput[3] = _mm_xor_si128( longoutput[i+3], xmminput[3] );
-        xmminput[4] = _mm_xor_si128( longoutput[i+4], xmminput[4] );
-        xmminput[5] = _mm_xor_si128( longoutput[i+5], xmminput[5] );
-        xmminput[6] = _mm_xor_si128( longoutput[i+6], xmminput[6] );
-        xmminput[7] = _mm_xor_si128( longoutput[i+7], xmminput[7] );
-		
-        for( j = 0; j < 10; j++ )
-        {
-            xmminput[0] = _mm_aesenc_si128( xmminput[0], expkey[j] );
-	    xmminput[1] = _mm_aesenc_si128( xmminput[1], expkey[j] );
-	    xmminput[2] = _mm_aesenc_si128( xmminput[2], expkey[j] );
-	    xmminput[3] = _mm_aesenc_si128( xmminput[3], expkey[j] );
-	    xmminput[4] = _mm_aesenc_si128( xmminput[4], expkey[j] );
-	    xmminput[5] = _mm_aesenc_si128( xmminput[5], expkey[j] );
-	    xmminput[6] = _mm_aesenc_si128( xmminput[6], expkey[j] );
-	    xmminput[7] = _mm_aesenc_si128( xmminput[7], expkey[j] );
-        }
-    }
-    // last 4 iterations 
-    for ( ; likely( i < MEMORY_M128I ); i += INIT_SIZE_M128I )
-    {
-        xmminput[0] = _mm_xor_si128( longoutput[i  ], xmminput[0] );
-        xmminput[1] = _mm_xor_si128( longoutput[i+1], xmminput[1] );
-        xmminput[2] = _mm_xor_si128( longoutput[i+2], xmminput[2] );
-        xmminput[3] = _mm_xor_si128( longoutput[i+3], xmminput[3] );
-        xmminput[4] = _mm_xor_si128( longoutput[i+4], xmminput[4] );
-        xmminput[5] = _mm_xor_si128( longoutput[i+5], xmminput[5] );
-        xmminput[6] = _mm_xor_si128( longoutput[i+6], xmminput[6] );
-        xmminput[7] = _mm_xor_si128( longoutput[i+7], xmminput[7] );
-
-        for( j = 0; j < 10; j++ )
-        {
-            xmminput[0] = _mm_aesenc_si128( xmminput[0], expkey[j] );
-            xmminput[1] = _mm_aesenc_si128( xmminput[1], expkey[j] );
-            xmminput[2] = _mm_aesenc_si128( xmminput[2], expkey[j] );
-            xmminput[3] = _mm_aesenc_si128( xmminput[3], expkey[j] );
-            xmminput[4] = _mm_aesenc_si128( xmminput[4], expkey[j] );
-            xmminput[5] = _mm_aesenc_si128( xmminput[5], expkey[j] );
-            xmminput[6] = _mm_aesenc_si128( xmminput[6], expkey[j] );
-            xmminput[7] = _mm_aesenc_si128( xmminput[7], expkey[j] );
-        }
-    }
-
-    memcpy( ctx.state.init, ctx.text, INIT_SIZE_BYTE);
-    keccakf( (uint64_t*)&ctx.state.hs.w, 24 );
-    extra_hashes[ctx.state.hs.b[0] & 3](&ctx.state, 200, output);
-
-}
-#endif
--- a/algo/cryptonight/cryptonight-common.c
+++ b/algo/cryptonight/cryptonight-common.c
@@ -1,127 +0,0 @@
-// Copyright (c) 2012-2013 The Cryptonote developers
-// Distributed under the MIT/X11 software license, see the accompanying
-// file COPYING or http://www.opensource.org/licenses/mit-license.php.
-
-// Modified for CPUminer by Lucas Jones
-
-#include "cpuminer-config.h"
-#include "algo-gate-api.h"
-
-#if defined(__AES__)
-  #include "algo/groestl/aes_ni/hash-groestl256.h"
-#else
-#include "crypto/c_groestl.h"
-#endif
-#include "crypto/c_blake256.h"
-#include "crypto/c_jh.h"
-#include "crypto/c_skein.h"
-#include "cryptonight.h"
-
-/*
-#if defined __unix__ && (!defined __APPLE__)
-#include <sys/mman.h>
-#elif defined _WIN32
-#include <windows.h>
-#endif
-*/
-
-void do_blake_hash(const void* input, size_t len, char* output) {
-    blake256_hash((uint8_t*)output, input, len);
-}
-
-void do_groestl_hash(const void* input, size_t len, char* output) {
-#if defined(__AES__)
-    hashState_groestl256 ctx;
-    init_groestl256( &ctx, 32 );
-    update_and_final_groestl256( &ctx, output, input, len * 8 );
-#else
-    groestl(input, len * 8, (uint8_t*)output);
-#endif
-}
-
-void do_jh_hash(const void* input, size_t len, char* output) {
-    jh_hash(32 * 8, input, 8 * len, (uint8_t*)output);
-}
-
-void do_skein_hash(const void* input, size_t len, char* output) {
-    skein_hash(8 * 32, input, 8 * len, (uint8_t*)output);
-}
-
-void (* const extra_hashes[4])( const void *, size_t, char *) =
-    { do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash };
-
-void cryptonight_hash( void *restrict output, const void *input, int len )
-{
-#if defined(__AES__)
-  cryptonight_hash_aes( output, input, len );
-#else
-  cryptonight_hash_ctx ( output, input, len );
-#endif
-}
-
-void cryptonight_hash_suw( void *restrict output, const void *input )
-{
-#if defined(__AES__)
-  cryptonight_hash_aes( output, input, 76 );
-#else
-  cryptonight_hash_ctx ( output, input, 76 );
-#endif
-}
-
-bool cryptonightV7 = false;
-
-int scanhash_cryptonight( struct work *work, uint32_t max_nonce,
-                   uint64_t *hashes_done, struct thr_info *mythr )
- {
-    uint32_t *pdata = work->data;
-    uint32_t *ptarget = work->target;
-    int thr_id = mythr->id;
-
-    uint32_t *nonceptr = (uint32_t*) (((char*)pdata) + 39);
-    uint32_t n = *nonceptr - 1;
-    const uint32_t first_nonce = n + 1;
-    const uint32_t Htarg = ptarget[7];
-    uint32_t hash[32 / 4] __attribute__((aligned(32)));
-
-//    if (  (  cryptonightV7 && ( *(uint8_t*)pdata <  7 ) )
-//       || ( !cryptonightV7 && ( *(uint8_t*)pdata == 7 ) ) )
-//          applog(LOG_WARNING,"Cryptonight variant mismatch, shares may be rejected.");
-
-    do
-    {
-       *nonceptr = ++n;
-       cryptonight_hash( hash, pdata, 76 );
-       if (unlikely( hash[7] < Htarg ))
-       {
-           *hashes_done = n - first_nonce + 1;
-//           work_set_target_ratio( work, hash );
-	   return true;
-       }
-    } while (likely((n <= max_nonce && !work_restart[thr_id].restart)));
-    
-    *hashes_done = n - first_nonce + 1;
-    return 0;
-}
-
-bool register_cryptonight_algo( algo_gate_t* gate )
-{
-  cryptonightV7 = false;
-  register_json_rpc2( gate );
-  gate->optimizations = SSE2_OPT | AES_OPT;
-  gate->scanhash         = (void*)&scanhash_cryptonight;
-  gate->hash             = (void*)&cryptonight_hash;
-  gate->hash_suw         = (void*)&cryptonight_hash_suw;  
-  return true;
-};
-
-bool register_cryptonightv7_algo( algo_gate_t* gate )
-{
-  cryptonightV7 = true;
-  register_json_rpc2( gate );
-  gate->optimizations = SSE2_OPT | AES_OPT;
-  gate->scanhash      = (void*)&scanhash_cryptonight;
-  gate->hash          = (void*)&cryptonight_hash;
-  gate->hash_suw      = (void*)&cryptonight_hash_suw;
-  return true;
-};
-
--- a/algo/cryptonight/cryptonight.c
+++ b/algo/cryptonight/cryptonight.c
@@ -1,310 +0,0 @@
-// Copyright (c) 2012-2013 The Cryptonote developers
-// Distributed under the MIT/X11 software license, see the accompanying
-// file COPYING or http://www.opensource.org/licenses/mit-license.php.
-
-// Modified for CPUminer by Lucas Jones
-
-#include "miner.h"
-#include <memory.h>
-
-#if defined(__arm__) || defined(_MSC_VER)
-#ifndef NOASM
-#define NOASM
-#endif
-#endif
-
-#include "crypto/oaes_lib.h"
-#include "crypto/c_keccak.h"
-#include "crypto/c_groestl.h"
-#include "crypto/c_blake256.h"
-#include "crypto/c_jh.h"
-#include "crypto/c_skein.h"
-#include "crypto/int-util.h"
-//#include "crypto/hash-ops.h"
-#include "cryptonight.h"
-
-#if USE_INT128
-
-#if __GNUC__ == 4 && __GNUC_MINOR__ >= 4 && __GNUC_MINOR__ < 6
-typedef unsigned int uint128_t __attribute__ ((__mode__ (TI)));
-#elif defined (_MSC_VER)
-/* only for mingw64 on windows */
-#undef  USE_INT128
-#define USE_INT128 (0)
-#else
-typedef __uint128_t uint128_t;
-#endif
-
-#endif
-
-#define LITE 0
-#if LITE /* cryptonight-light */
-#define MEMORY (1 << 20)
-#define ITER   (1 << 19)
-#else
-#define MEMORY (1 << 21) /* 2 MiB */
-#define ITER   (1 << 20)
-#endif
-
-#define AES_BLOCK_SIZE  16
-#define AES_KEY_SIZE    32 /*16*/
-#define INIT_SIZE_BLK   8
-#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE)
-
-/*
-#pragma pack(push, 1)
-union cn_slow_hash_state {
-	union hash_state hs;
-	struct {
-		uint8_t k[64];
-		uint8_t init[INIT_SIZE_BYTE];
-	};
-};
-#pragma pack(pop)
-
-static void do_blake_hash(const void* input, size_t len, char* output) {
-	blake256_hash((uint8_t*)output, input, len);
-}
-
-static void do_groestl_hash(const void* input, size_t len, char* output) {
-	groestl(input, len * 8, (uint8_t*)output);
-}
-
-static void do_jh_hash(const void* input, size_t len, char* output) {
-	int r = jh_hash(HASH_SIZE * 8, input, 8 * len, (uint8_t*)output);
-	assert(likely(SUCCESS == r));
-}
-
-static void do_skein_hash(const void* input, size_t len, char* output) {
-	int r = skein_hash(8 * HASH_SIZE, input, 8 * len, (uint8_t*)output);
-	assert(likely(SKEIN_SUCCESS == r));
-}
-*/
-
-extern int aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey);
-extern int aesb_pseudo_round_mut(uint8_t *val, uint8_t *expandedKey);
-#if !defined(_MSC_VER) && !defined(NOASM)
-extern int fast_aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey);
-extern int fast_aesb_pseudo_round_mut(uint8_t *val, uint8_t *expandedKey);
-#else
-#define fast_aesb_single_round     aesb_single_round
-#define fast_aesb_pseudo_round_mut aesb_pseudo_round_mut
-#endif
-
-
-#if defined(NOASM) || !defined(__x86_64__)
-static uint64_t mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi) {
-	// multiplier   = ab = a * 2^32 + b
-	// multiplicand = cd = c * 2^32 + d
-	// ab * cd = a * c * 2^64 + (a * d + b * c) * 2^32 + b * d
-	uint64_t a = hi_dword(multiplier);
-	uint64_t b = lo_dword(multiplier);
-	uint64_t c = hi_dword(multiplicand);
-	uint64_t d = lo_dword(multiplicand);
-
-	uint64_t ac = a * c;
-	uint64_t ad = a * d;
-	uint64_t bc = b * c;
-	uint64_t bd = b * d;
-
-	uint64_t adbc = ad + bc;
-	uint64_t adbc_carry = adbc < ad ? 1 : 0;
-
-	// multiplier * multiplicand = product_hi * 2^64 + product_lo
-	uint64_t product_lo = bd + (adbc << 32);
-	uint64_t product_lo_carry = product_lo < bd ? 1 : 0;
-	*product_hi = ac + (adbc >> 32) + (adbc_carry << 32) + product_lo_carry;
-	assert(ac <= *product_hi);
-
-	return product_lo;
-}
-#else
-extern uint64_t mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi);
-#endif
-
-/*
-static void (* const extra_hashes[4])(const void *, size_t, char *) = {
-		do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash
-};
-*/
-
-static inline size_t e2i(const uint8_t* a) {
-#if !LITE
-	return ((uint32_t *)a)[0] & 0x1FFFF0;
-#else
-	return ((uint32_t *)a)[0] & 0xFFFF0;
-#endif
-}
-
-static inline void mul_sum_xor_dst( const uint8_t* a, uint8_t* c, uint8_t* dst, 
-         const uint64_t tweak )
-{
-	uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1];
-	hi += ((uint64_t*) c)[0];
-
-	((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi;
-	((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo;
-	((uint64_t*) dst)[0] = hi;
-	((uint64_t*) dst)[1] = cryptonightV7 ? lo ^ tweak : lo;
-}
-
-static inline void xor_blocks(uint8_t* a, const uint8_t* b) {
-#if USE_INT128
-	*((uint128_t*) a) ^= *((uint128_t*) b);
-#else
-	((uint64_t*) a)[0] ^= ((uint64_t*) b)[0];
-	((uint64_t*) a)[1] ^= ((uint64_t*) b)[1];
-#endif
-}
-
-static inline void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) {
-#if USE_INT128
-	*((uint128_t*) dst) = *((uint128_t*) a) ^ *((uint128_t*) b);
-#else
-	((uint64_t*) dst)[0] = ((uint64_t*) a)[0] ^ ((uint64_t*) b)[0];
-	((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1];
-#endif
-}
-
-typedef struct {
-	uint8_t _ALIGN(16) long_state[MEMORY];
-	union cn_slow_hash_state state;
-	uint8_t _ALIGN(16) text[INIT_SIZE_BYTE];
-	uint8_t _ALIGN(16) a[AES_BLOCK_SIZE];
-	uint8_t _ALIGN(16) b[AES_BLOCK_SIZE];
-	uint8_t _ALIGN(16) c[AES_BLOCK_SIZE];
-	oaes_ctx* aes_ctx;
-} cryptonight_ctx;
-
-static __thread cryptonight_ctx ctx;
-
-void cryptonight_hash_ctx(void* output, const void* input, int len)
-{
-//    hash_process(&ctx.state.hs, (const uint8_t*) input, len);
-    keccak( (const uint8_t*)input, 76, (char*)&ctx.state.hs.b, 200 );
-
-    if ( cryptonightV7 && len < 43 )
-      return;
-    const uint64_t tweak = cryptonightV7
-                         ? *((const uint64_t*) (((const uint8_t*)input) + 35))
-                           ^ ctx.state.hs.w[24] : 0;
-
-    ctx.aes_ctx = (oaes_ctx*) oaes_alloc();
-
-    __builtin_prefetch( ctx.text,             0, 3 );
-    __builtin_prefetch( ctx.text       +  64, 0, 3 );
-    __builtin_prefetch( ctx.long_state,       1, 0 );
-    __builtin_prefetch( ctx.long_state +  64, 1, 0 );
-    __builtin_prefetch( ctx.long_state + 128, 1, 0 );
-    __builtin_prefetch( ctx.long_state + 192, 1, 0 );
-    __builtin_prefetch( ctx.long_state + 256, 1, 0 );
-    __builtin_prefetch( ctx.long_state + 320, 1, 0 );
-    __builtin_prefetch( ctx.long_state + 384, 1, 0 );
-    __builtin_prefetch( ctx.long_state + 448, 1, 0 );
-
-	size_t i, j;
-	memcpy(ctx.text, ctx.state.init, INIT_SIZE_BYTE);
-
-	oaes_key_import_data(ctx.aes_ctx, ctx.state.hs.b, AES_KEY_SIZE);
-	for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
-
-    __builtin_prefetch( ctx.long_state + i + 512, 1, 0 );
-    __builtin_prefetch( ctx.long_state + i + 576, 1, 0 );
-
-		aesb_pseudo_round_mut(&ctx.text[AES_BLOCK_SIZE * 0], ctx.aes_ctx->key->exp_data);
-		aesb_pseudo_round_mut(&ctx.text[AES_BLOCK_SIZE * 1], ctx.aes_ctx->key->exp_data);
-		aesb_pseudo_round_mut(&ctx.text[AES_BLOCK_SIZE * 2], ctx.aes_ctx->key->exp_data);
-		aesb_pseudo_round_mut(&ctx.text[AES_BLOCK_SIZE * 3], ctx.aes_ctx->key->exp_data);
-		aesb_pseudo_round_mut(&ctx.text[AES_BLOCK_SIZE * 4], ctx.aes_ctx->key->exp_data);
-		aesb_pseudo_round_mut(&ctx.text[AES_BLOCK_SIZE * 5], ctx.aes_ctx->key->exp_data);
-		aesb_pseudo_round_mut(&ctx.text[AES_BLOCK_SIZE * 6], ctx.aes_ctx->key->exp_data);
-		aesb_pseudo_round_mut(&ctx.text[AES_BLOCK_SIZE * 7], ctx.aes_ctx->key->exp_data);
-		memcpy(&ctx.long_state[i], ctx.text, INIT_SIZE_BYTE);
-	}
-
-	xor_blocks_dst(&ctx.state.k[0], &ctx.state.k[32], ctx.a);
-	xor_blocks_dst(&ctx.state.k[16], &ctx.state.k[48], ctx.b);
-
-	for (i = 0; likely(i < ITER / 4); ++i)
-        {
-           /* Dependency chain: address -> read value ------+
-            * written value <-+ hard function (AES or MUL) <+
-            * next address  <-+
-            */
-           /* Iteration 1 */
-           j = e2i(ctx.a);
-           aesb_single_round(&ctx.long_state[j], ctx.c, ctx.a);
-           xor_blocks_dst(ctx.c, ctx.b, &ctx.long_state[j]);
-
-           if ( cryptonightV7 )
-           {
-              uint8_t *lsa = (uint8_t*)&ctx.long_state[((uint64_t *)(ctx.a))[0] & 0x1FFFF0];
-              const uint8_t tmp = lsa[11];
-              const uint8_t index = ( ( (tmp >> 3) & 6 ) | (tmp & 1) ) << 1;
-              lsa[11] = tmp ^ ( ( 0x75310 >> index) & 0x30 );
-           }
-
-           /* Iteration 2 */
-           mul_sum_xor_dst(ctx.c, ctx.a, &ctx.long_state[e2i(ctx.c)], tweak );
-
-           /* Iteration 3 */
-           j = e2i(ctx.a);
-           aesb_single_round(&ctx.long_state[j], ctx.b, ctx.a);
-           xor_blocks_dst(ctx.b, ctx.c, &ctx.long_state[j]);
-
-           if ( cryptonightV7 )
-           {
-              uint8_t *lsa = (uint8_t*)&ctx.long_state[((uint64_t *)(ctx.a))[0] & 0x1FFFF0];
-              const uint8_t tmp = lsa[11];
-              const uint8_t index = ( ( (tmp >> 3) & 6 ) | (tmp & 1) ) << 1;
-              lsa[11] = tmp ^ ( ( 0x75310 >> index) & 0x30 );
-           }
-
-           /* Iteration 4 */
-           mul_sum_xor_dst(ctx.b, ctx.a, &ctx.long_state[e2i(ctx.b)], tweak );
-
-	}
-
-    __builtin_prefetch( ctx.text,             0, 3 );
-    __builtin_prefetch( ctx.text       +  64, 0, 3 );
-    __builtin_prefetch( ctx.long_state,       1, 0 );
-    __builtin_prefetch( ctx.long_state +  64, 1, 0 );
-    __builtin_prefetch( ctx.long_state + 128, 1, 0 );
-    __builtin_prefetch( ctx.long_state + 192, 1, 0 );
-    __builtin_prefetch( ctx.long_state + 256, 1, 0 );
-    __builtin_prefetch( ctx.long_state + 320, 1, 0 );
-    __builtin_prefetch( ctx.long_state + 384, 1, 0 );
-    __builtin_prefetch( ctx.long_state + 448, 1, 0 );
-
-	memcpy(ctx.text, ctx.state.init, INIT_SIZE_BYTE);
-	oaes_key_import_data(ctx.aes_ctx, &ctx.state.hs.b[32], AES_KEY_SIZE);
-	for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
-
-    __builtin_prefetch( ctx.long_state + i + 512, 1, 0 );
-    __builtin_prefetch( ctx.long_state + i + 576, 1, 0 );
-
-		xor_blocks(&ctx.text[0 * AES_BLOCK_SIZE], &ctx.long_state[i + 0 * AES_BLOCK_SIZE]);
-		aesb_pseudo_round_mut(&ctx.text[0 * AES_BLOCK_SIZE], ctx.aes_ctx->key->exp_data);
-		xor_blocks(&ctx.text[1 * AES_BLOCK_SIZE], &ctx.long_state[i + 1 * AES_BLOCK_SIZE]);
-		aesb_pseudo_round_mut(&ctx.text[1 * AES_BLOCK_SIZE], ctx.aes_ctx->key->exp_data);
-		xor_blocks(&ctx.text[2 * AES_BLOCK_SIZE], &ctx.long_state[i + 2 * AES_BLOCK_SIZE]);
-		aesb_pseudo_round_mut(&ctx.text[2 * AES_BLOCK_SIZE], ctx.aes_ctx->key->exp_data);
-		xor_blocks(&ctx.text[3 * AES_BLOCK_SIZE], &ctx.long_state[i + 3 * AES_BLOCK_SIZE]);
-		aesb_pseudo_round_mut(&ctx.text[3 * AES_BLOCK_SIZE], ctx.aes_ctx->key->exp_data);
-		xor_blocks(&ctx.text[4 * AES_BLOCK_SIZE], &ctx.long_state[i + 4 * AES_BLOCK_SIZE]);
-		aesb_pseudo_round_mut(&ctx.text[4 * AES_BLOCK_SIZE], ctx.aes_ctx->key->exp_data);
-		xor_blocks(&ctx.text[5 * AES_BLOCK_SIZE], &ctx.long_state[i + 5 * AES_BLOCK_SIZE]);
-		aesb_pseudo_round_mut(&ctx.text[5 * AES_BLOCK_SIZE], ctx.aes_ctx->key->exp_data);
-		xor_blocks(&ctx.text[6 * AES_BLOCK_SIZE], &ctx.long_state[i + 6 * AES_BLOCK_SIZE]);
-		aesb_pseudo_round_mut(&ctx.text[6 * AES_BLOCK_SIZE], ctx.aes_ctx->key->exp_data);
-		xor_blocks(&ctx.text[7 * AES_BLOCK_SIZE], &ctx.long_state[i + 7 * AES_BLOCK_SIZE]);
-		aesb_pseudo_round_mut(&ctx.text[7 * AES_BLOCK_SIZE], ctx.aes_ctx->key->exp_data);
-	}
-	memcpy(ctx.state.init, ctx.text, INIT_SIZE_BYTE);
-//	hash_permutation(&ctx.state.hs);
-        keccakf( (uint64_t*)&ctx.state.hs.w, 24 );
-	/*memcpy(hash, &state, 32);*/
-	extra_hashes[ctx.state.hs.b[0] & 3](&ctx.state, 200, output);
-	oaes_free((OAES_CTX **) &ctx.aes_ctx);
-}
-
--- a/algo/cryptonight/cryptonight.h
+++ b/algo/cryptonight/cryptonight.h
@@ -1,51 +0,0 @@
-#ifndef __CRYPTONIGHT_H_INCLUDED
-#define __CRYPTONIGHT_H_INCLUDED
-
-#include <stddef.h>
-#include "crypto/oaes_lib.h"
-#include "miner.h"
-
-#define MEMORY         (1 << 21) /* 2 MiB */
-#define MEMORY_M128I   (MEMORY >> 4) // 2 MiB / 16 = 128 ki * __m128i
-#define ITER           (1 << 20)
-#define AES_BLOCK_SIZE  16
-#define AES_KEY_SIZE    32 /*16*/
-#define INIT_SIZE_BLK   8
-#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE)	// 128
-#define INIT_SIZE_M128I (INIT_SIZE_BYTE >> 4) // 8
-
-
-#pragma pack(push, 1)
-union hash_state {
-  uint8_t b[200];
-  uint64_t w[25];
-};
-#pragma pack(pop)
-
-#pragma pack(push, 1)
-union cn_slow_hash_state {
-    union hash_state hs;
-    struct {
-        uint8_t k[64];
-        uint8_t init[INIT_SIZE_BYTE];
-    };
-};
-#pragma pack(pop)
-
-void do_blake_hash(const void* input, size_t len, char* output);
-void do_groestl_hash(const void* input, size_t len, char* output);
-void do_jh_hash(const void* input, size_t len, char* output);
-void do_skein_hash(const void* input, size_t len, char* output);
-void cryptonight_hash_ctx(void* output, const void* input, int len);
-void keccakf(uint64_t st[25], int rounds);
-extern void (* const extra_hashes[4])(const void *, size_t, char *);
-
-int scanhash_cryptonight( struct work *work, uint32_t max_nonce,
-                           uint64_t *hashes_done, struct thr_info *mythr );
-
-void cryptonight_hash_aes( void *restrict output, const void *input, int len );
-
-extern bool cryptonightV7;
-
-#endif
-
--- a/algo/cubehash/cube-hash-2way.c
+++ b/algo/cubehash/cube-hash-2way.c
@@ -168,6 +168,58 @@ int cube_4way_close( cube_4way_context *sp, void *output )
    return 0;
 }

+int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen,
+                    const void *data, size_t size )
+{
+    // One-shot hash: set up the context for hashbitlen, absorb size bytes of
+    // data, pad, finalize and copy the digest words to output. Returns 0.
+    __m512i *state = (__m512i*)sp->h;
+    const __m512i *src = (const __m512i*)data;
+    __m512i *dst = (__m512i*)output;
+    const int nwords = size >> 4;   // one input vector per 16 bytes of size
+    __m128i *ivec = (__m128i*)IV256;
+    int k;
+
+    if ( hashbitlen == 512 )
+        ivec = (__m128i*)IV512;
+
+    sp->pos       = 0;
+    sp->rounds    = 16;
+    sp->blocksize = 32/16;
+    sp->hashlen   = hashbitlen/128;
+
+    // Seed all 8 state vectors from the selected IV table.
+    for ( k = 0; k < 8; k++ )
+        state[k] = m512_const1_128( ivec[k] );
+
+    // Absorb: xor input into the state, transforming every blocksize vectors.
+    for ( k = 0; k < nwords; k++ )
+    {
+        sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ], src[k] );
+        if ( ++sp->pos == sp->blocksize )
+        {
+           transform_4way( sp );
+           sp->pos = 0;
+        }
+    }
+
+    // Padding at the current position: pos is zero for 64 byte data,
+    // 1 for 80 byte data.
+    sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ],
+                                    m512_const2_64( 0, 0x0000000000000080 ) );
+    transform_4way( sp );
+
+    // Finalize: xor the constant into h[7], then run 10 more transforms.
+    sp->h[7] = _mm512_xor_si512( sp->h[7],
+                                    m512_const2_64( 0x0000000100000000, 0 ) );
+    for ( k = 0; k < 10; k++ )
+        transform_4way( sp );
+
+    memcpy( dst, sp->h, sp->hashlen << 6 );   // 64 bytes per hashlen unit
+    return 0;
+}
+
+
 int cube_4way_update_close( cube_4way_context *sp, void *output,
                               const void *data, size_t size )
 {
@@ -376,4 +428,54 @@ int cube_2way_update_close( cube_2way_context *sp, void *output,
    return 0;
 }

+int cube_2way_full( cube_2way_context *sp, void *output, int hashbitlen,
+                               const void *data, size_t size )
+{   // One-shot hash: reset sp for hashbitlen, absorb data, finalize, copy digest.
+    __m256i *h = (__m256i*)sp->h;
+    __m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512
+                                                : (__m128i*)IV256 );
+    sp->hashlen   = hashbitlen/128;   // digest length in 128 bit units
+    sp->blocksize = 32/16;            // state vectors consumed per transform
+    sp->rounds    = 16;
+    sp->pos       = 0;
+    // Seed the 8 state vectors from IV512 (hashbitlen == 512) or IV256 (otherwise).
+    h[ 0] = m256_const1_128( iv[0] );
+    h[ 1] = m256_const1_128( iv[1] );
+    h[ 2] = m256_const1_128( iv[2] );
+    h[ 3] = m256_const1_128( iv[3] );
+    h[ 4] = m256_const1_128( iv[4] );
+    h[ 5] = m256_const1_128( iv[5] );
+    h[ 6] = m256_const1_128( iv[6] );
+    h[ 7] = m256_const1_128( iv[7] );
+    // NOTE(review): presumably size is the per-lane byte count of interleaved input - confirm.
+    const int len = size >> 4;        // one __m256i of input is read per 16 bytes of size
+    const __m256i *in = (__m256i*)data;
+    __m256i *hash = (__m256i*)output;
+    int i;
+    // Absorb: xor each input vector into h[pos]; transform whenever a block is full.
+    for ( i = 0; i < len; i++ )
+    {
+        sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ], in[i] );
+        sp->pos++;
+        if ( sp->pos == sp->blocksize )
+        {
+           transform_2way( sp );
+           sp->pos = 0;
+        }
+    }
+
+    // pos is zero for 64 byte data, 1 for 80 byte data.
+    sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
+                                    m256_const2_64( 0, 0x0000000000000080 ) );
+    transform_2way( sp );
+    // Finalization: xor a constant into h[7], then run 10 more transforms.
+    sp->h[7] = _mm256_xor_si256( sp->h[7],
+                                    m256_const2_64( 0x0000000100000000, 0 ) );
+
+    for ( i = 0; i < 10; ++i )    transform_2way( sp );
+    // Copy hashlen * 32 bytes of state to output.
+    memcpy( hash, sp->h, sp->hashlen<<5 );
+    return 0;
+}
+
 #endif
--- a/algo/cubehash/cube-hash-2way.h
+++ b/algo/cubehash/cube-hash-2way.h
@@ -21,15 +21,33 @@ typedef struct _cube_4way_context cube_4way_context;

 int cube_4way_init( cube_4way_context* sp, int hashbitlen, int rounds,
                       int blockbytes );
-// reinitialize context with same parameters, much faster.
-int cube_4way_reinit( cube_4way_context *sp );
-
 int cube_4way_update( cube_4way_context *sp, const void *data, size_t size );
-
 int cube_4way_close( cube_4way_context *sp, void *output );
-
 int cube_4way_update_close( cube_4way_context *sp, void *output,
                            const void *data, size_t size );
+int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen,
+                    const void *data, size_t size );
+
+int cube_4x256_full( cube_4way_context *sp, void *output, int hashbitlen,
+                     const void *data, size_t size );
+
+#define cube512_4way_init( sp ) cube_4way_init( sp, 512, 16, 32 ) // 16 rounds, 32 byte blocks
+#define cube512_4way_update cube_4way_update
+#define cube512_4way_update_close cube_4way_update_close
+#define cube512_4way_close cube_4way_close
+#define cube512_4way_full( sp, output, data, size ) \
+           cube_4way_full( sp, output, 512, data, size )
+#define cube512_4x256_full( sp, output, data, size ) \
+           cube_4x256_full( sp, output, 512, data, size )
+
+#define cube256_4way_init( sp ) cube_4way_init( sp, 256, 16, 32 ) // 16 rounds, 32 byte blocks
+#define cube256_4way_update cube_4way_update
+#define cube256_4way_update_close cube_4way_update_close
+#define cube256_4way_close cube_4way_close
+#define cube256_4way_full( sp, output, data, size ) \
+           cube_4way_full( sp, output, 256, data, size )
+#define cube256_4x256_full( sp, output, data, size ) \
+           cube_4x256_full( sp, output, 256, data, size )

 #endif

@@ -48,15 +66,12 @@ typedef struct _cube_2way_context cube_2way_context;

 int cube_2way_init( cube_2way_context* sp, int hashbitlen, int rounds,
                       int blockbytes );
-// reinitialize context with same parameters, much faster.
-int cube_2way_reinit( cube_2way_context *sp );
-
 int cube_2way_update( cube_2way_context *sp, const void *data, size_t size );
-
 int cube_2way_close( cube_2way_context *sp, void *output );
-
 int cube_2way_update_close( cube_2way_context *sp, void *output,
                            const void *data, size_t size );
+int cube_2way_full( cube_2way_context *sp, void *output, int hashbitlen,
+                    const void *data, size_t size );


 #endif
--- a/algo/cubehash/cubehash_sse2.c
+++ b/algo/cubehash/cubehash_sse2.c
@@ -230,11 +230,10 @@ int cubehashDigest( cubehashParam *sp, byte *digest )

    // pos is zero for 64 byte data, 1 for 80 byte data.
    sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
-                                      _mm_set_epi8( 0,0,0,0, 0,0,0,0,
-                                                    0,0,0,0, 0,0,0,0x80 ) );
+                                      m128_const_64( 0, 0x80 ) );
    transform( sp );

-    sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi32( 1,0,0,0 ) );
+    sp->x[7] = _mm_xor_si128( sp->x[7], m128_const_64( 0x100000000, 0 ) );
    transform( sp );
    transform( sp );
    transform( sp );
@@ -276,11 +275,89 @@ int cubehashUpdateDigest( cubehashParam *sp, byte *digest,

    // pos is zero for 64 byte data, 1 for 80 byte data.
    sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
-                                      _mm_set_epi8( 0,0,0,0, 0,0,0,0,
-                                                    0,0,0,0, 0,0,0,0x80 ) );
+                                      m128_const_64( 0, 0x80 ) );
    transform( sp );

-    sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi32( 1,0,0,0 ) );
+    sp->x[7] = _mm_xor_si128( sp->x[7], m128_const_64( 0x100000000, 0 ) );
+
+    transform( sp );
+    transform( sp );
+    transform( sp );
+    transform( sp );
+    transform( sp );
+    transform( sp );
+    transform( sp );
+    transform( sp );
+    transform( sp );
+    transform( sp );
+
+    for ( i = 0; i < sp->hashlen; i++ )
+       hash[i] = sp->x[i];
+
+    return SUCCESS;
+}
+
+int cubehash_full( cubehashParam *sp, byte *digest, int hashbitlen,
+                          const byte *data, size_t size )
+{
+    __m128i *x = (__m128i*)sp->x;
+    sp->hashlen   = hashbitlen/128;
+    sp->blocksize = 32/16;
+    sp->rounds    = 16;
+    sp->pos       = 0;
+
+    if ( hashbitlen == 512 )
+    {
+
+       x[0] = m128_const_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
+       x[1] = m128_const_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
+       x[2] = m128_const_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
+       x[3] = m128_const_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
+       x[4] = m128_const_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
+       x[5] = m128_const_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
+       x[6] = m128_const_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
+       x[7] = m128_const_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
+    }
+    else
+    {
+       x[0] = m128_const_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
+       x[1] = m128_const_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
+       x[2] = m128_const_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
+       x[3] = m128_const_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
+       x[4] = m128_const_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
+       x[5] = m128_const_64( 0x93CB628565C892FD, 0x5FA2560309392549 );
+       x[6] = m128_const_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
+       x[7] = m128_const_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
+    }
+
+
+
+
+    const int len = size / 16;
+    const __m128i* in = (__m128i*)data;
+    __m128i* hash = (__m128i*)digest;
+    int i;
+
+    // It is assumed data is aligned to 256 bits and is a multiple of 128 bits.
+    // Current usage data is either 64 or 80 bytes.
+
+    for ( i = 0; i < len; i++ )
+    {
+        sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ], in[i] );
+        sp->pos++;
+        if ( sp->pos == sp->blocksize )
+        {
+           transform( sp );
+           sp->pos = 0;
+        }
+    }
+
+    // pos is zero for 64 byte data, 1 for 80 byte data.
+    sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
+                                      m128_const_64( 0, 0x80 ) );
+    transform( sp );
+
+    sp->x[7] = _mm_xor_si128( sp->x[7], m128_const_64( 0x100000000, 0 ) );

    transform( sp );
    transform( sp );
--- a/algo/cubehash/cubehash_sse2.h
+++ b/algo/cubehash/cubehash_sse2.h
@@ -19,7 +19,7 @@ struct _cubehashParam
    int rounds;
    int blocksize;         // __m128i
    int pos;	           // number of __m128i read into x from current block
-    __m128i _ALIGN(256) x[8];  // aligned for __m256i
+    __m128i _ALIGN(64) x[8];  // aligned for __m256i
 };

 typedef struct _cubehashParam cubehashParam;
@@ -39,6 +39,9 @@ int cubehashDigest(cubehashParam* sp, byte *digest);
 int cubehashUpdateDigest( cubehashParam *sp, byte *digest, const byte *data,
                          size_t size );

+int cubehash_full( cubehashParam* sp, byte *digest, int hashbitlen,
+                   const byte *data, size_t size );
+
 #ifdef __cplusplus
 }
 #endif
--- a/algo/echo/aes_ni/hash.c
+++ b/algo/echo/aes_ni/hash.c
@@ -7,7 +7,6 @@
 * - implements NIST hash api
 * - assumes that message lenght is multiple of 8-bits
 * - _ECHO_VPERM_ must be defined if compiling with ../main.c
- * -  define NO_AES_NI for aes_ni version
 *
 * Cagdas Calik
 * ccalik@metu.edu.tr
@@ -21,13 +20,7 @@
 #include "hash_api.h"
 //#include "vperm.h"
 #include <immintrin.h>
-/*
-#ifndef NO_AES_NI
-#include <wmmintrin.h>
-#else
-#include <tmmintrin.h>
-#endif
-*/
+#include "simd-utils.h"

 MYALIGN const unsigned int _k_s0F[] = {0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F};
 MYALIGN const unsigned int _k_ipt[] = {0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090, 0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC};
@@ -525,6 +518,165 @@ HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
   return SUCCESS;
 }

+HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
+            int nHashSize, const BitSequence *data, DataLength datalen )
+{
+   int i, j;
+   // One-shot ECHO: reset state for nHashSize, absorb datalen bytes, pad, finalize.
+   state->k = m128_zero;
+   state->processed_bits = 0;
+   state->uBufferBytes = 0;
+
+   switch( nHashSize )
+   {
+      case 256:
+         state->uHashSize = 256;
+         state->uBlockLength = 192;
+         state->uRounds = 8;
+         state->hashsize = m128_const_64( 0, 0x100 );
+         state->const1536 = m128_const_64( 0, 0x600 );
+         break;
+
+      case 512:
+         state->uHashSize = 512;
+         state->uBlockLength = 128;
+         state->uRounds = 10;
+         state->hashsize = m128_const_64( 0, 0x200 );
+         state->const1536 = m128_const_64( 0, 0x400 );
+         break;
+
+      default:
+         return BAD_HASHBITLEN;   // only 256 and 512 are accepted
+   }
+   // state[i][j] gets hashsize for j < nHashSize/256; the remaining entries are zeroed.
+   for(i = 0; i < 4; i++)
+      for(j = 0; j < nHashSize / 256; j++)
+         state->state[i][j] = state->hashsize;
+
+   for(i = 0; i < 4; i++)
+      for(j = nHashSize / 256; j < 4; j++)
+         state->state[i][j] = m128_zero;
+
+   // Absorb. datalen is used as a byte count here (it is compared against uBlockLength).
+   unsigned int uBlockCount, uRemainingBytes;
+
+   if( (state->uBufferBytes + datalen) >= state->uBlockLength )
+   {
+        if( state->uBufferBytes != 0 )   // never true here: uBufferBytes was reset to 0 above
+        {
+           // Fill the buffer
+           memcpy( state->buffer + state->uBufferBytes,
+                   (void*)data, state->uBlockLength - state->uBufferBytes );
+
+           // Process buffer
+           Compress( state, state->buffer, 1 );
+           state->processed_bits += state->uBlockLength * 8;
+
+           data += state->uBlockLength - state->uBufferBytes;
+           datalen -= state->uBlockLength - state->uBufferBytes;
+        }
+
+        // buffer now does not contain any unprocessed bytes
+
+        uBlockCount = datalen / state->uBlockLength;
+        uRemainingBytes = datalen % state->uBlockLength;
+
+        if( uBlockCount > 0 )
+        {
+           Compress( state, data, uBlockCount );
+           state->processed_bits += uBlockCount * state->uBlockLength * 8;
+           data += uBlockCount * state->uBlockLength;
+        }
+
+        if( uRemainingBytes > 0 )
+        memcpy(state->buffer, (void*)data, uRemainingBytes);
+
+        state->uBufferBytes = uRemainingBytes;
+   }
+   else
+   {
+        memcpy( state->buffer + state->uBufferBytes, (void*)data, datalen );
+        state->uBufferBytes += datalen;
+   }
+   // Finalize: append 0x80, zero fill, then an 18 byte trailer (hash size + bit count).
+   __m128i remainingbits;
+
+   // Add remaining bytes in the buffer
+   state->processed_bits += state->uBufferBytes * 8;
+
+   remainingbits = _mm_set_epi32( 0, 0, 0, state->uBufferBytes * 8 );
+
+   // Pad with 0x80
+   state->buffer[state->uBufferBytes++] = 0x80;
+   // Enough buffer space for padding in this block?
+   if( (state->uBlockLength - state->uBufferBytes) >= 18 )
+   {
+        // Pad with zeros
+        memset( state->buffer + state->uBufferBytes, 0, state->uBlockLength - (state->uBufferBytes + 18) );
+
+        // Hash size
+        *( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) = state->uHashSize;
+
+        // Processed bits
+        *( (DataLength*)(state->buffer + state->uBlockLength - 16) ) =
+                   state->processed_bits;
+        *( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0;
+
+        // Last block contains message bits?
+        if( state->uBufferBytes == 1 )   // only the 0x80 pad byte is buffered: no message bits
+        {
+           state->k = _mm_xor_si128( state->k, state->k );
+           state->k = _mm_sub_epi64( state->k, state->const1536 );
+        }
+        else
+        {
+           state->k = _mm_add_epi64( state->k, remainingbits );
+           state->k = _mm_sub_epi64( state->k, state->const1536 );
+        }
+
+        // Compress
+        Compress( state, state->buffer, 1 );
+   }
+   else
+   {
+        // Fill with zero and compress
+        memset( state->buffer + state->uBufferBytes, 0,
+                state->uBlockLength - state->uBufferBytes );
+        state->k = _mm_add_epi64( state->k, remainingbits );
+        state->k = _mm_sub_epi64( state->k, state->const1536 );
+        Compress( state, state->buffer, 1 );
+
+        // Last block
+        memset( state->buffer, 0, state->uBlockLength - 18 );
+
+        // Hash size
+        *( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) =
+                 state->uHashSize;
+
+        // Processed bits
+        *( (DataLength*)(state->buffer + state->uBlockLength - 16) ) =
+                   state->processed_bits;
+        *( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0;
+        // Compress the last block
+        state->k = _mm_xor_si128( state->k, state->k );
+        state->k = _mm_sub_epi64( state->k, state->const1536 );
+        Compress( state, state->buffer, 1) ;
+   }
+
+   // Store the hash value (aligned stores: hashval must be 16 byte aligned)
+   _mm_store_si128( (__m128i*)hashval + 0, state->state[0][0] );
+   _mm_store_si128( (__m128i*)hashval + 1, state->state[1][0] );
+
+   if( state->uHashSize == 512 )
+   {
+        _mm_store_si128( (__m128i*)hashval + 2, state->state[2][0] );
+        _mm_store_si128( (__m128i*)hashval + 3, state->state[3][0] );
+
+   }
+   return SUCCESS;
+}
+
+

 HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval)
 {
--- a/algo/echo/aes_ni/hash.c.test
+++ b/algo/echo/aes_ni/hash.c.test
@@ -1,620 +0,0 @@
-/*
- * file        : echo_vperm.c
- * version     : 1.0.208
- * date        : 14.12.2010
- * 
- * - vperm and aes_ni implementations of hash function ECHO
- * - implements NIST hash api
- * - assumes that message lenght is multiple of 8-bits
- * - _ECHO_VPERM_ must be defined if compiling with ../main.c
- * -  define NO_AES_NI for aes_ni version
- *
- * Cagdas Calik
- * ccalik@metu.edu.tr
- * Institute of Applied Mathematics, Middle East Technical University, Turkey.
- *
- */
-#if defined(__AES__)
-
-#include <memory.h>
-#include "miner.h"
-#include "hash_api.h"
-//#include "vperm.h"
-#include <immintrin.h>
-/*
-#ifndef NO_AES_NI
-#include <wmmintrin.h>
-#else
-#include <tmmintrin.h>
-#endif
-*/
-
-MYALIGN const unsigned int _k_s0F[] = {0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F};
-MYALIGN const unsigned int _k_ipt[] = {0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090, 0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC};
-MYALIGN const unsigned int _k_opt[] = {0xD6B66000, 0xFF9F4929, 0xDEBE6808, 0xF7974121, 0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1};
-MYALIGN const unsigned int _k_inv[] = {0x0D080180, 0x0E05060F, 0x0A0B0C02, 0x04070309, 0x0F0B0780, 0x01040A06, 0x02050809, 0x030D0E0C};
-MYALIGN const unsigned int _k_sb1[] = {0xCB503E00, 0xB19BE18F, 0x142AF544, 0xA5DF7A6E, 0xFAE22300, 0x3618D415, 0x0D2ED9EF, 0x3BF7CCC1};
-MYALIGN const unsigned int _k_sb2[] = {0x0B712400, 0xE27A93C6, 0xBC982FCD, 0x5EB7E955, 0x0AE12900, 0x69EB8840, 0xAB82234A, 0xC2A163C8};
-MYALIGN const unsigned int _k_sb3[] = {0xC0211A00, 0x53E17249, 0xA8B2DA89, 0xFB68933B, 0xF0030A00, 0x5FF35C55, 0xA6ACFAA5, 0xF956AF09};
-MYALIGN const unsigned int _k_sb4[] = {0x3FD64100, 0xE1E937A0, 0x49087E9F, 0xA876DE97, 0xC393EA00, 0x3D50AED7, 0x876D2914, 0xBA44FE79};
-MYALIGN const unsigned int _k_sb5[] = {0xF4867F00, 0x5072D62F, 0x5D228BDB, 0x0DA9A4F9, 0x3971C900, 0x0B487AC2, 0x8A43F0FB, 0x81B332B8};
-MYALIGN const unsigned int _k_sb7[] = {0xFFF75B00, 0xB20845E9, 0xE1BAA416, 0x531E4DAC, 0x3390E000, 0x62A3F282, 0x21C1D3B1, 0x43125170};
-MYALIGN const unsigned int _k_sbo[] = {0x6FBDC700, 0xD0D26D17, 0xC502A878, 0x15AABF7A, 0x5FBB6A00, 0xCFE474A5, 0x412B35FA, 0x8E1E90D1};
-MYALIGN const unsigned int _k_h63[] = {0x63636363, 0x63636363, 0x63636363, 0x63636363};
-MYALIGN const unsigned int _k_hc6[] = {0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6};
-MYALIGN const unsigned int _k_h5b[] = {0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b};
-MYALIGN const unsigned int _k_h4e[] = {0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e};
-MYALIGN const unsigned int _k_h0e[] = {0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e};
-MYALIGN const unsigned int _k_h15[] = {0x15151515, 0x15151515, 0x15151515, 0x15151515};
-MYALIGN const unsigned int _k_aesmix1[] = {0x0f0a0500, 0x030e0904, 0x07020d08, 0x0b06010c};
-MYALIGN const unsigned int _k_aesmix2[] = {0x000f0a05, 0x04030e09, 0x0807020d, 0x0c0b0601};
-MYALIGN const unsigned int _k_aesmix3[] = {0x05000f0a, 0x0904030e, 0x0d080702, 0x010c0b06};
-MYALIGN const unsigned int _k_aesmix4[] = {0x0a05000f, 0x0e090403, 0x020d0807, 0x06010c0b};
-
-
-MYALIGN const unsigned int 	const1[]		= {0x00000001, 0x00000000, 0x00000000, 0x00000000};
-MYALIGN const unsigned int	mul2mask[]		= {0x00001b00, 0x00000000, 0x00000000, 0x00000000};
-MYALIGN const unsigned int	lsbmask[]		= {0x01010101, 0x01010101, 0x01010101, 0x01010101};
-MYALIGN const unsigned int	invshiftrows[]	= {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c};
-MYALIGN const unsigned int	zero[]			= {0x00000000, 0x00000000, 0x00000000, 0x00000000};
-MYALIGN const unsigned int	mul2ipt[]		= {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57, 0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234};
-
-
-#define ECHO_SUBBYTES(state, i, j) \
-	state[i][j] = _mm_aesenc_si128(state[i][j], k1);\
-	state[i][j] = _mm_aesenc_si128(state[i][j], M128(zero));\
-	k1 = _mm_add_epi32(k1, M128(const1))
-
-#define ECHO_MIXBYTES(state1, state2, j, t1, t2, s2) \
-	s2 = _mm_add_epi8(state1[0][j], state1[0][j]);\
-	t1 = _mm_srli_epi16(state1[0][j], 7);\
-	t1 = _mm_and_si128(t1, M128(lsbmask));\
-	t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
-	s2 = _mm_xor_si128(s2, t2);\
-	state2[0][j] = s2;\
-	state2[1][j] = state1[0][j];\
-	state2[2][j] = state1[0][j];\
-	state2[3][j] = _mm_xor_si128(s2, state1[0][j]);\
-	s2 = _mm_add_epi8(state1[1][(j + 1) & 3], state1[1][(j + 1) & 3]);\
-	t1 = _mm_srli_epi16(state1[1][(j + 1) & 3], 7);\
-	t1 = _mm_and_si128(t1, M128(lsbmask));\
-	t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
-	s2 = _mm_xor_si128(s2, t2);\
-	state2[0][j] = _mm_xor_si128(state2[0][j], _mm_xor_si128(s2, state1[1][(j + 1) & 3]));\
-	state2[1][j] = _mm_xor_si128(state2[1][j], s2);\
-	state2[2][j] = _mm_xor_si128(state2[2][j], state1[1][(j + 1) & 3]);\
-	state2[3][j] = _mm_xor_si128(state2[3][j], state1[1][(j + 1) & 3]);\
-	s2 = _mm_add_epi8(state1[2][(j + 2) & 3], state1[2][(j + 2) & 3]);\
-	t1 = _mm_srli_epi16(state1[2][(j + 2) & 3], 7);\
-	t1 = _mm_and_si128(t1, M128(lsbmask));\
-	t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
-	s2 = _mm_xor_si128(s2, t2);\
-	state2[0][j] = _mm_xor_si128(state2[0][j], state1[2][(j + 2) & 3]);\
-	state2[1][j] = _mm_xor_si128(state2[1][j], _mm_xor_si128(s2, state1[2][(j + 2) & 3]));\
-	state2[2][j] = _mm_xor_si128(state2[2][j], s2);\
-	state2[3][j] = _mm_xor_si128(state2[3][j], state1[2][(j + 2) & 3]);\
-	s2 = _mm_add_epi8(state1[3][(j + 3) & 3], state1[3][(j + 3) & 3]);\
-	t1 = _mm_srli_epi16(state1[3][(j + 3) & 3], 7);\
-	t1 = _mm_and_si128(t1, M128(lsbmask));\
-	t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
-	s2 = _mm_xor_si128(s2, t2);\
-	state2[0][j] = _mm_xor_si128(state2[0][j], state1[3][(j + 3) & 3]);\
-	state2[1][j] = _mm_xor_si128(state2[1][j], state1[3][(j + 3) & 3]);\
-	state2[2][j] = _mm_xor_si128(state2[2][j], _mm_xor_si128(s2, state1[3][(j + 3) & 3]));\
-	state2[3][j] = _mm_xor_si128(state2[3][j], s2)
-
-
-#define ECHO_ROUND_UNROLL2 \
-	ECHO_SUBBYTES(_state, 0, 0);\
-	ECHO_SUBBYTES(_state, 1, 0);\
-	ECHO_SUBBYTES(_state, 2, 0);\
-	ECHO_SUBBYTES(_state, 3, 0);\
-	ECHO_SUBBYTES(_state, 0, 1);\
-	ECHO_SUBBYTES(_state, 1, 1);\
-	ECHO_SUBBYTES(_state, 2, 1);\
-	ECHO_SUBBYTES(_state, 3, 1);\
-	ECHO_SUBBYTES(_state, 0, 2);\
-	ECHO_SUBBYTES(_state, 1, 2);\
-	ECHO_SUBBYTES(_state, 2, 2);\
-	ECHO_SUBBYTES(_state, 3, 2);\
-	ECHO_SUBBYTES(_state, 0, 3);\
-	ECHO_SUBBYTES(_state, 1, 3);\
-	ECHO_SUBBYTES(_state, 2, 3);\
-	ECHO_SUBBYTES(_state, 3, 3);\
-	ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
-	ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
-	ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
-	ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
-	ECHO_SUBBYTES(_state2, 0, 0);\
-	ECHO_SUBBYTES(_state2, 1, 0);\
-	ECHO_SUBBYTES(_state2, 2, 0);\
-	ECHO_SUBBYTES(_state2, 3, 0);\
-	ECHO_SUBBYTES(_state2, 0, 1);\
-	ECHO_SUBBYTES(_state2, 1, 1);\
-	ECHO_SUBBYTES(_state2, 2, 1);\
-	ECHO_SUBBYTES(_state2, 3, 1);\
-	ECHO_SUBBYTES(_state2, 0, 2);\
-	ECHO_SUBBYTES(_state2, 1, 2);\
-	ECHO_SUBBYTES(_state2, 2, 2);\
-	ECHO_SUBBYTES(_state2, 3, 2);\
-	ECHO_SUBBYTES(_state2, 0, 3);\
-	ECHO_SUBBYTES(_state2, 1, 3);\
-	ECHO_SUBBYTES(_state2, 2, 3);\
-	ECHO_SUBBYTES(_state2, 3, 3);\
-	ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
-	ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
-	ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
-	ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
-
-
-
-#define SAVESTATE(dst, src)\
-	dst[0][0] = src[0][0];\
-	dst[0][1] = src[0][1];\
-	dst[0][2] = src[0][2];\
-	dst[0][3] = src[0][3];\
-	dst[1][0] = src[1][0];\
-	dst[1][1] = src[1][1];\
-	dst[1][2] = src[1][2];\
-	dst[1][3] = src[1][3];\
-	dst[2][0] = src[2][0];\
-	dst[2][1] = src[2][1];\
-	dst[2][2] = src[2][2];\
-	dst[2][3] = src[2][3];\
-	dst[3][0] = src[3][0];\
-	dst[3][1] = src[3][1];\
-	dst[3][2] = src[3][2];\
-	dst[3][3] = src[3][3]
-
-
-void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBlockCount)
-{
-   unsigned int r, b, i, j;
-   __m128i t1, t2, s2, k1;
-   __m128i _state[4][4], _state2[4][4], _statebackup[4][4]; 
-
-   for(i = 0; i < 4; i++)
-	for(j = 0; j < ctx->uHashSize / 256; j++)
-		_state[i][j] = ctx->state[i][j];
-
-   for(b = 0; b < uBlockCount; b++)
-   {
-	ctx->k = _mm_add_epi64(ctx->k, ctx->const1536);
-
-	// load message
-	for(j = ctx->uHashSize / 256; j < 4; j++)
-	{
-	   for(i = 0; i < 4; i++)
-	   {
-		_state[i][j] = _mm_loadu_si128((__m128i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i);
-	   }
-	}
-
-uint64_t *b = (uint64_t*)_state;
-//printf("Ss3: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]);
-   
-	// save state
-	SAVESTATE(_statebackup, _state);
-
-	k1 = ctx->k;
-
-	for(r = 0; r < ctx->uRounds / 2; r++)
-	{
-		ECHO_ROUND_UNROLL2;
-	}
-
-//printf("Ss4: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]);
-   
-   
-	if(ctx->uHashSize == 256)
-	{
-	   for(i = 0; i < 4; i++)
-	   {
-		_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][1]);
-		_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
-		_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][3]);
-		_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
-		_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][1]);
-		_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);
-		_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][3]);
-	   }
-	}
-	else
-	{
-	   for(i = 0; i < 4; i++)
-	   {
-		_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
-		_state[i][1] = _mm_xor_si128(_state[i][1], _state[i][3]);
-		_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
-		_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);
-		_state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][1]);
-		_state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][3]);
-           }
-	}
-	pmsg += ctx->uBlockLength;
-   }
-	SAVESTATE(ctx->state, _state);
-
-}
-
-
-
-HashReturn init_echo(hashState_echo *ctx, int nHashSize)
-{
-	int i, j;
-
-        ctx->k = _mm_setzero_si128(); 
-	ctx->processed_bits = 0;
-	ctx->uBufferBytes = 0;
-
-	switch(nHashSize)
-	{
-		case 256:
-			ctx->uHashSize = 256;
-			ctx->uBlockLength = 192;
-			ctx->uRounds = 8;
-			ctx->hashsize = _mm_set_epi32(0, 0, 0, 0x00000100);
-			ctx->const1536 = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000600);
-			break;
-
-		case 512:
-			ctx->uHashSize = 512;
-			ctx->uBlockLength = 128;
-			ctx->uRounds = 10;
-			ctx->hashsize = _mm_set_epi32(0, 0, 0, 0x00000200);
-			ctx->const1536 = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000400);
-			break;
-
-		default:
-			return BAD_HASHBITLEN;
-	}
-
-
-	for(i = 0; i < 4; i++)
-		for(j = 0; j < nHashSize / 256; j++)
-			ctx->state[i][j] = ctx->hashsize;
-
-	for(i = 0; i < 4; i++)
-		for(j = nHashSize / 256; j < 4; j++)
-			ctx->state[i][j] = _mm_set_epi32(0, 0, 0, 0);
-
-	return SUCCESS;
-}
-
-HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLength databitlen)
-{
-	unsigned int uByteLength, uBlockCount, uRemainingBytes;
-
-	uByteLength = (unsigned int)(databitlen / 8);
-
-	if((state->uBufferBytes + uByteLength) >= state->uBlockLength)
-	{
-		if(state->uBufferBytes != 0)
-		{
-			// Fill the buffer
-			memcpy(state->buffer + state->uBufferBytes, (void*)data, state->uBlockLength - state->uBufferBytes);
-
-			// Process buffer
-			Compress(state, state->buffer, 1);
-			state->processed_bits += state->uBlockLength * 8;
-
-			data += state->uBlockLength - state->uBufferBytes;
-			uByteLength -= state->uBlockLength - state->uBufferBytes;
-		}
-
-		// buffer now does not contain any unprocessed bytes
-
-		uBlockCount = uByteLength / state->uBlockLength;
-		uRemainingBytes = uByteLength % state->uBlockLength;
-
-		if(uBlockCount > 0)
-		{
-			Compress(state, data, uBlockCount);
-
-			state->processed_bits += uBlockCount * state->uBlockLength * 8;
-			data += uBlockCount * state->uBlockLength;
-		}
-
-		if(uRemainingBytes > 0)
-		{
-			memcpy(state->buffer, (void*)data, uRemainingBytes);
-		}
-
-		state->uBufferBytes = uRemainingBytes;
-	}
-	else
-	{
-		memcpy(state->buffer + state->uBufferBytes, (void*)data, uByteLength);
-		state->uBufferBytes += uByteLength;
-	}
-
-	return SUCCESS;
-}
-
-HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
-{
-	__m128i remainingbits;
-
-	// Add remaining bytes in the buffer
-	state->processed_bits += state->uBufferBytes * 8;
-
-	remainingbits = _mm_set_epi32(0, 0, 0, state->uBufferBytes * 8);
-
-	// Pad with 0x80
-	state->buffer[state->uBufferBytes++] = 0x80;
-	
-	// Enough buffer space for padding in this block?
-	if((state->uBlockLength - state->uBufferBytes) >= 18)
-	{
-		// Pad with zeros
-		memset(state->buffer + state->uBufferBytes, 0, state->uBlockLength - (state->uBufferBytes + 18));
-
-		// Hash size
-		*((unsigned short*)(state->buffer + state->uBlockLength - 18)) = state->uHashSize;
-
-		// Processed bits
-		*((DataLength*)(state->buffer + state->uBlockLength - 16)) = state->processed_bits;
-		*((DataLength*)(state->buffer + state->uBlockLength - 8)) = 0;
-
-		// Last block contains message bits?
-		if(state->uBufferBytes == 1)
-		{
-			state->k = _mm_xor_si128(state->k, state->k);
-			state->k = _mm_sub_epi64(state->k, state->const1536);
-		}
-		else
-		{
-			state->k = _mm_add_epi64(state->k, remainingbits);
-			state->k = _mm_sub_epi64(state->k, state->const1536);
-		}
-
-		// Compress
-		Compress(state, state->buffer, 1);
-	}
-	else
-	{
-		// Fill with zero and compress
-		memset(state->buffer + state->uBufferBytes, 0, state->uBlockLength - state->uBufferBytes);
-		state->k = _mm_add_epi64(state->k, remainingbits);
-		state->k = _mm_sub_epi64(state->k, state->const1536);
-		Compress(state, state->buffer, 1);
-
-		// Last block
-		memset(state->buffer, 0, state->uBlockLength - 18);
-
-		// Hash size
-		*((unsigned short*)(state->buffer + state->uBlockLength - 18)) = state->uHashSize;
-
-		// Processed bits
-		*((DataLength*)(state->buffer + state->uBlockLength - 16)) = state->processed_bits;
-		*((DataLength*)(state->buffer + state->uBlockLength - 8)) = 0;
-
-		// Compress the last block
-		state->k = _mm_xor_si128(state->k, state->k);
-		state->k = _mm_sub_epi64(state->k, state->const1536);
-		Compress(state, state->buffer, 1);
-	}
-
-	// Store the hash value
-	_mm_storeu_si128((__m128i*)hashval + 0, state->state[0][0]);
-	_mm_storeu_si128((__m128i*)hashval + 1, state->state[1][0]);
-
-	if(state->uHashSize == 512)
-	{
-		_mm_storeu_si128((__m128i*)hashval + 2, state->state[2][0]);
-		_mm_storeu_si128((__m128i*)hashval + 3, state->state[3][0]);
-	}
-
-	return SUCCESS;
-}
-
-HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
-                              const BitSequence *data, DataLength databitlen )
-{
-   unsigned int uByteLength, uBlockCount, uRemainingBytes;
-
-   uByteLength = (unsigned int)(databitlen / 8);
-
-/*   
-   if( (state->uBufferBytes + uByteLength) >= state->uBlockLength )
-   {
-printf("full block\n");
-      if( state->uBufferBytes != 0 )
-        {
-           // Fill the buffer
-           memcpy( state->buffer + state->uBufferBytes,
-                   (void*)data, state->uBlockLength - state->uBufferBytes );
-
-           // Process buffer
-           Compress( state, state->buffer, 1 );
-           state->processed_bits += state->uBlockLength * 8;
-
-           data += state->uBlockLength - state->uBufferBytes;
-           uByteLength -= state->uBlockLength - state->uBufferBytes;
-        }
-
-        // buffer now does not contain any unprocessed bytes
-
-        uBlockCount = uByteLength / state->uBlockLength;
-        uRemainingBytes = uByteLength % state->uBlockLength;
-
-        if( uBlockCount > 0 )
-        {
-           Compress( state, data, uBlockCount );
-           state->processed_bits += uBlockCount * state->uBlockLength * 8;
-           data += uBlockCount * state->uBlockLength;
-        }
-
-        if( uRemainingBytes > 0 )
-        memcpy(state->buffer, (void*)data, uRemainingBytes);
-
-        state->uBufferBytes = uRemainingBytes;
-   }
-   else
-   {
-*/
-   memcpy( state->buffer + state->uBufferBytes, (void*)data, uByteLength );
-        state->uBufferBytes += uByteLength;
-//   }
-
-   __m128i remainingbits;
-
-   // Add remaining bytes in the buffer
-   state->processed_bits += state->uBufferBytes * 8;
-
-   remainingbits = _mm_set_epi32( 0, 0, 0, state->uBufferBytes * 8 );
-
-   // Pad with 0x80
-   state->buffer[state->uBufferBytes++] = 0x80;
-
-   // Enough buffer space for padding in this block?
-
-//   if( (state->uBlockLength - state->uBufferBytes) >= 18 )
-//   {
-        // Pad with zeros
-
-        memset( state->buffer + state->uBufferBytes, 0, state->uBlockLength - (state->uBufferBytes + 18) );
-
-        // Hash size
-        *( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) = state->uHashSize;
-
-        // Processed bits
-        *( (DataLength*)(state->buffer + state->uBlockLength - 16) ) =
-                   state->processed_bits;
-        *( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0;
-
-
-        // Last block contains message bits?
-        if( state->uBufferBytes == 1 )
-        {
-           state->k = _mm_xor_si128( state->k, state->k );
-           state->k = _mm_sub_epi64( state->k, state->const1536 );
-        }
-        else
-        {
-           state->k = _mm_add_epi64( state->k, remainingbits );
-           state->k = _mm_sub_epi64( state->k, state->const1536 );
-        }
-
-uint64_t *b = (uint64_t*)&state->k;
-/*
-printf("Sk: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]);
-b = (uint64_t*)state->buffer;
-printf("Sb: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]);
-printf("Sb: %016lx %016lx %016lx %016lx\n",b[4],b[5],b[6],b[7]);
-printf("Sb: %016lx %016lx %016lx %016lx\n",b[8],b[9],b[10],b[11]);
-printf("Sb: %016lx %016lx %016lx %016lx\n",b[12],b[13],b[14],b[15]);
-
-b = (uint64_t*)state->state;
-printf("Ss1: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]);
-printf("Ss1: %016lx %016lx %016lx %016lx\n",b[4],b[5],b[6],b[7]);
-printf("Ss1: %016lx %016lx %016lx %016lx\n",b[8],b[9],b[10],b[11]);
-printf("Ss1: %016lx %016lx %016lx %016lx\n",b[12],b[13],b[14],b[15]);
-*/        
-        // Compress
-        Compress( state, state->buffer, 1 );
-
-//printf("Ss2: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]);
-
-        
-/*
-   }
-   else
-   {
-        // Fill with zero and compress
-        memset( state->buffer + state->uBufferBytes, 0,
-                state->uBlockLength - state->uBufferBytes );
-        state->k = _mm_add_epi64( state->k, remainingbits );
-        state->k = _mm_sub_epi64( state->k, state->const1536 );
-        Compress( state, state->buffer, 1 );
-
-        // Last block
-        memset( state->buffer, 0, state->uBlockLength - 18 );
-
-        // Hash size
-        *( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) =
-                 state->uHashSize;
-
-        // Processed bits
-        *( (DataLength*)(state->buffer + state->uBlockLength - 16) ) =
-                   state->processed_bits;
-        *( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0;
-        // Compress the last block
-        state->k = _mm_xor_si128( state->k, state->k );
-        state->k = _mm_sub_epi64( state->k, state->const1536 );
-        Compress( state, state->buffer, 1) ;
-   }
-*/
-
-   // Store the hash value
-   _mm_storeu_si128( (__m128i*)hashval + 0, state->state[0][0] );
-   _mm_storeu_si128( (__m128i*)hashval + 1, state->state[1][0] );
-
-   if( state->uHashSize == 512 )
-   {
-        _mm_storeu_si128( (__m128i*)hashval + 2, state->state[2][0] );
-        _mm_storeu_si128( (__m128i*)hashval + 3, state->state[3][0] );
-
-   }
-   return SUCCESS;
-}
-
-
-HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval)
-{
-	HashReturn hRet;
-	hashState_echo hs;
-
-	/////
-	/*
-	__m128i a, b, c, d, t[4], u[4], v[4];
-
-	a = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x07060504, 0x03020100);
-	b = _mm_set_epi32(0x1f1e1d1c, 0x1b1a1918, 0x17161514, 0x13121110);
-	c = _mm_set_epi32(0x2f2e2d2c, 0x2b2a2928, 0x27262524, 0x23222120);
-	d = _mm_set_epi32(0x3f3e3d3c, 0x3b3a3938, 0x37363534, 0x33323130);
-
-	t[0] = _mm_unpacklo_epi8(a, b);
-	t[1] = _mm_unpackhi_epi8(a, b);
-	t[2] = _mm_unpacklo_epi8(c, d);
-	t[3] = _mm_unpackhi_epi8(c, d);
-
-	u[0] = _mm_unpacklo_epi16(t[0], t[2]);
-	u[1] = _mm_unpackhi_epi16(t[0], t[2]);
-	u[2] = _mm_unpacklo_epi16(t[1], t[3]);
-	u[3] = _mm_unpackhi_epi16(t[1], t[3]);
-
-
-	t[0] = _mm_unpacklo_epi16(u[0], u[1]);
-	t[1] = _mm_unpackhi_epi16(u[0], u[1]);
-	t[2] = _mm_unpacklo_epi16(u[2], u[3]);
-	t[3] = _mm_unpackhi_epi16(u[2], u[3]);
-
-	u[0] = _mm_unpacklo_epi8(t[0], t[1]);
-	u[1] = _mm_unpackhi_epi8(t[0], t[1]);
-	u[2] = _mm_unpacklo_epi8(t[2], t[3]);
-	u[3] = _mm_unpackhi_epi8(t[2], t[3]);
-
-	a = _mm_unpacklo_epi8(u[0], u[1]);
-	b = _mm_unpackhi_epi8(u[0], u[1]);
-	c = _mm_unpacklo_epi8(u[2], u[3]);
-	d = _mm_unpackhi_epi8(u[2], u[3]);
-	*/
-	/////
-
-	hRet = init_echo(&hs, hashbitlen);
-	if(hRet != SUCCESS)
-		return hRet;
-
-	hRet = update_echo(&hs, data, databitlen);
-	if(hRet != SUCCESS)
-		return hRet;
-
-	hRet = final_echo(&hs, hashval);
-	if(hRet != SUCCESS)
-		return hRet;
-
-	return SUCCESS;
-}
-
-#endif
--- a/algo/echo/aes_ni/hash_api.h
+++ b/algo/echo/aes_ni/hash_api.h
@@ -15,7 +15,7 @@
 #ifndef HASH_API_H
 #define HASH_API_H

-#ifndef NO_AES_NI
+#ifdef __AES__
 #define HASH_IMPL_STR	"ECHO-aesni"
 #else
 #define HASH_IMPL_STR	"ECHO-vperm"
@@ -55,6 +55,8 @@ HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databit

 HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
                              const BitSequence *data, DataLength databitlen );
+HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
+            int nHashSize, const BitSequence *data, DataLength databitlen );

 #endif // HASH_API_H

--- a/algo/echo/echo-hash-4way.c
+++ b/algo/echo/echo-hash-4way.c
@@ -313,4 +313,92 @@ int echo_4way_update_close( echo_4way_context *state, void *hashval,
   return 0;
 }

+int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize, 
+                    const void *data, int datalen )
+{
+   int i, j;
+   int databitlen = datalen * 8;
+   ctx->k = m512_zero;
+   ctx->processed_bits = 0;
+   ctx->uBufferBytes = 0;
+
+   switch( nHashSize )
+   {
+      case 256:
+         ctx->uHashSize = 256;
+         ctx->uBlockLength = 192;
+         ctx->uRounds = 8;
+         ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x100 );
+         ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x600 );
+         break;
+
+      case 512:
+         ctx->uHashSize = 512;
+         ctx->uBlockLength = 128;
+         ctx->uRounds = 10;
+         ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x200 );
+         ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x400);
+         break;
+
+      default:
+         return 1;
+   }
+
+   for( i = 0; i < 4; i++ )
+      for( j = 0; j < nHashSize / 256; j++ )
+         ctx->state[ i ][ j ] = ctx->hashsize;
+
+   for( i = 0; i < 4; i++ )
+      for( j = nHashSize / 256; j < 4; j++ )
+         ctx->state[ i ][ j ] = m512_zero;
+
+   
+// datalen is expected to be 32 (maybe), 64, 80 or 128 bytes. Only 128 bytes
+// (1024 bits) is compressed directly below; shorter inputs are buffered.
+
+   int vlen = datalen / 32;  
+   const int vblen = ctx->uBlockLength / 16; //  16 bytes per lane
+   __m512i remainingbits;
+
+   if ( databitlen == 1024 )
+   {
+      echo_4way_compress( ctx, data, 1 );
+      ctx->processed_bits = 1024;
+      remainingbits = m512_const2_64( 0, -1024 );
+      vlen = 0;
+   }
+   else
+   {
+      vlen = databitlen / 128;  // * 4 lanes / 128 bits per lane
+      memcpy_512( ctx->buffer, data, vlen );
+      ctx->processed_bits += (unsigned int)( databitlen );
+      remainingbits = _mm512_set4_epi32( 0, 0, 0, databitlen );
+
+   }
+
+   ctx->buffer[ vlen ] = _mm512_set4_epi32( 0, 0, 0, 0x80 );
+   memset_zero_512( ctx->buffer + vlen + 1, vblen - vlen - 2 );
+   ctx->buffer[ vblen-2 ] =
+                _mm512_set4_epi32( (uint32_t)ctx->uHashSize << 16, 0, 0, 0 );
+   ctx->buffer[ vblen-1 ] =
+                   _mm512_set4_epi64( 0, ctx->processed_bits,
+                                      0, ctx->processed_bits );
+
+   ctx->k = _mm512_add_epi64( ctx->k, remainingbits );
+   ctx->k = _mm512_sub_epi64( ctx->k, ctx->const1536 );
+
+   echo_4way_compress( ctx, ctx->buffer, 1 );
+
+   _mm512_store_si512( (__m512i*)hashval + 0, ctx->state[ 0 ][ 0] );
+   _mm512_store_si512( (__m512i*)hashval + 1, ctx->state[ 1 ][ 0] );
+
+   if ( ctx->uHashSize == 512 )
+   {
+      _mm512_store_si512( (__m512i*)hashval + 2, ctx->state[ 2 ][ 0 ] );
+      _mm512_store_si512( (__m512i*)hashval + 3, ctx->state[ 3 ][ 0 ] );
+   }
+   return 0;
+}
+
+
 #endif
--- a/algo/echo/echo-hash-4way.h
+++ b/algo/echo/echo-hash-4way.h
@@ -22,15 +22,26 @@ typedef struct
 } echo_4way_context __attribute__ ((aligned (64)));

 int echo_4way_init( echo_4way_context *state, int hashbitlen );
-
+#define echo512_4way_init( state ) echo_4way_init( state, 512 )
+#define echo256_4way_init( state ) echo_4way_init( state, 256 )

 int echo_4way_update( echo_4way_context *state, const void *data,
    unsigned int databitlen);
+#define echo512_4way_update echo_4way_update

 int echo_close( echo_4way_context *state, void *hashval );
+#define echo512_4way_close echo_4way_close

 int echo_4way_update_close( echo_4way_context *state, void *hashval,
                              const void *data, int databitlen );
+#define echo512_4way_update_close echo_4way_update_close
+
+int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
+                    const void *data, int datalen );
+#define echo512_4way_full( state, hashval, data, datalen ) \
+           echo_4way_full( state, hashval, 512, data, datalen )
+#define echo256_4way_full( state, hashval, data, datalen ) \
+           echo_4way_full( state, hashval, 256, data, datalen )

 #endif 
 #endif
--- a/algo/echo/sph_echo.c
+++ b/algo/echo/sph_echo.c
@@ -36,6 +36,8 @@

 #include "sph_echo.h"

+#if !defined(__AES__)
+
 #ifdef __cplusplus
 extern "C"{
 #endif
@@ -1028,4 +1030,5 @@ sph_echo512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 }
 #ifdef __cplusplus
 }
-#endif
+#endif 
+#endif  // !defined(__AES__)
--- a/algo/echo/sph_echo.h
+++ b/algo/echo/sph_echo.h
@@ -36,6 +36,8 @@
 #ifndef SPH_ECHO_H__
 #define SPH_ECHO_H__

+#if !defined(__AES__)
+
 #ifdef __cplusplus
 extern "C"{
 #endif
@@ -316,5 +318,5 @@ void sph_echo512_addbits_and_close(
 #ifdef __cplusplus
 }
 #endif
-
+#endif // !defined(__AES__)
 #endif
--- a/algo/fugue/sph_fugue.h
+++ b/algo/fugue/sph_fugue.h
@@ -74,6 +74,14 @@ void sph_fugue512_close(void *cc, void *dst);
 void sph_fugue512_addbits_and_close(
 	void *cc, unsigned ub, unsigned n, void *dst);

+#define sph_fugue512_full( cc, dst, data, len ) \
+do{ \
+   sph_fugue512_init( cc ); \
+   sph_fugue512( cc, data, len ); \
+   sph_fugue512_close( cc, dst ); \
+}while(0)
+
+
 #ifdef __cplusplus
 }
 #endif
--- a/algo/groestl/aes_ni/groestl-asm-aes.h
+++ b/algo/groestl/aes_ni/groestl-asm-aes.h
--- a/algo/groestl/aes_ni/groestl-asm-avx.h
+++ b/algo/groestl/aes_ni/groestl-asm-avx.h
--- a/algo/groestl/aes_ni/groestl-asm-vperm.h
+++ b/algo/groestl/aes_ni/groestl-asm-vperm.h
--- a/algo/groestl/aes_ni/groestl-intr-aes.h
+++ b/algo/groestl/aes_ni/groestl-intr-aes.h
@@ -1,3 +1,6 @@
+#if !defined GROESTL_INTR_AES_H__
+#define GROESTL_INTR_AES_H__
+
 /* groestl-intr-aes.h     Aug 2011
 *
 * Groestl implementation with intrinsics using ssse3, sse4.1, and aes
@@ -11,16 +14,51 @@
 #include <wmmintrin.h>
 #include "hash-groestl.h"

-/* global constants  */
-__m128i ROUND_CONST_Lx;
-//__m128i ROUND_CONST_L0[ROUNDS512];
-//__m128i ROUND_CONST_L7[ROUNDS512];
-__m128i ROUND_CONST_P[ROUNDS1024];
-__m128i ROUND_CONST_Q[ROUNDS1024];
-__m128i TRANSP_MASK;
-__m128i SUBSH_MASK[8];
-__m128i ALL_1B;
-__m128i ALL_FF;
+static const __m128i round_const_p[] __attribute__ ((aligned (64))) =
+{
+   { 0x7060504030201000, 0xf0e0d0c0b0a09080 },
+   { 0x7161514131211101, 0xf1e1d1c1b1a19181 },
+   { 0x7262524232221202, 0xf2e2d2c2b2a29282 },
+   { 0x7363534333231303, 0xf3e3d3c3b3a39383 },
+   { 0x7464544434241404, 0xf4e4d4c4b4a49484 },
+   { 0x7565554535251505, 0xf5e5d5c5b5a59585 },
+   { 0x7666564636261606, 0xf6e6d6c6b6a69686 },
+   { 0x7767574737271707, 0xf7e7d7c7b7a79787 },
+   { 0x7868584838281808, 0xf8e8d8c8b8a89888 },
+   { 0x7969594939291909, 0xf9e9d9c9b9a99989 },
+   { 0x7a6a5a4a3a2a1a0a, 0xfaeadacabaaa9a8a },
+   { 0x7b6b5b4b3b2b1b0b, 0xfbebdbcbbbab9b8b },
+   { 0x7c6c5c4c3c2c1c0c, 0xfcecdcccbcac9c8c },
+   { 0x7d6d5d4d3d2d1d0d, 0xfdedddcdbdad9d8d }
+};
+
+static const __m128i round_const_q[] __attribute__ ((aligned (64))) =
+{
+   { 0x8f9fafbfcfdfefff, 0x0f1f2f3f4f5f6f7f },
+   { 0x8e9eaebecedeeefe, 0x0e1e2e3e4e5e6e7e },
+   { 0x8d9dadbdcdddedfd, 0x0d1d2d3d4d5d6d7d },
+   { 0x8c9cacbcccdcecfc, 0x0c1c2c3c4c5c6c7c },
+   { 0x8b9babbbcbdbebfb, 0x0b1b2b3b4b5b6b7b },
+   { 0x8a9aaabacadaeafa, 0x0a1a2a3a4a5a6a7a },
+   { 0x8999a9b9c9d9e9f9, 0x0919293949596979 },
+   { 0x8898a8b8c8d8e8f8, 0x0818283848586878 },
+   { 0x8797a7b7c7d7e7f7, 0x0717273747576777 },
+   { 0x8696a6b6c6d6e6f6, 0x0616263646566676 },
+   { 0x8595a5b5c5d5e5f5, 0x0515253545556575 },
+   { 0x8494a4b4c4d4e4f4, 0x0414243444546474 },
+   { 0x8393a3b3c3d3e3f3, 0x0313233343536373 },
+   { 0x8292a2b2c2d2e2f2, 0x0212223242526272 }
+};
+
+static const __m128i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02 };
+static const __m128i SUBSH_MASK0 = { 0x0b0e0104070a0d00, 0x0306090c0f020508 };
+static const __m128i SUBSH_MASK1 = { 0x0c0f0205080b0e01, 0x04070a0d00030609 };
+static const __m128i SUBSH_MASK2 = { 0x0d000306090c0f02, 0x05080b0e0104070a };
+static const __m128i SUBSH_MASK3 = { 0x0e0104070a0d0003, 0x06090c0f0205080b };
+static const __m128i SUBSH_MASK4 = { 0x0f0205080b0e0104, 0x070a0d000306090c };
+static const __m128i SUBSH_MASK5 = { 0x000306090c0f0205, 0x080b0e0104070a0d };
+static const __m128i SUBSH_MASK6 = { 0x0104070a0d000306, 0x090c0f0205080b0e };
+static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };

 #define tos(a)    #a
 #define tostr(a)  tos(a)
@@ -111,7 +149,7 @@ __m128i ALL_FF;
  \
  /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
  /* compute w_i : add y_{i+4} */\
-  b1 = ALL_1B;\
+  b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
  MUL2(a0, b0, b1);\
  a0 = _mm_xor_si128(a0, TEMP0);\
  MUL2(a1, b0, b1);\
@@ -152,25 +190,6 @@ __m128i ALL_FF;
 }/*MixBytes*/


-#define SET_CONSTANTS(){\
-  ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\
-  ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
-  TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
-  SUBSH_MASK[0] = _mm_set_epi32(0x0306090c, 0x0f020508, 0x0b0e0104, 0x070a0d00);\
-  SUBSH_MASK[1] = _mm_set_epi32(0x04070a0d, 0x00030609, 0x0c0f0205, 0x080b0e01);\
-  SUBSH_MASK[2] = _mm_set_epi32(0x05080b0e, 0x0104070a, 0x0d000306, 0x090c0f02);\
-  SUBSH_MASK[3] = _mm_set_epi32(0x06090c0f, 0x0205080b, 0x0e010407, 0x0a0d0003);\
-  SUBSH_MASK[4] = _mm_set_epi32(0x070a0d00, 0x0306090c, 0x0f020508, 0x0b0e0104);\
-  SUBSH_MASK[5] = _mm_set_epi32(0x080b0e01, 0x04070a0d, 0x00030609, 0x0c0f0205);\
-  SUBSH_MASK[6] = _mm_set_epi32(0x090c0f02, 0x05080b0e, 0x0104070a, 0x0d000306);\
-  SUBSH_MASK[7] = _mm_set_epi32(0x0e010407, 0x0a0d0003, 0x06090c0f, 0x0205080b);\
-  for(i = 0; i < ROUNDS1024; i++)\
-  {\
-    ROUND_CONST_P[i] = _mm_set_epi32(0xf0e0d0c0 ^ (i * 0x01010101), 0xb0a09080 ^ (i * 0x01010101), 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
-    ROUND_CONST_Q[i] = _mm_set_epi32(0x0f1f2f3f ^ (i * 0x01010101), 0x4f5f6f7f ^ (i * 0x01010101), 0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101));\
-  }\
-}while(0);\
-
 /* one round
 * a0-a7 = input rows
 * b0-b7 = output rows
@@ -194,30 +213,34 @@ __m128i ALL_FF;
  u8 round_counter = 0;\
  for(round_counter = 0; round_counter < 14; round_counter+=2) {\
    /* AddRoundConstant P1024 */\
-    xmm8 = _mm_xor_si128(xmm8, (ROUND_CONST_P[round_counter]));\
+    xmm8 = _mm_xor_si128( xmm8, \
+             casti_m128i( round_const_p, round_counter ) ); \
     /* ShiftBytes P1024 + pre-AESENCLAST */\
-    xmm8  = _mm_shuffle_epi8(xmm8,  (SUBSH_MASK[0]));\
-    xmm9  = _mm_shuffle_epi8(xmm9,  (SUBSH_MASK[1]));\
-    xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[2]));\
-    xmm11 = _mm_shuffle_epi8(xmm11, (SUBSH_MASK[3]));\
-    xmm12 = _mm_shuffle_epi8(xmm12, (SUBSH_MASK[4]));\
-    xmm13 = _mm_shuffle_epi8(xmm13, (SUBSH_MASK[5]));\
-    xmm14 = _mm_shuffle_epi8(xmm14, (SUBSH_MASK[6]));\
-    xmm15 = _mm_shuffle_epi8(xmm15, (SUBSH_MASK[7]));\
+    xmm8  = _mm_shuffle_epi8( xmm8,  SUBSH_MASK0 ); \
+    xmm9  = _mm_shuffle_epi8( xmm9,  SUBSH_MASK1 ); \
+    xmm10 = _mm_shuffle_epi8( xmm10, SUBSH_MASK2 ); \
+    xmm11 = _mm_shuffle_epi8( xmm11, SUBSH_MASK3 ); \
+    xmm12 = _mm_shuffle_epi8( xmm12, SUBSH_MASK4 ); \
+    xmm13 = _mm_shuffle_epi8( xmm13, SUBSH_MASK5 ); \
+    xmm14 = _mm_shuffle_epi8( xmm14, SUBSH_MASK6 ); \
+    xmm15 = _mm_shuffle_epi8( xmm15, SUBSH_MASK7 ); \
    /* SubBytes + MixBytes */\
-    SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
+    SUBMIX( xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, \
+            xmm0, xmm1, xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7 ); \
    \
    /* AddRoundConstant P1024 */\
-    xmm0 = _mm_xor_si128(xmm0, (ROUND_CONST_P[round_counter+1]));\
-    xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[0]));\
-    xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[1]));\
-    xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[2]));\
-    xmm3 = _mm_shuffle_epi8(xmm3, (SUBSH_MASK[3]));\
-    xmm4 = _mm_shuffle_epi8(xmm4, (SUBSH_MASK[4]));\
-    xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[5]));\
-    xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[6]));\
-    xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[7]));\
-    SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
+    xmm0 = _mm_xor_si128( xmm0, \
+             casti_m128i( round_const_p, round_counter+1 ) ); \
+    xmm0 = _mm_shuffle_epi8( xmm0, SUBSH_MASK0 ); \
+    xmm1 = _mm_shuffle_epi8( xmm1, SUBSH_MASK1 ); \
+    xmm2 = _mm_shuffle_epi8( xmm2, SUBSH_MASK2 ); \
+    xmm3 = _mm_shuffle_epi8( xmm3, SUBSH_MASK3 ); \
+    xmm4 = _mm_shuffle_epi8( xmm4, SUBSH_MASK4 ); \
+    xmm5 = _mm_shuffle_epi8( xmm5, SUBSH_MASK5 ); \
+    xmm6 = _mm_shuffle_epi8( xmm6, SUBSH_MASK6 ); \
+    xmm7 = _mm_shuffle_epi8( xmm7, SUBSH_MASK7 ); \
+    SUBMIX( xmm0, xmm1, xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7, \
+            xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 ); \
  }\
 }

@@ -225,48 +248,52 @@ __m128i ALL_FF;
  u8 round_counter = 0;\
  for(round_counter = 0; round_counter < 14; round_counter+=2) {\
    /* AddRoundConstant Q1024 */\
-    xmm1 = ALL_FF;\
-    xmm8  = _mm_xor_si128(xmm8,  xmm1);\
-    xmm9  = _mm_xor_si128(xmm9,  xmm1);\
-    xmm10 = _mm_xor_si128(xmm10, xmm1);\
-    xmm11 = _mm_xor_si128(xmm11, xmm1);\
-    xmm12 = _mm_xor_si128(xmm12, xmm1);\
-    xmm13 = _mm_xor_si128(xmm13, xmm1);\
-    xmm14 = _mm_xor_si128(xmm14, xmm1);\
-    xmm15 = _mm_xor_si128(xmm15, (ROUND_CONST_Q[round_counter]));\
+    xmm1 = m128_neg1;\
+    xmm8  = _mm_xor_si128( xmm8,  xmm1 ); \
+    xmm9  = _mm_xor_si128( xmm9,  xmm1 ); \
+    xmm10 = _mm_xor_si128( xmm10, xmm1 ); \
+    xmm11 = _mm_xor_si128( xmm11, xmm1 ); \
+    xmm12 = _mm_xor_si128( xmm12, xmm1 ); \
+    xmm13 = _mm_xor_si128( xmm13, xmm1 ); \
+    xmm14 = _mm_xor_si128( xmm14, xmm1 ); \
+    xmm15 = _mm_xor_si128( xmm15, \
+              casti_m128i( round_const_q, round_counter ) ); \
    /* ShiftBytes Q1024 + pre-AESENCLAST */\
-    xmm8  = _mm_shuffle_epi8(xmm8,  (SUBSH_MASK[1]));\
-    xmm9  = _mm_shuffle_epi8(xmm9,  (SUBSH_MASK[3]));\
-    xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[5]));\
-    xmm11 = _mm_shuffle_epi8(xmm11, (SUBSH_MASK[7]));\
-    xmm12 = _mm_shuffle_epi8(xmm12, (SUBSH_MASK[0]));\
-    xmm13 = _mm_shuffle_epi8(xmm13, (SUBSH_MASK[2]));\
-    xmm14 = _mm_shuffle_epi8(xmm14, (SUBSH_MASK[4]));\
-    xmm15 = _mm_shuffle_epi8(xmm15, (SUBSH_MASK[6]));\
+    xmm8  = _mm_shuffle_epi8( xmm8,  SUBSH_MASK1 ); \
+    xmm9  = _mm_shuffle_epi8( xmm9,  SUBSH_MASK3 ); \
+    xmm10 = _mm_shuffle_epi8( xmm10, SUBSH_MASK5 ); \
+    xmm11 = _mm_shuffle_epi8( xmm11, SUBSH_MASK7 ); \
+    xmm12 = _mm_shuffle_epi8( xmm12, SUBSH_MASK0 ); \
+    xmm13 = _mm_shuffle_epi8( xmm13, SUBSH_MASK2 ); \
+    xmm14 = _mm_shuffle_epi8( xmm14, SUBSH_MASK4 ); \
+    xmm15 = _mm_shuffle_epi8( xmm15, SUBSH_MASK6 ); \
    /* SubBytes + MixBytes */\
-    SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
+    SUBMIX( xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, \
+            xmm0, xmm1, xmm2,  xmm3,  xmm4,  xmm5,  xmm6 , xmm7 ); \
    \
    /* AddRoundConstant Q1024 */\
-    xmm9 = ALL_FF;\
-    xmm0 = _mm_xor_si128(xmm0,  xmm9);\
-    xmm1 = _mm_xor_si128(xmm1,  xmm9);\
-    xmm2 = _mm_xor_si128(xmm2,  xmm9);\
-    xmm3 = _mm_xor_si128(xmm3,  xmm9);\
-    xmm4 = _mm_xor_si128(xmm4,  xmm9);\
-    xmm5 = _mm_xor_si128(xmm5,  xmm9);\
-    xmm6 = _mm_xor_si128(xmm6,  xmm9);\
-    xmm7 = _mm_xor_si128(xmm7,  (ROUND_CONST_Q[round_counter+1]));\
+    xmm9 = m128_neg1;\
+    xmm0 = _mm_xor_si128( xmm0, xmm9 ); \
+    xmm1 = _mm_xor_si128( xmm1, xmm9 ); \
+    xmm2 = _mm_xor_si128( xmm2, xmm9 ); \
+    xmm3 = _mm_xor_si128( xmm3, xmm9 ); \
+    xmm4 = _mm_xor_si128( xmm4, xmm9 ); \
+    xmm5 = _mm_xor_si128( xmm5, xmm9 ); \
+    xmm6 = _mm_xor_si128( xmm6, xmm9 ); \
+    xmm7 = _mm_xor_si128( xmm7, \
+             casti_m128i( round_const_q, round_counter+1 ) ); \
    /* ShiftBytes Q1024 + pre-AESENCLAST */\
-    xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[1]));\
-    xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[3]));\
-    xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[5]));\
-    xmm3 = _mm_shuffle_epi8(xmm3, (SUBSH_MASK[7]));\
-    xmm4 = _mm_shuffle_epi8(xmm4, (SUBSH_MASK[0]));\
-    xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[2]));\
-    xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[4]));\
-    xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[6]));\
+    xmm0 = _mm_shuffle_epi8( xmm0, SUBSH_MASK1 ); \
+    xmm1 = _mm_shuffle_epi8( xmm1, SUBSH_MASK3 ); \
+    xmm2 = _mm_shuffle_epi8( xmm2, SUBSH_MASK5 ); \
+    xmm3 = _mm_shuffle_epi8( xmm3, SUBSH_MASK7 ); \
+    xmm4 = _mm_shuffle_epi8( xmm4, SUBSH_MASK0 ); \
+    xmm5 = _mm_shuffle_epi8( xmm5, SUBSH_MASK2 ); \
+    xmm6 = _mm_shuffle_epi8( xmm6, SUBSH_MASK4 ); \
+    xmm7 = _mm_shuffle_epi8( xmm7, SUBSH_MASK6 ); \
    /* SubBytes + MixBytes */\
-    SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
+    SUBMIX( xmm0,  xmm1, xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7, \
+            xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 ); \
  }\
 }

@@ -278,7 +305,7 @@ __m128i ALL_FF;
 * clobbers: t0-t7
 */
 #define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\
-  t0 = TRANSP_MASK;\
+  t0 = TRANSP_MASK; \
 \
  i6 = _mm_shuffle_epi8(i6, t0);\
  i0 = _mm_shuffle_epi8(i0, t0);\
@@ -366,7 +393,7 @@ __m128i ALL_FF;
  i4 = _mm_unpacklo_epi64(i4, i5);\
  t1 = _mm_unpackhi_epi64(t1, i5);\
  t2 = i6;\
-  o0 = TRANSP_MASK;\
+  o0 = TRANSP_MASK; \
  i6 = _mm_unpacklo_epi64(i6, i7);\
  t2 = _mm_unpackhi_epi64(t2, i7);\
  /* load transpose mask into a register, because it will be used 8 times */\
@@ -607,3 +634,4 @@ void OF1024( __m128i* chaining )
  return;
 }

+#endif
--- a/algo/groestl/aes_ni/groestl-intr-avx.h
+++ b/algo/groestl/aes_ni/groestl-intr-avx.h
--- a/algo/groestl/aes_ni/groestl-intr-vperm.h
+++ b/algo/groestl/aes_ni/groestl-intr-vperm.h
--- a/algo/groestl/aes_ni/groestl-version.h
+++ b/algo/groestl/aes_ni/groestl-version.h
@@ -1,17 +0,0 @@
-// specify assembly or intrinsics implementation
-//#define TASM
-#define TINTR
-
-//#define AES_NI
-
-//#ifdef AES_NI
-// specify AES-NI, AVX (with AES-NI) or vector-permute implementation
-
-//#ifndef NO_AES_NI
-
-// Not to be confused with AVX512VAES
-#define VAES
-// #define VAVX
-// #define VVPERM
-
-//#endif
--- a/algo/groestl/aes_ni/groestl256-asm-aes.h
+++ b/algo/groestl/aes_ni/groestl256-asm-aes.h
@@ -1,529 +0,0 @@
-/* groestl-asm-aes.h     Aug 2011
- *
- * Groestl implementation with inline assembly using ssse3, sse4.1, and aes
- * instructions.
- * Authors: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
- *
- * This code is placed in the public domain
- */
-
-#include "hash-groestl256.h"
-/* global constants  */
-__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Lx[16];
-__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L0[ROUNDS512*16];
-__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L7[ROUNDS512*16];
-__attribute__ ((aligned (16))) unsigned char ROUND_CONST_P[ROUNDS1024*16];
-__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Q[ROUNDS1024*16];
-__attribute__ ((aligned (16))) unsigned char TRANSP_MASK[16];
-__attribute__ ((aligned (16))) unsigned char SUBSH_MASK[8*16];
-__attribute__ ((aligned (16))) unsigned char ALL_1B[16];
-__attribute__ ((aligned (16))) unsigned char ALL_FF[16];
-
-/* temporary variables  */
-__attribute__ ((aligned (16))) unsigned char QTEMP[8*16];
-__attribute__ ((aligned (16))) unsigned char TEMP[3*16];
-
-
-#define tos(a)    #a
-#define tostr(a)  tos(a)
-
-
-/* xmm[i] will be multiplied by 2
- * xmm[j] will be lost
- * xmm[k] has to be all 0x1b */
-#define MUL2(i, j, k){\
-  asm("pxor xmm"tostr(j)", xmm"tostr(j)"");\
-  asm("pcmpgtb xmm"tostr(j)", xmm"tostr(i)"");\
-  asm("paddb xmm"tostr(i)", xmm"tostr(i)"");\
-  asm("pand xmm"tostr(j)", xmm"tostr(k)"");\
-  asm("pxor xmm"tostr(i)", xmm"tostr(j)"");\
-}/**/
-
-/* Yet another implementation of MixBytes.
-   This time we use the formulae (3) from the paper "Byte Slicing Groestl".
-   Input: a0, ..., a7
-   Output: b0, ..., b7 = MixBytes(a0,...,a7).
-   but we use the relations:
-   t_i = a_i + a_{i+3}
-   x_i = t_i + t_{i+3}
-   y_i = t_i + t+{i+2} + a_{i+6}
-   z_i = 2*x_i
-   w_i = z_i + y_{i+4}
-   v_i = 2*w_i
-   b_i = v_{i+3} + y_{i+4}
-   We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
-   and then adding v_i computed in the meantime in registers xmm0..xmm7.
-   We almost fit into 16 registers, need only 3 spills to memory.
-   This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
-   K. Matusiewicz, 2011/05/29 */
-#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
-  /* t_i = a_i + a_{i+1} */\
-  asm("movdqa xmm"tostr(b6)", xmm"tostr(a0)"");\
-  asm("movdqa xmm"tostr(b7)", xmm"tostr(a1)"");\
-  asm("pxor xmm"tostr(a0)", xmm"tostr(a1)"");\
-  asm("movdqa xmm"tostr(b0)", xmm"tostr(a2)"");\
-  asm("pxor xmm"tostr(a1)", xmm"tostr(a2)"");\
-  asm("movdqa xmm"tostr(b1)", xmm"tostr(a3)"");\
-  asm("pxor xmm"tostr(a2)", xmm"tostr(a3)"");\
-  asm("movdqa xmm"tostr(b2)", xmm"tostr(a4)"");\
-  asm("pxor xmm"tostr(a3)", xmm"tostr(a4)"");\
-  asm("movdqa xmm"tostr(b3)", xmm"tostr(a5)"");\
-  asm("pxor xmm"tostr(a4)", xmm"tostr(a5)"");\
-  asm("movdqa xmm"tostr(b4)", xmm"tostr(a6)"");\
-  asm("pxor xmm"tostr(a5)", xmm"tostr(a6)"");\
-  asm("movdqa xmm"tostr(b5)", xmm"tostr(a7)"");\
-  asm("pxor xmm"tostr(a6)", xmm"tostr(a7)"");\
-  asm("pxor xmm"tostr(a7)", xmm"tostr(b6)"");\
-  \
-  /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
-  asm("pxor xmm"tostr(b0)", xmm"tostr(a4)"");\
-  asm("pxor xmm"tostr(b6)", xmm"tostr(a4)"");\
-  asm("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\
-  asm("pxor xmm"tostr(b7)", xmm"tostr(a5)"");\
-  asm("pxor xmm"tostr(b2)", xmm"tostr(a6)"");\
-  asm("pxor xmm"tostr(b0)", xmm"tostr(a6)"");\
-  /* spill values y_4, y_5 to memory */\
-  asm("movaps [TEMP+0*16], xmm"tostr(b0)"");\
-  asm("pxor xmm"tostr(b3)", xmm"tostr(a7)"");\
-  asm("pxor xmm"tostr(b1)", xmm"tostr(a7)"");\
-  asm("movaps [TEMP+1*16], xmm"tostr(b1)"");\
-  asm("pxor xmm"tostr(b4)", xmm"tostr(a0)"");\
-  asm("pxor xmm"tostr(b2)", xmm"tostr(a0)"");\
-  /* save values t0, t1, t2 to xmm8, xmm9 and memory */\
-  asm("movdqa xmm"tostr(b0)", xmm"tostr(a0)"");\
-  asm("pxor xmm"tostr(b5)", xmm"tostr(a1)"");\
-  asm("pxor xmm"tostr(b3)", xmm"tostr(a1)"");\
-  asm("movdqa xmm"tostr(b1)", xmm"tostr(a1)"");\
-  asm("pxor xmm"tostr(b6)", xmm"tostr(a2)"");\
-  asm("pxor xmm"tostr(b4)", xmm"tostr(a2)"");\
-  asm("movaps [TEMP+2*16], xmm"tostr(a2)"");\
-  asm("pxor xmm"tostr(b7)", xmm"tostr(a3)"");\
-  asm("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\
-  \
-  /* compute x_i = t_i + t_{i+3} */\
-  asm("pxor xmm"tostr(a0)", xmm"tostr(a3)"");\
-  asm("pxor xmm"tostr(a1)", xmm"tostr(a4)"");\
-  asm("pxor xmm"tostr(a2)", xmm"tostr(a5)"");\
-  asm("pxor xmm"tostr(a3)", xmm"tostr(a6)"");\
-  asm("pxor xmm"tostr(a4)", xmm"tostr(a7)"");\
-  asm("pxor xmm"tostr(a5)", xmm"tostr(b0)"");\
-  asm("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\
-  asm("pxor xmm"tostr(a7)", [TEMP+2*16]");\
-  \
-  /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
-  /* compute w_i : add y_{i+4} */\
-  asm("movaps xmm"tostr(b1)", [ALL_1B]");\
-  MUL2(a0, b0, b1);\
-  asm("pxor xmm"tostr(a0)", [TEMP+0*16]");\
-  MUL2(a1, b0, b1);\
-  asm("pxor xmm"tostr(a1)", [TEMP+1*16]");\
-  MUL2(a2, b0, b1);\
-  asm("pxor xmm"tostr(a2)", xmm"tostr(b2)"");\
-  MUL2(a3, b0, b1);\
-  asm("pxor xmm"tostr(a3)", xmm"tostr(b3)"");\
-  MUL2(a4, b0, b1);\
-  asm("pxor xmm"tostr(a4)", xmm"tostr(b4)"");\
-  MUL2(a5, b0, b1);\
-  asm("pxor xmm"tostr(a5)", xmm"tostr(b5)"");\
-  MUL2(a6, b0, b1);\
-  asm("pxor xmm"tostr(a6)", xmm"tostr(b6)"");\
-  MUL2(a7, b0, b1);\
-  asm("pxor xmm"tostr(a7)", xmm"tostr(b7)"");\
-  \
-  /* compute v_i : double w_i      */\
-  /* add to y_4 y_5 .. v3, v4, ... */\
-  MUL2(a0, b0, b1);\
-  asm("pxor xmm"tostr(b5)", xmm"tostr(a0)"");\
-  MUL2(a1, b0, b1);\
-  asm("pxor xmm"tostr(b6)", xmm"tostr(a1)"");\
-  MUL2(a2, b0, b1);\
-  asm("pxor xmm"tostr(b7)", xmm"tostr(a2)"");\
-  MUL2(a5, b0, b1);\
-  asm("pxor xmm"tostr(b2)", xmm"tostr(a5)"");\
-  MUL2(a6, b0, b1);\
-  asm("pxor xmm"tostr(b3)", xmm"tostr(a6)"");\
-  MUL2(a7, b0, b1);\
-  asm("pxor xmm"tostr(b4)", xmm"tostr(a7)"");\
-  MUL2(a3, b0, b1);\
-  MUL2(a4, b0, b1);\
-  asm("movaps xmm"tostr(b0)", [TEMP+0*16]");\
-  asm("movaps xmm"tostr(b1)", [TEMP+1*16]");\
-  asm("pxor xmm"tostr(b0)", xmm"tostr(a3)"");\
-  asm("pxor xmm"tostr(b1)", xmm"tostr(a4)"");\
-}/*MixBytes*/
-
-#define SET_CONSTANTS(){\
-  ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\
-  ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\
-  ((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\
-  ((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\
-  ((u64*)SUBSH_MASK)[ 0] = 0x0c0f0104070b0e00ULL;\
-  ((u64*)SUBSH_MASK)[ 1] = 0x03060a0d08020509ULL;\
-  ((u64*)SUBSH_MASK)[ 2] = 0x0e090205000d0801ULL;\
-  ((u64*)SUBSH_MASK)[ 3] = 0x04070c0f0a03060bULL;\
-  ((u64*)SUBSH_MASK)[ 4] = 0x080b0306010f0a02ULL;\
-  ((u64*)SUBSH_MASK)[ 5] = 0x05000e090c04070dULL;\
-  ((u64*)SUBSH_MASK)[ 6] = 0x0a0d040702090c03ULL;\
-  ((u64*)SUBSH_MASK)[ 7] = 0x0601080b0e05000fULL;\
-  ((u64*)SUBSH_MASK)[ 8] = 0x0b0e0500030a0d04ULL;\
-  ((u64*)SUBSH_MASK)[ 9] = 0x0702090c0f060108ULL;\
-  ((u64*)SUBSH_MASK)[10] = 0x0d080601040c0f05ULL;\
-  ((u64*)SUBSH_MASK)[11] = 0x00030b0e0907020aULL;\
-  ((u64*)SUBSH_MASK)[12] = 0x0f0a0702050e0906ULL;\
-  ((u64*)SUBSH_MASK)[13] = 0x01040d080b00030cULL;\
-  ((u64*)SUBSH_MASK)[14] = 0x090c000306080b07ULL;\
-  ((u64*)SUBSH_MASK)[15] = 0x02050f0a0d01040eULL;\
-  for(i = 0; i < ROUNDS512; i++)\
-  {\
-    ((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\
-    ((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL)  ^ 0x7060504030201000ULL;\
-    ((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL)  ^ 0x8f9fafbfcfdfefffULL;\
-    ((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\
-  }\
-  ((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\
-  ((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\
-}while(0);
-
-#define Push_All_Regs() do{\
-/*  not using any...
-    asm("push rax");\
-    asm("push rbx");\
-    asm("push rcx");*/\
-}while(0);
-
-#define Pop_All_Regs() do{\
-/*  not using any...
-    asm("pop rcx");\
-    asm("pop rbx");\
-    asm("pop rax");*/\
-}while(0);
-
-/* one round
- * i = round number
- * a0-a7 = input rows
- * b0-b7 = output rows
- */
-#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
-  /* AddRoundConstant */\
-  asm ("movaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\
-  asm ("pxor   xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\
-  asm ("pxor   xmm"tostr(a1)", xmm"tostr(b1)"");\
-  asm ("pxor   xmm"tostr(a2)", xmm"tostr(b1)"");\
-  asm ("pxor   xmm"tostr(a3)", xmm"tostr(b1)"");\
-  asm ("pxor   xmm"tostr(a4)", xmm"tostr(b1)"");\
-  asm ("pxor   xmm"tostr(a5)", xmm"tostr(b1)"");\
-  asm ("pxor   xmm"tostr(a6)", xmm"tostr(b1)"");\
-  asm ("pxor   xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\
-  /* ShiftBytes + SubBytes (interleaved) */\
-  asm ("pxor xmm"tostr(b0)",  xmm"tostr(b0)"");\
-  asm ("pshufb     xmm"tostr(a0)", [SUBSH_MASK+0*16]");\
-  asm ("aesenclast xmm"tostr(a0)", xmm"tostr(b0)"");\
-  asm ("pshufb     xmm"tostr(a1)", [SUBSH_MASK+1*16]");\
-  asm ("aesenclast xmm"tostr(a1)", xmm"tostr(b0)"");\
-  asm ("pshufb     xmm"tostr(a2)", [SUBSH_MASK+2*16]");\
-  asm ("aesenclast xmm"tostr(a2)", xmm"tostr(b0)"");\
-  asm ("pshufb     xmm"tostr(a3)", [SUBSH_MASK+3*16]");\
-  asm ("aesenclast xmm"tostr(a3)", xmm"tostr(b0)"");\
-  asm ("pshufb     xmm"tostr(a4)", [SUBSH_MASK+4*16]");\
-  asm ("aesenclast xmm"tostr(a4)", xmm"tostr(b0)"");\
-  asm ("pshufb     xmm"tostr(a5)", [SUBSH_MASK+5*16]");\
-  asm ("aesenclast xmm"tostr(a5)", xmm"tostr(b0)"");\
-  asm ("pshufb     xmm"tostr(a6)", [SUBSH_MASK+6*16]");\
-  asm ("aesenclast xmm"tostr(a6)", xmm"tostr(b0)"");\
-  asm ("pshufb     xmm"tostr(a7)", [SUBSH_MASK+7*16]");\
-  asm ("aesenclast xmm"tostr(a7)", xmm"tostr(b0)"");\
-  /* MixBytes */\
-  MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
-}
-
-/* 10 rounds, P and Q in parallel */
-#define ROUNDS_P_Q(){\
-  ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
-  ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
-  ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
-  ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
-  ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
-  ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
-  ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
-  ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
-  ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
-  ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
-}
-
-/* Matrix Transpose Step 1
- * input is a 512-bit state with two columns in one xmm
- * output is a 512-bit state with two rows in one xmm
- * inputs: i0-i3
- * outputs: i0, o1-o3
- * clobbers: t0
- */
-#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
-  asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\
-  \
-  asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\
-  asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\
-  asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\
-  asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\
-  \
-  asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\
-  asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\
-  \
-  asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\
-  asm ("punpckhwd xmm"tostr(o1)", xmm"tostr(i1)"");\
-  asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\
-  asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i3)"");\
-  \
-  asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\
-  asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\
-  asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\
-  asm ("pshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\
-  \
-  asm ("movdqa xmm"tostr(o2)", xmm"tostr(i0)"");\
-  asm ("movdqa xmm"tostr(o3)", xmm"tostr(o1)"");\
-  \
-  asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\
-  asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t0)"");\
-  asm ("punpckhdq xmm"tostr(o2)", xmm"tostr(i2)"");\
-  asm ("punpckhdq xmm"tostr(o3)", xmm"tostr(t0)"");\
-}/**/
-
-/* Matrix Transpose Step 2
- * input are two 512-bit states with two rows in one xmm
- * output are two 512-bit states with one row of each state in one xmm
- * inputs: i0-i3 = P, i4-i7 = Q
- * outputs: (i0, o1-o7) = (P|Q)
- * possible reassignments: (output reg = input reg)
- * * i1 -> o3-7
- * * i2 -> o5-7
- * * i3 -> o7
- * * i4 -> o3-7
- * * i5 -> o6-7
- */
-#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
-  asm ("movdqa     xmm"tostr(o1)", xmm"tostr(i0)"");\
-  asm ("movdqa     xmm"tostr(o2)", xmm"tostr(i1)"");\
-  asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\
-  asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i4)"");\
-  asm ("movdqa     xmm"tostr(o3)", xmm"tostr(i1)"");\
-  asm ("movdqa     xmm"tostr(o4)", xmm"tostr(i2)"");\
-  asm ("punpcklqdq xmm"tostr(o2)", xmm"tostr(i5)"");\
-  asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i5)"");\
-  asm ("movdqa     xmm"tostr(o5)", xmm"tostr(i2)"");\
-  asm ("movdqa     xmm"tostr(o6)", xmm"tostr(i3)"");\
-  asm ("punpcklqdq xmm"tostr(o4)", xmm"tostr(i6)"");\
-  asm ("punpckhqdq xmm"tostr(o5)", xmm"tostr(i6)"");\
-  asm ("movdqa     xmm"tostr(o7)", xmm"tostr(i3)"");\
-  asm ("punpcklqdq xmm"tostr(o6)", xmm"tostr(i7)"");\
-  asm ("punpckhqdq xmm"tostr(o7)", xmm"tostr(i7)"");\
-}/**/
-
-/* Matrix Transpose Inverse Step 2
- * input are two 512-bit states with one row of each state in one xmm
- * output are two 512-bit states with two rows in one xmm
- * inputs: i0-i7 = (P|Q)
- * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
- */
-#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
-  asm ("movdqa     xmm"tostr(o0)", xmm"tostr(i0)"");\
-  asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\
-  asm ("punpckhqdq xmm"tostr(o0)", xmm"tostr(i1)"");\
-  asm ("movdqa     xmm"tostr(o1)", xmm"tostr(i2)"");\
-  asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\
-  asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i3)"");\
-  asm ("movdqa     xmm"tostr(o2)", xmm"tostr(i4)"");\
-  asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\
-  asm ("punpckhqdq xmm"tostr(o2)", xmm"tostr(i5)"");\
-  asm ("movdqa     xmm"tostr(o3)", xmm"tostr(i6)"");\
-  asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\
-  asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i7)"");\
-}/**/
-
-/* Matrix Transpose Output Step 2
- * input is one 512-bit state with two rows in one xmm
- * output is one 512-bit state with one row in the low 64-bits of one xmm
- * inputs: i0,i2,i4,i6 = S
- * outputs: (i0-7) = (0|S)
- */
-#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
-  asm ("pxor xmm"tostr(t0)", xmm"tostr(t0)"");\
-  asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\
-  asm ("movdqa xmm"tostr(i3)", xmm"tostr(i2)"");\
-  asm ("movdqa xmm"tostr(i5)", xmm"tostr(i4)"");\
-  asm ("movdqa xmm"tostr(i7)", xmm"tostr(i6)"");\
-  asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(t0)"");\
-  asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(t0)"");\
-  asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t0)"");\
-  asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t0)"");\
-  asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t0)"");\
-  asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t0)"");\
-  asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t0)"");\
-  asm ("punpckhqdq xmm"tostr(i7)", xmm"tostr(t0)"");\
-}/**/
-
-/* Matrix Transpose Output Inverse Step 2
- * input is one 512-bit state with one row in the low 64-bits of one xmm
- * output is one 512-bit state with two rows in one xmm
- * inputs: i0-i7 = (0|S)
- * outputs: (i0, i2, i4, i6) = S
- */
-#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
-  asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\
-  asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\
-  asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\
-  asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\
-}/**/
-
-
-void INIT256(u64* h)
-{
-  /* System V AMD64 calling convention: */
-  /* chaining value CV in rdi    */
-
-  asm (".intel_syntax noprefix");
-  asm volatile ("emms");
-
-  /* load IV into registers xmm12 - xmm15 */
-  asm ("movaps xmm12, [rdi+0*16]");
-  asm ("movaps xmm13, [rdi+1*16]");
-  asm ("movaps xmm14, [rdi+2*16]");
-  asm ("movaps xmm15, [rdi+3*16]");
-
-  /* transform chaining value from column ordering into row ordering */
-  /* we put two rows (64 bit) of the IV into one 128-bit XMM register */
-  Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
-
-  /* store transposed IV */
-  asm ("movaps [rdi+0*16], xmm12");
-  asm ("movaps [rdi+1*16], xmm2");
-  asm ("movaps [rdi+2*16], xmm6");
-  asm ("movaps [rdi+3*16], xmm7");
-
-  asm volatile ("emms");
-  asm (".att_syntax noprefix");
-}
-
-void TF512(u64* h, u64* m)
-{
-  /* System V AMD64 calling convention: */
-  /* chaining value CV in rdi    */
-  /* message M in rsi            */
-
-#ifdef IACA_TRACE
-  IACA_START;
-#endif
-
-  asm (".intel_syntax noprefix");
-  Push_All_Regs();
-
-  /* load message into registers xmm12 - xmm15 (Q = message) */
-  asm ("movaps xmm12, [rsi+0*16]");
-  asm ("movaps xmm13, [rsi+1*16]");
-  asm ("movaps xmm14, [rsi+2*16]");
-  asm ("movaps xmm15, [rsi+3*16]");
-
-  /* transform message M from column ordering into row ordering */
-  /* we first put two rows (2x64 bit) of the message into one 128-bit xmm register */
-  Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
-
-  /* load previous chaining value */
-  /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */
-  asm ("movaps xmm8, [rdi+0*16]");
-  asm ("movaps xmm0, [rdi+1*16]");
-  asm ("movaps xmm4, [rdi+2*16]");
-  asm ("movaps xmm5, [rdi+3*16]");
-
-  /* xor message to CV get input of P */
-  /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
-  asm ("pxor xmm8, xmm12");
-  asm ("pxor xmm0, xmm2");
-  asm ("pxor xmm4, xmm6");
-  asm ("pxor xmm5, xmm7");
-
-  /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
-  /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
-  /* result: the 8 rows of P and Q in xmm8 - xmm15 */
-  Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15);
-
-  /* compute the two permutations P and Q in parallel */
-  ROUNDS_P_Q();
-
-  /* unpack again to get two rows of P or two rows of Q in one xmm register */
-  Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3);
-
-  /* xor output of P and Q */
-  /* result: P(CV+M)+Q(M) in xmm0...xmm3 */
-  asm ("pxor xmm0, xmm8");
-  asm ("pxor xmm1, xmm10");
-  asm ("pxor xmm2, xmm12");
-  asm ("pxor xmm3, xmm14");
-
-  /* xor CV (feed-forward) */
-  /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
-  asm ("pxor xmm0, [rdi+0*16]");
-  asm ("pxor xmm1, [rdi+1*16]");
-  asm ("pxor xmm2, [rdi+2*16]");
-  asm ("pxor xmm3, [rdi+3*16]");
-
-  /* store CV */
-  asm ("movaps [rdi+0*16], xmm0");
-  asm ("movaps [rdi+1*16], xmm1");
-  asm ("movaps [rdi+2*16], xmm2");
-  asm ("movaps [rdi+3*16], xmm3");
-
-  Pop_All_Regs();
-  asm (".att_syntax noprefix");
-
-#ifdef IACA_TRACE
-  IACA_END;
-#endif
-  return;
-}
-
-void OF512(u64* h)
-{
-  /* System V AMD64 calling convention: */
-  /* chaining value CV in rdi    */
-
-  asm (".intel_syntax noprefix");
-  Push_All_Regs();
-
-  /* load CV into registers xmm8, xmm10, xmm12, xmm14 */
-  asm ("movaps xmm8,  [rdi+0*16]");
-  asm ("movaps xmm10, [rdi+1*16]");
-  asm ("movaps xmm12, [rdi+2*16]");
-  asm ("movaps xmm14, [rdi+3*16]");
-
-  /* there are now 2 rows of the CV in one xmm register */
-  /* unpack to get 1 row of P (64 bit) into one half of an xmm register */
-  /* result: the 8 input rows of P in xmm8 - xmm15 */
-  Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0);
-
-  /* compute the permutation P */
-  /* result: the output of P(CV) in xmm8 - xmm15 */
-  ROUNDS_P_Q();
-
-  /* unpack again to get two rows of P in one xmm register */
-  /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
-  Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15);
-
-  /* xor CV to P output (feed-forward) */
-  /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
-  asm ("pxor xmm8,  [rdi+0*16]");
-  asm ("pxor xmm10, [rdi+1*16]");
-  asm ("pxor xmm12, [rdi+2*16]");
-  asm ("pxor xmm14, [rdi+3*16]");
-
-  /* transform state back from row ordering into column ordering */
-  /* result: final hash value in xmm9, xmm11 */
-  Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0);
-
-  /* we only need to return the truncated half of the state */
-  asm ("movaps [rdi+2*16], xmm9");
-  asm ("movaps [rdi+3*16], xmm11");
-
-  Pop_All_Regs();
-  asm (".att_syntax noprefix");
-
-  return;
-}
-
--- a/algo/groestl/aes_ni/groestl256-asm-avx.h
+++ b/algo/groestl/aes_ni/groestl256-asm-avx.h
@@ -1,519 +0,0 @@
-/* groestl-asm-avx.h     Aug 2011
- *
- * Groestl implementation with inline assembly using ssse3, sse4.1, aes and avx
- * instructions.
- * Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
- *
- * This code is placed in the public domain
- */
-
-#include "hash-groestl256.h"
-
-/* global variables  */
-__attribute__ ((aligned (32))) unsigned char ROUND_CONST_Lx[16];
-__attribute__ ((aligned (32))) unsigned char ROUND_CONST_L0[ROUNDS512*16];
-__attribute__ ((aligned (32))) unsigned char ROUND_CONST_L7[ROUNDS512*16];
-__attribute__ ((aligned (32))) unsigned char ROUND_CONST_P[ROUNDS1024*16];
-__attribute__ ((aligned (32))) unsigned char ROUND_CONST_Q[ROUNDS1024*16];
-__attribute__ ((aligned (32))) unsigned char TRANSP_MASK[16];
-__attribute__ ((aligned (32))) unsigned char SUBSH_MASK[8*16];
-__attribute__ ((aligned (32))) unsigned char ALL_1B[32];
-__attribute__ ((aligned (32))) unsigned char ALL_FF[32];
-
-/* temporary variables  */
-__attribute__ ((aligned (32))) unsigned char TEMP[6*32];
-
-
-#define tos(a)    #a
-#define tostr(a)  tos(a)
-
-#define SET_CONSTANTS(){\
-  ((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\
-  ((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\
-  ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\
-  ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\
-  ((u64*)SUBSH_MASK)[ 0] = 0x0c0f0104070b0e00ULL;\
-  ((u64*)SUBSH_MASK)[ 1] = 0x03060a0d08020509ULL;\
-  ((u64*)SUBSH_MASK)[ 2] = 0x0e090205000d0801ULL;\
-  ((u64*)SUBSH_MASK)[ 3] = 0x04070c0f0a03060bULL;\
-  ((u64*)SUBSH_MASK)[ 4] = 0x080b0306010f0a02ULL;\
-  ((u64*)SUBSH_MASK)[ 5] = 0x05000e090c04070dULL;\
-  ((u64*)SUBSH_MASK)[ 6] = 0x0a0d040702090c03ULL;\
-  ((u64*)SUBSH_MASK)[ 7] = 0x0601080b0e05000fULL;\
-  ((u64*)SUBSH_MASK)[ 8] = 0x0b0e0500030a0d04ULL;\
-  ((u64*)SUBSH_MASK)[ 9] = 0x0702090c0f060108ULL;\
-  ((u64*)SUBSH_MASK)[10] = 0x0d080601040c0f05ULL;\
-  ((u64*)SUBSH_MASK)[11] = 0x00030b0e0907020aULL;\
-  ((u64*)SUBSH_MASK)[12] = 0x0f0a0702050e0906ULL;\
-  ((u64*)SUBSH_MASK)[13] = 0x01040d080b00030cULL;\
-  ((u64*)SUBSH_MASK)[14] = 0x090c000306080b07ULL;\
-  ((u64*)SUBSH_MASK)[15] = 0x02050f0a0d01040eULL;\
-  for(i = 0; i < ROUNDS512; i++)\
-  {\
-    ((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\
-    ((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL)  ^ 0x7060504030201000ULL;\
-    ((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL)  ^ 0x8f9fafbfcfdfefffULL;\
-    ((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\
-  }\
-  ((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\
-  ((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\
-}while(0);
-
-#define Push_All_Regs() do{\
-/*  not using any...
-    asm("push rax");\
-    asm("push rbx");\
-    asm("push rcx");*/\
-}while(0);
-
-#define Pop_All_Regs() do{\
-/*  not using any...
-    asm("pop rcx");\
-    asm("pop rbx");\
-    asm("pop rax");*/\
-}while(0);
-
-/* xmm[i] will be multiplied by 2
- * xmm[j] will be lost
- * xmm[k] has to be all 0x1b
- * xmm[z] has to be zero */
-#define VMUL2(i, j, k, z){\
-  asm("vpcmpgtb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(i)"");\
-  asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\
-  asm("vpand xmm"tostr(j)", xmm"tostr(j)", xmm"tostr(k)"");\
-  asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\
-}/**/
-
-/* xmm[i] will be multiplied by 2
- * xmm[j] will be lost
- * xmm[k] has to be all 0x1b
- * xmm[z] has to be zero */
-#define VMUL2v2(i, j, k, z){\
-  asm("vpblendvb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(k)", xmm"tostr(i)"");\
-  asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\
-  asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\
-}/**/
-
-/* Yet another implementation of MixBytes.
-   This time we use the formulae (3) from the paper "Byte Slicing Groestl".
-   Input: a0, ..., a7
-   Output: b0, ..., b7 = MixBytes(a0,...,a7).
-   but we use the relations:
-   t_i = a_i + a_{i+1}
-   x_i = t_i + t_{i+3}
-   y_i = t_i + t_{i+2} + a_{i+6}
-   z_i = 2*x_i
-   w_i = z_i + y_{i+4}
-   v_i = 2*w_i
-   b_i = v_{i+3} + y_{i+4}
-   We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
-   and then adding v_i computed in the meantime in registers xmm0..xmm7.
-   We almost fit into 16 registers, need only 3 spills to memory.
-   This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
-   K. Matusiewicz, 2011/05/29 */
-#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
-  /* b0..b7 = a2 a3 ... a0 a1 */\
-  asm("vmovdqa xmm"tostr(b0)", xmm"tostr(a2)"");\
-  asm("vmovdqa xmm"tostr(b1)", xmm"tostr(a3)"");\
-  asm("vmovdqa xmm"tostr(b2)", xmm"tostr(a4)"");\
-  asm("vmovdqa xmm"tostr(b3)", xmm"tostr(a5)"");\
-  asm("vmovdqa xmm"tostr(b4)", xmm"tostr(a6)"");\
-  asm("vmovdqa xmm"tostr(b5)", xmm"tostr(a7)"");\
-  asm("vmovdqa xmm"tostr(b6)", xmm"tostr(a0)"");\
-  asm("vmovdqa xmm"tostr(b7)", xmm"tostr(a1)"");\
-  \
-  /* t_i = a_i + a_{i+1} */\
-  asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(a1)"");\
-  asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(a2)"");\
-  asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(a3)"");\
-  asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(a4)"");\
-  asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(a5)"");\
-  asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(a6)"");\
-  asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(a7)"");\
-  asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b6)"");\
-  \
-  /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
-  asm("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(a4)"");\
-  asm("vpxor xmm"tostr(b1)", xmm"tostr(b1)", xmm"tostr(a5)"");\
-  asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(a6)"");\
-  asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a7)"");\
-  asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a0)"");\
-  asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a1)"");\
-  asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a2)"");\
-  asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a3)"");\
-  \
-  asm("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(a6)"");\
-  asm("vpxor xmm"tostr(b1)", xmm"tostr(b1)", xmm"tostr(a7)"");\
-  asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(a0)"");\
-  asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a1)"");\
-  asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a2)"");\
-  asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a3)"");\
-  asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a4)"");\
-  asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a5)"");\
-  \
-  /* spill values y_4, y_5, y_6 to memory */\
-  asm("vmovaps [TEMP+0*16], xmm"tostr(b0)"");\
-  asm("vmovaps [TEMP+1*16], xmm"tostr(b1)"");\
-  asm("vmovaps [TEMP+2*16], xmm"tostr(b2)"");\
-  \
-  /* save values t0, t1, t2 to xmm8, xmm9 and memory */\
-  asm("vmovdqa xmm"tostr(b0)", xmm"tostr(a0)"");\
-  asm("vmovdqa xmm"tostr(b1)", xmm"tostr(a1)"");\
-  asm("vmovaps [TEMP+3*16], xmm"tostr(a2)"");\
-  \
-  /* compute x_i = t_i + t_{i+3} */\
-  asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(a3)"");\
-  asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(a4)"");\
-  asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(a5)"");\
-  asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(a6)"");\
-  asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(a7)"");\
-  asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b0)"");\
-  asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b1)"");\
-  asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", [TEMP+3*16]");\
-  \
-  /*compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
-  asm("vmovaps xmm"tostr(b1)", [ALL_1B]");\
-  asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(b2)"");\
-  VMUL2(a7, b0, b1, b2);\
-  VMUL2(a6, b0, b1, b2);\
-  VMUL2(a5, b0, b1, b2);\
-  VMUL2(a4, b0, b1, b2);\
-  VMUL2(a3, b0, b1, b2);\
-  VMUL2(a2, b0, b1, b2);\
-  VMUL2(a1, b0, b1, b2);\
-  VMUL2(a0, b0, b1, b2);\
-  \
-  /* compute w_i :  add y_{i+4} */\
-  asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", [TEMP+0*16]");\
-  asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", [TEMP+1*16]");\
-  asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", [TEMP+2*16]");\
-  asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b3)"");\
-  asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b4)"");\
-  asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b5)"");\
-  asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b6)"");\
-  asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b7)"");\
-  \
-  /*compute v_i: double w_i */\
-  VMUL2(a0, b0, b1, b2);\
-  VMUL2(a1, b0, b1, b2);\
-  VMUL2(a2, b0, b1, b2);\
-  VMUL2(a3, b0, b1, b2);\
-  VMUL2(a4, b0, b1, b2);\
-  VMUL2(a5, b0, b1, b2);\
-  VMUL2(a6, b0, b1, b2);\
-  VMUL2(a7, b0, b1, b2);\
-  \
-  /* add to y_4 y_5 .. v3, v4, ... */\
-  asm("vpxor xmm"tostr(b0)", xmm"tostr(a3)", [TEMP+0*16]");\
-  asm("vpxor xmm"tostr(b1)", xmm"tostr(a4)", [TEMP+1*16]");\
-  asm("vpxor xmm"tostr(b2)", xmm"tostr(a5)", [TEMP+2*16]");\
-  asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a6)"");\
-  asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a7)"");\
-  asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a0)"");\
-  asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a1)"");\
-  asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a2)"");\
-}/*MixBytes*/
-
-/* one round
- * i = round number
- * a0-a7 = input rows
- * b0-b7 = output rows
- */
-#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
-  /* AddRoundConstant */\
-  asm ("vmovaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\
-  asm ("vpxor   xmm"tostr(a0)", xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\
-  asm ("vpxor   xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(b1)"");\
-  asm ("vpxor   xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(b1)"");\
-  asm ("vpxor   xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b1)"");\
-  asm ("vpxor   xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b1)"");\
-  asm ("vpxor   xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b1)"");\
-  asm ("vpxor   xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b1)"");\
-  asm ("vpxor   xmm"tostr(a7)", xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\
-  /* ShiftBytes + SubBytes (interleaved) */\
-  asm ("vpxor xmm"tostr(b0)",  xmm"tostr(b0)",  xmm"tostr(b0)"");\
-  asm ("vpshufb     xmm"tostr(a0)", xmm"tostr(a0)", [SUBSH_MASK+0*16]");\
-  asm ("vaesenclast xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(b0)"");\
-  asm ("vpshufb     xmm"tostr(a1)", xmm"tostr(a1)", [SUBSH_MASK+1*16]");\
-  asm ("vaesenclast xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(b0)"");\
-  asm ("vpshufb     xmm"tostr(a2)", xmm"tostr(a2)", [SUBSH_MASK+2*16]");\
-  asm ("vaesenclast xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(b0)"");\
-  asm ("vpshufb     xmm"tostr(a3)", xmm"tostr(a3)", [SUBSH_MASK+3*16]");\
-  asm ("vaesenclast xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b0)"");\
-  asm ("vpshufb     xmm"tostr(a4)", xmm"tostr(a4)", [SUBSH_MASK+4*16]");\
-  asm ("vaesenclast xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b0)"");\
-  asm ("vpshufb     xmm"tostr(a5)", xmm"tostr(a5)", [SUBSH_MASK+5*16]");\
-  asm ("vaesenclast xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b0)"");\
-  asm ("vpshufb     xmm"tostr(a6)", xmm"tostr(a6)", [SUBSH_MASK+6*16]");\
-  asm ("vaesenclast xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b0)"");\
-  asm ("vpshufb     xmm"tostr(a7)", xmm"tostr(a7)", [SUBSH_MASK+7*16]");\
-  asm ("vaesenclast xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b0)"");\
-  /* MixBytes */\
-  MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
-}
-
-/* 10 rounds, P and Q in parallel */
-#define ROUNDS_P_Q(){\
-  ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
-  ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
-  ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
-  ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
-  ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
-  ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
-  ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
-  ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
-  ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
-  ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
-}
-
-/* Matrix Transpose Step 1
- * input is a 512-bit state with two columns in one xmm
- * output is a 512-bit state with two rows in one xmm
- * inputs: i0-i3
-
- * outputs: i0, o1-o3
- * clobbers: t0
- */
-#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
-  asm ("vmovaps xmm"tostr(t0)", [TRANSP_MASK]");\
-\
-  asm ("vpshufb xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(t0)"");\
-  asm ("vpshufb xmm"tostr(i1)", xmm"tostr(i1)", xmm"tostr(t0)"");\
-  asm ("vpshufb xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(t0)"");\
-  asm ("vpshufb xmm"tostr(i3)", xmm"tostr(i3)", xmm"tostr(t0)"");\
-\
-  asm ("vpunpckhwd xmm"tostr(o1)", xmm"tostr(i0)", xmm"tostr(i1)"");\
-  asm ("vpunpcklwd xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\
-  asm ("vpunpckhwd xmm"tostr(t0)", xmm"tostr(i2)", xmm"tostr(i3)"");\
-  asm ("vpunpcklwd xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\
-\
-  asm ("vpshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\
-  asm ("vpshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\
-  asm ("vpshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\
-  asm ("vpshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\
-\
-  asm ("vpunpckhdq xmm"tostr(o2)", xmm"tostr(i0)", xmm"tostr(i2)"");\
-  asm ("vpunpckhdq xmm"tostr(o3)", xmm"tostr(o1)", xmm"tostr(t0)"");\
-  asm ("vpunpckldq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i2)"");\
-  asm ("vpunpckldq xmm"tostr(o1)", xmm"tostr(o1)", xmm"tostr(t0)"");\
-}/**/
-
-/* Matrix Transpose Step 2
- * input are two 512-bit states with two rows in one xmm
- * output are two 512-bit states with one row of each state in one xmm
- * inputs: i0-i3 = P, i4-i7 = Q
- * outputs: (i0, o1-o7) = (P|Q)
- * possible reassignments: (output reg = input reg)
- * * i1 -> o3-7
- * * i2 -> o5-7
- * * i3 -> o7
- * * i4 -> o3-7
- * * i5 -> o6-7
- */
-#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
-  asm ("vpunpckhqdq xmm"tostr(o1)", xmm"tostr(i0)", xmm"tostr(i4)"");\
-  asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i4)"");\
-  asm ("vpunpcklqdq xmm"tostr(o2)", xmm"tostr(i1)", xmm"tostr(i5)"");\
-  asm ("vpunpckhqdq xmm"tostr(o3)", xmm"tostr(i1)", xmm"tostr(i5)"");\
-  asm ("vpunpcklqdq xmm"tostr(o4)", xmm"tostr(i2)", xmm"tostr(i6)"");\
-  asm ("vpunpckhqdq xmm"tostr(o5)", xmm"tostr(i2)", xmm"tostr(i6)"");\
-  asm ("vpunpcklqdq xmm"tostr(o6)", xmm"tostr(i3)", xmm"tostr(i7)"");\
-  asm ("vpunpckhqdq xmm"tostr(o7)", xmm"tostr(i3)", xmm"tostr(i7)"");\
-}/**/
-
-/* Matrix Transpose Inverse Step 2
- * input are two 512-bit states with one row of each state in one xmm
- * output are two 512-bit states with two rows in one xmm
- * inputs: i0-i7 = (P|Q)
- * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
- */
-#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
-  asm ("vpunpckhqdq xmm"tostr(o0)", xmm"tostr(i0)", xmm"tostr(i1)"");\
-  asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\
-  asm ("vpunpckhqdq xmm"tostr(o1)", xmm"tostr(i2)", xmm"tostr(i3)"");\
-  asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\
-  asm ("vpunpckhqdq xmm"tostr(o2)", xmm"tostr(i4)", xmm"tostr(i5)"");\
-  asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\
-  asm ("vpunpckhqdq xmm"tostr(o3)", xmm"tostr(i6)", xmm"tostr(i7)"");\
-  asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\
-}/**/
-
-/* Matrix Transpose Output Step 2
- * input is one 512-bit state with two rows in one xmm
- * output is one 512-bit state with one row in the low 64-bits of one xmm
- * inputs: i0,i2,i4,i6 = S
- * outputs: (i0-7) = (0|S)
- */
-#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
-  asm ("vpxor xmm"tostr(t0)", xmm"tostr(t0)", xmm"tostr(t0)"");\
-  asm ("vpunpckhqdq xmm"tostr(i1)", xmm"tostr(i0)", xmm"tostr(t0)"");\
-  asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(t0)"");\
-  asm ("vpunpckhqdq xmm"tostr(i3)", xmm"tostr(i2)", xmm"tostr(t0)"");\
-  asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(t0)"");\
-  asm ("vpunpckhqdq xmm"tostr(i5)", xmm"tostr(i4)", xmm"tostr(t0)"");\
-  asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(t0)"");\
-  asm ("vpunpckhqdq xmm"tostr(i7)", xmm"tostr(i6)", xmm"tostr(t0)"");\
-  asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(t0)"");\
-}/**/
-
-/* Matrix Transpose Output Inverse Step 2
- * input is one 512-bit state with one row in the low 64-bits of one xmm
- * output is one 512-bit state with two rows in one xmm
- * inputs: i0-i7 = (0|S)
- * outputs: (i0, i2, i4, i6) = S
- */
-#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
-  asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\
-  asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\
-  asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\
-  asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\
-}/**/
-
-
-void INIT256(u64* h)
-{
-  /* System V AMD64 calling convention: */
-  /* chaining value CV in rdi    */
-
-  asm (".intel_syntax noprefix");
-  asm volatile ("emms");
-
-  /* load IV into registers xmm12 - xmm15 */
-  asm ("vmovaps xmm12, [rdi+0*16]");
-  asm ("vmovaps xmm13, [rdi+1*16]");
-  asm ("vmovaps xmm14, [rdi+2*16]");
-  asm ("vmovaps xmm15, [rdi+3*16]");
-
-  /* transform chaining value from column ordering into row ordering */
-  /* we put two rows (64 bit) of the IV into one 128-bit XMM register */
-  Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
-
-  /* store transposed IV */
-  asm ("vmovaps [rdi+0*16], xmm12");
-  asm ("vmovaps [rdi+1*16], xmm2");
-  asm ("vmovaps [rdi+2*16], xmm6");
-  asm ("vmovaps [rdi+3*16], xmm7");
-
-  asm volatile ("emms");
-  asm (".att_syntax noprefix");
-}
-
-void TF512(u64* h, u64* m)
-{
-  /* System V AMD64 calling convention: */
-  /* chaining value CV in rdi    */
-  /* message M in rsi            */
-
-#ifdef IACA_TRACE
-  IACA_START;
-#endif
-
-  asm (".intel_syntax noprefix");
-  Push_All_Regs();
-
-  /* load message into registers xmm12 - xmm15 (Q = message) */
-  asm ("vmovaps xmm12, [rsi+0*16]");
-  asm ("vmovaps xmm13, [rsi+1*16]");
-  asm ("vmovaps xmm14, [rsi+2*16]");
-  asm ("vmovaps xmm15, [rsi+3*16]");
-
-  /* transform message M from column ordering into row ordering */
-  /* we first put two rows (64 bit) of the message into one 128-bit xmm register */
-  Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
-
-  /* load previous chaining value and xor message to CV to get input of P */
-  /* we first put two rows (2x64 bit) of the CV into one 128-bit xmm register */
-  /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
-  asm ("vpxor xmm8, xmm12, [rdi+0*16]");
-  asm ("vpxor xmm0, xmm2,  [rdi+1*16]");
-  asm ("vpxor xmm4, xmm6,  [rdi+2*16]");
-  asm ("vpxor xmm5, xmm7,  [rdi+3*16]");
-
-  /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
-  /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
-  /* result: the 8 rows of P and Q in xmm8 - xmm15 */
-  Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15);
-
-  /* compute the two permutations P and Q in parallel */
-  ROUNDS_P_Q();
-
-  /* unpack again to get two rows of P or two rows of Q in one xmm register */
-  Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3);
-
-  /* xor output of P and Q */
-  /* result: P(CV+M)+Q(M) in xmm0...xmm3 */
-  asm ("vpxor xmm0, xmm0, xmm8");
-  asm ("vpxor xmm1, xmm1, xmm10");
-  asm ("vpxor xmm2, xmm2, xmm12");
-  asm ("vpxor xmm3, xmm3, xmm14");
-
-  /* xor CV (feed-forward) */
-  /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
-  asm ("vpxor xmm0, xmm0, [rdi+0*16]");
-  asm ("vpxor xmm1, xmm1, [rdi+1*16]");
-  asm ("vpxor xmm2, xmm2, [rdi+2*16]");
-  asm ("vpxor xmm3, xmm3, [rdi+3*16]");
-
-  /* store CV */
-  asm ("vmovaps [rdi+0*16], xmm0");
-  asm ("vmovaps [rdi+1*16], xmm1");
-  asm ("vmovaps [rdi+2*16], xmm2");
-  asm ("vmovaps [rdi+3*16], xmm3");
-
-  Pop_All_Regs();
-  asm (".att_syntax noprefix");
-
-#ifdef IACA_TRACE
-  IACA_END;
-#endif
-  return;
-}
-
-void OF512(u64* h)
-{
-  /* System V AMD64 calling convention: */
-  /* chaining value CV in rdi    */
-
-  asm (".intel_syntax noprefix");
-  Push_All_Regs();
-
-  /* load CV into registers xmm8, xmm10, xmm12, xmm14 */
-  asm ("vmovaps xmm8,  [rdi+0*16]");
-  asm ("vmovaps xmm10, [rdi+1*16]");
-  asm ("vmovaps xmm12, [rdi+2*16]");
-  asm ("vmovaps xmm14, [rdi+3*16]");
-
-  /* there are now 2 rows of the CV in one xmm register */
-  /* unpack to get 1 row of P (64 bit) into one half of an xmm register */
-  /* result: the 8 input rows of P in xmm8 - xmm15 */
-  Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0);
-
-  /* compute the permutation P */
-  /* result: the output of P(CV) in xmm8 - xmm15 */
-  ROUNDS_P_Q();
-
-  /* unpack again to get two rows of P in one xmm register */
-  /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
-  Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15);
-
-  /* xor CV to P output (feed-forward) */
-  /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
-  asm ("vpxor xmm8,  xmm8,  [rdi+0*16]");
-  asm ("vpxor xmm10, xmm10, [rdi+1*16]");
-  asm ("vpxor xmm12, xmm12, [rdi+2*16]");
-  asm ("vpxor xmm14, xmm14, [rdi+3*16]");
-
-  /* transform state back from row ordering into column ordering */
-  /* result: final hash value in xmm9, xmm11 */
-  Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0);
-
-  /* we only need to return the truncated half of the state */
-  asm ("vmovaps [rdi+2*16], xmm9");
-  asm ("vmovaps [rdi+3*16], xmm11");
-
-  Pop_All_Regs();
-  asm (".att_syntax noprefix");
-
-  return;
-}
-
--- a/algo/groestl/aes_ni/groestl256-asm-vperm.h
+++ b/algo/groestl/aes_ni/groestl256-asm-vperm.h
@@ -1,856 +0,0 @@
-/* groestl-asm-vperm.h     Aug 2011
- *
- * Groestl implementation with inline assembly using ssse3 instructions.
- * Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
- *
- * Based on the vperm and aes_ni implementations of the hash function Groestl
- * by Cagdas Calik <ccalik@metu.edu.tr> http://www.metu.edu.tr/~ccalik/
- * Institute of Applied Mathematics, Middle East Technical University, Turkey
- *
- * This code is placed in the public domain
- */
-
-#include "hash-groestl256.h"
-
-/* global constants  */
-__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Lx[16];
-__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L0[ROUNDS512*16];
-__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L7[ROUNDS512*16];
-__attribute__ ((aligned (16))) unsigned char ROUND_CONST_P[ROUNDS1024*16];
-__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Q[ROUNDS1024*16];
-__attribute__ ((aligned (16))) unsigned char TRANSP_MASK[16];
-__attribute__ ((aligned (16))) unsigned char SUBSH_MASK[8*16];
-__attribute__ ((aligned (16))) unsigned char ALL_0F[16];
-__attribute__ ((aligned (16))) unsigned char ALL_15[16];
-__attribute__ ((aligned (16))) unsigned char ALL_1B[16];
-__attribute__ ((aligned (16))) unsigned char ALL_63[16];
-__attribute__ ((aligned (16))) unsigned char ALL_FF[16];
-__attribute__ ((aligned (16))) unsigned char VPERM_IPT[2*16];
-__attribute__ ((aligned (16))) unsigned char VPERM_OPT[2*16];
-__attribute__ ((aligned (16))) unsigned char VPERM_INV[2*16];
-__attribute__ ((aligned (16))) unsigned char VPERM_SB1[2*16];
-__attribute__ ((aligned (16))) unsigned char VPERM_SB2[2*16];
-__attribute__ ((aligned (16))) unsigned char VPERM_SB4[2*16];
-__attribute__ ((aligned (16))) unsigned char VPERM_SBO[2*16];
-
-/* temporary variables  */
-__attribute__ ((aligned (16))) unsigned char TEMP_MUL1[8*16];
-__attribute__ ((aligned (16))) unsigned char TEMP_MUL2[8*16];
-__attribute__ ((aligned (16))) unsigned char TEMP_MUL4[1*16];
-__attribute__ ((aligned (16))) unsigned char QTEMP[8*16];
-__attribute__ ((aligned (16))) unsigned char TEMP[8*16];
-
-
-#define tos(a)    #a
-#define tostr(a)  tos(a)
-
-#define SET_SHARED_CONSTANTS(){\
-  ((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\
-  ((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\
-  ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\
-  ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\
-  ((u64*)ALL_63)[ 0] = 0x6363636363636363ULL;\
-  ((u64*)ALL_63)[ 1] = 0x6363636363636363ULL;\
-  ((u64*)ALL_0F)[ 0] = 0x0F0F0F0F0F0F0F0FULL;\
-  ((u64*)ALL_0F)[ 1] = 0x0F0F0F0F0F0F0F0FULL;\
-  ((u64*)VPERM_IPT)[ 0] = 0x4C01307D317C4D00ULL;\
-  ((u64*)VPERM_IPT)[ 1] = 0xCD80B1FCB0FDCC81ULL;\
-  ((u64*)VPERM_IPT)[ 2] = 0xC2B2E8985A2A7000ULL;\
-  ((u64*)VPERM_IPT)[ 3] = 0xCABAE09052227808ULL;\
-  ((u64*)VPERM_OPT)[ 0] = 0x01EDBD5150BCEC00ULL;\
-  ((u64*)VPERM_OPT)[ 1] = 0xE10D5DB1B05C0CE0ULL;\
-  ((u64*)VPERM_OPT)[ 2] = 0xFF9F4929D6B66000ULL;\
-  ((u64*)VPERM_OPT)[ 3] = 0xF7974121DEBE6808ULL;\
-  ((u64*)VPERM_INV)[ 0] = 0x01040A060F0B0780ULL;\
-  ((u64*)VPERM_INV)[ 1] = 0x030D0E0C02050809ULL;\
-  ((u64*)VPERM_INV)[ 2] = 0x0E05060F0D080180ULL;\
-  ((u64*)VPERM_INV)[ 3] = 0x040703090A0B0C02ULL;\
-  ((u64*)VPERM_SB1)[ 0] = 0x3618D415FAE22300ULL;\
-  ((u64*)VPERM_SB1)[ 1] = 0x3BF7CCC10D2ED9EFULL;\
-  ((u64*)VPERM_SB1)[ 2] = 0xB19BE18FCB503E00ULL;\
-  ((u64*)VPERM_SB1)[ 3] = 0xA5DF7A6E142AF544ULL;\
-  ((u64*)VPERM_SB2)[ 0] = 0x69EB88400AE12900ULL;\
-  ((u64*)VPERM_SB2)[ 1] = 0xC2A163C8AB82234AULL;\
-  ((u64*)VPERM_SB2)[ 2] = 0xE27A93C60B712400ULL;\
-  ((u64*)VPERM_SB2)[ 3] = 0x5EB7E955BC982FCDULL;\
-  ((u64*)VPERM_SB4)[ 0] = 0x3D50AED7C393EA00ULL;\
-  ((u64*)VPERM_SB4)[ 1] = 0xBA44FE79876D2914ULL;\
-  ((u64*)VPERM_SB4)[ 2] = 0xE1E937A03FD64100ULL;\
-  ((u64*)VPERM_SB4)[ 3] = 0xA876DE9749087E9FULL;\
-/*((u64*)VPERM_SBO)[ 0] = 0xCFE474A55FBB6A00ULL;\
-  ((u64*)VPERM_SBO)[ 1] = 0x8E1E90D1412B35FAULL;\
-  ((u64*)VPERM_SBO)[ 2] = 0xD0D26D176FBDC700ULL;\
-  ((u64*)VPERM_SBO)[ 3] = 0x15AABF7AC502A878ULL;*/\
-  ((u64*)ALL_15)[ 0] = 0x1515151515151515ULL;\
-  ((u64*)ALL_15)[ 1] = 0x1515151515151515ULL;\
-}/**/
-
-/* VPERM
- * Transform w/o settings c*
- * transforms 2 rows to/from "vperm mode"
- * this function is derived from:
- *   vperm and aes_ni implementations of hash function Grostl
- *   by Cagdas CALIK
- * inputs:
- * a0, a1 = 2 rows
- * table = transformation table to use
- * t*, c* = clobbers
- * outputs:
- * a0, a1 = 2 rows transformed with table
- * */
-#define VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2){\
-  asm ("movdqa xmm"tostr(t0)", xmm"tostr(c0)"");\
-  asm ("movdqa xmm"tostr(t1)", xmm"tostr(c0)"");\
-  asm ("pandn  xmm"tostr(t0)", xmm"tostr(a0)"");\
-  asm ("pandn  xmm"tostr(t1)", xmm"tostr(a1)"");\
-  asm ("psrld  xmm"tostr(t0)", 4");\
-  asm ("psrld  xmm"tostr(t1)", 4");\
-  asm ("pand   xmm"tostr(a0)", xmm"tostr(c0)"");\
-  asm ("pand   xmm"tostr(a1)", xmm"tostr(c0)"");\
-  asm ("movdqa xmm"tostr(t2)", xmm"tostr(c2)"");\
-  asm ("movdqa xmm"tostr(t3)", xmm"tostr(c2)"");\
-  asm ("pshufb xmm"tostr(t2)", xmm"tostr(a0)"");\
-  asm ("pshufb xmm"tostr(t3)", xmm"tostr(a1)"");\
-  asm ("movdqa xmm"tostr(a0)", xmm"tostr(c1)"");\
-  asm ("movdqa xmm"tostr(a1)", xmm"tostr(c1)"");\
-  asm ("pshufb xmm"tostr(a0)", xmm"tostr(t0)"");\
-  asm ("pshufb xmm"tostr(a1)", xmm"tostr(t1)"");\
-  asm ("pxor   xmm"tostr(a0)", xmm"tostr(t2)"");\
-  asm ("pxor   xmm"tostr(a1)", xmm"tostr(t3)"");\
-}/**/
-
-#define VPERM_Transform_Set_Const(table, c0, c1, c2){\
-  asm ("movaps xmm"tostr(c0)", [ALL_0F]");\
-  asm ("movaps xmm"tostr(c1)", ["tostr(table)"+0*16]");\
-  asm ("movaps xmm"tostr(c2)", ["tostr(table)"+1*16]");\
-}/**/
-
-/* VPERM
- * Transform
- * transforms 2 rows to/from "vperm mode"
- * this function is derived from:
- *   vperm and aes_ni implementations of hash function Grostl
- *   by Cagdas CALIK
- * inputs:
- * a0, a1 = 2 rows
- * table = transformation table to use
- * t*, c* = clobbers
- * outputs:
- * a0, a1 = 2 rows transformed with table
- * */
-#define VPERM_Transform(a0, a1, table, t0, t1, t2, t3, c0, c1, c2){\
-  VPERM_Transform_Set_Const(table, c0, c1, c2);\
-  VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\
-}/**/
-
-/* VPERM
- * Transform State
- * inputs:
- * a0-a3 = state
- * table = transformation table to use
- * t* = clobbers
- * outputs:
- * a0-a3 = transformed state
- * */
-#define VPERM_Transform_State(a0, a1, a2, a3, table, t0, t1, t2, t3, c0, c1, c2){\
-  VPERM_Transform_Set_Const(table, c0, c1, c2);\
-  VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\
-  VPERM_Transform_No_Const(a2, a3, t0, t1, t2, t3, c0, c1, c2);\
-}/**/
-
-/* VPERM
- * Add Constant to State
- * inputs:
- * a0-a7 = state
- * constant = constant to add
- * t0 = clobber
- * outputs:
- * a0-a7 = state + constant
- * */
-#define VPERM_Add_Constant(a0, a1, a2, a3, a4, a5, a6, a7, constant, t0){\
-  asm ("movaps xmm"tostr(t0)", ["tostr(constant)"]");\
-  asm ("pxor   xmm"tostr(a0)",  xmm"tostr(t0)"");\
-  asm ("pxor   xmm"tostr(a1)",  xmm"tostr(t0)"");\
-  asm ("pxor   xmm"tostr(a2)",  xmm"tostr(t0)"");\
-  asm ("pxor   xmm"tostr(a3)",  xmm"tostr(t0)"");\
-  asm ("pxor   xmm"tostr(a4)",  xmm"tostr(t0)"");\
-  asm ("pxor   xmm"tostr(a5)",  xmm"tostr(t0)"");\
-  asm ("pxor   xmm"tostr(a6)",  xmm"tostr(t0)"");\
-  asm ("pxor   xmm"tostr(a7)",  xmm"tostr(t0)"");\
-}/**/
-
-/* VPERM
- * Set Substitute Core Constants
- * */
-#define VPERM_Substitute_Core_Set_Const(c0, c1, c2){\
-  VPERM_Transform_Set_Const(VPERM_INV, c0, c1, c2);\
-}/**/
-
-/* VPERM
- * Substitute Core
- * first part of sbox inverse computation
- * this function is derived from:
- *   vperm and aes_ni implementations of hash function Grostl
- *   by Cagdas CALIK
- * inputs:
- * a0 = 1 row
- * t*, c* = clobbers
- * outputs:
- * b0a, b0b = inputs for lookup step
- * */
-#define VPERM_Substitute_Core(a0, b0a, b0b, t0, t1, c0, c1, c2){\
-  asm ("movdqa xmm"tostr(t0)",  xmm"tostr(c0)"");\
-  asm ("pandn  xmm"tostr(t0)",  xmm"tostr(a0)"");\
-  asm ("psrld  xmm"tostr(t0)",  4");\
-  asm ("pand   xmm"tostr(a0)",  xmm"tostr(c0)"");\
-  asm ("movdqa xmm"tostr(b0a)", "tostr(c1)"");\
-  asm ("pshufb xmm"tostr(b0a)", xmm"tostr(a0)"");\
-  asm ("pxor   xmm"tostr(a0)",  xmm"tostr(t0)"");\
-  asm ("movdqa xmm"tostr(b0b)", xmm"tostr(c2)"");\
-  asm ("pshufb xmm"tostr(b0b)", xmm"tostr(t0)"");\
-  asm ("pxor   xmm"tostr(b0b)", xmm"tostr(b0a)"");\
-  asm ("movdqa xmm"tostr(t1)",  xmm"tostr(c2)"");\
-  asm ("pshufb xmm"tostr(t1)",  xmm"tostr(a0)"");\
-  asm ("pxor   xmm"tostr(t1)",  xmm"tostr(b0a)"");\
-  asm ("movdqa xmm"tostr(b0a)", xmm"tostr(c2)"");\
-  asm ("pshufb xmm"tostr(b0a)", xmm"tostr(b0b)"");\
-  asm ("pxor   xmm"tostr(b0a)", xmm"tostr(a0)"");\
-  asm ("movdqa xmm"tostr(b0b)", xmm"tostr(c2)"");\
-  asm ("pshufb xmm"tostr(b0b)", xmm"tostr(t1)"");\
-  asm ("pxor   xmm"tostr(b0b)", xmm"tostr(t0)"");\
-}/**/
-
-/* VPERM
- * Lookup
- * second part of sbox inverse computation
- * this function is derived from:
- *   vperm and aes_ni implementations of hash function Grostl
- *   by Cagdas CALIK
- * inputs:
- * a0a, a0b = output of Substitution Core
- * table = lookup table to use (*1 / *2 / *4)
- * t0 = clobber
- * outputs:
- * b0 = output of sbox + multiplication
- * */
-#define VPERM_Lookup(a0a, a0b, table, b0, t0){\
-  asm ("movaps xmm"tostr(b0)", ["tostr(table)"+0*16]");\
-  asm ("movaps xmm"tostr(t0)", ["tostr(table)"+1*16]");\
-  asm ("pshufb xmm"tostr(b0)", xmm"tostr(a0b)"");\
-  asm ("pshufb xmm"tostr(t0)", xmm"tostr(a0a)"");\
-  asm ("pxor   xmm"tostr(b0)", xmm"tostr(t0)"");\
-}/**/
-
-/* VPERM
- * SubBytes and *2 / *4
- * this function is derived from:
- *   Constant-time SSSE3 AES core implementation
- *   by Mike Hamburg
- * and
- *   vperm and aes_ni implementations of hash function Grostl
- *   by Cagdas CALIK
- * inputs:
- * a0-a7 = state
- * t*, c* = clobbers
- * outputs:
- * a0-a7 = state * 4
- * c2 = row0 * 2 -> b0
- * c1 = row7 * 2 -> b3
- * c0 = row7 * 1 -> b4
- * t2 = row4 * 1 -> b7
- * TEMP_MUL1 = row(i) * 1
- * TEMP_MUL2 = row(i) * 2
- *
- * call:VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7) */
-#define VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, t0, t1, t3, t4, c2, c1, c0, t2){\
-  /* set Constants */\
-  VPERM_Substitute_Core_Set_Const(c0, c1, c2);\
-  /* row 1 */\
-  VPERM_Substitute_Core(a1, t0, t1, t3, t4, c0, xmm##c1, c2);\
-  VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
-  asm ("movaps [TEMP_MUL1+1*16], xmm"tostr(t2)"");\
-  VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
-  asm ("movaps [TEMP_MUL2+1*16], xmm"tostr(t3)"");\
-  VPERM_Lookup(t0, t1, VPERM_SB4, a1, t4);\
-  /* --- */\
-  /* row 2 */\
-  VPERM_Substitute_Core(a2, t0, t1, t3, t4, c0, xmm##c1, c2);\
-  VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
-  asm ("movaps [TEMP_MUL1+2*16], xmm"tostr(t2)"");\
-  VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
-  asm ("movaps [TEMP_MUL2+2*16], xmm"tostr(t3)"");\
-  VPERM_Lookup(t0, t1, VPERM_SB4, a2, t4);\
-  /* --- */\
-  /* row 3 */\
-  VPERM_Substitute_Core(a3, t0, t1, t3, t4, c0, xmm##c1, c2);\
-  VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
-  asm ("movaps [TEMP_MUL1+3*16], xmm"tostr(t2)"");\
-  VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
-  asm ("movaps [TEMP_MUL2+3*16], xmm"tostr(t3)"");\
-  VPERM_Lookup(t0, t1, VPERM_SB4, a3, t4);\
-  /* --- */\
-  /* row 5 */\
-  VPERM_Substitute_Core(a5, t0, t1, t3, t4, c0, xmm##c1, c2);\
-  VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
-  asm ("movaps [TEMP_MUL1+5*16], xmm"tostr(t2)"");\
-  VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
-  asm ("movaps [TEMP_MUL2+5*16], xmm"tostr(t3)"");\
-  VPERM_Lookup(t0, t1, VPERM_SB4, a5, t4);\
-  /* --- */\
-  /* row 6 */\
-  VPERM_Substitute_Core(a6, t0, t1, t3, t4, c0, xmm##c1, c2);\
-  VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
-  asm ("movaps [TEMP_MUL1+6*16], xmm"tostr(t2)"");\
-  VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
-  asm ("movaps [TEMP_MUL2+6*16], xmm"tostr(t3)"");\
-  VPERM_Lookup(t0, t1, VPERM_SB4, a6, t4);\
-  /* --- */\
-  /* row 7 */\
-  VPERM_Substitute_Core(a7, t0, t1, t3, t4, c0, xmm##c1, c2);\
-  VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
-  asm ("movaps [TEMP_MUL1+7*16], xmm"tostr(t2)"");\
-  VPERM_Lookup(t0, t1, VPERM_SB2, c1, t4); /*c1 -> b3*/\
-  VPERM_Lookup(t0, t1, VPERM_SB4, a7, t4);\
-  /* --- */\
-  /* row 4 */\
-  VPERM_Substitute_Core(a4, t0, t1, t3, t4, c0, [VPERM_INV+0*16], c2);\
-  VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4); /*t2 -> b7*/\
-  VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
-  asm ("movaps [TEMP_MUL2+4*16], xmm"tostr(t3)"");\
-  VPERM_Lookup(t0, t1, VPERM_SB4, a4, t4);\
-  /* --- */\
-  /* row 0 */\
-  VPERM_Substitute_Core(a0, t0, t1, t3, t4, c0, [VPERM_INV+0*16], c2);\
-  VPERM_Lookup(t0, t1, VPERM_SB1, c0, t4); /*c0 -> b4*/\
-  VPERM_Lookup(t0, t1, VPERM_SB2, c2, t4); /*c2 -> b0*/\
-  asm ("movaps [TEMP_MUL2+0*16], xmm"tostr(c2)"");\
-  VPERM_Lookup(t0, t1, VPERM_SB4, a0, t4);\
-  /* --- */\
-}/**/
-
-
-/* Optimized MixBytes
- * inputs:
- * a0-a7 = (row0-row7) * 4
- * b0 = row0 * 2
- * b3 = row7 * 2
- * b4 = row7 * 1
- * b7 = row4 * 1
- * all *1 and *2 values must also be in TEMP_MUL1, TEMP_MUL2
- * output: b0-b7
- * */
-#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
-  /* save one value */\
-  asm ("movaps [TEMP_MUL4], xmm"tostr(a3)"");\
-  /* 1 */\
-  asm ("movdqa xmm"tostr(b1)", xmm"tostr(a0)"");\
-  asm ("pxor   xmm"tostr(b1)", xmm"tostr(a5)"");\
-  asm ("pxor   xmm"tostr(b1)", xmm"tostr(b4)""); /* -> helper! */\
-  asm ("pxor   xmm"tostr(b1)", [TEMP_MUL2+3*16]");\
-  asm ("movdqa xmm"tostr(b2)", xmm"tostr(b1)"");\
-  \
-  /* 2 */\
-  asm ("movdqa xmm"tostr(b5)", xmm"tostr(a1)"");\
-  asm ("pxor   xmm"tostr(b5)", xmm"tostr(a4)"");\
-  asm ("pxor   xmm"tostr(b5)", xmm"tostr(b7)""); /* -> helper! */\
-  asm ("pxor   xmm"tostr(b5)", xmm"tostr(b3)""); /* -> helper! */\
-  asm ("movdqa xmm"tostr(b6)", xmm"tostr(b5)"");\
-  \
-  /* 4 */\
-  asm ("pxor   xmm"tostr(b7)", xmm"tostr(a6)"");\
-  /*asm ("pxor   xmm"tostr(b7)", [TEMP_MUL1+4*16]"); -> helper! */\
-  asm ("pxor   xmm"tostr(b7)", [TEMP_MUL1+6*16]");\
-  asm ("pxor   xmm"tostr(b7)", [TEMP_MUL2+1*16]");\
-  asm ("pxor   xmm"tostr(b7)", xmm"tostr(b3)""); /* -> helper! */\
-  asm ("pxor   xmm"tostr(b2)", xmm"tostr(b7)"");\
-  \
-  /* 3 */\
-  asm ("pxor   xmm"tostr(b0)", xmm"tostr(a7)"");\
-  asm ("pxor   xmm"tostr(b0)", [TEMP_MUL1+5*16]");\
-  asm ("pxor   xmm"tostr(b0)", [TEMP_MUL1+7*16]");\
-  /*asm ("pxor   xmm"tostr(b0)", [TEMP_MUL2+0*16]"); -> helper! */\
-  asm ("pxor   xmm"tostr(b0)", [TEMP_MUL2+2*16]");\
-  asm ("movdqa xmm"tostr(b3)", xmm"tostr(b0)"");\
-  asm ("pxor   xmm"tostr(b1)", xmm"tostr(b0)"");\
-  asm ("pxor   xmm"tostr(b0)", xmm"tostr(b7)""); /* moved from 4 */\
-  \
-  /* 5 */\
-  asm ("pxor   xmm"tostr(b4)", xmm"tostr(a2)"");\
-  /*asm ("pxor   xmm"tostr(b4)", [TEMP_MUL1+0*16]"); -> helper! */\
-  asm ("pxor   xmm"tostr(b4)", [TEMP_MUL1+2*16]");\
-  asm ("pxor   xmm"tostr(b4)", [TEMP_MUL2+3*16]");\
-  asm ("pxor   xmm"tostr(b4)", [TEMP_MUL2+5*16]");\
-  asm ("pxor   xmm"tostr(b3)", xmm"tostr(b4)"");\
-  asm ("pxor   xmm"tostr(b6)", xmm"tostr(b4)"");\
-  \
-  /* 6 */\
-  asm ("pxor xmm"tostr(a3)", [TEMP_MUL1+1*16]");\
-  asm ("pxor xmm"tostr(a3)", [TEMP_MUL1+3*16]");\
-  asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+4*16]");\
-  asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+6*16]");\
-  asm ("pxor xmm"tostr(b4)", xmm"tostr(a3)"");\
-  asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\
-  asm ("pxor xmm"tostr(b7)", xmm"tostr(a3)"");\
-  \
-  /* 7 */\
-  asm ("pxor xmm"tostr(a1)", [TEMP_MUL1+1*16]");\
-  asm ("pxor xmm"tostr(a1)", [TEMP_MUL2+4*16]");\
-  asm ("pxor xmm"tostr(b2)", xmm"tostr(a1)"");\
-  asm ("pxor xmm"tostr(b3)", xmm"tostr(a1)"");\
-  \
-  /* 8 */\
-  asm ("pxor xmm"tostr(a5)", [TEMP_MUL1+5*16]");\
-  asm ("pxor xmm"tostr(a5)", [TEMP_MUL2+0*16]");\
-  asm ("pxor xmm"tostr(b6)", xmm"tostr(a5)"");\
-  asm ("pxor xmm"tostr(b7)", xmm"tostr(a5)"");\
-  \
-  /* 9 */\
-  asm ("movaps xmm"tostr(a3)", [TEMP_MUL1+2*16]");\
-  asm ("pxor   xmm"tostr(a3)", [TEMP_MUL2+5*16]");\
-  asm ("pxor   xmm"tostr(b0)", xmm"tostr(a3)"");\
-  asm ("pxor   xmm"tostr(b5)", xmm"tostr(a3)"");\
-  \
-  /* 10 */\
-  asm ("movaps xmm"tostr(a1)", [TEMP_MUL1+6*16]");\
-  asm ("pxor   xmm"tostr(a1)", [TEMP_MUL2+1*16]");\
-  asm ("pxor   xmm"tostr(b1)", xmm"tostr(a1)"");\
-  asm ("pxor   xmm"tostr(b4)", xmm"tostr(a1)"");\
-  \
-  /* 11 */\
-  asm ("movaps xmm"tostr(a5)", [TEMP_MUL1+3*16]");\
-  asm ("pxor   xmm"tostr(a5)", [TEMP_MUL2+6*16]");\
-  asm ("pxor   xmm"tostr(b1)", xmm"tostr(a5)"");\
-  asm ("pxor   xmm"tostr(b6)", xmm"tostr(a5)"");\
-  \
-  /* 12 */\
-  asm ("movaps xmm"tostr(a3)", [TEMP_MUL1+7*16]");\
-  asm ("pxor   xmm"tostr(a3)", [TEMP_MUL2+2*16]");\
-  asm ("pxor   xmm"tostr(b2)", xmm"tostr(a3)"");\
-  asm ("pxor   xmm"tostr(b5)", xmm"tostr(a3)"");\
-  \
-  /* 13 */\
-  asm ("pxor xmm"tostr(b0)", [TEMP_MUL4]");\
-  asm ("pxor xmm"tostr(b0)", xmm"tostr(a4)"");\
-  asm ("pxor xmm"tostr(b1)", xmm"tostr(a4)"");\
-  asm ("pxor xmm"tostr(b3)", xmm"tostr(a6)"");\
-  asm ("pxor xmm"tostr(b4)", xmm"tostr(a0)"");\
-  asm ("pxor xmm"tostr(b4)", xmm"tostr(a7)"");\
-  asm ("pxor xmm"tostr(b5)", xmm"tostr(a0)"");\
-  asm ("pxor xmm"tostr(b7)", xmm"tostr(a2)"");\
-}/**/
-
-//#if (LENGTH <= 256)
-
-#define SET_CONSTANTS(){\
-  SET_SHARED_CONSTANTS();\
-  ((u64*)SUBSH_MASK)[ 0] = 0x0706050403020100ULL;\
-  ((u64*)SUBSH_MASK)[ 1] = 0x080f0e0d0c0b0a09ULL;\
-  ((u64*)SUBSH_MASK)[ 2] = 0x0007060504030201ULL;\
-  ((u64*)SUBSH_MASK)[ 3] = 0x0a09080f0e0d0c0bULL;\
-  ((u64*)SUBSH_MASK)[ 4] = 0x0100070605040302ULL;\
-  ((u64*)SUBSH_MASK)[ 5] = 0x0c0b0a09080f0e0dULL;\
-  ((u64*)SUBSH_MASK)[ 6] = 0x0201000706050403ULL;\
-  ((u64*)SUBSH_MASK)[ 7] = 0x0e0d0c0b0a09080fULL;\
-  ((u64*)SUBSH_MASK)[ 8] = 0x0302010007060504ULL;\
-  ((u64*)SUBSH_MASK)[ 9] = 0x0f0e0d0c0b0a0908ULL;\
-  ((u64*)SUBSH_MASK)[10] = 0x0403020100070605ULL;\
-  ((u64*)SUBSH_MASK)[11] = 0x09080f0e0d0c0b0aULL;\
-  ((u64*)SUBSH_MASK)[12] = 0x0504030201000706ULL;\
-  ((u64*)SUBSH_MASK)[13] = 0x0b0a09080f0e0d0cULL;\
-  ((u64*)SUBSH_MASK)[14] = 0x0605040302010007ULL;\
-  ((u64*)SUBSH_MASK)[15] = 0x0d0c0b0a09080f0eULL;\
-  for(i = 0; i < ROUNDS512; i++)\
-  {\
-    ((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\
-    ((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL)  ^ 0x7060504030201000ULL;\
-    ((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL)  ^ 0x8f9fafbfcfdfefffULL;\
-    ((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\
-  }\
-  ((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\
-  ((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\
-}/**/
-
-#define Push_All_Regs(){\
-/*  not using any...
-    asm("push rax");\
-    asm("push rbx");\
-    asm("push rcx");*/\
-}/**/
-
-#define Pop_All_Regs(){\
-/*  not using any...
-    asm("pop rcx");\
-    asm("pop rbx");\
-    asm("pop rax");*/\
-}/**/
-
-
-/* vperm:
- * transformation before rounds with ipt
- * first round add transformed constant
- * middle rounds: add constant XOR 0x15...15
- * last round: additionally add 0x15...15 after MB
- * transformation after rounds with opt
- */
-/* one round
- * i = round number
- * a0-a7 = input rows
- * b0-b7 = output rows
- */
-#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
-  /* AddRoundConstant + ShiftBytes (interleaved) */\
-  asm ("movaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\
-  asm ("pxor   xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\
-  asm ("pxor   xmm"tostr(a1)", xmm"tostr(b1)"");\
-  asm ("pxor   xmm"tostr(a2)", xmm"tostr(b1)"");\
-  asm ("pxor   xmm"tostr(a3)", xmm"tostr(b1)"");\
-  asm ("pshufb xmm"tostr(a0)", [SUBSH_MASK+0*16]");\
-  asm ("pshufb xmm"tostr(a1)", [SUBSH_MASK+1*16]");\
-  asm ("pxor   xmm"tostr(a4)", xmm"tostr(b1)"");\
-  asm ("pshufb xmm"tostr(a2)", [SUBSH_MASK+2*16]");\
-  asm ("pshufb xmm"tostr(a3)", [SUBSH_MASK+3*16]");\
-  asm ("pxor   xmm"tostr(a5)", xmm"tostr(b1)"");\
-  asm ("pxor   xmm"tostr(a6)", xmm"tostr(b1)"");\
-  asm ("pshufb xmm"tostr(a4)", [SUBSH_MASK+4*16]");\
-  asm ("pshufb xmm"tostr(a5)", [SUBSH_MASK+5*16]");\
-  asm ("pxor   xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\
-  asm ("pshufb xmm"tostr(a6)", [SUBSH_MASK+6*16]");\
-  asm ("pshufb xmm"tostr(a7)", [SUBSH_MASK+7*16]");\
-  /* SubBytes + Multiplication by 2 and 4 */\
-  VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\
-  /* MixBytes */\
-  MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
-}/**/
-
-/* 10 rounds, P and Q in parallel */
-#define ROUNDS_P_Q(){\
-  VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 0);\
-  ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
-  ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
-  ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
-  ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
-  ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
-  ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
-  ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
-  ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
-  ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
-  ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
-  VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 0);\
-}
-
-
-/* Matrix Transpose Step 1
- * input is a 512-bit state with two columns in one xmm
- * output is a 512-bit state with two rows in one xmm
- * inputs: i0-i3
- * outputs: i0, o1-o3
- * clobbers: t0
- */
-#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
-  asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\
-\
-  asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\
-  asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\
-  asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\
-  asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\
-\
-  asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\
-  asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\
-\
-  asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\
-  asm ("punpckhwd xmm"tostr(o1)", xmm"tostr(i1)"");\
-  asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\
-  asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i3)"");\
-\
-  asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\
-  asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\
-  asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\
-  asm ("pshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\
-\
-  asm ("movdqa xmm"tostr(o2)", xmm"tostr(i0)"");\
-  asm ("movdqa xmm"tostr(o3)", xmm"tostr(o1)"");\
-\
-  asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\
-  asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t0)"");\
-  asm ("punpckhdq xmm"tostr(o2)", xmm"tostr(i2)"");\
-  asm ("punpckhdq xmm"tostr(o3)", xmm"tostr(t0)"");\
-}/**/
-
-/* Matrix Transpose Step 2
- * input are two 512-bit states with two rows in one xmm
- * output are two 512-bit states with one row of each state in one xmm
- * inputs: i0-i3 = P, i4-i7 = Q
- * outputs: (i0, o1-o7) = (P|Q)
- * possible reassignments: (output reg = input reg)
- * * i1 -> o3-7
- * * i2 -> o5-7
- * * i3 -> o7
- * * i4 -> o3-7
- * * i5 -> o6-7
- */
-#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
-  asm ("movdqa     xmm"tostr(o1)", xmm"tostr(i0)"");\
-  asm ("movdqa     xmm"tostr(o2)", xmm"tostr(i1)"");\
-  asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\
-  asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i4)"");\
-  asm ("movdqa     xmm"tostr(o3)", xmm"tostr(i1)"");\
-  asm ("movdqa     xmm"tostr(o4)", xmm"tostr(i2)"");\
-  asm ("punpcklqdq xmm"tostr(o2)", xmm"tostr(i5)"");\
-  asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i5)"");\
-  asm ("movdqa     xmm"tostr(o5)", xmm"tostr(i2)"");\
-  asm ("movdqa     xmm"tostr(o6)", xmm"tostr(i3)"");\
-  asm ("punpcklqdq xmm"tostr(o4)", xmm"tostr(i6)"");\
-  asm ("punpckhqdq xmm"tostr(o5)", xmm"tostr(i6)"");\
-  asm ("movdqa     xmm"tostr(o7)", xmm"tostr(i3)"");\
-  asm ("punpcklqdq xmm"tostr(o6)", xmm"tostr(i7)"");\
-  asm ("punpckhqdq xmm"tostr(o7)", xmm"tostr(i7)"");\
-}/**/
-
-/* Matrix Transpose Inverse Step 2
- * input are two 512-bit states with one row of each state in one xmm
- * output are two 512-bit states with two rows in one xmm
- * inputs: i0-i7 = (P|Q)
- * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
- */
-#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
-  asm ("movdqa     xmm"tostr(o0)", xmm"tostr(i0)"");\
-  asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\
-  asm ("punpckhqdq xmm"tostr(o0)", xmm"tostr(i1)"");\
-  asm ("movdqa     xmm"tostr(o1)", xmm"tostr(i2)"");\
-  asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\
-  asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i3)"");\
-  asm ("movdqa     xmm"tostr(o2)", xmm"tostr(i4)"");\
-  asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\
-  asm ("punpckhqdq xmm"tostr(o2)", xmm"tostr(i5)"");\
-  asm ("movdqa     xmm"tostr(o3)", xmm"tostr(i6)"");\
-  asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\
-  asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i7)"");\
-}/**/
-
-/* Matrix Transpose Output Step 2
- * input is one 512-bit state with two rows in one xmm
- * output is one 512-bit state with one row in the low 64-bits of one xmm
- * inputs: i0,i2,i4,i6 = S
- * outputs: (i0-7) = (0|S)
- */
-#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
-  asm ("pxor xmm"tostr(t0)", xmm"tostr(t0)"");\
-  asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\
-  asm ("movdqa xmm"tostr(i3)", xmm"tostr(i2)"");\
-  asm ("movdqa xmm"tostr(i5)", xmm"tostr(i4)"");\
-  asm ("movdqa xmm"tostr(i7)", xmm"tostr(i6)"");\
-  asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(t0)"");\
-  asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(t0)"");\
-  asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t0)"");\
-  asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t0)"");\
-  asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t0)"");\
-  asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t0)"");\
-  asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t0)"");\
-  asm ("punpckhqdq xmm"tostr(i7)", xmm"tostr(t0)"");\
-}/**/
-
-/* Matrix Transpose Output Inverse Step 2
- * input is one 512-bit state with one row in the low 64-bits of one xmm
- * output is one 512-bit state with two rows in one xmm
- * inputs: i0-i7 = (0|S)
- * outputs: (i0, i2, i4, i6) = S
- */
-#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
-  asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\
-  asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\
-  asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\
-  asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\
-}/**/
-
-
-/* transform round constants into VPERM mode */
-#define VPERM_Transform_RoundConst_CNT2(i, j){\
-  asm ("movaps xmm0, [ROUND_CONST_L0+"tostr(i)"*16]");\
-  asm ("movaps xmm1, [ROUND_CONST_L7+"tostr(i)"*16]");\
-  asm ("movaps xmm2, [ROUND_CONST_L0+"tostr(j)"*16]");\
-  asm ("movaps xmm3, [ROUND_CONST_L7+"tostr(j)"*16]");\
-  VPERM_Transform_State(0, 1, 2, 3, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\
-  asm ("pxor xmm0, [ALL_15]");\
-  asm ("pxor xmm1, [ALL_15]");\
-  asm ("pxor xmm2, [ALL_15]");\
-  asm ("pxor xmm3, [ALL_15]");\
-  asm ("movaps [ROUND_CONST_L0+"tostr(i)"*16], xmm0");\
-  asm ("movaps [ROUND_CONST_L7+"tostr(i)"*16], xmm1");\
-  asm ("movaps [ROUND_CONST_L0+"tostr(j)"*16], xmm2");\
-  asm ("movaps [ROUND_CONST_L7+"tostr(j)"*16], xmm3");\
-}/**/
-
-/* transform round constants into VPERM mode */
-#define VPERM_Transform_RoundConst(){\
-  asm ("movaps xmm0, [ROUND_CONST_Lx]");\
-  VPERM_Transform(0, 1, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\
-  asm ("pxor xmm0, [ALL_15]");\
-  asm ("movaps [ROUND_CONST_Lx], xmm0");\
-  VPERM_Transform_RoundConst_CNT2(0, 1);\
-  VPERM_Transform_RoundConst_CNT2(2, 3);\
-  VPERM_Transform_RoundConst_CNT2(4, 5);\
-  VPERM_Transform_RoundConst_CNT2(6, 7);\
-  VPERM_Transform_RoundConst_CNT2(8, 9);\
-}/**/
-
-void INIT256(u64* h)
-{
-  /* __cdecl calling convention: */
-  /* chaining value CV in rdi    */
-
-  asm (".intel_syntax noprefix");
-  asm volatile ("emms");
-
-  /* transform round constants into VPERM mode */
-  VPERM_Transform_RoundConst();
-
-  /* load IV into registers xmm12 - xmm15 */
-  asm ("movaps xmm12, [rdi+0*16]");
-  asm ("movaps xmm13, [rdi+1*16]");
-  asm ("movaps xmm14, [rdi+2*16]");
-  asm ("movaps xmm15, [rdi+3*16]");
-
-  /* transform chaining value from column ordering into row ordering */
-  /* we put two rows (64 bit) of the IV into one 128-bit XMM register */
-  VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7);
-  Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
-
-  /* store transposed IV */
-  asm ("movaps [rdi+0*16], xmm12");
-  asm ("movaps [rdi+1*16], xmm2");
-  asm ("movaps [rdi+2*16], xmm6");
-  asm ("movaps [rdi+3*16], xmm7");
-
-  asm volatile ("emms");
-  asm (".att_syntax noprefix");
-}
-
-void TF512(u64* h, u64* m)
-{
-  /* __cdecl calling convention: */
-  /* chaining value CV in rdi    */
-  /* message M in rsi            */
-
-#ifdef IACA_TRACE
-  IACA_START;
-#endif
-
-  asm (".intel_syntax noprefix");
-  Push_All_Regs();
-
-  /* load message into registers xmm12 - xmm15 (Q = message) */
-  asm ("movaps xmm12, [rsi+0*16]");
-  asm ("movaps xmm13, [rsi+1*16]");
-  asm ("movaps xmm14, [rsi+2*16]");
-  asm ("movaps xmm15, [rsi+3*16]");
-
-  /* transform message M from column ordering into row ordering */
-  /* we first put two rows (64 bit) of the message into one 128-bit xmm register */
-  VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7);
-  Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
-
-  /* load previous chaining value */
-  /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */
-  asm ("movaps xmm8, [rdi+0*16]");
-  asm ("movaps xmm0, [rdi+1*16]");
-  asm ("movaps xmm4, [rdi+2*16]");
-  asm ("movaps xmm5, [rdi+3*16]");
-
-  /* xor message to CV get input of P */
-  /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
-  asm ("pxor xmm8, xmm12");
-  asm ("pxor xmm0, xmm2");
-  asm ("pxor xmm4, xmm6");
-  asm ("pxor xmm5, xmm7");
-
-  /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
-  /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
-  /* result: the 8 rows of P and Q in xmm8 - xmm12 */
-  Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15);
-
-  /* compute the two permutations P and Q in parallel */
-  ROUNDS_P_Q();
-
-  /* unpack again to get two rows of P or two rows of Q in one xmm register */
-  Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3);
-
-  /* xor output of P and Q */
-  /* result: P(CV+M)+Q(M) in xmm0...xmm3 */
-  asm ("pxor xmm0, xmm8");
-  asm ("pxor xmm1, xmm10");
-  asm ("pxor xmm2, xmm12");
-  asm ("pxor xmm3, xmm14");
-
-  /* xor CV (feed-forward) */
-  /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
-  asm ("pxor xmm0, [rdi+0*16]");
-  asm ("pxor xmm1, [rdi+1*16]");
-  asm ("pxor xmm2, [rdi+2*16]");
-  asm ("pxor xmm3, [rdi+3*16]");
-
-  /* store CV */
-  asm ("movaps [rdi+0*16], xmm0");
-  asm ("movaps [rdi+1*16], xmm1");
-  asm ("movaps [rdi+2*16], xmm2");
-  asm ("movaps [rdi+3*16], xmm3");
-
-  Pop_All_Regs();
-  asm (".att_syntax noprefix");
-
-#ifdef IACA_TRACE
-  IACA_END;
-#endif
-
-  return;
-}
-
-void OF512(u64* h)
-{
-  /* __cdecl calling convention: */
-  /* chaining value CV in rdi    */
-
-  asm (".intel_syntax noprefix");
-  Push_All_Regs();
-
-  /* load CV into registers xmm8, xmm10, xmm12, xmm14 */
-  asm ("movaps xmm8,  [rdi+0*16]");
-  asm ("movaps xmm10, [rdi+1*16]");
-  asm ("movaps xmm12, [rdi+2*16]");
-  asm ("movaps xmm14, [rdi+3*16]");
-
-  /* there are now 2 rows of the CV in one xmm register */
-  /* unpack to get 1 row of P (64 bit) into one half of an xmm register */
-  /* result: the 8 input rows of P in xmm8 - xmm15 */
-  Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0);
-
-  /* compute the permutation P */
-  /* result: the output of P(CV) in xmm8 - xmm15 */
-  ROUNDS_P_Q();
-
-  /* unpack again to get two rows of P in one xmm register */
-  /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
-  Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15);
-
-  /* xor CV to P output (feed-forward) */
-  /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
-  asm ("pxor xmm8,  [rdi+0*16]");
-  asm ("pxor xmm10, [rdi+1*16]");
-  asm ("pxor xmm12, [rdi+2*16]");
-  asm ("pxor xmm14, [rdi+3*16]");
-
-  /* transform state back from row ordering into column ordering */
-  /* result: final hash value in xmm9, xmm11 */
-  Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0);
-  VPERM_Transform(9, 11, VPERM_OPT, 0, 1, 2, 3, 5, 6, 7);
-
-  /* we only need to return the truncated half of the state */
-  asm ("movaps [rdi+2*16], xmm9");
-  asm ("movaps [rdi+3*16], xmm11");
-
-  Pop_All_Regs();
-  asm (".att_syntax noprefix");
-
-  return;
-}
-
-
--- a/algo/groestl/aes_ni/groestl256-intr-aes.h
+++ b/algo/groestl/aes_ni/groestl256-intr-aes.h
@@ -11,17 +11,44 @@
 #include <wmmintrin.h>
 #include "hash-groestl256.h"

-/* global constants  */
-__m128i ROUND_CONST_Lx;
-__m128i ROUND_CONST_L0[ROUNDS512];
-__m128i ROUND_CONST_L7[ROUNDS512];
-//__m128i ROUND_CONST_P[ROUNDS1024];
-//__m128i ROUND_CONST_Q[ROUNDS1024];
-__m128i TRANSP_MASK;
-__m128i SUBSH_MASK[8];
-__m128i ALL_1B;
-__m128i ALL_FF;
+static const __m128i round_const_l0[] __attribute__ ((aligned (64))) =
+{
+   { 0x7060504030201000, 0xffffffffffffffff },
+   { 0x7161514131211101, 0xffffffffffffffff },
+   { 0x7262524232221202, 0xffffffffffffffff },
+   { 0x7363534333231303, 0xffffffffffffffff },
+   { 0x7464544434241404, 0xffffffffffffffff },
+   { 0x7565554535251505, 0xffffffffffffffff },
+   { 0x7666564636261606, 0xffffffffffffffff },
+   { 0x7767574737271707, 0xffffffffffffffff },
+   { 0x7868584838281808, 0xffffffffffffffff },
+   { 0x7969594939291909, 0xffffffffffffffff }
+};

+static const __m128i round_const_l7[] __attribute__ ((aligned (64))) =
+{
+   { 0x0000000000000000, 0x8f9fafbfcfdfefff },
+   { 0x0000000000000000, 0x8e9eaebecedeeefe },
+   { 0x0000000000000000, 0x8d9dadbdcdddedfd },
+   { 0x0000000000000000, 0x8c9cacbcccdcecfc },
+   { 0x0000000000000000, 0x8b9babbbcbdbebfb },
+   { 0x0000000000000000, 0x8a9aaabacadaeafa },
+   { 0x0000000000000000, 0x8999a9b9c9d9e9f9 },
+   { 0x0000000000000000, 0x8898a8b8c8d8e8f8 },
+   { 0x0000000000000000, 0x8797a7b7c7d7e7f7 },
+   { 0x0000000000000000, 0x8696a6b6c6d6e6f6 }
+};
+
+static const __m128i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02 };
+
+static const __m128i SUBSH_MASK0 = { 0x0c0f0104070b0e00, 0x03060a0d08020509 };
+static const __m128i SUBSH_MASK1 = { 0x0e090205000d0801, 0x04070c0f0a03060b };
+static const __m128i SUBSH_MASK2 = { 0x080b0306010f0a02, 0x05000e090c04070d };
+static const __m128i SUBSH_MASK3 = { 0x0a0d040702090c03, 0x0601080b0e05000f };
+static const __m128i SUBSH_MASK4 = { 0x0b0e0500030a0d04, 0x0702090c0f060108 };
+static const __m128i SUBSH_MASK5 = { 0x0d080601040c0f05, 0x00030b0e0907020a };
+static const __m128i SUBSH_MASK6 = { 0x0f0a0702050e0906, 0x01040d080b00030c };
+static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };

 #define tos(a)    #a
 #define tostr(a)  tos(a)
@@ -38,8 +65,6 @@ __m128i ALL_FF;
  i = _mm_xor_si128(i, j);\
 } 

- /**/
-
 /* Yet another implementation of MixBytes.
   This time we use the formulae (3) from the paper "Byte Slicing Groestl".
   Input: a0, ..., a7
@@ -113,7 +138,7 @@ __m128i ALL_FF;
  \
  /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
  /* compute w_i : add y_{i+4} */\
-  b1 = ALL_1B;\
+  b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
  MUL2(a0, b0, b1);\
  a0 = _mm_xor_si128(a0, TEMP0);\
  MUL2(a1, b0, b1);\
@@ -153,25 +178,6 @@ __m128i ALL_FF;
  b1 = _mm_xor_si128(b1, a4);\
 }/*MixBytes*/

-#define SET_CONSTANTS(){\
-   ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
-  TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
-  SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\
-  SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\
-  SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\
-  SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\
-  SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\
-  SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\
-  SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\
-  SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\
-  for(i = 0; i < ROUNDS512; i++)\
-  {\
-    ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
-    ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\
-  }\
-  ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\
-}while(0); \
-
 /* one round
 * i = round number
 * a0-a7 = input rows
@@ -179,34 +185,34 @@ __m128i ALL_FF;
 */
 #define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
  /* AddRoundConstant */\
-  b1 = ROUND_CONST_Lx;\
-  a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\
-  a1 = _mm_xor_si128(a1, b1);\
-  a2 = _mm_xor_si128(a2, b1);\
-  a3 = _mm_xor_si128(a3, b1);\
-  a4 = _mm_xor_si128(a4, b1);\
-  a5 = _mm_xor_si128(a5, b1);\
-  a6 = _mm_xor_si128(a6, b1);\
-  a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\
+  b1 = m128_const_64( 0xffffffffffffffff, 0 ); \
+  a0 = _mm_xor_si128( a0, casti_m128i( round_const_l0, i ) ); \
+  a1 = _mm_xor_si128( a1, b1 ); \
+  a2 = _mm_xor_si128( a2, b1 ); \
+  a3 = _mm_xor_si128( a3, b1 ); \
+  a4 = _mm_xor_si128( a4, b1 ); \
+  a5 = _mm_xor_si128( a5, b1 ); \
+  a6 = _mm_xor_si128( a6, b1 ); \
+  a7 = _mm_xor_si128( a7, casti_m128i( round_const_l7, i ) ); \
  \
  /* ShiftBytes + SubBytes (interleaved) */\
  b0 = _mm_xor_si128(b0,  b0);\
-  a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\
-  a0 = _mm_aesenclast_si128(a0, b0);\
-  a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\
-  a1 = _mm_aesenclast_si128(a1, b0);\
-  a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\
-  a2 = _mm_aesenclast_si128(a2, b0);\
-  a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\
-  a3 = _mm_aesenclast_si128(a3, b0);\
-  a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\
-  a4 = _mm_aesenclast_si128(a4, b0);\
-  a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\
-  a5 = _mm_aesenclast_si128(a5, b0);\
-  a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\
-  a6 = _mm_aesenclast_si128(a6, b0);\
-  a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\
-  a7 = _mm_aesenclast_si128(a7, b0);\
+  a0 = _mm_shuffle_epi8( a0, SUBSH_MASK0 ); \
+  a0 = _mm_aesenclast_si128( a0, b0 );\
+  a1 = _mm_shuffle_epi8( a1, SUBSH_MASK1 ); \
+  a1 = _mm_aesenclast_si128( a1, b0 );\
+  a2 = _mm_shuffle_epi8( a2, SUBSH_MASK2 ); \
+  a2 = _mm_aesenclast_si128( a2, b0 );\
+  a3 = _mm_shuffle_epi8( a3, SUBSH_MASK3 ); \
+  a3 = _mm_aesenclast_si128( a3, b0 );\
+  a4 = _mm_shuffle_epi8( a4, SUBSH_MASK4 ); \
+  a4 = _mm_aesenclast_si128( a4, b0 );\
+  a5 = _mm_shuffle_epi8( a5, SUBSH_MASK5 ); \
+  a5 = _mm_aesenclast_si128( a5, b0 );\
+  a6 = _mm_shuffle_epi8( a6, SUBSH_MASK6 ); \
+  a6 = _mm_aesenclast_si128( a6, b0 );\
+  a7 = _mm_shuffle_epi8( a7, SUBSH_MASK7 ); \
+  a7 = _mm_aesenclast_si128( a7, b0 );\
  \
  /* MixBytes */\
  MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
@@ -234,8 +240,9 @@ __m128i ALL_FF;
 * outputs: i0, o1-o3
 * clobbers: t0
 */
+
 #define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
-  t0 = TRANSP_MASK;\
+  t0 = TRANSP_MASK; \
  \
  i0 = _mm_shuffle_epi8(i0, t0);\
  i1 = _mm_shuffle_epi8(i1, t0);\
--- a/algo/groestl/aes_ni/groestl256-intr-avx.h
+++ b/algo/groestl/aes_ni/groestl256-intr-avx.h
@@ -1,482 +0,0 @@
-/* groestl-intr-avx.h     Aug 2011
- *
- * Groestl implementation with intrinsics using ssse3, sse4.1, aes and avx
- * instructions.
- * Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
- *
- * This code is placed in the public domain
- */
-
-#include <smmintrin.h>
-#include <wmmintrin.h>
-#include <immintrin.h>
-#include "hash-groestl256.h"
-
-/* global constants  */
-__m128i ROUND_CONST_Lx;
-__m128i ROUND_CONST_L0[ROUNDS512];
-__m128i ROUND_CONST_L7[ROUNDS512];
-__m128i ROUND_CONST_P[ROUNDS1024];
-__m128i ROUND_CONST_Q[ROUNDS1024];
-__m128i TRANSP_MASK;
-__m128i SUBSH_MASK[8];
-__m128i ALL_FF;
-//#if LENGTH <= 256
-__m128i ALL_1B;
-//#else
-//__m256d ALL_1B;
-//#endif
-
-#define tos(a)    #a
-#define tostr(a)  tos(a)
-
-#define insert_m128i_in_m256d(ymm, xmm, pos) (_mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castpd_si256(ymm), xmm, pos)))
-#define extract_m128i_from_m256d(ymm, pos) (_mm256_extractf128_si256(_mm256_castpd_si256(ymm), pos))
-
-#define SET_CONSTANTS(){\
-  ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
-  ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\
-  TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
-  SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\
-  SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\
-  SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\
-  SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\
-  SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\
-  SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\
-  SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\
-  SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\
-  for(i = 0; i < ROUNDS512; i++)\
-  {\
-    ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
-    ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\
-  }\
-  ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\
-}while(0);
-
-/* xmm[i] will be multiplied by 2
- * xmm[j] will be lost
- * xmm[k] has to be all 0x1b
- * xmm[z] has to be zero */
-#define VMUL2(i, j, k, z){\
-  j = _mm_cmpgt_epi8(z, i);\
-  i = _mm_add_epi8(i, i);\
-  j = _mm_and_si128(j, k);\
-  i = _mm_xor_si128(i, j);\
-}/**/
-
-/* Yet another implementation of MixBytes.
-   This time we use the formulae (3) from the paper "Byte Slicing Groestl".
-   Input: a0, ..., a7
-   Output: b0, ..., b7 = MixBytes(a0,...,a7).
-   but we use the relations:
-   t_i = a_i + a_{i+3}
-   x_i = t_i + t_{i+3}
-   y_i = t_i + t+{i+2} + a_{i+6}
-   z_i = 2*x_i
-   w_i = z_i + y_{i+4}
-   v_i = 2*w_i
-   b_i = v_{i+3} + y_{i+4}
-   We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
-   and then adding v_i computed in the meantime in registers xmm0..xmm7.
-   We almost fit into 16 registers, need only 3 spills to memory.
-   This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
-   K. Matusiewicz, 2011/05/29 */
-#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
-  /* xmm"tostr(8..xmm"tostr(15 = a2 a3... a0 a1 */\
-  b0 = a2;\
-  b1 = a3;\
-  b2 = a4;\
-  b3 = a5;\
-  b4 = a6;\
-  b5 = a7;\
-  b6 = a0;\
-  b7 = a1;\
-  \
-  /* t_i = a_i + a_{i+1} */\
-  a0 = _mm_xor_si128(a0, a1);\
-  a1 = _mm_xor_si128(a1, a2);\
-  a2 = _mm_xor_si128(a2, a3);\
-  a3 = _mm_xor_si128(a3, a4);\
-  a4 = _mm_xor_si128(a4, a5);\
-  a5 = _mm_xor_si128(a5, a6);\
-  a6 = _mm_xor_si128(a6, a7);\
-  a7 = _mm_xor_si128(a7, b6);\
-  \
-  /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
-  b0 = _mm_xor_si128(b0, a4);\
-  b1 = _mm_xor_si128(b1, a5);\
-  b2 = _mm_xor_si128(b2, a6);\
-  b3 = _mm_xor_si128(b3, a7);\
-  b4 = _mm_xor_si128(b4, a0);\
-  b5 = _mm_xor_si128(b5, a1);\
-  b6 = _mm_xor_si128(b6, a2);\
-  b7 = _mm_xor_si128(b7, a3);\
-  \
-  b0 = _mm_xor_si128(b0, a6);\
-  b1 = _mm_xor_si128(b1, a7);\
-  b2 = _mm_xor_si128(b2, a0);\
-  b3 = _mm_xor_si128(b3, a1);\
-  b4 = _mm_xor_si128(b4, a2);\
-  b5 = _mm_xor_si128(b5, a3);\
-  b6 = _mm_xor_si128(b6, a4);\
-  b7 = _mm_xor_si128(b7, a5);\
-  \
-  /* spill values y_4, y_5 to memory */\
-  TEMP0 = b0;\
-  TEMP1 = b1;\
-  TEMP2 = b2;\
-  \
-  /* save values t0, t1, t2 to xmm8, xmm9 and memory */\
-  b0 = a0;\
-  b1 = a1;\
-  TEMP3 = a2;\
-  \
-  /* compute x_i = t_i + t_{i+3} */\
-  a0 = _mm_xor_si128(a0, a3);\
-  a1 = _mm_xor_si128(a1, a4);\
-  a2 = _mm_xor_si128(a2, a5);\
-  a3 = _mm_xor_si128(a3, a6);\
-  a4 = _mm_xor_si128(a4, a7);\
-  a5 = _mm_xor_si128(a5, b0);\
-  a6 = _mm_xor_si128(a6, b1);\
-  a7 = _mm_xor_si128(a7, TEMP3);\
-  \
-  /*compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
-  b1 = ALL_1B;\
-  b2 = _mm_xor_si128(b2, b2);\
-  VMUL2(a7, b0, b1, b2);\
-  VMUL2(a6, b0, b1, b2);\
-  VMUL2(a5, b0, b1, b2);\
-  VMUL2(a4, b0, b1, b2);\
-  VMUL2(a3, b0, b1, b2);\
-  VMUL2(a2, b0, b1, b2);\
-  VMUL2(a1, b0, b1, b2);\
-  VMUL2(a0, b0, b1, b2);\
-  \
-  /* compute w_i :  add y_{i+4} */\
-  a0 = _mm_xor_si128(a0, TEMP0);\
-  a1 = _mm_xor_si128(a1, TEMP1);\
-  a2 = _mm_xor_si128(a2, TEMP2);\
-  a3 = _mm_xor_si128(a3, b3);\
-  a4 = _mm_xor_si128(a4, b4);\
-  a5 = _mm_xor_si128(a5, b5);\
-  a6 = _mm_xor_si128(a6, b6);\
-  a7 = _mm_xor_si128(a7, b7);\
-  \
-  /*compute v_i: double w_i */\
-  VMUL2(a0, b0, b1, b2);\
-  VMUL2(a1, b0, b1, b2);\
-  VMUL2(a2, b0, b1, b2);\
-  VMUL2(a3, b0, b1, b2);\
-  VMUL2(a4, b0, b1, b2);\
-  VMUL2(a5, b0, b1, b2);\
-  VMUL2(a6, b0, b1, b2);\
-  VMUL2(a7, b0, b1, b2);\
-  \
-  /* add to y_4 y_5 .. v3, v4, ... */\
-  b0 = _mm_xor_si128(a3, TEMP0);\
-  b1 = _mm_xor_si128(a4, TEMP1);\
-  b2 = _mm_xor_si128(a5, TEMP2);\
-  b3 = _mm_xor_si128(b3, a6);\
-  b4 = _mm_xor_si128(b4, a7);\
-  b5 = _mm_xor_si128(b5, a0);\
-  b6 = _mm_xor_si128(b6, a1);\
-  b7 = _mm_xor_si128(b7, a2);\
-}/*MixBytes*/
-
-/* one round
- * i = round number
- * a0-a7 = input rows
- * b0-b7 = output rows
- */
-#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
-  /* Add Round Constant */\
-  b1 = ROUND_CONST_Lx;\
-  a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\
-  a1 = _mm_xor_si128(a1, b1);\
-  a2 = _mm_xor_si128(a2, b1);\
-  a3 = _mm_xor_si128(a3, b1);\
-  a4 = _mm_xor_si128(a4, b1);\
-  a5 = _mm_xor_si128(a5, b1);\
-  a6 = _mm_xor_si128(a6, b1);\
-  a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\
-  \
-  /* ShiftBytes + SubBytes (interleaved) */\
-  b0 = _mm_xor_si128(b0,  b0);\
-  a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\
-  a0 = _mm_aesenclast_si128(a0, b0);\
-  a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\
-  a1 = _mm_aesenclast_si128(a1, b0);\
-  a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\
-  a2 = _mm_aesenclast_si128(a2, b0);\
-  a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\
-  a3 = _mm_aesenclast_si128(a3, b0);\
-  a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\
-  a4 = _mm_aesenclast_si128(a4, b0);\
-  a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\
-  a5 = _mm_aesenclast_si128(a5, b0);\
-  a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\
-  a6 = _mm_aesenclast_si128(a6, b0);\
-  a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\
-  a7 = _mm_aesenclast_si128(a7, b0);\
-  \
-  /* MixBytes */\
-  MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
-}
-
-/* 10 rounds, P and Q in parallel */
-#define ROUNDS_P_Q(){\
-  ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
-  ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
-  ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
-  ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
-  ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
-  ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
-  ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
-  ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
-  ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
-  ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
-}
-
-/* Matrix Transpose Step 1
- * input is a 512-bit state with two columns in one xmm
- * output is a 512-bit state with two rows in one xmm
- * inputs: i0-i3
- * outputs: i0, o1-o3
- * clobbers: t0
- */
-#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
-  t0 = TRANSP_MASK;\
-  \
-  i0 = _mm_shuffle_epi8(i0, t0);\
-  i1 = _mm_shuffle_epi8(i1, t0);\
-  i2 = _mm_shuffle_epi8(i2, t0);\
-  i3 = _mm_shuffle_epi8(i3, t0);\
-  \
-  o1 = _mm_unpackhi_epi16(i0, i1);\
-  i0 = _mm_unpacklo_epi16(i0, i1);\
-  t0 = _mm_unpackhi_epi16(i2, i3);\
-  i2 = _mm_unpacklo_epi16(i2, i3);\
-  \
-  i0 = _mm_shuffle_epi32(i0, 216);\
-  o1 = _mm_shuffle_epi32(o1, 216);\
-  i2 = _mm_shuffle_epi32(i2, 216);\
-  t0 = _mm_shuffle_epi32(t0, 216);\
-  \
-  o2 = _mm_unpackhi_epi32(i0, i2);\
-  o3 = _mm_unpackhi_epi32(o1, t0);\
-  i0 = _mm_unpacklo_epi32(i0, i2);\
-  o1 = _mm_unpacklo_epi32(o1, t0);\
-}/**/
-
-/* Matrix Transpose Step 2
- * input are two 512-bit states with two rows in one xmm
- * output are two 512-bit states with one row of each state in one xmm
- * inputs: i0-i3 = P, i4-i7 = Q
- * outputs: (i0, o1-o7) = (P|Q)
- * possible reassignments: (output reg = input reg)
- * * i1 -> o3-7
- * * i2 -> o5-7
- * * i3 -> o7
- * * i4 -> o3-7
- * * i5 -> o6-7
- */
-#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
-  o1 = _mm_unpackhi_epi64(i0, i4);\
-  i0 = _mm_unpacklo_epi64(i0, i4);\
-  o2 = _mm_unpacklo_epi64(i1, i5);\
-  o3 = _mm_unpackhi_epi64(i1, i5);\
-  o4 = _mm_unpacklo_epi64(i2, i6);\
-  o5 = _mm_unpackhi_epi64(i2, i6);\
-  o6 = _mm_unpacklo_epi64(i3, i7);\
-  o7 = _mm_unpackhi_epi64(i3, i7);\
-}/**/
-
-/* Matrix Transpose Inverse Step 2
- * input are two 512-bit states with one row of each state in one xmm
- * output are two 512-bit states with two rows in one xmm
- * inputs: i0-i7 = (P|Q)
- * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
- */
-#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
-  o0 = _mm_unpackhi_epi64(i0, i1);\
-  i0 = _mm_unpacklo_epi64(i0, i1);\
-  o1 = _mm_unpackhi_epi64(i2, i3);\
-  i2 = _mm_unpacklo_epi64(i2, i3);\
-  o2 = _mm_unpackhi_epi64(i4, i5);\
-  i4 = _mm_unpacklo_epi64(i4, i5);\
-  o3 = _mm_unpackhi_epi64(i6, i7);\
-  i6 = _mm_unpacklo_epi64(i6, i7);\
-}/**/
-
-/* Matrix Transpose Output Step 2
- * input is one 512-bit state with two rows in one xmm
- * output is one 512-bit state with one row in the low 64-bits of one xmm
- * inputs: i0,i2,i4,i6 = S
- * outputs: (i0-7) = (0|S)
- */
-#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
-  t0 = _mm_xor_si128(t0, t0);\
-  i1 = _mm_unpackhi_epi64(i0, t0);\
-  i0 = _mm_unpacklo_epi64(i0, t0);\
-  i3 = _mm_unpackhi_epi64(i2, t0);\
-  i2 = _mm_unpacklo_epi64(i2, t0);\
-  i5 = _mm_unpackhi_epi64(i4, t0);\
-  i4 = _mm_unpacklo_epi64(i4, t0);\
-  i7 = _mm_unpackhi_epi64(i6, t0);\
-  i6 = _mm_unpacklo_epi64(i6, t0);\
-}/**/
-
-/* Matrix Transpose Output Inverse Step 2
- * input is one 512-bit state with one row in the low 64-bits of one xmm
- * output is one 512-bit state with two rows in one xmm
- * inputs: i0-i7 = (0|S)
- * outputs: (i0, i2, i4, i6) = S
- */
-#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
-  i0 = _mm_unpacklo_epi64(i0, i1);\
-  i2 = _mm_unpacklo_epi64(i2, i3);\
-  i4 = _mm_unpacklo_epi64(i4, i5);\
-  i6 = _mm_unpacklo_epi64(i6, i7);\
-}/**/
-
-
-void INIT256(u64* h)
-{
-  __m128i* const chaining = (__m128i*) h;
-  static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7;
-  static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15;
-
-  /* load IV into registers xmm12 - xmm15 */
-  xmm12 = chaining[0];
-  xmm13 = chaining[1];
-  xmm14 = chaining[2];
-  xmm15 = chaining[3];
-
-  /* transform chaining value from column ordering into row ordering */
-  /* we put two rows (64 bit) of the IV into one 128-bit XMM register */
-  Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
-
-  /* store transposed IV */
-  chaining[0] = xmm12;
-  chaining[1] = xmm2;
-  chaining[2] = xmm6;
-  chaining[3] = xmm7;
-}
-
-void TF512(u64* h, u64* m)
-{
-  __m128i* const chaining = (__m128i*) h;
-  __m128i* const message = (__m128i*) m;
-  static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-  static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
-  static __m128i TEMP0;
-  static __m128i TEMP1;
-  static __m128i TEMP2;
-  static __m128i TEMP3;
-
-#ifdef IACA_TRACE
-  IACA_START;
-#endif
-
-  /* load message into registers xmm12 - xmm15 */
-  xmm12 = message[0];
-  xmm13 = message[1];
-  xmm14 = message[2];
-  xmm15 = message[3];
-
-  /* transform message M from column ordering into row ordering */
-  /* we first put two rows (64 bit) of the message into one 128-bit xmm register */
-  Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
-
-  /* load previous chaining value and xor message to CV to get input of P */
-  /* we first put two rows (2x64 bit) of the CV into one 128-bit xmm register */
-  /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
-  xmm8 = _mm_xor_si128(xmm12, chaining[0]);
-  xmm0 = _mm_xor_si128(xmm2,  chaining[1]);
-  xmm4 = _mm_xor_si128(xmm6,  chaining[2]);
-  xmm5 = _mm_xor_si128(xmm7,  chaining[3]);
-
-  /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
-  /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
-  /* result: the 8 rows of P and Q in xmm8 - xmm12 */
-  Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
-
-  /* compute the two permutations P and Q in parallel */
-  ROUNDS_P_Q();
-
-  /* unpack again to get two rows of P or two rows of Q in one xmm register */
-  Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3);
-
-  /* xor output of P and Q */
-  /* result: P(CV+M)+Q(M) in xmm0...xmm3 */
-  xmm0 = _mm_xor_si128(xmm0, xmm8);
-  xmm1 = _mm_xor_si128(xmm1, xmm10);
-  xmm2 = _mm_xor_si128(xmm2, xmm12);
-  xmm3 = _mm_xor_si128(xmm3, xmm14);
-
-  /* xor CV (feed-forward) */
-  /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
-  xmm0 = _mm_xor_si128(xmm0, chaining[0]);
-  xmm1 = _mm_xor_si128(xmm1, chaining[1]);
-  xmm2 = _mm_xor_si128(xmm2, chaining[2]);
-  xmm3 = _mm_xor_si128(xmm3, chaining[3]);
-
-  /* store CV */
-  chaining[0] = xmm0;
-  chaining[1] = xmm1;
-  chaining[2] = xmm2;
-  chaining[3] = xmm3;
-
-#ifdef IACA_TRACE
-  IACA_END;
-#endif
-  return;
-}
-
-void OF512(u64* h)
-{
-  __m128i* const chaining = (__m128i*) h;
-  static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-  static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
-  static __m128i TEMP0;
-  static __m128i TEMP1;
-  static __m128i TEMP2;
-  static __m128i TEMP3;
-
-  /* load CV into registers xmm8, xmm10, xmm12, xmm14 */
-  xmm8 = chaining[0];
-  xmm10 = chaining[1];
-  xmm12 = chaining[2];
-  xmm14 = chaining[3];
-
-  /* there are now 2 rows of the CV in one xmm register */
-  /* unpack to get 1 row of P (64 bit) into one half of an xmm register */
-  /* result: the 8 input rows of P in xmm8 - xmm15 */
-  Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0);
-
-  /* compute the permutation P */
-  /* result: the output of P(CV) in xmm8 - xmm15 */
-  ROUNDS_P_Q();
-
-  /* unpack again to get two rows of P in one xmm register */
-  /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
-  Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
-
-  /* xor CV to P output (feed-forward) */
-  /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
-  xmm8 = _mm_xor_si128(xmm8,  (chaining[0]));
-  xmm10 = _mm_xor_si128(xmm10, (chaining[1]));
-  xmm12 = _mm_xor_si128(xmm12, (chaining[2]));
-  xmm14 = _mm_xor_si128(xmm14, (chaining[3]));
-
-  /* transform state back from row ordering into column ordering */
-  /* result: final hash value in xmm9, xmm11 */
-  Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0);
-
-  /* we only need to return the truncated half of the state */
-  chaining[2] = xmm9;
-  chaining[3] = xmm11;
-}
-
-
--- a/algo/groestl/aes_ni/groestl256-intr-vperm.h
+++ b/algo/groestl/aes_ni/groestl256-intr-vperm.h
@@ -1,793 +0,0 @@
-/* groestl-intr-vperm.h     Aug 2011
- *
- * Groestl implementation with intrinsics using ssse3 instructions.
- * Author: Günther A. Roland, Martin Schläffer
- *
- * Based on the vperm and aes_ni implementations of the hash function Groestl
- * by Cagdas Calik <ccalik@metu.edu.tr> http://www.metu.edu.tr/~ccalik/
- * Institute of Applied Mathematics, Middle East Technical University, Turkey
- *
- * This code is placed in the public domain
- */
-
-#include <tmmintrin.h>
-#include "hash-groestl256.h"
-
-/* global constants  */
-__m128i ROUND_CONST_Lx;
-__m128i ROUND_CONST_L0[ROUNDS512];
-__m128i ROUND_CONST_L7[ROUNDS512];
-__m128i ROUND_CONST_P[ROUNDS1024];
-__m128i ROUND_CONST_Q[ROUNDS1024];
-__m128i TRANSP_MASK;
-__m128i SUBSH_MASK[8];
-__m128i ALL_0F;
-__m128i ALL_15;
-__m128i ALL_1B;
-__m128i ALL_63;
-__m128i ALL_FF;
-__m128i VPERM_IPT[2];
-__m128i VPERM_OPT[2];
-__m128i VPERM_INV[2];
-__m128i VPERM_SB1[2];
-__m128i VPERM_SB2[2];
-__m128i VPERM_SB4[2];
-__m128i VPERM_SBO[2];
-
-
-#define tos(a)    #a
-#define tostr(a)  tos(a)
-
-#define SET_SHARED_CONSTANTS(){\
-  TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
-  ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
-  ALL_63 = _mm_set_epi32(0x63636363, 0x63636363, 0x63636363, 0x63636363);\
-  ALL_0F = _mm_set_epi32(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f);\
-  ALL_15 = _mm_set_epi32(0x15151515, 0x15151515, 0x15151515, 0x15151515);\
-  VPERM_IPT[0] = _mm_set_epi32(0xCD80B1FC, 0xB0FDCC81, 0x4C01307D, 0x317C4D00);\
-  VPERM_IPT[1] = _mm_set_epi32(0xCABAE090, 0x52227808, 0xC2B2E898, 0x5A2A7000);\
-  VPERM_OPT[0] = _mm_set_epi32(0xE10D5DB1, 0xB05C0CE0, 0x01EDBD51, 0x50BCEC00);\
-  VPERM_OPT[1] = _mm_set_epi32(0xF7974121, 0xDEBE6808, 0xFF9F4929, 0xD6B66000);\
-  VPERM_INV[0] = _mm_set_epi32(0x030D0E0C, 0x02050809, 0x01040A06, 0x0F0B0780);\
-  VPERM_INV[1] = _mm_set_epi32(0x04070309, 0x0A0B0C02, 0x0E05060F, 0x0D080180);\
-  VPERM_SB1[0] = _mm_set_epi32(0x3BF7CCC1, 0x0D2ED9EF, 0x3618D415, 0xFAE22300);\
-  VPERM_SB1[1] = _mm_set_epi32(0xA5DF7A6E, 0x142AF544, 0xB19BE18F, 0xCB503E00);\
-  VPERM_SB2[0] = _mm_set_epi32(0xC2A163C8, 0xAB82234A, 0x69EB8840, 0x0AE12900);\
-  VPERM_SB2[1] = _mm_set_epi32(0x5EB7E955, 0xBC982FCD, 0xE27A93C6, 0x0B712400);\
-  VPERM_SB4[0] = _mm_set_epi32(0xBA44FE79, 0x876D2914, 0x3D50AED7, 0xC393EA00);\
-  VPERM_SB4[1] = _mm_set_epi32(0xA876DE97, 0x49087E9F, 0xE1E937A0, 0x3FD64100);\
-}/**/
-
-/* VPERM
- * Transform w/o settings c*
- * transforms 2 rows to/from "vperm mode"
- * this function is derived from:
- *   vperm and aes_ni implementations of hash function Grostl
- *   by Cagdas CALIK
- * inputs:
- * a0, a1 = 2 rows
- * table = transformation table to use
- * t*, c* = clobbers
- * outputs:
- * a0, a1 = 2 rows transformed with table
- * */
-#define VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2){\
-  t0 = c0;\
-  t1 = c0;\
-  t0 = _mm_andnot_si128(t0, a0);\
-  t1 = _mm_andnot_si128(t1, a1);\
-  t0 = _mm_srli_epi32(t0, 4);\
-  t1 = _mm_srli_epi32(t1, 4);\
-  a0 = _mm_and_si128(a0, c0);\
-  a1 = _mm_and_si128(a1, c0);\
-  t2 = c2;\
-  t3 = c2;\
-  t2 = _mm_shuffle_epi8(t2, a0);\
-  t3 = _mm_shuffle_epi8(t3, a1);\
-  a0 = c1;\
-  a1 = c1;\
-  a0 = _mm_shuffle_epi8(a0, t0);\
-  a1 = _mm_shuffle_epi8(a1, t1);\
-  a0 = _mm_xor_si128(a0, t2);\
-  a1 = _mm_xor_si128(a1, t3);\
-}/**/
-
-#define VPERM_Transform_Set_Const(table, c0, c1, c2){\
-  c0 = ALL_0F;\
-  c1 = ((__m128i*) table )[0];\
-  c2 = ((__m128i*) table )[1];\
-}/**/
-
-/* VPERM
- * Transform
- * transforms 2 rows to/from "vperm mode"
- * this function is derived from:
- *   vperm and aes_ni implementations of hash function Grostl
- *   by Cagdas CALIK
- * inputs:
- * a0, a1 = 2 rows
- * table = transformation table to use
- * t*, c* = clobbers
- * outputs:
- * a0, a1 = 2 rows transformed with table
- * */
-#define VPERM_Transform(a0, a1, table, t0, t1, t2, t3, c0, c1, c2){\
-  VPERM_Transform_Set_Const(table, c0, c1, c2);\
-  VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\
-}/**/
-
-/* VPERM
- * Transform State
- * inputs:
- * a0-a3 = state
- * table = transformation table to use
- * t* = clobbers
- * outputs:
- * a0-a3 = transformed state
- * */
-#define VPERM_Transform_State(a0, a1, a2, a3, table, t0, t1, t2, t3, c0, c1, c2){\
-  VPERM_Transform_Set_Const(table, c0, c1, c2);\
-  VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\
-  VPERM_Transform_No_Const(a2, a3, t0, t1, t2, t3, c0, c1, c2);\
-}/**/
-
-/* VPERM
- * Add Constant to State
- * inputs:
- * a0-a7 = state
- * constant = constant to add
- * t0 = clobber
- * outputs:
- * a0-a7 = state + constant
- * */
-#define VPERM_Add_Constant(a0, a1, a2, a3, a4, a5, a6, a7, constant, t0){\
-  t0 = constant;\
-  a0 = _mm_xor_si128(a0,  t0);\
-  a1 = _mm_xor_si128(a1,  t0);\
-  a2 = _mm_xor_si128(a2,  t0);\
-  a3 = _mm_xor_si128(a3,  t0);\
-  a4 = _mm_xor_si128(a4,  t0);\
-  a5 = _mm_xor_si128(a5,  t0);\
-  a6 = _mm_xor_si128(a6,  t0);\
-  a7 = _mm_xor_si128(a7,  t0);\
-}/**/
-
-/* VPERM
- * Set Substitute Core Constants
- * */
-#define VPERM_Substitute_Core_Set_Const(c0, c1, c2){\
-  VPERM_Transform_Set_Const(VPERM_INV, c0, c1, c2);\
-}/**/
-
-/* VPERM
- * Substitute Core
- * first part of sbox inverse computation
- * this function is derived from:
- *   vperm and aes_ni implementations of hash function Grostl
- *   by Cagdas CALIK
- * inputs:
- * a0 = 1 row
- * t*, c* = clobbers
- * outputs:
- * b0a, b0b = inputs for lookup step
- * */
-#define VPERM_Substitute_Core(a0, b0a, b0b, t0, t1, c0, c1, c2){\
-  t0 = c0;\
-  t0 = _mm_andnot_si128(t0, a0);\
-  t0 = _mm_srli_epi32(t0, 4);\
-  a0 = _mm_and_si128(a0,  c0);\
-  b0a = c1;\
-  b0a = _mm_shuffle_epi8(b0a, a0);\
-  a0 = _mm_xor_si128(a0,  t0);\
-  b0b = c2;\
-  b0b = _mm_shuffle_epi8(b0b, t0);\
-  b0b = _mm_xor_si128(b0b, b0a);\
-  t1 = c2;\
-  t1 = _mm_shuffle_epi8(t1,  a0);\
-  t1 = _mm_xor_si128(t1,  b0a);\
-  b0a = c2;\
-  b0a = _mm_shuffle_epi8(b0a, b0b);\
-  b0a = _mm_xor_si128(b0a, a0);\
-  b0b = c2;\
-  b0b = _mm_shuffle_epi8(b0b, t1);\
-  b0b = _mm_xor_si128(b0b, t0);\
-}/**/
-
-/* VPERM
- * Lookup
- * second part of sbox inverse computation
- * this function is derived from:
- *   vperm and aes_ni implementations of hash function Grostl
- *   by Cagdas CALIK
- * inputs:
- * a0a, a0b = output of Substitution Core
- * table = lookup table to use (*1 / *2 / *4)
- * t0 = clobber
- * outputs:
- * b0 = output of sbox + multiplication
- * */
-#define VPERM_Lookup(a0a, a0b, table, b0, t0){\
-  b0 = ((__m128i*) table )[0];\
-  t0 = ((__m128i*) table )[1];\
-  b0 = _mm_shuffle_epi8(b0, a0b);\
-  t0 = _mm_shuffle_epi8(t0, a0a);\
-  b0 = _mm_xor_si128(b0, t0);\
-}/**/
-
-/* VPERM
- * SubBytes and *2 / *4
- * this function is derived from:
- *   Constant-time SSSE3 AES core implementation
- *   by Mike Hamburg
- * and
- *   vperm and aes_ni implementations of hash function Grostl
- *   by Cagdas CALIK
- * inputs:
- * a0-a7 = state
- * t*, c* = clobbers
- * outputs:
- * a0-a7 = state * 4
- * c2 = row0 * 2 -> b0
- * c1 = row7 * 2 -> b3
- * c0 = row7 * 1 -> b4
- * t2 = row4 * 1 -> b7
- * TEMP_MUL1 = row(i) * 1
- * TEMP_MUL2 = row(i) * 2
- *
- * call:VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7) */
-#define VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, t0, t1, t3, t4, c2, c1, c0, t2){\
-  /* set Constants */\
-  VPERM_Substitute_Core_Set_Const(c0, c1, c2);\
-  /* row 1 */\
-  VPERM_Substitute_Core(a1, t0, t1, t3, t4, c0, c1, c2);\
-  VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
-  TEMP_MUL1[1] = t2;\
-  VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
-  TEMP_MUL2[1] = t3;\
-  VPERM_Lookup(t0, t1, VPERM_SB4, a1, t4);\
-  /* --- */\
-  /* row 2 */\
-  VPERM_Substitute_Core(a2, t0, t1, t3, t4, c0, c1, c2);\
-  VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
-  TEMP_MUL1[2] = t2;\
-  VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
-  TEMP_MUL2[2] = t3;\
-  VPERM_Lookup(t0, t1, VPERM_SB4, a2, t4);\
-  /* --- */\
-  /* row 3 */\
-  VPERM_Substitute_Core(a3, t0, t1, t3, t4, c0, c1, c2);\
-  VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
-  TEMP_MUL1[3] = t2;\
-  VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
-  TEMP_MUL2[3] = t3;\
-  VPERM_Lookup(t0, t1, VPERM_SB4, a3, t4);\
-  /* --- */\
-  /* row 5 */\
-  VPERM_Substitute_Core(a5, t0, t1, t3, t4, c0, c1, c2);\
-  VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
-  TEMP_MUL1[5] = t2;\
-  VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
-  TEMP_MUL2[5] = t3;\
-  VPERM_Lookup(t0, t1, VPERM_SB4, a5, t4);\
-  /* --- */\
-  /* row 6 */\
-  VPERM_Substitute_Core(a6, t0, t1, t3, t4, c0, c1, c2);\
-  VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
-  TEMP_MUL1[6] = t2;\
-  VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
-  TEMP_MUL2[6] = t3;\
-  VPERM_Lookup(t0, t1, VPERM_SB4, a6, t4);\
-  /* --- */\
-  /* row 7 */\
-  VPERM_Substitute_Core(a7, t0, t1, t3, t4, c0, c1, c2);\
-  VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
-  TEMP_MUL1[7] = t2;\
-  VPERM_Lookup(t0, t1, VPERM_SB2, c1, t4); /*c1 -> b3*/\
-  VPERM_Lookup(t0, t1, VPERM_SB4, a7, t4);\
-  /* --- */\
-  /* row 4 */\
-  VPERM_Substitute_Core(a4, t0, t1, t3, t4, c0, (VPERM_INV[0]), c2);\
-  VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4); /*t2 -> b7*/\
-  VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
-  TEMP_MUL2[4] = t3;\
-  VPERM_Lookup(t0, t1, VPERM_SB4, a4, t4);\
-  /* --- */\
-  /* row 0 */\
-  VPERM_Substitute_Core(a0, t0, t1, t3, t4, c0, (VPERM_INV[0]), c2);\
-  VPERM_Lookup(t0, t1, VPERM_SB1, c0, t4); /*c0 -> b4*/\
-  VPERM_Lookup(t0, t1, VPERM_SB2, c2, t4); /*c2 -> b0*/\
-  TEMP_MUL2[0] = c2;\
-  VPERM_Lookup(t0, t1, VPERM_SB4, a0, t4);\
-  /* --- */\
-}/**/
-
-
-/* Optimized MixBytes
- * inputs:
- * a0-a7 = (row0-row7) * 4
- * b0 = row0 * 2
- * b3 = row7 * 2
- * b4 = row7 * 1
- * b7 = row4 * 1
- * all *1 and *2 values must also be in TEMP_MUL1, TEMP_MUL2
- * output: b0-b7
- * */
-#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
-  /* save one value */\
-  TEMP_MUL4 = a3;\
-  /* 1 */\
-  b1 = a0;\
-  b1 = _mm_xor_si128(b1, a5);\
-  b1 = _mm_xor_si128(b1, b4); /* -> helper! */\
-  b1 = _mm_xor_si128(b1, (TEMP_MUL2[3]));\
-  b2 = b1;\
-  \
-  /* 2 */\
-  b5 = a1;\
-  b5 = _mm_xor_si128(b5, a4);\
-  b5 = _mm_xor_si128(b5, b7); /* -> helper! */\
-  b5 = _mm_xor_si128(b5, b3); /* -> helper! */\
-  b6 = b5;\
-  \
-  /* 4 */\
-  b7 = _mm_xor_si128(b7, a6);\
-  /*b7 = _mm_xor_si128(b7, (TEMP_MUL1[4])); -> helper! */\
-  b7 = _mm_xor_si128(b7, (TEMP_MUL1[6]));\
-  b7 = _mm_xor_si128(b7, (TEMP_MUL2[1]));\
-  b7 = _mm_xor_si128(b7, b3); /* -> helper! */\
-  b2 = _mm_xor_si128(b2, b7);\
-  \
-  /* 3 */\
-  b0 = _mm_xor_si128(b0, a7);\
-  b0 = _mm_xor_si128(b0, (TEMP_MUL1[5]));\
-  b0 = _mm_xor_si128(b0, (TEMP_MUL1[7]));\
-  /*b0 = _mm_xor_si128(b0, (TEMP_MUL2[0])); -> helper! */\
-  b0 = _mm_xor_si128(b0, (TEMP_MUL2[2]));\
-  b3 = b0;\
-  b1 = _mm_xor_si128(b1, b0);\
-  b0 = _mm_xor_si128(b0, b7); /* moved from 4 */\
-  \
-  /* 5 */\
-  b4 = _mm_xor_si128(b4, a2);\
-  /*b4 = _mm_xor_si128(b4, (TEMP_MUL1[0])); -> helper! */\
-  b4 = _mm_xor_si128(b4, (TEMP_MUL1[2]));\
-  b4 = _mm_xor_si128(b4, (TEMP_MUL2[3]));\
-  b4 = _mm_xor_si128(b4, (TEMP_MUL2[5]));\
-  b3 = _mm_xor_si128(b3, b4);\
-  b6 = _mm_xor_si128(b6, b4);\
-  \
-  /* 6 */\
-  a3 = _mm_xor_si128(a3, (TEMP_MUL1[1]));\
-  a3 = _mm_xor_si128(a3, (TEMP_MUL1[3]));\
-  a3 = _mm_xor_si128(a3, (TEMP_MUL2[4]));\
-  a3 = _mm_xor_si128(a3, (TEMP_MUL2[6]));\
-  b4 = _mm_xor_si128(b4, a3);\
-  b5 = _mm_xor_si128(b5, a3);\
-  b7 = _mm_xor_si128(b7, a3);\
-  \
-  /* 7 */\
-  a1 = _mm_xor_si128(a1, (TEMP_MUL1[1]));\
-  a1 = _mm_xor_si128(a1, (TEMP_MUL2[4]));\
-  b2 = _mm_xor_si128(b2, a1);\
-  b3 = _mm_xor_si128(b3, a1);\
-  \
-  /* 8 */\
-  a5 = _mm_xor_si128(a5, (TEMP_MUL1[5]));\
-  a5 = _mm_xor_si128(a5, (TEMP_MUL2[0]));\
-  b6 = _mm_xor_si128(b6, a5);\
-  b7 = _mm_xor_si128(b7, a5);\
-  \
-  /* 9 */\
-  a3 = TEMP_MUL1[2];\
-  a3 = _mm_xor_si128(a3, (TEMP_MUL2[5]));\
-  b0 = _mm_xor_si128(b0, a3);\
-  b5 = _mm_xor_si128(b5, a3);\
-  \
-  /* 10 */\
-  a1 = TEMP_MUL1[6];\
-  a1 = _mm_xor_si128(a1, (TEMP_MUL2[1]));\
-  b1 = _mm_xor_si128(b1, a1);\
-  b4 = _mm_xor_si128(b4, a1);\
-  \
-  /* 11 */\
-  a5 = TEMP_MUL1[3];\
-  a5 = _mm_xor_si128(a5, (TEMP_MUL2[6]));\
-  b1 = _mm_xor_si128(b1, a5);\
-  b6 = _mm_xor_si128(b6, a5);\
-  \
-  /* 12 */\
-  a3 = TEMP_MUL1[7];\
-  a3 = _mm_xor_si128(a3, (TEMP_MUL2[2]));\
-  b2 = _mm_xor_si128(b2, a3);\
-  b5 = _mm_xor_si128(b5, a3);\
-  \
-  /* 13 */\
-  b0 = _mm_xor_si128(b0, (TEMP_MUL4));\
-  b0 = _mm_xor_si128(b0, a4);\
-  b1 = _mm_xor_si128(b1, a4);\
-  b3 = _mm_xor_si128(b3, a6);\
-  b4 = _mm_xor_si128(b4, a0);\
-  b4 = _mm_xor_si128(b4, a7);\
-  b5 = _mm_xor_si128(b5, a0);\
-  b7 = _mm_xor_si128(b7, a2);\
-}/**/
-
-#define SET_CONSTANTS(){\
-  SET_SHARED_CONSTANTS();\
-  SUBSH_MASK[0] = _mm_set_epi32(0x080f0e0d, 0x0c0b0a09, 0x07060504, 0x03020100);\
-  SUBSH_MASK[1] = _mm_set_epi32(0x0a09080f, 0x0e0d0c0b, 0x00070605, 0x04030201);\
-  SUBSH_MASK[2] = _mm_set_epi32(0x0c0b0a09, 0x080f0e0d, 0x01000706, 0x05040302);\
-  SUBSH_MASK[3] = _mm_set_epi32(0x0e0d0c0b, 0x0a09080f, 0x02010007, 0x06050403);\
-  SUBSH_MASK[4] = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x03020100, 0x07060504);\
-  SUBSH_MASK[5] = _mm_set_epi32(0x09080f0e, 0x0d0c0b0a, 0x04030201, 0x00070605);\
-  SUBSH_MASK[6] = _mm_set_epi32(0x0b0a0908, 0x0f0e0d0c, 0x05040302, 0x01000706);\
-  SUBSH_MASK[7] = _mm_set_epi32(0x0d0c0b0a, 0x09080f0e, 0x06050403, 0x02010007);\
-  for(i = 0; i < ROUNDS512; i++)\
-  {\
-    ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
-    ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\
-  }\
-  ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\
-}/**/
-
-/* vperm:
- * transformation before rounds with ipt
- * first round add transformed constant
- * middle rounds: add constant XOR 0x15...15
- * last round: additionally add 0x15...15 after MB
- * transformation after rounds with opt
- */
-/* one round
- * i = round number
- * a0-a7 = input rows
- * b0-b7 = output rows
- */
-#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
-  /* AddRoundConstant + ShiftBytes (interleaved) */\
-  b1 = ROUND_CONST_Lx;\
-  a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\
-  a1 = _mm_xor_si128(a1, b1);\
-  a2 = _mm_xor_si128(a2, b1);\
-  a3 = _mm_xor_si128(a3, b1);\
-  a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\
-  a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\
-  a4 = _mm_xor_si128(a4, b1);\
-  a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\
-  a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\
-  a5 = _mm_xor_si128(a5, b1);\
-  a6 = _mm_xor_si128(a6, b1);\
-  a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\
-  a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\
-  a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\
-  a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\
-  a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\
-  /* SubBytes + Multiplication by 2 and 4 */\
-  VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\
-  /* MixBytes */\
-  MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
-}/**/
-
-/* 10 rounds, P and Q in parallel */
-#define ROUNDS_P_Q(){\
-  VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm0);\
-  ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
-  ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
-  ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
-  ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
-  ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
-  ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
-  ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
-  ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
-  ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
-  ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
-  VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm0);\
-}
-
-
-/* Matrix Transpose Step 1
- * input is a 512-bit state with two columns in one xmm
- * output is a 512-bit state with two rows in one xmm
- * inputs: i0-i3
- * outputs: i0, o1-o3
- * clobbers: t0
- */
-#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
-  t0 = TRANSP_MASK;\
-\
-  i0 = _mm_shuffle_epi8(i0, t0);\
-  i1 = _mm_shuffle_epi8(i1, t0);\
-  i2 = _mm_shuffle_epi8(i2, t0);\
-  i3 = _mm_shuffle_epi8(i3, t0);\
-\
-  o1 = i0;\
-  t0 = i2;\
-\
-  i0 = _mm_unpacklo_epi16(i0, i1);\
-  o1 = _mm_unpackhi_epi16(o1, i1);\
-  i2 = _mm_unpacklo_epi16(i2, i3);\
-  t0 = _mm_unpackhi_epi16(t0, i3);\
-\
-  i0 = _mm_shuffle_epi32(i0, 216);\
-  o1 = _mm_shuffle_epi32(o1, 216);\
-  i2 = _mm_shuffle_epi32(i2, 216);\
-  t0 = _mm_shuffle_epi32(t0, 216);\
-\
-  o2 = i0;\
-  o3 = o1;\
-\
-  i0 = _mm_unpacklo_epi32(i0, i2);\
-  o1 = _mm_unpacklo_epi32(o1, t0);\
-  o2 = _mm_unpackhi_epi32(o2, i2);\
-  o3 = _mm_unpackhi_epi32(o3, t0);\
-}/**/
-
-/* Matrix Transpose Step 2
- * input are two 512-bit states with two rows in one xmm
- * output are two 512-bit states with one row of each state in one xmm
- * inputs: i0-i3 = P, i4-i7 = Q
- * outputs: (i0, o1-o7) = (P|Q)
- * possible reassignments: (output reg = input reg)
- * * i1 -> o3-7
- * * i2 -> o5-7
- * * i3 -> o7
- * * i4 -> o3-7
- * * i5 -> o6-7
- */
-#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
-  o1 = i0;\
-  o2 = i1;\
-  i0 = _mm_unpacklo_epi64(i0, i4);\
-  o1 = _mm_unpackhi_epi64(o1, i4);\
-  o3 = i1;\
-  o4 = i2;\
-  o2 = _mm_unpacklo_epi64(o2, i5);\
-  o3 = _mm_unpackhi_epi64(o3, i5);\
-  o5 = i2;\
-  o6 = i3;\
-  o4 = _mm_unpacklo_epi64(o4, i6);\
-  o5 = _mm_unpackhi_epi64(o5, i6);\
-  o7 = i3;\
-  o6 = _mm_unpacklo_epi64(o6, i7);\
-  o7 = _mm_unpackhi_epi64(o7, i7);\
-}/**/
-
-/* Matrix Transpose Inverse Step 2
- * input are two 512-bit states with one row of each state in one xmm
- * output are two 512-bit states with two rows in one xmm
- * inputs: i0-i7 = (P|Q)
- * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
- */
-#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
-  o0 = i0;\
-  i0 = _mm_unpacklo_epi64(i0, i1);\
-  o0 = _mm_unpackhi_epi64(o0, i1);\
-  o1 = i2;\
-  i2 = _mm_unpacklo_epi64(i2, i3);\
-  o1 = _mm_unpackhi_epi64(o1, i3);\
-  o2 = i4;\
-  i4 = _mm_unpacklo_epi64(i4, i5);\
-  o2 = _mm_unpackhi_epi64(o2, i5);\
-  o3 = i6;\
-  i6 = _mm_unpacklo_epi64(i6, i7);\
-  o3 = _mm_unpackhi_epi64(o3, i7);\
-}/**/
-
-/* Matrix Transpose Output Step 2
- * input is one 512-bit state with two rows in one xmm
- * output is one 512-bit state with one row in the low 64-bits of one xmm
- * inputs: i0,i2,i4,i6 = S
- * outputs: (i0-7) = (0|S)
- */
-#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
-  t0 = _mm_xor_si128(t0, t0);\
-  i1 = i0;\
-  i3 = i2;\
-  i5 = i4;\
-  i7 = i6;\
-  i0 = _mm_unpacklo_epi64(i0, t0);\
-  i1 = _mm_unpackhi_epi64(i1, t0);\
-  i2 = _mm_unpacklo_epi64(i2, t0);\
-  i3 = _mm_unpackhi_epi64(i3, t0);\
-  i4 = _mm_unpacklo_epi64(i4, t0);\
-  i5 = _mm_unpackhi_epi64(i5, t0);\
-  i6 = _mm_unpacklo_epi64(i6, t0);\
-  i7 = _mm_unpackhi_epi64(i7, t0);\
-}/**/
-
-/* Matrix Transpose Output Inverse Step 2
- * input is one 512-bit state with one row in the low 64-bits of one xmm
- * output is one 512-bit state with two rows in one xmm
- * inputs: i0-i7 = (0|S)
- * outputs: (i0, i2, i4, i6) = S
- */
-#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
-  i0 = _mm_unpacklo_epi64(i0, i1);\
-  i2 = _mm_unpacklo_epi64(i2, i3);\
-  i4 = _mm_unpacklo_epi64(i4, i5);\
-  i6 = _mm_unpacklo_epi64(i6, i7);\
-}/**/
-
-
-/* transform round constants into VPERM mode */
-#define VPERM_Transform_RoundConst_CNT2(i, j){\
-  xmm0 = ROUND_CONST_L0[i];\
-  xmm1 = ROUND_CONST_L7[i];\
-  xmm2 = ROUND_CONST_L0[j];\
-  xmm3 = ROUND_CONST_L7[j];\
-  VPERM_Transform_State(xmm0, xmm1, xmm2, xmm3, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\
-  xmm0 = _mm_xor_si128(xmm0, (ALL_15));\
-  xmm1 = _mm_xor_si128(xmm1, (ALL_15));\
-  xmm2 = _mm_xor_si128(xmm2, (ALL_15));\
-  xmm3 = _mm_xor_si128(xmm3, (ALL_15));\
-  ROUND_CONST_L0[i] = xmm0;\
-  ROUND_CONST_L7[i] = xmm1;\
-  ROUND_CONST_L0[j] = xmm2;\
-  ROUND_CONST_L7[j] = xmm3;\
-}/**/
-
-/* transform round constants into VPERM mode */
-#define VPERM_Transform_RoundConst(){\
-  xmm0 = ROUND_CONST_Lx;\
-  VPERM_Transform(xmm0, xmm1, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\
-  xmm0 = _mm_xor_si128(xmm0, (ALL_15));\
-  ROUND_CONST_Lx = xmm0;\
-  VPERM_Transform_RoundConst_CNT2(0, 1);\
-  VPERM_Transform_RoundConst_CNT2(2, 3);\
-  VPERM_Transform_RoundConst_CNT2(4, 5);\
-  VPERM_Transform_RoundConst_CNT2(6, 7);\
-  VPERM_Transform_RoundConst_CNT2(8, 9);\
-}/**/
-
-void INIT256(u64* h)
-{
-  __m128i* const chaining = (__m128i*) h;
-  static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-  static __m128i xmm8, xmm9, xmm10, /*xmm11,*/ xmm12, xmm13, xmm14, xmm15;
-
-  /* transform round constants into VPERM mode */
-  VPERM_Transform_RoundConst();
-
-  /* load IV into registers xmm12 - xmm15 */
-  xmm12 = chaining[0];
-  xmm13 = chaining[1];
-  xmm14 = chaining[2];
-  xmm15 = chaining[3];
-
-  /* transform chaining value from column ordering into row ordering */
-  /* we put two rows (64 bit) of the IV into one 128-bit XMM register */
-  VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
-  Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
-
-  /* store transposed IV */
-  chaining[0] = xmm12;
-  chaining[1] = xmm2;
-  chaining[2] = xmm6;
-  chaining[3] = xmm7;
-}
-
-void TF512(u64* h, u64* m)
-{
-  __m128i* const chaining = (__m128i*) h;
-  __m128i* const message = (__m128i*) m;
-  static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-  static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
-  static __m128i TEMP_MUL1[8];
-  static __m128i TEMP_MUL2[8];
-  static __m128i TEMP_MUL4;
-
-#ifdef IACA_TRACE
-  IACA_START;
-#endif
-
-  /* load message into registers xmm12 - xmm15 */
-  xmm12 = message[0];
-  xmm13 = message[1];
-  xmm14 = message[2];
-  xmm15 = message[3];
-
-  /* transform message M from column ordering into row ordering */
-  /* we first put two rows (64 bit) of the message into one 128-bit xmm register */
-  VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
-  Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
-
-  /* load previous chaining value */
-  /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */
-  xmm8 = chaining[0];
-  xmm0 = chaining[1];
-  xmm4 = chaining[2];
-  xmm5 = chaining[3];
-
-  /* xor message to CV get input of P */
-  /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
-  xmm8 = _mm_xor_si128(xmm8, xmm12);
-  xmm0 = _mm_xor_si128(xmm0, xmm2);
-  xmm4 = _mm_xor_si128(xmm4, xmm6);
-  xmm5 = _mm_xor_si128(xmm5, xmm7);
-
-  /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
-  /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
-  /* result: the 8 rows of P and Q in xmm8 - xmm12 */
-  Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
-
-  /* compute the two permutations P and Q in parallel */
-  ROUNDS_P_Q();
-
-  /* unpack again to get two rows of P or two rows of Q in one xmm register */
-  Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3);
-
-  /* xor output of P and Q */
-  /* result: P(CV+M)+Q(M) in xmm0...xmm3 */
-  xmm0 = _mm_xor_si128(xmm0, xmm8);
-  xmm1 = _mm_xor_si128(xmm1, xmm10);
-  xmm2 = _mm_xor_si128(xmm2, xmm12);
-  xmm3 = _mm_xor_si128(xmm3, xmm14);
-
-  /* xor CV (feed-forward) */
-  /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
-  xmm0 = _mm_xor_si128(xmm0, (chaining[0]));
-  xmm1 = _mm_xor_si128(xmm1, (chaining[1]));
-  xmm2 = _mm_xor_si128(xmm2, (chaining[2]));
-  xmm3 = _mm_xor_si128(xmm3, (chaining[3]));
-
-  /* store CV */
-  chaining[0] = xmm0;
-  chaining[1] = xmm1;
-  chaining[2] = xmm2;
-  chaining[3] = xmm3;
-
-#ifdef IACA_TRACE
-  IACA_END;
-#endif
-
-  return;
-}
-
-void OF512(u64* h)
-{
-  __m128i* const chaining = (__m128i*) h;
-  static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-  static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
-  static __m128i TEMP_MUL1[8];
-  static __m128i TEMP_MUL2[8];
-  static __m128i TEMP_MUL4;
-
-  /* load CV into registers xmm8, xmm10, xmm12, xmm14 */
-  xmm8 = chaining[0];
-  xmm10 = chaining[1];
-  xmm12 = chaining[2];
-  xmm14 = chaining[3];
-
-  /* there are now 2 rows of the CV in one xmm register */
-  /* unpack to get 1 row of P (64 bit) into one half of an xmm register */
-  /* result: the 8 input rows of P in xmm8 - xmm15 */
-  Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0);
-
-  /* compute the permutation P */
-  /* result: the output of P(CV) in xmm8 - xmm15 */
-  ROUNDS_P_Q();
-
-  /* unpack again to get two rows of P in one xmm register */
-  /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
-  Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
-
-  /* xor CV to P output (feed-forward) */
-  /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
-  xmm8 = _mm_xor_si128(xmm8,  (chaining[0]));
-  xmm10 = _mm_xor_si128(xmm10, (chaining[1]));
-  xmm12 = _mm_xor_si128(xmm12, (chaining[2]));
-  xmm14 = _mm_xor_si128(xmm14, (chaining[3]));
-
-  /* transform state back from row ordering into column ordering */
-  /* result: final hash value in xmm9, xmm11 */
-  Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0);
-  VPERM_Transform(xmm9, xmm11, VPERM_OPT, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm7);
-
-  /* we only need to return the truncated half of the state */
-  chaining[2] = xmm9;
-  chaining[3] = xmm11;
-
-  return;
-}//OF512()
-
-
-
--- a/algo/groestl/aes_ni/hash-groestl.c
+++ b/algo/groestl/aes_ni/hash-groestl.c
@@ -14,50 +14,15 @@
 #include "miner.h"
 #include "simd-utils.h"

-#ifndef NO_AES_NI
+#ifdef __AES__

-#include "groestl-version.h"
-
-#ifdef TASM
-  #ifdef VAES
-    #include "groestl-asm-aes.h"
-  #else
-    #ifdef VAVX
-      #include "groestl-asm-avx.h"
-    #else
-      #ifdef VVPERM
-        #include "groestl-asm-vperm.h"
-      #else
-        #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
-      #endif
-    #endif
-  #endif
-#else
-  #ifdef TINTR
-    #ifdef VAES
-      #include "groestl-intr-aes.h"
-    #else
-      #ifdef VAVX
-        #include "groestl-intr-avx.h"
-      #else
-        #ifdef VVPERM
-          #include "groestl-intr-vperm.h"
-        #else
-          #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
-        #endif
-      #endif
-    #endif
-  #else
-    #error NO TYPE SPECIFIED (-DT[ASM/INTR])
-  #endif
-#endif
+#include "groestl-intr-aes.h"

 HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen )
 {
  int i;

  ctx->hashlen = hashlen;
-  SET_CONSTANTS();

  if (ctx->chaining == NULL || ctx->buffer == NULL)
    return FAIL_GR;
@@ -70,8 +35,6 @@ HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen )

  // The only non-zero in the IV is len. It can be hard coded.
  ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
-//  ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
-//  INIT(ctx->chaining);

  ctx->buf_ptr = 0;
  ctx->rem_ptr = 0;
@@ -91,8 +54,7 @@ HashReturn_gr reinit_groestl( hashState_groestl* ctx )
     ctx->chaining[i] = _mm_setzero_si128();
     ctx->buffer[i]   = _mm_setzero_si128();
  }
-  ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
-  INIT(ctx->chaining);
+  ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
  ctx->buf_ptr = 0;
  ctx->rem_ptr = 0;

@@ -108,7 +70,7 @@ HashReturn_gr reinit_groestl( hashState_groestl* ctx )
 // 5. Midstate will work at reduced impact than full hash, if total hash
 //    (midstate + tail) is less than 1 block.
 //    This, unfortunately, is the case with all current users.
-// 6. the morefull blocks the bigger the gain
+// 6. the more full blocks the bigger the gain

 // use only for midstate precalc
 HashReturn_gr update_groestl( hashState_groestl* ctx, const void* input,
@@ -142,12 +104,11 @@ HashReturn_gr update_groestl( hashState_groestl* ctx, const void* input,
 // deprecated do not use
 HashReturn_gr final_groestl( hashState_groestl* ctx, void* output )
 {
-   const int len = (int)ctx->databitlen / 128;  // bits to __m128i 
-   const int blocks = ctx->blk_count + 1;       // adjust for final block
-
-   const int rem_ptr = ctx->rem_ptr;      // end of data start of padding
-   const int hashlen_m128i = ctx->hashlen / 16;  // bytes to __m128i
-   const int hash_offset = SIZE512 - hashlen_m128i;  // where in buffer
+   const int len = (int)ctx->databitlen / 128; // bits to __m128i 
+   const uint64_t blocks = ctx->blk_count + 1; // adjust for final block
+   const int rem_ptr = ctx->rem_ptr;           // end of data start of padding
+   const int hashlen_m128i = ctx->hashlen / 16;     // bytes to __m128i
+   const int hash_offset = SIZE512 - hashlen_m128i; // where in buffer
   int i;

   // first pad byte = 0x80, last pad byte = block count
@@ -156,21 +117,18 @@ HashReturn_gr final_groestl( hashState_groestl* ctx, void* output )
   if ( rem_ptr == len - 1 )
   {
       // only 128 bits left in buffer, all padding at once
-       ctx->buffer[rem_ptr] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
-                                                  0,0,0,0, 0,0,0,0x80 );
+      ctx->buffer[rem_ptr] = _mm_set_epi64x( blocks << 56, 0x80 );
   }
   else
   {
       // add first padding
-       ctx->buffer[rem_ptr] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
-                                            0,0,0,0, 0,0,0,0x80 );
+       ctx->buffer[rem_ptr] = m128_const_64( 0, 0x80 );
       // add zero padding
       for ( i = rem_ptr + 1; i < SIZE512 - 1; i++ )
           ctx->buffer[i] = _mm_setzero_si128();

       // add length padding, second last byte is zero unless blocks > 255
-       ctx->buffer[i] = _mm_set_epi8( blocks, blocks>>8, 0,0, 0,0,0,0,
-                                           0,         0 ,0,0, 0,0,0,0 );
+       ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0 );
   }

   // digest final padding block and do output transform
@@ -184,6 +142,75 @@ HashReturn_gr final_groestl( hashState_groestl* ctx, void* output )
   return SUCCESS_GR;
 }

+// One-shot Groestl-512: reset ctx, hash databitlen bits of input, write the
+// 64 byte digest to output and return 0. Input is consumed in whole __m128i
+// words, bits beyond a multiple of 128 are ignored.
+int groestl512_full( hashState_groestl* ctx, void* output,
+                                const void* input, uint64_t databitlen )
+{
+   int i;
+   ctx->hashlen = 64;
+   // --- init ---
+   for ( i = 0; i < SIZE512; i++ )
+   {
+      ctx->chaining[i] = _mm_setzero_si128();
+      ctx->buffer[i]   = _mm_setzero_si128();
+   }
+   ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
+   ctx->buf_ptr = 0;
+   ctx->rem_ptr = 0;
+
+   // --- update ---
+   // divide before narrowing so large bit lengths are not truncated first
+   const int len = (int)( databitlen / 128 );     // bits to __m128i
+   const int hashlen_m128i = ctx->hashlen / 16;   // bytes to __m128i
+   const int hash_offset = SIZE512 - hashlen_m128i;
+   int rem = ctx->rem_ptr;
+   uint64_t blocks = len / SIZE512;
+   __m128i* in = (__m128i*)input;
+
+   // digest any full blocks, process directly from input
+   for ( i = 0; i < blocks; i++ )
+      TF1024( ctx->chaining, &in[ i * SIZE512 ] );
+   ctx->buf_ptr = blocks * SIZE512;
+
+   // copy any remaining data to buffer, rem is 0 since rem_ptr was just reset
+   for ( i = 0; i < len % SIZE512; i++ )
+       ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
+   i += rem;    // use i as rem_ptr in final
+
+   //--- final ---
+   blocks++;      // adjust for final block
+
+   // i indexes the first free word of the buffer. Compare it with the last
+   // buffer slot, not with the input length, otherwise a remainder of
+   // SIZE512 - 1 words takes the else branch and writes buffer[SIZE512].
+   if ( i == SIZE512 - 1 )
+   {
+       // only 128 bits left in buffer, all padding at once
+      ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0x80 );
+   }
+   else
+   {
+       // add first padding
+       ctx->buffer[i] = m128_const_64( 0, 0x80 );
+       // add zero padding
+       for ( i += 1; i < SIZE512 - 1; i++ )
+           ctx->buffer[i] = _mm_setzero_si128();
+       // add length padding, only the low 8 bits of blocks are stored
+       ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0 );
+   }
+
+   // digest final padding block and do output transform
+   TF1024( ctx->chaining, ctx->buffer );
+   OF1024( ctx->chaining );
+   // store hash result in output
+   for ( i = 0; i < hashlen_m128i; i++ )
+      casti_m128i( output, i ) = ctx->chaining[ hash_offset + i ];
+   return 0;
+}
+   
+
 HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output,
                                const void* input, DataLength_gr databitlen )
 {
@@ -191,7 +218,7 @@ HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output,
   const int hashlen_m128i = ctx->hashlen / 16;   // bytes to __m128i
   const int hash_offset = SIZE512 - hashlen_m128i;
   int rem = ctx->rem_ptr;
-   int blocks = len / SIZE512;
+   uint64_t blocks = len / SIZE512;
   __m128i* in = (__m128i*)input;
   int i;

@@ -215,26 +242,22 @@ HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output,
   if ( i == len -1 )
   {        
       // only 128 bits left in buffer, all padding at once
-       ctx->buffer[i] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
-                                           0,0,0,0, 0,0,0,0x80 );
+      ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0x80 );
   }   
   else
   {
       // add first padding
-       ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0, 
-                                      0,0,0,0, 0,0,0,0x80 );
+       ctx->buffer[i] = m128_const_64( 0, 0x80 );
       // add zero padding
       for ( i += 1; i < SIZE512 - 1; i++ )
           ctx->buffer[i] = _mm_setzero_si128();

       // add length padding, second last byte is zero unless blocks > 255
-       ctx->buffer[i] = _mm_set_epi8( blocks, blocks>>8, 0,0, 0,0,0,0, 
-                                           0,         0 ,0,0, 0,0,0,0 );
+       ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0 );
   }

   // digest final padding block and do output transform
   TF1024( ctx->chaining, ctx->buffer );
-
   OF1024( ctx->chaining );

   // store hash result in output 
--- a/algo/groestl/aes_ni/hash-groestl.h
+++ b/algo/groestl/aes_ni/hash-groestl.h
@@ -87,5 +87,6 @@ HashReturn_gr final_groestl( hashState_groestl*, void* );

 HashReturn_gr update_and_final_groestl( hashState_groestl*,  void*,
                                        const void*, DataLength_gr );
+int groestl512_full( hashState_groestl*,  void*, const void*, uint64_t );

 #endif /* __hash_h */
--- a/algo/groestl/aes_ni/hash-groestl256.c
+++ b/algo/groestl/aes_ni/hash-groestl256.c
@@ -11,43 +11,9 @@
 #include "miner.h"
 #include "simd-utils.h"

-#ifndef NO_AES_NI
+#ifdef __AES__

-#include "groestl-version.h"
-
-#ifdef TASM
-  #ifdef VAES
-    #include "groestl256-asm-aes.h"
-  #else
-    #ifdef VAVX
-      #include "groestl256-asm-avx.h"
-    #else
-      #ifdef VVPERM
-        #include "groestl256-asm-vperm.h"
-      #else
-        #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
-      #endif
-    #endif
-  #endif
-#else
-  #ifdef TINTR
-    #ifdef VAES
-      #include "groestl256-intr-aes.h"
-    #else
-      #ifdef VAVX
-        #include "groestl256-intr-avx.h"
-      #else
-        #ifdef VVPERM
-          #include "groestl256-intr-vperm.h"
-        #else
-          #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
-        #endif
-      #endif
-    #endif
-  #else
-    #error NO TYPE SPECIFIED (-DT[ASM/INTR])
-  #endif
-#endif
+#include "groestl256-intr-aes.h"

 /* initialise context */
 HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen )
@@ -55,7 +21,6 @@ HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen )
  int i;

  ctx->hashlen = hashlen;
-  SET_CONSTANTS();

  if (ctx->chaining == NULL || ctx->buffer == NULL)
    return FAIL_GR;
@@ -86,8 +51,11 @@ HashReturn_gr reinit_groestl256(hashState_groestl256* ctx)
     ctx->chaining[i] = _mm_setzero_si128();
     ctx->buffer[i]   = _mm_setzero_si128();
  }
-  ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
-  INIT256(ctx->chaining);
+
+  ctx->chaining[ 3 ] = m128_const_64( 0, 0x0100000000000000 );
+
+//  ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
+//  INIT256(ctx->chaining);
  ctx->buf_ptr = 0;
  ctx->rem_ptr = 0;

@@ -246,6 +214,98 @@ HashReturn_gr update_and_final_groestl256( hashState_groestl256* ctx,
   return SUCCESS_GR;
 }

+// One-shot Groestl-256: reset ctx, hash databitlen bits of input, write the
+// 32 byte digest to output and return SUCCESS_GR. A length that is not a
+// multiple of 128 bits is handled as one trailing 64 bit word (cryptonight).
+int groestl256_full( hashState_groestl256* ctx,
+                   void* output, const void* input, DataLength_gr databitlen )
+{
+   int i;
+   ctx->hashlen = 32;
+   for ( i = 0; i < SIZE256; i++ )
+   {
+      ctx->chaining[i] = _mm_setzero_si128();
+      ctx->buffer[i]   = _mm_setzero_si128();
+   }
+   ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
+   INIT256( ctx->chaining );
+   ctx->buf_ptr = 0;
+   ctx->rem_ptr = 0;
+
+   // divide before narrowing so large bit lengths are not truncated first
+   const int len = (int)( databitlen / 128 );     // bits to __m128i
+   const int hashlen_m128i = ctx->hashlen / 16;   // bytes to __m128i
+   const int hash_offset = SIZE256 - hashlen_m128i;
+   int rem = ctx->rem_ptr;
+   int blocks = len / SIZE256;
+   __m128i* in = (__m128i*)input;
+
+   // --- update ---
+   // digest any full blocks, process directly from input
+   for ( i = 0; i < blocks; i++ )
+      TF512( ctx->chaining, &in[ i * SIZE256 ] );
+   ctx->buf_ptr = blocks * SIZE256;
+
+   // cryptonight has 200 byte input, the remainder is only 8 bytes, ie u64.
+   if ( databitlen % 128 != 0 )
+   {
+      // must be cryptonight, copy 64 bits of data
+      *(uint64_t*)(ctx->buffer) = *(uint64_t*)(&in[ ctx->buf_ptr ] );
+      i = -1; // signal for odd length
+   }
+   else
+   {
+      // Copy any remaining data to buffer for final transform
+      for ( i = 0; i < len % SIZE256; i++ )
+          ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
+      i += rem;   // use i as rem_ptr in final
+   }
+
+   //--- final ---
+   blocks++;      // adjust for final block
+
+   // i is the first free buffer word, or -1 for odd length. Test it against
+   // the last buffer slot, not the input length: the latter sent a remainder
+   // of SIZE256 - 1 words to the else branch, which writes buffer[SIZE256].
+   if ( i == SIZE256 - 1 )
+   {
+       // all padding at once
+       ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0,
+                                           0,        0,0,0, 0,0,0,0x80 );
+   }
+   else
+   {
+      if ( i == -1 )
+      {
+         // cryptonight odd length
+         ((uint64_t*)ctx->buffer)[ 1 ] = 0x80ull;
+         // finish the block with zero and length padding as normal
+         i = 0;
+      }
+      else
+      {
+         // add first padding
+         ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
+                                        0,0,0,0, 0,0,0,0x80 );
+      }
+      // add zero padding
+      for ( i += 1; i < SIZE256 - 1; i++ )
+          ctx->buffer[i] = _mm_setzero_si128();
+      // add length padding, the block count is assumed to fit in two bytes
+      ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0,
+                                          0,        0,0,0, 0,0,0,0 );
+   }
+
+   // digest final padding block and do output transform
+   TF512( ctx->chaining, ctx->buffer );
+   OF512( ctx->chaining );
+   // store hash result in output
+   for ( i = 0; i < hashlen_m128i; i++ )
+      casti_m128i( output, i ) = ctx->chaining[ hash_offset + i ];
+   return SUCCESS_GR;
+}
+
+
 /* hash bit sequence */
 HashReturn_gr hash_groestl256(int hashbitlen,
                const BitSequence_gr* data,
--- a/algo/groestl/aes_ni/hash-groestl256.h
+++ b/algo/groestl/aes_ni/hash-groestl256.h
@@ -93,9 +93,6 @@ typedef enum
 typedef struct {
  __attribute__ ((aligned (32))) __m128i chaining[SIZE256];
  __attribute__ ((aligned (32))) __m128i buffer[SIZE256];
-//  __attribute__ ((aligned (32))) u64 chaining[SIZE/8];      /* actual state */
-//  __attribute__ ((aligned (32))) BitSequence_gr buffer[SIZE];  /* data buffer */
-//  u64 block_counter;        /* message block counter */
  int hashlen;              // bytes
  int blk_count;
  int buf_ptr;              /* data buffer pointer */
@@ -118,4 +115,7 @@ HashReturn_gr hash_groestli256( int, const BitSequence_gr*, DataLength_gr,
 HashReturn_gr update_and_final_groestl256( hashState_groestl256*, void*,
                                           const void*, DataLength_gr );

+int groestl256_full( hashState_groestl256* ctx,
+                   void* output, const void* input, DataLength_gr databitlen );
+
 #endif /* __hash_h */
--- a/algo/groestl/groestl-4way.c
+++ b/algo/groestl/groestl-4way.c
@@ -49,11 +49,11 @@ int scanhash_groestl_4way( struct work *work, uint32_t max_nonce,
        pdata[19] = n;

        for ( int lane = 0; lane < 4; lane++ )
-        if ( ( hash+(lane<<3) )[7] < Htarg )
+        if ( ( hash+(lane<<3) )[7] <= Htarg )
        if ( fulltest( hash+(lane<<3), ptarget) && !opt_benchmark )
        {
           pdata[19] = n + lane;
-           submit_lane_solution( work, hash+(lane<<3), mythr, lane );
+           submit_solution( work, hash+(lane<<3), mythr );
        }
        n += 4;
     } while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
--- a/algo/groestl/groestl.c
+++ b/algo/groestl/groestl.c
@@ -1,21 +1,23 @@
 #include "groestl-gate.h"
+
+#if !defined(GROESTL_8WAY) && !defined(GROESTLX16R_4WAY)
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
-
-#ifdef NO_AES_NI
-  #include "sph_groestl.h"
-#else
+#ifdef __AES__
  #include "algo/groestl/aes_ni/hash-groestl.h"
+#else
+  #include "sph_groestl.h"
 #endif

 typedef struct
 {
-#ifdef NO_AES_NI
-    sph_groestl512_context groestl1, groestl2;
-#else
+#ifdef __AES__
    hashState_groestl groestl1, groestl2;
+#else
+    sph_groestl512_context groestl1, groestl2;
 #endif

 } groestl_ctx_holder;
@@ -24,12 +26,12 @@ static groestl_ctx_holder groestl_ctx;

 void init_groestl_ctx()
 {
-#ifdef NO_AES_NI
-    sph_groestl512_init( &groestl_ctx.groestl1 );
-    sph_groestl512_init( &groestl_ctx.groestl2 );
-#else
+#ifdef __AES__
    init_groestl( &groestl_ctx.groestl1, 64 );
    init_groestl( &groestl_ctx.groestl2, 64 );
+#else
+    sph_groestl512_init( &groestl_ctx.groestl1 );
+    sph_groestl512_init( &groestl_ctx.groestl2 );
 #endif
 }

@@ -39,18 +41,18 @@ void groestlhash( void *output, const void *input )
     groestl_ctx_holder ctx __attribute__ ((aligned (64)));
     memcpy( &ctx, &groestl_ctx, sizeof(groestl_ctx) );

-#ifdef NO_AES_NI
-     sph_groestl512(&ctx.groestl1, input, 80);
-     sph_groestl512_close(&ctx.groestl1, hash);
-
-     sph_groestl512(&ctx.groestl2, hash, 64);
-     sph_groestl512_close(&ctx.groestl2, hash);
-#else
+#ifdef __AES__
     update_and_final_groestl( &ctx.groestl1, (char*)hash,
                               (const char*)input, 640 );

     update_and_final_groestl( &ctx.groestl2, (char*)hash,
                               (const char*)hash, 512 );
+#else
+     sph_groestl512(&ctx.groestl1, input, 80);
+     sph_groestl512_close(&ctx.groestl1, hash);
+
+     sph_groestl512(&ctx.groestl2, hash, 64);
+     sph_groestl512_close(&ctx.groestl2, hash);
 #endif
     memcpy(output, hash, 32);
 }
@@ -89,4 +91,4 @@ int scanhash_groestl( struct work *work, uint32_t max_nonce,
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
 }
-
+#endif
--- a/algo/groestl/groestl256-hash-4way.c
+++ b/algo/groestl/groestl256-hash-4way.c
@@ -1,4 +1,5 @@
 /* hash.c     Aug 2011
+ * groestl256-hash-4way https://github.com/JayDDee/cpuminer-opt  2019-12.
 *
 * Groestl implementation for different versions.
 * Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
@@ -6,275 +7,170 @@
 * This code is placed in the public domain
 */

+// Optimized for hash and data lengths that are integral multiples of __m128i
+
+
 #include <memory.h>
-#include "hash-groestl256.h"
+#include "groestl256-intr-4way.h"
 #include "miner.h"
 #include "simd-utils.h"

-#ifndef NO_AES_NI
+#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

-#include "groestl-version.h"

-#ifdef TASM
-  #ifdef VAES
-    #include "groestl256-asm-aes.h"
-  #else
-    #ifdef VAVX
-      #include "groestl256-asm-avx.h"
-    #else
-      #ifdef VVPERM
-        #include "groestl256-asm-vperm.h"
-      #else
-        #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
-      #endif
-    #endif
-  #endif
-#else
-  #ifdef TINTR
-    #ifdef VAES
-      #include "groestl256-intr-aes.h"
-    #else
-      #ifdef VAVX
-        #include "groestl256-intr-avx.h"
-      #else
-        #ifdef VVPERM
-          #include "groestl256-intr-vperm.h"
-        #else
-          #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
-        #endif
-      #endif
-    #endif
-  #else
-    #error NO TYPE SPECIFIED (-DT[ASM/INTR])
-  #endif
-#endif
-
-/* initialise context */
-HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen )
+int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )
 {
  int i;

  ctx->hashlen = hashlen;
-  SET_CONSTANTS();

  if (ctx->chaining == NULL || ctx->buffer == NULL)
-    return FAIL_GR;
+    return 1;

  for ( i = 0; i < SIZE256; i++ )
  {
-     ctx->chaining[i] = _mm_setzero_si128();
-     ctx->buffer[i]   = _mm_setzero_si128();
+     ctx->chaining[i] = m512_zero;
+     ctx->buffer[i]   = m512_zero;
  }
-  ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
-  INIT256( ctx->chaining );
+
+  // The only non-zero in the IV is len. It can be hard coded.
+  ctx->chaining[ 3 ] = m512_const2_64( 0, 0x0100000000000000 );
+
  ctx->buf_ptr = 0;
  ctx->rem_ptr = 0;

-  return SUCCESS_GR;
+  return 0;
 }

-
-HashReturn_gr reinit_groestl256(hashState_groestl256* ctx)
- {
-  int i;
-
-  if (ctx->chaining == NULL || ctx->buffer == NULL)
-    return FAIL_GR;
-
-  for ( i = 0; i < SIZE256; i++ )
-  {
-     ctx->chaining[i] = _mm_setzero_si128();
-     ctx->buffer[i]   = _mm_setzero_si128();
-  }
-  ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
-  INIT256(ctx->chaining);
-  ctx->buf_ptr = 0;
-  ctx->rem_ptr = 0;
-
-  return SUCCESS_GR;
-}
-
-// Use this only for midstate and never for cryptonight
-HashReturn_gr update_groestl256( hashState_groestl256* ctx, const void* input,
-                                 DataLength_gr databitlen )
+int groestl256_4way_full( groestl256_4way_context* ctx, void* output,
+                                const void* input, uint64_t databitlen )
 {
-   __m128i* in = (__m128i*)input;
-   const int len = (int)databitlen / 128;  // bits to __m128i
-   const int blocks = len / SIZE256;    // __M128i to blocks
+   const int len = (int)databitlen / 128;
+   const int hashlen_m128i = 32 / 16;   // bytes to __m128i
+   const int hash_offset = SIZE256 - hashlen_m128i;
   int rem = ctx->rem_ptr;
+   int blocks = len / SIZE256;
+   __m512i* in = (__m512i*)input;
   int i;

-   ctx->blk_count = blocks;
-   ctx->databitlen = databitlen;
+  if (ctx->chaining == NULL || ctx->buffer == NULL)
+    return 1;

-   // digest any full blocks 
+  for ( i = 0; i < SIZE256; i++ )
+  {
+     ctx->chaining[i] = m512_zero;
+     ctx->buffer[i]   = m512_zero;
+  }
+
+  // The only non-zero in the IV is len. It can be hard coded.
+  ctx->chaining[ 3 ] = m512_const2_64( 0, 0x0100000000000000 );
+  ctx->buf_ptr = 0;
+  ctx->rem_ptr = 0;
+   
+   // --- update ---
+
+   // digest any full blocks, process directly from input 
   for ( i = 0; i < blocks; i++ )
-       TF512( ctx->chaining, &in[ i * SIZE256 ] );
-   // adjust buf_ptr to last block
+      TF512_4way( ctx->chaining, &in[ i * SIZE256 ] );
   ctx->buf_ptr = blocks * SIZE256;

-   // Copy any remainder to buffer
+   // copy any remaining data to buffer, it may already contain data
+   // from a previous update for a midstate precalc
   for ( i = 0; i < len % SIZE256; i++ )
       ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
-   // adjust rem_ptr for new data
-   ctx->rem_ptr += i;
+   i += rem;    // use i as rem_ptr in final

-   return SUCCESS_GR;
-}
+   //--- final ---

-// don't use this at all
-HashReturn_gr final_groestl256( hashState_groestl256* ctx, void* output )
-{
-   const int len = (int)ctx->databitlen / 128;  // bits to __m128i 
-   const int blocks = ctx->blk_count + 1;       // adjust for final block
-   const int rem_ptr = ctx->rem_ptr;      // end of data start of padding
-   const int hashlen_m128i = ctx->hashlen / 16;  // bytes to __m128i
-   const int hash_offset = SIZE256 - hashlen_m128i;  // where in buffer
-   int i;
+   blocks++;      // adjust for final block

-   // first pad byte = 0x80, last pad byte = block count
-   // everything in between is zero
-
-   if ( rem_ptr == len - 1 )
-   {
-       // all padding at once
-       ctx->buffer[rem_ptr] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
-                                                  0,0,0,0, 0,0,0,0x80 );
-   }
+   if ( i == SIZE256 - 1 )
+   {        
+       // only 1 vector left in buffer, all padding at once
+      ctx->buffer[i] = m512_const2_64( (uint64_t)blocks << 56, 0x80 ); 
+   }   
   else
   {
       // add first padding
-       ctx->buffer[rem_ptr] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
-                                            0,0,0,0, 0,0,0,0x80 );
+       ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 );
       // add zero padding
-       for ( i = rem_ptr + 1; i < SIZE256 - 1; i++ )
-           ctx->buffer[i] = _mm_setzero_si128();
-       // add length padding
-       // cheat since we know the block count is trivial, good if block < 256
-       ctx->buffer[i] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
-                                           0,0,0,0, 0,0,0,0 );
+       for ( i += 1; i < SIZE256 - 1; i++ )
+           ctx->buffer[i] = m512_zero;
+
+       // add length padding, second last byte is zero unless blocks > 255
+      ctx->buffer[i] = m512_const2_64( (uint64_t)blocks << 56, 0 );
   }

-   // digest final padding block and do output transform
-   TF512( ctx->chaining, ctx->buffer );
-   OF512( ctx->chaining );
+// digest final padding block and do output transform
+   TF512_4way( ctx->chaining, ctx->buffer );
+
+   OF512_4way( ctx->chaining );

   // store hash result in output 
   for ( i = 0; i < hashlen_m128i; i++ )
-      casti_m128i( output, i ) = ctx->chaining[ hash_offset + i];
+      casti_m512i( output, i ) = ctx->chaining[ hash_offset + i ];

-   return SUCCESS_GR;
+   return 0;
 }

-HashReturn_gr update_and_final_groestl256( hashState_groestl256* ctx,
-                   void* output, const void* input, DataLength_gr databitlen )
+int groestl256_4way_update_close( groestl256_4way_context* ctx, void* output,
+                                const void* input, uint64_t databitlen )
 {
   const int len = (int)databitlen / 128;
   const int hashlen_m128i = ctx->hashlen / 16;   // bytes to __m128i
   const int hash_offset = SIZE256 - hashlen_m128i;
   int rem = ctx->rem_ptr;
   int blocks = len / SIZE256;
-   __m128i* in = (__m128i*)input;
+   __m512i* in = (__m512i*)input;
   int i;

   // --- update ---

   // digest any full blocks, process directly from input 
   for ( i = 0; i < blocks; i++ )
-      TF512( ctx->chaining, &in[ i * SIZE256 ] );
+      TF512_4way( ctx->chaining, &in[ i * SIZE256 ] );
   ctx->buf_ptr = blocks * SIZE256;

-   // cryptonight has 200 byte input, an odd number of __m128i
-   // remainder is only 8 bytes, ie u64.
-   if ( databitlen % 128 !=0 )
-   {
-      // must be cryptonight, copy 64 bits of data
-      *(uint64_t*)(ctx->buffer) = *(uint64_t*)(&in[ ctx->buf_ptr ] );
-      i = -1; // signal for odd length
-   }
-   else   
-   { 
-      // Copy any remaining data to buffer for final transform
-      for ( i = 0; i < len % SIZE256; i++ )
-          ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
-      i += rem;   // use i as rem_ptr in final
-   }
+   // copy any remaining data to buffer, it may already contain data
+   // from a previous update for a midstate precalc
+   for ( i = 0; i < len % SIZE256; i++ )
+       ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
+   i += rem;    // use i as rem_ptr in final

   //--- final ---

-   // adjust for final block
-   blocks++;
+   blocks++;      // adjust for final block

-   if ( i == len - 1 )
+   if ( i == SIZE256 - 1 )
   {
-       // all padding at once
-       ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0,
-                                           0,        0,0,0, 0,0,0,0x80 );
+       // only 1 vector left in buffer, all padding at once
+       ctx->buffer[i] = m512_const1_128( _mm_set_epi8(
+                      blocks, blocks>>8,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) );
   }
   else
   {
-      if ( i == -1 )
-      {
-         // cryptonight odd length
-         ((uint64_t*)ctx->buffer)[ 1 ] = 0x80ull;
-         // finish the block with zero and length padding as normal
-         i = 0;
-       }
-       else
-       {
-          // add first padding
-          ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
-                                         0,0,0,0, 0,0,0,0x80 );
-       }
+       // add first padding
+       ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 );
       // add zero padding
       for ( i += 1; i < SIZE256 - 1; i++ )
-           ctx->buffer[i] = _mm_setzero_si128();
-       // add length padding
-       // cheat since we know the block count is trivial, good if block < 256
-       ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0,
-                                           0,        0,0,0, 0,0,0,0 );
+           ctx->buffer[i] = m512_zero;
+
+       // add length padding, second last byte is zero unless blocks > 255
+       ctx->buffer[i] = m512_const1_128( _mm_set_epi8(
+                   blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) );
   }

-   // digest final padding block and do output transform
-   TF512( ctx->chaining, ctx->buffer );
-   OF512( ctx->chaining );
+// digest final padding block and do output transform
+   TF512_4way( ctx->chaining, ctx->buffer );
+
+   OF512_4way( ctx->chaining );

   // store hash result in output 
   for ( i = 0; i < hashlen_m128i; i++ )
-      casti_m128i( output, i ) = ctx->chaining[ hash_offset + i ];
+      casti_m512i( output, i ) = ctx->chaining[ hash_offset + i ];

-   return SUCCESS_GR;
+   return 0;
 }

-/* hash bit sequence */
-HashReturn_gr hash_groestl256(int hashbitlen,
-                const BitSequence_gr* data,
-                DataLength_gr databitlen,
-                BitSequence_gr* hashval) {
-  HashReturn_gr ret;
-  hashState_groestl256 context;
+#endif   // VAES

-  /* initialise */
-  if ((ret = init_groestl256(&context, hashbitlen/8)) != SUCCESS_GR)
-    return ret;
-
-  /* process message */
-  if ((ret = update_groestl256(&context, data, databitlen)) != SUCCESS_GR)
-    return ret;
-
-  /* finalise */
-  ret = final_groestl256(&context, hashval);
-
-  return ret;
-}
-
-/* eBash API */
-//#ifdef crypto_hash_BYTES
-//int crypto_hash(unsigned char *out, const unsigned char *in, unsigned long long inlen)
-//{
-//  if (hash_groestl(crypto_hash_BYTES * 8, in, inlen * 8,out) == SUCCESS_GR) return 0;
-//  return -1;
-//}
-//#endif
-
-#endif
--- a/algo/groestl/groestl256-hash-4way.h
+++ b/algo/groestl/groestl256-hash-4way.h
@@ -6,56 +6,39 @@
 * This code is placed in the public domain
 */

-#ifndef __hash_h
-#define __hash_h
+#if !defined(GROESTL256_HASH_4WAY_H__)
+#define GROESTL256_HASH_4WAY_H__ 1

+#include "simd-utils.h"
 #include <immintrin.h>
+#include <stdint.h>
 #include <stdio.h>
 #if defined(_WIN64) || defined(__WINDOWS__)
 #include <windows.h>
 #endif
 #include <stdlib.h>

-/* eBash API begin */
-/*
-#include "crypto_hash.h"
-#ifdef crypto_hash_BYTES
-
-#include <crypto_uint8.h>
-#include <crypto_uint32.h>
-#include <crypto_uint64.h>
-typedef crypto_uint8 u8;
-typedef crypto_uint32 u32;
-typedef crypto_uint64 u64;
-#endif
- */
-/* eBash API end */
-
-//#define LENGTH (512)
-
-#include "brg_endian.h"
-#define NEED_UINT_64T
-#include "algo/sha/brg_types.h"
-
-#ifdef IACA_TRACE
-  #include IACA_MARKS
-#endif
-
+#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+   
 #define LENGTH (256)

+//#include "brg_endian.h"
+//#define NEED_UINT_64T
+//#include "algo/sha/brg_types.h"
+
 /* some sizes (number of bytes) */
 #define ROWS (8)
 #define LENGTHFIELDLEN (ROWS)
 #define COLS512 (8)
 //#define COLS1024 (16)
 #define SIZE_512 ((ROWS)*(COLS512))
-//#define SIZE1024 ((ROWS)*(COLS1024))
+//#define SIZE_1024 ((ROWS)*(COLS1024))
 #define ROUNDS512 (10)
 //#define ROUNDS1024 (14)

 //#if LENGTH<=256
 #define COLS (COLS512)
-//#define SIZE (SIZE512)
+#define SIZE (SIZE_512)
 #define ROUNDS (ROUNDS512)
 //#else
 //#define COLS (COLS1024)
@@ -63,59 +46,33 @@ typedef crypto_uint64 u64;
 //#define ROUNDS (ROUNDS1024)
 //#endif

-#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
-
-#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
-#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))
-#define U64BIG(a) (a)
-#endif /* IS_BIG_ENDIAN */
-
-#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
-#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*n)))
-#define U64BIG(a) \
-  ((ROTL64(a, 8) & li_64(000000FF000000FF)) | \
-   (ROTL64(a,24) & li_64(0000FF000000FF00)) | \
-   (ROTL64(a,40) & li_64(00FF000000FF0000)) | \
-   (ROTL64(a,56) & li_64(FF000000FF000000)))
-#endif /* IS_LITTLE_ENDIAN */
-
-typedef unsigned char BitSequence_gr;
-typedef unsigned long long DataLength_gr;
-typedef enum
-{
-    SUCCESS_GR = 0,
-    FAIL_GR = 1,
-    BAD_HASHBITLEN_GR = 2
-} HashReturn_gr;
-
 #define SIZE256 (SIZE_512/16)

 typedef struct {
-  __attribute__ ((aligned (32))) __m128i chaining[SIZE256];
-  __attribute__ ((aligned (32))) __m128i buffer[SIZE256];
-//  __attribute__ ((aligned (32))) u64 chaining[SIZE/8];      /* actual state */
-//  __attribute__ ((aligned (32))) BitSequence_gr buffer[SIZE];  /* data buffer */
-//  u64 block_counter;        /* message block counter */
-  int hashlen;              // bytes
-  int blk_count;
-  int buf_ptr;              /* data buffer pointer */
+  __attribute__ ((aligned (128))) __m512i chaining[SIZE256];
+  __attribute__ ((aligned (64))) __m512i buffer[SIZE256];
+  int hashlen;       // byte
+  int blk_count;     // SIZE_m128i
+  int buf_ptr;       // __m128i offset
  int rem_ptr;
-  int databitlen;
-} hashState_groestl256;
+  int databitlen;    // bits
+} groestl256_4way_context;

-HashReturn_gr init_groestl256( hashState_groestl256*, int );

-HashReturn_gr reinit_groestl256( hashState_groestl256* );
+int groestl256_4way_init( groestl256_4way_context*, uint64_t );

-HashReturn_gr update_groestl256( hashState_groestl256*, const void*,
-                              DataLength_gr );
+//int reinit_groestl( hashState_groestl* );

-HashReturn_gr final_groestl256( hashState_groestl256*, void* );
+//int groestl512_4way_update( groestl256_4way_context*, const void*,
+//                              uint64_t );

-HashReturn_gr hash_groestli256( int, const BitSequence_gr*, DataLength_gr,
-                            BitSequence_gr* );
+//int groestl512_4way_close( groestl512_4way_context*, void* );

-HashReturn_gr update_and_final_groestl256( hashState_groestl256*, void*,
-                                           const void*, DataLength_gr );
+int groestl256_4way_update_close( groestl256_4way_context*,  void*,
+                                        const void*, uint64_t );

-#endif /* __hash_h */
+int groestl256_4way_full( groestl256_4way_context*, void*,
+                          const void*, uint64_t );
+
+#endif
+#endif 
--- a/algo/groestl/groestl256-intr-4way.h
+++ b/algo/groestl/groestl256-intr-4way.h
@@ -7,39 +7,100 @@
 * This code is placed in the public domain
 */

-#include <smmintrin.h>
-#include <wmmintrin.h>
-#include "hash-groestl256.h"

-/* global constants  */
-__m128i ROUND_CONST_Lx;
-__m128i ROUND_CONST_L0[ROUNDS512];
-__m128i ROUND_CONST_L7[ROUNDS512];
-//__m128i ROUND_CONST_P[ROUNDS1024];
-//__m128i ROUND_CONST_Q[ROUNDS1024];
-__m128i TRANSP_MASK;
-__m128i SUBSH_MASK[8];
-__m128i ALL_1B;
-__m128i ALL_FF;
+#if !defined(GROESTL256_INTR_4WAY_H__)
+#define GROESTL256_INTR_4WAY_H__ 1
+      
+#include "groestl256-hash-4way.h"

+#if defined(__VAES__)
+static const __m128i round_const_l0[] __attribute__ ((aligned (64))) =
+{
+   { 0x7060504030201000, 0xffffffffffffffff },
+   { 0x7161514131211101, 0xffffffffffffffff },
+   { 0x7262524232221202, 0xffffffffffffffff },
+   { 0x7363534333231303, 0xffffffffffffffff },
+   { 0x7464544434241404, 0xffffffffffffffff },
+   { 0x7565554535251505, 0xffffffffffffffff },
+   { 0x7666564636261606, 0xffffffffffffffff },
+   { 0x7767574737271707, 0xffffffffffffffff },
+   { 0x7868584838281808, 0xffffffffffffffff },
+   { 0x7969594939291909, 0xffffffffffffffff }
+};
+
+static const __m128i round_const_l7[] __attribute__ ((aligned (64))) =
+{
+   { 0x0000000000000000, 0x8f9fafbfcfdfefff },
+   { 0x0000000000000000, 0x8e9eaebecedeeefe },
+   { 0x0000000000000000, 0x8d9dadbdcdddedfd },
+   { 0x0000000000000000, 0x8c9cacbcccdcecfc },
+   { 0x0000000000000000, 0x8b9babbbcbdbebfb },
+   { 0x0000000000000000, 0x8a9aaabacadaeafa },
+   { 0x0000000000000000, 0x8999a9b9c9d9e9f9 },
+   { 0x0000000000000000, 0x8898a8b8c8d8e8f8 },
+   { 0x0000000000000000, 0x8797a7b7c7d7e7f7 },
+   { 0x0000000000000000, 0x8696a6b6c6d6e6f6 }
+};
+
+static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02,
+                                     0x1d1519111c141810, 0x1f171b131e161a12,
+                                     0x2d2529212c242820, 0x2f272b232e262a22,
+                                     0x3d3539313c343830, 0x3f373b333e363a32 };
+
+static const __m512i SUBSH_MASK0 = { 0x0c0f0104070b0e00, 0x03060a0d08020509,
+                                     0x1c1f1114171b1e10, 0x13161a1d18121519,
+                                     0x2c2f2124272b2e20, 0x23262a2d28222529,
+                                     0x3c3f3134373b3e30, 0x33363a3d38323539 };
+
+static const __m512i SUBSH_MASK1 = { 0x0e090205000d0801, 0x04070c0f0a03060b,
+                                     0x1e191215101d1811, 0x14171c1f1a13161b,
+                                     0x2e292225202d2821, 0x24272c2f2a23262b,
+                                     0x3e393235303d3831, 0x34373c3f3a33363b };
+
+static const __m512i SUBSH_MASK2 = { 0x080b0306010f0a02, 0x05000e090c04070d,
+                                     0x181b1316111f1a12, 0x15101e191c14171d,
+                                     0x282b2326212f2a22, 0x25202e292c24272d,
+                                     0x383b3336313f3a32, 0x35303e393c34373d };
+
+static const __m512i SUBSH_MASK3 = { 0x0a0d040702090c03, 0x0601080b0e05000f,
+                                     0x1a1d141712191c13, 0x1611181b1e15101f,
+                                     0x2a2d242722292c23, 0x2621282b2e25202f,
+                                     0x3a3d343732393c33, 0x3631383b3e35303f };
+
+static const __m512i SUBSH_MASK4 = { 0x0b0e0500030a0d04, 0x0702090c0f060108,
+                                     0x1b1e1510131a1d14, 0x1712191c1f161118,
+                                     0x2b2e2520232a2d24, 0x2722292c2f262128,
+                                     0x3b3e3530333a3d34, 0x3732393c3f363138 };
+
+static const __m512i SUBSH_MASK5 = { 0x0d080601040c0f05, 0x00030b0e0907020a,
+                                     0x1d181611141c1f15, 0x10131b1e1917121a,
+                                     0x2d282621242c2f25, 0x20232b2e2927222a,
+                                     0x3d383631343c3f35, 0x30333b3e3937323a };
+
+static const __m512i SUBSH_MASK6 = { 0x0f0a0702050e0906, 0x01040d080b00030c,
+                                     0x1f1a1712151e1916, 0x11141d181b10131c,
+                                     0x2f2a2722252e2926, 0x21242d282b20232c,
+                                     0x3f3a3732353e3936, 0x31343d383b30333c };
+
+static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
+                                     0x191c101316181b17, 0x12151f1a1d11141e,
+                                     0x292c202326282b27, 0x22252f2a2d21242e,
+                                     0x393c303336383b37, 0x32353f3a3d31343e };

 #define tos(a)    #a
 #define tostr(a)  tos(a)

-
 /* xmm[i] will be multiplied by 2
 * xmm[j] will be lost
 * xmm[k] has to be all 0x1b */
 #define MUL2(i, j, k){\
-  j = _mm_xor_si128(j, j);\
-  j = _mm_cmpgt_epi8(j, i);\
-  i = _mm_add_epi8(i, i);\
-  j = _mm_and_si128(j, k);\
-  i = _mm_xor_si128(i, j);\
+  j = _mm512_xor_si512(j, j);\
+  j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask(j, i) );\
+  i = _mm512_add_epi8(i, i);\
+  j = _mm512_and_si512(j, k);\
+  i = _mm512_xor_si512(i, j);\
 } 

- /**/
-
 /* Yet another implementation of MixBytes.
   This time we use the formulae (3) from the paper "Byte Slicing Groestl".
   Input: a0, ..., a7
@@ -61,152 +122,129 @@ __m128i ALL_FF;
  /* t_i = a_i + a_{i+1} */\
  b6 = a0;\
  b7 = a1;\
-  a0 = _mm_xor_si128(a0, a1);\
+  a0 = _mm512_xor_si512(a0, a1);\
  b0 = a2;\
-  a1 = _mm_xor_si128(a1, a2);\
+  a1 = _mm512_xor_si512(a1, a2);\
  b1 = a3;\
-  a2 = _mm_xor_si128(a2, a3);\
+  a2 = _mm512_xor_si512(a2, a3);\
  b2 = a4;\
-  a3 = _mm_xor_si128(a3, a4);\
+  a3 = _mm512_xor_si512(a3, a4);\
  b3 = a5;\
-  a4 = _mm_xor_si128(a4, a5);\
+  a4 = _mm512_xor_si512(a4, a5);\
  b4 = a6;\
-  a5 = _mm_xor_si128(a5, a6);\
+  a5 = _mm512_xor_si512(a5, a6);\
  b5 = a7;\
-  a6 = _mm_xor_si128(a6, a7);\
-  a7 = _mm_xor_si128(a7, b6);\
+  a6 = _mm512_xor_si512(a6, a7);\
+  a7 = _mm512_xor_si512(a7, b6);\
  \
  /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
-  b0 = _mm_xor_si128(b0, a4);\
-  b6 = _mm_xor_si128(b6, a4);\
-  b1 = _mm_xor_si128(b1, a5);\
-  b7 = _mm_xor_si128(b7, a5);\
-  b2 = _mm_xor_si128(b2, a6);\
-  b0 = _mm_xor_si128(b0, a6);\
+  b0 = _mm512_xor_si512(b0, a4);\
+  b6 = _mm512_xor_si512(b6, a4);\
+  b1 = _mm512_xor_si512(b1, a5);\
+  b7 = _mm512_xor_si512(b7, a5);\
+  b2 = _mm512_xor_si512(b2, a6);\
+  b0 = _mm512_xor_si512(b0, a6);\
  /* spill values y_4, y_5 to memory */\
  TEMP0 = b0;\
-  b3 = _mm_xor_si128(b3, a7);\
-  b1 = _mm_xor_si128(b1, a7);\
+  b3 = _mm512_xor_si512(b3, a7);\
+  b1 = _mm512_xor_si512(b1, a7);\
  TEMP1 = b1;\
-  b4 = _mm_xor_si128(b4, a0);\
-  b2 = _mm_xor_si128(b2, a0);\
+  b4 = _mm512_xor_si512(b4, a0);\
+  b2 = _mm512_xor_si512(b2, a0);\
  /* save values t0, t1, t2 to xmm8, xmm9 and memory */\
  b0 = a0;\
-  b5 = _mm_xor_si128(b5, a1);\
-  b3 = _mm_xor_si128(b3, a1);\
+  b5 = _mm512_xor_si512(b5, a1);\
+  b3 = _mm512_xor_si512(b3, a1);\
  b1 = a1;\
-  b6 = _mm_xor_si128(b6, a2);\
-  b4 = _mm_xor_si128(b4, a2);\
+  b6 = _mm512_xor_si512(b6, a2);\
+  b4 = _mm512_xor_si512(b4, a2);\
  TEMP2 = a2;\
-  b7 = _mm_xor_si128(b7, a3);\
-  b5 = _mm_xor_si128(b5, a3);\
+  b7 = _mm512_xor_si512(b7, a3);\
+  b5 = _mm512_xor_si512(b5, a3);\
  \
  /* compute x_i = t_i + t_{i+3} */\
-  a0 = _mm_xor_si128(a0, a3);\
-  a1 = _mm_xor_si128(a1, a4);\
-  a2 = _mm_xor_si128(a2, a5);\
-  a3 = _mm_xor_si128(a3, a6);\
-  a4 = _mm_xor_si128(a4, a7);\
-  a5 = _mm_xor_si128(a5, b0);\
-  a6 = _mm_xor_si128(a6, b1);\
-  a7 = _mm_xor_si128(a7, TEMP2);\
+  a0 = _mm512_xor_si512(a0, a3);\
+  a1 = _mm512_xor_si512(a1, a4);\
+  a2 = _mm512_xor_si512(a2, a5);\
+  a3 = _mm512_xor_si512(a3, a6);\
+  a4 = _mm512_xor_si512(a4, a7);\
+  a5 = _mm512_xor_si512(a5, b0);\
+  a6 = _mm512_xor_si512(a6, b1);\
+  a7 = _mm512_xor_si512(a7, TEMP2);\
  \
  /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
  /* compute w_i : add y_{i+4} */\
-  b1 = ALL_1B;\
+  b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b );\
  MUL2(a0, b0, b1);\
-  a0 = _mm_xor_si128(a0, TEMP0);\
+  a0 = _mm512_xor_si512(a0, TEMP0);\
  MUL2(a1, b0, b1);\
-  a1 = _mm_xor_si128(a1, TEMP1);\
+  a1 = _mm512_xor_si512(a1, TEMP1);\
  MUL2(a2, b0, b1);\
-  a2 = _mm_xor_si128(a2, b2);\
+  a2 = _mm512_xor_si512(a2, b2);\
  MUL2(a3, b0, b1);\
-  a3 = _mm_xor_si128(a3, b3);\
+  a3 = _mm512_xor_si512(a3, b3);\
  MUL2(a4, b0, b1);\
-  a4 = _mm_xor_si128(a4, b4);\
+  a4 = _mm512_xor_si512(a4, b4);\
  MUL2(a5, b0, b1);\
-  a5 = _mm_xor_si128(a5, b5);\
+  a5 = _mm512_xor_si512(a5, b5);\
  MUL2(a6, b0, b1);\
-  a6 = _mm_xor_si128(a6, b6);\
+  a6 = _mm512_xor_si512(a6, b6);\
  MUL2(a7, b0, b1);\
-  a7 = _mm_xor_si128(a7, b7);\
+  a7 = _mm512_xor_si512(a7, b7);\
  \
  /* compute v_i : double w_i      */\
  /* add to y_4 y_5 .. v3, v4, ... */\
  MUL2(a0, b0, b1);\
-  b5 = _mm_xor_si128(b5, a0);\
+  b5 = _mm512_xor_si512(b5, a0);\
  MUL2(a1, b0, b1);\
-  b6 = _mm_xor_si128(b6, a1);\
+  b6 = _mm512_xor_si512(b6, a1);\
  MUL2(a2, b0, b1);\
-  b7 = _mm_xor_si128(b7, a2);\
+  b7 = _mm512_xor_si512(b7, a2);\
  MUL2(a5, b0, b1);\
-  b2 = _mm_xor_si128(b2, a5);\
+  b2 = _mm512_xor_si512(b2, a5);\
  MUL2(a6, b0, b1);\
-  b3 = _mm_xor_si128(b3, a6);\
+  b3 = _mm512_xor_si512(b3, a6);\
  MUL2(a7, b0, b1);\
-  b4 = _mm_xor_si128(b4, a7);\
+  b4 = _mm512_xor_si512(b4, a7);\
  MUL2(a3, b0, b1);\
  MUL2(a4, b0, b1);\
  b0 = TEMP0;\
  b1 = TEMP1;\
-  b0 = _mm_xor_si128(b0, a3);\
-  b1 = _mm_xor_si128(b1, a4);\
+  b0 = _mm512_xor_si512(b0, a3);\
+  b1 = _mm512_xor_si512(b1, a4);\
 }/*MixBytes*/

-#define SET_CONSTANTS(){\
-   ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
-  TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
-  SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\
-  SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\
-  SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\
-  SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\
-  SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\
-  SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\
-  SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\
-  SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\
-  for(i = 0; i < ROUNDS512; i++)\
-  {\
-    ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
-    ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\
-  }\
-  ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\
-}while(0); \

-/* one round
- * i = round number
- * a0-a7 = input rows
- * b0-b7 = output rows
- */
 #define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
  /* AddRoundConstant */\
-  b1 = ROUND_CONST_Lx;\
-  a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\
-  a1 = _mm_xor_si128(a1, b1);\
-  a2 = _mm_xor_si128(a2, b1);\
-  a3 = _mm_xor_si128(a3, b1);\
-  a4 = _mm_xor_si128(a4, b1);\
-  a5 = _mm_xor_si128(a5, b1);\
-  a6 = _mm_xor_si128(a6, b1);\
-  a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\
+  b1 = m512_const2_64( 0xffffffffffffffff, 0 ); \
+  a0 = _mm512_xor_si512( a0, m512_const1_128( round_const_l0[i] ) );\
+  a1 = _mm512_xor_si512( a1, b1 );\
+  a2 = _mm512_xor_si512( a2, b1 );\
+  a3 = _mm512_xor_si512( a3, b1 );\
+  a4 = _mm512_xor_si512( a4, b1 );\
+  a5 = _mm512_xor_si512( a5, b1 );\
+  a6 = _mm512_xor_si512( a6, b1 );\
+  a7 = _mm512_xor_si512( a7, m512_const1_128( round_const_l7[i] ) );\
  \
  /* ShiftBytes + SubBytes (interleaved) */\
-  b0 = _mm_xor_si128(b0,  b0);\
-  a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\
-  a0 = _mm_aesenclast_si128(a0, b0);\
-  a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\
-  a1 = _mm_aesenclast_si128(a1, b0);\
-  a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\
-  a2 = _mm_aesenclast_si128(a2, b0);\
-  a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\
-  a3 = _mm_aesenclast_si128(a3, b0);\
-  a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\
-  a4 = _mm_aesenclast_si128(a4, b0);\
-  a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\
-  a5 = _mm_aesenclast_si128(a5, b0);\
-  a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\
-  a6 = _mm_aesenclast_si128(a6, b0);\
-  a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\
-  a7 = _mm_aesenclast_si128(a7, b0);\
+  b0 = _mm512_xor_si512( b0, b0 );\
+  a0 = _mm512_shuffle_epi8( a0, SUBSH_MASK0 );\
+  a0 = _mm512_aesenclast_epi128(a0, b0 );\
+  a1 = _mm512_shuffle_epi8( a1, SUBSH_MASK1 );\
+  a1 = _mm512_aesenclast_epi128(a1, b0 );\
+  a2 = _mm512_shuffle_epi8( a2, SUBSH_MASK2 );\
+  a2 = _mm512_aesenclast_epi128(a2, b0 );\
+  a3 = _mm512_shuffle_epi8( a3, SUBSH_MASK3 );\
+  a3 = _mm512_aesenclast_epi128(a3, b0 );\
+  a4 = _mm512_shuffle_epi8( a4, SUBSH_MASK4 );\
+  a4 = _mm512_aesenclast_epi128(a4, b0 );\
+  a5 = _mm512_shuffle_epi8( a5, SUBSH_MASK5 );\
+  a5 = _mm512_aesenclast_epi128(a5, b0 );\
+  a6 = _mm512_shuffle_epi8( a6, SUBSH_MASK6 );\
+  a6 = _mm512_aesenclast_epi128(a6, b0 );\
+  a7 = _mm512_shuffle_epi8( a7, SUBSH_MASK7 );\
+  a7 = _mm512_aesenclast_epi128( a7, b0 );\
  \
  /* MixBytes */\
  MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
@@ -237,31 +275,31 @@ __m128i ALL_FF;
 #define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
  t0 = TRANSP_MASK;\
  \
-  i0 = _mm_shuffle_epi8(i0, t0);\
-  i1 = _mm_shuffle_epi8(i1, t0);\
-  i2 = _mm_shuffle_epi8(i2, t0);\
-  i3 = _mm_shuffle_epi8(i3, t0);\
+  i0 = _mm512_shuffle_epi8( i0, t0 );\
+  i1 = _mm512_shuffle_epi8( i1, t0 );\
+  i2 = _mm512_shuffle_epi8( i2, t0 );\
+  i3 = _mm512_shuffle_epi8( i3, t0 );\
  \
  o1 = i0;\
  t0 = i2;\
  \
-  i0 = _mm_unpacklo_epi16(i0, i1);\
-  o1 = _mm_unpackhi_epi16(o1, i1);\
-  i2 = _mm_unpacklo_epi16(i2, i3);\
-  t0 = _mm_unpackhi_epi16(t0, i3);\
+  i0 = _mm512_unpacklo_epi16( i0, i1 );\
+  o1 = _mm512_unpackhi_epi16( o1, i1 );\
+  i2 = _mm512_unpacklo_epi16( i2, i3 );\
+  t0 = _mm512_unpackhi_epi16( t0, i3 );\
  \
-  i0 = _mm_shuffle_epi32(i0, 216);\
-  o1 = _mm_shuffle_epi32(o1, 216);\
-  i2 = _mm_shuffle_epi32(i2, 216);\
-  t0 = _mm_shuffle_epi32(t0, 216);\
+  i0 = _mm512_shuffle_epi32( i0, 216 );\
+  o1 = _mm512_shuffle_epi32( o1, 216 );\
+  i2 = _mm512_shuffle_epi32( i2, 216 );\
+  t0 = _mm512_shuffle_epi32( t0, 216 );\
  \
  o2 = i0;\
  o3 = o1;\
  \
-  i0 = _mm_unpacklo_epi32(i0, i2);\
-  o1 = _mm_unpacklo_epi32(o1, t0);\
-  o2 = _mm_unpackhi_epi32(o2, i2);\
-  o3 = _mm_unpackhi_epi32(o3, t0);\
+  i0 = _mm512_unpacklo_epi32( i0, i2 );\
+  o1 = _mm512_unpacklo_epi32( o1, t0 );\
+  o2 = _mm512_unpackhi_epi32( o2, i2 );\
+  o3 = _mm512_unpackhi_epi32( o3, t0 );\
 }/**/

 /* Matrix Transpose Step 2
@@ -279,19 +317,19 @@ __m128i ALL_FF;
 #define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
  o1 = i0;\
  o2 = i1;\
-  i0 = _mm_unpacklo_epi64(i0, i4);\
-  o1 = _mm_unpackhi_epi64(o1, i4);\
+  i0 = _mm512_unpacklo_epi64( i0, i4 );\
+  o1 = _mm512_unpackhi_epi64( o1, i4 );\
  o3 = i1;\
  o4 = i2;\
-  o2 = _mm_unpacklo_epi64(o2, i5);\
-  o3 = _mm_unpackhi_epi64(o3, i5);\
+  o2 = _mm512_unpacklo_epi64( o2, i5 );\
+  o3 = _mm512_unpackhi_epi64( o3, i5 );\
  o5 = i2;\
  o6 = i3;\
-  o4 = _mm_unpacklo_epi64(o4, i6);\
-  o5 = _mm_unpackhi_epi64(o5, i6);\
+  o4 = _mm512_unpacklo_epi64( o4, i6 );\
+  o5 = _mm512_unpackhi_epi64( o5, i6 );\
  o7 = i3;\
-  o6 = _mm_unpacklo_epi64(o6, i7);\
-  o7 = _mm_unpackhi_epi64(o7, i7);\
+  o6 = _mm512_unpacklo_epi64( o6, i7 );\
+  o7 = _mm512_unpackhi_epi64( o7, i7 );\
 }/**/

 /* Matrix Transpose Inverse Step 2
@@ -302,19 +340,20 @@ __m128i ALL_FF;
 */
 #define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
  o0 = i0;\
-  i0 = _mm_unpacklo_epi64(i0, i1);\
-  o0 = _mm_unpackhi_epi64(o0, i1);\
+  i0 = _mm512_unpacklo_epi64( i0, i1 );\
+  o0 = _mm512_unpackhi_epi64( o0, i1 );\
  o1 = i2;\
-  i2 = _mm_unpacklo_epi64(i2, i3);\
-  o1 = _mm_unpackhi_epi64(o1, i3);\
+  i2 = _mm512_unpacklo_epi64( i2, i3 );\
+  o1 = _mm512_unpackhi_epi64( o1, i3 );\
  o2 = i4;\
-  i4 = _mm_unpacklo_epi64(i4, i5);\
-  o2 = _mm_unpackhi_epi64(o2, i5);\
+  i4 = _mm512_unpacklo_epi64( i4, i5 );\
+  o2 = _mm512_unpackhi_epi64( o2, i5 );\
  o3 = i6;\
-  i6 = _mm_unpacklo_epi64(i6, i7);\
-  o3 = _mm_unpackhi_epi64(o3, i7);\
+  i6 = _mm512_unpacklo_epi64( i6, i7 );\
+  o3 = _mm512_unpackhi_epi64( o3, i7 );\
 }/**/

+
 /* Matrix Transpose Output Step 2
 * input is one 512-bit state with two rows in one xmm
 * output is one 512-bit state with one row in the low 64-bits of one xmm
@@ -322,19 +361,19 @@ __m128i ALL_FF;
 * outputs: (i0-7) = (0|S)
 */
 #define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
-  t0 = _mm_xor_si128(t0, t0);\
+  t0 = _mm512_xor_si512( t0, t0 );\
  i1 = i0;\
  i3 = i2;\
  i5 = i4;\
  i7 = i6;\
-  i0 = _mm_unpacklo_epi64(i0, t0);\
-  i1 = _mm_unpackhi_epi64(i1, t0);\
-  i2 = _mm_unpacklo_epi64(i2, t0);\
-  i3 = _mm_unpackhi_epi64(i3, t0);\
-  i4 = _mm_unpacklo_epi64(i4, t0);\
-  i5 = _mm_unpackhi_epi64(i5, t0);\
-  i6 = _mm_unpacklo_epi64(i6, t0);\
-  i7 = _mm_unpackhi_epi64(i7, t0);\
+  i0 = _mm512_unpacklo_epi64( i0, t0 );\
+  i1 = _mm512_unpackhi_epi64( i1, t0 );\
+  i2 = _mm512_unpacklo_epi64( i2, t0 );\
+  i3 = _mm512_unpackhi_epi64( i3, t0 );\
+  i4 = _mm512_unpacklo_epi64( i4, t0 );\
+  i5 = _mm512_unpackhi_epi64( i5, t0 );\
+  i6 = _mm512_unpacklo_epi64( i6, t0 );\
+  i7 = _mm512_unpackhi_epi64( i7, t0 );\
 }/**/

 /* Matrix Transpose Output Inverse Step 2
@@ -344,46 +383,20 @@ __m128i ALL_FF;
 * outputs: (i0, i2, i4, i6) = S
 */
 #define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
-  i0 = _mm_unpacklo_epi64(i0, i1);\
-  i2 = _mm_unpacklo_epi64(i2, i3);\
-  i4 = _mm_unpacklo_epi64(i4, i5);\
-  i6 = _mm_unpacklo_epi64(i6, i7);\
+  i0 = _mm512_unpacklo_epi64( i0, i1 );\
+  i2 = _mm512_unpacklo_epi64( i2, i3 );\
+  i4 = _mm512_unpacklo_epi64( i4, i5 );\
+  i6 = _mm512_unpacklo_epi64( i6, i7 );\
 }/**/


-void INIT256( __m128i* chaining )
+void TF512_4way( __m512i* chaining, __m512i* message )
 {
-  static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7;
-  static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15;
-
-  /* load IV into registers xmm12 - xmm15 */
-  xmm12 = chaining[0];
-  xmm13 = chaining[1];
-  xmm14 = chaining[2];
-  xmm15 = chaining[3];
-
-  /* transform chaining value from column ordering into row ordering */
-  /* we put two rows (64 bit) of the IV into one 128-bit XMM register */
-  Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
-
-  /* store transposed IV */
-  chaining[0] = xmm12;
-  chaining[1] = xmm2;
-  chaining[2] = xmm6;
-  chaining[3] = xmm7;
-}
-
-void TF512( __m128i* chaining, __m128i* message )
-{
-  static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-  static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
-  static __m128i TEMP0;
-  static __m128i TEMP1;
-  static __m128i TEMP2;
-
-#ifdef IACA_TRACE
-  IACA_START;
-#endif
+  static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+  static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
+  static __m512i TEMP0;
+  static __m512i TEMP1;
+  static __m512i TEMP2;

  /* load message into registers xmm12 - xmm15 */
  xmm12 = message[0];
@@ -404,10 +417,10 @@ void TF512( __m128i* chaining, __m128i* message )

  /* xor message to CV get input of P */
  /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
-  xmm8 = _mm_xor_si128(xmm8, xmm12);
-  xmm0 = _mm_xor_si128(xmm0, xmm2);
-  xmm4 = _mm_xor_si128(xmm4, xmm6);
-  xmm5 = _mm_xor_si128(xmm5, xmm7);
+  xmm8 = _mm512_xor_si512( xmm8, xmm12 );
+  xmm0 = _mm512_xor_si512( xmm0, xmm2 );
+  xmm4 = _mm512_xor_si512( xmm4, xmm6 );
+  xmm5 = _mm512_xor_si512( xmm5, xmm7 );

  /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
  /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
@@ -422,17 +435,17 @@ void TF512( __m128i* chaining, __m128i* message )

  /* xor output of P and Q */
  /* result: P(CV+M)+Q(M) in xmm0...xmm3 */
-  xmm0 = _mm_xor_si128(xmm0, xmm8);
-  xmm1 = _mm_xor_si128(xmm1, xmm10);
-  xmm2 = _mm_xor_si128(xmm2, xmm12);
-  xmm3 = _mm_xor_si128(xmm3, xmm14);
+  xmm0 = _mm512_xor_si512( xmm0, xmm8 );
+  xmm1 = _mm512_xor_si512( xmm1, xmm10 );
+  xmm2 = _mm512_xor_si512( xmm2, xmm12 );
+  xmm3 = _mm512_xor_si512( xmm3, xmm14 );

  /* xor CV (feed-forward) */
  /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
-  xmm0 = _mm_xor_si128(xmm0, (chaining[0]));
-  xmm1 = _mm_xor_si128(xmm1, (chaining[1]));
-  xmm2 = _mm_xor_si128(xmm2, (chaining[2]));
-  xmm3 = _mm_xor_si128(xmm3, (chaining[3]));
+  xmm0 = _mm512_xor_si512( xmm0, (chaining[0]) );
+  xmm1 = _mm512_xor_si512( xmm1, (chaining[1]) );
+  xmm2 = _mm512_xor_si512( xmm2, (chaining[2]) );
+  xmm3 = _mm512_xor_si512( xmm3, (chaining[3]) );

  /* store CV */
  chaining[0] = xmm0;
@@ -440,19 +453,16 @@ void TF512( __m128i* chaining, __m128i* message )
  chaining[2] = xmm2;
  chaining[3] = xmm3;

-#ifdef IACA_TRACE
-  IACA_END;
-#endif
  return;
 }

-void OF512( __m128i* chaining )
+void OF512_4way( __m512i* chaining )
 {
-  static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-  static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
-  static __m128i TEMP0;
-  static __m128i TEMP1;
-  static __m128i TEMP2;
+  static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+  static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
+  static __m512i TEMP0;
+  static __m512i TEMP1;
+  static __m512i TEMP2;

  /* load CV into registers xmm8, xmm10, xmm12, xmm14 */
  xmm8 = chaining[0];
@@ -475,10 +485,10 @@ void OF512( __m128i* chaining )

  /* xor CV to P output (feed-forward) */
  /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
-  xmm8 = _mm_xor_si128(xmm8,  (chaining[0]));
-  xmm10 = _mm_xor_si128(xmm10, (chaining[1]));
-  xmm12 = _mm_xor_si128(xmm12, (chaining[2]));
-  xmm14 = _mm_xor_si128(xmm14, (chaining[3]));
+  xmm8  = _mm512_xor_si512( xmm8,  (chaining[0]) );
+  xmm10 = _mm512_xor_si512( xmm10, (chaining[1]) );
+  xmm12 = _mm512_xor_si512( xmm12, (chaining[2]) );
+  xmm14 = _mm512_xor_si512( xmm14, (chaining[3]) );

  /* transform state back from row ordering into column ordering */
  /* result: final hash value in xmm9, xmm11 */
@@ -489,4 +499,5 @@ void OF512( __m128i* chaining )
  chaining[3] = xmm11;
 }

-
+#endif  // VAES
+#endif  // GROESTL512_INTR_4WAY_H__
--- a/algo/groestl/groestl512-hash-4way.c
+++ b/algo/groestl/groestl512-hash-4way.c
@@ -15,38 +15,18 @@
 #include "miner.h"
 #include "simd-utils.h"

-#if defined(__VAES__)
-
-#define ROTL64(a,n) \
-   ( ( ( (a)<<(n) ) | ( (a) >> (64-(n)) ) ) & 0xffffffffffffffff )
-     
-#define U64BIG(a) \
-  ( ( ROTL64(a, 8) & 0x000000FF000000FF ) | \
-    ( ROTL64(a,24) & 0x0000FF000000FF00 ) | \
-    ( ROTL64(a,40) & 0x00FF000000FF0000 ) | \
-    ( ROTL64(a,56) & 0xFF000000FF000000 ) )
+#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

 int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen )
 {
-  int i;
-
-  ctx->hashlen = hashlen;
-  SET_CONSTANTS();
-
  if (ctx->chaining == NULL || ctx->buffer == NULL)
    return 1;

-  for ( i = 0; i < SIZE512; i++ )
-  {
-     ctx->chaining[i] = m512_zero;
-     ctx->buffer[i]   = m512_zero;
-  }
+  memset_zero_512( ctx->chaining, SIZE512 );
+  memset_zero_512( ctx->buffer, SIZE512 );

  // The only non-zero in the IV is len. It can be hard coded.
  ctx->chaining[ 6 ] = m512_const2_64( 0x0200000000000000, 0 );
-//  uint64_t len = U64BIG((uint64_t)LENGTH);
-//  ctx->chaining[ COLS/2 -1 ] = _mm512_set4_epi64( len, 0, len, 0 );
-//  INIT_4way(ctx->chaining);

  ctx->buf_ptr = 0;
  ctx->rem_ptr = 0;
@@ -58,7 +38,7 @@ int groestl512_4way_update_close( groestl512_4way_context* ctx, void* output,
                                const void* input, uint64_t databitlen )
 {
   const int len = (int)databitlen / 128;
-   const int hashlen_m128i = ctx->hashlen / 16;   // bytes to __m128i
+   const int hashlen_m128i = 64 / 16;   // bytes to __m128i
   const int hash_offset = SIZE512 - hashlen_m128i;
   int rem = ctx->rem_ptr;
   int blocks = len / SIZE512;
@@ -67,16 +47,13 @@ int groestl512_4way_update_close( groestl512_4way_context* ctx, void* output,

   // --- update ---

-   // digest any full blocks, process directly from input 
   for ( i = 0; i < blocks; i++ )
      TF1024_4way( ctx->chaining, &in[ i * SIZE512 ] );
   ctx->buf_ptr = blocks * SIZE512;

-   // copy any remaining data to buffer, it may already contain data
-   // from a previous update for a midstate precalc
   for ( i = 0; i < len % SIZE512; i++ )
       ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
-   i += rem;    // use i as rem_ptr in final
+   i += rem; 

   //--- final ---

@@ -90,23 +67,70 @@ int groestl512_4way_update_close( groestl512_4way_context* ctx, void* output,
   }   
   else
   {
-       // add first padding
       ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 );
-       // add zero padding
       for ( i += 1; i < SIZE512 - 1; i++ )
           ctx->buffer[i] = m512_zero;
-
-       // add length padding, second last byte is zero unless blocks > 255
       ctx->buffer[i] = m512_const1_128( _mm_set_epi8(
                   blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) );
   }

-// digest final padding block and do output transform
   TF1024_4way( ctx->chaining, ctx->buffer );
-
   OF1024_4way( ctx->chaining );

-   // store hash result in output 
+   for ( i = 0; i < hashlen_m128i; i++ )
+      casti_m512i( output, i ) = ctx->chaining[ hash_offset + i ];
+
+   return 0;
+}
+
+int groestl512_4way_full( groestl512_4way_context* ctx, void* output,
+                          const void* input, uint64_t datalen )
+{
+   const int len = (int)( datalen >> 4 );   // datalen bytes to 16 byte vectors
+   const int hashlen_m128i = 64 >> 4;   // bytes to __m128i
+   const int hash_offset = SIZE512 - hashlen_m128i;
+   uint64_t blocks = len / SIZE512;     // number of complete input blocks
+   __m512i* in = (__m512i*)input;
+   int i;
+   // One shot hash of datalen bytes: init, update and close in a single call.
+   // --- init ---
+   // Same starting state as groestl512_4way_init, prior ctx content is dropped.
+   memset_zero_512( ctx->chaining, SIZE512 );
+   memset_zero_512( ctx->buffer, SIZE512 );
+   ctx->chaining[ 6 ] = m512_const2_64( 0x0200000000000000, 0 );
+   ctx->buf_ptr = 0;
+   ctx->rem_ptr = 0;
+
+   // --- update ---
+   // Compress every complete block straight from the input.
+   for ( i = 0; (uint64_t)i < blocks; i++ )
+      TF1024_4way( ctx->chaining, &in[ i * SIZE512 ] );
+   ctx->buf_ptr = (int)( blocks * SIZE512 );
+   // Stage the remaining vectors in the buffer for the padding block.
+   for ( i = 0; i < len % SIZE512; i++ )
+       ctx->buffer[ ctx->rem_ptr + i ] = in[ ctx->buf_ptr + i ];
+   i += ctx->rem_ptr;
+
+   // --- close ---
+   // Pad with 0x80, zeros, then the block count including the padding block.
+   blocks++;
+   // Byte swap the whole count so values above 255 are not truncated.
+   if ( i == SIZE512 - 1 )
+   {
+       // only 1 vector left in buffer, all padding at once
+       ctx->buffer[i] = m512_const2_64( __builtin_bswap64( blocks ), 0x80 );
+   }
+   else
+   {
+       ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 );
+       for ( i += 1; i < SIZE512 - 1; i++ )
+           ctx->buffer[i] = m512_zero;
+       ctx->buffer[i] = m512_const2_64( __builtin_bswap64( blocks ), 0 );
+   }
+
+   TF1024_4way( ctx->chaining, ctx->buffer );
+   OF1024_4way( ctx->chaining );
+
   for ( i = 0; i < hashlen_m128i; i++ )
      casti_m512i( output, i ) = ctx->chaining[ hash_offset + i ];

--- a/algo/groestl/groestl512-hash-4way.h
+++ b/algo/groestl/groestl512-hash-4way.h
@@ -1,11 +1,3 @@
-/* hash.h     Aug 2011
- *
- * Groestl implementation for different versions.
- * Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
- *
- * This code is placed in the public domain
- */
-
 #if !defined(GROESTL512_HASH_4WAY_H__)
 #define GROESTL512_HASH_4WAY_H__ 1

@@ -18,11 +10,9 @@
 #endif
 #include <stdlib.h>

-#define LENGTH (512)
+#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

-//#include "brg_endian.h"
-//#define NEED_UINT_64T
-//#include "algo/sha/brg_types.h"
+#define LENGTH (512)

 /* some sizes (number of bytes) */
 #define ROWS (8)
@@ -44,34 +34,11 @@
 #define ROUNDS (ROUNDS1024)
 //#endif

-/*
-#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
-
-#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
-#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))
-#define U64BIG(a) (a)
-#endif // IS_BIG_ENDIAN 
-
-#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
-#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*n)))
-#define U64BIG(a) \
-  ((ROTL64(a, 8) & li_64(000000FF000000FF)) | \
-   (ROTL64(a,24) & li_64(0000FF000000FF00)) | \
-   (ROTL64(a,40) & li_64(00FF000000FF0000)) | \
-   (ROTL64(a,56) & li_64(FF000000FF000000)))
-#endif // IS_LITTLE_ENDIAN 
-
-typedef unsigned char BitSequence_gr;
-typedef unsigned long long DataLength_gr;
-typedef enum { SUCCESS_GR = 0, FAIL_GR = 1, BAD_HASHBITLEN_GR = 2} HashReturn_gr;
-*/
-
 #define SIZE512 (SIZE_1024/16)

 typedef struct {
  __attribute__ ((aligned (128))) __m512i chaining[SIZE512];
  __attribute__ ((aligned (64))) __m512i buffer[SIZE512];
-  int hashlen;       // byte
  int blk_count;     // SIZE_m128i
  int buf_ptr;       // __m128i offset
  int rem_ptr;
@@ -85,10 +52,11 @@ int groestl512_4way_init( groestl512_4way_context*, uint64_t );

 int groestl512_4way_update( groestl512_4way_context*, const void*,
                              uint64_t );
-
 int groestl512_4way_close( groestl512_4way_context*, void* );
-
 int groestl512_4way_update_close( groestl512_4way_context*,  void*,
                                        const void*, uint64_t );
+int groestl512_4way_full( groestl512_4way_context*,  void*,
+                          const void*, uint64_t );

-#endif /* __hash_h */
+#endif   // VAES
+#endif   // GROESTL512_HASH_4WAY_H__
--- a/algo/groestl/groestl512-intr-4way.h
+++ b/algo/groestl/groestl512-intr-4way.h
@@ -15,16 +15,86 @@

 #if defined(__VAES__)

-/* global constants  */
-__m512i ROUND_CONST_Lx;
-//__m128i ROUND_CONST_L0[ROUNDS512];
-//__m128i ROUND_CONST_L7[ROUNDS512];
-__m512i ROUND_CONST_P[ROUNDS1024];
-__m512i ROUND_CONST_Q[ROUNDS1024];
-__m512i TRANSP_MASK;
-__m512i SUBSH_MASK[8];
-__m512i ALL_1B;
-__m512i ALL_FF;
+static const __m128i round_const_p[] __attribute__ ((aligned (64))) =
+{  // AddRoundConstant values for P1024, one 128 bit entry per round
+   { 0x7060504030201000, 0xf0e0d0c0b0a09080 },   // round 0, base pattern
+   { 0x7161514131211101, 0xf1e1d1c1b1a19181 },   // round i = base with i xored into every byte
+   { 0x7262524232221202, 0xf2e2d2c2b2a29282 },
+   { 0x7363534333231303, 0xf3e3d3c3b3a39383 },
+   { 0x7464544434241404, 0xf4e4d4c4b4a49484 },
+   { 0x7565554535251505, 0xf5e5d5c5b5a59585 },
+   { 0x7666564636261606, 0xf6e6d6c6b6a69686 },
+   { 0x7767574737271707, 0xf7e7d7c7b7a79787 },
+   { 0x7868584838281808, 0xf8e8d8c8b8a89888 },
+   { 0x7969594939291909, 0xf9e9d9c9b9a99989 },
+   { 0x7a6a5a4a3a2a1a0a, 0xfaeadacabaaa9a8a },
+   { 0x7b6b5b4b3b2b1b0b, 0xfbebdbcbbbab9b8b },
+   { 0x7c6c5c4c3c2c1c0c, 0xfcecdcccbcac9c8c },
+   { 0x7d6d5d4d3d2d1d0d, 0xfdedddcdbdad9d8d }    // round 13, last index used by the round loop
+};
+
+static const __m128i round_const_q[] __attribute__ ((aligned (64))) =
+{  // AddRoundConstant values for Q1024, one 128 bit entry per round
+   { 0x8f9fafbfcfdfefff, 0x0f1f2f3f4f5f6f7f },   // round 0, base pattern
+   { 0x8e9eaebecedeeefe, 0x0e1e2e3e4e5e6e7e },   // round i = base with i xored into every byte
+   { 0x8d9dadbdcdddedfd, 0x0d1d2d3d4d5d6d7d },
+   { 0x8c9cacbcccdcecfc, 0x0c1c2c3c4c5c6c7c },
+   { 0x8b9babbbcbdbebfb, 0x0b1b2b3b4b5b6b7b },
+   { 0x8a9aaabacadaeafa, 0x0a1a2a3a4a5a6a7a },
+   { 0x8999a9b9c9d9e9f9, 0x0919293949596979 },
+   { 0x8898a8b8c8d8e8f8, 0x0818283848586878 },
+   { 0x8797a7b7c7d7e7f7, 0x0717273747576777 },
+   { 0x8696a6b6c6d6e6f6, 0x0616263646566676 },
+   { 0x8595a5b5c5d5e5f5, 0x0515253545556575 },
+   { 0x8494a4b4c4d4e4f4, 0x0414243444546474 },
+   { 0x8393a3b3c3d3e3f3, 0x0313233343536373 },
+   { 0x8292a2b2c2d2e2f2, 0x0212223242526272 }    // round 13, last index used by the round loop
+};
+
+static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02,   // 128 bit lane 0: byte permutation pattern
+                                     0x1d1519111c141810, 0x1f171b131e161a12,   // lanes 1-3 repeat the pattern with the
+                                     0x2d2529212c242820, 0x2f272b232e262a22,   // lane index in the high nibble of each byte
+                                     0x3d3539313c343830, 0x3f373b333e363a32 }; // NOTE(review): consumer not visible in this hunk, presumably the Matrix_Transpose macros - confirm
+
+static const __m512i SUBSH_MASK0 = { 0x0b0e0104070a0d00, 0x0306090c0f020508,   // ShiftBytes shuffle control for _mm512_shuffle_epi8
+                                     0x1b1e1114171a1d10, 0x1316191c1f121518,   // ROUNDS_P applies it to xmm8/xmm0, ROUNDS_Q to xmm12/xmm4
+                                     0x2b2e2124272a2d20, 0x2326292c2f222528,   // each 128 bit lane holds the same pattern,
+                                     0x3b3e3134373a3d30, 0x3336393c3f323538 }; // lane index in the high nibble of each byte
+
+static const __m512i SUBSH_MASK1 = { 0x0c0f0205080b0e01, 0x04070a0d00030609,   // P: xmm9/xmm1, Q: xmm8/xmm0
+                                     0x1c1f1215181b1e11, 0x14171a1d10131619,
+                                     0x2c2f2225282b2e21, 0x24272a2d20232629,
+                                     0x3c3f3235383b3e31, 0x34373a3d30333639 };
+
+static const __m512i SUBSH_MASK2 = { 0x0d000306090c0f02, 0x05080b0e0104070a,   // P: xmm10/xmm2, Q: xmm13/xmm5
+                                     0x1d101316191c1f12, 0x15181b1e1114171a,
+                                     0x2d202326292c2f22, 0x25282b2e2124272a,
+                                     0x3d303336393c3f32, 0x35383b3e3134373a };
+
+static const __m512i SUBSH_MASK3 = { 0x0e0104070a0d0003, 0x06090c0f0205080b,   // P: xmm11/xmm3, Q: xmm9/xmm1
+                                     0x1e1114171a1d1013, 0x16191c1f1215181b,
+                                     0x2e2124272a2d2023, 0x26292c2f2225282b,
+                                     0x3e3134373a3d3033, 0x36393c3f3235383b };
+
+static const __m512i SUBSH_MASK4 = { 0x0f0205080b0e0104, 0x070a0d000306090c,   // P: xmm12/xmm4, Q: xmm14/xmm6
+                                     0x1f1215181b1e1114, 0x171a1d101316191c,
+                                     0x2f2225282b2e2124, 0x272a2d202326292c,
+                                     0x3f3235383b3e3134, 0x373a3d303336393c };
+
+static const __m512i SUBSH_MASK5 = { 0x000306090c0f0205, 0x080b0e0104070a0d,   // P: xmm13/xmm5, Q: xmm10/xmm2
+                                     0x101316191c1f1215, 0x181b1e1114171a1d,
+                                     0x202326292c2f2225, 0x282b2e2124272a2d,
+                                     0x303336393c3f3235, 0x383b3e3134373a3d };
+
+static const __m512i SUBSH_MASK6 = { 0x0104070a0d000306, 0x090c0f0205080b0e,   // P: xmm14/xmm6, Q: xmm15/xmm7
+                                     0x1114171a1d101316, 0x191c1f1215181b1e,
+                                     0x2124272a2d202326, 0x292c2f2225282b2e,
+                                     0x3134373a3d303336, 0x393c3f3235383b3e };
+
+static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,   // P: xmm15/xmm7, Q: xmm11/xmm3
+                                     0x16191c1f1215181b, 0x1e1114171a1d1013,
+                                     0x26292c2f2225282b, 0x2e2124272a2d2023,
+                                     0x36393c3f3235383b, 0x3e3134373a3d3033 };

 #define tos(a)    #a
 #define tostr(a)  tos(a)
@@ -155,69 +225,6 @@ __m512i ALL_FF;
  b1 = _mm512_xor_si512(b1, a4);\
 }/*MixBytes*/

-// calculate the round constants seperately and load at startup
-
-#define SET_CONSTANTS(){\
-  ALL_FF = _mm512_set1_epi32( 0xffffffff );\
-  ALL_1B = _mm512_set1_epi32( 0x1b1b1b1b );\
-  TRANSP_MASK   = _mm512_set_epi32( \
-                         0x3f373b33, 0x3e363a32, 0x3d353931, 0x3c343830, \
-                         0x2f272b23, 0x2e262a22, 0x2d252921, 0x2c242820, \
-                         0x1f171b13, 0x1e161a12, 0x1d151911, 0x1c141810, \
-                         0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800 ); \
-  SUBSH_MASK[0] = _mm512_set_epi32( \
-                         0x3336393c, 0x3f323538, 0x3b3e3134, 0x373a3d30, \
-                         0x2326292c, 0x2f222528, 0x2b2e2124, 0x272a2d20, \
-                         0x1316191c, 0x1f121518, 0x1b1e1114, 0x171a1d10, \
-                         0x0306090c, 0x0f020508, 0x0b0e0104, 0x070a0d00 ); \
-  SUBSH_MASK[1] = _mm512_set_epi32( \
-                         0x34373a3d, 0x30333639, 0x3c3f3235, 0x383b3e31, \
-                         0x24272a2d, 0x20232629, 0x2c2f2225, 0x282b2e21, \
-                         0x14171a1d, 0x10131619, 0x1c1f1215, 0x181b1e11, \
-                         0x04070a0d, 0x00030609, 0x0c0f0205, 0x080b0e01 ); \
-  SUBSH_MASK[2] = _mm512_set_epi32( \
-                         0x35383b3e, 0x3134373a, 0x3d303336, 0x393c3f32, \
-                         0x25282b2e, 0x2124272a, 0x2d202326, 0x292c2f22, \
-                         0x15181b1e, 0x1114171a, 0x1d101316, 0x191c1f12, \
-                         0x05080b0e, 0x0104070a, 0x0d000306, 0x090c0f02 ); \
-  SUBSH_MASK[3] = _mm512_set_epi32( \
-                         0x36393c3f, 0x3235383b, 0x3e313437, 0x3a3d3033, \
-                         0x26292c2f, 0x2225282b, 0x2e212427, 0x2a2d2023, \
-                         0x16191c1f, 0x1215181b, 0x1e111417, 0x1a1d1013, \
-                         0x06090c0f, 0x0205080b, 0x0e010407, 0x0a0d0003 ); \
-  SUBSH_MASK[4] = _mm512_set_epi32( \
-                         0x373a3d30, 0x3336393c, 0x3f323538, 0x3b3e3134, \
-                         0x272a2d20, 0x2326292c, 0x2f222528, 0x2b2e2124, \
-                         0x171a1d10, 0x1316191c, 0x1f121518, 0x1b1e1114, \
-                         0x070a0d00, 0x0306090c, 0x0f020508, 0x0b0e0104 ); \
-  SUBSH_MASK[5] = _mm512_set_epi32( \
-                         0x383b3e31, 0x34373a3d, 0x30333639, 0x3c3f3235, \
-                         0x282b2e21, 0x24272a2d, 0x20232629, 0x2c2f2225, \
-                         0x181b1e11, 0x14171a1d, 0x10131619, 0x1c1f1215, \
-                         0x080b0e01, 0x04070a0d, 0x00030609, 0x0c0f0205 ); \
-  SUBSH_MASK[6] = _mm512_set_epi32( \
-                         0x393c3f32, 0x35383b3e, 0x3134373a, 0x3d303336, \
-                         0x292c2f22, 0x25282b2e, 0x2124272a, 0x2d202326, \
-                         0x191c1f12, 0x15181b1e, 0x1114171a, 0x1d101316, \
-                         0x090c0f02, 0x05080b0e, 0x0104070a, 0x0d000306 ); \
-  SUBSH_MASK[7] = _mm512_set_epi32( \
-                         0x3e313437, 0x3a3d3033, 0x36393c3f, 0x3235383b, \
-                         0x2e212427, 0x2a2d2023, 0x26292c2f, 0x2225282b, \
-                         0x1e111417, 0x1a1d1013, 0x16191c1f, 0x1215181b, \
-                         0x0e010407, 0x0a0d0003, 0x06090c0f, 0x0205080b ); \
-  for( i = 0; i < ROUNDS1024; i++ ) \
-  { \
-    ROUND_CONST_P[i] = _mm512_set4_epi32( 0xf0e0d0c0 ^ (i * 0x01010101), \
-                                          0xb0a09080 ^ (i * 0x01010101), \
-                                          0x70605040 ^ (i * 0x01010101), \
-                                          0x30201000 ^ (i * 0x01010101) ); \
-    ROUND_CONST_Q[i] = _mm512_set4_epi32( 0x0f1f2f3f ^ (i * 0x01010101), \
-                                          0x4f5f6f7f ^ (i * 0x01010101), \
-                                          0x8f9fafbf ^ (i * 0x01010101), \
-                                          0xcfdfefff ^ (i * 0x01010101));\
-  } \
-}while(0);\
-
 /* one round
 * a0-a7 = input rows
 * b0-b7 = output rows
@@ -242,30 +249,32 @@ __m512i ALL_FF;
  for ( round_counter = 0; round_counter < 14; round_counter += 2 ) \
  { \
    /* AddRoundConstant P1024 */\
-    xmm8 = _mm512_xor_si512( xmm8, ( ROUND_CONST_P[ round_counter ] ) );\
+    xmm8 = _mm512_xor_si512( xmm8, m512_const1_128( \
+             casti_m128i( round_const_p, round_counter ) ) ); \
    /* ShiftBytes P1024 + pre-AESENCLAST */\
-    xmm8  = _mm512_shuffle_epi8( xmm8,  ( SUBSH_MASK[0] ) );\
-    xmm9  = _mm512_shuffle_epi8( xmm9,  ( SUBSH_MASK[1] ) );\
-    xmm10 = _mm512_shuffle_epi8( xmm10, ( SUBSH_MASK[2] ) );\
-    xmm11 = _mm512_shuffle_epi8( xmm11, ( SUBSH_MASK[3] ) );\
-    xmm12 = _mm512_shuffle_epi8( xmm12, ( SUBSH_MASK[4] ) );\
-    xmm13 = _mm512_shuffle_epi8( xmm13, ( SUBSH_MASK[5] ) );\
-    xmm14 = _mm512_shuffle_epi8( xmm14, ( SUBSH_MASK[6] ) );\
-    xmm15 = _mm512_shuffle_epi8( xmm15, ( SUBSH_MASK[7] ) );\
+    xmm8  = _mm512_shuffle_epi8( xmm8,  SUBSH_MASK0 ); \
+    xmm9  = _mm512_shuffle_epi8( xmm9,  SUBSH_MASK1 );\
+    xmm10 = _mm512_shuffle_epi8( xmm10, SUBSH_MASK2 );\
+    xmm11 = _mm512_shuffle_epi8( xmm11, SUBSH_MASK3 );\
+    xmm12 = _mm512_shuffle_epi8( xmm12, SUBSH_MASK4 );\
+    xmm13 = _mm512_shuffle_epi8( xmm13, SUBSH_MASK5 );\
+    xmm14 = _mm512_shuffle_epi8( xmm14, SUBSH_MASK6 );\
+    xmm15 = _mm512_shuffle_epi8( xmm15, SUBSH_MASK7 );\
    /* SubBytes + MixBytes */\
    SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
    \
     /* AddRoundConstant P1024 */\
-    xmm0 = _mm512_xor_si512( xmm0, ( ROUND_CONST_P[ round_counter+1 ] ) );\
+    xmm0 = _mm512_xor_si512( xmm0, m512_const1_128( \
+             casti_m128i( round_const_p, round_counter+1 ) ) ); \
    /* ShiftBytes P1024 + pre-AESENCLAST */\
-    xmm0 = _mm512_shuffle_epi8( xmm0, ( SUBSH_MASK[0] ) );\
-    xmm1 = _mm512_shuffle_epi8( xmm1, ( SUBSH_MASK[1] ) );\
-    xmm2 = _mm512_shuffle_epi8( xmm2, ( SUBSH_MASK[2] ) );\
-    xmm3 = _mm512_shuffle_epi8( xmm3, ( SUBSH_MASK[3] ) );\
-    xmm4 = _mm512_shuffle_epi8( xmm4, ( SUBSH_MASK[4] ) );\
-    xmm5 = _mm512_shuffle_epi8( xmm5, ( SUBSH_MASK[5] ) );\
-    xmm6 = _mm512_shuffle_epi8( xmm6, ( SUBSH_MASK[6] ) );\
-    xmm7 = _mm512_shuffle_epi8( xmm7, ( SUBSH_MASK[7] ) );\
+    xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK0 );\
+    xmm1 = _mm512_shuffle_epi8( xmm1, SUBSH_MASK1 );\
+    xmm2 = _mm512_shuffle_epi8( xmm2, SUBSH_MASK2 );\
+    xmm3 = _mm512_shuffle_epi8( xmm3, SUBSH_MASK3 );\
+    xmm4 = _mm512_shuffle_epi8( xmm4, SUBSH_MASK4 );\
+    xmm5 = _mm512_shuffle_epi8( xmm5, SUBSH_MASK5 );\
+    xmm6 = _mm512_shuffle_epi8( xmm6, SUBSH_MASK6 );\
+    xmm7 = _mm512_shuffle_epi8( xmm7, SUBSH_MASK7 );\
    /* SubBytes + MixBytes */\
     SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
  }\
@@ -284,16 +293,17 @@ __m512i ALL_FF;
    xmm12 = _mm512_xor_si512( xmm12, xmm1 );\
    xmm13 = _mm512_xor_si512( xmm13, xmm1 );\
    xmm14 = _mm512_xor_si512( xmm14, xmm1 );\
-    xmm15 = _mm512_xor_si512( xmm15, ( ROUND_CONST_Q[ round_counter ] ) );\
+    xmm15 = _mm512_xor_si512( xmm15, m512_const1_128( \
+                 casti_m128i( round_const_q, round_counter ) ) ); \
    /* ShiftBytes Q1024 + pre-AESENCLAST */\
-    xmm8  = _mm512_shuffle_epi8( xmm8,  ( SUBSH_MASK[1] ) );\
-    xmm9  = _mm512_shuffle_epi8( xmm9,  ( SUBSH_MASK[3] ) );\
-    xmm10 = _mm512_shuffle_epi8( xmm10, ( SUBSH_MASK[5] ) );\
-    xmm11 = _mm512_shuffle_epi8( xmm11, ( SUBSH_MASK[7] ) );\
-    xmm12 = _mm512_shuffle_epi8( xmm12, ( SUBSH_MASK[0] ) );\
-    xmm13 = _mm512_shuffle_epi8( xmm13, ( SUBSH_MASK[2] ) );\
-    xmm14 = _mm512_shuffle_epi8( xmm14, ( SUBSH_MASK[4] ) );\
-    xmm15 = _mm512_shuffle_epi8( xmm15, ( SUBSH_MASK[6] ) );\
+    xmm8  = _mm512_shuffle_epi8( xmm8,  SUBSH_MASK1 );\
+    xmm9  = _mm512_shuffle_epi8( xmm9,  SUBSH_MASK3 );\
+    xmm10 = _mm512_shuffle_epi8( xmm10, SUBSH_MASK5 );\
+    xmm11 = _mm512_shuffle_epi8( xmm11, SUBSH_MASK7 );\
+    xmm12 = _mm512_shuffle_epi8( xmm12, SUBSH_MASK0 );\
+    xmm13 = _mm512_shuffle_epi8( xmm13, SUBSH_MASK2 );\
+    xmm14 = _mm512_shuffle_epi8( xmm14, SUBSH_MASK4 );\
+    xmm15 = _mm512_shuffle_epi8( xmm15, SUBSH_MASK6 );\
    /* SubBytes + MixBytes */\
    SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
    \
@@ -306,16 +316,17 @@ __m512i ALL_FF;
    xmm4 = _mm512_xor_si512( xmm4, xmm9 );\
    xmm5 = _mm512_xor_si512( xmm5, xmm9 );\
    xmm6 = _mm512_xor_si512( xmm6, xmm9 );\
-    xmm7 = _mm512_xor_si512( xmm7, ( ROUND_CONST_Q[ round_counter+1 ] ) );\
+    xmm7 = _mm512_xor_si512( xmm7, m512_const1_128( \
+             casti_m128i( round_const_q, round_counter+1 ) ) ); \
    /* ShiftBytes Q1024 + pre-AESENCLAST */\
-    xmm0 = _mm512_shuffle_epi8( xmm0, ( SUBSH_MASK[1] ) );\
-    xmm1 = _mm512_shuffle_epi8( xmm1, ( SUBSH_MASK[3] ) );\
-    xmm2 = _mm512_shuffle_epi8( xmm2, ( SUBSH_MASK[5] ) );\
-    xmm3 = _mm512_shuffle_epi8( xmm3, ( SUBSH_MASK[7] ) );\
-    xmm4 = _mm512_shuffle_epi8( xmm4, ( SUBSH_MASK[0] ) );\
-    xmm5 = _mm512_shuffle_epi8( xmm5, ( SUBSH_MASK[2] ) );\
-    xmm6 = _mm512_shuffle_epi8( xmm6, ( SUBSH_MASK[4] ) );\
-    xmm7 = _mm512_shuffle_epi8( xmm7, ( SUBSH_MASK[6] ) );\
+    xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK1 );\
+    xmm1 = _mm512_shuffle_epi8( xmm1, SUBSH_MASK3 );\
+    xmm2 = _mm512_shuffle_epi8( xmm2, SUBSH_MASK5 );\
+    xmm3 = _mm512_shuffle_epi8( xmm3, SUBSH_MASK7 );\
+    xmm4 = _mm512_shuffle_epi8( xmm4, SUBSH_MASK0 );\
+    xmm5 = _mm512_shuffle_epi8( xmm5, SUBSH_MASK2 );\
+    xmm6 = _mm512_shuffle_epi8( xmm6, SUBSH_MASK4 );\
+    xmm7 = _mm512_shuffle_epi8( xmm7, SUBSH_MASK6 );\
    /* SubBytes + MixBytes */\
    SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
  }\
--- a/algo/groestl/myr-groestl.c
+++ b/algo/groestl/myr-groestl.c
@@ -1,22 +1,23 @@
 #include "myrgr-gate.h"

+#if !defined(MYRGR_8WAY) && !defined(MYRGR_4WAY)
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
-
-#ifdef NO_AES_NI
-  #include "sph_groestl.h"
-#else
+#ifdef __AES__
  #include "aes_ni/hash-groestl.h"
+#else
+  #include "sph_groestl.h"
 #endif
 #include <openssl/sha.h>

 typedef struct {
-#ifdef NO_AES_NI
-    sph_groestl512_context  groestl;
-#else
+#ifdef __AES__
    hashState_groestl       groestl;
+#else
+    sph_groestl512_context  groestl;
 #endif
    SHA256_CTX              sha;
 } myrgr_ctx_holder;
@@ -25,10 +26,10 @@ myrgr_ctx_holder myrgr_ctx;

 void init_myrgr_ctx()
 {
-#ifdef NO_AES_NI
-     sph_groestl512_init( &myrgr_ctx.groestl );
-#else
+#ifdef __AES__
     init_groestl ( &myrgr_ctx.groestl, 64 );
+#else
+     sph_groestl512_init( &myrgr_ctx.groestl );
 #endif
     SHA256_Init( &myrgr_ctx.sha );
 }
@@ -40,12 +41,12 @@ void myriad_hash(void *output, const void *input)

 	uint32_t _ALIGN(32) hash[16];

-#ifdef NO_AES_NI
-	sph_groestl512(&ctx.groestl, input, 80);
-	sph_groestl512_close(&ctx.groestl, hash);
-#else
+#ifdef __AES__
   update_groestl( &ctx.groestl, (char*)input, 640 );
   final_groestl( &ctx.groestl, (char*)hash);
+#else
+	sph_groestl512(&ctx.groestl, input, 80);
+	sph_groestl512_close(&ctx.groestl, hash);
 #endif

   SHA256_Update( &ctx.sha, (unsigned char*)hash, 64 );
@@ -88,3 +89,4 @@ int scanhash_myriad( struct work *work, uint32_t max_nonce,
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
 }
+#endif
--- a/algo/groestl/myrgr-4way.c
+++ b/algo/groestl/myrgr-4way.c
@@ -143,7 +143,7 @@ int scanhash_myriad_8way( struct work *work, uint32_t max_nonce,
         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
         {
            pdata[19] = n + lane;
-            submit_lane_solution( work, lane_hash, mythr, lane );
+            submit_solution( work, lane_hash, mythr );
         }
      }
      n += 8;
@@ -226,7 +226,7 @@ int scanhash_myriad_4way( struct work *work, uint32_t max_nonce,
         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
         {
            pdata[19] = n + lane;
-            submit_lane_solution( work, lane_hash, mythr, lane );
+            submit_solution( work, lane_hash, mythr );
         }
      }
      n += 4;
--- a/algo/groestl/sph_groestl.c
+++ b/algo/groestl/sph_groestl.c
@@ -35,6 +35,8 @@

 #include "sph_groestl.h"

+#if !defined(__AES__)
+
 #ifdef __cplusplus
 extern "C"{
 #endif
@@ -3116,4 +3118,6 @@ sph_groestl512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)

 #ifdef __cplusplus
 }
+
+#endif  // !AES
 #endif
--- a/algo/groestl/sph_groestl.h
+++ b/algo/groestl/sph_groestl.h
@@ -42,6 +42,7 @@ extern "C"{
 #include <stddef.h>
 #include "algo/sha/sph_types.h"

+#if !defined(__AES__)   
 /**
 * Output size (in bits) for Groestl-224.
 */
@@ -326,4 +327,5 @@ void sph_groestl512_addbits_and_close(
 }
 #endif

+#endif  // !AES
 #endif
--- a/algo/heavy/bastion.c
+++ b/algo/heavy/bastion.c
@@ -1,173 +0,0 @@
-#include "algo-gate-api.h"
-
-#include <stdio.h>
-#include <string.h>
-#include <openssl/sha.h>
-#include <stdint.h>
-#include <stdlib.h>
-
-#include "sph_hefty1.h"
-
-#include "algo/luffa/sph_luffa.h"
-#include "algo/fugue/sph_fugue.h"
-#include "algo/skein/sph_skein.h"
-#include "algo/whirlpool/sph_whirlpool.h"
-#include "algo/shabal/sph_shabal.h"
-#include "algo/echo/sph_echo.h"
-#include "algo/hamsi/sph_hamsi.h"
-#include "algo/luffa/luffa_for_sse2.h"
-
-#ifndef NO_AES_NI
-  #include "algo/echo/aes_ni/hash_api.h"
-#endif
-
-void bastionhash(void *output, const void *input)
-{
-	unsigned char hash[64] __attribute__ ((aligned (64)));
-
-#ifdef NO_AES_NI
-        sph_echo512_context     ctx_echo;
-#else
-        hashState_echo          ctx_echo;
-#endif
-        hashState_luffa         ctx_luffa;
-	sph_fugue512_context ctx_fugue;
-	sph_whirlpool_context ctx_whirlpool;
-	sph_shabal512_context ctx_shabal;
-   sph_hamsi512_context ctx_hamsi;
-	sph_skein512_context ctx_skein;
-
-//        unsigned char hashbuf[128] __attribute__ ((aligned (16)));
-//        sph_u64 hashctA;
-//        sph_u64 hashctB;
-//        size_t hashptr;
-
-	HEFTY1(input, 80, hash);
-
-        init_luffa( &ctx_luffa, 512 );
-        update_and_final_luffa( &ctx_luffa, (BitSequence*)hash,
-                                (const BitSequence*)hash, 64 );
-//        update_luffa( &ctx_luffa, hash, 64 );
-//        final_luffa( &ctx_luffa, hash );
-
-	if (hash[0] & 0x8)
-	{
-		sph_fugue512_init(&ctx_fugue);
-		sph_fugue512(&ctx_fugue, hash, 64);
-		sph_fugue512_close(&ctx_fugue, hash);
-	} else {
-   sph_skein512_init( &ctx_skein );
-   sph_skein512( &ctx_skein, hash, 64 );
-   sph_skein512_close( &ctx_skein, hash );
-	}
-
-	sph_whirlpool_init(&ctx_whirlpool);
-	sph_whirlpool(&ctx_whirlpool, hash, 64);
-	sph_whirlpool_close(&ctx_whirlpool, hash);
-
-	sph_fugue512_init(&ctx_fugue);
-	sph_fugue512(&ctx_fugue, hash, 64);
-	sph_fugue512_close(&ctx_fugue, hash);
-
-	if (hash[0] & 0x8)
-	{
-#ifdef NO_AES_NI
-		sph_echo512_init(&ctx_echo);
-		sph_echo512(&ctx_echo, hash, 64);
-		sph_echo512_close(&ctx_echo, hash);
-#else
-                init_echo( &ctx_echo, 512 );
-                update_final_echo ( &ctx_echo,(BitSequence*)hash,
-                                    (const BitSequence*)hash, 512 );
-//                update_echo ( &ctx_echo, hash, 512 );
-//                final_echo( &ctx_echo,  hash );
-#endif
-	} else {
-                init_luffa( &ctx_luffa, 512 );
-                update_and_final_luffa( &ctx_luffa, (BitSequence*)hash,
-                                        (const BitSequence*)hash, 64 );
-//                update_luffa( &ctx_luffa, hash, 64 );
-//                final_luffa( &ctx_luffa, hash );
-	}
-
-	sph_shabal512_init(&ctx_shabal);
-	sph_shabal512(&ctx_shabal, hash, 64);
-	sph_shabal512_close(&ctx_shabal, hash);
-
-   sph_skein512_init( &ctx_skein );
-   sph_skein512( &ctx_skein, hash, 64 );
-   sph_skein512_close( &ctx_skein, hash );
-
-	if (hash[0] & 0x8)
-	{
-		sph_shabal512_init(&ctx_shabal);
-		sph_shabal512(&ctx_shabal, hash, 64);
-		sph_shabal512_close(&ctx_shabal, hash);
-	} else {
-		sph_whirlpool_init(&ctx_whirlpool);
-		sph_whirlpool(&ctx_whirlpool, hash, 64);
-		sph_whirlpool_close(&ctx_whirlpool, hash);
-	}
-
-	sph_shabal512_init(&ctx_shabal);
-	sph_shabal512(&ctx_shabal, hash, 64);
-	sph_shabal512_close(&ctx_shabal, hash);
-
-	if (hash[0] & 0x8)
-	{
-		sph_hamsi512_init(&ctx_hamsi);
-		sph_hamsi512(&ctx_hamsi, hash, 64);
-		sph_hamsi512_close(&ctx_hamsi, hash);
-	} else {
-                init_luffa( &ctx_luffa, 512 );
-                update_and_final_luffa( &ctx_luffa, (BitSequence*)hash,
-                                        (const BitSequence*)hash, 64 );
-//                update_luffa( &ctx_luffa, hash, 64 );
-//                final_luffa( &ctx_luffa, hash );
-	}
-
-	memcpy(output, hash, 32);
-}
-
-int scanhash_bastion( struct work *work, uint32_t max_nonce,
-      uint64_t *hashes_done, struct thr_info *mythr)
-{
-	uint32_t _ALIGN(64) hash32[8];
-	uint32_t _ALIGN(64) endiandata[20];
-	uint32_t *pdata = work->data;
-	uint32_t *ptarget = work->target;
-   int thr_id = mythr->id;  // thr_id arg is deprecated
-
-	const uint32_t Htarg = ptarget[7];
-	const uint32_t first_nonce = pdata[19];
-
-	uint32_t n = first_nonce;
-
-	for (int i=0; i < 19; i++) 
-		be32enc(&endiandata[i], pdata[i]);
-
-	do {
-		be32enc(&endiandata[19], n);
-		bastionhash(hash32, endiandata);
-		if (hash32[7] < Htarg && fulltest(hash32, ptarget)) {
-			pdata[19] = n;
-         submit_solution( work, hash32, mythr );
-		}
-		n++;
-
-	} while (n < max_nonce && !work_restart[thr_id].restart);
-
-	*hashes_done = n - first_nonce + 1;
-	pdata[19] = n;
-
-	return 0;
-}
-
-bool register_bastion_algo( algo_gate_t* gate )
-{
-  gate->optimizations = SSE2_OPT | AES_OPT;
-  gate->scanhash = (void*)&scanhash_bastion;
-  gate->hash     = (void*)&bastionhash;
-  return true;
-};
-
--- a/algo/heavy/heavy.c
+++ b/algo/heavy/heavy.c
@@ -1,111 +0,0 @@
-#include <string.h>
-#include <openssl/sha.h>
-#include <stdint.h>
-
-#include "algo-gate-api.h"
-#include "sph_hefty1.h"
-#include "algo/keccak/sph_keccak.h"
-#include "algo/blake/sph_blake.h"
-#include "algo/groestl/sph_groestl.h"
-
-/* Combines top 64-bits from each hash into a single hash */
-static void combine_hashes(uint32_t *out, uint32_t *hash1, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4)
-{
-    uint32_t *hash[4] = { hash1, hash2, hash3, hash4 };
-
-    /* Transpose first 64 bits of each hash into out */
-    memset(out, 0, 32);
-    int bits = 0;
-    for (unsigned int i = 7; i >= 6; i--) {
-        for (uint32_t mask = 0x80000000; mask; mask >>= 1) {
-            for (unsigned int k = 0; k < 4; k++) {
-                out[(255 - bits)/32] <<= 1;
-                if ((hash[k][i] & mask) != 0)
-                    out[(255 - bits)/32] |= 1;
-                bits++;
-            }
-        }
-    }
-}
-
-extern void heavyhash(unsigned char* output, const unsigned char* input, int len)
-{
-    unsigned char hash1[32];
-    HEFTY1(input, len, hash1);
-
-// HEFTY1 is new, so take an extra security measure to eliminate
-//     * the possiblity of collisions:
-//     *
-//     *     Hash(x) = SHA256(x + HEFTY1(x))
-//     *
-//     * N.B. '+' is concatenation.
-//
-    unsigned char hash2[32];;
-    SHA256_CTX ctx;
-    SHA256_Init(&ctx);
-    SHA256_Update(&ctx, input, len);
-    SHA256_Update(&ctx, hash1, sizeof(hash1));
-    SHA256_Final(hash2, &ctx);
-
-//   * Additional security: Do not rely on a single cryptographic hash
-//     * function.  Instead, combine the outputs of 4 of the most secure
-//     * cryptographic hash functions-- SHA256, KECCAK512, GROESTL512
-//     * and BLAKE512.
-
-
-    uint32_t hash3[16];
-    sph_keccak512_context keccakCtx;
-    sph_keccak512_init(&keccakCtx);
-    sph_keccak512(&keccakCtx, input, len);
-    sph_keccak512(&keccakCtx, hash1, sizeof(hash1));
-    sph_keccak512_close(&keccakCtx, (void *)&hash3);
-
-    uint32_t hash4[16];
-    sph_groestl512_context groestlCtx;
-    sph_groestl512_init(&groestlCtx);
-    sph_groestl512(&groestlCtx, input, len);
-    sph_groestl512(&groestlCtx, hash1, sizeof(hash1));
-    sph_groestl512_close(&groestlCtx, (void *)&hash4);
-
-    uint32_t hash5[16];
-    sph_blake512_context blakeCtx;
-    sph_blake512_init(&blakeCtx);
-    sph_blake512(&blakeCtx, input, len);
-    sph_blake512(&blakeCtx, (unsigned char *)&hash1, sizeof(hash1));
-    sph_blake512_close(&blakeCtx, (void *)&hash5);
-
-    uint32_t *final = (uint32_t *)output;
-    combine_hashes(final, (uint32_t *)hash2, hash3, hash4, hash5);
-
-}
-
-int scanhash_heavy( uint32_t *pdata, const uint32_t *ptarget,
-            uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr)
-{
-    uint32_t hash[8];
-    uint32_t start_nonce = pdata[19];
-    int thr_id = mythr->id;  // thr_id arg is deprecated
-    
-    do {
-        heavyhash((unsigned char *)hash, (unsigned char *)pdata, 80);
-    
-        if (hash[7] <= ptarget[7]) {
-            if (fulltest(hash, ptarget)) {
-                *hashes_done = pdata[19] - start_nonce;
-                return 1;
-                break;
-            }
-        }
-        pdata[19]++;
-    } while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
-    *hashes_done = pdata[19] - start_nonce;
-    return 0;
-}
-
-bool register_heavy_algo( algo_gate_t* gate )
-{
-    gate->scanhash = (void*)&scanhash_heavy;
-    gate->hash     = (void*)&heavyhash;
-    return true;
-};
-
--- a/algo/hodl/hodl-gate.c
+++ b/algo/hodl/hodl-gate.c
@@ -144,7 +144,7 @@ int hodl_scanhash( struct work* work, uint32_t max_nonce,
 #if defined(__AES__)
  GenRandomGarbage( (CacheEntry*)hodl_scratchbuf, work->data, mythr->id );
  pthread_barrier_wait( &hodl_barrier );
-  return scanhash_hodl_wolf( work, max_nonce, hashes_done, thr_info );
+  return scanhash_hodl_wolf( work, max_nonce, hashes_done, mythr );
 #endif
  return false;
 }
@@ -161,7 +161,7 @@ bool register_hodl_algo( algo_gate_t* gate )
 //     return false;
 //  }
  pthread_barrier_init( &hodl_barrier, NULL, opt_n_threads );
-  gate->optimizations         = AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->optimizations         = SSE42_OPT | AES_OPT | AVX2_OPT;
  gate->scanhash              = (void*)&hodl_scanhash;
  gate->get_new_work          = (void*)&hodl_get_new_work;
  gate->longpoll_rpc_call     = (void*)&hodl_longpoll_rpc_call;
--- a/algo/hodl/hodl-wolf.c
+++ b/algo/hodl/hodl-wolf.c
@@ -129,9 +129,10 @@ int scanhash_hodl_wolf( struct work* work, uint32_t max_nonce,
 	      if( FinalPoW[7] <= ptarget[7] )
 	      {
 	          pdata[20] = swab32( BlockHdr[20] );
-		  pdata[21] = swab32( BlockHdr[21] );
-		  *hashes_done = CollisionCount;
-		  return(1);
+             pdata[21] = swab32( BlockHdr[21] );
+		       *hashes_done = CollisionCount;
+             submit_solution( work, FinalPoW, mythr );
+             return(0);
 	      }
 	   }
 	}
@@ -198,7 +199,8 @@ int scanhash_hodl_wolf( struct work* work, uint32_t max_nonce,
                  pdata[20] = swab32( BlockHdr[20] );
                  pdata[21] = swab32( BlockHdr[21] );
                  *hashes_done = CollisionCount;
-                  return(1);
+                  submit_solution( work, FinalPoW, mythr );
+                  return(0);
              }
           }
        }
--- a/algo/jh/jh-hash-4way.c
+++ b/algo/jh/jh-hash-4way.c
@@ -41,57 +41,10 @@
 extern "C"{
 #endif

-
-#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_JH
-#define SPH_SMALL_FOOTPRINT_JH   1
-#endif
-
-#if !defined SPH_JH_64 && SPH_64_TRUE
-#define SPH_JH_64   1
-#endif
-
-#if !SPH_64
-#undef SPH_JH_64
-#endif
-
 #ifdef _MSC_VER
 #pragma warning (disable: 4146)
 #endif

-/*
- * The internal bitslice representation may use either big-endian or
- * little-endian (true bitslice operations do not care about the bit
- * ordering, and the bit-swapping linear operations in JH happen to
- * be invariant through endianness-swapping). The constants must be
- * defined according to the chosen endianness; we use some
- * byte-swapping macros for that.
- */
-
-#if SPH_LITTLE_ENDIAN
-
-#if SPH_64
-#define C64e(x)     ((SPH_C64(x) >> 56) \
-                    | ((SPH_C64(x) >> 40) & SPH_C64(0x000000000000FF00)) \
-                    | ((SPH_C64(x) >> 24) & SPH_C64(0x0000000000FF0000)) \
-                    | ((SPH_C64(x) >>  8) & SPH_C64(0x00000000FF000000)) \
-                    | ((SPH_C64(x) <<  8) & SPH_C64(0x000000FF00000000)) \
-                    | ((SPH_C64(x) << 24) & SPH_C64(0x0000FF0000000000)) \
-                    | ((SPH_C64(x) << 40) & SPH_C64(0x00FF000000000000)) \
-                    | ((SPH_C64(x) << 56) & SPH_C64(0xFF00000000000000)))
-#define dec64e_aligned   sph_dec64le_aligned
-#define enc64e           sph_enc64le
-#endif
-
-#else
-
-#if SPH_64
-#define C64e(x)     SPH_C64(x)
-#define dec64e_aligned   sph_dec64be_aligned
-#define enc64e           sph_enc64be
-#endif
-
-#endif
-
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

 #define Sb_8W(x0, x1, x2, x3, c) \
@@ -152,8 +105,97 @@ do { \
    x3 = _mm256_xor_si256( x3, x4 ); \
 } while (0)

-#if SPH_JH_64
+static const uint64_t C[] =   // 168 64-bit constants, read through Ceven_hi(r)/Ceven_lo(r) as C[((r) << 2) + k]
+{   // each value is the byte-reversed form of the matching C64e() entry in the commented-out "Big endian version" table that follows
+   0x67f815dfa2ded572, 0x571523b70a15847b,
+   0xf6875a4d90d6ab81, 0x402bd1c3c54f9f4e,
+   0x9cfa455ce03a98ea, 0x9a99b26699d2c503,
+   0x8a53bbf2b4960266, 0x31a2db881a1456b5,
+   0xdb0e199a5c5aa303, 0x1044c1870ab23f40,
+   0x1d959e848019051c, 0xdccde75eadeb336f,
+   0x416bbf029213ba10, 0xd027bbf7156578dc,
+   0x5078aa3739812c0a, 0xd3910041d2bf1a3f,
+   0x907eccf60d5a2d42, 0xce97c0929c9f62dd,
+   0xac442bc70ba75c18, 0x23fcc663d665dfd1,
+   0x1ab8e09e036c6e97, 0xa8ec6c447e450521,
+   0xfa618e5dbb03f1ee, 0x97818394b29796fd,
+   0x2f3003db37858e4a, 0x956a9ffb2d8d672a,
+   0x6c69b8f88173fe8a, 0x14427fc04672c78a,
+   0xc45ec7bd8f15f4c5, 0x80bb118fa76f4475,
+   0xbc88e4aeb775de52, 0xf4a3a6981e00b882,
+   0x1563a3a9338ff48e, 0x89f9b7d524565faa,
+   0xfde05a7c20edf1b6, 0x362c42065ae9ca36,
+   0x3d98fe4e433529ce, 0xa74b9a7374f93a53,
+   0x86814e6f591ff5d0, 0x9f5ad8af81ad9d0e,
+   0x6a6234ee670605a7, 0x2717b96ebe280b8b,
+   0x3f1080c626077447, 0x7b487ec66f7ea0e0,
+   0xc0a4f84aa50a550d, 0x9ef18e979fe7e391,
+   0xd48d605081727686, 0x62b0e5f3415a9e7e,
+   0x7a205440ec1f9ffc, 0x84c9f4ce001ae4e3,
+   0xd895fa9df594d74f, 0xa554c324117e2e55,
+   0x286efebd2872df5b, 0xb2c4a50fe27ff578,
+   0x2ed349eeef7c8905, 0x7f5928eb85937e44,
+   0x4a3124b337695f70, 0x65e4d61df128865e,
+   0xe720b95104771bc7, 0x8a87d423e843fe74,
+   0xf2947692a3e8297d, 0xc1d9309b097acbdd,
+   0xe01bdc5bfb301b1d, 0xbf829cf24f4924da,
+   0xffbf70b431bae7a4, 0x48bcf8de0544320d,
+   0x39d3bb5332fcae3b, 0xa08b29e0c1c39f45,
+   0x0f09aef7fd05c9e5, 0x34f1904212347094,
+   0x95ed44e301b771a2, 0x4a982f4f368e3be9,
+   0x15f66ca0631d4088, 0xffaf52874b44c147,
+   0x30c60ae2f14abb7e, 0xe68c6eccc5b67046,
+   0x00ca4fbd56a4d5a4, 0xae183ec84b849dda,
+   0xadd1643045ce5773, 0x67255c1468cea6e8,
+   0x16e10ecbf28cdaa3, 0x9a99949a5806e933,
+   0x7b846fc220b2601f, 0x1885d1a07facced1,
+   0xd319dd8da15b5932, 0x46b4a5aac01c9a50,
+   0xba6b04e467633d9f, 0x7eee560bab19caf6,
+   0x742128a9ea79b11f, 0xee51363b35f7bde9,
+   0x76d350755aac571d, 0x01707da3fec2463a,
+   0x42d8a498afc135f7, 0x79676b9e20eced78,
+   0xa8db3aea15638341, 0x832c83324d3bc3fa,
+   0xf347271c1f3b40a7, 0x9a762db734f04059,
+   0xfd4f21d26c4e3ee7, 0xef5957dc398dfdb8,
+   0xdaeb492b490c9b8d, 0x0d70f36849d7a25b,
+   0x84558d7ad0ae3b7d, 0x658ef8e4f0e9a5f5,
+   0x533b1036f4a2b8a0, 0x5aec3e759e07a80c,
+   0x4f88e85692946891, 0x4cbcbaf8555cb05b,
+   0x7b9487f3993bbbe3, 0x5d1c6b72d6f4da75,
+   0x6db334dc28acae64, 0x71db28b850a5346c,
+   0x2a518d10f2e261f8, 0xfc75dd593364dbe3,
+   0xa23fce43f1bcac1c, 0xb043e8023cd1bb67,
+   0x75a12988ca5b0a33, 0x5c5316b44d19347f,
+   0x1e4d790ec3943b92, 0x3fafeeb6d7757479,
+   0x21391abef7d4a8ea, 0x5127234c097ef45c,
+   0xd23c32ba5324a326, 0xadd5a66d4a17a344,
+   0x08c9f2afa63e1db5, 0x563c6b91983d5983,
+   0x4d608672a17cf84c, 0xf6c76e08cc3ee246,
+   0x5e76bcb1b333982f, 0x2ae6c4efa566d62b,
+   0x36d4c1bee8b6f406, 0x6321efbc1582ee74,
+   0x69c953f40d4ec1fd, 0x26585806c45a7da7,
+   0x16fae0061614c17e, 0x3f9d63283daf907e,
+   0x0cd29b00e3f2c9d2, 0x300cd4b730ceaa5f,
+   0x9832e0f216512a74, 0x9af8cee3d830eb0d,
+   0x9279f1b57b9ec54b, 0xd36886046ee651ff,
+   0x316796e6574d239b, 0x05750a17f3a6e6cc,
+   0xce6c3213d98176b1, 0x62a205f88452173c,
+   0x47154778b3cb2bf4, 0x486a9323825446ff,
+   0x65655e4e0758df38, 0x8e5086fc897cfcf2,
+   0x86ca0bd0442e7031, 0x4e477830a20940f0,
+   0x8338f7d139eea065, 0xbd3a2ce437e95ef7,
+   0x6ff8130126b29721, 0xe7de9fefd1ed44a3,
+   0xd992257615dfa08b, 0xbe42dc12f6f7853c,
+   0x7eb027ab7ceca7d8, 0xdea83eaada7d8d53,
+   0xd86902bd93ce25aa, 0xf908731afd43f65a,
+   0xa5194a17daef5fc0, 0x6a21fd4c33664d97,
+   0x701541db3198b435, 0x9b54cdedbb0f1eea,
+   0x72409751a163d09a, 0xe26f4791bf9d75f6
+};

+// Big endian version
+
+/*
 static const sph_u64 C[] = {
 	C64e(0x72d5dea2df15f867), C64e(0x7b84150ab7231557),
 	C64e(0x81abd6904d5a87f6), C64e(0x4e9f4fc5c3d12b40),
@@ -240,6 +282,7 @@ static const sph_u64 C[] = {
 	C64e(0x35b49831db411570), C64e(0xea1e0fbbedcd549b),
 	C64e(0x9ad063a151974072), C64e(0xf6759dbf91476fe2)
 };
+*/

 #define Ceven_hi(r)   (C[((r) << 2) + 0])
 #define Ceven_lo(r)   (C[((r) << 2) + 1])
@@ -427,7 +470,7 @@ do { \
   h7h = _mm256_xor_si256( h7h, m3h ); \
   h7l = _mm256_xor_si256( h7l, m3l ); \

-
+/*
 static const sph_u64 IV256[] = {
 	C64e(0xeb98a3412c20d3eb), C64e(0x92cdbe7b9cb245c1),
 	C64e(0x1c93519160d4c7fa), C64e(0x260082d67e508a03),
@@ -450,11 +493,8 @@ static const sph_u64 IV512[] = {
 	C64e(0xcf57f6ec9db1f856), C64e(0xa706887c5716b156),
 	C64e(0xe3c2fcdfe68517fb), C64e(0x545a4678cc8cdd4b)
 };
+*/

-#else
-
-
-#endif

 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

@@ -484,57 +524,6 @@ static const sph_u64 IV512[] = {
 		W ## ro(h7); \
 	} while (0)

-#if SPH_SMALL_FOOTPRINT_JH
-
-#if SPH_JH_64
-
-/*
- * The "small footprint" 64-bit version just uses a partially unrolled
- * loop.
- */
-
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-
-#define E8_8W   do { \
-      unsigned r; \
-      for (r = 0; r < 42; r += 7) { \
-         SL_8W(0); \
-         SL_8W(1); \
-         SL_8W(2); \
-         SL_8W(3); \
-         SL_8W(4); \
-         SL_8W(5); \
-         SL_8W(6); \
-      } \
-   } while (0)
-
-#endif
-
-#define E8   do { \
-		unsigned r; \
-		for (r = 0; r < 42; r += 7) { \
-			SL(0); \
-			SL(1); \
-			SL(2); \
-			SL(3); \
-			SL(4); \
-			SL(5); \
-			SL(6); \
-		} \
-	} while (0)
-
-#else
-
-
-#endif
-
-#else
-
-#if SPH_JH_64
-
-/*
- * On a "true 64-bit" architecture, we can unroll at will.
- */

 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

@@ -585,6 +574,7 @@ static const sph_u64 IV512[] = {

 #endif  // AVX512

+
 #define E8   do { \
      SLu( 0, 0); \
      SLu( 1, 1); \
@@ -630,13 +620,6 @@ static const sph_u64 IV512[] = {
      SLu(41, 6); \
   } while (0)

-#else
-
-
-#endif
-
-#endif
-
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

 void jh256_8way_init( jh_8way_context *sc )
@@ -732,12 +715,12 @@ jh_8way_core( jh_8way_context *sc, const void *data, size_t len )

 static void
 jh_8way_close( jh_8way_context *sc, unsigned ub, unsigned n, void *dst,
-               size_t out_size_w32, const void *iv )
+               size_t out_size_w32 )
 {
   __m512i buf[16*4];
   __m512i *dst512 = (__m512i*)dst;
   size_t numz, u;
-   sph_u64 l0, l1, l0e, l1e;
+   uint64_t l0, l1;

   buf[0] = m512_const1_64( 0x80ULL );

@@ -748,12 +731,10 @@ jh_8way_close( jh_8way_context *sc, unsigned ub, unsigned n, void *dst,

   memset_zero_512( buf+1, (numz>>3) - 1 );

-   l0 = SPH_T64(sc->block_count << 9) + (sc->ptr << 3);
-   l1 = SPH_T64(sc->block_count >> 55);
-   sph_enc64be( &l0e, l0 );
-   sph_enc64be( &l1e, l1 );
-   *(buf + (numz>>3)    ) = _mm512_set1_epi64( l1e );
-   *(buf + (numz>>3) + 1) = _mm512_set1_epi64( l0e );
+   l0 = ( sc->block_count << 9 ) + ( sc->ptr << 3 );
+   l1 = ( sc->block_count >> 55 );
+   *(buf + (numz>>3)    ) = _mm512_set1_epi64( bswap_64( l1 ) );
+   *(buf + (numz>>3) + 1) = _mm512_set1_epi64( bswap_64( l0 ) );

   jh_8way_core( sc, buf, numz + 16 );

@@ -772,7 +753,7 @@ jh256_8way_update(void *cc, const void *data, size_t len)
 void
 jh256_8way_close(void *cc, void *dst)
 {
-   jh_8way_close(cc, 0, 0, dst, 8, IV256);
+   jh_8way_close(cc, 0, 0, dst, 8);
 }

 void
@@ -784,7 +765,7 @@ jh512_8way_update(void *cc, const void *data, size_t len)
 void
 jh512_8way_close(void *cc, void *dst)
 {
-   jh_8way_close(cc, 0, 0, dst, 16, IV512);
+   jh_8way_close(cc, 0, 0, dst, 16);
 }

 #endif
@@ -882,12 +863,12 @@ jh_4way_core( jh_4way_context *sc, const void *data, size_t len )

 static void
 jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,
-               size_t out_size_w32, const void *iv )
+               size_t out_size_w32 )
 {
   __m256i buf[16*4];
   __m256i *dst256 = (__m256i*)dst;
   size_t numz, u;
-   sph_u64 l0, l1, l0e, l1e;
+   uint64_t l0, l1;

   buf[0] = m256_const1_64( 0x80ULL );

@@ -898,12 +879,10 @@ jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,

   memset_zero_256( buf+1, (numz>>3) - 1 );   

-   l0 = SPH_T64(sc->block_count << 9) + (sc->ptr << 3);
-   l1 = SPH_T64(sc->block_count >> 55);
-   sph_enc64be( &l0e, l0 );
-   sph_enc64be( &l1e, l1 );
-   *(buf + (numz>>3)    ) = _mm256_set1_epi64x( l1e );
-   *(buf + (numz>>3) + 1) = _mm256_set1_epi64x( l0e ); 
+   l0 = ( sc->block_count << 9 ) + ( sc->ptr << 3 );
+   l1 = ( sc->block_count >> 55 );
+   *(buf + (numz>>3)    ) = _mm256_set1_epi64x( bswap_64( l1 ) );
+   *(buf + (numz>>3) + 1) = _mm256_set1_epi64x( bswap_64( l0 ) );

   jh_4way_core( sc, buf, numz + 16 );

@@ -922,7 +901,7 @@ jh256_4way_update(void *cc, const void *data, size_t len)
 void
 jh256_4way_close(void *cc, void *dst)
 {
-	jh_4way_close(cc, 0, 0, dst, 8, IV256);
+	jh_4way_close(cc, 0, 0, dst, 8 );
 }

 void
@@ -934,7 +913,7 @@ jh512_4way_update(void *cc, const void *data, size_t len)
 void
 jh512_4way_close(void *cc, void *dst)
 {
-	jh_4way_close(cc, 0, 0, dst, 16, IV512);
+	jh_4way_close(cc, 0, 0, dst, 16 );
 }


--- a/algo/jh/jh-hash-4way.h
+++ b/algo/jh/jh-hash-4way.h
@@ -43,7 +43,6 @@ extern "C"{
 #endif

 #include <stddef.h>
-#include "algo/sha/sph_types.h"
 #include "simd-utils.h"

 #define SPH_SIZE_jh256   256
--- a/algo/jh/jha-4way.c
+++ b/algo/jh/jha-4way.c
@@ -65,7 +65,7 @@ void jha_hash_4way( void *out, const void *input )
          vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );

       blake512_4way_init( &ctx_blake );
-       blake512_4way( &ctx_blake, vhash, 64 );
+       blake512_4way_update( &ctx_blake, vhash, 64 );
       blake512_4way_close( &ctx_blake, vhashA );

       jh512_4way_init( &ctx_jh );
@@ -129,7 +129,7 @@ int scanhash_jha_4way( struct work *work, uint32_t max_nonce,
                 if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
                 {
                    pdata[19] = n+i;
-                    submit_lane_solution( work, lane_hash, mythr, i );
+                    submit_solution( work, lane_hash, mythr );
                 }
              }
              n += 4;
--- a/algo/jh/jha.c
+++ b/algo/jh/jha.c
@@ -1,19 +1,19 @@
 #include "jha-gate.h"

+#if !defined(JHA_8WAY) && !defined(JHA_4WAY)
+
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
-
 #include "algo/blake/sph_blake.h"
 #include "algo/jh/sph_jh.h"
 #include "algo/keccak/sph_keccak.h"
 #include "algo/skein/sph_skein.h"
-
-#ifdef NO_AES_NI
-  #include "algo/groestl/sph_groestl.h"
-#else
+#ifdef __AES__
  #include "algo/groestl/aes_ni/hash-groestl.h"
+#else
+  #include "algo/groestl/sph_groestl.h"
 #endif

 static __thread sph_keccak512_context jha_kec_mid __attribute__ ((aligned (64)));
@@ -28,12 +28,12 @@ void jha_hash(void *output, const void *input)
 {
 	uint8_t _ALIGN(128) hash[64];

-#ifdef NO_AES_NI
-	sph_groestl512_context ctx_groestl;
+#ifdef __AES__
+   hashState_groestl      ctx_groestl;
 #else
-        hashState_groestl      ctx_groestl;
+	sph_groestl512_context ctx_groestl;
 #endif
-        sph_blake512_context ctx_blake;
+   sph_blake512_context ctx_blake;
 	sph_jh512_context ctx_jh;
 	sph_keccak512_context ctx_keccak;
 	sph_skein512_context ctx_skein;
@@ -46,36 +46,36 @@ void jha_hash(void *output, const void *input)
 	for (int round = 0; round < 3; round++)
 	{
 	   if (hash[0] & 0x01)
-           {
-#ifdef NO_AES_NI
-		sph_groestl512_init(&ctx_groestl);
-		sph_groestl512(&ctx_groestl, hash, 64 );
-		sph_groestl512_close(&ctx_groestl, hash );
+      {
+#ifdef __AES__
+         init_groestl( &ctx_groestl, 64 );
+         update_and_final_groestl( &ctx_groestl, (char*)hash,
+                                              (char*)hash, 512 );
 #else
-                init_groestl( &ctx_groestl, 64 );
-                update_and_final_groestl( &ctx_groestl, (char*)hash,
-                                          (char*)hash, 512 );
+   		sph_groestl512_init(&ctx_groestl);
+	   	sph_groestl512(&ctx_groestl, hash, 64 );
+		   sph_groestl512_close(&ctx_groestl, hash );
 #endif
-	    }
-            else
-            {
-		sph_skein512_init(&ctx_skein);
-		sph_skein512(&ctx_skein, hash, 64);
-		sph_skein512_close(&ctx_skein, hash );
-	    }
+      }
+      else
+      {
+		   sph_skein512_init(&ctx_skein);
+		   sph_skein512(&ctx_skein, hash, 64);
+		   sph_skein512_close(&ctx_skein, hash );
+	   }

-	    if (hash[0] & 0x01)
-            {
-		sph_blake512_init(&ctx_blake);
-		sph_blake512(&ctx_blake, hash, 64);
-		sph_blake512_close(&ctx_blake, hash );
-	    }
-            else
-            {
-		sph_jh512_init(&ctx_jh);
-		sph_jh512(&ctx_jh, hash, 64 );
-		sph_jh512_close(&ctx_jh, hash );
-	    }
+	   if (hash[0] & 0x01)
+      {
+		   sph_blake512_init(&ctx_blake);
+		   sph_blake512(&ctx_blake, hash, 64);
+		   sph_blake512_close(&ctx_blake, hash );
+	   }
+      else
+      {
+		   sph_jh512_init(&ctx_jh);
+		   sph_jh512(&ctx_jh, hash, 64 );
+		   sph_jh512_close(&ctx_jh, hash );
+	   }
 	}

 	memcpy(output, hash, 32);
@@ -136,3 +136,4 @@ int scanhash_jha( struct work *work, uint32_t max_nonce,
 	return 0;
 }

+#endif
--- a/algo/keccak/keccak-4way.c
+++ b/algo/keccak/keccak-4way.c
@@ -28,30 +28,32 @@ int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
   const uint32_t first_nonce = pdata[19];
   __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
   const uint32_t Htarg = ptarget[7];
-   int thr_id = mythr->id;  
+   const int thr_id = mythr->id;  
+   const bool bench = opt_benchmark;

   mm512_bswap32_intrlv80_8x64( vdata, pdata );
+   *noncev = mm512_intrlv_blend_32(
+              _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                                n+3, 0, n+2, 0, n+1, 0, n  , 0 ), *noncev );
   do {
-       *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
-                _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
-                                  n+3, 0, n+2, 0, n+1, 0, n  , 0 ) ), *noncev );
-
      keccakhash_8way( hash, vdata );

      for ( int lane = 0; lane < 8; lane++ )
-      if ( hash7[ lane<<1 ] < Htarg ) 
+      if unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) 
      {
          extr_lane_8x64( lane_hash, hash, lane, 256 );
-          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+          if ( valid_hash( lane_hash, ptarget ) )
          {
-              pdata[19] = n + lane;
-              submit_lane_solution( work, lane_hash, mythr, lane );
+              pdata[19] = bswap_32( n + lane );
+              submit_solution( work, lane_hash, mythr );
          }
      }
+      *noncev = _mm512_add_epi32( *noncev,
+                                  m512_const1_64( 0x0000000800000000 ) );
      n += 8;

   } while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
-
+   pdata[19] = n;
   *hashes_done = n - first_nonce + 1;
   return 0;
 }
@@ -79,29 +81,30 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
   const uint32_t first_nonce = pdata[19];
   __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
   const uint32_t Htarg = ptarget[7];
-   int thr_id = mythr->id;
+   const int thr_id = mythr->id;
+   const bool bench = opt_benchmark;

   mm256_bswap32_intrlv80_4x64( vdata, pdata );
+   *noncev = mm256_intrlv_blend_32(
+                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
   do {
-       *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
-                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
-	
      keccakhash_4way( hash, vdata );

      for ( int lane = 0; lane < 4; lane++ )
-      if ( hash7[ lane<<1 ] < Htarg )
+      if unlikely( hash7[ lane<<1 ] <= Htarg && !bench )
      {
          extr_lane_4x64( lane_hash, hash, lane, 256 );
-          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+          if ( valid_hash( lane_hash, ptarget ))
          {
-              pdata[19] = n + lane;
-              submit_lane_solution( work, lane_hash, mythr, lane );
+              pdata[19] = bswap_32( n + lane );
+              submit_solution( work, lane_hash, mythr );
          }
      }
+      *noncev = _mm256_add_epi32( *noncev,
+                                  m256_const1_64( 0x0000000400000000 ) );
      n += 4;
-
   } while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
-
+   pdata[19] = n;
   *hashes_done = n - first_nonce + 1;
   return 0;
 }
--- a/algo/keccak/keccak-gate.c
+++ b/algo/keccak/keccak-gate.c
@@ -1,5 +1,9 @@
 #include "keccak-gate.h"
+#include "sph_keccak.h"

+int hard_coded_eb = 1;
+
+// KECCAK

 bool register_keccak_algo( algo_gate_t* gate )
 {
@@ -19,6 +23,8 @@ bool register_keccak_algo( algo_gate_t* gate )
  return true;
 };

+// KECCAKC
+
 bool register_keccakc_algo( algo_gate_t* gate )
 {
  gate->optimizations = AVX2_OPT | AVX512_OPT;
@@ -37,3 +43,50 @@ bool register_keccakc_algo( algo_gate_t* gate )
  return true;
 };

+// SHA3D
+
+// Double Keccak-256: hashes len bytes of input, hashes the resulting
+// 32 byte digest a second time, and copies the final 32 bytes to state.
+void sha3d( void *state, const void *input, int len )
+{
+   uint32_t _ALIGN(64) first_pass[16];
+   uint32_t _ALIGN(64) second_pass[16];
+   sph_keccak_context kc;
+
+   // first pass over the caller's data
+   sph_keccak256_init( &kc );
+   sph_keccak256( &kc, input, len );
+   sph_keccak256_close( &kc, (void*)first_pass );
+
+   // second pass over the 32 byte digest of the first pass
+   sph_keccak256_init( &kc );
+   sph_keccak256( &kc, first_pass, 32 );
+   sph_keccak256_close( &kc, (void*)second_pass );
+
+   memcpy( state, second_pass, 32 );
+}
+
+// Computes the merkle root of a stratum job into merkle_root.
+// The coinbase is hashed with sha3d, then every merkle branch is copied
+// behind the running 32 byte hash and the 64 byte pair is folded with
+// sha256d. merkle_root must therefore have room for 64 bytes.
+void sha3d_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
+{
+  char *branch_slot = merkle_root + 32;
+  int i = 0;
+
+  sha3d( merkle_root, sctx->job.coinbase, (int) sctx->job.coinbase_size );
+
+  while ( i < sctx->job.merkle_count )
+  {
+     memcpy( branch_slot, sctx->job.merkle[i], 32 );
+     sha256d( merkle_root, merkle_root, 64 );
+     i++;
+  }
+}
+
+// Registers the sha3d algorithm with the algo gate.
+// Sets the file scope hard_coded_eb (initialised to 1 above) to 6, installs
+// sha3d_gen_merkle_root as the merkle root generator, and selects the
+// scanhash/hash pair for the Keccak width chosen at compile time
+// (KECCAK_8WAY, KECCAK_4WAY, or the scalar fallback). Always returns true.
+// NOTE(review): hard_coded_eb is a plain global that the Keccak close
+// routines read as their eb value, so this setting is process wide.
+bool register_sha3d_algo( algo_gate_t* gate )
+{
+  hard_coded_eb = 6;
+//  opt_extranonce = false;
+  gate->optimizations = AVX2_OPT | AVX512_OPT;
+  gate->gen_merkle_root = (void*)&sha3d_gen_merkle_root;
+#if defined (KECCAK_8WAY)
+  gate->scanhash  = (void*)&scanhash_sha3d_8way;
+  gate->hash      = (void*)&sha3d_hash_8way;
+#elif defined (KECCAK_4WAY)
+  gate->scanhash  = (void*)&scanhash_sha3d_4way;
+  gate->hash      = (void*)&sha3d_hash_4way;
+#else
+  gate->scanhash  = (void*)&scanhash_sha3d;
+  gate->hash      = (void*)&sha3d_hash;
+#endif
+  return true;
+};
+
--- a/algo/keccak/keccak-gate.h
+++ b/algo/keccak/keccak-gate.h
@@ -10,24 +10,37 @@
  #define KECCAK_4WAY 1
 #endif

+extern int hard_coded_eb;
+
 #if defined(KECCAK_8WAY)

 void keccakhash_8way( void *state, const void *input );
 int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );

+void sha3d_hash_8way( void *state, const void *input );
+int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+
 #elif defined(KECCAK_4WAY)

 void keccakhash_4way( void *state, const void *input );
 int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );

+void sha3d_hash_4way( void *state, const void *input );
+int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+
 #else

 void keccakhash( void *state, const void *input );
 int scanhash_keccak( struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done, struct thr_info *mythr );

-#endif
+void sha3d_hash( void *state, const void *input );
+int scanhash_sha3d( struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done, struct thr_info *mythr );

 #endif
+#endif
--- a/algo/keccak/keccak-hash-4way.c
+++ b/algo/keccak/keccak-hash-4way.c
@@ -1,6 +1,9 @@
 #include <stddef.h>
 #include <stdint.h>
 #include "keccak-hash-4way.h"
+#include "keccak-gate.h"
+
+#if defined(__AVX2__)

 static const uint64_t RC[] = {
        0x0000000000000001, 0x0000000000008082,
@@ -163,12 +166,12 @@ static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst,
    unsigned eb;
    union {
       __m512i tmp[lim + 1];
-       sph_u64 dummy;   /* for alignment */
+       uint64_t dummy;   /* for alignment */
    } u;
    size_t j;
    size_t m512_len = byte_len >> 3;

-    eb = 0x100  >> 8;
+    eb = hard_coded_eb;
    if ( kc->ptr == (lim - 8) )
    {
        const uint64_t t = eb | 0x8000000000000000;
@@ -238,7 +241,7 @@ keccak512_8way_close(void *cc, void *dst)

 #endif  // AVX512

-#if defined(__AVX2__)
+// AVX2

 #define INPUT_BUF(size)   do { \
    size_t j; \
@@ -344,12 +347,12 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
    unsigned eb;
    union {
       __m256i tmp[lim + 1];
-       sph_u64 dummy;   /* for alignment */
+       uint64_t dummy;   /* for alignment */
    } u;
    size_t j;
    size_t m256_len = byte_len >> 3;

-    eb = 0x100  >> 8;
+    eb = hard_coded_eb;
    if ( kc->ptr == (lim - 8) )
    {
        const uint64_t t = eb | 0x8000000000000000;
--- a/algo/keccak/keccak-hash-4way.h
+++ b/algo/keccak/keccak-hash-4way.h
@@ -43,16 +43,8 @@ extern "C"{
 #ifdef  __AVX2__

 #include <stddef.h>
-#include "algo/sha/sph_types.h"
 #include "simd-utils.h"

-#define SPH_SIZE_keccak256   256
-
-/**
- * Output size (in bits) for Keccak-512.
- */
-#define SPH_SIZE_keccak512   512
-
 /**
 * This structure is a context for Keccak computations: it contains the
 * intermediate values and some data from the last entered block. Once a
--- a/algo/keccak/keccak.c
+++ b/algo/keccak/keccak.c
@@ -1,4 +1,6 @@
-#include "algo-gate-api.h"
+#include "keccak-gate.h"
+
+#if !defined(KECCAK_8WAY) && !defined(KECCAK_4WAY)

 #include <stdlib.h>
 #include <string.h>
@@ -18,36 +20,35 @@ void keccakhash(void *state, const void *input)
 	memcpy(state, hash, 32);
 }

-int scanhash_keccak( struct work *work,
-	uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
+int scanhash_keccak( struct work *work, uint32_t max_nonce,
+                     uint64_t *hashes_done, struct thr_info *mythr )
 {
-        uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
-	uint32_t n = pdata[19] - 1;
-	const uint32_t first_nonce = pdata[19];
-	//const uint32_t Htarg = ptarget[7];
-   int thr_id = mythr->id;  // thr_id arg is deprecated
+   uint32_t _ALIGN(64) hash64[8];
+   uint32_t _ALIGN(64) endiandata[32];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t n = pdata[19];
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce;
+   const int thr_id = mythr->id;

-	uint32_t _ALIGN(32) hash64[8];
-	uint32_t endiandata[32];
+   for ( int i=0; i < 19; i++ )
+      be32enc( &endiandata[i], pdata[i] );

-        for (int i=0; i < 19; i++) 
-                be32enc(&endiandata[i], pdata[i]);
+   do {
+      be32enc( &endiandata[19], n );
+      keccakhash( hash64, endiandata );
+      if ( valid_hash( hash64, ptarget ) && !opt_benchmark )
+      {
+         pdata[19] = n;
+         submit_solution( work, hash64, mythr );
+      }
+      n++;
+   } while ( n < last_nonce && !work_restart[thr_id].restart );

-	do {
-	
-		pdata[19] = ++n;
-		be32enc(&endiandata[19], n); 
-		keccakhash(hash64, endiandata);
-        if (((hash64[7]&0xFFFFFF00)==0) && 
-				fulltest(hash64, ptarget)) {
-            *hashes_done = n - first_nonce + 1;
-			return true;
-		}
-	} while (n < max_nonce && !work_restart[thr_id].restart);
-	
-	*hashes_done = n - first_nonce + 1;
-	pdata[19] = n;
-	return 0;
+   *hashes_done = n - first_nonce;
+   pdata[19] = n;
+   return 0;
 }

+#endif
--- a/algo/keccak/sha3d-4way.c
+++ b/algo/keccak/sha3d-4way.c
@@ -0,0 +1,126 @@
+#include "keccak-gate.h"
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include "sph_keccak.h"
+#include "keccak-hash-4way.h"
+
+#if defined(KECCAK_8WAY)
+
+// 8 way double Keccak-256. input is the vectorized 80 byte buffer built
+// by the 8 way scanhash; the vectorized result is written to state.
+void sha3d_hash_8way(void *state, const void *input)
+{
+    keccak256_8way_context kc;
+    uint32_t midhash[16*8] __attribute__ ((aligned (128)));
+
+    // first pass: the 80 byte input
+    keccak256_8way_init( &kc );
+    keccak256_8way_update( &kc, input, 80 );
+    keccak256_8way_close( &kc, midhash );
+
+    // second pass: the 32 byte digests of the first pass
+    keccak256_8way_init( &kc );
+    keccak256_8way_update( &kc, midhash, 32 );
+    keccak256_8way_close( &kc, state );
+}
+
+// Scans nonces for sha3d eight at a time, starting at work->data[19].
+// Every lane whose hash passes valid_hash is submitted via submit_solution.
+// The loop ends when n reaches max_nonce - 8 or a work restart is flagged.
+// On return work->data[19] holds the next nonce to try and *hashes_done
+// the number of nonces covered. Always returns 0.
+int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t vdata[24*8] __attribute__ ((aligned (128)));
+   uint32_t hash[16*8] __attribute__ ((aligned (64)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+   // hash7[ lane<<1 ] is the 32 bit word of each lane that is pre-screened
+   // against ptarget[7] before the full check.
+   uint32_t *hash7 = &(hash[49]);   // 3*16+1
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t n = pdata[19];
+   const uint32_t first_nonce = pdata[19];
+   // stop 8 short of max_nonce; NOTE(review): wraps if max_nonce < 8
+   const uint32_t last_nonce = max_nonce - 8;
+   // position of the nonce inside the vectorized header
+   __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+   const uint32_t Htarg = ptarget[7];
+   const int thr_id = mythr->id;  
+   const bool bench = opt_benchmark;
+
+   // build the vectorized header from pdata, then write nonces n..n+7 into
+   // the nonce position (the blend helper presumably keeps the other
+   // 32 bit half of each 64 bit element -- confirm)
+   mm512_bswap32_intrlv80_8x64( vdata, pdata );
+   *noncev = mm512_intrlv_blend_32(
+              _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                                n+3, 0, n+2, 0, n+1, 0, n  , 0 ), *noncev );
+   do {
+      sha3d_hash_8way( hash, vdata );
+
+      // cheap pre-screen per lane; nothing is checked in benchmark mode
+      for ( int lane = 0; lane < 8; lane++ )
+      if ( unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) )
+      {
+          extr_lane_8x64( lane_hash, hash, lane, 256 );
+          if ( valid_hash( lane_hash, ptarget ) )
+          {
+              // the winning nonce is stored byte swapped for submission
+              pdata[19] = bswap_32( n + lane );
+              submit_solution( work, lane_hash, mythr );
+          }
+      }
+      // advance the nonces: adds 8 to the upper 32 bits of the 64 bit
+      // constant, 0 to the lower 32 bits
+      *noncev = _mm512_add_epi32( *noncev,
+                                  m512_const1_64( 0x0000000800000000 ) );
+      n += 8;
+
+   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
+   pdata[19] = n;
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
+#elif defined(KECCAK_4WAY)
+
+// 4 way double Keccak-256. input is the vectorized 80 byte buffer built
+// by the 4 way scanhash; the vectorized result is written to state.
+void sha3d_hash_4way(void *state, const void *input)
+{
+    keccak256_4way_context kc;
+    uint32_t midhash[16*4] __attribute__ ((aligned (64)));
+
+    // first pass: the 80 byte input
+    keccak256_4way_init( &kc );
+    keccak256_4way_update( &kc, input, 80 );
+    keccak256_4way_close( &kc, midhash );
+
+    // second pass: the 32 byte digests of the first pass
+    keccak256_4way_init( &kc );
+    keccak256_4way_update( &kc, midhash, 32 );
+    keccak256_4way_close( &kc, state );
+}
+
+// Scans nonces for sha3d four at a time, starting at work->data[19].
+// Every lane whose hash passes valid_hash is submitted via submit_solution.
+// The loop ends when n reaches max_nonce - 4 or a work restart is flagged.
+// On return work->data[19] holds the next nonce to try and *hashes_done
+// the number of nonces covered. Always returns 0.
+int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+   uint32_t hash[16*4] __attribute__ ((aligned (32)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   // hash7[ lane<<1 ] is the 32 bit word of each lane that is pre-screened
+   // against ptarget[7] before the full check.
+   uint32_t *hash7 = &(hash[25]);   // 3*8+1
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t n = pdata[19];
+   const uint32_t first_nonce = pdata[19];
+   // stop 4 short of max_nonce; NOTE(review): wraps if max_nonce < 4
+   const uint32_t last_nonce = max_nonce - 4;
+   // position of the nonce inside the vectorized header
+   __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
+   const uint32_t Htarg = ptarget[7];
+   const int thr_id = mythr->id;
+   const bool bench = opt_benchmark;
+
+   // build the vectorized header from pdata, then write nonces n..n+3 into
+   // the nonce position (the blend helper presumably keeps the other
+   // 32 bit half of each 64 bit element -- confirm)
+   mm256_bswap32_intrlv80_4x64( vdata, pdata );
+   *noncev = mm256_intrlv_blend_32( 
+                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
+   do {
+      sha3d_hash_4way( hash, vdata );
+
+      // cheap pre-screen per lane; nothing is checked in benchmark mode
+      for ( int lane = 0; lane < 4; lane++ )
+      if ( unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) )
+      {
+          extr_lane_4x64( lane_hash, hash, lane, 256 );
+          if ( valid_hash( lane_hash, ptarget ) )
+          {
+              // the winning nonce is stored byte swapped for submission
+              pdata[19] = bswap_32( n + lane );
+              submit_solution( work, lane_hash, mythr );
+          }
+      }
+      // advance the nonces: adds 4 to the upper 32 bits of the 64 bit
+      // constant, 0 to the lower 32 bits
+      *noncev = _mm256_add_epi32( *noncev,
+                                  m256_const1_64( 0x0000000400000000 ) );
+      n += 4;
+   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
+   pdata[19] = n;
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
+#endif
--- a/algo/keccak/sha3d.c
+++ b/algo/keccak/sha3d.c
@@ -0,0 +1,54 @@
+#include "keccak-gate.h"
+
+#if !defined(KECCAK_8WAY) && !defined(KECCAK_4WAY)
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include "sph_keccak.h"
+
+// Scalar double Keccak-256 of an 80 byte input; the second pass hashes the
+// 32 byte digest of the first and writes its result straight to state.
+void sha3d_hash(void *state, const void *input)
+{
+    sph_keccak256_context kc;
+    uint32_t midhash[16];
+
+    // first pass: the 80 byte input
+    sph_keccak256_init( &kc );
+    sph_keccak256( &kc, input, 80 );
+    sph_keccak256_close( &kc, midhash );
+
+    // second pass: the 32 byte digest of the first pass
+    sph_keccak256_init( &kc );
+    sph_keccak256( &kc, midhash, 32 );
+    sph_keccak256_close( &kc, state );
+}
+
+// Scalar sha3d nonce scan starting at work->data[19].
+// Each hash that passes valid_hash (outside benchmark mode) is submitted
+// via submit_solution. The scan ends when the nonce reaches max_nonce or a
+// work restart is flagged. On return *hashes_done holds the number of
+// nonces tried and work->data[19] the next nonce. Always returns 0.
+int scanhash_sha3d( struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t _ALIGN(64) hash64[8];
+   uint32_t _ALIGN(64) endiandata[32];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce;
+   const int thr_id = mythr->id;
+   uint32_t nonce = first_nonce;
+
+   // big endian copy of the 19 header words that precede the nonce
+   for ( int w = 0; w < 19; w++ )
+      be32enc( &endiandata[w], pdata[w] );
+
+   for (;;)
+   {
+      be32enc( &endiandata[19], nonce );
+      sha3d_hash( hash64, endiandata );
+      if ( valid_hash( hash64, ptarget ) && !opt_benchmark )
+      {
+         pdata[19] = nonce;
+         submit_solution( work, hash64, mythr );
+      }
+      nonce++;
+      if ( nonce >= last_nonce || work_restart[thr_id].restart )
+         break;
+   }
+
+   *hashes_done = nonce - first_nonce;
+   pdata[19] = nonce;
+   return 0;
+}
+
+#endif
--- a/algo/keccak/sph_keccak.c
+++ b/algo/keccak/sph_keccak.c
@@ -32,8 +32,8 @@

 #include <stddef.h>
 #include <string.h>
-
 #include "sph_keccak.h"
+#include "keccak-gate.h"

 #ifdef __cplusplus
 extern "C"{
@@ -1616,7 +1616,7 @@ keccak_core(sph_keccak_context *kc, const void *data, size_t len, size_t lim)
 		} u; \
 		size_t j; \
 \
-		eb = (0x100 | (ub & 0xFF)) >> (8 - n); \
+		eb = hard_coded_eb; \
 		if (kc->ptr == (lim - 1)) { \
 			if (n == 7) { \
 				u.tmp[0] = eb; \
--- a/algo/luffa/luffa-hash-2way.c
+++ b/algo/luffa/luffa-hash-2way.c
@@ -459,6 +459,11 @@ int luffa_4way_init( luffa_4way_context *state, int hashbitlen )
    return 0;
 }

+// Convenience wrapper: initialises a 4 way Luffa context with a fixed
+// hash length of 512 bits and returns luffa_4way_init's result.
+int luffa512_4way_init( luffa_4way_context *state )
+{
+   return luffa_4way_init( state, 512 );
+}
+   
 // Do not call luffa_update_close after having called luffa_update.
 // Once luffa_update has been called only call luffa_update or luffa_close.
 int luffa_4way_update( luffa_4way_context *state, const void *data,
@@ -496,6 +501,14 @@ int luffa_4way_update( luffa_4way_context *state, const void *data,
    return 0;
 }

+/*
+int luffa512_4way_update( luffa_4way_context *state, const void *data,
+                       size_t len )
+{
+   return luffa_4way_update( state, data, len );
+}
+*/
+
 int luffa_4way_close( luffa_4way_context *state, void *hashval )
 {
    __m512i *buffer = (__m512i*)state->buffer;
@@ -518,6 +531,77 @@ int luffa_4way_close( luffa_4way_context *state, void *hashval )
    return 0;
 }

+/*
+int luffa512_4way_close( luffa_4way_context *state, void *hashval )
+{
+   return luffa_4way_close( state, hashval );
+}
+*/
+
+// One-shot 4 way Luffa-512: resets the context to the Luffa IV, absorbs
+// inlen bytes of data, applies the padding block and writes the digest to
+// output. No separate init call is needed. Handles whole 32 byte blocks
+// plus an optional 16 byte tail (e.g. 64 or 80 byte inputs). Returns 0.
+int luffa512_4way_full( luffa_4way_context *state, void *output,
+                        const void *data, size_t inlen )
+{
+    state->hashbitlen = 512;
+    __m128i *iv = (__m128i*)IV;
+
+    // load the ten 128 bit IV words into the chaining values
+    for ( int c = 0; c < 10; c++ )
+       state->chainv[c] = m512_const1_128( iv[c] );
+
+    ((__m512i*)state->buffer)[0] = m512_zero;
+    ((__m512i*)state->buffer)[1] = m512_zero;
+
+    const __m512i *vdata  = (const __m512i*)data;
+    __m512i msg[2];
+    const int blocks = (int)( inlen >> 5 );
+    const __m512i shuff_bswap32 = m512_const_64(
+                                   0x3c3d3e3f38393a3b, 0x3435363730313233,
+                                   0x2c2d2e2f28292a2b, 0x2425262720212223,
+                                   0x1c1d1e1f18191a1b, 0x1415161710111213,
+                                   0x0c0d0e0f08090a0b, 0x0405060700010203 );
+
+    state->rembytes = inlen & 0x1F;
+
+    // full 32 byte blocks, byte swapped per 32 bit word
+    for ( int i = 0; i < blocks; i++, vdata+=2 )
+    {
+       msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
+       msg[1] = _mm512_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
+       rnd512_4way( state, msg );
+    }
+
+    // a 16 byte partial block exists for 80 byte len
+    if ( state->rembytes  )
+    {
+       // partial block followed by the padding marker
+       msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
+       msg[1] = m512_const2_64( 0, 0x0000000080000000 );
+       rnd512_4way( state, msg );
+    }
+    else
+    {
+       // no tail: the padding marker starts an otherwise empty block
+       msg[0] = m512_const2_64( 0, 0x0000000080000000 );
+       msg[1] = m512_zero;
+       rnd512_4way( state, msg );
+    }
+
+    finalization512_4way( state, (uint32*)output );
+
+    // hashbitlen is set to 512 above, so this second finalization is not
+    // expected to run; kept to mirror the SSE2 finalization sequence.
+    // The byte offset is taken through char* because arithmetic on void*
+    // is a GNU extension, not ISO C.
+    if ( state->hashbitlen > 512 )
+        finalization512_4way( state, (uint32*)( (char*)output + 64 ) );
+
+    return 0;
+}
+
 int luffa_4way_update_close( luffa_4way_context *state,
                 void *output, const void *data, size_t inlen )
 {
@@ -1031,6 +1115,69 @@ int luffa_2way_close( luffa_2way_context *state, void *hashval )
    return 0;
 }

+// One-shot 2 way Luffa-512: resets the context to the Luffa IV, absorbs
+// inlen bytes of data, applies the padding block and writes the digest to
+// output. No separate init call is needed. Handles whole 32 byte blocks
+// plus an optional 16 byte tail (e.g. 64 or 80 byte inputs). Returns 0.
+int luffa512_2way_full( luffa_2way_context *state, void *output,
+                        const void *data, size_t inlen )
+{
+    state->hashbitlen = 512;
+    __m128i *iv = (__m128i*)IV;
+
+    // load the ten 128 bit IV words into the chaining values
+    for ( int c = 0; c < 10; c++ )
+       state->chainv[c] = m256_const1_128( iv[c] );
+
+    ((__m256i*)state->buffer)[0] = m256_zero;
+    ((__m256i*)state->buffer)[1] = m256_zero;
+
+    const __m256i *vdata  = (const __m256i*)data;
+    __m256i msg[2];
+    const int blocks = (int)( inlen >> 5 );
+    const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
+                                                 0x1415161710111213,
+                                                 0x0c0d0e0f08090a0b,
+                                                 0x0405060700010203 );
+
+    state->rembytes = inlen & 0x1F;
+
+    // full 32 byte blocks, byte swapped per 32 bit word
+    for ( int i = 0; i < blocks; i++, vdata+=2 )
+    {
+       msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
+       msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
+       rnd512_2way( state, msg );
+    }
+
+    // a 16 byte partial block exists for 80 byte len
+    if ( state->rembytes  )
+    {
+       // partial block followed by the padding marker
+       msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
+       msg[1] = m256_const2_64( 0, 0x0000000080000000 );
+       rnd512_2way( state, msg );
+    }
+    else
+    {
+       // no tail: the padding marker starts an otherwise empty block
+       msg[0] = m256_const2_64( 0, 0x0000000080000000 );
+       msg[1] = m256_zero;
+       rnd512_2way( state, msg );
+    }
+
+    finalization512_2way( state, (uint32*)output );
+
+    // hashbitlen is set to 512 above, so this second finalization is not
+    // expected to run; kept to mirror the SSE2 finalization sequence.
+    // The byte offset is taken through char* because arithmetic on void*
+    // is a GNU extension, not ISO C.
+    if ( state->hashbitlen > 512 )
+        finalization512_2way( state, (uint32*)( (char*)output + 32 ) );
+
+    return 0;
+}
+
 int luffa_2way_update_close( luffa_2way_context *state,
                 void *output, const void *data, size_t inlen )
 {
--- a/algo/luffa/luffa-hash-2way.h
+++ b/algo/luffa/luffa-hash-2way.h
@@ -61,11 +61,23 @@ typedef struct {
 } luffa_4way_context __attribute((aligned(128)));

 int luffa_4way_init( luffa_4way_context *state, int hashbitlen );
-int luffa_4way_update( luffa_4way_context *state, const void *data,
-                       size_t len );
-int luffa_4way_close( luffa_4way_context *state, void *hashval );
+//int luffa_4way_update( luffa_4way_context *state, const void *data,
+//                       size_t len );
+//int luffa_4way_close( luffa_4way_context *state, void *hashval );
 int luffa_4way_update_close( luffa_4way_context *state, void *output,
                                   const void *data, size_t inlen );
+int luffa512_4way_full( luffa_4way_context *state, void *output,
+                         const void *data, size_t inlen );
+int luffa512_4way_init( luffa_4way_context *state );
+int luffa512_4way_update( luffa_4way_context *state, const void *data,
+                       size_t len );
+int luffa512_4way_close( luffa_4way_context *state, void *hashval );
+int luffa512_4way_update_close( luffa_4way_context *state, void *output,
+                                const void *data, size_t inlen );
+
+#define luffa_4way_update       luffa512_4way_update
+#define luffa_4way_close        luffa512_4way_close
+#define luffa_4way_update_close luffa512_4way_update_close

 #endif

@@ -82,6 +94,8 @@ int luffa_2way_update( luffa_2way_context *state, const void *data,
 int luffa_2way_close( luffa_2way_context *state, void *hashval );
 int luffa_2way_update_close( luffa_2way_context *state, void *output,
                                   const void *data, size_t inlen );
+int luffa512_2way_full( luffa_2way_context *state, void *output,
+                         const void *data, size_t inlen );

 #endif
 #endif
--- a/algo/luffa/luffa.c
+++ b/algo/luffa/luffa.c
@@ -1,63 +0,0 @@
-#include "algo-gate-api.h"
-
-#include <stdlib.h>
-#include <stdint.h>
-#include <string.h>
-#include <stdio.h>
-
-#include "sph_luffa.h"
-
-void luffahash(void *output, const void *input)
-{
-	unsigned char _ALIGN(128) hash[64];
-	sph_luffa512_context ctx_luffa;
-
-	sph_luffa512_init(&ctx_luffa);
-	sph_luffa512 (&ctx_luffa, input, 80);
-	sph_luffa512_close(&ctx_luffa, (void*) hash);
-
-	memcpy(output, hash, 32);
-}
-
-int scanhash_luffa(int thr_id, struct work *work,
-	uint32_t max_nonce, uint64_t *hashes_done)
-{
-        uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
-
-	uint32_t _ALIGN(64) hash64[8];
-	uint32_t _ALIGN(64) endiandata[20];
-
-	const uint32_t Htarg = ptarget[7];
-	const uint32_t first_nonce = pdata[19];
-
-	uint32_t n = first_nonce;
-
-        for (int i=0; i < 19; i++) 
-                be32enc(&endiandata[i], pdata[i]);
-
-	do {
-		be32enc(&endiandata[19], n);
-		luffahash(hash64, endiandata);
-		if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
-			*hashes_done = n - first_nonce + 1;
-			pdata[19] = n;
-			return true;
-		}
-		n++;
-
-	} while (n < max_nonce && !work_restart[thr_id].restart);
-
-	*hashes_done = n - first_nonce + 1;
-	pdata[19] = n;
-
-	return 0;
-}
-
-bool register_luffa_algo( algo_gate_t* gate )
-{
-    gate->scanhash = (void*)&scanhash_luffa;
-    gate->hash     = (void*)&luffahash;
-    return true;
-};
-
--- a/algo/luffa/luffa_for_sse2.c
+++ b/algo/luffa/luffa_for_sse2.c
@@ -344,17 +344,12 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,

    // 16 byte partial block exists for 80 byte len
    if ( state->rembytes  )
-    {
-      // padding of partial block
-      rnd512( state, _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ),
+       // padding of partial block
+       rnd512( state, m128_const_64( 0, 0x80000000 ),
                      mm128_bswap_32( cast_m128i( data ) ) );
-    }
    else
-    {
-      // empty pad block
-     rnd512( state, _mm_setzero_si128(), 
-                       _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ) );
-    }
+       // empty pad block
+       rnd512( state, m128_zero, m128_const_64( 0, 0x80000000 ) );

    finalization512( state, (uint32*) output );
    if ( state->hashbitlen > 512 )
@@ -363,6 +358,56 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
    return SUCCESS;
 }

+
+// One-shot Luffa: (re)initialises the round constants and the context for
+// the requested hashbitlen, absorbs inlen bytes of data, pads and writes
+// the digest to output.
+// Optimized for lengths that are multiples of 16 bytes, good for 64 and
+// 80 byte len. Returns SUCCESS.
+int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen,
+              const BitSequence* data, size_t inlen )
+{
+    int k;
+
+    // init
+
+    state->hashbitlen = hashbitlen;
+    /* only the lower 32 bits set */
+    MASK= _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xffffffff);
+    /* all bits set */
+    ALLONE = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
+    /* expand the 32-bit round constants into 128-bit vectors */
+    for ( k = 0; k < 32; k++ )
+    {
+        CNS128[k] = _mm_load_si128( (__m128i*)&CNS_INIT[k*4] );
+    }
+    /* chaining values start from the IV */
+    for ( k = 0; k < 10; k++ )
+    {
+        state->chainv[k] = _mm_load_si128( (__m128i*)&IV[k*4] );
+    }
+    memset(state->buffer, 0, sizeof state->buffer );
+
+    // update
+
+    int blocks = (int)( inlen / 32 );
+    state->rembytes = inlen % 32;
+
+    // whole 32 byte blocks
+    for ( k = 0; k < blocks; k++ )
+    {
+       const __m128i upper = mm128_bswap_32( casti_m128i( data, 1 ) );
+       const __m128i lower = mm128_bswap_32( casti_m128i( data, 0 ) );
+       rnd512( state, upper, lower );
+       data += MSG_BLOCK_BYTE_LEN;
+    }
+
+    // final
+
+    // a 16 byte partial block exists for 80 byte len
+    if ( state->rembytes  )
+    {
+       // partial block followed by the padding marker
+       rnd512( state, m128_const_64( 0, 0x80000000 ),
+                      mm128_bswap_32( cast_m128i( data ) ) );
+    }
+    else
+    {
+       // no tail: the padding marker starts an otherwise empty block
+       rnd512( state, m128_zero, m128_const_64( 0, 0x80000000 ) );
+    }
+
+    finalization512( state, (uint32*) output );
+    if ( state->hashbitlen > 512 )
+    {
+        finalization512( state, (uint32*)( output+128 ) );
+    }
+
+    return SUCCESS;
+}
+
+
 /***************************************************/
 /* Round function         */
 /* state: hash context    */
--- a/algo/luffa/luffa_for_sse2.h
+++ b/algo/luffa/luffa_for_sse2.h
@@ -1,3 +1,6 @@
+#if !defined(LUFFA_FOR_SSE2_H__)
+#define LUFFA_FOR_SSE2_H__ 1
+
 /*
 * luffa_for_sse2.h
 * Version 2.0 (Sep 15th 2009)
@@ -48,8 +51,6 @@
 typedef struct {
    uint32 buffer[8] __attribute((aligned(32)));
    __m128i chainv[10] __attribute((aligned(32)));   /* Chaining values */
-//    uint64 bitlen[2]; /* Message length in bits */
-//    uint32 rembitlen; /* Length of buffer data to be hashed */
    int hashbitlen;
    int rembytes;
 } hashState_luffa;
@@ -65,5 +66,6 @@ HashReturn final_luffa( hashState_luffa *state, BitSequence *hashval );
 HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
                                   const BitSequence* data, size_t inlen );

-
-
+int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen,
+                                   const BitSequence* data, size_t inlen );
+#endif   // LUFFA_FOR_SSE2_H__
--- a/algo/lyra2/allium-4way.c
+++ b/algo/lyra2/allium-4way.c
@@ -7,33 +7,44 @@
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/cubehash/cube-hash-2way.h"
 #include "algo/groestl/aes_ni/hash-groestl256.h"
+#if defined(__VAES__)
+  #include "algo/groestl/groestl256-hash-4way.h"
+#endif

-#if defined (ALLIUM_8WAY)  
+#if defined (ALLIUM_16WAY)  

 typedef struct {
-   blake256_8way_context     blake;
+   blake256_16way_context     blake;
   keccak256_8way_context    keccak;
   cube_4way_context          cube;
   skein256_8way_context     skein;
+#if defined(__VAES__)
+   groestl256_4way_context groestl;
+#else
   hashState_groestl256      groestl;
-} allium_8way_ctx_holder;
+#endif
+} allium_16way_ctx_holder;

-static __thread allium_8way_ctx_holder allium_8way_ctx;
+static __thread allium_16way_ctx_holder allium_16way_ctx;

-bool init_allium_8way_ctx()
+bool init_allium_16way_ctx()
 {
-   keccak256_8way_init( &allium_8way_ctx.keccak );
-   cube_4way_init( &allium_8way_ctx.cube, 256, 16, 32 );
-   skein256_8way_init( &allium_8way_ctx.skein );
-   init_groestl256( &allium_8way_ctx.groestl, 32 );
+   keccak256_8way_init( &allium_16way_ctx.keccak );
+   cube_4way_init( &allium_16way_ctx.cube, 256, 16, 32 );
+   skein256_8way_init( &allium_16way_ctx.skein );
+#if defined(__VAES__)
+   groestl256_4way_init( &allium_16way_ctx.groestl, 32 );
+#else
+   init_groestl256( &allium_16way_ctx.groestl, 32 );
+#endif
   return true;
 }

-void allium_8way_hash( void *state, const void *input )
+void allium_16way_hash( void *state, const void *input )
 {
-   uint32_t vhash[8*8] __attribute__ ((aligned (128)));
-   uint32_t vhashA[8*8] __attribute__ ((aligned (64)));
-   uint32_t vhashB[8*8] __attribute__ ((aligned (64)));
+   uint32_t vhash[16*8] __attribute__ ((aligned (128)));
+   uint32_t vhashA[16*8] __attribute__ ((aligned (64)));
+   uint32_t vhashB[16*8] __attribute__ ((aligned (64)));
   uint32_t hash0[8] __attribute__ ((aligned (64)));
   uint32_t hash1[8] __attribute__ ((aligned (64)));
   uint32_t hash2[8] __attribute__ ((aligned (64)));
@@ -42,18 +53,39 @@ void allium_8way_hash( void *state, const void *input )
   uint32_t hash5[8] __attribute__ ((aligned (64)));
   uint32_t hash6[8] __attribute__ ((aligned (64)));
   uint32_t hash7[8] __attribute__ ((aligned (64)));
-   allium_8way_ctx_holder ctx __attribute__ ((aligned (64)));
+   uint32_t hash8[8] __attribute__ ((aligned (64)));
+   uint32_t hash9[8] __attribute__ ((aligned (64)));
+   uint32_t hash10[8] __attribute__ ((aligned (64)));
+   uint32_t hash11[8] __attribute__ ((aligned (64)));
+   uint32_t hash12[8] __attribute__ ((aligned (64)));
+   uint32_t hash13[8] __attribute__ ((aligned (64)));
+   uint32_t hash14[8] __attribute__ ((aligned (64)));
+   uint32_t hash15[8] __attribute__ ((aligned (64)));
+   allium_16way_ctx_holder ctx __attribute__ ((aligned (64)));

-   memcpy( &ctx, &allium_8way_ctx, sizeof(allium_8way_ctx) );
-   blake256_8way_update( &ctx.blake, input + (64<<3), 16 );
-   blake256_8way_close( &ctx.blake, vhash );
+   memcpy( &ctx, &allium_16way_ctx, sizeof(allium_16way_ctx) );
+   blake256_16way_update( &ctx.blake, input + (64<<4), 16 );
+   blake256_16way_close( &ctx.blake, vhash );

-   rintrlv_8x32_8x64( vhashA, vhash, 256 );
+   dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                  hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
+                  vhash, 256 );
+   intrlv_8x64( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                256 );
+   intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
+                hash15, 256 );
+   
+//   rintrlv_8x32_8x64( vhashA, vhash, 256 );
   keccak256_8way_update( &ctx.keccak, vhashA, 32 );
-   keccak256_8way_close( &ctx.keccak, vhash );
+   keccak256_8way_close( &ctx.keccak, vhashA);
+   keccak256_8way_init( &ctx.keccak );
+   keccak256_8way_update( &ctx.keccak, vhashB, 32 );
+   keccak256_8way_close( &ctx.keccak, vhashB);

   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
-                 vhash, 256 );
+                 vhashA, 256 );
+   dintrlv_8x64( hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
+                 vhashB, 256 );

   intrlv_2x256( vhash, hash0, hash1, 256 );
   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
@@ -67,17 +99,37 @@ void allium_8way_hash( void *state, const void *input )
   intrlv_2x256( vhash, hash6, hash7, 256 );
   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
   dintrlv_2x256( hash6, hash7, vhash, 256 );
+   intrlv_2x256( vhash, hash8, hash9, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash8, hash9, vhash, 256 );
+   intrlv_2x256( vhash, hash10, hash11, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash10, hash11, vhash, 256 );
+   intrlv_2x256( vhash, hash12, hash13, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash12, hash13, vhash, 256 );
+   intrlv_2x256( vhash, hash14, hash15, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash14, hash15, vhash, 256 );
  
   intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, 256 );
   intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, 256 );

-   cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 );
-   cube_4way_init( &ctx.cube, 256, 16, 32 );
-   cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 );
+   cube_4way_full( &ctx.cube, vhashA, 256, vhashA, 32 );
+   cube_4way_full( &ctx.cube, vhashB, 256, vhashB, 32 );

   dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
   dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );

+   intrlv_4x128( vhashA, hash8, hash9, hash10, hash11, 256 );
+   intrlv_4x128( vhashB, hash12, hash13, hash14, hash15, 256 );
+
+   cube_4way_full( &ctx.cube, vhashA, 256, vhashA, 32 );
+   cube_4way_full( &ctx.cube, vhashB, 256, vhashB, 32 );
+
+   dintrlv_4x128( hash8, hash9, hash10, hash11, vhashA, 256 );
+   dintrlv_4x128( hash12, hash13, hash14, hash15, vhashB, 256 );
+
   intrlv_2x256( vhash, hash0, hash1, 256 );
   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
   dintrlv_2x256( hash0, hash1, vhash, 256 );
@@ -90,133 +142,181 @@ void allium_8way_hash( void *state, const void *input )
   intrlv_2x256( vhash, hash6, hash7, 256 );
   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
   dintrlv_2x256( hash6, hash7, vhash, 256 );
+   intrlv_2x256( vhash, hash8, hash9, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash8, hash9, vhash, 256 );
+   intrlv_2x256( vhash, hash10, hash11, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash10, hash11, vhash, 256 );
+   intrlv_2x256( vhash, hash12, hash13, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash12, hash13, vhash, 256 );
+   intrlv_2x256( vhash, hash14, hash15, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash14, hash15, vhash, 256 );

-   intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+   intrlv_8x64( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                hash7, 256 );
+   intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
+                hash15, 256 );

-   skein256_8way_update( &ctx.skein, vhash, 32 );
-   skein256_8way_close( &ctx.skein, vhash );
+   skein256_8way_update( &ctx.skein, vhashA, 32 );
+   skein256_8way_close( &ctx.skein, vhashA );
+   skein256_8way_init( &ctx.skein );
+   skein256_8way_update( &ctx.skein, vhashB, 32 );
+   skein256_8way_close( &ctx.skein, vhashB );

   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
-                 vhash, 256 );
+                 vhashA, 256 );
+   dintrlv_8x64( hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
+                 vhashB, 256 );

-   update_and_final_groestl256( &ctx.groestl, state, hash0, 256 );
-   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, state+32, hash1, 256 );
-   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, state+64, hash2, 256 );
-   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, state+96, hash3, 256 );
-   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, state+128, hash4, 256 );
-   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, state+160, hash5, 256 );
-   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, state+192, hash6, 256 );
-   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, state+224, hash7, 256 );
-   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
-           sizeof(hashState_groestl256) );
+#if defined(__VAES__)
+
+   intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 256 );
+
+   groestl256_4way_full( &ctx.groestl, vhash, vhash, 256 );
+
+   dintrlv_4x128( state, state+32, state+64, state+96, vhash, 256 );
+   intrlv_4x128( vhash, hash4, hash5, hash6, hash7, 256 );
+
+   groestl256_4way_full( &ctx.groestl, vhash, vhash, 256 );
+   
+   dintrlv_4x128( state+128, state+160, state+192, state+224, vhash, 256 );
+   intrlv_4x128( vhash, hash8, hash9, hash10, hash11, 256 );
+
+   groestl256_4way_full( &ctx.groestl, vhash, vhash, 256 );
+
+   dintrlv_4x128( state+256, state+288, state+320, state+352, vhash, 256 );
+   intrlv_4x128( vhash, hash12, hash13, hash14, hash15, 256 );
+
+   groestl256_4way_full( &ctx.groestl, vhash, vhash, 256 );
+ 
+   dintrlv_4x128( state+384, state+416, state+448, state+480, vhash, 256 );
+   
+#else
+
+   groestl256_full( &ctx.groestl, state,     hash0,  256 );
+   groestl256_full( &ctx.groestl, state+32,  hash1,  256 );
+   groestl256_full( &ctx.groestl, state+64,  hash2,  256 );
+   groestl256_full( &ctx.groestl, state+96,  hash3,  256 );
+   groestl256_full( &ctx.groestl, state+128, hash4,  256 );
+   groestl256_full( &ctx.groestl, state+160, hash5,  256 );
+   groestl256_full( &ctx.groestl, state+192, hash6,  256 );
+   groestl256_full( &ctx.groestl, state+224, hash7,  256 );
+   groestl256_full( &ctx.groestl, state+256, hash8,  256 );
+   groestl256_full( &ctx.groestl, state+288, hash9,  256 );
+   groestl256_full( &ctx.groestl, state+320, hash10, 256 );
+   groestl256_full( &ctx.groestl, state+352, hash11, 256 );
+   groestl256_full( &ctx.groestl, state+384, hash12, 256 );
+   groestl256_full( &ctx.groestl, state+416, hash13, 256 );
+   groestl256_full( &ctx.groestl, state+448, hash14, 256 );
+   groestl256_full( &ctx.groestl, state+480, hash15, 256 );
+#endif
 }

-int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
+int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
                             uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t hash[8*8] __attribute__ ((aligned (128)));
-   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+   uint32_t hash[8*16] __attribute__ ((aligned (128)));
+   uint32_t vdata[20*16] __attribute__ ((aligned (64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
-   const uint32_t last_nonce = max_nonce - 8;
-   const uint32_t Htarg = ptarget[7];
-   __m256i  *noncev = (__m256i*)vdata + 19;   // aligned
-   int thr_id = mythr->id;  // thr_id arg is deprecated
+   const uint32_t last_nonce = max_nonce - 16;
+   __m512i  *noncev = (__m512i*)vdata + 19;   // aligned
+   const int thr_id = mythr->id;
+   const bool bench = opt_benchmark;

-   if ( opt_benchmark )
-      ( (uint32_t*)ptarget )[7] = 0x0000ff;
+   if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;

-   mm256_bswap32_intrlv80_8x32( vdata, pdata );
-   blake256_8way_init( &allium_8way_ctx.blake );
-   blake256_8way_update( &allium_8way_ctx.blake, vdata, 64 );
+   mm512_bswap32_intrlv80_16x32( vdata, pdata );
+   *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
+                               n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
+
+   blake256_16way_init( &allium_16way_ctx.blake );
+   blake256_16way_update( &allium_16way_ctx.blake, vdata, 64 );

   do {
-     *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
-                                                 n+3, n+2, n+1, n ) );
+     allium_16way_hash( hash, vdata );

-     allium_8way_hash( hash, vdata );
-     pdata[19] = n;
-
-     for ( int lane = 0; lane < 8; lane++ ) if ( (hash+(lane<<3))[7] <= Htarg )
+     for ( int lane = 0; lane < 16; lane++ ) 
+     if ( unlikely( valid_hash( hash+(lane<<3), ptarget ) && !bench ) )
     {
-        if ( fulltest( hash+(lane<<3), ptarget ) && !opt_benchmark )
-        {
-           pdata[19] = n + lane;
-           submit_lane_solution( work, hash+(lane<<3), mythr, lane );
-         }
+         pdata[19] = bswap_32( n + lane );
+         submit_solution( work, hash+(lane<<3), mythr );
     }
-     n += 8;
-   } while ( (n < last_nonce) && !work_restart[thr_id].restart);
-
+     *noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) );
+     n += 16;
+   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
+   pdata[19] = n;
   *hashes_done = n - first_nonce;
   return 0;
 }

-
-#elif defined (ALLIUM_4WAY)  
-
+#elif defined (ALLIUM_8WAY)  

 typedef struct {
-   blake256_4way_context     blake;
+   blake256_8way_context     blake;
   keccak256_4way_context    keccak;
   cubehashParam             cube;
   skein256_4way_context     skein;
   hashState_groestl256      groestl;

-} allium_4way_ctx_holder;
+} allium_8way_ctx_holder;

-static __thread allium_4way_ctx_holder allium_4way_ctx;
+static __thread allium_8way_ctx_holder allium_8way_ctx;

-bool init_allium_4way_ctx()
+bool init_allium_8way_ctx()
 {
-   keccak256_4way_init( &allium_4way_ctx.keccak );
-   cubehashInit( &allium_4way_ctx.cube, 256, 16, 32 );
-   skein256_4way_init( &allium_4way_ctx.skein );
-   init_groestl256( &allium_4way_ctx.groestl, 32 );
+   keccak256_4way_init( &allium_8way_ctx.keccak );
+   cubehashInit( &allium_8way_ctx.cube, 256, 16, 32 );
+   skein256_4way_init( &allium_8way_ctx.skein );
+   init_groestl256( &allium_8way_ctx.groestl, 32 );
   return true;
 }

-void allium_4way_hash( void *state, const void *input )
+void allium_8way_hash( void *hash, const void *input )
 {
-   uint32_t hash0[8] __attribute__ ((aligned (64)));
-   uint32_t hash1[8] __attribute__ ((aligned (32)));
-   uint32_t hash2[8] __attribute__ ((aligned (32)));
-   uint32_t hash3[8] __attribute__ ((aligned (32)));
-   uint32_t vhash32[8*4] __attribute__ ((aligned (64)));
-   uint32_t vhash64[8*4] __attribute__ ((aligned (64)));
-   allium_4way_ctx_holder ctx __attribute__ ((aligned (64))); 
+   uint64_t vhashA[4*8] __attribute__ ((aligned (64)));
+   uint64_t vhashB[4*8] __attribute__ ((aligned (64)));
+   uint64_t *hash0 = (uint64_t*)hash;
+   uint64_t *hash1 = (uint64_t*)hash+ 4;
+   uint64_t *hash2 = (uint64_t*)hash+ 8;
+   uint64_t *hash3 = (uint64_t*)hash+12;
+   uint64_t *hash4 = (uint64_t*)hash+16;
+   uint64_t *hash5 = (uint64_t*)hash+20;
+   uint64_t *hash6 = (uint64_t*)hash+24;
+   uint64_t *hash7 = (uint64_t*)hash+28;
+   allium_8way_ctx_holder ctx __attribute__ ((aligned (64))); 

-   memcpy( &ctx, &allium_4way_ctx, sizeof(allium_4way_ctx) );
-   blake256_4way_update( &ctx.blake, input + (64<<2), 16 );
-   blake256_4way_close( &ctx.blake, vhash32 );
+   memcpy( &ctx, &allium_8way_ctx, sizeof(allium_8way_ctx) );
+   blake256_8way_update( &ctx.blake, input + (64<<3), 16 );
+   blake256_8way_close( &ctx.blake, vhashA );

-   rintrlv_4x32_4x64( vhash64, vhash32, 256 );
-   keccak256_4way_update( &ctx.keccak, vhash64, 32 );
-   keccak256_4way_close( &ctx.keccak, vhash64 );
+   dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                     vhashA, 256 );
+   intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
+   intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );

-   dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
+   keccak256_4way_update( &ctx.keccak, vhashA, 32 );
+   keccak256_4way_close( &ctx.keccak, vhashA );
+   keccak256_4way_init( &ctx.keccak );
+   keccak256_4way_update( &ctx.keccak, vhashB, 32 );
+   keccak256_4way_close( &ctx.keccak, vhashB );
+
+   dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, 256 );
+   dintrlv_4x64( hash4, hash5, hash6, hash7, vhashB, 256 );

   LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
   LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
   LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
   LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
+   LYRA2RE( hash4, 32, hash4, 32, hash4, 32, 1, 8, 8 );
+   LYRA2RE( hash5, 32, hash5, 32, hash5, 32, 1, 8, 8 );
+   LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 );
+   LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 );

   cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*)hash0, 32 );
   cubehashInit( &ctx.cube, 256, 16, 32 );
@@ -225,69 +325,83 @@ void allium_4way_hash( void *state, const void *input )
   cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*)hash2, 32 );
   cubehashInit( &ctx.cube, 256, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*)hash3, 32 );
+   cubehashInit( &ctx.cube, 256, 16, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*)hash4, (const byte*)hash4, 32 );
+   cubehashInit( &ctx.cube, 256, 16, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*)hash5, (const byte*)hash5, 32 );
+   cubehashInit( &ctx.cube, 256, 16, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*)hash6, (const byte*)hash6, 32 );
+   cubehashInit( &ctx.cube, 256, 16, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*)hash7, (const byte*)hash7, 32 );

   LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
   LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
   LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
   LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
+   LYRA2RE( hash4, 32, hash4, 32, hash4, 32, 1, 8, 8 );
+   LYRA2RE( hash5, 32, hash5, 32, hash5, 32, 1, 8, 8 );
+   LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 );
+   LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 );

-   intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
+   intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
+   intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );

-   skein256_4way_update( &ctx.skein, vhash64, 32 );
-   skein256_4way_close( &ctx.skein, vhash64 );
+   skein256_4way_update( &ctx.skein, vhashA, 32 );
+   skein256_4way_close( &ctx.skein, vhashA );
+   skein256_4way_init( &ctx.skein );
+   skein256_4way_update( &ctx.skein, vhashB, 32 );
+   skein256_4way_close( &ctx.skein, vhashB );

-   dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
+   dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, 256 );
+   dintrlv_4x64( hash4, hash5, hash6, hash7, vhashB, 256 );

-   update_and_final_groestl256( &ctx.groestl, state, hash0, 256 );
-   memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, state+32, hash1, 256 );
-   memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, state+64, hash2, 256 );
-   memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, state+96, hash3, 256 );
+   groestl256_full( &ctx.groestl, hash0, hash0, 256 );
+   groestl256_full( &ctx.groestl, hash1, hash1, 256 );
+   groestl256_full( &ctx.groestl, hash2, hash2, 256 );
+   groestl256_full( &ctx.groestl, hash3, hash3, 256 );
+   groestl256_full( &ctx.groestl, hash4, hash4, 256 );
+   groestl256_full( &ctx.groestl, hash5, hash5, 256 );
+   groestl256_full( &ctx.groestl, hash6, hash6, 256 );
+   groestl256_full( &ctx.groestl, hash7, hash7, 256 );
 }

-int scanhash_allium_4way( struct work *work, uint32_t max_nonce,
+int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
                             uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t hash[8*4] __attribute__ ((aligned (64)));
-   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+   uint64_t hash[4*8] __attribute__ ((aligned (64)));
+   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
+   uint64_t *ptarget = (uint64_t*)work->target;
   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 8;
   uint32_t n = first_nonce;
-   const uint32_t Htarg = ptarget[7];
-   __m128i  *noncev = (__m128i*)vdata + 19;   // aligned
-   int thr_id = mythr->id;  // thr_id arg is deprecated
+   __m256i  *noncev = (__m256i*)vdata + 19;   // aligned
+   const int thr_id = mythr->id;  
+   const bool bench = opt_benchmark;

-   if ( opt_benchmark )
-      ( (uint32_t*)ptarget )[7] = 0x0000ff;
+   mm256_bswap32_intrlv80_8x32( vdata, pdata );
+   *noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );

-   mm128_bswap32_intrlv80_4x32( vdata, pdata );
-   blake256_4way_init( &allium_4way_ctx.blake );
-   blake256_4way( &allium_4way_ctx.blake, vdata, 64 );
+   blake256_8way_init( &allium_8way_ctx.blake );
+   blake256_8way_update( &allium_8way_ctx.blake, vdata, 64 );

   do {
-     *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
+     allium_8way_hash( hash, vdata );

-     allium_4way_hash( hash, vdata );
-     pdata[19] = n;
-
-     for ( int lane = 0; lane < 4; lane++ ) if ( (hash+(lane<<3))[7] <= Htarg )
+     for ( int lane = 0; lane < 8; lane++ )
     {
-        if ( fulltest( hash+(lane<<3), ptarget ) && !opt_benchmark )
+        const uint64_t *lane_hash = hash + (lane<<2);
+        if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) )
        {
-           pdata[19] = n + lane;
-           submit_lane_solution( work, hash+(lane<<3), mythr, lane );
-         }
+           pdata[19] = bswap_32( n + lane );
+           submit_solution( work, lane_hash, mythr );
+        }
     }
-     n += 4;
-   } while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
-
-   *hashes_done = n - first_nonce + 1;
+     n += 8;
+     *noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
+   } while ( likely( (n <= last_nonce) && !work_restart[thr_id].restart ) );
+   pdata[19] = n;
+   *hashes_done = n - first_nonce;
   return 0;
 }

--- a/algo/lyra2/allium.c
+++ b/algo/lyra2/allium.c
@@ -1,4 +1,7 @@
 #include "lyra2-gate.h"
+
+#if !( defined(ALLIUM_16WAY) || defined(ALLIUM_8WAY) || defined(ALLIUM_4WAY) )
+
 #include <memory.h>
 #include "algo/blake/sph_blake.h"
 #include "algo/keccak/sph_keccak.h"
@@ -73,37 +76,35 @@ int scanhash_allium( struct work *work, uint32_t max_nonce,
                     uint64_t *hashes_done, struct thr_info *mythr )
 {
    uint32_t _ALIGN(128) hash[8];
-    uint32_t _ALIGN(128) endiandata[20];
+    uint32_t _ALIGN(128) edata[20];
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
-
-    const uint32_t Htarg = ptarget[7];
    const uint32_t first_nonce = pdata[19];
    uint32_t nonce = first_nonce;
-    int thr_id = mythr->id;  // thr_id arg is deprecated
+    const int thr_id = mythr->id; 

    if ( opt_benchmark )
        ptarget[7] = 0x3ffff;

    for ( int i = 0; i < 19; i++ )
-        be32enc( &endiandata[i], pdata[i] );
+        edata[i] = bswap_32( pdata[i] );

    sph_blake256_init( &allium_ctx.blake );
-    sph_blake256( &allium_ctx.blake, endiandata, 64 );
+    sph_blake256( &allium_ctx.blake, edata, 64 );

    do {
-        be32enc( &endiandata[19], nonce );
-        allium_hash( hash, endiandata );
-        if ( hash[7] <= Htarg )
-        if ( fulltest( hash, ptarget ) && !opt_benchmark )
+        edata[19] = nonce;
+        allium_hash( hash, edata );
+        if ( valid_hash( hash, ptarget ) && !opt_benchmark )
        {
-            pdata[19] = nonce;
+            pdata[19] = bswap_32( nonce );
            submit_solution( work, hash, mythr );
        }
        nonce++;
    } while ( nonce < max_nonce && !work_restart[thr_id].restart );
    pdata[19] = nonce;
-    *hashes_done = pdata[19] - first_nonce + 1;
+    *hashes_done = pdata[19] - first_nonce;
    return 0;
 }

+#endif
--- a/algo/lyra2/lyra2-gate.c
+++ b/algo/lyra2/lyra2-gate.c
@@ -78,7 +78,7 @@ bool register_lyra2rev3_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_lyra2rev3;
  gate->hash      = (void*)&lyra2rev3_hash;
 #endif
-  gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
+  gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
  gate->miner_thread_init = (void*)&lyra2rev3_thread_init;
  opt_target_factor = 256.0;
  return true;
@@ -94,12 +94,12 @@ bool lyra2rev2_thread_init()
   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;

   int size = (int64_t)ROW_LEN_BYTES * 4; // nRows;
-#if defined (LYRA2REV2_8WAY)
+#if defined (LYRA2REV2_16WAY)
   l2v2_wholeMatrix = _mm_malloc( 2 * size, 64 );   // 2 way
-   init_lyra2rev2_8way_ctx();;
-#elif defined (LYRA2REV2_4WAY)
+   init_lyra2rev2_16way_ctx();;
+#elif defined (LYRA2REV2_8WAY)
   l2v2_wholeMatrix = _mm_malloc( size, 64 );
-   init_lyra2rev2_4way_ctx();;
+   init_lyra2rev2_8way_ctx();;
 #else
   l2v2_wholeMatrix = _mm_malloc( size, 64 );
   init_lyra2rev2_ctx();
@@ -109,17 +109,17 @@ bool lyra2rev2_thread_init()

 bool register_lyra2rev2_algo( algo_gate_t* gate )
 {
-#if defined (LYRA2REV2_8WAY)
+#if defined (LYRA2REV2_16WAY)
+  gate->scanhash  = (void*)&scanhash_lyra2rev2_16way;
+  gate->hash      = (void*)&lyra2rev2_16way_hash;
+#elif defined (LYRA2REV2_8WAY)
  gate->scanhash  = (void*)&scanhash_lyra2rev2_8way;
  gate->hash      = (void*)&lyra2rev2_8way_hash;
-#elif defined (LYRA2REV2_4WAY)
-  gate->scanhash  = (void*)&scanhash_lyra2rev2_4way;
-  gate->hash      = (void*)&lyra2rev2_4way_hash;
 #else
  gate->scanhash  = (void*)&scanhash_lyra2rev2;
  gate->hash      = (void*)&lyra2rev2_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
+  gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
  gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
  opt_target_factor = 256.0;
  return true;
@@ -146,7 +146,7 @@ bool register_lyra2z_algo( algo_gate_t* gate )
  gate->scanhash   = (void*)&scanhash_lyra2z;
  gate->hash       = (void*)&lyra2z_hash;
 #endif
-  gate->optimizations = SSE42_OPT | AVX2_OPT | AVX512_OPT;
+  gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
  opt_target_factor = 256.0;
  return true;
 };
@@ -165,7 +165,7 @@ bool register_lyra2h_algo( algo_gate_t* gate )
  gate->scanhash   = (void*)&scanhash_lyra2h;
  gate->hash       = (void*)&lyra2h_hash;
 #endif
-  gate->optimizations = SSE42_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AVX2_OPT;
  opt_target_factor = 256.0;
  return true;
 };
@@ -174,27 +174,27 @@ bool register_lyra2h_algo( algo_gate_t* gate )

 bool register_allium_algo( algo_gate_t* gate )
 {
-#if defined (ALLIUM_8WAY)
+#if defined (ALLIUM_16WAY)
+  gate->miner_thread_init = (void*)&init_allium_16way_ctx;
+  gate->scanhash  = (void*)&scanhash_allium_16way;
+  gate->hash      = (void*)&allium_16way_hash;
+#elif defined (ALLIUM_8WAY)
  gate->miner_thread_init = (void*)&init_allium_8way_ctx;
  gate->scanhash  = (void*)&scanhash_allium_8way;
  gate->hash      = (void*)&allium_8way_hash;
-#elif defined (ALLIUM_4WAY)
-  gate->miner_thread_init = (void*)&init_allium_4way_ctx;
-  gate->scanhash  = (void*)&scanhash_allium_4way;
-  gate->hash      = (void*)&allium_4way_hash;
 #else
  gate->miner_thread_init = (void*)&init_allium_ctx;
  gate->scanhash  = (void*)&scanhash_allium;
  gate->hash      = (void*)&allium_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
  opt_target_factor = 256.0;
  return true;
 };

 /////////////////////////////////////////

-bool phi2_has_roots;
+bool phi2_has_roots = false;
 bool phi2_use_roots = false;

 int phi2_get_work_data_size() { return phi2_use_roots ? 144 : 128; }
@@ -220,7 +220,7 @@ void phi2_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
   // Assemble block header
   algo_gate.build_block_header( g_work, le32dec( sctx->job.version ),
                  (uint32_t*) sctx->job.prevhash, (uint32_t*) merkle_tree,
-                  le32dec( sctx->job.ntime ), le32dec(sctx->job.nbits) );
+                  le32dec( sctx->job.ntime ), le32dec(sctx->job.nbits), NULL );
   for ( t = 0; t < 16; t++ )
      g_work->data[ 20+t ] = ((uint32_t*)sctx->job.extra)[t];
 }
@@ -228,13 +228,14 @@ void phi2_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )

 bool register_phi2_algo( algo_gate_t* gate )
 {
-//   init_phi2_ctx();
-   gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
+   gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
   gate->get_work_data_size = (void*)&phi2_get_work_data_size;
   gate->decode_extra_data  = (void*)&phi2_decode_extra_data;
   gate->build_extraheader  = (void*)&phi2_build_extraheader;
   opt_target_factor = 256.0;
-#if defined(PHI2_4WAY)
+#if defined(PHI2_8WAY)
+   gate->scanhash           = (void*)&scanhash_phi2_8way;
+#elif defined(PHI2_4WAY)
   gate->scanhash           = (void*)&scanhash_phi2_4way;
 #else
   init_phi2_ctx();
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Jay D Dee	972d4d70db	v3.12.8.1	2020-04-17 16:12:45 -04:00
Jay D Dee	e96a6bd699	v3.12.8	2020-04-09 12:56:18 -04:00
Jay D Dee	fb9163185a	v3.12.7	2020-03-20 16:30:12 -04:00
Jay D Dee	6e8b8ed34f	v3.12.6.1	2020-03-07 14:11:06 -05:00
Jay D Dee	c0aadbcc99	v3.12.6	2020-03-05 18:43:20 -05:00
Jay D Dee	3da149418a	v3.12.5	2020-03-01 13:18:17 -05:00
Jay D Dee	720610cce5	v3.12.4.6	2020-02-28 18:20:32 -05:00
Jay D Dee	cedcf4d070	v3.12.4.5	2020-02-28 02:42:22 -05:00
Jay D Dee	81b50c3c71	v3.12.4.4	2020-02-25 14:07:32 -05:00
Jay D Dee	0e1e88f53e	v3.12.4.3	2020-02-24 21:35:19 -05:00
Jay D Dee	45c77a5c81	v3.12.4.2	2020-02-23 15:31:06 -05:00
Jay D Dee	dbce7e0721	v3.12.4.1	2020-02-22 18:06:39 -05:00
Jay D Dee	6d66051de6	v3.12.4	2020-02-21 16:34:53 -05:00
Jay D Dee	b93be8816a	v3.12.3.1	2020-02-18 12:05:47 -05:00
Jay D Dee	19b0ac6d5c	v3.12.3	2020-02-13 04:25:33 -05:00
Jay D Dee	3da2b958cf	v3.12.2	2020-02-09 13:30:40 -05:00
Jay D Dee	dc2f8d81d3	v3.12.1	2020-02-07 20:18:20 -05:00
Jay D Dee	fc97ef174a	v3.12.0.1	2020-02-06 22:50:20 -05:00
Jay D Dee	13523a12f9	v3.12.0	2020-02-05 22:50:58 -05:00
Jay D Dee	1b76cee239	v3.11.9	2020-02-04 01:31:59 -05:00
Jay D Dee	0681ca996d	v3.11.8	2020-01-30 03:47:11 -05:00
Jay D Dee	88f81fda0b	v3.11.7	2020-01-26 04:33:39 -05:00
Jay D Dee	103e6ad36c	v3.11.6	2020-01-23 00:11:08 -05:00
Jay D Dee	1a7a573675	v3.11.5	2020-01-18 15:14:27 -05:00