Compare commits

..

1 Commit

Author SHA1 Message Date
Jay D Dee
f31fe1e8c2 v3.11.4 2020-01-16 13:09:56 -05:00
215 changed files with 27605 additions and 8836 deletions

View File

@@ -33,6 +33,3 @@ Jay D Dee
xcouiz@gmail.com
Cryply
Colin Percival
Alexander Peslyak

View File

@@ -21,6 +21,15 @@ cpuminer_SOURCES = \
api.c \
sysinfos.c \
algo-gate-api.c\
crypto/oaes_lib.c \
crypto/c_keccak.c \
crypto/c_groestl.c \
crypto/c_blake256.c \
crypto/c_jh.c \
crypto/c_skein.c \
crypto/hash.c \
crypto/aesb.c \
crypto/magimath.cpp \
algo/argon2/argon2a/argon2a.c \
algo/argon2/argon2a/ar2/argon2.c \
algo/argon2/argon2a/ar2/opt.c \
@@ -67,6 +76,11 @@ cpuminer_SOURCES = \
algo/bmw/bmw512-gate.c \
algo/bmw/bmw512.c \
algo/bmw/bmw512-4way.c \
algo/cryptonight/cryptolight.c \
algo/cryptonight/cryptonight-common.c\
algo/cryptonight/cryptonight-aesni.c\
algo/cryptonight/cryptonight.c\
algo/cubehash/sph_cubehash.c \
algo/cubehash/cubehash_sse2.c\
algo/cubehash/cube-hash-2way.c \
algo/echo/sph_echo.c \
@@ -89,6 +103,9 @@ cpuminer_SOURCES = \
algo/hamsi/hamsi-hash-4way.c \
algo/haval/haval.c \
algo/haval/haval-hash-4way.c \
algo/heavy/sph_hefty1.c \
algo/heavy/heavy.c \
algo/heavy/bastion.c \
algo/hodl/aes.c \
algo/hodl/hodl-gate.c \
algo/hodl/hodl-wolf.c \
@@ -104,9 +121,9 @@ cpuminer_SOURCES = \
algo/keccak/keccak-hash-4way.c \
algo/keccak/keccak-4way.c\
algo/keccak/keccak-gate.c \
algo/keccak/sha3d-4way.c \
algo/keccak/sha3d.c \
algo/lanehash/lane.c \
algo/luffa/sph_luffa.c \
algo/luffa/luffa.c \
algo/luffa/luffa_for_sse2.c \
algo/luffa/luffa-hash-2way.c \
algo/lyra2/lyra2.c \
@@ -128,14 +145,14 @@ cpuminer_SOURCES = \
algo/lyra2/allium.c \
algo/lyra2/phi2-4way.c \
algo/lyra2/phi2.c \
algo/m7m/m7m.c \
algo/m7m/magimath.cpp \
algo/m7m.c \
algo/nist5/nist5-gate.c \
algo/nist5/nist5-4way.c \
algo/nist5/nist5.c \
algo/nist5/zr5.c \
algo/panama/panama-hash-4way.c \
algo/panama/sph_panama.c \
algo/radiogatun/sph_radiogatun.c \
algo/quark/quark-gate.c \
algo/quark/quark.c \
algo/quark/quark-4way.c \
@@ -158,11 +175,11 @@ cpuminer_SOURCES = \
algo/ripemd/lbry-4way.c \
algo/scrypt/scrypt.c \
algo/scrypt/neoscrypt.c \
algo/scrypt/pluck.c \
algo/sha/sph_sha2.c \
algo/sha/sph_sha2big.c \
algo/sha/sha256-hash-4way.c \
algo/sha/sha512-hash-4way.c \
algo/sha/hmac-sha256-hash.c \
algo/sha/sha2.c \
algo/sha/sha256t-gate.c \
algo/sha/sha256t-4way.c \
@@ -176,6 +193,7 @@ cpuminer_SOURCES = \
algo/shavite/shavite-hash-2way.c \
algo/shavite/shavite-hash-4way.c \
algo/shavite/shavite.c \
algo/simd/sph_simd.c \
algo/simd/nist.c \
algo/simd/vector.c \
algo/simd/simd-hash-2way.c \
@@ -213,6 +231,7 @@ cpuminer_SOURCES = \
algo/x11/timetravel10-gate.c \
algo/x11/timetravel10.c \
algo/x11/timetravel10-4way.c \
algo/x11/fresh.c \
algo/x11/x11evo.c \
algo/x11/x11evo-4way.c \
algo/x11/x11evo-gate.c \
@@ -231,6 +250,7 @@ cpuminer_SOURCES = \
algo/x13/skunk-gate.c \
algo/x13/skunk-4way.c \
algo/x13/skunk.c \
algo/x13/drop.c \
algo/x13/x13bcd-4way.c \
algo/x13/x13bcd.c \
algo/x14/x14-gate.c \
@@ -265,17 +285,19 @@ cpuminer_SOURCES = \
algo/x17/sonoa-gate.c \
algo/x17/sonoa-4way.c \
algo/x17/sonoa.c \
algo/x20/x20r.c \
algo/x22/x22i-4way.c \
algo/x22/x22i.c \
algo/x22/x22i-gate.c \
algo/x22/x25x.c \
algo/x22/x25x-4way.c \
algo/yescrypt/yescrypt.c \
algo/yescrypt/sha256_Y.c \
algo/yescrypt/yescrypt-best.c \
algo/yespower/yespower-gate.c \
algo/yespower/yespower-blake2b.c \
algo/yespower/crypto/blake2b-yp.c \
algo/yespower/yescrypt-r8g.c \
algo/yespower/sha256_p.c \
algo/yespower/yespower-opt.c
disable_flags =

View File

@@ -12,24 +12,10 @@ a false positive, they are flagged simply because they are cryptocurrency
miners. The source code is open for anyone to inspect. If you don't trust
the software, don't use it.
New thread:
https://bitcointalk.org/index.php?topic=5226770.msg53865575#msg53865575
Old thread:
https://bitcointalk.org/index.php?topic=1326803.0
mailto://jayddee246@gmail.com
This note is to confirm that bitcointalk users JayDDee and joblo are the
same person.
I created a new BCT user JayDDee to match my github user id.
The old thread has been locked but still contains useful information.
See file RELEASE_NOTES for change log and INSTALL_LINUX or INSTALL_WINDOWS
for compile instructions.
@@ -67,6 +53,7 @@ Supported Algorithms
argon2d500 argon2d-dyn, Dynamic (DYN)
argon2d4096 argon2d-uis, Unitus, (UIS)
axiom Shabal-256 MemoHash
bastion
blake Blake-256 (SFR)
blake2b Blake2b 256
blake2s Blake-2 S
@@ -77,7 +64,10 @@ Supported Algorithms
decred
deep Deepcoin (DCN)
dmd-gr Diamond-Groestl
drop Dropcoin
fresh Fresh
groestl Groestl coin
heavy Heavy
hex x16r-hex
hmq1725 Espers
hodl Hodlcoin
@@ -107,10 +97,10 @@ Supported Algorithms
qubit Qubit
scrypt scrypt(1024, 1, 1) (default)
scrypt:N scrypt(N, 1, 1)
scryptjane:nf
sha256d Double SHA-256
sha256q Quad SHA-256, Pyrite (PYE)
sha256t Triple SHA-256, Onecoin (OC)
sha3d Double keccak256 (BSHA3)
shavite3 Shavite3
skein Skein+Sha (Skeincoin)
skein2 Double Skein (Woodcoin)
@@ -144,7 +134,6 @@ Supported Algorithms
xevan Bitsend (BSD)
yescrypt Globalboost-Y (BSTY)
yescryptr8 BitZeny (ZNY)
yescryptr8g Koto (KOTO)
yescryptr16 Eli
yescryptr32 WAVI
yespower Cryply
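For example, an algorithm from the list above is selected with -a; parameterized
algos such as scrypt:N take their parameter inline. A hedged illustration (the
pool URL and wallet are placeholders, not real endpoints):

    cpuminer -a scrypt:1048576 -o stratum+tcp://pool.example.com:3333 -u WALLET -p x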

View File

@@ -33,195 +33,9 @@ supported.
64 bit Linux or Windows operating system. Apple, Android and Raspberry Pi
are not supported. FreeBSD YMMV.
Reporting bugs
--------------
Bugs can be reported by sending an email to JayDDee246@gmail.com or opening
an issue in git: https://github.com/JayDDee/cpuminer-opt/issues
Please include the following information:
1. CPU model, operating system, cpuminer-opt version (must be latest),
binary file for Windows, changes to default build procedure for Linux.
2. Exact command line (except user and pw) and initial output showing
the above requested info.
3. Additional program output showing any error messages or other
pertinent data.
4. A clear description of the problem including history, scope,
persistence or intermittency, and reproducibility.
In simpler terms:
What is it doing?
What should it be doing instead?
Did it work in a previous release?
Does it happen for all algos? All pools? All options? Solo?
Does it happen all the time?
If not what makes it happen or not happen?
Change Log
----------
v3.12.5
Issue #246: Fixed net hashrate in getwork block log,
removed duplicate getwork block log,
other small tweaks to stats logs for getwork.
Issue #248: Fixed chronic stale shares with scrypt:1048576 (scryptn2).
v3.12.4.3
Fixed segfault in new block log for getwork.
Disabled silent discarding of stale work after the submit is logged.
v3.12.4.2
Issue #245: fixed getwork stale shares, solo mining with getwork now works.
Issue #246: implemented block and summary logs for getwork.
v3.12.4.1
Issue #245: fix scantime when mining solo with getwork.
Added debug logs for creation of stratum and longpoll threads, use -D to
enable.
v3.12.4
Issue #244: Change longpoll to ignore job id.
Lyra2rev2 AVX2 +3%, AVX512 +6%.
v3.12.3.1
Issue #241: Fixed regression that broke coinbase address in v3.11.7.
v3.12.3
Issue #238: Fixed skunk AVX2.
Issue #239: Faster AVX2 & AVX512 for skein +44%, skein2 +30%, plus marginal
increases for skunk, x16r, x16rv2, x16rt, x16rt-veil, x16s, x21s.
Faster anime VAES +57%, AVX512 +21%, AVX2 +3%.
Redesigned code responsible for #236.
v3.12.2
Fixed xevan, skein, skein2 AVX2, #238.
Reversed polarity of AVX2 vector bit test utilities, and all users, to be
logically and semantically correct. Follow up to issue #236.
v3.12.1
Fixed anime AVX2 low difficulty shares, git issue #236.
Periodic summary now reports lost hash rate due to rejected and stale shares,
displayed only when non-zero.
v3.12.0.1
Fixed hodl rejects, git issue #237.
Fixed debug code added in v3.12.0 to work with AVX2 and to be enabled only
after low difficulty shares have been seen, to avoid unnecessarily excessive
log output.
Added more digits of precision to diff in log output to help diagnose
low difficulty shares.
v3.12.0
Faster phi2 AVX2 +62%, AVX512 +150% on Intel CPUs. AMD Ryzen AVX2 is
YMMV due to its inferior AVX2 implementation.
Fixed Hodl stats, rejects are still an issue since v3.9.5, git issue #237.
API can now be enabled with "-b port" or "--api-bind port".
It will use the default address 127.0.0.1.
Editorial: Short form options should only be used on the command line to save
typing. Configuration files and scripts should always use the long form
"--api-bind addr:port" without relying on any defaults. This is a general
recommendation that applies to all options for any application.
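For example, a script following this recommendation would spell the option out
in full rather than relying on the short form or the default address (the port,
pool URL, and wallet below are illustrative placeholders):

    cpuminer --api-bind 127.0.0.1:4048 -a sha256d -o stratum+tcp://pool.example.com:3333 -u WALLET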
Removed obsolete cryptonight, all variants, and supporting code for more
size reduction and faster compiling.
Tweaked the timing of the CPU temperature and frequency log (Linux only).
Added some debug code to collect more info about low difficulty rejects,
git issue #236.
v3.11.9
Fixed x16r invalid shares when Luffa was first in hash order.
API is disabled by default.
New startup message for status of stratum connection, API & extranonce.
New log report for CPU temperature, frequency of fastest and slowest cores.
Compile time is a little shorter and binary file size a little smaller
using conditional compilation.
Removed code for Bastion, Drop, Heavy, Luffa and Pluck algos and other unused
code.
v3.11.8
Fixed network hashrate showing incorrect data, should be close now.
Fixed compile errors when using GCC 10 with default flag -fno-common.
Faster x16r, x16rv2, x16rt, x16s, x21s, veil, hex with midstate prehash.
Decoupled sapling usage from block version 5 in yescryptr8g.
More detailed data reporting for low difficulty rejected shares.
v3.11.7
Added yescryptr8g algo for KOTO, including support for block version 5.
Added sha3d algo for BSHA3.
Removed memcmp and clean_job checks from get_new_work, now only checks job_id.
Small improvement to sha512 and sha256 parallel implementations that don't
use SHA.
v3.11.6
Fixed CPU temperature regression from v3.11.5.
More improvements to share log. More compact, highlight incremented counter,
block height when solved, job id when stale.
v3.11.5
Fixed AVX512 detection that could cause compilation errors on CPUs
without AVX512.
Fixed "BLOCK SOLVED" log incorrectly displaying "Accepted" when a block
is solved.
Added share counter to share submitted & accepted logs.
Added job id to share submitted log.
Share submitted log is no longer highlighted blue, there was too much blue.
Another CPU temperature fix for Linux.
Added bug reporting tips to RELEASE NOTES.
v3.11.4
Fixed scrypt segfault since v3.9.9.1.

View File

@@ -113,6 +113,7 @@ void init_algo_gate( algo_gate_t* gate )
gate->hash = (void*)&null_hash;
gate->hash_suw = (void*)&null_hash_suw;
gate->get_new_work = (void*)&std_get_new_work;
gate->get_nonceptr = (void*)&std_get_nonceptr;
gate->work_decode = (void*)&std_le_work_decode;
gate->decode_extra_data = (void*)&do_nothing;
gate->gen_merkle_root = (void*)&sha256d_gen_merkle_root;
@@ -128,6 +129,7 @@ void init_algo_gate( algo_gate_t* gate )
gate->resync_threads = (void*)&do_nothing;
gate->do_this_thread = (void*)&return_true;
gate->longpoll_rpc_call = (void*)&std_longpoll_rpc_call;
gate->stratum_handle_response = (void*)&std_stratum_handle_response;
gate->get_work_data_size = (void*)&std_get_work_data_size;
gate->optimizations = EMPTY_SET;
gate->ntime_index = STD_NTIME_INDEX;
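For context, an algo's register function starts from these defaults and
overrides only what it needs. A minimal sketch with hypothetical names
(register_example_algo, scanhash_example, example_hash), modeled on the
register functions appearing later in this diff:

    bool register_example_algo( algo_gate_t* gate )
    {
       gate->optimizations = SSE2_OPT | AES_OPT;        // CPU features this algo can use
       gate->scanhash      = (void*)&scanhash_example;  // custom nonce search loop
       gate->hash          = (void*)&example_hash;      // one-shot hash, ie util.c test
       return true;                                     // gate defaults cover the rest
    }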
@@ -160,16 +162,23 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_ARGON2D500: register_argon2d_dyn_algo ( gate ); break;
case ALGO_ARGON2D4096: register_argon2d4096_algo ( gate ); break;
case ALGO_AXIOM: register_axiom_algo ( gate ); break;
case ALGO_BASTION: register_bastion_algo ( gate ); break;
case ALGO_BLAKE: register_blake_algo ( gate ); break;
case ALGO_BLAKE2B: register_blake2b_algo ( gate ); break;
case ALGO_BLAKE2S: register_blake2s_algo ( gate ); break;
case ALGO_BLAKECOIN: register_blakecoin_algo ( gate ); break;
case ALGO_BMW512: register_bmw512_algo ( gate ); break;
case ALGO_C11: register_c11_algo ( gate ); break;
case ALGO_CRYPTOLIGHT: register_cryptolight_algo ( gate ); break;
case ALGO_CRYPTONIGHT: register_cryptonight_algo ( gate ); break;
case ALGO_CRYPTONIGHTV7: register_cryptonightv7_algo ( gate ); break;
case ALGO_DECRED: register_decred_algo ( gate ); break;
case ALGO_DEEP: register_deep_algo ( gate ); break;
case ALGO_DMD_GR: register_dmd_gr_algo ( gate ); break;
case ALGO_DROP: register_drop_algo ( gate ); break;
case ALGO_FRESH: register_fresh_algo ( gate ); break;
case ALGO_GROESTL: register_groestl_algo ( gate ); break;
case ALGO_HEAVY: register_heavy_algo ( gate ); break;
case ALGO_HEX: register_hex_algo ( gate ); break;
case ALGO_HMQ1725: register_hmq1725_algo ( gate ); break;
case ALGO_HODL: register_hodl_algo ( gate ); break;
@@ -177,6 +186,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_KECCAK: register_keccak_algo ( gate ); break;
case ALGO_KECCAKC: register_keccakc_algo ( gate ); break;
case ALGO_LBRY: register_lbry_algo ( gate ); break;
case ALGO_LUFFA: register_luffa_algo ( gate ); break;
case ALGO_LYRA2H: register_lyra2h_algo ( gate ); break;
case ALGO_LYRA2RE: register_lyra2re_algo ( gate ); break;
case ALGO_LYRA2REV2: register_lyra2rev2_algo ( gate ); break;
@@ -190,6 +200,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_PENTABLAKE: register_pentablake_algo ( gate ); break;
case ALGO_PHI1612: register_phi1612_algo ( gate ); break;
case ALGO_PHI2: register_phi2_algo ( gate ); break;
case ALGO_PLUCK: register_pluck_algo ( gate ); break;
case ALGO_POLYTIMOS: register_polytimos_algo ( gate ); break;
case ALGO_POWER2B: register_power2b_algo ( gate ); break;
case ALGO_QUARK: register_quark_algo ( gate ); break;
@@ -198,7 +209,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_SHA256D: register_sha256d_algo ( gate ); break;
case ALGO_SHA256Q: register_sha256q_algo ( gate ); break;
case ALGO_SHA256T: register_sha256t_algo ( gate ); break;
case ALGO_SHA3D: register_sha3d_algo ( gate ); break;
case ALGO_SHAVITE3: register_shavite_algo ( gate ); break;
case ALGO_SKEIN: register_skein_algo ( gate ); break;
case ALGO_SKEIN2: register_skein2_algo ( gate ); break;
@@ -237,7 +247,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
*/
case ALGO_YESCRYPT: register_yescrypt_algo ( gate ); break;
case ALGO_YESCRYPTR8: register_yescryptr8_algo ( gate ); break;
case ALGO_YESCRYPTR8G: register_yescryptr8g_algo ( gate ); break;
case ALGO_YESCRYPTR16: register_yescryptr16_algo ( gate ); break;
case ALGO_YESCRYPTR32: register_yescryptr32_algo ( gate ); break;
case ALGO_YESPOWER: register_yespower_algo ( gate ); break;
@@ -261,6 +270,29 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
// restore warnings
#pragma GCC diagnostic pop
// override std defaults with jr2 defaults
bool register_json_rpc2( algo_gate_t *gate )
{
applog(LOG_WARNING,"\nCryptonight algorithm and variants are no longer");
applog(LOG_WARNING,"supported by cpuminer-opt. Shares submitted will");
applog(LOG_WARNING,"likely be rejected. Proceed at your own risk.\n");
// gate->wait_for_diff = (void*)&do_nothing;
gate->get_new_work = (void*)&jr2_get_new_work;
gate->get_nonceptr = (void*)&jr2_get_nonceptr;
gate->stratum_gen_work = (void*)&jr2_stratum_gen_work;
gate->build_stratum_request = (void*)&jr2_build_stratum_request;
gate->submit_getwork_result = (void*)&jr2_submit_getwork_result;
gate->longpoll_rpc_call = (void*)&jr2_longpoll_rpc_call;
gate->work_decode = (void*)&jr2_work_decode;
gate->stratum_handle_response = (void*)&jr2_stratum_handle_response;
gate->nonce_index = JR2_NONCE_INDEX;
jsonrpc_2 = true; // still needed
opt_extranonce = false;
// have_gbt = false;
return true;
}
// run the alternate hash function for a specific algo
void exec_hash_function( int algo, void *output, const void *pdata )
{
@@ -281,37 +313,39 @@ void exec_hash_function( int algo, void *output, const void *pdata )
const char* const algo_alias_map[][2] =
{
// alias proper
{ "argon2d-crds", "argon2d250" },
{ "argon2d-dyn", "argon2d500" },
{ "argon2d-uis", "argon2d4096" },
{ "bcd", "x13bcd" },
{ "bitcore", "timetravel10" },
{ "bitzeny", "yescryptr8" },
{ "blake256r8", "blakecoin" },
{ "blake256r8vnl", "vanilla" },
{ "blake256r14", "blake" },
{ "blake256r14dcr", "decred" },
{ "diamond", "dmd-gr" },
{ "espers", "hmq1725" },
{ "flax", "c11" },
{ "hsr", "x13sm3" },
{ "jackpot", "jha" },
{ "jane", "scryptjane" },
{ "lyra2", "lyra2re" },
{ "lyra2v2", "lyra2rev2" },
{ "lyra2v3", "lyra2rev3" },
{ "myrgr", "myr-gr" },
{ "myriad", "myr-gr" },
{ "neo", "neoscrypt" },
{ "phi", "phi1612" },
{ "scryptn2", "scrypt:1048576" },
{ "sib", "x11gost" },
{ "timetravel8", "timetravel" },
{ "veil", "x16rt-veil" },
{ "x16r-hex", "hex" },
{ "yenten", "yescryptr16" },
{ "ziftr", "zr5" },
{ NULL, NULL }
{ "argon2d-crds", "argon2d250" },
{ "argon2d-dyn", "argon2d500" },
{ "argon2d-uis", "argon2d4096" },
{ "bcd", "x13bcd" },
{ "bitcore", "timetravel10" },
{ "bitzeny", "yescryptr8" },
{ "blake256r8", "blakecoin" },
{ "blake256r8vnl", "vanilla" },
{ "blake256r14", "blake" },
{ "blake256r14dcr", "decred" },
{ "cryptonote", "cryptonight" },
{ "cryptonight-light", "cryptolight" },
{ "diamond", "dmd-gr" },
{ "droplp", "drop" },
{ "espers", "hmq1725" },
{ "flax", "c11" },
{ "hsr", "x13sm3" },
{ "jackpot", "jha" },
{ "jane", "scryptjane" },
{ "lyra2", "lyra2re" },
{ "lyra2v2", "lyra2rev2" },
{ "lyra2v3", "lyra2rev3" },
{ "myrgr", "myr-gr" },
{ "myriad", "myr-gr" },
{ "neo", "neoscrypt" },
{ "phi", "phi1612" },
{ "sib", "x11gost" },
{ "timetravel8", "timetravel" },
{ "veil", "x16rt-veil" },
{ "x16r-hex", "hex" },
{ "yenten", "yescryptr16" },
{ "ziftr", "zr5" },
{ NULL, NULL }
};
// if arg is a valid alias for a known algo it is updated with the proper name.
@@ -324,7 +358,7 @@ void get_algo_alias( char** algo_or_alias )
if ( !strcasecmp( *algo_or_alias, algo_alias_map[i][ ALIAS ] ) )
{
// found valid alias, return proper name
*algo_or_alias = (char*)( algo_alias_map[i][ PROPER ] );
*algo_or_alias = (char* const)( algo_alias_map[i][ PROPER ] );
return;
}
}
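A minimal usage sketch (not part of the source): the lookup rewrites an alias
in place and leaves non-aliases untouched:

    char *algo = "lyra2v2";
    get_algo_alias( &algo );   // algo now points to "lyra2rev2" per the map above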

View File

@@ -121,51 +121,54 @@ void ( *hash_suw ) ( void*, const void* );
// Allocate thread local buffers and other initialization specific to miner
// threads.
bool ( *miner_thread_init ) ( int );
// Generate global blockheader from stratum data.
void ( *stratum_gen_work ) ( struct stratum_ctx*, struct work* );
// Get thread local copy of blockheader with unique nonce.
void ( *get_new_work ) ( struct work*, struct work*, int, uint32_t* );
void ( *get_new_work ) ( struct work*, struct work*, int, uint32_t*,
bool );
// Return pointer to nonce in blockheader.
uint32_t *( *get_nonceptr ) ( uint32_t* );
// Decode getwork blockheader
bool ( *work_decode ) ( const json_t*, struct work* );
// Extra getwork data
void ( *decode_extra_data ) ( struct work*, uint64_t* );
bool ( *submit_getwork_result ) ( CURL*, struct work* );
void ( *gen_merkle_root ) ( char*, struct stratum_ctx* );
// Increment extranonce
void ( *build_extraheader ) ( struct work*, struct stratum_ctx* );
void ( *build_block_header ) ( struct work*, uint32_t, uint32_t*,
uint32_t*, uint32_t, uint32_t,
unsigned char* );
void ( *build_block_header ) ( struct work*, uint32_t, uint32_t*,
uint32_t*, uint32_t, uint32_t );
// Build mining.submit message
void ( *build_stratum_request ) ( char*, struct work*, struct stratum_ctx* );
char* ( *malloc_txs_request ) ( struct work* );
// Big or little
void ( *set_work_data_endian ) ( struct work* );
double ( *calc_network_diff ) ( struct work* );
// Wait for first work
bool ( *ready_to_mine ) ( struct work*, struct stratum_ctx*, int );
// Diverge mining threads
bool ( *do_this_thread ) ( int );
// After do_this_thread
void ( *resync_threads ) ( struct work* );
json_t* (*longpoll_rpc_call) ( CURL*, int*, char* );
bool ( *stratum_handle_response )( json_t* );
set_t optimizations;
int ( *get_work_data_size ) ();
int ntime_index;
@@ -218,22 +221,31 @@ void null_hash_suw();
// optional safe targets, default listed first unless noted.
uint32_t *std_get_nonceptr( uint32_t *work_data );
uint32_t *jr2_get_nonceptr( uint32_t *work_data );
void std_get_new_work( struct work *work, struct work *g_work, int thr_id,
uint32_t* end_nonce_ptr, bool clean_job );
void jr2_get_new_work( struct work *work, struct work *g_work, int thr_id,
uint32_t* end_nonce_ptr );
void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *work );
void jr2_stratum_gen_work( struct stratum_ctx *sctx, struct work *work );
void sha256d_gen_merkle_root( char *merkle_root, struct stratum_ctx *sctx );
void SHA256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
bool std_le_work_decode( const json_t *val, struct work *work );
bool std_be_work_decode( const json_t *val, struct work *work );
bool jr2_work_decode( const json_t *val, struct work *work );
bool std_le_submit_getwork_result( CURL *curl, struct work *work );
bool std_be_submit_getwork_result( CURL *curl, struct work *work );
bool jr2_submit_getwork_result( CURL *curl, struct work *work );
void std_le_build_stratum_request( char *req, struct work *work );
void std_be_build_stratum_request( char *req, struct work *work );
void jr2_build_stratum_request ( char *req, struct work *work );
char* std_malloc_txs_request( struct work *work );
@@ -244,16 +256,15 @@ double std_calc_network_diff( struct work *work );
void std_build_block_header( struct work* g_work, uint32_t version,
uint32_t *prevhash, uint32_t *merkle_root,
uint32_t ntime, uint32_t nbits,
unsigned char *final_sapling_hash );
uint32_t ntime, uint32_t nbits );
void std_build_extraheader( struct work *work, struct stratum_ctx *sctx );
json_t* std_longpoll_rpc_call( CURL *curl, int *err, char *lp_url );
//json_t* jr2_longpoll_rpc_call( CURL *curl, int *err );
json_t* jr2_longpoll_rpc_call( CURL *curl, int *err );
//bool std_stratum_handle_response( json_t *val );
//bool jr2_stratum_handle_response( json_t *val );
bool std_stratum_handle_response( json_t *val );
bool jr2_stratum_handle_response( json_t *val );
bool std_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
int thr_id );
@@ -275,7 +286,7 @@ bool register_algo( algo_gate_t *gate );
// Overrides a common set of functions used by RPC2 and other RPC2-specific
// init. Called by algo's register function before initializing algo-specific
// functions and data.
//bool register_json_rpc2( algo_gate_t *gate );
bool register_json_rpc2( algo_gate_t *gate );
// use this to call the hash function of an algo directly, ie util.c test.
void exec_hash_function( int algo, void *output, const void *pdata );

View File

@@ -1,5 +1,4 @@
#include "argon2d-gate.h"
#include "simd-utils.h"
#include "argon2d/argon2.h"
static const size_t INPUT_BYTES = 80; // Length of a block header in bytes. Input Length = Salt Length (salt = input)
@@ -37,7 +36,7 @@ void argon2d_crds_hash( void *output, const void *input )
int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) edata[20];
uint32_t _ALIGN(64) endiandata[20];
uint32_t _ALIGN(64) hash[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -46,11 +45,11 @@ int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
const uint32_t Htarg = ptarget[7];
uint32_t nonce = first_nonce;
swab32_array( edata, pdata, 20 );
swab32_array( endiandata, pdata, 20 );
do {
be32enc(&edata[19], nonce);
argon2d_crds_hash( hash, edata );
be32enc(&endiandata[19], nonce);
argon2d_crds_hash( hash, endiandata );
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) && !opt_benchmark )
{
pdata[19] = nonce;
@@ -104,32 +103,31 @@ void argon2d_dyn_hash( void *output, const void *input )
int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) edata[20];
uint32_t _ALIGN(64) endiandata[20];
uint32_t _ALIGN(64) hash[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const int thr_id = mythr->id;
const uint32_t first_nonce = (const uint32_t)pdata[19];
const uint32_t last_nonce = (const uint32_t)max_nonce;
int thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
uint32_t nonce = first_nonce;
const bool bench = opt_benchmark;
mm128_bswap32_80( edata, pdata );
swab32_array( endiandata, pdata, 20 );
do
{
edata[19] = nonce;
argon2d_dyn_hash( hash, edata );
if ( unlikely( valid_hash( (uint64_t*)hash, (uint64_t*)ptarget )
&& !bench ) )
be32enc(&endiandata[19], nonce);
argon2d_dyn_hash( hash, endiandata );
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) && !opt_benchmark )
{
pdata[19] = bswap_32( nonce );
pdata[19] = nonce;
submit_solution( work, hash, mythr );
}
nonce++;
} while ( likely( nonce < last_nonce && !work_restart[thr_id].restart ) );
} while (nonce < max_nonce && !work_restart[thr_id].restart);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
@@ -148,34 +146,36 @@ int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) vhash[8];
uint32_t _ALIGN(64) edata[20];
uint32_t _ALIGN(64) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = (const uint32_t)max_nonce;
uint32_t n = first_nonce;
const int thr_id = mythr->id; // thr_id arg is deprecated
int thr_id = mythr->id; // thr_id arg is deprecated
uint32_t t_cost = 1; // 1 iteration
uint32_t m_cost = 4096; // use 4MB
uint32_t parallelism = 1; // 1 thread, 2 lanes
const bool bench = opt_benchmark;
mm128_bswap32_80( edata, pdata );
for ( int i = 0; i < 19; i++ )
be32enc( &endiandata[i], pdata[i] );
do {
edata[19] = n;
argon2d_hash_raw( t_cost, m_cost, parallelism, (char*) edata, 80,
(char*) edata, 80, (char*) vhash, 32, ARGON2_VERSION_13 );
if ( unlikely( valid_hash( vhash, ptarget ) && !bench ) )
be32enc( &endiandata[19], n );
argon2d_hash_raw( t_cost, m_cost, parallelism, (char*) endiandata, 80,
(char*) endiandata, 80, (char*) vhash, 32, ARGON2_VERSION_13 );
if ( vhash[7] < Htarg && fulltest( vhash, ptarget ) && !opt_benchmark )
{
be32enc( &pdata[19], n );
pdata[19] = n;
submit_solution( work, vhash, mythr );
}
n++;
} while ( likely( n < last_nonce && !work_restart[thr_id].restart ) );
*hashes_done = n - first_nonce;
} while (n < max_nonce && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}

View File

@@ -33,8 +33,6 @@
#include "blake2b-hash-4way.h"
#if defined(__AVX2__)
static const uint8_t sigma[12][16] =
{
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
@@ -205,9 +203,9 @@ void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
casti_m512i( out, 3 ) = ctx->h[3];
}
#endif // AVX512
#endif
// AVX2
#if defined(__AVX2__)
// G Mixing function.
@@ -371,4 +369,4 @@ void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
casti_m256i( out, 3 ) = ctx->h[3];
}
#endif // AVX2
#endif

View File

@@ -4,9 +4,6 @@
*/
#include "blake2b-gate.h"
#if !defined(BLAKE2B_8WAY) && !defined(BLAKE2B_4WAY)
#include <string.h>
#include <stdint.h>
#include "algo/blake/sph_blake2b.h"
@@ -61,4 +58,3 @@ int scanhash_blake2b( struct work *work, uint32_t max_nonce,
return 0;
}
#endif

View File

@@ -1,7 +1,5 @@
#include "blake2s-gate.h"
#if !defined(BLAKE2S_16WAY) && !defined(BLAKE2S_8WAY) && !defined(BLAKE2S)
#include <string.h>
#include <stdint.h>
@@ -72,4 +70,3 @@ int scanhash_blake2s( struct work *work,
return 0;
}
#endif

View File

@@ -1,7 +1,4 @@
#include "blakecoin-gate.h"
#if !defined(BLAKECOIN_8WAY) && !defined(BLAKECOIN_4WAY)
#define BLAKE32_ROUNDS 8
#include "sph_blake.h"
@@ -96,4 +93,3 @@ int scanhash_blakecoin( struct work *work, uint32_t max_nonce,
return 0;
}
#endif

View File

@@ -153,7 +153,7 @@ bool register_decred_algo( algo_gate_t* gate )
gate->hash = (void*)&decred_hash;
#endif
gate->optimizations = AVX2_OPT;
// gate->get_nonceptr = (void*)&decred_get_nonceptr;
gate->get_nonceptr = (void*)&decred_get_nonceptr;
gate->decode_extra_data = (void*)&decred_decode_extradata;
gate->build_stratum_request = (void*)&decred_be_build_stratum_request;
gate->work_decode = (void*)&std_be_work_decode;

View File

@@ -1,7 +1,4 @@
#include "decred-gate.h"
#if !defined(DECRED_8WAY) && !defined(DECRED_4WAY)
#include "sph_blake.h"
#include <string.h>
@@ -278,5 +275,3 @@ bool register_decred_algo( algo_gate_t* gate )
return true;
}
*/
#endif

View File

@@ -1,7 +1,4 @@
#include "pentablake-gate.h"
#if !defined(PENTABLAKE_8WAY) && !defined(PENTABLAKE_4WAY)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -114,4 +111,3 @@ int scanhash_pentablake( struct work *work, uint32_t max_nonce,
return 0;
}
#endif

View File

@@ -138,7 +138,7 @@ void bmw512_2way_close( bmw512_2way_context *ctx, void *dst );
#if defined(__AVX2__)
// BMW-512 64 bit 4 way
// BMW-512 4 way 64
typedef struct {
__m256i buf[16];
@@ -149,6 +149,7 @@ typedef struct {
typedef bmw_4way_big_context bmw512_4way_context;
void bmw512_4way_init(void *cc);
void bmw512_4way_update(void *cc, const void *data, size_t len);
@@ -163,7 +164,6 @@ void bmw512_4way_addbits_and_close(
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// BMW-512 64 bit 8 way
typedef struct {
__m512i buf[16];
__m512i H[16];
@@ -171,8 +171,6 @@ typedef struct {
uint64_t bit_count;
} bmw512_8way_context __attribute__((aligned(128)));
void bmw512_8way_full( bmw512_8way_context *ctx, void *out, const void *data,
size_t len );
void bmw512_8way_init( bmw512_8way_context *ctx );
void bmw512_8way_update( bmw512_8way_context *ctx, const void *data,
size_t len );

View File

@@ -1507,93 +1507,6 @@ void bmw512_8way_close( bmw512_8way_context *ctx, void *dst )
casti_m512i( dst, u ) = h1[ v ];
}
void bmw512_8way_full( bmw512_8way_context *ctx, void *out, const void *data,
size_t len )
{
__m512i *vdata = (__m512i*)data;
__m512i *buf = ctx->buf;
__m512i htmp[16];
__m512i *H = ctx->H;
__m512i *h2 = htmp;
uint64_t bit_count = len * 8;
size_t ptr = 0;
const int buf_size = 128; // bytes of one lane, compatible with len
// Init
H[ 0] = m512_const1_64( 0x8081828384858687 );
H[ 1] = m512_const1_64( 0x88898A8B8C8D8E8F );
H[ 2] = m512_const1_64( 0x9091929394959697 );
H[ 3] = m512_const1_64( 0x98999A9B9C9D9E9F );
H[ 4] = m512_const1_64( 0xA0A1A2A3A4A5A6A7 );
H[ 5] = m512_const1_64( 0xA8A9AAABACADAEAF );
H[ 6] = m512_const1_64( 0xB0B1B2B3B4B5B6B7 );
H[ 7] = m512_const1_64( 0xB8B9BABBBCBDBEBF );
H[ 8] = m512_const1_64( 0xC0C1C2C3C4C5C6C7 );
H[ 9] = m512_const1_64( 0xC8C9CACBCCCDCECF );
H[10] = m512_const1_64( 0xD0D1D2D3D4D5D6D7 );
H[11] = m512_const1_64( 0xD8D9DADBDCDDDEDF );
H[12] = m512_const1_64( 0xE0E1E2E3E4E5E6E7 );
H[13] = m512_const1_64( 0xE8E9EAEBECEDEEEF );
H[14] = m512_const1_64( 0xF0F1F2F3F4F5F6F7 );
H[15] = m512_const1_64( 0xF8F9FAFBFCFDFEFF );
// Update
while ( len > 0 )
{
size_t clen;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_512( buf + (ptr>>3), vdata, clen >> 3 );
vdata = vdata + (clen>>3);
len -= clen;
ptr += clen;
if ( ptr == buf_size )
{
__m512i *ht;
compress_big_8way( buf, H, h2 );
ht = H;
H = h2;
h2 = ht;
ptr = 0;
}
}
if ( H != ctx->H )
memcpy_512( ctx->H, H, 16 );
// Close
{
__m512i h1[16], h2[16];
size_t u, v;
buf[ ptr>>3 ] = m512_const1_64( 0x80 );
ptr += 8;
if ( ptr > (buf_size - 8) )
{
memset_zero_512( buf + (ptr>>3), (buf_size - ptr) >> 3 );
compress_big_8way( buf, H, h1 );
ptr = 0;
H = h1;
}
memset_zero_512( buf + (ptr>>3), (buf_size - 8 - ptr) >> 3 );
buf[ (buf_size - 8) >> 3 ] = _mm512_set1_epi64( bit_count );
compress_big_8way( buf, H, h2 );
for ( u = 0; u < 16; u ++ )
buf[ u ] = h2[ u ];
compress_big_8way( buf, final_b8, h1 );
for (u = 0, v = 8; u < 8; u ++, v ++)
casti_m512i( out, u ) = h1[ v ];
}
}
#endif // AVX512
#ifdef __cplusplus

View File

@@ -1,7 +1,5 @@
#include "algo-gate-api.h"
#if !defined(BMW512_8WAY) && !defined(BMW512_4WAY)
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
@@ -52,4 +50,4 @@ int scanhash_bmw512( struct work *work, uint32_t max_nonce,
pdata[19] = n;
return 0;
}
#endif

View File

@@ -48,8 +48,6 @@ extern "C"{
#pragma warning (disable: 4146)
#endif
#if !defined(__AVX2__)
static const sph_u32 IV224[] = {
SPH_C32(0x00010203), SPH_C32(0x04050607),
SPH_C32(0x08090A0B), SPH_C32(0x0C0D0E0F),
@@ -72,8 +70,6 @@ static const sph_u32 IV256[] = {
SPH_C32(0x78797A7B), SPH_C32(0x7C7D7E7F)
};
#endif // !AVX2
#if SPH_64
static const sph_u64 IV384[] = {
@@ -139,8 +135,6 @@ static const sph_u64 IV512[] = {
#define M16_30 14, 15, 1, 2, 5, 8, 9
#define M16_31 15, 16, 2, 3, 6, 9, 10
#if !defined(__AVX2__)
#define ss0(x) (((x) >> 1) ^ SPH_T32((x) << 3) \
^ SPH_ROTL32(x, 4) ^ SPH_ROTL32(x, 19))
#define ss1(x) (((x) >> 1) ^ SPH_T32((x) << 2) \
@@ -195,8 +189,6 @@ static const sph_u64 IV512[] = {
#define expand2s_(qf, mf, hf, i16, ix, iy) \
expand2s_inner LPAR qf, mf, hf, i16, ix, iy)
#endif // !AVX2
#if SPH_64
#define sb0(x) (((x) >> 1) ^ SPH_T64((x) << 3) \
@@ -299,8 +291,6 @@ static const sph_u64 Kb_tab[] = {
tt((M(i0) ^ H(i0)) op01 (M(i1) ^ H(i1)) op12 (M(i2) ^ H(i2)) \
op23 (M(i3) ^ H(i3)) op34 (M(i4) ^ H(i4)))
#if !defined(__AVX2__)
#define Ws0 MAKE_W(SPH_T32, 5, -, 7, +, 10, +, 13, +, 14)
#define Ws1 MAKE_W(SPH_T32, 6, -, 8, +, 11, +, 14, -, 15)
#define Ws2 MAKE_W(SPH_T32, 0, +, 7, +, 9, -, 12, +, 15)
@@ -417,8 +407,6 @@ static const sph_u64 Kb_tab[] = {
#define Qs(j) (qt[j])
#endif // !AVX2
#if SPH_64
#define Wb0 MAKE_W(SPH_T64, 5, -, 7, +, 10, +, 13, +, 14)
@@ -569,6 +557,7 @@ static const sph_u64 Kb_tab[] = {
+ ((xl >> 2) ^ qf(22) ^ qf(15))); \
} while (0)
#define FOLDs FOLD(sph_u32, MAKE_Qs, SPH_T32, SPH_ROTL32, M, Qs, dH)
#if SPH_64
@@ -576,10 +565,6 @@ static const sph_u64 Kb_tab[] = {
#endif
#if !defined(__AVX2__)
#define FOLDs FOLD(sph_u32, MAKE_Qs, SPH_T32, SPH_ROTL32, M, Qs, dH)
static void
compress_small(const unsigned char *data, const sph_u32 h[16], sph_u32 dh[16])
{
@@ -726,8 +711,6 @@ bmw32_close(sph_bmw_small_context *sc, unsigned ub, unsigned n,
sph_enc32le(out + 4 * u, h1[v]);
}
#endif // !AVX2
#if SPH_64
static void
@@ -857,8 +840,6 @@ bmw64_close(sph_bmw_big_context *sc, unsigned ub, unsigned n,
#endif
#if !defined(__AVX2__)
/* see sph_bmw.h */
void
sph_bmw224_init(void *cc)
@@ -917,8 +898,6 @@ sph_bmw256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
// sph_bmw256_init(cc);
}
#endif // !AVX2
#if SPH_64
/* see sph_bmw.h */

View File

@@ -77,9 +77,6 @@ extern "C"{
* computation can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
#if !defined(__AVX2__)
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[64]; /* first field, for alignment */
@@ -105,8 +102,6 @@ typedef sph_bmw_small_context sph_bmw224_context;
*/
typedef sph_bmw_small_context sph_bmw256_context;
#endif // !AVX2
#if SPH_64
/**
@@ -142,8 +137,6 @@ typedef sph_bmw_big_context sph_bmw512_context;
#endif
#if !defined(__AVX2__)
/**
* Initialize a BMW-224 context. This process performs no memory allocation.
*
@@ -234,8 +227,6 @@ void sph_bmw256_close(void *cc, void *dst);
void sph_bmw256_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#endif // !AVX2
#if SPH_64
/**

View File

@@ -0,0 +1,368 @@
// Copyright (c) 2012-2013 The Cryptonote developers
// Distributed under the MIT/X11 software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#include "algo-gate-api.h"
#if defined(__arm__) || defined(_MSC_VER)
#ifndef NOASM
#define NOASM
#endif
#endif
#include "crypto/oaes_lib.h"
#include "crypto/c_keccak.h"
#include "crypto/c_groestl.h"
#include "crypto/c_blake256.h"
#include "crypto/c_jh.h"
#include "crypto/c_skein.h"
#include "crypto/int-util.h"
#include "crypto/hash-ops.h"
#if USE_INT128
#if __GNUC__ == 4 && __GNUC_MINOR__ >= 4 && __GNUC_MINOR__ < 6
typedef unsigned int uint128_t __attribute__ ((__mode__ (TI)));
#elif defined (_MSC_VER)
/* only for mingw64 on windows */
#undef USE_INT128
#define USE_INT128 (0)
#else
typedef __uint128_t uint128_t;
#endif
#endif
#define LITE 1
#if LITE /* cryptonight-light */
#define MEMORY (1 << 20) /* 1 MiB */
#define ITER (1 << 19)
#else
#define MEMORY (1 << 21) /* 2 MiB */
#define ITER (1 << 20)
#endif
#define AES_BLOCK_SIZE 16
#define AES_KEY_SIZE 32 /*16*/
#define INIT_SIZE_BLK 8
#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE)
#pragma pack(push, 1)
union cn_slow_hash_state {
union hash_state hs;
struct {
uint8_t k[64];
uint8_t init[INIT_SIZE_BYTE];
};
};
#pragma pack(pop)
static void do_blake_hash(const void* input, size_t len, char* output) {
blake256_hash((uint8_t*)output, input, len);
}
static void do_groestl_hash(const void* input, size_t len, char* output) {
groestl(input, len * 8, (uint8_t*)output);
}
static void do_jh_hash(const void* input, size_t len, char* output) {
int r = jh_hash(HASH_SIZE * 8, input, 8 * len, (uint8_t*)output);
assert(likely(SUCCESS == r));
}
static void do_skein_hash(const void* input, size_t len, char* output) {
int r = skein_hash(8 * HASH_SIZE, input, 8 * len, (uint8_t*)output);
assert(likely(SKEIN_SUCCESS == r));
}
extern int aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey);
extern int aesb_pseudo_round_mut(uint8_t *val, uint8_t *expandedKey);
#if !defined(_MSC_VER) && !defined(NOASM)
extern int fast_aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey);
extern int fast_aesb_pseudo_round_mut(uint8_t *val, uint8_t *expandedKey);
#else
#define fast_aesb_single_round aesb_single_round
#define fast_aesb_pseudo_round_mut aesb_pseudo_round_mut
#endif
#if defined(NOASM) || !defined(__x86_64__)
static uint64_t mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi) {
// multiplier = ab = a * 2^32 + b
// multiplicand = cd = c * 2^32 + d
// ab * cd = a * c * 2^64 + (a * d + b * c) * 2^32 + b * d
uint64_t a = hi_dword(multiplier);
uint64_t b = lo_dword(multiplier);
uint64_t c = hi_dword(multiplicand);
uint64_t d = lo_dword(multiplicand);
uint64_t ac = a * c;
uint64_t ad = a * d;
uint64_t bc = b * c;
uint64_t bd = b * d;
uint64_t adbc = ad + bc;
uint64_t adbc_carry = adbc < ad ? 1 : 0;
// multiplier * multiplicand = product_hi * 2^64 + product_lo
uint64_t product_lo = bd + (adbc << 32);
uint64_t product_lo_carry = product_lo < bd ? 1 : 0;
*product_hi = ac + (adbc >> 32) + (adbc_carry << 32) + product_lo_carry;
assert(ac <= *product_hi);
return product_lo;
}
#else
extern uint64_t mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi);
#endif
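A quick sanity check of the fallback's semantics (illustrative, not part of the
source): 2^32 * 2^32 = 2^64 overflows 64 bits, so the product lands entirely in
the high word:

    uint64_t hi;
    uint64_t lo = mul128( 1ULL << 32, 1ULL << 32, &hi );
    // lo == 0, hi == 1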
static void (* const extra_hashes[4])(const void *, size_t, char *) = {
do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash
};
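// e2i below maps an AES block to a 16-byte-aligned scratchpad offset; the
// mask keeps the index inside MEMORY (0xFFFF0 for the 1 MiB light variant,
// 0x1FFFF0 for the full 2 MiB scratchpad).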
static inline size_t e2i(const uint8_t* a) {
#if !LITE
return ((uint32_t *)a)[0] & 0x1FFFF0;
#else
return ((uint32_t *)a)[0] & 0xFFFF0;
#endif
}
static inline void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst) {
uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1];
hi += ((uint64_t*) c)[0];
((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi;
((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo;
((uint64_t*) dst)[0] = hi;
((uint64_t*) dst)[1] = lo;
}
static inline void xor_blocks(uint8_t* a, const uint8_t* b) {
#if USE_INT128
*((uint128_t*) a) ^= *((uint128_t*) b);
#else
((uint64_t*) a)[0] ^= ((uint64_t*) b)[0];
((uint64_t*) a)[1] ^= ((uint64_t*) b)[1];
#endif
}
static inline void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) {
#if USE_INT128
*((uint128_t*) dst) = *((uint128_t*) a) ^ *((uint128_t*) b);
#else
((uint64_t*) dst)[0] = ((uint64_t*) a)[0] ^ ((uint64_t*) b)[0];
((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1];
#endif
}
struct cryptonight_ctx {
uint8_t _ALIGN(16) long_state[MEMORY];
union cn_slow_hash_state state;
uint8_t _ALIGN(16) text[INIT_SIZE_BYTE];
uint8_t _ALIGN(16) a[AES_BLOCK_SIZE];
uint8_t _ALIGN(16) b[AES_BLOCK_SIZE];
uint8_t _ALIGN(16) c[AES_BLOCK_SIZE];
oaes_ctx* aes_ctx;
};
static void cryptolight_hash_ctx(void* output, const void* input, int len, struct cryptonight_ctx* ctx)
{
len = 76; // override the len arg, always hash the 76 byte blockheader
hash_process(&ctx->state.hs, (const uint8_t*) input, len);
ctx->aes_ctx = (oaes_ctx*) oaes_alloc();
size_t i, j;
memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE);
for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 0], ctx->aes_ctx->key->exp_data);
aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 1], ctx->aes_ctx->key->exp_data);
aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 2], ctx->aes_ctx->key->exp_data);
aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 3], ctx->aes_ctx->key->exp_data);
aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 4], ctx->aes_ctx->key->exp_data);
aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 5], ctx->aes_ctx->key->exp_data);
aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 6], ctx->aes_ctx->key->exp_data);
aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 7], ctx->aes_ctx->key->exp_data);
memcpy(&ctx->long_state[i], ctx->text, INIT_SIZE_BYTE);
}
xor_blocks_dst(&ctx->state.k[0], &ctx->state.k[32], ctx->a);
xor_blocks_dst(&ctx->state.k[16], &ctx->state.k[48], ctx->b);
for (i = 0; likely(i < ITER / 4); ++i) {
/* Dependency chain: address -> read value ------+
* written value <-+ hard function (AES or MUL) <+
* next address <-+
*/
/* Iteration 1 */
j = e2i(ctx->a);
aesb_single_round(&ctx->long_state[j], ctx->c, ctx->a);
xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j]);
/* Iteration 2 */
mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c)]);
/* Iteration 3 */
j = e2i(ctx->a);
aesb_single_round(&ctx->long_state[j], ctx->b, ctx->a);
xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[j]);
/* Iteration 4 */
mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b)]);
}
memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
oaes_key_import_data(ctx->aes_ctx, &ctx->state.hs.b[32], AES_KEY_SIZE);
for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
xor_blocks(&ctx->text[0 * AES_BLOCK_SIZE], &ctx->long_state[i + 0 * AES_BLOCK_SIZE]);
aesb_pseudo_round_mut(&ctx->text[0 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
xor_blocks(&ctx->text[1 * AES_BLOCK_SIZE], &ctx->long_state[i + 1 * AES_BLOCK_SIZE]);
aesb_pseudo_round_mut(&ctx->text[1 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
xor_blocks(&ctx->text[2 * AES_BLOCK_SIZE], &ctx->long_state[i + 2 * AES_BLOCK_SIZE]);
aesb_pseudo_round_mut(&ctx->text[2 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
xor_blocks(&ctx->text[3 * AES_BLOCK_SIZE], &ctx->long_state[i + 3 * AES_BLOCK_SIZE]);
aesb_pseudo_round_mut(&ctx->text[3 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
xor_blocks(&ctx->text[4 * AES_BLOCK_SIZE], &ctx->long_state[i + 4 * AES_BLOCK_SIZE]);
aesb_pseudo_round_mut(&ctx->text[4 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
xor_blocks(&ctx->text[5 * AES_BLOCK_SIZE], &ctx->long_state[i + 5 * AES_BLOCK_SIZE]);
aesb_pseudo_round_mut(&ctx->text[5 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
xor_blocks(&ctx->text[6 * AES_BLOCK_SIZE], &ctx->long_state[i + 6 * AES_BLOCK_SIZE]);
aesb_pseudo_round_mut(&ctx->text[6 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
xor_blocks(&ctx->text[7 * AES_BLOCK_SIZE], &ctx->long_state[i + 7 * AES_BLOCK_SIZE]);
aesb_pseudo_round_mut(&ctx->text[7 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
}
memcpy(ctx->state.init, ctx->text, INIT_SIZE_BYTE);
hash_permutation(&ctx->state.hs);
/*memcpy(hash, &state, 32);*/
extra_hashes[ctx->state.hs.b[0] & 3](&ctx->state, 200, output);
oaes_free((OAES_CTX **) &ctx->aes_ctx);
}
void cryptolight_hash(void* output, const void* input, int len) {
struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx));
cryptolight_hash_ctx(output, input, len, ctx);
free(ctx);
}
#if defined(__AES__)
static void cryptolight_hash_ctx_aes_ni(void* output, const void* input,
int len, struct cryptonight_ctx* ctx)
{
hash_process(&ctx->state.hs, (const uint8_t*)input, len);
ctx->aes_ctx = (oaes_ctx*) oaes_alloc();
size_t i, j;
memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE);
for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
fast_aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 0], ctx->aes_ctx->key->exp_data);
fast_aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 1], ctx->aes_ctx->key->exp_data);
fast_aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 2], ctx->aes_ctx->key->exp_data);
fast_aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 3], ctx->aes_ctx->key->exp_data);
fast_aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 4], ctx->aes_ctx->key->exp_data);
fast_aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 5], ctx->aes_ctx->key->exp_data);
fast_aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 6], ctx->aes_ctx->key->exp_data);
fast_aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * 7], ctx->aes_ctx->key->exp_data);
memcpy(&ctx->long_state[i], ctx->text, INIT_SIZE_BYTE);
}
xor_blocks_dst(&ctx->state.k[0], &ctx->state.k[32], ctx->a);
xor_blocks_dst(&ctx->state.k[16], &ctx->state.k[48], ctx->b);
for (i = 0; likely(i < ITER / 4); ++i) {
/* Dependency chain: address -> read value ------+
* written value <-+ hard function (AES or MUL) <+
* next address <-+
*/
/* Iteration 1 */
j = e2i(ctx->a);
fast_aesb_single_round(&ctx->long_state[j], ctx->c, ctx->a);
xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j]);
/* Iteration 2 */
mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c)]);
/* Iteration 3 */
j = e2i(ctx->a);
fast_aesb_single_round(&ctx->long_state[j], ctx->b, ctx->a);
xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[j]);
/* Iteration 4 */
mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b)]);
}
memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
oaes_key_import_data(ctx->aes_ctx, &ctx->state.hs.b[32], AES_KEY_SIZE);
for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
xor_blocks(&ctx->text[0 * AES_BLOCK_SIZE], &ctx->long_state[i + 0 * AES_BLOCK_SIZE]);
fast_aesb_pseudo_round_mut(&ctx->text[0 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
xor_blocks(&ctx->text[1 * AES_BLOCK_SIZE], &ctx->long_state[i + 1 * AES_BLOCK_SIZE]);
fast_aesb_pseudo_round_mut(&ctx->text[1 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
xor_blocks(&ctx->text[2 * AES_BLOCK_SIZE], &ctx->long_state[i + 2 * AES_BLOCK_SIZE]);
fast_aesb_pseudo_round_mut(&ctx->text[2 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
xor_blocks(&ctx->text[3 * AES_BLOCK_SIZE], &ctx->long_state[i + 3 * AES_BLOCK_SIZE]);
fast_aesb_pseudo_round_mut(&ctx->text[3 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
xor_blocks(&ctx->text[4 * AES_BLOCK_SIZE], &ctx->long_state[i + 4 * AES_BLOCK_SIZE]);
fast_aesb_pseudo_round_mut(&ctx->text[4 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
xor_blocks(&ctx->text[5 * AES_BLOCK_SIZE], &ctx->long_state[i + 5 * AES_BLOCK_SIZE]);
fast_aesb_pseudo_round_mut(&ctx->text[5 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
xor_blocks(&ctx->text[6 * AES_BLOCK_SIZE], &ctx->long_state[i + 6 * AES_BLOCK_SIZE]);
fast_aesb_pseudo_round_mut(&ctx->text[6 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
xor_blocks(&ctx->text[7 * AES_BLOCK_SIZE], &ctx->long_state[i + 7 * AES_BLOCK_SIZE]);
fast_aesb_pseudo_round_mut(&ctx->text[7 * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
}
memcpy(ctx->state.init, ctx->text, INIT_SIZE_BYTE);
hash_permutation(&ctx->state.hs);
/*memcpy(hash, &state, 32);*/
extra_hashes[ctx->state.hs.b[0] & 3](&ctx->state, 200, output);
oaes_free((OAES_CTX **) &ctx->aes_ctx);
}
#endif
int scanhash_cryptolight( struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t *nonceptr = (uint32_t*) (((char*)pdata) + 39);
uint32_t n = *nonceptr - 1;
const uint32_t first_nonce = n + 1;
//const uint32_t Htarg = ptarget[7];
uint32_t _ALIGN(32) hash[HASH_SIZE / 4];
int thr_id = mythr->id;
struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx));
#if defined(__AES__)
do {
*nonceptr = ++n;
cryptolight_hash_ctx_aes_ni(hash, pdata, 76, ctx);
if (unlikely(hash[7] < ptarget[7])) {
*hashes_done = n - first_nonce + 1;
free(ctx);
return true;
}
} while (likely((n <= max_nonce && !work_restart[thr_id].restart)));
#else
do {
*nonceptr = ++n;
cryptolight_hash_ctx(hash, pdata, 76, ctx);
if (unlikely(hash[7] < ptarget[7])) {
*hashes_done = n - first_nonce + 1;
free(ctx);
return true;
}
} while (likely((n <= max_nonce && !work_restart[thr_id].restart)));
#endif
free(ctx);
*hashes_done = n - first_nonce + 1;
return 0;
}
bool register_cryptolight_algo( algo_gate_t* gate )
{
register_json_rpc2( gate );
gate->optimizations = SSE2_OPT | AES_OPT;
gate->scanhash = (void*)&scanhash_cryptolight;
gate->hash = (void*)&cryptolight_hash;
gate->hash_suw = (void*)&cryptolight_hash;
return true;
};

View File

@@ -0,0 +1,357 @@
#if defined(__AES__)
#include <x86intrin.h>
#include <memory.h>
#include "cryptonight.h"
#include "miner.h"
#include "crypto/c_keccak.h"
#include <immintrin.h>
static inline void ExpandAESKey256_sub1(__m128i *tmp1, __m128i *tmp2)
{
__m128i tmp4;
*tmp2 = _mm_shuffle_epi32(*tmp2, 0xFF);
tmp4 = _mm_slli_si128(*tmp1, 0x04);
*tmp1 = _mm_xor_si128(*tmp1, tmp4);
tmp4 = _mm_slli_si128(tmp4, 0x04);
*tmp1 = _mm_xor_si128(*tmp1, tmp4);
tmp4 = _mm_slli_si128(tmp4, 0x04);
*tmp1 = _mm_xor_si128(*tmp1, tmp4);
*tmp1 = _mm_xor_si128(*tmp1, *tmp2);
}
static inline void ExpandAESKey256_sub2(__m128i *tmp1, __m128i *tmp3)
{
__m128i tmp2, tmp4;
tmp4 = _mm_aeskeygenassist_si128(*tmp1, 0x00);
tmp2 = _mm_shuffle_epi32(tmp4, 0xAA);
tmp4 = _mm_slli_si128(*tmp3, 0x04);
*tmp3 = _mm_xor_si128(*tmp3, tmp4);
tmp4 = _mm_slli_si128(tmp4, 0x04);
*tmp3 = _mm_xor_si128(*tmp3, tmp4);
tmp4 = _mm_slli_si128(tmp4, 0x04);
*tmp3 = _mm_xor_si128(*tmp3, tmp4);
*tmp3 = _mm_xor_si128(*tmp3, tmp2);
}
// Special thanks to Intel for helping me
// with ExpandAESKey256() and its subroutines
static inline void ExpandAESKey256(char *keybuf)
{
__m128i tmp1, tmp2, tmp3, *keys;
keys = (__m128i *)keybuf;
tmp1 = _mm_load_si128((__m128i *)keybuf);
tmp3 = _mm_load_si128((__m128i *)(keybuf+0x10));
tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x01);
ExpandAESKey256_sub1(&tmp1, &tmp2);
keys[2] = tmp1;
ExpandAESKey256_sub2(&tmp1, &tmp3);
keys[3] = tmp3;
tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x02);
ExpandAESKey256_sub1(&tmp1, &tmp2);
keys[4] = tmp1;
ExpandAESKey256_sub2(&tmp1, &tmp3);
keys[5] = tmp3;
tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x04);
ExpandAESKey256_sub1(&tmp1, &tmp2);
keys[6] = tmp1;
ExpandAESKey256_sub2(&tmp1, &tmp3);
keys[7] = tmp3;
tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x08);
ExpandAESKey256_sub1(&tmp1, &tmp2);
keys[8] = tmp1;
ExpandAESKey256_sub2(&tmp1, &tmp3);
keys[9] = tmp3;
tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x10);
ExpandAESKey256_sub1(&tmp1, &tmp2);
keys[10] = tmp1;
ExpandAESKey256_sub2(&tmp1, &tmp3);
keys[11] = tmp3;
tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x20);
ExpandAESKey256_sub1(&tmp1, &tmp2);
keys[12] = tmp1;
ExpandAESKey256_sub2(&tmp1, &tmp3);
keys[13] = tmp3;
tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x40);
ExpandAESKey256_sub1(&tmp1, &tmp2);
keys[14] = tmp1;
}
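Usage pattern from cryptonight_hash_aes below: the caller copies the raw
32-byte key into the first two 16-byte slots of the buffer, then the expansion
derives the remaining round keys in place (key32 here is a hypothetical name
for the raw key):

    uint8_t ExpandedKey[256] __attribute__((aligned(64)));
    memcpy( ExpandedKey, key32, AES_KEY_SIZE );   // round keys 0-1 hold the raw key
    ExpandAESKey256( ExpandedKey );               // derives round keys 2-14 in place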
// align to 64 byte cache line
typedef struct
{
uint8_t long_state[MEMORY] __attribute((aligned(64)));
union cn_slow_hash_state state;
uint8_t text[INIT_SIZE_BYTE] __attribute((aligned(64)));
uint64_t a[AES_BLOCK_SIZE >> 3] __attribute__((aligned(64)));
uint64_t b[AES_BLOCK_SIZE >> 3] __attribute__((aligned(64)));
uint8_t c[AES_BLOCK_SIZE] __attribute__((aligned(64)));
} cryptonight_ctx;
static __thread cryptonight_ctx ctx;
void cryptonight_hash_aes( void *restrict output, const void *input, int len )
{
uint8_t ExpandedKey[256] __attribute__((aligned(64)));
__m128i *longoutput, *expkey, *xmminput;
size_t i, j;
keccak( (const uint8_t*)input, 76, (char*)&ctx.state.hs.b, 200 );
if ( cryptonightV7 && len < 43 )
return;
const uint64_t tweak = cryptonightV7
? *((const uint64_t*) (((const uint8_t*)input) + 35))
^ ctx.state.hs.w[24] : 0;
memcpy( ExpandedKey, ctx.state.hs.b, AES_KEY_SIZE );
ExpandAESKey256( ExpandedKey );
memcpy( ctx.text, ctx.state.init, INIT_SIZE_BYTE );
longoutput = (__m128i*)ctx.long_state;
xmminput = (__m128i*)ctx.text;
expkey = (__m128i*)ExpandedKey;
// prefetch expkey, xmminput and enough longoutput for 4 iterations
_mm_prefetch( xmminput, _MM_HINT_T0 );
_mm_prefetch( xmminput + 4, _MM_HINT_T0 );
_mm_prefetch( expkey, _MM_HINT_T0 );
_mm_prefetch( expkey + 4, _MM_HINT_T0 );
_mm_prefetch( expkey + 8, _MM_HINT_T0 );
for ( i = 0; i < 64; i += 16 )
{
__builtin_prefetch( longoutput + i, 1, 0 );
__builtin_prefetch( longoutput + i + 4, 1, 0 );
__builtin_prefetch( longoutput + i + 8, 1, 0 );
__builtin_prefetch( longoutput + i + 12, 1, 0 );
}
// n-4 iterations
for ( i = 0; likely( i < MEMORY_M128I - 4*INIT_SIZE_M128I );
i += INIT_SIZE_M128I )
{
// prefetch 4 iterations ahead.
__builtin_prefetch( longoutput + i + 64, 1, 0 );
__builtin_prefetch( longoutput + i + 68, 1, 0 );
for ( j = 0; j < 10; j++ )
{
xmminput[0] = _mm_aesenc_si128( xmminput[0], expkey[j] );
xmminput[1] = _mm_aesenc_si128( xmminput[1], expkey[j] );
xmminput[2] = _mm_aesenc_si128( xmminput[2], expkey[j] );
xmminput[3] = _mm_aesenc_si128( xmminput[3], expkey[j] );
xmminput[4] = _mm_aesenc_si128( xmminput[4], expkey[j] );
xmminput[5] = _mm_aesenc_si128( xmminput[5], expkey[j] );
xmminput[6] = _mm_aesenc_si128( xmminput[6], expkey[j] );
xmminput[7] = _mm_aesenc_si128( xmminput[7], expkey[j] );
}
_mm_store_si128( &( longoutput[i ] ), xmminput[0] );
_mm_store_si128( &( longoutput[i+1] ), xmminput[1] );
_mm_store_si128( &( longoutput[i+2] ), xmminput[2] );
_mm_store_si128( &( longoutput[i+3] ), xmminput[3] );
_mm_store_si128( &( longoutput[i+4] ), xmminput[4] );
_mm_store_si128( &( longoutput[i+5] ), xmminput[5] );
_mm_store_si128( &( longoutput[i+6] ), xmminput[6] );
_mm_store_si128( &( longoutput[i+7] ), xmminput[7] );
}
// last 4 iterations
for ( ; likely( i < MEMORY_M128I ); i += INIT_SIZE_M128I )
{
for ( j = 0; j < 10; j++ )
{
xmminput[0] = _mm_aesenc_si128( xmminput[0], expkey[j] );
xmminput[1] = _mm_aesenc_si128( xmminput[1], expkey[j] );
xmminput[2] = _mm_aesenc_si128( xmminput[2], expkey[j] );
xmminput[3] = _mm_aesenc_si128( xmminput[3], expkey[j] );
xmminput[4] = _mm_aesenc_si128( xmminput[4], expkey[j] );
xmminput[5] = _mm_aesenc_si128( xmminput[5], expkey[j] );
xmminput[6] = _mm_aesenc_si128( xmminput[6], expkey[j] );
xmminput[7] = _mm_aesenc_si128( xmminput[7], expkey[j] );
}
_mm_store_si128( &( longoutput[i ] ), xmminput[0] );
_mm_store_si128( &( longoutput[i+1] ), xmminput[1] );
_mm_store_si128( &( longoutput[i+2] ), xmminput[2] );
_mm_store_si128( &( longoutput[i+3] ), xmminput[3] );
_mm_store_si128( &( longoutput[i+4] ), xmminput[4] );
_mm_store_si128( &( longoutput[i+5] ), xmminput[5] );
_mm_store_si128( &( longoutput[i+6] ), xmminput[6] );
_mm_store_si128( &( longoutput[i+7] ), xmminput[7] );
}
ctx.a[0] = ((uint64_t *)ctx.state.k)[0] ^ ((uint64_t *)ctx.state.k)[4];
ctx.b[0] = ((uint64_t *)ctx.state.k)[2] ^ ((uint64_t *)ctx.state.k)[6];
ctx.a[1] = ((uint64_t *)ctx.state.k)[1] ^ ((uint64_t *)ctx.state.k)[5];
ctx.b[1] = ((uint64_t *)ctx.state.k)[3] ^ ((uint64_t *)ctx.state.k)[7];
uint64_t a[2] __attribute((aligned(16))),
b[2] __attribute((aligned(16))),
c[2] __attribute((aligned(16)));
a[0] = ctx.a[0];
a[1] = ctx.a[1];
__m128i b_x = _mm_load_si128( (__m128i*)ctx.b );
__m128i a_x = _mm_load_si128( (__m128i*)a );
__m128i* lsa = (__m128i*)&ctx.long_state[ a[0] & 0x1FFFF0 ];
__m128i c_x = _mm_load_si128( lsa );
uint64_t *nextblock;
uint64_t hi, lo;
// n-1 iterations
for( i = 0; __builtin_expect( i < 0x7ffff, 1 ); i++ )
{
c_x = _mm_aesenc_si128( c_x, a_x );
_mm_store_si128( (__m128i*)c, c_x );
b_x = _mm_xor_si128( b_x, c_x );
nextblock = (uint64_t *)&ctx.long_state[c[0] & 0x1FFFF0];
_mm_store_si128( lsa, b_x );
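// CryptonightV7 (variant 1) tweak: XOR bits 4-5 of byte 11 of the block just
// written back, using a small table keyed on bits 0, 4 and 5 of that byte.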
if ( cryptonightV7 )
{
const uint8_t tmp = ( (const uint8_t*)(lsa) )[11];
const uint8_t index = ( ( (tmp >> 3) & 6 ) | (tmp & 1) ) << 1;
((uint8_t*)(lsa))[11] = tmp ^ ( ( 0x75310 >> index) & 0x30 );
}
b[0] = nextblock[0];
b[1] = nextblock[1];
// hi,lo = 64bit x 64bit multiply of c[0] and b[0]
__asm__( "mulq %3\n\t"
: "=d" ( hi ),
"=a" ( lo )
: "%a" ( c[0] ),
"rm" ( b[0] )
: "cc" );
b_x = c_x;
a[0] += hi;
a[1] += lo;
nextblock[0] = a[0];
nextblock[1] = cryptonightV7 ? a[1] ^ tweak : a[1];
a[0] ^= b[0];
a[1] ^= b[1];
lsa = (__m128i*)&ctx.long_state[ a[0] & 0x1FFFF0 ];
a_x = _mm_load_si128( (__m128i*)a );
c_x = _mm_load_si128( lsa );
}
// abbreviated nth iteration
c_x = _mm_aesenc_si128( c_x, a_x );
_mm_store_si128( (__m128i*)c, c_x );
b_x = _mm_xor_si128( b_x, c_x );
nextblock = (uint64_t *)&ctx.long_state[c[0] & 0x1FFFF0];
_mm_store_si128( lsa, b_x );
if ( cryptonightV7 )
{
const uint8_t tmp = ( (const uint8_t*)(lsa) )[11];
const uint8_t index = ( ( (tmp >> 3) & 6 ) | (tmp & 1) ) << 1;
((uint8_t*)(lsa))[11] = tmp ^ ( ( 0x75310 >> index) & 0x30 );
}
b[0] = nextblock[0];
b[1] = nextblock[1];
__asm__( "mulq %3\n\t"
: "=d" ( hi ),
"=a" ( lo )
: "%a" ( c[0] ),
"rm" ( b[0] )
: "cc" );
a[0] += hi;
a[1] += lo;
nextblock[0] = a[0];
nextblock[1] = cryptonightV7 ? a[1] ^ tweak : a[1];
a[0] ^= b[0];
a[1] ^= b[1];
memcpy( ExpandedKey, &ctx.state.hs.b[32], AES_KEY_SIZE );
ExpandAESKey256( ExpandedKey );
memcpy( ctx.text, ctx.state.init, INIT_SIZE_BYTE );
// prefetch expkey, all of xmminput and enough longoutput for 4 loops
_mm_prefetch( xmminput, _MM_HINT_T0 );
_mm_prefetch( xmminput + 4, _MM_HINT_T0 );
for ( i = 0; i < 64; i += 16 )
{
_mm_prefetch( longoutput + i, _MM_HINT_T0 );
_mm_prefetch( longoutput + i + 4, _MM_HINT_T0 );
_mm_prefetch( longoutput + i + 8, _MM_HINT_T0 );
_mm_prefetch( longoutput + i + 12, _MM_HINT_T0 );
}
_mm_prefetch( expkey, _MM_HINT_T0 );
_mm_prefetch( expkey + 4, _MM_HINT_T0 );
_mm_prefetch( expkey + 8, _MM_HINT_T0 );
// n-4 iterations
for ( i = 0; likely( i < MEMORY_M128I - 4*INIT_SIZE_M128I );
i += INIT_SIZE_M128I )
{
// stay 4 iterations ahead.
_mm_prefetch( longoutput + i + 64, _MM_HINT_T0 );
_mm_prefetch( longoutput + i + 68, _MM_HINT_T0 );
xmminput[0] = _mm_xor_si128( longoutput[i ], xmminput[0] );
xmminput[1] = _mm_xor_si128( longoutput[i+1], xmminput[1] );
xmminput[2] = _mm_xor_si128( longoutput[i+2], xmminput[2] );
xmminput[3] = _mm_xor_si128( longoutput[i+3], xmminput[3] );
xmminput[4] = _mm_xor_si128( longoutput[i+4], xmminput[4] );
xmminput[5] = _mm_xor_si128( longoutput[i+5], xmminput[5] );
xmminput[6] = _mm_xor_si128( longoutput[i+6], xmminput[6] );
xmminput[7] = _mm_xor_si128( longoutput[i+7], xmminput[7] );
for( j = 0; j < 10; j++ )
{
xmminput[0] = _mm_aesenc_si128( xmminput[0], expkey[j] );
xmminput[1] = _mm_aesenc_si128( xmminput[1], expkey[j] );
xmminput[2] = _mm_aesenc_si128( xmminput[2], expkey[j] );
xmminput[3] = _mm_aesenc_si128( xmminput[3], expkey[j] );
xmminput[4] = _mm_aesenc_si128( xmminput[4], expkey[j] );
xmminput[5] = _mm_aesenc_si128( xmminput[5], expkey[j] );
xmminput[6] = _mm_aesenc_si128( xmminput[6], expkey[j] );
xmminput[7] = _mm_aesenc_si128( xmminput[7], expkey[j] );
}
}
// last 4 iterations
for ( ; likely( i < MEMORY_M128I ); i += INIT_SIZE_M128I )
{
xmminput[0] = _mm_xor_si128( longoutput[i ], xmminput[0] );
xmminput[1] = _mm_xor_si128( longoutput[i+1], xmminput[1] );
xmminput[2] = _mm_xor_si128( longoutput[i+2], xmminput[2] );
xmminput[3] = _mm_xor_si128( longoutput[i+3], xmminput[3] );
xmminput[4] = _mm_xor_si128( longoutput[i+4], xmminput[4] );
xmminput[5] = _mm_xor_si128( longoutput[i+5], xmminput[5] );
xmminput[6] = _mm_xor_si128( longoutput[i+6], xmminput[6] );
xmminput[7] = _mm_xor_si128( longoutput[i+7], xmminput[7] );
for( j = 0; j < 10; j++ )
{
xmminput[0] = _mm_aesenc_si128( xmminput[0], expkey[j] );
xmminput[1] = _mm_aesenc_si128( xmminput[1], expkey[j] );
xmminput[2] = _mm_aesenc_si128( xmminput[2], expkey[j] );
xmminput[3] = _mm_aesenc_si128( xmminput[3], expkey[j] );
xmminput[4] = _mm_aesenc_si128( xmminput[4], expkey[j] );
xmminput[5] = _mm_aesenc_si128( xmminput[5], expkey[j] );
xmminput[6] = _mm_aesenc_si128( xmminput[6], expkey[j] );
xmminput[7] = _mm_aesenc_si128( xmminput[7], expkey[j] );
}
}
memcpy( ctx.state.init, ctx.text, INIT_SIZE_BYTE);
keccakf( (uint64_t*)&ctx.state.hs.w, 24 );
extra_hashes[ctx.state.hs.b[0] & 3](&ctx.state, 200, output);
}
#endif
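The 64x64->128-bit multiply in the main loop above is the only inline asm in this path. For reference on other compilers, a minimal portable sketch of the same operation, assuming the GCC/Clang unsigned __int128 extension (the helper name is hypothetical):

#include <stdint.h>

// Portable equivalent of the mulq asm used above: returns the low 64 bits
// of the full product of c0 and b0 and writes the high 64 bits through *hi,
// matching the rdx:rax outputs of mulq.
static inline uint64_t mul64x64_128( uint64_t c0, uint64_t b0, uint64_t *hi )
{
   unsigned __int128 p = (unsigned __int128)c0 * b0;
   *hi = (uint64_t)( p >> 64 );
   return (uint64_t)p;
}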

View File

@@ -0,0 +1,127 @@
// Copyright (c) 2012-2013 The Cryptonote developers
// Distributed under the MIT/X11 software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
// Modified for CPUminer by Lucas Jones
#include "cpuminer-config.h"
#include "algo-gate-api.h"
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl256.h"
#else
#include "crypto/c_groestl.h"
#endif
#include "crypto/c_blake256.h"
#include "crypto/c_jh.h"
#include "crypto/c_skein.h"
#include "cryptonight.h"
/*
#if defined __unix__ && (!defined __APPLE__)
#include <sys/mman.h>
#elif defined _WIN32
#include <windows.h>
#endif
*/
void do_blake_hash(const void* input, size_t len, char* output) {
blake256_hash((uint8_t*)output, input, len);
}
void do_groestl_hash(const void* input, size_t len, char* output) {
#if defined(__AES__)
hashState_groestl256 ctx;
init_groestl256( &ctx, 32 );
update_and_final_groestl256( &ctx, output, input, len * 8 );
#else
groestl(input, len * 8, (uint8_t*)output);
#endif
}
void do_jh_hash(const void* input, size_t len, char* output) {
jh_hash(32 * 8, input, 8 * len, (uint8_t*)output);
}
void do_skein_hash(const void* input, size_t len, char* output) {
skein_hash(8 * 32, input, 8 * len, (uint8_t*)output);
}
void (* const extra_hashes[4])( const void *, size_t, char *) =
{ do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash };
void cryptonight_hash( void *restrict output, const void *input, int len )
{
#if defined(__AES__)
cryptonight_hash_aes( output, input, len );
#else
cryptonight_hash_ctx ( output, input, len );
#endif
}
void cryptonight_hash_suw( void *restrict output, const void *input )
{
#if defined(__AES__)
cryptonight_hash_aes( output, input, 76 );
#else
cryptonight_hash_ctx ( output, input, 76 );
#endif
}
bool cryptonightV7 = false;
int scanhash_cryptonight( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
int thr_id = mythr->id;
uint32_t *nonceptr = (uint32_t*) (((char*)pdata) + 39);
uint32_t n = *nonceptr - 1;
const uint32_t first_nonce = n + 1;
const uint32_t Htarg = ptarget[7];
uint32_t hash[32 / 4] __attribute__((aligned(32)));
// if ( ( cryptonightV7 && ( *(uint8_t*)pdata < 7 ) )
// || ( !cryptonightV7 && ( *(uint8_t*)pdata == 7 ) ) )
// applog(LOG_WARNING,"Cryptonight variant mismatch, shares may be rejected.");
do
{
*nonceptr = ++n;
cryptonight_hash( hash, pdata, 76 );
if (unlikely( hash[7] < Htarg ))
{
*hashes_done = n - first_nonce + 1;
// work_set_target_ratio( work, hash );
return 1;
}
} while (likely((n <= max_nonce && !work_restart[thr_id].restart)));
*hashes_done = n - first_nonce + 1;
return 0;
}
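The nonce pointer above assumes the CryptoNote block-hashing blob layout, where the 32-bit nonce occupies bytes 39-42 of the 76-byte blob. A small sketch of that access pattern (helper names hypothetical); memcpy sidesteps the unaligned uint32_t* cast used above:

#include <stdint.h>
#include <string.h>

static inline uint32_t cn_get_nonce( const uint8_t blob[76] )
{
   uint32_t n;   // nonce lives at byte offset 39 of the hashing blob
   memcpy( &n, blob + 39, sizeof n );
   return n;
}

static inline void cn_set_nonce( uint8_t blob[76], uint32_t n )
{
   memcpy( blob + 39, &n, sizeof n );
}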
bool register_cryptonight_algo( algo_gate_t* gate )
{
cryptonightV7 = false;
register_json_rpc2( gate );
gate->optimizations = SSE2_OPT | AES_OPT;
gate->scanhash = (void*)&scanhash_cryptonight;
gate->hash = (void*)&cryptonight_hash;
gate->hash_suw = (void*)&cryptonight_hash_suw;
return true;
}
bool register_cryptonightv7_algo( algo_gate_t* gate )
{
cryptonightV7 = true;
register_json_rpc2( gate );
gate->optimizations = SSE2_OPT | AES_OPT;
gate->scanhash = (void*)&scanhash_cryptonight;
gate->hash = (void*)&cryptonight_hash;
gate->hash_suw = (void*)&cryptonight_hash_suw;
return true;
}

View File

@@ -0,0 +1,310 @@
// Copyright (c) 2012-2013 The Cryptonote developers
// Distributed under the MIT/X11 software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
// Modified for CPUminer by Lucas Jones
#include "miner.h"
#include <memory.h>
#if defined(__arm__) || defined(_MSC_VER)
#ifndef NOASM
#define NOASM
#endif
#endif
#include "crypto/oaes_lib.h"
#include "crypto/c_keccak.h"
#include "crypto/c_groestl.h"
#include "crypto/c_blake256.h"
#include "crypto/c_jh.h"
#include "crypto/c_skein.h"
#include "crypto/int-util.h"
//#include "crypto/hash-ops.h"
#include "cryptonight.h"
#if USE_INT128
#if __GNUC__ == 4 && __GNUC_MINOR__ >= 4 && __GNUC_MINOR__ < 6
typedef unsigned int uint128_t __attribute__ ((__mode__ (TI)));
#elif defined (_MSC_VER)
/* only for mingw64 on windows */
#undef USE_INT128
#define USE_INT128 (0)
#else
typedef __uint128_t uint128_t;
#endif
#endif
#define LITE 0
#if LITE /* cryptonight-light */
#define MEMORY (1 << 20)
#define ITER (1 << 19)
#else
#define MEMORY (1 << 21) /* 2 MiB */
#define ITER (1 << 20)
#endif
#define AES_BLOCK_SIZE 16
#define AES_KEY_SIZE 32 /*16*/
#define INIT_SIZE_BLK 8
#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE)
/*
#pragma pack(push, 1)
union cn_slow_hash_state {
union hash_state hs;
struct {
uint8_t k[64];
uint8_t init[INIT_SIZE_BYTE];
};
};
#pragma pack(pop)
static void do_blake_hash(const void* input, size_t len, char* output) {
blake256_hash((uint8_t*)output, input, len);
}
static void do_groestl_hash(const void* input, size_t len, char* output) {
groestl(input, len * 8, (uint8_t*)output);
}
static void do_jh_hash(const void* input, size_t len, char* output) {
int r = jh_hash(HASH_SIZE * 8, input, 8 * len, (uint8_t*)output);
assert(likely(SUCCESS == r));
}
static void do_skein_hash(const void* input, size_t len, char* output) {
int r = skein_hash(8 * HASH_SIZE, input, 8 * len, (uint8_t*)output);
assert(likely(SKEIN_SUCCESS == r));
}
*/
extern int aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey);
extern int aesb_pseudo_round_mut(uint8_t *val, uint8_t *expandedKey);
#if !defined(_MSC_VER) && !defined(NOASM)
extern int fast_aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey);
extern int fast_aesb_pseudo_round_mut(uint8_t *val, uint8_t *expandedKey);
#else
#define fast_aesb_single_round aesb_single_round
#define fast_aesb_pseudo_round_mut aesb_pseudo_round_mut
#endif
#if defined(NOASM) || !defined(__x86_64__)
static uint64_t mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi) {
// multiplier = ab = a * 2^32 + b
// multiplicand = cd = c * 2^32 + d
// ab * cd = a * c * 2^64 + (a * d + b * c) * 2^32 + b * d
uint64_t a = hi_dword(multiplier);
uint64_t b = lo_dword(multiplier);
uint64_t c = hi_dword(multiplicand);
uint64_t d = lo_dword(multiplicand);
uint64_t ac = a * c;
uint64_t ad = a * d;
uint64_t bc = b * c;
uint64_t bd = b * d;
uint64_t adbc = ad + bc;
uint64_t adbc_carry = adbc < ad ? 1 : 0;
// multiplier * multiplicand = product_hi * 2^64 + product_lo
uint64_t product_lo = bd + (adbc << 32);
uint64_t product_lo_carry = product_lo < bd ? 1 : 0;
*product_hi = ac + (adbc >> 32) + (adbc_carry << 32) + product_lo_carry;
assert(ac <= *product_hi);
return product_lo;
}
#else
extern uint64_t mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi);
#endif
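A quick way to gain confidence in the schoolbook fallback above is to compare it against the compiler's native 128-bit multiply where one exists; a self-test sketch, assuming __SIZEOF_INT128__ is defined:

#include <assert.h>
#include <stdint.h>

// Hypothetical self-test: mul128 must agree with unsigned __int128 on
// boundary values and an arbitrary bit pattern.
static void mul128_selftest( void )
{
#if defined(__SIZEOF_INT128__)
   const uint64_t xs[4] = { 0, 1, 0xFFFFFFFFFFFFFFFFull, 0x0123456789ABCDEFull };
   for ( int i = 0; i < 4; i++ )
      for ( int j = 0; j < 4; j++ )
      {
         uint64_t hi;
         const uint64_t lo = mul128( xs[i], xs[j], &hi );
         const unsigned __int128 p = (unsigned __int128)xs[i] * xs[j];
         assert( lo == (uint64_t)p && hi == (uint64_t)( p >> 64 ) );
      }
#endif
}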
/*
static void (* const extra_hashes[4])(const void *, size_t, char *) = {
do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash
};
*/
static inline size_t e2i(const uint8_t* a) {
#if !LITE
return ((uint32_t *)a)[0] & 0x1FFFF0;
#else
return ((uint32_t *)a)[0] & 0xFFFF0;
#endif
}
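A worked example of the masking above: 0x1FFFF0 clears the low 4 bits (16-byte block alignment) and keeps 17 offset bits, so any 32-bit value maps to a block-aligned index strictly inside the 2 MiB scratchpad.

#include <assert.h>
#include <stdint.h>

static void e2i_example( void )
{
   const uint32_t a   = 0xDEADBEEF;
   const uint32_t idx = a & 0x1FFFF0;         // == 0x0DBEE0
   assert( idx % 16 == 0 && idx < MEMORY );   // MEMORY == 1 << 21
}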
static inline void mul_sum_xor_dst( const uint8_t* a, uint8_t* c, uint8_t* dst,
const uint64_t tweak )
{
uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1];
hi += ((uint64_t*) c)[0];
((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi;
((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo;
((uint64_t*) dst)[0] = hi;
((uint64_t*) dst)[1] = cryptonightV7 ? lo ^ tweak : lo;
}
static inline void xor_blocks(uint8_t* a, const uint8_t* b) {
#if USE_INT128
*((uint128_t*) a) ^= *((uint128_t*) b);
#else
((uint64_t*) a)[0] ^= ((uint64_t*) b)[0];
((uint64_t*) a)[1] ^= ((uint64_t*) b)[1];
#endif
}
static inline void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) {
#if USE_INT128
*((uint128_t*) dst) = *((uint128_t*) a) ^ *((uint128_t*) b);
#else
((uint64_t*) dst)[0] = ((uint64_t*) a)[0] ^ ((uint64_t*) b)[0];
((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1];
#endif
}
typedef struct {
uint8_t _ALIGN(16) long_state[MEMORY];
union cn_slow_hash_state state;
uint8_t _ALIGN(16) text[INIT_SIZE_BYTE];
uint8_t _ALIGN(16) a[AES_BLOCK_SIZE];
uint8_t _ALIGN(16) b[AES_BLOCK_SIZE];
uint8_t _ALIGN(16) c[AES_BLOCK_SIZE];
oaes_ctx* aes_ctx;
} cryptonight_ctx;
static __thread cryptonight_ctx ctx;
void cryptonight_hash_ctx(void* output, const void* input, int len)
{
// hash_process(&ctx.state.hs, (const uint8_t*) input, len);
keccak( (const uint8_t*)input, 76, (char*)&ctx.state.hs.b, 200 );
if ( cryptonightV7 && len < 43 )
return;
const uint64_t tweak = cryptonightV7
? *((const uint64_t*) (((const uint8_t*)input) + 35))
^ ctx.state.hs.w[24] : 0;
ctx.aes_ctx = (oaes_ctx*) oaes_alloc();
__builtin_prefetch( ctx.text, 0, 3 );
__builtin_prefetch( ctx.text + 64, 0, 3 );
__builtin_prefetch( ctx.long_state, 1, 0 );
__builtin_prefetch( ctx.long_state + 64, 1, 0 );
__builtin_prefetch( ctx.long_state + 128, 1, 0 );
__builtin_prefetch( ctx.long_state + 192, 1, 0 );
__builtin_prefetch( ctx.long_state + 256, 1, 0 );
__builtin_prefetch( ctx.long_state + 320, 1, 0 );
__builtin_prefetch( ctx.long_state + 384, 1, 0 );
__builtin_prefetch( ctx.long_state + 448, 1, 0 );
size_t i, j;
memcpy(ctx.text, ctx.state.init, INIT_SIZE_BYTE);
oaes_key_import_data(ctx.aes_ctx, ctx.state.hs.b, AES_KEY_SIZE);
for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
__builtin_prefetch( ctx.long_state + i + 512, 1, 0 );
__builtin_prefetch( ctx.long_state + i + 576, 1, 0 );
aesb_pseudo_round_mut(&ctx.text[AES_BLOCK_SIZE * 0], ctx.aes_ctx->key->exp_data);
aesb_pseudo_round_mut(&ctx.text[AES_BLOCK_SIZE * 1], ctx.aes_ctx->key->exp_data);
aesb_pseudo_round_mut(&ctx.text[AES_BLOCK_SIZE * 2], ctx.aes_ctx->key->exp_data);
aesb_pseudo_round_mut(&ctx.text[AES_BLOCK_SIZE * 3], ctx.aes_ctx->key->exp_data);
aesb_pseudo_round_mut(&ctx.text[AES_BLOCK_SIZE * 4], ctx.aes_ctx->key->exp_data);
aesb_pseudo_round_mut(&ctx.text[AES_BLOCK_SIZE * 5], ctx.aes_ctx->key->exp_data);
aesb_pseudo_round_mut(&ctx.text[AES_BLOCK_SIZE * 6], ctx.aes_ctx->key->exp_data);
aesb_pseudo_round_mut(&ctx.text[AES_BLOCK_SIZE * 7], ctx.aes_ctx->key->exp_data);
memcpy(&ctx.long_state[i], ctx.text, INIT_SIZE_BYTE);
}
xor_blocks_dst(&ctx.state.k[0], &ctx.state.k[32], ctx.a);
xor_blocks_dst(&ctx.state.k[16], &ctx.state.k[48], ctx.b);
for (i = 0; likely(i < ITER / 4); ++i)
{
/* Dependency chain: address -> read value ------+
* written value <-+ hard function (AES or MUL) <+
* next address <-+
*/
/* Iteration 1 */
j = e2i(ctx.a);
aesb_single_round(&ctx.long_state[j], ctx.c, ctx.a);
xor_blocks_dst(ctx.c, ctx.b, &ctx.long_state[j]);
if ( cryptonightV7 )
{
uint8_t *lsa = (uint8_t*)&ctx.long_state[((uint64_t *)(ctx.a))[0] & 0x1FFFF0];
const uint8_t tmp = lsa[11];
const uint8_t index = ( ( (tmp >> 3) & 6 ) | (tmp & 1) ) << 1;
lsa[11] = tmp ^ ( ( 0x75310 >> index) & 0x30 );
}
/* Iteration 2 */
mul_sum_xor_dst(ctx.c, ctx.a, &ctx.long_state[e2i(ctx.c)], tweak );
/* Iteration 3 */
j = e2i(ctx.a);
aesb_single_round(&ctx.long_state[j], ctx.b, ctx.a);
xor_blocks_dst(ctx.b, ctx.c, &ctx.long_state[j]);
if ( cryptonightV7 )
{
uint8_t *lsa = (uint8_t*)&ctx.long_state[((uint64_t *)(ctx.a))[0] & 0x1FFFF0];
const uint8_t tmp = lsa[11];
const uint8_t index = ( ( (tmp >> 3) & 6 ) | (tmp & 1) ) << 1;
lsa[11] = tmp ^ ( ( 0x75310 >> index) & 0x30 );
}
/* Iteration 4 */
mul_sum_xor_dst(ctx.b, ctx.a, &ctx.long_state[e2i(ctx.b)], tweak );
}
__builtin_prefetch( ctx.text, 0, 3 );
__builtin_prefetch( ctx.text + 64, 0, 3 );
__builtin_prefetch( ctx.long_state, 1, 0 );
__builtin_prefetch( ctx.long_state + 64, 1, 0 );
__builtin_prefetch( ctx.long_state + 128, 1, 0 );
__builtin_prefetch( ctx.long_state + 192, 1, 0 );
__builtin_prefetch( ctx.long_state + 256, 1, 0 );
__builtin_prefetch( ctx.long_state + 320, 1, 0 );
__builtin_prefetch( ctx.long_state + 384, 1, 0 );
__builtin_prefetch( ctx.long_state + 448, 1, 0 );
memcpy(ctx.text, ctx.state.init, INIT_SIZE_BYTE);
oaes_key_import_data(ctx.aes_ctx, &ctx.state.hs.b[32], AES_KEY_SIZE);
for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
__builtin_prefetch( ctx.long_state + i + 512, 1, 0 );
__builtin_prefetch( ctx.long_state + i + 576, 1, 0 );
xor_blocks(&ctx.text[0 * AES_BLOCK_SIZE], &ctx.long_state[i + 0 * AES_BLOCK_SIZE]);
aesb_pseudo_round_mut(&ctx.text[0 * AES_BLOCK_SIZE], ctx.aes_ctx->key->exp_data);
xor_blocks(&ctx.text[1 * AES_BLOCK_SIZE], &ctx.long_state[i + 1 * AES_BLOCK_SIZE]);
aesb_pseudo_round_mut(&ctx.text[1 * AES_BLOCK_SIZE], ctx.aes_ctx->key->exp_data);
xor_blocks(&ctx.text[2 * AES_BLOCK_SIZE], &ctx.long_state[i + 2 * AES_BLOCK_SIZE]);
aesb_pseudo_round_mut(&ctx.text[2 * AES_BLOCK_SIZE], ctx.aes_ctx->key->exp_data);
xor_blocks(&ctx.text[3 * AES_BLOCK_SIZE], &ctx.long_state[i + 3 * AES_BLOCK_SIZE]);
aesb_pseudo_round_mut(&ctx.text[3 * AES_BLOCK_SIZE], ctx.aes_ctx->key->exp_data);
xor_blocks(&ctx.text[4 * AES_BLOCK_SIZE], &ctx.long_state[i + 4 * AES_BLOCK_SIZE]);
aesb_pseudo_round_mut(&ctx.text[4 * AES_BLOCK_SIZE], ctx.aes_ctx->key->exp_data);
xor_blocks(&ctx.text[5 * AES_BLOCK_SIZE], &ctx.long_state[i + 5 * AES_BLOCK_SIZE]);
aesb_pseudo_round_mut(&ctx.text[5 * AES_BLOCK_SIZE], ctx.aes_ctx->key->exp_data);
xor_blocks(&ctx.text[6 * AES_BLOCK_SIZE], &ctx.long_state[i + 6 * AES_BLOCK_SIZE]);
aesb_pseudo_round_mut(&ctx.text[6 * AES_BLOCK_SIZE], ctx.aes_ctx->key->exp_data);
xor_blocks(&ctx.text[7 * AES_BLOCK_SIZE], &ctx.long_state[i + 7 * AES_BLOCK_SIZE]);
aesb_pseudo_round_mut(&ctx.text[7 * AES_BLOCK_SIZE], ctx.aes_ctx->key->exp_data);
}
memcpy(ctx.state.init, ctx.text, INIT_SIZE_BYTE);
// hash_permutation(&ctx.state.hs);
keccakf( (uint64_t*)&ctx.state.hs.w, 24 );
/*memcpy(hash, &state, 32);*/
extra_hashes[ctx.state.hs.b[0] & 3](&ctx.state, 200, output);
oaes_free((OAES_CTX **) &ctx.aes_ctx);
}
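The variant-1 byte shuffle that appears twice in the loop above is easier to follow with a concrete value: bits 0, 4 and 5 of byte 11 select a shift into the constant table 0x75310, and the selected bits (masked to 0x30) are XORed back into bits 4-5. A worked sketch:

#include <assert.h>
#include <stdint.h>

static void variant1_tweak_example( void )
{
   const uint8_t tmp   = 0x10;   // bit 4 set, bits 5 and 0 clear
   const uint8_t index = ( ( (tmp >> 3) & 6 ) | (tmp & 1) ) << 1;   // == 4
   const uint8_t out   = tmp ^ ( ( 0x75310 >> index ) & 0x30 );     // 0x10 ^ 0x30
   assert( out == 0x20 );
}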

View File

@@ -0,0 +1,51 @@
#ifndef __CRYPTONIGHT_H_INCLUDED
#define __CRYPTONIGHT_H_INCLUDED
#include <stddef.h>
#include "crypto/oaes_lib.h"
#include "miner.h"
#define MEMORY (1 << 21) /* 2 MiB */
#define MEMORY_M128I (MEMORY >> 4) // 2 MiB / 16 = 128 ki * __m128i
#define ITER (1 << 20)
#define AES_BLOCK_SIZE 16
#define AES_KEY_SIZE 32 /*16*/
#define INIT_SIZE_BLK 8
#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE) // 128
#define INIT_SIZE_M128I (INIT_SIZE_BYTE >> 4) // 8
#pragma pack(push, 1)
union hash_state {
uint8_t b[200];
uint64_t w[25];
};
#pragma pack(pop)
#pragma pack(push, 1)
union cn_slow_hash_state {
union hash_state hs;
struct {
uint8_t k[64];
uint8_t init[INIT_SIZE_BYTE];
};
};
#pragma pack(pop)
void do_blake_hash(const void* input, size_t len, char* output);
void do_groestl_hash(const void* input, size_t len, char* output);
void do_jh_hash(const void* input, size_t len, char* output);
void do_skein_hash(const void* input, size_t len, char* output);
void cryptonight_hash_ctx(void* output, const void* input, int len);
void keccakf(uint64_t st[25], int rounds);
extern void (* const extra_hashes[4])(const void *, size_t, char *);
int scanhash_cryptonight( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void cryptonight_hash_aes( void *restrict output, const void *input, int len );
extern bool cryptonightV7;
#endif
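The packed unions above rely on the 64-byte k area and the 128-byte init area overlapping the front of the 200-byte Keccak state; a minimal compile-time check, assuming C11 _Static_assert is available:

#include <stddef.h>

_Static_assert( sizeof(union hash_state) == 200,
                "Keccak state is 200 bytes" );
_Static_assert( offsetof(union cn_slow_hash_state, init) == 64,
                "init follows the 64-byte k area" );
_Static_assert( 64 + INIT_SIZE_BYTE <= sizeof(union hash_state),
                "k and init fit inside the Keccak state" );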

View File

@@ -179,6 +179,14 @@ int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen,
sp->rounds = 16;
sp->pos = 0;
h[ 0] = m512_const1_128( iv[0] );
h[ 1] = m512_const1_128( iv[1] );
h[ 2] = m512_const1_128( iv[2] );
h[ 3] = m512_const1_128( iv[3] );
h[ 4] = m512_const1_128( iv[4] );
h[ 5] = m512_const1_128( iv[5] );
h[ 6] = m512_const1_128( iv[6] );
h[ 7] = m512_const1_128( iv[7] );
h[ 0] = m512_const1_128( iv[0] );
h[ 1] = m512_const1_128( iv[1] );
h[ 2] = m512_const1_128( iv[2] );
@@ -439,6 +447,14 @@ int cube_2way_full( cube_2way_context *sp, void *output, int hashbitlen,
sp->rounds = 16;
sp->pos = 0;
h[ 0] = m256_const1_128( iv[0] );
h[ 1] = m256_const1_128( iv[1] );
h[ 2] = m256_const1_128( iv[2] );
h[ 3] = m256_const1_128( iv[3] );
h[ 4] = m256_const1_128( iv[4] );
h[ 5] = m256_const1_128( iv[5] );
h[ 6] = m256_const1_128( iv[6] );
h[ 7] = m256_const1_128( iv[7] );
h[ 0] = m256_const1_128( iv[0] );
h[ 1] = m256_const1_128( iv[1] );
h[ 2] = m256_const1_128( iv[2] );

View File

@@ -28,27 +28,6 @@ int cube_4way_update_close( cube_4way_context *sp, void *output,
int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen,
const void *data, size_t size );
int cube_4x256_full( cube_4way_context *sp, void *output, int hashbitlen,
const void *data, size_t size );
#define cube512_4way_init( sp ) cube_4way_update( sp, 512 )
#define cube512_4way_update cube_4way_update
#define cube512_4way_update_close cube_4way_update
#define cube512_4way_close cube_4way_update
#define cube512_4way_full( sp, output, data, size ) \
cube_4way_full( sp, output, 512, data, size )
#define cube512_4x256_full( sp, output, data, size ) \
cube_4x256_full( sp, output, 512, data, size )
#define cube256_4way_init( sp ) cube_4way_update( sp, 256 )
#define cube256_4way_update cube_4way_update
#define cube256_4way_update_close cube_4way_update
#define cube256_4way_close cube_4way_update
#define cube256_4way_full( sp, output, data, size ) \
cube_4way_full( sp, output, 256, data, size )
#define cube256_4x256_full( sp, output, data, size ) \
cube_4x256_full( sp, output, 256, data, size )
#endif
// 2x128, 2 way parallel SSE2

View File

@@ -230,10 +230,11 @@ int cubehashDigest( cubehashParam *sp, byte *digest )
// pos is zero for 64 byte data, 1 for 80 byte data.
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
m128_const_64( 0, 0x80 ) );
_mm_set_epi8( 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 ) );
transform( sp );
sp->x[7] = _mm_xor_si128( sp->x[7], m128_const_64( 0x100000000, 0 ) );
sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi32( 1,0,0,0 ) );
transform( sp );
transform( sp );
transform( sp );
@@ -275,89 +276,11 @@ int cubehashUpdateDigest( cubehashParam *sp, byte *digest,
// pos is zero for 64 byte data, 1 for 80 byte data.
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
m128_const_64( 0, 0x80 ) );
_mm_set_epi8( 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 ) );
transform( sp );
sp->x[7] = _mm_xor_si128( sp->x[7], m128_const_64( 0x100000000, 0 ) );
transform( sp );
transform( sp );
transform( sp );
transform( sp );
transform( sp );
transform( sp );
transform( sp );
transform( sp );
transform( sp );
transform( sp );
for ( i = 0; i < sp->hashlen; i++ )
hash[i] = sp->x[i];
return SUCCESS;
}
int cubehash_full( cubehashParam *sp, byte *digest, int hashbitlen,
const byte *data, size_t size )
{
__m128i *x = (__m128i*)sp->x;
sp->hashlen = hashbitlen/128;
sp->blocksize = 32/16;
sp->rounds = 16;
sp->pos = 0;
if ( hashbitlen == 512 )
{
x[0] = m128_const_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
x[1] = m128_const_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
x[2] = m128_const_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
x[3] = m128_const_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
x[4] = m128_const_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
x[5] = m128_const_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
x[6] = m128_const_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
x[7] = m128_const_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
}
else
{
x[0] = m128_const_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
x[1] = m128_const_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
x[2] = m128_const_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
x[3] = m128_const_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
x[4] = m128_const_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
x[5] = m128_const_64( 0x93CB628565C892FD, 0x5FA2560309392549 );
x[6] = m128_const_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
x[7] = m128_const_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
}
const int len = size / 16;
const __m128i* in = (__m128i*)data;
__m128i* hash = (__m128i*)digest;
int i;
// It is assumed data is aligned to 256 bits and is a multiple of 128 bits.
// In current usage data is either 64 or 80 bytes.
for ( i = 0; i < len; i++ )
{
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ], in[i] );
sp->pos++;
if ( sp->pos == sp->blocksize )
{
transform( sp );
sp->pos = 0;
}
}
// pos is zero for 64 byte data, 1 for 80 byte data.
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
m128_const_64( 0, 0x80 ) );
transform( sp );
sp->x[7] = _mm_xor_si128( sp->x[7], m128_const_64( 0x100000000, 0 ) );
sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi32( 1,0,0,0 ) );
transform( sp );
transform( sp );

View File

@@ -19,7 +19,7 @@ struct _cubehashParam
int rounds;
int blocksize; // __m128i
int pos; // number of __m128i read into x from current block
__m128i _ALIGN(64) x[8]; // aligned for __m256i
__m128i _ALIGN(256) x[8]; // aligned for __m256i
};
typedef struct _cubehashParam cubehashParam;
@@ -39,9 +39,6 @@ int cubehashDigest(cubehashParam* sp, byte *digest);
int cubehashUpdateDigest( cubehashParam *sp, byte *digest, const byte *data,
size_t size );
int cubehash_full( cubehashParam* sp, byte *digest, int hashbitlen,
const byte *data, size_t size );
#ifdef __cplusplus
}
#endif

View File

@@ -22,26 +22,18 @@ typedef struct
} echo_4way_context __attribute__ ((aligned (64)));
int echo_4way_init( echo_4way_context *state, int hashbitlen );
#define echo512_4way_init( state ) echo_4way_init( state, 512 )
#define echo256_4way_init( state ) echo_4way_init( state, 256 )
int echo_4way_update( echo_4way_context *state, const void *data,
unsigned int databitlen);
#define echo512_4way_update echo_4way_update
int echo_close( echo_4way_context *state, void *hashval );
#define echo512_4way_close echo_4way_close
int echo_4way_update_close( echo_4way_context *state, void *hashval,
const void *data, int databitlen );
#define echo512_4way_update_close echo_4way_update_close
int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
const void *data, int datalen );
#define echo512_4way_full( state, hashval, data, datalen ) \
echo_4way_full( state, hashval, 512, data, datalen )
#define echo256_4way_full( state, hashval, data, datalen ) \
echo_4way_full( state, hashval, 256, data, datalen )
#endif
#endif

View File

@@ -36,8 +36,6 @@
#include "sph_echo.h"
#if !defined(__AES__)
#ifdef __cplusplus
extern "C"{
#endif
@@ -1030,5 +1028,4 @@ sph_echo512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
}
#ifdef __cplusplus
}
#endif
#endif // !AES
#endif

View File

@@ -36,8 +36,6 @@
#ifndef SPH_ECHO_H__
#define SPH_ECHO_H__
#if !defined(__AES__)
#ifdef __cplusplus
extern "C"{
#endif
@@ -318,5 +316,5 @@ void sph_echo512_addbits_and_close(
#ifdef __cplusplus
}
#endif
#endif // !AES
#endif

View File

@@ -74,14 +74,6 @@ void sph_fugue512_close(void *cc, void *dst);
void sph_fugue512_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#define sph_fugue512_full( cc, dst, data, len ) \
do{ \
sph_fugue512_init( cc ); \
sph_fugue512( cc, data, len ); \
sph_fugue512_close( cc, dst ); \
}while(0)
#ifdef __cplusplus
}
#endif

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -1,6 +1,3 @@
#if !defined GROESTL_INTR_AES_H__
#define GROESTL_INTR_AES_H__
/* groestl-intr-aes.h Aug 2011
*
* Groestl implementation with intrinsics using ssse3, sse4.1, and aes
@@ -14,51 +11,16 @@
#include <wmmintrin.h>
#include "hash-groestl.h"
static const __m128i round_const_p[] __attribute__ ((aligned (64))) =
{
{ 0x7060504030201000, 0xf0e0d0c0b0a09080 },
{ 0x7161514131211101, 0xf1e1d1c1b1a19181 },
{ 0x7262524232221202, 0xf2e2d2c2b2a29282 },
{ 0x7363534333231303, 0xf3e3d3c3b3a39383 },
{ 0x7464544434241404, 0xf4e4d4c4b4a49484 },
{ 0x7565554535251505, 0xf5e5d5c5b5a59585 },
{ 0x7666564636261606, 0xf6e6d6c6b6a69686 },
{ 0x7767574737271707, 0xf7e7d7c7b7a79787 },
{ 0x7868584838281808, 0xf8e8d8c8b8a89888 },
{ 0x7969594939291909, 0xf9e9d9c9b9a99989 },
{ 0x7a6a5a4a3a2a1a0a, 0xfaeadacabaaa9a8a },
{ 0x7b6b5b4b3b2b1b0b, 0xfbebdbcbbbab9b8b },
{ 0x7c6c5c4c3c2c1c0c, 0xfcecdcccbcac9c8c },
{ 0x7d6d5d4d3d2d1d0d, 0xfdedddcdbdad9d8d }
};
static const __m128i round_const_q[] __attribute__ ((aligned (64))) =
{
{ 0x8f9fafbfcfdfefff, 0x0f1f2f3f4f5f6f7f },
{ 0x8e9eaebecedeeefe, 0x0e1e2e3e4e5e6e7e },
{ 0x8d9dadbdcdddedfd, 0x0d1d2d3d4d5d6d7d },
{ 0x8c9cacbcccdcecfc, 0x0c1c2c3c4c5c6c7c },
{ 0x8b9babbbcbdbebfb, 0x0b1b2b3b4b5b6b7b },
{ 0x8a9aaabacadaeafa, 0x0a1a2a3a4a5a6a7a },
{ 0x8999a9b9c9d9e9f9, 0x0919293949596979 },
{ 0x8898a8b8c8d8e8f8, 0x0818283848586878 },
{ 0x8797a7b7c7d7e7f7, 0x0717273747576777 },
{ 0x8696a6b6c6d6e6f6, 0x0616263646566676 },
{ 0x8595a5b5c5d5e5f5, 0x0515253545556575 },
{ 0x8494a4b4c4d4e4f4, 0x0414243444546474 },
{ 0x8393a3b3c3d3e3f3, 0x0313233343536373 },
{ 0x8292a2b2c2d2e2f2, 0x0212223242526272 }
};
static const __m128i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02 };
static const __m128i SUBSH_MASK0 = { 0x0b0e0104070a0d00, 0x0306090c0f020508 };
static const __m128i SUBSH_MASK1 = { 0x0c0f0205080b0e01, 0x04070a0d00030609 };
static const __m128i SUBSH_MASK2 = { 0x0d000306090c0f02, 0x05080b0e0104070a };
static const __m128i SUBSH_MASK3 = { 0x0e0104070a0d0003, 0x06090c0f0205080b };
static const __m128i SUBSH_MASK4 = { 0x0f0205080b0e0104, 0x070a0d000306090c };
static const __m128i SUBSH_MASK5 = { 0x000306090c0f0205, 0x080b0e0104070a0d };
static const __m128i SUBSH_MASK6 = { 0x0104070a0d000306, 0x090c0f0205080b0e };
static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
/* global constants */
__m128i ROUND_CONST_Lx;
//__m128i ROUND_CONST_L0[ROUNDS512];
//__m128i ROUND_CONST_L7[ROUNDS512];
__m128i ROUND_CONST_P[ROUNDS1024];
__m128i ROUND_CONST_Q[ROUNDS1024];
__m128i TRANSP_MASK;
__m128i SUBSH_MASK[8];
__m128i ALL_1B;
__m128i ALL_FF;
#define tos(a) #a
#define tostr(a) tos(a)
@@ -149,7 +111,7 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
\
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
/* compute w_i : add y_{i+4} */\
b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
b1 = ALL_1B;\
MUL2(a0, b0, b1);\
a0 = _mm_xor_si128(a0, TEMP0);\
MUL2(a1, b0, b1);\
@@ -190,6 +152,25 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
}/*MixBytes*/
#define SET_CONSTANTS(){\
ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\
ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
SUBSH_MASK[0] = _mm_set_epi32(0x0306090c, 0x0f020508, 0x0b0e0104, 0x070a0d00);\
SUBSH_MASK[1] = _mm_set_epi32(0x04070a0d, 0x00030609, 0x0c0f0205, 0x080b0e01);\
SUBSH_MASK[2] = _mm_set_epi32(0x05080b0e, 0x0104070a, 0x0d000306, 0x090c0f02);\
SUBSH_MASK[3] = _mm_set_epi32(0x06090c0f, 0x0205080b, 0x0e010407, 0x0a0d0003);\
SUBSH_MASK[4] = _mm_set_epi32(0x070a0d00, 0x0306090c, 0x0f020508, 0x0b0e0104);\
SUBSH_MASK[5] = _mm_set_epi32(0x080b0e01, 0x04070a0d, 0x00030609, 0x0c0f0205);\
SUBSH_MASK[6] = _mm_set_epi32(0x090c0f02, 0x05080b0e, 0x0104070a, 0x0d000306);\
SUBSH_MASK[7] = _mm_set_epi32(0x0e010407, 0x0a0d0003, 0x06090c0f, 0x0205080b);\
for(i = 0; i < ROUNDS1024; i++)\
{\
ROUND_CONST_P[i] = _mm_set_epi32(0xf0e0d0c0 ^ (i * 0x01010101), 0xb0a09080 ^ (i * 0x01010101), 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
ROUND_CONST_Q[i] = _mm_set_epi32(0x0f1f2f3f ^ (i * 0x01010101), 0x4f5f6f7f ^ (i * 0x01010101), 0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101));\
}\
}while(0);\
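The recurrence above matches the static round_const_p table shown earlier in this file: the round number is XORed into every byte of the base pattern. A worked check for round i = 1 (compare round_const_p[1] = { 0x7161514131211101, 0xf1e1d1c1b1a19181 }):

#include <assert.h>
#include <stdint.h>

static void round_const_example( void )
{
   const uint32_t i = 1;
   assert( ( 0x30201000u ^ ( i * 0x01010101u ) ) == 0x31211101u );
   assert( ( 0x70605040u ^ ( i * 0x01010101u ) ) == 0x71615141u );
}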
/* one round
* a0-a7 = input rows
* b0-b7 = output rows
@@ -213,34 +194,30 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
u8 round_counter = 0;\
for(round_counter = 0; round_counter < 14; round_counter+=2) {\
/* AddRoundConstant P1024 */\
xmm8 = _mm_xor_si128( xmm8, \
casti_m128i( round_const_p, round_counter ) ); \
xmm8 = _mm_xor_si128(xmm8, (ROUND_CONST_P[round_counter]));\
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm8 = _mm_shuffle_epi8( xmm8, SUBSH_MASK0 ); \
xmm9 = _mm_shuffle_epi8( xmm9, SUBSH_MASK1 ); \
xmm10 = _mm_shuffle_epi8( xmm10, SUBSH_MASK2 ); \
xmm11 = _mm_shuffle_epi8( xmm11, SUBSH_MASK3 ); \
xmm12 = _mm_shuffle_epi8( xmm12, SUBSH_MASK4 ); \
xmm13 = _mm_shuffle_epi8( xmm13, SUBSH_MASK5 ); \
xmm14 = _mm_shuffle_epi8( xmm14, SUBSH_MASK6 ); \
xmm15 = _mm_shuffle_epi8( xmm15, SUBSH_MASK7 ); \
xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[0]));\
xmm9 = _mm_shuffle_epi8(xmm9, (SUBSH_MASK[1]));\
xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[2]));\
xmm11 = _mm_shuffle_epi8(xmm11, (SUBSH_MASK[3]));\
xmm12 = _mm_shuffle_epi8(xmm12, (SUBSH_MASK[4]));\
xmm13 = _mm_shuffle_epi8(xmm13, (SUBSH_MASK[5]));\
xmm14 = _mm_shuffle_epi8(xmm14, (SUBSH_MASK[6]));\
xmm15 = _mm_shuffle_epi8(xmm15, (SUBSH_MASK[7]));\
/* SubBytes + MixBytes */\
SUBMIX( xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, \
xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 ); \
SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
\
/* AddRoundConstant P1024 */\
xmm0 = _mm_xor_si128( xmm0, \
casti_m128i( round_const_p, round_counter+1 ) ); \
xmm0 = _mm_shuffle_epi8( xmm0, SUBSH_MASK0 ); \
xmm1 = _mm_shuffle_epi8( xmm1, SUBSH_MASK1 ); \
xmm2 = _mm_shuffle_epi8( xmm2, SUBSH_MASK2 ); \
xmm3 = _mm_shuffle_epi8( xmm3, SUBSH_MASK3 ); \
xmm4 = _mm_shuffle_epi8( xmm4, SUBSH_MASK4 ); \
xmm5 = _mm_shuffle_epi8( xmm5, SUBSH_MASK5 ); \
xmm6 = _mm_shuffle_epi8( xmm6, SUBSH_MASK6 ); \
xmm7 = _mm_shuffle_epi8( xmm7, SUBSH_MASK7 ); \
SUBMIX( xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, \
xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 ); \
xmm0 = _mm_xor_si128(xmm0, (ROUND_CONST_P[round_counter+1]));\
xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[0]));\
xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[1]));\
xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[2]));\
xmm3 = _mm_shuffle_epi8(xmm3, (SUBSH_MASK[3]));\
xmm4 = _mm_shuffle_epi8(xmm4, (SUBSH_MASK[4]));\
xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[5]));\
xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[6]));\
xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[7]));\
SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
}\
}
@@ -248,52 +225,48 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
u8 round_counter = 0;\
for(round_counter = 0; round_counter < 14; round_counter+=2) {\
/* AddRoundConstant Q1024 */\
xmm1 = m128_neg1;\
xmm8 = _mm_xor_si128( xmm8, xmm1 ); \
xmm9 = _mm_xor_si128( xmm9, xmm1 ); \
xmm10 = _mm_xor_si128( xmm10, xmm1 ); \
xmm11 = _mm_xor_si128( xmm11, xmm1 ); \
xmm12 = _mm_xor_si128( xmm12, xmm1 ); \
xmm13 = _mm_xor_si128( xmm13, xmm1 ); \
xmm14 = _mm_xor_si128( xmm14, xmm1 ); \
xmm15 = _mm_xor_si128( xmm15, \
casti_m128i( round_const_q, round_counter ) ); \
xmm1 = ALL_FF;\
xmm8 = _mm_xor_si128(xmm8, xmm1);\
xmm9 = _mm_xor_si128(xmm9, xmm1);\
xmm10 = _mm_xor_si128(xmm10, xmm1);\
xmm11 = _mm_xor_si128(xmm11, xmm1);\
xmm12 = _mm_xor_si128(xmm12, xmm1);\
xmm13 = _mm_xor_si128(xmm13, xmm1);\
xmm14 = _mm_xor_si128(xmm14, xmm1);\
xmm15 = _mm_xor_si128(xmm15, (ROUND_CONST_Q[round_counter]));\
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm8 = _mm_shuffle_epi8( xmm8, SUBSH_MASK1 ); \
xmm9 = _mm_shuffle_epi8( xmm9, SUBSH_MASK3 ); \
xmm10 = _mm_shuffle_epi8( xmm10, SUBSH_MASK5 ); \
xmm11 = _mm_shuffle_epi8( xmm11, SUBSH_MASK7 ); \
xmm12 = _mm_shuffle_epi8( xmm12, SUBSH_MASK0 ); \
xmm13 = _mm_shuffle_epi8( xmm13, SUBSH_MASK2 ); \
xmm14 = _mm_shuffle_epi8( xmm14, SUBSH_MASK4 ); \
xmm15 = _mm_shuffle_epi8( xmm15, SUBSH_MASK6 ); \
xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[1]));\
xmm9 = _mm_shuffle_epi8(xmm9, (SUBSH_MASK[3]));\
xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[5]));\
xmm11 = _mm_shuffle_epi8(xmm11, (SUBSH_MASK[7]));\
xmm12 = _mm_shuffle_epi8(xmm12, (SUBSH_MASK[0]));\
xmm13 = _mm_shuffle_epi8(xmm13, (SUBSH_MASK[2]));\
xmm14 = _mm_shuffle_epi8(xmm14, (SUBSH_MASK[4]));\
xmm15 = _mm_shuffle_epi8(xmm15, (SUBSH_MASK[6]));\
/* SubBytes + MixBytes */\
SUBMIX( xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, \
xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6 , xmm7 ); \
SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
\
/* AddRoundConstant Q1024 */\
xmm9 = m128_neg1;\
xmm0 = _mm_xor_si128( xmm0, xmm9 ); \
xmm1 = _mm_xor_si128( xmm1, xmm9 ); \
xmm2 = _mm_xor_si128( xmm2, xmm9 ); \
xmm3 = _mm_xor_si128( xmm3, xmm9 ); \
xmm4 = _mm_xor_si128( xmm4, xmm9 ); \
xmm5 = _mm_xor_si128( xmm5, xmm9 ); \
xmm6 = _mm_xor_si128( xmm6, xmm9 ); \
xmm7 = _mm_xor_si128( xmm7, \
casti_m128i( round_const_q, round_counter+1 ) ); \
xmm9 = ALL_FF;\
xmm0 = _mm_xor_si128(xmm0, xmm9);\
xmm1 = _mm_xor_si128(xmm1, xmm9);\
xmm2 = _mm_xor_si128(xmm2, xmm9);\
xmm3 = _mm_xor_si128(xmm3, xmm9);\
xmm4 = _mm_xor_si128(xmm4, xmm9);\
xmm5 = _mm_xor_si128(xmm5, xmm9);\
xmm6 = _mm_xor_si128(xmm6, xmm9);\
xmm7 = _mm_xor_si128(xmm7, (ROUND_CONST_Q[round_counter+1]));\
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm0 = _mm_shuffle_epi8( xmm0, SUBSH_MASK1 ); \
xmm1 = _mm_shuffle_epi8( xmm1, SUBSH_MASK3 ); \
xmm2 = _mm_shuffle_epi8( xmm2, SUBSH_MASK5 ); \
xmm3 = _mm_shuffle_epi8( xmm3, SUBSH_MASK7 ); \
xmm4 = _mm_shuffle_epi8( xmm4, SUBSH_MASK0 ); \
xmm5 = _mm_shuffle_epi8( xmm5, SUBSH_MASK2 ); \
xmm6 = _mm_shuffle_epi8( xmm6, SUBSH_MASK4 ); \
xmm7 = _mm_shuffle_epi8( xmm7, SUBSH_MASK6 ); \
xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[1]));\
xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[3]));\
xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[5]));\
xmm3 = _mm_shuffle_epi8(xmm3, (SUBSH_MASK[7]));\
xmm4 = _mm_shuffle_epi8(xmm4, (SUBSH_MASK[0]));\
xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[2]));\
xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[4]));\
xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[6]));\
/* SubBytes + MixBytes */\
SUBMIX( xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, \
xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 ); \
SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
}\
}
@@ -305,7 +278,7 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
* clobbers: t0-t7
*/
#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\
t0 = TRANSP_MASK; \
t0 = TRANSP_MASK;\
\
i6 = _mm_shuffle_epi8(i6, t0);\
i0 = _mm_shuffle_epi8(i0, t0);\
@@ -393,7 +366,7 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
i4 = _mm_unpacklo_epi64(i4, i5);\
t1 = _mm_unpackhi_epi64(t1, i5);\
t2 = i6;\
o0 = TRANSP_MASK; \
o0 = TRANSP_MASK;\
i6 = _mm_unpacklo_epi64(i6, i7);\
t2 = _mm_unpackhi_epi64(t2, i7);\
/* load transpose mask into a register, because it will be used 8 times */\
@@ -634,4 +607,3 @@ void OF1024( __m128i* chaining )
return;
}
#endif

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,10 @@
// specify assembly or intrinsics implementation
//#define TASM
#define TINTR
// Not to be confused with AVX512VAES
#define VAES
// #define VAVX
// #define VVPERM
//#endif

View File

@@ -0,0 +1,529 @@
/* groestl-asm-aes.h Aug 2011
*
* Groestl implementation with inline assembly using ssse3, sse4.1, and aes
* instructions.
* Authors: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
*
* This code is placed in the public domain
*/
#include "hash-groestl256.h"
/* global constants */
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Lx[16];
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L0[ROUNDS512*16];
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L7[ROUNDS512*16];
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_P[ROUNDS1024*16];
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Q[ROUNDS1024*16];
__attribute__ ((aligned (16))) unsigned char TRANSP_MASK[16];
__attribute__ ((aligned (16))) unsigned char SUBSH_MASK[8*16];
__attribute__ ((aligned (16))) unsigned char ALL_1B[16];
__attribute__ ((aligned (16))) unsigned char ALL_FF[16];
/* temporary variables */
__attribute__ ((aligned (16))) unsigned char QTEMP[8*16];
__attribute__ ((aligned (16))) unsigned char TEMP[3*16];
#define tos(a) #a
#define tostr(a) tos(a)
/* xmm[i] will be multiplied by 2
* xmm[j] will be lost
* xmm[k] has to be all 0x1b */
#define MUL2(i, j, k){\
asm("pxor xmm"tostr(j)", xmm"tostr(j)"");\
asm("pcmpgtb xmm"tostr(j)", xmm"tostr(i)"");\
asm("paddb xmm"tostr(i)", xmm"tostr(i)"");\
asm("pand xmm"tostr(j)", xmm"tostr(k)"");\
asm("pxor xmm"tostr(i)", xmm"tostr(j)"");\
}/**/
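MUL2 doubles every byte of an xmm register in GF(2^8): pcmpgtb against zero yields 0xFF for bytes with the top bit set, which pand turns into the reduction constant 0x1b. The scalar equivalent, as a sketch:

#include <stdint.h>

// Multiply one byte by 2 in GF(2^8), reducing by the AES polynomial
// x^8 + x^4 + x^3 + x + 1 (0x1b) when the high bit overflows.
static inline uint8_t gf256_mul2( uint8_t x )
{
   return (uint8_t)( ( x << 1 ) ^ ( ( x & 0x80 ) ? 0x1b : 0 ) );
}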
/* Yet another implementation of MixBytes.
This time we use the formulae (3) from the paper "Byte Slicing Groestl".
Input: a0, ..., a7
Output: b0, ..., b7 = MixBytes(a0,...,a7).
but we use the relations:
t_i = a_i + a_{i+1}
x_i = t_i + t_{i+3}
y_i = t_i + t_{i+2} + a_{i+6}
z_i = 2*x_i
w_i = z_i + y_{i+4}
v_i = 2*w_i
b_i = v_{i+3} + y_{i+4}
We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
and then adding v_i computed in the meantime in registers xmm0..xmm7.
We almost fit into 16 registers, need only 3 spills to memory.
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
K. Matusiewicz, 2011/05/29 */
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* t_i = a_i + a_{i+1} */\
asm("movdqa xmm"tostr(b6)", xmm"tostr(a0)"");\
asm("movdqa xmm"tostr(b7)", xmm"tostr(a1)"");\
asm("pxor xmm"tostr(a0)", xmm"tostr(a1)"");\
asm("movdqa xmm"tostr(b0)", xmm"tostr(a2)"");\
asm("pxor xmm"tostr(a1)", xmm"tostr(a2)"");\
asm("movdqa xmm"tostr(b1)", xmm"tostr(a3)"");\
asm("pxor xmm"tostr(a2)", xmm"tostr(a3)"");\
asm("movdqa xmm"tostr(b2)", xmm"tostr(a4)"");\
asm("pxor xmm"tostr(a3)", xmm"tostr(a4)"");\
asm("movdqa xmm"tostr(b3)", xmm"tostr(a5)"");\
asm("pxor xmm"tostr(a4)", xmm"tostr(a5)"");\
asm("movdqa xmm"tostr(b4)", xmm"tostr(a6)"");\
asm("pxor xmm"tostr(a5)", xmm"tostr(a6)"");\
asm("movdqa xmm"tostr(b5)", xmm"tostr(a7)"");\
asm("pxor xmm"tostr(a6)", xmm"tostr(a7)"");\
asm("pxor xmm"tostr(a7)", xmm"tostr(b6)"");\
\
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
asm("pxor xmm"tostr(b0)", xmm"tostr(a4)"");\
asm("pxor xmm"tostr(b6)", xmm"tostr(a4)"");\
asm("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\
asm("pxor xmm"tostr(b7)", xmm"tostr(a5)"");\
asm("pxor xmm"tostr(b2)", xmm"tostr(a6)"");\
asm("pxor xmm"tostr(b0)", xmm"tostr(a6)"");\
/* spill values y_4, y_5 to memory */\
asm("movaps [TEMP+0*16], xmm"tostr(b0)"");\
asm("pxor xmm"tostr(b3)", xmm"tostr(a7)"");\
asm("pxor xmm"tostr(b1)", xmm"tostr(a7)"");\
asm("movaps [TEMP+1*16], xmm"tostr(b1)"");\
asm("pxor xmm"tostr(b4)", xmm"tostr(a0)"");\
asm("pxor xmm"tostr(b2)", xmm"tostr(a0)"");\
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
asm("movdqa xmm"tostr(b0)", xmm"tostr(a0)"");\
asm("pxor xmm"tostr(b5)", xmm"tostr(a1)"");\
asm("pxor xmm"tostr(b3)", xmm"tostr(a1)"");\
asm("movdqa xmm"tostr(b1)", xmm"tostr(a1)"");\
asm("pxor xmm"tostr(b6)", xmm"tostr(a2)"");\
asm("pxor xmm"tostr(b4)", xmm"tostr(a2)"");\
asm("movaps [TEMP+2*16], xmm"tostr(a2)"");\
asm("pxor xmm"tostr(b7)", xmm"tostr(a3)"");\
asm("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\
\
/* compute x_i = t_i + t_{i+3} */\
asm("pxor xmm"tostr(a0)", xmm"tostr(a3)"");\
asm("pxor xmm"tostr(a1)", xmm"tostr(a4)"");\
asm("pxor xmm"tostr(a2)", xmm"tostr(a5)"");\
asm("pxor xmm"tostr(a3)", xmm"tostr(a6)"");\
asm("pxor xmm"tostr(a4)", xmm"tostr(a7)"");\
asm("pxor xmm"tostr(a5)", xmm"tostr(b0)"");\
asm("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\
asm("pxor xmm"tostr(a7)", [TEMP+2*16]");\
\
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
/* compute w_i : add y_{i+4} */\
asm("movaps xmm"tostr(b1)", [ALL_1B]");\
MUL2(a0, b0, b1);\
asm("pxor xmm"tostr(a0)", [TEMP+0*16]");\
MUL2(a1, b0, b1);\
asm("pxor xmm"tostr(a1)", [TEMP+1*16]");\
MUL2(a2, b0, b1);\
asm("pxor xmm"tostr(a2)", xmm"tostr(b2)"");\
MUL2(a3, b0, b1);\
asm("pxor xmm"tostr(a3)", xmm"tostr(b3)"");\
MUL2(a4, b0, b1);\
asm("pxor xmm"tostr(a4)", xmm"tostr(b4)"");\
MUL2(a5, b0, b1);\
asm("pxor xmm"tostr(a5)", xmm"tostr(b5)"");\
MUL2(a6, b0, b1);\
asm("pxor xmm"tostr(a6)", xmm"tostr(b6)"");\
MUL2(a7, b0, b1);\
asm("pxor xmm"tostr(a7)", xmm"tostr(b7)"");\
\
/* compute v_i : double w_i */\
/* add to y_4 y_5 .. v3, v4, ... */\
MUL2(a0, b0, b1);\
asm("pxor xmm"tostr(b5)", xmm"tostr(a0)"");\
MUL2(a1, b0, b1);\
asm("pxor xmm"tostr(b6)", xmm"tostr(a1)"");\
MUL2(a2, b0, b1);\
asm("pxor xmm"tostr(b7)", xmm"tostr(a2)"");\
MUL2(a5, b0, b1);\
asm("pxor xmm"tostr(b2)", xmm"tostr(a5)"");\
MUL2(a6, b0, b1);\
asm("pxor xmm"tostr(b3)", xmm"tostr(a6)"");\
MUL2(a7, b0, b1);\
asm("pxor xmm"tostr(b4)", xmm"tostr(a7)"");\
MUL2(a3, b0, b1);\
MUL2(a4, b0, b1);\
asm("movaps xmm"tostr(b0)", [TEMP+0*16]");\
asm("movaps xmm"tostr(b1)", [TEMP+1*16]");\
asm("pxor xmm"tostr(b0)", xmm"tostr(a3)"");\
asm("pxor xmm"tostr(b1)", xmm"tostr(a4)"");\
}/*MixBytes*/
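The register-level sequence above is hard to audit by eye. A byte-level reference for a single state column, following the t/x/y/w relations from the comment (addition in GF(2^8) is XOR, doubling via the gf256_mul2 helper sketched earlier), might look like this; it reproduces the Groestl MixBytes matrix circ(02,02,03,04,05,03,05,07):

#include <stdint.h>

static void mixbytes_column( const uint8_t a[8], uint8_t b[8] )
{
   uint8_t t[8], x[8], y[8], w[8];
   for ( int i = 0; i < 8; i++ ) t[i] = a[i] ^ a[(i+1) & 7];
   for ( int i = 0; i < 8; i++ ) x[i] = t[i] ^ t[(i+3) & 7];
   for ( int i = 0; i < 8; i++ ) y[i] = t[i] ^ t[(i+2) & 7] ^ a[(i+6) & 7];
   for ( int i = 0; i < 8; i++ ) w[i] = gf256_mul2( x[i] ) ^ y[(i+4) & 7];          // w_i = z_i + y_{i+4}
   for ( int i = 0; i < 8; i++ ) b[i] = gf256_mul2( w[(i+3) & 7] ) ^ y[(i+4) & 7];  // b_i = v_{i+3} + y_{i+4}
}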
#define SET_CONSTANTS(){\
((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\
((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\
((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\
((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\
((u64*)SUBSH_MASK)[ 0] = 0x0c0f0104070b0e00ULL;\
((u64*)SUBSH_MASK)[ 1] = 0x03060a0d08020509ULL;\
((u64*)SUBSH_MASK)[ 2] = 0x0e090205000d0801ULL;\
((u64*)SUBSH_MASK)[ 3] = 0x04070c0f0a03060bULL;\
((u64*)SUBSH_MASK)[ 4] = 0x080b0306010f0a02ULL;\
((u64*)SUBSH_MASK)[ 5] = 0x05000e090c04070dULL;\
((u64*)SUBSH_MASK)[ 6] = 0x0a0d040702090c03ULL;\
((u64*)SUBSH_MASK)[ 7] = 0x0601080b0e05000fULL;\
((u64*)SUBSH_MASK)[ 8] = 0x0b0e0500030a0d04ULL;\
((u64*)SUBSH_MASK)[ 9] = 0x0702090c0f060108ULL;\
((u64*)SUBSH_MASK)[10] = 0x0d080601040c0f05ULL;\
((u64*)SUBSH_MASK)[11] = 0x00030b0e0907020aULL;\
((u64*)SUBSH_MASK)[12] = 0x0f0a0702050e0906ULL;\
((u64*)SUBSH_MASK)[13] = 0x01040d080b00030cULL;\
((u64*)SUBSH_MASK)[14] = 0x090c000306080b07ULL;\
((u64*)SUBSH_MASK)[15] = 0x02050f0a0d01040eULL;\
for(i = 0; i < ROUNDS512; i++)\
{\
((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\
((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\
((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\
((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\
}\
((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\
((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\
}while(0);
#define Push_All_Regs() do{\
/* not using any...
asm("push rax");\
asm("push rbx");\
asm("push rcx");*/\
}while(0);
#define Pop_All_Regs() do{\
/* not using any...
asm("pop rcx");\
asm("pop rbx");\
asm("pop rax");*/\
}while(0);
/* one round
* i = round number
* a0-a7 = input rows
* b0-b7 = output rows
*/
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* AddRoundConstant */\
asm ("movaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\
asm ("pxor xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\
asm ("pxor xmm"tostr(a1)", xmm"tostr(b1)"");\
asm ("pxor xmm"tostr(a2)", xmm"tostr(b1)"");\
asm ("pxor xmm"tostr(a3)", xmm"tostr(b1)"");\
asm ("pxor xmm"tostr(a4)", xmm"tostr(b1)"");\
asm ("pxor xmm"tostr(a5)", xmm"tostr(b1)"");\
asm ("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\
asm ("pxor xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\
/* ShiftBytes + SubBytes (interleaved) */\
asm ("pxor xmm"tostr(b0)", xmm"tostr(b0)"");\
asm ("pshufb xmm"tostr(a0)", [SUBSH_MASK+0*16]");\
asm ("aesenclast xmm"tostr(a0)", xmm"tostr(b0)"");\
asm ("pshufb xmm"tostr(a1)", [SUBSH_MASK+1*16]");\
asm ("aesenclast xmm"tostr(a1)", xmm"tostr(b0)"");\
asm ("pshufb xmm"tostr(a2)", [SUBSH_MASK+2*16]");\
asm ("aesenclast xmm"tostr(a2)", xmm"tostr(b0)"");\
asm ("pshufb xmm"tostr(a3)", [SUBSH_MASK+3*16]");\
asm ("aesenclast xmm"tostr(a3)", xmm"tostr(b0)"");\
asm ("pshufb xmm"tostr(a4)", [SUBSH_MASK+4*16]");\
asm ("aesenclast xmm"tostr(a4)", xmm"tostr(b0)"");\
asm ("pshufb xmm"tostr(a5)", [SUBSH_MASK+5*16]");\
asm ("aesenclast xmm"tostr(a5)", xmm"tostr(b0)"");\
asm ("pshufb xmm"tostr(a6)", [SUBSH_MASK+6*16]");\
asm ("aesenclast xmm"tostr(a6)", xmm"tostr(b0)"");\
asm ("pshufb xmm"tostr(a7)", [SUBSH_MASK+7*16]");\
asm ("aesenclast xmm"tostr(a7)", xmm"tostr(b0)"");\
/* MixBytes */\
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
}
/* 10 rounds, P and Q in parallel */
#define ROUNDS_P_Q(){\
ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
}
/* Matrix Transpose Step 1
* input is a 512-bit state with two columns in one xmm
* output is a 512-bit state with two rows in one xmm
* inputs: i0-i3
* outputs: i0, o1-o3
* clobbers: t0
*/
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\
\
asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\
asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\
asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\
asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\
\
asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\
asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\
\
asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\
asm ("punpckhwd xmm"tostr(o1)", xmm"tostr(i1)"");\
asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\
asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i3)"");\
\
asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\
asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\
asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\
asm ("pshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\
\
asm ("movdqa xmm"tostr(o2)", xmm"tostr(i0)"");\
asm ("movdqa xmm"tostr(o3)", xmm"tostr(o1)"");\
\
asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\
asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t0)"");\
asm ("punpckhdq xmm"tostr(o2)", xmm"tostr(i2)"");\
asm ("punpckhdq xmm"tostr(o3)", xmm"tostr(t0)"");\
}/**/
/* Matrix Transpose Step 2
* input are two 512-bit states with two rows in one xmm
* output are two 512-bit states with one row of each state in one xmm
* inputs: i0-i3 = P, i4-i7 = Q
* outputs: (i0, o1-o7) = (P|Q)
* possible reassignments: (output reg = input reg)
* * i1 -> o3-7
* * i2 -> o5-7
* * i3 -> o7
* * i4 -> o3-7
* * i5 -> o6-7
*/
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\
asm ("movdqa xmm"tostr(o2)", xmm"tostr(i1)"");\
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\
asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i4)"");\
asm ("movdqa xmm"tostr(o3)", xmm"tostr(i1)"");\
asm ("movdqa xmm"tostr(o4)", xmm"tostr(i2)"");\
asm ("punpcklqdq xmm"tostr(o2)", xmm"tostr(i5)"");\
asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i5)"");\
asm ("movdqa xmm"tostr(o5)", xmm"tostr(i2)"");\
asm ("movdqa xmm"tostr(o6)", xmm"tostr(i3)"");\
asm ("punpcklqdq xmm"tostr(o4)", xmm"tostr(i6)"");\
asm ("punpckhqdq xmm"tostr(o5)", xmm"tostr(i6)"");\
asm ("movdqa xmm"tostr(o7)", xmm"tostr(i3)"");\
asm ("punpcklqdq xmm"tostr(o6)", xmm"tostr(i7)"");\
asm ("punpckhqdq xmm"tostr(o7)", xmm"tostr(i7)"");\
}/**/
/* Matrix Transpose Inverse Step 2
* input are two 512-bit states with one row of each state in one xmm
* output are two 512-bit states with two rows in one xmm
* inputs: i0-i7 = (P|Q)
* outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
*/
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\
asm ("punpckhqdq xmm"tostr(o0)", xmm"tostr(i1)"");\
asm ("movdqa xmm"tostr(o1)", xmm"tostr(i2)"");\
asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\
asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i3)"");\
asm ("movdqa xmm"tostr(o2)", xmm"tostr(i4)"");\
asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\
asm ("punpckhqdq xmm"tostr(o2)", xmm"tostr(i5)"");\
asm ("movdqa xmm"tostr(o3)", xmm"tostr(i6)"");\
asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\
asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i7)"");\
}/**/
/* Matrix Transpose Output Step 2
* input is one 512-bit state with two rows in one xmm
* output is one 512-bit state with one row in the low 64-bits of one xmm
* inputs: i0,i2,i4,i6 = S
* outputs: (i0-7) = (0|S)
*/
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
asm ("pxor xmm"tostr(t0)", xmm"tostr(t0)"");\
asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\
asm ("movdqa xmm"tostr(i3)", xmm"tostr(i2)"");\
asm ("movdqa xmm"tostr(i5)", xmm"tostr(i4)"");\
asm ("movdqa xmm"tostr(i7)", xmm"tostr(i6)"");\
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(t0)"");\
asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(t0)"");\
asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t0)"");\
asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t0)"");\
asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t0)"");\
asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t0)"");\
asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t0)"");\
asm ("punpckhqdq xmm"tostr(i7)", xmm"tostr(t0)"");\
}/**/
/* Matrix Transpose Output Inverse Step 2
* input is one 512-bit state with one row in the low 64-bits of one xmm
* output is one 512-bit state with two rows in one xmm
* inputs: i0-i7 = (0|S)
* outputs: (i0, i2, i4, i6) = S
*/
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\
asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\
asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\
asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\
}/**/
void INIT256(u64* h)
{
/* __cdecl calling convention: */
/* chaining value CV in rdi */
asm (".intel_syntax noprefix");
asm volatile ("emms");
/* load IV into registers xmm12 - xmm15 */
asm ("movaps xmm12, [rdi+0*16]");
asm ("movaps xmm13, [rdi+1*16]");
asm ("movaps xmm14, [rdi+2*16]");
asm ("movaps xmm15, [rdi+3*16]");
/* transform chaining value from column ordering into row ordering */
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
/* store transposed IV */
asm ("movaps [rdi+0*16], xmm12");
asm ("movaps [rdi+1*16], xmm2");
asm ("movaps [rdi+2*16], xmm6");
asm ("movaps [rdi+3*16], xmm7");
asm volatile ("emms");
asm (".att_syntax noprefix");
}
void TF512(u64* h, u64* m)
{
/* __cdecl calling convention: */
/* chaining value CV in rdi */
/* message M in rsi */
#ifdef IACA_TRACE
IACA_START;
#endif
asm (".intel_syntax noprefix");
Push_All_Regs();
/* load message into registers xmm12 - xmm15 (Q = message) */
asm ("movaps xmm12, [rsi+0*16]");
asm ("movaps xmm13, [rsi+1*16]");
asm ("movaps xmm14, [rsi+2*16]");
asm ("movaps xmm15, [rsi+3*16]");
/* transform message M from column ordering into row ordering */
/* we first put two rows (2x64 bit) of the message into one 128-bit xmm register */
Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
/* load previous chaining value */
/* we first put two rows (64 bit) of the CV into one 128-bit xmm register */
asm ("movaps xmm8, [rdi+0*16]");
asm ("movaps xmm0, [rdi+1*16]");
asm ("movaps xmm4, [rdi+2*16]");
asm ("movaps xmm5, [rdi+3*16]");
/* xor message to CV get input of P */
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
asm ("pxor xmm8, xmm12");
asm ("pxor xmm0, xmm2");
asm ("pxor xmm4, xmm6");
asm ("pxor xmm5, xmm7");
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
/* result: the 8 rows of P and Q in xmm8 - xmm12 */
Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15);
/* compute the two permutations P and Q in parallel */
ROUNDS_P_Q();
/* unpack again to get two rows of P or two rows of Q in one xmm register */
Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3);
/* xor output of P and Q */
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
asm ("pxor xmm0, xmm8");
asm ("pxor xmm1, xmm10");
asm ("pxor xmm2, xmm12");
asm ("pxor xmm3, xmm14");
/* xor CV (feed-forward) */
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
asm ("pxor xmm0, [rdi+0*16]");
asm ("pxor xmm1, [rdi+1*16]");
asm ("pxor xmm2, [rdi+2*16]");
asm ("pxor xmm3, [rdi+3*16]");
/* store CV */
asm ("movaps [rdi+0*16], xmm0");
asm ("movaps [rdi+1*16], xmm1");
asm ("movaps [rdi+2*16], xmm2");
asm ("movaps [rdi+3*16], xmm3");
Pop_All_Regs();
asm (".att_syntax noprefix");
#ifdef IACA_TRACE
IACA_END;
#endif
return;
}
void OF512(u64* h)
{
/* __cdecl calling convention: */
/* chaining value CV in rdi */
asm (".intel_syntax noprefix");
Push_All_Regs();
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
asm ("movaps xmm8, [rdi+0*16]");
asm ("movaps xmm10, [rdi+1*16]");
asm ("movaps xmm12, [rdi+2*16]");
asm ("movaps xmm14, [rdi+3*16]");
/* there are now 2 rows of the CV in one xmm register */
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
/* result: the 8 input rows of P in xmm8 - xmm15 */
Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0);
/* compute the permutation P */
/* result: the output of P(CV) in xmm8 - xmm15 */
ROUNDS_P_Q();
/* unpack again to get two rows of P in one xmm register */
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15);
/* xor CV to P output (feed-forward) */
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
asm ("pxor xmm8, [rdi+0*16]");
asm ("pxor xmm10, [rdi+1*16]");
asm ("pxor xmm12, [rdi+2*16]");
asm ("pxor xmm14, [rdi+3*16]");
/* transform state back from row ordering into column ordering */
/* result: final hash value in xmm9, xmm11 */
Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0);
/* we only need to return the truncated half of the state */
asm ("movaps [rdi+2*16], xmm9");
asm ("movaps [rdi+3*16], xmm11");
Pop_All_Regs();
asm (".att_syntax noprefix");
return;
}

View File

@@ -0,0 +1,519 @@
/* groestl-asm-avx.h Aug 2011
*
* Groestl implementation with inline assembly using ssse3, sse4.1, aes and avx
* instructions.
* Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
*
* This code is placed in the public domain
*/
#include "hash-groestl256.h"
/* global variables */
__attribute__ ((aligned (32))) unsigned char ROUND_CONST_Lx[16];
__attribute__ ((aligned (32))) unsigned char ROUND_CONST_L0[ROUNDS512*16];
__attribute__ ((aligned (32))) unsigned char ROUND_CONST_L7[ROUNDS512*16];
__attribute__ ((aligned (32))) unsigned char ROUND_CONST_P[ROUNDS1024*16];
__attribute__ ((aligned (32))) unsigned char ROUND_CONST_Q[ROUNDS1024*16];
__attribute__ ((aligned (32))) unsigned char TRANSP_MASK[16];
__attribute__ ((aligned (32))) unsigned char SUBSH_MASK[8*16];
__attribute__ ((aligned (32))) unsigned char ALL_1B[32];
__attribute__ ((aligned (32))) unsigned char ALL_FF[32];
/* temporary variables */
__attribute__ ((aligned (32))) unsigned char TEMP[6*32];
#define tos(a) #a
#define tostr(a) tos(a)
#define SET_CONSTANTS() do{\
((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\
((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\
((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\
((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\
((u64*)SUBSH_MASK)[ 0] = 0x0c0f0104070b0e00ULL;\
((u64*)SUBSH_MASK)[ 1] = 0x03060a0d08020509ULL;\
((u64*)SUBSH_MASK)[ 2] = 0x0e090205000d0801ULL;\
((u64*)SUBSH_MASK)[ 3] = 0x04070c0f0a03060bULL;\
((u64*)SUBSH_MASK)[ 4] = 0x080b0306010f0a02ULL;\
((u64*)SUBSH_MASK)[ 5] = 0x05000e090c04070dULL;\
((u64*)SUBSH_MASK)[ 6] = 0x0a0d040702090c03ULL;\
((u64*)SUBSH_MASK)[ 7] = 0x0601080b0e05000fULL;\
((u64*)SUBSH_MASK)[ 8] = 0x0b0e0500030a0d04ULL;\
((u64*)SUBSH_MASK)[ 9] = 0x0702090c0f060108ULL;\
((u64*)SUBSH_MASK)[10] = 0x0d080601040c0f05ULL;\
((u64*)SUBSH_MASK)[11] = 0x00030b0e0907020aULL;\
((u64*)SUBSH_MASK)[12] = 0x0f0a0702050e0906ULL;\
((u64*)SUBSH_MASK)[13] = 0x01040d080b00030cULL;\
((u64*)SUBSH_MASK)[14] = 0x090c000306080b07ULL;\
((u64*)SUBSH_MASK)[15] = 0x02050f0a0d01040eULL;\
for(i = 0; i < ROUNDS512; i++)\
{\
((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\
((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\
((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\
((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\
}\
((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\
((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\
}while(0);
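/* Reference sketch (not compiled): a sanity check of the constants built
 * above. Multiplying i by 0x0101010101010101 replicates i into every byte,
 * so the low qword of ROUND_CONST_L0 for round i is the byte pattern
 * 0x00 0x10 ... 0x70 with i XORed into each byte, and ROUND_CONST_L7 is
 * the complemented pattern in the high qword. */
#if 0
#include <stdio.h>
#include <stdint.h>

int main(void)
{
    for (uint64_t i = 0; i < 10; i++) {
        uint64_t l0 = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;
        uint64_t l7 = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;
        printf("round %2llu: L0=%016llx  L7=%016llx\n",
               (unsigned long long)i, (unsigned long long)l0,
               (unsigned long long)l7);
    }
    return 0;
}
#endif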
#define Push_All_Regs() do{\
/* not using any...
asm("push rax");\
asm("push rbx");\
asm("push rcx");*/\
}while(0);
#define Pop_All_Regs() do{\
/* not using any...
asm("pop rcx");\
asm("pop rbx");\
asm("pop rax");*/\
}while(0);
/* xmm[i] will be multiplied by 2
* xmm[j] will be lost
* xmm[k] has to be all 0x1b
* xmm[z] has to be zero */
#define VMUL2(i, j, k, z){\
asm("vpcmpgtb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(i)"");\
asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\
asm("vpand xmm"tostr(j)", xmm"tostr(j)", xmm"tostr(k)"");\
asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\
}/**/
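/* Reference sketch (not compiled): VMUL2 is a branch-free doubling in
 * GF(2^8) with the AES reduction polynomial 0x11b. vpcmpgtb against zero
 * yields 0xFF in every byte lane whose sign bit is set, vpaddb doubles
 * each byte, and the masked 0x1b reduction is folded in with vpxor.
 * Scalar equivalent: */
#if 0
#include <stdint.h>

static inline uint8_t gf256_double(uint8_t b)
{
    uint8_t mask = (uint8_t)-(b >> 7);            /* 0xFF if top bit set */
    return (uint8_t)((b << 1) ^ (mask & 0x1b));   /* double, then reduce */
}
#endif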
/* xmm[i] will be multiplied by 2
* xmm[j] will be lost
* xmm[k] has to be all 0x1b
* xmm[z] has to be zero */
#define VMUL2v2(i, j, k, z){\
asm("vpblendvb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(k)", xmm"tostr(i)"");\
asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\
asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\
}/**/
/* Yet another implementation of MixBytes.
This time we use the formulae (3) from the paper "Byte Slicing Groestl".
Input: a0, ..., a7
Output: b0, ..., b7 = MixBytes(a0,...,a7).
but we use the relations:
t_i = a_i + a_{i+1}
x_i = t_i + t_{i+3}
y_i = t_i + t_{i+2} + a_{i+6}
z_i = 2*x_i
w_i = z_i + y_{i+4}
v_i = 2*w_i
b_i = v_{i+3} + y_{i+4}
We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
and then adding v_i computed in the meantime in registers xmm0..xmm7.
We almost fit into 16 registers, need only 3 spills to memory.
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
K. Matusiewicz, 2011/05/29 */
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* xmm"tostr(8..xmm"tostr(15 = a2 a3... a0 a1 */\
asm("vmovdqa xmm"tostr(b0)", xmm"tostr(a2)"");\
asm("vmovdqa xmm"tostr(b1)", xmm"tostr(a3)"");\
asm("vmovdqa xmm"tostr(b2)", xmm"tostr(a4)"");\
asm("vmovdqa xmm"tostr(b3)", xmm"tostr(a5)"");\
asm("vmovdqa xmm"tostr(b4)", xmm"tostr(a6)"");\
asm("vmovdqa xmm"tostr(b5)", xmm"tostr(a7)"");\
asm("vmovdqa xmm"tostr(b6)", xmm"tostr(a0)"");\
asm("vmovdqa xmm"tostr(b7)", xmm"tostr(a1)"");\
\
/* t_i = a_i + a_{i+1} */\
asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(a1)"");\
asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(a2)"");\
asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(a3)"");\
asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(a4)"");\
asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(a5)"");\
asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(a6)"");\
asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(a7)"");\
asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b6)"");\
\
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
asm("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(a4)"");\
asm("vpxor xmm"tostr(b1)", xmm"tostr(b1)", xmm"tostr(a5)"");\
asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(a6)"");\
asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a7)"");\
asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a0)"");\
asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a1)"");\
asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a2)"");\
asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a3)"");\
\
asm("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(a6)"");\
asm("vpxor xmm"tostr(b1)", xmm"tostr(b1)", xmm"tostr(a7)"");\
asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(a0)"");\
asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a1)"");\
asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a2)"");\
asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a3)"");\
asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a4)"");\
asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a5)"");\
\
/* spill values y_4, y_5 to memory */\
asm("vmovaps [TEMP+0*16], xmm"tostr(b0)"");\
asm("vmovaps [TEMP+1*16], xmm"tostr(b1)"");\
asm("vmovaps [TEMP+2*16], xmm"tostr(b2)"");\
\
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
asm("vmovdqa xmm"tostr(b0)", xmm"tostr(a0)"");\
asm("vmovdqa xmm"tostr(b1)", xmm"tostr(a1)"");\
asm("vmovaps [TEMP+3*16], xmm"tostr(a2)"");\
\
/* compute x_i = t_i + t_{i+3} */\
asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(a3)"");\
asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(a4)"");\
asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(a5)"");\
asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(a6)"");\
asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(a7)"");\
asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b0)"");\
asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b1)"");\
asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", [TEMP+3*16]");\
\
/*compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
asm("vmovaps xmm"tostr(b1)", [ALL_1B]");\
asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(b2)"");\
VMUL2(a7, b0, b1, b2);\
VMUL2(a6, b0, b1, b2);\
VMUL2(a5, b0, b1, b2);\
VMUL2(a4, b0, b1, b2);\
VMUL2(a3, b0, b1, b2);\
VMUL2(a2, b0, b1, b2);\
VMUL2(a1, b0, b1, b2);\
VMUL2(a0, b0, b1, b2);\
\
/* compute w_i : add y_{i+4} */\
asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", [TEMP+0*16]");\
asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", [TEMP+1*16]");\
asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", [TEMP+2*16]");\
asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b3)"");\
asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b4)"");\
asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b5)"");\
asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b6)"");\
asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b7)"");\
\
/*compute v_i: double w_i */\
VMUL2(a0, b0, b1, b2);\
VMUL2(a1, b0, b1, b2);\
VMUL2(a2, b0, b1, b2);\
VMUL2(a3, b0, b1, b2);\
VMUL2(a4, b0, b1, b2);\
VMUL2(a5, b0, b1, b2);\
VMUL2(a6, b0, b1, b2);\
VMUL2(a7, b0, b1, b2);\
\
/* b_i = v_{i+3} + y_{i+4} : add the v values into the y_4, y_5, ... registers */\
asm("vpxor xmm"tostr(b0)", xmm"tostr(a3)", [TEMP+0*16]");\
asm("vpxor xmm"tostr(b1)", xmm"tostr(a4)", [TEMP+1*16]");\
asm("vpxor xmm"tostr(b2)", xmm"tostr(a5)", [TEMP+2*16]");\
asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a6)"");\
asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a7)"");\
asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a0)"");\
asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a1)"");\
asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a2)"");\
}/*MixBytes*/
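/* Reference sketch (not compiled, not the production path): a scalar
 * rendering of the relation chain implemented above, on one 8-byte column
 * (indices mod 8, '+' is XOR, '2*' is GF(2^8) doubling as in the VMUL2
 * sketch; z_i is fused into w_i). */
#if 0
#include <stdint.h>

static inline uint8_t gf256_double(uint8_t b)   /* as in the VMUL2 sketch */
{
    return (uint8_t)((b << 1) ^ ((uint8_t)-(b >> 7) & 0x1b));
}

static void mixbytes_column(const uint8_t a[8], uint8_t b[8])
{
    uint8_t t[8], x[8], y[8], w[8], v[8];
    for (int i = 0; i < 8; i++) t[i] = a[i] ^ a[(i + 1) & 7];
    for (int i = 0; i < 8; i++) x[i] = t[i] ^ t[(i + 3) & 7];
    for (int i = 0; i < 8; i++) y[i] = t[i] ^ t[(i + 2) & 7] ^ a[(i + 6) & 7];
    for (int i = 0; i < 8; i++) w[i] = gf256_double(x[i]) ^ y[(i + 4) & 7];
    for (int i = 0; i < 8; i++) v[i] = gf256_double(w[i]);
    for (int i = 0; i < 8; i++) b[i] = v[(i + 3) & 7] ^ y[(i + 4) & 7];
}
#endif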
/* one round
* i = round number
* a0-a7 = input rows
* b0-b7 = output rows
*/
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* AddRoundConstant */\
asm ("vmovaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\
asm ("vpxor xmm"tostr(a0)", xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\
asm ("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(b1)"");\
asm ("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(b1)"");\
asm ("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b1)"");\
asm ("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b1)"");\
asm ("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b1)"");\
asm ("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b1)"");\
asm ("vpxor xmm"tostr(a7)", xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\
/* ShiftBytes + SubBytes (interleaved) */\
asm ("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(b0)"");\
asm ("vpshufb xmm"tostr(a0)", xmm"tostr(a0)", [SUBSH_MASK+0*16]");\
asm ("vaesenclast xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(b0)"");\
asm ("vpshufb xmm"tostr(a1)", xmm"tostr(a1)", [SUBSH_MASK+1*16]");\
asm ("vaesenclast xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(b0)"");\
asm ("vpshufb xmm"tostr(a2)", xmm"tostr(a2)", [SUBSH_MASK+2*16]");\
asm ("vaesenclast xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(b0)"");\
asm ("vpshufb xmm"tostr(a3)", xmm"tostr(a3)", [SUBSH_MASK+3*16]");\
asm ("vaesenclast xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b0)"");\
asm ("vpshufb xmm"tostr(a4)", xmm"tostr(a4)", [SUBSH_MASK+4*16]");\
asm ("vaesenclast xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b0)"");\
asm ("vpshufb xmm"tostr(a5)", xmm"tostr(a5)", [SUBSH_MASK+5*16]");\
asm ("vaesenclast xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b0)"");\
asm ("vpshufb xmm"tostr(a6)", xmm"tostr(a6)", [SUBSH_MASK+6*16]");\
asm ("vaesenclast xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b0)"");\
asm ("vpshufb xmm"tostr(a7)", xmm"tostr(a7)", [SUBSH_MASK+7*16]");\
asm ("vaesenclast xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b0)"");\
/* MixBytes */\
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
}
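/* Reference sketch (not compiled): the zero-key vaesenclast trick above
 * works because AESENCLAST applies ShiftRows, then SubBytes, then XORs the
 * round key. The preceding vpshufb mask is chosen so that AES's ShiftRows
 * lands each byte where Groestl's ShiftBytes wants it, and the all-zero key
 * makes the final XOR a no-op, leaving a pure SubBytes. One-row intrinsics
 * sketch; 'mask' would be the matching SUBSH_MASK entry (assumes SSSE3 and
 * AES-NI): */
#if 0
#include <tmmintrin.h>   /* _mm_shuffle_epi8 */
#include <wmmintrin.h>   /* _mm_aesenclast_si128 */

static inline __m128i sub_shift_row(__m128i row, __m128i mask)
{
    row = _mm_shuffle_epi8(row, mask);                     /* ShiftBytes */
    return _mm_aesenclast_si128(row, _mm_setzero_si128()); /* SubBytes   */
}
#endif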
/* 10 rounds, P and Q in parallel */
#define ROUNDS_P_Q(){\
ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
}
/* Matrix Transpose Step 1
* input is a 512-bit state with two columns in one xmm
* output is a 512-bit state with two rows in one xmm
* inputs: i0-i3
* outputs: i0, o1-o3
* clobbers: t0
*/
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
asm ("vmovaps xmm"tostr(t0)", [TRANSP_MASK]");\
\
asm ("vpshufb xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(t0)"");\
asm ("vpshufb xmm"tostr(i1)", xmm"tostr(i1)", xmm"tostr(t0)"");\
asm ("vpshufb xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(t0)"");\
asm ("vpshufb xmm"tostr(i3)", xmm"tostr(i3)", xmm"tostr(t0)"");\
\
asm ("vpunpckhwd xmm"tostr(o1)", xmm"tostr(i0)", xmm"tostr(i1)"");\
asm ("vpunpcklwd xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\
asm ("vpunpckhwd xmm"tostr(t0)", xmm"tostr(i2)", xmm"tostr(i3)"");\
asm ("vpunpcklwd xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\
\
asm ("vpshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\
asm ("vpshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\
asm ("vpshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\
asm ("vpshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\
\
asm ("vpunpckhdq xmm"tostr(o2)", xmm"tostr(i0)", xmm"tostr(i2)"");\
asm ("vpunpckhdq xmm"tostr(o3)", xmm"tostr(o1)", xmm"tostr(t0)"");\
asm ("vpunpckldq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i2)"");\
asm ("vpunpckldq xmm"tostr(o1)", xmm"tostr(o1)", xmm"tostr(t0)"");\
}/**/
/* Matrix Transpose Step 2
* input are two 512-bit states with two rows in one xmm
* output are two 512-bit states with one row of each state in one xmm
* inputs: i0-i3 = P, i4-i7 = Q
* outputs: (i0, o1-o7) = (P|Q)
* possible reassignments: (output reg = input reg)
* * i1 -> o3-7
* * i2 -> o5-7
* * i3 -> o7
* * i4 -> o3-7
* * i5 -> o6-7
*/
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
asm ("vpunpckhqdq xmm"tostr(o1)", xmm"tostr(i0)", xmm"tostr(i4)"");\
asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i4)"");\
asm ("vpunpcklqdq xmm"tostr(o2)", xmm"tostr(i1)", xmm"tostr(i5)"");\
asm ("vpunpckhqdq xmm"tostr(o3)", xmm"tostr(i1)", xmm"tostr(i5)"");\
asm ("vpunpcklqdq xmm"tostr(o4)", xmm"tostr(i2)", xmm"tostr(i6)"");\
asm ("vpunpckhqdq xmm"tostr(o5)", xmm"tostr(i2)", xmm"tostr(i6)"");\
asm ("vpunpcklqdq xmm"tostr(o6)", xmm"tostr(i3)", xmm"tostr(i7)"");\
asm ("vpunpckhqdq xmm"tostr(o7)", xmm"tostr(i3)", xmm"tostr(i7)"");\
}/**/
/* Matrix Transpose Inverse Step 2
* input are two 512-bit states with one row of each state in one xmm
* output are two 512-bit states with two rows in one xmm
* inputs: i0-i7 = (P|Q)
* outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
*/
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
asm ("vpunpckhqdq xmm"tostr(o0)", xmm"tostr(i0)", xmm"tostr(i1)"");\
asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\
asm ("vpunpckhqdq xmm"tostr(o1)", xmm"tostr(i2)", xmm"tostr(i3)"");\
asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\
asm ("vpunpckhqdq xmm"tostr(o2)", xmm"tostr(i4)", xmm"tostr(i5)"");\
asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\
asm ("vpunpckhqdq xmm"tostr(o3)", xmm"tostr(i6)", xmm"tostr(i7)"");\
asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\
}/**/
/* Matrix Transpose Output Step 2
* input is one 512-bit state with two rows in one xmm
* output is one 512-bit state with one row in the low 64-bits of one xmm
* inputs: i0,i2,i4,i6 = S
* outputs: (i0-7) = (0|S)
*/
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
asm ("vpxor xmm"tostr(t0)", xmm"tostr(t0)", xmm"tostr(t0)"");\
asm ("vpunpckhqdq xmm"tostr(i1)", xmm"tostr(i0)", xmm"tostr(t0)"");\
asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(t0)"");\
asm ("vpunpckhqdq xmm"tostr(i3)", xmm"tostr(i2)", xmm"tostr(t0)"");\
asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(t0)"");\
asm ("vpunpckhqdq xmm"tostr(i5)", xmm"tostr(i4)", xmm"tostr(t0)"");\
asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(t0)"");\
asm ("vpunpckhqdq xmm"tostr(i7)", xmm"tostr(i6)", xmm"tostr(t0)"");\
asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(t0)"");\
}/**/
/* Matrix Transpose Output Inverse Step 2
* input is one 512-bit state with one row in the low 64-bits of one xmm
* output is one 512-bit state with two rows in one xmm
* inputs: i0-i7 = (0|S)
* outputs: (i0, i2, i4, i6) = S
*/
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\
asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\
asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\
asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\
}/**/
void INIT256(u64* h)
{
/* SysV AMD64 calling convention: */
/* chaining value CV in rdi */
asm (".intel_syntax noprefix");
asm volatile ("emms");
/* load IV into registers xmm12 - xmm15 */
asm ("vmovaps xmm12, [rdi+0*16]");
asm ("vmovaps xmm13, [rdi+1*16]");
asm ("vmovaps xmm14, [rdi+2*16]");
asm ("vmovaps xmm15, [rdi+3*16]");
/* transform chaining value from column ordering into row ordering */
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
/* store transposed IV */
asm ("vmovaps [rdi+0*16], xmm12");
asm ("vmovaps [rdi+1*16], xmm2");
asm ("vmovaps [rdi+2*16], xmm6");
asm ("vmovaps [rdi+3*16], xmm7");
asm volatile ("emms");
asm (".att_syntax noprefix");
}
void TF512(u64* h, u64* m)
{
/* SysV AMD64 calling convention: */
/* chaining value CV in rdi */
/* message M in rsi */
#ifdef IACA_TRACE
IACA_START;
#endif
asm (".intel_syntax noprefix");
Push_All_Regs();
/* load message into registers xmm12 - xmm15 (Q = message) */
asm ("vmovaps xmm12, [rsi+0*16]");
asm ("vmovaps xmm13, [rsi+1*16]");
asm ("vmovaps xmm14, [rsi+2*16]");
asm ("vmovaps xmm15, [rsi+3*16]");
/* transform message M from column ordering into row ordering */
/* we first put two rows (64 bit) of the message into one 128-bit xmm register */
Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
/* load previous chaining value and xor message to CV to get input of P */
/* we first put two rows (2x64 bit) of the CV into one 128-bit xmm register */
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
asm ("vpxor xmm8, xmm12, [rdi+0*16]");
asm ("vpxor xmm0, xmm2, [rdi+1*16]");
asm ("vpxor xmm4, xmm6, [rdi+2*16]");
asm ("vpxor xmm5, xmm7, [rdi+3*16]");
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
/* result: the 8 rows of P and Q in xmm8 - xmm15 */
Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15);
/* compute the two permutations P and Q in parallel */
ROUNDS_P_Q();
/* unpack again to get two rows of P or two rows of Q in one xmm register */
Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3);
/* xor output of P and Q */
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
asm ("vpxor xmm0, xmm0, xmm8");
asm ("vpxor xmm1, xmm1, xmm10");
asm ("vpxor xmm2, xmm2, xmm12");
asm ("vpxor xmm3, xmm3, xmm14");
/* xor CV (feed-forward) */
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
asm ("vpxor xmm0, xmm0, [rdi+0*16]");
asm ("vpxor xmm1, xmm1, [rdi+1*16]");
asm ("vpxor xmm2, xmm2, [rdi+2*16]");
asm ("vpxor xmm3, xmm3, [rdi+3*16]");
/* store CV */
asm ("vmovaps [rdi+0*16], xmm0");
asm ("vmovaps [rdi+1*16], xmm1");
asm ("vmovaps [rdi+2*16], xmm2");
asm ("vmovaps [rdi+3*16], xmm3");
Pop_All_Regs();
asm (".att_syntax noprefix");
#ifdef IACA_TRACE
IACA_END;
#endif
return;
}
void OF512(u64* h)
{
/* SysV AMD64 calling convention: */
/* chaining value CV in rdi */
asm (".intel_syntax noprefix");
Push_All_Regs();
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
asm ("vmovaps xmm8, [rdi+0*16]");
asm ("vmovaps xmm10, [rdi+1*16]");
asm ("vmovaps xmm12, [rdi+2*16]");
asm ("vmovaps xmm14, [rdi+3*16]");
/* there are now 2 rows of the CV in one xmm register */
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
/* result: the 8 input rows of P in xmm8 - xmm15 */
Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0);
/* compute the permutation P */
/* result: the output of P(CV) in xmm8 - xmm15 */
ROUNDS_P_Q();
/* unpack again to get two rows of P in one xmm register */
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15);
/* xor CV to P output (feed-forward) */
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
asm ("vpxor xmm8, xmm8, [rdi+0*16]");
asm ("vpxor xmm10, xmm10, [rdi+1*16]");
asm ("vpxor xmm12, xmm12, [rdi+2*16]");
asm ("vpxor xmm14, xmm14, [rdi+3*16]");
/* transform state back from row ordering into column ordering */
/* result: final hash value in xmm9, xmm11 */
Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0);
/* we only need to return the truncated half of the state */
asm ("vmovaps [rdi+2*16], xmm9");
asm ("vmovaps [rdi+3*16], xmm11");
Pop_All_Regs();
asm (".att_syntax noprefix");
return;
}

View File

@@ -0,0 +1,856 @@
/* groestl-asm-vperm.h Aug 2011
*
* Groestl implementation with inline assembly using ssse3 instructions.
* Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
*
* Based on the vperm and aes_ni implementations of the hash function Groestl
* by Cagdas Calik <ccalik@metu.edu.tr> http://www.metu.edu.tr/~ccalik/
* Institute of Applied Mathematics, Middle East Technical University, Turkey
*
* This code is placed in the public domain
*/
#include "hash-groestl256.h"
/* global constants */
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Lx[16];
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L0[ROUNDS512*16];
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L7[ROUNDS512*16];
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_P[ROUNDS1024*16];
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Q[ROUNDS1024*16];
__attribute__ ((aligned (16))) unsigned char TRANSP_MASK[16];
__attribute__ ((aligned (16))) unsigned char SUBSH_MASK[8*16];
__attribute__ ((aligned (16))) unsigned char ALL_0F[16];
__attribute__ ((aligned (16))) unsigned char ALL_15[16];
__attribute__ ((aligned (16))) unsigned char ALL_1B[16];
__attribute__ ((aligned (16))) unsigned char ALL_63[16];
__attribute__ ((aligned (16))) unsigned char ALL_FF[16];
__attribute__ ((aligned (16))) unsigned char VPERM_IPT[2*16];
__attribute__ ((aligned (16))) unsigned char VPERM_OPT[2*16];
__attribute__ ((aligned (16))) unsigned char VPERM_INV[2*16];
__attribute__ ((aligned (16))) unsigned char VPERM_SB1[2*16];
__attribute__ ((aligned (16))) unsigned char VPERM_SB2[2*16];
__attribute__ ((aligned (16))) unsigned char VPERM_SB4[2*16];
__attribute__ ((aligned (16))) unsigned char VPERM_SBO[2*16];
/* temporary variables */
__attribute__ ((aligned (16))) unsigned char TEMP_MUL1[8*16];
__attribute__ ((aligned (16))) unsigned char TEMP_MUL2[8*16];
__attribute__ ((aligned (16))) unsigned char TEMP_MUL4[1*16];
__attribute__ ((aligned (16))) unsigned char QTEMP[8*16];
__attribute__ ((aligned (16))) unsigned char TEMP[8*16];
#define tos(a) #a
#define tostr(a) tos(a)
#define SET_SHARED_CONSTANTS(){\
((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\
((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\
((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\
((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\
((u64*)ALL_63)[ 0] = 0x6363636363636363ULL;\
((u64*)ALL_63)[ 1] = 0x6363636363636363ULL;\
((u64*)ALL_0F)[ 0] = 0x0F0F0F0F0F0F0F0FULL;\
((u64*)ALL_0F)[ 1] = 0x0F0F0F0F0F0F0F0FULL;\
((u64*)VPERM_IPT)[ 0] = 0x4C01307D317C4D00ULL;\
((u64*)VPERM_IPT)[ 1] = 0xCD80B1FCB0FDCC81ULL;\
((u64*)VPERM_IPT)[ 2] = 0xC2B2E8985A2A7000ULL;\
((u64*)VPERM_IPT)[ 3] = 0xCABAE09052227808ULL;\
((u64*)VPERM_OPT)[ 0] = 0x01EDBD5150BCEC00ULL;\
((u64*)VPERM_OPT)[ 1] = 0xE10D5DB1B05C0CE0ULL;\
((u64*)VPERM_OPT)[ 2] = 0xFF9F4929D6B66000ULL;\
((u64*)VPERM_OPT)[ 3] = 0xF7974121DEBE6808ULL;\
((u64*)VPERM_INV)[ 0] = 0x01040A060F0B0780ULL;\
((u64*)VPERM_INV)[ 1] = 0x030D0E0C02050809ULL;\
((u64*)VPERM_INV)[ 2] = 0x0E05060F0D080180ULL;\
((u64*)VPERM_INV)[ 3] = 0x040703090A0B0C02ULL;\
((u64*)VPERM_SB1)[ 0] = 0x3618D415FAE22300ULL;\
((u64*)VPERM_SB1)[ 1] = 0x3BF7CCC10D2ED9EFULL;\
((u64*)VPERM_SB1)[ 2] = 0xB19BE18FCB503E00ULL;\
((u64*)VPERM_SB1)[ 3] = 0xA5DF7A6E142AF544ULL;\
((u64*)VPERM_SB2)[ 0] = 0x69EB88400AE12900ULL;\
((u64*)VPERM_SB2)[ 1] = 0xC2A163C8AB82234AULL;\
((u64*)VPERM_SB2)[ 2] = 0xE27A93C60B712400ULL;\
((u64*)VPERM_SB2)[ 3] = 0x5EB7E955BC982FCDULL;\
((u64*)VPERM_SB4)[ 0] = 0x3D50AED7C393EA00ULL;\
((u64*)VPERM_SB4)[ 1] = 0xBA44FE79876D2914ULL;\
((u64*)VPERM_SB4)[ 2] = 0xE1E937A03FD64100ULL;\
((u64*)VPERM_SB4)[ 3] = 0xA876DE9749087E9FULL;\
/*((u64*)VPERM_SBO)[ 0] = 0xCFE474A55FBB6A00ULL;\
((u64*)VPERM_SBO)[ 1] = 0x8E1E90D1412B35FAULL;\
((u64*)VPERM_SBO)[ 2] = 0xD0D26D176FBDC700ULL;\
((u64*)VPERM_SBO)[ 3] = 0x15AABF7AC502A878ULL;*/\
((u64*)ALL_15)[ 0] = 0x1515151515151515ULL;\
((u64*)ALL_15)[ 1] = 0x1515151515151515ULL;\
}/**/
/* VPERM
 * Transform w/o setting c*
* transforms 2 rows to/from "vperm mode"
* this function is derived from:
 * vperm and aes_ni implementations of hash function Groestl
* by Cagdas CALIK
* inputs:
* a0, a1 = 2 rows
* table = transformation table to use
* t*, c* = clobbers
* outputs:
* a0, a1 = 2 rows transformed with table
* */
#define VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2){\
asm ("movdqa xmm"tostr(t0)", xmm"tostr(c0)"");\
asm ("movdqa xmm"tostr(t1)", xmm"tostr(c0)"");\
asm ("pandn xmm"tostr(t0)", xmm"tostr(a0)"");\
asm ("pandn xmm"tostr(t1)", xmm"tostr(a1)"");\
asm ("psrld xmm"tostr(t0)", 4");\
asm ("psrld xmm"tostr(t1)", 4");\
asm ("pand xmm"tostr(a0)", xmm"tostr(c0)"");\
asm ("pand xmm"tostr(a1)", xmm"tostr(c0)"");\
asm ("movdqa xmm"tostr(t2)", xmm"tostr(c2)"");\
asm ("movdqa xmm"tostr(t3)", xmm"tostr(c2)"");\
asm ("pshufb xmm"tostr(t2)", xmm"tostr(a0)"");\
asm ("pshufb xmm"tostr(t3)", xmm"tostr(a1)"");\
asm ("movdqa xmm"tostr(a0)", xmm"tostr(c1)"");\
asm ("movdqa xmm"tostr(a1)", xmm"tostr(c1)"");\
asm ("pshufb xmm"tostr(a0)", xmm"tostr(t0)"");\
asm ("pshufb xmm"tostr(a1)", xmm"tostr(t1)"");\
asm ("pxor xmm"tostr(a0)", xmm"tostr(t2)"");\
asm ("pxor xmm"tostr(a1)", xmm"tostr(t3)"");\
}/**/
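/* Reference sketch (not compiled): per byte, the transform above is two
 * 16-entry table lookups combined by XOR. pandn/psrld extract the high
 * nibble, pand keeps the low nibble, and pshufb does the lookups (c1 is
 * indexed by the high nibble, c2 by the low). Scalar model of one byte,
 * where hi_tbl/lo_tbl are the two 16-byte halves of a constant such as
 * VPERM_IPT: */
#if 0
#include <stdint.h>

static inline uint8_t vperm_transform_byte(uint8_t x,
                                           const uint8_t hi_tbl[16],
                                           const uint8_t lo_tbl[16])
{
    return hi_tbl[x >> 4] ^ lo_tbl[x & 0x0F];
}
#endif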
#define VPERM_Transform_Set_Const(table, c0, c1, c2){\
asm ("movaps xmm"tostr(c0)", [ALL_0F]");\
asm ("movaps xmm"tostr(c1)", ["tostr(table)"+0*16]");\
asm ("movaps xmm"tostr(c2)", ["tostr(table)"+1*16]");\
}/**/
/* VPERM
* Transform
* transforms 2 rows to/from "vperm mode"
* this function is derived from:
 * vperm and aes_ni implementations of hash function Groestl
* by Cagdas CALIK
* inputs:
* a0, a1 = 2 rows
* table = transformation table to use
* t*, c* = clobbers
* outputs:
* a0, a1 = 2 rows transformed with table
* */
#define VPERM_Transform(a0, a1, table, t0, t1, t2, t3, c0, c1, c2){\
VPERM_Transform_Set_Const(table, c0, c1, c2);\
VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\
}/**/
/* VPERM
* Transform State
* inputs:
* a0-a3 = state
* table = transformation table to use
* t* = clobbers
* outputs:
* a0-a3 = transformed state
* */
#define VPERM_Transform_State(a0, a1, a2, a3, table, t0, t1, t2, t3, c0, c1, c2){\
VPERM_Transform_Set_Const(table, c0, c1, c2);\
VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\
VPERM_Transform_No_Const(a2, a3, t0, t1, t2, t3, c0, c1, c2);\
}/**/
/* VPERM
* Add Constant to State
* inputs:
* a0-a7 = state
* constant = constant to add
* t0 = clobber
* outputs:
* a0-a7 = state + constant
* */
#define VPERM_Add_Constant(a0, a1, a2, a3, a4, a5, a6, a7, constant, t0){\
asm ("movaps xmm"tostr(t0)", ["tostr(constant)"]");\
asm ("pxor xmm"tostr(a0)", xmm"tostr(t0)"");\
asm ("pxor xmm"tostr(a1)", xmm"tostr(t0)"");\
asm ("pxor xmm"tostr(a2)", xmm"tostr(t0)"");\
asm ("pxor xmm"tostr(a3)", xmm"tostr(t0)"");\
asm ("pxor xmm"tostr(a4)", xmm"tostr(t0)"");\
asm ("pxor xmm"tostr(a5)", xmm"tostr(t0)"");\
asm ("pxor xmm"tostr(a6)", xmm"tostr(t0)"");\
asm ("pxor xmm"tostr(a7)", xmm"tostr(t0)"");\
}/**/
/* VPERM
* Set Substitute Core Constants
* */
#define VPERM_Substitute_Core_Set_Const(c0, c1, c2){\
VPERM_Transform_Set_Const(VPERM_INV, c0, c1, c2);\
}/**/
/* VPERM
* Substitute Core
* first part of sbox inverse computation
* this function is derived from:
 * vperm and aes_ni implementations of hash function Groestl
* by Cagdas CALIK
* inputs:
* a0 = 1 row
* t*, c* = clobbers
* outputs:
* b0a, b0b = inputs for lookup step
* */
#define VPERM_Substitute_Core(a0, b0a, b0b, t0, t1, c0, c1, c2){\
asm ("movdqa xmm"tostr(t0)", xmm"tostr(c0)"");\
asm ("pandn xmm"tostr(t0)", xmm"tostr(a0)"");\
asm ("psrld xmm"tostr(t0)", 4");\
asm ("pand xmm"tostr(a0)", xmm"tostr(c0)"");\
asm ("movdqa xmm"tostr(b0a)", "tostr(c1)"");\
asm ("pshufb xmm"tostr(b0a)", xmm"tostr(a0)"");\
asm ("pxor xmm"tostr(a0)", xmm"tostr(t0)"");\
asm ("movdqa xmm"tostr(b0b)", xmm"tostr(c2)"");\
asm ("pshufb xmm"tostr(b0b)", xmm"tostr(t0)"");\
asm ("pxor xmm"tostr(b0b)", xmm"tostr(b0a)"");\
asm ("movdqa xmm"tostr(t1)", xmm"tostr(c2)"");\
asm ("pshufb xmm"tostr(t1)", xmm"tostr(a0)"");\
asm ("pxor xmm"tostr(t1)", xmm"tostr(b0a)"");\
asm ("movdqa xmm"tostr(b0a)", xmm"tostr(c2)"");\
asm ("pshufb xmm"tostr(b0a)", xmm"tostr(b0b)"");\
asm ("pxor xmm"tostr(b0a)", xmm"tostr(a0)"");\
asm ("movdqa xmm"tostr(b0b)", xmm"tostr(c2)"");\
asm ("pshufb xmm"tostr(b0b)", xmm"tostr(t1)"");\
asm ("pxor xmm"tostr(b0b)", xmm"tostr(t0)"");\
}/**/
/* VPERM
* Lookup
* second part of sbox inverse computation
* this function is derived from:
 * vperm and aes_ni implementations of hash function Groestl
* by Cagdas CALIK
* inputs:
* a0a, a0b = output of Substitution Core
* table = lookup table to use (*1 / *2 / *4)
* t0 = clobber
* outputs:
* b0 = output of sbox + multiplication
* */
#define VPERM_Lookup(a0a, a0b, table, b0, t0){\
asm ("movaps xmm"tostr(b0)", ["tostr(table)"+0*16]");\
asm ("movaps xmm"tostr(t0)", ["tostr(table)"+1*16]");\
asm ("pshufb xmm"tostr(b0)", xmm"tostr(a0b)"");\
asm ("pshufb xmm"tostr(t0)", xmm"tostr(a0a)"");\
asm ("pxor xmm"tostr(b0)", xmm"tostr(t0)"");\
}/**/
/* VPERM
* SubBytes and *2 / *4
* this function is derived from:
* Constant-time SSSE3 AES core implementation
* by Mike Hamburg
* and
 * vperm and aes_ni implementations of hash function Groestl
* by Cagdas CALIK
* inputs:
* a0-a7 = state
* t*, c* = clobbers
* outputs:
* a0-a7 = state * 4
* c2 = row0 * 2 -> b0
* c1 = row7 * 2 -> b3
* c0 = row7 * 1 -> b4
* t2 = row4 * 1 -> b7
* TEMP_MUL1 = row(i) * 1
* TEMP_MUL2 = row(i) * 2
*
* call:VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7) */
#define VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, t0, t1, t3, t4, c2, c1, c0, t2){\
/* set Constants */\
VPERM_Substitute_Core_Set_Const(c0, c1, c2);\
/* row 1 */\
VPERM_Substitute_Core(a1, t0, t1, t3, t4, c0, xmm##c1, c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
asm ("movaps [TEMP_MUL1+1*16], xmm"tostr(t2)"");\
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
asm ("movaps [TEMP_MUL2+1*16], xmm"tostr(t3)"");\
VPERM_Lookup(t0, t1, VPERM_SB4, a1, t4);\
/* --- */\
/* row 2 */\
VPERM_Substitute_Core(a2, t0, t1, t3, t4, c0, xmm##c1, c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
asm ("movaps [TEMP_MUL1+2*16], xmm"tostr(t2)"");\
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
asm ("movaps [TEMP_MUL2+2*16], xmm"tostr(t3)"");\
VPERM_Lookup(t0, t1, VPERM_SB4, a2, t4);\
/* --- */\
/* row 3 */\
VPERM_Substitute_Core(a3, t0, t1, t3, t4, c0, xmm##c1, c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
asm ("movaps [TEMP_MUL1+3*16], xmm"tostr(t2)"");\
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
asm ("movaps [TEMP_MUL2+3*16], xmm"tostr(t3)"");\
VPERM_Lookup(t0, t1, VPERM_SB4, a3, t4);\
/* --- */\
/* row 5 */\
VPERM_Substitute_Core(a5, t0, t1, t3, t4, c0, xmm##c1, c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
asm ("movaps [TEMP_MUL1+5*16], xmm"tostr(t2)"");\
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
asm ("movaps [TEMP_MUL2+5*16], xmm"tostr(t3)"");\
VPERM_Lookup(t0, t1, VPERM_SB4, a5, t4);\
/* --- */\
/* row 6 */\
VPERM_Substitute_Core(a6, t0, t1, t3, t4, c0, xmm##c1, c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
asm ("movaps [TEMP_MUL1+6*16], xmm"tostr(t2)"");\
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
asm ("movaps [TEMP_MUL2+6*16], xmm"tostr(t3)"");\
VPERM_Lookup(t0, t1, VPERM_SB4, a6, t4);\
/* --- */\
/* row 7 */\
VPERM_Substitute_Core(a7, t0, t1, t3, t4, c0, xmm##c1, c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
asm ("movaps [TEMP_MUL1+7*16], xmm"tostr(t2)"");\
VPERM_Lookup(t0, t1, VPERM_SB2, c1, t4); /*c1 -> b3*/\
VPERM_Lookup(t0, t1, VPERM_SB4, a7, t4);\
/* --- */\
/* row 4 */\
VPERM_Substitute_Core(a4, t0, t1, t3, t4, c0, [VPERM_INV+0*16], c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4); /*t2 -> b7*/\
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
asm ("movaps [TEMP_MUL2+4*16], xmm"tostr(t3)"");\
VPERM_Lookup(t0, t1, VPERM_SB4, a4, t4);\
/* --- */\
/* row 0 */\
VPERM_Substitute_Core(a0, t0, t1, t3, t4, c0, [VPERM_INV+0*16], c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, c0, t4); /*c0 -> b4*/\
VPERM_Lookup(t0, t1, VPERM_SB2, c2, t4); /*c2 -> b0*/\
asm ("movaps [TEMP_MUL2+0*16], xmm"tostr(c2)"");\
VPERM_Lookup(t0, t1, VPERM_SB4, a0, t4);\
/* --- */\
}/**/
/* Optimized MixBytes
* inputs:
* a0-a7 = (row0-row7) * 4
* b0 = row0 * 2
* b3 = row7 * 2
* b4 = row7 * 1
* b7 = row4 * 1
* all *1 and *2 values must also be in TEMP_MUL1, TEMP_MUL2
* output: b0-b7
* */
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* save one value */\
asm ("movaps [TEMP_MUL4], xmm"tostr(a3)"");\
/* 1 */\
asm ("movdqa xmm"tostr(b1)", xmm"tostr(a0)"");\
asm ("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\
asm ("pxor xmm"tostr(b1)", xmm"tostr(b4)""); /* -> helper! */\
asm ("pxor xmm"tostr(b1)", [TEMP_MUL2+3*16]");\
asm ("movdqa xmm"tostr(b2)", xmm"tostr(b1)"");\
\
/* 2 */\
asm ("movdqa xmm"tostr(b5)", xmm"tostr(a1)"");\
asm ("pxor xmm"tostr(b5)", xmm"tostr(a4)"");\
asm ("pxor xmm"tostr(b5)", xmm"tostr(b7)""); /* -> helper! */\
asm ("pxor xmm"tostr(b5)", xmm"tostr(b3)""); /* -> helper! */\
asm ("movdqa xmm"tostr(b6)", xmm"tostr(b5)"");\
\
/* 4 */\
asm ("pxor xmm"tostr(b7)", xmm"tostr(a6)"");\
/*asm ("pxor xmm"tostr(b7)", [TEMP_MUL1+4*16]"); -> helper! */\
asm ("pxor xmm"tostr(b7)", [TEMP_MUL1+6*16]");\
asm ("pxor xmm"tostr(b7)", [TEMP_MUL2+1*16]");\
asm ("pxor xmm"tostr(b7)", xmm"tostr(b3)""); /* -> helper! */\
asm ("pxor xmm"tostr(b2)", xmm"tostr(b7)"");\
\
/* 3 */\
asm ("pxor xmm"tostr(b0)", xmm"tostr(a7)"");\
asm ("pxor xmm"tostr(b0)", [TEMP_MUL1+5*16]");\
asm ("pxor xmm"tostr(b0)", [TEMP_MUL1+7*16]");\
/*asm ("pxor xmm"tostr(b0)", [TEMP_MUL2+0*16]"); -> helper! */\
asm ("pxor xmm"tostr(b0)", [TEMP_MUL2+2*16]");\
asm ("movdqa xmm"tostr(b3)", xmm"tostr(b0)"");\
asm ("pxor xmm"tostr(b1)", xmm"tostr(b0)"");\
asm ("pxor xmm"tostr(b0)", xmm"tostr(b7)""); /* moved from 4 */\
\
/* 5 */\
asm ("pxor xmm"tostr(b4)", xmm"tostr(a2)"");\
/*asm ("pxor xmm"tostr(b4)", [TEMP_MUL1+0*16]"); -> helper! */\
asm ("pxor xmm"tostr(b4)", [TEMP_MUL1+2*16]");\
asm ("pxor xmm"tostr(b4)", [TEMP_MUL2+3*16]");\
asm ("pxor xmm"tostr(b4)", [TEMP_MUL2+5*16]");\
asm ("pxor xmm"tostr(b3)", xmm"tostr(b4)"");\
asm ("pxor xmm"tostr(b6)", xmm"tostr(b4)"");\
\
/* 6 */\
asm ("pxor xmm"tostr(a3)", [TEMP_MUL1+1*16]");\
asm ("pxor xmm"tostr(a3)", [TEMP_MUL1+3*16]");\
asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+4*16]");\
asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+6*16]");\
asm ("pxor xmm"tostr(b4)", xmm"tostr(a3)"");\
asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\
asm ("pxor xmm"tostr(b7)", xmm"tostr(a3)"");\
\
/* 7 */\
asm ("pxor xmm"tostr(a1)", [TEMP_MUL1+1*16]");\
asm ("pxor xmm"tostr(a1)", [TEMP_MUL2+4*16]");\
asm ("pxor xmm"tostr(b2)", xmm"tostr(a1)"");\
asm ("pxor xmm"tostr(b3)", xmm"tostr(a1)"");\
\
/* 8 */\
asm ("pxor xmm"tostr(a5)", [TEMP_MUL1+5*16]");\
asm ("pxor xmm"tostr(a5)", [TEMP_MUL2+0*16]");\
asm ("pxor xmm"tostr(b6)", xmm"tostr(a5)"");\
asm ("pxor xmm"tostr(b7)", xmm"tostr(a5)"");\
\
/* 9 */\
asm ("movaps xmm"tostr(a3)", [TEMP_MUL1+2*16]");\
asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+5*16]");\
asm ("pxor xmm"tostr(b0)", xmm"tostr(a3)"");\
asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\
\
/* 10 */\
asm ("movaps xmm"tostr(a1)", [TEMP_MUL1+6*16]");\
asm ("pxor xmm"tostr(a1)", [TEMP_MUL2+1*16]");\
asm ("pxor xmm"tostr(b1)", xmm"tostr(a1)"");\
asm ("pxor xmm"tostr(b4)", xmm"tostr(a1)"");\
\
/* 11 */\
asm ("movaps xmm"tostr(a5)", [TEMP_MUL1+3*16]");\
asm ("pxor xmm"tostr(a5)", [TEMP_MUL2+6*16]");\
asm ("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\
asm ("pxor xmm"tostr(b6)", xmm"tostr(a5)"");\
\
/* 12 */\
asm ("movaps xmm"tostr(a3)", [TEMP_MUL1+7*16]");\
asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+2*16]");\
asm ("pxor xmm"tostr(b2)", xmm"tostr(a3)"");\
asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\
\
/* 13 */\
asm ("pxor xmm"tostr(b0)", [TEMP_MUL4]");\
asm ("pxor xmm"tostr(b0)", xmm"tostr(a4)"");\
asm ("pxor xmm"tostr(b1)", xmm"tostr(a4)"");\
asm ("pxor xmm"tostr(b3)", xmm"tostr(a6)"");\
asm ("pxor xmm"tostr(b4)", xmm"tostr(a0)"");\
asm ("pxor xmm"tostr(b4)", xmm"tostr(a7)"");\
asm ("pxor xmm"tostr(b5)", xmm"tostr(a0)"");\
asm ("pxor xmm"tostr(b7)", xmm"tostr(a2)"");\
}/**/
//#if (LENGTH <= 256)
#define SET_CONSTANTS(){\
SET_SHARED_CONSTANTS();\
((u64*)SUBSH_MASK)[ 0] = 0x0706050403020100ULL;\
((u64*)SUBSH_MASK)[ 1] = 0x080f0e0d0c0b0a09ULL;\
((u64*)SUBSH_MASK)[ 2] = 0x0007060504030201ULL;\
((u64*)SUBSH_MASK)[ 3] = 0x0a09080f0e0d0c0bULL;\
((u64*)SUBSH_MASK)[ 4] = 0x0100070605040302ULL;\
((u64*)SUBSH_MASK)[ 5] = 0x0c0b0a09080f0e0dULL;\
((u64*)SUBSH_MASK)[ 6] = 0x0201000706050403ULL;\
((u64*)SUBSH_MASK)[ 7] = 0x0e0d0c0b0a09080fULL;\
((u64*)SUBSH_MASK)[ 8] = 0x0302010007060504ULL;\
((u64*)SUBSH_MASK)[ 9] = 0x0f0e0d0c0b0a0908ULL;\
((u64*)SUBSH_MASK)[10] = 0x0403020100070605ULL;\
((u64*)SUBSH_MASK)[11] = 0x09080f0e0d0c0b0aULL;\
((u64*)SUBSH_MASK)[12] = 0x0504030201000706ULL;\
((u64*)SUBSH_MASK)[13] = 0x0b0a09080f0e0d0cULL;\
((u64*)SUBSH_MASK)[14] = 0x0605040302010007ULL;\
((u64*)SUBSH_MASK)[15] = 0x0d0c0b0a09080f0eULL;\
for(i = 0; i < ROUNDS512; i++)\
{\
((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\
((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\
((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\
((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\
}\
((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\
((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\
}/**/
#define Push_All_Regs(){\
/* not using any...
asm("push rax");\
asm("push rbx");\
asm("push rcx");*/\
}/**/
#define Pop_All_Regs(){\
/* not using any...
asm("pop rcx");\
asm("pop rbx");\
asm("pop rax");*/\
}/**/
/* vperm:
* transformation before rounds with ipt
* first round add transformed constant
* middle rounds: add constant XOR 0x15...15
* last round: additionally add 0x15...15 after MB
* transformation after rounds with opt
*/
/* one round
* i = round number
* a0-a7 = input rows
* b0-b7 = output rows
*/
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* AddRoundConstant + ShiftBytes (interleaved) */\
asm ("movaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\
asm ("pxor xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\
asm ("pxor xmm"tostr(a1)", xmm"tostr(b1)"");\
asm ("pxor xmm"tostr(a2)", xmm"tostr(b1)"");\
asm ("pxor xmm"tostr(a3)", xmm"tostr(b1)"");\
asm ("pshufb xmm"tostr(a0)", [SUBSH_MASK+0*16]");\
asm ("pshufb xmm"tostr(a1)", [SUBSH_MASK+1*16]");\
asm ("pxor xmm"tostr(a4)", xmm"tostr(b1)"");\
asm ("pshufb xmm"tostr(a2)", [SUBSH_MASK+2*16]");\
asm ("pshufb xmm"tostr(a3)", [SUBSH_MASK+3*16]");\
asm ("pxor xmm"tostr(a5)", xmm"tostr(b1)"");\
asm ("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\
asm ("pshufb xmm"tostr(a4)", [SUBSH_MASK+4*16]");\
asm ("pshufb xmm"tostr(a5)", [SUBSH_MASK+5*16]");\
asm ("pxor xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\
asm ("pshufb xmm"tostr(a6)", [SUBSH_MASK+6*16]");\
asm ("pshufb xmm"tostr(a7)", [SUBSH_MASK+7*16]");\
/* SubBytes + Multiplication by 2 and 4 */\
VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\
/* MixBytes */\
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
}/**/
/* 10 rounds, P and Q in parallel */
#define ROUNDS_P_Q(){\
VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 0);\
ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 0);\
}
/* Matrix Transpose Step 1
* input is a 512-bit state with two columns in one xmm
* output is a 512-bit state with two rows in one xmm
* inputs: i0-i3
* outputs: i0, o1-o3
* clobbers: t0
*/
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\
\
asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\
asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\
asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\
asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\
\
asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\
asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\
\
asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\
asm ("punpckhwd xmm"tostr(o1)", xmm"tostr(i1)"");\
asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\
asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i3)"");\
\
asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\
asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\
asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\
asm ("pshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\
\
asm ("movdqa xmm"tostr(o2)", xmm"tostr(i0)"");\
asm ("movdqa xmm"tostr(o3)", xmm"tostr(o1)"");\
\
asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\
asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t0)"");\
asm ("punpckhdq xmm"tostr(o2)", xmm"tostr(i2)"");\
asm ("punpckhdq xmm"tostr(o3)", xmm"tostr(t0)"");\
}/**/
/* Matrix Transpose Step 2
* input are two 512-bit states with two rows in one xmm
* output are two 512-bit states with one row of each state in one xmm
* inputs: i0-i3 = P, i4-i7 = Q
* outputs: (i0, o1-o7) = (P|Q)
* possible reassignments: (output reg = input reg)
* * i1 -> o3-7
* * i2 -> o5-7
* * i3 -> o7
* * i4 -> o3-7
* * i5 -> o6-7
*/
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\
asm ("movdqa xmm"tostr(o2)", xmm"tostr(i1)"");\
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\
asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i4)"");\
asm ("movdqa xmm"tostr(o3)", xmm"tostr(i1)"");\
asm ("movdqa xmm"tostr(o4)", xmm"tostr(i2)"");\
asm ("punpcklqdq xmm"tostr(o2)", xmm"tostr(i5)"");\
asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i5)"");\
asm ("movdqa xmm"tostr(o5)", xmm"tostr(i2)"");\
asm ("movdqa xmm"tostr(o6)", xmm"tostr(i3)"");\
asm ("punpcklqdq xmm"tostr(o4)", xmm"tostr(i6)"");\
asm ("punpckhqdq xmm"tostr(o5)", xmm"tostr(i6)"");\
asm ("movdqa xmm"tostr(o7)", xmm"tostr(i3)"");\
asm ("punpcklqdq xmm"tostr(o6)", xmm"tostr(i7)"");\
asm ("punpckhqdq xmm"tostr(o7)", xmm"tostr(i7)"");\
}/**/
/* Matrix Transpose Inverse Step 2
* input are two 512-bit states with one row of each state in one xmm
* output are two 512-bit states with two rows in one xmm
* inputs: i0-i7 = (P|Q)
* outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
*/
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\
asm ("punpckhqdq xmm"tostr(o0)", xmm"tostr(i1)"");\
asm ("movdqa xmm"tostr(o1)", xmm"tostr(i2)"");\
asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\
asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i3)"");\
asm ("movdqa xmm"tostr(o2)", xmm"tostr(i4)"");\
asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\
asm ("punpckhqdq xmm"tostr(o2)", xmm"tostr(i5)"");\
asm ("movdqa xmm"tostr(o3)", xmm"tostr(i6)"");\
asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\
asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i7)"");\
}/**/
/* Matrix Transpose Output Step 2
* input is one 512-bit state with two rows in one xmm
* output is one 512-bit state with one row in the low 64-bits of one xmm
* inputs: i0,i2,i4,i6 = S
* outputs: (i0-7) = (0|S)
*/
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
asm ("pxor xmm"tostr(t0)", xmm"tostr(t0)"");\
asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\
asm ("movdqa xmm"tostr(i3)", xmm"tostr(i2)"");\
asm ("movdqa xmm"tostr(i5)", xmm"tostr(i4)"");\
asm ("movdqa xmm"tostr(i7)", xmm"tostr(i6)"");\
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(t0)"");\
asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(t0)"");\
asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t0)"");\
asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t0)"");\
asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t0)"");\
asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t0)"");\
asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t0)"");\
asm ("punpckhqdq xmm"tostr(i7)", xmm"tostr(t0)"");\
}/**/
/* Matrix Transpose Output Inverse Step 2
* input is one 512-bit state with one row in the low 64-bits of one xmm
* output is one 512-bit state with two rows in one xmm
* inputs: i0-i7 = (0|S)
* outputs: (i0, i2, i4, i6) = S
*/
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\
asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\
asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\
asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\
}/**/
/* transform round constants into VPERM mode */
#define VPERM_Transform_RoundConst_CNT2(i, j){\
asm ("movaps xmm0, [ROUND_CONST_L0+"tostr(i)"*16]");\
asm ("movaps xmm1, [ROUND_CONST_L7+"tostr(i)"*16]");\
asm ("movaps xmm2, [ROUND_CONST_L0+"tostr(j)"*16]");\
asm ("movaps xmm3, [ROUND_CONST_L7+"tostr(j)"*16]");\
VPERM_Transform_State(0, 1, 2, 3, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\
asm ("pxor xmm0, [ALL_15]");\
asm ("pxor xmm1, [ALL_15]");\
asm ("pxor xmm2, [ALL_15]");\
asm ("pxor xmm3, [ALL_15]");\
asm ("movaps [ROUND_CONST_L0+"tostr(i)"*16], xmm0");\
asm ("movaps [ROUND_CONST_L7+"tostr(i)"*16], xmm1");\
asm ("movaps [ROUND_CONST_L0+"tostr(j)"*16], xmm2");\
asm ("movaps [ROUND_CONST_L7+"tostr(j)"*16], xmm3");\
}/**/
/* transform round constants into VPERM mode */
#define VPERM_Transform_RoundConst(){\
asm ("movaps xmm0, [ROUND_CONST_Lx]");\
VPERM_Transform(0, 1, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\
asm ("pxor xmm0, [ALL_15]");\
asm ("movaps [ROUND_CONST_Lx], xmm0");\
VPERM_Transform_RoundConst_CNT2(0, 1);\
VPERM_Transform_RoundConst_CNT2(2, 3);\
VPERM_Transform_RoundConst_CNT2(4, 5);\
VPERM_Transform_RoundConst_CNT2(6, 7);\
VPERM_Transform_RoundConst_CNT2(8, 9);\
}/**/
void INIT256(u64* h)
{
/* SysV AMD64 calling convention: */
/* chaining value CV in rdi */
asm (".intel_syntax noprefix");
asm volatile ("emms");
/* transform round constants into VPERM mode */
VPERM_Transform_RoundConst();
/* load IV into registers xmm12 - xmm15 */
asm ("movaps xmm12, [rdi+0*16]");
asm ("movaps xmm13, [rdi+1*16]");
asm ("movaps xmm14, [rdi+2*16]");
asm ("movaps xmm15, [rdi+3*16]");
/* transform chaining value from column ordering into row ordering */
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7);
Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
/* store transposed IV */
asm ("movaps [rdi+0*16], xmm12");
asm ("movaps [rdi+1*16], xmm2");
asm ("movaps [rdi+2*16], xmm6");
asm ("movaps [rdi+3*16], xmm7");
asm volatile ("emms");
asm (".att_syntax noprefix");
}
void TF512(u64* h, u64* m)
{
/* SysV AMD64 calling convention: */
/* chaining value CV in rdi */
/* message M in rsi */
#ifdef IACA_TRACE
IACA_START;
#endif
asm (".intel_syntax noprefix");
Push_All_Regs();
/* load message into registers xmm12 - xmm15 (Q = message) */
asm ("movaps xmm12, [rsi+0*16]");
asm ("movaps xmm13, [rsi+1*16]");
asm ("movaps xmm14, [rsi+2*16]");
asm ("movaps xmm15, [rsi+3*16]");
/* transform message M from column ordering into row ordering */
/* we first put two rows (64 bit) of the message into one 128-bit xmm register */
VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7);
Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
/* load previous chaining value */
/* we first put two rows (64 bit) of the CV into one 128-bit xmm register */
asm ("movaps xmm8, [rdi+0*16]");
asm ("movaps xmm0, [rdi+1*16]");
asm ("movaps xmm4, [rdi+2*16]");
asm ("movaps xmm5, [rdi+3*16]");
/* xor message to CV to get input of P */
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
asm ("pxor xmm8, xmm12");
asm ("pxor xmm0, xmm2");
asm ("pxor xmm4, xmm6");
asm ("pxor xmm5, xmm7");
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
/* result: the 8 rows of P and Q in xmm8 - xmm15 */
Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15);
/* compute the two permutations P and Q in parallel */
ROUNDS_P_Q();
/* unpack again to get two rows of P or two rows of Q in one xmm register */
Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3);
/* xor output of P and Q */
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
asm ("pxor xmm0, xmm8");
asm ("pxor xmm1, xmm10");
asm ("pxor xmm2, xmm12");
asm ("pxor xmm3, xmm14");
/* xor CV (feed-forward) */
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
asm ("pxor xmm0, [rdi+0*16]");
asm ("pxor xmm1, [rdi+1*16]");
asm ("pxor xmm2, [rdi+2*16]");
asm ("pxor xmm3, [rdi+3*16]");
/* store CV */
asm ("movaps [rdi+0*16], xmm0");
asm ("movaps [rdi+1*16], xmm1");
asm ("movaps [rdi+2*16], xmm2");
asm ("movaps [rdi+3*16], xmm3");
Pop_All_Regs();
asm (".att_syntax noprefix");
#ifdef IACA_TRACE
IACA_END;
#endif
return;
}
void OF512(u64* h)
{
/* SysV AMD64 calling convention: */
/* chaining value CV in rdi */
asm (".intel_syntax noprefix");
Push_All_Regs();
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
asm ("movaps xmm8, [rdi+0*16]");
asm ("movaps xmm10, [rdi+1*16]");
asm ("movaps xmm12, [rdi+2*16]");
asm ("movaps xmm14, [rdi+3*16]");
/* there are now 2 rows of the CV in one xmm register */
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
/* result: the 8 input rows of P in xmm8 - xmm15 */
Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0);
/* compute the permutation P */
/* result: the output of P(CV) in xmm8 - xmm15 */
ROUNDS_P_Q();
/* unpack again to get two rows of P in one xmm register */
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15);
/* xor CV to P output (feed-forward) */
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
asm ("pxor xmm8, [rdi+0*16]");
asm ("pxor xmm10, [rdi+1*16]");
asm ("pxor xmm12, [rdi+2*16]");
asm ("pxor xmm14, [rdi+3*16]");
/* transform state back from row ordering into column ordering */
/* result: final hash value in xmm9, xmm11 */
Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0);
VPERM_Transform(9, 11, VPERM_OPT, 0, 1, 2, 3, 5, 6, 7);
/* we only need to return the truncated half of the state */
asm ("movaps [rdi+2*16], xmm9");
asm ("movaps [rdi+3*16], xmm11");
Pop_All_Regs();
asm (".att_syntax noprefix");
return;
}

View File

@@ -11,44 +11,17 @@
#include <wmmintrin.h>
#include "hash-groestl256.h"
static const __m128i round_const_l0[] __attribute__ ((aligned (64))) =
{
{ 0x7060504030201000, 0xffffffffffffffff },
{ 0x7161514131211101, 0xffffffffffffffff },
{ 0x7262524232221202, 0xffffffffffffffff },
{ 0x7363534333231303, 0xffffffffffffffff },
{ 0x7464544434241404, 0xffffffffffffffff },
{ 0x7565554535251505, 0xffffffffffffffff },
{ 0x7666564636261606, 0xffffffffffffffff },
{ 0x7767574737271707, 0xffffffffffffffff },
{ 0x7868584838281808, 0xffffffffffffffff },
{ 0x7969594939291909, 0xffffffffffffffff }
};
/* global constants */
__m128i ROUND_CONST_Lx;
__m128i ROUND_CONST_L0[ROUNDS512];
__m128i ROUND_CONST_L7[ROUNDS512];
//__m128i ROUND_CONST_P[ROUNDS1024];
//__m128i ROUND_CONST_Q[ROUNDS1024];
__m128i TRANSP_MASK;
__m128i SUBSH_MASK[8];
__m128i ALL_1B;
__m128i ALL_FF;
static const __m128i round_const_l7[] __attribute__ ((aligned (64))) =
{
{ 0x0000000000000000, 0x8f9fafbfcfdfefff },
{ 0x0000000000000000, 0x8e9eaebecedeeefe },
{ 0x0000000000000000, 0x8d9dadbdcdddedfd },
{ 0x0000000000000000, 0x8c9cacbcccdcecfc },
{ 0x0000000000000000, 0x8b9babbbcbdbebfb },
{ 0x0000000000000000, 0x8a9aaabacadaeafa },
{ 0x0000000000000000, 0x8999a9b9c9d9e9f9 },
{ 0x0000000000000000, 0x8898a8b8c8d8e8f8 },
{ 0x0000000000000000, 0x8797a7b7c7d7e7f7 },
{ 0x0000000000000000, 0x8696a6b6c6d6e6f6 }
};
static const __m128i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02 };
static const __m128i SUBSH_MASK0 = { 0x0c0f0104070b0e00, 0x03060a0d08020509 };
static const __m128i SUBSH_MASK1 = { 0x0e090205000d0801, 0x04070c0f0a03060b };
static const __m128i SUBSH_MASK2 = { 0x080b0306010f0a02, 0x05000e090c04070d };
static const __m128i SUBSH_MASK3 = { 0x0a0d040702090c03, 0x0601080b0e05000f };
static const __m128i SUBSH_MASK4 = { 0x0b0e0500030a0d04, 0x0702090c0f060108 };
static const __m128i SUBSH_MASK5 = { 0x0d080601040c0f05, 0x00030b0e0907020a };
static const __m128i SUBSH_MASK6 = { 0x0f0a0702050e0906, 0x01040d080b00030c };
static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
#define tos(a) #a
#define tostr(a) tos(a)
@@ -65,6 +38,8 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
i = _mm_xor_si128(i, j);\
}
/**/
/* Yet another implementation of MixBytes.
This time we use the formulae (3) from the paper "Byte Slicing Groestl".
Input: a0, ..., a7
@@ -138,7 +113,7 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
\
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
/* compute w_i : add y_{i+4} */\
b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
b1 = ALL_1B;\
MUL2(a0, b0, b1);\
a0 = _mm_xor_si128(a0, TEMP0);\
MUL2(a1, b0, b1);\
@@ -178,6 +153,25 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
b1 = _mm_xor_si128(b1, a4);\
}/*MixBytes*/
#define SET_CONSTANTS() do{\
ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\
SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\
SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\
SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\
SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\
SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\
SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\
SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\
for(i = 0; i < ROUNDS512; i++)\
{\
ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\
}\
ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\
}while(0);
/* one round
* i = round number
* a0-a7 = input rows
@@ -185,34 +179,34 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
*/
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* AddRoundConstant */\
b1 = m128_const_64( 0xffffffffffffffff, 0 ); \
a0 = _mm_xor_si128( a0, casti_m128i( round_const_l0, i ) ); \
a1 = _mm_xor_si128( a1, b1 ); \
a2 = _mm_xor_si128( a2, b1 ); \
a3 = _mm_xor_si128( a3, b1 ); \
a4 = _mm_xor_si128( a4, b1 ); \
a5 = _mm_xor_si128( a5, b1 ); \
a6 = _mm_xor_si128( a6, b1 ); \
a7 = _mm_xor_si128( a7, casti_m128i( round_const_l7, i ) ); \
b1 = ROUND_CONST_Lx;\
a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\
a1 = _mm_xor_si128(a1, b1);\
a2 = _mm_xor_si128(a2, b1);\
a3 = _mm_xor_si128(a3, b1);\
a4 = _mm_xor_si128(a4, b1);\
a5 = _mm_xor_si128(a5, b1);\
a6 = _mm_xor_si128(a6, b1);\
a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\
\
/* ShiftBytes + SubBytes (interleaved) */\
b0 = _mm_xor_si128(b0, b0);\
a0 = _mm_shuffle_epi8( a0, SUBSH_MASK0 ); \
a0 = _mm_aesenclast_si128( a0, b0 );\
a1 = _mm_shuffle_epi8( a1, SUBSH_MASK1 ); \
a1 = _mm_aesenclast_si128( a1, b0 );\
a2 = _mm_shuffle_epi8( a2, SUBSH_MASK2 ); \
a2 = _mm_aesenclast_si128( a2, b0 );\
a3 = _mm_shuffle_epi8( a3, SUBSH_MASK3 ); \
a3 = _mm_aesenclast_si128( a3, b0 );\
a4 = _mm_shuffle_epi8( a4, SUBSH_MASK4 ); \
a4 = _mm_aesenclast_si128( a4, b0 );\
a5 = _mm_shuffle_epi8( a5, SUBSH_MASK5 ); \
a5 = _mm_aesenclast_si128( a5, b0 );\
a6 = _mm_shuffle_epi8( a6, SUBSH_MASK6 ); \
a6 = _mm_aesenclast_si128( a6, b0 );\
a7 = _mm_shuffle_epi8( a7, SUBSH_MASK7 ); \
a7 = _mm_aesenclast_si128( a7, b0 );\
a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\
a0 = _mm_aesenclast_si128(a0, b0);\
a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\
a1 = _mm_aesenclast_si128(a1, b0);\
a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\
a2 = _mm_aesenclast_si128(a2, b0);\
a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\
a3 = _mm_aesenclast_si128(a3, b0);\
a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\
a4 = _mm_aesenclast_si128(a4, b0);\
a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\
a5 = _mm_aesenclast_si128(a5, b0);\
a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\
a6 = _mm_aesenclast_si128(a6, b0);\
a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\
a7 = _mm_aesenclast_si128(a7, b0);\
\
/* MixBytes */\
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
@@ -240,9 +234,8 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
* outputs: i0, o1-o3
* clobbers: t0
*/
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
t0 = TRANSP_MASK; \
t0 = TRANSP_MASK;\
\
i0 = _mm_shuffle_epi8(i0, t0);\
i1 = _mm_shuffle_epi8(i1, t0);\

View File

@@ -0,0 +1,482 @@
/* groestl-intr-avx.h Aug 2011
*
* Groestl implementation with intrinsics using ssse3, sse4.1, aes and avx
* instructions.
* Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
*
* This code is placed in the public domain
*/
#include <smmintrin.h>
#include <wmmintrin.h>
#include <immintrin.h>
#include "hash-groestl256.h"
/* global constants */
__m128i ROUND_CONST_Lx;
__m128i ROUND_CONST_L0[ROUNDS512];
__m128i ROUND_CONST_L7[ROUNDS512];
__m128i ROUND_CONST_P[ROUNDS1024];
__m128i ROUND_CONST_Q[ROUNDS1024];
__m128i TRANSP_MASK;
__m128i SUBSH_MASK[8];
__m128i ALL_FF;
//#if LENGTH <= 256
__m128i ALL_1B;
//#else
//__m256d ALL_1B;
//#endif
#define tos(a) #a
#define tostr(a) tos(a)
#define insert_m128i_in_m256d(ymm, xmm, pos) (_mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castpd_si256(ymm), xmm, pos)))
#define extract_m128i_from_m256d(ymm, pos) (_mm256_extractf128_si256(_mm256_castpd_si256(ymm), pos))
#define SET_CONSTANTS(){\
ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\
TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\
SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\
SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\
SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\
SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\
SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\
SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\
SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\
for(i = 0; i < ROUNDS512; i++)\
{\
ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\
}\
ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\
}while(0);
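The loop above packs the standard Groestl round constants: in row 0 each byte is its column index times 0x10 XORed with the round number, the row-7 constants are the bitwise complements of those bytes, and the all-0xff halves supply the Q permutation's constants. A minimal host-side check sketch, assumed standalone and not part of the build, that reproduces the packed 32-bit words for inspection:
#include <stdio.h>
#include <stdint.h>
/* Check sketch: print the words packed into ROUND_CONST_L0/L7
 * for the ROUNDS512 = 10 rounds. */
int main(void)
{
   for ( int i = 0; i < 10; i++ )
   {
      uint32_t l0_lo = 0x30201000u ^ (uint32_t)( i * 0x01010101 );
      uint32_t l0_hi = 0x70605040u ^ (uint32_t)( i * 0x01010101 );
      uint32_t l7_lo = 0xcfdfefffu ^ (uint32_t)( i * 0x01010101 );
      uint32_t l7_hi = 0x8f9fafbfu ^ (uint32_t)( i * 0x01010101 );
      printf( "round %d: L0 = %08x%08x  L7 = %08x%08x\n",
              i, l0_hi, l0_lo, l7_hi, l7_lo );
   }
   return 0;
}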
/* xmm[i] will be multiplied by 2
* xmm[j] will be lost
* xmm[k] has to be all 0x1b
* xmm[z] has to be zero */
#define VMUL2(i, j, k, z){\
j = _mm_cmpgt_epi8(z, i);\
i = _mm_add_epi8(i, i);\
j = _mm_and_si128(j, k);\
i = _mm_xor_si128(i, j);\
}/**/
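VMUL2 is the vectorized AES-field doubling ("xtime"): each byte is shifted left by one and, wherever the top bit was set, reduced by XORing in the polynomial byte 0x1b. The signed compare against zero yields 0xff in exactly the bytes whose sign bit is set, so ANDing with the all-0x1b register selects the reduction. A scalar sketch of the same operation on one byte:
#include <stdint.h>
/* Scalar equivalent of VMUL2: multiply one byte by 2 in GF(2^8)
 * modulo the AES polynomial x^8 + x^4 + x^3 + x + 1 (0x11b).
 * The conditional 0x1b mirrors the cmpgt/and/xor of the macro. */
static inline uint8_t xtime( uint8_t x )
{
   return (uint8_t)( ( x << 1 ) ^ ( ( x & 0x80 ) ? 0x1b : 0x00 ) );
}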
/* Yet another implementation of MixBytes.
This time we use the formulae (3) from the paper "Byte Slicing Groestl".
Input: a0, ..., a7
Output: b0, ..., b7 = MixBytes(a0,...,a7).
but we use the relations:
t_i = a_i + a_{i+1}
x_i = t_i + t_{i+3}
y_i = t_i + t_{i+2} + a_{i+6}
z_i = 2*x_i
w_i = z_i + y_{i+4}
v_i = 2*w_i
b_i = v_{i+3} + y_{i+4}
We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
and then adding v_i computed in the meantime in registers xmm0..xmm7.
We almost fit into 16 registers, need only 3 spills to memory.
This implementation costs 7.7 c/b, giving a total speed on SNB of 10.7 c/b.
K. Matusiewicz, 2011/05/29 */
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* xmm8..xmm15 = a2 a3 ... a0 a1 */\
b0 = a2;\
b1 = a3;\
b2 = a4;\
b3 = a5;\
b4 = a6;\
b5 = a7;\
b6 = a0;\
b7 = a1;\
\
/* t_i = a_i + a_{i+1} */\
a0 = _mm_xor_si128(a0, a1);\
a1 = _mm_xor_si128(a1, a2);\
a2 = _mm_xor_si128(a2, a3);\
a3 = _mm_xor_si128(a3, a4);\
a4 = _mm_xor_si128(a4, a5);\
a5 = _mm_xor_si128(a5, a6);\
a6 = _mm_xor_si128(a6, a7);\
a7 = _mm_xor_si128(a7, b6);\
\
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
b0 = _mm_xor_si128(b0, a4);\
b1 = _mm_xor_si128(b1, a5);\
b2 = _mm_xor_si128(b2, a6);\
b3 = _mm_xor_si128(b3, a7);\
b4 = _mm_xor_si128(b4, a0);\
b5 = _mm_xor_si128(b5, a1);\
b6 = _mm_xor_si128(b6, a2);\
b7 = _mm_xor_si128(b7, a3);\
\
b0 = _mm_xor_si128(b0, a6);\
b1 = _mm_xor_si128(b1, a7);\
b2 = _mm_xor_si128(b2, a0);\
b3 = _mm_xor_si128(b3, a1);\
b4 = _mm_xor_si128(b4, a2);\
b5 = _mm_xor_si128(b5, a3);\
b6 = _mm_xor_si128(b6, a4);\
b7 = _mm_xor_si128(b7, a5);\
\
/* spill values y_4, y_5 to memory */\
TEMP0 = b0;\
TEMP1 = b1;\
TEMP2 = b2;\
\
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
b0 = a0;\
b1 = a1;\
TEMP3 = a2;\
\
/* compute x_i = t_i + t_{i+3} */\
a0 = _mm_xor_si128(a0, a3);\
a1 = _mm_xor_si128(a1, a4);\
a2 = _mm_xor_si128(a2, a5);\
a3 = _mm_xor_si128(a3, a6);\
a4 = _mm_xor_si128(a4, a7);\
a5 = _mm_xor_si128(a5, b0);\
a6 = _mm_xor_si128(a6, b1);\
a7 = _mm_xor_si128(a7, TEMP3);\
\
/*compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
b1 = ALL_1B;\
b2 = _mm_xor_si128(b2, b2);\
VMUL2(a7, b0, b1, b2);\
VMUL2(a6, b0, b1, b2);\
VMUL2(a5, b0, b1, b2);\
VMUL2(a4, b0, b1, b2);\
VMUL2(a3, b0, b1, b2);\
VMUL2(a2, b0, b1, b2);\
VMUL2(a1, b0, b1, b2);\
VMUL2(a0, b0, b1, b2);\
\
/* compute w_i : add y_{i+4} */\
a0 = _mm_xor_si128(a0, TEMP0);\
a1 = _mm_xor_si128(a1, TEMP1);\
a2 = _mm_xor_si128(a2, TEMP2);\
a3 = _mm_xor_si128(a3, b3);\
a4 = _mm_xor_si128(a4, b4);\
a5 = _mm_xor_si128(a5, b5);\
a6 = _mm_xor_si128(a6, b6);\
a7 = _mm_xor_si128(a7, b7);\
\
/*compute v_i: double w_i */\
VMUL2(a0, b0, b1, b2);\
VMUL2(a1, b0, b1, b2);\
VMUL2(a2, b0, b1, b2);\
VMUL2(a3, b0, b1, b2);\
VMUL2(a4, b0, b1, b2);\
VMUL2(a5, b0, b1, b2);\
VMUL2(a6, b0, b1, b2);\
VMUL2(a7, b0, b1, b2);\
\
/* add to y_4 y_5 .. v3, v4, ... */\
b0 = _mm_xor_si128(a3, TEMP0);\
b1 = _mm_xor_si128(a4, TEMP1);\
b2 = _mm_xor_si128(a5, TEMP2);\
b3 = _mm_xor_si128(b3, a6);\
b4 = _mm_xor_si128(b4, a7);\
b5 = _mm_xor_si128(b5, a0);\
b6 = _mm_xor_si128(b6, a1);\
b7 = _mm_xor_si128(b7, a2);\
}/*MixBytes*/
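The register scheduling above obscures the arithmetic, so here is a plain scalar rendering of the same relations for one 8-byte column; a hedged reference sketch for reading along, not the shipped code path:
#include <stdint.h>
static inline uint8_t xtime( uint8_t x )   /* GF(2^8) doubling mod 0x11b */
{
   return (uint8_t)( ( x << 1 ) ^ ( ( x & 0x80 ) ? 0x1b : 0x00 ) );
}
/* Reference MixBytes for one column, following the byte-slicing
 * relations quoted above; all indices are mod 8. */
static void mixbytes_column( const uint8_t a[8], uint8_t b[8] )
{
   uint8_t t[8], x[8], y[8], w[8];
   for ( int i = 0; i < 8; i++ ) t[i] = a[i] ^ a[ (i+1) % 8 ];
   for ( int i = 0; i < 8; i++ ) x[i] = t[i] ^ t[ (i+3) % 8 ];
   for ( int i = 0; i < 8; i++ )
      y[i] = t[i] ^ t[ (i+2) % 8 ] ^ a[ (i+6) % 8 ];
   for ( int i = 0; i < 8; i++ )
      w[i] = xtime( x[i] ) ^ y[ (i+4) % 8 ];            /* w_i = 2*x_i + y_{i+4}   */
   for ( int i = 0; i < 8; i++ )
      b[i] = xtime( w[ (i+3) % 8 ] ) ^ y[ (i+4) % 8 ];  /* b_i = v_{i+3} + y_{i+4} */
}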
/* one round
* i = round number
* a0-a7 = input rows
* b0-b7 = output rows
*/
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* Add Round Constant */\
b1 = ROUND_CONST_Lx;\
a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\
a1 = _mm_xor_si128(a1, b1);\
a2 = _mm_xor_si128(a2, b1);\
a3 = _mm_xor_si128(a3, b1);\
a4 = _mm_xor_si128(a4, b1);\
a5 = _mm_xor_si128(a5, b1);\
a6 = _mm_xor_si128(a6, b1);\
a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\
\
/* ShiftBytes + SubBytes (interleaved) */\
b0 = _mm_xor_si128(b0, b0);\
a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\
a0 = _mm_aesenclast_si128(a0, b0);\
a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\
a1 = _mm_aesenclast_si128(a1, b0);\
a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\
a2 = _mm_aesenclast_si128(a2, b0);\
a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\
a3 = _mm_aesenclast_si128(a3, b0);\
a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\
a4 = _mm_aesenclast_si128(a4, b0);\
a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\
a5 = _mm_aesenclast_si128(a5, b0);\
a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\
a6 = _mm_aesenclast_si128(a6, b0);\
a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\
a7 = _mm_aesenclast_si128(a7, b0);\
\
/* MixBytes */\
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
}
/* 10 rounds, P and Q in parallel */
#define ROUNDS_P_Q(){\
ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
}
/* Matrix Transpose Step 1
* input is a 512-bit state with two columns in one xmm
* output is a 512-bit state with two rows in one xmm
* inputs: i0-i3
* outputs: i0, o1-o3
* clobbers: t0
*/
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
t0 = TRANSP_MASK;\
\
i0 = _mm_shuffle_epi8(i0, t0);\
i1 = _mm_shuffle_epi8(i1, t0);\
i2 = _mm_shuffle_epi8(i2, t0);\
i3 = _mm_shuffle_epi8(i3, t0);\
\
o1 = _mm_unpackhi_epi16(i0, i1);\
i0 = _mm_unpacklo_epi16(i0, i1);\
t0 = _mm_unpackhi_epi16(i2, i3);\
i2 = _mm_unpacklo_epi16(i2, i3);\
\
i0 = _mm_shuffle_epi32(i0, 216);\
o1 = _mm_shuffle_epi32(o1, 216);\
i2 = _mm_shuffle_epi32(i2, 216);\
t0 = _mm_shuffle_epi32(t0, 216);\
\
o2 = _mm_unpackhi_epi32(i0, i2);\
o3 = _mm_unpackhi_epi32(o1, t0);\
i0 = _mm_unpacklo_epi32(i0, i2);\
o1 = _mm_unpacklo_epi32(o1, t0);\
}/**/
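Conceptually the transpose steps just re-index the 8x8 byte state between the hash's natural column-major order and the row-major order the sliced rounds operate on. For intuition only, the scalar equivalent (the macros achieve the same permutation with pshufb and unpacks):
#include <stdint.h>
/* Intuition sketch: reorder a 64-byte Groestl state from
 * column-major to row-major. */
static void transpose_state( const uint8_t in[64], uint8_t out[64] )
{
   for ( int row = 0; row < 8; row++ )
      for ( int col = 0; col < 8; col++ )
         out[ row * 8 + col ] = in[ col * 8 + row ];
}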
/* Matrix Transpose Step 2
* input are two 512-bit states with two rows in one xmm
* output are two 512-bit states with one row of each state in one xmm
* inputs: i0-i3 = P, i4-i7 = Q
* outputs: (i0, o1-o7) = (P|Q)
* possible reassignments: (output reg = input reg)
* * i1 -> o3-7
* * i2 -> o5-7
* * i3 -> o7
* * i4 -> o3-7
* * i5 -> o6-7
*/
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
o1 = _mm_unpackhi_epi64(i0, i4);\
i0 = _mm_unpacklo_epi64(i0, i4);\
o2 = _mm_unpacklo_epi64(i1, i5);\
o3 = _mm_unpackhi_epi64(i1, i5);\
o4 = _mm_unpacklo_epi64(i2, i6);\
o5 = _mm_unpackhi_epi64(i2, i6);\
o6 = _mm_unpacklo_epi64(i3, i7);\
o7 = _mm_unpackhi_epi64(i3, i7);\
}/**/
/* Matrix Transpose Inverse Step 2
* input are two 512-bit states with one row of each state in one xmm
* output are two 512-bit states with two rows in one xmm
* inputs: i0-i7 = (P|Q)
* outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
*/
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
o0 = _mm_unpackhi_epi64(i0, i1);\
i0 = _mm_unpacklo_epi64(i0, i1);\
o1 = _mm_unpackhi_epi64(i2, i3);\
i2 = _mm_unpacklo_epi64(i2, i3);\
o2 = _mm_unpackhi_epi64(i4, i5);\
i4 = _mm_unpacklo_epi64(i4, i5);\
o3 = _mm_unpackhi_epi64(i6, i7);\
i6 = _mm_unpacklo_epi64(i6, i7);\
}/**/
/* Matrix Transpose Output Step 2
* input is one 512-bit state with two rows in one xmm
* output is one 512-bit state with one row in the low 64-bits of one xmm
* inputs: i0,i2,i4,i6 = S
* outputs: (i0-7) = (0|S)
*/
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
t0 = _mm_xor_si128(t0, t0);\
i1 = _mm_unpackhi_epi64(i0, t0);\
i0 = _mm_unpacklo_epi64(i0, t0);\
i3 = _mm_unpackhi_epi64(i2, t0);\
i2 = _mm_unpacklo_epi64(i2, t0);\
i5 = _mm_unpackhi_epi64(i4, t0);\
i4 = _mm_unpacklo_epi64(i4, t0);\
i7 = _mm_unpackhi_epi64(i6, t0);\
i6 = _mm_unpacklo_epi64(i6, t0);\
}/**/
/* Matrix Transpose Output Inverse Step 2
* input is one 512-bit state with one row in the low 64-bits of one xmm
* output is one 512-bit state with two rows in one xmm
* inputs: i0-i7 = (0|S)
* outputs: (i0, i2, i4, i6) = S
*/
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
i0 = _mm_unpacklo_epi64(i0, i1);\
i2 = _mm_unpacklo_epi64(i2, i3);\
i4 = _mm_unpacklo_epi64(i4, i5);\
i6 = _mm_unpacklo_epi64(i6, i7);\
}/**/
void INIT256(u64* h)
{
__m128i* const chaining = (__m128i*) h;
static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7;
static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15;
/* load IV into registers xmm12 - xmm15 */
xmm12 = chaining[0];
xmm13 = chaining[1];
xmm14 = chaining[2];
xmm15 = chaining[3];
/* transform chaining value from column ordering into row ordering */
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
/* store transposed IV */
chaining[0] = xmm12;
chaining[1] = xmm2;
chaining[2] = xmm6;
chaining[3] = xmm7;
}
void TF512(u64* h, u64* m)
{
__m128i* const chaining = (__m128i*) h;
__m128i* const message = (__m128i*) m;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i TEMP0;
static __m128i TEMP1;
static __m128i TEMP2;
static __m128i TEMP3;
#ifdef IACA_TRACE
IACA_START;
#endif
/* load message into registers xmm12 - xmm15 */
xmm12 = message[0];
xmm13 = message[1];
xmm14 = message[2];
xmm15 = message[3];
/* transform message M from column ordering into row ordering */
/* we first put two rows (64 bit) of the message into one 128-bit xmm register */
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
/* load previous chaining value and xor message to CV to get input of P */
/* we first put two rows (2x64 bit) of the CV into one 128-bit xmm register */
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
xmm8 = _mm_xor_si128(xmm12, chaining[0]);
xmm0 = _mm_xor_si128(xmm2, chaining[1]);
xmm4 = _mm_xor_si128(xmm6, chaining[2]);
xmm5 = _mm_xor_si128(xmm7, chaining[3]);
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
/* result: the 8 rows of P and Q in xmm8 - xmm12 */
Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
/* compute the two permutations P and Q in parallel */
ROUNDS_P_Q();
/* unpack again to get two rows of P or two rows of Q in one xmm register */
Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3);
/* xor output of P and Q */
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
xmm0 = _mm_xor_si128(xmm0, xmm8);
xmm1 = _mm_xor_si128(xmm1, xmm10);
xmm2 = _mm_xor_si128(xmm2, xmm12);
xmm3 = _mm_xor_si128(xmm3, xmm14);
/* xor CV (feed-forward) */
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
xmm0 = _mm_xor_si128(xmm0, chaining[0]);
xmm1 = _mm_xor_si128(xmm1, chaining[1]);
xmm2 = _mm_xor_si128(xmm2, chaining[2]);
xmm3 = _mm_xor_si128(xmm3, chaining[3]);
/* store CV */
chaining[0] = xmm0;
chaining[1] = xmm1;
chaining[2] = xmm2;
chaining[3] = xmm3;
#ifdef IACA_TRACE
IACA_END;
#endif
return;
}
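Stripped of the layout changes, TF512 computes the Groestl compression function f(h, m) = P(h xor m) xor Q(m) xor h. A structural sketch over hypothetical permP/permQ helpers, shown only to make the data flow explicit:
#include <stdint.h>
/* permP/permQ stand in for the 10-round P and Q permutations;
 * they are assumed helpers, not functions from this codebase. */
void permP( const uint8_t in[64], uint8_t out[64] );
void permQ( const uint8_t in[64], uint8_t out[64] );
static void compress512( uint8_t h[64], const uint8_t m[64] )
{
   uint8_t p_in[64], p_out[64], q_out[64];
   for ( int i = 0; i < 64; i++ ) p_in[i] = h[i] ^ m[i];
   permP( p_in, p_out );                     /* P( CV ^ M ) */
   permQ( m, q_out );                        /* Q( M )      */
   for ( int i = 0; i < 64; i++ )
      h[i] ^= p_out[i] ^ q_out[i];           /* feed-forward */
}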
void OF512(u64* h)
{
__m128i* const chaining = (__m128i*) h;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i TEMP0;
static __m128i TEMP1;
static __m128i TEMP2;
static __m128i TEMP3;
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
xmm8 = chaining[0];
xmm10 = chaining[1];
xmm12 = chaining[2];
xmm14 = chaining[3];
/* there are now 2 rows of the CV in one xmm register */
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
/* result: the 8 input rows of P in xmm8 - xmm15 */
Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0);
/* compute the permutation P */
/* result: the output of P(CV) in xmm8 - xmm15 */
ROUNDS_P_Q();
/* unpack again to get two rows of P in one xmm register */
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
/* xor CV to P output (feed-forward) */
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
xmm10 = _mm_xor_si128(xmm10, (chaining[1]));
xmm12 = _mm_xor_si128(xmm12, (chaining[2]));
xmm14 = _mm_xor_si128(xmm14, (chaining[3]));
/* transform state back from row ordering into column ordering */
/* result: final hash value in xmm9, xmm11 */
Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0);
/* we only need to return the truncated half of the state */
chaining[2] = xmm9;
chaining[3] = xmm11;
}

View File

@@ -0,0 +1,793 @@
/* groestl-intr-vperm.h Aug 2011
*
* Groestl implementation with intrinsics using ssse3 instructions.
* Author: Günther A. Roland, Martin Schläffer
*
* Based on the vperm and aes_ni implementations of the hash function Groestl
* by Cagdas Calik <ccalik@metu.edu.tr> http://www.metu.edu.tr/~ccalik/
* Institute of Applied Mathematics, Middle East Technical University, Turkey
*
* This code is placed in the public domain
*/
#include <tmmintrin.h>
#include "hash-groestl256.h"
/* global constants */
__m128i ROUND_CONST_Lx;
__m128i ROUND_CONST_L0[ROUNDS512];
__m128i ROUND_CONST_L7[ROUNDS512];
__m128i ROUND_CONST_P[ROUNDS1024];
__m128i ROUND_CONST_Q[ROUNDS1024];
__m128i TRANSP_MASK;
__m128i SUBSH_MASK[8];
__m128i ALL_0F;
__m128i ALL_15;
__m128i ALL_1B;
__m128i ALL_63;
__m128i ALL_FF;
__m128i VPERM_IPT[2];
__m128i VPERM_OPT[2];
__m128i VPERM_INV[2];
__m128i VPERM_SB1[2];
__m128i VPERM_SB2[2];
__m128i VPERM_SB4[2];
__m128i VPERM_SBO[2];
#define tos(a) #a
#define tostr(a) tos(a)
#define SET_SHARED_CONSTANTS(){\
TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
ALL_63 = _mm_set_epi32(0x63636363, 0x63636363, 0x63636363, 0x63636363);\
ALL_0F = _mm_set_epi32(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f);\
ALL_15 = _mm_set_epi32(0x15151515, 0x15151515, 0x15151515, 0x15151515);\
VPERM_IPT[0] = _mm_set_epi32(0xCD80B1FC, 0xB0FDCC81, 0x4C01307D, 0x317C4D00);\
VPERM_IPT[1] = _mm_set_epi32(0xCABAE090, 0x52227808, 0xC2B2E898, 0x5A2A7000);\
VPERM_OPT[0] = _mm_set_epi32(0xE10D5DB1, 0xB05C0CE0, 0x01EDBD51, 0x50BCEC00);\
VPERM_OPT[1] = _mm_set_epi32(0xF7974121, 0xDEBE6808, 0xFF9F4929, 0xD6B66000);\
VPERM_INV[0] = _mm_set_epi32(0x030D0E0C, 0x02050809, 0x01040A06, 0x0F0B0780);\
VPERM_INV[1] = _mm_set_epi32(0x04070309, 0x0A0B0C02, 0x0E05060F, 0x0D080180);\
VPERM_SB1[0] = _mm_set_epi32(0x3BF7CCC1, 0x0D2ED9EF, 0x3618D415, 0xFAE22300);\
VPERM_SB1[1] = _mm_set_epi32(0xA5DF7A6E, 0x142AF544, 0xB19BE18F, 0xCB503E00);\
VPERM_SB2[0] = _mm_set_epi32(0xC2A163C8, 0xAB82234A, 0x69EB8840, 0x0AE12900);\
VPERM_SB2[1] = _mm_set_epi32(0x5EB7E955, 0xBC982FCD, 0xE27A93C6, 0x0B712400);\
VPERM_SB4[0] = _mm_set_epi32(0xBA44FE79, 0x876D2914, 0x3D50AED7, 0xC393EA00);\
VPERM_SB4[1] = _mm_set_epi32(0xA876DE97, 0x49087E9F, 0xE1E937A0, 0x3FD64100);\
}/**/
/* VPERM
* Transform w/o settings c*
* transforms 2 rows to/from "vperm mode"
* this function is derived from:
* vperm and aes_ni implementations of hash function Groestl
* by Cagdas CALIK
* inputs:
* a0, a1 = 2 rows
* table = transformation table to use
* t*, c* = clobbers
* outputs:
* a0, a1 = 2 rows transformed with table
* */
#define VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2){\
t0 = c0;\
t1 = c0;\
t0 = _mm_andnot_si128(t0, a0);\
t1 = _mm_andnot_si128(t1, a1);\
t0 = _mm_srli_epi32(t0, 4);\
t1 = _mm_srli_epi32(t1, 4);\
a0 = _mm_and_si128(a0, c0);\
a1 = _mm_and_si128(a1, c0);\
t2 = c2;\
t3 = c2;\
t2 = _mm_shuffle_epi8(t2, a0);\
t3 = _mm_shuffle_epi8(t3, a1);\
a0 = c1;\
a1 = c1;\
a0 = _mm_shuffle_epi8(a0, t0);\
a1 = _mm_shuffle_epi8(a1, t1);\
a0 = _mm_xor_si128(a0, t2);\
a1 = _mm_xor_si128(a1, t3);\
}/**/
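The trick throughout this file is that pshufb doubles as a 16-entry byte lookup: split each byte into nibbles, look the high nibble up in one table and the low nibble up in another, and XOR the results. A scalar model of one such transform, with placeholder tables standing in for the VPERM_* constants:
#include <stdint.h>
/* Scalar model of the transform: hi_tbl plays the role of c1
 * (indexed by the high nibble) and lo_tbl the role of c2
 * (indexed by the low nibble); both are assumed placeholders. */
static uint8_t vperm_byte( uint8_t x, const uint8_t hi_tbl[16],
                           const uint8_t lo_tbl[16] )
{
   return hi_tbl[ x >> 4 ] ^ lo_tbl[ x & 0x0f ];
}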
#define VPERM_Transform_Set_Const(table, c0, c1, c2){\
c0 = ALL_0F;\
c1 = ((__m128i*) table )[0];\
c2 = ((__m128i*) table )[1];\
}/**/
/* VPERM
* Transform
* transforms 2 rows to/from "vperm mode"
* this function is derived from:
* vperm and aes_ni implementations of hash function Groestl
* by Cagdas CALIK
* inputs:
* a0, a1 = 2 rows
* table = transformation table to use
* t*, c* = clobbers
* outputs:
* a0, a1 = 2 rows transformed with table
* */
#define VPERM_Transform(a0, a1, table, t0, t1, t2, t3, c0, c1, c2){\
VPERM_Transform_Set_Const(table, c0, c1, c2);\
VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\
}/**/
/* VPERM
* Transform State
* inputs:
* a0-a3 = state
* table = transformation table to use
* t* = clobbers
* outputs:
* a0-a3 = transformed state
* */
#define VPERM_Transform_State(a0, a1, a2, a3, table, t0, t1, t2, t3, c0, c1, c2){\
VPERM_Transform_Set_Const(table, c0, c1, c2);\
VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\
VPERM_Transform_No_Const(a2, a3, t0, t1, t2, t3, c0, c1, c2);\
}/**/
/* VPERM
* Add Constant to State
* inputs:
* a0-a7 = state
* constant = constant to add
* t0 = clobber
* outputs:
* a0-a7 = state + constant
* */
#define VPERM_Add_Constant(a0, a1, a2, a3, a4, a5, a6, a7, constant, t0){\
t0 = constant;\
a0 = _mm_xor_si128(a0, t0);\
a1 = _mm_xor_si128(a1, t0);\
a2 = _mm_xor_si128(a2, t0);\
a3 = _mm_xor_si128(a3, t0);\
a4 = _mm_xor_si128(a4, t0);\
a5 = _mm_xor_si128(a5, t0);\
a6 = _mm_xor_si128(a6, t0);\
a7 = _mm_xor_si128(a7, t0);\
}/**/
/* VPERM
* Set Substitute Core Constants
* */
#define VPERM_Substitute_Core_Set_Const(c0, c1, c2){\
VPERM_Transform_Set_Const(VPERM_INV, c0, c1, c2);\
}/**/
/* VPERM
* Substitute Core
* first part of sbox inverse computation
* this function is derived from:
* vperm and aes_ni implementations of hash function Groestl
* by Cagdas CALIK
* inputs:
* a0 = 1 row
* t*, c* = clobbers
* outputs:
* b0a, b0b = inputs for lookup step
* */
#define VPERM_Substitute_Core(a0, b0a, b0b, t0, t1, c0, c1, c2){\
t0 = c0;\
t0 = _mm_andnot_si128(t0, a0);\
t0 = _mm_srli_epi32(t0, 4);\
a0 = _mm_and_si128(a0, c0);\
b0a = c1;\
b0a = _mm_shuffle_epi8(b0a, a0);\
a0 = _mm_xor_si128(a0, t0);\
b0b = c2;\
b0b = _mm_shuffle_epi8(b0b, t0);\
b0b = _mm_xor_si128(b0b, b0a);\
t1 = c2;\
t1 = _mm_shuffle_epi8(t1, a0);\
t1 = _mm_xor_si128(t1, b0a);\
b0a = c2;\
b0a = _mm_shuffle_epi8(b0a, b0b);\
b0a = _mm_xor_si128(b0a, a0);\
b0b = c2;\
b0b = _mm_shuffle_epi8(b0b, t1);\
b0b = _mm_xor_si128(b0b, t0);\
}/**/
/* VPERM
* Lookup
* second part of sbox inverse computation
* this function is derived from:
* vperm and aes_ni implementations of hash function Groestl
* by Cagdas CALIK
* inputs:
* a0a, a0b = output of Substitution Core
* table = lookup table to use (*1 / *2 / *4)
* t0 = clobber
* outputs:
* b0 = output of sbox + multiplication
* */
#define VPERM_Lookup(a0a, a0b, table, b0, t0){\
b0 = ((__m128i*) table )[0];\
t0 = ((__m128i*) table )[1];\
b0 = _mm_shuffle_epi8(b0, a0b);\
t0 = _mm_shuffle_epi8(t0, a0a);\
b0 = _mm_xor_si128(b0, t0);\
}/**/
/* VPERM
* SubBytes and *2 / *4
* this function is derived from:
* Constant-time SSSE3 AES core implementation
* by Mike Hamburg
* and
* vperm and aes_ni implementations of hash function Groestl
* by Cagdas CALIK
* inputs:
* a0-a7 = state
* t*, c* = clobbers
* outputs:
* a0-a7 = state * 4
* c2 = row0 * 2 -> b0
* c1 = row7 * 2 -> b3
* c0 = row7 * 1 -> b4
* t2 = row4 * 1 -> b7
* TEMP_MUL1 = row(i) * 1
* TEMP_MUL2 = row(i) * 2
*
* call:VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7) */
#define VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, t0, t1, t3, t4, c2, c1, c0, t2){\
/* set Constants */\
VPERM_Substitute_Core_Set_Const(c0, c1, c2);\
/* row 1 */\
VPERM_Substitute_Core(a1, t0, t1, t3, t4, c0, c1, c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
TEMP_MUL1[1] = t2;\
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
TEMP_MUL2[1] = t3;\
VPERM_Lookup(t0, t1, VPERM_SB4, a1, t4);\
/* --- */\
/* row 2 */\
VPERM_Substitute_Core(a2, t0, t1, t3, t4, c0, c1, c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
TEMP_MUL1[2] = t2;\
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
TEMP_MUL2[2] = t3;\
VPERM_Lookup(t0, t1, VPERM_SB4, a2, t4);\
/* --- */\
/* row 3 */\
VPERM_Substitute_Core(a3, t0, t1, t3, t4, c0, c1, c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
TEMP_MUL1[3] = t2;\
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
TEMP_MUL2[3] = t3;\
VPERM_Lookup(t0, t1, VPERM_SB4, a3, t4);\
/* --- */\
/* row 5 */\
VPERM_Substitute_Core(a5, t0, t1, t3, t4, c0, c1, c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
TEMP_MUL1[5] = t2;\
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
TEMP_MUL2[5] = t3;\
VPERM_Lookup(t0, t1, VPERM_SB4, a5, t4);\
/* --- */\
/* row 6 */\
VPERM_Substitute_Core(a6, t0, t1, t3, t4, c0, c1, c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
TEMP_MUL1[6] = t2;\
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
TEMP_MUL2[6] = t3;\
VPERM_Lookup(t0, t1, VPERM_SB4, a6, t4);\
/* --- */\
/* row 7 */\
VPERM_Substitute_Core(a7, t0, t1, t3, t4, c0, c1, c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
TEMP_MUL1[7] = t2;\
VPERM_Lookup(t0, t1, VPERM_SB2, c1, t4); /*c1 -> b3*/\
VPERM_Lookup(t0, t1, VPERM_SB4, a7, t4);\
/* --- */\
/* row 4 */\
VPERM_Substitute_Core(a4, t0, t1, t3, t4, c0, (VPERM_INV[0]), c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4); /*t2 -> b7*/\
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
TEMP_MUL2[4] = t3;\
VPERM_Lookup(t0, t1, VPERM_SB4, a4, t4);\
/* --- */\
/* row 0 */\
VPERM_Substitute_Core(a0, t0, t1, t3, t4, c0, (VPERM_INV[0]), c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, c0, t4); /*c0 -> b4*/\
VPERM_Lookup(t0, t1, VPERM_SB2, c2, t4); /*c2 -> b0*/\
TEMP_MUL2[0] = c2;\
VPERM_Lookup(t0, t1, VPERM_SB4, a0, t4);\
/* --- */\
}/**/
/* Optimized MixBytes
* inputs:
* a0-a7 = (row0-row7) * 4
* b0 = row0 * 2
* b3 = row7 * 2
* b4 = row7 * 1
* b7 = row4 * 1
* all *1 and *2 values must also be in TEMP_MUL1, TEMP_MUL2
* output: b0-b7
* */
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* save one value */\
TEMP_MUL4 = a3;\
/* 1 */\
b1 = a0;\
b1 = _mm_xor_si128(b1, a5);\
b1 = _mm_xor_si128(b1, b4); /* -> helper! */\
b1 = _mm_xor_si128(b1, (TEMP_MUL2[3]));\
b2 = b1;\
\
/* 2 */\
b5 = a1;\
b5 = _mm_xor_si128(b5, a4);\
b5 = _mm_xor_si128(b5, b7); /* -> helper! */\
b5 = _mm_xor_si128(b5, b3); /* -> helper! */\
b6 = b5;\
\
/* 4 */\
b7 = _mm_xor_si128(b7, a6);\
/*b7 = _mm_xor_si128(b7, (TEMP_MUL1[4])); -> helper! */\
b7 = _mm_xor_si128(b7, (TEMP_MUL1[6]));\
b7 = _mm_xor_si128(b7, (TEMP_MUL2[1]));\
b7 = _mm_xor_si128(b7, b3); /* -> helper! */\
b2 = _mm_xor_si128(b2, b7);\
\
/* 3 */\
b0 = _mm_xor_si128(b0, a7);\
b0 = _mm_xor_si128(b0, (TEMP_MUL1[5]));\
b0 = _mm_xor_si128(b0, (TEMP_MUL1[7]));\
/*b0 = _mm_xor_si128(b0, (TEMP_MUL2[0])); -> helper! */\
b0 = _mm_xor_si128(b0, (TEMP_MUL2[2]));\
b3 = b0;\
b1 = _mm_xor_si128(b1, b0);\
b0 = _mm_xor_si128(b0, b7); /* moved from 4 */\
\
/* 5 */\
b4 = _mm_xor_si128(b4, a2);\
/*b4 = _mm_xor_si128(b4, (TEMP_MUL1[0])); -> helper! */\
b4 = _mm_xor_si128(b4, (TEMP_MUL1[2]));\
b4 = _mm_xor_si128(b4, (TEMP_MUL2[3]));\
b4 = _mm_xor_si128(b4, (TEMP_MUL2[5]));\
b3 = _mm_xor_si128(b3, b4);\
b6 = _mm_xor_si128(b6, b4);\
\
/* 6 */\
a3 = _mm_xor_si128(a3, (TEMP_MUL1[1]));\
a3 = _mm_xor_si128(a3, (TEMP_MUL1[3]));\
a3 = _mm_xor_si128(a3, (TEMP_MUL2[4]));\
a3 = _mm_xor_si128(a3, (TEMP_MUL2[6]));\
b4 = _mm_xor_si128(b4, a3);\
b5 = _mm_xor_si128(b5, a3);\
b7 = _mm_xor_si128(b7, a3);\
\
/* 7 */\
a1 = _mm_xor_si128(a1, (TEMP_MUL1[1]));\
a1 = _mm_xor_si128(a1, (TEMP_MUL2[4]));\
b2 = _mm_xor_si128(b2, a1);\
b3 = _mm_xor_si128(b3, a1);\
\
/* 8 */\
a5 = _mm_xor_si128(a5, (TEMP_MUL1[5]));\
a5 = _mm_xor_si128(a5, (TEMP_MUL2[0]));\
b6 = _mm_xor_si128(b6, a5);\
b7 = _mm_xor_si128(b7, a5);\
\
/* 9 */\
a3 = TEMP_MUL1[2];\
a3 = _mm_xor_si128(a3, (TEMP_MUL2[5]));\
b0 = _mm_xor_si128(b0, a3);\
b5 = _mm_xor_si128(b5, a3);\
\
/* 10 */\
a1 = TEMP_MUL1[6];\
a1 = _mm_xor_si128(a1, (TEMP_MUL2[1]));\
b1 = _mm_xor_si128(b1, a1);\
b4 = _mm_xor_si128(b4, a1);\
\
/* 11 */\
a5 = TEMP_MUL1[3];\
a5 = _mm_xor_si128(a5, (TEMP_MUL2[6]));\
b1 = _mm_xor_si128(b1, a5);\
b6 = _mm_xor_si128(b6, a5);\
\
/* 12 */\
a3 = TEMP_MUL1[7];\
a3 = _mm_xor_si128(a3, (TEMP_MUL2[2]));\
b2 = _mm_xor_si128(b2, a3);\
b5 = _mm_xor_si128(b5, a3);\
\
/* 13 */\
b0 = _mm_xor_si128(b0, (TEMP_MUL4));\
b0 = _mm_xor_si128(b0, a4);\
b1 = _mm_xor_si128(b1, a4);\
b3 = _mm_xor_si128(b3, a6);\
b4 = _mm_xor_si128(b4, a0);\
b4 = _mm_xor_si128(b4, a7);\
b5 = _mm_xor_si128(b5, a0);\
b7 = _mm_xor_si128(b7, a2);\
}/**/
#define SET_CONSTANTS(){\
SET_SHARED_CONSTANTS();\
SUBSH_MASK[0] = _mm_set_epi32(0x080f0e0d, 0x0c0b0a09, 0x07060504, 0x03020100);\
SUBSH_MASK[1] = _mm_set_epi32(0x0a09080f, 0x0e0d0c0b, 0x00070605, 0x04030201);\
SUBSH_MASK[2] = _mm_set_epi32(0x0c0b0a09, 0x080f0e0d, 0x01000706, 0x05040302);\
SUBSH_MASK[3] = _mm_set_epi32(0x0e0d0c0b, 0x0a09080f, 0x02010007, 0x06050403);\
SUBSH_MASK[4] = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x03020100, 0x07060504);\
SUBSH_MASK[5] = _mm_set_epi32(0x09080f0e, 0x0d0c0b0a, 0x04030201, 0x00070605);\
SUBSH_MASK[6] = _mm_set_epi32(0x0b0a0908, 0x0f0e0d0c, 0x05040302, 0x01000706);\
SUBSH_MASK[7] = _mm_set_epi32(0x0d0c0b0a, 0x09080f0e, 0x06050403, 0x02010007);\
for(i = 0; i < ROUNDS512; i++)\
{\
ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\
}\
ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\
}/**/
/* vperm:
* transformation before rounds with ipt
* first round add transformed constant
* middle rounds: add constant XOR 0x15...15
* last round: additionally add 0x15...15 after MB
* transformation after rounds with opt
*/
/* one round
* i = round number
* a0-a7 = input rows
* b0-b7 = output rows
*/
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* AddRoundConstant + ShiftBytes (interleaved) */\
b1 = ROUND_CONST_Lx;\
a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\
a1 = _mm_xor_si128(a1, b1);\
a2 = _mm_xor_si128(a2, b1);\
a3 = _mm_xor_si128(a3, b1);\
a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\
a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\
a4 = _mm_xor_si128(a4, b1);\
a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\
a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\
a5 = _mm_xor_si128(a5, b1);\
a6 = _mm_xor_si128(a6, b1);\
a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\
a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\
a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\
a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\
a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\
/* SubBytes + Multiplication by 2 and 4 */\
VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\
/* MixBytes */\
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
}/**/
/* 10 rounds, P and Q in parallel */
#define ROUNDS_P_Q(){\
VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm0);\
ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm0);\
}
/* Matrix Transpose Step 1
* input is a 512-bit state with two columns in one xmm
* output is a 512-bit state with two rows in one xmm
* inputs: i0-i3
* outputs: i0, o1-o3
* clobbers: t0
*/
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
t0 = TRANSP_MASK;\
\
i0 = _mm_shuffle_epi8(i0, t0);\
i1 = _mm_shuffle_epi8(i1, t0);\
i2 = _mm_shuffle_epi8(i2, t0);\
i3 = _mm_shuffle_epi8(i3, t0);\
\
o1 = i0;\
t0 = i2;\
\
i0 = _mm_unpacklo_epi16(i0, i1);\
o1 = _mm_unpackhi_epi16(o1, i1);\
i2 = _mm_unpacklo_epi16(i2, i3);\
t0 = _mm_unpackhi_epi16(t0, i3);\
\
i0 = _mm_shuffle_epi32(i0, 216);\
o1 = _mm_shuffle_epi32(o1, 216);\
i2 = _mm_shuffle_epi32(i2, 216);\
t0 = _mm_shuffle_epi32(t0, 216);\
\
o2 = i0;\
o3 = o1;\
\
i0 = _mm_unpacklo_epi32(i0, i2);\
o1 = _mm_unpacklo_epi32(o1, t0);\
o2 = _mm_unpackhi_epi32(o2, i2);\
o3 = _mm_unpackhi_epi32(o3, t0);\
}/**/
/* Matrix Transpose Step 2
* input are two 512-bit states with two rows in one xmm
* output are two 512-bit states with one row of each state in one xmm
* inputs: i0-i3 = P, i4-i7 = Q
* outputs: (i0, o1-o7) = (P|Q)
* possible reassignments: (output reg = input reg)
* * i1 -> o3-7
* * i2 -> o5-7
* * i3 -> o7
* * i4 -> o3-7
* * i5 -> o6-7
*/
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
o1 = i0;\
o2 = i1;\
i0 = _mm_unpacklo_epi64(i0, i4);\
o1 = _mm_unpackhi_epi64(o1, i4);\
o3 = i1;\
o4 = i2;\
o2 = _mm_unpacklo_epi64(o2, i5);\
o3 = _mm_unpackhi_epi64(o3, i5);\
o5 = i2;\
o6 = i3;\
o4 = _mm_unpacklo_epi64(o4, i6);\
o5 = _mm_unpackhi_epi64(o5, i6);\
o7 = i3;\
o6 = _mm_unpacklo_epi64(o6, i7);\
o7 = _mm_unpackhi_epi64(o7, i7);\
}/**/
/* Matrix Transpose Inverse Step 2
* input are two 512-bit states with one row of each state in one xmm
* output are two 512-bit states with two rows in one xmm
* inputs: i0-i7 = (P|Q)
* outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
*/
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
o0 = i0;\
i0 = _mm_unpacklo_epi64(i0, i1);\
o0 = _mm_unpackhi_epi64(o0, i1);\
o1 = i2;\
i2 = _mm_unpacklo_epi64(i2, i3);\
o1 = _mm_unpackhi_epi64(o1, i3);\
o2 = i4;\
i4 = _mm_unpacklo_epi64(i4, i5);\
o2 = _mm_unpackhi_epi64(o2, i5);\
o3 = i6;\
i6 = _mm_unpacklo_epi64(i6, i7);\
o3 = _mm_unpackhi_epi64(o3, i7);\
}/**/
/* Matrix Transpose Output Step 2
* input is one 512-bit state with two rows in one xmm
* output is one 512-bit state with one row in the low 64-bits of one xmm
* inputs: i0,i2,i4,i6 = S
* outputs: (i0-7) = (0|S)
*/
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
t0 = _mm_xor_si128(t0, t0);\
i1 = i0;\
i3 = i2;\
i5 = i4;\
i7 = i6;\
i0 = _mm_unpacklo_epi64(i0, t0);\
i1 = _mm_unpackhi_epi64(i1, t0);\
i2 = _mm_unpacklo_epi64(i2, t0);\
i3 = _mm_unpackhi_epi64(i3, t0);\
i4 = _mm_unpacklo_epi64(i4, t0);\
i5 = _mm_unpackhi_epi64(i5, t0);\
i6 = _mm_unpacklo_epi64(i6, t0);\
i7 = _mm_unpackhi_epi64(i7, t0);\
}/**/
/* Matrix Transpose Output Inverse Step 2
* input is one 512-bit state with one row in the low 64-bits of one xmm
* output is one 512-bit state with two rows in one xmm
* inputs: i0-i7 = (0|S)
* outputs: (i0, i2, i4, i6) = S
*/
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
i0 = _mm_unpacklo_epi64(i0, i1);\
i2 = _mm_unpacklo_epi64(i2, i3);\
i4 = _mm_unpacklo_epi64(i4, i5);\
i6 = _mm_unpacklo_epi64(i6, i7);\
}/**/
/* transform round constants into VPERM mode */
#define VPERM_Transform_RoundConst_CNT2(i, j){\
xmm0 = ROUND_CONST_L0[i];\
xmm1 = ROUND_CONST_L7[i];\
xmm2 = ROUND_CONST_L0[j];\
xmm3 = ROUND_CONST_L7[j];\
VPERM_Transform_State(xmm0, xmm1, xmm2, xmm3, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\
xmm0 = _mm_xor_si128(xmm0, (ALL_15));\
xmm1 = _mm_xor_si128(xmm1, (ALL_15));\
xmm2 = _mm_xor_si128(xmm2, (ALL_15));\
xmm3 = _mm_xor_si128(xmm3, (ALL_15));\
ROUND_CONST_L0[i] = xmm0;\
ROUND_CONST_L7[i] = xmm1;\
ROUND_CONST_L0[j] = xmm2;\
ROUND_CONST_L7[j] = xmm3;\
}/**/
/* transform round constants into VPERM mode */
#define VPERM_Transform_RoundConst(){\
xmm0 = ROUND_CONST_Lx;\
VPERM_Transform(xmm0, xmm1, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\
xmm0 = _mm_xor_si128(xmm0, (ALL_15));\
ROUND_CONST_Lx = xmm0;\
VPERM_Transform_RoundConst_CNT2(0, 1);\
VPERM_Transform_RoundConst_CNT2(2, 3);\
VPERM_Transform_RoundConst_CNT2(4, 5);\
VPERM_Transform_RoundConst_CNT2(6, 7);\
VPERM_Transform_RoundConst_CNT2(8, 9);\
}/**/
void INIT256(u64* h)
{
__m128i* const chaining = (__m128i*) h;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, /*xmm11,*/ xmm12, xmm13, xmm14, xmm15;
/* transform round constants into VPERM mode */
VPERM_Transform_RoundConst();
/* load IV into registers xmm12 - xmm15 */
xmm12 = chaining[0];
xmm13 = chaining[1];
xmm14 = chaining[2];
xmm15 = chaining[3];
/* transform chaining value from column ordering into row ordering */
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
/* store transposed IV */
chaining[0] = xmm12;
chaining[1] = xmm2;
chaining[2] = xmm6;
chaining[3] = xmm7;
}
void TF512(u64* h, u64* m)
{
__m128i* const chaining = (__m128i*) h;
__m128i* const message = (__m128i*) m;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i TEMP_MUL1[8];
static __m128i TEMP_MUL2[8];
static __m128i TEMP_MUL4;
#ifdef IACA_TRACE
IACA_START;
#endif
/* load message into registers xmm12 - xmm15 */
xmm12 = message[0];
xmm13 = message[1];
xmm14 = message[2];
xmm15 = message[3];
/* transform message M from column ordering into row ordering */
/* we first put two rows (64 bit) of the message into one 128-bit xmm register */
VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
/* load previous chaining value */
/* we first put two rows (64 bit) of the CV into one 128-bit xmm register */
xmm8 = chaining[0];
xmm0 = chaining[1];
xmm4 = chaining[2];
xmm5 = chaining[3];
/* xor message to CV get input of P */
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
xmm8 = _mm_xor_si128(xmm8, xmm12);
xmm0 = _mm_xor_si128(xmm0, xmm2);
xmm4 = _mm_xor_si128(xmm4, xmm6);
xmm5 = _mm_xor_si128(xmm5, xmm7);
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
/* result: the 8 rows of P and Q in xmm8 - xmm12 */
Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
/* compute the two permutations P and Q in parallel */
ROUNDS_P_Q();
/* unpack again to get two rows of P or two rows of Q in one xmm register */
Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3);
/* xor output of P and Q */
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
xmm0 = _mm_xor_si128(xmm0, xmm8);
xmm1 = _mm_xor_si128(xmm1, xmm10);
xmm2 = _mm_xor_si128(xmm2, xmm12);
xmm3 = _mm_xor_si128(xmm3, xmm14);
/* xor CV (feed-forward) */
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
xmm0 = _mm_xor_si128(xmm0, (chaining[0]));
xmm1 = _mm_xor_si128(xmm1, (chaining[1]));
xmm2 = _mm_xor_si128(xmm2, (chaining[2]));
xmm3 = _mm_xor_si128(xmm3, (chaining[3]));
/* store CV */
chaining[0] = xmm0;
chaining[1] = xmm1;
chaining[2] = xmm2;
chaining[3] = xmm3;
#ifdef IACA_TRACE
IACA_END;
#endif
return;
}
void OF512(u64* h)
{
__m128i* const chaining = (__m128i*) h;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i TEMP_MUL1[8];
static __m128i TEMP_MUL2[8];
static __m128i TEMP_MUL4;
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
xmm8 = chaining[0];
xmm10 = chaining[1];
xmm12 = chaining[2];
xmm14 = chaining[3];
/* there are now 2 rows of the CV in one xmm register */
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
/* result: the 8 input rows of P in xmm8 - xmm15 */
Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0);
/* compute the permutation P */
/* result: the output of P(CV) in xmm8 - xmm15 */
ROUNDS_P_Q();
/* unpack again to get two rows of P in one xmm register */
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
/* xor CV to P output (feed-forward) */
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
xmm10 = _mm_xor_si128(xmm10, (chaining[1]));
xmm12 = _mm_xor_si128(xmm12, (chaining[2]));
xmm14 = _mm_xor_si128(xmm14, (chaining[3]));
/* transform state back from row ordering into column ordering */
/* result: final hash value in xmm9, xmm11 */
Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0);
VPERM_Transform(xmm9, xmm11, VPERM_OPT, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm7);
/* we only need to return the truncated half of the state */
chaining[2] = xmm9;
chaining[3] = xmm11;
return;
}//OF512()

View File

@@ -16,13 +16,48 @@
#ifdef __AES__
#include "groestl-intr-aes.h"
#include "groestl-version.h"
#ifdef TASM
#ifdef VAES
#include "groestl-asm-aes.h"
#else
#ifdef VAVX
#include "groestl-asm-avx.h"
#else
#ifdef VVPERM
#include "groestl-asm-vperm.h"
#else
#error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
#endif
#endif
#endif
#else
#ifdef TINTR
#ifdef VAES
#include "groestl-intr-aes.h"
#else
#ifdef VAVX
#include "groestl-intr-avx.h"
#else
#ifdef VVPERM
#include "groestl-intr-vperm.h"
#else
#error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
#endif
#endif
#endif
#else
#error NO TYPE SPECIFIED (-DT[ASM/INTR])
#endif
#endif
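This selector block expects one type flag and one version flag on the compiler command line; for instance, a build of the intrinsics AES variant would be invoked roughly as follows (an assumed command with a hypothetical file name, shown only to illustrate the flag scheme):
cc -c -DTINTR -DVAES hash-groestl.c
With AES-NI detected via __AES__, the two-line include path shown above needs no such flags.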
HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen )
{
int i;
ctx->hashlen = hashlen;
SET_CONSTANTS();
if (ctx->chaining == NULL || ctx->buffer == NULL)
return FAIL_GR;
@@ -35,6 +70,8 @@ HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen )
// The only non-zero in the IV is len. It can be hard coded.
ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
// ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
// INIT(ctx->chaining);
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
@@ -55,6 +92,8 @@ HashReturn_gr reinit_groestl( hashState_groestl* ctx )
ctx->buffer[i] = _mm_setzero_si128();
}
ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
// ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
// INIT(ctx->chaining);
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
@@ -70,7 +109,7 @@ HashReturn_gr reinit_groestl( hashState_groestl* ctx )
// 5. Midstate will work at reduced impact than full hash, if total hash
// (midstate + tail) is less than 1 block.
// This, unfortunately, is the case with all current users.
// 6. the more full blocks the bigger the gain
// 6. the morefull blocks the bigger the gain
// use only for midstate precalc
HashReturn_gr update_groestl( hashState_groestl* ctx, const void* input,
@@ -104,11 +143,12 @@ HashReturn_gr update_groestl( hashState_groestl* ctx, const void* input,
// deprecated do not use
HashReturn_gr final_groestl( hashState_groestl* ctx, void* output )
{
const int len = (int)ctx->databitlen / 128; // bits to __m128i
const uint64_t blocks = ctx->blk_count + 1; // adjust for final block
const int rem_ptr = ctx->rem_ptr; // end of data start of padding
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
const int hash_offset = SIZE512 - hashlen_m128i; // where in buffer
const int len = (int)ctx->databitlen / 128; // bits to __m128i
const int blocks = ctx->blk_count + 1; // adjust for final block
const int rem_ptr = ctx->rem_ptr; // end of data start of padding
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
const int hash_offset = SIZE512 - hashlen_m128i; // where in buffer
int i;
// first pad byte = 0x80, last pad byte = block count
@@ -117,18 +157,21 @@ HashReturn_gr final_groestl( hashState_groestl* ctx, void* output )
if ( rem_ptr == len - 1 )
{
// only 128 bits left in buffer, all padding at once
ctx->buffer[rem_ptr] = _mm_set_epi64x( blocks << 56, 0x80 );
ctx->buffer[rem_ptr] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 );
}
else
{
// add first padding
ctx->buffer[rem_ptr] = m128_const_64( 0, 0x80 );
ctx->buffer[rem_ptr] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 );
// add zero padding
for ( i = rem_ptr + 1; i < SIZE512 - 1; i++ )
ctx->buffer[i] = _mm_setzero_si128();
// add length padding, second last byte is zero unless blocks > 255
ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0 );
ctx->buffer[i] = _mm_set_epi8( blocks, blocks>>8, 0,0, 0,0,0,0,
0, 0 ,0,0, 0,0,0,0 );
}
// digest final padding block and do output transform
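Groestl-512 padding, as handled above, appends 0x80 directly after the message, zero-fills, and stores the total block count big-endian in the last bytes of the 128-byte block; the code cheats by writing only the low one or two count bytes since the count is known to be small. A byte-level sketch under the assumption that data and padding fit in one block (rem <= 119):
#include <stdint.h>
#include <string.h>
/* Padding sketch for one 128-byte Groestl-512 block: 0x80 after
 * the data, zeros, then the 64-bit big-endian block count in the
 * final 8 bytes. Assumes rem <= 119 so everything fits. */
static void pad_block( uint8_t buf[128], size_t rem, uint64_t blocks )
{
   buf[ rem ] = 0x80;
   memset( buf + rem + 1, 0, 128 - rem - 9 );
   for ( int i = 0; i < 8; i++ )
      buf[ 127 - i ] = (uint8_t)( blocks >> ( 8 * i ) );
}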
@@ -146,20 +189,21 @@ int groestl512_full( hashState_groestl* ctx, void* output,
const void* input, uint64_t databitlen )
{
int i;
ctx->hashlen = 64;
int i;
ctx->hashlen = 64;
SET_CONSTANTS();
for ( i = 0; i < SIZE512; i++ )
{
ctx->chaining[i] = _mm_setzero_si128();
ctx->buffer[i] = _mm_setzero_si128();
}
ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
for ( i = 0; i < SIZE512; i++ )
{
ctx->chaining[i] = _mm_setzero_si128();
ctx->buffer[i] = _mm_setzero_si128();
}
ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
// --- update ---
const int len = (int)databitlen / 128;
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
const int hash_offset = SIZE512 - hashlen_m128i;
@@ -167,6 +211,8 @@ int groestl512_full( hashState_groestl* ctx, void* output,
uint64_t blocks = len / SIZE512;
__m128i* in = (__m128i*)input;
// --- update ---
// digest any full blocks, process directly from input
for ( i = 0; i < blocks; i++ )
TF1024( ctx->chaining, &in[ i * SIZE512 ] );
@@ -185,22 +231,26 @@ int groestl512_full( hashState_groestl* ctx, void* output,
if ( i == len -1 )
{
// only 128 bits left in buffer, all padding at once
ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0x80 );
ctx->buffer[i] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 );
}
else
{
// add first padding
ctx->buffer[i] = m128_const_64( 0, 0x80 );
ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 );
// add zero padding
for ( i += 1; i < SIZE512 - 1; i++ )
ctx->buffer[i] = _mm_setzero_si128();
// add length padding, second last byte is zero unless blocks > 255
ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0 );
ctx->buffer[i] = _mm_set_epi8( blocks, blocks>>8, 0,0, 0,0,0,0,
0, 0 ,0,0, 0,0,0,0 );
}
// digest final padding block and do output transform
TF1024( ctx->chaining, ctx->buffer );
OF1024( ctx->chaining );
// store hash result in output
@@ -218,7 +268,7 @@ HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output,
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
const int hash_offset = SIZE512 - hashlen_m128i;
int rem = ctx->rem_ptr;
uint64_t blocks = len / SIZE512;
int blocks = len / SIZE512;
__m128i* in = (__m128i*)input;
int i;
@@ -242,22 +292,26 @@ HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output,
if ( i == len -1 )
{
// only 128 bits left in buffer, all padding at once
ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0x80 );
ctx->buffer[i] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 );
}
else
{
// add first padding
ctx->buffer[i] = m128_const_64( 0, 0x80 );
ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 );
// add zero padding
for ( i += 1; i < SIZE512 - 1; i++ )
ctx->buffer[i] = _mm_setzero_si128();
// add length padding, second last byte is zero unless blocks > 255
ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0 );
ctx->buffer[i] = _mm_set_epi8( blocks, blocks>>8, 0,0, 0,0,0,0,
0, 0 ,0,0, 0,0,0,0 );
}
// digest final padding block and do output transform
TF1024( ctx->chaining, ctx->buffer );
OF1024( ctx->chaining );
// store hash result in output

View File

@@ -13,7 +13,41 @@
#ifdef __AES__
#include "groestl256-intr-aes.h"
#include "groestl-version.h"
#ifdef TASM
#ifdef VAES
#include "groestl256-asm-aes.h"
#else
#ifdef VAVX
#include "groestl256-asm-avx.h"
#else
#ifdef VVPERM
#include "groestl256-asm-vperm.h"
#else
#error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
#endif
#endif
#endif
#else
#ifdef TINTR
#ifdef VAES
#include "groestl256-intr-aes.h"
#else
#ifdef VAVX
#include "groestl256-intr-avx.h"
#else
#ifdef VVPERM
#include "groestl256-intr-vperm.h"
#else
#error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
#endif
#endif
#endif
#else
#error NO TYPE SPECIFIED (-DT[ASM/INTR])
#endif
#endif
/* initialise context */
HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen )
@@ -21,6 +55,7 @@ HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen )
int i;
ctx->hashlen = hashlen;
SET_CONSTANTS();
if (ctx->chaining == NULL || ctx->buffer == NULL)
return FAIL_GR;
@@ -214,98 +249,6 @@ HashReturn_gr update_and_final_groestl256( hashState_groestl256* ctx,
return SUCCESS_GR;
}
int groestl256_full( hashState_groestl256* ctx,
void* output, const void* input, DataLength_gr databitlen )
{
int i;
ctx->hashlen = 32;
for ( i = 0; i < SIZE256; i++ )
{
ctx->chaining[i] = _mm_setzero_si128();
ctx->buffer[i] = _mm_setzero_si128();
}
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
INIT256( ctx->chaining );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
const int len = (int)databitlen / 128;
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
const int hash_offset = SIZE256 - hashlen_m128i;
int rem = ctx->rem_ptr;
int blocks = len / SIZE256;
__m128i* in = (__m128i*)input;
// --- update ---
// digest any full blocks, process directly from input
for ( i = 0; i < blocks; i++ )
TF512( ctx->chaining, &in[ i * SIZE256 ] );
ctx->buf_ptr = blocks * SIZE256;
// cryptonight has 200 byte input, an odd number of __m128i
// remainder is only 8 bytes, ie u64.
if ( databitlen % 128 != 0 )
{
// must be cryptonight, copy 64 bits of data
*(uint64_t*)(ctx->buffer) = *(uint64_t*)(&in[ ctx->buf_ptr ] );
i = -1; // signal for odd length
}
else
{
// Copy any remaining data to buffer for final transform
for ( i = 0; i < len % SIZE256; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
i += rem; // use i as rem_ptr in final
}
//--- final ---
// adjust for final block
blocks++;
if ( i == len - 1 )
{
// all padding at once
ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0,
0, 0,0,0, 0,0,0,0x80 );
}
else
{
if ( i == -1 )
{
// cryptonight odd length
((uint64_t*)ctx->buffer)[ 1 ] = 0x80ull;
// finish the block with zero and length padding as normal
i = 0;
}
else
{
// add first padding
ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 );
}
// add zero padding
for ( i += 1; i < SIZE256 - 1; i++ )
ctx->buffer[i] = _mm_setzero_si128();
// add length padding
// cheat since we know the block count is trivial, good if block < 256
ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0,
0, 0,0,0, 0,0,0,0 );
}
// digest final padding block and do output transform
TF512( ctx->chaining, ctx->buffer );
OF512( ctx->chaining );
// store hash result in output
for ( i = 0; i < hashlen_m128i; i++ )
casti_m128i( output, i ) = ctx->chaining[ hash_offset + i ];
return SUCCESS_GR;
}
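The odd-length arithmetic above is worth spelling out for its one user: a 200-byte CryptoNight input gives len = 1600 / 128 = 12 full __m128i, blocks = 12 / 4 = 3 full 64-byte blocks, and 200 - 3*64 = 8 trailing bytes, which is exactly the single uint64_t copied into the buffer before the 0x80 pad byte is placed at offset 8.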
/* hash bit sequence */
HashReturn_gr hash_groestl256(int hashbitlen,
const BitSequence_gr* data,

View File

@@ -115,7 +115,4 @@ HashReturn_gr hash_groestli256( int, const BitSequence_gr*, DataLength_gr,
HashReturn_gr update_and_final_groestl256( hashState_groestl256*, void*,
const void*, DataLength_gr );
int groestl256_full( hashState_groestl256* ctx,
void* output, const void* input, DataLength_gr databitlen );
#endif /* __hash_h */

View File

@@ -1,7 +1,4 @@
#include "groestl-gate.h"
#if !defined(GROESTL_8WAY) && !defined(GROESTLX16R_4WAY)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
@@ -91,4 +88,4 @@ int scanhash_groestl( struct work *work, uint32_t max_nonce,
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
#endif

View File

@@ -15,7 +15,7 @@
#include "miner.h"
#include "simd-utils.h"
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(__VAES__)
int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )
@@ -23,6 +23,7 @@ int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )
int i;
ctx->hashlen = hashlen;
SET_CONSTANTS();
if (ctx->chaining == NULL || ctx->buffer == NULL)
return 1;
@@ -35,6 +36,9 @@ int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )
// The only non-zero in the IV is len. It can be hard coded.
ctx->chaining[ 3 ] = m512_const2_64( 0, 0x0100000000000000 );
// uint64_t len = U64BIG((uint64_t)LENGTH);
// ctx->chaining[ COLS/2 -1 ] = _mm512_set4_epi64( len, 0, len, 0 );
// INIT256_4way(ctx->chaining);
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
@@ -42,77 +46,6 @@ int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )
return 0;
}
int groestl256_4way_full( groestl256_4way_context* ctx, void* output,
const void* input, uint64_t databitlen )
{
const int len = (int)databitlen / 128;
const int hashlen_m128i = 32 / 16; // bytes to __m128i
const int hash_offset = SIZE256 - hashlen_m128i;
int rem = ctx->rem_ptr;
int blocks = len / SIZE256;
__m512i* in = (__m512i*)input;
int i;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return 1;
for ( i = 0; i < SIZE256; i++ )
{
ctx->chaining[i] = m512_zero;
ctx->buffer[i] = m512_zero;
}
// The only non-zero in the IV is len. It can be hard coded.
ctx->chaining[ 3 ] = m512_const2_64( 0, 0x0100000000000000 );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
// --- update ---
// digest any full blocks, process directly from input
for ( i = 0; i < blocks; i++ )
TF512_4way( ctx->chaining, &in[ i * SIZE256 ] );
ctx->buf_ptr = blocks * SIZE256;
// copy any remaining data to buffer, it may already contain data
// from a previous update for a midstate precalc
for ( i = 0; i < len % SIZE256; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
i += rem; // use i as rem_ptr in final
//--- final ---
blocks++; // adjust for final block
if ( i == SIZE256 - 1 )
{
// only 1 vector left in buffer, all padding at once
ctx->buffer[i] = m512_const2_64( (uint64_t)blocks << 56, 0x80 );
}
else
{
// add first padding
ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 );
// add zero padding
for ( i += 1; i < SIZE256 - 1; i++ )
ctx->buffer[i] = m512_zero;
// add length padding, second last byte is zero unless blocks > 255
ctx->buffer[i] = m512_const2_64( (uint64_t)blocks << 56, 0 );
}
// digest final padding block and do output transform
TF512_4way( ctx->chaining, ctx->buffer );
OF512_4way( ctx->chaining );
// store hash result in output
for ( i = 0; i < hashlen_m128i; i++ )
casti_m512i( output, i ) = ctx->chaining[ hash_offset + i ];
return 0;
}
int groestl256_4way_update_close( groestl256_4way_context* ctx, void* output,
const void* input, uint64_t databitlen )
{
@@ -142,11 +75,11 @@ int groestl256_4way_update_close( groestl256_4way_context* ctx, void* output,
blocks++; // adjust for final block
if ( i == SIZE256 - 1 )
{
{
// only 1 vector left in buffer, all padding at once
ctx->buffer[i] = m512_const1_128( _mm_set_epi8(
blocks, blocks>>8,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) );
}
}
else
{
// add first padding

View File

@@ -18,8 +18,6 @@
#endif
#include <stdlib.h>
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define LENGTH (256)
//#include "brg_endian.h"
@@ -71,8 +69,4 @@ int groestl256_4way_init( groestl256_4way_context*, uint64_t );
int groestl256_4way_update_close( groestl256_4way_context*, void*,
const void*, uint64_t );
int groestl256_4way_full( groestl256_4way_context*, void*,
const void*, uint64_t );
#endif
#endif


@@ -14,78 +14,17 @@
#include "groestl256-hash-4way.h"
#if defined(__VAES__)
static const __m128i round_const_l0[] __attribute__ ((aligned (64))) =
{
{ 0x7060504030201000, 0xffffffffffffffff },
{ 0x7161514131211101, 0xffffffffffffffff },
{ 0x7262524232221202, 0xffffffffffffffff },
{ 0x7363534333231303, 0xffffffffffffffff },
{ 0x7464544434241404, 0xffffffffffffffff },
{ 0x7565554535251505, 0xffffffffffffffff },
{ 0x7666564636261606, 0xffffffffffffffff },
{ 0x7767574737271707, 0xffffffffffffffff },
{ 0x7868584838281808, 0xffffffffffffffff },
{ 0x7969594939291909, 0xffffffffffffffff }
};
static const __m128i round_const_l7[] __attribute__ ((aligned (64))) =
{
{ 0x0000000000000000, 0x8f9fafbfcfdfefff },
{ 0x0000000000000000, 0x8e9eaebecedeeefe },
{ 0x0000000000000000, 0x8d9dadbdcdddedfd },
{ 0x0000000000000000, 0x8c9cacbcccdcecfc },
{ 0x0000000000000000, 0x8b9babbbcbdbebfb },
{ 0x0000000000000000, 0x8a9aaabacadaeafa },
{ 0x0000000000000000, 0x8999a9b9c9d9e9f9 },
{ 0x0000000000000000, 0x8898a8b8c8d8e8f8 },
{ 0x0000000000000000, 0x8797a7b7c7d7e7f7 },
{ 0x0000000000000000, 0x8696a6b6c6d6e6f6 }
};
static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02,
0x1d1519111c141810, 0x1f171b131e161a12,
0x2d2529212c242820, 0x2f272b232e262a22,
0x3d3539313c343830, 0x3f373b333e363a32 };
static const __m512i SUBSH_MASK0 = { 0x0c0f0104070b0e00, 0x03060a0d08020509,
0x1c1f1114171b1e10, 0x13161a1d18121519,
0x2c2f2124272b2e20, 0x23262a2d28222529,
0x3c3f3134373b3e30, 0x33363a3d38323539 };
static const __m512i SUBSH_MASK1 = { 0x0e090205000d0801, 0x04070c0f0a03060b,
0x1e191215101d1801, 0x14171c1f1a13161b,
0x2e292225202d2821, 0x24272c2f2a23262b,
0x3e393235303d3831, 0x34373c3f3a33363b };
static const __m512i SUBSH_MASK2 = { 0x080b0306010f0a02, 0x05000e090c04070d,
0x181b1316111f1a12, 0x15101e191c14171d,
0x282b2326212f2a22, 0x25202e292c24272d,
0x383b3336313f3a32, 0x35303e393c34373d };
static const __m512i SUBSH_MASK3 = { 0x0a0d040702090c03, 0x0601080b0e05000f,
0x1a1d141712191c13, 0x1611181b1e15101f,
0x2a2d242722292c23, 0x2621282b2e25202f,
0x3a3d343732393c33, 0x3631383b3e35303f };
static const __m512i SUBSH_MASK4 = { 0x0b0e0500030a0d04, 0x0702090c0f060108,
0x1b1e1510131a1d14, 0x1712191c1f161118,
0x2b2e2520232a2d24, 0x2722292c2f262128,
0x3b3e3530333a3d34, 0x3732393c3f363138 };
static const __m512i SUBSH_MASK5 = { 0x0d080601040c0f05, 0x00030b0e0907020a,
0x1d181611141c1f15, 0x10131b1e1917121a,
0x2d282621242c2f25, 0x20232b2e2927222a,
0x3d383631343c3f35, 0x30333b3e3937323a };
static const __m512i SUBSH_MASK6 = { 0x0f0a0702050e0906, 0x01040d080b00030c,
0x1f1a1712151e1916, 0x11141d181b10131c,
0x2f2a2722252e2926, 0x21242d282b20232c,
0x3f3a3732353e3936, 0x31343d383b30333c };
static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
0x191c101316181b17, 0x12151f1a1d11141e,
0x292c202326282b27, 0x22252f2a2d21242e,
0x393c303336383b37, 0x32353f3a3d31343e };
/* global constants */
__m512i ROUND_CONST_Lx;
__m512i ROUND_CONST_L0[ROUNDS512];
__m512i ROUND_CONST_L7[ROUNDS512];
//__m512i ROUND_CONST_P[ROUNDS1024];
//__m512i ROUND_CONST_Q[ROUNDS1024];
__m512i TRANSP_MASK;
__m512i SUBSH_MASK[8];
__m512i ALL_1B;
__m512i ALL_FF;
#define tos(a) #a
#define tostr(a) tos(a)
@@ -101,6 +40,8 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
i = _mm512_xor_si512(i, j);\
}
/**/
/* Yet another implementation of MixBytes.
This time we use the formulae (3) from the paper "Byte Slicing Groestl".
Input: a0, ..., a7
@@ -214,36 +155,95 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
b1 = _mm512_xor_si512(b1, a4);\
}/*MixBytes*/
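
Note (editorial, not part of the diff): the "formulae (3)" comment above refers to the byte-sliced evaluation of Grøstl's MixBytes step. Assuming the standard Grøstl specification, MixBytes multiplies each state column by a circulant matrix over GF(2^8) (AES polynomial x^8 + x^4 + x^3 + x + 1):

    b_i = \bigoplus_{k=0}^{7} c_k \cdot a_{(i+k) \bmod 8},
    \qquad (c_0,\ldots,c_7) = (02, 02, 03, 04, 05, 03, 05, 07)

The macro above evaluates this with XORs plus doublings (multiplications by 02), which is why only the ALL_1B reduction constant is needed.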
// calculate the round constants separately and load at startup
#define SET_CONSTANTS(){\
ALL_1B = _mm512_set1_epi32( 0x1b1b1b1b );\
TRANSP_MASK = _mm512_set_epi32( \
0x3f373b33, 0x3e363a32, 0x3d353931, 0x3c343830, \
0x2f272b23, 0x2e262a22, 0x2d252921, 0x2c242820, \
0x1f171b13, 0x1e161a12, 0x1d151911, 0x1c141810, \
0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800 ); \
SUBSH_MASK[0] = _mm512_set_epi32( \
0x33363a3d, 0x38323539, 0x3c3f3134, 0x373b3e30, \
0x23262a2d, 0x28222529, 0x2c2f2124, 0x272b2e20, \
0x13161a1d, 0x18121519, 0x1c1f1114, 0x171b1e10, \
0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00 ); \
SUBSH_MASK[1] = _mm512_set_epi32( \
0x34373c3f, 0x3a33363b, 0x3e393235, 0x303d3831, \
0x24272c2f, 0x2a23262b, 0x2e292225, 0x202d2821, \
0x14171c1f, 0x1a13161b, 0x1e191215, 0x101d1801, \
0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801 );\
SUBSH_MASK[2] = _mm512_set_epi32( \
0x35303e39, 0x3c34373d, 0x383b3336, 0x313f3a32, \
0x25202e29, 0x2c24272d, 0x282b2326, 0x212f2a22, \
0x15101e19, 0x1c14171d, 0x181b1316, 0x111f1a12, \
0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02 );\
SUBSH_MASK[3] = _mm512_set_epi32( \
0x3631383b, 0x3e35303f, 0x3a3d3437, 0x32393c33, \
0x2621282b, 0x2e25202f, 0x2a2d2427, 0x22292c23, \
0x1611181b, 0x1e15101f, 0x1a1d1417, 0x12191c13, \
0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03 );\
SUBSH_MASK[4] = _mm512_set_epi32( \
0x3732393c, 0x3f363138, 0x3b3e3530, 0x333a3d34, \
0x2722292c, 0x2f262128, 0x2b2e2520, 0x232a2d24, \
0x1712191c, 0x1f161118, 0x1b1e1510, 0x131a1d14, \
0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04 );\
SUBSH_MASK[5] = _mm512_set_epi32( \
0x30333b3e, 0x3937323a, 0x3d383631, 0x343c3f35, \
0x20232b2e, 0x2927222a, 0x2d282621, 0x242c2f25, \
0x10131b1e, 0x1917121a, 0x1d181611, 0x141c1f15, \
0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05 );\
SUBSH_MASK[6] = _mm512_set_epi32( \
0x31343d38, 0x3b30333c, 0x3f3a3732, 0x353e3936, \
0x21242d28, 0x2b20232c, 0x2f2a2722, 0x252e2926, \
0x11141d18, 0x1b10131c, 0x1f1a1712, 0x151e1916, \
0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906 );\
SUBSH_MASK[7] = _mm512_set_epi32( \
0x32353f3a, 0x3d31343e, 0x393c3033, 0x36383b37, \
0x22252f2a, 0x2d21242e, 0x292c2023, 0x26282b27, \
0x12151f1a, 0x1d11141e, 0x191c1013, 0x16181b17, \
0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07 );\
for ( i = 0; i < ROUNDS512; i++ ) \
{\
ROUND_CONST_L0[i] = _mm512_set4_epi32( 0xffffffff, 0xffffffff, \
0x70605040 ^ ( i * 0x01010101 ), 0x30201000 ^ ( i * 0x01010101 ) ); \
ROUND_CONST_L7[i] = _mm512_set4_epi32( 0x8f9fafbf ^ ( i * 0x01010101 ), \
0xcfdfefff ^ ( i * 0x01010101 ), 0x00000000, 0x00000000 ); \
}\
ROUND_CONST_Lx = _mm512_set4_epi32( 0xffffffff, 0xffffffff, \
0x00000000, 0x00000000 ); \
}while(0);\
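
Note (editorial, not part of the diff): both the removed static tables and this macro encode the same AddRoundConstant values; byte j of the low half of ROUND_CONST_L0[i] is (j << 4) ^ i, which the `^ ( i * 0x01010101 )` terms broadcast across all four 32-bit words. A scalar sketch, for illustration only:

    uint8_t rc_l0[8];                       // low 8 bytes; high 8 are 0xff
    for ( int j = 0; j < 8; j++ )
        rc_l0[j] = (uint8_t)( ( j << 4 ) ^ i );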
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* AddRoundConstant */\
b1 = m512_const2_64( 0xffffffffffffffff, 0 ); \
a0 = _mm512_xor_si512( a0, m512_const1_128( round_const_l0[i] ) );\
b1 = ROUND_CONST_Lx;\
a0 = _mm512_xor_si512( a0, (ROUND_CONST_L0[i]) );\
a1 = _mm512_xor_si512( a1, b1 );\
a2 = _mm512_xor_si512( a2, b1 );\
a3 = _mm512_xor_si512( a3, b1 );\
a4 = _mm512_xor_si512( a4, b1 );\
a5 = _mm512_xor_si512( a5, b1 );\
a6 = _mm512_xor_si512( a6, b1 );\
a7 = _mm512_xor_si512( a7, m512_const1_128( round_const_l7[i] ) );\
a7 = _mm512_xor_si512( a7, (ROUND_CONST_L7[i]) );\
\
/* ShiftBytes + SubBytes (interleaved) */\
b0 = _mm512_xor_si512( b0, b0 );\
a0 = _mm512_shuffle_epi8( a0, SUBSH_MASK0 );\
a0 = _mm512_shuffle_epi8( a0, (SUBSH_MASK[0]) );\
a0 = _mm512_aesenclast_epi128(a0, b0 );\
a1 = _mm512_shuffle_epi8( a1, SUBSH_MASK1 );\
a1 = _mm512_shuffle_epi8( a1, (SUBSH_MASK[1]) );\
a1 = _mm512_aesenclast_epi128(a1, b0 );\
a2 = _mm512_shuffle_epi8( a2, SUBSH_MASK2 );\
a2 = _mm512_shuffle_epi8( a2, (SUBSH_MASK[2]) );\
a2 = _mm512_aesenclast_epi128(a2, b0 );\
a3 = _mm512_shuffle_epi8( a3, SUBSH_MASK3 );\
a3 = _mm512_shuffle_epi8( a3, (SUBSH_MASK[3]) );\
a3 = _mm512_aesenclast_epi128(a3, b0 );\
a4 = _mm512_shuffle_epi8( a4, SUBSH_MASK4 );\
a4 = _mm512_shuffle_epi8( a4, (SUBSH_MASK[4]) );\
a4 = _mm512_aesenclast_epi128(a4, b0 );\
a5 = _mm512_shuffle_epi8( a5, SUBSH_MASK5 );\
a5 = _mm512_shuffle_epi8( a5, (SUBSH_MASK[5]) );\
a5 = _mm512_aesenclast_epi128(a5, b0 );\
a6 = _mm512_shuffle_epi8( a6, SUBSH_MASK6 );\
a6 = _mm512_shuffle_epi8( a6, (SUBSH_MASK[6]) );\
a6 = _mm512_aesenclast_epi128(a6, b0 );\
a7 = _mm512_shuffle_epi8( a7, SUBSH_MASK7 );\
a7 = _mm512_shuffle_epi8( a7, (SUBSH_MASK[7]) );\
a7 = _mm512_aesenclast_epi128( a7, b0 );\
\
/* MixBytes */\
@@ -390,6 +390,29 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
}/**/
void INIT256_4way( __m512i* chaining )
{
static __m512i xmm0, xmm2, xmm6, xmm7;
static __m512i xmm12, xmm13, xmm14, xmm15;
/* load IV into registers xmm12 - xmm15 */
xmm12 = chaining[0];
xmm13 = chaining[1];
xmm14 = chaining[2];
xmm15 = chaining[3];
/* transform chaining value from column ordering into row ordering */
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
/* store transposed IV */
chaining[0] = xmm12;
chaining[1] = xmm2;
chaining[2] = xmm6;
chaining[3] = xmm7;
}
void TF512_4way( __m512i* chaining, __m512i* message )
{
static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;


@@ -15,10 +15,14 @@
#include "miner.h"
#include "simd-utils.h"
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(__VAES__)
int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen )
{
int i;
SET_CONSTANTS();
if (ctx->chaining == NULL || ctx->buffer == NULL)
return 1;
@@ -95,6 +99,7 @@ int groestl512_4way_full( groestl512_4way_context* ctx, void* output,
// --- init ---
SET_CONSTANTS();
memset_zero_512( ctx->chaining, SIZE512 );
memset_zero_512( ctx->buffer, SIZE512 );
ctx->chaining[ 6 ] = m512_const2_64( 0x0200000000000000, 0 );


@@ -10,8 +10,6 @@
#endif
#include <stdlib.h>
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define LENGTH (512)
/* some sizes (number of bytes) */
@@ -58,5 +56,4 @@ int groestl512_4way_update_close( groestl512_4way_context*, void*,
int groestl512_4way_full( groestl512_4way_context*, void*,
const void*, uint64_t );
#endif // VAES
#endif // GROESTL512_HASH_4WAY_H__
#endif /* __hash_h */


@@ -15,86 +15,16 @@
#if defined(__VAES__)
static const __m128i round_const_p[] __attribute__ ((aligned (64))) =
{
{ 0x7060504030201000, 0xf0e0d0c0b0a09080 },
{ 0x7161514131211101, 0xf1e1d1c1b1a19181 },
{ 0x7262524232221202, 0xf2e2d2c2b2a29282 },
{ 0x7363534333231303, 0xf3e3d3c3b3a39383 },
{ 0x7464544434241404, 0xf4e4d4c4b4a49484 },
{ 0x7565554535251505, 0xf5e5d5c5b5a59585 },
{ 0x7666564636261606, 0xf6e6d6c6b6a69686 },
{ 0x7767574737271707, 0xf7e7d7c7b7a79787 },
{ 0x7868584838281808, 0xf8e8d8c8b8a89888 },
{ 0x7969594939291909, 0xf9e9d9c9b9a99989 },
{ 0x7a6a5a4a3a2a1a0a, 0xfaeadacabaaa9a8a },
{ 0x7b6b5b4b3b2b1b0b, 0xfbebdbcbbbab9b8b },
{ 0x7c6c5c4c3c2c1c0c, 0xfcecdcccbcac9c8c },
{ 0x7d6d5d4d3d2d1d0d, 0xfdedddcdbdad9d8d }
};
static const __m128i round_const_q[] __attribute__ ((aligned (64))) =
{
{ 0x8f9fafbfcfdfefff, 0x0f1f2f3f4f5f6f7f },
{ 0x8e9eaebecedeeefe, 0x0e1e2e3e4e5e6e7e },
{ 0x8d9dadbdcdddedfd, 0x0d1d2d3d4d5d6d7d },
{ 0x8c9cacbcccdcecfc, 0x0c1c2c3c4c5c6c7c },
{ 0x8b9babbbcbdbebfb, 0x0b1b2b3b4b5b6b7b },
{ 0x8a9aaabacadaeafa, 0x0a1a2a3a4a5a6a7a },
{ 0x8999a9b9c9d9e9f9, 0x0919293949596979 },
{ 0x8898a8b8c8d8e8f8, 0x0818283848586878 },
{ 0x8797a7b7c7d7e7f7, 0x0717273747576777 },
{ 0x8696a6b6c6d6e6f6, 0x0616263646566676 },
{ 0x8595a5b5c5d5e5f5, 0x0515253545556575 },
{ 0x8494a4b4c4d4e4f4, 0x0414243444546474 },
{ 0x8393a3b3c3d3e3f3, 0x0313233343536373 },
{ 0x8292a2b2c2d2e2f2, 0x0212223242526272 }
};
static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02,
0x1d1519111c141810, 0x1f171b131e161a12,
0x2d2529212c242820, 0x2f272b232e262a22,
0x3d3539313c343830, 0x3f373b333e363a32 };
static const __m512i SUBSH_MASK0 = { 0x0b0e0104070a0d00, 0x0306090c0f020508,
0x1b1e1114171a1d10, 0x1316191c1f121518,
0x2b2e2124272a2d20, 0x2326292c2f222528,
0x3b3e3134373a3d30, 0x3336393c3f323538 };
static const __m512i SUBSH_MASK1 = { 0x0c0f0205080b0e01, 0x04070a0d00030609,
0x1c1f1215181b1e11, 0x14171a1d10131619,
0x2c2f2225282b2e21, 0x24272a2d20232629,
0x3c3f3235383b3e31, 0x34373a3d30333639 };
static const __m512i SUBSH_MASK2 = { 0x0d000306090c0f02, 0x05080b0e0104070a,
0x1d101316191c1f12, 0x15181b1e1114171a,
0x2d202326292c2f22, 0x25282b2e2124272a,
0x3d303336393c3f32, 0x35383b3e3134373a };
static const __m512i SUBSH_MASK3 = { 0x0e0104070a0d0003, 0x06090c0f0205080b,
0x1e1114171a1d1013, 0x16191c1f1215181b,
0x2e2124272a2d2023, 0x26292c2f2225282b,
0x3e3134373a3d3033, 0x36393c3f3235383b };
static const __m512i SUBSH_MASK4 = { 0x0f0205080b0e0104, 0x070a0d000306090c,
0x1f1215181b1e1114, 0x171a1d101316191c,
0x2f2225282b2e2124, 0x272a2d202326292c,
0x3f3235383b3e3134, 0x373a3d303336393c };
static const __m512i SUBSH_MASK5 = { 0x000306090c0f0205, 0x080b0e0104070a0d,
0x101316191c1f1215, 0x181b1e1114171a1d,
0x202326292c2f2225, 0x282b2e2124272a2d,
0x303336393c3f3235, 0x383b3e3134373a3d };
static const __m512i SUBSH_MASK6 = { 0x0104070a0d000306, 0x090c0f0205080b0e,
0x1114171a1d101316, 0x191c1f1215181b1e,
0x2124272a2d202326, 0x292c2f2225282b2e,
0x3134373a3d303336, 0x393c3f3235383b3e };
static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
0x16191c1f1215181b, 0x1e1114171a1d1013,
0x26292c2f2225282b, 0x2e2124272a2d2023,
0x36393c3f3235383b, 0x3e3134373a3d3033 };
/* global constants */
__m512i ROUND_CONST_Lx;
//__m128i ROUND_CONST_L0[ROUNDS512];
//__m128i ROUND_CONST_L7[ROUNDS512];
__m512i ROUND_CONST_P[ROUNDS1024];
__m512i ROUND_CONST_Q[ROUNDS1024];
__m512i TRANSP_MASK;
__m512i SUBSH_MASK[8];
__m512i ALL_1B;
__m512i ALL_FF;
#define tos(a) #a
#define tostr(a) tos(a)
@@ -225,6 +155,69 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
b1 = _mm512_xor_si512(b1, a4);\
}/*MixBytes*/
// calculate the round constants separately and load at startup
#define SET_CONSTANTS(){\
ALL_FF = _mm512_set1_epi32( 0xffffffff );\
ALL_1B = _mm512_set1_epi32( 0x1b1b1b1b );\
TRANSP_MASK = _mm512_set_epi32( \
0x3f373b33, 0x3e363a32, 0x3d353931, 0x3c343830, \
0x2f272b23, 0x2e262a22, 0x2d252921, 0x2c242820, \
0x1f171b13, 0x1e161a12, 0x1d151911, 0x1c141810, \
0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800 ); \
SUBSH_MASK[0] = _mm512_set_epi32( \
0x3336393c, 0x3f323538, 0x3b3e3134, 0x373a3d30, \
0x2326292c, 0x2f222528, 0x2b2e2124, 0x272a2d20, \
0x1316191c, 0x1f121518, 0x1b1e1114, 0x171a1d10, \
0x0306090c, 0x0f020508, 0x0b0e0104, 0x070a0d00 ); \
SUBSH_MASK[1] = _mm512_set_epi32( \
0x34373a3d, 0x30333639, 0x3c3f3235, 0x383b3e31, \
0x24272a2d, 0x20232629, 0x2c2f2225, 0x282b2e21, \
0x14171a1d, 0x10131619, 0x1c1f1215, 0x181b1e11, \
0x04070a0d, 0x00030609, 0x0c0f0205, 0x080b0e01 ); \
SUBSH_MASK[2] = _mm512_set_epi32( \
0x35383b3e, 0x3134373a, 0x3d303336, 0x393c3f32, \
0x25282b2e, 0x2124272a, 0x2d202326, 0x292c2f22, \
0x15181b1e, 0x1114171a, 0x1d101316, 0x191c1f12, \
0x05080b0e, 0x0104070a, 0x0d000306, 0x090c0f02 ); \
SUBSH_MASK[3] = _mm512_set_epi32( \
0x36393c3f, 0x3235383b, 0x3e313437, 0x3a3d3033, \
0x26292c2f, 0x2225282b, 0x2e212427, 0x2a2d2023, \
0x16191c1f, 0x1215181b, 0x1e111417, 0x1a1d1013, \
0x06090c0f, 0x0205080b, 0x0e010407, 0x0a0d0003 ); \
SUBSH_MASK[4] = _mm512_set_epi32( \
0x373a3d30, 0x3336393c, 0x3f323538, 0x3b3e3134, \
0x272a2d20, 0x2326292c, 0x2f222528, 0x2b2e2124, \
0x171a1d10, 0x1316191c, 0x1f121518, 0x1b1e1114, \
0x070a0d00, 0x0306090c, 0x0f020508, 0x0b0e0104 ); \
SUBSH_MASK[5] = _mm512_set_epi32( \
0x383b3e31, 0x34373a3d, 0x30333639, 0x3c3f3235, \
0x282b2e21, 0x24272a2d, 0x20232629, 0x2c2f2225, \
0x181b1e11, 0x14171a1d, 0x10131619, 0x1c1f1215, \
0x080b0e01, 0x04070a0d, 0x00030609, 0x0c0f0205 ); \
SUBSH_MASK[6] = _mm512_set_epi32( \
0x393c3f32, 0x35383b3e, 0x3134373a, 0x3d303336, \
0x292c2f22, 0x25282b2e, 0x2124272a, 0x2d202326, \
0x191c1f12, 0x15181b1e, 0x1114171a, 0x1d101316, \
0x090c0f02, 0x05080b0e, 0x0104070a, 0x0d000306 ); \
SUBSH_MASK[7] = _mm512_set_epi32( \
0x3e313437, 0x3a3d3033, 0x36393c3f, 0x3235383b, \
0x2e212427, 0x2a2d2023, 0x26292c2f, 0x2225282b, \
0x1e111417, 0x1a1d1013, 0x16191c1f, 0x1215181b, \
0x0e010407, 0x0a0d0003, 0x06090c0f, 0x0205080b ); \
for( i = 0; i < ROUNDS1024; i++ ) \
{ \
ROUND_CONST_P[i] = _mm512_set4_epi32( 0xf0e0d0c0 ^ (i * 0x01010101), \
0xb0a09080 ^ (i * 0x01010101), \
0x70605040 ^ (i * 0x01010101), \
0x30201000 ^ (i * 0x01010101) ); \
ROUND_CONST_Q[i] = _mm512_set4_epi32( 0x0f1f2f3f ^ (i * 0x01010101), \
0x4f5f6f7f ^ (i * 0x01010101), \
0x8f9fafbf ^ (i * 0x01010101), \
0xcfdfefff ^ (i * 0x01010101));\
} \
}while(0);\
/* one round
* a0-a7 = input rows
* b0-b7 = output rows
@@ -249,32 +242,30 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
for ( round_counter = 0; round_counter < 14; round_counter += 2 ) \
{ \
/* AddRoundConstant P1024 */\
xmm8 = _mm512_xor_si512( xmm8, m512_const1_128( \
casti_m128i( round_const_p, round_counter ) ) ); \
xmm8 = _mm512_xor_si512( xmm8, ( ROUND_CONST_P[ round_counter ] ) );\
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm8 = _mm512_shuffle_epi8( xmm8, SUBSH_MASK0 ); \
xmm9 = _mm512_shuffle_epi8( xmm9, SUBSH_MASK1 );\
xmm10 = _mm512_shuffle_epi8( xmm10, SUBSH_MASK2 );\
xmm11 = _mm512_shuffle_epi8( xmm11, SUBSH_MASK3 );\
xmm12 = _mm512_shuffle_epi8( xmm12, SUBSH_MASK4 );\
xmm13 = _mm512_shuffle_epi8( xmm13, SUBSH_MASK5 );\
xmm14 = _mm512_shuffle_epi8( xmm14, SUBSH_MASK6 );\
xmm15 = _mm512_shuffle_epi8( xmm15, SUBSH_MASK7 );\
xmm8 = _mm512_shuffle_epi8( xmm8, ( SUBSH_MASK[0] ) );\
xmm9 = _mm512_shuffle_epi8( xmm9, ( SUBSH_MASK[1] ) );\
xmm10 = _mm512_shuffle_epi8( xmm10, ( SUBSH_MASK[2] ) );\
xmm11 = _mm512_shuffle_epi8( xmm11, ( SUBSH_MASK[3] ) );\
xmm12 = _mm512_shuffle_epi8( xmm12, ( SUBSH_MASK[4] ) );\
xmm13 = _mm512_shuffle_epi8( xmm13, ( SUBSH_MASK[5] ) );\
xmm14 = _mm512_shuffle_epi8( xmm14, ( SUBSH_MASK[6] ) );\
xmm15 = _mm512_shuffle_epi8( xmm15, ( SUBSH_MASK[7] ) );\
/* SubBytes + MixBytes */\
SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
\
/* AddRoundConstant P1024 */\
xmm0 = _mm512_xor_si512( xmm0, m512_const1_128( \
casti_m128i( round_const_p, round_counter+1 ) ) ); \
xmm0 = _mm512_xor_si512( xmm0, ( ROUND_CONST_P[ round_counter+1 ] ) );\
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK0 );\
xmm1 = _mm512_shuffle_epi8( xmm1, SUBSH_MASK1 );\
xmm2 = _mm512_shuffle_epi8( xmm2, SUBSH_MASK2 );\
xmm3 = _mm512_shuffle_epi8( xmm3, SUBSH_MASK3 );\
xmm4 = _mm512_shuffle_epi8( xmm4, SUBSH_MASK4 );\
xmm5 = _mm512_shuffle_epi8( xmm5, SUBSH_MASK5 );\
xmm6 = _mm512_shuffle_epi8( xmm6, SUBSH_MASK6 );\
xmm7 = _mm512_shuffle_epi8( xmm7, SUBSH_MASK7 );\
xmm0 = _mm512_shuffle_epi8( xmm0, ( SUBSH_MASK[0] ) );\
xmm1 = _mm512_shuffle_epi8( xmm1, ( SUBSH_MASK[1] ) );\
xmm2 = _mm512_shuffle_epi8( xmm2, ( SUBSH_MASK[2] ) );\
xmm3 = _mm512_shuffle_epi8( xmm3, ( SUBSH_MASK[3] ) );\
xmm4 = _mm512_shuffle_epi8( xmm4, ( SUBSH_MASK[4] ) );\
xmm5 = _mm512_shuffle_epi8( xmm5, ( SUBSH_MASK[5] ) );\
xmm6 = _mm512_shuffle_epi8( xmm6, ( SUBSH_MASK[6] ) );\
xmm7 = _mm512_shuffle_epi8( xmm7, ( SUBSH_MASK[7] ) );\
/* SubBytes + MixBytes */\
SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
}\
@@ -293,17 +284,16 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
xmm12 = _mm512_xor_si512( xmm12, xmm1 );\
xmm13 = _mm512_xor_si512( xmm13, xmm1 );\
xmm14 = _mm512_xor_si512( xmm14, xmm1 );\
xmm15 = _mm512_xor_si512( xmm15, m512_const1_128( \
casti_m128i( round_const_q, round_counter ) ) ); \
xmm15 = _mm512_xor_si512( xmm15, ( ROUND_CONST_Q[ round_counter ] ) );\
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm8 = _mm512_shuffle_epi8( xmm8, SUBSH_MASK1 );\
xmm9 = _mm512_shuffle_epi8( xmm9, SUBSH_MASK3 );\
xmm10 = _mm512_shuffle_epi8( xmm10, SUBSH_MASK5 );\
xmm11 = _mm512_shuffle_epi8( xmm11, SUBSH_MASK7 );\
xmm12 = _mm512_shuffle_epi8( xmm12, SUBSH_MASK0 );\
xmm13 = _mm512_shuffle_epi8( xmm13, SUBSH_MASK2 );\
xmm14 = _mm512_shuffle_epi8( xmm14, SUBSH_MASK4 );\
xmm15 = _mm512_shuffle_epi8( xmm15, SUBSH_MASK6 );\
xmm8 = _mm512_shuffle_epi8( xmm8, ( SUBSH_MASK[1] ) );\
xmm9 = _mm512_shuffle_epi8( xmm9, ( SUBSH_MASK[3] ) );\
xmm10 = _mm512_shuffle_epi8( xmm10, ( SUBSH_MASK[5] ) );\
xmm11 = _mm512_shuffle_epi8( xmm11, ( SUBSH_MASK[7] ) );\
xmm12 = _mm512_shuffle_epi8( xmm12, ( SUBSH_MASK[0] ) );\
xmm13 = _mm512_shuffle_epi8( xmm13, ( SUBSH_MASK[2] ) );\
xmm14 = _mm512_shuffle_epi8( xmm14, ( SUBSH_MASK[4] ) );\
xmm15 = _mm512_shuffle_epi8( xmm15, ( SUBSH_MASK[6] ) );\
/* SubBytes + MixBytes */\
SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
\
@@ -316,17 +306,16 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
xmm4 = _mm512_xor_si512( xmm4, xmm9 );\
xmm5 = _mm512_xor_si512( xmm5, xmm9 );\
xmm6 = _mm512_xor_si512( xmm6, xmm9 );\
xmm7 = _mm512_xor_si512( xmm7, m512_const1_128( \
casti_m128i( round_const_q, round_counter+1 ) ) ); \
xmm7 = _mm512_xor_si512( xmm7, ( ROUND_CONST_Q[ round_counter+1 ] ) );\
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK1 );\
xmm1 = _mm512_shuffle_epi8( xmm1, SUBSH_MASK3 );\
xmm2 = _mm512_shuffle_epi8( xmm2, SUBSH_MASK5 );\
xmm3 = _mm512_shuffle_epi8( xmm3, SUBSH_MASK7 );\
xmm4 = _mm512_shuffle_epi8( xmm4, SUBSH_MASK0 );\
xmm5 = _mm512_shuffle_epi8( xmm5, SUBSH_MASK2 );\
xmm6 = _mm512_shuffle_epi8( xmm6, SUBSH_MASK4 );\
xmm7 = _mm512_shuffle_epi8( xmm7, SUBSH_MASK6 );\
xmm0 = _mm512_shuffle_epi8( xmm0, ( SUBSH_MASK[1] ) );\
xmm1 = _mm512_shuffle_epi8( xmm1, ( SUBSH_MASK[3] ) );\
xmm2 = _mm512_shuffle_epi8( xmm2, ( SUBSH_MASK[5] ) );\
xmm3 = _mm512_shuffle_epi8( xmm3, ( SUBSH_MASK[7] ) );\
xmm4 = _mm512_shuffle_epi8( xmm4, ( SUBSH_MASK[0] ) );\
xmm5 = _mm512_shuffle_epi8( xmm5, ( SUBSH_MASK[2] ) );\
xmm6 = _mm512_shuffle_epi8( xmm6, ( SUBSH_MASK[4] ) );\
xmm7 = _mm512_shuffle_epi8( xmm7, ( SUBSH_MASK[6] ) );\
/* SubBytes + MixBytes */\
SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
}\


@@ -1,7 +1,4 @@
#include "myrgr-gate.h"
#if !defined(MYRGR_8WAY) && !defined(MYRGR_4WAY)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
@@ -89,4 +86,3 @@ int scanhash_myriad( struct work *work, uint32_t max_nonce,
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
#endif


@@ -35,8 +35,6 @@
#include "sph_groestl.h"
#if !defined(__AES__)
#ifdef __cplusplus
extern "C"{
#endif
@@ -3118,6 +3116,4 @@ sph_groestl512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
#ifdef __cplusplus
}
#endif // !AES
#endif


@@ -42,7 +42,6 @@ extern "C"{
#include <stddef.h>
#include "algo/sha/sph_types.h"
#if !defined(__AES__)
/**
* Output size (in bits) for Groestl-224.
*/
@@ -327,5 +326,4 @@ void sph_groestl512_addbits_and_close(
}
#endif
#endif // !AES
#endif

algo/heavy/bastion.c (new file, 156 lines)

@@ -0,0 +1,156 @@
#include "algo-gate-api.h"
#include <stdio.h>
#include <string.h>
#include <openssl/sha.h>
#include <stdint.h>
#include <stdlib.h>
#include "sph_hefty1.h"
#include "algo/luffa/sph_luffa.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/skein/sph_skein.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/luffa/luffa_for_sse2.h"
#ifdef __AES__
#include "algo/echo/aes_ni/hash_api.h"
#endif
void bastionhash(void *output, const void *input)
{
unsigned char hash[64] __attribute__ ((aligned (64)));
#ifdef __AES__
hashState_echo ctx_echo;
#else
sph_echo512_context ctx_echo;
#endif
hashState_luffa ctx_luffa;
sph_fugue512_context ctx_fugue;
sph_whirlpool_context ctx_whirlpool;
sph_shabal512_context ctx_shabal;
sph_hamsi512_context ctx_hamsi;
sph_skein512_context ctx_skein;
HEFTY1(input, 80, hash);
init_luffa( &ctx_luffa, 512 );
update_and_final_luffa( &ctx_luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
if (hash[0] & 0x8)
{
sph_fugue512_init(&ctx_fugue);
sph_fugue512(&ctx_fugue, hash, 64);
sph_fugue512_close(&ctx_fugue, hash);
} else {
sph_skein512_init( &ctx_skein );
sph_skein512( &ctx_skein, hash, 64 );
sph_skein512_close( &ctx_skein, hash );
}
sph_whirlpool_init(&ctx_whirlpool);
sph_whirlpool(&ctx_whirlpool, hash, 64);
sph_whirlpool_close(&ctx_whirlpool, hash);
sph_fugue512_init(&ctx_fugue);
sph_fugue512(&ctx_fugue, hash, 64);
sph_fugue512_close(&ctx_fugue, hash);
if (hash[0] & 0x8)
{
#ifdef __AES__
init_echo( &ctx_echo, 512 );
update_final_echo ( &ctx_echo,(BitSequence*)hash,
(const BitSequence*)hash, 512 );
#else
sph_echo512_init(&ctx_echo);
sph_echo512(&ctx_echo, hash, 64);
sph_echo512_close(&ctx_echo, hash);
#endif
} else {
init_luffa( &ctx_luffa, 512 );
update_and_final_luffa( &ctx_luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
}
sph_shabal512_init(&ctx_shabal);
sph_shabal512(&ctx_shabal, hash, 64);
sph_shabal512_close(&ctx_shabal, hash);
sph_skein512_init( &ctx_skein );
sph_skein512( &ctx_skein, hash, 64 );
sph_skein512_close( &ctx_skein, hash );
if (hash[0] & 0x8)
{
sph_shabal512_init(&ctx_shabal);
sph_shabal512(&ctx_shabal, hash, 64);
sph_shabal512_close(&ctx_shabal, hash);
} else {
sph_whirlpool_init(&ctx_whirlpool);
sph_whirlpool(&ctx_whirlpool, hash, 64);
sph_whirlpool_close(&ctx_whirlpool, hash);
}
sph_shabal512_init(&ctx_shabal);
sph_shabal512(&ctx_shabal, hash, 64);
sph_shabal512_close(&ctx_shabal, hash);
if (hash[0] & 0x8)
{
sph_hamsi512_init(&ctx_hamsi);
sph_hamsi512(&ctx_hamsi, hash, 64);
sph_hamsi512_close(&ctx_hamsi, hash);
} else {
init_luffa( &ctx_luffa, 512 );
update_and_final_luffa( &ctx_luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
}
memcpy(output, hash, 32);
}
int scanhash_bastion( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t _ALIGN(64) hash32[8];
uint32_t _ALIGN(64) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
int thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
for (int i=0; i < 19; i++)
be32enc(&endiandata[i], pdata[i]);
do {
be32enc(&endiandata[19], n);
bastionhash(hash32, endiandata);
if (hash32[7] < Htarg && fulltest(hash32, ptarget)) {
pdata[19] = n;
submit_solution( work, hash32, mythr );
}
n++;
} while (n < max_nonce && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}
bool register_bastion_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT;
gate->scanhash = (void*)&scanhash_bastion;
gate->hash = (void*)&bastionhash;
return true;
};
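
Note (editorial, not part of the diff): bastion plugs into the standard algo-gate. A hedged sketch of how a driver consumes the gate (hypothetical caller; the real dispatch, including the void-pointer casts, lives in algo-gate-api.c):

    algo_gate_t gate;
    register_bastion_algo( &gate );     // fills scanhash/hash pointers
    gate.scanhash( work, max_nonce, &hashes_done, mythr );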

algo/heavy/heavy.c (new file, 111 lines)

@@ -0,0 +1,111 @@
#include <string.h>
#include <openssl/sha.h>
#include <stdint.h>
#include "algo-gate-api.h"
#include "sph_hefty1.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/blake/sph_blake.h"
#include "algo/groestl/sph_groestl.h"
/* Combines top 64-bits from each hash into a single hash */
static void combine_hashes(uint32_t *out, uint32_t *hash1, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4)
{
uint32_t *hash[4] = { hash1, hash2, hash3, hash4 };
/* Transpose first 64 bits of each hash into out */
memset(out, 0, 32);
int bits = 0;
for (unsigned int i = 7; i >= 6; i--) {
for (uint32_t mask = 0x80000000; mask; mask >>= 1) {
for (unsigned int k = 0; k < 4; k++) {
out[(255 - bits)/32] <<= 1;
if ((hash[k][i] & mask) != 0)
out[(255 - bits)/32] |= 1;
bits++;
}
}
}
}
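
Note (editorial, not part of the diff): in bit terms, combine_hashes interleaves the top 64 bits of the four inputs MSB-first. Indexing bits of each 256-bit value from 255 down to 0:

    \mathrm{out}_{255-(4m+k)} = h^{(k+1)}_{255-m}, \qquad m = 0,\ldots,63,\; k = 0,\ldots,3

so bit 255 of out is bit 255 of hash1, bit 254 is bit 255 of hash2, and so on.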
extern void heavyhash(unsigned char* output, const unsigned char* input, int len)
{
unsigned char hash1[32];
HEFTY1(input, len, hash1);
// HEFTY1 is new, so take an extra security measure to eliminate
// the possibility of collisions:
//
//    Hash(x) = SHA256(x + HEFTY1(x))
//
// N.B. '+' is concatenation.
unsigned char hash2[32];
SHA256_CTX ctx;
SHA256_Init(&ctx);
SHA256_Update(&ctx, input, len);
SHA256_Update(&ctx, hash1, sizeof(hash1));
SHA256_Final(hash2, &ctx);
// Additional security: do not rely on a single cryptographic hash
// function. Instead, combine the outputs of 4 of the most secure
// cryptographic hash functions: SHA256, KECCAK512, GROESTL512
// and BLAKE512.
uint32_t hash3[16];
sph_keccak512_context keccakCtx;
sph_keccak512_init(&keccakCtx);
sph_keccak512(&keccakCtx, input, len);
sph_keccak512(&keccakCtx, hash1, sizeof(hash1));
sph_keccak512_close(&keccakCtx, (void *)&hash3);
uint32_t hash4[16];
sph_groestl512_context groestlCtx;
sph_groestl512_init(&groestlCtx);
sph_groestl512(&groestlCtx, input, len);
sph_groestl512(&groestlCtx, hash1, sizeof(hash1));
sph_groestl512_close(&groestlCtx, (void *)&hash4);
uint32_t hash5[16];
sph_blake512_context blakeCtx;
sph_blake512_init(&blakeCtx);
sph_blake512(&blakeCtx, input, len);
sph_blake512(&blakeCtx, (unsigned char *)&hash1, sizeof(hash1));
sph_blake512_close(&blakeCtx, (void *)&hash5);
uint32_t *final = (uint32_t *)output;
combine_hashes(final, (uint32_t *)hash2, hash3, hash4, hash5);
}
int scanhash_heavy( uint32_t *pdata, const uint32_t *ptarget,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[8];
uint32_t start_nonce = pdata[19];
int thr_id = mythr->id; // thr_id arg is deprecated
do {
heavyhash((unsigned char *)hash, (unsigned char *)pdata, 80);
if (hash[7] <= ptarget[7]) {
if (fulltest(hash, ptarget)) {
*hashes_done = pdata[19] - start_nonce;
return 1;
}
}
pdata[19]++;
} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
*hashes_done = pdata[19] - start_nonce;
return 0;
}
bool register_heavy_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_heavy;
gate->hash = (void*)&heavyhash;
return true;
};


@@ -144,7 +144,7 @@ int hodl_scanhash( struct work* work, uint32_t max_nonce,
#if defined(__AES__)
GenRandomGarbage( (CacheEntry*)hodl_scratchbuf, work->data, mythr->id );
pthread_barrier_wait( &hodl_barrier );
return scanhash_hodl_wolf( work, max_nonce, hashes_done, mythr );
return scanhash_hodl_wolf( work, max_nonce, hashes_done, thr_info );
#endif
return false;
}


@@ -129,10 +129,9 @@ int scanhash_hodl_wolf( struct work* work, uint32_t max_nonce,
if( FinalPoW[7] <= ptarget[7] )
{
pdata[20] = swab32( BlockHdr[20] );
pdata[21] = swab32( BlockHdr[21] );
*hashes_done = CollisionCount;
submit_solution( work, FinalPoW, mythr );
return(0);
pdata[21] = swab32( BlockHdr[21] );
*hashes_done = CollisionCount;
return(1);
}
}
}
@@ -199,8 +198,7 @@ int scanhash_hodl_wolf( struct work* work, uint32_t max_nonce,
pdata[20] = swab32( BlockHdr[20] );
pdata[21] = swab32( BlockHdr[21] );
*hashes_done = CollisionCount;
submit_solution( work, FinalPoW, mythr );
return(0);
return(1);
}
}
}


@@ -1,7 +1,4 @@
#include "jha-gate.h"
#if !defined(JHA_8WAY) && !defined(JHA_4WAY)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -136,4 +133,3 @@ int scanhash_jha( struct work *work, uint32_t max_nonce,
return 0;
}
#endif


@@ -28,32 +28,30 @@ int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
__m512i *noncev = (__m512i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
int thr_id = mythr->id;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
*noncev = mm512_intrlv_blend_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev );
do {
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
keccakhash_8way( hash, vdata );
for ( int lane = 0; lane < 8; lane++ )
if unlikely( hash7[ lane<<1 ] <= Htarg && !bench )
if ( hash7[ lane<<1 ] <= Htarg )
{
extr_lane_8x64( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) )
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = bswap_32( n + lane );
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
n += 8;
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
pdata[19] = n;
*hashes_done = n - first_nonce + 1;
return 0;
}
@@ -81,30 +79,29 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
int thr_id = mythr->id;
mm256_bswap32_intrlv80_4x64( vdata, pdata );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do {
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
keccakhash_4way( hash, vdata );
for ( int lane = 0; lane < 4; lane++ )
if unlikely( hash7[ lane<<1 ] <= Htarg && !bench )
if ( hash7[ lane<<1 ] <= Htarg )
{
extr_lane_4x64( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ))
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = bswap_32( n + lane );
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
n += 4;
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
pdata[19] = n;
*hashes_done = n - first_nonce + 1;
return 0;
}
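
Note (editorial, not part of the diff): with 4x64-bit interleaving the nonce (header word 19) rides in vector word 9, so the blend above seeds the lanes with consecutive nonces and the loop strides by the lane count. Equivalent scalar coverage, with try_nonce a hypothetical helper:

    for ( uint32_t lane = 0; lane < 4; lane++ )
        try_nonce( n + lane );          // one nonce per SIMD lane
    n += 4;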


@@ -1,9 +1,5 @@
#include "keccak-gate.h"
#include "sph_keccak.h"
int hard_coded_eb = 1;
// KECCAK
bool register_keccak_algo( algo_gate_t* gate )
{
@@ -23,8 +19,6 @@ bool register_keccak_algo( algo_gate_t* gate )
return true;
};
// KECCAKC
bool register_keccakc_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT | AVX512_OPT;
@@ -43,50 +37,3 @@ bool register_keccakc_algo( algo_gate_t* gate )
return true;
};
// SHA3D
void sha3d( void *state, const void *input, int len )
{
uint32_t _ALIGN(64) buffer[16], hash[16];
sph_keccak_context ctx_keccak;
sph_keccak256_init( &ctx_keccak );
sph_keccak256 ( &ctx_keccak, input, len );
sph_keccak256_close( &ctx_keccak, (void*) buffer );
sph_keccak256_init( &ctx_keccak );
sph_keccak256 ( &ctx_keccak, buffer, 32 );
sph_keccak256_close( &ctx_keccak, (void*) hash );
memcpy(state, hash, 32);
}
void sha3d_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
{
sha3d( merkle_root, sctx->job.coinbase, (int) sctx->job.coinbase_size );
for ( int i = 0; i < sctx->job.merkle_count; i++ )
{
memcpy( merkle_root + 32, sctx->job.merkle[i], 32 );
sha256d( merkle_root, merkle_root, 64 );
}
}
bool register_sha3d_algo( algo_gate_t* gate )
{
hard_coded_eb = 6;
// opt_extranonce = false;
gate->optimizations = AVX2_OPT | AVX512_OPT;
gate->gen_merkle_root = (void*)&sha3d_gen_merkle_root;
#if defined (KECCAK_8WAY)
gate->scanhash = (void*)&scanhash_sha3d_8way;
gate->hash = (void*)&sha3d_hash_8way;
#elif defined (KECCAK_4WAY)
gate->scanhash = (void*)&scanhash_sha3d_4way;
gate->hash = (void*)&sha3d_hash_4way;
#else
gate->scanhash = (void*)&scanhash_sha3d;
gate->hash = (void*)&sha3d_hash;
#endif
return true;
};
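
Note (editorial, not part of the diff): hard_coded_eb is the first Keccak padding byte threaded into the sponge close (see the `eb = hard_coded_eb` hunks below). Assuming standard Keccak padding, 1 gives the original pad10*1 byte 0x01 and 6 gives the SHA-3 domain-separated byte 0x06, so sha3d selects SHA-3-style padding while keccak and keccakc keep the legacy byte. In the close step it lands as:

    // when exactly 8 bytes of block space remain (from the hunk below)
    const uint64_t t = hard_coded_eb | 0x8000000000000000;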


@@ -10,37 +10,24 @@
#define KECCAK_4WAY 1
#endif
extern int hard_coded_eb;
#if defined(KECCAK_8WAY)
void keccakhash_8way( void *state, const void *input );
int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void sha3d_hash_8way( void *state, const void *input );
int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(KECCAK_4WAY)
void keccakhash_4way( void *state, const void *input );
int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void sha3d_hash_4way( void *state, const void *input );
int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void keccakhash( void *state, const void *input );
int scanhash_keccak( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void sha3d_hash( void *state, const void *input );
int scanhash_sha3d( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#endif
#endif


@@ -1,9 +1,6 @@
#include <stddef.h>
#include <stdint.h>
#include "keccak-hash-4way.h"
#include "keccak-gate.h"
#if defined(__AVX2__)
static const uint64_t RC[] = {
0x0000000000000001, 0x0000000000008082,
@@ -171,7 +168,7 @@ static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst,
size_t j;
size_t m512_len = byte_len >> 3;
eb = hard_coded_eb;
eb = 0x100 >> 8;
if ( kc->ptr == (lim - 8) )
{
const uint64_t t = eb | 0x8000000000000000;
@@ -241,7 +238,7 @@ keccak512_8way_close(void *cc, void *dst)
#endif // AVX512
// AVX2
#if defined(__AVX2__)
#define INPUT_BUF(size) do { \
size_t j; \
@@ -352,7 +349,7 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
size_t j;
size_t m256_len = byte_len >> 3;
eb = hard_coded_eb;
eb = 0x100 >> 8;
if ( kc->ptr == (lim - 8) )
{
const uint64_t t = eb | 0x8000000000000000;


@@ -1,6 +1,4 @@
#include "keccak-gate.h"
#if !defined(KECCAK_8WAY) && !defined(KECCAK_4WAY)
#include "algo-gate-api.h"
#include <stdlib.h>
#include <string.h>
@@ -20,35 +18,36 @@ void keccakhash(void *state, const void *input)
memcpy(state, hash, 32);
}
int scanhash_keccak( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
int scanhash_keccak( struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) hash64[8];
uint32_t _ALIGN(64) endiandata[32];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce;
const int thr_id = mythr->id;
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
//const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id; // thr_id arg is deprecated
for ( int i=0; i < 19; i++ )
be32enc( &endiandata[i], pdata[i] );
uint32_t _ALIGN(32) hash64[8];
uint32_t endiandata[32];
do {
be32enc( &endiandata[19], n );
keccakhash( hash64, endiandata );
if ( valid_hash( hash64, ptarget ) && !opt_benchmark )
{
pdata[19] = n;
submit_solution( work, hash64, mythr );
}
n++;
} while ( n < last_nonce && !work_restart[thr_id].restart );
for (int i=0; i < 19; i++)
be32enc(&endiandata[i], pdata[i]);
*hashes_done = n - first_nonce;
pdata[19] = n;
return 0;
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
keccakhash(hash64, endiandata);
if (((hash64[7]&0xFFFFFF00)==0) &&
fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}
#endif


@@ -1,126 +0,0 @@
#include "keccak-gate.h"
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "sph_keccak.h"
#include "keccak-hash-4way.h"
#if defined(KECCAK_8WAY)
void sha3d_hash_8way(void *state, const void *input)
{
uint32_t buffer[16*8] __attribute__ ((aligned (128)));
keccak256_8way_context ctx;
keccak256_8way_init( &ctx );
keccak256_8way_update( &ctx, input, 80 );
keccak256_8way_close( &ctx, buffer );
keccak256_8way_init( &ctx );
keccak256_8way_update( &ctx, buffer, 32 );
keccak256_8way_close( &ctx, state );
}
int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[24*8] __attribute__ ((aligned (128)));
uint32_t hash[16*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[49]); // 3*16+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
*noncev = mm512_intrlv_blend_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev );
do {
sha3d_hash_8way( hash, vdata );
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) )
{
extr_lane_8x64( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) )
{
pdata[19] = bswap_32( n + lane );
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
n += 8;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(KECCAK_4WAY)
void sha3d_hash_4way(void *state, const void *input)
{
uint32_t buffer[16*4] __attribute__ ((aligned (64)));
keccak256_4way_context ctx;
keccak256_4way_init( &ctx );
keccak256_4way_update( &ctx, input, 80 );
keccak256_4way_close( &ctx, buffer );
keccak256_4way_init( &ctx );
keccak256_4way_update( &ctx, buffer, 32 );
keccak256_4way_close( &ctx, state );
}
int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t hash[16*4] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[25]); // 3*8+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
mm256_bswap32_intrlv80_4x64( vdata, pdata );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do {
sha3d_hash_4way( hash, vdata );
for ( int lane = 0; lane < 4; lane++ )
if ( unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) )
{
extr_lane_4x64( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) )
{
pdata[19] = bswap_32( n + lane );
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
n += 4;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif


@@ -1,54 +0,0 @@
#include "keccak-gate.h"
#if !defined(KECCAK_8WAY) && !defined(KECCAK_4WAY)
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "sph_keccak.h"
void sha3d_hash(void *state, const void *input)
{
uint32_t buffer[16];
sph_keccak256_context ctx_keccak;
sph_keccak256_init( &ctx_keccak );
sph_keccak256 ( &ctx_keccak, input, 80 );
sph_keccak256_close( &ctx_keccak, buffer );
sph_keccak256_init( &ctx_keccak );
sph_keccak256 ( &ctx_keccak, buffer, 32 );
sph_keccak256_close( &ctx_keccak, state );
}
int scanhash_sha3d( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) hash64[8];
uint32_t _ALIGN(64) endiandata[32];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce;
const int thr_id = mythr->id;
for ( int i=0; i < 19; i++ )
be32enc( &endiandata[i], pdata[i] );
do {
be32enc( &endiandata[19], n );
sha3d_hash( hash64, endiandata );
if ( valid_hash( hash64, ptarget ) && !opt_benchmark )
{
pdata[19] = n;
submit_solution( work, hash64, mythr );
}
n++;
} while ( n < last_nonce && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
pdata[19] = n;
return 0;
}
#endif


@@ -32,8 +32,8 @@
#include <stddef.h>
#include <string.h>
#include "sph_keccak.h"
#include "keccak-gate.h"
#ifdef __cplusplus
extern "C"{
@@ -1616,7 +1616,7 @@ keccak_core(sph_keccak_context *kc, const void *data, size_t len, size_t lim)
} u; \
size_t j; \
\
eb = hard_coded_eb; \
eb = (0x100 | (ub & 0xFF)) >> (8 - n); \
if (kc->ptr == (lim - 1)) { \
if (n == 7) { \
u.tmp[0] = eb; \

algo/luffa/luffa.c (new file, 63 lines)

@@ -0,0 +1,63 @@
#include "algo-gate-api.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "sph_luffa.h"
void luffahash(void *output, const void *input)
{
unsigned char _ALIGN(128) hash[64];
sph_luffa512_context ctx_luffa;
sph_luffa512_init(&ctx_luffa);
sph_luffa512 (&ctx_luffa, input, 80);
sph_luffa512_close(&ctx_luffa, (void*) hash);
memcpy(output, hash, 32);
}
int scanhash_luffa(int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t _ALIGN(64) hash64[8];
uint32_t _ALIGN(64) endiandata[20];
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
for (int i=0; i < 19; i++)
be32enc(&endiandata[i], pdata[i]);
do {
be32enc(&endiandata[19], n);
luffahash(hash64, endiandata);
if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return true;
}
n++;
} while (n < max_nonce && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}
bool register_luffa_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_luffa;
gate->hash = (void*)&luffahash;
return true;
};


@@ -344,62 +344,18 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
// 16 byte partial block exists for 80 byte len
if ( state->rembytes )
// padding of partial block
rnd512( state, m128_const_64( 0, 0x80000000 ),
mm128_bswap_32( cast_m128i( data ) ) );
else
// empty pad block
rnd512( state, m128_zero, m128_const_64( 0, 0x80000000 ) );
finalization512( state, (uint32*) output );
if ( state->hashbitlen > 512 )
finalization512( state, (uint32*)( output+128 ) );
return SUCCESS;
}
int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen,
const BitSequence* data, size_t inlen )
{
// Optimized for input lengths that are multiples of 16 bytes, good for 64 and 80 byte len
int i;
state->hashbitlen = hashbitlen;
/* set the lower 32 bits to '1' */
MASK= _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xffffffff);
/* set all bits to '1' */
ALLONE = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
/* set the 32-bit round constant values to the 128-bit data field */
for ( i=0; i<32; i++ )
CNS128[i] = _mm_load_si128( (__m128i*)&CNS_INIT[i*4] );
for ( i=0; i<10; i++ )
state->chainv[i] = _mm_load_si128( (__m128i*)&IV[i*4] );
memset(state->buffer, 0, sizeof state->buffer );
// update
int blocks = (int)( inlen / 32 );
state->rembytes = inlen % 32;
// full blocks
for ( i = 0; i < blocks; i++ )
{
rnd512( state, mm128_bswap_32( casti_m128i( data, 1 ) ),
mm128_bswap_32( casti_m128i( data, 0 ) ) );
data += MSG_BLOCK_BYTE_LEN;
// padding of partial block
rnd512( state, _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ),
mm128_bswap_32( cast_m128i( data ) ) );
}
else
{
// empty pad block
rnd512( state, _mm_setzero_si128(),
_mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ) );
}
// final
// 16 byte partial block exists for 80 byte len
if ( state->rembytes )
// padding of partial block
rnd512( state, m128_const_64( 0, 0x80000000 ),
mm128_bswap_32( cast_m128i( data ) ) );
else
// empty pad block
rnd512( state, m128_zero, m128_const_64( 0, 0x80000000 ) );
finalization512( state, (uint32*) output );
if ( state->hashbitlen > 512 )
finalization512( state, (uint32*)( output+128 ) );
@@ -407,7 +363,6 @@ int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen,
return SUCCESS;
}
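
Note (editorial, not part of the diff): luffa_full mirrors update_and_final_luffa but also reloads the IV and round constants, making it a true one-shot. A usage sketch for an 80-byte block header, where `header` is a hypothetical input buffer:

    unsigned char hash[64] __attribute__ ((aligned (64)));
    hashState_luffa ctx;
    // 80 bytes = two full 32-byte blocks + a 16-byte tail (rembytes = 16),
    // padded by the partial-block branch above.
    luffa_full( &ctx, (BitSequence*)hash, 512,
                (const BitSequence*)header, 80 );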
/***************************************************/
/* Round function */
/* state: hash context */


@@ -1,6 +1,3 @@
#if !defined(LUFFA_FOR_SSE2_H__)
#define LUFFA_FOR_SSE2_H__ 1
/*
* luffa_for_sse2.h
* Version 2.0 (Sep 15th 2009)
@@ -51,6 +48,8 @@
typedef struct {
uint32 buffer[8] __attribute((aligned(32)));
__m128i chainv[10] __attribute((aligned(32))); /* Chaining values */
// uint64 bitlen[2]; /* Message length in bits */
// uint32 rembitlen; /* Length of buffer data to be hashed */
int hashbitlen;
int rembytes;
} hashState_luffa;
@@ -66,6 +65,5 @@ HashReturn final_luffa( hashState_luffa *state, BitSequence *hashval );
HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
const BitSequence* data, size_t inlen );
int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen,
const BitSequence* data, size_t inlen );
#endif // LUFFA_FOR_SSE2_H___


@@ -115,8 +115,9 @@ void allium_16way_hash( void *state, const void *input )
intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, 256 );
intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, 256 );
cube_4way_full( &ctx.cube, vhashA, 256, vhashA, 32 );
cube_4way_full( &ctx.cube, vhashB, 256, vhashB, 32 );
cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 );
cube_4way_init( &ctx.cube, 256, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 );
dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );
@@ -124,8 +125,10 @@ void allium_16way_hash( void *state, const void *input )
intrlv_4x128( vhashA, hash8, hash9, hash10, hash11, 256 );
intrlv_4x128( vhashB, hash12, hash13, hash14, hash15, 256 );
cube_4way_full( &ctx.cube, vhashA, 256, vhashA, 32 );
cube_4way_full( &ctx.cube, vhashB, 256, vhashB, 32 );
cube_4way_init( &ctx.cube, 256, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 );
cube_4way_init( &ctx.cube, 256, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 );
dintrlv_4x128( hash8, hash9, hash10, hash11, vhashA, 256 );
dintrlv_4x128( hash12, hash13, hash14, hash15, vhashB, 256 );
@@ -166,6 +169,7 @@ void allium_16way_hash( void *state, const void *input )
skein256_8way_update( &ctx.skein, vhashB, 32 );
skein256_8way_close( &ctx.skein, vhashB );
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhashA, 256 );
dintrlv_8x64( hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
@@ -175,43 +179,77 @@ void allium_16way_hash( void *state, const void *input )
intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 256 );
groestl256_4way_full( &ctx.groestl, vhash, vhash, 256 );
groestl256_4way_update_close( &ctx.groestl, vhash, vhash, 256 );
dintrlv_4x128( state, state+32, state+64, state+96, vhash, 256 );
intrlv_4x128( vhash, hash4, hash5, hash6, hash7, 256 );
groestl256_4way_full( &ctx.groestl, vhash, vhash, 256 );
groestl256_4way_init( &ctx.groestl, 32 );
groestl256_4way_update_close( &ctx.groestl, vhash, vhash, 256 );
dintrlv_4x128( state+128, state+160, state+192, state+224, vhash, 256 );
intrlv_4x128( vhash, hash8, hash9, hash10, hash11, 256 );
groestl256_4way_full( &ctx.groestl, vhash, vhash, 256 );
groestl256_4way_init( &ctx.groestl, 32 );
groestl256_4way_update_close( &ctx.groestl, vhash, vhash, 256 );
dintrlv_4x128( state+256, state+288, state+320, state+352, vhash, 256 );
intrlv_4x128( vhash, hash12, hash13, hash14, hash15, 256 );
groestl256_4way_full( &ctx.groestl, vhash, vhash, 256 );
groestl256_4way_init( &ctx.groestl, 32 );
groestl256_4way_update_close( &ctx.groestl, vhash, vhash, 256 );
dintrlv_4x128( state+384, state+416, state+448, state+480, vhash, 256 );
#else
groestl256_full( &ctx.groestl, state, hash0, 256 );
groestl256_full( &ctx.groestl, state+32, hash1, 256 );
groestl256_full( &ctx.groestl, state+64, hash2, 256 );
groestl256_full( &ctx.groestl, state+96, hash3, 256 );
groestl256_full( &ctx.groestl, state+128, hash4, 256 );
groestl256_full( &ctx.groestl, state+160, hash5, 256 );
groestl256_full( &ctx.groestl, state+192, hash6, 256 );
groestl256_full( &ctx.groestl, state+224, hash7, 256 );
groestl256_full( &ctx.groestl, state+256, hash8, 256 );
groestl256_full( &ctx.groestl, state+288, hash9, 256 );
groestl256_full( &ctx.groestl, state+320, hash10, 256 );
groestl256_full( &ctx.groestl, state+352, hash11, 256 );
groestl256_full( &ctx.groestl, state+384, hash12, 256 );
groestl256_full( &ctx.groestl, state+416, hash13, 256 );
groestl256_full( &ctx.groestl, state+448, hash14, 256 );
groestl256_full( &ctx.groestl, state+480, hash15, 256 );
update_and_final_groestl256( &ctx.groestl, state, hash0, 256 );
memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+32, hash1, 256 );
memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+64, hash2, 256 );
memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+96, hash3, 256 );
memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+128, hash4, 256 );
memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+160, hash5, 256 );
memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+192, hash6, 256 );
memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+224, hash7, 256 );
memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+256, hash8, 256 );
memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+288, hash9, 256 );
memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+320, hash10, 256 );
memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+352, hash11, 256 );
memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+384, hash12, 256 );
memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+416, hash13, 256 );
memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+448, hash14, 256 );
memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+480, hash15, 256 );
#endif
}
@@ -225,32 +263,37 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t last_nonce = max_nonce - 16;
const uint32_t Htarg = ptarget[7];
__m512i *noncev = (__m512i*)vdata + 19; // aligned
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
int thr_id = mythr->id; // thr_id arg is deprecated
if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
mm512_bswap32_intrlv80_16x32( vdata, pdata );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
blake256_16way_init( &allium_16way_ctx.blake );
blake256_16way_update( &allium_16way_ctx.blake, vdata, 64 );
do {
allium_16way_hash( hash, vdata );
*noncev = mm512_bswap_32( _mm512_set_epi32( n+15, n+14, n+13, n+12,
n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4,
n+ 3, n+ 2, n +1, n ) );
for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( valid_hash( hash+(lane<<3), ptarget ) && !bench ) )
allium_16way_hash( hash, vdata );
pdata[19] = n;
for ( int lane = 0; lane < 16; lane++ ) if ( (hash+(lane<<3))[7] <= Htarg )
{
pdata[19] = bswap_32( n + lane );
submit_lane_solution( work, hash+(lane<<3), mythr, lane );
if ( fulltest( hash+(lane<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_lane_solution( work, hash+(lane<<3), mythr, lane );
}
}
*noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) );
n += 16;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
pdata[19] = n;
} while ( (n < last_nonce) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce;
return 0;
}
@@ -277,18 +320,18 @@ bool init_allium_8way_ctx()
return true;
}
void allium_8way_hash( void *hash, const void *input )
void allium_8way_hash( void *state, const void *input )
{
uint64_t vhashA[4*8] __attribute__ ((aligned (64)));
uint64_t vhashB[4*8] __attribute__ ((aligned (64)));
uint64_t *hash0 = (uint64_t*)hash;
uint64_t *hash1 = (uint64_t*)hash+ 4;
uint64_t *hash2 = (uint64_t*)hash+ 8;
uint64_t *hash3 = (uint64_t*)hash+12;
uint64_t *hash4 = (uint64_t*)hash+16;
uint64_t *hash5 = (uint64_t*)hash+20;
uint64_t *hash6 = (uint64_t*)hash+24;
uint64_t *hash7 = (uint64_t*)hash+28;
uint32_t vhashA[8*8] __attribute__ ((aligned (64)));
uint32_t vhashB[8*8] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (32)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t hash2[8] __attribute__ ((aligned (32)));
uint32_t hash3[8] __attribute__ ((aligned (32)));
uint32_t hash4[8] __attribute__ ((aligned (64)));
uint32_t hash5[8] __attribute__ ((aligned (32)));
uint32_t hash6[8] __attribute__ ((aligned (32)));
uint32_t hash7[8] __attribute__ ((aligned (32)));
allium_8way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &allium_8way_ctx, sizeof(allium_8way_ctx) );
@@ -355,52 +398,69 @@ void allium_8way_hash( void *hash, const void *input )
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, 256 );
dintrlv_4x64( hash4, hash5, hash6, hash7, vhashB, 256 );
groestl256_full( &ctx.groestl, hash0, hash0, 256 );
groestl256_full( &ctx.groestl, hash1, hash1, 256 );
groestl256_full( &ctx.groestl, hash2, hash2, 256 );
groestl256_full( &ctx.groestl, hash3, hash3, 256 );
groestl256_full( &ctx.groestl, hash4, hash4, 256 );
groestl256_full( &ctx.groestl, hash5, hash5, 256 );
groestl256_full( &ctx.groestl, hash6, hash6, 256 );
groestl256_full( &ctx.groestl, hash7, hash7, 256 );
update_and_final_groestl256( &ctx.groestl, state, hash0, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+32, hash1, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+64, hash2, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+96, hash3, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+128, hash4, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+160, hash5, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+192, hash6, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+224, hash7, 256 );
}
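The left column re-initializes the Groestl context before each lane by memcpy'ing a saved prototype; the right column's groestl256_full() folds init, update and final into one call, so no prototype copy is needed between lanes. A hedged sketch of the two idioms with a stand-in context type (h_init and h_update_final are placeholders, not this repo's API):

#include <string.h>
#include <stdint.h>

typedef struct { uint64_t chaining[8]; int buflen; } hctx_t;   // stand-in

static void h_init( hctx_t *c ) { memset( c, 0, sizeof *c ); }
static void h_update_final( hctx_t *c, void *out, const void *in, int bits )
{ (void)c; (void)in; memset( out, 0, bits / 8 ); }             // stub body

// Idiom A (old code): restore a pre-initialized prototype, then hash.
static void lane_hash_via_proto( const hctx_t *proto, void *out,
                                 const void *in, int bits )
{
   hctx_t c;
   memcpy( &c, proto, sizeof c );        // cheap "re-init"
   h_update_final( &c, out, in, bits );
}

// Idiom B (new code): a _full() helper re-inits internally, dropping
// one memcpy per lane from the hot loop.
static void lane_hash_full( void *out, const void *in, int bits )
{
   hctx_t c;
   h_init( &c );
   h_update_final( &c, out, in, bits );
}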
int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint64_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t hash[8*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint64_t *ptarget = (uint64_t*)work->target;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
const uint32_t Htarg = ptarget[7];
__m256i *noncev = (__m256i*)vdata + 19; // aligned
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
int thr_id = mythr->id;
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
mm256_bswap32_intrlv80_8x32( vdata, pdata );
*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
blake256_8way_init( &allium_8way_ctx.blake );
blake256_8way_update( &allium_8way_ctx.blake, vdata, 64 );
do {
allium_8way_hash( hash, vdata );
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
n+3, n+2, n+1, n ) );
for ( int lane = 0; lane < 8; lane++ )
allium_8way_hash( hash, vdata );
pdata[19] = n;
for ( int lane = 0; lane < 8; lane++ ) if ( (hash+(lane<<3))[7] <= Htarg )
{
const uint64_t *lane_hash = hash + (lane<<2);
if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) )
if ( fulltest( hash+(lane<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = bswap_32( n + lane );
submit_lane_solution( work, lane_hash, mythr, lane );
}
pdata[19] = n + lane;
submit_lane_solution( work, hash+(lane<<3), mythr, lane );
}
}
n += 8;
*noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
} while ( likely( (n <= last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
} while ( (n < last_nonce) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce;
return 0;
}

View File

@@ -1,7 +1,4 @@
#include "lyra2-gate.h"
#if !( defined(ALLIUM_16WAY) || defined(ALLIUM_8WAY) || defined(ALLIUM_4WAY) )
#include <memory.h>
#include "algo/blake/sph_blake.h"
#include "algo/keccak/sph_keccak.h"
@@ -76,35 +73,37 @@ int scanhash_allium( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(128) hash[8];
uint32_t _ALIGN(128) edata[20];
uint32_t _ALIGN(128) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
const int thr_id = mythr->id;
int thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
ptarget[7] = 0x3ffff;
for ( int i = 0; i < 19; i++ )
edata[i] = bswap_32( pdata[i] );
be32enc( &endiandata[i], pdata[i] );
sph_blake256_init( &allium_ctx.blake );
sph_blake256( &allium_ctx.blake, edata, 64 );
sph_blake256( &allium_ctx.blake, endiandata, 64 );
do {
edata[19] = nonce;
allium_hash( hash, edata );
if ( valid_hash( hash, ptarget ) && !opt_benchmark )
be32enc( &endiandata[19], nonce );
allium_hash( hash, endiandata );
if ( hash[7] <= Htarg )
if ( fulltest( hash, ptarget ) && !opt_benchmark )
{
pdata[19] = bswap_32( nonce );
pdata[19] = nonce;
submit_solution( work, hash, mythr );
}
nonce++;
} while ( nonce < max_nonce && !work_restart[thr_id].restart );
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
#endif
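The scalar scanhash above flips the 80-byte header to big-endian either with be32enc() per word or with bswap_32() plus a plain store; on a little-endian host the two are interchangeable, which is all this hunk changes. A sketch of both, assuming a little-endian target:

#include <stdint.h>

static inline uint32_t bswap32_sketch( uint32_t x )
{
   return  (x >> 24)               | ((x >> 8) & 0x0000ff00U)
        | ((x << 8) & 0x00ff0000U) |  (x << 24);
}

// be32enc stores x big-endian byte by byte; on a little-endian host it
// is equivalent to dst[i] = bswap32_sketch( x ).
static inline void be32enc_sketch( void *dst, uint32_t x )
{
   uint8_t *p = dst;
   p[0] = x >> 24; p[1] = x >> 16; p[2] = x >> 8; p[3] = x;
}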

View File

@@ -94,12 +94,12 @@ bool lyra2rev2_thread_init()
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
int size = (int64_t)ROW_LEN_BYTES * 4; // nRows;
#if defined (LYRA2REV2_16WAY)
#if defined (LYRA2REV2_8WAY)
l2v2_wholeMatrix = _mm_malloc( 2 * size, 64 ); // 2 way
init_lyra2rev2_16way_ctx();
#elif defined (LYRA2REV2_8WAY)
l2v2_wholeMatrix = _mm_malloc( size, 64 );
init_lyra2rev2_8way_ctx();
#elif defined (LYRA2REV2_4WAY)
l2v2_wholeMatrix = _mm_malloc( size, 64 );
init_lyra2rev2_4way_ctx();
#else
l2v2_wholeMatrix = _mm_malloc( size, 64 );
init_lyra2rev2_ctx();
@@ -109,17 +109,17 @@ bool lyra2rev2_thread_init()
bool register_lyra2rev2_algo( algo_gate_t* gate )
{
#if defined (LYRA2REV2_16WAY)
gate->scanhash = (void*)&scanhash_lyra2rev2_16way;
gate->hash = (void*)&lyra2rev2_16way_hash;
#elif defined (LYRA2REV2_8WAY)
#if defined (LYRA2REV2_8WAY)
gate->scanhash = (void*)&scanhash_lyra2rev2_8way;
gate->hash = (void*)&lyra2rev2_8way_hash;
#elif defined (LYRA2REV2_4WAY)
gate->scanhash = (void*)&scanhash_lyra2rev2_4way;
gate->hash = (void*)&lyra2rev2_4way_hash;
#else
gate->scanhash = (void*)&scanhash_lyra2rev2;
gate->hash = (void*)&lyra2rev2_hash;
#endif
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
opt_target_factor = 256.0;
return true;
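register_lyra2rev2_algo() is one instance of the repo's algo_gate pattern: a table of function pointers filled in at registration, with the widest compiled vector path winning the #if ladder. A simplified sketch of the shape (the real algo_gate_t carries many more fields; names here are illustrative):

#include <stdbool.h>
#include <stdint.h>

struct work; struct thr_info;   // opaque here

typedef struct
{
   int  (*scanhash)( struct work*, uint32_t, uint64_t*, struct thr_info* );
   void (*hash)( void*, const void* );
   uint32_t optimizations;      // SSE2_OPT | AVX2_OPT | ... bit flags
} gate_sketch_t;

int scanhash_example_8way( struct work*, uint32_t, uint64_t*, struct thr_info* );
int scanhash_example( struct work*, uint32_t, uint64_t*, struct thr_info* );

static bool register_example_algo( gate_sketch_t *gate )
{
#if defined(EXAMPLE_8WAY)
   gate->scanhash = scanhash_example_8way;   // widest path compiled in
#else
   gate->scanhash = scanhash_example;        // scalar fallback
#endif
   return true;
}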
@@ -194,7 +194,7 @@ bool register_allium_algo( algo_gate_t* gate )
/////////////////////////////////////////
bool phi2_has_roots = false;
bool phi2_has_roots;
bool phi2_use_roots = false;
int phi2_get_work_data_size() { return phi2_use_roots ? 144 : 128; }
@@ -220,7 +220,7 @@ void phi2_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
// Assemble block header
algo_gate.build_block_header( g_work, le32dec( sctx->job.version ),
(uint32_t*) sctx->job.prevhash, (uint32_t*) merkle_tree,
le32dec( sctx->job.ntime ), le32dec(sctx->job.nbits), NULL );
le32dec( sctx->job.ntime ), le32dec(sctx->job.nbits) );
for ( t = 0; t < 16; t++ )
g_work->data[ 20+t ] = ((uint32_t*)sctx->job.extra)[t];
}
@@ -228,14 +228,13 @@ void phi2_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
bool register_phi2_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
// init_phi2_ctx();
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
gate->get_work_data_size = (void*)&phi2_get_work_data_size;
gate->decode_extra_data = (void*)&phi2_decode_extra_data;
gate->build_extraheader = (void*)&phi2_build_extraheader;
opt_target_factor = 256.0;
#if defined(PHI2_8WAY)
gate->scanhash = (void*)&scanhash_phi2_8way;
#elif defined(PHI2_4WAY)
#if defined(PHI2_4WAY)
gate->scanhash = (void*)&scanhash_phi2_4way;
#else
init_phi2_ctx();

View File

@@ -51,29 +51,28 @@ bool init_lyra2rev3_ctx();
//////////////////////////////////
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define LYRA2REV2_16WAY 1
#elif defined(__AVX2__)
#define LYRA2REV2_8WAY 1
#elif defined(__AVX2__)
#define LYRA2REV2_4WAY 1
#endif
extern __thread uint64_t* l2v2_wholeMatrix;
bool register_lyra2rev2_algo( algo_gate_t* gate );
#if defined(LYRA2REV2_16WAY)
void lyra2rev2_16way_hash( void *state, const void *input );
int scanhash_lyra2rev2_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool init_lyra2rev2_16way_ctx();
#elif defined(LYRA2REV2_8WAY)
#if defined(LYRA2REV2_8WAY)
void lyra2rev2_8way_hash( void *state, const void *input );
int scanhash_lyra2rev2_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool init_lyra2rev2_8way_ctx();
#elif defined(LYRA2REV2_4WAY)
void lyra2rev2_4way_hash( void *state, const void *input );
int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool init_lyra2rev2_4way_ctx();
#else
@@ -186,26 +185,19 @@ bool init_allium_ctx();
/////////////////////////////////////////
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define PHI2_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define PHI2_4WAY 1
#if defined(__AVX2__) && defined(__AES__)
// #define PHI2_4WAY
#endif
extern bool phi2_has_roots;
bool phi2_has_roots;
bool register_phi2_algo( algo_gate_t* gate );
#if defined(PHI2_8WAY)
void phi2_8way_hash( void *state, const void *input );
int scanhash_phi2_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(PHI2_4WAY)
#if defined(PHI2_4WAY)
void phi2_hash_4way( void *state, const void *input );
int scanhash_phi2_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
//void init_phi2_ctx();
#else

View File

@@ -1,7 +1,4 @@
#include "lyra2-gate.h"
#if !( defined(LYRA2H_8WAY) || defined(LYRA2H_4WAY) )
#include <memory.h>
#include <mm_malloc.h>
#include "lyra2.h"
@@ -74,4 +71,3 @@ int scanhash_lyra2h( struct work *work, uint32_t max_nonce,
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
#endif

View File

@@ -7,227 +7,23 @@
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/cubehash/cube-hash-2way.h"
#if defined (LYRA2REV2_16WAY)
typedef struct {
blake256_16way_context blake;
keccak256_8way_context keccak;
cubehashParam cube;
skein256_8way_context skein;
bmw256_16way_context bmw;
} lyra2v2_16way_ctx_holder __attribute__ ((aligned (64)));
static lyra2v2_16way_ctx_holder l2v2_16way_ctx;
bool init_lyra2rev2_16way_ctx()
{
keccak256_8way_init( &l2v2_16way_ctx.keccak );
cubehashInit( &l2v2_16way_ctx.cube, 256, 16, 32 );
skein256_8way_init( &l2v2_16way_ctx.skein );
bmw256_16way_init( &l2v2_16way_ctx.bmw );
return true;
}
void lyra2rev2_16way_hash( void *state, const void *input )
{
uint32_t vhash[8*16] __attribute__ ((aligned (128)));
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t hash4[8] __attribute__ ((aligned (64)));
uint32_t hash5[8] __attribute__ ((aligned (64)));
uint32_t hash6[8] __attribute__ ((aligned (64)));
uint32_t hash7[8] __attribute__ ((aligned (64)));
uint32_t hash8[8] __attribute__ ((aligned (64)));
uint32_t hash9[8] __attribute__ ((aligned (64)));
uint32_t hash10[8] __attribute__ ((aligned (64)));
uint32_t hash11[8] __attribute__ ((aligned (64)));
uint32_t hash12[8] __attribute__ ((aligned (64)));
uint32_t hash13[8] __attribute__ ((aligned (64)));
uint32_t hash14[8] __attribute__ ((aligned (64)));
uint32_t hash15[8] __attribute__ ((aligned (64)));
lyra2v2_16way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v2_16way_ctx, sizeof(l2v2_16way_ctx) );
blake256_16way_update( &ctx.blake, input + (64<<4), 16 );
blake256_16way_close( &ctx.blake, vhash );
dintrlv_16x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7,
hash8, hash9, hash10, hash11,
hash12, hash13, hash14, hash15, vhash, 256 );
intrlv_8x64( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, 256 );
keccak256_8way_update( &ctx.keccak, vhash, 32 );
keccak256_8way_close( &ctx.keccak, vhash );
dintrlv_8x64( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
intrlv_8x64( vhash, hash8, hash9, hash10, hash11,
hash12, hash13, hash14, hash15, 256 );
keccak256_8way_init( &ctx.keccak );
keccak256_8way_update( &ctx.keccak, vhash, 32 );
keccak256_8way_close( &ctx.keccak, vhash );
dintrlv_8x64( hash8, hash9, hash10, hash11,
hash12, hash13, hash14, hash15, vhash, 256 );
cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
cubehash_full( &ctx.cube, (byte*) hash1, 256, (const byte*) hash1, 32 );
cubehash_full( &ctx.cube, (byte*) hash2, 256, (const byte*) hash2, 32 );
cubehash_full( &ctx.cube, (byte*) hash3, 256, (const byte*) hash3, 32 );
cubehash_full( &ctx.cube, (byte*) hash4, 256, (const byte*) hash4, 32 );
cubehash_full( &ctx.cube, (byte*) hash5, 256, (const byte*) hash5, 32 );
cubehash_full( &ctx.cube, (byte*) hash6, 256, (const byte*) hash6, 32 );
cubehash_full( &ctx.cube, (byte*) hash7, 256, (const byte*) hash7, 32 );
cubehash_full( &ctx.cube, (byte*) hash8, 256, (const byte*) hash8, 32 );
cubehash_full( &ctx.cube, (byte*) hash9, 256, (const byte*) hash9, 32 );
cubehash_full( &ctx.cube, (byte*) hash10, 256, (const byte*) hash10, 32 );
cubehash_full( &ctx.cube, (byte*) hash11, 256, (const byte*) hash11, 32 );
cubehash_full( &ctx.cube, (byte*) hash12, 256, (const byte*) hash12, 32 );
cubehash_full( &ctx.cube, (byte*) hash13, 256, (const byte*) hash13, 32 );
cubehash_full( &ctx.cube, (byte*) hash14, 256, (const byte*) hash14, 32 );
cubehash_full( &ctx.cube, (byte*) hash15, 256, (const byte*) hash15, 32 );
intrlv_2x256( vhash, hash0, hash1, 256 );
LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash0, hash1, vhash, 256 );
intrlv_2x256( vhash, hash2, hash3, 256 );
LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash2, hash3, vhash, 256 );
intrlv_2x256( vhash, hash4, hash5, 256 );
LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash4, hash5, vhash, 256 );
intrlv_2x256( vhash, hash6, hash7, 256 );
LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash6, hash7, vhash, 256 );
intrlv_2x256( vhash, hash8, hash9, 256 );
LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash8, hash9, vhash, 256 );
intrlv_2x256( vhash, hash10, hash11, 256 );
LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash10, hash11, vhash, 256 );
intrlv_2x256( vhash, hash12, hash13, 256 );
LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash12, hash13, vhash, 256 );
intrlv_2x256( vhash, hash14, hash15, 256 );
LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash14, hash15, vhash, 256 );
intrlv_8x64( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, 256 );
skein256_8way_update( &ctx.skein, vhash, 32 );
skein256_8way_close( &ctx.skein, vhash );
dintrlv_8x64( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
intrlv_8x64( vhash, hash8, hash9, hash10, hash11, hash12,
hash13, hash14, hash15, 256 );
skein256_8way_init( &ctx.skein );
skein256_8way_update( &ctx.skein, vhash, 32 );
skein256_8way_close( &ctx.skein, vhash );
dintrlv_8x64( hash8, hash9, hash10, hash11,
hash12, hash13, hash14, hash15, vhash, 256 );
cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
cubehash_full( &ctx.cube, (byte*) hash1, 256, (const byte*) hash1, 32 );
cubehash_full( &ctx.cube, (byte*) hash2, 256, (const byte*) hash2, 32 );
cubehash_full( &ctx.cube, (byte*) hash3, 256, (const byte*) hash3, 32 );
cubehash_full( &ctx.cube, (byte*) hash4, 256, (const byte*) hash4, 32 );
cubehash_full( &ctx.cube, (byte*) hash5, 256, (const byte*) hash5, 32 );
cubehash_full( &ctx.cube, (byte*) hash6, 256, (const byte*) hash6, 32 );
cubehash_full( &ctx.cube, (byte*) hash7, 256, (const byte*) hash7, 32 );
cubehash_full( &ctx.cube, (byte*) hash8, 256, (const byte*) hash8, 32 );
cubehash_full( &ctx.cube, (byte*) hash9, 256, (const byte*) hash9, 32 );
cubehash_full( &ctx.cube, (byte*) hash10, 256, (const byte*) hash10, 32 );
cubehash_full( &ctx.cube, (byte*) hash11, 256, (const byte*) hash11, 32 );
cubehash_full( &ctx.cube, (byte*) hash12, 256, (const byte*) hash12, 32 );
cubehash_full( &ctx.cube, (byte*) hash13, 256, (const byte*) hash13, 32 );
cubehash_full( &ctx.cube, (byte*) hash14, 256, (const byte*) hash14, 32 );
cubehash_full( &ctx.cube, (byte*) hash15, 256, (const byte*) hash15, 32 );
intrlv_16x32( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7,
hash8, hash9, hash10, hash11,
hash12, hash13, hash14, hash15, 256 );
bmw256_16way_update( &ctx.bmw, vhash, 32 );
bmw256_16way_close( &ctx.bmw, state );
}
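All of the intrlv_/dintrlv_ traffic in this function converts between per-lane buffers and the SIMD structure-of-arrays layout: for an 8x32 interleave, word i of lane l lives at v[i*8 + l]. A portable reference version assuming only that layout (the repo's helpers are hand-vectorized and come in 4x64, 2x256 and re-interleave variants as well):

#include <stdint.h>

#define LANES 8   // word i of lane l  <->  v[ i*LANES + l ]

static void intrlv_8x32_sketch( uint32_t *v, uint32_t *const lane[LANES],
                                int bit_len )
{
   for ( int i = 0; i < bit_len / 32; i++ )
      for ( int l = 0; l < LANES; l++ )
         v[ i*LANES + l ] = lane[l][i];
}

static void dintrlv_8x32_sketch( uint32_t *const lane[LANES],
                                 const uint32_t *v, int bit_len )
{
   for ( int i = 0; i < bit_len / 32; i++ )
      for ( int l = 0; l < LANES; l++ )
         lane[l][i] = v[ i*LANES + l ];
}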
int scanhash_lyra2rev2_16way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t vdata[20*16] __attribute__ ((aligned (64)));
uint32_t *hashd7 = &hash[7*16];
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 16;
uint32_t n = first_nonce;
const uint32_t targ32 = ptarget[7];
__m512i *noncev = (__m512i*)vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
if ( bench ) ptarget[7] = 0x0000ff;
mm512_bswap32_intrlv80_16x32( vdata, pdata );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
blake256_16way_init( &l2v2_16way_ctx.blake );
blake256_16way_update( &l2v2_16way_ctx.blake, vdata, 64 );
do
{
lyra2rev2_16way_hash( hash, vdata );
for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( hashd7[lane] <= targ32 ) )
{
extr_lane_16x32( lane_hash, hash, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n + lane );
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
*noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) );
n += 16;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
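The scanhash loop above only extracts a lane when its most significant word passes the cheap test: with 16x32 interleaving, word 7 of every lane sits contiguously at hash[7*16 .. 7*16+15], so hashd7[lane] <= targ32 costs one load per lane. A sketch of the extraction step it guards (extr_lane_16x32 gathers one lane back out of the interleaved block; the full valid_hash() then runs on lane_hash):

#include <stdint.h>

// Gather the 8 words of one lane out of a 16-way interleaved buffer.
static void extr_lane_16x32_sketch( uint32_t *lane_hash,
                                    const uint32_t *v, int lane )
{
   for ( int i = 0; i < 8; i++ )
      lane_hash[i] = v[ i*16 + lane ];
}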
#elif defined (LYRA2REV2_8WAY)
#if defined (LYRA2REV2_8WAY)
typedef struct {
blake256_8way_context blake;
keccak256_4way_context keccak;
cubehashParam cube;
skein256_4way_context skein;
bmw256_8way_context bmw;
keccak256_8way_context keccak;
cube_4way_context cube;
skein256_8way_context skein;
bmw256_8way_context bmw;
} lyra2v2_8way_ctx_holder __attribute__ ((aligned (64)));
static lyra2v2_8way_ctx_holder l2v2_8way_ctx;
bool init_lyra2rev2_8way_ctx()
{
keccak256_4way_init( &l2v2_8way_ctx.keccak );
cubehashInit( &l2v2_8way_ctx.cube, 256, 16, 32 );
skein256_4way_init( &l2v2_8way_ctx.skein );
keccak256_8way_init( &l2v2_8way_ctx.keccak );
cube_4way_init( &l2v2_8way_ctx.cube, 256, 16, 32 );
skein256_8way_init( &l2v2_8way_ctx.skein );
bmw256_8way_init( &l2v2_8way_ctx.bmw );
return true;
}
@@ -235,6 +31,8 @@ bool init_lyra2rev2_8way_ctx()
void lyra2rev2_8way_hash( void *state, const void *input )
{
uint32_t vhash[8*8] __attribute__ ((aligned (128)));
uint32_t vhashA[8*8] __attribute__ ((aligned (64)));
uint32_t vhashB[8*8] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
@@ -249,113 +47,103 @@ void lyra2rev2_8way_hash( void *state, const void *input )
blake256_8way_update( &ctx.blake, input + (64<<3), 16 );
blake256_8way_close( &ctx.blake, vhash );
dintrlv_8x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
rintrlv_8x32_8x64( vhashA, vhash, 256 );
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 256 );
keccak256_4way_update( &ctx.keccak, vhash, 32 );
keccak256_4way_close( &ctx.keccak, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 256 );
intrlv_4x64( vhash, hash4, hash5, hash6, hash7, 256 );
keccak256_4way_init( &ctx.keccak );
keccak256_4way_update( &ctx.keccak, vhash, 32 );
keccak256_4way_close( &ctx.keccak, vhash );
dintrlv_4x64( hash4, hash5, hash6, hash7, vhash, 256 );
keccak256_8way_update( &ctx.keccak, vhashA, 32 );
keccak256_8way_close( &ctx.keccak, vhash );
cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
cubehash_full( &ctx.cube, (byte*) hash1, 256, (const byte*) hash1, 32 );
cubehash_full( &ctx.cube, (byte*) hash2, 256, (const byte*) hash2, 32 );
cubehash_full( &ctx.cube, (byte*) hash3, 256, (const byte*) hash3, 32 );
cubehash_full( &ctx.cube, (byte*) hash4, 256, (const byte*) hash4, 32 );
cubehash_full( &ctx.cube, (byte*) hash5, 256, (const byte*) hash5, 32 );
cubehash_full( &ctx.cube, (byte*) hash6, 256, (const byte*) hash6, 32 );
cubehash_full( &ctx.cube, (byte*) hash7, 256, (const byte*) hash7, 32 );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 256 );
cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 );
cube_4way_init( &ctx.cube, 256, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 );
LYRA2REV2( l2v2_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
LYRA2REV2( l2v2_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );
LYRA2REV2( l2v2_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
LYRA2REV2( l2v2_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
LYRA2REV2( l2v2_wholeMatrix, hash4, 32, hash4, 32, hash4, 32, 1, 4, 4 );
LYRA2REV2( l2v2_wholeMatrix, hash5, 32, hash5, 32, hash5, 32, 1, 4, 4 );
LYRA2REV2( l2v2_wholeMatrix, hash6, 32, hash6, 32, hash6, 32, 1, 4, 4 );
LYRA2REV2( l2v2_wholeMatrix, hash7, 32, hash7, 32, hash7, 32, 1, 4, 4 );
dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );
intrlv_2x256( vhash, hash0, hash1, 256 );
LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash0, hash1, vhash, 256 );
intrlv_2x256( vhash, hash2, hash3, 256 );
LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash2, hash3, vhash, 256 );
intrlv_2x256( vhash, hash4, hash5, 256 );
LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash4, hash5, vhash, 256 );
intrlv_2x256( vhash, hash6, hash7, 256 );
LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash6, hash7, vhash, 256 );
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, 256 );
skein256_8way_update( &ctx.skein, vhash, 32 );
skein256_8way_close( &ctx.skein, vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 256 );
cube_4way_init( &ctx.cube, 256, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 );
cube_4way_init( &ctx.cube, 256, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 );
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 256 );
skein256_4way_update( &ctx.skein, vhash, 32 );
skein256_4way_close( &ctx.skein, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 256 );
intrlv_4x64( vhash, hash4, hash5, hash6, hash7, 256 );
skein256_4way_init( &ctx.skein );
skein256_4way_update( &ctx.skein, vhash, 32 );
skein256_4way_close( &ctx.skein, vhash );
dintrlv_4x64( hash4, hash5, hash6, hash7, vhash, 256 );
dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );
cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
cubehash_full( &ctx.cube, (byte*) hash1, 256, (const byte*) hash1, 32 );
cubehash_full( &ctx.cube, (byte*) hash2, 256, (const byte*) hash2, 32 );
cubehash_full( &ctx.cube, (byte*) hash3, 256, (const byte*) hash3, 32 );
cubehash_full( &ctx.cube, (byte*) hash4, 256, (const byte*) hash4, 32 );
cubehash_full( &ctx.cube, (byte*) hash5, 256, (const byte*) hash5, 32 );
cubehash_full( &ctx.cube, (byte*) hash6, 256, (const byte*) hash6, 32 );
cubehash_full( &ctx.cube, (byte*) hash7, 256, (const byte*) hash7, 32 );
intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, 256 );
intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, 256 );
bmw256_8way_update( &ctx.bmw, vhash, 32 );
bmw256_8way_close( &ctx.bmw, state );
}
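For reference when reading the LYRA2REV2(...) calls above, and assuming the prototype in the repo's lyra2.h, the arguments are output/length, password/length, salt/length, then the cost parameters; the v2 chain feeds the same 32-byte intermediate hash in as output, password and salt with t=1, r=4, c=4:

#include <stdint.h>

// Assumed prototype (see lyra2.h): K = output of kLen bytes; pwd and
// salt are the inputs; timeCost/nRows/nCols size the memory matrix.
int LYRA2REV2( uint64_t *wholeMatrix,
               void *K, uint64_t kLen,
               const void *pwd, uint64_t pwdLen,
               const void *salt, uint64_t saltLen,
               uint64_t timeCost, uint64_t nRows, uint64_t nCols );

// So a call above reads: 32-byte in-place hash, t=1, r=4, c=4.
// LYRA2REV2( l2v2_wholeMatrix, hash0,32, hash0,32, hash0,32, 1,4,4 );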
int scanhash_lyra2rev2_8way( struct work *work, const uint32_t max_nonce,
int scanhash_lyra2rev2_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t *hashd7 = &hash[7*8];
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[7<<3]);
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
const uint32_t targ32 = ptarget[7];
__m256i *noncev = (__m256i*)vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const uint32_t Htarg = ptarget[7];
__m256i *noncev = (__m256i*)vdata + 19; // aligned
int thr_id = mythr->id;
if ( bench ) ptarget[7] = 0x0000ff;
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
mm256_bswap32_intrlv80_8x32( vdata, pdata );
*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
blake256_8way_init( &l2v2_8way_ctx.blake );
blake256_8way_update( &l2v2_8way_ctx.blake, vdata, 64 );
do
{
lyra2rev2_8way_hash( hash, vdata );
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
n+3, n+2, n+1, n ) );
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hashd7[lane] <= targ32 ) )
lyra2rev2_8way_hash( hash, vdata );
pdata[19] = n;
for ( int lane = 0; lane < 8; lane++ ) if ( hash7[lane] <= Htarg )
{
extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = bswap_32( n + lane );
submit_lane_solution( work, lane_hash, mythr, lane );
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
*noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
n += 8;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
} while ( (n < last_nonce) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce;
return 0;
}
#endif
/*
#elif defined (LYRA2REV2_4WAY)
typedef struct {
@@ -438,16 +226,15 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t *hashd7 = &(hash[7<<2]);
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
const uint32_t targ32 = ptarget[7];
__m128i *noncev = (__m128i*)vdata + 19;
int thr_id = mythr->id;
const uint32_t Htarg = ptarget[7];
__m128i *noncev = (__m128i*)vdata + 19; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
@@ -462,22 +249,21 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
lyra2rev2_4way_hash( hash, vdata );
pdata[19] = n;
for ( int lane = 0; lane < 4; lane++ ) if ( hashd7[lane] <= targ32 )
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
{
extr_lane_4x32( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) && !opt_benchmark )
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 4;
} while ( (n < last_nonce) && !work_restart[thr_id].restart);
pdata[19] = n;
*hashes_done = n - first_nonce;
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif
*/

View File

@@ -1,7 +1,4 @@
#include "lyra2-gate.h"
#if !( defined(LYRA2REV2_16WAY) || defined(LYRA2REV2_8WAY) || defined(LYRA2REV2_4WAY) )
#include <memory.h>
#include "algo/blake/sph_blake.h"
#include "algo/cubehash/sph_cubehash.h"
@@ -99,7 +96,7 @@ int scanhash_lyra2rev2( struct work *work,
lyra2rev2_hash(hash, endiandata);
if (hash[7] <= Htarg )
if( valid_hash( hash, ptarget ) && !opt_benchmark )
if( fulltest( hash, ptarget ) && !opt_benchmark )
{
pdata[19] = nonce;
submit_solution( work, hash, mythr );
@@ -110,4 +107,4 @@ int scanhash_lyra2rev2( struct work *work,
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
#endif

View File

@@ -79,16 +79,19 @@ void lyra2rev3_16way_hash( void *state, const void *input )
dintrlv_2x256( hash14, hash15, vhash, 256 );
intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 256 );
cube_4way_full( &ctx.cube, vhash, 256, vhash, 32 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
dintrlv_4x128( hash0, hash1, hash2, hash3, vhash, 256 );
intrlv_4x128( vhash, hash4, hash5, hash6, hash7, 256 );
cube_4way_full( &ctx.cube, vhash, 256, vhash, 32 );
cube_4way_init( &ctx.cube, 256, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
dintrlv_4x128( hash4, hash5, hash6, hash7, vhash, 256 );
intrlv_4x128( vhash, hash8, hash9, hash10, hash11, 256 );
cube_4way_full( &ctx.cube, vhash, 256, vhash, 32 );
cube_4way_init( &ctx.cube, 256, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
dintrlv_4x128( hash8, hash9, hash10, hash11, vhash, 256 );
intrlv_4x128( vhash, hash12, hash13, hash14, hash15, 256 );
cube_4way_full( &ctx.cube, vhash, 256, vhash, 32 );
cube_4way_init( &ctx.cube, 256, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
dintrlv_4x128( hash12, hash13, hash14, hash15, vhash, 256 );
intrlv_2x256( vhash, hash0, hash1, 256 );
@@ -130,15 +133,15 @@ int scanhash_lyra2rev3_16way( struct work *work, const uint32_t max_nonce,
{
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t vdata[20*16] __attribute__ ((aligned (64)));
uint32_t *hashd7 = &hash[7*16];
uint32_t *hash7 = &hash[7<<4];
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t last_nonce = max_nonce - 16;
const uint32_t targ32 = ptarget[7];
__m512i *noncev = (__m512i*)vdata + 19;
const uint32_t Htarg = ptarget[7];
__m512i *noncev = (__m512i*)vdata + 19; // aligned
const int thr_id = mythr->id;
if ( opt_benchmark ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
@@ -159,10 +162,10 @@ int scanhash_lyra2rev3_16way( struct work *work, const uint32_t max_nonce,
pdata[19] = n;
for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( hashd7[lane] <= targ32 ) )
if ( unlikely( hash7[lane] <= Htarg ) )
{
extr_lane_16x32( lane_hash, hash, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ) )
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
@@ -170,7 +173,6 @@ int scanhash_lyra2rev3_16way( struct work *work, const uint32_t max_nonce,
}
n += 16;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
@@ -195,7 +197,7 @@ bool init_lyra2rev3_8way_ctx()
void lyra2rev3_8way_hash( void *state, const void *input )
{
uint32_t vhash[8*8] __attribute__ ((aligned (128)));
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t hash2[8] __attribute__ ((aligned (32)));
@@ -222,14 +224,21 @@ void lyra2rev3_8way_hash( void *state, const void *input )
LYRA2REV3( l2v3_wholeMatrix, hash6, 32, hash6, 32, hash6, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash7, 32, hash7, 32, hash7, 32, 1, 4, 4 );
cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
cubehash_full( &ctx.cube, (byte*) hash1, 256, (const byte*) hash1, 32 );
cubehash_full( &ctx.cube, (byte*) hash2, 256, (const byte*) hash2, 32 );
cubehash_full( &ctx.cube, (byte*) hash3, 256, (const byte*) hash3, 32 );
cubehash_full( &ctx.cube, (byte*) hash4, 256, (const byte*) hash4, 32 );
cubehash_full( &ctx.cube, (byte*) hash5, 256, (const byte*) hash5, 32 );
cubehash_full( &ctx.cube, (byte*) hash6, 256, (const byte*) hash6, 32 );
cubehash_full( &ctx.cube, (byte*) hash7, 256, (const byte*) hash7, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash4, (const byte*) hash4, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash5, (const byte*) hash5, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash6, (const byte*) hash6, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash7, (const byte*) hash7, 32 );
LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );
@@ -251,47 +260,46 @@ void lyra2rev3_8way_hash( void *state, const void *input )
int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t hash[8*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t *hashd7 = &hash[7*8];
uint32_t *hash7 = &hash[7<<3];
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
const uint32_t targ32 = ptarget[7];
__m256i *noncev = (__m256i*)vdata + 19;
const uint32_t Htarg = ptarget[7];
__m256i *noncev = (__m256i*)vdata + 19; // aligned
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
if ( bench ) ptarget[7] = 0x0000ff;
if ( opt_benchmark ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
mm256_bswap32_intrlv80_8x32( vdata, pdata );
*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
blake256_8way_init( &l2v3_8way_ctx.blake );
blake256_8way_update( &l2v3_8way_ctx.blake, vdata, 64 );
do
{
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
n+3, n+2, n+1, n ) );
lyra2rev3_8way_hash( hash, vdata );
pdata[19] = n;
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hashd7[lane] <= targ32 ) )
if ( unlikely( hash7[lane] <= Htarg ) )
{
extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
{
pdata[19] = bswap_32( n + lane );
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
*noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
n += 8;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
} while ( likely( (n < max_nonce-8) && !work_restart[thr_id].restart ) );
*hashes_done = n - first_nonce + 1;
return 0;
}
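A recurring micro-optimization in these hunks: the old loops rebuild the nonce vector every iteration with a byte-swapped _mm256_set_epi32(), while the new ones initialize it once and add a broadcast constant per pass (m256_const1_32 is the repo's set1 helper), moving the byte swap to the rare submit path ( pdata[19] = bswap_32( n + lane ) ). Both forms sketched with plain AVX2 intrinsics:

#include <immintrin.h>   // AVX2
#include <stdint.h>

// Old style: rebuild the whole vector of candidate nonces every pass.
static __m256i noncev_rebuild( uint32_t n )
{
   return _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
}

// New style: one vector add per pass instead of eight scalar inserts.
static __m256i noncev_step( __m256i noncev )
{
   return _mm256_add_epi32( noncev, _mm256_set1_epi32( 8 ) );
}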
@@ -358,41 +366,42 @@ int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce,
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t *hashd7 = &(hash[7*4]);
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t targ32 = ptarget[7];
__m128i *noncev = (__m128i*)vdata + 19;
const int thr_id = mythr->id;
const uint32_t Htarg = ptarget[7];
__m128i *noncev = (__m128i*)vdata + 19; // aligned
const int thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
mm128_bswap32_intrlv80_4x32( vdata, pdata );
*noncev = _mm_set_epi32( n+3, n+2, n+1, n );
blake256_4way_init( &l2v3_4way_ctx.blake );
blake256_4way_update( &l2v3_4way_ctx.blake, vdata, 64 );
do
{
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
lyra2rev3_4way_hash( hash, vdata );
for ( int lane = 0; lane < 4; lane++ ) if ( hashd7[lane] <= targ32 )
pdata[19] = n;
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
{
extr_lane_4x32( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) && !opt_benchmark )
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = bswap_32( n + lane );
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
*noncev = _mm_add_epi32( *noncev, m128_const1_32( 4 ) );
n += 4;
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
pdata[19] = n;
*hashes_done = n - first_nonce + 1;
return 0;
}

View File

@@ -1,7 +1,4 @@
#include "lyra2-gate.h"
#if !( defined(LYRA2REV3_16WAY) || defined(LYRA2REV3_8WAY) || defined(LYRA2REV3_4WAY) )
#include <memory.h>
#include "algo/blake/sph_blake.h"
#include "algo/cubehash/sph_cubehash.h"
@@ -88,7 +85,7 @@ int scanhash_lyra2rev3( struct work *work,
lyra2rev3_hash(hash, endiandata);
if (hash[7] <= Htarg )
if( valid_hash( hash, ptarget ) && !opt_benchmark )
if( fulltest( hash, ptarget ) && !opt_benchmark )
{
pdata[19] = nonce;
submit_solution( work, hash, mythr );
@@ -99,4 +96,4 @@ int scanhash_lyra2rev3( struct work *work,
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
#endif

View File

@@ -97,42 +97,41 @@ void lyra2z_16way_hash( void *state, const void *input )
int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint64_t hash[4*16] __attribute__ ((aligned (128)));
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t vdata[20*16] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t last_nonce = max_nonce - 16;
__m512i *noncev = (__m512i*)vdata + 19; // aligned
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
int thr_id = mythr->id; // thr_id arg is deprecated
if ( bench ) ptarget[7] = 0x0000ff;
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
mm512_bswap32_intrlv80_16x32( vdata, pdata );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
lyra2z_16way_midstate( vdata );
do {
*noncev = mm512_bswap_32( _mm512_set_epi32( n+15, n+14, n+13, n+12,
n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4,
n+ 3, n+ 2, n+ 1, n ) );
lyra2z_16way_hash( hash, vdata );
pdata[19] = n;
for ( int lane = 0; lane < 16; lane++ )
for ( int i = 0; i < 16; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
&& !opt_benchmark )
{
const uint64_t *lane_hash = hash + (lane<<2);
if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n + lane );
submit_lane_solution( work, lane_hash, mythr, lane );
}
pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
*noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) );
n += 16;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
} while ( (n < max_nonce-16) && !work_restart[thr_id].restart);
pdata[19] = n;
*hashes_done = n - first_nonce;
*hashes_done = n - first_nonce + 1;
return 0;
}
@@ -196,40 +195,39 @@ void lyra2z_8way_hash( void *state, const void *input )
int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint64_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t hash[8*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 19; // aligned
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
int thr_id = mythr->id; // thr_id arg is deprecated
if ( bench ) ptarget[7] = 0x0000ff;
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
mm256_bswap32_intrlv80_8x32( vdata, pdata );
*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
lyra2z_8way_midstate( vdata );
do {
*noncev = mm256_bswap_32(
_mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
lyra2z_8way_hash( hash, vdata );
pdata[19] = n;
for ( int lane = 0; lane < 8; lane++ )
for ( int i = 0; i < 8; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
&& !opt_benchmark )
{
const uint64_t *lane_hash = hash + (lane<<2);
if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n + lane );
submit_lane_solution( work, lane_hash, mythr, lane );
}
pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
*noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
n += 8;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
pdata[19] = n;
*hashes_done = n - first_nonce;
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return 0;
}
@@ -276,40 +274,39 @@ void lyra2z_4way_hash( void *state, const void *input )
int scanhash_lyra2z_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint64_t hash[4*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
__m128i *noncev = (__m128i*)vdata + 19; // aligned
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
int thr_id = mythr->id; // thr_id arg is deprecated
if ( bench ) ptarget[7] = 0x0000ff;
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
mm128_bswap32_intrlv80_4x32( vdata, pdata );
*noncev = _mm_set_epi32( n+3, n+2, n+1, n );
lyra2z_4way_midstate( vdata );
do {
lyra2z_4way_hash( hash, vdata );
for ( int lane = 0; lane < 4; lane++ )
{
const uint64_t *lane_hash = hash + (lane<<2);
if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n + lane );
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
*noncev = _mm_add_epi32( *noncev, m128_const1_32( 4 ) );
n += 4;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
lyra2z_4way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
&& !opt_benchmark )
{
pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
n += 4;
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return 0;
}

View File

@@ -1,9 +1,6 @@
#include <memory.h>
#include <mm_malloc.h>
#include "lyra2-gate.h"
#if !( defined(LYRA2Z_16WAY) || defined(LYRA2Z_8WAY) || defined(LYRA2Z_4WAY) )
#include "lyra2.h"
#include "algo/blake/sph_blake.h"
#include "simd-utils.h"
@@ -56,7 +53,7 @@ int scanhash_lyra2z( struct work *work, uint32_t max_nonce,
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
int thr_id = mythr->id;
int thr_id = mythr->id; // thr_id arg is deprecated
if (opt_benchmark)
ptarget[7] = 0x0000ff;
@@ -65,13 +62,14 @@ int scanhash_lyra2z( struct work *work, uint32_t max_nonce,
be32enc(&endiandata[i], pdata[i]);
}
lyra2z_midstate( endiandata );
do {
be32enc(&endiandata[19], nonce);
lyra2z_hash( hash, endiandata );
if ( valid_hash( hash, ptarget ) && !opt_benchmark )
if ( hash[7] <= Htarg )
if ( fulltest( hash, ptarget ) && !opt_benchmark )
{
pdata[19] = nonce;
submit_solution( work, hash, mythr );
@@ -82,4 +80,4 @@ int scanhash_lyra2z( struct work *work, uint32_t max_nonce,
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
#endif

View File

@@ -9,7 +9,7 @@ void lyra2z330_hash(void *state, const void *input, uint32_t height)
{
uint32_t _ALIGN(256) hash[16];
LYRA2Z( lyra2z330_wholeMatrix, hash, 32, input, 80, input, 80,
2, 330, 256 );
memcpy(state, hash, 32);
@@ -18,40 +18,38 @@ void lyra2z330_hash(void *state, const void *input, uint32_t height)
int scanhash_lyra2z330( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8] __attribute__ ((aligned (128)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t hash[8] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
const int thr_id = mythr->id;
int thr_id = mythr->id; // thr_id arg is deprecated
if (opt_benchmark)
ptarget[7] = 0x0000ff;
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
do
{
edata[19] = nonce;
LYRA2Z( lyra2z330_wholeMatrix, hash, 32, edata, 80, edata, 80,
2, 330, 256 );
// lyra2z330_hash( hash, edata, work->height );
if ( valid_hash( hash, ptarget ) && !opt_benchmark )
be32enc( &endiandata[19], nonce );
lyra2z330_hash( hash, endiandata, work->height );
if ( hash[7] <= Htarg )
if ( fulltest( hash, ptarget ) && !opt_benchmark )
{
be32enc( pdata + 19, nonce );
pdata[19] = nonce;
submit_solution( work, hash, mythr );
}
nonce++;
} while ( nonce < max_nonce && !work_restart[thr_id].restart );
pdata[19] = nonce;
*hashes_done = nonce - first_nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
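The casti_m128i() lines above byte-swap the 80-byte header four words at a time: casti_m128i(p,i) indexes p as an array of __m128i, and mm128_bswap_32() reverses the bytes within each 32-bit lane. A sketch with plain SSSE3 intrinsics, assuming only that behavior:

#include <immintrin.h>   // SSSE3 for _mm_shuffle_epi8
#include <stdint.h>

// Byte-swap each 32-bit lane of a vector (what the repo's
// mm128_bswap_32() does), so the 80-byte header flips to big-endian
// in five vector ops instead of twenty scalar be32enc() calls.
static inline __m128i bswap32x4_sketch( __m128i x )
{
   const __m128i idx = _mm_set_epi8( 12,13,14,15,  8, 9,10,11,
                                      4, 5, 6, 7,  0, 1, 2, 3 );
   return _mm_shuffle_epi8( x, idx );
}

static void bswap_header80_sketch( uint32_t *dst, const uint32_t *src )
{
   for ( int i = 0; i < 5; i++ )   // 5 x 16 bytes = 80-byte header
      _mm_storeu_si128( (__m128i*)dst + i,
         bswap32x4_sketch( _mm_loadu_si128( (const __m128i*)src + i ) ) );
}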

View File

@@ -1,501 +1,233 @@
/**
* Phi-2 algo Implementation
*/
#include "lyra2-gate.h"
#if defined(PHI2_4WAY)
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/gost/sph_gost.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "lyra2.h"
#if defined(__VAES__)
#include "algo/echo/echo-hash-4way.h"
#elif defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#endif
#if defined(PHI2_8WAY)
typedef struct {
cubehashParam cube;
jh512_8way_context jh;
#if defined(__VAES__)
echo_4way_context echo;
#else
hashState_echo echo;
#endif
sph_gost512_context gost;
skein512_8way_context skein;
} phi2_8way_ctx_holder;
void phi2_8way_hash( void *state, const void *input )
{
unsigned char _ALIGN(128) hash[64*8];
unsigned char _ALIGN(128) hashA[64*2];
unsigned char _ALIGN(64) hash0[64];
unsigned char _ALIGN(64) hash1[64];
unsigned char _ALIGN(64) hash2[64];
unsigned char _ALIGN(64) hash3[64];
unsigned char _ALIGN(64) hash4[64];
unsigned char _ALIGN(64) hash5[64];
unsigned char _ALIGN(64) hash6[64];
unsigned char _ALIGN(64) hash7[64];
const int size = phi2_has_roots ? 144 : 80 ;
phi2_8way_ctx_holder ctx __attribute__ ((aligned (64)));
cubehash_full( &ctx.cube, (byte*)hash0, 512,
(const byte*)input, size );
cubehash_full( &ctx.cube, (byte*)hash1, 512,
(const byte*)input + 144, size );
cubehash_full( &ctx.cube, (byte*)hash2, 512,
(const byte*)input + 2*144, size );
cubehash_full( &ctx.cube, (byte*)hash3, 512,
(const byte*)input + 3*144, size );
cubehash_full( &ctx.cube, (byte*)hash4, 512,
(const byte*)input + 4*144, size );
cubehash_full( &ctx.cube, (byte*)hash5, 512,
(const byte*)input + 5*144, size );
cubehash_full( &ctx.cube, (byte*)hash6, 512,
(const byte*)input + 6*144, size );
cubehash_full( &ctx.cube, (byte*)hash7, 512,
(const byte*)input + 7*144, size );
intrlv_2x256( hashA, hash0, hash1, 512 );
LYRA2RE_2WAY( hash, 32, hashA, 32, 1, 8, 8 );
LYRA2RE_2WAY( hash + 2*32, 32, hashA + 2*32, 32, 1, 8, 8 );
dintrlv_2x256( hash0, hash1, hash, 512 );
intrlv_2x256( hashA, hash2, hash3, 512 );
LYRA2RE_2WAY( hash, 32, hashA, 32, 1, 8, 8 );
LYRA2RE_2WAY( hash + 2*32, 32, hashA + 2*32, 32, 1, 8, 8 );
dintrlv_2x256( hash2, hash3, hash, 512 );
intrlv_2x256( hashA, hash4, hash5, 512 );
LYRA2RE_2WAY( hash, 32, hashA, 32, 1, 8, 8 );
LYRA2RE_2WAY( hash + 2*32, 32, hashA + 2*32, 32, 1, 8, 8 );
dintrlv_2x256( hash4, hash5, hash, 512 );
intrlv_2x256( hashA, hash6, hash7, 512 );
LYRA2RE_2WAY( hash, 32, hashA, 32, 1, 8, 8 );
LYRA2RE_2WAY( hash + 2*32, 32, hashA + 2*32, 32, 1, 8, 8 );
dintrlv_2x256( hash6, hash7, hash, 512 );
intrlv_8x64_512( hash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7 );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, (const void*)hash, 64 );
jh512_8way_close( &ctx.jh, (void*)hash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, hash );
#if defined (__VAES__)
unsigned char _ALIGN(64) hashA0[64];
unsigned char _ALIGN(64) hashA1[64];
unsigned char _ALIGN(64) hashA2[64];
unsigned char _ALIGN(64) hashA3[64];
unsigned char _ALIGN(64) hashA4[64];
unsigned char _ALIGN(64) hashA5[64];
unsigned char _ALIGN(64) hashA6[64];
unsigned char _ALIGN(64) hashA7[64];
intrlv_4x128_512( hash, hash0, hash1, hash2, hash3 );
echo_4way_full( &ctx.echo, hash, 512, hash, 64 );
echo_4way_full( &ctx.echo, hash, 512, hash, 64 );
dintrlv_4x128_512( hashA0, hashA1, hashA2, hashA3, hash );
intrlv_4x128_512( hash, hash4, hash5, hash6, hash7 );
echo_4way_full( &ctx.echo, hash, 512, hash, 64 );
echo_4way_full( &ctx.echo, hash, 512, hash, 64 );
dintrlv_4x128_512( hashA4, hashA5, hashA6, hashA7, hash );
#endif
if ( hash0[0] & 1 )
{
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, (const void*)hash0, 64 );
sph_gost512_close( &ctx.gost, (void*)hash0 );
}
else
#if defined (__VAES__)
memcpy( hash0, hashA0, 64 );
#else
{
echo_full( &ctx.echo, (BitSequence *)hash0, 512,
(const BitSequence *)hash0, 64 );
echo_full( &ctx.echo, (BitSequence *)hash0, 512,
(const BitSequence *)hash0, 64 );
}
#endif
if ( hash1[0] & 1 )
{
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, (const void*)hash1, 64 );
sph_gost512_close( &ctx.gost, (void*)hash1 );
}
else
#if defined (__VAES__)
memcpy( hash1, hashA1, 64 );
#else
{
echo_full( &ctx.echo, (BitSequence *)hash1, 512,
(const BitSequence *)hash1, 64 );
echo_full( &ctx.echo, (BitSequence *)hash1, 512,
(const BitSequence *)hash1, 64 );
}
#endif
if ( hash2[0] & 1 )
{
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, (const void*)hash2, 64 );
sph_gost512_close( &ctx.gost, (void*)hash2 );
}
else
#if defined (__VAES__)
memcpy( hash2, hashA2, 64 );
#else
{
echo_full( &ctx.echo, (BitSequence *)hash2, 512,
(const BitSequence *)hash2, 64 );
echo_full( &ctx.echo, (BitSequence *)hash2, 512,
(const BitSequence *)hash2, 64 );
}
#endif
if ( hash3[0] & 1 )
{
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, (const void*)hash3, 64 );
sph_gost512_close( &ctx.gost, (void*)hash3 );
}
else
#if defined (__VAES__)
memcpy( hash3, hashA3, 64 );
#else
{
echo_full( &ctx.echo, (BitSequence *)hash3, 512,
(const BitSequence *)hash3, 64 );
echo_full( &ctx.echo, (BitSequence *)hash3, 512,
(const BitSequence *)hash3, 64 );
}
#endif
if ( hash4[0] & 1 )
{
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, (const void*)hash4, 64 );
sph_gost512_close( &ctx.gost, (void*)hash4 );
}
else
#if defined (__VAES__)
memcpy( hash4, hashA4, 64 );
#else
{
echo_full( &ctx.echo, (BitSequence *)hash4, 512,
(const BitSequence *)hash4, 64 );
echo_full( &ctx.echo, (BitSequence *)hash4, 512,
(const BitSequence *)hash4, 64 );
}
#endif
if ( hash5[0] & 1 )
{
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, (const void*)hash5, 64 );
sph_gost512_close( &ctx.gost, (void*)hash5 );
}
else
#if defined (__VAES__)
memcpy( hash5, hashA5, 64 );
#else
{
echo_full( &ctx.echo, (BitSequence *)hash5, 512,
(const BitSequence *)hash5, 64 );
echo_full( &ctx.echo, (BitSequence *)hash5, 512,
(const BitSequence *)hash5, 64 );
}
#endif
if ( hash6[0] & 1 )
{
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, (const void*)hash6, 64 );
sph_gost512_close( &ctx.gost, (void*)hash6 );
}
else
#if defined (__VAES__)
memcpy( hash6, hashA6, 64 );
#else
{
echo_full( &ctx.echo, (BitSequence *)hash6, 512,
(const BitSequence *)hash6, 64 );
echo_full( &ctx.echo, (BitSequence *)hash6, 512,
(const BitSequence *)hash6, 64 );
}
#endif
if ( hash7[0] & 1 )
{
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, (const void*)hash7, 64 );
sph_gost512_close( &ctx.gost, (void*)hash7 );
}
else
#if defined (__VAES__)
memcpy( hash7, hashA7, 64 );
#else
{
echo_full( &ctx.echo, (BitSequence *)hash7, 512,
(const BitSequence *)hash7, 64 );
echo_full( &ctx.echo, (BitSequence *)hash7, 512,
(const BitSequence *)hash7, 64 );
}
#endif
intrlv_8x64_512( hash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7 );
skein512_8way_init( &ctx.skein );
skein512_8way_update( &ctx.skein, (const void*)hash, 64 );
skein512_8way_close( &ctx.skein, (void*)hash );
for ( int i = 0; i < 4; i++ )
casti_m512i( state, i ) = _mm512_xor_si512( casti_m512i( hash, i ),
casti_m512i( hash, i+4 ) );
}
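The long if/else ladder above applies the same rule to every lane: bit 0 of the 512-bit intermediate hash selects GOST-512, otherwise ECHO-512 is applied twice (the VAES build precomputes the double ECHO for all lanes and copies it in when the branch falls that way). The per-lane logic condensed, with stand-in one-shot helpers in place of the repo's sph_gost512*() sequence and echo_full():

// gost512_oneshot / echo512_oneshot are placeholders, not repo APIs.
void gost512_oneshot( void *out, const void *in, int len );
void echo512_oneshot( void *out, const void *in, int len );

static void phi2_branch_sketch( unsigned char hash[64] )
{
   if ( hash[0] & 1 )
      gost512_oneshot( hash, hash, 64 );
   else
   {
      echo512_oneshot( hash, hash, 64 );   // ECHO applied twice
      echo512_oneshot( hash, hash, 64 );
   }
}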
int scanhash_phi2_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(128) hash[16*8];
uint32_t _ALIGN(128) edata[36*8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t *hash7 = &(hash[49]);
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
if ( bench ) ptarget[7] = 0x00ff;
phi2_has_roots = false;
for ( int i = 0; i < 36; i++ )
{
be32enc( &edata[i], pdata[i] );
edata[ i + 36 ] = edata[ i + 2*36 ] = edata[ i + 3*36 ] =
edata[ i + 4*36 ] = edata[ i + 5*36 ] = edata[ i + 6*36 ] =
edata[ i + 7*36 ] = edata[ i ];
if ( i >= 20 && pdata[i] ) phi2_has_roots = true;
}
edata[ 19 ] = n;
edata[ 36 + 19 ] = n+1;
edata[ 2*36 + 19 ] = n+2;
edata[ 3*36 + 19 ] = n+3;
edata[ 4*36 + 19 ] = n+4;
edata[ 5*36 + 19 ] = n+5;
edata[ 6*36 + 19 ] = n+6;
edata[ 7*36 + 19 ] = n+7;
do {
phi2_8way_hash( hash, edata );
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) )
{
uint64_t _ALIGN(64) lane_hash[8];
extr_lane_8x64( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) )
{
be32enc( pdata + 19, n + lane );
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 8;
edata[ 19 ] += 8;
edata[ 36 + 19 ] += 8;
edata[ 2*36 + 19 ] += 8;
edata[ 3*36 + 19 ] += 8;
edata[ 4*36 + 19 ] += 8;
edata[ 5*36 + 19 ] += 8;
edata[ 6*36 + 19 ] += 8;
edata[ 7*36 + 19 ] += 8;
} while ( (n < last_nonce) && !work_restart[thr_id].restart);
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
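Because phi2's input grows from 80 to 144 bytes when the block carries roots, scanhash_phi2_8way() keeps eight separate 36-word headers instead of an interleaved nonce vector, and advances each copy's word 19 by hand. That bookkeeping in one illustrative helper:

#include <stdint.h>

// Advance the nonce word in each of `lanes` consecutive 36-word
// headers, mirroring the eight edata[ k*36 + 19 ] += 8 lines above.
static void phi2_bump_nonces_sketch( uint32_t *edata, int lanes,
                                     uint32_t step )
{
   for ( int l = 0; l < lanes; l++ )
      edata[ l*36 + 19 ] += step;
}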
#elif defined(PHI2_4WAY)
#include "algo/echo/aes_ni/hash_api.h"
typedef struct {
cubehashParam cube;
jh512_4way_context jh;
#if defined(__AES__)
hashState_echo echo;
#else
sph_echo512_context echo;
#endif
// hashState_echo echo2;
sph_gost512_context gost;
skein512_4way_context skein;
} phi2_ctx_holder;
/*
phi2_ctx_holder phi2_ctx;

void init_phi2_ctx()
{
   cubehashInit( &phi2_ctx.cube, 512, 16, 32 );
   sph_jh512_init(&phi2_ctx.jh);
   init_echo( &phi2_ctx.echo1, 512 );
   init_echo( &phi2_ctx.echo2, 512 );
   sph_gost512_init(&phi2_ctx.gost);
   sph_skein512_init(&phi2_ctx.skein);
};
*/
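
For orientation, the 8-way and 4-way paths implement the same per-lane chain. A rough scalar outline, with hypothetical one-shot wrapper names standing in for the real contexts:

   // One PHI2 lane as implemented here: CubeHash-512 over the 80- or
   // 144-byte header, Lyra2 over each 32-byte half, JH-512, then
   // GOST-512 (low bit set) or double Echo-512 (low bit clear),
   // Skein-512, and a 512->256-bit XOR fold.
   void phi2_lane_outline( uint64_t out[4], const void *hdr, int size )
   {
      unsigned char a[64], b[64];
      cubehash_512( a, hdr, size );           // hypothetical wrappers
      LYRA2RE( b,      32, a,      32, a,      32, 1, 8, 8 );
      LYRA2RE( b + 32, 32, a + 32, 32, a + 32, 32, 1, 8, 8 );
      jh_512( a, b, 64 );
      if ( a[0] & 1 )   gost_512( a, a, 64 );
      else            { echo_512( a, a, 64 ); echo_512( a, a, 64 ); }
      skein_512( a, a, 64 );
      for ( int i = 0; i < 4; i++ )
         out[i] = ((uint64_t*)a)[i] ^ ((uint64_t*)a)[i+4];
   }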
void phi2_hash_4way( void *state, const void *input )
{
   uint32_t hash[4][16] __attribute__ ((aligned (64)));
   uint32_t hashA[4][16] __attribute__ ((aligned (64)));
   uint32_t hashB[4][16] __attribute__ ((aligned (64)));
   uint32_t vhash[4*16] __attribute__ ((aligned (64)));
//   unsigned char _ALIGN(128) hash[64];
//   unsigned char _ALIGN(128) hashA[64];
//   unsigned char _ALIGN(128) hashB[64];

   phi2_ctx_holder ctx __attribute__ ((aligned (64)));
//   memcpy( &ctx, &phi2_ctx, sizeof(phi2_ctx) );

   cubehashInit( &ctx.cube, 512, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*)hashB[0], (const byte*)input,
                         phi2_has_roots ? 144 : 80 );
   cubehashInit( &ctx.cube, 512, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*)hashB[1], (const byte*)input+144,
                         phi2_has_roots ? 144 : 80 );
   cubehashInit( &ctx.cube, 512, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*)hashB[2], (const byte*)input+288,
                         phi2_has_roots ? 144 : 80 );
   cubehashInit( &ctx.cube, 512, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*)hashB[3], (const byte*)input+432,
                         phi2_has_roots ? 144 : 80 );

   LYRA2RE( &hashA[0][0], 32, &hashB[0][0], 32, &hashB[0][0], 32, 1, 8, 8 );
   LYRA2RE( &hashA[0][8], 32, &hashB[0][8], 32, &hashB[0][8], 32, 1, 8, 8 );
   LYRA2RE( &hashA[1][0], 32, &hashB[1][0], 32, &hashB[1][0], 32, 1, 8, 8 );
   LYRA2RE( &hashA[1][8], 32, &hashB[1][8], 32, &hashB[1][8], 32, 1, 8, 8 );
   LYRA2RE( &hashA[2][0], 32, &hashB[2][0], 32, &hashB[2][0], 32, 1, 8, 8 );
   LYRA2RE( &hashA[2][8], 32, &hashB[2][8], 32, &hashB[2][8], 32, 1, 8, 8 );
   LYRA2RE( &hashA[3][0], 32, &hashB[3][0], 32, &hashB[3][0], 32, 1, 8, 8 );
   LYRA2RE( &hashA[3][8], 32, &hashB[3][8], 32, &hashB[3][8], 32, 1, 8, 8 );

   intrlv_4x64( vhash, hashA[0], hashA[1], hashA[2], hashA[3], 512 );

   jh512_4way_init( &ctx.jh );
   jh512_4way( &ctx.jh, vhash, 64 );
   jh512_4way_close( &ctx.jh, vhash );

   dintrlv_4x64( hash[0], hash[1], hash[2], hash[3], vhash, 512 );

   if ( hash[0][0] & 1 )
   {
      sph_gost512_init( &ctx.gost );
      sph_gost512( &ctx.gost, (const void*)hash[0], 64 );
      sph_gost512_close( &ctx.gost, (void*)hash[0] );
   }
   else
   {
      init_echo( &ctx.echo, 512 );
      update_final_echo ( &ctx.echo, (BitSequence *)hash[0],
                          (const BitSequence *)hash[0], 512 );
      init_echo( &ctx.echo, 512 );
      update_final_echo ( &ctx.echo, (BitSequence *)hash[0],
                          (const BitSequence *)hash[0], 512 );
   }

   if ( hash[1][0] & 1 )
   {
      sph_gost512_init( &ctx.gost );
      sph_gost512( &ctx.gost, (const void*)hash[1], 64 );
      sph_gost512_close( &ctx.gost, (void*)hash[1] );
   }
   else
   {
      init_echo( &ctx.echo, 512 );
      update_final_echo ( &ctx.echo, (BitSequence *)hash[1],
                          (const BitSequence *)hash[1], 512 );
      init_echo( &ctx.echo, 512 );
      update_final_echo ( &ctx.echo, (BitSequence *)hash[1],
                          (const BitSequence *)hash[1], 512 );
   }

   if ( hash[2][0] & 1 )
   {
      sph_gost512_init( &ctx.gost );
      sph_gost512( &ctx.gost, (const void*)hash[2], 64 );
      sph_gost512_close( &ctx.gost, (void*)hash[2] );
   }
   else
   {
      init_echo( &ctx.echo, 512 );
      update_final_echo ( &ctx.echo, (BitSequence *)hash[2],
                          (const BitSequence *)hash[2], 512 );
      init_echo( &ctx.echo, 512 );
      update_final_echo ( &ctx.echo, (BitSequence *)hash[2],
                          (const BitSequence *)hash[2], 512 );
   }

   if ( hash[3][0] & 1 )
   {
      sph_gost512_init( &ctx.gost );
      sph_gost512( &ctx.gost, (const void*)hash[3], 64 );
      sph_gost512_close( &ctx.gost, (void*)hash[3] );
   }
   else
   {
      init_echo( &ctx.echo, 512 );
      update_final_echo ( &ctx.echo, (BitSequence *)hash[3],
                          (const BitSequence *)hash[3], 512 );
      init_echo( &ctx.echo, 512 );
      update_final_echo ( &ctx.echo, (BitSequence *)hash[3],
                          (const BitSequence *)hash[3], 512 );
   }

   intrlv_4x64( vhash, hash[0], hash[1], hash[2], hash[3], 512 );

   skein512_4way_init( &ctx.skein );
   skein512_4way( &ctx.skein, vhash, 64 );
   skein512_4way_close( &ctx.skein, vhash );

   for ( int i = 0; i < 4; i++ )
   {
      ( (uint64_t*)vhash )[i]    ^= ( (uint64_t*)vhash )[i+4];
      ( (uint64_t*)vhash+ 8 )[i] ^= ( (uint64_t*)vhash+ 8 )[i+4];
      ( (uint64_t*)vhash+16 )[i] ^= ( (uint64_t*)vhash+16 )[i+4];
      ( (uint64_t*)vhash+24 )[i] ^= ( (uint64_t*)vhash+24 )[i+4];
   }
//   for ( int i = 0; i < 4; i++ )
//      casti_m256i( vhash, i ) = _mm256_xor_si256( casti_m256i( vhash, i ),
//                                                  casti_m256i( vhash, i+4 ) );

   memcpy( state, vhash, 128 );
}
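
The intrlv_4x64/dintrlv_4x64 helpers used above pack four lanes' 64-bit words side by side, so each __m256i row of the vector buffer holds one word position across all lanes. A sketch of the assumed layout:

   // Sketch of 4x64 interleaving as assumed above: 64-bit word q of
   // lane l lands at index q*4 + l.
   static void intrlv_4x64_sketch( uint64_t *v, const uint64_t *l0,
             const uint64_t *l1, const uint64_t *l2, const uint64_t *l3,
             int bits )
   {
      for ( int q = 0; q < bits/64; q++ )
      {
         v[ q*4 + 0 ] = l0[q];
         v[ q*4 + 1 ] = l1[q];
         v[ q*4 + 2 ] = l2[q];
         v[ q*4 + 3 ] = l3[q];
      }
   }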
int scanhash_phi2_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
   uint32_t _ALIGN(128) hash[16*4];
   uint32_t _ALIGN(128) edata[36];
   uint32_t vdata[4][36] __attribute__ ((aligned (64)));
   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t *hash7 = &(hash[25]);   // 3*8+1
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   int thr_id = mythr->id;  // thr_id arg is deprecated

   if(opt_benchmark){
      ptarget[7] = 0x00ff;
   }

// Data is not interleaved, but hash is.
// Any non-zero data at index 20 or above sets roots true.
// Split up the operations: bswap first, then set roots.
   phi2_has_roots = false;
   for ( int i = 0; i < 36; i++ )
   {
      be32enc( &edata[i], pdata[i] );
      if ( i >= 20 && pdata[i] ) phi2_has_roots = true;
   }
/*
   casti_m256i( vdata[0], 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
   casti_m256i( vdata[0], 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
   casti_m256i( vdata[0], 2 ) = mm256_bswap_32( casti_m256i( pdata, 2 ) );
   casti_m256i( vdata[0], 3 ) = mm256_bswap_32( casti_m256i( pdata, 3 ) );
   casti_m128i( vdata[0], 8 ) = mm128_bswap_32( casti_m128i( pdata, 8 ) );
   phi2_has_roots = mm128_anybits1( casti_m128i( vdata[0], 5 ) ) ||
                    mm128_anybits1( casti_m128i( vdata[0], 6 ) ) ||
                    mm128_anybits1( casti_m128i( vdata[0], 7 ) ) ||
                    mm128_anybits1( casti_m128i( vdata[0], 8 ) );
*/
   memcpy( vdata[0], edata, 144 );
   memcpy( vdata[1], edata, 144 );
   memcpy( vdata[2], edata, 144 );
   memcpy( vdata[3], edata, 144 );

   do {
      be32enc( &vdata[0][19], n   );
      be32enc( &vdata[1][19], n+1 );
      be32enc( &vdata[2][19], n+2 );
      be32enc( &vdata[3][19], n+3 );

      phi2_hash_4way( hash, vdata );

      for ( int lane = 0; lane < 4; lane++ ) if ( hash7[ lane<<1 ] < Htarg )
      {
         extr_lane_4x64( lane_hash, hash, lane, 256 );
         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
         {
            pdata[19] = n + lane;
            submit_lane_solution( work, lane_hash, mythr, lane );
         }
      }
      n += 4;
   } while ( ( n < max_nonce - 4 ) && !work_restart[thr_id].restart );

   *hashes_done = n - first_nonce + 1;
   return 0;
}
#endif   // PHI2_4WAY
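
The `hash7 = &(hash[25])` pointer used by the 4-way scan above indexes the high 32-bit word of each lane's fourth qword directly in the interleaved buffer: with 4x64 interleaving, lane l's qword 3 sits at 64-bit index 12+l, i.e. 32-bit index 25+2l, which is exactly what `hash7[ lane<<1 ]` reads. Equivalent arithmetic under that assumed layout:

   // Equivalent of hash7[ lane<<1 ] with 4x64 interleaving (word q of
   // lane l at 64-bit index q*4 + l).
   static inline uint32_t lane_hash7( const uint32_t *hash, int lane )
   {
      return hash[ 2*( 3*4 + lane ) + 1 ];   // == hash[ 25 + 2*lane ]
   }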

View File

@@ -96,29 +96,32 @@ int scanhash_phi2( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
   uint32_t _ALIGN(128) hash[8];
   uint32_t _ALIGN(128) endiandata[36];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   int thr_id = mythr->id;  // thr_id arg is deprecated

   if(opt_benchmark){
      ptarget[7] = 0x00ff;
   }

   phi2_has_roots = false;
   for ( int i=0; i < 36; i++ )
   {
      be32enc(&endiandata[i], pdata[i]);
      if ( i >= 20 && pdata[i] ) phi2_has_roots = true;
   }

   do {
      be32enc( &endiandata[19], n );
      phi2_hash( hash, endiandata );
      if ( hash[7] < Htarg && fulltest( hash, ptarget ) && !opt_benchmark )
      {
         pdata[19] = n;
         submit_solution( work, hash, mythr );
      }
      n++;
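
The be32enc used throughout stores a 32-bit value big-endian, the byte order the hash cores expect for the block header. A minimal stand-alone version for reference; the miner's own definition lives in its compatibility headers:

   #include <stdint.h>

   // Minimal big-endian store, equivalent in effect to the be32enc
   // calls above.
   static inline void be32enc_sketch( void *pp, uint32_t x )
   {
      uint8_t *p = (uint8_t *)pp;
      p[0] = x >> 24;
      p[1] = x >> 16;
      p[2] = x >> 8;
      p[3] = x;
   }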

View File

@@ -89,9 +89,6 @@ inline void initState( uint64_t State[/*16*/] )
*
* @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function
*/
inline static void blake2bLyra( uint64_t *v )
{
ROUND_LYRA(0);
@@ -117,8 +114,6 @@ inline static void reducedBlake2bLyra( uint64_t *v )
ROUND_LYRA(0);
}
/**
* Performs a squeeze operation, using Blake2b's G function as the
* internal permutation

View File

@@ -171,6 +171,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7)
#endif // AVX2 else SSE2
// Scalar
@@ -199,6 +200,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
G(r,7,v[ 3],v[ 4],v[ 9],v[14]);
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
union _ovly_512
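
The G(r,i,...) calls above expand to Blake2b's quarter-round built from 64-bit adds and the rotr64 defined in this file. For reference, the scalar form, with message-word injection omitted because Lyra2's reduced-round sponge permutes state only:

   // Scalar Blake2b G as used by Lyra2's sponge (no message words).
   #define G_SKETCH( a, b, c, d ) \
   { \
      a += b; d = rotr64( d ^ a, 32 ); \
      c += d; b = rotr64( b ^ c, 24 ); \
      a += b; d = rotr64( d ^ a, 16 ); \
      c += d; b = rotr64( b ^ c, 63 ); \
   }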

View File

@@ -267,13 +267,8 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
SHA256_Final( (unsigned char*) hash, &ctxf_sha256 );
}
      if ( unlikely( hash[7] <= ptarget[7] ) )
      if ( likely( fulltest( hash, ptarget ) && !opt_benchmark ) )
{
if ( opt_debug )
{

View File

@@ -1,7 +1,4 @@
#include "nist5-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -108,4 +105,13 @@ int scanhash_nist5( struct work *work, uint32_t max_nonce,
pdata[19] = n;
return 0;
}
/*
bool register_nist5_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT;
init_nist5_ctx();
gate->scanhash = (void*)&scanhash_nist5;
gate->hash = (void*)&nist5hash;
return true;
};
*/
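
The commented-out register function above shows the gate pattern used across the code base: each algorithm fills an algo_gate_t with its scanhash/hash entry points and capability flags at startup. An abridged sketch of the shape; the real algo_gate_t in algo-gate-api.h carries many more members:

   #include <stdint.h>

   struct work; struct thr_info;   // defined in miner.h

   // Abridged sketch of the algo-gate dispatch structure.
   typedef struct
   {
      int  ( *scanhash )( struct work*, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info* );
      void ( *hash )( void *state, const void *input );
      uint32_t optimizations;
   } algo_gate_sketch_t;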

View File

@@ -154,13 +154,14 @@ int scanhash_zr5( struct work *work, uint32_t max_nonce,
}
void zr5_get_new_work( struct work* work, struct work* g_work, int thr_id,
                       uint32_t* end_nonce_ptr, bool clean_job )
{
   // ignore POK in first word
// const int nonce_i = 19;
   const int wkcmp_sz = 72;  // (19-1) * sizeof(uint32_t)
   uint32_t *nonceptr = algo_gate.get_nonceptr( work->data );
   if ( memcmp( &work->data[1], &g_work->data[1], wkcmp_sz )
        && ( clean_job || ( *nonceptr >= *end_nonce_ptr ) ) )
{
work_free( work );
work_copy( work, g_work );
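
The 72-byte compare above deliberately skips word 0, which carries the POK marker, and word 19, the nonce: it tests header words 1 through 18 only, so a changed POK or nonce alone does not force a work refresh. A sketch of the compare window:

   #include <stdint.h>
   #include <string.h>

   // Stale-work test as above: compare header words 1..18 only,
   // excluding POK (word 0) and the nonce (word 19).
   static int work_header_changed( const uint32_t *w, const uint32_t *g )
   {
      return memcmp( &w[1], &g[1], 18 * sizeof(uint32_t) ) != 0;
   }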

View File

@@ -1,241 +1,18 @@
#include "cpuminer-config.h"
#include "anime-gate.h"
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#if defined(__VAES__)
#include "algo/groestl/groestl512-hash-4way.h"
#endif
#if defined (ANIME_8WAY)
typedef struct {
blake512_8way_context blake;
bmw512_8way_context bmw;
#if defined(__VAES__)
groestl512_4way_context groestl;
#else
hashState_groestl groestl;
#endif
jh512_8way_context jh;
skein512_8way_context skein;
keccak512_8way_context keccak;
} anime_8way_ctx_holder;
anime_8way_ctx_holder anime_8way_ctx __attribute__ ((aligned (64)));
void init_anime_8way_ctx()
{
blake512_8way_init( &anime_8way_ctx.blake );
bmw512_8way_init( &anime_8way_ctx.bmw );
#if defined(__VAES__)
groestl512_4way_init( &anime_8way_ctx.groestl, 64 );
#else
init_groestl( &anime_8way_ctx.groestl, 64 );
#endif
skein512_8way_init( &anime_8way_ctx.skein );
jh512_8way_init( &anime_8way_ctx.jh );
keccak512_8way_init( &anime_8way_ctx.keccak );
}
void anime_8way_hash( void *state, const void *input )
{
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
uint64_t vhashA[8*8] __attribute__ ((aligned (64)));
uint64_t vhashB[8*8] __attribute__ ((aligned (64)));
uint64_t vhashC[8*8] __attribute__ ((aligned (64)));
#if !defined(__VAES__)
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t hash4[8] __attribute__ ((aligned (64)));
uint64_t hash5[8] __attribute__ ((aligned (64)));
uint64_t hash6[8] __attribute__ ((aligned (64)));
uint64_t hash7[8] __attribute__ ((aligned (64)));
#endif
__m512i* vh = (__m512i*)vhash;
__m512i* vhA = (__m512i*)vhashA;
__m512i* vhB = (__m512i*)vhashB;
__m512i* vhC = (__m512i*)vhashC;
const __m512i bit3_mask = m512_const1_64( 8 );
const __m512i zero = _mm512_setzero_si512();
__mmask8 vh_mask;
anime_8way_ctx_holder ctx;
memcpy( &ctx, &anime_8way_ctx, sizeof(anime_8way_ctx) );
bmw512_8way_full( &ctx.bmw, vhash, input, 80 );
blake512_8way_full( &ctx.blake, vhash, vhash, 64 );
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
zero );
#if defined(__VAES__)
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
if ( ( vh_mask & 0x0f ) != 0x0f )
groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 );
if ( ( vh_mask & 0xf0 ) != 0xf0 )
groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 );
rintrlv_4x128_8x64( vhashC, vhashA, vhashB, 512 );
#else
dintrlv_8x64_512( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash );
if ( hash0[0] & 8 )
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
if ( hash1[0] & 8 )
groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
if ( hash2[0] & 8)
groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
if ( hash3[0] & 8 )
groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
if ( hash4[0] & 8 )
groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
if ( hash5[0] & 8 )
groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
if ( hash6[0] & 8 )
groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
if ( hash7[0] & 8 )
groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
intrlv_8x64_512( vhashC, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7 );
#endif
if ( vh_mask & 0xff )
skein512_8way_full( &ctx.skein, vhashB, vhash, 64 );
mm512_blend_hash_8x64( vh, vhC, vhB, vh_mask );
#if defined(__VAES__)
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 );
groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 );
rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );
#else
dintrlv_8x64_512( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash );
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7 );
#endif
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
zero );
if ( ( vh_mask & 0xff ) != 0xff )
blake512_8way_full( &ctx.blake, vhashA, vhash, 64 );
if ( vh_mask & 0xff )
bmw512_8way_full( &ctx.bmw, vhashB, vhash, 64 );
mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
skein512_8way_full( &ctx.skein, vhash, vhash, 64 );
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
zero );
if ( ( vh_mask & 0xff ) != 0xff )
{
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhashA );
}
if ( vh_mask & 0xff )
{
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhashB );
}
casti_m512i( state,0 ) = _mm512_mask_blend_epi64( vh_mask, vhA[0], vhB[0] );
casti_m512i( state,1 ) = _mm512_mask_blend_epi64( vh_mask, vhA[1], vhB[1] );
casti_m512i( state,2 ) = _mm512_mask_blend_epi64( vh_mask, vhA[2], vhB[2] );
casti_m512i( state,3 ) = _mm512_mask_blend_epi64( vh_mask, vhA[3], vhB[3] );
}
int scanhash_anime_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint64_t hash64[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint64_t *hash64_q3 = &(hash64[3*8]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint64_t targ64_q3 = ((uint64_t*)ptarget)[3];
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
__m512i *noncev = (__m512i*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
*noncev = mm512_intrlv_blend_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev );
do
{
anime_8way_hash( hash64, vdata );
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash64_q3[ lane ] <= targ64_q3 && !bench ) )
{
extr_lane_8x64( lane_hash, hash64, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) )
{
pdata[19] = bswap_32( n + lane );
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
n += 8;
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
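
The quark-family branch logic above keys on bit 3 of each lane's first 64-bit word: lanes with the bit set take the Groestl path, the rest take Skein, and the partial results are stitched back together by the blend helpers. Scalar equivalent of one decision, as the deinterleaved fallback path does it (wrapper names hypothetical):

   // Scalar form of the per-lane branch the 8-way code evaluates with
   // _mm512_cmpeq_epi64_mask: bit 3 of the first qword picks the hash.
   static void branch_sketch( uint64_t h[8] )
   {
      if ( h[0] & 8 )
         groestl512_lane( h );   // hypothetical scalar wrappers
      else
         skein512_lane( h );
   }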
#elif defined (ANIME_4WAY)
typedef struct {
blake512_4way_context blake;
@@ -246,6 +23,18 @@ typedef struct {
keccak512_4way_context keccak;
} anime_4way_ctx_holder;
anime_4way_ctx_holder anime_4way_ctx __attribute__ ((aligned (64)));
void init_anime_4way_ctx()
{
blake512_4way_init( &anime_4way_ctx.blake );
bmw512_4way_init( &anime_4way_ctx.bmw );
init_groestl( &anime_4way_ctx.groestl, 64 );
skein512_4way_init( &anime_4way_ctx.skein );
jh512_4way_init( &anime_4way_ctx.jh );
keccak512_4way_init( &anime_4way_ctx.keccak );
}
void anime_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
@@ -259,61 +48,81 @@ void anime_4way_hash( void *state, const void *input )
__m256i* vhA = (__m256i*)vhashA;
__m256i* vhB = (__m256i*)vhashB;
   __m256i vh_mask;
   const uint32_t mask = 8;
   const __m256i bit3_mask = m256_const1_64( 8 );
   const __m256i zero = _mm256_setzero_si256();
   anime_4way_ctx_holder ctx;
   memcpy( &ctx, &anime_4way_ctx, sizeof(anime_4way_ctx) );

   bmw512_4way_init( &ctx.bmw );
   bmw512_4way_update( &ctx.bmw, input, 80 );
   bmw512_4way_close( &ctx.bmw, vhash );

   blake512_4way_update( &ctx.blake, vhash, 64 );
   blake512_4way_close( &ctx.blake, vhash );

   vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );

   dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

// A
   if ( hash0[0] & mask )
   {
      update_and_final_groestl( &ctx.groestl, (char*)hash0,
                                (char*)hash0, 512 );
   }
   if ( hash1[0] & mask )
   {
      reinit_groestl( &ctx.groestl );
      update_and_final_groestl( &ctx.groestl, (char*)hash1,
                                (char*)hash1, 512 );
   }
   if ( hash2[0] & mask )
   {
      reinit_groestl( &ctx.groestl );
      update_and_final_groestl( &ctx.groestl, (char*)hash2,
                                (char*)hash2, 512 );
   }
   if ( hash3[0] & mask )
   {
      reinit_groestl( &ctx.groestl );
      update_and_final_groestl( &ctx.groestl, (char*)hash3,
                                (char*)hash3, 512 );
   }
   intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );

// B
   if ( mm256_anybits0( vh_mask ) )
   {
      skein512_4way_update( &ctx.skein, vhash, 64 );
      skein512_4way_close( &ctx.skein, vhashB );
   }

   mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
   dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

   reinit_groestl( &ctx.groestl );
   update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
   reinit_groestl( &ctx.groestl );
   update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
   reinit_groestl( &ctx.groestl );
   update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
   reinit_groestl( &ctx.groestl );
   update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
   intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );

   jh512_4way_init( &ctx.jh );
   jh512_4way_update( &ctx.jh, vhash, 64 );
   jh512_4way_close( &ctx.jh, vhash );

   vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );

// A
   if ( mm256_anybits1( vh_mask ) )
   {
      blake512_4way_init( &ctx.blake );
      blake512_4way_update( &ctx.blake, vhash, 64 );
      blake512_4way_close( &ctx.blake, vhashA );
   }

// B
   if ( mm256_anybits0( vh_mask ) )
   {
      bmw512_4way_init( &ctx.bmw );
      bmw512_4way_update( &ctx.bmw, vhash, 64 );
@@ -322,76 +131,90 @@ void anime_4way_hash( void *state, const void *input )
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
skein512_4way_init( &ctx.skein );
skein512_4way_update( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
// A
if ( mm256_anybits1( vh_mask ) )
{
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhashA );
}
// B
if ( mm256_anybits0( vh_mask ) )
{
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhashB );
}
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
dintrlv_4x64( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_anime_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
   uint32_t hash[4*8] __attribute__ ((aligned (64)));
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
   __m256i *noncev = (__m256i*)vdata + 9;   // aligned
   int thr_id = mythr->id;  // thr_id arg is deprecated
   const uint32_t Htarg = ptarget[7];

   uint64_t htmax[] = {
                0,
              0xF,
             0xFF,
            0xFFF,
           0xFFFF,
       0x10000000
   };
   uint32_t masks[] = {
       0xFFFFFFFF,
       0xFFFFFFF0,
       0xFFFFFF00,
       0xFFFFF000,
       0xFFFF0000,
                0
   };

   mm256_bswap32_intrlv80_4x64( vdata, pdata );

   for ( int m = 0; m < 6; m++ )
   if ( Htarg <= htmax[m] )
   {
      uint32_t mask = masks[m];
      do
      {
         *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
                   _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
         anime_4way_hash( hash, vdata );
         pdata[19] = n;

         for ( int i = 0; i < 4; i++ )
         if ( ( ( (hash+(i<<3))[7] & mask ) == 0 )
              && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
         {
            pdata[19] = n+i;
            submit_lane_solution( work, hash+(i<<3), mythr, i );
         }
         n += 4;
      } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
      break;
   }

   *hashes_done = n - first_nonce + 1;
   return 0;
}
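
The htmax/masks ladder above picks, once per work unit, the cheapest 32-bit prefilter consistent with the share target: the scan selects the first htmax entry at or above Htarg and rejects any hash whose word 7 has bits set inside the paired mask, falling back to fulltest only on survivors. A sketch of the selection:

   #include <stdint.h>

   // Choose the coarsest reject mask that cannot discard a hash
   // meeting Htarg, mirroring the table-driven loop above.
   static uint32_t pick_mask( uint32_t Htarg )
   {
      static const uint64_t htmax[] = { 0, 0xF, 0xFF, 0xFFF, 0xFFFF,
                                        0x10000000 };
      static const uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                                        0xFFFFF000, 0xFFFF0000, 0 };
      for ( int m = 0; m < 6; m++ )
         if ( Htarg <= htmax[m] ) return masks[m];
      return 0;
   }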

View File

@@ -2,10 +2,8 @@
bool register_anime_algo( algo_gate_t* gate )
{
#if defined (ANIME_4WAY)
init_anime_4way_ctx();
gate->scanhash = (void*)&scanhash_anime_4way;
gate->hash = (void*)&anime_4way_hash;
#else
@@ -13,7 +11,7 @@ bool register_anime_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_anime;
gate->hash = (void*)&anime_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
return true;
};

View File

@@ -4,25 +4,18 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
#define ANIME_4WAY
#endif
bool register_anime_algo( algo_gate_t* gate );
#if defined(ANIME_4WAY)
void anime_4way_hash( void *state, const void *input );
int scanhash_anime_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_anime_4way_ctx();
#endif

View File

@@ -1,8 +1,5 @@
#include "cpuminer-config.h"
#include "anime-gate.h"
#include <stdio.h>
#include <string.h>
#include <stdint.h>
@@ -126,29 +123,50 @@ int scanhash_anime( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[8] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
int thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = {
0,
0xF,
0xFF,
0xFFF,
0xFFFF,
0x10000000
};
uint32_t masks[] = {
0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
0xFFFFF000,
0xFFFF0000,
0
};
swab32_array( endiandata, pdata, 20 );
for (int m=0; m < 6; m++)
if (Htarg <= htmax[m])
{
uint32_t mask = masks[m];
do
{
be32enc( &endiandata[19], n );
anime_hash( hash, endiandata );
pdata[19] = n;
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
submit_solution( work, hash, mythr );
n++;
} while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}

File diff suppressed because it is too large

View File

@@ -1,7 +1,4 @@
#include "hmq1725-gate.h"
#include <string.h>
#include <stdint.h>
#include "algo/blake/sph_blake.h"
@@ -10,7 +7,10 @@
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/luffa/sph_luffa.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sph_simd.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
@@ -21,9 +21,6 @@
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#endif
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
@@ -395,4 +392,3 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
pdata[19] = n;
return 0;
}

View File

@@ -72,10 +72,12 @@ void quark_8way_hash( void *state, const void *input )
memcpy( &ctx, &quark_8way_ctx, sizeof(quark_8way_ctx) );
blake512_8way_update( &ctx.blake, input, 80 );
blake512_8way_close( &ctx.blake, vhash );
bmw512_8way_update( &ctx.bmw, vhash, 64 );
bmw512_8way_close( &ctx.bmw, vhash );
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
zero );
@@ -84,34 +86,70 @@ void quark_8way_hash( void *state, const void *input )
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
if ( ( vh_mask & 0x0f ) != 0x0f )
{
groestl512_4way_init( &ctx.groestl, 64 );
groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 );
}
if ( ( vh_mask & 0xf0 ) != 0xf0 )
{
groestl512_4way_init( &ctx.groestl, 64 );
groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 );
}
rintrlv_4x128_8x64( vhashC, vhashA, vhashB, 512 );
#else
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, 512 );
if ( hash0[0] & mask )
{
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(char*)hash0, 512 );
}
if ( hash1[0] & mask )
{
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(char*)hash1, 512 );
}
if ( hash2[0] & mask )
{
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(char*)hash2, 512 );
}
if ( hash3[0] & mask )
{
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(char*)hash3, 512 );
}
if ( hash4[0] & mask )
{
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash4,
(char*)hash4, 512 );
}
if ( hash5[0] & mask )
{
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash5,
(char*)hash5, 512 );
}
if ( hash6[0] & mask )
{
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash6,
(char*)hash6, 512 );
}
if ( hash7[0] & mask )
{
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash7,
(char*)hash7, 512 );
}
intrlv_8x64( vhashC, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, 512 );
@@ -119,7 +157,10 @@ void quark_8way_hash( void *state, const void *input )
#endif
if ( vh_mask & 0xff )
{
skein512_8way_update( &ctx.skein, vhash, 64 );
skein512_8way_close( &ctx.skein, vhashB );
}
mm512_blend_hash_8x64( vh, vhC, vhB, vh_mask );
@@ -127,10 +168,10 @@ void quark_8way_hash( void *state, const void *input )
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
groestl512_4way_init( &ctx.groestl, 64 );
groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 );
groestl512_4way_init( &ctx.groestl, 64 );
groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 );
rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );
@@ -139,22 +180,22 @@ void quark_8way_hash( void *state, const void *input )
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
512 );
@@ -168,16 +209,27 @@ void quark_8way_hash( void *state, const void *input )
zero );
if ( ( vh_mask & 0xff ) != 0xff )
{
blake512_8way_init( &ctx.blake );
blake512_8way_update( &ctx.blake, vhash, 64 );
blake512_8way_close( &ctx.blake, vhashA );
}
if ( vh_mask & 0xff )
{
bmw512_8way_init( &ctx.bmw );
bmw512_8way_update( &ctx.bmw, vhash, 64 );
bmw512_8way_close( &ctx.bmw, vhashB );
}
mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
skein512_8way_init( &ctx.skein );
skein512_8way_update( &ctx.skein, vhash, 64 );
skein512_8way_close( &ctx.skein, vhash );
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
zero );
@@ -206,44 +258,41 @@ void quark_8way_hash( void *state, const void *input )
int scanhash_quark_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
   uint32_t hash[8*8] __attribute__ ((aligned (128)));
   uint32_t vdata[24*8] __attribute__ ((aligned (64)));
   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
   uint32_t *hash7 = &(hash[49]);
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
   __m512i *noncev = (__m512i*)vdata + 9;   // aligned
   int thr_id = mythr->id;
   const uint32_t Htarg = ptarget[7];

   mm512_bswap32_intrlv80_8x64( vdata, pdata );
   do
   {
      *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
                _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
                                  n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );

      quark_8way_hash( hash, vdata );
      pdata[19] = n;

      for ( int i = 0; i < 8; i++ )
      if ( unlikely( hash7[ i<<1 ] <= Htarg ) )
      {
         extr_lane_8x64( lane_hash, hash, i, 256 );
         if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
         {
            pdata[19] = n+i;
            submit_lane_solution( work, lane_hash, mythr, i );
         }
      }
      n += 8;
   } while ( ( n < max_nonce-8 ) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
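
As in the 4-way phi2 scan, `hash7 = &(hash[49])` points at the high 32-bit word of qword 3 in the interleaved buffer: with 8x64 interleaving, lane i's qword 3 sits at 64-bit index 24+i, i.e. 32-bit index 49+2i, so `hash7[ i<<1 ]` is lane i's most significant target word. Under that assumed layout:

   // hash7[ i<<1 ] under 8x64 interleaving (64-bit word q of lane l at
   // index q*8 + l): the high half of lane i's qword 3.
   static inline uint32_t lane_high_word( const uint32_t *hash, int i )
   {
      return hash[ 2*( 3*8 + i ) + 1 ];   // == hash[ 49 + 2*i ]
   }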
@@ -284,47 +333,67 @@ void quark_4way_hash( void *state, const void *input )
__m256i* vhA = (__m256i*)vhashA;
__m256i* vhB = (__m256i*)vhashB;
__m256i vh_mask;
quark_4way_ctx_holder ctx;
const __m256i bit3_mask = m256_const1_64( 8 );
const uint32_t mask = 8;
const __m256i zero = _mm256_setzero_si256();
memcpy( &ctx, &quark_4way_ctx, sizeof(quark_4way_ctx) );
blake512_4way_update( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// A
if ( hash0[0] & mask )
{
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(char*)hash0, 512 );
}
if ( hash1[0] & mask )
{
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(char*)hash1, 512 );
}
if ( hash2[0] & mask )
{
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(char*)hash2, 512 );
}
if ( hash3[0] & mask )
{
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(char*)hash3, 512 );
}
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
// B
if ( mm256_anybits1( vh_mask ) )
{
skein512_4way_update( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhashB );
}
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
@@ -332,13 +401,15 @@ void quark_4way_hash( void *state, const void *input )
jh512_4way_close( &ctx.jh, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
// A
   if ( mm256_anybits0( vh_mask ) )
{
blake512_4way_init( &ctx.blake );
blake512_4way_update( &ctx.blake, vhash, 64 );
blake512_4way_close( &ctx.blake, vhashA );
}
// B
   if ( mm256_anybits1( vh_mask ) )
{
bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
@@ -350,20 +421,20 @@ void quark_4way_hash( void *state, const void *input )
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
skein512_4way_init( &ctx.skein );
skein512_4way_update( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
// A
if ( mm256_anybits0( vh_mask ) )
{
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhashA );
}
// B
if ( mm256_anybits1( vh_mask ) )
{
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
@@ -380,44 +451,41 @@ void quark_4way_hash( void *state, const void *input )
int scanhash_quark_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
   uint32_t hash[4*8] __attribute__ ((aligned (64)));
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
   uint32_t *hash7 = &(hash[25]);
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
   __m256i *noncev = (__m256i*)vdata + 9;   // aligned
   int thr_id = mythr->id;
   const uint32_t Htarg = ptarget[7];

   mm256_bswap32_intrlv80_4x64( vdata, pdata );
   do
   {
      *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );

      quark_4way_hash( hash, vdata );
      pdata[19] = n;

      for ( int i = 0; i < 4; i++ )
      if ( unlikely( hash7[ i<<1 ] <= Htarg ) )
      {
         extr_lane_4x64( lane_hash, hash, i, 256 );
         if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
         {
            pdata[19] = n+i;
            submit_lane_solution( work, lane_hash, mythr, i );
         }
      }
      n += 4;
   } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );

   pdata[19] = n;
   *hashes_done = n - first_nonce + 1;
return 0;
}

Some files were not shown because too many files have changed in this diff