This commit is contained in:
Jay D Dee
2017-01-12 19:40:17 -05:00
parent 06f82c5b97
commit badc80f071
54 changed files with 410 additions and 25234 deletions

View File

@@ -77,8 +77,6 @@ cpuminer_SOURCES = \
algo/fresh.c \ algo/fresh.c \
algo/groestl/groestl.c \ algo/groestl/groestl.c \
algo/groestl/myr-groestl.c \ algo/groestl/myr-groestl.c \
algo/groestl/sse2/grso.c\
algo/groestl/sse2/grso-asm.c\
algo/groestl/aes_ni/hash-groestl.c \ algo/groestl/aes_ni/hash-groestl.c \
algo/groestl/aes_ni/hash-groestl256.c \ algo/groestl/aes_ni/hash-groestl256.c \
algo/haval/haval.c\ algo/haval/haval.c\

359
NEWS
View File

@@ -1,359 +0,0 @@
Version 1.2 (Tanguy Pruvot)
- Add cryptonight-light (Aeon)
- Add Lyra2REv2 algo (Vertcoin)
- Allow to load a remote config with curl
- Algorithm parameter is now case insensitive
- Drop anime algo (dead coin)
- Add Sib(coin) algo
- Compute and show network diff in pools too
- Checkup on arm, tested ok on Tegra K1 (CyanogenMod 12.1)
version 1.1 (Tanguy Pruvot)
- Add basic API remote control (quit/seturl)
- Add GroestlCoin, Diamond and Myriad variants
- Add Pluck algo and fix gbt query crash
- Add ZR5 algo (ZRC) and fix longpoll bug on linux
- Add Luffa algo
- Add Skein2 algo (Double Skein for Woodcoin)
- Add Animecoin algo (Quark variant)
- Add Dropcoin pok algo
- Add BMW-256 (MDT) algo
- Add Axiom algo
- Change some logged strings
- Use all cores by default, not N-1
- Handle a default config to run without params
- add cpu-priority and cpu-affinity options
- add NSIS installer script for windows setup
- Implement background option on windows
- add -m stratum option (diff-multiplier)
- Time limit to allow benchmarks or cron jobs
- Fix Cryptonight stratum support
- Allow to disable extranonce support
version 1.0.9 (Tanguy Pruvot)
- pool extranonce subscribe
- upgrade jansson
- lyra2 algo
- fix for solo mining
- API websocket support
Version 1.0.8 (Tanguy Pruvot)
- API Monitoring Support
- Enhance config values support (int/real/bool)
- Rewrite blake algo (speed x2)
Version 1.0.7 (Tanguy Pruvot)
- Add NIST5 and QUBIT algos
- Show current stratum bloc height
- Fix wallet solo mining
Version 1.0.6 (Tanguy Pruvot)
- Fix scrypt algo
- More work on VC2013
- Add -f tuning option to test with reduced difficulty
- Add S3 algo
Version 1.0.5 (Tanguy Pruvot)
- Merge remaining v2.4 cpu-miner changes
- Add colored output (disable with --no-color)
- Test and fix blake on NEOS, needs 14 rounds (was 8)
- Add pentablake (5x blake256) (from bitbandi)
- Add neoscrypt
- Windows (VC++ 2013 and MinGW64 build support)
- Enhance --version informations (compiler + lib versions)
Version 1.0.4 (Tanguy Pruvot)
- Add x13 x14 and x15 algos (Sherlockcoin, X14Coin, Webcoin..)
- Add scrypt:N variants (Vertcoin)
- Add fresh algo
- Fix thread khashes/s value output
- Add a configure option --disable-assembly
Version multi 1.0.3 (Lucas Jones)
- Add new algos :
x11 (Darkcoin [DRK], Hirocoin, Limecoin)
cryptonight (Bytecoin [BCN], Monero)
keccak (Maxcoin HelixCoin, CryptoMeth, Galleon, 365coin, Slothcoin, BitcointalkCoin)
hefty1 (Heavycoin)
quark (Quarkcoin)
skein (Skeincoin, Myriadcoin)
shavite3 (INKcoin)
blake (Blakecoin)
- See README.md
Version 2.4 - May 20, 2014
- Add support for the getblocktemplate RPC method (BIP 22)
- Allow tunnelling Stratum through HTTP proxies
- Add a --no-redirect option to ignore redirection requests
- Timeout for long polling is now disabled by default
- Fix CPU affinity on Linux (kiyominer)
- Add support for building under 64-bit Cygwin
- Expand version information with build details
Version 2.3.3 - Feb 27, 2014
- The --url option is now mandatory
- Do not switch to Stratum when using an HTTP proxy
- Fix scheduling policy change on Linux (clbr)
- Fix CPU affinity on FreeBSD (ache)
- Compatibility fixes for various platforms, including Solaris 8
and old versions of OS X
- A man page for minerd is now available
Version 2.3.2 - Jul 10, 2013
- Add optimizations for AVX2-capable x86-64 processors
- Ensure that the output stream is flushed after every log message
- Fix an undefined-behavior bug in the Stratum code
Version 2.3.1 - Jun 18, 2013
- Add a --cert option for specifying an SSL certificate (martinwguy)
- Fix a bug that only made SHA-256d mining work at difficulty 1
- Fix a couple of compatibility issues with some Stratum servers
Version 2.3 - Jun 12, 2013
- Add support for the Stratum mining protocol
- Automatically switch to Stratum if the mining server supports
the X-Stratum extension, unless --no-stratum is used
- Set CPU affinity on FreeBSD (lye)
- Fix a bug in libcurl initialization (martinwguy)
Version 2.2.3 - Aug 5, 2012
- Add optimized ARM NEON code for scrypt and SHA-256d
- Add a --benchmark option that allows offline testing
- Support for the X-Reject-Reason extension
Version 2.2.2 - Jun 7, 2012
- Various performance improvements for x86 and x86-64
- Optimize scrypt for ARMv5E and later processors
- Set the priority of miner threads to idle on Windows
- Add an option to start minerd as a daemon on POSIX systems
Version 2.2.1 - May 2, 2012
- Add optimized code for ARM processors
- Support for building on NetBSD and OpenBSD
- Various compatibility fixes for AIX (pontius)
Version 2.2 - Apr 2, 2012
- Add an optimized SHA-256d algorithm, with specialized code
for x86 and x86-64 and support for AVX and XOP instructions
- Slight performance increase for scrypt on x86 and x86-64
- The default timeout is now 270 seconds
Version 2.1.5 - Mar 7, 2012
- Add optimizations for AVX-capable x86-64 processors
- Assume HTTP if no protocol is specified for the mining server
- Fix MinGW compatibility issues and update build instructions
- Add support for building on Solaris using gcc (pontius)
Version 2.1.4 - Feb 28, 2012
- Implement 4-way SHA-256 on x86-64
- Add TCP keepalive to long polling connections
- Support HTTP and SOCKS proxies via the --proxy option
- Username and password are no longer mandatory
- Add a script that makes assembly code compatible with old versions
of the GNU assembler that do not support macros
Version 2.1.3 - Feb 12, 2012
- Smart handling of long polling failures: switch to short scan time
if long polling fails, and only try to reactivate it if the server
continues to advertise the feature in HTTP headers
- Add "X-Mining-Extensions: midstate" to HTTP headers (p2k)
- Add support for the "submitold" extension, used by p2pool
- It is now possible to specify username and password in the URL,
like this: http://username:password@host:port/
- Add a --version option, and clean up --help output
- Avoid division by zero when computing hash rates
- Handle empty responses properly (TimothyA)
- Eliminate the delay between starting threads
Version 2.1.2 - Jan 26, 2012
- Do not submit work that is known to be stale
- Allow miner threads to ask for new work if the current one is at least
45 seconds old and long polling is enabled
- Refresh work when long polling times out
- Fix minor speed regression
- Modify x86-64 code to make it compatible with older versions of binutils
Version 2.1.1 - Jan 20, 2012
- Handle network errors properly
- Make scantime retargeting more accurate
Version 2.1 - Jan 19, 2012
- Share the same work among all threads
- Do not ask for new work if the current one is not expired
- Do not discard the work returned by long polling
Version 2.0 - Jan 16, 2012
- Change default port to 9332 for Litecoin and remove default credentials
- Add 'scrypt' as the default algorithm and remove other algorithms (ArtForz)
- Optimize scrypt for x86 and x86-64
- Make scantime retargeting less granular (ArtForz)
- Test the whole hash instead of just looking at the high 32 bits
- Add configurable timeout, with a default of 180 seconds
- Add share summary output (inlikeflynn)
- Fix priority and CPU count detection on Windows
- Fix parameters -u and -p, and add short options -o and -O
Version 1.0.2 - Jun 13, 2011
- Linux x86_64 optimisations - Con Kolivas
- Optimise for x86_64 by default by using sse2_64 algo
- Detects CPUs and sets number of threads accordingly
- Uses CPU affinity for each thread where appropriate
- Sets scheduling policy to lowest possible
- Minor performance tweaks
Version 1.0.1 - May 14, 2011
- OSX support
Version 1.0 - May 9, 2011
- jansson 2.0 compatibility
- correct off-by-one in date (month) display output
- fix platform detection
- improve yasm configure bits
- support full URL, in X-Long-Polling header
Version 0.8.1 - March 22, 2011
- Make --user, --pass actually work
- Add User-Agent HTTP header to requests, so that server operators may
more easily identify the miner client.
- Fix minor bug in example JSON config file
Version 0.8 - March 21, 2011
- Support long polling: http://deepbit.net/longpolling.php
- Adjust max workload based on scantime (default 5 seconds,
or 60 seconds for longpoll)
- Standardize program output, and support syslog on Unix platforms
- Support --user/--pass options (and "user" and "pass" in config file),
as an alternative to the current --userpass
Version 0.7.2 - March 14, 2011
- Add port of ufasoft's sse2 assembly implementation (Linux only)
This is a substantial speed improvement on Intel CPUs.
- Move all JSON-RPC I/O to separate thread. This reduces the
number of HTTP connections from one-per-thread to one, reducing resource
usage on upstream bitcoind / pool server.
Version 0.7.1 - March 2, 2011
- Add support for JSON-format configuration file. See example
file example-cfg.json. Any long argument on the command line
may be stored in the config file.
- Timestamp each solution found
- Improve sha256_4way performance. NOTE: This optimization makes
the 'hash' debug-print output for sha256_way incorrect.
- Use __builtin_expect() intrinsic as compiler micro-optimization
- Build on Intel compiler
- HTTP library now follows HTTP redirects
Version 0.7 - February 12, 2011
- Re-use CURL object, thereby reusing DNS cache and HTTP connections
- Use bswap_32, if compiler intrinsic is not available
- Disable full target validation (as opposed to simply H==0) for now
Version 0.6.1 - February 4, 2011
- Fully validate "hash < target", rather than simply stopping our scan
if the high 32 bits are 00000000.
- Add --retry-pause, to set length of pause time between failure retries
- Display proof-of-work hash and target, if -D (debug mode) enabled
- Fix max-nonce auto-adjustment to actually work. This means if your
scan takes longer than 5 seconds (--scantime), the miner will slowly
reduce the number of hashes you work on, before fetching a new work unit.
Version 0.6 - January 29, 2011
- Fetch new work unit, if scanhash takes longer than 5 seconds (--scantime)
- BeeCee1's sha256 4way optimizations
- lfm's byte swap optimization (improves via, cryptopp)
- Fix non-working short options -q, -r
Version 0.5 - December 28, 2010
- Exit program, when all threads have exited
- Improve JSON-RPC failure diagnostics and resilience
- Add --quiet option, to disable hashmeter output.
Version 0.3.3 - December 27, 2010
- Critical fix for sha256_cryptopp 'cryptopp_asm' algo
Version 0.3.2 - December 23, 2010
- Critical fix for sha256_via
Version 0.3.1 - December 19, 2010
- Critical fix for sha256_via
- Retry JSON-RPC failures (see --retry, under "--help" output)
Version 0.3 - December 18, 2010
- Add crypto++ 32bit assembly implementation
- show version upon 'minerd --help'
- work around gcc 4.5.x bug that killed 4way performance
Version 0.2.2 - December 6, 2010
- VIA padlock implementation works now
- Minor build and runtime fixes
Version 0.2.1 - November 29, 2010
- avoid buffer overflow when submitting solutions
- add Crypto++ sha256 implementation (C only, ASM elided for now)
- minor internal optimizations and cleanups
Version 0.2 - November 27, 2010
- Add script for building a Windows installer
- improve hash performance (hashmeter) statistics
- add tcatm 4way sha256 implementation
- Add experimental VIA Padlock sha256 implementation
Version 0.1.2 - November 26, 2010
- many small cleanups and micro-optimizations
- build win32 exe using mingw
- RPC URL, username/password become command line arguments
- remove unused OpenSSL dependency
Version 0.1.1 - November 24, 2010
- Do not build sha256_generic module separately from cpuminer.
Version 0.1 - November 24, 2010
- Initial release.

333
README.md
View File

@@ -1,262 +1,117 @@
This project is forked by Jay D Dee. cpuminer-opt is a fork of cpuminer-multi by TPruvot with optimizations
imported from other miners developed by Lucas Jones, djm34, Wolf0, pooler,
Jeff garzik, ig0tik3d, elmad, palmd, and Optiminer, with additional
optimizations by Jay D Dee.
Updated for v3.3.2 Windows support. All of the code is believed to be open and free. If anyone has a
claim to any of it post your case in the cpuminer-opt Bitcoin Talk forum
or by email.
Building on linux prerequisites: https://bitcointalk.org/index.php?topic=1326803.0
It is assumed users know how to install packages on their system and mailto://jayddee246@gmail.com
be able to compile standard source packages. This is basic Linux and
beyond the scope of cpuminer-opt.
Make sure you have the basic development packages installed. See file RELEASE_NOTES for change log and compile instructions.
Here is a good start:
http://askubuntu.com/questions/457526/how-to-install-cpuminer-in-ubuntu Supported Algorithms
--------------------
Install any additional dependencies needed by cpuminer-opt. The list below argon2
are some of the ones that may not be in the default install and need to axiom Shabal-256 MemoHash
be installed manually. There may be others, read the error messages they bastion
will give a clue as to the missing package. blake Blake-256 (SFR)
blakecoin blake256r8
blake2s Blake-2 S
bmw BMW 256
c11 Flax
cryptolight Cryptonight-light
cryptonight cryptonote, Monero (XMR)
decred
drop Dropcoin
fresh Fresh
groestl groestl
heavy Heavy
hmq1725 Espers
hodl Hodlcoin
keccak Keccak
lbry LBC, LBRY Credits
luffa Luffa
lyra2re lyra2
lyra2rev2 lyrav2
lyra2z Zcoin (XZC)
lyra2zoin Zoin (ZOI)
m7m Magi (XMG)
myr-gr Myriad-Groestl
neoscrypt NeoScrypt(128, 2, 1)
nist5 Nist5
pluck Pluck:128 (Supcoin)
pentablake Pentablake
quark Quark
qubit Qubit
scrypt scrypt(1024, 1, 1) (default)
scrypt:N scrypt(N, 1, 1)
scryptjane:nf
sha256d SHA-256d
shavite3 Shavite3
skein Skein+Sha (Skeincoin)
skein2 Double Skein (Woodcoin)
vanilla blake256r8vnl (VCash)
veltor
whirlpool
whirlpoolx
x11 X11
x11evo Revolvercoin
x11gost sib (SibCoin)
x13 X13
x14 X14
x15 X15
x17
xevan Bitsend
yescrypt
zr5 Ziftr
The following command should install everything you need on Debian based Requirements
packages: ------------
sudo apt-get install build-essential libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev automake 1. A x86_64 architecture CPU with a minimum of SSE2 support. This includes
Intel Core2 and newer and AMD equivalents. In order to take advantage of AES_NI
optimizations a CPU with AES_NI is required. This includes Intel Westbridge
and newer and AMD equivalents. Further optimizations are available on some
algorithms for CPUs with AVX and AVX2, Sandybridge and Haswell respectively.
Building on Linux, see below for Windows. Older CPUs are supported by cpuminer-multi by TPruvot but at reduced
performance.
Dependencies 2. 64 bit Linux OS. Ubuntu and Fedora based distributions, including Mint and
Centos are known to work and have all dependencies in their repositories.
Others may work but may require more effort. 64 bit Windows OS is now supported
with mingw_w64 and msys.
build-essential (for Ubuntu, Development Tools package group on Fedora) 3. Stratum pool, cpuminer-opt only supports stratum minning.
automake
libjansson-dev
libgmp-dev
libcurl4-openssl-dev
libssl-dev
pthreads
zlib
tar xvzf [file.tar.gz] Errata
cd [file] ------
Run build.sh to build on Linux or execute the following commands. cpuminer-opt does not work mining Decred algo at Nicehash and produces
only "invalid extranonce2 size" rejects. It works at Zpool.
./autogen.sh Benchmark testing does not work for x11evo.
CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
make
Start mining. Bugs
----
./cpuminer -a algo ... Users are encouraged to post their bug reports on the Bitcoin Talk
forum at:
Building on Windows prerequisites: https://bitcointalk.org/index.php?topic=1326803.0
msys
mingw_w64
Visual C++ redistributable 2008 X64
openssl, not sure about this
Install msys and mingw_w64, only needed once.
Unpack msys into C:\msys or your preferred directory.
Install mingw_w64 from win-builds.
Follow instructions, check "msys or cygwin" and "x86_64" and accept default
existing msys installation.
Open a msys shell by double clicking on msys.bat.
Note that msys shell uses linux syntax for file specifications, "C:\" is
mounted at "/c/".
Add mingw bin directory to PATH variable
PATH="/c/msys/opt/windows_64/bin/:$PATH"
Installation complete, compile cpuminer-opt
Unpack cpuminer-opt source files using tar from msys shell, or using 7zip
or similar Windows program.
In msys shell cd to miner directory.
cd /c/path/to/cpuminer-opt
Run winbuild.sh to build on Windows or execute the following commands.
./autogen.sh
CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
make
The following tips may be useful for older AMD CPUs.
Some users with AMD CPUs without AES_NI have reported problems compiling
with build.sh or "-march=native". Problems have included compile errors
and poor performance. These users are recommended to compile manually
specifying "-march=btver1" on the configure command line.
Support for even older x86_64 without AES_NI or SSE2 is not available.
cpuminer-multi by TPruvot supports this architecture.
The rest of this file is taken from cpuminer-multi.
----------------
CPUMiner-Multi
==============
[![Build Status](https://travis-ci.org/tpruvot/cpuminer-multi.svg)](https://travis-ci.org/tpruvot/cpuminer-multi)
This is a multi-threaded CPU miner,
fork of [pooler](//github.com/pooler)'s cpuminer (see AUTHORS for list of contributors).
#### Table of contents
* [Algorithms](#algorithms)
* [Dependencies](#dependencies)
* [Download](#download)
* [Build](#build)
* [Usage instructions](#usage-instructions)
* [Donations](#donations)
* [Credits](#credits)
* [License](#license)
Algorithms
==========
#### Currently supported
*__scrypt__ (Litecoin, Dogecoin, Feathercoin, ...)
*__scrypt:N__
*__sha256d__ (Bitcoin, Freicoin, Peercoin/PPCoin, Terracoin, ...)
*__axiom__ (Axiom Shabal-256 based MemoHash)
*__blake__ (Saffron [SFR] Blake-256)
*__bmw__ (Midnight [MDT] BMW-256)
*__cryptonight__ (Bytecoin [BCN], Monero)
*__cryptonight-light__ (Aeon)
*__dmd-gr__ (Diamond-Groestl)
*__fresh__ (FreshCoin)
*__groestl__ (Groestlcoin)
*__lyra2RE__ (Lyrabar, Cryptocoin)
*__lyra2REv2__ (VertCoin [VTC])
*__myr-gr__ (Myriad-Groestl)
*__neoscrypt__ (Feathercoin)
*__nist5__ (MistCoin [MIC], TalkCoin [TAC], ...)
*__pentablake__ (Joincoin)
*__pluck__ (Supcoin [SUP])
*__quark__ (Quarkcoin)
*__qubit__ (MyriadCoin [MYR])
*__skein__ (Skeincoin, Myriadcoin, Xedoscoin, ...)
*__skein2__ (Woodcoin)
*__s3__ (OneCoin)
*__x11__ (Darkcoin [DRK], Hirocoin, Limecoin, ...)
*__x13__ (Sherlockcoin, [ACE], [B2B], [GRC], [XHC], ...)
*__x14__ (X14, Webcoin [WEB])
*__x15__ (RadianceCoin [RCE])
*__zr5__ (Ziftrcoin [ZRC])
#### Implemented, but untested
* ? blake2s
* ? hefty1 (Heavycoin)
* ? keccak (Maxcoin HelixCoin, CryptoMeth, Galleon, 365coin, Slothcoin, BitcointalkCoin)
* ? luffa (Joincoin, Doomcoin)
* ? shavite3 (INKcoin)
* ? sib X11 + gost (SibCoin)
#### Planned support for
* *scrypt-jane* (YaCoin, CopperBars, Pennies, Tickets, etc..)
Dependencies
============
* libcurl http://curl.haxx.se/libcurl/
* jansson http://www.digip.org/jansson/ (jansson source is included in-tree)
* openssl libcrypto https://www.openssl.org/
* pthreads
* zlib (for curl/ssl)
Download
========
* Windows releases: https://github.com/tpruvot/cpuminer-multi/releases
* Git tree: https://github.com/tpruvot/cpuminer-multi
* Clone with `git clone https://github.com/tpruvot/cpuminer-multi`
Build
=====
#### Basic *nix build instructions:
* just use ./build.sh
_OR_
* ./autogen.sh # only needed if building from git repo
* ./nomacro.pl # only needed if building on Mac OS X or with Clang
* ./configure CFLAGS="-O3 -march=native" --with-crypto --with-curl
* # Use -march=native if building for a single machine
* make
#### Notes for AIX users:
* To build a 64-bit binary, export OBJECT_MODE=64
* GNU-style long options are not supported, but are accessible via configuration file
#### Basic Windows build with Visual Studio 2013
* All the required .lib files are now included in tree (windows only)
* AVX enabled by default for x64 platform (AVX2 and XOP could also be used)
#### Basic Windows build instructions, using MinGW64:
* Install MinGW64 and the MSYS Developer Tool Kit (http://www.mingw.org/)
* Make sure you have mstcpip.h in MinGW\include
* install pthreads-w64
* Install libcurl devel (http://curl.haxx.se/download.html)
* Make sure you have libcurl.m4 in MinGW\share\aclocal
* Make sure you have curl-config in MinGW\bin
* Install openssl devel (https://www.openssl.org/related/binaries.html)
* In the MSYS shell, run:
* for 64bit, you can use ./mingw64.sh else :
./autogen.sh # only needed if building from git repo
* LIBCURL="-lcurldll" ./configure CFLAGS="*-march=native*"
* # Use -march=native if building for a single machine
* make
#### Architecture-specific notes:
* ARM:
* No runtime CPU detection. The miner can take advantage of some instructions specific to ARMv5E and later processors, but the decision whether to use them is made at compile time, based on compiler-defined macros.
* To use NEON instructions, add "-mfpu=neon" to CFLAGS.
* x86:
* The miner checks for SSE2 instructions support at runtime, and uses them if they are available.
* x86-64:
* The miner can take advantage of AVX, AVX2 and XOP instructions, but only if both the CPU and the operating system support them.
* Linux supports AVX starting from kernel version 2.6.30.
* FreeBSD supports AVX starting with 9.1-RELEASE.
* Mac OS X added AVX support in the 10.6.8 update.
* Windows supports AVX starting from Windows 7 SP1 and Windows Server 2008 R2 SP1.
* The configure script outputs a warning if the assembler doesn't support some instruction sets. In that case, the miner can still be built, but unavailable optimizations are left off.
Usage instructions
==================
Run "cpuminer --help" to see options.
### Connecting through a proxy
Use the --proxy option.
To use a SOCKS proxy, add a socks4:// or socks5:// prefix to the proxy host
Protocols socks4a and socks5h, allowing remote name resolving, are also available since libcurl 7.18.0.
If no protocol is specified, the proxy is assumed to be a HTTP proxy.
When the --proxy option is not used, the program honors the http_proxy and all_proxy environment variables.
Donations Donations
========= ---------
Donations for the work done in this fork are accepted :
Tanguy Pruvot : I do not do this for money but I have a donation address if users
* BTC: `1FhDPLPpw18X4srecguG3MxJYe4a1JsZnd` are so inclined.
* ZRC: `ZX6LmrCwphNgitxvDnf8TX6Tsegfxpeozx`
Lucas Jones : bitcoin:12tdvfF7KmAsihBXQXynT6E6th2c2pByTT?label=donations
* MRO: `472haywQKoxFzf7asaQ4XKBc2foAY4ezk8HiN63ifW4iAbJiLnfmJfhHSR9XmVKw2WYPnszJV9MEHj9Z5WMK9VCNHaGLDmJ`
* BTC: `139QWoktddChHsZMWZFxmBva4FM96X2dhE`
Credits Happy mining!
=======
CPUMiner-multi was forked from pooler's CPUMiner, and has been started by Lucas Jones.
* [tpruvot](https://github.com/tpruvot) added all the recent features and newer algorithms
* [Wolf9466](https://github.com/wolf9466) helped with Intel AES-NI support for CryptoNight
License
=======
GPLv2. See COPYING for details.

View File

@@ -1,85 +0,0 @@
cpuminer-opt now supports over 40 algorithms on CPUs with at least SSE2
capabilities including Intel Core2, Nehalem and AMD equivalent. See the
performance chart below for details.
In addition 19 algorithms have optimizations to take advantage of
CPUs with AES_NI for even greater performance, including the Intel
Westbridge and newer and AMD equivalent. See the performance
comparison below.
New in 3.4.12
- lyra2z (zcoin) modified for blocks after 8192
- fixed scryptjane to support various N factors
Users with non-SSE2 CPUs or who want to mine algos not supported by
cpuminer-opt may find cpuminer-multi by TPruvot useful.
Chart out of date, will be removed.
The performance chart below is for an Intel i7-6700K @ 4 GHz, 16 GB mem.
Normalization rates have been added to the chart to help with profit
switching pools. Reference algo x11 = 1.
Due to the peculiarities of some algorithms their performance on other CPU
architectures may not scale equally. Their normalization rates will also
differ from those listed below. YMMV.
Normalized profitability = algo profitability * norm rate
AES-AVX SSE2(1) norm rate(5)
------- ------- ---------
x11 780 K 525 K 1
x13 392 298 0.50
x14 370 271 0.48
x15 341 270 0.45
x17 317 248 0.43
x11gost 562 392 0.72
x11evo 590 387 0.78
quark 1195 924 1.61
qubit 1182 765 1.45
nist5 2000 1592 3.37
zr5 850 650 1.15
c11 784 475 0.99
myr-gr 1572 1560 2.12
hmq1725 214 161 0.29
m7m 121 77.4 0.155
lyra2re 1380 900 1.76
lyra2rev2 1350 980 1.73
cryptonight 290 H 165 H 0.00039
cryptolight 685 ? 0.00093
hodl 600 200 0.00081
lbry (4) 2620 3.53
neoscrypt (4) 32 K 0.043
argon2 (4) 33.7 0.045
groestl (4) 931 1.26
skein (4) 5747 7.77
skein2 (4) 8675 11.7
pentablake (4) 3960 5.35
keccak (4) 7790 10.5
scrypt (4) 113 0.153
sha256d (4) 62.5 0.084
veltor (4) 1017 1.30
blake (4) 22.4 M 30.4
blake2s (4) 19.0 25.7
vanilla (4) 33.0 44.6
blakecoin (4) 33.9 45.8
decred (4) 22.6 30.5
axiom (4) 72 H 0.000098
yescrypt (4) 3760 0.0051
scryptjane (4) 250 0.00034
pluck(2) (4) 1925 0.0026
drop(2) (4) 934 K 1.26
fresh(2) (4) 528 0.71
whirlpool(2) (4) 1290 1.74
whirlpoolx(2) (4) 5110 6.9
Footnotes:
(1) SSE2 rates are simulated in software (-march=core2) on an i7.
(2) Benchmark tested only
(3) CPU architecture not supported for algo. It won't work.
(4) AES_NI Optimization not available for CPU architecture. Uses SSE2, slower.
(5) Normalised profitability = algo profitability * norm rate, x11 = 1
(6) Not supported on Windows

View File

@@ -1,81 +1,204 @@
Change Log
----------
cpuminer-opt-3.1 release notes v3.5.0
--------------i----------------
cpuminer-opt combines the best of minerd (x11), cp3u (quark) and Fixed blakecoin and vanilla increasing rejects with number of threads.
cpuminer-multi (multi-algo support plus non-kernel related Removed support for SSE2 Groestl functions. SSE2 groestl remains available
enhancements). Additional credits to Lucas Jones, elmad, palmd, in v3.4.12 and the legacy branch.
djm34, pooler, Jeff Garzik, Wolf0 and probably others. It is no longer necessary to specify stratum+tcp:// in the url, it is assumed
and is the only supported protocol.
The core of cpuminer-opt remains cpuminer-multi and is the base for v3.4.12
this fork.
All of the code is believed to be open and free. If anyone has a lyra2z (zcoin) modified for blocks after 8192
claim to any of it post your case in the Bitcoin Talk forum, fixed scryptjane to support various N factors
link below.
Features v3.4.11
--------
V3.1 introduces a new mining engine called algo_gate. This feature groestl algo AES optimized +200%
is not visible to the users except for the additional 5% performance myr-gr algo AES optimized +100%
increase in all algos. This feature is of interest mostly to
developers.
cpuminer provides accelerated hashing on AES-NI capable CPUs in v3.4.10
x11, x13, x14, x15, quark & qubit algorithms. It also currently
provides acceleration for SSE2 capable CPUs on quark and qubit
algorithms only. Other algorithms are available but unchanged from
cpuminer-multi-1.2pre and in various states of functionality.
V3.0 provides improved hash rates for many algos. See the
release announcement for details.
Requirements xevan AES optimized +35%
------------
A 64 bit CPU with SSE2 support and any of the popular 64 bit v3.4.9
Linux distributions. Standard development tools, libcurl-devel,
the preferred SSL development package of your distribution.
Limitations fixed zr5, broken in v3.4.8
----------- added xevan algo (Bitsend, BSD) with 10% improvement
added lyra2zoin (Zoin, ZOI) fully optimized but YMMV
v3.0 is source code only that can be compiled on Linux. v3.4.8
Windows support is not yet available, but planned.
Compiling added zcoin support, optimized for AVX2 but no increase in performance
--------- fixed API display of diff for cryptonight
--show-diff is now the default, use "--hide-diff" to disable
cleaned up some cpuminer-multi artifacts
After unpacking the tarball change into the cpuminer directory and v3.4.7
execute these commands. Note that O3 is actually the upper case
letter O.
./autogen.sh fixed benchmark, except for x11evo
./configure CFLAGS="-O3 -march=native" --with-crypto --with-curl added CPU temperature to share submission report (Linux only)
v3.4.6
For users:
- cryptolight algo is now supported with AES optimizations
- display format changed for share submissions
- colour keyed "Accepted" or "Rejected" status.
- reject count and rate displayed when share is rejected.
For developers:
- code restructuring for detecting new work
- cleaned up detection and handling of new work
- removed call to stratum_gen_work from miner_thread.
- eliminated gen_work_now gate function.
- renamed gate function init_nonce to get_new_work.
- renamed gate function alloc_scratchbuf to miner_thread_init,
removed all scracthbuf references from miner_thread and moved
implementation to the local algo files of those algos that need it.
- moved most gate targets from algo-gate.c to cpu-miner.c removing
most mining related code from algo-gate-api.c.
v3.4.5
fixed stale share rejects mining cryptonight at Nicehash
fixed compile error on Westmere CPUs
v3.4.4
fixed compile errors on Westmere CPUs, this is an interim fix that
will compile without AES on Westmere
added support for cryptonight at Nicehash, some rejects may be produced
at Nicehash only.
v3.4.3
imported optimized m7m, +42%
v3.4.2
added veltor algo
tweaked lyra2 AVX/AVX2 code for small improvement.
v3.4.1
big AVX2 optimizations for lyra2 +35%, lyra2v2 +11%, AVX also faster
fixed hmq1725
v3.4.0
fixed Windows compile error introduced in v3.3.9
fixed x11gost, broken in v3.3.7
AVX2 optimizations improving many algos:
- Lyra2RE +3%
- Lyra2REv2 +19%
- x11gost (sib) +6%
- x11evo +2.4%
- c11 +6.9%
- x11 +5%
- x13 +5%
- x14 +3.6%
- x15 +2.4%
- x17 +2.8%
- qubit +8.4%
Compile Instructions
--------------------
Building on linux prerequisites:
It is assumed users know how to install packages on their system and
be able to compile standard source packages. This is basic Linux and
beyond the scope of cpuminer-opt.
Make sure you have the basic development packages installed.
Here is a good start:
http://askubuntu.com/questions/457526/how-to-install-cpuminer-in-ubuntu
Install any additional dependencies needed by cpuminer-opt. The list below
are some of the ones that may not be in the default install and need to
be installed manually. There may be others, read the error messages they
will give a clue as to the missing package.
The following command should install everything you need on Debian based
packages:
sudo apt-get install build-essential libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev automake
Building on Linux, see below for Windows.
Dependencies
build-essential (for Ubuntu, Development Tools package group on Fedora)
automake
libjansson-dev
libgmp-dev
libcurl4-openssl-dev
libssl-dev
pthreads
zlib
tar xvzf [file.tar.gz]
cd [file]
Run build.sh to build on Linux or execute the following commands.
./autogen.sh
CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
make make
libcurl-devel and an development packages are required to be Start mining.
installed to build this application and are available in most
Linux repositories.
To compile on older CPUs without AES_NI support use the following ./cpuminer -a algo ...
CFLAGS options: "-O3 -march=native -DNO_AES_NI"
Bugs Building on Windows prerequisites:
----
Users are encouraged to post their bug reports on the Bitcoin Talk msys
forum at: mingw_w64
Visual C++ redistributable 2008 X64
openssl, not sure about this
https://bitcointalk.org/index.php?topic=1326803.0 Install msys and mingw_w64, only needed once.
Donations Unpack msys into C:\msys or your preferred directory.
---------
I do not do this for money but I have a donation address if users Install mingw__w64 from win-builds.
are so inclined. Follow instructions, check "msys or cygwin" and "x86_64" and accept default
existing msys installation.
bitcoin:12tdvfF7KmAsihBXQXynT6E6th2c2pByTT?label=donations Open a msys shell by double clicking on msys.bat.
Note that msys shell uses linux syntax for file specifications, "C:\" is
mounted at "/c/".
Happy mining! Add mingw bin directory to PATH variable
PATH="/c/msys/opt/windows_64/bin/:$PATH"
Installation complete, compile cpuminer-opt
Unpack cpuminer-opt source files using tar from msys shell, or using 7zip
or similar Windows program.
In msys shell cd to miner directory.
cd /c/path/to/cpuminer-opt
Run winbuild.sh to build on Windows or execute the following commands.
./autogen.sh
CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
make
The following tips may be useful for older AMD CPUs.
Some users with AMD CPUs without AES_NI have reported problems compiling
with build.sh or "-march=native". Problems have included compile errors
and poor performance. These users are recommended to compile manually
specifying "-march=btver1" on the configure command line.
Support for even older x86_64 without AES_NI or SSE2 is not available.

View File

@@ -12,40 +12,36 @@ void blakecoin_close(void *cc, void *dst);
#include <memory.h> #include <memory.h>
#include <openssl/sha.h> #include <openssl/sha.h>
/* Move init out of loop, so init once externally, // context management is staged for efficiency.
* and then use one single memcpy */ // 1. global initial ctx cached on startup
static sph_blake256_context blake_mid; // 2. per-thread midstate ctx cache refreshed every scan
static bool ctx_midstate_done = false; // 3. local ctx for final hash calculation
static void init_blake_hash(void) static sph_blake256_context blake_init_ctx;
static __thread sph_blake256_context blake_mid_ctx;
static void blake_midstate_init( const void* input )
{ {
blakecoin_init(&blake_mid); // copy cached initial state
ctx_midstate_done = true; memcpy( &blake_mid_ctx, &blake_init_ctx, sizeof blake_mid_ctx );
blakecoin( &blake_mid_ctx, input, 64 );
} }
void blakecoinhash(void *state, const void *input) void blakecoinhash( void *state, const void *input )
{ {
sph_blake256_context ctx; sph_blake256_context ctx;
uint8_t hash[64]; uint8_t hash[64];
uint8_t *ending = (uint8_t*) input; uint8_t *ending = (uint8_t*) input + 64;
ending += 64;
// do one memcopy to get a fresh context // copy cached midstate
if (!ctx_midstate_done) { memcpy( &ctx, &blake_mid_ctx, sizeof ctx );
init_blake_hash(); blakecoin( &ctx, ending, 16 );
blakecoin(&blake_mid, input, 64); blakecoin_close( &ctx, hash );
} memcpy( state, hash, 32 );
memcpy(&ctx, &blake_mid, sizeof(blake_mid));
blakecoin(&ctx, ending, 16);
blakecoin_close(&ctx, hash);
memcpy(state, hash, 32);
} }
int scanhash_blakecoin(int thr_id, struct work *work, uint32_t max_nonce, int scanhash_blakecoin( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done) uint64_t *hashes_done )
{ {
uint32_t *pdata = work->data; uint32_t *pdata = work->data;
uint32_t *ptarget = work->target; uint32_t *ptarget = work->target;
@@ -57,16 +53,14 @@ int scanhash_blakecoin(int thr_id, struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce; uint32_t n = first_nonce;
ctx_midstate_done = false;
if (opt_benchmark) if (opt_benchmark)
HTarget = 0x7f; HTarget = 0x7f;
// we need big endian data... // we need big endian data...
// be32enc_array( endiandata, pdata, 19 );
for (int kk=0; kk < 19; kk++) for (int kk=0; kk < 19; kk++)
be32enc(&endiandata[kk], ((uint32_t*)pdata)[kk]); be32enc(&endiandata[kk], ((uint32_t*)pdata)[kk]);
blake_midstate_init( endiandata );
#ifdef DEBUG_ALGO #ifdef DEBUG_ALGO
applog(LOG_DEBUG,"[%d] Target=%08x %08x", thr_id, ptarget[6], ptarget[7]); applog(LOG_DEBUG,"[%d] Target=%08x %08x", thr_id, ptarget[6], ptarget[7]);
@@ -117,6 +111,7 @@ bool register_vanilla_algo( algo_gate_t* gate )
gate->hash = (void*)&blakecoinhash; gate->hash = (void*)&blakecoinhash;
gate->hash_alt = (void*)&blakecoinhash; gate->hash_alt = (void*)&blakecoinhash;
gate->get_max64 = (void*)&blakecoin_get_max64; gate->get_max64 = (void*)&blakecoin_get_max64;
blakecoin_init( &blake_init_ctx );
return true; return true;
} }

View File

@@ -317,7 +317,6 @@ static const sph_u64 blkIV512[8] = {
#define COMPRESS64 do { \ #define COMPRESS64 do { \
int r; \
int b=0; \ int b=0; \
sph_u64 M0, M1, M2, M3, M4, M5, M6, M7; \ sph_u64 M0, M1, M2, M3, M4, M5, M6, M7; \
sph_u64 M8, M9, MA, MB, MC, MD, ME, MF; \ sph_u64 M8, M9, MA, MB, MC, MD, ME, MF; \

View File

@@ -1,133 +0,0 @@
/*
---------------------------------------------------------------------------
Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved.
LICENSE TERMS
The redistribution and use of this software (with or without changes)
is allowed without the payment of fees or royalties provided that:
1. source code distributions include the above copyright notice, this
list of conditions and the following disclaimer;
2. binary distributions include the above copyright notice, this list
of conditions and the following disclaimer in their documentation;
3. the name of the copyright holder is not used to endorse products
built using this software without specific written permission.
DISCLAIMER
This software is provided 'as is' with no explicit or implied warranties
in respect of its properties, including, but not limited to, correctness
and/or fitness for purpose.
---------------------------------------------------------------------------
Issue Date: 20/12/2007
*/
#ifndef _BRG_ENDIAN_H
#define _BRG_ENDIAN_H
#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */
#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */
/* Include files where endian defines and byteswap functions may reside */
#if defined( __sun )
# include <sys/isa_defs.h>
#elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ )
# include <sys/endian.h>
#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \
defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ )
# include <machine/endian.h>
#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ )
# if !defined( __MINGW32__ ) && !defined( _AIX )
# include <endian.h>
# if !defined( __BEOS__ )
# include <byteswap.h>
# endif
# endif
#endif
/* Now attempt to set the define for platform byte order using any */
/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which */
/* seem to encompass most endian symbol definitions */
#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN )
# if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
# elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
# endif
#elif defined( BIG_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( LITTLE_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif
#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN )
# if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
# elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
# endif
#elif defined( _BIG_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( _LITTLE_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif
#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN )
# if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
# elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
# endif
#elif defined( __BIG_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( __LITTLE_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif
#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ )
# if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
# elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
# endif
#elif defined( __BIG_ENDIAN__ )
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( __LITTLE_ENDIAN__ )
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif
/* if the platform byte order could not be determined, then try to */
/* set this define using common machine defines */
#if !defined(PLATFORM_BYTE_ORDER)
#if defined( __alpha__ ) || defined( __alpha ) || defined( i386 ) || \
defined( __i386__ ) || defined( _M_I86 ) || defined( _M_IX86 ) || \
defined( __OS2__ ) || defined( sun386 ) || defined( __TURBOC__ ) || \
defined( vax ) || defined( vms ) || defined( VMS ) || \
defined( __VMS ) || defined( _M_X64 )
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#elif defined( AMIGA ) || defined( applec ) || defined( __AS400__ ) || \
defined( _CRAY ) || defined( __hppa ) || defined( __hp9000 ) || \
defined( ibm370 ) || defined( mc68000 ) || defined( m68k ) || \
defined( __MRC__ ) || defined( __MVS__ ) || defined( __MWERKS__ ) || \
defined( sparc ) || defined( __sparc) || defined( SYMANTEC_C ) || \
defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM ) || \
defined( THINK_C ) || defined( __VMCMS__ ) || defined( _AIX )
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif 0 /* **** EDIT HERE IF NECESSARY **** */
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#elif 0 /* **** EDIT HERE IF NECESSARY **** */
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#else
# error Please edit lines 126 or 128 in brg_endian.h to set the platform byte order
#endif
#endif
#endif

View File

@@ -1,231 +0,0 @@
/*
---------------------------------------------------------------------------
Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved.
(a few lines added by Soeren S. Thomsen, October 2008)
LICENSE TERMS
The redistribution and use of this software (with or without changes)
is allowed without the payment of fees or royalties provided that:
1. source code distributions include the above copyright notice, this
list of conditions and the following disclaimer;
2. binary distributions include the above copyright notice, this list
of conditions and the following disclaimer in their documentation;
3. the name of the copyright holder is not used to endorse products
built using this software without specific written permission.
DISCLAIMER
This software is provided 'as is' with no explicit or implied warranties
in respect of its properties, including, but not limited to, correctness
and/or fitness for purpose.
---------------------------------------------------------------------------
Issue Date: 20/12/2007
The unsigned integer types defined here are of the form uint_<nn>t where
<nn> is the length of the type; for example, the unsigned 32-bit type is
'uint_32t'. These are NOT the same as the 'C99 integer types' that are
defined in the inttypes.h and stdint.h headers since attempts to use these
types have shown that support for them is still highly variable. However,
since the latter are of the form uint<nn>_t, a regular expression search
and replace (in VC++ search on 'uint_{:z}t' and replace with 'uint\1_t')
can be used to convert the types used here to the C99 standard types.
*/
#ifndef _BRG_TYPES_H
#define _BRG_TYPES_H
#if defined(__cplusplus)
extern "C" {
#endif
#include <limits.h>
#if defined( _MSC_VER ) && ( _MSC_VER >= 1300 )
# include <stddef.h>
# define ptrint_t intptr_t
#elif defined( __GNUC__ ) && ( __GNUC__ >= 3 )
# include <stdint.h>
# define ptrint_t intptr_t
#else
# define ptrint_t int
#endif
#ifndef BRG_UI8
# define BRG_UI8
# if UCHAR_MAX == 255u
typedef unsigned char uint_8t;
# else
# error Please define uint_8t as an 8-bit unsigned integer type in brg_types.h
# endif
#endif
#ifndef BRG_UI16
# define BRG_UI16
# if USHRT_MAX == 65535u
typedef unsigned short uint_16t;
# else
# error Please define uint_16t as a 16-bit unsigned short type in brg_types.h
# endif
#endif
#ifndef BRG_UI32
# define BRG_UI32
# if UINT_MAX == 4294967295u
# define li_32(h) 0x##h##u
typedef unsigned int uint_32t;
# elif ULONG_MAX == 4294967295u
# define li_32(h) 0x##h##ul
typedef unsigned long uint_32t;
# elif defined( _CRAY )
# error This code needs 32-bit data types, which Cray machines do not provide
# else
# error Please define uint_32t as a 32-bit unsigned integer type in brg_types.h
# endif
#endif
#ifndef BRG_UI64
# if defined( __BORLANDC__ ) && !defined( __MSDOS__ )
# define BRG_UI64
# define li_64(h) 0x##h##ui64
typedef unsigned __int64 uint_64t;
# elif defined( _MSC_VER ) && ( _MSC_VER < 1300 ) /* 1300 == VC++ 7.0 */
# define BRG_UI64
# define li_64(h) 0x##h##ui64
typedef unsigned __int64 uint_64t;
# elif defined( __sun ) && defined( ULONG_MAX ) && ULONG_MAX == 0xfffffffful
# define BRG_UI64
# define li_64(h) 0x##h##ull
typedef unsigned long long uint_64t;
# elif defined( __MVS__ )
# define BRG_UI64
# define li_64(h) 0x##h##ull
typedef unsigned int long long uint_64t;
# elif defined( UINT_MAX ) && UINT_MAX > 4294967295u
# if UINT_MAX == 18446744073709551615u
# define BRG_UI64
# define li_64(h) 0x##h##u
typedef unsigned int uint_64t;
# endif
# elif defined( ULONG_MAX ) && ULONG_MAX > 4294967295u
# if ULONG_MAX == 18446744073709551615ul
# define BRG_UI64
# define li_64(h) 0x##h##ul
typedef unsigned long uint_64t;
# endif
# elif defined( ULLONG_MAX ) && ULLONG_MAX > 4294967295u
# if ULLONG_MAX == 18446744073709551615ull
# define BRG_UI64
# define li_64(h) 0x##h##ull
typedef unsigned long long uint_64t;
# endif
# elif defined( ULONG_LONG_MAX ) && ULONG_LONG_MAX > 4294967295u
# if ULONG_LONG_MAX == 18446744073709551615ull
# define BRG_UI64
# define li_64(h) 0x##h##ull
typedef unsigned long long uint_64t;
# endif
# endif
#endif
#if !defined( BRG_UI64 )
# if defined( NEED_UINT_64T )
# error Please define uint_64t as an unsigned 64 bit type in brg_types.h
# endif
#endif
#ifndef RETURN_VALUES
# define RETURN_VALUES
# if defined( DLL_EXPORT )
# if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
# define VOID_RETURN __declspec( dllexport ) void __stdcall
# define INT_RETURN __declspec( dllexport ) int __stdcall
# elif defined( __GNUC__ )
# define VOID_RETURN __declspec( __dllexport__ ) void
# define INT_RETURN __declspec( __dllexport__ ) int
# else
# error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
# endif
# elif defined( DLL_IMPORT )
# if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
# define VOID_RETURN __declspec( dllimport ) void __stdcall
# define INT_RETURN __declspec( dllimport ) int __stdcall
# elif defined( __GNUC__ )
# define VOID_RETURN __declspec( __dllimport__ ) void
# define INT_RETURN __declspec( __dllimport__ ) int
# else
# error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
# endif
# elif defined( __WATCOMC__ )
# define VOID_RETURN void __cdecl
# define INT_RETURN int __cdecl
# else
# define VOID_RETURN void
# define INT_RETURN int
# endif
#endif
/* These defines are used to detect and set the memory alignment of pointers.
Note that offsets are in bytes.
ALIGN_OFFSET(x,n) return the positive or zero offset of
the memory addressed by the pointer 'x'
from an address that is aligned on an
'n' byte boundary ('n' is a power of 2)
ALIGN_FLOOR(x,n) return a pointer that points to memory
that is aligned on an 'n' byte boundary
and is not higher than the memory address
pointed to by 'x' ('n' is a power of 2)
ALIGN_CEIL(x,n) return a pointer that points to memory
that is aligned on an 'n' byte boundary
and is not lower than the memory address
pointed to by 'x' ('n' is a power of 2)
*/
#define ALIGN_OFFSET(x,n) (((ptrint_t)(x)) & ((n) - 1))
#define ALIGN_FLOOR(x,n) ((uint_8t*)(x) - ( ((ptrint_t)(x)) & ((n) - 1)))
#define ALIGN_CEIL(x,n) ((uint_8t*)(x) + (-((ptrint_t)(x)) & ((n) - 1)))
/* These defines are used to declare buffers in a way that allows
faster operations on longer variables to be used. In all these
defines 'size' must be a power of 2 and >= 8. NOTE that the
buffer size is in bytes but the type length is in bits
UNIT_TYPEDEF(x,size) declares a variable 'x' of length
'size' bits
BUFR_TYPEDEF(x,size,bsize) declares a buffer 'x' of length 'bsize'
bytes defined as an array of variables
each of 'size' bits (bsize must be a
multiple of size / 8)
UNIT_CAST(x,size) casts a variable to a type of
length 'size' bits
UPTR_CAST(x,size) casts a pointer to a pointer to a
variable of length 'size' bits
*/
#define UI_TYPE(size) uint_##size##t
#define UNIT_TYPEDEF(x,size) typedef UI_TYPE(size) x
#define BUFR_TYPEDEF(x,size,bsize) typedef UI_TYPE(size) x[bsize / (size >> 3)]
#define UNIT_CAST(x,size) ((UI_TYPE(size) )(x))
#define UPTR_CAST(x,size) ((UI_TYPE(size)*)(x))
/* Added by Soeren S. Thomsen (begin) */
#define u8 uint_8t
#define u32 uint_32t
#define u64 uint_64t
/* (end) */
#if defined(__cplusplus)
}
#endif
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -1,956 +0,0 @@
/* groestl-intr-vperm.h Aug 2011
*
* Groestl implementation with intrinsics using ssse3 instructions.
* Author: Günther A. Roland, Martin Schläffer
*
* Based on the vperm and aes_ni implementations of the hash function Groestl
* by Cagdas Calik <ccalik@metu.edu.tr> http://www.metu.edu.tr/~ccalik/
* Institute of Applied Mathematics, Middle East Technical University, Turkey
*
* This code is placed in the public domain
*/
#include <tmmintrin.h>
#include "grsi.h"
/*define data alignment for different C compilers*/
#if defined(__GNUC__)
#define DATA_ALIGN16(x) x __attribute__ ((aligned(16)))
#else
#define DATA_ALIGN16(x) __declspec(align(16)) x
#endif
//#if defined(DECLARE_GLOBAL)
#if 1
#define GLOBAL
#else
#define GLOBAL extern
#endif
//#if defined(DECLARE_IFUN)
#if 1
#define IFUN
#else
#define IFUN extern
#endif
/* global constants */
//GLOBAL __m128i grsiROUND_CONST_Lx;
//GLOBAL __m128i grsiROUND_CONST_L0[grsiROUNDS512];
//GLOBAL __m128i grsiROUND_CONST_L7[grsiROUNDS512];
DATA_ALIGN16(int32_t grsiSUBSH_MASK_short[8*4]) = {
0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c,
0x04030201, 0x08070605, 0x0c0b0a09, 0x000f0e0d,
0x05040302, 0x09080706, 0x0d0c0b0a, 0x01000f0e,
0x06050403, 0x0a090807, 0x0e0d0c0b, 0x0201000f,
0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0x03020100,
0x08070605, 0x0c0b0a09, 0x000f0e0d, 0x04030201,
0x09080706, 0x0d0c0b0a, 0x01000f0e, 0x05040302,
0x0e0d0c0b, 0x0201000f, 0x06050403, 0x0a090807
};
GLOBAL __m128i *grsiSUBSH_MASK = grsiSUBSH_MASK_short;
GLOBAL __m128i grsiALL_0F = {0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f};
GLOBAL __m128i grsiALL_1B = {0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b};
GLOBAL __m128i grsiALL_FF = {0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff};
/* global unknown */
GLOBAL __m128i grsiVPERM_OPT[2];
GLOBAL __m128i grsiVPERM_INV[2];
GLOBAL __m128i grsiVPERM_SB1[2];
GLOBAL __m128i grsiVPERM_SB2[2];
GLOBAL __m128i grsiVPERM_SB4[2];
GLOBAL __m128i grsiVPERM_SBO[2];
/* state vars */
GLOBAL __m128i grsiTRANSP_MASK;
GLOBAL __m128i grsiVPERM_IPT[2];
GLOBAL __m128i grsiALL_15;
GLOBAL __m128i grsiALL_63;
GLOBAL __m128i grsiROUND_CONST_P[grsiROUNDS1024];
GLOBAL __m128i grsiROUND_CONST_Q[grsiROUNDS1024];
#define grsitos(a) #a
#define grsitostr(a) grsitos(a)
/*
grsiALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
grsiALL_63 = _mm_set_epi32(0x63636363, 0x63636363, 0x63636363, 0x63636363);\
*/
#define grsiSET_SHARED_CONSTANTS(){\
grsiTRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
grsiALL_0F = _mm_set_epi32(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f);\
grsiALL_15 = _mm_set_epi32(0x15151515, 0x15151515, 0x15151515, 0x15151515);\
\
grsiVPERM_IPT[0] = _mm_set_epi32(0xCD80B1FC, 0xB0FDCC81, 0x4C01307D, 0x317C4D00);\
grsiVPERM_IPT[1] = _mm_set_epi32(0xCABAE090, 0x52227808, 0xC2B2E898, 0x5A2A7000);\
grsiVPERM_OPT[0] = _mm_set_epi32(0xE10D5DB1, 0xB05C0CE0, 0x01EDBD51, 0x50BCEC00);\
grsiVPERM_OPT[1] = _mm_set_epi32(0xF7974121, 0xDEBE6808, 0xFF9F4929, 0xD6B66000);\
grsiVPERM_INV[0] = _mm_set_epi32(0x030D0E0C, 0x02050809, 0x01040A06, 0x0F0B0780);\
grsiVPERM_INV[1] = _mm_set_epi32(0x04070309, 0x0A0B0C02, 0x0E05060F, 0x0D080180);\
grsiVPERM_SB1[0] = _mm_set_epi32(0x3BF7CCC1, 0x0D2ED9EF, 0x3618D415, 0xFAE22300);\
grsiVPERM_SB1[1] = _mm_set_epi32(0xA5DF7A6E, 0x142AF544, 0xB19BE18F, 0xCB503E00);\
grsiVPERM_SB2[0] = _mm_set_epi32(0xC2A163C8, 0xAB82234A, 0x69EB8840, 0x0AE12900);\
grsiVPERM_SB2[1] = _mm_set_epi32(0x5EB7E955, 0xBC982FCD, 0xE27A93C6, 0x0B712400);\
grsiVPERM_SB4[0] = _mm_set_epi32(0xBA44FE79, 0x876D2914, 0x3D50AED7, 0xC393EA00);\
grsiVPERM_SB4[1] = _mm_set_epi32(0xA876DE97, 0x49087E9F, 0xE1E937A0, 0x3FD64100);\
}/**/
/* grsiVPERM
* Transform w/o settings c*
* transforms 2 rows to/from "vperm mode"
* this function is derived from:
* vperm and aes_ni implementations of hash function Grostl
* by Cagdas CALIK
* inputs:
* a0, a1 = 2 rows
* table = transformation table to use
* t*, c* = clobbers
* outputs:
* a0, a1 = 2 rows transformed with table
* */
#define grsiVPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2){\
t0 = c0;\
t1 = c0;\
t0 = _mm_andnot_si128(t0, a0);\
t1 = _mm_andnot_si128(t1, a1);\
t0 = _mm_srli_epi32(t0, 4);\
t1 = _mm_srli_epi32(t1, 4);\
a0 = _mm_and_si128(a0, c0);\
a1 = _mm_and_si128(a1, c0);\
t2 = c2;\
t3 = c2;\
t2 = _mm_shuffle_epi8(t2, a0);\
t3 = _mm_shuffle_epi8(t3, a1);\
a0 = c1;\
a1 = c1;\
a0 = _mm_shuffle_epi8(a0, t0);\
a1 = _mm_shuffle_epi8(a1, t1);\
a0 = _mm_xor_si128(a0, t2);\
a1 = _mm_xor_si128(a1, t3);\
}/**/
#define grsiVPERM_Transform_Set_Const(table, c0, c1, c2){\
c0 = grsiALL_0F;\
c1 = ((__m128i*) table )[0];\
c2 = ((__m128i*) table )[1];\
}/**/
/* grsiVPERM
* Transform
* transforms 2 rows to/from "vperm mode"
* this function is derived from:
* vperm and aes_ni implementations of hash function Grostl
* by Cagdas CALIK
* inputs:
* a0, a1 = 2 rows
* table = transformation table to use
* t*, c* = clobbers
* outputs:
* a0, a1 = 2 rows transformed with table
* */
#define grsiVPERM_Transform(a0, a1, table, t0, t1, t2, t3, c0, c1, c2){\
grsiVPERM_Transform_Set_Const(table, c0, c1, c2);\
grsiVPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\
}/**/
/* grsiVPERM
* Transform State
* inputs:
* a0-a3 = state
* table = transformation table to use
* t* = clobbers
* outputs:
* a0-a3 = transformed state
* */
#define grsiVPERM_Transform_State(a0, a1, a2, a3, table, t0, t1, t2, t3, c0, c1, c2){\
grsiVPERM_Transform_Set_Const(table, c0, c1, c2);\
grsiVPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\
grsiVPERM_Transform_No_Const(a2, a3, t0, t1, t2, t3, c0, c1, c2);\
}/**/
/* grsiVPERM
* Add Constant to State
* inputs:
* a0-a7 = state
* constant = constant to add
* t0 = clobber
* outputs:
* a0-a7 = state + constant
* */
#define grsiVPERM_Add_Constant(a0, a1, a2, a3, a4, a5, a6, a7, constant, t0){\
t0 = constant;\
a0 = _mm_xor_si128(a0, t0);\
a1 = _mm_xor_si128(a1, t0);\
a2 = _mm_xor_si128(a2, t0);\
a3 = _mm_xor_si128(a3, t0);\
a4 = _mm_xor_si128(a4, t0);\
a5 = _mm_xor_si128(a5, t0);\
a6 = _mm_xor_si128(a6, t0);\
a7 = _mm_xor_si128(a7, t0);\
}/**/
/* grsiVPERM
* Set Substitute Core Constants
* */
#define grsiVPERM_Substitute_Core_Set_Const(c0, c1, c2){\
grsiVPERM_Transform_Set_Const(grsiVPERM_INV, c0, c1, c2);\
}/**/
/* grsiVPERM
* Substitute Core
* first part of sbox inverse computation
* this function is derived from:
* vperm and aes_ni implementations of hash function Grostl
* by Cagdas CALIK
* inputs:
* a0 = 1 row
* t*, c* = clobbers
* outputs:
* b0a, b0b = inputs for lookup step
* */
#define grsiVPERM_Substitute_Core(a0, b0a, b0b, t0, t1, c0, c1, c2){\
t0 = c0;\
t0 = _mm_andnot_si128(t0, a0);\
t0 = _mm_srli_epi32(t0, 4);\
a0 = _mm_and_si128(a0, c0);\
b0a = c1;\
b0a = _mm_shuffle_epi8(b0a, a0);\
a0 = _mm_xor_si128(a0, t0);\
b0b = c2;\
b0b = _mm_shuffle_epi8(b0b, t0);\
b0b = _mm_xor_si128(b0b, b0a);\
t1 = c2;\
t1 = _mm_shuffle_epi8(t1, a0);\
t1 = _mm_xor_si128(t1, b0a);\
b0a = c2;\
b0a = _mm_shuffle_epi8(b0a, b0b);\
b0a = _mm_xor_si128(b0a, a0);\
b0b = c2;\
b0b = _mm_shuffle_epi8(b0b, t1);\
b0b = _mm_xor_si128(b0b, t0);\
}/**/
/* grsiVPERM
* Lookup
* second part of sbox inverse computation
* this function is derived from:
* vperm and aes_ni implementations of hash function Grostl
* by Cagdas CALIK
* inputs:
* a0a, a0b = output of Substitution Core
* table = lookup table to use (*1 / *2 / *4)
* t0 = clobber
* outputs:
* b0 = output of sbox + multiplication
* */
#define grsiVPERM_Lookup(a0a, a0b, table, b0, t0){\
b0 = ((__m128i*) table )[0];\
t0 = ((__m128i*) table )[1];\
b0 = _mm_shuffle_epi8(b0, a0b);\
t0 = _mm_shuffle_epi8(t0, a0a);\
b0 = _mm_xor_si128(b0, t0);\
}/**/
/* grsiVPERM
* SubBytes and *2 / *4
* this function is derived from:
* Constant-time SSSE3 AES core implementation
* by Mike Hamburg
* and
* vperm and aes_ni implementations of hash function Grostl
* by Cagdas CALIK
* inputs:
* a0-a7 = state
* t*, c* = clobbers
* outputs:
* a0-a7 = state * 4
* c2 = row0 * 2 -> b0
* c1 = row7 * 2 -> b3
* c0 = row7 * 1 -> b4
* t2 = row4 * 1 -> b7
* TEMP_MUL1 = row(i) * 1
* TEMP_MUL2 = row(i) * 2
*
* call:grsiVPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7) */
#define grsiVPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, t0, t1, t3, t4, c2, c1, c0, t2){\
/* set Constants */\
grsiVPERM_Substitute_Core_Set_Const(c0, c1, c2);\
/* row 1 */\
grsiVPERM_Substitute_Core(a1, t0, t1, t3, t4, c0, c1, c2);\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB1, t2, t4);\
TEMP_MUL1[1] = t2;\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB2, t3, t4);\
TEMP_MUL2[1] = t3;\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB4, a1, t4);\
/* --- */\
/* row 2 */\
grsiVPERM_Substitute_Core(a2, t0, t1, t3, t4, c0, c1, c2);\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB1, t2, t4);\
TEMP_MUL1[2] = t2;\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB2, t3, t4);\
TEMP_MUL2[2] = t3;\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB4, a2, t4);\
/* --- */\
/* row 3 */\
grsiVPERM_Substitute_Core(a3, t0, t1, t3, t4, c0, c1, c2);\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB1, t2, t4);\
TEMP_MUL1[3] = t2;\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB2, t3, t4);\
TEMP_MUL2[3] = t3;\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB4, a3, t4);\
/* --- */\
/* row 5 */\
grsiVPERM_Substitute_Core(a5, t0, t1, t3, t4, c0, c1, c2);\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB1, t2, t4);\
TEMP_MUL1[5] = t2;\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB2, t3, t4);\
TEMP_MUL2[5] = t3;\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB4, a5, t4);\
/* --- */\
/* row 6 */\
grsiVPERM_Substitute_Core(a6, t0, t1, t3, t4, c0, c1, c2);\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB1, t2, t4);\
TEMP_MUL1[6] = t2;\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB2, t3, t4);\
TEMP_MUL2[6] = t3;\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB4, a6, t4);\
/* --- */\
/* row 7 */\
grsiVPERM_Substitute_Core(a7, t0, t1, t3, t4, c0, c1, c2);\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB1, t2, t4);\
TEMP_MUL1[7] = t2;\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB2, c1, t4); /*c1 -> b3*/\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB4, a7, t4);\
/* --- */\
/* row 4 */\
grsiVPERM_Substitute_Core(a4, t0, t1, t3, t4, c0, (grsiVPERM_INV[0]), c2);\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB1, t2, t4); /*t2 -> b7*/\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB2, t3, t4);\
TEMP_MUL2[4] = t3;\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB4, a4, t4);\
/* --- */\
/* row 0 */\
grsiVPERM_Substitute_Core(a0, t0, t1, t3, t4, c0, (grsiVPERM_INV[0]), c2);\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB1, c0, t4); /*c0 -> b4*/\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB2, c2, t4); /*c2 -> b0*/\
TEMP_MUL2[0] = c2;\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB4, a0, t4);\
/* --- */\
}/**/
/* Optimized grsiMixBytes
* inputs:
* a0-a7 = (row0-row7) * 4
* b0 = row0 * 2
* b3 = row7 * 2
* b4 = row7 * 1
* b7 = row4 * 1
* all *1 and *2 values must also be in TEMP_MUL1, TEMP_MUL2
* output: b0-b7
* */
#define grsiMixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* save one value */\
TEMP_MUL4 = a3;\
/* 1 */\
b1 = a0;\
b1 = _mm_xor_si128(b1, a5);\
b1 = _mm_xor_si128(b1, b4); /* -> helper! */\
b1 = _mm_xor_si128(b1, (TEMP_MUL2[3]));\
b2 = b1;\
\
/* 2 */\
b5 = a1;\
b5 = _mm_xor_si128(b5, a4);\
b5 = _mm_xor_si128(b5, b7); /* -> helper! */\
b5 = _mm_xor_si128(b5, b3); /* -> helper! */\
b6 = b5;\
\
/* 4 */\
b7 = _mm_xor_si128(b7, a6);\
/*b7 = _mm_xor_si128(b7, (TEMP_MUL1[4])); -> helper! */\
b7 = _mm_xor_si128(b7, (TEMP_MUL1[6]));\
b7 = _mm_xor_si128(b7, (TEMP_MUL2[1]));\
b7 = _mm_xor_si128(b7, b3); /* -> helper! */\
b2 = _mm_xor_si128(b2, b7);\
\
/* 3 */\
b0 = _mm_xor_si128(b0, a7);\
b0 = _mm_xor_si128(b0, (TEMP_MUL1[5]));\
b0 = _mm_xor_si128(b0, (TEMP_MUL1[7]));\
/*b0 = _mm_xor_si128(b0, (TEMP_MUL2[0])); -> helper! */\
b0 = _mm_xor_si128(b0, (TEMP_MUL2[2]));\
b3 = b0;\
b1 = _mm_xor_si128(b1, b0);\
b0 = _mm_xor_si128(b0, b7); /* moved from 4 */\
\
/* 5 */\
b4 = _mm_xor_si128(b4, a2);\
/*b4 = _mm_xor_si128(b4, (TEMP_MUL1[0])); -> helper! */\
b4 = _mm_xor_si128(b4, (TEMP_MUL1[2]));\
b4 = _mm_xor_si128(b4, (TEMP_MUL2[3]));\
b4 = _mm_xor_si128(b4, (TEMP_MUL2[5]));\
b3 = _mm_xor_si128(b3, b4);\
b6 = _mm_xor_si128(b6, b4);\
\
/* 6 */\
a3 = _mm_xor_si128(a3, (TEMP_MUL1[1]));\
a3 = _mm_xor_si128(a3, (TEMP_MUL1[3]));\
a3 = _mm_xor_si128(a3, (TEMP_MUL2[4]));\
a3 = _mm_xor_si128(a3, (TEMP_MUL2[6]));\
b4 = _mm_xor_si128(b4, a3);\
b5 = _mm_xor_si128(b5, a3);\
b7 = _mm_xor_si128(b7, a3);\
\
/* 7 */\
a1 = _mm_xor_si128(a1, (TEMP_MUL1[1]));\
a1 = _mm_xor_si128(a1, (TEMP_MUL2[4]));\
b2 = _mm_xor_si128(b2, a1);\
b3 = _mm_xor_si128(b3, a1);\
\
/* 8 */\
a5 = _mm_xor_si128(a5, (TEMP_MUL1[5]));\
a5 = _mm_xor_si128(a5, (TEMP_MUL2[0]));\
b6 = _mm_xor_si128(b6, a5);\
b7 = _mm_xor_si128(b7, a5);\
\
/* 9 */\
a3 = TEMP_MUL1[2];\
a3 = _mm_xor_si128(a3, (TEMP_MUL2[5]));\
b0 = _mm_xor_si128(b0, a3);\
b5 = _mm_xor_si128(b5, a3);\
\
/* 10 */\
a1 = TEMP_MUL1[6];\
a1 = _mm_xor_si128(a1, (TEMP_MUL2[1]));\
b1 = _mm_xor_si128(b1, a1);\
b4 = _mm_xor_si128(b4, a1);\
\
/* 11 */\
a5 = TEMP_MUL1[3];\
a5 = _mm_xor_si128(a5, (TEMP_MUL2[6]));\
b1 = _mm_xor_si128(b1, a5);\
b6 = _mm_xor_si128(b6, a5);\
\
/* 12 */\
a3 = TEMP_MUL1[7];\
a3 = _mm_xor_si128(a3, (TEMP_MUL2[2]));\
b2 = _mm_xor_si128(b2, a3);\
b5 = _mm_xor_si128(b5, a3);\
\
/* 13 */\
b0 = _mm_xor_si128(b0, (TEMP_MUL4));\
b0 = _mm_xor_si128(b0, a4);\
b1 = _mm_xor_si128(b1, a4);\
b3 = _mm_xor_si128(b3, a6);\
b4 = _mm_xor_si128(b4, a0);\
b4 = _mm_xor_si128(b4, a7);\
b5 = _mm_xor_si128(b5, a0);\
b7 = _mm_xor_si128(b7, a2);\
}/**/
/*
grsiSUBSH_MASK[0] = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x07060504, 0x03020100);\
grsiSUBSH_MASK[1] = _mm_set_epi32(0x000f0e0d, 0x0c0b0a09, 0x08070605, 0x04030201);\
grsiSUBSH_MASK[2] = _mm_set_epi32(0x01000f0e, 0x0d0c0b0a, 0x09080706, 0x05040302);\
grsiSUBSH_MASK[3] = _mm_set_epi32(0x0201000f, 0x0e0d0c0b, 0x0a090807, 0x06050403);\
grsiSUBSH_MASK[4] = _mm_set_epi32(0x03020100, 0x0f0e0d0c, 0x0b0a0908, 0x07060504);\
grsiSUBSH_MASK[5] = _mm_set_epi32(0x04030201, 0x000f0e0d, 0x0c0b0a09, 0x08070605);\
grsiSUBSH_MASK[6] = _mm_set_epi32(0x05040302, 0x01000f0e, 0x0d0c0b0a, 0x09080706);\
grsiSUBSH_MASK[7] = _mm_set_epi32(0x0a090807, 0x06050403, 0x0201000f, 0x0e0d0c0b);\
*/
#define grsiSET_CONSTANTS(){\
grsiSET_SHARED_CONSTANTS();\
grsiALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\
for(i = 0; i < grsiROUNDS1024; i++)\
{\
grsiROUND_CONST_P[i] = _mm_set_epi32(0xf0e0d0c0 ^ (i * 0x01010101), 0xb0a09080 ^ (i * 0x01010101), 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
grsiROUND_CONST_Q[i] = _mm_set_epi32(0x0f1f2f3f ^ (i * 0x01010101), 0x4f5f6f7f ^ (i * 0x01010101), 0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101));\
}\
}/**/
/* one round
* a0-a7 = input rows
* b0-b7 = output rows
*/
#define grsiSUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* SubBytes + Multiplication */\
grsiVPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\
/* grsiMixBytes */\
grsiMixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
}/**/
#define grsiROUNDS_P(){\
u32 round_counter;\
for(round_counter = 0; round_counter < 14; round_counter+=2) {\
/* AddRoundConstant P1024 */\
xmm8 = _mm_xor_si128(xmm8, (grsiROUND_CONST_P[round_counter]));\
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm8 = _mm_shuffle_epi8(xmm8, (grsiSUBSH_MASK[0]));\
xmm9 = _mm_shuffle_epi8(xmm9, (grsiSUBSH_MASK[1]));\
xmm10 = _mm_shuffle_epi8(xmm10, (grsiSUBSH_MASK[2]));\
xmm11 = _mm_shuffle_epi8(xmm11, (grsiSUBSH_MASK[3]));\
xmm12 = _mm_shuffle_epi8(xmm12, (grsiSUBSH_MASK[4]));\
xmm13 = _mm_shuffle_epi8(xmm13, (grsiSUBSH_MASK[5]));\
xmm14 = _mm_shuffle_epi8(xmm14, (grsiSUBSH_MASK[6]));\
xmm15 = _mm_shuffle_epi8(xmm15, (grsiSUBSH_MASK[7]));\
/* SubBytes + grsiMixBytes */\
grsiSUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
grsiVPERM_Add_Constant(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, grsiALL_15, xmm8);\
\
/* AddRoundConstant P1024 */\
xmm0 = _mm_xor_si128(xmm0, (grsiROUND_CONST_P[round_counter+1]));\
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm0 = _mm_shuffle_epi8(xmm0, (grsiSUBSH_MASK[0]));\
xmm1 = _mm_shuffle_epi8(xmm1, (grsiSUBSH_MASK[1]));\
xmm2 = _mm_shuffle_epi8(xmm2, (grsiSUBSH_MASK[2]));\
xmm3 = _mm_shuffle_epi8(xmm3, (grsiSUBSH_MASK[3]));\
xmm4 = _mm_shuffle_epi8(xmm4, (grsiSUBSH_MASK[4]));\
xmm5 = _mm_shuffle_epi8(xmm5, (grsiSUBSH_MASK[5]));\
xmm6 = _mm_shuffle_epi8(xmm6, (grsiSUBSH_MASK[6]));\
xmm7 = _mm_shuffle_epi8(xmm7, (grsiSUBSH_MASK[7]));\
/* SubBytes + grsiMixBytes */\
grsiSUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
grsiVPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, grsiALL_15, xmm0);\
}\
}/**/
/* grsiROUNDS_Q: all 14 rounds of Groestl-1024 permutation Q.
 * Same two-rounds-per-iteration structure as grsiROUNDS_P, but each
 * register is first XORed with grsiALL_FF (plus the per-round Q
 * constant on the last register) and a different grsiSUBSH_MASK
 * ordering implements Q's ShiftBytes. The grsiVPERM_Add_Constant
 * calls before and after the loop compensate the VPERM representation
 * once, where grsiROUNDS_P folds this into every iteration.
 * NOTE(review): the "= 0" initializer on round_counter is redundant —
 * the for statement reinitializes it; grsiROUNDS_P declares it
 * without one. */
#define grsiROUNDS_Q(){\
grsiVPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, grsiALL_15, xmm1);\
u32 round_counter = 0;\
for(round_counter = 0; round_counter < 14; round_counter+=2) {\
/* AddRoundConstant Q1024 */\
xmm1 = grsiALL_FF;\
xmm8 = _mm_xor_si128(xmm8, xmm1);\
xmm9 = _mm_xor_si128(xmm9, xmm1);\
xmm10 = _mm_xor_si128(xmm10, xmm1);\
xmm11 = _mm_xor_si128(xmm11, xmm1);\
xmm12 = _mm_xor_si128(xmm12, xmm1);\
xmm13 = _mm_xor_si128(xmm13, xmm1);\
xmm14 = _mm_xor_si128(xmm14, xmm1);\
xmm15 = _mm_xor_si128(xmm15, (grsiROUND_CONST_Q[round_counter]));\
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm8 = _mm_shuffle_epi8(xmm8, (grsiSUBSH_MASK[1]));\
xmm9 = _mm_shuffle_epi8(xmm9, (grsiSUBSH_MASK[3]));\
xmm10 = _mm_shuffle_epi8(xmm10, (grsiSUBSH_MASK[5]));\
xmm11 = _mm_shuffle_epi8(xmm11, (grsiSUBSH_MASK[7]));\
xmm12 = _mm_shuffle_epi8(xmm12, (grsiSUBSH_MASK[0]));\
xmm13 = _mm_shuffle_epi8(xmm13, (grsiSUBSH_MASK[2]));\
xmm14 = _mm_shuffle_epi8(xmm14, (grsiSUBSH_MASK[4]));\
xmm15 = _mm_shuffle_epi8(xmm15, (grsiSUBSH_MASK[6]));\
/* SubBytes + grsiMixBytes */\
grsiSUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
\
/* AddRoundConstant Q1024 */\
xmm9 = grsiALL_FF;\
xmm0 = _mm_xor_si128(xmm0, xmm9);\
xmm1 = _mm_xor_si128(xmm1, xmm9);\
xmm2 = _mm_xor_si128(xmm2, xmm9);\
xmm3 = _mm_xor_si128(xmm3, xmm9);\
xmm4 = _mm_xor_si128(xmm4, xmm9);\
xmm5 = _mm_xor_si128(xmm5, xmm9);\
xmm6 = _mm_xor_si128(xmm6, xmm9);\
xmm7 = _mm_xor_si128(xmm7, (grsiROUND_CONST_Q[round_counter+1]));\
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm0 = _mm_shuffle_epi8(xmm0, (grsiSUBSH_MASK[1]));\
xmm1 = _mm_shuffle_epi8(xmm1, (grsiSUBSH_MASK[3]));\
xmm2 = _mm_shuffle_epi8(xmm2, (grsiSUBSH_MASK[5]));\
xmm3 = _mm_shuffle_epi8(xmm3, (grsiSUBSH_MASK[7]));\
xmm4 = _mm_shuffle_epi8(xmm4, (grsiSUBSH_MASK[0]));\
xmm5 = _mm_shuffle_epi8(xmm5, (grsiSUBSH_MASK[2]));\
xmm6 = _mm_shuffle_epi8(xmm6, (grsiSUBSH_MASK[4]));\
xmm7 = _mm_shuffle_epi8(xmm7, (grsiSUBSH_MASK[6]));\
/* SubBytes + grsiMixBytes*/ \
grsiSUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
}\
grsiVPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, grsiALL_15, xmm1);\
}/**/
/* Matrix Transpose
 * input is a 1024-bit state with two columns in one xmm
 * output is a 1024-bit state with two rows in one xmm
 * inputs: i0-i7
 * outputs: i0-i7
 * clobbers: t0-t7
 *
 * Implementation: a per-register byte shuffle (grsiTRANSP_MASK)
 * followed by a three-level interleave network — 16-bit, 32-bit, then
 * 64-bit unpacks. Statement order is significant: the t-register
 * snapshots are taken at precise points so both unpack halves of each
 * source survive.
 */
#define grsiMatrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\
t0 = grsiTRANSP_MASK;\
\
i6 = _mm_shuffle_epi8(i6, t0);\
i0 = _mm_shuffle_epi8(i0, t0);\
i1 = _mm_shuffle_epi8(i1, t0);\
i2 = _mm_shuffle_epi8(i2, t0);\
i3 = _mm_shuffle_epi8(i3, t0);\
t1 = i2;\
i4 = _mm_shuffle_epi8(i4, t0);\
i5 = _mm_shuffle_epi8(i5, t0);\
t2 = i4;\
t3 = i6;\
i7 = _mm_shuffle_epi8(i7, t0);\
\
/* continue with unpack using 4 temp registers */\
t0 = i0;\
t2 = _mm_unpackhi_epi16(t2, i5);\
i4 = _mm_unpacklo_epi16(i4, i5);\
t3 = _mm_unpackhi_epi16(t3, i7);\
i6 = _mm_unpacklo_epi16(i6, i7);\
t0 = _mm_unpackhi_epi16(t0, i1);\
t1 = _mm_unpackhi_epi16(t1, i3);\
i2 = _mm_unpacklo_epi16(i2, i3);\
i0 = _mm_unpacklo_epi16(i0, i1);\
\
/* shuffle with immediate */\
t0 = _mm_shuffle_epi32(t0, 216);\
t1 = _mm_shuffle_epi32(t1, 216);\
t2 = _mm_shuffle_epi32(t2, 216);\
t3 = _mm_shuffle_epi32(t3, 216);\
i0 = _mm_shuffle_epi32(i0, 216);\
i2 = _mm_shuffle_epi32(i2, 216);\
i4 = _mm_shuffle_epi32(i4, 216);\
i6 = _mm_shuffle_epi32(i6, 216);\
\
/* continue with unpack */\
t4 = i0;\
i0 = _mm_unpacklo_epi32(i0, i2);\
t4 = _mm_unpackhi_epi32(t4, i2);\
t5 = t0;\
t0 = _mm_unpacklo_epi32(t0, t1);\
t5 = _mm_unpackhi_epi32(t5, t1);\
t6 = i4;\
i4 = _mm_unpacklo_epi32(i4, i6);\
t7 = t2;\
t6 = _mm_unpackhi_epi32(t6, i6);\
i2 = t0;\
t2 = _mm_unpacklo_epi32(t2, t3);\
i3 = t0;\
t7 = _mm_unpackhi_epi32(t7, t3);\
\
/* there are now 2 rows in each xmm */\
/* unpack to get 1 row of CV in each xmm */\
i1 = i0;\
i1 = _mm_unpackhi_epi64(i1, i4);\
i0 = _mm_unpacklo_epi64(i0, i4);\
i4 = t4;\
i3 = _mm_unpackhi_epi64(i3, t2);\
i5 = t4;\
i2 = _mm_unpacklo_epi64(i2, t2);\
i6 = t5;\
i5 = _mm_unpackhi_epi64(i5, t6);\
i7 = t5;\
i4 = _mm_unpacklo_epi64(i4, t6);\
i7 = _mm_unpackhi_epi64(i7, t7);\
i6 = _mm_unpacklo_epi64(i6, t7);\
/* transpose done */\
}/**/
/* Matrix Transpose Inverse
 * input is a 1024-bit state with two rows in one xmm
 * output is a 1024-bit state with two columns in one xmm
 * inputs: i0-i7
 * outputs: (i0, o0, i1, i3, o1, o2, i5, i7)
 * clobbers: t0-t4
 *
 * Reverse network of grsiMatrix_Transpose: 64-bit unpacks first, then
 * the grsiTRANSP_MASK byte shuffle, then 16-bit and 32-bit
 * interleaves. Note the scattered output register list above — the
 * final columns do NOT land in i0-i7 in order.
 */
#define grsiMatrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\
/* transpose matrix to get output format */\
o1 = i0;\
i0 = _mm_unpacklo_epi64(i0, i1);\
o1 = _mm_unpackhi_epi64(o1, i1);\
t0 = i2;\
i2 = _mm_unpacklo_epi64(i2, i3);\
t0 = _mm_unpackhi_epi64(t0, i3);\
t1 = i4;\
i4 = _mm_unpacklo_epi64(i4, i5);\
t1 = _mm_unpackhi_epi64(t1, i5);\
t2 = i6;\
o0 = grsiTRANSP_MASK;\
i6 = _mm_unpacklo_epi64(i6, i7);\
t2 = _mm_unpackhi_epi64(t2, i7);\
/* load transpose mask into a register, because it will be used 8 times */\
i0 = _mm_shuffle_epi8(i0, o0);\
i2 = _mm_shuffle_epi8(i2, o0);\
i4 = _mm_shuffle_epi8(i4, o0);\
i6 = _mm_shuffle_epi8(i6, o0);\
o1 = _mm_shuffle_epi8(o1, o0);\
t0 = _mm_shuffle_epi8(t0, o0);\
t1 = _mm_shuffle_epi8(t1, o0);\
t2 = _mm_shuffle_epi8(t2, o0);\
/* continue with unpack using 4 temp registers */\
t3 = i4;\
o2 = o1;\
o0 = i0;\
t4 = t1;\
\
t3 = _mm_unpackhi_epi16(t3, i6);\
i4 = _mm_unpacklo_epi16(i4, i6);\
o0 = _mm_unpackhi_epi16(o0, i2);\
i0 = _mm_unpacklo_epi16(i0, i2);\
o2 = _mm_unpackhi_epi16(o2, t0);\
o1 = _mm_unpacklo_epi16(o1, t0);\
t4 = _mm_unpackhi_epi16(t4, t2);\
t1 = _mm_unpacklo_epi16(t1, t2);\
/* shuffle with immediate */\
i4 = _mm_shuffle_epi32(i4, 216);\
t3 = _mm_shuffle_epi32(t3, 216);\
o1 = _mm_shuffle_epi32(o1, 216);\
o2 = _mm_shuffle_epi32(o2, 216);\
i0 = _mm_shuffle_epi32(i0, 216);\
o0 = _mm_shuffle_epi32(o0, 216);\
t1 = _mm_shuffle_epi32(t1, 216);\
t4 = _mm_shuffle_epi32(t4, 216);\
/* continue with unpack */\
i1 = i0;\
i3 = o0;\
i5 = o1;\
i7 = o2;\
i0 = _mm_unpacklo_epi32(i0, i4);\
i1 = _mm_unpackhi_epi32(i1, i4);\
o0 = _mm_unpacklo_epi32(o0, t3);\
i3 = _mm_unpackhi_epi32(i3, t3);\
o1 = _mm_unpacklo_epi32(o1, t1);\
i5 = _mm_unpackhi_epi32(i5, t1);\
o2 = _mm_unpacklo_epi32(o2, t4);\
i7 = _mm_unpackhi_epi32(i7, t4);\
/* transpose done */\
}/**/
/* transform round constants into grsiVPERM mode */
/* Rewrites grsiROUND_CONST_P[i], [j] and grsiROUND_CONST_Q[i], [j] in
 * place; the Q constants are additionally XORed with grsiALL_15.
 * Clobbers xmm0-xmm3 (values) and xmm4-xmm10 (temporaries of
 * grsiVPERM_Transform_State). */
#define grsiVPERM_Transform_RoundConst_CNT2(i, j){\
xmm0 = grsiROUND_CONST_P[i];\
xmm1 = grsiROUND_CONST_P[j];\
xmm2 = grsiROUND_CONST_Q[i];\
xmm3 = grsiROUND_CONST_Q[j];\
grsiVPERM_Transform_State(xmm0, xmm1, xmm2, xmm3, grsiVPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\
xmm2 = _mm_xor_si128(xmm2, (grsiALL_15));\
xmm3 = _mm_xor_si128(xmm3, (grsiALL_15));\
grsiROUND_CONST_P[i] = xmm0;\
grsiROUND_CONST_P[j] = xmm1;\
grsiROUND_CONST_Q[i] = xmm2;\
grsiROUND_CONST_Q[j] = xmm3;\
}/**/
/* transform round constants into grsiVPERM mode */
/* Converts all 14 P/Q round-constant pairs plus grsiALL_FF in place.
 * NOTE(review): this mutates the global tables; running it twice
 * without reloading the raw constants (grsiSET_CONSTANTS) would
 * double-transform them — verify callers always pair this with a
 * fresh grsiSET_CONSTANTS(), as grsiInit appears to do. */
#define grsiVPERM_Transform_RoundConst(){\
grsiVPERM_Transform_RoundConst_CNT2(0, 1);\
grsiVPERM_Transform_RoundConst_CNT2(2, 3);\
grsiVPERM_Transform_RoundConst_CNT2(4, 5);\
grsiVPERM_Transform_RoundConst_CNT2(6, 7);\
grsiVPERM_Transform_RoundConst_CNT2(8, 9);\
grsiVPERM_Transform_RoundConst_CNT2(10, 11);\
grsiVPERM_Transform_RoundConst_CNT2(12, 13);\
xmm0 = grsiALL_FF;\
grsiVPERM_Transform(xmm0, xmm1, grsiVPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\
xmm0 = _mm_xor_si128(xmm0, (grsiALL_15));\
grsiALL_FF = xmm0;\
}/**/
/* grsiINIT: prepare the 128-byte chaining value h for hashing —
 * transforms the round-constant tables into VPERM mode, converts the
 * IV from column ordering into row ordering (VPERM transform + matrix
 * transpose), and writes the transposed IV back to h. */
IFUN void grsiINIT(u64* h)
#if !defined(DECLARE_IFUN)
;
#else
{
__m128i* const chaining = (__m128i*) h;
/* NOTE(review): static locals (register temporaries inherited from
   the original asm implementation) make this function non-reentrant
   and not thread-safe — confirm callers serialize access. */
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
/* transform round constants into grsiVPERM mode */
grsiVPERM_Transform_RoundConst();
/* load IV into registers xmm8 - xmm15 */
xmm8 = chaining[0];
xmm9 = chaining[1];
xmm10 = chaining[2];
xmm11 = chaining[3];
xmm12 = chaining[4];
xmm13 = chaining[5];
xmm14 = chaining[6];
xmm15 = chaining[7];
/* transform chaining value from column ordering into row ordering */
grsiVPERM_Transform_State(xmm8, xmm9, xmm10, xmm11, grsiVPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
grsiVPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, grsiVPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
grsiMatrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
/* store transposed IV */
chaining[0] = xmm8;
chaining[1] = xmm9;
chaining[2] = xmm10;
chaining[3] = xmm11;
chaining[4] = xmm12;
chaining[5] = xmm13;
chaining[6] = xmm14;
chaining[7] = xmm15;
}
#endif
/* grsiTF1024: Groestl-1024 compression function.
 * Updates the 128-byte chaining value h with the 128-byte message
 * block m:  h <- P(h ^ m) ^ Q(m) ^ h, computed in row ordering (the
 * message is converted via VPERM transform + matrix transpose on
 * entry; h is assumed to already be in the transposed format produced
 * by grsiINIT). */
IFUN void grsiTF1024(u64* h, u64* m)
#if !defined(DECLARE_IFUN)
;
#else
{
__m128i* const chaining = (__m128i*) h;
__m128i* const message = (__m128i*) m;
/* NOTE(review): static temporaries and spill slots (inherited from
   the original asm code) make this function non-reentrant and not
   thread-safe — confirm callers serialize access. TEMP_MUL1/2/4 are
   referenced inside the MixBytes macros. */
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i TEMP_MUL1[8];
static __m128i TEMP_MUL2[8];
static __m128i TEMP_MUL4;
static __m128i QTEMP[8];
/* load message into registers xmm8 - xmm15 (Q = message) */
xmm8 = message[0];
xmm9 = message[1];
xmm10 = message[2];
xmm11 = message[3];
xmm12 = message[4];
xmm13 = message[5];
xmm14 = message[6];
xmm15 = message[7];
/* transform message M from column ordering into row ordering */
grsiVPERM_Transform_State(xmm8, xmm9, xmm10, xmm11, grsiVPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
grsiVPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, grsiVPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
grsiMatrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
/* store message M (Q input) for later */
QTEMP[0] = xmm8;
QTEMP[1] = xmm9;
QTEMP[2] = xmm10;
QTEMP[3] = xmm11;
QTEMP[4] = xmm12;
QTEMP[5] = xmm13;
QTEMP[6] = xmm14;
QTEMP[7] = xmm15;
/* xor CV to message to get P input */
/* result: CV+M in xmm8...xmm15 */
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
xmm9 = _mm_xor_si128(xmm9, (chaining[1]));
xmm10 = _mm_xor_si128(xmm10, (chaining[2]));
xmm11 = _mm_xor_si128(xmm11, (chaining[3]));
xmm12 = _mm_xor_si128(xmm12, (chaining[4]));
xmm13 = _mm_xor_si128(xmm13, (chaining[5]));
xmm14 = _mm_xor_si128(xmm14, (chaining[6]));
xmm15 = _mm_xor_si128(xmm15, (chaining[7]));
/* compute permutation P */
/* result: P(CV+M) in xmm8...xmm15 */
grsiROUNDS_P();
/* xor CV to P output (feed-forward) */
/* result: P(CV+M)+CV in xmm8...xmm15 */
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
xmm9 = _mm_xor_si128(xmm9, (chaining[1]));
xmm10 = _mm_xor_si128(xmm10, (chaining[2]));
xmm11 = _mm_xor_si128(xmm11, (chaining[3]));
xmm12 = _mm_xor_si128(xmm12, (chaining[4]));
xmm13 = _mm_xor_si128(xmm13, (chaining[5]));
xmm14 = _mm_xor_si128(xmm14, (chaining[6]));
xmm15 = _mm_xor_si128(xmm15, (chaining[7]));
/* store P(CV+M)+CV */
chaining[0] = xmm8;
chaining[1] = xmm9;
chaining[2] = xmm10;
chaining[3] = xmm11;
chaining[4] = xmm12;
chaining[5] = xmm13;
chaining[6] = xmm14;
chaining[7] = xmm15;
/* load message M (Q input) into xmm8-15 */
xmm8 = QTEMP[0];
xmm9 = QTEMP[1];
xmm10 = QTEMP[2];
xmm11 = QTEMP[3];
xmm12 = QTEMP[4];
xmm13 = QTEMP[5];
xmm14 = QTEMP[6];
xmm15 = QTEMP[7];
/* compute permutation Q */
/* result: Q(M) in xmm8...xmm15 */
grsiROUNDS_Q();
/* xor Q output */
/* result: P(CV+M)+CV+Q(M) in xmm8...xmm15 */
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
xmm9 = _mm_xor_si128(xmm9, (chaining[1]));
xmm10 = _mm_xor_si128(xmm10, (chaining[2]));
xmm11 = _mm_xor_si128(xmm11, (chaining[3]));
xmm12 = _mm_xor_si128(xmm12, (chaining[4]));
xmm13 = _mm_xor_si128(xmm13, (chaining[5]));
xmm14 = _mm_xor_si128(xmm14, (chaining[6]));
xmm15 = _mm_xor_si128(xmm15, (chaining[7]));
/* store CV */
chaining[0] = xmm8;
chaining[1] = xmm9;
chaining[2] = xmm10;
chaining[3] = xmm11;
chaining[4] = xmm12;
chaining[5] = xmm13;
chaining[6] = xmm14;
chaining[7] = xmm15;
return;
}
#endif
/* grsiOF1024: output transformation, h <- trunc(P(h) ^ h).
 * Applies permutation P with feed-forward, converts back from row to
 * column ordering, and writes only the last four 128-bit words
 * (chaining[4..7]) — the truncated 512-bit digest half. */
IFUN void grsiOF1024(u64* h)
#if !defined(DECLARE_IFUN)
;
#else
{
__m128i* const chaining = (__m128i*) h;
/* NOTE(review): static temporaries make this non-reentrant — see
   grsiTF1024. */
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i TEMP_MUL1[8];
static __m128i TEMP_MUL2[8];
static __m128i TEMP_MUL4;
/* load CV into registers xmm8 - xmm15 */
xmm8 = chaining[0];
xmm9 = chaining[1];
xmm10 = chaining[2];
xmm11 = chaining[3];
xmm12 = chaining[4];
xmm13 = chaining[5];
xmm14 = chaining[6];
xmm15 = chaining[7];
/* compute permutation P */
/* result: P(CV) in xmm8...xmm15 */
grsiROUNDS_P();
/* xor CV to P output (feed-forward) */
/* result: P(CV)+CV in xmm8...xmm15 */
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
xmm9 = _mm_xor_si128(xmm9, (chaining[1]));
xmm10 = _mm_xor_si128(xmm10, (chaining[2]));
xmm11 = _mm_xor_si128(xmm11, (chaining[3]));
xmm12 = _mm_xor_si128(xmm12, (chaining[4]));
xmm13 = _mm_xor_si128(xmm13, (chaining[5]));
xmm14 = _mm_xor_si128(xmm14, (chaining[6]));
xmm15 = _mm_xor_si128(xmm15, (chaining[7]));
/* transpose CV back from row ordering to column ordering */
/* result: final hash value in xmm0, xmm6, xmm13, xmm15 */
grsiMatrix_Transpose_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm4, xmm0, xmm6, xmm1, xmm2, xmm3, xmm5, xmm7);
grsiVPERM_Transform_State(xmm0, xmm6, xmm13, xmm15, grsiVPERM_OPT, xmm1, xmm2, xmm3, xmm5, xmm7, xmm10, xmm12);
/* we only need to return the truncated half of the state */
chaining[4] = xmm0;
chaining[5] = xmm6;
chaining[6] = xmm13;
chaining[7] = xmm15;
return;
}
#endif

View File

@@ -1,273 +0,0 @@
/* hash.c Aug 2011
*
* Groestl implementation for different versions.
* Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
*
* This code is placed in the public domain
*/
#include "grsi.h"
#include "grsi-asm.h"
/* void grsiInit(grsiState* ctx) { */
/* GRS_I: initialize the shared Groestl-512 context sts_grs in place.
 * Expects sts_grs (a grsiState) and the grsi* constants/macros to be
 * in scope at the expansion site.
 * Cleanup vs the original: the unused "u8 i" and the always-false
 * NULL check were removed — grsichaining/grsibuffer are arrays inside
 * the struct, so their addresses can never be NULL. */
#define GRS_I \
do { \
  grsiState *ctx = &sts_grs; \
 \
  /* set number of state columns and state size depending on \
     variant */ \
  ctx->grsicolumns = grsiCOLS; \
  ctx->grsistatesize = grsiSIZE; \
  ctx->grsiv = LONG; \
 \
  /* load the raw (untransformed) round-constant tables */ \
  grsiSET_CONSTANTS(); \
 \
  /* clear chaining value and data buffer */ \
  memset(ctx->grsichaining, 0, sizeof(u64)*grsiSIZE/8); \
  memset(ctx->grsibuffer, 0, sizeof(grsiBitSequence)*grsiSIZE); \
 \
  /* set initial value: big-endian digest length in the last column */ \
  ctx->grsichaining[ctx->grsicolumns-1] = grsiU64BIG((u64)grsiLENGTH); \
 \
  grsiINIT(ctx->grsichaining); \
 \
  /* reset buffer bookkeeping */ \
  ctx->grsibuf_ptr = 0; \
  ctx->grsiblock_counter = 0; \
  ctx->grsibits_in_last_byte = 0; \
 \
} while (0)
/* Digest up to len bytes of input; only whole grsiSIZE-byte blocks
 * are consumed (callers keep partial blocks buffered). */
void grsiTransform(grsiState *ctx,
                   const u8 *in,
                   unsigned long long len) {
  /* account for every full block about to be digested */
  ctx->grsiblock_counter += len / grsiSIZE;

  /* feed the compression function one block at a time */
  while (len >= grsiSIZE) {
    grsiTF1024((u64*)ctx->grsichaining, (u64*)in);
    in  += grsiSIZE;
    len -= grsiSIZE;
  }

  /* emms: leave the FPU/MMX register state clean afterwards */
  asm volatile ("emms");
}
/* Output transformation: given state h, do h <- P(h)+h
 * (truncation to the digest is handled by the caller). */
void grsiOutputTransformation(grsiState *ctx) {
  u64 *state = (u64*)ctx->grsichaining;

  grsiOF1024(state);

  /* emms: leave the FPU/MMX register state clean afterwards */
  asm volatile ("emms");
}
/* Initialise a Groestl-512 context: select the LONG (1024-bit state)
 * variant, zero the chaining value and buffer, seed the last column
 * with the big-endian digest length, and run grsiINIT to put the IV
 * into the transposed/VPERM representation.
 * Cleanup vs the original: the always-false NULL check was removed —
 * grsichaining/grsibuffer are arrays embedded in the struct, so their
 * addresses can never be NULL. */
void grsiInit(grsiState* ctx) {
  u8 i;

  /* output size (in bits) must be a positive integer less than or
     equal to 512, and divisible by 8 */
  if (grsiLENGTH <= 0 || (grsiLENGTH%8) || grsiLENGTH > 512)
    return;

  /* set number of state columns and state size depending on variant */
  ctx->grsicolumns = grsiCOLS;
  ctx->grsistatesize = grsiSIZE;
  ctx->grsiv = LONG;

  grsiSET_CONSTANTS();

  /* zero chaining value and data buffer */
  for (i=0; i<grsiSIZE/8; i++)
    ctx->grsichaining[i] = 0;
  for (i=0; i<grsiSIZE; i++)
    ctx->grsibuffer[i] = 0;

  /* set initial value: big-endian digest length in the last column */
  ctx->grsichaining[ctx->grsicolumns-1] = grsiU64BIG((u64)grsiLENGTH);

  grsiINIT(ctx->grsichaining);

  /* set other variables */
  ctx->grsibuf_ptr = 0;
  ctx->grsiblock_counter = 0;
  ctx->grsibits_in_last_byte = 0;
}
/* Update the state with databitlen bits of input. Buffers partial
 * blocks; digests full grsiSIZE-byte blocks via grsiTransform. A
 * non-integral number of bytes may only be supplied in the final call.
 * Fix vs the original: a stray debug printf("error\n") fired on the
 * normal path every time a previously-buffered partial block was
 * flushed; it has been removed.
 * NOTE(review): byte counts are tracked in int, so a single update of
 * more than ~2^31 bits would overflow — verify callers never pass
 * such lengths. */
void grsiUpdate(grsiState* ctx,
                const grsiBitSequence* input,
                grsiDataLength databitlen) {
  int index = 0;
  int msglen = (int)(databitlen/8);
  int rem = (int)(databitlen%8);

  /* non-integral number of message bytes can only be supplied in the
     last call to this function */
  if (ctx->grsibits_in_last_byte) return;

  /* if the buffer contains data that has not yet been digested, first
     add data to buffer until full */
  if (ctx->grsibuf_ptr) {
    while (ctx->grsibuf_ptr < ctx->grsistatesize && index < msglen) {
      ctx->grsibuffer[(int)ctx->grsibuf_ptr++] = input[index++];
    }
    if (ctx->grsibuf_ptr < ctx->grsistatesize) {
      /* buffer still not full, return */
      if (rem) {
        ctx->grsibits_in_last_byte = rem;
        ctx->grsibuffer[(int)ctx->grsibuf_ptr++] = input[index];
      }
      return;
    }

    /* digest the now-full buffer */
    ctx->grsibuf_ptr = 0;
    grsiTransform(ctx, ctx->grsibuffer, ctx->grsistatesize);
  }

  /* digest bulk of message */
  grsiTransform(ctx, input+index, msglen-index);
  index += ((msglen-index)/ctx->grsistatesize)*ctx->grsistatesize;

  /* store remaining data in buffer */
  while (index < msglen) {
    ctx->grsibuffer[(int)ctx->grsibuf_ptr++] = input[index++];
  }

  /* if a non-integral number of bytes has been supplied, store the
     remaining bits in the last byte, together with how many there are */
  if (rem) {
    ctx->grsibits_in_last_byte = rem;
    ctx->grsibuffer[(int)ctx->grsibuf_ptr++] = input[index];
  }
}
/* Update the state with exactly 64 bytes (512 bits) of input.
 * Fix vs the original: this was a verbatim copy of grsiUpdate with
 * databitlen hard-coded to 64*8 — including its stray debug
 * printf("error\n"). The duplication is removed by delegating to
 * grsiUpdate, which is behaviorally identical for this input size. */
void grsiUpdateq(grsiState* ctx, const grsiBitSequence* input)
{
  grsiUpdate(ctx, input, (grsiDataLength)(64*8));
}
/* shorthand for the partial-byte bit count tracked in the context */
#define BILB ctx->grsibits_in_last_byte
/* finalise: process remaining data (including padding), perform
   output transformation, and write hash result to 'output'.
   Padding scheme: a single '1' bit, '0' bits up to the length field,
   then the 8-byte (grsiLENGTHFIELDLEN) block counter written
   big-endian (least significant byte last); may require digesting a
   second padding block. The digest is the last grsiLENGTH/8 bytes of
   the state; the context is zeroised afterwards. */
void grsiFinal(grsiState* ctx,
               grsiBitSequence* output) {
int i, j = 0, grsibytelen = grsiLENGTH/8;
u8 *s = (grsiBitSequence*)ctx->grsichaining;
/* pad with '1'-bit and first few '0'-bits */
if (BILB) {
ctx->grsibuffer[(int)ctx->grsibuf_ptr-1] &= ((1<<BILB)-1)<<(8-BILB);
ctx->grsibuffer[(int)ctx->grsibuf_ptr-1] ^= 0x1<<(7-BILB);
BILB = 0;
}
else ctx->grsibuffer[(int)ctx->grsibuf_ptr++] = 0x80;
/* pad with '0'-bits */
if (ctx->grsibuf_ptr > ctx->grsistatesize-grsiLENGTHFIELDLEN) {
/* padding requires two blocks */
while (ctx->grsibuf_ptr < ctx->grsistatesize) {
ctx->grsibuffer[(int)ctx->grsibuf_ptr++] = 0;
}
/* digest first padding block */
grsiTransform(ctx, ctx->grsibuffer, ctx->grsistatesize);
ctx->grsibuf_ptr = 0;
}
while (ctx->grsibuf_ptr < ctx->grsistatesize-grsiLENGTHFIELDLEN) {
ctx->grsibuffer[(int)ctx->grsibuf_ptr++] = 0;
}
/* length padding */
ctx->grsiblock_counter++;
ctx->grsibuf_ptr = ctx->grsistatesize;
while (ctx->grsibuf_ptr > ctx->grsistatesize-grsiLENGTHFIELDLEN) {
ctx->grsibuffer[(int)--ctx->grsibuf_ptr] = (u8)ctx->grsiblock_counter;
ctx->grsiblock_counter >>= 8;
}
/* digest final padding block */
grsiTransform(ctx, ctx->grsibuffer, ctx->grsistatesize);
/* perform output transformation */
grsiOutputTransformation(ctx);
/* store hash result in output (truncated tail of the state) */
for (i = ctx->grsistatesize-grsibytelen; i < ctx->grsistatesize; i++,j++) {
output[j] = s[i];
}
/* zeroise relevant variables and deallocate memory */
for (i = 0; i < ctx->grsicolumns; i++) {
ctx->grsichaining[i] = 0;
}
for (i = 0; i < ctx->grsistatesize; i++) {
ctx->grsibuffer[i] = 0;
}
// free(ctx->grsichaining);
// free(ctx->grsibuffer);
return;
}

View File

@@ -1,79 +0,0 @@
/* hash.h Aug 2011
*
* Groestl implementation for different versions.
* Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
*
* This code is placed in the public domain
*/
/* NOTE(review): guard name __grsi_h uses a leading double underscore,
   which is reserved for the implementation — consider GRSI_H. */
#ifndef __grsi_h
#define __grsi_h
#include <stdio.h>
#include <stdlib.h>
#include "brg_endian.h"
#define NEED_UINT_64T
#include "brg_types.h"
/* digest length in bits; 512 selects the LONG (1024-bit state) variant */
#define grsiLENGTH 512
/* some sizes (number of bytes) */
#define grsiROWS 8
#define grsiLENGTHFIELDLEN grsiROWS
#define grsiCOLS512 8
#define grsiCOLS1024 16
#define grsiSIZE512 (grsiROWS*grsiCOLS512)
#define grsiSIZE1024 (grsiROWS*grsiCOLS1024)
#define grsiROUNDS512 10
#define grsiROUNDS1024 14
#if grsiLENGTH<=256
#define grsiCOLS grsiCOLS512
#define grsiSIZE grsiSIZE512
#define grsiROUNDS grsiROUNDS512
#else
#define grsiCOLS grsiCOLS1024
#define grsiSIZE grsiSIZE1024
#define grsiROUNDS grsiROUNDS1024
#endif
/* 64-bit rotate left */
#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
#define grsiEXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))
#define grsiU64BIG(a) (a)
#endif /* IS_BIG_ENDIAN */
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
#define grsiEXT_BYTE(var,n) ((u8)((u64)(var) >> (8*n)))
/* byte-swap a u64 to big-endian via masked rotates */
#define grsiU64BIG(a) \
((ROTL64(a, 8) & li_64(000000FF000000FF)) | \
(ROTL64(a,24) & li_64(0000FF000000FF00)) | \
(ROTL64(a,40) & li_64(00FF000000FF0000)) | \
(ROTL64(a,56) & li_64(FF000000FF000000)))
#endif /* IS_LITTLE_ENDIAN */
typedef enum { LONG, SHORT } grsiVar;
/* NIST API begin */
typedef unsigned char grsiBitSequence;
typedef unsigned long long grsiDataLength;
typedef struct {
__attribute__ ((aligned (32))) u64 grsichaining[grsiSIZE/8]; /* actual state */
__attribute__ ((aligned (32))) grsiBitSequence grsibuffer[grsiSIZE]; /* data buffer */
u64 grsiblock_counter; /* message block counter */
int grsibuf_ptr; /* data buffer pointer */
int grsibits_in_last_byte; /* no. of message bits in last byte of
data buffer */
int grsicolumns; /* no. of columns in state */
int grsistatesize; /* total no. of bytes in state */
grsiVar grsiv; /* LONG or SHORT */
} grsiState;
void grsiInit(grsiState*);
void grsiUpdate(grsiState*, const grsiBitSequence*, grsiDataLength);
void grsiFinal(grsiState*, grsiBitSequence*);
/* NIST API end */
#endif /* __grsi_h */

File diff suppressed because it is too large Load Diff

View File

@@ -1,247 +0,0 @@
/* hash.c Aug 2011
*
* Groestl implementation for different versions.
* Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
*
* This code is placed in the public domain
*/
#include "grsn-asm.h"
/* Digest up to len bytes of input; only whole grsnSIZE-byte blocks
 * are consumed (callers keep partial blocks buffered). */
void grsnTransform(grsnState *ctx,
                   const u8 *in,
                   unsigned long long len) {
  /* account for every full block about to be digested */
  ctx->block_counter += len / grsnSIZE;

  /* feed the variant-appropriate compression function one block at a time */
  while (len >= grsnSIZE) {
#if grsnLENGTH<=256
    TF512((u64*)ctx->chaining, (u64*)in);
#else
    TF1024((u64*)ctx->chaining, (u64*)in);
#endif
    in  += grsnSIZE;
    len -= grsnSIZE;
  }

  /* emms: leave the FPU/MMX register state clean afterwards */
  asm volatile ("emms");
}
/* Output transformation: given state h, do h <- P(h)+h
 * (variant selected at compile time by grsnLENGTH). */
void grsnOutputTransformation(grsnState *ctx) {
  u64 *state = (u64*)ctx->chaining;

#if (grsnLENGTH <= 256)
  OF512(state);
#else
  OF1024(state);
#endif

  /* emms: leave the FPU/MMX register state clean afterwards */
  asm volatile ("emms");
}
/* Initialise a Groestl context: select the variant from grsnLENGTH,
 * zero the chaining value and buffer, seed the last column with the
 * big-endian digest length, and run INIT.
 * Cleanup vs the original: the always-false NULL check was removed —
 * chaining/buffer are arrays embedded in the struct, so their
 * addresses can never be NULL. */
void grsnInit(grsnState* ctx) {
  u8 i;

  /* output size (in bits) must be a positive integer less than or
     equal to 512, and divisible by 8 */
  if (grsnLENGTH <= 0 || (grsnLENGTH%8) || grsnLENGTH > 512)
    return;

  /* set number of state columns and state size depending on variant */
  ctx->columns = grsnCOLS;
  ctx->statesize = grsnSIZE;
#if (grsnLENGTH <= 256)
  ctx->v = SHORT;
#else
  ctx->v = LONG;
#endif

  SET_CONSTANTS();

  /* zero chaining value and data buffer */
  for (i=0; i<grsnSIZE/8; i++)
    ctx->chaining[i] = 0;
  for (i=0; i<grsnSIZE; i++)
    ctx->buffer[i] = 0;

  /* set initial value: big-endian digest length in the last column */
  ctx->chaining[ctx->columns-1] = U64BIG((u64)grsnLENGTH);

  INIT(ctx->chaining);

  /* set other variables */
  ctx->buf_ptr = 0;
  ctx->block_counter = 0;
  ctx->bits_in_last_byte = 0;
}
/* Update the state with databitlen bits of input. Buffers partial
 * blocks; digests full grsnSIZE-byte blocks via grsnTransform. A
 * non-integral number of bytes may only be supplied in the final call.
 * Fix vs the original: a stray debug printf("error\n") fired on the
 * normal path every time a previously-buffered partial block was
 * flushed; it has been removed. */
void grsnUpdate(grsnState* ctx,
                const BitSequence* input,
                DataLength databitlen) {
  int index = 0;
  int msglen = (int)(databitlen/8);
  int rem = (int)(databitlen%8);

  /* non-integral number of message bytes can only be supplied in the
     last call to this function */
  if (ctx->bits_in_last_byte) return;

  /* if the buffer contains data that has not yet been digested, first
     add data to buffer until full */
  if (ctx->buf_ptr) {
    while (ctx->buf_ptr < ctx->statesize && index < msglen) {
      ctx->buffer[(int)ctx->buf_ptr++] = input[index++];
    }
    if (ctx->buf_ptr < ctx->statesize) {
      /* buffer still not full, return */
      if (rem) {
        ctx->bits_in_last_byte = rem;
        ctx->buffer[(int)ctx->buf_ptr++] = input[index];
      }
      return;
    }

    /* digest the now-full buffer */
    ctx->buf_ptr = 0;
    grsnTransform(ctx, ctx->buffer, ctx->statesize);
  }

  /* digest bulk of message */
  grsnTransform(ctx, input+index, msglen-index);
  index += ((msglen-index)/ctx->statesize)*ctx->statesize;

  /* store remaining data in buffer */
  while (index < msglen) {
    ctx->buffer[(int)ctx->buf_ptr++] = input[index++];
  }

  /* if a non-integral number of bytes has been supplied, store the
     remaining bits in the last byte, together with how many there are */
  if (rem) {
    ctx->bits_in_last_byte = rem;
    ctx->buffer[(int)ctx->buf_ptr++] = input[index];
  }
}
/* Update the state with exactly 64 bytes (512 bits) of input.
 * Fixes vs the original: (1) it was a near-verbatim copy of grsnUpdate
 * with the length hard-coded, including its stray debug
 * printf("error\n") — the duplication is removed by delegating;
 * (2) unlike grsnUpdate (and the grsi variant grsiUpdateq), it was
 * missing the bits_in_last_byte guard, which delegation restores for
 * consistency. */
void grsnUpdateq(grsnState* ctx, const BitSequence* input)
{
  grsnUpdate(ctx, input, (DataLength)(64*8));
}
/* shorthand for the partial-byte bit count tracked in the context */
#define BILB ctx->bits_in_last_byte
/* finalise: process remaining data (including padding), perform
   output transformation, and write hash result to 'output'.
   Padding scheme: a single '1' bit, '0' bits up to the length field,
   then the 8-byte (grsnLENGTHFIELDLEN) block counter written
   big-endian (least significant byte last); may require digesting a
   second padding block. The digest is the last grsnLENGTH/8 bytes of
   the state; the context is zeroised afterwards. */
void grsnFinal(grsnState* ctx,
               BitSequence* output) {
int i, j = 0, grsnbytelen = grsnLENGTH/8;
u8 *s = (BitSequence*)ctx->chaining;
/* pad with '1'-bit and first few '0'-bits */
if (BILB) {
ctx->buffer[(int)ctx->buf_ptr-1] &= ((1<<BILB)-1)<<(8-BILB);
ctx->buffer[(int)ctx->buf_ptr-1] ^= 0x1<<(7-BILB);
BILB = 0;
}
else ctx->buffer[(int)ctx->buf_ptr++] = 0x80;
/* pad with '0'-bits */
if (ctx->buf_ptr > ctx->statesize-grsnLENGTHFIELDLEN) {
/* padding requires two blocks */
while (ctx->buf_ptr < ctx->statesize) {
ctx->buffer[(int)ctx->buf_ptr++] = 0;
}
/* digest first padding block */
grsnTransform(ctx, ctx->buffer, ctx->statesize);
ctx->buf_ptr = 0;
}
while (ctx->buf_ptr < ctx->statesize-grsnLENGTHFIELDLEN) {
ctx->buffer[(int)ctx->buf_ptr++] = 0;
}
/* length padding */
ctx->block_counter++;
ctx->buf_ptr = ctx->statesize;
while (ctx->buf_ptr > ctx->statesize-grsnLENGTHFIELDLEN) {
ctx->buffer[(int)--ctx->buf_ptr] = (u8)ctx->block_counter;
ctx->block_counter >>= 8;
}
/* digest final padding block */
grsnTransform(ctx, ctx->buffer, ctx->statesize);
/* perform output transformation */
grsnOutputTransformation(ctx);
/* store hash result in output (truncated tail of the state) */
for (i = ctx->statesize-grsnbytelen; i < ctx->statesize; i++,j++) {
output[j] = s[i];
}
/* zeroise relevant variables and deallocate memory */
for (i = 0; i < ctx->columns; i++) {
ctx->chaining[i] = 0;
}
for (i = 0; i < ctx->statesize; i++) {
ctx->buffer[i] = 0;
}
// free(ctx->chaining);
// free(ctx->buffer);
return;
}

View File

@@ -1,80 +0,0 @@
/* hash.h Aug 2011
*
* Groestl implementation for different versions.
* Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
*
* This code is placed in the public domain
*/
/* NOTE(review): guard name __grsn_h uses a leading double underscore,
   which is reserved for the implementation — consider GRSN_H. */
#ifndef __grsn_h
#define __grsn_h
#include <stdio.h>
#include <stdlib.h>
#include "brg_endian.h"
#define NEED_UINT_64T
#include "brg_types.h"
/* digest length in bits; <=256 selects the SHORT (512-bit state)
   variant, otherwise the LONG (1024-bit state) variant */
#ifndef grsnLENGTH
#define grsnLENGTH 512
#endif
/* some sizes (number of bytes) */
#define grsnROWS 8
#define grsnLENGTHFIELDLEN grsnROWS
#define grsnCOLS512 8
#define grsnCOLS1024 16
#define grsnSIZE512 (grsnROWS*grsnCOLS512)
#define grsnSIZE1024 (grsnROWS*grsnCOLS1024)
#define grsnROUNDS512 10
#define grsnROUNDS1024 14
#if grsnLENGTH<=256
#define grsnCOLS grsnCOLS512
#define grsnSIZE grsnSIZE512
#define grsnROUNDS grsnROUNDS512
#else
#define grsnCOLS grsnCOLS1024
#define grsnSIZE grsnSIZE1024
#define grsnROUNDS grsnROUNDS1024
#endif
/* 64-bit rotate left */
#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))
#define U64BIG(a) (a)
#endif /* IS_BIG_ENDIAN */
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*n)))
/* byte-swap a u64 to big-endian via masked rotates */
#define U64BIG(a) \
((ROTL64(a, 8) & li_64(000000FF000000FF)) | \
(ROTL64(a,24) & li_64(0000FF000000FF00)) | \
(ROTL64(a,40) & li_64(00FF000000FF0000)) | \
(ROTL64(a,56) & li_64(FF000000FF000000)))
#endif /* IS_LITTLE_ENDIAN */
typedef enum { LONG, SHORT } Var;
/* NIST API begin */
typedef unsigned char BitSequence;
typedef unsigned long long DataLength;
typedef struct {
__attribute__ ((aligned (32))) u64 chaining[grsnSIZE/8]; /* actual state */
__attribute__ ((aligned (32))) BitSequence buffer[grsnSIZE]; /* data buffer */
u64 block_counter; /* message block counter */
int buf_ptr; /* data buffer pointer */
int bits_in_last_byte; /* no. of message bits in last byte of
data buffer */
int columns; /* no. of columns in state */
int statesize; /* total no. of bytes in state */
Var v; /* LONG or SHORT */
} grsnState;
void grsnInit(grsnState*);
void grsnUpdate(grsnState*, const BitSequence*, DataLength);
void grsnFinal(grsnState*, BitSequence*);
#endif /* __grsn_h */

File diff suppressed because it is too large Load Diff

View File

@@ -1,10 +0,0 @@
/* Prototypes for the assembly implementations of the Groestl-1024
   P and Q permutations (mmx variant); x points at the state words —
   presumably transformed in place, verify against the asm source. */
#ifndef GRSOASM_H
#define GRSOASM_H
#include "grso.h"
void grsoP1024ASM (u64 *x) ;
void grsoQ1024ASM (u64 *x) ;
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -1,11 +0,0 @@
/* Prototypes for the assembly implementations of the Groestl-1024
   P and Q permutations (sse variant).
   NOTE(review): this guard name collides with the mmx variant's
   header — harmless while only one variant is compiled in, but the
   two headers can never be included together; verify the build only
   ever selects one. */
#ifndef GRSOASM_H
#define GRSOASM_H
/* really same as the mmx asm.h */
/* made just in case something must be changed */
#include "grso.h"
void grsoP1024ASM (u64 *x) ;
void grsoQ1024ASM (u64 *x) ;
#endif

View File

@@ -1,110 +0,0 @@
/* hash.c January 2011
 *
 * Groestl-512 implementation with inline assembly containing mmx and
 * sse instructions. Optimized for Opteron.
 * Authors: Krystian Matusiewicz and Soeren S. Thomsen
 *
 * This code is placed in the public domain
 */
//#include "grso.h"
//#include "grso-asm.h"
// #include "grsotab.h"
#define DECL_GRS
/* load initial constants */
/* GRS_I: reset the shared state sts_grs for a new hash; expects
   sts_grs and the grso* constants to be in scope at the expansion
   site.
   NOTE(review): the trailing backslash after "while (0);" splices the
   NEXT source line into this macro — harmless while that line is a
   comment (comments are stripped before splicing matters), but
   fragile; verify before reordering the lines that follow. */
#define GRS_I \
do { \
  int i; \
  /* set initial value */ \
  for (i = 0; i < grsoCOLS-1; i++) sts_grs.grsstate[i] = 0; \
  sts_grs.grsstate[grsoCOLS-1] = grsoU64BIG((u64)(8*grsoDIGESTSIZE)); \
 \
  /* set other variables */ \
  sts_grs.grsbuf_ptr = 0; \
  sts_grs.grsblock_counter = 0; \
} while (0); \
/* load hash */
/* GRS_U: absorb the 64-byte buffer 'hash' into sts_grs, buffering any
   partial block in 'hashbuf'; expects hash, hashbuf and sts_grs in
   scope at the expansion site.
   NOTE: the 'continue' below is the macro's early-exit — inside
   do { ... } while (0) it jumps to the (false) loop condition and
   leaves the macro without executing the rest. */
#define GRS_U \
do { \
  unsigned char* in = hash; \
  unsigned long long index = 0; \
 \
  /* if the buffer contains data that has not yet been digested, first \
     add data to buffer until full */ \
  if (sts_grs.grsbuf_ptr) { \
    while (sts_grs.grsbuf_ptr < grsoSIZE && index < 64) { \
      hashbuf[(int)sts_grs.grsbuf_ptr++] = in[index++]; \
    } \
    if (sts_grs.grsbuf_ptr < grsoSIZE) continue; \
 \
    /* digest buffer */ \
    sts_grs.grsbuf_ptr = 0; \
    grsoTransform(&sts_grs, hashbuf, grsoSIZE); \
  } \
 \
  /* digest bulk of message */ \
  grsoTransform(&sts_grs, in+index, 64-index); \
  index += ((64-index)/grsoSIZE)*grsoSIZE; \
 \
  /* store remaining data in buffer */ \
  while (index < 64) { \
    hashbuf[(int)sts_grs.grsbuf_ptr++] = in[index++]; \
  } \
 \
} while (0);
/* groestl512 hash loaded */
/* hash = groestl512(loaded) */
#define GRS_C \
do { \
char *out = hash; \
int i, j = 0; \
unsigned char *s = (unsigned char*)sts_grs.grsstate; \
\
hashbuf[sts_grs.grsbuf_ptr++] = 0x80; \
\
/* pad with '0'-bits */ \
if (sts_grs.grsbuf_ptr > grsoSIZE-grsoLENGTHFIELDLEN) { \
/* padding requires two blocks */ \
while (sts_grs.grsbuf_ptr < grsoSIZE) { \
hashbuf[sts_grs.grsbuf_ptr++] = 0; \
} \
/* digest first padding block */ \
grsoTransform(&sts_grs, hashbuf, grsoSIZE); \
sts_grs.grsbuf_ptr = 0; \
} \
while (sts_grs.grsbuf_ptr < grsoSIZE-grsoLENGTHFIELDLEN) { \
hashbuf[sts_grs.grsbuf_ptr++] = 0; \
} \
\
/* length padding */ \
sts_grs.grsblock_counter++; \
sts_grs.grsbuf_ptr = grsoSIZE; \
while (sts_grs.grsbuf_ptr > grsoSIZE-grsoLENGTHFIELDLEN) { \
hashbuf[--sts_grs.grsbuf_ptr] = (unsigned char)sts_grs.grsblock_counter; \
sts_grs.grsblock_counter >>= 8; \
} \
\
/* digest final padding block */ \
grsoTransform(&sts_grs, hashbuf, grsoSIZE); \
/* perform output transformation */ \
grsoOutputTransformation(&sts_grs); \
\
/* store hash result in output */ \
for (i = grsoSIZE-grsoDIGESTSIZE; i < grsoSIZE; i++,j++) { \
out[j] = s[i]; \
} \
\
/* zeroise relevant variables and deallocate memory */ \
for (i = 0; i < grsoCOLS; i++) { \
sts_grs.grsstate[i] = 0; \
} \
for (i = 0; i < grsoSIZE; i++) { \
hashbuf[i] = 0; \
} \
} while (0);

View File

@@ -1,57 +0,0 @@
/* hash.c January 2011
*
* Groestl-512 implementation with inline assembly containing mmx and
* sse instructions. Optimized for Opteron.
* Authors: Krystian Matusiewicz and Soeren S. Thomsen
*
* This code is placed in the public domain
*/
#include "algo/groestl/sse2/grso-asm.h"
#include "algo/groestl/sse2/grso.h"
#include "algo/groestl/sse2/grsotab.h"
/* digest up to len bytes of input (full blocks only) */
void grsoTransform(grsoState *ctx,
const unsigned char *in,
unsigned long long len) {
u64 y[grsoCOLS+2] __attribute__ ((aligned (16)));
u64 z[grsoCOLS+2] __attribute__ ((aligned (16)));
u64 *m, *h = (u64*)ctx->grsstate;
int i;
/* increment block counter */
ctx->grsblock_counter += len/grsoSIZE;
/* digest message, one block at a time */
for (; len >= grsoSIZE; len -= grsoSIZE, in += grsoSIZE) {
m = (u64*)in;
for (i = 0; i < grsoCOLS; i++) {
y[i] = m[i];
z[i] = m[i] ^ h[i];
}
grsoQ1024ASM(y);
grsoP1024ASM(z);
/* h' == h + Q(m) + P(h+m) */
for (i = 0; i < grsoCOLS; i++) {
h[i] ^= z[i] ^ y[i];
}
}
}
/* given state h, do h <- P(h)+h */
void grsoOutputTransformation(grsoState *ctx) {
u64 z[grsoCOLS] __attribute__ ((aligned (16)));
int j;
for (j = 0; j < grsoCOLS; j++) {
z[j] = ctx->grsstate[j];
}
grsoP1024ASM(z);
for (j = 0; j < grsoCOLS; j++) {
ctx->grsstate[j] ^= z[j];
}
}

View File

@@ -1,62 +0,0 @@
#ifndef __hash_h
#define __hash_h
#include <stdio.h>
#include <stdlib.h>
#include "brg_endian.h"
#include "brg_types.h"
/* some sizes (number of bytes) */
#define grsoROWS 8
#define grsoLENGTHFIELDLEN grsoROWS
#define grsoCOLS 16
#define grsoSIZE (grsoROWS*grsoCOLS)
#define grsoDIGESTSIZE 64
#define grsoROUNDS 14
#define grsoROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&((u64)0xffffffffffffffffULL))
#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
#error
#endif /* IS_BIG_ENDIAN */
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*n)))
#define grsoU64BIG(a) \
((grsoROTL64(a, 8) & ((u64)0x000000ff000000ffULL)) | \
(grsoROTL64(a,24) & ((u64)0x0000ff000000ff00ULL)) | \
(grsoROTL64(a,40) & ((u64)0x00ff000000ff0000ULL)) | \
(grsoROTL64(a,56) & ((u64)0xff000000ff000000ULL)))
#endif /* IS_LITTLE_ENDIAN */
typedef struct {
u64 grsstate[grsoCOLS]; /* actual state */
u64 grsblock_counter; /* message block counter */
int grsbuf_ptr; /* data buffer pointer */
} grsoState;
//extern int grsoInit(grsoState* ctx);
//extern int grsoUpdate(grsoState* ctx, const unsigned char* in,
// unsigned long long len);
//extern int grsoUpdateq(grsoState* ctx, const unsigned char* in);
//extern int grsoFinal(grsoState* ctx,
// unsigned char* out);
//
//extern int grsohash(unsigned char *out,
// const unsigned char *in,
// unsigned long long len);
/* digest up to len bytes of input (full blocks only) */
void grsoTransform( grsoState *ctx, const unsigned char *in,
unsigned long long len );
/* given state h, do h <- P(h)+h */
void grsoOutputTransformation( grsoState *ctx );
int grso_init ( grsoState* sts_grs );
int grso_update ( grsoState* sts_grs, char* hashbuf, char* hash );
int grso_close ( grsoState *sts_grs, char* hashbuf, char* hash );
#endif /* __hash_h */

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

View File

@@ -1,45 +0,0 @@
/*
* file : hash_api.h
* version : 1.0.208
* date : 14.12.2010
*
* Grostl multi-stream bitsliced implementation Hash API
*
* Cagdas Calik
* ccalik@metu.edu.tr
* Institute of Applied Mathematics, Middle East Technical University, Turkey.
*
*/
#ifndef GRSS_API_H
#define GRSS_API_H
#include "sha3_common.h"
#include <tmmintrin.h>
typedef struct
{
__m128i state1[8];
__m128i state2[8];
__m128i state3[8];
__m128i state4[8];
__m128i _Pconst[14][8];
__m128i _Qconst[14][8];
__m128i _shiftconst[8];
unsigned int uHashLength;
unsigned int uBlockLength;
BitSequence buffer[128];
} grssState;
void grssInit(grssState *state, int grssbitlen);
void grssUpdate(grssState *state, const BitSequence *data, DataLength databitlen);
void grssFinal(grssState *state, BitSequence *grssval);
#endif // HASH_API_H

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

View File

@@ -1,202 +0,0 @@
/* hash.c Aug 2011
*
* Groestl implementation for different versions.
* Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
*
* This code is placed in the public domain
*/
#include "grsv.h"
#include "grsv-asm.h"
/* digest up to len bytes of input (full blocks only) */
void grsvTransform(grsvState *ctx,
const u8 *in,
unsigned long long len) {
/* increment block counter */
ctx->grsvblock_counter += len/grsvSIZE;
/* digest message, one block at a time */
for (; len >= grsvSIZE; len -= grsvSIZE, in += grsvSIZE)
#if grsvLENGTH<=256
grsvTF512((u64*)ctx->grsvchaining, (u64*)in);
#else
grsvTF1024((u64*)ctx->grsvchaining, (u64*)in);
#endif
asm volatile ("emms");
}
/* given state h, do h <- P(h)+h */
void grsvOutputTransformation(grsvState *ctx) {
/* determine variant */
#if (grsvLENGTH <= 256)
grsvOF512((u64*)ctx->grsvchaining);
#else
grsvOF1024((u64*)ctx->grsvchaining);
#endif
asm volatile ("emms");
}
/* initialise context */
void grsvInit(grsvState* ctx) {
u8 i = 0;
/* output size (in bits) must be a positive integer less than or
equal to 512, and divisible by 8 */
if (grsvLENGTH <= 0 || (grsvLENGTH%8) || grsvLENGTH > 512)
return;
/* set number of state columns and state size depending on
variant */
ctx->grsvcolumns = grsvCOLS;
ctx->grsvstatesize = grsvSIZE;
#if (grsvLENGTH <= 256)
ctx->grsvv = SHORT;
#else
ctx->grsvv = LONG;
#endif
SET_CONSTANTS();
for (i=0; i<grsvSIZE/8; i++)
ctx->grsvchaining[i] = 0;
for (i=0; i<grsvSIZE; i++)
ctx->grsvbuffer[i] = 0;
if (ctx->grsvchaining == NULL || ctx->grsvbuffer == NULL)
return;
/* set initial value */
ctx->grsvchaining[ctx->grsvcolumns-1] = U64BIG((u64)grsvLENGTH);
grsvINIT(ctx->grsvchaining);
/* set other variables */
ctx->grsvbuf_ptr = 0;
ctx->grsvblock_counter = 0;
ctx->grsvbits_in_last_byte = 0;
return;
}
/* update state with databitlen bits of input */
void grsvUpdate(grsvState* ctx,
const grsvBitSequence* input,
grsvDataLength databitlen) {
int index = 0;
int msglen = (int)(databitlen/8);
int rem = (int)(databitlen%8);
/* non-integral number of message bytes can only be supplied in the
last call to this function */
if (ctx->grsvbits_in_last_byte) return;
/* if the buffer contains data that has not yet been digested, first
add data to buffer until full */
if (ctx->grsvbuf_ptr) {
while (ctx->grsvbuf_ptr < ctx->grsvstatesize && index < msglen) {
ctx->grsvbuffer[(int)ctx->grsvbuf_ptr++] = input[index++];
}
if (ctx->grsvbuf_ptr < ctx->grsvstatesize) {
/* buffer still not full, return */
if (rem) {
ctx->grsvbits_in_last_byte = rem;
ctx->grsvbuffer[(int)ctx->grsvbuf_ptr++] = input[index];
}
return;
}
/* digest buffer */
ctx->grsvbuf_ptr = 0;
printf("error\n");
grsvTransform(ctx, ctx->grsvbuffer, ctx->grsvstatesize);
}
/* digest bulk of message */
grsvTransform(ctx, input+index, msglen-index);
index += ((msglen-index)/ctx->grsvstatesize)*ctx->grsvstatesize;
/* store remaining data in buffer */
while (index < msglen) {
ctx->grsvbuffer[(int)ctx->grsvbuf_ptr++] = input[index++];
}
/* if non-integral number of bytes have been supplied, store
remaining bits in last byte, together with information about
number of bits */
if (rem) {
ctx->grsvbits_in_last_byte = rem;
ctx->grsvbuffer[(int)ctx->grsvbuf_ptr++] = input[index];
}
return;
}
#define BILB ctx->grsvbits_in_last_byte
/* finalise: process remaining data (including padding), perform
output transformation, and write hash result to 'output' */
void grsvFinal(grsvState* ctx,
grsvBitSequence* output) {
int i, j = 0, grsvbytelen = grsvLENGTH/8;
u8 *s = (grsvBitSequence*)ctx->grsvchaining;
/* pad with '1'-bit and first few '0'-bits */
if (BILB) {
ctx->grsvbuffer[(int)ctx->grsvbuf_ptr-1] &= ((1<<BILB)-1)<<(8-BILB);
ctx->grsvbuffer[(int)ctx->grsvbuf_ptr-1] ^= 0x1<<(7-BILB);
BILB = 0;
}
else ctx->grsvbuffer[(int)ctx->grsvbuf_ptr++] = 0x80;
/* pad with '0'-bits */
if (ctx->grsvbuf_ptr > ctx->grsvstatesize-grsvLENGTHFIELDLEN) {
/* padding requires two blocks */
while (ctx->grsvbuf_ptr < ctx->grsvstatesize) {
ctx->grsvbuffer[(int)ctx->grsvbuf_ptr++] = 0;
}
/* digest first padding block */
grsvTransform(ctx, ctx->grsvbuffer, ctx->grsvstatesize);
ctx->grsvbuf_ptr = 0;
}
while (ctx->grsvbuf_ptr < ctx->grsvstatesize-grsvLENGTHFIELDLEN) {
ctx->grsvbuffer[(int)ctx->grsvbuf_ptr++] = 0;
}
/* length padding */
ctx->grsvblock_counter++;
ctx->grsvbuf_ptr = ctx->grsvstatesize;
while (ctx->grsvbuf_ptr > ctx->grsvstatesize-grsvLENGTHFIELDLEN) {
ctx->grsvbuffer[(int)--ctx->grsvbuf_ptr] = (u8)ctx->grsvblock_counter;
ctx->grsvblock_counter >>= 8;
}
/* digest final padding block */
grsvTransform(ctx, ctx->grsvbuffer, ctx->grsvstatesize);
/* perform output transformation */
grsvOutputTransformation(ctx);
/* store hash result in output */
for (i = ctx->grsvstatesize-grsvbytelen; i < ctx->grsvstatesize; i++,j++) {
output[j] = s[i];
}
/* zeroise relevant variables and deallocate memory */
for (i = 0; i < ctx->grsvcolumns; i++) {
ctx->grsvchaining[i] = 0;
}
for (i = 0; i < ctx->grsvstatesize; i++) {
ctx->grsvbuffer[i] = 0;
}
// free(ctx->grsvchaining);
// free(ctx->buffer);
return;
}

View File

@@ -1,77 +0,0 @@
/* hash.h Aug 2011
*
* Groestl implementation for different versions.
* Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
*
* This code is placed in the public domain
*/
#ifndef __grsv_h
#define __grsv_h
#include <stdio.h>
#include <stdlib.h>
#include "brg_endian.h"
#define NEED_UINT_64T
#include "brg_types.h"
#define grsvLENGTH 512
/* some sizes (number of bytes) */
#define grsvROWS 8
#define grsvLENGTHFIELDLEN grsvROWS
#define grsvCOLS512 8
#define grsvCOLS1024 16
#define grsvSIZE512 (grsvROWS*grsvCOLS512)
#define grsvSIZE1024 (grsvROWS*grsvCOLS1024)
#define grsvROUNDS512 10
#define grsvROUNDS1024 14
#if grsvLENGTH<=256
#define grsvCOLS grsvCOLS512
#define grsvSIZE grsvSIZE512
#define grsvROUNDS grsvROUNDS512
#else
#define grsvCOLS grsvCOLS1024
#define grsvSIZE grsvSIZE1024
#define grsvROUNDS grsvROUNDS1024
#endif
#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))
#define U64BIG(a) (a)
#endif /* IS_BIG_ENDIAN */
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*n)))
#define U64BIG(a) \
((ROTL64(a, 8) & li_64(000000FF000000FF)) | \
(ROTL64(a,24) & li_64(0000FF000000FF00)) | \
(ROTL64(a,40) & li_64(00FF000000FF0000)) | \
(ROTL64(a,56) & li_64(FF000000FF000000)))
#endif /* IS_LITTLE_ENDIAN */
typedef enum { LONG, SHORT } grsvVar;
typedef unsigned char grsvBitSequence;
typedef unsigned long long grsvDataLength;
typedef struct {
__attribute__ ((aligned (32))) u64 grsvchaining[grsvSIZE/8]; /* actual state */
__attribute__ ((aligned (32))) grsvBitSequence grsvbuffer[grsvSIZE]; /* data buffer */
u64 grsvblock_counter; /* message block counter */
int grsvbuf_ptr; /* data buffer pointer */
int grsvbits_in_last_byte; /* no. of message bits in last byte of
data buffer */
int grsvcolumns; /* no. of columns in state */
int grsvstatesize; /* total no. of bytes in state */
grsvVar grsvv; /* LONG or SHORT */
} grsvState;
void grsvInit(grsvState*);
void grsvUpdate(grsvState*, const grsvBitSequence*, grsvDataLength);
void grsvFinal(grsvState*, grsvBitSequence*);
#endif /* __grsv_h */

View File

@@ -23,10 +23,7 @@
#include "algo/sha2/sph-sha2.h" #include "algo/sha2/sph-sha2.h"
#include "algo/haval/sph-haval.h" #include "algo/haval/sph-haval.h"
#ifdef NO_AES_NI #ifndef NO_AES_NI
#include "algo/groestl/sse2/grso.h"
#include "algo/groestl/sse2/grso-macro.c"
#else
#include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
#endif #endif
@@ -34,38 +31,31 @@
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/sse2/nist.h"
//#include "algo/blake/sse2/blake.c"
//#include "algo/keccak/sse2/keccak.c"
//#include "algo/bmw/sse2/bmw.c"
//#include "algo/skein/sse2/skein.c"
#include "algo/jh/sse2/jh_sse2_opt64.h" #include "algo/jh/sse2/jh_sse2_opt64.h"
typedef struct { typedef struct {
sph_blake512_context blake1, blake2; sph_blake512_context blake1, blake2;
sph_bmw512_context bmw1, bmw2, bmw3; sph_bmw512_context bmw1, bmw2, bmw3;
sph_skein512_context skein1, skein2; sph_skein512_context skein1, skein2;
sph_jh512_context jh1, jh2; sph_jh512_context jh1, jh2;
sph_keccak512_context keccak1, keccak2; sph_keccak512_context keccak1, keccak2;
// sph_luffa512_context luffa1, luffa2; hashState_luffa luffa1, luffa2;
hashState_luffa luffa1, luffa2; cubehashParam cube;
// sph_cubehash512_context cube1, cube2; sph_shavite512_context shavite1, shavite2;
cubehashParam cube; hashState_sd simd1, simd2;
sph_shavite512_context shavite1, shavite2; sph_hamsi512_context hamsi1;
// sph_simd512_context simd1, simd2; sph_fugue512_context fugue1, fugue2;
hashState_sd simd1, simd2; sph_shabal512_context shabal1;
sph_hamsi512_context hamsi1; sph_whirlpool_context whirlpool1, whirlpool2, whirlpool3, whirlpool4;
sph_fugue512_context fugue1, fugue2; sph_sha512_context sha1, sha2;
sph_shabal512_context shabal1; sph_haval256_5_context haval1, haval2;
sph_whirlpool_context whirlpool1, whirlpool2, whirlpool3, whirlpool4;
sph_sha512_context sha1, sha2;
sph_haval256_5_context haval1, haval2;
#ifdef NO_AES_NI #ifdef NO_AES_NI
sph_groestl512_context groestl1, groestl2; sph_groestl512_context groestl1, groestl2;
sph_echo512_context echo1, echo2; sph_echo512_context echo1, echo2;
#else #else
hashState_echo echo1, echo2; hashState_echo echo1, echo2;
hashState_groestl groestl1, groestl2; hashState_groestl groestl1, groestl2;
#endif #endif
} hmq1725_ctx_holder; } hmq1725_ctx_holder;
@@ -90,19 +80,14 @@ void init_hmq1725_ctx()
sph_keccak512_init(&hmq1725_ctx.keccak1); sph_keccak512_init(&hmq1725_ctx.keccak1);
sph_keccak512_init(&hmq1725_ctx.keccak2); sph_keccak512_init(&hmq1725_ctx.keccak2);
// sph_luffa512_init(&hmq1725_ctx.luffa1);
// sph_luffa512_init(&hmq1725_ctx.luffa2);
init_luffa( &hmq1725_ctx.luffa1, 512 ); init_luffa( &hmq1725_ctx.luffa1, 512 );
init_luffa( &hmq1725_ctx.luffa2, 512 ); init_luffa( &hmq1725_ctx.luffa2, 512 );
// sph_cubehash512_init(&hmq1725_ctx.cubehash1);
cubehashInit( &hmq1725_ctx.cube, 512, 16, 32 ); cubehashInit( &hmq1725_ctx.cube, 512, 16, 32 );
sph_shavite512_init(&hmq1725_ctx.shavite1); sph_shavite512_init(&hmq1725_ctx.shavite1);
sph_shavite512_init(&hmq1725_ctx.shavite2); sph_shavite512_init(&hmq1725_ctx.shavite2);
// sph_simd512_init(&hmq1725_ctx.simd1);
// sph_simd512_init(&hmq1725_ctx.simd2);
init_sd( &hmq1725_ctx.simd1, 512 ); init_sd( &hmq1725_ctx.simd1, 512 );
init_sd( &hmq1725_ctx.simd2, 512 ); init_sd( &hmq1725_ctx.simd2, 512 );
@@ -135,46 +120,18 @@ void init_hmq1725_ctx()
init_groestl( &hmq1725_ctx.groestl1 ); init_groestl( &hmq1725_ctx.groestl1 );
init_groestl( &hmq1725_ctx.groestl2 ); init_groestl( &hmq1725_ctx.groestl2 );
#endif #endif
} }
extern void hmq1725hash(void *state, const void *input) extern void hmq1725hash(void *state, const void *input)
{ {
hmq1725_ctx_holder ctx;
memcpy(&ctx, &hmq1725_ctx, sizeof(hmq1725_ctx));
size_t hashptr;
// DATA_ALIGNXY(sph_u64 hashctA,8);
// DATA_ALIGNXY(sph_u64 hashctB,8);
// DATA_ALIGNXY(unsigned char hash[128],16);
unsigned char hashbuf[128];
sph_u64 hashctA;
sph_u64 hashctB;
const uint32_t mask = 24; const uint32_t mask = 24;
uint32_t hashA[25], hashB[25];
hmq1725_ctx_holder ctx;
//these uint512 in the c++ source of the client are backed by an array of uint32 memcpy(&ctx, &hmq1725_ctx, sizeof(hmq1725_ctx));
uint32_t hashA[25], hashB[25];
// unsigned char hash[128]; // uint32_t hashA[16], hashB[16];
// #define hashA hash
// #define hashB (hash+64)
sph_bmw512 (&ctx.bmw1, input, 80); //0 sph_bmw512 (&ctx.bmw1, input, 80); //0
sph_bmw512_close(&ctx.bmw1, hashA); //1 sph_bmw512_close(&ctx.bmw1, hashA); //1
/*
DECL_BMW;
BMW_I;
BMW_U;
#define M(x) sph_dec64le_aligned(data + 8 * (x))
#define H(x) (h[x])
#define dH(x) (dh[x])
BMW_C;
#undef M
#undef H
#undef dH
*/
sph_whirlpool (&ctx.whirlpool1, hashA, 64); //0 sph_whirlpool (&ctx.whirlpool1, hashA, 64); //0
sph_whirlpool_close(&ctx.whirlpool1, hashB); //1 sph_whirlpool_close(&ctx.whirlpool1, hashB); //1
@@ -182,8 +139,8 @@ extern void hmq1725hash(void *state, const void *input)
if ( hashB[0] & mask ) //1 if ( hashB[0] & mask ) //1
{ {
#ifdef NO_AES_NI #ifdef NO_AES_NI
sph_groestl512 (&ctx.groestl1, hashB, 64); //1 sph_groestl512 (&ctx.groestl1, hashB, 64); //1
sph_groestl512_close(&ctx.groestl1, hashA); //2 sph_groestl512_close(&ctx.groestl1, hashA); //2
#else #else
update_groestl( &ctx.groestl1, (char*)hashB, 512 ); update_groestl( &ctx.groestl1, (char*)hashB, 512 );
final_groestl( &ctx.groestl1, (char*)hashA ); final_groestl( &ctx.groestl1, (char*)hashA );
@@ -191,8 +148,8 @@ extern void hmq1725hash(void *state, const void *input)
} }
else else
{ {
sph_skein512 (&ctx.skein1, hashB, 64); //1 sph_skein512 (&ctx.skein1, hashB, 64); //1
sph_skein512_close(&ctx.skein1, hashA); //2 sph_skein512_close(&ctx.skein1, hashA); //2
} }
sph_jh512 (&ctx.jh1, hashA, 64); //3 sph_jh512 (&ctx.jh1, hashA, 64); //3
@@ -212,13 +169,9 @@ extern void hmq1725hash(void *state, const void *input)
sph_bmw512_close(&ctx.bmw2, hashB); //5 sph_bmw512_close(&ctx.bmw2, hashB); //5
} }
// sph_luffa512 (&ctx.luffa1, hashB, 64); //5
// sph_luffa512_close(&ctx.luffa1, hashA); //6
update_luffa( &ctx.luffa1, (BitSequence*)hashB, 512 ); update_luffa( &ctx.luffa1, (BitSequence*)hashB, 512 );
final_luffa( &ctx.luffa1, (BitSequence*)hashA ); final_luffa( &ctx.luffa1, (BitSequence*)hashA );
// sph_cubehash512 (&ctx.cubehash1, hashA, 64); //6
// sph_cubehash512_close(&ctx.cubehash1, hashB); //7
cubehashUpdate( &ctx.cube, (BitSequence *)hashA, 64 ); cubehashUpdate( &ctx.cube, (BitSequence *)hashA, 64 );
cubehashDigest( &ctx.cube, (BitSequence *)hashB ); cubehashDigest( &ctx.cube, (BitSequence *)hashB );
@@ -233,14 +186,11 @@ extern void hmq1725hash(void *state, const void *input)
sph_jh512_close(&ctx.jh2, hashA); //8 sph_jh512_close(&ctx.jh2, hashA); //8
} }
sph_shavite512 (&ctx.shavite1, hashA, 64); //3 sph_shavite512 (&ctx.shavite1, hashA, 64); //3
sph_shavite512_close(&ctx.shavite1, hashB); //4 sph_shavite512_close(&ctx.shavite1, hashB); //4
// sph_simd512 (&ctx.simd1, hashB, 64); //2 update_sd( &ctx.simd1, (BitSequence *)hashB, 512 );
// sph_simd512_close(&ctx.simd1, hashA); //3 final_sd( &ctx.simd1, (BitSequence *)hashA );
update_sd( &ctx.simd1, (BitSequence *)hashB, 512 );
final_sd( &ctx.simd1, (BitSequence *)hashA );
if ( hashA[0] & mask ) //4 if ( hashA[0] & mask ) //4
{ {
@@ -258,8 +208,8 @@ extern void hmq1725hash(void *state, const void *input)
sph_echo512 (&ctx.echo1, hashB, 64); //5 sph_echo512 (&ctx.echo1, hashB, 64); //5
sph_echo512_close(&ctx.echo1, hashA); //6 sph_echo512_close(&ctx.echo1, hashA); //6
#else #else
update_echo ( &ctx.echo1, (BitSequence *)hashB, 512 ); update_echo ( &ctx.echo1, (BitSequence *)hashB, 512 );
final_echo( &ctx.echo1, (BitSequence *)hashA ); final_echo( &ctx.echo1, (BitSequence *)hashA );
#endif #endif
sph_blake512 (&ctx.blake2, hashA, 64); //6 sph_blake512 (&ctx.blake2, hashA, 64); //6
@@ -272,8 +222,6 @@ extern void hmq1725hash(void *state, const void *input)
} }
else else
{ {
// sph_luffa512 (&ctx.luffa2, hashB, 64); //7
// sph_luffa512_close(&ctx.luffa2, hashA); //8
update_luffa( &ctx.luffa2, (BitSequence *)hashB, 512 ); update_luffa( &ctx.luffa2, (BitSequence *)hashB, 512 );
final_luffa( &ctx.luffa2, (BitSequence *)hashA ); final_luffa( &ctx.luffa2, (BitSequence *)hashA );
} }
@@ -287,8 +235,8 @@ extern void hmq1725hash(void *state, const void *input)
if ( hashA[0] & mask ) //4 if ( hashA[0] & mask ) //4
{ {
#ifdef NO_AES_NI #ifdef NO_AES_NI
sph_echo512 (&ctx.echo2, hashA, 64); // sph_echo512 (&ctx.echo2, hashA, 64); //
sph_echo512_close(&ctx.echo2, hashB); //5 sph_echo512_close(&ctx.echo2, hashB); //5
#else #else
update_echo ( &ctx.echo2, (BitSequence *)hashA, 512 ); update_echo ( &ctx.echo2, (BitSequence *)hashA, 512 );
final_echo( &ctx.echo2, (BitSequence *)hashB ); final_echo( &ctx.echo2, (BitSequence *)hashB );
@@ -296,8 +244,6 @@ extern void hmq1725hash(void *state, const void *input)
} }
else else
{ {
// sph_simd512 (&ctx.simd2, hashA, 64); //4
// sph_simd512_close(&ctx.simd2, hashB); //5
update_sd( &ctx.simd2, (BitSequence *)hashA, 512 ); update_sd( &ctx.simd2, (BitSequence *)hashA, 512 );
final_sd( &ctx.simd2, (BitSequence *)hashB ); final_sd( &ctx.simd2, (BitSequence *)hashB );
} }
@@ -323,8 +269,8 @@ extern void hmq1725hash(void *state, const void *input)
sph_groestl512 (&ctx.groestl2, hashA, 64); //3 sph_groestl512 (&ctx.groestl2, hashA, 64); //3
sph_groestl512_close(&ctx.groestl2, hashB); //4 sph_groestl512_close(&ctx.groestl2, hashB); //4
#else #else
update_groestl( &ctx.groestl2, (char*)hashA, 512 ); update_groestl( &ctx.groestl2, (char*)hashA, 512 );
final_groestl( &ctx.groestl2, (char*)hashB ); final_groestl( &ctx.groestl2, (char*)hashB );
#endif #endif
sph_sha512 (&ctx.sha2, hashB, 64); //2 sph_sha512 (&ctx.sha2, hashB, 64); //2

View File

@@ -7,6 +7,7 @@
#include <stdio.h> #include <stdio.h>
#include "algo/blake/sph_blake.h" #include "algo/blake/sph_blake.h"
#include "algo/groestl/sph_groestl.h"
#include "algo/skein/sph_skein.h" #include "algo/skein/sph_skein.h"
#include "algo/jh/sph_jh.h" #include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h" #include "algo/keccak/sph_keccak.h"
@@ -16,15 +17,14 @@
#include "algo/skein/sse2/skein.c" #include "algo/skein/sse2/skein.c"
#include "algo/jh/sse2/jh_sse2_opt64.h" #include "algo/jh/sse2/jh_sse2_opt64.h"
#ifdef NO_AES_NI #ifndef NO_AES_NI
#include "algo/groestl/sse2/grso.h"
#include "algo/groestl/sse2/grso-macro.c"
#else
#include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/groestl/aes_ni/hash-groestl.h"
#endif #endif
typedef struct { typedef struct {
#ifndef NO_AES_NI #ifdef NO_AES_NI
sph_groestl512_context groestl;
#else
hashState_groestl groestl; hashState_groestl groestl;
#endif #endif
} nist5_ctx_holder; } nist5_ctx_holder;
@@ -33,16 +33,15 @@ nist5_ctx_holder nist5_ctx;
void init_nist5_ctx() void init_nist5_ctx()
{ {
#ifndef NO_AES_NI #ifdef NO_AES_NI
sph_groestl512_init( &nist5_ctx.groestl );
#else
init_groestl( &nist5_ctx.groestl ); init_groestl( &nist5_ctx.groestl );
#endif #endif
} }
void nist5hash(void *output, const void *input) void nist5hash(void *output, const void *input)
{ {
#ifdef NO_AES_NI
grsoState sts_grs;
#endif
size_t hashptr; size_t hashptr;
unsigned char hashbuf[128]; unsigned char hashbuf[128];
sph_u64 hashctA; sph_u64 hashctA;
@@ -54,16 +53,14 @@ void nist5hash(void *output, const void *input)
nist5_ctx_holder ctx; nist5_ctx_holder ctx;
memcpy( &ctx, &nist5_ctx, sizeof(nist5_ctx) ); memcpy( &ctx, &nist5_ctx, sizeof(nist5_ctx) );
DECL_BLK; DECL_BLK;
BLK_I; BLK_I;
BLK_W; BLK_W;
BLK_C; BLK_C;
#ifdef NO_AES_NI #ifdef NO_AES_NI
GRS_I; sph_groestl512 (&ctx.groestl, hash, 64);
GRS_U; sph_groestl512_close(&ctx.groestl, hash);
GRS_C;
#else #else
update_groestl( &ctx.groestl, (char*)hash,512); update_groestl( &ctx.groestl, (char*)hash,512);
final_groestl( &ctx.groestl, (char*)hash); final_groestl( &ctx.groestl, (char*)hash);

View File

@@ -19,10 +19,7 @@
#include "algo/skein/sse2/skein.c" #include "algo/skein/sse2/skein.c"
#include "algo/jh/sse2/jh_sse2_opt64.h" #include "algo/jh/sse2/jh_sse2_opt64.h"
#ifdef NO_AES_NI #ifndef NO_AES_NI
#include "algo/groestl/sse2/grso.h"
#include "algo/groestl/sse2/grso-macro.c"
#else
#include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/groestl/aes_ni/hash-groestl.h"
#endif #endif
@@ -36,37 +33,36 @@
#define DATA_ALIGNXY(x,y) __declspec(align(y)) x #define DATA_ALIGNXY(x,y) __declspec(align(y)) x
#endif #endif
#ifndef NO_AES_NI #ifdef NO_AES_NI
hashState_groestl quark_groestl_ctx; sph_groestl512_context quark_ctx;
#else
hashState_groestl quark_ctx;
#endif #endif
void init_quark_ctx() void init_quark_ctx()
{ {
#ifndef NO_AES_NI #ifdef NO_AES_NI
init_groestl( &quark_groestl_ctx ); sph_groestl512_init( &quark_ctx );
#else
init_groestl( &quark_ctx );
#endif #endif
} }
inline static void quarkhash(void *state, const void *input) inline static void quarkhash(void *state, const void *input)
{ {
#ifdef NO_AES_NI
grsoState sts_grs;
#else
hashState_groestl ctx;
memcpy(&ctx, &quark_groestl_ctx, sizeof(quark_groestl_ctx));
#endif
/* shared temp space */
/* hash is really just 64bytes but it used to hold both hash and final round constants passed 64 */
unsigned char hashbuf[128]; unsigned char hashbuf[128];
size_t hashptr; size_t hashptr;
sph_u64 hashctA; sph_u64 hashctA;
sph_u64 hashctB; sph_u64 hashctB;
int i; int i;
unsigned char hash[128]; unsigned char hash[128];
#ifdef NO_AES_NI
sph_groestl512_context ctx;
#else
hashState_groestl ctx;
#endif
memcpy( &ctx, &quark_ctx, sizeof(ctx) );
// Blake // Blake
DECL_BLK; DECL_BLK;
@@ -117,13 +113,13 @@ inline static void quarkhash(void *state, const void *input)
{ {
#ifdef NO_AES_NI #ifdef NO_AES_NI
GRS_I; sph_groestl512_init( &ctx );
GRS_U; sph_groestl512 ( &ctx, hash, 64 );
GRS_C; sph_groestl512_close( &ctx, hash );
#else #else
reinit_groestl( &ctx ); reinit_groestl( &ctx );
update_groestl(&ctx, (char*)hash,512); update_groestl( &ctx, (char*)hash, 512 );
final_groestl(&ctx, (char*)hash); final_groestl( &ctx, (char*)hash );
#endif #endif
} while(0); continue; } while(0); continue;

View File

@@ -371,7 +371,6 @@ extern "C"{
#define DECL_SKN \ #define DECL_SKN \
sph_u64 sknh0, sknh1, sknh2, sknh3, sknh4, sknh5, sknh6, sknh7; \ sph_u64 sknh0, sknh1, sknh2, sknh3, sknh4, sknh5, sknh6, sknh7; \
unsigned char sknbuf[64]; \
#define sknREAD_STATE_BIG(sc) do { \ #define sknREAD_STATE_BIG(sc) do { \
sknh0 = (sc)->sknh0; \ sknh0 = (sc)->sknh0; \
@@ -424,7 +423,6 @@ do { \
do { \ do { \
unsigned char *buf; \ unsigned char *buf; \
size_t ptr; \ size_t ptr; \
unsigned first; \
size_t len = 64; \ size_t len = 64; \
const void *data = hash; \ const void *data = hash; \
buf = hashbuf; \ buf = hashbuf; \
@@ -441,7 +439,6 @@ do { \
unsigned char *buf; \ unsigned char *buf; \
size_t ptr; \ size_t ptr; \
unsigned et; \ unsigned et; \
int i; \
\ \
buf = hashbuf; \ buf = hashbuf; \
ptr = hashptr; \ ptr = hashptr; \

View File

@@ -18,10 +18,7 @@
#include "algo/simd/sph_simd.h" #include "algo/simd/sph_simd.h"
#include "algo/echo/sph_echo.h" #include "algo/echo/sph_echo.h"
#ifdef NO_AES_NI #ifndef NO_AES_NI
// #include "algo/echo/sph_echo.h"
// #include "algo/groestl/sph_groestl.h"
#else
#include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
#endif #endif

View File

@@ -17,10 +17,7 @@
#include "algo/simd/sph_simd.h" #include "algo/simd/sph_simd.h"
#include "algo/echo/sph_echo.h" #include "algo/echo/sph_echo.h"
#ifdef NO_AES_NI #ifndef NO_AES_NI
#include "algo/groestl/sse2/grso.h"
#include "algo/groestl/sse2/grso-macro.c"
#else
#include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
#endif #endif
@@ -40,7 +37,7 @@ typedef struct {
hashState_sd simd; hashState_sd simd;
sph_shavite512_context shavite; sph_shavite512_context shavite;
#ifdef NO_AES_NI #ifdef NO_AES_NI
// sph_groestl512_context groestl; sph_groestl512_context groestl;
sph_echo512_context echo; sph_echo512_context echo;
#else #else
hashState_echo echo; hashState_echo echo;
@@ -57,7 +54,7 @@ void init_x11_ctx()
sph_shavite512_init( &x11_ctx.shavite ); sph_shavite512_init( &x11_ctx.shavite );
init_sd( &x11_ctx.simd, 512 ); init_sd( &x11_ctx.simd, 512 );
#ifdef NO_AES_NI #ifdef NO_AES_NI
// sph_groestl512_init( &x11_ctx.groestl ); sph_groestl512_init( &x11_ctx.groestl );
sph_echo512_init( &x11_ctx.echo ); sph_echo512_init( &x11_ctx.echo );
#else #else
init_echo( &x11_ctx.echo, 512 ); init_echo( &x11_ctx.echo, 512 );
@@ -92,13 +89,8 @@ static void x11_hash( void *state, const void *input )
#undef dH #undef dH
#ifdef NO_AES_NI #ifdef NO_AES_NI
grsoState sts_grs; sph_groestl512 (&ctx.groestl, hash, 64);
GRS_I; sph_groestl512_close(&ctx.groestl, hash);
GRS_U;
GRS_C;
// sph_groestl512 (&ctx.groestl, hash, 64);
// sph_groestl512_close(&ctx.groestl, hash);
#else #else
update_groestl( &ctx.groestl, (char*)hash, 512 ); update_groestl( &ctx.groestl, (char*)hash, 512 );
final_groestl( &ctx.groestl, (char*)hash ); final_groestl( &ctx.groestl, (char*)hash );

View File

@@ -18,10 +18,7 @@
#include "algo/simd/sph_simd.h" #include "algo/simd/sph_simd.h"
#include "algo/echo/sph_echo.h" #include "algo/echo/sph_echo.h"
#ifdef NO_AES_NI #ifndef NO_AES_NI
// #include "algo/groestl/sse2/grso.h"
// #include "algo/groestl/sse2/grso-macro.c"
#else
#include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
#endif #endif

View File

@@ -6,6 +6,7 @@
#include <string.h> #include <string.h>
#include <stdio.h> #include <stdio.h>
#include "algo/groestl/sph_groestl.h"
#include "algo/gost/sph_gost.h" #include "algo/gost/sph_gost.h"
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/echo/sph_echo.h" #include "algo/echo/sph_echo.h"
@@ -19,10 +20,7 @@
#include "algo/skein/sse2/skein.c" #include "algo/skein/sse2/skein.c"
#include "algo/jh/sse2/jh_sse2_opt64.h" #include "algo/jh/sse2/jh_sse2_opt64.h"
#ifdef NO_AES_NI #ifndef NO_AES_NI
#include "algo/groestl/sse2/grso.h"
#include "algo/groestl/sse2/grso-macro.c"
#else
#include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
#endif #endif
@@ -34,6 +32,7 @@ typedef struct {
cubehashParam cube; cubehashParam cube;
hashState_sd simd; hashState_sd simd;
#ifdef NO_AES_NI #ifdef NO_AES_NI
sph_groestl512_context groestl;
sph_echo512_context echo; sph_echo512_context echo;
#else #else
hashState_echo echo; hashState_echo echo;
@@ -51,6 +50,7 @@ void init_sib_ctx()
cubehashInit( &sib_ctx.cube, 512, 16, 32 ); cubehashInit( &sib_ctx.cube, 512, 16, 32 );
init_sd( &sib_ctx.simd, 512 ); init_sd( &sib_ctx.simd, 512 );
#ifdef NO_AES_NI #ifdef NO_AES_NI
sph_groestl512_init( &sib_ctx.groestl );
sph_echo512_init( &sib_ctx.echo ); sph_echo512_init( &sib_ctx.echo );
#else #else
init_echo( &sib_ctx.echo, 512 ); init_echo( &sib_ctx.echo, 512 );
@@ -59,17 +59,12 @@ void init_sib_ctx()
} }
void sibhash(void *output, const void *input) void sibhash(void *output, const void *input)
{ {
unsigned char hash[128]; // uint32_t hashA[16], hashB[16]; unsigned char hash[128]; // uint32_t hashA[16], hashB[16];
#define hashA hash #define hashA hash
#define hashB hash+64 #define hashB hash+64
#ifdef NO_AES_NI
grsoState sts_grs;
#endif
size_t hashptr; size_t hashptr;
unsigned char hashbuf[128]; unsigned char hashbuf[128];
sph_u64 hashctA; sph_u64 hashctA;
@@ -95,12 +90,11 @@ void sibhash(void *output, const void *input)
#undef dH #undef dH
#ifdef NO_AES_NI #ifdef NO_AES_NI
GRS_I; sph_groestl512 (&ctx.groestl, hash, 64);
GRS_U; sph_groestl512_close(&ctx.groestl, hash);
GRS_C;
#else #else
update_groestl( &ctx.groestl, (char*)hash,512); update_groestl( &ctx.groestl, (char*)hash,512);
final_groestl( &ctx.groestl, (char*)hash); final_groestl( &ctx.groestl, (char*)hash);
#endif #endif
DECL_SKN; DECL_SKN;

View File

@@ -29,10 +29,7 @@
#include "algo/skein/sse2/skein.c" #include "algo/skein/sse2/skein.c"
#include "algo/jh/sse2/jh_sse2_opt64.h" #include "algo/jh/sse2/jh_sse2_opt64.h"
#ifdef NO_AES_NI #ifndef NO_AES_NI
#include "algo/groestl/sse2/grso.h"
#include "algo/groestl/sse2/grso-macro.c"
#else
#include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
#endif #endif
@@ -79,9 +76,6 @@ static void x13hash(void *output, const void *input)
x13_ctx_holder ctx; x13_ctx_holder ctx;
memcpy( &ctx, &x13_ctx, sizeof(x13_ctx) ); memcpy( &ctx, &x13_ctx, sizeof(x13_ctx) );
#ifdef NO_AES_NI
grsoState sts_grs;
#endif
// X11 algos // X11 algos
@@ -116,12 +110,8 @@ static void x13hash(void *output, const void *input)
//---groetl---- //---groetl----
#ifdef NO_AES_NI #ifdef NO_AES_NI
// use GRS if possible sph_groestl512 (&ctx.groestl, hash, 64);
GRS_I; sph_groestl512_close(&ctx.groestl, hash);
GRS_U;
GRS_C;
// sph_groestl512 (&ctx.groestl, hash, 64);
// sph_groestl512_close(&ctx.groestl, hash);
#else #else
update_groestl( &ctx.groestl, (char*)hash,512); update_groestl( &ctx.groestl, (char*)hash,512);
final_groestl( &ctx.groestl, (char*)hash); final_groestl( &ctx.groestl, (char*)hash);

View File

@@ -31,10 +31,7 @@
#include "algo/skein/sse2/skein.c" #include "algo/skein/sse2/skein.c"
#include "algo/jh/sse2/jh_sse2_opt64.h" #include "algo/jh/sse2/jh_sse2_opt64.h"
#ifdef NO_AES_NI #ifndef NO_AES_NI
#include "algo/groestl/sse2/grso.h"
#include "algo/groestl/sse2/grso-macro.c"
#else
#include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
#endif #endif
@@ -84,10 +81,6 @@ static void x14hash(void *output, const void *input)
x14_ctx_holder ctx; x14_ctx_holder ctx;
memcpy(&ctx, &x14_ctx, sizeof(x14_ctx)); memcpy(&ctx, &x14_ctx, sizeof(x14_ctx));
#ifdef NO_AES_NI
grsoState sts_grs;
#endif
unsigned char hashbuf[128]; unsigned char hashbuf[128];
size_t hashptr; size_t hashptr;
sph_u64 hashctA; sph_u64 hashctA;
@@ -119,12 +112,8 @@ static void x14hash(void *output, const void *input)
//---groestl---- //---groestl----
#ifdef NO_AES_NI #ifdef NO_AES_NI
// use SSE2 optimized GRS if possible sph_groestl512 (&ctx.groestl, hash, 64);
GRS_I; sph_groestl512_close(&ctx.groestl, hash);
GRS_U;
GRS_C;
// sph_groestl512 (&ctx.groestl, hash, 64);
// sph_groestl512_close(&ctx.groestl, hash);
#else #else
update_groestl( &ctx.groestl, (char*)hash,512); update_groestl( &ctx.groestl, (char*)hash,512);
final_groestl( &ctx.groestl, (char*)hash); final_groestl( &ctx.groestl, (char*)hash);

View File

@@ -31,10 +31,7 @@
#include "algo/skein/sse2/skein.c" #include "algo/skein/sse2/skein.c"
#include "algo/jh/sse2/jh_sse2_opt64.h" #include "algo/jh/sse2/jh_sse2_opt64.h"
#ifdef NO_AES_NI #ifndef NO_AES_NI
#include "algo/groestl/sse2/grso.h"
#include "algo/groestl/sse2/grso-macro.c"
#else
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/groestl/aes_ni/hash-groestl.h"
#endif #endif
@@ -86,10 +83,6 @@ static void x15hash(void *output, const void *input)
x15_ctx_holder ctx; x15_ctx_holder ctx;
memcpy( &ctx, &x15_ctx, sizeof(x15_ctx) ); memcpy( &ctx, &x15_ctx, sizeof(x15_ctx) );
#ifdef NO_AES_NI
grsoState sts_grs;
#endif
unsigned char hashbuf[128]; unsigned char hashbuf[128];
size_t hashptr; size_t hashptr;
sph_u64 hashctA; sph_u64 hashctA;
@@ -120,14 +113,11 @@ static void x15hash(void *output, const void *input)
//---groestl---- //---groestl----
#ifdef NO_AES_NI #ifdef NO_AES_NI
GRS_I; sph_groestl512(&ctx.groestl, hash, 64);
GRS_U; sph_groestl512_close(&ctx.groestl, hash);
GRS_C;
// sph_groestl512(&ctx.groestl, hash, 64);
// sph_groestl512_close(&ctx.groestl, hash);
#else #else
update_groestl( &ctx.groestl, (char*)hash,512); update_groestl( &ctx.groestl, (char*)hash,512);
final_groestl( &ctx.groestl, (char*)hash); final_groestl( &ctx.groestl, (char*)hash);
#endif #endif
//---skein4--- //---skein4---

View File

@@ -33,10 +33,7 @@
#include "algo/skein/sse2/skein.c" #include "algo/skein/sse2/skein.c"
#include "algo/jh/sse2/jh_sse2_opt64.h" #include "algo/jh/sse2/jh_sse2_opt64.h"
#ifdef NO_AES_NI #ifndef NO_AES_NI
#include "algo/groestl/sse2/grso.h"
#include "algo/groestl/sse2/grso-macro.c"
#else
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/groestl/aes_ni/hash-groestl.h"
#endif #endif
@@ -92,10 +89,6 @@ static void x17hash(void *output, const void *input)
x17_ctx_holder ctx; x17_ctx_holder ctx;
memcpy( &ctx, &x17_ctx, sizeof(x17_ctx) ); memcpy( &ctx, &x17_ctx, sizeof(x17_ctx) );
#ifdef NO_AES_NI
grsoState sts_grs;
#endif
unsigned char hashbuf[128]; unsigned char hashbuf[128];
size_t hashptr; size_t hashptr;
sph_u64 hashctA; sph_u64 hashctA;
@@ -126,14 +119,11 @@ static void x17hash(void *output, const void *input)
//---groestl---- //---groestl----
#ifdef NO_AES_NI #ifdef NO_AES_NI
// GRS_I;
// GRS_U;
// GRS_C;
sph_groestl512(&ctx.groestl, hash, 64); sph_groestl512(&ctx.groestl, hash, 64);
sph_groestl512_close(&ctx.groestl, hash); sph_groestl512_close(&ctx.groestl, hash);
#else #else
update_groestl( &ctx.groestl, (char*)hash,512); update_groestl( &ctx.groestl, (char*)hash,512);
final_groestl( &ctx.groestl, (char*)hash); final_groestl( &ctx.groestl, (char*)hash);
#endif #endif
//---skein4--- //---skein4---

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,821 +0,0 @@
/*
* Copyright 2011-2012 pooler@litecoinpool.org
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "cpuminer-config.h"
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
#if defined(__i386__)
.macro scrypt_shuffle src, so, dest, do
movl \so+60(\src), %eax
movl \so+44(\src), %ebx
movl \so+28(\src), %ecx
movl \so+12(\src), %edx
movl %eax, \do+12(\dest)
movl %ebx, \do+28(\dest)
movl %ecx, \do+44(\dest)
movl %edx, \do+60(\dest)
movl \so+40(\src), %eax
movl \so+8(\src), %ebx
movl \so+48(\src), %ecx
movl \so+16(\src), %edx
movl %eax, \do+8(\dest)
movl %ebx, \do+40(\dest)
movl %ecx, \do+16(\dest)
movl %edx, \do+48(\dest)
movl \so+20(\src), %eax
movl \so+4(\src), %ebx
movl \so+52(\src), %ecx
movl \so+36(\src), %edx
movl %eax, \do+4(\dest)
movl %ebx, \do+20(\dest)
movl %ecx, \do+36(\dest)
movl %edx, \do+52(\dest)
movl \so+0(\src), %eax
movl \so+24(\src), %ebx
movl \so+32(\src), %ecx
movl \so+56(\src), %edx
movl %eax, \do+0(\dest)
movl %ebx, \do+24(\dest)
movl %ecx, \do+32(\dest)
movl %edx, \do+56(\dest)
.endm
.macro salsa8_core_gen_quadround
movl 52(%esp), %ecx
movl 4(%esp), %edx
movl 20(%esp), %ebx
movl 8(%esp), %esi
leal (%ecx, %edx), %edi
roll $7, %edi
xorl %edi, %ebx
movl %ebx, 4(%esp)
movl 36(%esp), %edi
leal (%edx, %ebx), %ebp
roll $9, %ebp
xorl %ebp, %edi
movl 24(%esp), %ebp
movl %edi, 8(%esp)
addl %edi, %ebx
roll $13, %ebx
xorl %ebx, %ecx
movl 40(%esp), %ebx
movl %ecx, 20(%esp)
addl %edi, %ecx
roll $18, %ecx
leal (%esi, %ebp), %edi
roll $7, %edi
xorl %edi, %ebx
movl %ebx, 24(%esp)
movl 56(%esp), %edi
xorl %ecx, %edx
leal (%ebp, %ebx), %ecx
roll $9, %ecx
xorl %ecx, %edi
movl %edi, 36(%esp)
movl 28(%esp), %ecx
movl %edx, 28(%esp)
movl 44(%esp), %edx
addl %edi, %ebx
roll $13, %ebx
xorl %ebx, %esi
movl 60(%esp), %ebx
movl %esi, 40(%esp)
addl %edi, %esi
roll $18, %esi
leal (%ecx, %edx), %edi
roll $7, %edi
xorl %edi, %ebx
movl %ebx, 44(%esp)
movl 12(%esp), %edi
xorl %esi, %ebp
leal (%edx, %ebx), %esi
roll $9, %esi
xorl %esi, %edi
movl %edi, 12(%esp)
movl 48(%esp), %esi
movl %ebp, 48(%esp)
movl 64(%esp), %ebp
addl %edi, %ebx
roll $13, %ebx
xorl %ebx, %ecx
movl 16(%esp), %ebx
movl %ecx, 16(%esp)
addl %edi, %ecx
roll $18, %ecx
leal (%esi, %ebp), %edi
roll $7, %edi
xorl %edi, %ebx
movl 32(%esp), %edi
xorl %ecx, %edx
leal (%ebp, %ebx), %ecx
roll $9, %ecx
xorl %ecx, %edi
movl %edi, 32(%esp)
movl %ebx, %ecx
movl %edx, 52(%esp)
movl 28(%esp), %edx
addl %edi, %ebx
roll $13, %ebx
xorl %ebx, %esi
movl 40(%esp), %ebx
movl %esi, 28(%esp)
addl %edi, %esi
roll $18, %esi
leal (%ecx, %edx), %edi
roll $7, %edi
xorl %edi, %ebx
movl %ebx, 40(%esp)
movl 12(%esp), %edi
xorl %esi, %ebp
leal (%edx, %ebx), %esi
roll $9, %esi
xorl %esi, %edi
movl %edi, 12(%esp)
movl 4(%esp), %esi
movl %ebp, 4(%esp)
movl 48(%esp), %ebp
addl %edi, %ebx
roll $13, %ebx
xorl %ebx, %ecx
movl 16(%esp), %ebx
movl %ecx, 16(%esp)
addl %edi, %ecx
roll $18, %ecx
leal (%esi, %ebp), %edi
roll $7, %edi
xorl %edi, %ebx
movl %ebx, 48(%esp)
movl 32(%esp), %edi
xorl %ecx, %edx
leal (%ebp, %ebx), %ecx
roll $9, %ecx
xorl %ecx, %edi
movl %edi, 32(%esp)
movl 24(%esp), %ecx
movl %edx, 24(%esp)
movl 52(%esp), %edx
addl %edi, %ebx
roll $13, %ebx
xorl %ebx, %esi
movl 28(%esp), %ebx
movl %esi, 28(%esp)
addl %edi, %esi
roll $18, %esi
leal (%ecx, %edx), %edi
roll $7, %edi
xorl %edi, %ebx
movl %ebx, 52(%esp)
movl 8(%esp), %edi
xorl %esi, %ebp
leal (%edx, %ebx), %esi
roll $9, %esi
xorl %esi, %edi
movl %edi, 8(%esp)
movl 44(%esp), %esi
movl %ebp, 44(%esp)
movl 4(%esp), %ebp
addl %edi, %ebx
roll $13, %ebx
xorl %ebx, %ecx
movl 20(%esp), %ebx
movl %ecx, 4(%esp)
addl %edi, %ecx
roll $18, %ecx
leal (%esi, %ebp), %edi
roll $7, %edi
xorl %edi, %ebx
movl 36(%esp), %edi
xorl %ecx, %edx
leal (%ebp, %ebx), %ecx
roll $9, %ecx
xorl %ecx, %edi
movl %edi, 20(%esp)
movl %ebx, %ecx
movl %edx, 36(%esp)
movl 24(%esp), %edx
addl %edi, %ebx
roll $13, %ebx
xorl %ebx, %esi
movl 28(%esp), %ebx
movl %esi, 24(%esp)
addl %edi, %esi
roll $18, %esi
leal (%ecx, %edx), %edi
roll $7, %edi
xorl %edi, %ebx
movl %ebx, 28(%esp)
xorl %esi, %ebp
movl 8(%esp), %esi
leal (%edx, %ebx), %edi
roll $9, %edi
xorl %edi, %esi
movl 40(%esp), %edi
movl %ebp, 8(%esp)
movl 44(%esp), %ebp
movl %esi, 40(%esp)
addl %esi, %ebx
roll $13, %ebx
xorl %ebx, %ecx
movl 4(%esp), %ebx
movl %ecx, 44(%esp)
addl %esi, %ecx
roll $18, %ecx
leal (%edi, %ebp), %esi
roll $7, %esi
xorl %esi, %ebx
movl %ebx, 4(%esp)
movl 20(%esp), %esi
xorl %ecx, %edx
leal (%ebp, %ebx), %ecx
roll $9, %ecx
xorl %ecx, %esi
movl %esi, 56(%esp)
movl 48(%esp), %ecx
movl %edx, 20(%esp)
movl 36(%esp), %edx
addl %esi, %ebx
roll $13, %ebx
xorl %ebx, %edi
movl 24(%esp), %ebx
movl %edi, 24(%esp)
addl %esi, %edi
roll $18, %edi
leal (%ecx, %edx), %esi
roll $7, %esi
xorl %esi, %ebx
movl %ebx, 60(%esp)
movl 12(%esp), %esi
xorl %edi, %ebp
leal (%edx, %ebx), %edi
roll $9, %edi
xorl %edi, %esi
movl %esi, 12(%esp)
movl 52(%esp), %edi
movl %ebp, 36(%esp)
movl 8(%esp), %ebp
addl %esi, %ebx
roll $13, %ebx
xorl %ebx, %ecx
movl 16(%esp), %ebx
movl %ecx, 16(%esp)
addl %esi, %ecx
roll $18, %ecx
leal (%edi, %ebp), %esi
roll $7, %esi
xorl %esi, %ebx
movl 32(%esp), %esi
xorl %ecx, %edx
leal (%ebp, %ebx), %ecx
roll $9, %ecx
xorl %ecx, %esi
movl %esi, 32(%esp)
movl %ebx, %ecx
movl %edx, 48(%esp)
movl 20(%esp), %edx
addl %esi, %ebx
roll $13, %ebx
xorl %ebx, %edi
movl 24(%esp), %ebx
movl %edi, 20(%esp)
addl %esi, %edi
roll $18, %edi
leal (%ecx, %edx), %esi
roll $7, %esi
xorl %esi, %ebx
movl %ebx, 8(%esp)
movl 12(%esp), %esi
xorl %edi, %ebp
leal (%edx, %ebx), %edi
roll $9, %edi
xorl %edi, %esi
movl %esi, 12(%esp)
movl 28(%esp), %edi
movl %ebp, 52(%esp)
movl 36(%esp), %ebp
addl %esi, %ebx
roll $13, %ebx
xorl %ebx, %ecx
movl 16(%esp), %ebx
movl %ecx, 16(%esp)
addl %esi, %ecx
roll $18, %ecx
leal (%edi, %ebp), %esi
roll $7, %esi
xorl %esi, %ebx
movl %ebx, 28(%esp)
movl 32(%esp), %esi
xorl %ecx, %edx
leal (%ebp, %ebx), %ecx
roll $9, %ecx
xorl %ecx, %esi
movl %esi, 32(%esp)
movl 4(%esp), %ecx
movl %edx, 4(%esp)
movl 48(%esp), %edx
addl %esi, %ebx
roll $13, %ebx
xorl %ebx, %edi
movl 20(%esp), %ebx
movl %edi, 20(%esp)
addl %esi, %edi
roll $18, %edi
leal (%ecx, %edx), %esi
roll $7, %esi
xorl %esi, %ebx
movl %ebx, 48(%esp)
movl 40(%esp), %esi
xorl %edi, %ebp
leal (%edx, %ebx), %edi
roll $9, %edi
xorl %edi, %esi
movl %esi, 36(%esp)
movl 60(%esp), %edi
movl %ebp, 24(%esp)
movl 52(%esp), %ebp
addl %esi, %ebx
roll $13, %ebx
xorl %ebx, %ecx
movl 44(%esp), %ebx
movl %ecx, 40(%esp)
addl %esi, %ecx
roll $18, %ecx
leal (%edi, %ebp), %esi
roll $7, %esi
xorl %esi, %ebx
movl %ebx, 52(%esp)
movl 56(%esp), %esi
xorl %ecx, %edx
leal (%ebp, %ebx), %ecx
roll $9, %ecx
xorl %ecx, %esi
movl %esi, 56(%esp)
addl %esi, %ebx
movl %edx, 44(%esp)
roll $13, %ebx
xorl %ebx, %edi
movl %edi, 60(%esp)
addl %esi, %edi
roll $18, %edi
xorl %edi, %ebp
movl %ebp, 64(%esp)
.endm
.text
.p2align 5
salsa8_core_gen:
salsa8_core_gen_quadround
salsa8_core_gen_quadround
ret
.text
.p2align 5
.globl scrypt_core
.globl _scrypt_core
scrypt_core:
_scrypt_core:
pushl %ebx
pushl %ebp
pushl %edi
pushl %esi
/* Check for SSE2 availability */
movl $1, %eax
cpuid
andl $0x04000000, %edx
jnz scrypt_core_sse2
scrypt_core_gen:
movl 20(%esp), %edi
movl 24(%esp), %esi
subl $72, %esp
.macro scrypt_core_macro1a p, q
movl \p(%edi), %eax
movl \q(%edi), %edx
movl %eax, \p(%esi)
movl %edx, \q(%esi)
xorl %edx, %eax
movl %eax, \p(%edi)
movl %eax, \p(%esp)
.endm
.macro scrypt_core_macro1b p, q
movl \p(%edi), %eax
xorl \p(%esi, %edx), %eax
movl \q(%edi), %ebx
xorl \q(%esi, %edx), %ebx
movl %ebx, \q(%edi)
xorl %ebx, %eax
movl %eax, \p(%edi)
movl %eax, \p(%esp)
.endm
.macro scrypt_core_macro2 p, q
movl \p(%esp), %eax
addl \p(%edi), %eax
movl %eax, \p(%edi)
xorl \q(%edi), %eax
movl %eax, \q(%edi)
movl %eax, \p(%esp)
.endm
.macro scrypt_core_macro3 p, q
movl \p(%esp), %eax
addl \q(%edi), %eax
movl %eax, \q(%edi)
.endm
leal 131072(%esi), %ecx
scrypt_core_gen_loop1:
movl %esi, 64(%esp)
movl %ecx, 68(%esp)
scrypt_core_macro1a 0, 64
scrypt_core_macro1a 4, 68
scrypt_core_macro1a 8, 72
scrypt_core_macro1a 12, 76
scrypt_core_macro1a 16, 80
scrypt_core_macro1a 20, 84
scrypt_core_macro1a 24, 88
scrypt_core_macro1a 28, 92
scrypt_core_macro1a 32, 96
scrypt_core_macro1a 36, 100
scrypt_core_macro1a 40, 104
scrypt_core_macro1a 44, 108
scrypt_core_macro1a 48, 112
scrypt_core_macro1a 52, 116
scrypt_core_macro1a 56, 120
scrypt_core_macro1a 60, 124
call salsa8_core_gen
movl 92(%esp), %edi
scrypt_core_macro2 0, 64
scrypt_core_macro2 4, 68
scrypt_core_macro2 8, 72
scrypt_core_macro2 12, 76
scrypt_core_macro2 16, 80
scrypt_core_macro2 20, 84
scrypt_core_macro2 24, 88
scrypt_core_macro2 28, 92
scrypt_core_macro2 32, 96
scrypt_core_macro2 36, 100
scrypt_core_macro2 40, 104
scrypt_core_macro2 44, 108
scrypt_core_macro2 48, 112
scrypt_core_macro2 52, 116
scrypt_core_macro2 56, 120
scrypt_core_macro2 60, 124
call salsa8_core_gen
movl 92(%esp), %edi
scrypt_core_macro3 0, 64
scrypt_core_macro3 4, 68
scrypt_core_macro3 8, 72
scrypt_core_macro3 12, 76
scrypt_core_macro3 16, 80
scrypt_core_macro3 20, 84
scrypt_core_macro3 24, 88
scrypt_core_macro3 28, 92
scrypt_core_macro3 32, 96
scrypt_core_macro3 36, 100
scrypt_core_macro3 40, 104
scrypt_core_macro3 44, 108
scrypt_core_macro3 48, 112
scrypt_core_macro3 52, 116
scrypt_core_macro3 56, 120
scrypt_core_macro3 60, 124
movl 64(%esp), %esi
movl 68(%esp), %ecx
addl $128, %esi
cmpl %ecx, %esi
jne scrypt_core_gen_loop1
movl 96(%esp), %esi
movl $1024, %ecx
scrypt_core_gen_loop2:
movl %ecx, 68(%esp)
movl 64(%edi), %edx
andl $1023, %edx
shll $7, %edx
scrypt_core_macro1b 0, 64
scrypt_core_macro1b 4, 68
scrypt_core_macro1b 8, 72
scrypt_core_macro1b 12, 76
scrypt_core_macro1b 16, 80
scrypt_core_macro1b 20, 84
scrypt_core_macro1b 24, 88
scrypt_core_macro1b 28, 92
scrypt_core_macro1b 32, 96
scrypt_core_macro1b 36, 100
scrypt_core_macro1b 40, 104
scrypt_core_macro1b 44, 108
scrypt_core_macro1b 48, 112
scrypt_core_macro1b 52, 116
scrypt_core_macro1b 56, 120
scrypt_core_macro1b 60, 124
call salsa8_core_gen
movl 92(%esp), %edi
scrypt_core_macro2 0, 64
scrypt_core_macro2 4, 68
scrypt_core_macro2 8, 72
scrypt_core_macro2 12, 76
scrypt_core_macro2 16, 80
scrypt_core_macro2 20, 84
scrypt_core_macro2 24, 88
scrypt_core_macro2 28, 92
scrypt_core_macro2 32, 96
scrypt_core_macro2 36, 100
scrypt_core_macro2 40, 104
scrypt_core_macro2 44, 108
scrypt_core_macro2 48, 112
scrypt_core_macro2 52, 116
scrypt_core_macro2 56, 120
scrypt_core_macro2 60, 124
call salsa8_core_gen
movl 92(%esp), %edi
movl 96(%esp), %esi
scrypt_core_macro3 0, 64
scrypt_core_macro3 4, 68
scrypt_core_macro3 8, 72
scrypt_core_macro3 12, 76
scrypt_core_macro3 16, 80
scrypt_core_macro3 20, 84
scrypt_core_macro3 24, 88
scrypt_core_macro3 28, 92
scrypt_core_macro3 32, 96
scrypt_core_macro3 36, 100
scrypt_core_macro3 40, 104
scrypt_core_macro3 44, 108
scrypt_core_macro3 48, 112
scrypt_core_macro3 52, 116
scrypt_core_macro3 56, 120
scrypt_core_macro3 60, 124
movl 68(%esp), %ecx
subl $1, %ecx
ja scrypt_core_gen_loop2
addl $72, %esp
popl %esi
popl %edi
popl %ebp
popl %ebx
ret
.macro salsa8_core_sse2_doubleround
movdqa %xmm1, %xmm4
paddd %xmm0, %xmm4
movdqa %xmm4, %xmm5
pslld $7, %xmm4
psrld $25, %xmm5
pxor %xmm4, %xmm3
movdqa %xmm0, %xmm4
pxor %xmm5, %xmm3
paddd %xmm3, %xmm4
movdqa %xmm4, %xmm5
pslld $9, %xmm4
psrld $23, %xmm5
pxor %xmm4, %xmm2
movdqa %xmm3, %xmm4
pxor %xmm5, %xmm2
pshufd $0x93, %xmm3, %xmm3
paddd %xmm2, %xmm4
movdqa %xmm4, %xmm5
pslld $13, %xmm4
psrld $19, %xmm5
pxor %xmm4, %xmm1
movdqa %xmm2, %xmm4
pxor %xmm5, %xmm1
pshufd $0x4e, %xmm2, %xmm2
paddd %xmm1, %xmm4
movdqa %xmm4, %xmm5
pslld $18, %xmm4
psrld $14, %xmm5
pxor %xmm4, %xmm0
movdqa %xmm3, %xmm4
pxor %xmm5, %xmm0
pshufd $0x39, %xmm1, %xmm1
paddd %xmm0, %xmm4
movdqa %xmm4, %xmm5
pslld $7, %xmm4
psrld $25, %xmm5
pxor %xmm4, %xmm1
movdqa %xmm0, %xmm4
pxor %xmm5, %xmm1
paddd %xmm1, %xmm4
movdqa %xmm4, %xmm5
pslld $9, %xmm4
psrld $23, %xmm5
pxor %xmm4, %xmm2
movdqa %xmm1, %xmm4
pxor %xmm5, %xmm2
pshufd $0x93, %xmm1, %xmm1
paddd %xmm2, %xmm4
movdqa %xmm4, %xmm5
pslld $13, %xmm4
psrld $19, %xmm5
pxor %xmm4, %xmm3
movdqa %xmm2, %xmm4
pxor %xmm5, %xmm3
pshufd $0x4e, %xmm2, %xmm2
paddd %xmm3, %xmm4
movdqa %xmm4, %xmm5
pslld $18, %xmm4
psrld $14, %xmm5
pxor %xmm4, %xmm0
pshufd $0x39, %xmm3, %xmm3
pxor %xmm5, %xmm0
.endm
.macro salsa8_core_sse2
salsa8_core_sse2_doubleround
salsa8_core_sse2_doubleround
salsa8_core_sse2_doubleround
salsa8_core_sse2_doubleround
.endm
.p2align 5
scrypt_core_sse2:
movl 20(%esp), %edi
movl 24(%esp), %esi
movl %esp, %ebp
subl $128, %esp
andl $-16, %esp
scrypt_shuffle %edi, 0, %esp, 0
scrypt_shuffle %edi, 64, %esp, 64
movdqa 96(%esp), %xmm6
movdqa 112(%esp), %xmm7
movl %esi, %edx
leal 131072(%esi), %ecx
scrypt_core_sse2_loop1:
movdqa 0(%esp), %xmm0
movdqa 16(%esp), %xmm1
movdqa 32(%esp), %xmm2
movdqa 48(%esp), %xmm3
movdqa 64(%esp), %xmm4
movdqa 80(%esp), %xmm5
pxor %xmm4, %xmm0
pxor %xmm5, %xmm1
movdqa %xmm0, 0(%edx)
movdqa %xmm1, 16(%edx)
pxor %xmm6, %xmm2
pxor %xmm7, %xmm3
movdqa %xmm2, 32(%edx)
movdqa %xmm3, 48(%edx)
movdqa %xmm4, 64(%edx)
movdqa %xmm5, 80(%edx)
movdqa %xmm6, 96(%edx)
movdqa %xmm7, 112(%edx)
salsa8_core_sse2
paddd 0(%edx), %xmm0
paddd 16(%edx), %xmm1
paddd 32(%edx), %xmm2
paddd 48(%edx), %xmm3
movdqa %xmm0, 0(%esp)
movdqa %xmm1, 16(%esp)
movdqa %xmm2, 32(%esp)
movdqa %xmm3, 48(%esp)
pxor 64(%esp), %xmm0
pxor 80(%esp), %xmm1
pxor %xmm6, %xmm2
pxor %xmm7, %xmm3
movdqa %xmm0, 64(%esp)
movdqa %xmm1, 80(%esp)
movdqa %xmm2, %xmm6
movdqa %xmm3, %xmm7
salsa8_core_sse2
paddd 64(%esp), %xmm0
paddd 80(%esp), %xmm1
paddd %xmm2, %xmm6
paddd %xmm3, %xmm7
movdqa %xmm0, 64(%esp)
movdqa %xmm1, 80(%esp)
addl $128, %edx
cmpl %ecx, %edx
jne scrypt_core_sse2_loop1
movdqa 64(%esp), %xmm4
movdqa 80(%esp), %xmm5
movl $1024, %ecx
scrypt_core_sse2_loop2:
movd %xmm4, %edx
movdqa 0(%esp), %xmm0
movdqa 16(%esp), %xmm1
movdqa 32(%esp), %xmm2
movdqa 48(%esp), %xmm3
andl $1023, %edx
shll $7, %edx
pxor 0(%esi, %edx), %xmm0
pxor 16(%esi, %edx), %xmm1
pxor 32(%esi, %edx), %xmm2
pxor 48(%esi, %edx), %xmm3
pxor %xmm4, %xmm0
pxor %xmm5, %xmm1
movdqa %xmm0, 0(%esp)
movdqa %xmm1, 16(%esp)
pxor %xmm6, %xmm2
pxor %xmm7, %xmm3
movdqa %xmm2, 32(%esp)
movdqa %xmm3, 48(%esp)
salsa8_core_sse2
paddd 0(%esp), %xmm0
paddd 16(%esp), %xmm1
paddd 32(%esp), %xmm2
paddd 48(%esp), %xmm3
movdqa %xmm0, 0(%esp)
movdqa %xmm1, 16(%esp)
movdqa %xmm2, 32(%esp)
movdqa %xmm3, 48(%esp)
pxor 64(%esi, %edx), %xmm0
pxor 80(%esi, %edx), %xmm1
pxor 96(%esi, %edx), %xmm2
pxor 112(%esi, %edx), %xmm3
pxor 64(%esp), %xmm0
pxor 80(%esp), %xmm1
pxor %xmm6, %xmm2
pxor %xmm7, %xmm3
movdqa %xmm0, 64(%esp)
movdqa %xmm1, 80(%esp)
movdqa %xmm2, %xmm6
movdqa %xmm3, %xmm7
salsa8_core_sse2
paddd 64(%esp), %xmm0
paddd 80(%esp), %xmm1
paddd %xmm2, %xmm6
paddd %xmm3, %xmm7
movdqa %xmm0, %xmm4
movdqa %xmm1, %xmm5
movdqa %xmm0, 64(%esp)
movdqa %xmm1, 80(%esp)
subl $1, %ecx
ja scrypt_core_sse2_loop2
movdqa %xmm6, 96(%esp)
movdqa %xmm7, 112(%esp)
scrypt_shuffle %esp, 0, %edi, 0
scrypt_shuffle %esp, 64, %edi, 64
movl %ebp, %esp
popl %esi
popl %edi
popl %ebp
popl %ebx
ret
#endif

View File

@@ -1,767 +0,0 @@
/*
* Copyright 2009 Colin Percival, 2011 ArtForz, 2011-2013 pooler
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* This file was originally written by Colin Percival as part of the Tarsnap
* online backup system.
*/
#include "../cpuminer-config.h"
#include "../miner.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
static const uint32_t keypad[12] = {
0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280
};
static const uint32_t innerpad[11] = {
0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x000004a0
};
static const uint32_t outerpad[8] = {
0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300
};
static const uint32_t finalblk[16] = {
0x00000001, 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000620
};
static inline void HMAC_SHA256_80_init(const uint32_t *key,
uint32_t *tstate, uint32_t *ostate)
{
uint32_t ihash[8];
uint32_t pad[16];
int i;
/* tstate is assumed to contain the midstate of key */
memcpy(pad, key + 16, 16);
memcpy(pad + 4, keypad, 48);
sha256_transform(tstate, pad, 0);
memcpy(ihash, tstate, 32);
sha256_init(ostate);
for (i = 0; i < 8; i++)
pad[i] = ihash[i] ^ 0x5c5c5c5c;
for (; i < 16; i++)
pad[i] = 0x5c5c5c5c;
sha256_transform(ostate, pad, 0);
sha256_init(tstate);
for (i = 0; i < 8; i++)
pad[i] = ihash[i] ^ 0x36363636;
for (; i < 16; i++)
pad[i] = 0x36363636;
sha256_transform(tstate, pad, 0);
}
static inline void PBKDF2_SHA256_80_128(const uint32_t *tstate,
const uint32_t *ostate, const uint32_t *salt, uint32_t *output)
{
uint32_t istate[8], ostate2[8];
uint32_t ibuf[16], obuf[16];
int i, j;
memcpy(istate, tstate, 32);
sha256_transform(istate, salt, 0);
memcpy(ibuf, salt + 16, 16);
memcpy(ibuf + 5, innerpad, 44);
memcpy(obuf + 8, outerpad, 32);
for (i = 0; i < 4; i++) {
memcpy(obuf, istate, 32);
ibuf[4] = i + 1;
sha256_transform(obuf, ibuf, 0);
memcpy(ostate2, ostate, 32);
sha256_transform(ostate2, obuf, 0);
for (j = 0; j < 8; j++)
output[8 * i + j] = swab32(ostate2[j]);
}
}
static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate,
const uint32_t *salt, uint32_t *output)
{
uint32_t buf[16];
int i;
sha256_transform(tstate, salt, 1);
sha256_transform(tstate, salt + 16, 1);
sha256_transform(tstate, finalblk, 0);
memcpy(buf, tstate, 32);
memcpy(buf + 8, outerpad, 32);
sha256_transform(ostate, buf, 0);
for (i = 0; i < 8; i++)
output[i] = swab32(ostate[i]);
}
#ifdef HAVE_SHA256_4WAY
static const uint32_t keypad_4way[4 * 12] = {
0x80000000, 0x80000000, 0x80000000, 0x80000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000280, 0x00000280, 0x00000280, 0x00000280
};
static const uint32_t innerpad_4way[4 * 11] = {
0x80000000, 0x80000000, 0x80000000, 0x80000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x000004a0, 0x000004a0, 0x000004a0, 0x000004a0
};
static const uint32_t outerpad_4way[4 * 8] = {
0x80000000, 0x80000000, 0x80000000, 0x80000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000300, 0x00000300, 0x00000300, 0x00000300
};
static const uint32_t finalblk_4way[4 * 16] __attribute__((aligned(16))) = {
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x80000000, 0x80000000, 0x80000000, 0x80000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000620, 0x00000620, 0x00000620, 0x00000620
};
static inline void HMAC_SHA256_80_init_4way(const uint32_t *key,
uint32_t *tstate, uint32_t *ostate)
{
uint32_t ihash[4 * 8] __attribute__((aligned(16)));
uint32_t pad[4 * 16] __attribute__((aligned(16)));
int i;
/* tstate is assumed to contain the midstate of key */
memcpy(pad, key + 4 * 16, 4 * 16);
memcpy(pad + 4 * 4, keypad_4way, 4 * 48);
sha256_transform_4way(tstate, pad, 0);
memcpy(ihash, tstate, 4 * 32);
sha256_init_4way(ostate);
for (i = 0; i < 4 * 8; i++)
pad[i] = ihash[i] ^ 0x5c5c5c5c;
for (; i < 4 * 16; i++)
pad[i] = 0x5c5c5c5c;
sha256_transform_4way(ostate, pad, 0);
sha256_init_4way(tstate);
for (i = 0; i < 4 * 8; i++)
pad[i] = ihash[i] ^ 0x36363636;
for (; i < 4 * 16; i++)
pad[i] = 0x36363636;
sha256_transform_4way(tstate, pad, 0);
}
static inline void PBKDF2_SHA256_80_128_4way(const uint32_t *tstate,
const uint32_t *ostate, const uint32_t *salt, uint32_t *output)
{
uint32_t istate[4 * 8] __attribute__((aligned(16)));
uint32_t ostate2[4 * 8] __attribute__((aligned(16)));
uint32_t ibuf[4 * 16] __attribute__((aligned(16)));
uint32_t obuf[4 * 16] __attribute__((aligned(16)));
int i, j;
memcpy(istate, tstate, 4 * 32);
sha256_transform_4way(istate, salt, 0);
memcpy(ibuf, salt + 4 * 16, 4 * 16);
memcpy(ibuf + 4 * 5, innerpad_4way, 4 * 44);
memcpy(obuf + 4 * 8, outerpad_4way, 4 * 32);
for (i = 0; i < 4; i++) {
memcpy(obuf, istate, 4 * 32);
ibuf[4 * 4 + 0] = i + 1;
ibuf[4 * 4 + 1] = i + 1;
ibuf[4 * 4 + 2] = i + 1;
ibuf[4 * 4 + 3] = i + 1;
sha256_transform_4way(obuf, ibuf, 0);
memcpy(ostate2, ostate, 4 * 32);
sha256_transform_4way(ostate2, obuf, 0);
for (j = 0; j < 4 * 8; j++)
output[4 * 8 * i + j] = swab32(ostate2[j]);
}
}
static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate,
uint32_t *ostate, const uint32_t *salt, uint32_t *output)
{
uint32_t buf[4 * 16] __attribute__((aligned(16)));
int i;
sha256_transform_4way(tstate, salt, 1);
sha256_transform_4way(tstate, salt + 4 * 16, 1);
sha256_transform_4way(tstate, finalblk_4way, 0);
memcpy(buf, tstate, 4 * 32);
memcpy(buf + 4 * 8, outerpad_4way, 4 * 32);
sha256_transform_4way(ostate, buf, 0);
for (i = 0; i < 4 * 8; i++)
output[i] = swab32(ostate[i]);
}
#endif /* HAVE_SHA256_4WAY */
#ifdef HAVE_SHA256_8WAY
/* Final 64-byte SHA-256 block for the inner hash of PBKDF2_SHA256_128_32,
 * replicated across 8 interleaved lanes: big-endian block index 1, the
 * 0x80 padding bit, zero fill, and the total message length 0x620 bits
 * (64-byte ipad + 128-byte salt + 4-byte index = 196 bytes). */
static const uint32_t finalblk_8way[8 * 16] __attribute__((aligned(32))) = {
0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620
};
/* HMAC-SHA256 key setup for an 80-byte key, 8-way interleaved.
 * On return, tstate holds the inner-pad (0x36) midstate and ostate the
 * outer-pad (0x5c) midstate; both are later consumed by the PBKDF2
 * routines.  Pad constants are built inline rather than from tables. */
static inline void HMAC_SHA256_80_init_8way(const uint32_t *key,
uint32_t *tstate, uint32_t *ostate)
{
uint32_t ihash[8 * 8] __attribute__((aligned(32)));
uint32_t pad[8 * 16] __attribute__((aligned(32)));
int i;
/* tstate is assumed to contain the midstate of key */
/* Final block of the key hash: last 16 key bytes, padding bit, and
 * the length 0x280 bits (80 bytes). */
memcpy(pad, key + 8 * 16, 8 * 16);
for (i = 0; i < 8; i++)
pad[8 * 4 + i] = 0x80000000;
memset(pad + 8 * 5, 0x00, 8 * 40);
for (i = 0; i < 8; i++)
pad[8 * 15 + i] = 0x00000280;
sha256_transform_8way(tstate, pad, 0);
memcpy(ihash, tstate, 8 * 32);
/* Outer midstate: H(ihash ^ opad ...). */
sha256_init_8way(ostate);
for (i = 0; i < 8 * 8; i++)
pad[i] = ihash[i] ^ 0x5c5c5c5c;
for (; i < 8 * 16; i++)
pad[i] = 0x5c5c5c5c;
sha256_transform_8way(ostate, pad, 0);
/* Inner midstate: H(ihash ^ ipad ...). */
sha256_init_8way(tstate);
for (i = 0; i < 8 * 8; i++)
pad[i] = ihash[i] ^ 0x36363636;
for (; i < 8 * 16; i++)
pad[i] = 0x36363636;
sha256_transform_8way(tstate, pad, 0);
}
/* PBKDF2-HMAC-SHA256 first stage, 8-way interleaved counterpart of
 * PBKDF2_SHA256_80_128_4way; inner/outer padding is built inline
 * instead of coming from the 4-way constant tables. */
static inline void PBKDF2_SHA256_80_128_8way(const uint32_t *tstate,
const uint32_t *ostate, const uint32_t *salt, uint32_t *output)
{
uint32_t istate[8 * 8] __attribute__((aligned(32)));
uint32_t ostate2[8 * 8] __attribute__((aligned(32)));
uint32_t ibuf[8 * 16] __attribute__((aligned(32)));
uint32_t obuf[8 * 16] __attribute__((aligned(32)));
int i, j;
memcpy(istate, tstate, 8 * 32);
sha256_transform_8way(istate, salt, 0);
/* Inner final block: last 16 salt bytes, index slot, padding bit,
 * and length 0x4a0 bits (64-byte ipad + 80-byte salt + 4-byte index). */
memcpy(ibuf, salt + 8 * 16, 8 * 16);
for (i = 0; i < 8; i++)
ibuf[8 * 5 + i] = 0x80000000;
memset(ibuf + 8 * 6, 0x00, 8 * 36);
for (i = 0; i < 8; i++)
ibuf[8 * 15 + i] = 0x000004a0;
/* Outer final block padding: 32-byte digest, then length 0x300 bits. */
for (i = 0; i < 8; i++)
obuf[8 * 8 + i] = 0x80000000;
memset(obuf + 8 * 9, 0x00, 8 * 24);
for (i = 0; i < 8; i++)
obuf[8 * 15 + i] = 0x00000300;
for (i = 0; i < 4; i++) {
memcpy(obuf, istate, 8 * 32);
/* Big-endian PBKDF2 block index (i + 1) in every lane. */
ibuf[8 * 4 + 0] = i + 1;
ibuf[8 * 4 + 1] = i + 1;
ibuf[8 * 4 + 2] = i + 1;
ibuf[8 * 4 + 3] = i + 1;
ibuf[8 * 4 + 4] = i + 1;
ibuf[8 * 4 + 5] = i + 1;
ibuf[8 * 4 + 6] = i + 1;
ibuf[8 * 4 + 7] = i + 1;
sha256_transform_8way(obuf, ibuf, 0);
memcpy(ostate2, ostate, 8 * 32);
sha256_transform_8way(ostate2, obuf, 0);
for (j = 0; j < 8 * 8; j++)
output[8 * 8 * i + j] = swab32(ostate2[j]);
}
}
/* PBKDF2-HMAC-SHA256 final stage, 8-way: see the 4-way variant for the
 * structure; outer padding is built inline.  tstate/ostate are clobbered. */
static inline void PBKDF2_SHA256_128_32_8way(uint32_t *tstate,
uint32_t *ostate, const uint32_t *salt, uint32_t *output)
{
uint32_t buf[8 * 16] __attribute__((aligned(32)));
int i;
sha256_transform_8way(tstate, salt, 1);
sha256_transform_8way(tstate, salt + 8 * 16, 1);
sha256_transform_8way(tstate, finalblk_8way, 0);
memcpy(buf, tstate, 8 * 32);
/* Outer final block: digest, padding bit, length 0x300 bits (96 bytes). */
for (i = 0; i < 8; i++)
buf[8 * 8 + i] = 0x80000000;
memset(buf + 8 * 9, 0x00, 8 * 24);
for (i = 0; i < 8; i++)
buf[8 * 15 + i] = 0x00000300;
sha256_transform_8way(ostate, buf, 0);
for (i = 0; i < 8 * 8; i++)
output[i] = swab32(ostate[i]);
}
#endif /* HAVE_SHA256_8WAY */
#if defined(__x86_64__)
#define SCRYPT_MAX_WAYS 12
#define HAVE_SCRYPT_3WAY 1
int scrypt_best_throughput();
void scrypt_core(uint32_t *X, uint32_t *V);
void scrypt_core_3way(uint32_t *X, uint32_t *V);
#if defined(USE_AVX2)
#undef SCRYPT_MAX_WAYS
#define SCRYPT_MAX_WAYS 24
#define HAVE_SCRYPT_6WAY 1
void scrypt_core_6way(uint32_t *X, uint32_t *V);
#endif
#elif defined(__i386__)
#define SCRYPT_MAX_WAYS 4
#define scrypt_best_throughput() 1
void scrypt_core(uint32_t *X, uint32_t *V);
#elif defined(__arm__) && defined(__APCS_32__)
void scrypt_core(uint32_t *X, uint32_t *V);
#if defined(__ARM_NEON__)
#undef HAVE_SHA256_4WAY
#define SCRYPT_MAX_WAYS 3
#define HAVE_SCRYPT_3WAY 1
#define scrypt_best_throughput() 3
void scrypt_core_3way(uint32_t *X, uint32_t *V);
#endif
#else
/* Salsa20/8 core step used by scrypt's BlockMix: B ^= Bx, then
 * B += Salsa20/8(B).  Behaviourally identical to the fully unrolled
 * original; the sixteen working words live in a local array instead of
 * sixteen separate scalars. */
static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16])
{
	uint32_t x[16];
	int r;

	for (r = 0; r < 16; r++)
		x[r] = (B[r] ^= Bx[r]);

#define R(a, b) (((a) << (b)) | ((a) >> (32 - (b))))
	/* 8 rounds = 4 double-rounds of column then row quarter-rounds. */
	for (r = 0; r < 8; r += 2) {
		/* Operate on columns. */
		x[ 4] ^= R(x[ 0]+x[12], 7);  x[ 9] ^= R(x[ 5]+x[ 1], 7);
		x[14] ^= R(x[10]+x[ 6], 7);  x[ 3] ^= R(x[15]+x[11], 7);
		x[ 8] ^= R(x[ 4]+x[ 0], 9);  x[13] ^= R(x[ 9]+x[ 5], 9);
		x[ 2] ^= R(x[14]+x[10], 9);  x[ 7] ^= R(x[ 3]+x[15], 9);
		x[12] ^= R(x[ 8]+x[ 4],13);  x[ 1] ^= R(x[13]+x[ 9],13);
		x[ 6] ^= R(x[ 2]+x[14],13);  x[11] ^= R(x[ 7]+x[ 3],13);
		x[ 0] ^= R(x[12]+x[ 8],18);  x[ 5] ^= R(x[ 1]+x[13],18);
		x[10] ^= R(x[ 6]+x[ 2],18);  x[15] ^= R(x[11]+x[ 7],18);
		/* Operate on rows. */
		x[ 1] ^= R(x[ 0]+x[ 3], 7);  x[ 6] ^= R(x[ 5]+x[ 4], 7);
		x[11] ^= R(x[10]+x[ 9], 7);  x[12] ^= R(x[15]+x[14], 7);
		x[ 2] ^= R(x[ 1]+x[ 0], 9);  x[ 7] ^= R(x[ 6]+x[ 5], 9);
		x[ 8] ^= R(x[11]+x[10], 9);  x[13] ^= R(x[12]+x[15], 9);
		x[ 3] ^= R(x[ 2]+x[ 1],13);  x[ 4] ^= R(x[ 7]+x[ 6],13);
		x[ 9] ^= R(x[ 8]+x[11],13);  x[14] ^= R(x[13]+x[12],13);
		x[ 0] ^= R(x[ 3]+x[ 2],18);  x[ 5] ^= R(x[ 4]+x[ 7],18);
		x[10] ^= R(x[ 9]+x[ 8],18);  x[15] ^= R(x[14]+x[13],18);
	}
#undef R

	for (r = 0; r < 16; r++)
		B[r] += x[r];
}
/* Generic scrypt ROMix with N = 1024, r = 1 (portable fallback path).
 * X is the 128-byte working block (two 64-byte halves), V is the
 * 1024 * 128-byte scratchpad.  Phase 1 fills V; phase 2 mixes X with
 * pseudo-random scratchpad rows selected by X[16] (the first word of
 * the second half) masked to the row count. */
static inline void scrypt_core(uint32_t *X, uint32_t *V)
{
uint32_t i, j, k;
for (i = 0; i < 1024; i++) {
memcpy(&V[i * 32], X, 128);
xor_salsa8(&X[0], &X[16]);
xor_salsa8(&X[16], &X[0]);
}
for (i = 0; i < 1024; i++) {
/* Data-dependent row index: 32 words per 128-byte row. */
j = 32 * (X[16] & 1023);
for (k = 0; k < 32; k++)
X[k] ^= V[j + k];
xor_salsa8(&X[0], &X[16]);
xor_salsa8(&X[16], &X[0]);
}
}
#endif
#ifndef SCRYPT_MAX_WAYS
#define SCRYPT_MAX_WAYS 1
#define scrypt_best_throughput() 1
#endif

/* Scratchpad size: one 128 KiB V array per parallel way, plus 63 bytes of
 * slack so users can realign the pointer to a 64-byte boundary. */
#define SCRYPT_BUFFER_SIZE (SCRYPT_MAX_WAYS * 131072 + 63)

/* Allocate the per-thread scrypt scratchpad.  Returns NULL on allocation
 * failure; the caller owns the buffer and must check and free it. */
unsigned char *scrypt_buffer_alloc(void)
{
	return malloc(SCRYPT_BUFFER_SIZE);
}
/* Full single-way scrypt(1024, 1, 1) of one 80-byte header.
 * midstate is the SHA-256 state after the header's first 64 bytes;
 * scratchpad is realigned to 64 bytes before use as the ROMix V array. */
static void scrypt_1024_1_1_256(const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad)
{
uint32_t tstate[8], ostate[8];
uint32_t X[32];
uint32_t *V;
V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
memcpy(tstate, midstate, 32);
HMAC_SHA256_80_init(input, tstate, ostate);
PBKDF2_SHA256_80_128(tstate, ostate, input, X);
scrypt_core(X, V);
PBKDF2_SHA256_128_32(tstate, ostate, X, output);
}
#ifdef HAVE_SHA256_4WAY
/* 4-way scrypt: the SHA-256 stages run 4 lanes word-interleaved, while
 * the memory-hard scrypt_core runs serially per lane (it is not
 * vectorized on this path).  input holds 4 consecutive 80-byte headers;
 * output receives 4 consecutive 32-byte hashes. */
static void scrypt_1024_1_1_256_4way(const uint32_t *input,
uint32_t *output, uint32_t *midstate, unsigned char *scratchpad)
{
uint32_t tstate[4 * 8] __attribute__((aligned(128)));
uint32_t ostate[4 * 8] __attribute__((aligned(128)));
uint32_t W[4 * 32] __attribute__((aligned(128)));
uint32_t X[4 * 32] __attribute__((aligned(128)));
uint32_t *V;
int i, k;
V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
/* Interleave the 4 headers word-by-word for the SIMD SHA-256. */
for (i = 0; i < 20; i++)
for (k = 0; k < 4; k++)
W[4 * i + k] = input[k * 20 + i];
for (i = 0; i < 8; i++)
for (k = 0; k < 4; k++)
tstate[4 * i + k] = midstate[i];
HMAC_SHA256_80_init_4way(W, tstate, ostate);
PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W);
/* De-interleave into per-lane 128-byte blocks for scrypt_core. */
for (i = 0; i < 32; i++)
for (k = 0; k < 4; k++)
X[k * 32 + i] = W[4 * i + k];
scrypt_core(X + 0 * 32, V);
scrypt_core(X + 1 * 32, V);
scrypt_core(X + 2 * 32, V);
scrypt_core(X + 3 * 32, V);
/* Re-interleave for the final 4-way PBKDF2 stage. */
for (i = 0; i < 32; i++)
for (k = 0; k < 4; k++)
W[4 * i + k] = X[k * 32 + i];
PBKDF2_SHA256_128_32_4way(tstate, ostate, W, W);
for (i = 0; i < 8; i++)
for (k = 0; k < 4; k++)
output[k * 8 + i] = W[4 * i + k];
}
#endif /* HAVE_SHA256_4WAY */
#ifdef HAVE_SCRYPT_3WAY
/* 3-way scrypt: scalar SHA-256 stages per lane, with the memory-hard
 * loop handled by the 3-way assembly scrypt_core_3way.  input holds 3
 * consecutive 80-byte headers; output receives 3 x 32 bytes. */
static void scrypt_1024_1_1_256_3way(const uint32_t *input,
uint32_t *output, uint32_t *midstate, unsigned char *scratchpad)
{
uint32_t tstate[3 * 8], ostate[3 * 8];
uint32_t X[3 * 32] __attribute__((aligned(64)));
uint32_t *V;
V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
memcpy(tstate + 0, midstate, 32);
memcpy(tstate + 8, midstate, 32);
memcpy(tstate + 16, midstate, 32);
HMAC_SHA256_80_init(input + 0, tstate + 0, ostate + 0);
HMAC_SHA256_80_init(input + 20, tstate + 8, ostate + 8);
HMAC_SHA256_80_init(input + 40, tstate + 16, ostate + 16);
PBKDF2_SHA256_80_128(tstate + 0, ostate + 0, input + 0, X + 0);
PBKDF2_SHA256_80_128(tstate + 8, ostate + 8, input + 20, X + 32);
PBKDF2_SHA256_80_128(tstate + 16, ostate + 16, input + 40, X + 64);
scrypt_core_3way(X, V);
PBKDF2_SHA256_128_32(tstate + 0, ostate + 0, X + 0, output + 0);
PBKDF2_SHA256_128_32(tstate + 8, ostate + 8, X + 32, output + 8);
PBKDF2_SHA256_128_32(tstate + 16, ostate + 16, X + 64, output + 16);
}
#ifdef HAVE_SHA256_4WAY
/* 12-way scrypt = 3 groups of 4-way SHA-256 stages plus 4 calls to the
 * 3-way scrypt core.  input holds 12 consecutive 80-byte headers. */
static void scrypt_1024_1_1_256_12way(const uint32_t *input,
uint32_t *output, uint32_t *midstate, unsigned char *scratchpad)
{
uint32_t tstate[12 * 8] __attribute__((aligned(128)));
uint32_t ostate[12 * 8] __attribute__((aligned(128)));
uint32_t W[12 * 32] __attribute__((aligned(128)));
uint32_t X[12 * 32] __attribute__((aligned(128)));
uint32_t *V;
int i, j, k;
V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
/* Interleave 4 headers per group of 128 words. */
for (j = 0; j < 3; j++)
for (i = 0; i < 20; i++)
for (k = 0; k < 4; k++)
W[128 * j + 4 * i + k] = input[80 * j + k * 20 + i];
for (j = 0; j < 3; j++)
for (i = 0; i < 8; i++)
for (k = 0; k < 4; k++)
tstate[32 * j + 4 * i + k] = midstate[i];
HMAC_SHA256_80_init_4way(W + 0, tstate + 0, ostate + 0);
HMAC_SHA256_80_init_4way(W + 128, tstate + 32, ostate + 32);
HMAC_SHA256_80_init_4way(W + 256, tstate + 64, ostate + 64);
PBKDF2_SHA256_80_128_4way(tstate + 0, ostate + 0, W + 0, W + 0);
PBKDF2_SHA256_80_128_4way(tstate + 32, ostate + 32, W + 128, W + 128);
PBKDF2_SHA256_80_128_4way(tstate + 64, ostate + 64, W + 256, W + 256);
/* De-interleave to 12 contiguous 128-byte lanes (3 lanes per core call). */
for (j = 0; j < 3; j++)
for (i = 0; i < 32; i++)
for (k = 0; k < 4; k++)
X[128 * j + k * 32 + i] = W[128 * j + 4 * i + k];
scrypt_core_3way(X + 0 * 96, V);
scrypt_core_3way(X + 1 * 96, V);
scrypt_core_3way(X + 2 * 96, V);
scrypt_core_3way(X + 3 * 96, V);
for (j = 0; j < 3; j++)
for (i = 0; i < 32; i++)
for (k = 0; k < 4; k++)
W[128 * j + 4 * i + k] = X[128 * j + k * 32 + i];
PBKDF2_SHA256_128_32_4way(tstate + 0, ostate + 0, W + 0, W + 0);
PBKDF2_SHA256_128_32_4way(tstate + 32, ostate + 32, W + 128, W + 128);
PBKDF2_SHA256_128_32_4way(tstate + 64, ostate + 64, W + 256, W + 256);
for (j = 0; j < 3; j++)
for (i = 0; i < 8; i++)
for (k = 0; k < 4; k++)
output[32 * j + k * 8 + i] = W[128 * j + 4 * i + k];
}
#endif /* HAVE_SHA256_4WAY */
#endif /* HAVE_SCRYPT_3WAY */
#ifdef HAVE_SCRYPT_6WAY
/* 24-way scrypt (AVX2 path) = 3 groups of 8-way SHA-256 stages plus 4
 * calls to the 6-way scrypt core.  input holds 24 consecutive headers. */
static void scrypt_1024_1_1_256_24way(const uint32_t *input,
uint32_t *output, uint32_t *midstate, unsigned char *scratchpad)
{
uint32_t tstate[24 * 8] __attribute__((aligned(128)));
uint32_t ostate[24 * 8] __attribute__((aligned(128)));
uint32_t W[24 * 32] __attribute__((aligned(128)));
uint32_t X[24 * 32] __attribute__((aligned(128)));
uint32_t *V;
int i, j, k;
V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
/* Interleave 8 headers per group of 256 words. */
for (j = 0; j < 3; j++)
for (i = 0; i < 20; i++)
for (k = 0; k < 8; k++)
W[8 * 32 * j + 8 * i + k] = input[8 * 20 * j + k * 20 + i];
for (j = 0; j < 3; j++)
for (i = 0; i < 8; i++)
for (k = 0; k < 8; k++)
tstate[8 * 8 * j + 8 * i + k] = midstate[i];
HMAC_SHA256_80_init_8way(W + 0, tstate + 0, ostate + 0);
HMAC_SHA256_80_init_8way(W + 256, tstate + 64, ostate + 64);
HMAC_SHA256_80_init_8way(W + 512, tstate + 128, ostate + 128);
PBKDF2_SHA256_80_128_8way(tstate + 0, ostate + 0, W + 0, W + 0);
PBKDF2_SHA256_80_128_8way(tstate + 64, ostate + 64, W + 256, W + 256);
PBKDF2_SHA256_80_128_8way(tstate + 128, ostate + 128, W + 512, W + 512);
/* De-interleave to 24 contiguous lanes (6 lanes per core call). */
for (j = 0; j < 3; j++)
for (i = 0; i < 32; i++)
for (k = 0; k < 8; k++)
X[8 * 32 * j + k * 32 + i] = W[8 * 32 * j + 8 * i + k];
scrypt_core_6way(X + 0 * 32, V);
scrypt_core_6way(X + 6 * 32, V);
scrypt_core_6way(X + 12 * 32, V);
scrypt_core_6way(X + 18 * 32, V);
for (j = 0; j < 3; j++)
for (i = 0; i < 32; i++)
for (k = 0; k < 8; k++)
W[8 * 32 * j + 8 * i + k] = X[8 * 32 * j + k * 32 + i];
PBKDF2_SHA256_128_32_8way(tstate + 0, ostate + 0, W + 0, W + 0);
PBKDF2_SHA256_128_32_8way(tstate + 64, ostate + 64, W + 256, W + 256);
PBKDF2_SHA256_128_32_8way(tstate + 128, ostate + 128, W + 512, W + 512);
for (j = 0; j < 3; j++)
for (i = 0; i < 8; i++)
for (k = 0; k < 8; k++)
output[8 * 8 * j + k * 8 + i] = W[8 * 32 * j + 8 * i + k];
}
#endif /* HAVE_SCRYPT_6WAY */
/* Scan nonces for scrypt: replicates the 80-byte header across the chosen
 * throughput (1/3/4/12/24 ways), hashes batches with increasing nonces in
 * word 19, and returns 1 with pdata[19] set to the winning nonce when a
 * hash meets the target.  hashes_done is updated either way. */
int scanhash_scrypt(int thr_id, uint32_t *pdata,
unsigned char *scratchbuf, const uint32_t *ptarget,
uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8];
uint32_t midstate[8];
uint32_t n = pdata[19] - 1;
const uint32_t Htarg = ptarget[7];
int throughput = scrypt_best_throughput();
int i;
#ifdef HAVE_SHA256_4WAY
if (sha256_use_4way())
throughput *= 4;
#endif
for (i = 0; i < throughput; i++)
memcpy(data + i * 20, pdata, 80);
/* Midstate over the first 64 header bytes is nonce-independent. */
sha256_init(midstate);
sha256_transform(midstate, data, 0);
do {
for (i = 0; i < throughput; i++)
data[i * 20 + 19] = ++n;
/* Dispatch on throughput; the #ifdef'ed branches fall through to
 * the scalar implementation when no SIMD variant matches. */
#if defined(HAVE_SHA256_4WAY)
if (throughput == 4)
scrypt_1024_1_1_256_4way(data, hash, midstate, scratchbuf);
else
#endif
#if defined(HAVE_SCRYPT_3WAY) && defined(HAVE_SHA256_4WAY)
if (throughput == 12)
scrypt_1024_1_1_256_12way(data, hash, midstate, scratchbuf);
else
#endif
#if defined(HAVE_SCRYPT_6WAY)
if (throughput == 24)
scrypt_1024_1_1_256_24way(data, hash, midstate, scratchbuf);
else
#endif
#if defined(HAVE_SCRYPT_3WAY)
if (throughput == 3)
scrypt_1024_1_1_256_3way(data, hash, midstate, scratchbuf);
else
#endif
scrypt_1024_1_1_256(data, hash, midstate, scratchbuf);
/* Quick reject on the top word before the full target comparison. */
for (i = 0; i < throughput; i++) {
if (hash[i * 8 + 7] <= Htarg && fulltest(hash + i * 8, ptarget)) {
*hashes_done = n - pdata[19] + 1;
pdata[19] = data[i * 20 + 19];
return 1;
}
}
} while (n < max_nonce && !work_restart[thr_id].restart);
*hashes_done = n - pdata[19] + 1;
pdata[19] = n;
return 0;
}
/* Register scrypt's scanhash/hash entry points with the algo gate.
 * Always succeeds. */
bool register_scrypt_algo( algo_gate_t* gate )
{
  gate->scanhash = &scanhash_scrypt;
  gate->hash = &scrypt_hash;
  /* get_max64 override intentionally disabled; default gate value is used. */
//  gate->get_max64 = scrypt_get_max64;
  return true;
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,630 +0,0 @@
/*
* Copyright 2011 ArtForz
* Copyright 2011-2013 pooler
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version. See COPYING for more details.
*/
#include "../cpuminer-config.h"
#include "../miner.h"
#include <string.h>
#include <stdint.h>
#if defined(__arm__) && defined(__APCS_32__)
#define EXTERN_SHA256
#endif
/* SHA-256 initial hash values H0..H7 (FIPS 180-4, section 5.3.3). */
static const uint32_t sha256_h[8] = {
	0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
	0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
};

/* SHA-256 round constants K0..K63 (FIPS 180-4, section 4.2.2). */
static const uint32_t sha256_k[64] = {
	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};

/* Reset state[0..7] to the SHA-256 initial hash values. */
void sha256_init(uint32_t *state)
{
	int i;

	for (i = 0; i < 8; i++)
		state[i] = sha256_h[i];
}
/* Elementary functions used by SHA256 (FIPS 180-4 section 4.1.2).
 * NOTE(review): macro arguments are not parenthesized, so only pass
 * simple expressions (plain variables / array elements). */
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
#define ROTR(x, n) ((x >> n) | (x << (32 - n)))
#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ (x >> 3))
#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ (x >> 10))
/* SHA256 round function; expects uint32_t locals t0, t1 in scope. */
#define RND(a, b, c, d, e, f, g, h, k) \
do { \
t0 = h + S1(e) + Ch(e, f, g) + k; \
t1 = S0(a) + Maj(a, b, c); \
d += t0; \
h = t0 + t1; \
} while (0)
/* Adjusted round function for rotating state: round i picks the eight
 * working variables out of S[0..7] by index instead of shuffling them. */
#define RNDr(S, W, i) \
RND(S[(64 - i) % 8], S[(65 - i) % 8], \
S[(66 - i) % 8], S[(67 - i) % 8], \
S[(68 - i) % 8], S[(69 - i) % 8], \
S[(70 - i) % 8], S[(71 - i) % 8], \
W[i] + sha256_k[i])
#ifndef EXTERN_SHA256
/*
 * SHA-256 block compression function: folds one 512-bit block into the
 * 256-bit state.  If swap is non-zero the block words are byte-swapped
 * (big-endian input) first.  Behaviourally identical to the unrolled
 * original; the 64 rounds are expressed as a loop over RNDr, which
 * indexes the working variables by round number.
 */
void sha256_transform(uint32_t *state, const uint32_t *block, int swap)
{
	uint32_t W[64];
	uint32_t S[8];
	uint32_t t0, t1;
	int i;

	/* 1. Prepare the message schedule W. */
	if (swap) {
		for (i = 0; i < 16; i++)
			W[i] = swab32(block[i]);
	} else
		memcpy(W, block, 64);
	for (i = 16; i < 64; i += 2) {
		W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
		W[i + 1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15];
	}

	/* 2. Initialize working variables from the current state. */
	memcpy(S, state, 32);

	/* 3. Mix: 64 rounds. */
	for (i = 0; i < 64; i++)
		RNDr(S, W, i);

	/* 4. Add the working variables back into the global state. */
	for (i = 0; i < 8; i++)
		state[i] += S[i];
}
#endif /* EXTERN_SHA256 */
/* Template block for the second hash of sha256d: words 8-15 are the fixed
 * SHA-256 padding of a 32-byte message (0x80 bit, zeros, length 0x100
 * bits); words 0-7 are placeholders that get overwritten with the first
 * hash's digest before transforming. */
static const uint32_t sha256d_hash1[16] = {
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x80000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000100
};
/* Double SHA-256 of an 80-byte header already laid out as 20 big-endian
 * words (two 64-byte padded blocks assumed prepared by the caller's
 * buffer layout), with the result byte-swapped for target comparison. */
static void sha256d_80_swap(uint32_t *hash, const uint32_t *data)
{
uint32_t S[16];
int i;
sha256_init(S);
sha256_transform(S, data, 0);
sha256_transform(S, data + 16, 0);
/* Append the fixed padding for the 32-byte second message. */
memcpy(S + 8, sha256d_hash1 + 8, 32);
sha256_init(hash);
sha256_transform(hash, S, 0);
for (i = 0; i < 8; i++)
hash[i] = swab32(hash[i]);
}
/* General double SHA-256 over an arbitrary byte buffer.
 * NOTE(review): the bit length is stored in a single 32-bit word
 * (T[15] = 8 * len), so len is assumed well below 512 MiB. */
void sha256d(unsigned char *hash, const unsigned char *data, int len)
{
uint32_t S[16], T[16];
int i, r;
sha256_init(S);
/* r counts remaining bytes; looping while r > -9 guarantees one extra
 * block when the 0x80 byte and 8-byte length do not fit after the data. */
for (r = len; r > -9; r -= 64) {
if (r < 64)
memset(T, 0, 64);
memcpy(T, data + len - r, r > 64 ? 64 : (r < 0 ? 0 : r));
if (r >= 0 && r < 64)
((unsigned char *)T)[r] = 0x80;
for (i = 0; i < 16; i++)
T[i] = be32dec(T + i);
/* Length field goes in the last word of the final block only. */
if (r < 56)
T[15] = 8 * len;
sha256_transform(S, T, 0);
}
/* Second hash: 32-byte digest plus fixed padding. */
memcpy(S + 8, sha256d_hash1 + 8, 32);
sha256_init(T);
sha256_transform(T, S, 0);
for (i = 0; i < 8; i++)
be32enc((uint32_t *)hash + i, T[i]);
}
/* Partially pre-extends the message schedule for the second 64-byte block
 * of an 80-byte header.  Every term that depends on W[3] (the nonce) is
 * deliberately omitted here and added per-nonce by sha256d_ms() — e.g.
 * W[18] lacks s0(W[3]), W[19] lacks W[3], W[20..31] lack the s1()/W[i-7]
 * terms that transitively involve the nonce. */
static inline void sha256d_preextend(uint32_t *W)
{
W[16] = s1(W[14]) + W[ 9] + s0(W[ 1]) + W[ 0];
W[17] = s1(W[15]) + W[10] + s0(W[ 2]) + W[ 1];
W[18] = s1(W[16]) + W[11] + W[ 2];
W[19] = s1(W[17]) + W[12] + s0(W[ 4]);
W[20] = W[13] + s0(W[ 5]) + W[ 4];
W[21] = W[14] + s0(W[ 6]) + W[ 5];
W[22] = W[15] + s0(W[ 7]) + W[ 6];
W[23] = W[16] + s0(W[ 8]) + W[ 7];
W[24] = W[17] + s0(W[ 9]) + W[ 8];
W[25] = s0(W[10]) + W[ 9];
W[26] = s0(W[11]) + W[10];
W[27] = s0(W[12]) + W[11];
W[28] = s0(W[13]) + W[12];
W[29] = s0(W[14]) + W[13];
W[30] = s0(W[15]) + W[14];
W[31] = s0(W[16]) + W[15];
}
/* Advances the midstate S through rounds 0-2 of the second block; these
 * rounds use only W[0..2] and so are independent of the nonce in W[3].
 * sha256d_ms() resumes from round 3. */
static inline void sha256d_prehash(uint32_t *S, const uint32_t *W)
{
uint32_t t0, t1;
RNDr(S, W, 0);
RNDr(S, W, 1);
RNDr(S, W, 2);
}
#ifdef EXTERN_SHA256
void sha256d_ms(uint32_t *hash, uint32_t *W,
const uint32_t *midstate, const uint32_t *prehash);
#else
/* Specialized double SHA-256 for 80-byte headers where only the nonce
 * (word W[3] of the second block) changes between calls.  midstate is the
 * state after the first block, prehash is midstate advanced through the
 * nonce-independent rounds 0-2, and W holds the schedule pre-extended by
 * sha256d_preextend().  Nonce-independent schedule words are saved on
 * entry and restored before returning so W can be reused for the next
 * nonce.  Behaviourally identical to the fully unrolled original; the
 * straight runs of RNDr rounds are expressed as loops. */
static inline void sha256d_ms(uint32_t *hash, uint32_t *W,
	const uint32_t *midstate, const uint32_t *prehash)
{
	uint32_t S[64];
	uint32_t t0, t1;
	int i;

	/* Save the pre-extended schedule words that get patched below. */
	S[18] = W[18];
	S[19] = W[19];
	S[20] = W[20];
	S[22] = W[22];
	S[23] = W[23];
	S[24] = W[24];
	S[30] = W[30];
	S[31] = W[31];

	/* Add the nonce-dependent terms omitted by sha256d_preextend(). */
	W[18] += s0(W[3]);
	W[19] += W[3];
	W[20] += s1(W[18]);
	W[21] = s1(W[19]);
	W[22] += s1(W[20]);
	W[23] += s1(W[21]);
	W[24] += s1(W[22]);
	W[25] = s1(W[23]) + W[18];
	W[26] = s1(W[24]) + W[19];
	W[27] = s1(W[25]) + W[20];
	W[28] = s1(W[26]) + W[21];
	W[29] = s1(W[27]) + W[22];
	W[30] += s1(W[28]) + W[23];
	W[31] += s1(W[29]) + W[24];
	for (i = 32; i < 64; i += 2) {
		W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
		W[i + 1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15];
	}

	/* First hash: resume from the 3-round prehash.  Rounds 3-63 only
	 * touch S[0..7], so the words saved in S[18..31] survive. */
	memcpy(S, prehash, 32);
	for (i = 3; i < 64; i++)
		RNDr(S, W, i);
	for (i = 0; i < 8; i++)
		S[i] += midstate[i];

	/* Restore the pristine pre-extended schedule for the next nonce. */
	W[18] = S[18];
	W[19] = S[19];
	W[20] = S[20];
	W[22] = S[22];
	W[23] = S[23];
	W[24] = S[24];
	W[30] = S[30];
	W[31] = S[31];

	/* Second hash: message is the digest in S[0..7] plus fixed padding;
	 * extend the schedule only as far as the final rounds need (S[60]). */
	memcpy(S + 8, sha256d_hash1 + 8, 32);
	S[16] = s1(sha256d_hash1[14]) + sha256d_hash1[ 9] + s0(S[ 1]) + S[ 0];
	S[17] = s1(sha256d_hash1[15]) + sha256d_hash1[10] + s0(S[ 2]) + S[ 1];
	S[18] = s1(S[16]) + sha256d_hash1[11] + s0(S[ 3]) + S[ 2];
	S[19] = s1(S[17]) + sha256d_hash1[12] + s0(S[ 4]) + S[ 3];
	S[20] = s1(S[18]) + sha256d_hash1[13] + s0(S[ 5]) + S[ 4];
	S[21] = s1(S[19]) + sha256d_hash1[14] + s0(S[ 6]) + S[ 5];
	S[22] = s1(S[20]) + sha256d_hash1[15] + s0(S[ 7]) + S[ 6];
	S[23] = s1(S[21]) + S[16] + s0(sha256d_hash1[ 8]) + S[ 7];
	S[24] = s1(S[22]) + S[17] + s0(sha256d_hash1[ 9]) + sha256d_hash1[ 8];
	S[25] = s1(S[23]) + S[18] + s0(sha256d_hash1[10]) + sha256d_hash1[ 9];
	S[26] = s1(S[24]) + S[19] + s0(sha256d_hash1[11]) + sha256d_hash1[10];
	S[27] = s1(S[25]) + S[20] + s0(sha256d_hash1[12]) + sha256d_hash1[11];
	S[28] = s1(S[26]) + S[21] + s0(sha256d_hash1[13]) + sha256d_hash1[12];
	S[29] = s1(S[27]) + S[22] + s0(sha256d_hash1[14]) + sha256d_hash1[13];
	S[30] = s1(S[28]) + S[23] + s0(sha256d_hash1[15]) + sha256d_hash1[14];
	S[31] = s1(S[29]) + S[24] + s0(S[16]) + sha256d_hash1[15];
	for (i = 32; i < 60; i += 2) {
		S[i] = s1(S[i - 2]) + S[i - 7] + s0(S[i - 15]) + S[i - 16];
		S[i + 1] = s1(S[i - 1]) + S[i - 6] + s0(S[i - 14]) + S[i - 15];
	}
	S[60] = s1(S[58]) + S[53] + s0(S[45]) + S[44];

	sha256_init(hash);
	for (i = 0; i <= 56; i++)
		RNDr(hash, S, i);

	/* Rounds 57-60 partially evaluated: only the terms that reach the
	 * published output words are computed.  hash[7] additionally folds
	 * in sha256_h[7], so callers can compare it against the target
	 * without a separate final addition. */
	hash[2] += hash[6] + S1(hash[3]) + Ch(hash[3], hash[4], hash[5])
	         + S[57] + sha256_k[57];
	hash[1] += hash[5] + S1(hash[2]) + Ch(hash[2], hash[3], hash[4])
	         + S[58] + sha256_k[58];
	hash[0] += hash[4] + S1(hash[1]) + Ch(hash[1], hash[2], hash[3])
	         + S[59] + sha256_k[59];
	hash[7] += hash[3] + S1(hash[0]) + Ch(hash[0], hash[1], hash[2])
	         + S[60] + sha256_k[60]
	         + sha256_h[7];
}
#endif /* EXTERN_SHA256 */
#ifdef HAVE_SHA256_4WAY
void sha256d_ms_4way(uint32_t *hash, uint32_t *data,
const uint32_t *midstate, const uint32_t *prehash);
/* 4-way sha256d nonce scan: broadcasts the pre-extended schedule,
 * midstate and prehash into interleaved lanes, then hashes 4 consecutive
 * nonces per sha256d_ms_4way call.  Returns 1 with pdata[19] set to the
 * winning nonce on success, 0 when exhausted or restarted. */
static inline int scanhash_sha256d_4way(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t data[4 * 64] __attribute__((aligned(128)));
uint32_t hash[4 * 8] __attribute__((aligned(32)));
uint32_t midstate[4 * 8] __attribute__((aligned(32)));
uint32_t prehash[4 * 8] __attribute__((aligned(32)));
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
int i, j;
memcpy(data, pdata + 16, 64);
sha256d_preextend(data);
/* Broadcast in place, highest index first so sources aren't clobbered. */
for (i = 31; i >= 0; i--)
for (j = 0; j < 4; j++)
data[i * 4 + j] = data[i];
sha256_init(midstate);
sha256_transform(midstate, pdata, 0);
memcpy(prehash, midstate, 32);
sha256d_prehash(prehash, pdata + 16);
for (i = 7; i >= 0; i--) {
for (j = 0; j < 4; j++) {
midstate[i * 4 + j] = midstate[i];
prehash[i * 4 + j] = prehash[i];
}
}
do {
/* Nonce word is W[3] of the second block, one per lane. */
for (i = 0; i < 4; i++)
data[4 * 3 + i] = ++n;
sha256d_ms_4way(hash, data, midstate, prehash);
for (i = 0; i < 4; i++) {
/* Quick check on the top word; recompute fully before fulltest. */
if (swab32(hash[4 * 7 + i]) <= Htarg) {
pdata[19] = data[4 * 3 + i];
sha256d_80_swap(hash, pdata);
if (fulltest(hash, ptarget)) {
*hashes_done = n - first_nonce + 1;
return 1;
}
}
}
} while (n < max_nonce && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}
#endif /* HAVE_SHA256_4WAY */
#ifdef HAVE_SHA256_8WAY
void sha256d_ms_8way(uint32_t *hash, uint32_t *data,
const uint32_t *midstate, const uint32_t *prehash);
/* 8-way sha256d nonce scan; identical structure to the 4-way variant
 * with 8 interleaved lanes per sha256d_ms_8way call. */
static inline int scanhash_sha256d_8way(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t data[8 * 64] __attribute__((aligned(128)));
uint32_t hash[8 * 8] __attribute__((aligned(32)));
uint32_t midstate[8 * 8] __attribute__((aligned(32)));
uint32_t prehash[8 * 8] __attribute__((aligned(32)));
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
int i, j;
memcpy(data, pdata + 16, 64);
sha256d_preextend(data);
/* Broadcast in place, highest index first so sources aren't clobbered. */
for (i = 31; i >= 0; i--)
for (j = 0; j < 8; j++)
data[i * 8 + j] = data[i];
sha256_init(midstate);
sha256_transform(midstate, pdata, 0);
memcpy(prehash, midstate, 32);
sha256d_prehash(prehash, pdata + 16);
for (i = 7; i >= 0; i--) {
for (j = 0; j < 8; j++) {
midstate[i * 8 + j] = midstate[i];
prehash[i * 8 + j] = prehash[i];
}
}
do {
for (i = 0; i < 8; i++)
data[8 * 3 + i] = ++n;
sha256d_ms_8way(hash, data, midstate, prehash);
for (i = 0; i < 8; i++) {
if (swab32(hash[8 * 7 + i]) <= Htarg) {
pdata[19] = data[8 * 3 + i];
sha256d_80_swap(hash, pdata);
if (fulltest(hash, ptarget)) {
*hashes_done = n - first_nonce + 1;
return 1;
}
}
}
} while (n < max_nonce && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}
#endif /* HAVE_SHA256_8WAY */
/* Scalar sha256d nonce scan entry point; dispatches to the 8-way or
 * 4-way SIMD scanners when available, otherwise iterates one nonce at a
 * time using the pre-extend/prehash optimized sha256d_ms(). */
int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t data[64] __attribute__((aligned(128)));
uint32_t hash[8] __attribute__((aligned(32)));
uint32_t midstate[8] __attribute__((aligned(32)));
uint32_t prehash[8] __attribute__((aligned(32)));
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
#ifdef HAVE_SHA256_8WAY
if (sha256_use_8way())
return scanhash_sha256d_8way(thr_id, pdata, ptarget,
max_nonce, hashes_done);
#endif
#ifdef HAVE_SHA256_4WAY
if (sha256_use_4way())
return scanhash_sha256d_4way(thr_id, pdata, ptarget,
max_nonce, hashes_done);
#endif
memcpy(data, pdata + 16, 64);
sha256d_preextend(data);
sha256_init(midstate);
sha256_transform(midstate, pdata, 0);
memcpy(prehash, midstate, 32);
sha256d_prehash(prehash, pdata + 16);
do {
/* data[3] is the nonce word of the second block. */
data[3] = ++n;
sha256d_ms(hash, data, midstate, prehash);
/* Quick check on the top word; recompute fully before fulltest. */
if (swab32(hash[7]) <= Htarg) {
pdata[19] = data[3];
sha256d_80_swap(hash, pdata);
if (fulltest(hash, ptarget)) {
*hashes_done = n - first_nonce + 1;
return 1;
}
}
} while (n < max_nonce && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}

View File

@@ -32,12 +32,10 @@
#include <string.h> #include <string.h>
#include <stdint.h> #include <stdint.h>
#include "algo/groestl/sph_groestl.h"
#include "algo/keccak/sph_keccak.h" #include "algo/keccak/sph_keccak.h"
#ifdef NO_AES_NI #ifndef NO_AES_NI
#include "algo/groestl/sse2/grso.h"
#include "algo/groestl/sse2/grso-macro.c"
#else
#include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
#endif #endif
@@ -61,17 +59,21 @@
#define POK_DATA_MASK 0xFFFF0000 #define POK_DATA_MASK 0xFFFF0000
typedef struct { typedef struct {
#ifndef NO_AES_NI #ifdef NO_AES_NI
hashState_groestl groestl; sph_groestl512_context groestl;
#else
hashState_groestl groestl;
#endif #endif
sph_keccak512_context keccak; sph_keccak512_context keccak;
} zr5_ctx_holder; } zr5_ctx_holder;
zr5_ctx_holder zr5_ctx; zr5_ctx_holder zr5_ctx;
void init_zr5_ctx() void init_zr5_ctx()
{ {
#ifndef NO_AES_NI #ifdef NO_AES_NI
sph_groestl512_init( &zr5_ctx.groestl );
#else
init_groestl( &zr5_ctx.groestl ); init_groestl( &zr5_ctx.groestl );
#endif #endif
sph_keccak512_init(&zr5_ctx.keccak); sph_keccak512_init(&zr5_ctx.keccak);
@@ -88,10 +90,6 @@ DATA_ALIGN16(sph_u64 hashctB);
//memset(hash, 0, 128); //memset(hash, 0, 128);
#ifdef NO_AES_NI
grsoState sts_grs;
#endif
static const int arrOrder[][4] = static const int arrOrder[][4] =
{ {
{ 0, 1, 2, 3 }, { 0, 1, 3, 2 }, { 0, 2, 1, 3 }, { 0, 2, 3, 1 }, { 0, 1, 2, 3 }, { 0, 1, 3, 2 }, { 0, 2, 1, 3 }, { 0, 2, 3, 1 },
@@ -123,9 +121,8 @@ static const int arrOrder[][4] =
break; break;
case 1: case 1:
#ifdef NO_AES_NI #ifdef NO_AES_NI
{GRS_I; sph_groestl512 (&ctx.groestl, hash, 64);
GRS_U; sph_groestl512_close(&ctx.groestl, hash);
GRS_C; }
#else #else
update_groestl( &ctx.groestl, (char*)hash,512); update_groestl( &ctx.groestl, (char*)hash,512);
final_groestl( &ctx.groestl, (char*)hash); final_groestl( &ctx.groestl, (char*)hash);

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.4.12]) AC_INIT([cpuminer-opt], [3.5.0])
AC_PREREQ([2.59c]) AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM AC_CANONICAL_SYSTEM

View File

@@ -2031,7 +2031,7 @@ bool jr2_stratum_handle_response( json_t *val )
static bool stratum_handle_response( char *buf ) static bool stratum_handle_response( char *buf )
{ {
json_t *val, *res_val, *id_val; json_t *val, *id_val;
json_error_t err; json_error_t err;
bool ret = false; bool ret = false;
@@ -2041,7 +2041,7 @@ static bool stratum_handle_response( char *buf )
applog(LOG_INFO, "JSON decode failed(%d): %s", err.line, err.text); applog(LOG_INFO, "JSON decode failed(%d): %s", err.line, err.text);
goto out; goto out;
} }
res_val = json_object_get( val, "result" ); json_object_get( val, "result" );
id_val = json_object_get( val, "id" ); id_val = json_object_get( val, "id" );
if ( !id_val || json_is_null(id_val) ) if ( !id_val || json_is_null(id_val) )
goto out; goto out;
@@ -2477,9 +2477,9 @@ void parse_arg(int key, char *arg )
show_usage_and_exit(1); show_usage_and_exit(1);
} }
free(rpc_url); free(rpc_url);
rpc_url = (char*) malloc(strlen(hp) + 8); rpc_url = (char*) malloc( strlen(hp) + 15 );
sprintf(rpc_url, "http://%s", hp); sprintf( rpc_url, "stratum+tcp://%s", hp );
short_url = &rpc_url[sizeof("http://")-1]; short_url = &rpc_url[ sizeof("stratum+tcp://") - 1 ];
} }
have_stratum = !opt_benchmark && !strncasecmp(rpc_url, "stratum", 7); have_stratum = !opt_benchmark && !strncasecmp(rpc_url, "stratum", 7);
break; break;

View File

@@ -331,6 +331,7 @@ bool has_sse();
void cpu_bestcpu_feature( char *outbuf, size_t maxsz ); void cpu_bestcpu_feature( char *outbuf, size_t maxsz );
void cpu_getname(char *outbuf, size_t maxsz); void cpu_getname(char *outbuf, size_t maxsz);
void cpu_getmodelid(char *outbuf, size_t maxsz); void cpu_getmodelid(char *outbuf, size_t maxsz);
void cpu_brand_string( char* s );
float cpu_temp( int core ); float cpu_temp( int core );