Compare commits

..

1 Commit

Author SHA1 Message Date
Jay D Dee
7b94436202 v3.5.9.1 2017-03-04 16:23:24 -05:00
284 changed files with 14215 additions and 30407 deletions

1
.gitignore vendored
View File

@@ -11,6 +11,7 @@ autom4te.cache
Makefile
Makefile.in
INSTALL
configure
configure.lineno
depcomp
missing

12
AUTHORS
View File

@@ -16,16 +16,4 @@ LucasJones
tpruvot@github
elmad
djm34
palmd
ig0tik3d
Wolf0
Optiminer
Jay D Dee

View File

@@ -5,31 +5,19 @@
# ex: docker run -it --rm cpuminer-opt:latest -a cryptonight -o cryptonight.eu.nicehash.com:3355 -u 1MiningDW2GKzf4VQfmp4q2XoUvR6iy6PD.worker1 -p x -t 3
#
# Build
FROM ubuntu:16.04 as builder
RUN apt-get update \
&& apt-get install -y \
build-essential \
FROM ubuntu:16.04
RUN BUILD_DEPS="build-essential \
libssl-dev \
libgmp-dev \
libcurl4-openssl-dev \
libjansson-dev \
automake \
&& rm -rf /var/lib/apt/lists/*
libgmp-dev \
libcurl4-openssl-dev \
libjansson-dev \
automake" && \
apt-get update && \
apt-get install -y ${BUILD_DEPS}
COPY . /app/
RUN cd /app/ && ./build.sh
RUN cd /app/ && ./build.sh
# App
FROM ubuntu:16.04
RUN apt-get update \
&& apt-get install -y \
libcurl3 \
libjansson4 \
&& rm -rf /var/lib/apt/lists/*
COPY --from=builder /app/cpuminer .
ENTRYPOINT ["./cpuminer"]
ENTRYPOINT ["/app/cpuminer"]
CMD ["-h"]

View File

@@ -22,6 +22,29 @@ cpuminer_SOURCES = \
api.c \
sysinfos.c \
algo-gate-api.c\
algo/groestl/sph_groestl.c \
algo/skein/sph_skein.c \
algo/bmw/sph_bmw.c \
algo/shavite/sph_shavite.c \
algo/shavite/shavite.c \
algo/echo/sph_echo.c \
algo/blake/sph_blake.c \
algo/blake/sph_blake2b.c \
algo/heavy/sph_hefty1.c \
algo/blake/mod_blakecoin.c \
algo/luffa/sph_luffa.c \
algo/cubehash/sph_cubehash.c \
algo/simd/sph_simd.c \
algo/hamsi/sph_hamsi.c \
algo/fugue/sph_fugue.c \
algo/gost/sph_gost.c \
algo/jh/sph_jh.c \
algo/keccak/sph_keccak.c \
algo/keccak/keccak.c\
algo/sha3/sph_sha2.c \
algo/sha3/sph_sha2big.c \
algo/shabal/sph_shabal.c \
algo/whirlpool/sph_whirlpool.c\
crypto/blake2s.c \
crypto/oaes_lib.c \
crypto/c_keccak.c \
@@ -38,80 +61,58 @@ cpuminer_SOURCES = \
algo/argon2/ar2/cores.c \
algo/argon2/ar2/ar2-scrypt-jane.c \
algo/argon2/ar2/blake2b.c \
algo/blake/sph_blake.c \
algo/blake/blake-hash-4way.c \
algo/blake/blake-gate.c \
algo/axiom.c \
algo/blake/blake.c \
algo/blake/blake-4way.c \
algo/blake/sph_blake2b.c \
algo/blake/blake2b.c \
algo/blake/blake2s.c \
algo/blake/mod_blakecoin.c \
algo/blake/blakecoin.c \
algo/blake/decred-gate.c \
algo/blake/decred.c \
algo/blake/decred-4way.c \
algo/blake/pentablake-gate.c \
algo/blake/pentablake-4way.c \
algo/blake/pentablake.c \
algo/bmw/sph_bmw.c \
algo/bmw/bmw-hash-4way.c \
algo/bmw/bmw256.c \
algo/cubehash/sse2/cubehash_sse2.c\
algo/cryptonight/cryptolight.c \
algo/cryptonight/cryptonight-common.c\
algo/cryptonight/cryptonight-aesni.c\
algo/cryptonight/cryptonight.c\
algo/cubehash/sph_cubehash.c \
algo/cubehash/sse2/cubehash_sse2.c\
algo/echo/sph_echo.c \
algo/drop.c \
algo/echo/aes_ni/hash.c\
algo/gost/sph_gost.c \
algo/groestl/sph_groestl.c \
algo/fresh.c \
algo/groestl/groestl.c \
algo/groestl/myr-groestl.c \
algo/groestl/aes_ni/hash-groestl.c \
algo/groestl/aes_ni/hash-groestl256.c \
algo/fugue/sph_fugue.c \
algo/hamsi/sph_hamsi.c \
algo/haval/haval.c\
algo/heavy/sph_hefty1.c \
algo/groestl/sse2/grso.c \
algo/groestl/sse2/grso-asm.c \
algo/haval/haval.c \
algo/heavy/heavy.c \
algo/heavy/bastion.c \
algo/hodl/aes.c \
algo/hmq1725.c \
algo/hodl/hodl.cpp \
algo/hodl/hodl-gate.c \
algo/hodl/hodl_arith_uint256.cpp \
algo/hodl/hodl_uint256.cpp \
algo/hodl/hash.cpp \
algo/hodl/hmac_sha512.cpp \
algo/hodl/sha256.cpp \
algo/hodl/sha512.cpp \
algo/hodl/utilstrencodings.cpp \
algo/hodl/hodl-wolf.c \
algo/hodl/aes.c \
algo/hodl/sha512_avx.c \
algo/hodl/sha512_avx2.c \
algo/jh/sph_jh.c \
algo/jh/jh-hash-4way.c \
algo/jh/jha-gate.c \
algo/jh/jha-4way.c \
algo/jh/jha.c \
algo/keccak/sph_keccak.c \
algo/keccak/keccak.c\
algo/keccak/keccak-hash-4way.c \
algo/keccak/keccak-4way.c\
algo/keccak/keccak-gate.c \
algo/keccak/sse2/keccak.c \
algo/lbry.c \
algo/luffa/sph_luffa.c \
algo/luffa/luffa.c \
algo/luffa/sse2/luffa_for_sse2.c \
algo/lyra2/lyra2.c \
algo/lyra2/sponge.c \
algo/lyra2/lyra2rev2.c \
algo/lyra2/lyra2re.c \
algo/lyra2/lyra2z-gate.c \
algo/lyra2/lyra2z.c \
algo/lyra2/lyra2z-4way.c \
algo/lyra2/lyra2z330.c \
algo/lyra2/lyra2h.c \
algo/lyra2/zcoin.c \
algo/lyra2/zoin.c \
algo/keccak/sse2/keccak.c \
algo/m7m.c \
algo/neoscrypt/neoscrypt.c \
algo/nist5/nist5-gate.c \
algo/nist5/nist5-4way.c \
algo/nist5/nist5.c \
algo/nist5/zr5.c \
algo/neoscrypt.c \
algo/nist5.c \
algo/pluck.c \
algo/quark/quark.c \
algo/qubit/qubit.c \
@@ -119,86 +120,33 @@ cpuminer_SOURCES = \
algo/ripemd/sph_ripemd.c \
algo/scrypt.c \
algo/scryptjane/scrypt-jane.c \
algo/sha/sph_sha2.c \
algo/sha/sph_sha2big.c \
algo/sha/sha2.c \
algo/sha/sha256t.c \
algo/shabal/sph_shabal.c \
algo/shabal/shabal-hash-4way.c \
algo/shavite/sph_shavite.c \
algo/shavite/sph-shavite-aesni.c \
algo/shavite/shavite.c \
algo/simd/sph_simd.c \
algo/sha2/sha2.c \
algo/sha2/sha256t.c \
algo/simd/sse2/nist.c \
algo/simd/sse2/vector.c \
algo/skein/sph_skein.c \
algo/skein/skein-hash-4way.c \
algo/skein/skein.c \
algo/skein/skein-4way.c \
algo/skein/skein-gate.c \
algo/skein/skein2.c \
algo/skein/skein2-4way.c \
algo/skein/skein2-gate.c \
algo/sm3/sm3.c \
algo/s3.c \
algo/tiger/sph_tiger.c \
algo/timetravel.c \
algo/timetravel10.c \
algo/whirlpool/sph_whirlpool.c \
algo/whirlpool/whirlpool-hash-4way.c \
algo/whirlpool/whirlpool-gate.c \
algo/whirlpool/whirlpool-4way.c \
algo/veltor.c \
algo/whirlpool/whirlpool.c \
algo/whirlpool/whirlpoolx.c \
algo/x11/x11-gate.c \
algo/x11/x11.c \
algo/x11/x11-4way.c \
algo/x11/x11gost-gate.c \
algo/x11/x11gost.c \
algo/x11/x11gost-4way.c \
algo/x11/c11-gate.c \
algo/x11/c11.c \
algo/x11/c11-4way.c \
algo/x11/tribus-gate.c \
algo/x11/tribus.c \
algo/x11/tribus-4way.c \
algo/x11/fresh.c \
algo/x11/x11evo.c \
algo/x13/x13-gate.c \
algo/x11/x11gost.c \
algo/x11/c11.c \
algo/x13/x13.c \
algo/x13/x13-4way.c \
algo/x13/x13sm3-gate.c \
algo/x13/x13sm3.c \
algo/x13/x13sm3-4way.c \
algo/x13/phi1612-gate.c \
algo/x13/phi1612.c \
algo/x13/phi1612-4way.c \
algo/x13/skunk-gate.c \
algo/x13/skunk-4way.c \
algo/x13/skunk.c \
algo/x13/drop.c \
algo/x14/x14-gate.c \
algo/x14/x14.c \
algo/x14/x14-4way.c \
algo/x14/veltor-gate.c \
algo/x14/veltor.c \
algo/x14/veltor-4way.c \
algo/x14/polytimos-gate.c \
algo/x14/polytimos.c \
algo/x14/polytimos-4way.c \
algo/x14/axiom.c \
algo/x15/x15-gate.c \
algo/x15/x15.c \
algo/x15/x15-4way.c \
algo/x17/x17-gate.c \
algo/x17/x17.c \
algo/x17/x17-4way.c \
algo/x17/xevan-gate.c \
algo/x17/xevan.c \
algo/x17/xevan-4way.c \
algo/x17/hmq1725.c \
algo/xevan.c \
algo/yescrypt/yescrypt.c \
algo/yescrypt/sha256_Y.c \
algo/yescrypt/yescrypt-simd.c
algo/yescrypt/yescrypt-common.c \
algo/yescrypt/sha256_Y.c\
algo/yescrypt/yescrypt-simd.c\
algo/zr5.c
disable_flags =

View File

@@ -23,66 +23,52 @@ Supported Algorithms
blakecoin blake256r8
blake2s Blake-2 S
bmw BMW 256
c11 Chaincoin
c11 Flax
cryptolight Cryptonight-light
cryptonight cryptonote, Monero (XMR)
decred
deep Deepcoin (DCN)
dmd-gr Diamond-Groestl
drop Dropcoin
fresh Fresh
groestl Groestl coin
groestl groestl
heavy Heavy
hmq1725 Espers
hodl Hodlcoin
jha Jackpotcoin
keccak Maxcoin
keccakc Creative coin
keccak Keccak
lbry LBC, LBRY Credits
luffa Luffa
lyra2h Hppcoin
lyra2re lyra2
lyra2rev2 lyra2v2, Vertcoin
lyra2rev2 lyrav2
lyra2z Zcoin (XZC)
lyra2z330 Lyra2 330 rows, Zoin (ZOI)
lyra2zoin Zoin (ZOI)
m7m Magi (XMG)
myr-gr Myriad-Groestl
neoscrypt NeoScrypt(128, 2, 1)
nist5 Nist5
pentablake Pentablake
phi1612 phi, LUX coin
pluck Pluck:128 (Supcoin)
polytimos Ninja
pentablake Pentablake
quark Quark
qubit Qubit
scrypt scrypt(1024, 1, 1) (default)
scrypt:N scrypt(N, 1, 1)
scryptjane:nf
sha256d Double SHA-256
sha256t Triple SHA-256, Onecoin (OC)
sha256d SHA-256d
shavite3 Shavite3
skein Skein+Sha (Skeincoin)
skein2 Double Skein (Woodcoin)
skunk Signatum (SIGT)
timetravel Machinecoin (MAC)
timetravel10 Bitcore
tribus Denarius (DNR)
vanilla blake256r8vnl (VCash)
veltor (VLT)
veltor
whirlpool
whirlpoolx
x11 Dash
x11 X11
x11evo Revolvercoin
x11gost sib (SibCoin)
x13 X13
x13sm3 hsr (Hshare)
x14 X14
x15 X15
x17
xevan Bitsend
yescrypt Globalboost-Y (BSTY)
yescryptr8 BitZeny (ZNY)
yescryptr16 Yenten (YTN)
yescrypt
zr5 Ziftr
Requirements
@@ -97,25 +83,17 @@ algoritms for CPUs with AVX and AVX2, Sandybridge and Haswell respectively.
Older CPUs are supported by cpuminer-multi by TPruvot but at reduced
performance.
ARM CPUs are not supported.
2. 64 bit Linux OS. Ubuntu and Fedora based distributions, including Mint and
Centos are known to work and have all dependencies in their repositories.
Others may work but may require more effort.
64 bit Windows OS is supported with mingw_w64 and msys or pre-built binaries.
MacOS, OSX is not supported.
3. Stratum pool. Some algos may work wallet mining using getwork.
3. Stratum pool, cpuminer-opt only supports stratum mining. Some algos
may work wallet mining but there are no guarantees.
Errata
------
AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
supported by cpuminer-opt due to an incompatible implementation of SSE2 on
these CPUs. Some algos may crash the miner with an invalid instruction.
Users are recommended to use an unoptimized miner such as cpuminer-multi.
cpuminer-opt does not work mining Decred algo at Nicehash and produces
only "invalid extranonce2 size" rejects.
@@ -129,10 +107,6 @@ forum at:
https://bitcointalk.org/index.php?topic=1326803.0
All problem reports must be accompanied by a proper definition.
This should include how the problem occurred, the command line and
output from the miner showing the startup and any errors.
Donations
---------

View File

@@ -1,9 +1,6 @@
This file is included in the Windows binary package. Compile instructions
for Linux and Windows can be found in RELEASE_NOTES.
cpuminer is a console program that is executed from a DOS command prompt.
There is no GUI and no mouse support.
Choose the exe that best matches your CPU's features or use trial and
error to find the fastest one that doesn't crash. Pay attention to
the features listed at cpuminer startup to ensure you are mining at
@@ -11,27 +8,15 @@ optimum speed using all the available features.
Architecture names and compile options used are only provided for Intel
Core series. Pentium and Celeron often have fewer features.
AMD is YMMV, see previous paragraph.
AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
supported by cpuminer-opt due to an incompatible implementation of SSE2 on
these CPUs. Some algos may crash the miner with an invalid instruction.
Users are recommended to use an unoptimized miner such as cpuminer-multi.
Exe name Compile opts Arch name
Exe name Compile flags Arch name
cpuminer-sse2.exe -march=core2, Core2
cpuminer-sse42.exe -march=corei7, Nehalem
cpuminer-aes-sse42.exe -maes -msse4.2 Westmere
cpuminer-aes-avx.exe -march=corei7-avx, Sandybridge, Ivybridge
cpuminer-aes-avx2.exe -march=core-avx2, Haswell, Broadwell, Skylake, Kabylake
cpuminer-sse2.exe "-march=core2" Core2
cpuminer-sse42.exe "-march=corei7" Nehalem
cpuminer-aes-sse42.exe "-maes -msse4.2" Westmere
cpuminer-avx.exe "-march=corei7-avx" Sandybridge, Ivybridge
cpuminer-avx2.exe "-march=core-avx2" Haswell...
cpuminer-avx-sha "-march=corei7-avx -msha" Ryzen...
cpuminer-4way.exe "-march=core-avx2 -DFOUR_WAY" same as avx2
cpuminer-4way-sha.exe "-march=core-avx2 -msha -DFOUR_WAY" same as avx2-sha
4way requires a CPU with AES and AVX2. It is still under development and
only a few algos are supported. See change log in RELEASE_NOTES in source
package for supported algos.
Ryzen CPUs perform better with AVX than AVX2, therefore an avx-sha build
is provided. Four way still uses AVX2.

View File

@@ -1,338 +1,8 @@
cpuminer-opt now supports HW SHA acceleration available on AMD Ryzen CPUs.
This feature requires recent SW including GCC version 5 or higher and
openssl version 1.1 or higher. It may also require using "-march=znver1"
compile flag.
HW SHA support is only available when compiled from source, Windows binaries
are not yet available.
cpuminer-opt is a console program; if you're using a mouse you're doing it
wrong.
Security warning
----------------
Miner programs are often flagged as malware by antivirus programs. This is
a false positive, they are flagged simply because they are miners. The source
code is open for anyone to inspect. If you don't trust the software, don't use
it.
The cryptographic code has been taken from trusted sources but has been
modified for speed at the expense of accepted security practices. This
code should not be imported into applications where secure cryptography is
required.
Compile Instructions
--------------------
Requirements:
Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
supported.
64 bit Linux or Windows operating system. Apple is not supported.
Building on linux prerequisites:
It is assumed users know how to install packages on their system and
are able to compile standard source packages. This is basic Linux and
beyond the scope of cpuminer-opt.
Make sure you have the basic development packages installed.
Here is a good start:
http://askubuntu.com/questions/457526/how-to-install-cpuminer-in-ubuntu
Install any additional dependencies needed by cpuminer-opt. The list below
includes some of the ones that may not be in the default install and need to
be installed manually. There may be others; read the error messages, they
will give a clue as to the missing package.
The following command should install everything you need on Debian based
distributions such as Ubuntu:
sudo apt-get install build-essential libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev automake
build-essential (for Ubuntu, Development Tools package group on Fedora)
automake
libjansson-dev
libgmp-dev
libcurl4-openssl-dev
libssl-dev
pthreads
zlib
SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and openssl 1.1
or higher. Reports of improved performance on Ryzen when using openssl 1.0.2
have been due to AVX and AVX2 optimizations added to that version.
Additional improvements are expected on Ryzen with openssl 1.1. Compiling
may also require the "-march=znver1" or "-msha" flag.
Additional instructions for static compilation can be found here:
https://lxadm.com/Static_compilation_of_cpuminer
Static builds should only be considered in a homogeneous HW and SW environment.
Local builds will always have the best performance and compatibility.
Extract cpuminer source.
tar xvzf cpuminer-opt-x.y.z.tar.gz
cd cpuminer-opt-x.y.z
Run ./build.sh to build on Linux or execute the following commands.
./autogen.sh
CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
make
Additional optional compile flags, add the following to CFLAGS to activate:
-DUSE_SPH_SHA
SPH may give slightly better performance on algos that use sha256 when using
openssl 1.0.1 or older. Openssl 1.0.2 adds AVX2 and 1.1 adds SHA; both perform
better than SPH.
-DFOUR_WAY
4 way will give much better performance on supported algos with CPUs
that have AVX2 and should only be used on CPUs with AVX2. 4 way algo
support will be added incrementally, see change log below for supported algos.
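Both flags are plain compile-time switches. The sketch below is illustrative only: the sph header path and the sha256_once wrapper are assumptions, while the FOUR_WAY gating mirrors the blake-gate.h pattern that appears later in this diff.
/* -DUSE_SPH_SHA: prefer the bundled SPH sha256 over openssl's. */
#ifdef USE_SPH_SHA
   #include "algo/sha/sph_sha2.h"              /* header path assumed */
   static void sha256_once( void *out, const void *in, size_t len )
   {
      sph_sha256_context ctx;
      sph_sha256_init( &ctx );
      sph_sha256( &ctx, in, len );
      sph_sha256_close( &ctx, out );
   }
#else
   #include <openssl/sha.h>
   static void sha256_once( void *out, const void *in, size_t len )
   {
      SHA256( in, len, out );   /* openssl 1.0.2 adds AVX2, 1.1 adds SHA extensions */
   }
#endif
/* -DFOUR_WAY: per-algo opt-in to the 4 way code paths, as in blake-gate.h. */
#if defined(FOUR_WAY) && defined(__AVX__)
   #define BLAKE_4WAY
#endif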
Start mining.
./cpuminer -a algo -o url -u username -p password
Windows
The following is how the Windows binary releases are built. It's old and
not very good but it works, for me anyway.
Building on Windows prerequisites:
msys
mingw_w64
Visual C++ redistributable 2008 X64
openssl
Install msys and mingw_w64, only needed once.
Unpack msys into C:\msys or your preferred directory.
Install mingw_w64 from win-builds.
Follow instructions, check "msys or cygwin" and "x86_64" and accept default
existing msys installation.
Open a msys shell by double clicking on msys.bat.
Note that msys shell uses linux syntax for file specifications, "C:\" is
mounted at "/c/".
Add mingw bin directory to PATH variable
PATH="/c/msys/opt/windows_64/bin/:$PATH"
Installation complete, compile cpuminer-opt.
Unpack cpuminer-opt source files using tar from msys shell, or using 7zip
or similar Windows program.
In msys shell cd to miner directory.
cd /c/path/to/cpuminer-opt
Run winbuild.sh to build on Windows or execute the following commands.
./autogen.sh
CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
make
Start mining
cpuminer.exe -a algo -o url -u user -p password
The following tips may be useful for older AMD CPUs.
AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
supported by cpuminer-opt due to an incompatible implementation of SSE2 on
these CPUs. Some algos may crash the miner with an invalid instruction.
Users are recommended to use an unoptimized miner such as cpuminer-multi.
Some users with AMD CPUs without AES_NI have reported problems compiling
with build.sh or "-march=native". Problems have included compile errors
and poor performance. These users are recommended to compile manually
specifying "-march=btver1" on the configure command line.
Support for even older x86_64 without AES_NI or SSE2 is not available.
Compile instruction for Linux and Windows are at the bottom of this file.
Change Log
----------
v3.7.9
Partial 4way optimizations for veltor, skunk, polytimos, lyra2z.
Additional 4way optimizations for X algos.
New algo yescryptr8 for BitZeny, not to be confused with original
yescrypt Globalboost-Y.
v3.7.8
Partial 4way optimization for most X algos including c11, xevan, phi, hsr
v3.7.7
Fixed regression caused by 64 CPU support.
Fixed lyra2h.
v3.7.6
Added lyra2h algo for Hppcoin.
Added support for more than 64 CPUs.
Optimized shavite512 with AES, improves x11 etc.
v3.7.5
New algo keccakc for Creative coin with 4way optimizations
Rewrote some AVX/AVX2 code for more consistent implementation and some
optimizing.
Enhanced capabilities check to support 4way, more precise reporting of
features (not all algos use SSE2), and better error messages when using
an incompatible pre-built version (Windows users).
v3.7.4
Removed unnecessary build options.
Added 4way support for tribus and nist5.
v3.7.3
Added polytimos algo.
Introducing 4-way AVX2 optimization giving up to 4x performance improvement
on many compute bound algos. First supported algos: skein, skein2, blake &
keccak. This feature is only available when compiled from source. See above
for instructions on how to enable 4-way during compilation.
Updated Dockerfile.
v3.7.2
Fixed yescryptr16
Changed default sha256 and sha512 to openssl. This should be used when
compiling with openssl 1.0.2 or higher (Ubuntu 16.04).
This should increase the hashrate for yescrypt, yescryptr16, m7m, xevan, skein,
myr-gr & others when openssl 1.0.2 is installed.
Users with openssl 1.0.1 (Ubuntu 14.04) may get better performance by adding
"-DUSE_SPH_SHA" to CLAGS.
Windows binaries are compiled with -DUSE_SPH_SHA and won't get the speedup.
v3.7.1
Added yescryptr16 algo for Yenten coin
Added SHA support to yescrypt and yescryptr16
Small code cleanup
v3.7.0
Fixed x14 misalignment bug.
Fixed decred stake version bug.
Getwork fixes for algos that use big endian data encoding: m7m, zr5, neoscrypt,
decred.
v3.6.10
Fixed misalignment bug in hsr.
v3.6.9
Added phi1612 algo for LUX coin
Added x13sm3 algo, alias hsr, for Hshare coin
v3.6.8
Fixed timetravel10 on Windows.
v3.6.7
Skunk algo added.
Tribus a little faster.
Minor restructuring.
v3.6.6
added tribus algo for Denarius (DNR)
configure removed from .gitignore. This should allow git clone to compile
on Windows/mingw.
Fixed CPU temperature monitoring on some CPUs (Linux only).
Fixed a compile error on FreeBSD (unsupported YMMV).
v3.6.5
Cryptonight a little faster.
Added jha algo (Jackpotcoin) with AES optimizations.
v3.6.4
Added support for Bitcore (BTX) using the timetravel10 algo, optimized for
AES and AVX2.
"-a bitcore" works as an alias and is less typing that "-a timetravel10".
v3.6.3
Fixed all known issues with SHA support on AMD Ryzen CPUs, still no
Windows binaries.
v3.6.2
SHA acceleration is now supported on AMD Ryzen CPUs when compiled from source,
Windows binaries not yet available.
Fixed groestl algo.
Fixed dmd-gr (Diamond) algo.
Fixed lbry compile error on Ryzen.
Added SHA support to m7m algo.
Hodl support for CPUs without AES has been removed, use legacy version.
v3.6.1
Fixed data alignment issue that broke lyra2rev2 AVX2 on Windows.
Added preliminary support for HW accelerated SHA.
Solo mining most algos should now work, cryptonight confirmed exception.
v3.6.0
Preliminary support for solo mining using getwork.
v3.5.13
Found more speed in Cubehash, algo improvement depends on chain length,
deep +8%, timetravel +1% , xevan +1%
Fixed a getwork bug, solo mining is not yet supported but testing is encouraged
v3.5.12
New algo sha256t for Onecoin (OC), 29% faster than ocminer version.
lyra2zoin algo renamed to lyra2z330, lyra2zoin and zoin still work
as aliases.
v3.5.11
Fixed hmq1725 crash on Ubuntu 16.04
Fixed compile error in hodl.cpp with gcc 6.3
Fixed x11 crash on Windows with AVX2
v3.5.10
Some AVX2 optimizations introduced for Luffa, shorter chained algos such
as Qubit and Deep should see the biggest gains, but many other algos should
also see improvement, longer chains like xevan not so much.
Rewrite of Groestl AES, now 100% vectorized, small improvement.
build.sh and winbuild.sh initialize with distclean instead of clean.
Implemented a workaround for a compile error in hodl code when compiling
with gcc 6.3.
V3.5.9
Reduced stack usage for hmq1725 and small speedup.
@@ -498,3 +168,98 @@ AVX2 optimizations improving many algos:
- x17 +2.8%
- qubit +8.4%
Compile Instructions
--------------------
Building on linux prerequisites:
It is assumed users know how to install packages on their system and
are able to compile standard source packages. This is basic Linux and
beyond the scope of cpuminer-opt.
Make sure you have the basic development packages installed.
Here is a good start:
http://askubuntu.com/questions/457526/how-to-install-cpuminer-in-ubuntu
Install any additional dependencies needed by cpuminer-opt. The list below
includes some of the ones that may not be in the default install and need to
be installed manually. There may be others; read the error messages, they
will give a clue as to the missing package.
The following command should install everything you need on Debian based
distributions such as Ubuntu:
sudo apt-get install build-essential libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev automake
Building on Linux, see below for Windows.
Dependencies
build-essential (for Ubuntu, Development Tools package group on Fedora)
automake
libjansson-dev
libgmp-dev
libcurl4-openssl-dev
libssl-dev
pthreads
zlib
tar xvzf [file.tar.gz]
cd [file]
Run build.sh to build on Linux or execute the following commands.
./autogen.sh
CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
make
Start mining.
./cpuminer -a algo ...
Building on Windows prerequisites:
msys
mingw_w64
Visual C++ redistributable 2008 X64
openssl, not sure about this
Install msys and mingw_w64, only needed once.
Unpack msys into C:\msys or your preferred directory.
Install mingw_w64 from win-builds.
Follow instructions, check "msys or cygwin" and "x86_64" and accept default
existing msys installation.
Open a msys shell by double clicking on msys.bat.
Note that msys shell uses linux syntax for file specifications, "C:\" is
mounted at "/c/".
Add mingw bin directory to PATH variable
PATH="/c/msys/opt/windows_64/bin/:$PATH"
Installation complete, compile cpuminer-opt.
Unpack cpuminer-opt source files using tar from msys shell, or using 7zip
or similar Windows program.
In msys shell cd to miner directory.
cd /c/path/to/cpuminer-opt
Run winbuild.sh to build on Windows or execute the following commands.
./autogen.sh
CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
make
The following tips may be useful for older AMD CPUs.
Some users with AMD CPUs without AES_NI have reported problems compiling
with build.sh or "-march=native". Problems have included compile errors
and poor performance. These users are recommended to compile manually
specifying "-march=btver1" on the configure command line.
Support for even older x86_64 without AES_NI or SSE2 is not available.

View File

@@ -77,12 +77,6 @@ void algo_not_tested()
applog(LOG_WARNING,"and bad things may happen. Use at your own risk.");
}
void four_way_not_tested()
{
applog( LOG_WARNING,"Algo %s has not been tested using 4way. It may not", algo_names[opt_algo] );
applog( LOG_WARNING,"work or may be slower. Please report your results.");
}
void algo_not_implemented()
{
applog(LOG_ERR,"Algo %s has not been Implemented.",algo_names[opt_algo]);
@@ -104,12 +98,17 @@ void null_hash_suw()
{
applog(LOG_WARNING,"SWERR: null_hash_suw unsafe null function");
};
void null_hash_alt()
{
applog(LOG_WARNING,"SWERR: null_hash_alt unsafe null function");
};
void init_algo_gate( algo_gate_t* gate )
{
gate->miner_thread_init = (void*)&return_true;
gate->scanhash = (void*)&null_scanhash;
gate->hash = (void*)&null_hash;
gate->hash_alt = (void*)&null_hash_alt;
gate->hash_suw = (void*)&null_hash_suw;
gate->get_new_work = (void*)&std_get_new_work;
gate->get_nonceptr = (void*)&std_get_nonceptr;
@@ -120,17 +119,17 @@ void init_algo_gate( algo_gate_t* gate )
gate->stratum_gen_work = (void*)&std_stratum_gen_work;
gate->build_stratum_request = (void*)&std_le_build_stratum_request;
gate->set_target = (void*)&std_set_target;
gate->work_decode = (void*)&std_le_work_decode;
gate->submit_getwork_result = (void*)&std_le_submit_getwork_result;
gate->submit_getwork_result = (void*)&std_submit_getwork_result;
gate->build_extraheader = (void*)&std_build_extraheader;
gate->set_work_data_endian = (void*)&do_nothing;
gate->calc_network_diff = (void*)&std_calc_network_diff;
// gate->prevent_dupes = (void*)&return_false;
gate->ready_to_mine = (void*)&std_ready_to_mine;
gate->resync_threads = (void*)&do_nothing;
gate->do_this_thread = (void*)&return_true;
gate->longpoll_rpc_call = (void*)&std_longpoll_rpc_call;
gate->stratum_handle_response = (void*)&std_stratum_handle_response;
gate->optimizations = EMPTY_SET;
gate->optimizations = SSE2_OPT;
gate->ntime_index = STD_NTIME_INDEX;
gate->nbits_index = STD_NBITS_INDEX;
gate->nonce_index = STD_NONCE_INDEX;
@@ -138,10 +137,6 @@ void init_algo_gate( algo_gate_t* gate )
gate->work_cmp_size = STD_WORK_CMP_SIZE;
}
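Each algo then overrides these defaults in its own register function; the real ones (register_blake_algo, register_axiom_algo, register_blake2b_algo) appear in later hunks of this diff. A condensed, hypothetical example for a new algo could look like this ("newalgo" and its hash/scanhash functions are placeholders, not part of the source):
/* Hypothetical per-algo registration overriding init_algo_gate() defaults. */
bool register_newalgo_algo( algo_gate_t* gate )
{
   gate->optimizations = SSE2_OPT | AES_OPT;        // capabilities the algo can use
   gate->scanhash      = (void*)&scanhash_newalgo;  // required, replaces null_scanhash
   gate->hash          = (void*)&newalgohash;       // required, replaces null_hash
   gate->get_max64     = (void*)&get_max64_0x40LL;  // optional default override
   return true;
}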
// Ignore warnings for not yet defined register functions
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wimplicit-function-declaration"
// called by each thread that uses the gate
bool register_algo_gate( int algo, algo_gate_t *gate )
{
@@ -155,73 +150,71 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
switch (algo)
{
case ALGO_ARGON2: register_argon2_algo ( gate ); break;
case ALGO_AXIOM: register_axiom_algo ( gate ); break;
case ALGO_BASTION: register_bastion_algo ( gate ); break;
case ALGO_BLAKE: register_blake_algo ( gate ); break;
case ALGO_BLAKECOIN: register_blakecoin_algo ( gate ); break;
// case ALGO_BLAKE2B: register_blake2b_algo ( gate ); break;
case ALGO_BLAKE2S: register_blake2s_algo ( gate ); break;
case ALGO_C11: register_c11_algo ( gate ); break;
case ALGO_CRYPTOLIGHT: register_cryptolight_algo ( gate ); break;
case ALGO_CRYPTONIGHT: register_cryptonight_algo ( gate ); break;
case ALGO_DECRED: register_decred_algo ( gate ); break;
case ALGO_DEEP: register_deep_algo ( gate ); break;
case ALGO_DMD_GR: register_dmd_gr_algo ( gate ); break;
case ALGO_DROP: register_drop_algo ( gate ); break;
case ALGO_FRESH: register_fresh_algo ( gate ); break;
case ALGO_GROESTL: register_groestl_algo ( gate ); break;
case ALGO_HEAVY: register_heavy_algo ( gate ); break;
case ALGO_HMQ1725: register_hmq1725_algo ( gate ); break;
case ALGO_HODL: register_hodl_algo ( gate ); break;
case ALGO_JHA: register_jha_algo ( gate ); break;
case ALGO_KECCAK: register_keccak_algo ( gate ); break;
case ALGO_KECCAKC: register_keccakc_algo ( gate ); break;
case ALGO_LBRY: register_lbry_algo ( gate ); break;
case ALGO_LUFFA: register_luffa_algo ( gate ); break;
case ALGO_LYRA2H: register_lyra2h_algo ( gate ); break;
case ALGO_LYRA2RE: register_lyra2re_algo ( gate ); break;
case ALGO_LYRA2REV2: register_lyra2rev2_algo ( gate ); break;
case ALGO_LYRA2Z: register_lyra2z_algo ( gate ); break;
case ALGO_LYRA2Z330: register_lyra2z330_algo ( gate ); break;
case ALGO_M7M: register_m7m_algo ( gate ); break;
case ALGO_MYR_GR: register_myriad_algo ( gate ); break;
case ALGO_NEOSCRYPT: register_neoscrypt_algo ( gate ); break;
case ALGO_NIST5: register_nist5_algo ( gate ); break;
case ALGO_PENTABLAKE: register_pentablake_algo ( gate ); break;
case ALGO_PHI1612: register_phi1612_algo ( gate ); break;
case ALGO_PLUCK: register_pluck_algo ( gate ); break;
case ALGO_POLYTIMOS: register_polytimos_algo ( gate ); break;
case ALGO_QUARK: register_quark_algo ( gate ); break;
case ALGO_QUBIT: register_qubit_algo ( gate ); break;
case ALGO_SCRYPT: register_scrypt_algo ( gate ); break;
case ALGO_SCRYPTJANE: register_scryptjane_algo ( gate ); break;
case ALGO_SHA256D: register_sha256d_algo ( gate ); break;
case ALGO_SHA256T: register_sha256t_algo ( gate ); break;
case ALGO_SHAVITE3: register_shavite_algo ( gate ); break;
case ALGO_SKEIN: register_skein_algo ( gate ); break;
case ALGO_SKEIN2: register_skein2_algo ( gate ); break;
case ALGO_SKUNK: register_skunk_algo ( gate ); break;
case ALGO_TIMETRAVEL: register_timetravel_algo ( gate ); break;
case ALGO_TIMETRAVEL10: register_timetravel10_algo( gate ); break;
case ALGO_TRIBUS: register_tribus_algo ( gate ); break;
case ALGO_VANILLA: register_vanilla_algo ( gate ); break;
case ALGO_VELTOR: register_veltor_algo ( gate ); break;
case ALGO_WHIRLPOOL: register_whirlpool_algo ( gate ); break;
case ALGO_WHIRLPOOLX: register_whirlpoolx_algo ( gate ); break;
case ALGO_X11: register_x11_algo ( gate ); break;
case ALGO_X11EVO: register_x11evo_algo ( gate ); break;
case ALGO_X11GOST: register_x11gost_algo ( gate ); break;
case ALGO_X13: register_x13_algo ( gate ); break;
case ALGO_X13SM3: register_x13sm3_algo ( gate ); break;
case ALGO_X14: register_x14_algo ( gate ); break;
case ALGO_X15: register_x15_algo ( gate ); break;
case ALGO_X17: register_x17_algo ( gate ); break;
case ALGO_XEVAN: register_xevan_algo ( gate ); break;
case ALGO_YESCRYPT: register_yescrypt_algo ( gate ); break;
case ALGO_YESCRYPTR8: register_yescryptr8_algo ( gate ); break;
case ALGO_YESCRYPTR16: register_yescryptr16_algo ( gate ); break;
case ALGO_ZR5: register_zr5_algo ( gate ); break;
// Ignore warnings for not yet defined register functions
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wimplicit-function-declaration"
case ALGO_ARGON2: register_argon2_algo ( gate ); break;
case ALGO_AXIOM: register_axiom_algo ( gate ); break;
case ALGO_BASTION: register_bastion_algo ( gate ); break;
case ALGO_BLAKE: register_blake_algo ( gate ); break;
case ALGO_BLAKECOIN: register_blakecoin_algo ( gate ); break;
// case ALGO_BLAKE2B: register_blake2b_algo ( gate ); break;
case ALGO_BLAKE2S: register_blake2s_algo ( gate ); break;
case ALGO_C11: register_c11_algo ( gate ); break;
case ALGO_CRYPTOLIGHT: register_cryptolight_algo( gate ); break;
case ALGO_CRYPTONIGHT: register_cryptonight_algo( gate ); break;
case ALGO_DECRED: register_decred_algo ( gate ); break;
case ALGO_DEEP: register_deep_algo ( gate ); break;
case ALGO_DROP: register_drop_algo ( gate ); break;
case ALGO_FRESH: register_fresh_algo ( gate ); break;
case ALGO_GROESTL: register_groestl_algo ( gate ); break;
case ALGO_HEAVY: register_heavy_algo ( gate ); break;
case ALGO_HMQ1725: register_hmq1725_algo ( gate ); break;
case ALGO_HODL: register_hodl_algo ( gate ); break;
case ALGO_KECCAK: register_keccak_algo ( gate ); break;
case ALGO_LBRY: register_lbry_algo ( gate ); break;
case ALGO_LUFFA: register_luffa_algo ( gate ); break;
case ALGO_LYRA2RE: register_lyra2re_algo ( gate ); break;
case ALGO_LYRA2REV2: register_lyra2rev2_algo ( gate ); break;
case ALGO_LYRA2Z: register_zcoin_algo ( gate ); break;
case ALGO_LYRA2Z330: register_lyra2z330_algo ( gate ); break;
case ALGO_M7M: register_m7m_algo ( gate ); break;
case ALGO_MYR_GR: register_myriad_algo ( gate ); break;
case ALGO_NEOSCRYPT: register_neoscrypt_algo ( gate ); break;
case ALGO_NIST5: register_nist5_algo ( gate ); break;
case ALGO_PENTABLAKE: register_pentablake_algo ( gate ); break;
case ALGO_PLUCK: register_pluck_algo ( gate ); break;
case ALGO_QUARK: register_quark_algo ( gate ); break;
case ALGO_QUBIT: register_qubit_algo ( gate ); break;
case ALGO_SCRYPT: register_scrypt_algo ( gate ); break;
case ALGO_SCRYPTJANE: register_scryptjane_algo ( gate ); break;
case ALGO_SHA256D: register_sha256d_algo ( gate ); break;
case ALGO_SHA256T: register_sha256t_algo ( gate ); break;
case ALGO_SHAVITE3: register_shavite_algo ( gate ); break;
case ALGO_SKEIN: register_skein_algo ( gate ); break;
case ALGO_SKEIN2: register_skein2_algo ( gate ); break;
case ALGO_S3: register_s3_algo ( gate ); break;
case ALGO_TIMETRAVEL: register_timetravel_algo ( gate ); break;
case ALGO_VANILLA: register_vanilla_algo ( gate ); break;
case ALGO_VELTOR: register_veltor_algo ( gate ); break;
case ALGO_WHIRLPOOL: register_whirlpool_algo ( gate ); break;
case ALGO_WHIRLPOOLX: register_whirlpoolx_algo ( gate ); break;
case ALGO_X11: register_x11_algo ( gate ); break;
case ALGO_X11EVO: register_x11evo_algo ( gate ); break;
case ALGO_X11GOST: register_sib_algo ( gate ); break;
case ALGO_X13: register_x13_algo ( gate ); break;
case ALGO_X14: register_x14_algo ( gate ); break;
case ALGO_X15: register_x15_algo ( gate ); break;
case ALGO_X17: register_x17_algo ( gate ); break;
case ALGO_XEVAN: register_xevan_algo ( gate ); break;
case ALGO_YESCRYPT: register_yescrypt_algo ( gate ); break;
case ALGO_ZR5: register_zr5_algo ( gate ); break;
// restore warnings
#pragma GCC diagnostic pop
default:
applog(LOG_ERR,"FAIL: algo_gate registration failed, unknown algo %s.\n", algo_names[opt_algo] );
return false;
@@ -236,9 +229,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
return true;
}
// restore warnings
#pragma GCC diagnostic pop
// override std defaults with jr2 defaults
bool register_json_rpc2( algo_gate_t *gate )
{
@@ -254,7 +244,6 @@ bool register_json_rpc2( algo_gate_t *gate )
gate->nonce_index = JR2_NONCE_INDEX;
jsonrpc_2 = true; // still needed
opt_extranonce = false;
// have_gbt = false;
return true;
}
@@ -267,47 +256,41 @@ void exec_hash_function( int algo, void *output, const void *pdata )
gate.hash( output, pdata, 0 );
}
// an algo can have multiple aliases but the aliases must be unique
#define PROPER (1)
#define ALIAS (0)
// The only difference between the alias and the proper algo name is the
// proper name is the one that is defined in ALGO_NAMES. There may be
// proper name is the one that is defined in ALGO_NAMES, there may be
// multiple aliases that map to the same proper name.
// New aliases can be added anywhere in the array as long as NULL is last.
// Alphabetic order of alias is recommended.
const char* const algo_alias_map[][2] =
{
// alias proper
{ "bitcore", "timetravel10" },
{ "bitzeny", "yescryptr8" },
{ "blake256r8", "blakecoin" },
{ "blake256r8vnl", "vanilla" },
{ "blake256r14", "blake" },
{ "blake256r14dcr", "decred" },
{ "cryptonote", "cryptonight" },
{ "cryptonight-light", "cryptolight" },
{ "diamond", "dmd-gr" },
{ "droplp", "drop" },
{ "espers", "hmq1725" },
{ "flax", "c11" },
{ "hsr", "x13sm3" },
{ "jackpot", "jha" },
{ "jane", "scryptjane" },
{ "lyra2", "lyra2re" },
{ "lyra2v2", "lyra2rev2" },
{ "lyra2zoin", "lyra2z330" },
{ "myriad", "myr-gr" },
{ "neo", "neoscrypt" },
{ "phi", "phi1612" },
// { "sia", "blake2b" },
{ "sib", "x11gost" },
{ "timetravel8", "timetravel" },
{ "ziftr", "zr5" },
{ "yenten", "yescryptr16" },
{ "yescryptr8k", "yescrypt" },
{ "zcoin", "lyra2z" },
{ "zoin", "lyra2z330" },
{ NULL, NULL }
{ "blake256r8", "blakecoin" },
{ "blake256r8vnl", "vanilla" },
{ "sia", "blake2b" },
{ "blake256r14", "blake" },
{ "cryptonote", "cryptonight" },
{ "cryptonight-light", "cryptolight" },
{ "dmd-gr", "groestl" },
{ "droplp", "drop" },
{ "espers", "hmq1725" },
{ "flax", "c11" },
{ "jane", "scryptjane" },
{ "lyra2", "lyra2re" },
{ "lyra2v2", "lyra2rev2" },
{ "lyra2zoin", "lyra2z330" },
{ "myriad", "myr-gr" },
{ "neo", "neoscrypt" },
{ "sib", "x11gost" },
{ "yes", "yescrypt" },
{ "ziftr", "zr5" },
{ "zcoin", "lyra2z" },
{ "zoin", "lyra2z330" },
{ NULL, NULL }
};
// if arg is a valid alias for a known algo it is updated with the proper name.
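A minimal sketch of that lookup (the helper name and the in-place pointer swap are assumptions for illustration; only algo_alias_map, ALIAS and PROPER come from the code above):
/* If *algo_or_alias matches an alias, point it at the proper name instead. */
void get_algo_alias( char **algo_or_alias )
{
   for ( int i = 0; algo_alias_map[i][ALIAS] != NULL; i++ )
      if ( !strcasecmp( *algo_or_alias, algo_alias_map[i][ALIAS] ) )
      {
         // found a valid alias, substitute the proper name
         *algo_or_alias = (char*)algo_alias_map[i][PROPER];
         return;
      }
}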

View File

@@ -85,13 +85,11 @@
typedef uint32_t set_t;
#define EMPTY_SET 0
#define SSE2_OPT 1
#define AES_OPT 2
#define AVX_OPT 4
#define AVX2_OPT 8
#define SHA_OPT 0x10
#define FOUR_WAY_OPT 0x20
#define EMPTY_SET 0
#define SSE2_OPT 1
#define AES_OPT 2
#define AVX_OPT 4
#define AVX2_OPT 8
// return set containing all elements from sets a & b
inline set_t set_union ( set_t a, set_t b ) { return a | b; }
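A small hypothetical usage of these flags and set_union (cpu_caps and use_avx2_path() are placeholders, not part of the header):
set_t algo_caps = set_union( SSE2_OPT | AES_OPT, AVX2_OPT );   // what the algo supports
set_t usable    = algo_caps & cpu_caps;                        // mask against what the CPU offers
if ( usable & AVX2_OPT )
   use_avx2_path();                                            // placeholder for the vectorized code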
@@ -112,6 +110,7 @@ int ( *scanhash ) ( int, struct work*, uint32_t, uint64_t* );
// optional unsafe, must be overwritten if algo uses function
void ( *hash ) ( void*, const void*, uint32_t ) ;
void ( *hash_alt ) ( void*, const void*, uint32_t );
void ( *hash_suw ) ( void*, const void* );
//optional, safe to use default in most cases
@@ -131,6 +130,7 @@ void ( *build_extraheader ) ( struct work*, struct stratum_ctx* );
void ( *build_stratum_request ) ( char*, struct work*, struct stratum_ctx* );
void ( *set_work_data_endian ) ( struct work* );
double ( *calc_network_diff ) ( struct work* );
//bool ( *prevent_dupes ) ( struct work*, struct stratum_ctx*, int );
bool ( *ready_to_mine ) ( struct work*, struct stratum_ctx*, int );
void ( *resync_threads ) ( struct work* );
bool ( *do_this_thread ) ( int );
@@ -157,7 +157,7 @@ bool return_false();
void *return_null();
void algo_not_tested();
void algo_not_implemented();
void four_way_not_tested();
// Warning: algo_gate.nonce_index should only be used in targeted code
// due to different behaviours by different targets. The JR2 index uses an
@@ -185,6 +185,7 @@ int null_scanhash();
// displays warning
void null_hash ();
void null_hash_alt();
void null_hash_suw();
// optional safe targets, default listed first unless noted.
@@ -216,20 +217,18 @@ int64_t get_max64_0xffffLL();
void std_set_target ( struct work *work, double job_diff );
void scrypt_set_target( struct work *work, double job_diff );
bool std_le_work_decode( const json_t *val, struct work *work );
bool std_be_work_decode( const json_t *val, struct work *work );
bool std_work_decode( const json_t *val, struct work *work );
bool jr2_work_decode( const json_t *val, struct work *work );
bool std_le_submit_getwork_result( CURL *curl, struct work *work );
bool std_be_submit_getwork_result( CURL *curl, struct work *work );
bool std_submit_getwork_result( CURL *curl, struct work *work );
bool jr2_submit_getwork_result( CURL *curl, struct work *work );
void std_le_build_stratum_request( char *req, struct work *work );
void std_be_build_stratum_request( char *req, struct work *work );
void jr2_build_stratum_request ( char *req, struct work *work );
// Default is do_nothing (assumed LE)
void set_work_data_big_endian( struct work *work );
// set_work_data_endian target, default is do_nothing;
void swab_work_data( struct work *work );
double std_calc_network_diff( struct work *work );
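For coins that use big endian work data, a gate simply swaps in the _be_ variants declared above; the register_blake2b_algo hunk later in this diff does exactly that, condensed here:
/* Condensed from the register_blake2b_algo hunk shown later in this diff. */
bool register_blake2b_algo( algo_gate_t* gate )
{
   gate->build_stratum_request = (void*)&blake2b_be_build_stratum_request;
   gate->work_decode           = (void*)&std_be_work_decode;
   gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
   return true;
}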

View File

@@ -1,3 +1,5 @@
#include "miner.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>

View File

@@ -1,3 +1,4 @@
#include "miner.h"
#include "algo-gate-api.h"
#include <string.h>
@@ -5,11 +6,11 @@
#include "algo/shabal/sph_shabal.h"
static __thread uint32_t _ALIGN(64) M[65536][8];
static __thread uint32_t _ALIGN(128) M[65536][8];
void axiomhash(void *output, const void *input)
{
sph_shabal256_context ctx __attribute__ ((aligned (64)));
sph_shabal256_context ctx;
const int N = 65536;
sph_shabal256_init(&ctx);
@@ -33,7 +34,7 @@ void axiomhash(void *output, const void *input)
sph_shabal256(&ctx, M[p], 32);
sph_shabal256(&ctx, M[j], 32);
#else
uint8_t _ALIGN(64) hash[64];
uint8_t _ALIGN(128) hash[64];
memcpy(hash, M[p], 32);
memcpy(&hash[32], M[j], 32);
sph_shabal256(&ctx, hash, 64);
@@ -48,8 +49,8 @@ int scanhash_axiom(int thr_id, struct work *work,
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t _ALIGN(64) hash64[8];
uint32_t _ALIGN(64) endiandata[20];
uint32_t _ALIGN(128) hash64[8];
uint32_t _ALIGN(128) endiandata[20];
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
@@ -81,6 +82,7 @@ bool register_axiom_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_axiom;
gate->hash = (void*)&axiomhash;
gate->hash_alt = (void*)&axiomhash;
gate->get_max64 = (void*)&get_max64_0x40LL;
return true;
}

203
algo/blake/b/sia-rpc.cpp Normal file
View File

@@ -0,0 +1,203 @@
#include <ccminer-config.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <inttypes.h>
#include <unistd.h>
#include <math.h>
#include <sys/time.h>
#include <time.h>
#include <signal.h>
#include <curl/curl.h>
#include <miner.h>
#include "sia-rpc.h"
static bool sia_debug_diff = false;
extern int share_result(int result, int pooln, double sharediff, const char *reason);
/* compute nbits to get the network diff */
static void calc_network_diff(struct work *work)
{
uint32_t nbits = work->data[11]; // unsure if correct
uint32_t bits = (nbits & 0xffffff);
int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28
uint64_t diffone = 0x0000FFFF00000000ull;
double d = (double)0x0000ffff / (double)bits;
for (int m=shift; m < 29; m++) d *= 256.0;
for (int m=29; m < shift; m++) d /= 256.0;
if (sia_debug_diff)
applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits);
net_diff = d;
}
// ---- SIA LONGPOLL --------------------------------------------------------------------------------
struct data_buffer {
void *buf;
size_t len;
};
static size_t sia_data_cb(const void *ptr, size_t size, size_t nmemb,
void *user_data)
{
struct data_buffer *db = (struct data_buffer *)user_data;
size_t len = size * nmemb;
size_t oldlen, newlen;
void *newmem;
static const uchar zero = 0;
oldlen = db->len;
newlen = oldlen + len;
newmem = realloc(db->buf, newlen + 1);
if (!newmem)
return 0;
db->buf = newmem;
db->len = newlen;
memcpy((char*)db->buf + oldlen, ptr, len);
memcpy((char*)db->buf + newlen, &zero, 1); /* null terminate */
return len;
}
char* sia_getheader(CURL *curl, struct pool_infos *pool)
{
char curl_err_str[CURL_ERROR_SIZE] = { 0 };
struct data_buffer all_data = { 0 };
struct curl_slist *headers = NULL;
char data[256] = { 0 };
char url[512];
// nanopool
snprintf(url, 512, "%s/miner/header?address=%s&worker=%s", //&longpoll
pool->url, pool->user, pool->pass);
if (opt_protocol)
curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
curl_easy_setopt(curl, CURLOPT_URL, url);
curl_easy_setopt(curl, CURLOPT_POST, 0);
curl_easy_setopt(curl, CURLOPT_ENCODING, "");
curl_easy_setopt(curl, CURLOPT_FAILONERROR, 0);
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1);
curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1);
curl_easy_setopt(curl, CURLOPT_TIMEOUT, opt_timeout);
curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1);
curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_err_str);
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, sia_data_cb);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &all_data);
headers = curl_slist_append(headers, "Accept: application/octet-stream");
headers = curl_slist_append(headers, "Expect:"); // disable Expect hdr
headers = curl_slist_append(headers, "User-Agent: Sia-Agent"); // required for now
// headers = curl_slist_append(headers, "User-Agent: " USER_AGENT);
// headers = curl_slist_append(headers, "X-Mining-Extensions: longpoll");
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
int rc = curl_easy_perform(curl);
if (rc && strlen(curl_err_str)) {
applog(LOG_WARNING, "%s", curl_err_str);
}
if (all_data.len >= 112)
cbin2hex(data, (const char*) all_data.buf, 112);
if (opt_protocol || all_data.len != 112)
applog(LOG_DEBUG, "received %d bytes: %s", (int) all_data.len, data);
curl_slist_free_all(headers);
return rc == 0 && all_data.len ? strdup(data) : NULL;
}
bool sia_work_decode(const char *hexdata, struct work *work)
{
uint8_t target[32];
if (!work) return false;
hex2bin((uchar*)target, &hexdata[0], 32);
swab256(work->target, target);
work->targetdiff = target_to_diff(work->target);
hex2bin((uchar*)work->data, &hexdata[64], 80);
// high 16 bits of the 64 bits nonce
work->data[9] = rand() << 16;
// use work ntime as job id
cbin2hex(work->job_id, (const char*)&work->data[10], 4);
calc_network_diff(work);
if (stratum_diff != work->targetdiff) {
stratum_diff = work->targetdiff;
applog(LOG_WARNING, "Pool diff set to %g", stratum_diff);
}
return true;
}
bool sia_submit(CURL *curl, struct pool_infos *pool, struct work *work)
{
char curl_err_str[CURL_ERROR_SIZE] = { 0 };
struct data_buffer all_data = { 0 };
struct curl_slist *headers = NULL;
char buf[256] = { 0 };
char url[512];
if (opt_protocol)
applog_hex(work->data, 80);
//applog_hex(&work->data[8], 16);
//applog_hex(&work->data[10], 4);
// nanopool
snprintf(url, 512, "%s/miner/header?address=%s&worker=%s",
pool->url, pool->user, pool->pass);
if (opt_protocol)
curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
curl_easy_setopt(curl, CURLOPT_URL, url);
curl_easy_setopt(curl, CURLOPT_ENCODING, "");
curl_easy_setopt(curl, CURLOPT_FAILONERROR, 0);
curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1);
curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1);
curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_err_str);
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1);
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 10);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &all_data);
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, sia_data_cb);
memcpy(buf, work->data, 80);
curl_easy_setopt(curl, CURLOPT_POST, 1);
curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, 80);
curl_easy_setopt(curl, CURLOPT_POSTFIELDS, (void*) buf);
// headers = curl_slist_append(headers, "Content-Type: application/octet-stream");
// headers = curl_slist_append(headers, "Content-Length: 80");
headers = curl_slist_append(headers, "Accept:"); // disable Accept hdr
headers = curl_slist_append(headers, "Expect:"); // disable Expect hdr
headers = curl_slist_append(headers, "User-Agent: Sia-Agent");
// headers = curl_slist_append(headers, "User-Agent: " USER_AGENT);
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
int res = curl_easy_perform(curl) == 0;
long errcode;
CURLcode c = curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &errcode);
if (errcode != 204) {
if (strlen(curl_err_str))
applog(LOG_ERR, "submit err %ld %s", errcode, curl_err_str);
res = 0;
}
share_result(res, work->pooln, work->sharediff[0], res ? NULL : (char*) all_data.buf);
curl_slist_free_all(headers);
return true;
}
// ---- END SIA LONGPOLL ----------------------------------------------------------------------------

6
algo/blake/b/sia-rpc.h Normal file
View File

@@ -0,0 +1,6 @@
#include <miner.h>
char* sia_getheader(CURL *curl, struct pool_infos *pool);
bool sia_work_decode(const char *hexdata, struct work *work);
bool sia_submit(CURL *curl, struct pool_infos *pool, struct work *work);

View File

@@ -1,112 +0,0 @@
#include "blake-gate.h"
#include "sph_blake.h"
#include "blake-hash-4way.h"
#include <string.h>
#include <stdint.h>
#include <memory.h>
#if defined (BLAKE_4WAY)
void blakehash_4way(void *state, const void *input)
{
uint32_t vhash[4*4] __attribute__ ((aligned (64)));
uint32_t hash0[4] __attribute__ ((aligned (32)));
uint32_t hash1[4] __attribute__ ((aligned (32)));
uint32_t hash2[4] __attribute__ ((aligned (32)));
uint32_t hash3[4] __attribute__ ((aligned (32)));
blake256_4way_context ctx;
blake256_4way_init( &ctx );
blake256_4way( &ctx, input, 16 );
blake256_4way_close( &ctx, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
// uint32_t HTarget = ptarget[7];
uint32_t _ALIGN(32) edata[20];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
// if (opt_benchmark)
// HTarget = 0x7f;
// we need big endian data...
swab32_array( edata, pdata, 20 );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
uint32_t *noncep = vdata + 76; // 19*4
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep, n );
be32enc( noncep +1, n+1 );
be32enc( noncep +2, n+2 );
be32enc( noncep +3, n+3 );
blakehash_4way( hash, vdata );
if ( hash[7] == 0 )
{
if ( fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
pdata[19] = n;
}
}
if ( (hash+8)[7] == 0 )
{
if ( fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
}
}
if ( (hash+16)[7] == 0 )
{
if ( fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
}
}
if ( (hash+24)[7] == 0 )
{
if ( fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
}
}
n += 4;
*hashes_done = n - first_nonce + 1;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

View File

@@ -1,26 +0,0 @@
#include "blake-gate.h"
int64_t blake_get_max64 ()
{
return 0x7ffffLL;
}
bool register_blake_algo( algo_gate_t* gate )
{
gate->get_max64 = (void*)&blake_get_max64;
//#if defined (__AVX2__) && defined (FOUR_WAY)
// gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
// gate->scanhash = (void*)&scanhash_blake_8way;
// gate->hash = (void*)&blakehash_8way;
#if defined(BLAKE_4WAY)
four_way_not_tested();
gate->optimizations = FOUR_WAY_OPT;
gate->scanhash = (void*)&scanhash_blake_4way;
gate->hash = (void*)&blakehash_4way;
#else
gate->scanhash = (void*)&scanhash_blake;
gate->hash = (void*)&blakehash;
#endif
return true;
}

View File

@@ -1,21 +0,0 @@
#ifndef __BLAKE_GATE_H__
#define __BLAKE_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(FOUR_WAY) && defined(__AVX__)
#define BLAKE_4WAY
#endif
#if defined (BLAKE_4WAY)
void blakehash_4way(void *state, const void *input);
int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif
void blakehash( void *state, const void *input );
int scanhash_blake( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif

File diff suppressed because it is too large.

View File

@@ -1,105 +0,0 @@
/* $Id: sph_blake.h 252 2011-06-07 17:55:14Z tp $ */
/**
* BLAKE interface. BLAKE is a family of functions which differ by their
* output size; this implementation defines BLAKE for output sizes 224,
* 256, 384 and 512 bits. This implementation conforms to the "third
* round" specification.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_blake.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef __BLAKE_HASH_4WAY__
#define __BLAKE_HASH_4WAY__
#ifdef __cplusplus
extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
/**
* Output size (in bits) for BLAKE-256.
*/
#define SPH_SIZE_blake256 256
#if SPH_64
/**
* Output size (in bits) for BLAKE-512.
*/
#define SPH_SIZE_blake512 512
#endif
#ifdef __AVX__
typedef struct {
__m128i buf[16] __attribute__ ((aligned (64)));
__m128i H[8];
__m128i S[4];
size_t ptr;
sph_u32 T0, T1;
} blake_4way_small_context;
typedef blake_4way_small_context blake256_4way_context;
void blake256_4way_init(void *cc);
void blake256_4way(void *cc, const void *data, size_t len);
void blake256_4way_close(void *cc, void *dst);
void blake256_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#endif
#ifdef __AVX2__
typedef struct {
__m256i buf[16] __attribute__ ((aligned (64)));
__m256i H[8];
__m256i S[4];
size_t ptr;
sph_u64 T0, T1;
} blake_4way_big_context;
typedef blake_4way_big_context blake512_4way_context;
void blake512_4way_init(void *cc);
void blake512_4way(void *cc, const void *data, size_t len);
void blake512_4way_close(void *cc, void *dst);
void blake512_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#endif
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -1,3 +1,4 @@
#include "miner.h"
#include "algo-gate-api.h"
#include "sph_blake.h"
@@ -20,7 +21,7 @@ void blakehash(void *state, const void *input)
{
sph_blake256_context ctx;
uint8_t hash[64] __attribute__ ((aligned (32)));
uint8_t hash[64];
uint8_t *ending = (uint8_t*) input;
ending += 64;
@@ -89,3 +90,19 @@ int scanhash_blake( int thr_id, struct work *work, uint32_t max_nonce,
return 0;
}
// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
int64_t blake_get_max64 ()
{
return 0x7ffffLL;
}
bool register_blake_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_blake;
gate->hash = (void*)&blakehash;
gate->hash_alt = (void*)&blakehash;
gate->get_max64 = (void*)&blake_get_max64;
return true;
}

View File

@@ -3,20 +3,22 @@
* tpruvot@github 2015-2016
*/
#include "miner.h"
#include "algo-gate-api.h"
#include <string.h>
#include <stdint.h>
#include "algo/blake/sph_blake2b.h"
//static __thread sph_blake2b_ctx s_midstate;
//static __thread sph_blake2b_ctx s_ctx;
static __thread sph_blake2b_ctx s_midstate;
static __thread sph_blake2b_ctx s_ctx;
#define MIDLEN 76
#define A 64
void blake2b_hash(void *output, const void *input)
{
uint8_t _ALIGN(A) hash[32];
sph_blake2b_ctx ctx __attribute__ ((aligned (64)));
sph_blake2b_ctx ctx;
sph_blake2b_init(&ctx, 32, NULL, 0);
sph_blake2b_update(&ctx, input, 80);
@@ -25,7 +27,6 @@ void blake2b_hash(void *output, const void *input)
memcpy(output, hash, 32);
}
/*
static void blake2b_hash_end(uint32_t *output, const uint32_t *input)
{
s_ctx.outlen = MIDLEN;
@@ -33,7 +34,6 @@ static void blake2b_hash_end(uint32_t *output, const uint32_t *input)
sph_blake2b_update(&s_ctx, (uint8_t*) &input[MIDLEN/4], 80 - MIDLEN);
sph_blake2b_final(&s_ctx, (uint8_t*) output);
}
*/
int scanhash_blake2b( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
@@ -44,6 +44,7 @@ int scanhash_blake2b( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
// const uint32_t first_nonce = pdata[19];
const uint32_t first_nonce = pdata[8];
uint32_t n = first_nonce;
@@ -58,6 +59,7 @@ int scanhash_blake2b( int thr_id, struct work *work, uint32_t max_nonce,
//memcpy(&s_ctx, &s_midstate, sizeof(blake2b_ctx));
do {
// be32enc(&endiandata[19], n);
be32enc(&endiandata[8], n);
//blake2b_hash_end(vhashcpu, endiandata);
blake2b_hash(vhashcpu, endiandata);
@@ -65,6 +67,7 @@ int scanhash_blake2b( int thr_id, struct work *work, uint32_t max_nonce,
if (vhashcpu[7] < Htarg && fulltest(vhashcpu, ptarget)) {
work_set_target_ratio(work, vhashcpu);
*hashes_done = n - first_nonce + 1;
// pdata[19] = n;
pdata[8] = n;
return 1;
}
@@ -72,6 +75,7 @@ int scanhash_blake2b( int thr_id, struct work *work, uint32_t max_nonce,
} while (n < max_nonce && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
// pdata[19] = n;
pdata[8] = n;
return 0;
@@ -169,8 +173,8 @@ void blake2b_get_new_work( struct work* work, struct work* g_work, int thr_id,
uint32_t *nonceptr = algo_gate.get_nonceptr( work->data );
if ( memcmp( &work->data[ wkcmp_off ], &g_work->data[ wkcmp_off ], wkcmp_sz )
&& ( clean_job || ( *nonceptr >= *end_nonce_ptr )
|| strcmp( work->job_id, g_work->job_id ) ) )
&& ( clean_job || ( *nonceptr >= *end_nonce_ptr ) )
|| strcmp( work->job_id, g_work->job_id ) )
{
work_free( work );
work_copy( work, g_work );
@@ -219,8 +223,6 @@ bool register_blake2b_algo( algo_gate_t* gate )
gate->hash = (void*)&blake2b_hash;
gate->calc_network_diff = (void*)&blake2b_calc_network_diff;
gate->build_stratum_request = (void*)&blake2b_be_build_stratum_request;
gate->work_decode = (void*)&std_be_work_decode;
gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
gate->build_extraheader = (void*)&blake2b_build_extraheader;
gate->get_new_work = (void*)&blake2b_get_new_work;
gate->get_max64 = (void*)&blake2b_get_max64;

View File

@@ -1,3 +1,4 @@
#include "miner.h"
#include "algo-gate-api.h"
#include <string.h>
@@ -12,7 +13,7 @@ static __thread blake2s_state s_ctx;
void blake2s_hash(void *output, const void *input)
{
unsigned char _ALIGN(64) hash[BLAKE2S_OUTBYTES];
blake2s_state blake2_ctx __attribute__ ((aligned (64)));
blake2s_state blake2_ctx;
blake2s_init(&blake2_ctx, BLAKE2S_OUTBYTES);
blake2s_update(&blake2_ctx, input, 80);

View File

@@ -1,3 +1,4 @@
#include "miner.h"
#include "algo-gate-api.h"
#define BLAKE32_ROUNDS 8
#include "sph_blake.h"
@@ -29,7 +30,7 @@ static void blake_midstate_init( const void* input )
void blakecoinhash( void *state, const void *input )
{
sph_blake256_context ctx;
uint8_t hash[64] __attribute__ ((aligned (32)));
uint8_t hash[64];
uint8_t *ending = (uint8_t*) input + 64;
// copy cached midstate
@@ -92,12 +93,10 @@ int scanhash_blakecoin( int thr_id, struct work *work, uint32_t max_nonce,
return 0;
}
/*
void blakecoin_gen_merkle_root ( char* merkle_root, struct stratum_ctx* sctx )
{
SHA256( sctx->job.coinbase, (int)sctx->job.coinbase_size, merkle_root );
}
*/
// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
int64_t blakecoin_get_max64 ()
@@ -110,6 +109,7 @@ bool register_vanilla_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_blakecoin;
gate->hash = (void*)&blakecoinhash;
gate->hash_alt = (void*)&blakecoinhash;
gate->get_max64 = (void*)&blakecoin_get_max64;
blakecoin_init( &blake_init_ctx );
return true;

View File

@@ -1,157 +0,0 @@
#include "decred-gate.h"
#include "sph_blake.h"
#include "blake-hash-4way.h"
#include <string.h>
#include <stdint.h>
#include <memory.h>
#include <unistd.h>
#if defined (DECRED_4WAY)
static __thread blake256_4way_context blake_mid;
static __thread bool ctx_midstate_done = false;
void decred_hash_4way( void *state, const void *input )
{
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (32)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t hash2[8] __attribute__ ((aligned (32)));
uint32_t hash3[8] __attribute__ ((aligned (32)));
blake256_4way_context ctx __attribute__ ((aligned (64)));
sph_blake256_context ctx2 __attribute__ ((aligned (64)));
uint32_t hash[16] __attribute__ ((aligned (64)));
uint32_t sin0[45], sin1[45], sin2[45], sin3[45];
mm_deinterleave_4x32x( sin0, sin1, sin2, sin3, input, 180*8 );
void *tail = input + ( DECRED_MIDSTATE_LEN << 2 );
int tail_len = 180 - DECRED_MIDSTATE_LEN;
memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
blake256_4way( &ctx, tail, tail_len );
blake256_4way_close( &ctx, vhash );
/*
sph_blake256_init( &ctx2 );
sph_blake256( &ctx2, sin0, 180 );
sph_blake256_close( &ctx2, hash );
*/
/*
blake256_4way_init( &ctx );
blake256_4way( &ctx, input, 180 );
blake256_4way_close( &ctx, vhash );
*/
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
/*
for ( int i = 0; i < 8; i++ )
if ( hash[i] != hash0[i] )
printf(" hash mismatch, i = %u\n",i);
printf("hash: %08lx %08lx %08lx %08lx\n", *hash, *(hash+1),
*(hash+2), *(hash+3) );
printf("hash0: %08lx %08lx %08lx %08lx\n", *hash0, *(hash0+1),
*(hash0+2), *(hash0+3) );
printf("\n");
*/
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
// memcpy( state, hash, 32 );
}
int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
{
uint32_t vdata[48*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t _ALIGN(64) edata[48];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
uint32_t n = first_nonce;
const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
ctx_midstate_done = false;
memcpy( edata, pdata, 180 );
// use the old interleave until the new one is updated for this size.
mm_interleave_4x32( vdata, edata, edata, edata, edata, 180*8 );
blake256_4way_init( &blake_mid );
blake256_4way( &blake_mid, vdata, DECRED_MIDSTATE_LEN );
uint32_t *noncep = vdata + DECRED_NONCE_INDEX * 4;
do {
found[0] = found[1] = found[2] = found[3] = false;
* noncep = n;
*(noncep+1) = n+1;
*(noncep+2) = n+2;
*(noncep+3) = n+3;
decred_hash_4way( hash, vdata );
if ( hash[7] <= HTarget && fulltest( hash, ptarget ) )
{
work_set_target_ratio( work, hash );
found[0] = true;
num_found++;
nonces[0] = n;
pdata[DECRED_NONCE_INDEX] = n;
}
/*
if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
{
printf("found 1\n");
printf("vhash: %08lx %08lx %08lx %08lx\n", hash[8], hash[9], hash[10],hash[11] );
printf("vhash: %08lx %08lx %08lx %08lx\n", hash[12], hash[13], hash[14],hash[15] );
printf("shash: %08lx %08lx %08lx %08lx\n", shash[0], shash[1], shash[2],shash[3] );
printf("shash: %08lx %08lx %08lx %08lx\n\n", shash[4], shash[5], shash[6],shash[7] );
work_set_target_ratio( work, hash+8 );
found[1] = true;
num_found++;
nonces[1] = n+1;
}
*/
if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
{
work_set_target_ratio( work, hash+16 );
found[2] = true;
num_found++;
nonces[2] = n+2;
}
/*
if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
{
printf("found 3\n");
printf("vhash: %08lx %08lx %08lx %08lx\n", hash[0], hash[1], hash[2],hash[3] );
printf("vhash: %08lx %08lx %08lx %08lx\n", hash[4], hash[5], hash[6],hash[7] );
printf("shash: %08lx %08lx %08lx %08lx\n", shash[0], shash[1], shash[2],shash[3] );
printf("shash: %08lx %08lx %08lx %08lx\n\n", shash[4], shash[5], shash[6],shash[7] );
work_set_target_ratio( work, hash+24 );
found[3] = true;
num_found++;
nonces[3] = n+3;
}
*/
n += 2;
// n += 4;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

View File

@@ -1,174 +0,0 @@
#include "decred-gate.h"
#include <unistd.h>
#include <memory.h>
#include <string.h>
uint32_t *decred_get_nonceptr( uint32_t *work_data )
{
return &work_data[ DECRED_NONCE_INDEX ];
}
double decred_calc_network_diff( struct work* work )
{
// sample for diff 43.281 : 1c05ea29
// todo: endian reversed on longpoll could be zr5 specific...
uint32_t nbits = work->data[ DECRED_NBITS_INDEX ];
uint32_t bits = ( nbits & 0xffffff );
int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28
int m;
double d = (double)0x0000ffff / (double)bits;
for ( m = shift; m < 29; m++ )
d *= 256.0;
for ( m = 29; m < shift; m++ )
d /= 256.0;
if ( shift == 28 )
d *= 256.0; // testnet
if ( opt_debug_diff )
applog( LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d,
shift, bits );
return net_diff;
}
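As a worked check of the compact-bits decoding above (the testnet branch is left out), the 1c05ea29 sample quoted in the comment decodes as follows; this is a standalone illustrative snippet, not part of the source.
#include <stdio.h>
#include <stdint.h>

int main(void)
{
    // sample nbits from the comment above: 0x1c05ea29 -> diff ~43.281
    uint32_t nbits = 0x1c05ea29;
    uint32_t bits  = nbits & 0xffffff;        // 0x05ea29 = 387625
    int      shift = ( nbits >> 24 ) & 0xff;  // 0x1c = 28, same as swab32(nbits) & 0xff
    double   d     = (double)0x0000ffff / (double)bits;
    for ( int m = shift; m < 29; m++ )        // scale by 256^(29 - shift)
        d *= 256.0;
    printf( "net diff: %.3f\n", d );          // prints 43.281, matching the sample
    return 0;
}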
void decred_decode_extradata( struct work* work, uint64_t* net_blocks )
{
// some random extradata to make the work unique
work->data[ DECRED_XNONCE_INDEX ] = (rand()*4);
work->height = work->data[32];
if (!have_longpoll && work->height > *net_blocks + 1)
{
char netinfo[64] = { 0 };
if (opt_showdiff && net_diff > 0.)
{
if (net_diff != work->targetdiff)
sprintf(netinfo, ", diff %.3f, target %.1f", net_diff,
work->targetdiff);
else
sprintf(netinfo, ", diff %.3f", net_diff);
}
applog(LOG_BLUE, "%s block %d%s", algo_names[opt_algo], work->height,
netinfo);
*net_blocks = work->height - 1;
}
}
void decred_be_build_stratum_request( char *req, struct work *work,
struct stratum_ctx *sctx )
{
unsigned char *xnonce2str;
uint32_t ntime, nonce;
char ntimestr[9], noncestr[9];
be32enc( &ntime, work->data[ DECRED_NTIME_INDEX ] );
be32enc( &nonce, work->data[ DECRED_NONCE_INDEX ] );
bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
xnonce2str = abin2hex( (char*)( &work->data[ DECRED_XNONCE_INDEX ] ),
sctx->xnonce1_size );
snprintf( req, JSON_BUF_LEN,
"{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
free(xnonce2str);
}
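For illustration only (every value below is made up), the submit line produced by this request builder looks like:
{"method": "mining.submit", "params": ["rpcuser.worker1", "227", "cf23010000000000", "58bad0e3", "001a33c5"], "id":4}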
#define min(a,b) (a>b ? (b) :(a))
void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
{
uchar merkle_root[64] = { 0 };
uint32_t extraheader[32] = { 0 };
int headersize = 0;
uint32_t* extradata = (uint32_t*) sctx->xnonce1;
size_t t;
int i;
// getwork over stratum, getwork merkle + header passed in coinb1
memcpy(merkle_root, sctx->job.coinbase, 32);
headersize = min((int)sctx->job.coinbase_size - 32,
sizeof(extraheader) );
memcpy( extraheader, &sctx->job.coinbase[32], headersize );
// Increment extranonce2
for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
// Assemble block header
memset( g_work->data, 0, sizeof(g_work->data) );
g_work->data[0] = le32dec( sctx->job.version );
for ( i = 0; i < 8; i++ )
g_work->data[1 + i] = swab32(
le32dec( (uint32_t *) sctx->job.prevhash + i ) );
for ( i = 0; i < 8; i++ )
g_work->data[9 + i] = swab32( be32dec( (uint32_t *) merkle_root + i ) );
// for ( i = 0; i < 8; i++ ) // prevhash
// g_work->data[1 + i] = swab32( g_work->data[1 + i] );
// for ( i = 0; i < 8; i++ ) // merkle
// g_work->data[9 + i] = swab32( g_work->data[9 + i] );
for ( i = 0; i < headersize/4; i++ ) // header
g_work->data[17 + i] = extraheader[i];
// extradata
for ( i = 0; i < sctx->xnonce1_size/4; i++ )
g_work->data[ DECRED_XNONCE_INDEX + i ] = extradata[i];
for ( i = DECRED_XNONCE_INDEX + sctx->xnonce1_size/4; i < 45; i++ )
g_work->data[i] = 0;
g_work->data[37] = (rand()*4) << 8;
// block header suffix from coinb2 (stake version)
memcpy( &g_work->data[44],
&sctx->job.coinbase[ sctx->job.coinbase_size-4 ], 4 );
sctx->bloc_height = g_work->data[32];
//applog_hex(work->data, 180);
//applog_hex(&work->data[36], 36);
}
#undef min
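For reference, a summary sketch (not part of the source) of the 180-byte Decred header as assembled above, indexed as 32-bit words of work->data:
/* work->data word layout produced by decred_build_extraheader (sketch):
 *   [0]      block version (sctx->job.version)
 *   [1..8]   previous block hash, byte swapped
 *   [9..16]  merkle root taken from coinb1, byte swapped
 *   [17..]   remainder of the header passed in coinb1; this region is where
 *            nbits (word 29), height (word 32) and ntime (word 34) land
 *   [35]     nonce (DECRED_NONCE_INDEX)
 *   [36..]   extranonce copied from xnonce1 (DECRED_XNONCE_INDEX); word 37
 *            is then seeded with a random value
 *   [44]     stake version suffix copied from the last 4 bytes of coinb2
 */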
bool decred_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
int thr_id )
{
if ( have_stratum && strcmp(stratum->job.job_id, work->job_id) )
// need to regen g_work..
return false;
if ( have_stratum && !work->data[0] && !opt_benchmark )
{
sleep(1);
return false;
}
// extradata: prevent duplicates
work->data[ DECRED_XNONCE_INDEX ] += 1;
work->data[ DECRED_XNONCE_INDEX + 1 ] |= thr_id;
return true;
}
bool register_decred_algo( algo_gate_t* gate )
{
#if defined(DECRED_4WAY)
four_way_not_tested();
gate->optimizations = FOUR_WAY_OPT;
gate->scanhash = (void*)&scanhash_decred_4way;
gate->hash = (void*)&decred_hash_4way;
#else
gate->optimizations = SSE2_OPT;
gate->scanhash = (void*)&scanhash_decred;
gate->hash = (void*)&decred_hash;
#endif
gate->get_nonceptr = (void*)&decred_get_nonceptr;
gate->get_max64 = (void*)&get_max64_0x3fffffLL;
gate->display_extra_data = (void*)&decred_decode_extradata;
gate->build_stratum_request = (void*)&decred_be_build_stratum_request;
gate->work_decode = (void*)&std_be_work_decode;
gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
gate->build_extraheader = (void*)&decred_build_extraheader;
gate->ready_to_mine = (void*)&decred_ready_to_mine;
gate->nbits_index = DECRED_NBITS_INDEX;
gate->ntime_index = DECRED_NTIME_INDEX;
gate->nonce_index = DECRED_NONCE_INDEX;
gate->work_data_size = DECRED_DATA_SIZE;
gate->work_cmp_size = DECRED_WORK_COMPARE_SIZE;
allow_mininginfo = false;
have_gbt = false;
return true;
}

View File

@@ -1,36 +0,0 @@
#ifndef __DECRED_GATE_H__
#define __DECRED_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
#define DECRED_NBITS_INDEX 29
#define DECRED_NTIME_INDEX 34
#define DECRED_NONCE_INDEX 35
#define DECRED_XNONCE_INDEX 36
#define DECRED_DATA_SIZE 192
#define DECRED_WORK_COMPARE_SIZE 140
#define DECRED_MIDSTATE_LEN 128
#if defined (__AVX2__)
//void blakehash_84way(void *state, const void *input);
//int scanhash_blake_8way( int thr_id, struct work *work, uint32_t max_nonce,
// uint64_t *hashes_done );
#endif
#if defined(FOUR_WAY) && defined(__AVX__)
#define DECRED_4WAY
#endif
#if defined (DECRED_4WAY)
void decred_hash_4way(void *state, const void *input);
int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif
void decred_hash( void *state, const void *input );
int scanhash_decred( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif

View File

@@ -1,11 +1,10 @@
#include "decred-gate.h"
#include "miner.h"
#include "algo-gate-api.h"
#include "sph_blake.h"
#include <string.h>
#include <stdint.h>
#include <memory.h>
#include <unistd.h>
/*
#ifndef min
#define min(a,b) (a>b ? b : a)
@@ -14,33 +13,33 @@
#define max(a,b) (a<b ? b : a)
#endif
*/
/*
#define DECRED_NBITS_INDEX 29
#define DECRED_NTIME_INDEX 34
#define DECRED_NONCE_INDEX 35
#define DECRED_XNONCE_INDEX 36
#define DECRED_DATA_SIZE 192
#define DECRED_WORK_COMPARE_SIZE 140
*/
static __thread sph_blake256_context blake_mid;
static __thread bool ctx_midstate_done = false;
void decred_hash(void *state, const void *input)
{
// #define MIDSTATE_LEN 128
sph_blake256_context ctx __attribute__ ((aligned (64)));
#define MIDSTATE_LEN 128
sph_blake256_context ctx;
uint8_t *ending = (uint8_t*) input;
ending += DECRED_MIDSTATE_LEN;
ending += MIDSTATE_LEN;
if (!ctx_midstate_done) {
sph_blake256_init(&blake_mid);
sph_blake256(&blake_mid, input, DECRED_MIDSTATE_LEN);
sph_blake256(&blake_mid, input, MIDSTATE_LEN);
ctx_midstate_done = true;
}
memcpy(&ctx, &blake_mid, sizeof(blake_mid));
sph_blake256(&ctx, ending, (180 - DECRED_MIDSTATE_LEN));
sph_blake256(&ctx, ending, (180 - MIDSTATE_LEN));
sph_blake256_close(&ctx, state);
}
@@ -54,14 +53,14 @@ void decred_hash_simple(void *state, const void *input)
int scanhash_decred(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
{
uint32_t _ALIGN(64) endiandata[48];
uint32_t _ALIGN(64) hash32[8];
uint32_t _ALIGN(128) endiandata[48];
uint32_t _ALIGN(128) hash32[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
// #define DCR_NONCE_OFT32 35
#define DCR_NONCE_OFT32 35
const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
const uint32_t first_nonce = pdata[DCR_NONCE_OFT32];
const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
uint32_t n = first_nonce;
@@ -81,7 +80,7 @@ int scanhash_decred(int thr_id, struct work *work, uint32_t max_nonce, uint64_t
do {
//be32enc(&endiandata[DCR_NONCE_OFT32], n);
endiandata[DECRED_NONCE_INDEX] = n;
endiandata[DCR_NONCE_OFT32] = n;
decred_hash(hash32, endiandata);
if (hash32[7] <= HTarget && fulltest(hash32, ptarget)) {
@@ -92,7 +91,7 @@ int scanhash_decred(int thr_id, struct work *work, uint32_t max_nonce, uint64_t
applog_hash(ptarget);
applog_compare_hash(hash32, ptarget);
#endif
pdata[DECRED_NONCE_INDEX] = n;
pdata[DCR_NONCE_OFT32] = n;
return 1;
}
@@ -101,17 +100,24 @@ int scanhash_decred(int thr_id, struct work *work, uint32_t max_nonce, uint64_t
} while (n < max_nonce && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
pdata[DECRED_NONCE_INDEX] = n;
pdata[DCR_NONCE_OFT32] = n;
return 0;
}
/*
uint32_t *decred_get_nonceptr( uint32_t *work_data )
{
return &work_data[ DECRED_NONCE_INDEX ];
}
// does decred need a custom stratum_get_g_work to fix nicehash
// bad extranonce2 size?
//
// does decred need a custom init_nonce?
// does it need to increment nonce, seems not because gen_work_now always
// returns true
double decred_calc_network_diff( struct work* work )
//void decred_calc_network_diff( struct work* work )
{
// sample for diff 43.281 : 1c05ea29
// todo: endian reversed on longpoll could be zr5 specific...
@@ -173,7 +179,7 @@ void decred_be_build_stratum_request( char *req, struct work *work,
rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
free(xnonce2str);
}
*/
/*
// data shared between gen_merkle_root and build_extraheader.
__thread uint32_t decred_extraheader[32] = { 0 };
@@ -189,7 +195,6 @@ void decred_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
}
*/
/*
#define min(a,b) (a>b ? (b) :(a))
void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
@@ -227,15 +232,11 @@ void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
for ( i = 0; i < headersize/4; i++ ) // header
g_work->data[17 + i] = extraheader[i];
// extradata
for ( i = 0; i < sctx->xnonce1_size/4; i++ )
g_work->data[ DECRED_XNONCE_INDEX + i ] = extradata[i];
for ( i = DECRED_XNONCE_INDEX + sctx->xnonce1_size/4; i < 45; i++ )
g_work->data[i] = 0;
g_work->data[37] = (rand()*4) << 8;
// block header suffix from coinb2 (stake version)
memcpy( &g_work->data[44],
&sctx->job.coinbase[ sctx->job.coinbase_size-4 ], 4 );
sctx->bloc_height = g_work->data[32];
//applog_hex(work->data, 180);
//applog_hex(&work->data[36], 36);
@@ -243,6 +244,21 @@ void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
#undef min
/*
bool decred_prevent_dupes( struct work* work, struct stratum_ctx* stratum,
int thr_id )
{
return false;
if ( have_stratum && strcmp(stratum->job.job_id, work->job_id) )
// need to regen g_work..
return true;
// extradata: prevent duplicates
work->data[ DECRED_XNONCE_INDEX ] += 1;
work->data[ DECRED_XNONCE_INDEX + 1 ] |= thr_id;
return false;
}
*/
bool decred_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
int thr_id )
{
@@ -266,13 +282,14 @@ bool register_decred_algo( algo_gate_t* gate )
gate->optimizations = SSE2_OPT;
gate->scanhash = (void*)&scanhash_decred;
gate->hash = (void*)&decred_hash;
gate->hash_alt = (void*)&decred_hash;
gate->get_nonceptr = (void*)&decred_get_nonceptr;
gate->get_max64 = (void*)&get_max64_0x3fffffLL;
gate->display_extra_data = (void*)&decred_decode_extradata;
gate->build_stratum_request = (void*)&decred_be_build_stratum_request;
gate->work_decode = (void*)&std_be_work_decode;
gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
// gate->gen_merkle_root = (void*)&decred_gen_merkle_root;
gate->build_extraheader = (void*)&decred_build_extraheader;
// gate->prevent_dupes = (void*)&decred_prevent_dupes;
gate->ready_to_mine = (void*)&decred_ready_to_mine;
gate->nbits_index = DECRED_NBITS_INDEX;
gate->ntime_index = DECRED_NTIME_INDEX;
@@ -283,4 +300,4 @@ bool register_decred_algo( algo_gate_t* gate )
have_gbt = false;
return true;
}
*/

View File

@@ -1,206 +0,0 @@
#include "pentablake-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "blake-hash-4way.h"
#include "sph_blake.h"
//#define DEBUG_ALGO
#ifdef PENTABLAKE_4WAY
extern void pentablakehash_4way( void *output, const void *input )
{
unsigned char _ALIGN(32) hash[128];
// // same as uint32_t hashA[16], hashB[16];
// #define hashB hash+64
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
blake512_4way_context ctx;
blake512_4way_init( &ctx );
blake512_4way( &ctx, input, 80 );
blake512_4way_close( &ctx, vhash );
uint64_t sin0[10], sin1[10], sin2[10], sin3[10];
mm256_deinterleave_4x64( sin0, sin1, sin2, sin3, input, 640 );
sph_blake512_context ctx2_blake;
sph_blake512_init(&ctx2_blake);
sph_blake512(&ctx2_blake, sin0, 80);
sph_blake512_close(&ctx2_blake, (void*) hash);
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
uint64_t* hash64 = (uint64_t*)hash;
for( int i = 0; i < 8; i++ )
{
if ( hash0[i] != hash64[i] )
printf("hash mismatch %u\n",i);
}
blake512_4way_init( &ctx );
blake512_4way( &ctx, vhash, 64 );
blake512_4way_close( &ctx, vhash );
blake512_4way_init( &ctx );
blake512_4way( &ctx, vhash, 64 );
blake512_4way_close( &ctx, vhash );
blake512_4way_init( &ctx );
blake512_4way( &ctx, vhash, 64 );
blake512_4way_close( &ctx, vhash );
blake512_4way_init( &ctx );
blake512_4way( &ctx, vhash, 64 );
blake512_4way_close( &ctx, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
memcpy( output+64, hash2, 32 );
memcpy( output+96, hash3, 32 );
/*
uint64_t sin0[10] __attribute__ ((aligned (64)));
uint64_t sin1[10] __attribute__ ((aligned (64)));
uint64_t sin2[10] __attribute__ ((aligned (64)));
uint64_t sin3[10] __attribute__ ((aligned (64)));
sph_blake512_context ctx_blake;
sph_blake512_init(&ctx_blake);
sph_blake512(&ctx_blake, input, 80);
sph_blake512_close(&ctx_blake, hash);
sph_blake512_init(&ctx_blake);
sph_blake512(&ctx_blake, hash, 64);
sph_blake512_close(&ctx_blake, hash);
sph_blake512_init(&ctx_blake);
sph_blake512(&ctx_blake, hash, 64);
sph_blake512_close(&ctx_blake, hash);
sph_blake512_init(&ctx_blake);
sph_blake512(&ctx_blake, hash, 64);
sph_blake512_close(&ctx_blake, hash);
sph_blake512_init(&ctx_blake);
sph_blake512(&ctx_blake, hash, 64);
sph_blake512_close(&ctx_blake, hash);
memcpy(output, hash, 32);
*/
}
int scanhash_pentablake_4way( int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t endiandata[32] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
// uint32_t _ALIGN(32) hash64[8];
// uint32_t _ALIGN(32) endiandata[32];
uint64_t htmax[] = {
0,
0xF,
0xFF,
0xFFF,
0xFFFF,
0x10000000
};
uint32_t masks[] = {
0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
0xFFFFF000,
0xFFFF0000,
0
};
// we need bigendian data...
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for ( int m=0; m < 6; m++ )
{
if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
pentablakehash_4way( hash, vdata );
// return immediately on nonce found, only one submit
if ( ( !(hash[7] & mask) ) && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
pdata[19] = n;
*hashes_done = n - first_nonce + 1;
return 1;
}
if ( (! ((hash+8)[7] & mask) ) && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n;
*hashes_done = n - first_nonce + 1;
return 1;
}
if ( ( !((hash+16)[7] & mask) ) && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n;
*hashes_done = n - first_nonce + 1;
return 1;
}
if ( ( !((hash+24)[7] & mask) ) && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n;
*hashes_done = n - first_nonce + 1;
return 1;
}
n += 4;
} while (n < max_nonce && !work_restart[thr_id].restart);
break;
}
}
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}
#endif

View File

@@ -1,16 +0,0 @@
#include "pentablake-gate.h"
bool register_pentablake_algo( algo_gate_t* gate )
{
#if defined (PENTABLAKE_4WAY)
gate->scanhash = (void*)&scanhash_pentablake_4way;
gate->hash = (void*)&pentablakehash_4way;
#else
gate->scanhash = (void*)&scanhash_pentablake;
gate->hash = (void*)&pentablakehash;
#endif
gate->optimizations = FOUR_WAY_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

View File

@@ -1,21 +0,0 @@
#ifndef __PENTABLAKE_GATE_H__
#define __PENTABLAKE_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(FOUR_WAY) && defined(__AVX__)
#define PENTABLAKE_4WAY
#endif
#if defined(PENTABLAKE_4WAY)
void pentablakehash_4way( void *state, const void *input );
int scanhash_pentablake_4way( int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done );
#endif
void pentablakehash( void *state, const void *input );
int scanhash_pentablake( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif

View File

@@ -1,4 +1,5 @@
#include "pentablake-gate.h"
#include "miner.h"
#include "algo-gate-api.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -110,3 +111,11 @@ int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_nonce,
return 0;
}
bool register_pentablake_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_pentablake;
gate->hash = (void*)&pentablakehash;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

View File

@@ -813,7 +813,6 @@ blake32(sph_blake_small_context *sc, const void *data, size_t len)
buf = sc->buf;
ptr = sc->ptr;
if (len < (sizeof sc->buf) - ptr) {
memcpy(buf + ptr, data, len);
ptr += len;
@@ -872,7 +871,6 @@ blake32_close(sph_blake_small_context *sc,
} else {
sc->T0 -= 512 - bit_len;
}
if (bit_len <= 446) {
memset(u.buf + ptr + 1, 0, 55 - ptr);
if (out_size_w32 == 8)
@@ -892,9 +890,9 @@ blake32_close(sph_blake_small_context *sc,
sph_enc32be_aligned(u.buf + 60, tl);
blake32(sc, u.buf, 64);
}
out = dst;
for (k = 0; k < out_size_w32; k ++)
sph_enc32be(out + (k << 2), sc->H[k]);
}
#if SPH_64
@@ -984,11 +982,9 @@ blake64_close(sph_blake_big_context *sc,
u.buf[111] |= 1;
sph_enc64be_aligned(u.buf + 112, th);
sph_enc64be_aligned(u.buf + 120, tl);
blake64(sc, u.buf + ptr, 128 - ptr);
} else {
memset(u.buf + ptr + 1, 0, 127 - ptr);
blake64(sc, u.buf + ptr, 128 - ptr);
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF);
@@ -997,7 +993,6 @@ blake64_close(sph_blake_big_context *sc,
u.buf[111] = 1;
sph_enc64be_aligned(u.buf + 112, th);
sph_enc64be_aligned(u.buf + 120, tl);
blake64(sc, u.buf, 128);
}
out = dst;

View File

@@ -42,7 +42,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "algo/sha3/sph_types.h"
/**
* Output size (in bits) for BLAKE-224.

View File

@@ -31,7 +31,7 @@
#include <stdint.h>
#include <string.h>
#include "algo/sha/sph_types.h"
#include "algo/sha3/sph_types.h"
#include "sph_blake2b.h"
// Cyclic right rotation.

View File

@@ -1,969 +0,0 @@
/* $Id: bmw.c 227 2010-06-16 17:28:38Z tp $ */
/*
* BMW implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#include <stddef.h>
#include <string.h>
#include <limits.h>
#include "bmw-hash-4way.h"
#if defined(__AVX2__)
#ifdef __cplusplus
extern "C"{
#endif
//#include "sph_bmw.h"
//#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BMW
#define SPH_SMALL_FOOTPRINT_BMW 1
//#endif
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
//#undef SPH_ROTL64
//#define SPH_ROTL64(x,n) (((x) << (n)) | ((x) >> (64 - (n))))
//#define SPH_ROTL64(x,n) mm256_rotl_64(x,n)
static const sph_u32 IV256[] = {
SPH_C32(0x40414243), SPH_C32(0x44454647),
SPH_C32(0x48494A4B), SPH_C32(0x4C4D4E4F),
SPH_C32(0x50515253), SPH_C32(0x54555657),
SPH_C32(0x58595A5B), SPH_C32(0x5C5D5E5F),
SPH_C32(0x60616263), SPH_C32(0x64656667),
SPH_C32(0x68696A6B), SPH_C32(0x6C6D6E6F),
SPH_C32(0x70717273), SPH_C32(0x74757677),
SPH_C32(0x78797A7B), SPH_C32(0x7C7D7E7F)
};
#if SPH_64
static const sph_u64 IV512[] = {
SPH_C64(0x8081828384858687), SPH_C64(0x88898A8B8C8D8E8F),
SPH_C64(0x9091929394959697), SPH_C64(0x98999A9B9C9D9E9F),
SPH_C64(0xA0A1A2A3A4A5A6A7), SPH_C64(0xA8A9AAABACADAEAF),
SPH_C64(0xB0B1B2B3B4B5B6B7), SPH_C64(0xB8B9BABBBCBDBEBF),
SPH_C64(0xC0C1C2C3C4C5C6C7), SPH_C64(0xC8C9CACBCCCDCECF),
SPH_C64(0xD0D1D2D3D4D5D6D7), SPH_C64(0xD8D9DADBDCDDDEDF),
SPH_C64(0xE0E1E2E3E4E5E6E7), SPH_C64(0xE8E9EAEBECEDEEEF),
SPH_C64(0xF0F1F2F3F4F5F6F7), SPH_C64(0xF8F9FAFBFCFDFEFF)
};
#endif
#define XCAT(x, y) XCAT_(x, y)
#define XCAT_(x, y) x ## y
#define LPAR (
/*
#define ss0(x) (((x) >> 1) ^ SPH_T32((x) << 3) \
^ SPH_ROTL32(x, 4) ^ SPH_ROTL32(x, 19))
#define ss1(x) (((x) >> 1) ^ SPH_T32((x) << 2) \
^ SPH_ROTL32(x, 8) ^ SPH_ROTL32(x, 23))
#define ss2(x) (((x) >> 2) ^ SPH_T32((x) << 1) \
^ SPH_ROTL32(x, 12) ^ SPH_ROTL32(x, 25))
#define ss3(x) (((x) >> 2) ^ SPH_T32((x) << 2) \
^ SPH_ROTL32(x, 15) ^ SPH_ROTL32(x, 29))
#define ss4(x) (((x) >> 1) ^ (x))
#define ss5(x) (((x) >> 2) ^ (x))
#define rs1(x) SPH_ROTL32(x, 3)
#define rs2(x) SPH_ROTL32(x, 7)
#define rs3(x) SPH_ROTL32(x, 13)
#define rs4(x) SPH_ROTL32(x, 16)
#define rs5(x) SPH_ROTL32(x, 19)
#define rs6(x) SPH_ROTL32(x, 23)
#define rs7(x) SPH_ROTL32(x, 27)
#define Ks(j) SPH_T32((sph_u32)(j) * SPH_C32(0x05555555))
#define add_elt_s(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \
(SPH_T32(SPH_ROTL32(mf(j0m), j1m) + SPH_ROTL32(mf(j3m), j4m) \
- SPH_ROTL32(mf(j10m), j11m) + Ks(j16)) ^ hf(j7m))
#define expand1s_inner(qf, mf, hf, i16, \
i0, i1, i2, i3, i4, i5, i6, i7, i8, \
i9, i10, i11, i12, i13, i14, i15, \
i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
SPH_T32(ss1(qf(i0)) + ss2(qf(i1)) + ss3(qf(i2)) + ss0(qf(i3)) \
+ ss1(qf(i4)) + ss2(qf(i5)) + ss3(qf(i6)) + ss0(qf(i7)) \
+ ss1(qf(i8)) + ss2(qf(i9)) + ss3(qf(i10)) + ss0(qf(i11)) \
+ ss1(qf(i12)) + ss2(qf(i13)) + ss3(qf(i14)) + ss0(qf(i15)) \
+ add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
#define expand1s(qf, mf, hf, i16) \
expand1s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
#define expand1s_(qf, mf, hf, i16, ix, iy) \
expand1s_inner LPAR qf, mf, hf, i16, ix, iy)
#define expand2s_inner(qf, mf, hf, i16, \
i0, i1, i2, i3, i4, i5, i6, i7, i8, \
i9, i10, i11, i12, i13, i14, i15, \
i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
SPH_T32(qf(i0) + rs1(qf(i1)) + qf(i2) + rs2(qf(i3)) \
+ qf(i4) + rs3(qf(i5)) + qf(i6) + rs4(qf(i7)) \
+ qf(i8) + rs5(qf(i9)) + qf(i10) + rs6(qf(i11)) \
+ qf(i12) + rs7(qf(i13)) + ss4(qf(i14)) + ss5(qf(i15)) \
+ add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
#define expand2s(qf, mf, hf, i16) \
expand2s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
#define expand2s_(qf, mf, hf, i16, ix, iy) \
expand2s_inner LPAR qf, mf, hf, i16, ix, iy)
*/
#if SPH_64
#define sb0(x) \
_mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 1), \
_mm256_slli_epi64( (x), 3) ), \
_mm256_xor_si256( mm256_rotl_64( (x), 4), \
mm256_rotl_64( (x), 37) ) )
#define sb1(x) \
_mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 1), \
_mm256_slli_epi64( (x), 2) ), \
_mm256_xor_si256( mm256_rotl_64( (x), 13), \
mm256_rotl_64( (x), 43) ) )
#define sb2(x) \
_mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 2), \
_mm256_slli_epi64( (x), 1) ), \
_mm256_xor_si256( mm256_rotl_64( (x), 19), \
mm256_rotl_64( (x), 53) ) )
#define sb3(x) \
_mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 2), \
_mm256_slli_epi64( (x), 2) ), \
_mm256_xor_si256( mm256_rotl_64( (x), 28), \
mm256_rotl_64( (x), 59) ) )
#define sb4(x) \
_mm256_xor_si256( (x), _mm256_srli_epi64( (x), 1 ) )
#define sb5(x) \
_mm256_xor_si256( (x), _mm256_srli_epi64( (x), 2 ) )
#define rb1(x) mm256_rotl_64( x, 5 )
#define rb2(x) mm256_rotl_64( x, 11 )
#define rb3(x) mm256_rotl_64( x, 27 )
#define rb4(x) mm256_rotl_64( x, 32 )
#define rb5(x) mm256_rotl_64( x, 37 )
#define rb6(x) mm256_rotl_64( x, 43 )
#define rb7(x) mm256_rotl_64( x, 53 )
#define rol_off( M, j, off ) \
mm256_rotl_64( M[ ( (j) + (off) ) & 15 ] , \
( ( (j) + (off) ) & 15 ) + 1 )
#define add_elt_b( M, H, j ) \
_mm256_xor_si256( \
_mm256_add_epi64( \
_mm256_sub_epi64( _mm256_add_epi64( rol_off( M, j, 0 ), \
rol_off( M, j, 3 ) ), \
rol_off( M, j, 10 ) ), \
_mm256_set1_epi64x( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \
H[ ( (j)+7 ) & 15 ] )
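A scalar sketch of what rol_off and add_elt_b compute per 64-bit lane, written out only to make the AVX2 macros above easier to follow; the _scalar names are illustrative.
#include <stdint.h>

/* Illustrative scalar equivalent of rol_off()/add_elt_b(): three rotated
 * message words are combined, a per-index constant is added, and the result
 * is XORed with one chaining word. */
static inline uint64_t rotl64( uint64_t x, int n )
{
    return ( x << n ) | ( x >> ( 64 - n ) );
}

static inline uint64_t rol_off_scalar( const uint64_t M[16], int j, int off )
{
    int k = ( j + off ) & 15;
    return rotl64( M[k], k + 1 );        // rotation count is the word index + 1
}

static inline uint64_t add_elt_b_scalar( const uint64_t M[16],
                                         const uint64_t H[16], int j )
{
    uint64_t t = rol_off_scalar( M, j, 0 )
               + rol_off_scalar( M, j, 3 )
               - rol_off_scalar( M, j, 10 )
               + (uint64_t)( j + 16 ) * 0x0555555555555555ULL;
    return t ^ H[ ( j + 7 ) & 15 ];
}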
#define expand1b( qt, M, H, i ) \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( sb1( qt[ (i)-16 ] ), \
sb2( qt[ (i)-15 ] ) ), \
_mm256_add_epi64( sb3( qt[ (i)-14 ] ), \
sb0( qt[ (i)-13 ] ) ) ), \
_mm256_add_epi64( \
_mm256_add_epi64( sb1( qt[ (i)-12 ] ), \
sb2( qt[ (i)-11 ] ) ), \
_mm256_add_epi64( sb3( qt[ (i)-10 ] ), \
sb0( qt[ (i)- 9 ] ) ) ) ), \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( sb1( qt[ (i)- 8 ] ), \
sb2( qt[ (i)- 7 ] ) ), \
_mm256_add_epi64( sb3( qt[ (i)- 6 ] ), \
sb0( qt[ (i)- 5 ] ) ) ), \
_mm256_add_epi64( \
_mm256_add_epi64( sb1( qt[ (i)- 4 ] ), \
sb2( qt[ (i)- 3 ] ) ), \
_mm256_add_epi64( sb3( qt[ (i)- 2 ] ), \
sb0( qt[ (i)- 1 ] ) ) ) ) ), \
add_elt_b( M, H, (i)-16 ) )
#define expand2b( qt, M, H, i) \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( qt[ (i)-16 ], rb1( qt[ (i)-15 ] ) ), \
_mm256_add_epi64( qt[ (i)-14 ], rb2( qt[ (i)-13 ] ) ) ), \
_mm256_add_epi64( \
_mm256_add_epi64( qt[ (i)-12 ], rb3( qt[ (i)-11 ] ) ), \
_mm256_add_epi64( qt[ (i)-10 ], rb4( qt[ (i)- 9 ] ) ) ) ), \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( qt[ (i)- 8 ], rb5( qt[ (i)- 7 ] ) ), \
_mm256_add_epi64( qt[ (i)- 6 ], rb6( qt[ (i)- 5 ] ) ) ), \
_mm256_add_epi64( \
_mm256_add_epi64( qt[ (i)- 4 ], rb7( qt[ (i)- 3 ] ) ), \
_mm256_add_epi64( sb4( qt[ (i)- 2 ] ), \
sb5( qt[ (i)- 1 ] ) ) ) ) ), \
add_elt_b( M, H, (i)-16 ) )
#endif
/*
#define MAKE_W( i0, op01, i1, op12, i2, op23, i3, op34, i4) \
((M(i0) ^ H(i0)) op01 (M(i1) ^ H(i1)) op12 (M(i2) ^ H(i2)) \
op23 (M(i3) ^ H(i3)) op34 (M(i4) ^ H(i4)))
*/
/*
#define Ws0 MAKE_W(SPH_T32, 5, -, 7, +, 10, +, 13, +, 14)
#define Ws1 MAKE_W(SPH_T32, 6, -, 8, +, 11, +, 14, -, 15)
#define Ws2 MAKE_W(SPH_T32, 0, +, 7, +, 9, -, 12, +, 15)
#define Ws3 MAKE_W(SPH_T32, 0, -, 1, +, 8, -, 10, +, 13)
#define Ws4 MAKE_W(SPH_T32, 1, +, 2, +, 9, -, 11, -, 14)
#define Ws5 MAKE_W(SPH_T32, 3, -, 2, +, 10, -, 12, +, 15)
#define Ws6 MAKE_W(SPH_T32, 4, -, 0, -, 3, -, 11, +, 13)
#define Ws7 MAKE_W(SPH_T32, 1, -, 4, -, 5, -, 12, -, 14)
#define Ws8 MAKE_W(SPH_T32, 2, -, 5, -, 6, +, 13, -, 15)
#define Ws9 MAKE_W(SPH_T32, 0, -, 3, +, 6, -, 7, +, 14)
#define Ws10 MAKE_W(SPH_T32, 8, -, 1, -, 4, -, 7, +, 15)
#define Ws11 MAKE_W(SPH_T32, 8, -, 0, -, 2, -, 5, +, 9)
#define Ws12 MAKE_W(SPH_T32, 1, +, 3, -, 6, -, 9, +, 10)
#define Ws13 MAKE_W(SPH_T32, 2, +, 4, +, 7, +, 10, +, 11)
#define Ws14 MAKE_W(SPH_T32, 3, -, 5, +, 8, -, 11, -, 12)
#define Ws15 MAKE_W(SPH_T32, 12, -, 4, -, 6, -, 9, +, 13)
#if SPH_SMALL_FOOTPRINT_BMW
#define MAKE_Qas do { \
unsigned u; \
sph_u32 Ws[16]; \
Ws[ 0] = Ws0; \
Ws[ 1] = Ws1; \
Ws[ 2] = Ws2; \
Ws[ 3] = Ws3; \
Ws[ 4] = Ws4; \
Ws[ 5] = Ws5; \
Ws[ 6] = Ws6; \
Ws[ 7] = Ws7; \
Ws[ 8] = Ws8; \
Ws[ 9] = Ws9; \
Ws[10] = Ws10; \
Ws[11] = Ws11; \
Ws[12] = Ws12; \
Ws[13] = Ws13; \
Ws[14] = Ws14; \
Ws[15] = Ws15; \
for (u = 0; u < 15; u += 5) { \
qt[u + 0] = SPH_T32(ss0(Ws[u + 0]) + H(u + 1)); \
qt[u + 1] = SPH_T32(ss1(Ws[u + 1]) + H(u + 2)); \
qt[u + 2] = SPH_T32(ss2(Ws[u + 2]) + H(u + 3)); \
qt[u + 3] = SPH_T32(ss3(Ws[u + 3]) + H(u + 4)); \
qt[u + 4] = SPH_T32(ss4(Ws[u + 4]) + H(u + 5)); \
} \
qt[15] = SPH_T32(ss0(Ws[15]) + H(0)); \
} while (0)
#define MAKE_Qbs do { \
qt[16] = expand1s(Qs, M, H, 16); \
qt[17] = expand1s(Qs, M, H, 17); \
qt[18] = expand2s(Qs, M, H, 18); \
qt[19] = expand2s(Qs, M, H, 19); \
qt[20] = expand2s(Qs, M, H, 20); \
qt[21] = expand2s(Qs, M, H, 21); \
qt[22] = expand2s(Qs, M, H, 22); \
qt[23] = expand2s(Qs, M, H, 23); \
qt[24] = expand2s(Qs, M, H, 24); \
qt[25] = expand2s(Qs, M, H, 25); \
qt[26] = expand2s(Qs, M, H, 26); \
qt[27] = expand2s(Qs, M, H, 27); \
qt[28] = expand2s(Qs, M, H, 28); \
qt[29] = expand2s(Qs, M, H, 29); \
qt[30] = expand2s(Qs, M, H, 30); \
qt[31] = expand2s(Qs, M, H, 31); \
} while (0)
#else
#define MAKE_Qas do { \
qt[ 0] = SPH_T32(ss0(Ws0 ) + H( 1)); \
qt[ 1] = SPH_T32(ss1(Ws1 ) + H( 2)); \
qt[ 2] = SPH_T32(ss2(Ws2 ) + H( 3)); \
qt[ 3] = SPH_T32(ss3(Ws3 ) + H( 4)); \
qt[ 4] = SPH_T32(ss4(Ws4 ) + H( 5)); \
qt[ 5] = SPH_T32(ss0(Ws5 ) + H( 6)); \
qt[ 6] = SPH_T32(ss1(Ws6 ) + H( 7)); \
qt[ 7] = SPH_T32(ss2(Ws7 ) + H( 8)); \
qt[ 8] = SPH_T32(ss3(Ws8 ) + H( 9)); \
qt[ 9] = SPH_T32(ss4(Ws9 ) + H(10)); \
qt[10] = SPH_T32(ss0(Ws10) + H(11)); \
qt[11] = SPH_T32(ss1(Ws11) + H(12)); \
qt[12] = SPH_T32(ss2(Ws12) + H(13)); \
qt[13] = SPH_T32(ss3(Ws13) + H(14)); \
qt[14] = SPH_T32(ss4(Ws14) + H(15)); \
qt[15] = SPH_T32(ss0(Ws15) + H( 0)); \
} while (0)
#define MAKE_Qbs do { \
qt[16] = expand1s(Qs, M, H, 16); \
qt[17] = expand1s(Qs, M, H, 17); \
qt[18] = expand2s(Qs, M, H, 18); \
qt[19] = expand2s(Qs, M, H, 19); \
qt[20] = expand2s(Qs, M, H, 20); \
qt[21] = expand2s(Qs, M, H, 21); \
qt[22] = expand2s(Qs, M, H, 22); \
qt[23] = expand2s(Qs, M, H, 23); \
qt[24] = expand2s(Qs, M, H, 24); \
qt[25] = expand2s(Qs, M, H, 25); \
qt[26] = expand2s(Qs, M, H, 26); \
qt[27] = expand2s(Qs, M, H, 27); \
qt[28] = expand2s(Qs, M, H, 28); \
qt[29] = expand2s(Qs, M, H, 29); \
qt[30] = expand2s(Qs, M, H, 30); \
qt[31] = expand2s(Qs, M, H, 31); \
} while (0)
#endif
#define MAKE_Qs do { \
MAKE_Qas; \
MAKE_Qbs; \
} while (0)
#define Qs(j) (qt[j])
*/
#if SPH_64
#define Wb0 \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
_mm256_xor_si256( M[10], H[10] ) ), \
_mm256_xor_si256( M[13], H[13] ) ), \
_mm256_xor_si256( M[14], H[14] ) )
#define Wb1 \
_mm256_sub_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 6], H[ 6] ), \
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
_mm256_xor_si256( M[11], H[11] ) ), \
_mm256_xor_si256( M[14], H[14] ) ), \
_mm256_xor_si256( M[15], H[15] ) )
#define Wb2 \
_mm256_add_epi64( \
_mm256_sub_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
_mm256_xor_si256( M[12], H[12] ) ), \
_mm256_xor_si256( M[15], H[15] ) )
#define Wb3 \
_mm256_add_epi64( \
_mm256_sub_epi64( \
_mm256_add_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
_mm256_xor_si256( M[ 1], H[ 1] ) ), \
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
_mm256_xor_si256( M[10], H[10] ) ), \
_mm256_xor_si256( M[13], H[13] ) )
#define Wb4 \
_mm256_sub_epi64( \
_mm256_sub_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
_mm256_xor_si256( M[11], H[11] ) ), \
_mm256_xor_si256( M[14], H[14] ) )
#define Wb5 \
_mm256_add_epi64( \
_mm256_sub_epi64( \
_mm256_add_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
_mm256_xor_si256( M[10], H[10] ) ), \
_mm256_xor_si256( M[12], H[12] ) ), \
_mm256_xor_si256( M[15], H[15] ) )
#define Wb6 \
_mm256_add_epi64( \
_mm256_sub_epi64( \
_mm256_sub_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 4], H[ 4] ), \
_mm256_xor_si256( M[ 0], H[ 0] ) ), \
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
_mm256_xor_si256( M[11], H[11] ) ), \
_mm256_xor_si256( M[13], H[13] ) )
#define Wb7 \
_mm256_sub_epi64( \
_mm256_sub_epi64( \
_mm256_sub_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
_mm256_xor_si256( M[12], H[12] ) ), \
_mm256_xor_si256( M[14], H[14] ) )
#define Wb8 \
_mm256_sub_epi64( \
_mm256_add_epi64( \
_mm256_sub_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
_mm256_xor_si256( M[13], H[13] ) ), \
_mm256_xor_si256( M[15], H[15] ) )
#define Wb9 \
_mm256_add_epi64( \
_mm256_sub_epi64( \
_mm256_add_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
_mm256_xor_si256( M[14], H[14] ) )
#define Wb10 \
_mm256_add_epi64( \
_mm256_sub_epi64( \
_mm256_sub_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \
_mm256_xor_si256( M[ 1], H[ 1] ) ), \
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
_mm256_xor_si256( M[15], H[15] ) )
#define Wb11 \
_mm256_add_epi64( \
_mm256_sub_epi64( \
_mm256_sub_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \
_mm256_xor_si256( M[ 0], H[ 0] ) ), \
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
_mm256_xor_si256( M[ 9], H[ 9] ) )
#define Wb12 \
_mm256_add_epi64( \
_mm256_sub_epi64( \
_mm256_sub_epi64( \
_mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
_mm256_xor_si256( M[10], H[10] ) )
#define Wb13 \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( \
_mm256_add_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
_mm256_xor_si256( M[10], H[10] ) ), \
_mm256_xor_si256( M[11], H[11] ) )
#define Wb14 \
_mm256_sub_epi64( \
_mm256_sub_epi64( \
_mm256_add_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
_mm256_xor_si256( M[11], H[11] ) ), \
_mm256_xor_si256( M[12], H[12] ) )
#define Wb15 \
_mm256_add_epi64( \
_mm256_sub_epi64( \
_mm256_sub_epi64( \
_mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \
_mm256_xor_si256( M[ 4], H[4] ) ), \
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
_mm256_xor_si256( M[13], H[13] ) )
void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
{
__m256i qt[32], xl, xh;
qt[ 0] = sb0( Wb0 ) + H[ 1];
qt[ 1] = sb1( Wb1 ) + H[ 2];
qt[ 2] = sb2( Wb2 ) + H[ 3];
qt[ 3] = sb3( Wb3 ) + H[ 4];
qt[ 4] = sb4( Wb4 ) + H[ 5];
qt[ 5] = sb0( Wb5 ) + H[ 6];
qt[ 6] = sb1( Wb6 ) + H[ 7];
qt[ 7] = sb2( Wb7 ) + H[ 8];
qt[ 8] = sb3( Wb8 ) + H[ 9];
qt[ 9] = sb4( Wb9 ) + H[10];
qt[10] = sb0( Wb10) + H[11];
qt[11] = sb1( Wb11) + H[12];
qt[12] = sb2( Wb12) + H[13];
qt[13] = sb3( Wb13) + H[14];
qt[14] = sb4( Wb14) + H[15];
qt[15] = sb0( Wb15) + H[ 0];
qt[16] = expand1b( qt, M, H, 16 );
qt[17] = expand1b( qt, M, H, 17 );
qt[18] = expand2b( qt, M, H, 18 );
qt[19] = expand2b( qt, M, H, 19 );
qt[20] = expand2b( qt, M, H, 20 );
qt[21] = expand2b( qt, M, H, 21 );
qt[22] = expand2b( qt, M, H, 22 );
qt[23] = expand2b( qt, M, H, 23 );
qt[24] = expand2b( qt, M, H, 24 );
qt[25] = expand2b( qt, M, H, 25 );
qt[26] = expand2b( qt, M, H, 26 );
qt[27] = expand2b( qt, M, H, 27 );
qt[28] = expand2b( qt, M, H, 28 );
qt[29] = expand2b( qt, M, H, 29 );
qt[30] = expand2b( qt, M, H, 30 );
qt[31] = expand2b( qt, M, H, 31 );
xl = _mm256_xor_si256(
_mm256_xor_si256( _mm256_xor_si256( qt[16], qt[17] ),
_mm256_xor_si256( qt[18], qt[19] ) ),
_mm256_xor_si256( _mm256_xor_si256( qt[20], qt[21] ),
_mm256_xor_si256( qt[22], qt[23] ) ) );
xh = _mm256_xor_si256( xl,
_mm256_xor_si256(
_mm256_xor_si256( _mm256_xor_si256( qt[24], qt[25] ),
_mm256_xor_si256( qt[26], qt[27] ) ),
_mm256_xor_si256( _mm256_xor_si256( qt[28], qt[29] ),
_mm256_xor_si256( qt[30], qt[31] ) )));
dH[ 0] = _mm256_add_epi64(
_mm256_xor_si256( M[0],
_mm256_xor_si256( _mm256_slli_epi64( xh, 5 ),
_mm256_srli_epi64( qt[16], 5 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[24] ), qt[ 0] ));
dH[ 1] = _mm256_add_epi64(
_mm256_xor_si256( M[1],
_mm256_xor_si256( _mm256_srli_epi64( xh, 7 ),
_mm256_slli_epi64( qt[17], 8 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[25] ), qt[ 1] ));
dH[ 2] = _mm256_add_epi64(
_mm256_xor_si256( M[2],
_mm256_xor_si256( _mm256_srli_epi64( xh, 5 ),
_mm256_slli_epi64( qt[18], 5 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[26] ), qt[ 2] ));
dH[ 3] = _mm256_add_epi64(
_mm256_xor_si256( M[3],
_mm256_xor_si256( _mm256_srli_epi64( xh, 1 ),
_mm256_slli_epi64( qt[19], 5 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[27] ), qt[ 3] ));
dH[ 4] = _mm256_add_epi64(
_mm256_xor_si256( M[4],
_mm256_xor_si256( _mm256_srli_epi64( xh, 3 ),
_mm256_slli_epi64( qt[20], 0 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[28] ), qt[ 4] ));
dH[ 5] = _mm256_add_epi64(
_mm256_xor_si256( M[5],
_mm256_xor_si256( _mm256_slli_epi64( xh, 6 ),
_mm256_srli_epi64( qt[21], 6 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[29] ), qt[ 5] ));
dH[ 6] = _mm256_add_epi64(
_mm256_xor_si256( M[6],
_mm256_xor_si256( _mm256_srli_epi64( xh, 4 ),
_mm256_slli_epi64( qt[22], 6 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[30] ), qt[ 6] ));
dH[ 7] = _mm256_add_epi64(
_mm256_xor_si256( M[7],
_mm256_xor_si256( _mm256_srli_epi64( xh, 11 ),
_mm256_slli_epi64( qt[23], 2 ) ) ),
_mm256_xor_si256( _mm256_xor_si256( xl, qt[31] ), qt[ 7] ));
dH[ 8] = _mm256_add_epi64( _mm256_add_epi64(
mm256_rotl_64( dH[4], 9 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[24] ), M[ 8] )),
_mm256_xor_si256( _mm256_slli_epi64( xl, 8 ),
_mm256_xor_si256( qt[23], qt[ 8] ) ) );
dH[ 9] = _mm256_add_epi64( _mm256_add_epi64(
mm256_rotl_64( dH[5], 10 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[25] ), M[ 9] )),
_mm256_xor_si256( _mm256_srli_epi64( xl, 6 ),
_mm256_xor_si256( qt[16], qt[ 9] ) ) );
dH[10] = _mm256_add_epi64( _mm256_add_epi64(
mm256_rotl_64( dH[6], 11 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[26] ), M[10] )),
_mm256_xor_si256( _mm256_slli_epi64( xl, 6 ),
_mm256_xor_si256( qt[17], qt[10] ) ) );
dH[11] = _mm256_add_epi64( _mm256_add_epi64(
mm256_rotl_64( dH[7], 12 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[27] ), M[11] )),
_mm256_xor_si256( _mm256_slli_epi64( xl, 4 ),
_mm256_xor_si256( qt[18], qt[11] ) ) );
dH[12] = _mm256_add_epi64( _mm256_add_epi64(
mm256_rotl_64( dH[0], 13 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[28] ), M[12] )),
_mm256_xor_si256( _mm256_srli_epi64( xl, 3 ),
_mm256_xor_si256( qt[19], qt[12] ) ) );
dH[13] = _mm256_add_epi64( _mm256_add_epi64(
mm256_rotl_64( dH[1], 14 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[29] ), M[13] )),
_mm256_xor_si256( _mm256_srli_epi64( xl, 4 ),
_mm256_xor_si256( qt[20], qt[13] ) ) );
dH[14] = _mm256_add_epi64( _mm256_add_epi64(
mm256_rotl_64( dH[2], 15 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[30] ), M[14] )),
_mm256_xor_si256( _mm256_srli_epi64( xl, 7 ),
_mm256_xor_si256( qt[21], qt[14] ) ) );
dH[15] = _mm256_add_epi64( _mm256_add_epi64(
mm256_rotl_64( dH[3], 16 ),
_mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )),
_mm256_xor_si256( _mm256_srli_epi64( xl, 2 ),
_mm256_xor_si256( qt[22], qt[15] ) ) );
}
#endif // 64
//#define FOLDs FOLD(sph_u32, MAKE_Qs, SPH_ROTL32, M, Qs, dH)
/*
static void
compress_small(const unsigned char *data, const sph_u32 h[16], sph_u32 dh[16])
{
#define M(x) sph_dec32le_aligned(data + 4 * (x))
#define H(x) (h[x])
#define dH(x) (dh[x])
FOLDs;
#undef M
#undef H
#undef dH
}
static const sph_u32 final_s[16] = {
SPH_C32(0xaaaaaaa0), SPH_C32(0xaaaaaaa1), SPH_C32(0xaaaaaaa2),
SPH_C32(0xaaaaaaa3), SPH_C32(0xaaaaaaa4), SPH_C32(0xaaaaaaa5),
SPH_C32(0xaaaaaaa6), SPH_C32(0xaaaaaaa7), SPH_C32(0xaaaaaaa8),
SPH_C32(0xaaaaaaa9), SPH_C32(0xaaaaaaaa), SPH_C32(0xaaaaaaab),
SPH_C32(0xaaaaaaac), SPH_C32(0xaaaaaaad), SPH_C32(0xaaaaaaae),
SPH_C32(0xaaaaaaaf)
};
static void
bmw32_4way_init(bmw_4way_small_context *sc, const sph_u32 *iv)
{
memcpy(sc->H, iv, sizeof sc->H);
sc->ptr = 0;
#if SPH_64
sc->bit_count = 0;
#else
sc->bit_count_high = 0;
sc->bit_count_low = 0;
#endif
}
static void
bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
{
unsigned char *buf;
size_t ptr;
sph_u32 htmp[16];
sph_u32 *h1, *h2;
#if !SPH_64
sph_u32 tmp;
#endif
#if SPH_64
sc->bit_count += (sph_u64)len << 3;
#else
tmp = sc->bit_count_low;
sc->bit_count_low = SPH_T32(tmp + ((sph_u32)len << 3));
if (sc->bit_count_low < tmp)
sc->bit_count_high ++;
sc->bit_count_high += len >> 29;
#endif
buf = sc->buf;
ptr = sc->ptr;
h1 = sc->H;
h2 = htmp;
while (len > 0) {
size_t clen;
clen = (sizeof sc->buf) - ptr;
if (clen > len)
clen = len;
memcpy(buf + ptr, data, clen);
data = (const unsigned char *)data + clen;
len -= clen;
ptr += clen;
if (ptr == sizeof sc->buf) {
sph_u32 *ht;
compress_small(buf, h1, h2);
ht = h1;
h1 = h2;
h2 = ht;
ptr = 0;
}
}
sc->ptr = ptr;
if (h1 != sc->H)
memcpy(sc->H, h1, sizeof sc->H);
}
static void
bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32)
{
unsigned char *buf, *out;
size_t ptr, u, v;
unsigned z;
sph_u32 h1[16], h2[16], *h;
buf = sc->buf;
ptr = sc->ptr;
z = 0x80 >> n;
buf[ptr ++] = ((ub & -z) | z) & 0xFF;
h = sc->H;
if (ptr > (sizeof sc->buf) - 8) {
memset(buf + ptr, 0, (sizeof sc->buf) - ptr);
compress_small(buf, h, h1);
ptr = 0;
h = h1;
}
memset(buf + ptr, 0, (sizeof sc->buf) - 8 - ptr);
#if SPH_64
sph_enc64le_aligned(buf + (sizeof sc->buf) - 8,
SPH_T64(sc->bit_count + n));
#else
sph_enc32le_aligned(buf + (sizeof sc->buf) - 8,
sc->bit_count_low + n);
sph_enc32le_aligned(buf + (sizeof sc->buf) - 4,
SPH_T32(sc->bit_count_high));
#endif
compress_small(buf, h, h2);
for (u = 0; u < 16; u ++)
sph_enc32le_aligned(buf + 4 * u, h2[u]);
compress_small(buf, final_s, h1);
out = dst;
for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
sph_enc32le(out + 4 * u, h1[v]);
}
*/
#if SPH_64
static const __m256i final_b[16] =
{
{ 0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0,
0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0 },
{ 0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1,
0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1 },
{ 0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2,
0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2 },
{ 0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3,
0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3 },
{ 0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4,
0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4 },
{ 0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5,
0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5 },
{ 0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6,
0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6 },
{ 0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7,
0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7 },
{ 0xaaaaaaaaaaaaaaa8, 0xaaaaaaaaaaaaaaa8,
0xaaaaaaaaaaaaaaa8, 0xaaaaaaaaaaaaaaa8 },
{ 0xaaaaaaaaaaaaaaa9, 0xaaaaaaaaaaaaaaa9,
0xaaaaaaaaaaaaaaa9, 0xaaaaaaaaaaaaaaa9 },
{ 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa,
0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa },
{ 0xaaaaaaaaaaaaaaab, 0xaaaaaaaaaaaaaaab,
0xaaaaaaaaaaaaaaab, 0xaaaaaaaaaaaaaaab },
{ 0xaaaaaaaaaaaaaaac, 0xaaaaaaaaaaaaaaac,
0xaaaaaaaaaaaaaaac, 0xaaaaaaaaaaaaaaac },
{ 0xaaaaaaaaaaaaaaad, 0xaaaaaaaaaaaaaaad,
0xaaaaaaaaaaaaaaad, 0xaaaaaaaaaaaaaaad },
{ 0xaaaaaaaaaaaaaaae, 0xaaaaaaaaaaaaaaae,
0xaaaaaaaaaaaaaaae, 0xaaaaaaaaaaaaaaae },
{ 0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf,
0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf }
};
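Each entry of final_b above is the standard BMW "final" constant 0xaaaaaaaaaaaaaaa0 + i broadcast to all four 64-bit lanes; an equivalent run-time construction (illustrative only) would be:
// Illustrative equivalent of the static final_b table above.
__m256i final_b_rt[16];
for ( int i = 0; i < 16; i++ )
    final_b_rt[i] = _mm256_set1_epi64x( (long long)( 0xaaaaaaaaaaaaaaa0ULL + i ) );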
static void
bmw64_4way_init( bmw_4way_big_context *sc, const sph_u64 *iv )
{
for ( int i = 0; i < 16; i++ )
sc->H[i] = _mm256_set1_epi64x( iv[i] );
sc->ptr = 0;
sc->bit_count = 0;
}
static void
bmw64_4way( bmw_4way_big_context *sc, const void *data, size_t len )
{
__m256i *vdata = (__m256i*)data;
__m256i *buf;
__m256i htmp[16];
__m256i *h1, *h2;
size_t ptr;
const int buf_size = 128; // bytes of one lane, compatible with len
sc->bit_count += (sph_u64)len << 3;
buf = sc->buf;
ptr = sc->ptr;
h1 = sc->H;
h2 = htmp;
while ( len > 0 )
{
size_t clen;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_256( buf + (ptr>>3), vdata, clen >> 3 );
vdata = vdata + (clen>>3);
len -= clen;
ptr += clen;
if ( ptr == buf_size )
{
__m256i *ht;
compress_big( buf, h1, h2 );
ht = h1;
h1 = h2;
h2 = ht;
ptr = 0;
}
}
sc->ptr = ptr;
if ( h1 != sc->H )
memcpy_256( sc->H, h1, 16 );
}
static void
bmw64_4way_close(bmw_4way_big_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w64)
{
__m256i *buf;
__m256i h1[16], h2[16], *h;
size_t ptr, u, v;
unsigned z;
const int buf_size = 128; // bytes of one lane, compatible with len
buf = sc->buf;
ptr = sc->ptr;
z = 0x80 >> n;
buf[ ptr>>3 ] = _mm256_set1_epi64x( z );
ptr += 8;
h = sc->H;
if ( ptr > (buf_size - 8) )
{
memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
compress_big( buf, h, h1 );
ptr = 0;
h = h1;
}
memset_zero_256( buf + (ptr>>3), (buf_size - 8 - ptr) >> 3 );
buf[ (buf_size - 8) >> 3 ] = _mm256_set1_epi64x( sc->bit_count + n );
compress_big( buf, h, h2 );
for ( u = 0; u < 16; u ++ )
buf[u] = h2[u];
compress_big( buf, final_b, h1 );
for (u = 0, v = 16 - out_size_w64; u < out_size_w64; u ++, v ++)
casti_m256i(dst,u) = h1[v];
}
#endif
void
bmw256_4way_init(void *cc)
{
// bmw32_4way_init(cc, IV256);
}
void
bmw256_4way(void *cc, const void *data, size_t len)
{
// bmw32_4way(cc, data, len);
}
void
bmw256_4way_close(void *cc, void *dst)
{
// bmw256_4way_addbits_and_close(cc, 0, 0, dst);
}
void
bmw256_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
// bmw32_4way_close(cc, ub, n, dst, 8);
}
#if SPH_64
void
bmw512_4way_init(void *cc)
{
bmw64_4way_init(cc, IV512);
}
void
bmw512_4way(void *cc, const void *data, size_t len)
{
bmw64_4way(cc, data, len);
}
void
bmw512_4way_close(void *cc, void *dst)
{
bmw512_4way_addbits_and_close(cc, 0, 0, dst);
}
void
bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
bmw64_4way_close(cc, ub, n, dst, 8);
}
#endif
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -1,154 +0,0 @@
/* $Id: sph_bmw.h 216 2010-06-08 09:46:57Z tp $ */
/**
* BMW interface. BMW (aka "Blue Midnight Wish") is a family of
* functions which differ by their output size; this implementation
* defines BMW for output sizes 224, 256, 384 and 512 bits.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_bmw.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef BMW_HASH_H__
#define BMW_HASH_H__
#ifdef __cplusplus
extern "C"{
#endif
#include <stddef.h>
#ifdef __AVX2__
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
/**
* Output size (in bits) for BMW-224.
*/
#define SPH_SIZE_bmw224 224
/**
* Output size (in bits) for BMW-256.
*/
#define SPH_SIZE_bmw256 256
#if SPH_64
/**
* Output size (in bits) for BMW-384.
*/
#define SPH_SIZE_bmw384 384
/**
* Output size (in bits) for BMW-512.
*/
#define SPH_SIZE_bmw512 512
#endif
/**
* This structure is a context for BMW-224 and BMW-256 computations:
* it contains the intermediate values and some data from the last
* entered block. Once a BMW computation has been performed, the
* context can be reused for another computation.
*
* The contents of this structure are private. A running BMW
* computation can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[64]; /* first field, for alignment */
size_t ptr;
sph_u32 H[16];
#if SPH_64
sph_u64 bit_count;
#else
sph_u32 bit_count_high, bit_count_low;
#endif
#endif
} bmw_4way_small_context;
typedef bmw_4way_small_context bmw256_4way_context;
#if SPH_64
/**
* This structure is a context for BMW-384 and BMW-512 computations:
* it contains the intermediate values and some data from the last
* entered block. Once a BMW computation has been performed, the
* context can be reused for another computation.
*
* The contents of this structure are private. A running BMW
* computation can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
__m256i buf[16];
__m256i H[16];
// unsigned char buf[128]; /* first field, for alignment */
size_t ptr;
// sph_u64 H[16];
sph_u64 bit_count;
#endif
} bmw_4way_big_context;
typedef bmw_4way_big_context bmw512_4way_context;
#endif
void bmw256_4way_init(void *cc);
void bmw256_4way(void *cc, const void *data, size_t len);
void bmw256_4way_close(void *cc, void *dst);
void bmw256_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#if SPH_64
void bmw512_4way_init(void *cc);
void bmw512_4way(void *cc, const void *data, size_t len);
void bmw512_4way_close(void *cc, void *dst);
void bmw512_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#endif
#ifdef __cplusplus
}
#endif
#endif
#endif

View File

@@ -1,3 +1,4 @@
#include "miner.h"
#include "algo-gate-api.h"
#include <string.h>

View File

@@ -41,7 +41,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "algo/sha3/sph_types.h"
/**
* Output size (in bits) for BMW-224.

View File

@@ -2,6 +2,7 @@
// Distributed under the MIT/X11 software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#include "miner.h"
#include "algo-gate-api.h"
#if defined(__arm__) || defined(_MSC_VER)

View File

@@ -109,43 +109,43 @@ static __thread cryptonight_ctx ctx;
void cryptonight_hash_aes( void *restrict output, const void *input, int len )
{
#ifndef NO_AES_NI
keccak( (const uint8_t*)input, 76, (char*)&ctx.state.hs.b, 200 );
uint8_t ExpandedKey[256] __attribute__((aligned(64)));
__m128i *longoutput, *expkey, *xmminput;
size_t i, j;
keccak( (const uint8_t*)input, 76, (char*)&ctx.state.hs.b, 200 );
memcpy( ExpandedKey, ctx.state.hs.b, AES_KEY_SIZE );
ExpandAESKey256( ExpandedKey );
memcpy( ctx.text, ctx.state.init, INIT_SIZE_BYTE );
memcpy(ctx.text, ctx.state.init, INIT_SIZE_BYTE);
memcpy(ExpandedKey, ctx.state.hs.b, AES_KEY_SIZE);
ExpandAESKey256(ExpandedKey);
longoutput = (__m128i*)ctx.long_state;
xmminput = (__m128i*)ctx.text;
expkey = (__m128i*)ExpandedKey;
__m128i *longoutput, *expkey, *xmminput;
longoutput = (__m128i *)ctx.long_state;
expkey = (__m128i *)ExpandedKey;
xmminput = (__m128i *)ctx.text;
// prefetch expkey, xmminput and enough longoutput for 4 iterations
//for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE)
// aesni_parallel_noxor(&ctx->long_state[i], ctx->text, ExpandedKey);
// prefetch expkey, all of xmminput and enough longoutput for 4 loops
_mm_prefetch( xmminput, _MM_HINT_T0 );
_mm_prefetch( xmminput + 4, _MM_HINT_T0 );
for ( i = 0; i < 64; i += 16 )
{
_mm_prefetch( longoutput + i, _MM_HINT_T0 );
_mm_prefetch( longoutput + i + 4, _MM_HINT_T0 );
_mm_prefetch( longoutput + i + 8, _MM_HINT_T0 );
_mm_prefetch( longoutput + i + 12, _MM_HINT_T0 );
}
_mm_prefetch( expkey, _MM_HINT_T0 );
_mm_prefetch( expkey + 4, _MM_HINT_T0 );
_mm_prefetch( expkey + 8, _MM_HINT_T0 );
for ( i = 0; i < 64; i += 16 )
{
__builtin_prefetch( longoutput + i, 1, 0 );
__builtin_prefetch( longoutput + i + 4, 1, 0 );
__builtin_prefetch( longoutput + i + 8, 1, 0 );
__builtin_prefetch( longoutput + i + 12, 1, 0 );
}
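// __builtin_prefetch( addr, 1, 0 ): the second argument (1) hints that the
// cache lines will be written, the third (0) marks them as non-temporal,
// i.e. not expected to be reused after the write.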
// n-4 iterations
for ( i = 0; likely( i < MEMORY_M128I - 4*INIT_SIZE_M128I );
i += INIT_SIZE_M128I )
for ( i = 0; likely( i < MEMORY_M128I ); i += INIT_SIZE_M128I )
{
// prefetch 4 iterations ahead.
// prefetch 4 loops ahead,
__builtin_prefetch( longoutput + i + 64, 1, 0 );
__builtin_prefetch( longoutput + i + 68, 1, 0 );
for ( j = 0; j < 10; j++ )
for (j = 0; j < 10; j++ )
{
xmminput[0] = _mm_aesenc_si128( xmminput[0], expkey[j] );
xmminput[1] = _mm_aesenc_si128( xmminput[1], expkey[j] );
@@ -165,99 +165,84 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
_mm_store_si128( &( longoutput[i+6] ), xmminput[6] );
_mm_store_si128( &( longoutput[i+7] ), xmminput[7] );
}
// last 4 iterations
for ( ; likely( i < MEMORY_M128I ); i += INIT_SIZE_M128I )
{
for ( j = 0; j < 10; j++ )
{
xmminput[0] = _mm_aesenc_si128( xmminput[0], expkey[j] );
xmminput[1] = _mm_aesenc_si128( xmminput[1], expkey[j] );
xmminput[2] = _mm_aesenc_si128( xmminput[2], expkey[j] );
xmminput[3] = _mm_aesenc_si128( xmminput[3], expkey[j] );
xmminput[4] = _mm_aesenc_si128( xmminput[4], expkey[j] );
xmminput[5] = _mm_aesenc_si128( xmminput[5], expkey[j] );
xmminput[6] = _mm_aesenc_si128( xmminput[6], expkey[j] );
xmminput[7] = _mm_aesenc_si128( xmminput[7], expkey[j] );
}
_mm_store_si128( &( longoutput[i ] ), xmminput[0] );
_mm_store_si128( &( longoutput[i+1] ), xmminput[1] );
_mm_store_si128( &( longoutput[i+2] ), xmminput[2] );
_mm_store_si128( &( longoutput[i+3] ), xmminput[3] );
_mm_store_si128( &( longoutput[i+4] ), xmminput[4] );
_mm_store_si128( &( longoutput[i+5] ), xmminput[5] );
_mm_store_si128( &( longoutput[i+6] ), xmminput[6] );
_mm_store_si128( &( longoutput[i+7] ), xmminput[7] );
}
ctx.a[0] = ((uint64_t *)ctx.state.k)[0] ^ ((uint64_t *)ctx.state.k)[4];
ctx.b[0] = ((uint64_t *)ctx.state.k)[2] ^ ((uint64_t *)ctx.state.k)[6];
ctx.a[1] = ((uint64_t *)ctx.state.k)[1] ^ ((uint64_t *)ctx.state.k)[5];
ctx.b[1] = ((uint64_t *)ctx.state.k)[3] ^ ((uint64_t *)ctx.state.k)[7];
// cast_m128i( ctx.a ) = _mm_xor_si128( casti_m128i( ctx.state.k, 0 ) ,
// casti_m128i( ctx.state.k, 2 ) );
// cast_m128i( ctx.b ) = _mm_xor_si128( casti_m128i( ctx.state.k, 1 ),
// casti_m128i( ctx.state.k, 3 ) );
uint64_t a[2] __attribute((aligned(16))),
b[2] __attribute((aligned(16))),
c[2] __attribute((aligned(16)));
ctx.a[0] = ((uint64_t *)ctx.state.k)[0] ^ ((uint64_t *)ctx.state.k)[4];
ctx.b[0] = ((uint64_t *)ctx.state.k)[2] ^ ((uint64_t *)ctx.state.k)[6];
ctx.a[1] = ((uint64_t *)ctx.state.k)[1] ^ ((uint64_t *)ctx.state.k)[5];
ctx.b[1] = ((uint64_t *)ctx.state.k)[3] ^ ((uint64_t *)ctx.state.k)[7];
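// Per the CryptoNight spec: a = K[0..15] ^ K[32..47] and b = K[16..31] ^ K[48..63],
// where K is the first 64 bytes of the Keccak-1600 state.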
// for (i = 0; i < 2; i++)
// {
// ctx.a[i] = ((uint64_t *)ctx.state.k)[i] ^ ((uint64_t *)ctx.state.k)[i+4];
// ctx.b[i] = ((uint64_t *)ctx.state.k)[i+2] ^ ((uint64_t *)ctx.state.k)[i+6];
// }
__m128i b_x = _mm_load_si128((__m128i *)ctx.b);
uint64_t a[2] __attribute((aligned(16))), b[2] __attribute((aligned(16)));
a[0] = ctx.a[0];
a[1] = ctx.a[1];
__m128i b_x = _mm_load_si128( (__m128i*)ctx.b );
__m128i a_x = _mm_load_si128( (__m128i*)a );
__m128i* lsa = (__m128i*)&ctx.long_state[ a[0] & 0x1FFFF0 ];
__m128i c_x = _mm_load_si128( lsa );
uint64_t *nextblock;
uint64_t hi, lo;
// n-1 iterations
for( i = 0; __builtin_expect( i < 0x7ffff, 1 ); i++ )
for(i = 0; __builtin_expect(i < 0x80000, 1); i++)
{
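// CryptoNight memory-hard main loop: 524288 (2^19) iterations in total over
// the 2 MiB scratchpad; the mask 0x1FFFF0 selects a 16-byte aligned offset
// within long_state.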
c_x = _mm_aesenc_si128( c_x, a_x );
_mm_store_si128( (__m128i*)c, c_x );
b_x = _mm_xor_si128( b_x, c_x );
nextblock = (uint64_t *)&ctx.long_state[c[0] & 0x1FFFF0];
_mm_store_si128( lsa, b_x );
uint64_t c[2];
__builtin_prefetch( &ctx.long_state[c[0] & 0x1FFFF0], 0, 1 );
__m128i c_x = _mm_load_si128(
(__m128i *)&ctx.long_state[a[0] & 0x1FFFF0]);
__m128i a_x = _mm_load_si128((__m128i *)a);
c_x = _mm_aesenc_si128(c_x, a_x);
_mm_store_si128((__m128i *)c, c_x);
b_x = _mm_xor_si128(b_x, c_x);
_mm_store_si128((__m128i *)&ctx.long_state[a[0] & 0x1FFFF0], b_x);
uint64_t *nextblock = (uint64_t *)&ctx.long_state[c[0] & 0x1FFFF0];
// uint64_t b[2];
b[0] = nextblock[0];
b[1] = nextblock[1];
// hi,lo = 64bit x 64bit multiply of c[0] and b[0]
__asm__( "mulq %3\n\t"
: "=d" ( hi ),
"=a" ( lo )
: "%a" ( c[0] ),
"rm" ( b[0] )
: "cc" );
{
uint64_t hi, lo;
// hi,lo = 64bit x 64bit multiply of c[0] and b[0]
b_x = c_x;
nextblock[0] = a[0] + hi;
nextblock[1] = a[1] + lo;
a[0] = b[0] ^ nextblock[0];
a[1] = b[1] ^ nextblock[1];
lsa = (__m128i*)&ctx.long_state[ a[0] & 0x1FFFF0 ];
a_x = _mm_load_si128( (__m128i*)a );
c_x = _mm_load_si128( lsa );
__asm__("mulq %3\n\t"
: "=d" (hi),
"=a" (lo)
: "%a" (c[0]),
"rm" (b[0])
: "cc" );
a[0] += hi;
a[1] += lo;
}
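// Illustrative portable equivalent of the mulq asm above, assuming a compiler
// with unsigned __int128 support (e.g. gcc/clang on x86_64); not part of this
// commit:
// unsigned __int128 prod = (unsigned __int128)c[0] * b[0];
// hi = (uint64_t)( prod >> 64 );
// lo = (uint64_t)prod;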
uint64_t *dst = (uint64_t*)&ctx.long_state[c[0] & 0x1FFFF0];
// __m128i *dst = (__m128i*)&ctx.long_state[c[0] & 0x1FFFF0];
// *dst = cast_m128i( a );
dst[0] = a[0];
dst[1] = a[1];
// cast_m128i( a ) = _mm_xor_si128( cast_m128i( a ), cast_m128i( b ) );
a[0] ^= b[0];
a[1] ^= b[1];
b_x = c_x;
__builtin_prefetch( &ctx.long_state[a[0] & 0x1FFFF0], 0, 3 );
}
// abbreviated nth iteration
c_x = _mm_aesenc_si128( c_x, a_x );
_mm_store_si128( (__m128i*)c, c_x );
b_x = _mm_xor_si128( b_x, c_x );
nextblock = (uint64_t *)&ctx.long_state[c[0] & 0x1FFFF0];
_mm_store_si128( lsa, b_x );
b[0] = nextblock[0];
b[1] = nextblock[1];
__asm__( "mulq %3\n\t"
: "=d" ( hi ),
"=a" ( lo )
: "%a" ( c[0] ),
"rm" ( b[0] )
: "cc" );
nextblock[0] = a[0] + hi;
nextblock[1] = a[1] + lo;
memcpy( ctx.text, ctx.state.init, INIT_SIZE_BYTE );
memcpy( ExpandedKey, &ctx.state.hs.b[32], AES_KEY_SIZE );
ExpandAESKey256( ExpandedKey );
memcpy( ctx.text, ctx.state.init, INIT_SIZE_BYTE );
//for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE)
// aesni_parallel_xor(&ctx->text, ExpandedKey, &ctx->long_state[i]);
// prefetch expkey, all of xmminput and enough longoutput for 4 loops
_mm_prefetch( xmminput, _MM_HINT_T0 );
_mm_prefetch( xmminput + 4, _MM_HINT_T0 );
for ( i = 0; i < 64; i += 16 )
@@ -271,11 +256,9 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
_mm_prefetch( expkey + 4, _MM_HINT_T0 );
_mm_prefetch( expkey + 8, _MM_HINT_T0 );
// n-4 iterations
for ( i = 0; likely( i < MEMORY_M128I - 4*INIT_SIZE_M128I );
i += INIT_SIZE_M128I )
for ( i = 0; likely( i < MEMORY_M128I ); i += INIT_SIZE_M128I )
{
// stay 4 iterations ahead.
// stay 4 loops ahead,
_mm_prefetch( longoutput + i + 64, _MM_HINT_T0 );
_mm_prefetch( longoutput + i + 68, _MM_HINT_T0 );
@@ -300,34 +283,10 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
xmminput[7] = _mm_aesenc_si128( xmminput[7], expkey[j] );
}
}
// last 4 iterations
for ( ; likely( i < MEMORY_M128I ); i += INIT_SIZE_M128I )
{
xmminput[0] = _mm_xor_si128( longoutput[i ], xmminput[0] );
xmminput[1] = _mm_xor_si128( longoutput[i+1], xmminput[1] );
xmminput[2] = _mm_xor_si128( longoutput[i+2], xmminput[2] );
xmminput[3] = _mm_xor_si128( longoutput[i+3], xmminput[3] );
xmminput[4] = _mm_xor_si128( longoutput[i+4], xmminput[4] );
xmminput[5] = _mm_xor_si128( longoutput[i+5], xmminput[5] );
xmminput[6] = _mm_xor_si128( longoutput[i+6], xmminput[6] );
xmminput[7] = _mm_xor_si128( longoutput[i+7], xmminput[7] );
for( j = 0; j < 10; j++ )
{
xmminput[0] = _mm_aesenc_si128( xmminput[0], expkey[j] );
xmminput[1] = _mm_aesenc_si128( xmminput[1], expkey[j] );
xmminput[2] = _mm_aesenc_si128( xmminput[2], expkey[j] );
xmminput[3] = _mm_aesenc_si128( xmminput[3], expkey[j] );
xmminput[4] = _mm_aesenc_si128( xmminput[4], expkey[j] );
xmminput[5] = _mm_aesenc_si128( xmminput[5], expkey[j] );
xmminput[6] = _mm_aesenc_si128( xmminput[6], expkey[j] );
xmminput[7] = _mm_aesenc_si128( xmminput[7], expkey[j] );
}
}
memcpy( ctx.state.init, ctx.text, INIT_SIZE_BYTE);
keccakf( (uint64_t*)&ctx.state.hs.w, 24 );
extra_hashes[ctx.state.hs.b[0] & 3](&ctx.state, 200, output);
extra_hashes[ctx.state.hs.b[0] & 3](&ctx.state, 200, output);
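// CryptoNight finalisation: the low 2 bits of the first Keccak state byte
// select the final 256-bit hash: 0 = BLAKE-256, 1 = Groestl-256, 2 = JH-256,
// 3 = Skein-256.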
#endif
}

View File

@@ -5,6 +5,7 @@
// Modified for CPUminer by Lucas Jones
#include "cpuminer-config.h"
//#include "miner.h"
#include "algo-gate-api.h"
#ifndef NO_AES_NI

View File

@@ -42,7 +42,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "algo/sha3/sph_types.h"
/**
* Output size (in bits) for CubeHash-224.

View File

@@ -9,7 +9,11 @@
#include <immintrin.h>
#endif
#include "cubehash_sse2.h"
#include "algo/sha/sha3-defs.h"
#include "algo/sha3/sha3-defs.h"
//enum { SUCCESS = 0, FAIL = 1, BAD_HASHBITLEN = 2 };
//#if defined(OPTIMIZE_SSE2)
static void transform( cubehashParam *sp )
{
@@ -139,71 +143,72 @@ int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes)
if ( blockbytes <= 0 || blockbytes >= 256)
blockbytes = CUBEHASH_BLOCKBYTES;
// all sizes of __m128i
sp->hashlen = hashbitlen/128;
sp->blocksize = blockbytes/16;
sp->rounds = rounds;
sp->pos = 0;
sp->hashbitlen = hashbitlen;
sp->rounds = rounds;
sp->blockbytes = blockbytes;
for ( i = 0; i < 8; ++i )
sp->x[i] = _mm_set_epi32(0, 0, 0, 0);
sp->x[0] = _mm_set_epi32( 0, rounds, blockbytes, hashbitlen / 8 );
sp->x[0] = _mm_set_epi32(0, sp->rounds, sp->blockbytes, hashbitlen / 8);
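// CubeHash IV derivation: x0 = h/8, x1 = b, x2 = r (all other state words
// zero), followed by 10*r rounds, i.e. the 10 transform() calls below.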
for ( i = 0; i < 10; ++i )
transform(sp);
// sp->pos = 0;
sp->pos = 0;
return SUCCESS;
}
int
cubehashReset(cubehashParam *sp)
{
return cubehashInit(sp, sp->hashbitlen, sp->rounds, sp->blockbytes);
}
int cubehashUpdate( cubehashParam *sp, const byte *data, size_t size )
{
const int len = size / 16;
const __m128i* in = (__m128i*)data;
int i;
uint64_t databitlen = 8 * size;
// It is assumed data is aligned to 256 bits and is a multiple of 128 bits.
// Current usage data is either 64 or 80 bytes.
/* caller promises us that previous data had integral number of bytes */
/* so sp->pos is a multiple of 8 */
for ( i = 0; i < len; i++ )
while ( databitlen >= 8 )
{
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ], in[i] );
sp->pos++;
if ( sp->pos == sp->blocksize )
( (unsigned char *)sp->x )[sp->pos/8] ^= *data;
data += 1;
databitlen -= 8;
sp->pos += 8;
if ( sp->pos == 8 * sp->blockbytes )
{
transform( sp );
sp->pos = 0;
}
transform( sp );
sp->pos = 0;
}
}
if ( databitlen > 0 )
{
( (unsigned char *)sp->x )[sp->pos/8] ^= *data;
sp->pos += databitlen;
}
return SUCCESS;
}
int cubehashDigest( cubehashParam *sp, byte *digest )
{
__m128i* hash = (__m128i*)digest;
int i;
// pos is zero for 64 byte data, 1 for 80 byte data.
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
_mm_set_epi8( 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 ) );
transform( sp );
( (unsigned char *)sp->x )[sp->pos/8] ^= ( 128 >> (sp->pos % 8) );
transform(sp);
sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi32( 1,0,0,0 ) );
transform( sp );
transform( sp );
transform( sp );
transform( sp );
transform( sp );
transform( sp );
transform( sp );
transform( sp );
transform( sp );
transform( sp );
sp->x[7] = _mm_xor_si128(sp->x[7], _mm_set_epi32(1, 0, 0, 0));
transform(sp);
transform(sp);
transform(sp);
transform(sp);
transform(sp);
transform(sp);
transform(sp);
transform(sp);
transform(sp);
transform(sp);
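// CubeHash finalisation: append a 1 bit after the data, xor 1 into the last
// state word (x[31]), then run 10*r rounds before reading the digest.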
for ( i = 0; i < sp->hashlen; i++ )
hash[i] = sp->x[i];
for ( i = 0; i < sp->hashbitlen / 8; ++i )
digest[i] = ((unsigned char *) sp->x)[i];
return SUCCESS;
}
@@ -211,45 +216,48 @@ int cubehashDigest( cubehashParam *sp, byte *digest )
int cubehashUpdateDigest( cubehashParam *sp, byte *digest,
const byte *data, size_t size )
{
const int len = size / 16;
const __m128i* in = (__m128i*)data;
__m128i* hash = (__m128i*)digest;
uint64_t databitlen = 8 * size;
int hashlen128 = sp->hashbitlen/128;
int i;
// It is assumed data is aligned to 256 bits and is a multiple of 128 bits.
// Current usage data is either 64 or 80 bytes.
/* caller promises us that previous data had integral number of bytes */
/* so sp->pos is a multiple of 8 */
for ( i = 0; i < len; i++ )
while ( databitlen >= 8 )
{
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ], in[i] );
sp->pos++;
if ( sp->pos == sp->blocksize )
( (unsigned char *)sp->x )[sp->pos/8] ^= *data;
data += 1;
databitlen -= 8;
sp->pos += 8;
if ( sp->pos == 8 * sp->blockbytes )
{
transform( sp );
sp->pos = 0;
transform(sp);
sp->pos = 0;
}
}
if ( databitlen > 0 )
{
( (unsigned char *)sp->x )[sp->pos/8] ^= *data;
sp->pos += databitlen;
}
// pos is zero for 64 byte data, 1 for 80 byte data.
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
_mm_set_epi8( 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 ) );
( (unsigned char *)sp->x )[sp->pos/8] ^= ( 128 >> (sp->pos % 8) );
transform( sp );
sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi32( 1,0,0,0 ) );
transform( sp );
transform( sp );
transform( sp );
transform( sp );
transform( sp );
transform( sp );
transform( sp );
transform( sp );
transform( sp );
transform( sp );
sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi32(1,0,0,0) );
transform(sp);
transform(sp);
transform(sp);
transform(sp);
transform(sp);
transform(sp);
transform(sp);
transform(sp);
transform(sp);
transform(sp);
for ( i = 0; i < sp->hashlen; i++ )
hash[i] = sp->x[i];
for ( i = 0; i < hashlen128; i++ )
( (__m128i*)digest )[i] = ( (__m128i*)sp->x )[i];
return SUCCESS;
}

View File

@@ -3,35 +3,58 @@
#include "compat.h"
#include <stdint.h>
#include "algo/sha/sha3-defs.h"
#include "algo/sha3/sha3-defs.h"
//#include <beecrypt/beecrypt.h>
//#if defined(__SSE2__)
#define OPTIMIZE_SSE2
//#endif
#if defined(OPTIMIZE_SSE2)
#include <emmintrin.h>
#endif
/*!\brief Holds all the parameters necessary for the CUBEHASH algorithm.
* \ingroup HASH_cubehash_m
*/
struct _cubehashParam
//#endif
{
int hashlen; // __m128i
int hashbitlen;
int rounds;
int blocksize; // __m128i
int pos; // number of __m128i read into x from current block
__m128i _ALIGN(256) x[8]; // aligned for __m256i
int blockbytes;
int pos; /* number of bits read into x from current block */
#if defined(OPTIMIZE_SSE2)
__m128i _ALIGN(256) x[8];
#else
uint32_t x[32];
#endif
};
//#ifndef __cplusplus
typedef struct _cubehashParam cubehashParam;
//#endif
#ifdef __cplusplus
extern "C" {
#endif
/*!\var cubehash256
* \brief Holds the full API description of the CUBEHASH algorithm.
*/
//extern BEECRYPTAPI const hashFunction cubehash256;
//BEECRYPTAPI
int cubehashInit(cubehashParam* sp, int hashbitlen, int rounds, int blockbytes);
//BEECRYPTAPI
int cubehashReset(cubehashParam* sp);
//BEECRYPTAPI
int cubehashUpdate(cubehashParam* sp, const byte *data, size_t size);
//BEECRYPTAPI
int cubehashDigest(cubehashParam* sp, byte *digest);
int cubehashUpdateDigest( cubehashParam *sp, byte *digest, const byte *data,

View File

@@ -32,6 +32,7 @@
#define POK_BOOL_MASK 0x00008000
#define POK_DATA_MASK 0xFFFF0000
#include "miner.h"
#include "algo-gate-api.h"
#include <string.h>
@@ -244,12 +245,12 @@ bool register_drop_algo( algo_gate_t* gate )
algo_not_tested();
gate->scanhash = (void*)&scanhash_drop;
gate->hash = (void*)&droplp_hash_pok;
gate->hash_alt = (void*)&droplp_hash_pok;
gate->hash_suw = (void*)&droplp_hash_pok;
gate->get_new_work = (void*)&drop_get_new_work;
gate->set_target = (void*)&scrypt_set_target;
gate->build_stratum_request = (void*)&std_be_build_stratum_request;
gate->work_decode = (void*)&std_be_work_decode;
gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
gate->set_work_data_endian = (void*)&set_work_data_big_endian;
gate->set_work_data_endian = (void*)&swab_work_data;
gate->display_extra_data = (void*)&drop_display_pok;
gate->work_data_size = 80;
gate->work_cmp_size = 72;

View File

@@ -22,7 +22,7 @@
#endif
#include "algo/sha/sha3_common.h"
#include "algo/sha3/sha3_common.h"
#include <emmintrin.h>

View File

@@ -16,7 +16,7 @@
#ifndef VPERM_H
#define VPERM_H
#include "algo/sha/sha3_common.h"
#include "algo/sha3/sha3_common.h"
#include <tmmintrin.h>
/*
@@ -53,12 +53,11 @@ extern const unsigned int _k_aesmix4[];
x = _mm_shuffle_epi8(*((__m128i*)table + 0), x);\
x = _mm_xor_si128(x, t1)
#if 0
// compiles erroneously with the 32-bit MSC compiler
t2 = _mm_shuffle_epi8(table[0], x);\
x = _mm_shuffle_epi8(table[1], t1);\
x = _mm_xor_si128(x, t2)
#endif
//t2 = _mm_shuffle_epi8(table[0], x);\
//x = _mm_shuffle_epi8(table[1], t1);\
//x = _mm_xor_si128(x, t2)
// input: x
// output: t2, t3

View File

@@ -71,7 +71,7 @@ extern "C"{
#endif
#define AES_BIG_ENDIAN 0
#include "algo/sha/aes_helper.c"
#include "algo/sha3/aes_helper.c"
#if SPH_ECHO_64

View File

@@ -41,7 +41,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "algo/sha3/sph_types.h"
/**
* Output size (in bits) for ECHO-224.

View File

@@ -41,7 +41,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "algo/sha3/sph_types.h"
/**
* Output size (in bits) for ECHO-224.

View File

@@ -1,3 +1,4 @@
#include "miner.h"
#include "algo-gate-api.h"
#include <stdlib.h>
@@ -130,6 +131,7 @@ bool register_fresh_algo( algo_gate_t* gate )
algo_not_tested();
gate->scanhash = (void*)&scanhash_fresh;
gate->hash = (void*)&freshhash;
gate->hash_alt = (void*)&freshhash;
gate->set_target = (void*)&fresh_set_target;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;

View File

@@ -2,7 +2,7 @@
#define SPH_FUGUE_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "algo/sha3/sph_types.h"
#ifdef __cplusplus
extern "C"{

View File

@@ -41,7 +41,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "algo/sha3/sph_types.h"
/**
* Output size (in bits) for GOST-256.

View File

@@ -13,8 +13,8 @@
/* global constants */
__m128i ROUND_CONST_Lx;
//__m128i ROUND_CONST_L0[ROUNDS512];
//__m128i ROUND_CONST_L7[ROUNDS512];
__m128i ROUND_CONST_L0[ROUNDS512];
__m128i ROUND_CONST_L7[ROUNDS512];
__m128i ROUND_CONST_P[ROUNDS1024];
__m128i ROUND_CONST_Q[ROUNDS1024];
__m128i TRANSP_MASK;
@@ -22,9 +22,11 @@ __m128i SUBSH_MASK[8];
__m128i ALL_1B;
__m128i ALL_FF;
#define tos(a) #a
#define tostr(a) tos(a)
/* xmm[i] will be multiplied by 2
* xmm[j] will be lost
* xmm[k] has to be all 0x1b */
@@ -151,6 +153,352 @@ __m128i ALL_FF;
b1 = _mm_xor_si128(b1, a4);\
}/*MixBytes*/
#if (LENGTH <= 256)
#define SET_CONSTANTS(){\
ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\
SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\
SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\
SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\
SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\
SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\
SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\
SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\
for(i = 0; i < ROUNDS512; i++)\
{\
ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\
}\
ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\
}while(0); \
/* one round
* i = round number
* a0-a7 = input rows
* b0-b7 = output rows
*/
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* AddRoundConstant */\
b1 = ROUND_CONST_Lx;\
a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\
a1 = _mm_xor_si128(a1, b1);\
a2 = _mm_xor_si128(a2, b1);\
a3 = _mm_xor_si128(a3, b1);\
a4 = _mm_xor_si128(a4, b1);\
a5 = _mm_xor_si128(a5, b1);\
a6 = _mm_xor_si128(a6, b1);\
a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\
\
/* ShiftBytes + SubBytes (interleaved) */\
b0 = _mm_xor_si128(b0, b0);\
a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\
a0 = _mm_aesenclast_si128(a0, b0);\
a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\
a1 = _mm_aesenclast_si128(a1, b0);\
a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\
a2 = _mm_aesenclast_si128(a2, b0);\
a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\
a3 = _mm_aesenclast_si128(a3, b0);\
a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\
a4 = _mm_aesenclast_si128(a4, b0);\
a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\
a5 = _mm_aesenclast_si128(a5, b0);\
a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\
a6 = _mm_aesenclast_si128(a6, b0);\
a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\
a7 = _mm_aesenclast_si128(a7, b0);\
\
/* MixBytes */\
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
\
}
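/* Note: _mm_aesenclast_si128 with an all-zero round key applies only the AES
 * ShiftRows and SubBytes steps (no MixColumns); the SUBSH_MASK shuffles above
 * are chosen so that the combination yields Groestl's ShiftBytes + SubBytes. */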
/* 10 rounds, P and Q in parallel */
#define ROUNDS_P_Q(){\
ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
}
/* Matrix Transpose Step 1
* input is a 512-bit state with two columns in one xmm
* output is a 512-bit state with two rows in one xmm
* inputs: i0-i3
* outputs: i0, o1-o3
* clobbers: t0
*/
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
t0 = TRANSP_MASK;\
\
i0 = _mm_shuffle_epi8(i0, t0);\
i1 = _mm_shuffle_epi8(i1, t0);\
i2 = _mm_shuffle_epi8(i2, t0);\
i3 = _mm_shuffle_epi8(i3, t0);\
\
o1 = i0;\
t0 = i2;\
\
i0 = _mm_unpacklo_epi16(i0, i1);\
o1 = _mm_unpackhi_epi16(o1, i1);\
i2 = _mm_unpacklo_epi16(i2, i3);\
t0 = _mm_unpackhi_epi16(t0, i3);\
\
i0 = _mm_shuffle_epi32(i0, 216);\
o1 = _mm_shuffle_epi32(o1, 216);\
i2 = _mm_shuffle_epi32(i2, 216);\
t0 = _mm_shuffle_epi32(t0, 216);\
\
o2 = i0;\
o3 = o1;\
\
i0 = _mm_unpacklo_epi32(i0, i2);\
o1 = _mm_unpacklo_epi32(o1, t0);\
o2 = _mm_unpackhi_epi32(o2, i2);\
o3 = _mm_unpackhi_epi32(o3, t0);\
}/**/
/* Matrix Transpose Step 2
* input are two 512-bit states with two rows in one xmm
* output are two 512-bit states with one row of each state in one xmm
* inputs: i0-i3 = P, i4-i7 = Q
* outputs: (i0, o1-o7) = (P|Q)
* possible reassignments: (output reg = input reg)
* * i1 -> o3-7
* * i2 -> o5-7
* * i3 -> o7
* * i4 -> o3-7
* * i5 -> o6-7
*/
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
o1 = i0;\
o2 = i1;\
i0 = _mm_unpacklo_epi64(i0, i4);\
o1 = _mm_unpackhi_epi64(o1, i4);\
o3 = i1;\
o4 = i2;\
o2 = _mm_unpacklo_epi64(o2, i5);\
o3 = _mm_unpackhi_epi64(o3, i5);\
o5 = i2;\
o6 = i3;\
o4 = _mm_unpacklo_epi64(o4, i6);\
o5 = _mm_unpackhi_epi64(o5, i6);\
o7 = i3;\
o6 = _mm_unpacklo_epi64(o6, i7);\
o7 = _mm_unpackhi_epi64(o7, i7);\
}/**/
/* Matrix Transpose Inverse Step 2
* input are two 512-bit states with one row of each state in one xmm
* output are two 512-bit states with two rows in one xmm
* inputs: i0-i7 = (P|Q)
* outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
*/
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
o0 = i0;\
i0 = _mm_unpacklo_epi64(i0, i1);\
o0 = _mm_unpackhi_epi64(o0, i1);\
o1 = i2;\
i2 = _mm_unpacklo_epi64(i2, i3);\
o1 = _mm_unpackhi_epi64(o1, i3);\
o2 = i4;\
i4 = _mm_unpacklo_epi64(i4, i5);\
o2 = _mm_unpackhi_epi64(o2, i5);\
o3 = i6;\
i6 = _mm_unpacklo_epi64(i6, i7);\
o3 = _mm_unpackhi_epi64(o3, i7);\
}/**/
/* Matrix Transpose Output Step 2
* input is one 512-bit state with two rows in one xmm
* output is one 512-bit state with one row in the low 64-bits of one xmm
* inputs: i0,i2,i4,i6 = S
* outputs: (i0-7) = (0|S)
*/
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
t0 = _mm_xor_si128(t0, t0);\
i1 = i0;\
i3 = i2;\
i5 = i4;\
i7 = i6;\
i0 = _mm_unpacklo_epi64(i0, t0);\
i1 = _mm_unpackhi_epi64(i1, t0);\
i2 = _mm_unpacklo_epi64(i2, t0);\
i3 = _mm_unpackhi_epi64(i3, t0);\
i4 = _mm_unpacklo_epi64(i4, t0);\
i5 = _mm_unpackhi_epi64(i5, t0);\
i6 = _mm_unpacklo_epi64(i6, t0);\
i7 = _mm_unpackhi_epi64(i7, t0);\
}/**/
/* Matrix Transpose Output Inverse Step 2
* input is one 512-bit state with one row in the low 64-bits of one xmm
* output is one 512-bit state with two rows in one xmm
* inputs: i0-i7 = (0|S)
* outputs: (i0, i2, i4, i6) = S
*/
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
i0 = _mm_unpacklo_epi64(i0, i1);\
i2 = _mm_unpacklo_epi64(i2, i3);\
i4 = _mm_unpacklo_epi64(i4, i5);\
i6 = _mm_unpacklo_epi64(i6, i7);\
}/**/
void INIT(u64* h)
{
__m128i* const chaining = (__m128i*) h;
static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7;
static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15;
/* load IV into registers xmm12 - xmm15 */
xmm12 = chaining[0];
xmm13 = chaining[1];
xmm14 = chaining[2];
xmm15 = chaining[3];
/* transform chaining value from column ordering into row ordering */
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
/* store transposed IV */
chaining[0] = xmm12;
chaining[1] = xmm2;
chaining[2] = xmm6;
chaining[3] = xmm7;
}
void TF512(u64* h, u64* m)
{
__m128i* const chaining = (__m128i*) h;
__m128i* const message = (__m128i*) m;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i TEMP0;
static __m128i TEMP1;
static __m128i TEMP2;
#ifdef IACA_TRACE
IACA_START;
#endif
/* load message into registers xmm12 - xmm15 */
xmm12 = message[0];
xmm13 = message[1];
xmm14 = message[2];
xmm15 = message[3];
/* transform message M from column ordering into row ordering */
/* we first put two rows (64 bit) of the message into one 128-bit xmm register */
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
/* load previous chaining value */
/* we first put two rows (64 bit) of the CV into one 128-bit xmm register */
xmm8 = chaining[0];
xmm0 = chaining[1];
xmm4 = chaining[2];
xmm5 = chaining[3];
/* xor message to CV get input of P */
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
xmm8 = _mm_xor_si128(xmm8, xmm12);
xmm0 = _mm_xor_si128(xmm0, xmm2);
xmm4 = _mm_xor_si128(xmm4, xmm6);
xmm5 = _mm_xor_si128(xmm5, xmm7);
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
/* result: the 8 rows of P and Q in xmm8 - xmm12 */
Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
/* compute the two permutations P and Q in parallel */
ROUNDS_P_Q();
/* unpack again to get two rows of P or two rows of Q in one xmm register */
Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3);
/* xor output of P and Q */
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
xmm0 = _mm_xor_si128(xmm0, xmm8);
xmm1 = _mm_xor_si128(xmm1, xmm10);
xmm2 = _mm_xor_si128(xmm2, xmm12);
xmm3 = _mm_xor_si128(xmm3, xmm14);
/* xor CV (feed-forward) */
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
xmm0 = _mm_xor_si128(xmm0, (chaining[0]));
xmm1 = _mm_xor_si128(xmm1, (chaining[1]));
xmm2 = _mm_xor_si128(xmm2, (chaining[2]));
xmm3 = _mm_xor_si128(xmm3, (chaining[3]));
/* store CV */
chaining[0] = xmm0;
chaining[1] = xmm1;
chaining[2] = xmm2;
chaining[3] = xmm3;
#ifdef IACA_TRACE
IACA_END;
#endif
return;
}
void OF512(u64* h)
{
__m128i* const chaining = (__m128i*) h;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i TEMP0;
static __m128i TEMP1;
static __m128i TEMP2;
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
xmm8 = chaining[0];
xmm10 = chaining[1];
xmm12 = chaining[2];
xmm14 = chaining[3];
/* there are now 2 rows of the CV in one xmm register */
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
/* result: the 8 input rows of P in xmm8 - xmm15 */
Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0);
/* compute the permutation P */
/* result: the output of P(CV) in xmm8 - xmm15 */
ROUNDS_P_Q();
/* unpack again to get two rows of P in one xmm register */
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
/* xor CV to P output (feed-forward) */
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
xmm10 = _mm_xor_si128(xmm10, (chaining[1]));
xmm12 = _mm_xor_si128(xmm12, (chaining[2]));
xmm14 = _mm_xor_si128(xmm14, (chaining[3]));
/* transform state back from row ordering into column ordering */
/* result: final hash value in xmm9, xmm11 */
Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0);
/* we only need to return the truncated half of the state */
chaining[2] = xmm9;
chaining[3] = xmm11;
}
#endif
#if (LENGTH > 256)
#define SET_CONSTANTS(){\
ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\
@@ -420,8 +768,9 @@ __m128i ALL_FF;
}/**/
void INIT( __m128i* chaining )
void INIT(u64* h)
{
__m128i* const chaining = (__m128i*) h;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
@@ -449,8 +798,10 @@ void INIT( __m128i* chaining )
chaining[7] = xmm15;
}
void TF1024( __m128i* chaining, const __m128i* message )
void TF1024(u64* h, u64* m)
{
__m128i* const chaining = (__m128i*) h;
__m128i* const message = (__m128i*) m;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i QTEMP[8];
@@ -563,8 +914,9 @@ void TF1024( __m128i* chaining, const __m128i* message )
return;
}
void OF1024( __m128i* chaining )
void OF1024(u64* h)
{
__m128i* const chaining = (__m128i*) h;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i TEMP0;
@@ -609,3 +961,5 @@ void OF1024( __m128i* chaining )
return;
}
#endif

View File

@@ -15,8 +15,8 @@
__m128i ROUND_CONST_Lx;
__m128i ROUND_CONST_L0[ROUNDS512];
__m128i ROUND_CONST_L7[ROUNDS512];
//__m128i ROUND_CONST_P[ROUNDS1024];
//__m128i ROUND_CONST_Q[ROUNDS1024];
__m128i ROUND_CONST_P[ROUNDS1024];
__m128i ROUND_CONST_Q[ROUNDS1024];
__m128i TRANSP_MASK;
__m128i SUBSH_MASK[8];
__m128i ALL_1B;
@@ -351,8 +351,9 @@ __m128i ALL_FF;
}/**/
void INIT256( __m128i* chaining )
void INIT256(u64* h)
{
__m128i* const chaining = (__m128i*) h;
static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7;
static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15;
@@ -373,8 +374,10 @@ void INIT256( __m128i* chaining )
chaining[3] = xmm7;
}
void TF512( __m128i* chaining, __m128i* message )
void TF512(u64* h, u64* m)
{
__m128i* const chaining = (__m128i*) h;
__m128i* const message = (__m128i*) m;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i TEMP0;
@@ -446,8 +449,9 @@ void TF512( __m128i* chaining, __m128i* message )
return;
}
void OF512( __m128i* chaining )
void OF512(u64* h)
{
__m128i* const chaining = (__m128i*) h;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i TEMP0;

View File

@@ -6,9 +6,6 @@
* This code is placed in the public domain
*/
// Optimized for hash and data lengths that are integral multiples of __m128i
#include <memory.h>
#include "hash-groestl.h"
#include "miner.h"
@@ -52,191 +49,196 @@
#endif
#endif
/* digest up to len bytes of input (full blocks only) */
void Transform( hashState_groestl *ctx, const u8 *in, unsigned long long len )
{
/* increment block counter */
ctx->block_counter += len/SIZE;
/* digest message, one block at a time */
for ( ; len >= SIZE; len -= SIZE, in += SIZE )
TF1024( (u64*)ctx->chaining, (u64*)in );
asm volatile ("emms");
}
/* given state h, do h <- P(h)+h */
void OutputTransformation( hashState_groestl *ctx )
{
/* determine variant */
OF1024( (u64*)ctx->chaining );
asm volatile ("emms");
}
/* initialise context */
HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen )
{
int i;
u8 i = 0;
ctx->hashlen = hashlen;
SET_CONSTANTS();
for ( i = 0; i < SIZE / 8; i++ )
ctx->chaining[i] = 0;
for ( i = 0; i < SIZE; i++ )
ctx->buffer[i] = 0;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return FAIL_GR;
for ( i = 0; i < SIZE512; i++ )
{
ctx->chaining[i] = _mm_setzero_si128();
ctx->buffer[i] = _mm_setzero_si128();
}
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
/* set initial value */
ctx->chaining[COLS-1] = U64BIG((u64)LENGTH);
INIT(ctx->chaining);
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
ctx->block_counter = 0;
return SUCCESS_GR;
}
/*
HashReturn_gr init_groestl( hashState_groestl* ctx )
{
return Xinit_groestl( ctx, 64 );
}
*/
HashReturn_gr reinit_groestl( hashState_groestl* ctx )
{
int i;
for ( i = 0; i < SIZE / 8; i++ )
ctx->chaining[i] = 0;
for ( i = 0; i < SIZE; i++ )
ctx->buffer[i] = 0;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return FAIL_GR;
for ( i = 0; i < SIZE512; i++ )
{
ctx->chaining[i] = _mm_setzero_si128();
ctx->buffer[i] = _mm_setzero_si128();
}
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
INIT(ctx->chaining);
/* set initial value */
ctx->chaining[COLS-1] = U64BIG( (u64)LENGTH );
INIT( ctx->chaining );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
ctx->block_counter = 0;
return SUCCESS_GR;
}
//// midstate is broken
// To use midstate:
// 1. midstate must process all full blocks.
// 2. tail must be less than a full block and may not straddle a
// block boundary.
// 3. midstate and tail each must be multiples of 128 bits.
// 4. For best performance midstate length is a multiple of block size.
// 5. Midstate gives a reduced benefit compared to a full hash if the total
// data (midstate + tail) is less than one block.
// This, unfortunately, is the case with all current users.
// 6. The more full blocks, the bigger the gain.
// use only for midstate precalc
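// Hypothetical usage sketch of the precalc pattern described above (not part
// of this commit, and note that midstate is flagged as broken in this
// revision): hash the constant 64-byte prefix of an 80-byte header once, then
// clone the context and hash only the changing 16-byte tail per nonce.
//
// hashState_groestl mid;
// init_groestl( &mid, 64 );
// update_groestl( &mid, (char*)header, 64*8 ); // prefix, length in bits
// ...
// hashState_groestl ctx = mid; // per-nonce copy of the midstate
// update_and_final_groestl( &ctx, (char*)hash, (char*)header + 64, 16*8 );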
HashReturn_gr update_groestl( hashState_groestl* ctx, const void* input,
DataLength_gr databitlen )
/* update state with databitlen bits of input */
HashReturn_gr update_groestl( hashState_groestl* ctx,
const BitSequence_gr* input,
DataLength_gr databitlen )
{
__m128i* in = (__m128i*)input;
const int len = (int)databitlen / 128; // bits to __m128i
const int blocks = len / SIZE512; // __M128i to blocks
int rem = ctx->rem_ptr;
int i;
int i;
const int msglen = (int)(databitlen/8);
ctx->blk_count = blocks;
ctx->databitlen = databitlen;
/* digest bulk of message */
Transform( ctx, input, msglen );
// digest any full blocks
for ( i = 0; i < blocks; i++ )
TF1024( ctx->chaining, &in[ i * SIZE512 ] );
// adjust buf_ptr to last block
ctx->buf_ptr = blocks * SIZE512;
/* store remaining data in buffer */
i = ( msglen / SIZE ) * SIZE;
while ( i < msglen )
ctx->buffer[(int)ctx->buf_ptr++] = input[i++];
// copy any remaining data to buffer for final hash, it may already
// contain data from a previous update for a midstate precalc
for ( i = 0; i < len % SIZE512; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
// adjust rem_ptr for possible new data
ctx->rem_ptr += i;
return SUCCESS_GR;
return SUCCESS_GR;
}
// deprecated do not use
HashReturn_gr final_groestl( hashState_groestl* ctx, void* output )
/* finalise: process remaining data (including padding), perform
output transformation, and write hash result to 'output' */
HashReturn_gr final_groestl( hashState_groestl* ctx,
BitSequence_gr* output )
{
const int len = (int)ctx->databitlen / 128; // bits to __m128i
const int blocks = ctx->blk_count + 1; // adjust for final block
int i, j;
const int rem_ptr = ctx->rem_ptr; // end of data start of padding
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
const int hash_offset = SIZE512 - hashlen_m128i; // where in buffer
int i;
ctx->buffer[(int)ctx->buf_ptr++] = 0x80;
/* pad with '0'-bits */
if ( ctx->buf_ptr > SIZE - LENGTHFIELDLEN )
{
/* padding requires two blocks */
while ( ctx->buf_ptr < SIZE )
ctx->buffer[(int)ctx->buf_ptr++] = 0;
/* digest first padding block */
Transform( ctx, ctx->buffer, SIZE );
ctx->buf_ptr = 0;
}
// first pad byte = 0x80, last pad byte = block count
// everything in between is zero
// this will pad up to 120 bytes
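// e.g. an 80 byte block header fits in a single 128 byte block:
// bytes 0..79 = message, byte 80 = 0x80, bytes 81..126 = 0x00,
// byte 127 = 0x01 (big endian block count).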
while ( ctx->buf_ptr < SIZE - LENGTHFIELDLEN )
ctx->buffer[(int)ctx->buf_ptr++] = 0;
if ( rem_ptr == len - 1 )
{
// only 128 bits left in buffer, all padding at once
ctx->buffer[rem_ptr] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 );
}
else
{
// add first padding
ctx->buffer[rem_ptr] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 );
// add zero padding
for ( i = rem_ptr + 1; i < SIZE512 - 1; i++ )
ctx->buffer[i] = _mm_setzero_si128();
/* length padding */
ctx->block_counter++;
ctx->buf_ptr = SIZE;
while ( ctx->buf_ptr > SIZE - LENGTHFIELDLEN )
{
ctx->buffer[(int)--ctx->buf_ptr] = (u8)ctx->block_counter;
ctx->block_counter >>= 8;
}
// add length padding, second last byte is zero unless blocks > 255
ctx->buffer[i] = _mm_set_epi8( blocks, blocks>>8, 0,0, 0,0,0,0,
0, 0 ,0,0, 0,0,0,0 );
}
/* digest final padding block */
Transform( ctx, ctx->buffer, SIZE );
/* perform output transformation */
OutputTransformation( ctx );
// digest final padding block and do output transform
TF1024( ctx->chaining, ctx->buffer );
OF1024( ctx->chaining );
// store hash result in output
for ( i = ( SIZE - ctx->hashlen) / 16, j = 0; i < SIZE / 16; i++, j++ )
casti_m128i( output, j ) = casti_m128i( ctx->chaining , i );
// store hash result in output
for ( i = 0; i < hashlen_m128i; i++ )
casti_m128i( output, i ) = ctx->chaining[ hash_offset + i];
return SUCCESS_GR;
return SUCCESS_GR;
}
HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output,
const void* input, DataLength_gr databitlen )
HashReturn_gr update_and_final_groestl( hashState_groestl* ctx,
BitSequence_gr* output, const BitSequence_gr* input,
DataLength_gr databitlen )
{
const int len = (int)databitlen / 128;
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
const int hash_offset = SIZE512 - hashlen_m128i;
int rem = ctx->rem_ptr;
int blocks = len / SIZE512;
__m128i* in = (__m128i*)input;
int i;
const int inlen = (int)(databitlen/8); // need bytes
int i, j;
// --- update ---
/* digest bulk of message */
Transform( ctx, input, inlen );
// digest any full blocks, process directly from input
for ( i = 0; i < blocks; i++ )
TF1024( ctx->chaining, &in[ i * SIZE512 ] );
ctx->buf_ptr = blocks * SIZE512;
/* store remaining data in buffer */
i = ( inlen / SIZE ) * SIZE;
while ( i < inlen )
ctx->buffer[(int)ctx->buf_ptr++] = input[i++];
// copy any remaining data to buffer, it may already contain data
// from a previous update for a midstate precalc
for ( i = 0; i < len % SIZE512; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
i += rem; // use i as rem_ptr in final
// start of final
//--- final ---
ctx->buffer[(int)ctx->buf_ptr++] = 0x80;
blocks++; // adjust for final block
/* pad with '0'-bits */
if ( ctx->buf_ptr > SIZE - LENGTHFIELDLEN )
{
/* padding requires two blocks */
while ( ctx->buf_ptr < SIZE )
ctx->buffer[(int)ctx->buf_ptr++] = 0;
memset( ctx->buffer + ctx->buf_ptr, 0, SIZE - ctx->buf_ptr );
/* digest first padding block */
Transform( ctx, ctx->buffer, SIZE );
ctx->buf_ptr = 0;
}
if ( i == len -1 )
{
// only 128 bits left in buffer, all padding at once
ctx->buffer[i] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 );
}
else
{
// add first padding
ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 );
// add zero padding
for ( i += 1; i < SIZE512 - 1; i++ )
ctx->buffer[i] = _mm_setzero_si128();
// this will pad up to 120 bytes
memset( ctx->buffer + ctx->buf_ptr, 0, SIZE - ctx->buf_ptr - LENGTHFIELDLEN );
// add length padding, second last byte is zero unless blocks > 255
ctx->buffer[i] = _mm_set_epi8( blocks, blocks>>8, 0,0, 0,0,0,0,
0, 0 ,0,0, 0,0,0,0 );
}
/* length padding */
ctx->block_counter++;
ctx->buf_ptr = SIZE;
while (ctx->buf_ptr > SIZE - LENGTHFIELDLEN)
{
ctx->buffer[(int)--ctx->buf_ptr] = (u8)ctx->block_counter;
ctx->block_counter >>= 8;
}
// digest final padding block and do output transform
TF1024( ctx->chaining, ctx->buffer );
OF1024( ctx->chaining );
/* digest final padding block */
Transform( ctx, ctx->buffer, SIZE );
/* perform output transformation */
OutputTransformation( ctx );
// store hash result in output
for ( i = 0; i < hashlen_m128i; i++ )
casti_m128i( output, i ) = ctx->chaining[ hash_offset + i ];
// store hash result in output
for ( i = ( SIZE - ctx->hashlen) / 16, j = 0; i < SIZE / 16; i++, j++ )
casti_m128i( output, j ) = casti_m128i( ctx->chaining , i );
return SUCCESS_GR;
return SUCCESS_GR;
}
/* hash bit sequence */

View File

@@ -9,8 +9,6 @@
#ifndef __hash_h
#define __hash_h
#include <immintrin.h>
#include <stdio.h>
#if defined(_WIN64) || defined(__WINDOWS__)
#include <windows.h>
@@ -21,27 +19,27 @@
#include "brg_endian.h"
#define NEED_UINT_64T
#include "algo/sha/brg_types.h"
#include "brg_types.h"
/* some sizes (number of bytes) */
#define ROWS (8)
#define LENGTHFIELDLEN (ROWS)
//#define COLS512 (8)
#define COLS512 (8)
#define COLS1024 (16)
//#define SIZE512 ((ROWS)*(COLS512))
#define SIZE_1024 ((ROWS)*(COLS1024))
//#define ROUNDS512 (10)
#define SIZE512 ((ROWS)*(COLS512))
#define SIZE1024 ((ROWS)*(COLS1024))
#define ROUNDS512 (10)
#define ROUNDS1024 (14)
//#if LENGTH<=256
//#define COLS (COLS512)
//#define SIZE (SIZE512)
//#define ROUNDS (ROUNDS512)
//#else
#if LENGTH<=256
#define COLS (COLS512)
#define SIZE (SIZE512)
#define ROUNDS (ROUNDS512)
#else
#define COLS (COLS1024)
//#define SIZE (SIZE1024)
#define SIZE (SIZE1024)
#define ROUNDS (ROUNDS1024)
//#endif
#endif
#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
@@ -63,29 +61,31 @@ typedef unsigned char BitSequence_gr;
typedef unsigned long long DataLength_gr;
typedef enum { SUCCESS_GR = 0, FAIL_GR = 1, BAD_HASHBITLEN_GR = 2} HashReturn_gr;
#define SIZE512 (SIZE_1024/16)
// Use area128 overlay for buffer to facilitate fast copying
typedef struct {
__attribute__ ((aligned (64))) __m128i chaining[SIZE512];
__attribute__ ((aligned (64))) __m128i buffer[SIZE512];
int hashlen; // byte
int blk_count; // SIZE_m128i
int buf_ptr; // __m128i offset
int rem_ptr;
int databitlen; // bits
__attribute__ ((aligned (32))) u64 chaining[SIZE/8]; // actual state
__attribute__ ((aligned (32))) BitSequence_gr buffer[SIZE]; // data buffer
u64 block_counter; /* message block counter */
int hashlen; // bytes
int buf_ptr; /* data buffer pointer */
} hashState_groestl;
//HashReturn_gr init_groestl( hashState_groestl* );
HashReturn_gr init_groestl( hashState_groestl*, int );
HashReturn_gr reinit_groestl( hashState_groestl* );
HashReturn_gr update_groestl( hashState_groestl*, const void*,
HashReturn_gr update_groestl( hashState_groestl*, const BitSequence_gr*,
DataLength_gr );
HashReturn_gr final_groestl( hashState_groestl*, void* );
HashReturn_gr final_groestl( hashState_groestl*, BitSequence_gr* );
HashReturn_gr update_and_final_groestl( hashState_groestl*, void*,
const void*, DataLength_gr );
HashReturn_gr hash_groestl( int, const BitSequence_gr*, DataLength_gr,
BitSequence_gr* );
HashReturn_gr update_and_final_groestl( hashState_groestl*,
BitSequence_gr*, const BitSequence_gr*, DataLength_gr );
#endif /* __hash_h */

View File

@@ -49,201 +49,187 @@
#endif
#endif
/* digest up to len bytes of input (full blocks only) */
void Transform256(hashState_groestl256 *ctx,
const u8 *in,
unsigned long long len) {
/* increment block counter */
ctx->block_counter += len/SIZE;
/* digest message, one block at a time */
for (; len >= SIZE; len -= SIZE, in += SIZE)
TF512((u64*)ctx->chaining, (u64*)in);
asm volatile ("emms");
}
/* given state h, do h <- P(h)+h */
void OutputTransformation256(hashState_groestl256 *ctx) {
/* determine variant */
OF512((u64*)ctx->chaining);
asm volatile ("emms");
}
/* initialise context */
HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen )
{
int i;
u8 i = 0;
ctx->hashlen = hashlen;
SET_CONSTANTS();
for (i=0; i<SIZE/8; i++)
ctx->chaining[i] = 0;
for (i=0; i<SIZE; i++)
ctx->buffer[i] = 0;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return FAIL_GR;
for ( i = 0; i < SIZE256; i++ )
{
ctx->chaining[i] = _mm_setzero_si128();
ctx->buffer[i] = _mm_setzero_si128();
}
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
INIT256( ctx->chaining );
/* set initial value */
ctx->chaining[COLS-1] = U64BIG((u64)256);
INIT256(ctx->chaining);
/* set other variables */
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
ctx->block_counter = 0;
return SUCCESS_GR;
}
HashReturn_gr reinit_groestl256(hashState_groestl256* ctx)
{
int i;
for (i=0; i<SIZE/8; i++)
ctx->chaining[i] = 0;
for (i=0; i<SIZE; i++)
ctx->buffer[i] = 0;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return FAIL_GR;
for ( i = 0; i < SIZE256; i++ )
{
ctx->chaining[i] = _mm_setzero_si128();
ctx->buffer[i] = _mm_setzero_si128();
}
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
/* set initial value */
ctx->chaining[COLS-1] = 256;
INIT256(ctx->chaining);
/* set other variables */
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
ctx->block_counter = 0;
return SUCCESS_GR;
}
// Use this only for midstate and never for cryptonight
HashReturn_gr update_groestl256( hashState_groestl256* ctx, const void* input,
DataLength_gr databitlen )
HashReturn_gr update_groestl256( hashState_groestl256* ctx,
const BitSequence_gr* input, DataLength_gr databitlen )
{
__m128i* in = (__m128i*)input;
const int len = (int)databitlen / 128; // bits to __m128i
const int blocks = len / SIZE256; // __M128i to blocks
int rem = ctx->rem_ptr;
int i;
const int msglen = (int)(databitlen/8); // bytes
int i;
ctx->blk_count = blocks;
ctx->databitlen = databitlen;
/* digest bulk of message */
Transform256( ctx, input, msglen );
// digest any full blocks
for ( i = 0; i < blocks; i++ )
TF512( ctx->chaining, &in[ i * SIZE256 ] );
// adjust buf_ptr to last block
ctx->buf_ptr = blocks * SIZE256;
/* store remaining data in buffer */
i = ( msglen / SIZE ) * SIZE;
while ( i < msglen )
ctx->buffer[(int)ctx->buf_ptr++] = input[i++];
// Copy any remainder to buffer
for ( i = 0; i < len % SIZE256; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
// adjust rem_ptr for new data
ctx->rem_ptr += i;
return SUCCESS_GR;
return SUCCESS_GR;
}
// don't use this at all
HashReturn_gr final_groestl256( hashState_groestl256* ctx, void* output )
HashReturn_gr final_groestl256( hashState_groestl256* ctx,
BitSequence_gr* output )
{
const int len = (int)ctx->databitlen / 128; // bits to __m128i
const int blocks = ctx->blk_count + 1; // adjust for final block
const int rem_ptr = ctx->rem_ptr; // end of data start of padding
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
const int hash_offset = SIZE256 - hashlen_m128i; // where in buffer
int i;
ctx->buffer[(int)ctx->buf_ptr++] = 0x80;
// first pad byte = 0x80, last pad byte = block count
// everything in between is zero
/* pad with '0'-bits */
if ( ctx->buf_ptr > SIZE - LENGTHFIELDLEN )
{
/* padding requires two blocks */
while ( ctx->buf_ptr < SIZE )
ctx->buffer[(int)ctx->buf_ptr++] = 0;
/* digest first padding block */
Transform256( ctx, ctx->buffer, SIZE );
ctx->buf_ptr = 0;
}
while ( ctx->buf_ptr < SIZE - LENGTHFIELDLEN )
ctx->buffer[(int)ctx->buf_ptr++] = 0;
if ( rem_ptr == len - 1 )
{
// all padding at once
ctx->buffer[rem_ptr] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 );
}
else
{
// add first padding
ctx->buffer[rem_ptr] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 );
// add zero padding
for ( i = rem_ptr + 1; i < SIZE256 - 1; i++ )
ctx->buffer[i] = _mm_setzero_si128();
// add length padding
// cheat since we know the block count is trivial, good if block < 256
ctx->buffer[i] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0 );
}
/* length padding */
ctx->block_counter++;
ctx->buf_ptr = SIZE;
while ( ctx->buf_ptr > SIZE - LENGTHFIELDLEN )
{
ctx->buffer[(int)--ctx->buf_ptr] = (u8)ctx->block_counter;
ctx->block_counter >>= 8;
}
// digest final padding block and do output transform
TF512( ctx->chaining, ctx->buffer );
OF512( ctx->chaining );
/* digest final padding block */
Transform256( ctx, ctx->buffer, SIZE );
/* perform output transformation */
OutputTransformation256( ctx );
// store hash result in output
for ( i = 0; i < hashlen_m128i; i++ )
casti_m128i( output, i ) = ctx->chaining[ hash_offset + i];
/* store hash result in output */
for ( int i = ( (SIZE - ctx->hashlen) / 16 ), j = 0; i < SIZE/16; i++, j++ )
casti_m128i( output, j ) = casti_m128i( ctx->chaining, i );
return SUCCESS_GR;
return SUCCESS_GR;
}
HashReturn_gr update_and_final_groestl256( hashState_groestl256* ctx,
void* output, const void* input, DataLength_gr databitlen )
BitSequence_gr* output, const BitSequence_gr* input,
DataLength_gr databitlen )
{
const int len = (int)databitlen / 128;
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
const int hash_offset = SIZE256 - hashlen_m128i;
int rem = ctx->rem_ptr;
int blocks = len / SIZE256;
__m128i* in = (__m128i*)input;
int i;
const int msglen = (int)(databitlen/8); // bytes
int i, j;
// --- update ---
/* digest bulk of message */
Transform256( ctx, input, msglen );
// digest any full blocks, process directly from input
for ( i = 0; i < blocks; i++ )
TF512( ctx->chaining, &in[ i * SIZE256 ] );
ctx->buf_ptr = blocks * SIZE256;
/* store remaining data in buffer */
i = ( msglen / SIZE ) * SIZE;
while ( i < msglen )
ctx->buffer[(int)ctx->buf_ptr++] = input[i++];
// cryptonight has a 200 byte input, not a whole number of __m128i;
// the remainder is only 8 bytes, i.e. a u64.
if ( databitlen % 128 !=0 )
{
// must be cryptonight, copy 64 bits of data
*(uint64_t*)(ctx->buffer) = *(uint64_t*)(&in[ ctx->buf_ptr ] );
i = -1; // signal for odd length
}
else
{
// Copy any remaining data to buffer for final transform
for ( i = 0; i < len % SIZE256; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
i += rem; // use i as rem_ptr in final
}
// start of final
ctx->buffer[(int)ctx->buf_ptr++] = 0x80;
//--- final ---
/* pad with '0'-bits */
if ( ctx->buf_ptr > SIZE - LENGTHFIELDLEN )
{
/* padding requires two blocks */
while ( ctx->buf_ptr < SIZE )
ctx->buffer[(int)ctx->buf_ptr++] = 0;
/* digest first padding block */
Transform256( ctx, ctx->buffer, SIZE );
ctx->buf_ptr = 0;
}
while ( ctx->buf_ptr < SIZE - LENGTHFIELDLEN )
ctx->buffer[(int)ctx->buf_ptr++] = 0;
// adjust for final block
blocks++;
/* length padding */
ctx->block_counter++;
ctx->buf_ptr = SIZE;
while ( ctx->buf_ptr > SIZE - LENGTHFIELDLEN )
{
ctx->buffer[(int)--ctx->buf_ptr] = (u8)ctx->block_counter;
ctx->block_counter >>= 8;
}
if ( i == len - 1 )
{
// all padding at once
ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0,
0, 0,0,0, 0,0,0,0x80 );
}
else
{
if ( i == -1 )
{
// cryptonight odd length
((uint64_t*)ctx->buffer)[ 1 ] = 0x80ull;
// finish the block with zero and length padding as normal
i = 0;
}
else
{
// add first padding
ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 );
}
// add zero padding
for ( i += 1; i < SIZE256 - 1; i++ )
ctx->buffer[i] = _mm_setzero_si128();
// add length padding
// cheat since we know the block count is trivial, good if block < 256
ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0,
0, 0,0,0, 0,0,0,0 );
}
/* digest final padding block */
Transform256( ctx, ctx->buffer, SIZE );
/* perform output transformation */
OutputTransformation256( ctx );
// digest final padding block and do output transform
TF512( ctx->chaining, ctx->buffer );
OF512( ctx->chaining );
/* store hash result in output */
for ( i = ( (SIZE - ctx->hashlen) / 16 ), j = 0; i < SIZE/16; i++, j++ )
casti_m128i( output, j ) = casti_m128i( ctx->chaining, i );
// store hash result in output
for ( i = 0; i < hashlen_m128i; i++ )
casti_m128i( output, i ) = ctx->chaining[ hash_offset + i ];
return SUCCESS_GR;
return SUCCESS_GR;
}
/* hash bit sequence */

View File

@@ -9,7 +9,6 @@
#ifndef __hash_h
#define __hash_h
#include <immintrin.h>
#include <stdio.h>
#if defined(_WIN64) || defined(__WINDOWS__)
#include <windows.h>
@@ -35,27 +34,29 @@ typedef crypto_uint64 u64;
#include "brg_endian.h"
#define NEED_UINT_64T
#include "algo/sha/brg_types.h"
#include "brg_types.h"
#ifdef IACA_TRACE
#include IACA_MARKS
#endif
#define LENGTH (256)
//#ifndef LENGTH
//#define LENGTH (256)
//#endif
/* some sizes (number of bytes) */
#define ROWS (8)
#define LENGTHFIELDLEN (ROWS)
#define COLS512 (8)
//#define COLS1024 (16)
#define SIZE_512 ((ROWS)*(COLS512))
//#define SIZE1024 ((ROWS)*(COLS1024))
#define COLS1024 (16)
#define SIZE512 ((ROWS)*(COLS512))
#define SIZE1024 ((ROWS)*(COLS1024))
#define ROUNDS512 (10)
//#define ROUNDS1024 (14)
#define ROUNDS1024 (14)
//#if LENGTH<=256
#define COLS (COLS512)
//#define SIZE (SIZE512)
#define SIZE (SIZE512)
#define ROUNDS (ROUNDS512)
//#else
//#define COLS (COLS1024)
@@ -88,34 +89,28 @@ typedef enum
BAD_HASHBITLEN_GR = 2
} HashReturn_gr;
#define SIZE256 (SIZE_512/16)
typedef struct {
__attribute__ ((aligned (32))) __m128i chaining[SIZE256];
__attribute__ ((aligned (32))) __m128i buffer[SIZE256];
// __attribute__ ((aligned (32))) u64 chaining[SIZE/8]; /* actual state */
// __attribute__ ((aligned (32))) BitSequence_gr buffer[SIZE]; /* data buffer */
// u64 block_counter; /* message block counter */
__attribute__ ((aligned (32))) u64 chaining[SIZE/8]; /* actual state */
__attribute__ ((aligned (32))) BitSequence_gr buffer[SIZE]; /* data buffer */
u64 block_counter; /* message block counter */
int hashlen; // bytes
int blk_count;
int buf_ptr; /* data buffer pointer */
int rem_ptr;
int databitlen;
} hashState_groestl256;
HashReturn_gr init_groestl256( hashState_groestl256*, int );
HashReturn_gr reinit_groestl256( hashState_groestl256* );
HashReturn_gr reinit_groestl( hashState_groestl256* );
HashReturn_gr update_groestl256( hashState_groestl256*, const void*,
HashReturn_gr update_groestl( hashState_groestl256*, const BitSequence_gr*,
DataLength_gr );
HashReturn_gr final_groestl256( hashState_groestl256*, void* );
HashReturn_gr final_groestl( hashState_groestl256*, BitSequence_gr* );
HashReturn_gr hash_groestli256( int, const BitSequence_gr*, DataLength_gr,
HashReturn_gr hash_groestl( int, const BitSequence_gr*, DataLength_gr,
BitSequence_gr* );
HashReturn_gr update_and_final_groestl256( hashState_groestl256*, void*,
const void*, DataLength_gr );
HashReturn_gr update_and_final_groestl256( hashState_groestl256*,
BitSequence_gr*, const BitSequence_gr*,
DataLength_gr );
#endif /* __hash_h */
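A minimal usage sketch of the 256-bit interface declared above. The length argument is assumed to be a bit count, matching the 512-bit calls elsewhere in this diff (640 bits for an 80-byte header), and data is a caller-supplied input buffer; both are assumptions, noted in the comments:

    hashState_groestl256 ctx;
    uint32_t hash[8] __attribute__ ((aligned (32)));  /* 32-byte digest                  */

    init_groestl256( &ctx, 32 );                      /* 32 = digest length in bytes
                                                         (assumed, by analogy with
                                                         init_groestl( ..., 64 ))        */
    update_and_final_groestl256( &ctx, hash, data,    /* data: 80-byte header (caller's) */
                                 640 );               /* length in bits (assumed)        */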

View File

@@ -1,3 +1,4 @@
#include "miner.h"
#include "algo-gate-api.h"
#include <stdio.h>
@@ -14,7 +15,7 @@
typedef struct
{
#ifdef NO_AES_NI
sph_groestl512_context groestl1, groestl2;
sph_groestl512_context groestl;
#else
hashState_groestl groestl1, groestl2;
#endif
@@ -26,42 +27,43 @@ static groestl_ctx_holder groestl_ctx;
void init_groestl_ctx()
{
#ifdef NO_AES_NI
sph_groestl512_init( &groestl_ctx.groestl1 );
sph_groestl512_init( &groestl_ctx.groestl2 );
sph_groestl512_init( &groestl_ctx.groestl );
#else
init_groestl( &groestl_ctx.groestl1, 64 );
init_groestl( &groestl_ctx.groestl2, 64 );
#endif
}
void groestlhash( void *output, const void *input )
void groestlhash(void *output, const void *input)
{
uint32_t hash[16] __attribute__ ((aligned (64)));
groestl_ctx_holder ctx __attribute__ ((aligned (64)));
uint32_t _ALIGN(32) hash[16];
groestl_ctx_holder ctx;
memcpy( &ctx, &groestl_ctx, sizeof(groestl_ctx) );
// memset(&hash[0], 0, sizeof(hash));
#ifdef NO_AES_NI
sph_groestl512(&ctx.groestl1, input, 80);
sph_groestl512_close(&ctx.groestl1, hash);
sph_groestl512(&ctx.groestl, input, 80);
sph_groestl512_close(&ctx.groestl, hash);
sph_groestl512(&ctx.groestl2, hash, 64);
sph_groestl512_close(&ctx.groestl2, hash);
sph_groestl512(&ctx.groestl, hash, 64);
sph_groestl512_close(&ctx.groestl, hash);
#else
update_and_final_groestl( &ctx.groestl1, (char*)hash,
(const char*)input, 640 );
update_groestl( &ctx.groestl1, (char*)input, 640 );
final_groestl( &ctx.groestl1,(char*)hash);
update_and_final_groestl( &ctx.groestl2, (char*)hash,
(const char*)hash, 512 );
update_groestl( &ctx.groestl2, (char*)hash, 512 );
final_groestl( &ctx.groestl2, (char*)hash);
#endif
memcpy(output, hash, 32);
memcpy(output, hash, 32);
}
int scanhash_groestl( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
int scanhash_groestl(int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t endiandata[20] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) endiandata[20];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
@@ -72,7 +74,7 @@ int scanhash_groestl( int thr_id, struct work *work, uint32_t max_nonce,
do {
const uint32_t Htarg = ptarget[7];
uint32_t hash[8] __attribute__ ((aligned (64)));
uint32_t hash[8];
be32enc(&endiandata[19], nonce);
groestlhash(hash, endiandata);
@@ -98,21 +100,16 @@ void groestl_set_target( struct work* work, double job_diff )
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_dmd_gr_algo( algo_gate_t* gate )
bool register_groestl_algo( algo_gate_t* gate )
{
init_groestl_ctx();
gate->optimizations = SSE2_OPT | AES_OPT;
gate->scanhash = (void*)&scanhash_groestl;
gate->hash = (void*)&groestlhash;
gate->hash_alt = (void*)&groestlhash;
gate->set_target = (void*)&groestl_set_target;
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};
bool register_groestl_algo( algo_gate_t* gate )
{
register_dmd_gr_algo( gate );
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
return true;
};
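The hunks above cut off the body of the nonce loop; every scanhash function in this diff follows the same cpuminer pattern. A generic sketch of the elided part (fulltest() is cpuminer's usual full-target check; the bookkeeping names match the surrounding code):

    do {
        be32enc( &endiandata[19], nonce );        /* write the trial nonce, big-endian */
        groestlhash( hash, endiandata );          /* 80-byte header -> 32-byte hash    */
        if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
        {
            pdata[19] = nonce;                    /* report the winning nonce          */
            *hashes_done = nonce - first_nonce + 1;
            return 1;
        }
        nonce++;
    } while ( nonce < max_nonce && !work_restart[thr_id].restart );

    pdata[19] = nonce;
    *hashes_done = nonce - first_nonce + 1;
    return 0;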

View File

@@ -1,3 +1,4 @@
#include "miner.h"
#include "algo-gate-api.h"
#include <stdio.h>
@@ -10,9 +11,7 @@
#else
#include "aes_ni/hash-groestl.h"
#endif
#include <openssl/sha.h>
#include "algo/sha/sph_sha2.h"
#include "algo/sha3/sph_sha2.h"
typedef struct {
#ifdef NO_AES_NI
@@ -20,11 +19,7 @@ typedef struct {
#else
hashState_groestl groestl;
#endif
#ifndef USE_SPH_SHA
SHA256_CTX sha;
#else
sph_sha256_context sha;
#endif
sph_sha256_context sha;
} myrgr_ctx_holder;
myrgr_ctx_holder myrgr_ctx;
@@ -36,44 +31,37 @@ void init_myrgr_ctx()
#else
init_groestl (&myrgr_ctx.groestl, 64 );
#endif
#ifndef USE_SPH_SHA
SHA256_Init( &myrgr_ctx.sha );
#else
sph_sha256_init( &myrgr_ctx.sha );
#endif
sph_sha256_init(&myrgr_ctx.sha);
}
void myriadhash( void *output, const void *input )
void myriadhash(void *output, const void *input)
{
myrgr_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &myrgr_ctx, sizeof(myrgr_ctx) );
uint32_t hash[16] __attribute__ ((aligned (64)));
myrgr_ctx_holder ctx;
memcpy( &ctx, &myrgr_ctx, sizeof(myrgr_ctx) );
uint32_t _ALIGN(32) hash[16];
#ifdef NO_AES_NI
sph_groestl512(&ctx.groestl, input, 80);
sph_groestl512_close(&ctx.groestl, hash);
sph_groestl512(&ctx.groestl, input, 80);
sph_groestl512_close(&ctx.groestl, hash);
#else
update_and_final_groestl( &ctx.groestl, (char*)input,
(const char*)input, 640 );
update_groestl( &ctx.groestl, (char*)input, 640 );
final_groestl( &ctx.groestl, (char*)hash);
#endif
#ifndef USE_SPH_SHA
SHA256_Update( &ctx.sha, hash, 64 );
SHA256_Final( (unsigned char*) hash, &ctx.sha );
#else
sph_sha256(&ctx.sha, hash, 64);
sph_sha256_close(&ctx.sha, hash);
#endif
memcpy(output, hash, 32);
sph_sha256(&ctx.sha, hash, 64);
sph_sha256_close(&ctx.sha, hash);
memcpy(output, hash, 32);
}
int scanhash_myriad( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
int scanhash_myriad(int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t endiandata[20] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) endiandata[20];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
@@ -84,7 +72,7 @@ int scanhash_myriad( int thr_id, struct work *work, uint32_t max_nonce,
do {
const uint32_t Htarg = ptarget[7];
uint32_t hash[8] __attribute__ ((aligned (64)));
uint32_t hash[8];
be32enc(&endiandata[19], nonce);
myriadhash(hash, endiandata);
@@ -104,10 +92,11 @@ int scanhash_myriad( int thr_id, struct work *work, uint32_t max_nonce,
bool register_myriad_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
gate->optimizations = SSE2_OPT | AES_OPT;
init_myrgr_ctx();
gate->scanhash = (void*)&scanhash_myriad;
gate->hash = (void*)&myriadhash;
gate->hash_alt = (void*)&myriadhash;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};
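In short, the chain computed above is, as a one-line sketch:

    /* myr-gr: output = SHA-256( Groestl-512( 80-byte header ) ), 32 bytes */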

View File

@@ -40,7 +40,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "algo/sha3/sph_types.h"
/**
* Output size (in bits) for Groestl-224.

View File

@@ -0,0 +1,133 @@
/*
---------------------------------------------------------------------------
Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved.
LICENSE TERMS
The redistribution and use of this software (with or without changes)
is allowed without the payment of fees or royalties provided that:
1. source code distributions include the above copyright notice, this
list of conditions and the following disclaimer;
2. binary distributions include the above copyright notice, this list
of conditions and the following disclaimer in their documentation;
3. the name of the copyright holder is not used to endorse products
built using this software without specific written permission.
DISCLAIMER
This software is provided 'as is' with no explicit or implied warranties
in respect of its properties, including, but not limited to, correctness
and/or fitness for purpose.
---------------------------------------------------------------------------
Issue Date: 20/12/2007
*/
#ifndef _BRG_ENDIAN_H
#define _BRG_ENDIAN_H
#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */
#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */
/* Include files where endian defines and byteswap functions may reside */
#if defined( __sun )
# include <sys/isa_defs.h>
#elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ )
# include <sys/endian.h>
#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \
defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ )
# include <machine/endian.h>
#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ )
# if !defined( __MINGW32__ ) && !defined( _AIX )
# include <endian.h>
# if !defined( __BEOS__ )
# include <byteswap.h>
# endif
# endif
#endif
/* Now attempt to set the define for platform byte order using any */
/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which */
/* seem to encompass most endian symbol definitions */
#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN )
# if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
# elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
# endif
#elif defined( BIG_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( LITTLE_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif
#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN )
# if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
# elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
# endif
#elif defined( _BIG_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( _LITTLE_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif
#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN )
# if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
# elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
# endif
#elif defined( __BIG_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( __LITTLE_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif
#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ )
# if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
# elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
# endif
#elif defined( __BIG_ENDIAN__ )
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( __LITTLE_ENDIAN__ )
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif
/* if the platform byte order could not be determined, then try to */
/* set this define using common machine defines */
#if !defined(PLATFORM_BYTE_ORDER)
#if defined( __alpha__ ) || defined( __alpha ) || defined( i386 ) || \
defined( __i386__ ) || defined( _M_I86 ) || defined( _M_IX86 ) || \
defined( __OS2__ ) || defined( sun386 ) || defined( __TURBOC__ ) || \
defined( vax ) || defined( vms ) || defined( VMS ) || \
defined( __VMS ) || defined( _M_X64 )
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#elif defined( AMIGA ) || defined( applec ) || defined( __AS400__ ) || \
defined( _CRAY ) || defined( __hppa ) || defined( __hp9000 ) || \
defined( ibm370 ) || defined( mc68000 ) || defined( m68k ) || \
defined( __MRC__ ) || defined( __MVS__ ) || defined( __MWERKS__ ) || \
defined( sparc ) || defined( __sparc) || defined( SYMANTEC_C ) || \
defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM ) || \
defined( THINK_C ) || defined( __VMCMS__ ) || defined( _AIX )
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif 0 /* **** EDIT HERE IF NECESSARY **** */
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#elif 0 /* **** EDIT HERE IF NECESSARY **** */
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#else
# error Please edit lines 126 or 128 in brg_endian.h to set the platform byte order
#endif
#endif
#endif
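A typical consumer of this header simply branches on the detected value, as the SSE2 Groestl code later in this diff does; a sketch:

    #include "brg_endian.h"

    #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
      /* little-endian path: message words can be loaded directly */
    #elif PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN
      /* big-endian path: byte-swap words before use */
    #endif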

View File

@@ -0,0 +1,231 @@
/*
---------------------------------------------------------------------------
Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved.
(a few lines added by Soeren S. Thomsen, October 2008)
LICENSE TERMS
The redistribution and use of this software (with or without changes)
is allowed without the payment of fees or royalties provided that:
1. source code distributions include the above copyright notice, this
list of conditions and the following disclaimer;
2. binary distributions include the above copyright notice, this list
of conditions and the following disclaimer in their documentation;
3. the name of the copyright holder is not used to endorse products
built using this software without specific written permission.
DISCLAIMER
This software is provided 'as is' with no explicit or implied warranties
in respect of its properties, including, but not limited to, correctness
and/or fitness for purpose.
---------------------------------------------------------------------------
Issue Date: 20/12/2007
The unsigned integer types defined here are of the form uint_<nn>t where
<nn> is the length of the type; for example, the unsigned 32-bit type is
'uint_32t'. These are NOT the same as the 'C99 integer types' that are
defined in the inttypes.h and stdint.h headers since attempts to use these
types have shown that support for them is still highly variable. However,
since the latter are of the form uint<nn>_t, a regular expression search
and replace (in VC++ search on 'uint_{:z}t' and replace with 'uint\1_t')
can be used to convert the types used here to the C99 standard types.
*/
#ifndef _BRG_TYPES_H
#define _BRG_TYPES_H
#if defined(__cplusplus)
extern "C" {
#endif
#include <limits.h>
#if defined( _MSC_VER ) && ( _MSC_VER >= 1300 )
# include <stddef.h>
# define ptrint_t intptr_t
#elif defined( __GNUC__ ) && ( __GNUC__ >= 3 )
# include <stdint.h>
# define ptrint_t intptr_t
#else
# define ptrint_t int
#endif
#ifndef BRG_UI8
# define BRG_UI8
# if UCHAR_MAX == 255u
typedef unsigned char uint_8t;
# else
# error Please define uint_8t as an 8-bit unsigned integer type in brg_types.h
# endif
#endif
#ifndef BRG_UI16
# define BRG_UI16
# if USHRT_MAX == 65535u
typedef unsigned short uint_16t;
# else
# error Please define uint_16t as a 16-bit unsigned short type in brg_types.h
# endif
#endif
#ifndef BRG_UI32
# define BRG_UI32
# if UINT_MAX == 4294967295u
# define li_32(h) 0x##h##u
typedef unsigned int uint_32t;
# elif ULONG_MAX == 4294967295u
# define li_32(h) 0x##h##ul
typedef unsigned long uint_32t;
# elif defined( _CRAY )
# error This code needs 32-bit data types, which Cray machines do not provide
# else
# error Please define uint_32t as a 32-bit unsigned integer type in brg_types.h
# endif
#endif
#ifndef BRG_UI64
# if defined( __BORLANDC__ ) && !defined( __MSDOS__ )
# define BRG_UI64
# define li_64(h) 0x##h##ui64
typedef unsigned __int64 uint_64t;
# elif defined( _MSC_VER ) && ( _MSC_VER < 1300 ) /* 1300 == VC++ 7.0 */
# define BRG_UI64
# define li_64(h) 0x##h##ui64
typedef unsigned __int64 uint_64t;
# elif defined( __sun ) && defined( ULONG_MAX ) && ULONG_MAX == 0xfffffffful
# define BRG_UI64
# define li_64(h) 0x##h##ull
typedef unsigned long long uint_64t;
# elif defined( __MVS__ )
# define BRG_UI64
# define li_64(h) 0x##h##ull
typedef unsigned int long long uint_64t;
# elif defined( UINT_MAX ) && UINT_MAX > 4294967295u
# if UINT_MAX == 18446744073709551615u
# define BRG_UI64
# define li_64(h) 0x##h##u
typedef unsigned int uint_64t;
# endif
# elif defined( ULONG_MAX ) && ULONG_MAX > 4294967295u
# if ULONG_MAX == 18446744073709551615ul
# define BRG_UI64
# define li_64(h) 0x##h##ul
typedef unsigned long uint_64t;
# endif
# elif defined( ULLONG_MAX ) && ULLONG_MAX > 4294967295u
# if ULLONG_MAX == 18446744073709551615ull
# define BRG_UI64
# define li_64(h) 0x##h##ull
typedef unsigned long long uint_64t;
# endif
# elif defined( ULONG_LONG_MAX ) && ULONG_LONG_MAX > 4294967295u
# if ULONG_LONG_MAX == 18446744073709551615ull
# define BRG_UI64
# define li_64(h) 0x##h##ull
typedef unsigned long long uint_64t;
# endif
# endif
#endif
#if !defined( BRG_UI64 )
# if defined( NEED_UINT_64T )
# error Please define uint_64t as an unsigned 64 bit type in brg_types.h
# endif
#endif
#ifndef RETURN_VALUES
# define RETURN_VALUES
# if defined( DLL_EXPORT )
# if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
# define VOID_RETURN __declspec( dllexport ) void __stdcall
# define INT_RETURN __declspec( dllexport ) int __stdcall
# elif defined( __GNUC__ )
# define VOID_RETURN __declspec( __dllexport__ ) void
# define INT_RETURN __declspec( __dllexport__ ) int
# else
# error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
# endif
# elif defined( DLL_IMPORT )
# if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
# define VOID_RETURN __declspec( dllimport ) void __stdcall
# define INT_RETURN __declspec( dllimport ) int __stdcall
# elif defined( __GNUC__ )
# define VOID_RETURN __declspec( __dllimport__ ) void
# define INT_RETURN __declspec( __dllimport__ ) int
# else
# error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
# endif
# elif defined( __WATCOMC__ )
# define VOID_RETURN void __cdecl
# define INT_RETURN int __cdecl
# else
# define VOID_RETURN void
# define INT_RETURN int
# endif
#endif
/* These defines are used to detect and set the memory alignment of pointers.
Note that offsets are in bytes.
ALIGN_OFFSET(x,n) return the positive or zero offset of
the memory addressed by the pointer 'x'
from an address that is aligned on an
'n' byte boundary ('n' is a power of 2)
ALIGN_FLOOR(x,n) return a pointer that points to memory
that is aligned on an 'n' byte boundary
and is not higher than the memory address
pointed to by 'x' ('n' is a power of 2)
ALIGN_CEIL(x,n) return a pointer that points to memory
that is aligned on an 'n' byte boundary
and is not lower than the memory address
pointed to by 'x' ('n' is a power of 2)
*/
#define ALIGN_OFFSET(x,n) (((ptrint_t)(x)) & ((n) - 1))
#define ALIGN_FLOOR(x,n) ((uint_8t*)(x) - ( ((ptrint_t)(x)) & ((n) - 1)))
#define ALIGN_CEIL(x,n) ((uint_8t*)(x) + (-((ptrint_t)(x)) & ((n) - 1)))
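Illustrative use of the three alignment macros (the array name is made up for the example):

    unsigned char raw[96];
    unsigned char *ap = ALIGN_CEIL( raw, 16 );   /* lowest 16-byte aligned address >= raw  */
    unsigned char *fp = ALIGN_FLOOR( raw, 16 );  /* highest 16-byte aligned address <= raw */
    ptrint_t off      = ALIGN_OFFSET( raw, 16 ); /* bytes by which raw sits above fp       */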
/* These defines are used to declare buffers in a way that allows
faster operations on longer variables to be used. In all these
defines 'size' must be a power of 2 and >= 8. NOTE that the
buffer size is in bytes but the type length is in bits
UNIT_TYPEDEF(x,size) declares a variable 'x' of length
'size' bits
BUFR_TYPEDEF(x,size,bsize) declares a buffer 'x' of length 'bsize'
bytes defined as an array of variables
each of 'size' bits (bsize must be a
multiple of size / 8)
UNIT_CAST(x,size) casts a variable to a type of
length 'size' bits
UPTR_CAST(x,size) casts a pointer to a pointer to a
variable of length 'size' bits
*/
#define UI_TYPE(size) uint_##size##t
#define UNIT_TYPEDEF(x,size) typedef UI_TYPE(size) x
#define BUFR_TYPEDEF(x,size,bsize) typedef UI_TYPE(size) x[bsize / (size >> 3)]
#define UNIT_CAST(x,size) ((UI_TYPE(size) )(x))
#define UPTR_CAST(x,size) ((UI_TYPE(size)*)(x))
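Illustrative use of the unit/buffer macros (the type names gr_unit and gr_buf are made up for the example):

    UNIT_TYPEDEF( gr_unit, 64 );      /* typedef uint_64t gr_unit;                  */
    BUFR_TYPEDEF( gr_buf, 64, 64 );   /* typedef uint_64t gr_buf[8]; 64-byte buffer */
    gr_buf  state;                    /* operated on 64 bits at a time              */
    gr_unit x = UNIT_CAST( 0x80, 64 );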
/* Added by Soeren S. Thomsen (begin) */
#define u8 uint_8t
#define u32 uint_32t
#define u64 uint_64t
/* (end) */
#if defined(__cplusplus)
}
#endif
#endif

algo/groestl/sse2/grso-asm.c (new file, 1063 lines)

File diff suppressed because it is too large

View File

@@ -0,0 +1,10 @@
#ifndef GRSOASM_H
#define GRSOASM_H
#include "grso.h"
void grsoP1024ASM (u64 *x) ;
void grsoQ1024ASM (u64 *x) ;
#endif

File diff suppressed because it is too large

View File

@@ -0,0 +1,11 @@
#ifndef GRSOASM_H
#define GRSOASM_H
/* really same as the mmx asm.h */
/* made just in case something must be changed */
#include "grso.h"
void grsoP1024ASM (u64 *x) ;
void grsoQ1024ASM (u64 *x) ;
#endif

View File

@@ -0,0 +1,110 @@
/* hash.c January 2011
*
* Groestl-512 implementation with inline assembly containing mmx and
* sse instructions. Optimized for Opteron.
* Authors: Krystian Matusiewicz and Soeren S. Thomsen
*
* This code is placed in the public domain
*/
//#include "grso.h"
//#include "grso-asm.h"
// #include "grsotab.h"
#define DECL_GRS
/* load initial constants */
#define GRS_I \
do { \
int i; \
/* set initial value */ \
for (i = 0; i < grsoCOLS-1; i++) sts_grs.grsstate[i] = 0; \
sts_grs.grsstate[grsoCOLS-1] = grsoU64BIG((u64)(8*grsoDIGESTSIZE)); \
\
/* set other variables */ \
sts_grs.grsbuf_ptr = 0; \
sts_grs.grsblock_counter = 0; \
} while (0); \
/* load hash */
#define GRS_U \
do { \
unsigned char* in = hash; \
unsigned long long index = 0; \
\
/* if the buffer contains data that has not yet been digested, first \
add data to buffer until full */ \
if (sts_grs.grsbuf_ptr) { \
while (sts_grs.grsbuf_ptr < grsoSIZE && index < 64) { \
hashbuf[(int)sts_grs.grsbuf_ptr++] = in[index++]; \
} \
if (sts_grs.grsbuf_ptr < grsoSIZE) continue; \
\
/* digest buffer */ \
sts_grs.grsbuf_ptr = 0; \
grsoTransform(&sts_grs, hashbuf, grsoSIZE); \
} \
\
/* digest bulk of message */ \
grsoTransform(&sts_grs, in+index, 64-index); \
index += ((64-index)/grsoSIZE)*grsoSIZE; \
\
/* store remaining data in buffer */ \
while (index < 64) { \
hashbuf[(int)sts_grs.grsbuf_ptr++] = in[index++]; \
} \
\
} while (0);
/* groestl512 hash loaded */
/* hash = groestl512(loaded) */
#define GRS_C \
do { \
char *out = hash; \
int i, j = 0; \
unsigned char *s = (unsigned char*)sts_grs.grsstate; \
\
hashbuf[sts_grs.grsbuf_ptr++] = 0x80; \
\
/* pad with '0'-bits */ \
if (sts_grs.grsbuf_ptr > grsoSIZE-grsoLENGTHFIELDLEN) { \
/* padding requires two blocks */ \
while (sts_grs.grsbuf_ptr < grsoSIZE) { \
hashbuf[sts_grs.grsbuf_ptr++] = 0; \
} \
/* digest first padding block */ \
grsoTransform(&sts_grs, hashbuf, grsoSIZE); \
sts_grs.grsbuf_ptr = 0; \
} \
while (sts_grs.grsbuf_ptr < grsoSIZE-grsoLENGTHFIELDLEN) { \
hashbuf[sts_grs.grsbuf_ptr++] = 0; \
} \
\
/* length padding */ \
sts_grs.grsblock_counter++; \
sts_grs.grsbuf_ptr = grsoSIZE; \
while (sts_grs.grsbuf_ptr > grsoSIZE-grsoLENGTHFIELDLEN) { \
hashbuf[--sts_grs.grsbuf_ptr] = (unsigned char)sts_grs.grsblock_counter; \
sts_grs.grsblock_counter >>= 8; \
} \
\
/* digest final padding block */ \
grsoTransform(&sts_grs, hashbuf, grsoSIZE); \
/* perform output transformation */ \
grsoOutputTransformation(&sts_grs); \
\
/* store hash result in output */ \
for (i = grsoSIZE-grsoDIGESTSIZE; i < grsoSIZE; i++,j++) { \
out[j] = s[i]; \
} \
\
/* zeroise relevant variables and deallocate memory */ \
for (i = 0; i < grsoCOLS; i++) { \
sts_grs.grsstate[i] = 0; \
} \
for (i = 0; i < grsoSIZE; i++) { \
hashbuf[i] = 0; \
} \
} while (0);
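The three macros are intended to be dropped into a caller in sequence, and they assume the caller supplies variables named sts_grs, hashbuf and hash exactly as below; a sketch of the call pattern:

    grsoState     sts_grs;
    unsigned char hashbuf[grsoSIZE];  /* 128-byte working buffer                */
    unsigned char hash[64];           /* 64-byte message in, 64-byte digest out */

    GRS_I                             /* initialise the state                   */
    GRS_U                             /* absorb the 64 bytes held in hash[]     */
    GRS_C                             /* pad, run the final transforms and
                                         write the digest back into hash[]      */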

algo/groestl/sse2/grso.c (new file, 57 lines)
View File

@@ -0,0 +1,57 @@
/* hash.c January 2011
*
* Groestl-512 implementation with inline assembly containing mmx and
* sse instructions. Optimized for Opteron.
* Authors: Krystian Matusiewicz and Soeren S. Thomsen
*
* This code is placed in the public domain
*/
#include "algo/groestl/sse2/grso-asm.h"
#include "algo/groestl/sse2/grso.h"
#include "algo/groestl/sse2/grsotab.h"
/* digest up to len bytes of input (full blocks only) */
void grsoTransform(grsoState *ctx,
const unsigned char *in,
unsigned long long len) {
u64 y[grsoCOLS+2] __attribute__ ((aligned (16)));
u64 z[grsoCOLS+2] __attribute__ ((aligned (16)));
u64 *m, *h = (u64*)ctx->grsstate;
int i;
/* increment block counter */
ctx->grsblock_counter += len/grsoSIZE;
/* digest message, one block at a time */
for (; len >= grsoSIZE; len -= grsoSIZE, in += grsoSIZE) {
m = (u64*)in;
for (i = 0; i < grsoCOLS; i++) {
y[i] = m[i];
z[i] = m[i] ^ h[i];
}
grsoQ1024ASM(y);
grsoP1024ASM(z);
/* h' == h + Q(m) + P(h+m) */
for (i = 0; i < grsoCOLS; i++) {
h[i] ^= z[i] ^ y[i];
}
}
}
/* given state h, do h <- P(h)+h */
void grsoOutputTransformation(grsoState *ctx) {
u64 z[grsoCOLS] __attribute__ ((aligned (16)));
int j;
for (j = 0; j < grsoCOLS; j++) {
z[j] = ctx->grsstate[j];
}
grsoP1024ASM(z);
for (j = 0; j < grsoCOLS; j++) {
ctx->grsstate[j] ^= z[j];
}
}
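In formula form, the two routines above are the Groestl compression function and output transformation:

    f(h, m)  = h XOR P( h XOR m ) XOR Q( m )   /* grsoTransform, per 128-byte block */
    Omega(h) = h XOR P( h )                    /* grsoOutputTransformation          */

The final digest is the last grsoDIGESTSIZE (64) bytes of Omega(h); that truncation is done by the GRS_C macro in the wrapper above, not here.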

algo/groestl/sse2/grso.h (new file, 62 lines)
View File

@@ -0,0 +1,62 @@
#ifndef __hash_h
#define __hash_h
#include <stdio.h>
#include <stdlib.h>
#include "brg_endian.h"
#include "brg_types.h"
/* some sizes (number of bytes) */
#define grsoROWS 8
#define grsoLENGTHFIELDLEN grsoROWS
#define grsoCOLS 16
#define grsoSIZE (grsoROWS*grsoCOLS)
#define grsoDIGESTSIZE 64
#define grsoROUNDS 14
#define grsoROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&((u64)0xffffffffffffffffULL))
#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
#error
#endif /* IS_BIG_ENDIAN */
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*n)))
#define grsoU64BIG(a) \
((grsoROTL64(a, 8) & ((u64)0x000000ff000000ffULL)) | \
(grsoROTL64(a,24) & ((u64)0x0000ff000000ff00ULL)) | \
(grsoROTL64(a,40) & ((u64)0x00ff000000ff0000ULL)) | \
(grsoROTL64(a,56) & ((u64)0xff000000ff000000ULL)))
#endif /* IS_LITTLE_ENDIAN */
typedef struct {
u64 grsstate[grsoCOLS]; /* actual state */
u64 grsblock_counter; /* message block counter */
int grsbuf_ptr; /* data buffer pointer */
} grsoState;
//extern int grsoInit(grsoState* ctx);
//extern int grsoUpdate(grsoState* ctx, const unsigned char* in,
// unsigned long long len);
//extern int grsoUpdateq(grsoState* ctx, const unsigned char* in);
//extern int grsoFinal(grsoState* ctx,
// unsigned char* out);
//
//extern int grsohash(unsigned char *out,
// const unsigned char *in,
// unsigned long long len);
/* digest up to len bytes of input (full blocks only) */
void grsoTransform( grsoState *ctx, const unsigned char *in,
unsigned long long len );
/* given state h, do h <- P(h)+h */
void grsoOutputTransformation( grsoState *ctx );
int grso_init ( grsoState* sts_grs );
int grso_update ( grsoState* sts_grs, char* hashbuf, char* hash );
int grso_close ( grsoState *sts_grs, char* hashbuf, char* hash );
#endif /* __hash_h */

File diff suppressed because one or more lines are too long

View File

@@ -36,7 +36,7 @@
#define SPH_HAMSI_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "algo/sha3/sph_types.h"
#ifdef __cplusplus
extern "C"{

View File

@@ -66,7 +66,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "algo/sha3/sph_types.h"
/**
* Output size (in bits) for HAVAL-128/3.

View File

@@ -1,3 +1,4 @@
#include "miner.h"
#include "algo-gate-api.h"
#include <stdio.h>
@@ -24,7 +25,7 @@
void bastionhash(void *output, const void *input)
{
unsigned char hash[64] __attribute__ ((aligned (64)));
unsigned char _ALIGN(128) hash[64] = { 0 };
#ifdef NO_AES_NI
sph_echo512_context ctx_echo;
@@ -35,6 +36,7 @@ void bastionhash(void *output, const void *input)
sph_fugue512_context ctx_fugue;
sph_whirlpool_context ctx_whirlpool;
sph_shabal512_context ctx_shabal;
// sph_skein512_context ctx_skein;
sph_hamsi512_context ctx_hamsi;
unsigned char hashbuf[128] __attribute__ ((aligned (16)));
@@ -45,10 +47,8 @@ void bastionhash(void *output, const void *input)
HEFTY1(input, 80, hash);
init_luffa( &ctx_luffa, 512 );
update_and_final_luffa( &ctx_luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
// update_luffa( &ctx_luffa, hash, 64 );
// final_luffa( &ctx_luffa, hash );
update_luffa( &ctx_luffa, hash, 64 );
final_luffa( &ctx_luffa, hash );
if (hash[0] & 0x8)
{
@@ -60,6 +60,9 @@ void bastionhash(void *output, const void *input)
SKN_I;
SKN_U;
SKN_C;
// sph_skein512_init(&ctx_skein);
// sph_skein512(&ctx_skein, hash, 64);
// sph_skein512_close(&ctx_skein, hash);
}
sph_whirlpool_init(&ctx_whirlpool);
@@ -78,17 +81,13 @@ void bastionhash(void *output, const void *input)
sph_echo512_close(&ctx_echo, hash);
#else
init_echo( &ctx_echo, 512 );
update_final_echo ( &ctx_echo,(BitSequence*)hash,
(const BitSequence*)hash, 512 );
// update_echo ( &ctx_echo, hash, 512 );
// final_echo( &ctx_echo, hash );
update_echo ( &ctx_echo, hash, 512 );
final_echo( &ctx_echo, hash );
#endif
} else {
init_luffa( &ctx_luffa, 512 );
update_and_final_luffa( &ctx_luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
// update_luffa( &ctx_luffa, hash, 64 );
// final_luffa( &ctx_luffa, hash );
update_luffa( &ctx_luffa, hash, 64 );
final_luffa( &ctx_luffa, hash );
}
sph_shabal512_init(&ctx_shabal);
@@ -99,6 +98,9 @@ void bastionhash(void *output, const void *input)
SKN_I;
SKN_U;
SKN_C;
// sph_skein512_init(&ctx_skein);
// sph_skein512(&ctx_skein, hash, 64);
// sph_skein512_close(&ctx_skein, hash);
if (hash[0] & 0x8)
{
@@ -122,10 +124,8 @@ void bastionhash(void *output, const void *input)
sph_hamsi512_close(&ctx_hamsi, hash);
} else {
init_luffa( &ctx_luffa, 512 );
update_and_final_luffa( &ctx_luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
// update_luffa( &ctx_luffa, hash, 64 );
// final_luffa( &ctx_luffa, hash );
update_luffa( &ctx_luffa, hash, 64 );
final_luffa( &ctx_luffa, hash );
}
memcpy(output, hash, 32);
@@ -170,6 +170,7 @@ bool register_bastion_algo( algo_gate_t* gate )
gate->optimizations = SSE2_OPT | AES_OPT;
gate->scanhash = (void*)&scanhash_bastion;
gate->hash = (void*)&bastionhash;
gate->hash_alt = (void*)&bastionhash;
return true;
};

View File

@@ -2,6 +2,7 @@
#include <openssl/sha.h>
#include <stdint.h>
#include "miner.h"
#include "algo-gate-api.h"
#include "sph_hefty1.h"
#include "algo/keccak/sph_keccak.h"

View File

@@ -1,12 +1,16 @@
#include "miner.h"
#include "algo-gate-api.h"
#include <string.h>
#include <stdint.h>
#include "algo/blake/sph_blake.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/sph_groestl.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/luffa/sph_luffa.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/shavite/sph_shavite.h"
@@ -16,13 +20,14 @@
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sph_sha2.h"
#include "algo/sha2/sph-sha2.h"
#include "algo/haval/sph-haval.h"
#include <openssl/sha.h>
#ifndef NO_AES_NI
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#endif
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h"
@@ -42,11 +47,7 @@ typedef struct {
sph_fugue512_context fugue1, fugue2;
sph_shabal512_context shabal1;
sph_whirlpool_context whirlpool1, whirlpool2, whirlpool3, whirlpool4;
#ifndef USE_SPH_SHA
SHA512_CTX sha1, sha2;
#else
sph_sha512_context sha1, sha2;
#endif
sph_haval256_5_context haval1, haval2;
#ifdef NO_AES_NI
sph_groestl512_context groestl1, groestl2;
@@ -57,8 +58,8 @@ typedef struct {
#endif
} hmq1725_ctx_holder;
static hmq1725_ctx_holder hmq1725_ctx __attribute__ ((aligned (64)));
static __thread sph_bmw512_context hmq_bmw_mid __attribute__ ((aligned (64)));
static hmq1725_ctx_holder hmq1725_ctx;
static __thread sph_bmw512_context hmq_bmw_mid;
void init_hmq1725_ctx()
{
@@ -101,13 +102,9 @@ void init_hmq1725_ctx()
sph_whirlpool_init(&hmq1725_ctx.whirlpool3);
sph_whirlpool_init(&hmq1725_ctx.whirlpool4);
#ifndef USE_SPH_SHA
SHA512_Init( &hmq1725_ctx.sha1 );
SHA512_Init( &hmq1725_ctx.sha2 );
#else
sph_sha512_init(&hmq1725_ctx.sha1);
sph_sha512_init(&hmq1725_ctx.sha2);
#endif
sph_haval256_5_init(&hmq1725_ctx.haval1);
sph_haval256_5_init(&hmq1725_ctx.haval2);
@@ -130,13 +127,13 @@ void hmq_bmw512_midstate( const void* input )
sph_bmw512( &hmq_bmw_mid, input, 64 );
}
__thread hmq1725_ctx_holder h_ctx __attribute__ ((aligned (64)));
__thread hmq1725_ctx_holder h_ctx;
extern void hmq1725hash(void *state, const void *input)
{
const uint32_t mask = 24;
uint32_t hashA[32] __attribute__((aligned(64)));
uint32_t hashB[32] __attribute__((aligned(64)));
uint32_t hashA[16] __attribute__((aligned(64)));
uint32_t hashB[16] __attribute__((aligned(64)));
const int midlen = 64; // bytes
const int tail = 80 - midlen; // 16
@@ -274,13 +271,8 @@ extern void hmq1725hash(void *state, const void *input)
}
else
{
#ifndef USE_SPH_SHA
SHA512_Update( &h_ctx.sha1, hashB, 64 );
SHA512_Final( (unsigned char*) hashA, &h_ctx.sha1 );
#else
sph_sha512 (&h_ctx.sha1, hashB, 64); //7
sph_sha512_close(&h_ctx.sha1, hashA); //8
#endif
}
#ifdef NO_AES_NI
@@ -291,13 +283,8 @@ extern void hmq1725hash(void *state, const void *input)
(const char*)hashA, 512 );
#endif
#ifndef USE_SPH_SHA
SHA512_Update( &h_ctx.sha2, hashB, 64 );
SHA512_Final( (unsigned char*) hashA, &h_ctx.sha2 );
#else
sph_sha512 (&h_ctx.sha2, hashB, 64); //2
sph_sha512_close(&h_ctx.sha2, hashA); //3
#endif
if ( hashA[0] & mask ) //4
{
@@ -320,8 +307,8 @@ extern void hmq1725hash(void *state, const void *input)
int scanhash_hmq1725( int thr_id, struct work *work, int32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t endiandata[32] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
@@ -429,10 +416,11 @@ int scanhash_hmq1725( int thr_id, struct work *work, int32_t max_nonce,
bool register_hmq1725_algo( algo_gate_t* gate )
{
init_hmq1725_ctx();
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->set_target = (void*)&scrypt_set_target;
gate->scanhash = (void*)&scanhash_hmq1725;
gate->hash = (void*)&hmq1725hash;
gate->hash_alt = (void*)&hmq1725hash;
return true;
};

algo/hodl/block.h (new file, 171 lines)
View File

@@ -0,0 +1,171 @@
// Copyright (c) 2009-2010 Satoshi Nakamoto
// Copyright (c) 2009-2013 The Bitcoin Core developers
// Distributed under the MIT software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#ifndef BITCOIN_PRIMITIVES_BLOCK_H
#define BITCOIN_PRIMITIVES_BLOCK_H
#include "serialize.h"
#include "hodl_uint256.h"
/** Nodes collect new transactions into a block, hash them into a hash tree,
* and scan through nonce values to make the block's hash satisfy proof-of-work
* requirements. When they solve the proof-of-work, they broadcast the block
* to everyone and the block is added to the block chain. The first transaction
* in the block is a special one that creates a new coin owned by the creator
* of the block.
*/
class CBlockHeader
{
public:
// header
static const int32_t CURRENT_VERSION=4;
int32_t nVersion;
uint256 hashPrevBlock;
uint256 hashMerkleRoot;
uint32_t nTime;
uint32_t nBits;
uint32_t nNonce;
uint32_t nStartLocation;
uint32_t nFinalCalculation;
CBlockHeader()
{
SetNull();
}
ADD_SERIALIZE_METHODS;
template <typename Stream, typename Operation>
inline void SerializationOp(Stream& s, Operation ser_action, int nType, int nVersion) {
READWRITE(this->nVersion);
nVersion = this->nVersion;
READWRITE(hashPrevBlock);
READWRITE(hashMerkleRoot);
READWRITE(nTime);
READWRITE(nBits);
READWRITE(nNonce);
READWRITE(nStartLocation);
READWRITE(nFinalCalculation);
}
void SetNull()
{
nVersion = CBlockHeader::CURRENT_VERSION;
hashPrevBlock.SetNull();
hashMerkleRoot.SetNull();
nTime = 0;
nBits = 0;
nNonce = 0;
nStartLocation = 0;
nFinalCalculation = 0;
}
bool IsNull() const
{
return (nBits == 0);
}
uint256 GetHash() const;
uint256 GetMidHash() const;
uint256 FindBestPatternHash(int& collisions,char *scratchpad,int nThreads);
uint256 FindBestPatternHash(int& collisions,char *scratchpad);
int64_t GetBlockTime() const
{
return (int64_t)nTime;
}
};
class CBlock : public CBlockHeader
{
public:
// network and disk
//std::vector<CTransaction> vtx;
std::vector<int> vtx;
// memory only
mutable std::vector<uint256> vMerkleTree;
CBlock()
{
SetNull();
}
CBlock(const CBlockHeader &header)
{
SetNull();
*((CBlockHeader*)this) = header;
}
ADD_SERIALIZE_METHODS;
template <typename Stream, typename Operation>
inline void SerializationOp(Stream& s, Operation ser_action, int nType, int nVersion) {
READWRITE(*(CBlockHeader*)this);
READWRITE(vtx);
}
void SetNull()
{
CBlockHeader::SetNull();
vtx.clear();
vMerkleTree.clear();
}
CBlockHeader GetBlockHeader() const
{
CBlockHeader block;
block.nVersion = nVersion;
block.hashPrevBlock = hashPrevBlock;
block.hashMerkleRoot = hashMerkleRoot;
block.nTime = nTime;
block.nBits = nBits;
block.nNonce = nNonce;
block.nStartLocation = nStartLocation;
block.nFinalCalculation = nFinalCalculation;
return block;
}
std::string ToString() const;
};
/** Describes a place in the block chain to another node such that if the
* other node doesn't have the same branch, it can find a recent common trunk.
* The further back it is, the further before the fork it may be.
*/
struct CBlockLocator
{
std::vector<uint256> vHave;
CBlockLocator() {}
CBlockLocator(const std::vector<uint256>& vHaveIn)
{
vHave = vHaveIn;
}
ADD_SERIALIZE_METHODS;
template <typename Stream, typename Operation>
inline void SerializationOp(Stream& s, Operation ser_action, int nType, int nVersion) {
if (!(nType & SER_GETHASH))
READWRITE(nVersion);
READWRITE(vHave);
}
void SetNull()
{
vHave.clear();
}
bool IsNull() const
{
return vHave.empty();
}
};
#endif // BITCOIN_PRIMITIVES_BLOCK_H
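Note that this header extends the standard 80-byte Bitcoin block header with two extra 32-bit fields, nStartLocation and nFinalCalculation, which is why the hodl scanhash later in this diff reads and writes pdata[20] and pdata[21]. Sketch of the resulting work-data layout (indices are 32-bit words, as used in hodl.cpp):

    /* data[ 0]      nVersion
       data[ 1..8]   hashPrevBlock
       data[ 9..16]  hashMerkleRoot
       data[17]      nTime
       data[18]      nBits
       data[19]      nNonce
       data[20]      nStartLocation     (found by the miner)
       data[21]      nFinalCalculation  (found by the miner) */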

algo/hodl/common.h (new file, 70 lines)
View File

@@ -0,0 +1,70 @@
// Copyright (c) 2014 The Bitcoin Core developers
// Distributed under the MIT software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#ifndef BITCOIN_CRYPTO_COMMON_H
#define BITCOIN_CRYPTO_COMMON_H
#if defined(HAVE_CONFIG_H)
#include "bitcoin-config.h"
#endif
#if ((defined(_WIN64) || defined(__WINDOWS__)))
#include "hodl-endian.h"
#endif
#include <stdint.h>
uint16_t static inline ReadLE16(const unsigned char* ptr)
{
return le16toh(*((uint16_t*)ptr));
}
uint32_t static inline ReadLE32(const unsigned char* ptr)
{
return le32toh(*((uint32_t*)ptr));
}
uint64_t static inline ReadLE64(const unsigned char* ptr)
{
return le64toh(*((uint64_t*)ptr));
}
void static inline WriteLE16(unsigned char* ptr, uint16_t x)
{
*((uint16_t*)ptr) = htole16(x);
}
void static inline WriteLE32(unsigned char* ptr, uint32_t x)
{
*((uint32_t*)ptr) = htole32(x);
}
void static inline WriteLE64(unsigned char* ptr, uint64_t x)
{
*((uint64_t*)ptr) = htole64(x);
}
uint32_t static inline ReadBE32(const unsigned char* ptr)
{
return be32toh(*((uint32_t*)ptr));
}
uint64_t static inline ReadBE64(const unsigned char* ptr)
{
return be64toh(*((uint64_t*)ptr));
}
void static inline WriteBE32(unsigned char* ptr, uint32_t x)
{
*((uint32_t*)ptr) = htobe32(x);
}
void static inline WriteBE64(unsigned char* ptr, uint64_t x)
{
*((uint64_t*)ptr) = htobe64(x);
}
#endif // BITCOIN_CRYPTO_COMMON_H
//#endif
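Quick usage sketch of the helpers above (buffer name is illustrative):

    unsigned char buf[4];
    WriteBE32( buf, 0x01020304u );   /* buf = { 0x01, 0x02, 0x03, 0x04 }       */
    uint32_t v = ReadBE32( buf );    /* v == 0x01020304 on either byte order   */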

algo/hodl/hash.cpp (new file, 83 lines)
View File

@@ -0,0 +1,83 @@
// Copyright (c) 2013-2014 The Bitcoin Core developers
// Distributed under the MIT software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#include "hash.h"
#include "common.h"
#include "hmac_sha512.h"
inline uint32_t ROTL32(uint32_t x, int8_t r)
{
return (x << r) | (x >> (32 - r));
}
unsigned int MurmurHash3(unsigned int nHashSeed, const std::vector<unsigned char>& vDataToHash)
{
// The following is MurmurHash3 (x86_32), see http://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp
uint32_t h1 = nHashSeed;
if (vDataToHash.size() > 0)
{
const uint32_t c1 = 0xcc9e2d51;
const uint32_t c2 = 0x1b873593;
const int nblocks = vDataToHash.size() / 4;
//----------
// body
const uint8_t* blocks = &vDataToHash[0] + nblocks * 4;
for (int i = -nblocks; i; i++) {
uint32_t k1 = ReadLE32(blocks + i*4);
k1 *= c1;
k1 = ROTL32(k1, 15);
k1 *= c2;
h1 ^= k1;
h1 = ROTL32(h1, 13);
h1 = h1 * 5 + 0xe6546b64;
}
//----------
// tail
const uint8_t* tail = (const uint8_t*)(&vDataToHash[0] + nblocks * 4);
uint32_t k1 = 0;
switch (vDataToHash.size() & 3) {
case 3:
k1 ^= tail[2] << 16;
case 2:
k1 ^= tail[1] << 8;
case 1:
k1 ^= tail[0];
k1 *= c1;
k1 = ROTL32(k1, 15);
k1 *= c2;
h1 ^= k1;
};
}
//----------
// finalization
h1 ^= vDataToHash.size();
h1 ^= h1 >> 16;
h1 *= 0x85ebca6b;
h1 ^= h1 >> 13;
h1 *= 0xc2b2ae35;
h1 ^= h1 >> 16;
return h1;
}
void BIP32Hash(const ChainCode &chainCode, unsigned int nChild, unsigned char header, const unsigned char data[32], unsigned char output[64])
{
unsigned char num[4];
num[0] = (nChild >> 24) & 0xFF;
num[1] = (nChild >> 16) & 0xFF;
num[2] = (nChild >> 8) & 0xFF;
num[3] = (nChild >> 0) & 0xFF;
CHMAC_SHA512(chainCode.begin(), chainCode.size()).Write(&header, 1).Write(data, 32).Write(num, 4).Finalize(output);
}

algo/hodl/hash.h (new file, 176 lines)
View File

@@ -0,0 +1,176 @@
// Copyright (c) 2009-2010 Satoshi Nakamoto
// Copyright (c) 2009-2013 The Bitcoin Core developers
// Distributed under the MIT software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#ifndef BITCOIN_HASH_H
#define BITCOIN_HASH_H
#include <iostream>
//#include "ripemd160.h"
#include "sha256.h"
#include "serialize.h"
#include "hodl_uint256.h"
//#include "version.h"
#include <vector>
static const int PROTOCOL_VERSION = 70002;
typedef uint256 ChainCode;
/** A hasher class for Bitcoin's 256-bit hash (double SHA-256). */
class CHash256 {
private:
CSHA256 sha;
public:
static const size_t OUTPUT_SIZE = CSHA256::OUTPUT_SIZE;
void Finalize(unsigned char hash[OUTPUT_SIZE]) {
unsigned char buf[sha.OUTPUT_SIZE];
sha.Finalize(buf);
sha.Reset().Write(buf, sha.OUTPUT_SIZE).Finalize(hash);
}
CHash256& Write(const unsigned char *data, size_t len) {
sha.Write(data, len);
return *this;
}
CHash256& Reset() {
sha.Reset();
return *this;
}
};
/** A hasher class for Bitcoin's 160-bit hash (SHA-256 + RIPEMD-160). */
/*
class CHash160 {
private:
CSHA256 sha;
public:
static const size_t OUTPUT_SIZE = CRIPEMD160::OUTPUT_SIZE;
void Finalize(unsigned char hash[OUTPUT_SIZE]) {
unsigned char buf[sha.OUTPUT_SIZE];
sha.Finalize(buf);
CRIPEMD160().Write(buf, sha.OUTPUT_SIZE).Finalize(hash);
}
CHash160& Write(const unsigned char *data, size_t len) {
sha.Write(data, len);
return *this;
}
CHash160& Reset() {
sha.Reset();
return *this;
}
};
*/
/** Compute the 256-bit hash of an object. */
template<typename T1>
inline uint256 Hash(const T1 pbegin, const T1 pend)
{
static const unsigned char pblank[1] = {};
uint256 result;
CHash256().Write(pbegin == pend ? pblank : (const unsigned char*)&pbegin[0], (pend - pbegin) * sizeof(pbegin[0]))
.Finalize((unsigned char*)&result);
return result;
}
/** Compute the 256-bit hash of the concatenation of two objects. */
template<typename T1, typename T2>
inline uint256 Hash(const T1 p1begin, const T1 p1end,
const T2 p2begin, const T2 p2end) {
static const unsigned char pblank[1] = {};
uint256 result;
CHash256().Write(p1begin == p1end ? pblank : (const unsigned char*)&p1begin[0], (p1end - p1begin) * sizeof(p1begin[0]))
.Write(p2begin == p2end ? pblank : (const unsigned char*)&p2begin[0], (p2end - p2begin) * sizeof(p2begin[0]))
.Finalize((unsigned char*)&result);
return result;
}
/** Compute the 256-bit hash of the concatenation of three objects. */
template<typename T1, typename T2, typename T3>
inline uint256 Hash(const T1 p1begin, const T1 p1end,
const T2 p2begin, const T2 p2end,
const T3 p3begin, const T3 p3end) {
static const unsigned char pblank[1] = {};
uint256 result;
CHash256().Write(p1begin == p1end ? pblank : (const unsigned char*)&p1begin[0], (p1end - p1begin) * sizeof(p1begin[0]))
.Write(p2begin == p2end ? pblank : (const unsigned char*)&p2begin[0], (p2end - p2begin) * sizeof(p2begin[0]))
.Write(p3begin == p3end ? pblank : (const unsigned char*)&p3begin[0], (p3end - p3begin) * sizeof(p3begin[0]))
.Finalize((unsigned char*)&result);
return result;
}
/** Compute the 160-bit hash an object. */
/*
template<typename T1>
inline uint160 Hash160(const T1 pbegin, const T1 pend)
{
static unsigned char pblank[1] = {};
uint160 result;
CHash160().Write(pbegin == pend ? pblank : (const unsigned char*)&pbegin[0], (pend - pbegin) * sizeof(pbegin[0]))
.Finalize((unsigned char*)&result);
return result;
}
*/
/** Compute the 160-bit hash of a vector. */
/*
inline uint160 Hash160(const std::vector<unsigned char>& vch)
{
return Hash160(vch.begin(), vch.end());
}
*/
/** A writer stream (for serialization) that computes a 256-bit hash. */
class CHashWriter
{
private:
CHash256 ctx;
public:
int nType;
int nVersion;
CHashWriter(int nTypeIn, int nVersionIn) : nType(nTypeIn), nVersion(nVersionIn) {}
CHashWriter& write(const char *pch, size_t size) {
ctx.Write((const unsigned char*)pch, size);
return (*this);
}
// invalidates the object
uint256 GetHash() {
uint256 result;
ctx.Finalize((unsigned char*)&result);
return result;
}
template<typename T>
CHashWriter& operator<<(const T& obj) {
// Serialize to this stream
::Serialize(*this, obj, nType, nVersion);
return (*this);
}
};
/** Compute the 256-bit hash of an object's serialization. */
template<typename T>
uint256 SerializeHash(const T& obj, int nType=SER_GETHASH, int nVersion=PROTOCOL_VERSION)
{
CHashWriter ss(nType, nVersion);
ss << obj;
return ss.GetHash();
}
unsigned int MurmurHash3(unsigned int nHashSeed, const std::vector<unsigned char>& vDataToHash);
void BIP32Hash(const ChainCode &chainCode, unsigned int nChild, unsigned char header, const unsigned char data[32], unsigned char output[64]);
#endif // BITCOIN_HASH_H

algo/hodl/hmac_sha512.cpp (new file, 33 lines)
View File

@@ -0,0 +1,33 @@
// Copyright (c) 2014 The Bitcoin Core developers
// Distributed under the MIT software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#include "hmac_sha512.h"
#include <string.h>
CHMAC_SHA512::CHMAC_SHA512(const unsigned char* key, size_t keylen)
{
unsigned char rkey[128];
if (keylen <= 128) {
memcpy(rkey, key, keylen);
memset(rkey + keylen, 0, 128 - keylen);
} else {
CSHA512().Write(key, keylen).Finalize(rkey);
memset(rkey + 64, 0, 64);
}
for (int n = 0; n < 128; n++)
rkey[n] ^= 0x5c;
outer.Write(rkey, 128);
for (int n = 0; n < 128; n++)
rkey[n] ^= 0x5c ^ 0x36;
inner.Write(rkey, 128);
}
void CHMAC_SHA512::Finalize(unsigned char hash[OUTPUT_SIZE])
{
unsigned char temp[64];
inner.Finalize(temp);
outer.Write(temp, 64).Finalize(hash);
}
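This is the textbook HMAC construction; the second XOR with 0x5c ^ 0x36 turns the opad-keyed buffer into the ipad-keyed one without re-reading the key. In formula and usage form (key and msg are caller-supplied buffers, names illustrative):

    HMAC-SHA512(K, m) = SHA512( (K' xor opad) || SHA512( (K' xor ipad) || m ) )
      where K' is the key zero-padded (or pre-hashed) to the 128-byte block size,
            opad = 0x5c repeated, ipad = 0x36 repeated

    unsigned char mac[64];
    CHMAC_SHA512( key, keylen ).Write( msg, msglen ).Finalize( mac );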

algo/hodl/hmac_sha512.h (new file, 32 lines)
View File

@@ -0,0 +1,32 @@
// Copyright (c) 2014 The Bitcoin Core developers
// Distributed under the MIT software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#ifndef BITCOIN_CRYPTO_HMAC_SHA512_H
#define BITCOIN_CRYPTO_HMAC_SHA512_H
#include "sha512.h"
#include <stdint.h>
#include <stdlib.h>
/** A hasher class for HMAC-SHA-512. */
class CHMAC_SHA512
{
private:
CSHA512 outer;
CSHA512 inner;
public:
static const size_t OUTPUT_SIZE = 64;
CHMAC_SHA512(const unsigned char* key, size_t keylen);
CHMAC_SHA512& Write(const unsigned char* data, size_t len)
{
inner.Write(data, len);
return *this;
}
void Finalize(unsigned char hash[OUTPUT_SIZE]);
};
#endif // BITCOIN_CRYPTO_HMAC_SHA512_H

View File

@@ -1,7 +1,10 @@
#include <memory.h>
#include <stdlib.h>
#include "miner.h"
//#include "algo-gate-api.h"
#include "hodl-gate.h"
#include "hodl.h"
#include "hodl-wolf.h"
#define HODL_NSTARTLOC_INDEX 20
@@ -94,7 +97,11 @@ bool hodl_do_this_thread( int thr_id )
int hodl_scanhash( int thr_id, struct work* work, uint32_t max_nonce,
uint64_t *hashes_done )
{
#ifndef NO_AES_NI
#ifdef NO_AES_NI
GetPsuedoRandomData( hodl_scratchbuf, work->data, thr_id );
pthread_barrier_wait( &hodl_barrier );
return scanhash_hodl( thr_id, work, max_nonce, hashes_done );
#else
GenRandomGarbage( hodl_scratchbuf, work->data, thr_id );
pthread_barrier_wait( &hodl_barrier );
return scanhash_hodl_wolf( thr_id, work, max_nonce, hashes_done );
@@ -103,10 +110,6 @@ int hodl_scanhash( int thr_id, struct work* work, uint32_t max_nonce,
bool register_hodl_algo( algo_gate_t* gate )
{
#ifdef NO_AES_NI
applog( LOG_ERR, "Only CPUs with AES are supported, use legacy version.");
return false;
#endif
pthread_barrier_init( &hodl_barrier, NULL, opt_n_threads );
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->scanhash = (void*)&hodl_scanhash;

View File

@@ -13,41 +13,35 @@
void GenerateGarbageCore(CacheEntry *Garbage, int ThreadID, int ThreadCount, void *MidHash)
{
#ifdef __AVX__
uint64_t* TempBufs[SHA512_PARALLEL_N] ;
uint64_t* TempBufs[SHA512_PARALLEL_N];
uint64_t* desination[SHA512_PARALLEL_N];
for ( int i=0; i<SHA512_PARALLEL_N; ++i )
{
for (int i=0; i<SHA512_PARALLEL_N; ++i) {
TempBufs[i] = (uint64_t*)malloc(32);
memcpy(TempBufs[i], MidHash, 32);
}
uint32_t StartChunk = ThreadID * (TOTAL_CHUNKS / ThreadCount);
for ( uint32_t i = StartChunk;
i < StartChunk + (TOTAL_CHUNKS / ThreadCount); i+= SHA512_PARALLEL_N )
{
for ( int j=0; j<SHA512_PARALLEL_N; ++j )
{
( (uint32_t*)TempBufs[j] )[0] = i + j;
desination[j] = (uint64_t*)( (uint8_t *)Garbage + ( (i+j)
* GARBAGE_CHUNK_SIZE ) );
for(uint32_t i = StartChunk; i < StartChunk + (TOTAL_CHUNKS / ThreadCount); i+= SHA512_PARALLEL_N) {
for(int j=0; j<SHA512_PARALLEL_N; ++j) {
((uint32_t*)TempBufs[j])[0] = i + j;
desination[j] = (uint64_t*)((uint8_t *)Garbage + ((i+j) * GARBAGE_CHUNK_SIZE));
}
sha512Compute32b_parallel( TempBufs, desination );
sha512Compute32b_parallel(TempBufs, desination);
}
for ( int i=0; i<SHA512_PARALLEL_N; ++i )
free( TempBufs[i] );
for (int i=0; i<SHA512_PARALLEL_N; ++i) {
free(TempBufs[i]);
}
#else
uint32_t TempBuf[8];
memcpy( TempBuf, MidHash, 32 );
memcpy(TempBuf, MidHash, 32);
uint32_t StartChunk = ThreadID * (TOTAL_CHUNKS / ThreadCount);
for ( uint32_t i = StartChunk;
i < StartChunk + (TOTAL_CHUNKS / ThreadCount); ++i )
for(uint32_t i = StartChunk; i < StartChunk + (TOTAL_CHUNKS / ThreadCount); ++i)
{
TempBuf[0] = i;
SHA512( ( uint8_t *)TempBuf, 32,
( (uint8_t *)Garbage ) + ( i * GARBAGE_CHUNK_SIZE ) );
SHA512((uint8_t *)TempBuf, 32, ((uint8_t *)Garbage) + (i * GARBAGE_CHUNK_SIZE));
}
#endif
}

algo/hodl/hodl.cpp (new file, 168 lines)
View File

@@ -0,0 +1,168 @@
#include "miner.h"
#include "hodl-gate.h"
#include "hodl_uint256.h"
#include "hodl_arith_uint256.h"
#include "block.h"
#include <sstream>
#include "tinyformat.h"
#include <unordered_map>
#include "hash.h"
#include <openssl/aes.h>
#include <openssl/evp.h>
#include <openssl/sha.h>
#define BEGIN(a) ((char*)&(a))
#define END(a) ((char*)&((&(a))[1]))
#define PSUEDORANDOM_DATA_SIZE 30 //2^30 = 1GB
#define PSUEDORANDOM_DATA_CHUNK_SIZE 6 //2^6 = 64 bytes //must be same as SHA512_DIGEST_LENGTH 64
#define L2CACHE_TARGET 12 // 2^12 = 4096 bytes
#define AES_ITERATIONS 15
void SHA512Filler(char *mainMemoryPsuedoRandomData, int threadNumber, uint256 midHash){
//Generate pseudo-random data to store in main memory
uint32_t chunks=(1<<(PSUEDORANDOM_DATA_SIZE-PSUEDORANDOM_DATA_CHUNK_SIZE)); //2^(30-6) = 16 mil
uint32_t chunkSize=(1<<(PSUEDORANDOM_DATA_CHUNK_SIZE)); //2^6 = 64 bytes
unsigned char hash_tmp[sizeof(midHash)];
memcpy((char*)&hash_tmp[0], (char*)&midHash, sizeof(midHash) );
uint32_t* index = (uint32_t*)hash_tmp;
// uint32_t chunksToProcess=chunks/totalThreads;
uint32_t chunksToProcess = chunks / opt_n_threads;
uint32_t startChunk=threadNumber*chunksToProcess;
for( uint32_t i = startChunk; i < startChunk+chunksToProcess; i++){
//This changes the first 32-bit word of hash_tmp
*index = i;
SHA512((unsigned char*)hash_tmp, sizeof(hash_tmp), (unsigned char*)&(mainMemoryPsuedoRandomData[i*chunkSize]));
}
}
extern "C"
// max_nonce is not used by this function
int scanhash_hodl( int threadNumber, struct work* work, uint32_t max_nonce,
uint64_t *hashes_done )
{
unsigned char *mainMemoryPsuedoRandomData = hodl_scratchbuf;
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
//retrieve target
std::stringstream s;
for (int i = 7; i>=0; i--)
s << strprintf("%08x", ptarget[i]);
//retrieve previous hash
std::stringstream p;
for (int i = 0; i < 8; i++)
p << strprintf("%08x", swab32(pdata[8 - i]));
//retrieve merkleroot
std::stringstream m;
for (int i = 0; i < 8; i++)
m << strprintf("%08x", swab32(pdata[16 - i]));
CBlock pblock;
pblock.SetNull();
pblock.nVersion=swab32(pdata[0]);
pblock.nNonce=swab32(pdata[19]);
pblock.nTime=swab32(pdata[17]);
pblock.nBits=swab32(pdata[18]);
pblock.hashPrevBlock=uint256S(p.str());
pblock.hashMerkleRoot=uint256S(m.str());
uint256 hashTarget=uint256S(s.str());
int collisions=0;
uint256 hash;
//Begin AES Search
//Allocate temporary memory
uint32_t cacheMemorySize = (1<<L2CACHE_TARGET); //2^12 = 4096 bytes
uint32_t comparisonSize=(1<<(PSUEDORANDOM_DATA_SIZE-L2CACHE_TARGET)); //2^(30-12) = 256K
unsigned char *cacheMemoryOperatingData;
unsigned char *cacheMemoryOperatingData2;
cacheMemoryOperatingData=new unsigned char[cacheMemorySize+16];
cacheMemoryOperatingData2=new unsigned char[cacheMemorySize];
//Create references to data as 32 bit arrays
uint32_t* cacheMemoryOperatingData32 = (uint32_t*)cacheMemoryOperatingData;
uint32_t* cacheMemoryOperatingData322 = (uint32_t*)cacheMemoryOperatingData2;
//Search for pattern in pseudorandom data
unsigned char key[32] = {0};
unsigned char iv[AES_BLOCK_SIZE];
int outlen1, outlen2;
//Iterate over the data
// int searchNumber=comparisonSize/totalThreads;
int searchNumber = comparisonSize / opt_n_threads;
int startLoc=threadNumber*searchNumber;
EVP_CIPHER_CTX ctx;
for(int32_t k = startLoc;k<startLoc+searchNumber && !work_restart[threadNumber].restart;k++){
//copy data to first l2 cache
memcpy((char*)&cacheMemoryOperatingData[0], (char*)&mainMemoryPsuedoRandomData[k*cacheMemorySize], cacheMemorySize);
for(int j=0;j<AES_ITERATIONS;j++){
//use last 4 bytes of first cache as next location
uint32_t nextLocation = cacheMemoryOperatingData32[(cacheMemorySize/4)-1]%comparisonSize;
//Copy data from indicated location to second l2 cache -
memcpy((char*)&cacheMemoryOperatingData2[0], (char*)&mainMemoryPsuedoRandomData[nextLocation*cacheMemorySize], cacheMemorySize);
//XOR location data into second cache
for(uint32_t i = 0; i < cacheMemorySize/4; i++)
cacheMemoryOperatingData322[i] = cacheMemoryOperatingData32[i] ^ cacheMemoryOperatingData322[i];
memcpy(key,(unsigned char*)&cacheMemoryOperatingData2[cacheMemorySize-32],32);
memcpy(iv,(unsigned char*)&cacheMemoryOperatingData2[cacheMemorySize-AES_BLOCK_SIZE],AES_BLOCK_SIZE);
EVP_EncryptInit(&ctx, EVP_aes_256_cbc(), key, iv);
EVP_EncryptUpdate(&ctx, cacheMemoryOperatingData, &outlen1, cacheMemoryOperatingData2, cacheMemorySize);
EVP_EncryptFinal(&ctx, cacheMemoryOperatingData + outlen1, &outlen2);
EVP_CIPHER_CTX_cleanup(&ctx);
}
//use last X bits as solution
uint32_t solution=cacheMemoryOperatingData32[(cacheMemorySize/4)-1]%comparisonSize;
if(solution<1000){
uint32_t proofOfCalculation=cacheMemoryOperatingData32[(cacheMemorySize/4)-2];
pblock.nStartLocation = k;
pblock.nFinalCalculation = proofOfCalculation;
hash = Hash(BEGIN(pblock.nVersion), END(pblock.nFinalCalculation));
collisions++;
if (UintToArith256(hash) <= UintToArith256(hashTarget) && !work_restart[threadNumber].restart){
pdata[21] = swab32(pblock.nFinalCalculation);
pdata[20] = swab32(pblock.nStartLocation);
*hashes_done = collisions;
//free memory
delete [] cacheMemoryOperatingData;
delete [] cacheMemoryOperatingData2;
return 1;
}
}
}
//free memory
delete [] cacheMemoryOperatingData;
delete [] cacheMemoryOperatingData2;
*hashes_done = collisions;
return 0;
}
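Condensed, the search loop above does the following for each candidate start location k assigned to the thread (constants come from the defines at the top of this file):

    /* sketch of the inner search, names illustrative:
         data = scratchpad[ k ]                                  -- one 4 KiB chunk
         repeat AES_ITERATIONS (15) times:
             next = last 32-bit word of data, mod 2^18 chunks
             data = AES-256-CBC( key       = last 32 bytes of (data xor scratchpad[next]),
                                 iv        = last 16 bytes of the same,
                                 plaintext = data xor scratchpad[next] )
         if ( last 32-bit word of data mod 2^18 ) < 1000:
             nStartLocation    = k
             nFinalCalculation = second-to-last 32-bit word of data
             accept if the double-SHA256 of the extended header <= target        */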
extern "C"
void GetPsuedoRandomData( char* mainMemoryPsuedoRandomData, uint32_t *pdata,
int thr_id )
{
//retrieve previous hash
std::stringstream p;
for (int i = 0; i < 8; i++)
p << strprintf("%08x", swab32(pdata[8 - i]));
//retrieve merkleroot
std::stringstream m;
for (int i = 0; i < 8; i++)
m << strprintf("%08x", swab32(pdata[16 - i]));
CBlock pblock;
pblock.SetNull();
pblock.nVersion=swab32(pdata[0]);
pblock.nTime=swab32(pdata[17]);
pblock.nBits=swab32(pdata[18]);
pblock.hashPrevBlock= uint256S(p.str());
pblock.hashMerkleRoot= uint256S(m.str());
pblock.nNonce=swab32(pdata[19]);
uint256 midHash = Hash(BEGIN(pblock.nVersion), END(pblock.nNonce));
SHA512Filler( mainMemoryPsuedoRandomData, thr_id, midHash);
}

algo/hodl/hodl.h (new file, 11 lines)
View File

@@ -0,0 +1,11 @@
extern int scanhash_hodl( int thr_id, struct work* work, uint32_t max_nonce,
uint64_t *hashes_done );
extern void GetPsuedoRandomData( char* mainMemoryPsuedoRandomData,
uint32_t *pdata, int thr_id );
void hodl_set_target( struct work* work, double diff );
void hodl_copy_workdata( struct work* work, struct work* g_work );

258
algo/hodl/hodl_arith_uint256.cpp Normal file
View File

@@ -0,0 +1,258 @@
// Copyright (c) 2009-2010 Satoshi Nakamoto
// Copyright (c) 2009-2014 The Bitcoin developers
// Distributed under the MIT software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#include "hodl_arith_uint256.h"
#include "hodl_uint256.h"
#include "utilstrencodings.h"
#include "common.h"
#include <stdio.h>
#include <string.h>
template <unsigned int BITS>
base_uint<BITS>::base_uint(const std::string& str)
{
SetHex(str);
}
template <unsigned int BITS>
base_uint<BITS>& base_uint<BITS>::operator<<=(unsigned int shift)
{
base_uint<BITS> a(*this);
for (int i = 0; i < WIDTH; i++)
pn[i] = 0;
int k = shift / 32;
shift = shift % 32;
for (int i = 0; i < WIDTH; i++) {
if (i + k + 1 < WIDTH && shift != 0)
pn[i + k + 1] |= (a.pn[i] >> (32 - shift));
if (i + k < WIDTH)
pn[i + k] |= (a.pn[i] << shift);
}
return *this;
}
template <unsigned int BITS>
base_uint<BITS>& base_uint<BITS>::operator>>=(unsigned int shift)
{
base_uint<BITS> a(*this);
for (int i = 0; i < WIDTH; i++)
pn[i] = 0;
int k = shift / 32;
shift = shift % 32;
for (int i = 0; i < WIDTH; i++) {
if (i - k - 1 >= 0 && shift != 0)
pn[i - k - 1] |= (a.pn[i] << (32 - shift));
if (i - k >= 0)
pn[i - k] |= (a.pn[i] >> shift);
}
return *this;
}
template <unsigned int BITS>
base_uint<BITS>& base_uint<BITS>::operator*=(uint32_t b32)
{
uint64_t carry = 0;
for (int i = 0; i < WIDTH; i++) {
uint64_t n = carry + (uint64_t)b32 * pn[i];
pn[i] = n & 0xffffffff;
carry = n >> 32;
}
return *this;
}
template <unsigned int BITS>
base_uint<BITS>& base_uint<BITS>::operator*=(const base_uint& b)
{
base_uint<BITS> a = *this;
*this = 0;
for (int j = 0; j < WIDTH; j++) {
uint64_t carry = 0;
for (int i = 0; i + j < WIDTH; i++) {
uint64_t n = carry + pn[i + j] + (uint64_t)a.pn[j] * b.pn[i];
pn[i + j] = n & 0xffffffff;
carry = n >> 32;
}
}
return *this;
}
template <unsigned int BITS>
base_uint<BITS>& base_uint<BITS>::operator/=(const base_uint& b)
{
base_uint<BITS> div = b; // make a copy, so we can shift.
base_uint<BITS> num = *this; // make a copy, so we can subtract.
*this = 0; // the quotient.
int num_bits = num.bits();
int div_bits = div.bits();
if (div_bits == 0)
throw uint_error("Division by zero");
if (div_bits > num_bits) // the result is certainly 0.
return *this;
int shift = num_bits - div_bits;
div <<= shift; // shift so that div and num align.
while (shift >= 0) {
if (num >= div) {
num -= div;
pn[shift / 32] |= (1 << (shift & 31)); // set a bit of the result.
}
div >>= 1; // shift back.
shift--;
}
// num now contains the remainder of the division.
return *this;
}
template <unsigned int BITS>
int base_uint<BITS>::CompareTo(const base_uint<BITS>& b) const
{
for (int i = WIDTH - 1; i >= 0; i--) {
if (pn[i] < b.pn[i])
return -1;
if (pn[i] > b.pn[i])
return 1;
}
return 0;
}
template <unsigned int BITS>
bool base_uint<BITS>::EqualTo(uint64_t b) const
{
for (int i = WIDTH - 1; i >= 2; i--) {
if (pn[i])
return false;
}
if (pn[1] != (b >> 32))
return false;
if (pn[0] != (b & 0xfffffffful))
return false;
return true;
}
template <unsigned int BITS>
double base_uint<BITS>::getdouble() const
{
double ret = 0.0;
double fact = 1.0;
for (int i = 0; i < WIDTH; i++) {
ret += fact * pn[i];
fact *= 4294967296.0;
}
return ret;
}
template <unsigned int BITS>
std::string base_uint<BITS>::GetHex() const
{
return ArithToUint256(*this).GetHex();
}
template <unsigned int BITS>
void base_uint<BITS>::SetHex(const char* psz)
{
*this = UintToArith256(uint256S(psz));
}
template <unsigned int BITS>
void base_uint<BITS>::SetHex(const std::string& str)
{
SetHex(str.c_str());
}
template <unsigned int BITS>
std::string base_uint<BITS>::ToString() const
{
return (GetHex());
}
template <unsigned int BITS>
unsigned int base_uint<BITS>::bits() const
{
for (int pos = WIDTH - 1; pos >= 0; pos--) {
if (pn[pos]) {
for (int bits = 31; bits > 0; bits--) {
if (pn[pos] & 1 << bits)
return 32 * pos + bits + 1;
}
return 32 * pos + 1;
}
}
return 0;
}
// Explicit instantiations for base_uint<256>
template base_uint<256>::base_uint(const std::string&);
template base_uint<256>& base_uint<256>::operator<<=(unsigned int);
template base_uint<256>& base_uint<256>::operator>>=(unsigned int);
template base_uint<256>& base_uint<256>::operator*=(uint32_t b32);
template base_uint<256>& base_uint<256>::operator*=(const base_uint<256>& b);
template base_uint<256>& base_uint<256>::operator/=(const base_uint<256>& b);
template int base_uint<256>::CompareTo(const base_uint<256>&) const;
template bool base_uint<256>::EqualTo(uint64_t) const;
template double base_uint<256>::getdouble() const;
template std::string base_uint<256>::GetHex() const;
template std::string base_uint<256>::ToString() const;
template void base_uint<256>::SetHex(const char*);
template void base_uint<256>::SetHex(const std::string&);
template unsigned int base_uint<256>::bits() const;
// This implementation directly uses shifts instead of going
// through an intermediate MPI representation.
arith_uint256& arith_uint256::SetCompact(uint32_t nCompact, bool* pfNegative, bool* pfOverflow)
{
int nSize = nCompact >> 24;
uint32_t nWord = nCompact & 0x007fffff;
if (nSize <= 3) {
nWord >>= 8 * (3 - nSize);
*this = nWord;
} else {
*this = nWord;
*this <<= 8 * (nSize - 3);
}
if (pfNegative)
*pfNegative = nWord != 0 && (nCompact & 0x00800000) != 0;
if (pfOverflow)
*pfOverflow = nWord != 0 && ((nSize > 34) ||
(nWord > 0xff && nSize > 33) ||
(nWord > 0xffff && nSize > 32));
return *this;
}
uint32_t arith_uint256::GetCompact(bool fNegative) const
{
int nSize = (bits() + 7) / 8;
uint32_t nCompact = 0;
if (nSize <= 3) {
nCompact = GetLow64() << 8 * (3 - nSize);
} else {
arith_uint256 bn = *this >> 8 * (nSize - 3);
nCompact = bn.GetLow64();
}
// The 0x00800000 bit denotes the sign.
// Thus, if it is already set, divide the mantissa by 256 and increase the exponent.
if (nCompact & 0x00800000) {
nCompact >>= 8;
nSize++;
}
assert((nCompact & ~0x007fffff) == 0);
assert(nSize < 256);
nCompact |= nSize << 24;
nCompact |= (fNegative && (nCompact & 0x007fffff) ? 0x00800000 : 0);
return nCompact;
}
uint256 ArithToUint256(const arith_uint256 &a)
{
uint256 b;
for(int x=0; x<a.WIDTH; ++x)
WriteLE32(b.begin() + x*4, a.pn[x]);
return b;
}
arith_uint256 UintToArith256(const uint256 &a)
{
arith_uint256 b;
for(int x=0; x<b.WIDTH; ++x)
b.pn[x] = ReadLE32(a.begin() + x*4);
return b;
}

290
algo/hodl/hodl_arith_uint256.h Normal file
View File

@@ -0,0 +1,290 @@
// Copyright (c) 2009-2010 Satoshi Nakamoto
// Copyright (c) 2009-2014 The Bitcoin developers
// Distributed under the MIT software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#ifndef BITCOIN_ARITH_UINT256_H
#define BITCOIN_ARITH_UINT256_H
#include <assert.h>
#include <cstring>
#include <stdexcept>
#include <stdint.h>
#include <string>
#include <vector>
class uint256;
class uint_error : public std::runtime_error {
public:
explicit uint_error(const std::string& str) : std::runtime_error(str) {}
};
/** Template base class for unsigned big integers. */
template<unsigned int BITS>
class base_uint
{
protected:
enum { WIDTH=BITS/32 };
uint32_t pn[WIDTH];
public:
base_uint()
{
for (int i = 0; i < WIDTH; i++)
pn[i] = 0;
}
base_uint(const base_uint& b)
{
for (int i = 0; i < WIDTH; i++)
pn[i] = b.pn[i];
}
base_uint& operator=(const base_uint& b)
{
for (int i = 0; i < WIDTH; i++)
pn[i] = b.pn[i];
return *this;
}
base_uint(uint64_t b)
{
pn[0] = (unsigned int)b;
pn[1] = (unsigned int)(b >> 32);
for (int i = 2; i < WIDTH; i++)
pn[i] = 0;
}
explicit base_uint(const std::string& str);
bool operator!() const
{
for (int i = 0; i < WIDTH; i++)
if (pn[i] != 0)
return false;
return true;
}
const base_uint operator~() const
{
base_uint ret;
for (int i = 0; i < WIDTH; i++)
ret.pn[i] = ~pn[i];
return ret;
}
const base_uint operator-() const
{
base_uint ret;
for (int i = 0; i < WIDTH; i++)
ret.pn[i] = ~pn[i];
ret++;
return ret;
}
double getdouble() const;
base_uint& operator=(uint64_t b)
{
pn[0] = (unsigned int)b;
pn[1] = (unsigned int)(b >> 32);
for (int i = 2; i < WIDTH; i++)
pn[i] = 0;
return *this;
}
base_uint& operator^=(const base_uint& b)
{
for (int i = 0; i < WIDTH; i++)
pn[i] ^= b.pn[i];
return *this;
}
base_uint& operator&=(const base_uint& b)
{
for (int i = 0; i < WIDTH; i++)
pn[i] &= b.pn[i];
return *this;
}
base_uint& operator|=(const base_uint& b)
{
for (int i = 0; i < WIDTH; i++)
pn[i] |= b.pn[i];
return *this;
}
base_uint& operator^=(uint64_t b)
{
pn[0] ^= (unsigned int)b;
pn[1] ^= (unsigned int)(b >> 32);
return *this;
}
base_uint& operator|=(uint64_t b)
{
pn[0] |= (unsigned int)b;
pn[1] |= (unsigned int)(b >> 32);
return *this;
}
base_uint& operator<<=(unsigned int shift);
base_uint& operator>>=(unsigned int shift);
base_uint& operator+=(const base_uint& b)
{
uint64_t carry = 0;
for (int i = 0; i < WIDTH; i++)
{
uint64_t n = carry + pn[i] + b.pn[i];
pn[i] = n & 0xffffffff;
carry = n >> 32;
}
return *this;
}
base_uint& operator-=(const base_uint& b)
{
*this += -b;
return *this;
}
base_uint& operator+=(uint64_t b64)
{
base_uint b;
b = b64;
*this += b;
return *this;
}
base_uint& operator-=(uint64_t b64)
{
base_uint b;
b = b64;
*this += -b;
return *this;
}
base_uint& operator*=(uint32_t b32);
base_uint& operator*=(const base_uint& b);
base_uint& operator/=(const base_uint& b);
base_uint& operator++()
{
// prefix operator
int i = 0;
while (++pn[i] == 0 && i < WIDTH-1)
i++;
return *this;
}
const base_uint operator++(int)
{
// postfix operator
const base_uint ret = *this;
++(*this);
return ret;
}
base_uint& operator--()
{
// prefix operator
int i = 0;
while (--pn[i] == (uint32_t)-1 && i < WIDTH-1)
i++;
return *this;
}
const base_uint operator--(int)
{
// postfix operator
const base_uint ret = *this;
--(*this);
return ret;
}
int CompareTo(const base_uint& b) const;
bool EqualTo(uint64_t b) const;
friend inline const base_uint operator+(const base_uint& a, const base_uint& b) { return base_uint(a) += b; }
friend inline const base_uint operator-(const base_uint& a, const base_uint& b) { return base_uint(a) -= b; }
friend inline const base_uint operator*(const base_uint& a, const base_uint& b) { return base_uint(a) *= b; }
friend inline const base_uint operator/(const base_uint& a, const base_uint& b) { return base_uint(a) /= b; }
friend inline const base_uint operator|(const base_uint& a, const base_uint& b) { return base_uint(a) |= b; }
friend inline const base_uint operator&(const base_uint& a, const base_uint& b) { return base_uint(a) &= b; }
friend inline const base_uint operator^(const base_uint& a, const base_uint& b) { return base_uint(a) ^= b; }
friend inline const base_uint operator>>(const base_uint& a, int shift) { return base_uint(a) >>= shift; }
friend inline const base_uint operator<<(const base_uint& a, int shift) { return base_uint(a) <<= shift; }
friend inline const base_uint operator*(const base_uint& a, uint32_t b) { return base_uint(a) *= b; }
friend inline bool operator==(const base_uint& a, const base_uint& b) { return memcmp(a.pn, b.pn, sizeof(a.pn)) == 0; }
friend inline bool operator!=(const base_uint& a, const base_uint& b) { return memcmp(a.pn, b.pn, sizeof(a.pn)) != 0; }
friend inline bool operator>(const base_uint& a, const base_uint& b) { return a.CompareTo(b) > 0; }
friend inline bool operator<(const base_uint& a, const base_uint& b) { return a.CompareTo(b) < 0; }
friend inline bool operator>=(const base_uint& a, const base_uint& b) { return a.CompareTo(b) >= 0; }
friend inline bool operator<=(const base_uint& a, const base_uint& b) { return a.CompareTo(b) <= 0; }
friend inline bool operator==(const base_uint& a, uint64_t b) { return a.EqualTo(b); }
friend inline bool operator!=(const base_uint& a, uint64_t b) { return !a.EqualTo(b); }
std::string GetHex() const;
void SetHex(const char* psz);
void SetHex(const std::string& str);
std::string ToString() const;
unsigned int size() const
{
return sizeof(pn);
}
/**
* Returns the position of the highest bit set plus one, or zero if the
* value is zero.
*/
unsigned int bits() const;
uint64_t GetLow64() const
{
assert(WIDTH >= 2);
return pn[0] | (uint64_t)pn[1] << 32;
}
};
/** 256-bit unsigned big integer. */
class arith_uint256 : public base_uint<256> {
public:
arith_uint256() {}
arith_uint256(const base_uint<256>& b) : base_uint<256>(b) {}
arith_uint256(uint64_t b) : base_uint<256>(b) {}
explicit arith_uint256(const std::string& str) : base_uint<256>(str) {}
/**
* The "compact" format is a representation of a whole
* number N using an unsigned 32bit number similar to a
* floating point format.
* The most significant 8 bits are the unsigned exponent of base 256.
* This exponent can be thought of as "number of bytes of N".
* The lower 23 bits are the mantissa.
* Bit number 24 (0x800000) represents the sign of N.
* N = (-1^sign) * mantissa * 256^(exponent-3)
*
* Satoshi's original implementation used BN_bn2mpi() and BN_mpi2bn().
* MPI uses the most significant bit of the first byte as sign.
* Thus 0x1234560000 is compact (0x05123456)
* and 0xc0de000000 is compact (0x0600c0de)
*
* Bitcoin only uses this "compact" format for encoding difficulty
* targets, which are unsigned 256bit quantities. Thus, all the
* complexities of the sign bit and using base 256 are probably an
* implementation accident.
*/
arith_uint256& SetCompact(uint32_t nCompact, bool *pfNegative = NULL, bool *pfOverflow = NULL);
uint32_t GetCompact(bool fNegative = false) const;
friend uint256 ArithToUint256(const arith_uint256 &);
friend arith_uint256 UintToArith256(const uint256 &);
};
uint256 ArithToUint256(const arith_uint256 &);
arith_uint256 UintToArith256(const uint256 &);
#endif // BITCOIN_ARITH_UINT256_H
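A small worked example of the compact target encoding documented above, sketched against the interface in this header; the function name is invented, and the numbers come from the comment's own 0x05123456 example.
#include "hodl_arith_uint256.h"
#include <cassert>
void compact_roundtrip_example()
{
    bool fNegative = false, fOverflow = false;
    arith_uint256 n;
    // 0x05123456: exponent 0x05 (5 bytes), mantissa 0x123456,
    // so N = 0x123456 * 256^(5-3) = 0x1234560000.
    n.SetCompact( 0x05123456, &fNegative, &fOverflow );
    assert( !fNegative && !fOverflow );
    assert( n == arith_uint256( 0x1234560000ULL ) );
    // And back again: GetCompact() reproduces the original encoding.
    assert( n.GetCompact() == 0x05123456 );
}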

145
algo/hodl/hodl_uint256.cpp Normal file
View File

@@ -0,0 +1,145 @@
// Copyright (c) 2009-2010 Satoshi Nakamoto
// Copyright (c) 2009-2014 The Bitcoin Core developers
// Distributed under the MIT software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#include "hodl_uint256.h"
#include "utilstrencodings.h"
#include <stdio.h>
#include <string.h>
template <unsigned int BITS>
base_blob<BITS>::base_blob(const std::vector<unsigned char>& vch)
{
assert(vch.size() == sizeof(data));
memcpy(data, &vch[0], sizeof(data));
}
template <unsigned int BITS>
std::string base_blob<BITS>::GetHex() const
{
char psz[sizeof(data) * 2 + 1];
for (unsigned int i = 0; i < sizeof(data); i++)
sprintf(psz + i * 2, "%02x", data[sizeof(data) - i - 1]);
return std::string(psz, psz + sizeof(data) * 2);
}
template <unsigned int BITS>
void base_blob<BITS>::SetHex(const char* psz)
{
memset(data, 0, sizeof(data));
// skip leading spaces
while (isspace(*psz))
psz++;
// skip 0x
if (psz[0] == '0' && tolower(psz[1]) == 'x')
psz += 2;
// hex string to uint
const char* pbegin = psz;
while (::HexDigit(*psz) != -1)
psz++;
psz--;
unsigned char* p1 = (unsigned char*)data;
unsigned char* pend = p1 + WIDTH;
while (psz >= pbegin && p1 < pend) {
*p1 = ::HexDigit(*psz--);
if (psz >= pbegin) {
*p1 |= ((unsigned char)::HexDigit(*psz--) << 4);
p1++;
}
}
}
template <unsigned int BITS>
void base_blob<BITS>::SetHex(const std::string& str)
{
SetHex(str.c_str());
}
template <unsigned int BITS>
std::string base_blob<BITS>::ToString() const
{
return (GetHex());
}
// Explicit instantiations for base_blob<160>
template base_blob<160>::base_blob(const std::vector<unsigned char>&);
template std::string base_blob<160>::GetHex() const;
template std::string base_blob<160>::ToString() const;
template void base_blob<160>::SetHex(const char*);
template void base_blob<160>::SetHex(const std::string&);
// Explicit instantiations for base_blob<256>
template base_blob<256>::base_blob(const std::vector<unsigned char>&);
template std::string base_blob<256>::GetHex() const;
template std::string base_blob<256>::ToString() const;
template void base_blob<256>::SetHex(const char*);
template void base_blob<256>::SetHex(const std::string&);
static void inline HashMix(uint32_t& a, uint32_t& b, uint32_t& c)
{
// Taken from lookup3, by Bob Jenkins.
a -= c;
a ^= ((c << 4) | (c >> 28));
c += b;
b -= a;
b ^= ((a << 6) | (a >> 26));
a += c;
c -= b;
c ^= ((b << 8) | (b >> 24));
b += a;
a -= c;
a ^= ((c << 16) | (c >> 16));
c += b;
b -= a;
b ^= ((a << 19) | (a >> 13));
a += c;
c -= b;
c ^= ((b << 4) | (b >> 28));
b += a;
}
static void inline HashFinal(uint32_t& a, uint32_t& b, uint32_t& c)
{
// Taken from lookup3, by Bob Jenkins.
c ^= b;
c -= ((b << 14) | (b >> 18));
a ^= c;
a -= ((c << 11) | (c >> 21));
b ^= a;
b -= ((a << 25) | (a >> 7));
c ^= b;
c -= ((b << 16) | (b >> 16));
a ^= c;
a -= ((c << 4) | (c >> 28));
b ^= a;
b -= ((a << 14) | (a >> 18));
c ^= b;
c -= ((b << 24) | (b >> 8));
}
uint64_t uint256::GetHash(const uint256& salt) const
{
uint32_t a, b, c;
const uint32_t *pn = (const uint32_t*)data;
const uint32_t *salt_pn = (const uint32_t*)salt.data;
a = b = c = 0xdeadbeef + WIDTH;
a += pn[0] ^ salt_pn[0];
b += pn[1] ^ salt_pn[1];
c += pn[2] ^ salt_pn[2];
HashMix(a, b, c);
a += pn[3] ^ salt_pn[3];
b += pn[4] ^ salt_pn[4];
c += pn[5] ^ salt_pn[5];
HashMix(a, b, c);
a += pn[6] ^ salt_pn[6];
b += pn[7] ^ salt_pn[7];
HashFinal(a, b, c);
return ((((uint64_t)b) << 32) | c);
}

158
algo/hodl/hodl_uint256.h Normal file
View File

@@ -0,0 +1,158 @@
// Copyright (c) 2009-2010 Satoshi Nakamoto
// Copyright (c) 2009-2014 The Bitcoin Core developers
// Distributed under the MIT software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#ifndef BITCOIN_UINT256_H
#define BITCOIN_UINT256_H
#include <assert.h>
#include <cstring>
#include <stdexcept>
#include <stdint.h>
#include <string>
#include <vector>
/** Template base class for fixed-sized opaque blobs. */
template<unsigned int BITS>
class base_blob
{
protected:
enum { WIDTH=BITS/8 };
uint8_t data[WIDTH];
public:
base_blob()
{
memset(data, 0, sizeof(data));
}
explicit base_blob(const std::vector<unsigned char>& vch);
bool IsNull() const
{
for (int i = 0; i < WIDTH; i++)
if (data[i] != 0)
return false;
return true;
}
void SetNull()
{
memset(data, 0, sizeof(data));
}
friend inline bool operator==(const base_blob& a, const base_blob& b) { return memcmp(a.data, b.data, sizeof(a.data)) == 0; }
friend inline bool operator!=(const base_blob& a, const base_blob& b) { return memcmp(a.data, b.data, sizeof(a.data)) != 0; }
friend inline bool operator<(const base_blob& a, const base_blob& b) { return memcmp(a.data, b.data, sizeof(a.data)) < 0; }
std::string GetHex() const;
void SetHex(const char* psz);
void SetHex(const std::string& str);
std::string ToString() const;
unsigned char* begin()
{
return &data[0];
}
unsigned char* end()
{
return &data[WIDTH];
}
const unsigned char* begin() const
{
return &data[0];
}
const unsigned char* end() const
{
return &data[WIDTH];
}
unsigned int size() const
{
return sizeof(data);
}
unsigned int GetSerializeSize(int nType, int nVersion) const
{
return sizeof(data);
}
template<typename Stream>
void Serialize(Stream& s, int nType, int nVersion) const
{
s.write((char*)data, sizeof(data));
}
template<typename Stream>
void Unserialize(Stream& s, int nType, int nVersion)
{
s.read((char*)data, sizeof(data));
}
};
/** 160-bit opaque blob.
* @note This type is called uint160 for historical reasons only. It is an opaque
* blob of 160 bits and has no integer operations.
*/
class uint160 : public base_blob<160> {
public:
uint160() {}
uint160(const base_blob<160>& b) : base_blob<160>(b) {}
explicit uint160(const std::vector<unsigned char>& vch) : base_blob<160>(vch) {}
};
/** 256-bit opaque blob.
* @note This type is called uint256 for historical reasons only. It is an
* opaque blob of 256 bits and has no integer operations. Use arith_uint256 if
* those are required.
*/
class uint256 : public base_blob<256> {
public:
uint256() {}
uint256(const base_blob<256>& b) : base_blob<256>(b) {}
explicit uint256(const std::vector<unsigned char>& vch) : base_blob<256>(vch) {}
/** A cheap hash function that just returns 64 bits from the result, it can be
* used when the contents are considered uniformly random. It is not appropriate
* when the value can easily be influenced from outside as e.g. a network adversary could
* provide values to trigger worst-case behavior.
* @note The result of this function is not stable between little and big endian.
*/
uint64_t GetCheapHash() const
{
uint64_t result;
memcpy((void*)&result, (void*)data, 8);
return result;
}
/** A more secure, salted hash function.
* @note This hash is not stable between little and big endian.
*/
uint64_t GetHash(const uint256& salt) const;
};
/* uint256 from const char *.
* This is a separate function because the constructor uint256(const char*) can result
* in dangerously catching uint256(0).
*/
inline uint256 uint256S(const char *str)
{
uint256 rv;
rv.SetHex(str);
return rv;
}
/* uint256 from std::string.
* This is a separate function because the constructor uint256(const std::string &str) can result
* in dangerously catching uint256(0) via std::string(const char*).
*/
inline uint256 uint256S(const std::string& str)
{
uint256 rv;
rv.SetHex(str);
return rv;
}
#endif // BITCOIN_UINT256_H
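A quick sketch of the hex helpers above (the function name is invented): uint256S()/SetHex() parse a big-endian hex string into the internal byte array, which is stored least-significant byte first, and GetHex() renders it back as the same 64-digit string.
#include "hodl_uint256.h"
#include <cassert>
void uint256_hex_example()
{
    uint256 h = uint256S( "0x00000000000000000000000000000000"
                          "000000000000000000000000deadbeef" );
    assert( !h.IsNull() );
    assert( h.begin()[0] == 0xef );   // least-significant byte is stored first
    assert( h.GetHex() == "00000000000000000000000000000000"
                          "000000000000000000000000deadbeef" );
}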

155
algo/hodl/my-byteswap.h Normal file
View File

@@ -0,0 +1,155 @@
/* Macros to swap the order of bytes in integer values.
Copyright (C) 1997-2014 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
#if !defined _BYTESWAP_H && !defined _NETINET_IN_H && !defined _ENDIAN_H
# error "Never use <bits/byteswap.h> directly; include <byteswap.h> instead."
#endif
#ifndef _BITS_BYTESWAP_H
#define _BITS_BYTESWAP_H 1
#include <features.h>
#include <bits/types.h>
#include <bits/wordsize.h>
/* Swap bytes in 16 bit value. */
#define __bswap_constant_16(x) \
((unsigned short int) ((((x) >> 8) & 0xff) | (((x) & 0xff) << 8)))
/* Get __bswap_16. */
#include <bits/byteswap-16.h>
/* Swap bytes in 32 bit value. */
#define __bswap_constant_32(x) \
((((x) & 0xff000000) >> 24) | (((x) & 0x00ff0000) >> 8) | \
(((x) & 0x0000ff00) << 8) | (((x) & 0x000000ff) << 24))
#ifdef __GNUC__
# if __GNUC_PREREQ (4, 3)
static __inline unsigned int
__bswap_32 (unsigned int __bsx)
{
return __builtin_bswap32 (__bsx);
}
# elif __GNUC__ >= 2
# if __WORDSIZE == 64 || (defined __i486__ || defined __pentium__ \
|| defined __pentiumpro__ || defined __pentium4__ \
|| defined __k8__ || defined __athlon__ \
|| defined __k6__ || defined __nocona__ \
|| defined __core2__ || defined __geode__ \
|| defined __amdfam10__)
/* To swap the bytes in a word the i486 processors and up provide the
`bswap' opcode. On i386 we have to use three instructions. */
# define __bswap_32(x) \
(__extension__ \
({ unsigned int __v, __x = (x); \
if (__builtin_constant_p (__x)) \
__v = __bswap_constant_32 (__x); \
else \
__asm__ ("bswap %0" : "=r" (__v) : "0" (__x)); \
__v; }))
# else
# define __bswap_32(x) \
(__extension__ \
({ unsigned int __v, __x = (x); \
if (__builtin_constant_p (__x)) \
__v = __bswap_constant_32 (__x); \
else \
__asm__ ("rorw $8, %w0;" \
"rorl $16, %0;" \
"rorw $8, %w0" \
: "=r" (__v) \
: "0" (__x) \
: "cc"); \
__v; }))
# endif
# else
# define __bswap_32(x) \
(__extension__ \
({ unsigned int __x = (x); __bswap_constant_32 (__x); }))
# endif
#else
static __inline unsigned int
__bswap_32 (unsigned int __bsx)
{
return __bswap_constant_32 (__bsx);
}
#endif
#if __GNUC_PREREQ (2, 0)
/* Swap bytes in 64 bit value. */
# define __bswap_constant_64(x) \
(__extension__ ((((x) & 0xff00000000000000ull) >> 56) \
| (((x) & 0x00ff000000000000ull) >> 40) \
| (((x) & 0x0000ff0000000000ull) >> 24) \
| (((x) & 0x000000ff00000000ull) >> 8) \
| (((x) & 0x00000000ff000000ull) << 8) \
| (((x) & 0x0000000000ff0000ull) << 24) \
| (((x) & 0x000000000000ff00ull) << 40) \
| (((x) & 0x00000000000000ffull) << 56)))
# if __GNUC_PREREQ (4, 3)
static __inline __uint64_t
__bswap_64 (__uint64_t __bsx)
{
return __builtin_bswap64 (__bsx);
}
# elif __WORDSIZE == 64
# define __bswap_64(x) \
(__extension__ \
({ __uint64_t __v, __x = (x); \
if (__builtin_constant_p (__x)) \
__v = __bswap_constant_64 (__x); \
else \
__asm__ ("bswap %q0" : "=r" (__v) : "0" (__x)); \
__v; }))
# else
# define __bswap_64(x) \
(__extension__ \
({ union { __extension__ __uint64_t __ll; \
unsigned int __l[2]; } __w, __r; \
if (__builtin_constant_p (x)) \
__r.__ll = __bswap_constant_64 (x); \
else \
{ \
__w.__ll = (x); \
__r.__l[0] = __bswap_32 (__w.__l[1]); \
__r.__l[1] = __bswap_32 (__w.__l[0]); \
} \
__r.__ll; }))
# endif
#else
# define __bswap_constant_64(x) \
((((x) & 0xff00000000000000ull) >> 56) \
| (((x) & 0x00ff000000000000ull) >> 40) \
| (((x) & 0x0000ff0000000000ull) >> 24) \
| (((x) & 0x000000ff00000000ull) >> 8) \
| (((x) & 0x00000000ff000000ull) << 8) \
| (((x) & 0x0000000000ff0000ull) << 24) \
| (((x) & 0x000000000000ff00ull) << 40) \
| (((x) & 0x00000000000000ffull) << 56))
static __inline __uint64_t
__bswap_64 (__uint64_t __bsx)
{
return __bswap_constant_64 (__bsx);
}
#endif
#endif /* _BITS_BYTESWAP_H */

103
algo/hodl/my-endian.h Normal file
View File

@@ -0,0 +1,103 @@
/* Copyright (C) 1992-2014 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
// cloned from /usr/include/endian.h and modified
#ifndef _ENDIAN_H
#define _ENDIAN_H 1
//#include <features.h>
/* Definitions for byte order, according to significance of bytes,
from low addresses to high addresses. The value is what you get by
putting '4' in the most significant byte, '3' in the second most
significant byte, '2' in the second least significant byte, and '1'
in the least significant byte, and then writing down one digit for
each byte, starting with the byte at the lowest address at the left,
and proceeding to the byte with the highest address at the right. */
#define __LITTLE_ENDIAN 1234
#define __BIG_ENDIAN 4321
#define __PDP_ENDIAN 3412
/* This file defines `__BYTE_ORDER' for the particular machine. */
//#include <bits/endian.h>
#define __BYTE_ORDER __LITTLE_ENDIAN
/* Some machines may need to use a different endianness for floating point
values. */
#ifndef __FLOAT_WORD_ORDER
# define __FLOAT_WORD_ORDER __BYTE_ORDER
#endif
#ifdef __USE_BSD
# define LITTLE_ENDIAN __LITTLE_ENDIAN
# define BIG_ENDIAN __BIG_ENDIAN
# define PDP_ENDIAN __PDP_ENDIAN
# define BYTE_ORDER __BYTE_ORDER
#endif
#if __BYTE_ORDER == __LITTLE_ENDIAN
# define __LONG_LONG_PAIR(HI, LO) LO, HI
#elif __BYTE_ORDER == __BIG_ENDIAN
# define __LONG_LONG_PAIR(HI, LO) HI, LO
#endif
#if defined __USE_BSD && !defined __ASSEMBLER__
/* Conversion interfaces. */
//# include <bits/byteswap.h>
#include "my-byteswap.h"
# if __BYTE_ORDER == __LITTLE_ENDIAN
# define htobe16(x) __bswap_16 (x)
# define htole16(x) (x)
# define be16toh(x) __bswap_16 (x)
# define le16toh(x) (x)
# define htobe32(x) __bswap_32 (x)
# define htole32(x) (x)
# define be32toh(x) __bswap_32 (x)
# define le32toh(x) (x)
# define htobe64(x) __bswap_64 (x)
# define htole64(x) (x)
# define be64toh(x) __bswap_64 (x)
# define le64toh(x) (x)
# else
# define htobe16(x) (x)
# define htole16(x) __bswap_16 (x)
# define be16toh(x) (x)
# define le16toh(x) __bswap_16 (x)
# define htobe32(x) (x)
# define htole32(x) __bswap_32 (x)
# define be32toh(x) (x)
# define le32toh(x) __bswap_32 (x)
# define htobe64(x) (x)
# define htole64(x) __bswap_64 (x)
# define be64toh(x) (x)
# define le64toh(x) __bswap_64 (x)
# endif
#endif
#endif /* endian.h */

862
algo/hodl/serialize.h Normal file
View File

@@ -0,0 +1,862 @@
// Copyright (c) 2009-2010 Satoshi Nakamoto
// Copyright (c) 2009-2014 The Bitcoin Core developers
// Distributed under the MIT software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#ifndef BITCOIN_SERIALIZE_H
#define BITCOIN_SERIALIZE_H
#if ((defined(_WIN64) || defined(__WINDOWS__)))
#include "hodl-endian.h"
#endif
#include <algorithm>
#include <assert.h>
#include <ios>
#include <limits>
#include <map>
#include <set>
#include <stdint.h>
#include <string>
#include <string.h>
#include <utility>
#include <vector>
class CScript;
static const unsigned int MAX_SIZE = 0x02000000;
/**
* Used to bypass the rule against non-const reference to temporary
* where it makes sense with wrappers such as CFlatData or CTxDB
*/
template<typename T>
inline T& REF(const T& val)
{
return const_cast<T&>(val);
}
/**
* Used to acquire a non-const pointer "this" to generate bodies
* of const serialization operations from a template
*/
template<typename T>
inline T* NCONST_PTR(const T* val)
{
return const_cast<T*>(val);
}
/**
* Get begin pointer of vector (non-const version).
* @note These functions avoid the undefined case of indexing into an empty
* vector, as well as that of indexing after the end of the vector.
*/
template <class T, class TAl>
inline T* begin_ptr(std::vector<T,TAl>& v)
{
return v.empty() ? NULL : &v[0];
}
/** Get begin pointer of vector (const version) */
template <class T, class TAl>
inline const T* begin_ptr(const std::vector<T,TAl>& v)
{
return v.empty() ? NULL : &v[0];
}
/** Get end pointer of vector (non-const version) */
template <class T, class TAl>
inline T* end_ptr(std::vector<T,TAl>& v)
{
return v.empty() ? NULL : (&v[0] + v.size());
}
/** Get end pointer of vector (const version) */
template <class T, class TAl>
inline const T* end_ptr(const std::vector<T,TAl>& v)
{
return v.empty() ? NULL : (&v[0] + v.size());
}
/*
* Lowest-level serialization and conversion.
* @note Sizes of these types are verified in the tests
*/
template<typename Stream> inline void ser_writedata8(Stream &s, uint8_t obj)
{
s.write((char*)&obj, 1);
}
template<typename Stream> inline void ser_writedata16(Stream &s, uint16_t obj)
{
obj = htole16(obj);
s.write((char*)&obj, 2);
}
template<typename Stream> inline void ser_writedata32(Stream &s, uint32_t obj)
{
obj = htole32(obj);
s.write((char*)&obj, 4);
}
template<typename Stream> inline void ser_writedata64(Stream &s, uint64_t obj)
{
obj = htole64(obj);
s.write((char*)&obj, 8);
}
template<typename Stream> inline uint8_t ser_readdata8(Stream &s)
{
uint8_t obj;
s.read((char*)&obj, 1);
return obj;
}
template<typename Stream> inline uint16_t ser_readdata16(Stream &s)
{
uint16_t obj;
s.read((char*)&obj, 2);
return le16toh(obj);
}
template<typename Stream> inline uint32_t ser_readdata32(Stream &s)
{
uint32_t obj;
s.read((char*)&obj, 4);
return le32toh(obj);
}
template<typename Stream> inline uint64_t ser_readdata64(Stream &s)
{
uint64_t obj;
s.read((char*)&obj, 8);
return le64toh(obj);
}
inline uint64_t ser_double_to_uint64(double x)
{
union { double x; uint64_t y; } tmp;
tmp.x = x;
return tmp.y;
}
inline uint32_t ser_float_to_uint32(float x)
{
union { float x; uint32_t y; } tmp;
tmp.x = x;
return tmp.y;
}
inline double ser_uint64_to_double(uint64_t y)
{
union { double x; uint64_t y; } tmp;
tmp.y = y;
return tmp.x;
}
inline float ser_uint32_to_float(uint32_t y)
{
union { float x; uint32_t y; } tmp;
tmp.y = y;
return tmp.x;
}
/////////////////////////////////////////////////////////////////
//
// Templates for serializing to anything that looks like a stream,
// i.e. anything that supports .read(char*, size_t) and .write(char*, size_t)
//
enum
{
// primary actions
SER_NETWORK = (1 << 0),
SER_DISK = (1 << 1),
SER_GETHASH = (1 << 2),
};
#define READWRITE(obj) (::SerReadWrite(s, (obj), nType, nVersion, ser_action))
/**
* Implement three methods for serializable objects. These are actually wrappers over
* "SerializationOp" template, which implements the body of each class' serialization
* code. Adding "ADD_SERIALIZE_METHODS" in the body of the class causes these wrappers to be
* added as members.
*/
#define ADD_SERIALIZE_METHODS \
size_t GetSerializeSize(int nType, int nVersion) const { \
CSizeComputer s(nType, nVersion); \
NCONST_PTR(this)->SerializationOp(s, CSerActionSerialize(), nType, nVersion);\
return s.size(); \
} \
template<typename Stream> \
void Serialize(Stream& s, int nType, int nVersion) const { \
NCONST_PTR(this)->SerializationOp(s, CSerActionSerialize(), nType, nVersion);\
} \
template<typename Stream> \
void Unserialize(Stream& s, int nType, int nVersion) { \
SerializationOp(s, CSerActionUnserialize(), nType, nVersion); \
}
/*
* Basic Types
*/
inline unsigned int GetSerializeSize(char a, int, int=0) { return 1; }
inline unsigned int GetSerializeSize(int8_t a, int, int=0) { return 1; }
inline unsigned int GetSerializeSize(uint8_t a, int, int=0) { return 1; }
inline unsigned int GetSerializeSize(int16_t a, int, int=0) { return 2; }
inline unsigned int GetSerializeSize(uint16_t a, int, int=0) { return 2; }
inline unsigned int GetSerializeSize(int32_t a, int, int=0) { return 4; }
inline unsigned int GetSerializeSize(uint32_t a, int, int=0) { return 4; }
inline unsigned int GetSerializeSize(int64_t a, int, int=0) { return 8; }
inline unsigned int GetSerializeSize(uint64_t a, int, int=0) { return 8; }
inline unsigned int GetSerializeSize(float a, int, int=0) { return 4; }
inline unsigned int GetSerializeSize(double a, int, int=0) { return 8; }
template<typename Stream> inline void Serialize(Stream& s, char a, int, int=0) { ser_writedata8(s, a); } // TODO Get rid of bare char
template<typename Stream> inline void Serialize(Stream& s, int8_t a, int, int=0) { ser_writedata8(s, a); }
template<typename Stream> inline void Serialize(Stream& s, uint8_t a, int, int=0) { ser_writedata8(s, a); }
template<typename Stream> inline void Serialize(Stream& s, int16_t a, int, int=0) { ser_writedata16(s, a); }
template<typename Stream> inline void Serialize(Stream& s, uint16_t a, int, int=0) { ser_writedata16(s, a); }
template<typename Stream> inline void Serialize(Stream& s, int32_t a, int, int=0) { ser_writedata32(s, a); }
template<typename Stream> inline void Serialize(Stream& s, uint32_t a, int, int=0) { ser_writedata32(s, a); }
template<typename Stream> inline void Serialize(Stream& s, int64_t a, int, int=0) { ser_writedata64(s, a); }
template<typename Stream> inline void Serialize(Stream& s, uint64_t a, int, int=0) { ser_writedata64(s, a); }
template<typename Stream> inline void Serialize(Stream& s, float a, int, int=0) { ser_writedata32(s, ser_float_to_uint32(a)); }
template<typename Stream> inline void Serialize(Stream& s, double a, int, int=0) { ser_writedata64(s, ser_double_to_uint64(a)); }
template<typename Stream> inline void Unserialize(Stream& s, char& a, int, int=0) { a = ser_readdata8(s); } // TODO Get rid of bare char
template<typename Stream> inline void Unserialize(Stream& s, int8_t& a, int, int=0) { a = ser_readdata8(s); }
template<typename Stream> inline void Unserialize(Stream& s, uint8_t& a, int, int=0) { a = ser_readdata8(s); }
template<typename Stream> inline void Unserialize(Stream& s, int16_t& a, int, int=0) { a = ser_readdata16(s); }
template<typename Stream> inline void Unserialize(Stream& s, uint16_t& a, int, int=0) { a = ser_readdata16(s); }
template<typename Stream> inline void Unserialize(Stream& s, int32_t& a, int, int=0) { a = ser_readdata32(s); }
template<typename Stream> inline void Unserialize(Stream& s, uint32_t& a, int, int=0) { a = ser_readdata32(s); }
template<typename Stream> inline void Unserialize(Stream& s, int64_t& a, int, int=0) { a = ser_readdata64(s); }
template<typename Stream> inline void Unserialize(Stream& s, uint64_t& a, int, int=0) { a = ser_readdata64(s); }
template<typename Stream> inline void Unserialize(Stream& s, float& a, int, int=0) { a = ser_uint32_to_float(ser_readdata32(s)); }
template<typename Stream> inline void Unserialize(Stream& s, double& a, int, int=0) { a = ser_uint64_to_double(ser_readdata64(s)); }
inline unsigned int GetSerializeSize(bool a, int, int=0) { return sizeof(char); }
template<typename Stream> inline void Serialize(Stream& s, bool a, int, int=0) { char f=a; ser_writedata8(s, f); }
template<typename Stream> inline void Unserialize(Stream& s, bool& a, int, int=0) { char f=ser_readdata8(s); a=f; }
/**
* Compact Size
* size < 253 -- 1 byte
* size <= USHRT_MAX -- 3 bytes (253 + 2 bytes)
* size <= UINT_MAX -- 5 bytes (254 + 4 bytes)
* size > UINT_MAX -- 9 bytes (255 + 8 bytes)
*/
inline unsigned int GetSizeOfCompactSize(uint64_t nSize)
{
if (nSize < 253) return sizeof(unsigned char);
else if (nSize <= std::numeric_limits<unsigned short>::max()) return sizeof(unsigned char) + sizeof(unsigned short);
else if (nSize <= std::numeric_limits<unsigned int>::max()) return sizeof(unsigned char) + sizeof(unsigned int);
else return sizeof(unsigned char) + sizeof(uint64_t);
}
template<typename Stream>
void WriteCompactSize(Stream& os, uint64_t nSize)
{
if (nSize < 253)
{
ser_writedata8(os, nSize);
}
else if (nSize <= std::numeric_limits<unsigned short>::max())
{
ser_writedata8(os, 253);
ser_writedata16(os, nSize);
}
else if (nSize <= std::numeric_limits<unsigned int>::max())
{
ser_writedata8(os, 254);
ser_writedata32(os, nSize);
}
else
{
ser_writedata8(os, 255);
ser_writedata64(os, nSize);
}
return;
}
template<typename Stream>
uint64_t ReadCompactSize(Stream& is)
{
uint8_t chSize = ser_readdata8(is);
uint64_t nSizeRet = 0;
if (chSize < 253)
{
nSizeRet = chSize;
}
else if (chSize == 253)
{
nSizeRet = ser_readdata16(is);
if (nSizeRet < 253)
throw std::ios_base::failure("non-canonical ReadCompactSize()");
}
else if (chSize == 254)
{
nSizeRet = ser_readdata32(is);
if (nSizeRet < 0x10000u)
throw std::ios_base::failure("non-canonical ReadCompactSize()");
}
else
{
nSizeRet = ser_readdata64(is);
if (nSizeRet < 0x100000000ULL)
throw std::ios_base::failure("non-canonical ReadCompactSize()");
}
if (nSizeRet > (uint64_t)MAX_SIZE)
throw std::ios_base::failure("ReadCompactSize(): size too large");
return nSizeRet;
}
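// Illustrative sketch (not part of the original header): exercising the size
// rules documented above. The encoded byte sequences are shown for reference;
// the helper name is invented.
inline void CompactSizeExample()
{
    assert( GetSizeOfCompactSize( 5 ) == 1 );           // 0x05
    assert( GetSizeOfCompactSize( 515 ) == 3 );         // 0xFD 0x03 0x02
    assert( GetSizeOfCompactSize( 70000 ) == 5 );       // 0xFE 0x70 0x11 0x01 0x00
    assert( GetSizeOfCompactSize( 1ULL << 33 ) == 9 );  // 0xFF + 8-byte little-endian value
    // ReadCompactSize() additionally rejects padded forms such as 0xFD 0x05 0x00
    // ("non-canonical") even though they decode to a valid size.
}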
/**
* Variable-length integers: bytes are a MSB base-128 encoding of the number.
* The high bit in each byte signifies whether another digit follows. To make
* sure the encoding is one-to-one, one is subtracted from all but the last digit.
* Thus, the byte sequence a[] with length len, where all but the last byte
* has bit 128 set, encodes the number:
*
* (a[len-1] & 0x7F) + sum(i=1..len-1, 128^i*((a[len-i-1] & 0x7F)+1))
*
* Properties:
* * Very small (0-127: 1 byte, 128-16511: 2 bytes, 16512-2113663: 3 bytes)
* * Every integer has exactly one encoding
* * Encoding does not depend on size of original integer type
* * No redundancy: every (infinite) byte sequence corresponds to a list
* of encoded integers.
*
* 0: [0x00] 256: [0x81 0x00]
* 1: [0x01] 16383: [0xFE 0x7F]
* 127: [0x7F] 16384: [0xFF 0x00]
* 128: [0x80 0x00] 16511: [0xFF 0x7F]
* 255: [0x80 0x7F] 65535: [0x82 0xFE 0x7F]
* 2^32: [0x8E 0xFE 0xFE 0xFF 0x00]
*/
template<typename I>
inline unsigned int GetSizeOfVarInt(I n)
{
int nRet = 0;
while(true) {
nRet++;
if (n <= 0x7F)
break;
n = (n >> 7) - 1;
}
return nRet;
}
template<typename Stream, typename I>
void WriteVarInt(Stream& os, I n)
{
unsigned char tmp[(sizeof(n)*8+6)/7];
int len=0;
while(true) {
tmp[len] = (n & 0x7F) | (len ? 0x80 : 0x00);
if (n <= 0x7F)
break;
n = (n >> 7) - 1;
len++;
}
do {
ser_writedata8(os, tmp[len]);
} while(len--);
}
template<typename Stream, typename I>
I ReadVarInt(Stream& is)
{
I n = 0;
while(true) {
unsigned char chData = ser_readdata8(is);
n = (n << 7) | (chData & 0x7F);
if (chData & 0x80)
n++;
else
return n;
}
}
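// Illustrative sketch (not part of the original header): round-tripping the
// MSB base-128 encoding documented above through a minimal in-memory stream.
// The stream type and helper name are invented for the example.
struct CVarIntDemoStream
{
    std::vector<char> buf;
    size_t pos;
    CVarIntDemoStream() : pos(0) {}
    void write( const char* p, size_t n ) { buf.insert( buf.end(), p, p + n ); }
    void read( char* p, size_t n ) { memcpy( p, &buf[pos], n ); pos += n; }
};
inline void VarIntExample()
{
    CVarIntDemoStream s;
    WriteVarInt( s, (uint64_t)16511 );   // encodes as 0xFF 0x7F
    assert( s.buf.size() == 2 && GetSizeOfVarInt( (uint64_t)16511 ) == 2 );
    assert(( ReadVarInt<CVarIntDemoStream,uint64_t>( s ) == 16511 ));
}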
#define FLATDATA(obj) REF(CFlatData((char*)&(obj), (char*)&(obj) + sizeof(obj)))
#define VARINT(obj) REF(WrapVarInt(REF(obj)))
#define LIMITED_STRING(obj,n) REF(LimitedString< n >(REF(obj)))
/**
* Wrapper for serializing arrays and POD.
*/
class CFlatData
{
protected:
char* pbegin;
char* pend;
public:
CFlatData(void* pbeginIn, void* pendIn) : pbegin((char*)pbeginIn), pend((char*)pendIn) { }
template <class T, class TAl>
explicit CFlatData(std::vector<T,TAl> &v)
{
pbegin = (char*)begin_ptr(v);
pend = (char*)end_ptr(v);
}
char* begin() { return pbegin; }
const char* begin() const { return pbegin; }
char* end() { return pend; }
const char* end() const { return pend; }
unsigned int GetSerializeSize(int, int=0) const
{
return pend - pbegin;
}
template<typename Stream>
void Serialize(Stream& s, int, int=0) const
{
s.write(pbegin, pend - pbegin);
}
template<typename Stream>
void Unserialize(Stream& s, int, int=0)
{
s.read(pbegin, pend - pbegin);
}
};
template<typename I>
class CVarInt
{
protected:
I &n;
public:
CVarInt(I& nIn) : n(nIn) { }
unsigned int GetSerializeSize(int, int) const {
return GetSizeOfVarInt<I>(n);
}
template<typename Stream>
void Serialize(Stream &s, int, int) const {
WriteVarInt<Stream,I>(s, n);
}
template<typename Stream>
void Unserialize(Stream& s, int, int) {
n = ReadVarInt<Stream,I>(s);
}
};
template<size_t Limit>
class LimitedString
{
protected:
std::string& string;
public:
LimitedString(std::string& string) : string(string) {}
template<typename Stream>
void Unserialize(Stream& s, int, int=0)
{
size_t size = ReadCompactSize(s);
if (size > Limit) {
throw std::ios_base::failure("String length limit exceeded");
}
string.resize(size);
if (size != 0)
s.read((char*)&string[0], size);
}
template<typename Stream>
void Serialize(Stream& s, int, int=0) const
{
WriteCompactSize(s, string.size());
if (!string.empty())
s.write((char*)&string[0], string.size());
}
unsigned int GetSerializeSize(int, int=0) const
{
return GetSizeOfCompactSize(string.size()) + string.size();
}
};
template<typename I>
CVarInt<I> WrapVarInt(I& n) { return CVarInt<I>(n); }
/**
* Forward declarations
*/
/**
* string
*/
template<typename C> unsigned int GetSerializeSize(const std::basic_string<C>& str, int, int=0);
template<typename Stream, typename C> void Serialize(Stream& os, const std::basic_string<C>& str, int, int=0);
template<typename Stream, typename C> void Unserialize(Stream& is, std::basic_string<C>& str, int, int=0);
/**
* vector
* vectors of unsigned char are a special case and are intended to be serialized as a single opaque blob.
*/
template<typename T, typename A> unsigned int GetSerializeSize_impl(const std::vector<T, A>& v, int nType, int nVersion, const unsigned char&);
template<typename T, typename A, typename V> unsigned int GetSerializeSize_impl(const std::vector<T, A>& v, int nType, int nVersion, const V&);
template<typename T, typename A> inline unsigned int GetSerializeSize(const std::vector<T, A>& v, int nType, int nVersion);
template<typename Stream, typename T, typename A> void Serialize_impl(Stream& os, const std::vector<T, A>& v, int nType, int nVersion, const unsigned char&);
template<typename Stream, typename T, typename A, typename V> void Serialize_impl(Stream& os, const std::vector<T, A>& v, int nType, int nVersion, const V&);
template<typename Stream, typename T, typename A> inline void Serialize(Stream& os, const std::vector<T, A>& v, int nType, int nVersion);
template<typename Stream, typename T, typename A> void Unserialize_impl(Stream& is, std::vector<T, A>& v, int nType, int nVersion, const unsigned char&);
template<typename Stream, typename T, typename A, typename V> void Unserialize_impl(Stream& is, std::vector<T, A>& v, int nType, int nVersion, const V&);
template<typename Stream, typename T, typename A> inline void Unserialize(Stream& is, std::vector<T, A>& v, int nType, int nVersion);
/**
* others derived from vector
*/
extern inline unsigned int GetSerializeSize(const CScript& v, int nType, int nVersion);
template<typename Stream> void Serialize(Stream& os, const CScript& v, int nType, int nVersion);
template<typename Stream> void Unserialize(Stream& is, CScript& v, int nType, int nVersion);
/**
* pair
*/
template<typename K, typename T> unsigned int GetSerializeSize(const std::pair<K, T>& item, int nType, int nVersion);
template<typename Stream, typename K, typename T> void Serialize(Stream& os, const std::pair<K, T>& item, int nType, int nVersion);
template<typename Stream, typename K, typename T> void Unserialize(Stream& is, std::pair<K, T>& item, int nType, int nVersion);
/**
* map
*/
template<typename K, typename T, typename Pred, typename A> unsigned int GetSerializeSize(const std::map<K, T, Pred, A>& m, int nType, int nVersion);
template<typename Stream, typename K, typename T, typename Pred, typename A> void Serialize(Stream& os, const std::map<K, T, Pred, A>& m, int nType, int nVersion);
template<typename Stream, typename K, typename T, typename Pred, typename A> void Unserialize(Stream& is, std::map<K, T, Pred, A>& m, int nType, int nVersion);
/**
* set
*/
template<typename K, typename Pred, typename A> unsigned int GetSerializeSize(const std::set<K, Pred, A>& m, int nType, int nVersion);
template<typename Stream, typename K, typename Pred, typename A> void Serialize(Stream& os, const std::set<K, Pred, A>& m, int nType, int nVersion);
template<typename Stream, typename K, typename Pred, typename A> void Unserialize(Stream& is, std::set<K, Pred, A>& m, int nType, int nVersion);
/**
* If none of the specialized versions above matched, default to calling member function.
* "int nType" is changed to "long nType" to keep from getting an ambiguous overload error.
* The compiler will only cast int to long if none of the other templates matched.
* Thanks to Boost serialization for this idea.
*/
template<typename T>
inline unsigned int GetSerializeSize(const T& a, long nType, int nVersion)
{
return a.GetSerializeSize((int)nType, nVersion);
}
template<typename Stream, typename T>
inline void Serialize(Stream& os, const T& a, long nType, int nVersion)
{
a.Serialize(os, (int)nType, nVersion);
}
template<typename Stream, typename T>
inline void Unserialize(Stream& is, T& a, long nType, int nVersion)
{
a.Unserialize(is, (int)nType, nVersion);
}
/**
* string
*/
template<typename C>
unsigned int GetSerializeSize(const std::basic_string<C>& str, int, int)
{
return GetSizeOfCompactSize(str.size()) + str.size() * sizeof(str[0]);
}
template<typename Stream, typename C>
void Serialize(Stream& os, const std::basic_string<C>& str, int, int)
{
WriteCompactSize(os, str.size());
if (!str.empty())
os.write((char*)&str[0], str.size() * sizeof(str[0]));
}
template<typename Stream, typename C>
void Unserialize(Stream& is, std::basic_string<C>& str, int, int)
{
unsigned int nSize = ReadCompactSize(is);
str.resize(nSize);
if (nSize != 0)
is.read((char*)&str[0], nSize * sizeof(str[0]));
}
/**
* vector
*/
template<typename T, typename A>
unsigned int GetSerializeSize_impl(const std::vector<T, A>& v, int nType, int nVersion, const unsigned char&)
{
return (GetSizeOfCompactSize(v.size()) + v.size() * sizeof(T));
}
template<typename T, typename A, typename V>
unsigned int GetSerializeSize_impl(const std::vector<T, A>& v, int nType, int nVersion, const V&)
{
unsigned int nSize = GetSizeOfCompactSize(v.size());
for (typename std::vector<T, A>::const_iterator vi = v.begin(); vi != v.end(); ++vi)
nSize += GetSerializeSize((*vi), nType, nVersion);
return nSize;
}
template<typename T, typename A>
inline unsigned int GetSerializeSize(const std::vector<T, A>& v, int nType, int nVersion)
{
return GetSerializeSize_impl(v, nType, nVersion, T());
}
template<typename Stream, typename T, typename A>
void Serialize_impl(Stream& os, const std::vector<T, A>& v, int nType, int nVersion, const unsigned char&)
{
WriteCompactSize(os, v.size());
if (!v.empty())
os.write((char*)&v[0], v.size() * sizeof(T));
}
template<typename Stream, typename T, typename A, typename V>
void Serialize_impl(Stream& os, const std::vector<T, A>& v, int nType, int nVersion, const V&)
{
WriteCompactSize(os, v.size());
for (typename std::vector<T, A>::const_iterator vi = v.begin(); vi != v.end(); ++vi)
::Serialize(os, (*vi), nType, nVersion);
}
template<typename Stream, typename T, typename A>
inline void Serialize(Stream& os, const std::vector<T, A>& v, int nType, int nVersion)
{
Serialize_impl(os, v, nType, nVersion, T());
}
template<typename Stream, typename T, typename A>
void Unserialize_impl(Stream& is, std::vector<T, A>& v, int nType, int nVersion, const unsigned char&)
{
// Limit size per read so bogus size value won't cause out of memory
v.clear();
unsigned int nSize = ReadCompactSize(is);
unsigned int i = 0;
while (i < nSize)
{
unsigned int blk = std::min(nSize - i, (unsigned int)(1 + 4999999 / sizeof(T)));
v.resize(i + blk);
is.read((char*)&v[i], blk * sizeof(T));
i += blk;
}
}
template<typename Stream, typename T, typename A, typename V>
void Unserialize_impl(Stream& is, std::vector<T, A>& v, int nType, int nVersion, const V&)
{
v.clear();
unsigned int nSize = ReadCompactSize(is);
unsigned int i = 0;
unsigned int nMid = 0;
while (nMid < nSize)
{
nMid += 5000000 / sizeof(T);
if (nMid > nSize)
nMid = nSize;
v.resize(nMid);
for (; i < nMid; i++)
Unserialize(is, v[i], nType, nVersion);
}
}
template<typename Stream, typename T, typename A>
inline void Unserialize(Stream& is, std::vector<T, A>& v, int nType, int nVersion)
{
Unserialize_impl(is, v, nType, nVersion, T());
}
/**
* others derived from vector
*/
inline unsigned int GetSerializeSize(const CScript& v, int nType, int nVersion)
{
return GetSerializeSize((const std::vector<unsigned char>&)v, nType, nVersion);
}
template<typename Stream>
void Serialize(Stream& os, const CScript& v, int nType, int nVersion)
{
Serialize(os, (const std::vector<unsigned char>&)v, nType, nVersion);
}
template<typename Stream>
void Unserialize(Stream& is, CScript& v, int nType, int nVersion)
{
Unserialize(is, (std::vector<unsigned char>&)v, nType, nVersion);
}
/**
* pair
*/
template<typename K, typename T>
unsigned int GetSerializeSize(const std::pair<K, T>& item, int nType, int nVersion)
{
return GetSerializeSize(item.first, nType, nVersion) + GetSerializeSize(item.second, nType, nVersion);
}
template<typename Stream, typename K, typename T>
void Serialize(Stream& os, const std::pair<K, T>& item, int nType, int nVersion)
{
Serialize(os, item.first, nType, nVersion);
Serialize(os, item.second, nType, nVersion);
}
template<typename Stream, typename K, typename T>
void Unserialize(Stream& is, std::pair<K, T>& item, int nType, int nVersion)
{
Unserialize(is, item.first, nType, nVersion);
Unserialize(is, item.second, nType, nVersion);
}
/**
* map
*/
template<typename K, typename T, typename Pred, typename A>
unsigned int GetSerializeSize(const std::map<K, T, Pred, A>& m, int nType, int nVersion)
{
unsigned int nSize = GetSizeOfCompactSize(m.size());
for (typename std::map<K, T, Pred, A>::const_iterator mi = m.begin(); mi != m.end(); ++mi)
nSize += GetSerializeSize((*mi), nType, nVersion);
return nSize;
}
template<typename Stream, typename K, typename T, typename Pred, typename A>
void Serialize(Stream& os, const std::map<K, T, Pred, A>& m, int nType, int nVersion)
{
WriteCompactSize(os, m.size());
for (typename std::map<K, T, Pred, A>::const_iterator mi = m.begin(); mi != m.end(); ++mi)
Serialize(os, (*mi), nType, nVersion);
}
template<typename Stream, typename K, typename T, typename Pred, typename A>
void Unserialize(Stream& is, std::map<K, T, Pred, A>& m, int nType, int nVersion)
{
m.clear();
unsigned int nSize = ReadCompactSize(is);
typename std::map<K, T, Pred, A>::iterator mi = m.begin();
for (unsigned int i = 0; i < nSize; i++)
{
std::pair<K, T> item;
Unserialize(is, item, nType, nVersion);
mi = m.insert(mi, item);
}
}
/**
* set
*/
template<typename K, typename Pred, typename A>
unsigned int GetSerializeSize(const std::set<K, Pred, A>& m, int nType, int nVersion)
{
unsigned int nSize = GetSizeOfCompactSize(m.size());
for (typename std::set<K, Pred, A>::const_iterator it = m.begin(); it != m.end(); ++it)
nSize += GetSerializeSize((*it), nType, nVersion);
return nSize;
}
template<typename Stream, typename K, typename Pred, typename A>
void Serialize(Stream& os, const std::set<K, Pred, A>& m, int nType, int nVersion)
{
WriteCompactSize(os, m.size());
for (typename std::set<K, Pred, A>::const_iterator it = m.begin(); it != m.end(); ++it)
Serialize(os, (*it), nType, nVersion);
}
template<typename Stream, typename K, typename Pred, typename A>
void Unserialize(Stream& is, std::set<K, Pred, A>& m, int nType, int nVersion)
{
m.clear();
unsigned int nSize = ReadCompactSize(is);
typename std::set<K, Pred, A>::iterator it = m.begin();
for (unsigned int i = 0; i < nSize; i++)
{
K key;
Unserialize(is, key, nType, nVersion);
it = m.insert(it, key);
}
}
/**
* Support for ADD_SERIALIZE_METHODS and READWRITE macro
*/
struct CSerActionSerialize
{
bool ForRead() const { return false; }
};
struct CSerActionUnserialize
{
bool ForRead() const { return true; }
};
template<typename Stream, typename T>
inline void SerReadWrite(Stream& s, const T& obj, int nType, int nVersion, CSerActionSerialize ser_action)
{
::Serialize(s, obj, nType, nVersion);
}
template<typename Stream, typename T>
inline void SerReadWrite(Stream& s, T& obj, int nType, int nVersion, CSerActionUnserialize ser_action)
{
::Unserialize(s, obj, nType, nVersion);
}
class CSizeComputer
{
protected:
size_t nSize;
public:
int nType;
int nVersion;
CSizeComputer(int nTypeIn, int nVersionIn) : nSize(0), nType(nTypeIn), nVersion(nVersionIn) {}
CSizeComputer& write(const char *psz, size_t nSize)
{
this->nSize += nSize;
return *this;
}
template<typename T>
CSizeComputer& operator<<(const T& obj)
{
::Serialize(*this, obj, nType, nVersion);
return (*this);
}
size_t size() const {
return nSize;
}
};
#endif // BITCOIN_SERIALIZE_H
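To show how the pieces fit together, here is a hedged sketch of a user type that opts into the ADD_SERIALIZE_METHODS / READWRITE machinery defined above, written as if it lived in a file that includes serialize.h; the struct and its fields are invented for illustration.
#include "serialize.h"
struct CDemoRecord
{
    uint32_t nValue;
    std::vector<unsigned char> vchData;
    ADD_SERIALIZE_METHODS
    template <typename Stream, typename Operation>
    inline void SerializationOp( Stream& s, Operation ser_action,
                                 int nType, int nVersion )
    {
        // READWRITE() dispatches to ::Serialize or ::Unserialize depending on
        // whether ser_action is CSerActionSerialize or CSerActionUnserialize.
        READWRITE( nValue );
        READWRITE( vchData );
    }
};
// GetSerializeSize() (added by the macro) sizes the record via CSizeComputer:
//   CDemoRecord rec;  rec.GetSerializeSize( SER_DISK, 0 );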

187
algo/hodl/sha256.cpp Normal file
View File

@@ -0,0 +1,187 @@
// Copyright (c) 2014 The Bitcoin Core developers
// Distributed under the MIT software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#include "sha256.h"
#include "common.h"
#include <string.h>
// Internal implementation code.
namespace
{
/// Internal SHA-256 implementation.
namespace sha256
{
uint32_t inline Ch(uint32_t x, uint32_t y, uint32_t z) { return z ^ (x & (y ^ z)); }
uint32_t inline Maj(uint32_t x, uint32_t y, uint32_t z) { return (x & y) | (z & (x | y)); }
uint32_t inline Sigma0(uint32_t x) { return (x >> 2 | x << 30) ^ (x >> 13 | x << 19) ^ (x >> 22 | x << 10); }
uint32_t inline Sigma1(uint32_t x) { return (x >> 6 | x << 26) ^ (x >> 11 | x << 21) ^ (x >> 25 | x << 7); }
uint32_t inline sigma0(uint32_t x) { return (x >> 7 | x << 25) ^ (x >> 18 | x << 14) ^ (x >> 3); }
uint32_t inline sigma1(uint32_t x) { return (x >> 17 | x << 15) ^ (x >> 19 | x << 13) ^ (x >> 10); }
/** One round of SHA-256. */
void inline Round(uint32_t a, uint32_t b, uint32_t c, uint32_t& d, uint32_t e, uint32_t f, uint32_t g, uint32_t& h, uint32_t k, uint32_t w)
{
uint32_t t1 = h + Sigma1(e) + Ch(e, f, g) + k + w;
uint32_t t2 = Sigma0(a) + Maj(a, b, c);
d += t1;
h = t1 + t2;
}
/** Initialize SHA-256 state. */
void inline Initialize(uint32_t* s)
{
s[0] = 0x6a09e667ul;
s[1] = 0xbb67ae85ul;
s[2] = 0x3c6ef372ul;
s[3] = 0xa54ff53aul;
s[4] = 0x510e527ful;
s[5] = 0x9b05688cul;
s[6] = 0x1f83d9abul;
s[7] = 0x5be0cd19ul;
}
/** Perform one SHA-256 transformation, processing a 64-byte chunk. */
void Transform(uint32_t* s, const unsigned char* chunk)
{
uint32_t a = s[0], b = s[1], c = s[2], d = s[3], e = s[4], f = s[5], g = s[6], h = s[7];
uint32_t w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
Round(a, b, c, d, e, f, g, h, 0x428a2f98, w0 = ReadBE32(chunk + 0));
Round(h, a, b, c, d, e, f, g, 0x71374491, w1 = ReadBE32(chunk + 4));
Round(g, h, a, b, c, d, e, f, 0xb5c0fbcf, w2 = ReadBE32(chunk + 8));
Round(f, g, h, a, b, c, d, e, 0xe9b5dba5, w3 = ReadBE32(chunk + 12));
Round(e, f, g, h, a, b, c, d, 0x3956c25b, w4 = ReadBE32(chunk + 16));
Round(d, e, f, g, h, a, b, c, 0x59f111f1, w5 = ReadBE32(chunk + 20));
Round(c, d, e, f, g, h, a, b, 0x923f82a4, w6 = ReadBE32(chunk + 24));
Round(b, c, d, e, f, g, h, a, 0xab1c5ed5, w7 = ReadBE32(chunk + 28));
Round(a, b, c, d, e, f, g, h, 0xd807aa98, w8 = ReadBE32(chunk + 32));
Round(h, a, b, c, d, e, f, g, 0x12835b01, w9 = ReadBE32(chunk + 36));
Round(g, h, a, b, c, d, e, f, 0x243185be, w10 = ReadBE32(chunk + 40));
Round(f, g, h, a, b, c, d, e, 0x550c7dc3, w11 = ReadBE32(chunk + 44));
Round(e, f, g, h, a, b, c, d, 0x72be5d74, w12 = ReadBE32(chunk + 48));
Round(d, e, f, g, h, a, b, c, 0x80deb1fe, w13 = ReadBE32(chunk + 52));
Round(c, d, e, f, g, h, a, b, 0x9bdc06a7, w14 = ReadBE32(chunk + 56));
Round(b, c, d, e, f, g, h, a, 0xc19bf174, w15 = ReadBE32(chunk + 60));
Round(a, b, c, d, e, f, g, h, 0xe49b69c1, w0 += sigma1(w14) + w9 + sigma0(w1));
Round(h, a, b, c, d, e, f, g, 0xefbe4786, w1 += sigma1(w15) + w10 + sigma0(w2));
Round(g, h, a, b, c, d, e, f, 0x0fc19dc6, w2 += sigma1(w0) + w11 + sigma0(w3));
Round(f, g, h, a, b, c, d, e, 0x240ca1cc, w3 += sigma1(w1) + w12 + sigma0(w4));
Round(e, f, g, h, a, b, c, d, 0x2de92c6f, w4 += sigma1(w2) + w13 + sigma0(w5));
Round(d, e, f, g, h, a, b, c, 0x4a7484aa, w5 += sigma1(w3) + w14 + sigma0(w6));
Round(c, d, e, f, g, h, a, b, 0x5cb0a9dc, w6 += sigma1(w4) + w15 + sigma0(w7));
Round(b, c, d, e, f, g, h, a, 0x76f988da, w7 += sigma1(w5) + w0 + sigma0(w8));
Round(a, b, c, d, e, f, g, h, 0x983e5152, w8 += sigma1(w6) + w1 + sigma0(w9));
Round(h, a, b, c, d, e, f, g, 0xa831c66d, w9 += sigma1(w7) + w2 + sigma0(w10));
Round(g, h, a, b, c, d, e, f, 0xb00327c8, w10 += sigma1(w8) + w3 + sigma0(w11));
Round(f, g, h, a, b, c, d, e, 0xbf597fc7, w11 += sigma1(w9) + w4 + sigma0(w12));
Round(e, f, g, h, a, b, c, d, 0xc6e00bf3, w12 += sigma1(w10) + w5 + sigma0(w13));
Round(d, e, f, g, h, a, b, c, 0xd5a79147, w13 += sigma1(w11) + w6 + sigma0(w14));
Round(c, d, e, f, g, h, a, b, 0x06ca6351, w14 += sigma1(w12) + w7 + sigma0(w15));
Round(b, c, d, e, f, g, h, a, 0x14292967, w15 += sigma1(w13) + w8 + sigma0(w0));
Round(a, b, c, d, e, f, g, h, 0x27b70a85, w0 += sigma1(w14) + w9 + sigma0(w1));
Round(h, a, b, c, d, e, f, g, 0x2e1b2138, w1 += sigma1(w15) + w10 + sigma0(w2));
Round(g, h, a, b, c, d, e, f, 0x4d2c6dfc, w2 += sigma1(w0) + w11 + sigma0(w3));
Round(f, g, h, a, b, c, d, e, 0x53380d13, w3 += sigma1(w1) + w12 + sigma0(w4));
Round(e, f, g, h, a, b, c, d, 0x650a7354, w4 += sigma1(w2) + w13 + sigma0(w5));
Round(d, e, f, g, h, a, b, c, 0x766a0abb, w5 += sigma1(w3) + w14 + sigma0(w6));
Round(c, d, e, f, g, h, a, b, 0x81c2c92e, w6 += sigma1(w4) + w15 + sigma0(w7));
Round(b, c, d, e, f, g, h, a, 0x92722c85, w7 += sigma1(w5) + w0 + sigma0(w8));
Round(a, b, c, d, e, f, g, h, 0xa2bfe8a1, w8 += sigma1(w6) + w1 + sigma0(w9));
Round(h, a, b, c, d, e, f, g, 0xa81a664b, w9 += sigma1(w7) + w2 + sigma0(w10));
Round(g, h, a, b, c, d, e, f, 0xc24b8b70, w10 += sigma1(w8) + w3 + sigma0(w11));
Round(f, g, h, a, b, c, d, e, 0xc76c51a3, w11 += sigma1(w9) + w4 + sigma0(w12));
Round(e, f, g, h, a, b, c, d, 0xd192e819, w12 += sigma1(w10) + w5 + sigma0(w13));
Round(d, e, f, g, h, a, b, c, 0xd6990624, w13 += sigma1(w11) + w6 + sigma0(w14));
Round(c, d, e, f, g, h, a, b, 0xf40e3585, w14 += sigma1(w12) + w7 + sigma0(w15));
Round(b, c, d, e, f, g, h, a, 0x106aa070, w15 += sigma1(w13) + w8 + sigma0(w0));
Round(a, b, c, d, e, f, g, h, 0x19a4c116, w0 += sigma1(w14) + w9 + sigma0(w1));
Round(h, a, b, c, d, e, f, g, 0x1e376c08, w1 += sigma1(w15) + w10 + sigma0(w2));
Round(g, h, a, b, c, d, e, f, 0x2748774c, w2 += sigma1(w0) + w11 + sigma0(w3));
Round(f, g, h, a, b, c, d, e, 0x34b0bcb5, w3 += sigma1(w1) + w12 + sigma0(w4));
Round(e, f, g, h, a, b, c, d, 0x391c0cb3, w4 += sigma1(w2) + w13 + sigma0(w5));
Round(d, e, f, g, h, a, b, c, 0x4ed8aa4a, w5 += sigma1(w3) + w14 + sigma0(w6));
Round(c, d, e, f, g, h, a, b, 0x5b9cca4f, w6 += sigma1(w4) + w15 + sigma0(w7));
Round(b, c, d, e, f, g, h, a, 0x682e6ff3, w7 += sigma1(w5) + w0 + sigma0(w8));
Round(a, b, c, d, e, f, g, h, 0x748f82ee, w8 += sigma1(w6) + w1 + sigma0(w9));
Round(h, a, b, c, d, e, f, g, 0x78a5636f, w9 += sigma1(w7) + w2 + sigma0(w10));
Round(g, h, a, b, c, d, e, f, 0x84c87814, w10 += sigma1(w8) + w3 + sigma0(w11));
Round(f, g, h, a, b, c, d, e, 0x8cc70208, w11 += sigma1(w9) + w4 + sigma0(w12));
Round(e, f, g, h, a, b, c, d, 0x90befffa, w12 += sigma1(w10) + w5 + sigma0(w13));
Round(d, e, f, g, h, a, b, c, 0xa4506ceb, w13 += sigma1(w11) + w6 + sigma0(w14));
Round(c, d, e, f, g, h, a, b, 0xbef9a3f7, w14 + sigma1(w12) + w7 + sigma0(w15));
Round(b, c, d, e, f, g, h, a, 0xc67178f2, w15 + sigma1(w13) + w8 + sigma0(w0));
s[0] += a;
s[1] += b;
s[2] += c;
s[3] += d;
s[4] += e;
s[5] += f;
s[6] += g;
s[7] += h;
}
} // namespace sha256
} // namespace
////// SHA-256
CSHA256::CSHA256() : bytes(0)
{
sha256::Initialize(s);
}
CSHA256& CSHA256::Write(const unsigned char* data, size_t len)
{
const unsigned char* end = data + len;
size_t bufsize = bytes % 64;
if (bufsize && bufsize + len >= 64) {
// Fill the buffer, and process it.
memcpy(buf + bufsize, data, 64 - bufsize);
bytes += 64 - bufsize;
data += 64 - bufsize;
sha256::Transform(s, buf);
bufsize = 0;
}
while (end >= data + 64) {
// Process full chunks directly from the source.
sha256::Transform(s, data);
bytes += 64;
data += 64;
}
if (end > data) {
// Fill the buffer with what remains.
memcpy(buf + bufsize, data, end - data);
bytes += end - data;
}
return *this;
}
void CSHA256::Finalize(unsigned char hash[OUTPUT_SIZE])
{
static const unsigned char pad[64] = {0x80};
unsigned char sizedesc[8];
WriteBE64(sizedesc, bytes << 3);
Write(pad, 1 + ((119 - (bytes % 64)) % 64));
Write(sizedesc, 8);
WriteBE32(hash, s[0]);
WriteBE32(hash + 4, s[1]);
WriteBE32(hash + 8, s[2]);
WriteBE32(hash + 12, s[3]);
WriteBE32(hash + 16, s[4]);
WriteBE32(hash + 20, s[5]);
WriteBE32(hash + 24, s[6]);
WriteBE32(hash + 28, s[7]);
}
CSHA256& CSHA256::Reset()
{
bytes = 0;
sha256::Initialize(s);
return *this;
}
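Finalize pads the message so that the 8-byte big-endian bit length lands exactly at the end of a 64-byte block. As a worked example, after hashing a 3-byte message bytes == 3, so Write(pad, ...) appends 1 + ((119 - 3) % 64) = 53 bytes (a single 0x80 followed by 52 zero bytes), and the 8-byte length descriptor then brings the total to 3 + 53 + 8 = 64 bytes, i.e. one full block.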

28
algo/hodl/sha256.h Normal file
View File

@@ -0,0 +1,28 @@
// Copyright (c) 2014 The Bitcoin Core developers
// Distributed under the MIT software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#ifndef BITCOIN_CRYPTO_SHA256_H
#define BITCOIN_CRYPTO_SHA256_H
#include <stdint.h>
#include <stdlib.h>
/** A hasher class for SHA-256. */
class CSHA256
{
private:
uint32_t s[8];
unsigned char buf[64];
size_t bytes;
public:
static const size_t OUTPUT_SIZE = 32;
CSHA256();
CSHA256& Write(const unsigned char* data, size_t len);
void Finalize(unsigned char hash[OUTPUT_SIZE]);
CSHA256& Reset();
};
#endif // BITCOIN_CRYPTO_SHA256_H
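For reference, a minimal usage sketch of this interface: the relative include path is assumed, the translation unit must be linked against algo/hodl/sha256.cpp, and the digest shown in the final comment is the well-known SHA-256 test vector for the 3-byte string "abc".
// Usage sketch: incremental hashing with CSHA256.
#include "sha256.h"
#include <cstdio>
int main()
{
    static const unsigned char msg[] = "abc";
    unsigned char digest[CSHA256::OUTPUT_SIZE];
    CSHA256 hasher;
    hasher.Write(msg, 1);        // data may arrive in pieces
    hasher.Write(msg + 1, 2);    // total input: "abc" (3 bytes, no NUL)
    hasher.Finalize(digest);
    for (size_t i = 0; i < CSHA256::OUTPUT_SIZE; i++)
        printf("%02x", digest[i]);
    printf("\n");                // ba7816bf...b410ff61f20015ad
    return 0;
}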

205
algo/hodl/sha512.cpp Normal file
View File

@@ -0,0 +1,205 @@
// Copyright (c) 2014 The Bitcoin Core developers
// Distributed under the MIT software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#include "sha512.h"
#include "common.h"
#include <string.h>
// Internal implementation code.
namespace
{
/// Internal SHA-512 implementation.
namespace sha512
{
uint64_t inline Ch(uint64_t x, uint64_t y, uint64_t z) { return z ^ (x & (y ^ z)); }
uint64_t inline Maj(uint64_t x, uint64_t y, uint64_t z) { return (x & y) | (z & (x | y)); }
uint64_t inline Sigma0(uint64_t x) { return (x >> 28 | x << 36) ^ (x >> 34 | x << 30) ^ (x >> 39 | x << 25); }
uint64_t inline Sigma1(uint64_t x) { return (x >> 14 | x << 50) ^ (x >> 18 | x << 46) ^ (x >> 41 | x << 23); }
uint64_t inline sigma0(uint64_t x) { return (x >> 1 | x << 63) ^ (x >> 8 | x << 56) ^ (x >> 7); }
uint64_t inline sigma1(uint64_t x) { return (x >> 19 | x << 45) ^ (x >> 61 | x << 3) ^ (x >> 6); }
/** One round of SHA-512. */
void inline Round(uint64_t a, uint64_t b, uint64_t c, uint64_t& d, uint64_t e, uint64_t f, uint64_t g, uint64_t& h, uint64_t k, uint64_t w)
{
uint64_t t1 = h + Sigma1(e) + Ch(e, f, g) + k + w;
uint64_t t2 = Sigma0(a) + Maj(a, b, c);
d += t1;
h = t1 + t2;
}
/** Initialize SHA-512 state. */
void inline Initialize(uint64_t* s)
{
s[0] = 0x6a09e667f3bcc908ull;
s[1] = 0xbb67ae8584caa73bull;
s[2] = 0x3c6ef372fe94f82bull;
s[3] = 0xa54ff53a5f1d36f1ull;
s[4] = 0x510e527fade682d1ull;
s[5] = 0x9b05688c2b3e6c1full;
s[6] = 0x1f83d9abfb41bd6bull;
s[7] = 0x5be0cd19137e2179ull;
}
/** Perform one SHA-512 transformation, processing a 128-byte chunk. */
void Transform(uint64_t* s, const unsigned char* chunk)
{
uint64_t a = s[0], b = s[1], c = s[2], d = s[3], e = s[4], f = s[5], g = s[6], h = s[7];
uint64_t w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
Round(a, b, c, d, e, f, g, h, 0x428a2f98d728ae22ull, w0 = ReadBE64(chunk + 0));
Round(h, a, b, c, d, e, f, g, 0x7137449123ef65cdull, w1 = ReadBE64(chunk + 8));
Round(g, h, a, b, c, d, e, f, 0xb5c0fbcfec4d3b2full, w2 = ReadBE64(chunk + 16));
Round(f, g, h, a, b, c, d, e, 0xe9b5dba58189dbbcull, w3 = ReadBE64(chunk + 24));
Round(e, f, g, h, a, b, c, d, 0x3956c25bf348b538ull, w4 = ReadBE64(chunk + 32));
Round(d, e, f, g, h, a, b, c, 0x59f111f1b605d019ull, w5 = ReadBE64(chunk + 40));
Round(c, d, e, f, g, h, a, b, 0x923f82a4af194f9bull, w6 = ReadBE64(chunk + 48));
Round(b, c, d, e, f, g, h, a, 0xab1c5ed5da6d8118ull, w7 = ReadBE64(chunk + 56));
Round(a, b, c, d, e, f, g, h, 0xd807aa98a3030242ull, w8 = ReadBE64(chunk + 64));
Round(h, a, b, c, d, e, f, g, 0x12835b0145706fbeull, w9 = ReadBE64(chunk + 72));
Round(g, h, a, b, c, d, e, f, 0x243185be4ee4b28cull, w10 = ReadBE64(chunk + 80));
Round(f, g, h, a, b, c, d, e, 0x550c7dc3d5ffb4e2ull, w11 = ReadBE64(chunk + 88));
Round(e, f, g, h, a, b, c, d, 0x72be5d74f27b896full, w12 = ReadBE64(chunk + 96));
Round(d, e, f, g, h, a, b, c, 0x80deb1fe3b1696b1ull, w13 = ReadBE64(chunk + 104));
Round(c, d, e, f, g, h, a, b, 0x9bdc06a725c71235ull, w14 = ReadBE64(chunk + 112));
Round(b, c, d, e, f, g, h, a, 0xc19bf174cf692694ull, w15 = ReadBE64(chunk + 120));
Round(a, b, c, d, e, f, g, h, 0xe49b69c19ef14ad2ull, w0 += sigma1(w14) + w9 + sigma0(w1));
Round(h, a, b, c, d, e, f, g, 0xefbe4786384f25e3ull, w1 += sigma1(w15) + w10 + sigma0(w2));
Round(g, h, a, b, c, d, e, f, 0x0fc19dc68b8cd5b5ull, w2 += sigma1(w0) + w11 + sigma0(w3));
Round(f, g, h, a, b, c, d, e, 0x240ca1cc77ac9c65ull, w3 += sigma1(w1) + w12 + sigma0(w4));
Round(e, f, g, h, a, b, c, d, 0x2de92c6f592b0275ull, w4 += sigma1(w2) + w13 + sigma0(w5));
Round(d, e, f, g, h, a, b, c, 0x4a7484aa6ea6e483ull, w5 += sigma1(w3) + w14 + sigma0(w6));
Round(c, d, e, f, g, h, a, b, 0x5cb0a9dcbd41fbd4ull, w6 += sigma1(w4) + w15 + sigma0(w7));
Round(b, c, d, e, f, g, h, a, 0x76f988da831153b5ull, w7 += sigma1(w5) + w0 + sigma0(w8));
Round(a, b, c, d, e, f, g, h, 0x983e5152ee66dfabull, w8 += sigma1(w6) + w1 + sigma0(w9));
Round(h, a, b, c, d, e, f, g, 0xa831c66d2db43210ull, w9 += sigma1(w7) + w2 + sigma0(w10));
Round(g, h, a, b, c, d, e, f, 0xb00327c898fb213full, w10 += sigma1(w8) + w3 + sigma0(w11));
Round(f, g, h, a, b, c, d, e, 0xbf597fc7beef0ee4ull, w11 += sigma1(w9) + w4 + sigma0(w12));
Round(e, f, g, h, a, b, c, d, 0xc6e00bf33da88fc2ull, w12 += sigma1(w10) + w5 + sigma0(w13));
Round(d, e, f, g, h, a, b, c, 0xd5a79147930aa725ull, w13 += sigma1(w11) + w6 + sigma0(w14));
Round(c, d, e, f, g, h, a, b, 0x06ca6351e003826full, w14 += sigma1(w12) + w7 + sigma0(w15));
Round(b, c, d, e, f, g, h, a, 0x142929670a0e6e70ull, w15 += sigma1(w13) + w8 + sigma0(w0));
Round(a, b, c, d, e, f, g, h, 0x27b70a8546d22ffcull, w0 += sigma1(w14) + w9 + sigma0(w1));
Round(h, a, b, c, d, e, f, g, 0x2e1b21385c26c926ull, w1 += sigma1(w15) + w10 + sigma0(w2));
Round(g, h, a, b, c, d, e, f, 0x4d2c6dfc5ac42aedull, w2 += sigma1(w0) + w11 + sigma0(w3));
Round(f, g, h, a, b, c, d, e, 0x53380d139d95b3dfull, w3 += sigma1(w1) + w12 + sigma0(w4));
Round(e, f, g, h, a, b, c, d, 0x650a73548baf63deull, w4 += sigma1(w2) + w13 + sigma0(w5));
Round(d, e, f, g, h, a, b, c, 0x766a0abb3c77b2a8ull, w5 += sigma1(w3) + w14 + sigma0(w6));
Round(c, d, e, f, g, h, a, b, 0x81c2c92e47edaee6ull, w6 += sigma1(w4) + w15 + sigma0(w7));
Round(b, c, d, e, f, g, h, a, 0x92722c851482353bull, w7 += sigma1(w5) + w0 + sigma0(w8));
Round(a, b, c, d, e, f, g, h, 0xa2bfe8a14cf10364ull, w8 += sigma1(w6) + w1 + sigma0(w9));
Round(h, a, b, c, d, e, f, g, 0xa81a664bbc423001ull, w9 += sigma1(w7) + w2 + sigma0(w10));
Round(g, h, a, b, c, d, e, f, 0xc24b8b70d0f89791ull, w10 += sigma1(w8) + w3 + sigma0(w11));
Round(f, g, h, a, b, c, d, e, 0xc76c51a30654be30ull, w11 += sigma1(w9) + w4 + sigma0(w12));
Round(e, f, g, h, a, b, c, d, 0xd192e819d6ef5218ull, w12 += sigma1(w10) + w5 + sigma0(w13));
Round(d, e, f, g, h, a, b, c, 0xd69906245565a910ull, w13 += sigma1(w11) + w6 + sigma0(w14));
Round(c, d, e, f, g, h, a, b, 0xf40e35855771202aull, w14 += sigma1(w12) + w7 + sigma0(w15));
Round(b, c, d, e, f, g, h, a, 0x106aa07032bbd1b8ull, w15 += sigma1(w13) + w8 + sigma0(w0));
Round(a, b, c, d, e, f, g, h, 0x19a4c116b8d2d0c8ull, w0 += sigma1(w14) + w9 + sigma0(w1));
Round(h, a, b, c, d, e, f, g, 0x1e376c085141ab53ull, w1 += sigma1(w15) + w10 + sigma0(w2));
Round(g, h, a, b, c, d, e, f, 0x2748774cdf8eeb99ull, w2 += sigma1(w0) + w11 + sigma0(w3));
Round(f, g, h, a, b, c, d, e, 0x34b0bcb5e19b48a8ull, w3 += sigma1(w1) + w12 + sigma0(w4));
Round(e, f, g, h, a, b, c, d, 0x391c0cb3c5c95a63ull, w4 += sigma1(w2) + w13 + sigma0(w5));
Round(d, e, f, g, h, a, b, c, 0x4ed8aa4ae3418acbull, w5 += sigma1(w3) + w14 + sigma0(w6));
Round(c, d, e, f, g, h, a, b, 0x5b9cca4f7763e373ull, w6 += sigma1(w4) + w15 + sigma0(w7));
Round(b, c, d, e, f, g, h, a, 0x682e6ff3d6b2b8a3ull, w7 += sigma1(w5) + w0 + sigma0(w8));
Round(a, b, c, d, e, f, g, h, 0x748f82ee5defb2fcull, w8 += sigma1(w6) + w1 + sigma0(w9));
Round(h, a, b, c, d, e, f, g, 0x78a5636f43172f60ull, w9 += sigma1(w7) + w2 + sigma0(w10));
Round(g, h, a, b, c, d, e, f, 0x84c87814a1f0ab72ull, w10 += sigma1(w8) + w3 + sigma0(w11));
Round(f, g, h, a, b, c, d, e, 0x8cc702081a6439ecull, w11 += sigma1(w9) + w4 + sigma0(w12));
Round(e, f, g, h, a, b, c, d, 0x90befffa23631e28ull, w12 += sigma1(w10) + w5 + sigma0(w13));
Round(d, e, f, g, h, a, b, c, 0xa4506cebde82bde9ull, w13 += sigma1(w11) + w6 + sigma0(w14));
Round(c, d, e, f, g, h, a, b, 0xbef9a3f7b2c67915ull, w14 += sigma1(w12) + w7 + sigma0(w15));
Round(b, c, d, e, f, g, h, a, 0xc67178f2e372532bull, w15 += sigma1(w13) + w8 + sigma0(w0));
Round(a, b, c, d, e, f, g, h, 0xca273eceea26619cull, w0 += sigma1(w14) + w9 + sigma0(w1));
Round(h, a, b, c, d, e, f, g, 0xd186b8c721c0c207ull, w1 += sigma1(w15) + w10 + sigma0(w2));
Round(g, h, a, b, c, d, e, f, 0xeada7dd6cde0eb1eull, w2 += sigma1(w0) + w11 + sigma0(w3));
Round(f, g, h, a, b, c, d, e, 0xf57d4f7fee6ed178ull, w3 += sigma1(w1) + w12 + sigma0(w4));
Round(e, f, g, h, a, b, c, d, 0x06f067aa72176fbaull, w4 += sigma1(w2) + w13 + sigma0(w5));
Round(d, e, f, g, h, a, b, c, 0x0a637dc5a2c898a6ull, w5 += sigma1(w3) + w14 + sigma0(w6));
Round(c, d, e, f, g, h, a, b, 0x113f9804bef90daeull, w6 += sigma1(w4) + w15 + sigma0(w7));
Round(b, c, d, e, f, g, h, a, 0x1b710b35131c471bull, w7 += sigma1(w5) + w0 + sigma0(w8));
Round(a, b, c, d, e, f, g, h, 0x28db77f523047d84ull, w8 += sigma1(w6) + w1 + sigma0(w9));
Round(h, a, b, c, d, e, f, g, 0x32caab7b40c72493ull, w9 += sigma1(w7) + w2 + sigma0(w10));
Round(g, h, a, b, c, d, e, f, 0x3c9ebe0a15c9bebcull, w10 += sigma1(w8) + w3 + sigma0(w11));
Round(f, g, h, a, b, c, d, e, 0x431d67c49c100d4cull, w11 += sigma1(w9) + w4 + sigma0(w12));
Round(e, f, g, h, a, b, c, d, 0x4cc5d4becb3e42b6ull, w12 += sigma1(w10) + w5 + sigma0(w13));
Round(d, e, f, g, h, a, b, c, 0x597f299cfc657e2aull, w13 += sigma1(w11) + w6 + sigma0(w14));
Round(c, d, e, f, g, h, a, b, 0x5fcb6fab3ad6faecull, w14 + sigma1(w12) + w7 + sigma0(w15));
Round(b, c, d, e, f, g, h, a, 0x6c44198c4a475817ull, w15 + sigma1(w13) + w8 + sigma0(w0));
s[0] += a;
s[1] += b;
s[2] += c;
s[3] += d;
s[4] += e;
s[5] += f;
s[6] += g;
s[7] += h;
}
} // namespace sha512
} // namespace
////// SHA-512
CSHA512::CSHA512() : bytes(0)
{
sha512::Initialize(s);
}
CSHA512& CSHA512::Write(const unsigned char* data, size_t len)
{
const unsigned char* end = data + len;
size_t bufsize = bytes % 128;
if (bufsize && bufsize + len >= 128) {
// Fill the buffer, and process it.
memcpy(buf + bufsize, data, 128 - bufsize);
bytes += 128 - bufsize;
data += 128 - bufsize;
sha512::Transform(s, buf);
bufsize = 0;
}
while (end >= data + 128) {
// Process full chunks directly from the source.
sha512::Transform(s, data);
data += 128;
bytes += 128;
}
if (end > data) {
// Fill the buffer with what remains.
memcpy(buf + bufsize, data, end - data);
bytes += end - data;
}
return *this;
}
void CSHA512::Finalize(unsigned char hash[OUTPUT_SIZE])
{
static const unsigned char pad[128] = {0x80};
unsigned char sizedesc[16] = {0x00};
WriteBE64(sizedesc + 8, bytes << 3);
Write(pad, 1 + ((239 - (bytes % 128)) % 128));
Write(sizedesc, 16);
WriteBE64(hash, s[0]);
WriteBE64(hash + 8, s[1]);
WriteBE64(hash + 16, s[2]);
WriteBE64(hash + 24, s[3]);
WriteBE64(hash + 32, s[4]);
WriteBE64(hash + 40, s[5]);
WriteBE64(hash + 48, s[6]);
WriteBE64(hash + 56, s[7]);
}
CSHA512& CSHA512::Reset()
{
bytes = 0;
sha512::Initialize(s);
return *this;
}
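Note the 16-byte sizedesc in Finalize: the SHA-512 specification calls for a 128-bit message length, but only the low 64 bits are populated here (WriteBE64 into sizedesc + 8, with the upper 8 bytes left zero), which is sufficient for any input shorter than 2^61 bytes.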

28
algo/hodl/sha512.h Normal file
View File

@@ -0,0 +1,28 @@
// Copyright (c) 2014 The Bitcoin Core developers
// Distributed under the MIT software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#ifndef BITCOIN_CRYPTO_SHA512_H
#define BITCOIN_CRYPTO_SHA512_H
#include <stdint.h>
#include <stdlib.h>
/** A hasher class for SHA-512. */
class CSHA512
{
private:
uint64_t s[8];
unsigned char buf[128];
size_t bytes;
public:
static const size_t OUTPUT_SIZE = 64;
CSHA512();
CSHA512& Write(const unsigned char* data, size_t len);
void Finalize(unsigned char hash[OUTPUT_SIZE]);
CSHA512& Reset();
};
#endif // BITCOIN_CRYPTO_SHA512_H

View File

@@ -4,11 +4,6 @@
//Dependencies
#include <string.h>
#include <stdlib.h>
#ifdef __FreeBSD__
#include <sys/endian.h>
#endif
#include "tmmintrin.h"
#include "smmintrin.h"

View File

@@ -3,11 +3,6 @@
//Dependencies
#include <string.h>
#include <stdlib.h>
#ifdef __FreeBSD__
#include <sys/endian.h>
#endif
#include "tmmintrin.h"
#include "smmintrin.h"
#include "immintrin.h"

Some files were not shown because too many files have changed in this diff.