From ccfccbadd525c3e9d1525b3eb6d1124b75dc66b0 Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Thu, 10 Dec 2020 18:23:49 -0500 Subject: [PATCH] v3.15.3 --- README.txt | 31 ++-- RELEASE_NOTES | 11 +- algo-gate-api.c | 12 +- algo/groestl/myr-groestl.c | 63 ++++---- algo/m7m/m7m.c | 47 +++--- algo/quark/hmq1725.c | 16 +- algo/ripemd/lbry.c | 40 ++--- algo/sha/hmac-sha256-hash.c | 97 ++++++++---- algo/sha/hmac-sha256-hash.h | 14 +- algo/sha/sha2.c | 21 +-- algo/sha/sha256-hash-opt.c | 200 +++++++++++++++++++++++++ algo/sha/sha256q-4way.c | 152 ++++++++++++------- algo/sha/sha256q.c | 106 +++++--------- algo/sha/sha256t-4way.c | 135 +++++++++++------ algo/sha/sha256t-gate.c | 35 ++--- algo/sha/sha256t-gate.h | 26 +++- algo/sha/sha256t.c | 104 +++++-------- algo/sha/sph_sha2.c | 35 +++-- algo/sha/sph_sha2.h | 2 +- algo/skein/skein-4way.c | 46 +++--- algo/skein/skein.c | 10 +- algo/x16/hex.c | 6 +- algo/x16/minotaur.c | 19 +-- algo/x16/x16r-gate.h | 23 +-- algo/x16/x16r.c | 6 +- algo/x16/x16rv2.c | 8 +- algo/x16/x21s-4way.c | 33 ++--- algo/x16/x21s.c | 10 +- algo/x17/sonoa.c | 16 +- algo/x17/x17.c | 10 +- algo/x17/xevan.c | 200 ++++++++++++------------- algo/x20/x20r.c | 32 +--- algo/x22/x22i-4way.c | 235 +++++++++++++++++++++--------- algo/x22/x22i-gate.c | 36 +++-- algo/x22/x22i-gate.h | 20 +++ algo/x22/x22i.c | 112 +++++++------- algo/x22/x25x-4way.c | 85 +++++++++-- algo/x22/x25x.c | 98 ++++++------- algo/yescrypt/yescrypt-simd.c | 10 +- algo/yespower/yescrypt-r8g.c | 4 +- algo/yespower/yespower-gate.c | 33 ++--- algo/yespower/yespower-opt.c | 148 +++++++++---------- algo/yespower/yespower.h | 4 +- build-allarch.sh | 9 +- build-avx2.sh | 27 ++++ configure | 20 +-- configure.ac | 2 +- cpuminer.nsi => junk/cpuminer.nsi | 0 cpuminer.sln => junk/cpuminer.sln | 0 winbuild-cross.sh | 14 +- 50 files changed, 1447 insertions(+), 976 deletions(-) create mode 100644 algo/sha/sha256-hash-opt.c create mode 100755 build-avx2.sh rename cpuminer.nsi => junk/cpuminer.nsi (100%) rename 
cpuminer.sln => junk/cpuminer.sln (100%) diff --git a/README.txt b/README.txt index 5d50a87..36298c0 100644 --- a/README.txt +++ b/README.txt @@ -14,7 +14,7 @@ miners. The source code is open for anyone to inspect. If you don't trust the software, don't use it. Choose the exe that best matches you CPU's features or use trial and -error to find the fastest one that doesn't crash. Pay attention to +error to find the fastest one that works. Pay attention to the features listed at cpuminer startup to ensure you are mining at optimum speed using the best available features. @@ -35,22 +35,25 @@ https://en.wikipedia.org/wiki/List_of_Intel_CPU_microarchitectures https://en.wikipedia.org/wiki/List_of_AMD_CPU_microarchitectures -Exe file name Compile flags Arch name +Exe file name Compile flags Arch name -cpuminer-sse2.exe "-msse2" Core2, Nehalem -cpuminer-aes-sse42.exe "-marxh=westmere" Westmere -cpuminer-avx.exe "-march=corei7-avx" Sandybridge, Ivybridge -cpuminer-avx2.exe "-march=core-avx2 -maes" Haswell(1) -cpuminer-avx512.exe "-march=skylake-avx512" Skylake-X, Cascadelake-X -cpuminer-zen.exe "-march=znver1" Zen1, Zen2 -cpuminer-zen3.exe "-march=znver2 -mvaes" Zen3(2) -cpuminer-avx512-sha-vaes.exe "-march=icelake-client" Icelake(3) +cpuminer-sse2.exe "-msse2" Core2, Nehalem +cpuminer-aes-sse42.exe "-march=westmere" Westmere +cpuminer-avx.exe "-march=corei7-avx" Sandybridge, Ivybridge +cpuminer-avx2.exe "-march=core-avx2 -maes" Haswell(1) +cpuminer-avx512.exe "-march=skylake-avx512" Skylake-X, Cascadelake +cpuminer-avx512-sha.exe "-march=cascadelake -msha" Rocketlake(2) +cpuminer-avx512-sha-vaes.exe "-march=icelake-client" Icelake, Tigerlake(3) +cpuminer-zen.exe "-march=znver1" AMD Zen1, Zen2 +cpuminer-zen3.exe "-march=znver2 -mvaes" Zen3(4) (1) Haswell includes Broadwell, Skylake, Kabylake, Coffeelake & Cometlake. -(2) Zen3 build uses Zen2+VAES as workaround until Zen3 compiler support is - available. Zen2 CPUs should use Zen build. 
-(3) Icelake is only available on some laptops. Mining with a laptop is not -recommended. +(2) Rocketlake build uses cascadelake+sha as a workaround until Rocketlake + compiler support is avalable. +(3) Icelake & Tigerlake are only available on some laptops. Mining with a + laptop is not recommended. +(4) Zen3 build uses zen2+vaes as a workaround until Zen3 compiler support is + available. Zen2 CPUs should use Zen1 build. Notes about included DLL files: diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 106d499..a89d2f3 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -65,11 +65,20 @@ If not what makes it happen or not happen? Change Log ---------- +v3.15.3 + +Yescrypt algos now use yespower v0.5, a little faster. +New implementation of sha256 using SHA CPU extension. +Replace Openssl with SPH for sha256 & sha512. +AVX512 optimization for sha256t & sha256q. +Faster sha256t, sha256q, x21s, x22i & x25x on CPUs with SHA without AVX512. +AVX512+SHA build for Intel Rocketlake added to Windows binary package. + v3.15.2 Zen3 AVX2+VAES optimization for x16*, x17, sonoa, xevan, x21s, x22i, x25x, allium. -Zen3 build added to Windows binary package. +Zen3 (AVX2+SHA+VAES) build added to Windows binary package. 
v3.15.1 diff --git a/algo-gate-api.c b/algo-gate-api.c index 38da868..e407ef7 100644 --- a/algo-gate-api.c +++ b/algo-gate-api.c @@ -370,11 +370,15 @@ bool register_algo_gate( int algo, algo_gate_t *gate ) case ALGO_X22I: register_x22i_algo ( gate ); break; case ALGO_X25X: register_x25x_algo ( gate ); break; case ALGO_XEVAN: register_xevan_algo ( gate ); break; - case ALGO_YESCRYPT: register_yescrypt_algo ( gate ); break; - case ALGO_YESCRYPTR8: register_yescryptr8_algo ( gate ); break; + case ALGO_YESCRYPT: register_yescrypt_05_algo ( gate ); break; +// case ALGO_YESCRYPT: register_yescrypt_algo ( gate ); break; + case ALGO_YESCRYPTR8: register_yescryptr8_05_algo ( gate ); break; +// case ALGO_YESCRYPTR8: register_yescryptr8_algo ( gate ); break; case ALGO_YESCRYPTR8G: register_yescryptr8g_algo ( gate ); break; - case ALGO_YESCRYPTR16: register_yescryptr16_algo ( gate ); break; - case ALGO_YESCRYPTR32: register_yescryptr32_algo ( gate ); break; + case ALGO_YESCRYPTR16: register_yescryptr16_05_algo( gate ); break; +// case ALGO_YESCRYPTR16: register_yescryptr16_algo ( gate ); break; + case ALGO_YESCRYPTR32: register_yescryptr32_05_algo( gate ); break; +// case ALGO_YESCRYPTR32: register_yescryptr32_algo ( gate ); break; case ALGO_YESPOWER: register_yespower_algo ( gate ); break; case ALGO_YESPOWERR16: register_yespowerr16_algo ( gate ); break; case ALGO_YESPOWER_B2B: register_yespower_b2b_algo ( gate ); break; diff --git a/algo/groestl/myr-groestl.c b/algo/groestl/myr-groestl.c index fe5b920..5a67303 100644 --- a/algo/groestl/myr-groestl.c +++ b/algo/groestl/myr-groestl.c @@ -11,7 +11,7 @@ #else #include "sph_groestl.h" #endif -#include +#include "algo/sha/sph_sha2.h" typedef struct { #ifdef __AES__ @@ -19,7 +19,7 @@ typedef struct { #else sph_groestl512_context groestl; #endif - SHA256_CTX sha; + sph_sha256_context sha; } myrgr_ctx_holder; myrgr_ctx_holder myrgr_ctx; @@ -31,7 +31,7 @@ void init_myrgr_ctx() #else sph_groestl512_init( &myrgr_ctx.groestl ); #endif - 
SHA256_Init( &myrgr_ctx.sha ); + sph_sha256_init( &myrgr_ctx.sha ); } void myriad_hash(void *output, const void *input) @@ -39,54 +39,55 @@ void myriad_hash(void *output, const void *input) myrgr_ctx_holder ctx; memcpy( &ctx, &myrgr_ctx, sizeof(myrgr_ctx) ); - uint32_t _ALIGN(32) hash[16]; + uint32_t _ALIGN(32) hash[16]; #ifdef __AES__ update_groestl( &ctx.groestl, (char*)input, 640 ); final_groestl( &ctx.groestl, (char*)hash); #else - sph_groestl512(&ctx.groestl, input, 80); - sph_groestl512_close(&ctx.groestl, hash); + sph_groestl512(&ctx.groestl, input, 80); + sph_groestl512_close(&ctx.groestl, hash); #endif - SHA256_Update( &ctx.sha, (unsigned char*)hash, 64 ); - SHA256_Final( (unsigned char*)hash, &ctx.sha ); + sph_sha256( &ctx.sha, hash, 64 ); + sph_sha256_close( &ctx.sha, hash ); - memcpy(output, hash, 32); + memcpy(output, hash, 32); } int scanhash_myriad( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t _ALIGN(64) endiandata[20]; + uint32_t _ALIGN(64) endiandata[20]; uint32_t *pdata = work->data; uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - uint32_t nonce = first_nonce; - int thr_id = mythr->id; // thr_id arg is deprecated + const uint32_t first_nonce = pdata[19]; + uint32_t nonce = first_nonce; + int thr_id = mythr->id; - if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000ff; + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x0000ff; - swab32_array( endiandata, pdata, 20 ); + swab32_array( endiandata, pdata, 20 ); - do { - const uint32_t Htarg = ptarget[7]; - uint32_t hash[8]; - be32enc(&endiandata[19], nonce); - myriad_hash(hash, endiandata); + do { + const uint32_t Htarg = ptarget[7]; + uint32_t hash[8]; + be32enc(&endiandata[19], nonce); + myriad_hash(hash, endiandata); - if (hash[7] <= Htarg && fulltest(hash, ptarget)) { - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce; - return 1; - } - nonce++; + if (hash[7] <= Htarg && fulltest(hash, ptarget)) + { + 
pdata[19] = nonce; + *hashes_done = pdata[19] - first_nonce; + return 1; + } + nonce++; - } while (nonce < max_nonce && !work_restart[thr_id].restart); + } while (nonce < max_nonce && !work_restart[thr_id].restart); - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; - return 0; + pdata[19] = nonce; + *hashes_done = pdata[19] - first_nonce + 1; + return 0; } #endif diff --git a/algo/m7m/m7m.c b/algo/m7m/m7m.c index 159fed9..ab13a7e 100644 --- a/algo/m7m/m7m.c +++ b/algo/m7m/m7m.c @@ -12,8 +12,7 @@ #include "algo/tiger/sph_tiger.h" #include "algo/whirlpool/sph_whirlpool.h" #include "algo/ripemd/sph_ripemd.h" -#include - +#include "algo/sha/sph_sha2.h" #define EPSa DBL_EPSILON #define EPS1 DBL_EPSILON @@ -105,8 +104,8 @@ uint32_t sw2_( int nnounce ) } typedef struct { - SHA256_CTX sha256; - SHA512_CTX sha512; + sph_sha256_context sha256; + sph_sha512_context sha512; sph_keccak512_context keccak; sph_whirlpool_context whirlpool; sph_haval256_5_context haval; @@ -118,8 +117,8 @@ m7m_ctx_holder m7m_ctx; void init_m7m_ctx() { - SHA256_Init( &m7m_ctx.sha256 ); - SHA512_Init( &m7m_ctx.sha512 ); + sph_sha256_init( &m7m_ctx ); + sph_sha512_init( &m7m_ctx.sha512 ); sph_keccak512_init( &m7m_ctx.keccak ); sph_whirlpool_init( &m7m_ctx.whirlpool ); sph_haval256_5_init( &m7m_ctx.haval ); @@ -143,11 +142,10 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce, uint32_t hash[8] __attribute__((aligned(64))); uint8_t bhash[7][64] __attribute__((aligned(64))); uint32_t n = pdata[19] - 1; - int thr_id = mythr->id; // thr_id arg is deprecated + int thr_id = mythr->id; uint32_t usw_, mpzscale; const uint32_t first_nonce = pdata[19]; char data_str[161], hash_str[65], target_str[65]; - //uint8_t *bdata = 0; uint8_t bdata[8192] __attribute__ ((aligned (64))); int i, digits; int bytes; @@ -155,12 +153,12 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce, m7m_ctx_holder ctx1, ctx2 __attribute__ ((aligned (64))); memcpy( &ctx1, &m7m_ctx, sizeof(m7m_ctx) 
); - SHA256_CTX ctxf_sha256; + sph_sha256_context ctxf_sha256; memcpy(data, pdata, 80); - SHA256_Update( &ctx1.sha256, data, M7_MIDSTATE_LEN ); - SHA512_Update( &ctx1.sha512, data, M7_MIDSTATE_LEN ); + sph_sha256( &ctx1.sha256, data, M7_MIDSTATE_LEN ); + sph_sha512( &ctx1.sha512, data, M7_MIDSTATE_LEN ); sph_keccak512( &ctx1.keccak, data, M7_MIDSTATE_LEN ); sph_whirlpool( &ctx1.whirlpool, data, M7_MIDSTATE_LEN ); sph_haval256_5( &ctx1.haval, data, M7_MIDSTATE_LEN ); @@ -191,11 +189,11 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce, memcpy( &ctx2, &ctx1, sizeof(m7m_ctx) ); - SHA256_Update( &ctx2.sha256, data_p64, 80 - M7_MIDSTATE_LEN ); - SHA256_Final( (unsigned char*) (bhash[0]), &ctx2.sha256 ); + sph_sha256( &ctx2.sha256, data_p64, 80 - M7_MIDSTATE_LEN ); + sph_sha256_close( &ctx2.sha256, bhash[0] ); - SHA512_Update( &ctx2.sha512, data_p64, 80 - M7_MIDSTATE_LEN ); - SHA512_Final( (unsigned char*) (bhash[1]), &ctx2.sha512 ); + sph_sha512( &ctx2.sha512, data_p64, 80 - M7_MIDSTATE_LEN ); + sph_sha512_close( &ctx2.sha512, bhash[1] ); sph_keccak512( &ctx2.keccak, data_p64, 80 - M7_MIDSTATE_LEN ); sph_keccak512_close( &ctx2.keccak, (void*)(bhash[2]) ); @@ -227,9 +225,9 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce, bytes = mpz_sizeinbase(product, 256); mpz_export((void *)bdata, NULL, -1, 1, 0, 0, product); - SHA256_Init( &ctxf_sha256 ); - SHA256_Update( &ctxf_sha256, bdata, bytes ); - SHA256_Final( (unsigned char*) hash, &ctxf_sha256 ); + sph_sha256_init( &ctxf_sha256 ); + sph_sha256( &ctxf_sha256, bdata, bytes ); + sph_sha256_close( &ctxf_sha256, hash ); digits=(int)((sqrt((double)(n/2))*(1.+EPS))/9000+75); mp_bitcnt_t prec = (long int)(digits*BITS_PER_DIGIT+16); @@ -262,18 +260,13 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce, mpzscale=bytes; mpz_export(bdata, NULL, -1, 1, 0, 0, product); - SHA256_Init( &ctxf_sha256 ); - SHA256_Update( &ctxf_sha256, bdata, bytes ); - SHA256_Final( (unsigned char*) hash, &ctxf_sha256 
); - } - + sph_sha256_init( &ctxf_sha256 ); + sph_sha256( &ctxf_sha256, bdata, bytes ); + sph_sha256_close( &ctxf_sha256, hash ); + } if ( unlikely( valid_hash( (uint64_t*)hash, (uint64_t*)ptarget ) && !opt_benchmark ) ) - - -// if ( unlikely( hash[7] <= ptarget[7] ) ) -// if ( likely( fulltest( hash, ptarget ) && !opt_benchmark ) ) { if ( opt_debug ) { diff --git a/algo/quark/hmq1725.c b/algo/quark/hmq1725.c index 41e3cfc..09da6c0 100644 --- a/algo/quark/hmq1725.c +++ b/algo/quark/hmq1725.c @@ -17,7 +17,7 @@ #include "algo/shabal/sph_shabal.h" #include "algo/whirlpool/sph_whirlpool.h" #include "algo/haval/sph-haval.h" -#include +#include "algo/sha/sph_sha2.h" #if defined(__AES__) #include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/echo/aes_ni/hash_api.h" @@ -44,7 +44,7 @@ typedef struct { sph_hamsi512_context hamsi1; sph_shabal512_context shabal1; sph_whirlpool_context whirlpool1, whirlpool2, whirlpool3, whirlpool4; - SHA512_CTX sha1, sha2; + sph_sha512_context sha1, sha2; sph_haval256_5_context haval1, haval2; #if defined(__AES__) hashState_echo echo1, echo2; @@ -106,8 +106,8 @@ void init_hmq1725_ctx() sph_whirlpool_init(&hmq1725_ctx.whirlpool3); sph_whirlpool_init(&hmq1725_ctx.whirlpool4); - SHA512_Init( &hmq1725_ctx.sha1 ); - SHA512_Init( &hmq1725_ctx.sha2 ); + sph_sha512_init( &hmq1725_ctx.sha1 ); + sph_sha512_init( &hmq1725_ctx.sha2 ); sph_haval256_5_init(&hmq1725_ctx.haval1); sph_haval256_5_init(&hmq1725_ctx.haval2); @@ -285,8 +285,8 @@ extern void hmq1725hash(void *state, const void *input) } else { - SHA512_Update( &h_ctx.sha1, hashB, 64 ); - SHA512_Final( (unsigned char*) hashA, &h_ctx.sha1 ); + sph_sha512( &h_ctx.sha1, hashB, 64 ); + sph_sha512_close( &h_ctx.sha1, hashA ); } #if defined(__AES__) @@ -297,8 +297,8 @@ extern void hmq1725hash(void *state, const void *input) sph_groestl512_close(&h_ctx.groestl2, hashB); //4 #endif - SHA512_Update( &h_ctx.sha2, hashB, 64 ); - SHA512_Final( (unsigned char*) hashA, &h_ctx.sha2 ); + sph_sha512( 
&h_ctx.sha2, hashB, 64 ); + sph_sha512_close( &h_ctx.sha2, hashA ); if ( hashA[0] & mask ) //4 { diff --git a/algo/ripemd/lbry.c b/algo/ripemd/lbry.c index 314d019..94f3417 100644 --- a/algo/ripemd/lbry.c +++ b/algo/ripemd/lbry.c @@ -7,28 +7,28 @@ #include #include #include "sph_ripemd.h" -#include +#include "algo/sha/sph_sha2.h" void lbry_hash(void* output, const void* input) { - SHA256_CTX ctx_sha256 __attribute__ ((aligned (64))); - SHA512_CTX ctx_sha512 __attribute__ ((aligned (64))); - sph_ripemd160_context ctx_ripemd __attribute__ ((aligned (64))); + sph_sha256_context ctx_sha256 __attribute__ ((aligned (64))); + sph_sha512_context ctx_sha512 __attribute__ ((aligned (64))); + sph_ripemd160_context ctx_ripemd __attribute__ ((aligned (64))); uint32_t _ALIGN(64) hashA[16]; uint32_t _ALIGN(64) hashB[16]; uint32_t _ALIGN(64) hashC[16]; - SHA256_Init( &ctx_sha256 ); - SHA256_Update( &ctx_sha256, input, 112 ); - SHA256_Final( (unsigned char*) hashA, &ctx_sha256 ); + sph_sha256_init( &ctx_sha256 ); + sph_sha256( &ctx_sha256, input, 112 ); + sph_sha256_close( &ctx_sha256, hashA ); - SHA256_Init( &ctx_sha256 ); - SHA256_Update( &ctx_sha256, hashA, 32 ); - SHA256_Final( (unsigned char*) hashA, &ctx_sha256 ); + sph_sha256_init( &ctx_sha256 ); + sph_sha256( &ctx_sha256, hashA, 32 ); + sph_sha256_close( &ctx_sha256, hashA ); - SHA512_Init( &ctx_sha512 ); - SHA512_Update( &ctx_sha512, hashA, 32 ); - SHA512_Final( (unsigned char*) hashA, &ctx_sha512 ); + sph_sha512_init( &ctx_sha512 ); + sph_sha512( &ctx_sha512, hashA, 32 ); + sph_sha512_close( &ctx_sha512, hashA ); sph_ripemd160_init( &ctx_ripemd ); sph_ripemd160 ( &ctx_ripemd, hashA, 32 ); @@ -38,14 +38,14 @@ void lbry_hash(void* output, const void* input) sph_ripemd160 ( &ctx_ripemd, hashA+8, 32 ); sph_ripemd160_close( &ctx_ripemd, hashC ); - SHA256_Init( &ctx_sha256 ); - SHA256_Update( &ctx_sha256, hashB, 20 ); - SHA256_Update( &ctx_sha256, hashC, 20 ); - SHA256_Final( (unsigned char*) hashA, &ctx_sha256 ); + 
sph_sha256_init( &ctx_sha256 ); + sph_sha256( &ctx_sha256, hashB, 20 ); + sph_sha256( &ctx_sha256, hashC, 20 ); + sph_sha256_close( &ctx_sha256, hashA ); - SHA256_Init( &ctx_sha256 ); - SHA256_Update( &ctx_sha256, hashA, 32 ); - SHA256_Final( (unsigned char*) hashA, &ctx_sha256 ); + sph_sha256_init( &ctx_sha256 ); + sph_sha256( &ctx_sha256, hashA, 32 ); + sph_sha256_close( &ctx_sha256, hashA ); memcpy( output, hashA, 32 ); } diff --git a/algo/sha/hmac-sha256-hash.c b/algo/sha/hmac-sha256-hash.c index 99b68d8..3c2f4d2 100644 --- a/algo/sha/hmac-sha256-hash.c +++ b/algo/sha/hmac-sha256-hash.c @@ -39,10 +39,17 @@ void SHA256_Buf( const void * in, size_t len, uint8_t digest[32] ) { - SHA256_CTX ctx; +#if defined(HMAC_SPH_SHA) + sph_sha256_context ctx; + sph_sha256_init( &ctx ); + sph_sha256( &ctx, in, len ); + sph_sha256_close( &ctx, digest ); +#else + SHA256_CTX ctx; SHA256_Init( &ctx ); SHA256_Update( &ctx, in, len ); SHA256_Final( digest, &ctx ); +#endif } /** @@ -64,35 +71,59 @@ HMAC_SHA256_Buf( const void *K, size_t Klen, const void *in, size_t len, void HMAC_SHA256_Init( HMAC_SHA256_CTX *ctx, const void *_K, size_t Klen ) { - unsigned char pad[64]; - unsigned char khash[32]; - const unsigned char * K = _K; - size_t i; + unsigned char pad[64]; + unsigned char khash[32]; + const unsigned char * K = _K; + size_t i; - /* If Klen > 64, the key is really SHA256(K). */ - if ( Klen > 64 ) + /* If Klen > 64, the key is really SHA256(K). */ + if ( Klen > 64 ) { - SHA256_Init( &ctx->ictx ); - SHA256_Update( &ctx->ictx, K, Klen ); - SHA256_Final( khash, &ctx->ictx ); - K = khash; - Klen = 32; - } + +#if defined(HMAC_SPH_SHA) + sph_sha256_init( &ctx->ictx ); + sph_sha256( &ctx->ictx, K, Klen ); + sph_sha256_close( &ctx->ictx, khash ); +#else + SHA256_Init( &ctx->ictx ); + SHA256_Update( &ctx->ictx, K, Klen ); + SHA256_Final( khash, &ctx->ictx ); +#endif + K = khash; + Klen = 32; + } - /* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). 
*/ + /* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */ +#if defined(HMAC_SPH_SHA) + sph_sha256_init( &ctx->ictx ); +#else SHA256_Init( &ctx->ictx ); - +#endif for ( i = 0; i < Klen; i++ ) pad[i] = K[i] ^ 0x36; - memset( pad + Klen, 0x36, 64 - Klen ); - SHA256_Update( &ctx->ictx, pad, 64 ); - /* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */ - SHA256_Init( &ctx->octx ); + memset( pad + Klen, 0x36, 64 - Klen ); +#if defined(HMAC_SPH_SHA) + sph_sha256( &ctx->ictx, pad, 64 ); +#else + SHA256_Update( &ctx->ictx, pad, 64 ); +#endif + + /* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */ +#if defined(HMAC_SPH_SHA) + sph_sha256_init( &ctx->octx ); +#else + SHA256_Init( &ctx->octx ); +#endif for ( i = 0; i < Klen; i++ ) pad[i] = K[i] ^ 0x5c; + memset( pad + Klen, 0x5c, 64 - Klen ); - SHA256_Update( &ctx->octx, pad, 64 ); +#if defined(HMAC_SPH_SHA) + sph_sha256( &ctx->octx, pad, 64 ); +#else + SHA256_Update( &ctx->octx, pad, 64 ); +#endif } /* Add bytes to the HMAC-SHA256 operation. */ @@ -100,23 +131,33 @@ void HMAC_SHA256_Update( HMAC_SHA256_CTX *ctx, const void *in, size_t len ) { /* Feed data to the inner SHA256 operation. */ - SHA256_Update( &ctx->ictx, in, len ); +#if defined(HMAC_SPH_SHA) + sph_sha256( &ctx->ictx, in, len ); +#else + SHA256_Update( &ctx->ictx, in, len ); +#endif } /* Finish an HMAC-SHA256 operation. */ void HMAC_SHA256_Final( unsigned char digest[32], HMAC_SHA256_CTX *ctx ) { - unsigned char ihash[32]; + unsigned char ihash[32]; - /* Finish the inner SHA256 operation. */ - SHA256_Final( ihash, &ctx->ictx ); +#if defined(HMAC_SPH_SHA) + sph_sha256_close( &ctx->ictx, ihash ); + sph_sha256( &ctx->octx, ihash, 32 ); + sph_sha256_close( &ctx->octx, digest ); +#else + /* Finish the inner SHA256 operation. */ + SHA256_Final( ihash, &ctx->ictx ); - /* Feed the inner hash to the outer SHA256 operation. 
*/ - SHA256_Update( &ctx->octx, ihash, 32 ); + /* Feed the inner hash to the outer SHA256 operation. */ + SHA256_Update( &ctx->octx, ihash, 32 ); - /* Finish the outer SHA256 operation. */ - SHA256_Final( digest, &ctx->octx ); + /* Finish the outer SHA256 operation. */ + SHA256_Final( digest, &ctx->octx ); +#endif } /** diff --git a/algo/sha/hmac-sha256-hash.h b/algo/sha/hmac-sha256-hash.h index 0a020f6..41e5673 100644 --- a/algo/sha/hmac-sha256-hash.h +++ b/algo/sha/hmac-sha256-hash.h @@ -29,14 +29,24 @@ #ifndef HMAC_SHA256_H__ #define HMAC_SHA256_H__ +//#define HMAC_SSL_SHA 1 +#define HMAC_SPH_SHA 1 + #include #include +#include "sph_sha2.h" #include + typedef struct HMAC_SHA256Context { - SHA256_CTX ictx; - SHA256_CTX octx; +#if defined(HMAC_SPH_SHA) + sph_sha256_context ictx; + sph_sha256_context octx; +#else + SHA256_CTX ictx; + SHA256_CTX octx; +#endif } HMAC_SHA256_CTX; void SHA256_Buf( const void *, size_t len, uint8_t digest[32] ); diff --git a/algo/sha/sha2.c b/algo/sha/sha2.c index 2b0c570..33cc6c1 100644 --- a/algo/sha/sha2.c +++ b/algo/sha/sha2.c @@ -12,7 +12,6 @@ #include #include -#include #if defined(USE_ASM) && defined(__arm__) && defined(__APCS_32__) #define EXTERN_SHA256 @@ -198,16 +197,6 @@ static void sha256d_80_swap(uint32_t *hash, const uint32_t *data) extern void sha256d(unsigned char *hash, const unsigned char *data, int len) { -#if defined(__SHA__) - SHA256_CTX ctx; - SHA256_Init( &ctx ); - SHA256_Update( &ctx, data, len ); - SHA256_Final( (unsigned char*)hash, &ctx ); - SHA256_Init( &ctx ); - SHA256_Update( &ctx, hash, 32 ); - SHA256_Final( (unsigned char*)hash, &ctx ); -#else - uint32_t S[16], T[16]; int i, r; @@ -229,7 +218,6 @@ extern void sha256d(unsigned char *hash, const unsigned char *data, int len) sha256_transform(T, S, 0); for (i = 0; i < 8; i++) be32enc((uint32_t *)hash + i, T[i]); -#endif } static inline void sha256d_preextend(uint32_t *W) @@ -676,14 +664,9 @@ int scanhash_SHA256d( struct work *work, const uint32_t max_nonce, 
bool register_sha256d_algo( algo_gate_t* gate ) { -#if defined(__SHA__) - gate->optimizations = SHA_OPT; - gate->scanhash = (void*)&scanhash_SHA256d; -#else gate->optimizations = SSE2_OPT | AVX2_OPT; gate->scanhash = (void*)&scanhash_sha256d; -#endif - gate->hash = (void*)&sha256d; - return true; + gate->hash = (void*)&sha256d; + return true; }; diff --git a/algo/sha/sha256-hash-opt.c b/algo/sha/sha256-hash-opt.c new file mode 100644 index 0000000..fb049b1 --- /dev/null +++ b/algo/sha/sha256-hash-opt.c @@ -0,0 +1,200 @@ +/* Intel SHA extensions using C intrinsics */ +/* Written and place in public domain by Jeffrey Walton */ +/* Based on code from Intel, and by Sean Gulley for */ +/* the miTLS project. */ + +// A drop in replacement for the function of the same name in sph_sha2.c. + +#if defined(__SHA__) + +#include "simd-utils.h" + +static void sha2_round( const uint8_t input[], uint32_t state[8] ) +{ + __m128i STATE0, STATE1; + __m128i MSG, TMP, MASK; + __m128i TMSG0, TMSG1, TMSG2, TMSG3; + __m128i ABEF_SAVE, CDGH_SAVE; + + // Load initial values + TMP = _mm_load_si128((__m128i*) &state[0]); + STATE1 = _mm_load_si128((__m128i*) &state[4]); + MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); + + TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB + STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH + STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF + STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH + + // Save current hash + ABEF_SAVE = STATE0; + CDGH_SAVE = STATE1; + + // Rounds 0-3 + MSG = _mm_load_si128((const __m128i*) (input+0)); + TMSG0 = _mm_shuffle_epi8(MSG, MASK); + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Rounds 4-7 + TMSG1 = _mm_load_si128((const __m128i*) (input+16)); + TMSG1 = _mm_shuffle_epi8(TMSG1, MASK); + MSG = _mm_add_epi32(TMSG1, 
_mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); + + // Rounds 8-11 + TMSG2 = _mm_load_si128((const __m128i*) (input+32)); + TMSG2 = _mm_shuffle_epi8(TMSG2, MASK); + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); + + // Rounds 12-15 + TMSG3 = _mm_load_si128((const __m128i*) (input+48)); + TMSG3 = _mm_shuffle_epi8(TMSG3, MASK); + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); + TMSG0 = _mm_add_epi32(TMSG0, TMP); + TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); + + // Rounds 16-19 + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); + TMSG1 = _mm_add_epi32(TMSG1, TMP); + TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); + + // Rounds 20-23 + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); + TMSG2 = _mm_add_epi32(TMSG2, TMP); + TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, 
MSG); + TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); + + // Rounds 24-27 + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); + TMSG3 = _mm_add_epi32(TMSG3, TMP); + TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); + + // Rounds 28-31 + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); + TMSG0 = _mm_add_epi32(TMSG0, TMP); + TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); + + // Rounds 32-35 + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); + TMSG1 = _mm_add_epi32(TMSG1, TMP); + TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); + + // Rounds 36-39 + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); + TMSG2 = _mm_add_epi32(TMSG2, TMP); + TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); + + // Rounds 40-43 + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); + TMSG3 = 
_mm_add_epi32(TMSG3, TMP); + TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); + + // Rounds 44-47 + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); + TMSG0 = _mm_add_epi32(TMSG0, TMP); + TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); + + // Rounds 48-51 + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); + TMSG1 = _mm_add_epi32(TMSG1, TMP); + TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); + + // Rounds 52-55 + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); + TMSG2 = _mm_add_epi32(TMSG2, TMP); + TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Rounds 56-59 + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); + TMSG3 = _mm_add_epi32(TMSG3, TMP); + TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Rounds 60-63 + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, 
STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Add values back to state + STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE); + STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE); + + TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA + STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG + STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA + STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF + + // Save state + _mm_store_si128((__m128i*) &state[0], STATE0); + _mm_store_si128((__m128i*) &state[4], STATE1); +} + + +#endif diff --git a/algo/sha/sha256q-4way.c b/algo/sha/sha256q-4way.c index 34c67b4..19b8d10 100644 --- a/algo/sha/sha256q-4way.c +++ b/algo/sha/sha256q-4way.c @@ -5,6 +5,79 @@ #include #include "sha-hash-4way.h" +#if defined(SHA256T_16WAY) + +static __thread sha256_16way_context sha256_ctx16 __attribute__ ((aligned (64))); + +void sha256q_16way_hash( void* output, const void* input ) +{ + uint32_t vhash[8*16] __attribute__ ((aligned (64))); + sha256_16way_context ctx; + memcpy( &ctx, &sha256_ctx16, sizeof ctx ); + + sha256_16way_update( &ctx, input + (64<<4), 16 ); + sha256_16way_close( &ctx, vhash ); + + sha256_16way_init( &ctx ); + sha256_16way_update( &ctx, vhash, 32 ); + sha256_16way_close( &ctx, vhash ); + + sha256_16way_init( &ctx ); + sha256_16way_update( &ctx, vhash, 32 ); + sha256_16way_close( &ctx, vhash ); + + sha256_16way_init( &ctx ); + sha256_16way_update( &ctx, vhash, 32 ); + sha256_16way_close( &ctx, output ); +} + +int scanhash_sha256q_16way( struct work *work, const uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t vdata[20*16] __attribute__ ((aligned (64))); + uint32_t hash32[8*16] __attribute__ ((aligned (32))); + uint32_t lane_hash[8] __attribute__ ((aligned (32))); + uint32_t *hash32_d7 = &(hash32[7<<4]); + uint32_t *pdata = work->data; + const uint32_t *ptarget = work->target; + const uint32_t targ32_d7 = ptarget[7]; + const uint32_t first_nonce = 
pdata[19]; + const uint32_t last_nonce = max_nonce - 16; + uint32_t n = first_nonce; + __m512i *noncev = (__m512i*)vdata + 19; // aligned + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + + mm512_bswap32_intrlv80_16x32( vdata, pdata ); + *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8, + n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n ); + sha256_16way_init( &sha256_ctx16 ); + sha256_16way_update( &sha256_ctx16, vdata, 64 ); + + do + { + pdata[19] = n; + sha256q_16way_hash( hash32, vdata ); + for ( int lane = 0; lane < 16; lane++ ) + if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) + { + extr_lane_16x32( lane_hash, hash32, lane, 256 ); + if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + { + pdata[19] = bswap_32( n + lane ); + submit_solution( work, lane_hash, mythr ); + } + } + *noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) ); + n += 16; + } while ( (n < last_nonce) && !work_restart[thr_id].restart ); + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; +} + +#endif + #if defined(SHA256T_8WAY) static __thread sha256_8way_context sha256_ctx8 __attribute__ ((aligned (64))); @@ -31,68 +104,47 @@ void sha256q_8way_hash( void* output, const void* input ) sha256_8way_close( &ctx, output ); } -int scanhash_sha256q_8way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) +int scanhash_sha256q_8way( struct work *work, const uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t vdata[20*8] __attribute__ ((aligned (64))); - uint32_t hash[8*8] __attribute__ ((aligned (32))); + uint32_t vdata[20*8] __attribute__ ((aligned (64))); + uint32_t hash32[8*8] __attribute__ ((aligned (32))); uint32_t lane_hash[8] __attribute__ ((aligned (32))); + uint32_t *hash32_d7 = &(hash32[7<<3]); uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; + const uint32_t *ptarget = work->target; + const uint32_t 
targ32_d7 = ptarget[7]; const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 8; uint32_t n = first_nonce; __m256i *noncev = (__m256i*)vdata + 19; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated + const int thr_id = mythr->id; + const bool bench = opt_benchmark; - const uint64_t htmax[] = { 0, - 0xF, - 0xFF, - 0xFFF, - 0xFFFF, - 0x10000000 }; - const uint32_t masks[] = { 0xFFFFFFFF, - 0xFFFFFFF0, - 0xFFFFFF00, - 0xFFFFF000, - 0xFFFF0000, - 0 }; - - // Need big endian data mm256_bswap32_intrlv80_8x32( vdata, pdata ); + *noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ); sha256_8way_init( &sha256_ctx8 ); sha256_8way_update( &sha256_ctx8, vdata, 64 ); - for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] ) + do { - uint32_t mask = masks[m]; - do - { - *noncev = mm256_bswap_32( - _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) ); - - pdata[19] = n; - sha256q_8way_hash( hash, vdata ); - - uint32_t *hash7 = &(hash[7<<3]); - - for ( int lane = 0; lane < 8; lane++ ) - if ( !( hash7[ lane ] & mask ) ) - { - // deinterleave hash for lane - extr_lane_8x32( lane_hash, hash, lane, 256 ); - - if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) - { - pdata[19] = n + lane; - submit_solution( work, lane_hash, mythr ); - } - } - n += 8; - } while ( (n < max_nonce-10) && !work_restart[thr_id].restart ); - break; - } - *hashes_done = n - first_nonce + 1; + pdata[19] = n; + sha256q_8way_hash( hash32, vdata ); + for ( int lane = 0; lane < 8; lane++ ) + if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) + { + extr_lane_8x32( lane_hash, hash32, lane, 256 ); + if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + { + pdata[19] = bswap_32( n + lane ); + submit_solution( work, lane_hash, mythr ); + } + } + *noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) ); + n += 8; + } while ( (n < last_nonce) && !work_restart[thr_id].restart ); + pdata[19] = n; + *hashes_done = n - first_nonce; return 0; } diff --git 
a/algo/sha/sha256q.c b/algo/sha/sha256q.c index 772ba41..cf9890e 100644 --- a/algo/sha/sha256q.c +++ b/algo/sha/sha256q.c @@ -1,108 +1,74 @@ #include "sha256t-gate.h" - -#if !defined(SHA256T_16WAY) && !defined(SHA256T_8WAY) && !defined(SHA256T_4WAY) - #include #include #include #include -#include +#include "algo/sha/sph_sha2.h" -static __thread SHA256_CTX sha256q_ctx __attribute__ ((aligned (64))); +static __thread sph_sha256_context sha256q_ctx __attribute__ ((aligned (64))); void sha256q_midstate( const void* input ) { - SHA256_Init( &sha256q_ctx ); - SHA256_Update( &sha256q_ctx, input, 64 ); + sph_sha256_init( &sha256q_ctx ); + sph_sha256( &sha256q_ctx, input, 64 ); } -void sha256q_hash( void* output, const void* input ) +int sha256q_hash( void* output, const void* input ) { uint32_t _ALIGN(64) hash[16]; const int midlen = 64; // bytes const int tail = 80 - midlen; // 16 - SHA256_CTX ctx __attribute__ ((aligned (64))); + sph_sha256_context ctx __attribute__ ((aligned (64))); memcpy( &ctx, &sha256q_ctx, sizeof sha256q_ctx ); - SHA256_Update( &ctx, input + midlen, tail ); - SHA256_Final( (unsigned char*)hash, &ctx ); + sph_sha256( &ctx, input + midlen, tail ); + sph_sha256_close( &ctx, hash ); - SHA256_Init( &ctx ); - SHA256_Update( &ctx, hash, 32 ); - SHA256_Final( (unsigned char*)hash, &ctx ); + sph_sha256_init( &ctx ); + sph_sha256( &ctx, hash, 32 ); + sph_sha256_close( &ctx, hash ); - SHA256_Init( &ctx ); - SHA256_Update( &ctx, hash, 32 ); - SHA256_Final( (unsigned char*)hash, &ctx ); + sph_sha256_init( &ctx ); + sph_sha256( &ctx, hash, 32 ); + sph_sha256_close( &ctx, hash ); - SHA256_Init( &ctx ); - SHA256_Update( &ctx, hash, 32 ); - SHA256_Final( (unsigned char*)hash, &ctx ); + sph_sha256_init( &ctx ); + sph_sha256( &ctx, hash, 32 ); + sph_sha256_close( &ctx, output ); - memcpy( output, hash, 32 ); + return 1; } int scanhash_sha256q( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { + uint32_t edata[20] 
__attribute__((aligned(64))); + uint32_t hash[8] __attribute__((aligned(64))); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; - uint32_t n = pdata[19] - 1; const uint32_t first_nonce = pdata[19]; - const uint32_t Htarg = ptarget[7]; -#ifdef _MSC_VER - uint32_t __declspec(align(32)) hash64[8]; -#else - uint32_t hash64[8] __attribute__((aligned(32))); -#endif - uint32_t endiandata[32]; - int thr_id = mythr->id; // thr_id arg is deprecated + const uint32_t last_nonce = max_nonce - 1; + uint32_t n = first_nonce; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; - uint64_t htmax[] = { - 0, - 0xF, - 0xFF, - 0xFFF, - 0xFFFF, - 0x10000000 - }; - uint32_t masks[] = { - 0xFFFFFFFF, - 0xFFFFFFF0, - 0xFFFFFF00, - 0xFFFFF000, - 0xFFFF0000, - 0 - }; + mm128_bswap32_80( edata, pdata ); + sha256q_midstate( edata ); - // we need bigendian data... - casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) ); - casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) ); - casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) ); - casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) ); - casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); - - sha256q_midstate( endiandata ); - - for ( int m = 0; m < 6; m++ ) + do { - if ( Htarg <= htmax[m] ) + edata[19] = n; + if ( likely( sha256q_hash( hash, edata ) ) ) + if ( unlikely( valid_hash( hash, ptarget ) && !bench ) ) { - uint32_t mask = masks[m]; - do { - pdata[19] = ++n; - be32enc(&endiandata[19], n); - sha256q_hash( hash64, endiandata ); - if ( !( hash64[7] & mask ) ) - if ( fulltest( hash64, ptarget ) && !opt_benchmark ) - submit_solution( work, hash64, mythr ); - } while ( n < max_nonce && !work_restart[thr_id].restart ); - break; + pdata[19] = bswap_32( n ); + submit_solution( work, hash, mythr ); } - } - *hashes_done = n - first_nonce + 1; + n++; + } while ( n < last_nonce && !work_restart[thr_id].restart ); + 
*hashes_done = n - first_nonce; pdata[19] = n; return 0; } -#endif + diff --git a/algo/sha/sha256t-4way.c b/algo/sha/sha256t-4way.c index b1d073a..eb11744 100644 --- a/algo/sha/sha256t-4way.c +++ b/algo/sha/sha256t-4way.c @@ -5,6 +5,75 @@ #include #include "sha-hash-4way.h" +#if defined(SHA256T_16WAY) + +static __thread sha256_16way_context sha256_ctx16 __attribute__ ((aligned (64))); + +void sha256t_16way_hash( void* output, const void* input ) +{ + uint32_t vhash[8*16] __attribute__ ((aligned (64))); + sha256_16way_context ctx; + memcpy( &ctx, &sha256_ctx16, sizeof ctx ); + + sha256_16way_update( &ctx, input + (64<<4), 16 ); + sha256_16way_close( &ctx, vhash ); + + sha256_16way_init( &ctx ); + sha256_16way_update( &ctx, vhash, 32 ); + sha256_16way_close( &ctx, vhash ); + + sha256_16way_init( &ctx ); + sha256_16way_update( &ctx, vhash, 32 ); + sha256_16way_close( &ctx, output ); +} + +int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t vdata[20*16] __attribute__ ((aligned (64))); + uint32_t hash32[8*16] __attribute__ ((aligned (32))); + uint32_t lane_hash[8] __attribute__ ((aligned (32))); + uint32_t *hash32_d7 = &(hash32[7<<4]); + uint32_t *pdata = work->data; + const uint32_t *ptarget = work->target; + const uint32_t targ32_d7 = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 16; + uint32_t n = first_nonce; + __m512i *noncev = (__m512i*)vdata + 19; // aligned + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + + mm512_bswap32_intrlv80_16x32( vdata, pdata ); + *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8, + n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n ); + sha256_16way_init( &sha256_ctx16 ); + sha256_16way_update( &sha256_ctx16, vdata, 64 ); + + do + { + pdata[19] = n; + sha256t_16way_hash( hash32, vdata ); + for ( int lane = 0; lane < 16; lane++ ) + if ( unlikely( hash32_d7[ lane ] <= 
targ32_d7 ) ) + { + extr_lane_16x32( lane_hash, hash32, lane, 256 ); + if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + { + pdata[19] = bswap_32( n + lane ); + submit_solution( work, lane_hash, mythr ); + } + } + *noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) ); + n += 16; + } while ( (n < last_nonce) && !work_restart[thr_id].restart ); + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; +} + +#endif + #if defined(SHA256T_8WAY) static __thread sha256_8way_context sha256_ctx8 __attribute__ ((aligned (64))); @@ -31,61 +100,43 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t vdata[20*8] __attribute__ ((aligned (64))); - uint32_t hash[8*8] __attribute__ ((aligned (32))); + uint32_t hash32[8*8] __attribute__ ((aligned (32))); uint32_t lane_hash[8] __attribute__ ((aligned (32))); - uint32_t *hash7 = &(hash[7<<3]); + uint32_t *hash32_d7 = &(hash32[7<<3]); uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; + const uint32_t targ32_d7 = ptarget[7]; const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 8; uint32_t n = first_nonce; __m256i *noncev = (__m256i*)vdata + 19; // aligned const int thr_id = mythr->id; + const bool bench = opt_benchmark; - const uint64_t htmax[] = { 0, - 0xF, - 0xFF, - 0xFFF, - 0xFFFF, - 0x10000000 }; - const uint32_t masks[] = { 0xFFFFFFFF, - 0xFFFFFFF0, - 0xFFFFFF00, - 0xFFFFF000, - 0xFFFF0000, - 0 }; - - - // Need big endian data mm256_bswap32_intrlv80_8x32( vdata, pdata ); + *noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ); sha256_8way_init( &sha256_ctx8 ); sha256_8way_update( &sha256_ctx8, vdata, 64 ); - for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] ) + do { - const uint32_t mask = masks[m]; - do - { - *noncev = mm256_bswap_32( _mm256_set_epi32( - n+7,n+6,n+5,n+4,n+3,n+2,n+1,n ) ); - pdata[19] = n; - sha256t_8way_hash( 
hash, vdata ); - for ( int lane = 0; lane < 8; lane++ ) - if ( !( hash7[ lane ] & mask ) ) - { - // deinterleave hash for lane - extr_lane_8x32( lane_hash, hash, lane, 256 ); - if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) - { - pdata[19] = n + lane; - submit_solution( work, lane_hash, mythr ); - } - } - n += 8; - } while ( (n < max_nonce-10) && !work_restart[thr_id].restart ); - break; - } - *hashes_done = n - first_nonce + 1; + pdata[19] = n; + sha256t_8way_hash( hash32, vdata ); + for ( int lane = 0; lane < 8; lane++ ) + if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) + { + extr_lane_8x32( lane_hash, hash32, lane, 256 ); + if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + { + pdata[19] = bswap_32( n + lane ); + submit_solution( work, lane_hash, mythr ); + } + } + *noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) ); + n += 8; + } while ( (n < last_nonce) && !work_restart[thr_id].restart ); + pdata[19] = n; + *hashes_done = n - first_nonce; return 0; } diff --git a/algo/sha/sha256t-gate.c b/algo/sha/sha256t-gate.c index 15ce7db..166efe2 100644 --- a/algo/sha/sha256t-gate.c +++ b/algo/sha/sha256t-gate.c @@ -2,40 +2,41 @@ bool register_sha256t_algo( algo_gate_t* gate ) { -#if defined(SHA256T_8WAY) + gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT; +#if defined(SHA256T_16WAY) + gate->scanhash = (void*)&scanhash_sha256t_16way; + gate->hash = (void*)&sha256t_16way_hash; +#elif defined(__SHA__) + gate->optimizations = SHA_OPT; + gate->scanhash = (void*)&scanhash_sha256t; + gate->hash = (void*)&sha256t_hash; +#elif defined(SHA256T_8WAY) gate->scanhash = (void*)&scanhash_sha256t_8way; gate->hash = (void*)&sha256t_8way_hash; #else gate->scanhash = (void*)&scanhash_sha256t_4way; gate->hash = (void*)&sha256t_4way_hash; -/* -#else - gate->optimizations = SHA_OPT; - gate->scanhash = (void*)&scanhash_sha256t; - gate->hash = (void*)&sha256t_hash; -*/ #endif - gate->optimizations = SSE2_OPT | AVX2_OPT; return true; } bool 
register_sha256q_algo( algo_gate_t* gate ) { -#if defined(SHA256T_8WAY) + gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT; +#if defined(SHA256T_16WAY) + gate->scanhash = (void*)&scanhash_sha256q_16way; + gate->hash = (void*)&sha256q_16way_hash; +#elif defined(__SHA__) + gate->optimizations = SHA_OPT; + gate->scanhash = (void*)&scanhash_sha256q; + gate->hash = (void*)&sha256q_hash; +#elif defined(SHA256T_8WAY) gate->scanhash = (void*)&scanhash_sha256q_8way; gate->hash = (void*)&sha256q_8way_hash; #else gate->scanhash = (void*)&scanhash_sha256q_4way; gate->hash = (void*)&sha256q_4way_hash; -/* -#else - gate->optimizations = SHA_OPT; - gate->scanhash = (void*)&scanhash_sha256q; - gate->hash = (void*)&sha256q_hash; -*/ #endif - gate->optimizations = SSE2_OPT | AVX2_OPT; return true; - } diff --git a/algo/sha/sha256t-gate.h b/algo/sha/sha256t-gate.h index cb06f5a..46266f2 100644 --- a/algo/sha/sha256t-gate.h +++ b/algo/sha/sha256t-gate.h @@ -4,15 +4,27 @@ #include #include "algo-gate-api.h" -#if defined(__AVX2__) - #define SHA256T_8WAY +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define SHA256T_16WAY 1 +#elif defined(__AVX2__) + #define SHA256T_8WAY 1 #else - #define SHA256T_4WAY + #define SHA256T_4WAY 1 #endif bool register_sha256t_algo( algo_gate_t* gate ); bool register_sha256q_algo( algo_gate_t* gate ); +#if defined(SHA256T_16WAY) + +void sha256t_16way_hash( void *output, const void *input ); +int scanhash_sha256t_16way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +void sha256q_16way_hash( void *output, const void *input ); +int scanhash_sha256q_16way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +#endif + #if defined(SHA256T_8WAY) void sha256t_8way_hash( void *output, const void *input ); @@ -33,13 +45,13 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info 
*mythr ); #endif -/* -void sha256t_hash( void *output, const void *input ); + +int sha256t_hash( void *output, const void *input ); int scanhash_sha256t( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); -void sha256q_hash( void *output, const void *input ); +int sha256q_hash( void *output, const void *input ); int scanhash_sha256q( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); -*/ + #endif diff --git a/algo/sha/sha256t.c b/algo/sha/sha256t.c index 19f47d6..bd4edf0 100644 --- a/algo/sha/sha256t.c +++ b/algo/sha/sha256t.c @@ -1,107 +1,73 @@ #include "sha256t-gate.h" - -// Obsolete - -#if !defined(SHA256T_16WAY) && !defined(SHA256T_8WAY) && !defined(SHA256T_4WAY) - #include #include #include #include -#include +#include "algo/sha/sph_sha2.h" -static __thread SHA256_CTX sha256t_ctx __attribute__ ((aligned (64))); +// Only used on CPUs with SHA + +static __thread sph_sha256_context sha256t_ctx __attribute__ ((aligned (64))); void sha256t_midstate( const void* input ) { - SHA256_Init( &sha256t_ctx ); - SHA256_Update( &sha256t_ctx, input, 64 ); + sph_sha256_init( &sha256t_ctx ); + sph_sha256( &sha256t_ctx, input, 64 ); } -void sha256t_hash( void* output, const void* input ) +int sha256t_hash( void* output, const void* input ) { uint32_t _ALIGN(64) hash[16]; const int midlen = 64; // bytes const int tail = 80 - midlen; // 16 - SHA256_CTX ctx __attribute__ ((aligned (64))); + sph_sha256_context ctx __attribute__ ((aligned (64))); memcpy( &ctx, &sha256t_ctx, sizeof sha256t_ctx ); - SHA256_Update( &ctx, input + midlen, tail ); - SHA256_Final( (unsigned char*)hash, &ctx ); + sph_sha256( &ctx, input + midlen, tail ); + sph_sha256_close( &ctx, hash ); - SHA256_Init( &ctx ); - SHA256_Update( &ctx, hash, 32 ); - SHA256_Final( (unsigned char*)hash, &ctx ); + sph_sha256_init( &ctx ); + sph_sha256( &ctx, hash, 32 ); + sph_sha256_close( &ctx, hash ); - SHA256_Init( &ctx ); - SHA256_Update( &ctx, hash, 32 
); - SHA256_Final( (unsigned char*)hash, &ctx ); + sph_sha256_init( &ctx ); + sph_sha256( &ctx, hash, 32 ); + sph_sha256_close( &ctx, output ); - memcpy( output, hash, 32 ); + return 1; } int scanhash_sha256t( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { + uint32_t edata[20] __attribute__((aligned(64))); + uint32_t hash[8] __attribute__((aligned(64))); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; - uint32_t n = pdata[19] - 1; const uint32_t first_nonce = pdata[19]; - const uint32_t Htarg = ptarget[7]; -#ifdef _MSC_VER - uint32_t __declspec(align(32)) hash64[8]; -#else - uint32_t hash64[8] __attribute__((aligned(32))); -#endif - uint32_t endiandata[32]; - int thr_id = mythr->id; // thr_id arg is deprecated + const uint32_t last_nonce = max_nonce - 1; + uint32_t n = first_nonce; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; - uint64_t htmax[] = { - 0, - 0xF, - 0xFF, - 0xFFF, - 0xFFFF, - 0x10000000 - }; - uint32_t masks[] = { - 0xFFFFFFFF, - 0xFFFFFFF0, - 0xFFFFFF00, - 0xFFFFF000, - 0xFFFF0000, - 0 - }; + mm128_bswap32_80( edata, pdata ); + sha256t_midstate( edata ); - // we need bigendian data... 
- casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) ); - casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) ); - casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) ); - casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) ); - casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); - - sha256t_midstate( endiandata ); - - for ( int m = 0; m < 6; m++ ) + do { - if ( Htarg <= htmax[m] ) + edata[19] = n; + if ( likely( sha256t_hash( hash, edata ) ) ) + if ( unlikely( valid_hash( hash, ptarget ) && !bench ) ) { - uint32_t mask = masks[m]; - do { - pdata[19] = ++n; - be32enc(&endiandata[19], n); - sha256t_hash( hash64, endiandata ); - if ( !(hash64[7] & mask) ) - if ( fulltest( hash64, ptarget ) && !opt_benchmark ) - submit_solution( work, hash64, mythr ); - } while ( n < max_nonce && !work_restart[thr_id].restart ); - break; + pdata[19] = bswap_32( n ); + submit_solution( work, hash, mythr ); } - } - *hashes_done = n - first_nonce + 1; + n++; + } while ( n < last_nonce && !work_restart[thr_id].restart ); + *hashes_done = n - first_nonce; pdata[19] = n; return 0; } -#endif + diff --git a/algo/sha/sph_sha2.c b/algo/sha/sph_sha2.c index 1eb225d..e87936d 100644 --- a/algo/sha/sph_sha2.c +++ b/algo/sha/sph_sha2.c @@ -71,7 +71,11 @@ static const sph_u32 H256[8] = { * of the compression function. 
*/ -#if SPH_SMALL_FOOTPRINT_SHA2 +#if defined(__SHA__) + +#include "sha256-hash-opt.c" + +#else // no SHA static const sph_u32 K[64] = { SPH_C32(0x428A2F98), SPH_C32(0x71374491), @@ -108,6 +112,8 @@ static const sph_u32 K[64] = { SPH_C32(0xBEF9A3F7), SPH_C32(0xC67178F2) }; +#if SPH_SMALL_FOOTPRINT_SHA2 + #define SHA2_MEXP1(in, pc) do { \ W[pc] = in(pc); \ } while (0) @@ -191,7 +197,7 @@ static const sph_u32 K[64] = { (r)[7] = SPH_T32((r)[7] + H); \ } while (0) -#else +#else // large footprint (default) #define SHA2_ROUND_BODY(in, r) do { \ sph_u32 A, B, C, D, E, F, G, H, T1, T2; \ @@ -600,7 +606,7 @@ static const sph_u32 K[64] = { (r)[7] = SPH_T32((r)[7] + H); \ } while (0) -#endif +#endif // small footprint else /* * One round of SHA-224 / SHA-256. The data must be aligned for 32-bit access. @@ -613,6 +619,9 @@ sha2_round(const unsigned char *data, sph_u32 r[8]) #undef SHA2_IN } +#endif // SHA else + + /* see sph_sha2.h */ void sph_sha224_init(void *cc) @@ -653,7 +662,7 @@ void sph_sha224_close(void *cc, void *dst) { sha224_close(cc, dst, 7); - sph_sha224_init(cc); +// sph_sha224_init(cc); } /* see sph_sha2.h */ @@ -661,7 +670,7 @@ void sph_sha224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) { sha224_addbits_and_close(cc, ub, n, dst, 7); - sph_sha224_init(cc); +// sph_sha224_init(cc); } /* see sph_sha2.h */ @@ -677,14 +686,14 @@ void sph_sha256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) { sha224_addbits_and_close(cc, ub, n, dst, 8); - sph_sha256_init(cc); +// sph_sha256_init(cc); } /* see sph_sha2.h */ -void -sph_sha224_comp(const sph_u32 msg[16], sph_u32 val[8]) -{ -#define SHA2_IN(x) msg[x] - SHA2_ROUND_BODY(SHA2_IN, val); -#undef SHA2_IN -} +//void +//sph_sha224_comp(const sph_u32 msg[16], sph_u32 val[8]) +//{ +//#define SHA2_IN(x) msg[x] +// SHA2_ROUND_BODY(SHA2_IN, val); +//#undef SHA2_IN +//} diff --git a/algo/sha/sph_sha2.h b/algo/sha/sph_sha2.h index d5bda73..df0e836 100644 --- a/algo/sha/sph_sha2.h +++ 
b/algo/sha/sph_sha2.h @@ -73,7 +73,7 @@ typedef struct { sph_u32 count_high, count_low; #endif #endif -} sph_sha224_context; +} sph_sha224_context __attribute__((aligned(64))); /** * This structure is a context for SHA-256 computations. It is identical diff --git a/algo/skein/skein-4way.c b/algo/skein/skein-4way.c index 0c1a7df..a12af43 100644 --- a/algo/skein/skein-4way.c +++ b/algo/skein/skein-4way.c @@ -2,14 +2,8 @@ #include #include #include "skein-hash-4way.h" - -// 8 way is faster than SHA on Icelake -// SHA is faster than 4 way on Ryzen -// -#if defined(__SHA__) - #include -#endif #include "algo/sha/sha-hash-4way.h" +#include "algo/sha/sph_sha2.h" #if defined (SKEIN_8WAY) @@ -93,7 +87,7 @@ void skeinhash_4way( void *state, const void *input ) uint32_t hash1[16] __attribute__ ((aligned (64))); uint32_t hash2[16] __attribute__ ((aligned (64))); uint32_t hash3[16] __attribute__ ((aligned (64))); - SHA256_CTX ctx_sha256; + sph_sha256_context ctx_sha256; #else uint32_t vhash32[16*4] __attribute__ ((aligned (64))); sha256_4way_context ctx_sha256; @@ -102,31 +96,29 @@ void skeinhash_4way( void *state, const void *input ) skein512_4way_final16( &ctx_skein, vhash64, input + (64*4) ); #if defined(__SHA__) + dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 512 ); - - SHA256_Init( &ctx_sha256 ); - SHA256_Update( &ctx_sha256, (unsigned char*)hash0, 64 ); - SHA256_Final( (unsigned char*)hash0, &ctx_sha256 ); - - SHA256_Init( &ctx_sha256 ); - SHA256_Update( &ctx_sha256, (unsigned char*)hash1, 64 ); - SHA256_Final( (unsigned char*)hash1, &ctx_sha256 ); - - SHA256_Init( &ctx_sha256 ); - SHA256_Update( &ctx_sha256, (unsigned char*)hash2, 64 ); - SHA256_Final( (unsigned char*)hash2, &ctx_sha256 ); - - SHA256_Init( &ctx_sha256 ); - SHA256_Update( &ctx_sha256, (unsigned char*)hash3, 64 ); - SHA256_Final( (unsigned char*)hash3, &ctx_sha256 ); - + sph_sha256_init( &ctx_sha256 ); + sph_sha256( &ctx_sha256, hash0, 64 ); + sph_sha256_close( &ctx_sha256, hash0 ); + sph_sha256_init( 
&ctx_sha256 ); + sph_sha256( &ctx_sha256, hash1, 64 ); + sph_sha256_close( &ctx_sha256, hash1 ); + sph_sha256_init( &ctx_sha256 ); + sph_sha256( &ctx_sha256, hash2, 64 ); + sph_sha256_close( &ctx_sha256, hash2 ); + sph_sha256_init( &ctx_sha256 ); + sph_sha256( &ctx_sha256, hash3, 64 ); + sph_sha256_close( &ctx_sha256, hash3 ); intrlv_4x32( state, hash0, hash1, hash2, hash3, 256 ); -#else - rintrlv_4x64_4x32( vhash32, vhash64, 512 ); +#else + + rintrlv_4x64_4x32( vhash32, vhash64, 512 ); sha256_4way_init( &ctx_sha256 ); sha256_4way_update( &ctx_sha256, vhash32, 64 ); sha256_4way_close( &ctx_sha256, state ); + #endif } diff --git a/algo/skein/skein.c b/algo/skein/skein.c index dba2ca0..91eb325 100644 --- a/algo/skein/skein.c +++ b/algo/skein/skein.c @@ -5,21 +5,21 @@ #include #include #include "sph_skein.h" -#include +#include "algo/sha/sph_sha2.h" void skeinhash(void *state, const void *input) { uint32_t hash[16] __attribute__ ((aligned (64))); sph_skein512_context ctx_skein; - SHA256_CTX ctx_sha256; + sph_sha256_context ctx_sha256; sph_skein512_init( &ctx_skein ); sph_skein512( &ctx_skein, input, 80 ); sph_skein512_close( &ctx_skein, hash ); - SHA256_Init( &ctx_sha256 ); - SHA256_Update( &ctx_sha256, (unsigned char*)hash, 64 ); - SHA256_Final( (unsigned char*) hash, &ctx_sha256 ); + sph_sha256_init( &ctx_sha256 ); + sph_sha256( &ctx_sha256, hash, 64 ); + sph_sha256_close( &ctx_sha256, hash ); memcpy(state, hash, 32); } diff --git a/algo/x16/hex.c b/algo/x16/hex.c index ada1ca7..5d064d2 100644 --- a/algo/x16/hex.c +++ b/algo/x16/hex.c @@ -161,9 +161,9 @@ int hex_hash( void* output, const void* input, int thrid ) sph_whirlpool512_full( &ctx.whirlpool, hash, in, size ); break; case SHA_512: - SHA512_Init( &ctx.sha512 ); - SHA512_Update( &ctx.sha512, in, size ); - SHA512_Final( (unsigned char*) hash, &ctx.sha512 ); + sph_sha512_init( &ctx.sha512 ); + sph_sha512( &ctx.sha512, in, size ); + sph_sha512_close( &ctx.sha512, hash ); break; } diff --git a/algo/x16/minotaur.c 
b/algo/x16/minotaur.c index ed53242..c607c52 100644 --- a/algo/x16/minotaur.c +++ b/algo/x16/minotaur.c @@ -7,7 +7,6 @@ #include #include "algo/blake/sph_blake.h" #include "algo/bmw/sph_bmw.h" -//#include "algo/jh/jh-hash-sse2.h" #include "algo/jh/sph_jh.h" #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" @@ -18,7 +17,7 @@ #include "algo/hamsi/sph_hamsi.h" #include "algo/shabal/sph_shabal.h" #include "algo/whirlpool/sph_whirlpool.h" -#include +#include "algo/sha/sph_sha2.h" #if defined(__AES__) #include "algo/echo/aes_ni/hash_api.h" #include "algo/groestl/aes_ni/hash-groestl.h" @@ -50,7 +49,6 @@ struct TortureGarden sph_blake512_context blake; sph_bmw512_context bmw; sph_skein512_context skein; -// jh512_sse2_hashState jh; sph_jh512_context jh; sph_keccak512_context keccak; hashState_luffa luffa; @@ -60,7 +58,7 @@ struct TortureGarden sph_hamsi512_context hamsi; sph_shabal512_context shabal; sph_whirlpool_context whirlpool; - SHA512_CTX sha512; + sph_sha512_context sha512; struct TortureNode { unsigned int algo; @@ -122,12 +120,11 @@ static void get_hash( void *output, const void *input, TortureGarden *garden, sph_hamsi512_close(&garden->hamsi, hash); break; case 7: - SHA512_Init( &garden->sha512 ); - SHA512_Update( &garden->sha512, input, 64 ); - SHA512_Final( (unsigned char*)hash, &garden->sha512 ); + sph_sha512_init( &garden->sha512 ); + sph_sha512( &garden->sha512, input, 64 ); + sph_sha512_close( &garden->sha512, hash ); break; case 8: -// jh512_sse2_full( &garden->jh, hash, input, 64 ); sph_jh512_init(&garden->jh); sph_jh512(&garden->jh, input, 64); sph_jh512_close(&garden->jh, hash); @@ -232,9 +229,9 @@ int minotaur_hash( void *output, const void *input, int thr_id ) unsigned char hash[64] __attribute__ ((aligned (64))); // Find initial sha512 hash - SHA512_Init( &garden.sha512 ); - SHA512_Update( &garden.sha512, input, 80 ); - SHA512_Final( (unsigned char*) hash, &garden.sha512 ); + sph_sha512_init( &garden.sha512 ); + sph_sha512( 
&garden.sha512, input, 80 ); + sph_sha512_close( &garden.sha512, hash ); // algo 6 (Hamsi) is very slow. It's faster to skip hashing this nonce // if Hamsi is needed but only the first and last functions are diff --git a/algo/x16/x16r-gate.h b/algo/x16/x16r-gate.h index ed93599..748b7fa 100644 --- a/algo/x16/x16r-gate.h +++ b/algo/x16/x16r-gate.h @@ -20,13 +20,16 @@ #include "algo/fugue/sph_fugue.h" #include "algo/shabal/sph_shabal.h" #include "algo/whirlpool/sph_whirlpool.h" -#include +#include "algo/sha/sph_sha2.h" + #if defined(__AES__) - #include "algo/echo/aes_ni/hash_api.h" - #include "algo/groestl/aes_ni/hash-groestl.h" - #include "algo/fugue/fugue-aesni.h" +#include "algo/echo/aes_ni/hash_api.h" +#include "algo/groestl/aes_ni/hash-groestl.h" +#include "algo/fugue/fugue-aesni.h" #endif + #if defined (__AVX2__) + #include "algo/blake/blake-hash-4way.h" #include "algo/bmw/bmw-hash-4way.h" #include "algo/groestl/aes_ni/hash-groestl.h" @@ -39,12 +42,14 @@ #include "algo/hamsi/hamsi-hash-4way.h" #include "algo/shabal/shabal-hash-4way.h" #include "algo/sha/sha-hash-4way.h" + #if defined(__VAES__) - #include "algo/groestl/groestl512-hash-4way.h" - #include "algo/shavite/shavite-hash-2way.h" - #include "algo/shavite/shavite-hash-4way.h" - #include "algo/echo/echo-hash-4way.h" +#include "algo/groestl/groestl512-hash-4way.h" +#include "algo/shavite/shavite-hash-2way.h" +#include "algo/shavite/shavite-hash-4way.h" +#include "algo/echo/echo-hash-4way.h" #endif + #endif // AVX2 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) @@ -206,7 +211,7 @@ union _x16r_context_overlay sph_hamsi512_context hamsi; sph_shabal512_context shabal; sph_whirlpool_context whirlpool; - SHA512_CTX sha512; + sph_sha512_context sha512; } __attribute__ ((aligned (64))); typedef union _x16r_context_overlay x16r_context_overlay; diff --git a/algo/x16/x16r.c b/algo/x16/x16r.c index b8cab34..f9ad45e 100644 --- a/algo/x16/x16r.c +++ b/algo/x16/x16r.c 
@@ -177,9 +177,9 @@ int x16r_hash_generic( void* output, const void* input, int thrid ) sph_whirlpool512_full( &ctx.whirlpool, hash, in, size ); break; case SHA_512: - SHA512_Init( &ctx.sha512 ); - SHA512_Update( &ctx.sha512, in, size ); - SHA512_Final( (unsigned char*) hash, &ctx.sha512 ); + sph_sha512_init( &ctx.sha512 ); + sph_sha512( &ctx.sha512, in, size ); + sph_sha512_close( &ctx.sha512, hash ); break; } diff --git a/algo/x16/x16rv2.c b/algo/x16/x16rv2.c index 6210932..4173afc 100644 --- a/algo/x16/x16rv2.c +++ b/algo/x16/x16rv2.c @@ -33,7 +33,7 @@ union _x16rv2_context_overlay sph_hamsi512_context hamsi; sph_shabal512_context shabal; sph_whirlpool_context whirlpool; - SHA512_CTX sha512; + sph_sha512_context sha512; sph_tiger_context tiger; }; typedef union _x16rv2_context_overlay x16rv2_context_overlay; @@ -155,9 +155,9 @@ int x16rv2_hash( void* output, const void* input, int thrid ) sph_tiger( &ctx.tiger, in, size ); sph_tiger_close( &ctx.tiger, hash ); padtiger512( hash ); - SHA512_Init( &ctx.sha512 ); - SHA512_Update( &ctx.sha512, hash, 64 ); - SHA512_Final( (unsigned char*) hash, &ctx.sha512 ); + sph_sha512_init( &ctx.sha512 ); + sph_sha512( &ctx.sha512, hash, 64 ); + sph_sha512_close( &ctx.sha512, hash ); break; } diff --git a/algo/x16/x21s-4way.c b/algo/x16/x21s-4way.c index 262971f..de2dbe6 100644 --- a/algo/x16/x21s-4way.c +++ b/algo/x16/x21s-4way.c @@ -13,7 +13,7 @@ #include "algo/gost/sph_gost.h" #include "algo/lyra2/lyra2.h" #if defined(__SHA__) - #include + #include "algo/sha/sph_sha2.h" #endif #if defined (X21S_8WAY) @@ -209,7 +209,7 @@ union _x21s_4way_context_overlay sph_tiger_context tiger; sph_gost512_context gost; #if defined(__SHA__) - SHA256_CTX sha256; + sph_sha256_context sha256; #else sha256_4way_context sha256; #endif @@ -275,23 +275,18 @@ int x21s_4way_hash( void* output, const void* input, int thrid ) #if defined(__SHA__) - SHA256_Init( &ctx.sha256 ); - SHA256_Update( &ctx.sha256, hash0, 64 ); - SHA256_Final( (unsigned char*)hash0, 
&ctx.sha256 ); - SHA256_Init( &ctx.sha256 ); - SHA256_Update( &ctx.sha256, hash1, 64 ); - SHA256_Final( (unsigned char*)hash1, &ctx.sha256 ); - SHA256_Init( &ctx.sha256 ); - SHA256_Update( &ctx.sha256, hash2, 64 ); - SHA256_Final( (unsigned char*)hash2, &ctx.sha256 ); - SHA256_Init( &ctx.sha256 ); - SHA256_Update( &ctx.sha256, hash3, 64 ); - SHA256_Final( (unsigned char*)hash3, &ctx.sha256 ); - - memcpy( output, hash0, 32 ); - memcpy( output+32, hash1, 32 ); - memcpy( output+64, hash2, 32 ); - memcpy( output+96, hash3, 32 ); + sph_sha256_init( &ctx.sha256 ); + sph_sha256( &ctx.sha256, hash0, 64 ); + sph_sha256_close( &ctx.sha256, output ); + sph_sha256_init( &ctx.sha256 ); + sph_sha256( &ctx.sha256, hash1, 64 ); + sph_sha256_close( &ctx.sha256, output+32 ); + sph_sha256_init( &ctx.sha256 ); + sph_sha256( &ctx.sha256, hash2, 64 ); + sph_sha256_close( &ctx.sha256, output+64 ); + sph_sha256_init( &ctx.sha256 ); + sph_sha256( &ctx.sha256, hash3, 64 ); + sph_sha256_close( &ctx.sha256, output+96 ); #else diff --git a/algo/x16/x21s.c b/algo/x16/x21s.c index 570b8a2..b81c07e 100644 --- a/algo/x16/x21s.c +++ b/algo/x16/x21s.c @@ -8,7 +8,7 @@ #include #include #include -#include +#include "algo/sha/sph_sha2.h" #include "algo/haval/sph-haval.h" #include "algo/tiger/sph_tiger.h" #include "algo/gost/sph_gost.h" @@ -23,7 +23,7 @@ union _x21s_context_overlay sph_haval256_5_context haval; sph_tiger_context tiger; sph_gost512_context gost; - SHA256_CTX sha256; + sph_sha256_context sha256; }; typedef union _x21s_context_overlay x21s_context_overlay; @@ -50,9 +50,9 @@ int x21s_hash( void* output, const void* input, int thrid ) sph_gost512 ( &ctx.gost, (const void*) hash, 64 ); sph_gost512_close( &ctx.gost, (void*) hash ); - SHA256_Init( &ctx.sha256 ); - SHA256_Update( &ctx.sha256, hash, 64 ); - SHA256_Final( (unsigned char*)hash, &ctx.sha256 ); + sph_sha256_init( &ctx.sha256 ); + sph_sha256( &ctx.sha256, hash, 64 ); + sph_sha256_close( &ctx.sha256, hash ); memcpy( output, hash, 32 ); 
diff --git a/algo/x17/sonoa.c b/algo/x17/sonoa.c index 4253ad8..d9fede2 100644 --- a/algo/x17/sonoa.c +++ b/algo/x17/sonoa.c @@ -20,7 +20,7 @@ #include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" -#include +#include "algo/sha/sph_sha2.h" #if defined(__AES__) #include "algo/echo/aes_ni/hash_api.h" #include "algo/groestl/aes_ni/hash-groestl.h" @@ -53,7 +53,7 @@ typedef struct { sph_hamsi512_context hamsi; sph_shabal512_context shabal; sph_whirlpool_context whirlpool; - SHA512_CTX sha512; + sph_sha512_context sha512; sph_haval256_5_context haval; } sonoa_ctx_holder; @@ -82,7 +82,7 @@ void init_sonoa_ctx() sph_hamsi512_init( &sonoa_ctx.hamsi ); sph_shabal512_init( &sonoa_ctx.shabal ); sph_whirlpool_init( &sonoa_ctx.whirlpool ); - SHA512_Init( &sonoa_ctx.sha512 ); + sph_sha512_init( &sonoa_ctx.sha512 ); sph_haval256_5_init(&sonoa_ctx.haval); }; @@ -494,8 +494,8 @@ int sonoa_hash( void *state, const void *input, int thr_id ) sph_whirlpool(&ctx.whirlpool, hash, 64); sph_whirlpool_close(&ctx.whirlpool, hash); - SHA512_Update( &ctx.sha512, hash, 64 ); - SHA512_Final( (unsigned char*) hash, &ctx.sha512 ); + sph_sha512( &ctx.sha512, hash, 64 ); + sph_sha512_close( &ctx.sha512, hash ); sph_whirlpool_init( &ctx.whirlpool ); sph_whirlpool(&ctx.whirlpool, hash, 64); @@ -574,9 +574,9 @@ int sonoa_hash( void *state, const void *input, int thr_id ) sph_whirlpool(&ctx.whirlpool, hash, 64); sph_whirlpool_close(&ctx.whirlpool, hash); - SHA512_Init( &ctx.sha512 ); - SHA512_Update( &ctx.sha512, hash, 64 ); - SHA512_Final( (unsigned char*) hash, &ctx.sha512 ); + sph_sha512_init( &ctx.sha512 ); + sph_sha512( &ctx.sha512, hash, 64 ); + sph_sha512_close( &ctx.sha512, hash ); sph_haval256_5(&ctx.haval,(const void*) hash, 64); sph_haval256_5_close(&ctx.haval, hash); diff --git a/algo/x17/x17.c b/algo/x17/x17.c index e6a9a06..f01b349 100644 --- a/algo/x17/x17.c +++ b/algo/x17/x17.c @@ -19,7 +19,7 @@ #include "algo/luffa/luffa_for_sse2.h" 
#include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" -#include +#include "algo/sha/sph_sha2.h" #if defined(__AES__) #include "algo/fugue/fugue-aesni.h" #include "algo/echo/aes_ni/hash_api.h" @@ -53,7 +53,7 @@ union _x17_context_overlay sph_hamsi512_context hamsi; sph_shabal512_context shabal; sph_whirlpool_context whirlpool; - SHA512_CTX sha512; + sph_sha512_context sha512; sph_haval256_5_context haval; }; typedef union _x17_context_overlay x17_context_overlay; @@ -140,9 +140,9 @@ int x17_hash(void *output, const void *input, int thr_id ) sph_whirlpool( &ctx.whirlpool, hash, 64 ); sph_whirlpool_close( &ctx.whirlpool, hash ); - SHA512_Init( &ctx.sha512 ); - SHA512_Update( &ctx.sha512, hash, 64 ); - SHA512_Final( (unsigned char*)hash, &ctx.sha512 ); + sph_sha512_init( &ctx.sha512 ); + sph_sha512( &ctx.sha512, hash, 64 ); + sph_sha512_close( &ctx.sha512, hash ); sph_haval256_5_init(&ctx.haval); sph_haval256_5( &ctx.haval, (const void*)hash, 64 ); diff --git a/algo/x17/xevan.c b/algo/x17/xevan.c index 08ed580..470add1 100644 --- a/algo/x17/xevan.c +++ b/algo/x17/xevan.c @@ -20,7 +20,7 @@ #include "algo/haval/sph-haval.h" #include "algo/simd/nist.h" #include "algo/cubehash/cubehash_sse2.h" -#include +#include "algo/sha/sph_sha2.h" #if defined(__AES__) #include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/echo/aes_ni/hash_api.h" @@ -44,7 +44,7 @@ typedef struct { sph_hamsi512_context hamsi; sph_shabal512_context shabal; sph_whirlpool_context whirlpool; - SHA512_CTX sha512; + sph_sha512_context sha512; sph_haval256_5_context haval; #if defined(__AES__) hashState_echo echo; @@ -73,7 +73,7 @@ void init_xevan_ctx() sph_hamsi512_init( &xevan_ctx.hamsi ); sph_shabal512_init( &xevan_ctx.shabal ); sph_whirlpool_init( &xevan_ctx.whirlpool ); - SHA512_Init( &xevan_ctx.sha512 ); + sph_sha512_init( &xevan_ctx.sha512 ); sph_haval256_5_init(&xevan_ctx.haval); #if defined(__AES__) init_groestl( &xevan_ctx.groestl, 64 ); @@ -95,97 +95,27 @@ int xevan_hash(void 
*output, const void *input, int thr_id ) sph_blake512( &ctx.blake, input, 80 ); sph_blake512_close( &ctx.blake, hash ); - memset(&hash[16], 0, 64); + memset(&hash[16], 0, 64); - sph_bmw512(&ctx.bmw, hash, dataLen); - sph_bmw512_close(&ctx.bmw, hash); + sph_bmw512(&ctx.bmw, hash, dataLen); + sph_bmw512_close(&ctx.bmw, hash); #if defined(__AES__) update_and_final_groestl( &ctx.groestl, (char*)hash, (const char*)hash, dataLen*8 ); #else - sph_groestl512(&ctx.groestl, hash, dataLen); - sph_groestl512_close(&ctx.groestl, hash); -#endif - - sph_skein512(&ctx.skein, hash, dataLen); - sph_skein512_close(&ctx.skein, hash); - - sph_jh512(&ctx.jh, hash, dataLen); - sph_jh512_close(&ctx.jh, hash); - - sph_keccak512(&ctx.keccak, hash, dataLen); - sph_keccak512_close(&ctx.keccak, hash); - - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, - (const BitSequence*)hash, dataLen ); - - cubehashUpdateDigest( &ctx.cubehash, (byte*)hash, - (const byte*) hash, dataLen ); - - sph_shavite512(&ctx.shavite, hash, dataLen); - sph_shavite512_close(&ctx.shavite, hash); - - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)hash, dataLen*8 ); - -#if defined(__AES__) - update_final_echo( &ctx.echo, (BitSequence *) hash, - (const BitSequence *) hash, dataLen*8 ); -#else - sph_echo512(&ctx.echo, hash, dataLen); - sph_echo512_close(&ctx.echo, hash); -#endif - - sph_hamsi512(&ctx.hamsi, hash, dataLen); - sph_hamsi512_close(&ctx.hamsi, hash); - -#if defined(__AES__) - fugue512_Update( &ctx.fugue, hash, dataLen*8 ); - fugue512_Final( &ctx.fugue, hash ); -#else - sph_fugue512(&ctx.fugue, hash, dataLen); - sph_fugue512_close(&ctx.fugue, hash); -#endif - - sph_shabal512(&ctx.shabal, hash, dataLen); - sph_shabal512_close(&ctx.shabal, hash); - - sph_whirlpool(&ctx.whirlpool, hash, dataLen); - sph_whirlpool_close(&ctx.whirlpool, hash); - - SHA512_Update( &ctx.sha512, hash, dataLen ); - SHA512_Final( (unsigned char*) hash, &ctx.sha512 ); - - sph_haval256_5(&ctx.haval,(const void*) 
hash, dataLen); - sph_haval256_5_close(&ctx.haval, hash); - - memset(&hash[8], 0, dataLen - 32); - - memcpy( &ctx, &xevan_ctx, sizeof(xevan_ctx) ); - - sph_blake512(&ctx.blake, hash, dataLen); - sph_blake512_close(&ctx.blake, hash); - - sph_bmw512(&ctx.bmw, hash, dataLen); - sph_bmw512_close(&ctx.bmw, hash); - -#if defined(__AES__) - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const BitSequence*)hash, dataLen*8 ); -#else - sph_groestl512(&ctx.groestl, hash, dataLen); + sph_groestl512(&ctx.groestl, hash, dataLen); sph_groestl512_close(&ctx.groestl, hash); #endif - sph_skein512(&ctx.skein, hash, dataLen); - sph_skein512_close(&ctx.skein, hash); + sph_skein512(&ctx.skein, hash, dataLen); + sph_skein512_close(&ctx.skein, hash); - sph_jh512(&ctx.jh, hash, dataLen); - sph_jh512_close(&ctx.jh, hash); + sph_jh512(&ctx.jh, hash, dataLen); + sph_jh512_close(&ctx.jh, hash); - sph_keccak512(&ctx.keccak, hash, dataLen); - sph_keccak512_close(&ctx.keccak, hash); + sph_keccak512(&ctx.keccak, hash, dataLen); + sph_keccak512_close(&ctx.keccak, hash); update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)hash, dataLen ); @@ -193,8 +123,8 @@ int xevan_hash(void *output, const void *input, int thr_id ) cubehashUpdateDigest( &ctx.cubehash, (byte*)hash, (const byte*) hash, dataLen ); - sph_shavite512(&ctx.shavite, hash, dataLen); - sph_shavite512_close(&ctx.shavite, hash); + sph_shavite512(&ctx.shavite, hash, dataLen); + sph_shavite512_close(&ctx.shavite, hash); update_final_sd( &ctx.simd, (BitSequence *)hash, (const BitSequence *)hash, dataLen*8 ); @@ -207,30 +137,100 @@ int xevan_hash(void *output, const void *input, int thr_id ) sph_echo512_close(&ctx.echo, hash); #endif - sph_hamsi512(&ctx.hamsi, hash, dataLen); - sph_hamsi512_close(&ctx.hamsi, hash); + sph_hamsi512(&ctx.hamsi, hash, dataLen); + sph_hamsi512_close(&ctx.hamsi, hash); #if defined(__AES__) - fugue512_Update( &ctx.fugue, hash, dataLen*8 ); - fugue512_Final( &ctx.fugue, hash ); + 
fugue512_Update( &ctx.fugue, hash, dataLen*8 ); + fugue512_Final( &ctx.fugue, hash ); #else - sph_fugue512(&ctx.fugue, hash, dataLen); - sph_fugue512_close(&ctx.fugue, hash); + sph_fugue512(&ctx.fugue, hash, dataLen); + sph_fugue512_close(&ctx.fugue, hash); #endif - sph_shabal512(&ctx.shabal, hash, dataLen); - sph_shabal512_close(&ctx.shabal, hash); + sph_shabal512(&ctx.shabal, hash, dataLen); + sph_shabal512_close(&ctx.shabal, hash); - sph_whirlpool(&ctx.whirlpool, hash, dataLen); - sph_whirlpool_close(&ctx.whirlpool, hash); + sph_whirlpool(&ctx.whirlpool, hash, dataLen); + sph_whirlpool_close(&ctx.whirlpool, hash); - SHA512_Update( &ctx.sha512, hash, dataLen ); - SHA512_Final( (unsigned char*) hash, &ctx.sha512 ); + sph_sha512( &ctx.sha512, hash, dataLen ); + sph_sha512_close( &ctx.sha512, hash ); - sph_haval256_5(&ctx.haval,(const void*) hash, dataLen); - sph_haval256_5_close(&ctx.haval, hash); + sph_haval256_5(&ctx.haval,(const void*) hash, dataLen); + sph_haval256_5_close(&ctx.haval, hash); - memcpy(output, hash, 32); + memset(&hash[8], 0, dataLen - 32); + + memcpy( &ctx, &xevan_ctx, sizeof(xevan_ctx) ); + + sph_blake512(&ctx.blake, hash, dataLen); + sph_blake512_close(&ctx.blake, hash); + + sph_bmw512(&ctx.bmw, hash, dataLen); + sph_bmw512_close(&ctx.bmw, hash); + +#if defined(__AES__) + update_and_final_groestl( &ctx.groestl, (char*)hash, + (const BitSequence*)hash, dataLen*8 ); +#else + sph_groestl512(&ctx.groestl, hash, dataLen); + sph_groestl512_close(&ctx.groestl, hash); +#endif + + sph_skein512(&ctx.skein, hash, dataLen); + sph_skein512_close(&ctx.skein, hash); + + sph_jh512(&ctx.jh, hash, dataLen); + sph_jh512_close(&ctx.jh, hash); + + sph_keccak512(&ctx.keccak, hash, dataLen); + sph_keccak512_close(&ctx.keccak, hash); + + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, + (const BitSequence*)hash, dataLen ); + + cubehashUpdateDigest( &ctx.cubehash, (byte*)hash, + (const byte*) hash, dataLen ); + + sph_shavite512(&ctx.shavite, hash, dataLen); + 
sph_shavite512_close(&ctx.shavite, hash); + + update_final_sd( &ctx.simd, (BitSequence *)hash, + (const BitSequence *)hash, dataLen*8 ); + +#if defined(__AES__) + update_final_echo( &ctx.echo, (BitSequence *) hash, + (const BitSequence *) hash, dataLen*8 ); +#else + sph_echo512(&ctx.echo, hash, dataLen); + sph_echo512_close(&ctx.echo, hash); +#endif + + sph_hamsi512(&ctx.hamsi, hash, dataLen); + sph_hamsi512_close(&ctx.hamsi, hash); + +#if defined(__AES__) + fugue512_Update( &ctx.fugue, hash, dataLen*8 ); + fugue512_Final( &ctx.fugue, hash ); +#else + sph_fugue512(&ctx.fugue, hash, dataLen); + sph_fugue512_close(&ctx.fugue, hash); +#endif + + sph_shabal512(&ctx.shabal, hash, dataLen); + sph_shabal512_close(&ctx.shabal, hash); + + sph_whirlpool(&ctx.whirlpool, hash, dataLen); + sph_whirlpool_close(&ctx.whirlpool, hash); + + sph_sha512( &ctx.sha512, hash, dataLen ); + sph_sha512_close( &ctx.sha512, hash ); + + sph_haval256_5(&ctx.haval,(const void*) hash, dataLen); + sph_haval256_5_close(&ctx.haval, hash); + + memcpy(output, hash, 32); return 1; } diff --git a/algo/x20/x20r.c b/algo/x20/x20r.c index 901797e..8c3d43e 100644 --- a/algo/x20/x20r.c +++ b/algo/x20/x20r.c @@ -18,7 +18,7 @@ #include "algo/radiogatun/sph_radiogatun.h" #include "algo/panama/sph_panama.h" #include "algo/gost/sph_gost.h" -#include +#include "algo/sha/sph_sha2.h" #if defined(__AES__) #include "algo/echo/aes_ni/hash_api.h" #include "algo/groestl/aes_ni/hash-groestl.h" @@ -56,7 +56,7 @@ union _x20r_context_overlay sph_fugue512_context fugue; sph_shabal512_context shabal; sph_whirlpool_context whirlpool; - SHA512_CTX sha512; + sph_sha512_context sha512; sph_haval256_5_context haval; sph_gost512_context gost; sph_radiogatun64_context radiogatun; @@ -68,28 +68,6 @@ void x20r_hash(void* output, const void* input) { uint32_t _ALIGN(128) hash[64/4]; x20r_context_overlay ctx; -/* - sph_blake512_context ctx_blake; - sph_bmw512_context ctx_bmw; - sph_groestl512_context ctx_groestl; - sph_skein512_context 
ctx_skein; - sph_jh512_context ctx_jh; - sph_keccak512_context ctx_keccak; - sph_luffa512_context ctx_luffa; - sph_cubehash512_context ctx_cubehash; - sph_shavite512_context ctx_shavite; - sph_simd512_context ctx_simd; - sph_echo512_context ctx_echo; - sph_hamsi512_context ctx_hamsi; - sph_fugue512_context ctx_fugue; - sph_shabal512_context ctx_shabal; - sph_whirlpool_context ctx_whirlpool; - sph_sha512_context ctx_sha512; - sph_haval256_5_context ctx_haval; - sph_gost512_context ctx_gost; - sph_radiogatun64_context ctx_radiogatun; - sph_panama_context ctx_panama; -*/ void *in = (void*) input; int size = 80; @@ -194,9 +172,9 @@ void x20r_hash(void* output, const void* input) sph_whirlpool_close(&ctx.whirlpool, hash); break; case SHA_512: - SHA512_Init( &ctx.sha512 ); - SHA512_Update( &ctx.sha512, in, size ); - SHA512_Final( (unsigned char*) hash, &ctx.sha512 ); + sph_sha512_init( &ctx.sha512 ); + sph_sha512( &ctx.sha512, in, size ); + sph_sha512_close( &ctx.sha512, hash ); break; case HAVAL: sph_haval256_5_init(&ctx.haval); diff --git a/algo/x22/x22i-4way.c b/algo/x22/x22i-4way.c index ba5714b..94b34cc 100644 --- a/algo/x22/x22i-4way.c +++ b/algo/x22/x22i-4way.c @@ -27,7 +27,9 @@ #include "algo/shavite/shavite-hash-4way.h" #include "algo/echo/echo-hash-4way.h" #endif - +#if defined(__SHA__) + #include "algo/sha/sph_sha2.h" +#endif #if defined(X22I_8WAY) @@ -49,7 +51,11 @@ union _x22i_8way_ctx_overlay haval256_5_8way_context haval; sph_tiger_context tiger; sph_gost512_context gost; +#if defined(X22I_8WAY_SHA) + sph_sha256_context sha256; +#else sha256_8way_context sha256; +#endif #if defined(__VAES__) groestl512_4way_context groestl; shavite512_4way_context shavite; @@ -383,6 +389,35 @@ int x22i_8way_hash( void *output, const void *input, int thrid ) sph_gost512 ( &ctx.gost, (const void*) hash7, 64 ); sph_gost512_close( &ctx.gost, (void*) hash7 ); +#if defined(X22I_8WAY_SHA) + + sph_sha256_init( &ctx.sha256 ); + sph_sha256( &ctx.sha256, hash0, 64 ); + 
sph_sha256_close( &ctx.sha256, output ); + sph_sha256_init( &ctx.sha256 ); + sph_sha256( &ctx.sha256, hash1, 64 ); + sph_sha256_close( &ctx.sha256, output+32 ); + sph_sha256_init( &ctx.sha256 ); + sph_sha256( &ctx.sha256, hash2, 64 ); + sph_sha256_close( &ctx.sha256, output+64 ); + sph_sha256_init( &ctx.sha256 ); + sph_sha256( &ctx.sha256, hash3, 64 ); + sph_sha256_close( &ctx.sha256, output+96 ); + sph_sha256_init( &ctx.sha256 ); + sph_sha256( &ctx.sha256, hash4, 64 ); + sph_sha256_close( &ctx.sha256, output+128 ); + sph_sha256_init( &ctx.sha256 ); + sph_sha256( &ctx.sha256, hash5, 64 ); + sph_sha256_close( &ctx.sha256, output+160 ); + sph_sha256_init( &ctx.sha256 ); + sph_sha256( &ctx.sha256, hash6, 64 ); + sph_sha256_close( &ctx.sha256, output+192 ); + sph_sha256_init( &ctx.sha256 ); + sph_sha256( &ctx.sha256, hash7, 64 ); + sph_sha256_close( &ctx.sha256, output+224 ); + +#else + intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); @@ -390,9 +425,55 @@ int x22i_8way_hash( void *output, const void *input, int thrid ) sha256_8way_update( &ctx.sha256, vhash, 64 ); sha256_8way_close( &ctx.sha256, output ); +#endif + return 1; } +#if defined(X22I_8WAY_SHA) + +int scanhash_x22i_8way_sha( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t hash[8*8] __attribute__ ((aligned (128))); + uint32_t vdata[20*8] __attribute__ ((aligned (64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 8; + __m512i *noncev = (__m512i*)vdata + 9; + uint32_t n = first_nonce; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + + if ( bench ) ptarget[7] = 0x08ff; + + InitializeSWIFFTX(); + + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + *noncev = mm512_intrlv_blend_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); + do + { + if ( 
x22i_8way_hash( hash, vdata, thr_id ) ) + for ( int i = 0; i < 8; i++ ) + if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) ) + { + pdata[19] = bswap_32( n+i ); + submit_solution( work, hash+(i<<3), mythr ); + } + *noncev = _mm512_add_epi32( *noncev, + m512_const1_64( 0x0000000800000000 ) ); + n += 8; + } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) ); + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; +} + +#else + int scanhash_x22i_8way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { @@ -440,53 +521,7 @@ int scanhash_x22i_8way( struct work *work, uint32_t max_nonce, return 0; } -/* -int scanhash_x22i_8way( struct work* work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t hash[8*16] __attribute__ ((aligned (128))); - uint32_t vdata[24*8] __attribute__ ((aligned (64))); - uint32_t lane_hash[8] __attribute__ ((aligned (64))); - uint32_t *hash7 = &(hash[7<<3]); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - __m512i *noncev = (__m512i*)vdata + 9; // aligned - uint32_t n = first_nonce; - const uint32_t last_nonce = max_nonce - 8; - const int thr_id = mythr->id; - const uint32_t Htarg = ptarget[7]; - - if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x08ff; - - InitializeSWIFFTX(); - - mm512_bswap32_intrlv80_8x64( vdata, pdata ); - do - { - *noncev = mm512_intrlv_blend_32( mm512_bswap_32( - _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, - n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - x22i_8way_hash( hash, vdata ); - - for ( int lane = 0; lane < 8; lane++ ) - if unlikely( ( hash7[ lane ] <= Htarg ) ) - { - extr_lane_8x32( lane_hash, hash, lane, 256 ); - if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) ) - { - pdata[19] = n + lane; - submit_solution( work, lane_hash, mythr ); - } - } - n += 8; - } while ( likely( ( n < last_nonce ) && 
!work_restart[thr_id].restart ) ); - - *hashes_done = n - first_nonce; - return 0; -} -*/ +#endif #elif defined(X22I_4WAY) @@ -516,7 +551,11 @@ union _x22i_4way_ctx_overlay haval256_5_4way_context haval; sph_tiger_context tiger; sph_gost512_context gost; +#if defined(X22I_4WAY_SHA) + sph_sha256_context sha256; +#else sha256_4way_context sha256; +#endif }; typedef union _x22i_4way_ctx_overlay x22i_ctx_overlay; @@ -543,23 +582,23 @@ int x22i_4way_hash( void *output, const void *input, int thrid ) #if defined(__VAES__) - rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); + rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); - groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 ); - groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 ); + groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 ); + groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 ); - rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); + rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); #else - dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); - groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); - groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); - groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); - groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); - intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); + intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); #endif @@ -655,7 +694,7 @@ int x22i_4way_hash( void *output, const void *input, int thrid ) if ( work_restart[thrid].restart ) return false; - ComputeSingleSWIFFTX((unsigned char*)hash0, (unsigned char*)hashA0); + ComputeSingleSWIFFTX((unsigned 
char*)hash0, (unsigned char*)hashA0); ComputeSingleSWIFFTX((unsigned char*)hash1, (unsigned char*)hashA1); ComputeSingleSWIFFTX((unsigned char*)hash2, (unsigned char*)hashA2); ComputeSingleSWIFFTX((unsigned char*)hash3, (unsigned char*)hashA3); @@ -669,7 +708,7 @@ int x22i_4way_hash( void *output, const void *input, int thrid ) haval256_5_4way_close( &ctx.haval, vhash ); dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); - memset( hashA0, 0, 64 ); + memset( hashA0, 0, 64 ); memset( hashA1, 0, 64 ); memset( hashA2, 0, 64 ); memset( hashA3, 0, 64 ); @@ -684,8 +723,8 @@ int x22i_4way_hash( void *output, const void *input, int thrid ) sph_tiger (&ctx.tiger, (const void*) hash2, 64); sph_tiger_close(&ctx.tiger, (void*) hashA2); sph_tiger_init(&ctx.tiger); - sph_tiger (&ctx.tiger, (const void*) hash3, 64); - sph_tiger_close(&ctx.tiger, (void*) hashA3); + sph_tiger (&ctx.tiger, (const void*) hash3, 64); + sph_tiger_close(&ctx.tiger, (void*) hashA3); if ( work_restart[thrid].restart ) return false; @@ -712,9 +751,26 @@ int x22i_4way_hash( void *output, const void *input, int thrid ) sph_gost512_init(&ctx.gost); sph_gost512 (&ctx.gost, (const void*) hash2, 64); sph_gost512_close(&ctx.gost, (void*) hash2); - sph_gost512_init(&ctx.gost); - sph_gost512 (&ctx.gost, (const void*) hash3, 64); - sph_gost512_close(&ctx.gost, (void*) hash3); + sph_gost512_init(&ctx.gost); + sph_gost512 (&ctx.gost, (const void*) hash3, 64); + sph_gost512_close(&ctx.gost, (void*) hash3); + +#if defined(X22I_4WAY_SHA) + + sph_sha256_init( &ctx.sha256 ); + sph_sha256( &ctx.sha256, hash0, 64 ); + sph_sha256_close( &ctx.sha256, output ); + sph_sha256_init( &ctx.sha256 ); + sph_sha256( &ctx.sha256, hash1, 64 ); + sph_sha256_close( &ctx.sha256, output+32 ); + sph_sha256_init( &ctx.sha256 ); + sph_sha256( &ctx.sha256, hash2, 64 ); + sph_sha256_close( &ctx.sha256, output+64 ); + sph_sha256_init( &ctx.sha256 ); + sph_sha256( &ctx.sha256, hash3, 64 ); + sph_sha256_close( &ctx.sha256, output+96 ); + +#else 
intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); @@ -722,11 +778,56 @@ int x22i_4way_hash( void *output, const void *input, int thrid ) sha256_4way_update( &ctx.sha256, vhash, 64 ); sha256_4way_close( &ctx.sha256, output ); +#endif + return 1; } +#if defined(X22I_4WAY_SHA) + +int scanhash_x22i_4way_sha( struct work* work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t hash[8*4] __attribute__ ((aligned (64))); + uint32_t vdata[20*4] __attribute__ ((aligned (64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 4; + __m256i *noncev = (__m256i*)vdata + 9; + uint32_t n = first_nonce; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + + if ( bench ) ptarget[7] = 0x08ff; + + InitializeSWIFFTX(); + + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + *noncev = mm256_intrlv_blend_32( + _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); + do + { + if ( x22i_4way_hash( hash, vdata, thr_id ) ) + for ( int i = 0; i < 4; i++ ) + if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) ) + { + pdata[19] = bswap_32( n+i ); + submit_solution( work, hash+(i<<3), mythr ); + } + *noncev = _mm256_add_epi32( *noncev, + m256_const1_64( 0x0000000400000000 ) ); + n += 4; + } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) ); + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; +} + +#else + int scanhash_x22i_4way( struct work* work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) + uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t hash[8*4] __attribute__ ((aligned (64))); uint32_t vdata[20*4] __attribute__ ((aligned (64))); @@ -771,4 +872,6 @@ int scanhash_x22i_4way( struct work* work, uint32_t max_nonce, return 0; } +#endif + #endif // X22I_4WAY diff --git a/algo/x22/x22i-gate.c b/algo/x22/x22i-gate.c index 243f69e..ff0cc80 100644 --- 
a/algo/x22/x22i-gate.c +++ b/algo/x22/x22i-gate.c @@ -7,21 +7,32 @@ bool register_x22i_algo( algo_gate_t* gate ) { #if defined (X22I_8WAY) - gate->scanhash = (void*)&scanhash_x22i_8way; - gate->hash = (void*)&x22i_8way_hash; - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT - | AVX512_OPT | VAES_OPT; -#elif defined (X22I_4WAY) - gate->scanhash = (void*)&scanhash_x22i_4way; - gate->hash = (void*)&x22i_4way_hash; - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT - | AVX512_OPT | VAES_OPT; + +#if defined(X22I_8WAY_SHA) + gate->scanhash = (void*)&scanhash_x22i_8way_sha; #else + gate->scanhash = (void*)&scanhash_x22i_8way; +#endif + gate->hash = (void*)&x22i_8way_hash; + +#elif defined (X22I_4WAY) + +#if defined(X22I_4WAY_SHA) + gate->scanhash = (void*)&scanhash_x22i_4way_sha; +#else + gate->scanhash = (void*)&scanhash_x22i_4way; +#endif + gate->hash = (void*)&x22i_4way_hash; + +#else + gate->scanhash = (void*)&scanhash_x22i; gate->hash = (void*)&x22i_hash; + +#endif + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT | VAES_OPT | VAES256_OPT; -#endif return true; }; @@ -37,9 +48,8 @@ bool register_x25x_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x25x; gate->hash = (void*)&x25x_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | - VAES_OPT | VAES256_OPT; - + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | + AVX512_OPT | VAES_OPT | VAES256_OPT; return true; }; diff --git a/algo/x22/x22i-gate.h b/algo/x22/x22i-gate.h index 93eecda..0acedc7 100644 --- a/algo/x22/x22i-gate.h +++ b/algo/x22/x22i-gate.h @@ -12,19 +12,34 @@ #define X22I_4WAY 1 #endif +#if defined(__SHA__) +// #define X22I_8WAY_SHA 1 + #define X22I_4WAY_SHA 1 +#endif + bool register_x22i_algo( algo_gate_t* gate ); #if defined(X22I_8WAY) int x22i_8way_hash( void *state, const void *input, int thrid ); +#if defined(X22I_8WAY_SHA) +int scanhash_x22i_8way_sha( struct work *work, uint32_t max_nonce, + uint64_t 
*hashes_done, struct thr_info *mythr ); +#else int scanhash_x22i_8way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); +#endif #elif defined(X22I_4WAY) int x22i_4way_hash( void *state, const void *input, int thrid ); +#if defined(X22I_4WAY_SHA) +int scanhash_x22i_4way_sha( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +#else int scanhash_x22i_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); +#endif #else @@ -40,6 +55,11 @@ int scanhash_x22i( struct work *work, uint32_t max_nonce, #define X25X_4WAY 1 #endif +#if defined(__SHA__) +// #define X25X_8WAY_SHA 1 + #define X25X_4WAY_SHA 1 +#endif + bool register_x25i_algo( algo_gate_t* gate ); #if defined(X25X_8WAY) diff --git a/algo/x22/x22i.c b/algo/x22/x22i.c index 247ea4a..759e44c 100644 --- a/algo/x22/x22i.c +++ b/algo/x22/x22i.c @@ -23,7 +23,7 @@ #include "algo/hamsi/sph_hamsi.h" #include "algo/shabal/sph_shabal.h" #include "algo/whirlpool/sph_whirlpool.h" -#include +#include "algo/sha/sph_sha2.h" #include "algo/haval/sph-haval.h" #include "algo/tiger/sph_tiger.h" #include "algo/lyra2/lyra2.h" @@ -53,11 +53,11 @@ union _x22i_context_overlay sph_hamsi512_context hamsi; sph_shabal512_context shabal; sph_whirlpool_context whirlpool; - SHA512_CTX sha512; + sph_sha512_context sha512; sph_haval256_5_context haval; sph_tiger_context tiger; sph_gost512_context gost; - SHA256_CTX sha256; + sph_sha256_context sha256; }; typedef union _x22i_context_overlay x22i_context_overlay; @@ -67,13 +67,13 @@ int x22i_hash( void *output, const void *input, int thrid ) unsigned char hash2[65] __attribute__((aligned(64))) = {0}; x22i_context_overlay ctx; - sph_blake512_init(&ctx.blake); - sph_blake512(&ctx.blake, input, 80); - sph_blake512_close(&ctx.blake, hash); + sph_blake512_init(&ctx.blake); + sph_blake512(&ctx.blake, input, 80); + sph_blake512_close(&ctx.blake, hash); - sph_bmw512_init(&ctx.bmw); - 
sph_bmw512(&ctx.bmw, (const void*) hash, 64); - sph_bmw512_close(&ctx.bmw, hash); + sph_bmw512_init(&ctx.bmw); + sph_bmw512(&ctx.bmw, (const void*) hash, 64); + sph_bmw512_close(&ctx.bmw, hash); #if defined(__AES__) init_groestl( &ctx.groestl, 64 ); @@ -85,17 +85,17 @@ int x22i_hash( void *output, const void *input, int thrid ) sph_groestl512_close( &ctx.groestl, hash ); #endif - sph_skein512_init(&ctx.skein); - sph_skein512(&ctx.skein, (const void*) hash, 64); - sph_skein512_close(&ctx.skein, hash); + sph_skein512_init(&ctx.skein); + sph_skein512(&ctx.skein, (const void*) hash, 64); + sph_skein512_close(&ctx.skein, hash); - sph_jh512_init(&ctx.jh); - sph_jh512(&ctx.jh, (const void*) hash, 64); - sph_jh512_close(&ctx.jh, hash); + sph_jh512_init(&ctx.jh); + sph_jh512(&ctx.jh, (const void*) hash, 64); + sph_jh512_close(&ctx.jh, hash); - sph_keccak512_init(&ctx.keccak); - sph_keccak512(&ctx.keccak, (const void*) hash, 64); - sph_keccak512_close(&ctx.keccak, hash); + sph_keccak512_init(&ctx.keccak); + sph_keccak512(&ctx.keccak, (const void*) hash, 64); + sph_keccak512_close(&ctx.keccak, hash); if ( work_restart[thrid].restart ) return 0; @@ -107,9 +107,9 @@ int x22i_hash( void *output, const void *input, int thrid ) cubehashUpdateDigest( &ctx.cube, (byte*) hash, (const byte*)hash, 64 ); - sph_shavite512_init(&ctx.shavite); - sph_shavite512(&ctx.shavite, (const void*) hash, 64); - sph_shavite512_close(&ctx.shavite, hash); + sph_shavite512_init(&ctx.shavite); + sph_shavite512(&ctx.shavite, (const void*) hash, 64); + sph_shavite512_close(&ctx.shavite, hash); init_sd( &ctx.simd, 512 ); update_final_sd( &ctx.simd, (BitSequence*)hash, @@ -127,56 +127,56 @@ int x22i_hash( void *output, const void *input, int thrid ) if ( work_restart[thrid].restart ) return 0; - sph_hamsi512_init(&ctx.hamsi); - sph_hamsi512(&ctx.hamsi, (const void*) hash, 64); - sph_hamsi512_close(&ctx.hamsi, hash); + sph_hamsi512_init(&ctx.hamsi); + sph_hamsi512(&ctx.hamsi, (const void*) hash, 64); + 
sph_hamsi512_close(&ctx.hamsi, hash); #if defined(__AES__) - fugue512_full( &ctx.fugue, hash, hash, 64 ); + fugue512_full( &ctx.fugue, hash, hash, 64 ); #else - sph_fugue512_init(&ctx.fugue); - sph_fugue512(&ctx.fugue, (const void*) hash, 64); - sph_fugue512_close(&ctx.fugue, hash); + sph_fugue512_init(&ctx.fugue); + sph_fugue512(&ctx.fugue, (const void*) hash, 64); + sph_fugue512_close(&ctx.fugue, hash); #endif - sph_shabal512_init(&ctx.shabal); - sph_shabal512(&ctx.shabal, (const void*) hash, 64); - sph_shabal512_close(&ctx.shabal, &hash[64]); + sph_shabal512_init(&ctx.shabal); + sph_shabal512(&ctx.shabal, (const void*) hash, 64); + sph_shabal512_close(&ctx.shabal, &hash[64]); - sph_whirlpool_init(&ctx.whirlpool); - sph_whirlpool (&ctx.whirlpool, (const void*) &hash[64], 64); - sph_whirlpool_close(&ctx.whirlpool, &hash[128]); + sph_whirlpool_init(&ctx.whirlpool); + sph_whirlpool (&ctx.whirlpool, (const void*) &hash[64], 64); + sph_whirlpool_close(&ctx.whirlpool, &hash[128]); - SHA512_Init( &ctx.sha512 ); - SHA512_Update( &ctx.sha512, (const void*) &hash[128], 64); - SHA512_Final( (void*) &hash[192], &ctx.sha512 ); - - ComputeSingleSWIFFTX((unsigned char*)hash, (unsigned char*)hash2); + sph_sha512_init( &ctx.sha512 ); + sph_sha512( &ctx.sha512, &hash[128], 64 ); + sph_sha512_close( &ctx.sha512, &hash[192] ); + + ComputeSingleSWIFFTX((unsigned char*)hash, (unsigned char*)hash2); if ( work_restart[thrid].restart ) return 0; - memset(hash, 0, 64); - sph_haval256_5_init(&ctx.haval); - sph_haval256_5(&ctx.haval,(const void*) hash2, 64); - sph_haval256_5_close(&ctx.haval,hash); + memset(hash, 0, 64); + sph_haval256_5_init(&ctx.haval); + sph_haval256_5(&ctx.haval,(const void*) hash2, 64); + sph_haval256_5_close(&ctx.haval,hash); - memset(hash2, 0, 64); - sph_tiger_init(&ctx.tiger); - sph_tiger (&ctx.tiger, (const void*) hash, 64); - sph_tiger_close(&ctx.tiger, (void*) hash2); + memset(hash2, 0, 64); + sph_tiger_init(&ctx.tiger); + sph_tiger (&ctx.tiger, (const void*) 
hash, 64); + sph_tiger_close(&ctx.tiger, (void*) hash2); - memset(hash, 0, 64); - LYRA2RE((void*) hash, 32, (const void*) hash2, 32, (const void*) hash2, 32, 1, 4, 4); + memset(hash, 0, 64); + LYRA2RE((void*) hash, 32, (const void*) hash2, 32, (const void*) hash2, 32, 1, 4, 4); - sph_gost512_init(&ctx.gost); - sph_gost512 (&ctx.gost, (const void*) hash, 64); - sph_gost512_close(&ctx.gost, (void*) hash); + sph_gost512_init(&ctx.gost); + sph_gost512 (&ctx.gost, (const void*) hash, 64); + sph_gost512_close(&ctx.gost, (void*) hash); - SHA256_Init( &ctx.sha256 ); - SHA256_Update( &ctx.sha256, (const void*) hash, 64 ); - SHA256_Final( (unsigned char*) hash, &ctx.sha256 ); + sph_sha256_init( &ctx.sha256 ); + sph_sha256( &ctx.sha256, hash, 64 ); + sph_sha256_close( &ctx.sha256, hash ); - memcpy(output, hash, 32); + memcpy(output, hash, 32); return 1; } diff --git a/algo/x22/x25x-4way.c b/algo/x22/x25x-4way.c index e44a82e..86f5699 100644 --- a/algo/x22/x25x-4way.c +++ b/algo/x22/x25x-4way.c @@ -32,6 +32,9 @@ #include "algo/shavite/shavite-hash-4way.h" #include "algo/echo/echo-hash-4way.h" #endif +#if defined(__SHA__) + #include "algo/sha/sph_sha2.h" +#endif void x25x_shuffle( void *hash ) { @@ -80,7 +83,11 @@ union _x25x_8way_ctx_overlay haval256_5_8way_context haval; sph_tiger_context tiger; sph_gost512_context gost; +#if defined(X25X_8WAY_SHA) + sph_sha256_context sha256; +#else sha256_8way_context sha256; +#endif panama_8way_context panama; blake2s_8way_state blake2s; #if defined(__VAES__) @@ -216,9 +223,9 @@ int x25x_8way_hash( void *output, const void *input, int thrid ) #else - sph_shavite512_init(&ctx.shavite); - sph_shavite512(&ctx.shavite, (const void*) hash0[7], 64); - sph_shavite512_close(&ctx.shavite, hash0[8]); + sph_shavite512_init(&ctx.shavite); + sph_shavite512(&ctx.shavite, (const void*) hash0[7], 64); + sph_shavite512_close(&ctx.shavite, hash0[8]); sph_shavite512_init(&ctx.shavite); sph_shavite512(&ctx.shavite, (const void*) hash1[7], 64); 
sph_shavite512_close(&ctx.shavite, hash1[8]); @@ -322,9 +329,9 @@ int x25x_8way_hash( void *output, const void *input, int thrid ) dintrlv_8x32_512( hash0[13], hash1[13], hash2[13], hash3[13], hash4[13], hash5[13], hash6[13], hash7[13], vhash ); - sph_whirlpool_init(&ctx.whirlpool); - sph_whirlpool (&ctx.whirlpool, (const void*) hash0[13], 64); - sph_whirlpool_close(&ctx.whirlpool, hash0[14]); + sph_whirlpool_init(&ctx.whirlpool); + sph_whirlpool (&ctx.whirlpool, (const void*) hash0[13], 64); + sph_whirlpool_close(&ctx.whirlpool, hash0[14]); sph_whirlpool_init(&ctx.whirlpool); sph_whirlpool (&ctx.whirlpool, (const void*) hash1[13], 64); sph_whirlpool_close(&ctx.whirlpool, hash1[14]); @@ -373,9 +380,9 @@ int x25x_8way_hash( void *output, const void *input, int thrid ) dintrlv_8x32_512( hash0[17], hash1[17], hash2[17], hash3[17], hash4[17], hash5[17], hash6[17], hash7[17], vhash ); - sph_tiger_init(&ctx.tiger); - sph_tiger (&ctx.tiger, (const void*) hash0[17], 64); - sph_tiger_close(&ctx.tiger, (void*) hash0[18]); + sph_tiger_init(&ctx.tiger); + sph_tiger (&ctx.tiger, (const void*) hash0[17], 64); + sph_tiger_close(&ctx.tiger, (void*) hash0[18]); sph_tiger_init(&ctx.tiger); sph_tiger (&ctx.tiger, (const void*) hash1[17], 64); sph_tiger_close(&ctx.tiger, (void*) hash1[18]); @@ -437,6 +444,39 @@ int x25x_8way_hash( void *output, const void *input, int thrid ) sph_gost512_init(&ctx.gost); sph_gost512 (&ctx.gost, (const void*) hash7[19], 64); sph_gost512_close(&ctx.gost, (void*) hash7[20]); + +#if defined(X25X_8WAY_SHA) + + sph_sha256_init( &ctx.sha256 ); + sph_sha256( &ctx.sha256, hash0[20], 64 ); + sph_sha256_close( &ctx.sha256, hash0[21] ); + sph_sha256_init( &ctx.sha256 ); + sph_sha256( &ctx.sha256, hash1[20], 64 ); + sph_sha256_close( &ctx.sha256, hash1[21] ); + sph_sha256_init( &ctx.sha256 ); + sph_sha256( &ctx.sha256, hash2[20], 64 ); + sph_sha256_close( &ctx.sha256, hash2[21] ); + sph_sha256_init( &ctx.sha256 ); + sph_sha256( &ctx.sha256, hash3[20], 64 ); + 
sph_sha256_close( &ctx.sha256, hash3[21] ); + sph_sha256_init( &ctx.sha256 ); + sph_sha256( &ctx.sha256, hash4[20], 64 ); + sph_sha256_close( &ctx.sha256, hash4[21] ); + sph_sha256_init( &ctx.sha256 ); + sph_sha256( &ctx.sha256, hash5[20], 64 ); + sph_sha256_close( &ctx.sha256, hash5[21] ); + sph_sha256_init( &ctx.sha256 ); + sph_sha256( &ctx.sha256, hash6[20], 64 ); + sph_sha256_close( &ctx.sha256, hash6[21] ); + sph_sha256_init( &ctx.sha256 ); + sph_sha256( &ctx.sha256, hash7[20], 64 ); + sph_sha256_close( &ctx.sha256, hash7[21] ); + + intrlv_8x32_512( vhash, hash0[21], hash1[21], hash2[21], hash3[21], + hash4[21], hash5[21], hash6[21], hash7[21] ); + +#else + intrlv_8x32_512( vhashA, hash0[20], hash1[20], hash2[20], hash3[20], hash4[20], hash5[20], hash6[20], hash7[20] ); @@ -446,6 +486,8 @@ int x25x_8way_hash( void *output, const void *input, int thrid ) dintrlv_8x32_512( hash0[21], hash1[21], hash2[21], hash3[21], hash4[21], hash5[21], hash6[21], hash7[21], vhash ); +#endif + panama_8way_init( &ctx.panama ); panama_8way_update( &ctx.panama, vhash, 64 ); panama_8way_close( &ctx.panama, vhash ); @@ -603,7 +645,11 @@ union _x25x_4way_ctx_overlay haval256_5_4way_context haval; sph_tiger_context tiger; sph_gost512_context gost; +#if defined(X25X_4WAY_SHA) + sph_sha256_context sha256; +#else sha256_4way_context sha256; +#endif panama_4way_context panama; blake2s_4way_state blake2s; }; @@ -800,6 +846,25 @@ int x25x_4way_hash( void *output, const void *input, int thrid ) sph_gost512 (&ctx.gost, (const void*) hash3[19], 64); sph_gost512_close(&ctx.gost, (void*) hash3[20]); +#if defined(X25X_4WAY_SHA) + + sph_sha256_init( &ctx.sha256 ); + sph_sha256( &ctx.sha256, hash0[20], 64 ); + sph_sha256_close( &ctx.sha256, hash0[21] ); + sph_sha256_init( &ctx.sha256 ); + sph_sha256( &ctx.sha256, hash1[20], 64 ); + sph_sha256_close( &ctx.sha256, hash1[21] ); + sph_sha256_init( &ctx.sha256 ); + sph_sha256( &ctx.sha256, hash2[20], 64 ); + sph_sha256_close( &ctx.sha256, hash2[21] ); + 
sph_sha256_init( &ctx.sha256 ); + sph_sha256( &ctx.sha256, hash3[20], 64 ); + sph_sha256_close( &ctx.sha256, hash3[21] ); + + intrlv_4x32_512( vhash, hash0[21], hash1[21], hash2[21], hash3[21] ); + +#else + intrlv_4x32_512( vhashX[0], hash0[20], hash1[20], hash2[20], hash3[20] ); memset( vhash, 0, 64*4 ); @@ -808,6 +873,8 @@ int x25x_4way_hash( void *output, const void *input, int thrid ) sha256_4way_close( &ctx.sha256, vhash ); dintrlv_4x32_512( hash0[21], hash1[21], hash2[21], hash3[21], vhash ); +#endif + panama_4way_init( &ctx.panama ); panama_4way_update( &ctx.panama, vhash, 64 ); panama_4way_close( &ctx.panama, vhash ); diff --git a/algo/x22/x25x.c b/algo/x22/x25x.c index 7855698..42e7eda 100644 --- a/algo/x22/x25x.c +++ b/algo/x22/x25x.c @@ -23,7 +23,7 @@ #include "algo/hamsi/sph_hamsi.h" #include "algo/shabal/sph_shabal.h" #include "algo/whirlpool/sph_whirlpool.h" -#include +#include "algo/sha/sph_sha2.h" #include "algo/haval/sph-haval.h" #include "algo/tiger/sph_tiger.h" #include "algo/lyra2/lyra2.h" @@ -56,11 +56,11 @@ union _x25x_context_overlay sph_hamsi512_context hamsi; sph_shabal512_context shabal; sph_whirlpool_context whirlpool; - SHA512_CTX sha512; + sph_sha512_context sha512; sph_haval256_5_context haval; sph_tiger_context tiger; sph_gost512_context gost; - SHA256_CTX sha256; + sph_sha256_context sha256; sph_panama_context panama; blake2s_state blake2s; }; @@ -71,13 +71,13 @@ int x25x_hash( void *output, const void *input, int thrid ) unsigned char hash[25][64] __attribute__((aligned(64))) = {0}; x25x_context_overlay ctx; - sph_blake512_init(&ctx.blake); - sph_blake512(&ctx.blake, input, 80); - sph_blake512_close(&ctx.blake, &hash[0] ); + sph_blake512_init(&ctx.blake); + sph_blake512(&ctx.blake, input, 80); + sph_blake512_close(&ctx.blake, &hash[0] ); - sph_bmw512_init(&ctx.bmw); - sph_bmw512(&ctx.bmw, (const void*) &hash[0], 64); - sph_bmw512_close(&ctx.bmw, &hash[1]); + sph_bmw512_init(&ctx.bmw); + sph_bmw512(&ctx.bmw, (const void*) &hash[0], 
64); + sph_bmw512_close(&ctx.bmw, &hash[1]); #if defined(__AES__) init_groestl( &ctx.groestl, 64 ); @@ -89,17 +89,17 @@ int x25x_hash( void *output, const void *input, int thrid ) sph_groestl512_close( &ctx.groestl, &hash[2] ); #endif - sph_skein512_init(&ctx.skein); - sph_skein512(&ctx.skein, (const void*) &hash[2], 64); - sph_skein512_close(&ctx.skein, &hash[3]); + sph_skein512_init(&ctx.skein); + sph_skein512(&ctx.skein, (const void*) &hash[2], 64); + sph_skein512_close(&ctx.skein, &hash[3]); - sph_jh512_init(&ctx.jh); - sph_jh512(&ctx.jh, (const void*) &hash[3], 64); - sph_jh512_close(&ctx.jh, &hash[4]); + sph_jh512_init(&ctx.jh); + sph_jh512(&ctx.jh, (const void*) &hash[3], 64); + sph_jh512_close(&ctx.jh, &hash[4]); - sph_keccak512_init(&ctx.keccak); - sph_keccak512(&ctx.keccak, (const void*) &hash[4], 64); - sph_keccak512_close(&ctx.keccak, &hash[5]); + sph_keccak512_init(&ctx.keccak); + sph_keccak512(&ctx.keccak, (const void*) &hash[4], 64); + sph_keccak512_close(&ctx.keccak, &hash[5]); if ( work_restart[thrid].restart ) return 0; @@ -111,9 +111,9 @@ int x25x_hash( void *output, const void *input, int thrid ) cubehashUpdateDigest( &ctx.cube, (byte*) &hash[7], (const byte*)&hash[6], 64 ); - sph_shavite512_init(&ctx.shavite); - sph_shavite512(&ctx.shavite, (const void*) &hash[7], 64); - sph_shavite512_close(&ctx.shavite, &hash[8]); + sph_shavite512_init(&ctx.shavite); + sph_shavite512(&ctx.shavite, (const void*) &hash[7], 64); + sph_shavite512_close(&ctx.shavite, &hash[8]); init_sd( &ctx.simd, 512 ); update_final_sd( &ctx.simd, (BitSequence*)&hash[9], @@ -132,51 +132,51 @@ int x25x_hash( void *output, const void *input, int thrid ) if ( work_restart[thrid].restart ) return 0; sph_hamsi512_init(&ctx.hamsi); - sph_hamsi512(&ctx.hamsi, (const void*) &hash[10], 64); - sph_hamsi512_close(&ctx.hamsi, &hash[11]); + sph_hamsi512(&ctx.hamsi, (const void*) &hash[10], 64); + sph_hamsi512_close(&ctx.hamsi, &hash[11]); #if defined(__AES__) - fugue512_full( &ctx.fugue, 
&hash[12], &hash[11], 64 ); + fugue512_full( &ctx.fugue, &hash[12], &hash[11], 64 ); #else - sph_fugue512_init(&ctx.fugue); - sph_fugue512(&ctx.fugue, (const void*) &hash[11], 64); - sph_fugue512_close(&ctx.fugue, &hash[12]); + sph_fugue512_init(&ctx.fugue); + sph_fugue512(&ctx.fugue, (const void*) &hash[11], 64); + sph_fugue512_close(&ctx.fugue, &hash[12]); #endif - sph_shabal512_init(&ctx.shabal); - sph_shabal512(&ctx.shabal, (const void*) &hash[12], 64); - sph_shabal512_close(&ctx.shabal, &hash[13]); + sph_shabal512_init(&ctx.shabal); + sph_shabal512(&ctx.shabal, (const void*) &hash[12], 64); + sph_shabal512_close(&ctx.shabal, &hash[13]); - sph_whirlpool_init(&ctx.whirlpool); - sph_whirlpool (&ctx.whirlpool, (const void*) &hash[13], 64); - sph_whirlpool_close(&ctx.whirlpool, &hash[14]); + sph_whirlpool_init(&ctx.whirlpool); + sph_whirlpool (&ctx.whirlpool, (const void*) &hash[13], 64); + sph_whirlpool_close(&ctx.whirlpool, &hash[14]); - SHA512_Init( &ctx.sha512 ); - SHA512_Update( &ctx.sha512, (const void*) &hash[14], 64); - SHA512_Final( (void*) &hash[15], &ctx.sha512 ); + sph_sha512_init( &ctx.sha512 ); + sph_sha512( &ctx.sha512, &hash[14], 64 ); + sph_sha512_close( &ctx.sha512, &hash[15] ); ComputeSingleSWIFFTX((unsigned char*)&hash[12], (unsigned char*)&hash[16]); - sph_haval256_5_init(&ctx.haval); - sph_haval256_5(&ctx.haval,(const void*) &hash[16], 64); - sph_haval256_5_close(&ctx.haval,&hash[17]); + sph_haval256_5_init(&ctx.haval); + sph_haval256_5(&ctx.haval,(const void*) &hash[16], 64); + sph_haval256_5_close(&ctx.haval,&hash[17]); if ( work_restart[thrid].restart ) return 0; - sph_tiger_init(&ctx.tiger); - sph_tiger (&ctx.tiger, (const void*) &hash[17], 64); - sph_tiger_close(&ctx.tiger, (void*) &hash[18]); + sph_tiger_init(&ctx.tiger); + sph_tiger (&ctx.tiger, (const void*) &hash[17], 64); + sph_tiger_close(&ctx.tiger, (void*) &hash[18]); - LYRA2RE( (void*)&hash[19], 32, (const void*)&hash[18], 32, + LYRA2RE( (void*)&hash[19], 32, (const 
void*)&hash[18], 32, (const void*)&hash[18], 32, 1, 4, 4 ); - sph_gost512_init(&ctx.gost); - sph_gost512 (&ctx.gost, (const void*) &hash[19], 64); - sph_gost512_close(&ctx.gost, (void*) &hash[20]); + sph_gost512_init(&ctx.gost); + sph_gost512 (&ctx.gost, (const void*) &hash[19], 64); + sph_gost512_close(&ctx.gost, (void*) &hash[20]); - SHA256_Init( &ctx.sha256 ); - SHA256_Update( &ctx.sha256, (const void*) &hash[20], 64 ); - SHA256_Final( (unsigned char*) &hash[21], &ctx.sha256 ); + sph_sha256_init( &ctx.sha256 ); + sph_sha256( &ctx.sha256, &hash[20], 64 ); + sph_sha256_close( &ctx.sha256, &hash[21] ); sph_panama_init(&ctx.panama); sph_panama (&ctx.panama, (const void*) &hash[21], 64 ); diff --git a/algo/yescrypt/yescrypt-simd.c b/algo/yescrypt/yescrypt-simd.c index 41d97fe..0cbb528 100644 --- a/algo/yescrypt/yescrypt-simd.c +++ b/algo/yescrypt/yescrypt-simd.c @@ -1302,10 +1302,7 @@ yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local, S = (uint8_t *)XY + XY_size; if (t || flags) { - SHA256_CTX ctx; - SHA256_Init(&ctx); - SHA256_Update(&ctx, passwd, passwdlen); - SHA256_Final(sha256, &ctx); + SHA256_Buf( passwd, passwdlen, sha256 ); passwd = sha256; passwdlen = sizeof(sha256); } @@ -1382,10 +1379,7 @@ yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local, } /* Compute StoredKey */ { - SHA256_CTX ctx; - SHA256_Init(&ctx); - SHA256_Update(&ctx, sha256, sizeof(sha256)); - SHA256_Final(buf, &ctx); + SHA256_Buf( sha256, sizeof(sha256), buf ); } } diff --git a/algo/yespower/yescrypt-r8g.c b/algo/yespower/yescrypt-r8g.c index 5ee79ea..27d1fd8 100644 --- a/algo/yespower/yescrypt-r8g.c +++ b/algo/yespower/yescrypt-r8g.c @@ -52,8 +52,8 @@ int scanhash_yespower_r8g( struct work *work, uint32_t max_nonce, endiandata[19] = n; // do sha256 prehash - SHA256_Init( &sha256_prehash_ctx ); - SHA256_Update( &sha256_prehash_ctx, endiandata, 64 ); + sph_sha256_init( &sha256_prehash_ctx ); + sph_sha256( &sha256_prehash_ctx, endiandata, 64 ); do { 
yespower_tls( (unsigned char *)endiandata, params.perslen, diff --git a/algo/yespower/yespower-gate.c b/algo/yespower/yespower-gate.c index f153ff8..d36e59e 100644 --- a/algo/yespower/yespower-gate.c +++ b/algo/yespower/yespower-gate.c @@ -33,7 +33,8 @@ yespower_params_t yespower_params; //SHA256_CTX sha256_prehash_ctx; -__thread SHA256_CTX sha256_prehash_ctx; +__thread sph_sha256_context sha256_prehash_ctx; +//__thread SHA256_CTX sha256_prehash_ctx; // YESPOWER @@ -59,9 +60,9 @@ int scanhash_yespower( struct work *work, uint32_t max_nonce, be32enc( &endiandata[k], pdata[k] ); endiandata[19] = n; -// do sha256 prehash - SHA256_Init( &sha256_prehash_ctx ); - SHA256_Update( &sha256_prehash_ctx, endiandata, 64 ); + // do sha256 prehash + sph_sha256_init( &sha256_prehash_ctx ); + sph_sha256( &sha256_prehash_ctx, endiandata, 64 ); do { if ( yespower_hash( (char*)endiandata, (char*)vhash, 80, thr_id ) ) @@ -100,9 +101,9 @@ int scanhash_yespower_b2b( struct work *work, uint32_t max_nonce, be32enc( &endiandata[k], pdata[k] ); endiandata[19] = n; -// do sha256 prehash - SHA256_Init( &sha256_prehash_ctx ); - SHA256_Update( &sha256_prehash_ctx, endiandata, 64 ); + // do sha256 prehash + sph_sha256_init( &sha256_prehash_ctx ); + sph_sha256( &sha256_prehash_ctx, endiandata, 64 ); do { if (yespower_b2b_hash( (char*) endiandata, (char*) vhash, 80, thr_id ) ) @@ -165,25 +166,14 @@ bool register_yespowerr16_algo( algo_gate_t* gate ) return true; }; -/* not used, doesn't work -bool register_yescrypt_05_algo( algo_gate_t* gate ) -{ - gate->optimizations = SSE2_OPT | SHA_OPT; - gate->scanhash = (void*)&scanhash_yespower; - yespower_params.version = YESPOWER_0_5; - yespower_params.N = 2048; - yespower_params.r = 8; - yespower_params.pers = NULL; - yespower_params.perslen = 0; - opt_target_factor = 65536.0; - return true; -} +// Legacy Yescrypt (yespower v0.5) bool register_yescrypt_05_algo( algo_gate_t* gate ) { gate->optimizations = SSE2_OPT | SHA_OPT; gate->scanhash = 
(void*)&scanhash_yespower; yespower_params.version = YESPOWER_0_5; + opt_target_factor = 65536.0; if ( opt_param_n ) yespower_params.N = opt_param_n; else yespower_params.N = 2048; @@ -202,8 +192,6 @@ bool register_yescrypt_05_algo( algo_gate_t* gate ) yespower_params.perslen = 0; } -// YESCRYPT_P = 1; - applog( LOG_NOTICE,"Yescrypt parameters: N= %d, R= %d.", yespower_params.N, yespower_params.r ); if ( yespower_params.pers ) @@ -251,7 +239,6 @@ bool register_yescryptr32_05_algo( algo_gate_t* gate ) opt_target_factor = 65536.0; return true; } -*/ // POWER2B diff --git a/algo/yespower/yespower-opt.c b/algo/yespower/yespower-opt.c index f26de51..a17cea3 100644 --- a/algo/yespower/yespower-opt.c +++ b/algo/yespower/yespower-opt.c @@ -1029,72 +1029,72 @@ int yespower(yespower_local_t *local, const yespower_params_t *params, yespower_binary_t *dst, int thrid ) { - yespower_version_t version = params->version; - uint32_t N = params->N; - uint32_t r = params->r; - const uint8_t *pers = params->pers; - size_t perslen = params->perslen; - uint32_t Swidth; - size_t B_size, V_size, XY_size, need; - uint8_t *B, *S; - salsa20_blk_t *V, *XY; - pwxform_ctx_t ctx; - uint8_t sha256[32]; - SHA256_CTX sha256_ctx; + yespower_version_t version = params->version; + uint32_t N = params->N; + uint32_t r = params->r; + const uint8_t *pers = params->pers; + size_t perslen = params->perslen; + uint32_t Swidth; + size_t B_size, V_size, XY_size, need; + uint8_t *B, *S; + salsa20_blk_t *V, *XY; + pwxform_ctx_t ctx; + uint8_t sha256[32]; + sph_sha256_context sha256_ctx; - /* Sanity-check parameters */ - if ( (version != YESPOWER_0_5 && version != YESPOWER_1_0) + /* Sanity-check parameters */ + if ( (version != YESPOWER_0_5 && version != YESPOWER_1_0) || N < 1024 || N > 512 * 1024 || r < 8 || r > 32 || (N & (N - 1)) != 0 || ( !pers && perslen ) ) { - errno = EINVAL; - return -1; - } + errno = EINVAL; + return -1; + } - /* Allocate memory */ - B_size = (size_t)128 * r; - V_size = B_size * N; - if 
( version == YESPOWER_0_5 ) + /* Allocate memory */ + B_size = (size_t)128 * r; + V_size = B_size * N; + if ( version == YESPOWER_0_5 ) { - XY_size = B_size * 2; - Swidth = Swidth_0_5; - ctx.Sbytes = 2 * Swidth_to_Sbytes1( Swidth ); - } else { - XY_size = B_size + 64; - Swidth = Swidth_1_0; - ctx.Sbytes = 3 * Swidth_to_Sbytes1( Swidth ); - } - need = B_size + V_size + XY_size + ctx.Sbytes; - if ( local->aligned_size < need ) + XY_size = B_size * 2; + Swidth = Swidth_0_5; + ctx.Sbytes = 2 * Swidth_to_Sbytes1( Swidth ); + } + else { - if ( free_region( local ) ) - return -1; - if ( !alloc_region( local, need ) ) - return -1; - } - B = (uint8_t *)local->aligned; - V = (salsa20_blk_t *)((uint8_t *)B + B_size); - XY = (salsa20_blk_t *)((uint8_t *)V + V_size); - S = (uint8_t *)XY + XY_size; - ctx.S0 = S; - ctx.S1 = S + Swidth_to_Sbytes1( Swidth ); + XY_size = B_size + 64; + Swidth = Swidth_1_0; + ctx.Sbytes = 3 * Swidth_to_Sbytes1( Swidth ); + } + need = B_size + V_size + XY_size + ctx.Sbytes; + if ( local->aligned_size < need ) + { + if ( free_region( local ) ) + return -1; + if ( !alloc_region( local, need ) ) + return -1; + } + B = (uint8_t *)local->aligned; + V = (salsa20_blk_t *)((uint8_t *)B + B_size); + XY = (salsa20_blk_t *)((uint8_t *)V + V_size); + S = (uint8_t *)XY + XY_size; + ctx.S0 = S; + ctx.S1 = S + Swidth_to_Sbytes1( Swidth ); - -// copy prehash, do tail + // copy prehash, do tail memcpy( &sha256_ctx, &sha256_prehash_ctx, sizeof sha256_ctx ); - SHA256_Update( &sha256_ctx, src+64, srclen-64 ); - SHA256_Final( sha256, &sha256_ctx ); -// SHA256_Buf(src, srclen, sha256); + sph_sha256( &sha256_ctx, src+64, srclen-64 ); + sph_sha256_close( &sha256_ctx, sha256 ); - if ( version == YESPOWER_0_5 ) + if ( version == YESPOWER_0_5 ) { PBKDF2_SHA256( sha256, sizeof(sha256), src, srclen, 1, B, B_size ); if ( work_restart[thrid].restart ) return 0; memcpy( sha256, B, sizeof(sha256) ); - smix( B, r, N, V, XY, &ctx ); + smix( B, r, N, V, XY, &ctx ); if ( 
work_restart[thrid].restart ) return 0; @@ -1108,54 +1108,36 @@ int yespower(yespower_local_t *local, src = pers; srclen = perslen; } - else - srclen = 0; + + HMAC_SHA256_Buf( dst, sizeof(*dst), src, srclen, sha256 ); + SHA256_Buf( sha256, sizeof(sha256), (uint8_t *)dst ); - HMAC_SHA256_CTX ctx; - HMAC_SHA256_Init( &ctx, dst, sizeof(*dst) ); - HMAC_SHA256_Update( &ctx, src, srclen ); - HMAC_SHA256_Final( sha256, &ctx ); - -// SHA256_CTX ctx; - SHA256_Init( &sha256_ctx ); - SHA256_Update( &sha256_ctx, sha256, sizeof(sha256) ); - SHA256_Final( (unsigned char*)dst, &sha256_ctx ); - - -/* - if ( pers ) - { - HMAC_SHA256_Buf( dst, sizeof(*dst), pers, perslen, sha256 ); - SHA256_Buf( sha256, sizeof(sha256), (uint8_t *)dst ); - } -*/ } else { - ctx.S2 = S + 2 * Swidth_to_Sbytes1( Swidth ); - ctx.w = 0; - - if ( pers ) + ctx.S2 = S + 2 * Swidth_to_Sbytes1( Swidth ); + ctx.w = 0; + if ( pers ) { - src = pers; - srclen = perslen; - } + src = pers; + srclen = perslen; + } else - srclen = 0; + srclen = 0; - PBKDF2_SHA256( sha256, sizeof(sha256), src, srclen, 1, B, 128 ); - memcpy( sha256, B, sizeof(sha256) ); + PBKDF2_SHA256( sha256, sizeof(sha256), src, srclen, 1, B, 128 ); + memcpy( sha256, B, sizeof(sha256) ); if ( work_restart[thrid].restart ) return 0; smix_1_0( B, r, N, V, XY, &ctx ); - + HMAC_SHA256_Buf( B + B_size - 64, 64, sha256, sizeof(sha256), (uint8_t *)dst ); - } + } - /* Success! */ - return 1; + /* Success! 
*/ + return 1; } /** diff --git a/algo/yespower/yespower.h b/algo/yespower/yespower.h index f4a31b5..c5b6d78 100644 --- a/algo/yespower/yespower.h +++ b/algo/yespower/yespower.h @@ -34,6 +34,7 @@ #include /* for size_t */ #include "miner.h" #include "simd-utils.h" +#include "algo/sha/sph_sha2.h" #include #ifdef __cplusplus @@ -79,7 +80,8 @@ typedef struct { extern yespower_params_t yespower_params; //SHA256_CTX sha256_prehash_ctx; -extern __thread SHA256_CTX sha256_prehash_ctx; +extern __thread sph_sha256_context sha256_prehash_ctx; +//extern __thread SHA256_CTX sha256_prehash_ctx; /** * yespower_init_local(local): diff --git a/build-allarch.sh b/build-allarch.sh index eb1f71b..fa1d866 100755 --- a/build-allarch.sh +++ b/build-allarch.sh @@ -17,10 +17,11 @@ mv cpuminer.exe cpuminer-avx512-sha-vaes.exe strip -s cpuminer mv cpuminer cpuminer-avx512-sha-vaes -# Rocketlake AVX512 AES SHA +# Rocketlake AVX512 SHA AES make clean || echo clean rm -f config.status -CFLAGS="-O3 -march=skylake-avx512 -msha -Wall -fno-common" ./configure --with-curl +CFLAGS="-O3 -march=cascadelake -msha -Wall -fno-common" ./configure --with-curl +#CFLAGS="-O3 -march=skylake-avx512 -msha -Wall -fno-common" ./configure --with-curl # CFLAGS="-O3 -march=rocketlake -Wall -fno-common" ./configure --with-curl make -j 8 strip -s cpuminer.exe @@ -99,7 +100,7 @@ mv cpuminer.exe cpuminer-sse2.exe strip -s cpuminer mv cpuminer cpuminer-sse2 -# Zen1 AVX2 SHA +# AMD Zen1 AVX2 SHA make clean || echo done rm -f config.status CFLAGS="-O3 -march=znver1 -Wall -fno-common" ./configure --with-curl @@ -109,7 +110,7 @@ mv cpuminer.exe cpuminer-zen.exe strip -s cpuminer mv cpuminer cpuminer-zen -# Zen3 AVX2 SHA VAES +# AMD Zen3 AVX2 SHA VAES make clean || echo done rm -f config.status CFLAGS="-O3 -march=znver2 -mvaes -Wall -fno-common" ./configure --with-curl diff --git a/build-avx2.sh b/build-avx2.sh new file mode 100755 index 0000000..7a12473 --- /dev/null +++ b/build-avx2.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +#if [ 
"$OS" = "Windows_NT" ]; then +# ./mingw64.sh +# exit 0 +#fi + +# Linux build + +make distclean || echo clean + +rm -f config.status +./autogen.sh || echo done + +# Ubuntu 10.04 (gcc 4.4) +# extracflags="-O3 -march=native -Wall -D_REENTRANT -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants -fbranch-target-load-optimize2 -fsched2-use-superblocks -falign-loops=16 -falign-functions=16 -falign-jumps=16 -falign-labels=16" + +# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+) +#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores" + +#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr +CFLAGS="-O3 -march=haswell -maes -Wall" ./configure --with-curl +#CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl + +make -j 4 + +strip -s cpuminer diff --git a/configure b/configure index 5955782..aa0b4f2 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.15.2. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.15.3. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.15.2' -PACKAGE_STRING='cpuminer-opt 3.15.2' +PACKAGE_VERSION='3.15.3' +PACKAGE_STRING='cpuminer-opt 3.15.3' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.15.2 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.15.3 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... 
@@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.15.2:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.15.3:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.15.2 +cpuminer-opt configure 3.15.3 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.15.2, which was +It was created by cpuminer-opt $as_me 3.15.3, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.15.2' + VERSION='3.15.3' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.15.2, which was +This file was extended by cpuminer-opt $as_me 3.15.3, which was generated by GNU Autoconf 2.69. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.15.2 +cpuminer-opt config.status 3.15.3 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index b5c82d2..cae81bd 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.15.2]) +AC_INIT([cpuminer-opt], [3.15.3]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpuminer.nsi b/junk/cpuminer.nsi similarity index 100% rename from cpuminer.nsi rename to junk/cpuminer.nsi diff --git a/cpuminer.sln b/junk/cpuminer.sln similarity index 100% rename from cpuminer.sln rename to junk/cpuminer.sln diff --git a/winbuild-cross.sh b/winbuild-cross.sh index 5e3542b..58503fe 100755 --- a/winbuild-cross.sh +++ b/winbuild-cross.sh @@ -49,7 +49,16 @@ make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe -# Zen1 AVX2 SHA +# Rocketlake AVX512 SHA AES +make clean || echo clean +rm -f config.status +CFLAGS="-O3 -march=cascadelake -msha -Wall" ./configure $CONFIGURE_ARGS +#CFLAGS="-O3 -march=rocketlake -Wall" ./configure $CONFIGURE_ARGS +make -j 8 +strip -s cpuminer.exe +mv cpuminer.exe release/cpuminer-avx512-sha.exe + +# Zen1 AVX2 AES SHA make clean || echo clean rm -f config.status CFLAGS="-O3 -march=znver1 -Wall" ./configure $CONFIGURE_ARGS @@ -95,7 +104,6 @@ strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-avx.exe # Westmere SSE4.2 AES -# -march=westmere is supported in gcc5 make clean || echo clean rm -f config.status CFLAGS="-O3 -march=westmere -Wall" ./configure $CONFIGURE_ARGS @@ -104,6 +112,7 @@ make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-aes-sse42.exe +# Nehalem SSE4.2 #make clean || echo clean #rm -f config.status #CFLAGS="-O3 -march=corei7 -Wall" 
./configure $CONFIGURE_ARGS @@ -111,6 +120,7 @@ mv cpuminer.exe release/cpuminer-aes-sse42.exe #strip -s cpuminer.exe #mv cpuminer.exe release/cpuminer-sse42.exe +# Core2 SSSE3 #make clean || echo clean #rm -f config.status #CFLAGS="-O3 -march=core2 -Wall" ./configure $CONFIGURE_ARGS