v3.16.2

2025-09-17 23:44:27 +00:00 · 2021-04-08 18:09:31 -04:00
parent 902ec046dd
commit f3333b0070
17 changed files with 826 additions and 336 deletions
--- a/algo/verthash/Verthash.c
+++ b/algo/verthash/Verthash.c
@@ -134,87 +134,117 @@ static inline uint32_t fnv1a(const uint32_t a, const uint32_t b)
    return (a ^ b) * 0x1000193;
 }

-void verthash_hash(const unsigned char* blob_bytes,
-                   const size_t blob_size,
-                   const unsigned char(*input)[VH_HEADER_SIZE],
-                   unsigned char(*output)[VH_HASH_OUT_SIZE])
+void verthash_hash( const unsigned char* blob_bytes,   // lookup data, read below as 32-bit words
+                    const size_t blob_size,   // size of blob_bytes; bounds the seek offsets via mdiv
+                    const unsigned char(*input)[VH_HEADER_SIZE],   // header to hash
+                    unsigned char(*output)[VH_HASH_OUT_SIZE] )   // receives the final p1
 {
-    unsigned char p1[VH_HASH_OUT_SIZE] __attribute__ ((aligned (64)));
-    sha3(&input[0], VH_HEADER_SIZE, &p1[0], VH_HASH_OUT_SIZE);
-
-    unsigned char p0[VH_N_SUBSET];
-
-    unsigned char input_header[VH_HEADER_SIZE] __attribute__ ((aligned (64)));
-    memcpy(input_header, input, VH_HEADER_SIZE);
-
-    for (size_t i = 0; i < VH_N_ITER; ++i)
-    {
-        input_header[0] += 1;
-        sha3(&input_header[0], VH_HEADER_SIZE, p0 + i * VH_P0_SIZE, VH_P0_SIZE);
-    }
-
-    uint32_t* p0_index = (uint32_t*)p0;
+    unsigned char p1[ VH_HASH_OUT_SIZE ] __attribute__ ((aligned (64)));
+    unsigned char p0[ VH_N_SUBSET ] __attribute__ ((aligned (64)));   // aligned: accessed as whole vectors below
    uint32_t seek_indexes[VH_N_INDEXES] __attribute__ ((aligned (64)));
+    uint32_t* p0_index = (uint32_t*)p0;

+    verthash_sha3_512_final_8( p0, ( (uint64_t*)input )[ 9 ] );   // header bytes 72..79; needs verthash_sha3_512_prehash_72() called first on this thread
+    
    for ( size_t x = 0; x < VH_N_ROT; ++x )
    {
        memcpy( seek_indexes + x * (VH_N_SUBSET / sizeof(uint32_t)),
                p0, VH_N_SUBSET);

-//#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)        
-// 512 bit vector processing is actually slower because it reduces the CPU
-// clock significantly, which also slows mem access. The AVX512 rol instruction
-// is still available for smaller vectors.
-
-//        for ( size_t y = 0; y < VH_N_SUBSET / sizeof(uint32_t); y += 16 )
-//        {
-//            __m512i *p0_v = (__m512i*)( p0_index + y );
-//            *p0_v = mm512_rol_32( *p0_v, 1 );
-//        }
-
 #if defined(__AVX2__)

-        for ( size_t y = 0; y < VH_N_SUBSET / sizeof(uint32_t); y += 8 )
+        for ( size_t y = 0; y < VH_N_SUBSET / sizeof(__m256i); y += 8)   // y counts 256-bit vectors, 8 per pass
        {
-            __m256i *p0_v = (__m256i*)( p0_index + y );
-            *p0_v = mm256_rol_32( *p0_v, 1 );
+           casti_m256i( p0_index, y   ) = mm256_rol_32(
+                                            casti_m256i( p0_index, y   ), 1 );
+           casti_m256i( p0_index, y+1 ) = mm256_rol_32( 
+                                            casti_m256i( p0_index, y+1 ), 1 );
+           casti_m256i( p0_index, y+2 ) = mm256_rol_32(
+                                            casti_m256i( p0_index, y+2 ), 1 );
+           casti_m256i( p0_index, y+3 ) = mm256_rol_32(
+                                            casti_m256i( p0_index, y+3 ), 1 );
+           casti_m256i( p0_index, y+4 ) = mm256_rol_32(
+                                            casti_m256i( p0_index, y+4 ), 1 );
+           casti_m256i( p0_index, y+5 ) = mm256_rol_32(
+                                            casti_m256i( p0_index, y+5 ), 1 );
+           casti_m256i( p0_index, y+6 ) = mm256_rol_32(
+                                            casti_m256i( p0_index, y+6 ), 1 );
+           casti_m256i( p0_index, y+7 ) = mm256_rol_32(
+                                            casti_m256i( p0_index, y+7 ), 1 );
        }

 #else

-        for ( size_t y = 0; y < VH_N_SUBSET / sizeof(uint32_t); y += 4 )
+        for ( size_t y = 0; y < VH_N_SUBSET / sizeof(__m128i); y += 8)   // y counts 128-bit vectors, 8 per pass
        {
-            __m128i *p0_v = (__m128i*)( p0_index + y );
-            *p0_v = mm128_rol_32( *p0_v, 1 );
+           casti_m128i( p0_index, y   ) = mm128_rol_32(
+                                            casti_m128i( p0_index, y   ), 1 );
+           casti_m128i( p0_index, y+1 ) = mm128_rol_32(
+                                            casti_m128i( p0_index, y+1 ), 1 );
+           casti_m128i( p0_index, y+2 ) = mm128_rol_32(
+                                            casti_m128i( p0_index, y+2 ), 1 );
+           casti_m128i( p0_index, y+3 ) = mm128_rol_32(
+                                            casti_m128i( p0_index, y+3 ), 1 );
+           casti_m128i( p0_index, y+4 ) = mm128_rol_32(
+                                            casti_m128i( p0_index, y+4 ), 1 );
+           casti_m128i( p0_index, y+5 ) = mm128_rol_32(
+                                            casti_m128i( p0_index, y+5 ), 1 );
+           casti_m128i( p0_index, y+6 ) = mm128_rol_32(
+                                            casti_m128i( p0_index, y+6 ), 1 );
+           casti_m128i( p0_index, y+7 ) = mm128_rol_32(
+                                            casti_m128i( p0_index, y+7 ), 1 );
        }
-
+        
 #endif

-//        for (size_t y = 0; y < VH_N_SUBSET / sizeof(uint32_t); ++y)
-//        {
-//            *(p0_index + y) = ( *(p0_index + y) << 1 )
-//            | ( 1 & (*(p0_index + y) >> 31) );
-//        }
    }

+    sha3( &input[0], VH_HEADER_SIZE, &p1[0], VH_HASH_OUT_SIZE );   // p1 starts as the SHA3 of the whole header
+    
    uint32_t* p1_32 = (uint32_t*)p1;
    uint32_t* blob_bytes_32 = (uint32_t*)blob_bytes;
    uint32_t value_accumulator = 0x811c9dc5;
-    const uint32_t mdiv = ((blob_size - VH_HASH_OUT_SIZE) / VH_BYTE_ALIGNMENT) + 1;
-    for (size_t i = 0; i < VH_N_INDEXES; i++)
+    const uint32_t mdiv = ( ( blob_size - VH_HASH_OUT_SIZE )
+                             / VH_BYTE_ALIGNMENT ) + 1;
+#if defined (__AVX2__)        
+    const __m256i k = _mm256_set1_epi32( 0x1000193 );   // same multiplier as fnv1a(), one copy per 32-bit lane
+#elif defined(__SSE4_1__)   // compiler-defined SSE4.1 macro; "__SSE41__" is never defined, which left this path dead
+    const __m128i k = _mm_set1_epi32( 0x1000193 );
+#endif
+
+    for ( size_t i = 0; i < VH_N_INDEXES; i++ )
    {
-        const uint32_t offset = (fnv1a(seek_indexes[i], value_accumulator) % mdiv) * VH_BYTE_ALIGNMENT / sizeof(uint32_t);
+        const uint32_t offset =
+                      ( fnv1a( seek_indexes[i], value_accumulator) % mdiv )
+                      * ( VH_BYTE_ALIGNMENT / sizeof(uint32_t) );
        const uint32_t *blob_off = blob_bytes_32 + offset;
-        for (size_t i2 = 0; i2 < VH_HASH_OUT_SIZE / sizeof(uint32_t); i2++)
-        {
-            const uint32_t value = *( blob_off + i2 );
-            uint32_t* p1_ptr = p1_32 + i2;
-            *p1_ptr = fnv1a( *p1_ptr, value );
-            value_accumulator = fnv1a( value_accumulator, value );
-        }
+
+        // serial chain over the 8 fetched words; its result seeds the next seek offset
+        value_accumulator = fnv1a( value_accumulator, blob_off[0] );
+        value_accumulator = fnv1a( value_accumulator, blob_off[1] );
+        value_accumulator = fnv1a( value_accumulator, blob_off[2] );
+        value_accumulator = fnv1a( value_accumulator, blob_off[3] );
+        value_accumulator = fnv1a( value_accumulator, blob_off[4] );
+        value_accumulator = fnv1a( value_accumulator, blob_off[5] );
+        value_accumulator = fnv1a( value_accumulator, blob_off[6] );
+        value_accumulator = fnv1a( value_accumulator, blob_off[7] );
+        
+#if defined (__AVX2__)        
+        *(__m256i*)p1_32 = _mm256_mullo_epi32( _mm256_xor_si256(
+                                  *(__m256i*)p1_32, *(__m256i*)blob_off ), k );
+#elif defined(__SSE4_1__)   // must match the guard that declares k above
+        casti_m128i( p1_32, 0 ) = _mm_mullo_epi32( _mm_xor_si128( 
+                    casti_m128i( p1_32, 0 ), casti_m128i( blob_off, 0 ) ), k );
+        casti_m128i( p1_32, 1 ) = _mm_mullo_epi32( _mm_xor_si128( 
+                    casti_m128i( p1_32, 1 ), casti_m128i( blob_off, 1 ) ), k );
+#else
+         for ( size_t i2 = 0; i2 < VH_HASH_OUT_SIZE / sizeof(uint32_t); i2++ )
+            p1_32[i2] = fnv1a( p1_32[i2], blob_off[i2] );
+#endif
+
    }

-    memcpy(output, p1, VH_HASH_OUT_SIZE);
+    memcpy( output, p1, VH_HASH_OUT_SIZE );
 }

 //-----------------------------------------------------------------------------
--- a/algo/verthash/Verthash.h
+++ b/algo/verthash/Verthash.h
@@ -52,6 +52,8 @@ void verthash_hash(const unsigned char* blob_bytes,
                   const unsigned char(*input)[VH_HEADER_SIZE],
                   unsigned char(*output)[VH_HASH_OUT_SIZE]);

+void verthash_sha3_512_prehash_72( const void *input );
+void verthash_sha3_512_final_8( void *hash, const uint64_t nonce );

 #endif // !Verthash_INCLUDE_ONCE

--- a/algo/verthash/tiny_sha3/sha3-4way.c
+++ b/algo/verthash/tiny_sha3/sha3-4way.c
@@ -0,0 +1,301 @@
+#if defined(__AVX2__)
+
+// sha3-4way.c
+// 19-Nov-11  Markku-Juhani O. Saarinen <mjos@iki.fi>
+// vectorization by JayDDee 2021-03-27
+//
+// Revised 07-Aug-15 to match with official release of FIPS PUB 202 "SHA3"
+// Revised 03-Sep-15 for portability + OpenSSL - style API
+
+#include "sha3-4way.h"
+
+// constants
+static const uint64_t keccakf_rndc[24] = {
+        0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
+        0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
+        0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
+        0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
+        0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
+        0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
+        0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
+        0x8000000000008080, 0x0000000080000001, 0x8000000080008008
+    };
+
+void sha3_4way_keccakf( __m256i st[25] )
+{
+   int i, j, r;
+   __m256i t, bc[5];
+
+   for ( r = 0; r < KECCAKF_ROUNDS; r++ )
+   {
+      // Theta
+      bc[0] = _mm256_xor_si256( st[0],
+                           mm256_xor4( st[5], st[10], st[15], st[20] ) );
+      bc[1] = _mm256_xor_si256( st[1],
+                           mm256_xor4( st[6], st[11], st[16], st[21] ) );
+      bc[2] = _mm256_xor_si256( st[2],
+                           mm256_xor4( st[7], st[12], st[17], st[22] ) );
+      bc[3] = _mm256_xor_si256( st[3],
+                           mm256_xor4( st[8], st[13], st[18], st[23] ) );
+      bc[4] = _mm256_xor_si256( st[4],
+                           mm256_xor4( st[9], st[14], st[19], st[24] ) );
+
+      for ( i = 0; i < 5; i++ )
+      {
+         t = _mm256_xor_si256( bc[ (i+4) % 5 ],
+                               mm256_rol_64( bc[ (i+1) % 5 ], 1 ) );
+         st[ i    ]  = _mm256_xor_si256( st[ i    ],  t );
+         st[ i+5  ]  = _mm256_xor_si256( st[ i+ 5 ],  t );
+         st[ i+10 ]  = _mm256_xor_si256( st[ i+10 ],  t );
+         st[ i+15 ]  = _mm256_xor_si256( st[ i+15 ],  t );
+         st[ i+20 ]  = _mm256_xor_si256( st[ i+20 ],  t );
+      }
+
+      // Rho Pi
+#define RHO_PI( i, c ) \
+   bc[0] = st[ i ]; \
+   st[ i ] = mm256_rol_64( t, c ); \
+   t = bc[0]
+
+      t = st[1];
+
+      RHO_PI( 10,  1 );
+      RHO_PI(  7,  3 );
+      RHO_PI( 11,  6 );
+      RHO_PI( 17, 10 );
+      RHO_PI( 18, 15 );
+      RHO_PI(  3, 21 );
+      RHO_PI(  5, 28 );
+      RHO_PI( 16, 36 );
+      RHO_PI(  8, 45 );
+      RHO_PI( 21, 55 );
+      RHO_PI( 24,  2 );
+      RHO_PI(  4, 14 );
+      RHO_PI( 15, 27 );
+      RHO_PI( 23, 41 );
+      RHO_PI( 19, 56 );
+      RHO_PI( 13,  8 );
+      RHO_PI( 12, 25 );
+      RHO_PI(  2, 43 );
+      RHO_PI( 20, 62 );
+      RHO_PI( 14, 18 );
+      RHO_PI( 22, 39 );
+      RHO_PI(  9, 61 );
+      RHO_PI(  6, 20 );
+      RHO_PI(  1, 44 );
+
+#undef RHO_PI        
+
+      //  Chi
+      for ( j = 0; j < 25; j += 5 )
+      {
+         memcpy( bc, &st[ j ], 5*32 );
+         st[ j   ] = _mm256_xor_si256( st[ j   ],
+                                       _mm256_andnot_si256( bc[1], bc[2] ) );
+         st[ j+1 ] = _mm256_xor_si256( st[ j+1 ],
+                                       _mm256_andnot_si256( bc[2], bc[3] ) );
+         st[ j+2 ] = _mm256_xor_si256( st[ j+2 ],
+                                       _mm256_andnot_si256( bc[3], bc[4] ) );
+         st[ j+3 ] = _mm256_xor_si256( st[ j+3 ],
+                                       _mm256_andnot_si256( bc[4], bc[0] ) );
+         st[ j+4 ] = _mm256_xor_si256( st[ j+4 ],
+                                       _mm256_andnot_si256( bc[0], bc[1] ) );
+      }
+
+      //  Iota
+      st[0] = _mm256_xor_si256( st[0],
+                                _mm256_set1_epi64x( keccakf_rndc[ r ] ) );
+   }
+}
+
+int sha3_4way_init( sha3_4way_ctx_t *c, int mdlen )
+{
+    for ( int i = 0; i < 25; i++ )  c->st[ i ] = m256_zero;
+    c->mdlen = mdlen;
+    c->rsiz = 200 - 2 * mdlen;
+    c->pt = 0;
+    return 1;
+}
+
+int sha3_4way_update( sha3_4way_ctx_t *c, const void *data, size_t len )   // absorb len bytes per lane from interleaved data; always returns 1
+{
+    size_t i;
+    int j =  c->pt;   // index of the next state word to absorb into
+    const int rsiz = c->rsiz / 8;   // rate in 64-bit words
+    const size_t l = len / 8;   // whole 64-bit words only; any len % 8 remainder is ignored. size_t matches i and avoids narrowing len
+
+    for ( i = 0; i < l; i++ )
+    {
+        c->st[ j ] = _mm256_xor_si256( c->st[ j ],
+                                       ( (const __m256i*)data )[i] );   // one __m256i read per word index
+        j++;
+        if ( j >= rsiz )   // rate block full: permute and start over
+        {
+            sha3_4way_keccakf( c->st );
+            j = 0;
+        }
+    }
+    c->pt = j;
+
+    return 1;
+}
+
+int sha3_4way_final( void *md, sha3_4way_ctx_t *c )
+{
+    c->st[ c->pt ] = _mm256_xor_si256( c->st[ c->pt ],
+                                       m256_const1_64( 6 ) );
+    c->st[ c->rsiz / 8 - 1 ] =
+                       _mm256_xor_si256( c->st[ c->rsiz / 8 - 1 ],
+                                         m256_const1_64( 0x8000000000000000 ) );
+    sha3_4way_keccakf( c->st );
+    memcpy( md, c->st, c->mdlen * 4 );
+    return 1;
+}
+
+void *sha3_4way( const void *in, size_t inlen, void *md, int mdlen )
+{
+    sha3_4way_ctx_t ctx;
+    sha3_4way_init( &ctx, mdlen);
+    sha3_4way_update( &ctx, in, inlen );
+    sha3_4way_final( md, &ctx );
+    return md;
+}
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+void sha3_8way_keccakf( __m512i st[25] )
+{
+    int i, j, r;
+    __m512i t, bc[5];
+
+    // actual iteration
+    for ( r = 0; r < KECCAKF_ROUNDS; r++ )
+    {
+
+        // Theta
+        for ( i = 0; i < 5; i++ )
+           bc[i] = _mm512_xor_si512( st[i], 
+              mm512_xor4( st[ i+5 ], st[ i+10 ], st[ i+15 ], st[i+20 ] ) );
+
+        for ( i = 0; i < 5; i++ )
+        {
+            t = _mm512_xor_si512( bc[(i + 4) % 5],
+                                  _mm512_rol_epi64( bc[(i + 1) % 5], 1 ) );
+            for ( j = 0; j < 25; j += 5 )
+                st[j + i]  = _mm512_xor_si512( st[j + i],  t );
+        }
+
+        // Rho Pi
+#define RHO_PI( i, c ) \
+   bc[0] = st[ i ]; \
+   st[ i ] = _mm512_rol_epi64( t, c ); \
+   t = bc[0]
+
+        t = st[1];
+
+        RHO_PI( 10,  1 );        
+        RHO_PI(  7,  3 );
+        RHO_PI( 11,  6 );
+        RHO_PI( 17, 10 );
+        RHO_PI( 18, 15 );
+        RHO_PI(  3, 21 );
+        RHO_PI(  5, 28 );
+        RHO_PI( 16, 36 );
+        RHO_PI(  8, 45 );
+        RHO_PI( 21, 55 );
+        RHO_PI( 24,  2 );
+        RHO_PI(  4, 14 );
+        RHO_PI( 15, 27 );
+        RHO_PI( 23, 41 );
+        RHO_PI( 19, 56 );
+        RHO_PI( 13,  8 );
+        RHO_PI( 12, 25 );
+        RHO_PI(  2, 43 );
+        RHO_PI( 20, 62 );
+        RHO_PI( 14, 18 );
+        RHO_PI( 22, 39 );
+        RHO_PI(  9, 61 );
+        RHO_PI(  6, 20 );
+        RHO_PI(  1, 44 );
+
+#undef RHO_PI        
+
+        //  Chi
+        for ( j = 0; j < 25; j += 5 )
+        {
+            for ( i = 0; i < 5; i++ )
+                bc[i] = st[j + i];
+            for ( i = 0; i < 5; i++ )
+                st[ j+i ] = _mm512_xor_si512(  st[ j+i ],  _mm512_andnot_si512(
+                                         bc[ (i+1) % 5 ], bc[ (i+2) % 5 ] ) );
+        }
+
+        //  Iota
+        st[0] = _mm512_xor_si512( st[0], _mm512_set1_epi64( keccakf_rndc[r] ) );
+    }
+}
+
+// Initialize the context for SHA3
+
+int sha3_8way_init( sha3_8way_ctx_t *c, int mdlen )
+{
+    for ( int i = 0; i < 25; i++ )  c->st[ i ] = m512_zero;
+    c->mdlen = mdlen;
+    c->rsiz = 200 - 2 * mdlen;
+    c->pt = 0;
+    return 1;
+}
+
+// update state with more data
+
+int sha3_8way_update( sha3_8way_ctx_t *c, const void *data, size_t len )   // absorb len bytes per lane from interleaved data; always returns 1
+{
+    size_t i;
+    int j =  c->pt;   // index of the next state word to absorb into
+    const int rsiz = c->rsiz / 8;   // rate in 64-bit words
+    const size_t l = len / 8;   // whole 64-bit words only; any len % 8 remainder is ignored. size_t matches i and avoids narrowing len
+
+    for ( i = 0; i < l; i++ )
+    {
+        c->st[ j ] = _mm512_xor_si512( c->st[ j ],
+                                        ( (const __m512i*)data )[i] );   // one __m512i read per word index
+        j++;
+        if ( j >= rsiz )   // rate block full: permute and start over
+        {
+            sha3_8way_keccakf( c->st );
+            j = 0;
+        }
+    }
+    c->pt = j;
+
+    return 1;
+}
+
+// finalize and output a hash
+
+int sha3_8way_final( void *md, sha3_8way_ctx_t *c )
+{
+    c->st[ c->pt ] =
+                       _mm512_xor_si512( c->st[ c->pt ],
+                                         m512_const1_64( 6 ) );
+    c->st[ c->rsiz / 8 - 1 ] =
+                       _mm512_xor_si512( c->st[ c->rsiz / 8 - 1 ],
+                                         m512_const1_64( 0x8000000000000000 ) );
+    sha3_8way_keccakf( c->st );
+    memcpy( md, c->st, c->mdlen * 8 );
+    return 1;
+}
+
+// compute a SHA-3 hash (md) of given byte length from "in"
+
+void *sha3_8way( const void *in, size_t inlen, void *md, int mdlen )
+{
+    sha3_8way_ctx_t sha3;
+    sha3_8way_init( &sha3, mdlen);
+    sha3_8way_update( &sha3, in, inlen );
+    sha3_8way_final( md, &sha3 );
+    return md;
+}
+
+#endif  // AVX512
+#endif  // AVX2
--- a/algo/verthash/tiny_sha3/sha3-4way.h
+++ b/algo/verthash/tiny_sha3/sha3-4way.h
@@ -0,0 +1,67 @@
+// sha3-4way.h
+// 19-Nov-11  Markku-Juhani O. Saarinen <mjos@iki.fi>
+// 2021-03-27 JayDDee
+//
+#ifndef SHA3_4WAY_H
+#define SHA3_4WAY_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include "simd-utils.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#ifndef KECCAKF_ROUNDS
+#define KECCAKF_ROUNDS 24
+#endif
+
+#if defined(__AVX2__)
+
+typedef struct
+{
+   __m256i st[25];                     // Keccak state: 25 64-bit words * 4 lanes
+    int pt, rsiz, mdlen;                    // pt: next state word index; rsiz: rate in bytes; mdlen: digest bytes
+} sha3_4way_ctx_t __attribute__ ((aligned (64)));
+
+// Compression function.
+void sha3_4way_keccakf( __m256i st[25] );
+
+// OpenSSL-like interface
+int sha3_4way_init( sha3_4way_ctx_t *c, int mdlen );    // mdlen = hash output in bytes
+int sha3_4way_update( sha3_4way_ctx_t *c, const void *data, size_t len );
+int sha3_4way_final( void *md, sha3_4way_ctx_t *c );    // digest goes to md
+
+// compute a sha3 hash (md) of given byte length from "in"
+void *sha3_4way( const void *in, size_t inlen, void *md, int mdlen );
+
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+// state context
+typedef struct
+{
+   __m512i st[25];                     // Keccak state: 25 64-bit words * 8 lanes
+    int pt, rsiz, mdlen;                    // pt: next state word index; rsiz: rate in bytes; mdlen: digest bytes
+} sha3_8way_ctx_t __attribute__ ((aligned (64)));
+
+// Compression function.
+void sha3_8way_keccakf( __m512i st[25] );
+
+// OpenSSL-like interface
+int sha3_8way_init( sha3_8way_ctx_t *c, int mdlen );    // mdlen = hash output in bytes
+int sha3_8way_update( sha3_8way_ctx_t *c, const void *data, size_t len );
+int sha3_8way_final( void *md, sha3_8way_ctx_t *c );    // digest goes to md
+
+// compute a sha3 hash (md) of given byte length from "in"
+void *sha3_8way( const void *in, size_t inlen, void *md, int mdlen );
+
+#endif // AVX512
+#endif // AVX2
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
--- a/algo/verthash/tiny_sha3/sha3.c
+++ b/algo/verthash/tiny_sha3/sha3.c
@@ -5,6 +5,7 @@
 // Revised 03-Sep-15 for portability + OpenSSL - style API

 #include "sha3.h"
+#include <string.h>

 // update the state with given number of rounds

@@ -21,6 +22,7 @@ void sha3_keccakf(uint64_t st[25])
        0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
        0x8000000000008080, 0x0000000080000001, 0x8000000080008008
    };
+/*
    const int keccakf_rotc[24] = {
        1,  3,  6,  10, 15, 21, 28, 36, 45, 55, 2,  14,
        27, 41, 56, 8,  25, 43, 62, 18, 39, 61, 20, 44
@@ -29,6 +31,7 @@ void sha3_keccakf(uint64_t st[25])
        10, 7,  11, 17, 18, 3, 5,  16, 8,  21, 24, 4,
        15, 23, 19, 13, 12, 2, 20, 14, 22, 9,  6,  1
    };
+*/

    // variables
    int i, j, r;
@@ -60,14 +63,50 @@ void sha3_keccakf(uint64_t st[25])
                st[j + i] ^= t;
        }

+        
        // Rho Pi
+#define RHO_PI( i, c ) \
+   bc[0] = st[ i ]; \
+   st[ i ] = ROTL64( t, c ); \
+   t = bc[0]
+
        t = st[1];
+
+        RHO_PI( 10,  1 );
+        RHO_PI(  7,  3 );
+        RHO_PI( 11,  6 );
+        RHO_PI( 17, 10 );
+        RHO_PI( 18, 15 );
+        RHO_PI(  3, 21 );
+        RHO_PI(  5, 28 );
+        RHO_PI( 16, 36 );
+        RHO_PI(  8, 45 );
+        RHO_PI( 21, 55 );
+        RHO_PI( 24,  2 );
+        RHO_PI(  4, 14 );
+        RHO_PI( 15, 27 );
+        RHO_PI( 23, 41 );
+        RHO_PI( 19, 56 );
+        RHO_PI( 13,  8 );
+        RHO_PI( 12, 25 );
+        RHO_PI(  2, 43 );
+        RHO_PI( 20, 62 );
+        RHO_PI( 14, 18 );
+        RHO_PI( 22, 39 );
+        RHO_PI(  9, 61 );
+        RHO_PI(  6, 20 );
+        RHO_PI(  1, 44 );
+
+#undef RHO_PI        
+
+/*        
        for (i = 0; i < 24; i++) {
            j = keccakf_piln[i];
            bc[0] = st[j];
            st[j] = ROTL64(t, keccakf_rotc[i]);
            t = bc[0];
        }
+*/

        //  Chi
        for (j = 0; j < 25; j += 5) {
@@ -118,17 +157,20 @@ int sha3_init(sha3_ctx_t *c, int mdlen)
 int sha3_update(sha3_ctx_t *c, const void *data, size_t len)
 {
    size_t i;
-    int j;
+    int j = c->pt / 8;   // c->pt stays a byte offset; work in 64-bit word indices here
+    const int rsiz = c->rsiz / 8;   // rate in 64-bit words
+    const size_t l = len / 8;   // whole 64-bit words only; any len % 8 remainder is ignored. size_t matches i and avoids narrowing len

-    j = c->pt;
-    for (i = 0; i < len; i++) {
-        c->st.b[j++] ^= ((const uint8_t *) data)[i];
-        if (j >= c->rsiz) {
-            sha3_keccakf(c->st.q);
+    for ( i = 0; i < l; i++ )
+    {
+        c->st.q[ j++ ] ^= ( ((const uint64_t *) data) [i] );   // data is read as 64-bit words
+        if ( j >= rsiz )   // rate block full: permute and start over
+        {
+            sha3_keccakf( c->st.q );
            j = 0;
        }
    }
-    c->pt = j;
+    c->pt = j*8;   // store back as a byte offset

    return 1;
 }
@@ -137,16 +179,10 @@ int sha3_update(sha3_ctx_t *c, const void *data, size_t len)

 int sha3_final(void *md, sha3_ctx_t *c)
 {
-    int i;
-
-    c->st.b[c->pt] ^= 0x06;
-    c->st.b[c->rsiz - 1] ^= 0x80;
+    c->st.q[ c->pt / 8 ] ^= 6;
+    c->st.q[ c->rsiz / 8 - 1 ] ^= 0x8000000000000000;
    sha3_keccakf(c->st.q);
-
-    for (i = 0; i < c->mdlen; i++) {
-        ((uint8_t *) md)[i] = c->st.b[i];
-    }
-
+    memcpy( md, c->st.q, c->mdlen );
    return 1;
 }

@@ -155,7 +191,6 @@ int sha3_final(void *md, sha3_ctx_t *c)
 void *sha3(const void *in, size_t inlen, void *md, int mdlen)
 {
    sha3_ctx_t sha3;
-
    sha3_init(&sha3, mdlen);
    sha3_update(&sha3, in, inlen);
    sha3_final(md, &sha3);
--- a/algo/verthash/verthash-gate.c
+++ b/algo/verthash/verthash-gate.c
@@ -1,6 +1,7 @@
 #include "algo-gate-api.h"
 #include "algo/sha/sph_sha2.h"
 #include "Verthash.h"
+#include "tiny_sha3/sha3-4way.h"

 static verthash_info_t verthashInfo;

@@ -12,6 +13,82 @@ static const uint8_t verthashDatFileHash_bytes[32] =
  0x29, 0xec, 0xf8, 0x8f, 0x8a, 0xd4, 0x76, 0x39,
  0xb6, 0xed, 0xed, 0xaf, 0xd7, 0x21, 0xaa, 0x48 };

+#if defined(__AVX2__)
+
+static __thread sha3_4way_ctx_t sha3_mid_ctxA;
+static __thread sha3_4way_ctx_t sha3_mid_ctxB;
+
+#else
+
+static __thread sha3_ctx_t sha3_mid_ctx[8];
+
+#endif
+
+void verthash_sha3_512_prehash_72( const void *input )   // absorb the first 72 bytes of 8 header variants into thread-local SHA3 midstates (mdlen 64)
+{
+#if defined(__AVX2__)
+   
+   __m256i vin[10];
+   mm256_intrlv80_4x64( vin, input );   // presumably interleaves the 80-byte input into 4 x 64-bit lanes; helper is defined elsewhere
+
+   sha3_4way_init( &sha3_mid_ctxA, 64 );
+   sha3_4way_init( &sha3_mid_ctxB, 64 );
+
+   vin[0] = _mm256_add_epi8( vin[0], _mm256_set_epi64x( 4,3,2,1 ) );   // adds 1,2,3,4 to the low byte of the four 64-bit elements of vin[0]
+   sha3_4way_update( &sha3_mid_ctxA, vin, 72 );
+
+   vin[0] = _mm256_add_epi8( vin[0], _mm256_set1_epi64x( 4 ) );   // 4 more per element: totals become 5,6,7,8
+   sha3_4way_update( &sha3_mid_ctxB, vin, 72 );
+
+#else
+
+   char in[80] __attribute__ ((aligned (64)));
+   memcpy( in, input, 80 );   
+   for ( int i = 0; i < 8; i++ )
+   {
+      in[0] += 1;   // cumulative: context i sees the first byte incremented by i+1
+      sha3_init( &sha3_mid_ctx[i], 64 );
+      sha3_update( &sha3_mid_ctx[i], in, 72 );   // the last 8 of the 80 bytes are supplied later by verthash_sha3_512_final_8
+   }
+
+#endif
+}
+
+void verthash_sha3_512_final_8( void *hash, const uint64_t nonce )   // finish the 8 saved midstates with nonce; writes 8 x 64 bytes to hash
+{
+#if defined(__AVX2__)
+   unsigned char *out = hash;   // byte pointer: arithmetic on void* is a GNU extension, not ISO C
+    __m256i vhashA[ 10 ] __attribute__ ((aligned (64)));
+    __m256i vhashB[ 10 ] __attribute__ ((aligned (64)));
+
+   sha3_4way_ctx_t ctx;
+   __m256i vnonce = _mm256_set1_epi64x( nonce );   // same nonce word in all 4 lanes
+
+   memcpy( &ctx, &sha3_mid_ctxA, sizeof ctx );   // work on a copy so the saved midstate can be reused
+   sha3_4way_update( &ctx, &vnonce, 8 );
+   sha3_4way_final( vhashA, &ctx );
+
+   memcpy( &ctx, &sha3_mid_ctxB, sizeof ctx );
+   sha3_4way_update( &ctx, &vnonce, 8 );
+   sha3_4way_final( vhashB, &ctx );
+
+   dintrlv_4x64( out,     out+64,  out+128, out+192, vhashA, 512 );
+   dintrlv_4x64( out+256, out+320, out+384, out+448, vhashB, 512 );
+   
+#else
+   unsigned char *out = hash;   // byte pointer: arithmetic on void* is a GNU extension, not ISO C
+   for ( int i = 0; i < 8; i++ )
+   {
+      sha3_ctx_t ctx;
+      memcpy( &ctx, &sha3_mid_ctx[i], sizeof ctx );   // work on a copy so the saved midstate can be reused
+      sha3_update( &ctx, &nonce, 8 );
+      sha3_final( out + i*64, &ctx );
+   }
+   
+#endif
+}
+
+
 int scanhash_verthash( struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr )
 {
@@ -26,6 +103,8 @@ int scanhash_verthash( struct work *work, uint32_t max_nonce,
   const bool bench = opt_benchmark;

   mm128_bswap32_80( edata, pdata );
+   verthash_sha3_512_prehash_72( edata );
+
   do
   {
      edata[19] = n;
@@ -51,15 +130,14 @@ bool register_verthash_algo( algo_gate_t* gate )

  opt_target_factor = 256.0;
  gate->scanhash  = (void*)&scanhash_verthash;
+  gate->optimizations = AVX2_OPT;
   
-  // verthash data file
  char *verthash_data_file = opt_data_file ? opt_data_file
                                           : default_verthash_data_file;
  
   int vhLoadResult = verthash_info_init( &verthashInfo, verthash_data_file );
   if (vhLoadResult == 0) // No Error
   {
-      //  and verify data file(if it was enabled)
      if ( opt_verify )
      {
         uint8_t vhDataFileHash[32] = { 0 };
@@ -78,7 +156,6 @@ bool register_verthash_algo( algo_gate_t* gate )
      }
   }
   else
-
   {
      // Handle Verthash error codes
      if ( vhLoadResult == 1 )