cpuminer-opt — mirror of https://github.com/JayDDee/cpuminer-opt.git

Commit: v3.7.9

Makefile.am (34 lines changed)
@@ -38,7 +38,6 @@ cpuminer_SOURCES = \
algo/argon2/ar2/cores.c \
algo/argon2/ar2/ar2-scrypt-jane.c \
algo/argon2/ar2/blake2b.c \
algo/axiom.c \
algo/blake/sph_blake.c \
algo/blake/blake-hash-4way.c \
algo/blake/blake-gate.c \
@@ -56,6 +55,7 @@ cpuminer_SOURCES = \
algo/blake/pentablake-4way.c \
algo/blake/pentablake.c \
algo/bmw/sph_bmw.c \
algo/bmw/bmw-hash-4way.c \
algo/bmw/bmw256.c \
algo/cryptonight/cryptolight.c \
algo/cryptonight/cryptonight-common.c\
@@ -63,10 +63,8 @@ cpuminer_SOURCES = \
algo/cryptonight/cryptonight.c\
algo/cubehash/sph_cubehash.c \
algo/cubehash/sse2/cubehash_sse2.c\
algo/drop.c \
algo/echo/sph_echo.c \
algo/echo/aes_ni/hash.c\
algo/fresh.c \
algo/gost/sph_gost.c \
algo/groestl/sph_groestl.c \
algo/groestl/groestl.c \
@@ -113,9 +111,8 @@ cpuminer_SOURCES = \
algo/nist5/nist5-gate.c \
algo/nist5/nist5-4way.c \
algo/nist5/nist5.c \
algo/nist5/zr5.c \
algo/pluck.c \
algo/polytimos/polytimos-gate.c \
algo/polytimos/polytimos.c \
algo/quark/quark.c \
algo/qubit/qubit.c \
algo/qubit/deep.c \
@@ -127,6 +124,7 @@ cpuminer_SOURCES = \
algo/sha/sha2.c \
algo/sha/sha256t.c \
algo/shabal/sph_shabal.c \
algo/shabal/shabal-hash-4way.c \
algo/shavite/sph_shavite.c \
algo/shavite/sph-shavite-aesni.c \
algo/shavite/shavite.c \
@@ -141,15 +139,10 @@ cpuminer_SOURCES = \
algo/skein/skein2.c \
algo/skein/skein2-4way.c \
algo/skein/skein2-gate.c \
algo/skunk.c \
algo/sm3/sm3.c \
algo/tiger/sph_tiger.c \
algo/timetravel.c \
algo/timetravel10.c \
algo/tribus/tribus-gate.c \
algo/tribus/tribus.c \
algo/tribus/tribus-4way.c \
algo/veltor.c \
algo/whirlpool/sph_whirlpool.c \
algo/whirlpool/whirlpool-hash-4way.c \
algo/whirlpool/whirlpool-gate.c \
@@ -165,6 +158,10 @@ cpuminer_SOURCES = \
algo/x11/c11-gate.c \
algo/x11/c11.c \
algo/x11/c11-4way.c \
algo/x11/tribus-gate.c \
algo/x11/tribus.c \
algo/x11/tribus-4way.c \
algo/x11/fresh.c \
algo/x11/x11evo.c \
algo/x13/x13-gate.c \
algo/x13/x13.c \
@@ -175,9 +172,20 @@ cpuminer_SOURCES = \
algo/x13/phi1612-gate.c \
algo/x13/phi1612.c \
algo/x13/phi1612-4way.c \
algo/x13/skunk-gate.c \
algo/x13/skunk-4way.c \
algo/x13/skunk.c \
algo/x13/drop.c \
algo/x14/x14-gate.c \
algo/x14/x14.c \
algo/x14/x14-4way.c \
algo/x14/veltor-gate.c \
algo/x14/veltor.c \
algo/x14/veltor-4way.c \
algo/x14/polytimos-gate.c \
algo/x14/polytimos.c \
algo/x14/polytimos-4way.c \
algo/x14/axiom.c \
algo/x15/x15-gate.c \
algo/x15/x15.c \
algo/x15/x15-4way.c \
@@ -189,10 +197,8 @@ cpuminer_SOURCES = \
algo/x17/xevan-4way.c \
algo/x17/hmq1725.c \
algo/yescrypt/yescrypt.c \
algo/yescrypt/sha256_Y.c\
algo/yescrypt/yescrypt-simd.c\
algo/zr5.c

algo/yescrypt/sha256_Y.c \
algo/yescrypt/yescrypt-simd.c

disable_flags =
@@ -68,7 +68,7 @@ Supported Algorithms
timetravel10 Bitcore
tribus Denarius (DNR)
vanilla blake256r8vnl (VCash)
veltor
veltor (VLT)
whirlpool
whirlpoolx
x11 Dash
@@ -81,6 +81,7 @@ Supported Algorithms
x17
xevan Bitsend
yescrypt Globalboost-Y (BSTY)
yescryptr8 BitZeny (ZNY)
yescryptr16 Yenten (YTN)
zr5 Ziftr

@@ -165,6 +165,13 @@ Support for even older x86_64 without AES_NI or SSE2 is not available.
Change Log
----------

v3.7.9

Partial 4way optimizations for veltor, skunk, polytimos, lyra2z.
Additional 4way optimizations for X algos.
New algo yescryptr8 for BitZeny, not to be confused with the original
yescrypt Globalboost-Y.
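In this codebase, "4way" means four inputs are hashed in one pass: the four input streams are interleaved word by word so each SIMD lane carries one stream, the layout produced and consumed by helpers such as mm_interleave_4x32 and mm_deinterleave_4x32 seen later in this commit. Below is a minimal plain-C sketch of that layout; the helper names in it are illustrative stand-ins, not the project's SSE2/AVX2 routines.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Interleave four word streams into one buffer laid out
   s0[0], s1[0], s2[0], s3[0], s0[1], s1[1], ...  Each group of four
   consecutive words is what one 128-bit vector holds in 4way code. */
static void interleave_4x32( uint32_t *dst, const uint32_t *s0,
                             const uint32_t *s1, const uint32_t *s2,
                             const uint32_t *s3, size_t nwords )
{
   for ( size_t i = 0; i < nwords; i++ )
   {
      dst[ 4*i + 0 ] = s0[i];
      dst[ 4*i + 1 ] = s1[i];
      dst[ 4*i + 2 ] = s2[i];
      dst[ 4*i + 3 ] = s3[i];
   }
}

/* Pull one lane's words back out, e.g. to recover one of the four
   hashes after the vectorised rounds have run. */
static void deinterleave_lane_4x32( uint32_t *dst, const uint32_t *src,
                                    int lane, size_t nwords )
{
   for ( size_t i = 0; i < nwords; i++ )
      dst[i] = src[ 4*i + lane ];
}

int main(void)
{
   uint32_t in[4][20];    /* four 80-byte block headers, one per lane */
   uint32_t vdata[20*4];  /* interleaved view used by a 4way hash     */
   uint32_t lane2[20];

   for ( int l = 0; l < 4; l++ )
      for ( int i = 0; i < 20; i++ )
         in[l][i] = (uint32_t)( l * 100 + i );

   interleave_4x32( vdata, in[0], in[1], in[2], in[3], 20 );
   deinterleave_lane_4x32( lane2, vdata, 2, 20 );

   printf( "%s\n", memcmp( lane2, in[2], sizeof lane2 ) == 0
                   ? "lane 2 round-trips" : "mismatch" );
   return 0;
}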
v3.7.8

Partial 4way optimization for most X algos including c11, xevan, phi, hsr
@@ -219,6 +219,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_X17: register_x17_algo ( gate ); break;
case ALGO_XEVAN: register_xevan_algo ( gate ); break;
case ALGO_YESCRYPT: register_yescrypt_algo ( gate ); break;
case ALGO_YESCRYPTR8: register_yescryptr8_algo ( gate ); break;
case ALGO_YESCRYPTR16: register_yescryptr16_algo ( gate ); break;
case ALGO_ZR5: register_zr5_algo ( gate ); break;
default:
@@ -278,6 +279,7 @@ const char* const algo_alias_map[][2] =
{
// alias proper
{ "bitcore", "timetravel10" },
{ "bitzeny", "yescryptr8" },
{ "blake256r8", "blakecoin" },
{ "blake256r8vnl", "vanilla" },
{ "blake256r14", "blake" },
@@ -300,10 +302,9 @@ const char* const algo_alias_map[][2] =
// { "sia", "blake2b" },
{ "sib", "x11gost" },
{ "timetravel8", "timetravel" },
{ "yes", "yescrypt" },
{ "ziftr", "zr5" },
{ "yenten", "yescryptr16" },
{ "yescryptr8", "yescrypt" },
{ "yescryptr8k", "yescrypt" },
{ "zcoin", "lyra2z" },
{ "zoin", "lyra2z330" },
{ NULL, NULL }
@@ -36,7 +36,6 @@
#include <string.h>
#include <limits.h>

//#include "sph_blake.h"
#include "blake-hash-4way.h"

#ifdef __cplusplus
@@ -98,18 +97,6 @@ static const unsigned sigma[16][16] = {
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
};

/*
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
14 10 4 8 9 15 13 6 1 12 0 2 11 7 5 3
11 8 12 0 5 2 15 13 10 14 3 6 7 1 9 4
7 9 3 1 13 12 11 14 2 6 5 10 4 0 15 8
9 0 5 7 2 4 10 15 14 1 11 12 6 8 3 13
2 12 6 10 0 11 8 3 4 13 7 5 15 14 1 9
12 5 1 15 14 13 4 10 0 7 6 3 9 2 8 11
13 11 7 14 12 1 3 9 5 0 15 4 8 6 2 10
6 15 14 9 11 3 0 8 12 2 13 7 1 4 10 5
10 2 8 4 7 6 1 5 15 11 9 14 3 12 13 0
*/
#endif

#define Z00 0
@@ -914,34 +901,29 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,

ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
// unsigned z = 0x80 >> n;
// unsigned zz = ((ub & -z) | z) & 0xFF;
// u.buf[ptr>>2] = _mm_set_epi32( zz, zz, zz, zz );
u.buf[ptr>>2] = _mm_set1_epi32( 0x80 );
tl = sc->T0 + bit_len;
th = sc->T1;

if ( ptr == 0 )
{
sc->T0 = SPH_C32(0xFFFFFE00);
sc->T1 = SPH_C32(0xFFFFFFFF);
sc->T0 = SPH_C32(0xFFFFFE00UL);
sc->T1 = SPH_C32(0xFFFFFFFFUL);
}
else if ( sc->T0 == 0 )
{
sc->T0 = SPH_C32(0xFFFFFE00) + bit_len;
sc->T0 = SPH_C32(0xFFFFFE00UL) + bit_len;
sc->T1 = SPH_T32(sc->T1 - 1);
}
else
sc->T0 -= 512 - bit_len;

// if ( ptr <= 48 )
if ( ptr <= 52 )
{
memset_zero_128( u.buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
// memset_zero_128( u.buf + (ptr>>2) + 1, (48 - ptr) >> 2 );
if (out_size_w32 == 8)
u.buf[52>>2] = _mm_or_si128( u.buf[52>>2],
_mm_set1_epi32( 0x010000000 ) );
_mm_set1_epi32( 0x01000000UL ) );
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set1_epi32( tl ) );
blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
@@ -950,11 +932,11 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
{
memset_zero_128( u.buf + (ptr>>2) + 1, (60-ptr) >> 2 );
blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
sc->T0 = SPH_C32(0xFFFFFE00);
sc->T1 = SPH_C32(0xFFFFFFFF);
sc->T0 = SPH_C32(0xFFFFFE00UL);
sc->T1 = SPH_C32(0xFFFFFFFFUL);
memset_zero_128( u.buf, 56>>2 );
if (out_size_w32 == 8)
u.buf[52>>2] = _mm_set1_epi32( 0x010000000 );
u.buf[52>>2] = _mm_set1_epi32( 0x01000000UL );
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set1_epi32( tl ) );
blake32_4way( sc, u.buf, 64 );
@@ -962,7 +944,6 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
out = (__m128i*)dst;
for ( k = 0; k < out_size_w32; k++ )
out[k] = mm_byteswap_32( sc->H[k] );
// out[k] = sc->H[k];
}

#if defined (__AVX2__)
@@ -975,9 +956,9 @@ blake64_4way_init( blake_4way_big_context *sc, const sph_u64 *iv,
{
int i;
for ( i = 0; i < 8; i++ )
sc->H[i] = _mm256_set_epi64x( iv[i], iv[i], iv[i], iv[i] );
sc->H[i] = _mm256_set1_epi64x( iv[i] );
for ( i = 0; i < 4; i++ )
sc->S[i] = _mm256_set_epi64x( salt[i], salt[i], salt[i], salt[i] );
sc->S[i] = _mm256_set1_epi64x( salt[i] );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
}
@@ -1049,12 +1030,12 @@ blake64_4way_close( blake_4way_big_context *sc,
th = sc->T1;
if (ptr == 0 )
{
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF);
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
}
else if ( sc->T0 == 0 )
{
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00) + bit_len;
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL) + bit_len;
sc->T1 = SPH_T64(sc->T1 - 1);
}
else
@@ -1066,10 +1047,7 @@ blake64_4way_close( blake_4way_big_context *sc,
memset_zero_256( u.buf + (ptr>>3) + 1, (104-ptr) >> 3 );
if ( out_size_w64 == 8 )
u.buf[(104>>3)] = _mm256_or_si256( u.buf[(104>>3)],
_mm256_set_epi64x( 0x0100000000000000,
0x0100000000000000,
0x0100000000000000,
0x0100000000000000 ) );
_mm256_set1_epi64x( 0x0100000000000000ULL ) );
*(u.buf+(112>>3)) = mm256_byteswap_64(
_mm256_set_epi64x( th, th, th, th ) );
*(u.buf+(120>>3)) = mm256_byteswap_64(
@@ -1082,15 +1060,11 @@ blake64_4way_close( blake_4way_big_context *sc,
memset_zero_256( u.buf + (ptr>>3) + 1, (120 - ptr) >> 3 );

blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr );
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF);
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
memset_zero_256( u.buf, 112>>3 );
if ( out_size_w64 == 8 )
u.buf[104>>3] = _mm256_set_epi64x( 0x0100000000000000,
0x0100000000000000,
0x0100000000000000,
0x0100000000000000 );

u.buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
*(u.buf+(112>>3)) = mm256_byteswap_64(
_mm256_set_epi64x( th, th, th, th ) );
*(u.buf+(120>>3)) = mm256_byteswap_64(
algo/bmw/bmw-hash-4way.c (new file, 969 lines)
@@ -0,0 +1,969 @@
/* $Id: bmw.c 227 2010-06-16 17:28:38Z tp $ */
|
||||
/*
|
||||
* BMW implementation.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
#include <limits.h>
|
||||
#include "bmw-hash-4way.h"
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
//#include "sph_bmw.h"
|
||||
|
||||
//#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BMW
|
||||
#define SPH_SMALL_FOOTPRINT_BMW 1
|
||||
//#endif
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning (disable: 4146)
|
||||
#endif
|
||||
|
||||
//#undef SPH_ROTL64
|
||||
//#define SPH_ROTL64(x,n) (((x) << (n)) | ((x) >> (64 - (n))))
|
||||
//#define SPH_ROTL64(x,n) mm256_rotl_64(x,n)
|
||||
|
||||
static const sph_u32 IV256[] = {
|
||||
SPH_C32(0x40414243), SPH_C32(0x44454647),
|
||||
SPH_C32(0x48494A4B), SPH_C32(0x4C4D4E4F),
|
||||
SPH_C32(0x50515253), SPH_C32(0x54555657),
|
||||
SPH_C32(0x58595A5B), SPH_C32(0x5C5D5E5F),
|
||||
SPH_C32(0x60616263), SPH_C32(0x64656667),
|
||||
SPH_C32(0x68696A6B), SPH_C32(0x6C6D6E6F),
|
||||
SPH_C32(0x70717273), SPH_C32(0x74757677),
|
||||
SPH_C32(0x78797A7B), SPH_C32(0x7C7D7E7F)
|
||||
};
|
||||
|
||||
#if SPH_64
|
||||
|
||||
static const sph_u64 IV512[] = {
|
||||
SPH_C64(0x8081828384858687), SPH_C64(0x88898A8B8C8D8E8F),
|
||||
SPH_C64(0x9091929394959697), SPH_C64(0x98999A9B9C9D9E9F),
|
||||
SPH_C64(0xA0A1A2A3A4A5A6A7), SPH_C64(0xA8A9AAABACADAEAF),
|
||||
SPH_C64(0xB0B1B2B3B4B5B6B7), SPH_C64(0xB8B9BABBBCBDBEBF),
|
||||
SPH_C64(0xC0C1C2C3C4C5C6C7), SPH_C64(0xC8C9CACBCCCDCECF),
|
||||
SPH_C64(0xD0D1D2D3D4D5D6D7), SPH_C64(0xD8D9DADBDCDDDEDF),
|
||||
SPH_C64(0xE0E1E2E3E4E5E6E7), SPH_C64(0xE8E9EAEBECEDEEEF),
|
||||
SPH_C64(0xF0F1F2F3F4F5F6F7), SPH_C64(0xF8F9FAFBFCFDFEFF)
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
#define XCAT(x, y) XCAT_(x, y)
|
||||
#define XCAT_(x, y) x ## y
|
||||
|
||||
#define LPAR (
|
||||
|
||||
/*
|
||||
#define ss0(x) (((x) >> 1) ^ SPH_T32((x) << 3) \
|
||||
^ SPH_ROTL32(x, 4) ^ SPH_ROTL32(x, 19))
|
||||
#define ss1(x) (((x) >> 1) ^ SPH_T32((x) << 2) \
|
||||
^ SPH_ROTL32(x, 8) ^ SPH_ROTL32(x, 23))
|
||||
#define ss2(x) (((x) >> 2) ^ SPH_T32((x) << 1) \
|
||||
^ SPH_ROTL32(x, 12) ^ SPH_ROTL32(x, 25))
|
||||
#define ss3(x) (((x) >> 2) ^ SPH_T32((x) << 2) \
|
||||
^ SPH_ROTL32(x, 15) ^ SPH_ROTL32(x, 29))
|
||||
#define ss4(x) (((x) >> 1) ^ (x))
|
||||
#define ss5(x) (((x) >> 2) ^ (x))
|
||||
#define rs1(x) SPH_ROTL32(x, 3)
|
||||
#define rs2(x) SPH_ROTL32(x, 7)
|
||||
#define rs3(x) SPH_ROTL32(x, 13)
|
||||
#define rs4(x) SPH_ROTL32(x, 16)
|
||||
#define rs5(x) SPH_ROTL32(x, 19)
|
||||
#define rs6(x) SPH_ROTL32(x, 23)
|
||||
#define rs7(x) SPH_ROTL32(x, 27)
|
||||
|
||||
#define Ks(j) SPH_T32((sph_u32)(j) * SPH_C32(0x05555555))
|
||||
|
||||
#define add_elt_s(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \
|
||||
(SPH_T32(SPH_ROTL32(mf(j0m), j1m) + SPH_ROTL32(mf(j3m), j4m) \
|
||||
- SPH_ROTL32(mf(j10m), j11m) + Ks(j16)) ^ hf(j7m))
|
||||
|
||||
#define expand1s_inner(qf, mf, hf, i16, \
|
||||
i0, i1, i2, i3, i4, i5, i6, i7, i8, \
|
||||
i9, i10, i11, i12, i13, i14, i15, \
|
||||
i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
|
||||
SPH_T32(ss1(qf(i0)) + ss2(qf(i1)) + ss3(qf(i2)) + ss0(qf(i3)) \
|
||||
+ ss1(qf(i4)) + ss2(qf(i5)) + ss3(qf(i6)) + ss0(qf(i7)) \
|
||||
+ ss1(qf(i8)) + ss2(qf(i9)) + ss3(qf(i10)) + ss0(qf(i11)) \
|
||||
+ ss1(qf(i12)) + ss2(qf(i13)) + ss3(qf(i14)) + ss0(qf(i15)) \
|
||||
+ add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
|
||||
|
||||
#define expand1s(qf, mf, hf, i16) \
|
||||
expand1s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
|
||||
#define expand1s_(qf, mf, hf, i16, ix, iy) \
|
||||
expand1s_inner LPAR qf, mf, hf, i16, ix, iy)
|
||||
|
||||
#define expand2s_inner(qf, mf, hf, i16, \
|
||||
i0, i1, i2, i3, i4, i5, i6, i7, i8, \
|
||||
i9, i10, i11, i12, i13, i14, i15, \
|
||||
i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
|
||||
SPH_T32(qf(i0) + rs1(qf(i1)) + qf(i2) + rs2(qf(i3)) \
|
||||
+ qf(i4) + rs3(qf(i5)) + qf(i6) + rs4(qf(i7)) \
|
||||
+ qf(i8) + rs5(qf(i9)) + qf(i10) + rs6(qf(i11)) \
|
||||
+ qf(i12) + rs7(qf(i13)) + ss4(qf(i14)) + ss5(qf(i15)) \
|
||||
+ add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
|
||||
|
||||
#define expand2s(qf, mf, hf, i16) \
|
||||
expand2s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
|
||||
#define expand2s_(qf, mf, hf, i16, ix, iy) \
|
||||
expand2s_inner LPAR qf, mf, hf, i16, ix, iy)
|
||||
*/
|
||||
#if SPH_64
|
||||
|
||||
#define sb0(x) \
|
||||
_mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 1), \
|
||||
_mm256_slli_epi64( (x), 3) ), \
|
||||
_mm256_xor_si256( mm256_rotl_64( (x), 4), \
|
||||
mm256_rotl_64( (x), 37) ) )
|
||||
|
||||
#define sb1(x) \
|
||||
_mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 1), \
|
||||
_mm256_slli_epi64( (x), 2) ), \
|
||||
_mm256_xor_si256( mm256_rotl_64( (x), 13), \
|
||||
mm256_rotl_64( (x), 43) ) )
|
||||
|
||||
#define sb2(x) \
|
||||
_mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 2), \
|
||||
_mm256_slli_epi64( (x), 1) ), \
|
||||
_mm256_xor_si256( mm256_rotl_64( (x), 19), \
|
||||
mm256_rotl_64( (x), 53) ) )
|
||||
|
||||
#define sb3(x) \
|
||||
_mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 2), \
|
||||
_mm256_slli_epi64( (x), 2) ), \
|
||||
_mm256_xor_si256( mm256_rotl_64( (x), 28), \
|
||||
mm256_rotl_64( (x), 59) ) )
|
||||
|
||||
#define sb4(x) \
|
||||
_mm256_xor_si256( (x), _mm256_srli_epi64( (x), 1 ) )
|
||||
|
||||
#define sb5(x) \
|
||||
_mm256_xor_si256( (x), _mm256_srli_epi64( (x), 2 ) )
|
||||
|
||||
#define rb1(x) mm256_rotl_64( x, 5 )
|
||||
#define rb2(x) mm256_rotl_64( x, 11 )
|
||||
#define rb3(x) mm256_rotl_64( x, 27 )
|
||||
#define rb4(x) mm256_rotl_64( x, 32 )
|
||||
#define rb5(x) mm256_rotl_64( x, 37 )
|
||||
#define rb6(x) mm256_rotl_64( x, 43 )
|
||||
#define rb7(x) mm256_rotl_64( x, 53 )
|
||||
|
||||
#define rol_off( M, j, off ) \
|
||||
mm256_rotl_64( M[ ( (j) + (off) ) & 15 ] , \
|
||||
( ( (j) + (off) ) & 15 ) + 1 )
|
||||
|
||||
#define add_elt_b( M, H, j ) \
|
||||
_mm256_xor_si256( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_add_epi64( rol_off( M, j, 0 ), \
|
||||
rol_off( M, j, 3 ) ), \
|
||||
rol_off( M, j, 10 ) ), \
|
||||
_mm256_set1_epi64x( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \
|
||||
H[ ( (j)+7 ) & 15 ] )
|
||||
|
||||
#define expand1b( qt, M, H, i ) \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( sb1( qt[ (i)-16 ] ), \
|
||||
sb2( qt[ (i)-15 ] ) ), \
|
||||
_mm256_add_epi64( sb3( qt[ (i)-14 ] ), \
|
||||
sb0( qt[ (i)-13 ] ) ) ), \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( sb1( qt[ (i)-12 ] ), \
|
||||
sb2( qt[ (i)-11 ] ) ), \
|
||||
_mm256_add_epi64( sb3( qt[ (i)-10 ] ), \
|
||||
sb0( qt[ (i)- 9 ] ) ) ) ), \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( sb1( qt[ (i)- 8 ] ), \
|
||||
sb2( qt[ (i)- 7 ] ) ), \
|
||||
_mm256_add_epi64( sb3( qt[ (i)- 6 ] ), \
|
||||
sb0( qt[ (i)- 5 ] ) ) ), \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( sb1( qt[ (i)- 4 ] ), \
|
||||
sb2( qt[ (i)- 3 ] ) ), \
|
||||
_mm256_add_epi64( sb3( qt[ (i)- 2 ] ), \
|
||||
sb0( qt[ (i)- 1 ] ) ) ) ) ), \
|
||||
add_elt_b( M, H, (i)-16 ) )
|
||||
|
||||
#define expand2b( qt, M, H, i) \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( qt[ (i)-16 ], rb1( qt[ (i)-15 ] ) ), \
|
||||
_mm256_add_epi64( qt[ (i)-14 ], rb2( qt[ (i)-13 ] ) ) ), \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( qt[ (i)-12 ], rb3( qt[ (i)-11 ] ) ), \
|
||||
_mm256_add_epi64( qt[ (i)-10 ], rb4( qt[ (i)- 9 ] ) ) ) ), \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( qt[ (i)- 8 ], rb5( qt[ (i)- 7 ] ) ), \
|
||||
_mm256_add_epi64( qt[ (i)- 6 ], rb6( qt[ (i)- 5 ] ) ) ), \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( qt[ (i)- 4 ], rb7( qt[ (i)- 3 ] ) ), \
|
||||
_mm256_add_epi64( sb4( qt[ (i)- 2 ] ), \
|
||||
sb5( qt[ (i)- 1 ] ) ) ) ) ), \
|
||||
add_elt_b( M, H, (i)-16 ) )
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
#define MAKE_W( i0, op01, i1, op12, i2, op23, i3, op34, i4) \
|
||||
((M(i0) ^ H(i0)) op01 (M(i1) ^ H(i1)) op12 (M(i2) ^ H(i2)) \
|
||||
op23 (M(i3) ^ H(i3)) op34 (M(i4) ^ H(i4)))
|
||||
*/
|
||||
|
||||
/*
|
||||
#define Ws0 MAKE_W(SPH_T32, 5, -, 7, +, 10, +, 13, +, 14)
|
||||
#define Ws1 MAKE_W(SPH_T32, 6, -, 8, +, 11, +, 14, -, 15)
|
||||
#define Ws2 MAKE_W(SPH_T32, 0, +, 7, +, 9, -, 12, +, 15)
|
||||
#define Ws3 MAKE_W(SPH_T32, 0, -, 1, +, 8, -, 10, +, 13)
|
||||
#define Ws4 MAKE_W(SPH_T32, 1, +, 2, +, 9, -, 11, -, 14)
|
||||
#define Ws5 MAKE_W(SPH_T32, 3, -, 2, +, 10, -, 12, +, 15)
|
||||
#define Ws6 MAKE_W(SPH_T32, 4, -, 0, -, 3, -, 11, +, 13)
|
||||
#define Ws7 MAKE_W(SPH_T32, 1, -, 4, -, 5, -, 12, -, 14)
|
||||
#define Ws8 MAKE_W(SPH_T32, 2, -, 5, -, 6, +, 13, -, 15)
|
||||
#define Ws9 MAKE_W(SPH_T32, 0, -, 3, +, 6, -, 7, +, 14)
|
||||
#define Ws10 MAKE_W(SPH_T32, 8, -, 1, -, 4, -, 7, +, 15)
|
||||
#define Ws11 MAKE_W(SPH_T32, 8, -, 0, -, 2, -, 5, +, 9)
|
||||
#define Ws12 MAKE_W(SPH_T32, 1, +, 3, -, 6, -, 9, +, 10)
|
||||
#define Ws13 MAKE_W(SPH_T32, 2, +, 4, +, 7, +, 10, +, 11)
|
||||
#define Ws14 MAKE_W(SPH_T32, 3, -, 5, +, 8, -, 11, -, 12)
|
||||
#define Ws15 MAKE_W(SPH_T32, 12, -, 4, -, 6, -, 9, +, 13)
|
||||
|
||||
#if SPH_SMALL_FOOTPRINT_BMW
|
||||
|
||||
#define MAKE_Qas do { \
|
||||
unsigned u; \
|
||||
sph_u32 Ws[16]; \
|
||||
Ws[ 0] = Ws0; \
|
||||
Ws[ 1] = Ws1; \
|
||||
Ws[ 2] = Ws2; \
|
||||
Ws[ 3] = Ws3; \
|
||||
Ws[ 4] = Ws4; \
|
||||
Ws[ 5] = Ws5; \
|
||||
Ws[ 6] = Ws6; \
|
||||
Ws[ 7] = Ws7; \
|
||||
Ws[ 8] = Ws8; \
|
||||
Ws[ 9] = Ws9; \
|
||||
Ws[10] = Ws10; \
|
||||
Ws[11] = Ws11; \
|
||||
Ws[12] = Ws12; \
|
||||
Ws[13] = Ws13; \
|
||||
Ws[14] = Ws14; \
|
||||
Ws[15] = Ws15; \
|
||||
for (u = 0; u < 15; u += 5) { \
|
||||
qt[u + 0] = SPH_T32(ss0(Ws[u + 0]) + H(u + 1)); \
|
||||
qt[u + 1] = SPH_T32(ss1(Ws[u + 1]) + H(u + 2)); \
|
||||
qt[u + 2] = SPH_T32(ss2(Ws[u + 2]) + H(u + 3)); \
|
||||
qt[u + 3] = SPH_T32(ss3(Ws[u + 3]) + H(u + 4)); \
|
||||
qt[u + 4] = SPH_T32(ss4(Ws[u + 4]) + H(u + 5)); \
|
||||
} \
|
||||
qt[15] = SPH_T32(ss0(Ws[15]) + H(0)); \
|
||||
} while (0)
|
||||
|
||||
#define MAKE_Qbs do { \
|
||||
qt[16] = expand1s(Qs, M, H, 16); \
|
||||
qt[17] = expand1s(Qs, M, H, 17); \
|
||||
qt[18] = expand2s(Qs, M, H, 18); \
|
||||
qt[19] = expand2s(Qs, M, H, 19); \
|
||||
qt[20] = expand2s(Qs, M, H, 20); \
|
||||
qt[21] = expand2s(Qs, M, H, 21); \
|
||||
qt[22] = expand2s(Qs, M, H, 22); \
|
||||
qt[23] = expand2s(Qs, M, H, 23); \
|
||||
qt[24] = expand2s(Qs, M, H, 24); \
|
||||
qt[25] = expand2s(Qs, M, H, 25); \
|
||||
qt[26] = expand2s(Qs, M, H, 26); \
|
||||
qt[27] = expand2s(Qs, M, H, 27); \
|
||||
qt[28] = expand2s(Qs, M, H, 28); \
|
||||
qt[29] = expand2s(Qs, M, H, 29); \
|
||||
qt[30] = expand2s(Qs, M, H, 30); \
|
||||
qt[31] = expand2s(Qs, M, H, 31); \
|
||||
} while (0)
|
||||
|
||||
#else
|
||||
|
||||
#define MAKE_Qas do { \
|
||||
qt[ 0] = SPH_T32(ss0(Ws0 ) + H( 1)); \
|
||||
qt[ 1] = SPH_T32(ss1(Ws1 ) + H( 2)); \
|
||||
qt[ 2] = SPH_T32(ss2(Ws2 ) + H( 3)); \
|
||||
qt[ 3] = SPH_T32(ss3(Ws3 ) + H( 4)); \
|
||||
qt[ 4] = SPH_T32(ss4(Ws4 ) + H( 5)); \
|
||||
qt[ 5] = SPH_T32(ss0(Ws5 ) + H( 6)); \
|
||||
qt[ 6] = SPH_T32(ss1(Ws6 ) + H( 7)); \
|
||||
qt[ 7] = SPH_T32(ss2(Ws7 ) + H( 8)); \
|
||||
qt[ 8] = SPH_T32(ss3(Ws8 ) + H( 9)); \
|
||||
qt[ 9] = SPH_T32(ss4(Ws9 ) + H(10)); \
|
||||
qt[10] = SPH_T32(ss0(Ws10) + H(11)); \
|
||||
qt[11] = SPH_T32(ss1(Ws11) + H(12)); \
|
||||
qt[12] = SPH_T32(ss2(Ws12) + H(13)); \
|
||||
qt[13] = SPH_T32(ss3(Ws13) + H(14)); \
|
||||
qt[14] = SPH_T32(ss4(Ws14) + H(15)); \
|
||||
qt[15] = SPH_T32(ss0(Ws15) + H( 0)); \
|
||||
} while (0)
|
||||
|
||||
#define MAKE_Qbs do { \
|
||||
qt[16] = expand1s(Qs, M, H, 16); \
|
||||
qt[17] = expand1s(Qs, M, H, 17); \
|
||||
qt[18] = expand2s(Qs, M, H, 18); \
|
||||
qt[19] = expand2s(Qs, M, H, 19); \
|
||||
qt[20] = expand2s(Qs, M, H, 20); \
|
||||
qt[21] = expand2s(Qs, M, H, 21); \
|
||||
qt[22] = expand2s(Qs, M, H, 22); \
|
||||
qt[23] = expand2s(Qs, M, H, 23); \
|
||||
qt[24] = expand2s(Qs, M, H, 24); \
|
||||
qt[25] = expand2s(Qs, M, H, 25); \
|
||||
qt[26] = expand2s(Qs, M, H, 26); \
|
||||
qt[27] = expand2s(Qs, M, H, 27); \
|
||||
qt[28] = expand2s(Qs, M, H, 28); \
|
||||
qt[29] = expand2s(Qs, M, H, 29); \
|
||||
qt[30] = expand2s(Qs, M, H, 30); \
|
||||
qt[31] = expand2s(Qs, M, H, 31); \
|
||||
} while (0)
|
||||
|
||||
#endif
|
||||
|
||||
#define MAKE_Qs do { \
|
||||
MAKE_Qas; \
|
||||
MAKE_Qbs; \
|
||||
} while (0)
|
||||
|
||||
#define Qs(j) (qt[j])
|
||||
*/
|
||||
#if SPH_64
|
||||
|
||||
#define Wb0 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \
|
||||
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
|
||||
_mm256_xor_si256( M[10], H[10] ) ), \
|
||||
_mm256_xor_si256( M[13], H[13] ) ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) )
|
||||
|
||||
#define Wb1 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 6], H[ 6] ), \
|
||||
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
|
||||
_mm256_xor_si256( M[11], H[11] ) ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) )
|
||||
|
||||
#define Wb2 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
|
||||
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
|
||||
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
|
||||
_mm256_xor_si256( M[12], H[12] ) ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) )
|
||||
|
||||
#define Wb3 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
|
||||
_mm256_xor_si256( M[ 1], H[ 1] ) ), \
|
||||
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
|
||||
_mm256_xor_si256( M[10], H[10] ) ), \
|
||||
_mm256_xor_si256( M[13], H[13] ) )
|
||||
|
||||
#define Wb4 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
|
||||
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
|
||||
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
|
||||
_mm256_xor_si256( M[11], H[11] ) ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) )
|
||||
|
||||
#define Wb5 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \
|
||||
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
|
||||
_mm256_xor_si256( M[10], H[10] ) ), \
|
||||
_mm256_xor_si256( M[12], H[12] ) ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) )
|
||||
|
||||
#define Wb6 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 4], H[ 4] ), \
|
||||
_mm256_xor_si256( M[ 0], H[ 0] ) ), \
|
||||
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
|
||||
_mm256_xor_si256( M[11], H[11] ) ), \
|
||||
_mm256_xor_si256( M[13], H[13] ) )
|
||||
|
||||
#define Wb7 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
|
||||
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
|
||||
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
|
||||
_mm256_xor_si256( M[12], H[12] ) ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) )
|
||||
|
||||
#define Wb8 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \
|
||||
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
|
||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
||||
_mm256_xor_si256( M[13], H[13] ) ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) )
|
||||
|
||||
#define Wb9 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
|
||||
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
|
||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
||||
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
|
||||
_mm256_xor_si256( M[14], H[14] ) )
|
||||
|
||||
#define Wb10 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \
|
||||
_mm256_xor_si256( M[ 1], H[ 1] ) ), \
|
||||
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
|
||||
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
|
||||
_mm256_xor_si256( M[15], H[15] ) )
|
||||
|
||||
#define Wb11 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \
|
||||
_mm256_xor_si256( M[ 0], H[ 0] ) ), \
|
||||
_mm256_xor_si256( M[ 2], H[ 2] ) ), \
|
||||
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
|
||||
_mm256_xor_si256( M[ 9], H[ 9] ) )
|
||||
|
||||
#define Wb12 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
|
||||
_mm256_xor_si256( M[ 3], H[ 3] ) ), \
|
||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
||||
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
|
||||
_mm256_xor_si256( M[10], H[10] ) )
|
||||
|
||||
#define Wb13 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_add_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \
|
||||
_mm256_xor_si256( M[ 4], H[ 4] ) ), \
|
||||
_mm256_xor_si256( M[ 7], H[ 7] ) ), \
|
||||
_mm256_xor_si256( M[10], H[10] ) ), \
|
||||
_mm256_xor_si256( M[11], H[11] ) )
|
||||
|
||||
#define Wb14 \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \
|
||||
_mm256_xor_si256( M[ 5], H[ 5] ) ), \
|
||||
_mm256_xor_si256( M[ 8], H[ 8] ) ), \
|
||||
_mm256_xor_si256( M[11], H[11] ) ), \
|
||||
_mm256_xor_si256( M[12], H[12] ) )
|
||||
|
||||
#define Wb15 \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( \
|
||||
_mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \
|
||||
_mm256_xor_si256( M[ 4], H[4] ) ), \
|
||||
_mm256_xor_si256( M[ 6], H[ 6] ) ), \
|
||||
_mm256_xor_si256( M[ 9], H[ 9] ) ), \
|
||||
_mm256_xor_si256( M[13], H[13] ) )
|
||||
|
||||
void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
|
||||
{
|
||||
__m256i qt[32], xl, xh; \
|
||||
|
||||
qt[ 0] = sb0( Wb0 ) + H[ 1];
|
||||
qt[ 1] = sb1( Wb1 ) + H[ 2];
|
||||
qt[ 2] = sb2( Wb2 ) + H[ 3];
|
||||
qt[ 3] = sb3( Wb3 ) + H[ 4];
|
||||
qt[ 4] = sb4( Wb4 ) + H[ 5];
|
||||
qt[ 5] = sb0( Wb5 ) + H[ 6];
|
||||
qt[ 6] = sb1( Wb6 ) + H[ 7];
|
||||
qt[ 7] = sb2( Wb7 ) + H[ 8];
|
||||
qt[ 8] = sb3( Wb8 ) + H[ 9];
|
||||
qt[ 9] = sb4( Wb9 ) + H[10];
|
||||
qt[10] = sb0( Wb10) + H[11];
|
||||
qt[11] = sb1( Wb11) + H[12];
|
||||
qt[12] = sb2( Wb12) + H[13];
|
||||
qt[13] = sb3( Wb13) + H[14];
|
||||
qt[14] = sb4( Wb14) + H[15];
|
||||
qt[15] = sb0( Wb15) + H[ 0];
|
||||
qt[16] = expand1b( qt, M, H, 16 );
|
||||
qt[17] = expand1b( qt, M, H, 17 );
|
||||
qt[18] = expand2b( qt, M, H, 18 );
|
||||
qt[19] = expand2b( qt, M, H, 19 );
|
||||
qt[20] = expand2b( qt, M, H, 20 );
|
||||
qt[21] = expand2b( qt, M, H, 21 );
|
||||
qt[22] = expand2b( qt, M, H, 22 );
|
||||
qt[23] = expand2b( qt, M, H, 23 );
|
||||
qt[24] = expand2b( qt, M, H, 24 );
|
||||
qt[25] = expand2b( qt, M, H, 25 );
|
||||
qt[26] = expand2b( qt, M, H, 26 );
|
||||
qt[27] = expand2b( qt, M, H, 27 );
|
||||
qt[28] = expand2b( qt, M, H, 28 );
|
||||
qt[29] = expand2b( qt, M, H, 29 );
|
||||
qt[30] = expand2b( qt, M, H, 30 );
|
||||
qt[31] = expand2b( qt, M, H, 31 );
|
||||
xl = _mm256_xor_si256(
|
||||
_mm256_xor_si256( _mm256_xor_si256( qt[16], qt[17] ),
|
||||
_mm256_xor_si256( qt[18], qt[19] ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( qt[20], qt[21] ),
|
||||
_mm256_xor_si256( qt[22], qt[23] ) ) );
|
||||
xh = _mm256_xor_si256( xl,
|
||||
_mm256_xor_si256(
|
||||
_mm256_xor_si256( _mm256_xor_si256( qt[24], qt[25] ),
|
||||
_mm256_xor_si256( qt[26], qt[27] ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( qt[28], qt[29] ),
|
||||
_mm256_xor_si256( qt[30], qt[31] ) )));
|
||||
dH[ 0] = _mm256_add_epi64(
|
||||
_mm256_xor_si256( M[0],
|
||||
_mm256_xor_si256( _mm256_slli_epi64( xh, 5 ),
|
||||
_mm256_srli_epi64( qt[16], 5 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[24] ), qt[ 0] ));
|
||||
dH[ 1] = _mm256_add_epi64(
|
||||
_mm256_xor_si256( M[1],
|
||||
_mm256_xor_si256( _mm256_srli_epi64( xh, 7 ),
|
||||
_mm256_slli_epi64( qt[17], 8 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[25] ), qt[ 1] ));
|
||||
dH[ 2] = _mm256_add_epi64(
|
||||
_mm256_xor_si256( M[2],
|
||||
_mm256_xor_si256( _mm256_srli_epi64( xh, 5 ),
|
||||
_mm256_slli_epi64( qt[18], 5 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[26] ), qt[ 2] ));
|
||||
dH[ 3] = _mm256_add_epi64(
|
||||
_mm256_xor_si256( M[3],
|
||||
_mm256_xor_si256( _mm256_srli_epi64( xh, 1 ),
|
||||
_mm256_slli_epi64( qt[19], 5 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[27] ), qt[ 3] ));
|
||||
dH[ 4] = _mm256_add_epi64(
|
||||
_mm256_xor_si256( M[4],
|
||||
_mm256_xor_si256( _mm256_srli_epi64( xh, 3 ),
|
||||
_mm256_slli_epi64( qt[20], 0 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[28] ), qt[ 4] ));
|
||||
dH[ 5] = _mm256_add_epi64(
|
||||
_mm256_xor_si256( M[5],
|
||||
_mm256_xor_si256( _mm256_slli_epi64( xh, 6 ),
|
||||
_mm256_srli_epi64( qt[21], 6 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[29] ), qt[ 5] ));
|
||||
dH[ 6] = _mm256_add_epi64(
|
||||
_mm256_xor_si256( M[6],
|
||||
_mm256_xor_si256( _mm256_srli_epi64( xh, 4 ),
|
||||
_mm256_slli_epi64( qt[22], 6 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[30] ), qt[ 6] ));
|
||||
dH[ 7] = _mm256_add_epi64(
|
||||
_mm256_xor_si256( M[7],
|
||||
_mm256_xor_si256( _mm256_srli_epi64( xh, 11 ),
|
||||
_mm256_slli_epi64( qt[23], 2 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[31] ), qt[ 7] ));
|
||||
dH[ 8] = _mm256_add_epi64( _mm256_add_epi64(
|
||||
mm256_rotl_64( dH[4], 9 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[24] ), M[ 8] )),
|
||||
_mm256_xor_si256( _mm256_slli_epi64( xl, 8 ),
|
||||
_mm256_xor_si256( qt[23], qt[ 8] ) ) );
|
||||
dH[ 9] = _mm256_add_epi64( _mm256_add_epi64(
|
||||
mm256_rotl_64( dH[5], 10 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[25] ), M[ 9] )),
|
||||
_mm256_xor_si256( _mm256_srli_epi64( xl, 6 ),
|
||||
_mm256_xor_si256( qt[16], qt[ 9] ) ) );
|
||||
dH[10] = _mm256_add_epi64( _mm256_add_epi64(
|
||||
mm256_rotl_64( dH[6], 11 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[26] ), M[10] )),
|
||||
_mm256_xor_si256( _mm256_slli_epi64( xl, 6 ),
|
||||
_mm256_xor_si256( qt[17], qt[10] ) ) );
|
||||
dH[11] = _mm256_add_epi64( _mm256_add_epi64(
|
||||
mm256_rotl_64( dH[7], 12 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[27] ), M[11] )),
|
||||
_mm256_xor_si256( _mm256_slli_epi64( xl, 4 ),
|
||||
_mm256_xor_si256( qt[18], qt[11] ) ) );
|
||||
dH[12] = _mm256_add_epi64( _mm256_add_epi64(
|
||||
mm256_rotl_64( dH[0], 13 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[28] ), M[12] )),
|
||||
_mm256_xor_si256( _mm256_srli_epi64( xl, 3 ),
|
||||
_mm256_xor_si256( qt[19], qt[12] ) ) );
|
||||
dH[13] = _mm256_add_epi64( _mm256_add_epi64(
|
||||
mm256_rotl_64( dH[1], 14 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[29] ), M[13] )),
|
||||
_mm256_xor_si256( _mm256_srli_epi64( xl, 4 ),
|
||||
_mm256_xor_si256( qt[20], qt[13] ) ) );
|
||||
dH[14] = _mm256_add_epi64( _mm256_add_epi64(
|
||||
mm256_rotl_64( dH[2], 15 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[30] ), M[14] )),
|
||||
_mm256_xor_si256( _mm256_srli_epi64( xl, 7 ),
|
||||
_mm256_xor_si256( qt[21], qt[14] ) ) );
|
||||
dH[15] = _mm256_add_epi64( _mm256_add_epi64(
|
||||
mm256_rotl_64( dH[3], 16 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )),
|
||||
_mm256_xor_si256( _mm256_srli_epi64( xl, 2 ),
|
||||
_mm256_xor_si256( qt[22], qt[15] ) ) );
|
||||
}
|
||||
|
||||
#endif // 64
|
||||
|
||||
//#define FOLDs FOLD(sph_u32, MAKE_Qs, SPH_ROTL32, M, Qs, dH)
|
||||
|
||||
|
||||
/*
|
||||
static void
|
||||
compress_small(const unsigned char *data, const sph_u32 h[16], sph_u32 dh[16])
|
||||
{
|
||||
#define M(x) sph_dec32le_aligned(data + 4 * (x))
|
||||
#define H(x) (h[x])
|
||||
#define dH(x) (dh[x])
|
||||
|
||||
FOLDs;
|
||||
|
||||
#undef M
|
||||
#undef H
|
||||
#undef dH
|
||||
}
|
||||
|
||||
static const sph_u32 final_s[16] = {
|
||||
SPH_C32(0xaaaaaaa0), SPH_C32(0xaaaaaaa1), SPH_C32(0xaaaaaaa2),
|
||||
SPH_C32(0xaaaaaaa3), SPH_C32(0xaaaaaaa4), SPH_C32(0xaaaaaaa5),
|
||||
SPH_C32(0xaaaaaaa6), SPH_C32(0xaaaaaaa7), SPH_C32(0xaaaaaaa8),
|
||||
SPH_C32(0xaaaaaaa9), SPH_C32(0xaaaaaaaa), SPH_C32(0xaaaaaaab),
|
||||
SPH_C32(0xaaaaaaac), SPH_C32(0xaaaaaaad), SPH_C32(0xaaaaaaae),
|
||||
SPH_C32(0xaaaaaaaf)
|
||||
};
|
||||
|
||||
static void
|
||||
bmw32_4way_init(bmw_4way_small_context *sc, const sph_u32 *iv)
|
||||
{
|
||||
memcpy(sc->H, iv, sizeof sc->H);
|
||||
sc->ptr = 0;
|
||||
#if SPH_64
|
||||
sc->bit_count = 0;
|
||||
#else
|
||||
sc->bit_count_high = 0;
|
||||
sc->bit_count_low = 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
static void
|
||||
bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
|
||||
{
|
||||
unsigned char *buf;
|
||||
size_t ptr;
|
||||
sph_u32 htmp[16];
|
||||
sph_u32 *h1, *h2;
|
||||
#if !SPH_64
|
||||
sph_u32 tmp;
|
||||
#endif
|
||||
|
||||
#if SPH_64
|
||||
sc->bit_count += (sph_u64)len << 3;
|
||||
#else
|
||||
tmp = sc->bit_count_low;
|
||||
sc->bit_count_low = SPH_T32(tmp + ((sph_u32)len << 3));
|
||||
if (sc->bit_count_low < tmp)
|
||||
sc->bit_count_high ++;
|
||||
sc->bit_count_high += len >> 29;
|
||||
#endif
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
h1 = sc->H;
|
||||
h2 = htmp;
|
||||
while (len > 0) {
|
||||
size_t clen;
|
||||
|
||||
clen = (sizeof sc->buf) - ptr;
|
||||
if (clen > len)
|
||||
clen = len;
|
||||
memcpy(buf + ptr, data, clen);
|
||||
data = (const unsigned char *)data + clen;
|
||||
len -= clen;
|
||||
ptr += clen;
|
||||
if (ptr == sizeof sc->buf) {
|
||||
sph_u32 *ht;
|
||||
|
||||
compress_small(buf, h1, h2);
|
||||
ht = h1;
|
||||
h1 = h2;
|
||||
h2 = ht;
|
||||
ptr = 0;
|
||||
}
|
||||
}
|
||||
sc->ptr = ptr;
|
||||
if (h1 != sc->H)
|
||||
memcpy(sc->H, h1, sizeof sc->H);
|
||||
}
|
||||
|
||||
static void
|
||||
bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
|
||||
void *dst, size_t out_size_w32)
|
||||
{
|
||||
unsigned char *buf, *out;
|
||||
size_t ptr, u, v;
|
||||
unsigned z;
|
||||
sph_u32 h1[16], h2[16], *h;
|
||||
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
z = 0x80 >> n;
|
||||
buf[ptr ++] = ((ub & -z) | z) & 0xFF;
|
||||
h = sc->H;
|
||||
if (ptr > (sizeof sc->buf) - 8) {
|
||||
memset(buf + ptr, 0, (sizeof sc->buf) - ptr);
|
||||
compress_small(buf, h, h1);
|
||||
ptr = 0;
|
||||
h = h1;
|
||||
}
|
||||
memset(buf + ptr, 0, (sizeof sc->buf) - 8 - ptr);
|
||||
#if SPH_64
|
||||
sph_enc64le_aligned(buf + (sizeof sc->buf) - 8,
|
||||
SPH_T64(sc->bit_count + n));
|
||||
#else
|
||||
sph_enc32le_aligned(buf + (sizeof sc->buf) - 8,
|
||||
sc->bit_count_low + n);
|
||||
sph_enc32le_aligned(buf + (sizeof sc->buf) - 4,
|
||||
SPH_T32(sc->bit_count_high));
|
||||
#endif
|
||||
compress_small(buf, h, h2);
|
||||
for (u = 0; u < 16; u ++)
|
||||
sph_enc32le_aligned(buf + 4 * u, h2[u]);
|
||||
compress_small(buf, final_s, h1);
|
||||
out = dst;
|
||||
for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
|
||||
sph_enc32le(out + 4 * u, h1[v]);
|
||||
}
|
||||
*/
|
||||
#if SPH_64
|
||||
|
||||
static const __m256i final_b[16] =
|
||||
{
|
||||
{ 0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0,
|
||||
0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0 },
|
||||
{ 0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1,
|
||||
0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1 },
|
||||
{ 0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2,
|
||||
0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2 },
|
||||
{ 0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3,
|
||||
0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3 },
|
||||
{ 0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4,
|
||||
0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4 },
|
||||
{ 0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5,
|
||||
0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5 },
|
||||
{ 0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6,
|
||||
0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6 },
|
||||
{ 0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7,
|
||||
0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7 },
|
||||
{ 0xaaaaaaaaaaaaaaa8, 0xaaaaaaaaaaaaaaa8,
|
||||
0xaaaaaaaaaaaaaaa8, 0xaaaaaaaaaaaaaaa8 },
|
||||
{ 0xaaaaaaaaaaaaaaa9, 0xaaaaaaaaaaaaaaa9,
|
||||
0xaaaaaaaaaaaaaaa9, 0xaaaaaaaaaaaaaaa9 },
|
||||
{ 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa,
|
||||
0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa },
|
||||
{ 0xaaaaaaaaaaaaaaab, 0xaaaaaaaaaaaaaaab,
|
||||
0xaaaaaaaaaaaaaaab, 0xaaaaaaaaaaaaaaab },
|
||||
{ 0xaaaaaaaaaaaaaaac, 0xaaaaaaaaaaaaaaac,
|
||||
0xaaaaaaaaaaaaaaac, 0xaaaaaaaaaaaaaaac },
|
||||
{ 0xaaaaaaaaaaaaaaad, 0xaaaaaaaaaaaaaaad,
|
||||
0xaaaaaaaaaaaaaaad, 0xaaaaaaaaaaaaaaad },
|
||||
{ 0xaaaaaaaaaaaaaaae, 0xaaaaaaaaaaaaaaae,
|
||||
0xaaaaaaaaaaaaaaae, 0xaaaaaaaaaaaaaaae },
|
||||
{ 0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf,
|
||||
0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf }
|
||||
};
|
||||
|
||||
static void
|
||||
bmw64_4way_init( bmw_4way_big_context *sc, const sph_u64 *iv )
|
||||
{
|
||||
for ( int i = 0; i < 16; i++ )
|
||||
sc->H[i] = _mm256_set1_epi64x( iv[i] );
|
||||
sc->ptr = 0;
|
||||
sc->bit_count = 0;
|
||||
}
|
||||
|
||||
static void
|
||||
bmw64_4way( bmw_4way_big_context *sc, const void *data, size_t len )
|
||||
{
|
||||
__m256i *vdata = (__m256i*)data;
|
||||
__m256i *buf;
|
||||
__m256i htmp[16];
|
||||
__m256i *h1, *h2;
|
||||
size_t ptr;
|
||||
const int buf_size = 128; // bytes of one lane, compatible with len
|
||||
|
||||
sc->bit_count += (sph_u64)len << 3;
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
h1 = sc->H;
|
||||
h2 = htmp;
|
||||
while ( len > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
clen = buf_size - ptr;
|
||||
if ( clen > len )
|
||||
clen = len;
|
||||
memcpy_256( buf + (ptr>>3), vdata, clen >> 3 );
|
||||
vdata = vdata + (clen>>3);
|
||||
len -= clen;
|
||||
ptr += clen;
|
||||
if ( ptr == buf_size )
|
||||
{
|
||||
__m256i *ht;
|
||||
compress_big( buf, h1, h2 );
|
||||
ht = h1;
|
||||
h1 = h2;
|
||||
h2 = ht;
|
||||
ptr = 0;
|
||||
}
|
||||
}
|
||||
sc->ptr = ptr;
|
||||
if ( h1 != sc->H )
|
||||
memcpy_256( sc->H, h1, 16 );
|
||||
}
|
||||
|
||||
static void
|
||||
bmw64_4way_close(bmw_4way_big_context *sc, unsigned ub, unsigned n,
|
||||
void *dst, size_t out_size_w64)
|
||||
{
|
||||
__m256i *buf;
|
||||
__m256i h1[16], h2[16], *h;
|
||||
size_t ptr, u, v;
|
||||
unsigned z;
|
||||
const int buf_size = 128; // bytes of one lane, compatible with len
|
||||
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
z = 0x80 >> n;
|
||||
buf[ ptr>>3 ] = _mm256_set1_epi64x( z );
|
||||
ptr += 8;
|
||||
h = sc->H;
|
||||
|
||||
if ( ptr > (buf_size - 8) )
|
||||
{
|
||||
memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
|
||||
compress_big( buf, h, h1 );
|
||||
ptr = 0;
|
||||
h = h1;
|
||||
}
|
||||
memset_zero_256( buf + (ptr>>3), (buf_size - 8 - ptr) >> 3 );
|
||||
buf[ (buf_size - 8) >> 3 ] = _mm256_set1_epi64x( sc->bit_count + n );
|
||||
compress_big( buf, h, h2 );
|
||||
for ( u = 0; u < 16; u ++ )
|
||||
buf[u] = h2[u];
|
||||
compress_big( buf, final_b, h1 );
|
||||
for (u = 0, v = 16 - out_size_w64; u < out_size_w64; u ++, v ++)
|
||||
casti_m256i(dst,u) = h1[v];
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
void
|
||||
bmw256_4way_init(void *cc)
|
||||
{
|
||||
// bmw32_4way_init(cc, IV256);
|
||||
}
|
||||
|
||||
void
|
||||
bmw256_4way(void *cc, const void *data, size_t len)
|
||||
{
|
||||
// bmw32_4way(cc, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
bmw256_4way_close(void *cc, void *dst)
|
||||
{
|
||||
// bmw256_4way_addbits_and_close(cc, 0, 0, dst);
|
||||
}
|
||||
|
||||
void
|
||||
bmw256_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
{
|
||||
// bmw32_4way_close(cc, ub, n, dst, 8);
|
||||
}
|
||||
|
||||
#if SPH_64
|
||||
|
||||
void
|
||||
bmw512_4way_init(void *cc)
|
||||
{
|
||||
bmw64_4way_init(cc, IV512);
|
||||
}
|
||||
|
||||
void
|
||||
bmw512_4way(void *cc, const void *data, size_t len)
|
||||
{
|
||||
bmw64_4way(cc, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
bmw512_4way_close(void *cc, void *dst)
|
||||
{
|
||||
bmw512_4way_addbits_and_close(cc, 0, 0, dst);
|
||||
}
|
||||
|
||||
void
|
||||
bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
{
|
||||
bmw64_4way_close(cc, ub, n, dst, 8);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
algo/bmw/bmw-hash-4way.h (new file, 154 lines)
@@ -0,0 +1,154 @@
/* $Id: sph_bmw.h 216 2010-06-08 09:46:57Z tp $ */
|
||||
/**
|
||||
* BMW interface. BMW (aka "Blue Midnight Wish") is a family of
|
||||
* functions which differ by their output size; this implementation
|
||||
* defines BMW for output sizes 224, 256, 384 and 512 bits.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @file sph_bmw.h
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#ifndef BMW_HASH_H__
|
||||
#define BMW_HASH_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
#ifdef __AVX2__
|
||||
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "avxdefs.h"
|
||||
|
||||
/**
|
||||
* Output size (in bits) for BMW-224.
|
||||
*/
|
||||
#define SPH_SIZE_bmw224 224
|
||||
|
||||
/**
|
||||
* Output size (in bits) for BMW-256.
|
||||
*/
|
||||
#define SPH_SIZE_bmw256 256
|
||||
|
||||
#if SPH_64
|
||||
|
||||
/**
|
||||
* Output size (in bits) for BMW-384.
|
||||
*/
|
||||
#define SPH_SIZE_bmw384 384
|
||||
|
||||
/**
|
||||
* Output size (in bits) for BMW-512.
|
||||
*/
|
||||
#define SPH_SIZE_bmw512 512
|
||||
|
||||
#endif
|
||||
|
||||
/**
|
||||
* This structure is a context for BMW-224 and BMW-256 computations:
|
||||
* it contains the intermediate values and some data from the last
|
||||
* entered block. Once a BMW computation has been performed, the
|
||||
* context can be reused for another computation.
|
||||
*
|
||||
* The contents of this structure are private. A running BMW
|
||||
* computation can be cloned by copying the context (e.g. with a simple
|
||||
* <code>memcpy()</code>).
|
||||
*/
|
||||
typedef struct {
|
||||
#ifndef DOXYGEN_IGNORE
|
||||
unsigned char buf[64]; /* first field, for alignment */
|
||||
size_t ptr;
|
||||
sph_u32 H[16];
|
||||
#if SPH_64
|
||||
sph_u64 bit_count;
|
||||
#else
|
||||
sph_u32 bit_count_high, bit_count_low;
|
||||
#endif
|
||||
#endif
|
||||
} bmw_4way_small_context;
|
||||
|
||||
typedef bmw_4way_small_context bmw256_4way_context;
|
||||
|
||||
#if SPH_64
|
||||
|
||||
/**
|
||||
* This structure is a context for BMW-384 and BMW-512 computations:
|
||||
* it contains the intermediate values and some data from the last
|
||||
* entered block. Once a BMW computation has been performed, the
|
||||
* context can be reused for another computation.
|
||||
*
|
||||
* The contents of this structure are private. A running BMW
|
||||
* computation can be cloned by copying the context (e.g. with a simple
|
||||
* <code>memcpy()</code>).
|
||||
*/
|
||||
typedef struct {
|
||||
#ifndef DOXYGEN_IGNORE
|
||||
__m256i buf[16];
|
||||
__m256i H[16];
|
||||
|
||||
// unsigned char buf[128]; /* first field, for alignment */
|
||||
size_t ptr;
|
||||
// sph_u64 H[16];
|
||||
sph_u64 bit_count;
|
||||
#endif
|
||||
} bmw_4way_big_context;
|
||||
|
||||
typedef bmw_4way_big_context bmw512_4way_context;
|
||||
|
||||
#endif
|
||||
|
||||
void bmw256_4way_init(void *cc);
|
||||
|
||||
void bmw256_4way(void *cc, const void *data, size_t len);
|
||||
|
||||
void bmw256_4way_close(void *cc, void *dst);
|
||||
|
||||
void bmw256_addbits_and_close(
|
||||
void *cc, unsigned ub, unsigned n, void *dst);
|
||||
|
||||
#if SPH_64
|
||||
|
||||
void bmw512_4way_init(void *cc);
|
||||
|
||||
void bmw512_4way(void *cc, const void *data, size_t len);
|
||||
|
||||
void bmw512_4way_close(void *cc, void *dst);
|
||||
|
||||
void bmw512_4way_addbits_and_close(
|
||||
void *cc, unsigned ub, unsigned n, void *dst);
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
@@ -96,34 +96,18 @@ extern "C"{
do { \
__m256i cc = _mm256_set_epi64x( c, c, c, c ); \
x3 = mm256_not( x3 ); \
x0 = _mm256_xor_si256( x0, _mm256_and_si256( cc, mm256_not( x2 ) ) ); \
x0 = _mm256_xor_si256( x0, _mm256_andnot_si256( x2, cc ) ); \
tmp = _mm256_xor_si256( cc, _mm256_and_si256( x0, x1 ) ); \
x0 = _mm256_xor_si256( x0, _mm256_and_si256( x2, x3 ) ); \
x3 = _mm256_xor_si256( x3, _mm256_and_si256( mm256_not( x1 ), x2 ) ); \
x3 = _mm256_xor_si256( x3, _mm256_andnot_si256( x1, x2 ) ); \
x1 = _mm256_xor_si256( x1, _mm256_and_si256( x0, x2 ) ); \
x2 = _mm256_xor_si256( x2, _mm256_and_si256( x0, mm256_not( x3 ) ) ); \
x2 = _mm256_xor_si256( x2, _mm256_andnot_si256( x3, x0 ) ); \
x0 = _mm256_xor_si256( x0, _mm256_or_si256( x1, x3 ) ); \
x3 = _mm256_xor_si256( x3, _mm256_and_si256( x1, x2 ) ); \
x1 = _mm256_xor_si256( x1, _mm256_and_si256( tmp, x0 ) ); \
x2 = _mm256_xor_si256( x2, tmp ); \
} while (0)

/*
#define Sb(x0, x1, x2, x3, c) do { \
x3 = ~x3; \
x0 ^= (c) & ~x2; \
tmp = (c) ^ (x0 & x1); \
x0 ^= x2 & x3; \
x3 ^= ~x1 & x2; \
x1 ^= x0 & x2; \
x2 ^= x0 & ~x3; \
x0 ^= x1 | x3; \
x3 ^= x1 & x2; \
x1 ^= tmp & x0; \
x2 ^= tmp; \
} while (0)
*/

#define Lb(x0, x1, x2, x3, x4, x5, x6, x7) \
do { \
x4 = _mm256_xor_si256( x4, x1 ); \
@@ -136,20 +120,6 @@ do { \
x3 = _mm256_xor_si256( x3, x4 ); \
} while (0)


/*
#define Lb(x0, x1, x2, x3, x4, x5, x6, x7) do { \
x4 ^= x1; \
x5 ^= x2; \
x6 ^= x3 ^ x0; \
x7 ^= x0; \
x0 ^= x5; \
x1 ^= x6; \
x2 ^= x7 ^ x4; \
x3 ^= x4; \
} while (0)
*/

#if SPH_JH_64

static const sph_u64 C[] = {
@@ -4,13 +4,10 @@

#include <memory.h>
#include <mm_malloc.h>
//#include "algo-gate-api.h"
#include "lyra2.h"
#include "algo/blake/sph_blake.h"
#include "algo/blake/blake-hash-4way.h"
//#include "avxdefs.h"

// same size, only difference is the name, lyra2 is done serially
__thread uint64_t* lyra2z_4way_matrix;

bool lyra2z_4way_thread_init()
@@ -26,12 +23,8 @@ void lyra2z_4way_midstate( const void* input )
blake256_4way( &l2z_4way_blake_mid, input, 64 );
}

// block 2050 new algo, blake plus new lyra parms. new input
// is power of 2 so normal lyra can be used
//void zcoin_hash(void *state, const void *input, uint32_t height)
void lyra2z_4way_hash( void *state, const void *input )
{
// uint32_t _ALIGN(64) hash[16];
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
@@ -39,27 +32,21 @@ void lyra2z_4way_hash( void *state, const void *input )
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256_4way_context ctx_blake __attribute__ ((aligned (64)));

// memcpy( &ctx_blake, &l2z_4way_blake_mid, sizeof l2z_4way_blake_mid );
// blake256_4way( &ctx_blake, input + (64*4), 16 );
// blake256_4way_close( &ctx_blake, vhash );

blake256_4way_init( &ctx_blake );
blake256_4way( &ctx_blake, input, 80 );
memcpy( &ctx_blake, &l2z_4way_blake_mid, sizeof l2z_4way_blake_mid );
blake256_4way( &ctx_blake, input + (64*4), 16 );
blake256_4way_close( &ctx_blake, vhash );

mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );

LYRA2Z( lyra2z_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
// LYRA2Z( lyra2z_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, hash2, 32, hash2, 32, hash2, 32, 8, 8, 8 );
// LYRA2Z( lyra2z_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );

memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );

// memcpy(state, hash, 32);
}

int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -67,7 +54,6 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
// uint32_t _ALIGN(64) hash[8];
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -90,7 +76,7 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,

mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );

// lyra2z_4way_midstate( vdata );
lyra2z_4way_midstate( vdata );

do {
found[0] = found[1] = found[2] = found[3] = false;
@@ -104,42 +90,33 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,

if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
{
printf("found 0\n");
found[0] = true;
num_found++;
nonces[0] = pdata[19] = n;
work_set_target_ratio( work, hash );
}
/*
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
{
printf("found 1\n");
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
*/
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
{
printf("found 2\n");
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
/*
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
{
printf("found 3\n");
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
*/
n += 2;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);

@@ -149,21 +126,3 @@ printf("found 3\n");

#endif

/*

if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
work_set_target_ratio(work, hash);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 1;
}
nonce++;

} while (nonce < max_nonce && !work_restart[thr_id].restart);

pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
*/
@@ -9,18 +9,15 @@ void lyra2z_set_target( struct work* work, double job_diff )
|
||||
bool register_lyra2z_algo( algo_gate_t* gate )
|
||||
{
|
||||
#ifdef LYRA2Z_4WAY
|
||||
four_way_not_tested();
|
||||
gate->optimizations = AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
|
||||
gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;
|
||||
gate->scanhash = (void*)&scanhash_lyra2z_4way;
|
||||
gate->hash = (void*)&lyra2z_4way_hash;
|
||||
#else
|
||||
gate->optimizations = AVX_OPT | AVX2_OPT;
|
||||
gate->miner_thread_init = (void*)&lyra2z_thread_init;
|
||||
gate->scanhash = (void*)&scanhash_lyra2z;
|
||||
gate->hash = (void*)&lyra2z_hash;
|
||||
#endif
|
||||
|
||||
gate->optimizations = AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
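// Note: this unconditional assignment overrides the branch-specific
// optimization flags set above.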
|
||||
gate->get_max64 = (void*)&get_max64_0xffffLL;
|
||||
gate->set_target = (void*)&lyra2z_set_target;
|
||||
return true;
|
||||
|
@@ -2,7 +2,7 @@
|
||||
|
||||
bool register_nist5_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | FOUR_WAY_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
|
||||
#if defined (NIST5_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_nist5_4way;
|
||||
gate->hash = (void*)&nist5hash_4way;
|
||||
|
@@ -1,12 +0,0 @@
|
||||
#ifndef __POLYTIMOS_GATE_H__
|
||||
#define __POLYTIMOS_GATE_H__
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
void polytimos_hash( void *state, const void *input );
|
||||
int scanhash_polytimos( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
void init_polytimos_context();
|
||||
|
||||
#endif
|
@@ -1,31 +1,20 @@
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "algo/luffa/sph_luffa.h"
|
||||
#include "algo/cubehash/sph_cubehash.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/simd/sph_simd.h"
|
||||
#include "algo/echo/sph_echo.h"
|
||||
|
||||
#include "algo/luffa/sse2/luffa_for_sse2.h"
|
||||
#include "algo/cubehash/sse2/cubehash_sse2.h"
|
||||
#include "algo/simd/sse2/nist.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
|
||||
#ifndef NO_AES_NI
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#else
|
||||
#include "algo/echo/sph_echo.h"
|
||||
#endif
|
||||
|
||||
typedef struct
|
||||
{
|
||||
hashState_luffa luffa;
|
||||
cubehashParam cubehash;
|
||||
sph_shavite512_context shavite;
|
||||
hashState_sd simd;
|
||||
#ifdef NO_AES_NI
|
||||
sph_echo512_context echo;
|
||||
#else
|
||||
|
@@ -1,23 +1,16 @@
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "algo/luffa/sph_luffa.h"
|
||||
#include "algo/cubehash/sph_cubehash.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/simd/sph_simd.h"
|
||||
#include "algo/echo/sph_echo.h"
|
||||
|
||||
#include "algo/luffa/sse2/luffa_for_sse2.h"
|
||||
#include "algo/cubehash/sse2/cubehash_sse2.h"
|
||||
#include "algo/simd/sse2/nist.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
|
||||
#ifndef NO_AES_NI
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#else
|
||||
#include "algo/echo/sph_echo.h"
|
||||
#endif
|
||||
|
||||
typedef struct
|
||||
|
618
algo/shabal/shabal-hash-4way.c
Normal file
@@ -0,0 +1,618 @@
|
||||
/* $Id: shabal.c 175 2010-05-07 16:03:20Z tp $ */
|
||||
/*
|
||||
* Shabal implementation.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifdef __AVX2__
|
||||
|
||||
#include "shabal-hash-4way.h"
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning (disable: 4146)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Part of this code was automatically generated (the part between
|
||||
* the "BEGIN" and "END" markers).
|
||||
*/
|
||||
|
||||
#define sM 16
|
||||
|
||||
#define C32 SPH_C32
|
||||
#define T32 SPH_T32
|
||||
|
||||
#define O1 13
|
||||
#define O2 9
|
||||
#define O3 6
|
||||
|
||||
/*
|
||||
* We copy the state into local variables, so that the compiler knows
|
||||
* that it can optimize them at will.
|
||||
*/
|
||||
|
||||
/* BEGIN -- automatically generated code. */
|
||||
|
||||
#define DECL_STATE \
|
||||
__m128i A00, A01, A02, A03, A04, A05, A06, A07, \
|
||||
A08, A09, A0A, A0B; \
|
||||
__m128i B0, B1, B2, B3, B4, B5, B6, B7, \
|
||||
B8, B9, BA, BB, BC, BD, BE, BF; \
|
||||
__m128i C0, C1, C2, C3, C4, C5, C6, C7, \
|
||||
C8, C9, CA, CB, CC, CD, CE, CF; \
|
||||
__m128i M0, M1, M2, M3, M4, M5, M6, M7, \
|
||||
M8, M9, MA, MB, MC, MD, ME, MF; \
|
||||
sph_u32 Wlow, Whigh;
|
||||
|
||||
#define READ_STATE(state) do { \
|
||||
A00 = (state)->A[0]; \
|
||||
A01 = (state)->A[1]; \
|
||||
A02 = (state)->A[2]; \
|
||||
A03 = (state)->A[3]; \
|
||||
A04 = (state)->A[4]; \
|
||||
A05 = (state)->A[5]; \
|
||||
A06 = (state)->A[6]; \
|
||||
A07 = (state)->A[7]; \
|
||||
A08 = (state)->A[8]; \
|
||||
A09 = (state)->A[9]; \
|
||||
A0A = (state)->A[10]; \
|
||||
A0B = (state)->A[11]; \
|
||||
B0 = (state)->B[0]; \
|
||||
B1 = (state)->B[1]; \
|
||||
B2 = (state)->B[2]; \
|
||||
B3 = (state)->B[3]; \
|
||||
B4 = (state)->B[4]; \
|
||||
B5 = (state)->B[5]; \
|
||||
B6 = (state)->B[6]; \
|
||||
B7 = (state)->B[7]; \
|
||||
B8 = (state)->B[8]; \
|
||||
B9 = (state)->B[9]; \
|
||||
BA = (state)->B[10]; \
|
||||
BB = (state)->B[11]; \
|
||||
BC = (state)->B[12]; \
|
||||
BD = (state)->B[13]; \
|
||||
BE = (state)->B[14]; \
|
||||
BF = (state)->B[15]; \
|
||||
C0 = (state)->C[0]; \
|
||||
C1 = (state)->C[1]; \
|
||||
C2 = (state)->C[2]; \
|
||||
C3 = (state)->C[3]; \
|
||||
C4 = (state)->C[4]; \
|
||||
C5 = (state)->C[5]; \
|
||||
C6 = (state)->C[6]; \
|
||||
C7 = (state)->C[7]; \
|
||||
C8 = (state)->C[8]; \
|
||||
C9 = (state)->C[9]; \
|
||||
CA = (state)->C[10]; \
|
||||
CB = (state)->C[11]; \
|
||||
CC = (state)->C[12]; \
|
||||
CD = (state)->C[13]; \
|
||||
CE = (state)->C[14]; \
|
||||
CF = (state)->C[15]; \
|
||||
Wlow = (state)->Wlow; \
|
||||
Whigh = (state)->Whigh; \
|
||||
} while (0)
|
||||
|
||||
#define WRITE_STATE(state) do { \
|
||||
(state)->A[0] = A00; \
|
||||
(state)->A[1] = A01; \
|
||||
(state)->A[2] = A02; \
|
||||
(state)->A[3] = A03; \
|
||||
(state)->A[4] = A04; \
|
||||
(state)->A[5] = A05; \
|
||||
(state)->A[6] = A06; \
|
||||
(state)->A[7] = A07; \
|
||||
(state)->A[8] = A08; \
|
||||
(state)->A[9] = A09; \
|
||||
(state)->A[10] = A0A; \
|
||||
(state)->A[11] = A0B; \
|
||||
(state)->B[0] = B0; \
|
||||
(state)->B[1] = B1; \
|
||||
(state)->B[2] = B2; \
|
||||
(state)->B[3] = B3; \
|
||||
(state)->B[4] = B4; \
|
||||
(state)->B[5] = B5; \
|
||||
(state)->B[6] = B6; \
|
||||
(state)->B[7] = B7; \
|
||||
(state)->B[8] = B8; \
|
||||
(state)->B[9] = B9; \
|
||||
(state)->B[10] = BA; \
|
||||
(state)->B[11] = BB; \
|
||||
(state)->B[12] = BC; \
|
||||
(state)->B[13] = BD; \
|
||||
(state)->B[14] = BE; \
|
||||
(state)->B[15] = BF; \
|
||||
(state)->C[0] = C0; \
|
||||
(state)->C[1] = C1; \
|
||||
(state)->C[2] = C2; \
|
||||
(state)->C[3] = C3; \
|
||||
(state)->C[4] = C4; \
|
||||
(state)->C[5] = C5; \
|
||||
(state)->C[6] = C6; \
|
||||
(state)->C[7] = C7; \
|
||||
(state)->C[8] = C8; \
|
||||
(state)->C[9] = C9; \
|
||||
(state)->C[10] = CA; \
|
||||
(state)->C[11] = CB; \
|
||||
(state)->C[12] = CC; \
|
||||
(state)->C[13] = CD; \
|
||||
(state)->C[14] = CE; \
|
||||
(state)->C[15] = CF; \
|
||||
(state)->Wlow = Wlow; \
|
||||
(state)->Whigh = Whigh; \
|
||||
} while (0)
|
||||
|
||||
#define DECODE_BLOCK \
|
||||
do { \
|
||||
M0 = buf[ 0]; \
|
||||
M1 = buf[ 1]; \
|
||||
M2 = buf[ 2]; \
|
||||
M3 = buf[ 3]; \
|
||||
M4 = buf[ 4]; \
|
||||
M5 = buf[ 5]; \
|
||||
M6 = buf[ 6]; \
|
||||
M7 = buf[ 7]; \
|
||||
M8 = buf[ 8]; \
|
||||
M9 = buf[ 9]; \
|
||||
MA = buf[10]; \
|
||||
MB = buf[11]; \
|
||||
MC = buf[12]; \
|
||||
MD = buf[13]; \
|
||||
ME = buf[14]; \
|
||||
MF = buf[15]; \
|
||||
} while (0)
|
||||
|
||||
#define INPUT_BLOCK_ADD \
|
||||
do { \
|
||||
B0 = _mm_add_epi32( B0, M0 );\
|
||||
B1 = _mm_add_epi32( B1, M1 );\
|
||||
B2 = _mm_add_epi32( B2, M2 );\
|
||||
B3 = _mm_add_epi32( B3, M3 );\
|
||||
B4 = _mm_add_epi32( B4, M4 );\
|
||||
B5 = _mm_add_epi32( B5, M5 );\
|
||||
B6 = _mm_add_epi32( B6, M6 );\
|
||||
B7 = _mm_add_epi32( B7, M7 );\
|
||||
B8 = _mm_add_epi32( B8, M8 );\
|
||||
B9 = _mm_add_epi32( B9, M9 );\
|
||||
BA = _mm_add_epi32( BA, MA );\
|
||||
BB = _mm_add_epi32( BB, MB );\
|
||||
BC = _mm_add_epi32( BC, MC );\
|
||||
BD = _mm_add_epi32( BD, MD );\
|
||||
BE = _mm_add_epi32( BE, ME );\
|
||||
BF = _mm_add_epi32( BF, MF );\
|
||||
} while (0)
|
||||
|
||||
#define INPUT_BLOCK_SUB \
|
||||
do { \
|
||||
C0 = _mm_sub_epi32( C0, M0 ); \
|
||||
C1 = _mm_sub_epi32( C1, M1 ); \
|
||||
C2 = _mm_sub_epi32( C2, M2 ); \
|
||||
C3 = _mm_sub_epi32( C3, M3 ); \
|
||||
C4 = _mm_sub_epi32( C4, M4 ); \
|
||||
C5 = _mm_sub_epi32( C5, M5 ); \
|
||||
C6 = _mm_sub_epi32( C6, M6 ); \
|
||||
C7 = _mm_sub_epi32( C7, M7 ); \
|
||||
C8 = _mm_sub_epi32( C8, M8 ); \
|
||||
C9 = _mm_sub_epi32( C9, M9 ); \
|
||||
CA = _mm_sub_epi32( CA, MA ); \
|
||||
CB = _mm_sub_epi32( CB, MB ); \
|
||||
CC = _mm_sub_epi32( CC, MC ); \
|
||||
CD = _mm_sub_epi32( CD, MD ); \
|
||||
CE = _mm_sub_epi32( CE, ME ); \
|
||||
CF = _mm_sub_epi32( CF, MF ); \
|
||||
} while (0)
|
||||
|
||||
#define XOR_W \
|
||||
do { \
|
||||
A00 = _mm_xor_si128( A00, _mm_set1_epi32( Wlow ) ); \
|
||||
A01 = _mm_xor_si128( A01, _mm_set1_epi32( Whigh ) ); \
|
||||
} while (0)
|
||||
/*
|
||||
#define SWAP(v1, v2) do { \
|
||||
sph_u32 tmp = (v1); \
|
||||
(v1) = (v2); \
|
||||
(v2) = tmp; \
|
||||
} while (0)
|
||||
*/
|
||||
#define SWAP_BC \
|
||||
do { \
|
||||
mm_swap_128( B0, C0 ); \
|
||||
mm_swap_128( B1, C1 ); \
|
||||
mm_swap_128( B2, C2 ); \
|
||||
mm_swap_128( B3, C3 ); \
|
||||
mm_swap_128( B4, C4 ); \
|
||||
mm_swap_128( B5, C5 ); \
|
||||
mm_swap_128( B6, C6 ); \
|
||||
mm_swap_128( B7, C7 ); \
|
||||
mm_swap_128( B8, C8 ); \
|
||||
mm_swap_128( B9, C9 ); \
|
||||
mm_swap_128( BA, CA ); \
|
||||
mm_swap_128( BB, CB ); \
|
||||
mm_swap_128( BC, CC ); \
|
||||
mm_swap_128( BD, CD ); \
|
||||
mm_swap_128( BE, CE ); \
|
||||
mm_swap_128( BF, CF ); \
|
||||
} while (0)
|
||||
|
||||
#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
|
||||
do { \
|
||||
xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128( \
|
||||
_mm_andnot_si128( xb3, xb2 ), \
|
||||
_mm_mullo_epi32( _mm_xor_si128( xa0, _mm_xor_si128( xc, \
|
||||
_mm_mullo_epi32( mm_rotl_32( xa1, 15 ), _mm_set1_epi32(5UL) ) \
|
||||
) ), _mm_set1_epi32(3UL) ) ) ) ); \
|
||||
xb0 = mm_not( _mm_xor_si128( xa0, mm_rotl_32( xb0, 1 ) ) ); \
|
||||
} while (0)
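For readers comparing against the scalar reference, the vector macro above computes, per 32-bit lane, the same update as the permutation element in sph_shabal.c. A minimal scalar sketch of that element (hypothetical name, plain C, using SPH_ROTL32 and the T32 truncation macro from sph_types.h) would be:

#define PERM_ELT_SCALAR(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm)   do { \
      (xa0) = T32( ( (xa0) ^ ( SPH_ROTL32( (xa1), 15 ) * 5U ) ^ (xc) ) * 3U ) \
              ^ (xb1) ^ ( (xb2) & ~(xb3) ) ^ (xm); \
      (xb0) = T32( ~( SPH_ROTL32( (xb0), 1 ) ^ (xa0) ) ); \
   } while (0)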
|
||||
|
||||
#define PERM_STEP_0 do { \
|
||||
PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \
|
||||
PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \
|
||||
PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \
|
||||
PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \
|
||||
PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \
|
||||
PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \
|
||||
PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \
|
||||
PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \
|
||||
PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \
|
||||
PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \
|
||||
PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \
|
||||
PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \
|
||||
PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \
|
||||
PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \
|
||||
PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \
|
||||
PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \
|
||||
} while (0)
|
||||
|
||||
#define PERM_STEP_1 do { \
|
||||
PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \
|
||||
PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \
|
||||
PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \
|
||||
PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \
|
||||
PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \
|
||||
PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \
|
||||
PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \
|
||||
PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \
|
||||
PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \
|
||||
PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \
|
||||
PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \
|
||||
PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \
|
||||
PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \
|
||||
PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \
|
||||
PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \
|
||||
PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \
|
||||
} while (0)
|
||||
|
||||
#define PERM_STEP_2 do { \
|
||||
PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \
|
||||
PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \
|
||||
PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \
|
||||
PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \
|
||||
PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \
|
||||
PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \
|
||||
PERM_ELT(A02, A01, B6, B3, BF, BC, C2, M6); \
|
||||
PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \
|
||||
PERM_ELT(A04, A03, B8, B5, B1, BE, C0, M8); \
|
||||
PERM_ELT(A05, A04, B9, B6, B2, BF, CF, M9); \
|
||||
PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \
|
||||
PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \
|
||||
PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \
|
||||
PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \
|
||||
PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \
|
||||
PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \
|
||||
} while (0)
|
||||
|
||||
#define APPLY_P \
|
||||
do { \
|
||||
B0 = mm_rotr_32( B0, 15 ); \
|
||||
B1 = mm_rotr_32( B1, 15 ); \
|
||||
B2 = mm_rotr_32( B2, 15 ); \
|
||||
B3 = mm_rotr_32( B3, 15 ); \
|
||||
B4 = mm_rotr_32( B4, 15 ); \
|
||||
B5 = mm_rotr_32( B5, 15 ); \
|
||||
B6 = mm_rotr_32( B6, 15 ); \
|
||||
B7 = mm_rotr_32( B7, 15 ); \
|
||||
B8 = mm_rotr_32( B8, 15 ); \
|
||||
B9 = mm_rotr_32( B9, 15 ); \
|
||||
BA = mm_rotr_32( BA, 15 ); \
|
||||
BB = mm_rotr_32( BB, 15 ); \
|
||||
BC = mm_rotr_32( BC, 15 ); \
|
||||
BD = mm_rotr_32( BD, 15 ); \
|
||||
BE = mm_rotr_32( BE, 15 ); \
|
||||
BF = mm_rotr_32( BF, 15 ); \
|
||||
PERM_STEP_0; \
|
||||
PERM_STEP_1; \
|
||||
PERM_STEP_2; \
|
||||
A0B = _mm_add_epi32( A0B, C6 ); \
|
||||
A0A = _mm_add_epi32( A0A, C5 ); \
|
||||
A09 = _mm_add_epi32( A09, C4 ); \
|
||||
A08 = _mm_add_epi32( A08, C3 ); \
|
||||
A07 = _mm_add_epi32( A07, C2 ); \
|
||||
A06 = _mm_add_epi32( A06, C1 ); \
|
||||
A05 = _mm_add_epi32( A05, C0 ); \
|
||||
A04 = _mm_add_epi32( A04, CF ); \
|
||||
A03 = _mm_add_epi32( A03, CE ); \
|
||||
A02 = _mm_add_epi32( A02, CD ); \
|
||||
A01 = _mm_add_epi32( A01, CC ); \
|
||||
A00 = _mm_add_epi32( A00, CB ); \
|
||||
A0B = _mm_add_epi32( A0B, CA ); \
|
||||
A0A = _mm_add_epi32( A0A, C9 ); \
|
||||
A09 = _mm_add_epi32( A09, C8 ); \
|
||||
A08 = _mm_add_epi32( A08, C7 ); \
|
||||
A07 = _mm_add_epi32( A07, C6 ); \
|
||||
A06 = _mm_add_epi32( A06, C5 ); \
|
||||
A05 = _mm_add_epi32( A05, C4 ); \
|
||||
A04 = _mm_add_epi32( A04, C3 ); \
|
||||
A03 = _mm_add_epi32( A03, C2 ); \
|
||||
A02 = _mm_add_epi32( A02, C1 ); \
|
||||
A01 = _mm_add_epi32( A01, C0 ); \
|
||||
A00 = _mm_add_epi32( A00, CF ); \
|
||||
A0B = _mm_add_epi32( A0B, CE ); \
|
||||
A0A = _mm_add_epi32( A0A, CD ); \
|
||||
A09 = _mm_add_epi32( A09, CC ); \
|
||||
A08 = _mm_add_epi32( A08, CB ); \
|
||||
A07 = _mm_add_epi32( A07, CA ); \
|
||||
A06 = _mm_add_epi32( A06, C9 ); \
|
||||
A05 = _mm_add_epi32( A05, C8 ); \
|
||||
A04 = _mm_add_epi32( A04, C7 ); \
|
||||
A03 = _mm_add_epi32( A03, C6 ); \
|
||||
A02 = _mm_add_epi32( A02, C5 ); \
|
||||
A01 = _mm_add_epi32( A01, C4 ); \
|
||||
A00 = _mm_add_epi32( A00, C3 ); \
|
||||
} while (0)
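// APPLY_P pre-rotates every B word (rotr 15 equals rotl 17 for 32-bit words),
// applies the three permutation rounds, then folds 36 additions of C words
// into the A words, mirroring the scalar Shabal reference.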
|
||||
|
||||
#define INCR_W do { \
|
||||
if ((Wlow = T32(Wlow + 1)) == 0) \
|
||||
Whigh = T32(Whigh + 1); \
|
||||
} while (0)
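// W is the 64-bit block counter kept as two 32-bit halves (Wlow/Whigh);
// it starts at 1 and is incremented once per 64-byte block.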
|
||||
|
||||
static const sph_u32 A_init_256[] = {
|
||||
C32(0x52F84552), C32(0xE54B7999), C32(0x2D8EE3EC), C32(0xB9645191),
|
||||
C32(0xE0078B86), C32(0xBB7C44C9), C32(0xD2B5C1CA), C32(0xB0D2EB8C),
|
||||
C32(0x14CE5A45), C32(0x22AF50DC), C32(0xEFFDBC6B), C32(0xEB21B74A)
|
||||
};
|
||||
|
||||
static const sph_u32 B_init_256[] = {
|
||||
C32(0xB555C6EE), C32(0x3E710596), C32(0xA72A652F), C32(0x9301515F),
|
||||
C32(0xDA28C1FA), C32(0x696FD868), C32(0x9CB6BF72), C32(0x0AFE4002),
|
||||
C32(0xA6E03615), C32(0x5138C1D4), C32(0xBE216306), C32(0xB38B8890),
|
||||
C32(0x3EA8B96B), C32(0x3299ACE4), C32(0x30924DD4), C32(0x55CB34A5)
|
||||
};
|
||||
|
||||
static const sph_u32 C_init_256[] = {
|
||||
C32(0xB405F031), C32(0xC4233EBA), C32(0xB3733979), C32(0xC0DD9D55),
|
||||
C32(0xC51C28AE), C32(0xA327B8E1), C32(0x56C56167), C32(0xED614433),
|
||||
C32(0x88B59D60), C32(0x60E2CEBA), C32(0x758B4B8B), C32(0x83E82A7F),
|
||||
C32(0xBC968828), C32(0xE6E00BF7), C32(0xBA839E55), C32(0x9B491C60)
|
||||
};
|
||||
|
||||
static const sph_u32 A_init_512[] = {
|
||||
C32(0x20728DFD), C32(0x46C0BD53), C32(0xE782B699), C32(0x55304632),
|
||||
C32(0x71B4EF90), C32(0x0EA9E82C), C32(0xDBB930F1), C32(0xFAD06B8B),
|
||||
C32(0xBE0CAE40), C32(0x8BD14410), C32(0x76D2ADAC), C32(0x28ACAB7F)
|
||||
};
|
||||
|
||||
static const sph_u32 B_init_512[] = {
|
||||
C32(0xC1099CB7), C32(0x07B385F3), C32(0xE7442C26), C32(0xCC8AD640),
|
||||
C32(0xEB6F56C7), C32(0x1EA81AA9), C32(0x73B9D314), C32(0x1DE85D08),
|
||||
C32(0x48910A5A), C32(0x893B22DB), C32(0xC5A0DF44), C32(0xBBC4324E),
|
||||
C32(0x72D2F240), C32(0x75941D99), C32(0x6D8BDE82), C32(0xA1A7502B)
|
||||
};
|
||||
|
||||
static const sph_u32 C_init_512[] = {
|
||||
C32(0xD9BF68D1), C32(0x58BAD750), C32(0x56028CB2), C32(0x8134F359),
|
||||
C32(0xB5D469D8), C32(0x941A8CC2), C32(0x418B2A6E), C32(0x04052780),
|
||||
C32(0x7F07D787), C32(0x5194358F), C32(0x3C60D665), C32(0xBE97D79A),
|
||||
C32(0x950C3434), C32(0xAED9A06D), C32(0x2537DC8D), C32(0x7CDB5969)
|
||||
};
|
||||
|
||||
static void
|
||||
shabal_4way_init( void *cc, unsigned size )
|
||||
{
|
||||
shabal_4way_context *sc = (shabal_4way_context*)cc;
|
||||
int i;
|
||||
|
||||
if ( size == 512 )
|
||||
{
|
||||
for ( i = 0; i < 12; i++ )
|
||||
sc->A[i] = _mm_set1_epi32( A_init_512[i] );
|
||||
for ( i = 0; i < 16; i++ )
|
||||
{
|
||||
sc->B[i] = _mm_set1_epi32( B_init_512[i] );
|
||||
sc->C[i] = _mm_set1_epi32( C_init_512[i] );
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( i = 0; i < 12; i++ )
|
||||
sc->A[i] = _mm_set1_epi32( A_init_256[i] );
|
||||
for ( i = 0; i < 16; i++ )
|
||||
{
|
||||
sc->B[i] = _mm_set1_epi32( B_init_256[i] );
|
||||
sc->C[i] = _mm_set1_epi32( C_init_256[i] );
|
||||
}
|
||||
}
|
||||
sc->Wlow = 1;
|
||||
sc->Whigh = 0;
|
||||
sc->ptr = 0;
|
||||
}
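// The 256- and 512-bit variants differ only in their IV tables; each scalar
// IV word is broadcast to all four lanes with _mm_set1_epi32.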
|
||||
|
||||
static void
|
||||
shabal_4way_core( void *cc, const unsigned char *data, size_t len )
|
||||
{
|
||||
shabal_4way_context *sc = (shabal_4way_context*)cc;
|
||||
__m128i *buf;
|
||||
__m128i *vdata = (__m128i*)data;
|
||||
const int buf_size = 64;
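// ptr and len count bytes per lane; buf[] holds one 32-bit word per __m128i
// across the four lanes, hence the >>2 conversions below.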
|
||||
size_t ptr;
|
||||
DECL_STATE
|
||||
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
|
||||
if ( len < (buf_size - ptr ) )
|
||||
{
|
||||
memcpy_128( buf + (ptr>>2), vdata, len>>2 );
|
||||
ptr += len;
|
||||
sc->ptr = ptr;
|
||||
return;
|
||||
}
|
||||
READ_STATE(sc);
|
||||
|
||||
while ( len > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
clen = buf_size - ptr;
|
||||
if ( clen > len )
|
||||
clen = len;
|
||||
memcpy_128( buf + (ptr>>2), vdata, clen>>2 );
|
||||
|
||||
ptr += clen;
|
||||
vdata += clen>>2;
|
||||
len -= clen;
|
||||
if ( ptr == buf_size )
|
||||
{
|
||||
DECODE_BLOCK;
|
||||
INPUT_BLOCK_ADD;
|
||||
XOR_W;
|
||||
APPLY_P;
|
||||
INPUT_BLOCK_SUB;
|
||||
SWAP_BC;
|
||||
INCR_W;
|
||||
ptr = 0;
|
||||
}
|
||||
}
|
||||
WRITE_STATE(sc);
|
||||
sc->ptr = ptr;
|
||||
}
|
||||
|
||||
static void
|
||||
shabal_4way_close( void *cc, unsigned ub, unsigned n, void *dst,
|
||||
unsigned size_words )
|
||||
{
|
||||
shabal_4way_context *sc = (shabal_4way_context*)cc;
|
||||
__m128i *buf;
|
||||
const int buf_size = 64;
|
||||
size_t ptr;
|
||||
int i;
|
||||
unsigned z, zz;
|
||||
DECL_STATE
|
||||
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
z = 0x80 >> n;
|
||||
zz = ((ub & -z) | z) & 0xFF;
|
||||
buf[ptr>>2] = _mm_set1_epi32( zz );
|
||||
memset_zero_128( buf + (ptr>>2) + 1, ( (buf_size - ptr) >> 2 ) - 1 );
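// Shabal padding: the 0x80 marker (plus any trailing bits) is broadcast to
// all four lanes and the rest of the block is zeroed; the three extra
// finalization rounds below run without incrementing W.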
|
||||
READ_STATE(sc);
|
||||
DECODE_BLOCK;
|
||||
INPUT_BLOCK_ADD;
|
||||
XOR_W;
|
||||
APPLY_P;
|
||||
|
||||
for ( i = 0; i < 3; i ++ )
|
||||
{
|
||||
SWAP_BC;
|
||||
XOR_W;
|
||||
APPLY_P;
|
||||
}
|
||||
|
||||
__m128i *d = (__m128i*)dst;
|
||||
if ( size_words == 16 ) // 512
|
||||
{
|
||||
d[ 0] = B0; d[ 1] = B1; d[ 2] = B2; d[ 3] = B3;
|
||||
d[ 4] = B4; d[ 5] = B5; d[ 6] = B6; d[ 7] = B7;
|
||||
d[ 8] = B8; d[ 9] = B9; d[10] = BA; d[11] = BB;
|
||||
d[12] = BC; d[13] = BD; d[14] = BE; d[15] = BF;
|
||||
}
|
||||
else // 256
|
||||
{
|
||||
d[ 0] = B8; d[ 1] = B9; d[ 2] = BA; d[ 3] = BB;
|
||||
d[ 4] = BC; d[ 5] = BD; d[ 6] = BE; d[ 7] = BF;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
shabal256_4way_init( void *cc )
|
||||
{
|
||||
shabal_4way_init(cc, 256);
|
||||
}
|
||||
|
||||
void
|
||||
shabal256_4way( void *cc, const void *data, size_t len )
|
||||
{
|
||||
shabal_4way_core( cc, data, len );
|
||||
}
|
||||
|
||||
void
|
||||
shabal256_4way_close( void *cc, void *dst )
|
||||
{
|
||||
shabal_4way_close(cc, 0, 0, dst, 8);
|
||||
}
|
||||
|
||||
void
|
||||
shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
|
||||
void *dst )
|
||||
{
|
||||
shabal_4way_close(cc, ub, n, dst, 8);
|
||||
}
|
||||
|
||||
void
|
||||
shabal512_4way_init(void *cc)
|
||||
{
|
||||
shabal_4way_init(cc, 512);
|
||||
}
|
||||
|
||||
void
|
||||
shabal512_4way(void *cc, const void *data, size_t len)
|
||||
{
|
||||
shabal_4way_core(cc, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
shabal512_4way_close(void *cc, void *dst)
|
||||
{
|
||||
shabal_4way_close(cc, 0, 0, dst, 16);
|
||||
}
|
||||
|
||||
void
|
||||
shabal512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
{
|
||||
shabal_4way_close(cc, ub, n, dst, 16);
|
||||
}
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
82
algo/shabal/shabal-hash-4way.h
Normal file
@@ -0,0 +1,82 @@
|
||||
/* $Id: sph_shabal.h 175 2010-05-07 16:03:20Z tp $ */
|
||||
/**
|
||||
* Shabal interface. Shabal is a family of functions which differ by
|
||||
* their output size; this implementation defines Shabal for output
|
||||
* sizes 192, 224, 256, 384 and 512 bits.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @file sph_shabal.h
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#ifndef SHABAL_HASH_4WAY_H__
|
||||
#define SHABAL_HASH_4WAY_H__ 1
|
||||
|
||||
#ifdef __AVX2__
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "avxdefs.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
#define SPH_SIZE_shabal256 256
|
||||
|
||||
#define SPH_SIZE_shabal512 512
|
||||
|
||||
typedef struct {
|
||||
__m128i buf[16] __attribute__ ((aligned (64)));
|
||||
__m128i A[12], B[16], C[16];
|
||||
sph_u32 Whigh, Wlow;
|
||||
size_t ptr;
|
||||
} shabal_4way_context;
|
||||
|
||||
typedef shabal_4way_context shabal256_4way_context;
|
||||
typedef shabal_4way_context shabal512_4way_context;
|
||||
|
||||
void shabal256_4way_init( void *cc );
|
||||
void shabal256_4way( void *cc, const void *data, size_t len );
|
||||
void shabal256_4way_close( void *cc, void *dst );
|
||||
void shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
|
||||
void *dst );
|
||||
|
||||
void shabal512_4way_init( void *cc );
|
||||
void shabal512_4way( void *cc, const void *data, size_t len );
|
||||
void shabal512_4way_close( void *cc, void *dst );
|
||||
void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
|
||||
void *dst );
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#endif
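A note on usage: the 4-way API expects its input already interleaved 4x32 (one 32-bit word per lane, as the callers in this commit prepare with mm_interleave_4x32 from avxdefs.h), and lengths are given in bytes per lane. A minimal sketch of hashing four independent 80-byte messages, assuming <stdint.h> and those interleave helpers, might look like:

#ifdef __AVX2__
// Hypothetical helper, not part of this commit: hash four 80-byte messages
// in one shabal512_4way pass. Assumes mm_interleave_4x32 / mm_deinterleave_4x32
// from avxdefs.h (bit length as the last argument).
static void shabal512_hash_4x( void *out0, void *out1, void *out2, void *out3,
                               const void *in0, const void *in1,
                               const void *in2, const void *in3 )
{
   uint32_t vin [20*4] __attribute__ ((aligned (64)));
   uint32_t vout[16*4] __attribute__ ((aligned (64)));
   shabal512_4way_context ctx;

   mm_interleave_4x32( vin, in0, in1, in2, in3, 640 );   // 80 bytes per lane
   shabal512_4way_init( &ctx );
   shabal512_4way( &ctx, vin, 80 );                      // bytes per lane
   shabal512_4way_close( &ctx, vout );
   mm_deinterleave_4x32( out0, out1, out2, out3, vout, 512 );
}
#endif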
|
||||
|
@@ -7,7 +7,7 @@
|
||||
#include <stdint.h>
|
||||
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
@@ -20,7 +20,7 @@
|
||||
|
||||
typedef struct {
|
||||
blake512_4way_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
bmw512_4way_context bmw;
|
||||
hashState_groestl groestl;
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
@@ -37,7 +37,7 @@ c11_4way_ctx_holder c11_4way_ctx;
|
||||
void init_c11_4way_ctx()
|
||||
{
|
||||
blake512_4way_init( &c11_4way_ctx.blake );
|
||||
sph_bmw512_init( &c11_4way_ctx.bmw );
|
||||
bmw512_4way_init( &c11_4way_ctx.bmw );
|
||||
init_groestl( &c11_4way_ctx.groestl, 64 );
|
||||
skein512_4way_init( &c11_4way_ctx.skein );
|
||||
jh512_4way_init( &c11_4way_ctx.jh );
|
||||
@@ -63,22 +63,13 @@ void c11_4way_hash( void *state, const void *input )
|
||||
blake512_4way( &ctx.blake, input, 80 );
|
||||
blake512_4way_close( &ctx.blake, vhash );
|
||||
|
||||
// 2 Bmw
|
||||
bmw512_4way( &ctx.bmw, vhash, 64 );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
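// BMW now has a 4-way implementation, so it runs on the still-interleaved
// data; deinterleaving is deferred until the serial Groestl stage. The same
// change is applied to x11, x11gost, x13 and x13sm3 below.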
|
||||
|
||||
// Serial
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
// 2 Bmw
|
||||
sph_bmw512( &ctx.bmw, hash0, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash0 );
|
||||
memcpy( &ctx.bmw, &c11_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash1, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash1 );
|
||||
memcpy( &ctx.bmw, &c11_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash2, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash2 );
|
||||
memcpy( &ctx.bmw, &c11_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash3, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash3 );
|
||||
|
||||
// 3 Groestl
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
|
@@ -11,7 +11,7 @@ bool register_c11_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_c11;
|
||||
gate->hash = (void*)&c11_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
|
||||
gate->get_max64 = (void*)&get_max64_0x3ffff;
|
||||
return true;
|
||||
};
|
||||
|
@@ -7,7 +7,7 @@
|
||||
#include <stdint.h>
|
||||
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
@@ -20,7 +20,7 @@
|
||||
|
||||
typedef struct {
|
||||
blake512_4way_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
bmw512_4way_context bmw;
|
||||
hashState_groestl groestl;
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
@@ -37,7 +37,7 @@ x11_4way_ctx_holder x11_4way_ctx;
|
||||
void init_x11_4way_ctx()
|
||||
{
|
||||
blake512_4way_init( &x11_4way_ctx.blake );
|
||||
sph_bmw512_init( &x11_4way_ctx.bmw );
|
||||
bmw512_4way_init( &x11_4way_ctx.bmw );
|
||||
init_groestl( &x11_4way_ctx.groestl, 64 );
|
||||
skein512_4way_init( &x11_4way_ctx.skein );
|
||||
jh512_4way_init( &x11_4way_ctx.jh );
|
||||
@@ -63,22 +63,13 @@ void x11_4way_hash( void *state, const void *input )
|
||||
blake512_4way( &ctx.blake, input, 80 );
|
||||
blake512_4way_close( &ctx.blake, vhash );
|
||||
|
||||
// 2 Bmw
|
||||
bmw512_4way( &ctx.bmw, vhash, 64 );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
// Serial
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
// 2 Bmw
|
||||
sph_bmw512( &ctx.bmw, hash0, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash0 );
|
||||
memcpy( &ctx.bmw, &x11_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash1, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash1 );
|
||||
memcpy( &ctx.bmw, &x11_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash2, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash2 );
|
||||
memcpy( &ctx.bmw, &x11_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash3, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash3 );
|
||||
|
||||
// 3 Groestl
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
memcpy( &ctx.groestl, &x11_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
|
@@ -11,7 +11,7 @@ bool register_x11_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_x11;
|
||||
gate->hash = (void*)&x11_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
|
||||
gate->get_max64 = (void*)&get_max64_0x3ffff;
|
||||
return true;
|
||||
};
|
||||
|
@@ -7,7 +7,7 @@
|
||||
#include <stdint.h>
|
||||
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
@@ -21,7 +21,7 @@
|
||||
|
||||
typedef struct {
|
||||
blake512_4way_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
bmw512_4way_context bmw;
|
||||
hashState_groestl groestl;
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
@@ -39,7 +39,7 @@ x11gost_4way_ctx_holder x11gost_4way_ctx;
|
||||
void init_x11gost_4way_ctx()
|
||||
{
|
||||
blake512_4way_init( &x11gost_4way_ctx.blake );
|
||||
sph_bmw512_init( &x11gost_4way_ctx.bmw );
|
||||
bmw512_4way_init( &x11gost_4way_ctx.bmw );
|
||||
init_groestl( &x11gost_4way_ctx.groestl, 64 );
|
||||
skein512_4way_init( &x11gost_4way_ctx.skein );
|
||||
jh512_4way_init( &x11gost_4way_ctx.jh );
|
||||
@@ -65,21 +65,12 @@ void x11gost_4way_hash( void *state, const void *input )
|
||||
blake512_4way( &ctx.blake, input, 80 );
|
||||
blake512_4way_close( &ctx.blake, vhash );
|
||||
|
||||
bmw512_4way( &ctx.bmw, vhash, 64 );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
// Serial
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
sph_bmw512( &ctx.bmw, hash0, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash0 );
|
||||
memcpy( &ctx.bmw, &x11gost_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash1, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash1 );
|
||||
memcpy( &ctx.bmw, &x11gost_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash2, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash2 );
|
||||
memcpy( &ctx.bmw, &x11gost_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash3, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash3 );
|
||||
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
|
||||
sizeof(hashState_groestl) );
|
||||
@@ -110,8 +101,8 @@ void x11gost_4way_hash( void *state, const void *input )
|
||||
sph_gost512_close( &ctx.gost, hash0 );
|
||||
memcpy( &ctx.gost, &x11gost_4way_ctx.gost, sizeof(sph_gost512_context) );
|
||||
sph_gost512( &ctx.gost, hash1, 64 );
|
||||
memcpy( &ctx.gost, &x11gost_4way_ctx.gost, sizeof(sph_gost512_context) );
|
||||
sph_gost512_close( &ctx.gost, hash1 );
|
||||
memcpy( &ctx.gost, &x11gost_4way_ctx.gost, sizeof(sph_gost512_context) );
|
||||
sph_gost512( &ctx.gost, hash2, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash2 );
|
||||
memcpy( &ctx.gost, &x11gost_4way_ctx.gost, sizeof(sph_gost512_context) );
|
||||
|
@@ -11,7 +11,7 @@ bool register_x11gost_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_x11gost;
|
||||
gate->hash = (void*)&x11gost_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
|
||||
gate->get_max64 = (void*)&get_max64_0x3ffff;
|
||||
return true;
|
||||
};
|
||||
|
158
algo/x13/skunk-4way.c
Normal file
@@ -0,0 +1,158 @@
|
||||
#include "skunk-gate.h"
|
||||
|
||||
#ifdef __AVX2__
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/gost/sph_gost.h"
|
||||
#include "algo/fugue/sph_fugue.h"
|
||||
#include "algo/cubehash/sse2/cubehash_sse2.h"
|
||||
|
||||
typedef struct {
|
||||
skein512_4way_context skein;
|
||||
cubehashParam cube;
|
||||
sph_fugue512_context fugue;
|
||||
sph_gost512_context gost;
|
||||
} skunk_4way_ctx_holder;
|
||||
|
||||
static __thread skunk_4way_ctx_holder skunk_4way_ctx;
|
||||
|
||||
void skunk_4way_hash( void *output, const void *input )
|
||||
{
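// Skunk = Skein -> CubeHash -> Fugue -> GOST. Only Skein is vectorized 4-way
// here; the remaining stages run serially on each deinterleaved lane.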
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
|
||||
skunk_4way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
memcpy( &ctx, &skunk_4way_ctx, sizeof(skunk_4way_ctx) );
|
||||
|
||||
skein512_4way( &ctx.skein, input, 80 );
|
||||
skein512_4way_close( &ctx.skein, vhash );
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*)hash0, 64 );
|
||||
memcpy( &ctx.cube, &skunk_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
|
||||
memcpy( &ctx.cube, &skunk_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
|
||||
memcpy( &ctx.cube, &skunk_4way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
|
||||
|
||||
sph_fugue512( &ctx.fugue, hash0, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash0 );
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash1, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash1 );
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash2, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash2 );
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash3, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||
|
||||
sph_gost512( &ctx.gost, hash0, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash0 );
|
||||
sph_gost512_init( &ctx.gost );
|
||||
sph_gost512( &ctx.gost, hash1, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash1 );
|
||||
sph_gost512_init( &ctx.gost );
|
||||
sph_gost512( &ctx.gost, hash2, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash2 );
|
||||
sph_gost512_init( &ctx.gost );
|
||||
sph_gost512( &ctx.gost, hash3, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash3 );
|
||||
|
||||
memcpy( output, hash0, 32 );
|
||||
memcpy( output+32, hash1, 32 );
|
||||
memcpy( output+64, hash2, 32 );
|
||||
memcpy( output+96, hash3, 32 );
|
||||
}
|
||||
|
||||
int scanhash_skunk_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done )
|
||||
{
|
||||
uint32_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t endiandata[20] __attribute__((aligned(64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
uint32_t *nonces = work->nonces;
|
||||
bool *found = work->nfound;
|
||||
int num_found = 0;
|
||||
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
|
||||
uint32_t *noncep1 = vdata + 75;
|
||||
uint32_t *noncep2 = vdata + 77;
|
||||
uint32_t *noncep3 = vdata + 79;
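// With 4x64 interleaving the 64-bit word w of lane i sits at 32-bit index
// (w*4 + i)*2; the nonce is the high half of word 9, giving 73, 75, 77, 79.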
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
volatile uint8_t *restart = &(work_restart[thr_id].restart);
|
||||
|
||||
if ( opt_benchmark )
|
||||
((uint32_t*)ptarget)[7] = 0x0cff;
|
||||
for ( int k = 0; k < 19; k++ )
|
||||
be32enc( &endiandata[k], pdata[k] );
|
||||
|
||||
uint64_t *edata = (uint64_t*)endiandata;
|
||||
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
|
||||
do
|
||||
{
|
||||
found[0] = found[1] = found[2] = found[3] = false;
|
||||
be32enc( noncep0, n );
|
||||
be32enc( noncep1, n+1 );
|
||||
be32enc( noncep2, n+2 );
|
||||
be32enc( noncep3, n+3 );
|
||||
|
||||
skunk_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
|
||||
{
|
||||
found[0] = true;
|
||||
num_found++;
|
||||
nonces[0] = n;
|
||||
work_set_target_ratio( work, hash );
|
||||
}
|
||||
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
|
||||
{
|
||||
found[1] = true;
|
||||
num_found++;
|
||||
nonces[1] = n+1;
|
||||
work_set_target_ratio( work, hash+8 );
|
||||
}
|
||||
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
|
||||
{
|
||||
found[2] = true;
|
||||
num_found++;
|
||||
nonces[2] = n+2;
|
||||
work_set_target_ratio( work, hash+16 );
|
||||
}
|
||||
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
|
||||
{
|
||||
found[3] = true;
|
||||
num_found++;
|
||||
nonces[3] = n+3;
|
||||
work_set_target_ratio( work, hash+24 );
|
||||
}
|
||||
n += 4;
|
||||
} while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart) );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return num_found;
|
||||
}
|
||||
|
||||
bool skunk_4way_thread_init()
|
||||
{
|
||||
skein512_4way_init( &skunk_4way_ctx.skein );
|
||||
cubehashInit( &skunk_4way_ctx.cube, 512, 16, 32 );
|
||||
sph_fugue512_init( &skunk_4way_ctx.fugue );
|
||||
sph_gost512_init( &skunk_4way_ctx.gost );
|
||||
return true;
|
||||
}
|
||||
|
||||
#endif
|
18
algo/x13/skunk-gate.c
Normal file
@@ -0,0 +1,18 @@
|
||||
#include "skunk-gate.h"
|
||||
|
||||
bool register_skunk_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
|
||||
#if defined (SKUNK_4WAY)
|
||||
gate->miner_thread_init = (void*)&skunk_4way_thread_init;
|
||||
gate->scanhash = (void*)&scanhash_skunk_4way;
|
||||
gate->hash = (void*)&skunk_4way_hash;
|
||||
// init_skunk_4way_ctx();
|
||||
#else
|
||||
gate->miner_thread_init = (void*)&skunk_thread_init;
|
||||
gate->scanhash = (void*)&scanhash_skunk;
|
||||
gate->hash = (void*)&skunkhash;
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
33
algo/x13/skunk-gate.h
Normal file
@@ -0,0 +1,33 @@
|
||||
#ifndef SKUNK_GATE_H__
|
||||
#define SKUNK_GATE_H__ 1
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(HASH_4WAY)
|
||||
#define SKUNK_4WAY
|
||||
#endif
|
||||
|
||||
bool register_skunk_algo( algo_gate_t* gate );
|
||||
|
||||
#if defined(SKUNK_4WAY)
|
||||
|
||||
void skunk_4way_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_skunk_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
|
||||
bool skunk_4way_thread_init();
|
||||
//void init_skunk_4way_ctx();
|
||||
|
||||
#endif
|
||||
|
||||
void skunkhash( void *state, const void *input );
|
||||
|
||||
int scanhash_skunk( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done );
|
||||
|
||||
bool skunk_thread_init();
|
||||
|
||||
#endif
|
||||
|
@@ -1,10 +1,8 @@
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
#include "skunk-gate.h"
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "algo/gost/sph_gost.h"
|
||||
#include "algo/skein/sph_skein.h"
|
||||
#include "algo/fugue/sph_fugue.h"
|
||||
@@ -90,12 +88,3 @@ bool skunk_thread_init()
|
||||
sph_gost512_init( &skunk_ctx.gost );
|
||||
return true;
|
||||
}
|
||||
|
||||
bool register_skunk_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
|
||||
gate->miner_thread_init = (void*)&skunk_thread_init;
|
||||
gate->scanhash = (void*)&scanhash_skunk;
|
||||
gate->hash = (void*)&skunkhash;
|
||||
return true;
|
||||
}
|
@@ -7,7 +7,7 @@
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
@@ -22,7 +22,7 @@
|
||||
|
||||
typedef struct {
|
||||
blake512_4way_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
bmw512_4way_context bmw;
|
||||
hashState_groestl groestl;
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
@@ -41,7 +41,7 @@ x13_4way_ctx_holder x13_4way_ctx __attribute__ ((aligned (64)));
|
||||
void init_x13_4way_ctx()
|
||||
{
|
||||
blake512_4way_init( &x13_4way_ctx.blake );
|
||||
sph_bmw512_init( &x13_4way_ctx.bmw );
|
||||
bmw512_4way_init( &x13_4way_ctx.bmw );
|
||||
init_groestl( &x13_4way_ctx.groestl, 64 );
|
||||
skein512_4way_init( &x13_4way_ctx.skein );
|
||||
jh512_4way_init( &x13_4way_ctx.jh );
|
||||
@@ -69,22 +69,13 @@ void x13_4way_hash( void *state, const void *input )
|
||||
blake512_4way( &ctx.blake, input, 80 );
|
||||
blake512_4way_close( &ctx.blake, vhash );
|
||||
|
||||
// 2 Bmw
|
||||
bmw512_4way( &ctx.bmw, vhash, 64 );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
// Serial
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
// 2 Bmw
|
||||
sph_bmw512( &ctx.bmw, hash0, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash0 );
|
||||
memcpy( &ctx.bmw, &x13_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash1, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash1 );
|
||||
memcpy( &ctx.bmw, &x13_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash2, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash2 );
|
||||
memcpy( &ctx.bmw, &x13_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash3, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash3 );
|
||||
|
||||
// 3 Groestl
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
memcpy( &ctx.groestl, &x13_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
|
@@ -7,7 +7,7 @@
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
@@ -23,7 +23,7 @@
|
||||
|
||||
typedef struct {
|
||||
blake512_4way_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
bmw512_4way_context bmw;
|
||||
hashState_groestl groestl;
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
@@ -44,7 +44,7 @@ static __thread blake512_4way_context x13sm3_ctx_mid;
|
||||
void init_x13sm3_4way_ctx()
|
||||
{
|
||||
blake512_4way_init( &x13sm3_4way_ctx.blake );
|
||||
sph_bmw512_init( &x13sm3_4way_ctx.bmw );
|
||||
bmw512_4way_init( &x13sm3_4way_ctx.bmw );
|
||||
init_groestl( &x13sm3_4way_ctx.groestl, 64 );
|
||||
skein512_4way_init( &x13sm3_4way_ctx.skein );
|
||||
jh512_4way_init( &x13sm3_4way_ctx.jh );
|
||||
@@ -76,22 +76,13 @@ void x13sm3_4way_hash( void *state, const void *input )
|
||||
// blake512_4way( &ctx.blake, input, 80 );
|
||||
blake512_4way_close( &ctx.blake, vhash );
|
||||
|
||||
// Bmw
|
||||
bmw512_4way( &ctx.bmw, vhash, 64 );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
// Serial
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
// Bmw
|
||||
sph_bmw512( &ctx.bmw, hash0, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash0 );
|
||||
memcpy( &ctx.bmw, &x13sm3_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash1, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash1 );
|
||||
memcpy( &ctx.bmw, &x13sm3_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash2, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash2 );
|
||||
memcpy( &ctx.bmw, &x13sm3_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash3, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash3 );
|
||||
|
||||
// Groestl
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
memcpy( &ctx.groestl, &x13sm3_4way_ctx.groestl,
|
||||
|
185
algo/x14/polytimos-4way.c
Normal file
@@ -0,0 +1,185 @@
|
||||
#include "polytimos-gate.h"
|
||||
|
||||
#if defined(__AVX2__) && defined(__AES__)
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/shabal/shabal-hash-4way.h"
|
||||
#include "algo/fugue//sph_fugue.h"
|
||||
#include "algo/luffa/sse2/luffa_for_sse2.h"
|
||||
//#include "algo/shabal/sph_shabal.h"
|
||||
#include "algo/gost/sph_gost.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
|
||||
typedef struct {
|
||||
skein512_4way_context skein;
|
||||
shabal512_4way_context shabal;
|
||||
hashState_echo echo;
|
||||
hashState_luffa luffa;
|
||||
sph_fugue512_context fugue;
|
||||
sph_gost512_context gost;
|
||||
} poly_4way_ctx_holder;
|
||||
|
||||
poly_4way_ctx_holder poly_4way_ctx;
|
||||
|
||||
void init_polytimos_4way_ctx()
|
||||
{
|
||||
skein512_4way_init( &poly_4way_ctx.skein );
|
||||
shabal512_4way_init( &poly_4way_ctx.shabal );
|
||||
init_echo( &poly_4way_ctx.echo, 512 );
|
||||
init_luffa( &poly_4way_ctx.luffa, 512 );
|
||||
sph_fugue512_init( &poly_4way_ctx.fugue );
|
||||
sph_gost512_init( &poly_4way_ctx.gost );
|
||||
}
|
||||
|
||||
void polytimos_4way_hash( void *output, const void *input )
|
||||
{
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
poly_4way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
memcpy( &ctx, &poly_4way_ctx, sizeof(poly_4way_ctx) );
|
||||
|
||||
skein512_4way( &ctx.skein, input, 80 );
|
||||
skein512_4way_close( &ctx.skein, vhash );
|
||||
|
||||
// Need to convert from 64 bit interleaved to 32 bit interleaved.
|
||||
uint32_t vhash32[16*4];
|
||||
mm256_reinterleave_4x32( vhash32, vhash, 512 );
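// Skein's 4-way output is interleaved as 64-bit words, but Shabal operates on
// 32-bit words, so the lane data is re-interleaved to 4x32 before hashing.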
|
||||
shabal512_4way( &ctx.shabal, vhash32, 64 );
|
||||
shabal512_4way_close( &ctx.shabal, vhash32 );
|
||||
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash32, 512 );
|
||||
|
||||
update_final_echo ( &ctx.echo, (BitSequence *)hash0,
|
||||
(const BitSequence *)hash0, 512 );
|
||||
memcpy( &ctx.echo, &poly_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||
(const BitSequence *) hash1, 512 );
|
||||
memcpy( &ctx.echo, &poly_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||
(const BitSequence *) hash2, 512 );
|
||||
memcpy( &ctx.echo, &poly_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||
(const BitSequence *) hash3, 512 );
|
||||
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
|
||||
(const BitSequence*)hash0, 64 );
|
||||
memcpy( &ctx.luffa, &poly_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
|
||||
(const BitSequence*)hash1, 64 );
|
||||
memcpy( &ctx.luffa, &poly_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
|
||||
(const BitSequence*)hash2, 64 );
|
||||
memcpy( &ctx.luffa, &poly_4way_ctx.luffa, sizeof(hashState_luffa) );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
|
||||
(const BitSequence*)hash3, 64 );
|
||||
|
||||
sph_fugue512( &ctx.fugue, hash0, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash0 );
|
||||
memcpy( &ctx.fugue, &poly_4way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||
sph_fugue512( &ctx.fugue, hash1, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash1 );
|
||||
memcpy( &ctx.fugue, &poly_4way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||
sph_fugue512( &ctx.fugue, hash2, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash2 );
|
||||
memcpy( &ctx.fugue, &poly_4way_ctx.fugue, sizeof(sph_fugue512_context) );
|
||||
sph_fugue512( &ctx.fugue, hash3, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||
|
||||
sph_gost512( &ctx.gost, hash0, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash0 );
|
||||
sph_gost512_init( &ctx.gost );
|
||||
sph_gost512( &ctx.gost, hash1, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash1 );
|
||||
sph_gost512_init( &ctx.gost );
|
||||
sph_gost512( &ctx.gost, hash2, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash2 );
|
||||
sph_gost512_init( &ctx.gost );
|
||||
sph_gost512( &ctx.gost, hash3, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash3 );
|
||||
|
||||
memcpy( output, hash0, 32 );
|
||||
memcpy( output+32, hash1, 32 );
|
||||
memcpy( output+64, hash2, 32 );
|
||||
memcpy( output+96, hash3, 32 );
|
||||
}
|
||||
|
||||
int scanhash_polytimos_4way( int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done )
|
||||
{
|
||||
uint32_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t endiandata[20] __attribute__((aligned(64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
uint32_t *nonces = work->nonces;
|
||||
bool *found = work->nfound;
|
||||
int num_found = 0;
|
||||
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
|
||||
uint32_t *noncep1 = vdata + 75;
|
||||
uint32_t *noncep2 = vdata + 77;
|
||||
uint32_t *noncep3 = vdata + 79;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
volatile uint8_t *restart = &(work_restart[thr_id].restart);
|
||||
|
||||
if ( opt_benchmark )
|
||||
ptarget[7] = 0x0cff;
|
||||
|
||||
for ( int i=0; i < 19; i++ )
|
||||
be32enc( &endiandata[i], pdata[i] );
|
||||
|
||||
uint64_t *edata = (uint64_t*)endiandata;
|
||||
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
|
||||
do {
|
||||
found[0] = found[1] = found[2] = found[3] = false;
|
||||
be32enc( noncep0, n );
|
||||
be32enc( noncep1, n+1 );
|
||||
be32enc( noncep2, n+2 );
|
||||
be32enc( noncep3, n+3 );
|
||||
|
||||
polytimos_4way_hash(hash, vdata);
|
||||
pdata[19] = n;
|
||||
|
||||
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
|
||||
{
|
||||
found[0] = true;
|
||||
num_found++;
|
||||
nonces[0] = n;
|
||||
work_set_target_ratio( work, hash );
|
||||
}
|
||||
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
|
||||
{
|
||||
found[1] = true;
|
||||
num_found++;
|
||||
nonces[1] = n+1;
|
||||
work_set_target_ratio( work, hash+8 );
|
||||
}
|
||||
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
|
||||
{
|
||||
found[2] = true;
|
||||
num_found++;
|
||||
nonces[2] = n+2;
|
||||
work_set_target_ratio( work, hash+16 );
|
||||
}
|
||||
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
|
||||
{
|
||||
found[3] = true;
|
||||
num_found++;
|
||||
nonces[3] = n+3;
|
||||
work_set_target_ratio( work, hash+24 );
|
||||
}
|
||||
n += 4;
|
||||
|
||||
} while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart));
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return num_found;
|
||||
}
|
||||
|
||||
#endif
|
@@ -2,10 +2,16 @@
|
||||
|
||||
bool register_polytimos_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
|
||||
#ifdef POLYTIMOS_4WAY
|
||||
init_polytimos_4way_ctx();
|
||||
gate->scanhash = (void*)&scanhash_polytimos_4way;
|
||||
gate->hash = (void*)&polytimos_4way_hash;
|
||||
#else
|
||||
init_polytimos_context();
|
||||
gate->scanhash = (void*)&scanhash_polytimos;
|
||||
gate->hash = (void*)&polytimos_hash;
|
||||
#endif
|
||||
gate->get_max64 = (void*)&get_max64_0x3ffff;
|
||||
return true;
|
||||
};
|
32
algo/x14/polytimos-gate.h
Normal file
@@ -0,0 +1,32 @@
#ifndef POLYTIMOS_GATE_H__
#define POLYTIMOS_GATE_H__ 1

#include "algo-gate-api.h"
#include <stdint.h>

#if defined(HASH_4WAY) && defined(__AES__)
#define POLYTIMOS_4WAY
#endif

bool register_polytimos_algo( algo_gate_t* gate );

#if defined(POLYTIMOS_4WAY)

void polytimos_4way_hash( void *state, const void *input );

int scanhash_polytimos_4way( int thr_id, struct work *work, uint32_t max_nonce,
                             uint64_t *hashes_done );

void init_polytimos_4way_ctx();

#endif

void polytimos_hash( void *state, const void *input );

int scanhash_polytimos( int thr_id, struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done );

void init_polytimos_ctx();

#endif

154
algo/x14/veltor-4way.c
Normal file
@@ -0,0 +1,154 @@
#include "veltor-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#if defined(__AVX2__) && defined(__AES__)

#include "algo/skein/skein-hash-4way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/gost/sph_gost.h"

typedef struct {
   skein512_4way_context skein;
   sph_shavite512_context shavite;
   shabal512_4way_context shabal;
   sph_gost512_context gost;
} veltor_4way_ctx_holder;

veltor_4way_ctx_holder veltor_4way_ctx __attribute__ ((aligned (64)));

void init_veltor_4way_ctx()
{
   skein512_4way_init( &veltor_4way_ctx.skein );
   sph_shavite512_init( &veltor_4way_ctx.shavite );
   shabal512_4way_init( &veltor_4way_ctx.shabal );
   sph_gost512_init( &veltor_4way_ctx.gost );
}

void veltor_4way_hash( void *output, const void *input )
{
   uint64_t hash0[8] __attribute__ ((aligned (64)));
   uint64_t hash1[8] __attribute__ ((aligned (64)));
   uint64_t hash2[8] __attribute__ ((aligned (64)));
   uint64_t hash3[8] __attribute__ ((aligned (64)));
   uint64_t vhash[8*4] __attribute__ ((aligned (64)));
   veltor_4way_ctx_holder ctx __attribute__ ((aligned (64)));
   memcpy( &ctx, &veltor_4way_ctx, sizeof(veltor_4way_ctx) );

   skein512_4way( &ctx.skein, input, 80 );
   skein512_4way_close( &ctx.skein, vhash );
   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

   sph_shavite512( &ctx.shavite, hash0, 64 );
   sph_shavite512_close( &ctx.shavite, hash0 );
   sph_shavite512_init( &ctx.shavite );
   sph_shavite512( &ctx.shavite, hash1, 64 );
   sph_shavite512_close( &ctx.shavite, hash1 );
   sph_shavite512_init( &ctx.shavite );
   sph_shavite512( &ctx.shavite, hash2, 64 );
   sph_shavite512_close( &ctx.shavite, hash2 );
   sph_shavite512_init( &ctx.shavite );
   sph_shavite512( &ctx.shavite, hash3, 64 );
   sph_shavite512_close( &ctx.shavite, hash3 );

   mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
   shabal512_4way( &ctx.shabal, vhash, 64 );
   shabal512_4way_close( &ctx.shabal, vhash );
   mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );

   sph_gost512( &ctx.gost, hash0, 64 );
   sph_gost512_close( &ctx.gost, hash0 );
   sph_gost512_init( &ctx.gost );
   sph_gost512( &ctx.gost, hash1, 64 );
   sph_gost512_close( &ctx.gost, hash1 );
   sph_gost512_init( &ctx.gost );
   sph_gost512( &ctx.gost, hash2, 64 );
   sph_gost512_close( &ctx.gost, hash2 );
   sph_gost512_init( &ctx.gost );
   sph_gost512( &ctx.gost, hash3, 64 );
   sph_gost512_close( &ctx.gost, hash3 );

   memcpy( output,    hash0, 32 );
   memcpy( output+32, hash1, 32 );
   memcpy( output+64, hash2, 32 );
   memcpy( output+96, hash3, 32 );
}

int scanhash_veltor_4way( int thr_id, struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done )
{
   uint32_t hash[4*8] __attribute__ ((aligned (64)));
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
   uint32_t endiandata[20] __attribute__((aligned(64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   uint32_t *nonces = work->nonces;
   bool *found = work->nfound;
   int num_found = 0;
   uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
   uint32_t *noncep1 = vdata + 75;
   uint32_t *noncep2 = vdata + 77;
   uint32_t *noncep3 = vdata + 79;
   volatile uint8_t *restart = &(work_restart[thr_id].restart);

   if ( opt_benchmark )
      ptarget[7] = 0x0cff;
   for ( int i=0; i < 19; i++ )
   {
      be32enc( &endiandata[i], pdata[i] );
   }

   uint64_t *edata = (uint64_t*)endiandata;
   mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
   do
   {
      found[0] = found[1] = found[2] = found[3] = false;
      be32enc( noncep0, n );
      be32enc( noncep1, n+1 );
      be32enc( noncep2, n+2 );
      be32enc( noncep3, n+3 );

      veltor_4way_hash( hash, vdata );
      pdata[19] = n;

      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
      {
         found[0] = true;
         num_found++;
         nonces[0] = n;
         work_set_target_ratio( work, hash );
      }
      if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
      {
         found[1] = true;
         num_found++;
         nonces[1] = n+1;
         work_set_target_ratio( work, hash+8 );
      }
      if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
      {
         found[2] = true;
         num_found++;
         nonces[2] = n+2;
         work_set_target_ratio( work, hash+16 );
      }
      if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
      {
         found[3] = true;
         num_found++;
         nonces[3] = n+3;
         work_set_target_ratio( work, hash+24 );
      }
      n += 4;
   } while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart) );
   *hashes_done = n - first_nonce + 1;
   return num_found;
}

#endif
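veltor_4way_hash above changes lane width in mid-chain: skein runs four ways on 64-bit lanes, shavite and gost run serially per lane, and shabal runs four ways on 32-bit lanes, so the four 512-bit hashes are re-packed with mm_interleave_4x32 and unpacked again with mm_deinterleave_4x32. The scalar model below sketches that 32-bit packing and checks that it round-trips; it is not the SSE code from avxdefs.h, and the helper names are invented for the example.

// One 32-bit word per lane: dst[4*i + lane] = src_lane[i], and back.
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void interleave_4x32_model( uint32_t *dst, const uint32_t *s0,
      const uint32_t *s1, const uint32_t *s2, const uint32_t *s3, int bit_len )
{
   const uint32_t *s[4] = { s0, s1, s2, s3 };
   for ( int i = 0; i < bit_len/32; i++ )
      for ( int lane = 0; lane < 4; lane++ )
         dst[ 4*i + lane ] = s[lane][i];
}

static void deinterleave_4x32_model( uint32_t *d0, uint32_t *d1, uint32_t *d2,
      uint32_t *d3, const uint32_t *src, int bit_len )
{
   uint32_t *d[4] = { d0, d1, d2, d3 };
   for ( int i = 0; i < bit_len/32; i++ )
      for ( int lane = 0; lane < 4; lane++ )
         d[lane][i] = src[ 4*i + lane ];
}

int main()
{
   uint32_t h[4][16], v[64], out[4][16];     // 512 bits per lane, 4 lanes
   for ( int lane = 0; lane < 4; lane++ )
      for ( int i = 0; i < 16; i++ ) h[lane][i] = lane*100 + i;

   interleave_4x32_model( v, h[0], h[1], h[2], h[3], 512 );
   deinterleave_4x32_model( out[0], out[1], out[2], out[3], v, 512 );

   printf( "round trip %s\n", memcmp( h, out, sizeof h ) ? "failed" : "ok" );
   return 0;
}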
18
algo/x14/veltor-gate.c
Normal file
@@ -0,0 +1,18 @@
#include "veltor-gate.h"

bool register_veltor_algo( algo_gate_t* gate )
{
#if defined (VELTOR_4WAY)
   init_veltor_4way_ctx();
   gate->scanhash = (void*)&scanhash_veltor_4way;
   gate->hash = (void*)&veltor_4way_hash;
#else
   init_veltor_ctx();
   gate->scanhash = (void*)&scanhash_veltor;
   gate->hash = (void*)&veltor_hash;
#endif
   gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
   gate->get_max64 = (void*)&get_max64_0x3ffff;
   return true;
};

32
algo/x14/veltor-gate.h
Normal file
@@ -0,0 +1,32 @@
#ifndef VELTOR_GATE_H__
#define VELTOR_GATE_H__ 1

#include "algo-gate-api.h"
#include <stdint.h>

#if defined(HASH_4WAY) && defined(__AES__)
#define VELTOR_4WAY
#endif

bool register_veltor_algo( algo_gate_t* gate );

#if defined(VELTOR_4WAY)

void veltor_4way_hash( void *state, const void *input );

int scanhash_veltor_4way( int thr_id, struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done );

void init_veltor_4way_ctx();

#endif

void veltor_hash( void *state, const void *input );

int scanhash_veltor( int thr_id, struct work *work, uint32_t max_nonce,
                     uint64_t *hashes_done );

void init_veltor_ctx();

#endif

@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "veltor-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -34,7 +34,7 @@ void veltor_skein512_midstate( const void* input )
   sph_skein512( &veltor_skein_mid, input, 64 );
}

void veltorhash(void *output, const void *input)
void veltor_hash(void *output, const void *input)
{
   uint32_t _ALIGN(64) hashA[16], hashB[16];

@@ -85,7 +85,7 @@ int scanhash_veltor(int thr_id, struct work *work, uint32_t max_nonce, uint64_t

   do {
      be32enc(&endiandata[19], nonce);
      veltorhash(hash, endiandata);
      veltor_hash(hash, endiandata);

      if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
         work_set_target_ratio(work, hash);
@@ -101,14 +101,3 @@ int scanhash_veltor(int thr_id, struct work *work, uint32_t max_nonce, uint64_t
   *hashes_done = pdata[19] - first_nonce + 1;
   return 0;
}

bool register_veltor_algo( algo_gate_t* gate )
{
   gate->optimizations = SSE2_OPT | AES_OPT;
   init_veltor_ctx();
   gate->scanhash = (void*)&scanhash_veltor;
   gate->hash = (void*)&veltorhash;
   gate->get_max64 = (void*)&get_max64_0x3ffff;
   return true;
}

@@ -7,7 +7,7 @@
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
@@ -20,11 +20,11 @@
|
||||
#include "algo/echo/sph_echo.h"
|
||||
#include "algo/hamsi/sph_hamsi.h"
|
||||
#include "algo/fugue/sph_fugue.h"
|
||||
#include "algo/shabal/sph_shabal.h"
|
||||
#include "algo/shabal/shabal-hash-4way.h"
|
||||
|
||||
typedef struct {
|
||||
blake512_4way_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
bmw512_4way_context bmw;
|
||||
hashState_groestl groestl;
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
@@ -36,7 +36,7 @@ typedef struct {
|
||||
hashState_echo echo;
|
||||
sph_hamsi512_context hamsi;
|
||||
sph_fugue512_context fugue;
|
||||
sph_shabal512_context shabal;
|
||||
shabal512_4way_context shabal;
|
||||
} x14_4way_ctx_holder;
|
||||
|
||||
x14_4way_ctx_holder x14_4way_ctx __attribute__ ((aligned (64)));
|
||||
@@ -44,6 +44,7 @@ x14_4way_ctx_holder x14_4way_ctx __attribute__ ((aligned (64)));
|
||||
void init_x14_4way_ctx()
|
||||
{
|
||||
blake512_4way_init( &x14_4way_ctx.blake );
|
||||
bmw512_4way_init( &x14_4way_ctx.bmw );
|
||||
sph_bmw512_init( &x14_4way_ctx.bmw );
|
||||
init_groestl( &x14_4way_ctx.groestl, 64 );
|
||||
skein512_4way_init( &x14_4way_ctx.skein );
|
||||
@@ -56,7 +57,7 @@ void init_x14_4way_ctx()
|
||||
init_echo( &x14_4way_ctx.echo, 512 );
|
||||
sph_hamsi512_init( &x14_4way_ctx.hamsi );
|
||||
sph_fugue512_init( &x14_4way_ctx.fugue );
|
||||
sph_shabal512_init( &x14_4way_ctx.shabal );
|
||||
shabal512_4way_init( &x14_4way_ctx.shabal );
|
||||
};
|
||||
|
||||
void x14_4way_hash( void *state, const void *input )
|
||||
@@ -73,22 +74,13 @@ void x14_4way_hash( void *state, const void *input )
|
||||
blake512_4way( &ctx.blake, input, 80 );
|
||||
blake512_4way_close( &ctx.blake, vhash );
|
||||
|
||||
// 2 Bmw
|
||||
bmw512_4way( &ctx.bmw, vhash, 64 );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
// Serial
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
// 2 Bmw
|
||||
sph_bmw512( &ctx.bmw, hash0, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash0 );
|
||||
memcpy( &ctx.bmw, &x14_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash1, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash1 );
|
||||
memcpy( &ctx.bmw, &x14_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash2, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash2 );
|
||||
memcpy( &ctx.bmw, &x14_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash3, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash3 );
|
||||
|
||||
// 3 Groestl
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
memcpy( &ctx.groestl, &x14_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
@@ -113,7 +105,7 @@ void x14_4way_hash( void *state, const void *input )
|
||||
keccak512_4way( &ctx.keccak, vhash, 64 );
|
||||
keccak512_4way_close( &ctx.keccak, vhash );
|
||||
|
||||
// Serial to the end
|
||||
// Serial
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
// 7 Luffa
|
||||
@@ -144,9 +136,9 @@ void x14_4way_hash( void *state, const void *input )
|
||||
memcpy( &ctx.shavite, &x14_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash1, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||
memcpy( &ctx.shavite, &x14_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||
sph_shavite512( &ctx.shavite, hash2, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash2 );
|
||||
memcpy( &ctx.shavite, &x14_4way_ctx.shavite,
|
||||
@@ -206,19 +198,12 @@ void x14_4way_hash( void *state, const void *input )
|
||||
sph_fugue512( &ctx.fugue, hash3, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||
|
||||
// 14 Shabal
|
||||
sph_shabal512( &ctx.shabal, hash0, 64 );
|
||||
sph_shabal512_close( &ctx.shabal, hash0 );
|
||||
memcpy( &ctx.shabal, &x14_4way_ctx.shabal, sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash1, 64 );
|
||||
sph_shabal512_close( &ctx.shabal, hash1 );
|
||||
memcpy( &ctx.shabal, &x14_4way_ctx.shabal, sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash2, 64 );
|
||||
sph_shabal512_close( &ctx.shabal, hash2 );
|
||||
memcpy( &ctx.shabal, &x14_4way_ctx.shabal, sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash3, 64 );
|
||||
sph_shabal512_close( &ctx.shabal, hash3 );
|
||||
|
||||
// 14 Shabal, parallel 32 bit
|
||||
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
shabal512_4way( &ctx.shabal, vhash, 64 );
|
||||
shabal512_4way_close( &ctx.shabal, vhash );
|
||||
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
memcpy( state, hash0, 32 );
|
||||
memcpy( state+32, hash1, 32 );
|
||||
memcpy( state+64, hash2, 32 );
|
||||
|
@@ -11,7 +11,7 @@ bool register_x14_algo( algo_gate_t* gate )
   gate->scanhash = (void*)&scanhash_x14;
   gate->hash = (void*)&x14hash;
#endif
   gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
   gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
   gate->get_max64 = (void*)&get_max64_0x3ffff;
   return true;
};

@@ -7,7 +7,7 @@
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
@@ -20,12 +20,12 @@
|
||||
#include "algo/echo/sph_echo.h"
|
||||
#include "algo/hamsi/sph_hamsi.h"
|
||||
#include "algo/fugue/sph_fugue.h"
|
||||
#include "algo/shabal/sph_shabal.h"
|
||||
#include "algo/shabal/shabal-hash-4way.h"
|
||||
#include "algo/whirlpool/sph_whirlpool.h"
|
||||
|
||||
typedef struct {
|
||||
blake512_4way_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
bmw512_4way_context bmw;
|
||||
hashState_groestl groestl;
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
@@ -37,7 +37,7 @@ typedef struct {
|
||||
hashState_echo echo;
|
||||
sph_hamsi512_context hamsi;
|
||||
sph_fugue512_context fugue;
|
||||
sph_shabal512_context shabal;
|
||||
shabal512_4way_context shabal;
|
||||
sph_whirlpool_context whirlpool;
|
||||
} x15_4way_ctx_holder;
|
||||
|
||||
@@ -46,6 +46,7 @@ x15_4way_ctx_holder x15_4way_ctx __attribute__ ((aligned (64)));
|
||||
void init_x15_4way_ctx()
|
||||
{
|
||||
blake512_4way_init( &x15_4way_ctx.blake );
|
||||
bmw512_4way_init( &x15_4way_ctx.bmw );
|
||||
sph_bmw512_init( &x15_4way_ctx.bmw );
|
||||
init_groestl( &x15_4way_ctx.groestl, 64 );
|
||||
skein512_4way_init( &x15_4way_ctx.skein );
|
||||
@@ -58,7 +59,7 @@ void init_x15_4way_ctx()
|
||||
init_echo( &x15_4way_ctx.echo, 512 );
|
||||
sph_hamsi512_init( &x15_4way_ctx.hamsi );
|
||||
sph_fugue512_init( &x15_4way_ctx.fugue );
|
||||
sph_shabal512_init( &x15_4way_ctx.shabal );
|
||||
shabal512_4way_init( &x15_4way_ctx.shabal );
|
||||
sph_whirlpool_init( &x15_4way_ctx.whirlpool );
|
||||
};
|
||||
|
||||
@@ -76,22 +77,13 @@ void x15_4way_hash( void *state, const void *input )
|
||||
blake512_4way( &ctx.blake, input, 80 );
|
||||
blake512_4way_close( &ctx.blake, vhash );
|
||||
|
||||
// 2 Bmw
|
||||
bmw512_4way( &ctx.bmw, vhash, 64 );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
// Serial
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
// 2 Bmw
|
||||
sph_bmw512( &ctx.bmw, hash0, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash0 );
|
||||
memcpy( &ctx.bmw, &x15_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash1, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash1 );
|
||||
memcpy( &ctx.bmw, &x15_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash2, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash2 );
|
||||
memcpy( &ctx.bmw, &x15_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash3, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash3 );
|
||||
|
||||
// 3 Groestl
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
memcpy( &ctx.groestl, &x15_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
@@ -209,18 +201,11 @@ void x15_4way_hash( void *state, const void *input )
|
||||
sph_fugue512( &ctx.fugue, hash3, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||
|
||||
// 14 Shabal
|
||||
sph_shabal512( &ctx.shabal, hash0, 64 );
|
||||
sph_shabal512_close( &ctx.shabal, hash0 );
|
||||
memcpy( &ctx.shabal, &x15_4way_ctx.shabal, sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash1, 64 );
|
||||
sph_shabal512_close( &ctx.shabal, hash1 );
|
||||
memcpy( &ctx.shabal, &x15_4way_ctx.shabal, sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash2, 64 );
|
||||
sph_shabal512_close( &ctx.shabal, hash2 );
|
||||
memcpy( &ctx.shabal, &x15_4way_ctx.shabal, sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash3, 64 );
|
||||
sph_shabal512_close( &ctx.shabal, hash3 );
|
||||
// 14 Shabal, parallel 32 bit
|
||||
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
shabal512_4way( &ctx.shabal, vhash, 64 );
|
||||
shabal512_4way_close( &ctx.shabal, vhash );
|
||||
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
// 15 Whirlpool
|
||||
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
|
||||
|
@@ -11,7 +11,7 @@ bool register_x15_algo( algo_gate_t* gate )
   gate->scanhash = (void*)&scanhash_x15;
   gate->hash = (void*)&x15hash;
#endif
   gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
   gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
   return true;
};

@@ -7,7 +7,7 @@
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
@@ -17,17 +17,16 @@
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/simd/sse2/nist.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#include "algo/echo/sph_echo.h"
|
||||
#include "algo/hamsi/sph_hamsi.h"
|
||||
#include "algo/fugue/sph_fugue.h"
|
||||
#include "algo/shabal/sph_shabal.h"
|
||||
#include "algo/shabal/shabal-hash-4way.h"
|
||||
#include "algo/whirlpool/sph_whirlpool.h"
|
||||
#include "algo/haval/sph-haval.h"
|
||||
#include <openssl/sha.h>
|
||||
|
||||
typedef struct {
|
||||
blake512_4way_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
bmw512_4way_context bmw;
|
||||
hashState_groestl groestl;
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
@@ -39,7 +38,7 @@ typedef struct {
|
||||
hashState_echo echo;
|
||||
sph_hamsi512_context hamsi;
|
||||
sph_fugue512_context fugue;
|
||||
sph_shabal512_context shabal;
|
||||
shabal512_4way_context shabal;
|
||||
sph_whirlpool_context whirlpool;
|
||||
SHA512_CTX sha512;
|
||||
sph_haval256_5_context haval;
|
||||
@@ -50,7 +49,7 @@ x17_4way_ctx_holder x17_4way_ctx __attribute__ ((aligned (64)));
|
||||
void init_x17_4way_ctx()
|
||||
{
|
||||
blake512_4way_init( &x17_4way_ctx.blake );
|
||||
sph_bmw512_init( &x17_4way_ctx.bmw );
|
||||
bmw512_4way_init( &x17_4way_ctx.bmw );
|
||||
init_groestl( &x17_4way_ctx.groestl, 64 );
|
||||
skein512_4way_init( &x17_4way_ctx.skein );
|
||||
jh512_4way_init( &x17_4way_ctx.jh );
|
||||
@@ -62,8 +61,7 @@ void init_x17_4way_ctx()
|
||||
init_echo( &x17_4way_ctx.echo, 512 );
|
||||
sph_hamsi512_init( &x17_4way_ctx.hamsi );
|
||||
sph_fugue512_init( &x17_4way_ctx.fugue );
|
||||
sph_shabal512_init( &x17_4way_ctx.shabal );
|
||||
sph_whirlpool_init( &x17_4way_ctx.whirlpool );
|
||||
shabal512_4way_init( &x17_4way_ctx.shabal );
|
||||
SHA512_Init( &x17_4way_ctx.sha512 );
|
||||
sph_haval256_5_init( &x17_4way_ctx.haval );
|
||||
};
|
||||
@@ -82,22 +80,13 @@ void x17_4way_hash( void *state, const void *input )
|
||||
blake512_4way( &ctx.blake, input, 80 );
|
||||
blake512_4way_close( &ctx.blake, vhash );
|
||||
|
||||
// 2 Bmw
|
||||
bmw512_4way( &ctx.bmw, vhash, 64 );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
// Serial
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
// 2 Bmw
|
||||
sph_bmw512( &ctx.bmw, hash0, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash0 );
|
||||
memcpy( &ctx.bmw, &x17_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash1, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash1 );
|
||||
memcpy( &ctx.bmw, &x17_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash2, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash2 );
|
||||
memcpy( &ctx.bmw, &x17_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash3, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash3 );
|
||||
|
||||
// 3 Groestl
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
memcpy( &ctx.groestl, &x17_4way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
@@ -215,18 +204,11 @@ void x17_4way_hash( void *state, const void *input )
|
||||
sph_fugue512( &ctx.fugue, hash3, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||
|
||||
// 14 Shabal
|
||||
sph_shabal512( &ctx.shabal, hash0, 64 );
|
||||
sph_shabal512_close( &ctx.shabal, hash0 );
|
||||
memcpy( &ctx.shabal, &x17_4way_ctx.shabal, sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash1, 64 );
|
||||
sph_shabal512_close( &ctx.shabal, hash1 );
|
||||
memcpy( &ctx.shabal, &x17_4way_ctx.shabal, sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash2, 64 );
|
||||
sph_shabal512_close( &ctx.shabal, hash2 );
|
||||
memcpy( &ctx.shabal, &x17_4way_ctx.shabal, sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash3, 64 );
|
||||
sph_shabal512_close( &ctx.shabal, hash3 );
|
||||
// 14 Shabal, parallel 32 bit
|
||||
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
shabal512_4way( &ctx.shabal, vhash, 64 );
|
||||
shabal512_4way_close( &ctx.shabal, vhash );
|
||||
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
// 15 Whirlpool
|
||||
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
|
||||
|
@@ -11,7 +11,7 @@ bool register_x17_algo( algo_gate_t* gate )
   gate->scanhash = (void*)&scanhash_x17;
   gate->hash = (void*)&x17_hash;
#endif
   gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
   gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
   return true;
};

@@ -7,7 +7,7 @@
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
@@ -19,7 +19,7 @@
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#include "algo/hamsi/sph_hamsi.h"
|
||||
#include "algo/fugue/sph_fugue.h"
|
||||
#include "algo/shabal/sph_shabal.h"
|
||||
#include "algo/shabal/shabal-hash-4way.h"
|
||||
#include "algo/whirlpool/sph_whirlpool.h"
|
||||
#include "algo/sha/sph_sha2.h"
|
||||
#include "algo/haval/sph-haval.h"
|
||||
@@ -27,7 +27,7 @@
|
||||
|
||||
typedef struct {
|
||||
blake512_4way_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
bmw512_4way_context bmw;
|
||||
hashState_groestl groestl;
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
@@ -39,7 +39,7 @@ typedef struct {
|
||||
hashState_echo echo;
|
||||
sph_hamsi512_context hamsi;
|
||||
sph_fugue512_context fugue;
|
||||
sph_shabal512_context shabal;
|
||||
shabal512_4way_context shabal;
|
||||
sph_whirlpool_context whirlpool;
|
||||
SHA512_CTX sha512;
|
||||
sph_haval256_5_context haval;
|
||||
@@ -52,7 +52,7 @@ static __thread blake512_4way_context xevan_blake_4way_mid
|
||||
void init_xevan_4way_ctx()
|
||||
{
|
||||
blake512_4way_init(&xevan_4way_ctx.blake);
|
||||
sph_bmw512_init(&xevan_4way_ctx.bmw);
|
||||
bmw512_4way_init( &xevan_4way_ctx.bmw );
|
||||
init_groestl( &xevan_4way_ctx.groestl, 64 );
|
||||
skein512_4way_init(&xevan_4way_ctx.skein);
|
||||
jh512_4way_init(&xevan_4way_ctx.jh);
|
||||
@@ -64,7 +64,7 @@ void init_xevan_4way_ctx()
|
||||
init_echo( &xevan_4way_ctx.echo, 512 );
|
||||
sph_hamsi512_init( &xevan_4way_ctx.hamsi );
|
||||
sph_fugue512_init( &xevan_4way_ctx.fugue );
|
||||
sph_shabal512_init( &xevan_4way_ctx.shabal );
|
||||
shabal512_4way_init( &xevan_4way_ctx.shabal );
|
||||
sph_whirlpool_init( &xevan_4way_ctx.whirlpool );
|
||||
SHA512_Init( &xevan_4way_ctx.sha512 );
|
||||
sph_haval256_5_init( &xevan_4way_ctx.haval );
|
||||
@@ -90,25 +90,18 @@ void xevan_4way_hash( void *output, const void *input )
|
||||
xevan_4way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
memcpy( &ctx, &xevan_4way_ctx, sizeof(xevan_4way_ctx) );
|
||||
|
||||
// parallel way
|
||||
memcpy( &ctx.blake, &xevan_blake_4way_mid,
|
||||
sizeof(xevan_blake_4way_mid) );
|
||||
blake512_4way( &ctx.blake, input + (midlen<<2), tail );
|
||||
blake512_4way_close(&ctx.blake, vhash);
|
||||
|
||||
memset( &vhash[8<<2], 0, 64<<2 );
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
|
||||
|
||||
sph_bmw512( &ctx.bmw, hash0, dataLen );
|
||||
sph_bmw512_close( &ctx.bmw, hash0 );
|
||||
memcpy( &ctx.bmw, &xevan_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash1, dataLen );
|
||||
sph_bmw512_close( &ctx.bmw, hash1 );
|
||||
memcpy( &ctx.bmw, &xevan_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash2, dataLen );
|
||||
sph_bmw512_close( &ctx.bmw, hash2 );
|
||||
memcpy( &ctx.bmw, &xevan_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash3, dataLen );
|
||||
sph_bmw512_close( &ctx.bmw, hash3 );
|
||||
bmw512_4way( &ctx.bmw, vhash, dataLen );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
// Serial
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
|
||||
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
|
||||
dataLen<<3 );
|
||||
@@ -122,6 +115,7 @@ void xevan_4way_hash( void *output, const void *input )
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3,
|
||||
dataLen<<3 );
|
||||
|
||||
// Parallel 4way
|
||||
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
|
||||
|
||||
skein512_4way( &ctx.skein, vhash, dataLen );
|
||||
@@ -133,6 +127,7 @@ void xevan_4way_hash( void *output, const void *input )
|
||||
keccak512_4way( &ctx.keccak, vhash, dataLen );
|
||||
keccak512_4way_close( &ctx.keccak, vhash );
|
||||
|
||||
// Serial
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
|
||||
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
|
||||
@@ -222,21 +217,13 @@ void xevan_4way_hash( void *output, const void *input )
|
||||
sph_fugue512( &ctx.fugue, hash3, dataLen );
|
||||
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||
|
||||
sph_shabal512( &ctx.shabal, hash0, dataLen );
|
||||
sph_shabal512_close( &ctx.shabal, hash0 );
|
||||
memcpy( &ctx.shabal, &xevan_4way_ctx.shabal,
|
||||
sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash1, dataLen );
|
||||
sph_shabal512_close( &ctx.shabal, hash1 );
|
||||
memcpy( &ctx.shabal, &xevan_4way_ctx.shabal,
|
||||
sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash2, dataLen );
|
||||
sph_shabal512_close( &ctx.shabal, hash2 );
|
||||
memcpy( &ctx.shabal, &xevan_4way_ctx.shabal,
|
||||
sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash3, dataLen );
|
||||
sph_shabal512_close( &ctx.shabal, hash3 );
|
||||
// Parallel 4way
|
||||
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
|
||||
shabal512_4way( &ctx.shabal, vhash, dataLen );
|
||||
shabal512_4way_close( &ctx.shabal, vhash );
|
||||
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
|
||||
|
||||
// Serial
|
||||
sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
|
||||
sph_whirlpool_close( &ctx.whirlpool, hash0 );
|
||||
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
|
||||
@@ -286,19 +273,10 @@ void xevan_4way_hash( void *output, const void *input )
|
||||
blake512_4way( &ctx.blake, vhash, dataLen );
|
||||
blake512_4way_close(&ctx.blake, vhash);
|
||||
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
|
||||
bmw512_4way( &ctx.bmw, vhash, dataLen );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
sph_bmw512( &ctx.bmw, hash0, dataLen );
|
||||
sph_bmw512_close( &ctx.bmw, hash0 );
|
||||
memcpy( &ctx.bmw, &xevan_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash1, dataLen );
|
||||
sph_bmw512_close( &ctx.bmw, hash1 );
|
||||
memcpy( &ctx.bmw, &xevan_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash2, dataLen );
|
||||
sph_bmw512_close( &ctx.bmw, hash2 );
|
||||
memcpy( &ctx.bmw, &xevan_4way_ctx.bmw, sizeof(sph_bmw512_context) );
|
||||
sph_bmw512( &ctx.bmw, hash3, dataLen );
|
||||
sph_bmw512_close( &ctx.bmw, hash3 );
|
||||
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
|
||||
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
|
||||
dataLen<<3 );
|
||||
@@ -412,20 +390,10 @@ void xevan_4way_hash( void *output, const void *input )
|
||||
sph_fugue512( &ctx.fugue, hash3, dataLen );
|
||||
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||
|
||||
sph_shabal512( &ctx.shabal, hash0, dataLen );
|
||||
sph_shabal512_close( &ctx.shabal, hash0 );
|
||||
memcpy( &ctx.shabal, &xevan_4way_ctx.shabal,
|
||||
sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash1, dataLen );
|
||||
sph_shabal512_close( &ctx.shabal, hash1 );
|
||||
memcpy( &ctx.shabal, &xevan_4way_ctx.shabal,
|
||||
sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash2, dataLen );
|
||||
sph_shabal512_close( &ctx.shabal, hash2 );
|
||||
memcpy( &ctx.shabal, &xevan_4way_ctx.shabal,
|
||||
sizeof(sph_shabal512_context) );
|
||||
sph_shabal512( &ctx.shabal, hash3, dataLen );
|
||||
sph_shabal512_close( &ctx.shabal, hash3 );
|
||||
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
|
||||
shabal512_4way( &ctx.shabal, vhash, dataLen );
|
||||
shabal512_4way_close( &ctx.shabal, vhash );
|
||||
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
|
||||
|
||||
sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
|
||||
sph_whirlpool_close( &ctx.whirlpool, hash0 );
|
||||
@@ -480,7 +448,6 @@ int scanhash_xevan_4way( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
{
|
||||
uint32_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
// uint32_t _ALIGN(64) hash[8];
|
||||
uint32_t _ALIGN(64) endiandata[20];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
|
@@ -16,7 +16,7 @@ bool register_xevan_algo( algo_gate_t* gate )
   gate->scanhash = (void*)&scanhash_xevan;
   gate->hash = (void*)&xevan_hash;
#endif
   gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
   gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
   gate->set_target = (void*)&xevan_set_target;
   gate->get_max64 = (void*)&get_max64_0xffffLL;
   return true;
@@ -438,6 +438,20 @@ bool register_yescrypt_algo( algo_gate_t* gate )
   return true;
}

bool register_yescryptr8_algo( algo_gate_t* gate )
{
   gate->optimizations = SSE2_OPT | SHA_OPT;
   gate->scanhash = (void*)&scanhash_yescrypt;
   gate->hash = (void*)&yescrypt_hash;
   gate->set_target = (void*)&scrypt_set_target;
   gate->get_max64 = (void*)&yescrypt_get_max64;
   client_key_hack = false;
   YESCRYPT_N = 2048;
   YESCRYPT_R = 8;
   YESCRYPT_P = 1;
   return true;
}

bool register_yescryptr16_algo( algo_gate_t* gate )
{
   gate->optimizations = SSE2_OPT | SHA_OPT;
184
avxdefs.h
@@ -37,7 +37,7 @@
#define mm_one_16 _mm_set1_epi16( 1U )

// Constant minus 1
#define mm_neg1 _mm_set1_epi64x( 0xFFFFFFFFUL )
#define mm_neg1 _mm_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL )

//
// Basic operations without equivalent SIMD intrinsic
@@ -55,11 +55,11 @@

// Return bit n in position, all other bits zeroed.
#define mm_bitextract_64 ( x, n ) \
   _mm_and_si128( _mm_set1_epi64x( 1ULL << (n) ), x )
   _mm_and_si128( _mm_slli_epi64( mm_one_64, n ), x )
#define mm_bitextract_32 ( x, n ) \
   _mm_and_si128( _mm_set1_epi32( 1UL << (n) ), x )
   _mm_and_si128( _mm_slli_epi32( mm_one_32, n ), x )
#define mm_bitextract_16 ( x, n ) \
   _mm_and_si128( _mm_set1_epi16( 1U << (n) ), x )
   _mm_and_si128( _mm_slli_epi16( mm_one_16, n ), x )

// Return bit n as bool
#define mm_bittest_64( x, n ) \
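Two real fixes sit in the hunks above: mm_neg1 previously set only the low 32 bits of each 64-bit element, and the bit-extract family now builds its mask by shifting a vector of ones instead of broadcasting a scalar shift. The snippet below exercises the corrected forms on SSE2. It is a standalone test sketch, not part of avxdefs.h: mm_one_64 is defined locally on the same pattern as mm_one_16, and the macro is written without the space after its name so it expands as a function-like macro.

#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>
#include <emmintrin.h>

#define mm_one_64  _mm_set1_epi64x( 1ULL )
#define mm_neg1    _mm_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL )
#define mm_bitextract_64( x, n ) \
   _mm_and_si128( _mm_slli_epi64( mm_one_64, n ), x )

int main()
{
   __m128i v = _mm_set_epi64x( 0x8000000000000001ULL, 0x10ULL );
   uint64_t out[2];

   // Bit 4 is set only in the low element:
   // prints 0000000000000000 0000000000000010
   _mm_storeu_si128( (__m128i*)out, mm_bitextract_64( v, 4 ) );
   printf( "bit 4 : %016" PRIx64 " %016" PRIx64 "\n", out[1], out[0] );

   // All 64 bits of each element are now ones.
   _mm_storeu_si128( (__m128i*)out, mm_neg1 );
   printf( "neg1  : %016" PRIx64 " %016" PRIx64 "\n", out[1], out[0] );
   return 0;
}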
@@ -343,11 +343,11 @@ inline __m128i mm_byteswap_16( __m128i x )
|
||||
|
||||
// return bit n in position, all othr bits cleared
|
||||
#define mm256_bitextract_64 ( x, n ) \
|
||||
_mm256_and_si128( _mm256_set1_epi64x( 0ULL << (n) ), x )
|
||||
_mm256_and_si128( _mm256_slli_epi64( mm256_one_64, n ), x )
|
||||
#define mm256_bitextract_32 ( x, n ) \
|
||||
_mm256_and_si128( _mm256_set1_epi32( 0UL << (n) ), x )
|
||||
_mm256_and_si128( _mm256_slli_epi32( mm256_one_32, n ), x )
|
||||
#define mm256_bitextract_16 ( x, n ) \
|
||||
_mm256_and_si128( _mm256_set1_epi16( 0U << (n) ), x )
|
||||
_mm256_and_si128( _mm256_slli_epi16( mm256_one_16, n ), x )
|
||||
|
||||
// Return bit n as bool (bit 0)
|
||||
#define mm256_bittest_64( x, n ) \
|
||||
@@ -359,17 +359,17 @@ inline __m128i mm_byteswap_16( __m128i x )
|
||||
|
||||
// Return x with bit n set/cleared in all elements
|
||||
#define mm256_bitset_64( x, n ) \
|
||||
_mm256_or_si256( _mm256_set1_epi64x( 1ULL << (n) ), x )
|
||||
_mm256_or_si256( _mm256_slli_epi64( mm256_one_64, n ), x )
|
||||
#define mm256_bitclr_64( x, n ) \
|
||||
_mm256_andnot_si256( _mm256_set1_epi64x( 1ULL << (n) ), x )
|
||||
_mm256_andnot_si256( _mm256_slli_epi64( mm256_one_64, n ), x )
|
||||
#define mm256_bitset_32( x, n ) \
|
||||
_mm256_or_si256( _mm256_set1_epi32( 1UL << (n) ), x )
|
||||
_mm256_or_si256( _mm256_slli_epi32( mm256_one_32, n ), x )
|
||||
#define mm256_bitclr_32( x, n ) \
|
||||
_mm256_andnot_si256( mm256_not( _mm256_set1_epi32( 1UL << (n) ), x )
|
||||
_mm256_andnot_si256( _mm256_slli_epi32( mm256_one_32, n ), x )
|
||||
#define mm256_bitset_16( x, n ) \
|
||||
_mm256_or_si256( _mm256_set1_epi16( 1U << (n) ), x )
|
||||
_mm256_or_si256( _mm256_slli_epi16( mm256_one_16, n ), x )
|
||||
#define mm256_bitclr_16( x, n ) \
|
||||
_mm256_andnot_si256( _mm256_set1_epi16( 1U << (n) ), x )
|
||||
_mm256_andnot_si256( _mm256_slli_epi16( mm256_one_16, n ), x )
|
||||
|
||||
// Return x with bit n toggled
|
||||
#define mm256_bitflip_64( x, n ) \
|
||||
@@ -448,22 +448,21 @@ inline bool memcmp_256( __m256i src1, __m256i src2, int n )
|
||||
// shift, a little more work is needed.
|
||||
|
||||
// Optimized 64 bit permutations
|
||||
// Swap 128, aka rotate 2x64, 4x32, 8x16, 16x8
|
||||
// Swap 128 bit elements in 256 bit vector
|
||||
#define mm256_swap_128( w ) _mm256_permute4x64_epi64( w, 0x4e )
|
||||
//#define mm256_swap_128( w ) _mm256_permute2x128_si256( w, w, 1 )
|
||||
|
||||
// Rotate 256 bit vector by one 64 bit element, aka 2x32, 4x16, 8x8
|
||||
// Rotate 256 bit vector by one 64 bit element
|
||||
#define mm256_rotl256_1x64( w ) _mm256_permute4x64_epi64( w, 0x93 )
|
||||
#define mm256_rotr256_1x64( w ) _mm256_permute4x64_epi64( w, 0x39 )
|
||||
|
||||
// Swap hi/lo 64 bits in each 128 bit element
|
||||
// Swap 64 bits in each 128 bit element of 256 bit vector
|
||||
#define mm256_swap128_64( x ) _mm256_shuffle_epi32( x, 0x4e )
|
||||
|
||||
// Rotate 128 bit elements by 32 bits
|
||||
// Rotate 128 bit elements in 256 bit vector by 32 bits
|
||||
#define mm256_rotr128_1x32( x ) _mm256_shuffle_epi32( x, 0x39 )
|
||||
#define mm256_rotl128_1x32( x ) _mm256_shuffle_epi32( x, 0x93 )
|
||||
|
||||
// Swap hi/lo 32 bits in each 64 bit element
|
||||
// Swap 32 bits in each 64 bit element olf 256 bit vector
|
||||
#define mm256_swap64_32( x ) _mm256_shuffle_epi32( x, 0xb1 )
|
||||
|
||||
// Less efficient but more versatile. Use only for rotations that are not
|
||||
@@ -487,9 +486,9 @@ inline bool memcmp_256( __m256i src1, __m256i src2, int n )
|
||||
// Rotate two 256 bit vectors as one 512 bit vector
|
||||
|
||||
// Fast but limited to 128 bit granularity
|
||||
#define mm256_swap512_256(a, b) _mm256_permute2x128_si256( a, b, 0x1032 )
|
||||
#define mm256_rotr512_1x128(a, b) _mm256_permute2x128_si256( a, b, 0x0321 )
|
||||
#define mm256_rotl512_1x128(a, b) _mm256_permute2x128_si256( a, b, 0x2103 )
|
||||
#define mm256_swap512_256(a, b) _mm256_permute2x128_si256( a, b, 0x4e )
|
||||
#define mm256_rotr512_1x128(a, b) _mm256_permute2x128_si256( a, b, 0x39 )
|
||||
#define mm256_rotl512_1x128(a, b) _mm256_permute2x128_si256( a, b, 0x93 )
|
||||
|
||||
// Much slower, for 64 and 32 bit granularity
|
||||
#define mm256_rotr512_1x64(a, b) \
|
||||
@@ -677,6 +676,23 @@ inline void mm_interleave_4x32( void *dst, const void *src0, const void *src1,
|
||||
d[17] = _mm_set_epi32( s3[17], s2[17], s1[17], s0[17] );
|
||||
d[18] = _mm_set_epi32( s3[18], s2[18], s1[18], s0[18] );
|
||||
d[19] = _mm_set_epi32( s3[19], s2[19], s1[19], s0[19] );
|
||||
|
||||
if ( bit_len <= 640 ) return;
|
||||
|
||||
d[20] = _mm_set_epi32( s3[20], s2[20], s1[20], s0[20] );
|
||||
d[21] = _mm_set_epi32( s3[21], s2[21], s1[21], s0[21] );
|
||||
d[22] = _mm_set_epi32( s3[22], s2[22], s1[22], s0[22] );
|
||||
d[23] = _mm_set_epi32( s3[23], s2[23], s1[23], s0[23] );
|
||||
|
||||
d[24] = _mm_set_epi32( s3[24], s2[24], s1[24], s0[24] );
|
||||
d[25] = _mm_set_epi32( s3[25], s2[25], s1[25], s0[25] );
|
||||
d[26] = _mm_set_epi32( s3[26], s2[26], s1[26], s0[26] );
|
||||
d[27] = _mm_set_epi32( s3[27], s2[27], s1[27], s0[27] );
|
||||
d[28] = _mm_set_epi32( s3[28], s2[28], s1[28], s0[28] );
|
||||
d[29] = _mm_set_epi32( s3[29], s2[29], s1[29], s0[29] );
|
||||
d[30] = _mm_set_epi32( s3[30], s2[30], s1[30], s0[30] );
|
||||
d[31] = _mm_set_epi32( s3[31], s2[31], s1[31], s0[31] );
|
||||
// bit_len == 1024
|
||||
}
|
||||
|
||||
// bit_len must be multiple of 32
|
||||
@@ -735,6 +751,24 @@ inline void mm_deinterleave_4x32( void *dst0, void *dst1, void *dst2,
|
||||
d1[4] = _mm_set_epi32( s[77], s[73], s[69], s[65] );
|
||||
d2[4] = _mm_set_epi32( s[78], s[74], s[70], s[66] );
|
||||
d3[4] = _mm_set_epi32( s[79], s[75], s[71], s[67] );
|
||||
|
||||
if ( bit_len <= 640 ) return;
|
||||
|
||||
d0[5] = _mm_set_epi32( s[92], s[88], s[84], s[80] );
|
||||
d1[5] = _mm_set_epi32( s[93], s[89], s[85], s[81] );
|
||||
d2[5] = _mm_set_epi32( s[94], s[90], s[86], s[82] );
|
||||
d3[5] = _mm_set_epi32( s[95], s[91], s[87], s[83] );
|
||||
|
||||
d0[6] = _mm_set_epi32( s[108], s[104], s[100], s[ 96] );
|
||||
d1[6] = _mm_set_epi32( s[109], s[105], s[101], s[ 97] );
|
||||
d2[6] = _mm_set_epi32( s[110], s[106], s[102], s[ 98] );
|
||||
d3[6] = _mm_set_epi32( s[111], s[107], s[103], s[ 99] );
|
||||
|
||||
d0[7] = _mm_set_epi32( s[124], s[120], s[116], s[112] );
|
||||
d1[7] = _mm_set_epi32( s[125], s[121], s[117], s[113] );
|
||||
d2[7] = _mm_set_epi32( s[126], s[122], s[118], s[114] );
|
||||
d3[7] = _mm_set_epi32( s[127], s[123], s[119], s[115] );
|
||||
// bit_len == 1024
|
||||
}
|
||||
|
||||
// deinterleave 4 arrays into individual buffers for scalarm processing
|
||||
@@ -1074,6 +1108,41 @@ inline void mm256_deinterleave_8x32x( uint32_t *dst0, uint32_t *dst1,
|
||||
}
|
||||
}
|
||||
|
||||
// Can't do it in place
|
||||
inline void mm256_reinterleave_4x64x( void *dst, void *src, int bit_len )
|
||||
{
|
||||
__m256i* d = (__m256i*)dst;
|
||||
uint32_t *s = (uint32_t*)src;
|
||||
|
||||
d[0] = _mm256_set_epi32( s[7], s[3], s[6], s[2], s[5], s[1], s[4], s[0] );
|
||||
d[1] = _mm256_set_epi32( s[15],s[11],s[14],s[10],s[13],s[9],s[12], s[8] );
|
||||
d[2] = _mm256_set_epi32( s[23],s[19],s[22],s[18],s[21],s[17],s[20],s[16] );
|
||||
d[3] = _mm256_set_epi32( s[31],s[27],s[30],s[26],s[29],s[25],s[28],s[24] );
|
||||
|
||||
if ( bit_len <= 256 ) return;
|
||||
|
||||
d[4] = _mm256_set_epi32( s[39],s[35],s[38],s[34],s[37],s[33],s[36],s[32] );
|
||||
d[5] = _mm256_set_epi32( s[47],s[43],s[46],s[42],s[45],s[41],s[44],s[40] );
|
||||
d[6] = _mm256_set_epi32( s[55],s[51],s[54],s[50],s[53],s[49],s[52],s[48] );
|
||||
d[7] = _mm256_set_epi32( s[63],s[59],s[62],s[58],s[61],s[57],s[60],s[56] );
|
||||
|
||||
if ( bit_len <= 512 ) return;
|
||||
|
||||
d[8] = _mm256_set_epi32( s[71],s[67],s[70],s[66],s[69],s[65],s[68],s[64] );
|
||||
d[9] = _mm256_set_epi32( s[79],s[75],s[78],s[74],s[77],s[73],s[76],s[72] );
|
||||
|
||||
if ( bit_len <= 640 ) return;
|
||||
|
||||
d[10] = _mm256_set_epi32(s[87],s[83],s[86],s[82],s[85],s[81],s[84],s[80]);
|
||||
d[11] = _mm256_set_epi32(s[95],s[91],s[94],s[90],s[93],s[89],s[92],s[88]);
|
||||
|
||||
d[12] = _mm256_set_epi32(s[103],s[99],s[102],s[98],s[101],s[97],s[100],s[96]);
|
||||
d[13] = _mm256_set_epi32(s[111],s[107],s[110],s[106],s[109],s[105],s[108],s[104]);
|
||||
d[14] = _mm256_set_epi32(s[119],s[115],s[118],s[114],s[117],s[113],s[116],s[112]);
|
||||
d[15] = _mm256_set_epi32(s[127],s[123],s[126],s[122],s[125],s[121],s[124],s[120]);
|
||||
// bit_len == 1024
|
||||
}
|
||||
|
||||
// likely of no use.
|
||||
// convert 4x32 byte (128 bit) vectors to 4x64 (256 bit) vectors for AVX2
|
||||
// bit_len must be multiple of 64
|
||||
@@ -1081,35 +1150,70 @@ inline void mm256_reinterleave_4x64( uint64_t *dst, uint32_t *src,
|
||||
int bit_len )
|
||||
{
|
||||
uint32_t *d = (uint32_t*)dst;
|
||||
uint32_t *s = (uint32_t*)src;
|
||||
for ( int i = 0; i < bit_len >> 5; i += 8 )
|
||||
{
|
||||
*( d + i ) = *( src + i ); // 0 <- 0 8 <- 8
|
||||
*( d + i + 1 ) = *( src + i + 4 ); // 1 <- 4 9 <- 12
|
||||
*( d + i + 2 ) = *( src + i + 1 ); // 2 <- 1 10 <- 9
|
||||
*( d + i + 3 ) = *( src + i + 5 ); // 3 <- 5 11 <- 13
|
||||
*( d + i + 4 ) = *( src + i + 2 ); // 4 <- 2 12 <- 10
|
||||
*( d + i + 5 ) = *( src + i + 6 ); // 5 <- 6 13 <- 14
|
||||
*( d + i + 6 ) = *( src + i + 3 ); // 6 <- 3 14 <- 11
|
||||
*( d + i + 7 ) = *( src + i + 7 ); // 7 <- 7 15 <- 15
|
||||
*( d + i ) = *( s + i ); // 0 <- 0 8 <- 8
|
||||
*( d + i + 1 ) = *( s + i + 4 ); // 1 <- 4 9 <- 12
|
||||
*( d + i + 2 ) = *( s + i + 1 ); // 2 <- 1 10 <- 9
|
||||
*( d + i + 3 ) = *( s + i + 5 ); // 3 <- 5 11 <- 13
|
||||
*( d + i + 4 ) = *( s + i + 2 ); // 4 <- 2 12 <- 10
|
||||
*( d + i + 5 ) = *( s + i + 6 ); // 5 <- 6 13 <- 14
|
||||
*( d + i + 6 ) = *( s + i + 3 ); // 6 <- 3 14 <- 11
|
||||
*( d + i + 7 ) = *( s + i + 7 ); // 7 <- 7 15 <- 15
|
||||
}
|
||||
}
|
||||
|
||||
// convert 4x64 byte (256 bit) vectors to 4x32 (128 bit) vectors for AVX
|
||||
// bit_len must be multiple of 64
|
||||
inline void mm_reinterleave_4x32( uint32_t *dst, uint64_t *src,
|
||||
int bit_len )
|
||||
inline void mm256_reinterleave_4x32( void *dst, void *src, int bit_len )
|
||||
{
|
||||
__m256i *d = (__m256i*)dst;
|
||||
uint32_t *s = (uint32_t*)src;
|
||||
|
||||
d[0] = _mm256_set_epi32( s[ 7],s[ 5],s[ 3],s[ 1],s[ 6],s[ 4],s[ 2],s[ 0] );
|
||||
d[1] = _mm256_set_epi32( s[15],s[13],s[11],s[ 9],s[14],s[12],s[10],s[ 8] );
|
||||
d[2] = _mm256_set_epi32( s[23],s[21],s[19],s[17],s[22],s[20],s[18],s[16] );
|
||||
d[3] = _mm256_set_epi32( s[31],s[29],s[27],s[25],s[30],s[28],s[26],s[24] );
|
||||
|
||||
if ( bit_len <= 256 ) return;
|
||||
|
||||
d[4] = _mm256_set_epi32( s[39],s[37],s[35],s[33],s[38],s[36],s[34],s[32] );
|
||||
d[5] = _mm256_set_epi32( s[47],s[45],s[43],s[41],s[46],s[44],s[42],s[40] );
|
||||
d[6] = _mm256_set_epi32( s[55],s[53],s[51],s[49],s[54],s[52],s[50],s[48] );
|
||||
d[7] = _mm256_set_epi32( s[63],s[61],s[59],s[57],s[62],s[60],s[58],s[56] );
|
||||
|
||||
if ( bit_len <= 512 ) return;
|
||||
|
||||
d[8] = _mm256_set_epi32( s[71],s[69],s[67],s[65],s[70],s[68],s[66],s[64] );
|
||||
d[9] = _mm256_set_epi32( s[79],s[77],s[75],s[73],s[78],s[76],s[74],s[72] );
|
||||
|
||||
if ( bit_len <= 640 ) return;
|
||||
|
||||
d[10] = _mm256_set_epi32( s[87],s[85],s[83],s[81],s[86],s[84],s[82],s[80] );
|
||||
d[11] = _mm256_set_epi32( s[95],s[93],s[91],s[89],s[94],s[92],s[90],s[88] );
|
||||
|
||||
d[12] = _mm256_set_epi32( s[103],s[101],s[99],s[97],s[102],s[100],s[98],s[96] );
|
||||
d[13] = _mm256_set_epi32( s[111],s[109],s[107],s[105],s[110],s[108],s[106],s[104] );
|
||||
d[14] = _mm256_set_epi32( s[119],s[117],s[115],s[113],s[118],s[116],s[114],s[112] );
|
||||
d[15] = _mm256_set_epi32( s[127],s[125],s[123],s[121],s[126],s[124],s[122],s[120] );
|
||||
// bit_len == 1024
|
||||
}
|
||||
|
||||
inline void mm_reinterleave_4x32( void *dst, void *src, int bit_len )
|
||||
{
|
||||
uint32_t *d = (uint32_t*)dst;
|
||||
uint32_t *s = (uint32_t*)src;
|
||||
for ( int i = 0; i < bit_len >> 5; i +=8 )
|
||||
{
|
||||
*( dst + i ) = *( s + i );
|
||||
*( dst + i + 1 ) = *( s + i + 2 );
|
||||
*( dst + i + 2 ) = *( s + i + 4 );
|
||||
*( dst + i + 3 ) = *( s + i + 6 );
|
||||
*( dst + i + 4 ) = *( s + i + 1 );
|
||||
*( dst + i + 5 ) = *( s + i + 3 );
|
||||
*( dst + i + 6 ) = *( s + i + 5 );
|
||||
*( dst + i + 7 ) = *( s + i + 7 );
|
||||
*( d + i ) = *( s + i );
|
||||
*( d + i + 1 ) = *( s + i + 2 );
|
||||
*( d + i + 2 ) = *( s + i + 4 );
|
||||
*( d + i + 3 ) = *( s + i + 6 );
|
||||
*( d + i + 4 ) = *( s + i + 1 );
|
||||
*( d + i + 5 ) = *( s + i + 3 );
|
||||
*( d + i + 6 ) = *( s + i + 5 );
|
||||
*( d + i + 7 ) = *( s + i + 7 );
|
||||
}
|
||||
}
|
||||
|
||||
|
20
configure
vendored
@@ -1,6 +1,6 @@
|
||||
#! /bin/sh
|
||||
# Guess values for system-dependent variables and create Makefiles.
|
||||
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.8.
|
||||
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.9.
|
||||
#
|
||||
#
|
||||
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
|
||||
@@ -577,8 +577,8 @@ MAKEFLAGS=
|
||||
# Identity of this package.
|
||||
PACKAGE_NAME='cpuminer-opt'
|
||||
PACKAGE_TARNAME='cpuminer-opt'
|
||||
PACKAGE_VERSION='3.7.8'
|
||||
PACKAGE_STRING='cpuminer-opt 3.7.8'
|
||||
PACKAGE_VERSION='3.7.9'
|
||||
PACKAGE_STRING='cpuminer-opt 3.7.9'
|
||||
PACKAGE_BUGREPORT=''
|
||||
PACKAGE_URL=''
|
||||
|
||||
@@ -1321,7 +1321,7 @@ if test "$ac_init_help" = "long"; then
|
||||
# Omit some internal or obsolete options to make the list less imposing.
|
||||
# This message is too long to be a string in the A/UX 3.1 sh.
|
||||
cat <<_ACEOF
|
||||
\`configure' configures cpuminer-opt 3.7.8 to adapt to many kinds of systems.
|
||||
\`configure' configures cpuminer-opt 3.7.9 to adapt to many kinds of systems.
|
||||
|
||||
Usage: $0 [OPTION]... [VAR=VALUE]...
|
||||
|
||||
@@ -1392,7 +1392,7 @@ fi
|
||||
|
||||
if test -n "$ac_init_help"; then
|
||||
case $ac_init_help in
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 3.7.8:";;
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 3.7.9:";;
|
||||
esac
|
||||
cat <<\_ACEOF
|
||||
|
||||
@@ -1497,7 +1497,7 @@ fi
|
||||
test -n "$ac_init_help" && exit $ac_status
|
||||
if $ac_init_version; then
|
||||
cat <<\_ACEOF
|
||||
cpuminer-opt configure 3.7.8
|
||||
cpuminer-opt configure 3.7.9
|
||||
generated by GNU Autoconf 2.69
|
||||
|
||||
Copyright (C) 2012 Free Software Foundation, Inc.
|
||||
@@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF
|
||||
This file contains any messages produced by compilers while
|
||||
running configure, to aid debugging if configure makes a mistake.
|
||||
|
||||
It was created by cpuminer-opt $as_me 3.7.8, which was
|
||||
It was created by cpuminer-opt $as_me 3.7.9, which was
|
||||
generated by GNU Autoconf 2.69. Invocation command line was
|
||||
|
||||
$ $0 $@
|
||||
@@ -2981,7 +2981,7 @@ fi
|
||||
|
||||
# Define the identity of the package.
|
||||
PACKAGE='cpuminer-opt'
|
||||
VERSION='3.7.8'
|
||||
VERSION='3.7.9'
|
||||
|
||||
|
||||
cat >>confdefs.h <<_ACEOF
|
||||
@@ -6677,7 +6677,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
|
||||
# report actual input values of CONFIG_FILES etc. instead of their
|
||||
# values after options handling.
|
||||
ac_log="
|
||||
This file was extended by cpuminer-opt $as_me 3.7.8, which was
|
||||
This file was extended by cpuminer-opt $as_me 3.7.9, which was
|
||||
generated by GNU Autoconf 2.69. Invocation command line was
|
||||
|
||||
CONFIG_FILES = $CONFIG_FILES
|
||||
@@ -6743,7 +6743,7 @@ _ACEOF
|
||||
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
||||
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
|
||||
ac_cs_version="\\
|
||||
cpuminer-opt config.status 3.7.8
|
||||
cpuminer-opt config.status 3.7.9
|
||||
configured by $0, generated by GNU Autoconf 2.69,
|
||||
with options \\"\$ac_cs_config\\"
|
||||
|
||||
|
@@ -1,4 +1,4 @@
|
||||
AC_INIT([cpuminer-opt], [3.7.8])
|
||||
AC_INIT([cpuminer-opt], [3.7.9])
|
||||
|
||||
AC_PREREQ([2.59c])
|
||||
AC_CANONICAL_SYSTEM
|
||||
|
5
miner.h
@@ -546,6 +546,7 @@ enum algos {
        ALGO_X17,
        ALGO_XEVAN,
        ALGO_YESCRYPT,
        ALGO_YESCRYPTR8,
        ALGO_YESCRYPTR16,
        ALGO_ZR5,
        ALGO_COUNT
@@ -617,6 +618,7 @@ static const char* const algo_names[] = {
        "x17",
        "xevan",
        "yescrypt",
        "yescryptr8",
        "yescryptr16",
        "zr5",
        "\0"
@@ -741,8 +743,9 @@ Options:\n\
                          x14           X14\n\
                          x15           X15\n\
                          x17\n\
                          xevan         Bitsend\n\
                          xevan         Bitsend (BSD)\n\
                          yescrypt      Globlboost-Y (BSTY)\n\
                          yescryptr8    BitZeny (ZNY)\n\
                          yescryptr16   Yenten (YTN)\n\
                          zr5           Ziftr\n\
  -o, --url=URL         URL of mining server\n\
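The enum algos and algo_names[] hunks above are parallel arrays: ALGO_YESCRYPTR8 and the "yescryptr8" string are inserted at the same relative position because the option parser maps an --algo name to its enum value by index. A minimal, abridged sketch of that convention (illustrative only, not the actual miner.h tables):

#include <stdio.h>
#include <string.h>

enum algos { ALGO_YESCRYPT, ALGO_YESCRYPTR8, ALGO_YESCRYPTR16, ALGO_COUNT };

static const char* const algo_names[] = { "yescrypt", "yescryptr8", "yescryptr16" };

static int algo_from_name( const char *name )
{
   for ( int i = 0; i < ALGO_COUNT; i++ )
      if ( !strcmp( name, algo_names[i] ) ) return i;
   return -1;   // unknown algo
}

int main()
{
   printf( "yescryptr8 -> %d\n", algo_from_name( "yescryptr8" ) );  // 1 == ALGO_YESCRYPTR8
   return 0;
}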
@@ -31,6 +31,7 @@ CFLAGS="-O3 -march=core-avx2 -Wall -DFOUR_WAY" ./configure $F
make
mv cpuminer.exe release/cpuminer-4way.exe

make clean || echo clean
CFLAGS="-O3 -march=corei7-avx -msha -Wall" ./configure $F
make
strip -s cpuminer.exe