Compare commits

..

2 Commits

Author SHA1 Message Date
Jay D Dee
58030e2788 v3.20.2 2022-08-01 20:21:05 -04:00
Jay D Dee
1321ac474c v3.20.1 2022-07-26 18:36:40 -04:00
34 changed files with 844 additions and 7120 deletions

View File

@@ -285,11 +285,9 @@ cpuminer_SOURCES = \
algo/x22/x22i-gate.c \
algo/x22/x25x.c \
algo/x22/x25x-4way.c \
algo/yescrypt/yescrypt.c \
algo/yescrypt/yescrypt-best.c \
algo/yespower/yespower-gate.c \
algo/yespower/yespower-blake2b.c \
algo/yespower/crypto/blake2b-yp.c \
algo/yespower/crypto/hmac-blake2b.c \
algo/yespower/yescrypt-r8g.c \
algo/yespower/yespower-opt.c

View File

@@ -65,6 +65,20 @@ If not what makes it happen or not happen?
Change Log
----------
v3.20.2
Bit rotation optimizations to Blake256, Blake512, Blake2b, Blake2s & Lyra2-blake2b for SSE2 & AVX2.
Removed old unused yescrypt library and other unused code.
v3.20.1
sph_blake2b optimized 1-way SSSE3 & AVX2.
Removed duplicate Blake2b used by Power2b algo, will now use optimized sph_blake2b.
Removed imprecise hash & target display from rejected share log.
Share and target difficulty is now displayed only for low diificulty shares.
Updated configure.ac to check for AVX512 asm support.
Small optimization to Lyra2 SSE2.
v3.20.0
#375 Fixed segfault in algos using Groestl VAES due to use of uninitialized data.

View File

@@ -371,15 +371,11 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_X22I: rc = register_x22i_algo ( gate ); break;
case ALGO_X25X: rc = register_x25x_algo ( gate ); break;
case ALGO_XEVAN: rc = register_xevan_algo ( gate ); break;
case ALGO_YESCRYPT: rc = register_yescrypt_05_algo ( gate ); break;
// case ALGO_YESCRYPT: register_yescrypt_algo ( gate ); break;
case ALGO_YESCRYPTR8: rc = register_yescryptr8_05_algo ( gate ); break;
// case ALGO_YESCRYPTR8: register_yescryptr8_algo ( gate ); break;
case ALGO_YESCRYPT: rc = register_yescrypt_algo ( gate ); break;
case ALGO_YESCRYPTR8: rc = register_yescryptr8_algo ( gate ); break;
case ALGO_YESCRYPTR8G: rc = register_yescryptr8g_algo ( gate ); break;
case ALGO_YESCRYPTR16: rc = register_yescryptr16_05_algo( gate ); break;
// case ALGO_YESCRYPTR16: register_yescryptr16_algo ( gate ); break;
case ALGO_YESCRYPTR32: rc = register_yescryptr32_05_algo( gate ); break;
// case ALGO_YESCRYPTR32: register_yescryptr32_algo ( gate ); break;
case ALGO_YESCRYPTR16: rc = register_yescryptr16_algo ( gate ); break;
case ALGO_YESCRYPTR32: rc = register_yescryptr32_algo ( gate ); break;
case ALGO_YESPOWER: rc = register_yespower_algo ( gate ); break;
case ALGO_YESPOWERR16: rc = register_yespowerr16_algo ( gate ); break;
case ALGO_YESPOWER_B2B: rc = register_yespower_b2b_algo ( gate ); break;

View File

@@ -400,18 +400,18 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
// Blake-256 4 way
#define GS_4WAY( m0, m1, c0, c1, a, b, c, d ) \
do { \
{ \
a = _mm_add_epi32( _mm_add_epi32( a, b ), \
_mm_xor_si128( _mm_set1_epi32( c1 ), m0 ) ); \
d = mm128_ror_32( _mm_xor_si128( d, a ), 16 ); \
d = mm128_swap32_16( _mm_xor_si128( d, a ) ); \
c = _mm_add_epi32( c, d ); \
b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \
a = _mm_add_epi32( _mm_add_epi32( a, b ), \
_mm_xor_si128( _mm_set1_epi32( c0 ), m1 ) ); \
d = mm128_ror_32( _mm_xor_si128( d, a ), 8 ); \
d = mm128_shuflr32_8( _mm_xor_si128( d, a ) ); \
c = _mm_add_epi32( c, d ); \
b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \
} while (0)
}
#if SPH_COMPACT_BLAKE_32
@@ -441,7 +441,8 @@ do { \
#else
#define ROUND_S_4WAY(r) do { \
#define ROUND_S_4WAY(r) \
{ \
GS_4WAY(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
GS_4WAY(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
GS_4WAY(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
@@ -450,7 +451,7 @@ do { \
GS_4WAY(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
GS_4WAY(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
GS_4WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
} while (0)
}
#endif
@@ -537,7 +538,7 @@ do { \
#if defined(__SSSE3__)
#define BLAKE256_4WAY_BLOCK_BSWAP32 do \
#define BLAKE256_4WAY_BLOCK_BSWAP32 \
{ \
__m128i shuf_bswap32 = _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
0x0405060700010203 ); \
@@ -557,11 +558,11 @@ do { \
MD = _mm_shuffle_epi8( buf[13], shuf_bswap32 ); \
ME = _mm_shuffle_epi8( buf[14], shuf_bswap32 ); \
MF = _mm_shuffle_epi8( buf[15], shuf_bswap32 ); \
} while(0)
}
#else // SSE2
#define BLAKE256_4WAY_BLOCK_BSWAP32 do \
#define BLAKE256_4WAY_BLOCK_BSWAP32 \
{ \
M0 = mm128_bswap_32( buf[0] ); \
M1 = mm128_bswap_32( buf[1] ); \
@@ -579,12 +580,12 @@ do { \
MD = mm128_bswap_32( buf[13] ); \
ME = mm128_bswap_32( buf[14] ); \
MF = mm128_bswap_32( buf[15] ); \
} while(0)
}
#endif // SSSE3 else SSE2
#define COMPRESS32_4WAY( rounds ) \
do { \
{ \
__m128i M0, M1, M2, M3, M4, M5, M6, M7; \
__m128i M8, M9, MA, MB, MC, MD, ME, MF; \
__m128i V0, V1, V2, V3, V4, V5, V6, V7; \
@@ -631,7 +632,7 @@ do { \
H5 = _mm_xor_si128( _mm_xor_si128( VD, V5 ), H5 ); \
H6 = _mm_xor_si128( _mm_xor_si128( VE, V6 ), H6 ); \
H7 = _mm_xor_si128( _mm_xor_si128( VF, V7 ), H7 ); \
} while (0)
}
#endif
@@ -642,20 +643,21 @@ do { \
// Blake-256 8 way
#define GS_8WAY( m0, m1, c0, c1, a, b, c, d ) \
do { \
{ \
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \
_mm256_xor_si256( _mm256_set1_epi32( c1 ), m0 ) ); \
d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \
d = mm256_swap32_16( _mm256_xor_si256( d, a ) ); \
c = _mm256_add_epi32( c, d ); \
b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \
_mm256_xor_si256( _mm256_set1_epi32( c0 ), m1 ) ); \
d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \
d = mm256_shuflr32_8( _mm256_xor_si256( d, a ) ); \
c = _mm256_add_epi32( c, d ); \
b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
} while (0)
}
#define ROUND_S_8WAY(r) do { \
#define ROUND_S_8WAY(r) \
{ \
GS_8WAY(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
GS_8WAY(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
GS_8WAY(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
@@ -664,7 +666,7 @@ do { \
GS_8WAY(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
GS_8WAY(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
GS_8WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
} while (0)
}
#define DECL_STATE32_8WAY \
__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
@@ -699,7 +701,7 @@ do { \
} while (0)
#define COMPRESS32_8WAY( rounds ) \
do { \
{ \
__m256i M0, M1, M2, M3, M4, M5, M6, M7; \
__m256i M8, M9, MA, MB, MC, MD, ME, MF; \
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
@@ -764,10 +766,10 @@ do { \
H5 = mm256_xor3( VD, V5, H5 ); \
H6 = mm256_xor3( VE, V6, H6 ); \
H7 = mm256_xor3( VF, V7, H7 ); \
} while (0)
}
#define COMPRESS32_8WAY_LE( rounds ) \
do { \
{ \
__m256i M0, M1, M2, M3, M4, M5, M6, M7; \
__m256i M8, M9, MA, MB, MC, MD, ME, MF; \
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
@@ -829,7 +831,7 @@ do { \
H5 = mm256_xor3( VD, V5, H5 ); \
H6 = mm256_xor3( VE, V6, H6 ); \
H7 = mm256_xor3( VF, V7, H7 ); \
} while (0)
}
void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
const void *data )
@@ -861,7 +863,7 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
// G1
V[ 1] = _mm256_add_epi32( _mm256_add_epi32( V[ 1], V[ 5] ),
_mm256_xor_si256( _mm256_set1_epi32( CS3 ), M[ 2] ) );
V[13] = mm256_ror_32( _mm256_xor_si256( V[13], V[ 1] ), 16 );
V[13] = mm256_swap32_16( _mm256_xor_si256( V[13], V[ 1] ) );
V[ 9] = _mm256_add_epi32( V[ 9], V[13] );
V[ 5] = mm256_ror_32( _mm256_xor_si256( V[ 5], V[ 9] ), 12 );
V[ 1] = _mm256_add_epi32( V[ 1], V[ 5] );
@@ -881,7 +883,7 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
// G7
V[ 3] = _mm256_add_epi32( _mm256_add_epi32( V[ 3], V[ 4] ),
_mm256_xor_si256( _mm256_set1_epi32( CSF ), M[14] ) );
V[14] = mm256_ror_32( _mm256_xor_si256( V[14], V[ 3] ), 16 );
V[14] = mm256_swap32_16( _mm256_xor_si256( V[14], V[ 3] ) );
V[ 3] = _mm256_add_epi32( V[ 3],
_mm256_xor_si256( _mm256_set1_epi32( CSE ), M[15] ) );
}
@@ -935,18 +937,18 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
// G1
V1 = _mm256_add_epi32( V1,
_mm256_xor_si256( _mm256_set1_epi32( CS2 ), M3 ) );
VD = mm256_ror_32( _mm256_xor_si256( VD, V1 ), 8 );
VD = mm256_shuflr32_8( _mm256_xor_si256( VD, V1 ) );
V9 = _mm256_add_epi32( V9, VD );
V5 = mm256_ror_32( _mm256_xor_si256( V5, V9 ), 7 );
// G4
V0 = _mm256_add_epi32( V0, V5 );
VF = mm256_ror_32( _mm256_xor_si256( VF, V0 ), 16 );
VF = mm256_swap32_16( _mm256_xor_si256( VF, V0 ) );
VA = _mm256_add_epi32( VA, VF );
V5 = mm256_ror_32( _mm256_xor_si256( V5, VA ), 12 );
V0 = _mm256_add_epi32( V0, _mm256_add_epi32( V5,
_mm256_xor_si256( _mm256_set1_epi32( CS8 ), M9 ) ) );
VF = mm256_ror_32( _mm256_xor_si256( VF, V0 ), 8 );
VF = mm256_shuflr32_8( _mm256_xor_si256( VF, V0 ) );
VA = _mm256_add_epi32( VA, VF );
V5 = mm256_ror_32( _mm256_xor_si256( V5, VA ), 7 );
@@ -954,12 +956,12 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
GS_8WAY( MA, MB, CSA, CSB, V1, V6, VB, VC );
// G6
VD = mm256_ror_32( _mm256_xor_si256( VD, V2 ), 16 );
VD = mm256_swap32_16( _mm256_xor_si256( VD, V2 ) );
V8 = _mm256_add_epi32( V8, VD );
V7 = mm256_ror_32( _mm256_xor_si256( V7, V8 ), 12 );
V2 = _mm256_add_epi32( _mm256_add_epi32( V2, V7 ),
_mm256_xor_si256( _mm256_set1_epi32( CSC ), MD ) );
VD = mm256_ror_32( _mm256_xor_si256( VD, V2 ), 8 );
VD = mm256_shuflr32_8( _mm256_xor_si256( VD, V2 ) );
V8 = _mm256_add_epi32( V8, VD );
V7 = mm256_ror_32( _mm256_xor_si256( V7, V8 ), 7 );
@@ -967,7 +969,7 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
V9 = _mm256_add_epi32( V9, VE );
V4 = mm256_ror_32( _mm256_xor_si256( V4, V9 ), 12 );
V3 = _mm256_add_epi32( V3, V4 );
VE = mm256_ror_32( _mm256_xor_si256( VE, V3 ), 8 );
VE = mm256_shuflr32_8( _mm256_xor_si256( VE, V3 ) );
V9 = _mm256_add_epi32( V9, VE );
V4 = mm256_ror_32( _mm256_xor_si256( V4, V9 ), 7 );
@@ -1009,7 +1011,7 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
// Blake-256 16 way AVX512
#define GS_16WAY( m0, m1, c0, c1, a, b, c, d ) \
do { \
{ \
a = _mm512_add_epi32( _mm512_add_epi32( a, b ), \
_mm512_xor_si512( _mm512_set1_epi32( c1 ), m0 ) ); \
d = mm512_ror_32( _mm512_xor_si512( d, a ), 16 ); \
@@ -1020,9 +1022,10 @@ do { \
d = mm512_ror_32( _mm512_xor_si512( d, a ), 8 ); \
c = _mm512_add_epi32( c, d ); \
b = mm512_ror_32( _mm512_xor_si512( b, c ), 7 ); \
} while (0)
}
#define ROUND_S_16WAY(r) do { \
#define ROUND_S_16WAY(r) \
{ \
GS_16WAY(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
GS_16WAY(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
GS_16WAY(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
@@ -1031,7 +1034,7 @@ do { \
GS_16WAY(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
GS_16WAY(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
GS_16WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
} while (0)
}
#define DECL_STATE32_16WAY \
__m512i H0, H1, H2, H3, H4, H5, H6, H7; \
@@ -1066,7 +1069,7 @@ do { \
} while (0)
#define COMPRESS32_16WAY( rounds ) \
do { \
{ \
__m512i M0, M1, M2, M3, M4, M5, M6, M7; \
__m512i M8, M9, MA, MB, MC, MD, ME, MF; \
__m512i V0, V1, V2, V3, V4, V5, V6, V7; \
@@ -1133,10 +1136,10 @@ do { \
H5 = mm512_xor3( VD, V5, H5 ); \
H6 = mm512_xor3( VE, V6, H6 ); \
H7 = mm512_xor3( VF, V7, H7 ); \
} while (0)
}
#define COMPRESS32_16WAY_LE( rounds ) \
do { \
{ \
__m512i M0, M1, M2, M3, M4, M5, M6, M7; \
__m512i M8, M9, MA, MB, MC, MD, ME, MF; \
__m512i V0, V1, V2, V3, V4, V5, V6, V7; \
@@ -1198,7 +1201,7 @@ do { \
H5 = mm512_xor3( VD, V5, H5 ); \
H6 = mm512_xor3( VE, V6, H6 ); \
H7 = mm512_xor3( VF, V7, H7 ); \
} while (0)
}
// Blake-256 prehash of the second block is split onto 2 parts. The first part
// is constant for every nonce and only needs to be run once per job. The

View File

@@ -52,6 +52,180 @@ static const uint8_t sigma[12][16] =
};
#define Z00 0
#define Z01 1
#define Z02 2
#define Z03 3
#define Z04 4
#define Z05 5
#define Z06 6
#define Z07 7
#define Z08 8
#define Z09 9
#define Z0A A
#define Z0B B
#define Z0C C
#define Z0D D
#define Z0E E
#define Z0F F
#define Z10 E
#define Z11 A
#define Z12 4
#define Z13 8
#define Z14 9
#define Z15 F
#define Z16 D
#define Z17 6
#define Z18 1
#define Z19 C
#define Z1A 0
#define Z1B 2
#define Z1C B
#define Z1D 7
#define Z1E 5
#define Z1F 3
#define Z20 B
#define Z21 8
#define Z22 C
#define Z23 0
#define Z24 5
#define Z25 2
#define Z26 F
#define Z27 D
#define Z28 A
#define Z29 E
#define Z2A 3
#define Z2B 6
#define Z2C 7
#define Z2D 1
#define Z2E 9
#define Z2F 4
#define Z30 7
#define Z31 9
#define Z32 3
#define Z33 1
#define Z34 D
#define Z35 C
#define Z36 B
#define Z37 E
#define Z38 2
#define Z39 6
#define Z3A 5
#define Z3B A
#define Z3C 4
#define Z3D 0
#define Z3E F
#define Z3F 8
#define Z40 9
#define Z41 0
#define Z42 5
#define Z43 7
#define Z44 2
#define Z45 4
#define Z46 A
#define Z47 F
#define Z48 E
#define Z49 1
#define Z4A B
#define Z4B C
#define Z4C 6
#define Z4D 8
#define Z4E 3
#define Z4F D
#define Z50 2
#define Z51 C
#define Z52 6
#define Z53 A
#define Z54 0
#define Z55 B
#define Z56 8
#define Z57 3
#define Z58 4
#define Z59 D
#define Z5A 7
#define Z5B 5
#define Z5C F
#define Z5D E
#define Z5E 1
#define Z5F 9
#define Z60 C
#define Z61 5
#define Z62 1
#define Z63 F
#define Z64 E
#define Z65 D
#define Z66 4
#define Z67 A
#define Z68 0
#define Z69 7
#define Z6A 6
#define Z6B 3
#define Z6C 9
#define Z6D 2
#define Z6E 8
#define Z6F B
#define Z70 D
#define Z71 B
#define Z72 7
#define Z73 E
#define Z74 C
#define Z75 1
#define Z76 3
#define Z77 9
#define Z78 5
#define Z79 0
#define Z7A F
#define Z7B 4
#define Z7C 8
#define Z7D 6
#define Z7E 2
#define Z7F A
#define Z80 6
#define Z81 F
#define Z82 E
#define Z83 9
#define Z84 B
#define Z85 3
#define Z86 0
#define Z87 8
#define Z88 C
#define Z89 2
#define Z8A D
#define Z8B 7
#define Z8C 1
#define Z8D 4
#define Z8E A
#define Z8F 5
#define Z90 A
#define Z91 2
#define Z92 8
#define Z93 4
#define Z94 7
#define Z95 6
#define Z96 1
#define Z97 5
#define Z98 F
#define Z99 B
#define Z9A 9
#define Z9B E
#define Z9C 3
#define Z9D C
#define Z9E D
#define Z9F 0
#define Mx(r, i) Mx_(Z ## r ## i)
#define Mx_(n) Mx__(n)
#define Mx__(n) M ## n
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define B2B8W_G(a, b, c, d, x, y) \
@@ -214,11 +388,11 @@ void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
#define B2B_G(a, b, c, d, x, y) \
{ \
v[a] = _mm256_add_epi64( _mm256_add_epi64( v[a], v[b] ), x ); \
v[d] = mm256_ror_64( _mm256_xor_si256( v[d], v[a] ), 32 ); \
v[d] = mm256_swap64_32( _mm256_xor_si256( v[d], v[a] ) ); \
v[c] = _mm256_add_epi64( v[c], v[d] ); \
v[b] = mm256_ror_64( _mm256_xor_si256( v[b], v[c] ), 24 ); \
v[b] = mm256_shuflr64_24( _mm256_xor_si256( v[b], v[c] ) ); \
v[a] = _mm256_add_epi64( _mm256_add_epi64( v[a], v[b] ), y ); \
v[d] = mm256_ror_64( _mm256_xor_si256( v[d], v[a] ), 16 ); \
v[d] = mm256_shuflr64_16( _mm256_xor_si256( v[d], v[a] ) ); \
v[c] = _mm256_add_epi64( v[c], v[d] ); \
v[b] = mm256_ror_64( _mm256_xor_si256( v[b], v[c] ), 63 ); \
}

View File

@@ -108,11 +108,11 @@ do { \
uint8_t s0 = sigma0; \
uint8_t s1 = sigma1; \
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s0 ] ); \
d = mm128_ror_32( _mm_xor_si128( d, a ), 16 ); \
d = mm128_swap32_16( _mm_xor_si128( d, a ) ); \
c = _mm_add_epi32( c, d ); \
b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s1 ] ); \
d = mm128_ror_32( _mm_xor_si128( d, a ), 8 ); \
d = mm128_shuflr32_8( _mm_xor_si128( d, a ) ); \
c = _mm_add_epi32( c, d ); \
b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \
} while(0)
@@ -320,11 +320,11 @@ do { \
uint8_t s0 = sigma0; \
uint8_t s1 = sigma1; \
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m[ s0 ] ); \
d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \
d = mm256_swap32_16( _mm256_xor_si256( d, a ) ); \
c = _mm256_add_epi32( c, d ); \
b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m[ s1 ] ); \
d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \
d = mm256_shuflr32_8( _mm256_xor_si256( d, a ) ); \
c = _mm256_add_epi32( c, d ); \
b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
} while(0)

View File

@@ -314,10 +314,11 @@ static const sph_u64 CB[16] = {
// Blake-512 8 way AVX512
#define GB_8WAY(m0, m1, c0, c1, a, b, c, d) do { \
#define GB_8WAY( m0, m1, c0, c1, a, b, c, d ) \
{ \
a = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( \
_mm512_set1_epi64( c1 ), m0 ), b ), a ); \
d = mm512_ror_64( _mm512_xor_si512( d, a ), 32 ); \
d = mm512_swap64_32( _mm512_xor_si512( d, a ) ); \
c = _mm512_add_epi64( c, d ); \
b = mm512_ror_64( _mm512_xor_si512( b, c ), 25 ); \
a = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( \
@@ -325,9 +326,10 @@ static const sph_u64 CB[16] = {
d = mm512_ror_64( _mm512_xor_si512( d, a ), 16 ); \
c = _mm512_add_epi64( c, d ); \
b = mm512_ror_64( _mm512_xor_si512( b, c ), 11 ); \
} while (0)
}
#define ROUND_B_8WAY(r) do { \
#define ROUND_B_8WAY( r ) \
{ \
GB_8WAY(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \
GB_8WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \
GB_8WAY(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \
@@ -336,13 +338,13 @@ static const sph_u64 CB[16] = {
GB_8WAY(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \
GB_8WAY(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \
GB_8WAY(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
} while (0)
}
#define DECL_STATE64_8WAY \
__m512i H0, H1, H2, H3, H4, H5, H6, H7; \
uint64_t T0, T1;
#define COMPRESS64_8WAY( buf ) do \
#define COMPRESS64_8WAY( buf ) \
{ \
__m512i M0, M1, M2, M3, M4, M5, M6, M7; \
__m512i M8, M9, MA, MB, MC, MD, ME, MF; \
@@ -409,7 +411,7 @@ static const sph_u64 CB[16] = {
H5 = mm512_xor3( VD, V5, H5 ); \
H6 = mm512_xor3( VE, V6, H6 ); \
H7 = mm512_xor3( VF, V7, H7 ); \
} while (0)
}
void blake512_8way_compress( blake_8way_big_context *sc )
{
@@ -610,7 +612,7 @@ void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
V0 = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512(
_mm512_set1_epi64( CB9 ), sc->buf[ 8] ), V5 ), V0 );
VF = mm512_ror_64( _mm512_xor_si512( VF, V0 ), 32 );
VF = mm512_swap64_32( _mm512_xor_si512( VF, V0 ) );
VA = _mm512_add_epi64( VA, VF );
V5 = mm512_ror_64( _mm512_xor_si512( V5, VA ), 25 );
V0 = _mm512_add_epi64( V0, V5 );
@@ -714,7 +716,7 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
// V1 = _mm512_add_epi64( V1, _mm512_xor_si512( _mm512_set1_epi64( c1 ), m0 );
V1 = _mm512_add_epi64( V1, V5 );
VD = mm512_ror_64( _mm512_xor_si512( VD, V1 ), 32 );
VD = mm512_swap64_32( _mm512_xor_si512( VD, V1 ) );
V9 = _mm512_add_epi64( V9, VD );
V5 = mm512_ror_64( _mm512_xor_si512( V5, V9 ), 25 );
V1 = _mm512_add_epi64( V1, _mm512_add_epi64( _mm512_xor_si512(
@@ -728,7 +730,7 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
// V2 = _mm512_add_epi64( V2, V6 );
V2 = _mm512_add_epi64( V2, _mm512_xor_si512(
_mm512_set1_epi64( CBF ), M9 ) );
VE = mm512_ror_64( _mm512_xor_si512( VE, V2 ), 32 );
VE = mm512_swap64_32( _mm512_xor_si512( VE, V2 ) );
VA = _mm512_add_epi64( VA, VE );
V6 = mm512_ror_64( _mm512_xor_si512( V6, VA ), 25 );
V2 = _mm512_add_epi64( V2, _mm512_add_epi64( _mm512_xor_si512(
@@ -742,7 +744,7 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
// V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512(
// _mm512_set1_epi64( CBx(1, 7) ), Mx(1, 6) ), V7 ) );
VF = mm512_ror_64( _mm512_xor_si512( VF, V3 ), 32 );
VF = mm512_swap64_32( _mm512_xor_si512( VF, V3 ) );
VB = _mm512_add_epi64( VB, VF );
V7 = mm512_ror_64( _mm512_xor_si512( V7, VB ), 25 );
V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512(
@@ -757,7 +759,6 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
GB_8WAY(Mx(1, C), Mx(1, D), CBx(1, C), CBx(1, D), V2, V7, V8, VD);
GB_8WAY(Mx(1, E), Mx(1, F), CBx(1, E), CBx(1, F), V3, V4, V9, VE);
// remaining rounds
ROUND_B_8WAY(2);
ROUND_B_8WAY(3);
@@ -1055,20 +1056,22 @@ blake512_8way_close(void *cc, void *dst)
// Blake-512 4 way
#define GB_4WAY(m0, m1, c0, c1, a, b, c, d) do { \
#define GB_4WAY(m0, m1, c0, c1, a, b, c, d) \
{ \
a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
_mm256_set1_epi64x( c1 ), m0 ), b ), a ); \
d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
d = mm256_swap64_32( _mm256_xor_si256( d, a ) ); \
c = _mm256_add_epi64( c, d ); \
b = mm256_ror_64( _mm256_xor_si256( b, c ), 25 ); \
a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
_mm256_set1_epi64x( c0 ), m1 ), b ), a ); \
d = mm256_ror_64( _mm256_xor_si256( d, a ), 16 ); \
d = mm256_shuflr64_16( _mm256_xor_si256( d, a ) ); \
c = _mm256_add_epi64( c, d ); \
b = mm256_ror_64( _mm256_xor_si256( b, c ), 11 ); \
} while (0)
}
#define ROUND_B_4WAY(r) do { \
#define ROUND_B_4WAY(r) \
{ \
GB_4WAY(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \
GB_4WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \
GB_4WAY(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \
@@ -1077,13 +1080,13 @@ blake512_8way_close(void *cc, void *dst)
GB_4WAY(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \
GB_4WAY(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \
GB_4WAY(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
} while (0)
}
#define DECL_STATE64_4WAY \
__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
uint64_t T0, T1;
#define COMPRESS64_4WAY do \
#define COMPRESS64_4WAY \
{ \
__m256i M0, M1, M2, M3, M4, M5, M6, M7; \
__m256i M8, M9, MA, MB, MC, MD, ME, MF; \
@@ -1148,7 +1151,7 @@ blake512_8way_close(void *cc, void *dst)
H5 = mm256_xor3( VD, V5, H5 ); \
H6 = mm256_xor3( VE, V6, H6 ); \
H7 = mm256_xor3( VF, V7, H7 ); \
} while (0)
}
void blake512_4way_compress( blake_4way_big_context *sc )
@@ -1278,7 +1281,7 @@ void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
// G4 skip nonce
V0 = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256(
_mm256_set1_epi64x( CB9 ), sc->buf[ 8] ), V5 ), V0 );
VF = mm256_ror_64( _mm256_xor_si256( VF, V0 ), 32 );
VF = mm256_swap64_32( _mm256_xor_si256( VF, V0 ) );
VA = _mm256_add_epi64( VA, VF );
V5 = mm256_ror_64( _mm256_xor_si256( V5, VA ), 25 );
V0 = _mm256_add_epi64( V0, V5 );
@@ -1365,7 +1368,7 @@ void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
// finish round 0, with the nonce now available
V0 = _mm256_add_epi64( V0, _mm256_xor_si256(
_mm256_set1_epi64x( CB8 ), M9 ) );
VF = mm256_ror_64( _mm256_xor_si256( VF, V0 ), 16 );
VF = mm256_shuflr64_16( _mm256_xor_si256( VF, V0 ) );
VA = _mm256_add_epi64( VA, VF );
V5 = mm256_ror_64( _mm256_xor_si256( V5, VA ), 11 );
@@ -1375,34 +1378,34 @@ void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
// G1
V1 = _mm256_add_epi64( V1, V5 );
VD = mm256_ror_64( _mm256_xor_si256( VD, V1 ), 32 );
VD = mm256_swap64_32( _mm256_xor_si256( VD, V1 ) );
V9 = _mm256_add_epi64( V9, VD );
V5 = mm256_ror_64( _mm256_xor_si256( V5, V9 ), 25 );
V1 = _mm256_add_epi64( V1, _mm256_add_epi64( _mm256_xor_si256(
_mm256_set1_epi64x( CBx(1,2) ), Mx(1,3) ), V5 ) );
VD = mm256_ror_64( _mm256_xor_si256( VD, V1 ), 16 );
VD = mm256_shuflr64_16( _mm256_xor_si256( VD, V1 ) );
V9 = _mm256_add_epi64( V9, VD );
V5 = mm256_ror_64( _mm256_xor_si256( V5, V9 ), 11 );
// G2
V2 = _mm256_add_epi64( V2, _mm256_xor_si256(
_mm256_set1_epi64x( CBF ), M9 ) );
VE = mm256_ror_64( _mm256_xor_si256( VE, V2 ), 32 );
VE = mm256_swap64_32( _mm256_xor_si256( VE, V2 ) );
VA = _mm256_add_epi64( VA, VE );
V6 = mm256_ror_64( _mm256_xor_si256( V6, VA ), 25 );
V2 = _mm256_add_epi64( V2, _mm256_add_epi64( _mm256_xor_si256(
_mm256_set1_epi64x( CB9 ), MF ), V6 ) );
VE = mm256_ror_64( _mm256_xor_si256( VE, V2 ), 16 );
VE = mm256_shuflr64_16( _mm256_xor_si256( VE, V2 ) );
VA = _mm256_add_epi64( VA, VE );
V6 = mm256_ror_64( _mm256_xor_si256( V6, VA ), 11 );
// G3
VF = mm256_ror_64( _mm256_xor_si256( VF, V3 ), 32 );
VF = mm256_swap64_32( _mm256_xor_si256( VF, V3 ) );
VB = _mm256_add_epi64( VB, VF );
V7 = mm256_ror_64( _mm256_xor_si256( V7, VB ), 25 );
V3 = _mm256_add_epi64( V3, _mm256_add_epi64( _mm256_xor_si256(
_mm256_set1_epi64x( CBx(1, 6) ), Mx(1, 7) ), V7 ) );
VF = mm256_ror_64( _mm256_xor_si256( VF, V3 ), 16 );
VF = mm256_shuflr64_16( _mm256_xor_si256( VF, V3 ) );
VB = _mm256_add_epi64( VB, VF );
V7 = mm256_ror_64( _mm256_xor_si256( V7, VB ), 11 );

View File

@@ -30,18 +30,11 @@
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include "simd-utils.h"
#include "algo/sha/sph_types.h"
#include "sph_blake2b.h"
// Cyclic right rotation.
#ifndef ROTR64
#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y))))
#endif
// Little-endian byte access.
#define B2B_GET64(p) \
(((uint64_t) ((uint8_t *) (p))[0]) ^ \
(((uint64_t) ((uint8_t *) (p))[1]) << 8) ^ \
@@ -52,47 +45,141 @@
(((uint64_t) ((uint8_t *) (p))[6]) << 48) ^ \
(((uint64_t) ((uint8_t *) (p))[7]) << 56))
// G Mixing function.
#if defined(__AVX2__)
#define B2B_G(a, b, c, d, x, y) { \
v[a] = v[a] + v[b] + x; \
v[d] = ROTR64(v[d] ^ v[a], 32); \
v[c] = v[c] + v[d]; \
v[b] = ROTR64(v[b] ^ v[c], 24); \
v[a] = v[a] + v[b] + y; \
v[d] = ROTR64(v[d] ^ v[a], 16); \
v[c] = v[c] + v[d]; \
v[b] = ROTR64(v[b] ^ v[c], 63); }
#define BLAKE2B_G( Sa, Sb, Sc, Sd, Se, Sf, Sg, Sh ) \
{ \
V[0] = _mm256_add_epi64( V[0], _mm256_add_epi64( V[1], \
_mm256_set_epi64x( m[ sigmaR[ Sg ] ], m[ sigmaR[ Se ] ], \
m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
V[3] = mm256_swap64_32( _mm256_xor_si256( V[3], V[0] ) ); \
V[2] = _mm256_add_epi64( V[2], V[3] ); \
V[1] = mm256_shuflr64_24( _mm256_xor_si256( V[1], V[2] ) ); \
\
V[0] = _mm256_add_epi64( V[0], _mm256_add_epi64( V[1], \
_mm256_set_epi64x( m[ sigmaR[ Sh ] ], m[ sigmaR[ Sf ] ], \
m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
V[3] = mm256_shuflr64_16( _mm256_xor_si256( V[3], V[0] ) ); \
V[2] = _mm256_add_epi64( V[2], V[3] ); \
V[1] = mm256_ror_64( _mm256_xor_si256( V[1], V[2] ), 63 ); \
}
#define BLAKE2B_ROUND( R ) \
{ \
__m256i *V = (__m256i*)v; \
const uint8_t *sigmaR = sigma[R]; \
BLAKE2B_G( 0, 1, 2, 3, 4, 5, 6, 7 ); \
V[3] = mm256_shufll_64( V[3] ); \
V[2] = mm256_swap_128( V[2] ); \
V[1] = mm256_shuflr_64( V[1] ); \
BLAKE2B_G( 8, 9, 10, 11, 12, 13, 14, 15 ); \
V[3] = mm256_shuflr_64( V[3] ); \
V[2] = mm256_swap_128( V[2] ); \
V[1] = mm256_shufll_64( V[1] ); \
}
#elif defined(__SSSE3__)
#define BLAKE2B_G( Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \
{ \
Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
_mm_set_epi64x( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
Vd = mm128_swap64_32( _mm_xor_si128( Vd, Va ) ); \
Vc = _mm_add_epi64( Vc, Vd ); \
Vb = mm128_shuflr64_24( _mm_xor_si128( Vb, Vc ) ); \
\
Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
_mm_set_epi64x( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
Vd = mm128_shuflr64_16( _mm_xor_si128( Vd, Va ) ); \
Vc = _mm_add_epi64( Vc, Vd ); \
Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), 63 ); \
}
#define BLAKE2B_ROUND( R ) \
{ \
__m128i *V = (__m128i*)v; \
__m128i V2, V3, V6, V7; \
const uint8_t *sigmaR = sigma[R]; \
BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
V2 = mm128_shufl2r_64( V[2], V[3] ); \
V3 = mm128_shufl2r_64( V[3], V[2] ); \
V6 = mm128_shufl2l_64( V[6], V[7] ); \
V7 = mm128_shufl2l_64( V[7], V[6] ); \
BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \
BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \
V[2] = mm128_shufl2l_64( V2, V3 ); \
V[3] = mm128_shufl2l_64( V3, V2 ); \
V[6] = mm128_shufl2r_64( V6, V7 ); \
V[7] = mm128_shufl2r_64( V7, V6 ); \
}
#else
#ifndef ROTR64
#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y))))
#endif
#define BLAKE2B_G( R, Va, Vb, Vc, Vd, Sa, Sb ) \
{ \
Va = Va + Vb + m[ sigma[R][Sa] ]; \
Vd = ROTR64( Vd ^ Va, 32 ); \
Vc = Vc + Vd; \
Vb = ROTR64( Vb ^ Vc, 24 ); \
\
Va = Va + Vb + m[ sigma[R][Sb] ]; \
Vd = ROTR64( Vd ^ Va, 16 ); \
Vc = Vc + Vd; \
Vb = ROTR64( Vb ^ Vc, 63 ); \
}
#define BLAKE2B_ROUND( R ) \
{ \
BLAKE2B_G( R, v[ 0], v[ 4], v[ 8], v[12], 0, 1 ); \
BLAKE2B_G( R, v[ 1], v[ 5], v[ 9], v[13], 2, 3 ); \
BLAKE2B_G( R, v[ 2], v[ 6], v[10], v[14], 4, 5 ); \
BLAKE2B_G( R, v[ 3], v[ 7], v[11], v[15], 6, 7 ); \
BLAKE2B_G( R, v[ 0], v[ 5], v[10], v[15], 8, 9 ); \
BLAKE2B_G( R, v[ 1], v[ 6], v[11], v[12], 10, 11 ); \
BLAKE2B_G( R, v[ 2], v[ 7], v[ 8], v[13], 12, 13 ); \
BLAKE2B_G( R, v[ 3], v[ 4], v[ 9], v[14], 14, 15 ); \
}
#endif
// Initialization Vector.
static const uint64_t blake2b_iv[8] = {
static const uint64_t blake2b_iv[8] __attribute__ ((aligned (32))) =
{
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
};
static const uint8_t sigma[12][16] __attribute__ ((aligned (32))) =
{
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
};
// Compression function. "last" flag indicates last block.
static void blake2b_compress( sph_blake2b_ctx *ctx, int last )
{
const uint8_t sigma[12][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
};
int i;
uint64_t v[16], m[16];
uint64_t v[16] __attribute__ ((aligned (32)));
uint64_t m[16] __attribute__ ((aligned (32)));
int i;
for (i = 0; i < 8; i++) { // init work variables
v[i] = ctx->h[i];
@@ -106,16 +193,8 @@ static void blake2b_compress( sph_blake2b_ctx *ctx, int last )
for (i = 0; i < 16; i++) // get little-endian words
m[i] = B2B_GET64(&ctx->b[8 * i]);
for (i = 0; i < 12; i++) { // twelve rounds
B2B_G( 0, 4, 8, 12, m[sigma[i][ 0]], m[sigma[i][ 1]]);
B2B_G( 1, 5, 9, 13, m[sigma[i][ 2]], m[sigma[i][ 3]]);
B2B_G( 2, 6, 10, 14, m[sigma[i][ 4]], m[sigma[i][ 5]]);
B2B_G( 3, 7, 11, 15, m[sigma[i][ 6]], m[sigma[i][ 7]]);
B2B_G( 0, 5, 10, 15, m[sigma[i][ 8]], m[sigma[i][ 9]]);
B2B_G( 1, 6, 11, 12, m[sigma[i][10]], m[sigma[i][11]]);
B2B_G( 2, 7, 8, 13, m[sigma[i][12]], m[sigma[i][13]]);
B2B_G( 3, 4, 9, 14, m[sigma[i][14]], m[sigma[i][15]]);
}
for (i = 0; i < 12; i++)
BLAKE2B_ROUND( i );
for( i = 0; i < 8; ++i )
ctx->h[i] ^= v[i] ^ v[i + 8];

View File

@@ -1,382 +0,0 @@
/*
* HEFTY1 cryptographic hash function
*
* Copyright (c) 2014, dbcc14 <BM-NBx4AKznJuyem3dArgVY8MGyABpihRy5>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation are those
* of the authors and should not be interpreted as representing official policies,
* either expressed or implied, of the FreeBSD Project.
*/
#include <assert.h>
#include <string.h>
#ifdef _MSC_VER
#define inline __inline
#endif
#include "sph_hefty1.h"
#define Min(A, B) (A <= B ? A : B)
#define RoundFunc(ctx, A, B, C, D, E, F, G, H, W, K) \
{ \
/* To thwart parallelism, Br modifies itself each time it's \
* called. This also means that calling it in different \
* orders yeilds different results. In C the order of \
* evaluation of function arguments and + operands are \
* unspecified (and depends on the compiler), so we must make \
* the order of Br calls explicit. \
*/ \
uint32_t brG = Br(ctx, G); \
uint32_t tmp1 = Ch(E, Br(ctx, F), brG) + H + W + K; \
uint32_t tmp2 = tmp1 + Sigma1(Br(ctx, E)); \
uint32_t brC = Br(ctx, C); \
uint32_t brB = Br(ctx, B); \
uint32_t tmp3 = Ma(Br(ctx, A), brB, brC); \
uint32_t tmp4 = tmp3 + Sigma0(Br(ctx, A)); \
H = G; \
G = F; \
F = E; \
E = D + Br(ctx, tmp2); \
D = C; \
C = B; \
B = A; \
A = tmp2 + tmp4; \
} \
/* Nothing up my sleeve constants */
const static uint32_t K[64] = {
0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL,
0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL,
0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL,
0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL,
0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL,
0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL,
0x983e5152UL, 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL,
0xc6e00bf3UL, 0xd5a79147UL, 0x06ca6351UL, 0x14292967UL,
0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, 0x53380d13UL,
0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL,
0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL,
0xd192e819UL, 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL,
0x19a4c116UL, 0x1e376c08UL, 0x2748774cUL, 0x34b0bcb5UL,
0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL, 0x682e6ff3UL,
0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL,
0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL
};
/* Initial hash values */
const static uint32_t H[HEFTY1_STATE_WORDS] = {
0x6a09e667UL,
0xbb67ae85UL,
0x3c6ef372UL,
0xa54ff53aUL,
0x510e527fUL,
0x9b05688cUL,
0x1f83d9abUL,
0x5be0cd19UL
};
static inline uint32_t Rr(uint32_t X, uint8_t n)
{
return (X >> n) | (X << (32 - n));
}
static inline uint32_t Ch(uint32_t E, uint32_t F, uint32_t G)
{
return (E & F) ^ (~E & G);
}
static inline uint32_t Sigma1(uint32_t E)
{
return Rr(E, 6) ^ Rr(E, 11) ^ Rr(E, 25);
}
static inline uint32_t sigma1(uint32_t X)
{
return Rr(X, 17) ^ Rr(X, 19) ^ (X >> 10);
}
static inline uint32_t Ma(uint32_t A, uint32_t B, uint32_t C)
{
return (A & B) ^ (A & C) ^ (B & C);
}
static inline uint32_t Sigma0(uint32_t A)
{
return Rr(A, 2) ^ Rr(A, 13) ^ Rr(A, 22);
}
static inline uint32_t sigma0(uint32_t X)
{
return Rr(X, 7) ^ Rr(X, 18) ^ (X >> 3);
}
static inline uint32_t Reverse32(uint32_t n)
{
#if BYTE_ORDER == LITTLE_ENDIAN
return n << 24 | (n & 0x0000ff00) << 8 | (n & 0x00ff0000) >> 8 | n >> 24;
#else
return n;
#endif
}
static inline uint64_t Reverse64(uint64_t n)
{
#if BYTE_ORDER == LITTLE_ENDIAN
uint32_t a = n >> 32;
uint32_t b = (n << 32) >> 32;
return (uint64_t)Reverse32(b) << 32 | Reverse32(a);
#else
return n;
#endif
}
/* Smoosh byte into nibble */
static inline uint8_t Smoosh4(uint8_t X)
{
return (X >> 4) ^ (X & 0xf);
}
/* Smoosh 32-bit word into 2-bits */
static inline uint8_t Smoosh2(uint32_t X)
{
uint16_t w = (X >> 16) ^ (X & 0xffff);
uint8_t n = Smoosh4((w >> 8) ^ (w & 0xff));
return (n >> 2) ^ (n & 0x3);
}
static void Mangle(uint32_t *S)
{
uint32_t *R = S;
uint32_t *C = &S[1];
uint8_t r0 = Smoosh4(R[0] >> 24);
uint8_t r1 = Smoosh4(R[0] >> 16);
uint8_t r2 = Smoosh4(R[0] >> 8);
uint8_t r3 = Smoosh4(R[0] & 0xff);
int i;
/* Diffuse */
uint32_t tmp = 0;
for (i = 0; i < HEFTY1_SPONGE_WORDS - 1; i++) {
uint8_t r = Smoosh2(tmp);
switch (r) {
case 0:
C[i] ^= Rr(R[0], i + r0);
break;
case 1:
C[i] += Rr(~R[0], i + r1);
break;
case 2:
C[i] &= Rr(~R[0], i + r2);
break;
case 3:
C[i] ^= Rr(R[0], i + r3);
break;
}
tmp ^= C[i];
}
/* Compress */
tmp = 0;
for (i = 0; i < HEFTY1_SPONGE_WORDS - 1; i++)
if (i % 2)
tmp ^= C[i];
else
tmp += C[i];
R[0] ^= tmp;
}
static void Absorb(uint32_t *S, uint32_t X)
{
uint32_t *R = S;
R[0] ^= X;
Mangle(S);
}
static uint32_t Squeeze(uint32_t *S)
{
uint32_t Y = S[0];
Mangle(S);
return Y;
}
/* Branch, compress and serialize function */
static inline uint32_t Br(HEFTY1_CTX *ctx, uint32_t X)
{
uint32_t R = Squeeze(ctx->sponge);
uint8_t r0 = R >> 8;
uint8_t r1 = R & 0xff;
uint32_t Y = 1 << (r0 % 32);
switch (r1 % 4)
{
case 0:
/* Do nothing */
break;
case 1:
return X & ~Y;
case 2:
return X | Y;
case 3:
return X ^ Y;
}
return X;
}
static void HashBlock(HEFTY1_CTX *ctx)
{
uint32_t A, B, C, D, E, F, G, H;
uint32_t W[HEFTY1_BLOCK_BYTES];
assert(ctx);
A = ctx->h[0];
B = ctx->h[1];
C = ctx->h[2];
D = ctx->h[3];
E = ctx->h[4];
F = ctx->h[5];
G = ctx->h[6];
H = ctx->h[7];
int t = 0;
for (; t < 16; t++) {
W[t] = Reverse32(((uint32_t *)&ctx->block[0])[t]); /* To host byte order */
Absorb(ctx->sponge, W[t] ^ K[t]);
}
for (t = 0; t < 16; t++) {
Absorb(ctx->sponge, D ^ H);
RoundFunc(ctx, A, B, C, D, E, F, G, H, W[t], K[t]);
}
for (t = 16; t < 64; t++) {
Absorb(ctx->sponge, H + D);
W[t] = sigma1(W[t - 2]) + W[t - 7] + sigma0(W[t - 15]) + W[t - 16];
RoundFunc(ctx, A, B, C, D, E, F, G, H, W[t], K[t]);
}
ctx->h[0] += A;
ctx->h[1] += B;
ctx->h[2] += C;
ctx->h[3] += D;
ctx->h[4] += E;
ctx->h[5] += F;
ctx->h[6] += G;
ctx->h[7] += H;
A = 0;
B = 0;
C = 0;
D = 0;
E = 0;
F = 0;
G = 0;
H = 0;
memset(W, 0, sizeof(W));
}
/* Public interface */
void HEFTY1_Init(HEFTY1_CTX *ctx)
{
assert(ctx);
memcpy(ctx->h, H, sizeof(ctx->h));
memset(ctx->block, 0, sizeof(ctx->block));
ctx->written = 0;
memset(ctx->sponge, 0, sizeof(ctx->sponge));
}
void HEFTY1_Update(HEFTY1_CTX *ctx, const void *buf, size_t len)
{
assert(ctx);
uint64_t read = 0;
while (len) {
size_t end = (size_t)(ctx->written % HEFTY1_BLOCK_BYTES);
size_t count = Min(len, HEFTY1_BLOCK_BYTES - end);
memcpy(&ctx->block[end], &((unsigned char *)buf)[read], count);
len -= count;
read += count;
ctx->written += count;
if (!(ctx->written % HEFTY1_BLOCK_BYTES))
HashBlock(ctx);
}
}
void HEFTY1_Final(unsigned char *digest, HEFTY1_CTX *ctx)
{
assert(digest);
assert(ctx);
/* Pad message (FIPS 180 Section 5.1.1) */
size_t used = (size_t)(ctx->written % HEFTY1_BLOCK_BYTES);
ctx->block[used++] = 0x80; /* Append 1 to end of message */
if (used > HEFTY1_BLOCK_BYTES - 8) {
/* We have already written into the last 64bits, so
* we must continue into the next block. */
memset(&ctx->block[used], 0, HEFTY1_BLOCK_BYTES - used);
HashBlock(ctx);
used = 0; /* Create a new block (below) */
}
/* All remaining bits to zero */
memset(&ctx->block[used], 0, HEFTY1_BLOCK_BYTES - 8 - used);
/* The last 64bits encode the length (in network byte order) */
uint64_t *len = (uint64_t *)&ctx->block[HEFTY1_BLOCK_BYTES - 8];
*len = Reverse64(ctx->written*8);
HashBlock(ctx);
/* Convert back to network byte order */
int i = 0;
for (; i < HEFTY1_STATE_WORDS; i++)
ctx->h[i] = Reverse32(ctx->h[i]);
memcpy(digest, ctx->h, sizeof(ctx->h));
memset(ctx, 0, sizeof(HEFTY1_CTX));
}
unsigned char* HEFTY1(const unsigned char *buf, size_t len, unsigned char *digest)
{
HEFTY1_CTX ctx;
static unsigned char m[HEFTY1_DIGEST_BYTES];
if (!digest)
digest = m;
HEFTY1_Init(&ctx);
HEFTY1_Update(&ctx, buf, len);
HEFTY1_Final(digest, &ctx);
return digest;
}

View File

@@ -1,66 +0,0 @@
/*
* HEFTY1 cryptographic hash function
*
* Copyright (c) 2014, dbcc14 <BM-NBx4AKznJuyem3dArgVY8MGyABpihRy5>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation are those
* of the authors and should not be interpreted as representing official policies,
* either expressed or implied, of the FreeBSD Project.
*/
#ifndef __HEFTY1_H__
#define __HEFTY1_H__
#ifdef __cplusplus
extern "C" {
#endif
#ifndef WIN32
#include <sys/types.h>
#endif
#include <inttypes.h>
#define HEFTY1_DIGEST_BYTES 32
#define HEFTY1_BLOCK_BYTES 64
#define HEFTY1_STATE_WORDS 8
#define HEFTY1_SPONGE_WORDS 4
typedef struct HEFTY1_CTX {
uint32_t h[HEFTY1_STATE_WORDS];
uint8_t block[HEFTY1_BLOCK_BYTES];
uint64_t written;
uint32_t sponge[HEFTY1_SPONGE_WORDS];
} HEFTY1_CTX;
void HEFTY1_Init(HEFTY1_CTX *cxt);
void HEFTY1_Update(HEFTY1_CTX *cxt, const void *data, size_t len);
void HEFTY1_Final(unsigned char *digest, HEFTY1_CTX *cxt);
unsigned char* HEFTY1(const unsigned char *data, size_t len, unsigned char *digest);
#ifdef __cplusplus
}
#endif
#endif /* __HEFTY1_H__ */

View File

@@ -97,11 +97,11 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
// returns void, updates all args
#define G_4X64(a,b,c,d) \
a = _mm256_add_epi64( a, b ); \
d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
d = mm256_swap64_32( _mm256_xor_si256( d, a ) ); \
c = _mm256_add_epi64( c, d ); \
b = mm256_ror_64( _mm256_xor_si256( b, c ), 24 ); \
b = mm256_shuflr64_24( _mm256_xor_si256( b, c ) ); \
a = _mm256_add_epi64( a, b ); \
d = mm256_ror_64( _mm256_xor_si256( d, a ), 16 ); \
d = mm256_shuflr64_16( _mm256_xor_si256( d, a ) ); \
c = _mm256_add_epi64( c, d ); \
b = mm256_ror_64( _mm256_xor_si256( b, c ), 63 );
@@ -137,11 +137,11 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
// returns void, all args updated
#define G_2X64(a,b,c,d) \
a = _mm_add_epi64( a, b ); \
d = mm128_ror_64( _mm_xor_si128( d, a), 32 ); \
d = mm128_swap64_32( _mm_xor_si128( d, a) ); \
c = _mm_add_epi64( c, d ); \
b = mm128_ror_64( _mm_xor_si128( b, c ), 24 ); \
b = mm128_shuflr64_24( _mm_xor_si128( b, c ) ); \
a = _mm_add_epi64( a, b ); \
d = mm128_ror_64( _mm_xor_si128( d, a ), 16 ); \
d = mm128_shuflr64_16( _mm_xor_si128( d, a ) ); \
c = _mm_add_epi64( c, d ); \
b = mm128_ror_64( _mm_xor_si128( b, c ), 63 );
@@ -150,12 +150,10 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
G_2X64( s1, s3, s5, s7 ); \
mm128_vrol256_64( s6, s7 ); \
mm128_vror256_64( s2, s3 ); \
mm128_swap256_128( s4, s5 ); \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
G_2X64( s0, s2, s5, s6 ); \
G_2X64( s1, s3, s4, s7 ); \
mm128_vror256_64( s6, s7 ); \
mm128_vrol256_64( s2, s3 ); \
mm128_swap256_128( s4, s5 );
mm128_vrol256_64( s2, s3 );
#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \

File diff suppressed because it is too large Load Diff

View File

@@ -1,186 +0,0 @@
/* $Id: sph_radiogatun.h 226 2010-06-16 17:28:08Z tp $ */
/**
* RadioGatun interface.
*
* RadioGatun has been published in: G. Bertoni, J. Daemen, M. Peeters
* and G. Van Assche, "RadioGatun, a belt-and-mill hash function",
* presented at the Second Cryptographic Hash Workshop, Santa Barbara,
* August 24-25, 2006. The main Web site, containing that article, the
* reference code and some test vectors, appears to be currently located
* at the following URL: http://radiogatun.noekeon.org/
*
* The presentation article does not specify endianness or padding. The
* reference code uses the following conventions, which we also apply
* here:
* <ul>
* <li>The input message is an integral number of sequences of three
* words. Each word is either a 32-bit of 64-bit word (depending on
* the version of RadioGatun).</li>
* <li>Input bytes are decoded into words using little-endian
* convention.</li>
* <li>Padding consists of a single bit of value 1, using little-endian
* convention within bytes (i.e. for a byte-oriented input, a single
* byte of value 0x01 is appended), then enough bits of value 0 to finish
* the current block.</li>
* <li>Output consists of 256 bits. Successive output words are encoded
* with little-endian convention.</li>
* </ul>
* These conventions are very close to those we use for PANAMA, which is
* a close ancestor or RadioGatun.
*
* RadioGatun is actually a family of functions, depending on some
* internal parameters. We implement here two functions, with a "belt
* length" of 13, a "belt width" of 3, and a "mill length" of 19. The
* RadioGatun[32] version uses 32-bit words, while the RadioGatun[64]
* variant uses 64-bit words.
*
* Strictly speaking, the name "RadioGatun" should use an acute accent
* on the "u", which we omitted here to keep strict ASCII-compatibility
* of this file.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_radiogatun.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SPH_RADIOGATUN_H__
#define SPH_RADIOGATUN_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
/**
* Output size (in bits) for RadioGatun[32].
*/
#define SPH_SIZE_radiogatun32 256
/**
* This structure is a context for RadioGatun[32] computations: it
* contains intermediate values and some data from the last entered
* block. Once a RadioGatun[32] computation has been performed, the
* context can be reused for another computation.
*
* The contents of this structure are private. A running RadioGatun[32]
* computation can be cloned by copying the context (e.g. with a
* simple <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char data[156]; /* first field, for alignment */
unsigned data_ptr;
sph_u32 a[19], b[39];
#endif
} sph_radiogatun32_context;
/**
* Initialize a RadioGatun[32] context. This process performs no
* memory allocation.
*
* @param cc the RadioGatun[32] context (pointer to a
* <code>sph_radiogatun32_context</code>)
*/
void sph_radiogatun32_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the RadioGatun[32] context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_radiogatun32(void *cc, const void *data, size_t len);
/**
* Terminate the current RadioGatun[32] computation and output the
* result into the provided buffer. The destination buffer must be wide
* enough to accomodate the result (32 bytes). The context is
* automatically reinitialized.
*
* @param cc the RadioGatun[32] context
* @param dst the destination buffer
*/
void sph_radiogatun32_close(void *cc, void *dst);
#if SPH_64
/**
* Output size (in bits) for RadioGatun[64].
*/
#define SPH_SIZE_radiogatun64 256
/**
* This structure is a context for RadioGatun[64] computations: it
* contains intermediate values and some data from the last entered
* block. Once a RadioGatun[64] computation has been performed, the
* context can be reused for another computation.
*
* The contents of this structure are private. A running RadioGatun[64]
* computation can be cloned by copying the context (e.g. with a
* simple <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char data[312]; /* first field, for alignment */
unsigned data_ptr;
sph_u64 a[19], b[39];
#endif
} sph_radiogatun64_context;
/**
* Initialize a RadioGatun[64] context. This process performs no
* memory allocation.
*
* @param cc the RadioGatun[64] context (pointer to a
* <code>sph_radiogatun64_context</code>)
*/
void sph_radiogatun64_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the RadioGatun[64] context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_radiogatun64(void *cc, const void *data, size_t len);
/**
* Terminate the current RadioGatun[64] computation and output the
* result into the provided buffer. The destination buffer must be wide
* enough to accomodate the result (32 bytes). The context is
* automatically reinitialized.
*
* @param cc the RadioGatun[64] context
* @param dst the destination buffer
*/
void sph_radiogatun64_close(void *cc, void *dst);
#endif
#endif

View File

@@ -1,34 +0,0 @@
#include "x20r-gate.h"
void getAlgoString( const uint8_t* prevblock, char *output )
{
char *sptr = outpuit;
for ( int j = 0; j < X20R_HASH_FUNC_COUNT; j++ )
{
char b = (19 - j) >> 1; // 16 ascii hex chars, reversed
uint8_t algoDigit = (j & 1) ? prevblock[b] & 0xF : prevblock[b] >> 4;
if (algoDigit >= 10)
sprintf(sptr, "%c", 'A' + (algoDigit - 10));
else
sprintf(sptr, "%u", (uint32_t) algoDigit);
sptr++;
}
*sptr = '\0';
}
bool register_x20r_algo( algo_gate_t* gate )
{
#if defined (X20R_4WAY)
gate->scanhash = (void*)&scanhash_x20r_4way;
gate->hash = (void*)&x20r_4way_hash;
#else
gate->scanhash = (void*)&scanhash_x20r;
gate->hash = (void*)&x20r_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
x20_r_s_getAlgoString = (void*)&x20r_getAlgoString;
opt_target_factor = 256.;
return true;
};

View File

@@ -1,58 +0,0 @@
#ifndef X20R_GATE_H__
#define X20R_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
/*
#if defined(__AVX2__) && defined(__AES__)
#define X20R_4WAY
#endif
*/
enum x20r_Algo {
BLAKE = 0,
BMW,
GROESTL,
JH,
KECCAK,
SKEIN,
LUFFA,
CUBEHASH,
SHAVITE,
SIMD,
ECHO,
HAMSI,
FUGUE,
SHABAL,
WHIRLPOOL,
SHA_512,
HAVAL, // 256-bits output
GOST,
RADIOGATUN, // 256-bits output
PANAMA, // 256-bits output
X20R_HASH_FUNC_COUNT
};
void (*x20_r_s_getAlgoString) ( const uint8_t*, char* );
void x20r_getAlgoString( const uint8_t* prevblock, char *output );
bool register_xi20r_algo( algo_gate_t* gate );
#if defined(X20R_4WAY)
void x20r_4way_hash( void *state, const void *input );
int scanhash_x20r_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
void x20rhash( void *state, const void *input );
int scanhash_x20r( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif

View File

@@ -1,252 +0,0 @@
#include "x20r-gate.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "algo/blake/sph_blake.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/sph-haval.h"
#include "algo/radiogatun/sph_radiogatun.h"
#include "algo/panama/sph_panama.h"
#include "algo/gost/sph_gost.h"
#include "algo/sha/sph_sha2.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#else
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#endif
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread char hashOrder[X20R_HASH_FUNC_COUNT + 1] = { 0 };
union _x20r_context_overlay
{
sph_blake512_context blake;
sph_bmw512_context bmw;
#if defined(__AES__)
hashState_groestl groestl;
hashState_echo echo;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
#endif
sph_skein512_context skein;
sph_jh512_context jh;
sph_keccak512_context keccak;
hashState_luffa luffa;
cubehashParam cube;
hashState_sd simd;
sph_shavite512_context shavite;
sph_hamsi512_context hamsi;
sph_fugue512_context fugue;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
sph_sha512_context sha512;
sph_haval256_5_context haval;
sph_gost512_context gost;
sph_radiogatun64_context radiogatun;
sph_panama_context panama;
};
typedef union _x20r_context_overlay x20r_context_overlay;
void x20r_hash(void* output, const void* input)
{
uint32_t _ALIGN(128) hash[64/4];
x20r_context_overlay ctx;
void *in = (void*) input;
int size = 80;
if ( s_ntime == UINT32_MAX )
{
const uint8_t* in8 = (uint8_t*) input;
x20_r_s_getAlgoString(&in8[4], hashOrder);
}
for (int i = 0; i < 20; i++)
{
const char elem = hashOrder[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case BLAKE:
sph_blake512_init(&ctx.blake);
sph_blake512(&ctx.blake, in, size);
sph_blake512_close(&ctx.blake, hash);
break;
case BMW:
sph_bmw512_init(&ctx.bmw);
sph_bmw512(&ctx.bmw, in, size);
sph_bmw512_close(&ctx.bmw, hash);
break;
case GROESTL:
#if defined(__AES__)
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)in, size<<3 );
#else
sph_groestl512_init(&ctx.groestl);
sph_groestl512(&ctx.groestl, in, size);
sph_groestl512_close(&ctx.groestl, hash);
#endif
break;
case SKEIN:
sph_skein512_init(&ctx.skein);
sph_skein512(&ctx.skein, in, size);
sph_skein512_close(&ctx.skein, hash);
break;
case JH:
sph_jh512_init(&ctx.jh);
sph_jh512(&ctx.jh, in, size);
sph_jh512_close(&ctx.jh, hash);
break;
case KECCAK:
sph_keccak512_init(&ctx.keccak);
sph_keccak512(&ctx.keccak, in, size);
sph_keccak512_close(&ctx.keccak, hash);
break;
case LUFFA:
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)in, size );
break;
case CUBEHASH:
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash,
(const byte*)in, size );
break;
case SHAVITE:
sph_shavite512_init(&ctx.shavite);
sph_shavite512(&ctx.shavite, in, size);
sph_shavite512_close(&ctx.shavite, hash);
break;
case SIMD:
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)in, size<<3 );
break;
case ECHO:
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence *)in, size<<3 );
#else
sph_echo512_init(&ctx.echo);
sph_echo512(&ctx.echo, in, size);
sph_echo512_close(&ctx.echo, hash);
#endif
break;
case HAMSI:
sph_hamsi512_init(&ctx.hamsi);
sph_hamsi512(&ctx.hamsi, in, size);
sph_hamsi512_close(&ctx.hamsi, hash);
break;
case FUGUE:
sph_fugue512_init(&ctx.fugue);
sph_fugue512(&ctx.fugue, in, size);
sph_fugue512_close(&ctx.fugue, hash);
break;
case SHABAL:
sph_shabal512_init(&ctx.shabal);
sph_shabal512(&ctx.shabal, in, size);
sph_shabal512_close(&ctx.shabal, hash);
break;
case WHIRLPOOL:
sph_whirlpool_init(&ctx.whirlpool);
sph_whirlpool(&ctx.whirlpool, in, size);
sph_whirlpool_close(&ctx.whirlpool, hash);
break;
case SHA_512:
sph_sha512_Init( &ctx.sha512 );
sph_sha512( &ctx.sha512, in, size );
sph_sha512_close( &ctx.sha512, hash );
break;
case HAVAL:
sph_haval256_5_init(&ctx.haval);
sph_haval256_5(&ctx.haval, in, size);
sph_haval256_5_close(&ctx.haval, hash);
memset(&hash[8], 0, 32);
break;
case GOST:
sph_gost512_init(&ctx.gost);
sph_gost512(&ctx.gost, in, size);
sph_gost512_close(&ctx.gost, hash);
break;
case RADIOGATUN:
sph_radiogatun64_init(&ctx.radiogatun);
sph_radiogatun64(&ctx.radiogatun, in, size);
sph_radiogatun64_close(&ctx.radiogatun, hash);
memset(&hash[8], 0, 32);
break;
case PANAMA:
sph_panama_init(&ctx.panama);
sph_panama(&ctx.panama, in, size);
sph_panama_close(&ctx.panama, hash);
memset(&hash[8], 0, 32);
break;
}
in = (void*) hash;
size = 64;
}
memcpy(output, hash, 32);
}
int scanhash_x20r( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(128) hash32[8];
uint32_t _ALIGN(128) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
int thr_id = mythr->id;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
for (int k=0; k < 19; k++)
be32enc( &endiandata[k], pdata[k] );
if ( s_ntime != pdata[17] )
{
uint32_t ntime = swab32(pdata[17]);
x20_r_s_getAlgoString( (const char*) (&endiandata[1]), hashOrder );
s_ntime = ntime;
if (opt_debug && !thr_id) applog(LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime);
}
if ( opt_benchmark )
ptarget[7] = 0x0cff;
do {
be32enc( &endiandata[19], nonce );
x20r_hash( hash32, endiandata );
if ( hash32[7] <= Htarg && fulltest( hash32, ptarget ) )
{
pdata[19] = nonce;
submit_solution( work, hash32, mythr );
}
nonce++;
} while (nonce < max_nonce && !(*restart));
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}

View File

@@ -1,5 +0,0 @@
#ifdef __SSE2__
#include "yescrypt-simd.c"
#else
#include "yescrypt-opt.c"
#endif

View File

@@ -1,213 +0,0 @@
/*-
* Copyright 2013,2014 Alexander Peslyak
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#ifdef MAP_ANON
#include <sys/mman.h>
#endif
#include "yescrypt.h"
#define HUGEPAGE_THRESHOLD (12 * 1024 * 1024)
#ifdef __x86_64__
#define HUGEPAGE_SIZE (2 * 1024 * 1024)
#else
#undef HUGEPAGE_SIZE
#endif
/*
static __inline uint32_t
le32dec(const void *pp)
{
const uint8_t *p = (uint8_t const *)pp;
return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) +
((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24));
}
static __inline void
le32enc(void *pp, uint32_t x)
{
uint8_t * p = (uint8_t *)pp;
p[0] = x & 0xff;
p[1] = (x >> 8) & 0xff;
p[2] = (x >> 16) & 0xff;
p[3] = (x >> 24) & 0xff;
}
*/
static void *
alloc_region(yescrypt_region_t * region, size_t size)
{
size_t base_size = size;
uint8_t * base, * aligned;
#ifdef MAP_ANON
int flags =
#ifdef MAP_NOCORE
MAP_NOCORE |
#endif
MAP_ANON | MAP_PRIVATE;
#if defined(MAP_HUGETLB) && defined(HUGEPAGE_SIZE)
size_t new_size = size;
const size_t hugepage_mask = (size_t)HUGEPAGE_SIZE - 1;
if (size >= HUGEPAGE_THRESHOLD && size + hugepage_mask >= size) {
flags |= MAP_HUGETLB;
/*
* Linux's munmap() fails on MAP_HUGETLB mappings if size is not a multiple of
* huge page size, so let's round up to huge page size here.
*/
new_size = size + hugepage_mask;
new_size &= ~hugepage_mask;
}
base = mmap(NULL, new_size, PROT_READ | PROT_WRITE, flags, -1, 0);
if (base != MAP_FAILED) {
base_size = new_size;
} else
if (flags & MAP_HUGETLB) {
flags &= ~MAP_HUGETLB;
base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0);
}
#else
base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0);
#endif
if (base == MAP_FAILED)
base = NULL;
aligned = base;
#elif defined(HAVE_POSIX_MEMALIGN)
if ((errno = posix_memalign((void **)&base, 64, size)) != 0)
base = NULL;
aligned = base;
#else
base = aligned = NULL;
if (size + 63 < size) {
errno = ENOMEM;
} else if ((base = malloc(size + 63)) != NULL) {
aligned = base + 63;
aligned -= (uintptr_t)aligned & 63;
}
#endif
region->base = base;
region->aligned = aligned;
region->base_size = base ? base_size : 0;
region->aligned_size = base ? size : 0;
return aligned;
}
static __inline void
init_region(yescrypt_region_t * region)
{
region->base = region->aligned = NULL;
region->base_size = region->aligned_size = 0;
}
static int
free_region(yescrypt_region_t * region)
{
if (region->base) {
#ifdef MAP_ANON
if (munmap(region->base, region->base_size))
return -1;
#else
free(region->base);
#endif
}
init_region(region);
return 0;
}
int yescrypt_init_shared(yescrypt_shared_t * shared, const uint8_t * param, size_t paramlen,
uint64_t N, uint32_t r, uint32_t p, yescrypt_init_shared_flags_t flags, uint32_t mask,
uint8_t * buf, size_t buflen)
{
yescrypt_shared1_t* shared1 = &shared->shared1;
yescrypt_shared_t dummy, half1, half2;
uint8_t salt[32];
if (flags & YESCRYPT_SHARED_PREALLOCATED) {
if (!shared1->aligned || !shared1->aligned_size)
return -1;
} else {
init_region(shared1);
}
shared->mask1 = 1;
if (!param && !paramlen && !N && !r && !p && !buf && !buflen)
return 0;
init_region(&dummy.shared1);
dummy.mask1 = 1;
if (yescrypt_kdf(&dummy, shared1,
param, paramlen, NULL, 0, N, r, p, 0,
YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_1,
salt, sizeof(salt), 0 ) )
goto out;
half1 = half2 = *shared;
half1.shared1.aligned_size /= 2;
half2.shared1.aligned = (void*) ((size_t)half2.shared1.aligned + half1.shared1.aligned_size);
half2.shared1.aligned_size = half1.shared1.aligned_size;
N /= 2;
if (p > 1 && yescrypt_kdf(&half1, &half2.shared1,
param, paramlen, salt, sizeof(salt), N, r, p, 0,
YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_2,
salt, sizeof(salt), 0 ))
goto out;
if (yescrypt_kdf(&half2, &half1.shared1,
param, paramlen, salt, sizeof(salt), N, r, p, 0,
YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_1,
salt, sizeof(salt), 0))
goto out;
if (yescrypt_kdf(&half1, &half2.shared1,
param, paramlen, salt, sizeof(salt), N, r, p, 0,
YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_1,
buf, buflen, 0))
goto out;
shared->mask1 = mask;
return 0;
out:
if (!(flags & YESCRYPT_SHARED_PREALLOCATED))
free_region(shared1);
return -1;
}
int
yescrypt_free_shared(yescrypt_shared_t * shared)
{
return free_region(&shared->shared1);
}
int
yescrypt_init_local(yescrypt_local_t * local)
{
init_region(local);
return 0;
}
int
yescrypt_free_local(yescrypt_local_t * local)
{
return free_region(local);
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,488 +0,0 @@
/*-
* Copyright 2013,2014 Alexander Peslyak
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "compat.h"
#include "yescrypt.h"
#include "algo/sha/hmac-sha256-hash.h"
#include "algo-gate-api.h"
#define BYTES2CHARS(bytes) \
((((bytes) * 8) + 5) / 6)
#define HASH_SIZE 32 /* bytes */
#define HASH_LEN BYTES2CHARS(HASH_SIZE) /* base-64 chars */
#define YESCRYPT_FLAGS (YESCRYPT_RW | YESCRYPT_PWXFORM)
static const char * const itoa64 =
"./0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
static uint8_t* encode64_uint32(uint8_t* dst, size_t dstlen, uint32_t src, uint32_t srcbits)
{
uint32_t bit;
for (bit = 0; bit < srcbits; bit += 6) {
if (dstlen < 1)
return NULL;
*dst++ = itoa64[src & 0x3f];
dstlen--;
src >>= 6;
}
return dst;
}
static uint8_t* encode64(uint8_t* dst, size_t dstlen, const uint8_t* src, size_t srclen)
{
size_t i;
for (i = 0; i < srclen; ) {
uint8_t * dnext;
uint32_t value = 0, bits = 0;
do {
value |= (uint32_t)src[i++] << bits;
bits += 8;
} while (bits < 24 && i < srclen);
dnext = encode64_uint32(dst, dstlen, value, bits);
if (!dnext)
return NULL;
dstlen -= dnext - dst;
dst = dnext;
}
return dst;
}
static int decode64_one(uint32_t* dst, uint8_t src)
{
const char * ptr = strchr(itoa64, src);
if (ptr) {
*dst = (uint32_t) (ptr - itoa64);
return 0;
}
*dst = 0;
return -1;
}
static const uint8_t* decode64_uint32(uint32_t* dst, uint32_t dstbits, const uint8_t* src)
{
uint32_t bit;
uint32_t value;
value = 0;
for (bit = 0; bit < dstbits; bit += 6) {
uint32_t one;
if (decode64_one(&one, *src)) {
*dst = 0;
return NULL;
}
src++;
value |= one << bit;
}
*dst = value;
return src;
}
uint8_t* yescrypt_r(const yescrypt_shared_t* shared, yescrypt_local_t* local,
const uint8_t* passwd, size_t passwdlen, const uint8_t* setting,
uint8_t* buf, size_t buflen, int thrid )
{
uint8_t hash[HASH_SIZE];
const uint8_t * src, * salt;
uint8_t * dst;
size_t prefixlen, saltlen, need;
uint8_t version;
uint64_t N;
uint32_t r, p;
yescrypt_flags_t flags = YESCRYPT_WORM;
printf("pass1 ...");
fflush(stdout);
if (setting[0] != '$' || setting[1] != '7') {
printf("died$7 ...");
fflush(stdout);
return NULL;
}
printf("died80 ...");
fflush(stdout);
src = setting + 2;
printf("hello '%p'\n", (char *)src);
fflush(stdout);
switch ((version = *src)) {
case '$':
printf("died2 ...");
fflush(stdout);
break;
case 'X':
src++;
flags = YESCRYPT_RW;
printf("died3 ...");
fflush(stdout);
break;
default:
printf("died4 ...");
fflush(stdout);
return NULL;
}
printf("pass2 ...");
fflush(stdout);
if (*src != '$') {
uint32_t decoded_flags;
if (decode64_one(&decoded_flags, *src)) {
printf("died5 ...");
fflush(stdout);
return NULL;
}
flags = decoded_flags;
if (*++src != '$') {
printf("died6 ...");
fflush(stdout);
return NULL;
}
}
src++;
{
uint32_t N_log2;
if (decode64_one(&N_log2, *src)) {
printf("died7 ...");
return NULL;
}
src++;
N = (uint64_t)1 << N_log2;
}
src = decode64_uint32(&r, 30, src);
if (!src) {
printf("died6 ...");
return NULL;
}
src = decode64_uint32(&p, 30, src);
if (!src) {
printf("died7 ...");
return NULL;
}
prefixlen = src - setting;
salt = src;
src = (uint8_t *)strrchr((char *)salt, '$');
if (src)
saltlen = src - salt;
else
saltlen = strlen((char *)salt);
need = prefixlen + saltlen + 1 + HASH_LEN + 1;
if (need > buflen || need < saltlen) {
printf("'%d %d %d'", (int) need, (int) buflen, (int) saltlen);
printf("died8killbuf ...");
fflush(stdout);
return NULL;
}
if ( yescrypt_kdf( shared, local, passwd, passwdlen, salt, saltlen, N, r, p,
0, flags, hash, sizeof(hash), thrid ) == -1 )
{
printf("died10 ...");
fflush(stdout);
return NULL;
}
dst = buf;
memcpy(dst, setting, prefixlen + saltlen);
dst += prefixlen + saltlen;
*dst++ = '$';
dst = encode64(dst, buflen - (dst - buf), hash, sizeof(hash));
/* Could zeroize hash[] here, but yescrypt_kdf() doesn't zeroize its
* memory allocations yet anyway. */
if (!dst || dst >= buf + buflen) { /* Can't happen */
printf("died11 ...");
return NULL;
}
*dst = 0; /* NUL termination */
printf("died12 ...");
fflush(stdout);
return buf;
}
uint8_t* yescrypt(const uint8_t* passwd, const uint8_t* setting, int thrid )
{
static uint8_t buf[4 + 1 + 5 + 5 + BYTES2CHARS(32) + 1 + HASH_LEN + 1];
yescrypt_shared_t shared;
yescrypt_local_t local;
uint8_t * retval;
if (yescrypt_init_shared(&shared, NULL, 0,
0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0))
return NULL;
if (yescrypt_init_local(&local)) {
yescrypt_free_shared(&shared);
return NULL;
}
retval = yescrypt_r(&shared, &local,
passwd, 80, setting, buf, sizeof(buf), thrid );
//printf("hashse='%s'\n", (char *)retval);
if (yescrypt_free_local(&local)) {
yescrypt_free_shared(&shared);
return NULL;
}
if (yescrypt_free_shared(&shared))
return NULL;
return retval;
}
uint8_t* yescrypt_gensalt_r(uint32_t N_log2, uint32_t r, uint32_t p, yescrypt_flags_t flags,
const uint8_t* src, size_t srclen, uint8_t* buf, size_t buflen)
{
uint8_t * dst;
size_t prefixlen = 3 + 1 + 5 + 5;
size_t saltlen = BYTES2CHARS(srclen);
size_t need;
if (p == 1)
flags &= ~YESCRYPT_PARALLEL_SMIX;
if (flags) {
if (flags & ~0x3f)
return NULL;
prefixlen++;
if (flags != YESCRYPT_RW)
prefixlen++;
}
need = prefixlen + saltlen + 1;
if (need > buflen || need < saltlen || saltlen < srclen)
return NULL;
if (N_log2 > 63 || ((uint64_t)r * (uint64_t)p >= (1U << 30)))
return NULL;
dst = buf;
*dst++ = '$';
*dst++ = '7';
if (flags) {
*dst++ = 'X'; /* eXperimental, subject to change */
if (flags != YESCRYPT_RW)
*dst++ = itoa64[flags];
}
*dst++ = '$';
*dst++ = itoa64[N_log2];
dst = encode64_uint32(dst, buflen - (dst - buf), r, 30);
if (!dst) /* Can't happen */
return NULL;
dst = encode64_uint32(dst, buflen - (dst - buf), p, 30);
if (!dst) /* Can't happen */
return NULL;
dst = encode64(dst, buflen - (dst - buf), src, srclen);
if (!dst || dst >= buf + buflen) /* Can't happen */
return NULL;
*dst = 0; /* NUL termination */
return buf;
}
uint8_t* yescrypt_gensalt(uint32_t N_log2, uint32_t r, uint32_t p, yescrypt_flags_t flags,
const uint8_t * src, size_t srclen)
{
static uint8_t buf[4 + 1 + 5 + 5 + BYTES2CHARS(32) + 1];
return yescrypt_gensalt_r(N_log2, r, p, flags, src, srclen,
buf, sizeof(buf));
}
static int yescrypt_bsty(const uint8_t * passwd, size_t passwdlen,
const uint8_t * salt, size_t saltlen, uint64_t N, uint32_t r, uint32_t p,
uint8_t * buf, size_t buflen, int thrid )
{
static __thread int initialized = 0;
static __thread yescrypt_shared_t shared;
static __thread yescrypt_local_t local;
int retval;
if (!initialized) {
/* "shared" could in fact be shared, but it's simpler to keep it private
* along with "local". It's dummy and tiny anyway. */
if (yescrypt_init_shared(&shared, NULL, 0,
0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0))
return -1;
if (yescrypt_init_local(&local)) {
yescrypt_free_shared(&shared);
return -1;
}
initialized = 1;
}
retval = yescrypt_kdf(&shared, &local,
passwd, passwdlen, salt, saltlen, N, r, p, 0, YESCRYPT_FLAGS,
buf, buflen, thrid );
#if 0
if (yescrypt_free_local(&local)) {
yescrypt_free_shared(&shared);
return -1;
}
if (yescrypt_free_shared(&shared))
return -1;
initialized = 0;
#endif
return retval;
}
// scrypt parameters initialized at run time.
uint64_t YESCRYPT_N;
uint32_t YESCRYPT_R;
uint32_t YESCRYPT_P;
char *yescrypt_client_key = NULL;
int yescrypt_client_key_len = 0;
/* main hash 80 bytes input */
int yescrypt_hash( const char *input, char *output, uint32_t len, int thrid )
{
return yescrypt_bsty( (uint8_t*)input, len, (uint8_t*)input, len, YESCRYPT_N,
YESCRYPT_R, YESCRYPT_P, (uint8_t*)output, 32, thrid );
}
/* for util.c test */
int yescrypthash(void *output, const void *input, int thrid)
{
return yescrypt_hash((char*) input, (char*) output, 80, thrid);
}
int scanhash_yescrypt( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) vhash[8];
uint32_t _ALIGN(64) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce;
uint32_t n = first_nonce;
int thr_id = mythr->id;
for ( int k = 0; k < 19; k++ )
be32enc( &endiandata[k], pdata[k] );
endiandata[19] = n;
do {
if ( yescrypt_hash((char*) endiandata, (char*) vhash, 80, thr_id ) )
if unlikely( valid_hash( vhash, ptarget ) && !opt_benchmark )
{
be32enc( pdata+19, n );
submit_solution( work, vhash, mythr );
}
endiandata[19] = ++n;
} while ( n < last_nonce && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
pdata[19] = n;
return 0;
}
void yescrypt_gate_base(algo_gate_t *gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_yescrypt;
gate->hash = (void*)&yescrypt_hash;
opt_target_factor = 65536.0;
}
bool register_yescrypt_algo( algo_gate_t* gate )
{
yescrypt_gate_base( gate );
if ( opt_param_n ) YESCRYPT_N = opt_param_n;
else YESCRYPT_N = 2048;
if ( opt_param_r ) YESCRYPT_R = opt_param_r;
else YESCRYPT_R = 8;
if ( opt_param_key )
{
yescrypt_client_key = opt_param_key;
yescrypt_client_key_len = strlen( opt_param_key );
}
else
{
yescrypt_client_key = NULL;
yescrypt_client_key_len = 0;
}
YESCRYPT_P = 1;
applog( LOG_NOTICE,"Yescrypt parameters: N= %d, R= %d", YESCRYPT_N,
YESCRYPT_R );
if ( yescrypt_client_key )
applog( LOG_NOTICE,"Key= \"%s\"\n", yescrypt_client_key );
return true;
}
bool register_yescryptr8_algo( algo_gate_t* gate )
{
yescrypt_gate_base( gate );
yescrypt_client_key = "Client Key";
yescrypt_client_key_len = 10;
YESCRYPT_N = 2048;
YESCRYPT_R = 8;
YESCRYPT_P = 1;
return true;
}
bool register_yescryptr16_algo( algo_gate_t* gate )
{
yescrypt_gate_base( gate );
yescrypt_client_key = "Client Key";
yescrypt_client_key_len = 10;
YESCRYPT_N = 4096;
YESCRYPT_R = 16;
YESCRYPT_P = 1;
return true;
}
bool register_yescryptr32_algo( algo_gate_t* gate )
{
yescrypt_gate_base( gate );
yescrypt_client_key = "WaviBanana";
yescrypt_client_key_len = 10;
YESCRYPT_N = 4096;
YESCRYPT_R = 32;
YESCRYPT_P = 1;
return true;
}

View File

@@ -1,382 +0,0 @@
/*-
* Copyright 2009 Colin Percival
* Copyright 2013,2014 Alexander Peslyak
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* This file was originally written by Colin Percival as part of the Tarsnap
* online backup system.
*/
#ifndef YESCRYPT_H
#define YESCRYPT_H
#ifdef __cplusplus
extern "C" {
#endif
#include <stdint.h>
#include <stdlib.h> /* for size_t */
#include <stdbool.h>
#include "miner.h"
//#define __SSE4_1__
int yescrypt_hash(const char* input, char* output, uint32_t len, int thrid );
int yescrypthash(void *output, const void *input, int thrid );
/**
* crypto_scrypt(passwd, passwdlen, salt, saltlen, N, r, p, buf, buflen):
* Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r,
* p, buflen) and write the result into buf. The parameters r, p, and buflen
* must satisfy r * p < 2^30 and buflen <= (2^32 - 1) * 32. The parameter N
* must be a power of 2 greater than 1.
*
* Return 0 on success; or -1 on error.
*
* MT-safe as long as buf is local to the thread.
*/
extern int crypto_scrypt(const uint8_t * __passwd, size_t __passwdlen,
const uint8_t * __salt, size_t __saltlen,
uint64_t __N, uint32_t __r, uint32_t __p,
uint8_t * __buf, size_t __buflen);
/**
* Internal type used by the memory allocator. Please do not use it directly.
* Use yescrypt_shared_t and yescrypt_local_t as appropriate instead, since
* they might differ from each other in a future version.
*/
typedef struct {
void * base, * aligned;
size_t base_size, aligned_size;
} yescrypt_region_t;
/**
* Types for shared (ROM) and thread-local (RAM) data structures.
*/
typedef yescrypt_region_t yescrypt_shared1_t;
typedef struct {
yescrypt_shared1_t shared1;
uint32_t mask1;
} yescrypt_shared_t;
typedef yescrypt_region_t yescrypt_local_t;
/**
* Possible values for yescrypt_init_shared()'s flags argument.
*/
typedef enum {
YESCRYPT_SHARED_DEFAULTS = 0,
YESCRYPT_SHARED_PREALLOCATED = 0x100
} yescrypt_init_shared_flags_t;
/**
* Possible values for the flags argument of yescrypt_kdf(),
* yescrypt_gensalt_r(), yescrypt_gensalt(). These may be OR'ed together,
* except that YESCRYPT_WORM and YESCRYPT_RW are mutually exclusive.
* Please refer to the description of yescrypt_kdf() below for the meaning of
* these flags.
*/
typedef enum {
/* public */
YESCRYPT_WORM = 0,
YESCRYPT_RW = 1,
YESCRYPT_PARALLEL_SMIX = 2,
YESCRYPT_PWXFORM = 4,
/* private */
__YESCRYPT_INIT_SHARED_1 = 0x10000,
__YESCRYPT_INIT_SHARED_2 = 0x20000,
__YESCRYPT_INIT_SHARED = 0x30000
} yescrypt_flags_t;
extern char *yescrypt_client_key;
extern int yescrypt_client_key_len;
#define YESCRYPT_KNOWN_FLAGS \
(YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | YESCRYPT_PWXFORM | \
__YESCRYPT_INIT_SHARED)
/**
* yescrypt_init_shared(shared, param, paramlen, N, r, p, flags, mask,
* buf, buflen):
* Optionally allocate memory for and initialize the shared (ROM) data
* structure. The parameters N, r, and p must satisfy the same conditions as
* with crypto_scrypt(). param and paramlen specify a local parameter with
* which the ROM is seeded. If buf is not NULL, then it is used to return
* buflen bytes of message digest for the initialized ROM (the caller may use
* this to verify that the ROM has been computed in the same way that it was on
* a previous run).
*
* Return 0 on success; or -1 on error.
*
* If bit YESCRYPT_SHARED_PREALLOCATED in flags is set, then memory for the
* ROM is assumed to have been preallocated by the caller, with
* shared->shared1.aligned being the start address of the ROM and
* shared->shared1.aligned_size being its size (which must be consistent with
* N, r, and p). This may be used e.g. when the ROM is to be placed in a SysV
* shared memory segment allocated by the caller.
*
* mask controls the frequency of ROM accesses by yescrypt_kdf(). Normally it
* should be set to 1, to interleave RAM and ROM accesses, which works well
* when both regions reside in the machine's RAM anyway. Other values may be
* used e.g. when the ROM is memory-mapped from a disk file. Recommended mask
* values are powers of 2 minus 1 or minus 2. Here's the effect of some mask
* values:
* mask value ROM accesses in SMix 1st loop ROM accesses in SMix 2nd loop
* 0 0 1/2
* 1 1/2 1/2
* 2 0 1/4
* 3 1/4 1/4
* 6 0 1/8
* 7 1/8 1/8
* 14 0 1/16
* 15 1/16 1/16
* 1022 0 1/1024
* 1023 1/1024 1/1024
*
* Actual computation of the ROM contents may be avoided, if you don't intend
* to use a ROM but need a dummy shared structure, by calling this function
* with NULL, 0, 0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0 for the
* arguments starting with param and on.
*
* MT-safe as long as shared is local to the thread.
*/
extern int yescrypt_init_shared(yescrypt_shared_t * __shared,
const uint8_t * __param, size_t __paramlen,
uint64_t __N, uint32_t __r, uint32_t __p,
yescrypt_init_shared_flags_t __flags, uint32_t __mask,
uint8_t * __buf, size_t __buflen);
/**
* yescrypt_free_shared(shared):
* Free memory that had been allocated with yescrypt_init_shared().
*
* Return 0 on success; or -1 on error.
*
* MT-safe as long as shared is local to the thread.
*/
extern int yescrypt_free_shared(yescrypt_shared_t * __shared);
/**
* yescrypt_init_local(local):
* Initialize the thread-local (RAM) data structure. Actual memory allocation
* is currently fully postponed until a call to yescrypt_kdf() or yescrypt_r().
*
* Return 0 on success; or -1 on error.
*
* MT-safe as long as local is local to the thread.
*/
extern int yescrypt_init_local(yescrypt_local_t * __local);
/**
* yescrypt_free_local(local):
* Free memory that may have been allocated for an initialized thread-local
* (RAM) data structure.
*
* Return 0 on success; or -1 on error.
*
* MT-safe as long as local is local to the thread.
*/
extern int yescrypt_free_local(yescrypt_local_t * __local);
/**
* yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen,
* N, r, p, t, flags, buf, buflen):
* Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r,
* p, buflen), or a revision of scrypt as requested by flags and shared, and
* write the result into buf. The parameters N, r, p, and buflen must satisfy
* the same conditions as with crypto_scrypt(). t controls computation time
* while not affecting peak memory usage. shared and flags may request
* special modes as described below. local is the thread-local data
* structure, allowing to preserve and reuse a memory allocation across calls,
* thereby reducing its overhead.
*
* Return 0 on success; or -1 on error.
*
* t controls computation time. t = 0 is optimal in terms of achieving the
* highest area-time for ASIC attackers. Thus, higher computation time, if
* affordable, is best achieved by increasing N rather than by increasing t.
* However, if the higher memory usage (which goes along with higher N) is not
* affordable, or if fine-tuning of the time is needed (recall that N must be a
* power of 2), then t = 1 or above may be used to increase time while staying
* at the same peak memory usage. t = 1 increases the time by 25% and
* decreases the normalized area-time to 96% of optimal. (Of course, in
* absolute terms the area-time increases with higher t. It's just that it
* would increase slightly more with higher N*r rather than with higher t.)
* t = 2 increases the time by another 20% and decreases the normalized
* area-time to 89% of optimal. Thus, these two values are reasonable to use
* for fine-tuning. Values of t higher than 2 result in further increase in
* time while reducing the efficiency much further (e.g., down to around 50% of
* optimal for t = 5, which runs 3 to 4 times slower than t = 0, with exact
* numbers varying by the flags settings).
*
* Classic scrypt is available by setting t = 0 and flags to YESCRYPT_WORM and
* passing a dummy shared structure (see the description of
* yescrypt_init_shared() above for how to produce one). In this mode, the
* thread-local memory region (RAM) is first sequentially written to and then
* randomly read from. This algorithm is friendly towards time-memory
* tradeoffs (TMTO), available both to defenders (albeit not in this
* implementation) and to attackers.
*
* Setting YESCRYPT_RW adds extra random reads and writes to the thread-local
* memory region (RAM), which makes TMTO a lot less efficient. This may be
* used to slow down the kinds of attackers who would otherwise benefit from
* classic scrypt's efficient TMTO. Since classic scrypt's TMTO allows not
* only for the tradeoff, but also for a decrease of attacker's area-time (by
* up to a constant factor), setting YESCRYPT_RW substantially increases the
* cost of attacks in area-time terms as well. Yet another benefit of it is
* that optimal area-time is reached at an earlier time than with classic
* scrypt, and t = 0 actually corresponds to this earlier completion time,
* resulting in quicker hash computations (and thus in higher request rate
* capacity). Due to these properties, YESCRYPT_RW should almost always be
* set, except when compatibility with classic scrypt or TMTO-friendliness are
* desired.
*
* YESCRYPT_PARALLEL_SMIX moves parallelism that is present with p > 1 to a
* lower level as compared to where it is in classic scrypt. This reduces
* flexibility for efficient computation (for both attackers and defenders) by
* requiring that, short of resorting to TMTO, the full amount of memory be
* allocated as needed for the specified p, regardless of whether that
* parallelism is actually being fully made use of or not. (For comparison, a
* single instance of classic scrypt may be computed in less memory without any
* CPU time overhead, but in more real time, by not making full use of the
* parallelism.) This may be desirable when the defender has enough memory
* with sufficiently low latency and high bandwidth for efficient full parallel
* execution, yet the required memory size is high enough that some likely
* attackers might end up being forced to choose between using higher latency
* memory than they could use otherwise (waiting for data longer) or using TMTO
* (waiting for data more times per one hash computation). The area-time cost
* for other kinds of attackers (who would use the same memory type and TMTO
* factor or no TMTO either way) remains roughly the same, given the same
* running time for the defender. In the TMTO-friendly YESCRYPT_WORM mode, as
* long as the defender has enough memory that is just as fast as the smaller
* per-thread regions would be, doesn't expect to ever need greater
* flexibility (except possibly via TMTO), and doesn't need backwards
* compatibility with classic scrypt, there are no other serious drawbacks to
* this setting. In the YESCRYPT_RW mode, which is meant to discourage TMTO,
* this new approach to parallelization makes TMTO less inefficient. (This is
* an unfortunate side-effect of avoiding some random writes, as we have to in
* order to allow for parallel threads to access a common memory region without
* synchronization overhead.) Thus, in this mode this setting poses an extra
* tradeoff of its own (higher area-time cost for a subset of attackers vs.
* better TMTO resistance). Setting YESCRYPT_PARALLEL_SMIX also changes the
* way the running time is to be controlled from N*r*p (for classic scrypt) to
* N*r (in this modification). All of this applies only when p > 1. For
* p = 1, this setting is a no-op.
*
* Passing a real shared structure, with ROM contents previously computed by
* yescrypt_init_shared(), enables the use of ROM and requires YESCRYPT_RW for
* the thread-local RAM region. In order to allow for initialization of the
* ROM to be split into a separate program, the shared->shared1.aligned and
* shared->shared1.aligned_size fields may be set by the caller of
* yescrypt_kdf() manually rather than with yescrypt_init_shared().
*
* local must be initialized with yescrypt_init_local().
*
* MT-safe as long as local and buf are local to the thread.
*/
extern int yescrypt_kdf(const yescrypt_shared_t * __shared,
yescrypt_local_t * __local,
const uint8_t * __passwd, size_t __passwdlen,
const uint8_t * __salt, size_t __saltlen,
uint64_t __N, uint32_t __r, uint32_t __p, uint32_t __t,
yescrypt_flags_t __flags,
uint8_t * __buf, size_t __buflen, int thrid);
/**
* yescrypt_r(shared, local, passwd, passwdlen, setting, buf, buflen):
* Compute and encode an scrypt or enhanced scrypt hash of passwd given the
* parameters and salt value encoded in setting. If the shared structure is
* not dummy, a ROM is used and YESCRYPT_RW is required. Otherwise, whether to
* use the YESCRYPT_WORM (classic scrypt) or YESCRYPT_RW (time-memory tradeoff
* discouraging modification) is determined by the setting string. shared and
* local must be initialized as described above for yescrypt_kdf(). buf must
* be large enough (as indicated by buflen) to hold the encoded hash string.
*
* Return the encoded hash string on success; or NULL on error.
*
* MT-safe as long as local and buf are local to the thread.
*/
extern uint8_t * yescrypt_r(const yescrypt_shared_t * __shared,
yescrypt_local_t * __local,
const uint8_t * __passwd, size_t __passwdlen,
const uint8_t * __setting,
uint8_t * __buf, size_t __buflen, int thrid);
/**
* yescrypt(passwd, setting):
* Compute and encode an scrypt or enhanced scrypt hash of passwd given the
* parameters and salt value encoded in setting. Whether to use the
* YESCRYPT_WORM (classic scrypt) or YESCRYPT_RW (time-memory tradeoff
* discouraging modification) is determined by the setting string.
*
* Return the encoded hash string on success; or NULL on error.
*
* This is a crypt(3)-like interface, which is simpler to use than
* yescrypt_r(), but it is not MT-safe, it does not allow for the use of a ROM,
* and it is slower than yescrypt_r() for repeated calls because it allocates
* and frees memory on each call.
*
* MT-unsafe.
*/
extern uint8_t * yescrypt(const uint8_t * __passwd, const uint8_t * __setting, int thrid );
/**
* yescrypt_gensalt_r(N_log2, r, p, flags, src, srclen, buf, buflen):
* Generate a setting string for use with yescrypt_r() and yescrypt() by
* encoding into it the parameters N_log2 (which is to be set to base 2
* logarithm of the desired value for N), r, p, flags, and a salt given by src
* (of srclen bytes). buf must be large enough (as indicated by buflen) to
* hold the setting string.
*
* Return the setting string on success; or NULL on error.
*
* MT-safe as long as buf is local to the thread.
*/
extern uint8_t * yescrypt_gensalt_r(
uint32_t __N_log2, uint32_t __r, uint32_t __p,
yescrypt_flags_t __flags,
const uint8_t * __src, size_t __srclen,
uint8_t * __buf, size_t __buflen);
/**
* yescrypt_gensalt(N_log2, r, p, flags, src, srclen):
* Generate a setting string for use with yescrypt_r() and yescrypt(). This
* function is the same as yescrypt_gensalt_r() except that it uses a static
* buffer and thus is not MT-safe.
*
* Return the setting string on success; or NULL on error.
*
* MT-unsafe.
*/
extern uint8_t * yescrypt_gensalt(
uint32_t __N_log2, uint32_t __r, uint32_t __p,
yescrypt_flags_t __flags,
const uint8_t * __src, size_t __srclen);
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -1,323 +0,0 @@
/*
* Copyright 2009 Colin Percival, 2014 savale
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* This file was originally written by Colin Percival as part of the Tarsnap
* online backup system.
*/
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include "simd-utils.h"
#include <algo/yespower/crypto/sph_types.h>
#include "blake2b-yp.h"
// Cyclic right rotation.
//#ifndef ROTR64
//#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y))))
//#endif
#define ROTR64(x, y) ror64( x, y )
// Little-endian byte access.
#define B2B_GET64(p) \
(((uint64_t) ((uint8_t *) (p))[0]) ^ \
(((uint64_t) ((uint8_t *) (p))[1]) << 8) ^ \
(((uint64_t) ((uint8_t *) (p))[2]) << 16) ^ \
(((uint64_t) ((uint8_t *) (p))[3]) << 24) ^ \
(((uint64_t) ((uint8_t *) (p))[4]) << 32) ^ \
(((uint64_t) ((uint8_t *) (p))[5]) << 40) ^ \
(((uint64_t) ((uint8_t *) (p))[6]) << 48) ^ \
(((uint64_t) ((uint8_t *) (p))[7]) << 56))
// G Mixing function.
#define B2B_G(a, b, c, d, x, y) { \
v[a] = v[a] + v[b] + x; \
v[d] = ROTR64(v[d] ^ v[a], 32); \
v[c] = v[c] + v[d]; \
v[b] = ROTR64(v[b] ^ v[c], 24); \
v[a] = v[a] + v[b] + y; \
v[d] = ROTR64(v[d] ^ v[a], 16); \
v[c] = v[c] + v[d]; \
v[b] = ROTR64(v[b] ^ v[c], 63); }
// Initialization Vector.
static const uint64_t blake2b_iv[8] = {
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
};
// Compression function. "last" flag indicates last block.
static void blake2b_compress(blake2b_yp_ctx *ctx, int last)
{
const uint8_t sigma[12][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
};
int i;
uint64_t v[16], m[16];
// init work variables
for (i = 0; i < 8; i++) {
v[i] = ctx->h[i];
v[i + 8] = blake2b_iv[i];
}
v[12] ^= ctx->t[0]; // low 64 bits of offset
v[13] ^= ctx->t[1]; // high 64 bits
// last block flag set ?
if (last) {
v[14] = ~v[14];
}
// get little-endian words
for (i = 0; i < 16; i++) {
m[i] = B2B_GET64(&ctx->b[8 * i]);
}
// twelve rounds
for (i = 0; i < 12; i++) {
B2B_G( 0, 4, 8, 12, m[sigma[i][ 0]], m[sigma[i][ 1]]);
B2B_G( 1, 5, 9, 13, m[sigma[i][ 2]], m[sigma[i][ 3]]);
B2B_G( 2, 6, 10, 14, m[sigma[i][ 4]], m[sigma[i][ 5]]);
B2B_G( 3, 7, 11, 15, m[sigma[i][ 6]], m[sigma[i][ 7]]);
B2B_G( 0, 5, 10, 15, m[sigma[i][ 8]], m[sigma[i][ 9]]);
B2B_G( 1, 6, 11, 12, m[sigma[i][10]], m[sigma[i][11]]);
B2B_G( 2, 7, 8, 13, m[sigma[i][12]], m[sigma[i][13]]);
B2B_G( 3, 4, 9, 14, m[sigma[i][14]], m[sigma[i][15]]);
}
for(i = 0; i < 8; ++i) {
ctx->h[i] ^= v[i] ^ v[i + 8];
}
}
// Initialize the hashing context "ctx" with optional key "key".
// 1 <= outlen <= 64 gives the digest size in bytes.
// Secret key (also <= 64 bytes) is optional (keylen = 0).
int blake2b_yp_init(blake2b_yp_ctx *ctx, size_t outlen,
const void *key, size_t keylen) // (keylen=0: no key)
{
size_t i;
// illegal parameters
if (outlen == 0 || outlen > 64 || keylen > 64) {
return -1;
}
// state, "param block"
for (i = 0; i < 8; i++) {
ctx->h[i] = blake2b_iv[i];
}
ctx->h[0] ^= 0x01010000 ^ (keylen << 8) ^ outlen;
ctx->t[0] = 0; // input count low word
ctx->t[1] = 0; // input count high word
ctx->c = 0; // pointer within buffer
ctx->outlen = outlen;
// zero input block
for (i = keylen; i < 128; i++) {
ctx->b[i] = 0;
}
if (keylen > 0) {
blake2b_yp_update(ctx, key, keylen);
ctx->c = 128; // at the end
}
return 0;
}
// Add "inlen" bytes from "in" into the hash.
void blake2b_yp_update(blake2b_yp_ctx *ctx,
const void *in, size_t inlen) // data bytes
{
size_t i;
for (i = 0; i < inlen; i++) {
if (ctx->c == 128) { // buffer full ?
ctx->t[0] += ctx->c; // add counters
if (ctx->t[0] < ctx->c) // carry overflow ?
ctx->t[1]++; // high word
blake2b_compress(ctx, 0); // compress (not last)
ctx->c = 0; // counter to zero
}
ctx->b[ctx->c++] = ((const uint8_t *) in)[i];
}
}
// Generate the message digest (size given in init).
// Result placed in "out".
void blake2b_yp_final(blake2b_yp_ctx *ctx, void *out)
{
size_t i;
ctx->t[0] += ctx->c; // mark last block offset
// carry overflow
if (ctx->t[0] < ctx->c) {
ctx->t[1]++; // high word
}
// fill up with zeros
while (ctx->c < 128) {
ctx->b[ctx->c++] = 0;
}
blake2b_compress(ctx, 1); // final block flag = 1
// little endian convert and store
for (i = 0; i < ctx->outlen; i++) {
((uint8_t *) out)[i] =
(ctx->h[i >> 3] >> (8 * (i & 7))) & 0xFF;
}
}
// inlen = number of bytes
void blake2b_yp_hash(void *out, const void *in, size_t inlen) {
blake2b_yp_ctx ctx;
blake2b_yp_init(&ctx, 32, NULL, 0);
blake2b_yp_update(&ctx, in, inlen);
blake2b_yp_final(&ctx, out);
}
// // keylen = number of bytes
void hmac_blake2b_yp_init(hmac_yp_ctx *hctx, const void *_key, size_t keylen) {
const uint8_t *key = _key;
uint8_t keyhash[32];
uint8_t pad[64];
uint64_t i;
if (keylen > 64) {
blake2b_yp_hash(keyhash, key, keylen);
key = keyhash;
keylen = 32;
}
blake2b_yp_init(&hctx->inner, 32, NULL, 0);
memset(pad, 0x36, 64);
for (i = 0; i < keylen; ++i) {
pad[i] ^= key[i];
}
blake2b_yp_update(&hctx->inner, pad, 64);
blake2b_yp_init(&hctx->outer, 32, NULL, 0);
memset(pad, 0x5c, 64);
for (i = 0; i < keylen; ++i) {
pad[i] ^= key[i];
}
blake2b_yp_update(&hctx->outer, pad, 64);
memset(keyhash, 0, 32);
}
// datalen = number of bits
void hmac_blake2b_yp_update(hmac_yp_ctx *hctx, const void *data, size_t datalen) {
// update the inner state
blake2b_yp_update(&hctx->inner, data, datalen);
}
void hmac_blake2b_yp_final(hmac_yp_ctx *hctx, uint8_t *digest) {
uint8_t ihash[32];
blake2b_yp_final(&hctx->inner, ihash);
blake2b_yp_update(&hctx->outer, ihash, 32);
blake2b_yp_final(&hctx->outer, digest);
memset(ihash, 0, 32);
}
// // keylen = number of bytes; inlen = number of bytes
void hmac_blake2b_yp_hash(void *out, const void *key, size_t keylen, const void *in, size_t inlen) {
hmac_yp_ctx hctx;
hmac_blake2b_yp_init(&hctx, key, keylen);
hmac_blake2b_yp_update(&hctx, in, inlen);
hmac_blake2b_yp_final(&hctx, out);
}
void pbkdf2_blake2b_yp(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen)
{
hmac_yp_ctx PShctx, hctx;
size_t i;
uint32_t ivec;
uint8_t U[32];
uint8_t T[32];
uint64_t j;
int k;
size_t clen;
/* Compute HMAC state after processing P and S. */
hmac_blake2b_yp_init(&PShctx, passwd, passwdlen);
hmac_blake2b_yp_update(&PShctx, salt, saltlen);
/* Iterate through the blocks. */
for (i = 0; i * 32 < dkLen; i++) {
/* Generate INT(i + 1). */
ivec = bswap_32( i+1 );
/* Compute U_1 = PRF(P, S || INT(i)). */
memcpy(&hctx, &PShctx, sizeof(hmac_yp_ctx));
hmac_blake2b_yp_update(&hctx, &ivec, 4);
hmac_blake2b_yp_final(&hctx, U);
/* T_i = U_1 ... */
memcpy(T, U, 32);
for (j = 2; j <= c; j++) {
/* Compute U_j. */
hmac_blake2b_yp_init(&hctx, passwd, passwdlen);
hmac_blake2b_yp_update(&hctx, U, 32);
hmac_blake2b_yp_final(&hctx, U);
/* ... xor U_j ... */
for (k = 0; k < 32; k++) {
T[k] ^= U[k];
}
}
/* Copy as many bytes as necessary into buf. */
clen = dkLen - i * 32;
if (clen > 32) {
clen = 32;
}
memcpy(&buf[i * 32], T, clen);
}
/* Clean PShctx, since we never called _Final on it. */
memset(&PShctx, 0, sizeof(hmac_yp_ctx));
}

View File

@@ -1,42 +0,0 @@
#pragma once
#ifndef __BLAKE2B_H__
#define __BLAKE2B_H__
#include <stddef.h>
#include <stdint.h>
#if defined(_MSC_VER) || defined(__x86_64__) || defined(__x86__)
#define NATIVE_LITTLE_ENDIAN
#endif
// state context
typedef struct {
uint8_t b[128]; // input buffer
uint64_t h[8]; // chained state
uint64_t t[2]; // total number of bytes
size_t c; // pointer for b[]
size_t outlen; // digest size
} blake2b_yp_ctx;
typedef struct {
blake2b_yp_ctx inner;
blake2b_yp_ctx outer;
} hmac_yp_ctx;
#if defined(__cplusplus)
extern "C" {
#endif
int blake2b_yp_init(blake2b_yp_ctx *ctx, size_t outlen, const void *key, size_t keylen);
void blake2b_yp_update(blake2b_yp_ctx *ctx, const void *in, size_t inlen);
void blake2b_yp_final(blake2b_yp_ctx *ctx, void *out);
void blake2b_yp_hash(void *out, const void *in, size_t inlen);
void hmac_blake2b_yp_hash(void *out, const void *key, size_t keylen, const void *in, size_t inlen);
void pbkdf2_blake2b_yp(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen);
#if defined(__cplusplus)
}
#endif
#endif

View File

@@ -0,0 +1,150 @@
/*
* Copyright 2009 Colin Percival, 2014 savale
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* This file was originally written by Colin Percival as part of the Tarsnap
* online backup system.
*/
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include "simd-utils.h"
#include "hmac-blake2b.h"
// keylen = number of bytes
void hmac_blake2b_init( hmac_blake2b_ctx *hctx, const void *_key,
size_t keylen )
{
const uint8_t *key = _key;
uint8_t keyhash[32];
uint8_t pad[64];
uint64_t i;
if (keylen > 64)
{
sph_blake2b_ctx ctx;
sph_blake2b_init( &ctx, 32, NULL, 0 );
sph_blake2b_update( &ctx, key, keylen );
sph_blake2b_final( &ctx, keyhash );
key = keyhash;
keylen = 32;
}
sph_blake2b_init( &hctx->inner, 32, NULL, 0 );
memset( pad, 0x36, 64 );
for ( i = 0; i < keylen; ++i )
pad[i] ^= key[i];
sph_blake2b_update( &hctx->inner, pad, 64 );
sph_blake2b_init( &hctx->outer, 32, NULL, 0 );
memset( pad, 0x5c, 64 );
for ( i = 0; i < keylen; ++i )
pad[i] ^= key[i];
sph_blake2b_update( &hctx->outer, pad, 64 );
memset( keyhash, 0, 32 );
}
// datalen = number of bits
void hmac_blake2b_update( hmac_blake2b_ctx *hctx, const void *data,
size_t datalen )
{
// update the inner state
sph_blake2b_update( &hctx->inner, data, datalen );
}
void hmac_blake2b_final( hmac_blake2b_ctx *hctx, uint8_t *digest )
{
uint8_t ihash[32];
sph_blake2b_final( &hctx->inner, ihash );
sph_blake2b_update( &hctx->outer, ihash, 32 );
sph_blake2b_final( &hctx->outer, digest );
memset( ihash, 0, 32 );
}
// // keylen = number of bytes; inlen = number of bytes
void hmac_blake2b_hash( void *out, const void *key, size_t keylen,
const void *in, size_t inlen )
{
hmac_blake2b_ctx hctx;
hmac_blake2b_init( &hctx, key, keylen );
hmac_blake2b_update( &hctx, in, inlen );
hmac_blake2b_final( &hctx, out );
}
void pbkdf2_blake2b( const uint8_t *passwd, size_t passwdlen,
const uint8_t *salt, size_t saltlen, uint64_t c,
uint8_t *buf, size_t dkLen )
{
hmac_blake2b_ctx PShctx, hctx;
size_t i;
uint32_t ivec;
uint8_t U[32];
uint8_t T[32];
uint64_t j;
int k;
size_t clen;
/* Compute HMAC state after processing P and S. */
hmac_blake2b_init( &PShctx, passwd, passwdlen );
hmac_blake2b_update( &PShctx, salt, saltlen );
/* Iterate through the blocks. */
for ( i = 0; i * 32 < dkLen; i++ )
{
/* Generate INT(i + 1). */
ivec = bswap_32( i+1 );
/* Compute U_1 = PRF(P, S || INT(i)). */
memcpy( &hctx, &PShctx, sizeof(hmac_blake2b_ctx) );
hmac_blake2b_update( &hctx, &ivec, 4 );
hmac_blake2b_final( &hctx, U );
/* T_i = U_1 ... */
memcpy( T, U, 32 );
for ( j = 2; j <= c; j++ )
{
/* Compute U_j. */
hmac_blake2b_init( &hctx, passwd, passwdlen );
hmac_blake2b_update( &hctx, U, 32 );
hmac_blake2b_final( &hctx, U );
/* ... xor U_j ... */
for ( k = 0; k < 32; k++ )
T[k] ^= U[k];
}
/* Copy as many bytes as necessary into buf. */
clen = dkLen - i * 32;
if (clen > 32)
clen = 32;
memcpy( &buf[i * 32], T, clen );
}
/* Clean PShctx, since we never called _Final on it. */
memset( &PShctx, 0, sizeof(hmac_blake2b_ctx) );
}

View File

@@ -0,0 +1,34 @@
#pragma once
#ifndef __HMAC_BLAKE2B_H__
#define __HMAC_BLAKE2B_H__
#include <stddef.h>
#include <stdint.h>
#include "algo/blake/sph_blake2b.h"
#if defined(_MSC_VER) || defined(__x86_64__) || defined(__x86__)
#define NATIVE_LITTLE_ENDIAN
#endif
typedef struct
{
sph_blake2b_ctx inner;
sph_blake2b_ctx outer;
} hmac_blake2b_ctx;
#if defined(__cplusplus)
extern "C" {
#endif
void hmac_blake2b_hash( void *out, const void *key, size_t keylen,
const void *in, size_t inlen );
void pbkdf2_blake2b( const uint8_t * passwd, size_t passwdlen,
const uint8_t * salt, size_t saltlen, uint64_t c,
uint8_t * buf, size_t dkLen );
#if defined(__cplusplus)
}
#endif
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -95,7 +95,7 @@
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "crypto/blake2b-yp.h"
#include "crypto/hmac-blake2b.h"
#include "yespower.h"
#ifdef __unix__
@@ -1136,6 +1136,7 @@ int yespower_b2b(yespower_local_t *local,
salsa20_blk_t *V, *XY;
pwxform_ctx_t ctx;
uint8_t init_hash[32];
sph_blake2b_ctx blake2b_ctx;
/* Sanity-check parameters */
if ((N < 1024 || N > 512 * 1024 || r < 8 || r > 32 ||
@@ -1167,7 +1168,9 @@ int yespower_b2b(yespower_local_t *local,
ctx.S0 = S;
ctx.S1 = S + Swidth_to_Sbytes1(Swidth);
blake2b_yp_hash(init_hash, src, srclen);
sph_blake2b_init( &blake2b_ctx, 32, NULL, 0 );
sph_blake2b_update( &blake2b_ctx, src, srclen );
sph_blake2b_final( &blake2b_ctx, init_hash );
ctx.S2 = S + 2 * Swidth_to_Sbytes1(Swidth);
ctx.w = 0;
@@ -1181,7 +1184,7 @@ int yespower_b2b(yespower_local_t *local,
if ( work_restart[thrid].restart ) return false;
pbkdf2_blake2b_yp(init_hash, sizeof(init_hash), src, srclen, 1, B, 128);
pbkdf2_blake2b(init_hash, sizeof(init_hash), src, srclen, 1, B, 128);
if ( work_restart[thrid].restart ) return false;
@@ -1190,7 +1193,7 @@ int yespower_b2b(yespower_local_t *local,
if ( work_restart[thrid].restart ) return false;
hmac_blake2b_yp_hash((uint8_t *)dst, B + B_size - 64, 64, init_hash, sizeof(init_hash));
hmac_blake2b_hash((uint8_t *)dst, B + B_size - 64, 64, init_hash, sizeof(init_hash));
/* Success! */
return 1;

View File

@@ -161,7 +161,7 @@ bool register_yespowerr16_algo( algo_gate_t* gate )
// Legacy Yescrypt (yespower v0.5)
bool register_yescrypt_05_algo( algo_gate_t* gate )
bool register_yescrypt_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_yespower;
@@ -194,7 +194,7 @@ bool register_yescrypt_05_algo( algo_gate_t* gate )
}
bool register_yescryptr8_05_algo( algo_gate_t* gate )
bool register_yescryptr8_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_yespower;
@@ -207,7 +207,7 @@ bool register_yescryptr8_05_algo( algo_gate_t* gate )
return true;
}
bool register_yescryptr16_05_algo( algo_gate_t* gate )
bool register_yescryptr16_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_yespower;
@@ -220,7 +220,7 @@ bool register_yescryptr16_05_algo( algo_gate_t* gate )
return true;
}
bool register_yescryptr32_05_algo( algo_gate_t* gate )
bool register_yescryptr32_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_yespower;
@@ -249,7 +249,7 @@ bool register_power2b_algo( algo_gate_t* gate )
applog( LOG_NOTICE,"Key= \"%s\"", yespower_params.pers );
applog( LOG_NOTICE,"Key length= %d\n", yespower_params.perslen );
gate->optimizations = SSE2_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_yespower_b2b;
gate->hash = (void*)&yespower_b2b_hash;
opt_target_factor = 65536.0;

48
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.20.0.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.20.2.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.20.0'
PACKAGE_STRING='cpuminer-opt 3.20.0'
PACKAGE_VERSION='3.20.2'
PACKAGE_STRING='cpuminer-opt 3.20.2'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.20.0 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.20.2 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.20.0:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.20.2:";;
esac
cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.20.0
cpuminer-opt configure 3.20.2
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.20.0, which was
It was created by cpuminer-opt $as_me 3.20.2, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2993,7 +2993,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.20.0'
VERSION='3.20.2'
cat >>confdefs.h <<_ACEOF
@@ -5820,6 +5820,34 @@ $as_echo "#define USE_AVX2 1" >>confdefs.h
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
$as_echo "yes" >&6; }
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX512 code" >&5
$as_echo_n "checking whether we can compile AVX512 code... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main ()
{
asm ("vpaddd %zmm0, %zmm1, %zmm2{%k1}");
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"; then :
$as_echo "#define USE_AVX512 1" >>confdefs.h
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
$as_echo "yes" >&6; }
else
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
$as_echo "no" >&6; }
{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the AVX512 instruction set." >&5
$as_echo "$as_me: WARNING: The assembler does not support the AVX512 instruction set." >&2;}
fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
else
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
@@ -6690,7 +6718,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.20.0, which was
This file was extended by cpuminer-opt $as_me 3.20.2, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6756,7 +6784,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.20.0
cpuminer-opt config.status 3.20.2
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.20.0])
AC_INIT([cpuminer-opt], [3.20.2])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM
@@ -93,6 +93,14 @@ then
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vpaddd %ymm0, %ymm1, %ymm2");])],
AC_DEFINE(USE_AVX2, 1, [Define to 1 if AVX2 assembly is available.])
AC_MSG_RESULT(yes)
AC_MSG_CHECKING(whether we can compile AVX512 code)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vpaddd %zmm0, %zmm1, %zmm2{%k1}");])],
AC_DEFINE(USE_AVX512, 1, [Define to 1 if AVX512 assembly is available.])
AC_MSG_RESULT(yes)
,
AC_MSG_RESULT(no)
AC_MSG_WARN([The assembler does not support the AVX512 instruction set.])
)
,
AC_MSG_RESULT(no)
AC_MSG_WARN([The assembler does not support the AVX2 instruction set.])

View File

@@ -1300,6 +1300,7 @@ static int share_result( int result, struct work *work,
my_stats.share_count, acol, ares, scol, sres, rcol, rres, bcol,
bres, CL_N, share_time, latency );
/*
if ( unlikely( opt_debug || !result || solved ) )
{
if ( have_stratum )
@@ -1309,14 +1310,27 @@ static int share_result( int result, struct work *work,
applog2( LOG_INFO, "Diff %.5g, Block %d",
my_stats.share_diff, work ? work->height : last_block_height );
}
*/
if ( unlikely( !( opt_quiet || result || stale ) ) )
{
uint32_t str[8];
uint32_t *targ;
// uint32_t str[8];
// uint32_t *targ;
if ( reason ) applog( LOG_MINR, "Reject reason: %s", reason );
if ( reason ) applog2( LOG_MINR, "Reject reason: %s", reason );
{
// The exact hash is not avaiable here, it's just an imprecise
// approximation calculated from the share difficulty. It's useless
// for anything other than low diff rejects. Until and unless a
// solution is implemented to make the hash and targets avaiable
// don't bother displaying them. In the meantime display the diff for
// low diff rejects.
if ( strstr( reason, "difficulty" ) )
applog2( LOG_MINR, "Share diff: %.5g, Target: %.5g",
my_stats.share_diff, my_stats.target_diff );
/*
diff_to_hash( str, my_stats.share_diff );
applog2( LOG_INFO, "Hash: %08x%08x%08x%08x%08x%08x", str[7], str[6],
str[5], str[4], str[3],str[2], str[1], str[0] );
@@ -1330,6 +1344,8 @@ static int share_result( int result, struct work *work,
}
applog2( LOG_INFO, "Target: %08x%08x%08x%08x%08x%08x", targ[7], targ[6],
targ[5], targ[4], targ[3], targ[2], targ[1], targ[0] );
*/
}
}
return 1;
}

View File

@@ -273,9 +273,9 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#endif
// Mask making
// Equivalent of AVX512 _mm_movepi64_mask & _mm_movepi32_mask.
// Returns 2 or 4 bit integer mask from MSB of 64 or 32 bit elements.
// Effectively a sign test.
#define mm_movmask_64( v ) \
_mm_castpd_si128( _mm_movmask_pd( _mm_castsi128_pd( v ) ) )
@@ -306,34 +306,11 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
//
// Bit rotations
// AVX512VL has implemented bit rotation for 128 bit vectors with
// 64 and 32 bit elements.
// x2 rotates elements in 2 individual vectors in a double buffered
// optimization for SSE2, does nothing for AVX512 but is there for
// transparency.
// compiler doesn't like when a variable is used for the last arg of
// _mm_rol_epi32, must be "8 bit immediate". Oddly _mm_slli has the same
// specification but works with a variable. Therefore use rol_var where
// necessary.
// sm3-hash-4way.c has one instance where mm128_rol_var_32 is required.
#define mm128_ror_var_64( v, c ) \
_mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )
#define mm128_rol_var_64( v, c ) \
_mm_or_si128( _mm_slli_epi64( v, c ), _mm_srli_epi64( v, 64-(c) ) )
#define mm128_ror_var_32( v, c ) \
_mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) )
#define mm128_rol_var_32( v, c ) \
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
#if defined(__AVX512VL__)
//#if defined(__AVX512F__) && defined(__AVX512VL__)
#define mm128_ror_64 _mm_ror_epi64
#define mm128_rol_64 _mm_rol_epi64
@@ -358,10 +335,17 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#else // SSE2
#define mm128_ror_64 mm128_ror_var_64
#define mm128_rol_64 mm128_rol_var_64
#define mm128_ror_32 mm128_ror_var_32
#define mm128_rol_32 mm128_rol_var_32
#define mm128_ror_64( v, c ) \
_mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )
#define mm128_rol_64( v, c ) \
_mm_or_si128( _mm_slli_epi64( v, c ), _mm_srli_epi64( v, 64-(c) ) )
#define mm128_ror_32( v, c ) \
_mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) )
#define mm128_rol_32( v, c ) \
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
#define mm128_rorx2_64( v1, v0, c ) \
{ \
@@ -411,6 +395,11 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_rol_16( v, c ) \
_mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) )
// Deprecated.
#define mm128_rol_var_32( v, c ) \
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
//
// Limited 2 input shuffle, combines shuffle with blend. The destination low
// half is always taken from src a, and the high half from src b.
#define mm128_shuffle2_64( a, b, c ) \
@@ -421,7 +410,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
_mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( a ), \
_mm_castsi128_ps( b ), c ) );
//
// Rotate vector elements accross all lanes
@@ -432,21 +420,61 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_shuflr_32( v ) _mm_shuffle_epi32( v, 0x39 )
#define mm128_shufll_32( v ) _mm_shuffle_epi32( v, 0x93 )
// Swap 32 bit elements in 64 bit lanes
#define mm128_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 )
#define mm128_shuflr64_32 mm128_swap64_32
#define mm128_shufll64_32 mm128_swap64_32
#if defined(__SSSE3__)
// Rotate right by c bytes, no SSE2 equivalent.
static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
{ return _mm_alignr_epi8( v, v, c ); }
#endif
// Rotate byte elements within 64 or 32 bit lanes, AKA optimized bit rotations
// for multiples of 8 bits. Uses ror/rol macros when AVX512 is available
// (unlikely but faster), or when SSSE3 is not available (slower).
#define mm128_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 )
#define mm128_shuflr64_32 mm128_swap64_32
#define mm128_shufll64_32 mm128_swap64_32
#if defined(__SSSE3__) && !defined(__AVX512VL__)
#define mm128_shuflr64_24( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0a09080f0e0d0c0b, 0x0201000706050403 ) )
#else
#define mm128_shuflr64_24( v ) mm128_ror_64( v, 24 )
#endif
#if defined(__SSSE3__) && !defined(__AVX512VL__)
#define mm128_shuflr64_16( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x09080f0e0d0c0b0a, 0x0100070605040302 ) )
#else
#define mm128_shuflr64_16( v ) mm128_ror_64( v, 16 )
#endif
#if defined(__SSSE3__) && !defined(__AVX512VL__)
#define mm128_swap32_16( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0d0c0f0e09080b0a, 0x0504070601000302 ) )
#else
#define mm128_swap32_16( v ) mm128_ror_32( v, 16 )
#endif
#define mm128_shuflr32_16 mm128_swap32_16
#define mm128_shufll32_16 mm128_swap32_16
#if defined(__SSSE3__) && !defined(__AVX512VL__)
#define mm128_shuflr32_8( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0c0f0e0d080b0a09, 0x0407060500030201 ) )
#else
#define mm128_shuflr32_8( v ) mm128_ror_32( v, 8 )
#endif
//
// Endian byte swap.
#if defined(__SSSE3__)
#define mm128_bswap_64( v ) \
_mm_shuffle_epi8( v, m128_const_64( 0x08090a0b0c0d0e0f, \
0x0001020304050607 ) )
@@ -537,8 +565,8 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
//
// Rotate in place concatenated 128 bit vectors as one 256 bit vector.
// Swap 128 bit vectorse.
// Swap 128 bit vectors.
// This should be avoided, it's more efficient to switch references.
#define mm128_swap256_128( v1, v2 ) \
v1 = _mm_xor_si128( v1, v2 ); \
v2 = _mm_xor_si128( v1, v2 ); \
@@ -546,15 +574,14 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
// Two input shuffle-rotate.
// Concatenate v1 & v2 and rotate as one 256 bit vector.
// Continue to use vror/vrol for now to avoid confusion with
// shufl2r/shufl2l function macros available with AVX512.
// Concatenate v1 & v2 and bit rotate as one 256 bit vector.
#if defined(__SSSE3__)
// Function macro with two inputs and one output, inputs are preserved.
// Two input functions are not available without SSSE3. Use procedure
// macros below instead.
// Function macros with two inputs and one output, inputs are preserved.
// Returns the high 128 bits, ie updated v1.
// These functions are preferred but only available with SSSE3. Use procedure
// macros below for SSE2 compatibility.
#define mm128_shufl2r_64( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 )
#define mm128_shufl2l_64( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 )
@@ -568,12 +595,9 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
#define mm128_shufl2r_8( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 )
#define mm128_shufl2l_8( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 )
// Procedure macros with 2 inputs and 2 outputs, inputs args are overwritten.
// These macros retain the vrol/vror name for now to avoid
// confusion with the shufl2r/shuffle2l function macros above.
// These may be renamed to something like shufl2r2 for 2 nputs and
// 2 outputs, ie SHUFfLe 2 inputs Right with 2 outputs.
// Procedure macros with 2 inputs and 2 outputs, input args are overwritten.
// Deprecated for SSSE3 and above, SSSE3 versions exist for only for
// compatibility with with existing code.
#define mm128_vror256_64( v1, v2 ) \
do { \

View File

@@ -13,6 +13,18 @@
// AVX512 implementations. They will be selected automatically but their use
// is limited because 256 bit vectors are less likely to be used when 512
// is available.
//
// AVX2 version of _mm256_shuffle_epi8 is limited to 128 bit lanes but AVX512
// version is not. Some usage has the index vector encoded as if full vector
// shuffles are supported. This has no side effects and would have the same
// results using either version.
// If needed and AVX512 is available, 256 bit full vector shuffles can be
// implemented using the AVX512 zero-mask feature with a NULL mask.
// Using intrinsics it's simple:
// _mm256_maskz_shuffle_epi8( k0, v, c )
// With asm it's a bit more complicated with the addition of the mask register
// and zero tag:
// vpshufb ymm0{k0}{z}, ymm1, ymm2
#if defined(__AVX__)
@@ -234,9 +246,9 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
#endif
// Mask making
// Equivalent of AVX512 _mm256_movepi64_mask & _mm256_movepi32_mask.
// Returns 4 or 8 bit integer mask from MSB of 64 or 32 bit elements.
// Effectively a sign test.
#define mm256_movmask_64( v ) \
_mm256_castpd_si256( _mm256_movmask_pd( _mm256_castsi256_pd( v ) ) )
@@ -273,42 +285,11 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
//
// Bit rotations.
//
// The only bit shift for more than 64 bits is with __int128 which is slow.
//
// AVX512 has bit rotate for 256 bit vectors with 64 or 32 bit elements
//
// x2 rotates elements in 2 individual vectors in a double buffered
// optimization for SSE2, does nothing for AVX512 but is there for
// optimization for AVX2, does nothing for AVX512 but is here for
// transparency.
// compiler doesn't like when a variable is used for the last arg of
// _mm_rol_epi32, must be "8 bit immediate". Therefore use rol_var where
// necessary.
#define mm256_ror_var_64( v, c ) \
_mm256_or_si256( _mm256_srli_epi64( v, c ), \
_mm256_slli_epi64( v, 64-(c) ) )
#define mm256_rol_var_64( v, c ) \
_mm256_or_si256( _mm256_slli_epi64( v, c ), \
_mm256_srli_epi64( v, 64-(c) ) )
#define mm256_ror_var_32( v, c ) \
_mm256_or_si256( _mm256_srli_epi32( v, c ), \
_mm256_slli_epi32( v, 32-(c) ) )
#define mm256_rol_var_32( v, c ) \
_mm256_or_si256( _mm256_slli_epi32( v, c ), \
_mm256_srli_epi32( v, 32-(c) ) )
// The spec says both F & VL are required, but just in case AMD
// decides to implement ROL/R without AVX512F.
#if defined(__AVX512VL__)
//#if defined(__AVX512F__) && defined(__AVX512VL__)
// AVX512, control must be 8 bit immediate.
#define mm256_ror_64 _mm256_ror_epi64
#define mm256_rol_64 _mm256_rol_epi64
@@ -333,10 +314,23 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
#else // AVX2
#define mm256_ror_64 mm256_ror_var_64
#define mm256_rol_64 mm256_rol_var_64
#define mm256_ror_32 mm256_ror_var_32
#define mm256_rol_32 mm256_rol_var_32
// use shuflr64 shuflr32 below for optimized bit rotations of multiples of 8.
#define mm256_ror_64( v, c ) \
_mm256_or_si256( _mm256_srli_epi64( v, c ), \
_mm256_slli_epi64( v, 64-(c) ) )
#define mm256_rol_64( v, c ) \
_mm256_or_si256( _mm256_slli_epi64( v, c ), \
_mm256_srli_epi64( v, 64-(c) ) )
#define mm256_ror_32( v, c ) \
_mm256_or_si256( _mm256_srli_epi32( v, c ), \
_mm256_slli_epi32( v, 32-(c) ) )
#define mm256_rol_32( v, c ) \
_mm256_or_si256( _mm256_slli_epi32( v, c ), \
_mm256_srli_epi32( v, 32-(c) ) )
#define mm256_rorx2_64( v1, v0, c ) \
{ \
@@ -388,6 +382,10 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
_mm256_or_si256( _mm256_slli_epi16( v, c ), \
_mm256_srli_epi16( v, 16-(c) ) )
// Deprecated.
#define mm256_rol_var_32( v, c ) \
_mm256_or_si256( _mm256_slli_epi32( v, c ), \
_mm256_srli_epi32( v, 32-(c) ) )
//
// Rotate elements accross all lanes.
@@ -399,7 +397,6 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
// Rotate 256 bit vector by one 64 bit element
#define mm256_shuflr_64( v ) _mm256_permute4x64_epi64( v, 0x39 )
#define mm256_shufll_64( v ) _mm256_permute4x64_epi64( v, 0x93 )
// Rotate 256 bit vector by one 32 bit element.
@@ -413,7 +410,6 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
m256_const_64( 0x0000000600000005, 0x0000000400000003, \
0x0000000200000001, 0x0000000000000007 ) )
//
// Rotate elements within each 128 bit lane of 256 bit vector.
@@ -426,7 +422,6 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
_mm256_castps_si256( _mm256_shuffle_ps( _mm256_castsi256_ps( a ), \
_mm256_castsi256_ps( b ), c ) );
#define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e )
#define mm256_shuflr128_64 mm256_swap128_64
#define mm256_shufll128_64 mm256_swap128_64
@@ -437,11 +432,52 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
{ return _mm256_alignr_epi8( v, v, c ); }
// Swap 32 bit elements in each 64 bit lane.
// Rotate byte elements within 64 or 32 bit lanes, AKA optimized bit
// rotations for multiples of 8 bits. Uses faster ror/rol instructions when
// AVX512 is available.
#define mm256_swap64_32( v ) _mm256_shuffle_epi32( v, 0xb1 )
#define mm256_shuflr64_32 mm256_swap64_32
#define mm256_shufll64_32 mm256_swap64_32
#if defined(__AVX512VL__)
#define mm256_shuflr64_24( v ) _mm256_ror_epi64( v, 24 )
#else
#define mm256_shuflr64_24( v ) \
_mm256_shuffle_epi8( v, _mm256_set_epi64x( \
0x0a09080f0e0d0c0b, 0x0201000706050403, \
0x0a09080f0e0d0c0b, 0x0201000706050403 ) )
#endif
#if defined(__AVX512VL__)
#define mm256_shuflr64_16( v ) _mm256_ror_epi64( v, 16 )
#else
#define mm256_shuflr64_16( v ) \
_mm256_shuffle_epi8( v, _mm256_set_epi64x( \
0x09080f0e0d0c0b0a, 0x0100070605040302, \
0x09080f0e0d0c0b0a, 0x0100070605040302 ) )
#endif
#if defined(__AVX512VL__)
#define mm256_swap32_16( v ) _mm256_ror_epi32( v, 16 )
#else
#define mm256_swap32_16( v ) \
_mm256_shuffle_epi8( v, _mm256_set_epi64x( \
0x0d0c0f0e09080b0a, 0x0504070601000302, \
0x0d0c0f0e09080b0a, 0x0504070601000302 ) )
#endif
#define mm256_shuflr32_16 mm256_swap32_16
#define mm256_shufll32_16 mm256_swap32_16
#if defined(__AVX512VL__)
#define mm256_shuflr32_8( v ) _mm256_ror_epi32( v, 8 )
#else
#define mm256_shuflr32_8( v ) \
_mm256_shuffle_epi8( v, _mm256_set_epi64x( \
0x0c0f0e0d080b0a09, 0x0407060500030201, \
0x0c0f0e0d080b0a09, 0x0407060500030201 ) )
#endif
// NOTE: _mm256_shuffle_epi8, like most shuffles, is restricted to 128 bit
// lanes. AVX512, however, supports full vector 8 bit shuffle. The AVX512VL +
// AVX512BW intrinsic _mm256_mask_shuffle_epi8 with a NULL mask, can be used if
@@ -496,18 +532,8 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
casti_m256i( d, 7 ) = _mm256_shuffle_epi8( casti_m256i( s, 7 ), ctl ); \
} while(0)
//
// Rotate two concatenated 256 bit vectors as one 512 bit vector by specified
// number of elements. Rotate is done in place, source arguments are
// overwritten.
// Some of these can use permute but appears to be slower. Maybe a Ryzen
// issue
// _mm256_alignr_epi 64/32 are only available with AVX512 but AVX512 also
// makes these macros unnecessary.
// continue using vror/vrol notation for now to avoid confusion with
// shufl2r/shufl2l macro functions available with AVX512.
// swap 256 bit vectors in place.
// This should be avoided, it's more efficient to switch references.
#define mm256_swap512_256( v1, v2 ) \
v1 = _mm256_xor_si256( v1, v2 ); \
v2 = _mm256_xor_si256( v1, v2 ); \

View File

@@ -316,58 +316,18 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
// Bit rotations.
// AVX512F has built-in fixed and variable bit rotation for 64 & 32 bit
// elements and can be called directly. But they only accept immediate 8
// for control arg.
// The workaround is a fraud, just a fluke of the compiler's optimizer.
// It fails without -O3. The compiler seems to unroll shift loops, eliminating
// the variable control, better than rotate loops.
// elements and can be called directly.
//
// _mm512_rol_epi64, _mm512_ror_epi64, _mm512_rol_epi32, _mm512_ror_epi32
// _mm512_rolv_epi64, _mm512_rorv_epi64, _mm512_rolv_epi32, _mm512_rorv_epi32
//
// For convenience and consistency with AVX2
// For convenience and consistency with AVX2 macros.
#define mm512_ror_64 _mm512_ror_epi64
#define mm512_rol_64 _mm512_rol_epi64
#define mm512_ror_32 _mm512_ror_epi32
#define mm512_rol_32 _mm512_rol_epi32
static inline __m512i mm512_ror_var_64( const __m512i v, const int c )
{
return _mm512_or_si512( _mm512_srli_epi64( v, c ),
_mm512_slli_epi64( v, 64-c ) );
}
static inline __m512i mm512_rol_var_64( const __m512i v, const int c )
{
return _mm512_or_si512( _mm512_slli_epi64( v, c ),
_mm512_srli_epi64( v, 64-c ) );
}
static inline __m512i mm512_ror_var_32( const __m512i v, const int c )
{
return _mm512_or_si512( _mm512_srli_epi32( v, c ),
_mm512_slli_epi32( v, 32-c ) );
}
static inline __m512i mm512_rol_var_32( const __m512i v, const int c )
{
return _mm512_or_si512( _mm512_slli_epi32( v, c ),
_mm512_srli_epi32( v, 32-c ) );
}
static inline __m512i mm512_ror_16( __m512i const v, const int c )
{
return _mm512_or_si512( _mm512_srli_epi16( v, c ),
_mm512_slli_epi16( v, 16-c ) );
}
static inline __m512i mm512_rol_16( const __m512i v, const int c )
{
return _mm512_or_si512( _mm512_slli_epi16( v, c ),
_mm512_srli_epi16( v, 16-c ) );
}
// Rotations using a vector control index are very slow due to overhead
// to generate the index vector. Repeated rotations using the same index
// are better handled by the calling function where the index only needs
@@ -599,22 +559,34 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
{ return _mm512_alignr_epi8( v, v, c ); }
// Swap 32 bits in each 64 bit lane. Can be done with rotate instruction
// but only with AVX512. Shuffle is just as fast and availble with AVX2
// & SSE2.
// Rotate byte elements in each 64 or 32 bit lane. Redundant for AVX512, all
// can be done with ror & rol. Defined only for convenience and consistency
// with AVX2 & SSE2 macros.
#define mm512_swap64_32( v ) _mm512_shuffle_epi32( v, 0xb1 )
#define mm512_shuflr64_32 mm512_swap64_32
#define mm512_shufll64_32 mm512_swap64_32
// Need good way to distinguish 1 input shuffles, 2 input shuffle functions,
// and 2 input 2 output shuffle macros.
//
// shuflr is 1 input
// shufl2r is 2 input ...
// Drop macros? They can easilly be rebuilt using shufl2 functions
#define mm512_shuflr64_24( v ) _mm512_ror_epi64( v, 24 )
#define mm512_shufll64_24( v ) _mm512_rol_epi64( v, 24 )
#define mm512_shuflr64_16( v ) _mm512_ror_epi64( v, 16 )
#define mm512_shufll64_16( v ) _mm512_rol_epi64( v, 16 )
#define mm512_shuflr64_8( v ) _mm512_ror_epi64( v, 8 )
#define mm512_shufll64_8( v ) _mm512_rol_epi64( v, 8 )
#define mm512_swap32_16( v ) _mm512_ror_epi32( v, 16 )
#define mm512_shuflr32_16 mm512_swap32_16
#define mm512_shufll32_16 mm512_swap32_16
#define mm512_shuflr32_8( v ) _mm512_ror_epi32( v, 8 )
#define mm512_shufll32_8( v ) _mm512_rol_epi32( v, 8 )
// 2 input, 1 output
// Rotate concatenated { v1, v2 ) right or left and return v1.
// Concatenate { v1, v2 ) then rotate right or left and return the high
// 512 bits, ie rotated v1.
#define mm512_shufl2r_256( v1, v2 ) _mm512_alignr_epi64( v2, v1, 4 )
#define mm512_shufl2l_256( v1, v2 ) _mm512_alignr_epi64( v1, v2, 4 )