mirror of https://github.com/JayDDee/cpuminer-opt.git (synced 2025-09-17 23:44:27 +00:00)
v3.20.2
@@ -285,8 +285,6 @@ cpuminer_SOURCES = \
   algo/x22/x22i-gate.c \
   algo/x22/x25x.c \
   algo/x22/x25x-4way.c \
-  algo/yescrypt/yescrypt.c \
-  algo/yescrypt/yescrypt-best.c \
   algo/yespower/yespower-gate.c \
   algo/yespower/yespower-blake2b.c \
   algo/yespower/crypto/hmac-blake2b.c \
@@ -65,6 +65,11 @@ If not what makes it happen or not happen?
 Change Log
 ----------

+v3.20.2
+
+Bit rotation optimizations to Blake256, Blake512, Blake2b, Blake2s & Lyra2-blake2b for SSE2 & AVX2.
+Removed old unused yescrypt library and other unused code.
+
 v3.20.1

 sph_blake2b optimized 1-way SSSE3 & AVX2.

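The "bit rotation optimizations" in this release exploit the fact that a vector rotate by a multiple of 8 or 16 bits is a pure byte/word permutation, so it can be done with a single shuffle instead of the usual two shifts plus an OR. A minimal sketch of the idea for 32-bit lanes follows; the helper names echo the mm128_swap32_16 / mm128_shuflr32_8 calls in the hunks below, but these bodies are an illustrative reconstruction, not necessarily the repo's exact definitions.

#include <immintrin.h>

// Generic SSE2 rotate right of each 32-bit lane: shift, shift, OR.
static inline __m128i ror32_sse2( __m128i x, int c )
{
   return _mm_or_si128( _mm_srli_epi32( x, c ),
                        _mm_slli_epi32( x, 32 - c ) );
}

// Rotate by 16 is just a half-word swap: two 16-bit shuffles (SSE2).
static inline __m128i swap32_16_sse2( __m128i x )
{
   x = _mm_shufflelo_epi16( x, _MM_SHUFFLE( 2, 3, 0, 1 ) );
   return _mm_shufflehi_epi16( x, _MM_SHUFFLE( 2, 3, 0, 1 ) );
}

// Rotate right by 8 with a single byte shuffle (SSSE3).
static inline __m128i shuflr32_8_ssse3( __m128i x )
{
   const __m128i ctl = _mm_set_epi64x( 0x0c0f0e0d080b0a09,
                                       0x0407060500030201 );
   return _mm_shuffle_epi8( x, ctl );
}

The rotations by 12 and 7 have no such permutation form, which is why the mm128_ror_32 calls for those counts are left untouched in every hunk below.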
@@ -371,15 +371,11 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
      case ALGO_X22I:          rc = register_x22i_algo          ( gate ); break;
      case ALGO_X25X:          rc = register_x25x_algo          ( gate ); break;
      case ALGO_XEVAN:         rc = register_xevan_algo         ( gate ); break;
-     case ALGO_YESCRYPT:      rc = register_yescrypt_05_algo   ( gate ); break;
-//     case ALGO_YESCRYPT:      register_yescrypt_algo          ( gate ); break;
-     case ALGO_YESCRYPTR8:    rc = register_yescryptr8_05_algo ( gate ); break;
-//     case ALGO_YESCRYPTR8:     register_yescryptr8_algo        ( gate ); break;
+     case ALGO_YESCRYPT:      rc = register_yescrypt_algo      ( gate ); break;
+     case ALGO_YESCRYPTR8:    rc = register_yescryptr8_algo    ( gate ); break;
      case ALGO_YESCRYPTR8G:   rc = register_yescryptr8g_algo   ( gate ); break;
-     case ALGO_YESCRYPTR16:   rc = register_yescryptr16_05_algo( gate ); break;
-//     case ALGO_YESCRYPTR16:    register_yescryptr16_algo       ( gate ); break;
-     case ALGO_YESCRYPTR32:   rc = register_yescryptr32_05_algo( gate ); break;
-//     case ALGO_YESCRYPTR32:    register_yescryptr32_algo       ( gate ); break;
+     case ALGO_YESCRYPTR16:   rc = register_yescryptr16_algo   ( gate ); break;
+     case ALGO_YESCRYPTR32:   rc = register_yescryptr32_algo   ( gate ); break;
      case ALGO_YESPOWER:      rc = register_yespower_algo      ( gate ); break;
      case ALGO_YESPOWERR16:   rc = register_yespowerr16_algo   ( gate ); break;
      case ALGO_YESPOWER_B2B:  rc = register_yespower_b2b_algo  ( gate ); break;

@@ -400,18 +400,18 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
 // Blake-256 4 way

 #define GS_4WAY( m0, m1, c0, c1, a, b, c, d ) \
-do { \
+{ \
    a = _mm_add_epi32( _mm_add_epi32( a, b ), \
           _mm_xor_si128( _mm_set1_epi32( c1 ), m0 ) ); \
-   d = mm128_ror_32( _mm_xor_si128( d, a ), 16 ); \
+   d = mm128_swap32_16( _mm_xor_si128( d, a ) ); \
    c = _mm_add_epi32( c, d ); \
    b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \
    a = _mm_add_epi32( _mm_add_epi32( a, b ), \
           _mm_xor_si128( _mm_set1_epi32( c0 ), m1 ) ); \
-   d = mm128_ror_32( _mm_xor_si128( d, a ), 8 ); \
+   d = mm128_shuflr32_8( _mm_xor_si128( d, a ) ); \
    c = _mm_add_epi32( c, d ); \
    b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \
-} while (0)
+}

 #if SPH_COMPACT_BLAKE_32

@@ -441,7 +441,8 @@ do { \

 #else

-#define ROUND_S_4WAY(r) do { \
+#define ROUND_S_4WAY(r) \
+{ \
    GS_4WAY(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
    GS_4WAY(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
    GS_4WAY(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
@@ -450,7 +451,7 @@ do { \
    GS_4WAY(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
    GS_4WAY(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
    GS_4WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
-} while (0)
+}

 #endif

@@ -537,7 +538,7 @@ do { \

 #if defined(__SSSE3__)

-#define BLAKE256_4WAY_BLOCK_BSWAP32 do \
+#define BLAKE256_4WAY_BLOCK_BSWAP32 \
 { \
    __m128i shuf_bswap32 = _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
                                           0x0405060700010203 ); \
@@ -557,11 +558,11 @@ do { \
    MD = _mm_shuffle_epi8( buf[13], shuf_bswap32 ); \
    ME = _mm_shuffle_epi8( buf[14], shuf_bswap32 ); \
    MF = _mm_shuffle_epi8( buf[15], shuf_bswap32 ); \
-} while(0)
+}

 #else  // SSE2

-#define BLAKE256_4WAY_BLOCK_BSWAP32 do \
+#define BLAKE256_4WAY_BLOCK_BSWAP32 \
 { \
    M0 = mm128_bswap_32( buf[0] ); \
    M1 = mm128_bswap_32( buf[1] ); \
@@ -579,12 +580,12 @@ do { \
    MD = mm128_bswap_32( buf[13] ); \
    ME = mm128_bswap_32( buf[14] ); \
    MF = mm128_bswap_32( buf[15] ); \
-} while(0)
+}

 #endif  // SSSE3 else SSE2

 #define COMPRESS32_4WAY( rounds ) \
-do { \
+{ \
    __m128i M0, M1, M2, M3, M4, M5, M6, M7; \
    __m128i M8, M9, MA, MB, MC, MD, ME, MF; \
    __m128i V0, V1, V2, V3, V4, V5, V6, V7; \
@@ -631,7 +632,7 @@ do { \
    H5 = _mm_xor_si128( _mm_xor_si128( VD, V5 ), H5 ); \
    H6 = _mm_xor_si128( _mm_xor_si128( VE, V6 ), H6 ); \
    H7 = _mm_xor_si128( _mm_xor_si128( VF, V7 ), H7 ); \
-} while (0)
+}

 #endif

@@ -642,20 +643,21 @@ do { \
 // Blake-256 8 way

 #define GS_8WAY( m0, m1, c0, c1, a, b, c, d ) \
-do { \
+{ \
    a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \
           _mm256_xor_si256( _mm256_set1_epi32( c1 ), m0 ) ); \
-   d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \
+   d = mm256_swap32_16( _mm256_xor_si256( d, a ) ); \
    c = _mm256_add_epi32( c, d ); \
    b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
    a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \
           _mm256_xor_si256( _mm256_set1_epi32( c0 ), m1 ) ); \
-   d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \
+   d = mm256_shuflr32_8( _mm256_xor_si256( d, a ) ); \
    c = _mm256_add_epi32( c, d ); \
    b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
-} while (0)
+}

-#define ROUND_S_8WAY(r) do { \
+#define ROUND_S_8WAY(r) \
+{ \
    GS_8WAY(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
    GS_8WAY(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
    GS_8WAY(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
@@ -664,7 +666,7 @@ do { \
    GS_8WAY(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
    GS_8WAY(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
    GS_8WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
-} while (0)
+}

 #define DECL_STATE32_8WAY \
    __m256i H0, H1, H2, H3, H4, H5, H6, H7; \
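The 8-way hunks apply the same substitution with 256-bit vectors. For reference, a plausible AVX2 rendering of the two helpers (again a sketch under the assumption that the repo implements them this way): _mm256_shuffle_epi8 permutes within each 128-bit lane, so the 16-byte control pattern is simply repeated in both lanes.

#include <immintrin.h>

// Rotate each 32-bit lane right by 16 (half-word swap) via byte shuffle.
static inline __m256i swap32_16_avx2( __m256i x )
{
   const __m256i ctl = _mm256_set_epi64x(
         0x0d0c0f0e09080b0a, 0x0504070601000302,
         0x0d0c0f0e09080b0a, 0x0504070601000302 );
   return _mm256_shuffle_epi8( x, ctl );
}

// Rotate each 32-bit lane right by 8 via byte shuffle.
static inline __m256i shuflr32_8_avx2( __m256i x )
{
   const __m256i ctl = _mm256_set_epi64x(
         0x0c0f0e0d080b0a09, 0x0407060500030201,
         0x0c0f0e0d080b0a09, 0x0407060500030201 );
   return _mm256_shuffle_epi8( x, ctl );
}

Note that the 16-way AVX512 code below keeps mm512_ror_32 for every count: AVX512 has a native vector rotate instruction, so the shuffle trick buys nothing there.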
@@ -699,7 +701,7 @@ do { \
 } while (0)

 #define COMPRESS32_8WAY( rounds ) \
-do { \
+{ \
    __m256i M0, M1, M2, M3, M4, M5, M6, M7; \
    __m256i M8, M9, MA, MB, MC, MD, ME, MF; \
    __m256i V0, V1, V2, V3, V4, V5, V6, V7; \
@@ -764,10 +766,10 @@ do { \
    H5 = mm256_xor3( VD, V5, H5 ); \
    H6 = mm256_xor3( VE, V6, H6 ); \
    H7 = mm256_xor3( VF, V7, H7 ); \
-} while (0)
+}

 #define COMPRESS32_8WAY_LE( rounds ) \
-do { \
+{ \
    __m256i M0, M1, M2, M3, M4, M5, M6, M7; \
    __m256i M8, M9, MA, MB, MC, MD, ME, MF; \
    __m256i V0, V1, V2, V3, V4, V5, V6, V7; \
@@ -829,7 +831,7 @@ do { \
    H5 = mm256_xor3( VD, V5, H5 ); \
    H6 = mm256_xor3( VE, V6, H6 ); \
    H7 = mm256_xor3( VF, V7, H7 ); \
-} while (0)
+}

 void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
                                       const void *data )
@@ -861,7 +863,7 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
    // G1
    V[ 1] = _mm256_add_epi32( _mm256_add_epi32( V[ 1], V[ 5] ),
                  _mm256_xor_si256( _mm256_set1_epi32( CS3 ), M[ 2] ) );
-   V[13] = mm256_ror_32( _mm256_xor_si256( V[13], V[ 1] ), 16 );
+   V[13] = mm256_swap32_16( _mm256_xor_si256( V[13], V[ 1] ) );
    V[ 9] = _mm256_add_epi32( V[ 9], V[13] );
    V[ 5] = mm256_ror_32( _mm256_xor_si256( V[ 5], V[ 9] ), 12 );
    V[ 1] = _mm256_add_epi32( V[ 1], V[ 5] );
@@ -881,7 +883,7 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
    // G7
    V[ 3] = _mm256_add_epi32( _mm256_add_epi32( V[ 3], V[ 4] ),
                  _mm256_xor_si256( _mm256_set1_epi32( CSF ), M[14] ) );
-   V[14] = mm256_ror_32( _mm256_xor_si256( V[14], V[ 3] ), 16 );
+   V[14] = mm256_swap32_16( _mm256_xor_si256( V[14], V[ 3] ) );
    V[ 3] = _mm256_add_epi32( V[ 3],
                  _mm256_xor_si256( _mm256_set1_epi32( CSE ), M[15] ) );
 }
@@ -935,18 +937,18 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
    // G1
    V1 = _mm256_add_epi32( V1,
                  _mm256_xor_si256( _mm256_set1_epi32( CS2 ), M3 ) );
-   VD = mm256_ror_32( _mm256_xor_si256( VD, V1 ), 8 );
+   VD = mm256_shuflr32_8( _mm256_xor_si256( VD, V1 ) );
    V9 = _mm256_add_epi32( V9, VD );
    V5 = mm256_ror_32( _mm256_xor_si256( V5, V9 ), 7 );

    // G4
    V0 = _mm256_add_epi32( V0, V5 );
-   VF = mm256_ror_32( _mm256_xor_si256( VF, V0 ), 16 );
+   VF = mm256_swap32_16( _mm256_xor_si256( VF, V0 ) );
    VA = _mm256_add_epi32( VA, VF );
    V5 = mm256_ror_32( _mm256_xor_si256( V5, VA ), 12 );
    V0 = _mm256_add_epi32( V0, _mm256_add_epi32( V5,
                  _mm256_xor_si256( _mm256_set1_epi32( CS8 ), M9 ) ) );
-   VF = mm256_ror_32( _mm256_xor_si256( VF, V0 ), 8 );
+   VF = mm256_shuflr32_8( _mm256_xor_si256( VF, V0 ) );
    VA = _mm256_add_epi32( VA, VF );
    V5 = mm256_ror_32( _mm256_xor_si256( V5, VA ), 7 );

@@ -954,12 +956,12 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
    GS_8WAY( MA, MB, CSA, CSB, V1, V6, VB, VC );

    // G6
-   VD = mm256_ror_32( _mm256_xor_si256( VD, V2 ), 16 );
+   VD = mm256_swap32_16( _mm256_xor_si256( VD, V2 ) );
    V8 = _mm256_add_epi32( V8, VD );
    V7 = mm256_ror_32( _mm256_xor_si256( V7, V8 ), 12 );
    V2 = _mm256_add_epi32( _mm256_add_epi32( V2, V7 ),
                  _mm256_xor_si256( _mm256_set1_epi32( CSC ), MD ) );
-   VD = mm256_ror_32( _mm256_xor_si256( VD, V2 ), 8 );
+   VD = mm256_shuflr32_8( _mm256_xor_si256( VD, V2 ) );
    V8 = _mm256_add_epi32( V8, VD );
    V7 = mm256_ror_32( _mm256_xor_si256( V7, V8 ), 7 );

@@ -967,7 +969,7 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
    V9 = _mm256_add_epi32( V9, VE );
    V4 = mm256_ror_32( _mm256_xor_si256( V4, V9 ), 12 );
    V3 = _mm256_add_epi32( V3, V4 );
-   VE = mm256_ror_32( _mm256_xor_si256( VE, V3 ), 8 );
+   VE = mm256_shuflr32_8( _mm256_xor_si256( VE, V3 ) );
    V9 = _mm256_add_epi32( V9, VE );
    V4 = mm256_ror_32( _mm256_xor_si256( V4, V9 ), 7 );

@@ -1009,7 +1011,7 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
 // Blake-256 16 way AVX512

 #define GS_16WAY( m0, m1, c0, c1, a, b, c, d ) \
-do { \
+{ \
    a = _mm512_add_epi32( _mm512_add_epi32( a, b ), \
           _mm512_xor_si512( _mm512_set1_epi32( c1 ), m0 ) ); \
    d = mm512_ror_32( _mm512_xor_si512( d, a ), 16 ); \
@@ -1020,9 +1022,10 @@ do { \
    d = mm512_ror_32( _mm512_xor_si512( d, a ), 8 ); \
    c = _mm512_add_epi32( c, d ); \
    b = mm512_ror_32( _mm512_xor_si512( b, c ), 7 ); \
-} while (0)
+}

-#define ROUND_S_16WAY(r) do { \
+#define ROUND_S_16WAY(r) \
+{ \
    GS_16WAY(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
    GS_16WAY(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
    GS_16WAY(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
@@ -1031,7 +1034,7 @@ do { \
    GS_16WAY(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
    GS_16WAY(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
    GS_16WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
-} while (0)
+}

 #define DECL_STATE32_16WAY \
    __m512i H0, H1, H2, H3, H4, H5, H6, H7; \
@@ -1066,7 +1069,7 @@ do { \
 } while (0)

 #define COMPRESS32_16WAY( rounds ) \
-do { \
+{ \
    __m512i M0, M1, M2, M3, M4, M5, M6, M7; \
    __m512i M8, M9, MA, MB, MC, MD, ME, MF; \
    __m512i V0, V1, V2, V3, V4, V5, V6, V7; \
@@ -1133,10 +1136,10 @@ do { \
    H5 = mm512_xor3( VD, V5, H5 ); \
    H6 = mm512_xor3( VE, V6, H6 ); \
    H7 = mm512_xor3( VF, V7, H7 ); \
-} while (0)
+}

 #define COMPRESS32_16WAY_LE( rounds ) \
-do { \
+{ \
    __m512i M0, M1, M2, M3, M4, M5, M6, M7; \
    __m512i M8, M9, MA, MB, MC, MD, ME, MF; \
    __m512i V0, V1, V2, V3, V4, V5, V6, V7; \
@@ -1198,7 +1201,7 @@ do { \
    H5 = mm512_xor3( VD, V5, H5 ); \
    H6 = mm512_xor3( VE, V6, H6 ); \
    H7 = mm512_xor3( VF, V7, H7 ); \
-} while (0)
+}

 // Blake-256 prehash of the second block is split into 2 parts. The first part
 // is constant for every nonce and only needs to be run once per job. The
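A sketch of the calling pattern that comment describes: per-job work is hoisted out of the nonce loop and only per-nonce work is redone. The prehash signature is taken from this diff; the final-rounds parameter list and the midstate size are assumptions for illustration, not the repo's exact interface.

#include <stdint.h>
#include <immintrin.h>

void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
                                      const void *data );
void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
                                    const void *midhash, const void *data );
                                    /* trailing parameters assumed */

void scan_sketch( const void *midhash, void *vdata, void *hash,
                  uint32_t first_nonce, uint32_t max_nonce )
{
   __m256i midstate[16];   // size assumed: the 16-word V state, 8 lanes wide

   // Part 1: constant for every nonce, run once per job.
   blake256_8way_round0_prehash_le( midstate, midhash, vdata );

   for ( uint32_t n = first_nonce; n < max_nonce; n += 8 )
   {
      // ...update the 8 nonce lanes in vdata here (omitted)...
      // Part 2: only the nonce-dependent rounds are recomputed.
      blake256_8way_final_rounds_le( hash, midstate, midhash, vdata );
   }
}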
@@ -388,11 +388,11 @@ void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
 #define B2B_G(a, b, c, d, x, y) \
 { \
    v[a] = _mm256_add_epi64( _mm256_add_epi64( v[a], v[b] ), x ); \
-   v[d] = mm256_ror_64( _mm256_xor_si256( v[d], v[a] ), 32 ); \
+   v[d] = mm256_swap64_32( _mm256_xor_si256( v[d], v[a] ) ); \
    v[c] = _mm256_add_epi64( v[c], v[d] ); \
-   v[b] = mm256_ror_64( _mm256_xor_si256( v[b], v[c] ), 24 ); \
+   v[b] = mm256_shuflr64_24( _mm256_xor_si256( v[b], v[c] ) ); \
    v[a] = _mm256_add_epi64( _mm256_add_epi64( v[a], v[b] ), y ); \
-   v[d] = mm256_ror_64( _mm256_xor_si256( v[d], v[a] ), 16 ); \
+   v[d] = mm256_shuflr64_16( _mm256_xor_si256( v[d], v[a] ) ); \
    v[c] = _mm256_add_epi64( v[c], v[d] ); \
    v[b] = mm256_ror_64( _mm256_xor_si256( v[b], v[c] ), 63 ); \
 }

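Blake2b's G function rotates 64-bit lanes by 32, 24, 16 and 63; the first three are byte-aligned, so only the rotate by 63 keeps the shift/OR form. A plausible SSE rendering of the 64-bit helpers used above (an illustrative sketch, not necessarily the repo's exact code):

#include <immintrin.h>

// Rotate each 64-bit lane right by 32: swap the 32-bit halves (SSE2).
static inline __m128i swap64_32_sse2( __m128i x )
{
   return _mm_shuffle_epi32( x, _MM_SHUFFLE( 2, 3, 0, 1 ) );
}

// Rotate each 64-bit lane right by 24 with one byte shuffle (SSSE3).
static inline __m128i shuflr64_24_ssse3( __m128i x )
{
   const __m128i ctl = _mm_set_epi64x( 0x0a09080f0e0d0c0b,
                                       0x0201000706050403 );
   return _mm_shuffle_epi8( x, ctl );
}

// Rotate each 64-bit lane right by 16 with one byte shuffle (SSSE3).
static inline __m128i shuflr64_16_ssse3( __m128i x )
{
   const __m128i ctl = _mm_set_epi64x( 0x09080f0e0d0c0b0a,
                                       0x0100070605040302 );
   return _mm_shuffle_epi8( x, ctl );
}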
@@ -108,11 +108,11 @@ do { \
    uint8_t s0 = sigma0; \
    uint8_t s1 = sigma1; \
    a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s0 ] ); \
-   d = mm128_ror_32( _mm_xor_si128( d, a ), 16 ); \
+   d = mm128_swap32_16( _mm_xor_si128( d, a ) ); \
    c = _mm_add_epi32( c, d ); \
    b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \
    a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s1 ] ); \
-   d = mm128_ror_32( _mm_xor_si128( d, a ), 8 ); \
+   d = mm128_shuflr32_8( _mm_xor_si128( d, a ) ); \
    c = _mm_add_epi32( c, d ); \
    b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \
 } while(0)
@@ -320,11 +320,11 @@ do { \
    uint8_t s0 = sigma0; \
    uint8_t s1 = sigma1; \
    a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m[ s0 ] ); \
-   d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \
+   d = mm256_swap32_16( _mm256_xor_si256( d, a ) ); \
    c = _mm256_add_epi32( c, d ); \
    b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
    a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m[ s1 ] ); \
-   d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \
+   d = mm256_shuflr32_8( _mm256_xor_si256( d, a ) ); \
    c = _mm256_add_epi32( c, d ); \
    b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
 } while(0)

@@ -314,10 +314,11 @@ static const sph_u64 CB[16] = {

 // Blake-512 8 way AVX512

-#define GB_8WAY(m0, m1, c0, c1, a, b, c, d) do { \
+#define GB_8WAY( m0, m1, c0, c1, a, b, c, d ) \
+{ \
    a = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( \
                 _mm512_set1_epi64( c1 ), m0 ), b ), a ); \
-   d = mm512_ror_64( _mm512_xor_si512( d, a ), 32 ); \
+   d = mm512_swap64_32( _mm512_xor_si512( d, a ) ); \
    c = _mm512_add_epi64( c, d ); \
    b = mm512_ror_64( _mm512_xor_si512( b, c ), 25 ); \
    a = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( \
@@ -325,9 +326,10 @@ static const sph_u64 CB[16] = {
    d = mm512_ror_64( _mm512_xor_si512( d, a ), 16 ); \
    c = _mm512_add_epi64( c, d ); \
    b = mm512_ror_64( _mm512_xor_si512( b, c ), 11 ); \
-} while (0)
+}

-#define ROUND_B_8WAY(r) do { \
+#define ROUND_B_8WAY( r ) \
+{ \
    GB_8WAY(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \
    GB_8WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \
    GB_8WAY(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \
@@ -336,13 +338,13 @@ static const sph_u64 CB[16] = {
    GB_8WAY(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \
    GB_8WAY(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \
    GB_8WAY(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
-} while (0)
+}

 #define DECL_STATE64_8WAY \
    __m512i H0, H1, H2, H3, H4, H5, H6, H7; \
    uint64_t T0, T1;

-#define COMPRESS64_8WAY( buf ) do \
+#define COMPRESS64_8WAY( buf ) \
 { \
    __m512i M0, M1, M2, M3, M4, M5, M6, M7; \
    __m512i M8, M9, MA, MB, MC, MD, ME, MF; \
@@ -409,7 +411,7 @@ static const sph_u64 CB[16] = {
    H5 = mm512_xor3( VD, V5, H5 ); \
    H6 = mm512_xor3( VE, V6, H6 ); \
    H7 = mm512_xor3( VF, V7, H7 ); \
-} while (0)
+}

 void blake512_8way_compress( blake_8way_big_context *sc )
 {
@@ -610,7 +612,7 @@ void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,

    V0 = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512(
                _mm512_set1_epi64( CB9 ), sc->buf[ 8] ), V5 ), V0 );
-   VF = mm512_ror_64( _mm512_xor_si512( VF, V0 ), 32 );
+   VF = mm512_swap64_32( _mm512_xor_si512( VF, V0 ) );
    VA = _mm512_add_epi64( VA, VF );
    V5 = mm512_ror_64( _mm512_xor_si512( V5, VA ), 25 );
    V0 = _mm512_add_epi64( V0, V5 );
@@ -714,7 +716,7 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
 // V1 = _mm512_add_epi64( V1, _mm512_xor_si512( _mm512_set1_epi64( c1 ), m0 );

    V1 = _mm512_add_epi64( V1, V5 );
-   VD = mm512_ror_64( _mm512_xor_si512( VD, V1 ), 32 );
+   VD = mm512_swap64_32( _mm512_xor_si512( VD, V1 ) );
    V9 = _mm512_add_epi64( V9, VD );
    V5 = mm512_ror_64( _mm512_xor_si512( V5, V9 ), 25 );
    V1 = _mm512_add_epi64( V1, _mm512_add_epi64( _mm512_xor_si512(
@@ -728,7 +730,7 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
 // V2 = _mm512_add_epi64( V2, V6 );
    V2 = _mm512_add_epi64( V2, _mm512_xor_si512(
                _mm512_set1_epi64( CBF ), M9 ) );
-   VE = mm512_ror_64( _mm512_xor_si512( VE, V2 ), 32 );
+   VE = mm512_swap64_32( _mm512_xor_si512( VE, V2 ) );
    VA = _mm512_add_epi64( VA, VE );
    V6 = mm512_ror_64( _mm512_xor_si512( V6, VA ), 25 );
    V2 = _mm512_add_epi64( V2, _mm512_add_epi64( _mm512_xor_si512(
@@ -742,7 +744,7 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
 // V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512(
 //                _mm512_set1_epi64( CBx(1, 7) ), Mx(1, 6) ), V7 ) );

-   VF = mm512_ror_64( _mm512_xor_si512( VF, V3 ), 32 );
+   VF = mm512_swap64_32( _mm512_xor_si512( VF, V3 ) );
    VB = _mm512_add_epi64( VB, VF );
    V7 = mm512_ror_64( _mm512_xor_si512( V7, VB ), 25 );
    V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512(
@@ -1054,20 +1056,22 @@ blake512_8way_close(void *cc, void *dst)

 // Blake-512 4 way

-#define GB_4WAY(m0, m1, c0, c1, a, b, c, d) do { \
+#define GB_4WAY(m0, m1, c0, c1, a, b, c, d) \
+{ \
    a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
                 _mm256_set1_epi64x( c1 ), m0 ), b ), a ); \
-   d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
+   d = mm256_swap64_32( _mm256_xor_si256( d, a ) ); \
    c = _mm256_add_epi64( c, d ); \
    b = mm256_ror_64( _mm256_xor_si256( b, c ), 25 ); \
    a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
                 _mm256_set1_epi64x( c0 ), m1 ), b ), a ); \
-   d = mm256_ror_64( _mm256_xor_si256( d, a ), 16 ); \
+   d = mm256_shuflr64_16( _mm256_xor_si256( d, a ) ); \
    c = _mm256_add_epi64( c, d ); \
    b = mm256_ror_64( _mm256_xor_si256( b, c ), 11 ); \
-} while (0)
+}

-#define ROUND_B_4WAY(r) do { \
+#define ROUND_B_4WAY(r) \
+{ \
    GB_4WAY(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \
    GB_4WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \
    GB_4WAY(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \
@@ -1076,13 +1080,13 @@ blake512_8way_close(void *cc, void *dst)
    GB_4WAY(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \
    GB_4WAY(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \
    GB_4WAY(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
-} while (0)
+}

 #define DECL_STATE64_4WAY \
    __m256i H0, H1, H2, H3, H4, H5, H6, H7; \
    uint64_t T0, T1;

-#define COMPRESS64_4WAY do \
+#define COMPRESS64_4WAY \
 { \
    __m256i M0, M1, M2, M3, M4, M5, M6, M7; \
    __m256i M8, M9, MA, MB, MC, MD, ME, MF; \
@@ -1147,7 +1151,7 @@ blake512_8way_close(void *cc, void *dst)
    H5 = mm256_xor3( VD, V5, H5 ); \
    H6 = mm256_xor3( VE, V6, H6 ); \
    H7 = mm256_xor3( VF, V7, H7 ); \
-} while (0)
+}


 void blake512_4way_compress( blake_4way_big_context *sc )
@@ -1277,7 +1281,7 @@ void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
    // G4 skip nonce
    V0 = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256(
                _mm256_set1_epi64x( CB9 ), sc->buf[ 8] ), V5 ), V0 );
-   VF = mm256_ror_64( _mm256_xor_si256( VF, V0 ), 32 );
+   VF = mm256_swap64_32( _mm256_xor_si256( VF, V0 ) );
    VA = _mm256_add_epi64( VA, VF );
    V5 = mm256_ror_64( _mm256_xor_si256( V5, VA ), 25 );
    V0 = _mm256_add_epi64( V0, V5 );
@@ -1364,7 +1368,7 @@ void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
    // finish round 0, with the nonce now available
    V0 = _mm256_add_epi64( V0, _mm256_xor_si256(
                _mm256_set1_epi64x( CB8 ), M9 ) );
-   VF = mm256_ror_64( _mm256_xor_si256( VF, V0 ), 16 );
+   VF = mm256_shuflr64_16( _mm256_xor_si256( VF, V0 ) );
    VA = _mm256_add_epi64( VA, VF );
    V5 = mm256_ror_64( _mm256_xor_si256( V5, VA ), 11 );

@@ -1374,34 +1378,34 @@ void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,

    // G1
    V1 = _mm256_add_epi64( V1, V5 );
-   VD = mm256_ror_64( _mm256_xor_si256( VD, V1 ), 32 );
+   VD = mm256_swap64_32( _mm256_xor_si256( VD, V1 ) );
    V9 = _mm256_add_epi64( V9, VD );
    V5 = mm256_ror_64( _mm256_xor_si256( V5, V9 ), 25 );
    V1 = _mm256_add_epi64( V1, _mm256_add_epi64( _mm256_xor_si256(
                _mm256_set1_epi64x( CBx(1,2) ), Mx(1,3) ), V5 ) );
-   VD = mm256_ror_64( _mm256_xor_si256( VD, V1 ), 16 );
+   VD = mm256_shuflr64_16( _mm256_xor_si256( VD, V1 ) );
    V9 = _mm256_add_epi64( V9, VD );
    V5 = mm256_ror_64( _mm256_xor_si256( V5, V9 ), 11 );

    // G2
    V2 = _mm256_add_epi64( V2, _mm256_xor_si256(
                _mm256_set1_epi64x( CBF ), M9 ) );
-   VE = mm256_ror_64( _mm256_xor_si256( VE, V2 ), 32 );
+   VE = mm256_swap64_32( _mm256_xor_si256( VE, V2 ) );
    VA = _mm256_add_epi64( VA, VE );
    V6 = mm256_ror_64( _mm256_xor_si256( V6, VA ), 25 );
    V2 = _mm256_add_epi64( V2, _mm256_add_epi64( _mm256_xor_si256(
                _mm256_set1_epi64x( CB9 ), MF ), V6 ) );
-   VE = mm256_ror_64( _mm256_xor_si256( VE, V2 ), 16 );
+   VE = mm256_shuflr64_16( _mm256_xor_si256( VE, V2 ) );
    VA = _mm256_add_epi64( VA, VE );
    V6 = mm256_ror_64( _mm256_xor_si256( V6, VA ), 11 );

    // G3
-   VF = mm256_ror_64( _mm256_xor_si256( VF, V3 ), 32 );
+   VF = mm256_swap64_32( _mm256_xor_si256( VF, V3 ) );
    VB = _mm256_add_epi64( VB, VF );
    V7 = mm256_ror_64( _mm256_xor_si256( V7, VB ), 25 );
    V3 = _mm256_add_epi64( V3, _mm256_add_epi64( _mm256_xor_si256(
                _mm256_set1_epi64x( CBx(1, 6) ), Mx(1, 7) ), V7 ) );
-   VF = mm256_ror_64( _mm256_xor_si256( VF, V3 ), 16 );
+   VF = mm256_shuflr64_16( _mm256_xor_si256( VF, V3 ) );
    VB = _mm256_add_epi64( VB, VF );
    V7 = mm256_ror_64( _mm256_xor_si256( V7, VB ), 11 );

@@ -35,7 +35,6 @@
 #include "sph_blake2b.h"

 // Little-endian byte access.

 #define B2B_GET64(p) \
    (((uint64_t) ((uint8_t *) (p))[0]) ^ \
    (((uint64_t) ((uint8_t *) (p))[1]) << 8) ^ \
@@ -46,30 +45,34 @@
    (((uint64_t) ((uint8_t *) (p))[6]) << 48) ^ \
    (((uint64_t) ((uint8_t *) (p))[7]) << 56))

 // G Mixing function.

 #if defined(__AVX2__)

-#define BLAKE2B_G( R, Sa, Sb, Sc, Sd, Na, Nb ) \
+#define BLAKE2B_G( Sa, Sb, Sc, Sd, Se, Sf, Sg, Sh ) \
 { \
    V[0] = _mm256_add_epi64( V[0], _mm256_add_epi64( V[1], \
-               _mm256_set_epi64x( m[ sigma[R][Sd] ], m[ sigma[R][Sc] ], \
-                                  m[ sigma[R][Sb] ], m[ sigma[R][Sa] ] ) ) ); \
-   V[3] = mm256_ror_64( _mm256_xor_si256( V[3], V[0] ), Na ); \
+               _mm256_set_epi64x( m[ sigmaR[ Sg ] ], m[ sigmaR[ Se ] ], \
+                                  m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
+   V[3] = mm256_swap64_32( _mm256_xor_si256( V[3], V[0] ) ); \
    V[2] = _mm256_add_epi64( V[2], V[3] ); \
-   V[1] = mm256_ror_64( _mm256_xor_si256( V[1], V[2] ), Nb ); \
+   V[1] = mm256_shuflr64_24( _mm256_xor_si256( V[1], V[2] ) ); \
+\
+   V[0] = _mm256_add_epi64( V[0], _mm256_add_epi64( V[1], \
+               _mm256_set_epi64x( m[ sigmaR[ Sh ] ], m[ sigmaR[ Sf ] ], \
+                                  m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
+   V[3] = mm256_shuflr64_16( _mm256_xor_si256( V[3], V[0] ) ); \
+   V[2] = _mm256_add_epi64( V[2], V[3] ); \
+   V[1] = mm256_ror_64( _mm256_xor_si256( V[1], V[2] ), 63 ); \
 }

 #define BLAKE2B_ROUND( R ) \
 { \
    __m256i *V = (__m256i*)v; \
-   BLAKE2B_G( R, 0, 2, 4, 6, 32, 24 ); \
-   BLAKE2B_G( R, 1, 3, 5, 7, 16, 63 ); \
+   const uint8_t *sigmaR = sigma[R]; \
+   BLAKE2B_G( 0, 1, 2, 3, 4, 5, 6, 7 ); \
    V[3] = mm256_shufll_64( V[3] ); \
    V[2] = mm256_swap_128( V[2] ); \
    V[1] = mm256_shuflr_64( V[1] ); \
-   BLAKE2B_G( R, 8, 10, 12, 14, 32, 24 ); \
-   BLAKE2B_G( R, 9, 11, 13, 15, 16, 63 ); \
+   BLAKE2B_G( 8, 9, 10, 11, 12, 13, 14, 15 ); \
    V[3] = mm256_shuflr_64( V[3] ); \
    V[2] = mm256_swap_128( V[2] ); \
    V[1] = mm256_shufll_64( V[1] ); \
@@ -77,31 +80,34 @@

 #elif defined(__SSSE3__)

-#define BLAKE2B_G( R, Va, Vb, Vc, Vd, Sa, Sb, Na, Nb ) \
+#define BLAKE2B_G( Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \
 { \
    Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
-               _mm_set_epi64x( m[ sigma[R][Sb] ], m[ sigma[R][Sa] ] ) ) ); \
-   Vd = mm128_ror_64( _mm_xor_si128( Vd, Va ), Na ); \
+               _mm_set_epi64x( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
+   Vd = mm128_swap64_32( _mm_xor_si128( Vd, Va ) ); \
    Vc = _mm_add_epi64( Vc, Vd ); \
-   Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), Nb ); \
+   Vb = mm128_shuflr64_24( _mm_xor_si128( Vb, Vc ) ); \
+\
+   Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
+               _mm_set_epi64x( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
+   Vd = mm128_shuflr64_16( _mm_xor_si128( Vd, Va ) ); \
+   Vc = _mm_add_epi64( Vc, Vd ); \
+   Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), 63 ); \
 }

 #define BLAKE2B_ROUND( R ) \
 { \
    __m128i *V = (__m128i*)v; \
    __m128i V2, V3, V6, V7; \
-   BLAKE2B_G( R, V[0], V[2], V[4], V[6], 0, 2, 32, 24 ); \
-   BLAKE2B_G( R, V[0], V[2], V[4], V[6], 1, 3, 16, 63 ); \
-   BLAKE2B_G( R, V[1], V[3], V[5], V[7], 4, 6, 32, 24 ); \
-   BLAKE2B_G( R, V[1], V[3], V[5], V[7], 5, 7, 16, 63 ); \
+   const uint8_t *sigmaR = sigma[R]; \
+   BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
+   BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
    V2 = mm128_shufl2r_64( V[2], V[3] ); \
    V3 = mm128_shufl2r_64( V[3], V[2] ); \
    V6 = mm128_shufl2l_64( V[6], V[7] ); \
    V7 = mm128_shufl2l_64( V[7], V[6] ); \
-   BLAKE2B_G( R, V[0], V2, V[5], V6, 8, 10, 32, 24 ); \
-   BLAKE2B_G( R, V[0], V2, V[5], V6, 9, 11, 16, 63 ); \
-   BLAKE2B_G( R, V[1], V3, V[4], V7, 12, 14, 32, 24 ); \
-   BLAKE2B_G( R, V[1], V3, V[4], V7, 13, 15, 16, 63 ); \
+   BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \
+   BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \
    V[2] = mm128_shufl2l_64( V2, V3 ); \
    V[3] = mm128_shufl2l_64( V3, V2 ); \
    V[6] = mm128_shufl2r_64( V6, V7 ); \
@@ -120,6 +126,7 @@
    Vd = ROTR64( Vd ^ Va, 32 ); \
    Vc = Vc + Vd; \
    Vb = ROTR64( Vb ^ Vc, 24 ); \
+\
    Va = Va + Vb + m[ sigma[R][Sb] ]; \
    Vd = ROTR64( Vd ^ Va, 16 ); \
    Vc = Vc + Vd; \

@@ -1,382 +0,0 @@
-/*
- * HEFTY1 cryptographic hash function
- *
- * Copyright (c) 2014, dbcc14 <BM-NBx4AKznJuyem3dArgVY8MGyABpihRy5>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * The views and conclusions contained in the software and documentation are those
- * of the authors and should not be interpreted as representing official policies,
- * either expressed or implied, of the FreeBSD Project.
- */
-
-#include <assert.h>
-#include <string.h>
-
-#ifdef _MSC_VER
-#define inline __inline
-#endif
-
-#include "sph_hefty1.h"
-
-#define Min(A, B) (A <= B ? A : B)
-#define RoundFunc(ctx, A, B, C, D, E, F, G, H, W, K) \
-{ \
-   /* To thwart parallelism, Br modifies itself each time it's \
-    * called. This also means that calling it in different \
-    * orders yields different results. In C the order of \
-    * evaluation of function arguments and + operands are \
-    * unspecified (and depends on the compiler), so we must make \
-    * the order of Br calls explicit. \
-    */ \
-   uint32_t brG = Br(ctx, G); \
-   uint32_t tmp1 = Ch(E, Br(ctx, F), brG) + H + W + K; \
-   uint32_t tmp2 = tmp1 + Sigma1(Br(ctx, E)); \
-   uint32_t brC = Br(ctx, C); \
-   uint32_t brB = Br(ctx, B); \
-   uint32_t tmp3 = Ma(Br(ctx, A), brB, brC); \
-   uint32_t tmp4 = tmp3 + Sigma0(Br(ctx, A)); \
-   H = G; \
-   G = F; \
-   F = E; \
-   E = D + Br(ctx, tmp2); \
-   D = C; \
-   C = B; \
-   B = A; \
-   A = tmp2 + tmp4; \
-} \
-
-/* Nothing up my sleeve constants */
-const static uint32_t K[64] = {
-   0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL,
-   0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL,
-   0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL,
-   0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL,
-   0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL,
-   0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL,
-   0x983e5152UL, 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL,
-   0xc6e00bf3UL, 0xd5a79147UL, 0x06ca6351UL, 0x14292967UL,
-   0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, 0x53380d13UL,
-   0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL,
-   0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL,
-   0xd192e819UL, 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL,
-   0x19a4c116UL, 0x1e376c08UL, 0x2748774cUL, 0x34b0bcb5UL,
-   0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL, 0x682e6ff3UL,
-   0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL,
-   0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL
-};
-
-/* Initial hash values */
-const static uint32_t H[HEFTY1_STATE_WORDS] = {
-   0x6a09e667UL,
-   0xbb67ae85UL,
-   0x3c6ef372UL,
-   0xa54ff53aUL,
-   0x510e527fUL,
-   0x9b05688cUL,
-   0x1f83d9abUL,
-   0x5be0cd19UL
-};
-
-static inline uint32_t Rr(uint32_t X, uint8_t n)
-{
-   return (X >> n) | (X << (32 - n));
-}
-
-static inline uint32_t Ch(uint32_t E, uint32_t F, uint32_t G)
-{
-   return (E & F) ^ (~E & G);
-}
-
-static inline uint32_t Sigma1(uint32_t E)
-{
-   return Rr(E, 6) ^ Rr(E, 11) ^ Rr(E, 25);
-}
-
-static inline uint32_t sigma1(uint32_t X)
-{
-   return Rr(X, 17) ^ Rr(X, 19) ^ (X >> 10);
-}
-
-static inline uint32_t Ma(uint32_t A, uint32_t B, uint32_t C)
-{
-   return (A & B) ^ (A & C) ^ (B & C);
-}
-
-static inline uint32_t Sigma0(uint32_t A)
-{
-   return Rr(A, 2) ^ Rr(A, 13) ^ Rr(A, 22);
-}
-
-static inline uint32_t sigma0(uint32_t X)
-{
-   return Rr(X, 7) ^ Rr(X, 18) ^ (X >> 3);
-}
-
-static inline uint32_t Reverse32(uint32_t n)
-{
-#if BYTE_ORDER == LITTLE_ENDIAN
-   return n << 24 | (n & 0x0000ff00) << 8 | (n & 0x00ff0000) >> 8 | n >> 24;
-#else
-   return n;
-#endif
-}
-
-static inline uint64_t Reverse64(uint64_t n)
-{
-#if BYTE_ORDER == LITTLE_ENDIAN
-   uint32_t a = n >> 32;
-   uint32_t b = (n << 32) >> 32;
-
-   return (uint64_t)Reverse32(b) << 32 | Reverse32(a);
-#else
-   return n;
-#endif
-}
-
-/* Smoosh byte into nibble */
-static inline uint8_t Smoosh4(uint8_t X)
-{
-   return (X >> 4) ^ (X & 0xf);
-}
-
-/* Smoosh 32-bit word into 2-bits */
-static inline uint8_t Smoosh2(uint32_t X)
-{
-   uint16_t w = (X >> 16) ^ (X & 0xffff);
-   uint8_t n = Smoosh4((w >> 8) ^ (w & 0xff));
-   return (n >> 2) ^ (n & 0x3);
-}
-
-static void Mangle(uint32_t *S)
-{
-   uint32_t *R = S;
-   uint32_t *C = &S[1];
-
-   uint8_t r0 = Smoosh4(R[0] >> 24);
-   uint8_t r1 = Smoosh4(R[0] >> 16);
-   uint8_t r2 = Smoosh4(R[0] >> 8);
-   uint8_t r3 = Smoosh4(R[0] & 0xff);
-
-   int i;
-
-   /* Diffuse */
-   uint32_t tmp = 0;
-   for (i = 0; i < HEFTY1_SPONGE_WORDS - 1; i++) {
-      uint8_t r = Smoosh2(tmp);
-      switch (r) {
-      case 0:
-         C[i] ^= Rr(R[0], i + r0);
-         break;
-      case 1:
-         C[i] += Rr(~R[0], i + r1);
-         break;
-      case 2:
-         C[i] &= Rr(~R[0], i + r2);
-         break;
-      case 3:
-         C[i] ^= Rr(R[0], i + r3);
-         break;
-      }
-      tmp ^= C[i];
-   }
-
-   /* Compress */
-   tmp = 0;
-   for (i = 0; i < HEFTY1_SPONGE_WORDS - 1; i++)
-      if (i % 2)
-         tmp ^= C[i];
-      else
-         tmp += C[i];
-   R[0] ^= tmp;
-}
-
-static void Absorb(uint32_t *S, uint32_t X)
-{
-   uint32_t *R = S;
-   R[0] ^= X;
-   Mangle(S);
-}
-
-static uint32_t Squeeze(uint32_t *S)
-{
-   uint32_t Y = S[0];
-   Mangle(S);
-   return Y;
-}
-
-/* Branch, compress and serialize function */
-static inline uint32_t Br(HEFTY1_CTX *ctx, uint32_t X)
-{
-   uint32_t R = Squeeze(ctx->sponge);
-
-   uint8_t r0 = R >> 8;
-   uint8_t r1 = R & 0xff;
-
-   uint32_t Y = 1 << (r0 % 32);
-
-   switch (r1 % 4)
-   {
-   case 0:
-      /* Do nothing */
-      break;
-   case 1:
-      return X & ~Y;
-   case 2:
-      return X | Y;
-   case 3:
-      return X ^ Y;
-   }
-
-   return X;
-}
-
-static void HashBlock(HEFTY1_CTX *ctx)
-{
-   uint32_t A, B, C, D, E, F, G, H;
-   uint32_t W[HEFTY1_BLOCK_BYTES];
-
-   assert(ctx);
-
-   A = ctx->h[0];
-   B = ctx->h[1];
-   C = ctx->h[2];
-   D = ctx->h[3];
-   E = ctx->h[4];
-   F = ctx->h[5];
-   G = ctx->h[6];
-   H = ctx->h[7];
-
-   int t = 0;
-   for (; t < 16; t++) {
-      W[t] = Reverse32(((uint32_t *)&ctx->block[0])[t]); /* To host byte order */
-      Absorb(ctx->sponge, W[t] ^ K[t]);
-   }
-
-   for (t = 0; t < 16; t++) {
-      Absorb(ctx->sponge, D ^ H);
-      RoundFunc(ctx, A, B, C, D, E, F, G, H, W[t], K[t]);
-   }
-   for (t = 16; t < 64; t++) {
-      Absorb(ctx->sponge, H + D);
-      W[t] = sigma1(W[t - 2]) + W[t - 7] + sigma0(W[t - 15]) + W[t - 16];
-      RoundFunc(ctx, A, B, C, D, E, F, G, H, W[t], K[t]);
-   }
-
-   ctx->h[0] += A;
-   ctx->h[1] += B;
-   ctx->h[2] += C;
-   ctx->h[3] += D;
-   ctx->h[4] += E;
-   ctx->h[5] += F;
-   ctx->h[6] += G;
-   ctx->h[7] += H;
-
-   A = 0;
-   B = 0;
-   C = 0;
-   D = 0;
-   E = 0;
-   F = 0;
-   G = 0;
-   H = 0;
-
-   memset(W, 0, sizeof(W));
-}
-
-/* Public interface */
-
-void HEFTY1_Init(HEFTY1_CTX *ctx)
-{
-   assert(ctx);
-
-   memcpy(ctx->h, H, sizeof(ctx->h));
-   memset(ctx->block, 0, sizeof(ctx->block));
-   ctx->written = 0;
-   memset(ctx->sponge, 0, sizeof(ctx->sponge));
-}
-
-void HEFTY1_Update(HEFTY1_CTX *ctx, const void *buf, size_t len)
-{
-   assert(ctx);
-
-   uint64_t read = 0;
-   while (len) {
-      size_t end = (size_t)(ctx->written % HEFTY1_BLOCK_BYTES);
-      size_t count = Min(len, HEFTY1_BLOCK_BYTES - end);
-      memcpy(&ctx->block[end], &((unsigned char *)buf)[read], count);
-      len -= count;
-      read += count;
-      ctx->written += count;
-      if (!(ctx->written % HEFTY1_BLOCK_BYTES))
-         HashBlock(ctx);
-   }
-}
-
-void HEFTY1_Final(unsigned char *digest, HEFTY1_CTX *ctx)
-{
-   assert(digest);
-   assert(ctx);
-
-   /* Pad message (FIPS 180 Section 5.1.1) */
-   size_t used = (size_t)(ctx->written % HEFTY1_BLOCK_BYTES);
-   ctx->block[used++] = 0x80; /* Append 1 to end of message */
-   if (used > HEFTY1_BLOCK_BYTES - 8) {
-      /* We have already written into the last 64bits, so
-       * we must continue into the next block. */
-      memset(&ctx->block[used], 0, HEFTY1_BLOCK_BYTES - used);
-      HashBlock(ctx);
-      used = 0; /* Create a new block (below) */
-   }
-
-   /* All remaining bits to zero */
-   memset(&ctx->block[used], 0, HEFTY1_BLOCK_BYTES - 8 - used);
-
-   /* The last 64bits encode the length (in network byte order) */
-   uint64_t *len = (uint64_t *)&ctx->block[HEFTY1_BLOCK_BYTES - 8];
-   *len = Reverse64(ctx->written*8);
-
-   HashBlock(ctx);
-
-   /* Convert back to network byte order */
-   int i = 0;
-   for (; i < HEFTY1_STATE_WORDS; i++)
-      ctx->h[i] = Reverse32(ctx->h[i]);
-
-   memcpy(digest, ctx->h, sizeof(ctx->h));
-   memset(ctx, 0, sizeof(HEFTY1_CTX));
-}
-
-unsigned char* HEFTY1(const unsigned char *buf, size_t len, unsigned char *digest)
-{
-   HEFTY1_CTX ctx;
-   static unsigned char m[HEFTY1_DIGEST_BYTES];
-
-   if (!digest)
-      digest = m;
-
-   HEFTY1_Init(&ctx);
-   HEFTY1_Update(&ctx, buf, len);
-   HEFTY1_Final(digest, &ctx);
-
-   return digest;
-}
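For reference, the one-shot wrapper being deleted was self-contained: a caller needed only the header that follows. A usage sketch of the removed API (the input string here is hypothetical, shown only to document the interface):

#include <stdio.h>
#include "sph_hefty1.h"   /* the header removed below */

int main(void)
{
   const unsigned char msg[] = "abc";          /* hypothetical input */
   unsigned char digest[HEFTY1_DIGEST_BYTES];

   HEFTY1( msg, 3, digest );   /* Init + Update + Final in one call */

   for ( int i = 0; i < HEFTY1_DIGEST_BYTES; i++ )
      printf( "%02x", digest[i] );
   printf( "\n" );
   return 0;
}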
@@ -1,66 +0,0 @@
-/*
- * HEFTY1 cryptographic hash function
- *
- * Copyright (c) 2014, dbcc14 <BM-NBx4AKznJuyem3dArgVY8MGyABpihRy5>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * The views and conclusions contained in the software and documentation are those
- * of the authors and should not be interpreted as representing official policies,
- * either expressed or implied, of the FreeBSD Project.
- */
-
-#ifndef __HEFTY1_H__
-#define __HEFTY1_H__
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifndef WIN32
-#include <sys/types.h>
-#endif
-
-#include <inttypes.h>
-
-#define HEFTY1_DIGEST_BYTES 32
-#define HEFTY1_BLOCK_BYTES  64
-#define HEFTY1_STATE_WORDS  8
-#define HEFTY1_SPONGE_WORDS 4
-
-typedef struct HEFTY1_CTX {
-   uint32_t h[HEFTY1_STATE_WORDS];
-   uint8_t  block[HEFTY1_BLOCK_BYTES];
-   uint64_t written;
-   uint32_t sponge[HEFTY1_SPONGE_WORDS];
-} HEFTY1_CTX;
-
-void HEFTY1_Init(HEFTY1_CTX *cxt);
-void HEFTY1_Update(HEFTY1_CTX *cxt, const void *data, size_t len);
-void HEFTY1_Final(unsigned char *digest, HEFTY1_CTX *cxt);
-unsigned char* HEFTY1(const unsigned char *data, size_t len, unsigned char *digest);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* __HEFTY1_H__ */
@@ -97,11 +97,11 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
 // returns void, updates all args
 #define G_4X64(a,b,c,d) \
    a = _mm256_add_epi64( a, b ); \
-   d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
+   d = mm256_swap64_32( _mm256_xor_si256( d, a ) ); \
    c = _mm256_add_epi64( c, d ); \
-   b = mm256_ror_64( _mm256_xor_si256( b, c ), 24 ); \
+   b = mm256_shuflr64_24( _mm256_xor_si256( b, c ) ); \
    a = _mm256_add_epi64( a, b ); \
-   d = mm256_ror_64( _mm256_xor_si256( d, a ), 16 ); \
+   d = mm256_shuflr64_16( _mm256_xor_si256( d, a ) ); \
    c = _mm256_add_epi64( c, d ); \
    b = mm256_ror_64( _mm256_xor_si256( b, c ), 63 );

@@ -137,11 +137,11 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
 // returns void, all args updated
 #define G_2X64(a,b,c,d) \
    a = _mm_add_epi64( a, b ); \
-   d = mm128_ror_64( _mm_xor_si128( d, a), 32 ); \
+   d = mm128_swap64_32( _mm_xor_si128( d, a) ); \
    c = _mm_add_epi64( c, d ); \
-   b = mm128_ror_64( _mm_xor_si128( b, c ), 24 ); \
+   b = mm128_shuflr64_24( _mm_xor_si128( b, c ) ); \
    a = _mm_add_epi64( a, b ); \
-   d = mm128_ror_64( _mm_xor_si128( d, a ), 16 ); \
+   d = mm128_shuflr64_16( _mm_xor_si128( d, a ) ); \
    c = _mm_add_epi64( c, d ); \
    b = mm128_ror_64( _mm_xor_si128( b, c ), 63 );

File diff suppressed because it is too large.
@@ -1,186 +0,0 @@
-/* $Id: sph_radiogatun.h 226 2010-06-16 17:28:08Z tp $ */
-/**
- * RadioGatun interface.
- *
- * RadioGatun has been published in: G. Bertoni, J. Daemen, M. Peeters
- * and G. Van Assche, "RadioGatun, a belt-and-mill hash function",
- * presented at the Second Cryptographic Hash Workshop, Santa Barbara,
- * August 24-25, 2006. The main Web site, containing that article, the
- * reference code and some test vectors, appears to be currently located
- * at the following URL: http://radiogatun.noekeon.org/
- *
- * The presentation article does not specify endianness or padding. The
- * reference code uses the following conventions, which we also apply
- * here:
- * <ul>
- * <li>The input message is an integral number of sequences of three
- * words. Each word is either a 32-bit or 64-bit word (depending on
- * the version of RadioGatun).</li>
- * <li>Input bytes are decoded into words using little-endian
- * convention.</li>
- * <li>Padding consists of a single bit of value 1, using little-endian
- * convention within bytes (i.e. for a byte-oriented input, a single
- * byte of value 0x01 is appended), then enough bits of value 0 to finish
- * the current block.</li>
- * <li>Output consists of 256 bits. Successive output words are encoded
- * with little-endian convention.</li>
- * </ul>
- * These conventions are very close to those we use for PANAMA, which is
- * a close ancestor of RadioGatun.
- *
- * RadioGatun is actually a family of functions, depending on some
- * internal parameters. We implement here two functions, with a "belt
- * length" of 13, a "belt width" of 3, and a "mill length" of 19. The
- * RadioGatun[32] version uses 32-bit words, while the RadioGatun[64]
- * variant uses 64-bit words.
- *
- * Strictly speaking, the name "RadioGatun" should use an acute accent
- * on the "u", which we omitted here to keep strict ASCII-compatibility
- * of this file.
- *
- * ==========================(LICENSE BEGIN)============================
- *
- * Copyright (c) 2007-2010  Projet RNRT SAPHIR
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * ===========================(LICENSE END)=============================
- *
- * @file     sph_radiogatun.h
- * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
- */
-
-#ifndef SPH_RADIOGATUN_H__
-#define SPH_RADIOGATUN_H__
-
-#include <stddef.h>
-#include "algo/sha/sph_types.h"
-
-/**
- * Output size (in bits) for RadioGatun[32].
- */
-#define SPH_SIZE_radiogatun32   256
-
-/**
- * This structure is a context for RadioGatun[32] computations: it
- * contains intermediate values and some data from the last entered
- * block. Once a RadioGatun[32] computation has been performed, the
- * context can be reused for another computation.
- *
- * The contents of this structure are private. A running RadioGatun[32]
- * computation can be cloned by copying the context (e.g. with a
- * simple <code>memcpy()</code>).
- */
-typedef struct {
-#ifndef DOXYGEN_IGNORE
-   unsigned char data[156];   /* first field, for alignment */
-   unsigned data_ptr;
-   sph_u32 a[19], b[39];
-#endif
-} sph_radiogatun32_context;
-
-/**
- * Initialize a RadioGatun[32] context. This process performs no
- * memory allocation.
- *
- * @param cc   the RadioGatun[32] context (pointer to a
- *             <code>sph_radiogatun32_context</code>)
- */
-void sph_radiogatun32_init(void *cc);
-
-/**
- * Process some data bytes. It is acceptable that <code>len</code> is zero
- * (in which case this function does nothing).
- *
- * @param cc     the RadioGatun[32] context
- * @param data   the input data
- * @param len    the input data length (in bytes)
- */
-void sph_radiogatun32(void *cc, const void *data, size_t len);
-
-/**
- * Terminate the current RadioGatun[32] computation and output the
- * result into the provided buffer. The destination buffer must be wide
- * enough to accommodate the result (32 bytes). The context is
- * automatically reinitialized.
- *
- * @param cc    the RadioGatun[32] context
- * @param dst   the destination buffer
- */
-void sph_radiogatun32_close(void *cc, void *dst);
-
-#if SPH_64
-
-/**
- * Output size (in bits) for RadioGatun[64].
- */
-#define SPH_SIZE_radiogatun64   256
-
-/**
- * This structure is a context for RadioGatun[64] computations: it
- * contains intermediate values and some data from the last entered
- * block. Once a RadioGatun[64] computation has been performed, the
- * context can be reused for another computation.
- *
- * The contents of this structure are private. A running RadioGatun[64]
- * computation can be cloned by copying the context (e.g. with a
- * simple <code>memcpy()</code>).
- */
-typedef struct {
-#ifndef DOXYGEN_IGNORE
-   unsigned char data[312];   /* first field, for alignment */
-   unsigned data_ptr;
-   sph_u64 a[19], b[39];
-#endif
-} sph_radiogatun64_context;
-
-/**
- * Initialize a RadioGatun[64] context. This process performs no
- * memory allocation.
- *
- * @param cc   the RadioGatun[64] context (pointer to a
- *             <code>sph_radiogatun64_context</code>)
- */
-void sph_radiogatun64_init(void *cc);
-
-/**
- * Process some data bytes. It is acceptable that <code>len</code> is zero
- * (in which case this function does nothing).
- *
- * @param cc     the RadioGatun[64] context
- * @param data   the input data
- * @param len    the input data length (in bytes)
- */
-void sph_radiogatun64(void *cc, const void *data, size_t len);
-
-/**
- * Terminate the current RadioGatun[64] computation and output the
- * result into the provided buffer. The destination buffer must be wide
- * enough to accommodate the result (32 bytes). The context is
- * automatically reinitialized.
- *
- * @param cc    the RadioGatun[64] context
- * @param dst   the destination buffer
- */
-void sph_radiogatun64_close(void *cc, void *dst);
-
-#endif
-
-#endif
@@ -1,34 +0,0 @@
-#include "x20r-gate.h"
-
-void getAlgoString( const uint8_t* prevblock, char *output )
-{
-   char *sptr = output;
-
-   for ( int j = 0; j < X20R_HASH_FUNC_COUNT; j++ )
-   {
-      char b = (19 - j) >> 1; // 20 ascii hex chars, reversed
-      uint8_t algoDigit = (j & 1) ? prevblock[b] & 0xF : prevblock[b] >> 4;
-      if (algoDigit >= 10)
-         sprintf(sptr, "%c", 'A' + (algoDigit - 10));
-      else
-         sprintf(sptr, "%u", (uint32_t) algoDigit);
-      sptr++;
-   }
-   *sptr = '\0';
-}
-
-bool register_x20r_algo( algo_gate_t* gate )
-{
-#if defined (X20R_4WAY)
-   gate->scanhash = (void*)&scanhash_x20r_4way;
-   gate->hash     = (void*)&x20r_4way_hash;
-#else
-   gate->scanhash = (void*)&scanhash_x20r;
-   gate->hash     = (void*)&x20r_hash;
-#endif
-   gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
-   x20_r_s_getAlgoString = (void*)&x20r_getAlgoString;
-   opt_target_factor = 256.;
-   return true;
-}
@@ -1,58 +0,0 @@
-#ifndef X20R_GATE_H__
-#define X20R_GATE_H__ 1
-
-#include "algo-gate-api.h"
-#include <stdint.h>
-
-/*
-#if defined(__AVX2__) && defined(__AES__)
-  #define X20R_4WAY
-#endif
-*/
-
-enum x20r_Algo {
-   BLAKE = 0,
-   BMW,
-   GROESTL,
-   JH,
-   KECCAK,
-   SKEIN,
-   LUFFA,
-   CUBEHASH,
-   SHAVITE,
-   SIMD,
-   ECHO,
-   HAMSI,
-   FUGUE,
-   SHABAL,
-   WHIRLPOOL,
-   SHA_512,
-   HAVAL,        // 256-bit output
-   GOST,
-   RADIOGATUN,   // 256-bit output
-   PANAMA,       // 256-bit output
-   X20R_HASH_FUNC_COUNT
-};
-
-void (*x20_r_s_getAlgoString) ( const uint8_t*, char* );
-
-void x20r_getAlgoString( const uint8_t* prevblock, char *output );
-
-bool register_x20r_algo( algo_gate_t* gate );
-
-#if defined(X20R_4WAY)
-
-void x20r_4way_hash( void *state, const void *input );
-
-int scanhash_x20r_4way( struct work *work, uint32_t max_nonce,
-                        uint64_t *hashes_done, struct thr_info *mythr );
-
-#endif
-
-void x20r_hash( void *state, const void *input );
-
-int scanhash_x20r( struct work *work, uint32_t max_nonce,
-                   uint64_t *hashes_done, struct thr_info *mythr );
-
-#endif
algo/x20/x20r.c
@@ -1,252 +0,0 @@
#include "x20r-gate.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "algo/blake/sph_blake.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/sph-haval.h"
#include "algo/radiogatun/sph_radiogatun.h"
#include "algo/panama/sph_panama.h"
#include "algo/gost/sph_gost.h"
#include "algo/sha/sph_sha2.h"
#if defined(__AES__)
  #include "algo/echo/aes_ni/hash_api.h"
  #include "algo/groestl/aes_ni/hash-groestl.h"
#else
  #include "algo/groestl/sph_groestl.h"
  #include "algo/echo/sph_echo.h"
#endif
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"

static __thread uint32_t s_ntime = UINT32_MAX;
static __thread char hashOrder[X20R_HASH_FUNC_COUNT + 1] = { 0 };

union _x20r_context_overlay
{
   sph_blake512_context     blake;
   sph_bmw512_context       bmw;
#if defined(__AES__)
   hashState_groestl        groestl;
   hashState_echo           echo;
#else
   sph_groestl512_context   groestl;
   sph_echo512_context      echo;
#endif
   sph_skein512_context     skein;
   sph_jh512_context        jh;
   sph_keccak512_context    keccak;
   hashState_luffa          luffa;
   cubehashParam            cube;
   hashState_sd             simd;
   sph_shavite512_context   shavite;
   sph_hamsi512_context     hamsi;
   sph_fugue512_context     fugue;
   sph_shabal512_context    shabal;
   sph_whirlpool_context    whirlpool;
   sph_sha512_context       sha512;
   sph_haval256_5_context   haval;
   sph_gost512_context      gost;
   sph_radiogatun64_context radiogatun;
   sph_panama_context       panama;
};
typedef union _x20r_context_overlay x20r_context_overlay;

void x20r_hash(void* output, const void* input)
{
   uint32_t _ALIGN(128) hash[64/4];
   x20r_context_overlay ctx;
   void *in = (void*) input;
   int size = 80;

   if ( s_ntime == UINT32_MAX )
   {
      const uint8_t* in8 = (uint8_t*) input;
      x20_r_s_getAlgoString(&in8[4], hashOrder);
   }

   for (int i = 0; i < 20; i++)
   {
      const char elem = hashOrder[i];
      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';

      switch ( algo )
      {
        case BLAKE:
            sph_blake512_init(&ctx.blake);
            sph_blake512(&ctx.blake, in, size);
            sph_blake512_close(&ctx.blake, hash);
            break;
        case BMW:
            sph_bmw512_init(&ctx.bmw);
            sph_bmw512(&ctx.bmw, in, size);
            sph_bmw512_close(&ctx.bmw, hash);
            break;
        case GROESTL:
#if defined(__AES__)
            init_groestl( &ctx.groestl, 64 );
            update_and_final_groestl( &ctx.groestl, (char*)hash,
                                      (const char*)in, size<<3 );
#else
            sph_groestl512_init(&ctx.groestl);
            sph_groestl512(&ctx.groestl, in, size);
            sph_groestl512_close(&ctx.groestl, hash);
#endif
            break;
        case SKEIN:
            sph_skein512_init(&ctx.skein);
            sph_skein512(&ctx.skein, in, size);
            sph_skein512_close(&ctx.skein, hash);
            break;
        case JH:
            sph_jh512_init(&ctx.jh);
            sph_jh512(&ctx.jh, in, size);
            sph_jh512_close(&ctx.jh, hash);
            break;
        case KECCAK:
            sph_keccak512_init(&ctx.keccak);
            sph_keccak512(&ctx.keccak, in, size);
            sph_keccak512_close(&ctx.keccak, hash);
            break;
        case LUFFA:
            init_luffa( &ctx.luffa, 512 );
            update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
                                    (const BitSequence*)in, size );
            break;
        case CUBEHASH:
            cubehashInit( &ctx.cube, 512, 16, 32 );
            cubehashUpdateDigest( &ctx.cube, (byte*) hash,
                                  (const byte*)in, size );
            break;
        case SHAVITE:
            sph_shavite512_init(&ctx.shavite);
            sph_shavite512(&ctx.shavite, in, size);
            sph_shavite512_close(&ctx.shavite, hash);
            break;
        case SIMD:
            init_sd( &ctx.simd, 512 );
            update_final_sd( &ctx.simd, (BitSequence *)hash,
                             (const BitSequence *)in, size<<3 );
            break;
        case ECHO:
#if defined(__AES__)
            init_echo( &ctx.echo, 512 );
            update_final_echo ( &ctx.echo, (BitSequence *)hash,
                                (const BitSequence *)in, size<<3 );
#else
            sph_echo512_init(&ctx.echo);
            sph_echo512(&ctx.echo, in, size);
            sph_echo512_close(&ctx.echo, hash);
#endif
            break;
        case HAMSI:
            sph_hamsi512_init(&ctx.hamsi);
            sph_hamsi512(&ctx.hamsi, in, size);
            sph_hamsi512_close(&ctx.hamsi, hash);
            break;
        case FUGUE:
            sph_fugue512_init(&ctx.fugue);
            sph_fugue512(&ctx.fugue, in, size);
            sph_fugue512_close(&ctx.fugue, hash);
            break;
        case SHABAL:
            sph_shabal512_init(&ctx.shabal);
            sph_shabal512(&ctx.shabal, in, size);
            sph_shabal512_close(&ctx.shabal, hash);
            break;
        case WHIRLPOOL:
            sph_whirlpool_init(&ctx.whirlpool);
            sph_whirlpool(&ctx.whirlpool, in, size);
            sph_whirlpool_close(&ctx.whirlpool, hash);
            break;
        case SHA_512:
            sph_sha512_init( &ctx.sha512 );
            sph_sha512( &ctx.sha512, in, size );
            sph_sha512_close( &ctx.sha512, hash );
            break;
        case HAVAL:
            sph_haval256_5_init(&ctx.haval);
            sph_haval256_5(&ctx.haval, in, size);
            sph_haval256_5_close(&ctx.haval, hash);
            // 256-bit output: zero the upper half of the 512-bit chain value.
            memset(&hash[8], 0, 32);
            break;
        case GOST:
            sph_gost512_init(&ctx.gost);
            sph_gost512(&ctx.gost, in, size);
            sph_gost512_close(&ctx.gost, hash);
            break;
        case RADIOGATUN:
            sph_radiogatun64_init(&ctx.radiogatun);
            sph_radiogatun64(&ctx.radiogatun, in, size);
            sph_radiogatun64_close(&ctx.radiogatun, hash);
            memset(&hash[8], 0, 32);
            break;
        case PANAMA:
            sph_panama_init(&ctx.panama);
            sph_panama(&ctx.panama, in, size);
            sph_panama_close(&ctx.panama, hash);
            memset(&hash[8], 0, 32);
            break;
      }
      in = (void*) hash;
      size = 64;
   }
   memcpy(output, hash, 32);
}

int scanhash_x20r( struct work *work, uint32_t max_nonce,
                   uint64_t *hashes_done, struct thr_info *mythr )
{
   uint32_t _ALIGN(128) hash32[8];
   uint32_t _ALIGN(128) endiandata[20];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t nonce = first_nonce;
   int thr_id = mythr->id;
   volatile uint8_t *restart = &(work_restart[thr_id].restart);

   for (int k=0; k < 19; k++)
      be32enc( &endiandata[k], pdata[k] );

   if ( s_ntime != pdata[17] )
   {
      uint32_t ntime = swab32(pdata[17]);
      x20_r_s_getAlgoString( (const uint8_t*) (&endiandata[1]), hashOrder );
      s_ntime = ntime;
      if (opt_debug && !thr_id)
         applog(LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime);
   }

   if ( opt_benchmark )
      ptarget[7] = 0x0cff;

   do {
      be32enc( &endiandata[19], nonce );
      x20r_hash( hash32, endiandata );

      if ( hash32[7] <= Htarg && fulltest( hash32, ptarget ) )
      {
         pdata[19] = nonce;
         submit_solution( work, hash32, mythr );
      }
      nonce++;

   } while (nonce < max_nonce && !(*restart));

   pdata[19] = nonce;
   *hashes_done = pdata[19] - first_nonce + 1;
   return 0;
}
@@ -1,5 +0,0 @@
#ifdef __SSE2__
#include "yescrypt-simd.c"
#else
#include "yescrypt-opt.c"
#endif
@@ -1,213 +0,0 @@
/*-
 * Copyright 2013,2014 Alexander Peslyak
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#ifdef MAP_ANON
#include <sys/mman.h>
#endif

#include "yescrypt.h"

#define HUGEPAGE_THRESHOLD  (12 * 1024 * 1024)

#ifdef __x86_64__
#define HUGEPAGE_SIZE       (2 * 1024 * 1024)
#else
#undef HUGEPAGE_SIZE
#endif

/*
static __inline uint32_t
le32dec(const void *pp)
{
    const uint8_t *p = (uint8_t const *)pp;

    return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) +
        ((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24));
}

static __inline void
le32enc(void *pp, uint32_t x)
{
    uint8_t * p = (uint8_t *)pp;

    p[0] = x & 0xff;
    p[1] = (x >> 8) & 0xff;
    p[2] = (x >> 16) & 0xff;
    p[3] = (x >> 24) & 0xff;
}
*/

static void *
alloc_region(yescrypt_region_t * region, size_t size)
{
    size_t base_size = size;
    uint8_t * base, * aligned;
#ifdef MAP_ANON
    int flags =
#ifdef MAP_NOCORE
        MAP_NOCORE |
#endif
        MAP_ANON | MAP_PRIVATE;
#if defined(MAP_HUGETLB) && defined(HUGEPAGE_SIZE)
    size_t new_size = size;
    const size_t hugepage_mask = (size_t)HUGEPAGE_SIZE - 1;
    if (size >= HUGEPAGE_THRESHOLD && size + hugepage_mask >= size) {
        flags |= MAP_HUGETLB;
        /*
         * Linux's munmap() fails on MAP_HUGETLB mappings if size is not a
         * multiple of huge page size, so let's round up to huge page size here.
         */
        new_size = size + hugepage_mask;
        new_size &= ~hugepage_mask;
    }
    base = mmap(NULL, new_size, PROT_READ | PROT_WRITE, flags, -1, 0);
    if (base != MAP_FAILED) {
        base_size = new_size;
    } else
    if (flags & MAP_HUGETLB) {
        flags &= ~MAP_HUGETLB;
        base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0);
    }

#else
    base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0);
#endif
    if (base == MAP_FAILED)
        base = NULL;
    aligned = base;
#elif defined(HAVE_POSIX_MEMALIGN)
    if ((errno = posix_memalign((void **)&base, 64, size)) != 0)
        base = NULL;
    aligned = base;
#else
    base = aligned = NULL;
    if (size + 63 < size) {
        errno = ENOMEM;
    } else if ((base = malloc(size + 63)) != NULL) {
        aligned = base + 63;
        aligned -= (uintptr_t)aligned & 63;
    }
#endif
    region->base = base;
    region->aligned = aligned;
    region->base_size = base ? base_size : 0;
    region->aligned_size = base ? size : 0;
    return aligned;
}
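
The huge-page rounding in alloc_region() is the usual power-of-two trick: adding (page size - 1) and masking off the low bits rounds any size up to the next multiple. A quick standalone check of the arithmetic (the sizes are illustrative):

#include <stdio.h>
#include <stddef.h>

int main(void)
{
    const size_t hugepage_size = 2 * 1024 * 1024;       // 2 MiB, as on x86_64
    const size_t hugepage_mask = hugepage_size - 1;
    size_t size = 12 * 1024 * 1024 + 1;                 // just past the 12 MiB threshold

    // Round up to the next multiple of the huge page size.
    size_t new_size = (size + hugepage_mask) & ~hugepage_mask;
    printf("%zu -> %zu\n", size, new_size);             // 12582913 -> 14680064 (7 huge pages)
    return 0;
}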

static __inline void
init_region(yescrypt_region_t * region)
{
    region->base = region->aligned = NULL;
    region->base_size = region->aligned_size = 0;
}

static int
free_region(yescrypt_region_t * region)
{
    if (region->base) {
#ifdef MAP_ANON
        if (munmap(region->base, region->base_size))
            return -1;
#else
        free(region->base);
#endif
    }
    init_region(region);
    return 0;
}

int yescrypt_init_shared(yescrypt_shared_t * shared, const uint8_t * param, size_t paramlen,
    uint64_t N, uint32_t r, uint32_t p, yescrypt_init_shared_flags_t flags, uint32_t mask,
    uint8_t * buf, size_t buflen)
{
    yescrypt_shared1_t* shared1 = &shared->shared1;
    yescrypt_shared_t dummy, half1, half2;
    uint8_t salt[32];

    if (flags & YESCRYPT_SHARED_PREALLOCATED) {
        if (!shared1->aligned || !shared1->aligned_size)
            return -1;
    } else {
        init_region(shared1);
    }
    shared->mask1 = 1;
    if (!param && !paramlen && !N && !r && !p && !buf && !buflen)
        return 0;

    init_region(&dummy.shared1);
    dummy.mask1 = 1;
    if (yescrypt_kdf(&dummy, shared1,
        param, paramlen, NULL, 0, N, r, p, 0,
        YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_1,
        salt, sizeof(salt), 0 ) )
        goto out;

    half1 = half2 = *shared;
    half1.shared1.aligned_size /= 2;
    half2.shared1.aligned = (void*) ((size_t)half2.shared1.aligned + half1.shared1.aligned_size);
    half2.shared1.aligned_size = half1.shared1.aligned_size;
    N /= 2;

    if (p > 1 && yescrypt_kdf(&half1, &half2.shared1,
        param, paramlen, salt, sizeof(salt), N, r, p, 0,
        YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_2,
        salt, sizeof(salt), 0 ))
        goto out;

    if (yescrypt_kdf(&half2, &half1.shared1,
        param, paramlen, salt, sizeof(salt), N, r, p, 0,
        YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_1,
        salt, sizeof(salt), 0))
        goto out;

    if (yescrypt_kdf(&half1, &half2.shared1,
        param, paramlen, salt, sizeof(salt), N, r, p, 0,
        YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_1,
        buf, buflen, 0))
        goto out;

    shared->mask1 = mask;

    return 0;

out:
    if (!(flags & YESCRYPT_SHARED_PREALLOCATED))
        free_region(shared1);
    return -1;
}

int
yescrypt_free_shared(yescrypt_shared_t * shared)
{
    return free_region(&shared->shared1);
}

int
yescrypt_init_local(yescrypt_local_t * local)
{
    init_region(local);
    return 0;
}

int
yescrypt_free_local(yescrypt_local_t * local)
{
    return free_region(local);
}
File diff suppressed because it is too large
@@ -1,488 +0,0 @@
/*-
 * Copyright 2013,2014 Alexander Peslyak
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#include "compat.h"

#include "yescrypt.h"
#include "algo/sha/hmac-sha256-hash.h"
#include "algo-gate-api.h"

#define BYTES2CHARS(bytes) \
    ((((bytes) * 8) + 5) / 6)

#define HASH_SIZE   32 /* bytes */
#define HASH_LEN    BYTES2CHARS(HASH_SIZE) /* base-64 chars */
#define YESCRYPT_FLAGS (YESCRYPT_RW | YESCRYPT_PWXFORM)

static const char * const itoa64 =
    "./0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

static uint8_t* encode64_uint32(uint8_t* dst, size_t dstlen, uint32_t src, uint32_t srcbits)
{
    uint32_t bit;

    for (bit = 0; bit < srcbits; bit += 6) {
        if (dstlen < 1)
            return NULL;
        *dst++ = itoa64[src & 0x3f];
        dstlen--;
        src >>= 6;
    }

    return dst;
}

static uint8_t* encode64(uint8_t* dst, size_t dstlen, const uint8_t* src, size_t srclen)
{
    size_t i;

    for (i = 0; i < srclen; ) {
        uint8_t * dnext;
        uint32_t value = 0, bits = 0;
        do {
            value |= (uint32_t)src[i++] << bits;
            bits += 8;
        } while (bits < 24 && i < srclen);
        dnext = encode64_uint32(dst, dstlen, value, bits);
        if (!dnext)
            return NULL;
        dstlen -= dnext - dst;
        dst = dnext;
    }

    return dst;
}

static int decode64_one(uint32_t* dst, uint8_t src)
{
    const char * ptr = strchr(itoa64, src);
    if (ptr) {
        *dst = (uint32_t) (ptr - itoa64);
        return 0;
    }
    *dst = 0;
    return -1;
}

static const uint8_t* decode64_uint32(uint32_t* dst, uint32_t dstbits, const uint8_t* src)
{
    uint32_t bit;
    uint32_t value;

    value = 0;
    for (bit = 0; bit < dstbits; bit += 6) {
        uint32_t one;
        if (decode64_one(&one, *src)) {
            *dst = 0;
            return NULL;
        }
        src++;
        value |= one << bit;
    }

    *dst = value;
    return src;
}
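The crypt-style base-64 above is little-endian: six bits per character over the itoa64 alphabet, least significant group first, and decode64_uint32() is the exact inverse of encode64_uint32() for the same bit width. A minimal round-trip sketch under that assumption (the value 8 is just a sample r parameter):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

static const char * const itoa64 =
    "./0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

int main(void)
{
    uint32_t r = 8;                 // a typical yescrypt r parameter
    char buf[6] = {0};              // 30 bits -> 5 chars, plus NUL

    // Encode 30 bits, 6 at a time, least significant group first.
    uint32_t v = r;
    for (int bit = 0; bit < 30; bit += 6) {
        buf[bit / 6] = itoa64[v & 0x3f];
        v >>= 6;
    }
    printf("encoded: %s\n", buf);   // "6...." (itoa64[8] is '6', zero groups are '.')

    // Decode it back.
    uint32_t out = 0;
    for (int bit = 0; bit < 30; bit += 6)
        out |= (uint32_t)(strchr(itoa64, buf[bit / 6]) - itoa64) << bit;
    printf("decoded: %u\n", out);   // 8
    return 0;
}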
uint8_t* yescrypt_r(const yescrypt_shared_t* shared, yescrypt_local_t* local,
    const uint8_t* passwd, size_t passwdlen, const uint8_t* setting,
    uint8_t* buf, size_t buflen, int thrid )
{
    uint8_t hash[HASH_SIZE];
    const uint8_t * src, * salt;
    uint8_t * dst;
    size_t prefixlen, saltlen, need;
    uint8_t version;
    uint64_t N;
    uint32_t r, p;
    yescrypt_flags_t flags = YESCRYPT_WORM;

    printf("pass1 ...");
    fflush(stdout);

    if (setting[0] != '$' || setting[1] != '7') {
        printf("died$7 ...");
        fflush(stdout);
        return NULL;
    }

    printf("died80 ...");
    fflush(stdout);

    src = setting + 2;

    printf("hello '%p'\n", (char *)src);
    fflush(stdout);

    switch ((version = *src)) {
    case '$':
        printf("died2 ...");
        fflush(stdout);
        break;
    case 'X':
        src++;
        flags = YESCRYPT_RW;
        printf("died3 ...");
        fflush(stdout);
        break;
    default:
        printf("died4 ...");
        fflush(stdout);
        return NULL;
    }

    printf("pass2 ...");
    fflush(stdout);

    if (*src != '$') {
        uint32_t decoded_flags;
        if (decode64_one(&decoded_flags, *src)) {
            printf("died5 ...");
            fflush(stdout);
            return NULL;
        }
        flags = decoded_flags;
        if (*++src != '$') {
            printf("died6 ...");
            fflush(stdout);
            return NULL;
        }
    }

    src++;

    {
        uint32_t N_log2;
        if (decode64_one(&N_log2, *src)) {
            printf("died7 ...");
            return NULL;
        }
        src++;
        N = (uint64_t)1 << N_log2;
    }

    src = decode64_uint32(&r, 30, src);
    if (!src) {
        printf("died6 ...");
        return NULL;
    }

    src = decode64_uint32(&p, 30, src);
    if (!src) {
        printf("died7 ...");
        return NULL;
    }

    prefixlen = src - setting;

    salt = src;
    src = (uint8_t *)strrchr((char *)salt, '$');
    if (src)
        saltlen = src - salt;
    else
        saltlen = strlen((char *)salt);

    need = prefixlen + saltlen + 1 + HASH_LEN + 1;
    if (need > buflen || need < saltlen) {
        printf("'%d %d %d'", (int) need, (int) buflen, (int) saltlen);
        printf("died8killbuf ...");
        fflush(stdout);
        return NULL;
    }

    if ( yescrypt_kdf( shared, local, passwd, passwdlen, salt, saltlen, N, r, p,
                       0, flags, hash, sizeof(hash), thrid ) == -1 )
    {
        printf("died10 ...");
        fflush(stdout);
        return NULL;
    }

    dst = buf;
    memcpy(dst, setting, prefixlen + saltlen);
    dst += prefixlen + saltlen;
    *dst++ = '$';

    dst = encode64(dst, buflen - (dst - buf), hash, sizeof(hash));
    /* Could zeroize hash[] here, but yescrypt_kdf() doesn't zeroize its
     * memory allocations yet anyway. */
    if (!dst || dst >= buf + buflen) { /* Can't happen */
        printf("died11 ...");
        return NULL;
    }

    *dst = 0; /* NUL termination */

    printf("died12 ...");
    fflush(stdout);

    return buf;
}

uint8_t* yescrypt(const uint8_t* passwd, const uint8_t* setting, int thrid )
{
    static uint8_t buf[4 + 1 + 5 + 5 + BYTES2CHARS(32) + 1 + HASH_LEN + 1];
    yescrypt_shared_t shared;
    yescrypt_local_t local;
    uint8_t * retval;

    if (yescrypt_init_shared(&shared, NULL, 0,
        0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0))
        return NULL;
    if (yescrypt_init_local(&local)) {
        yescrypt_free_shared(&shared);
        return NULL;
    }
    retval = yescrypt_r(&shared, &local,
        passwd, 80, setting, buf, sizeof(buf), thrid );
    //printf("hashse='%s'\n", (char *)retval);
    if (yescrypt_free_local(&local)) {
        yescrypt_free_shared(&shared);
        return NULL;
    }
    if (yescrypt_free_shared(&shared))
        return NULL;
    return retval;
}

uint8_t* yescrypt_gensalt_r(uint32_t N_log2, uint32_t r, uint32_t p, yescrypt_flags_t flags,
    const uint8_t* src, size_t srclen, uint8_t* buf, size_t buflen)
{
    uint8_t * dst;
    size_t prefixlen = 3 + 1 + 5 + 5;
    size_t saltlen = BYTES2CHARS(srclen);
    size_t need;

    if (p == 1)
        flags &= ~YESCRYPT_PARALLEL_SMIX;

    if (flags) {
        if (flags & ~0x3f)
            return NULL;

        prefixlen++;
        if (flags != YESCRYPT_RW)
            prefixlen++;
    }

    need = prefixlen + saltlen + 1;
    if (need > buflen || need < saltlen || saltlen < srclen)
        return NULL;

    if (N_log2 > 63 || ((uint64_t)r * (uint64_t)p >= (1U << 30)))
        return NULL;

    dst = buf;
    *dst++ = '$';
    *dst++ = '7';
    if (flags) {
        *dst++ = 'X'; /* eXperimental, subject to change */
        if (flags != YESCRYPT_RW)
            *dst++ = itoa64[flags];
    }
    *dst++ = '$';

    *dst++ = itoa64[N_log2];

    dst = encode64_uint32(dst, buflen - (dst - buf), r, 30);
    if (!dst) /* Can't happen */
        return NULL;

    dst = encode64_uint32(dst, buflen - (dst - buf), p, 30);
    if (!dst) /* Can't happen */
        return NULL;

    dst = encode64(dst, buflen - (dst - buf), src, srclen);
    if (!dst || dst >= buf + buflen) /* Can't happen */
        return NULL;

    *dst = 0; /* NUL termination */

    return buf;
}

uint8_t* yescrypt_gensalt(uint32_t N_log2, uint32_t r, uint32_t p, yescrypt_flags_t flags,
    const uint8_t * src, size_t srclen)
{
    static uint8_t buf[4 + 1 + 5 + 5 + BYTES2CHARS(32) + 1];
    return yescrypt_gensalt_r(N_log2, r, p, flags, src, srclen,
        buf, sizeof(buf));
}

static int yescrypt_bsty(const uint8_t * passwd, size_t passwdlen,
    const uint8_t * salt, size_t saltlen, uint64_t N, uint32_t r, uint32_t p,
    uint8_t * buf, size_t buflen, int thrid )
{
    static __thread int initialized = 0;
    static __thread yescrypt_shared_t shared;
    static __thread yescrypt_local_t local;
    int retval;
    if (!initialized) {
        /* "shared" could in fact be shared, but it's simpler to keep it private
         * along with "local".  It's dummy and tiny anyway. */
        if (yescrypt_init_shared(&shared, NULL, 0,
            0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0))
            return -1;
        if (yescrypt_init_local(&local)) {
            yescrypt_free_shared(&shared);
            return -1;
        }
        initialized = 1;
    }
    retval = yescrypt_kdf(&shared, &local,
        passwd, passwdlen, salt, saltlen, N, r, p, 0, YESCRYPT_FLAGS,
        buf, buflen, thrid );
#if 0
    if (yescrypt_free_local(&local)) {
        yescrypt_free_shared(&shared);
        return -1;
    }
    if (yescrypt_free_shared(&shared))
        return -1;
    initialized = 0;
#endif
    return retval;
}

// scrypt parameters initialized at run time.
uint64_t YESCRYPT_N;
uint32_t YESCRYPT_R;
uint32_t YESCRYPT_P;
char *yescrypt_client_key = NULL;
int yescrypt_client_key_len = 0;

/* main hash 80 bytes input */
int yescrypt_hash( const char *input, char *output, uint32_t len, int thrid )
{
   return yescrypt_bsty( (uint8_t*)input, len, (uint8_t*)input, len, YESCRYPT_N,
                         YESCRYPT_R, YESCRYPT_P, (uint8_t*)output, 32, thrid );
}

/* for util.c test */
int yescrypthash(void *output, const void *input, int thrid)
{
   return yescrypt_hash((char*) input, (char*) output, 80, thrid);
}

int scanhash_yescrypt( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr )
{
   uint32_t _ALIGN(64) vhash[8];
   uint32_t _ALIGN(64) endiandata[20];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce;
   uint32_t n = first_nonce;
   int thr_id = mythr->id;

   for ( int k = 0; k < 19; k++ )
      be32enc( &endiandata[k], pdata[k] );
   endiandata[19] = n;
   do {
      if ( yescrypt_hash((char*) endiandata, (char*) vhash, 80, thr_id ) )
      if unlikely( valid_hash( vhash, ptarget ) && !opt_benchmark )
      {
         be32enc( pdata+19, n );
         submit_solution( work, vhash, mythr );
      }
      endiandata[19] = ++n;
   } while ( n < last_nonce && !work_restart[thr_id].restart );
   *hashes_done = n - first_nonce;
   pdata[19] = n;
   return 0;
}

void yescrypt_gate_base( algo_gate_t *gate )
{
   gate->optimizations = SSE2_OPT | SHA_OPT;
   gate->scanhash      = (void*)&scanhash_yescrypt;
   gate->hash          = (void*)&yescrypt_hash;
   opt_target_factor = 65536.0;
}

bool register_yescrypt_algo( algo_gate_t* gate )
{
   yescrypt_gate_base( gate );

   if ( opt_param_n )  YESCRYPT_N = opt_param_n;
   else                YESCRYPT_N = 2048;

   if ( opt_param_r )  YESCRYPT_R = opt_param_r;
   else                YESCRYPT_R = 8;

   if ( opt_param_key )
   {
      yescrypt_client_key = opt_param_key;
      yescrypt_client_key_len = strlen( opt_param_key );
   }
   else
   {
      yescrypt_client_key = NULL;
      yescrypt_client_key_len = 0;
   }

   YESCRYPT_P = 1;

   applog( LOG_NOTICE, "Yescrypt parameters: N= %d, R= %d", YESCRYPT_N,
                       YESCRYPT_R );
   if ( yescrypt_client_key )
      applog( LOG_NOTICE, "Key= \"%s\"\n", yescrypt_client_key );

   return true;
}

bool register_yescryptr8_algo( algo_gate_t* gate )
{
   yescrypt_gate_base( gate );
   yescrypt_client_key = "Client Key";
   yescrypt_client_key_len = 10;
   YESCRYPT_N = 2048;
   YESCRYPT_R = 8;
   YESCRYPT_P = 1;
   return true;
}

bool register_yescryptr16_algo( algo_gate_t* gate )
{
   yescrypt_gate_base( gate );
   yescrypt_client_key = "Client Key";
   yescrypt_client_key_len = 10;
   YESCRYPT_N = 4096;
   YESCRYPT_R = 16;
   YESCRYPT_P = 1;
   return true;
}

bool register_yescryptr32_algo( algo_gate_t* gate )
{
   yescrypt_gate_base( gate );
   yescrypt_client_key = "WaviBanana";
   yescrypt_client_key_len = 10;
   YESCRYPT_N = 4096;
   YESCRYPT_R = 32;
   YESCRYPT_P = 1;
   return true;
}

@@ -1,382 +0,0 @@
/*-
 * Copyright 2009 Colin Percival
 * Copyright 2013,2014 Alexander Peslyak
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * This file was originally written by Colin Percival as part of the Tarsnap
 * online backup system.
 */

#ifndef YESCRYPT_H
#define YESCRYPT_H

#ifdef __cplusplus
extern "C" {
#endif

#include <stdint.h>
#include <stdlib.h> /* for size_t */
#include <stdbool.h>
#include "miner.h"

//#define __SSE4_1__

int yescrypt_hash(const char* input, char* output, uint32_t len, int thrid );

int yescrypthash(void *output, const void *input, int thrid );

/**
 * crypto_scrypt(passwd, passwdlen, salt, saltlen, N, r, p, buf, buflen):
 * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r,
 * p, buflen) and write the result into buf.  The parameters r, p, and buflen
 * must satisfy r * p < 2^30 and buflen <= (2^32 - 1) * 32.  The parameter N
 * must be a power of 2 greater than 1.
 *
 * Return 0 on success; or -1 on error.
 *
 * MT-safe as long as buf is local to the thread.
 */
extern int crypto_scrypt(const uint8_t * __passwd, size_t __passwdlen,
    const uint8_t * __salt, size_t __saltlen,
    uint64_t __N, uint32_t __r, uint32_t __p,
    uint8_t * __buf, size_t __buflen);

/**
 * Internal type used by the memory allocator.  Please do not use it directly.
 * Use yescrypt_shared_t and yescrypt_local_t as appropriate instead, since
 * they might differ from each other in a future version.
 */
typedef struct {
    void * base, * aligned;
    size_t base_size, aligned_size;
} yescrypt_region_t;

/**
 * Types for shared (ROM) and thread-local (RAM) data structures.
 */
typedef yescrypt_region_t yescrypt_shared1_t;
typedef struct {
    yescrypt_shared1_t shared1;
    uint32_t mask1;
} yescrypt_shared_t;
typedef yescrypt_region_t yescrypt_local_t;

/**
 * Possible values for yescrypt_init_shared()'s flags argument.
 */
typedef enum {
    YESCRYPT_SHARED_DEFAULTS = 0,
    YESCRYPT_SHARED_PREALLOCATED = 0x100
} yescrypt_init_shared_flags_t;

/**
 * Possible values for the flags argument of yescrypt_kdf(),
 * yescrypt_gensalt_r(), yescrypt_gensalt().  These may be OR'ed together,
 * except that YESCRYPT_WORM and YESCRYPT_RW are mutually exclusive.
 * Please refer to the description of yescrypt_kdf() below for the meaning of
 * these flags.
 */
typedef enum {
/* public */
    YESCRYPT_WORM = 0,
    YESCRYPT_RW = 1,
    YESCRYPT_PARALLEL_SMIX = 2,
    YESCRYPT_PWXFORM = 4,
/* private */
    __YESCRYPT_INIT_SHARED_1 = 0x10000,
    __YESCRYPT_INIT_SHARED_2 = 0x20000,
    __YESCRYPT_INIT_SHARED = 0x30000
} yescrypt_flags_t;

extern char *yescrypt_client_key;
extern int yescrypt_client_key_len;

#define YESCRYPT_KNOWN_FLAGS \
    (YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | YESCRYPT_PWXFORM | \
    __YESCRYPT_INIT_SHARED)

/**
 * yescrypt_init_shared(shared, param, paramlen, N, r, p, flags, mask,
 *     buf, buflen):
 * Optionally allocate memory for and initialize the shared (ROM) data
 * structure.  The parameters N, r, and p must satisfy the same conditions as
 * with crypto_scrypt().  param and paramlen specify a local parameter with
 * which the ROM is seeded.  If buf is not NULL, then it is used to return
 * buflen bytes of message digest for the initialized ROM (the caller may use
 * this to verify that the ROM has been computed in the same way that it was on
 * a previous run).
 *
 * Return 0 on success; or -1 on error.
 *
 * If bit YESCRYPT_SHARED_PREALLOCATED in flags is set, then memory for the
 * ROM is assumed to have been preallocated by the caller, with
 * shared->shared1.aligned being the start address of the ROM and
 * shared->shared1.aligned_size being its size (which must be consistent with
 * N, r, and p).  This may be used e.g. when the ROM is to be placed in a SysV
 * shared memory segment allocated by the caller.
 *
 * mask controls the frequency of ROM accesses by yescrypt_kdf().  Normally it
 * should be set to 1, to interleave RAM and ROM accesses, which works well
 * when both regions reside in the machine's RAM anyway.  Other values may be
 * used e.g. when the ROM is memory-mapped from a disk file.  Recommended mask
 * values are powers of 2 minus 1 or minus 2.  Here's the effect of some mask
 * values:
 * mask value   ROM accesses in SMix 1st loop   ROM accesses in SMix 2nd loop
 *      0                   0                              1/2
 *      1                  1/2                             1/2
 *      2                   0                              1/4
 *      3                  1/4                             1/4
 *      6                   0                              1/8
 *      7                  1/8                             1/8
 *     14                   0                              1/16
 *     15                  1/16                            1/16
 *   1022                   0                              1/1024
 *   1023                  1/1024                          1/1024
 *
 * Actual computation of the ROM contents may be avoided, if you don't intend
 * to use a ROM but need a dummy shared structure, by calling this function
 * with NULL, 0, 0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0 for the
 * arguments starting with param and on.
 *
 * MT-safe as long as shared is local to the thread.
 */
extern int yescrypt_init_shared(yescrypt_shared_t * __shared,
    const uint8_t * __param, size_t __paramlen,
    uint64_t __N, uint32_t __r, uint32_t __p,
    yescrypt_init_shared_flags_t __flags, uint32_t __mask,
    uint8_t * __buf, size_t __buflen);

/**
 * yescrypt_free_shared(shared):
 * Free memory that had been allocated with yescrypt_init_shared().
 *
 * Return 0 on success; or -1 on error.
 *
 * MT-safe as long as shared is local to the thread.
 */
extern int yescrypt_free_shared(yescrypt_shared_t * __shared);

/**
 * yescrypt_init_local(local):
 * Initialize the thread-local (RAM) data structure.  Actual memory allocation
 * is currently fully postponed until a call to yescrypt_kdf() or yescrypt_r().
 *
 * Return 0 on success; or -1 on error.
 *
 * MT-safe as long as local is local to the thread.
 */
extern int yescrypt_init_local(yescrypt_local_t * __local);

/**
 * yescrypt_free_local(local):
 * Free memory that may have been allocated for an initialized thread-local
 * (RAM) data structure.
 *
 * Return 0 on success; or -1 on error.
 *
 * MT-safe as long as local is local to the thread.
 */
extern int yescrypt_free_local(yescrypt_local_t * __local);

/**
 * yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen,
 *     N, r, p, t, flags, buf, buflen):
 * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r,
 * p, buflen), or a revision of scrypt as requested by flags and shared, and
 * write the result into buf.  The parameters N, r, p, and buflen must satisfy
 * the same conditions as with crypto_scrypt().  t controls computation time
 * while not affecting peak memory usage.  shared and flags may request
 * special modes as described below.  local is the thread-local data
 * structure, allowing to preserve and reuse a memory allocation across calls,
 * thereby reducing its overhead.
 *
 * Return 0 on success; or -1 on error.
 *
 * t controls computation time.  t = 0 is optimal in terms of achieving the
 * highest area-time for ASIC attackers.  Thus, higher computation time, if
 * affordable, is best achieved by increasing N rather than by increasing t.
 * However, if the higher memory usage (which goes along with higher N) is not
 * affordable, or if fine-tuning of the time is needed (recall that N must be a
 * power of 2), then t = 1 or above may be used to increase time while staying
 * at the same peak memory usage.  t = 1 increases the time by 25% and
 * decreases the normalized area-time to 96% of optimal.  (Of course, in
 * absolute terms the area-time increases with higher t.  It's just that it
 * would increase slightly more with higher N*r rather than with higher t.)
 * t = 2 increases the time by another 20% and decreases the normalized
 * area-time to 89% of optimal.  Thus, these two values are reasonable to use
 * for fine-tuning.  Values of t higher than 2 result in further increase in
 * time while reducing the efficiency much further (e.g., down to around 50% of
 * optimal for t = 5, which runs 3 to 4 times slower than t = 0, with exact
 * numbers varying by the flags settings).
 *
 * Classic scrypt is available by setting t = 0 and flags to YESCRYPT_WORM and
 * passing a dummy shared structure (see the description of
 * yescrypt_init_shared() above for how to produce one).  In this mode, the
 * thread-local memory region (RAM) is first sequentially written to and then
 * randomly read from.  This algorithm is friendly towards time-memory
 * tradeoffs (TMTO), available both to defenders (albeit not in this
 * implementation) and to attackers.
 *
 * Setting YESCRYPT_RW adds extra random reads and writes to the thread-local
 * memory region (RAM), which makes TMTO a lot less efficient.  This may be
 * used to slow down the kinds of attackers who would otherwise benefit from
 * classic scrypt's efficient TMTO.  Since classic scrypt's TMTO allows not
 * only for the tradeoff, but also for a decrease of attacker's area-time (by
 * up to a constant factor), setting YESCRYPT_RW substantially increases the
 * cost of attacks in area-time terms as well.  Yet another benefit of it is
 * that optimal area-time is reached at an earlier time than with classic
 * scrypt, and t = 0 actually corresponds to this earlier completion time,
 * resulting in quicker hash computations (and thus in higher request rate
 * capacity).  Due to these properties, YESCRYPT_RW should almost always be
 * set, except when compatibility with classic scrypt or TMTO-friendliness are
 * desired.
 *
 * YESCRYPT_PARALLEL_SMIX moves parallelism that is present with p > 1 to a
 * lower level as compared to where it is in classic scrypt.  This reduces
 * flexibility for efficient computation (for both attackers and defenders) by
 * requiring that, short of resorting to TMTO, the full amount of memory be
 * allocated as needed for the specified p, regardless of whether that
 * parallelism is actually being fully made use of or not.  (For comparison, a
 * single instance of classic scrypt may be computed in less memory without any
 * CPU time overhead, but in more real time, by not making full use of the
 * parallelism.)  This may be desirable when the defender has enough memory
 * with sufficiently low latency and high bandwidth for efficient full parallel
 * execution, yet the required memory size is high enough that some likely
 * attackers might end up being forced to choose between using higher latency
 * memory than they could use otherwise (waiting for data longer) or using TMTO
 * (waiting for data more times per one hash computation).  The area-time cost
 * for other kinds of attackers (who would use the same memory type and TMTO
 * factor or no TMTO either way) remains roughly the same, given the same
 * running time for the defender.  In the TMTO-friendly YESCRYPT_WORM mode, as
 * long as the defender has enough memory that is just as fast as the smaller
 * per-thread regions would be, doesn't expect to ever need greater
 * flexibility (except possibly via TMTO), and doesn't need backwards
 * compatibility with classic scrypt, there are no other serious drawbacks to
 * this setting.  In the YESCRYPT_RW mode, which is meant to discourage TMTO,
 * this new approach to parallelization makes TMTO less inefficient.  (This is
 * an unfortunate side-effect of avoiding some random writes, as we have to in
 * order to allow for parallel threads to access a common memory region without
 * synchronization overhead.)  Thus, in this mode this setting poses an extra
 * tradeoff of its own (higher area-time cost for a subset of attackers vs.
 * better TMTO resistance).  Setting YESCRYPT_PARALLEL_SMIX also changes the
 * way the running time is to be controlled from N*r*p (for classic scrypt) to
 * N*r (in this modification).  All of this applies only when p > 1.  For
 * p = 1, this setting is a no-op.
 *
 * Passing a real shared structure, with ROM contents previously computed by
 * yescrypt_init_shared(), enables the use of ROM and requires YESCRYPT_RW for
 * the thread-local RAM region.  In order to allow for initialization of the
 * ROM to be split into a separate program, the shared->shared1.aligned and
 * shared->shared1.aligned_size fields may be set by the caller of
 * yescrypt_kdf() manually rather than with yescrypt_init_shared().
 *
 * local must be initialized with yescrypt_init_local().
 *
 * MT-safe as long as local and buf are local to the thread.
 */
extern int yescrypt_kdf(const yescrypt_shared_t * __shared,
    yescrypt_local_t * __local,
    const uint8_t * __passwd, size_t __passwdlen,
    const uint8_t * __salt, size_t __saltlen,
    uint64_t __N, uint32_t __r, uint32_t __p, uint32_t __t,
    yescrypt_flags_t __flags,
    uint8_t * __buf, size_t __buflen, int thrid);

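The documentation above spells out how to get classic scrypt from this API: t = 0, flags = YESCRYPT_WORM, and a dummy shared structure. A minimal sketch of that call sequence, assuming this fork's yescrypt_kdf() (with its extra thrid argument) is linked in and that thread id 0 is acceptable for a standalone call; the password, salt, and cost parameters are illustrative:

#include <stdio.h>
#include "yescrypt.h"

int main(void)
{
    yescrypt_shared_t shared;   // dummy shared structure, no ROM
    yescrypt_local_t  local;
    uint8_t derived[32];

    // Dummy shared structure, as documented for yescrypt_init_shared().
    if (yescrypt_init_shared(&shared, NULL, 0,
        0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0))
        return 1;
    if (yescrypt_init_local(&local))
        return 1;

    // Classic scrypt: t = 0, YESCRYPT_WORM, N = 1024, r = 8, p = 1.
    if (yescrypt_kdf(&shared, &local,
        (const uint8_t *)"password", 8, (const uint8_t *)"salt", 4,
        1024, 8, 1, 0, YESCRYPT_WORM, derived, sizeof(derived), 0))
        return 1;

    for (int i = 0; i < 32; i++)
        printf("%02x", derived[i]);
    printf("\n");

    yescrypt_free_local(&local);
    yescrypt_free_shared(&shared);
    return 0;
}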
/**
 * yescrypt_r(shared, local, passwd, passwdlen, setting, buf, buflen):
 * Compute and encode an scrypt or enhanced scrypt hash of passwd given the
 * parameters and salt value encoded in setting.  If the shared structure is
 * not dummy, a ROM is used and YESCRYPT_RW is required.  Otherwise, whether to
 * use the YESCRYPT_WORM (classic scrypt) or YESCRYPT_RW (time-memory tradeoff
 * discouraging modification) is determined by the setting string.  shared and
 * local must be initialized as described above for yescrypt_kdf().  buf must
 * be large enough (as indicated by buflen) to hold the encoded hash string.
 *
 * Return the encoded hash string on success; or NULL on error.
 *
 * MT-safe as long as local and buf are local to the thread.
 */
extern uint8_t * yescrypt_r(const yescrypt_shared_t * __shared,
    yescrypt_local_t * __local,
    const uint8_t * __passwd, size_t __passwdlen,
    const uint8_t * __setting,
    uint8_t * __buf, size_t __buflen, int thrid);

/**
 * yescrypt(passwd, setting):
 * Compute and encode an scrypt or enhanced scrypt hash of passwd given the
 * parameters and salt value encoded in setting.  Whether to use the
 * YESCRYPT_WORM (classic scrypt) or YESCRYPT_RW (time-memory tradeoff
 * discouraging modification) is determined by the setting string.
 *
 * Return the encoded hash string on success; or NULL on error.
 *
 * This is a crypt(3)-like interface, which is simpler to use than
 * yescrypt_r(), but it is not MT-safe, it does not allow for the use of a ROM,
 * and it is slower than yescrypt_r() for repeated calls because it allocates
 * and frees memory on each call.
 *
 * MT-unsafe.
 */
extern uint8_t * yescrypt(const uint8_t * __passwd, const uint8_t * __setting, int thrid );

/**
 * yescrypt_gensalt_r(N_log2, r, p, flags, src, srclen, buf, buflen):
 * Generate a setting string for use with yescrypt_r() and yescrypt() by
 * encoding into it the parameters N_log2 (which is to be set to base 2
 * logarithm of the desired value for N), r, p, flags, and a salt given by src
 * (of srclen bytes).  buf must be large enough (as indicated by buflen) to
 * hold the setting string.
 *
 * Return the setting string on success; or NULL on error.
 *
 * MT-safe as long as buf is local to the thread.
 */
extern uint8_t * yescrypt_gensalt_r(
    uint32_t __N_log2, uint32_t __r, uint32_t __p,
    yescrypt_flags_t __flags,
    const uint8_t * __src, size_t __srclen,
    uint8_t * __buf, size_t __buflen);

/**
 * yescrypt_gensalt(N_log2, r, p, flags, src, srclen):
 * Generate a setting string for use with yescrypt_r() and yescrypt().  This
 * function is the same as yescrypt_gensalt_r() except that it uses a static
 * buffer and thus is not MT-safe.
 *
 * Return the setting string on success; or NULL on error.
 *
 * MT-unsafe.
 */
extern uint8_t * yescrypt_gensalt(
    uint32_t __N_log2, uint32_t __r, uint32_t __p,
    yescrypt_flags_t __flags,
    const uint8_t * __src, size_t __srclen);

#ifdef __cplusplus
}
#endif

#endif
@@ -161,7 +161,7 @@ bool register_yespowerr16_algo( algo_gate_t* gate )

// Legacy Yescrypt (yespower v0.5)

bool register_yescrypt_05_algo( algo_gate_t* gate )
bool register_yescrypt_algo( algo_gate_t* gate )
{
   gate->optimizations = SSE2_OPT | SHA_OPT;
   gate->scanhash = (void*)&scanhash_yespower;
@@ -194,7 +194,7 @@ bool register_yescrypt_05_algo( algo_gate_t* gate )
}


bool register_yescryptr8_05_algo( algo_gate_t* gate )
bool register_yescryptr8_algo( algo_gate_t* gate )
{
   gate->optimizations = SSE2_OPT | SHA_OPT;
   gate->scanhash = (void*)&scanhash_yespower;
@@ -207,7 +207,7 @@ bool register_yescryptr8_05_algo( algo_gate_t* gate )
   return true;
}

bool register_yescryptr16_05_algo( algo_gate_t* gate )
bool register_yescryptr16_algo( algo_gate_t* gate )
{
   gate->optimizations = SSE2_OPT | SHA_OPT;
   gate->scanhash = (void*)&scanhash_yespower;
@@ -220,7 +220,7 @@ bool register_yescryptr16_05_algo( algo_gate_t* gate )
   return true;
}

bool register_yescryptr32_05_algo( algo_gate_t* gate )
bool register_yescryptr32_algo( algo_gate_t* gate )
{
   gate->optimizations = SSE2_OPT | SHA_OPT;
   gate->scanhash = (void*)&scanhash_yespower;
20  configure (vendored)
@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.20.1.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.20.2.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.20.1'
PACKAGE_STRING='cpuminer-opt 3.20.1'
PACKAGE_VERSION='3.20.2'
PACKAGE_STRING='cpuminer-opt 3.20.2'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''

@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
\`configure' configures cpuminer-opt 3.20.1 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.20.2 to adapt to many kinds of systems.

Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1404,7 +1404,7 @@ fi
if test -n "$ac_init_help"; then
  case $ac_init_help in
     short | recursive ) echo "Configuration of cpuminer-opt 3.20.1:";;
     short | recursive ) echo "Configuration of cpuminer-opt 3.20.2:";;
   esac
  cat <<\_ACEOF

@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
  cat <<\_ACEOF
cpuminer-opt configure 3.20.1
cpuminer-opt configure 3.20.2
generated by GNU Autoconf 2.69

Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.

It was created by cpuminer-opt $as_me 3.20.1, which was
It was created by cpuminer-opt $as_me 3.20.2, which was
generated by GNU Autoconf 2.69.  Invocation command line was

  $ $0 $@
@@ -2993,7 +2993,7 @@ fi

# Define the identity of the package.
 PACKAGE='cpuminer-opt'
 VERSION='3.20.1'
 VERSION='3.20.2'


cat >>confdefs.h <<_ACEOF
@@ -6718,7 +6718,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.20.1, which was
This file was extended by cpuminer-opt $as_me 3.20.2, which was
generated by GNU Autoconf 2.69.  Invocation command line was

  CONFIG_FILES    = $CONFIG_FILES
@@ -6784,7 +6784,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.20.1
cpuminer-opt config.status 3.20.2
configured by $0, generated by GNU Autoconf 2.69,
  with options \\"\$ac_cs_config\\"

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.20.1])
AC_INIT([cpuminer-opt], [3.20.2])

AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM
@@ -273,9 +273,9 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#endif

// Mask making

// Equivalent of AVX512 _mm_movepi64_mask & _mm_movepi32_mask.
// Returns a 2 or 4 bit integer mask from the MSBs of 64 or 32 bit elements.
// Effectively a sign test.

#define mm_movmask_64( v ) \
   _mm_movemask_pd( _mm_castsi128_pd( v ) )
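A small check of the sign-test behaviour, assuming SSE2 and the macro as corrected above (the test values are arbitrary):

#include <stdio.h>
#include <emmintrin.h>

#define mm_movmask_64( v ) \
   _mm_movemask_pd( _mm_castsi128_pd( v ) )

int main(void)
{
   // Lane 0 negative, lane 1 positive: expect mask 0b01 = 1.
   // (_mm_set_epi64x takes the high lane first.)
   __m128i v = _mm_set_epi64x( 7, -3 );
   printf( "mask = %d\n", mm_movmask_64( v ) );   // prints 1
   return 0;
}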
@@ -306,34 +306,11 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
//
// Bit rotations

// AVX512VL has implemented bit rotation for 128 bit vectors with
// 64 and 32 bit elements.

// x2 rotates elements in 2 individual vectors in a double buffered
// optimization for SSE2, does nothing for AVX512 but is there for
// transparency.

// The compiler rejects a variable for the last arg of _mm_rol_epi32: the
// count must be an 8 bit immediate. The _mm_slli shifts have the same
// specification but do accept a variable, because a shift-by-register form
// exists in hardware. Use the rol_var form when the count is not a compile
// time constant.
// sm3-hash-4way.c has one instance where mm128_rol_var_32 is required.

#define mm128_ror_var_64( v, c ) \
   _mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )

#define mm128_rol_var_64( v, c ) \
   _mm_or_si128( _mm_slli_epi64( v, c ), _mm_srli_epi64( v, 64-(c) ) )

#define mm128_ror_var_32( v, c ) \
   _mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) )

#define mm128_rol_var_32( v, c ) \
   _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
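A sketch of the var form with a runtime count, which is exactly the case the immediate-only rotate intrinsics cannot handle:

#include <stdio.h>
#include <stdint.h>
#include <emmintrin.h>

#define mm128_ror_var_32( v, c ) \
   _mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) )

int main(void)
{
   int c = 8;                      // runtime count: fine for shifts, not for _mm_rol_epi32
   __m128i v = _mm_set1_epi32( 0x12345678 );
   __m128i r = mm128_ror_var_32( v, c );
   uint32_t out[4];
   _mm_storeu_si128( (__m128i*)out, r );
   printf( "%08x\n", out[0] );     // 78123456
   return 0;
}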
|
||||
#if defined(__AVX512VL__)
|
||||
//#if defined(__AVX512F__) && defined(__AVX512VL__)
|
||||
|
||||
#define mm128_ror_64 _mm_ror_epi64
|
||||
#define mm128_rol_64 _mm_rol_epi64
|
||||
@@ -358,10 +335,17 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
|
||||
#else // SSE2
|
||||
|
||||
#define mm128_ror_64 mm128_ror_var_64
|
||||
#define mm128_rol_64 mm128_rol_var_64
|
||||
#define mm128_ror_32 mm128_ror_var_32
|
||||
#define mm128_rol_32 mm128_rol_var_32
|
||||
#define mm128_ror_64( v, c ) \
|
||||
_mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )
|
||||
|
||||
#define mm128_rol_64( v, c ) \
|
||||
_mm_or_si128( _mm_slli_epi64( v, c ), _mm_srli_epi64( v, 64-(c) ) )
|
||||
|
||||
#define mm128_ror_32( v, c ) \
|
||||
_mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) )
|
||||
|
||||
#define mm128_rol_32( v, c ) \
|
||||
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
|
||||
|
||||
#define mm128_rorx2_64( v1, v0, c ) \
|
||||
{ \
|
||||
@@ -411,6 +395,11 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
#define mm128_rol_16( v, c ) \
|
||||
_mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) )
|
||||
|
||||
// Deprecated.
|
||||
#define mm128_rol_var_32( v, c ) \
|
||||
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
|
||||
|
||||
//
|
||||
// Limited 2 input shuffle, combines shuffle with blend. The destination low
|
||||
// half is always taken from src a, and the high half from src b.
|
||||
#define mm128_shuffle2_64( a, b, c ) \
|
||||
@@ -421,7 +410,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
_mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( a ), \
|
||||
_mm_castsi128_ps( b ), c ) );

//
// Rotate vector elements across all lanes

@@ -432,21 +420,61 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_shuflr_32( v ) _mm_shuffle_epi32( v, 0x39 )
#define mm128_shufll_32( v ) _mm_shuffle_epi32( v, 0x93 )

// Swap 32 bit elements in 64 bit lanes
#define mm128_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 )
#define mm128_shuflr64_32 mm128_swap64_32
#define mm128_shufll64_32 mm128_swap64_32

#if defined(__SSSE3__)

// Rotate right by c bytes, no SSE2 equivalent.
static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
{ return _mm_alignr_epi8( v, v, c ); }

#endif

// Rotate byte elements within 64 or 32 bit lanes, AKA optimized bit rotations
// for multiples of 8 bits. Uses ror/rol macros when AVX512 is available
// (unlikely but faster), or when SSSE3 is not available (slower).

#define mm128_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 )
#define mm128_shuflr64_32 mm128_swap64_32
#define mm128_shufll64_32 mm128_swap64_32

#if defined(__SSSE3__) && !defined(__AVX512VL__)
#define mm128_shuflr64_24( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi64x( \
      0x0a09080f0e0d0c0b, 0x0201000706050403 ) )
#else
#define mm128_shuflr64_24( v ) mm128_ror_64( v, 24 )
#endif

#if defined(__SSSE3__) && !defined(__AVX512VL__)
#define mm128_shuflr64_16( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi64x( \
      0x09080f0e0d0c0b0a, 0x0100070605040302 ) )
#else
#define mm128_shuflr64_16( v ) mm128_ror_64( v, 16 )
#endif

#if defined(__SSSE3__) && !defined(__AVX512VL__)
#define mm128_swap32_16( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi64x( \
      0x0d0c0f0e09080b0a, 0x0504070601000302 ) )
#else
#define mm128_swap32_16( v ) mm128_ror_32( v, 16 )
#endif
#define mm128_shuflr32_16 mm128_swap32_16
#define mm128_shufll32_16 mm128_swap32_16

#if defined(__SSSE3__) && !defined(__AVX512VL__)
#define mm128_shuflr32_8( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi64x( \
      0x0c0f0e0d080b0a09, 0x0407060500030201 ) )
#else
#define mm128_shuflr32_8( v ) mm128_ror_32( v, 8 )
#endif
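
// Illustrative sketch, not from the diff: the pshufb index tables above are
// byte-granular rotates, so the SSSE3 path must agree with the generic
// shift-or rotate. Equivalence check for the 24 bit case, assuming SSSE3 and
// arbitrary test data.
static inline int shuflr64_24_check( void )
{
   const __m128i v = _mm_set_epi64x( 0x1122334455667788LL,
                                     (int64_t)0x99aabbccddeeff00ULL );
   const __m128i a = mm128_shuflr64_24( v );                   // shuffle or ror
   const __m128i b = _mm_or_si128( _mm_srli_epi64( v, 24 ),
                                   _mm_slli_epi64( v, 40 ) );  // plain rotate
   return _mm_movemask_epi8( _mm_cmpeq_epi8( a, b ) ) == 0xffff;
}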

//
// Endian byte swap.

#if defined(__SSSE3__)

#define mm128_bswap_64( v ) \
   _mm_shuffle_epi8( v, m128_const_64( 0x08090a0b0c0d0e0f, \
                                       0x0001020304050607 ) )
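
// Illustrative sketch, not from the diff: mm128_bswap_64 reverses the bytes
// of each 64 bit element. Cross-checked against the GCC/Clang scalar builtin;
// assumes SSSE3, an x86-64 target, and m128_const_64 from this header.
static inline int bswap_64_check( void )
{
   const uint64_t x = 0x0123456789abcdefULL;
   const __m128i r = mm128_bswap_64( _mm_set1_epi64x( (int64_t)x ) );
   return (uint64_t)_mm_cvtsi128_si64( r ) == __builtin_bswap64( x );
}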
@@ -537,8 +565,8 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
//
// Rotate in place concatenated 128 bit vectors as one 256 bit vector.

// Swap 128 bit vectorse.
// Swap 128 bit vectors.
// This should be avoided, it's more efficient to switch references.
#define mm128_swap256_128( v1, v2 ) \
   v1 = _mm_xor_si128( v1, v2 ); \
   v2 = _mm_xor_si128( v1, v2 ); \
@@ -552,8 +580,8 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )

// Function macros with two inputs and one output, inputs are preserved.
// Returns the high 128 bits, ie updated v1.
// These two-input functions are not available without SSSE3. Use procedure
// macros below instead.
// These functions are preferred but only available with SSSE3. Use procedure
// macros below for SSE2 compatibility.

#define mm128_shufl2r_64( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 )
#define mm128_shufl2l_64( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 )
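
// Illustrative sketch, not from the diff: mm128_shufl2r_64 treats { v1, v2 }
// as one 256 bit value, rotates it right one 64 bit element, and returns the
// updated v1 (the high 128 bits). Assumes SSSE3; A..D are arbitrary markers.
static inline int shufl2r_64_demo( void )
{
   const __m128i v1 = _mm_set_epi64x( 0xBB, 0xAA );   // v1 = { B, A }
   const __m128i v2 = _mm_set_epi64x( 0xDD, 0xCC );   // v2 = { D, C }
   const __m128i r  = mm128_shufl2r_64( v1, v2 );     // expect { C, B }
   return (uint64_t)_mm_cvtsi128_si64( r ) == 0xBB
       && (uint64_t)_mm_cvtsi128_si64( _mm_unpackhi_epi64( r, r ) ) == 0xCC;
}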
@@ -568,8 +596,8 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
#define mm128_shufl2l_8( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 )

// Procedure macros with 2 inputs and 2 outputs, input args are overwritten.
// Deprecated for SSSE3 and above, they exist for SSSE3 only for compatibility
// with existing code. The function macros above can be used more efficiently.
// Deprecated for SSSE3 and above, SSSE3 versions exist only for
// compatibility with existing code.

#define mm128_vror256_64( v1, v2 ) \
do { \
@@ -13,6 +13,18 @@
// AVX512 implementations. They will be selected automatically but their use
// is limited because 256 bit vectors are less likely to be used when 512
// is available.
//
// AVX2 version of _mm256_shuffle_epi8 is limited to 128 bit lanes but AVX512
// version is not. Some usage has the index vector encoded as if full vector
// shuffles are supported. This has no side effects and would have the same
// results using either version.
// If needed and AVX512 is available, 256 bit full vector shuffles can be
// implemented using the AVX512 zero-mask feature with a NULL mask.
// Using intrinsics it's simple:
//    _mm256_maskz_shuffle_epi8( k0, v, c )
// With asm it's a bit more complicated with the addition of the mask register
// and zero tag:
//    vpshufb ymm0{k0}{z}, ymm1, ymm2
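
// Illustrative sketch, not from the diff: the zero-mask idea above expressed
// with intrinsics. Note that an all-ones mask is what keeps every byte; a
// literal zero mask argument would zero the whole vector (the asm form {k0}
// means "no masking", which is not the same as passing a zero mask).
// Requires AVX512VL + AVX512BW; the function name is illustrative only.
#if defined(__AVX512VL__) && defined(__AVX512BW__)
static inline __m256i mm256_shufflev_epi8( const __m256i v, const __m256i c )
{  // behaves like _mm256_shuffle_epi8, with masking available if ever needed
   return _mm256_maskz_shuffle_epi8( (__mmask32)0xffffffff, v, c );
}
#endif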

#if defined(__AVX__)

@@ -234,9 +246,9 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
#endif

// Mask making

// Equivalent of AVX512 _mm256_movepi64_mask & _mm256_movepi32_mask.
// Returns 4 or 8 bit integer mask from MSB of 64 or 32 bit elements.
// Effectively a sign test.

#define mm256_movmask_64( v ) \
   _mm256_movmask_pd( _mm256_castsi256_pd( v ) )
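
// Illustrative sketch, not from the diff: _mm256_movmask_pd returns a plain
// int, so mm256_movmask_64 yields an integer mask directly and no cast back
// to a vector type is needed (or possible). Usage, with arbitrary values:
static inline int movmask_64_demo( void )
{
   // MSB set in elements 3 and 0 only -> mask 0b1001
   const __m256i v = _mm256_set_epi64x( -1, 1, 2, -3 );
   return mm256_movmask_64( v ) == 0x9;
}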

@@ -273,42 +285,11 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
//
// Bit rotations.
//
// The only bit shift for more than 64 bits is with __int128 which is slow.
//
// AVX512 has bit rotate for 256 bit vectors with 64 or 32 bit elements
//
// x2 rotates elements in 2 individual vectors in a double buffered
// optimization for SSE2, does nothing for AVX512 but is there for
// optimization for AVX2, does nothing for AVX512 but is here for
// transparency.

// compiler doesn't like when a variable is used for the last arg of
// _mm_rol_epi32, must be "8 bit immediate". Therefore use rol_var where
// necessary.

#define mm256_ror_var_64( v, c ) \
   _mm256_or_si256( _mm256_srli_epi64( v, c ), \
                    _mm256_slli_epi64( v, 64-(c) ) )

#define mm256_rol_var_64( v, c ) \
   _mm256_or_si256( _mm256_slli_epi64( v, c ), \
                    _mm256_srli_epi64( v, 64-(c) ) )

#define mm256_ror_var_32( v, c ) \
   _mm256_or_si256( _mm256_srli_epi32( v, c ), \
                    _mm256_slli_epi32( v, 32-(c) ) )

#define mm256_rol_var_32( v, c ) \
   _mm256_or_si256( _mm256_slli_epi32( v, c ), \
                    _mm256_srli_epi32( v, 32-(c) ) )

// The spec says both F & VL are required, but just in case AMD
// decides to implement ROL/R without AVX512F.
#if defined(__AVX512VL__)
//#if defined(__AVX512F__) && defined(__AVX512VL__)

// AVX512, control must be 8 bit immediate.

#define mm256_ror_64 _mm256_ror_epi64
#define mm256_rol_64 _mm256_rol_epi64
@@ -333,10 +314,23 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )

#else // AVX2

#define mm256_ror_64 mm256_ror_var_64
#define mm256_rol_64 mm256_rol_var_64
#define mm256_ror_32 mm256_ror_var_32
#define mm256_rol_32 mm256_rol_var_32
// use shuflr64 shuflr32 below for optimized bit rotations of multiples of 8.

#define mm256_ror_64( v, c ) \
   _mm256_or_si256( _mm256_srli_epi64( v, c ), \
                    _mm256_slli_epi64( v, 64-(c) ) )

#define mm256_rol_64( v, c ) \
   _mm256_or_si256( _mm256_slli_epi64( v, c ), \
                    _mm256_srli_epi64( v, 64-(c) ) )

#define mm256_ror_32( v, c ) \
   _mm256_or_si256( _mm256_srli_epi32( v, c ), \
                    _mm256_slli_epi32( v, 32-(c) ) )

#define mm256_rol_32( v, c ) \
   _mm256_or_si256( _mm256_slli_epi32( v, c ), \
                    _mm256_srli_epi32( v, 32-(c) ) )
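
// Illustrative sketch, not from the diff: whichever branch is compiled, the
// result must match a scalar rotate in every lane. Lane 0 check for the 64
// bit case; value and count are arbitrary.
static inline int mm256_ror_64_check( void )
{
   const uint64_t x = 0xf00dfacecafef00dULL;
   const __m256i r = mm256_ror_64( _mm256_set1_epi64x( (int64_t)x ), 9 );
   const uint64_t lane0 =
                (uint64_t)_mm_cvtsi128_si64( _mm256_castsi256_si128( r ) );
   return lane0 == ( ( x >> 9 ) | ( x << 55 ) );
}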

#define mm256_rorx2_64( v1, v0, c ) \
{ \
@@ -388,6 +382,10 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
   _mm256_or_si256( _mm256_slli_epi16( v, c ), \
                    _mm256_srli_epi16( v, 16-(c) ) )

// Deprecated.
#define mm256_rol_var_32( v, c ) \
   _mm256_or_si256( _mm256_slli_epi32( v, c ), \
                    _mm256_srli_epi32( v, 32-(c) ) )

//
// Rotate elements across all lanes.
@@ -399,7 +397,6 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )

// Rotate 256 bit vector by one 64 bit element
#define mm256_shuflr_64( v ) _mm256_permute4x64_epi64( v, 0x39 )

#define mm256_shufll_64( v ) _mm256_permute4x64_epi64( v, 0x93 )

// Rotate 256 bit vector by one 32 bit element.
@@ -413,7 +410,6 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
   m256_const_64( 0x0000000600000005, 0x0000000400000003, \
                  0x0000000200000001, 0x0000000000000007 ) )

//
// Rotate elements within each 128 bit lane of 256 bit vector.

@@ -426,7 +422,6 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
   _mm256_castps_si256( _mm256_shuffle_ps( _mm256_castsi256_ps( a ), \
                        _mm256_castsi256_ps( b ), c ) );

#define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e )
#define mm256_shuflr128_64 mm256_swap128_64
#define mm256_shufll128_64 mm256_swap128_64
@@ -437,11 +432,52 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
{ return _mm256_alignr_epi8( v, v, c ); }

// Swap 32 bit elements in each 64 bit lane.
// Rotate byte elements within 64 or 32 bit lanes, AKA optimized bit
// rotations for multiples of 8 bits. Uses faster ror/rol instructions when
// AVX512 is available.

#define mm256_swap64_32( v ) _mm256_shuffle_epi32( v, 0xb1 )
#define mm256_shuflr64_32 mm256_swap64_32
#define mm256_shufll64_32 mm256_swap64_32

#if defined(__AVX512VL__)
#define mm256_shuflr64_24( v ) _mm256_ror_epi64( v, 24 )
#else
#define mm256_shuflr64_24( v ) \
   _mm256_shuffle_epi8( v, _mm256_set_epi64x( \
      0x0a09080f0e0d0c0b, 0x0201000706050403, \
      0x0a09080f0e0d0c0b, 0x0201000706050403 ) )
#endif

#if defined(__AVX512VL__)
#define mm256_shuflr64_16( v ) _mm256_ror_epi64( v, 16 )
#else
#define mm256_shuflr64_16( v ) \
   _mm256_shuffle_epi8( v, _mm256_set_epi64x( \
      0x09080f0e0d0c0b0a, 0x0100070605040302, \
      0x09080f0e0d0c0b0a, 0x0100070605040302 ) )
#endif

#if defined(__AVX512VL__)
#define mm256_swap32_16( v ) _mm256_ror_epi32( v, 16 )
#else
#define mm256_swap32_16( v ) \
   _mm256_shuffle_epi8( v, _mm256_set_epi64x( \
      0x0d0c0f0e09080b0a, 0x0504070601000302, \
      0x0d0c0f0e09080b0a, 0x0504070601000302 ) )
#endif
#define mm256_shuflr32_16 mm256_swap32_16
#define mm256_shufll32_16 mm256_swap32_16

#if defined(__AVX512VL__)
#define mm256_shuflr32_8( v ) _mm256_ror_epi32( v, 8 )
#else
#define mm256_shuflr32_8( v ) \
   _mm256_shuffle_epi8( v, _mm256_set_epi64x( \
      0x0c0f0e0d080b0a09, 0x0407060500030201, \
      0x0c0f0e0d080b0a09, 0x0407060500030201 ) )
#endif
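
// Illustrative sketch, not from the diff: the AVX2 pshufb tables and the
// AVX512 rotate path must produce identical results. Equivalence check for
// mm256_swap32_16 against the generic shift-or form; test value arbitrary.
static inline int mm256_swap32_16_check( void )
{
   const __m256i v = _mm256_set1_epi32( (int)0xdeadbeef );
   const __m256i d = _mm256_xor_si256( mm256_swap32_16( v ),
                        _mm256_or_si256( _mm256_srli_epi32( v, 16 ),
                                         _mm256_slli_epi32( v, 16 ) ) );
   return _mm256_testz_si256( d, d );   // 1 when both forms agree
}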

// NOTE: _mm256_shuffle_epi8, like most shuffles, is restricted to 128 bit
// lanes. AVX512, however, supports full vector 8 bit shuffle. The AVX512VL +
// AVX512BW intrinsic _mm256_mask_shuffle_epi8 with a NULL mask can be used if
@@ -496,18 +532,8 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
   casti_m256i( d, 7 ) = _mm256_shuffle_epi8( casti_m256i( s, 7 ), ctl ); \
} while(0)

//
// Rotate two concatenated 256 bit vectors as one 512 bit vector by specified
// number of elements. Rotate is done in place, source arguments are
// overwritten.
// Some of these can use permute but appears to be slower. Maybe a Ryzen
// issue

// _mm256_alignr_epi 64/32 are only available with AVX512 but AVX512 also
// makes these macros unnecessary.

// continue using vror/vrol notation for now to avoid confusion with
// shufl2r/shufl2l macro functions available with AVX512.
// swap 256 bit vectors in place.
// This should be avoided, it's more efficient to switch references.
#define mm256_swap512_256( v1, v2 ) \
   v1 = _mm256_xor_si256( v1, v2 ); \
   v2 = _mm256_xor_si256( v1, v2 ); \
@@ -316,58 +316,18 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
// Bit rotations.

// AVX512F has built-in fixed and variable bit rotation for 64 & 32 bit
// elements and can be called directly. But they only accept immediate 8
// for control arg.
// The workaround is a fraud, just a fluke of the compiler's optimizer.
// It fails without -O3. The compiler seems to unroll shift loops, eliminating
// the variable control, better than rotate loops.
// elements and can be called directly.
//
// _mm512_rol_epi64, _mm512_ror_epi64, _mm512_rol_epi32, _mm512_ror_epi32
// _mm512_rolv_epi64, _mm512_rorv_epi64, _mm512_rolv_epi32, _mm512_rorv_epi32
//

// For convenience and consistency with AVX2
// For convenience and consistency with AVX2 macros.
#define mm512_ror_64 _mm512_ror_epi64
#define mm512_rol_64 _mm512_rol_epi64
#define mm512_ror_32 _mm512_ror_epi32
#define mm512_rol_32 _mm512_rol_epi32

static inline __m512i mm512_ror_var_64( const __m512i v, const int c )
{
   return _mm512_or_si512( _mm512_srli_epi64( v, c ),
                           _mm512_slli_epi64( v, 64-c ) );
}

static inline __m512i mm512_rol_var_64( const __m512i v, const int c )
{
   return _mm512_or_si512( _mm512_slli_epi64( v, c ),
                           _mm512_srli_epi64( v, 64-c ) );
}

static inline __m512i mm512_ror_var_32( const __m512i v, const int c )
{
   return _mm512_or_si512( _mm512_srli_epi32( v, c ),
                           _mm512_slli_epi32( v, 32-c ) );
}

static inline __m512i mm512_rol_var_32( const __m512i v, const int c )
{
   return _mm512_or_si512( _mm512_slli_epi32( v, c ),
                           _mm512_srli_epi32( v, 32-c ) );
}

static inline __m512i mm512_ror_16( __m512i const v, const int c )
{
   return _mm512_or_si512( _mm512_srli_epi16( v, c ),
                           _mm512_slli_epi16( v, 16-c ) );
}

static inline __m512i mm512_rol_16( const __m512i v, const int c )
{
   return _mm512_or_si512( _mm512_slli_epi16( v, c ),
                           _mm512_srli_epi16( v, 16-c ) );
}

// Rotations using a vector control index are very slow due to overhead
// to generate the index vector. Repeated rotations using the same index
// are better handled by the calling function where the index only needs
@@ -599,22 +559,34 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
{ return _mm512_alignr_epi8( v, v, c ); }

// Swap 32 bits in each 64 bit lane. Can be done with rotate instruction
// but only with AVX512. Shuffle is just as fast and available with AVX2
// & SSE2.
// Rotate byte elements in each 64 or 32 bit lane. Redundant for AVX512, all
// can be done with ror & rol. Defined only for convenience and consistency
// with AVX2 & SSE2 macros.

#define mm512_swap64_32( v ) _mm512_shuffle_epi32( v, 0xb1 )
#define mm512_shuflr64_32 mm512_swap64_32
#define mm512_shufll64_32 mm512_swap64_32

// Need good way to distinguish 1 input shuffles, 2 input shuffle functions,
// and 2 input 2 output shuffle macros.
//
//    shuflr is 1 input
//    shufl2r is 2 input ...
// Drop macros? They can easily be rebuilt using shufl2 functions
#define mm512_shuflr64_24( v ) _mm512_ror_epi64( v, 24 )
#define mm512_shufll64_24( v ) _mm512_rol_epi64( v, 24 )

#define mm512_shuflr64_16( v ) _mm512_ror_epi64( v, 16 )
#define mm512_shufll64_16( v ) _mm512_rol_epi64( v, 16 )

#define mm512_shuflr64_8( v ) _mm512_ror_epi64( v, 8 )
#define mm512_shufll64_8( v ) _mm512_rol_epi64( v, 8 )

#define mm512_swap32_16( v ) _mm512_ror_epi32( v, 16 )
#define mm512_shuflr32_16 mm512_swap32_16
#define mm512_shufll32_16 mm512_swap32_16

#define mm512_shuflr32_8( v ) _mm512_ror_epi32( v, 8 )
#define mm512_shufll32_8( v ) _mm512_rol_epi32( v, 8 )

// 2 input, 1 output
// Rotate concatenated { v1, v2 } right or left and return v1.
// Concatenate { v1, v2 } then rotate right or left and return the high
// 512 bits, ie rotated v1.
#define mm512_shufl2r_256( v1, v2 ) _mm512_alignr_epi64( v2, v1, 4 )
#define mm512_shufl2l_256( v1, v2 ) _mm512_alignr_epi64( v1, v2, 4 )
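
// Illustrative sketch, not from the diff: _mm512_alignr_epi64 selects 64 bit
// elements from the concatenation, so mm512_shufl2r_256 returns the high 512
// bits of { v1, v2 } rotated right by 256 bits, ie the updated v1. Element
// values 0..15 are arbitrary markers.
static inline int mm512_shufl2r_256_demo( void )
{
   const __m512i v1 = _mm512_set_epi64(  7,  6,  5,  4,  3,  2,  1,  0 );
   const __m512i v2 = _mm512_set_epi64( 15, 14, 13, 12, 11, 10,  9,  8 );
   // expect elements { 11, 10, 9, 8, 7, 6, 5, 4 }, high to low
   const __m512i e  = _mm512_set_epi64( 11, 10,  9,  8,  7,  6,  5,  4 );
   return _mm512_cmpeq_epi64_mask( mm512_shufl2r_256( v1, v2 ), e ) == 0xff;
}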