mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
319 lines
8.7 KiB
C
319 lines
8.7 KiB
C
/* hash.c Aug 2011
|
|
* groestl512-hash-4way https://github.com/JayDDee/cpuminer-opt 2019-12.
|
|
*
|
|
* Groestl implementation for different versions.
|
|
* Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
|
|
*
|
|
* This code is placed in the public domain
|
|
*/
|
|
|
|
// Optimized for hash and data lengths that are integral multiples of the
// size of an __m128i (16 bytes).
|
|
|
|
|
|
#include <memory.h>
|
|
#include "groestl256-intr-4way.h"
|
|
#include "miner.h"
|
|
#include "simd-utils.h"
|
|
|
|
#if defined(__AVX2__) && defined(__VAES__)
|
|
|
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
|
|
|
|
|
/*
 * Reset a 4-way Groestl-256 context.
 *
 * ctx     - context to reset; entries 0..SIZE256-1 of both the chaining
 *           state and the message buffer are cleared.
 * hashlen - digest length, stored unchanged in ctx->hashlen.
 *
 * Always returns 0.
 */
int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )
{
   int n;

   // Clear the chaining state, then the message buffer.
   for ( n = 0; n < SIZE256; n++ )
      ctx->chaining[n] = m512_zero;
   for ( n = 0; n < SIZE256; n++ )
      ctx->buffer[n] = m512_zero;

   // The length field is the only non-zero part of the IV, so it is
   // written as a constant after the state has been cleared.
   ctx->chaining[ 3 ] = m512_const2_64( 0, 0x0100000000000000 );

   ctx->hashlen = hashlen;
   ctx->rem_ptr = 0;
   ctx->buf_ptr = 0;

   return 0;
}
|
|
|
|
/*
 * One-shot 4-way Groestl-256: reset the context, absorb the input, pad,
 * run the output transform and store the digest.
 *
 * ctx     - working context; its chaining state and buffer are overwritten.
 * output  - receives 2 __m512i vectors (32 >> 4) taken from the tail of the
 *           chaining state.
 * input   - message data, read as an array of __m512i vectors.
 * datalen - message length; shifted right by 4 to obtain the number of
 *           vectors to read (presumably a per-lane byte count - confirm
 *           against callers).
 *
 * Always returns 0.
 */
int groestl256_4way_full( groestl256_4way_context* ctx, void* output,
                          const void* input, uint64_t datalen )
{
   const int vec_count = (int)datalen >> 4;
   const int digest_vecs = 32 >> 4;   // bytes to __m128i
   const int digest_start = SIZE256 - digest_vecs;
   uint64_t block_count = vec_count / SIZE256;
   __m512i* src = (__m512i*)input;
   int pos;

   // --- init ---

   for ( pos = 0; pos < SIZE256; pos++ )
      ctx->chaining[pos] = m512_zero;
   for ( pos = 0; pos < SIZE256; pos++ )
      ctx->buffer[pos] = m512_zero;

   // The length field is the only non-zero part of the IV.
   ctx->chaining[ 3 ] = m512_const2_64( 0, 0x0100000000000000 );
   ctx->buf_ptr = 0;

   // --- update ---

   // Whole blocks are compressed straight from the caller's memory.
   for ( pos = 0; pos < block_count; pos++ )
      TF512_4way( ctx->chaining, src + pos * SIZE256 );
   ctx->buf_ptr = block_count * SIZE256;

   // Any trailing partial block is staged in the context buffer.
   for ( pos = 0; pos < vec_count % SIZE256; pos++ )
      ctx->buffer[ pos ] = src[ ctx->buf_ptr + pos ];
   // pos now indexes the first free buffer slot.

   // --- final ---

   block_count++;   // the padding block counts towards the total

   if ( pos != SIZE256 - 1 )
   {
      // Marker byte first ...
      ctx->buffer[pos] = m512_const2_64( 0, 0x80 );
      pos++;

      // ... zero fill up to the last slot ...
      while ( pos < SIZE256 - 1 )
      {
         ctx->buffer[pos] = m512_zero;
         pos++;
      }

      // ... and the block count in the last slot. The second last byte
      // is zero unless block_count > 255.
      ctx->buffer[pos] = m512_const2_64( block_count << 56, 0 );
   }
   else
   {
      // Exactly one free slot: marker and block count share a vector.
      ctx->buffer[pos] = m512_const2_64( block_count << 56, 0x80 );
   }

   // Compress the padding block, then apply the output transform.
   TF512_4way( ctx->chaining, ctx->buffer );
   OF512_4way( ctx->chaining );

   // The digest is the tail of the chaining state.
   for ( pos = 0; pos < digest_vecs; pos++ )
      casti_m512i( output, pos ) = ctx->chaining[ digest_start + pos ];

   return 0;
}
|
|
|
|
/*
 * Absorb the last piece of a message into an already initialised 4-way
 * context, pad, run the output transform and store the digest.
 *
 * ctx        - context whose chaining state, buffer, hashlen and rem_ptr
 *              are used as found.
 * output     - receives ctx->hashlen / 16 __m512i vectors taken from the
 *              tail of the chaining state.
 * input      - message data, read as an array of __m512i vectors.
 * databitlen - message length; divided by 128 to obtain the number of
 *              vectors to read.
 *
 * Always returns 0.
 *
 * NOTE(review): nothing in this function keeps ctx->rem_ptr plus the
 * trailing vector count below SIZE256 - confirm that callers guarantee it.
 */
int groestl256_4way_update_close( groestl256_4way_context* ctx, void* output,
                                  const void* input, uint64_t databitlen )
{
   const int vec_count = (int)databitlen / 128;
   const int digest_vecs = ctx->hashlen / 16;   // bytes to __m128i
   const int digest_start = SIZE256 - digest_vecs;
   int buffered = ctx->rem_ptr;
   uint64_t block_count = vec_count / SIZE256;
   __m512i* src = (__m512i*)input;
   int pos;

   // --- update ---

   // Whole blocks are compressed straight from the caller's memory.
   for ( pos = 0; pos < block_count; pos++ )
      TF512_4way( ctx->chaining, src + pos * SIZE256 );
   ctx->buf_ptr = block_count * SIZE256;

   // Append the trailing vectors after whatever the buffer already holds
   // (it may contain data from a previous update for a midstate precalc).
   for ( pos = 0; pos < vec_count % SIZE256; pos++ )
      ctx->buffer[ buffered + pos ] = src[ ctx->buf_ptr + pos ];
   pos += buffered;   // pos now indexes the first free buffer slot

   // --- final ---

   block_count++;   // the padding block counts towards the total

   if ( pos != SIZE256 - 1 )
   {
      // Marker byte first ...
      ctx->buffer[pos] = m512_const2_64( 0, 0x80 );
      pos++;

      // ... zero fill up to the last slot ...
      while ( pos < SIZE256 - 1 )
      {
         ctx->buffer[pos] = m512_zero;
         pos++;
      }

      // ... and the block count in the last slot. The second last byte
      // is zero unless block_count > 255.
      ctx->buffer[pos] = m512_const2_64( block_count << 56, 0 );
   }
   else
   {
      // Exactly one free slot: marker and block count share a vector.
      ctx->buffer[pos] = m512_const2_64( block_count << 56, 0x80 );
   }

   // Compress the padding block, then apply the output transform.
   TF512_4way( ctx->chaining, ctx->buffer );
   OF512_4way( ctx->chaining );

   // The digest is the tail of the chaining state.
   for ( pos = 0; pos < digest_vecs; pos++ )
      casti_m512i( output, pos ) = ctx->chaining[ digest_start + pos ];

   return 0;
}
|
|
|
|
#endif // AVX512
|
|
|
|
// AVX2 + VAES
|
|
|
|
/*
 * Reset a 2-way Groestl-256 context.
 *
 * ctx     - context to reset; entries 0..SIZE256-1 of both the chaining
 *           state and the message buffer are cleared.
 * hashlen - digest length, stored unchanged in ctx->hashlen.
 *
 * Always returns 0.
 */
int groestl256_2way_init( groestl256_2way_context* ctx, uint64_t hashlen )
{
   int n;

   // Clear the chaining state, then the message buffer.
   for ( n = 0; n < SIZE256; n++ )
      ctx->chaining[n] = m256_zero;
   for ( n = 0; n < SIZE256; n++ )
      ctx->buffer[n] = m256_zero;

   // The length field is the only non-zero part of the IV, so it is
   // written as a constant after the state has been cleared.
   ctx->chaining[ 3 ] = m256_const2_64( 0, 0x0100000000000000 );

   ctx->hashlen = hashlen;
   ctx->rem_ptr = 0;
   ctx->buf_ptr = 0;

   return 0;
}
|
|
|
|
/*
 * One-shot 2-way Groestl-256: reset the context, absorb the input, pad,
 * run the output transform and store the digest.
 *
 * ctx     - working context; its chaining state and buffer are overwritten.
 * output  - receives 2 __m256i vectors (32 >> 4) taken from the tail of the
 *           chaining state.
 * input   - message data, read as an array of __m256i vectors.
 * datalen - message length; shifted right by 4 to obtain the number of
 *           vectors to read (presumably a per-lane byte count - confirm
 *           against callers).
 *
 * Always returns 0.
 */
int groestl256_2way_full( groestl256_2way_context* ctx, void* output,
                          const void* input, uint64_t datalen )
{
   const int vec_count = (int)datalen >> 4;
   const int digest_vecs = 32 >> 4;   // bytes to __m128i
   const int digest_start = SIZE256 - digest_vecs;
   uint64_t block_count = vec_count / SIZE256;
   __m256i* src = (__m256i*)input;
   int pos;

   // --- init ---

   for ( pos = 0; pos < SIZE256; pos++ )
      ctx->chaining[pos] = m256_zero;
   for ( pos = 0; pos < SIZE256; pos++ )
      ctx->buffer[pos] = m256_zero;

   // The length field is the only non-zero part of the IV.
   ctx->chaining[ 3 ] = m256_const2_64( 0, 0x0100000000000000 );
   ctx->buf_ptr = 0;

   // --- update ---

   // Whole blocks are compressed straight from the caller's memory.
   for ( pos = 0; pos < block_count; pos++ )
      TF512_2way( ctx->chaining, src + pos * SIZE256 );
   ctx->buf_ptr = block_count * SIZE256;

   // Any trailing partial block is staged in the context buffer.
   for ( pos = 0; pos < vec_count % SIZE256; pos++ )
      ctx->buffer[ pos ] = src[ ctx->buf_ptr + pos ];
   // pos now indexes the first free buffer slot.

   // --- final ---

   block_count++;   // the padding block counts towards the total

   if ( pos != SIZE256 - 1 )
   {
      // Marker byte first ...
      ctx->buffer[pos] = m256_const2_64( 0, 0x80 );
      pos++;

      // ... zero fill up to the last slot ...
      while ( pos < SIZE256 - 1 )
      {
         ctx->buffer[pos] = m256_zero;
         pos++;
      }

      // ... and the block count in the last slot. The second last byte
      // is zero unless block_count > 255.
      ctx->buffer[pos] = m256_const2_64( block_count << 56, 0 );
   }
   else
   {
      // Exactly one free slot: marker and block count share a vector.
      ctx->buffer[pos] = m256_const2_64( block_count << 56, 0x80 );
   }

   // Compress the padding block, then apply the output transform.
   TF512_2way( ctx->chaining, ctx->buffer );
   OF512_2way( ctx->chaining );

   // The digest is the tail of the chaining state.
   for ( pos = 0; pos < digest_vecs; pos++ )
      casti_m256i( output, pos ) = ctx->chaining[ digest_start + pos ];

   return 0;
}
|
|
/*
 * Absorb the last piece of a message into an already initialised 2-way
 * context, pad, run the output transform and store the digest.
 *
 * ctx        - context whose chaining state, buffer, hashlen and rem_ptr
 *              are used as found.
 * output     - receives ctx->hashlen / 16 __m256i vectors taken from the
 *              tail of the chaining state.
 * input      - message data, read as an array of __m256i vectors.
 * databitlen - message length; divided by 128 to obtain the number of
 *              vectors to read.
 *
 * Always returns 0.
 *
 * NOTE(review): nothing in this function keeps ctx->rem_ptr plus the
 * trailing vector count below SIZE256 - confirm that callers guarantee it.
 */
int groestl256_2way_update_close( groestl256_2way_context* ctx, void* output,
                                  const void* input, uint64_t databitlen )
{
   const int vec_count = (int)databitlen / 128;
   const int digest_vecs = ctx->hashlen / 16;   // bytes to __m128i
   const int digest_start = SIZE256 - digest_vecs;
   int buffered = ctx->rem_ptr;
   uint64_t block_count = vec_count / SIZE256;
   __m256i* src = (__m256i*)input;
   int pos;

   // --- update ---

   // Whole blocks are compressed straight from the caller's memory.
   for ( pos = 0; pos < block_count; pos++ )
      TF512_2way( ctx->chaining, src + pos * SIZE256 );
   ctx->buf_ptr = block_count * SIZE256;

   // Append the trailing vectors after whatever the buffer already holds
   // (it may contain data from a previous update for a midstate precalc).
   for ( pos = 0; pos < vec_count % SIZE256; pos++ )
      ctx->buffer[ buffered + pos ] = src[ ctx->buf_ptr + pos ];
   pos += buffered;   // pos now indexes the first free buffer slot

   // --- final ---

   block_count++;   // the padding block counts towards the total

   if ( pos != SIZE256 - 1 )
   {
      // Marker byte first ...
      ctx->buffer[pos] = m256_const2_64( 0, 0x80 );
      pos++;

      // ... zero fill up to the last slot ...
      while ( pos < SIZE256 - 1 )
      {
         ctx->buffer[pos] = m256_zero;
         pos++;
      }

      // ... and the block count in the last slot. The second last byte
      // is zero unless block_count > 255.
      ctx->buffer[pos] = m256_const2_64( block_count << 56, 0 );
   }
   else
   {
      // Exactly one free slot: marker and block count share a vector.
      ctx->buffer[pos] = m256_const2_64( block_count << 56, 0x80 );
   }

   // Compress the padding block, then apply the output transform.
   TF512_2way( ctx->chaining, ctx->buffer );
   OF512_2way( ctx->chaining );

   // The digest is the tail of the chaining state.
   for ( pos = 0; pos < digest_vecs; pos++ )
      casti_m256i( output, pos ) = ctx->chaining[ digest_start + pos ];

   return 0;
}
|
|
|
|
#endif // VAES
|