mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.10.2
This commit is contained in:
715
algo/lyra2/lyra2-hash-2way.c
Normal file
715
algo/lyra2/lyra2-hash-2way.c
Normal file
@@ -0,0 +1,715 @@
|
||||
/**
|
||||
* Implementation of the Lyra2 Password Hashing Scheme (PHS).
|
||||
*
|
||||
* Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
|
||||
*
|
||||
* This software is hereby placed in the public domain.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
|
||||
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
||||
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
|
||||
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
#include <mm_malloc.h>
|
||||
#include "compat.h"
|
||||
#include "lyra2.h"
|
||||
#include "sponge.h"
|
||||
|
||||
/**
|
||||
* Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords
|
||||
* whose combined length is smaller than the size of the memory matrix, (i.e., (nRows x nCols x b) bits,
|
||||
* where "b" is the underlying sponge's bitrate). In this implementation, the "basil" is composed by all
|
||||
* integer parameters (treated as type "unsigned int") in the order they are provided, plus the value
|
||||
* of nCols, (i.e., basil = kLen || pwdlen || saltlen || timeCost || nRows || nCols).
|
||||
*
|
||||
* @param K The derived key to be output by the algorithm
|
||||
* @param kLen Desired key length
|
||||
* @param pwd User password
|
||||
* @param pwdlen Password length
|
||||
* @param salt Salt
|
||||
* @param saltlen Salt length
|
||||
* @param timeCost Parameter to determine the processing time (T)
|
||||
* @param nRows Number or rows of the memory matrix (R)
|
||||
* @param nCols Number of columns of the memory matrix (C)
|
||||
*
|
||||
* @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation)
|
||||
*/
|
||||
|
||||
int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
|
||||
const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
|
||||
const uint64_t timeCost, const uint64_t nRows,
|
||||
const uint64_t nCols )
|
||||
{
|
||||
//====================== Basic variables ============================//
|
||||
uint64_t _ALIGN(256) state[16];
|
||||
int64_t row = 2; //index of row to be processed
|
||||
int64_t prev = 1; //index of prev (last row ever computed/modified)
|
||||
int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
|
||||
int64_t tau; //Time Loop iterator
|
||||
int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
|
||||
int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
|
||||
int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
|
||||
// int64_t i; //auxiliary iteration counter
|
||||
int64_t v64; // 64bit var for memcpy
|
||||
//====================================================================/
|
||||
|
||||
//=== Initializing the Memory Matrix and pointers to it =============//
|
||||
//Tries to allocate enough space for the whole memory matrix
|
||||
|
||||
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
|
||||
// const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
|
||||
// for Lyra2REv2, nCols = 4, v1 was using 8
|
||||
const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
|
||||
: BLOCK_LEN_BLAKE2_SAFE_BYTES;
|
||||
uint64_t *ptrWord = wholeMatrix;
|
||||
|
||||
// memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
|
||||
|
||||
//=== Getting the password + salt + basil padded with 10*1 ==========//
|
||||
//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
|
||||
//but this ensures that the password copied locally will be overwritten as soon as possible
|
||||
|
||||
//First, we clean enough blocks for the password, salt, basil and padding
|
||||
int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
|
||||
/ BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
|
||||
|
||||
byte *ptrByte = (byte*) wholeMatrix;
|
||||
|
||||
//Prepends the password
|
||||
memcpy(ptrByte, pwd, pwdlen);
|
||||
ptrByte += pwdlen;
|
||||
|
||||
//Concatenates the salt
|
||||
memcpy(ptrByte, salt, saltlen);
|
||||
ptrByte += saltlen;
|
||||
|
||||
memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
|
||||
- (saltlen + pwdlen) );
|
||||
|
||||
//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
|
||||
memcpy(ptrByte, &kLen, sizeof(int64_t));
|
||||
ptrByte += sizeof(uint64_t);
|
||||
v64 = pwdlen;
|
||||
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||
ptrByte += sizeof(uint64_t);
|
||||
v64 = saltlen;
|
||||
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||
ptrByte += sizeof(uint64_t);
|
||||
v64 = timeCost;
|
||||
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||
ptrByte += sizeof(uint64_t);
|
||||
v64 = nRows;
|
||||
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||
ptrByte += sizeof(uint64_t);
|
||||
v64 = nCols;
|
||||
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||
ptrByte += sizeof(uint64_t);
|
||||
|
||||
//Now comes the padding
|
||||
*ptrByte = 0x80; //first byte of padding: right after the password
|
||||
ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
|
||||
ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
|
||||
*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
|
||||
|
||||
// from here on it's all simd acces to state and matrix
|
||||
// define vector pointers and adjust sizes and pointer offsets
|
||||
|
||||
//================= Initializing the Sponge State ====================//
|
||||
//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
|
||||
|
||||
// initState( state );
|
||||
|
||||
//========================= Setup Phase =============================//
|
||||
//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
|
||||
|
||||
ptrWord = wholeMatrix;
|
||||
|
||||
absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN );
|
||||
/*
|
||||
for (i = 0; i < nBlocksInput; i++)
|
||||
{
|
||||
absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
|
||||
ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
|
||||
}
|
||||
*/
|
||||
|
||||
//Initializes M[0] and M[1]
|
||||
reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here
|
||||
|
||||
reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
|
||||
nCols);
|
||||
|
||||
do
|
||||
{
|
||||
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
|
||||
|
||||
reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64],
|
||||
&wholeMatrix[rowa*ROW_LEN_INT64],
|
||||
&wholeMatrix[row*ROW_LEN_INT64], nCols );
|
||||
|
||||
//updates the value of row* (deterministically picked during Setup))
|
||||
rowa = (rowa + step) & (window - 1);
|
||||
//update prev: it now points to the last row ever computed
|
||||
|
||||
prev = row;
|
||||
//updates row: goes to the next row to be computed
|
||||
row++;
|
||||
|
||||
//Checks if all rows in the window where visited.
|
||||
if (rowa == 0)
|
||||
{
|
||||
step = window + gap; //changes the step: approximately doubles its value
|
||||
window *= 2; //doubles the size of the re-visitation window
|
||||
gap = -gap; //inverts the modifier to the step
|
||||
}
|
||||
|
||||
} while (row < nRows);
|
||||
|
||||
//===================== Wandering Phase =============================//
|
||||
row = 0; //Resets the visitation to the first row of the memory matrix
|
||||
for (tau = 1; tau <= timeCost; tau++)
|
||||
{
|
||||
//Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
|
||||
step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
|
||||
do
|
||||
{
|
||||
//Selects a pseudorandom index row*
|
||||
//-----------------------------------------------
|
||||
rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
|
||||
|
||||
//rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
|
||||
//-------------------------------------------
|
||||
|
||||
//Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
|
||||
reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64],
|
||||
&wholeMatrix[rowa*ROW_LEN_INT64],
|
||||
&wholeMatrix[row*ROW_LEN_INT64], nCols );
|
||||
//update prev: it now points to the last row ever computed
|
||||
prev = row;
|
||||
|
||||
//updates row: goes to the next row to be computed
|
||||
//----------------------------------------------------
|
||||
row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
|
||||
//row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
|
||||
//----------------------------------------------------
|
||||
|
||||
} while (row != 0);
|
||||
}
|
||||
|
||||
//===================== Wrap-up Phase ===============================//
|
||||
//Absorbs the last block of the memory matrix
|
||||
absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
|
||||
//Squeezes the key
|
||||
squeeze(state, K, (unsigned int) kLen);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////
|
||||
|
||||
// 2 way 256
|
||||
// drop salt, salt len arguments, hard code some others.
|
||||
// Data is interleaved 2x256.
|
||||
|
||||
int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
|
||||
const void *pwd, const uint64_t pwdlen, const void *salt,
|
||||
const uint64_t saltlen, const uint64_t timeCost, const uint64_t nRows,
|
||||
const uint64_t nCols )
|
||||
{
|
||||
//====================== Basic variables ============================//
|
||||
uint64_t _ALIGN(256) state[16];
|
||||
int64_t row = 2; //index of row to be processed
|
||||
int64_t prev = 1; //index of prev (last row ever computed/modified)
|
||||
int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
|
||||
int64_t tau; //Time Loop iterator
|
||||
int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
|
||||
int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
|
||||
int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
|
||||
// int64_t i; //auxiliary iteration counter
|
||||
int64_t v64; // 64bit var for memcpy
|
||||
uint64_t instance0 = 0; // Seperate instance for each lane
|
||||
uint64_t instance1 = 0;
|
||||
//====================================================================/
|
||||
|
||||
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
|
||||
const int64_t BLOCK_LEN = BLOCK_LEN_BLAKE2_SAFE_INT64;
|
||||
|
||||
uint64_t *ptrWord = wholeMatrix;
|
||||
|
||||
// 2 way 256 rewrite. Salt always == password, and data is interleaved,
|
||||
// need to build in parallel:
|
||||
// { password, (64 or 80 bytes)
|
||||
// salt, (64 or 80 bytes) = same as password
|
||||
// Klen, (u64) = 32 bytes
|
||||
// pwdlen, (u64)
|
||||
// saltlen, (u64)
|
||||
// timecost, (u64)
|
||||
// nrows, (u64)
|
||||
// ncols, (u64)
|
||||
// 0x80, (byte)
|
||||
// { 0 .. 0 },
|
||||
// 1 (byte)
|
||||
// }
|
||||
|
||||
// memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
|
||||
|
||||
int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
|
||||
/ BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
|
||||
|
||||
byte *ptrByte = (byte*) wholeMatrix;
|
||||
|
||||
//Prepends the password
|
||||
memcpy(ptrByte, pwd, pwdlen);
|
||||
ptrByte += pwdlen;
|
||||
|
||||
//Concatenates the salt
|
||||
memcpy(ptrByte, salt, saltlen);
|
||||
ptrByte += saltlen;
|
||||
|
||||
memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
|
||||
- (saltlen + pwdlen) );
|
||||
|
||||
//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
|
||||
memcpy(ptrByte, &kLen, sizeof(int64_t));
|
||||
ptrByte += sizeof(uint64_t);
|
||||
v64 = pwdlen;
|
||||
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||
ptrByte += sizeof(uint64_t);
|
||||
v64 = saltlen;
|
||||
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||
ptrByte += sizeof(uint64_t);
|
||||
v64 = timeCost;
|
||||
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||
ptrByte += sizeof(uint64_t);
|
||||
v64 = nRows;
|
||||
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||
ptrByte += sizeof(uint64_t);
|
||||
v64 = nCols;
|
||||
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||
ptrByte += sizeof(uint64_t);
|
||||
|
||||
//Now comes the padding
|
||||
*ptrByte = 0x80; //first byte of padding: right after the password
|
||||
ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
|
||||
ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
|
||||
*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
|
||||
|
||||
// from here on it's all simd acces to state and matrix
|
||||
// define vector pointers and adjust sizes and pointer offsets
|
||||
|
||||
ptrWord = wholeMatrix;
|
||||
|
||||
absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN );
|
||||
reducedSqueezeRow0( state, &wholeMatrix[0], nCols );
|
||||
|
||||
reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
|
||||
nCols);
|
||||
|
||||
do
|
||||
{
|
||||
|
||||
reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64],
|
||||
&wholeMatrix[rowa*ROW_LEN_INT64],
|
||||
&wholeMatrix[row*ROW_LEN_INT64], nCols );
|
||||
|
||||
rowa = (rowa + step) & (window - 1);
|
||||
|
||||
prev = row;
|
||||
row++;
|
||||
|
||||
if (rowa == 0)
|
||||
{
|
||||
step = window + gap; //changes the step: approximately doubles its value
|
||||
window *= 2; //doubles the size of the re-visitation window
|
||||
gap = -gap; //inverts the modifier to the step
|
||||
}
|
||||
|
||||
} while (row < nRows);
|
||||
|
||||
row = 0;
|
||||
for (tau = 1; tau <= timeCost; tau++)
|
||||
{
|
||||
step = ((tau & 1) == 0) ? -1 : (nRows >> 1) - 1;
|
||||
do
|
||||
{
|
||||
// This part is not parallel, rowa will be different for each lane.
|
||||
// state (u64[16]) is interleaved 2x256, need to extract seperately.
|
||||
|
||||
// index = 2 * instance / 4 * 4 + instance % 4
|
||||
uint64_t index0 = ( ( (instance0 & 0xf) >> 3 ) << 2 )
|
||||
+ ( instance0 & 0x3 )
|
||||
uint64_t index1 = ( ( (instance1 & 0xf) >> 3 ) << 2 )
|
||||
+ ( instance1 & 0x3 )
|
||||
|
||||
instance0 = state[ index0 ] & 0xf;
|
||||
instance1 = (state+4)[ index1 ] & 0xf;
|
||||
|
||||
rowa0 = state[ instance0 ];
|
||||
rowa1 = (state+4)[ instance1 ];
|
||||
|
||||
reducedDuplexRow_2way( state, &wholeMatrix[prev*ROW_LEN_INT64],
|
||||
&wholeMatrix[rowa0*ROW_LEN_INT64],
|
||||
&wholeMatrix[rowa1*ROW_LEN_INT64],
|
||||
&wholeMatrix[row*ROW_LEN_INT64], nCols );
|
||||
/*
|
||||
instance = state[instance & 0xF];
|
||||
rowa = state[instance & 0xF] & (unsigned int)(nRows-1);
|
||||
|
||||
reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64],
|
||||
&wholeMatrix[rowa*ROW_LEN_INT64],
|
||||
&wholeMatrix[row*ROW_LEN_INT64], nCols );
|
||||
*/
|
||||
// End of divergence.
|
||||
|
||||
prev = row;
|
||||
row = (row + step) & (unsigned int)(nRows-1);
|
||||
|
||||
} while ( row != 0 );
|
||||
}
|
||||
|
||||
absorbBlock( state, &wholeMatrix[rowa*ROW_LEN_INT64] );
|
||||
squeeze( state, K, (unsigned int) kLen );
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
//////////////////////////////////////////////////
|
||||
int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
|
||||
const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
|
||||
const uint64_t timeCost, const uint64_t nRows,
|
||||
const uint64_t nCols )
|
||||
{
|
||||
//========================== Basic variables ============================//
|
||||
uint64_t _ALIGN(256) state[16];
|
||||
int64_t row = 2; //index of row to be processed
|
||||
int64_t prev = 1; //index of prev (last row ever computed/modified)
|
||||
int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
|
||||
int64_t tau; //Time Loop iterator
|
||||
int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
|
||||
int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
|
||||
int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
|
||||
// int64_t i; //auxiliary iteration counter
|
||||
//=======================================================================/
|
||||
|
||||
//======= Initializing the Memory Matrix and pointers to it =============//
|
||||
//Tries to allocate enough space for the whole memory matrix
|
||||
|
||||
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
|
||||
// const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
|
||||
|
||||
// memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
|
||||
|
||||
//==== Getting the password + salt + basil padded with 10*1 ============//
|
||||
//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
|
||||
//but this ensures that the password copied locally will be overwritten as soon as possible
|
||||
|
||||
//First, we clean enough blocks for the password, salt, basil and padding
|
||||
uint64_t nBlocksInput = ( ( saltlen + pwdlen + 6 *
|
||||
sizeof (uint64_t) ) / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
|
||||
byte *ptrByte = (byte*) wholeMatrix;
|
||||
memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES );
|
||||
|
||||
//Prepends the password
|
||||
memcpy(ptrByte, pwd, pwdlen);
|
||||
ptrByte += pwdlen;
|
||||
|
||||
//Concatenates the salt
|
||||
memcpy(ptrByte, salt, saltlen);
|
||||
ptrByte += saltlen;
|
||||
//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
|
||||
memcpy(ptrByte, &kLen, sizeof (uint64_t));
|
||||
ptrByte += sizeof (uint64_t);
|
||||
memcpy(ptrByte, &pwdlen, sizeof (uint64_t));
|
||||
ptrByte += sizeof (uint64_t);
|
||||
memcpy(ptrByte, &saltlen, sizeof (uint64_t));
|
||||
ptrByte += sizeof (uint64_t);
|
||||
memcpy(ptrByte, &timeCost, sizeof (uint64_t));
|
||||
ptrByte += sizeof (uint64_t);
|
||||
memcpy(ptrByte, &nRows, sizeof (uint64_t));
|
||||
ptrByte += sizeof (uint64_t);
|
||||
memcpy(ptrByte, &nCols, sizeof (uint64_t));
|
||||
ptrByte += sizeof (uint64_t);
|
||||
|
||||
//Now comes the padding
|
||||
*ptrByte = 0x80; //first byte of padding: right after the password
|
||||
ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
|
||||
ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
|
||||
*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
|
||||
|
||||
//=================== Initializing the Sponge State ====================//
|
||||
//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
|
||||
// uint64_t *state = _mm_malloc(16 * sizeof(uint64_t), 32);
|
||||
// if (state == NULL) {
|
||||
// return -1;
|
||||
// }
|
||||
// initState( state );
|
||||
|
||||
//============================== Setup Phase =============================//
|
||||
//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
|
||||
uint64_t *ptrWord = wholeMatrix;
|
||||
|
||||
absorbBlockBlake2Safe( state, ptrWord, nBlocksInput,
|
||||
BLOCK_LEN_BLAKE2_SAFE_INT64 );
|
||||
/*
|
||||
for ( i = 0; i < nBlocksInput; i++ )
|
||||
{
|
||||
absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
|
||||
ptrWord += BLOCK_LEN_BLAKE2_SAFE_INT64; //goes to next block of pad(pwd || salt || basil)
|
||||
}
|
||||
*/
|
||||
//Initializes M[0] and M[1]
|
||||
reducedSqueezeRow0(state, &wholeMatrix[0], nCols); //The locally copied password is most likely overwritten here
|
||||
reducedDuplexRow1(state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64], nCols);
|
||||
|
||||
do {
|
||||
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
|
||||
reducedDuplexRowSetup(state, &wholeMatrix[prev*ROW_LEN_INT64], &wholeMatrix[rowa*ROW_LEN_INT64], &wholeMatrix[row*ROW_LEN_INT64], nCols);
|
||||
|
||||
//updates the value of row* (deterministically picked during Setup))
|
||||
rowa = (rowa + step) & (window - 1);
|
||||
//update prev: it now points to the last row ever computed
|
||||
prev = row;
|
||||
//updates row: goes to the next row to be computed
|
||||
row++;
|
||||
|
||||
//Checks if all rows in the window where visited.
|
||||
if (rowa == 0) {
|
||||
step = window + gap; //changes the step: approximately doubles its value
|
||||
window *= 2; //doubles the size of the re-visitation window
|
||||
gap = -gap; //inverts the modifier to the step
|
||||
}
|
||||
|
||||
} while (row < nRows);
|
||||
|
||||
//======================== Wandering Phase =============================//
|
||||
row = 0; //Resets the visitation to the first row of the memory matrix
|
||||
for ( tau = 1; tau <= timeCost; tau++ )
|
||||
{
|
||||
//Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
|
||||
step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
|
||||
do {
|
||||
//Selects a pseudorandom index row*
|
||||
//----------------------------------------------------------------------
|
||||
//rowa = ((unsigned int)state[0]) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
|
||||
rowa = ((uint64_t) (state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
|
||||
//-----------------------------------------------------------------
|
||||
|
||||
//Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
|
||||
reducedDuplexRow(state, &wholeMatrix[prev*ROW_LEN_INT64], &wholeMatrix[rowa*ROW_LEN_INT64], &wholeMatrix[row*ROW_LEN_INT64], nCols);
|
||||
|
||||
//update prev: it now points to the last row ever computed
|
||||
prev = row;
|
||||
|
||||
//updates row: goes to the next row to be computed
|
||||
//---------------------------------------------------------------
|
||||
//row = (row + step) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
|
||||
row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
|
||||
//--------------------------------------------------------------------
|
||||
|
||||
} while (row != 0);
|
||||
}
|
||||
|
||||
//========================= Wrap-up Phase ===============================//
|
||||
//Absorbs the last block of the memory matrix
|
||||
absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
|
||||
|
||||
//Squeezes the key
|
||||
squeeze( state, K, kLen );
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Lyra2RE doesn't like the new wholeMatrix implementation
|
||||
int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
|
||||
const void *salt, const uint64_t saltlen, const uint64_t timeCost,
|
||||
const uint64_t nRows, const uint64_t nCols )
|
||||
{
|
||||
//====================== Basic variables ============================//
|
||||
uint64_t _ALIGN(256) state[16];
|
||||
int64_t row = 2; //index of row to be processed
|
||||
int64_t prev = 1; //index of prev (last row ever computed/modified)
|
||||
int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
|
||||
int64_t tau; //Time Loop iterator
|
||||
int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
|
||||
int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
|
||||
int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
|
||||
int64_t i; //auxiliary iteration counter
|
||||
int64_t v64; // 64bit var for memcpy
|
||||
//====================================================================/
|
||||
|
||||
//=== Initializing the Memory Matrix and pointers to it =============//
|
||||
//Tries to allocate enough space for the whole memory matrix
|
||||
|
||||
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
|
||||
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
|
||||
// for Lyra2REv2, nCols = 4, v1 was using 8
|
||||
const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
|
||||
: BLOCK_LEN_BLAKE2_SAFE_BYTES;
|
||||
|
||||
i = (int64_t)ROW_LEN_BYTES * nRows;
|
||||
uint64_t *wholeMatrix = _mm_malloc( i, 64 );
|
||||
if (wholeMatrix == NULL)
|
||||
return -1;
|
||||
|
||||
#if defined(__AVX2__)
|
||||
memset_zero_256( (__m256i*)wholeMatrix, i>>5 );
|
||||
#elif defined(__SSE2__)
|
||||
memset_zero_128( (__m128i*)wholeMatrix, i>>4 );
|
||||
#else
|
||||
memset( wholeMatrix, 0, i );
|
||||
#endif
|
||||
|
||||
uint64_t *ptrWord = wholeMatrix;
|
||||
|
||||
//=== Getting the password + salt + basil padded with 10*1 ==========//
|
||||
//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
|
||||
//but this ensures that the password copied locally will be overwritten as soon as possible
|
||||
|
||||
//First, we clean enough blocks for the password, salt, basil and padding
|
||||
int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
|
||||
/ BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
|
||||
|
||||
byte *ptrByte = (byte*) wholeMatrix;
|
||||
|
||||
//Prepends the password
|
||||
memcpy(ptrByte, pwd, pwdlen);
|
||||
ptrByte += pwdlen;
|
||||
|
||||
//Concatenates the salt
|
||||
memcpy(ptrByte, salt, saltlen);
|
||||
ptrByte += saltlen;
|
||||
|
||||
// memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
|
||||
// - (saltlen + pwdlen) );
|
||||
|
||||
//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
|
||||
memcpy(ptrByte, &kLen, sizeof(int64_t));
|
||||
ptrByte += sizeof(uint64_t);
|
||||
v64 = pwdlen;
|
||||
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||
ptrByte += sizeof(uint64_t);
|
||||
v64 = saltlen;
|
||||
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||
ptrByte += sizeof(uint64_t);
|
||||
v64 = timeCost;
|
||||
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||
ptrByte += sizeof(uint64_t);
|
||||
v64 = nRows;
|
||||
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||
ptrByte += sizeof(uint64_t);
|
||||
v64 = nCols;
|
||||
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||
ptrByte += sizeof(uint64_t);
|
||||
|
||||
//Now comes the padding
|
||||
*ptrByte = 0x80; //first byte of padding: right after the password
|
||||
ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
|
||||
ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
|
||||
*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
|
||||
|
||||
//================= Initializing the Sponge State ====================//
|
||||
//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
|
||||
|
||||
// initState( state );
|
||||
|
||||
//========================= Setup Phase =============================//
|
||||
//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
|
||||
|
||||
ptrWord = wholeMatrix;
|
||||
|
||||
absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN );
|
||||
/*
|
||||
for (i = 0; i < nBlocksInput; i++)
|
||||
{
|
||||
absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
|
||||
ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
|
||||
}
|
||||
*/
|
||||
//Initializes M[0] and M[1]
|
||||
reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here
|
||||
|
||||
reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
|
||||
nCols);
|
||||
|
||||
do
|
||||
{
|
||||
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
|
||||
|
||||
reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64],
|
||||
&wholeMatrix[rowa*ROW_LEN_INT64],
|
||||
&wholeMatrix[row*ROW_LEN_INT64], nCols );
|
||||
|
||||
//updates the value of row* (deterministically picked during Setup))
|
||||
rowa = (rowa + step) & (window - 1);
|
||||
//update prev: it now points to the last row ever computed
|
||||
|
||||
prev = row;
|
||||
//updates row: goes to the next row to be computed
|
||||
row++;
|
||||
|
||||
//Checks if all rows in the window where visited.
|
||||
if (rowa == 0)
|
||||
{
|
||||
step = window + gap; //changes the step: approximately doubles its value
|
||||
window *= 2; //doubles the size of the re-visitation window
|
||||
gap = -gap; //inverts the modifier to the step
|
||||
}
|
||||
|
||||
} while (row < nRows);
|
||||
|
||||
//===================== Wandering Phase =============================//
|
||||
row = 0; //Resets the visitation to the first row of the memory matrix
|
||||
for (tau = 1; tau <= timeCost; tau++)
|
||||
{
|
||||
//Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
|
||||
step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
|
||||
do
|
||||
{
|
||||
//Selects a pseudorandom index row*
|
||||
//-----------------------------------------------
|
||||
rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
|
||||
|
||||
//rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
|
||||
//-------------------------------------------
|
||||
|
||||
//Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
|
||||
reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64],
|
||||
&wholeMatrix[rowa*ROW_LEN_INT64],
|
||||
&wholeMatrix[row*ROW_LEN_INT64], nCols );
|
||||
//update prev: it now points to the last row ever computed
|
||||
prev = row;
|
||||
|
||||
//updates row: goes to the next row to be computed
|
||||
//----------------------------------------------------
|
||||
row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
|
||||
//row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
|
||||
//----------------------------------------------------
|
||||
|
||||
} while (row != 0);
|
||||
}
|
||||
|
||||
//===================== Wrap-up Phase ===============================//
|
||||
//Absorbs the last block of the memory matrix
|
||||
absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
|
||||
//Squeezes the key
|
||||
squeeze(state, K, (unsigned int) kLen);
|
||||
|
||||
//================== Freeing the memory =============================//
|
||||
_mm_free(wholeMatrix);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
319
algo/lyra2/sponge-2way.c
Normal file
319
algo/lyra2/sponge-2way.c
Normal file
@@ -0,0 +1,319 @@
|
||||
/**
|
||||
* A simple implementation of Blake2b's internal permutation
|
||||
* in the form of a sponge.
|
||||
*
|
||||
* Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
|
||||
*
|
||||
* This software is hereby placed in the public domain.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
|
||||
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
||||
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
|
||||
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "algo-gate.h"
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <time.h>
|
||||
#include <immintrin.h>
|
||||
#include "sponge.h"
|
||||
#include "lyra2.h"
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len )
|
||||
{
|
||||
const int len_m256i = len / 32;
|
||||
const int fullBlocks = len_m256i / BLOCK_LEN_M256I;
|
||||
__m512i* state = (__m512i*)State;
|
||||
__m512i* out = (__m512i*)Out;
|
||||
int i;
|
||||
|
||||
//Squeezes full blocks
|
||||
for ( i = 0; i < fullBlocks; i++ )
|
||||
{
|
||||
memcpy_512( out, state, BLOCK_LEN_M256I*2 );
|
||||
LYRA_ROUND_2WAY_AVX2( state[0], state[1], state[2], state[3] );
|
||||
out += BLOCK_LEN_M256I*2;
|
||||
}
|
||||
//Squeezes remaining bytes
|
||||
memcpy_512( out, state, ( (len_m256i % BLOCK_LEN_M256I) * 2 ) );
|
||||
}
|
||||
|
||||
inline void absorbBlock_2way( uint64_t *State, const uint64_t *In )
|
||||
{
|
||||
register __m512i state0, state1, state2, state3;
|
||||
__m512i *in = (__m512i*)In;
|
||||
|
||||
state0 = _mm512_load_si512( (__m512i*)State );
|
||||
state1 = _mm512_load_si512( (__m512i*)State + 1 );
|
||||
state2 = _mm512_load_si512( (__m512i*)State + 2 );
|
||||
state3 = _mm512_load_si512( (__m512i*)State + 3 );
|
||||
|
||||
state0 = _mm512_xor_si512( state0, in[0] );
|
||||
state1 = _mm512_xor_si512( state1, in[1] );
|
||||
state2 = _mm512_xor_si512( state2, in[2] );
|
||||
|
||||
LYRA_12_ROUNDS_2WAY_AVX512( state0, state1, state2, state3 );
|
||||
|
||||
_mm512_store_si512( (__m512i*)State, state0 );
|
||||
_mm512_store_si512( (__m512i*)State + 1, state1 );
|
||||
_mm512_store_si512( (__m512i*)State + 2, state2 );
|
||||
_mm512_store_si512( (__m512i*)State + 3, state3 );
|
||||
|
||||
}
|
||||
|
||||
inline void absorbBlockBlake2Safe_2way( uint64_t *State, const uint64_t *In,
|
||||
const uint64_t nBlocks, const uint64_t block_len )
|
||||
{
|
||||
register __m512i state0, state1, state2, state3;
|
||||
|
||||
state0 =
|
||||
state1 = m512_zero;
|
||||
state2 = m512_const4_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL,
|
||||
0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
|
||||
state3 = m512_const4_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL,
|
||||
0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
|
||||
|
||||
for ( int i = 0; i < nBlocks; i++ )
|
||||
{
|
||||
__m512i *in = (__m512i*)In;
|
||||
state0 = _mm512_xor_si512( state0, in[0] );
|
||||
state1 = _mm512_xor_si512( state1, in[1] );
|
||||
|
||||
LYRA_12_ROUNDS_2WAY_AVX512( state0, state1, state2, state3 );
|
||||
In += block_len * 2;
|
||||
}
|
||||
|
||||
_mm512_store_si512( (__m512i*)State, state0 );
|
||||
_mm512_store_si512( (__m512i*)State + 1, state1 );
|
||||
_mm512_store_si512( (__m512i*)State + 2, state2 );
|
||||
_mm512_store_si512( (__m512i*)State + 3, state3 );
|
||||
|
||||
}
|
||||
|
||||
inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut,
|
||||
uint64_t nCols )
|
||||
{
|
||||
int i;
|
||||
|
||||
//M[row][C-1-col] = H.reduced_squeeze()
|
||||
|
||||
|
||||
register __m512i state0, state1, state2, state3;
|
||||
__m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I * 2 );
|
||||
|
||||
state0 = _mm512_load_si512( (__m512i*)State );
|
||||
state1 = _mm512_load_si512( (__m512i*)State + 1 );
|
||||
state2 = _mm512_load_si512( (__m512i*)State + 2 );
|
||||
state3 = _mm512_load_si512( (__m512i*)State + 3 );
|
||||
|
||||
for ( i = 0; i < 9; i += 3)
|
||||
{
|
||||
_mm_prefetch( out - i, _MM_HINT_T0 );
|
||||
_mm_prefetch( out - i - 2, _MM_HINT_T0 );
|
||||
}
|
||||
|
||||
for ( i = 0; i < nCols; i++ )
|
||||
{
|
||||
_mm_prefetch( out - 9, _MM_HINT_T0 );
|
||||
_mm_prefetch( out - 11, _MM_HINT_T0 );
|
||||
|
||||
out[0] = state0;
|
||||
out[1] = state1;
|
||||
out[2] = state2;
|
||||
|
||||
//Goes to next block (column) that will receive the squeezed data
|
||||
out -= BLOCK_LEN_M256I * 2;
|
||||
|
||||
LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
|
||||
}
|
||||
|
||||
_mm512_store_si512( (__m512i*)State, state0 );
|
||||
_mm512_store_si512( (__m512i*)State + 1, state1 );
|
||||
_mm512_store_si512( (__m512i*)State + 2, state2 );
|
||||
_mm512_store_si512( (__m512i*)State + 3, state3 );
|
||||
}
|
||||
|
||||
// This function has to deal with gathering 2 256 bit rowin vectors from
|
||||
// non-contiguous memory. Extra work and performance penalty.
|
||||
|
||||
inline void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn,
|
||||
uint64_t *rowOut, uint64_t nCols )
|
||||
{
|
||||
int i;
|
||||
register __m512i state0, state1, state2, state3;
|
||||
__m512i *in = (__m256i*)rowIn;
|
||||
|
||||
state0 = _mm512_load_si512( (__m512i*)State );
|
||||
state1 = _mm512_load_si512( (__m512i*)State + 1 );
|
||||
state2 = _mm512_load_si512( (__m512i*)State + 2 );
|
||||
state3 = _mm512_load_si512( (__m512i*)State + 3 );
|
||||
|
||||
for ( i = 0; i < nCols; i++ )
|
||||
{
|
||||
state0 = _mm512_xor_si512( state0, in[0] );
|
||||
state1 = _mm512_xor_si512( state1, in[1] );
|
||||
state2 = _mm512_xor_si512( state2, in[2] );
|
||||
|
||||
LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
|
||||
|
||||
out[0] = _mm512_xor_si512( state0, in[0] );
|
||||
out[1] = _mm512_xor_si512( state1, in[1] );
|
||||
out[2] = _mm512_xor_si512( state2, in[2] );
|
||||
|
||||
//Input: next column (i.e., next block in sequence)
|
||||
in0 += BLOCK_LEN_M256I;
|
||||
in1 += BLOCK_LEN_M256I;
|
||||
//Output: goes to previous column
|
||||
out -= BLOCK_LEN_M256I * 2;
|
||||
}
|
||||
|
||||
_mm512_store_si256( (__m512i*)State, state0 );
|
||||
_mm512_store_si256( (__m512i*)State + 1, state1 );
|
||||
_mm512_store_si256( (__m512i*)State + 2, state2 );
|
||||
_mm512_store_si256( (__m512i*)State + 3, state3 );
|
||||
}
|
||||
}
|
||||
|
||||
inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
|
||||
uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols )
|
||||
{
|
||||
int i;
|
||||
|
||||
register __m512i state0, state1, state2, state3;
|
||||
__m512i* in = (__m512i*)rowIn;
|
||||
__m512i* inout = (__m512i*)rowInOut;
|
||||
__m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I * 2 );
|
||||
__m512i t0, t1, t2;
|
||||
|
||||
state0 = _mm512_load_si512( (__m512i*)State );
|
||||
state1 = _mm512_load_si512( (__m512i*)State + 1 );
|
||||
state2 = _mm512_load_si512( (__m512i*)State + 2 );
|
||||
state3 = _mm512_load_si512( (__m512i*)State + 3 );
|
||||
|
||||
for ( i = 0; i < nCols; i++ )
|
||||
{
|
||||
state0 = _mm512_xor_si512( state0,
|
||||
_mm512_add_epi64( in[0], inout[0] ) );
|
||||
state1 = _mm512_xor_si512( state1,
|
||||
_mm512_add_epi64( in[1], inout[1] ) );
|
||||
state2 = _mm512_xor_si512( state2,
|
||||
_mm512_add_epi64( in[2], inout[2] ) );
|
||||
|
||||
LYRA_ROUND_2WAY AVX512( state0, state1, state2, state3 );
|
||||
|
||||
out[0] = _mm512_xor_si512( state0, in[0] );
|
||||
out[1] = _mm512_xor_si512( state1, in[1] );
|
||||
out[2] = _mm512_xor_si512( state2, in[2] );
|
||||
|
||||
//M[row*][col] = M[row*][col] XOR rotW(rand)
|
||||
t0 = _mm512_permutex_epi64( state0, 0x93 );
|
||||
t1 = _mm512_permutex_epi64( state1, 0x93 );
|
||||
t2 = _mm512_permutex_epi64( state2, 0x93 );
|
||||
|
||||
inout[0] = _mm512_xor_si512( inout[0],
|
||||
_mm512_mask_blend_epi32( t0, t2, 0x03 ) );
|
||||
inout[1] = _mm512_xor_si512( inout[1],
|
||||
_mm512_mask_blend_epi32( t1, t0, 0x03 ) );
|
||||
inout[2] = _mm512_xor_si512( inout[2],
|
||||
_mm512_mask_blend_epi32( t2, t1, 0x03 ) );
|
||||
|
||||
//Inputs: next column (i.e., next block in sequence)
|
||||
in += BLOCK_LEN_M256I * 2;
|
||||
inout += BLOCK_LEN_M256I * 2;
|
||||
//Output: goes to previous column
|
||||
out -= BLOCK_LEN_M256I * 2;
|
||||
}
|
||||
|
||||
_mm512_store_si512( (__m512i*)State, state0 );
|
||||
_mm512_store_si512( (__m512i*)State + 1, state1 );
|
||||
_mm512_store_si512( (__m512i*)State + 2, state2 );
|
||||
_mm512_store_si512( (__m512i*)State + 3, state3 );
|
||||
}
|
||||
|
||||
inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn1,
|
||||
uint64_t *rowIn0, uint64_t *rowInOut, uint64_t *rowOut,
|
||||
uint64_t nCols )
|
||||
{
|
||||
int i;
|
||||
|
||||
register __m512i state0, state1, state2, state3;
|
||||
__m256i *in0 = (__m256i*)rowIn0;
|
||||
__m256i *in0 = (__m256i*)rowIn0;
|
||||
__m2512* in = (__m512i*)rowIn;
|
||||
__m2512* inout = (__m512i*)rowInOut;
|
||||
__m512i* out = (__m512i*)rowOut;
|
||||
__m512i t0, t1, t2;
|
||||
|
||||
_mm_prefetch( in0, _MM_HINT_T0 );
|
||||
_mm_prefetch( in1, _MM_HINT_T0 );
|
||||
_mm_prefetch( in0 + 2, _MM_HINT_T0 );
|
||||
_mm_prefetch( in1 + 2, _MM_HINT_T0 );
|
||||
_mm_prefetch( in0 + 4, _MM_HINT_T0 );
|
||||
_mm_prefetch( in1 + 4, _MM_HINT_T0 );
|
||||
_mm_prefetch( in0 + 6, _MM_HINT_T0 );
|
||||
_mm_prefetch( in1 + 6, _MM_HINT_T0 );
|
||||
|
||||
state0 = _mm512_load_si512( (__m512i*)State );
|
||||
state1 = _mm512_load_si512( (__m512i*)State + 1 );
|
||||
state2 = _mm512_load_si512( (__m512i*)State + 2 );
|
||||
state3 = _mm512_load_si512( (__m512i*)State + 3 );
|
||||
|
||||
//Absorbing "M[prev] [+] M[row*]"
|
||||
|
||||
// state0 = _mm512_xor_si512( state0, mm512_concat_256( in1[0], in0[0] );
|
||||
// state1 = _mm512_xor_si512( state1, mm512_concat_256( in1[1], in0[1] );
|
||||
// state2 = _mm512_xor_si512( state2, mm512_concat_256( in1[2], in0[2] );
|
||||
t0 = mm512_concat_256( in1[0], in0[0] );
|
||||
t1 = mm512_concat_256( in1[1], in0[1] );
|
||||
t2 = mm512_concat_256( in1[2], in0[2] );
|
||||
|
||||
state0 = _mm512_xor_si512( state0,
|
||||
_mm512_add_epi64( t0, inout[0] ) );
|
||||
state1 = _mm512_xor_si512( state1,
|
||||
_mm512_add_epi64( t1, inout[1] ) );
|
||||
state2 = _mm512_xor_si512( state2,
|
||||
_mm512_add_epi64( t2, inout[2] ) );
|
||||
|
||||
//Applies the reduced-round transformation f to the sponge's state
|
||||
LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
|
||||
|
||||
//M[rowOut][col] = M[rowOut][col] XOR rand
|
||||
out[0] = _mm512_xor_si512( out[0], state0 );
|
||||
out[1] = _mm512_xor_si512( out[1], state1 );
|
||||
out[2] = _mm512_xor_si512( out[2], state2 );
|
||||
|
||||
//M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
|
||||
t0 = _mm512_permutex_epi64( state0, 0x93 );
|
||||
t1 = _mm512_permutex_epi64( state1, 0x93 );
|
||||
t2 = _mm512_permutex_epi64( state2, 0x93 );
|
||||
|
||||
inout[0] = _mm512_xor_si512( inout[0],
|
||||
_mm512_mask_blend_epi32( t0, t2, 0x03 ) );
|
||||
inout[1] = _mm512_xor_si512( inout[1],
|
||||
_mm512_mask_blend_epi32( t1, t0, 0x03 ) );
|
||||
inout[2] = _mm512_xor_si512( inout[2],
|
||||
_mm512_mask_blend_epi32( t2, t1, 0x03 ) );
|
||||
|
||||
//Goes to next block
|
||||
in += BLOCK_LEN_M256I * 2;
|
||||
out += BLOCK_LEN_M256I * 2;
|
||||
inout += BLOCK_LEN_M256I * 2;
|
||||
}
|
||||
|
||||
_mm512_store_si512( (__m512i*)State, state0 );
|
||||
_mm512_store_si512( (__m512i*)State + 1, state1 );
|
||||
_mm512_store_si512( (__m512i*)State + 2, state2 );
|
||||
_mm512_store_si512( (__m512i*)State + 3, state3 );
|
||||
}
|
||||
|
||||
#endif // AVX512
|
||||
@@ -52,8 +52,46 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
||||
// However, 2 way parallel looks trivial to code for AVX512 except for
|
||||
// a data dependency with rowa.
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#define G2W_4X64(a,b,c,d) \
|
||||
a = _mm512_add_epi64( a, b ); \
|
||||
d = mm512_ror_64( _mm512_xor_si512( d, a ), 32 ); \
|
||||
c = _mm512_add_epi64( c, d ); \
|
||||
b = mm512_ror_64( _mm512_xor_si512( b, c ), 24 ); \
|
||||
a = _mm512_add_epi64( a, b ); \
|
||||
d = mm512_ror_64( _mm512_xor_si512( d, a ), 16 ); \
|
||||
c = _mm512_add_epi64( c, d ); \
|
||||
b = mm512_ror_64( _mm512_xor_si512( b, c ), 63 );
|
||||
|
||||
#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
G_4X64( s0, s1, s2, s3 ); \
|
||||
s1 = mm512_ror_1x64( s1); \
|
||||
s2 = mm512_swap128_256( s2 ); \
|
||||
s3 = mm512_rol1x64_256( s3 ); \
|
||||
G_4X64( s0, s1, s2, s3 ); \
|
||||
s1 = mm512_rol1x64_256( s1 ); \
|
||||
s2 = mm512_swap128_256( s2 ); \
|
||||
s3 = mm512_ror1x64_256( s3 );
|
||||
|
||||
#define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 )
|
||||
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
#if defined __AVX2__
|
||||
// only available with avx2
|
||||
|
||||
// process 4 columns in parallel
|
||||
// returns void, updates all args
|
||||
@@ -89,9 +127,11 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
||||
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_AVX2( s0, s1, s2, s3 )
|
||||
|
||||
#elif defined(__SSE2__)
|
||||
#endif
|
||||
|
||||
#if defined(__SSE2__)
|
||||
|
||||
// process 2 columns in parallel
|
||||
// returns void, all args updated
|
||||
@@ -129,7 +169,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
||||
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
||||
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
||||
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
||||
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
||||
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7)
|
||||
|
||||
|
||||
#endif // AVX2 else SSE2
|
||||
@@ -161,6 +201,30 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
||||
G(r,7,v[ 3],v[ 4],v[ 9],v[14]);
|
||||
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
//---- Housekeeping
|
||||
void initState_2way( uint64_t state[/*16*/] );
|
||||
|
||||
//---- Squeezes
|
||||
void squeeze_2way( uint64_t *state, unsigned char *out, unsigned int len );
|
||||
void reducedSqueezeRow0_2way( uint64_t* state, uint64_t* row, uint64_t nCols );
|
||||
|
||||
//---- Absorbs
|
||||
void absorbBlock_2way( uint64_t *state, const uint64_t *in );
|
||||
void absorbBlockBlake2Safe_2way( uint64_t *state, const uint64_t *in,
|
||||
const uint64_t nBlocks, const uint64_t block_len );
|
||||
|
||||
//---- Duplexes
|
||||
void reducedDuplexRow1_2way( uint64_t *state, uint64_t *rowIn,
|
||||
uint64_t *rowOut, uint64_t nCols);
|
||||
void reducedDuplexRowSetup_2way( uint64_t *state, uint64_t *rowIn,
|
||||
uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols );
|
||||
void reducedDuplexRow_2way(uint64_t *state, uint64_t *rowIn1, uint64_t *rowIn0, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
//---- Housekeeping
|
||||
void initState(uint64_t state[/*16*/]);
|
||||
|
||||
@@ -178,20 +242,4 @@ void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, uint6
|
||||
void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);
|
||||
void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);
|
||||
|
||||
//---- Misc
|
||||
void printArray(unsigned char *array, unsigned int size, char *name);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
////TESTS////
|
||||
//void reducedDuplexRowc(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
|
||||
//void reducedDuplexRowd(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
|
||||
//void reducedDuplexRowSetupv4(uint64_t *state, uint64_t *rowIn1, uint64_t *rowIn2, uint64_t *rowOut1, uint64_t *rowOut2);
|
||||
//void reducedDuplexRowSetupv5(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
|
||||
//void reducedDuplexRowSetupv5c(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
|
||||
//void reducedDuplexRowSetupv5d(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
|
||||
/////////////
|
||||
|
||||
|
||||
#endif /* SPONGE_H_ */
|
||||
|
||||
Reference in New Issue
Block a user