#include "scrypt-core-4way.h"
//////////////////////////////////////////////////////////////////////////
//
// Optimized Salsa implementation inspired by Pooler.
// Any similarities are not a coincidence.
//
// Implementations include reference X64, SSE2, AVX2 & AVX512
// using both serial and parallel vectoring with SIMD instructions.
//
// Generic macros are provided and invoked with different targets depending
// on the level of parallelism and data organization. Targets for any macros
// needed must be defined by the calling function. XOR, ROL32 and ADD32
// are needed in all cases. Additionally ROL_1X32, SWAP_64 and ROR_1X32
// shuffles are needed for serial SIMD.
//
// SALSA_8ROUNDS_SIMD128 uses vectors on serial data rather than traditional
// n-way parallel hashing.
// The SIMD version has different implied arguments {X0:X3}, representing
// an array of 4 vectors of 4 32 bit words, while the version used for
// regular parallel hashing has {x0:xf}, representing an array of 16 32 bit
// words.
// These arguments must be defined by the calling function.
// The calling function must also define targets for all macros used for
// arithmetic, logic and shuffling: XOR, ROL32, ADD32 for all targets and
// ROL_1X32, SWAP_64, ROR_1X32 for serial SIMD targets.
//
// Serial and parallel SIMD will be combined with AVX2 doing 2 way
// parallel over 4 way linear for 8 way throughput, and AVX512 doing
// 4 way parallel over 4 way linear for 16 way throughput.
//
// The term SIMD128 here refers to vectors that contain multiple contiguous
// data from a single stream (lane) as opposed to parallel vectors that
// contain interleaved words of data from multiple streams.
//
// The sequencing of techniques in the naming convention is a little
// mixed up. The logical hierarchy top down is to put Nbuffs at the top,
// where each buffer then performs another technique.
//
// Although Nway and SIMD128 are listed in top down order, Nbuffs is
// always listed last:
//
// scrypt_core_simd128_2way means a linear SIMD operation on 2 parallel
// streams of data, while
// scrypt_core_2way_simd128 is 2 parallel streams of linear SIMD vectors.
//
///////////////////////////////////////////////////////////////////////////

// Used by all targets, needs XOR, ROL32 & ADD32 macros defined.
// Function-like: the return value typically overwrites in1.
//
#define ARX( in1, in2, in3, n ) \
   XOR( in1, ROL32( ADD32( in2, in3 ), n ) )

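// As a minimal illustrative sketch (not part of the build): with plain
// scalar targets the ARX template reduces to the familiar Salsa20 quarter
// operation, i.e. in1 ^= rotl32( in2 + in3, n ):
/*
   #define XOR( a, b )    ( (a) ^ (b) )
   #define ADD32( a, b )  ( (uint32_t)( (a) + (b) ) )
   #define ROL32( a, n )  ( ( (a) << (n) ) | ( (a) >> ( 32 - (n) ) ) )

   uint32_t x4 = ..., x0 = ..., xc = ...;
   x4 = ARX( x4, x0, xc, 7 );      // x4 ^= rotl32( x0 + xc, 7 )

   #undef XOR
   #undef ADD32
   #undef ROL32
*/
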
// Multi buffering has 2 main benefits and one drawback.
// Traditionally double buffering has been used to empty one bucket
// while another is filling. This requires a second (or 3rd, etc)
// bucket. The computing analogy is to use 2 registers, 1 to read
// and 1 to write, and switch back and forth.
//
// The second benefit in computing is using multiple registers to
// provide data independence that improves multiple instruction issue and
// pipelining in the CPU. The number of buffers is limited by the number
// of registers available. Three seems to be a sweet spot as a 4 variable
// data set uses 12 registers triple buffered, leaving 4 of 16 as temps.
// Many pipelined instructions require 3 clocks to complete and triple
// buffering keeps the pipeline full. Many execution units are also 3 wide,
// allowing up to 3 similar instructions to be issued per clock.
// However, execution units are shared by hyperthreading which reduces
// the effect on a single thread.
//
// The drawback is the increased size of the data. Although multi buffering
// also improves memory throughput, this is offset by the amount of
// memory required and its effect on cache performance, and will eventually
// hit memory bus saturation.
//
// For example scryptn2 struggles with more than 4 buffers, multi
// buffered and parallel SIMD combined, and performance drops. This can
// be mitigated somewhat by reducing the number of CPU threads but
// ultimately excessive multi buffering has a negative impact.
//
// Unlike parallel SIMD, increasing multi buffering does not require a
// CPU technology increase, ie SSE2 to AVX2 or AVX2 to AVX512.
// SSE2 is limited to 4 way SIMD but there is no theoretical limit to
// multi buffering.
// Multi buffering also does not suffer the clock penalty of increasing
// parallelism.
//
// Multi buffering implementations here focus on powers of 2,
// to match sha256 without re-interleaving the data.
//
// A decision will have to be made at run time, based on the N factor,
// whether to use multi buffering or serial execution.

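// A minimal sketch of such a run time dispatch (illustrative only: the
// function names and the N threshold are hypothetical, not taken from
// this file):
/*
   void scrypt_core_dispatch( uint32_t *X, uint32_t *V, const uint32_t N )
   {
      if ( N <= 4096 )   // small N: V stays cache friendly,
         scrypt_core_multi_buffered( X, V, N );   // favour multi buffering
      else               // large N: memory bound, favour serial execution
         scrypt_core_serial( X, V, N );
   }
*/
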
// Need TYPE macro defined.
#define ARX_2BUF( a1, a2, a3, b1, b2, b3, n ) \
do{ \
   TYPE ta = ADD32( a2, a3 ); \
   TYPE tb = ADD32( b2, b3 ); \
   ta = ROL32( ta, n ); \
   tb = ROL32( tb, n ); \
   a1 = XOR( a1, ta ); \
   b1 = XOR( b1, tb ); \
} while (0);

#define ARX_3BUF( a1, a2, a3, b1, b2, b3, c1, c2, c3, n ) \
do{ \
   TYPE ta = ADD32( a2, a3 ); \
   TYPE tb = ADD32( b2, b3 ); \
   TYPE tc = ADD32( c2, c3 ); \
   ta = ROL32( ta, n ); \
   tb = ROL32( tb, n ); \
   tc = ROL32( tc, n ); \
   a1 = XOR( a1, ta ); \
   b1 = XOR( b1, tb ); \
   c1 = XOR( c1, tc ); \
} while (0);

// Used by SIMD128 and hybrid targets, also needs ROL_1X32, SWAP_64 &
// ROR_1X32 defined.
//
// Implied arguments ( X0 = { x3, x2, x1, x0 },
//                     X1 = { x7, x6, x5, x4 },
//                     X2 = { xb, xa, x9, x8 },
//                     X3 = { xf, xe, xd, xc } )
//
#define SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 ) \
|
|
/* Operate on columns */ \
|
|
X1 = ARX( X1, X0, X3, 7 ); /* ( x4, x0, xc, 7 ) */ \
|
|
X2 = ARX( X2, X1, X0, 9 ); /* ( x8, x4, x0, 9 ) */ \
|
|
X3 = ARX( X3, X2, X1, 13 ); /* ( xc, x8, x4, 13 ) */ \
|
|
X0 = ARX( X0, X3, X2, 18 ); /* ( x0, xc, x8, 18 ) */ \
|
|
/* Rearrange data */ \
|
|
X1 = ROL_1X32( X1 ); \
|
|
X3 = ROR_1X32( X3 ); \
|
|
X2 = SWAP_64( X2 ); \
|
|
/* Operate on rows */ \
|
|
X3 = ARX( X3, X0, X1, 7 ); \
|
|
X2 = ARX( X2, X3, X0, 9 ); \
|
|
X1 = ARX( X1, X2, X3, 13 ); \
|
|
X0 = ARX( X0, X1, X2, 18 ); \
|
|
/* Rearrange data */ \
|
|
X3 = ROL_1X32( X3 ); \
|
|
X1 = ROR_1X32( X1 ); \
|
|
X2 = SWAP_64( X2 ); \
|
|
|
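// As an illustrative sketch only: a plain SSE2 caller could provide the
// serial SIMD targets like this before invoking the macro (SSE2 has no
// native 32 bit rotate, so ROL32 is built from shift + or):
/*
   #define ROL_1X32( X )  _mm_shuffle_epi32( X, 0x93 )  // rotate elements up 1 lane
   #define ROR_1X32( X )  _mm_shuffle_epi32( X, 0x39 )  // rotate elements down 1 lane
   #define SWAP_64( X )   _mm_shuffle_epi32( X, 0x4e )  // swap 64 bit halves
   #define ROL32( X, c )  _mm_or_si128( _mm_slli_epi32( X, c ), \
                                        _mm_srli_epi32( X, 32-(c) ) )
   #define ADD32          _mm_add_epi32
   #define XOR            _mm_xor_si128

   SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 );

   #undef ROL_1X32
   #undef ROR_1X32
   #undef SWAP_64
   #undef ROL32
   #undef ADD32
   #undef XOR
*/
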
// Final round optimization, don't rearrange data back to original order
// on exit. Used only on pre-AVX2 CPUs where the blend instruction is not
// available. It saves a few redundant shuffles.
#define SALSA_2ROUNDS_FINAL_SIMD128( X0, X1, X2, X3 ) \
|
|
/* Operate on columns */ \
|
|
X1 = ARX( X1, X0, X3, 7 ); /* ( x4, x0, xc, 7 ) */ \
|
|
X2 = ARX( X2, X1, X0, 9 ); /* ( x8, x4, x0, 9 ) */ \
|
|
X3 = ARX( X3, X2, X1, 13 ); /* ( xc, x8, x4, 13 ) */ \
|
|
X0 = ARX( X0, X3, X2, 18 ); /* ( x0, xc, x8, 18 ) */ \
|
|
/* Rearrange data */ \
|
|
X1 = ROL_1X32( X1 ); \
|
|
X3 = ROR_1X32( X3 ); \
|
|
X2 = SWAP_64( X2 ); \
|
|
/* Operate on rows */ \
|
|
X3 = ARX( X3, X0, X1, 7 ); \
|
|
X2 = ARX( X2, X3, X0, 9 ); \
|
|
X1 = ARX( X1, X2, X3, 13 ); \
|
|
X0 = ARX( X0, X1, X2, 18 ); \
|
|
/* Final round, don't rearrange data
|
|
X1 = ROR_1X32( X1 ); \
|
|
X2 = SWAP_64( X2 ); \
|
|
X3 = ROL_1X32( X3 ); */
|
|
|
|
// Implied args ( XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3 )
|
|
#define SALSA_2ROUNDS_SIMD128_2BUF \
|
|
ARX_2BUF( XA1, XA0, XA3, XB1, XB0, XB3, 7 ); \
|
|
ARX_2BUF( XA2, XA1, XA0, XB2, XB1, XB0, 9 ); \
|
|
ARX_2BUF( XA3, XA2, XA1, XB3, XB2, XB1, 13 ); \
|
|
ARX_2BUF( XA0, XA3, XA2, XB0, XB3, XB2, 18 ); \
|
|
XA1 = ROL_1X32( XA1 ); \
|
|
XB1 = ROL_1X32( XB1 ); \
|
|
XA3 = ROR_1X32( XA3 ); \
|
|
XB3 = ROR_1X32( XB3 ); \
|
|
XA2 = SWAP_64( XA2 ); \
|
|
XB2 = SWAP_64( XB2 ); \
|
|
ARX_2BUF( XA3, XA0, XA1, XB3, XB0, XB1, 7 ); \
|
|
ARX_2BUF( XA2, XA3, XA0, XB2, XB3, XB0, 9 ); \
|
|
ARX_2BUF( XA1, XA2, XA3, XB1, XB2, XB3, 13 ); \
|
|
ARX_2BUF( XA0, XA1, XA2, XB0, XB1, XB2, 18 ); \
|
|
XA3 = ROL_1X32( XA3 ); \
|
|
XB3 = ROL_1X32( XB3 ); \
|
|
XA1 = ROR_1X32( XA1 ); \
|
|
XB1 = ROR_1X32( XB1 ); \
|
|
XA2 = SWAP_64( XA2 ); \
|
|
XB2 = SWAP_64( XB2 );
|
|
|
|
// For use when a fast bit rotate is not available.
// Contains target specific instructions, only use with 128 bit vectors.
#define SALSA_2ROUNDS_SIMD128_2BUF_SLOROT \
|
|
do{ \
|
|
TYPE TA = ADD32( XA0, XA3 ); \
|
|
TYPE TB = ADD32( XB0, XB3 ); \
|
|
TYPE T = _mm_slli_epi32( TA, 7 ); \
|
|
TA = _mm_srli_epi32( TA, 25 ); \
|
|
XA1 = XOR( XA1, T ); \
|
|
XA1 = XOR( XA1, TA ); \
|
|
T = _mm_slli_epi32( TB, 7 );\
|
|
TB = _mm_srli_epi32( TB, 25 ); \
|
|
XB1 = XOR( XB1, T ); \
|
|
XB1 = XOR( XB1, TB ); \
|
|
\
|
|
TA = ADD32( XA1, XA0 ); \
|
|
TB = ADD32( XB1, XB0 ); \
|
|
T = _mm_slli_epi32( TA, 9 ); \
|
|
TA = _mm_srli_epi32( TA, 23 ); \
|
|
XA2 = XOR( XA2, T ); \
|
|
XA2 = XOR( XA2, TA ); \
|
|
T = _mm_slli_epi32( TB, 9 );\
|
|
TB = _mm_srli_epi32( TB, 23 );\
|
|
XB2 = XOR( XB2, T ); \
|
|
XB2 = XOR( XB2, TB ); \
|
|
\
|
|
TA = ADD32( XA2, XA1 ); \
|
|
TB = ADD32( XB2, XB1 ); \
|
|
T = _mm_slli_epi32( TA, 13); \
|
|
TA = _mm_srli_epi32( TA, 19 ); \
|
|
XA1 = ROL_1X32( XA1 ); \
|
|
XB1 = ROL_1X32( XB1 ); \
|
|
XA3 = XOR( XA3, T ); \
|
|
XA3 = XOR( XA3, TA ); \
|
|
T = _mm_slli_epi32( TB, 13); \
|
|
TB = _mm_srli_epi32( TB, 19 ); \
|
|
XB3 = XOR( XB3, T ); \
|
|
XB3 = XOR( XB3, TB ); \
|
|
\
|
|
TA = ADD32( XA3, XA2 ); \
|
|
TB = ADD32( XB3, XB2 ); \
|
|
T = _mm_slli_epi32( TA, 18 ); \
|
|
TA = _mm_srli_epi32( TA, 14 ); \
|
|
XA2 = SWAP_64( XA2 ); \
|
|
XB2 = SWAP_64( XB2 ); \
|
|
XA0 = XOR( XA0, T ); \
|
|
XA0 = XOR( XA0, TA ); \
|
|
T = _mm_slli_epi32( TB, 18 ); \
|
|
TB = _mm_srli_epi32( TB, 14 ); \
|
|
XB0 = XOR( XB0, T ); \
|
|
XB0 = XOR( XB0, TB ); \
|
|
\
|
|
TA = ADD32( XA0, XA1 ); \
|
|
TB = ADD32( XB0, XB1 ); \
|
|
T = _mm_slli_epi32( TA, 7 ); \
|
|
TA = _mm_srli_epi32( TA, 25 ); \
|
|
XA3 = ROR_1X32( XA3 ); \
|
|
XA3 = XOR( XA3, T ); \
|
|
XA3 = XOR( XA3, TA ); \
|
|
T = _mm_slli_epi32( TB, 7 ); \
|
|
TB = _mm_srli_epi32( TB, 25 ); \
|
|
XB3 = ROR_1X32( XB3 ); \
|
|
XB3 = XOR( XB3, T ); \
|
|
XB3 = XOR( XB3, TB ); \
|
|
\
|
|
TA = ADD32( XA3, XA0 ); \
|
|
TB = ADD32( XB3, XB0 ); \
|
|
T = _mm_slli_epi32( TA, 9 ); \
|
|
TA = _mm_srli_epi32( TA, 23 ); \
|
|
XA2 = XOR( XA2, T ); \
|
|
XA2 = XOR( XA2, TA ); \
|
|
T = _mm_slli_epi32( TB, 9 ); \
|
|
TB = _mm_srli_epi32( TB, 23 ); \
|
|
XB2 = XOR( XB2, T ); \
|
|
XB2 = XOR( XB2, TB ); \
|
|
\
|
|
TA = ADD32( XA2, XA3 ); \
|
|
TB = ADD32( XB2, XB3 ); \
|
|
T = _mm_slli_epi32( TA, 13 ); \
|
|
TA = _mm_srli_epi32( TA, 19 ); \
|
|
XA3 = ROL_1X32( XA3 ); \
|
|
XB3 = ROL_1X32( XB3 ); \
|
|
XA1 = XOR( XA1, T ); \
|
|
XA1 = XOR( XA1, TA ); \
|
|
T = _mm_slli_epi32( TB, 13 ); \
|
|
TB = _mm_srli_epi32( TB, 19 ); \
|
|
XB1 = XOR( XB1, T ); \
|
|
XB1 = XOR( XB1, TB ); \
|
|
\
|
|
TA = ADD32( XA1, XA2 ); \
|
|
TB = ADD32( XB1, XB2 ); \
|
|
T = _mm_slli_epi32( TA, 18 ); \
|
|
TA = _mm_srli_epi32( TA, 14 ); \
|
|
XA2 = SWAP_64( XA2 ); \
|
|
XB2 = SWAP_64( XB2 ); \
|
|
XA0 = XOR( XA0, T ); \
|
|
XA0 = XOR( XA0, TA ); \
|
|
T = _mm_slli_epi32( TB, 18 ); \
|
|
TB = _mm_srli_epi32( TB, 14 ); \
|
|
XA1 = ROR_1X32( XA1 ); \
|
|
XB0 = XOR( XB0, T ); \
|
|
XB0 = XOR( XB0, TB ); \
|
|
XB1 = ROR_1X32( XB1 ); \
|
|
} while (0);
|
|
|
|
#define SALSA_2ROUNDS_FINAL_SIMD128_2BUF \
|
|
ARX_2BUF( XA1, XA0, XA3, XB1, XB0, XB3, 7 ); \
|
|
ARX_2BUF( XA2, XA1, XA0, XB2, XB1, XB0, 9 ); \
|
|
ARX_2BUF( XA3, XA2, XA1, XB3, XB2, XB1, 13 ); \
|
|
ARX_2BUF( XA0, XA3, XA2, XB0, XB3, XB2, 18 ); \
|
|
XA1 = ROL_1X32( XA1 ); \
|
|
XB1 = ROL_1X32( XB1 ); \
|
|
XA3 = ROR_1X32( XA3 ); \
|
|
XB3 = ROR_1X32( XB3 ); \
|
|
XA2 = SWAP_64( XA2 ); \
|
|
XB2 = SWAP_64( XB2 ); \
|
|
ARX_2BUF( XA3, XA0, XA1, XB3, XB0, XB1, 7 ); \
|
|
ARX_2BUF( XA2, XA3, XA0, XB2, XB3, XB0, 9 ); \
|
|
ARX_2BUF( XA1, XA2, XA3, XB1, XB2, XB3, 13 ); \
|
|
ARX_2BUF( XA0, XA1, XA2, XB0, XB1, XB2, 18 );
|
|
|
|
|
|
// Inlined ARX
|
|
#define SALSA_2ROUNDS_SIMD128_3BUF \
|
|
do{ \
|
|
TYPE TA = ADD32( XA0, XA3 ); \
|
|
TYPE TB = ADD32( XB0, XB3 ); \
|
|
TYPE TC = ADD32( XC0, XC3 ); \
|
|
TA = ROL32( TA, 7 ); \
|
|
TB = ROL32( TB, 7 ); \
|
|
TC = ROL32( TC, 7 ); \
|
|
XA1 = XOR( XA1, TA ); \
|
|
XB1 = XOR( XB1, TB ); \
|
|
XC1 = XOR( XC1, TC ); \
|
|
\
|
|
TA = ADD32( XA1, XA0 ); \
|
|
TB = ADD32( XB1, XB0 ); \
|
|
TC = ADD32( XC1, XC0 ); \
|
|
TA = ROL32( TA, 9 ); \
|
|
TB = ROL32( TB, 9 ); \
|
|
TC = ROL32( TC, 9 ); \
|
|
XA2 = XOR( XA2, TA ); \
|
|
XB2 = XOR( XB2, TB ); \
|
|
XC2 = XOR( XC2, TC ); \
|
|
\
|
|
TA = ADD32( XA2, XA1 ); \
|
|
XA1 = ROL_1X32( XA1 ); \
|
|
TB = ADD32( XB2, XB1 ); \
|
|
TC = ADD32( XC2, XC1 ); \
|
|
XB1 = ROL_1X32( XB1 ); \
|
|
TA = ROL32( TA, 13 ); \
|
|
XA3 = XOR( XA3, TA ); \
|
|
XC1 = ROL_1X32( XC1 ); \
|
|
TB = ROL32( TB, 13 ); \
|
|
XB3 = XOR( XB3, TB ); \
|
|
TC = ROL32( TC, 13 ); \
|
|
XC3 = XOR( XC3, TC ); \
|
|
\
|
|
TA = ADD32( XA3, XA2 ); \
|
|
XA2 = SWAP_64( XA2 ); \
|
|
TB = ADD32( XB3, XB2 ); \
|
|
TC = ADD32( XC3, XC2 ); \
|
|
TA = ROL32( TA, 18 ); \
|
|
XB2 = SWAP_64( XB2 ); \
|
|
XA0 = XOR( XA0, TA ); \
|
|
TB = ROL32( TB, 18 ); \
|
|
XB0 = XOR( XB0, TB ); \
|
|
XC2 = SWAP_64( XC2 ); \
|
|
TC = ROL32( TC, 18 ); \
|
|
XC0 = XOR( XC0, TC ); \
|
|
\
|
|
TA = ADD32( XA0, XA1 ); \
|
|
XA3 = ROR_1X32( XA3 ); \
|
|
TB = ADD32( XB0, XB1 ); \
|
|
TC = ADD32( XC0, XC1 ); \
|
|
TA = ROL32( TA, 7 ); \
|
|
XB3 = ROR_1X32( XB3 ); \
|
|
XA3 = XOR( XA3, TA ); \
|
|
TB = ROL32( TB, 7 ); \
|
|
XC3 = ROR_1X32( XC3 ); \
|
|
XB3 = XOR( XB3, TB ); \
|
|
TC = ROL32( TC, 7 ); \
|
|
XC3 = XOR( XC3, TC ); \
|
|
\
|
|
TA = ADD32( XA3, XA0 ); \
|
|
TB = ADD32( XB3, XB0 ); \
|
|
TC = ADD32( XC3, XC0 ); \
|
|
TA = ROL32( TA, 9 ); \
|
|
TB = ROL32( TB, 9 ); \
|
|
TC = ROL32( TC, 9 ); \
|
|
XA2 = XOR( XA2, TA ); \
|
|
XB2 = XOR( XB2, TB ); \
|
|
XC2 = XOR( XC2, TC ); \
|
|
\
|
|
TA = ADD32( XA2, XA3 ); \
|
|
TB = ADD32( XB2, XB3 ); \
|
|
TA = ROL32( TA, 13 ); \
|
|
TC = ADD32( XC2, XC3 ); \
|
|
XA3 = ROL_1X32( XA3 ); \
|
|
TB = ROL32( TB, 13 ); \
|
|
XB3 = ROL_1X32( XB3 ); \
|
|
XA1 = XOR( XA1, TA ); \
|
|
TC = ROL32( TC, 13 ); \
|
|
XC3 = ROL_1X32( XC3 ); \
|
|
XB1 = XOR( XB1, TB ); \
|
|
XC1 = XOR( XC1, TC ); \
|
|
\
|
|
TA = ADD32( XA1, XA2 ); \
|
|
XA2 = SWAP_64( XA2 ); \
|
|
TB = ADD32( XB1, XB2 ); \
|
|
XB2 = SWAP_64( XB2 ); \
|
|
TA = ROL32( TA, 18); \
|
|
TC = ADD32( XC1, XC2 ); \
|
|
XC2 = SWAP_64( XC2 ); \
|
|
TB = ROL32( TB, 18); \
|
|
XA0 = XOR( XA0, TA ); \
|
|
XA1 = ROR_1X32( XA1 ); \
|
|
TC = ROL32( TC, 18); \
|
|
XB0 = XOR( XB0, TB ); \
|
|
XB1 = ROR_1X32( XB1 ); \
|
|
XC0 = XOR( XC0, TC ); \
|
|
XC1 = ROR_1X32( XC1 ); \
|
|
} while (0);
|
|
|
|
|
|
// Slow rot, an attempt to optimize non-AVX512 bit rotations.
// Contains target specific instructions, only for use with 128 bit vectors.
#define SALSA_2ROUNDS_SIMD128_3BUF_SLOROT \
|
|
do{ \
|
|
TYPE TA = ADD32( XA0, XA3 ); \
|
|
TYPE TB = ADD32( XB0, XB3 ); \
|
|
TYPE TC = ADD32( XC0, XC3 ); \
|
|
TYPE T = _mm_slli_epi32( TA, 7 ); \
|
|
TA = _mm_srli_epi32( TA, 25 ); \
|
|
XA1 = XOR( XA1, T ); \
|
|
XA1 = XOR( XA1, TA ); \
|
|
T = _mm_slli_epi32( TB, 7 );\
|
|
TB = _mm_srli_epi32( TB, 25 ); \
|
|
XB1 = XOR( XB1, T ); \
|
|
XB1 = XOR( XB1, TB ); \
|
|
T = _mm_slli_epi32( TC, 7 );\
|
|
TC = _mm_srli_epi32( TC, 25 );\
|
|
XC1 = XOR( XC1, T ); \
|
|
XC1 = XOR( XC1, TC ); \
|
|
\
|
|
TA = ADD32( XA1, XA0 ); \
|
|
TB = ADD32( XB1, XB0 ); \
|
|
TC = ADD32( XC1, XC0 ); \
|
|
T = _mm_slli_epi32( TA, 9 ); \
|
|
TA = _mm_srli_epi32( TA, 23 ); \
|
|
XA2 = XOR( XA2, T ); \
|
|
XA2 = XOR( XA2, TA ); \
|
|
T = _mm_slli_epi32( TB, 9 );\
|
|
TB = _mm_srli_epi32( TB, 23 );\
|
|
XB2 = XOR( XB2, T ); \
|
|
XB2 = XOR( XB2, TB ); \
|
|
T = _mm_slli_epi32( TC, 9 );\
|
|
TC = _mm_srli_epi32( TC, 23 );\
|
|
XC2 = XOR( XC2, T ); \
|
|
XC2 = XOR( XC2, TC ); \
|
|
\
|
|
TA = ADD32( XA2, XA1 ); \
|
|
TB = ADD32( XB2, XB1 ); \
|
|
TC = ADD32( XC2, XC1 ); \
|
|
T = _mm_slli_epi32( TA, 13); \
|
|
TA = _mm_srli_epi32( TA, 19 ); \
|
|
XA1 = ROL_1X32( XA1 ); \
|
|
XB1 = ROL_1X32( XB1 ); \
|
|
XC1 = ROL_1X32( XC1 ); \
|
|
XA3 = XOR( XA3, T ); \
|
|
XA3 = XOR( XA3, TA ); \
|
|
T = _mm_slli_epi32( TB, 13); \
|
|
TB = _mm_srli_epi32( TB, 19 ); \
|
|
XB3 = XOR( XB3, T ); \
|
|
XB3 = XOR( XB3, TB ); \
|
|
T = _mm_slli_epi32( TC, 13); \
|
|
TC = _mm_srli_epi32( TC, 19 ); \
|
|
XC3 = XOR( XC3, T ); \
|
|
XC3 = XOR( XC3, TC ); \
|
|
\
|
|
TA = ADD32( XA3, XA2 ); \
|
|
TB = ADD32( XB3, XB2 ); \
|
|
TC = ADD32( XC3, XC2 ); \
|
|
T = _mm_slli_epi32( TA, 18 ); \
|
|
TA = _mm_srli_epi32( TA, 14 ); \
|
|
XA2 = SWAP_64( XA2 ); \
|
|
XB2 = SWAP_64( XB2 ); \
|
|
XC2 = SWAP_64( XC2 ); \
|
|
XA0 = XOR( XA0, T ); \
|
|
XA0 = XOR( XA0, TA ); \
|
|
T = _mm_slli_epi32( TB, 18 ); \
|
|
TB = _mm_srli_epi32( TB, 14 ); \
|
|
XB0 = XOR( XB0, T ); \
|
|
XB0 = XOR( XB0, TB ); \
|
|
T = _mm_slli_epi32( TC, 18 ); \
|
|
TC = _mm_srli_epi32( TC, 14 ); \
|
|
XC0 = XOR( XC0, T ); \
|
|
XC0 = XOR( XC0, TC ); \
|
|
\
|
|
TA = ADD32( XA0, XA1 ); \
|
|
TB = ADD32( XB0, XB1 ); \
|
|
TC = ADD32( XC0, XC1 ); \
|
|
T = _mm_slli_epi32( TA, 7 ); \
|
|
TA = _mm_srli_epi32( TA, 25 ); \
|
|
XA3 = ROR_1X32( XA3 ); \
|
|
XA3 = XOR( XA3, T ); \
|
|
XA3 = XOR( XA3, TA ); \
|
|
T = _mm_slli_epi32( TB, 7 ); \
|
|
TB = _mm_srli_epi32( TB, 25 ); \
|
|
XB3 = ROR_1X32( XB3 ); \
|
|
XB3 = XOR( XB3, T ); \
|
|
XB3 = XOR( XB3, TB ); \
|
|
T = _mm_slli_epi32( TC, 7 ); \
|
|
TC = _mm_srli_epi32( TC, 25 ); \
|
|
XC3 = ROR_1X32( XC3 ); \
|
|
XC3 = XOR( XC3, T ); \
|
|
XC3 = XOR( XC3, TC ); \
|
|
\
|
|
TA = ADD32( XA3, XA0 ); \
|
|
TB = ADD32( XB3, XB0 ); \
|
|
TC = ADD32( XC3, XC0 ); \
|
|
T = _mm_slli_epi32( TA, 9 ); \
|
|
TA = _mm_srli_epi32( TA, 23 ); \
|
|
XA2 = XOR( XA2, T ); \
|
|
XA2 = XOR( XA2, TA ); \
|
|
T = _mm_slli_epi32( TB, 9 ); \
|
|
TB = _mm_srli_epi32( TB, 23 ); \
|
|
XB2 = XOR( XB2, T ); \
|
|
XB2 = XOR( XB2, TB ); \
|
|
T = _mm_slli_epi32( TC, 9 ); \
|
|
TC = _mm_srli_epi32( TC, 23 ); \
|
|
XC2 = XOR( XC2, T ); \
|
|
XC2 = XOR( XC2, TC ); \
|
|
\
|
|
TA = ADD32( XA2, XA3 ); \
|
|
TB = ADD32( XB2, XB3 ); \
|
|
TC = ADD32( XC2, XC3 ); \
|
|
T = _mm_slli_epi32( TA, 13 ); \
|
|
TA = _mm_srli_epi32( TA, 19 ); \
|
|
XA3 = ROL_1X32( XA3 ); \
|
|
XB3 = ROL_1X32( XB3 ); \
|
|
XC3 = ROL_1X32( XC3 ); \
|
|
XA1 = XOR( XA1, T ); \
|
|
XA1 = XOR( XA1, TA ); \
|
|
T = _mm_slli_epi32( TB, 13 ); \
|
|
TB = _mm_srli_epi32( TB, 19 ); \
|
|
XB1 = XOR( XB1, T ); \
|
|
XB1 = XOR( XB1, TB ); \
|
|
T = _mm_slli_epi32( TC, 13 ); \
|
|
TC = _mm_srli_epi32( TC, 19 ); \
|
|
XC1 = XOR( XC1, T ); \
|
|
XC1 = XOR( XC1, TC ); \
|
|
\
|
|
TA = ADD32( XA1, XA2 ); \
|
|
TB = ADD32( XB1, XB2 ); \
|
|
TC = ADD32( XC1, XC2 ); \
|
|
T = _mm_slli_epi32( TA, 18 ); \
|
|
TA = _mm_srli_epi32( TA, 14 ); \
|
|
XA2 = SWAP_64( XA2 ); \
|
|
XB2 = SWAP_64( XB2 ); \
|
|
XA0 = XOR( XA0, T ); \
|
|
XA0 = XOR( XA0, TA ); \
|
|
T = _mm_slli_epi32( TB, 18 ); \
|
|
TB = _mm_srli_epi32( TB, 14 ); \
|
|
XC2 = SWAP_64( XC2 ); \
|
|
XA1 = ROR_1X32( XA1 ); \
|
|
XB0 = XOR( XB0, T ); \
|
|
XB0 = XOR( XB0, TB ); \
|
|
T = _mm_slli_epi32( TC, 18 ); \
|
|
TC = _mm_srli_epi32( TC, 14 ); \
|
|
XB1 = ROR_1X32( XB1 ); \
|
|
XC1 = ROR_1X32( XC1 ); \
|
|
XC0 = XOR( XC0, T ); \
|
|
XC0 = XOR( XC0, TC ); \
|
|
} while (0);
|
|
|
|
|
|
/*
|
|
// Standard version using ARX
|
|
#define SALSA_2ROUNDS_SIMD128_3BUF \
|
|
ARX_3BUF( XA1, XA0, XA3, XB1, XB0, XB3, \
|
|
XC1, XC0, XC3, 7 ); \
|
|
ARX_3BUF( XA2, XA1, XA0, XB2, XB1, XB0, \
|
|
XC2, XC1, XC0, 9 ); \
|
|
ARX_3BUF( XA3, XA2, XA1, XB3, XB2, XB1, \
|
|
XC3, XC2, XC1, 13 ); \
|
|
ARX_3BUF( XA0, XA3, XA2, XB0, XB3, XB2, \
|
|
XC0, XC3, XC2, 18 ); \
|
|
XA1 = ROL_1X32( XA1 ); \
|
|
XB1 = ROL_1X32( XB1 ); \
|
|
XC1 = ROL_1X32( XC1 ); \
|
|
XA3 = ROR_1X32( XA3 ); \
|
|
XB3 = ROR_1X32( XB3 ); \
|
|
XC3 = ROR_1X32( XC3 ); \
|
|
XA2 = SWAP_64( XA2 ); \
|
|
XB2 = SWAP_64( XB2 ); \
|
|
XC2 = SWAP_64( XC2 ); \
|
|
ARX_3BUF( XA3, XA0, XA1, XB3, XB0, XB1, \
|
|
XC3, XC0, XC1, 7 ); \
|
|
ARX_3BUF( XA2, XA3, XA0, XB2, XB3, XB0, \
|
|
XC2, XC3, XC0, 9 ); \
|
|
ARX_3BUF( XA1, XA2, XA3, XB1, XB2, XB3, \
|
|
XC1, XC2, XC3, 13 ); \
|
|
ARX_3BUF( XA0, XA1, XA2, XB0, XB1, XB2, \
|
|
XC0, XC1, XC2, 18 ); \
|
|
XA3 = ROL_1X32( XA3 ); \
|
|
XB3 = ROL_1X32( XB3 ); \
|
|
XC3 = ROL_1X32( XC3 ); \
|
|
XA1 = ROR_1X32( XA1 ); \
|
|
XB1 = ROR_1X32( XB1 ); \
|
|
XC1 = ROR_1X32( XC1 ); \
|
|
XA2 = SWAP_64( XA2 ); \
|
|
XB2 = SWAP_64( XB2 ); \
|
|
XC2 = SWAP_64( XC2 );
|
|
*/
|
|
|
|
#define SALSA_2ROUNDS_FINAL_SIMD128_3BUF \
|
|
ARX_3BUF( XA1, XA0, XA3, XB1, XB0, XB3, \
|
|
XC1, XC0, XC3, 7 ); \
|
|
ARX_3BUF( XA2, XA1, XA0, XB2, XB1, XB0, \
|
|
XC2, XC1, XC0, 9 ); \
|
|
ARX_3BUF( XA3, XA2, XA1, XB3, XB2, XB1, \
|
|
XC3, XC2, XC1, 13 ); \
|
|
ARX_3BUF( XA0, XA3, XA2, XB0, XB3, XB2, \
|
|
XC0, XC3, XC2, 18 ); \
|
|
XA1 = ROL_1X32( XA1 ); \
|
|
XB1 = ROL_1X32( XB1 ); \
|
|
XC1 = ROL_1X32( XC1 ); \
|
|
XA3 = ROR_1X32( XA3 ); \
|
|
XB3 = ROR_1X32( XB3 ); \
|
|
XC3 = ROR_1X32( XC3 ); \
|
|
XA2 = SWAP_64( XA2 ); \
|
|
XB2 = SWAP_64( XB2 ); \
|
|
XC2 = SWAP_64( XC2 ); \
|
|
ARX_3BUF( XA3, XA0, XA1, XB3, XB0, XB1, \
|
|
XC3, XC0, XC1, 7 ); \
|
|
ARX_3BUF( XA2, XA3, XA0, XB2, XB3, XB0, \
|
|
XC2, XC3, XC0, 9 ); \
|
|
ARX_3BUF( XA1, XA2, XA3, XB1, XB2, XB3, \
|
|
XC1, XC2, XC3, 13 ); \
|
|
ARX_3BUF( XA0, XA1, XA2, XB0, XB1, XB2, \
|
|
XC0, XC1, XC2, 18 );
|
|
|
|
|
|
#define SALSA_8ROUNDS_SIMD128 \
|
|
SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 ); \
|
|
SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 ); \
|
|
SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 ); \
|
|
SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 );
|
|
|
|
#define SALSA_8ROUNDS_FINAL_SIMD128 \
|
|
SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 ); \
|
|
SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 ); \
|
|
SALSA_2ROUNDS_SIMD128( X0, X1, X2, X3 ); \
|
|
SALSA_2ROUNDS_FINAL_SIMD128( X0, X1, X2, X3 );
|
|
|
|
// Implied args ( XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3 )
|
|
#define SALSA_8ROUNDS_SIMD128_2BUF \
|
|
SALSA_2ROUNDS_SIMD128_2BUF; \
|
|
SALSA_2ROUNDS_SIMD128_2BUF; \
|
|
SALSA_2ROUNDS_SIMD128_2BUF; \
|
|
SALSA_2ROUNDS_SIMD128_2BUF;
|
|
|
|
#define SALSA_8ROUNDS_SIMD128_2BUF_SLOROT \
|
|
SALSA_2ROUNDS_SIMD128_2BUF_SLOROT; \
|
|
SALSA_2ROUNDS_SIMD128_2BUF_SLOROT; \
|
|
SALSA_2ROUNDS_SIMD128_2BUF_SLOROT; \
|
|
SALSA_2ROUNDS_SIMD128_2BUF_SLOROT;
|
|
|
|
#define SALSA_8ROUNDS_FINAL_SIMD128_2BUF \
|
|
SALSA_2ROUNDS_SIMD128_2BUF; \
|
|
SALSA_2ROUNDS_SIMD128_2BUF; \
|
|
SALSA_2ROUNDS_SIMD128_2BUF; \
|
|
SALSA_2ROUNDS_FINAL_SIMD128_2BUF;
|
|
|
|
#define SALSA_8ROUNDS_SIMD128_3BUF \
|
|
SALSA_2ROUNDS_SIMD128_3BUF; \
|
|
SALSA_2ROUNDS_SIMD128_3BUF; \
|
|
SALSA_2ROUNDS_SIMD128_3BUF; \
|
|
SALSA_2ROUNDS_SIMD128_3BUF;
|
|
|
|
#define SALSA_8ROUNDS_SIMD128_3BUF_SLOROT \
|
|
SALSA_2ROUNDS_SIMD128_3BUF_SLOROT; \
|
|
SALSA_2ROUNDS_SIMD128_3BUF_SLOROT; \
|
|
SALSA_2ROUNDS_SIMD128_3BUF_SLOROT; \
|
|
SALSA_2ROUNDS_SIMD128_3BUF_SLOROT;
|
|
|
|
#define SALSA_8ROUNDS_FINAL_SIMD128_3BUF \
|
|
SALSA_2ROUNDS_SIMD128_3BUF; \
|
|
SALSA_2ROUNDS_SIMD128_3BUF; \
|
|
SALSA_2ROUNDS_SIMD128_3BUF; \
|
|
SALSA_2ROUNDS_FINAL_SIMD128_3BUF;
|
|
|
|
// Implied args ( XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3,
|
|
// XC0, XC1, XC2, XC3, XD0, XD1, XD2, XD3, )
|
|
#define SALSA_8ROUNDS_SIMD128_4BUF \
|
|
SALSA_2ROUNDS_SIMD128_4BUF; \
|
|
SALSA_2ROUNDS_SIMD128_4BUF; \
|
|
SALSA_2ROUNDS_SIMD128_4BUF; \
|
|
SALSA_2ROUNDS_SIMD128_4BUF;
|
|
|
|
#define SALSA_8ROUNDS_FINAL_SIMD128_4BUF \
|
|
SALSA_2ROUNDS_SIMD128_4BUF; \
|
|
SALSA_2ROUNDS_SIMD128_4BUF; \
|
|
SALSA_2ROUNDS_SIMD128_4BUF; \
|
|
SALSA_2ROUNDS_FINAL_SIMD128_4BUF;
|
|
|
|
// Used by reference code and pure parallel implementations
|
|
//
|
|
// Implied arguments ( x0, x1, x2, x3, x4, x5, x6, x7,
|
|
// x8, x9, xa, xb, xc, xd, xe, xf )
|
|
//
|
|
#define SALSA_COLUMN \
|
|
x4 = ARX( x4, x0, xc, 7 ); \
|
|
x9 = ARX( x9, x5, x1, 7 ); \
|
|
xe = ARX( xe, xa, x6, 7 ); \
|
|
x3 = ARX( x3, xf, xb, 7 ); \
|
|
x8 = ARX( x8, x4, x0, 9 ); \
|
|
xd = ARX( xd, x9, x5, 9 ); \
|
|
x2 = ARX( x2, xe, xa, 9 ); \
|
|
x7 = ARX( x7, x3, xf, 9 ); \
|
|
xc = ARX( xc, x8, x4, 13 ); \
|
|
x1 = ARX( x1, xd, x9, 13 ); \
|
|
x6 = ARX( x6, x2, xe, 13 ); \
|
|
xb = ARX( xb, x7, x3, 13 ); \
|
|
x0 = ARX( x0, xc, x8, 18 ); \
|
|
x5 = ARX( x5, x1, xd, 18 ); \
|
|
xa = ARX( xa, x6, x2, 18 ); \
|
|
xf = ARX( xf, xb, x7, 18 )
|
|
|
|
#define SALSA_ROW \
|
|
x1 = ARX( x1, x0, x3, 7 ); \
|
|
x6 = ARX( x6, x5, x4, 7 ); \
|
|
xb = ARX( xb, xa, x9, 7 ); \
|
|
xc = ARX( xc, xf, xe, 7 ); \
|
|
x2 = ARX( x2, x1, x0, 9 ); \
|
|
x7 = ARX( x7, x6, x5, 9 ); \
|
|
x8 = ARX( x8, xb, xa, 9 ); \
|
|
xd = ARX( xd, xc, xf, 9 ); \
|
|
x3 = ARX( x3, x2, x1, 13 ); \
|
|
x4 = ARX( x4, x7, x6, 13 ); \
|
|
x9 = ARX( x9, x8, xb, 13 ); \
|
|
xe = ARX( xe, xd, xc, 13 ); \
|
|
x0 = ARX( x0, x3, x2, 18 ); \
|
|
x5 = ARX( x5, x4, x7, 18 ); \
|
|
xa = ARX( xa, x9, x8, 18 ); \
|
|
xf = ARX( xf, xe, xd, 18 );
|
|
|
|
#define SALSA_2ROUNDS SALSA_COLUMN; SALSA_ROW;
|
|
|
|
#define SALSA_8ROUNDS \
|
|
SALSA_2ROUNDS; SALSA_2ROUNDS; SALSA_2ROUNDS; SALSA_2ROUNDS;
|
|
|
|
|
|
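// As an illustrative sketch only (not part of the build, the function name
// is hypothetical): the reference macros above can be instantiated with
// plain scalar targets to produce a straightforward xor_salsa8; x0..xf are
// the implied arguments listed above:
/*
   static void xor_salsa8_ref( uint32_t * const B, const uint32_t * const C )
   {
      #define ROL32( a, n )  ( ( (a) << (n) ) | ( (a) >> ( 32 - (n) ) ) )
      #define ADD32( a, b )  ( (uint32_t)( (a) + (b) ) )
      #define XOR( a, b )    ( (a) ^ (b) )

      uint32_t x0 = B[ 0] ^= C[ 0],  x1 = B[ 1] ^= C[ 1],
               x2 = B[ 2] ^= C[ 2],  x3 = B[ 3] ^= C[ 3],
               x4 = B[ 4] ^= C[ 4],  x5 = B[ 5] ^= C[ 5],
               x6 = B[ 6] ^= C[ 6],  x7 = B[ 7] ^= C[ 7],
               x8 = B[ 8] ^= C[ 8],  x9 = B[ 9] ^= C[ 9],
               xa = B[10] ^= C[10],  xb = B[11] ^= C[11],
               xc = B[12] ^= C[12],  xd = B[13] ^= C[13],
               xe = B[14] ^= C[14],  xf = B[15] ^= C[15];

      SALSA_8ROUNDS;

      B[ 0] += x0;  B[ 1] += x1;  B[ 2] += x2;  B[ 3] += x3;
      B[ 4] += x4;  B[ 5] += x5;  B[ 6] += x6;  B[ 7] += x7;
      B[ 8] += x8;  B[ 9] += x9;  B[10] += xa;  B[11] += xb;
      B[12] += xc;  B[13] += xd;  B[14] += xe;  B[15] += xf;

      #undef ROL32
      #undef ADD32
      #undef XOR
   }
*/
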
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
|
|
|
// Tested OK but very slow
|
|
// 16 way parallel, requires 16x32 interleaving
|
|
static void xor_salsa8_16way( __m512i * const B, const __m512i * const C)
|
|
{
|
|
__m512i x0 = B[ 0] = _mm512_xor_si512( B[ 0], C[ 0] );
|
|
__m512i x1 = B[ 1] = _mm512_xor_si512( B[ 1], C[ 1] );
|
|
__m512i x2 = B[ 2] = _mm512_xor_si512( B[ 2], C[ 2] );
|
|
__m512i x3 = B[ 3] = _mm512_xor_si512( B[ 3], C[ 3] );
|
|
__m512i x4 = B[ 4] = _mm512_xor_si512( B[ 4], C[ 4] );
|
|
__m512i x5 = B[ 5] = _mm512_xor_si512( B[ 5], C[ 5] );
|
|
__m512i x6 = B[ 6] = _mm512_xor_si512( B[ 6], C[ 6] );
|
|
__m512i x7 = B[ 7] = _mm512_xor_si512( B[ 7], C[ 7] );
|
|
__m512i x8 = B[ 8] = _mm512_xor_si512( B[ 8], C[ 8] );
|
|
__m512i x9 = B[ 9] = _mm512_xor_si512( B[ 9], C[ 9] );
|
|
__m512i xa = B[10] = _mm512_xor_si512( B[10], C[10] );
|
|
__m512i xb = B[11] = _mm512_xor_si512( B[11], C[11] );
|
|
__m512i xc = B[12] = _mm512_xor_si512( B[12], C[12] );
|
|
__m512i xd = B[13] = _mm512_xor_si512( B[13], C[13] );
|
|
__m512i xe = B[14] = _mm512_xor_si512( B[14], C[14] );
|
|
__m512i xf = B[15] = _mm512_xor_si512( B[15], C[15] );
|
|
|
|
#define ROL32 _mm512_rol_epi32
|
|
#define ADD32 _mm512_add_epi32
|
|
#define XOR _mm512_xor_si512
|
|
|
|
SALSA_8ROUNDS;
|
|
|
|
#undef ROL32
|
|
#undef ADD32
|
|
#undef XOR
|
|
|
|
B[ 0] = _mm512_add_epi32( B[ 0], x0 );
|
|
B[ 1] = _mm512_add_epi32( B[ 1], x1 );
|
|
B[ 2] = _mm512_add_epi32( B[ 2], x2 );
|
|
B[ 3] = _mm512_add_epi32( B[ 3], x3 );
|
|
B[ 4] = _mm512_add_epi32( B[ 4], x4 );
|
|
B[ 5] = _mm512_add_epi32( B[ 5], x5 );
|
|
B[ 6] = _mm512_add_epi32( B[ 6], x6 );
|
|
B[ 7] = _mm512_add_epi32( B[ 7], x7 );
|
|
B[ 8] = _mm512_add_epi32( B[ 8], x8 );
|
|
B[ 9] = _mm512_add_epi32( B[ 9], x9 );
|
|
B[10] = _mm512_add_epi32( B[10], xa );
|
|
B[11] = _mm512_add_epi32( B[11], xb );
|
|
B[12] = _mm512_add_epi32( B[12], xc );
|
|
B[13] = _mm512_add_epi32( B[13], xd );
|
|
B[14] = _mm512_add_epi32( B[14], xe );
|
|
B[15] = _mm512_add_epi32( B[15], xf );
|
|
}
|
|
|
|
void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N )
|
|
{
|
|
for ( int n = 0; n < N; n++ )
|
|
{
|
|
memcpy( &V[n * 32], X, 128*16 );
|
|
xor_salsa8_16way( &X[ 0], &X[16] );
|
|
xor_salsa8_16way( &X[16], &X[ 0] );
|
|
}
|
|
for ( int n = 0; n < N; n++ )
|
|
{
|
|
m512_ovly *vptr[16]; // pointer to V offset for each lane
|
|
m512_ovly *x16 = (m512_ovly*)(&X[16]);
|
|
|
|
// create pointers to V for each lane using data from each lane of X[16]
|
|
// as index.
|
|
for ( int l = 0; l < 16; l++ )
|
|
{
|
|
uint32_t xl = (*x16).u32[l];
|
|
vptr[l] = (m512_ovly*)( &V[ 32 * ( xl & ( N-1 ) ) ] );
|
|
}
|
|
|
|
for ( int i = 0; i < 32; i++ )
|
|
{
|
|
m512_ovly v; // V value assembled from different indexes
|
|
for ( int l = 0; l < 16; l++ )
|
|
v.u32[l] = ( *(vptr[l] +i ) ) .u32[l];
|
|
X[i] = _mm512_xor_si512( X[i], v.m512 );
|
|
}
|
|
|
|
xor_salsa8_16way( &X[ 0], &X[16] );
|
|
xor_salsa8_16way( &X[16], &X[ 0] );
|
|
}
|
|
}
|
|
|
|
// Working, not up to date, needs stream, shuffle optimizations.
|
|
// 4x32 interleaving
|
|
static void salsa8_simd128_4way( __m128i *b, const __m128i *c )
|
|
{
|
|
__m512i X0, X1, X2, X3, Y0, Y1, Y2, Y3;
|
|
__m512i *B = (__m512i*)b;
|
|
const __m512i *C = (const __m512i*)c;
|
|
|
|
// mix C into B then shuffle B into X
|
|
B[0] = _mm512_xor_si512( B[0], C[0] );
|
|
B[1] = _mm512_xor_si512( B[1], C[1] );
|
|
B[2] = _mm512_xor_si512( B[2], C[2] );
|
|
B[3] = _mm512_xor_si512( B[3], C[3] );
|
|
|
|
Y0 = _mm512_mask_blend_epi64( 0x03, B[1], B[0] );
|
|
X0 = _mm512_mask_blend_epi64( 0x30, B[3], B[2] );
|
|
X0 = _mm512_mask_blend_epi64( 0x0f, X0, Y0 );
|
|
|
|
Y0 = _mm512_mask_blend_epi64( 0x03, B[2], B[1] );
|
|
X1 = _mm512_mask_blend_epi64( 0x30, B[0], B[3] );
|
|
X1 = _mm512_mask_blend_epi64( 0x0f, X1, Y0 );
|
|
|
|
Y0 = _mm512_mask_blend_epi64( 0x03, B[3], B[2] );
|
|
X2 = _mm512_mask_blend_epi64( 0x30, B[1], B[0] );
|
|
X2 = _mm512_mask_blend_epi64( 0x0f, X2, Y0 );
|
|
|
|
Y0 = _mm512_mask_blend_epi64( 0x03, B[0], B[3] );
|
|
X3 = _mm512_mask_blend_epi64( 0x30, B[2], B[1] );
|
|
X3 = _mm512_mask_blend_epi64( 0x0f, X3, Y0 );
|
|
|
|
// define targets for macros used in round function template
|
|
#define ROL_1X32 mm512_shufll_128
|
|
#define ROR_1X32 mm512_shuflr_128
|
|
#define SWAP_64 mm512_swap_256
|
|
#define ROL32 _mm512_rol_epi32
|
|
#define ADD32 _mm512_add_epi32
|
|
#define XOR _mm512_xor_si512
|
|
|
|
SALSA_8ROUNDS_SIMD128;
|
|
|
|
#undef ROL_1X32
|
|
#undef ROR_1X32
|
|
#undef SWAP_64
|
|
#undef ROL32
|
|
#undef ADD32
|
|
#undef XOR
|
|
|
|
Y0 = _mm512_mask_blend_epi64( 0xc0, X0, X1 );
|
|
Y1 = _mm512_mask_blend_epi64( 0x03, X0, X1 );
|
|
Y2 = _mm512_mask_blend_epi64( 0x0c, X0, X1 );
|
|
Y3 = _mm512_mask_blend_epi64( 0x30, X0, X1 );
|
|
|
|
Y0 = _mm512_mask_blend_epi64( 0x30, Y0, X2 );
|
|
Y1 = _mm512_mask_blend_epi64( 0xc0, Y1, X2 );
|
|
Y2 = _mm512_mask_blend_epi64( 0x03, Y2, X2 );
|
|
Y3 = _mm512_mask_blend_epi64( 0x0c, Y3, X2 );
|
|
|
|
Y0 = _mm512_mask_blend_epi64( 0x0c, Y0, X3 );
|
|
Y1 = _mm512_mask_blend_epi64( 0x30, Y1, X3 );
|
|
Y2 = _mm512_mask_blend_epi64( 0xc0, Y2, X3 );
|
|
Y3 = _mm512_mask_blend_epi64( 0x03, Y3, X3 );
|
|
|
|
B[0] = _mm512_add_epi32( B[0], Y0 );
|
|
B[1] = _mm512_add_epi32( B[1], Y1 );
|
|
B[2] = _mm512_add_epi32( B[2], Y2 );
|
|
B[3] = _mm512_add_epi32( B[3], Y3 );
|
|
}
|
|
|
|
// data format for 512 bits: 4 * ( 4 way 32 )
|
|
// { l3d3, l2d3, l1d3, l0d3, l3d2, l2d2, l1d2, l0d2,
|
|
// l3d1, l2d1, l1d1, l0d1, l3d0, l2d0, l1d0, l0d0 }
|
|
|
|
void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N )
|
|
{
|
|
for ( int n = 0; n < N; n++ )
|
|
{
|
|
memcpy( &V[n * 32], X, 4*128 );
|
|
salsa8_simd128_4way( &X[ 0], &X[16] );
|
|
salsa8_simd128_4way( &X[16], &X[ 0] );
|
|
}
|
|
|
|
for ( int n = 0; n < N; n++ )
|
|
{
|
|
uint32_t x16[4]; // index into V for each lane
|
|
memcpy( x16, &X[16], 16 );
|
|
x16[0] = 32 * ( x16[0] & ( N-1) );
|
|
x16[1] = 32 * ( x16[1] & ( N-1) );
|
|
x16[2] = 32 * ( x16[2] & ( N-1) );
|
|
x16[3] = 32 * ( x16[3] & ( N-1) );
|
|
m128_ovly *v = (m128_ovly*)V;
|
|
|
|
for( int i = 0; i < 32; i++ )
|
|
{
|
|
X[i] = _mm_xor_si128( X[i], _mm_set_epi32( v[ x16[3] + i ].u32[3],
|
|
v[ x16[2] + i ].u32[2],
|
|
v[ x16[1] + i ].u32[1],
|
|
v[ x16[0] + i ].u32[0] ) );
|
|
}
|
|
|
|
salsa8_simd128_4way( &X[ 0], &X[16] );
|
|
salsa8_simd128_4way( &X[16], &X[ 0] );
|
|
}
|
|
}
|
|
|
|
// 4x memory usage
|
|
// Working
|
|
// 4x128 interleaving
|
|
static inline void salsa_shuffle_4way_simd128( __m512i *X )
|
|
{
|
|
__m512i t0 = _mm512_mask_blend_epi32( 0xaaaa, X[0], X[1] );
|
|
__m512i t1 = _mm512_mask_blend_epi32( 0x5555, X[0], X[1] );
|
|
__m512i t2 = _mm512_mask_blend_epi32( 0xaaaa, X[2], X[3] );
|
|
__m512i t3 = _mm512_mask_blend_epi32( 0x5555, X[2], X[3] );
|
|
X[0] = _mm512_mask_blend_epi32( 0xcccc, t0, t2 );
|
|
X[1] = _mm512_mask_blend_epi32( 0x6666, t1, t3 );
|
|
X[2] = _mm512_mask_blend_epi32( 0x3333, t0, t2 );
|
|
X[3] = _mm512_mask_blend_epi32( 0x9999, t1, t3 );
|
|
}
|
|
|
|
static inline void salsa_unshuffle_4way_simd128( __m512i *X )
|
|
{
|
|
__m512i t0 = _mm512_mask_blend_epi32( 0xcccc, X[0], X[2] );
|
|
__m512i t1 = _mm512_mask_blend_epi32( 0x3333, X[0], X[2] );
|
|
__m512i t2 = _mm512_mask_blend_epi32( 0x6666, X[1], X[3] );
|
|
__m512i t3 = _mm512_mask_blend_epi32( 0x9999, X[1], X[3] );
|
|
X[0] = _mm512_mask_blend_epi32( 0xaaaa, t0, t2 );
|
|
X[1] = _mm512_mask_blend_epi32( 0x5555, t0, t2 );
|
|
X[2] = _mm512_mask_blend_epi32( 0xaaaa, t1, t3 );
|
|
X[3] = _mm512_mask_blend_epi32( 0x5555, t1, t3 );
|
|
}
|
|
|
|
static void salsa8_4way_simd128( __m512i * const B, const __m512i * const C)
|
|
{
|
|
__m512i X0, X1, X2, X3;
|
|
|
|
X0 = B[0] = _mm512_xor_si512( B[0], C[0] );
|
|
X1 = B[1] = _mm512_xor_si512( B[1], C[1] );
|
|
X2 = B[2] = _mm512_xor_si512( B[2], C[2] );
|
|
X3 = B[3] = _mm512_xor_si512( B[3], C[3] );
|
|
|
|
#define ROL_1X32 mm512_shufll128_32 // shuffle within 128 bit lanes
|
|
#define ROR_1X32 mm512_shuflr128_32
|
|
#define SWAP_64 mm512_swap128_64
|
|
#define ROL32 _mm512_rol_epi32
|
|
#define ADD32 _mm512_add_epi32
|
|
#define XOR _mm512_xor_si512
|
|
|
|
SALSA_8ROUNDS_SIMD128;
|
|
|
|
#undef ROL_1X32
|
|
#undef ROR_1X32
|
|
#undef SWAP_64
|
|
#undef ROL32
|
|
#undef ADD32
|
|
#undef XOR
|
|
|
|
B[0] = _mm512_add_epi32( B[0], X0 );
|
|
B[1] = _mm512_add_epi32( B[1], X1 );
|
|
B[2] = _mm512_add_epi32( B[2], X2 );
|
|
B[3] = _mm512_add_epi32( B[3], X3 );
|
|
}
|
|
|
|
void scrypt_core_4way_simd128( __m512i *X, __m512i *V, const uint32_t N )
|
|
{
|
|
salsa_shuffle_4way_simd128( X );
|
|
salsa_shuffle_4way_simd128( X+4 );
|
|
|
|
for ( int n = 0; n < N; n++ )
|
|
{
|
|
memcpy( &V[n * 8], X, 128*4 );
|
|
salsa8_4way_simd128( &X[0], &X[4] );
|
|
salsa8_4way_simd128( &X[4], &X[0] );
|
|
}
|
|
|
|
for ( int n = 0; n < N; n++ )
|
|
{
|
|
m512_ovly x16;
|
|
x16 = ( (m512_ovly*)X )[4];
|
|
uint32_t j0 = 8 * ( x16.u32[ 0] & ( N-1 ) );
|
|
uint32_t j1 = 8 * ( x16.u32[ 4] & ( N-1 ) );
|
|
uint32_t j2 = 8 * ( x16.u32[ 8] & ( N-1 ) );
|
|
uint32_t j3 = 8 * ( x16.u32[12] & ( N-1 ) );
|
|
|
|
for ( int i = 0; i < 8; i++ )
|
|
{
|
|
__m512i v10 = _mm512_mask_blend_epi32( 0x000f, V[ j1+i ], V[ j0+i ] );
|
|
__m512i v32 = _mm512_mask_blend_epi32( 0x0f00, V[ j3+i ], V[ j2+i ] );
|
|
X[i] = _mm512_xor_si512( X[i], _mm512_mask_blend_epi32( 0x00ff,
|
|
v32, v10 ) );
|
|
}
|
|
|
|
salsa8_4way_simd128( &X[0], &X[4] );
|
|
salsa8_4way_simd128( &X[4], &X[0] );
|
|
}
|
|
|
|
salsa_unshuffle_4way_simd128( X );
|
|
salsa_unshuffle_4way_simd128( X+4 );
|
|
}
|
|
|
|
#endif // AVX512
|
|
|
|
#if defined(__AVX2__)
|
|
|
|
// 8x memory usage
|
|
// Tested OK but slow scrypt, very slow scryptn2, 2x4way is faster
|
|
// Crashes with large N & many threads, OOM? Use only for scrypt
|
|
// 8x32 interleaving
|
|
static void salsa8_8way( __m256i * const B, const __m256i * const C )
|
|
{
|
|
__m256i x0 = B[ 0] = _mm256_xor_si256( B[ 0], C[ 0] );
|
|
__m256i x1 = B[ 1] = _mm256_xor_si256( B[ 1], C[ 1] );
|
|
__m256i x2 = B[ 2] = _mm256_xor_si256( B[ 2], C[ 2] );
|
|
__m256i x3 = B[ 3] = _mm256_xor_si256( B[ 3], C[ 3] );
|
|
__m256i x4 = B[ 4] = _mm256_xor_si256( B[ 4], C[ 4] );
|
|
__m256i x5 = B[ 5] = _mm256_xor_si256( B[ 5], C[ 5] );
|
|
__m256i x6 = B[ 6] = _mm256_xor_si256( B[ 6], C[ 6] );
|
|
__m256i x7 = B[ 7] = _mm256_xor_si256( B[ 7], C[ 7] );
|
|
__m256i x8 = B[ 8] = _mm256_xor_si256( B[ 8], C[ 8] );
|
|
__m256i x9 = B[ 9] = _mm256_xor_si256( B[ 9], C[ 9] );
|
|
__m256i xa = B[10] = _mm256_xor_si256( B[10], C[10] );
|
|
__m256i xb = B[11] = _mm256_xor_si256( B[11], C[11] );
|
|
__m256i xc = B[12] = _mm256_xor_si256( B[12], C[12] );
|
|
__m256i xd = B[13] = _mm256_xor_si256( B[13], C[13] );
|
|
__m256i xe = B[14] = _mm256_xor_si256( B[14], C[14] );
|
|
__m256i xf = B[15] = _mm256_xor_si256( B[15], C[15] );
|
|
|
|
#define ROL32 mm256_rol_32
|
|
#define ADD32 _mm256_add_epi32
|
|
#define XOR _mm256_xor_si256
|
|
|
|
SALSA_8ROUNDS;
|
|
|
|
#undef ROL32
|
|
#undef ADD32
|
|
#undef XOR
|
|
|
|
B[ 0] = _mm256_add_epi32( B[ 0], x0 );
|
|
B[ 1] = _mm256_add_epi32( B[ 1], x1 );
|
|
B[ 2] = _mm256_add_epi32( B[ 2], x2 );
|
|
B[ 3] = _mm256_add_epi32( B[ 3], x3 );
|
|
B[ 4] = _mm256_add_epi32( B[ 4], x4 );
|
|
B[ 5] = _mm256_add_epi32( B[ 5], x5 );
|
|
B[ 6] = _mm256_add_epi32( B[ 6], x6 );
|
|
B[ 7] = _mm256_add_epi32( B[ 7], x7 );
|
|
B[ 8] = _mm256_add_epi32( B[ 8], x8 );
|
|
B[ 9] = _mm256_add_epi32( B[ 9], x9 );
|
|
B[10] = _mm256_add_epi32( B[10], xa );
|
|
B[11] = _mm256_add_epi32( B[11], xb );
|
|
B[12] = _mm256_add_epi32( B[12], xc );
|
|
B[13] = _mm256_add_epi32( B[13], xd );
|
|
B[14] = _mm256_add_epi32( B[14], xe );
|
|
B[15] = _mm256_add_epi32( B[15], xf );
|
|
}
|
|
|
|
void scrypt_core_8way( __m256i *X, __m256i *V, const uint32_t N )
|
|
{
|
|
for ( int n = 0; n < N; n++ )
|
|
{
|
|
memcpy( &V[n * 32], X, 128*8 );
|
|
salsa8_8way( &X[ 0], &X[16] );
|
|
salsa8_8way( &X[16], &X[ 0] );
|
|
}
|
|
|
|
for ( int n = 0; n < N; n++ )
|
|
{
|
|
m256_ovly *vptr[8]; // pointer to V offset for each lane
|
|
m256_ovly *x16 = (m256_ovly*)(&X[16]);
|
|
|
|
// create pointers to V for each lane using data from each lane of X[16]
|
|
// as index.
|
|
for ( int l = 0; l < 8; l++ )
|
|
{
|
|
uint32_t xl = (*x16).u32[l];
|
|
vptr[l] = (m256_ovly*)( &V[ 32 * ( xl & ( N-1 ) ) ] );
|
|
}
|
|
|
|
for ( int i = 0; i < 32; i++ )
|
|
{
|
|
m256_ovly v; // V value assembled from different indexes
|
|
for ( int l = 0; l < 8; l++ )
|
|
v.u32[l] = ( *(vptr[l] +i ) ) .u32[l];
|
|
X[i] = _mm256_xor_si256( X[i], v.m256 );
|
|
}
|
|
|
|
salsa8_8way( &X[ 0], &X[16] );
|
|
salsa8_8way( &X[16], &X[ 0] );
|
|
}
|
|
}
|
|
|
|
// 2x memory usage
|
|
// Working
|
|
// Essentially Pooler 6way
|
|
// 2x128 interleaved simd128
|
|
// ------- lane 1 ------- ------- lane 0 -------
|
|
// { l1x3, l1x2, l1x1, l1x0, l0x3, l0x2, l0x1, l0x0 } b[3] B[ 7: 0]
|
|
// { l1x7, l1x6, l1x5, l1x4, l0x7, l0x6, l0x5, l0x4 } b[2] B[15: 8]
|
|
// { l1xb, l1xa, l1x9, l1x8, l0xb, l0xa, l0x9, l0x8 } b[1] B[23:16]
|
|
// { l1xf, l1xe, l1xd, l1xc, l0xf, l0xe, l0xd, l0xc } b[0] B[31:24]
|
|
|
|
static inline void salsa_shuffle_2way_simd128( __m256i *X )
|
|
{
|
|
__m256i t0 = _mm256_blend_epi32( X[0], X[1], 0xaa );
|
|
__m256i t1 = _mm256_blend_epi32( X[0], X[1], 0x55 );
|
|
__m256i t2 = _mm256_blend_epi32( X[2], X[3], 0xaa );
|
|
__m256i t3 = _mm256_blend_epi32( X[2], X[3], 0x55 );
|
|
X[0] = _mm256_blend_epi32( t0, t2, 0xcc );
|
|
X[1] = _mm256_blend_epi32( t1, t3, 0x66 );
|
|
X[2] = _mm256_blend_epi32( t0, t2, 0x33 );
|
|
X[3] = _mm256_blend_epi32( t1, t3, 0x99 );
|
|
}
|
|
|
|
static inline void salsa_unshuffle_2way_simd128( __m256i *X )
|
|
{
|
|
__m256i t0 = _mm256_blend_epi32( X[0], X[2], 0xcc );
|
|
__m256i t1 = _mm256_blend_epi32( X[0], X[2], 0x33 );
|
|
__m256i t2 = _mm256_blend_epi32( X[1], X[3], 0x66 );
|
|
__m256i t3 = _mm256_blend_epi32( X[1], X[3], 0x99 );
|
|
X[0] = _mm256_blend_epi32( t0, t2, 0xaa );
|
|
X[1] = _mm256_blend_epi32( t0, t2, 0x55 );
|
|
X[2] = _mm256_blend_epi32( t1, t3, 0xaa );
|
|
X[3] = _mm256_blend_epi32( t1, t3, 0x55 );
|
|
}
|
|
|
|
static void salsa8_2way_simd128( __m256i * const B, const __m256i * const C)
|
|
{
|
|
__m256i X0, X1, X2, X3;
|
|
|
|
X0 = B[0] = _mm256_xor_si256( B[0], C[0] );
|
|
X1 = B[1] = _mm256_xor_si256( B[1], C[1] );
|
|
X2 = B[2] = _mm256_xor_si256( B[2], C[2] );
|
|
X3 = B[3] = _mm256_xor_si256( B[3], C[3] );
|
|
|
|
// define targets for macros used in round function template
|
|
#define ROL_1X32 mm256_shufll128_32 // shuffle within 128 bit lanes
|
|
#define ROR_1X32 mm256_shuflr128_32
|
|
#define SWAP_64 mm256_swap128_64
|
|
#define ROL32 mm256_rol_32
|
|
#define ADD32 _mm256_add_epi32
|
|
#define XOR _mm256_xor_si256
|
|
|
|
SALSA_8ROUNDS_SIMD128;
|
|
|
|
#undef ROL_1X32
|
|
#undef ROR_1X32
|
|
#undef SWAP_64
|
|
#undef ROL32
|
|
#undef ADD32
|
|
#undef XOR
|
|
|
|
B[0] = _mm256_add_epi32( B[0], X0 );
|
|
B[1] = _mm256_add_epi32( B[1], X1 );
|
|
B[2] = _mm256_add_epi32( B[2], X2 );
|
|
B[3] = _mm256_add_epi32( B[3], X3 );
|
|
}
|
|
|
|
void scrypt_core_2way_simd128( __m256i *X, __m256i *V, const uint32_t N )
|
|
{
|
|
salsa_shuffle_2way_simd128( X );
|
|
salsa_shuffle_2way_simd128( X+4 );
|
|
|
|
for ( int n = 0; n < N; n++ )
|
|
{
|
|
memcpy( &V[n * 8], X, 128*2 );
|
|
salsa8_2way_simd128( &X[0], &X[4] );
|
|
salsa8_2way_simd128( &X[4], &X[0] );
|
|
}
|
|
|
|
for ( int n = 0; n < N; n++ )
|
|
{
|
|
m256_ovly x16;
|
|
x16 = ( (m256_ovly*)X )[4];
|
|
uint32_t j0 = 8 * ( x16.u32[0] & ( N-1 ) );
|
|
uint32_t j1 = 8 * ( x16.u32[4] & ( N-1 ) );
|
|
|
|
for ( int i = 0; i < 8; i++ )
|
|
X[i] = _mm256_xor_si256( X[i], _mm256_blend_epi32( V[ j1+i ],
|
|
V[ j0+i ], 0x0f ) );
|
|
|
|
salsa8_2way_simd128( &X[0], &X[4] );
|
|
salsa8_2way_simd128( &X[4], &X[0] );
|
|
}
|
|
|
|
salsa_unshuffle_2way_simd128( X );
|
|
salsa_unshuffle_2way_simd128( X+4 );
|
|
}
|
|
|
|
// Working
|
|
// 2x128 interleaving
|
|
static void salsa8_2way_simd128_2buf( __m256i * const BA, __m256i * const BB,
|
|
const __m256i * const CA, const __m256i * const CB )
|
|
{
|
|
__m256i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3;
|
|
__m256i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3;
|
|
|
|
// mix C into B then shuffle B into X
|
|
BA[0] = _mm256_xor_si256( BA[0], CA[0] );
|
|
BB[0] = _mm256_xor_si256( BB[0], CB[0] );
|
|
BA[1] = _mm256_xor_si256( BA[1], CA[1] );
|
|
BB[1] = _mm256_xor_si256( BB[1], CB[1] );
|
|
BA[2] = _mm256_xor_si256( BA[2], CA[2] );
|
|
BB[2] = _mm256_xor_si256( BB[2], CB[2] );
|
|
BA[3] = _mm256_xor_si256( BA[3], CA[3] );
|
|
BB[3] = _mm256_xor_si256( BB[3], CB[3] );
|
|
|
|
YA0 = _mm256_blend_epi32( BA[1], BA[0], 0x11 );
|
|
YB0 = _mm256_blend_epi32( BB[1], BB[0], 0x11 );
|
|
XA0 = _mm256_blend_epi32( BA[3], BA[2], 0x44 );
|
|
XB0 = _mm256_blend_epi32( BB[3], BB[2], 0x44 );
|
|
XA0 = _mm256_blend_epi32( XA0, YA0, 0x33);
|
|
XB0 = _mm256_blend_epi32( XB0, YB0, 0x33);
|
|
|
|
YA0 = _mm256_blend_epi32( BA[2], BA[1], 0x11 );
|
|
YB0 = _mm256_blend_epi32( BB[2], BB[1], 0x11 );
|
|
XA1 = _mm256_blend_epi32( BA[0], BA[3], 0x44 );
|
|
XB1 = _mm256_blend_epi32( BB[0], BB[3], 0x44 );
|
|
XA1 = _mm256_blend_epi32( XA1, YA0, 0x33 );
|
|
XB1 = _mm256_blend_epi32( XB1, YB0, 0x33 );
|
|
|
|
YA0 = _mm256_blend_epi32( BA[3], BA[2], 0x11 );
|
|
YB0 = _mm256_blend_epi32( BB[3], BB[2], 0x11 );
|
|
XA2 = _mm256_blend_epi32( BA[1], BA[0], 0x44 );
|
|
XB2 = _mm256_blend_epi32( BB[1], BB[0], 0x44 );
|
|
XA2 = _mm256_blend_epi32( XA2, YA0, 0x33 );
|
|
XB2 = _mm256_blend_epi32( XB2, YB0, 0x33 );
|
|
|
|
YA0 = _mm256_blend_epi32( BA[0], BA[3], 0x11 );
|
|
YB0 = _mm256_blend_epi32( BB[0], BB[3], 0x11 );
|
|
XA3 = _mm256_blend_epi32( BA[2], BA[1], 0x44 );
|
|
XB3 = _mm256_blend_epi32( BB[2], BB[1], 0x44 );
|
|
XA3 = _mm256_blend_epi32( XA3, YA0, 0x33 );
|
|
XB3 = _mm256_blend_epi32( XB3, YB0, 0x33 );
|
|
|
|
// define targets for macros used in round function template
|
|
#define ROL_1X32 mm256_shufll128_32 // shuffle within 128 bit lanes
|
|
#define ROR_1X32 mm256_shuflr128_32
|
|
#define SWAP_64 mm256_swap128_64
|
|
#define ROL32 mm256_rol_32
|
|
#define ADD32 _mm256_add_epi32
|
|
#define XOR _mm256_xor_si256
|
|
#define TYPE __m256i
|
|
|
|
SALSA_8ROUNDS_SIMD128_2BUF;
|
|
|
|
#undef ROL_1X32
|
|
#undef ROR_1X32
|
|
#undef SWAP_64
|
|
#undef ROL32
|
|
#undef ADD32
|
|
#undef XOR
|
|
#undef TYPE
|
|
|
|
YA0 = _mm256_blend_epi32( XA0, XA1, 0x88 );
|
|
YB0 = _mm256_blend_epi32( XB0, XB1, 0x88 );
|
|
YA1 = _mm256_blend_epi32( XA0, XA1, 0x11 );
|
|
YB1 = _mm256_blend_epi32( XB0, XB1, 0x11 );
|
|
YA2 = _mm256_blend_epi32( XA0, XA1, 0x22 );
|
|
YB2 = _mm256_blend_epi32( XB0, XB1, 0x22 );
|
|
YA3 = _mm256_blend_epi32( XA0, XA1, 0x44 );
|
|
YB3 = _mm256_blend_epi32( XB0, XB1, 0x44 );
|
|
|
|
YA0 = _mm256_blend_epi32( YA0, XA2, 0x44 );
|
|
YB0 = _mm256_blend_epi32( YB0, XB2, 0x44 );
|
|
YA1 = _mm256_blend_epi32( YA1, XA2, 0x88 );
|
|
YB1 = _mm256_blend_epi32( YB1, XB2, 0x88 );
|
|
YA2 = _mm256_blend_epi32( YA2, XA2, 0x11 );
|
|
YB2 = _mm256_blend_epi32( YB2, XB2, 0x11 );
|
|
YA3 = _mm256_blend_epi32( YA3, XA2, 0x22 );
|
|
YB3 = _mm256_blend_epi32( YB3, XB2, 0x22 );
|
|
|
|
YA0 = _mm256_blend_epi32( YA0, XA3, 0x22 );
|
|
YB0 = _mm256_blend_epi32( YB0, XB3, 0x22 );
|
|
YA1 = _mm256_blend_epi32( YA1, XA3, 0x44 );
|
|
YB1 = _mm256_blend_epi32( YB1, XB3, 0x44 );
|
|
YA2 = _mm256_blend_epi32( YA2, XA3, 0x88 );
|
|
YB2 = _mm256_blend_epi32( YB2, XB3, 0x88 );
|
|
YA3 = _mm256_blend_epi32( YA3, XA3, 0x11 );
|
|
YB3 = _mm256_blend_epi32( YB3, XB3, 0x11 );
|
|
|
|
BA[0] = _mm256_add_epi32( BA[0], YA0 );
|
|
BB[0] = _mm256_add_epi32( BB[0], YB0 );
|
|
BA[1] = _mm256_add_epi32( BA[1], YA1 );
|
|
BB[1] = _mm256_add_epi32( BB[1], YB1 );
|
|
BA[2] = _mm256_add_epi32( BA[2], YA2 );
|
|
BB[2] = _mm256_add_epi32( BB[2], YB2 );
|
|
BA[3] = _mm256_add_epi32( BA[3], YA3 );
|
|
BB[3] = _mm256_add_epi32( BB[3], YB3 );
|
|
|
|
}
|
|
|
|
void scrypt_core_2way_simd128_2buf( __m256i *X, __m256i *V, const uint32_t N )
|
|
{
|
|
__m256i *X0 = X;
|
|
__m256i *X1 = X + 8;
|
|
__m256i *V0 = V;
|
|
__m256i *V1 = V + 8*N;
|
|
|
|
for ( int n = 0; n < N; n++ )
|
|
{
|
|
for ( int i = 0; i < 8; i++ )
|
|
{
|
|
_mm256_stream_si256( V0 + n*8 + i, X0[i] );
|
|
_mm256_stream_si256( V1 + n*8 + i, X1[i] );
|
|
}
|
|
salsa8_2way_simd128_2buf( &X0[0], &X1[0], &X0[4], &X1[4] );
|
|
salsa8_2way_simd128_2buf( &X0[4], &X1[4], &X0[0], &X1[0] );
|
|
}
|
|
for ( int n = 0; n < N; n++ )
|
|
{
|
|
const m256_ovly x16a = ( (m256_ovly*)X0 )[4];
|
|
const m256_ovly x16b = ( (m256_ovly*)X1 )[4];
|
|
|
|
const uint32_t j0a = 8 * ( x16a.u32[0] & ( N-1 ) );
|
|
const uint32_t j0b = 8 * ( x16b.u32[0] & ( N-1 ) );
|
|
const uint32_t j1a = 8 * ( x16a.u32[4] & ( N-1 ) );
|
|
const uint32_t j1b = 8 * ( x16b.u32[4] & ( N-1 ) );
|
|
|
|
for ( int i = 0; i < 8; i++ )
|
|
{
|
|
const __m256i V0j0a = _mm256_stream_load_si256( V0 + j0a + i );
|
|
const __m256i V0j1a = _mm256_stream_load_si256( V0 + j1a + i );
|
|
const __m256i V1j0b = _mm256_stream_load_si256( V1 + j0b + i );
|
|
const __m256i V1j1b = _mm256_stream_load_si256( V1 + j1b + i );
|
|
X0[i] = _mm256_xor_si256( X0[i],
|
|
_mm256_blend_epi32( V0j1a, V0j0a, 0x0f ) );
|
|
X1[i] = _mm256_xor_si256( X1[i],
|
|
_mm256_blend_epi32( V1j1b, V1j0b, 0x0f ) );
|
|
}
|
|
|
|
salsa8_2way_simd128_2buf( &X0[0], &X1[0], &X0[4], &X1[4] );
|
|
salsa8_2way_simd128_2buf( &X0[4], &X1[4], &X0[0], &X1[0] );
|
|
}
|
|
}
|
|
|
|
// Triple buffered, not up to date, needs stream optimization
|
|
// 2x128 interleaving
|
|
static void salsa8_2way_simd128_3buf( __m256i * const BA, __m256i * const BB,
|
|
__m256i * const BC, const __m256i * const CA, const __m256i * const CB,
|
|
const __m256i * const CC )
|
|
{
|
|
__m256i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, XC0, XC1, XC2, XC3;
|
|
__m256i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3;
|
|
|
|
// mix C into B then shuffle B into X
|
|
BA[0] = _mm256_xor_si256( BA[0], CA[0] );
|
|
BB[0] = _mm256_xor_si256( BB[0], CB[0] );
|
|
BC[0] = _mm256_xor_si256( BC[0], CC[0] );
|
|
BA[1] = _mm256_xor_si256( BA[1], CA[1] );
|
|
BB[1] = _mm256_xor_si256( BB[1], CB[1] );
|
|
BC[1] = _mm256_xor_si256( BC[1], CC[1] );
|
|
BA[2] = _mm256_xor_si256( BA[2], CA[2] );
|
|
BB[2] = _mm256_xor_si256( BB[2], CB[2] );
|
|
BC[2] = _mm256_xor_si256( BC[2], CC[2] );
|
|
BA[3] = _mm256_xor_si256( BA[3], CA[3] );
|
|
BB[3] = _mm256_xor_si256( BB[3], CB[3] );
|
|
BC[3] = _mm256_xor_si256( BC[3], CC[3] );
|
|
|
|
YA0 = _mm256_blend_epi32( BA[1], BA[0], 0x11 );
|
|
YB0 = _mm256_blend_epi32( BB[1], BB[0], 0x11 );
|
|
YC0 = _mm256_blend_epi32( BC[1], BC[0], 0x11 );
|
|
XA0 = _mm256_blend_epi32( BA[3], BA[2], 0x44 );
|
|
XB0 = _mm256_blend_epi32( BB[3], BB[2], 0x44 );
|
|
XC0 = _mm256_blend_epi32( BC[3], BC[2], 0x44 );
|
|
XA0 = _mm256_blend_epi32( XA0, YA0, 0x33);
|
|
XB0 = _mm256_blend_epi32( XB0, YB0, 0x33);
|
|
XC0 = _mm256_blend_epi32( XC0, YC0, 0x33);
|
|
|
|
YA0 = _mm256_blend_epi32( BA[2], BA[1], 0x11 );
|
|
YB0 = _mm256_blend_epi32( BB[2], BB[1], 0x11 );
|
|
YC0 = _mm256_blend_epi32( BC[2], BC[1], 0x11 );
|
|
XA1 = _mm256_blend_epi32( BA[0], BA[3], 0x44 );
|
|
XB1 = _mm256_blend_epi32( BB[0], BB[3], 0x44 );
|
|
XC1 = _mm256_blend_epi32( BC[0], BC[3], 0x44 );
|
|
XA1 = _mm256_blend_epi32( XA1, YA0, 0x33 );
|
|
XB1 = _mm256_blend_epi32( XB1, YB0, 0x33 );
|
|
XC1 = _mm256_blend_epi32( XC1, YC0, 0x33 );
|
|
|
|
YA0 = _mm256_blend_epi32( BA[3], BA[2], 0x11 );
|
|
YB0 = _mm256_blend_epi32( BB[3], BB[2], 0x11 );
|
|
YC0 = _mm256_blend_epi32( BC[3], BC[2], 0x11 );
|
|
XA2 = _mm256_blend_epi32( BA[1], BA[0], 0x44 );
|
|
XB2 = _mm256_blend_epi32( BB[1], BB[0], 0x44 );
|
|
XC2 = _mm256_blend_epi32( BC[1], BC[0], 0x44 );
|
|
XA2 = _mm256_blend_epi32( XA2, YA0, 0x33 );
|
|
XB2 = _mm256_blend_epi32( XB2, YB0, 0x33 );
|
|
XC2 = _mm256_blend_epi32( XC2, YC0, 0x33 );
|
|
|
|
YA0 = _mm256_blend_epi32( BA[0], BA[3], 0x11 );
|
|
YB0 = _mm256_blend_epi32( BB[0], BB[3], 0x11 );
|
|
YC0 = _mm256_blend_epi32( BC[0], BC[3], 0x11 );
|
|
XA3 = _mm256_blend_epi32( BA[2], BA[1], 0x44 );
|
|
XB3 = _mm256_blend_epi32( BB[2], BB[1], 0x44 );
|
|
XC3 = _mm256_blend_epi32( BC[2], BC[1], 0x44 );
|
|
XA3 = _mm256_blend_epi32( XA3, YA0, 0x33 );
|
|
XB3 = _mm256_blend_epi32( XB3, YB0, 0x33 );
|
|
XC3 = _mm256_blend_epi32( XC3, YC0, 0x33 );
|
|
|
|
// define targets for macros used in round function template
|
|
#define ROL_1X32 mm256_shufll128_32 // shuffle within 128 bit lanes
|
|
#define ROR_1X32 mm256_shuflr128_32
|
|
#define SWAP_64 mm256_swap128_64
|
|
#define ROL32 mm256_rol_32
|
|
#define ADD32 _mm256_add_epi32
|
|
#define XOR _mm256_xor_si256
|
|
#define TYPE __m256i
|
|
|
|
SALSA_8ROUNDS_SIMD128_3BUF;
|
|
|
|
#undef ROL_1X32
|
|
#undef ROR_1X32
|
|
#undef SWAP_64
|
|
#undef ROL32
|
|
#undef ADD32
|
|
#undef XOR
|
|
#undef TYPE
|
|
|
|
YA0 = _mm256_blend_epi32( XA0, XA1, 0x88 );
|
|
YB0 = _mm256_blend_epi32( XB0, XB1, 0x88 );
|
|
YC0 = _mm256_blend_epi32( XC0, XC1, 0x88 );
|
|
YA1 = _mm256_blend_epi32( XA0, XA1, 0x11 );
|
|
YB1 = _mm256_blend_epi32( XB0, XB1, 0x11 );
|
|
YC1 = _mm256_blend_epi32( XC0, XC1, 0x11 );
|
|
YA2 = _mm256_blend_epi32( XA0, XA1, 0x22 );
|
|
YB2 = _mm256_blend_epi32( XB0, XB1, 0x22 );
|
|
YC2 = _mm256_blend_epi32( XC0, XC1, 0x22 );
|
|
YA3 = _mm256_blend_epi32( XA0, XA1, 0x44 );
|
|
YB3 = _mm256_blend_epi32( XB0, XB1, 0x44 );
|
|
YC3 = _mm256_blend_epi32( XC0, XC1, 0x44 );
|
|
|
|
YA0 = _mm256_blend_epi32( YA0, XA2, 0x44 );
|
|
YB0 = _mm256_blend_epi32( YB0, XB2, 0x44 );
|
|
YC0 = _mm256_blend_epi32( YC0, XC2, 0x44 );
|
|
YA1 = _mm256_blend_epi32( YA1, XA2, 0x88 );
|
|
YB1 = _mm256_blend_epi32( YB1, XB2, 0x88 );
|
|
YC1 = _mm256_blend_epi32( YC1, XC2, 0x88 );
|
|
YA2 = _mm256_blend_epi32( YA2, XA2, 0x11 );
|
|
YB2 = _mm256_blend_epi32( YB2, XB2, 0x11 );
|
|
YC2 = _mm256_blend_epi32( YC2, XC2, 0x11 );
|
|
YA3 = _mm256_blend_epi32( YA3, XA2, 0x22 );
|
|
YB3 = _mm256_blend_epi32( YB3, XB2, 0x22 );
|
|
YC3 = _mm256_blend_epi32( YC3, XC2, 0x22 );
|
|
|
|
YA0 = _mm256_blend_epi32( YA0, XA3, 0x22 );
|
|
YB0 = _mm256_blend_epi32( YB0, XB3, 0x22 );
|
|
YC0 = _mm256_blend_epi32( YC0, XC3, 0x22 );
|
|
YA1 = _mm256_blend_epi32( YA1, XA3, 0x44 );
|
|
YB1 = _mm256_blend_epi32( YB1, XB3, 0x44 );
|
|
YC1 = _mm256_blend_epi32( YC1, XC3, 0x44 );
|
|
YA2 = _mm256_blend_epi32( YA2, XA3, 0x88 );
|
|
YB2 = _mm256_blend_epi32( YB2, XB3, 0x88 );
|
|
YC2 = _mm256_blend_epi32( YC2, XC3, 0x88 );
|
|
YA3 = _mm256_blend_epi32( YA3, XA3, 0x11 );
|
|
YB3 = _mm256_blend_epi32( YB3, XB3, 0x11 );
|
|
YC3 = _mm256_blend_epi32( YC3, XC3, 0x11 );
|
|
|
|
BA[0] = _mm256_add_epi32( BA[0], YA0 );
|
|
BB[0] = _mm256_add_epi32( BB[0], YB0 );
|
|
BC[0] = _mm256_add_epi32( BC[0], YC0 );
|
|
BA[1] = _mm256_add_epi32( BA[1], YA1 );
|
|
BB[1] = _mm256_add_epi32( BB[1], YB1 );
|
|
BC[1] = _mm256_add_epi32( BC[1], YC1 );
|
|
BA[2] = _mm256_add_epi32( BA[2], YA2 );
|
|
BB[2] = _mm256_add_epi32( BB[2], YB2 );
|
|
BC[2] = _mm256_add_epi32( BC[2], YC2 );
|
|
BA[3] = _mm256_add_epi32( BA[3], YA3 );
|
|
BB[3] = _mm256_add_epi32( BB[3], YB3 );
|
|
BC[3] = _mm256_add_epi32( BC[3], YC3 );
|
|
|
|
}
|
|
|
|
void scrypt_core_2way_simd128_3buf( __m256i *X, __m256i *V, const uint32_t N )
|
|
{
|
|
__m256i *X0 = X;
|
|
__m256i *X1 = X+8;
|
|
__m256i *X2 = X+16;
|
|
__m256i *V0 = V;
|
|
__m256i *V1 = V + 8*N;
|
|
__m256i *V2 = V + 16*N;
|
|
|
|
for ( int n = 0; n < N; n++ )
|
|
{
|
|
memcpy( &V0[n * 8], X0, 128*2 );
|
|
memcpy( &V1[n * 8], X1, 128*2 );
|
|
memcpy( &V2[n * 8], X2, 128*2 );
|
|
salsa8_2way_simd128_3buf( &X0[0], &X1[0], &X2[0],
|
|
&X0[4], &X1[4], &X2[4] );
|
|
salsa8_2way_simd128_3buf( &X0[4], &X1[4], &X2[4],
|
|
&X0[0], &X1[0], &X2[0] );
|
|
}
|
|
for ( int n = 0; n < N; n++ )
|
|
{
|
|
m256_ovly x16a, x16b, x16c;
|
|
x16a = ( (m256_ovly*)X0 )[4];
|
|
x16b = ( (m256_ovly*)X1 )[4];
|
|
x16c = ( (m256_ovly*)X2 )[4];
|
|
|
|
uint32_t j0a = 8 * ( x16a.u32[0] & ( N-1 ) );
|
|
uint32_t j0b = 8 * ( x16b.u32[0] & ( N-1 ) );
|
|
uint32_t j0c = 8 * ( x16c.u32[0] & ( N-1 ) );
|
|
uint32_t j1a = 8 * ( x16a.u32[4] & ( N-1 ) );
|
|
uint32_t j1b = 8 * ( x16b.u32[4] & ( N-1 ) );
|
|
uint32_t j1c = 8 * ( x16c.u32[4] & ( N-1 ) );
|
|
|
|
for ( int i = 0; i < 8; i++ )
|
|
{
|
|
X0[i] = _mm256_xor_si256( X0[i],
|
|
_mm256_blend_epi32( V0[ j1a+i ], V0[ j0a+i ], 0x0f ) );
|
|
X1[i] = _mm256_xor_si256( X1[i],
|
|
_mm256_blend_epi32( V1[ j1b+i ], V1[ j0b+i ], 0x0f ) );
|
|
X2[i] = _mm256_xor_si256( X2[i],
|
|
_mm256_blend_epi32( V2[ j1c+i ], V2[ j0c+i ], 0x0f ) );
|
|
}
|
|
|
|
salsa8_2way_simd128_3buf( &X0[0], &X1[0], &X2[0],
|
|
&X0[4], &X1[4], &X2[4] );
|
|
salsa8_2way_simd128_3buf( &X0[4], &X1[4], &X2[4],
|
|
&X0[0], &X1[0], &X2[0] );
|
|
}
|
|
}
|
|
|
|
|
|
// 2x memory usage
|
|
|
|
// Tested OK, good speed
|
|
//
|
|
// Serial SIMD over 2 way parallel
|
|
|
|
// Uses uint64_t as a poor man's vector, then applies linear SIMD to the
// pairs of data.
|
|
//
|
|
// Interleaving is standard 2 way.
|
|
// Use 64 bit shuffles but 32 bit arithmetic.
|
|
|
|
// B = { lane1, lane0 }
|
|
// b[i] = { B[4*i+3], B[4*i+2], B[4*i+1], B[4*i] }
|
|
|
|
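// A minimal sketch (illustrative only; not the interleave helpers used by
// the rest of the code base, the function name is hypothetical) of packing
// two independent 32-bit-word lanes into this 2x32 interleaved format, one
// uint64_t per word pair with lane 1 in the high half:
/*
   static void intrlv_2x32_sketch( uint64_t *dst, const uint32_t *lane0,
                                   const uint32_t *lane1, const int nwords )
   {
      for ( int i = 0; i < nwords; i++ )
         dst[i] = ( (uint64_t)lane1[i] << 32 ) | lane0[i];
   }
*/
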
// 2x32 interleaving
|
|
static void salsa8_simd128_2way( uint64_t *b, const uint64_t *c )
|
|
{
|
|
__m256i X0, X1, X2, X3, Y0, Y1, Y2, Y3;
|
|
__m256i *B = (__m256i*)b;
|
|
const __m256i *C = (const __m256i*)c;
|
|
|
|
// mix C into B then shuffle B into X
|
|
B[0] = _mm256_xor_si256( B[0], C[0] );
|
|
B[1] = _mm256_xor_si256( B[1], C[1] );
|
|
B[2] = _mm256_xor_si256( B[2], C[2] );
|
|
B[3] = _mm256_xor_si256( B[3], C[3] );
|
|
|
|
Y0 = _mm256_blend_epi32( B[1], B[0], 0x03 );
|
|
X0 = _mm256_blend_epi32( B[3], B[2], 0x30 );
|
|
X0 = _mm256_blend_epi32( X0, Y0, 0x0f);
|
|
|
|
Y0 = _mm256_blend_epi32( B[2], B[1], 0x03 );
|
|
X1 = _mm256_blend_epi32( B[0], B[3], 0x30 );
|
|
X1 = _mm256_blend_epi32( X1, Y0, 0x0f );
|
|
|
|
Y0 = _mm256_blend_epi32( B[3], B[2], 0x03 );
|
|
X2 = _mm256_blend_epi32( B[1], B[0], 0x30 );
|
|
X2 = _mm256_blend_epi32( X2, Y0, 0x0f );
|
|
|
|
Y0 = _mm256_blend_epi32( B[0], B[3], 0x03 );
|
|
X3 = _mm256_blend_epi32( B[2], B[1], 0x30 );
|
|
X3 = _mm256_blend_epi32( X3, Y0, 0x0f );
|
|
|
|
// define targets for macros used in round function template
|
|
#define ROL_1X32 mm256_shufll_64
|
|
#define ROR_1X32 mm256_shuflr_64
|
|
#define SWAP_64 mm256_swap_128
|
|
#define ROL32 mm256_rol_32
|
|
#define ADD32 _mm256_add_epi32
|
|
#define XOR _mm256_xor_si256
|
|
|
|
SALSA_8ROUNDS_SIMD128;
|
|
|
|
#undef ROL_1X32
|
|
#undef ROR_1X32
|
|
#undef SWAP_64
|
|
#undef ROL32
|
|
#undef ADD32
|
|
#undef XOR
|
|
|
|
Y0 = _mm256_blend_epi32( X0, X1, 0xc0 );
|
|
Y1 = _mm256_blend_epi32( X0, X1, 0x03 );
|
|
Y2 = _mm256_blend_epi32( X0, X1, 0x0c );
|
|
Y3 = _mm256_blend_epi32( X0, X1, 0x30 );
|
|
|
|
Y0 = _mm256_blend_epi32( Y0, X2, 0x30 );
|
|
Y1 = _mm256_blend_epi32( Y1, X2, 0xc0 );
|
|
Y2 = _mm256_blend_epi32( Y2, X2, 0x03 );
|
|
Y3 = _mm256_blend_epi32( Y3, X2, 0x0c );
|
|
|
|
Y0 = _mm256_blend_epi32( Y0, X3, 0x0c );
|
|
Y1 = _mm256_blend_epi32( Y1, X3, 0x30 );
|
|
Y2 = _mm256_blend_epi32( Y2, X3, 0xc0 );
|
|
Y3 = _mm256_blend_epi32( Y3, X3, 0x03 );
|
|
|
|
B[0] = _mm256_add_epi32( B[0], Y0 );
|
|
B[1] = _mm256_add_epi32( B[1], Y1 );
|
|
B[2] = _mm256_add_epi32( B[2], Y2 );
|
|
B[3] = _mm256_add_epi32( B[3], Y3 );
|
|
|
|
}
|
|
|
|
// data format for 256 bits: 4 * ( 2 way 32 )
|
|
// { l1d3, l0d3, l1d2, l0d2, l1d1, l0d1, l1d0, l0d0 }
|
|
|
|
void scrypt_core_simd128_2way( uint64_t *X, uint64_t *V, const uint32_t N )
|
|
{
|
|
for ( int n = 0; n < N; n++ )
|
|
{
|
|
for ( int i = 0; i < 8; i++ )
|
|
_mm256_stream_si256( (__m256i*)V + n*8 + i, casti_m256i( X, i ) );
|
|
salsa8_simd128_2way( &X[ 0], &X[16] );
|
|
salsa8_simd128_2way( &X[16], &X[ 0] );
|
|
}
|
|
|
|
for ( int n = 0; n < N; n++ )
|
|
{
|
|
// need 2 J's
|
|
const uint32_t j0 = 32 * ( (uint32_t)( X[16] ) & ( N-1 ) );
|
|
const uint32_t j1 = 32 * ( (uint32_t)( X[16] >> 32 ) & ( N-1 ) );
|
|
|
|
for ( int i = 0; i < 32; i++ )
|
|
X[i] ^= ( ( V[ j1 + i ] & 0xffffffff00000000 )
|
|
| ( V[ j0 + i ] & 0x00000000ffffffff ) );
|
|
|
|
salsa8_simd128_2way( &X[ 0], &X[16] );
|
|
salsa8_simd128_2way( &X[16], &X[ 0] );
|
|
}
|
|
}
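
// Minimal caller-side sketch, not part of the build: buffer names are
// hypothetical and N = 1024 is only an assumed typical value. X holds one
// 2x32 interleaved state of 32 uint64_t (two lanes); V needs 32*N uint64_t
// and, like X, should be 32 byte aligned for the vector accesses above.
static inline void example_scrypt_2way_usage( uint64_t *X )
{
   const uint32_t N = 1024;
   uint64_t *V = (uint64_t*)_mm_malloc( 32 * N * sizeof(uint64_t), 64 );
   scrypt_core_simd128_2way( X, V, N );   // hashes both lanes at once
   _mm_free( V );
}
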
// Double buffered, 4x memory usage
// 2x32 interleaving
static void salsa8_simd128_2way_2buf( uint64_t *ba, uint64_t *bb,
|
|
const uint64_t *ca, const uint64_t *cb )
|
|
{
|
|
__m256i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3;
|
|
__m256i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3;
|
|
__m256i *BA = (__m256i*)ba;
|
|
__m256i *BB = (__m256i*)bb;
|
|
const __m256i *CA = (const __m256i*)ca;
|
|
const __m256i *CB = (const __m256i*)cb;
|
|
|
|
// mix C into B then shuffle B into X
|
|
BA[0] = _mm256_xor_si256( BA[0], CA[0] );
|
|
BB[0] = _mm256_xor_si256( BB[0], CB[0] );
|
|
BA[1] = _mm256_xor_si256( BA[1], CA[1] );
|
|
BB[1] = _mm256_xor_si256( BB[1], CB[1] );
|
|
BA[2] = _mm256_xor_si256( BA[2], CA[2] );
|
|
BB[2] = _mm256_xor_si256( BB[2], CB[2] );
|
|
BA[3] = _mm256_xor_si256( BA[3], CA[3] );
|
|
BB[3] = _mm256_xor_si256( BB[3], CB[3] );
|
|
|
|
YA0 = _mm256_blend_epi32( BA[1], BA[0], 0x03 );
|
|
YB0 = _mm256_blend_epi32( BB[1], BB[0], 0x03 );
|
|
XA0 = _mm256_blend_epi32( BA[3], BA[2], 0x30 );
|
|
XB0 = _mm256_blend_epi32( BB[3], BB[2], 0x30 );
|
|
XA0 = _mm256_blend_epi32( XA0, YA0, 0x0f);
|
|
XB0 = _mm256_blend_epi32( XB0, YB0, 0x0f);
|
|
|
|
YA0 = _mm256_blend_epi32( BA[2], BA[1], 0x03 );
|
|
YB0 = _mm256_blend_epi32( BB[2], BB[1], 0x03 );
|
|
XA1 = _mm256_blend_epi32( BA[0], BA[3], 0x30 );
|
|
XB1 = _mm256_blend_epi32( BB[0], BB[3], 0x30 );
|
|
XA1 = _mm256_blend_epi32( XA1, YA0, 0x0f );
|
|
XB1 = _mm256_blend_epi32( XB1, YB0, 0x0f );
|
|
|
|
YA0 = _mm256_blend_epi32( BA[3], BA[2], 0x03 );
|
|
YB0 = _mm256_blend_epi32( BB[3], BB[2], 0x03 );
|
|
XA2 = _mm256_blend_epi32( BA[1], BA[0], 0x30 );
|
|
XB2 = _mm256_blend_epi32( BB[1], BB[0], 0x30 );
|
|
XA2 = _mm256_blend_epi32( XA2, YA0, 0x0f );
|
|
XB2 = _mm256_blend_epi32( XB2, YB0, 0x0f );
|
|
|
|
YA0 = _mm256_blend_epi32( BA[0], BA[3], 0x03 );
|
|
YB0 = _mm256_blend_epi32( BB[0], BB[3], 0x03 );
|
|
XA3 = _mm256_blend_epi32( BA[2], BA[1], 0x30 );
|
|
XB3 = _mm256_blend_epi32( BB[2], BB[1], 0x30 );
|
|
XA3 = _mm256_blend_epi32( XA3, YA0, 0x0f );
|
|
XB3 = _mm256_blend_epi32( XB3, YB0, 0x0f );
|
|
|
|
// define targets for macros used in round function template
|
|
#define ROL_1X32 mm256_shufll_64
|
|
#define ROR_1X32 mm256_shuflr_64
|
|
#define SWAP_64 mm256_swap_128
|
|
#define ROL32 mm256_rol_32
|
|
#define ADD32 _mm256_add_epi32
|
|
#define XOR _mm256_xor_si256
|
|
#define TYPE __m256i
|
|
|
|
SALSA_8ROUNDS_SIMD128_2BUF;
|
|
|
|
#undef ROL_1X32
|
|
#undef ROR_1X32
|
|
#undef SWAP_64
|
|
#undef ROL32
|
|
#undef ADD32
|
|
#undef XOR
|
|
#undef TYPE
|
|
|
|
YA0 = _mm256_blend_epi32( XA0, XA1, 0xc0 );
|
|
YB0 = _mm256_blend_epi32( XB0, XB1, 0xc0 );
|
|
YA1 = _mm256_blend_epi32( XA0, XA1, 0x03 );
|
|
YB1 = _mm256_blend_epi32( XB0, XB1, 0x03 );
|
|
YA2 = _mm256_blend_epi32( XA0, XA1, 0x0c );
|
|
YB2 = _mm256_blend_epi32( XB0, XB1, 0x0c );
|
|
YA3 = _mm256_blend_epi32( XA0, XA1, 0x30 );
|
|
YB3 = _mm256_blend_epi32( XB0, XB1, 0x30 );
|
|
|
|
YA0 = _mm256_blend_epi32( YA0, XA2, 0x30 );
|
|
YB0 = _mm256_blend_epi32( YB0, XB2, 0x30 );
|
|
YA1 = _mm256_blend_epi32( YA1, XA2, 0xc0 );
|
|
YB1 = _mm256_blend_epi32( YB1, XB2, 0xc0 );
|
|
YA2 = _mm256_blend_epi32( YA2, XA2, 0x03 );
|
|
YB2 = _mm256_blend_epi32( YB2, XB2, 0x03 );
|
|
YA3 = _mm256_blend_epi32( YA3, XA2, 0x0c );
|
|
YB3 = _mm256_blend_epi32( YB3, XB2, 0x0c );
|
|
|
|
YA0 = _mm256_blend_epi32( YA0, XA3, 0x0c );
|
|
YB0 = _mm256_blend_epi32( YB0, XB3, 0x0c );
|
|
YA1 = _mm256_blend_epi32( YA1, XA3, 0x30 );
|
|
YB1 = _mm256_blend_epi32( YB1, XB3, 0x30 );
|
|
YA2 = _mm256_blend_epi32( YA2, XA3, 0xc0 );
|
|
YB2 = _mm256_blend_epi32( YB2, XB3, 0xc0 );
|
|
YA3 = _mm256_blend_epi32( YA3, XA3, 0x03 );
|
|
YB3 = _mm256_blend_epi32( YB3, XB3, 0x03 );
|
|
|
|
BA[0] = _mm256_add_epi32( BA[0], YA0 );
|
|
BB[0] = _mm256_add_epi32( BB[0], YB0 );
|
|
BA[1] = _mm256_add_epi32( BA[1], YA1 );
|
|
BB[1] = _mm256_add_epi32( BB[1], YB1 );
|
|
BA[2] = _mm256_add_epi32( BA[2], YA2 );
|
|
BB[2] = _mm256_add_epi32( BB[2], YB2 );
|
|
BA[3] = _mm256_add_epi32( BA[3], YA3 );
|
|
BB[3] = _mm256_add_epi32( BB[3], YB3 );
|
|
|
|
}
|
|
|
|
void scrypt_core_simd128_2way_2buf( uint64_t *X, uint64_t *V, const uint32_t N )
|
|
|
|
{
|
|
uint64_t *X0 = X;
|
|
uint64_t *X1 = X+32;
|
|
uint64_t *V0 = V;
|
|
uint64_t *V1 = V + 32*N;
|
|
|
|
for ( int n = 0; n < N; n++ )
|
|
{
|
|
for ( int i = 0; i < 8; i++ )
|
|
{
|
|
_mm256_stream_si256( (__m256i*)V0 + n*8 + i, casti_m256i( X0, i ) );
|
|
_mm256_stream_si256( (__m256i*)V1 + n*8 + i, casti_m256i( X1, i ) );
|
|
}
|
|
salsa8_simd128_2way_2buf( &X0[ 0], &X1[ 0], &X0[16], &X1[16] );
|
|
salsa8_simd128_2way_2buf( &X0[16], &X1[16], &X0[ 0], &X1[ 0] );
|
|
}
|
|
|
|
for ( int n = 0; n < N; n++ )
|
|
{
|
|
// need 4 J's
|
|
const uint32_t j0l = 32 * ( (const uint32_t)( X0[16] ) & ( N-1 ) );
|
|
const uint32_t j0h = 32 * ( (const uint32_t)( X0[16] >> 32 ) & ( N-1 ) );
|
|
const uint32_t j1l = 32 * ( (const uint32_t)( X1[16] ) & ( N-1 ) );
|
|
const uint32_t j1h = 32 * ( (const uint32_t)( X1[16] >> 32 ) & ( N-1 ) );
|
|
|
|
for ( int i = 0; i < 32; i++ )
|
|
{
|
|
X0[i] ^= ( ( V0[ j0h + i ] & 0xffffffff00000000 )
|
|
| ( V0[ j0l + i ] & 0x00000000ffffffff ) );
|
|
X1[i] ^= ( ( V1[ j1h + i ] & 0xffffffff00000000 )
|
|
| ( V1[ j1l + i ] & 0x00000000ffffffff ) );
|
|
}
|
|
salsa8_simd128_2way_2buf( &X0[ 0], &X1[ 0], &X0[16], &X1[16] );
|
|
salsa8_simd128_2way_2buf( &X0[16], &X1[16], &X0[ 0], &X1[ 0] );
|
|
}
|
|
}
|
|
|
|
// Working but deprecated, not up to date.
// Triple buffered 2 way, 6x memory usage
// 2x32 interleaving
static void salsa8_simd128_2way_3buf( uint64_t *BA, uint64_t *BB,
|
|
uint64_t *BC, const uint64_t *CA, const uint64_t *CB,
|
|
const uint64_t *CC )
|
|
{
|
|
__m256i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3,
|
|
XC0, XC1, XC2, XC3;
|
|
__m256i *ba = (__m256i*)BA;
|
|
__m256i *bb = (__m256i*)BB;
|
|
__m256i *bc = (__m256i*)BC;
|
|
const __m256i *ca = (const __m256i*)CA;
|
|
const __m256i *cb = (const __m256i*)CB;
|
|
const __m256i *cc = (const __m256i*)CC;
|
|
m256_ovly ya[4], yb[4], yc[4],
|
|
za[4], zb[4], zc[4];
|
|
|
|
// mix C into B then shuffle B into X
|
|
ba[0] = _mm256_xor_si256( ba[0], ca[0] );
|
|
bb[0] = _mm256_xor_si256( bb[0], cb[0] );
|
|
bc[0] = _mm256_xor_si256( bc[0], cc[0] );
|
|
ba[1] = _mm256_xor_si256( ba[1], ca[1] );
|
|
bb[1] = _mm256_xor_si256( bb[1], cb[1] );
|
|
bc[1] = _mm256_xor_si256( bc[1], cc[1] );
|
|
ba[2] = _mm256_xor_si256( ba[2], ca[2] );
|
|
bb[2] = _mm256_xor_si256( bb[2], cb[2] );
|
|
bc[2] = _mm256_xor_si256( bc[2], cc[2] );
|
|
ba[3] = _mm256_xor_si256( ba[3], ca[3] );
|
|
bb[3] = _mm256_xor_si256( bb[3], cb[3] );
|
|
bc[3] = _mm256_xor_si256( bc[3], cc[3] );
|
|
|
|
XA0 = _mm256_set_epi64x( BA[15], BA[10], BA[ 5], BA[ 0] );
|
|
XB0 = _mm256_set_epi64x( BB[15], BB[10], BB[ 5], BB[ 0] );
|
|
XC0 = _mm256_set_epi64x( BC[15], BC[10], BC[ 5], BC[ 0] );
|
|
XA1 = _mm256_set_epi64x( BA[ 3], BA[14], BA[ 9], BA[ 4] );
|
|
XB1 = _mm256_set_epi64x( BB[ 3], BB[14], BB[ 9], BB[ 4] );
|
|
XC1 = _mm256_set_epi64x( BC[ 3], BC[14], BC[ 9], BC[ 4] );
|
|
XA2 = _mm256_set_epi64x( BA[ 7], BA[ 2], BA[13], BA[ 8] );
|
|
XB2 = _mm256_set_epi64x( BB[ 7], BB[ 2], BB[13], BB[ 8] );
|
|
XC2 = _mm256_set_epi64x( BC[ 7], BC[ 2], BC[13], BC[ 8] );
|
|
XA3 = _mm256_set_epi64x( BA[11], BA[ 6], BA[ 1], BA[12] );
|
|
XB3 = _mm256_set_epi64x( BB[11], BB[ 6], BB[ 1], BB[12] );
|
|
XC3 = _mm256_set_epi64x( BC[11], BC[ 6], BC[ 1], BC[12] );
|
|
|
|
// define targets for macros used in round function template
|
|
#define ROL_1X32 mm256_shufll_64
|
|
#define ROR_1X32 mm256_shuflr_64
|
|
#define SWAP_64 mm256_swap_128
|
|
#define ROL32 mm256_rol_32
|
|
#define ADD32 _mm256_add_epi32
|
|
#define XOR _mm256_xor_si256
|
|
#define TYPE __m256i
|
|
|
|
SALSA_8ROUNDS_FINAL_SIMD128_3BUF;
|
|
|
|
#undef ROL_1X32
|
|
#undef ROR_1X32
|
|
#undef SWAP_64
|
|
#undef ROL32
|
|
#undef ADD32
|
|
#undef XOR
|
|
#undef TYPE
|
|
|
|
ya[0].m256 = XA0; yb[0].m256 = XB0;
|
|
yc[0].m256 = XC0;
|
|
ya[1].m256 = XA1; yb[1].m256 = XB1;
|
|
yc[1].m256 = XC1;
|
|
ya[2].m256 = XA2; yb[2].m256 = XB2;
|
|
yc[2].m256 = XC2;
|
|
ya[3].m256 = XA3; yb[3].m256 = XB3;
|
|
yc[3].m256 = XC3;
|
|
|
|
za[0].u64[0] = ya[0].u64[0];
|
|
zb[0].u64[0] = yb[0].u64[0];
|
|
zc[0].u64[0] = yc[0].u64[0];
|
|
za[0].u64[3] = ya[1].u64[0];
|
|
zb[0].u64[3] = yb[1].u64[0];
|
|
zc[0].u64[3] = yc[1].u64[0];
|
|
za[0].u64[2] = ya[2].u64[0];
|
|
zb[0].u64[2] = yb[2].u64[0];
|
|
zc[0].u64[2] = yc[2].u64[0];
|
|
za[0].u64[1] = ya[3].u64[0];
|
|
zb[0].u64[1] = yb[3].u64[0];
|
|
zc[0].u64[1] = yc[3].u64[0];
|
|
|
|
za[1].u64[1] = ya[0].u64[1];
|
|
zb[1].u64[1] = yb[0].u64[1];
|
|
zc[1].u64[1] = yc[0].u64[1];
|
|
za[1].u64[0] = ya[1].u64[1];
|
|
zb[1].u64[0] = yb[1].u64[1];
|
|
zc[1].u64[0] = yc[1].u64[1];
|
|
za[1].u64[3] = ya[2].u64[1];
|
|
zb[1].u64[3] = yb[2].u64[1];
|
|
zc[1].u64[3] = yc[2].u64[1];
|
|
za[1].u64[2] = ya[3].u64[1];
|
|
zb[1].u64[2] = yb[3].u64[1];
|
|
zc[1].u64[2] = yc[3].u64[1];
|
|
|
|
za[2].u64[2] = ya[0].u64[2];
|
|
zb[2].u64[2] = yb[0].u64[2];
|
|
zc[2].u64[2] = yc[0].u64[2];
|
|
za[2].u64[1] = ya[1].u64[2];
|
|
zb[2].u64[1] = yb[1].u64[2];
|
|
zc[2].u64[1] = yc[1].u64[2];
|
|
za[2].u64[0] = ya[2].u64[2];
|
|
zb[2].u64[0] = yb[2].u64[2];
|
|
zc[2].u64[0] = yc[2].u64[2];
|
|
za[2].u64[3] = ya[3].u64[2];
|
|
zb[2].u64[3] = yb[3].u64[2];
|
|
zc[2].u64[3] = yc[3].u64[2];
|
|
|
|
za[3].u64[3] = ya[0].u64[3];
|
|
zb[3].u64[3] = yb[0].u64[3];
|
|
zc[3].u64[3] = yc[0].u64[3];
|
|
za[3].u64[2] = ya[1].u64[3];
|
|
zb[3].u64[2] = yb[1].u64[3];
|
|
zc[3].u64[2] = yc[1].u64[3];
|
|
za[3].u64[1] = ya[2].u64[3];
|
|
zb[3].u64[1] = yb[2].u64[3];
|
|
zc[3].u64[1] = yc[2].u64[3];
|
|
za[3].u64[0] = ya[3].u64[3];
|
|
zb[3].u64[0] = yb[3].u64[3];
|
|
zc[3].u64[0] = yc[3].u64[3];
|
|
|
|
ba[0] = _mm256_add_epi32( ba[0], za[0].m256 );
|
|
bb[0] = _mm256_add_epi32( bb[0], zb[0].m256 );
|
|
bc[0] = _mm256_add_epi32( bc[0], zc[0].m256 );
|
|
ba[1] = _mm256_add_epi32( ba[1], za[1].m256 );
|
|
bb[1] = _mm256_add_epi32( bb[1], zb[1].m256 );
|
|
bc[1] = _mm256_add_epi32( bc[1], zc[1].m256 );
|
|
ba[2] = _mm256_add_epi32( ba[2], za[2].m256 );
|
|
bb[2] = _mm256_add_epi32( bb[2], zb[2].m256 );
|
|
bc[2] = _mm256_add_epi32( bc[2], zc[2].m256 );
|
|
ba[3] = _mm256_add_epi32( ba[3], za[3].m256 );
|
|
bb[3] = _mm256_add_epi32( bb[3], zb[3].m256 );
|
|
bc[3] = _mm256_add_epi32( bc[3], zc[3].m256 );
|
|
}
|
|
|
|
void scrypt_core_simd128_2way_3buf( uint64_t *X, uint64_t *V,
|
|
const uint32_t N )
|
|
{
|
|
uint64_t *X0 = X;
|
|
uint64_t *X1 = X+32;
|
|
uint64_t *X2 = X+64;
|
|
uint64_t *V0 = V;
|
|
uint64_t *V1 = V + 32*N;
|
|
uint64_t *V2 = V + 64*N;
|
|
|
|
for ( int n = 0; n < N; n++ )
|
|
{
|
|
memcpy( &V0[ n*32 ], X0, 2*128 );
|
|
memcpy( &V1[ n*32 ], X1, 2*128 );
|
|
memcpy( &V2[ n*32 ], X2, 2*128 );
|
|
salsa8_simd128_2way_3buf( &X0[ 0], &X1[ 0], &X2[ 0],
|
|
&X0[16], &X1[16], &X2[16] );
|
|
salsa8_simd128_2way_3buf( &X0[16], &X1[16], &X2[16],
|
|
&X0[ 0], &X1[ 0], &X2[ 0] );
|
|
}
|
|
|
|
for ( int n = 0; n < N; n++ )
|
|
{
|
|
uint32_t j0l = 32 * ( (uint32_t)( X0[16] ) & ( N-1 ) );
|
|
uint32_t j0h = 32 * ( (uint32_t)( X0[16] >> 32 ) & ( N-1 ) );
|
|
uint32_t j1l = 32 * ( (uint32_t)( X1[16] ) & ( N-1 ) );
|
|
uint32_t j1h = 32 * ( (uint32_t)( X1[16] >> 32 ) & ( N-1 ) );
|
|
uint32_t j2l = 32 * ( (uint32_t)( X2[16] ) & ( N-1 ) );
|
|
uint32_t j2h = 32 * ( (uint32_t)( X2[16] >> 32 ) & ( N-1 ) );
|
|
|
|
for ( int i = 0; i < 32; i++ )
|
|
{
|
|
X0[i] ^= ( ( V0[ j0h + i ] & 0xffffffff00000000 )
|
|
| ( V0[ j0l + i ] & 0x00000000ffffffff ) );
|
|
X1[i] ^= ( ( V1[ j1h + i ] & 0xffffffff00000000 )
|
|
| ( V1[ j1l + i ] & 0x00000000ffffffff ) );
|
|
X2[i] ^= ( ( V2[ j2h + i ] & 0xffffffff00000000 )
|
|
| ( V2[ j2l + i ] & 0x00000000ffffffff ) );
|
|
}
|
|
salsa8_simd128_2way_3buf( &X0[ 0], &X1[ 0], &X2[ 0],
|
|
&X0[16], &X1[16], &X2[16] );
|
|
salsa8_simd128_2way_3buf( &X0[16], &X1[16], &X2[16],
|
|
&X0[ 0], &X1[ 0], &X2[ 0] );
|
|
}
|
|
}

#endif // AVX2

#if defined(__SSE2__) // required and assumed

// Simple 4 way parallel.
// Tested OK.
// Scryptn2 is a little slower than pooler, scrypt is 2x faster than pooler.
// 4x memory usage
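
// Illustrative sketch only, not used by the build; the helper name is
// hypothetical. 4x32 interleaving puts word i of lane l into element l of
// the i-th __m128i, matching the per lane gather in scrypt_core_4way below.
static inline void example_interleave_4x32( __m128i *d, const uint32_t *lane0,
                   const uint32_t *lane1, const uint32_t *lane2,
                   const uint32_t *lane3, int nwords )
{
   for ( int i = 0; i < nwords; i++ )
      d[i] = _mm_set_epi32( lane3[i], lane2[i], lane1[i], lane0[i] );
}
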
// 4x32 interleaving
static void xor_salsa8_4way( __m128i * const B, const __m128i * const C )
|
|
{
|
|
__m128i x0 = B[ 0] = _mm_xor_si128( B[ 0], C[ 0] );
|
|
__m128i x1 = B[ 1] = _mm_xor_si128( B[ 1], C[ 1] );
|
|
__m128i x2 = B[ 2] = _mm_xor_si128( B[ 2], C[ 2] );
|
|
__m128i x3 = B[ 3] = _mm_xor_si128( B[ 3], C[ 3] );
|
|
__m128i x4 = B[ 4] = _mm_xor_si128( B[ 4], C[ 4] );
|
|
__m128i x5 = B[ 5] = _mm_xor_si128( B[ 5], C[ 5] );
|
|
__m128i x6 = B[ 6] = _mm_xor_si128( B[ 6], C[ 6] );
|
|
__m128i x7 = B[ 7] = _mm_xor_si128( B[ 7], C[ 7] );
|
|
__m128i x8 = B[ 8] = _mm_xor_si128( B[ 8], C[ 8] );
|
|
__m128i x9 = B[ 9] = _mm_xor_si128( B[ 9], C[ 9] );
|
|
__m128i xa = B[10] = _mm_xor_si128( B[10], C[10] );
|
|
__m128i xb = B[11] = _mm_xor_si128( B[11], C[11] );
|
|
__m128i xc = B[12] = _mm_xor_si128( B[12], C[12] );
|
|
__m128i xd = B[13] = _mm_xor_si128( B[13], C[13] );
|
|
__m128i xe = B[14] = _mm_xor_si128( B[14], C[14] );
|
|
__m128i xf = B[15] = _mm_xor_si128( B[15], C[15] );
|
|
|
|
#define ROL32 mm128_rol_32
|
|
#define ADD32 _mm_add_epi32
|
|
#define XOR _mm_xor_si128
|
|
|
|
SALSA_8ROUNDS;
|
|
|
|
#undef ROL32
|
|
#undef ADD32
|
|
#undef XOR
|
|
|
|
B[ 0] = _mm_add_epi32( B[ 0], x0 );
|
|
B[ 1] = _mm_add_epi32( B[ 1], x1 );
|
|
B[ 2] = _mm_add_epi32( B[ 2], x2 );
|
|
B[ 3] = _mm_add_epi32( B[ 3], x3 );
|
|
B[ 4] = _mm_add_epi32( B[ 4], x4 );
|
|
B[ 5] = _mm_add_epi32( B[ 5], x5 );
|
|
B[ 6] = _mm_add_epi32( B[ 6], x6 );
|
|
B[ 7] = _mm_add_epi32( B[ 7], x7 );
|
|
B[ 8] = _mm_add_epi32( B[ 8], x8 );
|
|
B[ 9] = _mm_add_epi32( B[ 9], x9 );
|
|
B[10] = _mm_add_epi32( B[10], xa );
|
|
B[11] = _mm_add_epi32( B[11], xb );
|
|
B[12] = _mm_add_epi32( B[12], xc );
|
|
B[13] = _mm_add_epi32( B[13], xd );
|
|
B[14] = _mm_add_epi32( B[14], xe );
|
|
B[15] = _mm_add_epi32( B[15], xf );
|
|
}
|
|
|
|
void scrypt_core_4way( __m128i *X, __m128i *V, const uint32_t N )
|
|
{
|
|
for ( int n = 0; n < N; n++ )
|
|
{
|
|
memcpy( &V[ n*32 ], X, 128*4 );
|
|
xor_salsa8_4way( &X[ 0], &X[16] );
|
|
xor_salsa8_4way( &X[16], &X[ 0] );
|
|
}
|
|
for ( int n = 0; n < N; n++ )
|
|
{
|
|
m128_ovly *vptr[4];
|
|
m128_ovly *x16 = (m128_ovly*)(&X[16]);
|
|
|
|
for ( int l = 0; l < 4; l++ )
|
|
{
|
|
uint32_t xl = (*x16).u32[l];
|
|
vptr[l] = (m128_ovly*)( &V[ 32 * ( xl & ( N-1 ) ) ] );
|
|
}
|
|
|
|
for ( int i = 0; i < 32; i++ )
|
|
{
|
|
m128_ovly v;
|
|
for ( int l = 0; l < 4; l++ )
|
|
v.u32[l] = ( *(vptr[l] +i ) ) .u32[l];
|
|
X[i] = _mm_xor_si128( X[i], v.m128 );
|
|
}
|
|
|
|
xor_salsa8_4way( &X[ 0], &X[16] );
|
|
xor_salsa8_4way( &X[16], &X[ 0] );
|
|
}
|
|
}
|
|
|
|
|
|
// Linear SIMD, single thread. No memory increase but some shuffling
// overhead is required.

// 4 way 32 bit interleaved single 32 bit thread, interleave while loading,
// deinterleave while storing, do 2 way 128 & 4 way 128 parallel on top.
//
// SALSA_2ROUNDS( {x0,x5,xa,xf}, {x4,x9,xe,x3}, {x8,xd,x2,x7}, {xc,x1,x6,xb} )
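
// Illustrative restatement only, not used by the build: the word to vector
// mapping used by the serial SIMD shuffle, written as index tables. Element
// k of column vector j takes b[ idx[j][k] ]; the _mm_set_epi32 lines in the
// SSE2 path of salsa8_simd128 below are the authoritative version.
static inline int example_simd128_word_index( int vec, int elem )
{
   static const int idx[4][4] =
   {
      {  0,  5, 10, 15 },    // X0
      {  4,  9, 14,  3 },    // X1
      {  8, 13,  2,  7 },    // X2
      { 12,  1,  6, 11 }     // X3
   };
   return idx[ vec ][ elem ];
}
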
// Tested OK.
// No interleaving
static void salsa8_simd128( uint32_t *b, const uint32_t * const c)
|
|
{
|
|
__m128i X0, X1, X2, X3;
|
|
__m128i *B = (__m128i*)b;
|
|
const __m128i *C = (const __m128i*)c;
|
|
|
|
// define targets for macros used in round function template
|
|
#define ROL_1X32 mm128_shufll_32
|
|
#define ROR_1X32 mm128_shuflr_32
|
|
#define SWAP_64 mm128_swap_64
|
|
#define ROL32 mm128_rol_32
|
|
#define ADD32 _mm_add_epi32
|
|
#define XOR _mm_xor_si128
|
|
|
|
// mix C into B then shuffle B into X
|
|
B[0] = _mm_xor_si128( B[0], C[0] );
|
|
B[1] = _mm_xor_si128( B[1], C[1] );
|
|
B[2] = _mm_xor_si128( B[2], C[2] );
|
|
B[3] = _mm_xor_si128( B[3], C[3] );
|
|
|
|
#if defined(__SSE4_1__)
|
|
|
|
__m128i Y0, Y1, Y2, Y3;
|
|
|
|
#if defined(__AVX2__)
|
|
|
|
Y0 = _mm_blend_epi32( B[1], B[0], 0x1 );
|
|
X0 = _mm_blend_epi32( B[3], B[2], 0x4 );
|
|
Y1 = _mm_blend_epi32( B[2], B[1], 0x1 );
|
|
X1 = _mm_blend_epi32( B[0], B[3], 0x4 );
|
|
Y2 = _mm_blend_epi32( B[3], B[2], 0x1 );
|
|
X2 = _mm_blend_epi32( B[1], B[0], 0x4 );
|
|
Y3 = _mm_blend_epi32( B[0], B[3], 0x1 );
|
|
X3 = _mm_blend_epi32( B[2], B[1], 0x4 );
|
|
X0 = _mm_blend_epi32( X0, Y0, 0x3 );
|
|
X1 = _mm_blend_epi32( X1, Y1, 0x3 );
|
|
X2 = _mm_blend_epi32( X2, Y2, 0x3 );
|
|
X3 = _mm_blend_epi32( X3, Y3, 0x3 );
|
|
|
|
#else // SSE4_1
|
|
|
|
Y0 = _mm_blend_epi16( B[1], B[0], 0x03 );
|
|
X0 = _mm_blend_epi16( B[3], B[2], 0x30 );
|
|
Y1 = _mm_blend_epi16( B[2], B[1], 0x03 );
|
|
X1 = _mm_blend_epi16( B[0], B[3], 0x30 );
|
|
Y2 = _mm_blend_epi16( B[3], B[2], 0x03 );
|
|
X2 = _mm_blend_epi16( B[1], B[0], 0x30 );
|
|
Y3 = _mm_blend_epi16( B[0], B[3], 0x03 );
|
|
X3 = _mm_blend_epi16( B[2], B[1], 0x30 );
|
|
|
|
X0 = _mm_blend_epi16( X0, Y0, 0x0f );
|
|
X1 = _mm_blend_epi16( X1, Y1, 0x0f );
|
|
X2 = _mm_blend_epi16( X2, Y2, 0x0f );
|
|
X3 = _mm_blend_epi16( X3, Y3, 0x0f );
|
|
|
|
#endif // AVX2 else SSE4_1
|
|
|
|
SALSA_8ROUNDS_SIMD128;
|
|
|
|
#if defined(__AVX2__)
|
|
|
|
Y0 = _mm_blend_epi32( X0, X1, 0x8 );
|
|
Y1 = _mm_blend_epi32( X0, X1, 0x1 );
|
|
Y2 = _mm_blend_epi32( X0, X1, 0x2 );
|
|
Y3 = _mm_blend_epi32( X0, X1, 0x4 );
|
|
|
|
Y0 = _mm_blend_epi32( Y0, X2, 0x4 );
|
|
Y1 = _mm_blend_epi32( Y1, X2, 0x8 );
|
|
Y2 = _mm_blend_epi32( Y2, X2, 0x1 );
|
|
Y3 = _mm_blend_epi32( Y3, X2, 0x2 );
|
|
|
|
Y0 = _mm_blend_epi32( Y0, X3, 0x2 );
|
|
Y1 = _mm_blend_epi32( Y1, X3, 0x4 );
|
|
Y2 = _mm_blend_epi32( Y2, X3, 0x8 );
|
|
Y3 = _mm_blend_epi32( Y3, X3, 0x1 );
|
|
|
|
#else // SSE4_1
|
|
|
|
Y0 = _mm_blend_epi16( X0, X1, 0xc0 );
|
|
Y1 = _mm_blend_epi16( X0, X1, 0x03 );
|
|
Y2 = _mm_blend_epi16( X0, X1, 0x0c );
|
|
Y3 = _mm_blend_epi16( X0, X1, 0x30 );
|
|
|
|
Y0 = _mm_blend_epi16( Y0, X2, 0x30 );
|
|
Y1 = _mm_blend_epi16( Y1, X2, 0xc0 );
|
|
Y2 = _mm_blend_epi16( Y2, X2, 0x03 );
|
|
Y3 = _mm_blend_epi16( Y3, X2, 0x0c );
|
|
|
|
Y0 = _mm_blend_epi16( Y0, X3, 0x0c );
|
|
Y1 = _mm_blend_epi16( Y1, X3, 0x30 );
|
|
Y2 = _mm_blend_epi16( Y2, X3, 0xc0 );
|
|
Y3 = _mm_blend_epi16( Y3, X3, 0x03 );
|
|
|
|
#endif // AVX2 else SSE4_1
|
|
|
|
B[0] = _mm_add_epi32( B[0], Y0 );
|
|
B[1] = _mm_add_epi32( B[1], Y1 );
|
|
B[2] = _mm_add_epi32( B[2], Y2 );
|
|
B[3] = _mm_add_epi32( B[3], Y3 );
|
|
|
|
#else // SSE2
|
|
|
|
m128_ovly y[4], z[4];
|
|
|
|
X0 = _mm_set_epi32( b[15], b[10], b[ 5], b[ 0] );
|
|
X1 = _mm_set_epi32( b[ 3], b[14], b[ 9], b[ 4] );
|
|
X2 = _mm_set_epi32( b[ 7], b[ 2], b[13], b[ 8] );
|
|
X3 = _mm_set_epi32( b[11], b[ 6], b[ 1], b[12] );
|
|
|
|
SALSA_8ROUNDS_FINAL_SIMD128;
|
|
|
|
// Final round doesn't shuffle data back to original input order,
|
|
// process it as is.
|
|
// X0 is unchanged { xf, xa, x5, x0 }
|
|
// X1 is shuffled left 1 (rol_1x32) { xe, x9, x4, x3 }
|
|
// X2 is shuffled left 2 (swap_64) { xd, x8, x7, x2 }
|
|
// X3 is shuffled left 3 (ror_1x32) { xc, xb, x6, x1 }
|
|
|
|
y[0].m128 = X0;
|
|
y[1].m128 = X1;
|
|
y[2].m128 = X2;
|
|
y[3].m128 = X3;
|
|
|
|
z[0].u32[0] = y[0].u32[0];
|
|
z[0].u32[3] = y[1].u32[0];
|
|
z[0].u32[2] = y[2].u32[0];
|
|
z[0].u32[1] = y[3].u32[0];
|
|
|
|
z[1].u32[1] = y[0].u32[1];
|
|
z[1].u32[0] = y[1].u32[1];
|
|
z[1].u32[3] = y[2].u32[1];
|
|
z[1].u32[2] = y[3].u32[1];
|
|
|
|
z[2].u32[2] = y[0].u32[2];
|
|
z[2].u32[1] = y[1].u32[2];
|
|
z[2].u32[0] = y[2].u32[2];
|
|
z[2].u32[3] = y[3].u32[2];
|
|
|
|
z[3].u32[3] = y[0].u32[3];
|
|
z[3].u32[2] = y[1].u32[3];
|
|
z[3].u32[1] = y[2].u32[3];
|
|
z[3].u32[0] = y[3].u32[3];
|
|
|
|
B[0] = _mm_add_epi32( B[0], z[0].m128 );
|
|
B[1] = _mm_add_epi32( B[1], z[1].m128 );
|
|
B[2] = _mm_add_epi32( B[2], z[2].m128 );
|
|
B[3] = _mm_add_epi32( B[3], z[3].m128 );
|
|
|
|
#endif
|
|
|
|
#undef ROL_1X32
|
|
#undef ROR_1X32
|
|
#undef SWAP_64
|
|
#undef ROL32
|
|
#undef ADD32
|
|
#undef XOR
|
|
|
|
}
|
|
|
|
void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N )
|
|
{
|
|
for ( int n = 0; n < N; n++ )
|
|
{
|
|
for ( int i = 0; i < 8; i++ )
|
|
_mm_stream_si128( (__m128i*)V + n*8 + i, casti_m128i( X, i ) );
|
|
|
|
salsa8_simd128( &X[ 0], &X[16] );
|
|
salsa8_simd128( &X[16], &X[ 0] );
|
|
}
|
|
for ( int n = 0; n < N; n++ )
|
|
{
|
|
const int j = 32 * ( X[16] & ( N - 1 ) );
|
|
for ( int i = 0; i < 32; i++ )
|
|
X[i] ^= V[ j+i ];
|
|
salsa8_simd128( &X[ 0], &X[16] );
|
|
salsa8_simd128( &X[16], &X[ 0] );
|
|
}
|
|
}
|
|
|
|
// Double buffered, 2x memory usage
|
|
// No interleaving
|
|
|
|
static inline void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
|
|
{
|
|
__m128i *XA = (__m128i*)xa;
|
|
__m128i *XB = (__m128i*)xb;
|
|
|
|
#if defined(__SSE4_1__)
|
|
|
|
__m128i t0 = _mm_blend_epi16( XA[0], XA[1], 0xcc );
|
|
__m128i t1 = _mm_blend_epi16( XA[0], XA[1], 0x33 );
|
|
__m128i t2 = _mm_blend_epi16( XA[2], XA[3], 0xcc );
|
|
__m128i t3 = _mm_blend_epi16( XA[2], XA[3], 0x33 );
|
|
XA[0] = _mm_blend_epi16( t0, t2, 0xf0 );
|
|
XA[1] = _mm_blend_epi16( t1, t3, 0x3c );
|
|
XA[2] = _mm_blend_epi16( t0, t2, 0x0f );
|
|
XA[3] = _mm_blend_epi16( t1, t3, 0xc3 );
|
|
t0 = _mm_blend_epi16( XB[0], XB[1], 0xcc );
|
|
t1 = _mm_blend_epi16( XB[0], XB[1], 0x33 );
|
|
t2 = _mm_blend_epi16( XB[2], XB[3], 0xcc );
|
|
t3 = _mm_blend_epi16( XB[2], XB[3], 0x33 );
|
|
XB[0] = _mm_blend_epi16( t0, t2, 0xf0 );
|
|
XB[1] = _mm_blend_epi16( t1, t3, 0x3c );
|
|
XB[2] = _mm_blend_epi16( t0, t2, 0x0f );
|
|
XB[3] = _mm_blend_epi16( t1, t3, 0xc3 );
|
|
|
|
#else // SSE2
|
|
|
|
__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3;
|
|
|
|
YA0 = _mm_set_epi32( xa[15], xa[10], xa[ 5], xa[ 0] );
|
|
YB0 = _mm_set_epi32( xb[15], xb[10], xb[ 5], xb[ 0] );
|
|
YA1 = _mm_set_epi32( xa[ 3], xa[14], xa[ 9], xa[ 4] );
|
|
YB1 = _mm_set_epi32( xb[ 3], xb[14], xb[ 9], xb[ 4] );
|
|
YA2 = _mm_set_epi32( xa[ 7], xa[ 2], xa[13], xa[ 8] );
|
|
YB2 = _mm_set_epi32( xb[ 7], xb[ 2], xb[13], xb[ 8] );
|
|
YA3 = _mm_set_epi32( xa[11], xa[ 6], xa[ 1], xa[12] );
|
|
YB3 = _mm_set_epi32( xb[11], xb[ 6], xb[ 1], xb[12] );
|
|
|
|
XA[0] = YA0;
|
|
XB[0] = YB0;
|
|
XA[1] = YA1;
|
|
XB[1] = YB1;
|
|
XA[2] = YA2;
|
|
XB[2] = YB2;
|
|
XA[3] = YA3;
|
|
XB[3] = YB3;
|
|
|
|
#endif
|
|
}
|
|
|
|
static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
|
|
{
|
|
|
|
__m128i *XA = (__m128i*)xa;
|
|
__m128i *XB = (__m128i*)xb;
|
|
|
|
#if defined(__SSE4_1__)
|
|
|
|
__m128i t0 = _mm_blend_epi16( XA[0], XA[2], 0xf0 );
|
|
__m128i t1 = _mm_blend_epi16( XA[0], XA[2], 0x0f );
|
|
__m128i t2 = _mm_blend_epi16( XA[1], XA[3], 0x3c );
|
|
__m128i t3 = _mm_blend_epi16( XA[1], XA[3], 0xc3 );
|
|
XA[0] = _mm_blend_epi16( t0, t2, 0xcc );
|
|
XA[1] = _mm_blend_epi16( t0, t2, 0x33 );
|
|
XA[2] = _mm_blend_epi16( t1, t3, 0xcc );
|
|
XA[3] = _mm_blend_epi16( t1, t3, 0x33 );
|
|
t0 = _mm_blend_epi16( XB[0], XB[2], 0xf0 );
|
|
t1 = _mm_blend_epi16( XB[0], XB[2], 0x0f );
|
|
t2 = _mm_blend_epi16( XB[1], XB[3], 0x3c );
|
|
t3 = _mm_blend_epi16( XB[1], XB[3], 0xc3 );
|
|
XB[0] = _mm_blend_epi16( t0, t2, 0xcc );
|
|
XB[1] = _mm_blend_epi16( t0, t2, 0x33 );
|
|
XB[2] = _mm_blend_epi16( t1, t3, 0xcc );
|
|
XB[3] = _mm_blend_epi16( t1, t3, 0x33 );
|
|
|
|
#else // SSE2
|
|
|
|
m128_ovly ya[4], za[4], yb[4], zb[4];
|
|
|
|
ya[0].m128 = XA[0];
|
|
yb[0].m128 = XB[0];
|
|
ya[1].m128 = XA[1];
|
|
yb[1].m128 = XB[1];
|
|
ya[2].m128 = XA[2];
|
|
yb[2].m128 = XB[2];
|
|
ya[3].m128 = XA[3];
|
|
yb[3].m128 = XB[3];
|
|
|
|
za[0].u32[0] = ya[0].u32[0];
|
|
zb[0].u32[0] = yb[0].u32[0];
|
|
za[0].u32[1] = ya[3].u32[1];
|
|
zb[0].u32[1] = yb[3].u32[1];
|
|
za[0].u32[2] = ya[2].u32[2];
|
|
zb[0].u32[2] = yb[2].u32[2];
|
|
za[0].u32[3] = ya[1].u32[3];
|
|
zb[0].u32[3] = yb[1].u32[3];
|
|
|
|
za[1].u32[0] = ya[1].u32[0];
|
|
zb[1].u32[0] = yb[1].u32[0];
|
|
za[1].u32[1] = ya[0].u32[1];
|
|
zb[1].u32[1] = yb[0].u32[1];
|
|
za[1].u32[2] = ya[3].u32[2];
|
|
zb[1].u32[2] = yb[3].u32[2];
|
|
za[1].u32[3] = ya[2].u32[3];
|
|
zb[1].u32[3] = yb[2].u32[3];
|
|
|
|
za[2].u32[0] = ya[2].u32[0];
|
|
zb[2].u32[0] = yb[2].u32[0];
|
|
za[2].u32[1] = ya[1].u32[1];
|
|
zb[2].u32[1] = yb[1].u32[1];
|
|
za[2].u32[2] = ya[0].u32[2];
|
|
zb[2].u32[2] = yb[0].u32[2];
|
|
za[2].u32[3] = ya[3].u32[3];
|
|
zb[2].u32[3] = yb[3].u32[3];
|
|
|
|
za[3].u32[0] = ya[3].u32[0];
|
|
zb[3].u32[0] = yb[3].u32[0];
|
|
za[3].u32[1] = ya[2].u32[1];
|
|
zb[3].u32[1] = yb[2].u32[1];
|
|
za[3].u32[2] = ya[1].u32[2];
|
|
zb[3].u32[2] = yb[1].u32[2];
|
|
za[3].u32[3] = ya[0].u32[3];
|
|
zb[3].u32[3] = yb[0].u32[3];
|
|
|
|
XA[0] = za[0].m128;
|
|
XB[0] = zb[0].m128;
|
|
XA[1] = za[1].m128;
|
|
XB[1] = zb[1].m128;
|
|
XA[2] = za[2].m128;
|
|
XB[2] = zb[2].m128;
|
|
XA[3] = za[3].m128;
|
|
XB[3] = zb[3].m128;
|
|
|
|
#endif
|
|
}
|
|
|
|
static void salsa8_simd128_2buf( uint32_t * const ba, uint32_t * const bb,
|
|
const uint32_t * const ca, const uint32_t * const cb )
|
|
{
|
|
__m128i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3;
|
|
__m128i *BA = (__m128i*)ba;
|
|
__m128i *BB = (__m128i*)bb;
|
|
const __m128i *CA = (const __m128i*)ca;
|
|
const __m128i *CB = (const __m128i*)cb;
|
|
|
|
// define targets for macros used in round function template
|
|
#define ROL_1X32 mm128_shufll_32
|
|
#define ROR_1X32 mm128_shuflr_32
|
|
#define SWAP_64 mm128_swap_64
|
|
#define ROL32 mm128_rol_32
|
|
#define ADD32 _mm_add_epi32
|
|
#define XOR _mm_xor_si128
|
|
#define TYPE __m128i
|
|
|
|
XA0 = BA[0] = _mm_xor_si128( BA[0], CA[0] );
|
|
XB0 = BB[0] = _mm_xor_si128( BB[0], CB[0] );
|
|
XA1 = BA[1] = _mm_xor_si128( BA[1], CA[1] );
|
|
XB1 = BB[1] = _mm_xor_si128( BB[1], CB[1] );
|
|
XA2 = BA[2] = _mm_xor_si128( BA[2], CA[2] );
|
|
XB2 = BB[2] = _mm_xor_si128( BB[2], CB[2] );
|
|
XA3 = BA[3] = _mm_xor_si128( BA[3], CA[3] );
|
|
XB3 = BB[3] = _mm_xor_si128( BB[3], CB[3] );
|
|
|
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
|
|
|
SALSA_8ROUNDS_SIMD128_2BUF;
|
|
|
|
#else
|
|
|
|
SALSA_8ROUNDS_SIMD128_2BUF_SLOROT;
|
|
|
|
#endif
|
|
|
|
BA[0] = _mm_add_epi32( BA[0], XA0 );
|
|
BB[0] = _mm_add_epi32( BB[0], XB0 );
|
|
BA[1] = _mm_add_epi32( BA[1], XA1 );
|
|
BB[1] = _mm_add_epi32( BB[1], XB1 );
|
|
BA[2] = _mm_add_epi32( BA[2], XA2 );
|
|
BB[2] = _mm_add_epi32( BB[2], XB2 );
|
|
BA[3] = _mm_add_epi32( BA[3], XA3 );
|
|
BB[3] = _mm_add_epi32( BB[3], XB3 );
|
|
|
|
#undef ROL_1X32
|
|
#undef ROR_1X32
|
|
#undef SWAP_64
|
|
#undef ROL32
|
|
#undef ADD32
|
|
#undef XOR
|
|
#undef TYPE
|
|
}
|
|
|
|
void scrypt_core_simd128_2buf( uint32_t *X, uint32_t *V, const uint32_t N )
|
|
{
|
|
uint32_t *X0 = X;
|
|
uint32_t *X1 = X+32;
|
|
uint32_t *V0 = V;
|
|
uint32_t *V1 = V + 32*N;
|
|
|
|
salsa_simd128_shuffle_2buf( X0, X1 );
|
|
salsa_simd128_shuffle_2buf( X0+16, X1+16 );
|
|
|
|
for ( int n = 0; n < N; n++ )
|
|
{
|
|
#if defined(__AVX__)
|
|
|
|
for ( int i = 0; i < 4; i++ )
|
|
{
|
|
_mm256_stream_si256( (__m256i*)V0 + n*4 + i, casti_m256i( X0, i ) );
|
|
_mm256_stream_si256( (__m256i*)V1 + n*4 + i, casti_m256i( X1, i ) );
|
|
}
|
|
|
|
#elif defined(__SSE4_1__)
|
|
|
|
for ( int i = 0; i < 8; i++ )
|
|
{
|
|
_mm_stream_si128( (__m128i*)V0 + n*8 + i, casti_m128i( X0, i ) );
|
|
_mm_stream_si128( (__m128i*)V1 + n*8 + i, casti_m128i( X1, i ) );
|
|
}
|
|
|
|
#else
|
|
|
|
memcpy( &V0[ n*32 ], X0, 128 );
|
|
memcpy( &V1[ n*32 ], X1, 128 );
|
|
|
|
#endif
|
|
|
|
salsa8_simd128_2buf( X0, X1, X0+16, X1+16 );
|
|
salsa8_simd128_2buf( X0+16, X1+16, X0 , X1 );
|
|
}
|
|
|
|
for ( int n = 0; n < N; n++ )
|
|
{
|
|
#if defined(__AVX2__)
|
|
|
|
const int j0 = 4 * ( X0[16] & ( N-1 ) );
|
|
const int j1 = 4 * ( X1[16] & ( N-1 ) );
|
|
|
|
const __m256i v00 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0 );
|
|
const __m256i v10 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1 );
|
|
const __m256i v01 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+1 );
|
|
const __m256i v11 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+1 );
|
|
const __m256i v02 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+2 );
|
|
const __m256i v12 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+2 );
|
|
const __m256i v03 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+3 );
|
|
const __m256i v13 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+3 );
|
|
|
|
casti_m256i( X0, 0 ) = _mm256_xor_si256( casti_m256i( X0, 0 ), v00 );
|
|
casti_m256i( X1, 0 ) = _mm256_xor_si256( casti_m256i( X1, 0 ), v10 );
|
|
casti_m256i( X0, 1 ) = _mm256_xor_si256( casti_m256i( X0, 1 ), v01 );
|
|
casti_m256i( X1, 1 ) = _mm256_xor_si256( casti_m256i( X1, 1 ), v11 );
|
|
casti_m256i( X0, 2 ) = _mm256_xor_si256( casti_m256i( X0, 2 ), v02 );
|
|
casti_m256i( X1, 2 ) = _mm256_xor_si256( casti_m256i( X1, 2 ), v12 );
|
|
casti_m256i( X0, 3 ) = _mm256_xor_si256( casti_m256i( X0, 3 ), v03 );
|
|
casti_m256i( X1, 3 ) = _mm256_xor_si256( casti_m256i( X1, 3 ), v13 );
|
|
|
|
#else
|
|
|
|
const int j0 = 8 * ( X0[16] & ( N-1 ) );
|
|
const int j1 = 8 * ( X1[16] & ( N-1 ) );
|
|
for ( int i = 0; i < 8; i++ )
|
|
{
|
|
const __m128i v0 = _mm_load_si128( ( (__m128i*)V0 ) +j0+i );
|
|
const __m128i v1 = _mm_load_si128( ( (__m128i*)V1 ) +j1+i );
|
|
casti_m128i( X0, i ) = _mm_xor_si128( casti_m128i( X0, i ), v0 );
|
|
casti_m128i( X1, i ) = _mm_xor_si128( casti_m128i( X1, i ), v1 );
|
|
}
|
|
|
|
#endif
|
|
|
|
salsa8_simd128_2buf( X0, X1, X0+16, X1+16 );
|
|
salsa8_simd128_2buf( X0+16, X1+16, X0 , X1 );
|
|
}
|
|
|
|
salsa_simd128_unshuffle_2buf( X0, X1 );
|
|
salsa_simd128_unshuffle_2buf( X0+16, X1+16 );
|
|
}
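
// Minimal caller-side sketch for the double buffered core above, not part
// of the build: buffer names are hypothetical and N = 1024 is only an
// assumed typical value. With no interleaving the two lanes simply sit back
// to back, lane 0 in X[0..31] and lane 1 in X[32..63]; V needs 2*32*N words
// and, like X, should be vector aligned for the loads and stores above.
static inline void example_scrypt_2buf_usage( uint32_t *X )
{
   const uint32_t N = 1024;
   uint32_t *V = (uint32_t*)_mm_malloc( 2 * 32 * N * sizeof(uint32_t), 64 );
   scrypt_core_simd128_2buf( X, V, N );   // shuffles once on entry, unshuffles once on exit
   _mm_free( V );
}
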
static inline void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
|
|
uint32_t *xc )
|
|
{
|
|
__m128i *XA = (__m128i*)xa;
|
|
__m128i *XB = (__m128i*)xb;
|
|
__m128i *XC = (__m128i*)xc;
|
|
|
|
#if defined(__SSE4_1__)
|
|
|
|
__m128i t0 = _mm_blend_epi16( XA[0], XA[1], 0xcc );
|
|
__m128i t1 = _mm_blend_epi16( XA[0], XA[1], 0x33 );
|
|
__m128i t2 = _mm_blend_epi16( XA[2], XA[3], 0xcc );
|
|
__m128i t3 = _mm_blend_epi16( XA[2], XA[3], 0x33 );
|
|
XA[0] = _mm_blend_epi16( t0, t2, 0xf0 );
|
|
XA[1] = _mm_blend_epi16( t1, t3, 0x3c );
|
|
XA[2] = _mm_blend_epi16( t0, t2, 0x0f );
|
|
XA[3] = _mm_blend_epi16( t1, t3, 0xc3 );
|
|
t0 = _mm_blend_epi16( XB[0], XB[1], 0xcc );
|
|
t1 = _mm_blend_epi16( XB[0], XB[1], 0x33 );
|
|
t2 = _mm_blend_epi16( XB[2], XB[3], 0xcc );
|
|
t3 = _mm_blend_epi16( XB[2], XB[3], 0x33 );
|
|
XB[0] = _mm_blend_epi16( t0, t2, 0xf0 );
|
|
XB[1] = _mm_blend_epi16( t1, t3, 0x3c );
|
|
XB[2] = _mm_blend_epi16( t0, t2, 0x0f );
|
|
XB[3] = _mm_blend_epi16( t1, t3, 0xc3 );
|
|
t0 = _mm_blend_epi16( XC[0], XC[1], 0xcc );
|
|
t1 = _mm_blend_epi16( XC[0], XC[1], 0x33 );
|
|
t2 = _mm_blend_epi16( XC[2], XC[3], 0xcc );
|
|
t3 = _mm_blend_epi16( XC[2], XC[3], 0x33 );
|
|
XC[0] = _mm_blend_epi16( t0, t2, 0xf0 );
|
|
XC[1] = _mm_blend_epi16( t1, t3, 0x3c );
|
|
XC[2] = _mm_blend_epi16( t0, t2, 0x0f );
|
|
XC[3] = _mm_blend_epi16( t1, t3, 0xc3 );
|
|
|
|
#else // SSE2
|
|
|
|
__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3;
|
|
|
|
YA0 = _mm_set_epi32( xa[15], xa[10], xa[ 5], xa[ 0] );
|
|
YB0 = _mm_set_epi32( xb[15], xb[10], xb[ 5], xb[ 0] );
|
|
YC0 = _mm_set_epi32( xc[15], xc[10], xc[ 5], xc[ 0] );
|
|
YA1 = _mm_set_epi32( xa[ 3], xa[14], xa[ 9], xa[ 4] );
|
|
YB1 = _mm_set_epi32( xb[ 3], xb[14], xb[ 9], xb[ 4] );
|
|
YC1 = _mm_set_epi32( xc[ 3], xc[14], xc[ 9], xc[ 4] );
|
|
YA2 = _mm_set_epi32( xa[ 7], xa[ 2], xa[13], xa[ 8] );
|
|
YB2 = _mm_set_epi32( xb[ 7], xb[ 2], xb[13], xb[ 8] );
|
|
YC2 = _mm_set_epi32( xc[ 7], xc[ 2], xc[13], xc[ 8] );
|
|
YA3 = _mm_set_epi32( xa[11], xa[ 6], xa[ 1], xa[12] );
|
|
YB3 = _mm_set_epi32( xb[11], xb[ 6], xb[ 1], xb[12] );
|
|
YC3 = _mm_set_epi32( xc[11], xc[ 6], xc[ 1], xc[12] );
|
|
|
|
XA[0] = YA0;
|
|
XB[0] = YB0;
|
|
XC[0] = YC0;
|
|
XA[1] = YA1;
|
|
XB[1] = YB1;
|
|
XC[1] = YC1;
|
|
XA[2] = YA2;
|
|
XB[2] = YB2;
|
|
XC[2] = YC2;
|
|
XA[3] = YA3;
|
|
XB[3] = YB3;
|
|
XC[3] = YC3;
|
|
|
|
#endif
|
|
}
|
|
|
|
static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
|
|
uint32_t* xc )
|
|
{
|
|
__m128i *XA = (__m128i*)xa;
|
|
__m128i *XB = (__m128i*)xb;
|
|
__m128i *XC = (__m128i*)xc;
|
|
|
|
#if defined(__SSE4_1__)
|
|
|
|
__m128i t0 = _mm_blend_epi16( XA[0], XA[2], 0xf0 );
|
|
__m128i t1 = _mm_blend_epi16( XA[0], XA[2], 0x0f );
|
|
__m128i t2 = _mm_blend_epi16( XA[1], XA[3], 0x3c );
|
|
__m128i t3 = _mm_blend_epi16( XA[1], XA[3], 0xc3 );
|
|
XA[0] = _mm_blend_epi16( t0, t2, 0xcc );
|
|
XA[1] = _mm_blend_epi16( t0, t2, 0x33 );
|
|
XA[2] = _mm_blend_epi16( t1, t3, 0xcc );
|
|
XA[3] = _mm_blend_epi16( t1, t3, 0x33 );
|
|
t0 = _mm_blend_epi16( XB[0], XB[2], 0xf0 );
|
|
t1 = _mm_blend_epi16( XB[0], XB[2], 0x0f );
|
|
t2 = _mm_blend_epi16( XB[1], XB[3], 0x3c );
|
|
t3 = _mm_blend_epi16( XB[1], XB[3], 0xc3 );
|
|
XB[0] = _mm_blend_epi16( t0, t2, 0xcc );
|
|
XB[1] = _mm_blend_epi16( t0, t2, 0x33 );
|
|
XB[2] = _mm_blend_epi16( t1, t3, 0xcc );
|
|
XB[3] = _mm_blend_epi16( t1, t3, 0x33 );
|
|
t0 = _mm_blend_epi16( XC[0], XC[2], 0xf0 );
|
|
t1 = _mm_blend_epi16( XC[0], XC[2], 0x0f );
|
|
t2 = _mm_blend_epi16( XC[1], XC[3], 0x3c );
|
|
t3 = _mm_blend_epi16( XC[1], XC[3], 0xc3 );
|
|
XC[0] = _mm_blend_epi16( t0, t2, 0xcc );
|
|
XC[1] = _mm_blend_epi16( t0, t2, 0x33 );
|
|
XC[2] = _mm_blend_epi16( t1, t3, 0xcc );
|
|
XC[3] = _mm_blend_epi16( t1, t3, 0x33 );
|
|
|
|
#else // SSE2
|
|
|
|
m128_ovly ya[4], za[4], yb[4], zb[4], yc[4], zc[4];
|
|
|
|
ya[0].m128 = XA[0];
|
|
yb[0].m128 = XB[0];
|
|
yc[0].m128 = XC[0];
|
|
ya[1].m128 = XA[1];
|
|
yb[1].m128 = XB[1];
|
|
yc[1].m128 = XC[1];
|
|
ya[2].m128 = XA[2];
|
|
yb[2].m128 = XB[2];
|
|
yc[2].m128 = XC[2];
|
|
ya[3].m128 = XA[3];
|
|
yb[3].m128 = XB[3];
|
|
yc[3].m128 = XC[3];
|
|
|
|
za[0].u32[0] = ya[0].u32[0];
|
|
zb[0].u32[0] = yb[0].u32[0];
|
|
zc[0].u32[0] = yc[0].u32[0];
|
|
za[0].u32[1] = ya[3].u32[1];
|
|
zb[0].u32[1] = yb[3].u32[1];
|
|
zc[0].u32[1] = yc[3].u32[1];
|
|
za[0].u32[2] = ya[2].u32[2];
|
|
zb[0].u32[2] = yb[2].u32[2];
|
|
zc[0].u32[2] = yc[2].u32[2];
|
|
za[0].u32[3] = ya[1].u32[3];
|
|
zb[0].u32[3] = yb[1].u32[3];
|
|
zc[0].u32[3] = yc[1].u32[3];
|
|
|
|
za[1].u32[0] = ya[1].u32[0];
|
|
zb[1].u32[0] = yb[1].u32[0];
|
|
zc[1].u32[0] = yc[1].u32[0];
|
|
za[1].u32[1] = ya[0].u32[1];
|
|
zb[1].u32[1] = yb[0].u32[1];
|
|
zc[1].u32[1] = yc[0].u32[1];
|
|
za[1].u32[2] = ya[3].u32[2];
|
|
zb[1].u32[2] = yb[3].u32[2];
|
|
zc[1].u32[2] = yc[3].u32[2];
|
|
za[1].u32[3] = ya[2].u32[3];
|
|
zb[1].u32[3] = yb[2].u32[3];
|
|
zc[1].u32[3] = yc[2].u32[3];
|
|
|
|
za[2].u32[0] = ya[2].u32[0];
|
|
zb[2].u32[0] = yb[2].u32[0];
|
|
zc[2].u32[0] = yc[2].u32[0];
|
|
za[2].u32[1] = ya[1].u32[1];
|
|
zb[2].u32[1] = yb[1].u32[1];
|
|
zc[2].u32[1] = yc[1].u32[1];
|
|
za[2].u32[2] = ya[0].u32[2];
|
|
zb[2].u32[2] = yb[0].u32[2];
|
|
zc[2].u32[2] = yc[0].u32[2];
|
|
za[2].u32[3] = ya[3].u32[3];
|
|
zb[2].u32[3] = yb[3].u32[3];
|
|
zc[2].u32[3] = yc[3].u32[3];
|
|
|
|
za[3].u32[0] = ya[3].u32[0];
|
|
zb[3].u32[0] = yb[3].u32[0];
|
|
zc[3].u32[0] = yc[3].u32[0];
|
|
za[3].u32[1] = ya[2].u32[1];
|
|
zb[3].u32[1] = yb[2].u32[1];
|
|
zc[3].u32[1] = yc[2].u32[1];
|
|
za[3].u32[2] = ya[1].u32[2];
|
|
zb[3].u32[2] = yb[1].u32[2];
|
|
zc[3].u32[2] = yc[1].u32[2];
|
|
za[3].u32[3] = ya[0].u32[3];
|
|
zb[3].u32[3] = yb[0].u32[3];
|
|
zc[3].u32[3] = yc[0].u32[3];
|
|
|
|
XA[0] = za[0].m128;
|
|
XB[0] = zb[0].m128;
|
|
XC[0] = zc[0].m128;
|
|
XA[1] = za[1].m128;
|
|
XB[1] = zb[1].m128;
|
|
XC[1] = zc[1].m128;
|
|
XA[2] = za[2].m128;
|
|
XB[2] = zb[2].m128;
|
|
XC[2] = zc[2].m128;
|
|
XA[3] = za[3].m128;
|
|
XB[3] = zb[3].m128;
|
|
XC[3] = zc[3].m128;
|
|
|
|
#endif
|
|
}
|
|
|
|
// Triple buffered, 3x memory usage
|
|
// No interleaving
|
|
static void salsa8_simd128_3buf( uint32_t *ba, uint32_t *bb, uint32_t *bc,
|
|
const uint32_t *ca, const uint32_t *cb, const uint32_t *cc )
|
|
{
|
|
__m128i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3,
|
|
XC0, XC1, XC2, XC3;
|
|
__m128i *BA = (__m128i*)ba;
|
|
__m128i *BB = (__m128i*)bb;
|
|
__m128i *BC = (__m128i*)bc;
|
|
const __m128i *CA = (const __m128i*)ca;
|
|
const __m128i *CB = (const __m128i*)cb;
|
|
const __m128i *CC = (const __m128i*)cc;
|
|
|
|
// define targets for macros used in round function template
|
|
#define ROL_1X32 mm128_shufll_32
|
|
#define ROR_1X32 mm128_shuflr_32
|
|
#define SWAP_64 mm128_swap_64
|
|
#define ROL32 mm128_rol_32
|
|
#define ADD32 _mm_add_epi32
|
|
#define XOR _mm_xor_si128
|
|
#define TYPE __m128i
|
|
|
|
XA0 = BA[0] = _mm_xor_si128( BA[0], CA[0] );
|
|
XB0 = BB[0] = _mm_xor_si128( BB[0], CB[0] );
|
|
XC0 = BC[0] = _mm_xor_si128( BC[0], CC[0] );
|
|
XA1 = BA[1] = _mm_xor_si128( BA[1], CA[1] );
|
|
XB1 = BB[1] = _mm_xor_si128( BB[1], CB[1] );
|
|
XC1 = BC[1] = _mm_xor_si128( BC[1], CC[1] );
|
|
XA2 = BA[2] = _mm_xor_si128( BA[2], CA[2] );
|
|
XB2 = BB[2] = _mm_xor_si128( BB[2], CB[2] );
|
|
XC2 = BC[2] = _mm_xor_si128( BC[2], CC[2] );
|
|
XA3 = BA[3] = _mm_xor_si128( BA[3], CA[3] );
|
|
XB3 = BB[3] = _mm_xor_si128( BB[3], CB[3] );
|
|
XC3 = BC[3] = _mm_xor_si128( BC[3], CC[3] );
|
|
|
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
|
|
|
SALSA_8ROUNDS_SIMD128_3BUF;
|
|
|
|
#else
|
|
|
|
SALSA_8ROUNDS_SIMD128_3BUF_SLOROT;
|
|
|
|
#endif
|
|
|
|
BA[0] = _mm_add_epi32( BA[0], XA0 );
|
|
BB[0] = _mm_add_epi32( BB[0], XB0 );
|
|
BC[0] = _mm_add_epi32( BC[0], XC0 );
|
|
BA[1] = _mm_add_epi32( BA[1], XA1 );
|
|
BB[1] = _mm_add_epi32( BB[1], XB1 );
|
|
BC[1] = _mm_add_epi32( BC[1], XC1 );
|
|
BA[2] = _mm_add_epi32( BA[2], XA2 );
|
|
BB[2] = _mm_add_epi32( BB[2], XB2 );
|
|
BC[2] = _mm_add_epi32( BC[2], XC2 );
|
|
BA[3] = _mm_add_epi32( BA[3], XA3 );
|
|
BB[3] = _mm_add_epi32( BB[3], XB3 );
|
|
BC[3] = _mm_add_epi32( BC[3], XC3 );
|
|
|
|
#undef ROL_1X32
|
|
#undef ROR_1X32
|
|
#undef SWAP_64
|
|
#undef ROL32
|
|
#undef ADD32
|
|
#undef XOR
|
|
#undef TYPE
|
|
}
|
|
|
|
void scrypt_core_simd128_3buf( uint32_t *X, uint32_t *V, const uint32_t N )
|
|
{
|
|
uint32_t *X0 = X;
|
|
uint32_t *X1 = X+32;
|
|
uint32_t *X2 = X+64;
|
|
uint32_t *V0 = V;
|
|
uint32_t *V1 = V + 32*N;
|
|
uint32_t *V2 = V + 64*N;
|
|
|
|
salsa_simd128_shuffle_3buf( X0, X1, X2 );
|
|
salsa_simd128_shuffle_3buf( X0+16, X1+16, X2+16 );
|
|
|
|
for ( int n = 0; n < N; n++ )
|
|
{
|
|
#if defined(__AVX__)
|
|
|
|
for ( int i = 0; i < 4; i++ )
|
|
{
|
|
_mm256_stream_si256( (__m256i*)V0 + n*4 + i, casti_m256i( X0, i ) );
|
|
_mm256_stream_si256( (__m256i*)V1 + n*4 + i, casti_m256i( X1, i ) );
|
|
_mm256_stream_si256( (__m256i*)V2 + n*4 + i, casti_m256i( X2, i ) );
|
|
}
|
|
|
|
#elif defined(__SSE4_1__)
|
|
|
|
for ( int i = 0; i < 8; i++ )
|
|
{
|
|
_mm_stream_si128( (__m128i*)V0 + n*8 + i, casti_m128i( X0, i ) );
|
|
_mm_stream_si128( (__m128i*)V1 + n*8 + i, casti_m128i( X1, i ) );
|
|
_mm_stream_si128( (__m128i*)V2 + n*8 + i, casti_m128i( X2, i ) );
|
|
}
|
|
|
|
#else
|
|
|
|
memcpy( &V0[ n*32 ], X0, 128 );
|
|
memcpy( &V1[ n*32 ], X1, 128 );
|
|
memcpy( &V2[ n*32 ], X2, 128 );
|
|
|
|
#endif
|
|
|
|
salsa8_simd128_3buf( X0, X1, X2 , X0+16, X1+16, X2+16 );
|
|
salsa8_simd128_3buf( X0+16, X1+16, X2+16, X0, X1, X2 );
|
|
}
|
|
|
|
for ( int n = 0; n < N; n++ )
|
|
{
|
|
#if defined(__AVX2__)
|
|
|
|
const int j0 = 4 * ( X0[16] & ( N-1 ) );
|
|
const int j1 = 4 * ( X1[16] & ( N-1 ) );
|
|
const int j2 = 4 * ( X2[16] & ( N-1 ) );
|
|
|
|
const __m256i v00 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0 );
|
|
const __m256i v10 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1 );
|
|
const __m256i v20 = _mm256_stream_load_si256( ( (__m256i*)V2 ) +j2 );
|
|
const __m256i v01 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+1 );
|
|
const __m256i v11 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+1 );
|
|
const __m256i v21 = _mm256_stream_load_si256( ( (__m256i*)V2 ) +j2+1 );
|
|
const __m256i v02 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+2 );
|
|
const __m256i v12 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+2 );
|
|
const __m256i v22 = _mm256_stream_load_si256( ( (__m256i*)V2 ) +j2+2 );
|
|
const __m256i v03 = _mm256_stream_load_si256( ( (__m256i*)V0 ) +j0+3 );
|
|
const __m256i v13 = _mm256_stream_load_si256( ( (__m256i*)V1 ) +j1+3 );
|
|
const __m256i v23 = _mm256_stream_load_si256( ( (__m256i*)V2 ) +j2+3 );
|
|
|
|
casti_m256i( X0, 0 ) = _mm256_xor_si256( casti_m256i( X0, 0 ), v00 );
|
|
casti_m256i( X1, 0 ) = _mm256_xor_si256( casti_m256i( X1, 0 ), v10 );
|
|
casti_m256i( X2, 0 ) = _mm256_xor_si256( casti_m256i( X2, 0 ), v20 );
|
|
casti_m256i( X0, 1 ) = _mm256_xor_si256( casti_m256i( X0, 1 ), v01 );
|
|
casti_m256i( X1, 1 ) = _mm256_xor_si256( casti_m256i( X1, 1 ), v11 );
|
|
casti_m256i( X2, 1 ) = _mm256_xor_si256( casti_m256i( X2, 1 ), v21 );
|
|
casti_m256i( X0, 2 ) = _mm256_xor_si256( casti_m256i( X0, 2 ), v02 );
|
|
casti_m256i( X1, 2 ) = _mm256_xor_si256( casti_m256i( X1, 2 ), v12 );
|
|
casti_m256i( X2, 2 ) = _mm256_xor_si256( casti_m256i( X2, 2 ), v22 );
|
|
casti_m256i( X0, 3 ) = _mm256_xor_si256( casti_m256i( X0, 3 ), v03 );
|
|
casti_m256i( X1, 3 ) = _mm256_xor_si256( casti_m256i( X1, 3 ), v13 );
|
|
casti_m256i( X2, 3 ) = _mm256_xor_si256( casti_m256i( X2, 3 ), v23 );
|
|
|
|
#else
|
|
|
|
const int j0 = 8 * ( X0[16] & ( N-1 ) );
|
|
const int j1 = 8 * ( X1[16] & ( N-1 ) );
|
|
const int j2 = 8 * ( X2[16] & ( N-1 ) );
|
|
for ( int i = 0; i < 8; i++ )
|
|
{
|
|
const __m128i v0 = _mm_load_si128( ( (__m128i*)V0 ) +j0+i );
|
|
const __m128i v1 = _mm_load_si128( ( (__m128i*)V1 ) +j1+i );
|
|
const __m128i v2 = _mm_load_si128( ( (__m128i*)V2 ) +j2+i );
|
|
casti_m128i( X0, i ) = _mm_xor_si128( casti_m128i( X0, i ), v0 );
|
|
casti_m128i( X1, i ) = _mm_xor_si128( casti_m128i( X1, i ), v1 );
|
|
casti_m128i( X2, i ) = _mm_xor_si128( casti_m128i( X2, i ), v2 );
|
|
}
|
|
|
|
#endif
|
|
|
|
salsa8_simd128_3buf( X0, X1, X2 , X0+16, X1+16, X2+16 );
|
|
salsa8_simd128_3buf( X0+16, X1+16, X2+16, X0, X1, X2 );
|
|
}
|
|
|
|
salsa_simd128_unshuffle_3buf( X0, X1, X2 );
|
|
salsa_simd128_unshuffle_3buf( X0+16, X1+16, X2+16 );
|
|
|
|
}

#endif // SSE2


// Reference, used only for testing.
// Tested OK.
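
// SALSA_8ROUNDS is defined elsewhere in this codebase and implements the 8
// Salsa20 rounds; each round is built from the standard Salsa20 quarter
// round, restated below as a self contained illustrative sketch (not used
// by the build, name is hypothetical).
static inline void example_salsa_quarter_round( uint32_t *a, uint32_t *b,
                                                uint32_t *c, uint32_t *d )
{
   // each step is add, rotate left, xor with the Salsa20 rotation counts
   *b ^= ( ( *a + *d ) <<  7 ) | ( ( *a + *d ) >> 25 );
   *c ^= ( ( *b + *a ) <<  9 ) | ( ( *b + *a ) >> 23 );
   *d ^= ( ( *c + *b ) << 13 ) | ( ( *c + *b ) >> 19 );
   *a ^= ( ( *d + *c ) << 18 ) | ( ( *d + *c ) >> 14 );
}
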
static void xor_salsa8(uint32_t * const B, const uint32_t * const C)
|
|
{
|
|
uint32_t x0 = (B[ 0] ^= C[ 0]),
|
|
x1 = (B[ 1] ^= C[ 1]),
|
|
x2 = (B[ 2] ^= C[ 2]),
|
|
x3 = (B[ 3] ^= C[ 3]);
|
|
uint32_t x4 = (B[ 4] ^= C[ 4]),
|
|
x5 = (B[ 5] ^= C[ 5]),
|
|
x6 = (B[ 6] ^= C[ 6]),
|
|
x7 = (B[ 7] ^= C[ 7]);
|
|
uint32_t x8 = (B[ 8] ^= C[ 8]),
|
|
x9 = (B[ 9] ^= C[ 9]),
|
|
xa = (B[10] ^= C[10]),
|
|
xb = (B[11] ^= C[11]);
|
|
uint32_t xc = (B[12] ^= C[12]),
|
|
xd = (B[13] ^= C[13]),
|
|
xe = (B[14] ^= C[14]),
|
|
xf = (B[15] ^= C[15]);
|
|
|
|
|
|
#define ROL32( a, c ) rol32( a, c )
|
|
#define ADD32( a, b ) ( (a)+(b) )
|
|
#define XOR( a, b ) ( (a)^(b) )
|
|
|
|
SALSA_8ROUNDS;
|
|
|
|
#undef ROL32
|
|
#undef ADD32
|
|
#undef XOR
|
|
|
|
B[ 0] += x0;
|
|
B[ 1] += x1;
|
|
B[ 2] += x2;
|
|
B[ 3] += x3;
|
|
B[ 4] += x4;
|
|
B[ 5] += x5;
|
|
B[ 6] += x6;
|
|
B[ 7] += x7;
|
|
B[ 8] += x8;
|
|
B[ 9] += x9;
|
|
B[10] += xa;
|
|
B[11] += xb;
|
|
B[12] += xc;
|
|
B[13] += xd;
|
|
B[14] += xe;
|
|
B[15] += xf;
|
|
}
|
|
|
|
/**
 * @param X input/output
 * @param V scratch buffer
 * @param N factor (def. 1024)
 */
void scrypt_core_1way( uint32_t *X, uint32_t *V, const uint32_t N )
|
|
{
|
|
for ( int n = 0; n < N; n++ )
|
|
{
|
|
memcpy( &V[ n*32 ], X, 128 );
|
|
xor_salsa8( &X[ 0], &X[16] );
|
|
xor_salsa8( &X[16], &X[ 0] );
|
|
}
|
|
for ( int n = 0; n < N; n++ )
|
|
{
|
|
int j = 32 * ( X[16] & ( N - 1 ) );
|
|
for ( int i = 0; i < 32; i++ )
|
|
X[i] ^= V[ j+i ];
|
|
xor_salsa8( &X[ 0], &X[16] );
|
|
xor_salsa8( &X[16], &X[ 0] );
|
|
}
|
|
}
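
#if defined(__SSE2__)
// Hedged self test sketch, not part of the build: run the scalar reference
// core and the linear SIMD core above on the same 32 word state and compare.
// N = 1024 is only an assumed typical value; the function and buffer names
// here are hypothetical.
static inline int example_scrypt_selftest( const uint32_t *seed )   // 32 words
{
   const uint32_t N = 1024;
   uint32_t Xref [32] __attribute__((aligned(64)));   // aligned for the SIMD core's vector loads
   uint32_t Xsimd[32] __attribute__((aligned(64)));
   uint32_t *V = (uint32_t*)_mm_malloc( 32 * N * sizeof(uint32_t), 64 );
   int ok;

   memcpy( Xref,  seed, 128 );
   memcpy( Xsimd, seed, 128 );

   scrypt_core_1way( Xref, V, N );        // reference
   scrypt_core_simd128( Xsimd, V, N );    // serial SIMD, should match exactly

   ok = ( memcmp( Xref, Xsimd, 128 ) == 0 );
   _mm_free( V );
   return ok;
}
#endif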