// convert blake256 32 bit to use 64 bit with serial vectoring
//
// cut calls to GS in half
//
// combine V
// v0 = {V0,V1}
// v1 = {V2,V3}
// v2 = {V4,V5}
// v3 = {V6,V7}
// v4 = {V8,V9}
// v5 = {VA,VB}
// v6 = {VC,VD}
// v7 = {VE,VF}
//
// v6x = {VD,VC}   swap(VC,VD)   swap(v6)
// v7x = {VF,VE}   swap(VE,VF)   swap(v7)
//
// V0 = v1v0
// V1 = v3v2
// V2 = v5v4
// V3 = v7v6
// V4 = v9v8
// V5 = vbva
// V6 = vdvc
// V7 = vfve
//
// The rotate in ROUND is to effect the straddle and unstraddle for the
// third and fourth calls to GS.
// It concatenates 2 contiguous 256 bit vectors and extracts the middle
// 256 bits. After the transform they must be restored with only the
// chosen bits modified in the original 2 vectors.
// ror1x128 achieves this by putting the chosen bits in arg1, the "low"
// 256 bit vector, and saving the untouched bits temporarily in arg0, the
// "high" 256 bit vector. Simply reverse the process to restore the data
// to its original positions.
//
// Use standard 4way when AVX2 is not available; use x2 mode when it is.
//
// Data is organised the same as 32 bit 4way, in effect serial vectoring
// on top of parallel vectoring: same data in the same place, just taking
// two chunks at a time.
//
// Transparent to the user, x2 mode used when AVX2 detected.
// Use the existing 4way context but revert to scalar types.
// Same interleave function (128 bit) or x2 with 256 bit? See the layout
// sketch below. User transparency would have to apply to interleave as
// well.
//
// Use common 4way update and close.
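
// Re the interleave question above: assuming the usual 4x32 interleave,
// word i of all four lanes is contiguous in memory, so one 256 bit load
// picks up words 2k and 2k+1 of every lane at once, which is exactly the
// {odd:even} pairing used by the x2 scheme. Under that assumption no new
// interleave function is needed. Illustrative sketch only; the helper
// name below is hypothetical and not part of this file.
static inline __m256i load_x2_word_pair_example( const uint32_t *interleaved,
                                                 int k )
{
   // interleaved[ 4*i + lane ] holds word i of 'lane'.
   // Low 128 bits:  word 2k   of lanes 0-3.
   // High 128 bits: word 2k+1 of lanes 0-3.
   return _mm256_loadu_si256( (const __m256i*)( interleaved + 8*k ) );
}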
// int blen = len<<2; if ( blen < (sizeof ctx->buf) - bptr ) { memcpy( buf + vptr, data, blen ); ptr += blen; ctx->ptr = bptr >> 2;; return; } READ_STATE32( ctx ); while ( blen > 0 ) { size_t clen; clen = ( sizeof sc->buf ) - ptr; if ( clen > blen ) clen = blen; memcpy( buf + vptr, data, clen ); bptr += clen; vptr = bptr >> 5; data = (const unsigned char *)data + clen; blen -= clen; if ( bptr == sizeof ctx->buf ) { if ( ( T0 = T0 + 512 ) < 512 ) // not needed, will never rollover T1 += 1; COMPRESS32x2_4WAY( ctx->rounds ); ptr = 0; } } WRITE_STATE32x2( ctx ); ctx->ptr = bptr >> 2; } static void blake32x2_4way_close( blake_4way_small_context *ctx, void *dst ) { __m256i buf[8] __attribute__ ((aligned (64))); size_t ptr = ctx->ptr; size_t vptr = ctx->ptr>>2; unsigned bit_len = ( (unsigned)ptr << 3 ); // one lane uint32_t th = ctx->T1; uint32_t tl = ctx->T0 + bit_len; if ( ptr == 0 ) { ctx->T0 = 0xFFFFFE00UL; ctx->T1 = 0xFFFFFFFFUL; } else if ( ctx->T0 == 0 ) { ctx->T0 = 0xFFFFFE00UL + bit_len; ctx->T1 -= 1; } else ctx->T0 -= 512 - bit_len; // memset doesn't do ints buf[ vptr ] = _mm256_set_epi32( 0,0,0,0, 0x80, 0x80, 0x80, 0x80 ); if ( vptr < 5 ) { memset_zero_256( buf + vptr + 1, 6 - vptr ); buf[ 6 ] = _mm256_or_si256( vbuf[ 6 ], _mm256_set_epi32( 0x01000000UL,0x01000000UL,0x01000000UL,0x01000000UL, 0,0,0,0 ) ); buf[ 7 ] = mm256_bswap_32( _mm256_set_epi32( tl,tl,tl,tl, th,th,th,th ) ); blake32x2_4way( ctx, buf + vptr, 64 - ptr ); } else { memset_zero_256( vbuf + vptr + 1, 7 - vptr ); blake32x2_4way( ctx, vbuf + ptr, 64 - ptr ); ctx->T0 = 0xFFFFFE00UL; ctx->T1 = 0xFFFFFFFFUL; buf[ 6 ] = mm256_zero; buf[ 6 ] = _mm256_set_epi32( 0,0,0,0, 0x01000000UL,0x01000000UL,0x01000000UL,0x01000000UL ); buf[ 7 ] = mm256_bswap_32( _mm256_set_epi32( tl, tl, tl, tl, th, th, th, th ); blake32x2_4way( ctx, buf, 64 ); } casti_m256i( dst, 0 ) = mm256_bswap_32( casti_m256i( ctx->H, 0 ) ); casti_m256i( dst, 1 ) = mm256_bswap_32( casti_m256i( ctx->H, 1 ) ); casti_m256i( dst, 2 ) = mm256_bswap_32( casti_m256i( ctx->H, 2 ) ); casti_m256i( dst, 3 ) = mm256_bswap_32( casti_m256i( ctx->H, 3 ) ); } #define DECL_STATE32x2_4WAY \ __m256i H0, H1, H2, H3; \ __m256i S0, S1; \ uint32_t T0, T1; #define READ_STATE32x2_4WAY(state) do \ { \ H0 = casti_m256i( state->H, 0 ); \ H1 = casti_m256i( state->H, 1 ); \ H2 = casti_m256i( state->H, 2 ); \ H3 = casti_m256i( state->H, 3 ); \ S0 = casti_m256i( state->S, 0 ); \ S1 = casti_m256i( state->S, 1 ); \ T0 = state->T0; \ T1 = state->T1; \ #define WRITE_STATE32x2_4WAY(state) do { \ casti_m256i( state->H, 0 ) = H0; \ casti_m256i( state->H, 1 ) = H1; \ casti_m256i( state->H, 2 ) = H2; \ casti_m256i( state->H, 3 ) = H3; \ casti_m256i( state->S, 0 ) = S0; \ casti_m256i( state->S, 1 ) = S1; \ state->T0 = T0; \ state->T1 = T1; \ } while (0) #define GSx2_4WAY( m0m2, m1m3, c0c2, c1c3, a, b, c, d ) do \ { \ a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \ _mm256_set_epi32( c1,c3, c1,c3, c1,c3, c1,c3 ), \ _mm256_set_epi32( m0,m2, m0,m2, m0,m2, m0,m2 ) ), b ), a ); \ d = mm256_ror_32( _mm_xor_si128( d, a ), 16 ); \ c = _mm256_add_epi32( c, d ); \ b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \ a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \ _mm256_set_epi32( c0,c2, c0,c2, c0,c2, c0,c2 ), \ _mm256_set_epi32( m1,m3, m1,m3, m1,m3, m1,m3 ) ), b ), a ); \ d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \ c = _mm256_add_epi32( c, d ); \ b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \ } while (0) #define ROUND_Sx2_4WAY(r) do \ { \ GS2_4WAY( Mx(r, 0), Mx(r, 1), Mx(r, 2), Mx(r, 

#define DECL_STATE32x2_4WAY \
   __m256i H0, H1, H2, H3; \
   __m256i S0, S1; \
   uint32_t T0, T1;

#define READ_STATE32x2_4WAY(state) do \
{ \
   H0 = casti_m256i( state->H, 0 ); \
   H1 = casti_m256i( state->H, 1 ); \
   H2 = casti_m256i( state->H, 2 ); \
   H3 = casti_m256i( state->H, 3 ); \
   S0 = casti_m256i( state->S, 0 ); \
   S1 = casti_m256i( state->S, 1 ); \
   T0 = state->T0; \
   T1 = state->T1; \
} while (0)

#define WRITE_STATE32x2_4WAY(state) do \
{ \
   casti_m256i( state->H, 0 ) = H0; \
   casti_m256i( state->H, 1 ) = H1; \
   casti_m256i( state->H, 2 ) = H2; \
   casti_m256i( state->H, 3 ) = H3; \
   casti_m256i( state->S, 0 ) = S0; \
   casti_m256i( state->S, 1 ) = S1; \
   state->T0 = T0; \
   state->T1 = T1; \
} while (0)

// One GSx2 call does two GS columns (or diagonals): the low 128 bit lane
// carries the first, the high 128 bit lane carries the second.
#define GSx2_4WAY( m0, m1, m2, m3, c0, c1, c2, c3, a, b, c, d ) do \
{ \
   a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
              _mm256_set_epi32( c3, c3, c3, c3, c1, c1, c1, c1 ), \
              _mm256_set_epi32( m2, m2, m2, m2, m0, m0, m0, m0 ) ), b ), a ); \
   d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \
   c = _mm256_add_epi32( c, d ); \
   b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
   a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
              _mm256_set_epi32( c2, c2, c2, c2, c0, c0, c0, c0 ), \
              _mm256_set_epi32( m3, m3, m3, m3, m1, m1, m1, m1 ) ), b ), a ); \
   d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \
   c = _mm256_add_epi32( c, d ); \
   b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
} while (0)

#define ROUND_Sx2_4WAY(r) do \
{ \
   GSx2_4WAY( Mx(r, 0), Mx(r, 1), Mx(r, 2), Mx(r, 3), \
              CSx(r, 0), CSx(r, 1), CSx(r, 2), CSx(r, 3), V0, V2, V4, V6 ); \
   GSx2_4WAY( Mx(r, 4), Mx(r, 5), Mx(r, 6), Mx(r, 7), \
              CSx(r, 4), CSx(r, 5), CSx(r, 6), CSx(r, 7), V1, V3, V5, V7 ); \
   mm256_ror1x128_512( V3, V2 ); \
   mm256_ror1x128_512( V6, V7 ); \
   GSx2_4WAY( Mx(r, 8), Mx(r, 9), Mx(r, A), Mx(r, B), \
              CSx(r, 8), CSx(r, 9), CSx(r, A), CSx(r, B), V0, V2, V5, V7 ); \
   GSx2_4WAY( Mx(r, C), Mx(r, D), Mx(r, E), Mx(r, F), \
              CSx(r, C), CSx(r, D), CSx(r, E), CSx(r, F), V1, V3, V4, V6 ); \
   mm256_rol1x128_512( V2, V3 ); \
   mm256_rol1x128_512( V7, V6 ); \
} while (0)

#define COMPRESS32x2_4WAY( rounds ) do \
{ \
   __m256i M0, M1, M2, M3, M4, M5, M6, M7; \
   __m256i V0, V1, V2, V3, V4, V5, V6, V7; \
   V0 = H0; \
   V1 = H1; \
   V2 = H2; \
   V3 = H3; \
   V4 = _mm256_xor_si256( S0, _mm256_set_epi32( CS1, CS1, CS1, CS1, \
                                                CS0, CS0, CS0, CS0 ) ); \
   V5 = _mm256_xor_si256( S1, _mm256_set_epi32( CS3, CS3, CS3, CS3, \
                                                CS2, CS2, CS2, CS2 ) ); \
   V6 = _mm256_xor_si256( _mm256_set1_epi32( T0 ), \
                          _mm256_set_epi32( CS5, CS5, CS5, CS5, \
                                            CS4, CS4, CS4, CS4 ) ); \
   V7 = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \
                          _mm256_set_epi32( CS7, CS7, CS7, CS7, \
                                            CS6, CS6, CS6, CS6 ) ); \
   M0 = mm256_bswap_32( buf[ 0] ); \
   M1 = mm256_bswap_32( buf[ 1] ); \
   M2 = mm256_bswap_32( buf[ 2] ); \
   M3 = mm256_bswap_32( buf[ 3] ); \
   M4 = mm256_bswap_32( buf[ 4] ); \
   M5 = mm256_bswap_32( buf[ 5] ); \
   M6 = mm256_bswap_32( buf[ 6] ); \
   M7 = mm256_bswap_32( buf[ 7] ); \
   ROUND_Sx2_4WAY(0); \
   ROUND_Sx2_4WAY(1); \
   ROUND_Sx2_4WAY(2); \
   ROUND_Sx2_4WAY(3); \
   ROUND_Sx2_4WAY(4); \
   ROUND_Sx2_4WAY(5); \
   ROUND_Sx2_4WAY(6); \
   ROUND_Sx2_4WAY(7); \
   if (rounds == 14) \
   { \
      ROUND_Sx2_4WAY(8); \
      ROUND_Sx2_4WAY(9); \
      ROUND_Sx2_4WAY(0); \
      ROUND_Sx2_4WAY(1); \
      ROUND_Sx2_4WAY(2); \
      ROUND_Sx2_4WAY(3); \
   } \
   /* feedforward: with the packed layout v8..vf live in V4..V7 and the */ \
   /* two salt vectors repeat as S0, S1.                                */ \
   H0 = _mm256_xor_si256( _mm256_xor_si256( \
                 _mm256_xor_si256( V4, V0 ), S0 ), H0 ); \
   H1 = _mm256_xor_si256( _mm256_xor_si256( \
                 _mm256_xor_si256( V5, V1 ), S1 ), H1 ); \
   H2 = _mm256_xor_si256( _mm256_xor_si256( \
                 _mm256_xor_si256( V6, V2 ), S0 ), H2 ); \
   H3 = _mm256_xor_si256( _mm256_xor_si256( \
                 _mm256_xor_si256( V7, V3 ), S1 ), H3 ); \
} while (0)
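
// The straddle rotate used by ROUND_Sx2_4WAY is not defined in this file.
// The sketch below (hypothetical name, assuming the AVX2 intrinsic
// _mm256_permute2x128_si256) shows one way mm256_ror1x128_512 could work:
// rotate the 512 bit concatenation hi:lo right by one 128 bit lane, leaving
// the two middle lanes in lo (arg1) and parking the two outer lanes in hi
// (arg0), exactly as described in the header comment. The matching rol
// variant applies the inverse permutation to restore the lanes.
#define mm256_ror1x128_512_sketch( hi, lo ) \
do { \
   __m256i t = _mm256_permute2x128_si256( hi, lo, 0x21 ); \
   lo = _mm256_permute2x128_si256( lo, hi, 0x21 ); \
   hi = t; \
} while (0)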