/*
 * Copyright 2009 Colin Percival, 2014 savale
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * This file was originally written by Colin Percival as part of the Tarsnap
 * online backup system.
 */

// The original system header names were stripped from this file; stdint.h,
// stddef.h and string.h cover the integer and size types used below.
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include "blake2b-hash-4way.h"

#if defined(__AVX2__)

// Message word schedule permutations for the 12 rounds of BLAKE2b.
static const uint8_t sigma[12][16] =
{
   {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
   { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
   { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
   {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
   {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
   {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
   { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
   { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
   {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
   { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
   {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
   { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 }
};

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

// G mixing function, 8 lanes in parallel.
#define B2B8W_G(a, b, c, d, x, y) \
{ \
   v[a] = _mm512_add_epi64( _mm512_add_epi64( v[a], v[b] ), x ); \
   v[d] = mm512_ror_64( _mm512_xor_si512( v[d], v[a] ), 32 ); \
   v[c] = _mm512_add_epi64( v[c], v[d] ); \
   v[b] = mm512_ror_64( _mm512_xor_si512( v[b], v[c] ), 24 ); \
   v[a] = _mm512_add_epi64( _mm512_add_epi64( v[a], v[b] ), y ); \
   v[d] = mm512_ror_64( _mm512_xor_si512( v[d], v[a] ), 16 ); \
   v[c] = _mm512_add_epi64( v[c], v[d] ); \
   v[b] = mm512_ror_64( _mm512_xor_si512( v[b], v[c] ), 63 ); \
}

static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
{
   __m512i v[16], m[16];

   v[ 0] = ctx->h[0];
   v[ 1] = ctx->h[1];
   v[ 2] = ctx->h[2];
   v[ 3] = ctx->h[3];
   v[ 4] = ctx->h[4];
   v[ 5] = ctx->h[5];
   v[ 6] = ctx->h[6];
   v[ 7] = ctx->h[7];
   v[ 8] = m512_const1_64( 0x6A09E667F3BCC908 );
   v[ 9] = m512_const1_64( 0xBB67AE8584CAA73B );
   v[10] = m512_const1_64( 0x3C6EF372FE94F82B );
   v[11] = m512_const1_64( 0xA54FF53A5F1D36F1 );
   v[12] = m512_const1_64( 0x510E527FADE682D1 );
   v[13] = m512_const1_64( 0x9B05688C2B3E6C1F );
   v[14] = m512_const1_64( 0x1F83D9ABFB41BD6B );
   v[15] = m512_const1_64( 0x5BE0CD19137E2179 );

   v[12] = _mm512_xor_si512( v[12], _mm512_set1_epi64( ctx->t[0] ) );
   v[13] = _mm512_xor_si512( v[13], _mm512_set1_epi64( ctx->t[1] ) );
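
   // The per-lane 128-bit byte counter (t[0] low, t[1] high) is folded
   // into v[12] and v[13] above; on the final block, v[14] is inverted
   // as the last-block flag, per the BLAKE2b specification.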
   if ( last )
      v[14] = mm512_not( v[14] );

   m[ 0] = ctx->b[ 0];
   m[ 1] = ctx->b[ 1];
   m[ 2] = ctx->b[ 2];
   m[ 3] = ctx->b[ 3];
   m[ 4] = ctx->b[ 4];
   m[ 5] = ctx->b[ 5];
   m[ 6] = ctx->b[ 6];
   m[ 7] = ctx->b[ 7];
   m[ 8] = ctx->b[ 8];
   m[ 9] = ctx->b[ 9];
   m[10] = ctx->b[10];
   m[11] = ctx->b[11];
   m[12] = ctx->b[12];
   m[13] = ctx->b[13];
   m[14] = ctx->b[14];
   m[15] = ctx->b[15];

   for ( int i = 0; i < 12; i++ )
   {
      B2B8W_G( 0, 4,  8, 12, m[ sigma[i][ 0] ], m[ sigma[i][ 1] ] );
      B2B8W_G( 1, 5,  9, 13, m[ sigma[i][ 2] ], m[ sigma[i][ 3] ] );
      B2B8W_G( 2, 6, 10, 14, m[ sigma[i][ 4] ], m[ sigma[i][ 5] ] );
      B2B8W_G( 3, 7, 11, 15, m[ sigma[i][ 6] ], m[ sigma[i][ 7] ] );
      B2B8W_G( 0, 5, 10, 15, m[ sigma[i][ 8] ], m[ sigma[i][ 9] ] );
      B2B8W_G( 1, 6, 11, 12, m[ sigma[i][10] ], m[ sigma[i][11] ] );
      B2B8W_G( 2, 7,  8, 13, m[ sigma[i][12] ], m[ sigma[i][13] ] );
      B2B8W_G( 3, 4,  9, 14, m[ sigma[i][14] ], m[ sigma[i][15] ] );
   }

   ctx->h[0] = mm512_xor3( ctx->h[0], v[0], v[ 8] );
   ctx->h[1] = mm512_xor3( ctx->h[1], v[1], v[ 9] );
   ctx->h[2] = mm512_xor3( ctx->h[2], v[2], v[10] );
   ctx->h[3] = mm512_xor3( ctx->h[3], v[3], v[11] );
   ctx->h[4] = mm512_xor3( ctx->h[4], v[4], v[12] );
   ctx->h[5] = mm512_xor3( ctx->h[5], v[5], v[13] );
   ctx->h[6] = mm512_xor3( ctx->h[6], v[6], v[14] );
   ctx->h[7] = mm512_xor3( ctx->h[7], v[7], v[15] );
}

int blake2b_8way_init( blake2b_8way_ctx *ctx )
{
   size_t i;

   ctx->h[0] = m512_const1_64( 0x6A09E667F3BCC908 );
   ctx->h[1] = m512_const1_64( 0xBB67AE8584CAA73B );
   ctx->h[2] = m512_const1_64( 0x3C6EF372FE94F82B );
   ctx->h[3] = m512_const1_64( 0xA54FF53A5F1D36F1 );
   ctx->h[4] = m512_const1_64( 0x510E527FADE682D1 );
   ctx->h[5] = m512_const1_64( 0x9B05688C2B3E6C1F );
   ctx->h[6] = m512_const1_64( 0x1F83D9ABFB41BD6B );
   ctx->h[7] = m512_const1_64( 0x5BE0CD19137E2179 );

   // Parameter block: 0x01010000 ^ key length (0) ^ output length (32).
   ctx->h[0] = _mm512_xor_si512( ctx->h[0], m512_const1_64( 0x01010020 ) );

   ctx->t[0] = 0;
   ctx->t[1] = 0;
   ctx->c = 0;
   ctx->outlen = 32;

   for ( i = 0; i < 16; i++ )
      ctx->b[i] = m512_zero;

   return 0;
}

void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input,
                          size_t inlen )
{
   const __m512i *in = (const __m512i*)input;
   size_t i, c;
   c = ctx->c >> 3;

   for ( i = 0; i < (inlen >> 3); i++ )
   {
      if ( ctx->c == 128 )
      {
         ctx->t[0] += ctx->c;
         if ( ctx->t[0] < ctx->c )
            ctx->t[1]++;
         blake2b_8way_compress( ctx, 0 );
         ctx->c = 0;
         c = 0;   // restart the buffer index along with the byte count
      }
      ctx->b[ c++ ] = in[i];
      ctx->c += 8;
   }
}

void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
{
   size_t c;
   c = ctx->c >> 3;

   ctx->t[0] += ctx->c;
   if ( ctx->t[0] < ctx->c )
      ctx->t[1]++;

   // Zero pad the last block.
   while ( ctx->c < 128 )
   {
      ctx->b[c++] = m512_zero;
      ctx->c += 8;
   }

   blake2b_8way_compress( ctx, 1 );   // final block flag = 1

   casti_m512i( out, 0 ) = ctx->h[0];
   casti_m512i( out, 1 ) = ctx->h[1];
   casti_m512i( out, 2 ) = ctx->h[2];
   casti_m512i( out, 3 ) = ctx->h[3];
}

#endif // AVX512

// AVX2

// G Mixing function, 4 lanes in parallel.
#define B2B_G(a, b, c, d, x, y) \
{ \
   v[a] = _mm256_add_epi64( _mm256_add_epi64( v[a], v[b] ), x ); \
   v[d] = mm256_ror_64( _mm256_xor_si256( v[d], v[a] ), 32 ); \
   v[c] = _mm256_add_epi64( v[c], v[d] ); \
   v[b] = mm256_ror_64( _mm256_xor_si256( v[b], v[c] ), 24 ); \
   v[a] = _mm256_add_epi64( _mm256_add_epi64( v[a], v[b] ), y ); \
   v[d] = mm256_ror_64( _mm256_xor_si256( v[d], v[a] ), 16 ); \
   v[c] = _mm256_add_epi64( v[c], v[d] ); \
   v[b] = mm256_ror_64( _mm256_xor_si256( v[b], v[c] ), 63 ); \
}
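
// For reference, a minimal scalar sketch of the same G step on plain
// uint64_t words (illustrative only, not part of this file's API;
// rotr64 is a hypothetical helper shown inline):
/*
static inline uint64_t rotr64( uint64_t x, int r )
{
   return ( x >> r ) | ( x << ( 64 - r ) );
}

static void b2b_g_scalar( uint64_t v[16], int a, int b, int c, int d,
                          uint64_t x, uint64_t y )
{
   v[a] = v[a] + v[b] + x;
   v[d] = rotr64( v[d] ^ v[a], 32 );
   v[c] = v[c] + v[d];
   v[b] = rotr64( v[b] ^ v[c], 24 );
   v[a] = v[a] + v[b] + y;
   v[d] = rotr64( v[d] ^ v[a], 16 );
   v[c] = v[c] + v[d];
   v[b] = rotr64( v[b] ^ v[c], 63 );
}
*/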
// Initialization Vector.
/*
static const uint64_t blake2b_iv[8] =
{
   0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
   0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
   0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
   0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
};
*/

static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
{
   __m256i v[16], m[16];

   v[ 0] = ctx->h[0];
   v[ 1] = ctx->h[1];
   v[ 2] = ctx->h[2];
   v[ 3] = ctx->h[3];
   v[ 4] = ctx->h[4];
   v[ 5] = ctx->h[5];
   v[ 6] = ctx->h[6];
   v[ 7] = ctx->h[7];
   v[ 8] = m256_const1_64( 0x6A09E667F3BCC908 );
   v[ 9] = m256_const1_64( 0xBB67AE8584CAA73B );
   v[10] = m256_const1_64( 0x3C6EF372FE94F82B );
   v[11] = m256_const1_64( 0xA54FF53A5F1D36F1 );
   v[12] = m256_const1_64( 0x510E527FADE682D1 );
   v[13] = m256_const1_64( 0x9B05688C2B3E6C1F );
   v[14] = m256_const1_64( 0x1F83D9ABFB41BD6B );
   v[15] = m256_const1_64( 0x5BE0CD19137E2179 );

   v[12] = _mm256_xor_si256( v[12], _mm256_set1_epi64x( ctx->t[0] ) );
   v[13] = _mm256_xor_si256( v[13], _mm256_set1_epi64x( ctx->t[1] ) );

   if ( last )
      v[14] = mm256_not( v[14] );

   m[ 0] = ctx->b[ 0];
   m[ 1] = ctx->b[ 1];
   m[ 2] = ctx->b[ 2];
   m[ 3] = ctx->b[ 3];
   m[ 4] = ctx->b[ 4];
   m[ 5] = ctx->b[ 5];
   m[ 6] = ctx->b[ 6];
   m[ 7] = ctx->b[ 7];
   m[ 8] = ctx->b[ 8];
   m[ 9] = ctx->b[ 9];
   m[10] = ctx->b[10];
   m[11] = ctx->b[11];
   m[12] = ctx->b[12];
   m[13] = ctx->b[13];
   m[14] = ctx->b[14];
   m[15] = ctx->b[15];

   for ( int i = 0; i < 12; i++ )
   {
      B2B_G( 0, 4,  8, 12, m[ sigma[i][ 0] ], m[ sigma[i][ 1] ] );
      B2B_G( 1, 5,  9, 13, m[ sigma[i][ 2] ], m[ sigma[i][ 3] ] );
      B2B_G( 2, 6, 10, 14, m[ sigma[i][ 4] ], m[ sigma[i][ 5] ] );
      B2B_G( 3, 7, 11, 15, m[ sigma[i][ 6] ], m[ sigma[i][ 7] ] );
      B2B_G( 0, 5, 10, 15, m[ sigma[i][ 8] ], m[ sigma[i][ 9] ] );
      B2B_G( 1, 6, 11, 12, m[ sigma[i][10] ], m[ sigma[i][11] ] );
      B2B_G( 2, 7,  8, 13, m[ sigma[i][12] ], m[ sigma[i][13] ] );
      B2B_G( 3, 4,  9, 14, m[ sigma[i][14] ], m[ sigma[i][15] ] );
   }

   ctx->h[0] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[0], v[0] ), v[ 8] );
   ctx->h[1] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[1], v[1] ), v[ 9] );
   ctx->h[2] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[2], v[2] ), v[10] );
   ctx->h[3] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[3], v[3] ), v[11] );
   ctx->h[4] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[4], v[4] ), v[12] );
   ctx->h[5] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[5], v[5] ), v[13] );
   ctx->h[6] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[6], v[6] ), v[14] );
   ctx->h[7] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[7], v[7] ), v[15] );
}

int blake2b_4way_init( blake2b_4way_ctx *ctx )
{
   size_t i;

   ctx->h[0] = m256_const1_64( 0x6A09E667F3BCC908 );
   ctx->h[1] = m256_const1_64( 0xBB67AE8584CAA73B );
   ctx->h[2] = m256_const1_64( 0x3C6EF372FE94F82B );
   ctx->h[3] = m256_const1_64( 0xA54FF53A5F1D36F1 );
   ctx->h[4] = m256_const1_64( 0x510E527FADE682D1 );
   ctx->h[5] = m256_const1_64( 0x9B05688C2B3E6C1F );
   ctx->h[6] = m256_const1_64( 0x1F83D9ABFB41BD6B );
   ctx->h[7] = m256_const1_64( 0x5BE0CD19137E2179 );

   // Parameter block: 0x01010000 ^ key length (0) ^ output length (32).
   ctx->h[0] = _mm256_xor_si256( ctx->h[0], m256_const1_64( 0x01010020 ) );

   ctx->t[0] = 0;
   ctx->t[1] = 0;
   ctx->c = 0;
   ctx->outlen = 32;

   for ( i = 0; i < 16; i++ )
      ctx->b[i] = m256_zero;

   return 0;
}
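
// Note on the streaming interface below: update copies whole 8-byte lane
// words, so inlen counts bytes per lane and is expected to be a multiple
// of 8, with all lanes advancing in lockstep.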
void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
                          size_t inlen )
{
   const __m256i *in = (const __m256i*)input;
   size_t i, c;
   c = ctx->c >> 3;

   for ( i = 0; i < (inlen >> 3); i++ )
   {
      if ( ctx->c == 128 )
      {
         ctx->t[0] += ctx->c;
         if ( ctx->t[0] < ctx->c )
            ctx->t[1]++;
         blake2b_4way_compress( ctx, 0 );
         ctx->c = 0;
         c = 0;   // restart the buffer index along with the byte count
      }
      ctx->b[ c++ ] = in[i];
      ctx->c += 8;
   }
}

void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
{
   size_t c;
   c = ctx->c >> 3;

   ctx->t[0] += ctx->c;
   if ( ctx->t[0] < ctx->c )
      ctx->t[1]++;

   // Zero pad the last block.
   while ( ctx->c < 128 )
   {
      ctx->b[c++] = m256_zero;
      ctx->c += 8;
   }

   blake2b_4way_compress( ctx, 1 );   // final block flag = 1

   casti_m256i( out, 0 ) = ctx->h[0];
   casti_m256i( out, 1 ) = ctx->h[1];
   casti_m256i( out, 2 ) = ctx->h[2];
   casti_m256i( out, 3 ) = ctx->h[3];
}

#endif // AVX2
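
// Usage sketch (4-way AVX2 path): hash four independent 80-byte inputs,
// one per 64-bit lane, pre-interleaved into 4-way vector form. The
// interleaving step is outside this file; vdata and hash are
// illustrative names, not part of the API.
/*
   blake2b_4way_ctx ctx;
   __m256i vdata[10];   // 4 interleaved 80-byte messages (10 qwords each)
   __m256i hash[4];     // 4 interleaved 32-byte digests

   blake2b_4way_init( &ctx );
   blake2b_4way_update( &ctx, vdata, 80 );
   blake2b_4way_final( &ctx, hash );
*/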