/*
 * Copyright 2009 Colin Percival, 2014 savale
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * This file was originally written by Colin Percival as part of the Tarsnap
 * online backup system.
 */

#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "simd-utils.h"
#include "compat/sph_types.h"
#include "sph_blake2b.h"

// Little-endian byte access.
#define B2B_GET64(p)                            \
    (((uint64_t) ((uint8_t *) (p))[0]) ^        \
    (((uint64_t) ((uint8_t *) (p))[1]) << 8) ^  \
    (((uint64_t) ((uint8_t *) (p))[2]) << 16) ^ \
    (((uint64_t) ((uint8_t *) (p))[3]) << 24) ^ \
    (((uint64_t) ((uint8_t *) (p))[4]) << 32) ^ \
    (((uint64_t) ((uint8_t *) (p))[5]) << 40) ^ \
    (((uint64_t) ((uint8_t *) (p))[6]) << 48) ^ \
    (((uint64_t) ((uint8_t *) (p))[7]) << 56))

#if defined(__AVX2__)

#define BLAKE2B_G( Sa, Sb, Sc, Sd, Se, Sf, Sg, Sh ) \
{ \
   V[0] = _mm256_add_epi64( V[0], _mm256_add_epi64( V[1], \
               _mm256_set_epi64x( m[ sigmaR[ Sg ] ], m[ sigmaR[ Se ] ], \
                                  m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
   V[3] = mm256_ror_64( _mm256_xor_si256( V[3], V[0] ), 32 ); \
   V[2] = _mm256_add_epi64( V[2], V[3] ); \
   V[1] = mm256_ror_64( _mm256_xor_si256( V[1], V[2] ), 24 ); \
\
   V[0] = _mm256_add_epi64( V[0], _mm256_add_epi64( V[1], \
               _mm256_set_epi64x( m[ sigmaR[ Sh ] ], m[ sigmaR[ Sf ] ], \
                                  m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
   V[3] = mm256_ror_64( _mm256_xor_si256( V[3], V[0] ), 16 ); \
   V[2] = _mm256_add_epi64( V[2], V[3] ); \
   V[1] = mm256_ror_64( _mm256_xor_si256( V[1], V[2] ), 63 ); \
}
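// The macro above runs four of the BLAKE2b G mixing functions at once, one
// per 64-bit lane of the 4x4 state held in V[0..3]; the rotation amounts
// 32, 24, 16 and 63 are the constants fixed by the BLAKE2b specification.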
// Pivot about V[1] instead of V[0] reduces latency.
#define BLAKE2B_ROUND( R ) \
{ \
   __m256i *V = (__m256i*)v; \
   const uint8_t *sigmaR = sigma[R]; \
   BLAKE2B_G( 0, 1, 2, 3, 4, 5, 6, 7 ); \
   V[0] = mm256_shufll_64( V[0] ); \
   V[3] = mm256_swap_128( V[3] ); \
   V[2] = mm256_shuflr_64( V[2] ); \
   BLAKE2B_G( 14, 15, 8, 9, 10, 11, 12, 13 ); \
   V[0] = mm256_shuflr_64( V[0] ); \
   V[3] = mm256_swap_128( V[3] ); \
   V[2] = mm256_shufll_64( V[2] ); \
}

/*
#define BLAKE2B_ROUND( R ) \
{ \
   __m256i *V = (__m256i*)v; \
   const uint8_t *sigmaR = sigma[R]; \
   BLAKE2B_G( 0, 1, 2, 3, 4, 5, 6, 7 ); \
   V[3] = mm256_shufll_64( V[3] ); \
   V[2] = mm256_swap_128( V[2] ); \
   V[1] = mm256_shuflr_64( V[1] ); \
   BLAKE2B_G( 8, 9, 10, 11, 12, 13, 14, 15 ); \
   V[3] = mm256_shuflr_64( V[3] ); \
   V[2] = mm256_swap_128( V[2] ); \
   V[1] = mm256_shufll_64( V[1] ); \
}
*/

#elif defined(__SSE2__) || defined(__ARM_NEON)

#define BLAKE2B_G( Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \
{ \
   Va = v128_add64( Va, v128_add64( Vb, \
                    v128_set64( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
   Vd = v128_ror64( v128_xor( Vd, Va ), 32 ); \
   Vc = v128_add64( Vc, Vd ); \
   Vb = v128_ror64( v128_xor( Vb, Vc ), 24 ); \
\
   Va = v128_add64( Va, v128_add64( Vb, \
                    v128_set64( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
   Vd = v128_ror64( v128_xor( Vd, Va ), 16 ); \
   Vc = v128_add64( Vc, Vd ); \
   Vb = v128_ror64( v128_xor( Vb, Vc ), 63 ); \
}

#define BLAKE2B_ROUND( R ) \
{ \
   v128_t *V = (v128_t*)v; \
   v128_t V2, V3, V6, V7; \
   const uint8_t *sigmaR = sigma[R]; \
   BLAKE2B_G( V[0], V[2], V[4], V[6],  0,  1,  2,  3 ); \
   BLAKE2B_G( V[1], V[3], V[5], V[7],  4,  5,  6,  7 ); \
   V2 = v128_alignr64( V[3], V[2], 1 ); \
   V3 = v128_alignr64( V[2], V[3], 1 ); \
   V6 = v128_alignr64( V[6], V[7], 1 ); \
   V7 = v128_alignr64( V[7], V[6], 1 ); \
   BLAKE2B_G( V[0], V2, V[5], V6,  8,  9, 10, 11 ); \
   BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \
   V[2] = v128_alignr64( V2, V3, 1 ); \
   V[3] = v128_alignr64( V3, V2, 1 ); \
   V[6] = v128_alignr64( V7, V6, 1 ); \
   V[7] = v128_alignr64( V6, V7, 1 ); \
}

#else

#ifndef ROTR64
#define ROTR64(x, y)  (((x) >> (y)) ^ ((x) << (64 - (y))))
#endif

#define BLAKE2B_G( R, Va, Vb, Vc, Vd, Sa, Sb ) \
{ \
   Va = Va + Vb + m[ sigma[R][Sa] ]; \
   Vd = ROTR64( Vd ^ Va, 32 ); \
   Vc = Vc + Vd; \
   Vb = ROTR64( Vb ^ Vc, 24 ); \
\
   Va = Va + Vb + m[ sigma[R][Sb] ]; \
   Vd = ROTR64( Vd ^ Va, 16 ); \
   Vc = Vc + Vd; \
   Vb = ROTR64( Vb ^ Vc, 63 ); \
}

#define BLAKE2B_ROUND( R ) \
{ \
   BLAKE2B_G( R, v[ 0], v[ 4], v[ 8], v[12],  0,  1 ); \
   BLAKE2B_G( R, v[ 1], v[ 5], v[ 9], v[13],  2,  3 ); \
   BLAKE2B_G( R, v[ 2], v[ 6], v[10], v[14],  4,  5 ); \
   BLAKE2B_G( R, v[ 3], v[ 7], v[11], v[15],  6,  7 ); \
   BLAKE2B_G( R, v[ 0], v[ 5], v[10], v[15],  8,  9 ); \
   BLAKE2B_G( R, v[ 1], v[ 6], v[11], v[12], 10, 11 ); \
   BLAKE2B_G( R, v[ 2], v[ 7], v[ 8], v[13], 12, 13 ); \
   BLAKE2B_G( R, v[ 3], v[ 4], v[ 9], v[14], 14, 15 ); \
}

#endif
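// All three paths above (AVX2, SSE2/NEON, scalar) compute the same round
// function. The SIMD variants keep the state in vector registers and use
// lane rotations (shufll/shuflr/swap, alignr) to move between the column
// and diagonal steps of each round; the scalar branch spells those two
// steps out directly and can be read as the reference for the vector code.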
// Initialization Vector.
static const uint64_t blake2b_iv[8] __attribute__ ((aligned (32))) =
{
   0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
   0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
   0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
   0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
};

static const uint8_t sigma[12][16] __attribute__ ((aligned (32))) =
{
   {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
   { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
   { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
   {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
   {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
   {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
   { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
   { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
   {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
   { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
   {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
   { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 }
};

// Compression function. "last" flag indicates last block.
static void blake2b_compress( sph_blake2b_ctx *ctx, int last )
{
   uint64_t v[16] __attribute__ ((aligned (32)));
   uint64_t m[16] __attribute__ ((aligned (32)));
   int i;

   for ( i = 0; i < 8; i++ )           // init work variables
   {
      v[i] = ctx->h[i];
      v[i + 8] = blake2b_iv[i];
   }

   v[12] ^= ctx->t[0];                 // low 64 bits of offset
   v[13] ^= ctx->t[1];                 // high 64 bits
   if ( last )                         // last block flag set ?
      v[14] = ~v[14];

   for ( i = 0; i < 16; i++ )          // get little-endian words
      m[i] = B2B_GET64( &ctx->b[8 * i] );

   for ( i = 0; i < 12; i++ )
      BLAKE2B_ROUND( i );

   for ( i = 0; i < 8; ++i )
      ctx->h[i] ^= v[i] ^ v[i + 8];
}

// Initialize the hashing context "ctx" with optional key "key".
// 1 <= outlen <= 64 gives the digest size in bytes.
// Secret key (also <= 64 bytes) is optional (keylen = 0).
int sph_blake2b_init( sph_blake2b_ctx *ctx, size_t outlen,
                      const void *key, size_t keylen )    // (keylen=0: no key)
{
   size_t i;

   if ( outlen == 0 || outlen > 64 || keylen > 64 )
      return -1;                       // illegal parameters

   for ( i = 0; i < 8; i++ )           // state, "param block"
      ctx->h[i] = blake2b_iv[i];
   ctx->h[0] ^= 0x01010000 ^ (keylen << 8) ^ outlen;

   ctx->t[0] = 0;                      // input count low word
   ctx->t[1] = 0;                      // input count high word
   ctx->c = 0;                         // pointer within buffer
   ctx->outlen = outlen;

   for ( i = keylen; i < 128; i++ )    // zero input block
      ctx->b[i] = 0;
   if ( keylen > 0 )
   {
      sph_blake2b_update( ctx, key, keylen );
      ctx->c = 128;                    // at the end
   }

   return 0;
}

// Add "inlen" bytes from "in" into the hash.
void sph_blake2b_update( sph_blake2b_ctx *ctx, const void *in, size_t inlen )
{
   size_t i;

   for ( i = 0; i < inlen; i++ )
   {
      if ( ctx->c == 128 )             // buffer full ?
      {
         ctx->t[0] += ctx->c;          // add counters
         if ( ctx->t[0] < ctx->c )     // carry overflow ?
            ctx->t[1]++;               // high word
         blake2b_compress( ctx, 0 );   // compress (not last)
         ctx->c = 0;                   // counter to zero
      }
      ctx->b[ctx->c++] = ((const uint8_t *) in)[i];
   }
}

// Generate the message digest (size given in init).
// Result placed in "out".
void sph_blake2b_final( sph_blake2b_ctx *ctx, void *out )
{
   size_t i;

   ctx->t[0] += ctx->c;                // mark last block offset
   if ( ctx->t[0] < ctx->c )           // carry overflow
      ctx->t[1]++;                     // high word

   while ( ctx->c < 128 )              // fill up with zeros
      ctx->b[ctx->c++] = 0;
   blake2b_compress( ctx, 1 );         // final block flag = 1

   // little endian convert and store
   for ( i = 0; i < ctx->outlen; i++ )
   {
      ((uint8_t *) out)[i] = ( ctx->h[i >> 3] >> (8 * (i & 7)) ) & 0xFF;
   }
}
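// Usage sketch (illustrative; not part of the original file). Computes the
// unkeyed 64-byte BLAKE2b-512 digest of "abc" and prints it in hex. The
// BLAKE2B_SELFTEST_EXAMPLE guard is a hypothetical name chosen here so the
// example never compiles into the library by default.
#ifdef BLAKE2B_SELFTEST_EXAMPLE
int main(void)
{
   sph_blake2b_ctx ctx;
   uint8_t digest[64];
   size_t i;

   // RFC 7693 Appendix A lists the expected digest for "abc",
   // which begins ba 80 a5 3f 98 1c 4d 0d ...
   if ( sph_blake2b_init( &ctx, 64, NULL, 0 ) != 0 )
      return 1;
   sph_blake2b_update( &ctx, "abc", 3 );
   sph_blake2b_final( &ctx, digest );

   for ( i = 0; i < 64; i++ )
      printf( "%02x", digest[i] );
   printf( "\n" );
   return 0;
}
#endif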