v3.21.3

2026-02-22 16:33:08 +00:00 · 2023-03-11 14:54:49 -05:00
parent fb93160641
commit b339450898
49 changed files with 1120 additions and 1119 deletions
--- a/algo/blake/sph-blake2s.c
+++ b/algo/blake/sph-blake2s.c
@@ -17,6 +17,7 @@

 #include "algo/sha/sph_types.h"
 #include "sph-blake2s.h"
+#include "simd-utils.h"

 static const uint32_t blake2s_IV[8] =
 {
@@ -225,6 +226,71 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES]
 	v[13] = S->t[1] ^ blake2s_IV[5];
 	v[14] = S->f[0] ^ blake2s_IV[6];
 	v[15] = S->f[1] ^ blake2s_IV[7];
+
+#if 0    
+//#if defined(__SSE2__) // always true
+
+The only application for this is to do a prehash for the blake2s algorithm.
+SSE2 also supports 4 way parallel hashing so that is preferred in most cases.
+Testing has found that using this serial SIMD code for prehash is slower than
+doing a parallel hash. A parallel hash has more instructions and uses more
+data. The serial hash uses fewer instructions and data and only needs to
+interleave the final hash into parallel streams. This has shown negligible
+improvement on other algos, notably blake256 which is almost identical.
+Considering the low frequency of prehash no statistically valid change
+was expected. It was simply better on paper.
+
+Furthermore, simply defining this macro has an additional negative effect on
+blake2s as a whole. There are no references to this macro, blake2s-4way does
+not include it in any header files, it's just another unused macro which should
+have no effect beyond the preprocessor. But just being visible to the compiler
+changes things in a dramatic way.
+
+These 2 things combined reduced the hash rate for blake2s by more than 5% when
+using serial SIMD for the blake2s prehash over 16way parallel prehash.
+16way parallel hashing was used in the high frequency nonce loop in both cases.
+Considering the prehash represents 50% of the algorithm and is done once vs
+the high frequency second half that is done mega, maybe giga, times more it's
+hard to imagine that big of an effect in either direction.
+
+#define ROUND( r ) /* one BLAKE2s round on v[], viewed as four __m128i rows V[0..3] */ \
+{ \
+   __m128i *V = (__m128i*)v; /* NOTE(review): assumes v is 16-byte aligned - confirm at its declaration */ \
+   const uint8_t *sigma = blake2s_sigma[r]; /* message word order for round r */ \
+   V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], /* first half: V[0] += V[1] + m[ sigma[0,2,4,6] ] */ \
+                       _mm_set_epi32( m[ sigma[ 6 ] ], m[ sigma[ 4 ] ], \
+                                      m[ sigma[ 2 ] ], m[ sigma[ 0 ] ] ) ) ); \
+   V[3] = mm128_swap32_16( _mm_xor_si128( V[3], V[0] ) ); /* presumably rotates each 32-bit lane by 16 - confirm in simd-utils.h */ \
+   V[2] = _mm_add_epi32( V[2], V[3] ); \
+   V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 12 ); /* presumably per-lane rotate right by 12 - confirm */ \
+   V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], /* V[0] += V[1] + m[ sigma[1,3,5,7] ] */ \
+                        _mm_set_epi32( m[ sigma[ 7 ] ], m[ sigma[ 5 ] ], \
+                                       m[ sigma[ 3 ] ], m[ sigma[ 1 ] ] ) ) ); \
+   V[3] = mm128_shuflr32_8( _mm_xor_si128( V[3], V[0] ) ); /* presumably per-lane rotate right by 8 - confirm */ \
+   V[2] = _mm_add_epi32( V[2], V[3] ); \
+   V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 7 ); \
+   V[3] = mm128_shufll_32( V[3] ); /* NOTE(review): lane shuffles of rows 1..3, presumably lining up the diagonals - confirm */ \
+   V[2] = mm128_swap_64( V[2] ); \
+   V[1] = mm128_shuflr_32( V[1] ); \
+   V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], /* second half: same steps with m[ sigma[8,10,12,14] ] */ \
+                        _mm_set_epi32( m[ sigma[14] ], m[ sigma[12] ], \
+                                       m[ sigma[10] ], m[ sigma[ 8] ] ) ) ); \
+   V[3] = mm128_swap32_16( _mm_xor_si128( V[3], V[0] ) ); \
+   V[2] = _mm_add_epi32( V[2], V[3] ); \
+   V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 12 ); \
+   V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], /* V[0] += V[1] + m[ sigma[9,11,13,15] ] */ \
+                        _mm_set_epi32( m[ sigma[15] ], m[ sigma[13] ], \
+                                       m[ sigma[11] ], m[ sigma[ 9] ] ) ) ); \
+   V[3] = mm128_shuflr32_8( _mm_xor_si128( V[3], V[0] ) ); \
+   V[2] = _mm_add_epi32( V[2], V[3] ); \
+   V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 7 ); \
+   V[3] = mm128_shuflr_32( V[3] ); /* opposite-direction shuffles to the ones above; presumably restores row order - confirm */ \
+   V[2] = mm128_swap_64( V[2] ); \
+   V[1] = mm128_shufll_32( V[1] ); \
+}
+
+#else
+
 #define G(r,i,a,b,c,d) \
 	do { \
 		a = a + b + m[blake2s_sigma[r][2*i+0]]; \
@@ -247,7 +313,10 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES]
 		G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
 		G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
 	} while(0)
-	ROUND( 0 );
+
+#endif
+
+   ROUND( 0 );
 	ROUND( 1 );
 	ROUND( 2 );
 	ROUND( 3 );