From fcd7727b0de2b806a633dac34031d8a5fae10a67 Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Fri, 24 Mar 2023 18:29:42 -0400 Subject: [PATCH] v3.22.1 --- RELEASE_NOTES | 5 + configure | 20 +- configure.ac | 2 +- simd-utils/intrlv.h | 1940 +++++++++++++++++-------------------------- util.c | 1 + winbuild-cross.sh | 2 +- 6 files changed, 780 insertions(+), 1190 deletions(-) diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 3c06614..7455d45 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -65,6 +65,11 @@ If not what makes it happen or not happen? Change Log ---------- +v3.22.1 + +#393 fixed segfault in GBT, regression from v3.22.0. +More efficient 32 bit data interleaving. + v3.22.0 Stratum: faster netdiff calculation. diff --git a/configure b/configure index 31de0b4..123af59 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.71 for cpuminer-opt 3.22.0. +# Generated by GNU Autoconf 2.71 for cpuminer-opt 3.22.1. # # # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation, @@ -608,8 +608,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.22.0' -PACKAGE_STRING='cpuminer-opt 3.22.0' +PACKAGE_VERSION='3.22.1' +PACKAGE_STRING='cpuminer-opt 3.22.1' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.22.0 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.22.1 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1432,7 +1432,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.22.0:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.22.1:";; esac cat <<\_ACEOF @@ -1538,7 +1538,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.22.0 +cpuminer-opt configure 3.22.1 generated by GNU Autoconf 2.71 Copyright (C) 2021 Free Software Foundation, Inc. @@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.22.0, which was +It was created by cpuminer-opt $as_me 3.22.1, which was generated by GNU Autoconf 2.71. Invocation command line was $ $0$ac_configure_args_raw @@ -3593,7 +3593,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.22.0' + VERSION='3.22.1' printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h @@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.22.0, which was +This file was extended by cpuminer-opt $as_me 3.22.1, which was generated by GNU Autoconf 2.71. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ -cpuminer-opt config.status 3.22.0 +cpuminer-opt config.status 3.22.1 configured by $0, generated by GNU Autoconf 2.71, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 1e79cf9..4fb62fb 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.22.0]) +AC_INIT([cpuminer-opt], [3.22.1]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/simd-utils/intrlv.h b/simd-utils/intrlv.h index 886dd95..97c2a02 100644 --- a/simd-utils/intrlv.h +++ b/simd-utils/intrlv.h @@ -11,53 +11,6 @@ // // 32 bit data -// Transpose 1 block consisting of 4x4x32 bit integers. -#define MM128_ILEAVE32( d0, d1, d2, d3, s0, s1, s2, s3 ) \ -{ \ - __m128i t0 = mm128_shuffle2_32( s0, s1, 0x44 ); \ - __m128i t1 = mm128_shuffle2_32( s0, s1, 0xee ); \ - __m128i t2 = mm128_shuffle2_32( s2, s3, 0x44 ); \ - __m128i t3 = mm128_shuffle2_32( s2, s3, 0xee ); \ - d0 = mm128_shuffle2_32( t0, t2, 0x88 ); \ - d1 = mm128_shuffle2_32( t0, t2, 0xdd ); \ - d2 = mm128_shuffle2_32( t1, t3, 0x88 ); \ - d3 = mm128_shuffle2_32( t1, t3, 0xdd ); \ -} - -#if defined(__AVX2__) - -// Transpose 2 contiguous blocks -#define MM256_ILEAVE32( d0, d1, d2, d3, s0, s1, s2, s3 ) \ -{ \ - __m256i t0 = mm256_shuffle2_32( s0, s1, 0x44 ); \ - __m256i t1 = mm256_shuffle2_32( s0, s1, 0xee ); \ - __m256i t2 = mm256_shuffle2_32( s2, s3, 0x44 ); \ - __m256i t3 = mm256_shuffle2_32( s2, s3, 0xee ); \ - d0 = mm256_shuffle2_32( t0, t2, 0x88 ); \ - d1 = mm256_shuffle2_32( t0, t2, 0xdd ); \ - d2 = mm256_shuffle2_32( t1, t3, 0x88 ); \ - d3 = mm256_shuffle2_32( t1, t3, 0xdd ); \ -} - -#endif - -#if defined(__AVX512F__) - -// Transpose 4 contiguous blocks. 
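/*
 * Editor's note (not part of the patch): every transpose macro in this hunk,
 * and the SSE4.1 / SSE2 interleave routines added further below, produce the
 * same word-interleaved layout: dst[ lanes*i + lane ] == src_lane[ i ] for
 * each 32-bit word i. A minimal scalar sketch of the 4-lane case follows for
 * reference; the name ref_intrlv_4x32 is illustrative only and does not
 * exist in the source tree.
 */
#include <stdint.h>

static inline void ref_intrlv_4x32( uint32_t *dst,
                       const uint32_t *s0, const uint32_t *s1,
                       const uint32_t *s2, const uint32_t *s3,
                       const int bit_len )
{
   // One 32-bit word from each of the 4 source lanes per group of 4 words,
   // matching the layout written by the SSE2 fallback intrlv_4x32 below.
   for ( int i = 0; i < bit_len / 32; i++ )
   {
      dst[ 4*i + 0 ] = s0[ i ];
      dst[ 4*i + 1 ] = s1[ i ];
      dst[ 4*i + 2 ] = s2[ i ];
      dst[ 4*i + 3 ] = s3[ i ];
   }
}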
-#define MM512_ILEAVE32( d0, d1, d2, d3, s0, s1, s2, s3 ) \ -{ \ - __m512i t0 = mm512_shuffle2_32( s0, s1, 0x44 ); \ - __m512i t1 = mm512_shuffle2_32( s0, s1, 0xee ); \ - __m512i t2 = mm512_shuffle2_32( s2, s3, 0x44 ); \ - __m512i t3 = mm512_shuffle2_32( s2, s3, 0xee ); \ - d0 = mm512_shuffle2_32( t0, t2, 0x88 ); \ - d1 = mm512_shuffle2_32( t0, t2, 0xdd ); \ - d2 = mm512_shuffle2_32( t1, t3, 0x88 ); \ - d3 = mm512_shuffle2_32( t1, t3, 0xdd ); \ -} - -#endif - // 2x32 static inline void intrlv_2x32( void *dst, const void *src0, @@ -132,38 +85,161 @@ static inline void extr_lane_2x32( void *dst, const void *src, } // 4x32 -/* + +#if defined(__SSE4_1__) + +#define ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ) \ + D0 = mm128_shuflmov_32( S0, 1, S1, 0 ); \ + D1 = mm128_shuflmov_32( S1, 0, S0, 1 ); \ + D2 = mm128_shuflmov_32( S2, 0, S0, 2 ); \ + D3 = mm128_shuflmov_32( S3, 0, S0, 3 ); \ + D0 = mm128_shuflmov_32( D0, 2, S2, 0 ); \ + D1 = mm128_shuflmov_32( D1, 2, S2, 1 ); \ + D2 = mm128_shuflmov_32( D2, 1, S1, 2 ); \ + D3 = mm128_shuflmov_32( D3, 1, S1, 3 ); \ + D0 = mm128_shuflmov_32( D0, 3, S3, 0 ); \ + D1 = mm128_shuflmov_32( D1, 3, S3, 1 ); \ + D2 = mm128_shuflmov_32( D2, 3, S3, 2 ); \ + D3 = mm128_shuflmov_32( D3, 2, S2, 3 ); + +#define LOAD_SRCE( S0, S1, S2, S3, src0, i0, src1, i1, src2, i2, src3, i3 ) \ + S0 = _mm_load_si128( (const __m128i*)(src0) + (i0) ); \ + S1 = _mm_load_si128( (const __m128i*)(src1) + (i1) ); \ + S2 = _mm_load_si128( (const __m128i*)(src2) + (i2) ); \ + S3 = _mm_load_si128( (const __m128i*)(src3) + (i3) ); + +#define STORE_DEST( D0, D1, D2, D3, dst0, i0, dst1, i1, dst2, i2, dst3, i3 ) \ + _mm_store_si128( (__m128i*)(dst0) + (i0), D0 ); \ + _mm_store_si128( (__m128i*)(dst1) + (i1), D1 ); \ + _mm_store_si128( (__m128i*)(dst2) + (i2), D2 ); \ + _mm_store_si128( (__m128i*)(dst3) + (i3), D3 ); + + static inline void intrlv_4x32( void *dst, const void *src0, const void *src1, const void *src2, const void *src3, const int bit_len ) { - __m128i *d = (__m128i*)dst; - const __m128i *s0 = (const __m128i*)src0; - const __m128i *s1 = (const __m128i*)src1; - const __m128i *s2 = (const __m128i*)src2; - const __m128i *s3 = (const __m128i*)src3; + __m128i D0, D1, D2, D3, S0, S1, S2, S3; - MM128_ILEAVE32( d[ 0], d[ 1], d[ 2], d[ 3], s0[0], s1[0], s2[0], s3[0] ); - MM128_ILEAVE32( d[ 4], d[ 5], d[ 6], d[ 7], s0[1], s1[1], s2[1], s3[1] ); + LOAD_SRCE( S0, S1, S2, S3, src0, 0, src1, 0, src2, 0, src3, 0 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src0, 1, src1, 1, src2, 1, src3, 1 ); + STORE_DEST( D0, D1, D2, D3, dst, 0, dst, 1, dst, 2, dst, 3 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + STORE_DEST( D0, D1, D2, D3, dst, 4, dst, 5, dst, 6, dst, 7 ); if ( bit_len <= 256 ) return; - MM128_ILEAVE32( d[ 8], d[ 9], d[10], d[11], s0[2], s1[2], s2[2], s3[2] ); - MM128_ILEAVE32( d[12], d[13], d[14], d[15], s0[3], s1[3], s2[3], s3[3] ); + LOAD_SRCE( S0, S1, S2, S3, src0, 2, src1, 2, src2, 2, src3, 2 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src0, 3, src1, 3, src2, 3, src3, 3 ); + STORE_DEST( D0, D1, D2, D3, dst, 8, dst, 9, dst, 10, dst, 11 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + STORE_DEST( D0, D1, D2, D3, dst, 12, dst, 13, dst, 14, dst, 15 ); if ( bit_len <= 512 ) return; - MM128_ILEAVE32( d[16], d[17], d[18], d[19], s0[4], s1[4], s2[4], s3[4] ); + LOAD_SRCE( S0, S1, S2, S3, src0, 4, src1, 4, src2, 4, src3, 4 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + STORE_DEST( D0, D1, D2, D3, dst, 16, 
dst, 17, dst, 18, dst, 19 ); if ( bit_len <= 640 ) return; - MM128_ILEAVE32( d[20], d[21], d[22], d[23], s0[5], s1[5], s2[5], s3[5] ); - MM128_ILEAVE32( d[24], d[25], d[26], d[27], s0[6], s1[6], s2[6], s3[6] ); - MM128_ILEAVE32( d[28], d[29], d[30], d[31], s0[4], s1[4], s2[4], s3[4] ); -} -*/ + LOAD_SRCE( S0, S1, S2, S3, src0, 5, src1, 5, src2, 5, src3, 5 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src0, 6, src1, 6, src2, 6, src3, 6 ); + STORE_DEST( D0, D1, D2, D3, dst, 20, dst, 21, dst, 22, dst, 23 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src0, 7, src1, 7, src2, 7, src3, 7 ); + STORE_DEST( D0, D1, D2, D3, dst, 24, dst, 25, dst, 26, dst, 27 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + STORE_DEST( D0, D1, D2, D3, dst, 28, dst, 29, dst, 30, dst, 31 ); -static inline void intrlv_4x32( void * - dst, const void *src0, const void *src1, +// if ( bit_len <= 1024 ) return; +} + +static inline void intrlv_4x32_512( void *dst, const void *src0, + const void *src1, const void *src2, const void *src3 ) +{ + __m128i D0, D1, D2, D3, S0, S1, S2, S3; + + LOAD_SRCE( S0, S1, S2, S3, src0, 0, src1, 0, src2, 0, src3, 0 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src0, 1, src1, 1, src2, 1, src3, 1 ); + STORE_DEST( D0, D1, D2, D3, dst, 0, dst, 1, dst, 2, dst, 3 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src0, 2, src1, 2, src2, 2, src3, 2 ); + STORE_DEST( D0, D1, D2, D3, dst, 4, dst, 5, dst, 6, dst, 7 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src0, 3, src1, 3, src2, 3, src3, 3 ); + STORE_DEST( D0, D1, D2, D3, dst, 8, dst, 9, dst, 10, dst, 11 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + STORE_DEST( D0, D1, D2, D3, dst, 12, dst, 13, dst, 14, dst, 15 ); +} + +static inline void dintrlv_4x32( void *dst0, void *dst1, void *dst2, + void *dst3, const void *src, const int bit_len ) +{ + __m128i D0, D1, D2, D3, S0, S1, S2, S3; + + LOAD_SRCE( S0, S1, S2, S3, src, 0, src, 1, src, 2, src, 3 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 4, src, 5, src, 6, src, 7 ); + STORE_DEST( D0, D1, D2, D3, dst0, 0, dst1, 0, dst2, 0, dst3, 0 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + STORE_DEST( D0, D1, D2, D3, dst0, 1, dst1, 1, dst2, 1, dst3, 1 ); + + if ( bit_len <= 256 ) return; + + LOAD_SRCE( S0, S1, S2, S3, src, 8, src, 9, src, 10, src, 11 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 12, src, 13, src, 14, src, 15 ); + STORE_DEST( D0, D1, D2, D3, dst0, 2, dst1, 2, dst2, 2, dst3, 2 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + STORE_DEST( D0, D1, D2, D3, dst0, 3, dst1, 3, dst2, 3, dst3, 3 ); + + if ( bit_len <= 512 ) return; + + LOAD_SRCE( S0, S1, S2, S3, src, 16, src, 17, src, 18, src, 19 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + STORE_DEST( D0, D1, D2, D3, dst0, 4, dst1, 4, dst2, 4, dst3, 4 ); + + if ( bit_len <= 640 ) return; + + LOAD_SRCE( S0, S1, S2, S3, src, 20, src, 21, src, 22, src, 23 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 24, src, 25, src, 26, src, 27 ); + STORE_DEST( D0, D1, D2, D3, dst0, 5, dst1, 5, dst2, 5, dst3, 5 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 28, src, 29, src, 30, src, 31 ); + STORE_DEST( D0, D1, D2, D3, dst0, 6, dst1, 6, dst2, 6, dst3, 6 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + STORE_DEST( 
D0, D1, D2, D3, dst0, 7, dst1, 7, dst2, 7, dst3, 7 ); + +// if ( bit_len <= 1024 ) return; +} + +static inline void dintrlv_4x32_512( void *dst0, void *dst1, void *dst2, + void *dst3, const void *src ) +{ + __m128i D0, D1, D2, D3, S0, S1, S2, S3; + + LOAD_SRCE( S0, S1, S2, S3, src, 0, src, 1, src, 2, src, 3 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 4, src, 5, src, 6, src, 7 ); + STORE_DEST( D0, D1, D2, D3, dst0, 0, dst1, 0, dst2, 0, dst3, 0 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 8, src, 9, src, 10, src, 11 ); + STORE_DEST( D0, D1, D2, D3, dst0, 1, dst1, 1, dst2, 1, dst3, 1 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 12, src, 13, src, 14, src, 15 ); + STORE_DEST( D0, D1, D2, D3, dst0, 2, dst1, 2, dst2, 2, dst3, 2 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + STORE_DEST( D0, D1, D2, D3, dst0, 3, dst1, 3, dst2, 3, dst3, 3 ); +} + +#else // SSE2 + +static inline void intrlv_4x32( void *dst, const void *src0, const void *src1, const void *src2, const void *src3, const int bit_len ) { uint32_t *d = (uint32_t*)dst; @@ -208,49 +284,6 @@ static inline void intrlv_4x32( void * d[124] = s0[31]; d[125] = s1[31]; d[126] = s2[31]; d[127] = s3[31]; } -/* -static inline void intrlv_4x32_512( void *dst, const void *src0, - const void *src1, const void *src2, const void *src3 ) -{ -#if defined(__AVX2__) - - __m256i *d = (__m256i*)dst; - const __m256i *s0 = (const __m256i*)src0; - const __m256i *s1 = (const __m256i*)src1; - const __m256i *s2 = (const __m256i*)src2; - const __m256i *s3 = (const __m256i*)src3; - __m256i dt0, dt1, dt2, dt3; - - MM256_ILEAVE32( dt0, dt1, dt2, dt3, s0[0], s1[0], s2[0], s3[0] ); - - d[0] = _mm256_permute2x128_si256( dt0, dt1, 0x20 ); - d[1] = _mm256_permute2x128_si256( dt2, dt3, 0x20 ); - d[2] = _mm256_permute2x128_si256( dt0, dt1, 0x31 ); - d[3] = _mm256_permute2x128_si256( dt2, dt3, 0x31 ); - - MM256_ILEAVE32( dt0, dt1, dt2, dt3, s0[1], s1[1], s2[1], s3[1] ); - - d[4] = _mm256_permute2x128_si256( dt0, dt1, 0x20 ); - d[5] = _mm256_permute2x128_si256( dt2, dt3, 0x20 ); - d[6] = _mm256_permute2x128_si256( dt0, dt1, 0x31 ); - d[7] = _mm256_permute2x128_si256( dt2, dt3, 0x31 ); - -#else - - __m128i *d = (__m128i*)dst; - const __m128i *s0 = (const __m128i*)src0; - const __m128i *s1 = (const __m128i*)src1; - const __m128i *s2 = (const __m128i*)src2; - const __m128i *s3 = (const __m128i*)src3; - - MM128_ILEAVE32( d[ 0], d[ 1], d[ 2], d[ 3], s0[0], s1[0], s2[0], s3[0] ); - MM128_ILEAVE32( d[ 4], d[ 5], d[ 6], d[ 7], s0[1], s1[1], s2[1], s3[1] ); - MM128_ILEAVE32( d[ 8], d[ 9], d[10], d[11], s0[2], s1[2], s2[2], s3[2] ); - MM128_ILEAVE32( d[12], d[13], d[14], d[15], s0[3], s1[3], s2[3], s3[3] ); - -#endif -} -*/ static inline void intrlv_4x32_512( void *dst, const void *src0, const void *src1, const void *src2, const void *src3 ) @@ -278,37 +311,6 @@ static inline void intrlv_4x32_512( void *dst, const void *src0, d[ 60] = s0[15]; d[ 61] = s1[15]; d[ 62] = s2[15]; d[ 63] = s3[15]; } - -/* -static inline void dintrlv_4x32( void *dst0, void *dst1, void *dst2, - void *dst3, const void *src, const int bit_len ) -{ - __m128i *d0 = (__m128i*)dst0; - __m128i *d1 = (__m128i*)dst1; - __m128i *d2 = (__m128i*)dst2; - __m128i *d3 = (__m128i*)dst3; - const __m128i *s = (const __m128i*)src; - - MM128_ILEAVE32( d0[0], d1[0], d2[0], d3[0], s[ 0], s[ 1], s[ 2], s[ 3] ); - MM128_ILEAVE32( d0[1], d1[1], d2[1], d3[1], s[ 4], s[ 5], s[ 6], s[ 7] ); - - if ( bit_len <= 
256 ) return; - - MM128_ILEAVE32( d0[2], d1[2], d2[2], d3[2], s[ 8], s[ 9], s[10], s[11] ); - MM128_ILEAVE32( d0[3], d1[3], d2[3], d3[3], s[12], s[13], s[14], s[15] ); - - if ( bit_len <= 512 ) return; - - MM128_ILEAVE32( d0[4], d1[4], d2[4], d3[4], s[16], s[17], s[18], s[19] ); - - if ( bit_len <= 640 ) return; - - MM128_ILEAVE32( d0[5], d1[5], d2[5], d3[5], s[20], s[21], s[22], s[23] ); - MM128_ILEAVE32( d0[6], d1[6], d2[6], d3[6], s[24], s[25], s[26], s[27] ); - MM128_ILEAVE32( d0[7], d1[7], d2[7], d3[7], s[28], s[29], s[30], s[31] ); -} -*/ - static inline void dintrlv_4x32( void *dst0, void *dst1, void *dst2, void *dst3, const void *src, const int bit_len ) { @@ -354,49 +356,6 @@ static inline void dintrlv_4x32( void *dst0, void *dst1, void *dst2, d0[31] = s[124]; d1[31] = s[125]; d2[31] = s[126]; d3[31] = s[127]; } -/* -static inline void dintrlv_4x32_512( void *dst0, void *dst1, void *dst2, - void *dst3, const void *src ) -{ -#if defined(__AVX2__) - - __m256i *d0 = (__m256i*)dst0; - __m256i *d1 = (__m256i*)dst1; - __m256i *d2 = (__m256i*)dst2; - __m256i *d3 = (__m256i*)dst3; - const __m256i *s = (const __m256i*)src; - - __m256i st0 = _mm256_permute2x128_si256( s[0], s[2], 0x20 ); - __m256i st2 = _mm256_permute2x128_si256( s[1], s[3], 0x20 ); - __m256i st1 = _mm256_permute2x128_si256( s[0], s[2], 0x31 ); - __m256i st3 = _mm256_permute2x128_si256( s[1], s[3], 0x31 ); - - MM256_ILEAVE32( d0[0], d1[0], d2[0], d3[0], st0, st1, st2, st3 ); - - st0 = _mm256_permute2x128_si256( s[4], s[6], 0x20 ); - st2 = _mm256_permute2x128_si256( s[5], s[7], 0x20 ); - st1 = _mm256_permute2x128_si256( s[4], s[6], 0x31 ); - st3 = _mm256_permute2x128_si256( s[5], s[7], 0x31 ); - - MM256_ILEAVE32( d0[1], d1[1], d2[1], d3[1], st0, st1, st2, st3 ); - -#else - - __m128i *d0 = (__m128i*)dst0; - __m128i *d1 = (__m128i*)dst1; - __m128i *d2 = (__m128i*)dst2; - __m128i *d3 = (__m128i*)dst3; - const __m128i *s = (const __m128i*)src; - - MM128_ILEAVE32( d0[0], d1[0], d2[0], d3[0], s[ 0], s[ 1], s[ 2], s[ 3] ); - MM128_ILEAVE32( d0[1], d1[1], d2[1], d3[1], s[ 4], s[ 5], s[ 6], s[ 7] ); - MM128_ILEAVE32( d0[2], d1[2], d2[2], d3[2], s[ 8], s[ 9], s[10], s[11] ); - MM128_ILEAVE32( d0[3], d1[3], d2[3], d3[3], s[12], s[13], s[14], s[15] ); - -#endif -} -*/ - static inline void dintrlv_4x32_512( void *dst0, void *dst1, void *dst2, void *dst3, const void *src ) { @@ -423,6 +382,7 @@ static inline void dintrlv_4x32_512( void *dst0, void *dst1, void *dst2, d0[15] = s[ 60]; d1[15] = s[ 61]; d2[15] = s[ 62]; d3[15] = s[ 63]; } +#endif // SSE4_1 else SSE2 static inline void extr_lane_4x32( void *d, const void *s, const int lane, const int bit_len ) @@ -590,204 +550,6 @@ static inline void mm128_bswap32_intrlv80_4x32( void *d, const void *src ) */ // 8x32 -/* -static inline void intrlv_8x32( void *dst, const void *src0, - const void *src1, const void *src2, const void *src3, const void *src4, - const void *src5, const void *src6, const void *src7, const int bit_len ) -{ - __m128i *d = (__m128i*)dst; - const __m128i *s0 = (const __m128i*)src0; - const __m128i *s1 = (const __m128i*)src1; - const __m128i *s2 = (const __m128i*)src2; - const __m128i *s3 = (const __m128i*)src3; - const __m128i *s4 = (const __m128i*)src4; - const __m128i *s5 = (const __m128i*)src5; - const __m128i *s6 = (const __m128i*)src6; - const __m128i *s7 = (const __m128i*)src7; - - MM128_ILEAVE32( d[ 0], d[ 2], d[ 4], d[ 6], s0[0], s1[0], s2[0], s3[0] ); - MM128_ILEAVE32( d[ 1], d[ 3], d[ 5], d[ 7], s4[0], s5[0], s6[0], s7[0] ); - MM128_ILEAVE32( d[ 8], d[10], 
d[12], d[14], s0[1], s1[1], s2[1], s3[1] ); - MM128_ILEAVE32( d[ 9], d[11], d[13], d[15], s4[1], s5[1], s6[1], s7[1] ); - - if ( bit_len <= 256 ) return; - - MM128_ILEAVE32( d[16], d[18], d[20], d[22], s0[2], s1[2], s2[2], s3[2] ); - MM128_ILEAVE32( d[17], d[19], d[21], d[23], s4[2], s5[2], s6[2], s7[2] ); - MM128_ILEAVE32( d[24], d[26], d[28], d[30], s0[3], s1[3], s2[3], s3[3] ); - MM128_ILEAVE32( d[25], d[27], d[29], d[31], s4[3], s5[3], s6[3], s7[3] ); - - if ( bit_len <= 512 ) return; - - MM128_ILEAVE32( d[32], d[34], d[36], d[38], s0[4], s1[4], s2[4], s3[4] ); - MM128_ILEAVE32( d[33], d[35], d[37], d[39], s4[4], s5[4], s6[4], s7[4] ); - - if ( bit_len <= 640 ) return; - - MM128_ILEAVE32( d[40], d[42], d[44], d[46], s0[5], s1[5], s2[5], s3[5] ); - MM128_ILEAVE32( d[41], d[43], d[45], d[47], s4[5], s5[5], s6[5], s7[5] ); - - MM128_ILEAVE32( d[48], d[50], d[52], d[54], s0[6], s1[6], s2[6], s3[6] ); - MM128_ILEAVE32( d[49], d[51], d[53], d[55], s4[6], s5[6], s6[6], s7[6] ); - MM128_ILEAVE32( d[56], d[58], d[60], d[62], s0[7], s1[7], s2[7], s3[7] ); - MM128_ILEAVE32( d[57], d[59], d[61], d[63], s4[7], s5[7], s6[7], s7[7] ); -} - -// Not used -static inline void intrlv_8x32_256( void *dst, const void *src0, - const void *src1, const void *src2, const void *src3, const void *src4, - const void *src5, const void *src6, const void *src7 ) -{ -#if defined(__AVX2__) - - __m256i *d = (__m256i*)dst; - const __m256i *s0 = (const __m256i*)src0; - const __m256i *s1 = (const __m256i*)src1; - const __m256i *s2 = (const __m256i*)src2; - const __m256i *s3 = (const __m256i*)src3; - const __m256i *s4 = (const __m256i*)src4; - const __m256i *s5 = (const __m256i*)src5; - const __m256i *s6 = (const __m256i*)src6; - const __m256i *s7 = (const __m256i*)src7; - __m256i dt0, dt1, dt2, dt3, dt4, dt5, dt6, dt7; - - MM256_ILEAVE32( dt0, dt1, dt2, dt3, s0[0], s1[0], s2[0], s3[0] ); - MM256_ILEAVE32( dt4, dt5, dt6, dt7, s4[0], s5[0], s6[0], s7[0] ); - - d[0] = _mm256_permute2x128_si256( dt0, dt4, 0x20 ); - d[4] = _mm256_permute2x128_si256( dt0, dt4, 0x31 ); - d[1] = _mm256_permute2x128_si256( dt1, dt5, 0x20 ); - d[5] = _mm256_permute2x128_si256( dt1, dt5, 0x31 ); - d[2] = _mm256_permute2x128_si256( dt2, dt6, 0x20 ); - d[6] = _mm256_permute2x128_si256( dt2, dt6, 0x31 ); - d[3] = _mm256_permute2x128_si256( dt3, dt7, 0x20 ); - d[7] = _mm256_permute2x128_si256( dt3, dt7, 0x31 ); - -#else -// Shouldn't get here, 8x32 used only with AVX2 - - __m128i *d = (__m128i*)dst; - const __m128i *s0 = (const __m128i*)src0; - const __m128i *s1 = (const __m128i*)src1; - const __m128i *s2 = (const __m128i*)src2; - const __m128i *s3 = (const __m128i*)src3; - const __m128i *s4 = (const __m128i*)src4; - const __m128i *s5 = (const __m128i*)src5; - const __m128i *s6 = (const __m128i*)src6; - const __m128i *s7 = (const __m128i*)src7; - - MM128_ILEAVE32( d[ 0], d[ 2], d[ 4], d[ 6], s0[0], s1[0], s2[0], s3[0] ); - MM128_ILEAVE32( d[ 1], d[ 3], d[ 5], d[ 7], s4[0], s5[0], s6[0], s7[0] ); - MM128_ILEAVE32( d[ 8], d[10], d[12], d[14], s0[1], s1[1], s2[1], s3[1] ); - MM128_ILEAVE32( d[ 9], d[11], d[13], d[15], s4[1], s5[1], s6[1], s7[1] ); - -#endif -} - -static inline void intrlv_8x32_512( void *dst, const void *src0, - const void *src1, const void *src2, const void *src3, const void *src4, - const void *src5, const void *src6, const void *src7 ) -{ -#if 0 //defined(__AVX512F__) - - __m512i *d = (__m512i*)dst; - const __m512i *s0 = (const __m512i*)src0; - const __m512i *s1 = (const __m512i*)src1; - const __m512i *s2 = (const __m512i*)src2; - const 
__m512i *s3 = (const __m512i*)src3; - const __m512i *s4 = (const __m512i*)src4; - const __m512i *s5 = (const __m512i*)src5; - const __m512i *s6 = (const __m512i*)src6; - const __m512i *s7 = (const __m512i*)src7; - - __m512i dt0, dt1, dt2, dt3, dt4, dt5, dt6, dt7, t0, t1, t2, t3; - - MM512_ILEAVE32( dt0, dt1, dt2, dt3, s0[0], s1[0], s2[0], s3[0] ); - MM512_ILEAVE32( dt4, dt5, dt6, dt7, s4[0], s5[0], s6[0], s7[0] ); - - t0 = _mm512_shuffle_i32x4( dt0, dt4, 0x44 ); - t2 = _mm512_shuffle_i32x4( dt1, dt5, 0x44 ); - t1 = _mm512_shuffle_i32x4( dt0, dt4, 0xee ); - t3 = _mm512_shuffle_i32x4( dt1, dt5, 0xee ); - - d[0] = _mm512_shuffle_i32x4( t0, t2, 0x88 ); - d[2] = _mm512_shuffle_i32x4( t0, t2, 0xdd ); - d[4] = _mm512_shuffle_i32x4( t1, t3, 0x88 ); - d[6] = _mm512_shuffle_i32x4( t1, t3, 0xdd ); - - t0 = _mm512_shuffle_i32x4( dt2, dt6, 0x44 ); - t2 = _mm512_shuffle_i32x4( dt3, dt7, 0x44 ); - t1 = _mm512_shuffle_i32x4( dt2, dt6, 0xee ); - t3 = _mm512_shuffle_i32x4( dt3, dt7, 0xee ); - - d[1] = _mm512_shuffle_i32x4( t0, t2, 0x88 ); - d[3] = _mm512_shuffle_i32x4( t0, t2, 0xdd ); - d[5] = _mm512_shuffle_i32x4( t1, t3, 0x88 ); - d[7] = _mm512_shuffle_i32x4( t1, t3, 0xdd ); - -#elif defined(__AVX2__) - - __m256i *d = (__m256i*)dst; - const __m256i *s0 = (const __m256i*)src0; - const __m256i *s1 = (const __m256i*)src1; - const __m256i *s2 = (const __m256i*)src2; - const __m256i *s3 = (const __m256i*)src3; - const __m256i *s4 = (const __m256i*)src4; - const __m256i *s5 = (const __m256i*)src5; - const __m256i *s6 = (const __m256i*)src6; - const __m256i *s7 = (const __m256i*)src7; - __m256i dt0, dt1, dt2, dt3, dt4, dt5, dt6, dt7; - - MM256_ILEAVE32( dt0, dt1, dt2, dt3, s0[0], s1[0], s2[0], s3[0] ); - MM256_ILEAVE32( dt4, dt5, dt6, dt7, s4[0], s5[0], s6[0], s7[0] ); - - d[0] = _mm256_permute2x128_si256( dt0, dt4, 0x20 ); - d[1] = _mm256_permute2x128_si256( dt1, dt5, 0x20 ); - d[4] = _mm256_permute2x128_si256( dt0, dt4, 0x31 ); - d[5] = _mm256_permute2x128_si256( dt1, dt5, 0x31 ); - d[2] = _mm256_permute2x128_si256( dt2, dt6, 0x20 ); - d[3] = _mm256_permute2x128_si256( dt3, dt7, 0x20 ); - d[6] = _mm256_permute2x128_si256( dt2, dt6, 0x31 ); - d[7] = _mm256_permute2x128_si256( dt3, dt7, 0x31 ); - - MM256_ILEAVE32( dt0, dt1, dt2, dt3, s0[1], s1[1], s2[1], s3[1] ); - MM256_ILEAVE32( dt4, dt5, dt6, dt7, s4[1], s5[1], s6[1], s7[1] ); - - d[ 8] = _mm256_permute2x128_si256( dt0, dt4, 0x20 ); - d[ 9] = _mm256_permute2x128_si256( dt1, dt5, 0x20 ); - d[12] = _mm256_permute2x128_si256( dt0, dt4, 0x31 ); - d[13] = _mm256_permute2x128_si256( dt1, dt5, 0x31 ); - d[10] = _mm256_permute2x128_si256( dt2, dt6, 0x20 ); - d[11] = _mm256_permute2x128_si256( dt3, dt7, 0x20 ); - d[14] = _mm256_permute2x128_si256( dt2, dt6, 0x31 ); - d[15] = _mm256_permute2x128_si256( dt3, dt7, 0x31 ); - -#else -// Shouldn't get here, 8x32 only used with AVX2 or AVX512 - - __m128i *d = (__m128i*)dst; - const __m128i *s0 = (const __m128i*)src0; - const __m128i *s1 = (const __m128i*)src1; - const __m128i *s2 = (const __m128i*)src2; - const __m128i *s3 = (const __m128i*)src3; - const __m128i *s4 = (const __m128i*)src4; - const __m128i *s5 = (const __m128i*)src5; - const __m128i *s6 = (const __m128i*)src6; - const __m128i *s7 = (const __m128i*)src7; - - MM128_ILEAVE32( d[ 0], d[ 2], d[ 4], d[ 6], s0[0], s1[0], s2[0], s3[0] ); - MM128_ILEAVE32( d[ 1], d[ 3], d[ 5], d[ 7], s4[0], s5[0], s6[0], s7[0] ); - MM128_ILEAVE32( d[ 8], d[10], d[12], d[14], s0[1], s1[1], s2[1], s3[1] ); - MM128_ILEAVE32( d[ 9], d[11], d[13], d[15], s4[1], s5[1], s6[1], s7[1] ); - - 
MM128_ILEAVE32( d[16], d[18], d[20], d[22], s0[2], s1[2], s2[2], s3[2] ); - MM128_ILEAVE32( d[17], d[19], d[21], d[23], s4[2], s5[2], s6[2], s7[2] ); - MM128_ILEAVE32( d[24], d[26], d[28], d[30], s0[3], s1[3], s2[3], s3[3] ); - MM128_ILEAVE32( d[25], d[27], d[29], d[31], s4[3], s5[3], s6[3], s7[3] ); - -#endif -} -*/ #define ILEAVE_8x32( i ) do \ { \ @@ -810,7 +572,8 @@ static inline void intrlv_8x32b( void *dst, const void *s0, const void *s1, ILEAVE_8x32( i ); } - +/* +// default static inline void intrlv_8x32( void *dst, const void *s0, const void *s1, const void *s2, const void *s3, const void *s4, const void *s5, const void *s6, const void *s7, const int bit_len ) @@ -835,9 +598,9 @@ static inline void intrlv_8x32( void *dst, const void *s0, const void *s1, ILEAVE_8x32( 28 ); ILEAVE_8x32( 29 ); ILEAVE_8x32( 30 ); ILEAVE_8x32( 31 ); } +*/ - - +/* default static inline void intrlv_8x32_512( void *dst, const void *s0, const void *s1, const void *s2, const void *s3, const void *s4, const void *s5, const void *s6, const void *s7 ) @@ -851,205 +614,212 @@ static inline void intrlv_8x32_512( void *dst, const void *s0, const void *s1, ILEAVE_8x32( 12 ); ILEAVE_8x32( 13 ); ILEAVE_8x32( 14 ); ILEAVE_8x32( 15 ); } - +*/ #undef ILEAVE_8x32 -/* +#if defined(__SSE4_1__) + +static inline void intrlv_8x32( void *dst, const void *s0, const void *s1, + const void *s2, const void *s3, const void *s4, const void *s5, + const void *s6, const void *s7, const int bit_len ) +{ + __m128i D0, D1, D2, D3, S0, S1, S2, S3; + + LOAD_SRCE( S0, S1, S2, S3, s0, 0, s1, 0, s2, 0, s3, 0 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s4, 0, s5, 0, s6, 0, s7, 0 ); + STORE_DEST( D0, D1, D2, D3, dst, 0, dst, 2, dst, 4, dst, 6 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s0, 1, s1, 1, s2, 1, s3, 1 ); + STORE_DEST( D0, D1, D2, D3, dst, 1, dst, 3, dst, 5, dst, 7 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s4, 1, s5, 1, s6, 1, s7, 1 ); + STORE_DEST( D0, D1, D2, D3, dst, 8, dst, 10, dst, 12, dst, 14 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + STORE_DEST( D0, D1, D2, D3, dst, 9, dst, 11, dst, 13, dst, 15 ); + + if ( bit_len <= 256 ) return; + + LOAD_SRCE( S0, S1, S2, S3, s0, 2, s1, 2, s2, 2, s3, 2 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s4, 2, s5, 2, s6, 2, s7, 2 ); + STORE_DEST( D0, D1, D2, D3, dst, 16, dst, 18, dst, 20, dst, 22 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s0, 3, s1, 3, s2, 3, s3, 3 ); + STORE_DEST( D0, D1, D2, D3, dst, 17, dst, 19, dst, 21, dst, 23 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s4, 3, s5, 3, s6, 3, s7, 3 ); + STORE_DEST( D0, D1, D2, D3, dst, 24, dst, 26, dst, 28, dst, 30 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + STORE_DEST( D0, D1, D2, D3, dst, 25, dst, 27, dst, 29, dst, 31 ); + + if ( bit_len <= 512 ) return; + + LOAD_SRCE( S0, S1, S2, S3, s0, 4, s1, 4, s2, 4, s3, 4 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s4, 4, s5, 4, s6, 4, s7, 4 ); + STORE_DEST( D0, D1, D2, D3, dst, 32, dst, 34, dst, 36, dst, 38 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + STORE_DEST( D0, D1, D2, D3, dst, 33, dst, 35, dst, 37, dst, 39 ); + + if ( bit_len <= 640 ) return; + + LOAD_SRCE( S0, S1, S2, S3, s0, 5, s1, 5, s2, 5, s3, 5 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s4, 5, s5, 5, s6, 5, s7, 5 ); + 
STORE_DEST( D0, D1, D2, D3, dst, 40, dst, 42, dst, 44, dst, 46 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s0, 6, s1, 6, s2, 6, s3, 6 ); + STORE_DEST( D0, D1, D2, D3, dst, 41, dst, 43, dst, 45, dst, 47 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s4, 6, s5, 6, s6, 6, s7, 6 ); + STORE_DEST( D0, D1, D2, D3, dst, 48, dst, 50, dst, 52, dst, 54 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s0, 7, s1, 7, s2, 7, s3, 7 ); + STORE_DEST( D0, D1, D2, D3, dst, 49, dst, 51, dst, 53, dst, 55 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s4, 7, s5, 7, s6, 7, s7, 7 ); + STORE_DEST( D0, D1, D2, D3, dst, 56, dst, 58, dst, 60, dst, 62 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + STORE_DEST( D0, D1, D2, D3, dst, 57, dst, 59, dst, 61, dst, 63 ); + +// if ( bit_len <= 1024 ) return; +} + +static inline void intrlv_8x32_512( void *dst, const void *s0, const void *s1, + const void *s2, const void *s3, const void *s4, const void *s5, + const void *s6, const void *s7 ) +{ + __m128i D0, D1, D2, D3, S0, S1, S2, S3; + + LOAD_SRCE( S0, S1, S2, S3, s0, 0, s1, 0, s2, 0, s3, 0 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s4, 0, s5, 0, s6, 0, s7, 0 ); + STORE_DEST( D0, D1, D2, D3, dst, 0, dst, 2, dst, 4, dst, 6 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s0, 1, s1, 1, s2, 1, s3, 1 ); + STORE_DEST( D0, D1, D2, D3, dst, 1, dst, 3, dst, 5, dst, 7 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s4, 1, s5, 1, s6, 1, s7, 1 ); + STORE_DEST( D0, D1, D2, D3, dst, 8, dst, 10, dst, 12, dst, 14 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s0, 2, s1, 2, s2, 2, s3, 2 ); + STORE_DEST( D0, D1, D2, D3, dst, 9, dst, 11, dst, 13, dst, 15 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s4, 2, s5, 2, s6, 2, s7, 2 ); + STORE_DEST( D0, D1, D2, D3, dst, 16, dst, 18, dst, 20, dst, 22 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s0, 3, s1, 3, s2, 3, s3, 3 ); + STORE_DEST( D0, D1, D2, D3, dst, 17, dst, 19, dst, 21, dst, 23 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s4, 3, s5, 3, s6, 3, s7, 3 ); + STORE_DEST( D0, D1, D2, D3, dst, 24, dst, 26, dst, 28, dst, 30 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + STORE_DEST( D0, D1, D2, D3, dst, 25, dst, 27, dst, 29, dst, 31 ); +} + static inline void dintrlv_8x32( void *dst0, void *dst1, void *dst2, void *dst3, void *dst4, void *dst5, void *dst6, void *dst7, const void *src, const int bit_len ) { - __m128i *d0 = (__m128i*)dst0; - __m128i *d1 = (__m128i*)dst1; - __m128i *d2 = (__m128i*)dst2; - __m128i *d3 = (__m128i*)dst3; - __m128i *d4 = (__m128i*)dst4; - __m128i *d5 = (__m128i*)dst5; - __m128i *d6 = (__m128i*)dst6; - __m128i *d7 = (__m128i*)dst7; - const __m128i *s = (const __m128i*)src; + __m128i D0, D1, D2, D3, S0, S1, S2, S3; - MM128_ILEAVE32( d0[0], d1[0], d2[0], d3[0], s[ 0], s[ 2], s[ 4], s[ 6] ); - MM128_ILEAVE32( d4[0], d5[0], d6[0], d7[0], s[ 1], s[ 3], s[ 5], s[ 7] ); - MM128_ILEAVE32( d0[1], d1[1], d2[1], d3[1], s[ 8], s[10], s[12], s[14] ); - MM128_ILEAVE32( d4[1], d5[1], d6[1], d7[1], s[ 9], s[11], s[13], s[15] ); + LOAD_SRCE( S0, S1, S2, S3, src, 0, src, 2, src, 4, src, 6 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 1, src, 3, src, 5, src, 7 ); + 
STORE_DEST( D0, D1, D2, D3, dst0, 0, dst1, 0, dst2, 0, dst3, 0 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 8, src, 10, src, 12, src, 14 ); + STORE_DEST( D0, D1, D2, D3, dst4, 0, dst5, 0, dst6, 0, dst7, 0 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 9, src, 11, src, 13, src, 15 ); + STORE_DEST( D0, D1, D2, D3, dst0, 1, dst1, 1, dst2, 1, dst3, 1 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + STORE_DEST( D0, D1, D2, D3, dst4, 1, dst5, 1, dst6, 1, dst7, 1 ); if ( bit_len <= 256 ) return; - MM128_ILEAVE32( d0[2], d1[2], d2[2], d3[2], s[16], s[18], s[20], s[22] ); - MM128_ILEAVE32( d4[2], d5[2], d6[2], d7[2], s[17], s[19], s[21], s[23] ); - MM128_ILEAVE32( d0[3], d1[3], d2[3], d3[3], s[24], s[26], s[28], s[30] ); - MM128_ILEAVE32( d4[3], d5[3], d6[3], d7[3], s[25], s[27], s[29], s[31] ); + LOAD_SRCE( S0, S1, S2, S3, src, 16, src, 18, src, 20, src, 22 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 17, src, 19, src, 21, src, 23 ); + STORE_DEST( D0, D1, D2, D3, dst0, 2, dst1, 2, dst2, 2, dst3, 2 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 24, src, 26, src, 28, src, 30 ); + STORE_DEST( D0, D1, D2, D3, dst4, 2, dst5, 2, dst6, 2, dst7, 2 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 25, src, 27, src, 29, src, 31 ); + STORE_DEST( D0, D1, D2, D3, dst0, 3, dst1, 3, dst2, 3, dst3, 3 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + STORE_DEST( D0, D1, D2, D3, dst4, 3, dst5, 3, dst6, 3, dst7, 3 ); if ( bit_len <= 512 ) return; - MM128_ILEAVE32( d0[4], d1[4], d2[4], d3[4], s[32], s[34], s[36], s[38] ); - MM128_ILEAVE32( d4[4], d5[4], d6[4], d7[4], s[33], s[35], s[37], s[39] ); + LOAD_SRCE( S0, S1, S2, S3, src, 32, src, 34, src, 36, src, 38 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 33, src, 35, src, 37, src, 39 ); + STORE_DEST( D0, D1, D2, D3, dst0, 4, dst1, 4, dst2, 4, dst3, 4 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + STORE_DEST( D0, D1, D2, D3, dst4, 4, dst5, 4, dst6, 4, dst7, 4 ); if ( bit_len <= 640 ) return; - MM128_ILEAVE32( d0[5], d1[5], d2[5], d3[5], s[40], s[42], s[44], s[46] ); - MM128_ILEAVE32( d4[5], d5[5], d6[5], d7[5], s[41], s[43], s[45], s[47] ); - MM128_ILEAVE32( d0[6], d1[6], d2[6], d3[6], s[48], s[50], s[52], s[54] ); - MM128_ILEAVE32( d4[6], d5[6], d6[6], d7[6], s[49], s[51], s[53], s[55] ); - MM128_ILEAVE32( d0[7], d1[7], d2[7], d3[7], s[56], s[58], s[60], s[62] ); - MM128_ILEAVE32( d4[7], d5[7], d6[7], d7[7], s[57], s[59], s[61], s[63] ); -} + LOAD_SRCE( S0, S1, S2, S3, src, 40, src, 42, src, 44, src, 46 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 41, src, 43, src, 45, src, 47 ); + STORE_DEST( D0, D1, D2, D3, dst0, 5, dst1, 5, dst2, 5, dst3, 5 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 48, src, 50, src, 52, src, 54 ); + STORE_DEST( D0, D1, D2, D3, dst4, 5, dst5, 5, dst6, 5, dst7, 5 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 49, src, 51, src, 53, src, 55 ); + STORE_DEST( D0, D1, D2, D3, dst0, 6, dst1, 6, dst2, 6, dst3, 6 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 56, src, 58, src, 60, src, 62 ); + STORE_DEST( D0, D1, D2, D3, dst4, 6, dst5, 6, dst6, 6, dst7, 6 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 57, src, 
59, src, 61, src, 63 ); + STORE_DEST( D0, D1, D2, D3, dst0, 7, dst1, 7, dst2, 7, dst3, 7 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + STORE_DEST( D0, D1, D2, D3, dst4, 7, dst5, 7, dst6, 7, dst7, 7 ); -static inline void dintrlv_8x32_256( void *dst0, void *dst1, void *dst2, - void *dst3, void *dst4, void *dst5, void *dst6, void *dst7, - const void *src ) -{ -#if defined(__AVX2__) - - __m256i *d0 = (__m256i*)dst0; - __m256i *d1 = (__m256i*)dst1; - __m256i *d2 = (__m256i*)dst2; - __m256i *d3 = (__m256i*)dst3; - __m256i *d4 = (__m256i*)dst4; - __m256i *d5 = (__m256i*)dst5; - __m256i *d6 = (__m256i*)dst6; - __m256i *d7 = (__m256i*)dst7; - const __m256i *s = (const __m256i*)src; - - __m256i st0 = _mm256_permute2x128_si256( s[0], s[4], 0x20 ); - __m256i st1 = _mm256_permute2x128_si256( s[0], s[4], 0x31 ); - __m256i st2 = _mm256_permute2x128_si256( s[1], s[5], 0x20 ); - __m256i st3 = _mm256_permute2x128_si256( s[1], s[5], 0x31 ); - __m256i st4 = _mm256_permute2x128_si256( s[2], s[6], 0x20 ); - __m256i st5 = _mm256_permute2x128_si256( s[2], s[6], 0x31 ); - __m256i st6 = _mm256_permute2x128_si256( s[3], s[7], 0x20 ); - __m256i st7 = _mm256_permute2x128_si256( s[3], s[7], 0x31 ); - - MM256_ILEAVE32( d0[0], d1[0], d2[0], d3[0], st0, st2, st4, st6 ); - MM256_ILEAVE32( d4[0], d5[0], d6[0], d7[0], st1, st3, st5, st7 ); - -#else -// Not needed, 8x32 used only with AVX2, AVX512 - - __m128i *d0 = (__m128i*)dst0; - __m128i *d1 = (__m128i*)dst1; - __m128i *d2 = (__m128i*)dst2; - __m128i *d3 = (__m128i*)dst3; - __m128i *d4 = (__m128i*)dst4; - __m128i *d5 = (__m128i*)dst5; - __m128i *d6 = (__m128i*)dst6; - __m128i *d7 = (__m128i*)dst7; - const __m128i *s = (const __m128i*)src; - - MM128_ILEAVE32( d0[0], d1[0], d2[0], d3[0], s[ 0], s[ 2], s[ 4], s[ 6] ); - MM128_ILEAVE32( d4[0], d5[0], d6[0], d7[0], s[ 1], s[ 3], s[ 5], s[ 7] ); - MM128_ILEAVE32( d0[1], d1[1], d2[1], d3[1], s[ 8], s[10], s[12], s[14] ); - MM128_ILEAVE32( d4[1], d5[1], d6[1], d7[1], s[ 9], s[11], s[13], s[15] ); - -#endif +// if ( bit_len <= 1024 ) return; } static inline void dintrlv_8x32_512( void *dst0, void *dst1, void *dst2, void *dst3, void *dst4, void *dst5, void *dst6, void *dst7, const void *src ) { -#if 0 // defined(__AVX512F__) + __m128i D0, D1, D2, D3, S0, S1, S2, S3; - __m512i *d0 = (__m512i*)dst0; - __m512i *d1 = (__m512i*)dst1; - __m512i *d2 = (__m512i*)dst2; - __m512i *d3 = (__m512i*)dst3; - __m512i *d4 = (__m512i*)dst4; - __m512i *d5 = (__m512i*)dst5; - __m512i *d6 = (__m512i*)dst6; - __m512i *d7 = (__m512i*)dst7; - - - const __m512i *s = (const __m512i*)src; - - __m512i st0, st1, st2, st3, st4, st5, st6, st7, t0, t1, t2, t3; - - t0 = _mm512_shuffle_i32x4( s[0], s[2], 0x44 ); - t2 = _mm512_shuffle_i32x4( s[4], s[6], 0x44 ); - t1 = _mm512_shuffle_i32x4( s[0], s[2], 0xee ); - t3 = _mm512_shuffle_i32x4( s[4], s[6], 0xee ); - - st0 = _mm512_shuffle_i32x4( t0, t2, 0x88 ); - st4 = _mm512_shuffle_i32x4( t0, t2, 0xdd ); - st1 = _mm512_shuffle_i32x4( t1, t3, 0x88 ); - st5 = _mm512_shuffle_i32x4( t1, t3, 0xdd ); - - t0 = _mm512_shuffle_i32x4( s[1], s[3], 0x44 ); - t2 = _mm512_shuffle_i32x4( s[5], s[7], 0x44 ); - t1 = _mm512_shuffle_i32x4( s[1], s[3], 0xee ); - t3 = _mm512_shuffle_i32x4( s[5], s[7], 0xee ); - - st2 = _mm512_shuffle_i32x4( t0, t2, 0x88 ); - st6 = _mm512_shuffle_i32x4( t0, t2, 0xdd ); - st3 = _mm512_shuffle_i32x4( t1, t3, 0x88 ); - st7 = _mm512_shuffle_i32x4( t1, t3, 0xdd ); - - MM512_ILEAVE32( d0[0], d1[0], d2[0], d3[0], st0, st1, st2, st3 ); - MM512_ILEAVE32( d4[0], d5[0], d6[0], d7[0], st4, st5, st6, st7 ); - 
-#elif defined(__AVX2__) - - __m256i *d0 = (__m256i*)dst0; - __m256i *d1 = (__m256i*)dst1; - __m256i *d2 = (__m256i*)dst2; - __m256i *d3 = (__m256i*)dst3; - __m256i *d4 = (__m256i*)dst4; - __m256i *d5 = (__m256i*)dst5; - __m256i *d6 = (__m256i*)dst6; - __m256i *d7 = (__m256i*)dst7; - const __m256i *s = (const __m256i*)src; - - __m256i st0 = _mm256_permute2x128_si256( s[0], s[4], 0x20 ); - __m256i st2 = _mm256_permute2x128_si256( s[1], s[5], 0x20 ); - __m256i st1 = _mm256_permute2x128_si256( s[0], s[4], 0x31 ); - __m256i st3 = _mm256_permute2x128_si256( s[1], s[5], 0x31 ); - __m256i st4 = _mm256_permute2x128_si256( s[2], s[6], 0x20 ); - __m256i st6 = _mm256_permute2x128_si256( s[3], s[7], 0x20 ); - __m256i st5 = _mm256_permute2x128_si256( s[2], s[6], 0x31 ); - __m256i st7 = _mm256_permute2x128_si256( s[3], s[7], 0x31 ); - - MM256_ILEAVE32( d0[0], d1[0], d2[0], d3[0], st0, st2, st4, st6 ); - MM256_ILEAVE32( d4[0], d5[0], d6[0], d7[0], st1, st3, st5, st7 ); - - st0 = _mm256_permute2x128_si256( s[ 8], s[12], 0x20 ); - st2 = _mm256_permute2x128_si256( s[ 9], s[13], 0x20 ); - st1 = _mm256_permute2x128_si256( s[ 8], s[12], 0x31 ); - st3 = _mm256_permute2x128_si256( s[ 9], s[13], 0x31 ); - st4 = _mm256_permute2x128_si256( s[10], s[14], 0x20 ); - st6 = _mm256_permute2x128_si256( s[11], s[15], 0x20 ); - st5 = _mm256_permute2x128_si256( s[10], s[14], 0x31 ); - st7 = _mm256_permute2x128_si256( s[11], s[15], 0x31 ); - - MM256_ILEAVE32( d0[1], d1[1], d2[1], d3[1], st0, st2, st4, st6 ); - MM256_ILEAVE32( d4[1], d5[1], d6[1], d7[1], st1, st3, st5, st7 ); - -#else - - __m128i *d0 = (__m128i*)dst0; - __m128i *d1 = (__m128i*)dst1; - __m128i *d2 = (__m128i*)dst2; - __m128i *d3 = (__m128i*)dst3; - __m128i *d4 = (__m128i*)dst4; - __m128i *d5 = (__m128i*)dst5; - __m128i *d6 = (__m128i*)dst6; - __m128i *d7 = (__m128i*)dst7; - const __m128i *s = (const __m128i*)src; - - MM128_ILEAVE32( d0[0], d1[0], d2[0], d3[0], s[ 0], s[ 2], s[ 4], s[ 6] ); - MM128_ILEAVE32( d4[0], d5[0], d6[0], d7[0], s[ 1], s[ 3], s[ 5], s[ 7] ); - MM128_ILEAVE32( d0[1], d1[1], d2[1], d3[1], s[ 8], s[10], s[12], s[14] ); - MM128_ILEAVE32( d4[1], d5[1], d6[1], d7[1], s[ 9], s[11], s[13], s[15] ); - - MM128_ILEAVE32( d0[2], d1[2], d2[2], d3[2], s[16], s[18], s[20], s[22] ); - MM128_ILEAVE32( d4[2], d5[2], d6[2], d7[2], s[17], s[19], s[21], s[23] ); - MM128_ILEAVE32( d0[3], d1[3], d2[3], d3[3], s[24], s[26], s[28], s[30] ); - MM128_ILEAVE32( d4[3], d5[3], d6[3], d7[3], s[25], s[27], s[29], s[31] ); - -#endif + LOAD_SRCE( S0, S1, S2, S3, src, 0, src, 2, src, 4, src, 6 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 1, src, 3, src, 5, src, 7 ); + STORE_DEST( D0, D1, D2, D3, dst0, 0, dst1, 0, dst2, 0, dst3, 0 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 8, src, 10, src, 12, src, 14 ); + STORE_DEST( D0, D1, D2, D3, dst4, 0, dst5, 0, dst6, 0, dst7, 0 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 9, src, 11, src, 13, src, 15 ); + STORE_DEST( D0, D1, D2, D3, dst0, 1, dst1, 1, dst2, 1, dst3, 1 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 16, src, 18, src, 20, src, 22 ); + STORE_DEST( D0, D1, D2, D3, dst4, 1, dst5, 1, dst6, 1, dst7, 1 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 17, src, 19, src, 21, src, 23 ); + STORE_DEST( D0, D1, D2, D3, dst0, 2, dst1, 2, dst2, 2, dst3, 2 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, 
src, 24, src, 26, src, 28, src, 30 ); + STORE_DEST( D0, D1, D2, D3, dst4, 2, dst5, 2, dst6, 2, dst7, 2 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 25, src, 27, src, 29, src, 31 ); + STORE_DEST( D0, D1, D2, D3, dst0, 3, dst1, 3, dst2, 3, dst3, 3 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + STORE_DEST( D0, D1, D2, D3, dst4, 3, dst5, 3, dst6, 3, dst7, 3 ); } -*/ + +#endif // SSE4_1 + #define DLEAVE_8x32( i ) do \ { \ @@ -1072,6 +842,7 @@ static inline void dintrlv_8x32b( void *d0, void *d1, void *d2, void *d3, DLEAVE_8x32( i ); } +/* default static inline void dintrlv_8x32( void *d0, void *d1, void *d2, void *d3, void *d4, void *d5, void *d6, void *d7, const void *src, const int bit_len ) @@ -1096,8 +867,9 @@ static inline void dintrlv_8x32( void *d0, void *d1, void *d2, void *d3, DLEAVE_8x32( 28 ); DLEAVE_8x32( 29 ); DLEAVE_8x32( 30 ); DLEAVE_8x32( 31 ); } +*/ - +/* default static inline void dintrlv_8x32_512( void *d0, void *d1, void *d2, void *d3, void *d4, void *d5, void *d6, void *d7, const void *src ) { @@ -1110,7 +882,7 @@ static inline void dintrlv_8x32_512( void *d0, void *d1, void *d2, void *d3, DLEAVE_8x32( 12 ); DLEAVE_8x32( 13 ); DLEAVE_8x32( 14 ); DLEAVE_8x32( 15 ); } - +*/ #undef DLEAVE_8x32 static inline void extr_lane_8x32( void *d, const void *s, @@ -1227,231 +999,8 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src ) #endif // AVX2 // 16x32 -/* -static inline void intrlv_16x32( void *dst, const void *src00, - const void *src01, const void *src02, const void *src03, const void *src04, - const void *src05, const void *src06, const void *src07, const void *src08, - const void *src09, const void *src10, const void *src11, const void *src12, - const void *src13, const void *src14, const void *src15, const int bit_len ) -{ - __m128i *d = (__m128i*)dst; - const __m128i *s00 = (const __m128i*)src00; - const __m128i *s01 = (const __m128i*)src01; - const __m128i *s02 = (const __m128i*)src02; - const __m128i *s03 = (const __m128i*)src03; - const __m128i *s04 = (const __m128i*)src04; - const __m128i *s05 = (const __m128i*)src05; - const __m128i *s06 = (const __m128i*)src06; - const __m128i *s07 = (const __m128i*)src07; - const __m128i *s08 = (const __m128i*)src08; - const __m128i *s09 = (const __m128i*)src09; - const __m128i *s10 = (const __m128i*)src10; - const __m128i *s11 = (const __m128i*)src11; - const __m128i *s12 = (const __m128i*)src12; - const __m128i *s13 = (const __m128i*)src13; - const __m128i *s14 = (const __m128i*)src14; - const __m128i *s15 = (const __m128i*)src15; - MM128_ILEAVE32( d[ 0], d[ 4], d[ 8], d[12], s00[0], s01[0], s02[0], s03[0] ); - MM128_ILEAVE32( d[ 1], d[ 5], d[ 9], d[13], s04[0], s05[0], s06[0], s07[0] ); - MM128_ILEAVE32( d[ 2], d[ 6], d[10], d[14], s08[0], s09[0], s10[0], s11[0] ); - MM128_ILEAVE32( d[ 3], d[ 7], d[11], d[15], s12[0], s13[0], s14[0], s15[0] ); - - MM128_ILEAVE32( d[16], d[20], d[24], d[28], s00[1], s01[1], s02[1], s03[1] ); - MM128_ILEAVE32( d[17], d[21], d[25], d[29], s04[1], s05[1], s06[1], s07[1] ); - MM128_ILEAVE32( d[18], d[22], d[26], d[30], s08[1], s09[1], s10[1], s11[1] ); - MM128_ILEAVE32( d[19], d[23], d[27], d[31], s12[1], s13[1], s14[1], s15[1] ); - - if ( bit_len <= 256 ) return; - - MM128_ILEAVE32( d[32], d[36], d[40], d[44], s00[2], s01[2], s02[2], s03[2] ); - MM128_ILEAVE32( d[33], d[37], d[41], d[45], s04[2], s05[2], s06[2], s07[2] ); - MM128_ILEAVE32( d[34], d[38], d[42], d[46], s08[2], s09[2], s10[2], s11[2] ); - MM128_ILEAVE32( d[35], d[39], 
d[43], d[47], s12[2], s13[2], s14[2], s15[2] ); - - MM128_ILEAVE32( d[48], d[52], d[56], d[60], s00[3], s01[3], s02[3], s03[3] ); - MM128_ILEAVE32( d[49], d[53], d[57], d[61], s04[3], s05[3], s06[3], s07[3] ); - MM128_ILEAVE32( d[50], d[54], d[58], d[62], s08[3], s09[3], s10[3], s11[3] ); - MM128_ILEAVE32( d[51], d[55], d[59], d[63], s12[3], s13[3], s14[3], s15[3] ); - - if ( bit_len <= 512 ) return; - - MM128_ILEAVE32( d[64], d[68], d[72], d[76], s00[4], s01[4], s02[4], s03[4] ); - MM128_ILEAVE32( d[65], d[69], d[73], d[77], s04[4], s05[4], s06[4], s07[4] ); - MM128_ILEAVE32( d[66], d[70], d[74], d[78], s08[4], s09[4], s10[4], s11[4] ); - MM128_ILEAVE32( d[67], d[71], d[75], d[79], s12[4], s13[4], s14[4], s15[4] ); - - if ( bit_len <= 640 ) return; - - MM128_ILEAVE32( d[80], d[84], d[88], d[92], s00[5], s01[5], s02[5], s03[5] ); - MM128_ILEAVE32( d[81], d[85], d[89], d[93], s04[5], s05[5], s06[5], s07[5] ); - MM128_ILEAVE32( d[82], d[86], d[90], d[94], s08[5], s09[5], s10[5], s11[5] ); - MM128_ILEAVE32( d[83], d[87], d[91], d[95], s12[5], s13[5], s14[5], s15[5] ); - - MM128_ILEAVE32( d[ 96], d[100], d[104], d[108], s00[6], s01[6], s02[6], s03[6] ); - MM128_ILEAVE32( d[ 97], d[101], d[105], d[109], s04[6], s05[6], s06[6], s07[6] ); - MM128_ILEAVE32( d[ 98], d[102], d[106], d[110], s08[6], s09[6], s10[6], s11[6] ); - MM128_ILEAVE32( d[ 99], d[103], d[107], d[111], s12[6], s13[6], s14[6], s15[6] ); - - MM128_ILEAVE32( d[112], d[116], d[120], d[124], s00[7], s01[7], s02[7], s03[7] ); - MM128_ILEAVE32( d[113], d[117], d[121], d[125], s04[7], s05[7], s06[7], s07[7] ); - MM128_ILEAVE32( d[114], d[118], d[122], d[126], s08[7], s09[7], s10[7], s11[7] ); - MM128_ILEAVE32( d[115], d[119], d[123], d[127], s12[7], s13[7], s14[7], s15[7] ); -} - -// Not used, only potential use is with AVX512 -#if defined(__AVX2__) - -static inline void intrlv_16x32_256( void *dst, const void *src00, - const void *src01, const void *src02, const void *src03, const void *src04, - const void *src05, const void *src06, const void *src07, const void *src08, - const void *src09, const void *src10, const void *src11, const void *src12, - const void *src13, const void *src14, const void *src15 ) -{ - __m256i *d = (__m256i*)dst; - const __m256i *s00 = (const __m256i*)src00; - const __m256i *s01 = (const __m256i*)src01; - const __m256i *s02 = (const __m256i*)src02; - const __m256i *s03 = (const __m256i*)src03; - const __m256i *s04 = (const __m256i*)src04; - const __m256i *s05 = (const __m256i*)src05; - const __m256i *s06 = (const __m256i*)src06; - const __m256i *s07 = (const __m256i*)src07; - const __m256i *s08 = (const __m256i*)src08; - const __m256i *s09 = (const __m256i*)src09; - const __m256i *s10 = (const __m256i*)src10; - const __m256i *s11 = (const __m256i*)src11; - const __m256i *s12 = (const __m256i*)src12; - const __m256i *s13 = (const __m256i*)src13; - const __m256i *s14 = (const __m256i*)src14; - const __m256i *s15 = (const __m256i*)src15; - __m256i dt0, dt1, dt2, dt3, dt4, dt5, dt6, dt7; - - MM256_ILEAVE32( dt0, dt1, dt2, dt3, s00[0], s01[0], s02[0], s03[0] ); - MM256_ILEAVE32( dt4, dt5, dt6, dt7, s04[0], s05[0], s06[0], s07[0] ); - - d[ 0] = _mm256_permute2x128_si256( dt0, dt4, 0x20 ); - d[ 8] = _mm256_permute2x128_si256( dt0, dt4, 0x31 ); - d[ 2] = _mm256_permute2x128_si256( dt1, dt5, 0x20 ); - d[10] = _mm256_permute2x128_si256( dt1, dt5, 0x31 ); - d[ 4] = _mm256_permute2x128_si256( dt2, dt6, 0x20 ); - d[12] = _mm256_permute2x128_si256( dt2, dt6, 0x31 ); - d[ 6] = _mm256_permute2x128_si256( dt3, dt7, 0x20 ); - 
d[14] = _mm256_permute2x128_si256( dt3, dt7, 0x31 ); - - MM256_ILEAVE32( dt0, dt1, dt1, dt3, s08[0], s09[0], s10[0], s11[0] ); - MM256_ILEAVE32( dt4, dt5, dt6, dt7, s12[0], s13[0], s14[0], s15[0] ); - - d[ 1] = _mm256_permute2x128_si256( dt0, dt4, 0x20 ); - d[ 9] = _mm256_permute2x128_si256( dt0, dt4, 0x31 ); - d[ 3] = _mm256_permute2x128_si256( dt1, dt5, 0x20 ); - d[11] = _mm256_permute2x128_si256( dt1, dt5, 0x31 ); - d[ 5] = _mm256_permute2x128_si256( dt2, dt6, 0x20 ); - d[13] = _mm256_permute2x128_si256( dt2, dt6, 0x31 ); - d[ 7] = _mm256_permute2x128_si256( dt3, dt7, 0x20 ); - d[15] = _mm256_permute2x128_si256( dt3, dt7, 0x31 ); -} -#endif - -// Not used -static inline void intrlv_16x32_512( void *dst, const void *src00, - const void *src01, const void *src02, const void *src03, const void *src04, - const void *src05, const void *src06, const void *src07, const void *src08, - const void *src09, const void *src10, const void *src11, const void *src12, - const void *src13, const void *src14, const void *src15 ) -{ -#if defined(__AVX512F__) - - __m512i *d = (__m512i*)dst; - const __m512i *s00 = (const __m512i*)src00; - const __m512i *s01 = (const __m512i*)src01; - const __m512i *s02 = (const __m512i*)src02; - const __m512i *s03 = (const __m512i*)src03; - const __m512i *s04 = (const __m512i*)src04; - const __m512i *s05 = (const __m512i*)src05; - const __m512i *s06 = (const __m512i*)src06; - const __m512i *s07 = (const __m512i*)src07; - const __m512i *s08 = (const __m512i*)src08; - const __m512i *s09 = (const __m512i*)src09; - const __m512i *s10 = (const __m512i*)src10; - const __m512i *s11 = (const __m512i*)src11; - const __m512i *s12 = (const __m512i*)src12; - const __m512i *s13 = (const __m512i*)src13; - const __m512i *s14 = (const __m512i*)src14; - const __m512i *s15 = (const __m512i*)src15; - __m512i st00, st01, st02, st03, st04, st05, st06, st07, - st08, st09, st10, st11, st12, st13, st14, st15, - t0, t1, t2, t3; - - MM512_ILEAVE32( st00, st01, st02, st03, s00[0], s01[0], s02[0], s03[0] ); - MM512_ILEAVE32( st04, st05, st06, st07, s04[0], s05[0], s06[0], s07[0] ); - MM512_ILEAVE32( st08, st09, st10, st11, s08[0], s09[0], s10[0], s11[0] ); - MM512_ILEAVE32( st12, st13, st14, st15, s12[0], s13[0], s14[0], s15[0] ); - - t0 = _mm512_shuffle_i32x4( st00, st04, 0x88 ); - t1 = _mm512_shuffle_i32x4( st00, st04, 0xdd ); - t2 = _mm512_shuffle_i32x4( st08, st12, 0x88 ); - t3 = _mm512_shuffle_i32x4( st08, st12, 0xdd ); - - d[ 0] = _mm512_shuffle_i32x4( t0, t2, 0x88 ); - d[ 8] = _mm512_shuffle_i32x4( t0, t2, 0xdd ); - d[ 4] = _mm512_shuffle_i32x4( t1, t3, 0x88 ); - d[12] = _mm512_shuffle_i32x4( t1, t3, 0xdd ); - - t0 = _mm512_shuffle_i32x4( st01, st05, 0x88 ); - t1 = _mm512_shuffle_i32x4( st01, st05, 0xdd ); - t2 = _mm512_shuffle_i32x4( st09, st13, 0x88 ); - t3 = _mm512_shuffle_i32x4( st09, st13, 0xdd ); - - d[ 1] = _mm512_shuffle_i32x4( t0, t2, 0x88 ); - d[ 9] = _mm512_shuffle_i32x4( t0, t2, 0xdd ); - d[ 5] = _mm512_shuffle_i32x4( t1, t3, 0x88 ); - d[13] = _mm512_shuffle_i32x4( t1, t3, 0xdd ); - - t0 = _mm512_shuffle_i32x4( st02, st06, 0x88 ); - t1 = _mm512_shuffle_i32x4( st02, st06, 0xdd ); - t2 = _mm512_shuffle_i32x4( st10, st14, 0x88 ); - t3 = _mm512_shuffle_i32x4( st10, st14, 0xdd ); - - d[ 2] = _mm512_shuffle_i32x4( t0, t2, 0x88 ); - d[10] = _mm512_shuffle_i32x4( t0, t2, 0xdd ); - d[ 6] = _mm512_shuffle_i32x4( t1, t3, 0x88 ); - d[14] = _mm512_shuffle_i32x4( t1, t3, 0xdd ); - - t0 = _mm512_shuffle_i32x4( st03, st07, 0x88 ); - t1 = _mm512_shuffle_i32x4( st03, st07, 0xdd ); - t2 = 
_mm512_shuffle_i32x4( st11, st15, 0x88 ); - t3 = _mm512_shuffle_i32x4( st11, st15, 0xdd ); - - d[ 3] = _mm512_shuffle_i32x4( t0, t2, 0x88 ); - d[11] = _mm512_shuffle_i32x4( t0, t2, 0xdd ); - d[ 7] = _mm512_shuffle_i32x4( t1, t3, 0x88 ); - d[15] = _mm512_shuffle_i32x4( t1, t3, 0xdd ); - -#endif -} -*/ - -#define ILEAVE_16x32( i ) do \ -{ \ - uint32_t *d = (uint32_t*)(dst) + ( (i) << 4 ); \ - d[ 0] = *( (const uint32_t*)(s00) +(i) ); \ - d[ 1] = *( (const uint32_t*)(s01) +(i) ); \ - d[ 2] = *( (const uint32_t*)(s02) +(i) ); \ - d[ 3] = *( (const uint32_t*)(s03) +(i) ); \ - d[ 4] = *( (const uint32_t*)(s04) +(i) ); \ - d[ 5] = *( (const uint32_t*)(s05) +(i) ); \ - d[ 6] = *( (const uint32_t*)(s06) +(i) ); \ - d[ 7] = *( (const uint32_t*)(s07) +(i) ); \ - d[ 8] = *( (const uint32_t*)(s08) +(i) ); \ - d[ 9] = *( (const uint32_t*)(s09) +(i) ); \ - d[10] = *( (const uint32_t*)(s10) +(i) ); \ - d[11] = *( (const uint32_t*)(s11) +(i) ); \ - d[12] = *( (const uint32_t*)(s12) +(i) ); \ - d[13] = *( (const uint32_t*)(s13) +(i) ); \ - d[14] = *( (const uint32_t*)(s14) +(i) ); \ - d[15] = *( (const uint32_t*)(s15) +(i) ); \ -} while(0) +#if defined(__SSE4_1__) static inline void intrlv_16x32( void *dst, const void *s00, const void *s01, const void *s02, const void *s03, const void *s04, @@ -1459,226 +1008,353 @@ static inline void intrlv_16x32( void *dst, const void *s00, const void *s09, const void *s10, const void *s11, const void *s12, const void *s13, const void *s14, const void *s15, const int bit_len ) { - ILEAVE_16x32( 0 ); ILEAVE_16x32( 1 ); - ILEAVE_16x32( 2 ); ILEAVE_16x32( 3 ); - ILEAVE_16x32( 4 ); ILEAVE_16x32( 5 ); - ILEAVE_16x32( 6 ); ILEAVE_16x32( 7 ); + __m128i D0, D1, D2, D3, S0, S1, S2, S3; + + LOAD_SRCE( S0, S1, S2, S3, s00, 0, s01, 0, s02, 0, s03, 0 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s04, 0, s05, 0, s06, 0, s07, 0 ); + STORE_DEST( D0, D1, D2, D3, dst, 0, dst, 4, dst, 8, dst, 12 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s08, 0, s09, 0, s10, 0, s11, 0 ); + STORE_DEST( D0, D1, D2, D3, dst, 1, dst, 5, dst, 9, dst, 13 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s12, 0, s13, 0, s14, 0, s15, 0 ); + STORE_DEST( D0, D1, D2, D3, dst, 2, dst, 6, dst, 10, dst, 14 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s00, 1, s01, 1, s02, 1, s03, 1 ); + STORE_DEST( D0, D1, D2, D3, dst, 3, dst, 7, dst, 11, dst, 15 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s04, 1, s05, 1, s06, 1, s07, 1 ); + STORE_DEST( D0, D1, D2, D3, dst, 16, dst, 20, dst, 24, dst, 28 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s08, 1, s09, 1, s10, 1, s11, 1 ); + STORE_DEST( D0, D1, D2, D3, dst, 17, dst, 21, dst, 25, dst, 29 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s12, 1, s13, 1, s14, 1, s15, 1 ); + STORE_DEST( D0, D1, D2, D3, dst, 18, dst, 22, dst, 26, dst, 30 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + STORE_DEST( D0, D1, D2, D3, dst, 19, dst, 23, dst, 27, dst, 31 ); + if ( bit_len <= 256 ) return; - ILEAVE_16x32( 8 ); ILEAVE_16x32( 9 ); - ILEAVE_16x32( 10 ); ILEAVE_16x32( 11 ); - ILEAVE_16x32( 12 ); ILEAVE_16x32( 13 ); - ILEAVE_16x32( 14 ); ILEAVE_16x32( 15 ); + + LOAD_SRCE( S0, S1, S2, S3, s00, 2, s01, 2, s02, 2, s03, 2 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s04, 2, s05, 2, s06, 2, s07, 2 ); + STORE_DEST( D0, D1, 
D2, D3, dst, 32, dst, 36, dst, 40, dst, 44 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s08, 2, s09, 2, s10, 2, s11, 2 ); + STORE_DEST( D0, D1, D2, D3, dst, 33, dst, 37, dst, 41, dst, 45 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s12, 2, s13, 2, s14, 2, s15, 2 ); + STORE_DEST( D0, D1, D2, D3, dst, 34, dst, 38, dst, 42, dst, 46 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s00, 3, s01, 3, s02, 3, s03, 3 ); + STORE_DEST( D0, D1, D2, D3, dst, 35, dst, 39, dst, 43, dst, 47 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s04, 3, s05, 3, s06, 3, s07, 3 ); + STORE_DEST( D0, D1, D2, D3, dst, 48, dst, 52, dst, 56, dst, 60 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s08, 3, s09, 3, s10, 3, s11, 3 ); + STORE_DEST( D0, D1, D2, D3, dst, 49, dst, 53, dst, 57, dst, 61 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s12, 3, s13, 3, s14, 3, s15, 3 ); + STORE_DEST( D0, D1, D2, D3, dst, 50, dst, 54, dst, 58, dst, 62 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + STORE_DEST( D0, D1, D2, D3, dst, 51, dst, 55, dst, 59, dst, 63 ); + if ( bit_len <= 512 ) return; - ILEAVE_16x32( 16 ); ILEAVE_16x32( 17 ); - ILEAVE_16x32( 18 ); ILEAVE_16x32( 19 ); + + LOAD_SRCE( S0, S1, S2, S3, s00, 4, s01, 4, s02, 4, s03, 4 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s04, 4, s05, 4, s06, 4, s07, 4 ); + STORE_DEST( D0, D1, D2, D3, dst, 64, dst, 68, dst, 72, dst, 76 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s08, 4, s09, 4, s10, 4, s11, 4 ); + STORE_DEST( D0, D1, D2, D3, dst, 65, dst, 69, dst, 73, dst, 77 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s12, 4, s13, 4, s14, 4, s15, 4 ); + STORE_DEST( D0, D1, D2, D3, dst, 66, dst, 70, dst, 74, dst, 78 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + STORE_DEST( D0, D1, D2, D3, dst, 67, dst, 71, dst, 75, dst, 79 ); + if ( bit_len <= 640 ) return; - ILEAVE_16x32( 20 ); ILEAVE_16x32( 21 ); - ILEAVE_16x32( 22 ); ILEAVE_16x32( 23 ); - ILEAVE_16x32( 24 ); ILEAVE_16x32( 25 ); - ILEAVE_16x32( 26 ); ILEAVE_16x32( 27 ); - ILEAVE_16x32( 28 ); ILEAVE_16x32( 29 ); - ILEAVE_16x32( 30 ); ILEAVE_16x32( 31 ); + + LOAD_SRCE( S0, S1, S2, S3, s00, 5, s01, 5, s02, 5, s03, 5 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s04, 5, s05, 5, s06, 5, s07, 5 ); + STORE_DEST( D0, D1, D2, D3, dst, 80, dst, 84, dst, 88, dst, 92 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s08, 5, s09, 5, s10, 5, s11, 5 ); + STORE_DEST( D0, D1, D2, D3, dst, 81, dst, 85, dst, 89, dst, 93 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s12, 5, s13, 5, s14, 5, s15, 5 ); + STORE_DEST( D0, D1, D2, D3, dst, 82, dst, 86, dst, 90, dst, 94 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s00, 6, s01, 6, s02, 6, s03, 6 ); + STORE_DEST( D0, D1, D2, D3, dst, 83, dst, 87, dst, 91, dst, 95 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s04, 6, s05, 6, s06, 6, s07, 6 ); + STORE_DEST( D0, D1, D2, D3, dst, 96, dst, 100, dst, 104, dst, 108 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s08, 6, s09, 6, s10, 6, s11, 6 ); + STORE_DEST( D0, D1, D2, D3, dst, 97, dst, 101, dst, 105, dst, 109 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, 
S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s12, 6, s13, 6, s14, 6, s15, 6 ); + STORE_DEST( D0, D1, D2, D3, dst, 98, dst, 102, dst, 106, dst, 110 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s00, 7, s01, 7, s02, 7, s03, 7 ); + STORE_DEST( D0, D1, D2, D3, dst, 99, dst, 103, dst, 107, dst, 111 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s04, 7, s05, 7, s06, 7, s07, 7 ); + STORE_DEST( D0, D1, D2, D3, dst, 112, dst, 116, dst, 120, dst, 124 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s08, 7, s09, 7, s10, 7, s11, 7 ); + STORE_DEST( D0, D1, D2, D3, dst, 113, dst, 117, dst, 121, dst, 125 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s12, 7, s13, 7, s14, 7, s15, 7 ); + STORE_DEST( D0, D1, D2, D3, dst, 114, dst, 118, dst, 122, dst, 126 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + STORE_DEST( D0, D1, D2, D3, dst, 115, dst, 119, dst, 123, dst, 127 ); + +// if ( bit_len <= 1024 ) return; } - +// not used static inline void intrlv_16x32_512( void *dst, const void *s00, const void *s01, const void *s02, const void *s03, const void *s04, const void *s05, const void *s06, const void *s07, const void *s08, const void *s09, const void *s10, const void *s11, const void *s12, const void *s13, const void *s14, const void *s15 ) { - ILEAVE_16x32( 0 ); ILEAVE_16x32( 1 ); - ILEAVE_16x32( 2 ); ILEAVE_16x32( 3 ); - ILEAVE_16x32( 4 ); ILEAVE_16x32( 5 ); - ILEAVE_16x32( 6 ); ILEAVE_16x32( 7 ); - ILEAVE_16x32( 8 ); ILEAVE_16x32( 9 ); - ILEAVE_16x32( 10 ); ILEAVE_16x32( 11 ); - ILEAVE_16x32( 12 ); ILEAVE_16x32( 13 ); - ILEAVE_16x32( 14 ); ILEAVE_16x32( 15 ); + __m128i D0, D1, D2, D3, S0, S1, S2, S3; + + LOAD_SRCE( S0, S1, S2, S3, s00, 0, s01, 0, s02, 0, s03, 0 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s04, 0, s05, 0, s06, 0, s07, 0 ); + STORE_DEST( D0, D1, D2, D3, dst, 0, dst, 4, dst, 8, dst, 12 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s08, 0, s09, 0, s10, 0, s11, 0 ); + STORE_DEST( D0, D1, D2, D3, dst, 1, dst, 5, dst, 9, dst, 13 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s12, 0, s13, 0, s14, 0, s15, 0 ); + STORE_DEST( D0, D1, D2, D3, dst, 2, dst, 6, dst, 10, dst, 14 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s00, 1, s01, 1, s02, 1, s03, 1 ); + STORE_DEST( D0, D1, D2, D3, dst, 3, dst, 7, dst, 11, dst, 15 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s04, 1, s05, 1, s06, 1, s07, 1 ); + STORE_DEST( D0, D1, D2, D3, dst, 16, dst, 20, dst, 24, dst, 28 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s08, 1, s09, 1, s10, 1, s11, 1 ); + STORE_DEST( D0, D1, D2, D3, dst, 17, dst, 21, dst, 25, dst, 29 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s12, 1, s13, 1, s14, 1, s15, 1 ); + STORE_DEST( D0, D1, D2, D3, dst, 18, dst, 22, dst, 26, dst, 30 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s00, 2, s01, 2, s02, 2, s03, 2 ); + STORE_DEST( D0, D1, D2, D3, dst, 19, dst, 23, dst, 27, dst, 31 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s04, 2, s05, 2, s06, 2, s07, 2 ); + STORE_DEST( D0, D1, D2, D3, dst, 32, dst, 36, dst, 40, dst, 44 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s08, 2, s09, 2, s10, 2, s11, 2 ); + STORE_DEST( D0, D1, 
D2, D3, dst, 33, dst, 37, dst, 41, dst, 45 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s12, 2, s13, 2, s14, 2, s15, 2 ); + STORE_DEST( D0, D1, D2, D3, dst, 34, dst, 38, dst, 42, dst, 46 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s00, 3, s01, 3, s02, 3, s03, 3 ); + STORE_DEST( D0, D1, D2, D3, dst, 35, dst, 39, dst, 43, dst, 47 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s04, 3, s05, 3, s06, 3, s07, 3 ); + STORE_DEST( D0, D1, D2, D3, dst, 48, dst, 52, dst, 56, dst, 60 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s08, 3, s09, 3, s10, 3, s11, 3 ); + STORE_DEST( D0, D1, D2, D3, dst, 49, dst, 53, dst, 57, dst, 61 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, s12, 3, s13, 3, s14, 3, s15, 3 ); + STORE_DEST( D0, D1, D2, D3, dst, 50, dst, 54, dst, 58, dst, 62 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + STORE_DEST( D0, D1, D2, D3, dst, 51, dst, 55, dst, 59, dst, 63 ); } -#undef ILEAVE_16x32 - -/* static inline void dintrlv_16x32( void *dst00, void *dst01, void *dst02, void *dst03, void *dst04, void *dst05, void *dst06, void *dst07, void *dst08, void *dst09, void *dst10, void *dst11, void *dst12, void *dst13, void *dst14, void *dst15, const void *src, const int bit_len ) { - __m128i *d00 = (__m128i*)dst00; - __m128i *d01 = (__m128i*)dst01; - __m128i *d02 = (__m128i*)dst02; - __m128i *d03 = (__m128i*)dst03; - __m128i *d04 = (__m128i*)dst04; - __m128i *d05 = (__m128i*)dst05; - __m128i *d06 = (__m128i*)dst06; - __m128i *d07 = (__m128i*)dst07; - __m128i *d08 = (__m128i*)dst08; - __m128i *d09 = (__m128i*)dst09; - __m128i *d10 = (__m128i*)dst10; - __m128i *d11 = (__m128i*)dst11; - __m128i *d12 = (__m128i*)dst12; - __m128i *d13 = (__m128i*)dst13; - __m128i *d14 = (__m128i*)dst14; - __m128i *d15 = (__m128i*)dst15; - const __m128i *s = (const __m128i*)src; + __m128i D0, D1, D2, D3, S0, S1, S2, S3; - MM128_ILEAVE32( d00[0], d01[0], d02[0], d03[0], s[ 0], s[ 4], s[ 8], s[12] ); - MM128_ILEAVE32( d04[0], d05[0], d06[0], d07[0], s[ 1], s[ 5], s[ 9], s[13] ); - MM128_ILEAVE32( d08[0], d09[0], d10[0], d11[0], s[ 2], s[ 6], s[10], s[14] ); - MM128_ILEAVE32( d12[0], d13[0], d14[0], d15[0], s[ 3], s[ 7], s[11], s[15] ); - - MM128_ILEAVE32( d00[1], d01[1], d02[1], d03[1], s[16], s[20], s[24], s[28] ); - MM128_ILEAVE32( d04[1], d05[1], d06[1], d07[1], s[17], s[21], s[25], s[29] ); - MM128_ILEAVE32( d08[1], d09[1], d10[1], d11[1], s[18], s[22], s[26], s[30] ); - MM128_ILEAVE32( d12[1], d13[1], d14[1], d15[1], s[19], s[23], s[27], s[31] ); + LOAD_SRCE( S0, S1, S2, S3, src, 0, src, 4, src, 8, src, 12 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 1, src, 5, src, 9, src, 13 ); + STORE_DEST( D0, D1, D2, D3, dst00, 0, dst01, 0, dst02, 0, dst03, 0 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 2, src, 6, src, 10, src, 14 ); + STORE_DEST( D0, D1, D2, D3, dst04, 0, dst05, 0, dst06, 0, dst07, 0 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 3, src, 7, src, 11, src, 15 ); + STORE_DEST( D0, D1, D2, D3, dst08, 0, dst09, 0, dst10, 0, dst11, 0 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 16, src, 20, src, 24, src, 28 ); + STORE_DEST( D0, D1, D2, D3, dst12, 0, dst13, 0, dst14, 0, dst15, 0 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 17, src, 21, src, 
25, src, 29 ); + STORE_DEST( D0, D1, D2, D3, dst00, 1, dst01, 1, dst02, 1, dst03, 1 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 18, src, 22, src, 26, src, 30 ); + STORE_DEST( D0, D1, D2, D3, dst04, 1, dst05, 1, dst06, 1, dst07, 1 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 19, src, 23, src, 27, src, 31 ); + STORE_DEST( D0, D1, D2, D3, dst08, 1, dst09, 1, dst10, 1, dst11, 1 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + STORE_DEST( D0, D1, D2, D3, dst12, 1, dst13, 1, dst14, 1, dst15, 1 ); if ( bit_len <= 256 ) return; - MM128_ILEAVE32( d00[2], d01[2], d02[2], d03[2], s[32], s[36], s[40], s[44] ); - MM128_ILEAVE32( d04[2], d05[2], d06[2], d07[2], s[33], s[37], s[41], s[45] ); - MM128_ILEAVE32( d08[2], d09[2], d10[2], d11[2], s[34], s[38], s[42], s[46] ); - MM128_ILEAVE32( d12[2], d13[2], d14[2], d15[2], s[35], s[39], s[43], s[47] ); - - MM128_ILEAVE32( d00[3], d01[3], d02[3], d03[3], s[48], s[52], s[56], s[60] ); - MM128_ILEAVE32( d04[3], d05[3], d06[3], d07[3], s[49], s[53], s[57], s[61] ); - MM128_ILEAVE32( d08[3], d09[3], d10[3], d11[3], s[50], s[54], s[58], s[62] ); - MM128_ILEAVE32( d12[3], d13[3], d14[3], d15[3], s[51], s[55], s[59], s[63] ); + LOAD_SRCE( S0, S1, S2, S3, src, 32, src, 36, src, 40, src, 44 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 33, src, 37, src, 41, src, 45 ); + STORE_DEST( D0, D1, D2, D3, dst00, 2, dst01, 2, dst02, 2, dst03, 2 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 34, src, 38, src, 42, src, 46 ); + STORE_DEST( D0, D1, D2, D3, dst04, 2, dst05, 2, dst06, 2, dst07, 2 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 35, src, 39, src, 43, src, 47 ); + STORE_DEST( D0, D1, D2, D3, dst08, 2, dst09, 2, dst10, 2, dst11, 2 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 48, src, 52, src, 56, src, 60 ); + STORE_DEST( D0, D1, D2, D3, dst12, 2, dst13, 2, dst14, 2, dst15, 2 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 49, src, 53, src, 57, src, 61 ); + STORE_DEST( D0, D1, D2, D3, dst00, 3, dst01, 3, dst02, 3, dst03, 3 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 50, src, 54, src, 58, src, 62 ); + STORE_DEST( D0, D1, D2, D3, dst04, 3, dst05, 3, dst06, 3, dst07, 3 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 51, src, 55, src, 59, src, 63 ); + STORE_DEST( D0, D1, D2, D3, dst08, 3, dst09, 3, dst10, 3, dst11, 3 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + STORE_DEST( D0, D1, D2, D3, dst12, 3, dst13, 3, dst14, 3, dst15, 3 ); if ( bit_len <= 512 ) return; - MM128_ILEAVE32( d00[4], d01[4], d02[4], d03[4], s[64], s[68], s[72], s[76] ); - MM128_ILEAVE32( d04[4], d05[4], d06[4], d07[4], s[65], s[69], s[73], s[77] ); - MM128_ILEAVE32( d08[4], d09[4], d10[4], d11[4], s[66], s[70], s[74], s[78] ); - MM128_ILEAVE32( d12[4], d13[4], d14[4], d15[4], s[67], s[71], s[75], s[79] ); - + LOAD_SRCE( S0, S1, S2, S3, src, 64, src, 68, src, 72, src, 76 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 65, src, 69, src, 73, src, 77 ); + STORE_DEST( D0, D1, D2, D3, dst00, 4, dst01, 4, dst02, 4, dst03, 4 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 66, src, 70, src, 74, src, 78 ); + STORE_DEST( D0, D1, D2, D3, dst04, 4, dst05, 4, dst06, 4, dst07, 4 ); 
+ ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 67, src, 71, src, 75, src, 79 ); + STORE_DEST( D0, D1, D2, D3, dst08, 4, dst09, 4, dst10, 4, dst11, 4 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + STORE_DEST( D0, D1, D2, D3, dst12, 4, dst13, 4, dst14, 4, dst15, 4 ); + if ( bit_len <= 640 ) return; + + LOAD_SRCE( S0, S1, S2, S3, src, 80, src, 84, src, 88, src, 92 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 81, src, 85, src, 89, src, 93 ); + STORE_DEST( D0, D1, D2, D3, dst00, 5, dst01, 5, dst02, 5, dst03, 5 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 82, src, 86, src, 90, src, 94 ); + STORE_DEST( D0, D1, D2, D3, dst04, 5, dst05, 5, dst06, 5, dst07, 5 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 83, src, 87, src, 91, src, 95 ); + STORE_DEST( D0, D1, D2, D3, dst08, 5, dst09, 5, dst10, 5, dst11, 5 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + STORE_DEST( D0, D1, D2, D3, dst12, 5, dst13, 5, dst14, 5, dst15, 5 ); + LOAD_SRCE( S0, S1, S2, S3, src, 96, src, 100, src, 104, src, 108 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 97, src, 101, src, 105, src, 109 ); + STORE_DEST( D0, D1, D2, D3, dst00, 6, dst01, 6, dst02, 6, dst03, 6 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 98, src, 102, src, 106, src, 110 ); + STORE_DEST( D0, D1, D2, D3, dst04, 6, dst05, 6, dst06, 6, dst07, 6 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 99, src, 103, src, 107, src, 111 ); + STORE_DEST( D0, D1, D2, D3, dst08, 6, dst09, 6, dst10, 6, dst11, 6 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 112, src, 116, src, 120, src, 124 ); + STORE_DEST( D0, D1, D2, D3, dst12, 6, dst13, 6, dst14, 6, dst15, 6 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 113, src, 117, src, 121, src, 125 ); + STORE_DEST( D0, D1, D2, D3, dst00, 7, dst01, 7, dst02, 7, dst03, 7 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 114, src, 118, src, 122, src, 126 ); + STORE_DEST( D0, D1, D2, D3, dst04, 7, dst05, 7, dst06, 7, dst07, 7 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 115, src, 119, src, 123, src, 127 ); + STORE_DEST( D0, D1, D2, D3, dst08, 7, dst09, 7, dst10, 7, dst11, 7 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + STORE_DEST( D0, D1, D2, D3, dst12, 7, dst13, 7, dst14, 7, dst15, 7 ); - MM128_ILEAVE32( d00[5], d01[5], d02[5], d03[5], s[80], s[84], s[88], s[92] ); - MM128_ILEAVE32( d04[5], d05[5], d06[5], d07[5], s[81], s[85], s[89], s[93] ); - MM128_ILEAVE32( d08[5], d09[5], d10[5], d11[5], s[82], s[86], s[90], s[94] ); - MM128_ILEAVE32( d12[5], d13[5], d14[5], d15[5], s[83], s[87], s[91], s[95] ); - - MM128_ILEAVE32( d00[6], d01[6], d02[6], d03[6], s[ 96], s[100], s[104], s[108] ); - MM128_ILEAVE32( d04[6], d05[6], d06[6], d07[6], s[ 97], s[101], s[105], s[109] ); - MM128_ILEAVE32( d08[6], d09[6], d10[6], d11[6], s[ 98], s[102], s[106], s[110] ); - MM128_ILEAVE32( d12[6], d13[6], d14[6], d15[6], s[ 99], s[103], s[107], s[111] ); - - MM128_ILEAVE32( d00[7], d01[7], d02[7], d03[7], s[112], s[116], s[120], s[124] ); - MM128_ILEAVE32( d04[7], d05[7], d06[7], d07[7], s[113], s[117], s[121], s[125] ); - MM128_ILEAVE32( d08[7], d09[7], d10[7], d11[7], s[114], s[118], s[122], s[126] ); - 
MM128_ILEAVE32( d12[7], d13[7], d14[7], d15[7], s[115], s[119], s[123], s[127] ); +// if ( bit_len <= 1024 ) return; } -// 4 interleave algorithms same memory footprint: -// -// 1. 32 bit integer move -// -// Most instructions, all 32 bit loads & stores, use general purpose regs -// -// 2. SSE2 128 bit shuffle -// -// 128 bit loads and stores + fast shuffles, fewer total instructions: .75, -// uses 128 bit simd regs -// -// 3. AVX2 2x128 bit shuffle with 256 bit permute -// -// 256 bit loads and stores + slow 256 bit permutes, even fewer instructions: -// additional .5, uses 256 bit simd regs -// -// 4. AVX2 2x128 bit shuffle with union -// -// 128 bit loads, 256 bit stores + 128 bit moves using union + overhead -// converting from mm128 to mm256, compiler may choose mem ovly or - -static inline void dintrlv_16x32_256( void *dst00, void *dst01, void *dst02, - void *dst03, void *dst04, void *dst05, void *dst06, void *dst07, - void *dst08, void *dst09, void *dst10, void *dst11, void *dst12, - void *dst13, void *dst14, void *dst15, const void *src ) +// not used +static inline void dintrlv_16x32_512( void *dst00, void *dst01, void *dst02, + void *dst03, void *dst04, void *dst05, void *dst06, void *dst07, + void *dst08, void *dst09, void *dst10, void *dst11, void *dst12, + void *dst13, void *dst14, void *dst15, const void *src ) { -#if defined(__AVX2__) -// Can't use AVX512, min bit_len is 512 unless a single contiguous -// output buffer is used. + __m128i D0, D1, D2, D3, S0, S1, S2, S3; - const __m256i *s = (const __m256i*)src; - __m256i *d00 = (__m256i*)dst00; - __m256i *d01 = (__m256i*)dst01; - __m256i *d02 = (__m256i*)dst02; - __m256i *d03 = (__m256i*)dst03; - __m256i *d04 = (__m256i*)dst04; - __m256i *d05 = (__m256i*)dst05; - __m256i *d06 = (__m256i*)dst06; - __m256i *d07 = (__m256i*)dst07; - __m256i *d08 = (__m256i*)dst08; - __m256i *d09 = (__m256i*)dst09; - __m256i *d10 = (__m256i*)dst10; - __m256i *d11 = (__m256i*)dst11; - __m256i *d12 = (__m256i*)dst12; - __m256i *d13 = (__m256i*)dst13; - __m256i *d14 = (__m256i*)dst14; - __m256i *d15 = (__m256i*)dst15; - __m256i st0, st1, st2, st3, st4, st5, st6, st7; - - st0 = _mm256_permute2x128_si256( s[ 0], s[ 8], 0x20 ); - st4 = _mm256_permute2x128_si256( s[ 0], s[ 8], 0x31 ); - st1 = _mm256_permute2x128_si256( s[ 2], s[10], 0x20 ); - st5 = _mm256_permute2x128_si256( s[ 2], s[10], 0x31 ); - st2 = _mm256_permute2x128_si256( s[ 4], s[12], 0x20 ); - st6 = _mm256_permute2x128_si256( s[ 4], s[12], 0x31 ); - st3 = _mm256_permute2x128_si256( s[ 6], s[14], 0x20 ); - st7 = _mm256_permute2x128_si256( s[ 6], s[14], 0x31 ); - - MM256_ILEAVE32( d00[0], d01[0], d02[0], d03[0], st0, st1, st2, st3 ); - MM256_ILEAVE32( d04[0], d05[0], d06[0], d07[0], st4, st5, st6, st7 ); - - st0 = _mm256_permute2x128_si256( s[ 1], s[ 9], 0x20 ); - st4 = _mm256_permute2x128_si256( s[ 1], s[ 9], 0x31 ); - st1 = _mm256_permute2x128_si256( s[ 3], s[11], 0x20 ); - st5 = _mm256_permute2x128_si256( s[ 3], s[11], 0x31 ); - st2 = _mm256_permute2x128_si256( s[ 5], s[13], 0x20 ); - st6 = _mm256_permute2x128_si256( s[ 5], s[13], 0x31 ); - st3 = _mm256_permute2x128_si256( s[ 7], s[15], 0x20 ); - st7 = _mm256_permute2x128_si256( s[ 7], s[15], 0x31 ); - - MM256_ILEAVE32( d08[0], d09[0], d10[0], d11[0], st0, st1, st2, st3 ); - MM256_ILEAVE32( d12[0], d13[0], d14[0], d15[0], st4, st5, st6, st7 ); - - -#else -// not needed, 16x32 is only used with AVX512 - - __m128i *d00 = (__m128i*)dst00; - __m128i *d01 = (__m128i*)dst01; - __m128i *d02 = (__m128i*)dst02; - __m128i *d03 = (__m128i*)dst03; - 
__m128i *d04 = (__m128i*)dst04; - __m128i *d05 = (__m128i*)dst05; - __m128i *d06 = (__m128i*)dst06; - __m128i *d07 = (__m128i*)dst07; - __m128i *d08 = (__m128i*)dst08; - __m128i *d09 = (__m128i*)dst09; - __m128i *d10 = (__m128i*)dst10; - __m128i *d11 = (__m128i*)dst11; - __m128i *d12 = (__m128i*)dst12; - __m128i *d13 = (__m128i*)dst13; - __m128i *d14 = (__m128i*)dst14; - __m128i *d15 = (__m128i*)dst15; - const __m128i *s = (const __m128i*)src; - - MM128_ILEAVE32( d00[0], d01[0], d02[0], d03[0], s[ 0], s[ 4], s[ 8], s[12] ); - MM128_ILEAVE32( d04[0], d05[0], d06[0], d07[0], s[ 1], s[ 5], s[ 9], s[13] ); - MM128_ILEAVE32( d08[0], d09[0], d10[0], d11[0], s[ 2], s[ 6], s[10], s[14] ); - MM128_ILEAVE32( d12[0], d13[0], d14[0], d15[0], s[ 3], s[ 7], s[11], s[15] ); - - MM128_ILEAVE32( d00[1], d01[1], d02[1], d03[1], s[16], s[20], s[24], s[28] ); - MM128_ILEAVE32( d04[1], d05[1], d06[1], d07[1], s[17], s[21], s[25], s[29] ); - MM128_ILEAVE32( d08[1], d09[1], d10[1], d11[1], s[18], s[22], s[26], s[30] ); - MM128_ILEAVE32( d12[1], d13[1], d14[1], d15[1], s[19], s[23], s[27], s[31] ); - -#endif + LOAD_SRCE( S0, S1, S2, S3, src, 0, src, 4, src, 8, src, 12 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 1, src, 5, src, 9, src, 13 ); + STORE_DEST( D0, D1, D2, D3, dst00, 0, dst01, 0, dst02, 0, dst03, 0 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 2, src, 6, src, 10, src, 14 ); + STORE_DEST( D0, D1, D2, D3, dst04, 0, dst05, 0, dst06, 0, dst07, 0 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 3, src, 7, src, 11, src, 15 ); + STORE_DEST( D0, D1, D2, D3, dst08, 0, dst09, 0, dst10, 0, dst11, 0 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 16, src, 20, src, 24, src, 28 ); + STORE_DEST( D0, D1, D2, D3, dst12, 0, dst13, 0, dst14, 0, dst15, 0 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 17, src, 21, src, 25, src, 29 ); + STORE_DEST( D0, D1, D2, D3, dst00, 1, dst01, 1, dst02, 1, dst03, 1 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 18, src, 22, src, 26, src, 30 ); + STORE_DEST( D0, D1, D2, D3, dst04, 1, dst05, 1, dst06, 1, dst07, 1 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 19, src, 23, src, 27, src, 31 ); + STORE_DEST( D0, D1, D2, D3, dst08, 1, dst09, 1, dst10, 1, dst11, 1 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 32, src, 36, src, 40, src, 44 ); + STORE_DEST( D0, D1, D2, D3, dst12, 1, dst13, 1, dst14, 1, dst15, 1 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 33, src, 37, src, 41, src, 45 ); + STORE_DEST( D0, D1, D2, D3, dst00, 2, dst01, 2, dst02, 2, dst03, 2 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 34, src, 38, src, 42, src, 46 ); + STORE_DEST( D0, D1, D2, D3, dst04, 2, dst05, 2, dst06, 2, dst07, 2 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 35, src, 39, src, 43, src, 47 ); + STORE_DEST( D0, D1, D2, D3, dst08, 2, dst09, 2, dst10, 2, dst11, 2 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 48, src, 52, src, 56, src, 60 ); + STORE_DEST( D0, D1, D2, D3, dst12, 2, dst13, 2, dst14, 2, dst15, 2 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 49, src, 53, src, 57, src, 61 ); + STORE_DEST( D0, D1, 
D2, D3, dst00, 3, dst01, 3, dst02, 3, dst03, 3 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 50, src, 54, src, 58, src, 62 ); + STORE_DEST( D0, D1, D2, D3, dst04, 3, dst05, 3, dst06, 3, dst07, 3 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + LOAD_SRCE( S0, S1, S2, S3, src, 51, src, 55, src, 59, src, 63 ); + STORE_DEST( D0, D1, D2, D3, dst08, 3, dst09, 3, dst10, 3, dst11, 3 ); + ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); + STORE_DEST( D0, D1, D2, D3, dst12, 3, dst13, 3, dst14, 3, dst15, 3 ); } -*/ + +#endif // SSE4_1 + #define DLEAVE_16x32( i ) do \ { \ @@ -1701,52 +1377,6 @@ static inline void dintrlv_16x32_256( void *dst00, void *dst01, void *dst02, *( (uint32_t*)(d15) +(i) ) = s[15]; \ } while(0) - -static inline void dintrlv_16x32( void *d00, void *d01, void *d02, void *d03, - void *d04, void *d05, void *d06, void *d07, void *d08, void *d09, - void *d10, void *d11, void *d12, void *d13, void *d14, void *d15, - const void *src, const int bit_len ) -{ - DLEAVE_16x32( 0 ); DLEAVE_16x32( 1 ); - DLEAVE_16x32( 2 ); DLEAVE_16x32( 3 ); - DLEAVE_16x32( 4 ); DLEAVE_16x32( 5 ); - DLEAVE_16x32( 6 ); DLEAVE_16x32( 7 ); - if ( bit_len <= 256 ) return; - DLEAVE_16x32( 8 ); DLEAVE_16x32( 9 ); - DLEAVE_16x32( 10 ); DLEAVE_16x32( 11 ); - DLEAVE_16x32( 12 ); DLEAVE_16x32( 13 ); - DLEAVE_16x32( 14 ); DLEAVE_16x32( 15 ); - if ( bit_len <= 512 ) return; - DLEAVE_16x32( 16 ); DLEAVE_16x32( 17 ); - DLEAVE_16x32( 18 ); DLEAVE_16x32( 19 ); - if ( bit_len <= 640 ) return; - DLEAVE_16x32( 20 ); DLEAVE_16x32( 21 ); - DLEAVE_16x32( 22 ); DLEAVE_16x32( 23 ); - DLEAVE_16x32( 24 ); DLEAVE_16x32( 25 ); - DLEAVE_16x32( 26 ); DLEAVE_16x32( 27 ); - DLEAVE_16x32( 28 ); DLEAVE_16x32( 29 ); - DLEAVE_16x32( 30 ); DLEAVE_16x32( 31 ); -} - - -static inline void dintrlv_16x32_512( void *d00, void *d01, void *d02, - void *d03, void *d04, void *d05, void *d06, void *d07, - void *d08, void *d09, void *d10, void *d11, void *d12, - void *d13, void *d14, void *d15, const void *src ) -{ - DLEAVE_16x32( 0 ); DLEAVE_16x32( 1 ); - DLEAVE_16x32( 2 ); DLEAVE_16x32( 3 ); - DLEAVE_16x32( 4 ); DLEAVE_16x32( 5 ); - DLEAVE_16x32( 6 ); DLEAVE_16x32( 7 ); - DLEAVE_16x32( 8 ); DLEAVE_16x32( 9 ); - DLEAVE_16x32( 10 ); DLEAVE_16x32( 11 ); - DLEAVE_16x32( 12 ); DLEAVE_16x32( 13 ); - DLEAVE_16x32( 14 ); DLEAVE_16x32( 15 ); -} - -#undef DLEAVE_16x32 - - static inline void extr_lane_16x32( void *d, const void *s, const int lane, const int bit_len ) { @@ -2116,24 +1746,7 @@ static inline void extr_lane_4x64( void *dst, const void *src, const int lane, return; // bit_len == 512 } -/* -static inline void extr_lane_4x64( void *d, const void *s, - const int lane, const int bit_len ) -{ - ((uint64_t*)d)[ 0] = ((const uint64_t*)s)[ lane ]; - ((uint64_t*)d)[ 1] = ((const uint64_t*)s)[ lane+ 4 ]; - ((uint64_t*)d)[ 2] = ((const uint64_t*)s)[ lane+ 8 ]; - ((uint64_t*)d)[ 3] = ((const uint64_t*)s)[ lane+12 ]; - if ( bit_len <= 256 ) return; - ((uint64_t*)d)[ 4] = ((const uint64_t*)s)[ lane+16 ]; - ((uint64_t*)d)[ 5] = ((const uint64_t*)s)[ lane+20 ]; - ((uint64_t*)d)[ 6] = ((const uint64_t*)s)[ lane+24 ]; - ((uint64_t*)d)[ 7] = ((const uint64_t*)s)[ lane+28 ]; -} -*/ - -#if defined(__AVX2__) -// Doesn't really need AVX2, just SSSE3, but is only used with AVX2 code. 
+#if defined(__SSSE3__) static inline void mm256_intrlv80_4x64( void *d, const void *src ) { @@ -2531,22 +2144,6 @@ static inline void extr_lane_8x64( void *dst, const void *src, const int lane, return; } -/* -static inline void extr_lane_8x64( void *d, const void *s, - const int lane, const int bit_len ) -{ - ((uint64_t*)d)[ 0] = ((const uint64_t*)s)[ lane ]; - ((uint64_t*)d)[ 1] = ((const uint64_t*)s)[ lane+ 8 ]; - ((uint64_t*)d)[ 2] = ((const uint64_t*)s)[ lane+ 16 ]; - ((uint64_t*)d)[ 3] = ((const uint64_t*)s)[ lane+ 24 ]; - if ( bit_len <= 256 ) return; - ((uint64_t*)d)[ 4] = ((const uint64_t*)s)[ lane+ 32 ]; - ((uint64_t*)d)[ 5] = ((const uint64_t*)s)[ lane+ 40 ]; - ((uint64_t*)d)[ 6] = ((const uint64_t*)s)[ lane+ 48 ]; - ((uint64_t*)d)[ 7] = ((const uint64_t*)s)[ lane+ 56 ]; -} -*/ - #if defined(__AVX512F__) && defined(__AVX512VL__) // broadcast to all lanes @@ -2851,74 +2448,135 @@ static inline void dintrlv_2x256( void *dst0, void *dst1, // 4x64 -> 4x32 -#define RLEAVE_4x64_4x32( i ) do \ -{ \ - uint32_t *d = (uint32_t*)dst + (i); \ - const uint32_t *s = (const uint32_t*)src + (i); \ - d[0] = s[0]; d[1] = s[2]; \ - d[2] = s[4]; d[3] = s[6]; \ - d[4] = s[1]; d[5] = s[3]; \ - d[6] = s[5]; d[7] = s[7]; \ -} while(0) - - -// Convert 4x64 byte (256 bit) vectors to 4x32 (128 bit) vectors for AVX -// bit_len must be multiple of 64 static inline void rintrlv_4x64_4x32( void *dst, const void *src, const int bit_len ) { - RLEAVE_4x64_4x32( 0 ); RLEAVE_4x64_4x32( 8 ); - RLEAVE_4x64_4x32( 16 ); RLEAVE_4x64_4x32( 24 ); + const __m128i *s = (const __m128i*)src; + __m128i *d = (__m128i*)dst; + + d[ 0] = mm128_shuffle2_32( s[ 0], s[ 1], 0x88 ); + d[ 1] = mm128_shuffle2_32( s[ 0], s[ 1], 0xdd ); + d[ 2] = mm128_shuffle2_32( s[ 2], s[ 3], 0x88 ); + d[ 3] = mm128_shuffle2_32( s[ 2], s[ 3], 0xdd ); + d[ 4] = mm128_shuffle2_32( s[ 4], s[ 5], 0x88 ); + d[ 5] = mm128_shuffle2_32( s[ 4], s[ 5], 0xdd ); + d[ 6] = mm128_shuffle2_32( s[ 6], s[ 7], 0x88 ); + d[ 7] = mm128_shuffle2_32( s[ 6], s[ 7], 0xdd ); + if ( bit_len <= 256 ) return; - RLEAVE_4x64_4x32( 32 ); RLEAVE_4x64_4x32( 40 ); - RLEAVE_4x64_4x32( 48 ); RLEAVE_4x64_4x32( 56 ); + + d[ 8] = mm128_shuffle2_32( s[ 8], s[ 9], 0x88 ); + d[ 9] = mm128_shuffle2_32( s[ 8], s[ 9], 0xdd ); + d[10] = mm128_shuffle2_32( s[10], s[11], 0x88 ); + d[11] = mm128_shuffle2_32( s[10], s[11], 0xdd ); + d[12] = mm128_shuffle2_32( s[12], s[13], 0x88 ); + d[13] = mm128_shuffle2_32( s[12], s[13], 0xdd ); + d[14] = mm128_shuffle2_32( s[14], s[15], 0x88 ); + d[15] = mm128_shuffle2_32( s[14], s[15], 0xdd ); + if ( bit_len <= 512 ) return; - RLEAVE_4x64_4x32( 64 ); RLEAVE_4x64_4x32( 72 ); - if ( bit_len <= 640 ) return; - RLEAVE_4x64_4x32( 80 ); RLEAVE_4x64_4x32( 88 ); - RLEAVE_4x64_4x32( 96 ); RLEAVE_4x64_4x32( 104 ); - RLEAVE_4x64_4x32( 112 ); RLEAVE_4x64_4x32( 120 ); + + d[16] = mm128_shuffle2_32( s[16], s[17], 0x88 ); + d[17] = mm128_shuffle2_32( s[16], s[17], 0xdd ); + d[18] = mm128_shuffle2_32( s[18], s[19], 0x88 ); + d[19] = mm128_shuffle2_32( s[18], s[19], 0xdd ); + d[20] = mm128_shuffle2_32( s[20], s[21], 0x88 ); + d[21] = mm128_shuffle2_32( s[20], s[21], 0xdd ); + d[22] = mm128_shuffle2_32( s[22], s[23], 0x88 ); + d[23] = mm128_shuffle2_32( s[22], s[23], 0xdd ); + d[24] = mm128_shuffle2_32( s[24], s[25], 0x88 ); + d[25] = mm128_shuffle2_32( s[24], s[25], 0xdd ); + d[26] = mm128_shuffle2_32( s[26], s[27], 0x88 ); + d[27] = mm128_shuffle2_32( s[26], s[27], 0xdd ); + d[28] = mm128_shuffle2_32( s[28], s[29], 0x88 ); + d[29] = mm128_shuffle2_32( s[28], s[29], 0xdd ); + 
d[30] = mm128_shuffle2_32( s[30], s[31], 0x88 ); + d[31] = mm128_shuffle2_32( s[30], s[31], 0xdd ); + +// if ( bit_len <= 1024 ) return; } -#undef RLEAVE_4x64_4x32 - -#define RLEAVE_8x64_8x32( i ) do \ -{ \ - uint32_t *d = (uint32_t*)dst + (i); \ - const uint32_t *s = (const uint32_t*)src + (i); \ - d[ 0] = s[ 0]; d[ 1] = s[ 2]; d[ 2] = s[ 4]; d[ 3] = s[ 6]; \ - d[ 4] = s[ 8]; d[ 5] = s[10]; d[ 6] = s[12]; d[ 7] = s[14]; \ - d[ 8] = s[ 1]; d[ 9] = s[ 3]; d[10] = s[ 5]; d[11] = s[ 7]; \ - d[12] = s[ 9]; d[13] = s[11]; d[14] = s[13]; d[15] = s[15]; \ -} while(0) - - -// 8x64 -> 8x32 - static inline void rintrlv_8x64_8x32( void *dst, const void *src, - const int bit_len ) + const int bit_len ) { - RLEAVE_8x64_8x32( 0 ); RLEAVE_8x64_8x32( 16 ); - RLEAVE_8x64_8x32( 32 ); RLEAVE_8x64_8x32( 48 ); + const __m128i *s = (const __m128i*)src; + __m128i *d = (__m128i*)dst; + + d[ 0] = mm128_shuffle2_32( s[ 0], s[ 1], 0x88 ); + d[ 1] = mm128_shuffle2_32( s[ 2], s[ 3], 0x88 ); + d[ 2] = mm128_shuffle2_32( s[ 0], s[ 1], 0xdd ); + d[ 3] = mm128_shuffle2_32( s[ 2], s[ 3], 0xdd ); + d[ 4] = mm128_shuffle2_32( s[ 4], s[ 5], 0x88 ); + d[ 5] = mm128_shuffle2_32( s[ 6], s[ 7], 0x88 ); + d[ 6] = mm128_shuffle2_32( s[ 4], s[ 5], 0xdd ); + d[ 7] = mm128_shuffle2_32( s[ 6], s[ 7], 0xdd ); + d[ 8] = mm128_shuffle2_32( s[ 8], s[ 9], 0x88 ); + d[ 9] = mm128_shuffle2_32( s[10], s[11], 0x88 ); + d[10] = mm128_shuffle2_32( s[ 8], s[ 9], 0xdd ); + d[11] = mm128_shuffle2_32( s[10], s[11], 0xdd ); + d[12] = mm128_shuffle2_32( s[12], s[13], 0x88 ); + d[13] = mm128_shuffle2_32( s[14], s[15], 0x88 ); + d[14] = mm128_shuffle2_32( s[12], s[13], 0xdd ); + d[15] = mm128_shuffle2_32( s[14], s[15], 0xdd ); if ( bit_len <= 256 ) return; - RLEAVE_8x64_8x32( 64 ); RLEAVE_8x64_8x32( 80 ); - RLEAVE_8x64_8x32( 96 ); RLEAVE_8x64_8x32( 112 ); + d[16] = mm128_shuffle2_32( s[16], s[17], 0x88 ); + d[17] = mm128_shuffle2_32( s[18], s[19], 0x88 ); + d[18] = mm128_shuffle2_32( s[16], s[17], 0xdd ); + d[19] = mm128_shuffle2_32( s[18], s[19], 0xdd ); + d[20] = mm128_shuffle2_32( s[20], s[21], 0x88 ); + d[21] = mm128_shuffle2_32( s[22], s[23], 0x88 ); + d[22] = mm128_shuffle2_32( s[20], s[21], 0xdd ); + d[23] = mm128_shuffle2_32( s[22], s[23], 0xdd ); + d[24] = mm128_shuffle2_32( s[24], s[25], 0x88 ); + d[25] = mm128_shuffle2_32( s[26], s[27], 0x88 ); + d[26] = mm128_shuffle2_32( s[24], s[25], 0xdd ); + d[27] = mm128_shuffle2_32( s[26], s[27], 0xdd ); + d[28] = mm128_shuffle2_32( s[28], s[29], 0x88 ); + d[29] = mm128_shuffle2_32( s[30], s[31], 0x88 ); + d[30] = mm128_shuffle2_32( s[28], s[29], 0xdd ); + d[31] = mm128_shuffle2_32( s[30], s[31], 0xdd ); if ( bit_len <= 512 ) return; - - RLEAVE_8x64_8x32( 128 ); RLEAVE_8x64_8x32( 144 ); - if ( bit_len <= 640 ) return; + d[32] = mm128_shuffle2_32( s[32], s[33], 0x88 ); + d[33] = mm128_shuffle2_32( s[34], s[35], 0x88 ); + d[34] = mm128_shuffle2_32( s[32], s[33], 0xdd ); + d[35] = mm128_shuffle2_32( s[34], s[35], 0xdd ); + d[36] = mm128_shuffle2_32( s[36], s[37], 0x88 ); + d[37] = mm128_shuffle2_32( s[38], s[39], 0x88 ); + d[38] = mm128_shuffle2_32( s[36], s[37], 0xdd ); + d[39] = mm128_shuffle2_32( s[38], s[39], 0xdd ); + d[40] = mm128_shuffle2_32( s[40], s[41], 0x88 ); + d[41] = mm128_shuffle2_32( s[42], s[43], 0x88 ); + d[42] = mm128_shuffle2_32( s[40], s[41], 0xdd ); + d[43] = mm128_shuffle2_32( s[42], s[43], 0xdd ); + d[44] = mm128_shuffle2_32( s[44], s[45], 0x88 ); + d[45] = mm128_shuffle2_32( s[46], s[47], 0x88 ); + d[46] = mm128_shuffle2_32( s[44], s[45], 0xdd ); + d[47] = mm128_shuffle2_32( s[46], 
s[47], 0xdd ); - RLEAVE_8x64_8x32( 160 ); RLEAVE_8x64_8x32( 176 ); - RLEAVE_8x64_8x32( 192 ); RLEAVE_8x64_8x32( 208 ); - RLEAVE_8x64_8x32( 224 ); RLEAVE_8x64_8x32( 240 ); + d[48] = mm128_shuffle2_32( s[48], s[49], 0x88 ); + d[49] = mm128_shuffle2_32( s[50], s[51], 0x88 ); + d[50] = mm128_shuffle2_32( s[48], s[49], 0xdd ); + d[51] = mm128_shuffle2_32( s[50], s[51], 0xdd ); + d[52] = mm128_shuffle2_32( s[52], s[53], 0x88 ); + d[53] = mm128_shuffle2_32( s[54], s[55], 0x88 ); + d[54] = mm128_shuffle2_32( s[52], s[53], 0xdd ); + d[55] = mm128_shuffle2_32( s[54], s[55], 0xdd ); + d[56] = mm128_shuffle2_32( s[56], s[57], 0x88 ); + d[57] = mm128_shuffle2_32( s[58], s[59], 0x88 ); + d[58] = mm128_shuffle2_32( s[56], s[57], 0xdd ); + d[59] = mm128_shuffle2_32( s[58], s[59], 0xdd ); + d[60] = mm128_shuffle2_32( s[60], s[61], 0x88 ); + d[61] = mm128_shuffle2_32( s[62], s[63], 0x88 ); + d[62] = mm128_shuffle2_32( s[60], s[61], 0xdd ); + d[63] = mm128_shuffle2_32( s[62], s[63], 0xdd ); + +// if ( bit_len <= 1024 ) return; } -#undef RLEAVE_8x64_8x32 - // 4x32 -> 4x64 static inline void rintrlv_4x32_4x64( void *dst, @@ -3100,35 +2758,6 @@ static inline void rintrlv_8x32_4x128( void *dst0, void *dst1, } #undef RLEAVE_8X32_4X128 -/* -#define RLEAVE_4x32_4x64(i) do \ -{ \ - uint32_t *d = (uint32_t*)dst + (i); \ - const uint32_t *s = (const uint32_t*)src + (i); \ - d[0] = s[0]; d[1] = s[4]; \ - d[2] = s[1]; d[3] = s[5]; \ - d[4] = s[2]; d[5] = s[6]; \ - d[6] = s[3]; d[7] = s[7]; \ -} while(0) - -static inline void rintrlv_4x32_4x64( void *dst, - const void *src, int bit_len ) -{ - RLEAVE_4x32_4x64( 0 ); RLEAVE_4x32_4x64( 8 ); - RLEAVE_4x32_4x64( 16 ); RLEAVE_4x32_4x64( 24 ); - if ( bit_len <= 256 ) return; - RLEAVE_4x32_4x64( 32 ); RLEAVE_4x32_4x64( 40 ); - RLEAVE_4x32_4x64( 48 ); RLEAVE_4x32_4x64( 56 ); - if ( bit_len <= 512 ) return; - RLEAVE_4x32_4x64( 64 ); RLEAVE_4x32_4x64( 72 ); - RLEAVE_4x32_4x64( 80 ); RLEAVE_4x32_4x64( 88 ); - RLEAVE_4x32_4x64( 96 ); RLEAVE_4x32_4x64( 104 ); - RLEAVE_4x32_4x64( 112 ); RLEAVE_4x32_4x64( 120 ); -} - -#undef RLEAVE_4x32_4x64 -*/ - // 2x128 -> 4x64 static inline void rintrlv_2x128_4x64( void *dst, const void *src0, @@ -3174,32 +2803,6 @@ static inline void rintrlv_2x128_4x64( void *dst, const void *src0, d[31] = _mm_unpackhi_epi64( s1[14], s1[15] ); } -/* -#define RLEAVE_2x128_4x64( i ) do \ -{ \ - uint64_t *d = (uint64_t*)dst + ((i)<<1); \ - const uint64_t *s0 = (const uint64_t*)src0 + (i); \ - const uint64_t *s1 = (const uint64_t*)src1 + (i); \ - d[0] = s0[0]; d[1] = s0[2]; \ - d[2] = s1[0]; d[3] = s1[2]; \ - d[4] = s0[1]; d[5] = s0[3]; \ - d[6] = s1[1]; d[7] = s1[3]; \ -} while(0) - -static inline void rintrlv_2x128_4x64( void *dst, const void *src0, - const void *src1, int bit_len ) -{ - RLEAVE_2x128_4x64( 0 ); RLEAVE_2x128_4x64( 4 ); - if ( bit_len <= 256 ) return; - RLEAVE_2x128_4x64( 8 ); RLEAVE_2x128_4x64( 12 ); - if ( bit_len <= 512 ) return; - RLEAVE_2x128_4x64( 16 ); RLEAVE_2x128_4x64( 20 ); - RLEAVE_2x128_4x64( 24 ); RLEAVE_2x128_4x64( 28 ); -} - -#undef RLEAVE_2x128_4x64 -*/ - // 4x64 -> 2x128 static inline void rintrlv_4x64_2x128( void *dst0, void *dst1, @@ -3245,30 +2848,6 @@ static inline void rintrlv_4x64_2x128( void *dst0, void *dst1, d1[15] = _mm_unpackhi_epi64( s[29], s[31] ); } -/* -#define RLEAVE_4x64_2x128( i ) do \ -{ \ - uint64_t *d0 = (uint64_t*)dst0 + (i); \ - uint64_t *d1 = (uint64_t*)dst1 + (i); \ - const uint64_t *s = (const uint64_t*)src + ((i)<<1); \ - d0[0] = s[0]; d0[1] = s[4]; \ - d0[2] = s[1]; d0[3] = s[5]; \ - d1[0] = s[2]; d1[1] = 
s[6]; \ - d1[2] = s[3]; d1[3] = s[7]; \ -} while(0) - -static inline void rintrlv_4x64_2x128( void *dst0, void *dst1, - const void *src, int bit_len ) -{ - RLEAVE_4x64_2x128( 0 ); RLEAVE_4x64_2x128( 4 ); - if ( bit_len <= 256 ) return; - RLEAVE_4x64_2x128( 8 ); RLEAVE_4x64_2x128( 12 ); - if ( bit_len <= 512 ) return; - RLEAVE_4x64_2x128( 16 ); RLEAVE_4x64_2x128( 20 ); - RLEAVE_4x64_2x128( 24 ); RLEAVE_4x64_2x128( 28 ); -} -*/ - // 2x128 -> 8x64 static inline void rintrlv_4x128_8x64( void *dst, const void *src0, @@ -3687,4 +3266,9 @@ do { \ } while(0) #endif // AVX512 + +#undef ILEAVE_4x32 +#undef LOAD_SRCE +#undef ILEAVE_STORE_DEST + #endif // INTERLEAVE_H__ diff --git a/util.c b/util.c index f71421a..b0ef430 100644 --- a/util.c +++ b/util.c @@ -553,6 +553,7 @@ json_t *json_rpc_call(CURL *curl, const char *url, long timeout = (flags & JSON_RPC_LONGPOLL) ? opt_timeout : 30; struct header_info hi = {0}; + all_data.headers = &hi; /* it is assumed that 'curl' is freshly [re]initialized at this pt */ if (opt_protocol) curl_easy_setopt(curl, CURLOPT_VERBOSE, 1); diff --git a/winbuild-cross.sh b/winbuild-cross.sh index 4a3617b..336e3c7 100755 --- a/winbuild-cross.sh +++ b/winbuild-cross.sh @@ -129,7 +129,7 @@ make clean || echo clean # Native with CPU groups ennabled make clean || echo clean rm -f config.status -CFLAGS="-march=native $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS +CFLAGS="-march=native $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe
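
For reference, the rewritten intrlv_16x32 / dintrlv_16x32 paths in this patch are composed from three helper macros, LOAD_SRCE, ILEAVE_4x32 and STORE_DEST, whose definitions sit earlier in simd-utils/intrlv.h and are not visible in these hunks. The sketch below shows the core operation they implement, a 4x4 transpose of 32-bit lanes, written here with plain SSE2 unpack intrinsics. It is an illustrative stand-in only: the name ileave_4x32_sketch is hypothetical, and the real macros (gated on SSE4.1 and built around mm128_shuffle2_32 elsewhere in the file) may use a different instruction sequence.

#include <emmintrin.h>   // SSE2 intrinsics
#include <stdint.h>
#include <stdio.h>

// Transpose a 4x4 block of 32-bit lanes: element i of source row j
// ends up as element j of destination row i. This is the basic step
// behind interleaving four hash lanes into one contiguous buffer.
static inline void ileave_4x32_sketch( __m128i *d0, __m128i *d1,
                                       __m128i *d2, __m128i *d3,
                                       __m128i s0, __m128i s1,
                                       __m128i s2, __m128i s3 )
{
   // Pair rows: {a0,b0,a1,b1}, {c0,d0,c1,d1}, {a2,b2,a3,b3}, {c2,d2,c3,d3}
   __m128i t0 = _mm_unpacklo_epi32( s0, s1 );
   __m128i t1 = _mm_unpacklo_epi32( s2, s3 );
   __m128i t2 = _mm_unpackhi_epi32( s0, s1 );
   __m128i t3 = _mm_unpackhi_epi32( s2, s3 );
   // Combine pairs: source column i becomes destination row i.
   *d0 = _mm_unpacklo_epi64( t0, t1 );   // {a0,b0,c0,d0}
   *d1 = _mm_unpackhi_epi64( t0, t1 );   // {a1,b1,c1,d1}
   *d2 = _mm_unpacklo_epi64( t2, t3 );   // {a2,b2,c2,d2}
   *d3 = _mm_unpackhi_epi64( t2, t3 );   // {a3,b3,c3,d3}
}

int main(void)
{
   // Four independent 4-word lanes, as a hash function would see them.
   uint32_t lane[4][4] = { {  0,  1,  2,  3 },
                           { 10, 11, 12, 13 },
                           { 20, 21, 22, 23 },
                           { 30, 31, 32, 33 } };
   uint32_t out[16];
   __m128i d0, d1, d2, d3;

   ileave_4x32_sketch( &d0, &d1, &d2, &d3,
                       _mm_loadu_si128( (const __m128i*)lane[0] ),
                       _mm_loadu_si128( (const __m128i*)lane[1] ),
                       _mm_loadu_si128( (const __m128i*)lane[2] ),
                       _mm_loadu_si128( (const __m128i*)lane[3] ) );
   _mm_storeu_si128( (__m128i*)(out +  0), d0 );
   _mm_storeu_si128( (__m128i*)(out +  4), d1 );
   _mm_storeu_si128( (__m128i*)(out +  8), d2 );
   _mm_storeu_si128( (__m128i*)(out + 12), d3 );

   // Interleaved order: 0 10 20 30  1 11 21 31  2 12 22 32  3 13 23 33
   for ( int i = 0; i < 16; i++ ) printf( "%u ", out[i] );
   printf( "\n" );
   return 0;
}

The 16-lane functions in the patch simply repeat this 4x4 step four times per 128-bit column and scatter the results to the right destination offsets, which is why every LOAD_SRCE / ILEAVE_4x32 pair is followed by a STORE_DEST whose indices stride by 4.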