#if !defined(SIMD_128_H__)
#define SIMD_128_H__ 1

#if defined(__x86_64__) && defined(__SSE2__)

///////////////////////////////////////////////////////////////////////////////
//
//             128 bit SSE vectors
//
// SSE2 is required for 128 bit integer support. Some functions are also
// optimized with SSSE3, SSE4.1 or AVX. Some of these more optimized
// functions don't have SSE2 equivalents and their use would break SSE2
// compatibility.
//
// Constants are an issue with simd. Simply put, immediate constants don't
// exist. All simd constants either reside in memory or a register and
// must be loaded from memory or generated at run time.
//
// Due to the cost of generating constants it is more efficient to
// define a local const for repeated references to the same constant.
//
// One common use for simd constants is as a control index for vector
// shuffle instructions. Although the ultimate instruction may execute in a
// single clock cycle, generating the control index adds several more cycles
// to the entire operation.
//
// All of the utilities here assume all data is in registers except
// in rare cases where arguments are pointers.
//
// Some constants are generated using a memory overlay on the stack.
//
// Intrinsics automatically promote from REX to VEX when AVX is available
// but ASM needs to be done manually.
//
// APX supports EGPR which adds 16 more GPRs and 3 operand instructions.
// This may affect ASM that includes instructions that are superseded by APX
// versions and are therefore incompatible with APX.
// As a result GCC-14 disables EGPR by default; it can be enabled with
// "-mapx-inline-asm-use-gpr32"
//TODO
// Some ASM functions may need to be updated to support EGPR with APX.
//
///////////////////////////////////////////////////////////////////////////////

// New architecturally agnostic syntax:
//
//    __m128i -> v128_t
//    _mm_    -> v128_
//
// There is also new syntax to accommodate ARM's stricter type checking of
// vector element size. It has no effect on x86_64.
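// Example (illustrative sketch, not part of this API): hoisting a shuffle
// control constant out of a loop, per the guidance above, so it is generated
// once and then reused from a register. The control value shown is the
// 64 bit byte swap pattern used later in this file; _mm_shuffle_epi8 needs
// SSSE3. The function name is only a placeholder for illustration.
/*
static inline void example_reuse_const( __m128i *v, const int n )
{
   const __m128i ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f,
                                       0x0001020304050607 );  // built once
   for ( int i = 0; i < n; i++ )
      v[i] = _mm_shuffle_epi8( v[i], ctl );   // ctl reused from a register
}
*/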
// direct translation of native intrinsics

#define v128_t                      __m128i
#define v128u64_t                   v128_t
#define v128u32_t                   v128_t
#define v128u16_t                   v128_t
#define v128u8_t                    v128_t

#define v128_load                   _mm_load_si128
#define v128_store                  _mm_store_si128

// Needed for ARM; doesn't do anything special on x86_64.
#define v128_load1_64(p)            _mm_set1_epi64x( *(uint64_t*)(p) )
#define v128_load1_32(p)            _mm_set1_epi32(  *(uint32_t*)(p) )
#define v128_load1_16(p)            _mm_set1_epi16(  *(uint16_t*)(p) )
#define v128_load1_8( p)            _mm_set1_epi8(   *(uint8_t*) (p) )

// arithmetic
#define v128_add64                  _mm_add_epi64
#define v128_add32                  _mm_add_epi32
#define v128_add16                  _mm_add_epi16
#define v128_add8                   _mm_add_epi8

#define v128_sub64                  _mm_sub_epi64
#define v128_sub32                  _mm_sub_epi32
#define v128_sub16                  _mm_sub_epi16
#define v128_sub8                   _mm_sub_epi8

// save the low half of the product
#define v128_mul64                  _mm_mullo_epi64
#define v128_mul32                  _mm_mullo_epi32
#define v128_mul16                  _mm_mullo_epi16

// widen
#define v128_mulw32                 _mm_mul_epu32
#define v128_mulw16                 _mm_mul_epu16

// signed compare
#define v128_cmpeq64                _mm_cmpeq_epi64
#define v128_cmpeq32                _mm_cmpeq_epi32
#define v128_cmpeq16                _mm_cmpeq_epi16
#define v128_cmpeq8                 _mm_cmpeq_epi8

#define v128_cmpgt64                _mm_cmpgt_epi64
#define v128_cmpgt32                _mm_cmpgt_epi32
#define v128_cmpgt16                _mm_cmpgt_epi16
#define v128_cmpgt8                 _mm_cmpgt_epi8

#define v128_cmplt64                _mm_cmplt_epi64
#define v128_cmplt32                _mm_cmplt_epi32
#define v128_cmplt16                _mm_cmplt_epi16
#define v128_cmplt8                 _mm_cmplt_epi8

// bit shift
#define v128_sl64                   _mm_slli_epi64
#define v128_sl32                   _mm_slli_epi32
#define v128_sl16                   _mm_slli_epi16
#define v128_sl8                    _mm_slli_epi8

#define v128_sr64                   _mm_srli_epi64
#define v128_sr32                   _mm_srli_epi32
#define v128_sr16                   _mm_srli_epi16
#define v128_sr8                    _mm_srli_epi8

#define v128_sra64                  _mm_srai_epi64
#define v128_sra32                  _mm_srai_epi32
#define v128_sra16                  _mm_srai_epi16
#define v128_sra8                   _mm_srai_epi8

// logic
#define v128_or                     _mm_or_si128
#define v128_and                    _mm_and_si128
#define v128_xor                    _mm_xor_si128
#define v128_xorq                   _mm_xor_si128
#define v128_andnot                 _mm_andnot_si128

// unpack
#define v128_unpacklo64             _mm_unpacklo_epi64
#define v128_unpackhi64             _mm_unpackhi_epi64
#define v128_unpacklo32             _mm_unpacklo_epi32
#define v128_unpackhi32             _mm_unpackhi_epi32
#define v128_unpacklo16             _mm_unpacklo_epi16
#define v128_unpackhi16             _mm_unpackhi_epi16
#define v128_unpacklo8              _mm_unpacklo_epi8
#define v128_unpackhi8              _mm_unpackhi_epi8

// AES
// Nokey means nothing on x86_64 but it saves an instruction and a register
// on ARM.
#define v128_aesenc                 _mm_aesenc_si128
#define v128_aesenc_nokey(v)        _mm_aesenc_si128( v, v128_zero )
#define v128_aesenclast             _mm_aesenclast_si128
#define v128_aesenclast_nokey(v)    _mm_aesenclast_si128( v, v128_zero )
#define v128_aesdec                 _mm_aesdec_si128
#define v128_aesdec_nokey(v)        _mm_aesdec_si128( v, v128_zero )
#define v128_aesdeclast             _mm_aesdeclast_si128
#define v128_aesdeclast_nokey(v)    _mm_aesdeclast_si128( v, v128_zero )

// Used instead of casting.
typedef union
{
   v128_t   v128;
   uint32_t u32[4];
} __attribute__ ((aligned (16))) v128_ovly;

// Use for immediate constants, use load1 for mem.
#define v128_64                     _mm_set1_epi64x
#define v128_32                     _mm_set1_epi32
#define v128_16                     _mm_set1_epi16
#define v128_8                      _mm_set1_epi8

#define v128_set64                  _mm_set_epi64x
#define v128_set32                  _mm_set_epi32
#define v128_set16                  _mm_set_epi16
#define v128_set8                   _mm_set_epi8
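// Example (illustrative sketch, not part of this API): the v128_ names
// translate directly to the underlying intrinsics, and v128_ovly can be used
// to read individual lanes without casting. The function name is a
// placeholder; pointers are assumed 16 byte aligned.
/*
static inline uint32_t example_sum_lane0( const uint32_t *a, const uint32_t *b )
{
   v128_t va = v128_load( (const v128_t*)a );
   v128_t vb = v128_load( (const v128_t*)b );
   v128_ovly r;
   r.v128 = v128_add32( va, vb );   // _mm_add_epi32 under the hood
   return r.u32[0];                 // read one lane through the overlay
}
*/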
// Deprecated. AVX512 adds EVEX encoding (3rd operand) and other improvements
// that make these functions either unnecessary or inefficient.
// In cases where an explicit move between GP & SIMD registers is still
// necessary the cvt, set, or set1 intrinsics can be used, allowing the
// compiler to exploit new features to produce optimum code.
// Currently only used internally and by Luffa.
// It also has implications for the APX EGPR feature.
#define v128_mov64    _mm_cvtsi64_si128
#define v128_mov32    _mm_cvtsi32_si128

/*
static inline __m128i v128_mov64( const uint64_t n )
{
   __m128i a;
#if defined(__AVX__)
   asm( "vmovq %1, %0\n\t" : "=x"(a) : "r"(n) );
#else
   asm( "movq %1, %0\n\t" : "=x"(a) : "r"(n) );
#endif
   return a;
}

static inline __m128i v128_mov32( const uint32_t n )
{
   __m128i a;
#if defined(__AVX__)
   asm( "vmovd %1, %0\n\t" : "=x"(a) : "r"(n) );
#else
   asm( "movd %1, %0\n\t" : "=x"(a) : "r"(n) );
#endif
   return a;
}
*/

// broadcast lane 0 to all lanes
#define v128_bcast64(v)    _mm_shuffle_epi32( v, 0x44 )
#define v128_bcast32(v)    _mm_shuffle_epi32( v, 0x00 )

// Not used, test first
/*
#if defined(__AVX2__)

#define v128_bcast16(v)    _mm_broadcastw_epi16(v)

#else

#define v128_bcast16(v) \
   _mm_shuffle_epi32( _mm_shufflelo_epi16( v, 0x00 ), 0x00 )

#endif
*/

// Broadcast lane l to all lanes
#define v128_duplane64( v, l ) \
   ( (l) == 0 ) ? _mm_shuffle_epi32( v, 0x44 ) \
                : _mm_shuffle_epi32( v, 0xee )

#define v128_duplane32( v, l ) \
   ( (l) == 0 ) ? _mm_shuffle_epi32( v, 0x00 ) \
 : ( (l) == 1 ) ? _mm_shuffle_epi32( v, 0x55 ) \
 : ( (l) == 2 ) ? _mm_shuffle_epi32( v, 0xaa ) \
                : _mm_shuffle_epi32( v, 0xff )

// Pseudo constants
#define v128_zero    _mm_setzero_si128()

//#define v128_one   v128_mov64(1)
#define v128_one     _mm_cvtsi64_si128( 1 )

// ASM avoids the need to initialize the return variable just to silence a
// compiler warning. The macro abstracts the function parentheses so it looks
// like an identifier.
static inline __m128i v128_neg1_fn()
{
   __m128i a;
#if defined(__AVX__)
   asm( "vpcmpeqq %0, %0, %0\n\t" : "=x"(a) );
#else
   asm( "pcmpeqd %0, %0\n\t" : "=x"(a) );   // pcmpeqd is SSE2, result is all ones
#endif
   return a;
}
#define v128_neg1    v128_neg1_fn()

//
// Vector pointer cast

// p = any aligned pointer
// returns p as pointer to vector type
#define castp_v128(p)    ((__m128i*)(p))
#define castp_v128u64    castp_v128
#define castp_v128u32    castp_v128
#define castp_v128u16    castp_v128
#define castp_v128u8     castp_v128

// p = any aligned pointer
// returns *p, watch your pointer arithmetic
#define cast_v128(p)     (*((__m128i*)(p)))
#define cast_v128u64     cast_v128
#define cast_v128u32     cast_v128
#define cast_v128u16     cast_v128
#define cast_v128u8      cast_v128

// p = any aligned pointer, i = scaled array index
// returns value p[i]
#define casti_v128(p,i)  (((__m128i*)(p))[(i)])
#define casti_v128u64    casti_v128
#define casti_v128u32    casti_v128
#define casti_v128u16    casti_v128
#define casti_v128u8     casti_v128

// p = any aligned pointer, o = scaled offset
// returns pointer p+o
#define casto_v128(p,o)  (((__m128i*)(p))+(o))

#if defined(__SSE4_1__)

#define v128_get64( v, l )       _mm_extract_epi64( v, l )
#define v128_get32( v, l )       _mm_extract_epi32( v, l )
#define v128_get16( v, l )       _mm_extract_epi16( v, l )
#define v128_get8(  v, l )       _mm_extract_epi8(  v, l )

#define v128_put64( v, u64, l )  _mm_insert_epi64( v, u64, l )
#define v128_put32( v, u32, l )  _mm_insert_epi32( v, u32, l )
#define v128_put16( v, u16, l )  _mm_insert_epi16( v, u16, l )
#define v128_put8(  v, u8,  l )  _mm_insert_epi8(  v, u8,  l )
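// Example (illustrative sketch, not part of this API): indexing an aligned
// buffer as 128 bit vectors with casti_v128, and reading a 32 bit lane back
// with v128_get32 (SSE4.1). The function name and constant are placeholders;
// buf is assumed 16 byte aligned and at least 32 bytes long.
/*
static inline uint32_t example_casti_get( void *buf )
{
   casti_v128( buf, 1 ) = v128_32( 0x5a5a5a5a );    // write vector index 1
   return v128_get32( casti_v128( buf, 1 ), 2 );    // read back 32 bit lane 2
}
*/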
/////////////////////////////////////////////////////////////
//
//      _mm_insert_ps( __m128i v1, __m128i v2, imm8 c )
//
// Fast and powerful but very limited in its application.
// It requires SSE4.1 but only works with 128 bit vectors with 32 bit
// elements. There is no equivalent instruction for 256 bit or 512 bit vectors.
// There's no integer version. There's no 64 bit, 16 bit or byte element
// sizing. It's unique.
//
// It can:
//   - zero any number of 32 bit elements of a 128 bit vector.
//   - extract any 32 bit element from one 128 bit vector and insert the
//     data to any 32 bit element of another 128 bit vector, or the same vector.
//   - do both simultaneously.
//
// It can be used as a more efficient replacement for _mm_insert_epi32
// or _mm_extract_epi32.
//
// Control byte definition:
//    c[3:0] zero mask
//    c[5:4] destination element selector
//    c[7:6] source element selector

// Convert type and abbreviate name: eXtract Insert Mask = XIM
#define v128_xim32( v1, v0, c ) \
   _mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( v1 ), \
                                    _mm_castsi128_ps( v0 ), c ) )

// Examples of simple operations using xim:
/*
// Copy i32 to element c of dest and copy remaining elements from v.
#define v128_put32( v, i32, c ) \
   v128_xim32( v, v128_mov32( i32 ), (c)<<4 )
*/

#define v128_mask32( v, m )    v128_xim32( v, v, m & 0xf )

// Zero 32 bit elements when the corresponding bit in the 4 bit mask is set.
//static inline __m128i v128_mask32( const __m128i v, const int m )
//{  return v128_xim32( v, v, m ); }

// Copy element l0 of v0 to element l1 of dest and copy remaining elements
// from v1.
#define v128_movlane32( v1, l1, v0, l0 ) \
   v128_xim32( v1, v0, ( (l1)<<4 ) | ( (l0)<<6 ) )

#endif   // SSE4_1

//
// Basic operations without equivalent SIMD intrinsic

// Bitwise not (~v)
#if defined(VL256)

static inline __m128i v128_not( const __m128i v )
{  return _mm_ternarylogic_epi64( v, v, v, 1 ); }

#else

#define v128_not( v )    _mm_xor_si128( v, v128_neg1 )

#endif

static inline v128u64_t v128_negate_64( v128u64_t v )
{  return _mm_sub_epi64( _mm_xor_si128( v, v ), v ); }

static inline v128u32_t v128_negate_32( v128u32_t v )
{  return _mm_sub_epi32( _mm_xor_si128( v, v ), v ); }

static inline v128u16_t v128_negate_16( v128u16_t v )
{  return _mm_sub_epi16( _mm_xor_si128( v, v ), v ); }

// Add 4 values, fewer dependencies than sequential addition.
#define v128_add4_64( a, b, c, d ) \
   _mm_add_epi64( _mm_add_epi64( a, b ), _mm_add_epi64( c, d ) )

#define v128_add4_32( a, b, c, d ) \
   _mm_add_epi32( _mm_add_epi32( a, b ), _mm_add_epi32( c, d ) )

#define v128_add4_16( a, b, c, d ) \
   _mm_add_epi16( _mm_add_epi16( a, b ), _mm_add_epi16( c, d ) )

#define v128_add4_8( a, b, c, d ) \
   _mm_add_epi8( _mm_add_epi8( a, b ), _mm_add_epi8( c, d ) )

#define v128_xor4( a, b, c, d ) \
   _mm_xor_si128( _mm_xor_si128( a, b ), _mm_xor_si128( c, d ) )

// Memory functions
// Mostly for convenience, avoids calculating bytes.
// Assumes data is aligned and integral.
// n = number of __m128i, bytes/16

static inline void v128_memset_zero( v128_t *dst, const int n )
{  for ( int i = 0; i < n; i++ ) dst[i] = v128_zero; }
#define memset_zero_128 v128_memset_zero

static inline void v128_memset( v128_t *dst, const v128_t a, const int n )
{  for ( int i = 0; i < n; i++ ) dst[i] = a; }

static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
{  for ( int i = 0; i < n; i++ ) dst[i] = src[i]; }
#define memcpy_128 v128_memcpy
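// Example (illustrative sketch, not part of this API): initializing a state
// buffer with the memory helpers above. Sizes are in 128 bit vectors, not
// bytes. The function name is a placeholder; both pointers are assumed
// 16 byte aligned.
/*
static inline void example_init_state( v128_t *state, const v128_t *iv )
{
   v128_memcpy( state, iv, 4 );         // copy 4 vectors (64 bytes)
   v128_memset_zero( state + 4, 4 );    // zero the next 4 vectors
}
*/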
// Boolean operations

#if defined(VL256)

// Macros with duplicate references to the same argument are
// not expression safe. Switch to an inline function if required.

// ~v1 | v0
#define v128_ornot( v1, v0 )      _mm_ternarylogic_epi64( v1, v0, v0, 0xcf )

// a ^ b ^ c
#define v128_xor3( a, b, c )      _mm_ternarylogic_epi64( a, b, c, 0x96 )

// a & b & c
#define v128_and3( a, b, c )      _mm_ternarylogic_epi64( a, b, c, 0x80 )

// a | b | c
#define v128_or3( a, b, c )       _mm_ternarylogic_epi64( a, b, c, 0xfe )

// a ^ ( b & c )
#define v128_xorand( a, b, c )    _mm_ternarylogic_epi64( a, b, c, 0x78 )

// a & ( b ^ c )
#define v128_andxor( a, b, c )    _mm_ternarylogic_epi64( a, b, c, 0x60 )

// a ^ ( b | c )
#define v128_xoror( a, b, c )     _mm_ternarylogic_epi64( a, b, c, 0x1e )

// a ^ ( ~b & c )
#define v128_xorandnot( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xd2 )

// a | ( b & c )
#define v128_orand( a, b, c )     _mm_ternarylogic_epi64( a, b, c, 0xf8 )

// ~( a ^ b ), same as (~a) ^ b
#define v128_xnor( a, b )         _mm_ternarylogic_epi64( a, b, b, 0x81 )

#else

#define v128_ornot( v1, v0 )      _mm_or_si128( v128_not( v1 ), v0 )
#define v128_xor3( a, b, c )      _mm_xor_si128( a, _mm_xor_si128( b, c ) )
#define v128_and3( a, b, c )      _mm_and_si128( a, _mm_and_si128( b, c ) )
#define v128_or3( a, b, c )       _mm_or_si128( a, _mm_or_si128( b, c ) )
#define v128_xorand( a, b, c )    _mm_xor_si128( a, _mm_and_si128( b, c ) )
#define v128_andxor( a, b, c )    _mm_and_si128( a, _mm_xor_si128( b, c ) )
#define v128_xoror( a, b, c )     _mm_xor_si128( a, _mm_or_si128( b, c ) )
#define v128_xorandnot( a, b, c ) _mm_xor_si128( a, _mm_andnot_si128( b, c ) )
#define v128_orand( a, b, c )     _mm_or_si128( a, _mm_and_si128( b, c ) )
#define v128_xnor( a, b )         v128_not( _mm_xor_si128( a, b ) )

#endif

// Mask making
// Equivalent of AVX512 _mm_movepi64_mask & _mm_movepi32_mask.
// Returns a 2 or 4 bit integer mask from the MSBit of 64 or 32 bit elements.
// Effectively a sign test.

#define v128_movmask64( v ) \
   _mm_movemask_pd( (__m128d)(v) )

#define v128_movmask32( v ) \
   _mm_movemask_ps( (__m128)(v) )

// Shuffle 16 bit elements within 64 bit lanes.
#define v128_shuffle16( v, c ) \
   _mm_shufflehi_epi16( _mm_shufflelo_epi16( v, c ), c )

#define v128_qrev32(v)      _mm_shuffle_epi32( v, 0xb1 )
#define v128_swap64_32(v)   _mm_shuffle_epi32( v, 0xb1 )   // grandfathered

#define v128_qrev16(v)      v128_shuffle16( v, 0x1b )
#define v128_lrev16(v)      v128_shuffle16( v, 0xb1 )

//
// Bit rotations

// Internal use only, should never be called from application code.
#define v128_ror64_sse2( v, c ) \
   _mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )

#define v128_rol64_sse2( v, c ) \
   _mm_or_si128( _mm_slli_epi64( v, c ), _mm_srli_epi64( v, 64-(c) ) )

#define v128_ror32_sse2( v, c ) \
   _mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) )

#define v128_rol32_sse2( v, c ) \
   _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )

#if defined(VL256)

// AVX512 fastest for all rotations.
#define v128_ror64    _mm_ror_epi64
#define v128_rol64    _mm_rol_epi64
#define v128_ror32    _mm_ror_epi32
#define v128_rol32    _mm_rol_epi32

// ror/rol will always select the fastest implementation, but these names may
// fit better with application code performing byte operations rather than
// bit rotations.
#define v128_shuflr64_8( v)    _mm_ror_epi64( v,  8 )
#define v128_shufll64_8( v)    _mm_rol_epi64( v,  8 )
#define v128_shuflr64_16(v)    _mm_ror_epi64( v, 16 )
#define v128_shufll64_16(v)    _mm_rol_epi64( v, 16 )
#define v128_shuflr64_24(v)    _mm_ror_epi64( v, 24 )
#define v128_shufll64_24(v)    _mm_rol_epi64( v, 24 )
#define v128_shuflr32_8( v)    _mm_ror_epi32( v,  8 )
#define v128_shufll32_8( v)    _mm_rol_epi32( v,  8 )
#define v128_shuflr32_16(v)    _mm_ror_epi32( v, 16 )
#define v128_shufll32_16(v)    _mm_rol_epi32( v, 16 )

#elif defined(__SSSE3__)

// SSSE3: fastest 32 bit, very fast 16, fast 8

#define v128_shuflr64_8( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi64x( \
                        0x080f0e0d0c0b0a09, 0x0007060504030201 ) )

#define v128_shufll64_8( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi64x( \
                        0x0e0d0c0b0a09080f, 0x0605040302010007 ) )

#define v128_shuflr64_24( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi64x( \
                        0x0a09080f0e0d0c0b, 0x0201000706050403 ) )

#define v128_shufll64_24( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi64x( \
                        0x0c0b0a09080f0e0d, 0x0403020100070605 ) )

#define v128_shuflr32_8( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi64x( \
                        0x0c0f0e0d080b0a09, 0x0407060500030201 ) )

#define v128_shufll32_8( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi64x( \
                        0x0e0d0c0f0a09080b, 0x0605040702010003 ) )

#define v128_ror64( v, c ) \
   ( (c) ==  8 ) ? v128_shuflr64_8( v ) \
 : ( (c) == 16 ) ? v128_shuffle16( v, 0x39 ) \
 : ( (c) == 24 ) ? v128_shuflr64_24( v ) \
 : ( (c) == 32 ) ? _mm_shuffle_epi32( v, 0xb1 ) \
 : ( (c) == 40 ) ? v128_shufll64_24( v ) \
 : ( (c) == 48 ) ? v128_shuffle16( v, 0x93 ) \
 : ( (c) == 56 ) ? v128_shufll64_8( v ) \
                 : v128_ror64_sse2( v, c )

#define v128_rol64( v, c ) \
   ( (c) ==  8 ) ? v128_shufll64_8( v ) \
 : ( (c) == 16 ) ? v128_shuffle16( v, 0x93 ) \
 : ( (c) == 24 ) ? v128_shufll64_24( v ) \
 : ( (c) == 32 ) ? _mm_shuffle_epi32( v, 0xb1 ) \
 : ( (c) == 40 ) ? v128_shuflr64_24( v ) \
 : ( (c) == 48 ) ? v128_shuffle16( v, 0x39 ) \
 : ( (c) == 56 ) ? v128_shuflr64_8( v ) \
                 : v128_rol64_sse2( v, c )

#define v128_ror32( v, c ) \
   ( (c) ==  8 ) ? v128_shuflr32_8( v ) \
 : ( (c) == 16 ) ? v128_lrev16( v ) \
 : ( (c) == 24 ) ? v128_shufll32_8( v ) \
                 : v128_ror32_sse2( v, c )

#define v128_rol32( v, c ) \
   ( (c) ==  8 ) ? v128_shufll32_8( v ) \
 : ( (c) == 16 ) ? v128_lrev16( v ) \
 : ( (c) == 24 ) ? v128_shuflr32_8( v ) \
                 : v128_rol32_sse2( v, c )

#elif defined(__SSE2__)

// SSE2: fastest 32 bit, very fast 16, all else slow

#define v128_ror64( v, c ) \
   ( (c) == 16 ) ? v128_shuffle16( v, 0x39 ) \
 : ( (c) == 32 ) ? _mm_shuffle_epi32( v, 0xb1 ) \
 : ( (c) == 48 ) ? v128_shuffle16( v, 0x93 ) \
                 : v128_ror64_sse2( v, c )

#define v128_rol64( v, c ) \
   ( (c) == 16 ) ? v128_shuffle16( v, 0x93 ) \
 : ( (c) == 32 ) ? _mm_shuffle_epi32( v, 0xb1 ) \
 : ( (c) == 48 ) ? v128_shuffle16( v, 0x39 ) \
                 : v128_rol64_sse2( v, c )

#define v128_ror32( v, c ) \
   ( (c) == 16 ) ? v128_lrev16( v ) \
                 : v128_ror32_sse2( v, c )

#define v128_rol32( v, c ) \
   ( (c) == 16 ) ? v128_lrev16( v ) \
                 : v128_rol32_sse2( v, c )

#else

#define v128_ror64    v128_ror64_sse2
#define v128_rol64    v128_rol64_sse2
#define v128_ror32    v128_ror32_sse2
#define v128_rol32    v128_rol32_sse2

#endif

// (v1 ^ v0) >>> n, ARM NEON has an optimized version
#define v128_ror64xor( v1, v0, n )   v128_ror64( v128_xor( v1, v0 ), n )
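// Example (illustrative sketch, not part of this API): a ChaCha style column
// quarter round using the rotate macros above. With constant rotate amounts
// (16, 12, 8, 7) the macros select byte shuffles where available and fall
// back to shift+or otherwise. The function name is a placeholder.
/*
static inline void example_quarter_round( v128_t *a, v128_t *b,
                                          v128_t *c, v128_t *d )
{
   *a = v128_add32( *a, *b );   *d = v128_rol32( v128_xor( *d, *a ), 16 );
   *c = v128_add32( *c, *d );   *b = v128_rol32( v128_xor( *b, *c ), 12 );
   *a = v128_add32( *a, *b );   *d = v128_rol32( v128_xor( *d, *a ),  8 );
   *c = v128_add32( *c, *d );   *b = v128_rol32( v128_xor( *b, *c ),  7 );
}
*/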
/* not used
// x2 rotates elements in 2 individual vectors in a double buffered
// optimization for SSE2, does nothing for AVX512 but is there for
// transparency.

#if defined(VL256)

#define v128_2ror64( v1, v0, c ) \
   _mm_ror_epi64( v0, c ); \
   _mm_ror_epi64( v1, c )

#define v128_2rol64( v1, v0, c ) \
   _mm_rol_epi64( v0, c ); \
   _mm_rol_epi64( v1, c )

#define v128_2ror32( v1, v0, c ) \
   _mm_ror_epi32( v0, c ); \
   _mm_ror_epi32( v1, c )

#define v128_2rol32( v1, v0, c ) \
   _mm_rol_epi32( v0, c ); \
   _mm_rol_epi32( v1, c )

#else   // SSE2

#define v128_2ror64( v1, v0, c ) \
{ \
   __m128i t0 = _mm_srli_epi64( v0, c ); \
   __m128i t1 = _mm_srli_epi64( v1, c ); \
   v0 = _mm_slli_epi64( v0, 64-(c) ); \
   v1 = _mm_slli_epi64( v1, 64-(c) ); \
   v0 = _mm_or_si128( v0, t0 ); \
   v1 = _mm_or_si128( v1, t1 ); \
}

#define v128_2rol64( v1, v0, c ) \
{ \
   __m128i t0 = _mm_slli_epi64( v0, c ); \
   __m128i t1 = _mm_slli_epi64( v1, c ); \
   v0 = _mm_srli_epi64( v0, 64-(c) ); \
   v1 = _mm_srli_epi64( v1, 64-(c) ); \
   v0 = _mm_or_si128( v0, t0 ); \
   v1 = _mm_or_si128( v1, t1 ); \
}

#define v128_2ror32( v1, v0, c ) \
{ \
   __m128i t0 = _mm_srli_epi32( v0, c ); \
   __m128i t1 = _mm_srli_epi32( v1, c ); \
   v0 = _mm_slli_epi32( v0, 32-(c) ); \
   v1 = _mm_slli_epi32( v1, 32-(c) ); \
   v0 = _mm_or_si128( v0, t0 ); \
   v1 = _mm_or_si128( v1, t1 ); \
}

#define v128_2rol32( v1, v0, c ) \
{ \
   __m128i t0 = _mm_slli_epi32( v0, c ); \
   __m128i t1 = _mm_slli_epi32( v1, c ); \
   v0 = _mm_srli_epi32( v0, 32-(c) ); \
   v1 = _mm_srli_epi32( v1, 32-(c) ); \
   v0 = _mm_or_si128( v0, t0 ); \
   v1 = _mm_or_si128( v1, t1 ); \
}

#endif // AVX512 else SSE2
*/

// Cross lane shuffles
// No NEON version
#define v128_shuffle32      _mm_shuffle_epi32

/* Not used, exists only for compatibility with NEON if ever needed.
#define v128_shufflev32( v, vmask ) \
   v128_shuffle32( v, v128_movmask32( vmask ) )
*/

#define v128_shuffle8       _mm_shuffle_epi8

// Limited 2 input shuffle, combines shuffle with blend. The destination low
// half is always taken from v1, and the high half from v2.
#define v128_shuffle2_64( v1, v2, c ) \
   _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( v1 ), \
                                     _mm_castsi128_pd( v2 ), c ) );

#define v128_shuffle2_32( v1, v2, c ) \
   _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( v1 ), \
                                     _mm_castsi128_ps( v2 ), c ) );

// Rotate vector elements across all lanes

// reverse elements in vector
#define v128_swap64(v)      _mm_shuffle_epi32( v, 0x4e )   // grandfathered
#define v128_rev64(v)       _mm_shuffle_epi32( v, 0x4e )   // preferred
#define v128_rev32(v)       _mm_shuffle_epi32( v, 0x1b )

// rotate vector elements
#define v128_shuflr32(v)    _mm_shuffle_epi32( v, 0x39 )
#define v128_shufll32(v)    _mm_shuffle_epi32( v, 0x93 )
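// Example (illustrative sketch, not part of this API): lane movement produced
// by the element permutes above, written as a tiny self check. Lane 0 is the
// least significant 32 bits of the vector. The function name is a placeholder.
/*
static inline int example_lane_rotate_check( void )
{
   const v128_t v = v128_set32( 4, 3, 2, 1 );   // lanes 3,2,1,0
   v128_ovly r;
   r.v128 = v128_shuflr32( v );   // rotate elements toward lane 0
   return r.u32[0] == 2;          // old lane 1 is now lane 0
}
*/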
// Endian byte swap.

#if defined(__SSSE3__)

#define v128_bswap128( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi64x( 0x0001020304050607, \
                                        0x08090a0b0c0d0e0f ) )

#define v128_bswap64( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi64x( 0x08090a0b0c0d0e0f, \
                                        0x0001020304050607 ) )

#define v128_bswap32( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
                                        0x0405060700010203 ) )

#define v128_bswap16( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi64x( 0x0e0f0c0d0a0b0809, \
                                        0x0607040502030001 ) )

// 8 byte qword * 8 qwords * 2 lanes = 128 bytes
#define v128_block_bswap64( d, s ) \
{ \
   v128_t ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
   casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
   casti_v128( d,1 ) = _mm_shuffle_epi8( casti_v128( s,1 ), ctl ); \
   casti_v128( d,2 ) = _mm_shuffle_epi8( casti_v128( s,2 ), ctl ); \
   casti_v128( d,3 ) = _mm_shuffle_epi8( casti_v128( s,3 ), ctl ); \
   casti_v128( d,4 ) = _mm_shuffle_epi8( casti_v128( s,4 ), ctl ); \
   casti_v128( d,5 ) = _mm_shuffle_epi8( casti_v128( s,5 ), ctl ); \
   casti_v128( d,6 ) = _mm_shuffle_epi8( casti_v128( s,6 ), ctl ); \
   casti_v128( d,7 ) = _mm_shuffle_epi8( casti_v128( s,7 ), ctl ); \
}
#define v128_block_bswap64_512 v128_block_bswap64

#define v128_block_bswap64_1024( d, s ) \
{ \
   v128_t ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
   casti_v128( d, 0 ) = _mm_shuffle_epi8( casti_v128( s, 0 ), ctl ); \
   casti_v128( d, 1 ) = _mm_shuffle_epi8( casti_v128( s, 1 ), ctl ); \
   casti_v128( d, 2 ) = _mm_shuffle_epi8( casti_v128( s, 2 ), ctl ); \
   casti_v128( d, 3 ) = _mm_shuffle_epi8( casti_v128( s, 3 ), ctl ); \
   casti_v128( d, 4 ) = _mm_shuffle_epi8( casti_v128( s, 4 ), ctl ); \
   casti_v128( d, 5 ) = _mm_shuffle_epi8( casti_v128( s, 5 ), ctl ); \
   casti_v128( d, 6 ) = _mm_shuffle_epi8( casti_v128( s, 6 ), ctl ); \
   casti_v128( d, 7 ) = _mm_shuffle_epi8( casti_v128( s, 7 ), ctl ); \
   casti_v128( d, 8 ) = _mm_shuffle_epi8( casti_v128( s, 8 ), ctl ); \
   casti_v128( d, 9 ) = _mm_shuffle_epi8( casti_v128( s, 9 ), ctl ); \
   casti_v128( d,10 ) = _mm_shuffle_epi8( casti_v128( s,10 ), ctl ); \
   casti_v128( d,11 ) = _mm_shuffle_epi8( casti_v128( s,11 ), ctl ); \
   casti_v128( d,12 ) = _mm_shuffle_epi8( casti_v128( s,12 ), ctl ); \
   casti_v128( d,13 ) = _mm_shuffle_epi8( casti_v128( s,13 ), ctl ); \
   casti_v128( d,14 ) = _mm_shuffle_epi8( casti_v128( s,14 ), ctl ); \
   casti_v128( d,15 ) = _mm_shuffle_epi8( casti_v128( s,15 ), ctl ); \
}

// 4 byte dword * 8 dwords * 4 lanes = 128 bytes
#define v128_block_bswap32( d, s ) \
{ \
   v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
   casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
   casti_v128( d,1 ) = _mm_shuffle_epi8( casti_v128( s,1 ), ctl ); \
   casti_v128( d,2 ) = _mm_shuffle_epi8( casti_v128( s,2 ), ctl ); \
   casti_v128( d,3 ) = _mm_shuffle_epi8( casti_v128( s,3 ), ctl ); \
   casti_v128( d,4 ) = _mm_shuffle_epi8( casti_v128( s,4 ), ctl ); \
   casti_v128( d,5 ) = _mm_shuffle_epi8( casti_v128( s,5 ), ctl ); \
   casti_v128( d,6 ) = _mm_shuffle_epi8( casti_v128( s,6 ), ctl ); \
   casti_v128( d,7 ) = _mm_shuffle_epi8( casti_v128( s,7 ), ctl ); \
}
#define v128_block_bswap32_256 v128_block_bswap32

#define v128_block_bswap32_128( d, s ) \
{ \
   v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
   casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
   casti_v128( d,1 ) = _mm_shuffle_epi8( casti_v128( s,1 ), ctl ); \
   casti_v128( d,2 ) = _mm_shuffle_epi8( casti_v128( s,2 ), ctl ); \
   casti_v128( d,3 ) = _mm_shuffle_epi8( casti_v128( s,3 ), ctl ); \
}

#define v128_block_bswap32_512( d, s ) \
{ \
   v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
   casti_v128( d, 0 ) = _mm_shuffle_epi8( casti_v128( s, 0 ), ctl ); \
   casti_v128( d, 1 ) = _mm_shuffle_epi8( casti_v128( s, 1 ), ctl ); \
   casti_v128( d, 2 ) = _mm_shuffle_epi8( casti_v128( s, 2 ), ctl ); \
   casti_v128( d, 3 ) = _mm_shuffle_epi8( casti_v128( s, 3 ), ctl ); \
   casti_v128( d, 4 ) = _mm_shuffle_epi8( casti_v128( s, 4 ), ctl ); \
   casti_v128( d, 5 ) = _mm_shuffle_epi8( casti_v128( s, 5 ), ctl ); \
   casti_v128( d, 6 ) = _mm_shuffle_epi8( casti_v128( s, 6 ), ctl ); \
   casti_v128( d, 7 ) = _mm_shuffle_epi8( casti_v128( s, 7 ), ctl ); \
   casti_v128( d, 8 ) = _mm_shuffle_epi8( casti_v128( s, 8 ), ctl ); \
   casti_v128( d, 9 ) = _mm_shuffle_epi8( casti_v128( s, 9 ), ctl ); \
   casti_v128( d,10 ) = _mm_shuffle_epi8( casti_v128( s,10 ), ctl ); \
   casti_v128( d,11 ) = _mm_shuffle_epi8( casti_v128( s,11 ), ctl ); \
   casti_v128( d,12 ) = _mm_shuffle_epi8( casti_v128( s,12 ), ctl ); \
   casti_v128( d,13 ) = _mm_shuffle_epi8( casti_v128( s,13 ), ctl ); \
   casti_v128( d,14 ) = _mm_shuffle_epi8( casti_v128( s,14 ), ctl ); \
   casti_v128( d,15 ) = _mm_shuffle_epi8( casti_v128( s,15 ), ctl ); \
}

#else   // SSE2

static inline v128_t v128_bswap64( __m128i v )
{
   v = _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
   v = _mm_shufflelo_epi16( v, _MM_SHUFFLE( 0, 1, 2, 3 ) );
   return _mm_shufflehi_epi16( v, _MM_SHUFFLE( 0, 1, 2, 3 ) );
}

static inline v128_t v128_bswap32( __m128i v )
{
   v = _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
   v = _mm_shufflelo_epi16( v, _MM_SHUFFLE( 2, 3, 0, 1 ) );
   return _mm_shufflehi_epi16( v, _MM_SHUFFLE( 2, 3, 0, 1 ) );
}

static inline v128_t v128_bswap16( __m128i v )
{  return _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) ); }

#define v128_bswap128( v )   v128_qrev32( v128_bswap64( v ) )

static inline void v128_block_bswap64( __m128i *d, const __m128i *s )
{
   d[0] = v128_bswap64( s[0] );
   d[1] = v128_bswap64( s[1] );
   d[2] = v128_bswap64( s[2] );
   d[3] = v128_bswap64( s[3] );
   d[4] = v128_bswap64( s[4] );
   d[5] = v128_bswap64( s[5] );
   d[6] = v128_bswap64( s[6] );
   d[7] = v128_bswap64( s[7] );
}

static inline void v128_block_bswap64_1024( __m128i *d, const __m128i *s )
{
   d[ 0] = v128_bswap64( s[ 0] );
   d[ 1] = v128_bswap64( s[ 1] );
   d[ 2] = v128_bswap64( s[ 2] );
   d[ 3] = v128_bswap64( s[ 3] );
   d[ 4] = v128_bswap64( s[ 4] );
   d[ 5] = v128_bswap64( s[ 5] );
   d[ 6] = v128_bswap64( s[ 6] );
   d[ 7] = v128_bswap64( s[ 7] );
   d[ 8] = v128_bswap64( s[ 8] );
   d[ 9] = v128_bswap64( s[ 9] );
   d[10] = v128_bswap64( s[10] );
   d[11] = v128_bswap64( s[11] );
   d[12] = v128_bswap64( s[12] );
   d[13] = v128_bswap64( s[13] );
   d[14] = v128_bswap64( s[14] );
   d[15] = v128_bswap64( s[15] );
}

static inline void v128_block_bswap32( __m128i *d, const __m128i *s )
{
   d[0] = v128_bswap32( s[0] );
   d[1] = v128_bswap32( s[1] );
   d[2] = v128_bswap32( s[2] );
   d[3] = v128_bswap32( s[3] );
   d[4] = v128_bswap32( s[4] );
   d[5] = v128_bswap32( s[5] );
   d[6] = v128_bswap32( s[6] );
   d[7] = v128_bswap32( s[7] );
}
#define v128_block_bswap32_256 v128_block_bswap32

static inline void v128_block_bswap32_512( __m128i *d, const __m128i *s )
{
   d[ 0] = v128_bswap32( s[ 0] );
   d[ 1] = v128_bswap32( s[ 1] );
   d[ 2] = v128_bswap32( s[ 2] );
   d[ 3] = v128_bswap32( s[ 3] );
   d[ 4] = v128_bswap32( s[ 4] );
   d[ 5] = v128_bswap32( s[ 5] );
   d[ 6] = v128_bswap32( s[ 6] );
   d[ 7] = v128_bswap32( s[ 7] );
   d[ 8] = v128_bswap32( s[ 8] );
   d[ 9] = v128_bswap32( s[ 9] );
   d[10] = v128_bswap32( s[10] );
   d[11] = v128_bswap32( s[11] );
   d[12] = v128_bswap32( s[12] );
   d[13] = v128_bswap32( s[13] );
   d[14] = v128_bswap32( s[14] );
   d[15] = v128_bswap32( s[15] );
}

#endif // SSSE3 else SSE2
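// Example (illustrative sketch, not part of this API): byte swapping 128
// bytes of input into a working buffer with the block helpers above, e.g.
// before processing big endian message words. The function name is a
// placeholder; both pointers are assumed 16 byte aligned.
/*
static inline void example_bswap_block( uint32_t *w, const uint32_t *data )
{
   // 8 vectors * 16 bytes = 128 bytes with 32 bit elements byte swapped.
   v128_block_bswap32( castp_v128( w ), castp_v128( data ) );
}
*/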
// The alignr instruction for 32 & 64 bit elements is only available with
// AVX512 but is emulated here. Behaviour is consistent with the Intel alignr
// intrinsics.
#if defined(__SSSE3__)

#define v128_alignr8                  _mm_alignr_epi8
#define v128_alignr64( hi, lo, c )    _mm_alignr_epi8( hi, lo, (c)*8 )
#define v128_alignr32( hi, lo, c )    _mm_alignr_epi8( hi, lo, (c)*4 )

#else

#define v128_alignr8( hi, lo, c ) \
   _mm_or_si128( _mm_slli_si128( hi, 16-(c) ), _mm_srli_si128( lo, c ) )

// The c arg is trivial, the only valid value is 1.
#define v128_alignr64( hi, lo, c ) \
   _mm_or_si128( _mm_slli_si128( hi, 16-((c)*8) ), _mm_srli_si128( lo, (c)*8 ) )

#define v128_alignr32( hi, lo, c ) \
   _mm_or_si128( _mm_slli_si128( hi, 16-((c)*4) ), _mm_srli_si128( lo, (c)*4 ) )

#endif

// blend using vector mask
#if defined(__SSE4_1__)

// Bytewise using the sign bit of each byte element of the mask. Use a full
// bitmask for compatibility with SSE2 & NEON.
#define v128_blendv    _mm_blendv_epi8

#else

// Bitwise, use only byte wise for compatibility with SSE4_1.
#define v128_blendv( v1, v0, mask ) \
   v128_or( v128_andnot( mask, v1 ), v128_and( mask, v0 ) )

#endif

#endif   // __SSE2__
#endif   // SIMD_128_H__