#if !defined(SIMD_INT_H__)
#define SIMD_INT_H__ 1

#include <stdint.h>
#if defined(__x86_64__)
#include <x86intrin.h>   // __rolq, __rorq, __rold, __rord
#endif

//TODO compile time test for byte order
// be64 etc using HW bswap.
//

// Endian byte swap
#if defined(__x86_64__)

#define bswap_64   __builtin_bswap64
#define bswap_32   __builtin_bswap32

#elif defined(__aarch64__)

static inline uint64_t bswap_64( uint64_t a )
{
   uint64_t b;
   asm( "rev %0, %1\n\t" : "=r"(b) : "r"(a) );
   return b;
}

static inline uint32_t bswap_32( uint32_t a )
{
   // REV32 only exists for X registers; for a 32 bit value use REV on
   // the W register form of the operands.
   uint32_t b;
   asm( "rev %w0, %w1\n\t" : "=r"(b) : "r"(a) );
   return b;
}

#else

// Generic fallback. Per byte masks can be OR'ed together in a single
// expression; the staged half/word/byte swaps must be applied sequentially
// and don't work OR'ed together.
#define bswap_64( x ) \
   ( ( ( (x) & 0xff00000000000000 ) >> 56 ) \
   | ( ( (x) & 0x00ff000000000000 ) >> 40 ) \
   | ( ( (x) & 0x0000ff0000000000 ) >> 24 ) \
   | ( ( (x) & 0x000000ff00000000 ) >>  8 ) \
   | ( ( (x) & 0x00000000ff000000 ) <<  8 ) \
   | ( ( (x) & 0x0000000000ff0000 ) << 24 ) \
   | ( ( (x) & 0x000000000000ff00 ) << 40 ) \
   | ( ( (x) & 0x00000000000000ff ) << 56 ) )

#define bswap_32( x ) \
   ( ( ( (x) << 24 ) & 0xff000000 ) | ( ( (x) <<  8 ) & 0x00ff0000 ) \
   | ( ( (x) >>  8 ) & 0x0000ff00 ) | ( ( (x) >> 24 ) & 0x000000ff ) )

// Poorman's 2 way parallel SIMD uses u64 to bswap 2 u32
#define bswap_32x2( u64 ) \
   ( ( ( (u64) & 0xff000000ff000000 ) >> 24 ) \
   | ( ( (u64) & 0x00ff000000ff0000 ) >>  8 ) \
   | ( ( (u64) & 0x0000ff000000ff00 ) <<  8 ) \
   | ( ( (u64) & 0x000000ff000000ff ) << 24 ) )

#endif

// 128 bit byte swap, requires the uint128_t type defined below.
#define bswap_128( x ) \
   ( ( (uint128_t)( bswap_64( (uint64_t)( (x) & 0xffffffffffffffff ) ) ) << 64 ) \
   | (   (uint128_t)( bswap_64( (uint64_t)( (x) >> 64 ) ) ) ) )

// Set byte order regardless of host order.

static inline uint64_t be64( const uint64_t u64 )
{
   const uint8_t *p = (uint8_t const*)&u64;
   return ( ( ( (uint64_t)(p[7])          + ( (uint64_t)(p[6]) <<  8 ) )
            + ( ( (uint64_t)(p[5]) << 16 ) + ( (uint64_t)(p[4]) << 24 ) ) )
          + ( ( ( (uint64_t)(p[3]) << 32 ) + ( (uint64_t)(p[2]) << 40 ) )
            + ( ( (uint64_t)(p[1]) << 48 ) + ( (uint64_t)(p[0]) << 56 ) ) ) );
}

static inline uint64_t le64( const uint64_t u64 )
{
   const uint8_t *p = (uint8_t const*)&u64;
   return ( ( ( (uint64_t)(p[0])          + ( (uint64_t)(p[1]) <<  8 ) )
            + ( ( (uint64_t)(p[2]) << 16 ) + ( (uint64_t)(p[3]) << 24 ) ) )
          + ( ( ( (uint64_t)(p[4]) << 32 ) + ( (uint64_t)(p[5]) << 40 ) )
            + ( ( (uint64_t)(p[6]) << 48 ) + ( (uint64_t)(p[7]) << 56 ) ) ) );
}

static inline uint32_t be32( const uint32_t u32 )
{
   const uint8_t *p = (uint8_t const*)&u32;
   return ( ( (uint32_t)(p[3])          + ( (uint32_t)(p[2]) <<  8 ) )
          + ( ( (uint32_t)(p[1]) << 16 ) + ( (uint32_t)(p[0]) << 24 ) ) );
}

static inline uint32_t le32( const uint32_t u32 )
{
   const uint8_t *p = (uint8_t const*)&u32;
   return ( ( (uint32_t)(p[0])          + ( (uint32_t)(p[1]) <<  8 ) )
          + ( ( (uint32_t)(p[2]) << 16 ) + ( (uint32_t)(p[3]) << 24 ) ) );
}

static inline uint16_t be16( const uint16_t u16 )
{
   const uint8_t *p = (uint8_t const*)&u16;
   return ( (uint16_t)(p[1]) ) + ( (uint16_t)(p[0]) << 8 );
}

static inline uint16_t le16( const uint16_t u16 )
{
   const uint8_t *p = (uint8_t const*)&u16;
   return ( (uint16_t)(p[0]) ) + ( (uint16_t)(p[1]) << 8 );
}

// Bit rotation
#if defined(__x86_64__)

#define rol64   __rolq
#define ror64   __rorq
#define rol32   __rold
#define ror32   __rord

#elif defined(__aarch64__)

static inline uint64_t ror64( uint64_t a, const int c )
{
   uint64_t b;
   asm( "ror %0, %1, %2\n\t" : "=r"(b) : "r"(a), "r"((uint64_t)(c)) );
   return b;
}
#define rol64( a, c )   ror64( a, 64-(c) )

static inline uint32_t ror32( uint32_t a, const int c )
{
   // W register forms needed for a 32 bit rotate.
   uint32_t b;
   asm( "ror %w0, %w1, %w2\n\t" : "=r"(b) : "r"(a), "r"(c) );
   return b;
}
#define rol32( a, c )   ror32( a, 32-(c) )

#else

#define ror64( x, c )  ( ( (x) >> (c) ) | ( (x) << (64-(c)) ) )
#define rol64( x, c )  ( ( (x) << (c) ) | ( (x) >> (64-(c)) ) )
#define ror32( x, c )  ( ( (x) >> (c) ) | ( (x) << (32-(c)) ) )
#define rol32( x, c )  ( ( (x) << (c) ) | ( (x) >> (32-(c)) ) )

#endif
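// Usage sketch (added example): rot_hash_step is a hypothetical helper, not
// part of the original interface. It shows the intended pairing of the byte
// order and rotation primitives above, as in a typical hash round: input is
// canonicalized with be64 so results match across hosts of either endianness,
// then mixed with a rotate and a multiply.
static inline uint64_t rot_hash_step( uint64_t h, uint64_t data )
{
   h ^= be64( data );                // fixed byte order regardless of host
   h  = rol64( h, 31 );              // 64 bit left rotate to diffuse bits
   return h * 0x9e3779b97f4a7c15;    // common golden ratio mix constant
}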
// Safe division, integer or floating point. Returns safe_result (typically
// zero) on division by zero. For floating point it is only as safe as the
// divisor being precisely zero; values that merely round toward zero still
// divide.
#define safe_div( dividend, divisor, safe_result ) \
   ( (divisor) == 0 ? (safe_result) : ( (dividend) / (divisor) ) )

///////////////////////////////////////
//
//      128 bit integers
//
// 128 bit integers are inefficient and not a shortcut for __m128i.
// The native type __int128 is supported starting with GCC-4.8.
//
// __int128 uses two 64 bit GPRs to hold the data. The main benefit is
// real 128 bit arithmetic; when 128 bit arithmetic is not required,
// vectors are preferred because they benefit from wider registers.
// int128 also works better with other integer sizes.
//
// For safety use typecasting on all numeric arguments.
//
// Use typecasting for conversion to/from 128 bit vector:
// __m128i v128 = (__m128i)my_int128;
// __m256i v256 = _mm256_set_m128i( (__m128i)my_int128, (__m128i)my_int128 );
// my_int128 = (uint128_t)_mm256_extracti128_si256( v256, 1 );

// Compiler check for __int128 support.
// Configure also has a test for int128; the version test below is obsolete.
//#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
#define GCC_INT128 1
//#endif

#if !defined(GCC_INT128)
  #warning "__int128 not supported, requires GCC-4.8 or newer."
#endif

#if defined(GCC_INT128)

// Familiar looking type names
typedef          __int128  int128_t;
typedef unsigned __int128 uint128_t;

typedef union
{
   uint128_t u128;
   uint64_t  u64[2];
   uint32_t  u32[4];
} __attribute__ ((aligned (16))) u128_ovly;

// Extracting the low bits is a trivial cast. These specialized macros
// are optimized while providing a consistent interface.
#define u128_hi64( x )   ( (uint64_t)( (uint128_t)(x) >> 64 ) )
#define u128_lo64( x )   ( (uint64_t)(x) )

#endif // GCC_INT128

#endif // SIMD_INT_H__
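// Standalone usage sketch (added example): SIMD_INT_DEMO is a hypothetical
// macro, not part of the original build. Building a test translation unit
// that includes this header with -DSIMD_INT_DEMO exercises the 128 bit
// helpers and safe_div.
#if defined(SIMD_INT_DEMO) && defined(GCC_INT128)

#include <stdio.h>

int main(void)
{
   uint128_t x = ( (uint128_t)0x0102030405060708 << 64 )
               |   (uint128_t)0x090a0b0c0d0e0f10;

   printf( "hi=%016llx lo=%016llx\n",
           (unsigned long long)u128_hi64( x ),
           (unsigned long long)u128_lo64( x ) );

   // bswap_128 reverses all 16 bytes, swapping the halves in the process.
   uint128_t y = bswap_128( x );
   printf( "bswap hi=%016llx lo=%016llx\n",
           (unsigned long long)u128_hi64( y ),
           (unsigned long long)u128_lo64( y ) );

   // The overlay union reads individual lanes without shifts or casts.
   u128_ovly v;
   v.u128 = x;
   printf( "u32 lanes: %08x %08x %08x %08x\n",
           v.u32[0], v.u32[1], v.u32[2], v.u32[3] );

   // safe_div substitutes the fallback result on a zero divisor.
   printf( "safe_div: %d\n", safe_div( 10, 0, -1 ) );

   return 0;
}

#endif // SIMD_INT_DEMO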