#if defined(__aarch64__) && defined(__ARM_NEON)

// Targeted functions exposed through generic names make a separate portable
// layer unnecessary.
#define v128_t   uint32x4_t

// load & store
#define v128_load( p )       vld1q_u32( (uint32_t*)(p) )
#define v128_store( p, v )   vst1q_u32( (uint32_t*)(p), v )

// arithmetic
#define v128_add64   vaddq_u64
#define v128_add32   vaddq_u32
#define v128_add16   vaddq_u16
#define v128_add8    vaddq_u8

#define v128_sub64   vsubq_u64
#define v128_sub32   vsubq_u32
#define v128_sub16   vsubq_u16
#define v128_sub8    vsubq_u8

// return the low half of the product
// NEON has no 64 bit element multiply, leave v128_mullo64 undefined
//#define v128_mullo64
#define v128_mullo32   vmulq_u32
#define v128_mullo16   vmulq_u16

// widening multiply not working, use same-width multiplies as placeholders
//#define v128_mul32   vmull_u32
//#define v128_mul16   vmull_u16
//#define v128_mul64   // no 64 bit element multiply in NEON
#define v128_mul32     vmulq_u32
#define v128_mul16     vmulq_u16

// compare
#define v128_cmpeq64   vceqq_u64
#define v128_cmpeq32   vceqq_u32
#define v128_cmpeq16   vceqq_u16

#define v128_cmpgt64   vcgtq_u64
#define v128_cmpgt32   vcgtq_u32
#define v128_cmpgt16   vcgtq_u16

#define v128_cmplt64   vcltq_u64
#define v128_cmplt32   vcltq_u32
#define v128_cmplt16   vcltq_u16

// bit shift & rotate
#define v128_sl64   vshlq_n_u64
#define v128_sl32   vshlq_n_u32
#define v128_sl16   vshlq_n_u16

#define v128_sr64   vshrq_n_u64
#define v128_sr32   vshrq_n_u32
#define v128_sr16   vshrq_n_u16

#define v128_sra64   vshrq_n_s64
#define v128_sra32   vshrq_n_s32
#define v128_sra16   vshrq_n_s16

// logical ops
#define v128_or    vorrq_u32
#define v128_and   vandq_u32
#define v128_not   vmvnq_u32
#define v128_xor   veorq_u32

#define v128_xor3( v2, v1, v0 )   v128_xor( v2, v128_xor( v1, v0 ) )
//#define v128_xor3   veor3q_u32

// ~( v1 | v0 )
#define v128_nor( v1, v0 )        vmvnq_u32( vorrq_u32( v1, v0 ) )

// ~v1 & v0
#define v128_andnot( v1, v0 )     vandq_u32( vmvnq_u32( v1 ), v0 )

#define v128_xorandnot( v2, v1, v0 )   v128_xor( v2, v128_andnot( v1, v0 ) )
#define v128_and3( a, b, c )      v128_and( a, v128_and( b, c ) )
#define v128_or3( a, b, c )       v128_or( a, v128_or( b, c ) )
#define v128_xorand( a, b, c )    v128_xor( a, v128_and( b, c ) )
#define v128_andxor( a, b, c )    v128_and( a, v128_xor( b, c ) )
#define v128_xoror( a, b, c )     v128_xor( a, v128_or( b, c ) )
#define v128_orand( a, b, c )     v128_or( a, v128_and( b, c ) )
#define v128_xnor( a, b )         v128_not( v128_xor( a, b ) )

#define v128_alignr64   vextq_u64
#define v128_alignr32   vextq_u32
#define v128_alignr8    vextq_u8

#define v128_unpacklo64   vzip1q_u64
#define v128_unpackhi64   vzip2q_u64
#define v128_unpacklo32   vzip1q_u32
#define v128_unpackhi32   vzip2q_u32
#define v128_unpacklo16   vzip1q_u16
#define v128_unpackhi16   vzip2q_u16
#define v128_unpacklo8    vzip1q_u8
#define v128_unpackhi8    vzip2q_u8

// AES
// named consistently with Intel AES, broken up for optimizing
#define v128_aesenc( v, k )       vaesmcq_u8( vaeseq_u8( v, k ) )
#define v128_aesenclast( v, k )   vaeseq_u8( v, k )
#define v128_aesdec( v, k )       vaesimcq_u8( vaesdq_u8( v, k ) )
#define v128_aesdeclast( v, k )   vaesdq_u8( v, k )

// pointer indexing
#define casti_v128( p, i )   (((uint32x4_t*)(p))[i])
#define cast_v128( p )       (*((uint32x4_t*)(p)))

// Many NEON intrinsics are element-size typed when they don't need to be
// (a zero vector, for example), which may cause the compiler to complain when
// element sizes don't match. Build with "-flax-vector-conversions".
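// Illustrative only, not part of the interface above: a minimal sketch showing
// how the size-generic names are meant to compose. The function name
// v128_xorand_add_example is hypothetical.
static inline v128_t v128_xorand_add_example( v128_t a, v128_t b, v128_t c )
{
   // computes ( a + b ) ^ ( b & c ) entirely through the generic wrappers,
   // so the same expression can be reused by non-NEON builds of this layer
   return v128_xorand( v128_add32( a, b ), b, c );
}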
#define u32_to_u64   vreinterpretq_u64_u32
#define u64_to_u32   vreinterpretq_u32_u64
#define u64_to_u8    vreinterpretq_u8_u64
#define u8_to_u64    vreinterpretq_u64_u8
#define u32_to_u8    vreinterpretq_u8_u32
#define u8_to_u32    vreinterpretq_u32_u8
#define u16_to_u8    vreinterpretq_u8_u16
#define u8_to_u16    vreinterpretq_u16_u8

#define v128_zero   v128_64( 0ull )
//#define v128_zero_fn()   v128_64( 0ull )
//#define v128_zero   v128_zero_fn

// set1
#define v128_32   vmovq_n_u32
#define v128_64   vmovq_n_u64

#define v128_set64( u64_1, u64_0 ) \
   ( (uint64x2_t)( ( (uint128_t)(u64_1) << 64 ) | (uint128_t)(u64_0) ) )
#define v128_set_64   v128_set64   // deprecated

#define v128_set32( u32_3, u32_2, u32_1, u32_0 ) \
   (uint32x4_t)( ( (uint128_t)(u32_3) << 96 ) | ( (uint128_t)(u32_2) << 64 ) \
               | ( (uint128_t)(u32_1) << 32 ) | ( (uint128_t)(u32_0) ) )
#define v128_set_32   v128_set32   // deprecated

static inline void v128_memset_zero( uint32x4_t *dst, const int n )
{
   for ( int i = 0; i < n; i++ )
      dst[i] = (uint32x4_t)(uint128_t)0;
}

static inline void v128_memset( uint32x4_t *dst, const uint32x4_t *src,
                                const int n )
{
   for ( int i = 0; i < n; i++ )
      dst[i] = src[i];
}

static inline void v128_memcpy( uint32x4_t *dst, const uint32x4_t *src,
                                const int n )
{
   for ( int i = 0; i < n; i++ )
      dst[i] = src[i];
}

// select src & dst lanes
#define v128_mov32( dst, ld, src, ls )   vcopyq_laneq_u32( dst, ld, src, ls )

// move a u64 to lane 0 with lane 1 zeroed; NEON lane inserts need a source
// vector to write into, so build the vector with an integer cast instead
#define v128_mov64( u64 )   (uint64x2_t)(uint128_t)(u64)

static inline uint64x2_t v128_negate64( uint64x2_t v )
{ return v128_sub64( v128_xor( v, v ), v ); }

static inline uint32x4_t v128_negate32( uint32x4_t v )
{ return v128_sub32( v128_xor( v, v ), v ); }

static inline uint16x8_t v128_negate16( uint16x8_t v )
{ return v128_sub16( v128_xor( v, v ), v ); }

#define v128_add4_32( v3, v2, v1, v0 ) \
   vaddq_u32( vaddq_u32( v3, v2 ), vaddq_u32( v1, v0 ) )

// how to build a bitmask from vector elements?
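// Illustrative only: one possible answer to the question above, a minimal
// sketch of a 32 bit element movemask (MSB of each lane packed into bits 0-3).
// The name v128_movmask32_sketch is hypothetical; the v128_movmask32/64
// placeholders below deliberately fail at compile time until they are needed.
static inline unsigned int v128_movmask32_sketch( uint32x4_t v )
{
   // isolate the sign bit of each 32 bit lane
   uint32x4_t m = vshrq_n_u32( v, 31 );
   // move lane i's bit to bit position i
   const int32x4_t shifts = { 0, 1, 2, 3 };
   m = vshlq_u32( m, shifts );
   // horizontal add packs the four bits into a scalar mask
   return vaddvq_u32( m );
}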
#define v128_movmask32   _Static_assert( 0, "No ARM target: v128_movmask32" )
#define v128_movmask64   _Static_assert( 0, "No ARM target: v128_movmask64" )

static inline uint64x2_t v128_ror64( uint64x2_t v, const int c )
{ return vsriq_n_u64( vsliq_n_u64( v, v, 64-(c) ), v, c ); }

static inline uint64x2_t v128_rol64( uint64x2_t v, const int c )
{ return vsriq_n_u64( vsliq_n_u64( v, v, c ), v, 64-(c) ); }

static inline uint32x4_t v128_ror32( uint32x4_t v, const int c )
{ return vsriq_n_u32( vsliq_n_u32( v, v, 32-(c) ), v, c ); }

static inline uint32x4_t v128_rol32( uint32x4_t v, const int c )
{ return vsriq_n_u32( vsliq_n_u32( v, v, c ), v, 32-(c) ); }

static inline uint16x8_t v128_ror16( uint16x8_t v, const int c )
{ return vsriq_n_u16( vsliq_n_u16( v, v, 16-(c) ), v, c ); }

static inline uint16x8_t v128_rol16( uint16x8_t v, const int c )
{ return vsriq_n_u16( vsliq_n_u16( v, v, c ), v, 16-(c) ); }

// reverse endian byte order
#define v128_bswap16(v)    u8_to_u16( vrev16q_u8( u16_to_u8(v) ) )
#define v128_bswap32(v)    u8_to_u32( vrev32q_u8( u32_to_u8(v) ) )
#define v128_bswap64(v)    u8_to_u64( vrev64q_u8( u64_to_u8(v) ) )
#define v128_bswap128(v)   v128_swap64( v128_bswap64(v) )

#define v128_block_bswap32( dst, src ) \
   casti_v128( dst, 0 ) = v128_bswap32( casti_v128( src, 0 ) ); \
   casti_v128( dst, 1 ) = v128_bswap32( casti_v128( src, 1 ) ); \
   casti_v128( dst, 2 ) = v128_bswap32( casti_v128( src, 2 ) ); \
   casti_v128( dst, 3 ) = v128_bswap32( casti_v128( src, 3 ) ); \
   casti_v128( dst, 4 ) = v128_bswap32( casti_v128( src, 4 ) ); \
   casti_v128( dst, 5 ) = v128_bswap32( casti_v128( src, 5 ) ); \
   casti_v128( dst, 6 ) = v128_bswap32( casti_v128( src, 6 ) ); \
   casti_v128( dst, 7 ) = v128_bswap32( casti_v128( src, 7 ) );

#define v128_block_bswap64( dst, src ) \
   dst[0] = v128_bswap64( src[0] ); \
   dst[1] = v128_bswap64( src[1] ); \
   dst[2] = v128_bswap64( src[2] ); \
   dst[3] = v128_bswap64( src[3] ); \
   dst[4] = v128_bswap64( src[4] ); \
   dst[5] = v128_bswap64( src[5] ); \
   dst[6] = v128_bswap64( src[6] ); \
   dst[7] = v128_bswap64( src[7] );

#define v128_rev32( v )   vrev64q_u32( v )

static inline uint32x4_t v128_swap64( uint32x4_t v )
{ return vextq_u64( v, v, 1 ); }

static inline uint32x4_t v128_swap32( uint32x4_t v )
{ return vextq_u32( v, v, 2 ); }

static inline uint32x4_t v128_shuflr32( uint32x4_t v )
{ return vextq_u32( v, v, 1 ); }

static inline uint32x4_t v128_shufll32( uint32x4_t v )
{ return vextq_u32( v, v, 3 ); }

#define v128_swap64_32(v)     v128_ror64( v, 32 )
#define v128_shuflr64_24(v)   v128_ror64( v, 24 )
#define v128_shuflr64_16(v)   v128_ror64( v, 16 )
#define v128_swap32_16(v)     v128_ror32( v, 16 )
#define v128_shuflr32_8(v)    v128_ror32( v, 8 )

// Not the same as SSE2: this takes a vector mask, SSE2 blend uses an imm8 mask.
#define v128_blend16( v1, v0, mask ) \
   v128_or( v128_and( mask, v1 ), v128_andnot( mask, v0 ) )

#endif
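// Illustrative only: a minimal usage sketch under the same AArch64/NEON guard,
// showing typical use of the byte-swap and rotate helpers defined above.
// The name v128_bswap_ror_example is hypothetical.
#if defined(__aarch64__) && defined(__ARM_NEON)
static inline void v128_bswap_ror_example( uint32x4_t *dst,
                                           const uint32x4_t *src )
{
   // load a 128 bit vector, convert endianness of each 32 bit lane,
   // rotate each lane right by 7 bits, and store the result
   uint32x4_t v = v128_load( src );
   v = v128_ror32( v128_bswap32( v ), 7 );
   v128_store( dst, v );
}
#endif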