#if !defined(SIMD_NEON_H__)
#define SIMD_NEON_H__ 1

#if defined(__aarch64__) && defined(__ARM_NEON)

#include <arm_neon.h>
#include <stdint.h>
#include <string.h>

// Targeted functions supporting NEON SIMD 128 & 64 bit vectors.
// Element size matters!
//
// Intel naming is generally used.
//
// Some advanced logical operations require SHA3. Prior to GCC-13 they also
// require armv8.2.
//
//   veor3q( v2, v1, v0 )                 xor3        v2 ^ v1 ^ v0
//   vxarq_u64( v1, v0, n )               ror64xor    ( v1 ^ v0 ) >>> n
//   vbcaxq_u{64,32,16,8}( v2, v1, v0 )   xorandnot   v2 ^ ( v1 & ~v0 )
//
// Not used anywhere yet:
//   vrax1q_u64( v1, v0 )                 v1 ^ ( v0 <<< 1 )
//   vsraq_n_u{64,32,16,8}( v1, v0, n )   v1 + ( v0 >> n )

#define v128_t                         uint32x4_t   // default
#define v128u64_t                      uint64x2_t
#define v128u32_t                      uint32x4_t
#define v128u16_t                      uint16x8_t
#define v128u8_t                       uint8x16_t

// load & store
#define v128_load( p )                 vld1q_u32( (uint32_t*)(p) )
#define v128_store( p, v )             vst1q_u32( (uint32_t*)(p), v )

#define v128u64_load( p )              vld1q_u64( (uint64_t*)(p) )
#define v128u64_store( p, v )          vst1q_u64( (uint64_t*)(p), v )
#define v128u32_load( p )              vld1q_u32( (uint32_t*)(p) )
#define v128u32_store( p, v )          vst1q_u32( (uint32_t*)(p), v )
#define v128u16_load( p )              vld1q_u16( (uint16_t*)(p) )
#define v128u16_store( p, v )          vst1q_u16( (uint16_t*)(p), v )
#define v128u8_load( p )               vld1q_u8( (uint8_t*)(p) )
#define v128u8_store( p, v )           vst1q_u8( (uint8_t*)(p), v )

// load & set1 combined. What if the source is already loaded?
// Don't use; leave it up to the compiler to optimize.
// Same with vld1q_lane.
#define v128_load1_64( p )             vld1q_dup_u64( (uint64_t*)(p) )
#define v128_load1_32( p )             vld1q_dup_u32( (uint32_t*)(p) )
#define v128_load1_16( p )             vld1q_dup_u16( (uint16_t*)(p) )
#define v128_load1_8( p )              vld1q_dup_u8(  (uint8_t*) (p) )

// arithmetic
#define v128_add64                     vaddq_u64
#define v128_add32                     vaddq_u32
#define v128_add16                     vaddq_u16
#define v128_add8                      vaddq_u8

#define v128_add4_64( v3, v2, v1, v0 ) \
   vaddq_u64( vaddq_u64( v3, v2 ), vaddq_u64( v1, v0 ) )

#define v128_add4_32( v3, v2, v1, v0 ) \
   vaddq_u32( vaddq_u32( v3, v2 ), vaddq_u32( v1, v0 ) )

#define v128_sub64                     vsubq_u64
#define v128_sub32                     vsubq_u32
#define v128_sub16                     vsubq_u16
#define v128_sub8                      vsubq_u8

// returns low half
#define v128_mul32                     vmulq_u32
#define v128_mul16                     vmulq_u16

// Widening multiply, realign source elements from x86_64 to NEON.
#define v128_mulw32( v1, v0 ) \
   vmull_u32( vmovn_u64( v1 ), vmovn_u64( v0 ) )

// compare
#define v128_cmpeq64                   vceqq_u64
#define v128_cmpeq32                   vceqq_u32
#define v128_cmpeq16                   vceqq_u16
#define v128_cmpeq8                    vceqq_u8

// v128_cmp0, v128_cmpz, v128_testz
#define v128_iszero                    vceqzq_u64

// Not yet needed
//#define v128_cmpeq1

// Signed
#define v128_cmpgt64( v1, v0 )   vcgtq_s64( (int64x2_t)v1, (int64x2_t)(v0) )
#define v128_cmpgt32( v1, v0 )   vcgtq_s32( (int32x4_t)v1, (int32x4_t)(v0) )
#define v128_cmpgt16( v1, v0 )   vcgtq_s16( (int16x8_t)v1, (int16x8_t)(v0) )
#define v128_cmpgt8( v1, v0 )    vcgtq_s8(  (int8x16_t)v1, (int8x16_t)(v0) )

#define v128_cmplt64( v1, v0 )   vcltq_s64( (int64x2_t)v1, (int64x2_t)(v0) )
#define v128_cmplt32( v1, v0 )   vcltq_s32( (int32x4_t)v1, (int32x4_t)(v0) )
#define v128_cmplt16( v1, v0 )   vcltq_s16( (int16x8_t)v1, (int16x8_t)(v0) )
#define v128_cmplt8( v1, v0 )    vcltq_s8(  (int8x16_t)v1, (int8x16_t)(v0) )

#define v128_cmpeq_zero                vceqzq_u64

// Logical bit shift
#define v128_sl64                      vshlq_n_u64
#define v128_sl32                      vshlq_n_u32
#define v128_sl16                      vshlq_n_u16
#define v128_sl8                       vshlq_n_u8

#define v128_sr64                      vshrq_n_u64
#define v128_sr32                      vshrq_n_u32
#define v128_sr16                      vshrq_n_u16
#define v128_sr8                       vshrq_n_u8
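// Usage sketch (illustrative only; the helper name below is hypothetical and
// not part of this header's API): the wrappers follow Intel-style naming, so
// a typical caller streams data through them much like SSE2 code would, here
// adding two buffers of 32-bit elements one 128-bit vector at a time.
static inline void v128_add32_blocks_example( uint32_t *d, const uint32_t *a,
                                              const uint32_t *b, int nvec )
{
   for ( int i = 0; i < nvec; i++ )
      v128_store( d + 4*i, v128_add32( v128_load( a + 4*i ),
                                       v128_load( b + 4*i ) ) );
}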
// Arithmetic shift.
#define v128_sra64( v, c )             vshrq_n_s64( (int64x2_t)(v), c )
#define v128_sra32( v, c )             vshrq_n_s32( (int32x4_t)(v), c )
#define v128_sra16( v, c )             vshrq_n_s16( (int16x8_t)(v), c )

// unary logic
#define v128_not                       vmvnq_u32

// binary logic
#define v128_or                        vorrq_u32
#define v128_and                       vandq_u32
#define v128_xor                       veorq_u32

// ~v1 & v0
#define v128_andnot( v1, v0 )          vbicq_u32( v0, v1 )

// ~( a ^ b ), same as (~a) ^ b
#define v128_xnor( v1, v0 )            v128_not( v128_xor( v1, v0 ) )

// ~v1 | v0, args reversed for consistency with x86_64
#define v128_ornot( v1, v0 )           vornq_u32( v0, v1 )

// ternary logic

// This will compile with GCC-11 on armv8.2 and above. At this time there is
// no known way to test the arm minor version.
#if defined(__ARM_FEATURE_SHA3)
  #define v128_xor3                    veor3q_u32
#else
  #define v128_xor3( v2, v1, v0 )      veorq_u32( veorq_u32( v2, v1 ), v0 )
#endif

// v2 & v1 & v0
#define v128_and3( v2, v1, v0 )        v128_and( v128_and( v2, v1 ), v0 )

// v2 | v1 | v0
#define v128_or3( v2, v1, v0 )         v128_or( v128_or( v2, v1 ), v0 )

// v2 ^ ( ~v1 & v0 )
#if defined(__ARM_FEATURE_SHA3)
  #define v128_xorandnot( v2, v1, v0 ) vbcaxq_u32( v2, v0, v1 )
#else
  #define v128_xorandnot( v2, v1, v0 ) v128_xor( v2, v128_andnot( v1, v0 ) )
#endif

// a ^ ( b & c )
#define v128_xorand( v2, v1, v0 )      v128_xor( v2, v128_and( v1, v0 ) )

// a & ( b ^ c )
#define v128_andxor( v2, v1, v0 )      v128_and( v2, v128_xor( v1, v0 ) )

// a ^ ( b | c )
#define v128_xoror( v2, v1, v0 )       v128_xor( v2, v128_or( v1, v0 ) )

// v2 | ( v1 & v0 )
#define v128_orand( v2, v1, v0 )       v128_or( v2, v128_and( v1, v0 ) )

// shift 2 concatenated vectors right, args reversed for consistency
// with x86_64
#define v128_alignr64( v1, v0, c )     vextq_u64( v0, v1, c )
#define v128_alignr32( v1, v0, c )     vextq_u32( v0, v1, c )
#define v128_alignr8( v1, v0, c )      vextq_u8(  v0, v1, c )

// Interleave high or low half of 2 vectors.
#define v128_unpacklo64( v1, v0 )      vzip1q_u64( v1, v0 )
#define v128_unpackhi64( v1, v0 )      vzip2q_u64( v1, v0 )
#define v128_unpacklo32( v1, v0 )      vzip1q_u32( v1, v0 )
#define v128_unpackhi32( v1, v0 )      vzip2q_u32( v1, v0 )
#define v128_unpacklo16( v1, v0 )      vzip1q_u16( v1, v0 )
#define v128_unpackhi16( v1, v0 )      vzip2q_u16( v1, v0 )
#define v128_unpacklo8( v1, v0 )       vzip1q_u8(  v1, v0 )
#define v128_unpackhi8( v1, v0 )       vzip2q_u8(  v1, v0 )

// vzipq_u32 can do hi & lo and return uint32x4x2, no 64 bit version.
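// Illustrative sketch (the helper name is hypothetical, not part of this
// header): v128_alignr64 keeps the x86_64 argument order, so with
// v0 = { A0, A1 } and v1 = { B0, B1 } (lane 0 listed first),
// v128_alignr64( v1, v0, 1 ) yields { A1, B0 }, i.e. the concatenation
// v1:v0 shifted right by one 64-bit element.
static inline uint64x2_t v128_alignr64_example( uint64x2_t v1, uint64x2_t v0 )
{
   return v128_alignr64( v1, v0, 1 );   // expands to vextq_u64( v0, v1, 1 )
}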
// AES
// consistent with Intel AES intrinsics, break up for optimizing
#define v128_aesenc( v, k ) \
   v128_xor( k, vaesmcq_u8( vaeseq_u8( v, v128_zero ) ) )

#define v128_aesenc_nokey( v ) \
   vaesmcq_u8( vaeseq_u8( v, v128_zero ) )

#define v128_aesenclast( v, k ) \
   v128_xor( k, vaeseq_u8( v, v128_zero ) )

#define v128_aesenclast_nokey( v ) \
   vaeseq_u8( v, v128_zero )

#define v128_aesdec( v, k ) \
   v128_xor( k, vaesimcq_u8( vaesdq_u8( v, v128_zero ) ) )

#define v128_aesdec_nokey( v ) \
   vaesimcq_u8( vaesdq_u8( v, v128_zero ) )

#define v128_aesdeclast( v, k ) \
   v128_xor( k, vaesdq_u8( v, v128_zero ) )

#define v128_aesdeclast_nokey( v ) \
   vaesdq_u8( v, v128_zero )

typedef union
{
   uint32x4_t v128;
   uint32x4_t m128;
   uint32_t   u32[4];
} __attribute__ ((aligned (16))) v128_ovly;

// Broadcast lane 0 to all lanes, consistent with x86_64 broadcast
#define v128_bcast64(v)                vdupq_laneq_u64( v, 0 )
#define v128_bcast32(v)                vdupq_laneq_u32( v, 0 )
#define v128_bcast16(v)                vdupq_laneq_u16( v, 0 )

// Broadcast lane l to all lanes
#define v128_duplane64( v, l )         vdupq_laneq_u64( v, l )
#define v128_duplane32( v, l )         vdupq_laneq_u32( v, l )
#define v128_duplane16( v, l )         vdupq_laneq_u16( v, l )

// pointer indexing
#define casti_v128( p, i )             (((uint32x4_t*)(p))[i])
#define cast_v128( p )                 (*((uint32x4_t*)(p)))
#define castp_v128( p )                ((uint32x4_t*)(p))

#define casti_v128u64( p, i )          (((uint64x2_t*)(p))[i])
#define cast_v128u64( p )              (*((uint64x2_t*)(p)))
#define castp_v128u64( p )             ((uint64x2_t*)(p))

#define casti_v128u32( p, i )          (((uint32x4_t*)(p))[i])
#define cast_v128u32( p )              (*((uint32x4_t*)(p)))
#define castp_v128u32( p )             ((uint32x4_t*)(p))

// set1
#define v128_64                        vmovq_n_u64
#define v128_32                        vmovq_n_u32
#define v128_16                        vmovq_n_u16
#define v128_8                         vmovq_n_u8

#define v128_zero                      v128_64( 0ull )
#define v128_neg1                      v128_64( 0xffffffffffffffffull )

#define v64_set32( u32_1, u32_0 ) \
   vcreate_u32( ( (uint64_t)(u32_1) << 32 ) | (uint64_t)(u32_0) )

#define v64_set16( u16_3, u16_2, u16_1, u16_0 ) \
   vcreate_u16( ( (uint64_t)( ( (uint32_t)(u16_3) << 16 ) \
                              | (uint32_t)(u16_2) ) << 32 ) \
              | ( (uint64_t)( ( (uint32_t)(u16_1) << 16 ) \
                              | (uint32_t)(u16_0) ) ) )

#define v64_set8( u8_7, u8_6, u8_5, u8_4, u8_3, u8_2, u8_1, u8_0 ) \
   vcreate_u8( \
     ( (uint64_t)( ( (uint32_t)( ((uint16_t)(u8_7) << 8) \
                                | (uint16_t)(u8_6) ) << 16 ) \
                 | ( (uint32_t)( ((uint16_t)(u8_5) << 8) \
                                | (uint16_t)(u8_4) ) ) ) << 32 ) \
   | ( (uint64_t)( ( (uint32_t)( ((uint16_t)(u8_3) << 8) \
                                | (uint16_t)(u8_2) ) << 16 ) \
                 | ( (uint32_t)( ((uint16_t)(u8_1) << 8) \
                                | (uint16_t)(u8_0) ) ) ) ) )

#define v128_set64( u64_1, u64_0 ) \
   vcombine_u64( vcreate_u64( u64_0 ), vcreate_u64( u64_1 ) )

#define v128_set32( u32_3, u32_2, u32_1, u32_0 ) \
   vcombine_u32( v64_set32( u32_1, u32_0 ), v64_set32( u32_3, u32_2 ) )

#define v128_set16( u16_7, u16_6, u16_5, u16_4, u16_3, u16_2, u16_1, u16_0 ) \
   vcombine_u16( v64_set16( u16_3, u16_2, u16_1, u16_0 ), \
                 v64_set16( u16_7, u16_6, u16_5, u16_4 ) )

#define v128_set8( u8_f, u8_e, u8_d, u8_c, u8_b, u8_a, u8_9, u8_8, \
                   u8_7, u8_6, u8_5, u8_4, u8_3, u8_2, u8_1, u8_0 ) \
   vcombine_u8( v64_set8( u8_7, u8_6, u8_5, u8_4, u8_3, u8_2, u8_1, u8_0 ), \
                v64_set8( u8_f, u8_e, u8_d, u8_c, u8_b, u8_a, u8_9, u8_8 ) )
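// Illustrative sketch (hypothetical helper, not part of this header): the
// set macros list the highest element first, matching x86_64 _mm_set_*
// argument order, so lane 0 receives the last argument.
static inline uint64x2_t v128_set64_example( void )
{
   uint64x2_t v = v128_set64( 0x1111111111111111ull, 0x2222222222222222ull );
   // vgetq_lane_u64( v, 0 ) == 0x2222222222222222ull
   // vgetq_lane_u64( v, 1 ) == 0x1111111111111111ull
   return v;
}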
// move single element from source to dest, lanes must be immediate constant
// same as xim?
#define v128_movlane64( v1, l1, v0, l0 )   vcopyq_laneq_u64( v1, l1, v0, l0 )
#define v128_movlane32( v1, l1, v0, l0 )   vcopyq_laneq_u32( v1, l1, v0, l0 )
#define v128_movlane16( v1, l1, v0, l0 )   vcopyq_laneq_u16( v1, l1, v0, l0 )
#define v128_movlane8( v1, l1, v0, l0 )    vcopyq_laneq_u8(  v1, l1, v0, l0 )

#define v128_get64( v, l )             vgetq_lane_u64( v, l )
#define v128_get32( v, l )             vgetq_lane_u32( v, l )
#define v128_get16( v, l )             vgetq_lane_u16( v, l )
#define v128_get8( v, l )              vgetq_lane_u8(  v, l )

#define v128_put64( v, i64, l )        vsetq_lane_u64( i64, v, l )
#define v128_put32( v, i32, l )        vsetq_lane_u32( i32, v, l )
#define v128_put16( v, i16, l )        vsetq_lane_u16( i16, v, l )
#define v128_put8( v, i8, l )          vsetq_lane_u8(  i8,  v, l )

#define v128_negate64                  vnegq_s64
#define v128_negate32                  vnegq_s32
#define v128_negate16                  vnegq_s16
#define v128_negate8                   vnegq_s8

// Nothing else seems to work
static inline void v128_memset_zero( void *dst, const int n )
{
   memset( dst, 0, n*16 );
}

static inline void v128_memset( void *dst, const void *src, const int n )
{
   for ( int i = 0; i < n; i++ )
      ((uint32x4_t*)dst)[i] = ((const uint32x4_t*)src)[i];
}

static inline void v128_memcpy( void *dst, const void *src, const int n )
{
   for ( int i = 0; i < n; i++ )
      ((uint32x4_t*)dst)[i] = ((const uint32x4_t*)src)[i];
}

// How to build a bitmask from vector elements? Efficiently???
#define v128_movmask32
#define v128_movmask64
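// Illustrative sketch only (the function below is hypothetical and not part
// of this header's API): one way to emulate x86_64 _mm_movemask_ps is to
// shift each 32-bit sign bit down to bit 0, weight the lanes, and do a
// horizontal add.
static inline unsigned int v128_movmask32_sketch( uint32x4_t v )
{
   const uint32x4_t bits = { 1, 2, 4, 8 };        // weight for each lane
   uint32x4_t sign = vshrq_n_u32( v, 31 );        // 0 or 1 per lane
   return vaddvq_u32( vmulq_u32( sign, bits ) );  // collect into 4-bit mask
}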
// Bit rotation
#define v128_ror64( v, c ) \
   ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \
                 : vsriq_n_u64( vshlq_n_u64( ((uint64x2_t)(v)), 64-(c) ), \
                                ((uint64x2_t)(v)), c )

#define v128_rol64( v, c ) \
   ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \
                 : vsliq_n_u64( vshrq_n_u64( ((uint64x2_t)(v)), 64-(c) ), \
                                ((uint64x2_t)(v)), c )

#define v128_ror32( v, c ) \
   ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)(v)) ) \
                 : vsriq_n_u32( vshlq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \
                                ((uint32x4_t)(v)), c )

#define v128_rol32( v, c ) \
   ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)(v)) ) \
                 : vsliq_n_u32( vshrq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \
                                ((uint32x4_t)(v)), c )

#define v128_ror16( v, c ) \
   ( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)(v)) ) \
                : vsriq_n_u16( vshlq_n_u16( ((uint16x8_t)(v)), 16-(c) ), \
                               ((uint16x8_t)(v)), c )

#define v128_rol16( v, c ) \
   ( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)(v)) ) \
                : vsliq_n_u16( vshrq_n_u16( ((uint16x8_t)(v)), 16-(c) ), \
                               ((uint16x8_t)(v)), c )

#define v128_ror8( v, c ) \
   vsriq_n_u8( vshlq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \
               ((uint8x16_t)(v)), c )

#define v128_rol8( v, c ) \
   vsliq_n_u8( vshrq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \
               ((uint8x16_t)(v)), c )

// ( v1 ^ v0 ) >>> c
#if defined(__ARM_FEATURE_SHA3)
  #define v128_ror64xor( v1, v0, c )   vxarq_u64( v1, v0, c )
#else
  #define v128_ror64xor( v1, v0, c )   v128_ror64( v128_xor( v1, v0 ), c )
#endif

// Rotate both vectors in place by the same immediate count.
#define v128_2ror64( v1, v0, c ) \
{ \
   uint64x2_t t0 = vshrq_n_u64( v0, c ); \
   uint64x2_t t1 = vshrq_n_u64( v1, c ); \
   v0 = vshlq_n_u64( v0, 64-(c) ); \
   v1 = vshlq_n_u64( v1, 64-(c) ); \
   v0 = vorrq_u64( v0, t0 ); \
   v1 = vorrq_u64( v1, t1 ); \
}

#define v128_2rol64( v1, v0, c ) \
{ \
   uint64x2_t t0 = vshlq_n_u64( v0, c ); \
   uint64x2_t t1 = vshlq_n_u64( v1, c ); \
   v0 = vshrq_n_u64( v0, 64-(c) ); \
   v1 = vshrq_n_u64( v1, 64-(c) ); \
   v0 = vorrq_u64( v0, t0 ); \
   v1 = vorrq_u64( v1, t1 ); \
}

#define v128_2ror32( v1, v0, c ) \
{ \
   uint32x4_t t0 = vshrq_n_u32( v0, c ); \
   uint32x4_t t1 = vshrq_n_u32( v1, c ); \
   v0 = vshlq_n_u32( v0, 32-(c) ); \
   v1 = vshlq_n_u32( v1, 32-(c) ); \
   v0 = vorrq_u32( v0, t0 ); \
   v1 = vorrq_u32( v1, t1 ); \
}

#define v128_2rol32( v1, v0, c ) \
{ \
   uint32x4_t t0 = vshlq_n_u32( v0, c ); \
   uint32x4_t t1 = vshlq_n_u32( v1, c ); \
   v0 = vshrq_n_u32( v0, 32-(c) ); \
   v1 = vshrq_n_u32( v1, 32-(c) ); \
   v0 = vorrq_u32( v0, t0 ); \
   v1 = vorrq_u32( v1, t1 ); \
}

/* not used anywhere and hopefully never will
// vector mask, use as last resort. prefer tbl, rev, alignr, etc
#define v128_shufflev32( v, vmask ) \
   v128_set32( ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[3] ], \
               ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[2] ], \
               ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[1] ], \
               ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[0] ] )
*/

#define v128_shuffle8( v, vmask ) \
   vqtbl1q_u8( (uint8x16_t)(v), (uint8x16_t)(vmask) )

// Sub-vector shuffles sometimes mirror bit rotation. Shuffle is faster.
// Bit rotation already promotes faster widths. Usage is context sensitive.
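// Usage sketch (hypothetical helper, for illustration only): the two-vector
// rotate macros modify both arguments in place by the same immediate count,
// letting the compiler interleave the two independent shift chains.
static inline void v128_2ror64_example( uint64x2_t *a, uint64x2_t *b )
{
   uint64x2_t x = *a, y = *b;
   v128_2ror64( y, x, 17 );   // x >>>= 17, y >>>= 17
   *a = x;
   *b = y;
}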
// reverse elements in vector lanes
#define v128_qrev32                    vrev64q_u32
#define v128_swap64_32                 vrev64q_u32   // grandfathered

#define v128_qrev16                    vrev64q_u16
#define v128_lrev16                    vrev32q_u16   // aka bswap

// #define v128_qrev8                  vrev64q_u8
// #define v128_lrev8                  vrev32q_u8
// #define v128_wrev8                  vrev16q_u8

// full vector rotation

// reverse elements in vector
static inline uint64x2_t v128_rev64( uint64x2_t v )
{   return vextq_u64( v, v, 1 );   }
#define v128_swap64                    v128_rev64    // grandfathered

#define v128_rev32(v)                  v128_rev64( v128_qrev32( v ) )

// shuffle-rotate vector elements
static inline uint32x4_t v128_shuflr32( uint32x4_t v )
{   return vextq_u32( v, v, 1 );   }

static inline uint32x4_t v128_shufll32( uint32x4_t v )
{   return vextq_u32( v, v, 3 );   }

// reverse bits in bytes, nothing like it in x86_64
#define v128_bitrev8                   vrbitq_u8

// reverse byte order
#define v128_bswap16(v)    (uint16x8_t)vrev16q_u8( (uint8x16_t)(v) )
#define v128_bswap32(v)    (uint32x4_t)vrev32q_u8( (uint8x16_t)(v) )
#define v128_bswap64(v)    (uint64x2_t)vrev64q_u8( (uint8x16_t)(v) )
#define v128_bswap128(v)   (uint32x4_t)v128_rev64( v128_bswap64(v) )

// Useful for x86_64 but does nothing for ARM
#define v128_block_bswap32( dst, src ) \
{ \
   casti_v128u32( dst,0 ) = v128_bswap32( casti_v128u32( src,0 ) ); \
   casti_v128u32( dst,1 ) = v128_bswap32( casti_v128u32( src,1 ) ); \
   casti_v128u32( dst,2 ) = v128_bswap32( casti_v128u32( src,2 ) ); \
   casti_v128u32( dst,3 ) = v128_bswap32( casti_v128u32( src,3 ) ); \
   casti_v128u32( dst,4 ) = v128_bswap32( casti_v128u32( src,4 ) ); \
   casti_v128u32( dst,5 ) = v128_bswap32( casti_v128u32( src,5 ) ); \
   casti_v128u32( dst,6 ) = v128_bswap32( casti_v128u32( src,6 ) ); \
   casti_v128u32( dst,7 ) = v128_bswap32( casti_v128u32( src,7 ) ); \
}
#define v128_block_bswap32_256         v128_block_bswap32

#define v128_block_bswap32_512( dst, src ) \
{ \
   casti_v128u32( dst, 0 ) = v128_bswap32( casti_v128u32( src, 0 ) ); \
   casti_v128u32( dst, 1 ) = v128_bswap32( casti_v128u32( src, 1 ) ); \
   casti_v128u32( dst, 2 ) = v128_bswap32( casti_v128u32( src, 2 ) ); \
   casti_v128u32( dst, 3 ) = v128_bswap32( casti_v128u32( src, 3 ) ); \
   casti_v128u32( dst, 4 ) = v128_bswap32( casti_v128u32( src, 4 ) ); \
   casti_v128u32( dst, 5 ) = v128_bswap32( casti_v128u32( src, 5 ) ); \
   casti_v128u32( dst, 6 ) = v128_bswap32( casti_v128u32( src, 6 ) ); \
   casti_v128u32( dst, 7 ) = v128_bswap32( casti_v128u32( src, 7 ) ); \
   casti_v128u32( dst, 8 ) = v128_bswap32( casti_v128u32( src, 8 ) ); \
   casti_v128u32( dst, 9 ) = v128_bswap32( casti_v128u32( src, 9 ) ); \
   casti_v128u32( dst,10 ) = v128_bswap32( casti_v128u32( src,10 ) ); \
   casti_v128u32( dst,11 ) = v128_bswap32( casti_v128u32( src,11 ) ); \
   casti_v128u32( dst,12 ) = v128_bswap32( casti_v128u32( src,12 ) ); \
   casti_v128u32( dst,13 ) = v128_bswap32( casti_v128u32( src,13 ) ); \
   casti_v128u32( dst,14 ) = v128_bswap32( casti_v128u32( src,14 ) ); \
   casti_v128u32( dst,15 ) = v128_bswap32( casti_v128u32( src,15 ) ); \
}

#define v128_block_bswap64( dst, src ) \
{ \
   casti_v128u64( dst,0 ) = v128_bswap64( casti_v128u64( src,0 ) ); \
   casti_v128u64( dst,1 ) = v128_bswap64( casti_v128u64( src,1 ) ); \
   casti_v128u64( dst,2 ) = v128_bswap64( casti_v128u64( src,2 ) ); \
   casti_v128u64( dst,3 ) = v128_bswap64( casti_v128u64( src,3 ) ); \
   casti_v128u64( dst,4 ) = v128_bswap64( casti_v128u64( src,4 ) ); \
   casti_v128u64( dst,5 ) = v128_bswap64( casti_v128u64( src,5 ) ); \
   casti_v128u64( dst,6 ) = v128_bswap64( casti_v128u64( src,6 ) ); \
   casti_v128u64( dst,7 ) = v128_bswap64( casti_v128u64( src,7 ) ); \
}
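// Usage sketch (hypothetical function name and buffers, for illustration):
// byte-swap a 128-byte block of big-endian 32-bit words, e.g. message data
// being prepared for a hash function. Buffers are assumed 16-byte aligned.
static inline void v128_block_bswap32_example( uint32_t dst[32],
                                               const uint32_t src[32] )
{
   v128_block_bswap32( dst, src );   // eight 128-bit vectors, 32 words
}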
#define v128_block_bswap64_512         v128_block_bswap64

#define v128_block_bswap64_1024( dst, src ) \
{ \
   casti_v128u64( dst, 0 ) = v128_bswap64( casti_v128u64( src, 0 ) ); \
   casti_v128u64( dst, 1 ) = v128_bswap64( casti_v128u64( src, 1 ) ); \
   casti_v128u64( dst, 2 ) = v128_bswap64( casti_v128u64( src, 2 ) ); \
   casti_v128u64( dst, 3 ) = v128_bswap64( casti_v128u64( src, 3 ) ); \
   casti_v128u64( dst, 4 ) = v128_bswap64( casti_v128u64( src, 4 ) ); \
   casti_v128u64( dst, 5 ) = v128_bswap64( casti_v128u64( src, 5 ) ); \
   casti_v128u64( dst, 6 ) = v128_bswap64( casti_v128u64( src, 6 ) ); \
   casti_v128u64( dst, 7 ) = v128_bswap64( casti_v128u64( src, 7 ) ); \
   casti_v128u64( dst, 8 ) = v128_bswap64( casti_v128u64( src, 8 ) ); \
   casti_v128u64( dst, 9 ) = v128_bswap64( casti_v128u64( src, 9 ) ); \
   casti_v128u64( dst,10 ) = v128_bswap64( casti_v128u64( src,10 ) ); \
   casti_v128u64( dst,11 ) = v128_bswap64( casti_v128u64( src,11 ) ); \
   casti_v128u64( dst,12 ) = v128_bswap64( casti_v128u64( src,12 ) ); \
   casti_v128u64( dst,13 ) = v128_bswap64( casti_v128u64( src,13 ) ); \
   casti_v128u64( dst,14 ) = v128_bswap64( casti_v128u64( src,14 ) ); \
   casti_v128u64( dst,15 ) = v128_bswap64( casti_v128u64( src,15 ) ); \
}

// Bitwise blend using vector mask, use only bytewise for compatibility
// with x86_64.
#define v128_blendv( v1, v0, mask )    vbslq_u32( mask, v0, v1 )

#endif   // __ARM_NEON
#endif   // SIMD_NEON_H__