#ifndef __VECTOR_H__
#define __VECTOR_H__

#include "compat.h"

#if defined(__GNUC__) && !defined(__INTEL_COMPILER)

/*******************************
 * Using GCC vector extensions *
 *******************************/

#if defined(__SSE2__)

//typedef unsigned char v16qi __attribute__ ((vector_size (16)));
typedef char           v16qi __attribute__ ((vector_size (16)));
typedef short          v8hi  __attribute__ ((vector_size (16)));
typedef int            v4si  __attribute__ ((vector_size (16)));
typedef float          v4sf  __attribute__ ((vector_size (16)));
typedef long long int  v2di  __attribute__ ((vector_size (16)));

typedef short          v4hi  __attribute__ ((vector_size (8)));
typedef unsigned char  v8qi  __attribute__ ((vector_size (8)));

typedef v16qi v8;
typedef v8hi  v16;
typedef v4si  v32;
#define V16_SIZE 8

union cv {
  unsigned short u16[8];
  v16 v16;
};

union cv8 {
  unsigned char u8[16];
  v8 v8;
};

union u32 {
  u32 u[4];   /* the scalar u32 type comes from compat.h */
  v32 v;
};

#define V3216(x) ((v16) (x))
#define V1632(x) ((v32) (x))
#define V168(x)  ( (v8) (x))
#define V816(x)  ((v16) (x))

#if 0
/* These instructions are shorter than the PAND/POR/... that GCC uses */
#define vec_and(x,y)  ({v16 a = (v16) x; v16 b = (v16) y; __builtin_ia32_andps ((v4sf) a, (v4sf) b);})
#define vec_or(x,y)   ({v16 a = (v16) x; v16 b = (v16) y; __builtin_ia32_orps  ((v4sf) a, (v4sf) b);})
#define vec_xor(x,y)  ({v16 a = (v16) x; v16 b = (v16) y; __builtin_ia32_xorps ((v4sf) a, (v4sf) b);})
#define vec_andn(x,y) ({v16 a = (v16) x; v16 b = (v16) y; __builtin_ia32_andnps((v4sf) a, (v4sf) b);})

#define v16_and(x,y)  ((v16) vec_and ((x), (y)))
#define v16_or(x,y)   ((v16) vec_or  ((x), (y)))
#define v16_xor(x,y)  ((v16) vec_xor ((x), (y)))
#define v16_andn(x,y) ((v16) vec_andn((x), (y)))

#define v32_and(x,y)  ((v32) vec_and ((x), (y)))
#define v32_or(x,y)   ((v32) vec_or  ((x), (y)))
#define v32_xor(x,y)  ((v32) vec_xor ((x), (y)))
#define v32_andn(x,y) ((v32) vec_andn((x), (y)))
#endif

#define vec_and(x,y) ((x)&(y))
#define vec_or(x,y)  ((x)|(y))
#define vec_xor(x,y) ((x)^(y))

#define v16_and vec_and
#define v16_or  vec_or
#define v16_xor vec_xor

#define v32_and vec_and
#define v32_or  vec_or
#define v32_xor vec_xor

#define vec_andn(x,y) __builtin_ia32_pandn128 ((v2di) x, (v2di) y)
#define v16_andn(x,y) ((v16) vec_andn(x,y))
#define v32_andn(x,y) ((v32) vec_andn(x,y))

#define v32_add(x,y) ((x)+(y))

#define v16_add(x,y) ((x)+(y))
#define v16_sub(x,y) ((x)-(y))
#define v16_mul(x,y) ((x)*(y))
#define v16_neg(x)   (-(x))

#define v16_shift_l  __builtin_ia32_psllwi128
#define v16_shift_r  __builtin_ia32_psrawi128
#define v16_cmp      __builtin_ia32_pcmpgtw128

#define v16_interleavel   __builtin_ia32_punpcklwd128
#define v16_interleaveh   __builtin_ia32_punpckhwd128

#define v16_mergel(a,b)   V1632(__builtin_ia32_punpcklwd128(a,b))
#define v16_mergeh(a,b)   V1632(__builtin_ia32_punpckhwd128(a,b))

#define v8_mergel(a,b)    V816(__builtin_ia32_punpcklbw128(a,b))
#define v8_mergeh(a,b)    V816(__builtin_ia32_punpckhbw128(a,b))

#define v32_shift_l  __builtin_ia32_pslldi128
#define v32_shift_r  __builtin_ia32_psrldi128

#define v32_rotate(x,n) \
  v32_or(v32_shift_l(x,n), v32_shift_r(x,32-(n)))

#define v32_shuf __builtin_ia32_pshufd

/* Shuffle immediates: SHUFXOR_s sends 32-bit lane i to lane i^s */
#define SHUFXOR_1 0xb1 /* 0b10110001 */
#define SHUFXOR_2 0x4e /* 0b01001110 */
#define SHUFXOR_3 0x1b /* 0b00011011 */

#define CAT(x, y) x##y
#define XCAT(x,y) CAT(x,y)

#define v32_shufxor(x,s) v32_shuf(x,XCAT(SHUFXOR_,s))

#define v32_bswap(x) (x)   /* no-op: this branch keeps data in native byte order */

#define v16_broadcast(x) ({                     \
      union u32 u;                              \
      u32 xx = x;                               \
      u.u[0] = xx | (xx << 16);                 \
      V3216(v32_shuf(u.v,0)); })

#define CV(x) {{x, x, x, x, x, x, x, x}}

#elif defined(__ALTIVEC__)

#include <altivec.h>
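/*
 * The Altivec backend mirrors the macro set of the SSE2 branch above.
 * Two differences are worth noting: the v16/v8 merge macros map to the
 * opposite Altivec merge (with swapped arguments) because PowerPC is
 * big-endian, and v32_bswap needs a real vec_perm here whereas the SSE2
 * branch defines it as a no-op.
 */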
typedef vector unsigned char v8;
typedef vector signed short  v16;
typedef vector unsigned int  v32;

#define V3216(x) ((v16) (x))
#define V1632(x) ((v32) (x))
#define V168(x)  ( (v8) (x))
#define V816(x)  ((v16) (x))

#define V16_SIZE 8

#define print_vec print_sse

#define MAKE_VECT(x, ...) {{x, __VA_ARGS__}}

#define CV(x)    MAKE_VECT(x, x, x, x, x, x, x, x)
#define CV16(x)  ((vector signed short)   {x,x,x,x,x,x,x,x})
#define CVU16(x) ((vector unsigned short) {x,x,x,x,x,x,x,x})
#define CV32(x)  ((vector unsigned int )  {x,x,x,x})

union cv {
  unsigned short u16[8];
  v16 v16;
};

union cv8 {
  unsigned char u8[16];
  v8 v8;
};

union ucv {
  unsigned short u16[8];
  vector unsigned char v16;
};

// Nasty hack to avoid macro expansion madness

/* altivec.h is broken with GCC 3.3 in C99 mode */
#if defined __STDC__ && __STDC_VERSION__ >= 199901L
#define typeof __typeof
#endif

MAYBE_INLINE v16 vec_and_fun (v16 x, v16 y) {
  return vec_and (x, y);
}

MAYBE_INLINE v16 vec_or_fun (v16 x, v16 y) {
  return vec_or (x, y);
}

MAYBE_INLINE v16 vec_xor_fun (v16 x, v16 y) {
  return vec_xor (x, y);
}

#undef vec_and
#undef vec_or
#undef vec_xor

#define vec_and(x,y) ((__typeof(x)) vec_and_fun((v16) x, (v16) y))
#define vec_or(x,y)  ((__typeof(x)) vec_or_fun ((v16) x, (v16) y))
#define vec_xor(x,y) ((__typeof(x)) vec_xor_fun((v16) x, (v16) y))

#define v16_and vec_and
#define v16_or  vec_or
#define v16_xor vec_xor

#define v32_and vec_and
#define v32_or  vec_or
#define v32_xor vec_xor

#define v32_add vec_add

#define v16_add vec_add
#define v16_sub vec_sub
#define v16_mul(a,b) vec_mladd(a,b,CV16(0))

#if 0
/* Dead code: superseded by the v16_shift_l macro below, and defining a
   non-static object and function in a header breaks multiple inclusion. */
vector unsigned short ZZ = {0,0,0,0,0,0,0,0};

v16 v16_shift_l(v16 x, int s) {
  vector unsigned short shift = {s,s,s,s,s,s,s,s};
  v16 y = vec_sl (x, shift);
  return y;
}
#endif

#define v16_shift_l(x,s) vec_sl (x,CVU16(s))
#define v16_shift_r(x,s) vec_sra(x,CVU16(s))
#define v16_cmp vec_cmpgt

#define v16_mergel(a,b) V1632(vec_mergeh(b,a))
#define v16_mergeh(a,b) V1632(vec_mergel(b,a))

#define v16_interleavel(a,b) vec_mergeh(a,b)
#define v16_interleaveh(a,b) vec_mergel(a,b)

#define v8_mergel(a,b) V816(vec_mergeh(b,a))
#define v8_mergeh(a,b) V816(vec_mergel(b,a))

#define v32_rotate(x,s) vec_rl(x,CV32(s))

// #define v32_unpckl vec_mergel
// #define v32_unpckh vec_mergeh

#define vector_shuffle(x,s) vec_perm(x,x,s)

/* Byte permutations: SHUFXOR_s sends 32-bit lane i to lane i^s */
static const v8 SHUFXOR_1 = {4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11};
static const v8 SHUFXOR_2 = {8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7};
static const v8 SHUFXOR_3 = {12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3};

#define v32_shufxor(x,s) vector_shuffle(x,SHUFXOR_##s)

//static const v8 SHUFSWAP = {15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0};
static const v8 SHUFSWAP = {3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12};

#define v32_bswap(x) vector_shuffle(x,SHUFSWAP)

#else

#error "I don't know how to vectorize on this architecture."
#endif

#else

/*************************************
 * Using MSVC/ICC vector intrinsics  *
 *************************************/

#include <emmintrin.h>  /* SSE2; _mm_shuffle_epi8 below additionally needs SSSE3 */

typedef __m128i v8;
typedef __m128i v16;
typedef __m128i v32;

#define V3216(x) (x)
#define V1632(x) (x)
#define V168(x)  (x)
#define V816(x)  (x)

#define V16_SIZE 8

union cv {
  unsigned short u16[8];
  v16 v16;
};

union cv8 {
  unsigned char u8[16];
  v8 v8;
};

#define CV(x) {{x, x, x, x, x, x, x, x}}

#define vec_and _mm_and_si128
#define vec_or  _mm_or_si128
#define vec_xor _mm_xor_si128

#define v16_and vec_and
#define v16_or  vec_or
#define v16_xor vec_xor

#define v32_and vec_and
#define v32_or  vec_or
#define v32_xor vec_xor

#define vector_shuffle(x,s) _mm_shuffle_epi8(x, s)

#define v32_add _mm_add_epi32

#define v16_add _mm_add_epi16
#define v16_sub _mm_sub_epi16
#define v16_mul _mm_mullo_epi16
#define v16_neg(x) (-(x))   /* relies on the compiler accepting unary minus on __m128i */

#define v16_shift_l _mm_slli_epi16
#define v16_shift_r _mm_srai_epi16
#define v16_cmp     _mm_cmpgt_epi16

#define v16_interleavel _mm_unpacklo_epi16
#define v16_interleaveh _mm_unpackhi_epi16

#define v16_mergel _mm_unpacklo_epi16
#define v16_mergeh _mm_unpackhi_epi16

#define v8_mergel _mm_unpacklo_epi8
#define v8_mergeh _mm_unpackhi_epi8

#define v32_shift_l _mm_slli_epi32
#define v32_shift_r _mm_srli_epi32

#define v32_rotate(x,n) \
  vec_or(v32_shift_l(x,n), v32_shift_r(x,32-(n)))

#define v32_shuf _mm_shuffle_epi32

/* Shuffle immediates: SHUFXOR_s sends 32-bit lane i to lane i^s */
#define SHUFXOR_1 0xb1 /* 0b10110001 */
#define SHUFXOR_2 0x4e /* 0b01001110 */
#define SHUFXOR_3 0x1b /* 0b00011011 */

#define CAT(x, y) x##y
#define XCAT(x,y) CAT(x,y)

//#define v32_shufxor(x,s) v32_shuf(x,SHUFXOR_##s)
#define v32_shufxor(x,s) v32_shuf(x,XCAT(SHUFXOR_,s))

#define v32_bswap(x) (x)

#endif

/* Twiddle tables */

static const union cv FFT64_Twiddle[] = {
  {{1,   2,   4,   8,  16,  32,  64, 128}},
  {{1,  60,   2, 120,   4, -17,   8, -34}},
  {{1, 120,   8, -68,  64, -30,  -2,  17}},
  {{1,  46,  60, -67,   2,  92, 120, 123}},
  {{1,  92, -17, -22,  32, 117, -30,  67}},
  {{1, -67, 120, -73,   8, -22, -68, -70}},
  {{1, 123, -34, -70, 128,  67,  17,  35}},
};

static const union cv FFT128_Twiddle[] = {
  {{  1, -118,  46,  -31,  60,  116, -67,  -61}},
  {{  2,   21,  92,  -62, 120,  -25, 123, -122}},
  {{  4,   42, -73, -124, -17,  -50, -11,   13}},
  {{  8,   84, 111,    9, -34, -100, -22,   26}},
  {{ 16,  -89, -35,   18, -68,   57, -44,   52}},
  {{ 32,   79, -70,   36, 121,  114, -88,  104}},
  {{ 64,  -99, 117,   72, -15,  -29,  81,  -49}},
  {{128,   59, -23, -113, -30,  -58, -95,  -98}},
};

static const union cv FFT256_Twiddle[] = {
  {{   1,   41, -118,   45,   46,   87,  -31,   14}},
  {{  60, -110,  116, -127,  -67,   80,  -61,   69}},
  {{   2,   82,   21,   90,   92,  -83,  -62,   28}},
  {{ 120,   37,  -25,    3,  123,  -97, -122, -119}},
  {{   4,  -93,   42,  -77,  -73,   91, -124,   56}},
  {{ -17,   74,  -50,    6,  -11,   63,   13,   19}},
  {{   8,   71,   84,  103,  111,  -75,    9,  112}},
  {{ -34, -109, -100,   12,  -22,  126,   26,   38}},
  {{  16, -115,  -89,  -51,  -35,  107,   18,  -33}},
  {{ -68,   39,   57,   24,  -44,   -5,   52,   76}},
  {{  32,   27,   79, -102,  -70,  -43,   36,  -66}},
  {{ 121,   78,  114,   48,  -88,  -10,  104, -105}},
  {{  64,   54,  -99,   53,  117,  -86,   72,  125}},
  {{ -15, -101,  -29,   96,   81,  -20,  -49,   47}},
  {{ 128,  108,   59,  106,  -23,   85, -113,   -7}},
  {{ -30,   55,  -58,  -65,  -95,  -40,  -98,   94}}
};

#endif
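/*
 * Usage sketch (illustrative only, not part of the original file): how the
 * portable layer above is meant to be used, assuming the GCC/SSE2 branch.
 *
 *   union cv t = FFT64_Twiddle[0];      // {1, 2, 4, 8, 16, 32, 64, 128}
 *   v16 a = t.v16;                      // reinterpret as 8 x 16-bit lanes
 *   v16 b = v16_add(a, a);              // lane-wise 16-bit addition
 *   v32 r = v32_rotate(V1632(b), 5);    // rotate each 32-bit lane left by 5
 *   v32 s = v32_shufxor(r, 1);          // swap 32-bit lanes i and i^1
 */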