OLD | NEW |
1 /* Copyright (c) 2013 Jean-Marc Valin and John Ridges */ | 1 /* Copyright (c) 2013 Jean-Marc Valin and John Ridges |
| 2 Copyright (c) 2014, Cisco Systems, INC MingXiang WeiZhou MinPeng YanWang*/ |
2 /** | 3 /** |
3 @file pitch_sse.h | 4 @file pitch_sse.h |
4 @brief Pitch analysis | 5 @brief Pitch analysis |
5 */ | 6 */ |
6 | 7 |
7 /* | 8 /* |
8 Redistribution and use in source and binary forms, with or without | 9 Redistribution and use in source and binary forms, with or without |
9 modification, are permitted provided that the following conditions | 10 modification, are permitted provided that the following conditions |
10 are met: | 11 are met: |
11 | 12 |
(...skipping 13 matching lines...) |
25 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | 26 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
26 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | 27 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
27 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | 28 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
28 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | 29 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
29 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 30 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
30 */ | 31 */ |
31 | 32 |
32 #ifndef PITCH_SSE_H | 33 #ifndef PITCH_SSE_H |
33 #define PITCH_SSE_H | 34 #define PITCH_SSE_H |
34 | 35 |
| 36 #if defined(HAVE_CONFIG_H) |
| 37 #include "config.h" |
| 38 #endif |
| 39 |
| 40 #if defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2) |
| 41 #if defined(OPUS_X86_MAY_HAVE_SSE4_1) |
| 42 void xcorr_kernel_sse4_1( |
| 43 const opus_int16 *x, |
| 44 const opus_int16 *y, |
| 45 opus_val32 sum[4], |
| 46 int len); |
| 47 |
| 48 extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( |
| 49 const opus_int16 *x, |
| 50 const opus_int16 *y, |
| 51 opus_val32 sum[4], |
| 52 int len); |
| 53 |
| 54 #define OVERRIDE_XCORR_KERNEL |
| 55 #define xcorr_kernel(x, y, sum, len, arch) \ |
| 56 ((*XCORR_KERNEL_IMPL[(arch) & OPUS_ARCHMASK])(x, y, sum, len)) |
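
Note on the dispatch above: xcorr_kernel now indirects through XCORR_KERNEL_IMPL, a function-pointer table indexed by arch & OPUS_ARCHMASK. A minimal sketch of how that table could be populated in a companion .c file, assuming four x86 arch levels (OPUS_ARCHMASK == 3) and a portable fallback named xcorr_kernel_c; the layout is illustrative, not part of this header:

void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
      const opus_int16 *x, const opus_int16 *y,
      opus_val32 sum[4], int len) = {
   xcorr_kernel_c,      /* arch 0: portable C */
   xcorr_kernel_c,      /* arch 1 */
   xcorr_kernel_c,      /* arch 2 */
   xcorr_kernel_sse4_1  /* arch 3: SSE4.1 */
};
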
| 57 |
| 58 opus_val32 celt_inner_prod_sse4_1( |
| 59 const opus_int16 *x, |
| 60 const opus_int16 *y, |
| 61 int N); |
| 62 #endif |
| 63 |
| 64 #if defined(OPUS_X86_MAY_HAVE_SSE2) |
| 65 opus_val32 celt_inner_prod_sse2( |
| 66 const opus_int16 *x, |
| 67 const opus_int16 *y, |
| 68 int N); |
| 69 #endif |
| 70 |
| 71 extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( |
| 72 const opus_int16 *x, |
| 73 const opus_int16 *y, |
| 74 int N); |
| 75 |
| 76 #define OVERRIDE_CELT_INNER_PROD |
| 77 #define celt_inner_prod(x, y, N, arch) \ |
| 78 ((*CELT_INNER_PROD_IMPL[(arch) & OPUS_ARCHMASK])(x, y, N)) |
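
A hedged usage sketch for the two dispatch macros: callers thread an arch value through (in Opus this comes from run-time CPU detection; the opus_select_arch() name is used here as an assumption, not something this patch introduces) and the table selects the fastest available implementation:

   int arch = opus_select_arch();                       /* assumed: detected CPU level */
   opus_val32 energy = celt_inner_prod(x, x, N, arch);  /* sum of x[i]^2 */
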
| 79 #else |
| 80 |
35 #include <xmmintrin.h> | 81 #include <xmmintrin.h> |
36 #include "arch.h" | 82 #include "arch.h" |
37 | 83 |
38 #define OVERRIDE_XCORR_KERNEL | 84 #define OVERRIDE_XCORR_KERNEL |
39 static OPUS_INLINE void xcorr_kernel(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len) | 85 static OPUS_INLINE void xcorr_kernel_sse(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len) |
40 { | 86 { |
41 int j; | 87 int j; |
42 __m128 xsum1, xsum2; | 88 __m128 xsum1, xsum2; |
43 xsum1 = _mm_loadu_ps(sum); | 89 xsum1 = _mm_loadu_ps(sum); |
44 xsum2 = _mm_setzero_ps(); | 90 xsum2 = _mm_setzero_ps(); |
45 | 91 |
46 for (j = 0; j < len-3; j += 4) | 92 for (j = 0; j < len-3; j += 4) |
47 { | 93 { |
48 __m128 x0 = _mm_loadu_ps(x+j); | 94 __m128 x0 = _mm_loadu_ps(x+j); |
49 __m128 yj = _mm_loadu_ps(y+j); | 95 __m128 yj = _mm_loadu_ps(y+j); |
(...skipping 14 matching lines...) |
64 xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); | 110 xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); |
65 if (++j < len) | 111 if (++j < len) |
66 { | 112 { |
67 xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); | 113 xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); |
68 } | 114 } |
69 } | 115 } |
70 } | 116 } |
71 _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2)); | 117 _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2)); |
72 } | 118 } |
73 | 119 |
| 120 #define xcorr_kernel(_x, _y, _z, len, arch) \ |
| 121 ((void)(arch),xcorr_kernel_sse(_x, _y, _z, len)) |
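
For reference, the kernel's contract as celt_pitch_xcorr uses it (variable names here are illustrative): it accumulates into sum[0..3] the correlations of x against y at four consecutive lags, so the caller must initialize sum first:

   opus_val32 sum[4] = {0, 0, 0, 0};
   xcorr_kernel(x, y + lag, sum, len, arch);
   /* now sum[k] == dot(x[0..len-1], y[lag+k .. lag+k+len-1]) for k = 0..3 */
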
| 122 |
74 #define OVERRIDE_DUAL_INNER_PROD | 123 #define OVERRIDE_DUAL_INNER_PROD |
75 static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, | 124 static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, |
76 int N, opus_val32 *xy1, opus_val32 *xy2) | 125 int N, opus_val32 *xy1, opus_val32 *xy2) |
77 { | 126 { |
78 int i; | 127 int i; |
79 __m128 xsum1, xsum2; | 128 __m128 xsum1, xsum2; |
80 xsum1 = _mm_setzero_ps(); | 129 xsum1 = _mm_setzero_ps(); |
81 xsum2 = _mm_setzero_ps(); | 130 xsum2 = _mm_setzero_ps(); |
82 for (i=0;i<N-3;i+=4) | 131 for (i=0;i<N-3;i+=4) |
83 { | 132 { |
(...skipping 10 matching lines...) |
94 xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2)); | 143 xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2)); |
95 xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55)); | 144 xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55)); |
96 _mm_store_ss(xy2, xsum2); | 145 _mm_store_ss(xy2, xsum2); |
97 for (;i<N;i++) | 146 for (;i<N;i++) |
98 { | 147 { |
99 *xy1 = MAC16_16(*xy1, x[i], y01[i]); | 148 *xy1 = MAC16_16(*xy1, x[i], y01[i]); |
100 *xy2 = MAC16_16(*xy2, x[i], y02[i]); | 149 *xy2 = MAC16_16(*xy2, x[i], y02[i]); |
101 } | 150 } |
102 } | 151 } |
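
For review, a scalar reference of what dual_inner_prod computes (sketch only; float results may differ slightly because the vector version sums in a different order):

static void dual_inner_prod_ref(const opus_val16 *x, const opus_val16 *y01,
      const opus_val16 *y02, int N, opus_val32 *xy1, opus_val32 *xy2)
{
   int i;
   opus_val32 s1 = 0, s2 = 0;
   for (i = 0; i < N; i++)
   {
      s1 = MAC16_16(s1, x[i], y01[i]);   /* s1 += x[i]*y01[i] */
      s2 = MAC16_16(s2, x[i], y02[i]);   /* s2 += x[i]*y02[i] */
   }
   *xy1 = s1;
   *xy2 = s2;
}
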
103 | 152 |
| 153 #define OVERRIDE_CELT_INNER_PROD |
 | 154 static OPUS_INLINE opus_val32 celt_inner_prod_sse(const opus_val16 *x, const opus_val16 *y, |
 | 155 int N) |
| 156 { |
| 157 int i; |
| 158 float xy; |
| 159 __m128 sum; |
| 160 sum = _mm_setzero_ps(); |
| 161 /* FIXME: We should probably go 8-way and use 2 sums. */ |
| 162 for (i=0;i<N-3;i+=4) |
| 163 { |
| 164 __m128 xi = _mm_loadu_ps(x+i); |
| 165 __m128 yi = _mm_loadu_ps(y+i); |
| 166 sum = _mm_add_ps(sum,_mm_mul_ps(xi, yi)); |
| 167 } |
| 168 /* Horizontal sum */ |
| 169 sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum)); |
| 170 sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55)); |
| 171 _mm_store_ss(&xy, sum); |
| 172 for (;i<N;i++) |
| 173 { |
| 174 xy = MAC16_16(xy, x[i], y[i]); |
| 175 } |
| 176 return xy; |
| 177 } |
| 178 |
| 179 # define celt_inner_prod(_x, _y, len, arch) \ |
| 180 ((void)(arch),celt_inner_prod_sse(_x, _y, len)) |
| 181 |
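
On the FIXME in celt_inner_prod_sse: an untested sketch of the suggested 8-way version, using two independent accumulators so consecutive addps operations do not serialize on each other; the horizontal sum and scalar tail are unchanged:

static OPUS_INLINE opus_val32 celt_inner_prod_sse_8way(const opus_val16 *x,
      const opus_val16 *y, int N)
{
   int i;
   float xy;
   __m128 sum1 = _mm_setzero_ps();
   __m128 sum2 = _mm_setzero_ps();
   for (i = 0; i < N - 7; i += 8)
   {
      sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(x + i), _mm_loadu_ps(y + i)));
      sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(x + i + 4), _mm_loadu_ps(y + i + 4)));
   }
   sum1 = _mm_add_ps(sum1, sum2);
   /* Horizontal sum, as in the 4-way version above */
   sum1 = _mm_add_ps(sum1, _mm_movehl_ps(sum1, sum1));
   sum1 = _mm_add_ss(sum1, _mm_shuffle_ps(sum1, sum1, 0x55));
   _mm_store_ss(&xy, sum1);
   for (; i < N; i++)
      xy = MAC16_16(xy, x[i], y[i]);
   return xy;
}
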
104 #define OVERRIDE_COMB_FILTER_CONST | 182 #define OVERRIDE_COMB_FILTER_CONST |
105 static OPUS_INLINE void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N, | 183 static OPUS_INLINE void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N, |
106 opus_val16 g10, opus_val16 g11, opus_val16 g12) | 184 opus_val16 g10, opus_val16 g11, opus_val16 g12) |
107 { | 185 { |
108 int i; | 186 int i; |
109 __m128 x0v; | 187 __m128 x0v; |
110 __m128 g10v, g11v, g12v; | 188 __m128 g10v, g11v, g12v; |
111 g10v = _mm_load1_ps(&g10); | 189 g10v = _mm_load1_ps(&g10); |
112 g11v = _mm_load1_ps(&g11); | 190 g11v = _mm_load1_ps(&g11); |
113 g12v = _mm_load1_ps(&g12); | 191 g12v = _mm_load1_ps(&g12); |
(...skipping 33 matching lines...) |
147 { | 225 { |
148 y[i] = x[i] | 226 y[i] = x[i] |
149 + MULT16_32_Q15(g10,x[i-T]) | 227 + MULT16_32_Q15(g10,x[i-T]) |
150 + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1])) | 228 + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1])) |
151 + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2])); | 229 + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2])); |
152 } | 230 } |
153 #endif | 231 #endif |
154 } | 232 } |
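
For clarity, the per-sample recurrence that comb_filter_const vectorizes, matching the scalar remainder loop above (in the float build MULT16_32_Q15(a,b) reduces to a*b):

   /* 5-tap comb filter centered on the pitch lag T */
   y[i] = x[i] + g10*x[i-T]
               + g11*(x[i-T+1] + x[i-T-1])
               + g12*(x[i-T+2] + x[i-T-2]);
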
155 | 233 |
156 #endif | 234 #endif |
| 235 #endif |