OLD | NEW |
1 /* Copyright (c) 2013 Jean-Marc Valin and John Ridges */ | 1 /* Copyright (c) 2013 Jean-Marc Valin and John Ridges */ |
2 /** | 2 /** |
3 @file pitch_sse.h | 3 @file pitch_sse.h |
4 @brief Pitch analysis | 4 @brief Pitch analysis |
5 */ | 5 */ |
6 | 6 |
7 /* | 7 /* |
8 Redistribution and use in source and binary forms, with or without | 8 Redistribution and use in source and binary forms, with or without |
9 modification, are permitted provided that the following conditions | 9 modification, are permitted provided that the following conditions |
10 are met: | 10 are met: |
(...skipping 18 matching lines...) Expand all Loading... |
29 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 29 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
30 */ | 30 */ |
31 | 31 |
32 #ifndef PITCH_SSE_H | 32 #ifndef PITCH_SSE_H |
33 #define PITCH_SSE_H | 33 #define PITCH_SSE_H |
34 | 34 |
35 #include <xmmintrin.h> | 35 #include <xmmintrin.h> |
36 #include "arch.h" | 36 #include "arch.h" |
37 | 37 |
38 #define OVERRIDE_XCORR_KERNEL | 38 #define OVERRIDE_XCORR_KERNEL |
/* xcorr_kernel (SSE): accumulates products of x[] and y[] into the four
   running totals sum[0..3].
   - sum is loaded on entry and overwritten on exit, so the result is added
     to whatever the caller put there.
   - In the visible tail step, lane k of the accumulator receives
     x[j]*y[j+k] (x[j] is broadcast, y[j..j+3] is loaded as a vector).
     The main loop body is elided from this view; presumably it applies the
     same pattern to four x samples per iteration (TODO confirm).
   - x, y and sum are passed straight to _mm_load1_ps/_mm_loadu_ps/
     _mm_storeu_ps, which operate on float, so this presumably targets the
     build where opus_val16/opus_val32 are float (verify against arch.h).
     All accesses use the unaligned variants.
   Review note: the only OLD/NEW difference in the visible rows is
   `inline` -> `OPUS_INLINE` in the signature. */
39 static inline void xcorr_kernel(const opus_val16 *x, const opus_val16 *y, opus_v
al32 sum[4], int len) | 39 static OPUS_INLINE void xcorr_kernel(const opus_val16 *x, const opus_val16 *y, o
pus_val32 sum[4], int len) |
40 { | 40 { |
41 int j; | 41 int j; |
42 __m128 xsum1, xsum2; | 42 __m128 xsum1, xsum2; |
/* xsum1 starts from the caller's sum[]; xsum2 starts at zero. The two are
   added together only in the final store. */
43 xsum1 = _mm_loadu_ps(sum); | 43 xsum1 = _mm_loadu_ps(sum); |
44 xsum2 = _mm_setzero_ps(); | 44 xsum2 = _mm_setzero_ps(); |
45 | 45 |
/* Main loop: runs while at least four samples remain (j < len-3). */
46 for (j = 0; j < len-3; j += 4) | 46 for (j = 0; j < len-3; j += 4) |
47 { | 47 { |
48 __m128 x0 = _mm_loadu_ps(x+j); | 48 __m128 x0 = _mm_loadu_ps(x+j); |
49 __m128 yj = _mm_loadu_ps(y+j); | 49 __m128 yj = _mm_loadu_ps(y+j); |
(...skipping 15 matching lines...) Expand all Loading... |
/* Tail handling: advance to the next leftover sample and, if it exists, add
   x[j] * y[j..j+3] to xsum1. The closing braces below show this sits inside
   enclosing conditionals whose opening lines are in the elided region. */
65 if (++j < len) | 65 if (++j < len) |
66 { | 66 { |
67 xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y
+j))); | 67 xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y
+j))); |
68 } | 68 } |
69 } | 69 } |
70 } | 70 } |
/* Combine both partial accumulators and write the four totals back. */
71 _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2)); | 71 _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2)); |
72 } | 72 } |
73 | 73 |
74 #define OVERRIDE_DUAL_INNER_PROD | 74 #define OVERRIDE_DUAL_INNER_PROD |
/* dual_inner_prod (SSE): computes two inner products that share the same
   left-hand vector x in a single pass over the data:
     *xy1 = sum of x[i]*y01[i],  *xy2 = sum of x[i]*y02[i],  i in [0, N)
   - The first N rounded down to a multiple of 4 samples are handled four at
     a time with unaligned SSE loads; the remaining 0..3 samples are added
     by the scalar loop at the end via MAC16_16 (defined outside this view;
     presumably c + a*b -- confirm in arch.h).
   - *xy1 and *xy2 are always written (never read before the first store),
     so with N <= 0 both come back as 0.
   - x, y01 and y02 go straight into _mm_loadu_ps and the results are written
     with _mm_store_ss, both of which operate on float, so this presumably
     targets the build where opus_val16/opus_val32 are float (verify).
   Review note: the only OLD/NEW difference in this block is
   `inline` -> `OPUS_INLINE` in the signature. */
75 static inline void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, c
onst opus_val16 *y02, | 75 static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y
01, const opus_val16 *y02, |
76 int N, opus_val32 *xy1, opus_val32 *xy2) | 76 int N, opus_val32 *xy1, opus_val32 *xy2) |
77 { | 77 { |
78 int i; | 78 int i; |
79 __m128 xsum1, xsum2; | 79 __m128 xsum1, xsum2; |
80 xsum1 = _mm_setzero_ps(); | 80 xsum1 = _mm_setzero_ps(); |
81 xsum2 = _mm_setzero_ps(); | 81 xsum2 = _mm_setzero_ps(); |
/* Vector part: each lane accumulates every fourth product; x is loaded once
   per iteration and reused for both sums. */
82 for (i=0;i<N-3;i+=4) | 82 for (i=0;i<N-3;i+=4) |
83 { | 83 { |
84 __m128 xi = _mm_loadu_ps(x+i); | 84 __m128 xi = _mm_loadu_ps(x+i); |
85 __m128 y1i = _mm_loadu_ps(y01+i); | 85 __m128 y1i = _mm_loadu_ps(y01+i); |
86 __m128 y2i = _mm_loadu_ps(y02+i); | 86 __m128 y2i = _mm_loadu_ps(y02+i); |
87 xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i)); | 87 xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i)); |
88 xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i)); | 88 xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i)); |
89 } | 89 } |
/* Reduce the four lanes to one value: movehl folds lanes 2,3 onto lanes 0,1;
   shuffle 0x55 replicates lane 1 so add_ss can fold it onto lane 0; store_ss
   then writes lane 0 only. */
90 /* Horizontal sum */ | 90 /* Horizontal sum */ |
91 xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1)); | 91 xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1)); |
92 xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55)); | 92 xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55)); |
93 _mm_store_ss(xy1, xsum1); | 93 _mm_store_ss(xy1, xsum1); |
94 xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2)); | 94 xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2)); |
95 xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55)); | 95 xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55)); |
96 _mm_store_ss(xy2, xsum2); | 96 _mm_store_ss(xy2, xsum2); |
/* Scalar tail: i continues from where the vector loop stopped, so the last
   N mod 4 samples are accumulated directly into the stored results. */
97 for (;i<N;i++) | 97 for (;i<N;i++) |
98 { | 98 { |
99 *xy1 = MAC16_16(*xy1, x[i], y01[i]); | 99 *xy1 = MAC16_16(*xy1, x[i], y01[i]); |
100 *xy2 = MAC16_16(*xy2, x[i], y02[i]); | 100 *xy2 = MAC16_16(*xy2, x[i], y02[i]); |
101 } | 101 } |
102 } | 102 } |
103 | 103 |
104 #define OVERRIDE_COMB_FILTER_CONST | 104 #define OVERRIDE_COMB_FILTER_CONST |
/* comb_filter_const (SSE): writes N output samples y[] from input x[] using
   three constant gains and a lag of T samples.
   The visible scalar expression near the end of the function is
     y[i] = x[i] + g10*x[i-T]
                 + g11*(x[i-T+1] + x[i-T-1])
                 + g12*(x[i-T+2] + x[i-T-2])
   (products via MULT16_32_Q15, sums via ADD32, both defined outside this
   view). The vector loop body is elided here; presumably it evaluates the
   same expression four samples at a time (TODO confirm).
   - x is indexed as far back as x[-T-2], so the caller must supply valid
     history before x[0].
   - g10/g11/g12 are broadcast with _mm_load1_ps and x is read with
     _mm_loadu_ps, both of which operate on float, so this presumably targets
     the build where opus_val16/opus_val32 are float (verify).
   Review note: the only OLD/NEW difference in the visible rows is
   `inline` -> `OPUS_INLINE` in the signature. */
105 static inline void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N, | 105 static OPUS_INLINE void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, i
nt N, |
106 opus_val16 g10, opus_val16 g11, opus_val16 g12) | 106 opus_val16 g10, opus_val16 g11, opus_val16 g12) |
107 { | 107 { |
108 int i; | 108 int i; |
109 __m128 x0v; | 109 __m128 x0v; |
110 __m128 g10v, g11v, g12v; | 110 __m128 g10v, g11v, g12v; |
/* Broadcast each gain to all four lanes once, outside the loop. */
111 g10v = _mm_load1_ps(&g10); | 111 g10v = _mm_load1_ps(&g10); |
112 g11v = _mm_load1_ps(&g11); | 112 g11v = _mm_load1_ps(&g11); |
113 g12v = _mm_load1_ps(&g12); | 113 g12v = _mm_load1_ps(&g12); |
/* Preload the four samples starting at the oldest tap, x[-T-2], before
   entering the loop (unaligned load). */
114 x0v = _mm_loadu_ps(&x[-T-2]); | 114 x0v = _mm_loadu_ps(&x[-T-2]); |
/* Vector loop: runs while at least four output samples remain (i < N-3). */
115 for (i=0;i<N-3;i+=4) | 115 for (i=0;i<N-3;i+=4) |
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
/* Scalar form of the filter. Its loop header and the preprocessor condition
   closed by the #endif below are in the elided region, so when this path is
   compiled or executed cannot be determined from this view. */
147 { | 147 { |
148 y[i] = x[i] | 148 y[i] = x[i] |
149 + MULT16_32_Q15(g10,x[i-T]) | 149 + MULT16_32_Q15(g10,x[i-T]) |
150 + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1])) | 150 + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1])) |
151 + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2])); | 151 + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2])); |
152 } | 152 } |
153 #endif | 153 #endif |
154 } | 154 } |
155 | 155 |
156 #endif | 156 #endif |
OLD | NEW |