| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. | 2 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 11 matching lines...) Expand all Loading... |
| 22 #include "webrtc/modules/audio_processing/aec/aec_rdft.h" | 22 #include "webrtc/modules/audio_processing/aec/aec_rdft.h" |
| 23 | 23 |
| 24 __inline static float MulRe(float aRe, float aIm, float bRe, float bIm) { | 24 __inline static float MulRe(float aRe, float aIm, float bRe, float bIm) { |
| 25 return aRe * bRe - aIm * bIm; | 25 return aRe * bRe - aIm * bIm; |
| 26 } | 26 } |
| 27 | 27 |
| 28 __inline static float MulIm(float aRe, float aIm, float bRe, float bIm) { | 28 __inline static float MulIm(float aRe, float aIm, float bRe, float bIm) { |
| 29 return aRe * bIm + aIm * bRe; | 29 return aRe * bIm + aIm * bRe; |
| 30 } | 30 } |
| 31 | 31 |
| 32 static void FilterFarSSE2(int num_partitions, | 32 static void FilterFarSSE2( |
| 33 int xfBufBlockPos, | 33 int num_partitions, |
| 34 float xfBuf[2][kExtendedNumPartitions * PART_LEN1], | 34 int x_fft_buf_block_pos, |
| 35 float wfBuf[2][kExtendedNumPartitions * PART_LEN1], | 35 float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1], |
| 36 float yf[2][PART_LEN1]) { | 36 float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1], |
| 37 float y_fft[2][PART_LEN1]) { |
| 37 | 38 |
| 38 int i; | 39 int i; |
| 39 for (i = 0; i < num_partitions; i++) { | 40 for (i = 0; i < num_partitions; i++) { |
| 40 int j; | 41 int j; |
| 41 int xPos = (i + xfBufBlockPos) * PART_LEN1; | 42 int xPos = (i + x_fft_buf_block_pos) * PART_LEN1; |
| 42 int pos = i * PART_LEN1; | 43 int pos = i * PART_LEN1; |
| 43 // Check for wrap | 44 // Check for wrap |
| 44 if (i + xfBufBlockPos >= num_partitions) { | 45 if (i + x_fft_buf_block_pos >= num_partitions) { |
| 45 xPos -= num_partitions * (PART_LEN1); | 46 xPos -= num_partitions * (PART_LEN1); |
| 46 } | 47 } |
| 47 | 48 |
| 48 // vectorized code (four at once) | 49 // vectorized code (four at once) |
| 49 for (j = 0; j + 3 < PART_LEN1; j += 4) { | 50 for (j = 0; j + 3 < PART_LEN1; j += 4) { |
| 50 const __m128 xfBuf_re = _mm_loadu_ps(&xfBuf[0][xPos + j]); | 51 const __m128 x_fft_buf_re = _mm_loadu_ps(&x_fft_buf[0][xPos + j]); |
| 51 const __m128 xfBuf_im = _mm_loadu_ps(&xfBuf[1][xPos + j]); | 52 const __m128 x_fft_buf_im = _mm_loadu_ps(&x_fft_buf[1][xPos + j]); |
| 52 const __m128 wfBuf_re = _mm_loadu_ps(&wfBuf[0][pos + j]); | 53 const __m128 h_fft_buf_re = _mm_loadu_ps(&h_fft_buf[0][pos + j]); |
| 53 const __m128 wfBuf_im = _mm_loadu_ps(&wfBuf[1][pos + j]); | 54 const __m128 h_fft_buf_im = _mm_loadu_ps(&h_fft_buf[1][pos + j]); |
| 54 const __m128 yf_re = _mm_loadu_ps(&yf[0][j]); | 55 const __m128 y_fft_re = _mm_loadu_ps(&y_fft[0][j]); |
| 55 const __m128 yf_im = _mm_loadu_ps(&yf[1][j]); | 56 const __m128 y_fft_im = _mm_loadu_ps(&y_fft[1][j]); |
| 56 const __m128 a = _mm_mul_ps(xfBuf_re, wfBuf_re); | 57 const __m128 a = _mm_mul_ps(x_fft_buf_re, h_fft_buf_re); |
| 57 const __m128 b = _mm_mul_ps(xfBuf_im, wfBuf_im); | 58 const __m128 b = _mm_mul_ps(x_fft_buf_im, h_fft_buf_im); |
| 58 const __m128 c = _mm_mul_ps(xfBuf_re, wfBuf_im); | 59 const __m128 c = _mm_mul_ps(x_fft_buf_re, h_fft_buf_im); |
| 59 const __m128 d = _mm_mul_ps(xfBuf_im, wfBuf_re); | 60 const __m128 d = _mm_mul_ps(x_fft_buf_im, h_fft_buf_re); |
| 60 const __m128 e = _mm_sub_ps(a, b); | 61 const __m128 e = _mm_sub_ps(a, b); |
| 61 const __m128 f = _mm_add_ps(c, d); | 62 const __m128 f = _mm_add_ps(c, d); |
| 62 const __m128 g = _mm_add_ps(yf_re, e); | 63 const __m128 g = _mm_add_ps(y_fft_re, e); |
| 63 const __m128 h = _mm_add_ps(yf_im, f); | 64 const __m128 h = _mm_add_ps(y_fft_im, f); |
| 64 _mm_storeu_ps(&yf[0][j], g); | 65 _mm_storeu_ps(&y_fft[0][j], g); |
| 65 _mm_storeu_ps(&yf[1][j], h); | 66 _mm_storeu_ps(&y_fft[1][j], h); |
| 66 } | 67 } |
| 67 // scalar code for the remaining items. | 68 // scalar code for the remaining items. |
| 68 for (; j < PART_LEN1; j++) { | 69 for (; j < PART_LEN1; j++) { |
| 69 yf[0][j] += MulRe(xfBuf[0][xPos + j], | 70 y_fft[0][j] += MulRe(x_fft_buf[0][xPos + j], |
| 70 xfBuf[1][xPos + j], | 71 x_fft_buf[1][xPos + j], |
| 71 wfBuf[0][pos + j], | 72 h_fft_buf[0][pos + j], |
| 72 wfBuf[1][pos + j]); | 73 h_fft_buf[1][pos + j]); |
| 73 yf[1][j] += MulIm(xfBuf[0][xPos + j], | 74 y_fft[1][j] += MulIm(x_fft_buf[0][xPos + j], |
| 74 xfBuf[1][xPos + j], | 75 x_fft_buf[1][xPos + j], |
| 75 wfBuf[0][pos + j], | 76 h_fft_buf[0][pos + j], |
| 76 wfBuf[1][pos + j]); | 77 h_fft_buf[1][pos + j]); |
| 77 } | 78 } |
| 78 } | 79 } |
| 79 } | 80 } |
| 80 | 81 |
| 81 static void ScaleErrorSignalSSE2(int extended_filter_enabled, | 82 static void ScaleErrorSignalSSE2(int extended_filter_enabled, |
| 82 float normal_mu, | 83 float normal_mu, |
| 83 float normal_error_threshold, | 84 float normal_error_threshold, |
| 84 float *x_pow, | 85 float x_pow[PART_LEN1], |
| 85 float ef[2][PART_LEN1]) { | 86 float ef[2][PART_LEN1]) { |
| 86 const __m128 k1e_10f = _mm_set1_ps(1e-10f); | 87 const __m128 k1e_10f = _mm_set1_ps(1e-10f); |
| 87 const __m128 kMu = extended_filter_enabled ? _mm_set1_ps(kExtendedMu) | 88 const __m128 kMu = extended_filter_enabled ? _mm_set1_ps(kExtendedMu) |
| 88 : _mm_set1_ps(normal_mu); | 89 : _mm_set1_ps(normal_mu); |
| 89 const __m128 kThresh = extended_filter_enabled | 90 const __m128 kThresh = extended_filter_enabled |
| 90 ? _mm_set1_ps(kExtendedErrorThreshold) | 91 ? _mm_set1_ps(kExtendedErrorThreshold) |
| 91 : _mm_set1_ps(normal_error_threshold); | 92 : _mm_set1_ps(normal_error_threshold); |
| 92 | 93 |
| 93 int i; | 94 int i; |
| 94 // vectorized code (four at once) | 95 // vectorized code (four at once) |
| (...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 140 ef[1][i] *= abs_ef; | 141 ef[1][i] *= abs_ef; |
| 141 } | 142 } |
| 142 | 143 |
| 143 // Stepsize factor | 144 // Stepsize factor |
| 144 ef[0][i] *= mu; | 145 ef[0][i] *= mu; |
| 145 ef[1][i] *= mu; | 146 ef[1][i] *= mu; |
| 146 } | 147 } |
| 147 } | 148 } |
| 148 } | 149 } |
| 149 | 150 |
| 150 static void FilterAdaptationSSE2(AecCore* aec, | 151 static void FilterAdaptationSSE2( |
| 151 float* fft, | 152 int num_partitions, |
| 152 float ef[2][PART_LEN1]) { | 153 int x_fft_buf_block_pos, |
| 154 float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1], |
| 155 float e_fft[2][PART_LEN1], |
| 156 float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1]) { |
| 157 float fft[PART_LEN2]; |
| 153 int i, j; | 158 int i, j; |
| 154 const int num_partitions = aec->num_partitions; | |
| 155 for (i = 0; i < num_partitions; i++) { | 159 for (i = 0; i < num_partitions; i++) { |
| 156 int xPos = (i + aec->xfBufBlockPos) * (PART_LEN1); | 160 int xPos = (i + x_fft_buf_block_pos) * (PART_LEN1); |
| 157 int pos = i * PART_LEN1; | 161 int pos = i * PART_LEN1; |
| 158 // Check for wrap | 162 // Check for wrap |
| 159 if (i + aec->xfBufBlockPos >= num_partitions) { | 163 if (i + x_fft_buf_block_pos >= num_partitions) { |
| 160 xPos -= num_partitions * PART_LEN1; | 164 xPos -= num_partitions * PART_LEN1; |
| 161 } | 165 } |
| 162 | 166 |
| 163 // Process the whole array... | 167 // Process the whole array... |
| 164 for (j = 0; j < PART_LEN; j += 4) { | 168 for (j = 0; j < PART_LEN; j += 4) { |
| 165 // Load xfBuf and ef. | 169 // Load x_fft_buf and e_fft. |
| 166 const __m128 xfBuf_re = _mm_loadu_ps(&aec->xfBuf[0][xPos + j]); | 170 const __m128 x_fft_buf_re = _mm_loadu_ps(&x_fft_buf[0][xPos + j]); |
| 167 const __m128 xfBuf_im = _mm_loadu_ps(&aec->xfBuf[1][xPos + j]); | 171 const __m128 x_fft_buf_im = _mm_loadu_ps(&x_fft_buf[1][xPos + j]); |
| 168 const __m128 ef_re = _mm_loadu_ps(&ef[0][j]); | 172 const __m128 e_fft_re = _mm_loadu_ps(&e_fft[0][j]); |
| 169 const __m128 ef_im = _mm_loadu_ps(&ef[1][j]); | 173 const __m128 e_fft_im = _mm_loadu_ps(&e_fft[1][j]); |
| 170 // Calculate the product of conjugate(xfBuf) by ef. | 174 // Calculate the product of conjugate(x_fft_buf) by e_fft. |
| 171 // re(conjugate(a) * b) = aRe * bRe + aIm * bIm | 175 // re(conjugate(a) * b) = aRe * bRe + aIm * bIm |
| 172 // im(conjugate(a) * b)= aRe * bIm - aIm * bRe | 176 // im(conjugate(a) * b)= aRe * bIm - aIm * bRe |
| 173 const __m128 a = _mm_mul_ps(xfBuf_re, ef_re); | 177 const __m128 a = _mm_mul_ps(x_fft_buf_re, e_fft_re); |
| 174 const __m128 b = _mm_mul_ps(xfBuf_im, ef_im); | 178 const __m128 b = _mm_mul_ps(x_fft_buf_im, e_fft_im); |
| 175 const __m128 c = _mm_mul_ps(xfBuf_re, ef_im); | 179 const __m128 c = _mm_mul_ps(x_fft_buf_re, e_fft_im); |
| 176 const __m128 d = _mm_mul_ps(xfBuf_im, ef_re); | 180 const __m128 d = _mm_mul_ps(x_fft_buf_im, e_fft_re); |
| 177 const __m128 e = _mm_add_ps(a, b); | 181 const __m128 e = _mm_add_ps(a, b); |
| 178 const __m128 f = _mm_sub_ps(c, d); | 182 const __m128 f = _mm_sub_ps(c, d); |
| 179 // Interleave real and imaginary parts. | 183 // Interleave real and imaginary parts. |
| 180 const __m128 g = _mm_unpacklo_ps(e, f); | 184 const __m128 g = _mm_unpacklo_ps(e, f); |
| 181 const __m128 h = _mm_unpackhi_ps(e, f); | 185 const __m128 h = _mm_unpackhi_ps(e, f); |
| 182 // Store | 186 // Store |
| 183 _mm_storeu_ps(&fft[2 * j + 0], g); | 187 _mm_storeu_ps(&fft[2 * j + 0], g); |
| 184 _mm_storeu_ps(&fft[2 * j + 4], h); | 188 _mm_storeu_ps(&fft[2 * j + 4], h); |
| 185 } | 189 } |
| 186 // ... and fixup the first imaginary entry. | 190 // ... and fixup the first imaginary entry. |
| 187 fft[1] = MulRe(aec->xfBuf[0][xPos + PART_LEN], | 191 fft[1] = MulRe(x_fft_buf[0][xPos + PART_LEN], |
| 188 -aec->xfBuf[1][xPos + PART_LEN], | 192 -x_fft_buf[1][xPos + PART_LEN], |
| 189 ef[0][PART_LEN], | 193 e_fft[0][PART_LEN], |
| 190 ef[1][PART_LEN]); | 194 e_fft[1][PART_LEN]); |
| 191 | 195 |
| 192 aec_rdft_inverse_128(fft); | 196 aec_rdft_inverse_128(fft); |
| 193 memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN); | 197 memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN); |
| 194 | 198 |
| 195 // fft scaling | 199 // fft scaling |
| 196 { | 200 { |
| 197 float scale = 2.0f / PART_LEN2; | 201 float scale = 2.0f / PART_LEN2; |
| 198 const __m128 scale_ps = _mm_load_ps1(&scale); | 202 const __m128 scale_ps = _mm_load_ps1(&scale); |
| 199 for (j = 0; j < PART_LEN; j += 4) { | 203 for (j = 0; j < PART_LEN; j += 4) { |
| 200 const __m128 fft_ps = _mm_loadu_ps(&fft[j]); | 204 const __m128 fft_ps = _mm_loadu_ps(&fft[j]); |
| 201 const __m128 fft_scale = _mm_mul_ps(fft_ps, scale_ps); | 205 const __m128 fft_scale = _mm_mul_ps(fft_ps, scale_ps); |
| 202 _mm_storeu_ps(&fft[j], fft_scale); | 206 _mm_storeu_ps(&fft[j], fft_scale); |
| 203 } | 207 } |
| 204 } | 208 } |
| 205 aec_rdft_forward_128(fft); | 209 aec_rdft_forward_128(fft); |
| 206 | 210 |
| 207 { | 211 { |
| 208 float wt1 = aec->wfBuf[1][pos]; | 212 float wt1 = h_fft_buf[1][pos]; |
| 209 aec->wfBuf[0][pos + PART_LEN] += fft[1]; | 213 h_fft_buf[0][pos + PART_LEN] += fft[1]; |
| 210 for (j = 0; j < PART_LEN; j += 4) { | 214 for (j = 0; j < PART_LEN; j += 4) { |
| 211 __m128 wtBuf_re = _mm_loadu_ps(&aec->wfBuf[0][pos + j]); | 215 __m128 wtBuf_re = _mm_loadu_ps(&h_fft_buf[0][pos + j]); |
| 212 __m128 wtBuf_im = _mm_loadu_ps(&aec->wfBuf[1][pos + j]); | 216 __m128 wtBuf_im = _mm_loadu_ps(&h_fft_buf[1][pos + j]); |
| 213 const __m128 fft0 = _mm_loadu_ps(&fft[2 * j + 0]); | 217 const __m128 fft0 = _mm_loadu_ps(&fft[2 * j + 0]); |
| 214 const __m128 fft4 = _mm_loadu_ps(&fft[2 * j + 4]); | 218 const __m128 fft4 = _mm_loadu_ps(&fft[2 * j + 4]); |
| 215 const __m128 fft_re = | 219 const __m128 fft_re = |
| 216 _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(2, 0, 2, 0)); | 220 _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(2, 0, 2, 0)); |
| 217 const __m128 fft_im = | 221 const __m128 fft_im = |
| 218 _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(3, 1, 3, 1)); | 222 _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(3, 1, 3, 1)); |
| 219 wtBuf_re = _mm_add_ps(wtBuf_re, fft_re); | 223 wtBuf_re = _mm_add_ps(wtBuf_re, fft_re); |
| 220 wtBuf_im = _mm_add_ps(wtBuf_im, fft_im); | 224 wtBuf_im = _mm_add_ps(wtBuf_im, fft_im); |
| 221 _mm_storeu_ps(&aec->wfBuf[0][pos + j], wtBuf_re); | 225 _mm_storeu_ps(&h_fft_buf[0][pos + j], wtBuf_re); |
| 222 _mm_storeu_ps(&aec->wfBuf[1][pos + j], wtBuf_im); | 226 _mm_storeu_ps(&h_fft_buf[1][pos + j], wtBuf_im); |
| 223 } | 227 } |
| 224 aec->wfBuf[1][pos] = wt1; | 228 h_fft_buf[1][pos] = wt1; |
| 225 } | 229 } |
| 226 } | 230 } |
| 227 } | 231 } |
| 228 | 232 |
| 229 static __m128 mm_pow_ps(__m128 a, __m128 b) { | 233 static __m128 mm_pow_ps(__m128 a, __m128 b) { |
| 230 // a^b = exp2(b * log2(a)) | 234 // a^b = exp2(b * log2(a)) |
| 231 // exp2(x) and log2(x) are calculated using polynomial approximations. | 235 // exp2(x) and log2(x) are calculated using polynomial approximations. |
| 232 __m128 log2_a, b_log2_a, a_exp_b; | 236 __m128 log2_a, b_log2_a, a_exp_b; |
| 233 | 237 |
| 234 // Calculate log2(x), x = a. | 238 // Calculate log2(x), x = a. |
| (...skipping 495 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 730 } | 734 } |
| 731 } | 735 } |
| 732 | 736 |
| 733 void WebRtcAec_InitAec_SSE2(void) { | 737 void WebRtcAec_InitAec_SSE2(void) { |
| 734 WebRtcAec_FilterFar = FilterFarSSE2; | 738 WebRtcAec_FilterFar = FilterFarSSE2; |
| 735 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2; | 739 WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2; |
| 736 WebRtcAec_FilterAdaptation = FilterAdaptationSSE2; | 740 WebRtcAec_FilterAdaptation = FilterAdaptationSSE2; |
| 737 WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2; | 741 WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2; |
| 738 WebRtcAec_SubbandCoherence = SubbandCoherenceSSE2; | 742 WebRtcAec_SubbandCoherence = SubbandCoherenceSSE2; |
| 739 } | 743 } |
| OLD | NEW |