OLD | NEW |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 // | 4 // |
5 // Input buffer layout, dividing the total buffer into regions (r0_ - r5_): | 5 // Input buffer layout, dividing the total buffer into regions (r0_ - r5_): |
6 // | 6 // |
7 // |----------------|-----------------------------------------|----------------| | 7 // |----------------|-----------------------------------------|----------------| |
8 // | 8 // |
9 // kBlockSize + kKernelSize / 2 | 9 // kBlockSize + kKernelSize / 2 |
10 // <---------------------------------------------------------> | 10 // <---------------------------------------------------------> |
(...skipping 22 matching lines...) Expand all Loading... |
33 | 33 |
34 // MSVC++ requires this to be set before any other includes to get M_PI. | 34 // MSVC++ requires this to be set before any other includes to get M_PI. |
35 #define _USE_MATH_DEFINES | 35 #define _USE_MATH_DEFINES |
36 | 36 |
37 #include "media/base/sinc_resampler.h" | 37 #include "media/base/sinc_resampler.h" |
38 | 38 |
39 #include <cmath> | 39 #include <cmath> |
40 | 40 |
41 #include "base/cpu.h" | 41 #include "base/cpu.h" |
42 #include "base/logging.h" | 42 #include "base/logging.h" |
43 #include "build/build_config.h" | |
44 | |
45 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) | |
46 #include <xmmintrin.h> | |
47 #endif | |
48 | 43 |
49 #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) | 44 #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) |
50 #include <arm_neon.h> | 45 #include <arm_neon.h> |
51 #endif | 46 #endif |
52 | 47 |
53 namespace media { | 48 namespace media { |
54 | 49 |
55 namespace { | |
56 | |
57 enum { | |
58 // The kernel size can be adjusted for quality (higher is better) at the | |
59 // expense of performance. Must be a multiple of 32. | |
60 // TODO(dalecurtis): Test performance to see if we can jack this up to 64+. | |
61 kKernelSize = 32, | |
62 | |
63 // The number of destination frames generated per processing pass. Affects | |
64 // how often and for how much SincResampler calls back for input. Must be | |
65 // greater than kKernelSize. | |
66 kBlockSize = 512, | |
67 | |
68 // The kernel offset count is used for interpolation and is the number of | |
69 // sub-sample kernel shifts. Can be adjusted for quality (higher is better) | |
70 // at the expense of allocating more memory. | |
71 kKernelOffsetCount = 32, | |
72 kKernelStorageSize = kKernelSize * (kKernelOffsetCount + 1), | |
73 | |
74 // The size (in samples) of the internal buffer used by the resampler. | |
75 kBufferSize = kBlockSize + kKernelSize | |
76 }; | |
77 | |
78 } // namespace | |
79 | |
80 const int SincResampler::kMaximumLookAheadSize = kBufferSize; | |
81 | |
82 SincResampler::SincResampler(double io_sample_rate_ratio, const ReadCB& read_cb) | 50 SincResampler::SincResampler(double io_sample_rate_ratio, const ReadCB& read_cb) |
83 : io_sample_rate_ratio_(io_sample_rate_ratio), | 51 : io_sample_rate_ratio_(io_sample_rate_ratio), |
84 virtual_source_idx_(0), | 52 virtual_source_idx_(0), |
85 buffer_primed_(false), | 53 buffer_primed_(false), |
86 read_cb_(read_cb), | 54 read_cb_(read_cb), |
87 // Create input buffers with a 16-byte alignment for SSE optimizations. | 55 // Create input buffers with a 16-byte alignment for SSE optimizations. |
88 kernel_storage_(static_cast<float*>( | 56 kernel_storage_(static_cast<float*>( |
89 base::AlignedAlloc(sizeof(float) * kKernelStorageSize, 16))), | 57 base::AlignedAlloc(sizeof(float) * kKernelStorageSize, 16))), |
90 input_buffer_(static_cast<float*>( | 58 input_buffer_(static_cast<float*>( |
91 base::AlignedAlloc(sizeof(float) * kBufferSize, 16))), | 59 base::AlignedAlloc(sizeof(float) * kBufferSize, 16))), |
(...skipping 123 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
215 // This wraps the last input frames back to the start of the buffer. | 183 // This wraps the last input frames back to the start of the buffer. |
216 memcpy(r1_, r3_, sizeof(*input_buffer_.get()) * (kKernelSize / 2)); | 184 memcpy(r1_, r3_, sizeof(*input_buffer_.get()) * (kKernelSize / 2)); |
217 memcpy(r2_, r4_, sizeof(*input_buffer_.get()) * (kKernelSize / 2)); | 185 memcpy(r2_, r4_, sizeof(*input_buffer_.get()) * (kKernelSize / 2)); |
218 | 186 |
219 // Step (4) | 187 // Step (4) |
220 // Refresh the buffer with more input. | 188 // Refresh the buffer with more input. |
221 read_cb_.Run(r5_, kBlockSize); | 189 read_cb_.Run(r5_, kBlockSize); |
222 } | 190 } |
223 } | 191 } |
224 | 192 |
225 int SincResampler::ChunkSize() { | 193 int SincResampler::ChunkSize() const { |
226 return kBlockSize / io_sample_rate_ratio_; | 194 return kBlockSize / io_sample_rate_ratio_; |
227 } | 195 } |
228 | 196 |
229 void SincResampler::Flush() { | 197 void SincResampler::Flush() { |
230 virtual_source_idx_ = 0; | 198 virtual_source_idx_ = 0; |
231 buffer_primed_ = false; | 199 buffer_primed_ = false; |
232 memset(input_buffer_.get(), 0, sizeof(*input_buffer_.get()) * kBufferSize); | 200 memset(input_buffer_.get(), 0, sizeof(*input_buffer_.get()) * kBufferSize); |
233 } | 201 } |
234 | 202 |
235 float SincResampler::Convolve(const float* input_ptr, const float* k1, | 203 float SincResampler::Convolve(const float* input_ptr, const float* k1, |
236 const float* k2, | 204 const float* k2, |
237 double kernel_interpolation_factor) { | 205 double kernel_interpolation_factor) { |
| 206 // Ensure |k1|, |k2| are 16-byte aligned for SSE usage. Should always be true |
| 207 // so long as kKernelSize is a multiple of 16. |
| 208 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F); |
| 209 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F); |
| 210 |
238 // Rely on function level static initialization to keep ConvolveProc selection | 211 // Rely on function level static initialization to keep ConvolveProc selection |
239 // thread safe. | 212 // thread safe. |
240 typedef float (*ConvolveProc)(const float* src, const float* k1, | 213 typedef float (*ConvolveProc)(const float* src, const float* k1, |
241 const float* k2, | 214 const float* k2, |
242 double kernel_interpolation_factor); | 215 double kernel_interpolation_factor); |
243 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) | 216 #if defined(ARCH_CPU_X86_FAMILY) |
| 217 #if defined(__SSE__) |
| 218 static const ConvolveProc kConvolveProc = Convolve_SSE; |
| 219 #else |
244 static const ConvolveProc kConvolveProc = | 220 static const ConvolveProc kConvolveProc = |
245 base::CPU().has_sse() ? Convolve_SSE : Convolve_C; | 221 base::CPU().has_sse() ? Convolve_SSE : Convolve_C; |
| 222 #endif |
246 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) | 223 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) |
247 static const ConvolveProc kConvolveProc = Convolve_NEON; | 224 static const ConvolveProc kConvolveProc = Convolve_NEON; |
248 #else | 225 #else |
249 static const ConvolveProc kConvolveProc = Convolve_C; | 226 static const ConvolveProc kConvolveProc = Convolve_C; |
250 #endif | 227 #endif |
251 | 228 |
252 return kConvolveProc(input_ptr, k1, k2, kernel_interpolation_factor); | 229 return kConvolveProc(input_ptr, k1, k2, kernel_interpolation_factor); |
253 } | 230 } |
254 | 231 |
255 float SincResampler::Convolve_C(const float* input_ptr, const float* k1, | 232 float SincResampler::Convolve_C(const float* input_ptr, const float* k1, |
256 const float* k2, | 233 const float* k2, |
257 double kernel_interpolation_factor) { | 234 double kernel_interpolation_factor) { |
258 float sum1 = 0; | 235 float sum1 = 0; |
259 float sum2 = 0; | 236 float sum2 = 0; |
260 | 237 |
261 // Generate a single output sample. Unrolling this loop hurt performance in | 238 // Generate a single output sample. Unrolling this loop hurt performance in |
262 // local testing. | 239 // local testing. |
263 int n = kKernelSize; | 240 int n = kKernelSize; |
264 while (n--) { | 241 while (n--) { |
265 sum1 += *input_ptr * *k1++; | 242 sum1 += *input_ptr * *k1++; |
266 sum2 += *input_ptr++ * *k2++; | 243 sum2 += *input_ptr++ * *k2++; |
267 } | 244 } |
268 | 245 |
269 // Linearly interpolate the two "convolutions". | 246 // Linearly interpolate the two "convolutions". |
270 return (1.0 - kernel_interpolation_factor) * sum1 | 247 return (1.0 - kernel_interpolation_factor) * sum1 |
271 + kernel_interpolation_factor * sum2; | 248 + kernel_interpolation_factor * sum2; |
272 } | 249 } |
273 | 250 |
274 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) | |
275 float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1, | |
276 const float* k2, | |
277 double kernel_interpolation_factor) { | |
278 // Ensure |k1|, |k2| are 16-byte aligned for SSE usage. Should always be true | |
279 // so long as kKernelSize is a multiple of 16. | |
280 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F); | |
281 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F); | |
282 | |
283 __m128 m_input; | |
284 __m128 m_sums1 = _mm_setzero_ps(); | |
285 __m128 m_sums2 = _mm_setzero_ps(); | |
286 | |
287 // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling | |
288 // these loops hurt performance in local testing. | |
289 if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) { | |
290 for (int i = 0; i < kKernelSize; i += 4) { | |
291 m_input = _mm_loadu_ps(input_ptr + i); | |
292 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); | |
293 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); | |
294 } | |
295 } else { | |
296 for (int i = 0; i < kKernelSize; i += 4) { | |
297 m_input = _mm_load_ps(input_ptr + i); | |
298 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); | |
299 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); | |
300 } | |
301 } | |
302 | |
303 // Linearly interpolate the two "convolutions". | |
304 m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1(1.0 - kernel_interpolation_factor)); | |
305 m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1(kernel_interpolation_factor)); | |
306 m_sums1 = _mm_add_ps(m_sums1, m_sums2); | |
307 | |
308 // Sum components together. | |
309 float result; | |
310 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1); | |
311 _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps( | |
312 m_sums2, m_sums2, 1))); | |
313 | |
314 return result; | |
315 } | |
316 #endif | |
317 | |
318 #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) | 251 #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) |
319 float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1, | 252 float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1, |
320 const float* k2, | 253 const float* k2, |
321 double kernel_interpolation_factor) { | 254 double kernel_interpolation_factor) { |
322 float32x4_t m_input; | 255 float32x4_t m_input; |
323 float32x4_t m_sums1 = vmovq_n_f32(0); | 256 float32x4_t m_sums1 = vmovq_n_f32(0); |
324 float32x4_t m_sums2 = vmovq_n_f32(0); | 257 float32x4_t m_sums2 = vmovq_n_f32(0); |
325 | 258 |
326 const float* upper = input_ptr + kKernelSize; | 259 const float* upper = input_ptr + kKernelSize; |
327 for (; input_ptr < upper; ) { | 260 for (; input_ptr < upper; ) { |
(...skipping 10 matching lines...) Expand all Loading... |
338 vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)), | 271 vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)), |
339 m_sums2, vmovq_n_f32(kernel_interpolation_factor)); | 272 m_sums2, vmovq_n_f32(kernel_interpolation_factor)); |
340 | 273 |
341 // Sum components together. | 274 // Sum components together. |
342 float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1)); | 275 float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1)); |
343 return vget_lane_f32(vpadd_f32(m_half, m_half), 0); | 276 return vget_lane_f32(vpadd_f32(m_half, m_half), 0); |
344 } | 277 } |
345 #endif | 278 #endif |
346 | 279 |
347 } // namespace media | 280 } // namespace media |
OLD | NEW |