Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 // | 4 // |
| 5 // Initial input buffer layout, dividing into regions r0_ to r4_ (note: r0_, r3_ | 5 // Initial input buffer layout, dividing into regions r0_ to r4_ (note: r0_, r3_ |
| 6 // and r4_ will move after the first load): | 6 // and r4_ will move after the first load): |
| 7 // | 7 // |
| 8 // |----------------|-----------------------------------------|----------------| | 8 // |----------------|-----------------------------------------|----------------| |
| 9 // | 9 // |
| 10 // request_frames_ | 10 // request_frames_ |
| (...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 74 // |virtual_source_idx_|, etc. | 74 // |virtual_source_idx_|, etc. |
| 75 | 75 |
| 76 // MSVC++ requires this to be set before any other includes to get M_PI. | 76 // MSVC++ requires this to be set before any other includes to get M_PI. |
| 77 #define _USE_MATH_DEFINES | 77 #define _USE_MATH_DEFINES |
| 78 | 78 |
| 79 #include "media/base/sinc_resampler.h" | 79 #include "media/base/sinc_resampler.h" |
| 80 | 80 |
| 81 #include <cmath> | 81 #include <cmath> |
| 82 #include <limits> | 82 #include <limits> |
| 83 | 83 |
| 84 #include "base/cpu.h" | |
| 85 #include "base/logging.h" | 84 #include "base/logging.h" |
| 86 | 85 |
| 87 #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) | 86 #if defined(ARCH_CPU_X86_FAMILY) |
| 87 #include <xmmintrin.h> | |
| 88 #define CONVOLVE_FUNC Convolve_SSE | |
| 89 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) | |
| 88 #include <arm_neon.h> | 90 #include <arm_neon.h> |
| 91 #define CONVOLVE_FUNC Convolve_NEON | |
| 92 #else | |
| 93 #define CONVOLVE_FUNC Convolve_C | |
| 89 #endif | 94 #endif |
| 90 | 95 |
| 91 namespace media { | 96 namespace media { |
| 92 | 97 |
| 93 static double SincScaleFactor(double io_ratio) { | 98 static double SincScaleFactor(double io_ratio) { |
| 94 // |sinc_scale_factor| is basically the normalized cutoff frequency of the | 99 // |sinc_scale_factor| is basically the normalized cutoff frequency of the |
| 95 // low-pass filter. | 100 // low-pass filter. |
| 96 double sinc_scale_factor = io_ratio > 1.0 ? 1.0 / io_ratio : 1.0; | 101 double sinc_scale_factor = io_ratio > 1.0 ? 1.0 / io_ratio : 1.0; |
| 97 | 102 |
| 98 // The sinc function is an idealized brick-wall filter, but since we're | 103 // The sinc function is an idealized brick-wall filter, but since we're |
| 99 // windowing it the transition from pass to stop does not happen right away. | 104 // windowing it the transition from pass to stop does not happen right away. |
| 100 // So we should adjust the low pass filter cutoff slightly downward to avoid | 105 // So we should adjust the low pass filter cutoff slightly downward to avoid |
| 101 // some aliasing at the very high-end. | 106 // some aliasing at the very high-end. |
| 102 // TODO(crogers): this value is empirical and to be more exact should vary | 107 // TODO(crogers): this value is empirical and to be more exact should vary |
| 103 // depending on kKernelSize. | 108 // depending on kKernelSize. |
| 104 sinc_scale_factor *= 0.9; | 109 sinc_scale_factor *= 0.9; |
| 105 | 110 |
| 106 return sinc_scale_factor; | 111 return sinc_scale_factor; |
| 107 } | 112 } |
| 108 | 113 |
| 109 // If we know the minimum architecture at compile time, avoid CPU detection. | |
| 110 // Force NaCl code to use C routines since (at present) nothing there uses these | |
| 111 // methods and plumbing the -msse built library is non-trivial. | |
| 112 #if defined(ARCH_CPU_X86_FAMILY) && !defined(OS_NACL) | |
| 113 #if defined(__SSE__) | |
| 114 #define CONVOLVE_FUNC Convolve_SSE | |
| 115 void SincResampler::InitializeCPUSpecificFeatures() {} | |
| 116 #else | |
| 117 // X86 CPU detection required. Functions will be set by | |
| 118 // InitializeCPUSpecificFeatures(). | |
| 119 // TODO(dalecurtis): Once Chrome moves to an SSE baseline this can be removed. | |
| 120 #define CONVOLVE_FUNC g_convolve_proc_ | |
| 121 | |
| 122 typedef float (*ConvolveProc)(const float*, const float*, const float*, double); | |
| 123 static ConvolveProc g_convolve_proc_ = NULL; | |
| 124 | |
| 125 void SincResampler::InitializeCPUSpecificFeatures() { | |
| 126 CHECK(!g_convolve_proc_); | |
| 127 g_convolve_proc_ = base::CPU().has_sse() ? Convolve_SSE : Convolve_C; | |
| 128 } | |
| 129 #endif | |
| 130 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) | |
| 131 #define CONVOLVE_FUNC Convolve_NEON | |
| 132 void SincResampler::InitializeCPUSpecificFeatures() {} | |
| 133 #else | |
| 134 // Unknown architecture. | |
| 135 #define CONVOLVE_FUNC Convolve_C | |
| 136 void SincResampler::InitializeCPUSpecificFeatures() {} | |
| 137 #endif | |
| 138 | |
| 139 SincResampler::SincResampler(double io_sample_rate_ratio, | 114 SincResampler::SincResampler(double io_sample_rate_ratio, |
| 140 int request_frames, | 115 int request_frames, |
| 141 const ReadCB& read_cb) | 116 const ReadCB& read_cb) |
| 142 : io_sample_rate_ratio_(io_sample_rate_ratio), | 117 : io_sample_rate_ratio_(io_sample_rate_ratio), |
| 143 read_cb_(read_cb), | 118 read_cb_(read_cb), |
| 144 request_frames_(request_frames), | 119 request_frames_(request_frames), |
| 145 input_buffer_size_(request_frames_ + kKernelSize), | 120 input_buffer_size_(request_frames_ + kKernelSize), |
| 146 // Create input buffers with a 16-byte alignment for SSE optimizations. | 121 // Create input buffers with a 16-byte alignment for SSE optimizations. |
| 147 kernel_storage_(static_cast<float*>( | 122 kernel_storage_(static_cast<float*>( |
| 148 base::AlignedAlloc(sizeof(float) * kKernelStorageSize, 16))), | 123 base::AlignedAlloc(sizeof(float) * kKernelStorageSize, 16))), |
| (...skipping 165 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 314 | 289 |
| 315 // Step (4) -- Reinitialize regions if necessary. | 290 // Step (4) -- Reinitialize regions if necessary. |
| 316 if (r0_ == r2_) | 291 if (r0_ == r2_) |
| 317 UpdateRegions(true); | 292 UpdateRegions(true); |
| 318 | 293 |
| 319 // Step (5) -- Refresh the buffer with more input. | 294 // Step (5) -- Refresh the buffer with more input. |
| 320 read_cb_.Run(request_frames_, r0_); | 295 read_cb_.Run(request_frames_, r0_); |
| 321 } | 296 } |
| 322 } | 297 } |
| 323 | 298 |
| 324 #undef CONVOLVE_FUNC | |
| 325 | |
| 326 int SincResampler::ChunkSize() const { | 299 int SincResampler::ChunkSize() const { |
| 327 return block_size_ / io_sample_rate_ratio_; | 300 return block_size_ / io_sample_rate_ratio_; |
| 328 } | 301 } |
| 329 | 302 |
| 330 void SincResampler::Flush() { | 303 void SincResampler::Flush() { |
| 331 virtual_source_idx_ = 0; | 304 virtual_source_idx_ = 0; |
| 332 buffer_primed_ = false; | 305 buffer_primed_ = false; |
| 333 memset(input_buffer_.get(), 0, | 306 memset(input_buffer_.get(), 0, |
| 334 sizeof(*input_buffer_.get()) * input_buffer_size_); | 307 sizeof(*input_buffer_.get()) * input_buffer_size_); |
| 335 UpdateRegions(false); | 308 UpdateRegions(false); |
| (...skipping 11 matching lines...) Expand all Loading... | |
| 347 while (n--) { | 320 while (n--) { |
| 348 sum1 += *input_ptr * *k1++; | 321 sum1 += *input_ptr * *k1++; |
| 349 sum2 += *input_ptr++ * *k2++; | 322 sum2 += *input_ptr++ * *k2++; |
| 350 } | 323 } |
| 351 | 324 |
| 352 // Linearly interpolate the two "convolutions". | 325 // Linearly interpolate the two "convolutions". |
| 353 return (1.0 - kernel_interpolation_factor) * sum1 | 326 return (1.0 - kernel_interpolation_factor) * sum1 |
| 354 + kernel_interpolation_factor * sum2; | 327 + kernel_interpolation_factor * sum2; |
| 355 } | 328 } |
| 356 | 329 |
| 357 #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) | 330 #if defined(ARCH_CPU_X86_FAMILY) |
| 331 float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1, | |
| 332 const float* k2, | |
| 333 double kernel_interpolation_factor) { | |
| 334 __m128 m_input; | |
| 335 __m128 m_sums1 = _mm_setzero_ps(); | |
| 336 __m128 m_sums2 = _mm_setzero_ps(); | |
| 337 | |
| 338 // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling | |
| 339 // these loops hurt performance in local testing. | |
| 340 if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) { | |
| 341 for (int i = 0; i < kKernelSize; i += 4) { | |
| 342 m_input = _mm_loadu_ps(input_ptr + i); | |
| 343 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); | |
| 344 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); | |
| 345 } | |
| 346 } else { | |
| 347 for (int i = 0; i < kKernelSize; i += 4) { | |
| 348 m_input = _mm_load_ps(input_ptr + i); | |
| 349 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); | |
| 350 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); | |
| 351 } | |
| 352 } | |
| 353 | |
| 354 // Linearly interpolate the two "convolutions". | |
| 355 m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1(1.0 - kernel_interpolation_factor)); | |
| 356 m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1(kernel_interpolation_factor)); | |
| 357 m_sums1 = _mm_add_ps(m_sums1, m_sums2); | |
| 358 | |
| 359 // Sum components together. | |
| 360 float result; | |
| 361 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1); | |
| 362 _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps( | |
| 363 m_sums2, m_sums2, 1))); | |
| 364 | |
| 365 return result; | |
| 366 } | |
| 367 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) | |
|
rileya (GONE FROM CHROMIUM)
2014/05/30 01:12:29
Not too relevant to this CL, but iirc we don't set
| |
| 358 float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1, | 368 float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1, |
| 359 const float* k2, | 369 const float* k2, |
| 360 double kernel_interpolation_factor) { | 370 double kernel_interpolation_factor) { |
| 361 float32x4_t m_input; | 371 float32x4_t m_input; |
| 362 float32x4_t m_sums1 = vmovq_n_f32(0); | 372 float32x4_t m_sums1 = vmovq_n_f32(0); |
| 363 float32x4_t m_sums2 = vmovq_n_f32(0); | 373 float32x4_t m_sums2 = vmovq_n_f32(0); |
| 364 | 374 |
| 365 const float* upper = input_ptr + kKernelSize; | 375 const float* upper = input_ptr + kKernelSize; |
| 366 for (; input_ptr < upper; ) { | 376 for (; input_ptr < upper; ) { |
| 367 m_input = vld1q_f32(input_ptr); | 377 m_input = vld1q_f32(input_ptr); |
| 368 input_ptr += 4; | 378 input_ptr += 4; |
| 369 m_sums1 = vmlaq_f32(m_sums1, m_input, vld1q_f32(k1)); | 379 m_sums1 = vmlaq_f32(m_sums1, m_input, vld1q_f32(k1)); |
| 370 k1 += 4; | 380 k1 += 4; |
| 371 m_sums2 = vmlaq_f32(m_sums2, m_input, vld1q_f32(k2)); | 381 m_sums2 = vmlaq_f32(m_sums2, m_input, vld1q_f32(k2)); |
| 372 k2 += 4; | 382 k2 += 4; |
| 373 } | 383 } |
| 374 | 384 |
| 375 // Linearly interpolate the two "convolutions". | 385 // Linearly interpolate the two "convolutions". |
| 376 m_sums1 = vmlaq_f32( | 386 m_sums1 = vmlaq_f32( |
| 377 vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)), | 387 vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)), |
| 378 m_sums2, vmovq_n_f32(kernel_interpolation_factor)); | 388 m_sums2, vmovq_n_f32(kernel_interpolation_factor)); |
| 379 | 389 |
| 380 // Sum components together. | 390 // Sum components together. |
| 381 float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1)); | 391 float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1)); |
| 382 return vget_lane_f32(vpadd_f32(m_half, m_half), 0); | 392 return vget_lane_f32(vpadd_f32(m_half, m_half), 0); |
| 383 } | 393 } |
| 384 #endif | 394 #endif |
| 385 | 395 |
| 386 } // namespace media | 396 } // namespace media |
| OLD | NEW |