Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 // | 4 // |
| 5 // Input buffer layout, dividing the total buffer into regions (r0_ - r5_): | 5 // Input buffer layout, dividing the total buffer into regions (r0_ - r5_): |
| 6 // | 6 // |
| 7 // |----------------|-----------------------------------------|----------------| | 7 // |----------------|-----------------------------------------|----------------| |
| 8 // | 8 // |
| 9 // kBlockSize + kKernelSize / 2 | 9 // kBlockSize + kKernelSize / 2 |
| 10 // <---------------------------------------------------------> | 10 // <---------------------------------------------------------> |
| (...skipping 18 matching lines...) Expand all Loading... | |
| 29 // 5) Goto (2) until all of input is consumed. | 29 // 5) Goto (2) until all of input is consumed. |
| 30 // | 30 // |
| 31 // Note: we're glossing over how the sub-sample handling works with | 31 // Note: we're glossing over how the sub-sample handling works with |
| 32 // |virtual_source_idx_|, etc. | 32 // |virtual_source_idx_|, etc. |
| 33 | 33 |
| 34 // MSVC++ requires this to be set before any other includes to get M_PI. | 34 // MSVC++ requires this to be set before any other includes to get M_PI. |
| 35 #define _USE_MATH_DEFINES | 35 #define _USE_MATH_DEFINES |
| 36 | 36 |
| 37 #include "media/base/sinc_resampler.h" | 37 #include "media/base/sinc_resampler.h" |
| 38 | 38 |
| 39 #include <cmath> | |
| 40 | |
| 41 #include "base/cpu.h" | |
| 42 #include "base/logging.h" | |
| 43 #include "build/build_config.h" | |
| 44 | |
| 39 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) | 45 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) |
| 40 #include <xmmintrin.h> | 46 #include <xmmintrin.h> |
| 41 #endif | 47 #endif |
| 42 #include <cmath> | |
| 43 | 48 |
| 44 #include "base/cpu.h" | 49 #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) |
| 45 #include "base/logging.h" | 50 #include <arm_neon.h> |
| 51 #endif | |
| 46 | 52 |
| 47 namespace media { | 53 namespace media { |
| 48 | 54 |
| 49 enum { | 55 enum { |
| 50 // The kernel size can be adjusted for quality (higher is better) at the | 56 // The kernel size can be adjusted for quality (higher is better) at the |
| 51 // expense of performance. Must be a multiple of 32. | 57 // expense of performance. Must be a multiple of 32. |
| 52 // TODO(dalecurtis): Test performance to see if we can jack this up to 64+. | 58 // TODO(dalecurtis): Test performance to see if we can jack this up to 64+. |
| 53 kKernelSize = 32, | 59 kKernelSize = 32, |
| 54 | 60 |
| 55 // The number of destination frames generated per processing pass. Affects | 61 // The number of destination frames generated per processing pass. Affects |
| (...skipping 168 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 224 const float* k2, | 230 const float* k2, |
| 225 double kernel_interpolation_factor) { | 231 double kernel_interpolation_factor) { |
| 226 // Rely on function level static initialization to keep ConvolveProc selection | 232 // Rely on function level static initialization to keep ConvolveProc selection |
| 227 // thread safe. | 233 // thread safe. |
| 228 typedef float (*ConvolveProc)(const float* src, const float* k1, | 234 typedef float (*ConvolveProc)(const float* src, const float* k1, |
| 229 const float* k2, | 235 const float* k2, |
| 230 double kernel_interpolation_factor); | 236 double kernel_interpolation_factor); |
| 231 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) | 237 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) |
| 232 static const ConvolveProc kConvolveProc = | 238 static const ConvolveProc kConvolveProc = |
| 233 base::CPU().has_sse() ? Convolve_SSE : Convolve_C; | 239 base::CPU().has_sse() ? Convolve_SSE : Convolve_C; |
| 240 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) | |
| 241 static const ConvolveProc kConvolveProc = Convolve_NEON; | |
| 234 #else | 242 #else |
| 235 static const ConvolveProc kConvolveProc = Convolve_C; | 243 static const ConvolveProc kConvolveProc = Convolve_C; |
| 236 #endif | 244 #endif |
| 237 | 245 |
| 238 return kConvolveProc(input_ptr, k1, k2, kernel_interpolation_factor); | 246 return kConvolveProc(input_ptr, k1, k2, kernel_interpolation_factor); |
| 239 } | 247 } |
| 240 | 248 |
| 241 float SincResampler::Convolve_C(const float* input_ptr, const float* k1, | 249 float SincResampler::Convolve_C(const float* input_ptr, const float* k1, |
| 242 const float* k2, | 250 const float* k2, |
| 243 double kernel_interpolation_factor) { | 251 double kernel_interpolation_factor) { |
| (...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 294 // Sum components together. | 302 // Sum components together. |
| 295 float result; | 303 float result; |
| 296 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1); | 304 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1); |
| 297 _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps( | 305 _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps( |
| 298 m_sums2, m_sums2, 1))); | 306 m_sums2, m_sums2, 1))); |
| 299 | 307 |
| 300 return result; | 308 return result; |
| 301 } | 309 } |
| 302 #endif | 310 #endif |
| 303 | 311 |
| 312 #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) | |
| 313 float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1, | |
| 314 const float* k2, | |
| 315 double kernel_interpolation_factor) { | |
| 316 float32x4_t m_input; | |
| 317 float32x4_t m_sums1 = vmovq_n_f32(0); | |
| 318 float32x4_t m_sums2 = vmovq_n_f32(0); | |
|
> **Johann** (2012/09/25 19:02:51): For some reason it looks like it's assembling with […]
>
> **DaleCurtis** (2012/09/25 20:30:01): Done. Old: `7d0: eddf 2b19 vldr d18, [pc, #100]`
>
> **DaleCurtis** (2012/09/25 21:27:01): Actually this triggers an uninitialized error and […]

| |
| 319 | |
| 320 const float* upper = input_ptr + kKernelSize; | |
| 321 for (; input_ptr < upper; ) { | |
| 322 m_input = vld1q_f32(input_ptr); | |
| 323 input_ptr += 4; | |
| 324 m_sums1 = vmlaq_f32(m_sums1, m_input, vld1q_f32(k1)); | |
| 325 k1 += 4; | |
| 326 m_sums2 = vmlaq_f32(m_sums2, m_input, vld1q_f32(k2)); | |
| 327 k2 += 4; | |
| 328 } | |
| 329 | |
| 330 // Linearly interpolate the two "convolutions". | |
| 331 m_sums1 = vmlaq_f32( | |
| 332 vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)), | |
| 333 m_sums2, vmovq_n_f32(kernel_interpolation_factor)); | |
| 334 | |
| 335 // Sum components together. | |
| 336 float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1)); | |
| 337 return vget_lane_f32(vpadd_f32(m_half, m_half), 0); | |
| 338 } | |
| 339 #endif | |
| 340 | |
| 304 } // namespace media | 341 } // namespace media |
| OLD | NEW |