| OLD | NEW |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 // | 4 // |
| 5 // Input buffer layout, dividing the total buffer into regions (r0_ - r5_): | 5 // Input buffer layout, dividing the total buffer into regions (r0_ - r5_): |
| 6 // | 6 // |
| 7 // |----------------|-----------------------------------------|----------------| | 7 // |----------------|-----------------------------------------|----------------| |
| 8 // | 8 // |
| 9 // kBlockSize + kKernelSize / 2 | 9 // kBlockSize + kKernelSize / 2 |
| 10 // <---------------------------------------------------------> | 10 // <---------------------------------------------------------> |
| (...skipping 18 matching lines...) Expand all Loading... |
| 29 // 5) Goto (2) until all of input is consumed. | 29 // 5) Goto (2) until all of input is consumed. |
| 30 // | 30 // |
| 31 // Note: we're glossing over how the sub-sample handling works with | 31 // Note: we're glossing over how the sub-sample handling works with |
| 32 // |virtual_source_idx_|, etc. | 32 // |virtual_source_idx_|, etc. |
| 33 | 33 |
| 34 // MSVC++ requires this to be set before any other includes to get M_PI. | 34 // MSVC++ requires this to be set before any other includes to get M_PI. |
| 35 #define _USE_MATH_DEFINES | 35 #define _USE_MATH_DEFINES |
| 36 | 36 |
| 37 #include "media/base/sinc_resampler.h" | 37 #include "media/base/sinc_resampler.h" |
| 38 | 38 |
| 39 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) | |
| 40 #include <xmmintrin.h> | |
| 41 #endif | |
| 42 #include <cmath> | 39 #include <cmath> |
| 43 | 40 |
| 44 #include "base/cpu.h" | 41 #include "base/cpu.h" |
| 45 #include "base/logging.h" | 42 #include "base/logging.h" |
| 46 | 43 |
| 44 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) |
| 45 #include <xmmintrin.h> |
| 46 #endif |
| 47 |
| 48 #if defined(ARCH_CPU_ARM_FAMILY) && defined(__ARM_NEON__) && defined(USE_NEON) |
| 49 #include <arm_neon.h> |
| 50 #endif |
| 51 |
| 47 namespace media { | 52 namespace media { |
| 48 | 53 |
| 49 enum { | 54 enum { |
| 50 // The kernel size can be adjusted for quality (higher is better) at the | 55 // The kernel size can be adjusted for quality (higher is better) at the |
| 51 // expense of performance. Must be a multiple of 32. | 56 // expense of performance. Must be a multiple of 32. |
| 52 // TODO(dalecurtis): Test performance to see if we can jack this up to 64+. | 57 // TODO(dalecurtis): Test performance to see if we can jack this up to 64+. |
| 53 kKernelSize = 32, | 58 kKernelSize = 32, |
| 54 | 59 |
| 55 // The number of destination frames generated per processing pass. Affects | 60 // The number of destination frames generated per processing pass. Affects |
| 56 // how often and for how much SincResampler calls back for input. Must be | 61 // how often and for how much SincResampler calls back for input. Must be |
| (...skipping 167 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 224 const float* k2, | 229 const float* k2, |
| 225 double kernel_interpolation_factor) { | 230 double kernel_interpolation_factor) { |
| 226 // Rely on function level static initialization to keep ConvolveProc selection | 231 // Rely on function level static initialization to keep ConvolveProc selection |
| 227 // thread safe. | 232 // thread safe. |
| 228 typedef float (*ConvolveProc)(const float* src, const float* k1, | 233 typedef float (*ConvolveProc)(const float* src, const float* k1, |
| 229 const float* k2, | 234 const float* k2, |
| 230 double kernel_interpolation_factor); | 235 double kernel_interpolation_factor); |
| 231 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) | 236 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) |
| 232 static const ConvolveProc kConvolveProc = | 237 static const ConvolveProc kConvolveProc = |
| 233 base::CPU().has_sse() ? Convolve_SSE : Convolve_C; | 238 base::CPU().has_sse() ? Convolve_SSE : Convolve_C; |
| 239 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(__ARM_NEON__) && defined(USE_NEON) |
| 240 static const ConvolveProc kConvolveProc = Convolve_NEON; |
| 234 #else | 241 #else |
| 235 static const ConvolveProc kConvolveProc = Convolve_C; | 242 static const ConvolveProc kConvolveProc = Convolve_C; |
| 236 #endif | 243 #endif |
| 237 | 244 |
| 238 return kConvolveProc(input_ptr, k1, k2, kernel_interpolation_factor); | 245 return kConvolveProc(input_ptr, k1, k2, kernel_interpolation_factor); |
| 239 } | 246 } |
| 240 | 247 |
| 241 float SincResampler::Convolve_C(const float* input_ptr, const float* k1, | 248 float SincResampler::Convolve_C(const float* input_ptr, const float* k1, |
| 242 const float* k2, | 249 const float* k2, |
| 243 double kernel_interpolation_factor) { | 250 double kernel_interpolation_factor) { |
| (...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 294 // Sum components together. | 301 // Sum components together. |
| 295 float result; | 302 float result; |
| 296 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1); | 303 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1); |
| 297 _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps( | 304 _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps( |
| 298 m_sums2, m_sums2, 1))); | 305 m_sums2, m_sums2, 1))); |
| 299 | 306 |
| 300 return result; | 307 return result; |
| 301 } | 308 } |
| 302 #endif | 309 #endif |
| 303 | 310 |
| 311 #if defined(ARCH_CPU_ARM_FAMILY) && defined(__ARM_NEON__) && defined(USE_NEON) |
| 312 float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1, |
| 313 const float* k2, |
| 314 double kernel_interpolation_factor) { |
| 315 float32x4_t m_input; |
| 316 float32x4_t m_sums1 = vmovq_n_f32(0); |
| 317 float32x4_t m_sums2 = vmovq_n_f32(0); |
| 318 |
| 319 for (int i = 0; i < kKernelSize; i += 4) { |
| 320 m_input = vld1q_f32(input_ptr + i); |
| 321 m_sums1 = vaddq_f32(m_sums1, vmulq_f32(m_input, vld1q_f32(k1 + i))); |
| 322 m_sums2 = vaddq_f32(m_sums2, vmulq_f32(m_input, vld1q_f32(k2 + i))); |
| 323 } |
| 324 |
| 325 // Linearly interpolate the two "convolutions". |
| 326 m_sums1 = vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)); |
| 327 m_sums2 = vmulq_f32(m_sums2, vmovq_n_f32(kernel_interpolation_factor)); |
| 328 m_sums1 = vaddq_f32(m_sums1, m_sums2); |
| 329 |
| 330 // Sum components together. |
| 331 float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1)); |
| 332 return vget_lane_f32(vpadd_f32(m_half, m_half), 0); |
| 333 } |
| 334 #endif |
| 335 |
| 304 } // namespace media | 336 } // namespace media |
| OLD | NEW |