| OLD | NEW |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 // | 4 // |
| 5 // Input buffer layout, dividing the total buffer into regions (r0_ - r5_): | 5 // Input buffer layout, dividing the total buffer into regions (r0_ - r5_): |
| 6 // | 6 // |
| 7 // |----------------|-----------------------------------------|----------------| | 7 // |----------------|-----------------------------------------|----------------| |
| 8 // | 8 // |
| 9 // kBlockSize + kKernelSize / 2 | 9 // kBlockSize + kKernelSize / 2 |
| 10 // <---------------------------------------------------------> | 10 // <---------------------------------------------------------> |
| (...skipping 18 matching lines...) Expand all Loading... |
| 29 // 5) Goto (2) until all of input is consumed. | 29 // 5) Goto (2) until all of input is consumed. |
| 30 // | 30 // |
| 31 // Note: we're glossing over how the sub-sample handling works with | 31 // Note: we're glossing over how the sub-sample handling works with |
| 32 // |virtual_source_idx_|, etc. | 32 // |virtual_source_idx_|, etc. |
| 33 | 33 |
| 34 // MSVC++ requires this to be set before any other includes to get M_PI. | 34 // MSVC++ requires this to be set before any other includes to get M_PI. |
| 35 #define _USE_MATH_DEFINES | 35 #define _USE_MATH_DEFINES |
| 36 | 36 |
| 37 #include "media/base/sinc_resampler.h" | 37 #include "media/base/sinc_resampler.h" |
| 38 | 38 |
| 39 #include <cmath> |
| 40 |
| 41 #include "base/cpu.h" |
| 42 #include "base/logging.h" |
| 43 #include "build/build_config.h" |
| 44 |
| 39 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) | 45 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) |
| 40 #include <xmmintrin.h> | 46 #include <xmmintrin.h> |
| 41 #endif | 47 #endif |
| 42 #include <cmath> | |
| 43 | 48 |
| 44 #include "base/cpu.h" | 49 #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) |
| 45 #include "base/logging.h" | 50 #include <arm_neon.h> |
| 51 #endif |
| 46 | 52 |
| 47 namespace media { | 53 namespace media { |
| 48 | 54 |
| 49 enum { | 55 enum { |
| 50 // The kernel size can be adjusted for quality (higher is better) at the | 56 // The kernel size can be adjusted for quality (higher is better) at the |
| 51 // expense of performance. Must be a multiple of 32. | 57 // expense of performance. Must be a multiple of 32. |
| 52 // TODO(dalecurtis): Test performance to see if we can jack this up to 64+. | 58 // TODO(dalecurtis): Test performance to see if we can jack this up to 64+. |
| 53 kKernelSize = 32, | 59 kKernelSize = 32, |
| 54 | 60 |
| 55 // The number of destination frames generated per processing pass. Affects | 61 // The number of destination frames generated per processing pass. Affects |
| (...skipping 168 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 224 const float* k2, | 230 const float* k2, |
| 225 double kernel_interpolation_factor) { | 231 double kernel_interpolation_factor) { |
| 226 // Rely on function level static initialization to keep ConvolveProc selection | 232 // Rely on function level static initialization to keep ConvolveProc selection |
| 227 // thread safe. | 233 // thread safe. |
| 228 typedef float (*ConvolveProc)(const float* src, const float* k1, | 234 typedef float (*ConvolveProc)(const float* src, const float* k1, |
| 229 const float* k2, | 235 const float* k2, |
| 230 double kernel_interpolation_factor); | 236 double kernel_interpolation_factor); |
| 231 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) | 237 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) |
| 232 static const ConvolveProc kConvolveProc = | 238 static const ConvolveProc kConvolveProc = |
| 233 base::CPU().has_sse() ? Convolve_SSE : Convolve_C; | 239 base::CPU().has_sse() ? Convolve_SSE : Convolve_C; |
| 240 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) |
| 241 static const ConvolveProc kConvolveProc = Convolve_NEON; |
| 234 #else | 242 #else |
| 235 static const ConvolveProc kConvolveProc = Convolve_C; | 243 static const ConvolveProc kConvolveProc = Convolve_C; |
| 236 #endif | 244 #endif |
| 237 | 245 |
| 238 return kConvolveProc(input_ptr, k1, k2, kernel_interpolation_factor); | 246 return kConvolveProc(input_ptr, k1, k2, kernel_interpolation_factor); |
| 239 } | 247 } |
| 240 | 248 |
| 241 float SincResampler::Convolve_C(const float* input_ptr, const float* k1, | 249 float SincResampler::Convolve_C(const float* input_ptr, const float* k1, |
| 242 const float* k2, | 250 const float* k2, |
| 243 double kernel_interpolation_factor) { | 251 double kernel_interpolation_factor) { |
| (...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 294 // Sum components together. | 302 // Sum components together. |
| 295 float result; | 303 float result; |
| 296 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1); | 304 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1); |
| 297 _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps( | 305 _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps( |
| 298 m_sums2, m_sums2, 1))); | 306 m_sums2, m_sums2, 1))); |
| 299 | 307 |
| 300 return result; | 308 return result; |
| 301 } | 309 } |
| 302 #endif | 310 #endif |
| 303 | 311 |
| 312 #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) |
| 313 float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1, |
| 314 const float* k2, |
| 315 double kernel_interpolation_factor) { |
| 316 float32x4_t m_input; |
| 317 float32x4_t m_sums1 = vmovq_n_f32(0); |
| 318 float32x4_t m_sums2 = vmovq_n_f32(0); |
| 319 |
| 320 for (int i = 0; i < kKernelSize; i += 4) { |
| 321 m_input = vld1q_f32(input_ptr + i); |
| 322 m_sums1 = vaddq_f32(m_sums1, vmulq_f32(m_input, vld1q_f32(k1 + i))); |
| 323 m_sums2 = vaddq_f32(m_sums2, vmulq_f32(m_input, vld1q_f32(k2 + i))); |
| 324 } |
| 325 |
| 326 // Linearly interpolate the two "convolutions". |
| 327 m_sums1 = vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)); |
| 328 m_sums2 = vmulq_f32(m_sums2, vmovq_n_f32(kernel_interpolation_factor)); |
| 329 m_sums1 = vaddq_f32(m_sums1, m_sums2); |
| 330 |
| 331 // Sum components together. |
| 332 float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1)); |
| 333 return vget_lane_f32(vpadd_f32(m_half, m_half), 0); |
| 334 } |
| 335 #endif |
| 336 |
| 304 } // namespace media | 337 } // namespace media |
| OLD | NEW |