Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(59)

Side by Side Diff: media/base/sinc_resampler.cc

Issue 10960023: Add ARM NEON intrinsic optimizations for SincResampler. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Created 8 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 // 4 //
5 // Input buffer layout, dividing the total buffer into regions (r0_ - r5_): 5 // Input buffer layout, dividing the total buffer into regions (r0_ - r5_):
6 // 6 //
7 // |----------------|-----------------------------------------|----------------| 7 // |----------------|-----------------------------------------|----------------|
8 // 8 //
9 // kBlockSize + kKernelSize / 2 9 // kBlockSize + kKernelSize / 2
10 // <---------------------------------------------------------> 10 // <--------------------------------------------------------->
(...skipping 18 matching lines...) Expand all
29 // 5) Goto (2) until all of input is consumed. 29 // 5) Goto (2) until all of input is consumed.
30 // 30 //
31 // Note: we're glossing over how the sub-sample handling works with 31 // Note: we're glossing over how the sub-sample handling works with
32 // |virtual_source_idx_|, etc. 32 // |virtual_source_idx_|, etc.
33 33
34 // MSVC++ requires this to be set before any other includes to get M_PI. 34 // MSVC++ requires this to be set before any other includes to get M_PI.
35 #define _USE_MATH_DEFINES 35 #define _USE_MATH_DEFINES
36 36
37 #include "media/base/sinc_resampler.h" 37 #include "media/base/sinc_resampler.h"
38 38
39 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
40 #include <xmmintrin.h>
41 #endif
42 #include <cmath> 39 #include <cmath>
43 40
44 #include "base/cpu.h" 41 #include "base/cpu.h"
45 #include "base/logging.h" 42 #include "base/logging.h"
46 43
44 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
45 #include <xmmintrin.h>
46 #endif
47
48 #if defined(ARCH_CPU_ARM_FAMILY) && defined(__ARM_NEON__) && defined(USE_NEON)
49 #include <arm_neon.h>
50 #endif
51
47 namespace media { 52 namespace media {
48 53
49 enum { 54 enum {
50 // The kernel size can be adjusted for quality (higher is better) at the 55 // The kernel size can be adjusted for quality (higher is better) at the
51 // expense of performance. Must be a multiple of 32. 56 // expense of performance. Must be a multiple of 32.
52 // TODO(dalecurtis): Test performance to see if we can jack this up to 64+. 57 // TODO(dalecurtis): Test performance to see if we can jack this up to 64+.
53 kKernelSize = 32, 58 kKernelSize = 32,
54 59
55 // The number of destination frames generated per processing pass. Affects 60 // The number of destination frames generated per processing pass. Affects
56 // how often and for how much SincResampler calls back for input. Must be 61 // how often and for how much SincResampler calls back for input. Must be
(...skipping 167 matching lines...) Expand 10 before | Expand all | Expand 10 after
224 const float* k2, 229 const float* k2,
225 double kernel_interpolation_factor) { 230 double kernel_interpolation_factor) {
226 // Rely on function level static initialization to keep ConvolveProc selection 231 // Rely on function level static initialization to keep ConvolveProc selection
227 // thread safe. 232 // thread safe.
228 typedef float (*ConvolveProc)(const float* src, const float* k1, 233 typedef float (*ConvolveProc)(const float* src, const float* k1,
229 const float* k2, 234 const float* k2,
230 double kernel_interpolation_factor); 235 double kernel_interpolation_factor);
231 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) 236 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
232 static const ConvolveProc kConvolveProc = 237 static const ConvolveProc kConvolveProc =
233 base::CPU().has_sse() ? Convolve_SSE : Convolve_C; 238 base::CPU().has_sse() ? Convolve_SSE : Convolve_C;
239 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(__ARM_NEON__) && defined(USE_NEON)
240 static const ConvolveProc kConvolveProc = Convolve_NEON;
234 #else 241 #else
235 static const ConvolveProc kConvolveProc = Convolve_C; 242 static const ConvolveProc kConvolveProc = Convolve_C;
236 #endif 243 #endif
237 244
238 return kConvolveProc(input_ptr, k1, k2, kernel_interpolation_factor); 245 return kConvolveProc(input_ptr, k1, k2, kernel_interpolation_factor);
239 } 246 }
240 247
241 float SincResampler::Convolve_C(const float* input_ptr, const float* k1, 248 float SincResampler::Convolve_C(const float* input_ptr, const float* k1,
242 const float* k2, 249 const float* k2,
243 double kernel_interpolation_factor) { 250 double kernel_interpolation_factor) {
(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after
294 // Sum components together. 301 // Sum components together.
295 float result; 302 float result;
296 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1); 303 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1);
297 _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps( 304 _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps(
298 m_sums2, m_sums2, 1))); 305 m_sums2, m_sums2, 1)));
299 306
300 return result; 307 return result;
301 } 308 }
302 #endif 309 #endif
303 310
311 #if defined(ARCH_CPU_ARM_FAMILY) && defined(__ARM_NEON__) && defined(USE_NEON)
// ARM NEON implementation of the convolution step.
//
// Accumulates, four floats at a time, the products input_ptr[i] * k1[i] and
// input_ptr[i] * k2[i] for i in [0, kKernelSize), blends the two accumulators
// with weights (1 - kernel_interpolation_factor) and
// kernel_interpolation_factor respectively, and returns the sum of the four
// blended lanes as a single float.
//
// Reads kKernelSize floats from each of |input_ptr|, |k1| and |k2|. The loop
// advances in steps of 4 with no scalar tail, so it relies on kKernelSize
// being a multiple of 4 (the enum comment requires a multiple of 32).
312 float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1,
313 const float* k2,
314 double kernel_interpolation_factor) {
315 float32x4_t m_input;
// One four-lane running sum per kernel, both starting at zero.
316 float32x4_t m_sums1 = vmovq_n_f32(0);
317 float32x4_t m_sums2 = vmovq_n_f32(0);
318
// Each pass loads four input samples once and multiply-accumulates them
// against the matching four coefficients of both kernels.
319 for (int i = 0; i < kKernelSize; i += 4) {
320 m_input = vld1q_f32(input_ptr + i);
321 m_sums1 = vaddq_f32(m_sums1, vmulq_f32(m_input, vld1q_f32(k1 + i)));
322 m_sums2 = vaddq_f32(m_sums2, vmulq_f32(m_input, vld1q_f32(k2 + i)));
323 }
324
325 // Linearly interpolate the two "convolutions".
// The weights are computed from the double argument and then broadcast to all
// four float lanes, so the k1 weight (1.0 - factor) is evaluated in double
// before being narrowed to float.
326 m_sums1 = vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor));
327 m_sums2 = vmulq_f32(m_sums2, vmovq_n_f32(kernel_interpolation_factor));
328 m_sums1 = vaddq_f32(m_sums1, m_sums2);
329
330 // Sum components together.
// First add the high and low halves (4 lanes -> 2 lanes), then pairwise-add
// that result with itself so lane 0 holds the total of all four lanes.
331 float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1));
332 return vget_lane_f32(vpadd_f32(m_half, m_half), 0);
333 }
334 #endif
335
304 } // namespace media 336 } // namespace media
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698