Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(288)

Side by Side Diff: media/base/sinc_resampler.cc

Issue 10960023: Add ARM NEON intrinsic optimizations for SincResampler. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Fix NE issue for ARM. Created 8 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « media/base/sinc_resampler.h ('k') | media/base/sinc_resampler_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 // 4 //
5 // Input buffer layout, dividing the total buffer into regions (r0_ - r5_): 5 // Input buffer layout, dividing the total buffer into regions (r0_ - r5_):
6 // 6 //
7 // |----------------|-----------------------------------------|----------------| 7 // |----------------|-----------------------------------------|----------------|
8 // 8 //
9 // kBlockSize + kKernelSize / 2 9 // kBlockSize + kKernelSize / 2
10 // <---------------------------------------------------------> 10 // <--------------------------------------------------------->
(...skipping 18 matching lines...) Expand all
29 // 5) Goto (2) until all of input is consumed. 29 // 5) Goto (2) until all of input is consumed.
30 // 30 //
31 // Note: we're glossing over how the sub-sample handling works with 31 // Note: we're glossing over how the sub-sample handling works with
32 // |virtual_source_idx_|, etc. 32 // |virtual_source_idx_|, etc.
33 33
34 // MSVC++ requires this to be set before any other includes to get M_PI. 34 // MSVC++ requires this to be set before any other includes to get M_PI.
35 #define _USE_MATH_DEFINES 35 #define _USE_MATH_DEFINES
36 36
37 #include "media/base/sinc_resampler.h" 37 #include "media/base/sinc_resampler.h"
38 38
39 #include <cmath>
40
41 #include "base/cpu.h"
42 #include "base/logging.h"
43 #include "build/build_config.h"
44
39 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) 45 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
40 #include <xmmintrin.h> 46 #include <xmmintrin.h>
41 #endif 47 #endif
42 #include <cmath>
43 48
44 #include "base/cpu.h" 49 #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
45 #include "base/logging.h" 50 #include <arm_neon.h>
51 #endif
46 52
47 namespace media { 53 namespace media {
48 54
49 enum { 55 enum {
50 // The kernel size can be adjusted for quality (higher is better) at the 56 // The kernel size can be adjusted for quality (higher is better) at the
51 // expense of performance. Must be a multiple of 32. 57 // expense of performance. Must be a multiple of 32.
52 // TODO(dalecurtis): Test performance to see if we can jack this up to 64+. 58 // TODO(dalecurtis): Test performance to see if we can jack this up to 64+.
53 kKernelSize = 32, 59 kKernelSize = 32,
54 60
55 // The number of destination frames generated per processing pass. Affects 61 // The number of destination frames generated per processing pass. Affects
(...skipping 168 matching lines...) Expand 10 before | Expand all | Expand 10 after
224 const float* k2, 230 const float* k2,
225 double kernel_interpolation_factor) { 231 double kernel_interpolation_factor) {
226 // Rely on function level static initialization to keep ConvolveProc selection 232 // Rely on function level static initialization to keep ConvolveProc selection
227 // thread safe. 233 // thread safe.
228 typedef float (*ConvolveProc)(const float* src, const float* k1, 234 typedef float (*ConvolveProc)(const float* src, const float* k1,
229 const float* k2, 235 const float* k2,
230 double kernel_interpolation_factor); 236 double kernel_interpolation_factor);
231 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) 237 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
232 static const ConvolveProc kConvolveProc = 238 static const ConvolveProc kConvolveProc =
233 base::CPU().has_sse() ? Convolve_SSE : Convolve_C; 239 base::CPU().has_sse() ? Convolve_SSE : Convolve_C;
240 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
241 static const ConvolveProc kConvolveProc = Convolve_NEON;
234 #else 242 #else
235 static const ConvolveProc kConvolveProc = Convolve_C; 243 static const ConvolveProc kConvolveProc = Convolve_C;
236 #endif 244 #endif
237 245
238 return kConvolveProc(input_ptr, k1, k2, kernel_interpolation_factor); 246 return kConvolveProc(input_ptr, k1, k2, kernel_interpolation_factor);
239 } 247 }
240 248
241 float SincResampler::Convolve_C(const float* input_ptr, const float* k1, 249 float SincResampler::Convolve_C(const float* input_ptr, const float* k1,
242 const float* k2, 250 const float* k2,
243 double kernel_interpolation_factor) { 251 double kernel_interpolation_factor) {
(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after
294 // Sum components together. 302 // Sum components together.
295 float result; 303 float result;
296 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1); 304 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1);
297 _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps( 305 _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps(
298 m_sums2, m_sums2, 1))); 306 m_sums2, m_sums2, 1)));
299 307
300 return result; 308 return result;
301 } 309 }
302 #endif 310 #endif
303 311
312 #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
313 float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1,
314 const float* k2,
315 double kernel_interpolation_factor) {
316 float32x4_t m_input;
317 float32x4_t m_sums1 = vmovq_n_f32(0);
318 float32x4_t m_sums2 = vmovq_n_f32(0);
319
320 const float* upper = input_ptr + kKernelSize;
321 for (; input_ptr < upper; ) {
322 m_input = vld1q_f32(input_ptr);
323 input_ptr += 4;
324 m_sums1 = vmlaq_f32(m_sums1, m_input, vld1q_f32(k1));
325 k1 += 4;
326 m_sums2 = vmlaq_f32(m_sums2, m_input, vld1q_f32(k2));
327 k2 += 4;
328 }
329
330 // Linearly interpolate the two "convolutions".
331 m_sums1 = vmlaq_f32(
332 vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)),
333 m_sums2, vmovq_n_f32(kernel_interpolation_factor));
334
335 // Sum components together.
336 float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1));
337 return vget_lane_f32(vpadd_f32(m_half, m_half), 0);
338 }
339 #endif
340
304 } // namespace media 341 } // namespace media
OLDNEW
« no previous file with comments | « media/base/sinc_resampler.h ('k') | media/base/sinc_resampler_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698