media/base/sinc_resampler.cc - Issue 10960023: Add ARM NEON intrinsic optimizations for SincResampler.

Side by Side Diff: media/base/sinc_resampler.cc

Issue 10960023: Add ARM NEON intrinsic optimizations for SincResampler. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Created 8 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4 //	4 //

5 // Input buffer layout, dividing the total buffer into regions (r0_ - r5_):	5 // Input buffer layout, dividing the total buffer into regions (r0_ - r5_):

6 //	6 //

7 // |----------------|-----------------------------------------|----------------|	7 // |----------------|-----------------------------------------|----------------|

8 //	8 //

9 // kBlockSize + kKernelSize / 2	9 // kBlockSize + kKernelSize / 2

10 // <--------------------------------------------------------->	10 // <--------------------------------------------------------->

(...skipping 18 matching lines...) Expand all Loading...
29 // 5) Goto (2) until all of input is consumed.	29 // 5) Goto (2) until all of input is consumed.

30 //	30 //

31 // Note: we're glossing over how the sub-sample handling works with	31 // Note: we're glossing over how the sub-sample handling works with

32 // |virtual_source_idx_|, etc.	32 // |virtual_source_idx_|, etc.

33	33

34 // MSVC++ requires this to be set before any other includes to get M_PI.	34 // MSVC++ requires this to be set before any other includes to get M_PI.

35 #define _USE_MATH_DEFINES	35 #define _USE_MATH_DEFINES

36	36

37 #include "media/base/sinc_resampler.h"	37 #include "media/base/sinc_resampler.h"

38	38

39 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)

40 #include <xmmintrin.h>

41 #endif

42 #include <cmath>	39 #include <cmath>

43	40

44 #include "base/cpu.h"	41 #include "base/cpu.h"

45 #include "base/logging.h"	42 #include "base/logging.h"

46	43

	44 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)

	45 #include <xmmintrin.h>

	46 #endif

	47

	48 #if defined(ARCH_CPU_ARM_FAMILY) && defined(__ARM_NEON__) && defined(USE_NEON)

	49 #include <arm_neon.h>

	50 #endif

	51

47 namespace media {	52 namespace media {

48	53

49 enum {	54 enum {

50 // The kernel size can be adjusted for quality (higher is better) at the	55 // The kernel size can be adjusted for quality (higher is better) at the

51 // expense of performance. Must be a multiple of 32.	56 // expense of performance. Must be a multiple of 32.

52 // TODO(dalecurtis): Test performance to see if we can jack this up to 64+.	57 // TODO(dalecurtis): Test performance to see if we can jack this up to 64+.

53 kKernelSize = 32,	58 kKernelSize = 32,

54	59

55 // The number of destination frames generated per processing pass. Affects	60 // The number of destination frames generated per processing pass. Affects

56 // how often and for how much SincResampler calls back for input. Must be	61 // how often and for how much SincResampler calls back for input. Must be

(...skipping 167 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
224 const float* k2,	229 const float* k2,

225 double kernel_interpolation_factor) {	230 double kernel_interpolation_factor) {

226 // Rely on function level static initialization to keep ConvolveProc selection	231 // Rely on function level static initialization to keep ConvolveProc selection

227 // thread safe.	232 // thread safe.

228 typedef float (*ConvolveProc)(const float* src, const float* k1,	233 typedef float (*ConvolveProc)(const float* src, const float* k1,

229 const float* k2,	234 const float* k2,

230 double kernel_interpolation_factor);	235 double kernel_interpolation_factor);

231 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)	236 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)

232 static const ConvolveProc kConvolveProc =	237 static const ConvolveProc kConvolveProc =

233 base::CPU().has_sse() ? Convolve_SSE : Convolve_C;	238 base::CPU().has_sse() ? Convolve_SSE : Convolve_C;

	239 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(__ARM_NEON__) && defined(USE_NEON)

	240 static const ConvolveProc kConvolveProc = Convolve_NEON;

234 #else	241 #else

235 static const ConvolveProc kConvolveProc = Convolve_C;	242 static const ConvolveProc kConvolveProc = Convolve_C;

236 #endif	243 #endif

237	244

238 return kConvolveProc(input_ptr, k1, k2, kernel_interpolation_factor);	245 return kConvolveProc(input_ptr, k1, k2, kernel_interpolation_factor);

239 }	246 }

240	247

241 float SincResampler::Convolve_C(const float* input_ptr, const float* k1,	248 float SincResampler::Convolve_C(const float* input_ptr, const float* k1,

242 const float* k2,	249 const float* k2,

243 double kernel_interpolation_factor) {	250 double kernel_interpolation_factor) {

(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
294 // Sum components together.	301 // Sum components together.

295 float result;	302 float result;

296 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1);	303 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1);

297 _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps(	304 _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps(

298 m_sums2, m_sums2, 1)));	305 m_sums2, m_sums2, 1)));

299	306

300 return result;	307 return result;

301 }	308 }

302 #endif	309 #endif

303	310

	311 #if defined(ARCH_CPU_ARM_FAMILY) && defined(__ARM_NEON__) && defined(USE_NEON)

	312 float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1,

	313 const float* k2,

	314 double kernel_interpolation_factor) {

	315 float32x4_t m_input;

	316 float32x4_t m_sums1 = vmovq_n_f32(0);

	317 float32x4_t m_sums2 = vmovq_n_f32(0);

	318

	319 for (int i = 0; i < kKernelSize; i += 4) {

	320 m_input = vld1q_f32(input_ptr + i);

	321 m_sums1 = vaddq_f32(m_sums1, vmulq_f32(m_input, vld1q_f32(k1 + i)));

	322 m_sums2 = vaddq_f32(m_sums2, vmulq_f32(m_input, vld1q_f32(k2 + i)));

	323 }

	324

	325 // Linearly interpolate the two "convolutions".

	326 m_sums1 = vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor));

	327 m_sums2 = vmulq_f32(m_sums2, vmovq_n_f32(kernel_interpolation_factor));

	328 m_sums1 = vaddq_f32(m_sums1, m_sums2);

	329

	330 // Sum components together.

	331 float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1));

	332 return vget_lane_f32(vpadd_f32(m_half, m_half), 0);

	333 }

	334 #endif

	335

304 } // namespace media	336 } // namespace media

OLD	NEW

« no previous file with comments | « media/base/sinc_resampler.h ('k') | media/base/sinc_resampler_unittest.cc » ('j') | media/base/sinc_resampler_unittest.cc » ('J')