media/base/sinc_resampler.cc - Issue 10960023: Add ARM NEON intrinsic optimizations for SincResampler.

Side by Side Diff: media/base/sinc_resampler.cc

Issue 10960023: Add ARM NEON intrinsic optimizations for SincResampler. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Comments. Created 8 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4 //	4 //

5 // Input buffer layout, dividing the total buffer into regions (r0_ - r5_):	5 // Input buffer layout, dividing the total buffer into regions (r0_ - r5_):

6 //	6 //

7 // |----------------|-----------------------------------------|----------------|	7 // |----------------|-----------------------------------------|----------------|

8 //	8 //

9 // kBlockSize + kKernelSize / 2	9 // kBlockSize + kKernelSize / 2

10 // <--------------------------------------------------------->	10 // <--------------------------------------------------------->

(...skipping 18 matching lines...) Expand all Loading...
29 // 5) Goto (2) until all of input is consumed.	29 // 5) Goto (2) until all of input is consumed.

30 //	30 //

31 // Note: we're glossing over how the sub-sample handling works with	31 // Note: we're glossing over how the sub-sample handling works with

32 // |virtual_source_idx_|, etc.	32 // |virtual_source_idx_|, etc.

33	33

34 // MSVC++ requires this to be set before any other includes to get M_PI.	34 // MSVC++ requires this to be set before any other includes to get M_PI.

35 #define _USE_MATH_DEFINES	35 #define _USE_MATH_DEFINES

36	36

37 #include "media/base/sinc_resampler.h"	37 #include "media/base/sinc_resampler.h"

38	38

	39 #include <cmath>

	40

	41 #include "base/cpu.h"

	42 #include "base/logging.h"

	43 #include "build/build_config.h"

	44

39 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)	45 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)

40 #include <xmmintrin.h>	46 #include <xmmintrin.h>

41 #endif	47 #endif

42 #include <cmath>

43	48

44 #include "base/cpu.h"	49 #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)

45 #include "base/logging.h"	50 #include <arm_neon.h>

	51 #endif

46	52

47 namespace media {	53 namespace media {

48	54

49 enum {	55 enum {

50 // The kernel size can be adjusted for quality (higher is better) at the	56 // The kernel size can be adjusted for quality (higher is better) at the

51 // expense of performance. Must be a multiple of 32.	57 // expense of performance. Must be a multiple of 32.

52 // TODO(dalecurtis): Test performance to see if we can jack this up to 64+.	58 // TODO(dalecurtis): Test performance to see if we can jack this up to 64+.

53 kKernelSize = 32,	59 kKernelSize = 32,

54	60

55 // The number of destination frames generated per processing pass. Affects	61 // The number of destination frames generated per processing pass. Affects

(...skipping 168 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
224 const float* k2,	230 const float* k2,

225 double kernel_interpolation_factor) {	231 double kernel_interpolation_factor) {

226 // Rely on function level static initialization to keep ConvolveProc selection	232 // Rely on function level static initialization to keep ConvolveProc selection

227 // thread safe.	233 // thread safe.

228 typedef float (*ConvolveProc)(const float* src, const float* k1,	234 typedef float (*ConvolveProc)(const float* src, const float* k1,

229 const float* k2,	235 const float* k2,

230 double kernel_interpolation_factor);	236 double kernel_interpolation_factor);

231 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)	237 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)

232 static const ConvolveProc kConvolveProc =	238 static const ConvolveProc kConvolveProc =

233 base::CPU().has_sse() ? Convolve_SSE : Convolve_C;	239 base::CPU().has_sse() ? Convolve_SSE : Convolve_C;

	240 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)

	241 static const ConvolveProc kConvolveProc = Convolve_NEON;

234 #else	242 #else

235 static const ConvolveProc kConvolveProc = Convolve_C;	243 static const ConvolveProc kConvolveProc = Convolve_C;

236 #endif	244 #endif

237	245

238 return kConvolveProc(input_ptr, k1, k2, kernel_interpolation_factor);	246 return kConvolveProc(input_ptr, k1, k2, kernel_interpolation_factor);

239 }	247 }

240	248

241 float SincResampler::Convolve_C(const float* input_ptr, const float* k1,	249 float SincResampler::Convolve_C(const float* input_ptr, const float* k1,

242 const float* k2,	250 const float* k2,

243 double kernel_interpolation_factor) {	251 double kernel_interpolation_factor) {

(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
294 // Sum components together.	302 // Sum components together.

295 float result;	303 float result;

296 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1);	304 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1);

297 _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps(	305 _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps(

298 m_sums2, m_sums2, 1)));	306 m_sums2, m_sums2, 1)));

299	307

300 return result;	308 return result;

301 }	309 }

302 #endif	310 #endif

303	311

	312 #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)

	313 float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1,

	314 const float* k2,

	315 double kernel_interpolation_factor) {

	316 float32x4_t m_input;

	317 float32x4_t m_sums1 = vmovq_n_f32(0);

	318 float32x4_t m_sums2 = vmovq_n_f32(0);

	319

	320 for (int i = 0; i < kKernelSize; i += 4) {

	321 m_input = vld1q_f32(input_ptr + i);

	322 m_sums1 = vaddq_f32(m_sums1, vmulq_f32(m_input, vld1q_f32(k1 + i)));

	323 m_sums2 = vaddq_f32(m_sums2, vmulq_f32(m_input, vld1q_f32(k2 + i)));

	324 }

	325

	326 // Linearly interpolate the two "convolutions".

	327 m_sums1 = vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor));

	328 m_sums2 = vmulq_f32(m_sums2, vmovq_n_f32(kernel_interpolation_factor));

	329 m_sums1 = vaddq_f32(m_sums1, m_sums2);

	330

	331 // Sum components together.

	332 float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1));

	333 return vget_lane_f32(vpadd_f32(m_half, m_half), 0);

	334 }

	335 #endif

	336

304 } // namespace media	337 } // namespace media

OLD	NEW

« no previous file with comments | « media/base/sinc_resampler.h ('k') | media/base/sinc_resampler_unittest.cc » ('j') | media/base/sinc_resampler_unittest.cc » ('J')