media/base/sinc_resampler.cc - Issue 10960023: Add ARM NEON intrinsic optimizations for SincResampler.

Side by Side Diff: media/base/sinc_resampler.cc

Issue 10960023: Add ARM NEON intrinsic optimizations for SincResampler. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Use multiply-accumulate intrinsics. Created 8 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4 //	4 //

5 // Input buffer layout, dividing the total buffer into regions (r0_ - r5_):	5 // Input buffer layout, dividing the total buffer into regions (r0_ - r5_):

6 //	6 //

7 // |----------------|-----------------------------------------|----------------|	7 // |----------------|-----------------------------------------|----------------|

8 //	8 //

9 // kBlockSize + kKernelSize / 2	9 // kBlockSize + kKernelSize / 2

10 // <--------------------------------------------------------->	10 // <--------------------------------------------------------->

(...skipping 18 matching lines...) Expand all Loading...
29 // 5) Goto (2) until all of input is consumed.	29 // 5) Goto (2) until all of input is consumed.

30 //	30 //

31 // Note: we're glossing over how the sub-sample handling works with	31 // Note: we're glossing over how the sub-sample handling works with

32 // |virtual_source_idx_|, etc.	32 // |virtual_source_idx_|, etc.

33	33

34 // MSVC++ requires this to be set before any other includes to get M_PI.	34 // MSVC++ requires this to be set before any other includes to get M_PI.

35 #define _USE_MATH_DEFINES	35 #define _USE_MATH_DEFINES

36	36

37 #include "media/base/sinc_resampler.h"	37 #include "media/base/sinc_resampler.h"

38	38

	39 #include <cmath>

	40

	41 #include "base/cpu.h"

	42 #include "base/logging.h"

	43 #include "build/build_config.h"

	44

39 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)	45 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)

40 #include <xmmintrin.h>	46 #include <xmmintrin.h>

41 #endif	47 #endif

42 #include <cmath>

43	48

44 #include "base/cpu.h"	49 #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)

45 #include "base/logging.h"	50 #include <arm_neon.h>

	51 #endif

46	52

47 namespace media {	53 namespace media {

48	54

49 enum {	55 enum {

50 // The kernel size can be adjusted for quality (higher is better) at the	56 // The kernel size can be adjusted for quality (higher is better) at the

51 // expense of performance. Must be a multiple of 32.	57 // expense of performance. Must be a multiple of 32.

52 // TODO(dalecurtis): Test performance to see if we can jack this up to 64+.	58 // TODO(dalecurtis): Test performance to see if we can jack this up to 64+.

53 kKernelSize = 32,	59 kKernelSize = 32,

54	60

55 // The number of destination frames generated per processing pass. Affects	61 // The number of destination frames generated per processing pass. Affects

(...skipping 168 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
224 const float* k2,	230 const float* k2,

225 double kernel_interpolation_factor) {	231 double kernel_interpolation_factor) {

226 // Rely on function level static initialization to keep ConvolveProc selection	232 // Rely on function level static initialization to keep ConvolveProc selection

227 // thread safe.	233 // thread safe.

228 typedef float (*ConvolveProc)(const float* src, const float* k1,	234 typedef float (*ConvolveProc)(const float* src, const float* k1,

229 const float* k2,	235 const float* k2,

230 double kernel_interpolation_factor);	236 double kernel_interpolation_factor);

231 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)	237 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)

232 static const ConvolveProc kConvolveProc =	238 static const ConvolveProc kConvolveProc =

233 base::CPU().has_sse() ? Convolve_SSE : Convolve_C;	239 base::CPU().has_sse() ? Convolve_SSE : Convolve_C;

	240 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)

	241 static const ConvolveProc kConvolveProc = Convolve_NEON;

234 #else	242 #else

235 static const ConvolveProc kConvolveProc = Convolve_C;	243 static const ConvolveProc kConvolveProc = Convolve_C;

236 #endif	244 #endif

237	245

238 return kConvolveProc(input_ptr, k1, k2, kernel_interpolation_factor);	246 return kConvolveProc(input_ptr, k1, k2, kernel_interpolation_factor);

239 }	247 }

240	248

241 float SincResampler::Convolve_C(const float* input_ptr, const float* k1,	249 float SincResampler::Convolve_C(const float* input_ptr, const float* k1,

242 const float* k2,	250 const float* k2,

243 double kernel_interpolation_factor) {	251 double kernel_interpolation_factor) {

(...skipping 50 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
294 // Sum components together.	302 // Sum components together.

295 float result;	303 float result;

296 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1);	304 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1);

297 _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps(	305 _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps(

298 m_sums2, m_sums2, 1)));	306 m_sums2, m_sums2, 1)));

299	307

300 return result;	308 return result;

301 }	309 }

302 #endif	310 #endif

303	311

	312 #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)

	313 float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1,

	314 const float* k2,

	315 double kernel_interpolation_factor) {

	316 float32x4_t m_input;

	317 float32x4_t m_sums1 = vmovq_n_f32(0);

	318 float32x4_t m_sums2 = vmovq_n_f32(0);
	Johann 2012/09/25 19:02:51
	For some reason it looks like it's assembling with an actual load instruction (per the last disassembly I saw). Should be able to use VEOR as mentioned by JF. It might be messy because I don't see one in arm_neon.h that takes float32x4.

	DaleCurtis 2012/09/25 20:30:01 (in reply to Johann, 2012/09/25 19:02:51)
	Done.
	Old:
	  7d0: eddf 2b19 vldr d18, [pc, #100] ; 838 <_Z13Convolve_NEONPKfS0_S0_d+0x68>
	  7d4: eddf 3b1a vldr d19, [pc, #104] ; 840 <_Z13Convolve_NEONPKfS0_S0_d+0x70>
	  7d8: ef62 61f2 vorr q11, q9, q9
	  7dc: f100 0380 add.w r3, r0, #128 ; 0x80
	  7e0: eddd 8b00 vldr d24, [sp]
	New:
	  7cc: 2300 movs r3, #0
	  7ce: eddd 8b00 vldr d24, [sp]
	  7d2: eea2 3b90 vdup.32 q9, r3
	  7d6: f100 0380 add.w r3, r0, #128 ; 0x80
	  7da: ff42 21f2 veor q9, q9, q9
	  7de: ef62 61f2 vorr q11, q9, q9
	The new code seems to be "wasting" time zeroing twice though (vdup.32 and veor), but I suspect that's still better than two extra loads.

	DaleCurtis 2012/09/25 21:27:01 (in reply to DaleCurtis, 2012/09/25 20:30:01)
	Actually this triggers an uninitialized error and doesn't appear to be any faster per benchmarks:
	  Convolve_C took 5642.36ms.
	  Convolve_NEON (unaligned) took 2421.80ms; which is 2.33x faster than Convolve_C.
	  Convolve_NEON (aligned) took 2415.05ms; which is 2.34x faster than Convolve_C and 1.00x faster than Convolve_NEON (unaligned).
	So rather than adding a #pragma ignore for the uninitialized variable, I'll just use vmovq and hope future compilers do something smarter.
	319

	320 const float* upper = input_ptr + kKernelSize;

	321 for (; input_ptr < upper; ) {

	322 m_input = vld1q_f32(input_ptr);

	323 input_ptr += 4;

	324 m_sums1 = vmlaq_f32(m_sums1, m_input, vld1q_f32(k1));

	325 k1 += 4;

	326 m_sums2 = vmlaq_f32(m_sums2, m_input, vld1q_f32(k2));

	327 k2 += 4;

	328 }

	329

	330 // Linearly interpolate the two "convolutions".

	331 m_sums1 = vmlaq_f32(

	332 vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)),

	333 m_sums2, vmovq_n_f32(kernel_interpolation_factor));

	334

	335 // Sum components together.

	336 float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1));

	337 return vget_lane_f32(vpadd_f32(m_half, m_half), 0);

	338 }

	339 #endif

	340

304 } // namespace media	341 } // namespace media

OLD	NEW

« no previous file with comments | « media/base/sinc_resampler.h ('k') | media/base/sinc_resampler_unittest.cc » ('j') | no next file with comments »