media/base/sinc_resampler.cc - Issue 308003004: Remove runtime CPU detection for SSE optimized media/ methods.

Side by Side Diff: media/base/sinc_resampler.cc

Issue 308003004: Remove runtime CPU detection for SSE optimized media/ methods. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Remove vector math. Created 6 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4 //	4 //

5 // Initial input buffer layout, dividing into regions r0_ to r4_ (note: r0_, r3_	5 // Initial input buffer layout, dividing into regions r0_ to r4_ (note: r0_, r3_

6 // and r4_ will move after the first load):	6 // and r4_ will move after the first load):

7 //	7 //

8 // \|----------------\|-----------------------------------------\|----------------\|	8 // \|----------------\|-----------------------------------------\|----------------\|

9 //	9 //

10 // request_frames_	10 // request_frames_

(...skipping 63 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
74 // \|virtual_source_idx_\|, etc.	74 // \|virtual_source_idx_\|, etc.

75	75

76 // MSVC++ requires this to be set before any other includes to get M_PI.	76 // MSVC++ requires this to be set before any other includes to get M_PI.

77 #define _USE_MATH_DEFINES	77 #define _USE_MATH_DEFINES

78	78

79 #include "media/base/sinc_resampler.h"	79 #include "media/base/sinc_resampler.h"

80	80

81 #include <cmath>	81 #include <cmath>

82 #include <limits>	82 #include <limits>

83	83

84 #include "base/cpu.h"

85 #include "base/logging.h"	84 #include "base/logging.h"

86	85

87 #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)	86 #if defined(ARCH_CPU_X86_FAMILY)

	87 #include <xmmintrin.h>

	88 #define CONVOLVE_FUNC Convolve_SSE

	89 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)

88 #include <arm_neon.h>	90 #include <arm_neon.h>

	91 #define CONVOLVE_FUNC Convolve_NEON

	92 #else

	93 #define CONVOLVE_FUNC Convolve_C

89 #endif	94 #endif

90	95

91 namespace media {	96 namespace media {

92	97

93 static double SincScaleFactor(double io_ratio) {	98 static double SincScaleFactor(double io_ratio) {

94 // \|sinc_scale_factor\| is basically the normalized cutoff frequency of the	99 // \|sinc_scale_factor\| is basically the normalized cutoff frequency of the

95 // low-pass filter.	100 // low-pass filter.

96 double sinc_scale_factor = io_ratio > 1.0 ? 1.0 / io_ratio : 1.0;	101 double sinc_scale_factor = io_ratio > 1.0 ? 1.0 / io_ratio : 1.0;

97	102

98 // The sinc function is an idealized brick-wall filter, but since we're	103 // The sinc function is an idealized brick-wall filter, but since we're

99 // windowing it the transition from pass to stop does not happen right away.	104 // windowing it the transition from pass to stop does not happen right away.

100 // So we should adjust the low pass filter cutoff slightly downward to avoid	105 // So we should adjust the low pass filter cutoff slightly downward to avoid

101 // some aliasing at the very high-end.	106 // some aliasing at the very high-end.

102 // TODO(crogers): this value is empirical and to be more exact should vary	107 // TODO(crogers): this value is empirical and to be more exact should vary

103 // depending on kKernelSize.	108 // depending on kKernelSize.

104 sinc_scale_factor *= 0.9;	109 sinc_scale_factor *= 0.9;

105	110

106 return sinc_scale_factor;	111 return sinc_scale_factor;

107 }	112 }

108	113

109 // If we know the minimum architecture at compile time, avoid CPU detection.

110 // Force NaCl code to use C routines since (at present) nothing there uses these

111 // methods and plumbing the -msse built library is non-trivial.

112 #if defined(ARCH_CPU_X86_FAMILY) && !defined(OS_NACL)

113 #if defined(__SSE__)

114 #define CONVOLVE_FUNC Convolve_SSE

115 void SincResampler::InitializeCPUSpecificFeatures() {}

116 #else

117 // X86 CPU detection required. Functions will be set by

118 // InitializeCPUSpecificFeatures().

119 // TODO(dalecurtis): Once Chrome moves to an SSE baseline this can be removed.

120 #define CONVOLVE_FUNC g_convolve_proc_

121

122 typedef float (ConvolveProc)(const float, const float, const float, double);

123 static ConvolveProc g_convolve_proc_ = NULL;

124

125 void SincResampler::InitializeCPUSpecificFeatures() {

126 CHECK(!g_convolve_proc_);

127 g_convolve_proc_ = base::CPU().has_sse() ? Convolve_SSE : Convolve_C;

128 }

129 #endif

130 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)

131 #define CONVOLVE_FUNC Convolve_NEON

132 void SincResampler::InitializeCPUSpecificFeatures() {}

133 #else

134 // Unknown architecture.

135 #define CONVOLVE_FUNC Convolve_C

136 void SincResampler::InitializeCPUSpecificFeatures() {}

137 #endif

138

139 SincResampler::SincResampler(double io_sample_rate_ratio,	114 SincResampler::SincResampler(double io_sample_rate_ratio,

140 int request_frames,	115 int request_frames,

141 const ReadCB& read_cb)	116 const ReadCB& read_cb)

142 : io_sample_rate_ratio_(io_sample_rate_ratio),	117 : io_sample_rate_ratio_(io_sample_rate_ratio),

143 read_cb_(read_cb),	118 read_cb_(read_cb),

144 request_frames_(request_frames),	119 request_frames_(request_frames),

145 input_buffer_size_(request_frames_ + kKernelSize),	120 input_buffer_size_(request_frames_ + kKernelSize),

146 // Create input buffers with a 16-byte alignment for SSE optimizations.	121 // Create input buffers with a 16-byte alignment for SSE optimizations.

147 kernel_storage_(static_cast<float*>(	122 kernel_storage_(static_cast<float*>(

148 base::AlignedAlloc(sizeof(float) * kKernelStorageSize, 16))),	123 base::AlignedAlloc(sizeof(float) * kKernelStorageSize, 16))),

(...skipping 165 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
314	289

315 // Step (4) -- Reinitialize regions if necessary.	290 // Step (4) -- Reinitialize regions if necessary.

316 if (r0_ == r2_)	291 if (r0_ == r2_)

317 UpdateRegions(true);	292 UpdateRegions(true);

318	293

319 // Step (5) -- Refresh the buffer with more input.	294 // Step (5) -- Refresh the buffer with more input.

320 read_cb_.Run(request_frames_, r0_);	295 read_cb_.Run(request_frames_, r0_);

321 }	296 }

322 }	297 }

323	298

324 #undef CONVOLVE_FUNC

325

326 int SincResampler::ChunkSize() const {	299 int SincResampler::ChunkSize() const {

327 return block_size_ / io_sample_rate_ratio_;	300 return block_size_ / io_sample_rate_ratio_;

328 }	301 }

329	302

330 void SincResampler::Flush() {	303 void SincResampler::Flush() {

331 virtual_source_idx_ = 0;	304 virtual_source_idx_ = 0;

332 buffer_primed_ = false;	305 buffer_primed_ = false;

333 memset(input_buffer_.get(), 0,	306 memset(input_buffer_.get(), 0,

334 sizeof(input_buffer_.get()) input_buffer_size_);	307 sizeof(input_buffer_.get()) input_buffer_size_);

335 UpdateRegions(false);	308 UpdateRegions(false);

(...skipping 11 matching lines...) Expand all Loading...
347 while (n--) {	320 while (n--) {

348 sum1 += input_ptr *k1++;	321 sum1 += input_ptr *k1++;

349 sum2 += input_ptr++ *k2++;	322 sum2 += input_ptr++ *k2++;

350 }	323 }

351	324

352 // Linearly interpolate the two "convolutions".	325 // Linearly interpolate the two "convolutions".

353 return (1.0 - kernel_interpolation_factor) * sum1	326 return (1.0 - kernel_interpolation_factor) * sum1

354 + kernel_interpolation_factor * sum2;	327 + kernel_interpolation_factor * sum2;

355 }	328 }

356	329

357 #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)	330 #if defined(ARCH_CPU_X86_FAMILY)

	331 float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1,

	332 const float* k2,

	333 double kernel_interpolation_factor) {

	334 __m128 m_input;

	335 __m128 m_sums1 = _mm_setzero_ps();

	336 __m128 m_sums2 = _mm_setzero_ps();

	337

	338 // Based on \|input_ptr\| alignment, we need to use loadu or load. Unrolling

	339 // these loops hurt performance in local testing.

	340 if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) {

	341 for (int i = 0; i < kKernelSize; i += 4) {

	342 m_input = _mm_loadu_ps(input_ptr + i);

	343 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));

	344 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));

	345 }

	346 } else {

	347 for (int i = 0; i < kKernelSize; i += 4) {

	348 m_input = _mm_load_ps(input_ptr + i);

	349 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));

	350 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));

	351 }

	352 }

	353

	354 // Linearly interpolate the two "convolutions".

	355 m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1(1.0 - kernel_interpolation_factor));

	356 m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1(kernel_interpolation_factor));

	357 m_sums1 = _mm_add_ps(m_sums1, m_sums2);

	358

	359 // Sum components together.

	360 float result;

	361 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1);

	362 _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps(

	363 m_sums2, m_sums2, 1)));

	364

	365 return result;

	366 }

	367 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
	rileya (GONE FROM CHROMIUM) 2014/05/30 01:12:29 Not too relevant to this CL, but iirc we don't set Not too relevant to this CL, but iirc we don't set USE_NEON in official builds due to some specific chipsets not supporting it (Tegra 2 I think). It might be worth doing runtime detection if that is the case (I seem to recall some sort of 'arm_optionally_use_neon' gyp flag that might be relevant there). This is my recollection from probably 6 months ago though, so things may have changed since then.
358 float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1,	368 float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1,

359 const float* k2,	369 const float* k2,

360 double kernel_interpolation_factor) {	370 double kernel_interpolation_factor) {

361 float32x4_t m_input;	371 float32x4_t m_input;

362 float32x4_t m_sums1 = vmovq_n_f32(0);	372 float32x4_t m_sums1 = vmovq_n_f32(0);

363 float32x4_t m_sums2 = vmovq_n_f32(0);	373 float32x4_t m_sums2 = vmovq_n_f32(0);

364	374

365 const float* upper = input_ptr + kKernelSize;	375 const float* upper = input_ptr + kKernelSize;

366 for (; input_ptr < upper; ) {	376 for (; input_ptr < upper; ) {

367 m_input = vld1q_f32(input_ptr);	377 m_input = vld1q_f32(input_ptr);

368 input_ptr += 4;	378 input_ptr += 4;

369 m_sums1 = vmlaq_f32(m_sums1, m_input, vld1q_f32(k1));	379 m_sums1 = vmlaq_f32(m_sums1, m_input, vld1q_f32(k1));

370 k1 += 4;	380 k1 += 4;

371 m_sums2 = vmlaq_f32(m_sums2, m_input, vld1q_f32(k2));	381 m_sums2 = vmlaq_f32(m_sums2, m_input, vld1q_f32(k2));

372 k2 += 4;	382 k2 += 4;

373 }	383 }

374	384

375 // Linearly interpolate the two "convolutions".	385 // Linearly interpolate the two "convolutions".

376 m_sums1 = vmlaq_f32(	386 m_sums1 = vmlaq_f32(

377 vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)),	387 vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)),

378 m_sums2, vmovq_n_f32(kernel_interpolation_factor));	388 m_sums2, vmovq_n_f32(kernel_interpolation_factor));

379	389

380 // Sum components together.	390 // Sum components together.

381 float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1));	391 float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1));

382 return vget_lane_f32(vpadd_f32(m_half, m_half), 0);	392 return vget_lane_f32(vpadd_f32(m_half, m_half), 0);

383 }	393 }

384 #endif	394 #endif

385	395

386 } // namespace media	396 } // namespace media

OLD	NEW

« no previous file with comments | « media/base/sinc_resampler.h ('k') | media/base/sinc_resampler_perftest.cc » ('j') | no next file with comments »