media/base/sinc_resampler.cc - Issue 2556993002: Experiment with AVX optimizations for FMAC, FMUL operations.

Side by Side Diff: media/base/sinc_resampler.cc

Issue 2556993002: Experiment with AVX optimizations for FMAC, FMUL operations.

Patch Set: Created 4 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4 //	4 //

5 // Initial input buffer layout, dividing into regions r0_ to r4_ (note: r0_, r3_	5 // Initial input buffer layout, dividing into regions r0_ to r4_ (note: r0_, r3_

6 // and r4_ will move after the first load):	6 // and r4_ will move after the first load):

7 //	7 //

8 // |----------------|-----------------------------------------|----------------|	8 // |----------------|-----------------------------------------|----------------|

9 //	9 //

10 // request_frames_	10 // request_frames_

(...skipping 64 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
75	75

76 // MSVC++ requires this to be set before any other includes to get M_PI.	76 // MSVC++ requires this to be set before any other includes to get M_PI.

77 #define _USE_MATH_DEFINES	77 #define _USE_MATH_DEFINES

78	78

79 #include "media/base/sinc_resampler.h"	79 #include "media/base/sinc_resampler.h"

80	80

81 #include <cmath>	81 #include <cmath>

82 #include <limits>	82 #include <limits>

83	83

84 #include "base/logging.h"	84 #include "base/logging.h"

85 #include "build/build_config.h"

86

87 #if defined(ARCH_CPU_X86_FAMILY)

88 #include <xmmintrin.h>

89 #define CONVOLVE_FUNC Convolve_SSE

90 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)

91 #include <arm_neon.h>

92 #define CONVOLVE_FUNC Convolve_NEON

93 #else

94 #define CONVOLVE_FUNC Convolve_C

95 #endif

96	85

97 namespace media {	86 namespace media {

98	87

99 static double SincScaleFactor(double io_ratio) {	88 static double SincScaleFactor(double io_ratio) {

100 // |sinc_scale_factor| is basically the normalized cutoff frequency of the	89 // |sinc_scale_factor| is basically the normalized cutoff frequency of the

101 // low-pass filter.	90 // low-pass filter.

102 double sinc_scale_factor = io_ratio > 1.0 ? 1.0 / io_ratio : 1.0;	91 double sinc_scale_factor = io_ratio > 1.0 ? 1.0 / io_ratio : 1.0;

103	92

104 // The sinc function is an idealized brick-wall filter, but since we're	93 // The sinc function is an idealized brick-wall filter, but since we're

105 // windowing it the transition from pass to stop does not happen right away.	94 // windowing it the transition from pass to stop does not happen right away.

(...skipping 78 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
184	173

185 for (int i = 0; i < kKernelSize; ++i) {	174 for (int i = 0; i < kKernelSize; ++i) {

186 const int idx = i + offset_idx * kKernelSize;	175 const int idx = i + offset_idx * kKernelSize;

187 const float pre_sinc =	176 const float pre_sinc =

188 static_cast<float>(M_PI * (i - kKernelSize / 2 - subsample_offset));	177 static_cast<float>(M_PI * (i - kKernelSize / 2 - subsample_offset));

189 kernel_pre_sinc_storage_[idx] = pre_sinc;	178 kernel_pre_sinc_storage_[idx] = pre_sinc;

190	179

191 // Compute Blackman window, matching the offset of the sinc().	180 // Compute Blackman window, matching the offset of the sinc().

192 const float x = (i - subsample_offset) / kKernelSize;	181 const float x = (i - subsample_offset) / kKernelSize;

193 const float window = static_cast<float>(kA0 - kA1 * cos(2.0 * M_PI * x) +	182 const float window = static_cast<float>(kA0 - kA1 * cos(2.0 * M_PI * x) +

194 kA2 * cos(4.0 * M_PI * x));	183 kA2 * cos(4.0 * M_PI * x));

195 kernel_window_storage_[idx] = window;	184 kernel_window_storage_[idx] = window;

196	185

197 // Compute the sinc with offset, then window the sinc() function and store	186 // Compute the sinc with offset, then window the sinc() function and store

198 // at the correct offset.	187 // at the correct offset.

199 kernel_storage_[idx] = static_cast<float>(	188 kernel_storage_[idx] = static_cast<float>(

200 window * (pre_sinc ? sin(sinc_scale_factor * pre_sinc) / pre_sinc	189 window * (pre_sinc ? sin(sinc_scale_factor * pre_sinc) / pre_sinc

201 : sinc_scale_factor));	190 : sinc_scale_factor));

202 }	191 }

203 }	192 }

204 }	193 }

(...skipping 52 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
257 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F);	246 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F);

258 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F);	247 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F);

259	248

260 // Initialize input pointer based on quantized |virtual_source_idx_|.	249 // Initialize input pointer based on quantized |virtual_source_idx_|.

261 const float* input_ptr = r1_ + source_idx;	250 const float* input_ptr = r1_ + source_idx;

262	251

263 // Figure out how much to weight each kernel's "convolution".	252 // Figure out how much to weight each kernel's "convolution".

264 const double kernel_interpolation_factor =	253 const double kernel_interpolation_factor =

265 virtual_offset_idx - offset_idx;	254 virtual_offset_idx - offset_idx;

266 *destination++ =	255 *destination++ =

267 CONVOLVE_FUNC(input_ptr, k1, k2, kernel_interpolation_factor);	256 vector_math::Convolve(input_ptr, k1, k2, kernel_interpolation_factor);

268	257

269 // Advance the virtual index.	258 // Advance the virtual index.

270 virtual_source_idx_ += io_sample_rate_ratio_;	259 virtual_source_idx_ += io_sample_rate_ratio_;

271 if (!--remaining_frames)	260 if (!--remaining_frames)

272 return;	261 return;

273 }	262 }

274	263

275 // Wrap back around to the start.	264 // Wrap back around to the start.

276 DCHECK_GE(virtual_source_idx_, block_size_);	265 DCHECK_GE(virtual_source_idx_, block_size_);

277 virtual_source_idx_ -= block_size_;	266 virtual_source_idx_ -= block_size_;

(...skipping 24 matching lines...) Expand all Loading...
302 buffer_primed_ = false;	291 buffer_primed_ = false;

303 memset(input_buffer_.get(), 0,	292 memset(input_buffer_.get(), 0,

304 sizeof(*input_buffer_.get()) * input_buffer_size_);	293 sizeof(*input_buffer_.get()) * input_buffer_size_);

305 UpdateRegions(false);	294 UpdateRegions(false);

306 }	295 }

307	296

308 double SincResampler::BufferedFrames() const {	297 double SincResampler::BufferedFrames() const {

309 return buffer_primed_ ? request_frames_ - virtual_source_idx_ : 0;	298 return buffer_primed_ ? request_frames_ - virtual_source_idx_ : 0;

310 }	299 }

311	300

312 float SincResampler::Convolve_C(const float* input_ptr, const float* k1,

313 const float* k2,

314 double kernel_interpolation_factor) {

315 float sum1 = 0;

316 float sum2 = 0;

317

318 // Generate a single output sample. Unrolling this loop hurt performance in

319 // local testing.

320 int n = kKernelSize;

321 while (n--) {

322 sum1 += *input_ptr * *k1++;

323 sum2 += *input_ptr++ * *k2++;

324 }

325

326 // Linearly interpolate the two "convolutions".

327 return static_cast<float>((1.0 - kernel_interpolation_factor) * sum1 +

328 kernel_interpolation_factor * sum2);

329 }

330

331 #if defined(ARCH_CPU_X86_FAMILY)

332 float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1,

333 const float* k2,

334 double kernel_interpolation_factor) {

335 __m128 m_input;

336 __m128 m_sums1 = _mm_setzero_ps();

337 __m128 m_sums2 = _mm_setzero_ps();

338

339 // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling

340 // these loops hurt performance in local testing.

341 if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) {

342 for (int i = 0; i < kKernelSize; i += 4) {

343 m_input = _mm_loadu_ps(input_ptr + i);

344 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));

345 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));

346 }

347 } else {

348 for (int i = 0; i < kKernelSize; i += 4) {

349 m_input = _mm_load_ps(input_ptr + i);

350 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));

351 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));

352 }

353 }

354

355 // Linearly interpolate the two "convolutions".

356 m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1(

357 static_cast<float>(1.0 - kernel_interpolation_factor)));

358 m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1(

359 static_cast<float>(kernel_interpolation_factor)));

360 m_sums1 = _mm_add_ps(m_sums1, m_sums2);

361

362 // Sum components together.

363 float result;

364 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1);

365 _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps(

366 m_sums2, m_sums2, 1)));

367

368 return result;

369 }

370 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)

371 float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1,

372 const float* k2,

373 double kernel_interpolation_factor) {

374 float32x4_t m_input;

375 float32x4_t m_sums1 = vmovq_n_f32(0);

376 float32x4_t m_sums2 = vmovq_n_f32(0);

377

378 const float* upper = input_ptr + kKernelSize;

379 for (; input_ptr < upper; ) {

380 m_input = vld1q_f32(input_ptr);

381 input_ptr += 4;

382 m_sums1 = vmlaq_f32(m_sums1, m_input, vld1q_f32(k1));

383 k1 += 4;

384 m_sums2 = vmlaq_f32(m_sums2, m_input, vld1q_f32(k2));

385 k2 += 4;

386 }

387

388 // Linearly interpolate the two "convolutions".

389 m_sums1 = vmlaq_f32(

390 vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)),

391 m_sums2, vmovq_n_f32(kernel_interpolation_factor));

392

393 // Sum components together.

394 float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1));

395 return vget_lane_f32(vpadd_f32(m_half, m_half), 0);

396 }

397 #endif

398

399 } // namespace media	301 } // namespace media

OLD	NEW

« no previous file with comments | « media/base/sinc_resampler.h ('k') | media/base/sinc_resampler_perftest.cc » ('j') | no next file with comments »