media/base/sinc_resampler.cc - Issue 2556993002: Experiment with AVX optimizations for FMAC, FMUL operations.

Unified Diff: media/base/sinc_resampler.cc

Issue 2556993002: Experiment with AVX optimizations for FMAC, FMUL operations.

Patch Set: Created 4 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: media/base/sinc_resampler.cc

diff --git a/media/base/sinc_resampler.cc b/media/base/sinc_resampler.cc

index cffb0c9d6dad86c02a410f6ded7233f756b7ea65..67ba750561959ace03424ec513fb19a5786658fe 100644

--- a/media/base/sinc_resampler.cc

+++ b/media/base/sinc_resampler.cc

@@ -82,17 +82,6 @@

#include <limits>

#include "base/logging.h"

-#include "build/build_config.h"

-#if defined(ARCH_CPU_X86_FAMILY)

-#include <xmmintrin.h>

-#define CONVOLVE_FUNC Convolve_SSE

-#elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)

-#include <arm_neon.h>

-#define CONVOLVE_FUNC Convolve_NEON

-#else

-#define CONVOLVE_FUNC Convolve_C

-#endif

namespace media {

@@ -191,7 +180,7 @@ void SincResampler::InitializeKernel() {

// Compute Blackman window, matching the offset of the sinc().

const float x = (i - subsample_offset) / kKernelSize;

const float window = static_cast<float>(kA0 - kA1 * cos(2.0 * M_PI * x) +

- kA2 * cos(4.0 * M_PI * x));

+ kA2 * cos(4.0 * M_PI * x));

kernel_window_storage_[idx] = window;

// Compute the sinc with offset, then window the sinc() function and store

@@ -264,7 +253,7 @@ void SincResampler::Resample(int frames, float* destination) {

const double kernel_interpolation_factor =

virtual_offset_idx - offset_idx;

*destination++ =

- CONVOLVE_FUNC(input_ptr, k1, k2, kernel_interpolation_factor);

+ vector_math::Convolve(input_ptr, k1, k2, kernel_interpolation_factor);

// Advance the virtual index.

virtual_source_idx_ += io_sample_rate_ratio_;

@@ -309,91 +298,4 @@ double SincResampler::BufferedFrames() const {

return buffer_primed_ ? request_frames_ - virtual_source_idx_ : 0;

}

-float SincResampler::Convolve_C(const float* input_ptr, const float* k1,

- const float* k2,

- double kernel_interpolation_factor) {

- float sum1 = 0;

- float sum2 = 0;

- // Generate a single output sample. Unrolling this loop hurt performance in

- // local testing.

- int n = kKernelSize;

- while (n--) {

- sum1 += *input_ptr * *k1++;

- sum2 += *input_ptr++ * *k2++;

- }

- // Linearly interpolate the two "convolutions".

- return static_cast<float>((1.0 - kernel_interpolation_factor) * sum1 +

- kernel_interpolation_factor * sum2);

-}

-#if defined(ARCH_CPU_X86_FAMILY)

-float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1,

- const float* k2,

- double kernel_interpolation_factor) {

- __m128 m_input;

- __m128 m_sums1 = _mm_setzero_ps();

- __m128 m_sums2 = _mm_setzero_ps();

- // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling

- // these loops hurt performance in local testing.

- if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) {

- for (int i = 0; i < kKernelSize; i += 4) {

- m_input = _mm_loadu_ps(input_ptr + i);

- m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));

- m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));

- }

- } else {

- for (int i = 0; i < kKernelSize; i += 4) {

- m_input = _mm_load_ps(input_ptr + i);

- m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));

- m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));

- }

- }

- // Linearly interpolate the two "convolutions".

- m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1(

- static_cast<float>(1.0 - kernel_interpolation_factor)));

- m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1(

- static_cast<float>(kernel_interpolation_factor)));

- m_sums1 = _mm_add_ps(m_sums1, m_sums2);

- // Sum components together.

- float result;

- m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1);

- _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps(

- m_sums2, m_sums2, 1)));

- return result;

-}

-#elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)

-float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1,

- const float* k2,

- double kernel_interpolation_factor) {

- float32x4_t m_input;

- float32x4_t m_sums1 = vmovq_n_f32(0);

- float32x4_t m_sums2 = vmovq_n_f32(0);

- const float* upper = input_ptr + kKernelSize;

- for (; input_ptr < upper; ) {

- m_input = vld1q_f32(input_ptr);

- input_ptr += 4;

- m_sums1 = vmlaq_f32(m_sums1, m_input, vld1q_f32(k1));

- k1 += 4;

- m_sums2 = vmlaq_f32(m_sums2, m_input, vld1q_f32(k2));

- k2 += 4;

- }

- // Linearly interpolate the two "convolutions".

- m_sums1 = vmlaq_f32(

- vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)),

- m_sums2, vmovq_n_f32(kernel_interpolation_factor));

- // Sum components together.

- float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1));

- return vget_lane_f32(vpadd_f32(m_half, m_half), 0);

-}

-#endif

} // namespace media

« no previous file with comments | « media/base/sinc_resampler.h ('k') | media/base/sinc_resampler_perftest.cc » ('j') | no next file with comments »