media/base/sinc_resampler.cc - Issue 12478002: Break out SSE functions into new media_sse target.

Unified Diff: media/base/sinc_resampler.cc

Issue 12478002: Break out SSE functions into new media_sse target. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Fix presubmit. Created 7 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: media/base/sinc_resampler.cc

diff --git a/media/base/sinc_resampler.cc b/media/base/sinc_resampler.cc

index d836fc7cbcc862c3dc9c022fab8355df8375a588..89499bb824b068a6341849aa67f0a6228f57f365 100644

--- a/media/base/sinc_resampler.cc

+++ b/media/base/sinc_resampler.cc

@@ -42,43 +42,12 @@

#include "base/logging.h"

#include "build/build_config.h"

-#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)

-#include <xmmintrin.h>

-#endif

#if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)

#include <arm_neon.h>

#endif

namespace media {

-namespace {

-enum {

- // The kernel size can be adjusted for quality (higher is better) at the

- // expense of performance. Must be a multiple of 32.

- // TODO(dalecurtis): Test performance to see if we can jack this up to 64+.

- kKernelSize = 32,

- // The number of destination frames generated per processing pass. Affects

- // how often and for how much SincResampler calls back for input. Must be

- // greater than kKernelSize.

- kBlockSize = 512,

- // The kernel offset count is used for interpolation and is the number of

- // sub-sample kernel shifts. Can be adjusted for quality (higher is better)

- // at the expense of allocating more memory.

- kKernelOffsetCount = 32,

- kKernelStorageSize = kKernelSize * (kKernelOffsetCount + 1),

- // The size (in samples) of the internal buffer used by the resampler.

- kBufferSize = kBlockSize + kKernelSize

-};

-} // namespace

-const int SincResampler::kMaximumLookAheadSize = kBufferSize;

SincResampler::SincResampler(double io_sample_rate_ratio, const ReadCB& read_cb)

: io_sample_rate_ratio_(io_sample_rate_ratio),

virtual_source_idx_(0),

@@ -222,7 +191,7 @@ void SincResampler::Resample(float* destination, int frames) {

}

-int SincResampler::ChunkSize() {

+int SincResampler::ChunkSize() const {

return kBlockSize / io_sample_rate_ratio_;

}

@@ -235,12 +204,17 @@ void SincResampler::Flush() {

float SincResampler::Convolve(const float* input_ptr, const float* k1,

const float* k2,

double kernel_interpolation_factor) {

+ // Ensure |k1|, |k2| are 16-byte aligned for SSE usage. Should always be true

+ // so long as kKernelSize is a multiple of 16.

+ DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F);

+ DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F);

// Rely on function level static initialization to keep ConvolveProc selection

// thread safe.

typedef float (*ConvolveProc)(const float* src, const float* k1,

const float* k2,

double kernel_interpolation_factor);

-#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)

+#if defined(ARCH_CPU_X86_FAMILY)

static const ConvolveProc kConvolveProc =

[Review comment] Mark Mentovai, 2013/03/05 21:32:31: #if defined(OS_MACOSX), you can use Convolve_SSE d… (comment truncated in the page view)

[Review comment] DaleCurtis, 2013/03/05 21:51:37: Even better, I can just check for __SSE__.

base::CPU().has_sse() ? Convolve_SSE : Convolve_C;

#elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)

@@ -271,50 +245,6 @@ float SincResampler::Convolve_C(const float* input_ptr, const float* k1,

+ kernel_interpolation_factor * sum2;

}

-#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)

-float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1,

- const float* k2,

- double kernel_interpolation_factor) {

- // Ensure |k1|, |k2| are 16-byte aligned for SSE usage. Should always be true

- // so long as kKernelSize is a multiple of 16.

- DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F);

- DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F);

- __m128 m_input;

- __m128 m_sums1 = _mm_setzero_ps();

- __m128 m_sums2 = _mm_setzero_ps();

- // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling

- // these loops hurt performance in local testing.

- if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) {

- for (int i = 0; i < kKernelSize; i += 4) {

- m_input = _mm_loadu_ps(input_ptr + i);

- m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));

- m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));

- }

- } else {

- for (int i = 0; i < kKernelSize; i += 4) {

- m_input = _mm_load_ps(input_ptr + i);

- m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));

- m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));

- }

- // Linearly interpolate the two "convolutions".

- m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1(1.0 - kernel_interpolation_factor));

- m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1(kernel_interpolation_factor));

- m_sums1 = _mm_add_ps(m_sums1, m_sums2);

- // Sum components together.

- float result;

- m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1);

- _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps(

- m_sums2, m_sums2, 1)));

- return result;

-#endif

#if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)

float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1,

const float* k2,

« media/base/sinc_resampler.h ('K') | « media/base/sinc_resampler.h ('k') | media/base/sinc_resampler_unittest.cc » ('j') | media/base/vector_math.cc » ('J')