media/filters/wsola_internals.cc - Issue 2556993002: Experiment with AVX optimizations for FMAC, FMUL operations.

Keyboard Shortcuts

	File
u :	up to issue
j / k :	jump to file after / before current file
J / K :	jump to next file with a comment after / before current file
	Side-by-side diff
i :	toggle intra-line diffs
e :	expand all comments
c :	collapse all comments
s :	toggle showing all comments
n / p :	next / previous diff chunk or comment
N / P :	next / previous comment
<Up> / <Down> :	next / previous line

	Issue
u :	up to list of issues
j / k :	jump to patch after / before current patch
o / <Enter> :	open current patch in side-by-side view
i :	open current patch in unified diff view

	Issue List
j / k :	jump to issue after / before current issue
o / <Enter> :	open current issue

Unified Diff: media/filters/wsola_internals.cc

Issue 2556993002: Experiment with AVX optimizations for FMAC, FMUL operations.

Patch Set: Created 4 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: media/filters/wsola_internals.cc

diff --git a/media/filters/wsola_internals.cc b/media/filters/wsola_internals.cc

index 9e12f032f3cc021a151c933aec88a2fe82e31bf7..353232679af01ce70dbe291bff8f2d4e02420f47 100644

--- a/media/filters/wsola_internals.cc

+++ b/media/filters/wsola_internals.cc

@@ -14,14 +14,7 @@

#include "base/logging.h"

#include "media/base/audio_bus.h"

-#if defined(ARCH_CPU_X86_FAMILY)

-#define USE_SIMD 1

-#include <xmmintrin.h>

-#elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)

-#define USE_SIMD 1

-#include <arm_neon.h>

-#endif

+#include "media/base/vector_math.h"

namespace media {

@@ -56,55 +49,11 @@ void MultiChannelDotProduct(const AudioBus* a,

DCHECK_LE(frame_offset_a + num_frames, a->frames());

DCHECK_LE(frame_offset_b + num_frames, b->frames());

-// SIMD optimized variants can provide a massive speedup to this operation.

-#if defined(USE_SIMD)

- const int rem = num_frames % 4;

- const int last_index = num_frames - rem;

const int channels = a->channels();

for (int ch = 0; ch < channels; ++ch) {

- const float* a_src = a->channel(ch) + frame_offset_a;

- const float* b_src = b->channel(ch) + frame_offset_b;

-#if defined(ARCH_CPU_X86_FAMILY)

- // First sum all components.

- __m128 m_sum = _mm_setzero_ps();

- for (int s = 0; s < last_index; s += 4) {

- m_sum = _mm_add_ps(

- m_sum, _mm_mul_ps(_mm_loadu_ps(a_src + s), _mm_loadu_ps(b_src + s)));

- }

- // Reduce to a single float for this channel. Sadly, SSE1,2 doesn't have a

- // horizontal sum function, so we have to condense manually.

- m_sum = _mm_add_ps(_mm_movehl_ps(m_sum, m_sum), m_sum);

- _mm_store_ss(dot_product + ch,

- _mm_add_ss(m_sum, _mm_shuffle_ps(m_sum, m_sum, 1)));

-#elif defined(ARCH_CPU_ARM_FAMILY)

- // First sum all components.

- float32x4_t m_sum = vmovq_n_f32(0);

- for (int s = 0; s < last_index; s += 4)

- m_sum = vmlaq_f32(m_sum, vld1q_f32(a_src + s), vld1q_f32(b_src + s));

- // Reduce to a single float for this channel.

- float32x2_t m_half = vadd_f32(vget_high_f32(m_sum), vget_low_f32(m_sum));

- dot_product[ch] = vget_lane_f32(vpadd_f32(m_half, m_half), 0);

-#endif

- }

- if (!rem)

- return;

- num_frames = rem;

- frame_offset_a += last_index;

- frame_offset_b += last_index;

-#else

- memset(dot_product, 0, sizeof(*dot_product) * a->channels());

-#endif // defined(USE_SIMD)

- // C version is required to handle remainder of frames (% 4 != 0)

- for (int k = 0; k < a->channels(); ++k) {

- const float* ch_a = a->channel(k) + frame_offset_a;

- const float* ch_b = b->channel(k) + frame_offset_b;

- for (int n = 0; n < num_frames; ++n)

- dot_product[k] += *ch_a++ * *ch_b++;

+ dot_product[ch] =

+ vector_math::DotProduct(a->channel(ch) + frame_offset_a,

+ b->channel(ch) + frame_offset_b, num_frames);

}

« no previous file with comments | « media/base/vector_math_unittest.cc ('k') | no next file » | no next file with comments »