| Index: media/filters/wsola_internals.cc
|
| diff --git a/media/filters/wsola_internals.cc b/media/filters/wsola_internals.cc
|
| index a247df91c5cd761093ff1c0753f648ef0d372911..9e12f032f3cc021a151c933aec88a2fe82e31bf7 100644
|
| --- a/media/filters/wsola_internals.cc
|
| +++ b/media/filters/wsola_internals.cc
|
| @@ -15,6 +15,14 @@
|
| #include "base/logging.h"
|
| #include "media/base/audio_bus.h"
|
|
|
| +#if defined(ARCH_CPU_X86_FAMILY)
|
| +#define USE_SIMD 1
|
| +#include <xmmintrin.h>
|
| +#elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
|
| +#define USE_SIMD 1
|
| +#include <arm_neon.h>
|
| +#endif
|
| +
|
| namespace media {
|
|
|
| namespace internal {
|
| @@ -48,13 +56,55 @@ void MultiChannelDotProduct(const AudioBus* a,
|
| DCHECK_LE(frame_offset_a + num_frames, a->frames());
|
| DCHECK_LE(frame_offset_b + num_frames, b->frames());
|
|
|
| +// SIMD optimized variants can provide a massive speedup to this operation.
|
| +#if defined(USE_SIMD)
|
| + const int rem = num_frames % 4;
|
| + const int last_index = num_frames - rem;
|
| + const int channels = a->channels();
|
| + for (int ch = 0; ch < channels; ++ch) {
|
| + const float* a_src = a->channel(ch) + frame_offset_a;
|
| + const float* b_src = b->channel(ch) + frame_offset_b;
|
| +
|
| +#if defined(ARCH_CPU_X86_FAMILY)
|
| + // First sum all components.
|
| + __m128 m_sum = _mm_setzero_ps();
|
| + for (int s = 0; s < last_index; s += 4) {
|
| + m_sum = _mm_add_ps(
|
| + m_sum, _mm_mul_ps(_mm_loadu_ps(a_src + s), _mm_loadu_ps(b_src + s)));
|
| + }
|
| +
|
| + // Reduce to a single float for this channel. Sadly, SSE1,2 doesn't have a
|
| + // horizontal sum function, so we have to condense manually.
|
| + m_sum = _mm_add_ps(_mm_movehl_ps(m_sum, m_sum), m_sum);
|
| + _mm_store_ss(dot_product + ch,
|
| + _mm_add_ss(m_sum, _mm_shuffle_ps(m_sum, m_sum, 1)));
|
| +#elif defined(ARCH_CPU_ARM_FAMILY)
|
| + // First sum all components.
|
| + float32x4_t m_sum = vmovq_n_f32(0);
|
| + for (int s = 0; s < last_index; s += 4)
|
| + m_sum = vmlaq_f32(m_sum, vld1q_f32(a_src + s), vld1q_f32(b_src + s));
|
| +
|
| + // Reduce to a single float for this channel.
|
| + float32x2_t m_half = vadd_f32(vget_high_f32(m_sum), vget_low_f32(m_sum));
|
| + dot_product[ch] = vget_lane_f32(vpadd_f32(m_half, m_half), 0);
|
| +#endif
|
| + }
|
| +
|
| + if (!rem)
|
| + return;
|
| + num_frames = rem;
|
| + frame_offset_a += last_index;
|
| + frame_offset_b += last_index;
|
| +#else
|
| memset(dot_product, 0, sizeof(*dot_product) * a->channels());
|
| +#endif // defined(USE_SIMD)
|
| +
|
| + // C version is required to handle remainder of frames (% 4 != 0)
|
| for (int k = 0; k < a->channels(); ++k) {
|
| const float* ch_a = a->channel(k) + frame_offset_a;
|
| const float* ch_b = b->channel(k) + frame_offset_b;
|
| - for (int n = 0; n < num_frames; ++n) {
|
| + for (int n = 0; n < num_frames; ++n)
|
| dot_product[k] += *ch_a++ * *ch_b++;
|
| - }
|
| }
|
| }
|
|
|
|
|