| Index: media/filters/wsola_internals.cc
|
| diff --git a/media/filters/wsola_internals.cc b/media/filters/wsola_internals.cc
|
| index 9e12f032f3cc021a151c933aec88a2fe82e31bf7..353232679af01ce70dbe291bff8f2d4e02420f47 100644
|
| --- a/media/filters/wsola_internals.cc
|
| +++ b/media/filters/wsola_internals.cc
|
| @@ -14,14 +14,7 @@
|
|
|
| #include "base/logging.h"
|
| #include "media/base/audio_bus.h"
|
| -
|
| -#if defined(ARCH_CPU_X86_FAMILY)
|
| -#define USE_SIMD 1
|
| -#include <xmmintrin.h>
|
| -#elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
|
| -#define USE_SIMD 1
|
| -#include <arm_neon.h>
|
| -#endif
|
| +#include "media/base/vector_math.h"
|
|
|
| namespace media {
|
|
|
| @@ -56,55 +49,11 @@ void MultiChannelDotProduct(const AudioBus* a,
|
| DCHECK_LE(frame_offset_a + num_frames, a->frames());
|
| DCHECK_LE(frame_offset_b + num_frames, b->frames());
|
|
|
| -// SIMD optimized variants can provide a massive speedup to this operation.
|
| -#if defined(USE_SIMD)
|
| - const int rem = num_frames % 4;
|
| - const int last_index = num_frames - rem;
|
| const int channels = a->channels();
|
| for (int ch = 0; ch < channels; ++ch) {
|
| - const float* a_src = a->channel(ch) + frame_offset_a;
|
| - const float* b_src = b->channel(ch) + frame_offset_b;
|
| -
|
| -#if defined(ARCH_CPU_X86_FAMILY)
|
| - // First sum all components.
|
| - __m128 m_sum = _mm_setzero_ps();
|
| - for (int s = 0; s < last_index; s += 4) {
|
| - m_sum = _mm_add_ps(
|
| - m_sum, _mm_mul_ps(_mm_loadu_ps(a_src + s), _mm_loadu_ps(b_src + s)));
|
| - }
|
| -
|
| - // Reduce to a single float for this channel. Sadly, SSE1,2 doesn't have a
|
| - // horizontal sum function, so we have to condense manually.
|
| - m_sum = _mm_add_ps(_mm_movehl_ps(m_sum, m_sum), m_sum);
|
| - _mm_store_ss(dot_product + ch,
|
| - _mm_add_ss(m_sum, _mm_shuffle_ps(m_sum, m_sum, 1)));
|
| -#elif defined(ARCH_CPU_ARM_FAMILY)
|
| - // First sum all components.
|
| - float32x4_t m_sum = vmovq_n_f32(0);
|
| - for (int s = 0; s < last_index; s += 4)
|
| - m_sum = vmlaq_f32(m_sum, vld1q_f32(a_src + s), vld1q_f32(b_src + s));
|
| -
|
| - // Reduce to a single float for this channel.
|
| - float32x2_t m_half = vadd_f32(vget_high_f32(m_sum), vget_low_f32(m_sum));
|
| - dot_product[ch] = vget_lane_f32(vpadd_f32(m_half, m_half), 0);
|
| -#endif
|
| - }
|
| -
|
| - if (!rem)
|
| - return;
|
| - num_frames = rem;
|
| - frame_offset_a += last_index;
|
| - frame_offset_b += last_index;
|
| -#else
|
| - memset(dot_product, 0, sizeof(*dot_product) * a->channels());
|
| -#endif // defined(USE_SIMD)
|
| -
|
| - // C version is required to handle remainder of frames (% 4 != 0)
|
| - for (int k = 0; k < a->channels(); ++k) {
|
| - const float* ch_a = a->channel(k) + frame_offset_a;
|
| - const float* ch_b = b->channel(k) + frame_offset_b;
|
| - for (int n = 0; n < num_frames; ++n)
|
| - dot_product[k] += *ch_a++ * *ch_b++;
|
| + dot_product[ch] =
|
| + vector_math::DotProduct(a->channel(ch) + frame_offset_a,
|
| + b->channel(ch) + frame_offset_b, num_frames);
|
| }
|
| }
|
|
|
|
|