Index: media/filters/wsola_internals.cc |
diff --git a/media/filters/wsola_internals.cc b/media/filters/wsola_internals.cc |
index a247df91c5cd761093ff1c0753f648ef0d372911..9e12f032f3cc021a151c933aec88a2fe82e31bf7 100644 |
--- a/media/filters/wsola_internals.cc |
+++ b/media/filters/wsola_internals.cc |
@@ -15,6 +15,14 @@ |
#include "base/logging.h" |
#include "media/base/audio_bus.h" |
+#if defined(ARCH_CPU_X86_FAMILY) |
+#define USE_SIMD 1 |
+#include <xmmintrin.h> |
+#elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) |
+#define USE_SIMD 1 |
+#include <arm_neon.h> |
+#endif |
+ |
namespace media { |
namespace internal { |
@@ -48,13 +56,55 @@ void MultiChannelDotProduct(const AudioBus* a, |
DCHECK_LE(frame_offset_a + num_frames, a->frames()); |
DCHECK_LE(frame_offset_b + num_frames, b->frames()); |
+// SIMD optimized variants can provide a massive speedup to this operation. |
+#if defined(USE_SIMD) |
+ const int rem = num_frames % 4; |
+ const int last_index = num_frames - rem; |
+ const int channels = a->channels(); |
+ for (int ch = 0; ch < channels; ++ch) { |
+ const float* a_src = a->channel(ch) + frame_offset_a; |
+ const float* b_src = b->channel(ch) + frame_offset_b; |
+ |
+#if defined(ARCH_CPU_X86_FAMILY) |
+ // First sum all components. |
+ __m128 m_sum = _mm_setzero_ps(); |
+ for (int s = 0; s < last_index; s += 4) { |
+ m_sum = _mm_add_ps( |
+ m_sum, _mm_mul_ps(_mm_loadu_ps(a_src + s), _mm_loadu_ps(b_src + s))); |
+ } |
+ |
+ // Reduce to a single float for this channel. SSE1 and SSE2 have no |
+ // horizontal-add instruction, so the four lanes are summed manually. |
+ m_sum = _mm_add_ps(_mm_movehl_ps(m_sum, m_sum), m_sum); |
+ _mm_store_ss(dot_product + ch, |
+ _mm_add_ss(m_sum, _mm_shuffle_ps(m_sum, m_sum, 1))); |
+#elif defined(ARCH_CPU_ARM_FAMILY) |
+ // First sum all components. |
+ float32x4_t m_sum = vmovq_n_f32(0); |
+ for (int s = 0; s < last_index; s += 4) |
+ m_sum = vmlaq_f32(m_sum, vld1q_f32(a_src + s), vld1q_f32(b_src + s)); |
+ |
+ // Reduce to a single float for this channel. |
+ float32x2_t m_half = vadd_f32(vget_high_f32(m_sum), vget_low_f32(m_sum)); |
+ dot_product[ch] = vget_lane_f32(vpadd_f32(m_half, m_half), 0); |
+#endif |
+ } |
+ |
+ if (!rem) |
+ return; |
+ num_frames = rem; |
+ frame_offset_a += last_index; |
+ frame_offset_b += last_index; |
+#else |
memset(dot_product, 0, sizeof(*dot_product) * a->channels()); |
+#endif // defined(USE_SIMD) |
+ |
+ // Scalar path: all frames without SIMD, else only the (% 4) remainder. |
for (int k = 0; k < a->channels(); ++k) { |
const float* ch_a = a->channel(k) + frame_offset_a; |
const float* ch_b = b->channel(k) + frame_offset_b; |
- for (int n = 0; n < num_frames; ++n) { |
+ for (int n = 0; n < num_frames; ++n) |
dot_product[k] += *ch_a++ * *ch_b++; |
- } |
} |
} |