media/filters/wsola_internals.cc - Issue 2527533002: Add SSE and NEON intrinsics for WSOLA's MultiChannelDotProduct().

Side by Side Diff: media/filters/wsola_internals.cc

Issue 2527533002: Add SSE and NEON intrinsics for WSOLA's MultiChannelDotProduct(). (Closed)

Patch Set: Zero initialize dot product for C version. Created 4 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2013 The Chromium Authors. All rights reserved.	1 // Copyright 2013 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 // MSVC++ requires this to be set before any other includes to get M_PI.	5 // MSVC++ requires this to be set before any other includes to get M_PI.

6 #define _USE_MATH_DEFINES	6 #define _USE_MATH_DEFINES

7	7

8 #include "media/filters/wsola_internals.h"	8 #include "media/filters/wsola_internals.h"

9	9

10 #include <algorithm>	10 #include <algorithm>

11 #include <cmath>	11 #include <cmath>

12 #include <limits>	12 #include <limits>

13 #include <memory>	13 #include <memory>

14	14

15 #include "base/logging.h"	15 #include "base/logging.h"

16 #include "media/base/audio_bus.h"	16 #include "media/base/audio_bus.h"

17	17

	18 #if defined(ARCH_CPU_X86_FAMILY)

	19 #define USE_SIMD 1

	20 #include <xmmintrin.h>

	21 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)

	22 #define USE_SIMD 1

	23 #include <arm_neon.h>

	24 #endif

	25

18 namespace media {	26 namespace media {

19	27

20 namespace internal {	28 namespace internal {

21	29

22 bool InInterval(int n, Interval q) {	30 bool InInterval(int n, Interval q) {

23 return n >= q.first && n <= q.second;	31 return n >= q.first && n <= q.second;

24 }	32 }

25	33

26 float MultiChannelSimilarityMeasure(const float* dot_prod_a_b,	34 float MultiChannelSimilarityMeasure(const float* dot_prod_a_b,

27 const float* energy_a,	35 const float* energy_a,

(...skipping 13 matching lines...) Expand all Loading...
41 const AudioBus* b,	49 const AudioBus* b,

42 int frame_offset_b,	50 int frame_offset_b,

43 int num_frames,	51 int num_frames,

44 float* dot_product) {	52 float* dot_product) {

45 DCHECK_EQ(a->channels(), b->channels());	53 DCHECK_EQ(a->channels(), b->channels());

46 DCHECK_GE(frame_offset_a, 0);	54 DCHECK_GE(frame_offset_a, 0);

47 DCHECK_GE(frame_offset_b, 0);	55 DCHECK_GE(frame_offset_b, 0);

48 DCHECK_LE(frame_offset_a + num_frames, a->frames());	56 DCHECK_LE(frame_offset_a + num_frames, a->frames());

49 DCHECK_LE(frame_offset_b + num_frames, b->frames());	57 DCHECK_LE(frame_offset_b + num_frames, b->frames());

50	58

	59 // SIMD optimized variants can provide a massive speedup to this operation.

	60 #if defined(USE_SIMD)

	61 const int rem = num_frames % 4;

	62 const int last_index = num_frames - rem;

	63 const int channels = a->channels();

	64 for (int ch = 0; ch < channels; ++ch) {

	65 const float* a_src = a->channel(ch) + frame_offset_a;

	66 const float* b_src = b->channel(ch) + frame_offset_b;

	67

	68 #if defined(ARCH_CPU_X86_FAMILY)

	69 // First sum all components.

	70 __m128 m_sum = _mm_setzero_ps();

	71 for (int s = 0; s < last_index; s += 4) {

	72 m_sum = _mm_add_ps(

	73 m_sum, _mm_mul_ps(_mm_loadu_ps(a_src + s), _mm_loadu_ps(b_src + s)));

	74 }

	75

	76 // Reduce to a single float for this channel. Sadly, SSE1,2 doesn't have a

	77 // horizontal sum function, so we have to condense manually.

	78 m_sum = _mm_add_ps(_mm_movehl_ps(m_sum, m_sum), m_sum);

	79 _mm_store_ss(dot_product + ch,

	80 _mm_add_ss(m_sum, _mm_shuffle_ps(m_sum, m_sum, 1)));

	81 #elif defined(ARCH_CPU_ARM_FAMILY)

	82 // First sum all components.

	83 float32x4_t m_sum = vmovq_n_f32(0);

	84 for (int s = 0; s < last_index; s += 4)

	85 m_sum = vmlaq_f32(m_sum, vld1q_f32(a_src + s), vld1q_f32(b_src + s));

	86

	87 // Reduce to a single float for this channel.

	88 float32x2_t m_half = vadd_f32(vget_high_f32(m_sum), vget_low_f32(m_sum));

	89 dot_product[ch] = vget_lane_f32(vpadd_f32(m_half, m_half), 0);

	90 #endif

	91 }

	92

	93 if (!rem)

	94 return;

	95 num_frames = rem;

	96 frame_offset_a += last_index;

	97 frame_offset_b += last_index;

	98 #else

51 memset(dot_product, 0, sizeof(*dot_product) * a->channels());	99 memset(dot_product, 0, sizeof(*dot_product) * a->channels());

	100 #endif // defined(USE_SIMD)

	101

	102 // C version is required to handle remainder of frames (% 4 != 0)

52 for (int k = 0; k < a->channels(); ++k) {	103 for (int k = 0; k < a->channels(); ++k) {

53 const float* ch_a = a->channel(k) + frame_offset_a;	104 const float* ch_a = a->channel(k) + frame_offset_a;

54 const float* ch_b = b->channel(k) + frame_offset_b;	105 const float* ch_b = b->channel(k) + frame_offset_b;

55 for (int n = 0; n < num_frames; ++n) {	106 for (int n = 0; n < num_frames; ++n)

56 dot_product[k] += *ch_a++ * *ch_b++;	107 dot_product[k] += *ch_a++ * *ch_b++;

57 }

58 }	108 }

59 }	109 }

60	110

61 void MultiChannelMovingBlockEnergies(const AudioBus* input,	111 void MultiChannelMovingBlockEnergies(const AudioBus* input,

62 int frames_per_block,	112 int frames_per_block,

63 float* energy) {	113 float* energy) {

64 int num_blocks = input->frames() - (frames_per_block - 1);	114 int num_blocks = input->frames() - (frames_per_block - 1);

65 int channels = input->channels();	115 int channels = input->channels();

66	116

67 for (int k = 0; k < input->channels(); ++k) {	117 for (int k = 0; k < input->channels(); ++k) {

(...skipping 193 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
261 void GetSymmetricHanningWindow(int window_length, float* window) {	311 void GetSymmetricHanningWindow(int window_length, float* window) {

262 const float scale = 2.0f * M_PI / window_length;	312 const float scale = 2.0f * M_PI / window_length;

263 for (int n = 0; n < window_length; ++n)	313 for (int n = 0; n < window_length; ++n)

264 window[n] = 0.5f * (1.0f - cosf(n * scale));	314 window[n] = 0.5f * (1.0f - cosf(n * scale));

265 }	315 }

266	316

267 } // namespace internal	317 } // namespace internal

268	318

269 } // namespace media	319 } // namespace media

270	320

OLD	NEW

« no previous file with comments | « no previous file | no next file » | no next file with comments »