media/filters/wsola_internals.cc - Issue 2556993002: Experiment with AVX optimizations for FMAC, FMUL operations.

Side by Side Diff: media/filters/wsola_internals.cc

Issue 2556993002: Experiment with AVX optimizations for FMAC, FMUL operations.

Patch Set: Created 4 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2013 The Chromium Authors. All rights reserved.	1 // Copyright 2013 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 // MSVC++ requires this to be set before any other includes to get M_PI.	5 // MSVC++ requires this to be set before any other includes to get M_PI.

6 #define _USE_MATH_DEFINES	6 #define _USE_MATH_DEFINES

7	7

8 #include "media/filters/wsola_internals.h"	8 #include "media/filters/wsola_internals.h"

9	9

10 #include <algorithm>	10 #include <algorithm>

11 #include <cmath>	11 #include <cmath>

12 #include <limits>	12 #include <limits>

13 #include <memory>	13 #include <memory>

14	14

15 #include "base/logging.h"	15 #include "base/logging.h"

16 #include "media/base/audio_bus.h"	16 #include "media/base/audio_bus.h"

17	17 #include "media/base/vector_math.h"

18 #if defined(ARCH_CPU_X86_FAMILY)

19 #define USE_SIMD 1

20 #include <xmmintrin.h>

21 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)

22 #define USE_SIMD 1

23 #include <arm_neon.h>

24 #endif

25	18

26 namespace media {	19 namespace media {

27	20

28 namespace internal {	21 namespace internal {

29	22

30 bool InInterval(int n, Interval q) {	23 bool InInterval(int n, Interval q) {

31 return n >= q.first && n <= q.second;	24 return n >= q.first && n <= q.second;

32 }	25 }

33	26

34 float MultiChannelSimilarityMeasure(const float* dot_prod_a_b,	27 float MultiChannelSimilarityMeasure(const float* dot_prod_a_b,

(...skipping 14 matching lines...) Expand all Loading...
49 const AudioBus* b,	42 const AudioBus* b,

50 int frame_offset_b,	43 int frame_offset_b,

51 int num_frames,	44 int num_frames,

52 float* dot_product) {	45 float* dot_product) {

53 DCHECK_EQ(a->channels(), b->channels());	46 DCHECK_EQ(a->channels(), b->channels());

54 DCHECK_GE(frame_offset_a, 0);	47 DCHECK_GE(frame_offset_a, 0);

55 DCHECK_GE(frame_offset_b, 0);	48 DCHECK_GE(frame_offset_b, 0);

56 DCHECK_LE(frame_offset_a + num_frames, a->frames());	49 DCHECK_LE(frame_offset_a + num_frames, a->frames());

57 DCHECK_LE(frame_offset_b + num_frames, b->frames());	50 DCHECK_LE(frame_offset_b + num_frames, b->frames());

58	51

59 // SIMD optimized variants can provide a massive speedup to this operation.

60 #if defined(USE_SIMD)

61 const int rem = num_frames % 4;

62 const int last_index = num_frames - rem;

63 const int channels = a->channels();	52 const int channels = a->channels();

64 for (int ch = 0; ch < channels; ++ch) {	53 for (int ch = 0; ch < channels; ++ch) {

65 const float* a_src = a->channel(ch) + frame_offset_a;	54 dot_product[ch] =

66 const float* b_src = b->channel(ch) + frame_offset_b;	55 vector_math::DotProduct(a->channel(ch) + frame_offset_a,

67	56 b->channel(ch) + frame_offset_b, num_frames);

68 #if defined(ARCH_CPU_X86_FAMILY)

69 // First sum all components.

70 __m128 m_sum = _mm_setzero_ps();

71 for (int s = 0; s < last_index; s += 4) {

72 m_sum = _mm_add_ps(

73 m_sum, _mm_mul_ps(_mm_loadu_ps(a_src + s), _mm_loadu_ps(b_src + s)));

74 }

75

76 // Reduce to a single float for this channel. Sadly, SSE1,2 doesn't have a

77 // horizontal sum function, so we have to condense manually.

78 m_sum = _mm_add_ps(_mm_movehl_ps(m_sum, m_sum), m_sum);

79 _mm_store_ss(dot_product + ch,

80 _mm_add_ss(m_sum, _mm_shuffle_ps(m_sum, m_sum, 1)));

81 #elif defined(ARCH_CPU_ARM_FAMILY)

82 // First sum all components.

83 float32x4_t m_sum = vmovq_n_f32(0);

84 for (int s = 0; s < last_index; s += 4)

85 m_sum = vmlaq_f32(m_sum, vld1q_f32(a_src + s), vld1q_f32(b_src + s));

86

87 // Reduce to a single float for this channel.

88 float32x2_t m_half = vadd_f32(vget_high_f32(m_sum), vget_low_f32(m_sum));

89 dot_product[ch] = vget_lane_f32(vpadd_f32(m_half, m_half), 0);

90 #endif

91 }

92

93 if (!rem)

94 return;

95 num_frames = rem;

96 frame_offset_a += last_index;

97 frame_offset_b += last_index;

98 #else

99 memset(dot_product, 0, sizeof(*dot_product) * a->channels());

100 #endif // defined(USE_SIMD)

101

102 // C version is required to handle remainder of frames (% 4 != 0)

103 for (int k = 0; k < a->channels(); ++k) {

104 const float* ch_a = a->channel(k) + frame_offset_a;

105 const float* ch_b = b->channel(k) + frame_offset_b;

106 for (int n = 0; n < num_frames; ++n)

107 dot_product[k] += *ch_a++ * *ch_b++;

108 }	57 }

109 }	58 }

110	59

111 void MultiChannelMovingBlockEnergies(const AudioBus* input,	60 void MultiChannelMovingBlockEnergies(const AudioBus* input,

112 int frames_per_block,	61 int frames_per_block,

113 float* energy) {	62 float* energy) {

114 int num_blocks = input->frames() - (frames_per_block - 1);	63 int num_blocks = input->frames() - (frames_per_block - 1);

115 int channels = input->channels();	64 int channels = input->channels();

116	65

117 for (int k = 0; k < input->channels(); ++k) {	66 for (int k = 0; k < input->channels(); ++k) {

(...skipping 193 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
311 void GetSymmetricHanningWindow(int window_length, float* window) {	260 void GetSymmetricHanningWindow(int window_length, float* window) {

312 const float scale = 2.0f * M_PI / window_length;	261 const float scale = 2.0f * M_PI / window_length;

313 for (int n = 0; n < window_length; ++n)	262 for (int n = 0; n < window_length; ++n)

314 window[n] = 0.5f * (1.0f - cosf(n * scale));	263 window[n] = 0.5f * (1.0f - cosf(n * scale));

315 }	264 }

316	265

317 } // namespace internal	266 } // namespace internal

318	267

319 } // namespace media	268 } // namespace media

320	269

OLD	NEW

« no previous file with comments | « media/base/vector_math_unittest.cc ('k') | no next file » | no next file with comments »