Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(264)

Side by Side Diff: media/filters/wsola_internals.cc

Issue 2527533002: Add SSE and NEON intrinsics for WSOLA's MultiChannelDotProduct(). (Closed)
Patch Set: Zero initialize dot product for C version. Created 4 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2013 The Chromium Authors. All rights reserved. 1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 // MSVC++ requires this to be set before any other includes to get M_PI. 5 // MSVC++ requires this to be set before any other includes to get M_PI.
6 #define _USE_MATH_DEFINES 6 #define _USE_MATH_DEFINES
7 7
8 #include "media/filters/wsola_internals.h" 8 #include "media/filters/wsola_internals.h"
9 9
10 #include <algorithm> 10 #include <algorithm>
11 #include <cmath> 11 #include <cmath>
12 #include <limits> 12 #include <limits>
13 #include <memory> 13 #include <memory>
14 14
15 #include "base/logging.h" 15 #include "base/logging.h"
16 #include "media/base/audio_bus.h" 16 #include "media/base/audio_bus.h"
17 17
18 #if defined(ARCH_CPU_X86_FAMILY)
19 #define USE_SIMD 1
20 #include <xmmintrin.h>
21 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
22 #define USE_SIMD 1
23 #include <arm_neon.h>
24 #endif
25
18 namespace media { 26 namespace media {
19 27
20 namespace internal { 28 namespace internal {
21 29
22 bool InInterval(int n, Interval q) { 30 bool InInterval(int n, Interval q) {
23 return n >= q.first && n <= q.second; 31 return n >= q.first && n <= q.second;
24 } 32 }
25 33
26 float MultiChannelSimilarityMeasure(const float* dot_prod_a_b, 34 float MultiChannelSimilarityMeasure(const float* dot_prod_a_b,
27 const float* energy_a, 35 const float* energy_a,
(...skipping 13 matching lines...) Expand all
41 const AudioBus* b, 49 const AudioBus* b,
42 int frame_offset_b, 50 int frame_offset_b,
43 int num_frames, 51 int num_frames,
44 float* dot_product) { 52 float* dot_product) {
45 DCHECK_EQ(a->channels(), b->channels()); 53 DCHECK_EQ(a->channels(), b->channels());
46 DCHECK_GE(frame_offset_a, 0); 54 DCHECK_GE(frame_offset_a, 0);
47 DCHECK_GE(frame_offset_b, 0); 55 DCHECK_GE(frame_offset_b, 0);
48 DCHECK_LE(frame_offset_a + num_frames, a->frames()); 56 DCHECK_LE(frame_offset_a + num_frames, a->frames());
49 DCHECK_LE(frame_offset_b + num_frames, b->frames()); 57 DCHECK_LE(frame_offset_b + num_frames, b->frames());
50 58
59 // SIMD optimized variants can provide a massive speedup to this operation.
60 #if defined(USE_SIMD)
61 const int rem = num_frames % 4;
62 const int last_index = num_frames - rem;
63 const int channels = a->channels();
64 for (int ch = 0; ch < channels; ++ch) {
65 const float* a_src = a->channel(ch) + frame_offset_a;
66 const float* b_src = b->channel(ch) + frame_offset_b;
67
68 #if defined(ARCH_CPU_X86_FAMILY)
69 // First sum all components.
70 __m128 m_sum = _mm_setzero_ps();
71 for (int s = 0; s < last_index; s += 4) {
72 m_sum = _mm_add_ps(
73 m_sum, _mm_mul_ps(_mm_loadu_ps(a_src + s), _mm_loadu_ps(b_src + s)));
74 }
75
76 // Reduce to a single float for this channel. Sadly, SSE1 and SSE2 have no
77 // horizontal sum instruction, so we have to condense manually.
78 m_sum = _mm_add_ps(_mm_movehl_ps(m_sum, m_sum), m_sum);
79 _mm_store_ss(dot_product + ch,
80 _mm_add_ss(m_sum, _mm_shuffle_ps(m_sum, m_sum, 1)));
81 #elif defined(ARCH_CPU_ARM_FAMILY)
82 // First sum all components.
83 float32x4_t m_sum = vmovq_n_f32(0);
84 for (int s = 0; s < last_index; s += 4)
85 m_sum = vmlaq_f32(m_sum, vld1q_f32(a_src + s), vld1q_f32(b_src + s));
86
87 // Reduce to a single float for this channel.
88 float32x2_t m_half = vadd_f32(vget_high_f32(m_sum), vget_low_f32(m_sum));
89 dot_product[ch] = vget_lane_f32(vpadd_f32(m_half, m_half), 0);
90 #endif
91 }
92
93 if (!rem)
94 return;
95 num_frames = rem;
96 frame_offset_a += last_index;
97 frame_offset_b += last_index;
98 #else
51 memset(dot_product, 0, sizeof(*dot_product) * a->channels()); 99 memset(dot_product, 0, sizeof(*dot_product) * a->channels());
100 #endif // defined(USE_SIMD)
101
102 // C version; after SIMD it handles only the remainder (num_frames % 4).
52 for (int k = 0; k < a->channels(); ++k) { 103 for (int k = 0; k < a->channels(); ++k) {
53 const float* ch_a = a->channel(k) + frame_offset_a; 104 const float* ch_a = a->channel(k) + frame_offset_a;
54 const float* ch_b = b->channel(k) + frame_offset_b; 105 const float* ch_b = b->channel(k) + frame_offset_b;
55 for (int n = 0; n < num_frames; ++n) { 106 for (int n = 0; n < num_frames; ++n)
56 dot_product[k] += *ch_a++ * *ch_b++; 107 dot_product[k] += *ch_a++ * *ch_b++;
57 }
58 } 108 }
59 } 109 }
60 110
61 void MultiChannelMovingBlockEnergies(const AudioBus* input, 111 void MultiChannelMovingBlockEnergies(const AudioBus* input,
62 int frames_per_block, 112 int frames_per_block,
63 float* energy) { 113 float* energy) {
64 int num_blocks = input->frames() - (frames_per_block - 1); 114 int num_blocks = input->frames() - (frames_per_block - 1);
65 int channels = input->channels(); 115 int channels = input->channels();
66 116
67 for (int k = 0; k < input->channels(); ++k) { 117 for (int k = 0; k < input->channels(); ++k) {
(...skipping 193 matching lines...) Expand 10 before | Expand all | Expand 10 after
261 void GetSymmetricHanningWindow(int window_length, float* window) { 311 void GetSymmetricHanningWindow(int window_length, float* window) {
262 const float scale = 2.0f * M_PI / window_length; 312 const float scale = 2.0f * M_PI / window_length;
263 for (int n = 0; n < window_length; ++n) 313 for (int n = 0; n < window_length; ++n)
264 window[n] = 0.5f * (1.0f - cosf(n * scale)); 314 window[n] = 0.5f * (1.0f - cosf(n * scale));
265 } 315 }
266 316
267 } // namespace internal 317 } // namespace internal
268 318
269 } // namespace media 319 } // namespace media
270 320
OLDNEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698