Chromium Code Reviews

Side by Side Diff: media/base/vector_math.cc

Issue 2556993002: Experiment with AVX optimizations for FMAC, FMUL operations.
Patch Set: Created 4 years ago
media/base/vector_math.cc (as patched; new side of the diff):
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "media/base/vector_math.h"

#include <algorithm>

#include "base/cpu.h"
#include "base/logging.h"
#include "build/build_config.h"
#include "media/base/vector_math_testing.h"

// NaCl does not allow intrinsics.
#if defined(ARCH_CPU_X86_FAMILY) && !defined(OS_NACL)
#include <xmmintrin.h>

using FmacProc = void (*)(const float*, float, int, float*);
static FmacProc g_fmac_proc_ = nullptr;
using DotProductProc = float (*)(const float*, const float*, int);
static DotProductProc g_dotproduct_proc_ = nullptr;
using ConvolveProc = float (*)(const float*,
                               const float*,
                               const float*,
                               double);
static ConvolveProc g_convolve_proc_ = nullptr;

// AVX FMAC only performs well on AVX2+ machines, since those machines have
// true 256-bit processing units rather than wrappers around 128-bit units.
#define INITIALIZE()                                                           \
  do {                                                                         \
    CHECK(!g_fmac_proc_);                                                      \
    CHECK(!g_convolve_proc_);                                                  \
    CHECK(!g_dotproduct_proc_);                                                \
    base::CPU cpu_info;                                                        \
    g_fmac_proc_ = cpu_info.has_avx2() ? FMAC_AVX : FMAC_SSE;                  \
    g_convolve_proc_ = cpu_info.has_avx() ? Convolve_AVX : Convolve_SSE;       \
    g_dotproduct_proc_ = cpu_info.has_avx() ? DotProduct_AVX : DotProduct_SSE; \
  } while (0)

#define CONVOLVE_FUNC g_convolve_proc_
#define DOTPRODUCT_FUNC g_dotproduct_proc_
#define FMAC_FUNC g_fmac_proc_
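
The AVX variants named in INITIALIZE() are implemented in media/base/vector_math_avx.cc, the next file in this review, and must be declared in one of the headers included above (most likely media/base/vector_math_testing.h, though that header isn't shown here). Assuming they mirror the SSE signatures, the declarations would look like this:

    // Assumed declarations; the real ones live in a header not shown in this diff.
    void FMAC_AVX(const float* src, float scale, int len, float* dest);
    float Convolve_AVX(const float* src, const float* k1, const float* k2,
                       double kernel_interpolation_factor);
    float DotProduct_AVX(const float* a, const float* b, int len);
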
// Don't use custom SSE versions where the auto-vectorized C version performs
// better, which is anywhere clang is used.
#if !defined(__clang__)
#define FMUL_FUNC FMUL_SSE
#else
#define FMUL_FUNC FMUL_C
#endif
#define EWMAAndMaxPower_FUNC EWMAAndMaxPower_SSE

#elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)

// NEON optimized versions.
#include <arm_neon.h>
#define INITIALIZE()
#define CONVOLVE_FUNC Convolve_NEON
#define DOTPRODUCT_FUNC DotProduct_NEON
#define FMAC_FUNC FMAC_NEON
#define FMUL_FUNC FMUL_NEON
#define EWMAAndMaxPower_FUNC EWMAAndMaxPower_NEON

#else

// No SIMD optimization versions.
#define INITIALIZE()
#define CONVOLVE_FUNC Convolve_C
#define DOTPRODUCT_FUNC DotProduct_C
#define FMAC_FUNC FMAC_C
#define FMUL_FUNC FMUL_C
#define EWMAAndMaxPower_FUNC EWMAAndMaxPower_C

#endif
namespace media {
namespace vector_math {

void Initialize() {
  INITIALIZE();
}
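
On x86 the CHECKs inside INITIALIZE() require that Initialize() run exactly once, before any dispatched call; until then g_fmac_proc_ and friends are null. A minimal sketch of the expected call pattern (the call site below is an assumption, not part of this diff):

    media::vector_math::Initialize();  // Selects SSE or AVX via base::CPU.
    media::vector_math::FMAC(src, 0.5f, len, dest);  // Now safe to dispatch.
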
float Convolve(const float* src,
               const float* k1,
               const float* k2,
               double kernel_interpolation_factor) {
  return CONVOLVE_FUNC(src, k1, k2, kernel_interpolation_factor);
}

float Convolve_C(const float* src,
                 const float* k1,
                 const float* k2,
                 double kernel_interpolation_factor) {
  float sum1 = 0;
  float sum2 = 0;

  // Generate a single output sample.
  int n = kKernelSize;
  while (n--) {
    sum1 += *src * *k1++;
    sum2 += *src++ * *k2++;
  }

  // Linearly interpolate the two "convolutions".
  return static_cast<float>((1.0 - kernel_interpolation_factor) * sum1 +
                            kernel_interpolation_factor * sum2);
}
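
Convolve_C is just two dot products over the same kKernelSize input samples, blended by the interpolation factor. An equivalent sketch in terms of the C fallback defined below (illustrative only; not part of the patch):

    float Convolve_C_alt(const float* src, const float* k1, const float* k2,
                         double f) {
      return static_cast<float>(
          (1.0 - f) * DotProduct_C(src, k1, kKernelSize) +
          f * DotProduct_C(src, k2, kKernelSize));
    }
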
float DotProduct(const float* a, const float* b, int len) {
  return DOTPRODUCT_FUNC(a, b, len);
}

float DotProduct_C(const float* a, const float* b, int len) {
  float sum = 0;
  for (int i = 0; i < len; ++i)
    sum += a[i] * b[i];
  return sum;
}
void FMAC(const float* src, float scale, int len, float* dest) {
  // Ensure |src| and |dest| are aligned.
  DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(src) & (kRequiredAlignment - 1));
  DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(dest) & (kRequiredAlignment - 1));
  return FMAC_FUNC(src, scale, len, dest);
}

void FMAC_C(const float* src, float scale, int len, float* dest) {
  for (int i = 0; i < len; ++i)
    dest[i] += src[i] * scale;
}
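
FMAC and FMUL DCHECK that both buffers meet kRequiredAlignment from vector_math.h. A hedged usage sketch with base::AlignedAlloc from base/memory/aligned_memory.h (the allocation pattern here is an assumption, not shown in this diff):

    float* buf = static_cast<float*>(base::AlignedAlloc(
        len * sizeof(float), media::vector_math::kRequiredAlignment));
    // ... fill |buf|, call FMAC()/FMUL() ...
    base::AlignedFree(buf);
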
void FMUL(const float* src, float scale, int len, float* dest) {
  // Ensure |src| and |dest| are aligned.
  DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(src) & (kRequiredAlignment - 1));
  DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(dest) & (kRequiredAlignment - 1));
  return FMUL_FUNC(src, scale, len, dest);
}

void FMUL_C(const float* src, float scale, int len, float* dest) {
  for (int i = 0; i < len; ++i)
    dest[i] = src[i] * scale;
}
(The patch removes the old Crossfade() helper at this point.)

std::pair<float, float> EWMAAndMaxPower(float initial_value,
                                        const float* src,
                                        int len,
                                        float smoothing_factor) {
  // Ensure |src| is 16-byte aligned.
  DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(src) & (kRequiredAlignment - 1));
  return EWMAAndMaxPower_FUNC(initial_value, src, len, smoothing_factor);
}

std::pair<float, float> EWMAAndMaxPower_C(float initial_value,
                                          const float* src,
                                          int len,
                                          float smoothing_factor) {
  std::pair<float, float> result(initial_value, 0.0f);
  const float weight_prev = 1.0f - smoothing_factor;
  for (int i = 0; i < len; ++i) {
    result.first *= weight_prev;
    const float sample = src[i];
    const float sample_squared = sample * sample;
    result.first += sample_squared * smoothing_factor;
    result.second = std::max(result.second, sample_squared);
  }
  return result;
}
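
For intuition, EWMAAndMaxPower_C implements y[n] = a*s[n]^2 + (1-a)*y[n-1] while tracking max(s[n]^2). Worked through with illustrative values a = 0.5, initial_value = 0, src = {1, 2}: y[0] = 0.5*1 = 0.5, then y[1] = 0.5*4 + 0.5*0.5 = 2.25, and the max squared sample is 4, so the call below returns {2.25f, 4.0f}:

    const float src[] = {1.0f, 2.0f};
    const auto result = EWMAAndMaxPower_C(0.0f, src, 2, 0.5f);  // {2.25f, 4.0f}
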
#if defined(ARCH_CPU_X86_FAMILY) && !defined(OS_NACL)
float Convolve_SSE(const float* src,
                   const float* k1,
                   const float* k2,
                   double kernel_interpolation_factor) {
  __m128 m_input;
  __m128 m_sums1 = _mm_setzero_ps();
  __m128 m_sums2 = _mm_setzero_ps();

  // Based on |src| alignment, we need to use loadu or load. Unrolling these
  // loops hurts performance in local testing.
  if (reinterpret_cast<uintptr_t>(src) & 0x0F) {
    for (int i = 0; i < kKernelSize; i += 4) {
      m_input = _mm_loadu_ps(src + i);
      m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
      m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
    }
  } else {
    for (int i = 0; i < kKernelSize; i += 4) {
      m_input = _mm_load_ps(src + i);
      m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
      m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
    }
  }

  // Linearly interpolate the two "convolutions".
  m_sums1 = _mm_mul_ps(
      m_sums1,
      _mm_set_ps1(static_cast<float>(1.0 - kernel_interpolation_factor)));
  m_sums2 = _mm_mul_ps(
      m_sums2, _mm_set_ps1(static_cast<float>(kernel_interpolation_factor)));
  m_sums1 = _mm_add_ps(m_sums1, m_sums2);

  // Sum components together.
  float result;
  m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1);
  _mm_store_ss(&result,
               _mm_add_ss(m_sums2, _mm_shuffle_ps(m_sums2, m_sums2, 1)));

  return result;
}
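
The final reduction condenses four lanes with movehl/shuffle because this file only includes <xmmintrin.h> (baseline SSE). On SSE3 hardware the same horizontal sum could be written with two hadds; an aside, not something this patch uses:

    #include <pmmintrin.h>  // SSE3
    m_sums1 = _mm_hadd_ps(m_sums1, m_sums1);  // [s0+s1, s2+s3, s0+s1, s2+s3]
    m_sums1 = _mm_hadd_ps(m_sums1, m_sums1);  // Total sum in every lane.
    return _mm_cvtss_f32(m_sums1);
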
float DotProduct_SSE(const float* a, const float* b, int len) {
  const int rem = len % 4;
  const int last_index = len - rem;

  // First sum all components.
  __m128 m_sum = _mm_setzero_ps();
  for (int s = 0; s < last_index; s += 4) {
    m_sum =
        _mm_add_ps(m_sum, _mm_mul_ps(_mm_loadu_ps(a + s), _mm_loadu_ps(b + s)));
  }

  // Reduce to a single float for this channel. Sadly, SSE1/2 don't have a
  // horizontal sum instruction, so we have to condense manually.
  float sum;
  m_sum = _mm_add_ps(_mm_movehl_ps(m_sum, m_sum), m_sum);
  _mm_store_ss(&sum, _mm_add_ss(m_sum, _mm_shuffle_ps(m_sum, m_sum, 1)));

  // Handle any remaining values that wouldn't fit in an SSE pass.
  for (int i = last_index; i < len; ++i)
    sum += a[i] * b[i];

  return sum;
}
void FMUL_SSE(const float* src, float scale, int len, float* dest) {
  const int rem = len % 4;
  const int last_index = len - rem;
  const __m128 m_scale = _mm_set_ps1(scale);
  for (int i = 0; i < last_index; i += 4)
    _mm_store_ps(dest + i, _mm_mul_ps(_mm_load_ps(src + i), m_scale));

  // Handle any remaining values that wouldn't fit in an SSE pass.
  for (int i = last_index; i < len; ++i)
    dest[i] = src[i] * scale;
}

void FMAC_SSE(const float* src, float scale, int len, float* dest) {
  const int rem = len % 4;
  const int last_index = len - rem;
  const __m128 m_scale = _mm_set_ps1(scale);
  for (int i = 0; i < last_index; i += 4) {
    _mm_store_ps(dest + i,
                 _mm_add_ps(_mm_load_ps(dest + i),
                            _mm_mul_ps(_mm_load_ps(src + i), m_scale)));
  }

  // Handle any remaining values that wouldn't fit in an SSE pass.
  for (int i = last_index; i < len; ++i)
    dest[i] += src[i] * scale;
}
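
FMAC_AVX itself is in media/base/vector_math_avx.cc and not visible in this file. As a rough sketch of the likely shape, processing eight floats per iteration with the same scalar tail (every detail here is assumed rather than taken from the diff; unaligned loads avoid committing to a 32-byte alignment requirement):

    #include <immintrin.h>  // AVX
    // Sketch only; the real FMAC_AVX in vector_math_avx.cc may differ.
    void FMAC_AVX_sketch(const float* src, float scale, int len, float* dest) {
      const int last_index = len - len % 8;
      const __m256 m_scale = _mm256_set1_ps(scale);
      for (int i = 0; i < last_index; i += 8) {
        _mm256_storeu_ps(
            dest + i,
            _mm256_add_ps(_mm256_loadu_ps(dest + i),
                          _mm256_mul_ps(_mm256_loadu_ps(src + i), m_scale)));
      }
      for (int i = last_index; i < len; ++i)  // Scalar remainder.
        dest[i] += src[i] * scale;
    }
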
// Convenience macro to extract float 0 through 3 from the vector |a|. This is
// needed because compilers other than clang don't support access via
// operator[]().
#define EXTRACT_FLOAT(a, i) \
  (i == 0 ? _mm_cvtss_f32(a) : _mm_cvtss_f32(_mm_shuffle_ps(a, a, i)))
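
For i in 1..3 the shuffle immediate places lane i of |a| into element 0, which _mm_cvtss_f32 then reads. For example:

    const __m128 v = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    const float third = EXTRACT_FLOAT(v, 2);  // 3.0f
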
std::pair<float, float> EWMAAndMaxPower_SSE(float initial_value,
                                            const float* src,
                                            int len,
                                            float smoothing_factor) {
  // When the recurrence is unrolled, we see that we can split it into 4
  // separate lanes of evaluation:
  //
  // y[n] = a(S[n]^2) + (1-a)(y[n-1])
  //      = a(S[n]^2) + (1-a)^1(aS[n-1]^2) + (1-a)^2(aS[n-2]^2) + ...
  //      = z[n] + (1-a)^1(z[n-1]) + (1-a)^2(z[n-2]) + (1-a)^3(z[n-3])
  //
  // where z[n] = a(S[n]^2) + (1-a)^4(z[n-4]) + (1-a)^8(z[n-8]) + ...
  //
  // Thus, the strategy here is to compute z[n], z[n-1], z[n-2], and z[n-3] in
(...skipping 13 matching lines...)
  // Compute z[n], z[n-1], z[n-2], and z[n-3] in parallel in lanes 3, 2, 1 and
  // 0, respectively.
  __m128 max_x4 = _mm_setzero_ps();
  __m128 ewma_x4 = _mm_setr_ps(0.0f, 0.0f, 0.0f, initial_value);
  int i;
  for (i = 0; i < last_index; i += 4) {
    ewma_x4 = _mm_mul_ps(ewma_x4, weight_prev_4th_x4);
    const __m128 sample_x4 = _mm_load_ps(src + i);
    const __m128 sample_squared_x4 = _mm_mul_ps(sample_x4, sample_x4);
    max_x4 = _mm_max_ps(max_x4, sample_squared_x4);
    ewma_x4 =
        _mm_add_ps(ewma_x4, _mm_mul_ps(sample_squared_x4, smoothing_factor_x4));
  }

  // y[n] = z[n] + (1-a)^1(z[n-1]) + (1-a)^2(z[n-2]) + (1-a)^3(z[n-3])
  float ewma = EXTRACT_FLOAT(ewma_x4, 3);
  ewma_x4 = _mm_mul_ps(ewma_x4, weight_prev_x4);
  ewma += EXTRACT_FLOAT(ewma_x4, 2);
  ewma_x4 = _mm_mul_ps(ewma_x4, weight_prev_x4);
  ewma += EXTRACT_FLOAT(ewma_x4, 1);
  ewma_x4 = _mm_mul_ss(ewma_x4, weight_prev_x4);
  ewma += EXTRACT_FLOAT(ewma_x4, 0);
(...skipping 12 matching lines...)
    const float sample_squared = sample * sample;
    result.first += sample_squared * smoothing_factor;
    result.second = std::max(result.second, sample_squared);
  }

  return result;
}
#endif
#if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
float Convolve_NEON(const float* src,
                    const float* k1,
                    const float* k2,
                    double kernel_interpolation_factor) {
  float32x4_t m_input;
  float32x4_t m_sums1 = vmovq_n_f32(0);
  float32x4_t m_sums2 = vmovq_n_f32(0);

  const float* upper = src + kKernelSize;
  while (src < upper) {
    m_input = vld1q_f32(src);
    src += 4;
    m_sums1 = vmlaq_f32(m_sums1, m_input, vld1q_f32(k1));
    k1 += 4;
    m_sums2 = vmlaq_f32(m_sums2, m_input, vld1q_f32(k2));
    k2 += 4;
  }

  // Linearly interpolate the two "convolutions".
  m_sums1 = vmlaq_f32(
      vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)),
      m_sums2, vmovq_n_f32(kernel_interpolation_factor));

  // Sum components together.
  float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1));
  return vget_lane_f32(vpadd_f32(m_half, m_half), 0);
}
float DotProduct_NEON(const float* a, const float* b, int len) {
  const int rem = len % 4;
  const int last_index = len - rem;

  // First sum all components.
  float32x4_t m_sum = vmovq_n_f32(0);
  for (int s = 0; s < last_index; s += 4)
    m_sum = vmlaq_f32(m_sum, vld1q_f32(a + s), vld1q_f32(b + s));

  // Reduce to a single float for this channel.
  float32x2_t m_half = vadd_f32(vget_high_f32(m_sum), vget_low_f32(m_sum));
  float sum = vget_lane_f32(vpadd_f32(m_half, m_half), 0);

  // Handle any remaining values that wouldn't fit in a NEON pass.
  for (int i = last_index; i < len; ++i)
    sum += a[i] * b[i];

  return sum;
}
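
The two-step fold (vadd_f32 on the high/low halves, then vpadd_f32 on the remaining pair) works on both 32-bit ARM and AArch64. On AArch64 alone the whole reduction could be a single intrinsic; an aside, not used here:

    float sum = vaddvq_f32(m_sum);  // AArch64-only horizontal add.
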
void FMAC_NEON(const float* src, float scale, int len, float* dest) {
  const int rem = len % 4;
  const int last_index = len - rem;
  const float32x4_t m_scale = vmovq_n_f32(scale);
  for (int i = 0; i < last_index; i += 4) {
    vst1q_f32(dest + i,
              vmlaq_f32(vld1q_f32(dest + i), vld1q_f32(src + i), m_scale));
  }

  // Handle any remaining values that wouldn't fit in a NEON pass.
  for (int i = last_index; i < len; ++i)
    dest[i] += src[i] * scale;
}
void FMUL_NEON(const float* src, float scale, int len, float* dest) {
  const int rem = len % 4;
  const int last_index = len - rem;
  const float32x4_t m_scale = vmovq_n_f32(scale);
  for (int i = 0; i < last_index; i += 4)
    vst1q_f32(dest + i, vmulq_f32(vld1q_f32(src + i), m_scale));

  // Handle any remaining values that wouldn't fit in a NEON pass.
  for (int i = last_index; i < len; ++i)
    dest[i] = src[i] * scale;
}
std::pair<float, float> EWMAAndMaxPower_NEON(float initial_value,
                                             const float* src,
                                             int len,
                                             float smoothing_factor) {
  // When the recurrence is unrolled, we see that we can split it into 4
  // separate lanes of evaluation:
  //
  // y[n] = a(S[n]^2) + (1-a)(y[n-1])
  //      = a(S[n]^2) + (1-a)^1(aS[n-1]^2) + (1-a)^2(aS[n-2]^2) + ...
  //      = z[n] + (1-a)^1(z[n-1]) + (1-a)^2(z[n-2]) + (1-a)^3(z[n-3])
  //
  // where z[n] = a(S[n]^2) + (1-a)^4(z[n-4]) + (1-a)^8(z[n-8]) + ...
  //
  // Thus, the strategy here is to compute z[n], z[n-1], z[n-2], and z[n-3] in
(...skipping 46 matching lines...)
    result.first += sample_squared * smoothing_factor;
    result.second = std::max(result.second, sample_squared);
  }

  return result;
}
#endif

}  // namespace vector_math
}  // namespace media