Chromium Code Reviews
Side by Side Diff: Source/platform/audio/VectorMath.cpp

Issue 715753002: [WIP] support arm_neon_optional flag in blink. Base URL: https://chromium.googlesource.com/chromium/blink.git@master
Patch Set: Created 6 years, 1 month ago
Adjacent files in this patch: Source/platform/Logging.cpp | Source/platform/audio/cpu/arm/VectorMathNEON.h
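The patch title refers to Chromium's arm_neon_optional build flag, which lets NEON code paths be chosen at run time on CPUs that actually support NEON instead of being selected unconditionally at compile time via HAVE(ARM_NEON_INTRINSICS). As a rough sketch of what such run-time dispatch could look like for one of these routines, assuming a hypothetical cpuSupportsNeon() helper and a NEON implementation moved into the new VectorMathNEON.h header (neither appears in this file's diff):

// Sketch only. cpuSupportsNeon() and vsmaNeon() are hypothetical names used
// for illustration; they stand in for a real CPU-feature query and for the
// NEON implementation this patch splits out into VectorMathNEON.h.
#include <cstddef>

bool cpuSupportsNeon();  // assumed to wrap the platform's CPU-feature detection

void vsmaNeon(const float* sourceP, int sourceStride, const float* scale,
              float* destP, int destStride, size_t framesToProcess);

// Plain scalar fallback, matching the loop at the end of VectorMath::vsma().
static void vsmaScalar(const float* sourceP, int sourceStride, const float* scale,
                       float* destP, int destStride, size_t framesToProcess)
{
    while (framesToProcess--) {
        *destP += *sourceP * *scale;
        sourceP += sourceStride;
        destP += destStride;
    }
}

void vsma(const float* sourceP, int sourceStride, const float* scale,
          float* destP, int destStride, size_t framesToProcess)
{
    // With arm_neon_optional, the choice is made once per call at run time
    // instead of at compile time.
    if (cpuSupportsNeon())
        vsmaNeon(sourceP, sourceStride, scale, destP, destStride, framesToProcess);
    else
        vsmaScalar(sourceP, sourceStride, scale, destP, destStride, framesToProcess);
}

The exact wiring in this work-in-progress patch may differ; the sketch only illustrates the flag's intent.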
1 /* 1 /*
2 * Copyright (C) 2010, Google Inc. All rights reserved. 2 * Copyright (C) 2010, Google Inc. All rights reserved.
3 * 3 *
4 * Redistribution and use in source and binary forms, with or without 4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions 5 * modification, are permitted provided that the following conditions
6 * are met: 6 * are met:
7 * 1. Redistributions of source code must retain the above copyright 7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer. 8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright 9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the 10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution. 11 * documentation and/or other materials provided with the distribution.
12 * 12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND ANY 13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
15 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 15 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
16 * DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY 16 * DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY
17 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 17 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
18 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 18 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
19 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 19 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
20 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 20 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
21 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 21 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
22 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 22 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23 */ 23 */
24 24
25 #include "config.h" 25 #include "config.h"
26 26
27 #if ENABLE(WEB_AUDIO) 27 #if ENABLE(WEB_AUDIO)
28 28
29 #include "platform/audio/VectorMath.h" 29 #include "platform/audio/VectorMath.h"
30
30 #include "wtf/Assertions.h" 31 #include "wtf/Assertions.h"
31 #include "wtf/CPU.h" 32 #include <emmintrin.h>
32 #include <stdint.h> 33 #include <stdint.h>
33 34
34 #if OS(MACOSX) 35 #define SSE2_MULT_ADD(loadInstr, storeInstr) \
35 #include <Accelerate/Accelerate.h> 36 while (destP < endP) { \
36 #endif 37 pSource = _mm_load_ps(sourceP); \
38 temp = _mm_mul_ps(pSource, mScale); \
39 dest = _mm_##loadInstr##_ps(destP); \
40 dest = _mm_add_ps(dest, temp); \
41 _mm_##storeInstr##_ps(destP, dest); \
42 sourceP += 4; \
43 destP += 4; \
44 } \
37 45
38 #if CPU(X86) || CPU(X86_64) 46 #define SSE2_MULT(loadInstr, storeInstr) \
39 #include <emmintrin.h> 47 while (destP < endP) { \
40 #endif 48 pSource1 = _mm_load_ps(source1P); \
41 49 pSource2 = _mm_##loadInstr##_ps(source2P); \
42 #if HAVE(ARM_NEON_INTRINSICS) 50 dest = _mm_mul_ps(pSource1, pSource2); \
43 #include <arm_neon.h> 51 _mm_##storeInstr##_ps(destP, dest); \
44 #endif 52 source1P += 4; \
45 53 source2P += 4; \
46 #include <math.h> 54 destP += 4; \
47 #include <algorithm> 55 } \
48 56
49 namespace blink { 57 namespace blink {
50 58
51 namespace VectorMath { 59 namespace VectorMath {
52 60
53 #if OS(MACOSX)
54 // On the Mac we use the highly optimized versions in Accelerate.framework
55 // In 32-bit mode (__ppc__ or __i386__) <Accelerate/Accelerate.h> includes <vecLib/vDSP_translate.h> which defines macros of the same name as
56 // our namespaced function names, so we must handle this case differently. Other architectures (64bit, ARM, etc.) do not include this header file.
57
58 void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
59 {
60 #if CPU(X86)
61 ::vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess);
62 #else
63 vDSP_vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess);
64 #endif
65 }
66
67 void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
68 {
69 #if CPU(X86)
70 ::vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
71 #else
72 vDSP_vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
73 #endif
74 }
75
76 void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
77 {
78 #if CPU(X86)
79 ::vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
80 #else
81 vDSP_vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
82 #endif
83 }
84
85 void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess)
86 {
87 DSPSplitComplex sc1;
88 DSPSplitComplex sc2;
89 DSPSplitComplex dest;
90 sc1.realp = const_cast<float*>(real1P);
91 sc1.imagp = const_cast<float*>(imag1P);
92 sc2.realp = const_cast<float*>(real2P);
93 sc2.imagp = const_cast<float*>(imag2P);
94 dest.realp = realDestP;
95 dest.imagp = imagDestP;
96 #if CPU(X86)
97 ::zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1);
98 #else
99 vDSP_zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1);
100 #endif
101 }
102
103 void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
104 {
105 vDSP_vsma(sourceP, sourceStride, scale, destP, destStride, destP, destStride, framesToProcess);
106 }
107
108 void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess)
109 {
110 vDSP_maxmgv(sourceP, sourceStride, maxP, framesToProcess);
111 }
112
113 void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess)
114 {
115 vDSP_svesq(const_cast<float*>(sourceP), sourceStride, sumP, framesToProcess);
116 }
117
118 void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess)
119 {
120 vDSP_vclip(const_cast<float*>(sourceP), sourceStride, const_cast<float*>(lowThresholdP), const_cast<float*>(highThresholdP), destP, destStride, framesToProcess);
121 }
122 #else
123
124 void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess) 61 void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
125 { 62 {
126 int n = framesToProcess; 63 int n = framesToProcess;
127 64
128 #if CPU(X86) || CPU(X86_64)
129 if ((sourceStride == 1) && (destStride == 1)) { 65 if ((sourceStride == 1) && (destStride == 1)) {
130 float k = *scale; 66 float k = *scale;
131 67
132 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. 68 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
133 while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) { 69 while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
134 *destP += k * *sourceP; 70 *destP += k * *sourceP;
135 sourceP++; 71 sourceP++;
136 destP++; 72 destP++;
137 n--; 73 n--;
138 } 74 }
139 75
140 // Now the sourceP is aligned, use SSE. 76 // Now the sourceP is aligned, use SSE.
141 int tailFrames = n % 4; 77 int tailFrames = n % 4;
142 const float* endP = destP + n - tailFrames; 78 const float* endP = destP + n - tailFrames;
143 79
144 __m128 pSource; 80 __m128 pSource;
145 __m128 dest; 81 __m128 dest;
146 __m128 temp; 82 __m128 temp;
147 __m128 mScale = _mm_set_ps1(k); 83 __m128 mScale = _mm_set_ps1(k);
148 84
149 bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F); 85 bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F);
150 86
151 #define SSE2_MULT_ADD(loadInstr, storeInstr) \
152 while (destP < endP) \
153 { \
154 pSource = _mm_load_ps(sourceP); \
155 temp = _mm_mul_ps(pSource, mScale); \
156 dest = _mm_##loadInstr##_ps(destP); \
157 dest = _mm_add_ps(dest, temp); \
158 _mm_##storeInstr##_ps(destP, dest); \
159 sourceP += 4; \
160 destP += 4; \
161 }
162
163 if (destAligned) 87 if (destAligned)
164 SSE2_MULT_ADD(load, store) 88 SSE2_MULT_ADD(load, store)
165 else 89 else
166 SSE2_MULT_ADD(loadu, storeu) 90 SSE2_MULT_ADD(loadu, storeu)
167 91
168 n = tailFrames; 92 n = tailFrames;
169 } 93 }
170 #elif HAVE(ARM_NEON_INTRINSICS)
171 if ((sourceStride == 1) && (destStride == 1)) {
172 int tailFrames = n % 4;
173 const float* endP = destP + n - tailFrames;
174 94
175 float32x4_t k = vdupq_n_f32(*scale);
176 while (destP < endP) {
177 float32x4_t source = vld1q_f32(sourceP);
178 float32x4_t dest = vld1q_f32(destP);
179
180 dest = vmlaq_f32(dest, source, k);
181 vst1q_f32(destP, dest);
182
183 sourceP += 4;
184 destP += 4;
185 }
186 n = tailFrames;
187 }
188 #endif
189 while (n) { 95 while (n) {
190 *destP += *sourceP * *scale; 96 *destP += *sourceP * *scale;
191 sourceP += sourceStride; 97 sourceP += sourceStride;
192 destP += destStride; 98 destP += destStride;
193 n--; 99 n--;
194 } 100 }
195 } 101 }
196 102
197 void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess) 103 void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
198 { 104 {
199 int n = framesToProcess; 105 int n = framesToProcess;
200 106
201 #if CPU(X86) || CPU(X86_64)
202 if ((sourceStride == 1) && (destStride == 1)) { 107 if ((sourceStride == 1) && (destStride == 1)) {
203 float k = *scale; 108 float k = *scale;
204 109
205 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. 110 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
206 while ((reinterpret_cast<size_t>(sourceP) & 0x0F) && n) { 111 while ((reinterpret_cast<size_t>(sourceP) & 0x0F) && n) {
207 *destP = k * *sourceP; 112 *destP = k * *sourceP;
208 sourceP++; 113 sourceP++;
209 destP++; 114 destP++;
210 n--; 115 n--;
211 } 116 }
(...skipping 28 matching lines...)
240 145
241 // Non-SSE handling for remaining frames which is less than 4. 146 // Non-SSE handling for remaining frames which is less than 4.
242 n %= 4; 147 n %= 4;
243 while (n) { 148 while (n) {
244 *destP = k * *sourceP; 149 *destP = k * *sourceP;
245 sourceP++; 150 sourceP++;
246 destP++; 151 destP++;
247 n--; 152 n--;
248 } 153 }
249 } else { // If strides are not 1, rollback to normal algorithm. 154 } else { // If strides are not 1, rollback to normal algorithm.
250 #elif HAVE(ARM_NEON_INTRINSICS)
251 if ((sourceStride == 1) && (destStride == 1)) {
252 float k = *scale; 155 float k = *scale;
253 int tailFrames = n % 4; 156 while (n--) {
254 const float* endP = destP + n - tailFrames; 157 *destP = k * *sourceP;
255 158 sourceP += sourceStride;
256 while (destP < endP) { 159 destP += destStride;
257 float32x4_t source = vld1q_f32(sourceP);
258 vst1q_f32(destP, vmulq_n_f32(source, k));
259
260 sourceP += 4;
261 destP += 4;
262 } 160 }
263 n = tailFrames;
264 } 161 }
265 #endif
266 float k = *scale;
267 while (n--) {
268 *destP = k * *sourceP;
269 sourceP += sourceStride;
270 destP += destStride;
271 }
272 #if CPU(X86) || CPU(X86_64)
273 }
274 #endif
275 } 162 }
276 163
277 void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess) 164 void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
278 { 165 {
279 int n = framesToProcess; 166 int n = framesToProcess;
280 167
281 #if CPU(X86) || CPU(X86_64)
282 if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) { 168 if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) {
283 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. 169 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
284 while ((reinterpret_cast<size_t>(source1P) & 0x0F) && n) { 170 while ((reinterpret_cast<size_t>(source1P) & 0x0F) && n) {
285 *destP = *source1P + *source2P; 171 *destP = *source1P + *source2P;
286 source1P++; 172 source1P++;
287 source2P++; 173 source2P++;
288 destP++; 174 destP++;
289 n--; 175 n--;
290 } 176 }
291 177
(...skipping 59 matching lines...)
351 // Non-SSE handling for remaining frames which is less than 4. 237 // Non-SSE handling for remaining frames which is less than 4.
352 n %= 4; 238 n %= 4;
353 while (n) { 239 while (n) {
354 *destP = *source1P + *source2P; 240 *destP = *source1P + *source2P;
355 source1P++; 241 source1P++;
356 source2P++; 242 source2P++;
357 destP++; 243 destP++;
358 n--; 244 n--;
359 } 245 }
360 } else { // if strides are not 1, rollback to normal algorithm 246 } else { // if strides are not 1, rollback to normal algorithm
361 #elif HAVE(ARM_NEON_INTRINSICS) 247 while (n--) {
362 if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) { 248 *destP = *source1P + *source2P;
363 int tailFrames = n % 4; 249 source1P += sourceStride1;
364 const float* endP = destP + n - tailFrames; 250 source2P += sourceStride2;
365 251 destP += destStride;
366 while (destP < endP) {
367 float32x4_t source1 = vld1q_f32(source1P);
368 float32x4_t source2 = vld1q_f32(source2P);
369 vst1q_f32(destP, vaddq_f32(source1, source2));
370
371 source1P += 4;
372 source2P += 4;
373 destP += 4;
374 } 252 }
375 n = tailFrames;
376 }
377 #endif
378 while (n--) {
379 *destP = *source1P + *source2P;
380 source1P += sourceStride1;
381 source2P += sourceStride2;
382 destP += destStride;
383 }
384 #if CPU(X86) || CPU(X86_64)
385 }
386 #endif
387 } 253 }
388 254
389 void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess) 255 void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
390 { 256 {
391
392 int n = framesToProcess; 257 int n = framesToProcess;
393 258
394 #if CPU(X86) || CPU(X86_64)
395 if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) { 259 if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
396 // If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately. 260 // If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
397 while ((reinterpret_cast<uintptr_t>(source1P) & 0x0F) && n) { 261 while ((reinterpret_cast<uintptr_t>(source1P) & 0x0F) && n) {
398 *destP = *source1P * *source2P; 262 *destP = *source1P * *source2P;
399 source1P++; 263 source1P++;
400 source2P++; 264 source2P++;
401 destP++; 265 destP++;
402 n--; 266 n--;
403 } 267 }
404 268
405 // Now the source1P address aligned and start to apply SSE. 269 // Now the source1P address aligned and start to apply SSE.
406 int tailFrames = n % 4; 270 int tailFrames = n % 4;
407 const float* endP = destP + n - tailFrames; 271 const float* endP = destP + n - tailFrames;
408 __m128 pSource1; 272 __m128 pSource1;
409 __m128 pSource2; 273 __m128 pSource2;
410 __m128 dest; 274 __m128 dest;
411 275
412 bool source2Aligned = !(reinterpret_cast<uintptr_t>(source2P) & 0x0F); 276 bool source2Aligned = !(reinterpret_cast<uintptr_t>(source2P) & 0x0F);
413 bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F); 277 bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F);
414 278
415 #define SSE2_MULT(loadInstr, storeInstr) \
416 while (destP < endP) \
417 { \
418 pSource1 = _mm_load_ps(source1P); \
419 pSource2 = _mm_##loadInstr##_ps(source2P); \
420 dest = _mm_mul_ps(pSource1, pSource2); \
421 _mm_##storeInstr##_ps(destP, dest); \
422 source1P += 4; \
423 source2P += 4; \
424 destP += 4; \
425 }
426
427 if (source2Aligned && destAligned) // Both aligned. 279 if (source2Aligned && destAligned) // Both aligned.
428 SSE2_MULT(load, store) 280 SSE2_MULT(load, store)
429 else if (source2Aligned && !destAligned) // Source2 is aligned but dest not. 281 else if (source2Aligned && !destAligned) // Source2 is aligned but dest not.
430 SSE2_MULT(load, storeu) 282 SSE2_MULT(load, storeu)
431 else if (!source2Aligned && destAligned) // Dest is aligned but source2 not. 283 else if (!source2Aligned && destAligned) // Dest is aligned but source2 not.
432 SSE2_MULT(loadu, store) 284 SSE2_MULT(loadu, store)
433 else // Neither aligned. 285 else // Neither aligned.
434 SSE2_MULT(loadu, storeu) 286 SSE2_MULT(loadu, storeu)
435 287
436 n = tailFrames; 288 n = tailFrames;
437 } 289 }
438 #elif HAVE(ARM_NEON_INTRINSICS)
439 if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) {
440 int tailFrames = n % 4;
441 const float* endP = destP + n - tailFrames;
442 290
443 while (destP < endP) {
444 float32x4_t source1 = vld1q_f32(source1P);
445 float32x4_t source2 = vld1q_f32(source2P);
446 vst1q_f32(destP, vmulq_f32(source1, source2));
447
448 source1P += 4;
449 source2P += 4;
450 destP += 4;
451 }
452 n = tailFrames;
453 }
454 #endif
455 while (n) { 291 while (n) {
456 *destP = *source1P * *source2P; 292 *destP = *source1P * *source2P;
457 source1P += sourceStride1; 293 source1P += sourceStride1;
458 source2P += sourceStride2; 294 source2P += sourceStride2;
459 destP += destStride; 295 destP += destStride;
460 n--; 296 n--;
461 } 297 }
462 } 298 }
463 299
464 void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess) 300 void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess)
465 { 301 {
466 unsigned i = 0; 302 unsigned i = 0;
467 #if CPU(X86) || CPU(X86_64) 303
468 // Only use the SSE optimization in the very common case that all addresses are 16-byte aligned. 304 // Only use the SSE optimization in the very common case that all addresses are 16-byte aligned.
469 // Otherwise, fall through to the scalar code below. 305 // Otherwise, fall through to the scalar code below.
470 if (!(reinterpret_cast<uintptr_t>(real1P) & 0x0F) 306 if (!(reinterpret_cast<uintptr_t>(real1P) & 0x0F)
471 && !(reinterpret_cast<uintptr_t>(imag1P) & 0x0F) 307 && !(reinterpret_cast<uintptr_t>(imag1P) & 0x0F)
472 && !(reinterpret_cast<uintptr_t>(real2P) & 0x0F) 308 && !(reinterpret_cast<uintptr_t>(real2P) & 0x0F)
473 && !(reinterpret_cast<uintptr_t>(imag2P) & 0x0F) 309 && !(reinterpret_cast<uintptr_t>(imag2P) & 0x0F)
474 && !(reinterpret_cast<uintptr_t>(realDestP) & 0x0F) 310 && !(reinterpret_cast<uintptr_t>(realDestP) & 0x0F)
475 && !(reinterpret_cast<uintptr_t>(imagDestP) & 0x0F)) { 311 && !(reinterpret_cast<uintptr_t>(imagDestP) & 0x0F)) {
476 312
477 unsigned endSize = framesToProcess - framesToProcess % 4; 313 unsigned endSize = framesToProcess - framesToProcess % 4;
478 while (i < endSize) { 314 while (i < endSize) {
479 __m128 real1 = _mm_load_ps(real1P + i); 315 __m128 real1 = _mm_load_ps(real1P + i);
480 __m128 real2 = _mm_load_ps(real2P + i); 316 __m128 real2 = _mm_load_ps(real2P + i);
481 __m128 imag1 = _mm_load_ps(imag1P + i); 317 __m128 imag1 = _mm_load_ps(imag1P + i);
482 __m128 imag2 = _mm_load_ps(imag2P + i); 318 __m128 imag2 = _mm_load_ps(imag2P + i);
483 __m128 real = _mm_mul_ps(real1, real2); 319 __m128 real = _mm_mul_ps(real1, real2);
484 real = _mm_sub_ps(real, _mm_mul_ps(imag1, imag2)); 320 real = _mm_sub_ps(real, _mm_mul_ps(imag1, imag2));
485 __m128 imag = _mm_mul_ps(real1, imag2); 321 __m128 imag = _mm_mul_ps(real1, imag2);
486 imag = _mm_add_ps(imag, _mm_mul_ps(imag1, real2)); 322 imag = _mm_add_ps(imag, _mm_mul_ps(imag1, real2));
487 _mm_store_ps(realDestP + i, real); 323 _mm_store_ps(realDestP + i, real);
488 _mm_store_ps(imagDestP + i, imag); 324 _mm_store_ps(imagDestP + i, imag);
489 i += 4; 325 i += 4;
490 } 326 }
491 } 327 }
492 #elif HAVE(ARM_NEON_INTRINSICS)
493 unsigned endSize = framesToProcess - framesToProcess % 4;
494 while (i < endSize) {
495 float32x4_t real1 = vld1q_f32(real1P + i);
496 float32x4_t real2 = vld1q_f32(real2P + i);
497 float32x4_t imag1 = vld1q_f32(imag1P + i);
498 float32x4_t imag2 = vld1q_f32(imag2P + i);
499 328
500 float32x4_t realResult = vmlsq_f32(vmulq_f32(real1, real2), imag1, imag2);
501 float32x4_t imagResult = vmlaq_f32(vmulq_f32(real1, imag2), imag1, real2);
502
503 vst1q_f32(realDestP + i, realResult);
504 vst1q_f32(imagDestP + i, imagResult);
505
506 i += 4;
507 }
508 #endif
509 for (; i < framesToProcess; ++i) { 329 for (; i < framesToProcess; ++i) {
510 // Read and compute result before storing them, in case the 330 // Read and compute result before storing them, in case the
511 // destination is the same as one of the sources. 331 // destination is the same as one of the sources.
512 float realResult = real1P[i] * real2P[i] - imag1P[i] * imag2P[i]; 332 float realResult = real1P[i] * real2P[i] - imag1P[i] * imag2P[i];
513 float imagResult = real1P[i] * imag2P[i] + imag1P[i] * real2P[i]; 333 float imagResult = real1P[i] * imag2P[i] + imag1P[i] * real2P[i];
514 334
515 realDestP[i] = realResult; 335 realDestP[i] = realResult;
516 imagDestP[i] = imagResult; 336 imagDestP[i] = imagResult;
517 } 337 }
518 } 338 }
519 339
520 void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess) 340 void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess)
521 { 341 {
522 int n = framesToProcess; 342 int n = framesToProcess;
523 float sum = 0; 343 float sum = 0;
524 344
525 #if CPU(X86) || CPU(X86_64)
526 if (sourceStride == 1) { 345 if (sourceStride == 1) {
527 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. 346 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
528 while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) { 347 while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
529 float sample = *sourceP; 348 float sample = *sourceP;
530 sum += sample * sample; 349 sum += sample * sample;
531 sourceP++; 350 sourceP++;
532 n--; 351 n--;
533 } 352 }
534 353
535 // Now the sourceP is aligned, use SSE. 354 // Now the sourceP is aligned, use SSE.
536 int tailFrames = n % 4; 355 int tailFrames = n % 4;
537 const float* endP = sourceP + n - tailFrames; 356 const float* endP = sourceP + n - tailFrames;
538 __m128 source; 357 __m128 source;
539 __m128 mSum = _mm_setzero_ps(); 358 __m128 mSum = _mm_setzero_ps();
540 359
541 while (sourceP < endP) { 360 while (sourceP < endP) {
542 source = _mm_load_ps(sourceP); 361 source = _mm_load_ps(sourceP);
543 source = _mm_mul_ps(source, source); 362 source = _mm_mul_ps(source, source);
544 mSum = _mm_add_ps(mSum, source); 363 mSum = _mm_add_ps(mSum, source);
545 sourceP += 4; 364 sourceP += 4;
546 } 365 }
547 366
548 // Summarize the SSE results. 367 // Summarize the SSE results.
549 const float* groupSumP = reinterpret_cast<float*>(&mSum); 368 const float* groupSumP = reinterpret_cast<float*>(&mSum);
550 sum += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3]; 369 sum += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3];
551 370
552 n = tailFrames; 371 n = tailFrames;
553 } 372 }
554 #elif HAVE(ARM_NEON_INTRINSICS)
555 if (sourceStride == 1) {
556 int tailFrames = n % 4;
557 const float* endP = sourceP + n - tailFrames;
558
559 float32x4_t fourSum = vdupq_n_f32(0);
560 while (sourceP < endP) {
561 float32x4_t source = vld1q_f32(sourceP);
562 fourSum = vmlaq_f32(fourSum, source, source);
563 sourceP += 4;
564 }
565 float32x2_t twoSum = vadd_f32(vget_low_f32(fourSum), vget_high_f32(fourSum));
566
567 float groupSum[2];
568 vst1_f32(groupSum, twoSum);
569 sum += groupSum[0] + groupSum[1];
570
571 n = tailFrames;
572 }
573 #endif
574 373
575 while (n--) { 374 while (n--) {
576 float sample = *sourceP; 375 float sample = *sourceP;
577 sum += sample * sample; 376 sum += sample * sample;
578 sourceP += sourceStride; 377 sourceP += sourceStride;
579 } 378 }
580 379
581 ASSERT(sumP); 380 ASSERT(sumP);
582 *sumP = sum; 381 *sumP = sum;
583 } 382 }
584 383
585 void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess) 384 void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess)
586 { 385 {
587 int n = framesToProcess; 386 int n = framesToProcess;
588 float max = 0; 387 float max = 0;
589 388
590 #if CPU(X86) || CPU(X86_64)
591 if (sourceStride == 1) { 389 if (sourceStride == 1) {
592 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. 390 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
593 while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) { 391 while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
594 max = std::max(max, fabsf(*sourceP)); 392 max = std::max(max, fabsf(*sourceP));
595 sourceP++; 393 sourceP++;
596 n--; 394 n--;
597 } 395 }
598 396
599 // Now the sourceP is aligned, use SSE. 397 // Now the sourceP is aligned, use SSE.
600 int tailFrames = n % 4; 398 int tailFrames = n % 4;
(...skipping 13 matching lines...)
614 412
615 // Get max from the SSE results. 413 // Get max from the SSE results.
616 const float* groupMaxP = reinterpret_cast<float*>(&mMax); 414 const float* groupMaxP = reinterpret_cast<float*>(&mMax);
617 max = std::max(max, groupMaxP[0]); 415 max = std::max(max, groupMaxP[0]);
618 max = std::max(max, groupMaxP[1]); 416 max = std::max(max, groupMaxP[1]);
619 max = std::max(max, groupMaxP[2]); 417 max = std::max(max, groupMaxP[2]);
620 max = std::max(max, groupMaxP[3]); 418 max = std::max(max, groupMaxP[3]);
621 419
622 n = tailFrames; 420 n = tailFrames;
623 } 421 }
624 #elif HAVE(ARM_NEON_INTRINSICS)
625 if (sourceStride == 1) {
626 int tailFrames = n % 4;
627 const float* endP = sourceP + n - tailFrames;
628
629 float32x4_t fourMax = vdupq_n_f32(0);
630 while (sourceP < endP) {
631 float32x4_t source = vld1q_f32(sourceP);
632 fourMax = vmaxq_f32(fourMax, vabsq_f32(source));
633 sourceP += 4;
634 }
635 float32x2_t twoMax = vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourMax));
636
637 float groupMax[2];
638 vst1_f32(groupMax, twoMax);
639 max = std::max(groupMax[0], groupMax[1]);
640
641 n = tailFrames;
642 }
643 #endif
644 422
645 while (n--) { 423 while (n--) {
646 max = std::max(max, fabsf(*sourceP)); 424 max = std::max(max, fabsf(*sourceP));
647 sourceP += sourceStride; 425 sourceP += sourceStride;
648 } 426 }
649 427
650 ASSERT(maxP); 428 ASSERT(maxP);
651 *maxP = max; 429 *maxP = max;
652 } 430 }
653 431
654 void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess) 432 void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess)
655 { 433 {
656 int n = framesToProcess; 434 int n = framesToProcess;
657 float lowThreshold = *lowThresholdP; 435 float lowThreshold = *lowThresholdP;
658 float highThreshold = *highThresholdP; 436 float highThreshold = *highThresholdP;
659 437
660 // FIXME: Optimize for SSE2. 438 // FIXME: Optimize for SSE2.
661 #if HAVE(ARM_NEON_INTRINSICS)
662 if ((sourceStride == 1) && (destStride == 1)) {
663 int tailFrames = n % 4;
664 const float* endP = destP + n - tailFrames;
665
666 float32x4_t low = vdupq_n_f32(lowThreshold);
667 float32x4_t high = vdupq_n_f32(highThreshold);
668 while (destP < endP) {
669 float32x4_t source = vld1q_f32(sourceP);
670 vst1q_f32(destP, vmaxq_f32(vminq_f32(source, high), low));
671 sourceP += 4;
672 destP += 4;
673 }
674 n = tailFrames;
675 }
676 #endif
677 while (n--) { 439 while (n--) {
678 *destP = std::max(std::min(*sourceP, highThreshold), lowThreshold); 440 *destP = std::max(std::min(*sourceP, highThreshold), lowThreshold);
679 sourceP += sourceStride; 441 sourceP += sourceStride;
680 destP += destStride; 442 destP += destStride;
681 } 443 }
682 } 444 }
683 445
684 #endif // OS(MACOSX)
685
686 } // namespace VectorMath 446 } // namespace VectorMath
687 447
688 } // namespace blink 448 } // namespace blink
689 449
690 #endif // ENABLE(WEB_AUDIO) 450 #endif // ENABLE(WEB_AUDIO)
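For reference, the SSE2_MULT_ADD(load, store) macro that the new version of the file hoists to the top expands, in the aligned case used by vsma(), to roughly the loop below. This is only the expansion written out as a standalone helper for readability; the helper name and its parameter list are illustrative and not part of the patch.

// Expansion of SSE2_MULT_ADD(load, store): destP[i] += sourceP[i] * k,
// four floats per iteration, with sourceP and destP 16-byte aligned.
#include <emmintrin.h>

static void multiplyAddAlignedSketch(const float* sourceP, float* destP, const float* endP, float k)
{
    __m128 mScale = _mm_set_ps1(k);
    while (destP < endP) {
        __m128 pSource = _mm_load_ps(sourceP);     // load 4 source samples
        __m128 temp = _mm_mul_ps(pSource, mScale); // scale them
        __m128 dest = _mm_load_ps(destP);          // load 4 destination samples
        dest = _mm_add_ps(dest, temp);             // accumulate
        _mm_store_ps(destP, dest);                 // store back
        sourceP += 4;
        destP += 4;
    }
}

The SSE2_MULT_ADD(loadu, storeu) variant chosen when destP is unaligned is identical except that the destination load and store use _mm_loadu_ps and _mm_storeu_ps.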