OLD | NEW |
1 /* | 1 /* |
2 * Copyright (C) 2010, Google Inc. All rights reserved. | 2 * Copyright (C) 2010, Google Inc. All rights reserved. |
3 * | 3 * |
4 * Redistribution and use in source and binary forms, with or without | 4 * Redistribution and use in source and binary forms, with or without |
5 * modification, are permitted provided that the following conditions | 5 * modification, are permitted provided that the following conditions |
6 * are met: | 6 * are met: |
7 * 1. Redistributions of source code must retain the above copyright | 7 * 1. Redistributions of source code must retain the above copyright |
8 * notice, this list of conditions and the following disclaimer. | 8 * notice, this list of conditions and the following disclaimer. |
9 * 2. Redistributions in binary form must reproduce the above copyright | 9 * 2. Redistributions in binary form must reproduce the above copyright |
10 * notice, this list of conditions and the following disclaimer in the | 10 * notice, this list of conditions and the following disclaimer in the |
11 * documentation and/or other materials provided with the distribution. | 11 * documentation and/or other materials provided with the distribution. |
12 * | 12 * |
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND ANY | 13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND ANY |
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | 14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
15 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | 15 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
16 * DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY | 16 * DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY |
17 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES | 17 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
18 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | 18 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
19 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON | 19 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON |
20 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | 20 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
21 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | 21 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
22 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 22 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
23 */ | 23 */ |
24 | 24 |
25 #include "config.h" | 25 #include "config.h" |
26 | 26 |
27 #if ENABLE(WEB_AUDIO) | 27 #if ENABLE(WEB_AUDIO) |
28 | 28 |
29 #include "platform/audio/VectorMath.h" | 29 #include "platform/audio/VectorMath.h" |
| 30 |
30 #include "wtf/Assertions.h" | 31 #include "wtf/Assertions.h" |
31 #include "wtf/CPU.h" | 32 #include <emmintrin.h> |
32 #include <stdint.h> | 33 #include <stdint.h> |
33 | 34 |
34 #if OS(MACOSX) | 35 #define SSE2_MULT_ADD(loadInstr, storeInstr) \ |
35 #include <Accelerate/Accelerate.h> | 36 while (destP < endP) { \ |
36 #endif | 37 pSource = _mm_load_ps(sourceP); \ |
| 38 temp = _mm_mul_ps(pSource, mScale); \ |
| 39 dest = _mm_##loadInstr##_ps(destP); \ |
| 40 dest = _mm_add_ps(dest, temp); \ |
| 41 _mm_##storeInstr##_ps(destP, dest); \ |
| 42 sourceP += 4; \ |
| 43 destP += 4; \ |
| 44 } \ |
37 | 45 |
38 #if CPU(X86) || CPU(X86_64) | 46 #define SSE2_MULT(loadInstr, storeInstr) \ |
39 #include <emmintrin.h> | 47 while (destP < endP) { \ |
40 #endif | 48 pSource1 = _mm_load_ps(source1P); \ |
41 | 49 pSource2 = _mm_##loadInstr##_ps(source2P); \ |
42 #if HAVE(ARM_NEON_INTRINSICS) | 50 dest = _mm_mul_ps(pSource1, pSource2); \ |
43 #include <arm_neon.h> | 51 _mm_##storeInstr##_ps(destP, dest); \ |
44 #endif | 52 source1P += 4; \ |
45 | 53 source2P += 4; \ |
46 #include <math.h> | 54 destP += 4; \ |
47 #include <algorithm> | 55 } \ |
48 | 56 |
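The ## token-pasting in the two macros above lets each call site below pick aligned or unaligned SSE2 load/store variants. As a point of reference, SSE2_MULT_ADD(loadu, storeu) expands to roughly the following loop (an illustrative expansion only; pSource, temp, dest, mScale, sourceP, destP and endP are all declared by the calling function):

    while (destP < endP) {
        pSource = _mm_load_ps(sourceP);      // sourceP has already been brought to 16-byte alignment
        temp = _mm_mul_ps(pSource, mScale);  // scale four frames at once
        dest = _mm_loadu_ps(destP);          // unaligned load of the destination
        dest = _mm_add_ps(dest, temp);
        _mm_storeu_ps(destP, dest);          // unaligned store back
        sourceP += 4;
        destP += 4;
    }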
49 namespace blink { | 57 namespace blink { |
50 | 58 |
51 namespace VectorMath { | 59 namespace VectorMath { |
52 | 60 |
53 #if OS(MACOSX) | |
54 // On the Mac we use the highly optimized versions in Accelerate.framework | |
55 // In 32-bit mode (__ppc__ or __i386__) <Accelerate/Accelerate.h> includes <vecLib/vDSP_translate.h> which defines macros of the same name as | |
56 // our namespaced function names, so we must handle this case differently. Other architectures (64bit, ARM, etc.) do not include this header file. | |
57 | |
58 void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess) | |
59 { | |
60 #if CPU(X86) | |
61 ::vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess); | |
62 #else | |
63 vDSP_vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess); | |
64 #endif | |
65 } | |
66 | |
67 void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess) | |
68 { | |
69 #if CPU(X86) | |
70 ::vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess); | |
71 #else | |
72 vDSP_vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess); | |
73 #endif | |
74 } | |
75 | |
76 void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess) | |
77 { | |
78 #if CPU(X86) | |
79 ::vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess); | |
80 #else | |
81 vDSP_vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess); | |
82 #endif | |
83 } | |
84 | |
85 void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess) | |
86 { | |
87 DSPSplitComplex sc1; | |
88 DSPSplitComplex sc2; | |
89 DSPSplitComplex dest; | |
90 sc1.realp = const_cast<float*>(real1P); | |
91 sc1.imagp = const_cast<float*>(imag1P); | |
92 sc2.realp = const_cast<float*>(real2P); | |
93 sc2.imagp = const_cast<float*>(imag2P); | |
94 dest.realp = realDestP; | |
95 dest.imagp = imagDestP; | |
96 #if CPU(X86) | |
97 ::zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1); | |
98 #else | |
99 vDSP_zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1); | |
100 #endif | |
101 } | |
102 | |
103 void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess) | |
104 { | |
105 vDSP_vsma(sourceP, sourceStride, scale, destP, destStride, destP, destStride, framesToProcess); | |
106 } | |
107 | |
108 void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess) | |
109 { | |
110 vDSP_maxmgv(sourceP, sourceStride, maxP, framesToProcess); | |
111 } | |
112 | |
113 void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess) | |
114 { | |
115 vDSP_svesq(const_cast<float*>(sourceP), sourceStride, sumP, framesToProcess); | |
116 } | |
117 | |
118 void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess) | |
119 { | |
120 vDSP_vclip(const_cast<float*>(sourceP), sourceStride, const_cast<float*>(lowThresholdP), const_cast<float*>(highThresholdP), destP, destStride, framesToProcess); | |
121 } | |
122 #else | |
123 | |
124 void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess) | 61 void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess) |
125 { | 62 { |
126 int n = framesToProcess; | 63 int n = framesToProcess; |
127 | 64 |
128 #if CPU(X86) || CPU(X86_64) | |
129 if ((sourceStride == 1) && (destStride == 1)) { | 65 if ((sourceStride == 1) && (destStride == 1)) { |
130 float k = *scale; | 66 float k = *scale; |
131 | 67 |
132 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. | 68 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. |
133 while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) { | 69 while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) { |
134 *destP += k * *sourceP; | 70 *destP += k * *sourceP; |
135 sourceP++; | 71 sourceP++; |
136 destP++; | 72 destP++; |
137 n--; | 73 n--; |
138 } | 74 } |
139 | 75 |
140 // Now the sourceP is aligned, use SSE. | 76 // Now the sourceP is aligned, use SSE. |
141 int tailFrames = n % 4; | 77 int tailFrames = n % 4; |
142 const float* endP = destP + n - tailFrames; | 78 const float* endP = destP + n - tailFrames; |
143 | 79 |
144 __m128 pSource; | 80 __m128 pSource; |
145 __m128 dest; | 81 __m128 dest; |
146 __m128 temp; | 82 __m128 temp; |
147 __m128 mScale = _mm_set_ps1(k); | 83 __m128 mScale = _mm_set_ps1(k); |
148 | 84 |
149 bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F); | 85 bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F); |
150 | 86 |
151 #define SSE2_MULT_ADD(loadInstr, storeInstr) \ | |
152 while (destP < endP) \ | |
153 { \ | |
154 pSource = _mm_load_ps(sourceP); \ | |
155 temp = _mm_mul_ps(pSource, mScale); \ | |
156 dest = _mm_##loadInstr##_ps(destP); \ | |
157 dest = _mm_add_ps(dest, temp); \ | |
158 _mm_##storeInstr##_ps(destP, dest); \ | |
159 sourceP += 4; \ | |
160 destP += 4; \ | |
161 } | |
162 | |
163 if (destAligned) | 87 if (destAligned) |
164 SSE2_MULT_ADD(load, store) | 88 SSE2_MULT_ADD(load, store) |
165 else | 89 else |
166 SSE2_MULT_ADD(loadu, storeu) | 90 SSE2_MULT_ADD(loadu, storeu) |
167 | 91 |
168 n = tailFrames; | 92 n = tailFrames; |
169 } | 93 } |
170 #elif HAVE(ARM_NEON_INTRINSICS) | |
171 if ((sourceStride == 1) && (destStride == 1)) { | |
172 int tailFrames = n % 4; | |
173 const float* endP = destP + n - tailFrames; | |
174 | 94 |
175 float32x4_t k = vdupq_n_f32(*scale); | |
176 while (destP < endP) { | |
177 float32x4_t source = vld1q_f32(sourceP); | |
178 float32x4_t dest = vld1q_f32(destP); | |
179 | |
180 dest = vmlaq_f32(dest, source, k); | |
181 vst1q_f32(destP, dest); | |
182 | |
183 sourceP += 4; | |
184 destP += 4; | |
185 } | |
186 n = tailFrames; | |
187 } | |
188 #endif | |
189 while (n) { | 95 while (n) { |
190 *destP += *sourceP * *scale; | 96 *destP += *sourceP * *scale; |
191 sourceP += sourceStride; | 97 sourceP += sourceStride; |
192 destP += destStride; | 98 destP += destStride; |
193 n--; | 99 n--; |
194 } | 100 } |
195 } | 101 } |
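Why the alignment prologue above touches at most three frames: assuming the buffers are at least 4-byte aligned, as float audio buffers are, reinterpret_cast<uintptr_t>(sourceP) & 0x0F can only be 0, 4, 8 or 12, so no more than three single-frame steps (3 * sizeof(float) = 12 bytes) are needed before sourceP reaches a 16-byte boundary where _mm_load_ps is safe. The same head / SSE body / scalar tail structure recurs in the functions below.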
196 | 102 |
197 void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess) | 103 void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess) |
198 { | 104 { |
199 int n = framesToProcess; | 105 int n = framesToProcess; |
200 | 106 |
201 #if CPU(X86) || CPU(X86_64) | |
202 if ((sourceStride == 1) && (destStride == 1)) { | 107 if ((sourceStride == 1) && (destStride == 1)) { |
203 float k = *scale; | 108 float k = *scale; |
204 | 109 |
205 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. | 110 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. |
206 while ((reinterpret_cast<size_t>(sourceP) & 0x0F) && n) { | 111 while ((reinterpret_cast<size_t>(sourceP) & 0x0F) && n) { |
207 *destP = k * *sourceP; | 112 *destP = k * *sourceP; |
208 sourceP++; | 113 sourceP++; |
209 destP++; | 114 destP++; |
210 n--; | 115 n--; |
211 } | 116 } |
(...skipping 28 matching lines...)
240 | 145 |
241 // Non-SSE handling for the remaining frames, which are fewer than 4. | 146 // Non-SSE handling for the remaining frames, which are fewer than 4. |
242 n %= 4; | 147 n %= 4; |
243 while (n) { | 148 while (n) { |
244 *destP = k * *sourceP; | 149 *destP = k * *sourceP; |
245 sourceP++; | 150 sourceP++; |
246 destP++; | 151 destP++; |
247 n--; | 152 n--; |
248 } | 153 } |
249 } else { // If strides are not 1, rollback to normal algorithm. | 154 } else { // If strides are not 1, rollback to normal algorithm. |
250 #elif HAVE(ARM_NEON_INTRINSICS) | |
251 if ((sourceStride == 1) && (destStride == 1)) { | |
252 float k = *scale; | 155 float k = *scale; |
253 int tailFrames = n % 4; | 156 while (n--) { |
254 const float* endP = destP + n - tailFrames; | 157 *destP = k * *sourceP; |
255 | 158 sourceP += sourceStride; |
256 while (destP < endP) { | 159 destP += destStride; |
257 float32x4_t source = vld1q_f32(sourceP); | |
258 vst1q_f32(destP, vmulq_n_f32(source, k)); | |
259 | |
260 sourceP += 4; | |
261 destP += 4; | |
262 } | 160 } |
263 n = tailFrames; | |
264 } | 161 } |
265 #endif | |
266 float k = *scale; | |
267 while (n--) { | |
268 *destP = k * *sourceP; | |
269 sourceP += sourceStride; | |
270 destP += destStride; | |
271 } | |
272 #if CPU(X86) || CPU(X86_64) | |
273 } | |
274 #endif | |
275 } | 162 } |
276 | 163 |
277 void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess) | 164 void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess) |
278 { | 165 { |
279 int n = framesToProcess; | 166 int n = framesToProcess; |
280 | 167 |
281 #if CPU(X86) || CPU(X86_64) | |
282 if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) { | 168 if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) { |
283 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. | 169 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. |
284 while ((reinterpret_cast<size_t>(source1P) & 0x0F) && n) { | 170 while ((reinterpret_cast<size_t>(source1P) & 0x0F) && n) { |
285 *destP = *source1P + *source2P; | 171 *destP = *source1P + *source2P; |
286 source1P++; | 172 source1P++; |
287 source2P++; | 173 source2P++; |
288 destP++; | 174 destP++; |
289 n--; | 175 n--; |
290 } | 176 } |
291 | 177 |
(...skipping 59 matching lines...)
351 // Non-SSE handling for the remaining frames, which are fewer than 4. | 237 // Non-SSE handling for the remaining frames, which are fewer than 4. |
352 n %= 4; | 238 n %= 4; |
353 while (n) { | 239 while (n) { |
354 *destP = *source1P + *source2P; | 240 *destP = *source1P + *source2P; |
355 source1P++; | 241 source1P++; |
356 source2P++; | 242 source2P++; |
357 destP++; | 243 destP++; |
358 n--; | 244 n--; |
359 } | 245 } |
360 } else { // if strides are not 1, rollback to normal algorithm | 246 } else { // if strides are not 1, rollback to normal algorithm |
361 #elif HAVE(ARM_NEON_INTRINSICS) | 247 while (n--) { |
362 if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) { | 248 *destP = *source1P + *source2P; |
363 int tailFrames = n % 4; | 249 source1P += sourceStride1; |
364 const float* endP = destP + n - tailFrames; | 250 source2P += sourceStride2; |
365 | 251 destP += destStride; |
366 while (destP < endP) { | |
367 float32x4_t source1 = vld1q_f32(source1P); | |
368 float32x4_t source2 = vld1q_f32(source2P); | |
369 vst1q_f32(destP, vaddq_f32(source1, source2)); | |
370 | |
371 source1P += 4; | |
372 source2P += 4; | |
373 destP += 4; | |
374 } | 252 } |
375 n = tailFrames; | |
376 } | |
377 #endif | |
378 while (n--) { | |
379 *destP = *source1P + *source2P; | |
380 source1P += sourceStride1; | |
381 source2P += sourceStride2; | |
382 destP += destStride; | |
383 } | |
384 #if CPU(X86) || CPU(X86_64) | |
385 } | |
386 #endif | |
387 } | 253 } |
388 | 254 |
389 void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess) | 255 void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess) |
390 { | 256 { |
391 | |
392 int n = framesToProcess; | 257 int n = framesToProcess; |
393 | 258 |
394 #if CPU(X86) || CPU(X86_64) | |
395 if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) { | 259 if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) { |
396 // If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately. | 260 // If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately. |
397 while ((reinterpret_cast<uintptr_t>(source1P) & 0x0F) && n) { | 261 while ((reinterpret_cast<uintptr_t>(source1P) & 0x0F) && n) { |
398 *destP = *source1P * *source2P; | 262 *destP = *source1P * *source2P; |
399 source1P++; | 263 source1P++; |
400 source2P++; | 264 source2P++; |
401 destP++; | 265 destP++; |
402 n--; | 266 n--; |
403 } | 267 } |
404 | 268 |
405 // Now the source1P address is aligned; start to apply SSE. | 269 // Now the source1P address is aligned; start to apply SSE. |
406 int tailFrames = n % 4; | 270 int tailFrames = n % 4; |
407 const float* endP = destP + n - tailFrames; | 271 const float* endP = destP + n - tailFrames; |
408 __m128 pSource1; | 272 __m128 pSource1; |
409 __m128 pSource2; | 273 __m128 pSource2; |
410 __m128 dest; | 274 __m128 dest; |
411 | 275 |
412 bool source2Aligned = !(reinterpret_cast<uintptr_t>(source2P) & 0x0F); | 276 bool source2Aligned = !(reinterpret_cast<uintptr_t>(source2P) & 0x0F); |
413 bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F); | 277 bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F); |
414 | 278 |
415 #define SSE2_MULT(loadInstr, storeInstr) \ | |
416 while (destP < endP) \ | |
417 { \ | |
418 pSource1 = _mm_load_ps(source1P); \ | |
419 pSource2 = _mm_##loadInstr##_ps(source2P); \ | |
420 dest = _mm_mul_ps(pSource1, pSource2); \ | |
421 _mm_##storeInstr##_ps(destP, dest); \ | |
422 source1P += 4; \ | |
423 source2P += 4; \ | |
424 destP += 4; \ | |
425 } | |
426 | |
427 if (source2Aligned && destAligned) // Both aligned. | 279 if (source2Aligned && destAligned) // Both aligned. |
428 SSE2_MULT(load, store) | 280 SSE2_MULT(load, store) |
429 else if (source2Aligned && !destAligned) // Source2 is aligned but dest not. | 281 else if (source2Aligned && !destAligned) // Source2 is aligned but dest not. |
430 SSE2_MULT(load, storeu) | 282 SSE2_MULT(load, storeu) |
431 else if (!source2Aligned && destAligned) // Dest is aligned but source2 not. | 283 else if (!source2Aligned && destAligned) // Dest is aligned but source2 not. |
432 SSE2_MULT(loadu, store) | 284 SSE2_MULT(loadu, store) |
433 else // Neither aligned. | 285 else // Neither aligned. |
434 SSE2_MULT(loadu, storeu) | 286 SSE2_MULT(loadu, storeu) |
435 | 287 |
436 n = tailFrames; | 288 n = tailFrames; |
437 } | 289 } |
438 #elif HAVE(ARM_NEON_INTRINSICS) | |
439 if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) { | |
440 int tailFrames = n % 4; | |
441 const float* endP = destP + n - tailFrames; | |
442 | 290 |
443 while (destP < endP) { | |
444 float32x4_t source1 = vld1q_f32(source1P); | |
445 float32x4_t source2 = vld1q_f32(source2P); | |
446 vst1q_f32(destP, vmulq_f32(source1, source2)); | |
447 | |
448 source1P += 4; | |
449 source2P += 4; | |
450 destP += 4; | |
451 } | |
452 n = tailFrames; | |
453 } | |
454 #endif | |
455 while (n) { | 291 while (n) { |
456 *destP = *source1P * *source2P; | 292 *destP = *source1P * *source2P; |
457 source1P += sourceStride1; | 293 source1P += sourceStride1; |
458 source2P += sourceStride2; | 294 source2P += sourceStride2; |
459 destP += destStride; | 295 destP += destStride; |
460 n--; | 296 n--; |
461 } | 297 } |
462 } | 298 } |
463 | 299 |
464 void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess) | 300 void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess) |
465 { | 301 { |
466 unsigned i = 0; | 302 unsigned i = 0; |
467 #if CPU(X86) || CPU(X86_64) | 303 |
468 // Only use the SSE optimization in the very common case that all addresses are 16-byte aligned. | 304 // Only use the SSE optimization in the very common case that all addresses are 16-byte aligned. |
469 // Otherwise, fall through to the scalar code below. | 305 // Otherwise, fall through to the scalar code below. |
470 if (!(reinterpret_cast<uintptr_t>(real1P) & 0x0F) | 306 if (!(reinterpret_cast<uintptr_t>(real1P) & 0x0F) |
471 && !(reinterpret_cast<uintptr_t>(imag1P) & 0x0F) | 307 && !(reinterpret_cast<uintptr_t>(imag1P) & 0x0F) |
472 && !(reinterpret_cast<uintptr_t>(real2P) & 0x0F) | 308 && !(reinterpret_cast<uintptr_t>(real2P) & 0x0F) |
473 && !(reinterpret_cast<uintptr_t>(imag2P) & 0x0F) | 309 && !(reinterpret_cast<uintptr_t>(imag2P) & 0x0F) |
474 && !(reinterpret_cast<uintptr_t>(realDestP) & 0x0F) | 310 && !(reinterpret_cast<uintptr_t>(realDestP) & 0x0F) |
475 && !(reinterpret_cast<uintptr_t>(imagDestP) & 0x0F)) { | 311 && !(reinterpret_cast<uintptr_t>(imagDestP) & 0x0F)) { |
476 | 312 |
477 unsigned endSize = framesToProcess - framesToProcess % 4; | 313 unsigned endSize = framesToProcess - framesToProcess % 4; |
478 while (i < endSize) { | 314 while (i < endSize) { |
479 __m128 real1 = _mm_load_ps(real1P + i); | 315 __m128 real1 = _mm_load_ps(real1P + i); |
480 __m128 real2 = _mm_load_ps(real2P + i); | 316 __m128 real2 = _mm_load_ps(real2P + i); |
481 __m128 imag1 = _mm_load_ps(imag1P + i); | 317 __m128 imag1 = _mm_load_ps(imag1P + i); |
482 __m128 imag2 = _mm_load_ps(imag2P + i); | 318 __m128 imag2 = _mm_load_ps(imag2P + i); |
483 __m128 real = _mm_mul_ps(real1, real2); | 319 __m128 real = _mm_mul_ps(real1, real2); |
484 real = _mm_sub_ps(real, _mm_mul_ps(imag1, imag2)); | 320 real = _mm_sub_ps(real, _mm_mul_ps(imag1, imag2)); |
485 __m128 imag = _mm_mul_ps(real1, imag2); | 321 __m128 imag = _mm_mul_ps(real1, imag2); |
486 imag = _mm_add_ps(imag, _mm_mul_ps(imag1, real2)); | 322 imag = _mm_add_ps(imag, _mm_mul_ps(imag1, real2)); |
487 _mm_store_ps(realDestP + i, real); | 323 _mm_store_ps(realDestP + i, real); |
488 _mm_store_ps(imagDestP + i, imag); | 324 _mm_store_ps(imagDestP + i, imag); |
489 i += 4; | 325 i += 4; |
490 } | 326 } |
491 } | 327 } |
492 #elif HAVE(ARM_NEON_INTRINSICS) | |
493 unsigned endSize = framesToProcess - framesToProcess % 4; | |
494 while (i < endSize) { | |
495 float32x4_t real1 = vld1q_f32(real1P + i); | |
496 float32x4_t real2 = vld1q_f32(real2P + i); | |
497 float32x4_t imag1 = vld1q_f32(imag1P + i); | |
498 float32x4_t imag2 = vld1q_f32(imag2P + i); | |
499 | 328 |
500 float32x4_t realResult = vmlsq_f32(vmulq_f32(real1, real2), imag1, imag2); | |
501 float32x4_t imagResult = vmlaq_f32(vmulq_f32(real1, imag2), imag1, real2); | |
502 | |
503 vst1q_f32(realDestP + i, realResult); | |
504 vst1q_f32(imagDestP + i, imagResult); | |
505 | |
506 i += 4; | |
507 } | |
508 #endif | |
509 for (; i < framesToProcess; ++i) { | 329 for (; i < framesToProcess; ++i) { |
510 // Read and compute result before storing them, in case the | 330 // Read and compute result before storing them, in case the |
511 // destination is the same as one of the sources. | 331 // destination is the same as one of the sources. |
512 float realResult = real1P[i] * real2P[i] - imag1P[i] * imag2P[i]; | 332 float realResult = real1P[i] * real2P[i] - imag1P[i] * imag2P[i]; |
513 float imagResult = real1P[i] * imag2P[i] + imag1P[i] * real2P[i]; | 333 float imagResult = real1P[i] * imag2P[i] + imag1P[i] * real2P[i]; |
514 | 334 |
515 realDestP[i] = realResult; | 335 realDestP[i] = realResult; |
516 imagDestP[i] = imagResult; | 336 imagDestP[i] = imagResult; |
517 } | 337 } |
518 } | 338 } |
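For reference, both the SSE block and the scalar loop in zvmul compute the standard complex product element-wise: with a = real1P[i], b = imag1P[i], c = real2P[i] and d = imag2P[i], the result is (a + bi)(c + di) = (ac - bd) + (ad + bc)i, which is exactly realResult and imagResult above.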
519 | 339 |
520 void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess) | 340 void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess) |
521 { | 341 { |
522 int n = framesToProcess; | 342 int n = framesToProcess; |
523 float sum = 0; | 343 float sum = 0; |
524 | 344 |
525 #if CPU(X86) || CPU(X86_64) | |
526 if (sourceStride == 1) { | 345 if (sourceStride == 1) { |
527 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. | 346 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. |
528 while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) { | 347 while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) { |
529 float sample = *sourceP; | 348 float sample = *sourceP; |
530 sum += sample * sample; | 349 sum += sample * sample; |
531 sourceP++; | 350 sourceP++; |
532 n--; | 351 n--; |
533 } | 352 } |
534 | 353 |
535 // Now the sourceP is aligned, use SSE. | 354 // Now the sourceP is aligned, use SSE. |
536 int tailFrames = n % 4; | 355 int tailFrames = n % 4; |
537 const float* endP = sourceP + n - tailFrames; | 356 const float* endP = sourceP + n - tailFrames; |
538 __m128 source; | 357 __m128 source; |
539 __m128 mSum = _mm_setzero_ps(); | 358 __m128 mSum = _mm_setzero_ps(); |
540 | 359 |
541 while (sourceP < endP) { | 360 while (sourceP < endP) { |
542 source = _mm_load_ps(sourceP); | 361 source = _mm_load_ps(sourceP); |
543 source = _mm_mul_ps(source, source); | 362 source = _mm_mul_ps(source, source); |
544 mSum = _mm_add_ps(mSum, source); | 363 mSum = _mm_add_ps(mSum, source); |
545 sourceP += 4; | 364 sourceP += 4; |
546 } | 365 } |
547 | 366 |
548 // Summarize the SSE results. | 367 // Summarize the SSE results. |
549 const float* groupSumP = reinterpret_cast<float*>(&mSum); | 368 const float* groupSumP = reinterpret_cast<float*>(&mSum); |
550 sum += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3]; | 369 sum += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3]; |
551 | 370 |
552 n = tailFrames; | 371 n = tailFrames; |
553 } | 372 } |
554 #elif HAVE(ARM_NEON_INTRINSICS) | |
555 if (sourceStride == 1) { | |
556 int tailFrames = n % 4; | |
557 const float* endP = sourceP + n - tailFrames; | |
558 | |
559 float32x4_t fourSum = vdupq_n_f32(0); | |
560 while (sourceP < endP) { | |
561 float32x4_t source = vld1q_f32(sourceP); | |
562 fourSum = vmlaq_f32(fourSum, source, source); | |
563 sourceP += 4; | |
564 } | |
565 float32x2_t twoSum = vadd_f32(vget_low_f32(fourSum), vget_high_f32(fourSum)); | |
566 | |
567 float groupSum[2]; | |
568 vst1_f32(groupSum, twoSum); | |
569 sum += groupSum[0] + groupSum[1]; | |
570 | |
571 n = tailFrames; | |
572 } | |
573 #endif | |
574 | 373 |
575 while (n--) { | 374 while (n--) { |
576 float sample = *sourceP; | 375 float sample = *sourceP; |
577 sum += sample * sample; | 376 sum += sample * sample; |
578 sourceP += sourceStride; | 377 sourceP += sourceStride; |
579 } | 378 } |
580 | 379 |
581 ASSERT(sumP); | 380 ASSERT(sumP); |
582 *sumP = sum; | 381 *sumP = sum; |
583 } | 382 } |
584 | 383 |
585 void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess) | 384 void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess) |
586 { | 385 { |
587 int n = framesToProcess; | 386 int n = framesToProcess; |
588 float max = 0; | 387 float max = 0; |
589 | 388 |
590 #if CPU(X86) || CPU(X86_64) | |
591 if (sourceStride == 1) { | 389 if (sourceStride == 1) { |
592 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. | 390 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. |
593 while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) { | 391 while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) { |
594 max = std::max(max, fabsf(*sourceP)); | 392 max = std::max(max, fabsf(*sourceP)); |
595 sourceP++; | 393 sourceP++; |
596 n--; | 394 n--; |
597 } | 395 } |
598 | 396 |
599 // Now the sourceP is aligned, use SSE. | 397 // Now the sourceP is aligned, use SSE. |
600 int tailFrames = n % 4; | 398 int tailFrames = n % 4; |
(...skipping 13 matching lines...)
614 | 412 |
615 // Get max from the SSE results. | 413 // Get max from the SSE results. |
616 const float* groupMaxP = reinterpret_cast<float*>(&mMax); | 414 const float* groupMaxP = reinterpret_cast<float*>(&mMax); |
617 max = std::max(max, groupMaxP[0]); | 415 max = std::max(max, groupMaxP[0]); |
618 max = std::max(max, groupMaxP[1]); | 416 max = std::max(max, groupMaxP[1]); |
619 max = std::max(max, groupMaxP[2]); | 417 max = std::max(max, groupMaxP[2]); |
620 max = std::max(max, groupMaxP[3]); | 418 max = std::max(max, groupMaxP[3]); |
621 | 419 |
622 n = tailFrames; | 420 n = tailFrames; |
623 } | 421 } |
624 #elif HAVE(ARM_NEON_INTRINSICS) | |
625 if (sourceStride == 1) { | |
626 int tailFrames = n % 4; | |
627 const float* endP = sourceP + n - tailFrames; | |
628 | |
629 float32x4_t fourMax = vdupq_n_f32(0); | |
630 while (sourceP < endP) { | |
631 float32x4_t source = vld1q_f32(sourceP); | |
632 fourMax = vmaxq_f32(fourMax, vabsq_f32(source)); | |
633 sourceP += 4; | |
634 } | |
635 float32x2_t twoMax = vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourMax)); | |
636 | |
637 float groupMax[2]; | |
638 vst1_f32(groupMax, twoMax); | |
639 max = std::max(groupMax[0], groupMax[1]); | |
640 | |
641 n = tailFrames; | |
642 } | |
643 #endif | |
644 | 422 |
645 while (n--) { | 423 while (n--) { |
646 max = std::max(max, fabsf(*sourceP)); | 424 max = std::max(max, fabsf(*sourceP)); |
647 sourceP += sourceStride; | 425 sourceP += sourceStride; |
648 } | 426 } |
649 | 427 |
650 ASSERT(maxP); | 428 ASSERT(maxP); |
651 *maxP = max; | 429 *maxP = max; |
652 } | 430 } |
653 | 431 |
654 void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess) | 432 void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess) |
655 { | 433 { |
656 int n = framesToProcess; | 434 int n = framesToProcess; |
657 float lowThreshold = *lowThresholdP; | 435 float lowThreshold = *lowThresholdP; |
658 float highThreshold = *highThresholdP; | 436 float highThreshold = *highThresholdP; |
659 | 437 |
660 // FIXME: Optimize for SSE2. | 438 // FIXME: Optimize for SSE2. |
661 #if HAVE(ARM_NEON_INTRINSICS) | |
662 if ((sourceStride == 1) && (destStride == 1)) { | |
663 int tailFrames = n % 4; | |
664 const float* endP = destP + n - tailFrames; | |
665 | |
666 float32x4_t low = vdupq_n_f32(lowThreshold); | |
667 float32x4_t high = vdupq_n_f32(highThreshold); | |
668 while (destP < endP) { | |
669 float32x4_t source = vld1q_f32(sourceP); | |
670 vst1q_f32(destP, vmaxq_f32(vminq_f32(source, high), low)); | |
671 sourceP += 4; | |
672 destP += 4; | |
673 } | |
674 n = tailFrames; | |
675 } | |
676 #endif | |
677 while (n--) { | 439 while (n--) { |
678 *destP = std::max(std::min(*sourceP, highThreshold), lowThreshold); | 440 *destP = std::max(std::min(*sourceP, highThreshold), lowThreshold); |
679 sourceP += sourceStride; | 441 sourceP += sourceStride; |
680 destP += destStride; | 442 destP += destStride; |
681 } | 443 } |
682 } | 444 } |
683 | 445 |
684 #endif // OS(MACOSX) | |
685 | |
686 } // namespace VectorMath | 446 } // namespace VectorMath |
687 | 447 |
688 } // namespace blink | 448 } // namespace blink |
689 | 449 |
690 #endif // ENABLE(WEB_AUDIO) | 450 #endif // ENABLE(WEB_AUDIO) |
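A minimal usage sketch of the resulting API (not part of the patch; mixWithGain and peak are hypothetical helper names), showing the contiguous stride-1 case that the SSE2 paths above are written for:

    #include "platform/audio/VectorMath.h"

    // Mix 'frames' samples of 'input' into 'mix' at the given gain,
    // i.e. mix[i] += gain * input[i]. Stride 1 lets vsma() take the SSE2 fast path.
    static void mixWithGain(const float* input, float* mix, float gain, size_t frames)
    {
        blink::VectorMath::vsma(input, 1, &gain, mix, 1, frames);
    }

    // Absolute peak of a contiguous buffer, via vmaxmgv().
    static float peak(const float* samples, size_t frames)
    {
        float maxValue = 0;
        blink::VectorMath::vmaxmgv(samples, 1, &maxValue, frames);
        return maxValue;
    }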