| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (C) 2010, Google Inc. All rights reserved. | 2 * Copyright (C) 2010, Google Inc. All rights reserved. |
| 3 * | 3 * |
| 4 * Redistribution and use in source and binary forms, with or without | 4 * Redistribution and use in source and binary forms, with or without |
| 5 * modification, are permitted provided that the following conditions | 5 * modification, are permitted provided that the following conditions |
| 6 * are met: | 6 * are met: |
| 7 * 1. Redistributions of source code must retain the above copyright | 7 * 1. Redistributions of source code must retain the above copyright |
| 8 * notice, this list of conditions and the following disclaimer. | 8 * notice, this list of conditions and the following disclaimer. |
| 9 * 2. Redistributions in binary form must reproduce the above copyright | 9 * 2. Redistributions in binary form must reproduce the above copyright |
| 10 * notice, this list of conditions and the following disclaimer in the | 10 * notice, this list of conditions and the following disclaimer in the |
| 11 * documentation and/or other materials provided with the distribution. | 11 * documentation and/or other materials provided with the distribution. |
| 12 * | 12 * |
| 13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND ANY | 13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND ANY |
| 14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | 14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| 15 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | 15 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| 16 * DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY | 16 * DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY |
| 17 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES | 17 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| 18 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | 18 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| 19 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON | 19 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON |
| 20 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | 20 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 21 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | 21 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| 22 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 22 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 23 */ | 23 */ |
| 24 | 24 |
| 25 #include "config.h" | 25 #include "config.h" |
| 26 | 26 |
| 27 #if ENABLE(WEB_AUDIO) | 27 #if ENABLE(WEB_AUDIO) |
| 28 | 28 |
| 29 #include "platform/audio/VectorMath.h" | 29 #include "platform/audio/VectorMath.h" |
| 30 |
| 30 #include "wtf/Assertions.h" | 31 #include "wtf/Assertions.h" |
| 31 #include "wtf/CPU.h" | 32 #include <emmintrin.h> |
| 32 #include <stdint.h> | 33 #include <stdint.h> |
| 33 | 34 |
| 34 #if OS(MACOSX) | 35 #define SSE2_MULT_ADD(loadInstr, storeInstr) \ |
| 35 #include <Accelerate/Accelerate.h> | 36 while (destP < endP) { \ |
| 36 #endif | 37 pSource = _mm_load_ps(sourceP); \ |
| 38 temp = _mm_mul_ps(pSource, mScale); \ |
| 39 dest = _mm_##loadInstr##_ps(destP); \ |
| 40 dest = _mm_add_ps(dest, temp); \ |
| 41 _mm_##storeInstr##_ps(destP, dest); \ |
| 42 sourceP += 4; \ |
| 43 destP += 4; \ |
| 44 } \ |
| 37 | 45 |
| 38 #if CPU(X86) || CPU(X86_64) | 46 #define SSE2_MULT(loadInstr, storeInstr) \ |
| 39 #include <emmintrin.h> | 47 while (destP < endP) { \ |
| 40 #endif | 48 pSource1 = _mm_load_ps(source1P); \ |
| 41 | 49 pSource2 = _mm_##loadInstr##_ps(source2P); \ |
| 42 #if HAVE(ARM_NEON_INTRINSICS) | 50 dest = _mm_mul_ps(pSource1, pSource2); \ |
| 43 #include <arm_neon.h> | 51 _mm_##storeInstr##_ps(destP, dest); \ |
| 44 #endif | 52 source1P += 4; \ |
| 45 | 53 source2P += 4; \ |
| 46 #include <math.h> | 54 destP += 4; \ |
| 47 #include <algorithm> | 55 } \ |
| 48 | 56 |
| 49 namespace blink { | 57 namespace blink { |
| 50 | 58 |
| 51 namespace VectorMath { | 59 namespace VectorMath { |
| 52 | 60 |
| 53 #if OS(MACOSX) | |
| 54 // On the Mac we use the highly optimized versions in Accelerate.framework | |
| 55 // In 32-bit mode (__ppc__ or __i386__) <Accelerate/Accelerate.h> includes <vecLib/vDSP_translate.h> which defines macros of the same name as | |
| 56 // our namespaced function names, so we must handle this case differently. Other architectures (64bit, ARM, etc.) do not include this header file. | |
| 57 | |
| 58 void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess) | |
| 59 { | |
| 60 #if CPU(X86) | |
| 61 ::vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess); | |
| 62 #else | |
| 63 vDSP_vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess); | |
| 64 #endif | |
| 65 } | |
| 66 | |
| 67 void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess) | |
| 68 { | |
| 69 #if CPU(X86) | |
| 70 ::vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess); | |
| 71 #else | |
| 72 vDSP_vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess); | |
| 73 #endif | |
| 74 } | |
| 75 | |
| 76 void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess) | |
| 77 { | |
| 78 #if CPU(X86) | |
| 79 ::vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess); | |
| 80 #else | |
| 81 vDSP_vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess); | |
| 82 #endif | |
| 83 } | |
| 84 | |
| 85 void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess) | |
| 86 { | |
| 87 DSPSplitComplex sc1; | |
| 88 DSPSplitComplex sc2; | |
| 89 DSPSplitComplex dest; | |
| 90 sc1.realp = const_cast<float*>(real1P); | |
| 91 sc1.imagp = const_cast<float*>(imag1P); | |
| 92 sc2.realp = const_cast<float*>(real2P); | |
| 93 sc2.imagp = const_cast<float*>(imag2P); | |
| 94 dest.realp = realDestP; | |
| 95 dest.imagp = imagDestP; | |
| 96 #if CPU(X86) | |
| 97 ::zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1); | |
| 98 #else | |
| 99 vDSP_zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1); | |
| 100 #endif | |
| 101 } | |
| 102 | |
| 103 void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess) | |
| 104 { | |
| 105 vDSP_vsma(sourceP, sourceStride, scale, destP, destStride, destP, destStride, framesToProcess); | |
| 106 } | |
| 107 | |
| 108 void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess) | |
| 109 { | |
| 110 vDSP_maxmgv(sourceP, sourceStride, maxP, framesToProcess); | |
| 111 } | |
| 112 | |
| 113 void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess) | |
| 114 { | |
| 115 vDSP_svesq(const_cast<float*>(sourceP), sourceStride, sumP, framesToProcess); | |
| 116 } | |
| 117 | |
| 118 void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess) | |
| 119 { | |
| 120 vDSP_vclip(const_cast<float*>(sourceP), sourceStride, const_cast<float*>(lowThresholdP), const_cast<float*>(highThresholdP), destP, destStride, framesToProcess); | |
| 121 } | |
| 122 #else | |
| 123 | |
| 124 void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess) | 61 void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess) |
| 125 { | 62 { |
| 126 int n = framesToProcess; | 63 int n = framesToProcess; |
| 127 | 64 |
| 128 #if CPU(X86) || CPU(X86_64) | |
| 129 if ((sourceStride == 1) && (destStride == 1)) { | 65 if ((sourceStride == 1) && (destStride == 1)) { |
| 130 float k = *scale; | 66 float k = *scale; |
| 131 | 67 |
| 132 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. | 68 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. |
| 133 while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) { | 69 while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) { |
| 134 *destP += k * *sourceP; | 70 *destP += k * *sourceP; |
| 135 sourceP++; | 71 sourceP++; |
| 136 destP++; | 72 destP++; |
| 137 n--; | 73 n--; |
| 138 } | 74 } |
| 139 | 75 |
| 140 // Now the sourceP is aligned, use SSE. | 76 // Now the sourceP is aligned, use SSE. |
| 141 int tailFrames = n % 4; | 77 int tailFrames = n % 4; |
| 142 const float* endP = destP + n - tailFrames; | 78 const float* endP = destP + n - tailFrames; |
| 143 | 79 |
| 144 __m128 pSource; | 80 __m128 pSource; |
| 145 __m128 dest; | 81 __m128 dest; |
| 146 __m128 temp; | 82 __m128 temp; |
| 147 __m128 mScale = _mm_set_ps1(k); | 83 __m128 mScale = _mm_set_ps1(k); |
| 148 | 84 |
| 149 bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F); | 85 bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F); |
| 150 | 86 |
| 151 #define SSE2_MULT_ADD(loadInstr, storeInstr) \ | |
| 152 while (destP < endP) \ | |
| 153 { \ | |
| 154 pSource = _mm_load_ps(sourceP); \ | |
| 155 temp = _mm_mul_ps(pSource, mScale); \ | |
| 156 dest = _mm_##loadInstr##_ps(destP); \ | |
| 157 dest = _mm_add_ps(dest, temp); \ | |
| 158 _mm_##storeInstr##_ps(destP, dest); \ | |
| 159 sourceP += 4; \ | |
| 160 destP += 4; \ | |
| 161 } | |
| 162 | |
| 163 if (destAligned) | 87 if (destAligned) |
| 164 SSE2_MULT_ADD(load, store) | 88 SSE2_MULT_ADD(load, store) |
| 165 else | 89 else |
| 166 SSE2_MULT_ADD(loadu, storeu) | 90 SSE2_MULT_ADD(loadu, storeu) |
| 167 | 91 |
| 168 n = tailFrames; | 92 n = tailFrames; |
| 169 } | 93 } |
| 170 #elif HAVE(ARM_NEON_INTRINSICS) | |
| 171 if ((sourceStride == 1) && (destStride == 1)) { | |
| 172 int tailFrames = n % 4; | |
| 173 const float* endP = destP + n - tailFrames; | |
| 174 | 94 |
| 175 float32x4_t k = vdupq_n_f32(*scale); | |
| 176 while (destP < endP) { | |
| 177 float32x4_t source = vld1q_f32(sourceP); | |
| 178 float32x4_t dest = vld1q_f32(destP); | |
| 179 | |
| 180 dest = vmlaq_f32(dest, source, k); | |
| 181 vst1q_f32(destP, dest); | |
| 182 | |
| 183 sourceP += 4; | |
| 184 destP += 4; | |
| 185 } | |
| 186 n = tailFrames; | |
| 187 } | |
| 188 #endif | |
| 189 while (n) { | 95 while (n) { |
| 190 *destP += *sourceP * *scale; | 96 *destP += *sourceP * *scale; |
| 191 sourceP += sourceStride; | 97 sourceP += sourceStride; |
| 192 destP += destStride; | 98 destP += destStride; |
| 193 n--; | 99 n--; |
| 194 } | 100 } |
| 195 } | 101 } |
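
The unit-stride path of vsma() above follows a three-stage pattern: a scalar prologue until sourceP reaches 16-byte alignment, an SSE2 main loop over groups of four floats dispatched through SSE2_MULT_ADD, and a scalar tail. A minimal standalone sketch of that pattern, assuming SSE2 and unit strides only (the function name is illustrative, not part of the patch, and destP is always accessed with unaligned loads/stores for simplicity):

    #include <emmintrin.h>
    #include <stdint.h>
    #include <stddef.h>

    // Sketch only: destP[i] += k * sourceP[i] for contiguous data.
    static void vsmaUnitStrideSketch(const float* sourceP, float k, float* destP, size_t framesToProcess)
    {
        size_t n = framesToProcess;

        // Scalar prologue: at most three frames until sourceP is 16-byte aligned.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            *destP++ += k * *sourceP++;
            n--;
        }

        // SSE2 main loop: four frames per iteration; sourceP loads are aligned.
        __m128 mScale = _mm_set_ps1(k);
        size_t tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;
        while (destP < endP) {
            __m128 source = _mm_load_ps(sourceP);
            __m128 dest = _mm_loadu_ps(destP);
            _mm_storeu_ps(destP, _mm_add_ps(dest, _mm_mul_ps(source, mScale)));
            sourceP += 4;
            destP += 4;
        }

        // Scalar tail: the remaining zero to three frames.
        while (tailFrames--)
            *destP++ += k * *sourceP++;
    }

vsmul(), vadd() and vmul() below follow the same prologue/vector-loop/tail structure, with the macros selecting aligned or unaligned variants per pointer instead of always using the unaligned forms.
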
| 196 | 102 |
| 197 void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess) | 103 void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess) |
| 198 { | 104 { |
| 199 int n = framesToProcess; | 105 int n = framesToProcess; |
| 200 | 106 |
| 201 #if CPU(X86) || CPU(X86_64) | |
| 202 if ((sourceStride == 1) && (destStride == 1)) { | 107 if ((sourceStride == 1) && (destStride == 1)) { |
| 203 float k = *scale; | 108 float k = *scale; |
| 204 | 109 |
| 205 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. | 110 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. |
| 206 while ((reinterpret_cast<size_t>(sourceP) & 0x0F) && n) { | 111 while ((reinterpret_cast<size_t>(sourceP) & 0x0F) && n) { |
| 207 *destP = k * *sourceP; | 112 *destP = k * *sourceP; |
| 208 sourceP++; | 113 sourceP++; |
| 209 destP++; | 114 destP++; |
| 210 n--; | 115 n--; |
| 211 } | 116 } |
| (...skipping 28 matching lines...) |
| 240 | 145 |
| 241 // Non-SSE handling for remaining frames which is less than 4. | 146 // Non-SSE handling for remaining frames which is less than 4. |
| 242 n %= 4; | 147 n %= 4; |
| 243 while (n) { | 148 while (n) { |
| 244 *destP = k * *sourceP; | 149 *destP = k * *sourceP; |
| 245 sourceP++; | 150 sourceP++; |
| 246 destP++; | 151 destP++; |
| 247 n--; | 152 n--; |
| 248 } | 153 } |
| 249 } else { // If strides are not 1, rollback to normal algorithm. | 154 } else { // If strides are not 1, rollback to normal algorithm. |
| 250 #elif HAVE(ARM_NEON_INTRINSICS) | |
| 251 if ((sourceStride == 1) && (destStride == 1)) { | |
| 252 float k = *scale; | 155 float k = *scale; |
| 253 int tailFrames = n % 4; | 156 while (n--) { |
| 254 const float* endP = destP + n - tailFrames; | 157 *destP = k * *sourceP; |
| 255 | 158 sourceP += sourceStride; |
| 256 while (destP < endP) { | 159 destP += destStride; |
| 257 float32x4_t source = vld1q_f32(sourceP); | |
| 258 vst1q_f32(destP, vmulq_n_f32(source, k)); | |
| 259 | |
| 260 sourceP += 4; | |
| 261 destP += 4; | |
| 262 } | 160 } |
| 263 n = tailFrames; | |
| 264 } | 161 } |
| 265 #endif | |
| 266 float k = *scale; | |
| 267 while (n--) { | |
| 268 *destP = k * *sourceP; | |
| 269 sourceP += sourceStride; | |
| 270 destP += destStride; | |
| 271 } | |
| 272 #if CPU(X86) || CPU(X86_64) | |
| 273 } | |
| 274 #endif | |
| 275 } | 162 } |
| 276 | 163 |
| 277 void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess) | 164 void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess) |
| 278 { | 165 { |
| 279 int n = framesToProcess; | 166 int n = framesToProcess; |
| 280 | 167 |
| 281 #if CPU(X86) || CPU(X86_64) | |
| 282 if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) { | 168 if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) { |
| 283 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. | 169 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. |
| 284 while ((reinterpret_cast<size_t>(source1P) & 0x0F) && n) { | 170 while ((reinterpret_cast<size_t>(source1P) & 0x0F) && n) { |
| 285 *destP = *source1P + *source2P; | 171 *destP = *source1P + *source2P; |
| 286 source1P++; | 172 source1P++; |
| 287 source2P++; | 173 source2P++; |
| 288 destP++; | 174 destP++; |
| 289 n--; | 175 n--; |
| 290 } | 176 } |
| 291 | 177 |
| (...skipping 59 matching lines...) |
| 351 // Non-SSE handling for remaining frames which is less than 4. | 237 // Non-SSE handling for remaining frames which is less than 4. |
| 352 n %= 4; | 238 n %= 4; |
| 353 while (n) { | 239 while (n) { |
| 354 *destP = *source1P + *source2P; | 240 *destP = *source1P + *source2P; |
| 355 source1P++; | 241 source1P++; |
| 356 source2P++; | 242 source2P++; |
| 357 destP++; | 243 destP++; |
| 358 n--; | 244 n--; |
| 359 } | 245 } |
| 360 } else { // if strides are not 1, rollback to normal algorithm | 246 } else { // if strides are not 1, rollback to normal algorithm |
| 361 #elif HAVE(ARM_NEON_INTRINSICS) | 247 while (n--) { |
| 362 if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) { | 248 *destP = *source1P + *source2P; |
| 363 int tailFrames = n % 4; | 249 source1P += sourceStride1; |
| 364 const float* endP = destP + n - tailFrames; | 250 source2P += sourceStride2; |
| 365 | 251 destP += destStride; |
| 366 while (destP < endP) { | |
| 367 float32x4_t source1 = vld1q_f32(source1P); | |
| 368 float32x4_t source2 = vld1q_f32(source2P); | |
| 369 vst1q_f32(destP, vaddq_f32(source1, source2)); | |
| 370 | |
| 371 source1P += 4; | |
| 372 source2P += 4; | |
| 373 destP += 4; | |
| 374 } | 252 } |
| 375 n = tailFrames; | |
| 376 } | |
| 377 #endif | |
| 378 while (n--) { | |
| 379 *destP = *source1P + *source2P; | |
| 380 source1P += sourceStride1; | |
| 381 source2P += sourceStride2; | |
| 382 destP += destStride; | |
| 383 } | |
| 384 #if CPU(X86) || CPU(X86_64) | |
| 385 } | |
| 386 #endif | |
| 387 } | 253 } |
| 388 | 254 |
| 389 void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess) | 255 void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess) |
| 390 { | 256 { |
| 391 | |
| 392 int n = framesToProcess; | 257 int n = framesToProcess; |
| 393 | 258 |
| 394 #if CPU(X86) || CPU(X86_64) | |
| 395 if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) { | 259 if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) { |
| 396 // If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately. | 260 // If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately. |
| 397 while ((reinterpret_cast<uintptr_t>(source1P) & 0x0F) && n) { | 261 while ((reinterpret_cast<uintptr_t>(source1P) & 0x0F) && n) { |
| 398 *destP = *source1P * *source2P; | 262 *destP = *source1P * *source2P; |
| 399 source1P++; | 263 source1P++; |
| 400 source2P++; | 264 source2P++; |
| 401 destP++; | 265 destP++; |
| 402 n--; | 266 n--; |
| 403 } | 267 } |
| 404 | 268 |
| 405 // Now the source1P address aligned and start to apply SSE. | 269 // Now the source1P address aligned and start to apply SSE. |
| 406 int tailFrames = n % 4; | 270 int tailFrames = n % 4; |
| 407 const float* endP = destP + n - tailFrames; | 271 const float* endP = destP + n - tailFrames; |
| 408 __m128 pSource1; | 272 __m128 pSource1; |
| 409 __m128 pSource2; | 273 __m128 pSource2; |
| 410 __m128 dest; | 274 __m128 dest; |
| 411 | 275 |
| 412 bool source2Aligned = !(reinterpret_cast<uintptr_t>(source2P) & 0x0F); | 276 bool source2Aligned = !(reinterpret_cast<uintptr_t>(source2P) & 0x0F); |
| 413 bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F); | 277 bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F); |
| 414 | 278 |
| 415 #define SSE2_MULT(loadInstr, storeInstr) \ | |
| 416 while (destP < endP) \ | |
| 417 { \ | |
| 418 pSource1 = _mm_load_ps(source1P); \ | |
| 419 pSource2 = _mm_##loadInstr##_ps(source2P); \ | |
| 420 dest = _mm_mul_ps(pSource1, pSource2); \ | |
| 421 _mm_##storeInstr##_ps(destP, dest); \ | |
| 422 source1P += 4; \ | |
| 423 source2P += 4; \ | |
| 424 destP += 4; \ | |
| 425 } | |
| 426 | |
| 427 if (source2Aligned && destAligned) // Both aligned. | 279 if (source2Aligned && destAligned) // Both aligned. |
| 428 SSE2_MULT(load, store) | 280 SSE2_MULT(load, store) |
| 429 else if (source2Aligned && !destAligned) // Source2 is aligned but dest not. | 281 else if (source2Aligned && !destAligned) // Source2 is aligned but dest not. |
| 430 SSE2_MULT(load, storeu) | 282 SSE2_MULT(load, storeu) |
| 431 else if (!source2Aligned && destAligned) // Dest is aligned but source2 not. | 283 else if (!source2Aligned && destAligned) // Dest is aligned but source2 not. |
| 432 SSE2_MULT(loadu, store) | 284 SSE2_MULT(loadu, store) |
| 433 else // Neither aligned. | 285 else // Neither aligned. |
| 434 SSE2_MULT(loadu, storeu) | 286 SSE2_MULT(loadu, storeu) |
| 435 | 287 |
| 436 n = tailFrames; | 288 n = tailFrames; |
| 437 } | 289 } |
| 438 #elif HAVE(ARM_NEON_INTRINSICS) | |
| 439 if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) { | |
| 440 int tailFrames = n % 4; | |
| 441 const float* endP = destP + n - tailFrames; | |
| 442 | 290 |
| 443 while (destP < endP) { | |
| 444 float32x4_t source1 = vld1q_f32(source1P); | |
| 445 float32x4_t source2 = vld1q_f32(source2P); | |
| 446 vst1q_f32(destP, vmulq_f32(source1, source2)); | |
| 447 | |
| 448 source1P += 4; | |
| 449 source2P += 4; | |
| 450 destP += 4; | |
| 451 } | |
| 452 n = tailFrames; | |
| 453 } | |
| 454 #endif | |
| 455 while (n) { | 291 while (n) { |
| 456 *destP = *source1P * *source2P; | 292 *destP = *source1P * *source2P; |
| 457 source1P += sourceStride1; | 293 source1P += sourceStride1; |
| 458 source2P += sourceStride2; | 294 source2P += sourceStride2; |
| 459 destP += destStride; | 295 destP += destStride; |
| 460 n--; | 296 n--; |
| 461 } | 297 } |
| 462 } | 298 } |
| 463 | 299 |
| 464 void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess) | 300 void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess) |
| 465 { | 301 { |
| 466 unsigned i = 0; | 302 unsigned i = 0; |
| 467 #if CPU(X86) || CPU(X86_64) | 303 |
| 468 // Only use the SSE optimization in the very common case that all addresses
are 16-byte aligned. | 304 // Only use the SSE optimization in the very common case that all addresses
are 16-byte aligned. |
| 469 // Otherwise, fall through to the scalar code below. | 305 // Otherwise, fall through to the scalar code below. |
| 470 if (!(reinterpret_cast<uintptr_t>(real1P) & 0x0F) | 306 if (!(reinterpret_cast<uintptr_t>(real1P) & 0x0F) |
| 471 && !(reinterpret_cast<uintptr_t>(imag1P) & 0x0F) | 307 && !(reinterpret_cast<uintptr_t>(imag1P) & 0x0F) |
| 472 && !(reinterpret_cast<uintptr_t>(real2P) & 0x0F) | 308 && !(reinterpret_cast<uintptr_t>(real2P) & 0x0F) |
| 473 && !(reinterpret_cast<uintptr_t>(imag2P) & 0x0F) | 309 && !(reinterpret_cast<uintptr_t>(imag2P) & 0x0F) |
| 474 && !(reinterpret_cast<uintptr_t>(realDestP) & 0x0F) | 310 && !(reinterpret_cast<uintptr_t>(realDestP) & 0x0F) |
| 475 && !(reinterpret_cast<uintptr_t>(imagDestP) & 0x0F)) { | 311 && !(reinterpret_cast<uintptr_t>(imagDestP) & 0x0F)) { |
| 476 | 312 |
| 477 unsigned endSize = framesToProcess - framesToProcess % 4; | 313 unsigned endSize = framesToProcess - framesToProcess % 4; |
| 478 while (i < endSize) { | 314 while (i < endSize) { |
| 479 __m128 real1 = _mm_load_ps(real1P + i); | 315 __m128 real1 = _mm_load_ps(real1P + i); |
| 480 __m128 real2 = _mm_load_ps(real2P + i); | 316 __m128 real2 = _mm_load_ps(real2P + i); |
| 481 __m128 imag1 = _mm_load_ps(imag1P + i); | 317 __m128 imag1 = _mm_load_ps(imag1P + i); |
| 482 __m128 imag2 = _mm_load_ps(imag2P + i); | 318 __m128 imag2 = _mm_load_ps(imag2P + i); |
| 483 __m128 real = _mm_mul_ps(real1, real2); | 319 __m128 real = _mm_mul_ps(real1, real2); |
| 484 real = _mm_sub_ps(real, _mm_mul_ps(imag1, imag2)); | 320 real = _mm_sub_ps(real, _mm_mul_ps(imag1, imag2)); |
| 485 __m128 imag = _mm_mul_ps(real1, imag2); | 321 __m128 imag = _mm_mul_ps(real1, imag2); |
| 486 imag = _mm_add_ps(imag, _mm_mul_ps(imag1, real2)); | 322 imag = _mm_add_ps(imag, _mm_mul_ps(imag1, real2)); |
| 487 _mm_store_ps(realDestP + i, real); | 323 _mm_store_ps(realDestP + i, real); |
| 488 _mm_store_ps(imagDestP + i, imag); | 324 _mm_store_ps(imagDestP + i, imag); |
| 489 i += 4; | 325 i += 4; |
| 490 } | 326 } |
| 491 } | 327 } |
| 492 #elif HAVE(ARM_NEON_INTRINSICS) | |
| 493 unsigned endSize = framesToProcess - framesToProcess % 4; | |
| 494 while (i < endSize) { | |
| 495 float32x4_t real1 = vld1q_f32(real1P + i); | |
| 496 float32x4_t real2 = vld1q_f32(real2P + i); | |
| 497 float32x4_t imag1 = vld1q_f32(imag1P + i); | |
| 498 float32x4_t imag2 = vld1q_f32(imag2P + i); | |
| 499 | 328 |
| 500 float32x4_t realResult = vmlsq_f32(vmulq_f32(real1, real2), imag1, imag2); | |
| 501 float32x4_t imagResult = vmlaq_f32(vmulq_f32(real1, imag2), imag1, real2); | |
| 502 | |
| 503 vst1q_f32(realDestP + i, realResult); | |
| 504 vst1q_f32(imagDestP + i, imagResult); | |
| 505 | |
| 506 i += 4; | |
| 507 } | |
| 508 #endif | |
| 509 for (; i < framesToProcess; ++i) { | 329 for (; i < framesToProcess; ++i) { |
| 510 // Read and compute result before storing them, in case the | 330 // Read and compute result before storing them, in case the |
| 511 // destination is the same as one of the sources. | 331 // destination is the same as one of the sources. |
| 512 float realResult = real1P[i] * real2P[i] - imag1P[i] * imag2P[i]; | 332 float realResult = real1P[i] * real2P[i] - imag1P[i] * imag2P[i]; |
| 513 float imagResult = real1P[i] * imag2P[i] + imag1P[i] * real2P[i]; | 333 float imagResult = real1P[i] * imag2P[i] + imag1P[i] * real2P[i]; |
| 514 | 334 |
| 515 realDestP[i] = realResult; | 335 realDestP[i] = realResult; |
| 516 imagDestP[i] = imagResult; | 336 imagDestP[i] = imagResult; |
| 517 } | 337 } |
| 518 } | 338 } |
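
Both the SSE block and the scalar loop in zvmul() compute the element-wise complex product (a + bi)(c + di) = (ac - bd) + (ad + bc)i on split real/imaginary arrays. A scalar reference sketch, assuming the same memory layout (the function name is illustrative, not part of the patch):

    #include <stddef.h>

    // Sketch only: element-wise complex multiply on split real/imaginary arrays.
    // Results go into locals before being stored so the loop stays correct even
    // when a destination array aliases one of the inputs.
    static void zvmulReferenceSketch(const float* real1P, const float* imag1P,
                                     const float* real2P, const float* imag2P,
                                     float* realDestP, float* imagDestP, size_t framesToProcess)
    {
        for (size_t i = 0; i < framesToProcess; ++i) {
            float realResult = real1P[i] * real2P[i] - imag1P[i] * imag2P[i]; // ac - bd
            float imagResult = real1P[i] * imag2P[i] + imag1P[i] * real2P[i]; // ad + bc
            realDestP[i] = realResult;
            imagDestP[i] = imagResult;
        }
    }
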
| 519 | 339 |
| 520 void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess) | 340 void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess) |
| 521 { | 341 { |
| 522 int n = framesToProcess; | 342 int n = framesToProcess; |
| 523 float sum = 0; | 343 float sum = 0; |
| 524 | 344 |
| 525 #if CPU(X86) || CPU(X86_64) | |
| 526 if (sourceStride == 1) { | 345 if (sourceStride == 1) { |
| 527 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. | 346 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. |
| 528 while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) { | 347 while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) { |
| 529 float sample = *sourceP; | 348 float sample = *sourceP; |
| 530 sum += sample * sample; | 349 sum += sample * sample; |
| 531 sourceP++; | 350 sourceP++; |
| 532 n--; | 351 n--; |
| 533 } | 352 } |
| 534 | 353 |
| 535 // Now the sourceP is aligned, use SSE. | 354 // Now the sourceP is aligned, use SSE. |
| 536 int tailFrames = n % 4; | 355 int tailFrames = n % 4; |
| 537 const float* endP = sourceP + n - tailFrames; | 356 const float* endP = sourceP + n - tailFrames; |
| 538 __m128 source; | 357 __m128 source; |
| 539 __m128 mSum = _mm_setzero_ps(); | 358 __m128 mSum = _mm_setzero_ps(); |
| 540 | 359 |
| 541 while (sourceP < endP) { | 360 while (sourceP < endP) { |
| 542 source = _mm_load_ps(sourceP); | 361 source = _mm_load_ps(sourceP); |
| 543 source = _mm_mul_ps(source, source); | 362 source = _mm_mul_ps(source, source); |
| 544 mSum = _mm_add_ps(mSum, source); | 363 mSum = _mm_add_ps(mSum, source); |
| 545 sourceP += 4; | 364 sourceP += 4; |
| 546 } | 365 } |
| 547 | 366 |
| 548 // Summarize the SSE results. | 367 // Summarize the SSE results. |
| 549 const float* groupSumP = reinterpret_cast<float*>(&mSum); | 368 const float* groupSumP = reinterpret_cast<float*>(&mSum); |
| 550 sum += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3]; | 369 sum += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3]; |
| 551 | 370 |
| 552 n = tailFrames; | 371 n = tailFrames; |
| 553 } | 372 } |
| 554 #elif HAVE(ARM_NEON_INTRINSICS) | |
| 555 if (sourceStride == 1) { | |
| 556 int tailFrames = n % 4; | |
| 557 const float* endP = sourceP + n - tailFrames; | |
| 558 | |
| 559 float32x4_t fourSum = vdupq_n_f32(0); | |
| 560 while (sourceP < endP) { | |
| 561 float32x4_t source = vld1q_f32(sourceP); | |
| 562 fourSum = vmlaq_f32(fourSum, source, source); | |
| 563 sourceP += 4; | |
| 564 } | |
| 565 float32x2_t twoSum = vadd_f32(vget_low_f32(fourSum), vget_high_f32(fourSum)); | |
| 566 | |
| 567 float groupSum[2]; | |
| 568 vst1_f32(groupSum, twoSum); | |
| 569 sum += groupSum[0] + groupSum[1]; | |
| 570 | |
| 571 n = tailFrames; | |
| 572 } | |
| 573 #endif | |
| 574 | 373 |
| 575 while (n--) { | 374 while (n--) { |
| 576 float sample = *sourceP; | 375 float sample = *sourceP; |
| 577 sum += sample * sample; | 376 sum += sample * sample; |
| 578 sourceP += sourceStride; | 377 sourceP += sourceStride; |
| 579 } | 378 } |
| 580 | 379 |
| 581 ASSERT(sumP); | 380 ASSERT(sumP); |
| 582 *sumP = sum; | 381 *sumP = sum; |
| 583 } | 382 } |
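
The reduction in vsvesq() reads the four partial sums back through a float pointer. The same horizontal sum can be done entirely in registers with shuffles; a sketch of that alternative, assuming SSE2 only (the helper is hypothetical, not part of the patch):

    #include <emmintrin.h>

    // Sketch only: add the four lanes of an __m128 without spilling to memory.
    static inline float horizontalSumSketch(__m128 v)
    {
        __m128 shuf = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1)); // swap lanes within each pair
        __m128 sums = _mm_add_ps(v, shuf);                           // lanes now hold v0+v1, v0+v1, v2+v3, v2+v3
        shuf = _mm_movehl_ps(shuf, sums);                            // bring v2+v3 down to the low lanes
        sums = _mm_add_ss(sums, shuf);                               // low lane: (v0+v1) + (v2+v3)
        return _mm_cvtss_f32(sums);
    }

With such a helper, the SSE block above could end with sum += horizontalSumSketch(mSum); instead of indexing groupSumP.
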
| 584 | 383 |
| 585 void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesT
oProcess) | 384 void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesT
oProcess) |
| 586 { | 385 { |
| 587 int n = framesToProcess; | 386 int n = framesToProcess; |
| 588 float max = 0; | 387 float max = 0; |
| 589 | 388 |
| 590 #if CPU(X86) || CPU(X86_64) | |
| 591 if (sourceStride == 1) { | 389 if (sourceStride == 1) { |
| 592 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. | 390 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. |
| 593 while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) { | 391 while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) { |
| 594 max = std::max(max, fabsf(*sourceP)); | 392 max = std::max(max, fabsf(*sourceP)); |
| 595 sourceP++; | 393 sourceP++; |
| 596 n--; | 394 n--; |
| 597 } | 395 } |
| 598 | 396 |
| 599 // Now the sourceP is aligned, use SSE. | 397 // Now the sourceP is aligned, use SSE. |
| 600 int tailFrames = n % 4; | 398 int tailFrames = n % 4; |
| (...skipping 13 matching lines...) |
| 614 | 412 |
| 615 // Get max from the SSE results. | 413 // Get max from the SSE results. |
| 616 const float* groupMaxP = reinterpret_cast<float*>(&mMax); | 414 const float* groupMaxP = reinterpret_cast<float*>(&mMax); |
| 617 max = std::max(max, groupMaxP[0]); | 415 max = std::max(max, groupMaxP[0]); |
| 618 max = std::max(max, groupMaxP[1]); | 416 max = std::max(max, groupMaxP[1]); |
| 619 max = std::max(max, groupMaxP[2]); | 417 max = std::max(max, groupMaxP[2]); |
| 620 max = std::max(max, groupMaxP[3]); | 418 max = std::max(max, groupMaxP[3]); |
| 621 | 419 |
| 622 n = tailFrames; | 420 n = tailFrames; |
| 623 } | 421 } |
| 624 #elif HAVE(ARM_NEON_INTRINSICS) | |
| 625 if (sourceStride == 1) { | |
| 626 int tailFrames = n % 4; | |
| 627 const float* endP = sourceP + n - tailFrames; | |
| 628 | |
| 629 float32x4_t fourMax = vdupq_n_f32(0); | |
| 630 while (sourceP < endP) { | |
| 631 float32x4_t source = vld1q_f32(sourceP); | |
| 632 fourMax = vmaxq_f32(fourMax, vabsq_f32(source)); | |
| 633 sourceP += 4; | |
| 634 } | |
| 635 float32x2_t twoMax = vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourMax)); | |
| 636 | |
| 637 float groupMax[2]; | |
| 638 vst1_f32(groupMax, twoMax); | |
| 639 max = std::max(groupMax[0], groupMax[1]); | |
| 640 | |
| 641 n = tailFrames; | |
| 642 } | |
| 643 #endif | |
| 644 | 422 |
| 645 while (n--) { | 423 while (n--) { |
| 646 max = std::max(max, fabsf(*sourceP)); | 424 max = std::max(max, fabsf(*sourceP)); |
| 647 sourceP += sourceStride; | 425 sourceP += sourceStride; |
| 648 } | 426 } |
| 649 | 427 |
| 650 ASSERT(maxP); | 428 ASSERT(maxP); |
| 651 *maxP = max; | 429 *maxP = max; |
| 652 } | 430 } |
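
The SSE block elided above accumulates a per-lane maximum of absolute values before reducing it, mirroring fabsf() in the scalar code. One common way to write that inner step is shown below purely as a hedged sketch; it is not the skipped patch code:

    #include <emmintrin.h>

    // Sketch only: fold one group of four floats into a running max of |x|.
    // Clearing the sign bit with _mm_andnot_ps is the SSE equivalent of fabsf().
    static inline __m128 maxAbsStepSketch(__m128 runningMax, __m128 source)
    {
        const __m128 signMask = _mm_set1_ps(-0.0f);          // only the sign bits set
        __m128 absSource = _mm_andnot_ps(signMask, source);  // clear sign bits -> |source|
        return _mm_max_ps(runningMax, absSource);            // lane-wise maximum
    }
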
| 653 | 431 |
| 654 void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess) | 432 void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess) |
| 655 { | 433 { |
| 656 int n = framesToProcess; | 434 int n = framesToProcess; |
| 657 float lowThreshold = *lowThresholdP; | 435 float lowThreshold = *lowThresholdP; |
| 658 float highThreshold = *highThresholdP; | 436 float highThreshold = *highThresholdP; |
| 659 | 437 |
| 660 // FIXME: Optimize for SSE2. | 438 // FIXME: Optimize for SSE2. |
| 661 #if HAVE(ARM_NEON_INTRINSICS) | |
| 662 if ((sourceStride == 1) && (destStride == 1)) { | |
| 663 int tailFrames = n % 4; | |
| 664 const float* endP = destP + n - tailFrames; | |
| 665 | |
| 666 float32x4_t low = vdupq_n_f32(lowThreshold); | |
| 667 float32x4_t high = vdupq_n_f32(highThreshold); | |
| 668 while (destP < endP) { | |
| 669 float32x4_t source = vld1q_f32(sourceP); | |
| 670 vst1q_f32(destP, vmaxq_f32(vminq_f32(source, high), low)); | |
| 671 sourceP += 4; | |
| 672 destP += 4; | |
| 673 } | |
| 674 n = tailFrames; | |
| 675 } | |
| 676 #endif | |
| 677 while (n--) { | 439 while (n--) { |
| 678 *destP = std::max(std::min(*sourceP, highThreshold), lowThreshold); | 440 *destP = std::max(std::min(*sourceP, highThreshold), lowThreshold); |
| 679 sourceP += sourceStride; | 441 sourceP += sourceStride; |
| 680 destP += destStride; | 442 destP += destStride; |
| 681 } | 443 } |
| 682 } | 444 } |
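
On the FIXME above, one possible SSE2 version of the unit-stride case clamps four frames per iteration with _mm_min_ps/_mm_max_ps. A sketch under that assumption (the name and structure are illustrative; NaN ordering follows the intrinsics rather than std::min/std::max, which would deserve the same scrutiny as the scalar code):

    #include <emmintrin.h>
    #include <stddef.h>

    // Sketch only: clip contiguous frames to [lowThreshold, highThreshold].
    static void vclipSSESketch(const float* sourceP, float lowThreshold, float highThreshold,
                               float* destP, size_t framesToProcess)
    {
        size_t n = framesToProcess;
        __m128 low = _mm_set1_ps(lowThreshold);
        __m128 high = _mm_set1_ps(highThreshold);

        // Unaligned loads/stores keep the sketch simple; four frames per iteration.
        while (n >= 4) {
            __m128 source = _mm_loadu_ps(sourceP);
            _mm_storeu_ps(destP, _mm_max_ps(_mm_min_ps(source, high), low));
            sourceP += 4;
            destP += 4;
            n -= 4;
        }

        // Scalar tail for the remaining zero to three frames.
        while (n--) {
            float sample = *sourceP++;
            if (sample < lowThreshold)
                sample = lowThreshold;
            else if (sample > highThreshold)
                sample = highThreshold;
            *destP++ = sample;
        }
    }
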
| 683 | 445 |
| 684 #endif // OS(MACOSX) | |
| 685 | |
| 686 } // namespace VectorMath | 446 } // namespace VectorMath |
| 687 | 447 |
| 688 } // namespace blink | 448 } // namespace blink |
| 689 | 449 |
| 690 #endif // ENABLE(WEB_AUDIO) | 450 #endif // ENABLE(WEB_AUDIO) |