OLD | NEW |
(Empty) | |
| 1 /* |
| 2 * Copyright (C) 2010, Google Inc. All rights reserved. |
| 3 * |
| 4 * Redistribution and use in source and binary forms, with or without |
| 5 * modification, are permitted provided that the following conditions |
| 6 * are met: |
| 7 * 1. Redistributions of source code must retain the above copyright |
| 8 * notice, this list of conditions and the following disclaimer. |
| 9 * 2. Redistributions in binary form must reproduce the above copyright |
| 10 * notice, this list of conditions and the following disclaimer in the |
| 11 * documentation and/or other materials provided with the distribution. |
| 12 * |
| 13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND AN
Y |
| 14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| 15 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| 16 * DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR AN
Y |
| 17 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| 18 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| 19 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND O
N |
| 20 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 21 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| 22 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 23 */ |
| 24 |
| 25 #include "config.h" |
| 26 |
| 27 #if ENABLE(WEB_AUDIO) |
| 28 |
| 29 #include "platform/audio/cpu/arm/VectorMathNEON.h" |
| 30 |
| 31 #include "wtf/Assertions.h" |
| 32 #include "wtf/CPU.h" |
| 33 #include <algorithm> |
| 34 #include <arm_neon.h> |
| 35 |
| 36 namespace blink { |
| 37 |
| 38 namespace VectorMath { |
| 39 |
| 40 void vsma(const float* sourceP, int sourceStride, const float* scale, float* des
tP, int destStride, size_t framesToProcess) |
| 41 { |
| 42 int n = framesToProcess; |
| 43 |
| 44 if (WTF_CPU_ARM_HAS_NEON()) { |
| 45 if ((sourceStride == 1) && (destStride == 1)) { |
| 46 int tailFrames = n % 4; |
| 47 const float* endP = destP + n - tailFrames; |
| 48 |
| 49 float32x4_t k = vdupq_n_f32(*scale); |
| 50 while (destP < endP) { |
| 51 float32x4_t source = vld1q_f32(sourceP); |
| 52 float32x4_t dest = vld1q_f32(destP); |
| 53 |
| 54 dest = vmlaq_f32(dest, source, k); |
| 55 vst1q_f32(destP, dest); |
| 56 |
| 57 sourceP += 4; |
| 58 destP += 4; |
| 59 } |
| 60 n = tailFrames; |
| 61 } |
| 62 } |
| 63 |
| 64 while (n) { |
| 65 *destP += *sourceP * *scale; |
| 66 sourceP += sourceStride; |
| 67 destP += destStride; |
| 68 n--; |
| 69 } |
| 70 } |
| 71 |
| 72 void vsmul(const float* sourceP, int sourceStride, const float* scale, float* de
stP, int destStride, size_t framesToProcess) |
| 73 { |
| 74 int n = framesToProcess; |
| 75 |
| 76 if (WTF_CPU_ARM_HAS_NEON()) { |
| 77 if ((sourceStride == 1) && (destStride == 1)) { |
| 78 float k = *scale; |
| 79 int tailFrames = n % 4; |
| 80 const float* endP = destP + n - tailFrames; |
| 81 |
| 82 while (destP < endP) { |
| 83 float32x4_t source = vld1q_f32(sourceP); |
| 84 vst1q_f32(destP, vmulq_n_f32(source, k)); |
| 85 |
| 86 sourceP += 4; |
| 87 destP += 4; |
| 88 } |
| 89 n = tailFrames; |
| 90 } |
| 91 } |
| 92 |
| 93 float k = *scale; |
| 94 while (n--) { |
| 95 *destP = k * *sourceP; |
| 96 sourceP += sourceStride; |
| 97 destP += destStride; |
| 98 } |
| 99 } |
| 100 |
| 101 void vadd(const float* source1P, int sourceStride1, const float* source2P, int s
ourceStride2, float* destP, int destStride, size_t framesToProcess) |
| 102 { |
| 103 int n = framesToProcess; |
| 104 |
| 105 if (WTF_CPU_ARM_HAS_NEON()) { |
| 106 if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) { |
| 107 int tailFrames = n % 4; |
| 108 const float* endP = destP + n - tailFrames; |
| 109 |
| 110 while (destP < endP) { |
| 111 float32x4_t source1 = vld1q_f32(source1P); |
| 112 float32x4_t source2 = vld1q_f32(source2P); |
| 113 vst1q_f32(destP, vaddq_f32(source1, source2)); |
| 114 |
| 115 source1P += 4; |
| 116 source2P += 4; |
| 117 destP += 4; |
| 118 } |
| 119 n = tailFrames; |
| 120 } |
| 121 } |
| 122 |
| 123 while (n--) { |
| 124 *destP = *source1P + *source2P; |
| 125 source1P += sourceStride1; |
| 126 source2P += sourceStride2; |
| 127 destP += destStride; |
| 128 } |
| 129 } |
| 130 |
| 131 void vmul(const float* source1P, int sourceStride1, const float* source2P, int s
ourceStride2, float* destP, int destStride, size_t framesToProcess) |
| 132 { |
| 133 int n = framesToProcess; |
| 134 |
| 135 if (WTF_CPU_ARM_HAS_NEON()) { |
| 136 if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) { |
| 137 int tailFrames = n % 4; |
| 138 const float* endP = destP + n - tailFrames; |
| 139 |
| 140 while (destP < endP) { |
| 141 float32x4_t source1 = vld1q_f32(source1P); |
| 142 float32x4_t source2 = vld1q_f32(source2P); |
| 143 vst1q_f32(destP, vmulq_f32(source1, source2)); |
| 144 |
| 145 source1P += 4; |
| 146 source2P += 4; |
| 147 destP += 4; |
| 148 } |
| 149 n = tailFrames; |
| 150 } |
| 151 } |
| 152 |
| 153 while (n) { |
| 154 *destP = *source1P * *source2P; |
| 155 source1P += sourceStride1; |
| 156 source2P += sourceStride2; |
| 157 destP += destStride; |
| 158 n--; |
| 159 } |
| 160 } |
| 161 |
| 162 void zvmul(const float* real1P, const float* imag1P, const float* real2P, const
float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess) |
| 163 { |
| 164 unsigned i = 0; |
| 165 |
| 166 unsigned endSize = framesToProcess - framesToProcess % 4; |
| 167 |
| 168 if (WTF_CPU_ARM_HAS_NEON()) { |
| 169 while (i < endSize) { |
| 170 float32x4_t real1 = vld1q_f32(real1P + i); |
| 171 float32x4_t real2 = vld1q_f32(real2P + i); |
| 172 float32x4_t imag1 = vld1q_f32(imag1P + i); |
| 173 float32x4_t imag2 = vld1q_f32(imag2P + i); |
| 174 |
| 175 float32x4_t realResult = vmlsq_f32(vmulq_f32(real1, real2), imag1, i
mag2); |
| 176 float32x4_t imagResult = vmlaq_f32(vmulq_f32(real1, imag2), imag1, r
eal2); |
| 177 |
| 178 vst1q_f32(realDestP + i, realResult); |
| 179 vst1q_f32(imagDestP + i, imagResult); |
| 180 |
| 181 i += 4; |
| 182 } |
| 183 } |
| 184 for (; i < framesToProcess; ++i) { |
| 185 // Read and compute result before storing them, in case the |
| 186 // destination is the same as one of the sources. |
| 187 float realResult = real1P[i] * real2P[i] - imag1P[i] * imag2P[i]; |
| 188 float imagResult = real1P[i] * imag2P[i] + imag1P[i] * real2P[i]; |
| 189 |
| 190 realDestP[i] = realResult; |
| 191 imagDestP[i] = imagResult; |
| 192 } |
| 193 } |
| 194 |
| 195 void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesTo
Process) |
| 196 { |
| 197 int n = framesToProcess; |
| 198 float sum = 0; |
| 199 |
| 200 if (WTF_CPU_ARM_HAS_NEON()) { |
| 201 if (sourceStride == 1) { |
| 202 int tailFrames = n % 4; |
| 203 const float* endP = sourceP + n - tailFrames; |
| 204 |
| 205 float32x4_t fourSum = vdupq_n_f32(0); |
| 206 while (sourceP < endP) { |
| 207 float32x4_t source = vld1q_f32(sourceP); |
| 208 fourSum = vmlaq_f32(fourSum, source, source); |
| 209 sourceP += 4; |
| 210 } |
| 211 float32x2_t twoSum = vadd_f32(vget_low_f32(fourSum), vget_high_f32(f
ourSum)); |
| 212 |
| 213 float groupSum[2]; |
| 214 vst1_f32(groupSum, twoSum); |
| 215 sum += groupSum[0] + groupSum[1]; |
| 216 |
| 217 n = tailFrames; |
| 218 } |
| 219 } |
| 220 |
| 221 while (n--) { |
| 222 float sample = *sourceP; |
| 223 sum += sample * sample; |
| 224 sourceP += sourceStride; |
| 225 } |
| 226 |
| 227 ASSERT(sumP); |
| 228 *sumP = sum; |
| 229 } |
| 230 |
| 231 void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesT
oProcess) |
| 232 { |
| 233 int n = framesToProcess; |
| 234 float max = 0; |
| 235 |
| 236 if (WTF_CPU_ARM_HAS_NEON()) { |
| 237 if (sourceStride == 1) { |
| 238 int tailFrames = n % 4; |
| 239 const float* endP = sourceP + n - tailFrames; |
| 240 |
| 241 float32x4_t fourMax = vdupq_n_f32(0); |
| 242 while (sourceP < endP) { |
| 243 float32x4_t source = vld1q_f32(sourceP); |
| 244 fourMax = vmaxq_f32(fourMax, vabsq_f32(source)); |
| 245 sourceP += 4; |
| 246 } |
| 247 float32x2_t twoMax = vmax_f32(vget_low_f32(fourMax), vget_high_f32(f
ourMax)); |
| 248 |
| 249 float groupMax[2]; |
| 250 vst1_f32(groupMax, twoMax); |
| 251 max = std::max(groupMax[0], groupMax[1]); |
| 252 |
| 253 n = tailFrames; |
| 254 } |
| 255 } |
| 256 |
| 257 while (n--) { |
| 258 max = std::max(max, fabsf(*sourceP)); |
| 259 sourceP += sourceStride; |
| 260 } |
| 261 |
| 262 ASSERT(maxP); |
| 263 *maxP = max; |
| 264 } |
| 265 |
| 266 void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, c
onst float* highThresholdP, float* destP, int destStride, size_t framesToProcess
) |
| 267 { |
| 268 int n = framesToProcess; |
| 269 float lowThreshold = *lowThresholdP; |
| 270 float highThreshold = *highThresholdP; |
| 271 |
| 272 if (WTF_CPU_ARM_HAS_NEON()) { |
| 273 if ((sourceStride == 1) && (destStride == 1)) { |
| 274 int tailFrames = n % 4; |
| 275 const float* endP = destP + n - tailFrames; |
| 276 |
| 277 float32x4_t low = vdupq_n_f32(lowThreshold); |
| 278 float32x4_t high = vdupq_n_f32(highThreshold); |
| 279 while (destP < endP) { |
| 280 float32x4_t source = vld1q_f32(sourceP); |
| 281 vst1q_f32(destP, vmaxq_f32(vminq_f32(source, high), low)); |
| 282 sourceP += 4; |
| 283 destP += 4; |
| 284 } |
| 285 n = tailFrames; |
| 286 } |
| 287 } |
| 288 |
| 289 while (n--) { |
| 290 *destP = std::max(std::min(*sourceP, highThreshold), lowThreshold); |
| 291 sourceP += sourceStride; |
| 292 destP += destStride; |
| 293 } |
| 294 } |
| 295 |
| 296 } // namespace VectorMath |
| 297 |
| 298 } // namespace blink |
| 299 |
| 300 #endif // ENABLE(WEB_AUDIO) |
OLD | NEW |