| Index: Source/platform/audio/VectorMath.cpp
|
| diff --git a/Source/platform/audio/VectorMath.cpp b/Source/platform/audio/VectorMath.cpp
|
| index 219ed5463977ac5cc172f1117fd0cca3f9008a9a..771b9dc3607c4c3fbd499214b822513e940e0341 100644
|
| --- a/Source/platform/audio/VectorMath.cpp
|
| +++ b/Source/platform/audio/VectorMath.cpp
|
| @@ -169,19 +169,32 @@ void vsma(const float* sourceP, int sourceStride, const float* scale, float* des
|
| }
|
| #elif HAVE(ARM_NEON_INTRINSICS)
|
| if ((sourceStride == 1) && (destStride == 1)) {
|
| - int tailFrames = n % 4;
|
| - const float* endP = destP + n - tailFrames;
|
| -
|
| - float32x4_t k = vdupq_n_f32(*scale);
|
| - while (destP < endP) {
|
| - float32x4_t source = vld1q_f32(sourceP);
|
| - float32x4_t dest = vld1q_f32(destP);
|
| -
|
| - dest = vmlaq_f32(dest, source, k);
|
| - vst1q_f32(destP, dest);
|
| -
|
| - sourceP += 4;
|
| - destP += 4;
|
| + unsigned tailFrames = n & 15;
|
| + float32x4_t scaleNum = vdupq_n_f32(*scale);
|
| +
|
| + for (unsigned loopCount = (n >> 4); loopCount > 0; loopCount--) {
|
| + float32x4_t dest0 = vld1q_f32(destP);
|
| + float32x4_t dest1 = vld1q_f32(destP + 4);
|
| + float32x4_t dest2 = vld1q_f32(destP + 8);
|
| + float32x4_t dest3 = vld1q_f32(destP + 12);
|
| +
|
| + float32x4_t source0 = vld1q_f32(sourceP);
|
| + float32x4_t source1 = vld1q_f32(sourceP + 4);
|
| + float32x4_t source2 = vld1q_f32(sourceP + 8);
|
| + float32x4_t source3 = vld1q_f32(sourceP + 12);
|
| +
|
| + float32x4_t result0 = vmlaq_f32(dest0, source0, scaleNum);
|
| + float32x4_t result1 = vmlaq_f32(dest1, source1, scaleNum);
|
| + float32x4_t result2 = vmlaq_f32(dest2, source2, scaleNum);
|
| + float32x4_t result3 = vmlaq_f32(dest3, source3, scaleNum);
|
| +
|
| + vst1q_f32(destP, result0);
|
| + vst1q_f32(destP + 4, result1);
|
| + vst1q_f32(destP + 8, result2);
|
| + vst1q_f32(destP + 12, result3);
|
| +
|
| + sourceP += 16;
|
| + destP += 16;
|
| }
|
| n = tailFrames;
|
| }
|
| @@ -248,17 +261,28 @@ void vsmul(const float* sourceP, int sourceStride, const float* scale, float* de
|
| }
|
| } else { // If strides are not 1, rollback to normal algorithm.
|
| #elif HAVE(ARM_NEON_INTRINSICS)
|
| - if ((sourceStride == 1) && (destStride == 1)) {
|
| - float k = *scale;
|
| - int tailFrames = n % 4;
|
| - const float* endP = destP + n - tailFrames;
|
| -
|
| - while (destP < endP) {
|
| - float32x4_t source = vld1q_f32(sourceP);
|
| - vst1q_f32(destP, vmulq_n_f32(source, k));
|
| -
|
| - sourceP += 4;
|
| - destP += 4;
|
| + if (sourceStride == 1 && destStride == 1) {
|
| + unsigned tailFrames = framesToProcess & 15;
|
| + float32x4_t scaleNum = vdupq_n_f32(*scale);
|
| +
|
| + for (unsigned loopCount = (framesToProcess >> 4); loopCount > 0; loopCount--) {
|
| + float32x4_t source0 = vld1q_f32(sourceP);
|
| + float32x4_t source1 = vld1q_f32(sourceP + 4);
|
| + float32x4_t source2 = vld1q_f32(sourceP + 8);
|
| + float32x4_t source3 = vld1q_f32(sourceP + 12);
|
| +
|
| + float32x4_t result0 = vmulq_f32(source0, scaleNum);
|
| + float32x4_t result1 = vmulq_f32(source1, scaleNum);
|
| + float32x4_t result2 = vmulq_f32(source2, scaleNum);
|
| + float32x4_t result3 = vmulq_f32(source3, scaleNum);
|
| +
|
| + vst1q_f32(destP, result0);
|
| + vst1q_f32(destP + 4, result1);
|
| + vst1q_f32(destP + 8, result2);
|
| + vst1q_f32(destP + 12, result3);
|
| +
|
| + sourceP += 16;
|
| + destP += 16;
|
| }
|
| n = tailFrames;
|
| }
|
| @@ -359,18 +383,33 @@ void vadd(const float* source1P, int sourceStride1, const float* source2P, int s
|
| }
|
| } else { // if strides are not 1, rollback to normal algorithm
|
| #elif HAVE(ARM_NEON_INTRINSICS)
|
| - if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) {
|
| - int tailFrames = n % 4;
|
| - const float* endP = destP + n - tailFrames;
|
| -
|
| - while (destP < endP) {
|
| - float32x4_t source1 = vld1q_f32(source1P);
|
| - float32x4_t source2 = vld1q_f32(source2P);
|
| - vst1q_f32(destP, vaddq_f32(source1, source2));
|
| -
|
| - source1P += 4;
|
| - source2P += 4;
|
| - destP += 4;
|
| + if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
|
| + unsigned tailFrames = framesToProcess & 15;
|
| +
|
| + for (unsigned loopCount = (framesToProcess >> 4); loopCount > 0; loopCount--) {
|
| + float32x4_t source10 = vld1q_f32(source1P);
|
| + float32x4_t source11 = vld1q_f32(source1P + 4);
|
| + float32x4_t source12 = vld1q_f32(source1P + 8);
|
| + float32x4_t source13 = vld1q_f32(source1P + 12);
|
| +
|
| + float32x4_t source20 = vld1q_f32(source2P);
|
| + float32x4_t source21 = vld1q_f32(source2P + 4);
|
| + float32x4_t source22 = vld1q_f32(source2P + 8);
|
| + float32x4_t source23 = vld1q_f32(source2P + 12);
|
| +
|
| + float32x4_t result0 = vaddq_f32(source10, source20);
|
| + float32x4_t result1 = vaddq_f32(source11, source21);
|
| + float32x4_t result2 = vaddq_f32(source12, source22);
|
| + float32x4_t result3 = vaddq_f32(source13, source23);
|
| +
|
| + vst1q_f32(destP, result0);
|
| + vst1q_f32(destP + 4, result1);
|
| + vst1q_f32(destP + 8, result2);
|
| + vst1q_f32(destP + 12, result3);
|
| +
|
| + source1P += 16;
|
| + source2P += 16;
|
| + destP += 16;
|
| }
|
| n = tailFrames;
|
| }
|
| @@ -436,18 +475,33 @@ void vmul(const float* source1P, int sourceStride1, const float* source2P, int s
|
| n = tailFrames;
|
| }
|
| #elif HAVE(ARM_NEON_INTRINSICS)
|
| - if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) {
|
| - int tailFrames = n % 4;
|
| - const float* endP = destP + n - tailFrames;
|
| -
|
| - while (destP < endP) {
|
| - float32x4_t source1 = vld1q_f32(source1P);
|
| - float32x4_t source2 = vld1q_f32(source2P);
|
| - vst1q_f32(destP, vmulq_f32(source1, source2));
|
| -
|
| - source1P += 4;
|
| - source2P += 4;
|
| - destP += 4;
|
| + if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
|
| + unsigned tailFrames = n & 15;
|
| +
|
| + for (unsigned loopCount = (n >> 4); loopCount > 0; loopCount--) {
|
| + float32x4_t source10 = vld1q_f32(source1P);
|
| + float32x4_t source11 = vld1q_f32(source1P + 4);
|
| + float32x4_t source12 = vld1q_f32(source1P + 8);
|
| + float32x4_t source13 = vld1q_f32(source1P + 12);
|
| +
|
| + float32x4_t source20 = vld1q_f32(source2P);
|
| + float32x4_t source21 = vld1q_f32(source2P + 4);
|
| + float32x4_t source22 = vld1q_f32(source2P + 8);
|
| + float32x4_t source23 = vld1q_f32(source2P + 12);
|
| +
|
| + float32x4_t result0 = vmulq_f32(source10, source20);
|
| + float32x4_t result1 = vmulq_f32(source11, source21);
|
| + float32x4_t result2 = vmulq_f32(source12, source22);
|
| + float32x4_t result3 = vmulq_f32(source13, source23);
|
| +
|
| + vst1q_f32(destP, result0);
|
| + vst1q_f32(destP + 4, result1);
|
| + vst1q_f32(destP + 8, result2);
|
| + vst1q_f32(destP + 12, result3);
|
| +
|
| + source1P += 16;
|
| + source2P += 16;
|
| + destP += 16;
|
| }
|
| n = tailFrames;
|
| }
|
| @@ -553,20 +607,35 @@ void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesTo
|
| }
|
| #elif HAVE(ARM_NEON_INTRINSICS)
|
| if (sourceStride == 1) {
|
| - int tailFrames = n % 4;
|
| - const float* endP = sourceP + n - tailFrames;
|
| + unsigned tailFrames = n & 15;
|
|
|
| - float32x4_t fourSum = vdupq_n_f32(0);
|
| - while (sourceP < endP) {
|
| - float32x4_t source = vld1q_f32(sourceP);
|
| - fourSum = vmlaq_f32(fourSum, source, source);
|
| - sourceP += 4;
|
| + float32x4_t result0 = vdupq_n_f32(0);
|
| + float32x4_t result1 = result0;
|
| + float32x4_t result2 = result0;
|
| + float32x4_t result3 = result0;
|
| +
|
| + for (unsigned loopCount = (n >> 4); loopCount > 0; loopCount--) {
|
| + float32x4_t source0 = vld1q_f32(sourceP);
|
| + float32x4_t source1 = vld1q_f32(sourceP + 4);
|
| + float32x4_t source2 = vld1q_f32(sourceP + 8);
|
| + float32x4_t source3 = vld1q_f32(sourceP + 12);
|
| +
|
| + result0 = vmlaq_f32(result0, source0, source0);
|
| + result1 = vmlaq_f32(result1, source1, source1);
|
| + result2 = vmlaq_f32(result2, source2, source2);
|
| + result3 = vmlaq_f32(result3, source3, source3);
|
| +
|
| + sourceP += 16;
|
| }
|
| - float32x2_t twoSum = vadd_f32(vget_low_f32(fourSum), vget_high_f32(fourSum));
|
|
|
| - float groupSum[2];
|
| - vst1_f32(groupSum, twoSum);
|
| - sum += groupSum[0] + groupSum[1];
|
| + result0 = vaddq_f32(result0, result1);
|
| + result0 = vaddq_f32(result0, result2);
|
| + result0 = vaddq_f32(result0, result3);
|
| +
|
| + sum += vgetq_lane_f32(result0, 0);
|
| + sum += vgetq_lane_f32(result0, 1);
|
| + sum += vgetq_lane_f32(result0, 2);
|
| + sum += vgetq_lane_f32(result0, 3);
|
|
|
| n = tailFrames;
|
| }
|
|
|