Index: Source/platform/audio/VectorMath.cpp |
diff --git a/Source/platform/audio/VectorMath.cpp b/Source/platform/audio/VectorMath.cpp |
index 219ed5463977ac5cc172f1117fd0cca3f9008a9a..771b9dc3607c4c3fbd499214b822513e940e0341 100644 |
--- a/Source/platform/audio/VectorMath.cpp |
+++ b/Source/platform/audio/VectorMath.cpp |
@@ -169,19 +169,32 @@ void vsma(const float* sourceP, int sourceStride, const float* scale, float* des |
} |
#elif HAVE(ARM_NEON_INTRINSICS) |
if ((sourceStride == 1) && (destStride == 1)) { |
- int tailFrames = n % 4; |
- const float* endP = destP + n - tailFrames; |
- |
- float32x4_t k = vdupq_n_f32(*scale); |
- while (destP < endP) { |
- float32x4_t source = vld1q_f32(sourceP); |
- float32x4_t dest = vld1q_f32(destP); |
- |
- dest = vmlaq_f32(dest, source, k); |
- vst1q_f32(destP, dest); |
- |
- sourceP += 4; |
- destP += 4; |
+ unsigned tailFrames = n & 15; |
+ float32x4_t scaleNum = vdupq_n_f32(*scale); |
+ |
+ for (unsigned loopCount = (n >> 4); loopCount > 0; loopCount--) { |
+ float32x4_t dest0 = vld1q_f32(destP); |
+ float32x4_t dest1 = vld1q_f32(destP + 4); |
+ float32x4_t dest2 = vld1q_f32(destP + 8); |
+ float32x4_t dest3 = vld1q_f32(destP + 12); |
+ |
+ float32x4_t source0 = vld1q_f32(sourceP); |
+ float32x4_t source1 = vld1q_f32(sourceP + 4); |
+ float32x4_t source2 = vld1q_f32(sourceP + 8); |
+ float32x4_t source3 = vld1q_f32(sourceP + 12); |
+ |
+ float32x4_t result0 = vmlaq_f32(dest0, source0, scaleNum); |
+ float32x4_t result1 = vmlaq_f32(dest1, source1, scaleNum); |
+ float32x4_t result2 = vmlaq_f32(dest2, source2, scaleNum); |
+ float32x4_t result3 = vmlaq_f32(dest3, source3, scaleNum); |
+ |
+ vst1q_f32(destP, result0); |
+ vst1q_f32(destP + 4, result1); |
+ vst1q_f32(destP + 8, result2); |
+ vst1q_f32(destP + 12, result3); |
+ |
+ sourceP += 16; |
+ destP += 16; |
} |
n = tailFrames; |
} |
@@ -248,17 +261,28 @@ void vsmul(const float* sourceP, int sourceStride, const float* scale, float* de |
} |
} else { // If strides are not 1, rollback to normal algorithm. |
#elif HAVE(ARM_NEON_INTRINSICS) |
- if ((sourceStride == 1) && (destStride == 1)) { |
- float k = *scale; |
- int tailFrames = n % 4; |
- const float* endP = destP + n - tailFrames; |
- |
- while (destP < endP) { |
- float32x4_t source = vld1q_f32(sourceP); |
- vst1q_f32(destP, vmulq_n_f32(source, k)); |
- |
- sourceP += 4; |
- destP += 4; |
+ if (sourceStride == 1 && destStride == 1) { |
+ unsigned tailFrames = framesToProcess & 15; |
+ float32x4_t scaleNum = vdupq_n_f32(*scale); |
+ |
+ for (unsigned loopCount = (framesToProcess >> 4); loopCount > 0; loopCount--) { |
+ float32x4_t source0 = vld1q_f32(sourceP); |
+ float32x4_t source1 = vld1q_f32(sourceP + 4); |
+ float32x4_t source2 = vld1q_f32(sourceP + 8); |
+ float32x4_t source3 = vld1q_f32(sourceP + 12); |
+ |
+ float32x4_t result0 = vmulq_f32(source0, scaleNum); |
+ float32x4_t result1 = vmulq_f32(source1, scaleNum); |
+ float32x4_t result2 = vmulq_f32(source2, scaleNum); |
+ float32x4_t result3 = vmulq_f32(source3, scaleNum); |
+ |
+ vst1q_f32(destP, result0); |
+ vst1q_f32(destP + 4, result1); |
+ vst1q_f32(destP + 8, result2); |
+ vst1q_f32(destP + 12, result3); |
+ |
+ sourceP += 16; |
+ destP += 16; |
} |
n = tailFrames; |
} |
@@ -359,18 +383,33 @@ void vadd(const float* source1P, int sourceStride1, const float* source2P, int s |
} |
} else { // if strides are not 1, rollback to normal algorithm |
#elif HAVE(ARM_NEON_INTRINSICS) |
- if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) { |
- int tailFrames = n % 4; |
- const float* endP = destP + n - tailFrames; |
- |
- while (destP < endP) { |
- float32x4_t source1 = vld1q_f32(source1P); |
- float32x4_t source2 = vld1q_f32(source2P); |
- vst1q_f32(destP, vaddq_f32(source1, source2)); |
- |
- source1P += 4; |
- source2P += 4; |
- destP += 4; |
+ if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) { |
+ unsigned tailFrames = framesToProcess & 15; |
+ |
+ for (unsigned loopCount = (framesToProcess >> 4); loopCount > 0; loopCount--) { |
+ float32x4_t source10 = vld1q_f32(source1P); |
+ float32x4_t source11 = vld1q_f32(source1P + 4); |
+ float32x4_t source12 = vld1q_f32(source1P + 8); |
+ float32x4_t source13 = vld1q_f32(source1P + 12); |
+ |
+ float32x4_t source20 = vld1q_f32(source2P); |
+ float32x4_t source21 = vld1q_f32(source2P + 4); |
+ float32x4_t source22 = vld1q_f32(source2P + 8); |
+ float32x4_t source23 = vld1q_f32(source2P + 12); |
+ |
+ float32x4_t result0 = vaddq_f32(source10, source20); |
+ float32x4_t result1 = vaddq_f32(source11, source21); |
+ float32x4_t result2 = vaddq_f32(source12, source22); |
+ float32x4_t result3 = vaddq_f32(source13, source23); |
+ |
+ vst1q_f32(destP, result0); |
+ vst1q_f32(destP + 4, result1); |
+ vst1q_f32(destP + 8, result2); |
+ vst1q_f32(destP + 12, result3); |
+ |
+ source1P += 16; |
+ source2P += 16; |
+ destP += 16; |
} |
n = tailFrames; |
} |
@@ -436,18 +475,33 @@ void vmul(const float* source1P, int sourceStride1, const float* source2P, int s |
n = tailFrames; |
} |
#elif HAVE(ARM_NEON_INTRINSICS) |
- if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) { |
- int tailFrames = n % 4; |
- const float* endP = destP + n - tailFrames; |
- |
- while (destP < endP) { |
- float32x4_t source1 = vld1q_f32(source1P); |
- float32x4_t source2 = vld1q_f32(source2P); |
- vst1q_f32(destP, vmulq_f32(source1, source2)); |
- |
- source1P += 4; |
- source2P += 4; |
- destP += 4; |
+ if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) { |
+ unsigned tailFrames = n & 15; |
+ |
+ for (unsigned loopCount = (n >> 4); loopCount > 0; loopCount--) { |
+ float32x4_t source10 = vld1q_f32(source1P); |
+ float32x4_t source11 = vld1q_f32(source1P + 4); |
+ float32x4_t source12 = vld1q_f32(source1P + 8); |
+ float32x4_t source13 = vld1q_f32(source1P + 12); |
+ |
+ float32x4_t source20 = vld1q_f32(source2P); |
+ float32x4_t source21 = vld1q_f32(source2P + 4); |
+ float32x4_t source22 = vld1q_f32(source2P + 8); |
+ float32x4_t source23 = vld1q_f32(source2P + 12); |
+ |
+ float32x4_t result0 = vmulq_f32(source10, source20); |
+ float32x4_t result1 = vmulq_f32(source11, source21); |
+ float32x4_t result2 = vmulq_f32(source12, source22); |
+ float32x4_t result3 = vmulq_f32(source13, source23); |
+ |
+ vst1q_f32(destP, result0); |
+ vst1q_f32(destP + 4, result1); |
+ vst1q_f32(destP + 8, result2); |
+ vst1q_f32(destP + 12, result3); |
+ |
+ source1P += 16; |
+ source2P += 16; |
+ destP += 16; |
} |
n = tailFrames; |
} |
@@ -553,20 +607,35 @@ void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesTo |
} |
#elif HAVE(ARM_NEON_INTRINSICS) |
if (sourceStride == 1) { |
- int tailFrames = n % 4; |
- const float* endP = sourceP + n - tailFrames; |
+ unsigned tailFrames = n & 15; |
- float32x4_t fourSum = vdupq_n_f32(0); |
- while (sourceP < endP) { |
- float32x4_t source = vld1q_f32(sourceP); |
- fourSum = vmlaq_f32(fourSum, source, source); |
- sourceP += 4; |
+ float32x4_t result0 = vdupq_n_f32(0); |
+ float32x4_t result1 = result0; |
+ float32x4_t result2 = result0; |
+ float32x4_t result3 = result0; |
+ |
+ for (unsigned loopCount = (n >> 4); loopCount > 0; loopCount--) { |
+ float32x4_t source0 = vld1q_f32(sourceP); |
+ float32x4_t source1 = vld1q_f32(sourceP + 4); |
+ float32x4_t source2 = vld1q_f32(sourceP + 8); |
+ float32x4_t source3 = vld1q_f32(sourceP + 12); |
+ |
+ result0 = vmlaq_f32(result0, source0, source0); |
+ result1 = vmlaq_f32(result1, source1, source1); |
+ result2 = vmlaq_f32(result2, source2, source2); |
+ result3 = vmlaq_f32(result3, source3, source3); |
+ |
+ sourceP += 16; |
} |
- float32x2_t twoSum = vadd_f32(vget_low_f32(fourSum), vget_high_f32(fourSum)); |
- float groupSum[2]; |
- vst1_f32(groupSum, twoSum); |
- sum += groupSum[0] + groupSum[1]; |
+ result0 = vaddq_f32(result0, result1); |
+ result0 = vaddq_f32(result0, result2); |
+ result0 = vaddq_f32(result0, result3); |
+ |
+ sum += vgetq_lane_f32(result0, 0); |
+ sum += vgetq_lane_f32(result0, 1); |
+ sum += vgetq_lane_f32(result0, 2); |
+ sum += vgetq_lane_f32(result0, 3); |
n = tailFrames; |
} |