Chromium Code Reviews| Index: Source/platform/audio/VectorMath.cpp |
| diff --git a/Source/platform/audio/VectorMath.cpp b/Source/platform/audio/VectorMath.cpp |
| index 0bbe6bae9fdd85f346971db6589ac41a6c2d93f9..88ea1e5324db9bfe5da9a523b28221867ce72e99 100644 |
| --- a/Source/platform/audio/VectorMath.cpp |
| +++ b/Source/platform/audio/VectorMath.cpp |
| @@ -179,19 +179,34 @@ void vsma(const float* sourceP, int sourceStride, const float* scale, float* des |
| } |
| #elif HAVE(ARM_NEON_INTRINSICS) |
| if ((sourceStride == 1) && (destStride == 1)) { |
| - int tailFrames = n % 4; |
| - const float* endP = destP + n - tailFrames; |
| + unsigned tailFrames = n & 15; |
| + float32x4_t scaleNum = vdupq_n_f32(*scale); |
| - float32x4_t k = vdupq_n_f32(*scale); |
| - while (destP < endP) { |
| - float32x4_t source = vld1q_f32(sourceP); |
| - float32x4_t dest = vld1q_f32(destP); |
| + for (unsigned loopCount = (n >> 4); loopCount > 0; loopCount--) { |
| + float32x4_t dest0 = vld1q_f32(destP); |
| + float32x4_t source0 = vld1q_f32(sourceP); |
| - dest = vmlaq_f32(dest, source, k); |
| - vst1q_f32(destP, dest); |
| + float32x4_t dest1 = vld1q_f32(destP + 4); |
| + float32x4_t source1 = vld1q_f32(sourceP + 4); |
| - sourceP += 4; |
| - destP += 4; |
| + float32x4_t dest2 = vld1q_f32(destP + 8); |
| + float32x4_t source2 = vld1q_f32(sourceP + 8); |
| + |
| + float32x4_t dest3 = vld1q_f32(destP + 12); |
| + float32x4_t source3 = vld1q_f32(sourceP + 12); |
| + |
| + float32x4_t result0 = vmlaq_f32(dest0, source0, scaleNum); |
| + float32x4_t result1 = vmlaq_f32(dest1, source1, scaleNum); |
| + float32x4_t result2 = vmlaq_f32(dest2, source2, scaleNum); |
| + float32x4_t result3 = vmlaq_f32(dest3, source3, scaleNum); |
| + |
| + vst1q_f32(destP, result0); |
| + vst1q_f32(destP + 4, result1); |
| + vst1q_f32(destP + 8, result2); |
| + vst1q_f32(destP + 12, result3); |
| + |
| + sourceP += 16; |
| + destP += 16; |
| } |
| n = tailFrames; |
| } |
| @@ -258,17 +273,28 @@ void vsmul(const float* sourceP, int sourceStride, const float* scale, float* de |
| } |
| } else { // If strides are not 1, rollback to normal algorithm. |
| #elif HAVE(ARM_NEON_INTRINSICS) |
| - if ((sourceStride == 1) && (destStride == 1)) { |
| - float k = *scale; |
| - int tailFrames = n % 4; |
| - const float* endP = destP + n - tailFrames; |
| - |
| - while (destP < endP) { |
| - float32x4_t source = vld1q_f32(sourceP); |
| - vst1q_f32(destP, vmulq_n_f32(source, k)); |
| - |
| - sourceP += 4; |
| - destP += 4; |
| + if (sourceStride == 1 && destStride == 1) { |
| + unsigned tailFrames = framesToProcess & 15; |
| + float32x4_t scaleNum = vdupq_n_f32(*scale); |
|
Raymond Toy
2014/09/22 20:00:36
Is there any performance difference between making
KhNo
2014/09/23 02:07:32
No performance difference for scaleNum; it is just
|
| + |
| + for (unsigned loopCount = (framesToProcess >> 4); loopCount > 0; loopCount--) { |
| + float32x4_t source0 = vld1q_f32(sourceP); |
| + float32x4_t source1 = vld1q_f32(sourceP + 4); |
| + float32x4_t source2 = vld1q_f32(sourceP + 8); |
| + float32x4_t source3 = vld1q_f32(sourceP + 12); |
| + |
| + float32x4_t result0 = vmulq_f32(source0, scaleNum); |
| + float32x4_t result1 = vmulq_f32(source1, scaleNum); |
| + float32x4_t result2 = vmulq_f32(source2, scaleNum); |
| + float32x4_t result3 = vmulq_f32(source3, scaleNum); |
| + |
| + vst1q_f32(destP, result0); |
| + vst1q_f32(destP + 4, result1); |
| + vst1q_f32(destP + 8, result2); |
| + vst1q_f32(destP + 12, result3); |
| + |
| + sourceP += 16; |
| + destP += 16; |
| } |
| n = tailFrames; |
| } |
| @@ -369,18 +395,35 @@ void vadd(const float* source1P, int sourceStride1, const float* source2P, int s |
| } |
| } else { // if strides are not 1, rollback to normal algorithm |
| #elif HAVE(ARM_NEON_INTRINSICS) |
| - if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) { |
| - int tailFrames = n % 4; |
| - const float* endP = destP + n - tailFrames; |
| + if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) { |
| + unsigned tailFrames = framesToProcess & 15; |
| - while (destP < endP) { |
| - float32x4_t source1 = vld1q_f32(source1P); |
| - float32x4_t source2 = vld1q_f32(source2P); |
| - vst1q_f32(destP, vaddq_f32(source1, source2)); |
| + for (unsigned loopCount = (framesToProcess >> 4); loopCount > 0; loopCount--) { |
| + float32x4_t source10 = vld1q_f32(source1P); |
| + float32x4_t source20 = vld1q_f32(source2P); |
| - source1P += 4; |
| - source2P += 4; |
| - destP += 4; |
| + float32x4_t source11 = vld1q_f32(source1P + 4); |
| + float32x4_t source21 = vld1q_f32(source2P + 4); |
| + |
| + float32x4_t source12 = vld1q_f32(source1P + 8); |
| + float32x4_t source22 = vld1q_f32(source2P + 8); |
| + |
| + float32x4_t source13 = vld1q_f32(source1P + 12); |
| + float32x4_t source23 = vld1q_f32(source2P + 12); |
| + |
| + float32x4_t result0 = vaddq_f32(source10, source20); |
| + float32x4_t result1 = vaddq_f32(source11, source21); |
| + float32x4_t result2 = vaddq_f32(source12, source22); |
| + float32x4_t result3 = vaddq_f32(source13, source23); |
| + |
| + vst1q_f32(destP, result0); |
| + vst1q_f32(destP + 4, result1); |
| + vst1q_f32(destP + 8, result2); |
| + vst1q_f32(destP + 12, result3); |
| + |
| + source1P += 16; |
| + source2P += 16; |
| + destP += 16; |
| } |
| n = tailFrames; |
| } |
| @@ -446,18 +489,35 @@ void vmul(const float* source1P, int sourceStride1, const float* source2P, int s |
| n = tailFrames; |
| } |
| #elif HAVE(ARM_NEON_INTRINSICS) |
| - if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) { |
| - int tailFrames = n % 4; |
| - const float* endP = destP + n - tailFrames; |
| + if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) { |
| + unsigned tailFrames = n & 15; |
| - while (destP < endP) { |
| - float32x4_t source1 = vld1q_f32(source1P); |
| - float32x4_t source2 = vld1q_f32(source2P); |
| - vst1q_f32(destP, vmulq_f32(source1, source2)); |
| + for (unsigned loopCount = (n >> 4); loopCount > 0; loopCount--) { |
| + float32x4_t source10 = vld1q_f32(source1P); |
| + float32x4_t source20 = vld1q_f32(source2P); |
|
Raymond Toy
2014/09/22 20:00:35
It might be advantageous to load all of the elemen
KhNo
2014/09/23 02:07:32
Thanks for review, I think also it is better for c
|
| - source1P += 4; |
| - source2P += 4; |
| - destP += 4; |
| + float32x4_t source11 = vld1q_f32(source1P + 4); |
| + float32x4_t source21 = vld1q_f32(source2P + 4); |
| + |
| + float32x4_t source12 = vld1q_f32(source1P + 8); |
| + float32x4_t source22 = vld1q_f32(source2P + 8); |
| + |
| + float32x4_t source13 = vld1q_f32(source1P + 12); |
| + float32x4_t source23 = vld1q_f32(source2P + 12); |
| + |
| + float32x4_t result0 = vmulq_f32(source10, source20); |
| + float32x4_t result1 = vmulq_f32(source11, source21); |
| + float32x4_t result2 = vmulq_f32(source12, source22); |
| + float32x4_t result3 = vmulq_f32(source13, source23); |
| + |
| + vst1q_f32(destP, result0); |
| + vst1q_f32(destP + 4, result1); |
| + vst1q_f32(destP + 8, result2); |
| + vst1q_f32(destP + 12, result3); |
| + |
| + source1P += 16; |
| + source2P += 16; |
| + destP += 16; |
| } |
| n = tailFrames; |
| } |
| @@ -563,20 +623,35 @@ void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesTo |
| } |
| #elif HAVE(ARM_NEON_INTRINSICS) |
| if (sourceStride == 1) { |
| - int tailFrames = n % 4; |
| - const float* endP = sourceP + n - tailFrames; |
| + unsigned tailFrames = n & 15; |
| - float32x4_t fourSum = vdupq_n_f32(0); |
| - while (sourceP < endP) { |
| - float32x4_t source = vld1q_f32(sourceP); |
| - fourSum = vmlaq_f32(fourSum, source, source); |
| - sourceP += 4; |
| + float32x4_t result0 = vdupq_n_f32(0); |
| + float32x4_t result1 = result0; |
| + float32x4_t result2 = result0; |
| + float32x4_t result3 = result0; |
| + |
| + for (unsigned loopCount = (n >> 4); loopCount > 0; loopCount--) { |
| + float32x4_t source0 = vld1q_f32(sourceP); |
| + float32x4_t source1 = vld1q_f32(sourceP + 4); |
| + float32x4_t source2 = vld1q_f32(sourceP + 8); |
| + float32x4_t source3 = vld1q_f32(sourceP + 12); |
| + |
| + result0 = vmlaq_f32(result0, source0, source0); |
| + result1 = vmlaq_f32(result1, source1, source1); |
| + result2 = vmlaq_f32(result2, source2, source2); |
| + result3 = vmlaq_f32(result3, source3, source3); |
| + |
| + sourceP += 16; |
| } |
| - float32x2_t twoSum = vadd_f32(vget_low_f32(fourSum), vget_high_f32(fourSum)); |
| - float groupSum[2]; |
| - vst1_f32(groupSum, twoSum); |
| - sum += groupSum[0] + groupSum[1]; |
| + result0 = vaddq_f32(result0, result1); |
| + result0 = vaddq_f32(result0, result2); |
| + result0 = vaddq_f32(result0, result3); |
| + |
| + sum += vgetq_lane_f32(result0, 0); |
| + sum += vgetq_lane_f32(result0, 1); |
| + sum += vgetq_lane_f32(result0, 2); |
| + sum += vgetq_lane_f32(result0, 3); |
| n = tailFrames; |
| } |