Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(2014)

Unified Diff: Source/platform/audio/VectorMath.cpp

Issue 255573002: More optimize approach for NEON in VectorMath Base URL: https://chromium.googlesource.com/chromium/blink.git@master
Patch Set: relocate function calling for caching machanism. Created 6 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: Source/platform/audio/VectorMath.cpp
diff --git a/Source/platform/audio/VectorMath.cpp b/Source/platform/audio/VectorMath.cpp
index 219ed5463977ac5cc172f1117fd0cca3f9008a9a..771b9dc3607c4c3fbd499214b822513e940e0341 100644
--- a/Source/platform/audio/VectorMath.cpp
+++ b/Source/platform/audio/VectorMath.cpp
@@ -169,19 +169,32 @@ void vsma(const float* sourceP, int sourceStride, const float* scale, float* des
}
#elif HAVE(ARM_NEON_INTRINSICS)
if ((sourceStride == 1) && (destStride == 1)) {
- int tailFrames = n % 4;
- const float* endP = destP + n - tailFrames;
-
- float32x4_t k = vdupq_n_f32(*scale);
- while (destP < endP) {
- float32x4_t source = vld1q_f32(sourceP);
- float32x4_t dest = vld1q_f32(destP);
-
- dest = vmlaq_f32(dest, source, k);
- vst1q_f32(destP, dest);
-
- sourceP += 4;
- destP += 4;
+ unsigned tailFrames = n & 15;
+ float32x4_t scaleNum = vdupq_n_f32(*scale);
+
+ for (unsigned loopCount = (n >> 4); loopCount > 0; loopCount--) {
+ float32x4_t dest0 = vld1q_f32(destP);
+ float32x4_t dest1 = vld1q_f32(destP + 4);
+ float32x4_t dest2 = vld1q_f32(destP + 8);
+ float32x4_t dest3 = vld1q_f32(destP + 12);
+
+ float32x4_t source0 = vld1q_f32(sourceP);
+ float32x4_t source1 = vld1q_f32(sourceP + 4);
+ float32x4_t source2 = vld1q_f32(sourceP + 8);
+ float32x4_t source3 = vld1q_f32(sourceP + 12);
+
+ float32x4_t result0 = vmlaq_f32(dest0, source0, scaleNum);
+ float32x4_t result1 = vmlaq_f32(dest1, source1, scaleNum);
+ float32x4_t result2 = vmlaq_f32(dest2, source2, scaleNum);
+ float32x4_t result3 = vmlaq_f32(dest3, source3, scaleNum);
+
+ vst1q_f32(destP, result0);
+ vst1q_f32(destP + 4, result1);
+ vst1q_f32(destP + 8, result2);
+ vst1q_f32(destP + 12, result3);
+
+ sourceP += 16;
+ destP += 16;
}
n = tailFrames;
}
@@ -248,17 +261,28 @@ void vsmul(const float* sourceP, int sourceStride, const float* scale, float* de
}
} else { // If strides are not 1, rollback to normal algorithm.
#elif HAVE(ARM_NEON_INTRINSICS)
- if ((sourceStride == 1) && (destStride == 1)) {
- float k = *scale;
- int tailFrames = n % 4;
- const float* endP = destP + n - tailFrames;
-
- while (destP < endP) {
- float32x4_t source = vld1q_f32(sourceP);
- vst1q_f32(destP, vmulq_n_f32(source, k));
-
- sourceP += 4;
- destP += 4;
+ if (sourceStride == 1 && destStride == 1) {
+ unsigned tailFrames = framesToProcess & 15;
+ float32x4_t scaleNum = vdupq_n_f32(*scale);
+
+ for (unsigned loopCount = (framesToProcess >> 4); loopCount > 0; loopCount--) {
+ float32x4_t source0 = vld1q_f32(sourceP);
+ float32x4_t source1 = vld1q_f32(sourceP + 4);
+ float32x4_t source2 = vld1q_f32(sourceP + 8);
+ float32x4_t source3 = vld1q_f32(sourceP + 12);
+
+ float32x4_t result0 = vmulq_f32(source0, scaleNum);
+ float32x4_t result1 = vmulq_f32(source1, scaleNum);
+ float32x4_t result2 = vmulq_f32(source2, scaleNum);
+ float32x4_t result3 = vmulq_f32(source3, scaleNum);
+
+ vst1q_f32(destP, result0);
+ vst1q_f32(destP + 4, result1);
+ vst1q_f32(destP + 8, result2);
+ vst1q_f32(destP + 12, result3);
+
+ sourceP += 16;
+ destP += 16;
}
n = tailFrames;
}
@@ -359,18 +383,33 @@ void vadd(const float* source1P, int sourceStride1, const float* source2P, int s
}
} else { // if strides are not 1, rollback to normal algorithm
#elif HAVE(ARM_NEON_INTRINSICS)
- if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) {
- int tailFrames = n % 4;
- const float* endP = destP + n - tailFrames;
-
- while (destP < endP) {
- float32x4_t source1 = vld1q_f32(source1P);
- float32x4_t source2 = vld1q_f32(source2P);
- vst1q_f32(destP, vaddq_f32(source1, source2));
-
- source1P += 4;
- source2P += 4;
- destP += 4;
+ if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
+ unsigned tailFrames = framesToProcess & 15;
+
+ for (unsigned loopCount = (framesToProcess >> 4); loopCount > 0; loopCount--) {
+ float32x4_t source10 = vld1q_f32(source1P);
+ float32x4_t source11 = vld1q_f32(source1P + 4);
+ float32x4_t source12 = vld1q_f32(source1P + 8);
+ float32x4_t source13 = vld1q_f32(source1P + 12);
+
+ float32x4_t source20 = vld1q_f32(source2P);
+ float32x4_t source21 = vld1q_f32(source2P + 4);
+ float32x4_t source22 = vld1q_f32(source2P + 8);
+ float32x4_t source23 = vld1q_f32(source2P + 12);
+
+ float32x4_t result0 = vaddq_f32(source10, source20);
+ float32x4_t result1 = vaddq_f32(source11, source21);
+ float32x4_t result2 = vaddq_f32(source12, source22);
+ float32x4_t result3 = vaddq_f32(source13, source23);
+
+ vst1q_f32(destP, result0);
+ vst1q_f32(destP + 4, result1);
+ vst1q_f32(destP + 8, result2);
+ vst1q_f32(destP + 12, result3);
+
+ source1P += 16;
+ source2P += 16;
+ destP += 16;
}
n = tailFrames;
}
@@ -436,18 +475,33 @@ void vmul(const float* source1P, int sourceStride1, const float* source2P, int s
n = tailFrames;
}
#elif HAVE(ARM_NEON_INTRINSICS)
- if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) {
- int tailFrames = n % 4;
- const float* endP = destP + n - tailFrames;
-
- while (destP < endP) {
- float32x4_t source1 = vld1q_f32(source1P);
- float32x4_t source2 = vld1q_f32(source2P);
- vst1q_f32(destP, vmulq_f32(source1, source2));
-
- source1P += 4;
- source2P += 4;
- destP += 4;
+ if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
+ unsigned tailFrames = n & 15;
+
+ for (unsigned loopCount = (n >> 4); loopCount > 0; loopCount--) {
+ float32x4_t source10 = vld1q_f32(source1P);
+ float32x4_t source11 = vld1q_f32(source1P + 4);
+ float32x4_t source12 = vld1q_f32(source1P + 8);
+ float32x4_t source13 = vld1q_f32(source1P + 12);
+
+ float32x4_t source20 = vld1q_f32(source2P);
+ float32x4_t source21 = vld1q_f32(source2P + 4);
+ float32x4_t source22 = vld1q_f32(source2P + 8);
+ float32x4_t source23 = vld1q_f32(source2P + 12);
+
+ float32x4_t result0 = vmulq_f32(source10, source20);
+ float32x4_t result1 = vmulq_f32(source11, source21);
+ float32x4_t result2 = vmulq_f32(source12, source22);
+ float32x4_t result3 = vmulq_f32(source13, source23);
+
+ vst1q_f32(destP, result0);
+ vst1q_f32(destP + 4, result1);
+ vst1q_f32(destP + 8, result2);
+ vst1q_f32(destP + 12, result3);
+
+ source1P += 16;
+ source2P += 16;
+ destP += 16;
}
n = tailFrames;
}
@@ -553,20 +607,35 @@ void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesTo
}
#elif HAVE(ARM_NEON_INTRINSICS)
if (sourceStride == 1) {
- int tailFrames = n % 4;
- const float* endP = sourceP + n - tailFrames;
+ unsigned tailFrames = n & 15;
- float32x4_t fourSum = vdupq_n_f32(0);
- while (sourceP < endP) {
- float32x4_t source = vld1q_f32(sourceP);
- fourSum = vmlaq_f32(fourSum, source, source);
- sourceP += 4;
+ float32x4_t result0 = vdupq_n_f32(0);
+ float32x4_t result1 = result0;
+ float32x4_t result2 = result0;
+ float32x4_t result3 = result0;
+
+ for (unsigned loopCount = (n >> 4); loopCount > 0; loopCount--) {
+ float32x4_t source0 = vld1q_f32(sourceP);
+ float32x4_t source1 = vld1q_f32(sourceP + 4);
+ float32x4_t source2 = vld1q_f32(sourceP + 8);
+ float32x4_t source3 = vld1q_f32(sourceP + 12);
+
+ result0 = vmlaq_f32(result0, source0, source0);
+ result1 = vmlaq_f32(result1, source1, source1);
+ result2 = vmlaq_f32(result2, source2, source2);
+ result3 = vmlaq_f32(result3, source3, source3);
+
+ sourceP += 16;
}
- float32x2_t twoSum = vadd_f32(vget_low_f32(fourSum), vget_high_f32(fourSum));
- float groupSum[2];
- vst1_f32(groupSum, twoSum);
- sum += groupSum[0] + groupSum[1];
+ result0 = vaddq_f32(result0, result1);
+ result0 = vaddq_f32(result0, result2);
+ result0 = vaddq_f32(result0, result3);
+
+ sum += vgetq_lane_f32(result0, 0);
+ sum += vgetq_lane_f32(result0, 1);
+ sum += vgetq_lane_f32(result0, 2);
+ sum += vgetq_lane_f32(result0, 3);
n = tailFrames;
}
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698