Index: Source/platform/audio/VectorMath.cpp |
diff --git a/Source/platform/audio/VectorMath.cpp b/Source/platform/audio/VectorMath.cpp |
index 219ed5463977ac5cc172f1117fd0cca3f9008a9a..357957a5085a6b8b330fdf73b4d74a031cb681f2 100644 |
--- a/Source/platform/audio/VectorMath.cpp |
+++ b/Source/platform/audio/VectorMath.cpp |
@@ -27,105 +27,41 @@ |
#if ENABLE(WEB_AUDIO) |
#include "platform/audio/VectorMath.h" |
-#include "wtf/Assertions.h" |
-#include "wtf/CPU.h" |
-#include <stdint.h> |
- |
-#if OS(MACOSX) |
-#include <Accelerate/Accelerate.h> |
-#endif |
-#if CPU(X86) || CPU(X86_64) |
+#include "wtf/Assertions.h" |
#include <emmintrin.h> |
-#endif |
+#include <stdint.h> |
-#if HAVE(ARM_NEON_INTRINSICS) |
-#include <arm_neon.h> |
-#endif |
+#define SSE2_MULT_ADD(loadInstr, storeInstr) \ |
+ while (destP < endP) { \ |
+ pSource = _mm_load_ps(sourceP); \ |
+ temp = _mm_mul_ps(pSource, mScale); \ |
+ dest = _mm_##loadInstr##_ps(destP); \ |
+ dest = _mm_add_ps(dest, temp); \ |
+ _mm_##storeInstr##_ps(destP, dest); \ |
+ sourceP += 4; \ |
+ destP += 4; \ |
+    } |
-#include <math.h> |
-#include <algorithm> |
+#define SSE2_MULT(loadInstr, storeInstr) \ |
+ while (destP < endP) { \ |
+ pSource1 = _mm_load_ps(source1P); \ |
+ pSource2 = _mm_##loadInstr##_ps(source2P); \ |
+ dest = _mm_mul_ps(pSource1, pSource2); \ |
+ _mm_##storeInstr##_ps(destP, dest); \ |
+ source1P += 4; \ |
+ source2P += 4; \ |
+ destP += 4; \ |
+    } |
namespace blink { |
namespace VectorMath { |
-#if OS(MACOSX) |
-// On the Mac we use the highly optimized versions in Accelerate.framework |
-// In 32-bit mode (__ppc__ or __i386__) <Accelerate/Accelerate.h> includes <vecLib/vDSP_translate.h> which defines macros of the same name as |
-// our namespaced function names, so we must handle this case differently. Other architectures (64bit, ARM, etc.) do not include this header file. |
- |
-void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess) |
-{ |
-#if CPU(X86) |
- ::vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess); |
-#else |
- vDSP_vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess); |
-#endif |
-} |
- |
-void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess) |
-{ |
-#if CPU(X86) |
- ::vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess); |
-#else |
- vDSP_vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess); |
-#endif |
-} |
- |
-void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess) |
-{ |
-#if CPU(X86) |
- ::vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess); |
-#else |
- vDSP_vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess); |
-#endif |
-} |
- |
-void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess) |
-{ |
- DSPSplitComplex sc1; |
- DSPSplitComplex sc2; |
- DSPSplitComplex dest; |
- sc1.realp = const_cast<float*>(real1P); |
- sc1.imagp = const_cast<float*>(imag1P); |
- sc2.realp = const_cast<float*>(real2P); |
- sc2.imagp = const_cast<float*>(imag2P); |
- dest.realp = realDestP; |
- dest.imagp = imagDestP; |
-#if CPU(X86) |
- ::zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1); |
-#else |
- vDSP_zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1); |
-#endif |
-} |
- |
-void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess) |
-{ |
- vDSP_vsma(sourceP, sourceStride, scale, destP, destStride, destP, destStride, framesToProcess); |
-} |
- |
-void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess) |
-{ |
- vDSP_maxmgv(sourceP, sourceStride, maxP, framesToProcess); |
-} |
- |
-void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess) |
-{ |
- vDSP_svesq(const_cast<float*>(sourceP), sourceStride, sumP, framesToProcess); |
-} |
- |
-void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess) |
-{ |
- vDSP_vclip(const_cast<float*>(sourceP), sourceStride, const_cast<float*>(lowThresholdP), const_cast<float*>(highThresholdP), destP, destStride, framesToProcess); |
-} |
-#else |
- |
void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess) |
{ |
int n = framesToProcess; |
-#if CPU(X86) || CPU(X86_64) |
if ((sourceStride == 1) && (destStride == 1)) { |
float k = *scale; |
@@ -148,18 +84,6 @@ void vsma(const float* sourceP, int sourceStride, const float* scale, float* des |
bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F); |
-#define SSE2_MULT_ADD(loadInstr, storeInstr) \ |
- while (destP < endP) \ |
- { \ |
- pSource = _mm_load_ps(sourceP); \ |
- temp = _mm_mul_ps(pSource, mScale); \ |
- dest = _mm_##loadInstr##_ps(destP); \ |
- dest = _mm_add_ps(dest, temp); \ |
- _mm_##storeInstr##_ps(destP, dest); \ |
- sourceP += 4; \ |
- destP += 4; \ |
- } |
- |
if (destAligned) |
SSE2_MULT_ADD(load, store) |
else |
@@ -167,25 +91,7 @@ void vsma(const float* sourceP, int sourceStride, const float* scale, float* des |
n = tailFrames; |
} |
-#elif HAVE(ARM_NEON_INTRINSICS) |
- if ((sourceStride == 1) && (destStride == 1)) { |
- int tailFrames = n % 4; |
- const float* endP = destP + n - tailFrames; |
- |
- float32x4_t k = vdupq_n_f32(*scale); |
- while (destP < endP) { |
- float32x4_t source = vld1q_f32(sourceP); |
- float32x4_t dest = vld1q_f32(destP); |
- |
- dest = vmlaq_f32(dest, source, k); |
- vst1q_f32(destP, dest); |
- sourceP += 4; |
- destP += 4; |
- } |
- n = tailFrames; |
- } |
-#endif |
while (n) { |
*destP += *sourceP * *scale; |
sourceP += sourceStride; |
@@ -198,7 +104,6 @@ void vsmul(const float* sourceP, int sourceStride, const float* scale, float* de |
{ |
int n = framesToProcess; |
-#if CPU(X86) || CPU(X86_64) |
if ((sourceStride == 1) && (destStride == 1)) { |
float k = *scale; |
@@ -247,38 +152,19 @@ void vsmul(const float* sourceP, int sourceStride, const float* scale, float* de |
n--; |
} |
} else { // If strides are not 1, rollback to normal algorithm. |
-#elif HAVE(ARM_NEON_INTRINSICS) |
- if ((sourceStride == 1) && (destStride == 1)) { |
float k = *scale; |
- int tailFrames = n % 4; |
- const float* endP = destP + n - tailFrames; |
- |
- while (destP < endP) { |
- float32x4_t source = vld1q_f32(sourceP); |
- vst1q_f32(destP, vmulq_n_f32(source, k)); |
- |
- sourceP += 4; |
- destP += 4; |
+ while (n--) { |
+ *destP = k * *sourceP; |
+ sourceP += sourceStride; |
+ destP += destStride; |
} |
- n = tailFrames; |
- } |
-#endif |
- float k = *scale; |
- while (n--) { |
- *destP = k * *sourceP; |
- sourceP += sourceStride; |
- destP += destStride; |
- } |
-#if CPU(X86) || CPU(X86_64) |
} |
-#endif |
} |
void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess) |
{ |
int n = framesToProcess; |
-#if CPU(X86) || CPU(X86_64) |
if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) { |
// If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. |
while ((reinterpret_cast<size_t>(source1P) & 0x0F) && n) { |
@@ -358,40 +244,18 @@ void vadd(const float* source1P, int sourceStride1, const float* source2P, int s |
n--; |
} |
} else { // if strides are not 1, rollback to normal algorithm |
-#elif HAVE(ARM_NEON_INTRINSICS) |
- if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) { |
- int tailFrames = n % 4; |
- const float* endP = destP + n - tailFrames; |
- |
- while (destP < endP) { |
- float32x4_t source1 = vld1q_f32(source1P); |
- float32x4_t source2 = vld1q_f32(source2P); |
- vst1q_f32(destP, vaddq_f32(source1, source2)); |
- |
- source1P += 4; |
- source2P += 4; |
- destP += 4; |
+ while (n--) { |
+ *destP = *source1P + *source2P; |
+ source1P += sourceStride1; |
+ source2P += sourceStride2; |
+ destP += destStride; |
} |
- n = tailFrames; |
- } |
-#endif |
- while (n--) { |
- *destP = *source1P + *source2P; |
- source1P += sourceStride1; |
- source2P += sourceStride2; |
- destP += destStride; |
- } |
-#if CPU(X86) || CPU(X86_64) |
- } |
-#endif |
} |
void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess) |
{ |
- |
int n = framesToProcess; |
-#if CPU(X86) || CPU(X86_64) |
if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) { |
// If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately. |
while ((reinterpret_cast<uintptr_t>(source1P) & 0x0F) && n) { |
@@ -412,18 +276,6 @@ void vmul(const float* source1P, int sourceStride1, const float* source2P, int s |
bool source2Aligned = !(reinterpret_cast<uintptr_t>(source2P) & 0x0F); |
bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F); |
-#define SSE2_MULT(loadInstr, storeInstr) \ |
- while (destP < endP) \ |
- { \ |
- pSource1 = _mm_load_ps(source1P); \ |
- pSource2 = _mm_##loadInstr##_ps(source2P); \ |
- dest = _mm_mul_ps(pSource1, pSource2); \ |
- _mm_##storeInstr##_ps(destP, dest); \ |
- source1P += 4; \ |
- source2P += 4; \ |
- destP += 4; \ |
- } |
- |
if (source2Aligned && destAligned) // Both aligned. |
SSE2_MULT(load, store) |
else if (source2Aligned && !destAligned) // Source2 is aligned but dest not. |
@@ -435,23 +287,7 @@ void vmul(const float* source1P, int sourceStride1, const float* source2P, int s |
n = tailFrames; |
} |
-#elif HAVE(ARM_NEON_INTRINSICS) |
- if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) { |
- int tailFrames = n % 4; |
- const float* endP = destP + n - tailFrames; |
- |
- while (destP < endP) { |
- float32x4_t source1 = vld1q_f32(source1P); |
- float32x4_t source2 = vld1q_f32(source2P); |
- vst1q_f32(destP, vmulq_f32(source1, source2)); |
- source1P += 4; |
- source2P += 4; |
- destP += 4; |
- } |
- n = tailFrames; |
- } |
-#endif |
while (n) { |
*destP = *source1P * *source2P; |
source1P += sourceStride1; |
@@ -464,7 +300,7 @@ void vmul(const float* source1P, int sourceStride1, const float* source2P, int s |
void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess) |
{ |
unsigned i = 0; |
-#if CPU(X86) || CPU(X86_64) |
+ |
// Only use the SSE optimization in the very common case that all addresses are 16-byte aligned. |
// Otherwise, fall through to the scalar code below. |
if (!(reinterpret_cast<uintptr_t>(real1P) & 0x0F) |
@@ -489,23 +325,7 @@ void zvmul(const float* real1P, const float* imag1P, const float* real2P, const |
i += 4; |
} |
} |
-#elif HAVE(ARM_NEON_INTRINSICS) |
- unsigned endSize = framesToProcess - framesToProcess % 4; |
- while (i < endSize) { |
- float32x4_t real1 = vld1q_f32(real1P + i); |
- float32x4_t real2 = vld1q_f32(real2P + i); |
- float32x4_t imag1 = vld1q_f32(imag1P + i); |
- float32x4_t imag2 = vld1q_f32(imag2P + i); |
- float32x4_t realResult = vmlsq_f32(vmulq_f32(real1, real2), imag1, imag2); |
- float32x4_t imagResult = vmlaq_f32(vmulq_f32(real1, imag2), imag1, real2); |
- |
- vst1q_f32(realDestP + i, realResult); |
- vst1q_f32(imagDestP + i, imagResult); |
- |
- i += 4; |
- } |
-#endif |
for (; i < framesToProcess; ++i) { |
// Read and compute result before storing them, in case the |
// destination is the same as one of the sources. |
@@ -522,7 +342,6 @@ void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesTo |
int n = framesToProcess; |
float sum = 0; |
-#if CPU(X86) || CPU(X86_64) |
if (sourceStride == 1) { |
// If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. |
while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) { |
@@ -551,26 +370,6 @@ void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesTo |
n = tailFrames; |
} |
-#elif HAVE(ARM_NEON_INTRINSICS) |
- if (sourceStride == 1) { |
- int tailFrames = n % 4; |
- const float* endP = sourceP + n - tailFrames; |
- |
- float32x4_t fourSum = vdupq_n_f32(0); |
- while (sourceP < endP) { |
- float32x4_t source = vld1q_f32(sourceP); |
- fourSum = vmlaq_f32(fourSum, source, source); |
- sourceP += 4; |
- } |
- float32x2_t twoSum = vadd_f32(vget_low_f32(fourSum), vget_high_f32(fourSum)); |
- |
- float groupSum[2]; |
- vst1_f32(groupSum, twoSum); |
- sum += groupSum[0] + groupSum[1]; |
- |
- n = tailFrames; |
- } |
-#endif |
while (n--) { |
float sample = *sourceP; |
@@ -587,7 +386,6 @@ void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesT |
int n = framesToProcess; |
float max = 0; |
-#if CPU(X86) || CPU(X86_64) |
if (sourceStride == 1) { |
// If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. |
while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) { |
@@ -621,26 +419,6 @@ void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesT |
n = tailFrames; |
} |
-#elif HAVE(ARM_NEON_INTRINSICS) |
- if (sourceStride == 1) { |
- int tailFrames = n % 4; |
- const float* endP = sourceP + n - tailFrames; |
- |
- float32x4_t fourMax = vdupq_n_f32(0); |
- while (sourceP < endP) { |
- float32x4_t source = vld1q_f32(sourceP); |
- fourMax = vmaxq_f32(fourMax, vabsq_f32(source)); |
- sourceP += 4; |
- } |
- float32x2_t twoMax = vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourMax)); |
- |
- float groupMax[2]; |
- vst1_f32(groupMax, twoMax); |
- max = std::max(groupMax[0], groupMax[1]); |
- |
- n = tailFrames; |
- } |
-#endif |
while (n--) { |
max = std::max(max, fabsf(*sourceP)); |
@@ -658,22 +436,6 @@ void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, c |
float highThreshold = *highThresholdP; |
// FIXME: Optimize for SSE2. |
-#if HAVE(ARM_NEON_INTRINSICS) |
- if ((sourceStride == 1) && (destStride == 1)) { |
- int tailFrames = n % 4; |
- const float* endP = destP + n - tailFrames; |
- |
- float32x4_t low = vdupq_n_f32(lowThreshold); |
- float32x4_t high = vdupq_n_f32(highThreshold); |
- while (destP < endP) { |
- float32x4_t source = vld1q_f32(sourceP); |
- vst1q_f32(destP, vmaxq_f32(vminq_f32(source, high), low)); |
- sourceP += 4; |
- destP += 4; |
- } |
- n = tailFrames; |
- } |
-#endif |
while (n--) { |
*destP = std::max(std::min(*sourceP, highThreshold), lowThreshold); |
sourceP += sourceStride; |
@@ -681,8 +443,6 @@ void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, c |
} |
} |
-#endif // OS(MACOSX) |
- |
} // namespace VectorMath |
} // namespace blink |