| Index: Source/platform/audio/VectorMath.cpp
|
| diff --git a/Source/platform/audio/VectorMath.cpp b/Source/platform/audio/VectorMath.cpp
|
| index 219ed5463977ac5cc172f1117fd0cca3f9008a9a..357957a5085a6b8b330fdf73b4d74a031cb681f2 100644
|
| --- a/Source/platform/audio/VectorMath.cpp
|
| +++ b/Source/platform/audio/VectorMath.cpp
|
| @@ -27,105 +27,41 @@
|
| #if ENABLE(WEB_AUDIO)
|
|
|
| #include "platform/audio/VectorMath.h"
|
| -#include "wtf/Assertions.h"
|
| -#include "wtf/CPU.h"
|
| -#include <stdint.h>
|
| -
|
| -#if OS(MACOSX)
|
| -#include <Accelerate/Accelerate.h>
|
| -#endif
|
|
|
| -#if CPU(X86) || CPU(X86_64)
|
| +#include "wtf/Assertions.h"
|
| #include <emmintrin.h>
|
| -#endif
|
| +#include <stdint.h>
|
|
|
| -#if HAVE(ARM_NEON_INTRINSICS)
|
| -#include <arm_neon.h>
|
| -#endif
|
| +#define SSE2_MULT_ADD(loadInstr, storeInstr) \
|
| + while (destP < endP) { \
|
| + pSource = _mm_load_ps(sourceP); \
|
| + temp = _mm_mul_ps(pSource, mScale); \
|
| + dest = _mm_##loadInstr##_ps(destP); \
|
| + dest = _mm_add_ps(dest, temp); \
|
| + _mm_##storeInstr##_ps(destP, dest); \
|
| + sourceP += 4; \
|
| + destP += 4; \
|
| + } \
|
|
|
| -#include <math.h>
|
| -#include <algorithm>
|
| +#define SSE2_MULT(loadInstr, storeInstr) \
|
| + while (destP < endP) { \
|
| + pSource1 = _mm_load_ps(source1P); \
|
| + pSource2 = _mm_##loadInstr##_ps(source2P); \
|
| + dest = _mm_mul_ps(pSource1, pSource2); \
|
| + _mm_##storeInstr##_ps(destP, dest); \
|
| + source1P += 4; \
|
| + source2P += 4; \
|
| + destP += 4; \
|
| + } \
|
|
|
| namespace blink {
|
|
|
| namespace VectorMath {
|
|
|
| -#if OS(MACOSX)
|
| -// On the Mac we use the highly optimized versions in Accelerate.framework
|
| -// In 32-bit mode (__ppc__ or __i386__) <Accelerate/Accelerate.h> includes <vecLib/vDSP_translate.h> which defines macros of the same name as
|
| -// our namespaced function names, so we must handle this case differently. Other architectures (64bit, ARM, etc.) do not include this header file.
|
| -
|
| -void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
|
| -{
|
| -#if CPU(X86)
|
| - ::vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess);
|
| -#else
|
| - vDSP_vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess);
|
| -#endif
|
| -}
|
| -
|
| -void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
|
| -{
|
| -#if CPU(X86)
|
| - ::vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
|
| -#else
|
| - vDSP_vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
|
| -#endif
|
| -}
|
| -
|
| -void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
|
| -{
|
| -#if CPU(X86)
|
| - ::vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
|
| -#else
|
| - vDSP_vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
|
| -#endif
|
| -}
|
| -
|
| -void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess)
|
| -{
|
| - DSPSplitComplex sc1;
|
| - DSPSplitComplex sc2;
|
| - DSPSplitComplex dest;
|
| - sc1.realp = const_cast<float*>(real1P);
|
| - sc1.imagp = const_cast<float*>(imag1P);
|
| - sc2.realp = const_cast<float*>(real2P);
|
| - sc2.imagp = const_cast<float*>(imag2P);
|
| - dest.realp = realDestP;
|
| - dest.imagp = imagDestP;
|
| -#if CPU(X86)
|
| - ::zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1);
|
| -#else
|
| - vDSP_zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1);
|
| -#endif
|
| -}
|
| -
|
| -void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
|
| -{
|
| - vDSP_vsma(sourceP, sourceStride, scale, destP, destStride, destP, destStride, framesToProcess);
|
| -}
|
| -
|
| -void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess)
|
| -{
|
| - vDSP_maxmgv(sourceP, sourceStride, maxP, framesToProcess);
|
| -}
|
| -
|
| -void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess)
|
| -{
|
| - vDSP_svesq(const_cast<float*>(sourceP), sourceStride, sumP, framesToProcess);
|
| -}
|
| -
|
| -void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess)
|
| -{
|
| - vDSP_vclip(const_cast<float*>(sourceP), sourceStride, const_cast<float*>(lowThresholdP), const_cast<float*>(highThresholdP), destP, destStride, framesToProcess);
|
| -}
|
| -#else
|
| -
|
| void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
|
| {
|
| int n = framesToProcess;
|
|
|
| -#if CPU(X86) || CPU(X86_64)
|
| if ((sourceStride == 1) && (destStride == 1)) {
|
| float k = *scale;
|
|
|
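The hunk above hoists SSE2_MULT_ADD and SSE2_MULT to file scope so the functions below can share them, and drops the OS(MACOSX) wrappers entirely. The ## token pasting splices the load/store mnemonic into the intrinsic names, so each invocation selects aligned or unaligned memory operations. As a rough illustration (not part of the patch; the free-standing function name and signature are invented here), SSE2_MULT_ADD(loadu, storeu) expands to a loop equivalent to:

    #include <emmintrin.h>

    // Roughly what SSE2_MULT_ADD(loadu, storeu) expands to once the preprocessor
    // pastes "loadu"/"storeu" into the intrinsic names. The locals mirror the ones
    // vsma() declares around the macro; illustrative sketch only.
    static void multAddUnalignedDest(const float* sourceP, float* destP,
                                     const float* endP, __m128 mScale)
    {
        while (destP < endP) {
            __m128 pSource = _mm_load_ps(sourceP);     // aligned load: the caller's prologue aligned sourceP
            __m128 temp = _mm_mul_ps(pSource, mScale); // source * scale
            __m128 dest = _mm_loadu_ps(destP);         // "loadu": destination may be unaligned
            dest = _mm_add_ps(dest, temp);             // dest += source * scale
            _mm_storeu_ps(destP, dest);                // "storeu": unaligned store back
            sourceP += 4;
            destP += 4;
        }
    }

Keeping one macro per kernel avoids writing four near-identical loops in every function while still letting aligned instructions be used wherever the pointers allow it.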
| @@ -148,18 +84,6 @@ void vsma(const float* sourceP, int sourceStride, const float* scale, float* des
|
|
|
| bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F);
|
|
|
| -#define SSE2_MULT_ADD(loadInstr, storeInstr) \
|
| - while (destP < endP) \
|
| - { \
|
| - pSource = _mm_load_ps(sourceP); \
|
| - temp = _mm_mul_ps(pSource, mScale); \
|
| - dest = _mm_##loadInstr##_ps(destP); \
|
| - dest = _mm_add_ps(dest, temp); \
|
| - _mm_##storeInstr##_ps(destP, dest); \
|
| - sourceP += 4; \
|
| - destP += 4; \
|
| - }
|
| -
|
| if (destAligned)
|
| SSE2_MULT_ADD(load, store)
|
| else
|
| @@ -167,25 +91,7 @@ void vsma(const float* sourceP, int sourceStride, const float* scale, float* des
|
|
|
| n = tailFrames;
|
| }
|
| -#elif HAVE(ARM_NEON_INTRINSICS)
|
| - if ((sourceStride == 1) && (destStride == 1)) {
|
| - int tailFrames = n % 4;
|
| - const float* endP = destP + n - tailFrames;
|
| -
|
| - float32x4_t k = vdupq_n_f32(*scale);
|
| - while (destP < endP) {
|
| - float32x4_t source = vld1q_f32(sourceP);
|
| - float32x4_t dest = vld1q_f32(destP);
|
| -
|
| - dest = vmlaq_f32(dest, source, k);
|
| - vst1q_f32(destP, dest);
|
|
|
| - sourceP += 4;
|
| - destP += 4;
|
| - }
|
| - n = tailFrames;
|
| - }
|
| -#endif
|
| while (n) {
|
| *destP += *sourceP * *scale;
|
| sourceP += sourceStride;
|
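Taken together, the vsma() hunks leave the usual three-phase shape for the unit-stride case: a prologue (just above the shown context) that peels frames until sourceP is 16-byte aligned, a vectorized body that handles four frames per iteration up to endP, and a scalar loop over the n % 4 tail frames. A compact sketch of that shape, with the invented name vsmaContiguous, unaligned loads/stores instead of the aligned/unaligned dispatch, and only SSE2 assumed:

    #include <emmintrin.h>
    #include <stddef.h>

    // Illustrative sketch of vsma()'s unit-stride path: destP[i] += sourceP[i] * k.
    // The real code additionally peels frames until sourceP is 16-byte aligned and
    // dispatches on destP's alignment; unaligned loads/stores keep this short.
    static void vsmaContiguous(const float* sourceP, float k, float* destP, size_t framesToProcess)
    {
        size_t n = framesToProcess;
        size_t tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        __m128 mScale = _mm_set1_ps(k);
        while (destP < endP) {                         // four frames per iteration
            __m128 source = _mm_loadu_ps(sourceP);
            __m128 dest = _mm_loadu_ps(destP);
            _mm_storeu_ps(destP, _mm_add_ps(dest, _mm_mul_ps(source, mScale)));
            sourceP += 4;
            destP += 4;
        }
        n = tailFrames;
        while (n--) {                                  // scalar tail, as in the patch
            *destP += *sourceP * k;
            ++sourceP;
            ++destP;
        }
    }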
| @@ -198,7 +104,6 @@ void vsmul(const float* sourceP, int sourceStride, const float* scale, float* de
|
| {
|
| int n = framesToProcess;
|
|
|
| -#if CPU(X86) || CPU(X86_64)
|
| if ((sourceStride == 1) && (destStride == 1)) {
|
| float k = *scale;
|
|
|
| @@ -247,38 +152,19 @@ void vsmul(const float* sourceP, int sourceStride, const float* scale, float* de
|
| n--;
|
| }
|
| } else { // If strides are not 1, rollback to normal algorithm.
|
| -#elif HAVE(ARM_NEON_INTRINSICS)
|
| - if ((sourceStride == 1) && (destStride == 1)) {
|
| float k = *scale;
|
| - int tailFrames = n % 4;
|
| - const float* endP = destP + n - tailFrames;
|
| -
|
| - while (destP < endP) {
|
| - float32x4_t source = vld1q_f32(sourceP);
|
| - vst1q_f32(destP, vmulq_n_f32(source, k));
|
| -
|
| - sourceP += 4;
|
| - destP += 4;
|
| + while (n--) {
|
| + *destP = k * *sourceP;
|
| + sourceP += sourceStride;
|
| + destP += destStride;
|
| }
|
| - n = tailFrames;
|
| - }
|
| -#endif
|
| - float k = *scale;
|
| - while (n--) {
|
| - *destP = k * *sourceP;
|
| - sourceP += sourceStride;
|
| - destP += destStride;
|
| - }
|
| -#if CPU(X86) || CPU(X86_64)
|
| }
|
| -#endif
|
| }
|
|
|
| void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
|
| {
|
| int n = framesToProcess;
|
|
|
| -#if CPU(X86) || CPU(X86_64)
|
| if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) {
|
| // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
|
| while ((reinterpret_cast<size_t>(source1P) & 0x0F) && n) {
|
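The comment closing the hunk above describes the alignment prologue used throughout this file: _mm_load_ps requires 16-byte alignment and a float is 4 bytes, so at most three leading frames ever need scalar handling before the pointer reaches a 16-byte boundary. A minimal sketch of that peeling step (the helper name is invented, and the real loops advance the other pointers and compute the real scalar result rather than a copy):

    #include <stdint.h>
    #include <stddef.h>

    // Process frames one by one until sourceP is 16-byte aligned (at most three
    // iterations for 4-byte floats), so the main loop can use aligned loads.
    static size_t peelToAlignment(const float*& sourceP, float*& destP, size_t& n)
    {
        size_t peeled = 0;
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            *destP = *sourceP;   // stand-in for the function's scalar body (e.g. a sum or product)
            ++sourceP;
            ++destP;
            --n;
            ++peeled;
        }
        return peeled;
    }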
| @@ -358,40 +244,18 @@ void vadd(const float* source1P, int sourceStride1, const float* source2P, int s
|
| n--;
|
| }
|
| } else { // if strides are not 1, rollback to normal algorithm
|
| -#elif HAVE(ARM_NEON_INTRINSICS)
|
| - if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) {
|
| - int tailFrames = n % 4;
|
| - const float* endP = destP + n - tailFrames;
|
| -
|
| - while (destP < endP) {
|
| - float32x4_t source1 = vld1q_f32(source1P);
|
| - float32x4_t source2 = vld1q_f32(source2P);
|
| - vst1q_f32(destP, vaddq_f32(source1, source2));
|
| -
|
| - source1P += 4;
|
| - source2P += 4;
|
| - destP += 4;
|
| + while (n--) {
|
| + *destP = *source1P + *source2P;
|
| + source1P += sourceStride1;
|
| + source2P += sourceStride2;
|
| + destP += destStride;
|
| }
|
| - n = tailFrames;
|
| - }
|
| -#endif
|
| - while (n--) {
|
| - *destP = *source1P + *source2P;
|
| - source1P += sourceStride1;
|
| - source2P += sourceStride2;
|
| - destP += destStride;
|
| - }
|
| -#if CPU(X86) || CPU(X86_64)
|
| - }
|
| -#endif
|
| }
|
|
|
| void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
|
| {
|
| -
|
| int n = framesToProcess;
|
|
|
| -#if CPU(X86) || CPU(X86_64)
|
| if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
|
| // If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
|
| while ((reinterpret_cast<uintptr_t>(source1P) & 0x0F) && n) {
|
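With the #if CPU(X86) || CPU(X86_64) and NEON branches gone, every build now follows the same two routes: the SSE path whenever all strides are 1, and the plain strided loop otherwise (now an explicit else branch rather than fall-through code). A small usage sketch of the functions touched so far; the buffers, sizes and gain value are made up, only the signatures and the include path come from this file:

    #include "platform/audio/VectorMath.h"

    void exampleUsage()
    {
        float source[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
        float accumulator[8] = { 0 };
        float sum[8];
        float gain = 0.5f;

        // accumulator[i] += source[i] * gain for 8 contiguous frames (stride 1).
        blink::VectorMath::vsma(source, 1, &gain, accumulator, 1, 8);

        // sum[i] = source[i] + accumulator[i]; unit strides take the SSE path.
        blink::VectorMath::vadd(source, 1, accumulator, 1, sum, 1, 8);

        // Strides other than 1 use the scalar loop: scale every other frame,
        // reading source[0,2,4,6] and writing accumulator[0,2,4,6].
        blink::VectorMath::vsmul(source, 2, &gain, accumulator, 2, 4);
    }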
| @@ -412,18 +276,6 @@ void vmul(const float* source1P, int sourceStride1, const float* source2P, int s
|
| bool source2Aligned = !(reinterpret_cast<uintptr_t>(source2P) & 0x0F);
|
| bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F);
|
|
|
| -#define SSE2_MULT(loadInstr, storeInstr) \
|
| - while (destP < endP) \
|
| - { \
|
| - pSource1 = _mm_load_ps(source1P); \
|
| - pSource2 = _mm_##loadInstr##_ps(source2P); \
|
| - dest = _mm_mul_ps(pSource1, pSource2); \
|
| - _mm_##storeInstr##_ps(destP, dest); \
|
| - source1P += 4; \
|
| - source2P += 4; \
|
| - destP += 4; \
|
| - }
|
| -
|
| if (source2Aligned && destAligned) // Both aligned.
|
| SSE2_MULT(load, store)
|
| else if (source2Aligned && !destAligned) // Source2 is aligned but dest not.
|
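The if/else chain above feeds SSE2_MULT with the right combination for source2 and destination alignment; the remaining branches fall outside the shown context. The same idea without the macro, sketched as a kernel parameterized on the two alignment flags (names invented here):

    #include <emmintrin.h>

    // One multiply kernel per alignment combination, mirroring the
    // SSE2_MULT(load|loadu, store|storeu) variants the if/else chain selects.
    template <bool source2Aligned, bool destAligned>
    static void mulKernel(const float* source1P, const float* source2P,
                          float* destP, const float* endP)
    {
        while (destP < endP) {
            __m128 a = _mm_load_ps(source1P); // source1P was aligned by the prologue
            __m128 b = source2Aligned ? _mm_load_ps(source2P) : _mm_loadu_ps(source2P);
            __m128 product = _mm_mul_ps(a, b);
            if (destAligned)
                _mm_store_ps(destP, product);
            else
                _mm_storeu_ps(destP, product);
            source1P += 4;
            source2P += 4;
            destP += 4;
        }
    }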
| @@ -435,23 +287,7 @@ void vmul(const float* source1P, int sourceStride1, const float* source2P, int s
|
|
|
| n = tailFrames;
|
| }
|
| -#elif HAVE(ARM_NEON_INTRINSICS)
|
| - if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) {
|
| - int tailFrames = n % 4;
|
| - const float* endP = destP + n - tailFrames;
|
| -
|
| - while (destP < endP) {
|
| - float32x4_t source1 = vld1q_f32(source1P);
|
| - float32x4_t source2 = vld1q_f32(source2P);
|
| - vst1q_f32(destP, vmulq_f32(source1, source2));
|
|
|
| - source1P += 4;
|
| - source2P += 4;
|
| - destP += 4;
|
| - }
|
| - n = tailFrames;
|
| - }
|
| -#endif
|
| while (n) {
|
| *destP = *source1P * *source2P;
|
| source1P += sourceStride1;
|
| @@ -464,7 +300,7 @@ void vmul(const float* source1P, int sourceStride1, const float* source2P, int s
|
| void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess)
|
| {
|
| unsigned i = 0;
|
| -#if CPU(X86) || CPU(X86_64)
|
| +
|
| // Only use the SSE optimization in the very common case that all addresses are 16-byte aligned.
|
| // Otherwise, fall through to the scalar code below.
|
| if (!(reinterpret_cast<uintptr_t>(real1P) & 0x0F)
|
| @@ -489,23 +325,7 @@ void zvmul(const float* real1P, const float* imag1P, const float* real2P, const
|
| i += 4;
|
| }
|
| }
|
| -#elif HAVE(ARM_NEON_INTRINSICS)
|
| - unsigned endSize = framesToProcess - framesToProcess % 4;
|
| - while (i < endSize) {
|
| - float32x4_t real1 = vld1q_f32(real1P + i);
|
| - float32x4_t real2 = vld1q_f32(real2P + i);
|
| - float32x4_t imag1 = vld1q_f32(imag1P + i);
|
| - float32x4_t imag2 = vld1q_f32(imag2P + i);
|
|
|
| - float32x4_t realResult = vmlsq_f32(vmulq_f32(real1, real2), imag1, imag2);
|
| - float32x4_t imagResult = vmlaq_f32(vmulq_f32(real1, imag2), imag1, real2);
|
| -
|
| - vst1q_f32(realDestP + i, realResult);
|
| - vst1q_f32(imagDestP + i, imagResult);
|
| -
|
| - i += 4;
|
| - }
|
| -#endif
|
| for (; i < framesToProcess; ++i) {
|
| // Read and compute result before storing them, in case the
|
| // destination is the same as one of the sources.
|
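The scalar loop kept in zvmul() implements the complex product (a + bi)(c + di) = (ac - bd) + (ad + bc)i, which is also what the removed NEON vmlsq/vmlaq pair computed. As the comment notes, both parts go into temporaries before any store so that in-place use, where a destination aliases one of the sources, stays correct. A standalone scalar sketch of that body (function name invented):

    // Aliasing-safe scalar complex multiply: compute both results before storing,
    // so realDestP/imagDestP may alias the real/imaginary inputs.
    static void complexMultiply(const float* real1P, const float* imag1P,
                                const float* real2P, const float* imag2P,
                                float* realDestP, float* imagDestP, unsigned framesToProcess)
    {
        for (unsigned i = 0; i < framesToProcess; ++i) {
            float realResult = real1P[i] * real2P[i] - imag1P[i] * imag2P[i];
            float imagResult = real1P[i] * imag2P[i] + imag1P[i] * real2P[i];
            realDestP[i] = realResult;
            imagDestP[i] = imagResult;
        }
    }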
| @@ -522,7 +342,6 @@ void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesTo
|
| int n = framesToProcess;
|
| float sum = 0;
|
|
|
| -#if CPU(X86) || CPU(X86_64)
|
| if (sourceStride == 1) {
|
| // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
|
| while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
|
| @@ -551,26 +370,6 @@ void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesTo
|
|
|
| n = tailFrames;
|
| }
|
| -#elif HAVE(ARM_NEON_INTRINSICS)
|
| - if (sourceStride == 1) {
|
| - int tailFrames = n % 4;
|
| - const float* endP = sourceP + n - tailFrames;
|
| -
|
| - float32x4_t fourSum = vdupq_n_f32(0);
|
| - while (sourceP < endP) {
|
| - float32x4_t source = vld1q_f32(sourceP);
|
| - fourSum = vmlaq_f32(fourSum, source, source);
|
| - sourceP += 4;
|
| - }
|
| - float32x2_t twoSum = vadd_f32(vget_low_f32(fourSum), vget_high_f32(fourSum));
|
| -
|
| - float groupSum[2];
|
| - vst1_f32(groupSum, twoSum);
|
| - sum += groupSum[0] + groupSum[1];
|
| -
|
| - n = tailFrames;
|
| - }
|
| -#endif
|
|
|
| while (n--) {
|
| float sample = *sourceP;
|
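vsvesq() accumulates a sum of squares. The removed NEON branch kept a four-lane accumulator and folded it into a scalar at the end; the SSE path retained above it does the same job. A minimal SSE sketch of that accumulate-then-reduce pattern, assuming a contiguous 16-byte aligned buffer whose length is a multiple of four (the invented helper skips the prologue and tail handling the real function keeps):

    #include <emmintrin.h>
    #include <stddef.h>

    // Four-lane sum of squares with a final horizontal reduction.
    static float sumOfSquares(const float* sourceP, size_t framesToProcess)
    {
        __m128 acc = _mm_setzero_ps();
        for (size_t i = 0; i < framesToProcess; i += 4) {
            __m128 source = _mm_load_ps(sourceP + i);           // 16-byte aligned input
            acc = _mm_add_ps(acc, _mm_mul_ps(source, source));  // acc += source^2, per lane
        }
        float lanes[4];
        _mm_storeu_ps(lanes, acc);                              // fold the four partial sums
        return lanes[0] + lanes[1] + lanes[2] + lanes[3];
    }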
| @@ -587,7 +386,6 @@ void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesT
|
| int n = framesToProcess;
|
| float max = 0;
|
|
|
| -#if CPU(X86) || CPU(X86_64)
|
| if (sourceStride == 1) {
|
| // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
|
| while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
|
| @@ -621,26 +419,6 @@ void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesT
|
|
|
| n = tailFrames;
|
| }
|
| -#elif HAVE(ARM_NEON_INTRINSICS)
|
| - if (sourceStride == 1) {
|
| - int tailFrames = n % 4;
|
| - const float* endP = sourceP + n - tailFrames;
|
| -
|
| - float32x4_t fourMax = vdupq_n_f32(0);
|
| - while (sourceP < endP) {
|
| - float32x4_t source = vld1q_f32(sourceP);
|
| - fourMax = vmaxq_f32(fourMax, vabsq_f32(source));
|
| - sourceP += 4;
|
| - }
|
| - float32x2_t twoMax = vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourMax));
|
| -
|
| - float groupMax[2];
|
| - vst1_f32(groupMax, twoMax);
|
| - max = std::max(groupMax[0], groupMax[1]);
|
| -
|
| - n = tailFrames;
|
| - }
|
| -#endif
|
|
|
| while (n--) {
|
| max = std::max(max, fabsf(*sourceP));
|
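vmaxmgv() tracks the largest magnitude. The removed NEON branch used vabsq_f32 with a running vmaxq_f32; SSE has no absolute-value intrinsic, so a common approach, shown here purely as an assumption rather than a description of the retained SSE code, is to clear the sign bit with a mask before taking the per-lane max:

    #include <emmintrin.h>
    #include <algorithm>
    #include <stddef.h>

    // Four-lane max of |x|: mask off the sign bit, keep a running max, reduce at the end.
    // Assumes a 16-byte aligned buffer whose length is a multiple of four.
    static float maxMagnitude(const float* sourceP, size_t framesToProcess)
    {
        const __m128 absMask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
        __m128 runningMax = _mm_setzero_ps();
        for (size_t i = 0; i < framesToProcess; i += 4) {
            __m128 magnitudes = _mm_and_ps(_mm_load_ps(sourceP + i), absMask); // |x| per lane
            runningMax = _mm_max_ps(runningMax, magnitudes);
        }
        float lanes[4];
        _mm_storeu_ps(lanes, runningMax);
        return std::max(std::max(lanes[0], lanes[1]), std::max(lanes[2], lanes[3]));
    }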
| @@ -658,22 +436,6 @@ void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, c
|
| float highThreshold = *highThresholdP;
|
|
|
| // FIXME: Optimize for SSE2.
|
| -#if HAVE(ARM_NEON_INTRINSICS)
|
| - if ((sourceStride == 1) && (destStride == 1)) {
|
| - int tailFrames = n % 4;
|
| - const float* endP = destP + n - tailFrames;
|
| -
|
| - float32x4_t low = vdupq_n_f32(lowThreshold);
|
| - float32x4_t high = vdupq_n_f32(highThreshold);
|
| - while (destP < endP) {
|
| - float32x4_t source = vld1q_f32(sourceP);
|
| - vst1q_f32(destP, vmaxq_f32(vminq_f32(source, high), low));
|
| - sourceP += 4;
|
| - destP += 4;
|
| - }
|
| - n = tailFrames;
|
| - }
|
| -#endif
|
| while (n--) {
|
| *destP = std::max(std::min(*sourceP, highThreshold), lowThreshold);
|
| sourceP += sourceStride;
|
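The hunk above removes the NEON clip loop, leaving vclip() fully scalar with the earlier "FIXME: Optimize for SSE2" still open. Purely as a sketch of one way that FIXME could go (not part of this patch), the removed vmaxq/vminq pattern translates directly to _mm_max_ps/_mm_min_ps:

    #include <emmintrin.h>

    // Clamp four frames at a time to [lowThreshold, highThreshold]; unaligned
    // loads/stores keep the sketch simple, and tail frames would stay scalar.
    static void clipQuads(const float* sourceP, float* destP, const float* endP,
                          float lowThreshold, float highThreshold)
    {
        const __m128 low = _mm_set1_ps(lowThreshold);
        const __m128 high = _mm_set1_ps(highThreshold);
        while (destP < endP) {
            __m128 source = _mm_loadu_ps(sourceP);
            _mm_storeu_ps(destP, _mm_max_ps(_mm_min_ps(source, high), low));
            sourceP += 4;
            destP += 4;
        }
    }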
| @@ -681,8 +443,6 @@ void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, c
|
| }
|
| }
|
|
|
| -#endif // OS(MACOSX)
|
| -
|
| } // namespace VectorMath
|
|
|
| } // namespace blink
|
|
|