| Index: Source/platform/audio/VectorMath.cpp
|
| diff --git a/Source/platform/audio/VectorMath.cpp b/Source/platform/audio/VectorMath.cpp
|
| index 219ed5463977ac5cc172f1117fd0cca3f9008a9a..357957a5085a6b8b330fdf73b4d74a031cb681f2 100644
|
| --- a/Source/platform/audio/VectorMath.cpp
|
| +++ b/Source/platform/audio/VectorMath.cpp
|
| @@ -27,105 +27,41 @@
|
| #if ENABLE(WEB_AUDIO)
|
|
|
| #include "platform/audio/VectorMath.h"
|
| -#include "wtf/Assertions.h"
|
| -#include "wtf/CPU.h"
|
| -#include <stdint.h>
|
| -
|
| -#if OS(MACOSX)
|
| -#include <Accelerate/Accelerate.h>
|
| -#endif
|
|
|
| -#if CPU(X86) || CPU(X86_64)
|
| +#include "wtf/Assertions.h"
|
| #include <emmintrin.h>
|
| -#endif
|
| +#include <stdint.h>
|
|
|
| -#if HAVE(ARM_NEON_INTRINSICS)
|
| -#include <arm_neon.h>
|
| -#endif
|
| +#define SSE2_MULT_ADD(loadInstr, storeInstr) \
|
| + while (destP < endP) { \
|
| + pSource = _mm_load_ps(sourceP); \
|
| + temp = _mm_mul_ps(pSource, mScale); \
|
| + dest = _mm_##loadInstr##_ps(destP); \
|
| + dest = _mm_add_ps(dest, temp); \
|
| + _mm_##storeInstr##_ps(destP, dest); \
|
| + sourceP += 4; \
|
| + destP += 4; \
|
| + } \
|
|
|
| -#include <math.h>
|
| -#include <algorithm>
|
| +#define SSE2_MULT(loadInstr, storeInstr) \
|
| + while (destP < endP) { \
|
| + pSource1 = _mm_load_ps(source1P); \
|
| + pSource2 = _mm_##loadInstr##_ps(source2P); \
|
| + dest = _mm_mul_ps(pSource1, pSource2); \
|
| + _mm_##storeInstr##_ps(destP, dest); \
|
| + source1P += 4; \
|
| + source2P += 4; \
|
| + destP += 4; \
|
| + } \
|
|
|
| namespace blink {
|
|
|
| namespace VectorMath {
|
|
|
| -#if OS(MACOSX)
|
| -// On the Mac we use the highly optimized versions in Accelerate.framework
|
| -// In 32-bit mode (__ppc__ or __i386__) <Accelerate/Accelerate.h> includes <vecLib/vDSP_translate.h> which defines macros of the same name as
|
| -// our namespaced function names, so we must handle this case differently. Other architectures (64bit, ARM, etc.) do not include this header file.
|
| -
|
| -void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
|
| -{
|
| -#if CPU(X86)
|
| - ::vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess);
|
| -#else
|
| - vDSP_vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess);
|
| -#endif
|
| -}
|
| -
|
| -void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
|
| -{
|
| -#if CPU(X86)
|
| - ::vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
|
| -#else
|
| - vDSP_vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
|
| -#endif
|
| -}
|
| -
|
| -void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
|
| -{
|
| -#if CPU(X86)
|
| - ::vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
|
| -#else
|
| - vDSP_vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
|
| -#endif
|
| -}
|
| -
|
| -void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess)
|
| -{
|
| - DSPSplitComplex sc1;
|
| - DSPSplitComplex sc2;
|
| - DSPSplitComplex dest;
|
| - sc1.realp = const_cast<float*>(real1P);
|
| - sc1.imagp = const_cast<float*>(imag1P);
|
| - sc2.realp = const_cast<float*>(real2P);
|
| - sc2.imagp = const_cast<float*>(imag2P);
|
| - dest.realp = realDestP;
|
| - dest.imagp = imagDestP;
|
| -#if CPU(X86)
|
| - ::zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1);
|
| -#else
|
| - vDSP_zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1);
|
| -#endif
|
| -}
|
| -
|
| -void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
|
| -{
|
| - vDSP_vsma(sourceP, sourceStride, scale, destP, destStride, destP, destStride, framesToProcess);
|
| -}
|
| -
|
| -void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess)
|
| -{
|
| - vDSP_maxmgv(sourceP, sourceStride, maxP, framesToProcess);
|
| -}
|
| -
|
| -void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess)
|
| -{
|
| - vDSP_svesq(const_cast<float*>(sourceP), sourceStride, sumP, framesToProcess);
|
| -}
|
| -
|
| -void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess)
|
| -{
|
| - vDSP_vclip(const_cast<float*>(sourceP), sourceStride, const_cast<float*>(lowThresholdP), const_cast<float*>(highThresholdP), destP, destStride, framesToProcess);
|
| -}
|
| -#else
|
| -
|
| void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
|
| {
|
| int n = framesToProcess;
|
|
|
| -#if CPU(X86) || CPU(X86_64)
|
| if ((sourceStride == 1) && (destStride == 1)) {
|
| float k = *scale;
|
|
|
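The hunk above hoists SSE2_MULT_ADD and SSE2_MULT to file scope so the functions below can share them, and drops the OS(MACOSX) wrappers entirely. The ## token pasting splices the load/store mnemonic into the intrinsic names, so each invocation selects aligned or unaligned memory operations. As a rough illustration (not part of the patch; the free-standing function name and signature are invented here), SSE2_MULT_ADD(loadu, storeu) expands to a loop equivalent to:

    #include <emmintrin.h>

    // Roughly what SSE2_MULT_ADD(loadu, storeu) expands to once the preprocessor
    // pastes "loadu"/"storeu" into the intrinsic names. The locals mirror the ones
    // vsma() declares around the macro; illustrative sketch only.
    static void multAddUnalignedDest(const float* sourceP, float* destP,
                                     const float* endP, __m128 mScale)
    {
        while (destP < endP) {
            __m128 pSource = _mm_load_ps(sourceP);     // aligned load: the caller's prologue aligned sourceP
            __m128 temp = _mm_mul_ps(pSource, mScale); // source * scale
            __m128 dest = _mm_loadu_ps(destP);         // "loadu": destination may be unaligned
            dest = _mm_add_ps(dest, temp);             // dest += source * scale
            _mm_storeu_ps(destP, dest);                // "storeu": unaligned store back
            sourceP += 4;
            destP += 4;
        }
    }

Keeping one macro per kernel avoids writing four near-identical loops in every function while still letting aligned instructions be used wherever the pointers allow it.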
| @@ -148,18 +84,6 @@ void vsma(const float* sourceP, int sourceStride, const float* scale, float* des
|
|
|
| bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F);
|
|
|
| -#define SSE2_MULT_ADD(loadInstr, storeInstr) \
|
| - while (destP < endP) \
|
| - { \
|
| - pSource = _mm_load_ps(sourceP); \
|
| - temp = _mm_mul_ps(pSource, mScale); \
|
| - dest = _mm_##loadInstr##_ps(destP); \
|
| - dest = _mm_add_ps(dest, temp); \
|
| - _mm_##storeInstr##_ps(destP, dest); \
|
| - sourceP += 4; \
|
| - destP += 4; \
|
| - }
|
| -
|
| if (destAligned)
|
| SSE2_MULT_ADD(load, store)
|
| else
|
| @@ -167,25 +91,7 @@ void vsma(const float* sourceP, int sourceStride, const float* scale, float* des
|
|
|
| n = tailFrames;
|
| }
|
| -#elif HAVE(ARM_NEON_INTRINSICS)
|
| - if ((sourceStride == 1) && (destStride == 1)) {
|
| - int tailFrames = n % 4;
|
| - const float* endP = destP + n - tailFrames;
|
| -
|
| - float32x4_t k = vdupq_n_f32(*scale);
|
| - while (destP < endP) {
|
| - float32x4_t source = vld1q_f32(sourceP);
|
| - float32x4_t dest = vld1q_f32(destP);
|
| -
|
| - dest = vmlaq_f32(dest, source, k);
|
| - vst1q_f32(destP, dest);
|
|
|
| - sourceP += 4;
|
| - destP += 4;
|
| - }
|
| - n = tailFrames;
|
| - }
|
| -#endif
|
| while (n) {
|
| *destP += *sourceP * *scale;
|
| sourceP += sourceStride;
|
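Taken together, the vsma() hunks leave the usual three-phase shape for the unit-stride case: a prologue (just above the shown context) that peels frames until sourceP is 16-byte aligned, a vectorized body that handles four frames per iteration up to endP, and a scalar loop over the n % 4 tail frames. A compact sketch of that shape, with the invented name vsmaContiguous, unaligned loads/stores instead of the aligned/unaligned dispatch, and only SSE2 assumed:

    #include <emmintrin.h>
    #include <stddef.h>

    // Illustrative sketch of vsma()'s unit-stride path: destP[i] += sourceP[i] * k.
    // The real code additionally peels frames until sourceP is 16-byte aligned and
    // dispatches on destP's alignment; unaligned loads/stores keep this short.
    static void vsmaContiguous(const float* sourceP, float k, float* destP, size_t framesToProcess)
    {
        size_t n = framesToProcess;
        size_t tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        __m128 mScale = _mm_set1_ps(k);
        while (destP < endP) {                         // four frames per iteration
            __m128 source = _mm_loadu_ps(sourceP);
            __m128 dest = _mm_loadu_ps(destP);
            _mm_storeu_ps(destP, _mm_add_ps(dest, _mm_mul_ps(source, mScale)));
            sourceP += 4;
            destP += 4;
        }
        n = tailFrames;
        while (n--) {                                  // scalar tail, as in the patch
            *destP += *sourceP * k;
            ++sourceP;
            ++destP;
        }
    }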
| @@ -198,7 +104,6 @@ void vsmul(const float* sourceP, int sourceStride, const float* scale, float* de
|
| {
|
| int n = framesToProcess;
|
|
|
| -#if CPU(X86) || CPU(X86_64)
|
| if ((sourceStride == 1) && (destStride == 1)) {
|
| float k = *scale;
|
|
|
| @@ -247,38 +152,19 @@ void vsmul(const float* sourceP, int sourceStride, const float* scale, float* de
|
| n--;
|
| }
|
| } else { // If strides are not 1, rollback to normal algorithm.
|
| -#elif HAVE(ARM_NEON_INTRINSICS)
|
| - if ((sourceStride == 1) && (destStride == 1)) {
|
| float k = *scale;
|
| - int tailFrames = n % 4;
|
| - const float* endP = destP + n - tailFrames;
|
| -
|
| - while (destP < endP) {
|
| - float32x4_t source = vld1q_f32(sourceP);
|
| - vst1q_f32(destP, vmulq_n_f32(source, k));
|
| -
|
| - sourceP += 4;
|
| - destP += 4;
|
| + while (n--) {
|
| + *destP = k * *sourceP;
|
| + sourceP += sourceStride;
|
| + destP += destStride;
|
| }
|
| - n = tailFrames;
|
| - }
|
| -#endif
|
| - float k = *scale;
|
| - while (n--) {
|
| - *destP = k * *sourceP;
|
| - sourceP += sourceStride;
|
| - destP += destStride;
|
| - }
|
| -#if CPU(X86) || CPU(X86_64)
|
| }
|
| -#endif
|
| }
|
|
|
| void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
|
| {
|
| int n = framesToProcess;
|
|
|
| -#if CPU(X86) || CPU(X86_64)
|
| if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) {
|
| // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
|
| while ((reinterpret_cast<size_t>(source1P) & 0x0F) && n) {
|
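The comment closing the hunk above describes the alignment prologue used throughout this file: _mm_load_ps requires 16-byte alignment and a float is 4 bytes, so at most three leading frames ever need scalar handling before the pointer reaches a 16-byte boundary. A minimal sketch of that peeling step (the helper name is invented, and the real loops advance the other pointers and compute the real scalar result rather than a copy):

    #include <stdint.h>
    #include <stddef.h>

    // Process frames one by one until sourceP is 16-byte aligned (at most three
    // iterations for 4-byte floats), so the main loop can use aligned loads.
    static size_t peelToAlignment(const float*& sourceP, float*& destP, size_t& n)
    {
        size_t peeled = 0;
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            *destP = *sourceP;   // stand-in for the function's scalar body (e.g. a sum or product)
            ++sourceP;
            ++destP;
            --n;
            ++peeled;
        }
        return peeled;
    }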
| @@ -358,40 +244,18 @@ void vadd(const float* source1P, int sourceStride1, const float* source2P, int s
|
| n--;
|
| }
|
| } else { // if strides are not 1, rollback to normal algorithm
|
| -#elif HAVE(ARM_NEON_INTRINSICS)
|
| - if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) {
|
| - int tailFrames = n % 4;
|
| - const float* endP = destP + n - tailFrames;
|
| -
|
| - while (destP < endP) {
|
| - float32x4_t source1 = vld1q_f32(source1P);
|
| - float32x4_t source2 = vld1q_f32(source2P);
|
| - vst1q_f32(destP, vaddq_f32(source1, source2));
|
| -
|
| - source1P += 4;
|
| - source2P += 4;
|
| - destP += 4;
|
| + while (n--) {
|
| + *destP = *source1P + *source2P;
|
| + source1P += sourceStride1;
|
| + source2P += sourceStride2;
|
| + destP += destStride;
|
| }
|
| - n = tailFrames;
|
| - }
|
| -#endif
|
| - while (n--) {
|
| - *destP = *source1P + *source2P;
|
| - source1P += sourceStride1;
|
| - source2P += sourceStride2;
|
| - destP += destStride;
|
| - }
|
| -#if CPU(X86) || CPU(X86_64)
|
| - }
|
| -#endif
|
| }
|
|
|
| void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
|
| {
|
| -
|
| int n = framesToProcess;
|
|
|
| -#if CPU(X86) || CPU(X86_64)
|
| if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
|
| // If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
|
| while ((reinterpret_cast<uintptr_t>(source1P) & 0x0F) && n) {
|
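With the #if CPU(X86) || CPU(X86_64) and NEON branches gone, every build now follows the same two routes: the SSE path whenever all strides are 1, and the plain strided loop otherwise (now an explicit else branch rather than fall-through code). A small usage sketch of the functions touched so far; the buffers, sizes and gain value are made up, only the signatures and the include path come from this file:

    #include "platform/audio/VectorMath.h"

    void exampleUsage()
    {
        float source[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
        float accumulator[8] = { 0 };
        float sum[8];
        float gain = 0.5f;

        // accumulator[i] += source[i] * gain for 8 contiguous frames (stride 1).
        blink::VectorMath::vsma(source, 1, &gain, accumulator, 1, 8);

        // sum[i] = source[i] + accumulator[i]; unit strides take the SSE path.
        blink::VectorMath::vadd(source, 1, accumulator, 1, sum, 1, 8);

        // Strides other than 1 use the scalar loop: scale every other frame,
        // reading source[0,2,4,6] and writing accumulator[0,2,4,6].
        blink::VectorMath::vsmul(source, 2, &gain, accumulator, 2, 4);
    }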
| @@ -412,18 +276,6 @@ void vmul(const float* source1P, int sourceStride1, const float* source2P, int s
|
| bool source2Aligned = !(reinterpret_cast<uintptr_t>(source2P) & 0x0F);
|
| bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F);
|
|
|
| -#define SSE2_MULT(loadInstr, storeInstr) \
|
| - while (destP < endP) \
|
| - { \
|
| - pSource1 = _mm_load_ps(source1P); \
|
| - pSource2 = _mm_##loadInstr##_ps(source2P); \
|
| - dest = _mm_mul_ps(pSource1, pSource2); \
|
| - _mm_##storeInstr##_ps(destP, dest); \
|
| - source1P += 4; \
|
| - source2P += 4; \
|
| - destP += 4; \
|
| - }
|
| -
|
| if (source2Aligned && destAligned) // Both aligned.
|
| SSE2_MULT(load, store)
|
| else if (source2Aligned && !destAligned) // Source2 is aligned but dest not.
|
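The if/else chain above feeds SSE2_MULT with the right combination for source2 and destination alignment; the remaining branches fall outside the shown context. The same idea without the macro, sketched as a kernel parameterized on the two alignment flags (names invented here):

    #include <emmintrin.h>

    // One multiply kernel per alignment combination, mirroring the
    // SSE2_MULT(load|loadu, store|storeu) variants the if/else chain selects.
    template <bool source2Aligned, bool destAligned>
    static void mulKernel(const float* source1P, const float* source2P,
                          float* destP, const float* endP)
    {
        while (destP < endP) {
            __m128 a = _mm_load_ps(source1P); // source1P was aligned by the prologue
            __m128 b = source2Aligned ? _mm_load_ps(source2P) : _mm_loadu_ps(source2P);
            __m128 product = _mm_mul_ps(a, b);
            if (destAligned)
                _mm_store_ps(destP, product);
            else
                _mm_storeu_ps(destP, product);
            source1P += 4;
            source2P += 4;
            destP += 4;
        }
    }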
| @@ -435,23 +287,7 @@ void vmul(const float* source1P, int sourceStride1, const float* source2P, int s
|
|
|
| n = tailFrames;
|
| }
|
| -#elif HAVE(ARM_NEON_INTRINSICS)
|
| - if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) {
|
| - int tailFrames = n % 4;
|
| - const float* endP = destP + n - tailFrames;
|
| -
|
| - while (destP < endP) {
|
| - float32x4_t source1 = vld1q_f32(source1P);
|
| - float32x4_t source2 = vld1q_f32(source2P);
|
| - vst1q_f32(destP, vmulq_f32(source1, source2));
|
|
|
| - source1P += 4;
|
| - source2P += 4;
|
| - destP += 4;
|
| - }
|
| - n = tailFrames;
|
| - }
|
| -#endif
|
| while (n) {
|
| *destP = *source1P * *source2P;
|
| source1P += sourceStride1;
|
| @@ -464,7 +300,7 @@ void vmul(const float* source1P, int sourceStride1, const float* source2P, int s
|
| void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess)
|
| {
|
| unsigned i = 0;
|
| -#if CPU(X86) || CPU(X86_64)
|
| +
|
| // Only use the SSE optimization in the very common case that all addresses are 16-byte aligned.
|
| // Otherwise, fall through to the scalar code below.
|
| if (!(reinterpret_cast<uintptr_t>(real1P) & 0x0F)
|
| @@ -489,23 +325,7 @@ void zvmul(const float* real1P, const float* imag1P, const float* real2P, const
|
| i += 4;
|
| }
|
| }
|
| -#elif HAVE(ARM_NEON_INTRINSICS)
|
| - unsigned endSize = framesToProcess - framesToProcess % 4;
|
| - while (i < endSize) {
|
| - float32x4_t real1 = vld1q_f32(real1P + i);
|
| - float32x4_t real2 = vld1q_f32(real2P + i);
|
| - float32x4_t imag1 = vld1q_f32(imag1P + i);
|
| - float32x4_t imag2 = vld1q_f32(imag2P + i);
|
|
|
| - float32x4_t realResult = vmlsq_f32(vmulq_f32(real1, real2), imag1, imag2);
|
| - float32x4_t imagResult = vmlaq_f32(vmulq_f32(real1, imag2), imag1, real2);
|
| -
|
| - vst1q_f32(realDestP + i, realResult);
|
| - vst1q_f32(imagDestP + i, imagResult);
|
| -
|
| - i += 4;
|
| - }
|
| -#endif
|
| for (; i < framesToProcess; ++i) {
|
| // Read and compute result before storing them, in case the
|
| // destination is the same as one of the sources.
|
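The scalar loop kept in zvmul() implements the complex product (a + bi)(c + di) = (ac - bd) + (ad + bc)i, which is also what the removed NEON vmlsq/vmlaq pair computed. As the comment notes, both parts go into temporaries before any store so that in-place use, where a destination aliases one of the sources, stays correct. A standalone scalar sketch of that body (function name invented):

    // Aliasing-safe scalar complex multiply: compute both results before storing,
    // so realDestP/imagDestP may alias the real/imaginary inputs.
    static void complexMultiply(const float* real1P, const float* imag1P,
                                const float* real2P, const float* imag2P,
                                float* realDestP, float* imagDestP, unsigned framesToProcess)
    {
        for (unsigned i = 0; i < framesToProcess; ++i) {
            float realResult = real1P[i] * real2P[i] - imag1P[i] * imag2P[i];
            float imagResult = real1P[i] * imag2P[i] + imag1P[i] * real2P[i];
            realDestP[i] = realResult;
            imagDestP[i] = imagResult;
        }
    }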
| @@ -522,7 +342,6 @@ void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesTo
|
| int n = framesToProcess;
|
| float sum = 0;
|
|
|
| -#if CPU(X86) || CPU(X86_64)
|
| if (sourceStride == 1) {
|
| // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
|
| while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
|
| @@ -551,26 +370,6 @@ void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesTo
|
|
|
| n = tailFrames;
|
| }
|
| -#elif HAVE(ARM_NEON_INTRINSICS)
|
| - if (sourceStride == 1) {
|
| - int tailFrames = n % 4;
|
| - const float* endP = sourceP + n - tailFrames;
|
| -
|
| - float32x4_t fourSum = vdupq_n_f32(0);
|
| - while (sourceP < endP) {
|
| - float32x4_t source = vld1q_f32(sourceP);
|
| - fourSum = vmlaq_f32(fourSum, source, source);
|
| - sourceP += 4;
|
| - }
|
| - float32x2_t twoSum = vadd_f32(vget_low_f32(fourSum), vget_high_f32(fourSum));
|
| -
|
| - float groupSum[2];
|
| - vst1_f32(groupSum, twoSum);
|
| - sum += groupSum[0] + groupSum[1];
|
| -
|
| - n = tailFrames;
|
| - }
|
| -#endif
|
|
|
| while (n--) {
|
| float sample = *sourceP;
|
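vsvesq() accumulates a sum of squares. The removed NEON branch kept a four-lane accumulator and folded it into a scalar at the end; the SSE path retained above it does the same job. A minimal SSE sketch of that accumulate-then-reduce pattern, assuming a contiguous 16-byte aligned buffer whose length is a multiple of four (the invented helper skips the prologue and tail handling the real function keeps):

    #include <emmintrin.h>
    #include <stddef.h>

    // Four-lane sum of squares with a final horizontal reduction.
    static float sumOfSquares(const float* sourceP, size_t framesToProcess)
    {
        __m128 acc = _mm_setzero_ps();
        for (size_t i = 0; i < framesToProcess; i += 4) {
            __m128 source = _mm_load_ps(sourceP + i);           // 16-byte aligned input
            acc = _mm_add_ps(acc, _mm_mul_ps(source, source));  // acc += source^2, per lane
        }
        float lanes[4];
        _mm_storeu_ps(lanes, acc);                              // fold the four partial sums
        return lanes[0] + lanes[1] + lanes[2] + lanes[3];
    }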
| @@ -587,7 +386,6 @@ void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesT
|
| int n = framesToProcess;
|
| float max = 0;
|
|
|
| -#if CPU(X86) || CPU(X86_64)
|
| if (sourceStride == 1) {
|
| // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
|
| while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
|
| @@ -621,26 +419,6 @@ void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesT
|
|
|
| n = tailFrames;
|
| }
|
| -#elif HAVE(ARM_NEON_INTRINSICS)
|
| - if (sourceStride == 1) {
|
| - int tailFrames = n % 4;
|
| - const float* endP = sourceP + n - tailFrames;
|
| -
|
| - float32x4_t fourMax = vdupq_n_f32(0);
|
| - while (sourceP < endP) {
|
| - float32x4_t source = vld1q_f32(sourceP);
|
| - fourMax = vmaxq_f32(fourMax, vabsq_f32(source));
|
| - sourceP += 4;
|
| - }
|
| - float32x2_t twoMax = vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourMax));
|
| -
|
| - float groupMax[2];
|
| - vst1_f32(groupMax, twoMax);
|
| - max = std::max(groupMax[0], groupMax[1]);
|
| -
|
| - n = tailFrames;
|
| - }
|
| -#endif
|
|
|
| while (n--) {
|
| max = std::max(max, fabsf(*sourceP));
|
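vmaxmgv() tracks the largest magnitude. The removed NEON branch used vabsq_f32 with a running vmaxq_f32; SSE has no absolute-value intrinsic, so a common approach, shown here purely as an assumption rather than a description of the retained SSE code, is to clear the sign bit with a mask before taking the per-lane max:

    #include <emmintrin.h>
    #include <algorithm>
    #include <stddef.h>

    // Four-lane max of |x|: mask off the sign bit, keep a running max, reduce at the end.
    // Assumes a 16-byte aligned buffer whose length is a multiple of four.
    static float maxMagnitude(const float* sourceP, size_t framesToProcess)
    {
        const __m128 absMask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
        __m128 runningMax = _mm_setzero_ps();
        for (size_t i = 0; i < framesToProcess; i += 4) {
            __m128 magnitudes = _mm_and_ps(_mm_load_ps(sourceP + i), absMask); // |x| per lane
            runningMax = _mm_max_ps(runningMax, magnitudes);
        }
        float lanes[4];
        _mm_storeu_ps(lanes, runningMax);
        return std::max(std::max(lanes[0], lanes[1]), std::max(lanes[2], lanes[3]));
    }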
| @@ -658,22 +436,6 @@ void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, c
|
| float highThreshold = *highThresholdP;
|
|
|
| // FIXME: Optimize for SSE2.
|
| -#if HAVE(ARM_NEON_INTRINSICS)
|
| - if ((sourceStride == 1) && (destStride == 1)) {
|
| - int tailFrames = n % 4;
|
| - const float* endP = destP + n - tailFrames;
|
| -
|
| - float32x4_t low = vdupq_n_f32(lowThreshold);
|
| - float32x4_t high = vdupq_n_f32(highThreshold);
|
| - while (destP < endP) {
|
| - float32x4_t source = vld1q_f32(sourceP);
|
| - vst1q_f32(destP, vmaxq_f32(vminq_f32(source, high), low));
|
| - sourceP += 4;
|
| - destP += 4;
|
| - }
|
| - n = tailFrames;
|
| - }
|
| -#endif
|
| while (n--) {
|
| *destP = std::max(std::min(*sourceP, highThreshold), lowThreshold);
|
| sourceP += sourceStride;
|
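The hunk above removes the NEON clip loop, leaving vclip() fully scalar with the earlier "FIXME: Optimize for SSE2" still open. Purely as a sketch of one way that FIXME could go (not part of this patch), the removed vmaxq/vminq pattern translates directly to _mm_max_ps/_mm_min_ps:

    #include <emmintrin.h>

    // Clamp four frames at a time to [lowThreshold, highThreshold]; unaligned
    // loads/stores keep the sketch simple, and tail frames would stay scalar.
    static void clipQuads(const float* sourceP, float* destP, const float* endP,
                          float lowThreshold, float highThreshold)
    {
        const __m128 low = _mm_set1_ps(lowThreshold);
        const __m128 high = _mm_set1_ps(highThreshold);
        while (destP < endP) {
            __m128 source = _mm_loadu_ps(sourceP);
            _mm_storeu_ps(destP, _mm_max_ps(_mm_min_ps(source, high), low));
            sourceP += 4;
            destP += 4;
        }
    }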
| @@ -681,8 +443,6 @@ void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, c
|
| }
|
| }
|
|
|
| -#endif // OS(MACOSX)
|
| -
|
| } // namespace VectorMath
|
|
|
| } // namespace blink
|
|
|