Source/platform/audio/DirectConvolver.cpp - Issue 408563003: WebAudio: Add SSE2 optimization for DirectConvolver

Unified Diff: Source/platform/audio/DirectConvolver.cpp

Issue 408563003: WebAudio: Add SSE2 optimization for DirectConvolver (Closed) Base URL: svn://svn.chromium.org/blink/trunk

Patch Set: Created 6 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: Source/platform/audio/DirectConvolver.cpp

diff --git a/Source/platform/audio/DirectConvolver.cpp b/Source/platform/audio/DirectConvolver.cpp

index b801cee30a24202e400a115cf91e96a119043c23..34b02721ca19d637a1ecf76e62ef7095b72ee5b5 100644

--- a/Source/platform/audio/DirectConvolver.cpp

+++ b/Source/platform/audio/DirectConvolver.cpp

@@ -39,6 +39,10 @@

#include "platform/audio/VectorMath.h"

#include "wtf/CPU.h"

+#if CPU(X86) || CPU(X86_64)

+#include <emmintrin.h>

Ken Russell (switch to Gerrit) 2014/07/19 01:02:24 This header's available on all non-Mac OSs?

Raymond Toy 2014/07/21 17:09:57 Yes, I believe so. But we don't need to include th

+#endif

namespace blink {

using namespace VectorMath;

@@ -97,6 +101,47 @@ void DirectConvolver::process(AudioFloatArray* convolutionKernel, const float* s

vDSP_conv(inputP - kernelSize + 1, 1, kernelP + kernelSize - 1, -1, destP, 1, framesToProcess, kernelSize);

#endif // CPU(X86)

#else

+ size_t i = 0;

+#if CPU(X86) || CPU(X86_64)

+ // Convolution using SSE2. Currently only do this if both |kernelSize| and |framesToProcess|

+ // are multiples of 4. If not, use the straightforward loop below.

Ken Russell (switch to Gerrit) 2014/07/19 01:02:24 typo: straightforward

+ if ((kernelSize % 4 == 0) && (framesToProcess % 4 == 0)) {

+ // AudioFloatArray's are always aligned on at least a 16-byte boundary.

Ken Russell (switch to Gerrit) 2014/07/19 01:02:24 Sure hope that's true. I'm not verifying that asse

Raymond Toy 2014/07/21 17:09:57 For the record, it's in Source/platform/audio/Audi

+ AudioFloatArray kernelBuffer(4 * kernelSize);

+ __m128* kernelReversed = reinterpret_cast<__m128*>(kernelBuffer.data());

+ // Reverse the kernel and repeat each value across a vector

+ for (i = 0; i < kernelSize; ++i) {

+ kernelReversed[i] = _mm_set1_ps(kernelP[kernelSize - i - 1]);

+ }

+ float* inputStartP = inputP - kernelSize + 1;

+ // Do convolution with 4 inputs at a time.

+ for (i = 0; i < framesToProcess; i += 4) {

+ __m128 convolutionSum;

+ convolutionSum = _mm_setzero_ps();

+ // |kernelSize| is a multiple of 4 so we can unroll the loop by 4, manually.

+ for (size_t k = 0; k < kernelSize; k += 4) {

+ size_t dataOffset = i + k;

+ for (size_t m = 0; m < 4; ++m) {

+ __m128 sourceBlock;

+ __m128 product;

+ sourceBlock = _mm_loadu_ps(inputStartP + dataOffset + m);

+ product = _mm_mul_ps(kernelReversed[k + m], sourceBlock);

+ convolutionSum = _mm_add_ps(convolutionSum, product);

+ }

+ _mm_storeu_ps(destP + i, convolutionSum);

+ }

+ } else {

+#endif

// FIXME: The macro can be further optimized to avoid pipeline stalls. One possibility is to maintain 4 separate sums and change the macro to CONVOLVE_FOUR_SAMPLES.

#define CONVOLVE_ONE_SAMPLE \

do { \

@@ -104,7 +149,6 @@ void DirectConvolver::process(AudioFloatArray* convolutionKernel, const float* s

j++; \

} while (0)

- size_t i = 0;

while (i < framesToProcess) {

size_t j = 0;

float sum = 0;

@@ -368,6 +412,9 @@ void DirectConvolver::process(AudioFloatArray* convolutionKernel, const float* s

}

destP[i++] = sum;

}

+#if CPU(X86) || CPU(X86_64)

+ }

+#endif

#endif // OS(MACOSX)

// Copy 2nd half of input buffer to 1st half.

« no previous file with comments | « no previous file | no next file » | no next file with comments »