Index: Source/platform/audio/DirectConvolver.cpp |
diff --git a/Source/platform/audio/DirectConvolver.cpp b/Source/platform/audio/DirectConvolver.cpp |
index b801cee30a24202e400a115cf91e96a119043c23..34b02721ca19d637a1ecf76e62ef7095b72ee5b5 100644 |
--- a/Source/platform/audio/DirectConvolver.cpp |
+++ b/Source/platform/audio/DirectConvolver.cpp |
@@ -39,6 +39,10 @@ |
#include "platform/audio/VectorMath.h" |
#include "wtf/CPU.h" |
+#if CPU(X86) || CPU(X86_64) |
+#include <emmintrin.h> |
Ken Russell (switch to Gerrit)
2014/07/19 01:02:24
This header's available on all non-Mac OSs?
Raymond Toy
2014/07/21 17:09:57
Yes, I believe so. But we don't need to include th
|
+#endif |
+ |
namespace blink { |
using namespace VectorMath; |
@@ -97,6 +101,47 @@ void DirectConvolver::process(AudioFloatArray* convolutionKernel, const float* s |
vDSP_conv(inputP - kernelSize + 1, 1, kernelP + kernelSize - 1, -1, destP, 1, framesToProcess, kernelSize); |
#endif // CPU(X86) |
#else |
+ size_t i = 0; |
+#if CPU(X86) || CPU(X86_64) |
+ // Convolution using SSE2. Currently only do this if both |kernelSize| and |framesToProcess| |
+ // are multiples of 4. If not, use the straightforward loop below. |
Ken Russell (switch to Gerrit)
2014/07/19 01:02:24
typo: straightforward
|
+ |
+ if ((kernelSize % 4 == 0) && (framesToProcess % 4 == 0)) { |
+ // AudioFloatArrays are always aligned on at least a 16-byte boundary. |
Ken Russell (switch to Gerrit)
2014/07/19 01:02:24
Sure hope that's true. I'm not verifying that asse
Raymond Toy
2014/07/21 17:09:57
For the record, it's in Source/platform/audio/Audi
|
+ AudioFloatArray kernelBuffer(4 * kernelSize); |
+ __m128* kernelReversed = reinterpret_cast<__m128*>(kernelBuffer.data()); |
+ |
+ // Reverse the kernel and repeat each value across a vector |
+ for (i = 0; i < kernelSize; ++i) { |
+ kernelReversed[i] = _mm_set1_ps(kernelP[kernelSize - i - 1]); |
+ } |
+ |
+ float* inputStartP = inputP - kernelSize + 1; |
+ |
+ // Do convolution with 4 inputs at a time. |
+ for (i = 0; i < framesToProcess; i += 4) { |
+ __m128 convolutionSum; |
+ |
+ convolutionSum = _mm_setzero_ps(); |
+ |
+ // |kernelSize| is a multiple of 4 so we can unroll the loop by 4, manually. |
+ for (size_t k = 0; k < kernelSize; k += 4) { |
+ size_t dataOffset = i + k; |
+ |
+ for (size_t m = 0; m < 4; ++m) { |
+ __m128 sourceBlock; |
+ __m128 product; |
+ |
+ sourceBlock = _mm_loadu_ps(inputStartP + dataOffset + m); |
+ product = _mm_mul_ps(kernelReversed[k + m], sourceBlock); |
+ convolutionSum = _mm_add_ps(convolutionSum, product); |
+ } |
+ } |
+ _mm_storeu_ps(destP + i, convolutionSum); |
+ } |
+ } else { |
+#endif |
+ |
// FIXME: The macro can be further optimized to avoid pipeline stalls. One possibility is to maintain 4 separate sums and change the macro to CONVOLVE_FOUR_SAMPLES. |
#define CONVOLVE_ONE_SAMPLE \ |
do { \ |
@@ -104,7 +149,6 @@ void DirectConvolver::process(AudioFloatArray* convolutionKernel, const float* s |
j++; \ |
} while (0) |
- size_t i = 0; |
while (i < framesToProcess) { |
size_t j = 0; |
float sum = 0; |
@@ -368,6 +412,9 @@ void DirectConvolver::process(AudioFloatArray* convolutionKernel, const float* s |
} |
destP[i++] = sum; |
} |
+#if CPU(X86) || CPU(X86_64) |
+ } |
+#endif |
#endif // OS(MACOSX) |
// Copy 2nd half of input buffer to 1st half. |