Chromium Code Reviews| Index: Source/platform/audio/DirectConvolver.cpp |
| diff --git a/Source/platform/audio/DirectConvolver.cpp b/Source/platform/audio/DirectConvolver.cpp |
| index b801cee30a24202e400a115cf91e96a119043c23..34b02721ca19d637a1ecf76e62ef7095b72ee5b5 100644 |
| --- a/Source/platform/audio/DirectConvolver.cpp |
| +++ b/Source/platform/audio/DirectConvolver.cpp |
| @@ -39,6 +39,10 @@ |
| #include "platform/audio/VectorMath.h" |
| #include "wtf/CPU.h" |
| +#if CPU(X86) || CPU(X86_64) |
| +#include <emmintrin.h> |
|
Ken Russell (switch to Gerrit)
2014/07/19 01:02:24
This header's available on all non-Mac OSs?
Raymond Toy
2014/07/21 17:09:57
Yes, I believe so. But we don't need to include th
|
| +#endif |
| + |
| namespace blink { |
| using namespace VectorMath; |
| @@ -97,6 +101,47 @@ void DirectConvolver::process(AudioFloatArray* convolutionKernel, const float* s |
| vDSP_conv(inputP - kernelSize + 1, 1, kernelP + kernelSize - 1, -1, destP, 1, framesToProcess, kernelSize); |
| #endif // CPU(X86) |
| #else |
| + size_t i = 0; |
| +#if CPU(X86) || CPU(X86_64) |
| + // Convolution using SSE2. Currently only do this if both |kernelSize| and |framesToProcess| |
| + // are multiples of 4. If not, use the straightforward loop below. |
|
Ken Russell (switch to Gerrit)
2014/07/19 01:02:24
typo: straightforward
|
| + |
| + if ((kernelSize % 4 == 0) && (framesToProcess % 4 == 0)) { |
| + // AudioFloatArrays are always aligned on at least a 16-byte boundary. |
|
Ken Russell (switch to Gerrit)
2014/07/19 01:02:24
Sure hope that's true. I'm not verifying that asse
Raymond Toy
2014/07/21 17:09:57
For the record, it's in Source/platform/audio/Audi
|
| + AudioFloatArray kernelBuffer(4 * kernelSize); |
| + __m128* kernelReversed = reinterpret_cast<__m128*>(kernelBuffer.data()); |
| + |
| + // Reverse the kernel and repeat each value across a vector |
| + for (i = 0; i < kernelSize; ++i) { |
| + kernelReversed[i] = _mm_set1_ps(kernelP[kernelSize - i - 1]); |
| + } |
| + |
| + float* inputStartP = inputP - kernelSize + 1; |
| + |
| + // Do convolution with 4 inputs at a time. |
| + for (i = 0; i < framesToProcess; i += 4) { |
| + __m128 convolutionSum; |
| + |
| + convolutionSum = _mm_setzero_ps(); |
| + |
| + // |kernelSize| is a multiple of 4 so we can unroll the loop by 4, manually. |
| + for (size_t k = 0; k < kernelSize; k += 4) { |
| + size_t dataOffset = i + k; |
| + |
| + for (size_t m = 0; m < 4; ++m) { |
| + __m128 sourceBlock; |
| + __m128 product; |
| + |
| + sourceBlock = _mm_loadu_ps(inputStartP + dataOffset + m); |
| + product = _mm_mul_ps(kernelReversed[k + m], sourceBlock); |
| + convolutionSum = _mm_add_ps(convolutionSum, product); |
| + } |
| + } |
| + _mm_storeu_ps(destP + i, convolutionSum); |
| + } |
| + } else { |
| +#endif |
| + |
| // FIXME: The macro can be further optimized to avoid pipeline stalls. One possibility is to maintain 4 separate sums and change the macro to CONVOLVE_FOUR_SAMPLES. |
| #define CONVOLVE_ONE_SAMPLE \ |
| do { \ |
| @@ -104,7 +149,6 @@ void DirectConvolver::process(AudioFloatArray* convolutionKernel, const float* s |
| j++; \ |
| } while (0) |
| - size_t i = 0; |
| while (i < framesToProcess) { |
| size_t j = 0; |
| float sum = 0; |
| @@ -368,6 +412,9 @@ void DirectConvolver::process(AudioFloatArray* convolutionKernel, const float* s |
| } |
| destP[i++] = sum; |
| } |
| +#if CPU(X86) || CPU(X86_64) |
| + } |
| +#endif |
| #endif // OS(MACOSX) |
| // Copy 2nd half of input buffer to 1st half. |