OLD | NEW |
---|---|
1 /* | 1 /* |
2 * Copyright (C) 2012 Intel Inc. All rights reserved. | 2 * Copyright (C) 2012 Intel Inc. All rights reserved. |
3 * | 3 * |
4 * Redistribution and use in source and binary forms, with or without | 4 * Redistribution and use in source and binary forms, with or without |
5 * modification, are permitted provided that the following conditions | 5 * modification, are permitted provided that the following conditions |
6 * are met: | 6 * are met: |
7 * | 7 * |
8 * 1. Redistributions of source code must retain the above copyright | 8 * 1. Redistributions of source code must retain the above copyright |
9 * notice, this list of conditions and the following disclaimer. | 9 * notice, this list of conditions and the following disclaimer. |
10 * 2. Redistributions in binary form must reproduce the above copyright | 10 * 2. Redistributions in binary form must reproduce the above copyright |
(...skipping 21 matching lines...) Expand all Loading... | |
32 | 32 |
33 #include "platform/audio/DirectConvolver.h" | 33 #include "platform/audio/DirectConvolver.h" |
34 | 34 |
35 #if OS(MACOSX) | 35 #if OS(MACOSX) |
36 #include <Accelerate/Accelerate.h> | 36 #include <Accelerate/Accelerate.h> |
37 #endif | 37 #endif |
38 | 38 |
39 #include "platform/audio/VectorMath.h" | 39 #include "platform/audio/VectorMath.h" |
40 #include "wtf/CPU.h" | 40 #include "wtf/CPU.h" |
41 | 41 |
42 #if CPU(X86) || CPU(X86_64) | |
43 #include <emmintrin.h> | |
Ken Russell (switch to Gerrit)
2014/07/19 01:02:24
This header's available on all non-Mac OSs?
Raymond Toy
2014/07/21 17:09:57
Yes, I believe so. But we don't need to include th
| |
44 #endif | |
45 | |
42 namespace blink { | 46 namespace blink { |
43 | 47 |
44 using namespace VectorMath; | 48 using namespace VectorMath; |
45 | 49 |
46 DirectConvolver::DirectConvolver(size_t inputBlockSize) | 50 DirectConvolver::DirectConvolver(size_t inputBlockSize) |
47 : m_inputBlockSize(inputBlockSize) | 51 : m_inputBlockSize(inputBlockSize) |
48 #if USE(WEBAUDIO_IPP) | 52 #if USE(WEBAUDIO_IPP) |
49 , m_overlayBuffer(inputBlockSize) | 53 , m_overlayBuffer(inputBlockSize) |
50 #endif // USE(WEBAUDIO_IPP) | 54 #endif // USE(WEBAUDIO_IPP) |
51 , m_buffer(inputBlockSize * 2) | 55 , m_buffer(inputBlockSize * 2) |
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
90 // Copy samples to 2nd half of input buffer. | 94 // Copy samples to 2nd half of input buffer. |
91 memcpy(inputP, sourceP, sizeof(float) * framesToProcess); | 95 memcpy(inputP, sourceP, sizeof(float) * framesToProcess); |
92 | 96 |
93 #if OS(MACOSX) | 97 #if OS(MACOSX) |
94 #if CPU(X86) | 98 #if CPU(X86) |
95 conv(inputP - kernelSize + 1, 1, kernelP + kernelSize - 1, -1, destP, 1, framesToProcess, kernelSize); | 99 conv(inputP - kernelSize + 1, 1, kernelP + kernelSize - 1, -1, destP, 1, framesToProcess, kernelSize); |
96 #else | 100 #else |
97 vDSP_conv(inputP - kernelSize + 1, 1, kernelP + kernelSize - 1, -1, destP, 1, framesToProcess, kernelSize); | 101 vDSP_conv(inputP - kernelSize + 1, 1, kernelP + kernelSize - 1, -1, destP, 1, framesToProcess, kernelSize); |
98 #endif // CPU(X86) | 102 #endif // CPU(X86) |
99 #else | 103 #else |
104 size_t i = 0; | |
105 #if CPU(X86) || CPU(X86_64) | |
106 // Convolution using SSE2. Currently only do this if both |kernelSize| and |framesToProcess| | |
107 // are multiples of 4. If not, use the straighforward loop below. | |
Ken Russell (switch to Gerrit)
2014/07/19 01:02:24
typo: straightforward
| |
108 | |
109 if ((kernelSize % 4 == 0) && (framesToProcess % 4 == 0)) { | |
110 // AudioFloatArray's are always aligned on at least a 16-byte boundary. | |
Ken Russell (switch to Gerrit)
2014/07/19 01:02:24
Sure hope that's true. I'm not verifying that asse
Raymond Toy
2014/07/21 17:09:57
For the record, it's in Source/platform/audio/Audi
| |
111 AudioFloatArray kernelBuffer(4 * kernelSize); | |
112 __m128* kernelReversed = reinterpret_cast<__m128*>(kernelBuffer.data()); | |
113 | |
114 // Reverse the kernel and repeat each value across a vector | |
115 for (i = 0; i < kernelSize; ++i) { | |
116 kernelReversed[i] = _mm_set1_ps(kernelP[kernelSize - i - 1]); | |
117 } | |
118 | |
119 float* inputStartP = inputP - kernelSize + 1; | |
120 | |
121 // Do convolution with 4 inputs at a time. | |
122 for (i = 0; i < framesToProcess; i += 4) { | |
123 __m128 convolutionSum; | |
124 | |
125 convolutionSum = _mm_setzero_ps(); | |
126 | |
127 // |kernelSize| is a multiple of 4 so we can unroll the loop by 4, manually. | |
128 for (size_t k = 0; k < kernelSize; k += 4) { | |
129 size_t dataOffset = i + k; | |
130 | |
131 for (size_t m = 0; m < 4; ++m) { | |
132 __m128 sourceBlock; | |
133 __m128 product; | |
134 | |
135 sourceBlock = _mm_loadu_ps(inputStartP + dataOffset + m); | |
136 product = _mm_mul_ps(kernelReversed[k + m], sourceBlock); | |
137 convolutionSum = _mm_add_ps(convolutionSum, product); | |
138 } | |
139 } | |
140 _mm_storeu_ps(destP + i, convolutionSum); | |
141 } | |
142 } else { | |
143 #endif | |
144 | |
100 // FIXME: The macro can be further optimized to avoid pipeline stalls. One possibility is to maintain 4 separate sums and change the macro to CONVOLVE_FOUR_SAMPLES. | 145 // FIXME: The macro can be further optimized to avoid pipeline stalls. One possibility is to maintain 4 separate sums and change the macro to CONVOLVE_FOUR_SAMPLES. |
101 #define CONVOLVE_ONE_SAMPLE \ | 146 #define CONVOLVE_ONE_SAMPLE \ |
102 do { \ | 147 do { \ |
103 sum += inputP[i - j] * kernelP[j]; \ | 148 sum += inputP[i - j] * kernelP[j]; \ |
104 j++; \ | 149 j++; \ |
105 } while (0) | 150 } while (0) |
106 | 151 |
107 size_t i = 0; | |
108 while (i < framesToProcess) { | 152 while (i < framesToProcess) { |
109 size_t j = 0; | 153 size_t j = 0; |
110 float sum = 0; | 154 float sum = 0; |
111 | 155 |
112 // FIXME: SSE optimization may be applied here. | 156 // FIXME: SSE optimization may be applied here. |
113 if (kernelSize == 32) { | 157 if (kernelSize == 32) { |
114 CONVOLVE_ONE_SAMPLE; // 1 | 158 CONVOLVE_ONE_SAMPLE; // 1 |
115 CONVOLVE_ONE_SAMPLE; // 2 | 159 CONVOLVE_ONE_SAMPLE; // 2 |
116 CONVOLVE_ONE_SAMPLE; // 3 | 160 CONVOLVE_ONE_SAMPLE; // 3 |
117 CONVOLVE_ONE_SAMPLE; // 4 | 161 CONVOLVE_ONE_SAMPLE; // 4 |
(...skipping 243 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
361 CONVOLVE_ONE_SAMPLE; // 127 | 405 CONVOLVE_ONE_SAMPLE; // 127 |
362 CONVOLVE_ONE_SAMPLE; // 128 | 406 CONVOLVE_ONE_SAMPLE; // 128 |
363 } else { | 407 } else { |
364 while (j < kernelSize) { | 408 while (j < kernelSize) { |
365 // Non-optimized using actual while loop. | 409 // Non-optimized using actual while loop. |
366 CONVOLVE_ONE_SAMPLE; | 410 CONVOLVE_ONE_SAMPLE; |
367 } | 411 } |
368 } | 412 } |
369 destP[i++] = sum; | 413 destP[i++] = sum; |
370 } | 414 } |
415 #if CPU(X86) || CPU(X86_64) | |
416 } | |
417 #endif | |
371 #endif // OS(MACOSX) | 418 #endif // OS(MACOSX) |
372 | 419 |
373 // Copy 2nd half of input buffer to 1st half. | 420 // Copy 2nd half of input buffer to 1st half. |
374 memcpy(m_buffer.data(), inputP, sizeof(float) * framesToProcess); | 421 memcpy(m_buffer.data(), inputP, sizeof(float) * framesToProcess); |
375 #endif | 422 #endif |
376 } | 423 } |
377 | 424 |
378 void DirectConvolver::reset() | 425 void DirectConvolver::reset() |
379 { | 426 { |
380 m_buffer.zero(); | 427 m_buffer.zero(); |
381 #if USE(WEBAUDIO_IPP) | 428 #if USE(WEBAUDIO_IPP) |
382 m_overlayBuffer.zero(); | 429 m_overlayBuffer.zero(); |
383 #endif // USE(WEBAUDIO_IPP) | 430 #endif // USE(WEBAUDIO_IPP) |
384 } | 431 } |
385 | 432 |
386 } // namespace blink | 433 } // namespace blink |
387 | 434 |
388 #endif // ENABLE(WEB_AUDIO) | 435 #endif // ENABLE(WEB_AUDIO) |
OLD | NEW |