OLD | NEW |
1 /* | 1 /* |
2 * Copyright (C) 2012 Intel Inc. All rights reserved. | 2 * Copyright (C) 2012 Intel Inc. All rights reserved. |
3 * | 3 * |
4 * Redistribution and use in source and binary forms, with or without | 4 * Redistribution and use in source and binary forms, with or without |
5 * modification, are permitted provided that the following conditions | 5 * modification, are permitted provided that the following conditions |
6 * are met: | 6 * are met: |
7 * | 7 * |
8 * 1. Redistributions of source code must retain the above copyright | 8 * 1. Redistributions of source code must retain the above copyright |
9 * notice, this list of conditions and the following disclaimer. | 9 * notice, this list of conditions and the following disclaimer. |
10 * 2. Redistributions in binary form must reproduce the above copyright | 10 * 2. Redistributions in binary form must reproduce the above copyright |
(...skipping 21 matching lines...) Expand all Loading... |
32 | 32 |
33 #include "platform/audio/DirectConvolver.h" | 33 #include "platform/audio/DirectConvolver.h" |
34 | 34 |
35 #if OS(MACOSX) | 35 #if OS(MACOSX) |
36 #include <Accelerate/Accelerate.h> | 36 #include <Accelerate/Accelerate.h> |
37 #endif | 37 #endif |
38 | 38 |
39 #include "platform/audio/VectorMath.h" | 39 #include "platform/audio/VectorMath.h" |
40 #include "wtf/CPU.h" | 40 #include "wtf/CPU.h" |
41 | 41 |
| 42 #if (CPU(X86) || CPU(X86_64)) && !(OS(MACOSX) || USE(WEBAUDIO_IPP)) |
| 43 #include <emmintrin.h> |
| 44 #endif |
| 45 |
42 namespace blink { | 46 namespace blink { |
43 | 47 |
44 using namespace VectorMath; | 48 using namespace VectorMath; |
45 | 49 |
46 DirectConvolver::DirectConvolver(size_t inputBlockSize) | 50 DirectConvolver::DirectConvolver(size_t inputBlockSize) |
47 : m_inputBlockSize(inputBlockSize) | 51 : m_inputBlockSize(inputBlockSize) |
48 #if USE(WEBAUDIO_IPP) | 52 #if USE(WEBAUDIO_IPP) |
49 , m_overlayBuffer(inputBlockSize) | 53 , m_overlayBuffer(inputBlockSize) |
50 #endif // USE(WEBAUDIO_IPP) | 54 #endif // USE(WEBAUDIO_IPP) |
51 , m_buffer(inputBlockSize * 2) | 55 , m_buffer(inputBlockSize * 2) |
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
90 // Copy samples to 2nd half of input buffer. | 94 // Copy samples to 2nd half of input buffer. |
91 memcpy(inputP, sourceP, sizeof(float) * framesToProcess); | 95 memcpy(inputP, sourceP, sizeof(float) * framesToProcess); |
92 | 96 |
93 #if OS(MACOSX) | 97 #if OS(MACOSX) |
94 #if CPU(X86) | 98 #if CPU(X86) |
95 conv(inputP - kernelSize + 1, 1, kernelP + kernelSize - 1, -1, destP, 1, fra
mesToProcess, kernelSize); | 99 conv(inputP - kernelSize + 1, 1, kernelP + kernelSize - 1, -1, destP, 1, fra
mesToProcess, kernelSize); |
96 #else | 100 #else |
97 vDSP_conv(inputP - kernelSize + 1, 1, kernelP + kernelSize - 1, -1, destP, 1
, framesToProcess, kernelSize); | 101 vDSP_conv(inputP - kernelSize + 1, 1, kernelP + kernelSize - 1, -1, destP, 1
, framesToProcess, kernelSize); |
98 #endif // CPU(X86) | 102 #endif // CPU(X86) |
99 #else | 103 #else |
| 104 size_t i = 0; |
| 105 #if CPU(X86) || CPU(X86_64) |
| 106 // Convolution using SSE2. Currently only do this if both |kernelSize| and |
framesToProcess| |
| 107 // are multiples of 4. If not, use the straightforward loop below. |
| 108 |
| 109 if ((kernelSize % 4 == 0) && (framesToProcess % 4 == 0)) { |
| 110 // AudioFloatArray's are always aligned on at least a 16-byte boundary. |
| 111 AudioFloatArray kernelBuffer(4 * kernelSize); |
| 112 __m128* kernelReversed = reinterpret_cast<__m128*>(kernelBuffer.data()); |
| 113 |
| 114 // Reverse the kernel and repeat each value across a vector |
| 115 for (i = 0; i < kernelSize; ++i) { |
| 116 kernelReversed[i] = _mm_set1_ps(kernelP[kernelSize - i - 1]); |
| 117 } |
| 118 |
| 119 float* inputStartP = inputP - kernelSize + 1; |
| 120 |
| 121 // Do convolution with 4 inputs at a time. |
| 122 for (i = 0; i < framesToProcess; i += 4) { |
| 123 __m128 convolutionSum; |
| 124 |
| 125 convolutionSum = _mm_setzero_ps(); |
| 126 |
| 127 // |kernelSize| is a multiple of 4 so we can unroll the loop by 4, m
anually. |
| 128 for (size_t k = 0; k < kernelSize; k += 4) { |
| 129 size_t dataOffset = i + k; |
| 130 |
| 131 for (size_t m = 0; m < 4; ++m) { |
| 132 __m128 sourceBlock; |
| 133 __m128 product; |
| 134 |
| 135 sourceBlock = _mm_loadu_ps(inputStartP + dataOffset + m); |
| 136 product = _mm_mul_ps(kernelReversed[k + m], sourceBlock); |
| 137 convolutionSum = _mm_add_ps(convolutionSum, product); |
| 138 } |
| 139 } |
| 140 _mm_storeu_ps(destP + i, convolutionSum); |
| 141 } |
| 142 } else { |
| 143 #endif |
| 144 |
100 // FIXME: The macro can be further optimized to avoid pipeline stalls. One p
ossibility is to maintain 4 separate sums and change the macro to CONVOLVE_FOUR_
SAMPLES. | 145 // FIXME: The macro can be further optimized to avoid pipeline stalls. One p
ossibility is to maintain 4 separate sums and change the macro to CONVOLVE_FOUR_
SAMPLES. |
101 #define CONVOLVE_ONE_SAMPLE \ | 146 #define CONVOLVE_ONE_SAMPLE \ |
102 do { \ | 147 do { \ |
103 sum += inputP[i - j] * kernelP[j]; \ | 148 sum += inputP[i - j] * kernelP[j]; \ |
104 j++; \ | 149 j++; \ |
105 } while (0) | 150 } while (0) |
106 | 151 |
107 size_t i = 0; | |
108 while (i < framesToProcess) { | 152 while (i < framesToProcess) { |
109 size_t j = 0; | 153 size_t j = 0; |
110 float sum = 0; | 154 float sum = 0; |
111 | 155 |
112 // FIXME: SSE optimization may be applied here. | 156 // FIXME: SSE optimization may be applied here. |
113 if (kernelSize == 32) { | 157 if (kernelSize == 32) { |
114 CONVOLVE_ONE_SAMPLE; // 1 | 158 CONVOLVE_ONE_SAMPLE; // 1 |
115 CONVOLVE_ONE_SAMPLE; // 2 | 159 CONVOLVE_ONE_SAMPLE; // 2 |
116 CONVOLVE_ONE_SAMPLE; // 3 | 160 CONVOLVE_ONE_SAMPLE; // 3 |
117 CONVOLVE_ONE_SAMPLE; // 4 | 161 CONVOLVE_ONE_SAMPLE; // 4 |
(...skipping 243 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
361 CONVOLVE_ONE_SAMPLE; // 127 | 405 CONVOLVE_ONE_SAMPLE; // 127 |
362 CONVOLVE_ONE_SAMPLE; // 128 | 406 CONVOLVE_ONE_SAMPLE; // 128 |
363 } else { | 407 } else { |
364 while (j < kernelSize) { | 408 while (j < kernelSize) { |
365 // Non-optimized using actual while loop. | 409 // Non-optimized using actual while loop. |
366 CONVOLVE_ONE_SAMPLE; | 410 CONVOLVE_ONE_SAMPLE; |
367 } | 411 } |
368 } | 412 } |
369 destP[i++] = sum; | 413 destP[i++] = sum; |
370 } | 414 } |
| 415 #if CPU(X86) || CPU(X86_64) |
| 416 } |
| 417 #endif |
371 #endif // OS(MACOSX) | 418 #endif // OS(MACOSX) |
372 | 419 |
373 // Copy 2nd half of input buffer to 1st half. | 420 // Copy 2nd half of input buffer to 1st half. |
374 memcpy(m_buffer.data(), inputP, sizeof(float) * framesToProcess); | 421 memcpy(m_buffer.data(), inputP, sizeof(float) * framesToProcess); |
375 #endif | 422 #endif |
376 } | 423 } |
377 | 424 |
// DirectConvolver::reset(): calls zero() on m_buffer, and additionally on
// m_overlayBuffer when compiled with USE(WEBAUDIO_IPP). The definition is the
// same in the OLD and NEW columns of this side-by-side view.
// process() copies the current input block into the first half of m_buffer
// so it is available to the next call; reset() therefore discards that
// carried-over input (presumably zero() fills the array with 0.0f -- confirm
// against AudioArray).
378 void DirectConvolver::reset() | 425 void DirectConvolver::reset() |
379 { | 426 { |
380 m_buffer.zero(); | 427 m_buffer.zero(); |
381 #if USE(WEBAUDIO_IPP) | 428 #if USE(WEBAUDIO_IPP) |
382 m_overlayBuffer.zero(); | 429 m_overlayBuffer.zero(); |
383 #endif // USE(WEBAUDIO_IPP) | 430 #endif // USE(WEBAUDIO_IPP) |
384 } | 431 } |
385 | 432 |
386 } // namespace blink | 433 } // namespace blink |
387 | 434 |
388 #endif // ENABLE(WEB_AUDIO) | 435 #endif // ENABLE(WEB_AUDIO) |
OLD | NEW |