OLD | NEW |
1 /* | 1 /* |
2 * Copyright (C) 2012 Intel Inc. All rights reserved. | 2 * Copyright (C) 2012 Intel Inc. All rights reserved. |
3 * | 3 * |
4 * Redistribution and use in source and binary forms, with or without | 4 * Redistribution and use in source and binary forms, with or without |
5 * modification, are permitted provided that the following conditions | 5 * modification, are permitted provided that the following conditions |
6 * are met: | 6 * are met: |
7 * | 7 * |
8 * 1. Redistributions of source code must retain the above copyright | 8 * 1. Redistributions of source code must retain the above copyright |
9 * notice, this list of conditions and the following disclaimer. | 9 * notice, this list of conditions and the following disclaimer. |
10 * 2. Redistributions in binary form must reproduce the above copyright | 10 * 2. Redistributions in binary form must reproduce the above copyright |
(...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
77 #if CPU(X86) | 77 #if CPU(X86) |
78 conv(inputP - kernelSize + 1, 1, kernelP + kernelSize - 1, -1, destP, 1, | 78 conv(inputP - kernelSize + 1, 1, kernelP + kernelSize - 1, -1, destP, 1, |
79 framesToProcess, kernelSize); | 79 framesToProcess, kernelSize); |
80 #else | 80 #else |
81 vDSP_conv(inputP - kernelSize + 1, 1, kernelP + kernelSize - 1, -1, destP, 1, | 81 vDSP_conv(inputP - kernelSize + 1, 1, kernelP + kernelSize - 1, -1, destP, 1, |
82 framesToProcess, kernelSize); | 82 framesToProcess, kernelSize); |
83 #endif // CPU(X86) | 83 #endif // CPU(X86) |
84 #else | 84 #else |
85 size_t i = 0; | 85 size_t i = 0; |
86 #if CPU(X86) || CPU(X86_64) | 86 #if CPU(X86) || CPU(X86_64) |
87 // Convolution using SSE2. Currently only do this if both |kernelSize| and |framesToProcess| | 87 // Convolution using SSE2. Currently only do this if both |kernelSize| and |
88 // are multiples of 4. If not, use the straightforward loop below. | 88 // |framesToProcess| are multiples of 4. If not, use the straightforward loop |
| 89 // below. |
89 | 90 |
90 if ((kernelSize % 4 == 0) && (framesToProcess % 4 == 0)) { | 91 if ((kernelSize % 4 == 0) && (framesToProcess % 4 == 0)) { |
91 // AudioFloatArray's are always aligned on at least a 16-byte boundary. | 92 // AudioFloatArray's are always aligned on at least a 16-byte boundary. |
92 AudioFloatArray kernelBuffer(4 * kernelSize); | 93 AudioFloatArray kernelBuffer(4 * kernelSize); |
93 __m128* kernelReversed = reinterpret_cast<__m128*>(kernelBuffer.data()); | 94 __m128* kernelReversed = reinterpret_cast<__m128*>(kernelBuffer.data()); |
94 | 95 |
95 // Reverse the kernel and repeat each value across a vector | 96 // Reverse the kernel and repeat each value across a vector |
96 for (i = 0; i < kernelSize; ++i) { | 97 for (i = 0; i < kernelSize; ++i) { |
97 kernelReversed[i] = _mm_set1_ps(kernelP[kernelSize - i - 1]); | 98 kernelReversed[i] = _mm_set1_ps(kernelP[kernelSize - i - 1]); |
98 } | 99 } |
99 | 100 |
100 float* inputStartP = inputP - kernelSize + 1; | 101 float* inputStartP = inputP - kernelSize + 1; |
101 | 102 |
102 // Do convolution with 4 inputs at a time. | 103 // Do convolution with 4 inputs at a time. |
103 for (i = 0; i < framesToProcess; i += 4) { | 104 for (i = 0; i < framesToProcess; i += 4) { |
104 __m128 convolutionSum; | 105 __m128 convolutionSum; |
105 | 106 |
106 convolutionSum = _mm_setzero_ps(); | 107 convolutionSum = _mm_setzero_ps(); |
107 | 108 |
108 // |kernelSize| is a multiple of 4 so we can unroll the loop by 4, manually. | 109 // |kernelSize| is a multiple of 4 so we can unroll the loop by 4, |
| 110 // manually. |
109 for (size_t k = 0; k < kernelSize; k += 4) { | 111 for (size_t k = 0; k < kernelSize; k += 4) { |
110 size_t dataOffset = i + k; | 112 size_t dataOffset = i + k; |
111 | 113 |
112 for (size_t m = 0; m < 4; ++m) { | 114 for (size_t m = 0; m < 4; ++m) { |
113 __m128 sourceBlock; | 115 __m128 sourceBlock; |
114 __m128 product; | 116 __m128 product; |
115 | 117 |
116 sourceBlock = _mm_loadu_ps(inputStartP + dataOffset + m); | 118 sourceBlock = _mm_loadu_ps(inputStartP + dataOffset + m); |
117 product = _mm_mul_ps(kernelReversed[k + m], sourceBlock); | 119 product = _mm_mul_ps(kernelReversed[k + m], sourceBlock); |
118 convolutionSum = _mm_add_ps(convolutionSum, product); | 120 convolutionSum = _mm_add_ps(convolutionSum, product); |
119 } | 121 } |
120 } | 122 } |
121 _mm_storeu_ps(destP + i, convolutionSum); | 123 _mm_storeu_ps(destP + i, convolutionSum); |
122 } | 124 } |
123 } else { | 125 } else { |
124 #endif | 126 #endif |
125 | 127 |
126 // FIXME: The macro can be further optimized to avoid pipeline stalls. One possibility is to maintain 4 separate sums and change the macro to CONVOLVE_FOUR_SAMPLES. | 128 // FIXME: The macro can be further optimized to avoid pipeline stalls. One |
| 129 // possibility is to maintain 4 separate sums and change the macro to |
| 130 // CONVOLVE_FOUR_SAMPLES. |
127 #define CONVOLVE_ONE_SAMPLE \ | 131 #define CONVOLVE_ONE_SAMPLE \ |
128 do { \ | 132 do { \ |
129 sum += inputP[i - j] * kernelP[j]; \ | 133 sum += inputP[i - j] * kernelP[j]; \ |
130 j++; \ | 134 j++; \ |
131 } while (0) | 135 } while (0) |
132 | 136 |
133 while (i < framesToProcess) { | 137 while (i < framesToProcess) { |
134 size_t j = 0; | 138 size_t j = 0; |
135 float sum = 0; | 139 float sum = 0; |
136 | 140 |
(...skipping 263 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
400 | 404 |
401 // Copy 2nd half of input buffer to 1st half. | 405 // Copy 2nd half of input buffer to 1st half. |
402 memcpy(m_buffer.data(), inputP, sizeof(float) * framesToProcess); | 406 memcpy(m_buffer.data(), inputP, sizeof(float) * framesToProcess); |
403 } | 407 } |
404 | 408 |
// Resets the convolver by calling zero() on m_buffer, the member buffer that
// the processing code above copies the current input frames into.
// NOTE(review): presumably zero() clears every sample so the next processing
// call starts with no saved input history -- confirm against the buffer type.
405 void DirectConvolver::reset() { | 409 void DirectConvolver::reset() { |
406 m_buffer.zero(); | 410 m_buffer.zero(); |
407 } | 411 } |
408 | 412 |
409 } // namespace blink | 413 } // namespace blink |
OLD | NEW |