| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (C) 2012 Intel Inc. All rights reserved. | 2 * Copyright (C) 2012 Intel Inc. All rights reserved. |
| 3 * | 3 * |
| 4 * Redistribution and use in source and binary forms, with or without | 4 * Redistribution and use in source and binary forms, with or without |
| 5 * modification, are permitted provided that the following conditions | 5 * modification, are permitted provided that the following conditions |
| 6 * are met: | 6 * are met: |
| 7 * | 7 * |
| 8 * 1. Redistributions of source code must retain the above copyright | 8 * 1. Redistributions of source code must retain the above copyright |
| 9 * notice, this list of conditions and the following disclaimer. | 9 * notice, this list of conditions and the following disclaimer. |
| 10 * 2. Redistributions in binary form must reproduce the above copyright | 10 * 2. Redistributions in binary form must reproduce the above copyright |
| (...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 77 #if CPU(X86) | 77 #if CPU(X86) |
| 78 conv(inputP - kernelSize + 1, 1, kernelP + kernelSize - 1, -1, destP, 1, | 78 conv(inputP - kernelSize + 1, 1, kernelP + kernelSize - 1, -1, destP, 1, |
| 79 framesToProcess, kernelSize); | 79 framesToProcess, kernelSize); |
| 80 #else | 80 #else |
| 81 vDSP_conv(inputP - kernelSize + 1, 1, kernelP + kernelSize - 1, -1, destP, 1, | 81 vDSP_conv(inputP - kernelSize + 1, 1, kernelP + kernelSize - 1, -1, destP, 1, |
| 82 framesToProcess, kernelSize); | 82 framesToProcess, kernelSize); |
| 83 #endif // CPU(X86) | 83 #endif // CPU(X86) |
| 84 #else | 84 #else |
| 85 size_t i = 0; | 85 size_t i = 0; |
| 86 #if CPU(X86) || CPU(X86_64) | 86 #if CPU(X86) || CPU(X86_64) |
| 87 // Convolution using SSE2. Currently only do this if both |kernelSize| and |framesToProcess| | 87 // Convolution using SSE2. Currently only do this if both |kernelSize| and |
| 88 // are multiples of 4. If not, use the straightforward loop below. | 88 // |framesToProcess| are multiples of 4. If not, use the straightforward loop |
| 89 // below. |
| 89 | 90 |
| 90 if ((kernelSize % 4 == 0) && (framesToProcess % 4 == 0)) { | 91 if ((kernelSize % 4 == 0) && (framesToProcess % 4 == 0)) { |
| 91 // AudioFloatArray's are always aligned on at least a 16-byte boundary. | 92 // AudioFloatArray's are always aligned on at least a 16-byte boundary. |
| 92 AudioFloatArray kernelBuffer(4 * kernelSize); | 93 AudioFloatArray kernelBuffer(4 * kernelSize); |
| 93 __m128* kernelReversed = reinterpret_cast<__m128*>(kernelBuffer.data()); | 94 __m128* kernelReversed = reinterpret_cast<__m128*>(kernelBuffer.data()); |
| 94 | 95 |
| 95 // Reverse the kernel and repeat each value across a vector | 96 // Reverse the kernel and repeat each value across a vector |
| 96 for (i = 0; i < kernelSize; ++i) { | 97 for (i = 0; i < kernelSize; ++i) { |
| 97 kernelReversed[i] = _mm_set1_ps(kernelP[kernelSize - i - 1]); | 98 kernelReversed[i] = _mm_set1_ps(kernelP[kernelSize - i - 1]); |
| 98 } | 99 } |
| 99 | 100 |
| 100 float* inputStartP = inputP - kernelSize + 1; | 101 float* inputStartP = inputP - kernelSize + 1; |
| 101 | 102 |
| 102 // Do convolution with 4 inputs at a time. | 103 // Do convolution with 4 inputs at a time. |
| 103 for (i = 0; i < framesToProcess; i += 4) { | 104 for (i = 0; i < framesToProcess; i += 4) { |
| 104 __m128 convolutionSum; | 105 __m128 convolutionSum; |
| 105 | 106 |
| 106 convolutionSum = _mm_setzero_ps(); | 107 convolutionSum = _mm_setzero_ps(); |
| 107 | 108 |
| 108 // |kernelSize| is a multiple of 4 so we can unroll the loop by 4, manually. | 109 // |kernelSize| is a multiple of 4 so we can unroll the loop by 4, |
| 110 // manually. |
| 109 for (size_t k = 0; k < kernelSize; k += 4) { | 111 for (size_t k = 0; k < kernelSize; k += 4) { |
| 110 size_t dataOffset = i + k; | 112 size_t dataOffset = i + k; |
| 111 | 113 |
| 112 for (size_t m = 0; m < 4; ++m) { | 114 for (size_t m = 0; m < 4; ++m) { |
| 113 __m128 sourceBlock; | 115 __m128 sourceBlock; |
| 114 __m128 product; | 116 __m128 product; |
| 115 | 117 |
| 116 sourceBlock = _mm_loadu_ps(inputStartP + dataOffset + m); | 118 sourceBlock = _mm_loadu_ps(inputStartP + dataOffset + m); |
| 117 product = _mm_mul_ps(kernelReversed[k + m], sourceBlock); | 119 product = _mm_mul_ps(kernelReversed[k + m], sourceBlock); |
| 118 convolutionSum = _mm_add_ps(convolutionSum, product); | 120 convolutionSum = _mm_add_ps(convolutionSum, product); |
| 119 } | 121 } |
| 120 } | 122 } |
| 121 _mm_storeu_ps(destP + i, convolutionSum); | 123 _mm_storeu_ps(destP + i, convolutionSum); |
| 122 } | 124 } |
| 123 } else { | 125 } else { |
| 124 #endif | 126 #endif |
| 125 | 127 |
| 126 // FIXME: The macro can be further optimized to avoid pipeline stalls. One possibility is to maintain 4 separate sums and change the macro to CONVOLVE_FOUR_SAMPLES. | 128 // FIXME: The macro can be further optimized to avoid pipeline stalls. One |
| 129 // possibility is to maintain 4 separate sums and change the macro to |
| 130 // CONVOLVE_FOUR_SAMPLES. |
| 127 #define CONVOLVE_ONE_SAMPLE \ | 131 #define CONVOLVE_ONE_SAMPLE \ |
| 128 do { \ | 132 do { \ |
| 129 sum += inputP[i - j] * kernelP[j]; \ | 133 sum += inputP[i - j] * kernelP[j]; \ |
| 130 j++; \ | 134 j++; \ |
| 131 } while (0) | 135 } while (0) |
| 132 | 136 |
| 133 while (i < framesToProcess) { | 137 while (i < framesToProcess) { |
| 134 size_t j = 0; | 138 size_t j = 0; |
| 135 float sum = 0; | 139 float sum = 0; |
| 136 | 140 |
| (...skipping 263 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 400 | 404 |
| 401 // Copy 2nd half of input buffer to 1st half. | 405 // Copy 2nd half of input buffer to 1st half. |
| 402 memcpy(m_buffer.data(), inputP, sizeof(float) * framesToProcess); | 406 memcpy(m_buffer.data(), inputP, sizeof(float) * framesToProcess); |
| 403 } | 407 } |
| 404 | 408 |
// Resets the convolver by calling zero() on m_buffer, the member buffer that
// the code just above copies input frames into. NOTE(review): presumably this
// discards the saved input history so later processing starts clean - confirm.
| 405 void DirectConvolver::reset() { | 409 void DirectConvolver::reset() { |
| 406 m_buffer.zero(); | 410 m_buffer.zero(); |
| 407 } | 411 } |
| 408 | 412 |
| 409 } // namespace blink | 413 } // namespace blink |
| OLD | NEW |