Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(157)

Side by Side Diff: Source/platform/audio/DirectConvolver.cpp

Issue 408563003: WebAudio: Add SSE2 optimization for DirectConvolver (Closed) Base URL: svn://svn.chromium.org/blink/trunk
Patch Set: Created 6 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright (C) 2012 Intel Inc. All rights reserved. 2 * Copyright (C) 2012 Intel Inc. All rights reserved.
3 * 3 *
4 * Redistribution and use in source and binary forms, with or without 4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions 5 * modification, are permitted provided that the following conditions
6 * are met: 6 * are met:
7 * 7 *
8 * 1. Redistributions of source code must retain the above copyright 8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer. 9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright 10 * 2. Redistributions in binary form must reproduce the above copyright
(...skipping 21 matching lines...) Expand all
32 32
33 #include "platform/audio/DirectConvolver.h" 33 #include "platform/audio/DirectConvolver.h"
34 34
35 #if OS(MACOSX) 35 #if OS(MACOSX)
36 #include <Accelerate/Accelerate.h> 36 #include <Accelerate/Accelerate.h>
37 #endif 37 #endif
38 38
39 #include "platform/audio/VectorMath.h" 39 #include "platform/audio/VectorMath.h"
40 #include "wtf/CPU.h" 40 #include "wtf/CPU.h"
41 41
42 #if CPU(X86) || CPU(X86_64)
43 #include <emmintrin.h>
Ken Russell (switch to Gerrit) 2014/07/19 01:02:24 This header's available on all non-Mac OSs?
Raymond Toy 2014/07/21 17:09:57 Yes, I believe so. But we don't need to include th
44 #endif
45
42 namespace blink { 46 namespace blink {
43 47
44 using namespace VectorMath; 48 using namespace VectorMath;
45 49
46 DirectConvolver::DirectConvolver(size_t inputBlockSize) 50 DirectConvolver::DirectConvolver(size_t inputBlockSize)
47 : m_inputBlockSize(inputBlockSize) 51 : m_inputBlockSize(inputBlockSize)
48 #if USE(WEBAUDIO_IPP) 52 #if USE(WEBAUDIO_IPP)
49 , m_overlayBuffer(inputBlockSize) 53 , m_overlayBuffer(inputBlockSize)
50 #endif // USE(WEBAUDIO_IPP) 54 #endif // USE(WEBAUDIO_IPP)
51 , m_buffer(inputBlockSize * 2) 55 , m_buffer(inputBlockSize * 2)
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after
90 // Copy samples to 2nd half of input buffer. 94 // Copy samples to 2nd half of input buffer.
91 memcpy(inputP, sourceP, sizeof(float) * framesToProcess); 95 memcpy(inputP, sourceP, sizeof(float) * framesToProcess);
92 96
93 #if OS(MACOSX) 97 #if OS(MACOSX)
94 #if CPU(X86) 98 #if CPU(X86)
95 conv(inputP - kernelSize + 1, 1, kernelP + kernelSize - 1, -1, destP, 1, fra mesToProcess, kernelSize); 99 conv(inputP - kernelSize + 1, 1, kernelP + kernelSize - 1, -1, destP, 1, fra mesToProcess, kernelSize);
96 #else 100 #else
97 vDSP_conv(inputP - kernelSize + 1, 1, kernelP + kernelSize - 1, -1, destP, 1 , framesToProcess, kernelSize); 101 vDSP_conv(inputP - kernelSize + 1, 1, kernelP + kernelSize - 1, -1, destP, 1 , framesToProcess, kernelSize);
98 #endif // CPU(X86) 102 #endif // CPU(X86)
99 #else 103 #else
104 size_t i = 0;
105 #if CPU(X86) || CPU(X86_64)
106 // Convolution using SSE2. Currently only do this if both |kernelSize| and |framesToProcess|
107 // are multiples of 4. If not, use the straightforward loop below.
Ken Russell (switch to Gerrit) 2014/07/19 01:02:24 typo: straightforward
108
109 if ((kernelSize % 4 == 0) && (framesToProcess % 4 == 0)) {
110 // AudioFloatArray's are always aligned on at least a 16-byte boundary.
Ken Russell (switch to Gerrit) 2014/07/19 01:02:24 Sure hope that's true. I'm not verifying that asse
Raymond Toy 2014/07/21 17:09:57 For the record, it's in Source/platform/audio/Audi
111 AudioFloatArray kernelBuffer(4 * kernelSize);
112 __m128* kernelReversed = reinterpret_cast<__m128*>(kernelBuffer.data());
113
114 // Reverse the kernel and repeat each value across a vector
115 for (i = 0; i < kernelSize; ++i) {
116 kernelReversed[i] = _mm_set1_ps(kernelP[kernelSize - i - 1]);
117 }
118
119 float* inputStartP = inputP - kernelSize + 1;
120
121 // Do convolution with 4 inputs at a time.
122 for (i = 0; i < framesToProcess; i += 4) {
123 __m128 convolutionSum;
124
125 convolutionSum = _mm_setzero_ps();
126
127 // |kernelSize| is a multiple of 4 so we can unroll the loop by 4, manually.
128 for (size_t k = 0; k < kernelSize; k += 4) {
129 size_t dataOffset = i + k;
130
131 for (size_t m = 0; m < 4; ++m) {
132 __m128 sourceBlock;
133 __m128 product;
134
135 sourceBlock = _mm_loadu_ps(inputStartP + dataOffset + m);
136 product = _mm_mul_ps(kernelReversed[k + m], sourceBlock);
137 convolutionSum = _mm_add_ps(convolutionSum, product);
138 }
139 }
140 _mm_storeu_ps(destP + i, convolutionSum);
141 }
142 } else {
143 #endif
144
100 // FIXME: The macro can be further optimized to avoid pipeline stalls. One possibility is to maintain 4 separate sums and change the macro to CONVOLVE_FOUR_SAMPLES. 145 // FIXME: The macro can be further optimized to avoid pipeline stalls. One possibility is to maintain 4 separate sums and change the macro to CONVOLVE_FOUR_SAMPLES.
101 #define CONVOLVE_ONE_SAMPLE \ 146 #define CONVOLVE_ONE_SAMPLE \
102 do { \ 147 do { \
103 sum += inputP[i - j] * kernelP[j]; \ 148 sum += inputP[i - j] * kernelP[j]; \
104 j++; \ 149 j++; \
105 } while (0) 150 } while (0)
106 151
107 size_t i = 0;
108 while (i < framesToProcess) { 152 while (i < framesToProcess) {
109 size_t j = 0; 153 size_t j = 0;
110 float sum = 0; 154 float sum = 0;
111 155
112 // FIXME: SSE optimization may be applied here. 156 // FIXME: SSE optimization may be applied here.
113 if (kernelSize == 32) { 157 if (kernelSize == 32) {
114 CONVOLVE_ONE_SAMPLE; // 1 158 CONVOLVE_ONE_SAMPLE; // 1
115 CONVOLVE_ONE_SAMPLE; // 2 159 CONVOLVE_ONE_SAMPLE; // 2
116 CONVOLVE_ONE_SAMPLE; // 3 160 CONVOLVE_ONE_SAMPLE; // 3
117 CONVOLVE_ONE_SAMPLE; // 4 161 CONVOLVE_ONE_SAMPLE; // 4
(...skipping 243 matching lines...) Expand 10 before | Expand all | Expand 10 after
361 CONVOLVE_ONE_SAMPLE; // 127 405 CONVOLVE_ONE_SAMPLE; // 127
362 CONVOLVE_ONE_SAMPLE; // 128 406 CONVOLVE_ONE_SAMPLE; // 128
363 } else { 407 } else {
364 while (j < kernelSize) { 408 while (j < kernelSize) {
365 // Non-optimized using actual while loop. 409 // Non-optimized using actual while loop.
366 CONVOLVE_ONE_SAMPLE; 410 CONVOLVE_ONE_SAMPLE;
367 } 411 }
368 } 412 }
369 destP[i++] = sum; 413 destP[i++] = sum;
370 } 414 }
415 #if CPU(X86) || CPU(X86_64)
416 }
417 #endif
371 #endif // OS(MACOSX) 418 #endif // OS(MACOSX)
372 419
373 // Copy 2nd half of input buffer to 1st half. 420 // Copy 2nd half of input buffer to 1st half.
374 memcpy(m_buffer.data(), inputP, sizeof(float) * framesToProcess); 421 memcpy(m_buffer.data(), inputP, sizeof(float) * framesToProcess);
375 #endif 422 #endif
376 } 423 }
377 424
378 void DirectConvolver::reset() 425 void DirectConvolver::reset()
379 { 426 {
380 m_buffer.zero(); 427 m_buffer.zero();
381 #if USE(WEBAUDIO_IPP) 428 #if USE(WEBAUDIO_IPP)
382 m_overlayBuffer.zero(); 429 m_overlayBuffer.zero();
383 #endif // USE(WEBAUDIO_IPP) 430 #endif // USE(WEBAUDIO_IPP)
384 } 431 }
385 432
386 } // namespace blink 433 } // namespace blink
387 434
388 #endif // ENABLE(WEB_AUDIO) 435 #endif // ENABLE(WEB_AUDIO)
OLDNEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698