Source/platform/audio/DirectConvolver.cpp - Issue 408563003: WebAudio: Add SSE2 optimization for DirectConvolver

Side by Side Diff: Source/platform/audio/DirectConvolver.cpp

Issue 408563003: WebAudio: Add SSE2 optimization for DirectConvolver (Closed) Base URL: svn://svn.chromium.org/blink/trunk

Patch Set: Created 6 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 /*	1 /*

2 * Copyright (C) 2012 Intel Inc. All rights reserved.	2 * Copyright (C) 2012 Intel Inc. All rights reserved.

3 *	3 *

4 * Redistribution and use in source and binary forms, with or without	4 * Redistribution and use in source and binary forms, with or without

5 * modification, are permitted provided that the following conditions	5 * modification, are permitted provided that the following conditions

6 * are met:	6 * are met:

7 *	7 *

8 * 1. Redistributions of source code must retain the above copyright	8 * 1. Redistributions of source code must retain the above copyright

9 * notice, this list of conditions and the following disclaimer.	9 * notice, this list of conditions and the following disclaimer.

10 * 2. Redistributions in binary form must reproduce the above copyright	10 * 2. Redistributions in binary form must reproduce the above copyright

(...skipping 21 matching lines...) Expand all Loading...
32	32

33 #include "platform/audio/DirectConvolver.h"	33 #include "platform/audio/DirectConvolver.h"

34	34

35 #if OS(MACOSX)	35 #if OS(MACOSX)

36 #include <Accelerate/Accelerate.h>	36 #include <Accelerate/Accelerate.h>

37 #endif	37 #endif

38	38

39 #include "platform/audio/VectorMath.h"	39 #include "platform/audio/VectorMath.h"

40 #include "wtf/CPU.h"	40 #include "wtf/CPU.h"

41	41

	42 #if CPU(X86) \|\| CPU(X86_64)

	43 #include <emmintrin.h>
	Ken Russell (switch to Gerrit) 2014/07/19 01:02:24: This header's available on all non-Mac OSs?
	Raymond Toy 2014/07/21 17:09:57: Yes, I believe so. But we don't need to include this on OSX since we use OSX's direct convolver instead.
	44 #endif

	45

42 namespace blink {	46 namespace blink {

43	47

44 using namespace VectorMath;	48 using namespace VectorMath;

45	49

46 DirectConvolver::DirectConvolver(size_t inputBlockSize)	50 DirectConvolver::DirectConvolver(size_t inputBlockSize)

47 : m_inputBlockSize(inputBlockSize)	51 : m_inputBlockSize(inputBlockSize)

48 #if USE(WEBAUDIO_IPP)	52 #if USE(WEBAUDIO_IPP)

49 , m_overlayBuffer(inputBlockSize)	53 , m_overlayBuffer(inputBlockSize)

50 #endif // USE(WEBAUDIO_IPP)	54 #endif // USE(WEBAUDIO_IPP)

51 , m_buffer(inputBlockSize * 2)	55 , m_buffer(inputBlockSize * 2)

(...skipping 38 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
90 // Copy samples to 2nd half of input buffer.	94 // Copy samples to 2nd half of input buffer.

91 memcpy(inputP, sourceP, sizeof(float) * framesToProcess);	95 memcpy(inputP, sourceP, sizeof(float) * framesToProcess);

92	96

93 #if OS(MACOSX)	97 #if OS(MACOSX)

94 #if CPU(X86)	98 #if CPU(X86)

95 conv(inputP - kernelSize + 1, 1, kernelP + kernelSize - 1, -1, destP, 1, framesToProcess, kernelSize);	99 conv(inputP - kernelSize + 1, 1, kernelP + kernelSize - 1, -1, destP, 1, framesToProcess, kernelSize);

96 #else	100 #else

97 vDSP_conv(inputP - kernelSize + 1, 1, kernelP + kernelSize - 1, -1, destP, 1, framesToProcess, kernelSize);	101 vDSP_conv(inputP - kernelSize + 1, 1, kernelP + kernelSize - 1, -1, destP, 1, framesToProcess, kernelSize);

98 #endif // CPU(X86)	102 #endif // CPU(X86)

99 #else	103 #else

	104 size_t i = 0;

	105 #if CPU(X86) \|\| CPU(X86_64)

	106 // Convolution using SSE2. Currently only do this if both \|kernelSize\| and \|framesToProcess\|

	107 // are multiples of 4. If not, use the straighforward loop below.
	Ken Russell (switch to Gerrit) 2014/07/19 01:02:24: typo: straightforward
	108

	109 if ((kernelSize % 4 == 0) && (framesToProcess % 4 == 0)) {

	110 // AudioFloatArray's are always aligned on at least a 16-byte boundary.
	Ken Russell (switch to Gerrit) 2014/07/19 01:02:24: Sure hope that's true. I'm not verifying that assertion in this review.
	Raymond Toy 2014/07/21 17:09:57: For the record, it's in Source/platform/audio/AudioArray.h. We get 32-byte alignment using FFMPEG or OpenMAX DL. Otherwise it's 16 bytes.
	111 AudioFloatArray kernelBuffer(4 * kernelSize);

	112 __m128* kernelReversed = reinterpret_cast<__m128*>(kernelBuffer.data());

	113

	114 // Reverse the kernel and repeat each value across a vector

	115 for (i = 0; i < kernelSize; ++i) {

	116 kernelReversed[i] = _mm_set1_ps(kernelP[kernelSize - i - 1]);

	117 }

	118

	119 float* inputStartP = inputP - kernelSize + 1;

	120

	121 // Do convolution with 4 inputs at a time.

	122 for (i = 0; i < framesToProcess; i += 4) {

	123 __m128 convolutionSum;

	124

	125 convolutionSum = _mm_setzero_ps();

	126

	127 // \|kernelSize\| is a multiple of 4 so we can unroll the loop by 4, m anually.

	128 for (size_t k = 0; k < kernelSize; k += 4) {

	129 size_t dataOffset = i + k;

	130

	131 for (size_t m = 0; m < 4; ++m) {

	132 __m128 sourceBlock;

	133 __m128 product;

	134

	135 sourceBlock = _mm_loadu_ps(inputStartP + dataOffset + m);

	136 product = _mm_mul_ps(kernelReversed[k + m], sourceBlock);

	137 convolutionSum = _mm_add_ps(convolutionSum, product);

	138 }

	139 }

	140 _mm_storeu_ps(destP + i, convolutionSum);

	141 }

	142 } else {

	143 #endif

	144

100 // FIXME: The macro can be further optimized to avoid pipeline stalls. One possibility is to maintain 4 separate sums and change the macro to CONVOLVE_FOUR_SAMPLES.	145 // FIXME: The macro can be further optimized to avoid pipeline stalls. One possibility is to maintain 4 separate sums and change the macro to CONVOLVE_FOUR_SAMPLES.

101 #define CONVOLVE_ONE_SAMPLE \	146 #define CONVOLVE_ONE_SAMPLE \

102 do { \	147 do { \

103 sum += inputP[i - j] * kernelP[j]; \	148 sum += inputP[i - j] * kernelP[j]; \

104 j++; \	149 j++; \

105 } while (0)	150 } while (0)

106	151

107 size_t i = 0;

108 while (i < framesToProcess) {	152 while (i < framesToProcess) {

109 size_t j = 0;	153 size_t j = 0;

110 float sum = 0;	154 float sum = 0;

111	155

112 // FIXME: SSE optimization may be applied here.	156 // FIXME: SSE optimization may be applied here.

113 if (kernelSize == 32) {	157 if (kernelSize == 32) {

114 CONVOLVE_ONE_SAMPLE; // 1	158 CONVOLVE_ONE_SAMPLE; // 1

115 CONVOLVE_ONE_SAMPLE; // 2	159 CONVOLVE_ONE_SAMPLE; // 2

116 CONVOLVE_ONE_SAMPLE; // 3	160 CONVOLVE_ONE_SAMPLE; // 3

117 CONVOLVE_ONE_SAMPLE; // 4	161 CONVOLVE_ONE_SAMPLE; // 4

(...skipping 243 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
361 CONVOLVE_ONE_SAMPLE; // 127	405 CONVOLVE_ONE_SAMPLE; // 127

362 CONVOLVE_ONE_SAMPLE; // 128	406 CONVOLVE_ONE_SAMPLE; // 128

363 } else {	407 } else {

364 while (j < kernelSize) {	408 while (j < kernelSize) {

365 // Non-optimized using actual while loop.	409 // Non-optimized using actual while loop.

366 CONVOLVE_ONE_SAMPLE;	410 CONVOLVE_ONE_SAMPLE;

367 }	411 }

368 }	412 }

369 destP[i++] = sum;	413 destP[i++] = sum;

370 }	414 }

	415 #if CPU(X86) \|\| CPU(X86_64)

	416 }

	417 #endif

371 #endif // OS(MACOSX)	418 #endif // OS(MACOSX)

372	419

373 // Copy 2nd half of input buffer to 1st half.	420 // Copy 2nd half of input buffer to 1st half.

374 memcpy(m_buffer.data(), inputP, sizeof(float) * framesToProcess);	421 memcpy(m_buffer.data(), inputP, sizeof(float) * framesToProcess);

375 #endif	422 #endif

376 }	423 }

377	424

378 void DirectConvolver::reset()	425 void DirectConvolver::reset()

379 {	426 {

380 m_buffer.zero();	427 m_buffer.zero();

381 #if USE(WEBAUDIO_IPP)	428 #if USE(WEBAUDIO_IPP)

382 m_overlayBuffer.zero();	429 m_overlayBuffer.zero();

383 #endif // USE(WEBAUDIO_IPP)	430 #endif // USE(WEBAUDIO_IPP)

384 }	431 }

385	432

386 } // namespace blink	433 } // namespace blink

387	434

388 #endif // ENABLE(WEB_AUDIO)	435 #endif // ENABLE(WEB_AUDIO)

OLD	NEW

« no previous file with comments | « no previous file | no next file » | no next file with comments »