media/base/sinc_resampler.cc - Issue 10803003: Add SSE optimizations to SincResampler.

Side by Side Diff: media/base/sinc_resampler.cc

Issue 10803003: Add SSE optimizations to SincResampler. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Created 8 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4 //	4 //

5 // Input buffer layout, dividing the total buffer into regions (r0_ - r5_):	5 // Input buffer layout, dividing the total buffer into regions (r0_ - r5_):

6 //	6 //

7 // \|----------------\|-----------------------------------------\|----------------\|	7 // \|----------------\|-----------------------------------------\|----------------\|

8 //	8 //

9 // kBlockSize + kKernelSize / 2	9 // kBlockSize + kKernelSize / 2

10 // <--------------------------------------------------------->	10 // <--------------------------------------------------------->

(...skipping 20 matching lines...) Expand all Loading...
31 // Note: we're glossing over how the sub-sample handling works with	31 // Note: we're glossing over how the sub-sample handling works with

32 // \|virtual_source_idx_\|, etc.	32 // \|virtual_source_idx_\|, etc.

33	33

34 // MSVC++ requires this to be set before any other includes to get M_PI.	34 // MSVC++ requires this to be set before any other includes to get M_PI.

35 #define _USE_MATH_DEFINES	35 #define _USE_MATH_DEFINES

36	36

37 #include "media/base/sinc_resampler.h"	37 #include "media/base/sinc_resampler.h"

38	38

39 #include <cmath>	39 #include <cmath>

40	40

	41 #include "base/cpu.h"

41 #include "base/logging.h"	42 #include "base/logging.h"

42	43

	44 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)

	45 #include <xmmintrin.h>

	46 #endif

	47

43 namespace media {	48 namespace media {

44	49

45 enum {	50 enum {

46 // The kernel size can be adjusted for quality (higher is better) at the	51 // The kernel size can be adjusted for quality (higher is better) at the

47 // expense of performance. Must be an even number.	52 // expense of performance. Must be an even number.

48 // TODO(dalecurtis): Test performance to see if we can jack this up to 64+.	53 // TODO(dalecurtis): Test performance to see if we can jack this up to 64+.

49 kKernelSize = 32,	54 kKernelSize = 32,

50	55

51 // The number of destination frames generated per processing pass. Affects	56 // The number of destination frames generated per processing pass. Affects

52 // how often and for how much SincResampler calls back for input. Must be	57 // how often and for how much SincResampler calls back for input. Must be

53 // greater than kKernelSize.	58 // greater than kKernelSize.

54 kBlockSize = 512,	59 kBlockSize = 512,

55	60

56 // The kernel offset count is used for interpolation and is the number of	61 // The kernel offset count is used for interpolation and is the number of

57 // sub-sample kernel shifts. Can be adjusted for quality (higher is better)	62 // sub-sample kernel shifts. Can be adjusted for quality (higher is better)

58 // at the expense of allocating more memory.	63 // at the expense of allocating more memory.

59 kKernelOffsetCount = 32,	64 kKernelOffsetCount = 32,

60 kKernelStorageSize = kKernelSize * (kKernelOffsetCount + 1),	65 kKernelStorageSize = kKernelSize * (kKernelOffsetCount + 1),

61	66

62 // The size (in samples) of the internal buffer used by the resampler.	67 // The size (in samples) of the internal buffer used by the resampler.

63 kBufferSize = kBlockSize + kKernelSize	68 kBufferSize = kBlockSize + kKernelSize

64 };	69 };

65	70

	71

	72 #define CONVOLVE_ONE_SAMPLE \
	Ami GONE FROM CHROMIUM 2012/07/18 04:40:35 #undef this when done with it. Except you only use this once, so why not just inline? DaleCurtis 2012/07/21 02:35:14 On 2012/07/18 04:40:35, Ami Fischman wrote: > #undef this when done with it. > Except you only use this once, so why not just inline? Good point. Leftovers from the WebKit version of the optimizations.
	73 sum1 += *input_ptr * *k1++; \

	74 sum2 += *input_ptr++ * *k2++;

	75

	76 static double Convolve_C(float* input_ptr, float* k1, float* k2,

	77 double kernel_interpolation_factor) {

	78 float sum1 = 0;

	79 float sum2 = 0;

	80

	81 // Generate a single output sample. Unrolling this loop hurt performance in

	82 // local testing.

	83 int n = kKernelSize;

	84 while (n--) {

	85 CONVOLVE_ONE_SAMPLE

	86 }

	87

	88 // Linearly interpolate the two "convolutions".

	89 return (1.0 - kernel_interpolation_factor) * sum1

	90 + kernel_interpolation_factor * sum2;

	91 }

	92

	93 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)

	94 #define CONVOLVE_4_SAMPLES(load) \
	Ami GONE FROM CHROMIUM 2012/07/18 04:40:35 Any reason to use a #define instead of an inline function? DaleCurtis 2012/07/21 02:35:14 On 2012/07/18 04:40:35, Ami Fischman wrote: > Any reason to use a #define instead of an inline function? Not sure how that'd work since we're modifying the instruction used. In any case I've moved this directly into each for loop for readability.
	95 m_input = _mm_##load##_ps(input_ptr + i); \

	96 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); \

	97 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));

	98

	99 static const int kFloatsPerPass = sizeof(__m128) / sizeof(float);
	Ami GONE FROM CHROMIUM 2012/07/18 04:40:35 Not sure what this generality buys you given you've hardcoded _4_ above already... DaleCurtis 2012/07/21 02:35:14 On 2012/07/18 04:40:35, Ami Fischman wrote: > Not sure what this generality buys you given you've hardcoded _4_ above > already... Removed.
	100 static double Convolve_SSE(float* input_ptr, float* k1, float* k2,
	Ami GONE FROM CHROMIUM 2012/07/18 04:40:35 i can haz a test that shows the two Convolve_* functions return equal values? (obvs., that test would only run on some machines) DaleCurtis 2012/07/21 02:35:14 On 2012/07/18 04:40:35, Ami Fischman wrote: > i can haz a test that shows the two Convolve_* functions return equal values? > (obvs., that test would only run on some machines) Done.
	101 double kernel_interpolation_factor) {

	102 // Ensure \|k1\|, \|k2\| are aligned for SSE usage. Should always be true so long

	103 // as kKernelSize is a power of 2.

	104 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & (sizeof(__m128) - 1));

	105 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & (sizeof(__m128) - 1));

	106

	107 __m128 m_input;

	108 __m128 m_sums1 = _mm_setzero_ps();

	109 __m128 m_sums2 = _mm_setzero_ps();

	110

	111 // Based on \|input_ptr\| alignment, we need to use loadu or load. Unrolling

	112 // these loops hurt performance in local testing.

	113 if (reinterpret_cast<uintptr_t>(input_ptr) & (sizeof(__m128) - 1)) {

	114 for (int i = 0; i < kKernelSize; i += kFloatsPerPass) {

	115 CONVOLVE_4_SAMPLES(loadu)

	116 }

	117 } else {

	118 for (int i = 0; i < kKernelSize; i += kFloatsPerPass) {

	119 CONVOLVE_4_SAMPLES(load)

	120 }

	121 }

	122

	123 // Linearly interpolate the two "convolutions".

	124 m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1(1.0 - kernel_interpolation_factor));

	125 m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1(kernel_interpolation_factor));

	126 m_sums1 = _mm_add_ps(m_sums1, m_sums2);

	127

	128 // Sum components together.

	129 float result;

	130 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1);

	131 _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps(

	132 m_sums2, m_sums2, 1)));

	133

	134 return result;

	135 }

	136 #endif

	137

	138 typedef double (*ConvolveProc)(float* src, float* k1, float* k2,

	139 double kernel_interpolation_factor);

	140 static ConvolveProc convolve_proc = NULL;

	141

66 SincResampler::SincResampler(double io_sample_rate_ratio, const ReadCB& read_cb)	142 SincResampler::SincResampler(double io_sample_rate_ratio, const ReadCB& read_cb)

67 : io_sample_rate_ratio_(io_sample_rate_ratio),	143 : io_sample_rate_ratio_(io_sample_rate_ratio),

68 virtual_source_idx_(0),	144 virtual_source_idx_(0),

69 buffer_primed_(false),	145 buffer_primed_(false),

70 read_cb_(read_cb),	146 read_cb_(read_cb),

71 // TODO(dalecurtis): When we switch to AVX/SSE optimization, we'll need to	147 // TODO(dalecurtis): When we switch to SSE optimization, we'll need to

72 // allocate with 32-byte alignment and ensure they're sized % 32 bytes.	148 // allocate with 16-byte alignment (default linux, mac, not win)
	Ami GONE FROM CHROMIUM 2012/07/18 04:40:35 IDK what this means DaleCurtis 2012/07/21 02:35:14 On 2012/07/18 04:40:35, Ami Fischman wrote: > IDK what this means Changed. Was a reminder to me to land my AlignedMemory CL and change this before this lands.
73 kernel_storage_(new float[kKernelStorageSize]),	149 kernel_storage_(new float[kKernelStorageSize]),

74 input_buffer_(new float[kBufferSize]),	150 input_buffer_(new float[kBufferSize]),

75 // Setup various region pointers in the buffer (see diagram above).	151 // Setup various region pointers in the buffer (see diagram above).

76 r0_(input_buffer_.get() + kKernelSize / 2),	152 r0_(input_buffer_.get() + kKernelSize / 2),

77 r1_(input_buffer_.get()),	153 r1_(input_buffer_.get()),

78 r2_(r0_),	154 r2_(r0_),

79 r3_(r0_ + kBlockSize - kKernelSize / 2),	155 r3_(r0_ + kBlockSize - kKernelSize / 2),

80 r4_(r0_ + kBlockSize),	156 r4_(r0_ + kBlockSize),

81 r5_(r0_ + kKernelSize / 2) {	157 r5_(r0_ + kKernelSize / 2) {

82 DCHECK_EQ(kKernelSize % 2, 0) << "kKernelSize must be even!";	158 DCHECK_EQ(kKernelSize % 2, 0) << "kKernelSize must be even!";

83 DCHECK_GT(kBlockSize, kKernelSize)	159 DCHECK_GT(kBlockSize, kKernelSize)

84 << "kBlockSize must be greater than kKernelSize!";	160 << "kBlockSize must be greater than kKernelSize!";

85 // Basic sanity checks to ensure buffer regions are laid out correctly:	161 // Basic sanity checks to ensure buffer regions are laid out correctly:

86 // r0_ and r2_ should always be the same position.	162 // r0_ and r2_ should always be the same position.

87 DCHECK_EQ(r0_, r2_);	163 DCHECK_EQ(r0_, r2_);

88 // r1_ at the beginning of the buffer.	164 // r1_ at the beginning of the buffer.

89 DCHECK_EQ(r1_, input_buffer_.get());	165 DCHECK_EQ(r1_, input_buffer_.get());

90 // r1_ left of r2_, r2_ left of r5_ and r1_, r2_ size correct.	166 // r1_ left of r2_, r2_ left of r5_ and r1_, r2_ size correct.

91 DCHECK_EQ(r2_ - r1_, r5_ - r2_);	167 DCHECK_EQ(r2_ - r1_, r5_ - r2_);

92 // r3_ left of r4_, r5_ left of r0_ and r3_ size correct.	168 // r3_ left of r4_, r5_ left of r0_ and r3_ size correct.

93 DCHECK_EQ(r4_ - r3_, r5_ - r0_);	169 DCHECK_EQ(r4_ - r3_, r5_ - r0_);

94 // r3_, r4_ size correct and r4_ at the end of the buffer.	170 // r3_, r4_ size correct and r4_ at the end of the buffer.

95 DCHECK_EQ(r4_ + (r4_ - r3_), r1_ + kBufferSize);	171 DCHECK_EQ(r4_ + (r4_ - r3_), r1_ + kBufferSize);

96 // r5_ size correct and at the end of the buffer.	172 // r5_ size correct and at the end of the buffer.

97 DCHECK_EQ(r5_ + kBlockSize, r1_ + kBufferSize);	173 DCHECK_EQ(r5_ + kBlockSize, r1_ + kBufferSize);

98	174

	175 if (!convolve_proc) {
	Ami GONE FROM CHROMIUM 2012/07/18 04:40:35 This is racy. DaleCurtis 2012/07/21 02:35:14 On 2012/07/18 04:40:35, Ami Fischman wrote: > This is racy. Done.
	176 convolve_proc = Convolve_C;

	177 base::CPU cpu;

	178 if (cpu.has_sse())

	179 convolve_proc = Convolve_SSE;
	Ami GONE FROM CHROMIUM 2012/07/18 04:40:35 Does this compile on build platforms that don't qualify for the #ifdef above? DaleCurtis 2012/07/21 02:35:14 On 2012/07/18 04:40:35, Ami Fischman wrote: > Does this compile on build platforms that don't qualify for the #ifdef above? Done.
	180 }

	181

99 memset(kernel_storage_.get(), 0,	182 memset(kernel_storage_.get(), 0,

100 sizeof(*kernel_storage_.get()) * kKernelStorageSize);	183 sizeof(*kernel_storage_.get()) * kKernelStorageSize);

101 memset(input_buffer_.get(), 0, sizeof(*input_buffer_.get()) * kBufferSize);	184 memset(input_buffer_.get(), 0, sizeof(*input_buffer_.get()) * kBufferSize);

102	185

103 InitializeKernel();	186 InitializeKernel();

104 }	187 }

105	188

106 SincResampler::~SincResampler() {}	189 SincResampler::~SincResampler() {}

107	190

108 void SincResampler::InitializeKernel() {	191 void SincResampler::InitializeKernel() {

(...skipping 52 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
161 while (remaining_frames) {	244 while (remaining_frames) {

162 while (virtual_source_idx_ < kBlockSize) {	245 while (virtual_source_idx_ < kBlockSize) {

163 // \|virtual_source_idx_\| lies in between two kernel offsets so figure out	246 // \|virtual_source_idx_\| lies in between two kernel offsets so figure out

164 // what they are.	247 // what they are.

165 int source_idx = static_cast<int>(virtual_source_idx_);	248 int source_idx = static_cast<int>(virtual_source_idx_);

166 double subsample_remainder = virtual_source_idx_ - source_idx;	249 double subsample_remainder = virtual_source_idx_ - source_idx;

167	250

168 double virtual_offset_idx = subsample_remainder * kKernelOffsetCount;	251 double virtual_offset_idx = subsample_remainder * kKernelOffsetCount;

169 int offset_idx = static_cast<int>(virtual_offset_idx);	252 int offset_idx = static_cast<int>(virtual_offset_idx);

170	253

	254 // We'll compute "convolutions" for the two kernels which straddle

	255 // \|virtual_source_idx_\|.

171 float* k1 = kernel_storage_.get() + offset_idx * kKernelSize;	256 float* k1 = kernel_storage_.get() + offset_idx * kKernelSize;

172 float* k2 = k1 + kKernelSize;	257 float* k2 = k1 + kKernelSize;

173	258

174 // Initialize input pointer based on quantized \|virtual_source_idx_\|.	259 // Initialize input pointer based on quantized \|virtual_source_idx_\|.

175 float* input_ptr = r1_ + source_idx;	260 float* input_ptr = r1_ + source_idx;

176	261

177 // We'll compute "convolutions" for the two kernels which straddle

178 // \|virtual_source_idx_\|.

179 float sum1 = 0;

180 float sum2 = 0;

181

182 // Figure out how much to weight each kernel's "convolution".	262 // Figure out how much to weight each kernel's "convolution".

183 double kernel_interpolation_factor = virtual_offset_idx - offset_idx;	263 double kernel_interpolation_factor = virtual_offset_idx - offset_idx;

184	264 *destination++ = convolve_proc(

185 // Generate a single output sample.	265 input_ptr, k1, k2, kernel_interpolation_factor);

186 int n = kKernelSize;

187 float input;

188 // TODO(dalecurtis): For initial commit, I've ripped out all the SSE

189 // optimizations, these definitely need to go back in before release.

190 while (n--) {

191 input = *input_ptr++;

192 sum1 += input * *k1++;

193 sum2 += input * *k2++;

194 }

195

196 // Linearly interpolate the two "convolutions".

197 double result = (1.0 - kernel_interpolation_factor) * sum1

198 + kernel_interpolation_factor * sum2;

199

200 *destination++ = result;

201	266

202 // Advance the virtual index.	267 // Advance the virtual index.

203 virtual_source_idx_ += io_sample_rate_ratio_;	268 virtual_source_idx_ += io_sample_rate_ratio_;

204	269

205 if (!--remaining_frames)	270 if (!--remaining_frames)

206 return;	271 return;

207 }	272 }

208	273

209 // Wrap back around to the start.	274 // Wrap back around to the start.

210 virtual_source_idx_ -= kBlockSize;	275 virtual_source_idx_ -= kBlockSize;

211	276

212 // Step (3) Copy r3_ to r1_ and r4_ to r2_.	277 // Step (3) Copy r3_ to r1_ and r4_ to r2_.

213 // This wraps the last input frames back to the start of the buffer.	278 // This wraps the last input frames back to the start of the buffer.

214 memcpy(r1_, r3_, sizeof(*input_buffer_.get()) * (kKernelSize / 2));	279 memcpy(r1_, r3_, sizeof(*input_buffer_.get()) * (kKernelSize / 2));

215 memcpy(r2_, r4_, sizeof(*input_buffer_.get()) * (kKernelSize / 2));	280 memcpy(r2_, r4_, sizeof(*input_buffer_.get()) * (kKernelSize / 2));

216	281

217 // Step (4)	282 // Step (4)

218 // Refresh the buffer with more input.	283 // Refresh the buffer with more input.

219 read_cb_.Run(r5_, kBlockSize);	284 read_cb_.Run(r5_, kBlockSize);

220 }	285 }

221 }	286 }

222	287

223 int SincResampler::ChunkSize() {	288 int SincResampler::ChunkSize() {

224 return kBlockSize / io_sample_rate_ratio_;	289 return kBlockSize / io_sample_rate_ratio_;

225 }	290 }

226	291

227 } // namespace media	292 } // namespace media

OLD	NEW

« no previous file with comments | « no previous file | no next file » | no next file with comments »