Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(187)

Side by Side Diff: media/base/sinc_resampler.cc

Issue 10803003: Add SSE optimizations to SincResampler. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Fischman Fixes! Created 8 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 // 4 //
5 // Input buffer layout, dividing the total buffer into regions (r0_ - r5_): 5 // Input buffer layout, dividing the total buffer into regions (r0_ - r5_):
6 // 6 //
7 // |----------------|-----------------------------------------|----------------| 7 // |----------------|-----------------------------------------|----------------|
8 // 8 //
9 // kBlockSize + kKernelSize / 2 9 // kBlockSize + kKernelSize / 2
10 // <---------------------------------------------------------> 10 // <--------------------------------------------------------->
(...skipping 18 matching lines...) Expand all
29 // 5) Goto (2) until all of input is consumed. 29 // 5) Goto (2) until all of input is consumed.
30 // 30 //
31 // Note: we're glossing over how the sub-sample handling works with 31 // Note: we're glossing over how the sub-sample handling works with
32 // |virtual_source_idx_|, etc. 32 // |virtual_source_idx_|, etc.
33 33
34 // MSVC++ requires this to be set before any other includes to get M_PI. 34 // MSVC++ requires this to be set before any other includes to get M_PI.
35 #define _USE_MATH_DEFINES 35 #define _USE_MATH_DEFINES
36 36
37 #include "media/base/sinc_resampler.h" 37 #include "media/base/sinc_resampler.h"
38 38
39 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
40 #include <xmmintrin.h>
41 #endif
39 #include <cmath> 42 #include <cmath>
40 43
44 #include "base/cpu.h"
41 #include "base/logging.h" 45 #include "base/logging.h"
42 46
43 namespace media { 47 namespace media {
44 48
45 enum { 49 enum {
46 // The kernel size can be adjusted for quality (higher is better) at the 50 // The kernel size can be adjusted for quality (higher is better) at the
47 // expense of performance. Must be an even number. 51 // expense of performance. Must be an even number.
48 // TODO(dalecurtis): Test performance to see if we can jack this up to 64+. 52 // TODO(dalecurtis): Test performance to see if we can jack this up to 64+.
49 kKernelSize = 32, 53 kKernelSize = 32,
50 54
(...skipping 10 matching lines...) Expand all
61 65
62 // The size (in samples) of the internal buffer used by the resampler. 66 // The size (in samples) of the internal buffer used by the resampler.
63 kBufferSize = kBlockSize + kKernelSize 67 kBufferSize = kBlockSize + kKernelSize
64 }; 68 };
65 69
66 SincResampler::SincResampler(double io_sample_rate_ratio, const ReadCB& read_cb) 70 SincResampler::SincResampler(double io_sample_rate_ratio, const ReadCB& read_cb)
67 : io_sample_rate_ratio_(io_sample_rate_ratio), 71 : io_sample_rate_ratio_(io_sample_rate_ratio),
68 virtual_source_idx_(0), 72 virtual_source_idx_(0),
69 buffer_primed_(false), 73 buffer_primed_(false),
70 read_cb_(read_cb), 74 read_cb_(read_cb),
71 // TODO(dalecurtis): When we switch to AVX/SSE optimization, we'll need to 75 // TODO(dalecurtis): Switch these to AlignedMemory<> instances.
72 // allocate with 32-byte alignment and ensure they're sized % 32 bytes.
73 kernel_storage_(new float[kKernelStorageSize]), 76 kernel_storage_(new float[kKernelStorageSize]),
74 input_buffer_(new float[kBufferSize]), 77 input_buffer_(new float[kBufferSize]),
75 // Setup various region pointers in the buffer (see diagram above). 78 // Setup various region pointers in the buffer (see diagram above).
76 r0_(input_buffer_.get() + kKernelSize / 2), 79 r0_(input_buffer_.get() + kKernelSize / 2),
77 r1_(input_buffer_.get()), 80 r1_(input_buffer_.get()),
78 r2_(r0_), 81 r2_(r0_),
79 r3_(r0_ + kBlockSize - kKernelSize / 2), 82 r3_(r0_ + kBlockSize - kKernelSize / 2),
80 r4_(r0_ + kBlockSize), 83 r4_(r0_ + kBlockSize),
81 r5_(r0_ + kKernelSize / 2) { 84 r5_(r0_ + kKernelSize / 2) {
82 DCHECK_EQ(kKernelSize % 2, 0) << "kKernelSize must be even!"; 85 DCHECK_EQ(kKernelSize % 2, 0) << "kKernelSize must be even!";
(...skipping 78 matching lines...) Expand 10 before | Expand all | Expand 10 after
161 while (remaining_frames) { 164 while (remaining_frames) {
162 while (virtual_source_idx_ < kBlockSize) { 165 while (virtual_source_idx_ < kBlockSize) {
163 // |virtual_source_idx_| lies in between two kernel offsets so figure out 166 // |virtual_source_idx_| lies in between two kernel offsets so figure out
164 // what they are. 167 // what they are.
165 int source_idx = static_cast<int>(virtual_source_idx_); 168 int source_idx = static_cast<int>(virtual_source_idx_);
166 double subsample_remainder = virtual_source_idx_ - source_idx; 169 double subsample_remainder = virtual_source_idx_ - source_idx;
167 170
168 double virtual_offset_idx = subsample_remainder * kKernelOffsetCount; 171 double virtual_offset_idx = subsample_remainder * kKernelOffsetCount;
169 int offset_idx = static_cast<int>(virtual_offset_idx); 172 int offset_idx = static_cast<int>(virtual_offset_idx);
170 173
174 // We'll compute "convolutions" for the two kernels which straddle
175 // |virtual_source_idx_|.
171 float* k1 = kernel_storage_.get() + offset_idx * kKernelSize; 176 float* k1 = kernel_storage_.get() + offset_idx * kKernelSize;
172 float* k2 = k1 + kKernelSize; 177 float* k2 = k1 + kKernelSize;
173 178
174 // Initialize input pointer based on quantized |virtual_source_idx_|. 179 // Initialize input pointer based on quantized |virtual_source_idx_|.
175 float* input_ptr = r1_ + source_idx; 180 float* input_ptr = r1_ + source_idx;
176 181
177 // We'll compute "convolutions" for the two kernels which straddle
178 // |virtual_source_idx_|.
179 float sum1 = 0;
180 float sum2 = 0;
181
182 // Figure out how much to weight each kernel's "convolution". 182 // Figure out how much to weight each kernel's "convolution".
183 double kernel_interpolation_factor = virtual_offset_idx - offset_idx; 183 double kernel_interpolation_factor = virtual_offset_idx - offset_idx;
184 184 *destination++ = Convolve(
185 // Generate a single output sample. 185 input_ptr, k1, k2, kernel_interpolation_factor);
186 int n = kKernelSize;
187 float input;
188 // TODO(dalecurtis): For initial commit, I've ripped out all the SSE
189 // optimizations, these definitely need to go back in before release.
190 while (n--) {
191 input = *input_ptr++;
192 sum1 += input * *k1++;
193 sum2 += input * *k2++;
194 }
195
196 // Linearly interpolate the two "convolutions".
197 double result = (1.0 - kernel_interpolation_factor) * sum1
198 + kernel_interpolation_factor * sum2;
199
200 *destination++ = result;
201 186
202 // Advance the virtual index. 187 // Advance the virtual index.
203 virtual_source_idx_ += io_sample_rate_ratio_; 188 virtual_source_idx_ += io_sample_rate_ratio_;
204 189
205 if (!--remaining_frames) 190 if (!--remaining_frames)
206 return; 191 return;
207 } 192 }
208 193
209 // Wrap back around to the start. 194 // Wrap back around to the start.
210 virtual_source_idx_ -= kBlockSize; 195 virtual_source_idx_ -= kBlockSize;
211 196
212 // Step (3) Copy r3_ to r1_ and r4_ to r2_. 197 // Step (3) Copy r3_ to r1_ and r4_ to r2_.
213 // This wraps the last input frames back to the start of the buffer. 198 // This wraps the last input frames back to the start of the buffer.
214 memcpy(r1_, r3_, sizeof(*input_buffer_.get()) * (kKernelSize / 2)); 199 memcpy(r1_, r3_, sizeof(*input_buffer_.get()) * (kKernelSize / 2));
215 memcpy(r2_, r4_, sizeof(*input_buffer_.get()) * (kKernelSize / 2)); 200 memcpy(r2_, r4_, sizeof(*input_buffer_.get()) * (kKernelSize / 2));
216 201
217 // Step (4) 202 // Step (4)
218 // Refresh the buffer with more input. 203 // Refresh the buffer with more input.
219 read_cb_.Run(r5_, kBlockSize); 204 read_cb_.Run(r5_, kBlockSize);
220 } 205 }
221 } 206 }
222 207
223 int SincResampler::ChunkSize() { 208 int SincResampler::ChunkSize() {
224 return kBlockSize / io_sample_rate_ratio_; 209 return kBlockSize / io_sample_rate_ratio_;
225 } 210 }
226 211
212 float SincResampler::Convolve(const float* input_ptr, const float* k1,
213 const float* k2,
214 double kernel_interpolation_factor) {
215 // Rely on function level static initialization to keep ConvolveProc selection
216 // thread safe.
217 typedef float (*ConvolveProc)(const float* src, const float* k1,
218 const float* k2,
219 double kernel_interpolation_factor);
220 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
221 static const ConvolveProc kConvolveProc =
222 base::CPU().has_sse() ? Convolve_SSE : Convolve_C;
223 #else
224 static const ConvolveProc kConvolveProc = Convolve_C;
225 #endif
226
227 return kConvolveProc(input_ptr, k1, k2, kernel_interpolation_factor);
228 }
229
230 float SincResampler::Convolve_C(const float* input_ptr, const float* k1,
231 const float* k2,
232 double kernel_interpolation_factor) {
233 float sum1 = 0;
234 float sum2 = 0;
235
236 // Generate a single output sample. Unrolling this loop hurt performance in
237 // local testing.
238 int n = kKernelSize;
239 while (n--) {
240 sum1 += *input_ptr * *k1++;
241 sum2 += *input_ptr++ * *k2++;
242 }
243
244 // Linearly interpolate the two "convolutions".
245 return (1.0 - kernel_interpolation_factor) * sum1
246 + kernel_interpolation_factor * sum2;
247 }
248
#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
// SSE convolution: processes four samples per iteration and finishes with a
// horizontal add of the interpolated partial sums.
float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1,
                                  const float* k2,
                                  double kernel_interpolation_factor) {
  // The kernel pointers must be 16-byte aligned for the aligned loads below.
  // This holds so long as kKernelSize is a multiple of 16.
  DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F);
  DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F);

  __m128 partial1 = _mm_setzero_ps();
  __m128 partial2 = _mm_setzero_ps();

  // |input_ptr| may not be aligned, so choose loadu vs. load per call.
  // Unrolling these loops hurt performance in local testing.
  const bool input_aligned =
      (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) == 0;
  if (input_aligned) {
    for (int i = 0; i < kKernelSize; i += 4) {
      const __m128 samples = _mm_load_ps(input_ptr + i);
      partial1 = _mm_add_ps(partial1,
                            _mm_mul_ps(samples, _mm_load_ps(k1 + i)));
      partial2 = _mm_add_ps(partial2,
                            _mm_mul_ps(samples, _mm_load_ps(k2 + i)));
    }
  } else {
    for (int i = 0; i < kKernelSize; i += 4) {
      const __m128 samples = _mm_loadu_ps(input_ptr + i);
      partial1 = _mm_add_ps(partial1,
                            _mm_mul_ps(samples, _mm_load_ps(k1 + i)));
      partial2 = _mm_add_ps(partial2,
                            _mm_mul_ps(samples, _mm_load_ps(k2 + i)));
    }
  }

  // Linearly interpolate the two "convolutions".
  partial1 = _mm_mul_ps(partial1,
                        _mm_set_ps1(1.0 - kernel_interpolation_factor));
  partial2 = _mm_mul_ps(partial2, _mm_set_ps1(kernel_interpolation_factor));
  partial1 = _mm_add_ps(partial1, partial2);

  // Reduce the four lanes of |partial1| to a single float.
  float result;
  partial2 = _mm_add_ps(_mm_movehl_ps(partial1, partial1), partial1);
  _mm_store_ss(&result, _mm_add_ss(partial2, _mm_shuffle_ps(
      partial2, partial2, 1)));

  return result;
}
#endif
292
227 } // namespace media 293 } // namespace media
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698