media/base/sinc_resampler.cc - Issue 12478002: Break out SSE functions into new media_sse target.

Side by Side Diff: media/base/sinc_resampler.cc

Issue 12478002: Break out SSE functions into new media_sse target. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Really fix iOS. Created 7 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4 //	4 //

5 // Input buffer layout, dividing the total buffer into regions (r0_ - r5_):	5 // Input buffer layout, dividing the total buffer into regions (r0_ - r5_):

6 //	6 //

7 // |----------------|-----------------------------------------|----------------|	7 // |----------------|-----------------------------------------|----------------|

8 //	8 //

9 // kBlockSize + kKernelSize / 2	9 // kBlockSize + kKernelSize / 2

10 // <--------------------------------------------------------->	10 // <--------------------------------------------------------->

(...skipping 22 matching lines...) Expand all Loading...
33	33

34 // MSVC++ requires this to be set before any other includes to get M_PI.	34 // MSVC++ requires this to be set before any other includes to get M_PI.

35 #define _USE_MATH_DEFINES	35 #define _USE_MATH_DEFINES

36	36

37 #include "media/base/sinc_resampler.h"	37 #include "media/base/sinc_resampler.h"

38	38

39 #include <cmath>	39 #include <cmath>

40	40

41 #include "base/cpu.h"	41 #include "base/cpu.h"

42 #include "base/logging.h"	42 #include "base/logging.h"

43 #include "build/build_config.h"

44

45 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)

46 #include <xmmintrin.h>

47 #endif

48	43

49 #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)	44 #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)

50 #include <arm_neon.h>	45 #include <arm_neon.h>

51 #endif	46 #endif

52	47

53 namespace media {	48 namespace media {

54	49

55 namespace {

56

57 enum {

58 // The kernel size can be adjusted for quality (higher is better) at the

59 // expense of performance. Must be a multiple of 32.

60 // TODO(dalecurtis): Test performance to see if we can jack this up to 64+.

61 kKernelSize = 32,

62

63 // The number of destination frames generated per processing pass. Affects

64 // how often and for how much SincResampler calls back for input. Must be

65 // greater than kKernelSize.

66 kBlockSize = 512,

67

68 // The kernel offset count is used for interpolation and is the number of

69 // sub-sample kernel shifts. Can be adjusted for quality (higher is better)

70 // at the expense of allocating more memory.

71 kKernelOffsetCount = 32,

72 kKernelStorageSize = kKernelSize * (kKernelOffsetCount + 1),

73

74 // The size (in samples) of the internal buffer used by the resampler.

75 kBufferSize = kBlockSize + kKernelSize

76 };

77

78 } // namespace

79

80 const int SincResampler::kMaximumLookAheadSize = kBufferSize;

81

82 SincResampler::SincResampler(double io_sample_rate_ratio, const ReadCB& read_cb)	50 SincResampler::SincResampler(double io_sample_rate_ratio, const ReadCB& read_cb)

83 : io_sample_rate_ratio_(io_sample_rate_ratio),	51 : io_sample_rate_ratio_(io_sample_rate_ratio),

84 virtual_source_idx_(0),	52 virtual_source_idx_(0),

85 buffer_primed_(false),	53 buffer_primed_(false),

86 read_cb_(read_cb),	54 read_cb_(read_cb),

87 // Create input buffers with a 16-byte alignment for SSE optimizations.	55 // Create input buffers with a 16-byte alignment for SSE optimizations.

88 kernel_storage_(static_cast<float*>(	56 kernel_storage_(static_cast<float*>(

89 base::AlignedAlloc(sizeof(float) * kKernelStorageSize, 16))),	57 base::AlignedAlloc(sizeof(float) * kKernelStorageSize, 16))),

90 input_buffer_(static_cast<float*>(	58 input_buffer_(static_cast<float*>(

91 base::AlignedAlloc(sizeof(float) * kBufferSize, 16))),	59 base::AlignedAlloc(sizeof(float) * kBufferSize, 16))),

(...skipping 123 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
215 // This wraps the last input frames back to the start of the buffer.	183 // This wraps the last input frames back to the start of the buffer.

216 memcpy(r1_, r3_, sizeof(*input_buffer_.get()) * (kKernelSize / 2));	184 memcpy(r1_, r3_, sizeof(*input_buffer_.get()) * (kKernelSize / 2));

217 memcpy(r2_, r4_, sizeof(*input_buffer_.get()) * (kKernelSize / 2));	185 memcpy(r2_, r4_, sizeof(*input_buffer_.get()) * (kKernelSize / 2));

218	186

219 // Step (4)	187 // Step (4)

220 // Refresh the buffer with more input.	188 // Refresh the buffer with more input.

221 read_cb_.Run(r5_, kBlockSize);	189 read_cb_.Run(r5_, kBlockSize);

222 }	190 }

223 }	191 }

224	192

225 int SincResampler::ChunkSize() {	193 int SincResampler::ChunkSize() const {

226 return kBlockSize / io_sample_rate_ratio_;	194 return kBlockSize / io_sample_rate_ratio_;

227 }	195 }

228	196

229 void SincResampler::Flush() {	197 void SincResampler::Flush() {

230 virtual_source_idx_ = 0;	198 virtual_source_idx_ = 0;

231 buffer_primed_ = false;	199 buffer_primed_ = false;

232 memset(input_buffer_.get(), 0, sizeof(*input_buffer_.get()) * kBufferSize);	200 memset(input_buffer_.get(), 0, sizeof(*input_buffer_.get()) * kBufferSize);

233 }	201 }

234	202

235 float SincResampler::Convolve(const float* input_ptr, const float* k1,	203 float SincResampler::Convolve(const float* input_ptr, const float* k1,

236 const float* k2,	204 const float* k2,

237 double kernel_interpolation_factor) {	205 double kernel_interpolation_factor) {

	206 // Ensure |k1|, |k2| are 16-byte aligned for SSE usage. Should always be true

	207 // so long as kKernelSize is a multiple of 16.

	208 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F);

	209 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F);

	210

238 // Rely on function level static initialization to keep ConvolveProc selection	211 // Rely on function level static initialization to keep ConvolveProc selection

239 // thread safe.	212 // thread safe.

240 typedef float (*ConvolveProc)(const float* src, const float* k1,	213 typedef float (*ConvolveProc)(const float* src, const float* k1,

241 const float* k2,	214 const float* k2,

242 double kernel_interpolation_factor);	215 double kernel_interpolation_factor);

243 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)	216 #if defined(ARCH_CPU_X86_FAMILY)

	217 #if defined(__SSE__)

	218 static const ConvolveProc kConvolveProc = Convolve_SSE;

	219 #else

244 static const ConvolveProc kConvolveProc =	220 static const ConvolveProc kConvolveProc =

245 base::CPU().has_sse() ? Convolve_SSE : Convolve_C;	221 base::CPU().has_sse() ? Convolve_SSE : Convolve_C;

	222 #endif

246 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)	223 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)

247 static const ConvolveProc kConvolveProc = Convolve_NEON;	224 static const ConvolveProc kConvolveProc = Convolve_NEON;

248 #else	225 #else

249 static const ConvolveProc kConvolveProc = Convolve_C;	226 static const ConvolveProc kConvolveProc = Convolve_C;

250 #endif	227 #endif

251	228

252 return kConvolveProc(input_ptr, k1, k2, kernel_interpolation_factor);	229 return kConvolveProc(input_ptr, k1, k2, kernel_interpolation_factor);

253 }	230 }

254	231

255 float SincResampler::Convolve_C(const float* input_ptr, const float* k1,	232 float SincResampler::Convolve_C(const float* input_ptr, const float* k1,

256 const float* k2,	233 const float* k2,

257 double kernel_interpolation_factor) {	234 double kernel_interpolation_factor) {

258 float sum1 = 0;	235 float sum1 = 0;

259 float sum2 = 0;	236 float sum2 = 0;

260	237

261 // Generate a single output sample. Unrolling this loop hurt performance in	238 // Generate a single output sample. Unrolling this loop hurt performance in

262 // local testing.	239 // local testing.

263 int n = kKernelSize;	240 int n = kKernelSize;

264 while (n--) {	241 while (n--) {

265 sum1 += *input_ptr * *k1++;	242 sum1 += *input_ptr * *k1++;

266 sum2 += *input_ptr++ * *k2++;	243 sum2 += *input_ptr++ * *k2++;

267 }	244 }

268	245

269 // Linearly interpolate the two "convolutions".	246 // Linearly interpolate the two "convolutions".

270 return (1.0 - kernel_interpolation_factor) * sum1	247 return (1.0 - kernel_interpolation_factor) * sum1

271 + kernel_interpolation_factor * sum2;	248 + kernel_interpolation_factor * sum2;

272 }	249 }

273	250

274 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)

275 float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1,

276 const float* k2,

277 double kernel_interpolation_factor) {

278 // Ensure |k1|, |k2| are 16-byte aligned for SSE usage. Should always be true

279 // so long as kKernelSize is a multiple of 16.

280 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F);

281 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F);

282

283 __m128 m_input;

284 __m128 m_sums1 = _mm_setzero_ps();

285 __m128 m_sums2 = _mm_setzero_ps();

286

287 // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling

288 // these loops hurt performance in local testing.

289 if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) {

290 for (int i = 0; i < kKernelSize; i += 4) {

291 m_input = _mm_loadu_ps(input_ptr + i);

292 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));

293 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));

294 }

295 } else {

296 for (int i = 0; i < kKernelSize; i += 4) {

297 m_input = _mm_load_ps(input_ptr + i);

298 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));

299 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));

300 }

301 }

302

303 // Linearly interpolate the two "convolutions".

304 m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1(1.0 - kernel_interpolation_factor));

305 m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1(kernel_interpolation_factor));

306 m_sums1 = _mm_add_ps(m_sums1, m_sums2);

307

308 // Sum components together.

309 float result;

310 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1);

311 _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps(

312 m_sums2, m_sums2, 1)));

313

314 return result;

315 }

316 #endif

317

318 #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)	251 #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)

319 float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1,	252 float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1,

320 const float* k2,	253 const float* k2,

321 double kernel_interpolation_factor) {	254 double kernel_interpolation_factor) {

322 float32x4_t m_input;	255 float32x4_t m_input;

323 float32x4_t m_sums1 = vmovq_n_f32(0);	256 float32x4_t m_sums1 = vmovq_n_f32(0);

324 float32x4_t m_sums2 = vmovq_n_f32(0);	257 float32x4_t m_sums2 = vmovq_n_f32(0);

325	258

326 const float* upper = input_ptr + kKernelSize;	259 const float* upper = input_ptr + kKernelSize;

327 for (; input_ptr < upper; ) {	260 for (; input_ptr < upper; ) {

(...skipping 10 matching lines...) Expand all Loading...
338 vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)),	271 vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)),

339 m_sums2, vmovq_n_f32(kernel_interpolation_factor));	272 m_sums2, vmovq_n_f32(kernel_interpolation_factor));

340	273

341 // Sum components together.	274 // Sum components together.

342 float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1));	275 float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1));

343 return vget_lane_f32(vpadd_f32(m_half, m_half), 0);	276 return vget_lane_f32(vpadd_f32(m_half, m_half), 0);

344 }	277 }

345 #endif	278 #endif

346	279

347 } // namespace media	280 } // namespace media

OLD	NEW

« no previous file with comments | « media/base/sinc_resampler.h ('k') | media/base/sinc_resampler_unittest.cc » ('j') | no next file with comments »