Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(149)

Side by Side Diff: media/base/sinc_resampler.cc

Issue 12478002: Break out SSE functions into new media_sse target. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Really fix iOS. Created 7 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « media/base/sinc_resampler.h ('k') | media/base/sinc_resampler_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 // 4 //
5 // Input buffer layout, dividing the total buffer into regions (r0_ - r5_): 5 // Input buffer layout, dividing the total buffer into regions (r0_ - r5_):
6 // 6 //
7 // |----------------|-----------------------------------------|----------------| 7 // |----------------|-----------------------------------------|----------------|
8 // 8 //
9 // kBlockSize + kKernelSize / 2 9 // kBlockSize + kKernelSize / 2
10 // <---------------------------------------------------------> 10 // <--------------------------------------------------------->
(...skipping 22 matching lines...) Expand all
33 33
34 // MSVC++ requires this to be set before any other includes to get M_PI. 34 // MSVC++ requires this to be set before any other includes to get M_PI.
35 #define _USE_MATH_DEFINES 35 #define _USE_MATH_DEFINES
36 36
37 #include "media/base/sinc_resampler.h" 37 #include "media/base/sinc_resampler.h"
38 38
39 #include <cmath> 39 #include <cmath>
40 40
41 #include "base/cpu.h" 41 #include "base/cpu.h"
42 #include "base/logging.h" 42 #include "base/logging.h"
43 #include "build/build_config.h"
44
45 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
46 #include <xmmintrin.h>
47 #endif
48 43
49 #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) 44 #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
50 #include <arm_neon.h> 45 #include <arm_neon.h>
51 #endif 46 #endif
52 47
53 namespace media { 48 namespace media {
54 49
55 namespace {
56
57 enum {
58 // The kernel size can be adjusted for quality (higher is better) at the
59 // expense of performance. Must be a multiple of 32.
60 // TODO(dalecurtis): Test performance to see if we can jack this up to 64+.
61 kKernelSize = 32,
62
63 // The number of destination frames generated per processing pass. Affects
64 // how often and for how much SincResampler calls back for input. Must be
65 // greater than kKernelSize.
66 kBlockSize = 512,
67
68 // The kernel offset count is used for interpolation and is the number of
69 // sub-sample kernel shifts. Can be adjusted for quality (higher is better)
70 // at the expense of allocating more memory.
71 kKernelOffsetCount = 32,
72 kKernelStorageSize = kKernelSize * (kKernelOffsetCount + 1),
73
74 // The size (in samples) of the internal buffer used by the resampler.
75 kBufferSize = kBlockSize + kKernelSize
76 };
77
78 } // namespace
79
80 const int SincResampler::kMaximumLookAheadSize = kBufferSize;
81
82 SincResampler::SincResampler(double io_sample_rate_ratio, const ReadCB& read_cb) 50 SincResampler::SincResampler(double io_sample_rate_ratio, const ReadCB& read_cb)
83 : io_sample_rate_ratio_(io_sample_rate_ratio), 51 : io_sample_rate_ratio_(io_sample_rate_ratio),
84 virtual_source_idx_(0), 52 virtual_source_idx_(0),
85 buffer_primed_(false), 53 buffer_primed_(false),
86 read_cb_(read_cb), 54 read_cb_(read_cb),
87 // Create input buffers with a 16-byte alignment for SSE optimizations. 55 // Create input buffers with a 16-byte alignment for SSE optimizations.
88 kernel_storage_(static_cast<float*>( 56 kernel_storage_(static_cast<float*>(
89 base::AlignedAlloc(sizeof(float) * kKernelStorageSize, 16))), 57 base::AlignedAlloc(sizeof(float) * kKernelStorageSize, 16))),
90 input_buffer_(static_cast<float*>( 58 input_buffer_(static_cast<float*>(
91 base::AlignedAlloc(sizeof(float) * kBufferSize, 16))), 59 base::AlignedAlloc(sizeof(float) * kBufferSize, 16))),
(...skipping 123 matching lines...) Expand 10 before | Expand all | Expand 10 after
215 // This wraps the last input frames back to the start of the buffer. 183 // This wraps the last input frames back to the start of the buffer.
216 memcpy(r1_, r3_, sizeof(*input_buffer_.get()) * (kKernelSize / 2)); 184 memcpy(r1_, r3_, sizeof(*input_buffer_.get()) * (kKernelSize / 2));
217 memcpy(r2_, r4_, sizeof(*input_buffer_.get()) * (kKernelSize / 2)); 185 memcpy(r2_, r4_, sizeof(*input_buffer_.get()) * (kKernelSize / 2));
218 186
219 // Step (4) 187 // Step (4)
220 // Refresh the buffer with more input. 188 // Refresh the buffer with more input.
221 read_cb_.Run(r5_, kBlockSize); 189 read_cb_.Run(r5_, kBlockSize);
222 } 190 }
223 } 191 }
224 192
225 int SincResampler::ChunkSize() { 193 int SincResampler::ChunkSize() const {
226 return kBlockSize / io_sample_rate_ratio_; 194 return kBlockSize / io_sample_rate_ratio_;
227 } 195 }
228 196
229 void SincResampler::Flush() { 197 void SincResampler::Flush() {
230 virtual_source_idx_ = 0; 198 virtual_source_idx_ = 0;
231 buffer_primed_ = false; 199 buffer_primed_ = false;
232 memset(input_buffer_.get(), 0, sizeof(*input_buffer_.get()) * kBufferSize); 200 memset(input_buffer_.get(), 0, sizeof(*input_buffer_.get()) * kBufferSize);
233 } 201 }
234 202
235 float SincResampler::Convolve(const float* input_ptr, const float* k1, 203 float SincResampler::Convolve(const float* input_ptr, const float* k1,
236 const float* k2, 204 const float* k2,
237 double kernel_interpolation_factor) { 205 double kernel_interpolation_factor) {
206 // Ensure |k1|, |k2| are 16-byte aligned for SSE usage. Should always be true
207 // so long as kKernelSize is a multiple of 16.
208 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F);
209 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F);
210
238 // Rely on function level static initialization to keep ConvolveProc selection 211 // Rely on function level static initialization to keep ConvolveProc selection
239 // thread safe. 212 // thread safe.
240 typedef float (*ConvolveProc)(const float* src, const float* k1, 213 typedef float (*ConvolveProc)(const float* src, const float* k1,
241 const float* k2, 214 const float* k2,
242 double kernel_interpolation_factor); 215 double kernel_interpolation_factor);
243 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) 216 #if defined(ARCH_CPU_X86_FAMILY)
217 #if defined(__SSE__)
218 static const ConvolveProc kConvolveProc = Convolve_SSE;
219 #else
244 static const ConvolveProc kConvolveProc = 220 static const ConvolveProc kConvolveProc =
245 base::CPU().has_sse() ? Convolve_SSE : Convolve_C; 221 base::CPU().has_sse() ? Convolve_SSE : Convolve_C;
222 #endif
246 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) 223 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
247 static const ConvolveProc kConvolveProc = Convolve_NEON; 224 static const ConvolveProc kConvolveProc = Convolve_NEON;
248 #else 225 #else
249 static const ConvolveProc kConvolveProc = Convolve_C; 226 static const ConvolveProc kConvolveProc = Convolve_C;
250 #endif 227 #endif
251 228
252 return kConvolveProc(input_ptr, k1, k2, kernel_interpolation_factor); 229 return kConvolveProc(input_ptr, k1, k2, kernel_interpolation_factor);
253 } 230 }
254 231
255 float SincResampler::Convolve_C(const float* input_ptr, const float* k1, 232 float SincResampler::Convolve_C(const float* input_ptr, const float* k1,
256 const float* k2, 233 const float* k2,
257 double kernel_interpolation_factor) { 234 double kernel_interpolation_factor) {
258 float sum1 = 0; 235 float sum1 = 0;
259 float sum2 = 0; 236 float sum2 = 0;
260 237
261 // Generate a single output sample. Unrolling this loop hurt performance in 238 // Generate a single output sample. Unrolling this loop hurt performance in
262 // local testing. 239 // local testing.
263 int n = kKernelSize; 240 int n = kKernelSize;
264 while (n--) { 241 while (n--) {
265 sum1 += *input_ptr * *k1++; 242 sum1 += *input_ptr * *k1++;
266 sum2 += *input_ptr++ * *k2++; 243 sum2 += *input_ptr++ * *k2++;
267 } 244 }
268 245
269 // Linearly interpolate the two "convolutions". 246 // Linearly interpolate the two "convolutions".
270 return (1.0 - kernel_interpolation_factor) * sum1 247 return (1.0 - kernel_interpolation_factor) * sum1
271 + kernel_interpolation_factor * sum2; 248 + kernel_interpolation_factor * sum2;
272 } 249 }
273 250
274 #if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__)
275 float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1,
276 const float* k2,
277 double kernel_interpolation_factor) {
278 // Ensure |k1|, |k2| are 16-byte aligned for SSE usage. Should always be true
279 // so long as kKernelSize is a multiple of 16.
280 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F);
281 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F);
282
283 __m128 m_input;
284 __m128 m_sums1 = _mm_setzero_ps();
285 __m128 m_sums2 = _mm_setzero_ps();
286
287 // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling
288 // these loops hurt performance in local testing.
289 if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) {
290 for (int i = 0; i < kKernelSize; i += 4) {
291 m_input = _mm_loadu_ps(input_ptr + i);
292 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
293 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
294 }
295 } else {
296 for (int i = 0; i < kKernelSize; i += 4) {
297 m_input = _mm_load_ps(input_ptr + i);
298 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
299 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
300 }
301 }
302
303 // Linearly interpolate the two "convolutions".
304 m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1(1.0 - kernel_interpolation_factor));
305 m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1(kernel_interpolation_factor));
306 m_sums1 = _mm_add_ps(m_sums1, m_sums2);
307
308 // Sum components together.
309 float result;
310 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1);
311 _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps(
312 m_sums2, m_sums2, 1)));
313
314 return result;
315 }
316 #endif
317
318 #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) 251 #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
319 float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1, 252 float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1,
320 const float* k2, 253 const float* k2,
321 double kernel_interpolation_factor) { 254 double kernel_interpolation_factor) {
322 float32x4_t m_input; 255 float32x4_t m_input;
323 float32x4_t m_sums1 = vmovq_n_f32(0); 256 float32x4_t m_sums1 = vmovq_n_f32(0);
324 float32x4_t m_sums2 = vmovq_n_f32(0); 257 float32x4_t m_sums2 = vmovq_n_f32(0);
325 258
326 const float* upper = input_ptr + kKernelSize; 259 const float* upper = input_ptr + kKernelSize;
327 for (; input_ptr < upper; ) { 260 for (; input_ptr < upper; ) {
(...skipping 10 matching lines...) Expand all
338 vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)), 271 vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)),
339 m_sums2, vmovq_n_f32(kernel_interpolation_factor)); 272 m_sums2, vmovq_n_f32(kernel_interpolation_factor));
340 273
341 // Sum components together. 274 // Sum components together.
342 float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1)); 275 float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1));
343 return vget_lane_f32(vpadd_f32(m_half, m_half), 0); 276 return vget_lane_f32(vpadd_f32(m_half, m_half), 0);
344 } 277 }
345 #endif 278 #endif
346 279
347 } // namespace media 280 } // namespace media
OLDNEW
« no previous file with comments | « media/base/sinc_resampler.h ('k') | media/base/sinc_resampler_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698