Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(133)

Side by Side Diff: media/base/sinc_resampler.cc

Issue 308003004: Remove runtime CPU detection for SSE optimized media/ methods. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Remove vector math. Created 6 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « media/base/sinc_resampler.h ('k') | media/base/sinc_resampler_perftest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 // 4 //
5 // Initial input buffer layout, dividing into regions r0_ to r4_ (note: r0_, r3_ 5 // Initial input buffer layout, dividing into regions r0_ to r4_ (note: r0_, r3_
6 // and r4_ will move after the first load): 6 // and r4_ will move after the first load):
7 // 7 //
8 // |----------------|-----------------------------------------|----------------| 8 // |----------------|-----------------------------------------|----------------|
9 // 9 //
10 // request_frames_ 10 // request_frames_
(...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after
74 // |virtual_source_idx_|, etc. 74 // |virtual_source_idx_|, etc.
75 75
76 // MSVC++ requires this to be set before any other includes to get M_PI. 76 // MSVC++ requires this to be set before any other includes to get M_PI.
77 #define _USE_MATH_DEFINES 77 #define _USE_MATH_DEFINES
78 78
79 #include "media/base/sinc_resampler.h" 79 #include "media/base/sinc_resampler.h"
80 80
81 #include <cmath> 81 #include <cmath>
82 #include <limits> 82 #include <limits>
83 83
84 #include "base/cpu.h"
85 #include "base/logging.h" 84 #include "base/logging.h"
86 85
87 #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) 86 #if defined(ARCH_CPU_X86_FAMILY)
87 #include <xmmintrin.h>
88 #define CONVOLVE_FUNC Convolve_SSE
89 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
88 #include <arm_neon.h> 90 #include <arm_neon.h>
91 #define CONVOLVE_FUNC Convolve_NEON
92 #else
93 #define CONVOLVE_FUNC Convolve_C
89 #endif 94 #endif
90 95
91 namespace media { 96 namespace media {
92 97
93 static double SincScaleFactor(double io_ratio) { 98 static double SincScaleFactor(double io_ratio) {
94 // |sinc_scale_factor| is basically the normalized cutoff frequency of the 99 // |sinc_scale_factor| is basically the normalized cutoff frequency of the
95 // low-pass filter. 100 // low-pass filter.
96 double sinc_scale_factor = io_ratio > 1.0 ? 1.0 / io_ratio : 1.0; 101 double sinc_scale_factor = io_ratio > 1.0 ? 1.0 / io_ratio : 1.0;
97 102
98 // The sinc function is an idealized brick-wall filter, but since we're 103 // The sinc function is an idealized brick-wall filter, but since we're
99 // windowing it the transition from pass to stop does not happen right away. 104 // windowing it the transition from pass to stop does not happen right away.
100 // So we should adjust the low pass filter cutoff slightly downward to avoid 105 // So we should adjust the low pass filter cutoff slightly downward to avoid
101 // some aliasing at the very high-end. 106 // some aliasing at the very high-end.
102 // TODO(crogers): this value is empirical and to be more exact should vary 107 // TODO(crogers): this value is empirical and to be more exact should vary
103 // depending on kKernelSize. 108 // depending on kKernelSize.
104 sinc_scale_factor *= 0.9; 109 sinc_scale_factor *= 0.9;
105 110
106 return sinc_scale_factor; 111 return sinc_scale_factor;
107 } 112 }
108 113
109 // If we know the minimum architecture at compile time, avoid CPU detection.
110 // Force NaCl code to use C routines since (at present) nothing there uses these
111 // methods and plumbing the -msse built library is non-trivial.
112 #if defined(ARCH_CPU_X86_FAMILY) && !defined(OS_NACL)
113 #if defined(__SSE__)
114 #define CONVOLVE_FUNC Convolve_SSE
115 void SincResampler::InitializeCPUSpecificFeatures() {}
116 #else
117 // X86 CPU detection required. Functions will be set by
118 // InitializeCPUSpecificFeatures().
119 // TODO(dalecurtis): Once Chrome moves to an SSE baseline this can be removed.
120 #define CONVOLVE_FUNC g_convolve_proc_
121
122 typedef float (*ConvolveProc)(const float*, const float*, const float*, double);
123 static ConvolveProc g_convolve_proc_ = NULL;
124
125 void SincResampler::InitializeCPUSpecificFeatures() {
126 CHECK(!g_convolve_proc_);
127 g_convolve_proc_ = base::CPU().has_sse() ? Convolve_SSE : Convolve_C;
128 }
129 #endif
130 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
131 #define CONVOLVE_FUNC Convolve_NEON
132 void SincResampler::InitializeCPUSpecificFeatures() {}
133 #else
134 // Unknown architecture.
135 #define CONVOLVE_FUNC Convolve_C
136 void SincResampler::InitializeCPUSpecificFeatures() {}
137 #endif
138
139 SincResampler::SincResampler(double io_sample_rate_ratio, 114 SincResampler::SincResampler(double io_sample_rate_ratio,
140 int request_frames, 115 int request_frames,
141 const ReadCB& read_cb) 116 const ReadCB& read_cb)
142 : io_sample_rate_ratio_(io_sample_rate_ratio), 117 : io_sample_rate_ratio_(io_sample_rate_ratio),
143 read_cb_(read_cb), 118 read_cb_(read_cb),
144 request_frames_(request_frames), 119 request_frames_(request_frames),
145 input_buffer_size_(request_frames_ + kKernelSize), 120 input_buffer_size_(request_frames_ + kKernelSize),
146 // Create input buffers with a 16-byte alignment for SSE optimizations. 121 // Create input buffers with a 16-byte alignment for SSE optimizations.
147 kernel_storage_(static_cast<float*>( 122 kernel_storage_(static_cast<float*>(
148 base::AlignedAlloc(sizeof(float) * kKernelStorageSize, 16))), 123 base::AlignedAlloc(sizeof(float) * kKernelStorageSize, 16))),
(...skipping 165 matching lines...) Expand 10 before | Expand all | Expand 10 after
314 289
315 // Step (4) -- Reinitialize regions if necessary. 290 // Step (4) -- Reinitialize regions if necessary.
316 if (r0_ == r2_) 291 if (r0_ == r2_)
317 UpdateRegions(true); 292 UpdateRegions(true);
318 293
319 // Step (5) -- Refresh the buffer with more input. 294 // Step (5) -- Refresh the buffer with more input.
320 read_cb_.Run(request_frames_, r0_); 295 read_cb_.Run(request_frames_, r0_);
321 } 296 }
322 } 297 }
323 298
324 #undef CONVOLVE_FUNC
325
326 int SincResampler::ChunkSize() const { 299 int SincResampler::ChunkSize() const {
327 return block_size_ / io_sample_rate_ratio_; 300 return block_size_ / io_sample_rate_ratio_;
328 } 301 }
329 302
330 void SincResampler::Flush() { 303 void SincResampler::Flush() {
331 virtual_source_idx_ = 0; 304 virtual_source_idx_ = 0;
332 buffer_primed_ = false; 305 buffer_primed_ = false;
333 memset(input_buffer_.get(), 0, 306 memset(input_buffer_.get(), 0,
334 sizeof(*input_buffer_.get()) * input_buffer_size_); 307 sizeof(*input_buffer_.get()) * input_buffer_size_);
335 UpdateRegions(false); 308 UpdateRegions(false);
(...skipping 11 matching lines...) Expand all
347 while (n--) { 320 while (n--) {
348 sum1 += *input_ptr * *k1++; 321 sum1 += *input_ptr * *k1++;
349 sum2 += *input_ptr++ * *k2++; 322 sum2 += *input_ptr++ * *k2++;
350 } 323 }
351 324
352 // Linearly interpolate the two "convolutions". 325 // Linearly interpolate the two "convolutions".
353 return (1.0 - kernel_interpolation_factor) * sum1 326 return (1.0 - kernel_interpolation_factor) * sum1
354 + kernel_interpolation_factor * sum2; 327 + kernel_interpolation_factor * sum2;
355 } 328 }
356 329
357 #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) 330 #if defined(ARCH_CPU_X86_FAMILY)
331 float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1,
332 const float* k2,
333 double kernel_interpolation_factor) {
334 __m128 m_input;
335 __m128 m_sums1 = _mm_setzero_ps();
336 __m128 m_sums2 = _mm_setzero_ps();
337
338 // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling
339 // these loops hurt performance in local testing.
340 if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) {
341 for (int i = 0; i < kKernelSize; i += 4) {
342 m_input = _mm_loadu_ps(input_ptr + i);
343 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
344 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
345 }
346 } else {
347 for (int i = 0; i < kKernelSize; i += 4) {
348 m_input = _mm_load_ps(input_ptr + i);
349 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
350 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
351 }
352 }
353
354 // Linearly interpolate the two "convolutions".
355 m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1(1.0 - kernel_interpolation_factor));
356 m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1(kernel_interpolation_factor));
357 m_sums1 = _mm_add_ps(m_sums1, m_sums2);
358
359 // Sum components together.
360 float result;
361 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1);
362 _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps(
363 m_sums2, m_sums2, 1)));
364
365 return result;
366 }
367 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
rileya (GONE FROM CHROMIUM) 2014/05/30 01:12:29 Not too relevant to this CL, but iirc we don't set
358 float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1, 368 float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1,
359 const float* k2, 369 const float* k2,
360 double kernel_interpolation_factor) { 370 double kernel_interpolation_factor) {
361 float32x4_t m_input; 371 float32x4_t m_input;
362 float32x4_t m_sums1 = vmovq_n_f32(0); 372 float32x4_t m_sums1 = vmovq_n_f32(0);
363 float32x4_t m_sums2 = vmovq_n_f32(0); 373 float32x4_t m_sums2 = vmovq_n_f32(0);
364 374
365 const float* upper = input_ptr + kKernelSize; 375 const float* upper = input_ptr + kKernelSize;
366 for (; input_ptr < upper; ) { 376 for (; input_ptr < upper; ) {
367 m_input = vld1q_f32(input_ptr); 377 m_input = vld1q_f32(input_ptr);
368 input_ptr += 4; 378 input_ptr += 4;
369 m_sums1 = vmlaq_f32(m_sums1, m_input, vld1q_f32(k1)); 379 m_sums1 = vmlaq_f32(m_sums1, m_input, vld1q_f32(k1));
370 k1 += 4; 380 k1 += 4;
371 m_sums2 = vmlaq_f32(m_sums2, m_input, vld1q_f32(k2)); 381 m_sums2 = vmlaq_f32(m_sums2, m_input, vld1q_f32(k2));
372 k2 += 4; 382 k2 += 4;
373 } 383 }
374 384
375 // Linearly interpolate the two "convolutions". 385 // Linearly interpolate the two "convolutions".
376 m_sums1 = vmlaq_f32( 386 m_sums1 = vmlaq_f32(
377 vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)), 387 vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)),
378 m_sums2, vmovq_n_f32(kernel_interpolation_factor)); 388 m_sums2, vmovq_n_f32(kernel_interpolation_factor));
379 389
380 // Sum components together. 390 // Sum components together.
381 float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1)); 391 float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1));
382 return vget_lane_f32(vpadd_f32(m_half, m_half), 0); 392 return vget_lane_f32(vpadd_f32(m_half, m_half), 0);
383 } 393 }
384 #endif 394 #endif
385 395
386 } // namespace media 396 } // namespace media
OLDNEW
« no previous file with comments | « media/base/sinc_resampler.h ('k') | media/base/sinc_resampler_perftest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698