Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(144)

Side by Side Diff: media/base/sinc_resampler.cc

Issue 2556993002: Experiment with AVX optimizations for FMAC, FMUL operations.
Patch Set: Created 4 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « media/base/sinc_resampler.h ('k') | media/base/sinc_resampler_perftest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 // 4 //
5 // Initial input buffer layout, dividing into regions r0_ to r4_ (note: r0_, r3_ 5 // Initial input buffer layout, dividing into regions r0_ to r4_ (note: r0_, r3_
6 // and r4_ will move after the first load): 6 // and r4_ will move after the first load):
7 // 7 //
8 // |----------------|-----------------------------------------|----------------| 8 // |----------------|-----------------------------------------|----------------|
9 // 9 //
10 // request_frames_ 10 // request_frames_
(...skipping 64 matching lines...) Expand 10 before | Expand all | Expand 10 after
75 75
76 // MSVC++ requires this to be set before any other includes to get M_PI. 76 // MSVC++ requires this to be set before any other includes to get M_PI.
77 #define _USE_MATH_DEFINES 77 #define _USE_MATH_DEFINES
78 78
79 #include "media/base/sinc_resampler.h" 79 #include "media/base/sinc_resampler.h"
80 80
81 #include <cmath> 81 #include <cmath>
82 #include <limits> 82 #include <limits>
83 83
84 #include "base/logging.h" 84 #include "base/logging.h"
85 #include "build/build_config.h"
86
87 #if defined(ARCH_CPU_X86_FAMILY)
88 #include <xmmintrin.h>
89 #define CONVOLVE_FUNC Convolve_SSE
90 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
91 #include <arm_neon.h>
92 #define CONVOLVE_FUNC Convolve_NEON
93 #else
94 #define CONVOLVE_FUNC Convolve_C
95 #endif
96 85
97 namespace media { 86 namespace media {
98 87
99 static double SincScaleFactor(double io_ratio) { 88 static double SincScaleFactor(double io_ratio) {
100 // |sinc_scale_factor| is basically the normalized cutoff frequency of the 89 // |sinc_scale_factor| is basically the normalized cutoff frequency of the
101 // low-pass filter. 90 // low-pass filter.
102 double sinc_scale_factor = io_ratio > 1.0 ? 1.0 / io_ratio : 1.0; 91 double sinc_scale_factor = io_ratio > 1.0 ? 1.0 / io_ratio : 1.0;
103 92
104 // The sinc function is an idealized brick-wall filter, but since we're 93 // The sinc function is an idealized brick-wall filter, but since we're
105 // windowing it the transition from pass to stop does not happen right away. 94 // windowing it the transition from pass to stop does not happen right away.
(...skipping 78 matching lines...) Expand 10 before | Expand all | Expand 10 after
184 173
185 for (int i = 0; i < kKernelSize; ++i) { 174 for (int i = 0; i < kKernelSize; ++i) {
186 const int idx = i + offset_idx * kKernelSize; 175 const int idx = i + offset_idx * kKernelSize;
187 const float pre_sinc = 176 const float pre_sinc =
188 static_cast<float>(M_PI * (i - kKernelSize / 2 - subsample_offset)); 177 static_cast<float>(M_PI * (i - kKernelSize / 2 - subsample_offset));
189 kernel_pre_sinc_storage_[idx] = pre_sinc; 178 kernel_pre_sinc_storage_[idx] = pre_sinc;
190 179
191 // Compute Blackman window, matching the offset of the sinc(). 180 // Compute Blackman window, matching the offset of the sinc().
192 const float x = (i - subsample_offset) / kKernelSize; 181 const float x = (i - subsample_offset) / kKernelSize;
193 const float window = static_cast<float>(kA0 - kA1 * cos(2.0 * M_PI * x) + 182 const float window = static_cast<float>(kA0 - kA1 * cos(2.0 * M_PI * x) +
194 kA2 * cos(4.0 * M_PI * x)); 183 kA2 * cos(4.0 * M_PI * x));
195 kernel_window_storage_[idx] = window; 184 kernel_window_storage_[idx] = window;
196 185
197 // Compute the sinc with offset, then window the sinc() function and store 186 // Compute the sinc with offset, then window the sinc() function and store
198 // at the correct offset. 187 // at the correct offset.
199 kernel_storage_[idx] = static_cast<float>( 188 kernel_storage_[idx] = static_cast<float>(
200 window * (pre_sinc ? sin(sinc_scale_factor * pre_sinc) / pre_sinc 189 window * (pre_sinc ? sin(sinc_scale_factor * pre_sinc) / pre_sinc
201 : sinc_scale_factor)); 190 : sinc_scale_factor));
202 } 191 }
203 } 192 }
204 } 193 }
(...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after
257 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F); 246 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F);
258 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F); 247 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F);
259 248
260 // Initialize input pointer based on quantized |virtual_source_idx_|. 249 // Initialize input pointer based on quantized |virtual_source_idx_|.
261 const float* input_ptr = r1_ + source_idx; 250 const float* input_ptr = r1_ + source_idx;
262 251
263 // Figure out how much to weight each kernel's "convolution". 252 // Figure out how much to weight each kernel's "convolution".
264 const double kernel_interpolation_factor = 253 const double kernel_interpolation_factor =
265 virtual_offset_idx - offset_idx; 254 virtual_offset_idx - offset_idx;
266 *destination++ = 255 *destination++ =
267 CONVOLVE_FUNC(input_ptr, k1, k2, kernel_interpolation_factor); 256 vector_math::Convolve(input_ptr, k1, k2, kernel_interpolation_factor);
268 257
269 // Advance the virtual index. 258 // Advance the virtual index.
270 virtual_source_idx_ += io_sample_rate_ratio_; 259 virtual_source_idx_ += io_sample_rate_ratio_;
271 if (!--remaining_frames) 260 if (!--remaining_frames)
272 return; 261 return;
273 } 262 }
274 263
275 // Wrap back around to the start. 264 // Wrap back around to the start.
276 DCHECK_GE(virtual_source_idx_, block_size_); 265 DCHECK_GE(virtual_source_idx_, block_size_);
277 virtual_source_idx_ -= block_size_; 266 virtual_source_idx_ -= block_size_;
(...skipping 24 matching lines...) Expand all
302 buffer_primed_ = false; 291 buffer_primed_ = false;
303 memset(input_buffer_.get(), 0, 292 memset(input_buffer_.get(), 0,
304 sizeof(*input_buffer_.get()) * input_buffer_size_); 293 sizeof(*input_buffer_.get()) * input_buffer_size_);
305 UpdateRegions(false); 294 UpdateRegions(false);
306 } 295 }
307 296
308 double SincResampler::BufferedFrames() const { 297 double SincResampler::BufferedFrames() const {
309 return buffer_primed_ ? request_frames_ - virtual_source_idx_ : 0; 298 return buffer_primed_ ? request_frames_ - virtual_source_idx_ : 0;
310 } 299 }
311 300
312 float SincResampler::Convolve_C(const float* input_ptr, const float* k1,
313 const float* k2,
314 double kernel_interpolation_factor) {
315 float sum1 = 0;
316 float sum2 = 0;
317
318 // Generate a single output sample. Unrolling this loop hurt performance in
319 // local testing.
320 int n = kKernelSize;
321 while (n--) {
322 sum1 += *input_ptr * *k1++;
323 sum2 += *input_ptr++ * *k2++;
324 }
325
326 // Linearly interpolate the two "convolutions".
327 return static_cast<float>((1.0 - kernel_interpolation_factor) * sum1 +
328 kernel_interpolation_factor * sum2);
329 }
330
331 #if defined(ARCH_CPU_X86_FAMILY)
332 float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1,
333 const float* k2,
334 double kernel_interpolation_factor) {
335 __m128 m_input;
336 __m128 m_sums1 = _mm_setzero_ps();
337 __m128 m_sums2 = _mm_setzero_ps();
338
339 // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling
340 // these loops hurt performance in local testing.
341 if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) {
342 for (int i = 0; i < kKernelSize; i += 4) {
343 m_input = _mm_loadu_ps(input_ptr + i);
344 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
345 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
346 }
347 } else {
348 for (int i = 0; i < kKernelSize; i += 4) {
349 m_input = _mm_load_ps(input_ptr + i);
350 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
351 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
352 }
353 }
354
355 // Linearly interpolate the two "convolutions".
356 m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1(
357 static_cast<float>(1.0 - kernel_interpolation_factor)));
358 m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1(
359 static_cast<float>(kernel_interpolation_factor)));
360 m_sums1 = _mm_add_ps(m_sums1, m_sums2);
361
362 // Sum components together.
363 float result;
364 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1);
365 _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps(
366 m_sums2, m_sums2, 1)));
367
368 return result;
369 }
370 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
// NEON convolution. Processes four floats per iteration with a vector
// multiply-accumulate per kernel, blends the two partial-sum vectors using
// |kernel_interpolation_factor| and reduces the four lanes to a single output
// sample.
float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1,
                                   const float* k2,
                                   double kernel_interpolation_factor) {
  float32x4_t acc_k1 = vmovq_n_f32(0);
  float32x4_t acc_k2 = vmovq_n_f32(0);

  // Four samples per step; each step feeds the same input vector to both
  // kernels.
  for (int i = 0; i < kKernelSize; i += 4) {
    const float32x4_t samples = vld1q_f32(input_ptr + i);
    acc_k1 = vmlaq_f32(acc_k1, samples, vld1q_f32(k1 + i));
    acc_k2 = vmlaq_f32(acc_k2, samples, vld1q_f32(k2 + i));
  }

  // Linearly interpolate the two "convolutions": scale the |k1| sums by
  // (1 - factor), then multiply-accumulate the |k2| sums scaled by factor.
  const float32x4_t weight_k1 =
      vmovq_n_f32(static_cast<float>(1.0 - kernel_interpolation_factor));
  const float32x4_t weight_k2 =
      vmovq_n_f32(static_cast<float>(kernel_interpolation_factor));
  const float32x4_t blended =
      vmlaq_f32(vmulq_f32(acc_k1, weight_k1), acc_k2, weight_k2);

  // Sum components together: add the high and low halves, then a pairwise add
  // leaves the total in lane 0.
  const float32x2_t half_sum =
      vadd_f32(vget_high_f32(blended), vget_low_f32(blended));
  return vget_lane_f32(vpadd_f32(half_sum, half_sum), 0);
}
397 #endif
398
399 } // namespace media 301 } // namespace media
OLDNEW
« no previous file with comments | « media/base/sinc_resampler.h ('k') | media/base/sinc_resampler_perftest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698