OLD | NEW |
---|---|
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 // | 4 // |
5 // Initial input buffer layout, dividing into regions r0_ to r4_ (note: r0_, r3_ | 5 // Initial input buffer layout, dividing into regions r0_ to r4_ (note: r0_, r3_ |
6 // and r4_ will move after the first load): | 6 // and r4_ will move after the first load): |
7 // | 7 // |
8 // |----------------|-----------------------------------------|----------------| | 8 // |----------------|-----------------------------------------|----------------| |
9 // | 9 // |
10 // request_frames_ | 10 // request_frames_ |
(...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
74 // |virtual_source_idx_|, etc. | 74 // |virtual_source_idx_|, etc. |
75 | 75 |
76 // MSVC++ requires this to be set before any other includes to get M_PI. | 76 // MSVC++ requires this to be set before any other includes to get M_PI. |
77 #define _USE_MATH_DEFINES | 77 #define _USE_MATH_DEFINES |
78 | 78 |
79 #include "media/base/sinc_resampler.h" | 79 #include "media/base/sinc_resampler.h" |
80 | 80 |
81 #include <cmath> | 81 #include <cmath> |
82 #include <limits> | 82 #include <limits> |
83 | 83 |
84 #include "base/cpu.h" | |
85 #include "base/logging.h" | 84 #include "base/logging.h" |
86 | 85 |
87 #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) | 86 #if defined(ARCH_CPU_X86_FAMILY) |
87 #include <xmmintrin.h> | |
88 #define CONVOLVE_FUNC Convolve_SSE | |
89 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) | |
88 #include <arm_neon.h> | 90 #include <arm_neon.h> |
91 #define CONVOLVE_FUNC Convolve_NEON | |
92 #else | |
93 #define CONVOLVE_FUNC Convolve_C | |
89 #endif | 94 #endif |
90 | 95 |
91 namespace media { | 96 namespace media { |
92 | 97 |
93 static double SincScaleFactor(double io_ratio) { | 98 static double SincScaleFactor(double io_ratio) { |
94 // |sinc_scale_factor| is basically the normalized cutoff frequency of the | 99 // |sinc_scale_factor| is basically the normalized cutoff frequency of the |
95 // low-pass filter. | 100 // low-pass filter. |
96 double sinc_scale_factor = io_ratio > 1.0 ? 1.0 / io_ratio : 1.0; | 101 double sinc_scale_factor = io_ratio > 1.0 ? 1.0 / io_ratio : 1.0; |
97 | 102 |
98 // The sinc function is an idealized brick-wall filter, but since we're | 103 // The sinc function is an idealized brick-wall filter, but since we're |
99 // windowing it the transition from pass to stop does not happen right away. | 104 // windowing it the transition from pass to stop does not happen right away. |
100 // So we should adjust the low pass filter cutoff slightly downward to avoid | 105 // So we should adjust the low pass filter cutoff slightly downward to avoid |
101 // some aliasing at the very high-end. | 106 // some aliasing at the very high-end. |
102 // TODO(crogers): this value is empirical and to be more exact should vary | 107 // TODO(crogers): this value is empirical and to be more exact should vary |
103 // depending on kKernelSize. | 108 // depending on kKernelSize. |
104 sinc_scale_factor *= 0.9; | 109 sinc_scale_factor *= 0.9; |
105 | 110 |
106 return sinc_scale_factor; | 111 return sinc_scale_factor; |
107 } | 112 } |
108 | 113 |
109 // If we know the minimum architecture at compile time, avoid CPU detection. | |
110 // Force NaCl code to use C routines since (at present) nothing there uses these | |
111 // methods and plumbing the -msse built library is non-trivial. | |
112 #if defined(ARCH_CPU_X86_FAMILY) && !defined(OS_NACL) | |
113 #if defined(__SSE__) | |
114 #define CONVOLVE_FUNC Convolve_SSE | |
115 void SincResampler::InitializeCPUSpecificFeatures() {} | |
116 #else | |
117 // X86 CPU detection required. Functions will be set by | |
118 // InitializeCPUSpecificFeatures(). | |
119 // TODO(dalecurtis): Once Chrome moves to an SSE baseline this can be removed. | |
120 #define CONVOLVE_FUNC g_convolve_proc_ | |
121 | |
122 typedef float (*ConvolveProc)(const float*, const float*, const float*, double); | |
123 static ConvolveProc g_convolve_proc_ = NULL; | |
124 | |
125 void SincResampler::InitializeCPUSpecificFeatures() { | |
126 CHECK(!g_convolve_proc_); | |
127 g_convolve_proc_ = base::CPU().has_sse() ? Convolve_SSE : Convolve_C; | |
128 } | |
129 #endif | |
130 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) | |
131 #define CONVOLVE_FUNC Convolve_NEON | |
132 void SincResampler::InitializeCPUSpecificFeatures() {} | |
133 #else | |
134 // Unknown architecture. | |
135 #define CONVOLVE_FUNC Convolve_C | |
136 void SincResampler::InitializeCPUSpecificFeatures() {} | |
137 #endif | |
138 | |
139 SincResampler::SincResampler(double io_sample_rate_ratio, | 114 SincResampler::SincResampler(double io_sample_rate_ratio, |
140 int request_frames, | 115 int request_frames, |
141 const ReadCB& read_cb) | 116 const ReadCB& read_cb) |
142 : io_sample_rate_ratio_(io_sample_rate_ratio), | 117 : io_sample_rate_ratio_(io_sample_rate_ratio), |
143 read_cb_(read_cb), | 118 read_cb_(read_cb), |
144 request_frames_(request_frames), | 119 request_frames_(request_frames), |
145 input_buffer_size_(request_frames_ + kKernelSize), | 120 input_buffer_size_(request_frames_ + kKernelSize), |
146 // Create input buffers with a 16-byte alignment for SSE optimizations. | 121 // Create input buffers with a 16-byte alignment for SSE optimizations. |
147 kernel_storage_(static_cast<float*>( | 122 kernel_storage_(static_cast<float*>( |
148 base::AlignedAlloc(sizeof(float) * kKernelStorageSize, 16))), | 123 base::AlignedAlloc(sizeof(float) * kKernelStorageSize, 16))), |
(...skipping 165 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
314 | 289 |
315 // Step (4) -- Reinitialize regions if necessary. | 290 // Step (4) -- Reinitialize regions if necessary. |
316 if (r0_ == r2_) | 291 if (r0_ == r2_) |
317 UpdateRegions(true); | 292 UpdateRegions(true); |
318 | 293 |
319 // Step (5) -- Refresh the buffer with more input. | 294 // Step (5) -- Refresh the buffer with more input. |
320 read_cb_.Run(request_frames_, r0_); | 295 read_cb_.Run(request_frames_, r0_); |
321 } | 296 } |
322 } | 297 } |
323 | 298 |
324 #undef CONVOLVE_FUNC | |
325 | |
326 int SincResampler::ChunkSize() const { | 299 int SincResampler::ChunkSize() const { |
327 return block_size_ / io_sample_rate_ratio_; | 300 return block_size_ / io_sample_rate_ratio_; |
328 } | 301 } |
329 | 302 |
330 void SincResampler::Flush() { | 303 void SincResampler::Flush() { |
331 virtual_source_idx_ = 0; | 304 virtual_source_idx_ = 0; |
332 buffer_primed_ = false; | 305 buffer_primed_ = false; |
333 memset(input_buffer_.get(), 0, | 306 memset(input_buffer_.get(), 0, |
334 sizeof(*input_buffer_.get()) * input_buffer_size_); | 307 sizeof(*input_buffer_.get()) * input_buffer_size_); |
335 UpdateRegions(false); | 308 UpdateRegions(false); |
(...skipping 11 matching lines...) Expand all Loading... | |
347 while (n--) { | 320 while (n--) { |
348 sum1 += *input_ptr * *k1++; | 321 sum1 += *input_ptr * *k1++; |
349 sum2 += *input_ptr++ * *k2++; | 322 sum2 += *input_ptr++ * *k2++; |
350 } | 323 } |
351 | 324 |
352 // Linearly interpolate the two "convolutions". | 325 // Linearly interpolate the two "convolutions". |
353 return (1.0 - kernel_interpolation_factor) * sum1 | 326 return (1.0 - kernel_interpolation_factor) * sum1 |
354 + kernel_interpolation_factor * sum2; | 327 + kernel_interpolation_factor * sum2; |
355 } | 328 } |
356 | 329 |
357 #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) | 330 #if defined(ARCH_CPU_X86_FAMILY) |
331 float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1, | |
332 const float* k2, | |
333 double kernel_interpolation_factor) { | |
334 __m128 m_input; | |
335 __m128 m_sums1 = _mm_setzero_ps(); | |
336 __m128 m_sums2 = _mm_setzero_ps(); | |
337 | |
338 // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling | |
339 // these loops hurt performance in local testing. | |
340 if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) { | |
341 for (int i = 0; i < kKernelSize; i += 4) { | |
342 m_input = _mm_loadu_ps(input_ptr + i); | |
343 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); | |
344 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); | |
345 } | |
346 } else { | |
347 for (int i = 0; i < kKernelSize; i += 4) { | |
348 m_input = _mm_load_ps(input_ptr + i); | |
349 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); | |
350 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); | |
351 } | |
352 } | |
353 | |
354 // Linearly interpolate the two "convolutions". | |
355 m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1(1.0 - kernel_interpolation_factor)); | |
356 m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1(kernel_interpolation_factor)); | |
357 m_sums1 = _mm_add_ps(m_sums1, m_sums2); | |
358 | |
359 // Sum components together. | |
360 float result; | |
361 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1); | |
362 _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps( | |
363 m_sums2, m_sums2, 1))); | |
364 | |
365 return result; | |
366 } | |
367 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) | |
rileya (GONE FROM CHROMIUM)
2014/05/30 01:12:29
Not too relevant to this CL, but iirc we don't set
| |
358 float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1, | 368 float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1, |
359 const float* k2, | 369 const float* k2, |
360 double kernel_interpolation_factor) { | 370 double kernel_interpolation_factor) { |
361 float32x4_t m_input; | 371 float32x4_t m_input; |
362 float32x4_t m_sums1 = vmovq_n_f32(0); | 372 float32x4_t m_sums1 = vmovq_n_f32(0); |
363 float32x4_t m_sums2 = vmovq_n_f32(0); | 373 float32x4_t m_sums2 = vmovq_n_f32(0); |
364 | 374 |
365 const float* upper = input_ptr + kKernelSize; | 375 const float* upper = input_ptr + kKernelSize; |
366 for (; input_ptr < upper; ) { | 376 for (; input_ptr < upper; ) { |
367 m_input = vld1q_f32(input_ptr); | 377 m_input = vld1q_f32(input_ptr); |
368 input_ptr += 4; | 378 input_ptr += 4; |
369 m_sums1 = vmlaq_f32(m_sums1, m_input, vld1q_f32(k1)); | 379 m_sums1 = vmlaq_f32(m_sums1, m_input, vld1q_f32(k1)); |
370 k1 += 4; | 380 k1 += 4; |
371 m_sums2 = vmlaq_f32(m_sums2, m_input, vld1q_f32(k2)); | 381 m_sums2 = vmlaq_f32(m_sums2, m_input, vld1q_f32(k2)); |
372 k2 += 4; | 382 k2 += 4; |
373 } | 383 } |
374 | 384 |
375 // Linearly interpolate the two "convolutions". | 385 // Linearly interpolate the two "convolutions". |
376 m_sums1 = vmlaq_f32( | 386 m_sums1 = vmlaq_f32( |
377 vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)), | 387 vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)), |
378 m_sums2, vmovq_n_f32(kernel_interpolation_factor)); | 388 m_sums2, vmovq_n_f32(kernel_interpolation_factor)); |
379 | 389 |
380 // Sum components together. | 390 // Sum components together. |
381 float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1)); | 391 float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1)); |
382 return vget_lane_f32(vpadd_f32(m_half, m_half), 0); | 392 return vget_lane_f32(vpadd_f32(m_half, m_half), 0); |
383 } | 393 } |
384 #endif | 394 #endif |
385 | 395 |
386 } // namespace media | 396 } // namespace media |
OLD | NEW |