| OLD | NEW |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 // | 4 // |
| 5 // Initial input buffer layout, dividing into regions r0_ to r4_ (note: r0_, r3_ | 5 // Initial input buffer layout, dividing into regions r0_ to r4_ (note: r0_, r3_ |
| 6 // and r4_ will move after the first load): | 6 // and r4_ will move after the first load): |
| 7 // | 7 // |
| 8 // |----------------|-----------------------------------------|----------------| | 8 // |----------------|-----------------------------------------|----------------| |
| 9 // | 9 // |
| 10 // request_frames_ | 10 // request_frames_ |
| (...skipping 64 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 75 | 75 |
| 76 // MSVC++ requires this to be set before any other includes to get M_PI. | 76 // MSVC++ requires this to be set before any other includes to get M_PI. |
| 77 #define _USE_MATH_DEFINES | 77 #define _USE_MATH_DEFINES |
| 78 | 78 |
| 79 #include "media/base/sinc_resampler.h" | 79 #include "media/base/sinc_resampler.h" |
| 80 | 80 |
| 81 #include <cmath> | 81 #include <cmath> |
| 82 #include <limits> | 82 #include <limits> |
| 83 | 83 |
| 84 #include "base/logging.h" | 84 #include "base/logging.h" |
| 85 #include "build/build_config.h" | |
| 86 | |
| 87 #if defined(ARCH_CPU_X86_FAMILY) | |
| 88 #include <xmmintrin.h> | |
| 89 #define CONVOLVE_FUNC Convolve_SSE | |
| 90 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) | |
| 91 #include <arm_neon.h> | |
| 92 #define CONVOLVE_FUNC Convolve_NEON | |
| 93 #else | |
| 94 #define CONVOLVE_FUNC Convolve_C | |
| 95 #endif | |
| 96 | 85 |
| 97 namespace media { | 86 namespace media { |
| 98 | 87 |
| 99 static double SincScaleFactor(double io_ratio) { | 88 static double SincScaleFactor(double io_ratio) { |
| 100 // |sinc_scale_factor| is basically the normalized cutoff frequency of the | 89 // |sinc_scale_factor| is basically the normalized cutoff frequency of the |
| 101 // low-pass filter. | 90 // low-pass filter. |
| 102 double sinc_scale_factor = io_ratio > 1.0 ? 1.0 / io_ratio : 1.0; | 91 double sinc_scale_factor = io_ratio > 1.0 ? 1.0 / io_ratio : 1.0; |
| 103 | 92 |
| 104 // The sinc function is an idealized brick-wall filter, but since we're | 93 // The sinc function is an idealized brick-wall filter, but since we're |
| 105 // windowing it the transition from pass to stop does not happen right away. | 94 // windowing it the transition from pass to stop does not happen right away. |
| (...skipping 78 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 184 | 173 |
| 185 for (int i = 0; i < kKernelSize; ++i) { | 174 for (int i = 0; i < kKernelSize; ++i) { |
| 186 const int idx = i + offset_idx * kKernelSize; | 175 const int idx = i + offset_idx * kKernelSize; |
| 187 const float pre_sinc = | 176 const float pre_sinc = |
| 188 static_cast<float>(M_PI * (i - kKernelSize / 2 - subsample_offset)); | 177 static_cast<float>(M_PI * (i - kKernelSize / 2 - subsample_offset)); |
| 189 kernel_pre_sinc_storage_[idx] = pre_sinc; | 178 kernel_pre_sinc_storage_[idx] = pre_sinc; |
| 190 | 179 |
| 191 // Compute Blackman window, matching the offset of the sinc(). | 180 // Compute Blackman window, matching the offset of the sinc(). |
| 192 const float x = (i - subsample_offset) / kKernelSize; | 181 const float x = (i - subsample_offset) / kKernelSize; |
| 193 const float window = static_cast<float>(kA0 - kA1 * cos(2.0 * M_PI * x) + | 182 const float window = static_cast<float>(kA0 - kA1 * cos(2.0 * M_PI * x) + |
| 194 kA2 * cos(4.0 * M_PI * x)); | 183 kA2 * cos(4.0 * M_PI * x)); |
| 195 kernel_window_storage_[idx] = window; | 184 kernel_window_storage_[idx] = window; |
| 196 | 185 |
| 197 // Compute the sinc with offset, then window the sinc() function and store | 186 // Compute the sinc with offset, then window the sinc() function and store |
| 198 // at the correct offset. | 187 // at the correct offset. |
| 199 kernel_storage_[idx] = static_cast<float>( | 188 kernel_storage_[idx] = static_cast<float>( |
| 200 window * (pre_sinc ? sin(sinc_scale_factor * pre_sinc) / pre_sinc | 189 window * (pre_sinc ? sin(sinc_scale_factor * pre_sinc) / pre_sinc |
| 201 : sinc_scale_factor)); | 190 : sinc_scale_factor)); |
| 202 } | 191 } |
| 203 } | 192 } |
| 204 } | 193 } |
| (...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 257 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F); | 246 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F); |
| 258 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F); | 247 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F); |
| 259 | 248 |
| 260 // Initialize input pointer based on quantized |virtual_source_idx_|. | 249 // Initialize input pointer based on quantized |virtual_source_idx_|. |
| 261 const float* input_ptr = r1_ + source_idx; | 250 const float* input_ptr = r1_ + source_idx; |
| 262 | 251 |
| 263 // Figure out how much to weight each kernel's "convolution". | 252 // Figure out how much to weight each kernel's "convolution". |
| 264 const double kernel_interpolation_factor = | 253 const double kernel_interpolation_factor = |
| 265 virtual_offset_idx - offset_idx; | 254 virtual_offset_idx - offset_idx; |
| 266 *destination++ = | 255 *destination++ = |
| 267 CONVOLVE_FUNC(input_ptr, k1, k2, kernel_interpolation_factor); | 256 vector_math::Convolve(input_ptr, k1, k2, kernel_interpolation_factor); |
| 268 | 257 |
| 269 // Advance the virtual index. | 258 // Advance the virtual index. |
| 270 virtual_source_idx_ += io_sample_rate_ratio_; | 259 virtual_source_idx_ += io_sample_rate_ratio_; |
| 271 if (!--remaining_frames) | 260 if (!--remaining_frames) |
| 272 return; | 261 return; |
| 273 } | 262 } |
| 274 | 263 |
| 275 // Wrap back around to the start. | 264 // Wrap back around to the start. |
| 276 DCHECK_GE(virtual_source_idx_, block_size_); | 265 DCHECK_GE(virtual_source_idx_, block_size_); |
| 277 virtual_source_idx_ -= block_size_; | 266 virtual_source_idx_ -= block_size_; |
| (...skipping 24 matching lines...) Expand all Loading... |
| 302 buffer_primed_ = false; | 291 buffer_primed_ = false; |
| 303 memset(input_buffer_.get(), 0, | 292 memset(input_buffer_.get(), 0, |
| 304 sizeof(*input_buffer_.get()) * input_buffer_size_); | 293 sizeof(*input_buffer_.get()) * input_buffer_size_); |
| 305 UpdateRegions(false); | 294 UpdateRegions(false); |
| 306 } | 295 } |
| 307 | 296 |
// Returns, as a double, |request_frames_| minus |virtual_source_idx_| once
// |buffer_primed_| is set, and 0 before that.
// NOTE(review): presumably this is the amount of already-loaded input that has
// not yet been consumed by the resampler -- confirm against the header.
| 308 double SincResampler::BufferedFrames() const { | 297 double SincResampler::BufferedFrames() const { |
| 309 return buffer_primed_ ? request_frames_ - virtual_source_idx_ : 0; | 298 return buffer_primed_ ? request_frames_ - virtual_source_idx_ : 0; |
| 310 } | 299 } |
| 311 | 300 |
// Scalar implementation of the convolution step; it is what CONVOLVE_FUNC
// expands to when neither the SSE nor the NEON variant is selected.
//
// Reads |kKernelSize| consecutive floats from each of |input_ptr|, |k1| and
// |k2|, forming two dot products (input . k1 and input . k2), and returns a
// linear blend of them weighted by |kernel_interpolation_factor|:
//   (1 - factor) * sum1 + factor * sum2
// The dot products are accumulated in float; the blend itself is evaluated in
// double (the factor is a double) and then narrowed back to float.
| 312 float SincResampler::Convolve_C(const float* input_ptr, const float* k1, | |
| 313 const float* k2, | |
| 314 double kernel_interpolation_factor) { | |
| 315 float sum1 = 0; | |
| 316 float sum2 = 0; | |
| 317 | |
| 318 // Generate a single output sample. Unrolling this loop hurt performance in | |
| 319 // local testing. | |
| 320 int n = kKernelSize; | |
| 321 while (n--) { | |
// The same input sample feeds both sums; |input_ptr| is only advanced by the
// second statement, so the order of these two lines matters.
| 322 sum1 += *input_ptr * *k1++; | |
| 323 sum2 += *input_ptr++ * *k2++; | |
| 324 } | |
| 325 | |
| 326 // Linearly interpolate the two "convolutions". | |
| 327 return static_cast<float>((1.0 - kernel_interpolation_factor) * sum1 + | |
| 328 kernel_interpolation_factor * sum2); | |
| 329 } | |
| 330 | |
| 331 #if defined(ARCH_CPU_X86_FAMILY) | |
// SSE implementation of the convolution step, processing four floats per
// iteration. Produces the blend
//   (1 - factor) * (input . k1) + factor * (input . k2)
// where each dot product spans |kKernelSize| floats.
//
// |k1| and |k2| are always read with aligned loads (_mm_load_ps), so they must
// be 16-byte aligned; the visible call site DCHECKs this. |input_ptr| may be
// unaligned and is handled by a runtime check below.
// NOTE(review): the loops step by 4 up to |kKernelSize|, which assumes
// |kKernelSize| is a multiple of 4 -- its definition is not visible here.
// Unlike the scalar variant, the interpolation weights are narrowed to float
// before being applied.
| 332 float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1, | |
| 333 const float* k2, | |
| 334 double kernel_interpolation_factor) { | |
| 335 __m128 m_input; | |
| 336 __m128 m_sums1 = _mm_setzero_ps(); | |
| 337 __m128 m_sums2 = _mm_setzero_ps(); | |
| 338 | |
| 339 // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling | |
| 340 // these loops hurt performance in local testing. | |
// Any of the low four address bits set means |input_ptr| is not 16-byte
// aligned. The two loops differ only in the load used for the input.
| 341 if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) { | |
| 342 for (int i = 0; i < kKernelSize; i += 4) { | |
| 343 m_input = _mm_loadu_ps(input_ptr + i); | |
| 344 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); | |
| 345 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); | |
| 346 } | |
| 347 } else { | |
| 348 for (int i = 0; i < kKernelSize; i += 4) { | |
| 349 m_input = _mm_load_ps(input_ptr + i); | |
| 350 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); | |
| 351 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); | |
| 352 } | |
| 353 } | |
| 354 | |
| 355 // Linearly interpolate the two "convolutions". | |
// The weights are applied per lane, before the horizontal sum; afterwards
// |m_sums1| holds four partial results of the blended value.
| 356 m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1( | |
| 357 static_cast<float>(1.0 - kernel_interpolation_factor))); | |
| 358 m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1( | |
| 359 static_cast<float>(kernel_interpolation_factor))); | |
| 360 m_sums1 = _mm_add_ps(m_sums1, m_sums2); | |
| 361 | |
| 362 // Sum components together. | |
// movehl folds the upper two lanes onto the lower two (|m_sums2| is reused as
// scratch here); the shuffle with immediate 1 then moves lane 1 into lane 0 so
// add_ss leaves the total of all four lanes in lane 0, which store_ss writes.
| 363 float result; | |
| 364 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1); | |
| 365 _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps( | |
| 366 m_sums2, m_sums2, 1))); | |
| 367 | |
| 368 return result; | |
| 369 } | |
| 370 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) | |
// NEON implementation of the convolution step, processing four floats per
// iteration. Produces the blend
//   (1 - factor) * (input . k1) + factor * (input . k2)
// where each dot product spans |kKernelSize| floats.
// NOTE(review): the loop advances by 4 until |input_ptr| reaches
// |input_ptr + kKernelSize|, which assumes |kKernelSize| is a multiple of 4 --
// its definition is not visible here.
| 371 float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1, | |
| 372 const float* k2, | |
| 373 double kernel_interpolation_factor) { | |
| 374 float32x4_t m_input; | |
| 375 float32x4_t m_sums1 = vmovq_n_f32(0); | |
| 376 float32x4_t m_sums2 = vmovq_n_f32(0); | |
| 377 | |
| 378 const float* upper = input_ptr + kKernelSize; | |
| 379 for (; input_ptr < upper; ) { | |
// Each pass loads four input samples once and multiply-accumulates them
// against the matching four coefficients of both kernels.
| 380 m_input = vld1q_f32(input_ptr); | |
| 381 input_ptr += 4; | |
| 382 m_sums1 = vmlaq_f32(m_sums1, m_input, vld1q_f32(k1)); | |
| 383 k1 += 4; | |
| 384 m_sums2 = vmlaq_f32(m_sums2, m_input, vld1q_f32(k2)); | |
| 385 k2 += 4; | |
| 386 } | |
| 387 | |
| 388 // Linearly interpolate the two "convolutions". | |
// Computes m_sums1 * (1 - factor) + m_sums2 * factor per lane. The double
// weights are implicitly narrowed to float when passed to vmovq_n_f32.
| 389 m_sums1 = vmlaq_f32( | |
| 390 vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)), | |
| 391 m_sums2, vmovq_n_f32(kernel_interpolation_factor)); | |
| 392 | |
| 393 // Sum components together. | |
// Add the high and low halves to get two partial sums, then pairwise-add them
// so that lane 0 holds the total of all four lanes.
| 394 float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1)); | |
| 395 return vget_lane_f32(vpadd_f32(m_half, m_half), 0); | |
| 396 } | |
| 397 #endif | |
| 398 | |
| 399 } // namespace media | 301 } // namespace media |
| OLD | NEW |