| OLD | NEW |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 // | 4 // |
| 5 // Initial input buffer layout, dividing into regions r0_ to r4_ (note: r0_, r3_ | 5 // Initial input buffer layout, dividing into regions r0_ to r4_ (note: r0_, r3_ |
| 6 // and r4_ will move after the first load): | 6 // and r4_ will move after the first load): |
| 7 // | 7 // |
| 8 // |----------------|-----------------------------------------|----------------| | 8 // |----------------|-----------------------------------------|----------------| |
| 9 // | 9 // |
| 10 // request_frames_ | 10 // request_frames_ |
| (...skipping 63 matching lines...) | (...skipping 63 matching lines...) |
| 74 // |virtual_source_idx_|, etc. | 74 // |virtual_source_idx_|, etc. |
| 75 | 75 |
| 76 // MSVC++ requires this to be set before any other includes to get M_PI. | 76 // MSVC++ requires this to be set before any other includes to get M_PI. |
| 77 #define _USE_MATH_DEFINES | 77 #define _USE_MATH_DEFINES |
| 78 | 78 |
| 79 #include "media/base/sinc_resampler.h" | 79 #include "media/base/sinc_resampler.h" |
| 80 | 80 |
| 81 #include <cmath> | 81 #include <cmath> |
| 82 #include <limits> | 82 #include <limits> |
| 83 | 83 |
| 84 #include "base/debug/alias.h" | |
| 85 #include "base/logging.h" | 84 #include "base/logging.h" |
| 86 #include "build/build_config.h" | 85 #include "build/build_config.h" |
| 87 | 86 |
| 88 #if defined(ARCH_CPU_X86_FAMILY) | 87 #if defined(ARCH_CPU_X86_FAMILY) |
| 89 #include <xmmintrin.h> | 88 #include <xmmintrin.h> |
| 90 #define CONVOLVE_FUNC Convolve_SSE | 89 #define CONVOLVE_FUNC Convolve_SSE |
| 91 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) | 90 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) |
| 92 #include <arm_neon.h> | 91 #include <arm_neon.h> |
| 93 #define CONVOLVE_FUNC Convolve_NEON | 92 #define CONVOLVE_FUNC Convolve_NEON |
| 94 #else | 93 #else |
| (...skipping 95 matching lines...) | (...skipping 95 matching lines...) |
| 190 kernel_pre_sinc_storage_[idx] = pre_sinc; | 189 kernel_pre_sinc_storage_[idx] = pre_sinc; |
| 191 | 190 |
| 192 // Compute Blackman window, matching the offset of the sinc(). | 191 // Compute Blackman window, matching the offset of the sinc(). |
| 193 const float x = (i - subsample_offset) / kKernelSize; | 192 const float x = (i - subsample_offset) / kKernelSize; |
| 194 const float window = static_cast<float>(kA0 - kA1 * cos(2.0 * M_PI * x) + | 193 const float window = static_cast<float>(kA0 - kA1 * cos(2.0 * M_PI * x) + |
| 195 kA2 * cos(4.0 * M_PI * x)); | 194 kA2 * cos(4.0 * M_PI * x)); |
| 196 kernel_window_storage_[idx] = window; | 195 kernel_window_storage_[idx] = window; |
| 197 | 196 |
| 198 // Compute the sinc with offset, then window the sinc() function and store | 197 // Compute the sinc with offset, then window the sinc() function and store |
| 199 // at the correct offset. | 198 // at the correct offset. |
| 200 kernel_storage_[idx] = static_cast<float>(window * | 199 kernel_storage_[idx] = static_cast<float>( |
| 201 ((pre_sinc == 0) ? | 200 window * (pre_sinc ? sin(sinc_scale_factor * pre_sinc) / pre_sinc |
| 202 sinc_scale_factor : | 201 : sinc_scale_factor)); |
| 203 (sin(sinc_scale_factor * pre_sinc) / pre_sinc))); | |
| 204 } | 202 } |
| 205 } | 203 } |
| 206 } | 204 } |
| 207 | 205 |
| 208 void SincResampler::SetRatio(double io_sample_rate_ratio) { | 206 void SincResampler::SetRatio(double io_sample_rate_ratio) { |
| 209 if (fabs(io_sample_rate_ratio_ - io_sample_rate_ratio) < | 207 if (fabs(io_sample_rate_ratio_ - io_sample_rate_ratio) < |
| 210 std::numeric_limits<double>::epsilon()) { | 208 std::numeric_limits<double>::epsilon()) { |
| 211 return; | 209 return; |
| 212 } | 210 } |
| 213 | 211 |
| 214 io_sample_rate_ratio_ = io_sample_rate_ratio; | 212 io_sample_rate_ratio_ = io_sample_rate_ratio; |
| 215 chunk_size_ = CalculateChunkSize(block_size_, io_sample_rate_ratio_); | 213 chunk_size_ = CalculateChunkSize(block_size_, io_sample_rate_ratio_); |
| 216 | 214 |
| 217 // Optimize reinitialization by reusing values which are independent of | 215 // Optimize reinitialization by reusing values which are independent of |
| 218 // |sinc_scale_factor|. Provides a 3x speedup. | 216 // |sinc_scale_factor|. Provides a 3x speedup. |
| 219 const double sinc_scale_factor = SincScaleFactor(io_sample_rate_ratio_); | 217 const double sinc_scale_factor = SincScaleFactor(io_sample_rate_ratio_); |
| 220 for (int offset_idx = 0; offset_idx <= kKernelOffsetCount; ++offset_idx) { | 218 for (int offset_idx = 0; offset_idx <= kKernelOffsetCount; ++offset_idx) { |
| 221 for (int i = 0; i < kKernelSize; ++i) { | 219 for (int i = 0; i < kKernelSize; ++i) { |
| 222 const int idx = i + offset_idx * kKernelSize; | 220 const int idx = i + offset_idx * kKernelSize; |
| 223 const float window = kernel_window_storage_[idx]; | 221 const float window = kernel_window_storage_[idx]; |
| 224 const float pre_sinc = kernel_pre_sinc_storage_[idx]; | 222 const float pre_sinc = kernel_pre_sinc_storage_[idx]; |
| 225 | 223 |
| 226 kernel_storage_[idx] = static_cast<float>(window * | 224 kernel_storage_[idx] = static_cast<float>( |
| 227 ((pre_sinc == 0) ? | 225 window * (pre_sinc ? sin(sinc_scale_factor * pre_sinc) / pre_sinc |
| 228 sinc_scale_factor : | 226 : sinc_scale_factor)); |
| 229 (sin(sinc_scale_factor * pre_sinc) / pre_sinc))); | |
| 230 } | 227 } |
| 231 } | 228 } |
| 232 } | 229 } |
| 233 | 230 |
| 234 void SincResampler::Resample(int frames, float* destination) { | 231 void SincResampler::Resample(int frames, float* destination) { |
| 235 int remaining_frames = frames; | 232 int remaining_frames = frames; |
| 236 | 233 |
| 237 // Step (1) -- Prime the input buffer at the start of the input stream. | 234 // Step (1) -- Prime the input buffer at the start of the input stream. |
| 238 if (!buffer_primed_ && remaining_frames) { | 235 if (!buffer_primed_ && remaining_frames) { |
| 239 read_cb_.Run(request_frames_, r0_); | 236 read_cb_.Run(request_frames_, r0_); |
| 240 buffer_primed_ = true; | 237 buffer_primed_ = true; |
| 241 } | 238 } |
| 242 | 239 |
| 243 // TODO(dalecurtis): Temporary debugging for http://crbug.com/663814 | 240 // Step (2) -- Resample! |
| 244 const double starting_idx = virtual_source_idx_; | |
| 245 CHECK(!std::isnan(virtual_source_idx_)); | |
| 246 | |
| 247 // Step (2) -- Resample! const what we can outside of the loop for speed. It | |
| 248 // actually has an impact on ARM performance. See inner loop comment below. | |
| 249 const double current_io_ratio = io_sample_rate_ratio_; | |
| 250 const float* const kernel_ptr = kernel_storage_.get(); | |
| 251 while (remaining_frames) { | 241 while (remaining_frames) { |
| 252 // Note: The loop construct here can severely impact performance on ARM | 242 while (virtual_source_idx_ < block_size_) { |
| 253 // or when built with clang. See https://codereview.chromium.org/18566009/ | |
| 254 int source_idx = static_cast<int>(virtual_source_idx_); | |
| 255 // TODO(dalecurtis): Temporary debugging for http://crbug.com/663814 | |
| 256 CHECK_GE(source_idx, 0); | |
| 257 while (source_idx < block_size_) { | |
| 258 // |virtual_source_idx_| lies in between two kernel offsets so figure out | 243 // |virtual_source_idx_| lies in between two kernel offsets so figure out |
| 259 // what they are. | 244 // what they are. |
| 260 const double subsample_remainder = virtual_source_idx_ - source_idx; | 245 const int source_idx = static_cast<int>(virtual_source_idx_); |
| 261 | |
| 262 const double virtual_offset_idx = | 246 const double virtual_offset_idx = |
| 263 subsample_remainder * kKernelOffsetCount; | 247 (virtual_source_idx_ - source_idx) * kKernelOffsetCount; |
| 264 const int offset_idx = static_cast<int>(virtual_offset_idx); | 248 const int offset_idx = static_cast<int>(virtual_offset_idx); |
| 265 | 249 |
| 266 // We'll compute "convolutions" for the two kernels which straddle | 250 // We'll compute "convolutions" for the two kernels which straddle |
| 267 // |virtual_source_idx_|. | 251 // |virtual_source_idx_|. |
| 268 const float* const k1 = kernel_ptr + offset_idx * kKernelSize; | 252 const float* k1 = kernel_storage_.get() + offset_idx * kKernelSize; |
| 269 const float* const k2 = k1 + kKernelSize; | 253 const float* k2 = k1 + kKernelSize; |
| 270 | 254 |
| 271 // Ensure |k1|, |k2| are 16-byte aligned for SIMD usage. Should always be | 255 // Ensure |k1|, |k2| are 16-byte aligned for SIMD usage. Should always be |
| 272 // true so long as kKernelSize is a multiple of 16. | 256 // true so long as kKernelSize is a multiple of 16. |
| 273 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F); | 257 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F); |
| 274 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F); | 258 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F); |
| 275 | 259 |
| 276 // Initialize input pointer based on quantized |virtual_source_idx_|. | 260 // Initialize input pointer based on quantized |virtual_source_idx_|. |
| 277 const float* const input_ptr = r1_ + source_idx; | 261 const float* input_ptr = r1_ + source_idx; |
| 278 | 262 |
| 279 // Figure out how much to weight each kernel's "convolution". | 263 // Figure out how much to weight each kernel's "convolution". |
| 280 const double kernel_interpolation_factor = | 264 const double kernel_interpolation_factor = |
| 281 virtual_offset_idx - offset_idx; | 265 virtual_offset_idx - offset_idx; |
| 282 *destination++ = CONVOLVE_FUNC( | 266 *destination++ = |
| 283 input_ptr, k1, k2, kernel_interpolation_factor); | 267 CONVOLVE_FUNC(input_ptr, k1, k2, kernel_interpolation_factor); |
| 284 | 268 |
| 285 // Advance the virtual index. | 269 // Advance the virtual index. |
| 286 virtual_source_idx_ += current_io_ratio; | 270 virtual_source_idx_ += io_sample_rate_ratio_; |
| 287 source_idx = static_cast<int>(virtual_source_idx_); | |
| 288 | |
| 289 // TODO(dalecurtis): Temporary debugging for http://crbug.com/663814 | |
| 290 base::debug::Alias(&starting_idx); | |
| 291 CHECK(!std::isnan(virtual_source_idx_)); | |
| 292 CHECK_GE(source_idx, 0); | |
| 293 if (!--remaining_frames) | 271 if (!--remaining_frames) |
| 294 return; | 272 return; |
| 295 } | 273 } |
| 296 | 274 |
| 297 // Wrap back around to the start. | 275 // Wrap back around to the start. |
| 298 DCHECK_GE(virtual_source_idx_, block_size_); | 276 DCHECK_GE(virtual_source_idx_, block_size_); |
| 299 virtual_source_idx_ -= block_size_; | 277 virtual_source_idx_ -= block_size_; |
| 300 | 278 |
| 301 // TODO(dalecurtis): Temporary debugging for http://crbug.com/663814 | |
| 302 base::debug::Alias(&starting_idx); | |
| 303 CHECK(!std::isnan(virtual_source_idx_)); | |
| 304 | |
| 305 // Step (3) -- Copy r3_, r4_ to r1_, r2_. | 279 // Step (3) -- Copy r3_, r4_ to r1_, r2_. |
| 306 // This wraps the last input frames back to the start of the buffer. | 280 // This wraps the last input frames back to the start of the buffer. |
| 307 memcpy(r1_, r3_, sizeof(*input_buffer_.get()) * kKernelSize); | 281 memcpy(r1_, r3_, sizeof(*input_buffer_.get()) * kKernelSize); |
| 308 | 282 |
| 309 // Step (4) -- Reinitialize regions if necessary. | 283 // Step (4) -- Reinitialize regions if necessary. |
| 310 if (r0_ == r2_) | 284 if (r0_ == r2_) |
| 311 UpdateRegions(true); | 285 UpdateRegions(true); |
| 312 | 286 |
| 313 // Step (5) -- Refresh the buffer with more input. | 287 // Step (5) -- Refresh the buffer with more input. |
| 314 read_cb_.Run(request_frames_, r0_); | 288 read_cb_.Run(request_frames_, r0_); |
| (...skipping 10 matching lines...) | (...skipping 10 matching lines...) |
| 325 | 299 |
| 326 void SincResampler::Flush() { | 300 void SincResampler::Flush() { |
| 327 virtual_source_idx_ = 0; | 301 virtual_source_idx_ = 0; |
| 328 buffer_primed_ = false; | 302 buffer_primed_ = false; |
| 329 memset(input_buffer_.get(), 0, | 303 memset(input_buffer_.get(), 0, |
| 330 sizeof(*input_buffer_.get()) * input_buffer_size_); | 304 sizeof(*input_buffer_.get()) * input_buffer_size_); |
| 331 UpdateRegions(false); | 305 UpdateRegions(false); |
| 332 } | 306 } |
| 333 | 307 |
| 334 double SincResampler::BufferedFrames() const { | 308 double SincResampler::BufferedFrames() const { |
| 335 if (buffer_primed_) { | 309 return buffer_primed_ ? request_frames_ - virtual_source_idx_ : 0; |
| 336 return request_frames_ - virtual_source_idx_; | |
| 337 } else { | |
| 338 return 0.0; | |
| 339 } | |
| 340 } | 310 } |
| 341 | 311 |
| 342 float SincResampler::Convolve_C(const float* input_ptr, const float* k1, | 312 float SincResampler::Convolve_C(const float* input_ptr, const float* k1, |
| 343 const float* k2, | 313 const float* k2, |
| 344 double kernel_interpolation_factor) { | 314 double kernel_interpolation_factor) { |
| 345 float sum1 = 0; | 315 float sum1 = 0; |
| 346 float sum2 = 0; | 316 float sum2 = 0; |
| 347 | 317 |
| 348 // Generate a single output sample. Unrolling this loop hurt performance in | 318 // Generate a single output sample. Unrolling this loop hurt performance in |
| 349 // local testing. | 319 // local testing. |
| (...skipping 70 matching lines...) | (...skipping 70 matching lines...) |
| 420 vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)), | 390 vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)), |
| 421 m_sums2, vmovq_n_f32(kernel_interpolation_factor)); | 391 m_sums2, vmovq_n_f32(kernel_interpolation_factor)); |
| 422 | 392 |
| 423 // Sum components together. | 393 // Sum components together. |
| 424 float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1)); | 394 float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1)); |
| 425 return vget_lane_f32(vpadd_f32(m_half, m_half), 0); | 395 return vget_lane_f32(vpadd_f32(m_half, m_half), 0); |
| 426 } | 396 } |
| 427 #endif | 397 #endif |
| 428 | 398 |
| 429 } // namespace media | 399 } // namespace media |
| OLD | NEW |