| OLD | NEW |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 // | 4 // |
| 5 // Initial input buffer layout, dividing into regions r0_ to r4_ (note: r0_, r3_ | 5 // Initial input buffer layout, dividing into regions r0_ to r4_ (note: r0_, r3_ |
| 6 // and r4_ will move after the first load): | 6 // and r4_ will move after the first load): |
| 7 // | 7 // |
| 8 // |----------------|-----------------------------------------|----------------| | 8 // |----------------|-----------------------------------------|----------------| |
| 9 // | 9 // |
| 10 // request_frames_ | 10 // request_frames_ |
| (...skipping 160 matching lines...) |
| 171 | 171 |
| 172 // Generates a set of windowed sinc() kernels. | 172 // Generates a set of windowed sinc() kernels. |
| 173 // We generate a range of sub-sample offsets from 0.0 to 1.0. | 173 // We generate a range of sub-sample offsets from 0.0 to 1.0. |
| 174 const double sinc_scale_factor = SincScaleFactor(io_sample_rate_ratio_); | 174 const double sinc_scale_factor = SincScaleFactor(io_sample_rate_ratio_); |
| 175 for (int offset_idx = 0; offset_idx <= kKernelOffsetCount; ++offset_idx) { | 175 for (int offset_idx = 0; offset_idx <= kKernelOffsetCount; ++offset_idx) { |
| 176 const float subsample_offset = | 176 const float subsample_offset = |
| 177 static_cast<float>(offset_idx) / kKernelOffsetCount; | 177 static_cast<float>(offset_idx) / kKernelOffsetCount; |
| 178 | 178 |
| 179 for (int i = 0; i < kKernelSize; ++i) { | 179 for (int i = 0; i < kKernelSize; ++i) { |
| 180 const int idx = i + offset_idx * kKernelSize; | 180 const int idx = i + offset_idx * kKernelSize; |
| 181 const float pre_sinc = M_PI * (i - kKernelSize / 2 - subsample_offset); | 181 const float pre_sinc = |
| 182 static_cast<float>(M_PI * (i - kKernelSize / 2 - subsample_offset)); |
| 182 kernel_pre_sinc_storage_[idx] = pre_sinc; | 183 kernel_pre_sinc_storage_[idx] = pre_sinc; |
| 183 | 184 |
| 184 // Compute Blackman window, matching the offset of the sinc(). | 185 // Compute Blackman window, matching the offset of the sinc(). |
| 185 const float x = (i - subsample_offset) / kKernelSize; | 186 const float x = (i - subsample_offset) / kKernelSize; |
| 186 const float window = | 187 const float window = static_cast<float>(kA0 - kA1 * cos(2.0 * M_PI * x) + |
| 187 kA0 - kA1 * cos(2.0 * M_PI * x) + kA2 * cos(4.0 * M_PI * x); | 188 kA2 * cos(4.0 * M_PI * x)); |
| 188 kernel_window_storage_[idx] = window; | 189 kernel_window_storage_[idx] = window; |
| 189 | 190 |
| 190 // Compute the sinc with offset, then window the sinc() function and store | 191 // Compute the sinc with offset, then window the sinc() function and store |
| 191 // at the correct offset. | 192 // at the correct offset. |
| 192 if (pre_sinc == 0) { | 193 kernel_storage_[idx] = static_cast<float>(window * |
| 193 kernel_storage_[idx] = sinc_scale_factor * window; | 194 ((pre_sinc == 0) ? |
| 194 } else { | 195 sinc_scale_factor : |
| 195 kernel_storage_[idx] = | 196 (sin(sinc_scale_factor * pre_sinc) / pre_sinc))); |
| 196 window * sin(sinc_scale_factor * pre_sinc) / pre_sinc; | |
| 197 } | |
| 198 } | 197 } |
| 199 } | 198 } |
| 200 } | 199 } |
| 201 | 200 |
| 202 void SincResampler::SetRatio(double io_sample_rate_ratio) { | 201 void SincResampler::SetRatio(double io_sample_rate_ratio) { |
| 203 if (fabs(io_sample_rate_ratio_ - io_sample_rate_ratio) < | 202 if (fabs(io_sample_rate_ratio_ - io_sample_rate_ratio) < |
| 204 std::numeric_limits<double>::epsilon()) { | 203 std::numeric_limits<double>::epsilon()) { |
| 205 return; | 204 return; |
| 206 } | 205 } |
| 207 | 206 |
| 208 io_sample_rate_ratio_ = io_sample_rate_ratio; | 207 io_sample_rate_ratio_ = io_sample_rate_ratio; |
| 209 | 208 |
| 210 // Optimize reinitialization by reusing values which are independent of | 209 // Optimize reinitialization by reusing values which are independent of |
| 211 // |sinc_scale_factor|. Provides a 3x speedup. | 210 // |sinc_scale_factor|. Provides a 3x speedup. |
| 212 const double sinc_scale_factor = SincScaleFactor(io_sample_rate_ratio_); | 211 const double sinc_scale_factor = SincScaleFactor(io_sample_rate_ratio_); |
| 213 for (int offset_idx = 0; offset_idx <= kKernelOffsetCount; ++offset_idx) { | 212 for (int offset_idx = 0; offset_idx <= kKernelOffsetCount; ++offset_idx) { |
| 214 for (int i = 0; i < kKernelSize; ++i) { | 213 for (int i = 0; i < kKernelSize; ++i) { |
| 215 const int idx = i + offset_idx * kKernelSize; | 214 const int idx = i + offset_idx * kKernelSize; |
| 216 const float window = kernel_window_storage_[idx]; | 215 const float window = kernel_window_storage_[idx]; |
| 217 const float pre_sinc = kernel_pre_sinc_storage_[idx]; | 216 const float pre_sinc = kernel_pre_sinc_storage_[idx]; |
| 218 | 217 |
| 219 if (pre_sinc == 0) { | 218 kernel_storage_[idx] = static_cast<float>(window * |
| 220 kernel_storage_[idx] = sinc_scale_factor * window; | 219 ((pre_sinc == 0) ? |
| 221 } else { | 220 sinc_scale_factor : |
| 222 kernel_storage_[idx] = | 221 (sin(sinc_scale_factor * pre_sinc) / pre_sinc))); |
| 223 window * sin(sinc_scale_factor * pre_sinc) / pre_sinc; | |
| 224 } | |
| 225 } | 222 } |
| 226 } | 223 } |
| 227 } | 224 } |
| 228 | 225 |
| 229 void SincResampler::Resample(int frames, float* destination) { | 226 void SincResampler::Resample(int frames, float* destination) { |
| 230 int remaining_frames = frames; | 227 int remaining_frames = frames; |
| 231 | 228 |
| 232 // Step (1) -- Prime the input buffer at the start of the input stream. | 229 // Step (1) -- Prime the input buffer at the start of the input stream. |
| 233 if (!buffer_primed_ && remaining_frames) { | 230 if (!buffer_primed_ && remaining_frames) { |
| 234 read_cb_.Run(request_frames_, r0_); | 231 read_cb_.Run(request_frames_, r0_); |
| 235 buffer_primed_ = true; | 232 buffer_primed_ = true; |
| 236 } | 233 } |
| 237 | 234 |
| 238 // Step (2) -- Resample! const what we can outside of the loop for speed. It | 235 // Step (2) -- Resample! const what we can outside of the loop for speed. It |
| 239 // actually has an impact on ARM performance. See inner loop comment below. | 236 // actually has an impact on ARM performance. See inner loop comment below. |
| 240 const double current_io_ratio = io_sample_rate_ratio_; | 237 const double current_io_ratio = io_sample_rate_ratio_; |
| 241 const float* const kernel_ptr = kernel_storage_.get(); | 238 const float* const kernel_ptr = kernel_storage_.get(); |
| 242 while (remaining_frames) { | 239 while (remaining_frames) { |
| 243 // Note: The loop construct here can severely impact performance on ARM | 240 // Note: The loop construct here can severely impact performance on ARM |
| 244 // or when built with clang. See https://codereview.chromium.org/18566009/ | 241 // or when built with clang. See https://codereview.chromium.org/18566009/ |
| 245 int source_idx = virtual_source_idx_; | 242 int source_idx = static_cast<int>(virtual_source_idx_); |
| 246 while (source_idx < block_size_) { | 243 while (source_idx < block_size_) { |
| 247 // |virtual_source_idx_| lies in between two kernel offsets so figure out | 244 // |virtual_source_idx_| lies in between two kernel offsets so figure out |
| 248 // what they are. | 245 // what they are. |
| 249 const double subsample_remainder = virtual_source_idx_ - source_idx; | 246 const double subsample_remainder = virtual_source_idx_ - source_idx; |
| 250 | 247 |
| 251 const double virtual_offset_idx = | 248 const double virtual_offset_idx = |
| 252 subsample_remainder * kKernelOffsetCount; | 249 subsample_remainder * kKernelOffsetCount; |
| 253 const int offset_idx = virtual_offset_idx; | 250 const int offset_idx = static_cast<int>(virtual_offset_idx); |
| 254 | 251 |
| 255 // We'll compute "convolutions" for the two kernels which straddle | 252 // We'll compute "convolutions" for the two kernels which straddle |
| 256 // |virtual_source_idx_|. | 253 // |virtual_source_idx_|. |
| 257 const float* const k1 = kernel_ptr + offset_idx * kKernelSize; | 254 const float* const k1 = kernel_ptr + offset_idx * kKernelSize; |
| 258 const float* const k2 = k1 + kKernelSize; | 255 const float* const k2 = k1 + kKernelSize; |
| 259 | 256 |
| 260 // Ensure |k1|, |k2| are 16-byte aligned for SIMD usage. Should always be | 257 // Ensure |k1|, |k2| are 16-byte aligned for SIMD usage. Should always be |
| 261 // true so long as kKernelSize is a multiple of 16. | 258 // true so long as kKernelSize is a multiple of 16. |
| 262 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F); | 259 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F); |
| 263 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F); | 260 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F); |
| 264 | 261 |
| 265 // Initialize input pointer based on quantized |virtual_source_idx_|. | 262 // Initialize input pointer based on quantized |virtual_source_idx_|. |
| 266 const float* const input_ptr = r1_ + source_idx; | 263 const float* const input_ptr = r1_ + source_idx; |
| 267 | 264 |
| 268 // Figure out how much to weight each kernel's "convolution". | 265 // Figure out how much to weight each kernel's "convolution". |
| 269 const double kernel_interpolation_factor = | 266 const double kernel_interpolation_factor = |
| 270 virtual_offset_idx - offset_idx; | 267 virtual_offset_idx - offset_idx; |
| 271 *destination++ = CONVOLVE_FUNC( | 268 *destination++ = CONVOLVE_FUNC( |
| 272 input_ptr, k1, k2, kernel_interpolation_factor); | 269 input_ptr, k1, k2, kernel_interpolation_factor); |
| 273 | 270 |
| 274 // Advance the virtual index. | 271 // Advance the virtual index. |
| 275 virtual_source_idx_ += current_io_ratio; | 272 virtual_source_idx_ += current_io_ratio; |
| 276 source_idx = virtual_source_idx_; | 273 source_idx = static_cast<int>(virtual_source_idx_); |
| 277 | 274 |
| 278 if (!--remaining_frames) | 275 if (!--remaining_frames) |
| 279 return; | 276 return; |
| 280 } | 277 } |
| 281 | 278 |
| 282 // Wrap back around to the start. | 279 // Wrap back around to the start. |
| 283 DCHECK_GE(virtual_source_idx_, block_size_); | 280 DCHECK_GE(virtual_source_idx_, block_size_); |
| 284 virtual_source_idx_ -= block_size_; | 281 virtual_source_idx_ -= block_size_; |
| 285 | 282 |
| 286 // Step (3) -- Copy r3_, r4_ to r1_, r2_. | 283 // Step (3) -- Copy r3_, r4_ to r1_, r2_. |
| 287 // This wraps the last input frames back to the start of the buffer. | 284 // This wraps the last input frames back to the start of the buffer. |
| 288 memcpy(r1_, r3_, sizeof(*input_buffer_.get()) * kKernelSize); | 285 memcpy(r1_, r3_, sizeof(*input_buffer_.get()) * kKernelSize); |
| 289 | 286 |
| 290 // Step (4) -- Reinitialize regions if necessary. | 287 // Step (4) -- Reinitialize regions if necessary. |
| 291 if (r0_ == r2_) | 288 if (r0_ == r2_) |
| 292 UpdateRegions(true); | 289 UpdateRegions(true); |
| 293 | 290 |
| 294 // Step (5) -- Refresh the buffer with more input. | 291 // Step (5) -- Refresh the buffer with more input. |
| 295 read_cb_.Run(request_frames_, r0_); | 292 read_cb_.Run(request_frames_, r0_); |
| 296 } | 293 } |
| 297 } | 294 } |
| 298 | 295 |
| 299 int SincResampler::ChunkSize() const { | 296 int SincResampler::ChunkSize() const { |
| 300 return block_size_ / io_sample_rate_ratio_; | 297 return static_cast<int>(block_size_ / io_sample_rate_ratio_); |
| 301 } | 298 } |
| 302 | 299 |
| 303 void SincResampler::Flush() { | 300 void SincResampler::Flush() { |
| 304 virtual_source_idx_ = 0; | 301 virtual_source_idx_ = 0; |
| 305 buffer_primed_ = false; | 302 buffer_primed_ = false; |
| 306 memset(input_buffer_.get(), 0, | 303 memset(input_buffer_.get(), 0, |
| 307 sizeof(*input_buffer_.get()) * input_buffer_size_); | 304 sizeof(*input_buffer_.get()) * input_buffer_size_); |
| 308 UpdateRegions(false); | 305 UpdateRegions(false); |
| 309 } | 306 } |
| 310 | 307 |
| 311 float SincResampler::Convolve_C(const float* input_ptr, const float* k1, | 308 float SincResampler::Convolve_C(const float* input_ptr, const float* k1, |
| 312 const float* k2, | 309 const float* k2, |
| 313 double kernel_interpolation_factor) { | 310 double kernel_interpolation_factor) { |
| 314 float sum1 = 0; | 311 float sum1 = 0; |
| 315 float sum2 = 0; | 312 float sum2 = 0; |
| 316 | 313 |
| 317 // Generate a single output sample. Unrolling this loop hurt performance in | 314 // Generate a single output sample. Unrolling this loop hurt performance in |
| 318 // local testing. | 315 // local testing. |
| 319 int n = kKernelSize; | 316 int n = kKernelSize; |
| 320 while (n--) { | 317 while (n--) { |
| 321 sum1 += *input_ptr * *k1++; | 318 sum1 += *input_ptr * *k1++; |
| 322 sum2 += *input_ptr++ * *k2++; | 319 sum2 += *input_ptr++ * *k2++; |
| 323 } | 320 } |
| 324 | 321 |
| 325 // Linearly interpolate the two "convolutions". | 322 // Linearly interpolate the two "convolutions". |
| 326 return (1.0 - kernel_interpolation_factor) * sum1 | 323 return static_cast<float>((1.0 - kernel_interpolation_factor) * sum1 + |
| 327 + kernel_interpolation_factor * sum2; | 324 kernel_interpolation_factor * sum2); |
| 328 } | 325 } |
| 329 | 326 |
| 330 #if defined(ARCH_CPU_X86_FAMILY) | 327 #if defined(ARCH_CPU_X86_FAMILY) |
| 331 float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1, | 328 float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1, |
| 332 const float* k2, | 329 const float* k2, |
| 333 double kernel_interpolation_factor) { | 330 double kernel_interpolation_factor) { |
| 334 __m128 m_input; | 331 __m128 m_input; |
| 335 __m128 m_sums1 = _mm_setzero_ps(); | 332 __m128 m_sums1 = _mm_setzero_ps(); |
| 336 __m128 m_sums2 = _mm_setzero_ps(); | 333 __m128 m_sums2 = _mm_setzero_ps(); |
| 337 | 334 |
| 338 // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling | 335 // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling |
| 339 // these loops hurt performance in local testing. | 336 // these loops hurt performance in local testing. |
| 340 if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) { | 337 if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) { |
| 341 for (int i = 0; i < kKernelSize; i += 4) { | 338 for (int i = 0; i < kKernelSize; i += 4) { |
| 342 m_input = _mm_loadu_ps(input_ptr + i); | 339 m_input = _mm_loadu_ps(input_ptr + i); |
| 343 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); | 340 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); |
| 344 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); | 341 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); |
| 345 } | 342 } |
| 346 } else { | 343 } else { |
| 347 for (int i = 0; i < kKernelSize; i += 4) { | 344 for (int i = 0; i < kKernelSize; i += 4) { |
| 348 m_input = _mm_load_ps(input_ptr + i); | 345 m_input = _mm_load_ps(input_ptr + i); |
| 349 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); | 346 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); |
| 350 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); | 347 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); |
| 351 } | 348 } |
| 352 } | 349 } |
| 353 | 350 |
| 354 // Linearly interpolate the two "convolutions". | 351 // Linearly interpolate the two "convolutions". |
| 355 m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1(1.0 - kernel_interpolation_factor)); | 352 m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1( |
| 356 m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1(kernel_interpolation_factor)); | 353 static_cast<float>(1.0 - kernel_interpolation_factor))); |
| 354 m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1( |
| 355 static_cast<float>(kernel_interpolation_factor))); |
| 357 m_sums1 = _mm_add_ps(m_sums1, m_sums2); | 356 m_sums1 = _mm_add_ps(m_sums1, m_sums2); |
| 358 | 357 |
| 359 // Sum components together. | 358 // Sum components together. |
| 360 float result; | 359 float result; |
| 361 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1); | 360 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1); |
| 362 _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps( | 361 _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps( |
| 363 m_sums2, m_sums2, 1))); | 362 m_sums2, m_sums2, 1))); |
| 364 | 363 |
| 365 return result; | 364 return result; |
| 366 } | 365 } |
| (...skipping 20 matching lines...) |
| 387 vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)), | 386 vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)), |
| 388 m_sums2, vmovq_n_f32(kernel_interpolation_factor)); | 387 m_sums2, vmovq_n_f32(kernel_interpolation_factor)); |
| 389 | 388 |
| 390 // Sum components together. | 389 // Sum components together. |
| 391 float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1)); | 390 float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1)); |
| 392 return vget_lane_f32(vpadd_f32(m_half, m_half), 0); | 391 return vget_lane_f32(vpadd_f32(m_half, m_half), 0); |
| 393 } | 392 } |
| 394 #endif | 393 #endif |
| 395 | 394 |
| 396 } // namespace media | 395 } // namespace media |
| OLD | NEW |