OLD | NEW |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 // | 4 // |
5 // Initial input buffer layout, dividing into regions r0_ to r4_ (note: r0_, r3_ | 5 // Initial input buffer layout, dividing into regions r0_ to r4_ (note: r0_, r3_ |
6 // and r4_ will move after the first load): | 6 // and r4_ will move after the first load): |
7 // | 7 // |
8 // |----------------|-----------------------------------------|----------------| | 8 // |----------------|-----------------------------------------|----------------| |
9 // | 9 // |
10 // request_frames_ | 10 // request_frames_ |
(...skipping 160 matching lines...)
171 | 171 |
172 // Generates a set of windowed sinc() kernels. | 172 // Generates a set of windowed sinc() kernels. |
173 // We generate a range of sub-sample offsets from 0.0 to 1.0. | 173 // We generate a range of sub-sample offsets from 0.0 to 1.0. |
174 const double sinc_scale_factor = SincScaleFactor(io_sample_rate_ratio_); | 174 const double sinc_scale_factor = SincScaleFactor(io_sample_rate_ratio_); |
175 for (int offset_idx = 0; offset_idx <= kKernelOffsetCount; ++offset_idx) { | 175 for (int offset_idx = 0; offset_idx <= kKernelOffsetCount; ++offset_idx) { |
176 const float subsample_offset = | 176 const float subsample_offset = |
177 static_cast<float>(offset_idx) / kKernelOffsetCount; | 177 static_cast<float>(offset_idx) / kKernelOffsetCount; |
178 | 178 |
179 for (int i = 0; i < kKernelSize; ++i) { | 179 for (int i = 0; i < kKernelSize; ++i) { |
180 const int idx = i + offset_idx * kKernelSize; | 180 const int idx = i + offset_idx * kKernelSize; |
181 const float pre_sinc = M_PI * (i - kKernelSize / 2 - subsample_offset); | 181 const float pre_sinc = |
| 182 static_cast<float>(M_PI * (i - kKernelSize / 2 - subsample_offset)); |
182 kernel_pre_sinc_storage_[idx] = pre_sinc; | 183 kernel_pre_sinc_storage_[idx] = pre_sinc; |
183 | 184 |
184 // Compute Blackman window, matching the offset of the sinc(). | 185 // Compute Blackman window, matching the offset of the sinc(). |
185 const float x = (i - subsample_offset) / kKernelSize; | 186 const float x = (i - subsample_offset) / kKernelSize; |
186 const float window = | 187 const float window = static_cast<float>(kA0 - kA1 * cos(2.0 * M_PI * x) + |
187 kA0 - kA1 * cos(2.0 * M_PI * x) + kA2 * cos(4.0 * M_PI * x); | 188 kA2 * cos(4.0 * M_PI * x)); |
188 kernel_window_storage_[idx] = window; | 189 kernel_window_storage_[idx] = window; |
189 | 190 |
190 // Compute the sinc with offset, then window the sinc() function and store | 191 // Compute the sinc with offset, then window the sinc() function and store |
191 // at the correct offset. | 192 // at the correct offset. |
192 if (pre_sinc == 0) { | 193 kernel_storage_[idx] = static_cast<float>(window * |
193 kernel_storage_[idx] = sinc_scale_factor * window; | 194 ((pre_sinc == 0) ? |
194 } else { | 195 sinc_scale_factor : |
195 kernel_storage_[idx] = | 196 (sin(sinc_scale_factor * pre_sinc) / pre_sinc))); |
196 window * sin(sinc_scale_factor * pre_sinc) / pre_sinc; | |
197 } | |
198 } | 197 } |
199 } | 198 } |
200 } | 199 } |
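
For reference, a minimal standalone sketch of the per-tap math the loop above implements. The Blackman coefficients kA0/kA1/kA2 are defined earlier in this file, outside this hunk; the classic Blackman values used below are an assumption for illustration.

  #include <cmath>

  // One Blackman-windowed sinc() tap, mirroring the loop body above.
  float KernelTap(int i, int kernel_size, float subsample_offset,
                  double sinc_scale_factor) {
    const double kA0 = 0.42, kA1 = 0.50, kA2 = 0.08;  // assumed values
    const double pre_sinc = M_PI * (i - kernel_size / 2 - subsample_offset);
    const double x = (i - subsample_offset) / kernel_size;
    const double window =
        kA0 - kA1 * cos(2.0 * M_PI * x) + kA2 * cos(4.0 * M_PI * x);
    // The zero special case avoids 0/0; see the note after SetRatio() below.
    return static_cast<float>(
        window * (pre_sinc == 0
                      ? sinc_scale_factor
                      : sin(sinc_scale_factor * pre_sinc) / pre_sinc));
  }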
201 | 200 |
202 void SincResampler::SetRatio(double io_sample_rate_ratio) { | 201 void SincResampler::SetRatio(double io_sample_rate_ratio) { |
203 if (fabs(io_sample_rate_ratio_ - io_sample_rate_ratio) < | 202 if (fabs(io_sample_rate_ratio_ - io_sample_rate_ratio) < |
204 std::numeric_limits<double>::epsilon()) { | 203 std::numeric_limits<double>::epsilon()) { |
205 return; | 204 return; |
206 } | 205 } |
207 | 206 |
208 io_sample_rate_ratio_ = io_sample_rate_ratio; | 207 io_sample_rate_ratio_ = io_sample_rate_ratio; |
209 | 208 |
210 // Optimize reinitialization by reusing values which are independent of | 209 // Optimize reinitialization by reusing values which are independent of |
211 // |sinc_scale_factor|. Provides a 3x speedup. | 210 // |sinc_scale_factor|. Provides a 3x speedup. |
212 const double sinc_scale_factor = SincScaleFactor(io_sample_rate_ratio_); | 211 const double sinc_scale_factor = SincScaleFactor(io_sample_rate_ratio_); |
213 for (int offset_idx = 0; offset_idx <= kKernelOffsetCount; ++offset_idx) { | 212 for (int offset_idx = 0; offset_idx <= kKernelOffsetCount; ++offset_idx) { |
214 for (int i = 0; i < kKernelSize; ++i) { | 213 for (int i = 0; i < kKernelSize; ++i) { |
215 const int idx = i + offset_idx * kKernelSize; | 214 const int idx = i + offset_idx * kKernelSize; |
216 const float window = kernel_window_storage_[idx]; | 215 const float window = kernel_window_storage_[idx]; |
217 const float pre_sinc = kernel_pre_sinc_storage_[idx]; | 216 const float pre_sinc = kernel_pre_sinc_storage_[idx]; |
218 | 217 |
219 if (pre_sinc == 0) { | 218 kernel_storage_[idx] = static_cast<float>(window * |
220 kernel_storage_[idx] = sinc_scale_factor * window; | 219 ((pre_sinc == 0) ? |
221 } else { | 220 sinc_scale_factor : |
222 kernel_storage_[idx] = | 221 (sin(sinc_scale_factor * pre_sinc) / pre_sinc))); |
223 window * sin(sinc_scale_factor * pre_sinc) / pre_sinc; | |
224 } | |
225 } | 222 } |
226 } | 223 } |
227 } | 224 } |
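
The reuse works because each tap factors into a ratio-independent and a ratio-dependent part. In notation of my own, with w the cached window value, p the cached pre-sinc value, and s the sinc scale factor:

  kernel[idx] = w \cdot S(p, s), \qquad
  S(p, s) = \begin{cases} s, & p = 0, \\ \sin(s\,p)/p, & p \neq 0. \end{cases}

Only S depends on the ratio, so SetRatio() can skip recomputing w and p. The p = 0 case is the limit \lim_{p \to 0} \sin(s\,p)/p = s, which is why both loops multiply the window by sinc_scale_factor alone there.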
228 | 225 |
229 void SincResampler::Resample(int frames, float* destination) { | 226 void SincResampler::Resample(int frames, float* destination) { |
230 int remaining_frames = frames; | 227 int remaining_frames = frames; |
231 | 228 |
232 // Step (1) -- Prime the input buffer at the start of the input stream. | 229 // Step (1) -- Prime the input buffer at the start of the input stream. |
233 if (!buffer_primed_ && remaining_frames) { | 230 if (!buffer_primed_ && remaining_frames) { |
234 read_cb_.Run(request_frames_, r0_); | 231 read_cb_.Run(request_frames_, r0_); |
235 buffer_primed_ = true; | 232 buffer_primed_ = true; |
236 } | 233 } |
237 | 234 |
238 // Step (2) -- Resample! const what we can outside of the loop for speed. It | 235 // Step (2) -- Resample! const what we can outside of the loop for speed. It |
239 // actually has an impact on ARM performance. See inner loop comment below. | 236 // actually has an impact on ARM performance. See inner loop comment below. |
240 const double current_io_ratio = io_sample_rate_ratio_; | 237 const double current_io_ratio = io_sample_rate_ratio_; |
241 const float* const kernel_ptr = kernel_storage_.get(); | 238 const float* const kernel_ptr = kernel_storage_.get(); |
242 while (remaining_frames) { | 239 while (remaining_frames) { |
243 // Note: The loop construct here can severely impact performance on ARM | 240 // Note: The loop construct here can severely impact performance on ARM |
244 // or when built with clang. See https://codereview.chromium.org/18566009/ | 241 // or when built with clang. See https://codereview.chromium.org/18566009/ |
245 int source_idx = virtual_source_idx_; | 242 int source_idx = static_cast<int>(virtual_source_idx_); |
246 while (source_idx < block_size_) { | 243 while (source_idx < block_size_) { |
247 // |virtual_source_idx_| lies in between two kernel offsets so figure out | 244 // |virtual_source_idx_| lies in between two kernel offsets so figure out |
248 // what they are. | 245 // what they are. |
249 const double subsample_remainder = virtual_source_idx_ - source_idx; | 246 const double subsample_remainder = virtual_source_idx_ - source_idx; |
250 | 247 |
251 const double virtual_offset_idx = | 248 const double virtual_offset_idx = |
252 subsample_remainder * kKernelOffsetCount; | 249 subsample_remainder * kKernelOffsetCount; |
253 const int offset_idx = virtual_offset_idx; | 250 const int offset_idx = static_cast<int>(virtual_offset_idx); |
254 | 251 |
255 // We'll compute "convolutions" for the two kernels which straddle | 252 // We'll compute "convolutions" for the two kernels which straddle |
256 // |virtual_source_idx_|. | 253 // |virtual_source_idx_|. |
257 const float* const k1 = kernel_ptr + offset_idx * kKernelSize; | 254 const float* const k1 = kernel_ptr + offset_idx * kKernelSize; |
258 const float* const k2 = k1 + kKernelSize; | 255 const float* const k2 = k1 + kKernelSize; |
259 | 256 |
260 // Ensure |k1|, |k2| are 16-byte aligned for SIMD usage. Should always be | 257 // Ensure |k1|, |k2| are 16-byte aligned for SIMD usage. Should always be |
261 // true so long as kKernelSize is a multiple of 16. | 258 // true so long as kKernelSize is a multiple of 16. |
262 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F); | 259 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F); |
263 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F); | 260 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F); |
264 | 261 |
265 // Initialize input pointer based on quantized |virtual_source_idx_|. | 262 // Initialize input pointer based on quantized |virtual_source_idx_|. |
266 const float* const input_ptr = r1_ + source_idx; | 263 const float* const input_ptr = r1_ + source_idx; |
267 | 264 |
268 // Figure out how much to weight each kernel's "convolution". | 265 // Figure out how much to weight each kernel's "convolution". |
269 const double kernel_interpolation_factor = | 266 const double kernel_interpolation_factor = |
270 virtual_offset_idx - offset_idx; | 267 virtual_offset_idx - offset_idx; |
271 *destination++ = CONVOLVE_FUNC( | 268 *destination++ = CONVOLVE_FUNC( |
272 input_ptr, k1, k2, kernel_interpolation_factor); | 269 input_ptr, k1, k2, kernel_interpolation_factor); |
273 | 270 |
274 // Advance the virtual index. | 271 // Advance the virtual index. |
275 virtual_source_idx_ += current_io_ratio; | 272 virtual_source_idx_ += current_io_ratio; |
276 source_idx = virtual_source_idx_; | 273 source_idx = static_cast<int>(virtual_source_idx_); |
277 | 274 |
278 if (!--remaining_frames) | 275 if (!--remaining_frames) |
279 return; | 276 return; |
280 } | 277 } |
281 | 278 |
282 // Wrap back around to the start. | 279 // Wrap back around to the start. |
283 DCHECK_GE(virtual_source_idx_, block_size_); | 280 DCHECK_GE(virtual_source_idx_, block_size_); |
284 virtual_source_idx_ -= block_size_; | 281 virtual_source_idx_ -= block_size_; |
285 | 282 |
286 // Step (3) -- Copy r3_, r4_ to r1_, r2_. | 283 // Step (3) -- Copy r3_, r4_ to r1_, r2_. |
287 // This wraps the last input frames back to the start of the buffer. | 284 // This wraps the last input frames back to the start of the buffer. |
288 memcpy(r1_, r3_, sizeof(*input_buffer_.get()) * kKernelSize); | 285 memcpy(r1_, r3_, sizeof(*input_buffer_.get()) * kKernelSize); |
289 | 286 |
290 // Step (4) -- Reinitialize regions if necessary. | 287 // Step (4) -- Reinitialize regions if necessary. |
291 if (r0_ == r2_) | 288 if (r0_ == r2_) |
292 UpdateRegions(true); | 289 UpdateRegions(true); |
293 | 290 |
294 // Step (5) -- Refresh the buffer with more input. | 291 // Step (5) -- Refresh the buffer with more input. |
295 read_cb_.Run(request_frames_, r0_); | 292 read_cb_.Run(request_frames_, r0_); |
296 } | 293 } |
297 } | 294 } |
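
To make the inner loop's kernel-selection arithmetic concrete, a worked example with numbers of my choosing (kKernelOffsetCount = 32 is assumed for illustration):

  // Hypothetical values traced through the index math above.
  const int kKernelOffsetCount = 32;                               // assumed
  double virtual_source_idx = 123.37;                              // example
  int source_idx = static_cast<int>(virtual_source_idx);           // 123
  double subsample_remainder = virtual_source_idx - source_idx;    // 0.37
  double virtual_offset_idx =
      subsample_remainder * kKernelOffsetCount;                    // 11.84
  int offset_idx = static_cast<int>(virtual_offset_idx);           // kernel 11
  double kernel_interpolation_factor =
      virtual_offset_idx - offset_idx;                             // 0.84

  // The output sample blends kernel 11 (weight 1 - 0.84 = 0.16) with
  // kernel 12 (weight 0.84), exactly as Convolve_*() does.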
298 | 295 |
299 int SincResampler::ChunkSize() const { | 296 int SincResampler::ChunkSize() const { |
300 return block_size_ / io_sample_rate_ratio_; | 297 return static_cast<int>(block_size_ / io_sample_rate_ratio_); |
301 } | 298 } |
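
A worked example, with numbers of my own: resampling 48 kHz input to 44.1 kHz output gives io_sample_rate_ratio_ = 48000/44100 ≈ 1.0884, so assuming block_size_ = 512 the truncating cast yields

  \text{ChunkSize} = \left\lfloor \frac{512}{1.0884} \right\rfloor = \lfloor 470.4 \rfloor = 470

output frames per fully-buffered block of input.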
302 | 299 |
303 void SincResampler::Flush() { | 300 void SincResampler::Flush() { |
304 virtual_source_idx_ = 0; | 301 virtual_source_idx_ = 0; |
305 buffer_primed_ = false; | 302 buffer_primed_ = false; |
306 memset(input_buffer_.get(), 0, | 303 memset(input_buffer_.get(), 0, |
307 sizeof(*input_buffer_.get()) * input_buffer_size_); | 304 sizeof(*input_buffer_.get()) * input_buffer_size_); |
308 UpdateRegions(false); | 305 UpdateRegions(false); |
309 } | 306 } |
310 | 307 |
311 float SincResampler::Convolve_C(const float* input_ptr, const float* k1, | 308 float SincResampler::Convolve_C(const float* input_ptr, const float* k1, |
312 const float* k2, | 309 const float* k2, |
313 double kernel_interpolation_factor) { | 310 double kernel_interpolation_factor) { |
314 float sum1 = 0; | 311 float sum1 = 0; |
315 float sum2 = 0; | 312 float sum2 = 0; |
316 | 313 |
317 // Generate a single output sample. Unrolling this loop hurt performance in | 314 // Generate a single output sample. Unrolling this loop hurt performance in |
318 // local testing. | 315 // local testing. |
319 int n = kKernelSize; | 316 int n = kKernelSize; |
320 while (n--) { | 317 while (n--) { |
321 sum1 += *input_ptr * *k1++; | 318 sum1 += *input_ptr * *k1++; |
322 sum2 += *input_ptr++ * *k2++; | 319 sum2 += *input_ptr++ * *k2++; |
323 } | 320 } |
324 | 321 |
325 // Linearly interpolate the two "convolutions". | 322 // Linearly interpolate the two "convolutions". |
326 return (1.0 - kernel_interpolation_factor) * sum1 | 323 return static_cast<float>((1.0 - kernel_interpolation_factor) * sum1 + |
327 + kernel_interpolation_factor * sum2; | 324 kernel_interpolation_factor * sum2); |
328 } | 325 } |
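
In notation of my own, with x the input window, k_1 and k_2 the two bracketing kernels, K = kKernelSize, and f the interpolation factor, this computes

  y = (1 - f)\sum_{n=0}^{K-1} x[n]\,k_1[n] + f\sum_{n=0}^{K-1} x[n]\,k_2[n],

a linear blend of two dot products. The SIMD variants below compute the same quantity four lanes at a time.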
329 | 326 |
330 #if defined(ARCH_CPU_X86_FAMILY) | 327 #if defined(ARCH_CPU_X86_FAMILY) |
331 float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1, | 328 float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1, |
332 const float* k2, | 329 const float* k2, |
333 double kernel_interpolation_factor) { | 330 double kernel_interpolation_factor) { |
334 __m128 m_input; | 331 __m128 m_input; |
335 __m128 m_sums1 = _mm_setzero_ps(); | 332 __m128 m_sums1 = _mm_setzero_ps(); |
336 __m128 m_sums2 = _mm_setzero_ps(); | 333 __m128 m_sums2 = _mm_setzero_ps(); |
337 | 334 |
338 // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling | 335 // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling |
339 // these loops hurt performance in local testing. | 336 // these loops hurt performance in local testing. |
340 if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) { | 337 if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) { |
341 for (int i = 0; i < kKernelSize; i += 4) { | 338 for (int i = 0; i < kKernelSize; i += 4) { |
342 m_input = _mm_loadu_ps(input_ptr + i); | 339 m_input = _mm_loadu_ps(input_ptr + i); |
343 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); | 340 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); |
344 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); | 341 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); |
345 } | 342 } |
346 } else { | 343 } else { |
347 for (int i = 0; i < kKernelSize; i += 4) { | 344 for (int i = 0; i < kKernelSize; i += 4) { |
348 m_input = _mm_load_ps(input_ptr + i); | 345 m_input = _mm_load_ps(input_ptr + i); |
349 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); | 346 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); |
350 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); | 347 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); |
351 } | 348 } |
352 } | 349 } |
353 | 350 |
354 // Linearly interpolate the two "convolutions". | 351 // Linearly interpolate the two "convolutions". |
355 m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1(1.0 - kernel_interpolation_factor)); | 352 m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1( |
356 m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1(kernel_interpolation_factor)); | 353 static_cast<float>(1.0 - kernel_interpolation_factor))); |
| 354 m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1( |
| 355 static_cast<float>(kernel_interpolation_factor))); |
357 m_sums1 = _mm_add_ps(m_sums1, m_sums2); | 356 m_sums1 = _mm_add_ps(m_sums1, m_sums2); |
358 | 357 |
359 // Sum components together. | 358 // Sum components together. |
360 float result; | 359 float result; |
361 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1); | 360 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1); |
362 _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps( | 361 _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps( |
363 m_sums2, m_sums2, 1))); | 362 m_sums2, m_sums2, 1))); |
364 | 363 |
365 return result; | 364 return result; |
366 } | 365 } |
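
The tail of the SSE path reduces four lanes to one with a movehl/shuffle pattern; factored out as a standalone sketch (function name mine), it is:

  #include <xmmintrin.h>

  // Fold the high half of |v| onto the low half, then add the remaining two
  // lanes -- the same reduction applied to m_sums1 above.
  float HorizontalSum(__m128 v) {
    __m128 folded = _mm_add_ps(_mm_movehl_ps(v, v), v);  // {v0+v2, v1+v3, ...}
    float result;
    _mm_store_ss(&result,
                 _mm_add_ss(folded, _mm_shuffle_ps(folded, folded, 1)));
    return result;
  }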
(...skipping 20 matching lines...)
387 vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)), | 386 vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)), |
388 m_sums2, vmovq_n_f32(kernel_interpolation_factor)); | 387 m_sums2, vmovq_n_f32(kernel_interpolation_factor)); |
389 | 388 |
390 // Sum components together. | 389 // Sum components together. |
391 float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1)); | 390 float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1)); |
392 return vget_lane_f32(vpadd_f32(m_half, m_half), 0); | 391 return vget_lane_f32(vpadd_f32(m_half, m_half), 0); |
393 } | 392 } |
394 #endif | 393 #endif |
395 | 394 |
396 } // namespace media | 395 } // namespace media |