OLD | NEW |
(Empty) | |
| 1 // Copyright 2013 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 #include "media/base/sinc_resampler.h" |
| 6 |
| 7 #include <xmmintrin.h> |
| 8 |
| 9 namespace media { |
| 10 |
| 11 float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1, |
| 12 const float* k2, |
| 13 double kernel_interpolation_factor) { |
| 14 __m128 m_input; |
| 15 __m128 m_sums1 = _mm_setzero_ps(); |
| 16 __m128 m_sums2 = _mm_setzero_ps(); |
| 17 |
| 18 // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling |
| 19 // these loops hurt performance in local testing. |
| 20 if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) { |
| 21 for (int i = 0; i < kKernelSize; i += 4) { |
| 22 m_input = _mm_loadu_ps(input_ptr + i); |
| 23 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); |
| 24 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); |
| 25 } |
| 26 } else { |
| 27 for (int i = 0; i < kKernelSize; i += 4) { |
| 28 m_input = _mm_load_ps(input_ptr + i); |
| 29 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); |
| 30 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); |
| 31 } |
| 32 } |
| 33 |
| 34 // Linearly interpolate the two "convolutions". |
| 35 m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1(1.0 - kernel_interpolation_factor)); |
| 36 m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1(kernel_interpolation_factor)); |
| 37 m_sums1 = _mm_add_ps(m_sums1, m_sums2); |
| 38 |
| 39 // Sum components together. |
| 40 float result; |
| 41 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1); |
| 42 _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps( |
| 43 m_sums2, m_sums2, 1))); |
| 44 |
| 45 return result; |
| 46 } |
| 47 |
| 48 } // namespace media |
OLD | NEW |