| OLD | NEW |
| (Empty) | |
| 1 // Copyright 2016 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 #include <immintrin.h> |
| 6 #include <stdint.h> |
| 7 |
| 8 #include "media/base/vector_math.h" |
| 9 #include "media/base/vector_math_testing.h" |
| 10 |
| 11 namespace media { |
| 12 namespace vector_math { |
| 13 |
// Adds up all eight float lanes of |v| and returns the scalar total.
inline float HorizontalSum(__m256 v) {
  // Fold the upper 128-bit half onto the lower half: 8 lanes -> 4 partials.
  const __m128 upper = _mm256_extractf128_ps(v, 1);
  const __m128 lower = _mm256_castps256_ps128(v);
  const __m128 quad = _mm_add_ps(lower, upper);

  // Duplicate the odd lanes downward so that, after the add, lane 0 holds
  // q0 + q1 and lane 2 holds q2 + q3.
  const __m128 odd = _mm_movehdup_ps(quad);
  const __m128 pairs = _mm_add_ps(quad, odd);

  // Bring the upper pair sum (lane 2) down to lane 0 and add the two pairs.
  const __m128 high_pair = _mm_movehl_ps(odd, pairs);
  return _mm_cvtss_f32(_mm_add_ss(pairs, high_pair));
}
| 22 |
| 23 float Convolve_AVX(const float* src, |
| 24 const float* k1, |
| 25 const float* k2, |
| 26 double kernel_interpolation_factor) { |
| 27 __m256 m_input; |
| 28 __m256 m_sums1 = _mm256_setzero_ps(); |
| 29 __m256 m_sums2 = _mm256_setzero_ps(); |
| 30 |
| 31 // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling |
| 32 // these loops hurt performance in local testing. |
| 33 if (reinterpret_cast<uintptr_t>(src) & 0x1F) { |
| 34 for (int i = 0; i < kKernelSize; i += 8) { |
| 35 m_input = _mm256_loadu_ps(src + i); |
| 36 m_sums1 = _mm256_add_ps(m_sums1, |
| 37 _mm256_mul_ps(m_input, _mm256_load_ps(k1 + i))); |
| 38 m_sums2 = _mm256_add_ps(m_sums2, |
| 39 _mm256_mul_ps(m_input, _mm256_load_ps(k2 + i))); |
| 40 } |
| 41 } else { |
| 42 for (int i = 0; i < kKernelSize; i += 8) { |
| 43 m_input = _mm256_load_ps(src + i); |
| 44 m_sums1 = _mm256_add_ps(m_sums1, |
| 45 _mm256_mul_ps(m_input, _mm256_load_ps(k1 + i))); |
| 46 m_sums2 = _mm256_add_ps(m_sums2, |
| 47 _mm256_mul_ps(m_input, _mm256_load_ps(k2 + i))); |
| 48 } |
| 49 } |
| 50 |
| 51 // Linearly interpolate the two "convolutions". |
| 52 m_sums1 = _mm256_mul_ps( |
| 53 m_sums1, |
| 54 _mm256_set1_ps(static_cast<float>(1.0 - kernel_interpolation_factor))); |
| 55 m_sums2 = _mm256_mul_ps( |
| 56 m_sums2, _mm256_set1_ps(static_cast<float>(kernel_interpolation_factor))); |
| 57 |
| 58 // Sum components together. |
| 59 return HorizontalSum(_mm256_add_ps(m_sums1, m_sums2)); |
| 60 } |
| 61 |
| 62 float DotProduct_AVX(const float* a, const float* b, int len) { |
| 63 const int rem = len % 8; |
| 64 const int last_index = len - rem; |
| 65 |
| 66 // First sum all components. |
| 67 __m256 m_sum = _mm256_setzero_ps(); |
| 68 if (reinterpret_cast<uintptr_t>(a) & 0x1F || |
| 69 reinterpret_cast<uintptr_t>(b) & 0x1F) { |
| 70 for (int i = 0; i < last_index; i += 8) { |
| 71 m_sum = _mm256_add_ps( |
| 72 m_sum, _mm256_mul_ps(_mm256_loadu_ps(a + i), _mm256_loadu_ps(b + i))); |
| 73 } |
| 74 } else { |
| 75 for (int i = 0; i < last_index; i += 8) { |
| 76 m_sum = _mm256_add_ps( |
| 77 m_sum, _mm256_mul_ps(_mm256_load_ps(a + i), _mm256_load_ps(b + i))); |
| 78 } |
| 79 } |
| 80 |
| 81 float sum = HorizontalSum(m_sum); |
| 82 |
| 83 // Handle any remaining values that wouldn't fit in an AVX pass. |
| 84 for (int i = last_index; i < len; ++i) |
| 85 sum += a[i] * b[i]; |
| 86 |
| 87 return sum; |
| 88 } |
| 89 |
// Multiply-accumulate: dest[i] += src[i] * scale for every i in [0, len).
// Elements are processed eight at a time with AVX; the remainder after the
// last full group of eight is handled with scalar code.
//
// Aligned loads/stores are used only when both |src| and |dest| are 32-byte
// aligned. Otherwise the unaligned variants are used, so a misaligned pointer
// no longer faults (this matches the alignment handling in DotProduct_AVX).
void FMAC_AVX(const float* src, float scale, int len, float* dest) {
  const int rem = len % 8;
  const int last_index = len - rem;
  const __m256 m_scale = _mm256_set1_ps(scale);

  if (reinterpret_cast<uintptr_t>(src) & 0x1F ||
      reinterpret_cast<uintptr_t>(dest) & 0x1F) {
    // At least one pointer is not 32-byte aligned.
    for (int i = 0; i < last_index; i += 8) {
      _mm256_storeu_ps(
          dest + i,
          _mm256_add_ps(_mm256_loadu_ps(dest + i),
                        _mm256_mul_ps(_mm256_loadu_ps(src + i), m_scale)));
    }
  } else {
    for (int i = 0; i < last_index; i += 8) {
      _mm256_store_ps(
          dest + i,
          _mm256_add_ps(_mm256_load_ps(dest + i),
                        _mm256_mul_ps(_mm256_load_ps(src + i), m_scale)));
    }
  }

  // Handle any remaining values that wouldn't fit in an AVX pass.
  for (int i = last_index; i < len; ++i)
    dest[i] += src[i] * scale;
}
| 105 |
| 106 } // namespace vector_math |
| 107 } // namespace media |
| OLD | NEW |