Index: media/base/vector_math_avx.cc
diff --git a/media/base/vector_math_avx.cc b/media/base/vector_math_avx.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4e18dee23cde1ddecf762a7c3c27e787a1f59697
--- /dev/null
+++ b/media/base/vector_math_avx.cc
@@ -0,0 +1,120 @@
+// Copyright 2016 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <immintrin.h>
+#include <stdint.h>
+
+#include "media/base/vector_math.h"
+#include "media/base/vector_math_testing.h"
+
+namespace media {
+namespace vector_math {
+
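+// Sums the eight packed floats in |v| into a single scalar: the upper and
+// lower 128-bit lanes are added together, then a movehdup/movehl shuffle
+// cascade folds the four remaining partial sums into lane zero.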
+inline float HorizontalSum(__m256 v) {
+  __m128 sum =
+      _mm_add_ps(_mm256_castps256_ps128(v), _mm256_extractf128_ps(v, 1));
+  __m128 shuf = _mm_movehdup_ps(sum);
+  sum = _mm_add_ps(sum, shuf);
+  shuf = _mm_movehl_ps(shuf, sum);
+  return _mm_cvtss_f32(_mm_add_ss(sum, shuf));
+}
+
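+// Convolves |src| with both |k1| and |k2| in a single pass and linearly
+// interpolates the two results by |kernel_interpolation_factor|. The kernels
+// are always read with aligned loads, so they must be 32-byte aligned.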
+float Convolve_AVX(const float* src,
+                   const float* k1,
+                   const float* k2,
+                   double kernel_interpolation_factor) {
+  __m256 m_input;
+  __m256 m_sums1 = _mm256_setzero_ps();
+  __m256 m_sums2 = _mm256_setzero_ps();
+
+  // Based on |src| alignment, we need to use loadu or load. Unrolling
+  // these loops hurts performance in local testing.
+  if (reinterpret_cast<uintptr_t>(src) & 0x1F) {
+    for (int i = 0; i < kKernelSize; i += 8) {
+      m_input = _mm256_loadu_ps(src + i);
+      m_sums1 = _mm256_add_ps(m_sums1,
+                              _mm256_mul_ps(m_input, _mm256_load_ps(k1 + i)));
+      m_sums2 = _mm256_add_ps(m_sums2,
+                              _mm256_mul_ps(m_input, _mm256_load_ps(k2 + i)));
+    }
+  } else {
+    for (int i = 0; i < kKernelSize; i += 8) {
+      m_input = _mm256_load_ps(src + i);
+      m_sums1 = _mm256_add_ps(m_sums1,
+                              _mm256_mul_ps(m_input, _mm256_load_ps(k1 + i)));
+      m_sums2 = _mm256_add_ps(m_sums2,
+                              _mm256_mul_ps(m_input, _mm256_load_ps(k2 + i)));
+    }
+  }
+
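+  // result = (1 - kernel_interpolation_factor) * sum(src * k1) +
+  //          kernel_interpolation_factor * sum(src * k2)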
+  // Linearly interpolate the two "convolutions".
+  m_sums1 = _mm256_mul_ps(
+      m_sums1,
+      _mm256_set1_ps(static_cast<float>(1.0 - kernel_interpolation_factor)));
+  m_sums2 = _mm256_mul_ps(
+      m_sums2, _mm256_set1_ps(static_cast<float>(kernel_interpolation_factor)));
+
+  // Sum components together.
+  return HorizontalSum(_mm256_add_ps(m_sums1, m_sums2));
+}
+
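+// Returns the dot product of |a| and |b| over |len| elements, falling back
+// to unaligned loads when either pointer is not 32-byte aligned.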
+float DotProduct_AVX(const float* a, const float* b, int len) {
+  const int rem = len % 8;
+  const int last_index = len - rem;
+
+  // First sum all components.
+  __m256 m_sum = _mm256_setzero_ps();
+  if (reinterpret_cast<uintptr_t>(a) & 0x1F ||
+      reinterpret_cast<uintptr_t>(b) & 0x1F) {
+    for (int i = 0; i < last_index; i += 8) {
+      m_sum = _mm256_add_ps(
+          m_sum, _mm256_mul_ps(_mm256_loadu_ps(a + i), _mm256_loadu_ps(b + i)));
+    }
+  } else {
+    for (int i = 0; i < last_index; i += 8) {
+      m_sum = _mm256_add_ps(
+          m_sum, _mm256_mul_ps(_mm256_load_ps(a + i), _mm256_load_ps(b + i)));
+    }
+  }
+
+  float sum = HorizontalSum(m_sum);
+
+  // Handle any remaining values that wouldn't fit in an AVX pass.
+  for (int i = last_index; i < len; ++i)
+    sum += a[i] * b[i];
+
+  return sum;
+}
+
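+// Multiply-accumulates |src| into |dest|: dest[i] += src[i] * scale. There
+// is no unaligned fallback here, so both |src| and |dest| must be 32-byte
+// aligned.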
+void FMAC_AVX(const float* src, float scale, int len, float* dest) {
+  const int rem = len % 8;
+  const int last_index = len - rem;
+  const __m256 m_scale = _mm256_set1_ps(scale);
+  for (int i = 0; i < last_index; i += 8) {
+    _mm256_store_ps(
+        dest + i,
+        _mm256_add_ps(_mm256_load_ps(dest + i),
+                      _mm256_mul_ps(_mm256_load_ps(src + i), m_scale)));
+  }
+
+  // Handle any remaining values that wouldn't fit in an AVX pass.
+  for (int i = last_index; i < len; ++i)
+    dest[i] += src[i] * scale;
+}
+
+}  // namespace vector_math
+}  // namespace media