Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1193)

Unified Diff: media/base/vector_math_avx.cc

Issue 2556993002: Experiment with AVX optimizations for FMAC, FMUL operations.
Patch Set: Created 4 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « media/base/vector_math.cc ('k') | media/base/vector_math_perftest.cc » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: media/base/vector_math_avx.cc
diff --git a/media/base/vector_math_avx.cc b/media/base/vector_math_avx.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4e18dee23cde1ddecf762a7c3c27e787a1f59697
--- /dev/null
+++ b/media/base/vector_math_avx.cc
@@ -0,0 +1,107 @@
+// Copyright 2016 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <immintrin.h>
+#include <stdint.h>
+
+#include "media/base/vector_math.h"
+#include "media/base/vector_math_testing.h"
+
+namespace media {
+namespace vector_math {
+
+// Adds the eight float lanes of |v| together and returns the scalar total.
+inline float HorizontalSum(__m256 v) {
+  // Fold the upper 128-bit half onto the lower half: eight lanes become four.
+  const __m128 upper_half = _mm256_extractf128_ps(v, 1);
+  const __m128 quad = _mm_add_ps(_mm256_castps256_ps128(v), upper_half);
+
+  // Copy the odd lanes down onto the even ones so that, after the add, lane 0
+  // holds q0 + q1 and lane 2 holds q2 + q3.
+  const __m128 odd_lanes = _mm_movehdup_ps(quad);
+  const __m128 pairs = _mm_add_ps(quad, odd_lanes);
+
+  // Bring the upper pair sum (lane 2) down to lane 0 and add it to the lower
+  // pair sum; only lane 0 of the result is read.
+  const __m128 upper_pair = _mm_movehl_ps(odd_lanes, pairs);
+  return _mm_cvtss_f32(_mm_add_ss(pairs, upper_pair));
+}
+
+// Computes the dot product of |src| with each of the kernels |k1| and |k2|
+// over kKernelSize floats and returns their linear blend:
+//   (1 - kernel_interpolation_factor) * dot(src, k1) +
+//   kernel_interpolation_factor * dot(src, k2)
+// |k1| and |k2| are always read with aligned loads, so both must be 32-byte
+// aligned; |src| may have any alignment. Data is consumed eight floats at a
+// time and there is no scalar tail loop.
+float Convolve_AVX(const float* src,
+                   const float* k1,
+                   const float* k2,
+                   double kernel_interpolation_factor) {
+  __m256 acc_k1 = _mm256_setzero_ps();
+  __m256 acc_k2 = _mm256_setzero_ps();
+
+  // Choose load vs. loadu once, based on |src| alignment, rather than per
+  // iteration. The loops are deliberately left rolled: the original author
+  // noted that unrolling them hurt performance in local testing.
+  const bool src_is_aligned = (reinterpret_cast<uintptr_t>(src) & 0x1F) == 0;
+  if (src_is_aligned) {
+    for (int offset = 0; offset < kKernelSize; offset += 8) {
+      const __m256 samples = _mm256_load_ps(src + offset);
+      acc_k1 = _mm256_add_ps(
+          acc_k1, _mm256_mul_ps(samples, _mm256_load_ps(k1 + offset)));
+      acc_k2 = _mm256_add_ps(
+          acc_k2, _mm256_mul_ps(samples, _mm256_load_ps(k2 + offset)));
+    }
+  } else {
+    for (int offset = 0; offset < kKernelSize; offset += 8) {
+      const __m256 samples = _mm256_loadu_ps(src + offset);
+      acc_k1 = _mm256_add_ps(
+          acc_k1, _mm256_mul_ps(samples, _mm256_load_ps(k1 + offset)));
+      acc_k2 = _mm256_add_ps(
+          acc_k2, _mm256_mul_ps(samples, _mm256_load_ps(k2 + offset)));
+    }
+  }
+
+  // Weight each lane-wise accumulator by its interpolation coefficient, then
+  // collapse the blended lanes into a single float.
+  const __m256 weight_k1 =
+      _mm256_set1_ps(static_cast<float>(1.0 - kernel_interpolation_factor));
+  const __m256 weight_k2 =
+      _mm256_set1_ps(static_cast<float>(kernel_interpolation_factor));
+  const __m256 blended = _mm256_add_ps(_mm256_mul_ps(acc_k1, weight_k1),
+                                       _mm256_mul_ps(acc_k2, weight_k2));
+  return HorizontalSum(blended);
+}
+
+// Returns the dot product of the first |len| floats of |a| and |b|.
+// Full groups of eight are accumulated lane-wise with AVX and reduced with
+// HorizontalSum(); the remaining (len % 8) products are then added one at a
+// time. Aligned loads are used only when both |a| and |b| are 32-byte aligned.
+float DotProduct_AVX(const float* a, const float* b, int len) {
+  // One past the last index covered by a full eight-wide pass.
+  const int vector_end = len - (len % 8);
+
+  // A low bit set in either address means at least one input is unaligned.
+  const uintptr_t address_bits =
+      reinterpret_cast<uintptr_t>(a) | reinterpret_cast<uintptr_t>(b);
+  const bool both_aligned = (address_bits & 0x1F) == 0;
+
+  __m256 lane_sums = _mm256_setzero_ps();
+  if (both_aligned) {
+    for (int i = 0; i < vector_end; i += 8) {
+      const __m256 products =
+          _mm256_mul_ps(_mm256_load_ps(a + i), _mm256_load_ps(b + i));
+      lane_sums = _mm256_add_ps(lane_sums, products);
+    }
+  } else {
+    for (int i = 0; i < vector_end; i += 8) {
+      const __m256 products =
+          _mm256_mul_ps(_mm256_loadu_ps(a + i), _mm256_loadu_ps(b + i));
+      lane_sums = _mm256_add_ps(lane_sums, products);
+    }
+  }
+
+  float total = HorizontalSum(lane_sums);
+
+  // Scalar tail for the elements that did not fill an eight-wide pass.
+  for (int i = vector_end; i < len; ++i)
+    total += a[i] * b[i];
+
+  return total;
+}
+
+// Multiply-accumulate: dest[i] += src[i] * scale for every i in [0, len).
+// Full groups of eight floats go through AVX; the remaining (len % 8)
+// elements are handled by a scalar loop. Aligned loads/stores are used only
+// when both |src| and |dest| are 32-byte aligned; otherwise the unaligned
+// variants are used, so callers may pass buffers of any alignment.
+void FMAC_AVX(const float* src, float scale, int len, float* dest) {
+  const int rem = len % 8;
+  const int last_index = len - rem;
+  const __m256 m_scale = _mm256_set1_ps(scale);
+
+  // _mm256_load_ps/_mm256_store_ps fault on addresses that are not 32-byte
+  // aligned, so pick the access flavour once up front, the same way
+  // Convolve_AVX() and DotProduct_AVX() choose between load and loadu.
+  if (reinterpret_cast<uintptr_t>(src) & 0x1F ||
+      reinterpret_cast<uintptr_t>(dest) & 0x1F) {
+    for (int i = 0; i < last_index; i += 8) {
+      _mm256_storeu_ps(
+          dest + i,
+          _mm256_add_ps(_mm256_loadu_ps(dest + i),
+                        _mm256_mul_ps(_mm256_loadu_ps(src + i), m_scale)));
+    }
+  } else {
+    for (int i = 0; i < last_index; i += 8) {
+      _mm256_store_ps(
+          dest + i,
+          _mm256_add_ps(_mm256_load_ps(dest + i),
+                        _mm256_mul_ps(_mm256_load_ps(src + i), m_scale)));
+    }
+  }
+
+  // Handle any remaining values that wouldn't fit in an AVX pass.
+  for (int i = last_index; i < len; ++i)
+    dest[i] += src[i] * scale;
+}
+
+} // namespace vector_math
+} // namespace media
« no previous file with comments | « media/base/vector_math.cc ('k') | media/base/vector_math_perftest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698