Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(260)

Side by Side Diff: media/base/vector_math_avx.cc

Issue 2556993002: Experiment with AVX optimizations for FMAC, FMUL operations.
Patch Set: Created 4 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « media/base/vector_math.cc ('k') | media/base/vector_math_perftest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright 2016 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include <immintrin.h>
6 #include <stdint.h>
7
8 #include "media/base/vector_math.h"
9 #include "media/base/vector_math_testing.h"
10
11 namespace media {
12 namespace vector_math {
13
// Adds together all eight float lanes of |v| and returns the scalar total.
inline float HorizontalSum(__m256 v) {
  // Fold the upper 128-bit half onto the lower half, leaving four partial
  // sums.
  const __m128 low_half = _mm256_castps256_ps128(v);
  const __m128 high_half = _mm256_extractf128_ps(v, 1);
  __m128 acc = _mm_add_ps(low_half, high_half);

  // Duplicate the odd lanes so that lanes 0 and 2 each absorb their
  // right-hand neighbour.
  const __m128 odd_lanes = _mm_movehdup_ps(acc);
  acc = _mm_add_ps(acc, odd_lanes);

  // Move the pair sum held in lane 2 down to lane 0 and finish with a single
  // scalar add.
  const __m128 upper_pair = _mm_movehl_ps(odd_lanes, acc);
  return _mm_cvtss_f32(_mm_add_ss(acc, upper_pair));
}
22
23 float Convolve_AVX(const float* src,
24 const float* k1,
25 const float* k2,
26 double kernel_interpolation_factor) {
27 __m256 m_input;
28 __m256 m_sums1 = _mm256_setzero_ps();
29 __m256 m_sums2 = _mm256_setzero_ps();
30
31 // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling
32 // these loops hurt performance in local testing.
33 if (reinterpret_cast<uintptr_t>(src) & 0x1F) {
34 for (int i = 0; i < kKernelSize; i += 8) {
35 m_input = _mm256_loadu_ps(src + i);
36 m_sums1 = _mm256_add_ps(m_sums1,
37 _mm256_mul_ps(m_input, _mm256_load_ps(k1 + i)));
38 m_sums2 = _mm256_add_ps(m_sums2,
39 _mm256_mul_ps(m_input, _mm256_load_ps(k2 + i)));
40 }
41 } else {
42 for (int i = 0; i < kKernelSize; i += 8) {
43 m_input = _mm256_load_ps(src + i);
44 m_sums1 = _mm256_add_ps(m_sums1,
45 _mm256_mul_ps(m_input, _mm256_load_ps(k1 + i)));
46 m_sums2 = _mm256_add_ps(m_sums2,
47 _mm256_mul_ps(m_input, _mm256_load_ps(k2 + i)));
48 }
49 }
50
51 // Linearly interpolate the two "convolutions".
52 m_sums1 = _mm256_mul_ps(
53 m_sums1,
54 _mm256_set1_ps(static_cast<float>(1.0 - kernel_interpolation_factor)));
55 m_sums2 = _mm256_mul_ps(
56 m_sums2, _mm256_set1_ps(static_cast<float>(kernel_interpolation_factor)));
57
58 // Sum components together.
59 return HorizontalSum(_mm256_add_ps(m_sums1, m_sums2));
60 }
61
62 float DotProduct_AVX(const float* a, const float* b, int len) {
63 const int rem = len % 8;
64 const int last_index = len - rem;
65
66 // First sum all components.
67 __m256 m_sum = _mm256_setzero_ps();
68 if (reinterpret_cast<uintptr_t>(a) & 0x1F ||
69 reinterpret_cast<uintptr_t>(b) & 0x1F) {
70 for (int i = 0; i < last_index; i += 8) {
71 m_sum = _mm256_add_ps(
72 m_sum, _mm256_mul_ps(_mm256_loadu_ps(a + i), _mm256_loadu_ps(b + i)));
73 }
74 } else {
75 for (int i = 0; i < last_index; i += 8) {
76 m_sum = _mm256_add_ps(
77 m_sum, _mm256_mul_ps(_mm256_load_ps(a + i), _mm256_load_ps(b + i)));
78 }
79 }
80
81 float sum = HorizontalSum(m_sum);
82
83 // Handle any remaining values that wouldn't fit in an AVX pass.
84 for (int i = last_index; i < len; ++i)
85 sum += a[i] * b[i];
86
87 return sum;
88 }
89
// Multiply-accumulate: dest[i] += src[i] * scale for every i in [0, len).
//
// Groups of eight floats are processed with AVX and any remainder with a
// scalar loop. The aligned load/store instructions fault on addresses that are
// not 32-byte aligned, so they are only used when both |src| and |dest| are
// suitably aligned; otherwise the unaligned variants are used. This matches
// the alignment handling in the other AVX routines in this file.
void FMAC_AVX(const float* src, float scale, int len, float* dest) {
  const int rem = len % 8;
  const int last_index = len - rem;
  const __m256 m_scale = _mm256_set1_ps(scale);

  const bool needs_unaligned_access =
      (reinterpret_cast<uintptr_t>(src) & 0x1F) ||
      (reinterpret_cast<uintptr_t>(dest) & 0x1F);
  if (needs_unaligned_access) {
    for (int i = 0; i < last_index; i += 8) {
      _mm256_storeu_ps(
          dest + i,
          _mm256_add_ps(_mm256_loadu_ps(dest + i),
                        _mm256_mul_ps(_mm256_loadu_ps(src + i), m_scale)));
    }
  } else {
    for (int i = 0; i < last_index; i += 8) {
      _mm256_store_ps(
          dest + i,
          _mm256_add_ps(_mm256_load_ps(dest + i),
                        _mm256_mul_ps(_mm256_load_ps(src + i), m_scale)));
    }
  }

  // Handle any remaining values that wouldn't fit in an AVX pass.
  for (int i = last_index; i < len; ++i)
    dest[i] += src[i] * scale;
}
105
106 } // namespace vector_math
107 } // namespace media
OLDNEW
« no previous file with comments | « media/base/vector_math.cc ('k') | media/base/vector_math_perftest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698