Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1)

Side by Side Diff: src/opts/SkBitmapFilter_opts.h

Issue 2526733002: Add AVX2 version of ConvolveVertically (Closed)
Patch Set: Created 4 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/core/SkConvolver.cpp ('k') | src/opts/SkOpts_hsw.cpp » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2016 Google Inc. 2 * Copyright 2016 Google Inc.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #ifndef SkBitmapFilter_opts_DEFINED 8 #ifndef SkBitmapFilter_opts_DEFINED
9 #define SkBitmapFilter_opts_DEFINED 9 #define SkBitmapFilter_opts_DEFINED
10 10
11 #include "SkConvolver.h" 11 #include "SkConvolver.h"
12 12
13 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 13 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
14 #include <immintrin.h>
mtklein_C 2016/12/05 18:42:46 I think we can just use immintrin.h for everything
xiangze.zhang 2016/12/07 11:17:01 Done.
15 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
14 #include <emmintrin.h> 16 #include <emmintrin.h>
15 #elif defined(SK_ARM_HAS_NEON) 17 #elif defined(SK_ARM_HAS_NEON)
16 #include <arm_neon.h> 18 #include <arm_neon.h>
17 #endif 19 #endif
18 20
19 namespace SK_OPTS_NS { 21 namespace SK_OPTS_NS {
20 22
21 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 23 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
24
25 static SK_ALWAYS_INLINE
26 void ComputeCoefficientRow(SkConvolutionFilter1D::ConvolutionFixed filterVal ue, const unsigned char* sourceDataRows,
mtklein_C 2016/12/05 18:42:45 Typically we name static functions with lower_case
xiangze.zhang 2016/12/07 11:17:01 Done.
27 __m256i& Y01, __m256i& Y23, __m256i& Y45, __m256i & Y67) {
28 __m256i coefs = _mm256_set1_epi16(filterValue);
29 __m256i pixels = _mm256_lddqu_si256(reinterpret_cast<const __m256i *>(so urceDataRows));
mtklein_C 2016/12/05 18:42:45 I have always shied away from lddqu, instead just
xiangze.zhang 2016/12/07 11:17:01 Done.
30 __m256i zero = _mm256_setzero_si256();
31
32 // [16] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
33 __m256i Yt1 = _mm256_unpacklo_epi8(pixels, zero);
mtklein_C 2016/12/05 18:42:45 Does Yt1 mean ymm, temporary #1? We usually find
xiangze.zhang 2016/12/07 11:17:01 Done.
34 // [32] cja3 cjb3 cjg3 cjr3 cja2 cjb2 cjg2 cjr2
35 __m256i Yt2 = _mm256_mulhi_epi16(Yt1, coefs);
36 // [32] cja1 cjb1 cjg1 cjr1 cja0 cjb0 cjg0 cjr0
37 Yt1 = _mm256_mullo_epi16(Yt1, coefs);
38
39 // pixel 0 and 1
40 __m256i Yt3 = _mm256_unpacklo_epi16(Yt1, Yt2);
41 Y01 = _mm256_add_epi32(Y01, Yt3);
42 // pixel 2 and 3
43 Yt3 = _mm256_unpackhi_epi16(Yt1, Yt2);
44 Y23 = _mm256_add_epi32(Y23, Yt3);
45
46 // [16] a7 b7 g7 r7 a6 b6 g6 r6 a5 b5 g5 r5 a4 b4 g4 r4
47 Yt1 = _mm256_unpackhi_epi8(pixels, zero);
48 // [32] cja7 cjb7 cjg7 cjr7 cja6 cjb6 cjg6 cjr6
49 Yt2 = _mm256_mulhi_epi16(Yt1, coefs);
50 // [32] cja5 cjb5 cjg5 cjr5 cja4 cjb4 cjg4 cjr4
51 Yt1 = _mm256_mullo_epi16(Yt1, coefs);
52
53 // pixel 4 and 5
54 Yt3 = _mm256_unpacklo_epi16(Yt1, Yt2);
55 Y45 = _mm256_add_epi32(Y45, Yt3);
56 // pixel 6 and 7
57 Yt3 = _mm256_unpackhi_epi16(Yt1, Yt2);
58 Y67 = _mm256_add_epi32(Y67, Yt3);
59 }
60
61 template<bool hasAlpha>
62 void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filte rValues,
63 int filterLength,
64 unsigned char* const * sourceDataRows,
65 int pixelWidth,
66 unsigned char* outRow) {
67
68 int outX, filterY;
69 int width = pixelWidth & ~7;
70 int length = filterLength & ~3;
71
72 __m256i Yt0, Yt1, Yt2;
73
74 // Output eight pixels per iteration (32 bytes).
75 for (outX = 0; outX < width; outX += 8) {
76 // Accumulated result for each pixel. 32 bits per RGBA channel.
77 __m256i Y01 = _mm256_setzero_si256();
78 __m256i Y23 = _mm256_setzero_si256();
79 __m256i Y45 = _mm256_setzero_si256();
80 __m256i Y67 = _mm256_setzero_si256();
81
82 // Convolve with 4 filter coefficient per iteration.
83 for (int filterY = 0; filterY < length; filterY += 4) {
84 ComputeCoefficientRow(filterValues[filterY], sourceDataRows[filt erY] + outX * 4, Y01, Y23, Y45, Y67);
mtklein_C 2016/12/05 18:42:46 This might line up neater with a couple "+ 0".
xiangze.zhang 2016/12/07 11:17:01 Done.
85 ComputeCoefficientRow(filterValues[filterY + 1], sourceDataRows[ filterY + 1] + outX * 4, Y01, Y23, Y45, Y67);
86 ComputeCoefficientRow(filterValues[filterY + 2], sourceDataRows[ filterY + 2] + outX * 4, Y01, Y23, Y45, Y67);
87 ComputeCoefficientRow(filterValues[filterY + 3], sourceDataRows[ filterY + 3] + outX * 4, Y01, Y23, Y45, Y67);
88 }
89 for (filterY = length; filterY < filterLength; filterY++) {
90 ComputeCoefficientRow(filterValues[filterY], sourceDataRows[filt erY] + outX * 4, Y01, Y23, Y45, Y67);
91 }
92
93 // Shift right for fixed point implementation.
94 Y01 = _mm256_srai_epi32(Y01, SkConvolutionFilter1D::kShiftBits);
95 Y23 = _mm256_srai_epi32(Y23, SkConvolutionFilter1D::kShiftBits);
96 Y45 = _mm256_srai_epi32(Y45, SkConvolutionFilter1D::kShiftBits);
97 Y67 = _mm256_srai_epi32(Y67, SkConvolutionFilter1D::kShiftBits);
98
99 // Packing 32 bits |accum| to 16 bits per channel (signed saturation ).
100 // [16] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
101 Yt0 = _mm256_packs_epi32(Y01, Y23);
102
103 // Packing 32 bits |accum| to 16 bits per channel (signed saturation ).
104 // [16] a7 b7 g7 r7 a6 b6 g6 r6 a5 b5 g5 r5 a4 b4 g4 r4
105 Yt1 = _mm256_packs_epi32(Y45, Y67);
106
107 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturatio n).
108 // [8] a7 b7 g7 r7 a6 b6 g6 r6 a5 b5 g5 r5 a4 b4 g4 r4 a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
109 Yt0 = _mm256_packus_epi16(Yt0, Yt1);
110
111 if (hasAlpha) {
112 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
113 Yt1 = _mm256_srli_epi32(Yt0, 8);
114 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
115 Yt2 = _mm256_max_epu8(Yt1, Yt0); // Max of r and g.
116 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
117 Yt1 = _mm256_srli_epi32(Yt0, 16);
118 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
119 Yt2 = _mm256_max_epu8(Yt1, Yt2); // Max of r and g and b.
120 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
121 Yt2 = _mm256_slli_epi32(Yt2, 24);
122
123 // Make sure the value of alpha channel is always larger than ma ximum
mtklein_C 2016/12/05 18:42:46 This comment might be better moved up just under i
xiangze.zhang 2016/12/07 11:17:01 Done.
124 // value of color channels.
125 Yt0 = _mm256_max_epu8(Yt2, Yt0);
126 } else {
127 __m256i mask = _mm256_set1_epi32(0xff000000);
128 Yt0 = _mm256_or_si256(Yt0, mask);
129 }
130
131 // Store the convolution result (32 bytes) and advance the pixel poi nters.
132 _mm256_storeu_si256(reinterpret_cast<__m256i *>(outRow), Yt0);
133 outRow += 32;
134 }
135
136 if (pixelWidth & 7) {
137 __m256i Y01 = _mm256_setzero_si256();
138 __m256i Y23 = _mm256_setzero_si256();
139 __m256i Y45 = _mm256_setzero_si256();
140 __m256i Y67 = _mm256_setzero_si256();
141
142 for (int filterY = 0; filterY < filterLength; filterY++) {
143 ComputeCoefficientRow(filterValues[filterY], sourceDataRows[filt erY] + outX * 4, Y01, Y23, Y45, Y67);
144 }
145
146 Y01 = _mm256_srai_epi32(Y01, SkConvolutionFilter1D::kShiftBits);
147 Y23 = _mm256_srai_epi32(Y23, SkConvolutionFilter1D::kShiftBits);
148 Y45 = _mm256_srai_epi32(Y45, SkConvolutionFilter1D::kShiftBits);
149 Y67 = _mm256_srai_epi32(Y67, SkConvolutionFilter1D::kShiftBits);
150
151 Yt0 = _mm256_packs_epi32(Y01, Y23);
152 Yt1 = _mm256_packs_epi32(Y45, Y67);
153 Yt0 = _mm256_packus_epi16(Yt0, Yt1);
154
155 if (hasAlpha) {
156 Yt1 = _mm256_srli_epi32(Yt0, 8);
157 Yt2 = _mm256_max_epu8(Yt1, Yt0); // Max of r and g.
mtklein_C 2016/12/05 18:42:45 This is the sort of code that using more variables
xiangze.zhang 2016/12/07 11:17:01 Done.
158 Yt1 = _mm256_srli_epi32(Yt0, 16);
159 Yt2 = _mm256_max_epu8(Yt1, Yt2); // Max of r and g and b.
160 Yt2 = _mm256_slli_epi32(Yt2, 24);
161 Yt0 = _mm256_max_epu8(Yt2, Yt0);
162 } else {
163 __m256i mask = _mm256_set1_epi32(0xff000000);
164 Yt0 = _mm256_or_si256(Yt0, mask);
165 }
166
167 for (int i = width; i < pixelWidth; i++) {
mtklein_C 2016/12/05 18:42:46 I'd have expected a call to _mm256_maskstore_epi32
xiangze.zhang 2016/12/07 11:17:01 I tried something like this: __m256i mask = _m
168 *(reinterpret_cast<int*>(outRow)) = _mm_cvtsi128_si32(_mm256_cas tsi256_si128(Yt0));
169 __m256i rotate = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
170 Yt0 = _mm256_permutevar8x32_epi32(Yt0, rotate);
171 outRow += 4;
172 }
173 }
174 }
175
176 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
22 177
23 static SK_ALWAYS_INLINE void AccumRemainder(const unsigned char* pixelsLeft, 178 static SK_ALWAYS_INLINE void AccumRemainder(const unsigned char* pixelsLeft,
24 const SkConvolutionFilter1D::ConvolutionFixed* filterValues, __m128i & accum, int r) { 179 const SkConvolutionFilter1D::ConvolutionFixed* filterValues, __m128i & accum, int r) {
25 int remainder[4] = {0}; 180 int remainder[4] = {0};
26 for (int i = 0; i < r; i++) { 181 for (int i = 0; i < r; i++) {
27 SkConvolutionFilter1D::ConvolutionFixed coeff = filterValues[i]; 182 SkConvolutionFilter1D::ConvolutionFixed coeff = filterValues[i];
28 remainder[0] += coeff * pixelsLeft[i * 4 + 0]; 183 remainder[0] += coeff * pixelsLeft[i * 4 + 0];
29 remainder[1] += coeff * pixelsLeft[i * 4 + 1]; 184 remainder[1] += coeff * pixelsLeft[i * 4 + 1];
30 remainder[2] += coeff * pixelsLeft[i * 4 + 2]; 185 remainder[2] += coeff * pixelsLeft[i * 4 + 2];
31 remainder[3] += coeff * pixelsLeft[i * 4 + 3]; 186 remainder[3] += coeff * pixelsLeft[i * 4 + 3];
(...skipping 899 matching lines...) Expand 10 before | Expand all | Expand 10 after
931 pixelWidth, outRow); 1086 pixelWidth, outRow);
932 } else { 1087 } else {
933 ConvolveVertically<false>(filterValues, filterLength, sourceDataRows , 1088 ConvolveVertically<false>(filterValues, filterLength, sourceDataRows ,
934 pixelWidth, outRow); 1089 pixelWidth, outRow);
935 } 1090 }
936 } 1091 }
937 1092
938 } // namespace SK_OPTS_NS 1093 } // namespace SK_OPTS_NS
939 1094
940 #endif//SkBitmapFilter_opts_DEFINED 1095 #endif//SkBitmapFilter_opts_DEFINED
OLDNEW
« no previous file with comments | « src/core/SkConvolver.cpp ('k') | src/opts/SkOpts_hsw.cpp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698