src/opts/SkBitmapFilter_opts.h - Issue 2526733002: Add AVX2 version of ConvolveVertically

Side by Side Diff: src/opts/SkBitmapFilter_opts.h

Issue 2526733002: Add AVX2 version of ConvolveVertically (Closed)

Patch Set: Created 4 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 /*	1 /*

2 * Copyright 2016 Google Inc.	2 * Copyright 2016 Google Inc.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #ifndef SkBitmapFilter_opts_DEFINED	8 #ifndef SkBitmapFilter_opts_DEFINED

9 #define SkBitmapFilter_opts_DEFINED	9 #define SkBitmapFilter_opts_DEFINED

10	10

11 #include "SkConvolver.h"	11 #include "SkConvolver.h"

12	12

13 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2	13 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2

	14 #include <immintrin.h>
	mtklein_C 2016/12/05 18:42:46 I think we can just use immintrin.h for everything I think we can just use immintrin.h for everything >= SSE2. xiangze.zhang 2016/12/07 11:17:01 Done. Show quoted text On 2016/12/05 18:42:46, mtklein_C wrote: > I think we can just use immintrin.h for everything >= SSE2. Done.
	15 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2

14 #include <emmintrin.h>	16 #include <emmintrin.h>

15 #elif defined(SK_ARM_HAS_NEON)	17 #elif defined(SK_ARM_HAS_NEON)

16 #include <arm_neon.h>	18 #include <arm_neon.h>

17 #endif	19 #endif

18	20

19 namespace SK_OPTS_NS {	21 namespace SK_OPTS_NS {

20	22

21 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2	23 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2

	24

	25 static SK_ALWAYS_INLINE

	26 void ComputeCoefficientRow(SkConvolutionFilter1D::ConvolutionFixed filterVal ue, const unsigned char* sourceDataRows,
	mtklein_C 2016/12/05 18:42:45 Typically we name static functions with lower_case Typically we name static functions with lower_case_and_underscores(), and pass mutated values as pointers instead of references so that it's clear they're being mutated at the call site. In my experience this doesn't have any effect on code generation, especially with SK_ALWAYS_INLINE. xiangze.zhang 2016/12/07 11:17:01 Done. Show quoted text On 2016/12/05 18:42:45, mtklein_C wrote: > Typically we name static functions with lower_case_and_underscores(), and pass > mutated values as pointers instead of references so that it's clear they're > being mutated at the call site. In my experience this doesn't have any effect > on code generation, especially with SK_ALWAYS_INLINE. Done.
	27 __m256i& Y01, __m256i& Y23, __m256i& Y45, __m256i & Y67) {

	28 __m256i coefs = _mm256_set1_epi16(filterValue);

	29 __m256i pixels = _mm256_lddqu_si256(reinterpret_cast<const __m256i *>(so urceDataRows));
	mtklein_C 2016/12/05 18:42:45 I have always shied away from lddqu, instead just I have always shied away from lddqu, instead just using loadu. Is it important here? Not sure I see any difference when benching. xiangze.zhang 2016/12/07 11:17:01 Done. Show quoted text On 2016/12/05 18:42:45, mtklein_C wrote: > I have always shied away from lddqu, instead just using loadu. Is it important > here? Not sure I see any difference when benching. Done.
	30 __m256i zero = _mm256_setzero_si256();

	31

	32 // [16] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0

	33 __m256i Yt1 = _mm256_unpacklo_epi8(pixels, zero);
	mtklein_C 2016/12/05 18:42:45 Does Yt1 mean ymm, temporary #1? We usually find Does Yt1 mean ymm, temporary #1? We usually find this sort of code a lot easier to read and maintain if you give each value some sort of meaningful name (like pixels_03_16bit) then never change it, letting the compiler handle register allocation. It can also help readability if you can find ways to vertically align analogous parts of register or operation names. E.g. __m256i pixels_03_16bit = _mm256_unpacklo_epi8(pixels, zero); __m256i scaled_03_lo = _mm256_mullo_epi16(pixels_03_16bit, coefs), scaled_03_hi = _mm256_mulhi_epi16(pixels_03_16bit, coefs); Y01 = _mm256_add_epi32(Y01, _mm256_unpacklo_epi16(scaled_03_lo, scaled_03_hi)); Y23 = _mm256_add_epi32(Y23, _mm256_unpackhi_epi16(scaled_03_lo, scaled_03_hi)); ... xiangze.zhang 2016/12/07 11:17:01 Done. Show quoted text On 2016/12/05 18:42:45, mtklein_C wrote: > Does Yt1 mean ymm, temporary #1? > > We usually find this sort of code a lot easier to read and maintain if you give > each value some sort of meaningful name (like pixels_03_16bit) then never change > it, letting the compiler handle register allocation. It can also help > readability if you can find ways to vertically align analogous parts of register > or operation names. > > E.g. > > __m256i pixels_03_16bit = _mm256_unpacklo_epi8(pixels, zero); > > __m256i scaled_03_lo = _mm256_mullo_epi16(pixels_03_16bit, coefs), > scaled_03_hi = _mm256_mulhi_epi16(pixels_03_16bit, coefs); > > Y01 = _mm256_add_epi32(Y01, _mm256_unpacklo_epi16(scaled_03_lo, scaled_03_hi)); > Y23 = _mm256_add_epi32(Y23, _mm256_unpackhi_epi16(scaled_03_lo, scaled_03_hi)); > ... Done.
	34 // [32] cja3 cjb3 cjg3 cjr3 cja2 cjb2 cjg2 cjr2

	35 __m256i Yt2 = _mm256_mulhi_epi16(Yt1, coefs);

	36 // [32] cja1 cjb1 cjg1 cjr1 cja0 cjb0 cjg0 cjr0

	37 Yt1 = _mm256_mullo_epi16(Yt1, coefs);

	38

	39 // pixel 0 and 1

	40 __m256i Yt3 = _mm256_unpacklo_epi16(Yt1, Yt2);

	41 Y01 = _mm256_add_epi32(Y01, Yt3);

	42 // pixel 2 and 3

	43 Yt3 = _mm256_unpackhi_epi16(Yt1, Yt2);

	44 Y23 = _mm256_add_epi32(Y23, Yt3);

	45

	46 // [16] a7 b7 g7 r7 a6 b6 g6 r6 a5 b5 g5 r5 a4 b4 g4 r4

	47 Yt1 = _mm256_unpackhi_epi8(pixels, zero);

	48 // [32] cja7 cjb7 cjg7 cjr7 cja6 cjb6 cjg6 cjr6

	49 Yt2 = _mm256_mulhi_epi16(Yt1, coefs);

	50 // [32] cja5 cjb5 cjg5 cjr5 cja4 cjb4 cjg4 cjr4

	51 Yt1 = _mm256_mullo_epi16(Yt1, coefs);

	52

	53 // pixel 4 and 5

	54 Yt3 = _mm256_unpacklo_epi16(Yt1, Yt2);

	55 Y45 = _mm256_add_epi32(Y45, Yt3);

	56 // pixel 6 and 7

	57 Yt3 = _mm256_unpackhi_epi16(Yt1, Yt2);

	58 Y67 = _mm256_add_epi32(Y67, Yt3);

	59 }

	60

	61 template<bool hasAlpha>

	62 void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filte rValues,

	63 int filterLength,

	64 unsigned char* const * sourceDataRows,

	65 int pixelWidth,

	66 unsigned char* outRow) {

	67

	68 int outX, filterY;

	69 int width = pixelWidth & ~7;

	70 int length = filterLength & ~3;

	71

	72 __m256i Yt0, Yt1, Yt2;

	73

	74 // Output eight pixels per iteration (32 bytes).

	75 for (outX = 0; outX < width; outX += 8) {

	76 // Accumulated result for each pixel. 32 bits per RGBA channel.

	77 __m256i Y01 = _mm256_setzero_si256();

	78 __m256i Y23 = _mm256_setzero_si256();

	79 __m256i Y45 = _mm256_setzero_si256();

	80 __m256i Y67 = _mm256_setzero_si256();

	81

	82 // Convolve with 4 filter coefficient per iteration.

	83 for (int filterY = 0; filterY < length; filterY += 4) {

	84 ComputeCoefficientRow(filterValues[filterY], sourceDataRows[filt erY] + outX * 4, Y01, Y23, Y45, Y67);
	mtklein_C 2016/12/05 18:42:46 This might line up neater with a couple "+ 0". This might line up neater with a couple "+ 0". xiangze.zhang 2016/12/07 11:17:01 Done. Show quoted text On 2016/12/05 18:42:46, mtklein_C wrote: > This might line up neater with a couple "+ 0". Done.
	85 ComputeCoefficientRow(filterValues[filterY + 1], sourceDataRows[ filterY + 1] + outX * 4, Y01, Y23, Y45, Y67);

	86 ComputeCoefficientRow(filterValues[filterY + 2], sourceDataRows[ filterY + 2] + outX * 4, Y01, Y23, Y45, Y67);

	87 ComputeCoefficientRow(filterValues[filterY + 3], sourceDataRows[ filterY + 3] + outX * 4, Y01, Y23, Y45, Y67);

	88 }

	89 for (filterY = length; filterY < filterLength; filterY++) {

	90 ComputeCoefficientRow(filterValues[filterY], sourceDataRows[filt erY] + outX * 4, Y01, Y23, Y45, Y67);

	91 }

	92

	93 // Shift right for fixed point implementation.

	94 Y01 = _mm256_srai_epi32(Y01, SkConvolutionFilter1D::kShiftBits);

	95 Y23 = _mm256_srai_epi32(Y23, SkConvolutionFilter1D::kShiftBits);

	96 Y45 = _mm256_srai_epi32(Y45, SkConvolutionFilter1D::kShiftBits);

	97 Y67 = _mm256_srai_epi32(Y67, SkConvolutionFilter1D::kShiftBits);

	98

	99 // Packing 32 bits \|accum\| to 16 bits per channel (signed saturation ).

	100 // [16] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0

	101 Yt0 = _mm256_packs_epi32(Y01, Y23);

	102

	103 // Packing 32 bits \|accum\| to 16 bits per channel (signed saturation ).

	104 // [16] a7 b7 g7 r7 a6 b6 g6 r6 a5 b5 g5 r5 a4 b4 g4 r4

	105 Yt1 = _mm256_packs_epi32(Y45, Y67);

	106

	107 // Packing 16 bits \|accum\| to 8 bits per channel (unsigned saturatio n).

	108 // [8] a7 b7 g7 r7 a6 b6 g6 r6 a5 b5 g5 r5 a4 b4 g4 r4 a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0

	109 Yt0 = _mm256_packus_epi16(Yt0, Yt1);

	110

	111 if (hasAlpha) {

	112 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0

	113 Yt1 = _mm256_srli_epi32(Yt0, 8);

	114 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0

	115 Yt2 = _mm256_max_epu8(Yt1, Yt0); // Max of r and g.

	116 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0

	117 Yt1 = _mm256_srli_epi32(Yt0, 16);

	118 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0

	119 Yt2 = _mm256_max_epu8(Yt1, Yt2); // Max of r and g and b.

	120 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00

	121 Yt2 = _mm256_slli_epi32(Yt2, 24);

	122

	123 // Make sure the value of alpha channel is always larger than ma ximum
	mtklein_C 2016/12/05 18:42:46 This comment might be better moved up just under i This comment might be better moved up just under if (hasAlpha) { xiangze.zhang 2016/12/07 11:17:01 Done. Show quoted text On 2016/12/05 18:42:46, mtklein_C wrote: > This comment might be better moved up just under if (hasAlpha) { Done.
	124 // value of color channels.

	125 Yt0 = _mm256_max_epu8(Yt2, Yt0);

	126 } else {

	127 __m256i mask = _mm256_set1_epi32(0xff000000);

	128 Yt0 = _mm256_or_si256(Yt0, mask);

	129 }

	130

	131 // Store the convolution result (32 bytes) and advance the pixel poi nters.

	132 _mm256_storeu_si256(reinterpret_cast<__m256i *>(outRow), Yt0);

	133 outRow += 32;

	134 }

	135

	136 if (pixelWidth & 7) {

	137 __m256i Y01 = _mm256_setzero_si256();

	138 __m256i Y23 = _mm256_setzero_si256();

	139 __m256i Y45 = _mm256_setzero_si256();

	140 __m256i Y67 = _mm256_setzero_si256();

	141

	142 for (int filterY = 0; filterY < filterLength; filterY++) {

	143 ComputeCoefficientRow(filterValues[filterY], sourceDataRows[filt erY] + outX * 4, Y01, Y23, Y45, Y67);

	144 }

	145

	146 Y01 = _mm256_srai_epi32(Y01, SkConvolutionFilter1D::kShiftBits);

	147 Y23 = _mm256_srai_epi32(Y23, SkConvolutionFilter1D::kShiftBits);

	148 Y45 = _mm256_srai_epi32(Y45, SkConvolutionFilter1D::kShiftBits);

	149 Y67 = _mm256_srai_epi32(Y67, SkConvolutionFilter1D::kShiftBits);

	150

	151 Yt0 = _mm256_packs_epi32(Y01, Y23);

	152 Yt1 = _mm256_packs_epi32(Y45, Y67);

	153 Yt0 = _mm256_packus_epi16(Yt0, Yt1);

	154

	155 if (hasAlpha) {

	156 Yt1 = _mm256_srli_epi32(Yt0, 8);

	157 Yt2 = _mm256_max_epu8(Yt1, Yt0); // Max of r and g.
	mtklein_C 2016/12/05 18:42:45 This is the sort of code that using more variables This is the sort of code that using more variables with better names can help make readable. E.g. if (hasAlpha) { // If alpha is less than r,g, or b, set it to their max. auto max_rg = _mm256_max_epu8(pixels, _mm256_srli_epi32(pixels, 8)), max_rgb = _mm256_max_epu8(max_rg, _mm256_srli_epi32(pixels, 16)); pixels = _mm256_max_epu8(pixels, _mm256_slli_epi32(max_rgb, 24)); } else { // Force opaque. pixels = _mm256_or_si256(pixels, _mm256_set1_epi32(0xff000000)); } xiangze.zhang 2016/12/07 11:17:01 Done. Show quoted text On 2016/12/05 18:42:45, mtklein_C wrote: > This is the sort of code that using more variables with better names can help > make readable. E.g. > > if (hasAlpha) { > // If alpha is less than r,g, or b, set it to their max. > auto max_rg = _mm256_max_epu8(pixels, _mm256_srli_epi32(pixels, 8)), > max_rgb = _mm256_max_epu8(max_rg, _mm256_srli_epi32(pixels, 16)); > pixels = _mm256_max_epu8(pixels, _mm256_slli_epi32(max_rgb, 24)); > } else { > // Force opaque. > pixels = _mm256_or_si256(pixels, _mm256_set1_epi32(0xff000000)); > } Done.
	158 Yt1 = _mm256_srli_epi32(Yt0, 16);

	159 Yt2 = _mm256_max_epu8(Yt1, Yt2); // Max of r and g and b.

	160 Yt2 = _mm256_slli_epi32(Yt2, 24);

	161 Yt0 = _mm256_max_epu8(Yt2, Yt0);

	162 } else {

	163 __m256i mask = _mm256_set1_epi32(0xff000000);

	164 Yt0 = _mm256_or_si256(Yt0, mask);

	165 }

	166

	167 for (int i = width; i < pixelWidth; i++) {
	mtklein_C 2016/12/05 18:42:46 I'd have expected a call to _mm256_maskstore_epi32 I'd have expected a call to _mm256_maskstore_epi32()... is this way faster? xiangze.zhang 2016/12/07 11:17:01 I tried something like this: __m256i mask = _m Show quoted text On 2016/12/05 18:42:46, mtklein_C wrote: > I'd have expected a call to _mm256_maskstore_epi32()... is this way faster? I tried something like this: __m256i mask = _mm256_setr_epi32(0xffffffff, 0, 0, 0, 0, 0, 0, 0); for (int i = outX + 1; i < pixelWidth; i++) { __m256i shift = _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6); mask = _mm256_permutevar8x32_epi32(mask, shift); } _mm256_maskstore_epi32(reinterpret_cast<int*>(outRow), mask, accum); And nanobench time becomes a little longer. It is weird that bitmap_scale_filter_64_256 time also change from 306us to 314us, which should have no pixels left to go through this code path.
	168 (reinterpret_cast<int>(outRow)) = _mm_cvtsi128_si32(_mm256_cas tsi256_si128(Yt0));

	169 __m256i rotate = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);

	170 Yt0 = _mm256_permutevar8x32_epi32(Yt0, rotate);

	171 outRow += 4;

	172 }

	173 }

	174 }

	175

	176 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2

22	177

23 static SK_ALWAYS_INLINE void AccumRemainder(const unsigned char* pixelsLeft,	178 static SK_ALWAYS_INLINE void AccumRemainder(const unsigned char* pixelsLeft,

24 const SkConvolutionFilter1D::ConvolutionFixed* filterValues, __m128i & accum, int r) {	179 const SkConvolutionFilter1D::ConvolutionFixed* filterValues, __m128i & accum, int r) {

25 int remainder[4] = {0};	180 int remainder[4] = {0};

26 for (int i = 0; i < r; i++) {	181 for (int i = 0; i < r; i++) {

27 SkConvolutionFilter1D::ConvolutionFixed coeff = filterValues[i];	182 SkConvolutionFilter1D::ConvolutionFixed coeff = filterValues[i];

28 remainder[0] += coeff * pixelsLeft[i * 4 + 0];	183 remainder[0] += coeff * pixelsLeft[i * 4 + 0];

29 remainder[1] += coeff * pixelsLeft[i * 4 + 1];	184 remainder[1] += coeff * pixelsLeft[i * 4 + 1];

30 remainder[2] += coeff * pixelsLeft[i * 4 + 2];	185 remainder[2] += coeff * pixelsLeft[i * 4 + 2];

31 remainder[3] += coeff * pixelsLeft[i * 4 + 3];	186 remainder[3] += coeff * pixelsLeft[i * 4 + 3];

(...skipping 899 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
931 pixelWidth, outRow);	1086 pixelWidth, outRow);

932 } else {	1087 } else {

933 ConvolveVertically<false>(filterValues, filterLength, sourceDataRows ,	1088 ConvolveVertically<false>(filterValues, filterLength, sourceDataRows ,

934 pixelWidth, outRow);	1089 pixelWidth, outRow);

935 }	1090 }

936 }	1091 }

937	1092

938 } // namespace SK_OPTS_NS	1093 } // namespace SK_OPTS_NS

939	1094

940 #endif//SkBitmapFilter_opts_DEFINED	1095 #endif//SkBitmapFilter_opts_DEFINED

OLD	NEW

« no previous file with comments | « src/core/SkConvolver.cpp ('k') | src/opts/SkOpts_hsw.cpp » ('j') | no next file with comments »