OLD | NEW |
---|---|
1 #include "SkBlitRow_opts_SSE4.h" | 1 #include "SkBlitRow_opts_SSE4.h" |
2 | 2 |
3 // Some compilers can't compile SSSE3 or SSE4 intrinsics. We give them stub methods. | 3 // Some compilers can't compile SSSE3 or SSE4 intrinsics. We give them stub methods. |
4 // The stubs should never be called, so we make them crash just to confirm that. | 4 // The stubs should never be called, so we make them crash just to confirm that. |
5 #if SK_CPU_SSE_LEVEL < SK_CPU_SSE_LEVEL_SSE41 | 5 #if SK_CPU_SSE_LEVEL < SK_CPU_SSE_LEVEL_SSE41 |
6 void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_RESTRICT, int, U8CPU) { | 6 void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_RESTRICT, int, U8CPU) { |
7 sk_throw(); | 7 sk_throw(); |
8 } | 8 } |
9 | 9 |
10 void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y) { | |
11 sk_throw(); | |
12 } | |
13 | |
10 #else | 14 #else |
11 | 15 |
12 #include <emmintrin.h> // SSE2: Most _mm_foo() in this file. | 16 #include <emmintrin.h> // SSE2: Most _mm_foo() in this file. |
13 #include <smmintrin.h> // SSE4.1: _mm_testz_si128 and _mm_testc_si128. | 17 #include <smmintrin.h> // SSE4.1: _mm_testz_si128 and _mm_testc_si128. |
14 | 18 |
15 #include "SkColorPriv.h" | 19 #include "SkColorPriv.h" |
16 #include "SkColor_opts_SSE2.h" | 20 #include "SkColor_opts_SSE2.h" |
17 | 21 |
18 void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst, | 22 void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst, |
19 const SkPMColor* SK_RESTRICT src, | 23 const SkPMColor* SK_RESTRICT src, |
(...skipping 37 matching lines...) | |
57 // Wrap up the last <= 15 pixels. | 61 // Wrap up the last <= 15 pixels. |
58 SkASSERT(count - (count16*16) <= 15); | 62 SkASSERT(count - (count16*16) <= 15); |
59 for (int i = count16*16; i < count; i++) { | 63 for (int i = count16*16; i < count; i++) { |
60 // This check is not really necessary, but it prevents pointless autovectorization. | 64 // This check is not really necessary, but it prevents pointless autovectorization. |
61 if (src[i] & 0xFF000000) { | 65 if (src[i] & 0xFF000000) { |
62 dst[i] = SkPMSrcOver(src[i], dst[i]); | 66 dst[i] = SkPMSrcOver(src[i], dst[i]); |
63 } | 67 } |
64 } | 68 } |
65 } | 69 } |
66 | 70 |
71 void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y) { | |
72 SkASSERT(count > 0); | |
73 // dst must be 2-byte aligned. | |
74 SkASSERT((((size_t)dst) & 0x01) == 0); | |
mtklein
2015/01/30 18:17:37
Seems paranoid? If we need to keep this, let's wr
henrik.smiding
2015/02/10 15:11:51
I thought so too, but I actually found it in the n
| |
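If the assertion stays, a slightly more self-documenting spelling might be something like the following; this is only a sketch and assumes Skia's SkIsAlign2 helper is visible in this file:

    // Same check as ((size_t)dst & 0x01) == 0, just clearer about intent.
    SkASSERT(SkIsAlign2(reinterpret_cast<uintptr_t>(dst)));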
75 | |
76 uint32_t src_expand = (SkGetPackedG32(src) << 24) | | |
77 (SkGetPackedR32(src) << 13) | | |
78 (SkGetPackedB32(src) << 2); | |
79 unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3; | |
80 | |
81 // Check if we have enough pixels to run SIMD, at all | |
82 if (count >= 8) { | |
mtklein
2015/01/30 18:17:37
This logic seems complex, and I'm worried we're ov
henrik.smiding
2015/02/10 15:11:52
I did this optimization based on reports that it w
| |
83 __m128i* dst_wide; | |
84 const __m128i src_expand_wide = _mm_set1_epi32(src_expand); | |
85 const __m128i scale_wide = _mm_set1_epi32(scale); | |
86 const __m128i mask_green = _mm_set1_epi32(SK_R16_MASK_IN_PLACE | | |
87 SK_B16_MASK_IN_PLACE | | |
88 (SK_G16_MASK_IN_PLACE << 16)); | |
89 | |
90 // Check if we should run the aligned SIMD loop | |
91 if ((count >= 64) || ((((size_t)dst) & 0x0F) == 0)) { | |
mtklein
2015/01/30 18:17:37
How'd you pick 64?
henrik.smiding
2015/02/10 15:11:52
I measured different widths with skia bench, using
| |
92 // Align dst to an even 16 byte address (0-7 pixels) | |
93 while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) { | |
94 uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale; | |
mtklein
2015/01/30 18:17:37
You've got a lot of repeated code here. If it nee
henrik.smiding
2015/02/10 15:11:51
I'll re-factor the code a bit. When you're satisfi
| |
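One possible shape for that refactor, sketched purely as an illustration (the blend_8_pixels helper and its kAligned flag are hypothetical names, not necessarily what the CL ended up with): hoist the per-8-pixel arithmetic into a helper and make the alignment of the load/store a compile-time parameter, so the aligned and unaligned loops below share one body.

    template <bool kAligned>
    static inline void blend_8_pixels(__m128i* dst_wide,
                                      const __m128i& src_expand_wide,
                                      const __m128i& scale_wide,
                                      const __m128i& mask_green) {
        // Load 8 RGB565 pixels, aligned or unaligned depending on the caller.
        __m128i pixels = kAligned ? _mm_load_si128(dst_wide)
                                  : _mm_loadu_si128(dst_wide);

        // Duplicate and mask.
        __m128i pixels_high = _mm_unpackhi_epi16(pixels, pixels);
        pixels_high = _mm_and_si128(mask_green, pixels_high);
        pixels = _mm_unpacklo_epi16(pixels, pixels);
        pixels = _mm_and_si128(mask_green, pixels);

        // Scale with alpha.
        pixels_high = _mm_mullo_epi32(pixels_high, scale_wide);
        pixels = _mm_mullo_epi32(pixels, scale_wide);

        // Add src_expand_wide and shift down again.
        pixels_high = _mm_srli_epi32(_mm_add_epi32(pixels_high, src_expand_wide), 5);
        pixels = _mm_srli_epi32(_mm_add_epi32(pixels, src_expand_wide), 5);

        // Mask, combine into RGB565, and store.
        pixels = _mm_hadd_epi16(_mm_and_si128(mask_green, pixels),
                                _mm_and_si128(mask_green, pixels_high));
        if (kAligned) {
            _mm_store_si128(dst_wide, pixels);
        } else {
            _mm_storeu_si128(dst_wide, pixels);
        }
    }

With a helper like that, the two do/while loops reduce to blend_8_pixels<true>(...) and blend_8_pixels<false>(...) plus the count and pointer bookkeeping.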
95 *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5); | |
96 dst += 1; | |
97 count--; | |
98 } | |
99 | |
100 dst_wide = reinterpret_cast<__m128i*>(dst); | |
101 do { | |
102 // Load 8 RGB565 pixels | |
103 __m128i pixels = _mm_load_si128(dst_wide); | |
104 | |
105 // Duplicate and mask | |
106 __m128i pixels_high = _mm_unpackhi_epi16(pixels, pixels); | |
107 pixels_high = _mm_and_si128(mask_green, pixels_high); | |
108 pixels = _mm_unpacklo_epi16(pixels, pixels); | |
109 pixels = _mm_and_si128(mask_green, pixels); | |
110 | |
111 // Scale with alpha | |
112 pixels_high = _mm_mullo_epi32(pixels_high, scale_wide); | |
mtklein
2015/01/30 18:17:37
It doesn't seem like SSE4 is essential to the spee
henrik.smiding
2015/02/10 15:11:51
I added the _mm_hadd instruction at the last minut
| |
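For context on that exchange: _mm_mullo_epi32 is the only SSE4.1 intrinsic in this loop, and _mm_hadd_epi16 further down is SSSE3. If an SSE2-only fallback were ever wanted, the 32-bit low multiply can be emulated with the usual even/odd _mm_mul_epu32 pattern, roughly as sketched here (not part of the CL):

    static inline __m128i mullo_epi32_sse2(__m128i a, __m128i b) {
        // 32x32->64 multiplies of the even (0,2) and odd (1,3) lanes,
        // then keep only the low 32 bits of each product.
        __m128i even = _mm_mul_epu32(a, b);
        __m128i odd  = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));
        return _mm_unpacklo_epi32(_mm_shuffle_epi32(even, _MM_SHUFFLE(0, 0, 2, 0)),
                                  _mm_shuffle_epi32(odd,  _MM_SHUFFLE(0, 0, 2, 0)));
    }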
113 pixels = _mm_mullo_epi32(pixels, scale_wide); | |
114 | |
115 // Add src_expand_wide and shift down again | |
116 pixels_high = _mm_add_epi32(pixels_high, src_expand_wide); | |
117 pixels_high = _mm_srli_epi32(pixels_high, 5); | |
118 pixels = _mm_add_epi32(pixels, src_expand_wide); | |
119 pixels = _mm_srli_epi32(pixels, 5); | |
120 | |
121 // Mask | |
122 pixels_high = _mm_and_si128(mask_green, pixels_high); | |
123 pixels = _mm_and_si128(mask_green, pixels); | |
124 | |
125 // Combine into RGB565 and store | |
126 pixels = _mm_hadd_epi16(pixels, pixels_high); | |
127 _mm_store_si128(dst_wide, pixels); | |
128 count -= 8; | |
129 dst_wide++; | |
130 } while (count >= 8); | |
131 } | |
132 else { // Unaligned loop to handle medium widths | |
133 dst_wide = reinterpret_cast<__m128i*>(dst); | |
134 | |
135 do { | |
136 // Load 8 RGB565 pixels | |
137 __m128i pixels = _mm_loadu_si128(dst_wide); | |
138 | |
139 // Duplicate and mask | |
140 __m128i pixels_high = _mm_unpackhi_epi16(pixels, pixels); | |
141 pixels_high = _mm_and_si128(mask_green, pixels_high); | |
142 pixels = _mm_unpacklo_epi16(pixels, pixels); | |
143 pixels = _mm_and_si128(mask_green, pixels); | |
144 | |
145 // Scale with alpha | |
146 pixels_high = _mm_mullo_epi32(pixels_high, scale_wide); | |
147 pixels = _mm_mullo_epi32(pixels, scale_wide); | |
148 | |
149 // Add src_expand_wide and shift down again | |
150 pixels_high = _mm_add_epi32(pixels_high, src_expand_wide); | |
151 pixels_high = _mm_srli_epi32(pixels_high, 5); | |
152 pixels = _mm_add_epi32(pixels, src_expand_wide); | |
153 pixels = _mm_srli_epi32(pixels, 5); | |
154 | |
155 // Mask | |
156 pixels_high = _mm_and_si128(mask_green, pixels_high); | |
157 pixels = _mm_and_si128(mask_green, pixels); | |
158 | |
159 // Combine into RGB565 and store | |
160 pixels = _mm_hadd_epi16(pixels, pixels_high); | |
161 _mm_storeu_si128(dst_wide, pixels); | |
162 count -= 8; | |
163 dst_wide++; | |
164 } while (count >= 8); | |
165 } | |
166 | |
167 dst = reinterpret_cast<uint16_t*>(dst_wide); | |
168 } | |
169 | |
170 // Small loop to handle remaining 0-7 pixels. | |
171 while (count > 0) { | |
172 uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale; | |
173 *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5); | |
174 dst += 1; | |
175 count--; | |
176 } | |
177 } | |
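For reference, the scalar tail loops and the SIMD lanes above compute the same fixed-point blend. A standalone sketch of that math, with expand_565/compact_565 standing in for SkExpand_rgb_16/SkCompact_rgb_16 and the source channels passed in already unpacked, so no particular SkPMColor byte order is assumed:

    #include <stdint.h>

    static inline uint32_t expand_565(uint16_t c) {
        // Move green to the high half so every channel has headroom for a *32 multiply.
        return (c & 0xF81F) | ((uint32_t)(c & 0x07E0) << 16);
    }
    static inline uint16_t compact_565(uint32_t c) {
        return (uint16_t)((c & 0xF81F) | ((c >> 16) & 0x07E0));
    }
    // r, g, b, a: premultiplied 8-bit source channels.
    static inline uint16_t blend_color32a_over_565(uint16_t dst,
                                                   unsigned r, unsigned g,
                                                   unsigned b, unsigned a) {
        // Destination weight in 1/32 steps: (256 - a) / 8, i.e. 0..32.
        uint32_t scale = (256 - a) >> 3;
        // Source channels positioned over the expanded 565 fields and pre-shifted
        // left by 5, so the final >>5 drops them into place while normalizing dst*scale.
        uint32_t src_expand = ((uint32_t)g << 24) | ((uint32_t)r << 13) | ((uint32_t)b << 2);
        uint32_t dst_expand = expand_565(dst) * scale;
        return compact_565((src_expand + dst_expand) >> 5);
    }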
178 | |
67 #endif | 179 #endif |