src/opts/SkBlitRow_opts_SSE4.cpp - Issue 892623002: Add SSE optimization of Color32A_D565

Side by Side Diff: src/opts/SkBlitRow_opts_SSE4.cpp

Issue 892623002: Add SSE optimization of Color32A_D565 (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: Fixed VS warning Created 5 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 #include "SkBlitRow_opts_SSE4.h"	1 #include "SkBlitRow_opts_SSE4.h"

2	2

3 // Some compilers can't compile SSSE3 or SSE4 intrinsics. We give them stub methods.	3 // Some compilers can't compile SSSE3 or SSE4 intrinsics. We give them stub methods.

4 // The stubs should never be called, so we make them crash just to confirm that.	4 // The stubs should never be called, so we make them crash just to confirm that.

5 #if SK_CPU_SSE_LEVEL < SK_CPU_SSE_LEVEL_SSE41	5 #if SK_CPU_SSE_LEVEL < SK_CPU_SSE_LEVEL_SSE41

6 void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_RESTRICT, int, U8CPU) {	6 void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_RESTRICT, int, U8CPU) {

7 sk_throw();	7 sk_throw();

8 }	8 }

9	9

	10 void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y) {

	11 sk_throw();

	12 }

	13

10 #else	14 #else

11	15

12 #include <emmintrin.h> // SSE2: Most _mm_foo() in this file.	16 #include <smmintrin.h> // SSE4.1 intrinsics

13 #include <smmintrin.h> // SSE4.1: _mm_testz_si128 and _mm_testc_si128.

14	17

15 #include "SkColorPriv.h"	18 #include "SkColorPriv.h"

16 #include "SkColor_opts_SSE2.h"	19 #include "SkColor_opts_SSE2.h"

17	20

18 void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst,	21 void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst,

19 const SkPMColor* SK_RESTRICT src,	22 const SkPMColor* SK_RESTRICT src,

20 int count,	23 int count,

21 U8CPU alpha) {	24 U8CPU alpha) {

22 SkASSERT(alpha == 255);	25 SkASSERT(alpha == 255);

23 // As long as we can, we'll work on 16 pixel pairs at once.	26 // As long as we can, we'll work on 16 pixel pairs at once.

(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
56	59

57 // Wrap up the last <= 15 pixels.	60 // Wrap up the last <= 15 pixels.

58 for (int i = count16*16; i < count; i++) {	61 for (int i = count16*16; i < count; i++) {

59 // This check is not really necessarily, but it prevents pointless autovectorization.	62 // This check is not really necessarily, but it prevents pointless autovectorization.

60 if (src[i] & 0xFF000000) {	63 if (src[i] & 0xFF000000) {

61 dst[i] = SkPMSrcOver(src[i], dst[i]);	64 dst[i] = SkPMSrcOver(src[i], dst[i]);

62 }	65 }

63 }	66 }

64 }	67 }

65	68

	69 static inline uint16_t Color32A_D565_1x(uint16_t dst, unsigned scale, uint32_t src_expand) {

	70 uint32_t dst_expand = SkExpand_rgb_16(dst) * scale;

	71 return SkCompact_rgb_16((src_expand + dst_expand) >> 5);

	72 }

	73

	74 void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y) {

	75 SkASSERT(count > 0);

	76

	77 uint32_t src_expand = (SkGetPackedG32(src) << 24) |

	78 (SkGetPackedR32(src) << 13) |

	79 (SkGetPackedB32(src) << 2);

	80 unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;

	81

	82 // Check if we have enough pixels to run SIMD

	83 if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) {

	84 __m128i* dst_wide;

	85 const __m128i src_expand_wide = _mm_set1_epi32(src_expand);

	86 const __m128i scale_wide = _mm_set1_epi32(scale);

	87 const __m128i mask_green = _mm_set1_epi32(SK_R16_MASK_IN_PLACE |

	88 SK_B16_MASK_IN_PLACE |

	89 (SK_G16_MASK_IN_PLACE << 16));

	90

	91 // Align dst to an even 16 byte address (0-7 pixels)

	92 while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) {

	93 *dst = Color32A_D565_1x(*dst, scale, src_expand);

	94 dst += 1;

	95 count--;

	96 }

	97

	98 dst_wide = reinterpret_cast<__m128i*>(dst);

	99 do {

	100 // Load 8 RGB565 pixels

	101 __m128i pixels = _mm_load_si128(dst_wide);

	102

	103 // Duplicate and mask

	104 __m128i pixels_high = _mm_unpackhi_epi16(pixels, pixels);

	105 pixels_high = _mm_and_si128(mask_green, pixels_high);

	106 pixels = _mm_unpacklo_epi16(pixels, pixels);

	107 pixels = _mm_and_si128(mask_green, pixels);

	108

	109 // Scale with alpha

	110 pixels_high = _mm_mullo_epi32(pixels_high, scale_wide);

	111 pixels = _mm_mullo_epi32(pixels, scale_wide);

	112

	113 // Add src_expand_wide and shift down again

	114 pixels_high = _mm_add_epi32(pixels_high, src_expand_wide);

	115 pixels_high = _mm_srli_epi32(pixels_high, 5);

	116 pixels = _mm_add_epi32(pixels, src_expand_wide);

	117 pixels = _mm_srli_epi32(pixels, 5);

	118

	119 // Mask

	120 pixels_high = _mm_and_si128(mask_green, pixels_high);

	121 pixels = _mm_and_si128(mask_green, pixels);

	122

	123 // Combine into RGB565 and store

	124 pixels = _mm_hadd_epi16(pixels, pixels_high);

	125 _mm_store_si128(dst_wide, pixels);

	126 count -= 8;

	127 dst_wide++;

	128 } while (count >= 8);

	129

	130 dst = reinterpret_cast<uint16_t*>(dst_wide);

	131 }

	132

	133 // Small loop to handle remaining pixels.

	134 while (count > 0) {

	135 *dst = Color32A_D565_1x(*dst, scale, src_expand);

	136 dst += 1;

	137 count--;

	138 }

	139 }

	140

66 #endif	141 #endif

OLD	NEW

« no previous file with comments | « src/opts/SkBlitRow_opts_SSE4.h ('k') | src/opts/opts_check_x86.cpp » ('j') | no next file with comments »