src/opts/SkBlitRow_opts_SSE4.cpp - Issue 923523002: Replace SSE optimization of Color32A_D565

Side by Side Diff: src/opts/SkBlitRow_opts_SSE4.cpp

Issue 923523002: Replace SSE optimization of Color32A_D565 (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: Created 5 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 #include "SkBlitRow_opts_SSE4.h"	1 #include "SkBlitRow_opts_SSE4.h"

2	2

3 // Some compilers can't compile SSSE3 or SSE4 intrinsics. We give them stub methods.	3 // Some compilers can't compile SSSE3 or SSE4 intrinsics. We give them stub methods.

4 // The stubs should never be called, so we make them crash just to confirm that.	4 // The stubs should never be called, so we make them crash just to confirm that.

5 #if SK_CPU_SSE_LEVEL < SK_CPU_SSE_LEVEL_SSE41	5 #if SK_CPU_SSE_LEVEL < SK_CPU_SSE_LEVEL_SSE41

6 void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_RESTRICT, int, U8CPU) {	6 void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_RESTRICT, int, U8CPU) {

7 sk_throw();	7 sk_throw();

8 }	8 }

9	9

10 void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y) {	10 void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y) {

(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
59	59

60 // Wrap up the last <= 15 pixels.	60 // Wrap up the last <= 15 pixels.

61 for (int i = count16*16; i < count; i++) {	61 for (int i = count16*16; i < count; i++) {

62 // This check is not really necessarily, but it prevents pointless autovectorization.	62 // This check is not really necessarily, but it prevents pointless autovectorization.

63 if (src[i] & 0xFF000000) {	63 if (src[i] & 0xFF000000) {

64 dst[i] = SkPMSrcOver(src[i], dst[i]);	64 dst[i] = SkPMSrcOver(src[i], dst[i]);

65 }	65 }

66 }	66 }

67 }	67 }

68	68

69 static inline uint16_t Color32A_D565_1x(uint16_t dst, unsigned scale, uint32_t src_expand) {

70 uint32_t dst_expand = SkExpand_rgb_16(dst) * scale;

71 return SkCompact_rgb_16((src_expand + dst_expand) >> 5);

72 }

73

74 void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y) {	69 void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y) {

75 SkASSERT(count > 0);	70 SkASSERT(count > 0);

76	71

77 uint32_t src_expand = (SkGetPackedG32(src) << 24) |	72 uint32_t src_expand = (SkGetPackedG32(src) << 24) |

78 (SkGetPackedR32(src) << 13) |	73 (SkGetPackedR32(src) << 13) |

79 (SkGetPackedB32(src) << 2);	74 (SkGetPackedB32(src) << 2);

80 unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;	75 unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;

81	76

82 // Check if we have enough pixels to run SIMD	77 // Check if we have enough pixels to run SIMD

83 if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) {	78 if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) {

84 __m128i* dst_wide;	79 __m128i* dst_wide;

85 const __m128i src_expand_wide = _mm_set1_epi32(src_expand);	80 const __m128i src_expand_wide = _mm_set1_epi32(src_expand);

86 const __m128i scale_wide = _mm_set1_epi32(scale);	81 const __m128i scale_wide = _mm_set1_epi32(scale);

87 const __m128i mask_green = _mm_set1_epi32(SK_R16_MASK_IN_PLACE |	82 const __m128i mask_green = _mm_set1_epi32(SK_R16_MASK_IN_PLACE |

88 SK_B16_MASK_IN_PLACE |	83 SK_B16_MASK_IN_PLACE |

89 (SK_G16_MASK_IN_PLACE << 16));	84 (SK_G16_MASK_IN_PLACE << 16));

90	85

91 // Align dst to an even 16 byte address (0-7 pixels)	86 // Align dst to an even 16 byte address (0-7 pixels)

92 while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) {	87 while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) {

93 *dst = Color32A_D565_1x(*dst, scale, src_expand);	88 *dst = SkBlend32A_D565(*dst, scale, src_expand);

94 dst += 1;	89 dst += 1;

95 count--;	90 count--;

96 }	91 }

97	92

98 dst_wide = reinterpret_cast<__m128i*>(dst);	93 dst_wide = reinterpret_cast<__m128i*>(dst);

99 do {	94 do {

100 // Load 8 RGB565 pixels	95 // Load 8 RGB565 pixels

101 __m128i pixels = _mm_load_si128(dst_wide);	96 __m128i pixels = _mm_load_si128(dst_wide);

102	97

103 // Duplicate and mask	98 // Duplicate and mask

(...skipping 21 matching lines...) Expand all Loading...
125 _mm_store_si128(dst_wide, pixels);	120 _mm_store_si128(dst_wide, pixels);

126 count -= 8;	121 count -= 8;

127 dst_wide++;	122 dst_wide++;

128 } while (count >= 8);	123 } while (count >= 8);

129	124

130 dst = reinterpret_cast<uint16_t*>(dst_wide);	125 dst = reinterpret_cast<uint16_t*>(dst_wide);

131 }	126 }

132	127

133 // Small loop to handle remaining pixels.	128 // Small loop to handle remaining pixels.

134 while (count > 0) {	129 while (count > 0) {

135 *dst = Color32A_D565_1x(*dst, scale, src_expand);	130 *dst = SkBlend32A_D565(*dst, scale, src_expand);

136 dst += 1;	131 dst += 1;

137 count--;	132 count--;

138 }	133 }

139 }	134 }

140	135

141 #endif	136 #endif

OLD	NEW

« include/core/SkColorPriv.h ('K') | « src/opts/SkBlitRow_opts_SSE2.cpp ('k') | src/opts/opts_check_x86.cpp » ('j') | src/opts/opts_check_x86.cpp » ('J')