src/opts/SkSwizzler_opts.h - Issue 1626463002: Refactor swizzle names and types.

Side by Side Diff: src/opts/SkSwizzler_opts.h

Issue 1626463002: Refactor swizzle names and types. (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright 2016 Google Inc.	2 * Copyright 2016 Google Inc.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #ifndef SkSwizzler_opts_DEFINED	8 #ifndef SkSwizzler_opts_DEFINED

9 #define SkSwizzler_opts_DEFINED	9 #define SkSwizzler_opts_DEFINED

10	10

11 #include "SkColorPriv.h"	11 #include "SkColorPriv.h"

12	12

13 namespace SK_OPTS_NS {	13 namespace SK_OPTS_NS {

14	14

15 // These variable names in these functions just pretend the input is BGRA.	15 static void RGBA_to_rgbA_portable(uint32_t* dst, const void* vsrc, int count) {
msarett 2016/01/22 15:35:08: I find this comment to still be useful? There is one BGRA user (yes it's BMP :)).
mtklein 2016/01/22 15:39:52: Sort of... now that the order and the function names agree with each other, do we really need to say that here? "// The variable names in these functions follow the byte-order convention established by the function names." That should be implicit in most code. If we put a comment like this anywhere, it'd probably be a reminder in SkOpts.h, e.g. that BGRA_to_RGBA is the same as RGBA_to_BGRA? It almost seems too obvious?
msarett 2016/01/22 15:40:51: Agreed that it's obvious. Let's drop it.
16 // They work fine with both RGBA and BGRA.	16 auto src = (const uint32_t*)vsrc;

17

18 static void premul_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) {

19 for (int i = 0; i < count; i++) {	17 for (int i = 0; i < count; i++) {

20 uint8_t a = src[i] >> 24,	18 uint8_t a = src[i] >> 24,

21 r = src[i] >> 16,	19 b = src[i] >> 16,

22 g = src[i] >> 8,	20 g = src[i] >> 8,

23 b = src[i] >> 0;	21 r = src[i] >> 0;

	22 b = (b*a+127)/255;

	23 g = (g*a+127)/255;

24 r = (r*a+127)/255;	24 r = (r*a+127)/255;

	25 dst[i] = (uint32_t)a << 24

	26 | (uint32_t)b << 16

	27 | (uint32_t)g << 8

	28 | (uint32_t)r << 0;

	29 }

	30 }

	31

	32 static void RGBA_to_bgrA_portable(uint32_t* dst, const void* vsrc, int count) {

	33 auto src = (const uint32_t*)vsrc;

	34 for (int i = 0; i < count; i++) {

	35 uint8_t a = src[i] >> 24,

	36 b = src[i] >> 16,

	37 g = src[i] >> 8,

	38 r = src[i] >> 0;

	39 b = (b*a+127)/255;

25 g = (g*a+127)/255;	40 g = (g*a+127)/255;

26 b = (b*a+127)/255;	41 r = (r*a+127)/255;

27 dst[i] = (uint32_t)a << 24	42 dst[i] = (uint32_t)a << 24

28 | (uint32_t)r << 16	43 | (uint32_t)r << 16

29 | (uint32_t)g << 8	44 | (uint32_t)g << 8

30 | (uint32_t)b << 0;	45 | (uint32_t)b << 0;

31 }	46 }

32 }	47 }

33	48

34 static void premul_swaprb_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) {	49 static void RGBA_to_BGRA_portable(uint32_t* dst, const void* vsrc, int count) {

	50 auto src = (const uint32_t*)vsrc;

35 for (int i = 0; i < count; i++) {	51 for (int i = 0; i < count; i++) {

36 uint8_t a = src[i] >> 24,	52 uint8_t a = src[i] >> 24,

37 r = src[i] >> 16,	53 b = src[i] >> 16,

38 g = src[i] >> 8,	54 g = src[i] >> 8,

39 b = src[i] >> 0;	55 r = src[i] >> 0;

40 r = (r*a+127)/255;

41 g = (g*a+127)/255;

42 b = (b*a+127)/255;

43 dst[i] = (uint32_t)a << 24	56 dst[i] = (uint32_t)a << 24

44 | (uint32_t)b << 16	57 | (uint32_t)r << 16

45 | (uint32_t)g << 8	58 | (uint32_t)g << 8

46 | (uint32_t)r << 0;	59 | (uint32_t)b << 0;

47 }	60 }

48 }	61 }

49	62

50 static void swaprb_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) {

51 for (int i = 0; i < count; i++) {

52 uint8_t a = src[i] >> 24,

53 r = src[i] >> 16,

54 g = src[i] >> 8,

55 b = src[i] >> 0;

56 dst[i] = (uint32_t)a << 24

57 | (uint32_t)b << 16

58 | (uint32_t)g << 8

59 | (uint32_t)r << 0;

60 }

61 }

62

63 #if defined(SK_ARM_HAS_NEON)	63 #if defined(SK_ARM_HAS_NEON)

64	64

65 // Rounded divide by 255, (x + 127) / 255	65 // Rounded divide by 255, (x + 127) / 255

66 static uint8x8_t div255_round(uint16x8_t x) {	66 static uint8x8_t div255_round(uint16x8_t x) {

67 // result = (x + 127) / 255	67 // result = (x + 127) / 255

68 // result = (x + 127) / 256 + error1	68 // result = (x + 127) / 256 + error1

69 //	69 //

70 // error1 = (x + 127) / (255 * 256)	70 // error1 = (x + 127) / (255 * 256)

71 // error1 = (x + 127) / (256 * 256) + error2	71 // error1 = (x + 127) / (256 * 256) + error2

72 //	72 //

(...skipping 12 matching lines...) Expand all Loading...
85 // "add, round, and narrow back to 8-bits" instruction.	85 // "add, round, and narrow back to 8-bits" instruction.

86 return vraddhn_u16(x, vrshrq_n_u16(x, 8));	86 return vraddhn_u16(x, vrshrq_n_u16(x, 8));

87 }	87 }

88	88

89 // Scale a byte by another, (x * y + 127) / 255	89 // Scale a byte by another, (x * y + 127) / 255

90 static uint8x8_t scale(uint8x8_t x, uint8x8_t y) {	90 static uint8x8_t scale(uint8x8_t x, uint8x8_t y) {

91 return div255_round(vmull_u8(x, y));	91 return div255_round(vmull_u8(x, y));

92 }	92 }

93	93

94 template <bool kSwapRB>	94 template <bool kSwapRB>

95 static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) {	95 static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {

	96 auto src = (const uint32_t*)vsrc;

96 while (count >= 8) {	97 while (count >= 8) {

97 // Load 8 pixels.	98 // Load 8 pixels.

98 uint8x8x4_t bgra = vld4_u8((const uint8_t*) src);	99 uint8x8x4_t bgra = vld4_u8((const uint8_t*) src);

99	100

100 uint8x8_t a = bgra.val[3],	101 uint8x8_t a = bgra.val[3],

101 r = bgra.val[2],	102 b = bgra.val[2],

102 g = bgra.val[1],	103 g = bgra.val[1],

103 b = bgra.val[0];	104 r = bgra.val[0];

104	105

105 // Premultiply.	106 // Premultiply.

	107 b = scale(b, a);

	108 g = scale(g, a);

106 r = scale(r, a);	109 r = scale(r, a);

107 g = scale(g, a);

108 b = scale(b, a);

109	110

110 // Store 8 premultiplied pixels.	111 // Store 8 premultiplied pixels.

111 if (kSwapRB) {	112 if (kSwapRB) {

	113 bgra.val[2] = r;

	114 bgra.val[1] = g;

	115 bgra.val[0] = b;

	116 } else {

112 bgra.val[2] = b;	117 bgra.val[2] = b;

113 bgra.val[1] = g;	118 bgra.val[1] = g;

114 bgra.val[0] = r;	119 bgra.val[0] = r;

115 } else {

116 bgra.val[2] = r;

117 bgra.val[1] = g;

118 bgra.val[0] = b;

119 }	120 }

120 vst4_u8((uint8_t*) dst, bgra);	121 vst4_u8((uint8_t*) dst, bgra);

121 src += 8;	122 src += 8;

122 dst += 8;	123 dst += 8;

123 count -= 8;	124 count -= 8;

124 }	125 }

125	126

126 // Call portable code to finish up the tail of [0,8) pixels.	127 // Call portable code to finish up the tail of [0,8) pixels.

127 auto proc = kSwapRB ? premul_swaprb_xxxa_portable : premul_xxxa_portable;	128 auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;

128 proc(dst, src, count);	129 proc(dst, src, count);

129 }	130 }

130	131

131 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {	132 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {

132 premul_xxxa_should_swaprb<false>(dst, src, count);	133 premul_should_swapRB<false>(dst, src, count);

133 }	134 }

134	135

135 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {	136 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {

136 premul_xxxa_should_swaprb<true>(dst, src, count);	137 premul_should_swapRB<true>(dst, src, count);

137 }	138 }

138	139

139 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {	140 static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {

	141 auto src = (const uint32_t*)vsrc;

140 while (count >= 16) {	142 while (count >= 16) {

141 // Load 16 pixels.	143 // Load 16 pixels.

142 uint8x16x4_t bgra = vld4q_u8((const uint8_t*) src);	144 uint8x16x4_t bgra = vld4q_u8((const uint8_t*) src);

143	145

144 // Swap r and b.	146 // Swap r and b.

145 SkTSwap(bgra.val[0], bgra.val[2]);	147 SkTSwap(bgra.val[0], bgra.val[2]);

146	148

147 // Store 16 pixels.	149 // Store 16 pixels.

148 vst4q_u8((uint8_t*) dst, bgra);	150 vst4q_u8((uint8_t*) dst, bgra);

149 src += 16;	151 src += 16;

150 dst += 16;	152 dst += 16;

151 count -= 16;	153 count -= 16;

152 }	154 }

153	155

154 if (count >= 8) {	156 if (count >= 8) {

155 // Load 8 pixels.	157 // Load 8 pixels.

156 uint8x8x4_t bgra = vld4_u8((const uint8_t*) src);	158 uint8x8x4_t bgra = vld4_u8((const uint8_t*) src);

157	159

158 // Swap r and b.	160 // Swap r and b.

159 SkTSwap(bgra.val[0], bgra.val[2]);	161 SkTSwap(bgra.val[0], bgra.val[2]);

160	162

161 // Store 8 pixels.	163 // Store 8 pixels.

162 vst4_u8((uint8_t*) dst, bgra);	164 vst4_u8((uint8_t*) dst, bgra);

163 src += 8;	165 src += 8;

164 dst += 8;	166 dst += 8;

165 count -= 8;	167 count -= 8;

166 }	168 }

167	169

168 swaprb_xxxa_portable(dst, src, count);	170 RGBA_to_BGRA_portable(dst, src, count);

169 }	171 }

170	172

171 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3	173 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3

172	174

173 template <bool kSwapRB>	175 template <bool kSwapRB>

174 static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) {	176 static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {

	177 auto src = (const uint32_t*)vsrc;

175	178

176 auto premul8 = [](__m128i* lo, __m128i* hi) {	179 auto premul8 = [](__m128i* lo, __m128i* hi) {

177 const __m128i zeros = _mm_setzero_si128();	180 const __m128i zeros = _mm_setzero_si128();

178 const __m128i _128 = _mm_set1_epi16(128);	181 const __m128i _128 = _mm_set1_epi16(128);

179 const __m128i _257 = _mm_set1_epi16(257);	182 const __m128i _257 = _mm_set1_epi16(257);

180 __m128i planar;	183 __m128i planar;

181 if (kSwapRB) {	184 if (kSwapRB) {

182 planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);	185 planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);

183 } else {	186 } else {

184 planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);	187 planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);

185 }	188 }

186	189

187 // Swizzle the pixels to 8-bit planar.	190 // Swizzle the pixels to 8-bit planar.

188 *lo = _mm_shuffle_epi8(*lo, planar); // bbbbgggg rrrraaaa	191 *lo = _mm_shuffle_epi8(*lo, planar); // rrrrgggg bbbbaaaa

189 *hi = _mm_shuffle_epi8(*hi, planar); // BBBBGGGG RRRRAAAA	192 *hi = _mm_shuffle_epi8(*hi, planar); // RRRRGGGG BBBBAAAA

190 __m128i bg = _mm_unpacklo_epi32(*lo, *hi), // bbbbBBBB ggggGGGG	193 __m128i rg = _mm_unpacklo_epi32(*lo, *hi), // rrrrRRRR ggggGGGG

191 ra = _mm_unpackhi_epi32(*lo, *hi); // rrrrRRRR aaaaAAAA	194 ba = _mm_unpackhi_epi32(*lo, *hi); // bbbbBBBB aaaaAAAA

192	195

193 // Unpack to 16-bit planar.	196 // Unpack to 16-bit planar.

194 __m128i b = _mm_unpacklo_epi8(bg, zeros), // b_b_b_b_ B_B_B_B_	197 __m128i r = _mm_unpacklo_epi8(rg, zeros), // r_r_r_r_ R_R_R_R_

195 g = _mm_unpackhi_epi8(bg, zeros), // g_g_g_g_ G_G_G_G_	198 g = _mm_unpackhi_epi8(rg, zeros), // g_g_g_g_ G_G_G_G_

196 r = _mm_unpacklo_epi8(ra, zeros), // r_r_r_r_ R_R_R_R_	199 b = _mm_unpacklo_epi8(ba, zeros), // b_b_b_b_ B_B_B_B_

197 a = _mm_unpackhi_epi8(ra, zeros); // a_a_a_a_ A_A_A_A_	200 a = _mm_unpackhi_epi8(ba, zeros); // a_a_a_a_ A_A_A_A_

198	201

199 // Premultiply! (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.	202 // Premultiply! (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.

	203 r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(r, a), _128), _257);

	204 g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(g, a), _128), _257);

200 b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(b, a), _128), _257);	205 b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(b, a), _128), _257);

201 g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(g, a), _128), _257);

202 r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(r, a), _128), _257);

203	206

204 // Repack into interlaced pixels.	207 // Repack into interlaced pixels.

205 bg = _mm_or_si128(b, _mm_slli_epi16(g, 8)); // bgbgbgbg BGBGBGBG	208 rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)); // rgrgrgrg RGRGRGRG

206 ra = _mm_or_si128(r, _mm_slli_epi16(a, 8)); // rararara RARARARA	209 ba = _mm_or_si128(b, _mm_slli_epi16(a, 8)); // babababa BABABABA

207 *lo = _mm_unpacklo_epi16(bg, ra); // bgrabgra bgrabgra	210 *lo = _mm_unpacklo_epi16(rg, ba); // rgbargba rgbargba

208 *hi = _mm_unpackhi_epi16(bg, ra); // BRGABGRA BGRABGRA	211 *hi = _mm_unpackhi_epi16(rg, ba); // RGBARGBA RGBARGBA

209 };	212 };

210	213

211 while (count >= 8) {	214 while (count >= 8) {

212 __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),	215 __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),

213 hi = _mm_loadu_si128((const __m128i*) (src + 4));	216 hi = _mm_loadu_si128((const __m128i*) (src + 4));

214	217

215 premul8(&lo, &hi);	218 premul8(&lo, &hi);

216	219

217 _mm_storeu_si128((__m128i*) (dst + 0), lo);	220 _mm_storeu_si128((__m128i*) (dst + 0), lo);

218 _mm_storeu_si128((__m128i*) (dst + 4), hi);	221 _mm_storeu_si128((__m128i*) (dst + 4), hi);

(...skipping 10 matching lines...) Expand all Loading...
229 premul8(&lo, &hi);	232 premul8(&lo, &hi);

230	233

231 _mm_storeu_si128((__m128i*) dst, lo);	234 _mm_storeu_si128((__m128i*) dst, lo);

232	235

233 src += 4;	236 src += 4;

234 dst += 4;	237 dst += 4;

235 count -= 4;	238 count -= 4;

236 }	239 }

237	240

238 // Call portable code to finish up the tail of [0,4) pixels.	241 // Call portable code to finish up the tail of [0,4) pixels.

239 auto proc = kSwapRB ? premul_swaprb_xxxa_portable : premul_xxxa_portable;	242 auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;

240 proc(dst, src, count);	243 proc(dst, src, count);

241 }	244 }

242	245

243 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {	246 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {

244 premul_xxxa_should_swaprb<false>(dst, src, count);	247 premul_should_swapRB<false>(dst, src, count);

245 }	248 }

246	249

247 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {	250 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {

248 premul_xxxa_should_swaprb<true>(dst, src, count);	251 premul_should_swapRB<true>(dst, src, count);

249 }	252 }

250	253

251 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {	254 static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {

	255 auto src = (const uint32_t*)vsrc;

252 const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);	256 const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);

253	257

254 while (count >= 4) {	258 while (count >= 4) {

255 __m128i bgra = _mm_loadu_si128((const __m128i*) src);	259 __m128i rgba = _mm_loadu_si128((const __m128i*) src);

256 __m128i rgba = _mm_shuffle_epi8(bgra, swapRB);	260 __m128i bgra = _mm_shuffle_epi8(rgba, swapRB);

257 _mm_storeu_si128((__m128i*) dst, rgba);	261 _mm_storeu_si128((__m128i*) dst, bgra);

258	262

259 src += 4;	263 src += 4;

260 dst += 4;	264 dst += 4;

261 count -= 4;	265 count -= 4;

262 }	266 }

263	267

264 swaprb_xxxa_portable(dst, src, count);	268 RGBA_to_BGRA_portable(dst, src, count);

265 }	269 }

266	270

267 #else	271 #else

268	272

269 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {	273 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {

270 premul_xxxa_portable(dst, src, count);	274 RGBA_to_rgbA_portable(dst, src, count);

271 }	275 }

272	276

273 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {	277 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {

274 premul_swaprb_xxxa_portable(dst, src, count);	278 RGBA_to_bgrA_portable(dst, src, count);

275 }	279 }

276	280

277 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {	281 static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) {

278 swaprb_xxxa_portable(dst, src, count);	282 RGBA_to_BGRA_portable(dst, src, count);

279 }	283 }

280	284

281 #endif	285 #endif

282	286

283 }	287 }

284	288

285 #endif // SkSwizzler_opts_DEFINED	289 #endif // SkSwizzler_opts_DEFINED

OLD	NEW

« no previous file with comments | « src/opts/SkOpts_ssse3.cpp ('k') | tests/SwizzlerTest.cpp » ('j') | no next file with comments »