src/opts/SkSwizzler_opts.h - Issue 1601883002: Add SSSE3 Optimizations for premul and swap

Side by Side Diff: src/opts/SkSwizzler_opts.h

Issue 1601883002: Add SSSE3 Optimizations for premul and swap (Closed) Base URL: https://skia.googlesource.com/skia.git@f-and-x

Patch Set: Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright 2016 Google Inc.	2 * Copyright 2016 Google Inc.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #ifndef SkSwizzler_opts_DEFINED	8 #ifndef SkSwizzler_opts_DEFINED

9 #define SkSwizzler_opts_DEFINED	9 #define SkSwizzler_opts_DEFINED

10	10

(...skipping 156 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
167 // Store 8 pixels.	167 // Store 8 pixels.

168 vst4_u8((uint8_t*) dst, bgra);	168 vst4_u8((uint8_t*) dst, bgra);

169 src += 8;	169 src += 8;

170 dst += 8;	170 dst += 8;

171 count -= 8;	171 count -= 8;

172 }	172 }

173	173

174 swaprb_xxxa_portable(dst, src, count);	174 swaprb_xxxa_portable(dst, src, count);

175 }	175 }

176	176

	177 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3

	178

	179 template <bool kSwapRB>

	180 static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) {
	msarett 2016/01/18 20:35:05: There are a lot of different ways to implement this, specifically the unpacking and packing of pixels. I'm not certain this is the fastest way, but it's a start.
	181 const __m128i zeros = _mm_setzero_si128();

	182 const __m128i _128 = _mm_set1_epi16(128);

	183 const __m128i _257 = _mm_set1_epi16(257);

	184 const __m128i combine = _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1 , 12, 8, 4, 0);

	185 __m128i split;

	186 if (kSwapRB) {

	187 split = _mm_set_epi8(15, 3, 7, 11, 14, 2, 6, 10, 13, 1, 5, 9, 12, 0, 4, 8);

	188 } else {

	189 split = combine;

	190 }

	191

	192 while (count >= 8) {

	193 __m128i argb_lo = _mm_loadu_si128((const __m128i*) src);

	194 __m128i argb_hi = _mm_loadu_si128((const __m128i*) (src + 4));

	195

	196 // argb_argb_argb_argb -> aaaa_rrrr_gggg_bbbb
	mtklein 2016/01/19 15:59:14:
	Let's kick some of these comments a little bit higher-level:
	    // We'll load 8 pixels into 4 registers, each holding a 16-bit component plane.
	    // First just load the 8 interlaced pixels.
	    __m128i lo = _mm_loadu_si128(... +0),  // bgrabgra bgrabgra
	            hi = _mm_loadu_si128(... +4);  // BGRABGRA BGRABGRA
	    // Swizzle them to 8-bit planar.
	    lo = _mm_shuffle_epi8(lo, planar);  // bbbbgggg rrrraaaa
	    hi = _mm_shuffle_epi8(hi, planar);  // BBBBGGGG RRRRAAAA
	    __m128i bg = _mm_unpacklo(...),  // bbbbBBBB ggggGGGG
	            ra = _mm_unpackhi(...);  // rrrrRRRR aaaaAAAA
	    // Unpack to 16-bit planar in four registers.
	    __m128i b = _mm_unpacklo(...),  // b_b_b_b_ B_B_B_B_
	            ...;
	    // OK, premultiply!  (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
	    ...
	msarett 2016/01/19 17:34:38:
	On 2016/01/19 15:59:14, mtklein wrote:
	> Let's kick some of these comments a little bit higher-level:
	>
	> // We'll load 8 pixels into 4 registers, each holding a 16-bit component plane.
	>
	> // First just load the 8 interlaced pixels.
	> __m128i lo = _mm_loadu_si128(... +0),  // bgrabgra bgrabgra
	>         hi = _mm_loadu_si128(... +4);  // BGRABGRA BGRABGRA
	>
	> // Swizzle them to 8-bit planar.
	> lo = _mm_shuffle_epi8(lo, planar);  // bbbbgggg rrrraaaa
	> hi = _mm_shuffle_epi8(hi, planar);  // BBBBGGGG RRRRAAAA
	> __m128i bg = _mm_unpacklo(...),  // bbbbBBBB ggggGGGG
	>         ra = _mm_unpackhi(...);  // rrrrRRRR aaaaAAAA
	>
	> // Unpack to 16-bit planar in four registers.
	> __m128i b = _mm_unpacklo(...),  // b_b_b_b_ B_B_B_B_
	>         ...;
	>
	> // OK, premultiply!  (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
	> ...
	Done. Ugggh, for some reason I thought the rest of this file was written as ARGB (so I did that on purpose). But I'm realizing now that it's BGRA. And I agree that BGRA is easier to think about.
	197 argb_lo = _mm_shuffle_epi8(argb_lo, combine);

	198 argb_hi = _mm_shuffle_epi8(argb_hi, combine);

	199

	200 // aaaa_rrrr_gggg_bbbb -> aaaa_aaaa_rrrr_rrrr

	201 __m128i ar = _mm_unpackhi_epi32(argb_lo, argb_hi);

	202 // aaaa_rrrr_gggg_bbbb -> gggg_gggg_bbbb_bbbb

	203 __m128i gb = _mm_unpacklo_epi32(argb_lo, argb_hi);

	204

	205 // xxxx_xxxx_yyyy_yyyy -> 0x0x_0x0x_0x0x_0x0x

	206 // xxxx_xxxx_yyyy_yyyy -> 0y0y_0y0y_0y0y_0y0y

	207 __m128i a = _mm_unpackhi_epi8(ar, zeros);

	208 __m128i r = _mm_unpacklo_epi8(ar, zeros);

	209 __m128i g = _mm_unpackhi_epi8(gb, zeros);

	210 __m128i b = _mm_unpacklo_epi8(gb, zeros);

	211

	212 // (x + 127) / 255 == ((x + 128) * 257) >> 16 for 0 <= x <= 255 * 255
	msarett 2016/01/18 20:35:05: Thanks to Mike for this insight.
	213 // Note that _mm_mulhi_epu16 performs the entire (y * 257) >> 16.

	214 r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(a, r), _128), _257);
	mtklein 2016/01/19 15:59:14:
	This may be a matter of personal preference, but you might consider:
	    auto scale = [](__m128i x, __m128i y) { return _mm_mulhi_epu16(...); };
	    r = scale(r,a);
	    g = scale(g,a);
	    b = scale(b,a);
	msarett 2016/01/19 17:34:38:
	On 2016/01/19 15:59:14, mtklein wrote:
	> This may be a matter of personal preference, but you might consider:
	>
	> auto scale = [](__m128i x, __m128i y) { return _mm_mulhi_epu16(...); };
	> r = scale(r,a);
	> g = scale(g,a);
	> b = scale(b,a);
	Leaving as is, though I'm kind of indifferent. I needed to pass references to _128 and _257 to "scale" in order to get it to compile, and I found it a bit confusing.
	215 g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(a, g), _128), _257);

	216 b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(a, b), _128), _257);

	217

	218 // aaaa_rrrr_aaaa_rrrr
	mtklein 2016/01/19 15:59:14:
	I think we can do this repacking as something like:
	    __m128i bg = b | (g << 8),
	            ra = r | (a << 8),
	            lo = unpacklo_epi16(bg, ra),
	            hi = unpackhi_epi16(bg, ra);
	    if (kSwapRB) {
	        lo = shuffle_epi8(lo, swapRB)
	        hi = shuffle_epi8(hi, swapRB)
	    }
	    storeu_si128(... +0, lo)
	    storeu_si128(... +4, hi)
	Does that work? I think that makes the non-swapRB path a bit shorter, and the swapRB path no longer.
	msarett 2016/01/19 17:34:37:
	On 2016/01/19 15:59:14, mtklein wrote:
	> I think we can do this repacking as something like:
	>
	> __m128i bg = b | (g << 8),
	>         ra = r | (a << 8),
	>         lo = unpacklo_epi16(bg, ra),
	>         hi = unpackhi_epi16(bg, ra);
	>
	> if (kSwapRB) {
	>     lo = shuffle_epi8(lo, swapRB)
	>     hi = shuffle_epi8(hi, swapRB)
	> }
	> storeu_si128(... +0, lo)
	> storeu_si128(... +4, hi)
	>
	> Does that work? I think that makes the non-swapRB path a bit shorter, and the
	> swapRB path no longer.
	Yes, this is better! Let's even swap BR in the "swizzle to planar" step. Then it is the same cost as not swapping.
	219 ar = _mm_shuffle_epi32(_mm_packus_epi16(r, a), 0xD8);

	220 // gggg_bbbb_gggg_bbbb

	221 gb = _mm_shuffle_epi32(_mm_packus_epi16(b, g), 0xD8);

	222

	223 // aaaa_rrrr_gggg_bbbb

	224 argb_lo = _mm_unpacklo_epi64(gb, ar);

	225 argb_hi = _mm_unpackhi_epi64(gb, ar);

	226

	227 // aaaa_rrrr_gggg_bbbb -> argb_argb_argb_argb

	228 argb_lo = _mm_shuffle_epi8(argb_lo, split);

	229 argb_hi = _mm_shuffle_epi8(argb_hi, split);

	230

	231 _mm_storeu_si128((__m128i*) dst, argb_lo);

	232 _mm_storeu_si128((__m128i*) (dst + 4), argb_hi);

	233

	234 src += 8;

	235 dst += 8;

	236 count -= 8;

	237 }

	238

	239 if (count >= 4) {
	mtklein 2016/01/19 15:59:14: Reminder to self to circle back here when we're happy with n >= 8.
	240 __m128i argb = _mm_loadu_si128((const __m128i*) src);

	241

	242 // argb_argb_argb_argb -> aaaa_rrrr_gggg_bbbb

	243 argb = _mm_shuffle_epi8(argb, combine);

	244

	245 // aaaa_rrrr_gggg_bbbb -> 0000_aaaa_0000_rrrr

	246 __m128i ar = _mm_unpackhi_epi32(argb, zeros);

	247 // aaaa_rrrr_gggg_bbbb -> 0000_gggg_0000_bbbb

	248 __m128i gb = _mm_unpacklo_epi32(argb, zeros);

	249

	250 // xxxx_xxxx_yyyy_yyyy -> 0x0x_0x0x_0x0x_0x0x

	251 // xxxx_xxxx_yyyy_yyyy -> 0y0y_0y0y_0y0y_0y0y

	252 __m128i a = _mm_unpackhi_epi8(ar, zeros);

	253 __m128i r = _mm_unpacklo_epi8(ar, zeros);

	254 __m128i g = _mm_unpackhi_epi8(gb, zeros);

	255 __m128i b = _mm_unpacklo_epi8(gb, zeros);

	256

	257 // (x + 127) / 255 == ((x + 128) * 257) >> 16 for 0 <= x <= 255 * 255

	258 // Note that _mm_mulhi_epu16 performs the entire (y * 257) >> 16.

	259 r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(a, r), _128), _257);

	260 g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(a, g), _128), _257);

	261 b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(a, b), _128), _257);

	262

	263 // aaaa_rrrr_0000_0000

	264 ar = _mm_shuffle_epi32(_mm_packus_epi16(r, a), 0x8F);

	265 // 0000_0000_gggg_bbbb

	266 gb = _mm_shuffle_epi32(_mm_packus_epi16(b, g), 0xD8);

	267

	268 // aaaa_rrrr_gggg_bbbb

	269 argb = _mm_or_si128(ar, gb);

	270

	271 // aaaa_rrrr_gggg_bbbb -> argb_argb_argb_argb

	272 argb = _mm_shuffle_epi8(argb, split);

	273

	274 _mm_storeu_si128((__m128i*) dst, argb);

	275

	276 src += 4;

	277 dst += 4;

	278 count -= 4;

	279 }

	280

	281 // Call portable code to finish up the tail of [0,4) pixels.

	282 auto proc = kSwapRB ? premul_swaprb_xxxa_portable : premul_xxxa_portable;

	283 proc(dst, src, count);

	284 }

	285

	286 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {

	287 premul_xxxa_should_swaprb<false>(dst, src, count);

	288 }

	289

	290 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {

	291 premul_xxxa_should_swaprb<true>(dst, src, count);

	292 }

	293

	294 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {

	295 const __m128i swapRB = _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6 , 3, 0, 1, 2);
	mtklein 2016/01/19 15:59:15:
	I often find it's easier to read these if you use _mm_setr_foo, so that the indices go in ascending order:
	    _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);
	If you do like them as you've written, they're perfectly fine.
	msarett 2016/01/19 17:34:37:
	On 2016/01/19 15:59:15, mtklein wrote:
	> I often find it's easier to read these if you use _mm_setr_foo, so that the
	> indices go in ascending order:
	> _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);
	>
	> If you do like them as you've written, they're perfectly fine.
	I think you're right.
	296

	297 while (count >= 4) {

	298 __m128i argb = _mm_loadu_si128((const __m128i*) src);

	299 __m128i abgr = _mm_shuffle_epi8(argb, swapRB);

	300 _mm_storeu_si128((__m128i*) dst, abgr);

	301

	302 src += 4;

	303 dst += 4;

	304 count -= 4;

	305 }

	306

	307 swaprb_xxxa_portable(dst, src, count);

	308 }

	309

177 #else	310 #else

178	311

179 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {	312 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {

180 premul_xxxa_portable(dst, src, count);	313 premul_xxxa_portable(dst, src, count);

181 }	314 }

182	315

183 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {	316 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {

184 premul_swaprb_xxxa_portable(dst, src, count);	317 premul_swaprb_xxxa_portable(dst, src, count);

185 }	318 }

186	319

187 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {	320 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {

188 swaprb_xxxa_portable(dst, src, count);	321 swaprb_xxxa_portable(dst, src, count);

189 }	322 }

190	323

191 #endif	324 #endif

192	325

193 }	326 }

194	327

195 #endif // SkSwizzler_opts_DEFINED	328 #endif // SkSwizzler_opts_DEFINED

OLD	NEW

« no previous file with comments | « src/opts/SkOpts_ssse3.cpp ('k') | no next file » | no next file with comments »