src/opts/SkSwizzler_opts.h - Issue 1601883002: Add SSSE3 Optimizations for premul and swap

Side by Side Diff: src/opts/SkSwizzler_opts.h

Issue 1601883002: Add SSSE3 Optimizations for premul and swap (Closed) Base URL: https://skia.googlesource.com/skia.git@f-and-x

Patch Set: Use shared proc Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright 2016 Google Inc.	2 * Copyright 2016 Google Inc.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #ifndef SkSwizzler_opts_DEFINED	8 #ifndef SkSwizzler_opts_DEFINED

9 #define SkSwizzler_opts_DEFINED	9 #define SkSwizzler_opts_DEFINED

10	10

(...skipping 156 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
167 // Store 8 pixels.	167 // Store 8 pixels.

168 vst4_u8((uint8_t*) dst, bgra);	168 vst4_u8((uint8_t*) dst, bgra);

169 src += 8;	169 src += 8;

170 dst += 8;	170 dst += 8;

171 count -= 8;	171 count -= 8;

172 }	172 }

173	173

174 swaprb_xxxa_portable(dst, src, count);	174 swaprb_xxxa_portable(dst, src, count);

175 }	175 }

176	176

	177 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3

	178

	179 template <bool kSwapRB>

	180 static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) {

	181 const __m128i zeros = _mm_setzero_si128();

	182 const __m128i _128 = _mm_set1_epi16(128);

	183 const __m128i _257 = _mm_set1_epi16(257);

	184 __m128i planar;

	185 if (kSwapRB) {

	186 planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);

	187 } else {

	188 planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);

	189 }

	190

	191 auto premul8 = [&zeros, &_128, &_257, &planar](__m128i* lo, __m128i* hi) {
	mtklein 2016/01/19 20:15:02: The comments inside the while loop are now probably not necessary. Our SSE code works with interlaced pixels more or less by default, as that's the memory layout. If you want to leave a note about interlacing vs. planar, let's put it here on premul8: "// lo and hi should point to interlaced pixels (bgra bgra ...) and will be left that way." mtklein 2016/01/19 20:15:02: Just out of curiosity, what happens to the codegen / perf if we just move these constants inside? I'd hope it's unaffected? I don't personally mind writing this as auto premul8 = [&] { ... }; but I will never object to listing out each closed-over variable. msarett 2016/01/19 21:02:39: On 2016/01/19 20:15:02, mtklein wrote: > The comments inside the while loop are now probably not necessary. Our SSE code > works with interlaced pixels more or less by default, as that's the memory > layout. If you want to leave a note about interlacing vs. planar, let's put it > here on premul8: > > // lo and hi should point to interlaced pixels (bgra bgra ...) and will be > left that way. Codegen is unaffected by moving the constants inside the function. Comments also removed.
	192 // Swizzle the pixels to 8-bit planar.

	193 *lo = _mm_shuffle_epi8(*lo, planar); // bbbbgggg rrrraaaa

	194 *hi = _mm_shuffle_epi8(*hi, planar); // BBBBGGGG RRRRAAAA

	195 __m128i bg = _mm_unpacklo_epi32(*lo, *hi), // bbbbBBBB ggggGGGG

	196 ra = _mm_unpackhi_epi32(*lo, *hi); // rrrrRRRR aaaaAAAA

	197

	198 // Unpack to 16-bit planar.

	199 __m128i b = _mm_unpacklo_epi8(bg, zeros), // b_b_b_b_ B_B_B_B_

	200 g = _mm_unpackhi_epi8(bg, zeros), // g_g_g_g_ G_G_G_G_

	201 r = _mm_unpacklo_epi8(ra, zeros), // r_r_r_r_ R_R_R_R_

	202 a = _mm_unpackhi_epi8(ra, zeros); // a_a_a_a_ A_A_A_A_

	203

	204 // Premultiply! (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.

	205 b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(b, a), _128), _257);

	206 g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(g, a), _128), _257);

	207 r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(r, a), _128), _257);

	208

	209 // Repack into interlaced pixels.

	210 bg = _mm_or_si128(b, _mm_slli_epi16(g, 8)); // bgbgbgbg BGBGBGBG

	211 ra = _mm_or_si128(r, _mm_slli_epi16(a, 8)); // rararara RARARARA

	212 *lo = _mm_unpacklo_epi16(bg, ra); // bgrabgra bgrabgra

	213 *hi = _mm_unpackhi_epi16(bg, ra); // BGRABGRA BGRABGRA

	214 };

	215

	216 while (count >= 8) {

	217 // First just load the 8 interlaced pixels.

	218 __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)), // bgrabgra bgrabgra

	219 hi = _mm_loadu_si128((const __m128i*) (src + 4)); // BGRABGRA BGRABGRA

	220

	221 premul8(&lo, &hi);

	222

	223 // Store interlaced pixels.

	224 _mm_storeu_si128((__m128i*) (dst + 0), lo);

	225 _mm_storeu_si128((__m128i*) (dst + 4), hi);

	226

	227 src += 8;

	228 dst += 8;

	229 count -= 8;

	230 }

	231

	232 if (count >= 4) {

	233 // First just load 4 interlaced pixels.

	234 __m128i lo = _mm_loadu_si128((const __m128i*) src), // bgrabgra bgrabgra

	235 hi = _mm_setzero_si128();

	236

	237 premul8(&lo, &hi);

	238

	239 // Store interlaced pixels.

	240 _mm_storeu_si128((__m128i*) dst, lo);

	241

	242 src += 4;

	243 dst += 4;

	244 count -= 4;

	245 }

	246

	247 // Call portable code to finish up the tail of [0,4) pixels.

	248 auto proc = kSwapRB ? premul_swaprb_xxxa_portable : premul_xxxa_portable;

	249 proc(dst, src, count);

	250 }

	251

	252 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {

	253 premul_xxxa_should_swaprb<false>(dst, src, count);

	254 }

	255

	256 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {

	257 premul_xxxa_should_swaprb<true>(dst, src, count);

	258 }

	259

	260 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {

	261 const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);

	262

	263 while (count >= 4) {

	264 __m128i bgra = _mm_loadu_si128((const __m128i*) src);

	265 __m128i rgba = _mm_shuffle_epi8(bgra, swapRB);

	266 _mm_storeu_si128((__m128i*) dst, rgba);

	267

	268 src += 4;

	269 dst += 4;

	270 count -= 4;

	271 }

	272

	273 swaprb_xxxa_portable(dst, src, count);

	274 }

	275

177 #else	276 #else

178	277

179 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {	278 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {

180 premul_xxxa_portable(dst, src, count);	279 premul_xxxa_portable(dst, src, count);

181 }	280 }

182	281

183 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {	282 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {

184 premul_swaprb_xxxa_portable(dst, src, count);	283 premul_swaprb_xxxa_portable(dst, src, count);

185 }	284 }

186	285

187 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {	286 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {

188 swaprb_xxxa_portable(dst, src, count);	287 swaprb_xxxa_portable(dst, src, count);

189 }	288 }

190	289

191 #endif	290 #endif

192	291

193 }	292 }

194	293

195 #endif // SkSwizzler_opts_DEFINED	294 #endif // SkSwizzler_opts_DEFINED

OLD	NEW

« no previous file with comments | « src/opts/SkOpts_ssse3.cpp ('k') | no next file » | no next file with comments »