src/opts/SkSwizzler_opts.h - Issue 1601883002: Add SSSE3 Optimizations for premul and swap

Side by Side Diff: src/opts/SkSwizzler_opts.h

Issue 1601883002: Add SSSE3 Optimizations for premul and swap (Closed) Base URL: https://skia.googlesource.com/skia.git@f-and-x

Patch Set: Faster repacking, style, comments Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright 2016 Google Inc.	2 * Copyright 2016 Google Inc.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #ifndef SkSwizzler_opts_DEFINED	8 #ifndef SkSwizzler_opts_DEFINED

9 #define SkSwizzler_opts_DEFINED	9 #define SkSwizzler_opts_DEFINED

10	10

(...skipping 156 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
167 // Store 8 pixels.	167 // Store 8 pixels.

168 vst4_u8((uint8_t*) dst, bgra);	168 vst4_u8((uint8_t*) dst, bgra);

169 src += 8;	169 src += 8;

170 dst += 8;	170 dst += 8;

171 count -= 8;	171 count -= 8;

172 }	172 }

173	173

174 swaprb_xxxa_portable(dst, src, count);	174 swaprb_xxxa_portable(dst, src, count);

175 }	175 }

176	176

	177 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3

	178

	179 template <bool kSwapRB>

	180 static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) {

	181 const __m128i zeros = _mm_setzero_si128();

	182 const __m128i _128 = _mm_set1_epi16(128);

	183 const __m128i _257 = _mm_set1_epi16(257);

	184 __m128i planar;

	185 if (kSwapRB) {

	186 planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);

	187 } else {

	188 planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);

	189 }

	190

	191 while (count >= 8) {

	192 // We'll load 8 pixels into 4 registers, each holding a 16-bit component plane.

	193

	194 // First just load the 8 interlaced pixels.

	195 __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)), // bgrabgra bgrabgra

	196 hi = _mm_loadu_si128((const __m128i*) (src + 4)); // BGRABGRA BGRABGRA

	197

	198 // Swizzle them to 8-bit planar.

	199 lo = _mm_shuffle_epi8(lo, planar); // bbbbgggg rrrraaaa

	200 hi = _mm_shuffle_epi8(hi, planar); // BBBBGGGG RRRRAAAA

	201 __m128i bg = _mm_unpacklo_epi32(lo, hi), // bbbbBBBB ggggGGGG

	202 ra = _mm_unpackhi_epi32(lo, hi); // rrrrRRRR aaaaAAAA

	203

	204 // Unpack to 16-bit planar.

	205 __m128i b = _mm_unpacklo_epi8(bg, zeros), // b_b_b_b_ B_B_B_B_

	206 g = _mm_unpackhi_epi8(bg, zeros), // g_g_g_g_ G_G_G_G_

	207 r = _mm_unpacklo_epi8(ra, zeros), // r_r_r_r_ R_R_R_R_

	208 a = _mm_unpackhi_epi8(ra, zeros); // a_a_a_a_ A_A_A_A_

	209

	210 // Premultiply! (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.

	211 b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(b, a), _128), _257);

	212 g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(g, a), _128), _257);

	213 r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(r, a), _128), _257);

	214

	215 // Repack into interlaced pixels.

	216 bg = _mm_or_si128(b, _mm_slli_epi16(g, 8)); // bgbgbgbg BGBGBGBG

	217 ra = _mm_or_si128(r, _mm_slli_epi16(a, 8)); // rararara RARARARA

	218 lo = _mm_unpacklo_epi16(bg, ra); // bgrabgra bgrabgra

	219 hi = _mm_unpackhi_epi16(bg, ra); // BGRABGRA BGRABGRA

	220

	221 // Store interlaced pixels.

	222 _mm_storeu_si128((__m128i*) (dst + 0), lo);

	223 _mm_storeu_si128((__m128i*) (dst + 4), hi);

	224

	225 src += 8;

	226 dst += 8;

	227 count -= 8;

	228 }

	229

	230 if (count >= 4) {
	mtklein 2016/01/19 18:28:30
	OK, now that we've got count >= 8 in shape, let's do something like this?

	    auto premul8 = [](__m128i* lo, __m128i* hi) {
	        // Swizzle them to 8-bit planar.
	        lo = _mm_shuffle_epi8(lo, planar);
	        ...
	        lo = _mm_unpacklo_epi16(bg, ra);
	        hi = _mm_unpackhi_epi16(bg, ra);
	    };

	    while (n >= 8) {
	        __m128i lo = _mm_loadu_si128(...),
	                hi = _mm_loadu_si128(....);
	        premul8(&lo, &hi);
	        _mm_storeu_si128(..., lo);
	        _mm_storeu_si128(..., hi);
	        ...
	    }
	    if (n >= 4) {
	        __m128i lo = _mm_loadu_si128(...);
	        hi = _mm_setzero_si128();
	        premul8(&lo, &hi);
	        _mm_storeu_si128(..., lo);
	        ...
	    }
	    handle n <= 3

	msarett 2016/01/19 19:17:43
	On 2016/01/19 18:28:30, mtklein wrote:
	> OK, now that we've got count >= 8 in shape, let's do something like this?
	>
	> auto premul8 = [](__m128i* lo, __m128i* hi) {
	> // Swizzle them to 8-bit planar.
	> lo = _mm_shuffle_epi8(lo, planar);
	> ...
	> lo = _mm_unpacklo_epi16(bg, ra);
	> hi = _mm_unpackhi_epi16(bg, ra);
	> };
	>
	> while (n >= 8) {
	> __m128i lo = _mm_loadu_si128(...),
	> hi = _mm_loadu_si128(....);
	> premul8(&lo, &hi);
	> _mm_storeu_si128(..., lo);
	> _mm_storeu_si128(..., hi);
	> ...
	> }
	> if (n >= 4) {
	> __m128i lo = _mm_loadu_si128(...);
	> hi = _mm_setzero_si128();
	> premul8(&lo, &hi);
	> _mm_storeu_si128(..., lo);
	> ...
	> }
	> handle n <= 3

	Done.
	231 // First just load 4 interlaced pixels.

	232 __m128i lo = _mm_loadu_si128((const __m128i*) src); // bgrabgra bgrabgra

	233

	234 // Swizzle them to 8-bit planar.

	235 lo = _mm_shuffle_epi8(lo, planar); // bbbbgggg rrrraaaa

	236 __m128i bg = _mm_unpacklo_epi32(lo, zeros), // bbbb____ gggg____

	237 ra = _mm_unpackhi_epi32(lo, zeros); // rrrr____ aaaa____

	238

	239 // Unpack to 16-bit planar.

	240 __m128i b = _mm_unpacklo_epi8(bg, zeros), // b_b_b_b_ ________

	241 g = _mm_unpackhi_epi8(bg, zeros), // g_g_g_g_ ________

	242 r = _mm_unpacklo_epi8(ra, zeros), // r_r_r_r_ ________

	243 a = _mm_unpackhi_epi8(ra, zeros); // a_a_a_a_ ________

	244

	245 // Premultiply! (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.

	246 b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(b, a), _128), _257);

	247 g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(g, a), _128), _257);

	248 r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(r, a), _128), _257);

	249

	250 // Repack into interlaced pixels.

	251 bg = _mm_or_si128(b, _mm_slli_epi16(g, 8)); // bgbgbgbg ________

	252 ra = _mm_or_si128(r, _mm_slli_epi16(a, 8)); // rararara ________

	253 lo = _mm_unpacklo_epi16(bg, ra); // bgrabgra bgrabgra

	254

	255 // Store interlaced pixels.

	256 _mm_storeu_si128((__m128i*) dst, lo);

	257

	258 src += 4;

	259 dst += 4;

	260 count -= 4;

	261 }

	262

	263 // Call portable code to finish up the tail of [0,4) pixels.

	264 auto proc = kSwapRB ? premul_swaprb_xxxa_portable : premul_xxxa_portable;

	265 proc(dst, src, count);

	266 }

	267

	268 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {

	269 premul_xxxa_should_swaprb<false>(dst, src, count);

	270 }

	271

	272 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {

	273 premul_xxxa_should_swaprb<true>(dst, src, count);

	274 }

	275

	276 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {

	277 const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);

	278

	279 while (count >= 4) {

	280 __m128i bgra = _mm_loadu_si128((const __m128i*) src);

	281 __m128i rgba = _mm_shuffle_epi8(bgra, swapRB);

	282 _mm_storeu_si128((__m128i*) dst, rgba);

	283

	284 src += 4;

	285 dst += 4;

	286 count -= 4;

	287 }

	288

	289 swaprb_xxxa_portable(dst, src, count);

	290 }

	291

177 #else	292 #else

178	293

179 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {	294 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {

180 premul_xxxa_portable(dst, src, count);	295 premul_xxxa_portable(dst, src, count);

181 }	296 }

182	297

183 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {	298 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {

184 premul_swaprb_xxxa_portable(dst, src, count);	299 premul_swaprb_xxxa_portable(dst, src, count);

185 }	300 }

186	301

187 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {	302 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {

188 swaprb_xxxa_portable(dst, src, count);	303 swaprb_xxxa_portable(dst, src, count);

189 }	304 }

190	305

191 #endif	306 #endif

192	307

193 }	308 }

194	309

195 #endif // SkSwizzler_opts_DEFINED	310 #endif // SkSwizzler_opts_DEFINED

OLD	NEW

« no previous file with comments | « src/opts/SkOpts_ssse3.cpp ('k') | no next file » | no next file with comments »