src/opts/SkSwizzler_opts.h - Issue 1601883002: Add SSSE3 Optimizations for premul and swap

Side by Side Diff: src/opts/SkSwizzler_opts.h

Issue 1601883002: Add SSSE3 Optimizations for premul and swap (Closed) Base URL: https://skia.googlesource.com/skia.git@f-and-x

Patch Set: Move constants into premul8 proc Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright 2016 Google Inc.	2 * Copyright 2016 Google Inc.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #ifndef SkSwizzler_opts_DEFINED	8 #ifndef SkSwizzler_opts_DEFINED

9 #define SkSwizzler_opts_DEFINED	9 #define SkSwizzler_opts_DEFINED

10	10

(...skipping 156 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
167 // Store 8 pixels.	167 // Store 8 pixels.

168 vst4_u8((uint8_t*) dst, bgra);	168 vst4_u8((uint8_t*) dst, bgra);

169 src += 8;	169 src += 8;

170 dst += 8;	170 dst += 8;

171 count -= 8;	171 count -= 8;

172 }	172 }

173	173

174 swaprb_xxxa_portable(dst, src, count);	174 swaprb_xxxa_portable(dst, src, count);

175 }	175 }

176	176

	177 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3

	178

	179 template <bool kSwapRB>

	180 static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) {

	181

	182 auto premul8 = [](__m128i* lo, __m128i* hi) {

	183 const __m128i zeros = _mm_setzero_si128();

	184 const __m128i _128 = _mm_set1_epi16(128);

	185 const __m128i _257 = _mm_set1_epi16(257);

	186 __m128i planar;

	187 if (kSwapRB) {

	188 planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);

	189 } else {

	190 planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);

	191 }

	192

	193 // Swizzle the pixels to 8-bit planar.

	194 lo = _mm_shuffle_epi8(lo, planar); // bbbbgggg rr rraaaa

	195 hi = _mm_shuffle_epi8(hi, planar); // BBBBGGGG RR RRAAAA

	196 __m128i bg = _mm_unpacklo_epi32(lo, hi), // bbbbBBBB gg ggGGGG

	197 ra = _mm_unpackhi_epi32(lo, hi); // rrrrRRRR aa aaAAAA

	198

	199 // Unpack to 16-bit planar.

	200 __m128i b = _mm_unpacklo_epi8(bg, zeros), // b_b_b_b_ B_ B_B_B_

	201 g = _mm_unpackhi_epi8(bg, zeros), // g_g_g_g_ G_ G_G_G_

	202 r = _mm_unpacklo_epi8(ra, zeros), // r_r_r_r_ R_ R_R_R_

	203 a = _mm_unpackhi_epi8(ra, zeros); // a_a_a_a_ A_ A_A_A_

	204

	205 // Premultiply! (x+127)/255 == ((x+128)257)>>16 for 0 <= x <= 255255.

	206 b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(b, a), _128), _257);

	207 g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(g, a), _128), _257);

	208 r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(r, a), _128), _257);

	209

	210 // Repack into interlaced pixels.

	211 bg = _mm_or_si128(b, _mm_slli_epi16(g, 8)); // bgbgbgbg BG BGBGBG

	212 ra = _mm_or_si128(r, _mm_slli_epi16(a, 8)); // rararara RA RARARA

	213 *lo = _mm_unpacklo_epi16(bg, ra); // bgrabgra bg rabgra

	214 *hi = _mm_unpackhi_epi16(bg, ra); // BRGABGRA BG RABGRA

	215 };

	216

	217 while (count >= 8) {

	218 __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),

	219 hi = _mm_loadu_si128((const __m128i*) (src + 4));

	220

	221 premul8(&lo, &hi);

	222

	223 _mm_storeu_si128((__m128i*) (dst + 0), lo);

	224 _mm_storeu_si128((__m128i*) (dst + 4), hi);

	225

	226 src += 8;

	227 dst += 8;

	228 count -= 8;

	229 }

	230

	231 if (count >= 4) {

	232 __m128i lo = _mm_loadu_si128((const __m128i*) src),

	233 hi = _mm_setzero_si128();

	234

	235 premul8(&lo, &hi);

	236

	237 _mm_storeu_si128((__m128i*) dst, lo);

	238

	239 src += 4;

	240 dst += 4;

	241 count -= 4;

	242 }

	243

	244 // Call portable code to finish up the tail of [0,4) pixels.

	245 auto proc = kSwapRB ? premul_swaprb_xxxa_portable : premul_xxxa_portable;

	246 proc(dst, src, count);

	247 }

	248

	249 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {

	250 premul_xxxa_should_swaprb<false>(dst, src, count);

	251 }

	252

	253 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {

	254 premul_xxxa_should_swaprb<true>(dst, src, count);

	255 }

	256

	257 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {

	258 const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,1 5);

	259

	260 while (count >= 4) {

	261 __m128i bgra = _mm_loadu_si128((const __m128i*) src);

	262 __m128i rgba = _mm_shuffle_epi8(bgra, swapRB);

	263 _mm_storeu_si128((__m128i*) dst, rgba);

	264

	265 src += 4;

	266 dst += 4;

	267 count -= 4;

	268 }

	269

	270 swaprb_xxxa_portable(dst, src, count);

	271 }

	272

177 #else	273 #else

178	274

179 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {	275 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {

180 premul_xxxa_portable(dst, src, count);	276 premul_xxxa_portable(dst, src, count);

181 }	277 }

182	278

183 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {	279 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {

184 premul_swaprb_xxxa_portable(dst, src, count);	280 premul_swaprb_xxxa_portable(dst, src, count);

185 }	281 }

186	282

187 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {	283 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {

188 swaprb_xxxa_portable(dst, src, count);	284 swaprb_xxxa_portable(dst, src, count);

189 }	285 }

190	286

191 #endif	287 #endif

192	288

193 }	289 }

194	290

195 #endif // SkSwizzler_opts_DEFINED	291 #endif // SkSwizzler_opts_DEFINED

OLD	NEW

« no previous file with comments | « src/opts/SkOpts_ssse3.cpp ('k') | no next file » | no next file with comments »