Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 /* | 1 /* |
| 2 * Copyright 2016 Google Inc. | 2 * Copyright 2016 Google Inc. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #ifndef SkSwizzler_opts_DEFINED | 8 #ifndef SkSwizzler_opts_DEFINED |
| 9 #define SkSwizzler_opts_DEFINED | 9 #define SkSwizzler_opts_DEFINED |
| 10 | 10 |
| (...skipping 156 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 167 // Store 8 pixels. | 167 // Store 8 pixels. |
| 168 vst4_u8((uint8_t*) dst, bgra); | 168 vst4_u8((uint8_t*) dst, bgra); |
| 169 src += 8; | 169 src += 8; |
| 170 dst += 8; | 170 dst += 8; |
| 171 count -= 8; | 171 count -= 8; |
| 172 } | 172 } |
| 173 | 173 |
| 174 swaprb_xxxa_portable(dst, src, count); | 174 swaprb_xxxa_portable(dst, src, count); |
| 175 } | 175 } |
| 176 | 176 |
| 177 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 | |
| 178 | |
| 179 template <bool kSwapRB> | |
| 180 static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) { | |
| 181 const __m128i zeros = _mm_setzero_si128(); | |
| 182 const __m128i _128 = _mm_set1_epi16(128); | |
| 183 const __m128i _257 = _mm_set1_epi16(257); | |
| 184 __m128i planar; | |
| 185 if (kSwapRB) { | |
| 186 planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15); | |
| 187 } else { | |
| 188 planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15); | |
| 189 } | |
| 190 | |
| 191 auto premul8 = [&zeros, &_128, &_257, &planar](__m128i* lo, __m128i* hi) { | |
|
mtklein
2016/01/19 20:15:02
The comments inside the while loop are now probabl
mtklein
2016/01/19 20:15:02
Just out of curiosity, what happens to the codegen
msarett
2016/01/19 21:02:39
Codegen is unaffected by moving the constants insi
| |
| 192 // Swizzle the pixels to 8-bit planar. | |
| 193 *lo = _mm_shuffle_epi8(*lo, planar); // bbbbgggg rrrraaaa | |
| 194 *hi = _mm_shuffle_epi8(*hi, planar); // BBBBGGGG RRRRAAAA | |
| 195 __m128i bg = _mm_unpacklo_epi32(*lo, *hi), // bbbbBBBB ggggGGGG | |
| 196 ra = _mm_unpackhi_epi32(*lo, *hi); // rrrrRRRR aaaaAAAA | |
| 197 | |
| 198 // Unpack to 16-bit planar. | |
| 199 __m128i b = _mm_unpacklo_epi8(bg, zeros), // b_b_b_b_ B_B_B_B_ | |
| 200 g = _mm_unpackhi_epi8(bg, zeros), // g_g_g_g_ G_G_G_G_ | |
| 201 r = _mm_unpacklo_epi8(ra, zeros), // r_r_r_r_ R_R_R_R_ | |
| 202 a = _mm_unpackhi_epi8(ra, zeros); // a_a_a_a_ A_A_A_A_ | |
| 203 | |
| 204 // Premultiply! (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255. | |
| 205 b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(b, a), _128), _257); | |
| 206 g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(g, a), _128), _257); | |
| 207 r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(r, a), _128), _257); | |
| 208 | |
| 209 // Repack into interlaced pixels. | |
| 210 bg = _mm_or_si128(b, _mm_slli_epi16(g, 8)); // bgbgbgbg BGBGBGBG | |
| 211 ra = _mm_or_si128(r, _mm_slli_epi16(a, 8)); // rararara RARARARA | |
| 212 *lo = _mm_unpacklo_epi16(bg, ra); // bgrabgra bgrabgra | |
| 213 *hi = _mm_unpackhi_epi16(bg, ra); // BGRABGRA BGRABGRA | |
| 214 }; | |
| 215 | |
| 216 while (count >= 8) { | |
| 217 // First just load the 8 interlaced pixels. | |
| 218 __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)), // bgrabgra bgrabgra | |
| 219 hi = _mm_loadu_si128((const __m128i*) (src + 4)); // BGRABGRA BGRABGRA | |
| 220 | |
| 221 premul8(&lo, &hi); | |
| 222 | |
| 223 // Store interlaced pixels. | |
| 224 _mm_storeu_si128((__m128i*) (dst + 0), lo); | |
| 225 _mm_storeu_si128((__m128i*) (dst + 4), hi); | |
| 226 | |
| 227 src += 8; | |
| 228 dst += 8; | |
| 229 count -= 8; | |
| 230 } | |
| 231 | |
| 232 if (count >= 4) { | |
| 233 // First just load 4 interlaced pixels. | |
| 234 __m128i lo = _mm_loadu_si128((const __m128i*) src), // bgrabgra bgrabgra | |
| 235 hi = _mm_setzero_si128(); | |
| 236 | |
| 237 premul8(&lo, &hi); | |
| 238 | |
| 239 // Store interlaced pixels. | |
| 240 _mm_storeu_si128((__m128i*) dst, lo); | |
| 241 | |
| 242 src += 4; | |
| 243 dst += 4; | |
| 244 count -= 4; | |
| 245 } | |
| 246 | |
| 247 // Call portable code to finish up the tail of [0,4) pixels. | |
| 248 auto proc = kSwapRB ? premul_swaprb_xxxa_portable : premul_xxxa_portable; | |
| 249 proc(dst, src, count); | |
| 250 } | |
| 251 | |
| 252 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { | |
| 253 premul_xxxa_should_swaprb<false>(dst, src, count); | |
| 254 } | |
| 255 | |
| 256 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | |
| 257 premul_xxxa_should_swaprb<true>(dst, src, count); | |
| 258 } | |
| 259 | |
| 260 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | |
| 261 const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15); | |
| 262 | |
| 263 while (count >= 4) { | |
| 264 __m128i bgra = _mm_loadu_si128((const __m128i*) src); | |
| 265 __m128i rgba = _mm_shuffle_epi8(bgra, swapRB); | |
| 266 _mm_storeu_si128((__m128i*) dst, rgba); | |
| 267 | |
| 268 src += 4; | |
| 269 dst += 4; | |
| 270 count -= 4; | |
| 271 } | |
| 272 | |
| 273 swaprb_xxxa_portable(dst, src, count); | |
| 274 } | |
| 275 | |
| 177 #else | 276 #else |
| 178 | 277 |
| 179 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { | 278 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
| 180 premul_xxxa_portable(dst, src, count); | 279 premul_xxxa_portable(dst, src, count); |
| 181 } | 280 } |
| 182 | 281 |
| 183 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | 282 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
| 184 premul_swaprb_xxxa_portable(dst, src, count); | 283 premul_swaprb_xxxa_portable(dst, src, count); |
| 185 } | 284 } |
| 186 | 285 |
| 187 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | 286 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
| 188 swaprb_xxxa_portable(dst, src, count); | 287 swaprb_xxxa_portable(dst, src, count); |
| 189 } | 288 } |
| 190 | 289 |
| 191 #endif | 290 #endif |
| 192 | 291 |
| 193 } | 292 } |
| 194 | 293 |
| 195 #endif // SkSwizzler_opts_DEFINED | 294 #endif // SkSwizzler_opts_DEFINED |
| OLD | NEW |