Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 /* | 1 /* |
| 2 * Copyright 2016 Google Inc. | 2 * Copyright 2016 Google Inc. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #ifndef SkSwizzler_opts_DEFINED | 8 #ifndef SkSwizzler_opts_DEFINED |
| 9 #define SkSwizzler_opts_DEFINED | 9 #define SkSwizzler_opts_DEFINED |
| 10 | 10 |
| (...skipping 156 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 167 // Store 8 pixels. | 167 // Store 8 pixels. |
| 168 vst4_u8((uint8_t*) dst, bgra); | 168 vst4_u8((uint8_t*) dst, bgra); |
| 169 src += 8; | 169 src += 8; |
| 170 dst += 8; | 170 dst += 8; |
| 171 count -= 8; | 171 count -= 8; |
| 172 } | 172 } |
| 173 | 173 |
| 174 swaprb_xxxa_portable(dst, src, count); | 174 swaprb_xxxa_portable(dst, src, count); |
| 175 } | 175 } |
| 176 | 176 |
| 177 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 | |
| 178 | |
| 179 template <bool kSwapRB> | |
| 180 static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) { | |
| 181 const __m128i zeros = _mm_setzero_si128(); | |
| 182 const __m128i _128 = _mm_set1_epi16(128); | |
| 183 const __m128i _257 = _mm_set1_epi16(257); | |
| 184 __m128i planar; | |
| 185 if (kSwapRB) { | |
| 186 planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15); | |
| 187 } else { | |
| 188 planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15); | |
| 189 } | |
| 190 | |
| 191 while (count >= 8) { | |
| 192 // We'll load 8 pixels into 4 registers, each holding a 16-bit component plane. | |
| 193 | |
| 194 // First just load the 8 interlaced pixels. | |
| 195 __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)), // bgrabgra bg rabgra | |
| 196 hi = _mm_loadu_si128((const __m128i*) (src + 4)); // BGRABGRA BG RABGRA | |
| 197 | |
| 198 // Swizzle them to 8-bit planar. | |
| 199 lo = _mm_shuffle_epi8(lo, planar); // bbbbgggg rr rraaaa | |
| 200 hi = _mm_shuffle_epi8(hi, planar); // BBBBGGGG RR RRAAAA | |
| 201 __m128i bg = _mm_unpacklo_epi32(lo, hi), // bbbbBBBB gg ggGGGG | |
| 202 ra = _mm_unpackhi_epi32(lo, hi); // rrrrRRRR aa aaAAAA | |
| 203 | |
| 204 // Unpack to 16-bit planar. | |
| 205 __m128i b = _mm_unpacklo_epi8(bg, zeros), // b_b_b_b_ B_ B_B_B_ | |
| 206 g = _mm_unpackhi_epi8(bg, zeros), // g_g_g_g_ G_ G_G_G_ | |
| 207 r = _mm_unpacklo_epi8(ra, zeros), // r_r_r_r_ R_ R_R_R_ | |
| 208 a = _mm_unpackhi_epi8(ra, zeros); // a_a_a_a_ A_ A_A_A_ | |
| 209 | |
| 210 // Premultiply! (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255. | |
| 211 b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(b, a), _128), _257); | |
| 212 g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(g, a), _128), _257); | |
| 213 r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(r, a), _128), _257); | |
| 214 | |
| 215 // Repack into interlaced pixels. | |
| 216 bg = _mm_or_si128(b, _mm_slli_epi16(g, 8)); // bgbgbgbg BG BGBGBG | |
| 217 ra = _mm_or_si128(r, _mm_slli_epi16(a, 8)); // rararara RA RARARA | |
| 218 lo = _mm_unpacklo_epi16(bg, ra); // bgrabgra bg rabgra | |
| 219 hi = _mm_unpackhi_epi16(bg, ra); // BGRABGRA BG RABGRA | |
| 220 | |
| 221 // Store interlaced pixels. | |
| 222 _mm_storeu_si128((__m128i*) (dst + 0), lo); | |
| 223 _mm_storeu_si128((__m128i*) (dst + 4), hi); | |
| 224 | |
| 225 src += 8; | |
| 226 dst += 8; | |
| 227 count -= 8; | |
| 228 } | |
| 229 | |
| 230 if (count >= 4) { | |
|
mtklein
2016/01/19 18:28:30
OK, now that we've got count >= 8 in shape, let's
msarett
2016/01/19 19:17:43
Done.
| |
| 231 // First just load 4 interlaced pixels. | |
| 232 __m128i lo = _mm_loadu_si128((const __m128i*) src); // bgrabgra bg rabgra | |
| 233 | |
| 234 // Swizzle them to 8-bit planar. | |
| 235 lo = _mm_shuffle_epi8(lo, planar); // bbbbgggg rr rraaaa | |
| 236 __m128i bg = _mm_unpacklo_epi32(lo, zeros), // bbbb____ gg gg____ | |
| 237 ra = _mm_unpackhi_epi32(lo, zeros); // rrrr____ aa aa____ | |
| 238 | |
| 239 // Unpack to 16-bit planar. | |
| 240 __m128i b = _mm_unpacklo_epi8(bg, zeros), // b_b_b_b_ __ ______ | |
| 241 g = _mm_unpackhi_epi8(bg, zeros), // g_g_g_g_ __ ______ | |
| 242 r = _mm_unpacklo_epi8(ra, zeros), // r_r_r_r_ __ ______ | |
| 243 a = _mm_unpackhi_epi8(ra, zeros); // a_a_a_a_ __ ______ | |
| 244 | |
| 245 // Premultiply! (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255. | |
| 246 b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(b, a), _128), _257); | |
| 247 g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(g, a), _128), _257); | |
| 248 r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(r, a), _128), _257); | |
| 249 | |
| 250 // Repack into interlaced pixels. | |
| 251 bg = _mm_or_si128(b, _mm_slli_epi16(g, 8)); // bgbgbgbg __ ______ | |
| 252 ra = _mm_or_si128(r, _mm_slli_epi16(a, 8)); // rararara __ ______ | |
| 253 lo = _mm_unpacklo_epi16(bg, ra); // bgrabgra bg rabgra | |
| 254 | |
| 255 // Store interlaced pixels. | |
| 256 _mm_storeu_si128((__m128i*) dst, lo); | |
| 257 | |
| 258 src += 4; | |
| 259 dst += 4; | |
| 260 count -= 4; | |
| 261 } | |
| 262 | |
| 263 // Call portable code to finish up the tail of [0,4) pixels. | |
| 264 auto proc = kSwapRB ? premul_swaprb_xxxa_portable : premul_xxxa_portable; | |
| 265 proc(dst, src, count); | |
| 266 } | |
| 267 | |
| 268 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { | |
| 269 premul_xxxa_should_swaprb<false>(dst, src, count); | |
| 270 } | |
| 271 | |
| 272 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | |
| 273 premul_xxxa_should_swaprb<true>(dst, src, count); | |
| 274 } | |
| 275 | |
| 276 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | |
| 277 const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15); | |
| 278 | |
| 279 while (count >= 4) { | |
| 280 __m128i bgra = _mm_loadu_si128((const __m128i*) src); | |
| 281 __m128i rgba = _mm_shuffle_epi8(bgra, swapRB); | |
| 282 _mm_storeu_si128((__m128i*) dst, rgba); | |
| 283 | |
| 284 src += 4; | |
| 285 dst += 4; | |
| 286 count -= 4; | |
| 287 } | |
| 288 | |
| 289 swaprb_xxxa_portable(dst, src, count); | |
| 290 } | |
| 291 | |
| 177 #else | 292 #else |
| 178 | 293 |
| 179 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { | 294 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
| 180 premul_xxxa_portable(dst, src, count); | 295 premul_xxxa_portable(dst, src, count); |
| 181 } | 296 } |
| 182 | 297 |
| 183 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | 298 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
| 184 premul_swaprb_xxxa_portable(dst, src, count); | 299 premul_swaprb_xxxa_portable(dst, src, count); |
| 185 } | 300 } |
| 186 | 301 |
| 187 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | 302 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
| 188 swaprb_xxxa_portable(dst, src, count); | 303 swaprb_xxxa_portable(dst, src, count); |
| 189 } | 304 } |
| 190 | 305 |
| 191 #endif | 306 #endif |
| 192 | 307 |
| 193 } | 308 } |
| 194 | 309 |
| 195 #endif // SkSwizzler_opts_DEFINED | 310 #endif // SkSwizzler_opts_DEFINED |
| OLD | NEW |