Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 /* | 1 /* |
| 2 * Copyright 2016 Google Inc. | 2 * Copyright 2016 Google Inc. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #ifndef SkSwizzler_opts_DEFINED | 8 #ifndef SkSwizzler_opts_DEFINED |
| 9 #define SkSwizzler_opts_DEFINED | 9 #define SkSwizzler_opts_DEFINED |
| 10 | 10 |
| (...skipping 156 matching lines...) | |
| 167 // Store 8 pixels. | 167 // Store 8 pixels. |
| 168 vst4_u8((uint8_t*) dst, bgra); | 168 vst4_u8((uint8_t*) dst, bgra); |
| 169 src += 8; | 169 src += 8; |
| 170 dst += 8; | 170 dst += 8; |
| 171 count -= 8; | 171 count -= 8; |
| 172 } | 172 } |
| 173 | 173 |
| 174 swaprb_xxxa_portable(dst, src, count); | 174 swaprb_xxxa_portable(dst, src, count); |
| 175 } | 175 } |
| 176 | 176 |
| 177 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 | |
| 178 | |
| 179 template <bool kSwapRB> | |
| 180 static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) { | |
|
msarett
2016/01/18 20:35:05
There are a lot of different ways to implement thi
| |
| 181 const __m128i zeros = _mm_setzero_si128(); | |
| 182 const __m128i _128 = _mm_set1_epi16(128); | |
| 183 const __m128i _257 = _mm_set1_epi16(257); | |
| 184 const __m128i combine = _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1 , 12, 8, 4, 0); | |
| 185 __m128i split; | |
| 186 if (kSwapRB) { | |
| 187 split = _mm_set_epi8(15, 3, 7, 11, 14, 2, 6, 10, 13, 1, 5, 9, 12, 0, 4, 8); | |
| 188 } else { | |
| 189 split = combine; | |
| 190 } | |
| 191 | |
| 192 while (count >= 8) { | |
| 193 __m128i argb_lo = _mm_loadu_si128((const __m128i*) src); | |
| 194 __m128i argb_hi = _mm_loadu_si128((const __m128i*) (src + 4)); | |
| 195 | |
| 196 // argb_argb_argb_argb -> aaaa_rrrr_gggg_bbbb | |
|
mtklein
2016/01/19 15:59:14
Let's kick some of these comments a little bit hig
msarett
2016/01/19 17:34:38
Done.
Ugggh, for some reason I thought the rest o
| |
| 197 argb_lo = _mm_shuffle_epi8(argb_lo, combine); | |
| 198 argb_hi = _mm_shuffle_epi8(argb_hi, combine); | |
| 199 | |
| 200 // aaaa_rrrr_gggg_bbbb -> aaaa_aaaa_rrrr_rrrr | |
| 201 __m128i ar = _mm_unpackhi_epi32(argb_lo, argb_hi); | |
| 202 // aaaa_rrrr_gggg_bbbb -> gggg_gggg_bbbb_bbbb | |
| 203 __m128i gb = _mm_unpacklo_epi32(argb_lo, argb_hi); | |
| 204 | |
| 205 // xxxx_xxxx_yyyy_yyyy -> 0x0x_0x0x_0x0x_0x0x | |
| 206 // xxxx_xxxx_yyyy_yyyy -> 0y0y_0y0y_0y0y_0y0y | |
| 207 __m128i a = _mm_unpackhi_epi8(ar, zeros); | |
| 208 __m128i r = _mm_unpacklo_epi8(ar, zeros); | |
| 209 __m128i g = _mm_unpackhi_epi8(gb, zeros); | |
| 210 __m128i b = _mm_unpacklo_epi8(gb, zeros); | |
| 211 | |
| 212 // (x + 127) / 255 == ((x + 128) * 257) >> 16 for 0 <= x <= 255 * 255 | |
|
msarett
2016/01/18 20:35:05
Thanks to Mike for this insight.
| |
| 213 // Note that _mm_mulhi_epu16 performs the entire (y * 257) >> 16. | |
| 214 r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(a, r), _128), _257); | |
|
mtklein
2016/01/19 15:59:14
This may be a matter of personal preference, but y
msarett
2016/01/19 17:34:38
Leaving as is, though I'm kind of indifferent.
I
| |
| 215 g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(a, g), _128), _257); | |
| 216 b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(a, b), _128), _257); | |
| 217 | |
| 218 // aaaa_rrrr_aaaa_rrrr | |
|
mtklein
2016/01/19 15:59:14
I think we can do this repacking as something like
msarett
2016/01/19 17:34:37
Yes this is better!
Let's even swap BR in the "sw
| |
| 219 ar = _mm_shuffle_epi32(_mm_packus_epi16(r, a), 0xD8); | |
| 220 // gggg_bbbb_gggg_bbbb | |
| 221 gb = _mm_shuffle_epi32(_mm_packus_epi16(b, g), 0xD8); | |
| 222 | |
| 223 // aaaa_rrrr_gggg_bbbb | |
| 224 argb_lo = _mm_unpacklo_epi64(gb, ar); | |
| 225 argb_hi = _mm_unpackhi_epi64(gb, ar); | |
| 226 | |
| 227 // aaaa_rrrr_gggg_bbbb -> argb_argb_argb_argb | |
| 228 argb_lo = _mm_shuffle_epi8(argb_lo, split); | |
| 229 argb_hi = _mm_shuffle_epi8(argb_hi, split); | |
| 230 | |
| 231 _mm_storeu_si128((__m128i*) dst, argb_lo); | |
| 232 _mm_storeu_si128((__m128i*) (dst + 4), argb_hi); | |
| 233 | |
| 234 src += 8; | |
| 235 dst += 8; | |
| 236 count -= 8; | |
| 237 } | |
| 238 | |
| 239 if (count >= 4) { | |
|
mtklein
2016/01/19 15:59:14
Reminder to self to circle back here when we're ha
| |
| 240 __m128i argb = _mm_loadu_si128((const __m128i*) src); | |
| 241 | |
| 242 // argb_argb_argb_argb -> aaaa_rrrr_gggg_bbbb | |
| 243 argb = _mm_shuffle_epi8(argb, combine); | |
| 244 | |
| 245 // aaaa_rrrr_gggg_bbbb -> 0000_aaaa_0000_rrrr | |
| 246 __m128i ar = _mm_unpackhi_epi32(argb, zeros); | |
| 247 // aaaa_rrrr_gggg_bbbb -> 0000_gggg_0000_bbbb | |
| 248 __m128i gb = _mm_unpacklo_epi32(argb, zeros); | |
| 249 | |
| 250 // 0000_xxxx_0000_yyyy -> 0000_0000_0x0x_0x0x | |
| 251 // 0000_xxxx_0000_yyyy -> 0000_0000_0y0y_0y0y | |
| 252 __m128i a = _mm_unpackhi_epi8(ar, zeros); | |
| 253 __m128i r = _mm_unpacklo_epi8(ar, zeros); | |
| 254 __m128i g = _mm_unpackhi_epi8(gb, zeros); | |
| 255 __m128i b = _mm_unpacklo_epi8(gb, zeros); | |
| 256 | |
| 257 // (x + 127) / 255 == ((x + 128) * 257) >> 16 for 0 <= x <= 255 * 255 | |
| 258 // Note that _mm_mulhi_epu16 performs the entire (y * 257) >> 16. | |
| 259 r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(a, r), _128), _257); | |
| 260 g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(a, g), _128), _257); | |
| 261 b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(a, b), _128), _257); | |
| 262 | |
| 263 // aaaa_rrrr_0000_0000 | |
| 264 ar = _mm_shuffle_epi32(_mm_packus_epi16(r, a), 0x8F); | |
| 265 // 0000_0000_gggg_bbbb | |
| 266 gb = _mm_shuffle_epi32(_mm_packus_epi16(b, g), 0xD8); | |
| 267 | |
| 268 // aaaa_rrrr_gggg_bbbb | |
| 269 argb = _mm_or_si128(ar, gb); | |
| 270 | |
| 271 // aaaa_rrrr_gggg_bbbb -> argb_argb_argb_argb | |
| 272 argb = _mm_shuffle_epi8(argb, split); | |
| 273 | |
| 274 _mm_storeu_si128((__m128i*) dst, argb); | |
| 275 | |
| 276 src += 4; | |
| 277 dst += 4; | |
| 278 count -= 4; | |
| 279 } | |
| 280 | |
| 281 // Call portable code to finish up the tail of [0,4) pixels. | |
| 282 auto proc = kSwapRB ? premul_swaprb_xxxa_portable : premul_xxxa_portable; | |
| 283 proc(dst, src, count); | |
| 284 } | |
| 285 | |
| 286 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { | |
| 287 premul_xxxa_should_swaprb<false>(dst, src, count); | |
| 288 } | |
| 289 | |
| 290 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | |
| 291 premul_xxxa_should_swaprb<true>(dst, src, count); | |
| 292 } | |
| 293 | |
| 294 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | |
| 295 const __m128i swapRB = _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6 , 3, 0, 1, 2); | |
|
mtklein
2016/01/19 15:59:15
I often find it's easier to read these if you use
msarett
2016/01/19 17:34:37
I think you're right.
| |
| 296 | |
| 297 while (count >= 4) { | |
| 298 __m128i argb = _mm_loadu_si128((const __m128i*) src); | |
| 299 __m128i abgr = _mm_shuffle_epi8(argb, swapRB); | |
| 300 _mm_storeu_si128((__m128i*) dst, abgr); | |
| 301 | |
| 302 src += 4; | |
| 303 dst += 4; | |
| 304 count -= 4; | |
| 305 } | |
| 306 | |
| 307 swaprb_xxxa_portable(dst, src, count); | |
| 308 } | |
| 309 | |
| 177 #else | 310 #else |
| 178 | 311 |
| 179 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { | 312 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
| 180 premul_xxxa_portable(dst, src, count); | 313 premul_xxxa_portable(dst, src, count); |
| 181 } | 314 } |
| 182 | 315 |
| 183 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | 316 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
| 184 premul_swaprb_xxxa_portable(dst, src, count); | 317 premul_swaprb_xxxa_portable(dst, src, count); |
| 185 } | 318 } |
| 186 | 319 |
| 187 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | 320 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
| 188 swaprb_xxxa_portable(dst, src, count); | 321 swaprb_xxxa_portable(dst, src, count); |
| 189 } | 322 } |
| 190 | 323 |
| 191 #endif | 324 #endif |
| 192 | 325 |
| 193 } | 326 } |
| 194 | 327 |
| 195 #endif // SkSwizzler_opts_DEFINED | 328 #endif // SkSwizzler_opts_DEFINED |
| OLD | NEW |