Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 /* | 1 /* |
| 2 * Copyright 2016 Google Inc. | 2 * Copyright 2016 Google Inc. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #ifndef SkSwizzler_opts_DEFINED | 8 #ifndef SkSwizzler_opts_DEFINED |
| 9 #define SkSwizzler_opts_DEFINED | 9 #define SkSwizzler_opts_DEFINED |
| 10 | 10 |
| 11 #include "SkColorPriv.h" | 11 #include "SkColorPriv.h" |
| 12 | 12 |
| 13 namespace SK_OPTS_NS { | 13 namespace SK_OPTS_NS { |
| 14 | 14 |
| 15 // These variable names in these functions just pretend the input is BGRA. | 15 // These variable names in these functions just pretend the input is BGRA. |
|
mtklein
2016/01/22 14:37:32
(or BGR)
msarett
2016/01/22 15:00:36
Done.
| |
| 16 // They work fine with both RGBA and BGRA. | 16 // They work fine with both RGBA and BGRA. |
|
mtklein
2016/01/22 14:37:32
(or both BGR and RGB).
msarett
2016/01/22 15:00:36
Done.
| |
| 17 | 17 |
| 18 static void premul_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) { | 18 static void premul_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) { |
|
mtklein
2016/01/22 14:37:33
Do you think we made things unnecessarily complica
mtklein
2016/01/22 14:44:04
Gonna write up a CL to demonstrate what I mean.
msarett
2016/01/22 15:00:36
Yes.
mtklein
2016/01/22 15:23:57
Oh, right, how about RGB_to_RGB1 / RGB_to_BGR1?
msarett
2016/01/22 17:27:23
Done.
| |
| 19 for (int i = 0; i < count; i++) { | 19 for (int i = 0; i < count; i++) { |
| 20 uint8_t a = src[i] >> 24, | 20 uint8_t a = src[i] >> 24, |
| 21 r = src[i] >> 16, | 21 r = src[i] >> 16, |
| 22 g = src[i] >> 8, | 22 g = src[i] >> 8, |
| 23 b = src[i] >> 0; | 23 b = src[i] >> 0; |
| 24 r = (r*a+127)/255; | 24 r = (r*a+127)/255; |
| 25 g = (g*a+127)/255; | 25 g = (g*a+127)/255; |
| 26 b = (b*a+127)/255; | 26 b = (b*a+127)/255; |
| 27 dst[i] = (uint32_t)a << 24 | 27 dst[i] = (uint32_t)a << 24 |
| 28 | (uint32_t)r << 16 | 28 | (uint32_t)r << 16 |
| (...skipping 24 matching lines...) Expand all Loading... | |
| 53 r = src[i] >> 16, | 53 r = src[i] >> 16, |
| 54 g = src[i] >> 8, | 54 g = src[i] >> 8, |
| 55 b = src[i] >> 0; | 55 b = src[i] >> 0; |
| 56 dst[i] = (uint32_t)a << 24 | 56 dst[i] = (uint32_t)a << 24 |
| 57 | (uint32_t)b << 16 | 57 | (uint32_t)b << 16 |
| 58 | (uint32_t)g << 8 | 58 | (uint32_t)g << 8 |
| 59 | (uint32_t)r << 0; | 59 | (uint32_t)r << 0; |
| 60 } | 60 } |
| 61 } | 61 } |
| 62 | 62 |
| 63 static void xxx_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) { | |
|
mtklein
2016/01/22 14:37:32
src is three-byte right, not some sort of RGBx? S
msarett
2016/01/22 15:00:36
Agreed. I made a similar comment in Patch Set 1.
mtklein
2016/01/22 15:23:57
Good. Was going to suggest that. :)
While they'r
msarett
2016/01/22 17:27:23
Done.
| |
| 64 int i8 = 0; | |
| 65 const uint8_t* src8 = (const uint8_t*) src; | |
| 66 for (int i32 = 0; i32 < count; i32++) { | |
| 67 uint8_t b = src8[i8++], | |
| 68 g = src8[i8++], | |
| 69 r = src8[i8++]; | |
| 70 dst[i32] = (uint32_t) b << 0 | |
|
mtklein
2016/01/22 14:37:32
Let's keep our order consistent with the rest of t
msarett
2016/01/22 15:00:36
Done.
| |
| 71 | (uint32_t) g << 8 | |
| 72 | (uint32_t) r << 16 | |
| 73 | (uint32_t)0xFF << 24; | |
| 74 } | |
| 75 } | |
| 76 | |
| 77 static void xxx_swaprb_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) { | |
| 78 int i8 = 0; | |
| 79 const uint8_t* src8 = (const uint8_t*) src; | |
| 80 for (int i32 = 0; i32 < count; i32++) { | |
| 81 uint8_t b = src8[i8++], | |
| 82 g = src8[i8++], | |
| 83 r = src8[i8++]; | |
| 84 dst[i32] = (uint32_t) r << 0 | |
| 85 | (uint32_t) g << 8 | |
| 86 | (uint32_t) b << 16 | |
| 87 | (uint32_t)0xFF << 24; | |
| 88 } | |
| 89 } | |
| 90 | |
| 63 #if defined(SK_ARM_HAS_NEON) | 91 #if defined(SK_ARM_HAS_NEON) |
| 64 | 92 |
| 65 // Rounded divide by 255, (x + 127) / 255 | 93 // Rounded divide by 255, (x + 127) / 255 |
| 66 static uint8x8_t div255_round(uint16x8_t x) { | 94 static uint8x8_t div255_round(uint16x8_t x) { |
| 67 // result = (x + 127) / 255 | 95 // result = (x + 127) / 255 |
| 68 // result = (x + 127) / 256 + error1 | 96 // result = (x + 127) / 256 + error1 |
| 69 // | 97 // |
| 70 // error1 = (x + 127) / (255 * 256) | 98 // error1 = (x + 127) / (255 * 256) |
| 71 // error1 = (x + 127) / (256 * 256) + error2 | 99 // error1 = (x + 127) / (256 * 256) + error2 |
| 72 // | 100 // |
| (...skipping 88 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 161 // Store 8 pixels. | 189 // Store 8 pixels. |
| 162 vst4_u8((uint8_t*) dst, bgra); | 190 vst4_u8((uint8_t*) dst, bgra); |
| 163 src += 8; | 191 src += 8; |
| 164 dst += 8; | 192 dst += 8; |
| 165 count -= 8; | 193 count -= 8; |
| 166 } | 194 } |
| 167 | 195 |
| 168 swaprb_xxxa_portable(dst, src, count); | 196 swaprb_xxxa_portable(dst, src, count); |
| 169 } | 197 } |
| 170 | 198 |
| 199 template <bool kSwapRB> | |
| 200 static void xxx_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) { | |
| 201 const uint8_t* src8 = (const uint8_t*) src; | |
| 202 while (count >= 16) { | |
| 203 // Load 16 pixels. | |
| 204 uint8x16x3_t bgr = vld3q_u8(src8); | |
| 205 | |
| 206 // Insert an opaque alpha channel and swap if needed. | |
| 207 uint8x16x4_t bgra; | |
| 208 if (kSwapRB) { | |
| 209 bgra.val[0] = bgr.val[2]; | |
| 210 bgra.val[2] = bgr.val[0]; | |
| 211 } else { | |
| 212 bgra.val[0] = bgr.val[0]; | |
| 213 bgra.val[2] = bgr.val[2]; | |
| 214 } | |
| 215 bgra.val[1] = bgr.val[1]; | |
| 216 bgra.val[3] = vdupq_n_u8(0xFF); | |
| 217 | |
| 218 // Store 16 pixels. | |
| 219 vst4q_u8((uint8_t*) dst, bgra); | |
| 220 src8 += 48; | |
|
mtklein
2016/01/22 14:37:33
might write this as += 16*3?
I find it really ple
msarett
2016/01/22 15:00:36
Done.
| |
| 221 dst += 16; | |
| 222 count -= 16; | |
| 223 } | |
| 224 | |
| 225 if (count >= 8) { | |
| 226 // Load 8 pixels. | |
| 227 uint8x8x3_t bgr = vld3_u8(src8); | |
| 228 | |
| 229 // Insert an opaque alpha channel and swap if needed. | |
| 230 uint8x8x4_t bgra; | |
| 231 if (kSwapRB) { | |
| 232 bgra.val[0] = bgr.val[2]; | |
| 233 bgra.val[2] = bgr.val[0]; | |
| 234 } else { | |
| 235 bgra.val[0] = bgr.val[0]; | |
| 236 bgra.val[2] = bgr.val[2]; | |
| 237 } | |
| 238 bgra.val[1] = bgr.val[1]; | |
| 239 bgra.val[3] = vdup_n_u8(0xFF); | |
| 240 | |
| 241 // Store 8 pixels. | |
| 242 vst4_u8((uint8_t*) dst, bgra); | |
| 243 src8 += 24; | |
| 244 dst += 8; | |
| 245 count -= 8; | |
| 246 } | |
| 247 | |
| 248 // Call portable code to finish up the tail of [0,8) pixels. | |
| 249 auto proc = kSwapRB ? xxx_swaprb_xxxa_portable : xxx_xxxa_portable; | |
| 250 proc(dst, (const uint32_t*) src8, count); | |
| 251 } | |
| 252 | |
| 253 static void xxx_xxxa(uint32_t dst[], const uint32_t src[], int count) { | |
| 254 xxx_xxxa_should_swaprb<false>(dst, src, count); | |
| 255 } | |
| 256 | |
| 257 static void xxx_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | |
| 258 xxx_xxxa_should_swaprb<true>(dst, src, count); | |
| 259 } | |
| 260 | |
| 171 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 | 261 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
| 172 | 262 |
| 173 template <bool kSwapRB> | 263 template <bool kSwapRB> |
| 174 static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) { | 264 static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) { |
| 175 | 265 |
| 176 auto premul8 = [](__m128i* lo, __m128i* hi) { | 266 auto premul8 = [](__m128i* lo, __m128i* hi) { |
| 177 const __m128i zeros = _mm_setzero_si128(); | 267 const __m128i zeros = _mm_setzero_si128(); |
| 178 const __m128i _128 = _mm_set1_epi16(128); | 268 const __m128i _128 = _mm_set1_epi16(128); |
| 179 const __m128i _257 = _mm_set1_epi16(257); | 269 const __m128i _257 = _mm_set1_epi16(257); |
| 180 __m128i planar; | 270 __m128i planar; |
| (...skipping 76 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 257 _mm_storeu_si128((__m128i*) dst, rgba); | 347 _mm_storeu_si128((__m128i*) dst, rgba); |
| 258 | 348 |
| 259 src += 4; | 349 src += 4; |
| 260 dst += 4; | 350 dst += 4; |
| 261 count -= 4; | 351 count -= 4; |
| 262 } | 352 } |
| 263 | 353 |
| 264 swaprb_xxxa_portable(dst, src, count); | 354 swaprb_xxxa_portable(dst, src, count); |
| 265 } | 355 } |
| 266 | 356 |
| 357 static void xxx_xxxa(uint32_t dst[], const uint32_t src[], int count) { | |
| 358 xxx_xxxa_portable(dst, src, count); | |
| 359 } | |
| 360 | |
| 361 static void xxx_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | |
| 362 xxx_swaprb_xxxa_portable(dst, src, count); | |
| 363 } | |
| 364 | |
| 267 #else | 365 #else |
| 268 | 366 |
| 269 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { | 367 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
| 270 premul_xxxa_portable(dst, src, count); | 368 premul_xxxa_portable(dst, src, count); |
| 271 } | 369 } |
| 272 | 370 |
| 273 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | 371 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
| 274 premul_swaprb_xxxa_portable(dst, src, count); | 372 premul_swaprb_xxxa_portable(dst, src, count); |
| 275 } | 373 } |
| 276 | 374 |
| 277 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | 375 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
| 278 swaprb_xxxa_portable(dst, src, count); | 376 swaprb_xxxa_portable(dst, src, count); |
| 279 } | 377 } |
| 280 | 378 |
| 379 static void xxx_xxxa(uint32_t dst[], const uint32_t src[], int count) { | |
| 380 xxx_xxxa_portable(dst, src, count); | |
| 381 } | |
| 382 | |
| 383 static void xxx_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | |
| 384 xxx_swaprb_xxxa_portable(dst, src, count); | |
| 385 } | |
| 386 | |
| 281 #endif | 387 #endif |
| 282 | 388 |
| 283 } | 389 } |
| 284 | 390 |
| 285 #endif // SkSwizzler_opts_DEFINED | 391 #endif // SkSwizzler_opts_DEFINED |
| OLD | NEW |