| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2016 Google Inc. | 2 * Copyright 2016 Google Inc. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #ifndef SkSwizzler_opts_DEFINED | 8 #ifndef SkSwizzler_opts_DEFINED |
| 9 #define SkSwizzler_opts_DEFINED | 9 #define SkSwizzler_opts_DEFINED |
| 10 | 10 |
| (...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 53 b = src[i] >> 16, | 53 b = src[i] >> 16, |
| 54 g = src[i] >> 8, | 54 g = src[i] >> 8, |
| 55 r = src[i] >> 0; | 55 r = src[i] >> 0; |
| 56 dst[i] = (uint32_t)a << 24 | 56 dst[i] = (uint32_t)a << 24 |
| 57 | (uint32_t)r << 16 | 57 | (uint32_t)r << 16 |
| 58 | (uint32_t)g << 8 | 58 | (uint32_t)g << 8 |
| 59 | (uint32_t)b << 0; | 59 | (uint32_t)b << 0; |
| 60 } | 60 } |
| 61 } | 61 } |
| 62 | 62 |
| 63 static void RGB_to_RGB1_portable(uint32_t dst[], const void* vsrc, int count) { |
| 64 const uint8_t* src = (const uint8_t*)vsrc; |
| 65 for (int i = 0; i < count; i++) { |
| 66 uint8_t r = src[0], |
| 67 g = src[1], |
| 68 b = src[2]; |
| 69 src += 3; |
| 70 dst[i] = (uint32_t)0xFF << 24 |
| 71 | (uint32_t)b << 16 |
| 72 | (uint32_t)g << 8 |
| 73 | (uint32_t)r << 0; |
| 74 } |
| 75 } |
| 76 |
| 77 static void RGB_to_BGR1_portable(uint32_t dst[], const void* vsrc, int count) { |
| 78 const uint8_t* src = (const uint8_t*)vsrc; |
| 79 for (int i = 0; i < count; i++) { |
| 80 uint8_t r = src[0], |
| 81 g = src[1], |
| 82 b = src[2]; |
| 83 src += 3; |
| 84 dst[i] = (uint32_t)0xFF << 24 |
| 85 | (uint32_t)r << 16 |
| 86 | (uint32_t)g << 8 |
| 87 | (uint32_t)b << 0; |
| 88 } |
| 89 } |
| 90 |
| 63 #if defined(SK_ARM_HAS_NEON) | 91 #if defined(SK_ARM_HAS_NEON) |
| 64 | 92 |
| 65 // Rounded divide by 255, (x + 127) / 255 | 93 // Rounded divide by 255, (x + 127) / 255 |
| 66 static uint8x8_t div255_round(uint16x8_t x) { | 94 static uint8x8_t div255_round(uint16x8_t x) { |
| 67 // result = (x + 127) / 255 | 95 // result = (x + 127) / 255 |
| 68 // result = (x + 127) / 256 + error1 | 96 // result = (x + 127) / 256 + error1 |
| 69 // | 97 // |
| 70 // error1 = (x + 127) / (255 * 256) | 98 // error1 = (x + 127) / (255 * 256) |
| 71 // error1 = (x + 127) / (256 * 256) + error2 | 99 // error1 = (x + 127) / (256 * 256) + error2 |
| 72 // | 100 // |
| (...skipping 16 matching lines...) Expand all Loading... |
| 89 // Scale a byte by another, (x * y + 127) / 255 | 117 // Scale a byte by another, (x * y + 127) / 255 |
| 90 static uint8x8_t scale(uint8x8_t x, uint8x8_t y) { | 118 static uint8x8_t scale(uint8x8_t x, uint8x8_t y) { |
| 91 return div255_round(vmull_u8(x, y)); | 119 return div255_round(vmull_u8(x, y)); |
| 92 } | 120 } |
| 93 | 121 |
| 94 template <bool kSwapRB> | 122 template <bool kSwapRB> |
| 95 static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) { | 123 static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) { |
| 96 auto src = (const uint32_t*)vsrc; | 124 auto src = (const uint32_t*)vsrc; |
| 97 while (count >= 8) { | 125 while (count >= 8) { |
| 98 // Load 8 pixels. | 126 // Load 8 pixels. |
| 99 uint8x8x4_t bgra = vld4_u8((const uint8_t*) src); | 127 uint8x8x4_t rgba = vld4_u8((const uint8_t*) src); |
| 100 | 128 |
| 101 uint8x8_t a = bgra.val[3], | 129 uint8x8_t a = rgba.val[3], |
| 102 b = bgra.val[2], | 130 b = rgba.val[2], |
| 103 g = bgra.val[1], | 131 g = rgba.val[1], |
| 104 r = bgra.val[0]; | 132 r = rgba.val[0]; |
| 105 | 133 |
| 106 // Premultiply. | 134 // Premultiply. |
| 107 b = scale(b, a); | 135 b = scale(b, a); |
| 108 g = scale(g, a); | 136 g = scale(g, a); |
| 109 r = scale(r, a); | 137 r = scale(r, a); |
| 110 | 138 |
| 111 // Store 8 premultiplied pixels. | 139 // Store 8 premultiplied pixels. |
| 112 if (kSwapRB) { | 140 if (kSwapRB) { |
| 113 bgra.val[2] = r; | 141 rgba.val[2] = r; |
| 114 bgra.val[1] = g; | 142 rgba.val[1] = g; |
| 115 bgra.val[0] = b; | 143 rgba.val[0] = b; |
| 116 } else { | 144 } else { |
| 117 bgra.val[2] = b; | 145 rgba.val[2] = b; |
| 118 bgra.val[1] = g; | 146 rgba.val[1] = g; |
| 119 bgra.val[0] = r; | 147 rgba.val[0] = r; |
| 120 } | 148 } |
| 121 vst4_u8((uint8_t*) dst, bgra); | 149 vst4_u8((uint8_t*) dst, rgba); |
| 122 src += 8; | 150 src += 8; |
| 123 dst += 8; | 151 dst += 8; |
| 124 count -= 8; | 152 count -= 8; |
| 125 } | 153 } |
| 126 | 154 |
| 127 // Call portable code to finish up the tail of [0,8) pixels. | 155 // Call portable code to finish up the tail of [0,8) pixels. |
| 128 auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable; | 156 auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable; |
| 129 proc(dst, src, count); | 157 proc(dst, src, count); |
| 130 } | 158 } |
| 131 | 159 |
| 132 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { | 160 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { |
| 133 premul_should_swapRB<false>(dst, src, count); | 161 premul_should_swapRB<false>(dst, src, count); |
| 134 } | 162 } |
| 135 | 163 |
| 136 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { | 164 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { |
| 137 premul_should_swapRB<true>(dst, src, count); | 165 premul_should_swapRB<true>(dst, src, count); |
| 138 } | 166 } |
| 139 | 167 |
| 140 static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) { | 168 static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) { |
| 141 auto src = (const uint32_t*)vsrc; | 169 auto src = (const uint32_t*)vsrc; |
| 142 while (count >= 16) { | 170 while (count >= 16) { |
| 143 // Load 16 pixels. | 171 // Load 16 pixels. |
| 144 uint8x16x4_t bgra = vld4q_u8((const uint8_t*) src); | 172 uint8x16x4_t rgba = vld4q_u8((const uint8_t*) src); |
| 145 | 173 |
| 146 // Swap r and b. | 174 // Swap r and b. |
| 147 SkTSwap(bgra.val[0], bgra.val[2]); | 175 SkTSwap(rgba.val[0], rgba.val[2]); |
| 148 | 176 |
| 149 // Store 16 pixels. | 177 // Store 16 pixels. |
| 150 vst4q_u8((uint8_t*) dst, bgra); | 178 vst4q_u8((uint8_t*) dst, rgba); |
| 151 src += 16; | 179 src += 16; |
| 152 dst += 16; | 180 dst += 16; |
| 153 count -= 16; | 181 count -= 16; |
| 154 } | 182 } |
| 155 | 183 |
| 156 if (count >= 8) { | 184 if (count >= 8) { |
| 157 // Load 8 pixels. | 185 // Load 8 pixels. |
| 158 uint8x8x4_t bgra = vld4_u8((const uint8_t*) src); | 186 uint8x8x4_t rgba = vld4_u8((const uint8_t*) src); |
| 159 | 187 |
| 160 // Swap r and b. | 188 // Swap r and b. |
| 161 SkTSwap(bgra.val[0], bgra.val[2]); | 189 SkTSwap(rgba.val[0], rgba.val[2]); |
| 162 | 190 |
| 163 // Store 8 pixels. | 191 // Store 8 pixels. |
| 164 vst4_u8((uint8_t*) dst, bgra); | 192 vst4_u8((uint8_t*) dst, rgba); |
| 165 src += 8; | 193 src += 8; |
| 166 dst += 8; | 194 dst += 8; |
| 167 count -= 8; | 195 count -= 8; |
| 168 } | 196 } |
| 169 | 197 |
| 170 RGBA_to_BGRA_portable(dst, src, count); | 198 RGBA_to_BGRA_portable(dst, src, count); |
| 171 } | 199 } |
| 172 | 200 |
| 201 template <bool kSwapRB> |
| 202 static void insert_alpha_should_swaprb(uint32_t dst[], const void* vsrc, int count) |
| 203 const uint8_t* src = (const uint8_t*) vsrc; |
| 204 while (count >= 16) { |
| 205 // Load 16 pixels. |
| 206 uint8x16x3_t rgb = vld3q_u8(src); |
| 207 |
| 208 // Insert an opaque alpha channel and swap if needed. |
| 209 uint8x16x4_t rgba; |
| 210 if (kSwapRB) { |
| 211 rgba.val[0] = rgb.val[2]; |
| 212 rgba.val[2] = rgb.val[0]; |
| 213 } else { |
| 214 rgba.val[0] = rgb.val[0]; |
| 215 rgba.val[2] = rgb.val[2]; |
| 216 } |
| 217 rgba.val[1] = rgb.val[1]; |
| 218 rgba.val[3] = vdupq_n_u8(0xFF); |
| 219 |
| 220 // Store 16 pixels. |
| 221 vst4q_u8((uint8_t*) dst, rgba); |
| 222 src += 16*3; |
| 223 dst += 16; |
| 224 count -= 16; |
| 225 } |
| 226 |
| 227 if (count >= 8) { |
| 228 // Load 8 pixels. |
| 229 uint8x8x3_t rgb = vld3_u8(src); |
| 230 |
| 231 // Insert an opaque alpha channel and swap if needed. |
| 232 uint8x8x4_t rgba; |
| 233 if (kSwapRB) { |
| 234 rgba.val[0] = rgb.val[2]; |
| 235 rgba.val[2] = rgb.val[0]; |
| 236 } else { |
| 237 rgba.val[0] = rgb.val[0]; |
| 238 rgba.val[2] = rgb.val[2]; |
| 239 } |
| 240 rgba.val[1] = rgb.val[1]; |
| 241 rgba.val[3] = vdup_n_u8(0xFF); |
| 242 |
| 243 // Store 8 pixels. |
| 244 vst4_u8((uint8_t*) dst, rgba); |
| 245 src += 8*3; |
| 246 dst += 8; |
| 247 count -= 8; |
| 248 } |
| 249 |
| 250 // Call portable code to finish up the tail of [0,8) pixels. |
| 251 auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable; |
| 252 proc(dst, src, count); |
| 253 } |
| 254 |
| 255 static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) { |
| 256 insert_alpha_should_swaprb<false>(dst, src, count); |
| 257 } |
| 258 |
| 259 static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) { |
| 260 insert_alpha_should_swaprb<true>(dst, src, count); |
| 261 } |
| 262 |
| 173 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 | 263 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
| 174 | 264 |
| 175 template <bool kSwapRB> | 265 template <bool kSwapRB> |
| 176 static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) { | 266 static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) { |
| 177 auto src = (const uint32_t*)vsrc; | 267 auto src = (const uint32_t*)vsrc; |
| 178 | 268 |
| 179 auto premul8 = [](__m128i* lo, __m128i* hi) { | 269 auto premul8 = [](__m128i* lo, __m128i* hi) { |
| 180 const __m128i zeros = _mm_setzero_si128(); | 270 const __m128i zeros = _mm_setzero_si128(); |
| 181 const __m128i _128 = _mm_set1_epi16(128); | 271 const __m128i _128 = _mm_set1_epi16(128); |
| 182 const __m128i _257 = _mm_set1_epi16(257); | 272 const __m128i _257 = _mm_set1_epi16(257); |
| (...skipping 78 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 261 _mm_storeu_si128((__m128i*) dst, bgra); | 351 _mm_storeu_si128((__m128i*) dst, bgra); |
| 262 | 352 |
| 263 src += 4; | 353 src += 4; |
| 264 dst += 4; | 354 dst += 4; |
| 265 count -= 4; | 355 count -= 4; |
| 266 } | 356 } |
| 267 | 357 |
| 268 RGBA_to_BGRA_portable(dst, src, count); | 358 RGBA_to_BGRA_portable(dst, src, count); |
| 269 } | 359 } |
| 270 | 360 |
| 361 static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) { |
| 362 RGB_to_RGB1_portable(dst, src, count); |
| 363 } |
| 364 |
| 365 static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) { |
| 366 RGB_to_BGR1_portable(dst, src, count); |
| 367 } |
| 368 |
| 271 #else | 369 #else |
| 272 | 370 |
| 273 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { | 371 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { |
| 274 RGBA_to_rgbA_portable(dst, src, count); | 372 RGBA_to_rgbA_portable(dst, src, count); |
| 275 } | 373 } |
| 276 | 374 |
| 277 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { | 375 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { |
| 278 RGBA_to_bgrA_portable(dst, src, count); | 376 RGBA_to_bgrA_portable(dst, src, count); |
| 279 } | 377 } |
| 280 | 378 |
| 281 static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) { | 379 static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) { |
| 282 RGBA_to_BGRA_portable(dst, src, count); | 380 RGBA_to_BGRA_portable(dst, src, count); |
| 283 } | 381 } |
| 284 | 382 |
| 383 static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) { |
| 384 RGB_to_RGB1_portable(dst, src, count); |
| 385 } |
| 386 |
| 387 static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) { |
| 388 RGB_to_BGR1_portable(dst, src, count); |
| 389 } |
| 390 |
| 285 #endif | 391 #endif |
| 286 | 392 |
| 287 } | 393 } |
| 288 | 394 |
| 289 #endif // SkSwizzler_opts_DEFINED | 395 #endif // SkSwizzler_opts_DEFINED |
| OLD | NEW |