OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2016 Google Inc. | 2 * Copyright 2016 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #ifndef SkSwizzler_opts_DEFINED | 8 #ifndef SkSwizzler_opts_DEFINED |
9 #define SkSwizzler_opts_DEFINED | 9 #define SkSwizzler_opts_DEFINED |
10 | 10 |
(...skipping 42 matching lines...)
53 b = src[i] >> 16, | 53 b = src[i] >> 16, |
54 g = src[i] >> 8, | 54 g = src[i] >> 8, |
55 r = src[i] >> 0; | 55 r = src[i] >> 0; |
56 dst[i] = (uint32_t)a << 24 | 56 dst[i] = (uint32_t)a << 24 |
57 | (uint32_t)r << 16 | 57 | (uint32_t)r << 16 |
58 | (uint32_t)g << 8 | 58 | (uint32_t)g << 8 |
59 | (uint32_t)b << 0; | 59 | (uint32_t)b << 0; |
60 } | 60 } |
61 } | 61 } |
62 | 62 |
| 63 static void RGB_to_RGB1_portable(uint32_t dst[], const void* vsrc, int count) { |
| 64 const uint8_t* src = (const uint8_t*)vsrc; |
| 65 for (int i = 0; i < count; i++) { |
| 66 uint8_t r = src[0], |
| 67 g = src[1], |
| 68 b = src[2]; |
| 69 src += 3; |
| 70 dst[i] = (uint32_t)0xFF << 24 |
| 71 | (uint32_t)b << 16 |
| 72 | (uint32_t)g << 8 |
| 73 | (uint32_t)r << 0; |
| 74 } |
| 75 } |
| 76 |
| 77 static void RGB_to_BGR1_portable(uint32_t dst[], const void* vsrc, int count) { |
| 78 const uint8_t* src = (const uint8_t*)vsrc; |
| 79 for (int i = 0; i < count; i++) { |
| 80 uint8_t r = src[0], |
| 81 g = src[1], |
| 82 b = src[2]; |
| 83 src += 3; |
| 84 dst[i] = (uint32_t)0xFF << 24 |
| 85 | (uint32_t)r << 16 |
| 86 | (uint32_t)g << 8 |
| 87 | (uint32_t)b << 0; |
| 88 } |
| 89 } |
| 90 |
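Note on the two portable routines added above: they pack the destination pixel as 0xFF<<24 | x<<16 | y<<8 | z<<0, which on a little-endian target lays the bytes out in memory as z, y, x, 0xFF; that is why RGB_to_RGB1_portable puts r in the low byte. A minimal stand-alone sketch of that assumption (the checker function is illustrative only, not part of the header):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // Illustrative only: verify that the RGB_to_RGB1_portable packing yields
    // the bytes R, G, B, 0xFF in memory on a little-endian machine.
    static void check_rgb1_byte_order() {
        const uint8_t r = 0x11, g = 0x22, b = 0x33;
        const uint32_t pixel = (uint32_t)0xFF << 24
                             | (uint32_t)b    << 16
                             | (uint32_t)g    <<  8
                             | (uint32_t)r    <<  0;
        uint8_t bytes[4];
        std::memcpy(bytes, &pixel, sizeof(pixel));
        assert(bytes[0] == r && bytes[1] == g && bytes[2] == b && bytes[3] == 0xFF);
    }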
63 #if defined(SK_ARM_HAS_NEON) | 91 #if defined(SK_ARM_HAS_NEON) |
64 | 92 |
65 // Rounded divide by 255, (x + 127) / 255 | 93 // Rounded divide by 255, (x + 127) / 255 |
66 static uint8x8_t div255_round(uint16x8_t x) { | 94 static uint8x8_t div255_round(uint16x8_t x) { |
67 // result = (x + 127) / 255 | 95 // result = (x + 127) / 255 |
68 // result = (x + 127) / 256 + error1 | 96 // result = (x + 127) / 256 + error1 |
69 // | 97 // |
70 // error1 = (x + 127) / (255 * 256) | 98 // error1 = (x + 127) / (255 * 256) |
71 // error1 = (x + 127) / (256 * 256) + error2 | 99 // error1 = (x + 127) / (256 * 256) + error2 |
72 // | 100 // |
(...skipping 16 matching lines...)
89 // Scale a byte by another, (x * y + 127) / 255 | 117 // Scale a byte by another, (x * y + 127) / 255 |
90 static uint8x8_t scale(uint8x8_t x, uint8x8_t y) { | 118 static uint8x8_t scale(uint8x8_t x, uint8x8_t y) { |
91 return div255_round(vmull_u8(x, y)); | 119 return div255_round(vmull_u8(x, y)); |
92 } | 120 } |
93 | 121 |
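The derivation in the div255_round comment reduces the exact rounded divide (x + 127) / 255 to additions and shifts by 256; the vector body itself is elided in this diff. As a scalar illustration only (not part of the header), the identity that derivation targets can be checked exhaustively over the range produced by an 8x8-bit multiply:

    #include <cassert>
    #include <cstdint>

    // Illustrative scalar check: for every x that can arise from multiplying
    // two 8-bit values, (x + 127) / 255 equals the shift-and-add form
    // (x + ((x + 128) >> 8) + 128) >> 8, i.e. a rounded shift by 8 of
    // x plus its own rounded shift by 8.
    static void check_div255_round_identity() {
        for (uint32_t x = 0; x <= 255u * 255u; x++) {
            const uint32_t exact   = (x + 127) / 255;
            const uint32_t shifted = (x + ((x + 128) >> 8) + 128) >> 8;
            assert(exact == shifted);
        }
    }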
94 template <bool kSwapRB> | 122 template <bool kSwapRB> |
95 static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) { | 123 static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) { |
96 auto src = (const uint32_t*)vsrc; | 124 auto src = (const uint32_t*)vsrc; |
97 while (count >= 8) { | 125 while (count >= 8) { |
98 // Load 8 pixels. | 126 // Load 8 pixels. |
99 uint8x8x4_t bgra = vld4_u8((const uint8_t*) src); | 127 uint8x8x4_t rgba = vld4_u8((const uint8_t*) src); |
100 | 128 |
101 uint8x8_t a = bgra.val[3], | 129 uint8x8_t a = rgba.val[3], |
102 b = bgra.val[2], | 130 b = rgba.val[2], |
103 g = bgra.val[1], | 131 g = rgba.val[1], |
104 r = bgra.val[0]; | 132 r = rgba.val[0]; |
105 | 133 |
106 // Premultiply. | 134 // Premultiply. |
107 b = scale(b, a); | 135 b = scale(b, a); |
108 g = scale(g, a); | 136 g = scale(g, a); |
109 r = scale(r, a); | 137 r = scale(r, a); |
110 | 138 |
111 // Store 8 premultiplied pixels. | 139 // Store 8 premultiplied pixels. |
112 if (kSwapRB) { | 140 if (kSwapRB) { |
113 bgra.val[2] = r; | 141 rgba.val[2] = r; |
114 bgra.val[1] = g; | 142 rgba.val[1] = g; |
115 bgra.val[0] = b; | 143 rgba.val[0] = b; |
116 } else { | 144 } else { |
117 bgra.val[2] = b; | 145 rgba.val[2] = b; |
118 bgra.val[1] = g; | 146 rgba.val[1] = g; |
119 bgra.val[0] = r; | 147 rgba.val[0] = r; |
120 } | 148 } |
121 vst4_u8((uint8_t*) dst, bgra); | 149 vst4_u8((uint8_t*) dst, rgba); |
122 src += 8; | 150 src += 8; |
123 dst += 8; | 151 dst += 8; |
124 count -= 8; | 152 count -= 8; |
125 } | 153 } |
126 | 154 |
127 // Call portable code to finish up the tail of [0,8) pixels. | 155 // Call portable code to finish up the tail of [0,8) pixels. |
128 auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable; | 156 auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable; |
129 proc(dst, src, count); | 157 proc(dst, src, count); |
130 } | 158 } |
131 | 159 |
132 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { | 160 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { |
133 premul_should_swapRB<false>(dst, src, count); | 161 premul_should_swapRB<false>(dst, src, count); |
134 } | 162 } |
135 | 163 |
136 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { | 164 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { |
137 premul_should_swapRB<true>(dst, src, count); | 165 premul_should_swapRB<true>(dst, src, count); |
138 } | 166 } |
139 | 167 |
140 static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) { | 168 static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) { |
141 auto src = (const uint32_t*)vsrc; | 169 auto src = (const uint32_t*)vsrc; |
142 while (count >= 16) { | 170 while (count >= 16) { |
143 // Load 16 pixels. | 171 // Load 16 pixels. |
144 uint8x16x4_t bgra = vld4q_u8((const uint8_t*) src); | 172 uint8x16x4_t rgba = vld4q_u8((const uint8_t*) src); |
145 | 173 |
146 // Swap r and b. | 174 // Swap r and b. |
147 SkTSwap(bgra.val[0], bgra.val[2]); | 175 SkTSwap(rgba.val[0], rgba.val[2]); |
148 | 176 |
149 // Store 16 pixels. | 177 // Store 16 pixels. |
150 vst4q_u8((uint8_t*) dst, bgra); | 178 vst4q_u8((uint8_t*) dst, rgba); |
151 src += 16; | 179 src += 16; |
152 dst += 16; | 180 dst += 16; |
153 count -= 16; | 181 count -= 16; |
154 } | 182 } |
155 | 183 |
156 if (count >= 8) { | 184 if (count >= 8) { |
157 // Load 8 pixels. | 185 // Load 8 pixels. |
158 uint8x8x4_t bgra = vld4_u8((const uint8_t*) src); | 186 uint8x8x4_t rgba = vld4_u8((const uint8_t*) src); |
159 | 187 |
160 // Swap r and b. | 188 // Swap r and b. |
161 SkTSwap(bgra.val[0], bgra.val[2]); | 189 SkTSwap(rgba.val[0], rgba.val[2]); |
162 | 190 |
163 // Store 8 pixels. | 191 // Store 8 pixels. |
164 vst4_u8((uint8_t*) dst, bgra); | 192 vst4_u8((uint8_t*) dst, rgba); |
165 src += 8; | 193 src += 8; |
166 dst += 8; | 194 dst += 8; |
167 count -= 8; | 195 count -= 8; |
168 } | 196 } |
169 | 197 |
170 RGBA_to_BGRA_portable(dst, src, count); | 198 RGBA_to_BGRA_portable(dst, src, count); |
171 } | 199 } |
172 | 200 |
| 201 template <bool kSwapRB> |
| 202 static void insert_alpha_should_swaprb(uint32_t dst[], const void* vsrc, int count) { |
| 203 const uint8_t* src = (const uint8_t*) vsrc; |
| 204 while (count >= 16) { |
| 205 // Load 16 pixels. |
| 206 uint8x16x3_t rgb = vld3q_u8(src); |
| 207 |
| 208 // Insert an opaque alpha channel and swap if needed. |
| 209 uint8x16x4_t rgba; |
| 210 if (kSwapRB) { |
| 211 rgba.val[0] = rgb.val[2]; |
| 212 rgba.val[2] = rgb.val[0]; |
| 213 } else { |
| 214 rgba.val[0] = rgb.val[0]; |
| 215 rgba.val[2] = rgb.val[2]; |
| 216 } |
| 217 rgba.val[1] = rgb.val[1]; |
| 218 rgba.val[3] = vdupq_n_u8(0xFF); |
| 219 |
| 220 // Store 16 pixels. |
| 221 vst4q_u8((uint8_t*) dst, rgba); |
| 222 src += 16*3; |
| 223 dst += 16; |
| 224 count -= 16; |
| 225 } |
| 226 |
| 227 if (count >= 8) { |
| 228 // Load 8 pixels. |
| 229 uint8x8x3_t rgb = vld3_u8(src); |
| 230 |
| 231 // Insert an opaque alpha channel and swap if needed. |
| 232 uint8x8x4_t rgba; |
| 233 if (kSwapRB) { |
| 234 rgba.val[0] = rgb.val[2]; |
| 235 rgba.val[2] = rgb.val[0]; |
| 236 } else { |
| 237 rgba.val[0] = rgb.val[0]; |
| 238 rgba.val[2] = rgb.val[2]; |
| 239 } |
| 240 rgba.val[1] = rgb.val[1]; |
| 241 rgba.val[3] = vdup_n_u8(0xFF); |
| 242 |
| 243 // Store 8 pixels. |
| 244 vst4_u8((uint8_t*) dst, rgba); |
| 245 src += 8*3; |
| 246 dst += 8; |
| 247 count -= 8; |
| 248 } |
| 249 |
| 250 // Call portable code to finish up the tail of [0,8) pixels. |
| 251 auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable; |
| 252 proc(dst, src, count); |
| 253 } |
| 254 |
| 255 static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) { |
| 256 insert_alpha_should_swaprb<false>(dst, src, count); |
| 257 } |
| 258 |
| 259 static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) { |
| 260 insert_alpha_should_swaprb<true>(dst, src, count); |
| 261 } |
| 262 |
173 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 | 263 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
174 | 264 |
175 template <bool kSwapRB> | 265 template <bool kSwapRB> |
176 static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) { | 266 static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) { |
177 auto src = (const uint32_t*)vsrc; | 267 auto src = (const uint32_t*)vsrc; |
178 | 268 |
179 auto premul8 = [](__m128i* lo, __m128i* hi) { | 269 auto premul8 = [](__m128i* lo, __m128i* hi) { |
180 const __m128i zeros = _mm_setzero_si128(); | 270 const __m128i zeros = _mm_setzero_si128(); |
181 const __m128i _128 = _mm_set1_epi16(128); | 271 const __m128i _128 = _mm_set1_epi16(128); |
182 const __m128i _257 = _mm_set1_epi16(257); | 272 const __m128i _257 = _mm_set1_epi16(257); |
(...skipping 78 matching lines...)
261 _mm_storeu_si128((__m128i*) dst, bgra); | 351 _mm_storeu_si128((__m128i*) dst, bgra); |
262 | 352 |
263 src += 4; | 353 src += 4; |
264 dst += 4; | 354 dst += 4; |
265 count -= 4; | 355 count -= 4; |
266 } | 356 } |
267 | 357 |
268 RGBA_to_BGRA_portable(dst, src, count); | 358 RGBA_to_BGRA_portable(dst, src, count); |
269 } | 359 } |
270 | 360 |
| 361 static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) { |
| 362 RGB_to_RGB1_portable(dst, src, count); |
| 363 } |
| 364 |
| 365 static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) { |
| 366 RGB_to_BGR1_portable(dst, src, count); |
| 367 } |
| 368 |
271 #else | 369 #else |
272 | 370 |
273 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { | 371 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { |
274 RGBA_to_rgbA_portable(dst, src, count); | 372 RGBA_to_rgbA_portable(dst, src, count); |
275 } | 373 } |
276 | 374 |
277 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { | 375 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { |
278 RGBA_to_bgrA_portable(dst, src, count); | 376 RGBA_to_bgrA_portable(dst, src, count); |
279 } | 377 } |
280 | 378 |
281 static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) { | 379 static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) { |
282 RGBA_to_BGRA_portable(dst, src, count); | 380 RGBA_to_BGRA_portable(dst, src, count); |
283 } | 381 } |
284 | 382 |
| 383 static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) { |
| 384 RGB_to_RGB1_portable(dst, src, count); |
| 385 } |
| 386 |
| 387 static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) { |
| 388 RGB_to_BGR1_portable(dst, src, count); |
| 389 } |
| 390 |
285 #endif | 391 #endif |
286 | 392 |
287 } | 393 } |
288 | 394 |
289 #endif // SkSwizzler_opts_DEFINED | 395 #endif // SkSwizzler_opts_DEFINED |
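As a hedged usage sketch of the new entry points (buffer names are hypothetical and namespace qualification is omitted for brevity), a decoder would hand RGB_to_RGB1 a packed 24-bit source row and a 32-bit destination of the same pixel count:

    #include <cstdint>
    #include <vector>

    // Hypothetical caller: expand one row of packed 24-bit RGB into 32-bit
    // pixels with opaque alpha.  Each source pixel is 3 bytes; each
    // destination pixel is one uint32_t.
    static void expand_rgb_row(const uint8_t* srcRGB, int width,
                               std::vector<uint32_t>& dstRow) {
        dstRow.resize(width);
        RGB_to_RGB1(dstRow.data(), srcRGB, width);
    }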