OLD | NEW |
---|---|
1 /* | 1 /* |
2 * Copyright 2016 Google Inc. | 2 * Copyright 2016 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #ifndef SkSwizzler_opts_DEFINED | 8 #ifndef SkSwizzler_opts_DEFINED |
9 #define SkSwizzler_opts_DEFINED | 9 #define SkSwizzler_opts_DEFINED |
10 | 10 |
11 #include "SkColorPriv.h" | 11 #include "SkColorPriv.h" |
12 | 12 |
13 namespace SK_OPTS_NS { | 13 namespace SK_OPTS_NS { |
14 | 14 |
15 // These variable names in these functions just pretend the input is BGRA. | 15 // These variable names in these functions just pretend the input is BGRA. |
mtklein
2016/01/22 14:37:32
(or BGR)
msarett
2016/01/22 15:00:36
Done.
| |
16 // They work fine with both RGBA and BGRA. | 16 // They work fine with both RGBA and BGRA. |
mtklein
2016/01/22 14:37:32
(or both BGR and RGB).
msarett
2016/01/22 15:00:36
Done.
| |
17 | 17 |
18 static void premul_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) { | 18 static void premul_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) { |
mtklein
2016/01/22 14:37:33
Do you think we made things unnecessarily complica
mtklein
2016/01/22 14:44:04
Gonna write up a CL to demonstrate what I mean.
msarett
2016/01/22 15:00:36
Yes.
mtklein
2016/01/22 15:23:57
Oh, right, how about RGB_to_RGB1 / RGB_to_BGR1?
msarett
2016/01/22 17:27:23
Done.
| |
19 for (int i = 0; i < count; i++) { | 19 for (int i = 0; i < count; i++) { |
20 uint8_t a = src[i] >> 24, | 20 uint8_t a = src[i] >> 24, |
21 r = src[i] >> 16, | 21 r = src[i] >> 16, |
22 g = src[i] >> 8, | 22 g = src[i] >> 8, |
23 b = src[i] >> 0; | 23 b = src[i] >> 0; |
24 r = (r*a+127)/255; | 24 r = (r*a+127)/255; |
25 g = (g*a+127)/255; | 25 g = (g*a+127)/255; |
26 b = (b*a+127)/255; | 26 b = (b*a+127)/255; |
27 dst[i] = (uint32_t)a << 24 | 27 dst[i] = (uint32_t)a << 24 |
28 | (uint32_t)r << 16 | 28 | (uint32_t)r << 16 |
(...skipping 24 matching lines...) Expand all Loading... | |
53 r = src[i] >> 16, | 53 r = src[i] >> 16, |
54 g = src[i] >> 8, | 54 g = src[i] >> 8, |
55 b = src[i] >> 0; | 55 b = src[i] >> 0; |
56 dst[i] = (uint32_t)a << 24 | 56 dst[i] = (uint32_t)a << 24 |
57 | (uint32_t)b << 16 | 57 | (uint32_t)b << 16 |
58 | (uint32_t)g << 8 | 58 | (uint32_t)g << 8 |
59 | (uint32_t)r << 0; | 59 | (uint32_t)r << 0; |
60 } | 60 } |
61 } | 61 } |
62 | 62 |
63 static void xxx_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) { | |
mtklein
2016/01/22 14:37:32
src is three-byte right, not some sort of RGBx? S
msarett
2016/01/22 15:00:36
Agreed. I made a similar comment in Patch Set 1.
mtklein
2016/01/22 15:23:57
Good. Was going to suggest that. :)
While they'r
msarett
2016/01/22 17:27:23
Done.
| |
64 int i8 = 0; | |
65 const uint8_t* src8 = (const uint8_t*) src; | |
66 for (int i32 = 0; i32 < count; i32++) { | |
67 uint8_t b = src8[i8++], | |
68 g = src8[i8++], | |
69 r = src8[i8++]; | |
70 dst[i32] = (uint32_t) b << 0 | |
mtklein
2016/01/22 14:37:32
Let's keep our order consistent with the rest of t
msarett
2016/01/22 15:00:36
Done.
| |
71 | (uint32_t) g << 8 | |
72 | (uint32_t) r << 16 | |
73 | (uint32_t)0xFF << 24; | |
74 } | |
75 } | |
76 | |
77 static void xxx_swaprb_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) { | |
78 int i8 = 0; | |
79 const uint8_t* src8 = (const uint8_t*) src; | |
80 for (int i32 = 0; i32 < count; i32++) { | |
81 uint8_t b = src8[i8++], | |
82 g = src8[i8++], | |
83 r = src8[i8++]; | |
84 dst[i32] = (uint32_t) r << 0 | |
85 | (uint32_t) g << 8 | |
86 | (uint32_t) b << 16 | |
87 | (uint32_t)0xFF << 24; | |
88 } | |
89 } | |
90 | |
63 #if defined(SK_ARM_HAS_NEON) | 91 #if defined(SK_ARM_HAS_NEON) |
64 | 92 |
65 // Rounded divide by 255, (x + 127) / 255 | 93 // Rounded divide by 255, (x + 127) / 255 |
66 static uint8x8_t div255_round(uint16x8_t x) { | 94 static uint8x8_t div255_round(uint16x8_t x) { |
67 // result = (x + 127) / 255 | 95 // result = (x + 127) / 255 |
68 // result = (x + 127) / 256 + error1 | 96 // result = (x + 127) / 256 + error1 |
69 // | 97 // |
70 // error1 = (x + 127) / (255 * 256) | 98 // error1 = (x + 127) / (255 * 256) |
71 // error1 = (x + 127) / (256 * 256) + error2 | 99 // error1 = (x + 127) / (256 * 256) + error2 |
72 // | 100 // |
(...skipping 88 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
161 // Store 8 pixels. | 189 // Store 8 pixels. |
162 vst4_u8((uint8_t*) dst, bgra); | 190 vst4_u8((uint8_t*) dst, bgra); |
163 src += 8; | 191 src += 8; |
164 dst += 8; | 192 dst += 8; |
165 count -= 8; | 193 count -= 8; |
166 } | 194 } |
167 | 195 |
168 swaprb_xxxa_portable(dst, src, count); | 196 swaprb_xxxa_portable(dst, src, count); |
169 } | 197 } |
170 | 198 |
199 template <bool kSwapRB> | |
200 static void xxx_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) { | |
201 const uint8_t* src8 = (const uint8_t*) src; | |
202 while (count >= 16) { | |
203 // Load 16 pixels. | |
204 uint8x16x3_t bgr = vld3q_u8(src8); | |
205 | |
206 // Insert an opaque alpha channel and swap if needed. | |
207 uint8x16x4_t bgra; | |
208 if (kSwapRB) { | |
209 bgra.val[0] = bgr.val[2]; | |
210 bgra.val[2] = bgr.val[0]; | |
211 } else { | |
212 bgra.val[0] = bgr.val[0]; | |
213 bgra.val[2] = bgr.val[2]; | |
214 } | |
215 bgra.val[1] = bgr.val[1]; | |
216 bgra.val[3] = vdupq_n_u8(0xFF); | |
217 | |
218 // Store 16 pixels. | |
219 vst4q_u8((uint8_t*) dst, bgra); | |
220 src8 += 48; | |
mtklein
2016/01/22 14:37:33
might write this as += 16*3?
I find it really ple
msarett
2016/01/22 15:00:36
Done.
| |
221 dst += 16; | |
222 count -= 16; | |
223 } | |
224 | |
225 if (count >= 8) { | |
226 // Load 8 pixels. | |
227 uint8x8x3_t bgr = vld3_u8(src8); | |
228 | |
229 // Insert an opaque alpha channel and swap if needed. | |
230 uint8x8x4_t bgra; | |
231 if (kSwapRB) { | |
232 bgra.val[0] = bgr.val[2]; | |
233 bgra.val[2] = bgr.val[0]; | |
234 } else { | |
235 bgra.val[0] = bgr.val[0]; | |
236 bgra.val[2] = bgr.val[2]; | |
237 } | |
238 bgra.val[1] = bgr.val[1]; | |
239 bgra.val[3] = vdup_n_u8(0xFF); | |
240 | |
241 // Store 8 pixels. | |
242 vst4_u8((uint8_t*) dst, bgra); | |
243 src8 += 24; | |
244 dst += 8; | |
245 count -= 8; | |
246 } | |
247 | |
248 // Call portable code to finish up the tail of [0,8) pixels. | |
249 auto proc = kSwapRB ? xxx_swaprb_xxxa_portable : xxx_xxxa_portable; | |
250 proc(dst, (const uint32_t*) src8, count); | |
251 } | |
252 | |
253 static void xxx_xxxa(uint32_t dst[], const uint32_t src[], int count) { | |
254 xxx_xxxa_should_swaprb<false>(dst, src, count); | |
255 } | |
256 | |
257 static void xxx_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | |
258 xxx_xxxa_should_swaprb<true>(dst, src, count); | |
259 } | |
260 | |
171 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 | 261 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
172 | 262 |
173 template <bool kSwapRB> | 263 template <bool kSwapRB> |
174 static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) { | 264 static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) { |
175 | 265 |
176 auto premul8 = [](__m128i* lo, __m128i* hi) { | 266 auto premul8 = [](__m128i* lo, __m128i* hi) { |
177 const __m128i zeros = _mm_setzero_si128(); | 267 const __m128i zeros = _mm_setzero_si128(); |
178 const __m128i _128 = _mm_set1_epi16(128); | 268 const __m128i _128 = _mm_set1_epi16(128); |
179 const __m128i _257 = _mm_set1_epi16(257); | 269 const __m128i _257 = _mm_set1_epi16(257); |
180 __m128i planar; | 270 __m128i planar; |
(...skipping 76 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
257 _mm_storeu_si128((__m128i*) dst, rgba); | 347 _mm_storeu_si128((__m128i*) dst, rgba); |
258 | 348 |
259 src += 4; | 349 src += 4; |
260 dst += 4; | 350 dst += 4; |
261 count -= 4; | 351 count -= 4; |
262 } | 352 } |
263 | 353 |
264 swaprb_xxxa_portable(dst, src, count); | 354 swaprb_xxxa_portable(dst, src, count); |
265 } | 355 } |
266 | 356 |
357 static void xxx_xxxa(uint32_t dst[], const uint32_t src[], int count) { | |
358 xxx_xxxa_portable(dst, src, count); | |
359 } | |
360 | |
361 static void xxx_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | |
362 xxx_swaprb_xxxa_portable(dst, src, count); | |
363 } | |
364 | |
267 #else | 365 #else |
268 | 366 |
269 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { | 367 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
270 premul_xxxa_portable(dst, src, count); | 368 premul_xxxa_portable(dst, src, count); |
271 } | 369 } |
272 | 370 |
273 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | 371 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
274 premul_swaprb_xxxa_portable(dst, src, count); | 372 premul_swaprb_xxxa_portable(dst, src, count); |
275 } | 373 } |
276 | 374 |
277 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | 375 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
278 swaprb_xxxa_portable(dst, src, count); | 376 swaprb_xxxa_portable(dst, src, count); |
279 } | 377 } |
280 | 378 |
379 static void xxx_xxxa(uint32_t dst[], const uint32_t src[], int count) { | |
380 xxx_xxxa_portable(dst, src, count); | |
381 } | |
382 | |
383 static void xxx_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | |
384 xxx_swaprb_xxxa_portable(dst, src, count); | |
385 } | |
386 | |
281 #endif | 387 #endif |
282 | 388 |
283 } | 389 } |
284 | 390 |
285 #endif // SkSwizzler_opts_DEFINED | 391 #endif // SkSwizzler_opts_DEFINED |
OLD | NEW |