OLD | NEW |
---|---|
1 /* | 1 /* |
2 * Copyright 2016 Google Inc. | 2 * Copyright 2016 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #ifndef SkSwizzler_opts_DEFINED | 8 #ifndef SkSwizzler_opts_DEFINED |
9 #define SkSwizzler_opts_DEFINED | 9 #define SkSwizzler_opts_DEFINED |
10 | 10 |
11 #include "SkColorPriv.h" | 11 #include "SkColorPriv.h" |
12 | 12 |
13 namespace SK_OPTS_NS { | 13 namespace SK_OPTS_NS { |
14 | 14 |
15 // These variable names in these functions just pretend the input is BGRA. | 15 static void RGBA_to_rgbA_portable(uint32_t* dst, const void* vsrc, int count) { |
msarett
2016/01/22 15:35:08
I find this comment to still be useful? There is
mtklein
2016/01/22 15:39:52
Sort of... now that the order and the function nam
msarett
2016/01/22 15:40:51
Agreed that it's obvious. Let's drop it.
| |
16 // They work fine with both RGBA and BGRA. | 16 auto src = (const uint32_t*)vsrc; |
17 | |
18 static void premul_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) { | |
19 for (int i = 0; i < count; i++) { | 17 for (int i = 0; i < count; i++) { |
20 uint8_t a = src[i] >> 24, | 18 uint8_t a = src[i] >> 24, |
21 r = src[i] >> 16, | 19 b = src[i] >> 16, |
22 g = src[i] >> 8, | 20 g = src[i] >> 8, |
23 b = src[i] >> 0; | 21 r = src[i] >> 0; |
22 b = (b*a+127)/255; | |
23 g = (g*a+127)/255; | |
24 r = (r*a+127)/255; | 24 r = (r*a+127)/255; |
25 dst[i] = (uint32_t)a << 24 | |
26 | (uint32_t)b << 16 | |
27 | (uint32_t)g << 8 | |
28 | (uint32_t)r << 0; | |
29 } | |
30 } | |
31 | |
32 static void RGBA_to_bgrA_portable(uint32_t* dst, const void* vsrc, int count) { | |
33 auto src = (const uint32_t*)vsrc; | |
34 for (int i = 0; i < count; i++) { | |
35 uint8_t a = src[i] >> 24, | |
36 b = src[i] >> 16, | |
37 g = src[i] >> 8, | |
38 r = src[i] >> 0; | |
39 b = (b*a+127)/255; | |
25 g = (g*a+127)/255; | 40 g = (g*a+127)/255; |
26 b = (b*a+127)/255; | 41 r = (r*a+127)/255; |
27 dst[i] = (uint32_t)a << 24 | 42 dst[i] = (uint32_t)a << 24 |
28 | (uint32_t)r << 16 | 43 | (uint32_t)r << 16 |
29 | (uint32_t)g << 8 | 44 | (uint32_t)g << 8 |
30 | (uint32_t)b << 0; | 45 | (uint32_t)b << 0; |
31 } | 46 } |
32 } | 47 } |
33 | 48 |
34 static void premul_swaprb_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) { | 49 static void RGBA_to_BGRA_portable(uint32_t* dst, const void* vsrc, int count) { |
50 auto src = (const uint32_t*)vsrc; | |
35 for (int i = 0; i < count; i++) { | 51 for (int i = 0; i < count; i++) { |
36 uint8_t a = src[i] >> 24, | 52 uint8_t a = src[i] >> 24, |
37 r = src[i] >> 16, | 53 b = src[i] >> 16, |
38 g = src[i] >> 8, | 54 g = src[i] >> 8, |
39 b = src[i] >> 0; | 55 r = src[i] >> 0; |
40 r = (r*a+127)/255; | |
41 g = (g*a+127)/255; | |
42 b = (b*a+127)/255; | |
43 dst[i] = (uint32_t)a << 24 | 56 dst[i] = (uint32_t)a << 24 |
44 | (uint32_t)b << 16 | 57 | (uint32_t)r << 16 |
45 | (uint32_t)g << 8 | 58 | (uint32_t)g << 8 |
46 | (uint32_t)r << 0; | 59 | (uint32_t)b << 0; |
47 } | 60 } |
48 } | 61 } |
49 | 62 |
50 static void swaprb_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) { | |
51 for (int i = 0; i < count; i++) { | |
52 uint8_t a = src[i] >> 24, | |
53 r = src[i] >> 16, | |
54 g = src[i] >> 8, | |
55 b = src[i] >> 0; | |
56 dst[i] = (uint32_t)a << 24 | |
57 | (uint32_t)b << 16 | |
58 | (uint32_t)g << 8 | |
59 | (uint32_t)r << 0; | |
60 } | |
61 } | |
62 | |
63 #if defined(SK_ARM_HAS_NEON) | 63 #if defined(SK_ARM_HAS_NEON) |
64 | 64 |
65 // Rounded divide by 255, (x + 127) / 255 | 65 // Rounded divide by 255, (x + 127) / 255 |
66 static uint8x8_t div255_round(uint16x8_t x) { | 66 static uint8x8_t div255_round(uint16x8_t x) { |
67 // result = (x + 127) / 255 | 67 // result = (x + 127) / 255 |
68 // result = (x + 127) / 256 + error1 | 68 // result = (x + 127) / 256 + error1 |
69 // | 69 // |
70 // error1 = (x + 127) / (255 * 256) | 70 // error1 = (x + 127) / (255 * 256) |
71 // error1 = (x + 127) / (256 * 256) + error2 | 71 // error1 = (x + 127) / (256 * 256) + error2 |
72 // | 72 // |
(...skipping 12 matching lines...) Expand all Loading... | |
85 // "add, round, and narrow back to 8-bits" instruction. | 85 // "add, round, and narrow back to 8-bits" instruction. |
86 return vraddhn_u16(x, vrshrq_n_u16(x, 8)); | 86 return vraddhn_u16(x, vrshrq_n_u16(x, 8)); |
87 } | 87 } |
88 | 88 |
89 // Scale a byte by another, (x * y + 127) / 255 | 89 // Scale a byte by another, (x * y + 127) / 255 |
90 static uint8x8_t scale(uint8x8_t x, uint8x8_t y) { | 90 static uint8x8_t scale(uint8x8_t x, uint8x8_t y) { |
91 return div255_round(vmull_u8(x, y)); | 91 return div255_round(vmull_u8(x, y)); |
92 } | 92 } |
93 | 93 |
94 template <bool kSwapRB> | 94 template <bool kSwapRB> |
95 static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) { | 95 static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) { |
96 auto src = (const uint32_t*)vsrc; | |
96 while (count >= 8) { | 97 while (count >= 8) { |
97 // Load 8 pixels. | 98 // Load 8 pixels. |
98 uint8x8x4_t bgra = vld4_u8((const uint8_t*) src); | 99 uint8x8x4_t bgra = vld4_u8((const uint8_t*) src); |
99 | 100 |
100 uint8x8_t a = bgra.val[3], | 101 uint8x8_t a = bgra.val[3], |
101 r = bgra.val[2], | 102 b = bgra.val[2], |
102 g = bgra.val[1], | 103 g = bgra.val[1], |
103 b = bgra.val[0]; | 104 r = bgra.val[0]; |
104 | 105 |
105 // Premultiply. | 106 // Premultiply. |
107 b = scale(b, a); | |
108 g = scale(g, a); | |
106 r = scale(r, a); | 109 r = scale(r, a); |
107 g = scale(g, a); | |
108 b = scale(b, a); | |
109 | 110 |
110 // Store 8 premultiplied pixels. | 111 // Store 8 premultiplied pixels. |
111 if (kSwapRB) { | 112 if (kSwapRB) { |
113 bgra.val[2] = r; | |
114 bgra.val[1] = g; | |
115 bgra.val[0] = b; | |
116 } else { | |
112 bgra.val[2] = b; | 117 bgra.val[2] = b; |
113 bgra.val[1] = g; | 118 bgra.val[1] = g; |
114 bgra.val[0] = r; | 119 bgra.val[0] = r; |
115 } else { | |
116 bgra.val[2] = r; | |
117 bgra.val[1] = g; | |
118 bgra.val[0] = b; | |
119 } | 120 } |
120 vst4_u8((uint8_t*) dst, bgra); | 121 vst4_u8((uint8_t*) dst, bgra); |
121 src += 8; | 122 src += 8; |
122 dst += 8; | 123 dst += 8; |
123 count -= 8; | 124 count -= 8; |
124 } | 125 } |
125 | 126 |
126 // Call portable code to finish up the tail of [0,8) pixels. | 127 // Call portable code to finish up the tail of [0,8) pixels. |
127 auto proc = kSwapRB ? premul_swaprb_xxxa_portable : premul_xxxa_portable; | 128 auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable; |
128 proc(dst, src, count); | 129 proc(dst, src, count); |
129 } | 130 } |
130 | 131 |
131 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { | 132 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { |
132 premul_xxxa_should_swaprb<false>(dst, src, count); | 133 premul_should_swapRB<false>(dst, src, count); |
133 } | 134 } |
134 | 135 |
135 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | 136 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { |
136 premul_xxxa_should_swaprb<true>(dst, src, count); | 137 premul_should_swapRB<true>(dst, src, count); |
137 } | 138 } |
138 | 139 |
139 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | 140 static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) { |
141 auto src = (const uint32_t*)vsrc; | |
140 while (count >= 16) { | 142 while (count >= 16) { |
141 // Load 16 pixels. | 143 // Load 16 pixels. |
142 uint8x16x4_t bgra = vld4q_u8((const uint8_t*) src); | 144 uint8x16x4_t bgra = vld4q_u8((const uint8_t*) src); |
143 | 145 |
144 // Swap r and b. | 146 // Swap r and b. |
145 SkTSwap(bgra.val[0], bgra.val[2]); | 147 SkTSwap(bgra.val[0], bgra.val[2]); |
146 | 148 |
147 // Store 16 pixels. | 149 // Store 16 pixels. |
148 vst4q_u8((uint8_t*) dst, bgra); | 150 vst4q_u8((uint8_t*) dst, bgra); |
149 src += 16; | 151 src += 16; |
150 dst += 16; | 152 dst += 16; |
151 count -= 16; | 153 count -= 16; |
152 } | 154 } |
153 | 155 |
154 if (count >= 8) { | 156 if (count >= 8) { |
155 // Load 8 pixels. | 157 // Load 8 pixels. |
156 uint8x8x4_t bgra = vld4_u8((const uint8_t*) src); | 158 uint8x8x4_t bgra = vld4_u8((const uint8_t*) src); |
157 | 159 |
158 // Swap r and b. | 160 // Swap r and b. |
159 SkTSwap(bgra.val[0], bgra.val[2]); | 161 SkTSwap(bgra.val[0], bgra.val[2]); |
160 | 162 |
161 // Store 8 pixels. | 163 // Store 8 pixels. |
162 vst4_u8((uint8_t*) dst, bgra); | 164 vst4_u8((uint8_t*) dst, bgra); |
163 src += 8; | 165 src += 8; |
164 dst += 8; | 166 dst += 8; |
165 count -= 8; | 167 count -= 8; |
166 } | 168 } |
167 | 169 |
168 swaprb_xxxa_portable(dst, src, count); | 170 RGBA_to_BGRA_portable(dst, src, count); |
169 } | 171 } |
170 | 172 |
171 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 | 173 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
172 | 174 |
173 template <bool kSwapRB> | 175 template <bool kSwapRB> |
174 static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) { | 176 static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) { |
177 auto src = (const uint32_t*)vsrc; | |
175 | 178 |
176 auto premul8 = [](__m128i* lo, __m128i* hi) { | 179 auto premul8 = [](__m128i* lo, __m128i* hi) { |
177 const __m128i zeros = _mm_setzero_si128(); | 180 const __m128i zeros = _mm_setzero_si128(); |
178 const __m128i _128 = _mm_set1_epi16(128); | 181 const __m128i _128 = _mm_set1_epi16(128); |
179 const __m128i _257 = _mm_set1_epi16(257); | 182 const __m128i _257 = _mm_set1_epi16(257); |
180 __m128i planar; | 183 __m128i planar; |
181 if (kSwapRB) { | 184 if (kSwapRB) { |
182 planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15); | 185 planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15); |
183 } else { | 186 } else { |
184 planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15); | 187 planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15); |
185 } | 188 } |
186 | 189 |
187 // Swizzle the pixels to 8-bit planar. | 190 // Swizzle the pixels to 8-bit planar. |
188 *lo = _mm_shuffle_epi8(*lo, planar); // bbbbgggg rrrraaaa | 191 *lo = _mm_shuffle_epi8(*lo, planar); // rrrrgggg bbbbaaaa |
189 *hi = _mm_shuffle_epi8(*hi, planar); // BBBBGGGG RRRRAAAA | 192 *hi = _mm_shuffle_epi8(*hi, planar); // RRRRGGGG BBBBAAAA |
190 __m128i bg = _mm_unpacklo_epi32(*lo, *hi), // bbbbBBBB ggggGGGG | 193 __m128i rg = _mm_unpacklo_epi32(*lo, *hi), // rrrrRRRR ggggGGGG |
191 ra = _mm_unpackhi_epi32(*lo, *hi); // rrrrRRRR aaaaAAAA | 194 ba = _mm_unpackhi_epi32(*lo, *hi); // bbbbBBBB aaaaAAAA |
192 | 195 |
193 // Unpack to 16-bit planar. | 196 // Unpack to 16-bit planar. |
194 __m128i b = _mm_unpacklo_epi8(bg, zeros), // b_b_b_b_ B_B_B_B_ | 197 __m128i r = _mm_unpacklo_epi8(rg, zeros), // r_r_r_r_ R_R_R_R_ |
195 g = _mm_unpackhi_epi8(bg, zeros), // g_g_g_g_ G_G_G_G_ | 198 g = _mm_unpackhi_epi8(rg, zeros), // g_g_g_g_ G_G_G_G_ |
196 r = _mm_unpacklo_epi8(ra, zeros), // r_r_r_r_ R_R_R_R_ | 199 b = _mm_unpacklo_epi8(ba, zeros), // b_b_b_b_ B_B_B_B_ |
197 a = _mm_unpackhi_epi8(ra, zeros); // a_a_a_a_ A_A_A_A_ | 200 a = _mm_unpackhi_epi8(ba, zeros); // a_a_a_a_ A_A_A_A_ |
198 | 201 |
199 // Premultiply! (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255. | 202 // Premultiply! (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255. |
203 r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(r, a), _128), _257); | |
204 g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(g, a), _128), _257); | |
200 b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(b, a), _128), _257); | 205 b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(b, a), _128), _257); |
201 g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(g, a), _128), _257); | |
202 r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(r, a), _128), _257); | |
203 | 206 |
204 // Repack into interlaced pixels. | 207 // Repack into interlaced pixels. |
205 bg = _mm_or_si128(b, _mm_slli_epi16(g, 8)); // bgbgbgbg BGBGBGBG | 208 rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)); // rgrgrgrg RGRGRGRG |
206 ra = _mm_or_si128(r, _mm_slli_epi16(a, 8)); // rararara RARARARA | 209 ba = _mm_or_si128(b, _mm_slli_epi16(a, 8)); // babababa BABABABA |
207 *lo = _mm_unpacklo_epi16(bg, ra); // bgrabgra bgrabgra | 210 *lo = _mm_unpacklo_epi16(rg, ba); // rgbargba rgbargba |
208 *hi = _mm_unpackhi_epi16(bg, ra); // BRGABGRA BGRABGRA | 211 *hi = _mm_unpackhi_epi16(rg, ba); // RGBARGBA RGBARGBA |
209 }; | 212 }; |
210 | 213 |
211 while (count >= 8) { | 214 while (count >= 8) { |
212 __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)), | 215 __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)), |
213 hi = _mm_loadu_si128((const __m128i*) (src + 4)); | 216 hi = _mm_loadu_si128((const __m128i*) (src + 4)); |
214 | 217 |
215 premul8(&lo, &hi); | 218 premul8(&lo, &hi); |
216 | 219 |
217 _mm_storeu_si128((__m128i*) (dst + 0), lo); | 220 _mm_storeu_si128((__m128i*) (dst + 0), lo); |
218 _mm_storeu_si128((__m128i*) (dst + 4), hi); | 221 _mm_storeu_si128((__m128i*) (dst + 4), hi); |
(...skipping 10 matching lines...) Expand all Loading... | |
229 premul8(&lo, &hi); | 232 premul8(&lo, &hi); |
230 | 233 |
231 _mm_storeu_si128((__m128i*) dst, lo); | 234 _mm_storeu_si128((__m128i*) dst, lo); |
232 | 235 |
233 src += 4; | 236 src += 4; |
234 dst += 4; | 237 dst += 4; |
235 count -= 4; | 238 count -= 4; |
236 } | 239 } |
237 | 240 |
238 // Call portable code to finish up the tail of [0,4) pixels. | 241 // Call portable code to finish up the tail of [0,4) pixels. |
239 auto proc = kSwapRB ? premul_swaprb_xxxa_portable : premul_xxxa_portable; | 242 auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable; |
240 proc(dst, src, count); | 243 proc(dst, src, count); |
241 } | 244 } |
242 | 245 |
243 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { | 246 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { |
244 premul_xxxa_should_swaprb<false>(dst, src, count); | 247 premul_should_swapRB<false>(dst, src, count); |
245 } | 248 } |
246 | 249 |
247 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | 250 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { |
248 premul_xxxa_should_swaprb<true>(dst, src, count); | 251 premul_should_swapRB<true>(dst, src, count); |
249 } | 252 } |
250 | 253 |
251 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | 254 static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) { |
255 auto src = (const uint32_t*)vsrc; | |
252 const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15); | 256 const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15); |
253 | 257 |
254 while (count >= 4) { | 258 while (count >= 4) { |
255 __m128i bgra = _mm_loadu_si128((const __m128i*) src); | 259 __m128i rgba = _mm_loadu_si128((const __m128i*) src); |
256 __m128i rgba = _mm_shuffle_epi8(bgra, swapRB); | 260 __m128i bgra = _mm_shuffle_epi8(rgba, swapRB); |
257 _mm_storeu_si128((__m128i*) dst, rgba); | 261 _mm_storeu_si128((__m128i*) dst, bgra); |
258 | 262 |
259 src += 4; | 263 src += 4; |
260 dst += 4; | 264 dst += 4; |
261 count -= 4; | 265 count -= 4; |
262 } | 266 } |
263 | 267 |
264 swaprb_xxxa_portable(dst, src, count); | 268 RGBA_to_BGRA_portable(dst, src, count); |
265 } | 269 } |
266 | 270 |
267 #else | 271 #else |
268 | 272 |
269 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { | 273 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { |
270 premul_xxxa_portable(dst, src, count); | 274 RGBA_to_rgbA_portable(dst, src, count); |
271 } | 275 } |
272 | 276 |
273 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | 277 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { |
274 premul_swaprb_xxxa_portable(dst, src, count); | 278 RGBA_to_bgrA_portable(dst, src, count); |
275 } | 279 } |
276 | 280 |
277 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | 281 static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) { |
278 swaprb_xxxa_portable(dst, src, count); | 282 RGBA_to_BGRA_portable(dst, src, count); |
279 } | 283 } |
280 | 284 |
281 #endif | 285 #endif |
282 | 286 |
283 } | 287 } |
284 | 288 |
285 #endif // SkSwizzler_opts_DEFINED | 289 #endif // SkSwizzler_opts_DEFINED |
OLD | NEW |