OLD | NEW |
---|---|
1 /* | 1 /* |
2 * Copyright 2016 Google Inc. | 2 * Copyright 2016 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #ifndef SkSwizzler_opts_DEFINED | 8 #ifndef SkSwizzler_opts_DEFINED |
9 #define SkSwizzler_opts_DEFINED | 9 #define SkSwizzler_opts_DEFINED |
10 | 10 |
(...skipping 156 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
167 // Store 8 pixels. | 167 // Store 8 pixels. |
168 vst4_u8((uint8_t*) dst, bgra); | 168 vst4_u8((uint8_t*) dst, bgra); |
169 src += 8; | 169 src += 8; |
170 dst += 8; | 170 dst += 8; |
171 count -= 8; | 171 count -= 8; |
172 } | 172 } |
173 | 173 |
174 swaprb_xxxa_portable(dst, src, count); | 174 swaprb_xxxa_portable(dst, src, count); |
175 } | 175 } |
176 | 176 |
177 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 | |
178 | |
179 template <bool kSwapRB> | |
180 static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) { | |
msarett
2016/01/18 20:35:05
There are a lot of different ways to implement thi
| |
181 const __m128i zeros = _mm_setzero_si128(); | |
182 const __m128i _128 = _mm_set1_epi16(128); | |
183 const __m128i _257 = _mm_set1_epi16(257); | |
184 const __m128i combine = _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1 , 12, 8, 4, 0); | |
185 __m128i split; | |
186 if (kSwapRB) { | |
187 split = _mm_set_epi8(15, 3, 7, 11, 14, 2, 6, 10, 13, 1, 5, 9, 12, 0, 4, 8); | |
188 } else { | |
189 split = combine; | |
190 } | |
191 | |
192 while (count >= 8) { | |
193 __m128i argb_lo = _mm_loadu_si128((const __m128i*) src); | |
194 __m128i argb_hi = _mm_loadu_si128((const __m128i*) (src + 4)); | |
195 | |
196 // argb_argb_argb_argb -> aaaa_rrrr_gggg_bbbb | |
mtklein
2016/01/19 15:59:14
Let's kick some of these comments a little bit hig
msarett
2016/01/19 17:34:38
Done.
Ugggh, for some reason I thought the rest o
| |
197 argb_lo = _mm_shuffle_epi8(argb_lo, combine); | |
198 argb_hi = _mm_shuffle_epi8(argb_hi, combine); | |
199 | |
200 // aaaa_rrrr_gggg_bbbb -> aaaa_aaaa_rrrr_rrrr | |
201 __m128i ar = _mm_unpackhi_epi32(argb_lo, argb_hi); | |
202 // aaaa_rrrr_gggg_bbbb -> gggg_gggg_bbbb_bbbb | |
203 __m128i gb = _mm_unpacklo_epi32(argb_lo, argb_hi); | |
204 | |
205 // xxxx_xxxx_yyyy_yyyy -> 0x0x_0x0x_0x0x_0x0x | |
206 // xxxx_xxxx_yyyy_yyyy -> 0y0y_0y0y_0y0y_0y0y | |
207 __m128i a = _mm_unpackhi_epi8(ar, zeros); | |
208 __m128i r = _mm_unpacklo_epi8(ar, zeros); | |
209 __m128i g = _mm_unpackhi_epi8(gb, zeros); | |
210 __m128i b = _mm_unpacklo_epi8(gb, zeros); | |
211 | |
212 // (x + 127) / 255 == ((x + 128) * 257) >> 16 for 0 <= x <= 255 * 255 | |
msarett
2016/01/18 20:35:05
Thanks to Mike for this insight.
| |
213 // Note that _mm_mulhi_epu16 performs the entire (y * 257) >> 16. | |
214 r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(a, r), _128), _257); | |
mtklein
2016/01/19 15:59:14
This may be a matter of personal preference, but y
msarett
2016/01/19 17:34:38
Leaving as is, though I'm kind of indifferent.
I
| |
215 g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(a, g), _128), _257); | |
216 b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(a, b), _128), _257); | |
217 | |
218 // aaaa_rrrr_aaaa_rrrr | |
mtklein
2016/01/19 15:59:14
I think we can do this repacking as something like
msarett
2016/01/19 17:34:37
Yes this is better!
Let's even swap BR in the "sw
| |
219 ar = _mm_shuffle_epi32(_mm_packus_epi16(r, a), 0xD8); | |
220 // gggg_bbbb_gggg_bbbb | |
221 gb = _mm_shuffle_epi32(_mm_packus_epi16(b, g), 0xD8); | |
222 | |
223 // aaaa_rrrr_gggg_bbbb | |
224 argb_lo = _mm_unpacklo_epi64(gb, ar); | |
225 argb_hi = _mm_unpackhi_epi64(gb, ar); | |
226 | |
227 // aaaa_rrrr_gggg_bbbb -> argb_argb_argb_argb | |
228 argb_lo = _mm_shuffle_epi8(argb_lo, split); | |
229 argb_hi = _mm_shuffle_epi8(argb_hi, split); | |
230 | |
231 _mm_storeu_si128((__m128i*) dst, argb_lo); | |
232 _mm_storeu_si128((__m128i*) (dst + 4), argb_hi); | |
233 | |
234 src += 8; | |
235 dst += 8; | |
236 count -= 8; | |
237 } | |
238 | |
239 if (count >= 4) { | |
mtklein
2016/01/19 15:59:14
Reminder to self to circle back here when we're ha
| |
240 __m128i argb = _mm_loadu_si128((const __m128i*) src); | |
241 | |
242 // argb_argb_argb_argb -> aaaa_rrrr_gggg_bbbb | |
243 argb = _mm_shuffle_epi8(argb, combine); | |
244 | |
245 // aaaa_rrrr_gggg_bbbb -> 0000_aaaa_0000_rrrr | |
246 __m128i ar = _mm_unpackhi_epi32(argb, zeros); | |
247 // aaaa_rrrr_gggg_bbbb -> 0000_gggg_0000_bbbb | |
248 __m128i gb = _mm_unpacklo_epi32(argb, zeros); | |
249 | |
250 // xxxx_xxxx_yyyy_yyyy -> 0x0x_0x0x_0x0x_0x0x | |
251 // xxxx_xxxx_yyyy_yyyy -> 0y0y_0y0y_0y0y_0y0y | |
252 __m128i a = _mm_unpackhi_epi8(ar, zeros); | |
253 __m128i r = _mm_unpacklo_epi8(ar, zeros); | |
254 __m128i g = _mm_unpackhi_epi8(gb, zeros); | |
255 __m128i b = _mm_unpacklo_epi8(gb, zeros); | |
256 | |
257 // (x + 127) / 255 == ((x + 128) * 257) >> 16 for 0 <= x <= 255 * 255 | |
258 // Note that _mm_mulhi_epu16 performs the entire (y * 257) >> 16. | |
259 r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(a, r), _128), _257); | |
260 g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(a, g), _128), _257); | |
261 b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(a, b), _128), _257); | |
262 | |
263 // aaaa_rrrr_0000_0000 | |
264 ar = _mm_shuffle_epi32(_mm_packus_epi16(r, a), 0x8F); | |
265 // 0000_0000_gggg_bbbb | |
266 gb = _mm_shuffle_epi32(_mm_packus_epi16(b, g), 0xD8); | |
267 | |
268 // aaaa_rrrr_gggg_bbbb | |
269 argb = _mm_or_si128(ar, gb); | |
270 | |
271 // aaaa_rrrr_gggg_bbbb -> argb_argb_argb_argb | |
272 argb = _mm_shuffle_epi8(argb, split); | |
273 | |
274 _mm_storeu_si128((__m128i*) dst, argb); | |
275 | |
276 src += 4; | |
277 dst += 4; | |
278 count -= 4; | |
279 } | |
280 | |
281 // Call portable code to finish up the tail of [0,4) pixels. | |
282 auto proc = kSwapRB ? premul_swaprb_xxxa_portable : premul_xxxa_portable; | |
283 proc(dst, src, count); | |
284 } | |
285 | |
// SSSE3 build of premul_xxxa: forwards dst, src and count unchanged to
// premul_xxxa_should_swaprb instantiated with kSwapRB = false.
286 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { | |
287 premul_xxxa_should_swaprb<false>(dst, src, count); | |
288 } | |
289 | |
// SSSE3 build of premul_swaprb_xxxa: forwards dst, src and count unchanged to
// premul_xxxa_should_swaprb instantiated with kSwapRB = true.
290 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | |
291 premul_xxxa_should_swaprb<true>(dst, src, count); | |
292 } | |
293 | |
294 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | |
295 const __m128i swapRB = _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6 , 3, 0, 1, 2); | |
mtklein
2016/01/19 15:59:15
I often find it's easier to read these if you use
msarett
2016/01/19 17:34:37
I think you're right.
| |
296 | |
297 while (count >= 4) { | |
298 __m128i argb = _mm_loadu_si128((const __m128i*) src); | |
299 __m128i abgr = _mm_shuffle_epi8(argb, swapRB); | |
300 _mm_storeu_si128((__m128i*) dst, abgr); | |
301 | |
302 src += 4; | |
303 dst += 4; | |
304 count -= 4; | |
305 } | |
306 | |
307 swaprb_xxxa_portable(dst, src, count); | |
308 } | |
309 | |
177 #else | 310 #else |
178 | 311 |
// `#else` branch (taken when none of the preceding SIMD conditions hold):
// premul_xxxa forwards its arguments unchanged to premul_xxxa_portable.
179 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { | 312 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
180 premul_xxxa_portable(dst, src, count); | 313 premul_xxxa_portable(dst, src, count); |
181 } | 314 } |
182 | 315 |
// `#else` branch: premul_swaprb_xxxa forwards its arguments unchanged to
// premul_swaprb_xxxa_portable.
183 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | 316 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
184 premul_swaprb_xxxa_portable(dst, src, count); | 317 premul_swaprb_xxxa_portable(dst, src, count); |
185 } | 318 } |
186 | 319 |
// `#else` branch: swaprb_xxxa forwards its arguments unchanged to
// swaprb_xxxa_portable.
187 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | 320 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
188 swaprb_xxxa_portable(dst, src, count); | 321 swaprb_xxxa_portable(dst, src, count); |
189 } | 322 } |
190 | 323 |
191 #endif | 324 #endif |
192 | 325 |
193 } | 326 } |
194 | 327 |
195 #endif // SkSwizzler_opts_DEFINED | 328 #endif // SkSwizzler_opts_DEFINED |
OLD | NEW |