OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2016 Google Inc. | 2 * Copyright 2016 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #ifndef SkSwizzler_opts_DEFINED | 8 #ifndef SkSwizzler_opts_DEFINED |
9 #define SkSwizzler_opts_DEFINED | 9 #define SkSwizzler_opts_DEFINED |
10 | 10 |
(...skipping 156 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
167 // Store 8 pixels. | 167 // Store 8 pixels. |
168 vst4_u8((uint8_t*) dst, bgra); | 168 vst4_u8((uint8_t*) dst, bgra); |
169 src += 8; | 169 src += 8; |
170 dst += 8; | 170 dst += 8; |
171 count -= 8; | 171 count -= 8; |
172 } | 172 } |
173 | 173 |
174 swaprb_xxxa_portable(dst, src, count); | 174 swaprb_xxxa_portable(dst, src, count); |
175 } | 175 } |
176 | 176 |
| 177 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
| 178 |
| 179 template <bool kSwapRB> |
| 180 static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int
count) { |
| 181 |
| 182 auto premul8 = [](__m128i* lo, __m128i* hi) { |
| 183 const __m128i zeros = _mm_setzero_si128(); |
| 184 const __m128i _128 = _mm_set1_epi16(128); |
| 185 const __m128i _257 = _mm_set1_epi16(257); |
| 186 __m128i planar; |
| 187 if (kSwapRB) { |
| 188 planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15); |
| 189 } else { |
| 190 planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15); |
| 191 } |
| 192 |
| 193 // Swizzle the pixels to 8-bit planar. |
| 194 *lo = _mm_shuffle_epi8(*lo, planar); // bbbbgggg rr
rraaaa |
| 195 *hi = _mm_shuffle_epi8(*hi, planar); // BBBBGGGG RR
RRAAAA |
| 196 __m128i bg = _mm_unpacklo_epi32(*lo, *hi), // bbbbBBBB gg
ggGGGG |
| 197 ra = _mm_unpackhi_epi32(*lo, *hi); // rrrrRRRR aa
aaAAAA |
| 198 |
| 199 // Unpack to 16-bit planar. |
| 200 __m128i b = _mm_unpacklo_epi8(bg, zeros), // b_b_b_b_ B_
B_B_B_ |
| 201 g = _mm_unpackhi_epi8(bg, zeros), // g_g_g_g_ G_
G_G_G_ |
| 202 r = _mm_unpacklo_epi8(ra, zeros), // r_r_r_r_ R_
R_R_R_ |
| 203 a = _mm_unpackhi_epi8(ra, zeros); // a_a_a_a_ A_
A_A_A_ |
| 204 |
| 205 // Premultiply! (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255. |
| 206 b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(b, a), _128), _257); |
| 207 g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(g, a), _128), _257); |
| 208 r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(r, a), _128), _257); |
| 209 |
| 210 // Repack into interlaced pixels. |
| 211 bg = _mm_or_si128(b, _mm_slli_epi16(g, 8)); // bgbgbgbg BG
BGBGBG |
| 212 ra = _mm_or_si128(r, _mm_slli_epi16(a, 8)); // rararara RA
RARARA |
| 213 *lo = _mm_unpacklo_epi16(bg, ra); // bgrabgra bg
rabgra |
| 214 *hi = _mm_unpackhi_epi16(bg, ra); // BRGABGRA BG
RABGRA |
| 215 }; |
| 216 |
| 217 while (count >= 8) { |
| 218 __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)), |
| 219 hi = _mm_loadu_si128((const __m128i*) (src + 4)); |
| 220 |
| 221 premul8(&lo, &hi); |
| 222 |
| 223 _mm_storeu_si128((__m128i*) (dst + 0), lo); |
| 224 _mm_storeu_si128((__m128i*) (dst + 4), hi); |
| 225 |
| 226 src += 8; |
| 227 dst += 8; |
| 228 count -= 8; |
| 229 } |
| 230 |
| 231 if (count >= 4) { |
| 232 __m128i lo = _mm_loadu_si128((const __m128i*) src), |
| 233 hi = _mm_setzero_si128(); |
| 234 |
| 235 premul8(&lo, &hi); |
| 236 |
| 237 _mm_storeu_si128((__m128i*) dst, lo); |
| 238 |
| 239 src += 4; |
| 240 dst += 4; |
| 241 count -= 4; |
| 242 } |
| 243 |
| 244 // Call portable code to finish up the tail of [0,4) pixels. |
| 245 auto proc = kSwapRB ? premul_swaprb_xxxa_portable : premul_xxxa_portable; |
| 246 proc(dst, src, count); |
| 247 } |
| 248 |
// Premultiply count 8888 pixels, leaving the channel order unchanged.
static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {
    premul_xxxa_should_swaprb<false>(dst, src, count);
}
| 252 |
// Premultiply count 8888 pixels and swap the R and B channels in one pass.
static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {
    premul_xxxa_should_swaprb<true>(dst, src, count);
}
| 256 |
| 257 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
| 258 const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,1
5); |
| 259 |
| 260 while (count >= 4) { |
| 261 __m128i bgra = _mm_loadu_si128((const __m128i*) src); |
| 262 __m128i rgba = _mm_shuffle_epi8(bgra, swapRB); |
| 263 _mm_storeu_si128((__m128i*) dst, rgba); |
| 264 |
| 265 src += 4; |
| 266 dst += 4; |
| 267 count -= 4; |
| 268 } |
| 269 |
| 270 swaprb_xxxa_portable(dst, src, count); |
| 271 } |
| 272 |
177 #else | 273 #else |
178 | 274 |
// No SIMD available: premultiply via the portable implementation.
static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {
    premul_xxxa_portable(dst, src, count);
}
182 | 278 |
// No SIMD available: premultiply and swap R/B via the portable implementation.
static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {
    premul_swaprb_xxxa_portable(dst, src, count);
}
186 | 282 |
// No SIMD available: swap R/B via the portable implementation.
static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {
    swaprb_xxxa_portable(dst, src, count);
}
190 | 286 |
191 #endif | 287 #endif |
192 | 288 |
193 } | 289 } |
194 | 290 |
195 #endif // SkSwizzler_opts_DEFINED | 291 #endif // SkSwizzler_opts_DEFINED |
OLD | NEW |