OLD | NEW |
---|---|
1 /* | 1 /* |
2 * Copyright 2016 Google Inc. | 2 * Copyright 2016 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #ifndef SkSwizzler_opts_DEFINED | 8 #ifndef SkSwizzler_opts_DEFINED |
9 #define SkSwizzler_opts_DEFINED | 9 #define SkSwizzler_opts_DEFINED |
10 | 10 |
(...skipping 156 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
167 // Store 8 pixels. | 167 // Store 8 pixels. |
168 vst4_u8((uint8_t*) dst, bgra); | 168 vst4_u8((uint8_t*) dst, bgra); |
169 src += 8; | 169 src += 8; |
170 dst += 8; | 170 dst += 8; |
171 count -= 8; | 171 count -= 8; |
172 } | 172 } |
173 | 173 |
174 swaprb_xxxa_portable(dst, src, count); | 174 swaprb_xxxa_portable(dst, src, count); |
175 } | 175 } |
176 | 176 |
177 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 | |
178 | |
179 template <bool kSwapRB> | |
180 static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) { | |
181 const __m128i zeros = _mm_setzero_si128(); | |
182 const __m128i _128 = _mm_set1_epi16(128); | |
183 const __m128i _257 = _mm_set1_epi16(257); | |
184 __m128i planar; | |
185 if (kSwapRB) { | |
186 planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15); | |
187 } else { | |
188 planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15); | |
189 } | |
190 | |
191 while (count >= 8) { | |
192 // We'll load 8 pixels into 4 registers, each holding a 16-bit component plane. | |
193 | |
194 // First just load the 8 interlaced pixels. | |
195 __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)), // bgrabgra bgrabgra | |
196 hi = _mm_loadu_si128((const __m128i*) (src + 4)); // BGRABGRA BGRABGRA | |
197 | |
198 // Swizzle them to 8-bit planar. | |
199 lo = _mm_shuffle_epi8(lo, planar); // bbbbgggg rrrraaaa | |
200 hi = _mm_shuffle_epi8(hi, planar); // BBBBGGGG RRRRAAAA | |
201 __m128i bg = _mm_unpacklo_epi32(lo, hi), // bbbbBBBB ggggGGGG | |
202 ra = _mm_unpackhi_epi32(lo, hi); // rrrrRRRR aaaaAAAA | |
203 | |
204 // Unpack to 16-bit planar. | |
205 __m128i b = _mm_unpacklo_epi8(bg, zeros), // b_b_b_b_ B_B_B_B_ | |
206 g = _mm_unpackhi_epi8(bg, zeros), // g_g_g_g_ G_G_G_G_ | |
207 r = _mm_unpacklo_epi8(ra, zeros), // r_r_r_r_ R_R_R_R_ | |
208 a = _mm_unpackhi_epi8(ra, zeros); // a_a_a_a_ A_A_A_A_ | |
209 | |
210 // Premultiply! (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255. | |
211 b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(b, a), _128), _257); | |
212 g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(g, a), _128), _257); | |
213 r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(r, a), _128), _257); | |
214 | |
215 // Repack into interlaced pixels. | |
216 bg = _mm_or_si128(b, _mm_slli_epi16(g, 8)); // bgbgbgbg BGBGBGBG | |
217 ra = _mm_or_si128(r, _mm_slli_epi16(a, 8)); // rararara RARARARA | |
218 lo = _mm_unpacklo_epi16(bg, ra); // bgrabgra bgrabgra | |
219 hi = _mm_unpackhi_epi16(bg, ra); // BGRABGRA BGRABGRA | |
220 | |
221 // Store interlaced pixels. | |
222 _mm_storeu_si128((__m128i*) (dst + 0), lo); | |
223 _mm_storeu_si128((__m128i*) (dst + 4), hi); | |
224 | |
225 src += 8; | |
226 dst += 8; | |
227 count -= 8; | |
228 } | |
229 | |
230 if (count >= 4) { | |
mtklein
2016/01/19 18:28:30
OK, now that we've got count >= 8 in shape, let's
msarett
2016/01/19 19:17:43
Done.
| |
231 // First just load 4 interlaced pixels. | |
232 __m128i lo = _mm_loadu_si128((const __m128i*) src); // bgrabgra bgrabgra | |
233 | |
234 // Swizzle them to 8-bit planar. | |
235 lo = _mm_shuffle_epi8(lo, planar); // bbbbgggg rrrraaaa | |
236 __m128i bg = _mm_unpacklo_epi32(lo, zeros), // bbbb____ gggg____ | |
237 ra = _mm_unpackhi_epi32(lo, zeros); // rrrr____ aaaa____ | |
238 | |
239 // Unpack to 16-bit planar. | |
240 __m128i b = _mm_unpacklo_epi8(bg, zeros), // b_b_b_b_ ________ | |
241 g = _mm_unpackhi_epi8(bg, zeros), // g_g_g_g_ ________ | |
242 r = _mm_unpacklo_epi8(ra, zeros), // r_r_r_r_ ________ | |
243 a = _mm_unpackhi_epi8(ra, zeros); // a_a_a_a_ ________ | |
244 | |
245 // Premultiply! (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255. | |
246 b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(b, a), _128), _257); | |
247 g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(g, a), _128), _257); | |
248 r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(r, a), _128), _257); | |
249 | |
250 // Repack into interlaced pixels. | |
251 bg = _mm_or_si128(b, _mm_slli_epi16(g, 8)); // bgbgbgbg ________ | |
252 ra = _mm_or_si128(r, _mm_slli_epi16(a, 8)); // rararara ________ | |
253 lo = _mm_unpacklo_epi16(bg, ra); // bgrabgra bgrabgra | |
254 | |
255 // Store interlaced pixels. | |
256 _mm_storeu_si128((__m128i*) dst, lo); | |
257 | |
258 src += 4; | |
259 dst += 4; | |
260 count -= 4; | |
261 } | |
262 | |
263 // Call portable code to finish up the tail of [0,4) pixels. | |
264 auto proc = kSwapRB ? premul_swaprb_xxxa_portable : premul_xxxa_portable; | |
265 proc(dst, src, count); | |
266 } | |
267 | |
268 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { | |
269 premul_xxxa_should_swaprb<false>(dst, src, count); | |
270 } | |
271 | |
272 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | |
273 premul_xxxa_should_swaprb<true>(dst, src, count); | |
274 } | |
275 | |
276 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | |
277 const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15); | |
278 | |
279 while (count >= 4) { | |
280 __m128i bgra = _mm_loadu_si128((const __m128i*) src); | |
281 __m128i rgba = _mm_shuffle_epi8(bgra, swapRB); | |
282 _mm_storeu_si128((__m128i*) dst, rgba); | |
283 | |
284 src += 4; | |
285 dst += 4; | |
286 count -= 4; | |
287 } | |
288 | |
289 swaprb_xxxa_portable(dst, src, count); | |
290 } | |
291 | |
177 #else | 292 #else |
178 | 293 |
179 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { | 294 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
180 premul_xxxa_portable(dst, src, count); | 295 premul_xxxa_portable(dst, src, count); |
181 } | 296 } |
182 | 297 |
183 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | 298 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
184 premul_swaprb_xxxa_portable(dst, src, count); | 299 premul_swaprb_xxxa_portable(dst, src, count); |
185 } | 300 } |
186 | 301 |
187 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | 302 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
188 swaprb_xxxa_portable(dst, src, count); | 303 swaprb_xxxa_portable(dst, src, count); |
189 } | 304 } |
190 | 305 |
191 #endif | 306 #endif |
192 | 307 |
193 } | 308 } |
194 | 309 |
195 #endif // SkSwizzler_opts_DEFINED | 310 #endif // SkSwizzler_opts_DEFINED |
OLD | NEW |