OLD | NEW |
---|---|
1 /* | 1 /* |
2 * Copyright 2016 Google Inc. | 2 * Copyright 2016 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #ifndef SkSwizzler_opts_DEFINED | 8 #ifndef SkSwizzler_opts_DEFINED |
9 #define SkSwizzler_opts_DEFINED | 9 #define SkSwizzler_opts_DEFINED |
10 | 10 |
(...skipping 156 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
167 // Store 8 pixels. | 167 // Store 8 pixels. |
168 vst4_u8((uint8_t*) dst, bgra); | 168 vst4_u8((uint8_t*) dst, bgra); |
169 src += 8; | 169 src += 8; |
170 dst += 8; | 170 dst += 8; |
171 count -= 8; | 171 count -= 8; |
172 } | 172 } |
173 | 173 |
174 swaprb_xxxa_portable(dst, src, count); | 174 swaprb_xxxa_portable(dst, src, count); |
175 } | 175 } |
176 | 176 |
177 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 | |
178 | |
// Premultiplies bytes 0..2 of every 32-bit pixel by that pixel's byte 3 (alpha), | |
// rounding as (x+127)/255, and writes the result to dst. Alpha is copied through | |
// unchanged. When kSwapRB is true the load shuffle also exchanges bytes 0 and 2, | |
// so the output has those two channels swapped relative to the input. | |
// Pixels are handled 8 at a time, then one group of 4 if available, and the final | |
// [0,4) pixels go to the matching *_portable routine. | |
179 template <bool kSwapRB> | |
180 static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) { | |
181 const __m128i zeros = _mm_setzero_si128(); | |
182 const __m128i _128 = _mm_set1_epi16(128); | |
183 const __m128i _257 = _mm_set1_epi16(257); | |
// Byte-shuffle mask that gathers each channel of 4 pixels into 4 adjacent bytes. | |
// The kSwapRB variant picks byte 2 first and byte 0 third, which swaps those channels. | |
184 __m128i planar; | |
185 if (kSwapRB) { | |
186 planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15); | |
187 } else { | |
188 planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15); | |
189 } | |
190 | |
// premul8 premultiplies the 8 pixels held in *lo (first 4) and *hi (last 4) in place. | |
191 auto premul8 = [&zeros, &_128, &_257, &planar](__m128i* lo, __m128i* hi) { | |
mtklein
2016/01/19 20:15:02
The comments inside the while loop are now probabl
mtklein
2016/01/19 20:15:02
Just out of curiosity, what happens to the codegen
msarett
2016/01/19 21:02:39
Codegen is unaffected by moving the constants insi
| |
192 // Swizzle the pixels to 8-bit planar. | |
193 *lo = _mm_shuffle_epi8(*lo, planar); // bbbbgggg rrrraaaa | |
194 *hi = _mm_shuffle_epi8(*hi, planar); // BBBBGGGG RRRRAAAA | |
195 __m128i bg = _mm_unpacklo_epi32(*lo, *hi), // bbbbBBBB ggggGGGG | |
196 ra = _mm_unpackhi_epi32(*lo, *hi); // rrrrRRRR aaaaAAAA | |
197 | |
198 // Unpack to 16-bit planar. | |
199 __m128i b = _mm_unpacklo_epi8(bg, zeros), // b_b_b_b_ B_B_B_B_ | |
200 g = _mm_unpackhi_epi8(bg, zeros), // g_g_g_g_ G_G_G_G_ | |
201 r = _mm_unpacklo_epi8(ra, zeros), // r_r_r_r_ R_R_R_R_ | |
202 a = _mm_unpackhi_epi8(ra, zeros); // a_a_a_a_ A_A_A_A_ | |
203 | |
204 // Premultiply! (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255. | |
// _mm_mulhi_epu16 keeps the high 16 bits of each product, which supplies the >>16. | |
205 b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(b, a), _128), _257); | |
206 g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(g, a), _128), _257); | |
207 r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(r, a), _128), _257); | |
208 | |
209 // Repack into interlaced pixels. | |
210 bg = _mm_or_si128(b, _mm_slli_epi16(g, 8)); // bgbgbgbg BGBGBGBG | |
211 ra = _mm_or_si128(r, _mm_slli_epi16(a, 8)); // rararara RARARARA | |
212 *lo = _mm_unpacklo_epi16(bg, ra); // bgrabgra bgrabgra | |
213 *hi = _mm_unpackhi_epi16(bg, ra); // BGRABGRA BGRABGRA | |
214 }; | |
215 | |
// Main loop: two unaligned 16-byte loads/stores cover 8 pixels per iteration. | |
216 while (count >= 8) { | |
217 // First just load the 8 interlaced pixels. | |
218 __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)), // bgrabgra bgrabgra | |
219 hi = _mm_loadu_si128((const __m128i*) (src + 4)); // BGRABGRA BGRABGRA | |
220 | |
221 premul8(&lo, &hi); | |
222 | |
223 // Store interlaced pixels. | |
224 _mm_storeu_si128((__m128i*) (dst + 0), lo); | |
225 _mm_storeu_si128((__m128i*) (dst + 4), hi); | |
226 | |
227 src += 8; | |
228 dst += 8; | |
229 count -= 8; | |
230 } | |
231 | |
// At most one group of 4 remains here; hi is zero padding and its result is not stored. | |
232 if (count >= 4) { | |
233 // First just load 4 interlaced pixels. | |
234 __m128i lo = _mm_loadu_si128((const __m128i*) src), // bgrabgra bgrabgra | |
235 hi = _mm_setzero_si128(); | |
236 | |
237 premul8(&lo, &hi); | |
238 | |
239 // Store interlaced pixels. | |
240 _mm_storeu_si128((__m128i*) dst, lo); | |
241 | |
242 src += 4; | |
243 dst += 4; | |
244 count -= 4; | |
245 } | |
246 | |
247 // Call portable code to finish up the tail of [0,4) pixels. | |
248 auto proc = kSwapRB ? premul_swaprb_xxxa_portable : premul_xxxa_portable; | |
249 proc(dst, src, count); | |
250 } | |
251 | |
// SSSE3 branch: premultiply count pixels from src into dst without swapping channels. | |
// Forwards to premul_xxxa_should_swaprb with kSwapRB = false. | |
252 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { | |
253 premul_xxxa_should_swaprb<false>(dst, src, count); | |
254 } | |
255 | |
// SSSE3 branch: premultiply count pixels from src into dst and swap bytes 0 and 2. | |
// Forwards to premul_xxxa_should_swaprb with kSwapRB = true. | |
256 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | |
257 premul_xxxa_should_swaprb<true>(dst, src, count); | |
258 } | |
259 | |
// SSSE3 branch: swaps bytes 0 and 2 of every 32-bit pixel, leaving bytes 1 and 3 in place. | |
// Four pixels are handled per iteration with a single byte shuffle; the remaining | |
// [0,4) pixels are handed to swaprb_xxxa_portable. | |
260 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | |
// Shuffle mask: within each 4-byte pixel the output bytes come from input bytes 2,1,0,3. | |
261 const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15); | |
262 | |
263 while (count >= 4) { | |
// Unaligned load/store, so src and dst need no particular alignment. | |
264 __m128i bgra = _mm_loadu_si128((const __m128i*) src); | |
265 __m128i rgba = _mm_shuffle_epi8(bgra, swapRB); | |
266 _mm_storeu_si128((__m128i*) dst, rgba); | |
267 | |
268 src += 4; | |
269 dst += 4; | |
270 count -= 4; | |
271 } | |
272 | |
// Finish the tail of fewer than 4 pixels with the scalar routine. | |
273 swaprb_xxxa_portable(dst, src, count); | |
274 } | |
275 | |
177 #else | 276 #else |
178 | 277 |
// #else branch (no earlier preprocessor branch selected): forwards to premul_xxxa_portable. | // #else branch (no earlier preprocessor branch selected): forwards to premul_xxxa_portable. |
179 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { | 278 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
180 premul_xxxa_portable(dst, src, count); | 279 premul_xxxa_portable(dst, src, count); |
181 } | 280 } |
182 | 281 |
// #else branch (no earlier preprocessor branch selected): forwards to premul_swaprb_xxxa_portable. | // #else branch (no earlier preprocessor branch selected): forwards to premul_swaprb_xxxa_portable. |
183 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | 282 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
184 premul_swaprb_xxxa_portable(dst, src, count); | 283 premul_swaprb_xxxa_portable(dst, src, count); |
185 } | 284 } |
186 | 285 |
// #else branch (no earlier preprocessor branch selected): forwards to swaprb_xxxa_portable. | // #else branch (no earlier preprocessor branch selected): forwards to swaprb_xxxa_portable. |
187 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | 286 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
188 swaprb_xxxa_portable(dst, src, count); | 287 swaprb_xxxa_portable(dst, src, count); |
189 } | 288 } |
190 | 289 |
191 #endif | 290 #endif |
192 | 291 |
193 } | 292 } |
194 | 293 |
195 #endif // SkSwizzler_opts_DEFINED | 294 #endif // SkSwizzler_opts_DEFINED |
OLD | NEW |