Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1)

Side by Side Diff: src/opts/SkSwizzler_opts.h

Issue 1601883002: Add SSSE3 Optimizations for premul and swap (Closed) Base URL: https://skia.googlesource.com/skia.git@f-and-x
Patch Set: Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/opts/SkOpts_ssse3.cpp ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2016 Google Inc. 2 * Copyright 2016 Google Inc.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #ifndef SkSwizzler_opts_DEFINED 8 #ifndef SkSwizzler_opts_DEFINED
9 #define SkSwizzler_opts_DEFINED 9 #define SkSwizzler_opts_DEFINED
10 10
(...skipping 156 matching lines...) Expand 10 before | Expand all | Expand 10 after
167 // Store 8 pixels. 167 // Store 8 pixels.
168 vst4_u8((uint8_t*) dst, bgra); 168 vst4_u8((uint8_t*) dst, bgra);
169 src += 8; 169 src += 8;
170 dst += 8; 170 dst += 8;
171 count -= 8; 171 count -= 8;
172 } 172 }
173 173
174 swaprb_xxxa_portable(dst, src, count); 174 swaprb_xxxa_portable(dst, src, count);
175 } 175 }
176 176
177 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
178
// Premultiplies bytes 0-2 of every 32-bit pixel in src by that pixel's byte 3
// (the channel called "a" below) and writes the result to dst; byte 3 itself is
// carried through unchanged. When kSwapRB is true, bytes 0 and 2 of each output
// pixel are additionally exchanged.
// Pixels are processed 8 at a time, then at most one group of 4, and the final
// [0,4) pixels are handed to the matching *_portable routine.
179 template <bool kSwapRB>
180 static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) {
msarett 2016/01/18 20:35:05 There are a lot of different ways to implement thi
181 const __m128i zeros = _mm_setzero_si128();
182 const __m128i _128 = _mm_set1_epi16(128);
183 const __m128i _257 = _mm_set1_epi16(257);
// combine: byte shuffle that gathers byte k of all four pixels into dword k
// (a 4x4 byte transpose of the register).
184 const __m128i combine = _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1 , 12, 8, 4, 0);
// split: shuffle applied just before storing. The transpose is its own inverse,
// so the non-swapping case reuses combine; the swapping mask instead takes
// output byte 0 from dword 2 and output byte 2 from dword 0.
185 __m128i split;
186 if (kSwapRB) {
187 split = _mm_set_epi8(15, 3, 7, 11, 14, 2, 6, 10, 13, 1, 5, 9, 12, 0, 4, 8);
188 } else {
189 split = combine;
190 }
191
192 while (count >= 8) {
193 __m128i argb_lo = _mm_loadu_si128((const __m128i*) src);
194 __m128i argb_hi = _mm_loadu_si128((const __m128i*) (src + 4));
195
196 // argb_argb_argb_argb -> aaaa_rrrr_gggg_bbbb
mtklein 2016/01/19 15:59:14 Let's kick some of these comments a little bit hig
msarett 2016/01/19 17:34:38 Done. Ugggh, for some reason I thought the rest o
197 argb_lo = _mm_shuffle_epi8(argb_lo, combine);
198 argb_hi = _mm_shuffle_epi8(argb_hi, combine);
199
200 // aaaa_rrrr_gggg_bbbb -> aaaa_aaaa_rrrr_rrrr
201 __m128i ar = _mm_unpackhi_epi32(argb_lo, argb_hi);
202 // aaaa_rrrr_gggg_bbbb -> gggg_gggg_bbbb_bbbb
203 __m128i gb = _mm_unpacklo_epi32(argb_lo, argb_hi);
204
// Zero-extend every channel byte to a 16-bit lane so that the product
// a * channel (at most 255 * 255) fits in one lane.
205 // xxxx_xxxx_yyyy_yyyy -> 0x0x_0x0x_0x0x_0x0x
206 // xxxx_xxxx_yyyy_yyyy -> 0y0y_0y0y_0y0y_0y0y
207 __m128i a = _mm_unpackhi_epi8(ar, zeros);
208 __m128i r = _mm_unpacklo_epi8(ar, zeros);
209 __m128i g = _mm_unpackhi_epi8(gb, zeros);
210 __m128i b = _mm_unpacklo_epi8(gb, zeros);
211
212 // (x + 127) / 255 == ((x + 128) * 257) >> 16 for 0 <= x <= 255 * 255
msarett 2016/01/18 20:35:05 Thanks to Mike for this insight.
213 // Note that _mm_mulhi_epu16 performs the entire (y * 257) >> 16.
214 r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(a, r), _128), _257);
mtklein 2016/01/19 15:59:14 This may be a matter of personal preference, but y
msarett 2016/01/19 17:34:38 Leaving as is, though I'm kind of indifferent. I
215 g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(a, g), _128), _257);
216 b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(a, b), _128), _257);
217
// After packing, dwords 0-1 hold one channel (low 4 pixels, high 4 pixels) and
// dwords 2-3 the other. 0xD8 reorders dwords [0,1,2,3] -> [0,2,1,3] so that the
// two channels of the same 4-pixel group sit next to each other.
218 // aaaa_rrrr_aaaa_rrrr
mtklein 2016/01/19 15:59:14 I think we can do this repacking as something like
msarett 2016/01/19 17:34:37 Yes this is better! Let's even swap BR in the "sw
219 ar = _mm_shuffle_epi32(_mm_packus_epi16(r, a), 0xD8);
220 // gggg_bbbb_gggg_bbbb
221 gb = _mm_shuffle_epi32(_mm_packus_epi16(b, g), 0xD8);
222
223 // aaaa_rrrr_gggg_bbbb
224 argb_lo = _mm_unpacklo_epi64(gb, ar);
225 argb_hi = _mm_unpackhi_epi64(gb, ar);
226
227 // aaaa_rrrr_gggg_bbbb -> argb_argb_argb_argb
228 argb_lo = _mm_shuffle_epi8(argb_lo, split);
229 argb_hi = _mm_shuffle_epi8(argb_hi, split);
230
231 _mm_storeu_si128((__m128i*) dst, argb_lo);
232 _mm_storeu_si128((__m128i*) (dst + 4), argb_hi);
233
234 src += 8;
235 dst += 8;
236 count -= 8;
237 }
238
// Same pipeline for a single group of 4 pixels: the second input register is
// replaced by zeros, so only the low four 16-bit lanes of each channel carry
// pixel data.
239 if (count >= 4) {
mtklein 2016/01/19 15:59:14 Reminder to self to circle back here when we're ha
240 __m128i argb = _mm_loadu_si128((const __m128i*) src);
241
242 // argb_argb_argb_argb -> aaaa_rrrr_gggg_bbbb
243 argb = _mm_shuffle_epi8(argb, combine);
244
245 // aaaa_rrrr_gggg_bbbb -> 0000_aaaa_0000_rrrr
246 __m128i ar = _mm_unpackhi_epi32(argb, zeros);
247 // aaaa_rrrr_gggg_bbbb -> 0000_gggg_0000_bbbb
248 __m128i gb = _mm_unpacklo_epi32(argb, zeros);
249
250 // xxxx_xxxx_yyyy_yyyy -> 0x0x_0x0x_0x0x_0x0x
251 // xxxx_xxxx_yyyy_yyyy -> 0y0y_0y0y_0y0y_0y0y
252 __m128i a = _mm_unpackhi_epi8(ar, zeros);
253 __m128i r = _mm_unpacklo_epi8(ar, zeros);
254 __m128i g = _mm_unpackhi_epi8(gb, zeros);
255 __m128i b = _mm_unpacklo_epi8(gb, zeros);
256
257 // (x + 127) / 255 == ((x + 128) * 257) >> 16 for 0 <= x <= 255 * 255
258 // Note that _mm_mulhi_epu16 performs the entire (y * 257) >> 16.
259 r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(a, r), _128), _257);
260 g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(a, g), _128), _257);
261 b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(a, b), _128), _257);
262
// The unused lanes stay zero through the multiply ((0 + 128) * 257 >> 16 == 0),
// so the packed halves can be merged with a plain OR. 0x8F selects dwords
// [3,3,0,2]: zero into the low half, r into dword 2, a into dword 3.
263 // aaaa_rrrr_0000_0000
264 ar = _mm_shuffle_epi32(_mm_packus_epi16(r, a), 0x8F);
265 // 0000_0000_gggg_bbbb
266 gb = _mm_shuffle_epi32(_mm_packus_epi16(b, g), 0xD8);
267
268 // aaaa_rrrr_gggg_bbbb
269 argb = _mm_or_si128(ar, gb);
270
271 // aaaa_rrrr_gggg_bbbb -> argb_argb_argb_argb
272 argb = _mm_shuffle_epi8(argb, split);
273
274 _mm_storeu_si128((__m128i*) dst, argb);
275
276 src += 4;
277 dst += 4;
278 count -= 4;
279 }
280
281 // Call portable code to finish up the tail of [0,4) pixels.
282 auto proc = kSwapRB ? premul_swaprb_xxxa_portable : premul_xxxa_portable;
283 proc(dst, src, count);
284 }
285
// SSSE3 entry point for premultiplication without a channel swap: instantiates
// the shared routine with kSwapRB = false.
286 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {
287 premul_xxxa_should_swaprb<false>(dst, src, count);
288 }
289
// SSSE3 entry point for premultiplication that also exchanges bytes 0 and 2 of
// each pixel: instantiates the shared routine with kSwapRB = true.
290 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {
291 premul_xxxa_should_swaprb<true>(dst, src, count);
292 }
293
// Copies count 32-bit pixels from src to dst, exchanging bytes 0 and 2 of every
// pixel and leaving bytes 1 and 3 in place. Four pixels are handled per
// iteration with a single byte shuffle; the remaining [0,4) pixels go to
// swaprb_xxxa_portable.
294 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {
// Per-pixel byte selector (listed from byte 15 down to byte 0): 2,1,0,3.
295 const __m128i swapRB = _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6 , 3, 0, 1, 2);
mtklein 2016/01/19 15:59:15 I often find it's easier to read these if you use
msarett 2016/01/19 17:34:37 I think you're right.
296
297 while (count >= 4) {
298 __m128i argb = _mm_loadu_si128((const __m128i*) src);
299 __m128i abgr = _mm_shuffle_epi8(argb, swapRB);
300 _mm_storeu_si128((__m128i*) dst, abgr);
301
302 src += 4;
303 dst += 4;
304 count -= 4;
305 }
306
307 swaprb_xxxa_portable(dst, src, count);
308 }
309
177 #else 310 #else
178 311
// #else branch of the CPU-feature chain: forwards directly to
// premul_xxxa_portable. (Shown in both diff columns because the line is unchanged.)
179 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { 312 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {
180 premul_xxxa_portable(dst, src, count); 313 premul_xxxa_portable(dst, src, count);
181 } 314 }
182 315
// #else branch of the CPU-feature chain: forwards directly to
// premul_swaprb_xxxa_portable.
183 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { 316 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {
184 premul_swaprb_xxxa_portable(dst, src, count); 317 premul_swaprb_xxxa_portable(dst, src, count);
185 } 318 }
186 319
// #else branch of the CPU-feature chain: forwards directly to
// swaprb_xxxa_portable.
187 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { 320 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {
188 swaprb_xxxa_portable(dst, src, count); 321 swaprb_xxxa_portable(dst, src, count);
189 } 322 }
190 323
191 #endif 324 #endif
192 325
193 } 326 }
194 327
195 #endif // SkSwizzler_opts_DEFINED 328 #endif // SkSwizzler_opts_DEFINED
OLDNEW
« no previous file with comments | « src/opts/SkOpts_ssse3.cpp ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698