 Chromium Code Reviews
 Chromium Code Reviews Issue 1601883002:
  Add SSSE3 Optimizations for premul and swap  (Closed) 
  Base URL: https://skia.googlesource.com/skia.git@f-and-x
    
  
    Issue 1601883002:
  Add SSSE3 Optimizations for premul and swap  (Closed) 
  Base URL: https://skia.googlesource.com/skia.git@f-and-x
| Index: src/opts/SkSwizzler_opts.h | 
| diff --git a/src/opts/SkSwizzler_opts.h b/src/opts/SkSwizzler_opts.h | 
| index c7edcb9ece068ec29b321b95295163e908b6f004..ad935ce1d8f0fcd12c2f20e23c6b48fed3645997 100644 | 
| --- a/src/opts/SkSwizzler_opts.h | 
| +++ b/src/opts/SkSwizzler_opts.h | 
| @@ -174,6 +174,139 @@ static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | 
| swaprb_xxxa_portable(dst, src, count); | 
| } | 
| +#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 | 
| + | 
| +template <bool kSwapRB> | 
| +static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) { | 
| 
msarett
2016/01/18 20:35:05
There are a lot of different ways to implement thi
 | 
| + const __m128i zeros = _mm_setzero_si128(); | 
| + const __m128i _128 = _mm_set1_epi16(128); | 
| + const __m128i _257 = _mm_set1_epi16(257); | 
| + const __m128i combine = _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); | 
| + __m128i split; | 
| + if (kSwapRB) { | 
| + split = _mm_set_epi8(15, 3, 7, 11, 14, 2, 6, 10, 13, 1, 5, 9, 12, 0, 4, 8); | 
| + } else { | 
| + split = combine; | 
| + } | 
| + | 
| + while (count >= 8) { | 
| + __m128i argb_lo = _mm_loadu_si128((const __m128i*) src); | 
| + __m128i argb_hi = _mm_loadu_si128((const __m128i*) (src + 4)); | 
| + | 
| + // argb_argb_argb_argb -> aaaa_rrrr_gggg_bbbb | 
| 
mtklein
2016/01/19 15:59:14
Let's kick some of these comments a little bit hig
 
msarett
2016/01/19 17:34:38
Done.
Ugggh, for some reason I thought the rest o
 | 
| + argb_lo = _mm_shuffle_epi8(argb_lo, combine); | 
| + argb_hi = _mm_shuffle_epi8(argb_hi, combine); | 
| + | 
| + // aaaa_rrrr_gggg_bbbb -> aaaa_aaaa_rrrr_rrrr | 
| + __m128i ar = _mm_unpackhi_epi32(argb_lo, argb_hi); | 
| + // aaaa_rrrr_gggg_bbbb -> gggg_gggg_bbbb_bbbb | 
| + __m128i gb = _mm_unpacklo_epi32(argb_lo, argb_hi); | 
| + | 
| + // xxxx_xxxx_yyyy_yyyy -> 0x0x_0x0x_0x0x_0x0x | 
| + // xxxx_xxxx_yyyy_yyyy -> 0y0y_0y0y_0y0y_0y0y | 
| + __m128i a = _mm_unpackhi_epi8(ar, zeros); | 
| + __m128i r = _mm_unpacklo_epi8(ar, zeros); | 
| + __m128i g = _mm_unpackhi_epi8(gb, zeros); | 
| + __m128i b = _mm_unpacklo_epi8(gb, zeros); | 
| + | 
| + // (x + 127) / 255 == ((x + 128) * 257) >> 16 for 0 <= x <= 255 * 255 | 
| 
msarett
2016/01/18 20:35:05
Thanks to Mike for this insight.
 | 
| + // Note that a single _mm_mulhi_epu16 performs the entire ((x + 128) * 257) >> 16 step: it multiplies and keeps the high 16 bits. | 
| + r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(a, r), _128), _257); | 
| 
mtklein
2016/01/19 15:59:14
This may be a matter of personal preference, but y
 
msarett
2016/01/19 17:34:38
Leaving as is, though I'm kind of indifferent.
I
 | 
| + g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(a, g), _128), _257); | 
| + b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(a, b), _128), _257); | 
| + | 
| + // aaaa_rrrr_aaaa_rrrr | 
| 
mtklein
2016/01/19 15:59:14
I think we can do this repacking as something like
 
msarett
2016/01/19 17:34:37
Yes this is better!
Let's even swap BR in the "sw
 | 
| + ar = _mm_shuffle_epi32(_mm_packus_epi16(r, a), 0xD8); | 
| + // gggg_bbbb_gggg_bbbb | 
| + gb = _mm_shuffle_epi32(_mm_packus_epi16(b, g), 0xD8); | 
| + | 
| + // aaaa_rrrr_gggg_bbbb | 
| + argb_lo = _mm_unpacklo_epi64(gb, ar); | 
| + argb_hi = _mm_unpackhi_epi64(gb, ar); | 
| + | 
| + // aaaa_rrrr_gggg_bbbb -> argb_argb_argb_argb | 
| + argb_lo = _mm_shuffle_epi8(argb_lo, split); | 
| + argb_hi = _mm_shuffle_epi8(argb_hi, split); | 
| + | 
| + _mm_storeu_si128((__m128i*) dst, argb_lo); | 
| + _mm_storeu_si128((__m128i*) (dst + 4), argb_hi); | 
| + | 
| + src += 8; | 
| + dst += 8; | 
| + count -= 8; | 
| + } | 
| + | 
| + if (count >= 4) { | 
| 
mtklein
2016/01/19 15:59:14
Reminder to self to circle back here when we're ha
 | 
| + __m128i argb = _mm_loadu_si128((const __m128i*) src); | 
| + | 
| + // argb_argb_argb_argb -> aaaa_rrrr_gggg_bbbb | 
| + argb = _mm_shuffle_epi8(argb, combine); | 
| + | 
| + // aaaa_rrrr_gggg_bbbb -> 0000_aaaa_0000_rrrr | 
| + __m128i ar = _mm_unpackhi_epi32(argb, zeros); | 
| + // aaaa_rrrr_gggg_bbbb -> 0000_gggg_0000_bbbb | 
| + __m128i gb = _mm_unpacklo_epi32(argb, zeros); | 
| + | 
| + // 0000_xxxx_0000_yyyy -> 0000_0000_0x0x_0x0x | 
| + // 0000_xxxx_0000_yyyy -> 0000_0000_0y0y_0y0y | 
| + __m128i a = _mm_unpackhi_epi8(ar, zeros); | 
| + __m128i r = _mm_unpacklo_epi8(ar, zeros); | 
| + __m128i g = _mm_unpackhi_epi8(gb, zeros); | 
| + __m128i b = _mm_unpacklo_epi8(gb, zeros); | 
| + | 
| + // (x + 127) / 255 == ((x + 128) * 257) >> 16 for 0 <= x <= 255 * 255 | 
| + // Note that a single _mm_mulhi_epu16 performs the entire ((x + 128) * 257) >> 16 step: it multiplies and keeps the high 16 bits. | 
| + r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(a, r), _128), _257); | 
| + g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(a, g), _128), _257); | 
| + b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(a, b), _128), _257); | 
| + | 
| + // aaaa_rrrr_0000_0000 | 
| + ar = _mm_shuffle_epi32(_mm_packus_epi16(r, a), 0x8F); | 
| + // 0000_0000_gggg_bbbb | 
| + gb = _mm_shuffle_epi32(_mm_packus_epi16(b, g), 0xD8); | 
| + | 
| + // aaaa_rrrr_gggg_bbbb | 
| + argb = _mm_or_si128(ar, gb); | 
| + | 
| + // aaaa_rrrr_gggg_bbbb -> argb_argb_argb_argb | 
| + argb = _mm_shuffle_epi8(argb, split); | 
| + | 
| + _mm_storeu_si128((__m128i*) dst, argb); | 
| + | 
| + src += 4; | 
| + dst += 4; | 
| + count -= 4; | 
| + } | 
| + | 
| + // Call portable code to finish up the tail of [0,4) pixels. | 
| + auto proc = kSwapRB ? premul_swaprb_xxxa_portable : premul_xxxa_portable; | 
| + proc(dst, src, count); | 
| +} | 
| + | 
| +static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { | 
| + premul_xxxa_should_swaprb<false>(dst, src, count); | 
| +} | 
| + | 
| +static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | 
| + premul_xxxa_should_swaprb<true>(dst, src, count); | 
| +} | 
| + | 
| +static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { | 
| + const __m128i swapRB = _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2); | 
| 
mtklein
2016/01/19 15:59:15
I often find it's easier to read these if you use
 
msarett
2016/01/19 17:34:37
I think you're right.
 | 
| + | 
| + while (count >= 4) { | 
| + __m128i argb = _mm_loadu_si128((const __m128i*) src); | 
| + __m128i abgr = _mm_shuffle_epi8(argb, swapRB); | 
| + _mm_storeu_si128((__m128i*) dst, abgr); | 
| + | 
| + src += 4; | 
| + dst += 4; | 
| + count -= 4; | 
| + } | 
| + | 
| + swaprb_xxxa_portable(dst, src, count); | 
| +} | 
| + | 
| #else | 
| static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { |