src/opts/SkSwizzler_opts.h - Issue 1601883002: Add SSSE3 Optimizations for premul and swap

Unified Diff: src/opts/SkSwizzler_opts.h

Issue 1601883002: Add SSSE3 Optimizations for premul and swap (Closed) Base URL: https://skia.googlesource.com/skia.git@f-and-x

Patch Set: Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/opts/SkSwizzler_opts.h

diff --git a/src/opts/SkSwizzler_opts.h b/src/opts/SkSwizzler_opts.h

index c7edcb9ece068ec29b321b95295163e908b6f004..ad935ce1d8f0fcd12c2f20e23c6b48fed3645997 100644

--- a/src/opts/SkSwizzler_opts.h

+++ b/src/opts/SkSwizzler_opts.h

@@ -174,6 +174,139 @@ static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {

swaprb_xxxa_portable(dst, src, count);

}

+#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3

+template <bool kSwapRB>

+static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) {

msarett 2016/01/18 20:35:05 There are a lot of different ways to implement thi

+ const __m128i zeros = _mm_setzero_si128();

+ const __m128i _128 = _mm_set1_epi16(128);

+ const __m128i _257 = _mm_set1_epi16(257);

+ const __m128i combine = _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);

+ __m128i split;

+ if (kSwapRB) {

+ split = _mm_set_epi8(15, 3, 7, 11, 14, 2, 6, 10, 13, 1, 5, 9, 12, 0, 4, 8);

+ } else {

+ split = combine;

+ }

+ while (count >= 8) {

+ __m128i argb_lo = _mm_loadu_si128((const __m128i*) src);

+ __m128i argb_hi = _mm_loadu_si128((const __m128i*) (src + 4));

+ // argb_argb_argb_argb -> aaaa_rrrr_gggg_bbbb

mtklein 2016/01/19 15:59:14 Let's kick some of these comments a little bit hig

msarett 2016/01/19 17:34:38 Done. Ugggh, for some reason I thought the rest o

+ argb_lo = _mm_shuffle_epi8(argb_lo, combine);

+ argb_hi = _mm_shuffle_epi8(argb_hi, combine);

+ // aaaa_rrrr_gggg_bbbb -> aaaa_aaaa_rrrr_rrrr

+ __m128i ar = _mm_unpackhi_epi32(argb_lo, argb_hi);

+ // aaaa_rrrr_gggg_bbbb -> gggg_gggg_bbbb_bbbb

+ __m128i gb = _mm_unpacklo_epi32(argb_lo, argb_hi);

+ // xxxx_xxxx_yyyy_yyyy -> 0x0x_0x0x_0x0x_0x0x

+ // xxxx_xxxx_yyyy_yyyy -> 0y0y_0y0y_0y0y_0y0y

+ __m128i a = _mm_unpackhi_epi8(ar, zeros);

+ __m128i r = _mm_unpacklo_epi8(ar, zeros);

+ __m128i g = _mm_unpackhi_epi8(gb, zeros);

+ __m128i b = _mm_unpacklo_epi8(gb, zeros);

+ // (x + 127) / 255 == ((x + 128) * 257) >> 16 for 0 <= x <= 255 * 255

msarett 2016/01/18 20:35:05 Thanks to Mike for this insight.

+ // Note that _mm_mulhi_epu16 performs the entire (y * 257) >> 16.

+ r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(a, r), _128), _257);

mtklein 2016/01/19 15:59:14 This may be a matter of personal preference, but y

msarett 2016/01/19 17:34:38 Leaving as is, though I'm kind of indifferent. I

+ g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(a, g), _128), _257);

+ b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(a, b), _128), _257);

+ // aaaa_rrrr_aaaa_rrrr

mtklein 2016/01/19 15:59:14 I think we can do this repacking as something like

msarett 2016/01/19 17:34:37 Yes this is better! Let's even swap BR in the "sw

+ ar = _mm_shuffle_epi32(_mm_packus_epi16(r, a), 0xD8);

+ // gggg_bbbb_gggg_bbbb

+ gb = _mm_shuffle_epi32(_mm_packus_epi16(b, g), 0xD8);

+ // aaaa_rrrr_gggg_bbbb

+ argb_lo = _mm_unpacklo_epi64(gb, ar);

+ argb_hi = _mm_unpackhi_epi64(gb, ar);

+ // aaaa_rrrr_gggg_bbbb -> argb_argb_argb_argb

+ argb_lo = _mm_shuffle_epi8(argb_lo, split);

+ argb_hi = _mm_shuffle_epi8(argb_hi, split);

+ _mm_storeu_si128((__m128i*) dst, argb_lo);

+ _mm_storeu_si128((__m128i*) (dst + 4), argb_hi);

+ src += 8;

+ dst += 8;

+ count -= 8;

+ }

+ if (count >= 4) {

mtklein 2016/01/19 15:59:14 Reminder to self to circle back here when we're ha

+ __m128i argb = _mm_loadu_si128((const __m128i*) src);

+ // argb_argb_argb_argb -> aaaa_rrrr_gggg_bbbb

+ argb = _mm_shuffle_epi8(argb, combine);

+ // aaaa_rrrr_gggg_bbbb -> 0000_aaaa_0000_rrrr

+ __m128i ar = _mm_unpackhi_epi32(argb, zeros);

+ // aaaa_rrrr_gggg_bbbb -> 0000_gggg_0000_bbbb

+ __m128i gb = _mm_unpacklo_epi32(argb, zeros);

+ // 0000_xxxx_0000_yyyy -> 0000_0000_0x0x_0x0x

+ // 0000_xxxx_0000_yyyy -> 0000_0000_0y0y_0y0y

+ __m128i a = _mm_unpackhi_epi8(ar, zeros);

+ __m128i r = _mm_unpacklo_epi8(ar, zeros);

+ __m128i g = _mm_unpackhi_epi8(gb, zeros);

+ __m128i b = _mm_unpacklo_epi8(gb, zeros);

+ // (x + 127) / 255 == ((x + 128) * 257) >> 16 for 0 <= x <= 255 * 255

+ // Note that _mm_mulhi_epu16 performs the entire (y * 257) >> 16.

+ r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(a, r), _128), _257);

+ g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(a, g), _128), _257);

+ b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(a, b), _128), _257);

+ // aaaa_rrrr_0000_0000

+ ar = _mm_shuffle_epi32(_mm_packus_epi16(r, a), 0x8F);

+ // 0000_0000_gggg_bbbb

+ gb = _mm_shuffle_epi32(_mm_packus_epi16(b, g), 0xD8);

+ // aaaa_rrrr_gggg_bbbb

+ argb = _mm_or_si128(ar, gb);

+ // aaaa_rrrr_gggg_bbbb -> argb_argb_argb_argb

+ argb = _mm_shuffle_epi8(argb, split);

+ _mm_storeu_si128((__m128i*) dst, argb);

+ src += 4;

+ dst += 4;

+ count -= 4;

+ }

+ // Call portable code to finish up the tail of [0,4) pixels.

+ auto proc = kSwapRB ? premul_swaprb_xxxa_portable : premul_xxxa_portable;

+ proc(dst, src, count);

+}

+static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {

+ premul_xxxa_should_swaprb<false>(dst, src, count);

+}

+static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {

+ premul_xxxa_should_swaprb<true>(dst, src, count);

+}

+static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {

+ const __m128i swapRB = _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2);

mtklein 2016/01/19 15:59:15 I often find it's easier to read these if you use

msarett 2016/01/19 17:34:37 I think you're right.

+ while (count >= 4) {

+ __m128i argb = _mm_loadu_si128((const __m128i*) src);

+ __m128i abgr = _mm_shuffle_epi8(argb, swapRB);

+ _mm_storeu_si128((__m128i*) dst, abgr);

+ src += 4;

+ dst += 4;

+ count -= 4;

+ }

+ swaprb_xxxa_portable(dst, src, count);

+}

#else

static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {

« no previous file with comments | « src/opts/SkOpts_ssse3.cpp ('k') | no next file » | no next file with comments »