Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(424)

Unified Diff: src/opts/SkSwizzler_opts.h

Issue 1601883002: Add SSSE3 Optimizations for premul and swap (Closed) Base URL: https://skia.googlesource.com/skia.git@f-and-x
Patch Set: Faster repacking, style, comments Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « src/opts/SkOpts_ssse3.cpp ('k') | no next file » | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/opts/SkSwizzler_opts.h
diff --git a/src/opts/SkSwizzler_opts.h b/src/opts/SkSwizzler_opts.h
index c7edcb9ece068ec29b321b95295163e908b6f004..2d8476d11b2c54867c7c0b73b7ba0a2d85f312be 100644
--- a/src/opts/SkSwizzler_opts.h
+++ b/src/opts/SkSwizzler_opts.h
@@ -174,6 +174,121 @@ static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {
swaprb_xxxa_portable(dst, src, count);
}
+#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
+
+// Premultiplies bytes 0..2 of every 32-bit pixel by byte 3 (alpha), using the rounding
+// ((x*a + 128) * 257) >> 16.  When kSwapRB is true, bytes 0 and 2 of each pixel are also
+// exchanged.  In the SIMD paths below the alpha byte is passed through unmodified.
+//
+//   dst, src: pixel arrays; accessed with unaligned loads and stores.
+//   count:    number of pixels to convert.
+//
+// Groups of 8 pixels are processed first, then at most one group of 4; the remaining
+// [0,4) pixels are handed to the matching portable routine.
+//
+// Lane comments are written for the kSwapRB == false byte order (b,g,r,a).  With
+// kSwapRB == true the shuffle mask puts byte 2 in the first plane and byte 0 in the third,
+// so repacking the planes in the same order writes them out swapped.
+template <bool kSwapRB>
+static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) {
+    const __m128i zeros = _mm_setzero_si128();
+    const __m128i _128 = _mm_set1_epi16(128);
+    const __m128i _257 = _mm_set1_epi16(257);
+
+    // Byte shuffle that gathers each component of 4 interlaced pixels into its own 32-bit lane.
+    __m128i planar;
+    if (kSwapRB) {
+        planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
+    } else {
+        planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
+    }
+
+    while (count >= 8) {
+        // We'll load 8 pixels into 4 registers, each holding a 16-bit component plane.
+
+        // First just load the 8 interlaced pixels.
+        __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),  // bgrabgra bgrabgra
+                hi = _mm_loadu_si128((const __m128i*) (src + 4));  // BGRABGRA BGRABGRA
+
+        // Swizzle them to 8-bit planar.  Each half must be shuffled from itself: hi carries
+        // pixels 4..7, and shuffling lo twice would duplicate pixels 0..3 into the upper half.
+        lo = _mm_shuffle_epi8(lo, planar);                         // bbbbgggg rrrraaaa
+        hi = _mm_shuffle_epi8(hi, planar);                         // BBBBGGGG RRRRAAAA
+        __m128i bg = _mm_unpacklo_epi32(lo, hi),                   // bbbbBBBB ggggGGGG
+                ra = _mm_unpackhi_epi32(lo, hi);                   // rrrrRRRR aaaaAAAA
+
+        // Unpack to 16-bit planar.
+        __m128i b = _mm_unpacklo_epi8(bg, zeros),                  // b_b_b_b_ B_B_B_B_
+                g = _mm_unpackhi_epi8(bg, zeros),                  // g_g_g_g_ G_G_G_G_
+                r = _mm_unpacklo_epi8(ra, zeros),                  // r_r_r_r_ R_R_R_R_
+                a = _mm_unpackhi_epi8(ra, zeros);                  // a_a_a_a_ A_A_A_A_
+
+        // Premultiply!  (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
+        // The product of two bytes plus 128 is at most 65153, so it fits in 16 bits.
+        b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(b, a), _128), _257);
+        g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(g, a), _128), _257);
+        r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(r, a), _128), _257);
+
+        // Repack into interlaced pixels.
+        bg = _mm_or_si128(b, _mm_slli_epi16(g, 8));                // bgbgbgbg BGBGBGBG
+        ra = _mm_or_si128(r, _mm_slli_epi16(a, 8));                // rararara RARARARA
+        lo = _mm_unpacklo_epi16(bg, ra);                           // bgrabgra bgrabgra
+        hi = _mm_unpackhi_epi16(bg, ra);                           // BGRABGRA BGRABGRA
+
+        // Store interlaced pixels.
+        _mm_storeu_si128((__m128i*) (dst + 0), lo);
+        _mm_storeu_si128((__m128i*) (dst + 4), hi);
+
+        src += 8;
+        dst += 8;
+        count -= 8;
+    }
+
+    if (count >= 4) {
mtklein 2016/01/19 18:28:30 OK, now that we've got count >= 8 in shape, let's
msarett 2016/01/19 19:17:43 Done.
+        // Same pipeline as above for a single register of 4 pixels; the upper half of
+        // every intermediate plane is zero padding and is dropped when repacking.
+
+        // First just load 4 interlaced pixels.
+        __m128i lo = _mm_loadu_si128((const __m128i*) src);        // bgrabgra bgrabgra
+
+        // Swizzle them to 8-bit planar.
+        lo = _mm_shuffle_epi8(lo, planar);                         // bbbbgggg rrrraaaa
+        __m128i bg = _mm_unpacklo_epi32(lo, zeros),                // bbbb____ gggg____
+                ra = _mm_unpackhi_epi32(lo, zeros);                // rrrr____ aaaa____
+
+        // Unpack to 16-bit planar.
+        __m128i b = _mm_unpacklo_epi8(bg, zeros),                  // b_b_b_b_ ________
+                g = _mm_unpackhi_epi8(bg, zeros),                  // g_g_g_g_ ________
+                r = _mm_unpacklo_epi8(ra, zeros),                  // r_r_r_r_ ________
+                a = _mm_unpackhi_epi8(ra, zeros);                  // a_a_a_a_ ________
+
+        // Premultiply!  (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
+        b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(b, a), _128), _257);
+        g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(g, a), _128), _257);
+        r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(r, a), _128), _257);
+
+        // Repack into interlaced pixels.
+        bg = _mm_or_si128(b, _mm_slli_epi16(g, 8));                // bgbgbgbg ________
+        ra = _mm_or_si128(r, _mm_slli_epi16(a, 8));                // rararara ________
+        lo = _mm_unpacklo_epi16(bg, ra);                           // bgrabgra bgrabgra
+
+        // Store interlaced pixels.
+        _mm_storeu_si128((__m128i*) dst, lo);
+
+        src += 4;
+        dst += 4;
+        count -= 4;
+    }
+
+    // Call portable code to finish up the tail of [0,4) pixels.
+    auto proc = kSwapRB ? premul_swaprb_xxxa_portable : premul_xxxa_portable;
+    proc(dst, src, count);
+}
+
+// SSSE3 entry point for premultiplying `count` pixels from src into dst.
+// Forwards to premul_xxxa_should_swaprb with the byte swap disabled.
+static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {
+    constexpr bool kSwapRB = false;
+    premul_xxxa_should_swaprb<kSwapRB>(dst, src, count);
+}
+
+// SSSE3 entry point for premultiplying `count` pixels from src into dst while also
+// exchanging bytes 0 and 2 of each pixel.
+// Forwards to premul_xxxa_should_swaprb with the byte swap enabled.
+static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {
+    constexpr bool kSwapRB = true;
+    premul_xxxa_should_swaprb<kSwapRB>(dst, src, count);
+}
+
+// Copies `count` 32-bit pixels from src to dst, exchanging bytes 0 and 2 of each pixel and
+// leaving bytes 1 and 3 where they are.
+static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {
+    // Per-pixel byte shuffle: within every group of four bytes, indices 0 and 2 trade places.
+    const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);
+
+    // One 128-bit register (4 pixels) per iteration, using unaligned loads and stores.
+    for (; count >= 4; count -= 4, src += 4, dst += 4) {
+        const __m128i pixels = _mm_loadu_si128((const __m128i*) src);
+        _mm_storeu_si128((__m128i*) dst, _mm_shuffle_epi8(pixels, swapRB));
+    }
+
+    // Whatever is left ([0,4) pixels) goes through the portable routine.
+    swaprb_xxxa_portable(dst, src, count);
+}
+
#else
static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {
« no previous file with comments | « src/opts/SkOpts_ssse3.cpp ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698