Index: src/opts/SkSwizzler_opts.h |
diff --git a/src/opts/SkSwizzler_opts.h b/src/opts/SkSwizzler_opts.h |
index b0cf4cad5324eae5eb64fcce75eb755d253a02d5..baef30d4ddcae5ddbc20ad32246fdceb71ab6fac 100644 |
--- a/src/opts/SkSwizzler_opts.h |
+++ b/src/opts/SkSwizzler_opts.h |
@@ -60,6 +60,34 @@ static void swaprb_xxxa_portable(uint32_t dst[], const uint32_t src[], int count |
} |
} |
+static void xxx_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) { /* Expands count packed 3-byte pixels read from src into 32-bit pixels in dst, filling the top byte with 0xFF. */ |
mtklein
2016/01/22 14:37:32
src is three-byte right, not some sort of RGBx? S
msarett
2016/01/22 15:00:36
Agreed. I made a similar comment in Patch Set 1.
mtklein
2016/01/22 15:23:57
Good. Was going to suggest that. :)
While they'r
msarett
2016/01/22 17:27:23
Done.
|
+ int i8 = 0; /* byte offset into src8; advances by three for every output pixel */ |
+ const uint8_t* src8 = (const uint8_t*) src; /* src is walked one byte at a time despite its uint32_t element type */ |
+ for (int i32 = 0; i32 < count; i32++) { |
+ uint8_t b = src8[i8++], |
+ g = src8[i8++], |
+ r = src8[i8++]; |
+ dst[i32] = (uint32_t) b << 0 /* first source byte lands in bits 0-7 */ |
mtklein
2016/01/22 14:37:32
Let's keep our order consistent with the rest of t
msarett
2016/01/22 15:00:36
Done.
|
+ | (uint32_t) g << 8 |
+ | (uint32_t) r << 16 /* third source byte lands in bits 16-23 */ |
+ | (uint32_t)0xFF << 24; /* constant 0xFF occupies bits 24-31 */ |
+ } |
+} |
+ |
+static void xxx_swaprb_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) { /* Same expansion as xxx_xxxa_portable, but the first and third source bytes trade places in the output word. */ |
+ int i8 = 0; /* byte offset into src8; advances by three for every output pixel */ |
+ const uint8_t* src8 = (const uint8_t*) src; /* src is walked one byte at a time despite its uint32_t element type */ |
+ for (int i32 = 0; i32 < count; i32++) { |
+ uint8_t b = src8[i8++], |
+ g = src8[i8++], |
+ r = src8[i8++]; |
+ dst[i32] = (uint32_t) r << 0 /* third source byte lands in bits 0-7 */ |
+ | (uint32_t) g << 8 |
+ | (uint32_t) b << 16 /* first source byte lands in bits 16-23 */ |
+ | (uint32_t)0xFF << 24; /* constant 0xFF occupies bits 24-31 */ |
+ } |
+} |
+ |
#if defined(SK_ARM_HAS_NEON) |
// Rounded divide by 255, (x + 127) / 255 |
@@ -168,6 +196,68 @@ static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
swaprb_xxxa_portable(dst, src, count); |
} |
+template <bool kSwapRB> |
+static void xxx_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) { /* NEON path: converts 3-byte pixels to 4-byte pixels with a 0xFF fourth channel; kSwapRB exchanges channels 0 and 2. */ |
+ const uint8_t* src8 = (const uint8_t*) src; /* source is consumed as raw bytes, three per pixel */ |
+ while (count >= 16) { |
+ // Load 16 pixels. |
+ uint8x16x3_t bgr = vld3q_u8(src8); |
+ |
+ // Insert an opaque alpha channel and swap if needed. |
+ uint8x16x4_t bgra; |
+ if (kSwapRB) { |
+ bgra.val[0] = bgr.val[2]; |
+ bgra.val[2] = bgr.val[0]; |
+ } else { |
+ bgra.val[0] = bgr.val[0]; |
+ bgra.val[2] = bgr.val[2]; |
+ } |
+ bgra.val[1] = bgr.val[1]; /* middle channel is never moved */ |
+ bgra.val[3] = vdupq_n_u8(0xFF); |
+ |
+ // Store 16 pixels. |
+ vst4q_u8((uint8_t*) dst, bgra); |
+ src8 += 48; /* 16 pixels times 3 bytes each */ |
mtklein
2016/01/22 14:37:33
might write this as += 16*3?
I find it really ple
msarett
2016/01/22 15:00:36
Done.
|
+ dst += 16; /* dst is uint32_t, so this is 16 output pixels */ |
+ count -= 16; |
+ } |
+ |
+ if (count >= 8) { /* at most one 8-pixel step can remain once the 16-pixel loop has finished */ |
+ // Load 8 pixels. |
+ uint8x8x3_t bgr = vld3_u8(src8); |
+ |
+ // Insert an opaque alpha channel and swap if needed. |
+ uint8x8x4_t bgra; |
+ if (kSwapRB) { |
+ bgra.val[0] = bgr.val[2]; |
+ bgra.val[2] = bgr.val[0]; |
+ } else { |
+ bgra.val[0] = bgr.val[0]; |
+ bgra.val[2] = bgr.val[2]; |
+ } |
+ bgra.val[1] = bgr.val[1]; /* middle channel is never moved */ |
+ bgra.val[3] = vdup_n_u8(0xFF); |
+ |
+ // Store 8 pixels. |
+ vst4_u8((uint8_t*) dst, bgra); |
+ src8 += 24; /* 8 pixels times 3 bytes each */ |
+ dst += 8; |
+ count -= 8; |
+ } |
+ |
+ // Call portable code to finish up the tail of [0,8) pixels. |
+ auto proc = kSwapRB ? xxx_swaprb_xxxa_portable : xxx_xxxa_portable; |
+ proc(dst, (const uint32_t*) src8, count); /* cast back only to match the portable signature; the callee reads bytes again */ |
+} |
+ |
+static void xxx_xxxa(uint32_t dst[], const uint32_t src[], int count) { /* Entry point that keeps channel order: instantiates xxx_xxxa_should_swaprb with kSwapRB = false. */ |
+ xxx_xxxa_should_swaprb<false>(dst, src, count); |
+} |
+ |
+static void xxx_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { /* Entry point that swaps channels 0 and 2: instantiates xxx_xxxa_should_swaprb with kSwapRB = true. */ |
+ xxx_xxxa_should_swaprb<true>(dst, src, count); |
+} |
+ |
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
template <bool kSwapRB> |
@@ -264,6 +354,14 @@ static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
swaprb_xxxa_portable(dst, src, count); |
} |
+static void xxx_xxxa(uint32_t dst[], const uint32_t src[], int count) { /* No vectorized variant is added in this branch; forwards unchanged to xxx_xxxa_portable. */ |
+ xxx_xxxa_portable(dst, src, count); |
+} |
+ |
+static void xxx_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { /* No vectorized variant is added in this branch; forwards unchanged to xxx_swaprb_xxxa_portable. */ |
+ xxx_swaprb_xxxa_portable(dst, src, count); |
+} |
+ |
#else |
static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
@@ -278,6 +376,14 @@ static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
swaprb_xxxa_portable(dst, src, count); |
} |
+static void xxx_xxxa(uint32_t dst[], const uint32_t src[], int count) { /* Fallback (#else) branch: forwards unchanged to xxx_xxxa_portable. */ |
+ xxx_xxxa_portable(dst, src, count); |
+} |
+ |
+static void xxx_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { /* Fallback (#else) branch: forwards unchanged to xxx_swaprb_xxxa_portable. */ |
+ xxx_swaprb_xxxa_portable(dst, src, count); |
+} |
+ |
#endif |
} |