Index: src/opts/SkSwizzler_opts.h |
diff --git a/src/opts/SkSwizzler_opts.h b/src/opts/SkSwizzler_opts.h |
index 15eec3a355a1bee38f28008c7ef0a1fc36ccafb5..345e6974d020ed1dcaeffa97a7951400c2867c2e 100644 |
--- a/src/opts/SkSwizzler_opts.h |
+++ b/src/opts/SkSwizzler_opts.h |
@@ -12,13 +12,16 @@ |
namespace SK_OPTS_NS { |
+template <int kSampleSize> |
static void RGBA_to_rgbA_portable(uint32_t* dst, const void* vsrc, int count) { |
auto src = (const uint32_t*)vsrc; |
+ int j = 0; |
for (int i = 0; i < count; i++) { |
- uint8_t a = src[i] >> 24, |
- b = src[i] >> 16, |
- g = src[i] >> 8, |
- r = src[i] >> 0; |
+ uint8_t a = src[j] >> 24, |
+ b = src[j] >> 16, |
+ g = src[j] >> 8, |
+ r = src[j] >> 0; |
+ j += kSampleSize; |
b = (b*a+127)/255; |
g = (g*a+127)/255; |
r = (r*a+127)/255; |
@@ -29,13 +32,16 @@ static void RGBA_to_rgbA_portable(uint32_t* dst, const void* vsrc, int count) { |
} |
} |
+template <int kSampleSize> |
static void RGBA_to_bgrA_portable(uint32_t* dst, const void* vsrc, int count) { |
auto src = (const uint32_t*)vsrc; |
+ int j = 0; |
for (int i = 0; i < count; i++) { |
- uint8_t a = src[i] >> 24, |
- b = src[i] >> 16, |
- g = src[i] >> 8, |
- r = src[i] >> 0; |
+ uint8_t a = src[j] >> 24, |
+ b = src[j] >> 16, |
+ g = src[j] >> 8, |
+ r = src[j] >> 0; |
+ j += kSampleSize; |
b = (b*a+127)/255; |
g = (g*a+127)/255; |
r = (r*a+127)/255; |
@@ -225,7 +231,7 @@ static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) { |
} |
// Call portable code to finish up the tail of [0,8) pixels. |
- auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable; |
+ auto proc = kSwapRB ? RGBA_to_bgrA_portable<1> : RGBA_to_rgbA_portable<1>; |
proc(dst, src, count); |
} |
@@ -484,6 +490,121 @@ static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) { |
inverted_cmyk_to<kBGR1>(dst, src, count); |
} |
+// Gathers 8 RGBA pixels from src, taking every second pixel, and hands them |
+// back as one 8-lane vector per channel through r, g, b and a. |
+// Reads 16 consecutive pixels (64 bytes) starting at src. |
+static void load_rgba_sample2(const uint32_t* src, uint8x8_t* r, uint8x8_t* g, uint8x8_t* b, |
+                              uint8x8_t* a) { |
+    // De-interleave 16 pixels into four 16-lane channel vectors. |
+    const uint8x16x4_t planes = vld4q_u8(reinterpret_cast<const uint8_t*>(src)); |
+ |
+    // Within each channel, drop every other lane: |
+    //     pxpxpxpxpxpxpxpx -> pppppppp |
+    // by viewing byte pairs as 16-bit lanes and narrowing them back to bytes. |
+    uint8x8_t* const channels[4] = { r, g, b, a }; |
+    for (int c = 0; c < 4; c++) { |
+        *channels[c] = vmovn_u16(vreinterpretq_u16_u8(planes.val[c])); |
+    } |
+} |
+ |
+// Gathers 8 RGBA pixels spaced kSampleSize pixels apart, starting at src, and |
+// returns them transposed into one 8-lane vector per channel (r, g, b, a). |
+// |
+// Each vld1_u8 fetches 8 bytes: the sampled pixel followed by the next pixel |
+// in memory, which the zips below discard. The pixel immediately after |
+// src[7*kSampleSize] must therefore be readable. |
+template <int kSampleSize> |
+static void load_rgba_sample(const uint32_t* src, uint8x8_t* r, uint8x8_t* g, uint8x8_t* b, |
+                             uint8x8_t* a) { |
+    // The stride is added to the uint32_t pointer *before* the cast to a byte |
+    // pointer, so it is measured in pixels. Casting first and then adding |
+    // would step in bytes and sample the wrong pixels. |
+    uint8x8_t rgba0 = vld1_u8((const uint8_t*) (src + 0*kSampleSize));   // rgba xxxx |
+    uint8x8_t rgba1 = vld1_u8((const uint8_t*) (src + 1*kSampleSize));   // rgba xxxx |
+    uint8x8_t rgba01 = vzip_u8(rgba0, rgba1).val[0];                     // rrgg bbaa |
+ |
+    uint8x8_t rgba2 = vld1_u8((const uint8_t*) (src + 2*kSampleSize));   // rgba xxxx |
+    uint8x8_t rgba3 = vld1_u8((const uint8_t*) (src + 3*kSampleSize));   // rgba xxxx |
+    uint8x8_t rgba23 = vzip_u8(rgba2, rgba3).val[0];                     // rrgg bbaa |
+ |
+    uint16x4x2_t rgba03 = vzip_u16(vreinterpret_u16_u8(rgba01),          // rrrr gggg |
+                                   vreinterpret_u16_u8(rgba23));         // bbbb aaaa |
+ |
+    uint8x8_t rgba4 = vld1_u8((const uint8_t*) (src + 4*kSampleSize));   // rgba xxxx |
+    uint8x8_t rgba5 = vld1_u8((const uint8_t*) (src + 5*kSampleSize));   // rgba xxxx |
+    uint8x8_t rgba45 = vzip_u8(rgba4, rgba5).val[0];                     // rrgg bbaa |
+ |
+    uint8x8_t rgba6 = vld1_u8((const uint8_t*) (src + 6*kSampleSize));   // rgba xxxx |
+    uint8x8_t rgba7 = vld1_u8((const uint8_t*) (src + 7*kSampleSize));   // rgba xxxx |
+    uint8x8_t rgba67 = vzip_u8(rgba6, rgba7).val[0];                     // rrgg bbaa |
+ |
+    uint16x4x2_t rgba47 = vzip_u16(vreinterpret_u16_u8(rgba45),          // rrrr gggg |
+                                   vreinterpret_u16_u8(rgba67));         // bbbb aaaa |
+ |
+    // Join the two 4-pixel halves so each channel ends up in a single vector. |
+    uint32x2x2_t rg = vzip_u32(vreinterpret_u32_u16(rgba03.val[0]),      // rrrr rrrr |
+                               vreinterpret_u32_u16(rgba47.val[0]));     // gggg gggg |
+    uint32x2x2_t ba = vzip_u32(vreinterpret_u32_u16(rgba03.val[1]),      // bbbb bbbb |
+                               vreinterpret_u32_u16(rgba47.val[1]));     // aaaa aaaa |
+ |
+    *r = vreinterpret_u8_u32(rg.val[0]); |
+    *g = vreinterpret_u8_u32(rg.val[1]); |
+    *b = vreinterpret_u8_u32(ba.val[0]); |
+    *a = vreinterpret_u8_u32(ba.val[1]); |
+} |
+ |
+// Premultiplies RGBA pixels while sub-sampling the source: output pixel i is |
+// taken from source pixel i*kSampleSize. When kSwapRB is true the red and |
+// blue channels are exchanged on output. |
+// |
+//   dst   - receives count premultiplied pixels |
+//   vsrc  - source pixels, read as uint32_t |
+//   count - number of output pixels to produce |
+template <bool kSwapRB, int kSampleSize> |
+static void premul_should_swapRB_sample(uint32_t* dst, const void* vsrc, int count) { |
+    const uint32_t* src = static_cast<const uint32_t*>(vsrc); |
+ |
+    // Vector path, 8 output pixels per pass. The threshold is 9 rather than 8 |
+    // because the loaders fetch source bytes beyond the last sampled pixel; |
+    // keeping one output pixel in reserve makes sure we don't read past the |
+    // end of our memory. |
+    for (; count >= 9; count -= 8, dst += 8, src += 8*kSampleSize) { |
+        // Load pixels. |
+        uint8x8_t r, g, b, a; |
+        if (kSampleSize == 2) { |
+            load_rgba_sample2(src, &r, &g, &b, &a); |
+        } else { |
+            load_rgba_sample<kSampleSize>(src, &r, &g, &b, &a); |
+        } |
+ |
+        // Premultiply. |
+        r = scale(r, a); |
+        g = scale(g, a); |
+        b = scale(b, a); |
+ |
+        // Store 8 premultiplied pixels, interleaved back into memory order. |
+        uint8x8x4_t out; |
+        out.val[0] = kSwapRB ? b : r; |
+        out.val[1] = g; |
+        out.val[2] = kSwapRB ? r : b; |
+        out.val[3] = a; |
+        vst4_u8(reinterpret_cast<uint8_t*>(dst), out); |
+    } |
+ |
+    // Hand whatever is left to the portable code with the same stride. |
+    if (kSwapRB) { |
+        RGBA_to_bgrA_portable<kSampleSize>(dst, src, count); |
+    } else { |
+        RGBA_to_rgbA_portable<kSampleSize>(dst, src, count); |
+    } |
+} |
+ |
+// Sampling premul entry points for this branch. Each forwards to |
+// premul_should_swapRB_sample<kSwapRB, kSampleSize>; count is the number of |
+// pixels written to dst. |
+// |
+// Premultiply, keep channel order, sample stride 2. |
+static void RGBA_to_rgbA_sample2(uint32_t dst[], const void* src, int count) { |
+ premul_should_swapRB_sample<false, 2>(dst, src, count); |
+} |
+ |
+// Premultiply, swap R and B, sample stride 2. |
+static void RGBA_to_bgrA_sample2(uint32_t dst[], const void* src, int count) { |
+ premul_should_swapRB_sample<true, 2>(dst, src, count); |
+} |
+ |
+// Premultiply, keep channel order, sample stride 4. |
+static void RGBA_to_rgbA_sample4(uint32_t dst[], const void* src, int count) { |
+ premul_should_swapRB_sample<false, 4>(dst, src, count); |
+} |
+ |
+// Premultiply, swap R and B, sample stride 4. |
+static void RGBA_to_bgrA_sample4(uint32_t dst[], const void* src, int count) { |
+ premul_should_swapRB_sample<true, 4>(dst, src, count); |
+} |
+ |
+// Premultiply, keep channel order, sample stride 8. |
+static void RGBA_to_rgbA_sample8(uint32_t dst[], const void* src, int count) { |
+ premul_should_swapRB_sample<false, 8>(dst, src, count); |
+} |
+ |
+// Premultiply, swap R and B, sample stride 8. |
+static void RGBA_to_bgrA_sample8(uint32_t dst[], const void* src, int count) { |
+ premul_should_swapRB_sample<true, 8>(dst, src, count); |
+} |
+ |
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
// Scale a byte by another. |
@@ -561,7 +682,7 @@ static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) { |
} |
// Call portable code to finish up the tail of [0,4) pixels. |
- auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable; |
+ auto proc = kSwapRB ? RGBA_to_bgrA_portable<1> : RGBA_to_rgbA_portable<1>; |
proc(dst, src, count); |
} |
@@ -791,14 +912,38 @@ static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) { |
inverted_cmyk_to<kBGR1>(dst, src, count); |
} |
+// Sampling premul entry points for this branch. There is no vectorized |
+// sampling path here; each forwards straight to the portable template with |
+// the matching stride. |
+// |
+// Premultiply, rgbA output, sample stride 2. |
+static void RGBA_to_rgbA_sample2(uint32_t dst[], const void* src, int count) { |
+ RGBA_to_rgbA_portable<2>(dst, src, count); |
+} |
+ |
+// Premultiply, bgrA output, sample stride 2. |
+static void RGBA_to_bgrA_sample2(uint32_t dst[], const void* src, int count) { |
+ RGBA_to_bgrA_portable<2>(dst, src, count); |
+} |
+ |
+// Premultiply, rgbA output, sample stride 4. |
+static void RGBA_to_rgbA_sample4(uint32_t dst[], const void* src, int count) { |
+ RGBA_to_rgbA_portable<4>(dst, src, count); |
+} |
+ |
+// Premultiply, bgrA output, sample stride 4. |
+static void RGBA_to_bgrA_sample4(uint32_t dst[], const void* src, int count) { |
+ RGBA_to_bgrA_portable<4>(dst, src, count); |
+} |
+ |
+// Premultiply, rgbA output, sample stride 8. |
+static void RGBA_to_rgbA_sample8(uint32_t dst[], const void* src, int count) { |
+ RGBA_to_rgbA_portable<8>(dst, src, count); |
+} |
+ |
+// Premultiply, bgrA output, sample stride 8. |
+static void RGBA_to_bgrA_sample8(uint32_t dst[], const void* src, int count) { |
+ RGBA_to_bgrA_portable<8>(dst, src, count); |
+} |
+ |
#else |
static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { |
- RGBA_to_rgbA_portable(dst, src, count); |
+ RGBA_to_rgbA_portable<1>(dst, src, count); |
} |
static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { |
- RGBA_to_bgrA_portable(dst, src, count); |
+ RGBA_to_bgrA_portable<1>(dst, src, count); |
} |
static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) { |
@@ -833,6 +978,30 @@ static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) { |
inverted_CMYK_to_BGR1_portable(dst, src, count); |
} |
+// Sampling premul entry points for the fallback (#else) branch. Each forwards |
+// to the portable template with the matching stride. |
+// |
+// Premultiply, rgbA output, sample stride 2. |
+static void RGBA_to_rgbA_sample2(uint32_t dst[], const void* src, int count) { |
+ RGBA_to_rgbA_portable<2>(dst, src, count); |
+} |
+ |
+// Premultiply, bgrA output, sample stride 2. |
+static void RGBA_to_bgrA_sample2(uint32_t dst[], const void* src, int count) { |
+ RGBA_to_bgrA_portable<2>(dst, src, count); |
+} |
+ |
+// Premultiply, rgbA output, sample stride 4. |
+static void RGBA_to_rgbA_sample4(uint32_t dst[], const void* src, int count) { |
+ RGBA_to_rgbA_portable<4>(dst, src, count); |
+} |
+ |
+// Premultiply, bgrA output, sample stride 4. |
+static void RGBA_to_bgrA_sample4(uint32_t dst[], const void* src, int count) { |
+ RGBA_to_bgrA_portable<4>(dst, src, count); |
+} |
+ |
+// Premultiply, rgbA output, sample stride 8. |
+static void RGBA_to_rgbA_sample8(uint32_t dst[], const void* src, int count) { |
+ RGBA_to_rgbA_portable<8>(dst, src, count); |
+} |
+ |
+// Premultiply, bgrA output, sample stride 8. |
+static void RGBA_to_bgrA_sample8(uint32_t dst[], const void* src, int count) { |
+ RGBA_to_bgrA_portable<8>(dst, src, count); |
+} |
+ |
#endif |
} |