| Index: src/opts/SkSwizzler_opts.h
|
| diff --git a/src/opts/SkSwizzler_opts.h b/src/opts/SkSwizzler_opts.h
|
| index 15eec3a355a1bee38f28008c7ef0a1fc36ccafb5..e7f50c7d356418a29766ffadcbce468f859c6244 100644
|
| --- a/src/opts/SkSwizzler_opts.h
|
| +++ b/src/opts/SkSwizzler_opts.h
|
| @@ -12,13 +12,16 @@
|
|
|
| namespace SK_OPTS_NS {
|
|
|
| +template <int kSampleSize>
|
| static void RGBA_to_rgbA_portable(uint32_t* dst, const void* vsrc, int count) {
|
| auto src = (const uint32_t*)vsrc;
|
| + int j = 0;
|
| for (int i = 0; i < count; i++) {
|
| - uint8_t a = src[i] >> 24,
|
| - b = src[i] >> 16,
|
| - g = src[i] >> 8,
|
| - r = src[i] >> 0;
|
| + uint8_t a = src[j] >> 24,
|
| + b = src[j] >> 16,
|
| + g = src[j] >> 8,
|
| + r = src[j] >> 0;
|
| + j += kSampleSize;
|
| b = (b*a+127)/255;
|
| g = (g*a+127)/255;
|
| r = (r*a+127)/255;
|
| @@ -29,13 +32,16 @@ static void RGBA_to_rgbA_portable(uint32_t* dst, const void* vsrc, int count) {
|
| }
|
| }
|
|
|
| +template <int kSampleSize>
|
| static void RGBA_to_bgrA_portable(uint32_t* dst, const void* vsrc, int count) {
|
| auto src = (const uint32_t*)vsrc;
|
| + int j = 0;
|
| for (int i = 0; i < count; i++) {
|
| - uint8_t a = src[i] >> 24,
|
| - b = src[i] >> 16,
|
| - g = src[i] >> 8,
|
| - r = src[i] >> 0;
|
| + uint8_t a = src[j] >> 24,
|
| + b = src[j] >> 16,
|
| + g = src[j] >> 8,
|
| + r = src[j] >> 0;
|
| + j += kSampleSize;
|
| b = (b*a+127)/255;
|
| g = (g*a+127)/255;
|
| r = (r*a+127)/255;
|
| @@ -225,7 +231,7 @@ static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
|
| }
|
|
|
| // Call portable code to finish up the tail of [0,8) pixels.
|
| - auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
|
| + auto proc = kSwapRB ? RGBA_to_bgrA_portable<1> : RGBA_to_rgbA_portable<1>;
|
| proc(dst, src, count);
|
| }
|
|
|
| @@ -484,6 +490,148 @@ static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
|
| inverted_cmyk_to<kBGR1>(dst, src, count);
|
| }
|
|
|
| +// Loads the 16 RGBA pixels at |src| and keeps one pixel out of every two, returning the
|
| +// 8 kept pixels as one 8-lane vector per channel in |r|, |g|, |b| and |a|.
|
| +static void load_rgba_sample2(const uint32_t* src, uint8x8_t* r, uint8x8_t* g, uint8x8_t* b,
|
| +                              uint8x8_t* a) {
|
| +    // De-interleaving load: 16 pixels in, one 16-byte plane per channel out.
|
| +    const uint8x16x4_t planes = vld4q_u8((const uint8_t*) src);
|
| +
|
| +    // View every plane as 8 lanes of 16 bits, i.e. as 8 pairs of neighboring pixels.
|
| +    const uint16x8_t rPairs = vreinterpretq_u16_u8(planes.val[0]);
|
| +    const uint16x8_t gPairs = vreinterpretq_u16_u8(planes.val[1]);
|
| +    const uint16x8_t bPairs = vreinterpretq_u16_u8(planes.val[2]);
|
| +    const uint16x8_t aPairs = vreinterpretq_u16_u8(planes.val[3]);
|
| +
|
| +    // Narrowing each 16-bit lane to its low 8 bits drops one pixel of every pair.
|
| +    // pxpxpxpxpxpxpxpx -> pppppppp
|
| +    *r = vmovn_u16(rPairs);
|
| +    *g = vmovn_u16(gPairs);
|
| +    *b = vmovn_u16(bPairs);
|
| +    *a = vmovn_u16(aPairs);
|
| +}
|
| +
|
| +// Loads the 32 RGBA pixels at |src| and keeps one pixel out of every four, returning the
|
| +// 8 kept pixels as one 8-lane vector per channel in |r|, |g|, |b| and |a|.
|
| +static void load_rgba_sample4(const uint32_t* src, uint8x8_t* r, uint8x8_t* g, uint8x8_t* b,
|
| +                              uint8x8_t* a) {
|
| +    // Load 32 pixels.  Each vld4q_u8 consumes 16 pixels (64 bytes), so the second load has
|
| +    // to start 16 *pixels* in: the offset is applied to the uint32_t pointer before the
|
| +    // cast.  Adding 16 after the cast would only advance 16 bytes (4 pixels).
|
| +    uint8x16x4_t rgba0 = vld4q_u8((const uint8_t*) src);
|
| +    uint8x16x4_t rgba1 = vld4q_u8((const uint8_t*) (src + 16));
|
| +
|
| +    // Choose 8 pixels.
|
| +    // pxxxpxxxpxxxpxxx -> pxpxpxpx
|
| +    uint16x4_t r0 = vmovn_u32(vreinterpretq_u32_u8(rgba0.val[0])),
|
| +               g0 = vmovn_u32(vreinterpretq_u32_u8(rgba0.val[1])),
|
| +               b0 = vmovn_u32(vreinterpretq_u32_u8(rgba0.val[2])),
|
| +               a0 = vmovn_u32(vreinterpretq_u32_u8(rgba0.val[3]));
|
| +    uint16x4_t r1 = vmovn_u32(vreinterpretq_u32_u8(rgba1.val[0])),
|
| +               g1 = vmovn_u32(vreinterpretq_u32_u8(rgba1.val[1])),
|
| +               b1 = vmovn_u32(vreinterpretq_u32_u8(rgba1.val[2])),
|
| +               a1 = vmovn_u32(vreinterpretq_u32_u8(rgba1.val[3]));
|
| +
|
| +    // Join the two halves and narrow once more to drop the remaining unwanted bytes.
|
| +    // pxpxpxpx, pxpxpxpx -> pppppppp
|
| +    *r = vmovn_u16(vcombine_u16(r0, r1));
|
| +    *g = vmovn_u16(vcombine_u16(g0, g1));
|
| +    *b = vmovn_u16(vcombine_u16(b0, b1));
|
| +    *a = vmovn_u16(vcombine_u16(a0, a1));
|
| +}
|
| +
|
| +// Gathers every 8th RGBA pixel starting at |src| (pixels 0, 8, ..., 56) and returns the 8
|
| +// gathered pixels as one 8-lane vector per channel in |r|, |g|, |b| and |a|.
|
| +static void load_rgba_sample8(const uint32_t* src, uint8x8_t* r, uint8x8_t* g, uint8x8_t* b,
|
| +                              uint8x8_t* a) {
|
| +    // Each vld1_u8 reads 8 bytes: the sampled pixel followed by one neighbor that is thrown
|
| +    // away.  |src| points at uint32_t pixels, so the pixel offset is added before the cast
|
| +    // to bytes; adding it after the cast would step in bytes and sample the wrong pixels.
|
| +    uint8x8_t rgba0 = vld1_u8((const uint8_t*) src);          // rgba xxxx
|
| +    uint8x8_t rgba1 = vld1_u8((const uint8_t*) (src + 8));    // rgba xxxx
|
| +    uint8x8_t rgba01 = vzip_u8(rgba0, rgba1).val[0];          // rrgg bbaa
|
| +
|
| +    uint8x8_t rgba2 = vld1_u8((const uint8_t*) (src + 16));   // rgba xxxx
|
| +    uint8x8_t rgba3 = vld1_u8((const uint8_t*) (src + 24));   // rgba xxxx
|
| +    uint8x8_t rgba23 = vzip_u8(rgba2, rgba3).val[0];          // rrgg bbaa
|
| +
|
| +    // rrrr gggg, bbbb aaaa
|
| +    uint16x4x2_t rgba03 = vzip_u16(vreinterpret_u16_u8(rgba01), vreinterpret_u16_u8(rgba23));
|
| +
|
| +    uint8x8_t rgba4 = vld1_u8((const uint8_t*) (src + 32));   // rgba xxxx
|
| +    uint8x8_t rgba5 = vld1_u8((const uint8_t*) (src + 40));   // rgba xxxx
|
| +    uint8x8_t rgba45 = vzip_u8(rgba4, rgba5).val[0];          // rrgg bbaa
|
| +
|
| +    uint8x8_t rgba6 = vld1_u8((const uint8_t*) (src + 48));   // rgba xxxx
|
| +    uint8x8_t rgba7 = vld1_u8((const uint8_t*) (src + 56));   // rgba xxxx
|
| +    uint8x8_t rgba67 = vzip_u8(rgba6, rgba7).val[0];          // rrgg bbaa
|
| +
|
| +    // rrrr gggg, bbbb aaaa
|
| +    uint16x4x2_t rgba47 = vzip_u16(vreinterpret_u16_u8(rgba45), vreinterpret_u16_u8(rgba67));
|
| +
|
| +    // rrrr rrrr, gggg gggg
|
| +    uint32x2x2_t rg = vzip_u32(vreinterpret_u32_u16(rgba03.val[0]),
|
| +                               vreinterpret_u32_u16(rgba47.val[0]));
|
| +    // bbbb bbbb, aaaa aaaa
|
| +    uint32x2x2_t ba = vzip_u32(vreinterpret_u32_u16(rgba03.val[1]),
|
| +                               vreinterpret_u32_u16(rgba47.val[1]));
|
| +
|
| +    *r = vreinterpret_u8_u32(rg.val[0]);
|
| +    *g = vreinterpret_u8_u32(rg.val[1]);
|
| +    *b = vreinterpret_u8_u32(ba.val[0]);
|
| +    *a = vreinterpret_u8_u32(ba.val[1]);
|
| +}
|
| +
|
| +// Premultiplies a sampled run of RGBA pixels: writes |count| pixels to |dst|, reading every
|
| +// kSampleSize-th pixel of |vsrc| and scaling r, g and b by alpha.  When kSwapRB is true the
|
| +// red and blue channels are exchanged in the output.  kSampleSize must be 2, 4 or 8.
|
| +template <bool kSwapRB, int kSampleSize>
|
| +static void premul_should_swapRB_sample(uint32_t* dst, const void* vsrc, int count) {
|
| +    // Only these sample sizes have a vector loader; reject anything else at compile time
|
| +    // instead of premultiplying uninitialized vectors.
|
| +    static_assert(2 == kSampleSize || 4 == kSampleSize || 8 == kSampleSize,
|
| +                  "premul_should_swapRB_sample only supports sample sizes 2, 4 and 8");
|
| +
|
| +    auto src = (const uint32_t*)vsrc;
|
| +
|
| +    // We must use 9 as the limit to be sure that we don't read past the end of our memory:
|
| +    // the vector loaders read source pixels beyond the 8th sampled one, so a 9th output
|
| +    // pixel must still be pending for those reads to stay inside the source.
|
| +    while (count >= 9) {
|
| +        // Load pixels.
|
| +        uint8x8_t r, g, b, a;
|
| +        if (2 == kSampleSize) {
|
| +            load_rgba_sample2(src, &r, &g, &b, &a);
|
| +        } else if (4 == kSampleSize) {
|
| +            load_rgba_sample4(src, &r, &g, &b, &a);
|
| +        } else {
|
| +            // The static_assert above leaves 8 as the only remaining sample size.
|
| +            load_rgba_sample8(src, &r, &g, &b, &a);
|
| +        }
|
| +
|
| +        // Premultiply.
|
| +        r = scale(r, a);
|
| +        g = scale(g, a);
|
| +        b = scale(b, a);
|
| +
|
| +        // Store 8 premultiplied pixels.
|
| +        uint8x8x4_t result;
|
| +        if (kSwapRB) {
|
| +            result.val[0] = b;
|
| +            result.val[1] = g;
|
| +            result.val[2] = r;
|
| +            result.val[3] = a;
|
| +        } else {
|
| +            result.val[0] = r;
|
| +            result.val[1] = g;
|
| +            result.val[2] = b;
|
| +            result.val[3] = a;
|
| +        }
|
| +        vst4_u8((uint8_t*) dst, result);
|
| +        // 8 output pixels consumed 8 * kSampleSize source pixels.
|
| +        src += 8*kSampleSize;
|
| +        dst += 8;
|
| +        count -= 8;
|
| +    }
|
| +
|
| +    // Call portable code to finish up the remaining [0,9) pixels at the same sample size.
|
| +    auto proc = kSwapRB ? RGBA_to_bgrA_portable<kSampleSize> : RGBA_to_rgbA_portable<kSampleSize>;
|
| +    proc(dst, src, count);
|
| +}
|
| +
|
| +// Premultiplies every 2nd RGBA pixel of |src| into |dst| (|count| output pixels), keeping
|
| +// the channel order.  Forwards to premul_should_swapRB_sample<false, 2>.
|
| +static void RGBA_to_rgbA_sample2(uint32_t dst[], const void* src, int count) {
|
| + premul_should_swapRB_sample<false, 2>(dst, src, count);
|
| +}
|
| +
|
| +// Premultiplies every 2nd RGBA pixel of |src| into |dst| (|count| output pixels), swapping
|
| +// red and blue.  Forwards to premul_should_swapRB_sample<true, 2>.
|
| +static void RGBA_to_bgrA_sample2(uint32_t dst[], const void* src, int count) {
|
| + premul_should_swapRB_sample<true, 2>(dst, src, count);
|
| +}
|
| +
|
| +// Premultiplies every 4th RGBA pixel of |src| into |dst| (|count| output pixels), keeping
|
| +// the channel order.  Forwards to premul_should_swapRB_sample<false, 4>.
|
| +static void RGBA_to_rgbA_sample4(uint32_t dst[], const void* src, int count) {
|
| + premul_should_swapRB_sample<false, 4>(dst, src, count);
|
| +}
|
| +
|
| +// Premultiplies every 4th RGBA pixel of |src| into |dst| (|count| output pixels), swapping
|
| +// red and blue.  Forwards to premul_should_swapRB_sample<true, 4>.
|
| +static void RGBA_to_bgrA_sample4(uint32_t dst[], const void* src, int count) {
|
| + premul_should_swapRB_sample<true, 4>(dst, src, count);
|
| +}
|
| +
|
| +// Premultiplies every 8th RGBA pixel of |src| into |dst| (|count| output pixels), keeping
|
| +// the channel order.  Forwards to premul_should_swapRB_sample<false, 8>.
|
| +static void RGBA_to_rgbA_sample8(uint32_t dst[], const void* src, int count) {
|
| + premul_should_swapRB_sample<false, 8>(dst, src, count);
|
| +}
|
| +
|
| +// Premultiplies every 8th RGBA pixel of |src| into |dst| (|count| output pixels), swapping
|
| +// red and blue.  Forwards to premul_should_swapRB_sample<true, 8>.
|
| +static void RGBA_to_bgrA_sample8(uint32_t dst[], const void* src, int count) {
|
| + premul_should_swapRB_sample<true, 8>(dst, src, count);
|
| +}
|
| +
|
| #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
|
|
|
| // Scale a byte by another.
|
| @@ -561,7 +709,7 @@ static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
|
| }
|
|
|
| // Call portable code to finish up the tail of [0,4) pixels.
|
| - auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
|
| + auto proc = kSwapRB ? RGBA_to_bgrA_portable<1> : RGBA_to_rgbA_portable<1>;
|
| proc(dst, src, count);
|
| }
|
|
|
| @@ -791,14 +939,38 @@ static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
|
| inverted_cmyk_to<kBGR1>(dst, src, count);
|
| }
|
|
|
| +// Sample-size-2 variant: forwards to the portable implementation, which reads every 2nd
|
| +// source pixel.
|
| +static void RGBA_to_rgbA_sample2(uint32_t dst[], const void* src, int count) {
|
| + RGBA_to_rgbA_portable<2>(dst, src, count);
|
| +}
|
| +
|
| +// Sample-size-2 variant: forwards to the portable implementation, which reads every 2nd
|
| +// source pixel.
|
| +static void RGBA_to_bgrA_sample2(uint32_t dst[], const void* src, int count) {
|
| + RGBA_to_bgrA_portable<2>(dst, src, count);
|
| +}
|
| +
|
| +// Sample-size-4 variant: forwards to the portable implementation, which reads every 4th
|
| +// source pixel.
|
| +static void RGBA_to_rgbA_sample4(uint32_t dst[], const void* src, int count) {
|
| + RGBA_to_rgbA_portable<4>(dst, src, count);
|
| +}
|
| +
|
| +// Sample-size-4 variant: forwards to the portable implementation, which reads every 4th
|
| +// source pixel.
|
| +static void RGBA_to_bgrA_sample4(uint32_t dst[], const void* src, int count) {
|
| + RGBA_to_bgrA_portable<4>(dst, src, count);
|
| +}
|
| +
|
| +// Sample-size-8 variant: forwards to the portable implementation, which reads every 8th
|
| +// source pixel.
|
| +static void RGBA_to_rgbA_sample8(uint32_t dst[], const void* src, int count) {
|
| + RGBA_to_rgbA_portable<8>(dst, src, count);
|
| +}
|
| +
|
| +// Sample-size-8 variant: forwards to the portable implementation, which reads every 8th
|
| +// source pixel.
|
| +static void RGBA_to_bgrA_sample8(uint32_t dst[], const void* src, int count) {
|
| + RGBA_to_bgrA_portable<8>(dst, src, count);
|
| +}
|
| +
|
| #else
|
|
|
| +// Fallback (#else) branch: forwards to the portable implementation with a sample size of 1,
|
| +// i.e. every source pixel is read.
|
| static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
|
| - RGBA_to_rgbA_portable(dst, src, count);
|
| + RGBA_to_rgbA_portable<1>(dst, src, count);
|
| }
|
|
|
| +// Fallback (#else) branch: forwards to the portable implementation with a sample size of 1,
|
| +// i.e. every source pixel is read.
|
| static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
|
| - RGBA_to_bgrA_portable(dst, src, count);
|
| + RGBA_to_bgrA_portable<1>(dst, src, count);
|
| }
|
|
|
| static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) {
|
| @@ -833,6 +1005,30 @@ static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
|
| inverted_CMYK_to_BGR1_portable(dst, src, count);
|
| }
|
|
|
| +// Sample-size-2 variant: forwards to the portable implementation, which reads every 2nd
|
| +// source pixel.
|
| +static void RGBA_to_rgbA_sample2(uint32_t dst[], const void* src, int count) {
|
| + RGBA_to_rgbA_portable<2>(dst, src, count);
|
| +}
|
| +
|
| +// Sample-size-2 variant: forwards to the portable implementation, which reads every 2nd
|
| +// source pixel.
|
| +static void RGBA_to_bgrA_sample2(uint32_t dst[], const void* src, int count) {
|
| + RGBA_to_bgrA_portable<2>(dst, src, count);
|
| +}
|
| +
|
| +// Sample-size-4 variant: forwards to the portable implementation, which reads every 4th
|
| +// source pixel.
|
| +static void RGBA_to_rgbA_sample4(uint32_t dst[], const void* src, int count) {
|
| + RGBA_to_rgbA_portable<4>(dst, src, count);
|
| +}
|
| +
|
| +// Sample-size-4 variant: forwards to the portable implementation, which reads every 4th
|
| +// source pixel.
|
| +static void RGBA_to_bgrA_sample4(uint32_t dst[], const void* src, int count) {
|
| + RGBA_to_bgrA_portable<4>(dst, src, count);
|
| +}
|
| +
|
| +// Sample-size-8 variant: forwards to the portable implementation, which reads every 8th
|
| +// source pixel.
|
| +static void RGBA_to_rgbA_sample8(uint32_t dst[], const void* src, int count) {
|
| + RGBA_to_rgbA_portable<8>(dst, src, count);
|
| +}
|
| +
|
| +// Sample-size-8 variant: forwards to the portable implementation, which reads every 8th
|
| +// source pixel.
|
| +static void RGBA_to_bgrA_sample8(uint32_t dst[], const void* src, int count) {
|
| + RGBA_to_bgrA_portable<8>(dst, src, count);
|
| +}
|
| +
|
| #endif
|
|
|
| }
|
|
|