Index: src/opts/SkBlitRow_opts_arm_neon.cpp
diff --git a/src/opts/SkBlitRow_opts_arm_neon.cpp b/src/opts/SkBlitRow_opts_arm_neon.cpp
index a1ff1722b5f5e7c820bed0a3c17f6af90f2e7cba..8981fd7fee6967ad23f2d3713099890f4dd30304 100644
--- a/src/opts/SkBlitRow_opts_arm_neon.cpp
+++ b/src/opts/SkBlitRow_opts_arm_neon.cpp
@@ -465,6 +465,130 @@ void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
 }
 #endif // #ifdef SK_CPU_ARM32
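+// pmcolor_to_expand16() packs the source color into the same "expanded 565"
+// layout that SkExpand_rgb_16() uses for dst pixels, pre-shifted left by 5 so
+// that it can be added directly to SkExpand_rgb_16(*dst) * scale before the
+// final >> 5 and SkCompact_rgb_16().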
+static uint32_t pmcolor_to_expand16(SkPMColor c) {
+    unsigned r = SkGetPackedR32(c);
+    unsigned g = SkGetPackedG32(c);
+    unsigned b = SkGetPackedB32(c);
+    return (g << 24) | (r << 13) | (b << 2);
+}
+
+void Color32A_D565_neon(uint16_t dst[], SkPMColor src, int count, int x, int y) {
+    uint32_t src_expand;
+    unsigned scale;
+    uint16x8_t vmask_blue;
+
+    if (count <= 0) return;
+    SkASSERT(((size_t)dst & 0x01) == 0);
+
+    /*
+     * Preamble: blend pixels one at a time until dst is 8-byte aligned, so that
+     * the vector loads and stores below operate on aligned memory.
+     */
+    src_expand = pmcolor_to_expand16(src);
+    scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;
+
+#define DST_ALIGN 8
+
+    /*
+     * preamble_size is in bytes; each iteration of the loop below consumes
+     * 2 bytes (one RGB565 pixel).
+     */
+    int preamble_size = (DST_ALIGN - (size_t)dst) & (DST_ALIGN - 1);
+
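+    // Scalar blend used for the preamble (and again for the tail at the end):
+    // *dst = SkCompact_rgb_16((src_expand + SkExpand_rgb_16(*dst) * scale) >> 5),
+    // i.e. dst565 = src565 + dst565 * (256 - srcA) / 256 on all three fields at once.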
+    for (int i = 0; i < preamble_size; i += 2, dst++) {
+        uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
+        *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
+        if (--count == 0)
+            break;
+    }
+
+    int count16 = count >> 4;
+    vmask_blue = vmovq_n_u16(SK_B16_MASK);
+
+    if (count16) {
+        uint16x8_t wide_sr;
+        uint16x8_t wide_sg;
+        uint16x8_t wide_sb;
+        uint16x8_t wide_256_sa;
+
+        unsigned sr = SkGetPackedR32(src);
+        unsigned sg = SkGetPackedG32(src);
+        unsigned sb = SkGetPackedB32(src);
+        unsigned sa = SkGetPackedA32(src);
+
+        // Operation: dst_rgb = src_rgb + ((256 - src_a) >> 3) x dst_rgb
+        // sr is 8-bit; dr is 5-bit and gets multiplied by the 5-bit ((256-sa)>>3),
+        // so shift sr left by 2 to line up the MSBs: 8 + 2 = 5 + 5.
+        wide_sr = vshlq_n_u16(vmovl_u8(vdup_n_u8(sr)), 2); // widen and shift src red
+
+        // sg is 8-bit; dg is 6-bit and gets multiplied by the 5-bit ((256-sa)>>3),
+        // so shift sg left by 3 to line up the MSBs: 8 + 3 = 6 + 5.
+        wide_sg = vshlq_n_u16(vmovl_u8(vdup_n_u8(sg)), 3); // widen and shift src green
+
+        // sb is 8-bit; db is 5-bit and gets multiplied by the 5-bit ((256-sa)>>3),
+        // so shift sb left by 2 to line up the MSBs: 8 + 2 = 5 + 5.
+        wide_sb = vshlq_n_u16(vmovl_u8(vdup_n_u8(sb)), 2); // widen and shift src blue
+
+        wide_256_sa =
+            vshrq_n_u16(vsubw_u8(vdupq_n_u16(256), vdup_n_u8(sa)), 3); // (256 - sa) >> 3
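+        // wide_sr/wide_sg/wide_sb sit at the same fixed-point position as the
+        // products wide_256_sa * (dst field) computed below, so a single vmla per
+        // channel followed by a >> 5 yields the blended 5/6/5 field values.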
+
+        while (count16-- > 0) {
+            uint16x8_t vdst1, vdst1_r, vdst1_g, vdst1_b;
+            uint16x8_t vdst2, vdst2_r, vdst2_g, vdst2_b;
+            vdst1 = vld1q_u16(dst);
+            dst += 8;
+            vdst2 = vld1q_u16(dst);
+            dst -= 8; // rewind; both vectors are stored back after blending
+
+            vdst1_g = vshlq_n_u16(vdst1, SK_R16_BITS);                 // shift green to top of lanes
+            vdst1_b = vdst1 & vmask_blue;                              // extract blue
+            vdst1_r = vshrq_n_u16(vdst1, SK_R16_SHIFT);                // extract red
+            vdst1_g = vshrq_n_u16(vdst1_g, SK_R16_BITS + SK_B16_BITS); // extract green
+
+            vdst2_g = vshlq_n_u16(vdst2, SK_R16_BITS);                 // shift green to top of lanes
+            vdst2_b = vdst2 & vmask_blue;                              // extract blue
+            vdst2_r = vshrq_n_u16(vdst2, SK_R16_SHIFT);                // extract red
+            vdst2_g = vshrq_n_u16(vdst2_g, SK_R16_BITS + SK_B16_BITS); // extract green
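+            // Each *_r/_g/_b vector now holds the raw 5-, 6-, or 5-bit field value
+            // in the low bits of every 16-bit lane.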
+
+            vdst1_r = vmlaq_u16(wide_sr, wide_256_sa, vdst1_r); // sr + (256-sa) x dr1
+            vdst1_g = vmlaq_u16(wide_sg, wide_256_sa, vdst1_g); // sg + (256-sa) x dg1
+            vdst1_b = vmlaq_u16(wide_sb, wide_256_sa, vdst1_b); // sb + (256-sa) x db1
+
+            vdst2_r = vmlaq_u16(wide_sr, wide_256_sa, vdst2_r); // sr + (256-sa) x dr2
+            vdst2_g = vmlaq_u16(wide_sg, wide_256_sa, vdst2_g); // sg + (256-sa) x dg2
+            vdst2_b = vmlaq_u16(wide_sb, wide_256_sa, vdst2_b); // sb + (256-sa) x db2
+
+            vdst1_r = vshrq_n_u16(vdst1_r, 5); // 5-bit right shift for 5-bit red
+            vdst1_g = vshrq_n_u16(vdst1_g, 5); // 5-bit right shift for 6-bit green
+            vdst1_b = vshrq_n_u16(vdst1_b, 5); // 5-bit right shift for 5-bit blue
+
+            vdst1 = vsliq_n_u16(vdst1_b, vdst1_g, SK_G16_SHIFT); // insert green into blue
+            vdst1 = vsliq_n_u16(vdst1, vdst1_r, SK_R16_SHIFT);   // insert red into green/blue
+
+            vdst2_r = vshrq_n_u16(vdst2_r, 5); // 5-bit right shift for 5-bit red
+            vdst2_g = vshrq_n_u16(vdst2_g, 5); // 5-bit right shift for 6-bit green
+            vdst2_b = vshrq_n_u16(vdst2_b, 5); // 5-bit right shift for 5-bit blue
+
+            vdst2 = vsliq_n_u16(vdst2_b, vdst2_g, SK_G16_SHIFT); // insert green into blue
+            vdst2 = vsliq_n_u16(vdst2, vdst2_r, SK_R16_SHIFT);   // insert red into green/blue
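+            // vdst1 and vdst2 now each hold eight recombined RGB565 pixels.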
+
+            vst1q_u16(dst, vdst1);
+            dst += 8;
+            vst1q_u16(dst, vdst2);
+            dst += 8;
+        }
+    }
+
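+    // Tail: blend the remaining (count & 0xF) pixels with the same scalar path as the preamble.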
+    count &= 0xF;
+    if (count > 0) {
+        do {
+            uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
+            *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
+            dst += 1;
+        } while (--count != 0);
+    }
+}
+
 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
     prod += vdupq_n_u16(128);
     prod += vshrq_n_u16(prod, 8);
@@ -1665,6 +1789,13 @@ const SkBlitRow::Proc16 sk_blitrow_platform_565_procs_arm_neon[] = {
     NULL, // S32A_D565_Blend_Dither
 };
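+// All four ColorProc16 slots use the same NEON routine: it reads the alpha from
+// the color itself, and the dither variants get the same non-dithering blend
+// (Color32A_D565_neon ignores its x/y arguments).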
+const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[] = {
+    Color32A_D565_neon, // Color32_D565,
+    Color32A_D565_neon, // Color32A_D565,
+    Color32A_D565_neon, // Color32_D565_Dither,
+    Color32A_D565_neon, // Color32A_D565_Dither
+};
+
 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
     NULL, // S32_Opaque,
     S32_Blend_BlitRow32_neon, // S32_Blend,