src/opts/SkBlitMask_opts_arm_neon.cpp - Issue 1213723002: Optimize RGB16 blitV functions with NEON for ARM platform.

Unified Diff: src/opts/SkBlitMask_opts_arm_neon.cpp

Issue 1213723002: Optimize RGB16 blitV functions with NEON for ARM platform. (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: Add macro definitions for data load/store (created 5 years, 6 months ago)

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/opts/SkBlitMask_opts_arm_neon.cpp

diff --git a/src/opts/SkBlitMask_opts_arm_neon.cpp b/src/opts/SkBlitMask_opts_arm_neon.cpp

index 84b5c3da5af0f570a5f4c445cbe92de38e96a4f0..78549ebf1a83f7e7c3f9c5abc4e0f54b418a733f 100644

--- a/src/opts/SkBlitMask_opts_arm_neon.cpp

+++ b/src/opts/SkBlitMask_opts_arm_neon.cpp

@@ -1,3 +1,10 @@

+/*

+ *

+ * Use of this source code is governed by a BSD-style license that can be

+ * found in the LICENSE file.

+ */

#include "SkBlitMask.h"

#include "SkColor_opts_neon.h"

@@ -252,3 +259,77 @@ void SkBlitLCD16Row_neon(SkPMColor dst[], const uint16_t src[],

dst[i] = SkBlendLCD16(colA, colR, colG, colB, dst[i], src[i]);

}

// LOAD_LANE_16(reg, n): load one uint16_t from `device` into lane `n` of the
// vector `reg` (vld1q_lane_u16), then step `device` forward by `deviceRB` bytes.
// Expands to two bare statements and uses `device` and `deviceRB` from the
// expansion site, so it is only usable where both names are in scope.
+#define LOAD_LANE_16(reg, n) \

+ reg = vld1q_lane_u16(device, reg, n); \

+ device = (uint16_t*)((char*)device + deviceRB);

// STORE_LANE_16(reg, n): store lane `n` of the vector `reg` to `dst`
// (vst1_lane_u16), then step `dst` forward by `deviceRB` bytes.
// Expands to two bare statements and uses `dst` and `deviceRB` from the
// expansion site, so it is only usable where both names are in scope.
+#define STORE_LANE_16(reg, n) \

+ vst1_lane_u16(dst, reg, n); \

+ dst = (uint16_t*)((char*)dst + deviceRB);

+void SkRGB16BlitterBlitV_neon(uint16_t* device,

+ int height,

+ size_t deviceRB,

+ unsigned scale,

+ uint32_t src32) {

+ if (height >= 8)

+ {

+ uint16_t* dst = device;

+ // prepare constants

+ uint16x8_t vdev = vdupq_n_u16(0);

+ uint16x8_t vmaskq_g16 = vdupq_n_u16(SK_G16_MASK_IN_PLACE);

mtklein 2015/06/29 17:16:17 Oh, I was actually asking about reducing the four

yang.zhang 2015/06/30 04:51:53 Yeah. The results are the same. But I think there

mtklein 2015/06/30 12:42:55 Oh, just seemed tidier. I agree it's not a big de

+ uint16x8_t vmaskq_ng16 = vdupq_n_u16(~SK_G16_MASK_IN_PLACE);

+ uint32x4_t vsrc32 = vdupq_n_u32(src32);

+ uint32x4_t vscale5 = vdupq_n_u32((uint32_t)scale);

+ while (height >= 8){

+ LOAD_LANE_16(vdev, 0)

+ LOAD_LANE_16(vdev, 1)

+ LOAD_LANE_16(vdev, 2)

+ LOAD_LANE_16(vdev, 3)

+ LOAD_LANE_16(vdev, 4)

+ LOAD_LANE_16(vdev, 5)

+ LOAD_LANE_16(vdev, 6)

+ LOAD_LANE_16(vdev, 7)

+ // Expand_rgb_16

+ uint16x8x2_t vdst = vzipq_u16((vdev & vmaskq_ng16), (vdev & vmaskq_g16));

mtklein 2015/06/29 17:16:17 Remind me, why do we need to zip these together?

yang.zhang 2015/06/30 04:51:53 Here, I used vzip instruction to implement the fol

mtklein 2015/06/30 12:42:55 sgtm

+ uint32x4_t vdst32_lo = vmulq_u32(vreinterpretq_u32_u16(vdst.val[0]), vscale5);

+ uint32x4_t vdst32_hi = vmulq_u32(vreinterpretq_u32_u16(vdst.val[1]), vscale5);

+ // Compact_rgb_16

+ vdst32_lo = vaddq_u32(vdst32_lo, vsrc32);

+ vdst32_hi = vaddq_u32(vdst32_hi, vsrc32);

+ vdst32_lo = vshrq_n_u32(vdst32_lo, 5);

+ vdst32_hi = vshrq_n_u32(vdst32_hi, 5);

+ uint16x4_t vtmp_lo = vmovn_u32(vdst32_lo) & vget_low_u16(vmaskq_ng16);

+ uint16x4_t vtmp_hi = vshrn_n_u32(vdst32_lo, 16) & vget_low_u16(vmaskq_g16);

+ uint16x4_t vdst16_lo = vorr_u16(vtmp_lo, vtmp_hi);

+ vtmp_lo = vmovn_u32(vdst32_hi) & vget_low_u16(vmaskq_ng16);

+ vtmp_hi = vshrn_n_u32(vdst32_hi, 16) & vget_low_u16(vmaskq_g16);

+ uint16x4_t vdst16_hi = vorr_u16(vtmp_lo, vtmp_hi);

+ STORE_LANE_16(vdst16_lo, 0)

+ STORE_LANE_16(vdst16_lo, 1)

+ STORE_LANE_16(vdst16_lo, 2)

+ STORE_LANE_16(vdst16_lo, 3)

+ STORE_LANE_16(vdst16_hi, 0)

+ STORE_LANE_16(vdst16_hi, 1)

+ STORE_LANE_16(vdst16_hi, 2)

+ STORE_LANE_16(vdst16_hi, 3)

+ height -= 8;

+ }

+ while (height != 0){

+ uint32_t dst32 = SkExpand_rgb_16(*device) * scale;

+ *device = SkCompact_rgb_16((src32 + dst32) >> 5);

+ device = (uint16_t*)((char*)device + deviceRB);

+ height--;

+ }

+#undef LOAD_LANE_16

+#undef STORE_LANE_16

« no previous file with comments | « src/core/SkBlitter_RGB16.cpp ('k') | no next file » | no next file with comments »