src/opts/SkBlitMask_opts_arm_neon.cpp - Issue 1213723002: Optimize RGB16 blitV functions with NEON for ARM platform.

Unified Diff: src/opts/SkBlitMask_opts_arm_neon.cpp

Issue 1213723002: Optimize RGB16 blitV functions with NEON for ARM platform. (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: Created 5 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/opts/SkBlitMask_opts_arm_neon.cpp

diff --git a/src/opts/SkBlitMask_opts_arm_neon.cpp b/src/opts/SkBlitMask_opts_arm_neon.cpp

index 84b5c3da5af0f570a5f4c445cbe92de38e96a4f0..7bd55d7d8dfea0c69de9161527700d587c526ed2 100644

--- a/src/opts/SkBlitMask_opts_arm_neon.cpp

+++ b/src/opts/SkBlitMask_opts_arm_neon.cpp

@@ -1,3 +1,10 @@

+/*

mtklein 2015/06/26 14:05:50 Let's put 2013 (file created) or 2015 (now) here.

yang.zhang 2015/06/29 07:25:57 Done.

+ *

+ * Use of this source code is governed by a BSD-style license that can be

+ * found in the LICENSE file.

+ */

#include "SkBlitMask.h"

#include "SkColor_opts_neon.h"

@@ -252,3 +259,144 @@ void SkBlitLCD16Row_neon(SkPMColor dst[], const uint16_t src[],

dst[i] = SkBlendLCD16(colA, colR, colG, colB, dst[i], src[i]);

}

+void SkRGB16BlitterBlitV_neon(uint16_t* device,

+ int height,

+ size_t deviceRB,

+ unsigned scale,

+ uint32_t src32) {

+ uint32x4_t vsrc32, vscale5;

mtklein 2015/06/26 14:05:50 Does writing it like this recover any of the slowd

yang.zhang 2015/06/29 07:25:57 Yeah. The setup code may have an effect on the cas

+ uint16x8_t vmaskq_g16, vmaskq_ng16;

+ uint16x4_t vmask_g16, vmask_ng16;

+ uint16x8_t vdev;

+ uint16x8x2_t vdst32;

mtklein 2015/06/26 14:05:50 I'd prefer it if you could move the declarations o

yang.zhang 2015/06/29 07:25:57 Done.

+ uint32x4_t vdst32_lo, vdst32_hi;

+ uint16x4_t vtmp_lo, vtmp_hi;

+ uint16x4_t vdst16_lo, vdst16_hi;

+ uint16_t* dst = device;

+ // prepare constants

+ vdev = vdupq_n_u16(0);

+ vmaskq_g16 = vdupq_n_u16(SK_G16_MASK_IN_PLACE);

mtklein 2015/06/26 14:05:50 Why do we make four masks here when we can use van

yang.zhang 2015/06/29 07:25:57 Done.

+ vmaskq_ng16 = vdupq_n_u16(~SK_G16_MASK_IN_PLACE);

+ vmask_g16 = vdup_n_u16(SK_G16_MASK_IN_PLACE);

+ vmask_ng16 = vdup_n_u16(~SK_G16_MASK_IN_PLACE);

+ vsrc32 = vdupq_n_u32(src32);

+ vscale5 = vdupq_n_u32((uint32_t)scale);

+ while (height >= 8){

+ vdev = vld1q_lane_u16(device, vdev, 0);

mtklein 2015/06/26 14:05:50 This code (and the stores) might read more clearly

yang.zhang 2015/06/29 07:25:57 Done.

+ device = (uint16_t*)((char*)device + deviceRB);

+ vdev = vld1q_lane_u16(device, vdev, 1);

+ device = (uint16_t*)((char*)device + deviceRB);

+ vdev = vld1q_lane_u16(device, vdev, 2);

+ device = (uint16_t*)((char*)device + deviceRB);

+ vdev = vld1q_lane_u16(device, vdev, 3);

+ device = (uint16_t*)((char*)device + deviceRB);

+ vdev = vld1q_lane_u16(device, vdev, 4);

+ device = (uint16_t*)((char*)device + deviceRB);

+ vdev = vld1q_lane_u16(device, vdev, 5);

+ device = (uint16_t*)((char*)device + deviceRB);

+ vdev = vld1q_lane_u16(device, vdev, 6);

+ device = (uint16_t*)((char*)device + deviceRB);

+ vdev = vld1q_lane_u16(device, vdev, 7);

+ device = (uint16_t*)((char*)device + deviceRB);

+ // Expand_rgb_16

+ vdst32 = vzipq_u16((vdev & vmaskq_ng16), (vdev & vmaskq_g16));

+ vdst32_lo = vmulq_u32(vreinterpretq_u32_u16(vdst32.val[0]), vscale5);

+ vdst32_hi = vmulq_u32(vreinterpretq_u32_u16(vdst32.val[1]), vscale5);

+ // Compact_rgb_16

+ vdst32_lo = vaddq_u32(vdst32_lo, vsrc32);

+ vdst32_hi = vaddq_u32(vdst32_hi, vsrc32);

+ vdst32_lo = vshrq_n_u32(vdst32_lo, 5);

+ vdst32_hi = vshrq_n_u32(vdst32_hi, 5);

+ vtmp_lo = vmovn_u32(vdst32_lo) & vmask_ng16;

+ vtmp_hi = vshrn_n_u32(vdst32_lo, 16) & vmask_g16;

+ vdst16_lo = vorr_u16(vtmp_lo, vtmp_hi);

+ vtmp_lo = vmovn_u32(vdst32_hi) & vmask_ng16;

+ vtmp_hi = vshrn_n_u32(vdst32_hi, 16) & vmask_g16;

+ vdst16_hi = vorr_u16(vtmp_lo, vtmp_hi);

+ vst1_lane_u16(dst, vdst16_lo, 0);

+ dst = (uint16_t*)((char*)dst + deviceRB);

+ vst1_lane_u16(dst, vdst16_lo, 1);

+ dst = (uint16_t*)((char*)dst + deviceRB);

+ vst1_lane_u16(dst, vdst16_lo, 2);

+ dst = (uint16_t*)((char*)dst + deviceRB);

+ vst1_lane_u16(dst, vdst16_lo, 3);

+ dst = (uint16_t*)((char*)dst + deviceRB);

+ vst1_lane_u16(dst, vdst16_hi, 0);

+ dst = (uint16_t*)((char*)dst + deviceRB);

+ vst1_lane_u16(dst, vdst16_hi, 1);

+ dst = (uint16_t*)((char*)dst + deviceRB);

+ vst1_lane_u16(dst, vdst16_hi, 2);

+ dst = (uint16_t*)((char*)dst + deviceRB);

+ vst1_lane_u16(dst, vdst16_hi, 3);

+ dst = (uint16_t*)((char*)dst + deviceRB);

+ height -= 8;

+ }

+ while (height != 0){

+ uint32_t dst32 = SkExpand_rgb_16(*device) * scale;

+ *device = SkCompact_rgb_16((src32 + dst32) >> 5);

+ device = (uint16_t*)((char*)device + deviceRB);

+ height--;

+ }

+void SkRGB16BlitterBlitH_neon(uint16_t* device,

mtklein 2015/06/26 14:05:50 Let's leave this out until it's used?

+ int height,

+ unsigned scale,

+ uint32_t src32) {

+ uint32x4_t vsrc32, vscale5;

+ uint16x8_t vmaskq_g16, vmaskq_ng16;

+ uint16x4_t vmask_g16, vmask_ng16;

+ uint16x8_t vdev;

+ uint16x8x2_t vdst32;

+ uint32x4_t vdst32_lo, vdst32_hi;

+ uint16x4_t vtmp_lo, vtmp_hi;

+ uint16x4_t vdst16_lo, vdst16_hi;

+ // prepare constants

+ vdev = vdupq_n_u16(0);

+ vmaskq_g16 = vdupq_n_u16(SK_G16_MASK_IN_PLACE);

+ vmaskq_ng16 = vdupq_n_u16(~SK_G16_MASK_IN_PLACE);

+ vmask_g16 = vdup_n_u16(SK_G16_MASK_IN_PLACE);

+ vmask_ng16 = vdup_n_u16(~SK_G16_MASK_IN_PLACE);

+ vsrc32 = vdupq_n_u32(src32);

+ vscale5 = vdupq_n_u32((uint32_t)scale);

+ while (height >= 8){

+ vdev = vld1q_u16(device);

+ // Expand_rgb_16

+ vdst32 = vzipq_u16((vdev & vmaskq_ng16), (vdev & vmaskq_g16));

+ vdst32_lo = vmulq_u32(vreinterpretq_u32_u16(vdst32.val[0]), vscale5);

+ vdst32_hi = vmulq_u32(vreinterpretq_u32_u16(vdst32.val[1]), vscale5);

+ // Compact_rgb_16

+ vdst32_lo = vaddq_u32(vdst32_lo, vsrc32);

+ vdst32_hi = vaddq_u32(vdst32_hi, vsrc32);

+ vdst32_lo = vshrq_n_u32(vdst32_lo, 5);

+ vdst32_hi = vshrq_n_u32(vdst32_hi, 5);

+ vtmp_lo = vmovn_u32(vdst32_lo) & vmask_ng16;

+ vtmp_hi = vshrn_n_u32(vdst32_lo, 16) & vmask_g16;

+ vdst16_lo = vorr_u16(vtmp_lo, vtmp_hi);

+ vtmp_lo = vmovn_u32(vdst32_hi) & vmask_ng16;

+ vtmp_hi = vshrn_n_u32(vdst32_hi, 16) & vmask_g16;

+ vdst16_hi = vorr_u16(vtmp_lo, vtmp_hi);

+ vst1q_u16(device, vcombine_u16(vdst16_lo, vdst16_hi));

+ device += 8;

+ height -= 8;

+ }

+ while (height != 0){

+ uint32_t dst32 = SkExpand_rgb_16(*device) * scale;

+ *device++ = SkCompact_rgb_16((src32 + dst32) >> 5);

+ height--;

+ }

« no previous file with comments | « src/core/SkBlitter_RGB16.cpp ('k') | no next file » | no next file with comments »