src/opts/SkBlitMask_opts_arm_neon.cpp - Issue 1213723002: Optimize RGB16 blitV functions with NEON for ARM platform.

Side by Side Diff: src/opts/SkBlitMask_opts_arm_neon.cpp

Issue 1213723002: Optimize RGB16 blitV functions with NEON for ARM platform. (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: Add macro define for data load/store Created 5 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	1 /*

	2 * Copyright 2013 The Android Open Source Project

	3 *

	4 * Use of this source code is governed by a BSD-style license that can be

	5 * found in the LICENSE file.

	6 */

	7

1	8

2 #include "SkBlitMask.h"	9 #include "SkBlitMask.h"

3 #include "SkColor_opts_neon.h"	10 #include "SkColor_opts_neon.h"

4	11

5 static void D32_A8_Black_neon(void* SK_RESTRICT dst, size_t dstRB,	12 static void D32_A8_Black_neon(void* SK_RESTRICT dst, size_t dstRB,

6 const void* SK_RESTRICT maskPtr, size_t maskRB,	13 const void* SK_RESTRICT maskPtr, size_t maskRB,

7 SkColor, int width, int height) {	14 SkColor, int width, int height) {

8 SkPMColor* SK_RESTRICT device = (SkPMColor*)dst;	15 SkPMColor* SK_RESTRICT device = (SkPMColor*)dst;

9 const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr;	16 const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr;

10	17

(...skipping 234 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
245	252

246 dst += 8;	253 dst += 8;

247 src += 8;	254 src += 8;

248 width -= 8;	255 width -= 8;

249 }	256 }

250	257

251 for (int i = 0; i < width; i++) {	258 for (int i = 0; i < width; i++) {

252 dst[i] = SkBlendLCD16(colA, colR, colG, colB, dst[i], src[i]);	259 dst[i] = SkBlendLCD16(colA, colR, colG, colB, dst[i], src[i]);

253 }	260 }

254 }	261 }

	262

	263 #define LOAD_LANE_16(reg, n) \

	264 reg = vld1q_lane_u16(device, reg, n); \

	265 device = (uint16_t*)((char*)device + deviceRB);

	266

	267 #define STORE_LANE_16(reg, n) \

	268 vst1_lane_u16(dst, reg, n); \

	269 dst = (uint16_t*)((char*)dst + deviceRB);

	270

	271 void SkRGB16BlitterBlitV_neon(uint16_t* device,

	272 int height,

	273 size_t deviceRB,

	274 unsigned scale,

	275 uint32_t src32) {

	276 if (height >= 8)

	277 {

	278 uint16_t* dst = device;

	279

	280 // prepare constants

	281 uint16x8_t vdev = vdupq_n_u16(0);

	282 uint16x8_t vmaskq_g16 = vdupq_n_u16(SK_G16_MASK_IN_PLACE);
	mtklein 2015/06/29 17:16:17
	Oh, I was actually asking about reducing the four masks to two the other way, but given what you've done here I think it can just be one!
	What I meant was, use a single mask with vandq, or vbicq when you'd use ~mask:
	    uint16x8_t greenMask = vdupq_n_u16(SK_G16_MASK_IN_PLACE);
	    ...
	    uint16x8x2_t vdst = vzipq_u16(vbicq_u16(vdev, greenMask),
	                                  vandq_u16(vdev, greenMask));
	    ...
	yang.zhang 2015/06/30 04:51:53
	On 2015/06/29 17:16:17, mtklein wrote:
	> Oh, I was actually asking about reducing the four masks to two the other way,
	> but given what you've done here I think it can just be one!
	>
	> What I meant was, use a single mask with vandq, or vbicq when you'd use ~mask:
	>
	> uint16x8_t greenMask = vdupq_n_u16(SK_G16_MASK_IN_PLACE);
	> ...
	>
	> uint16x8x2_t vdst = vzipq_u16(vbicq_u16(vdev, greenMask),
	> vandq_u16(vdev, greenMask));
	> ...
	Yeah. The results are the same. But I think there isn't difference on performance. Besides using a single mask, is there any other benefit?
	mtklein 2015/06/30 12:42:55
	On 2015/06/30 04:51:53, yang.zhang wrote:
	> On 2015/06/29 17:16:17, mtklein wrote:
	> > Oh, I was actually asking about reducing the four masks to two the other way,
	> > but given what you've done here I think it can just be one!
	> >
	> > What I meant was, use a single mask with vandq, or vbicq when you'd use ~mask:
	> >
	> > uint16x8_t greenMask = vdupq_n_u16(SK_G16_MASK_IN_PLACE);
	> > ...
	> >
	> > uint16x8x2_t vdst = vzipq_u16(vbicq_u16(vdev, greenMask),
	> > vandq_u16(vdev, greenMask));
	> > ...
	>
	> Yeah. The results are the same. But I think there isn't difference on
	> performance. Besides using a single mask, is there any other benefit?
	Oh, just seemed tidier. I agree it's not a big deal either way.
	283 uint16x8_t vmaskq_ng16 = vdupq_n_u16(~SK_G16_MASK_IN_PLACE);

	284 uint32x4_t vsrc32 = vdupq_n_u32(src32);

	285 uint32x4_t vscale5 = vdupq_n_u32((uint32_t)scale);

	286

	287 while (height >= 8){

	288 LOAD_LANE_16(vdev, 0)

	289 LOAD_LANE_16(vdev, 1)

	290 LOAD_LANE_16(vdev, 2)

	291 LOAD_LANE_16(vdev, 3)

	292 LOAD_LANE_16(vdev, 4)

	293 LOAD_LANE_16(vdev, 5)

	294 LOAD_LANE_16(vdev, 6)

	295 LOAD_LANE_16(vdev, 7)

	296

	297 // Expand_rgb_16

	298 uint16x8x2_t vdst = vzipq_u16((vdev & vmaskq_ng16), (vdev & vmaskq_g16));
	mtklein 2015/06/29 17:16:17
	Remind me, why do we need to zip these together? Aren't the operations done to _hi and _lo always the same?
	Can't we just operate on two vectors without zipping them, one with red and blue, the other with just green?
	    uint16x8_t rb = vbicq_u16(vdev, greenMask),
	               g = vandq_u16(vdev, greenMask);
	    ...
	yang.zhang 2015/06/30 04:51:53
	On 2015/06/29 17:16:17, mtklein wrote:
	> Remind me, why do we need to zip these together? Aren't the operations done to
	> _hi and _lo always the same?
	>
	> Can't we just operate on two vectors without zipping them, one with red and
	> blue, the other with just green?
	>
	> uint16x8_t rb = vbicq_u16(vdev, greenMask),
	> g = vandq_u16(vdev, greenMask);
	> ...
	Here, I used vzip instruction to implement the following operations.
	C implementation:
	    ((c & SK_G16_MASK_IN_PLACE) << 16) | (c & ~SK_G16_MASK_IN_PLACE)
	another NEON implementation:
	    uint32x4_t dev_lo = vmovl_u16(vget_low_u16(vdev));
	    uint32x4_t dev_hi = vmovl_u16(vget_high_u16(vdev));
	    // unpack them in 32 bits
	    dev_lo = (dev_lo & vmask_ng16) | vshlq_n_u32(dev_lo & vmask_g16, 16);
	    dev_hi = (dev_hi & vmask_ng16) | vshlq_n_u32(dev_hi & vmask_g16, 16);
	I think that using vzip instruction is better because less instructions are needed.
	mtklein 2015/06/30 12:42:55
	On 2015/06/30 04:51:53, yang.zhang wrote:
	> On 2015/06/29 17:16:17, mtklein wrote:
	> > Remind me, why do we need to zip these together? Aren't the operations done to
	> > _hi and _lo always the same?
	> >
	> > Can't we just operate on two vectors without zipping them, one with red and
	> > blue, the other with just green?
	> >
	> > uint16x8_t rb = vbicq_u16(vdev, greenMask),
	> > g = vandq_u16(vdev, greenMask);
	> > ...
	> Here, I used vzip instruction to implement the following operations.
	>
	> C implementation:
	> ((c & SK_G16_MASK_IN_PLACE) << 16) | (c & ~SK_G16_MASK_IN_PLACE)
	>
	> another NEON implementation:
	> uint32x4_t dev_lo = vmovl_u16(vget_low_u16(vdev));
	> uint32x4_t dev_hi = vmovl_u16(vget_high_u16(vdev));
	> // unpack them in 32 bits
	> dev_lo = (dev_lo & vmask_ng16) | vshlq_n_u32(dev_lo & vmask_g16, 16);
	> dev_hi = (dev_hi & vmask_ng16) | vshlq_n_u32(dev_hi & vmask_g16, 16);
	>
	> I think that using vzip instruction is better because less instructions are
	> needed.
	sgtm
	299 uint32x4_t vdst32_lo = vmulq_u32(vreinterpretq_u32_u16(vdst.val[0]), vscale5);

	300 uint32x4_t vdst32_hi = vmulq_u32(vreinterpretq_u32_u16(vdst.val[1]), vscale5);

	301

	302 // Compact_rgb_16

	303 vdst32_lo = vaddq_u32(vdst32_lo, vsrc32);

	304 vdst32_hi = vaddq_u32(vdst32_hi, vsrc32);

	305 vdst32_lo = vshrq_n_u32(vdst32_lo, 5);

	306 vdst32_hi = vshrq_n_u32(vdst32_hi, 5);

	307

	308 uint16x4_t vtmp_lo = vmovn_u32(vdst32_lo) & vget_low_u16(vmaskq_ng16);

	309 uint16x4_t vtmp_hi = vshrn_n_u32(vdst32_lo, 16) & vget_low_u16(vmaskq_g16);

	310 uint16x4_t vdst16_lo = vorr_u16(vtmp_lo, vtmp_hi);

	311 vtmp_lo = vmovn_u32(vdst32_hi) & vget_low_u16(vmaskq_ng16);

	312 vtmp_hi = vshrn_n_u32(vdst32_hi, 16) & vget_low_u16(vmaskq_g16);

	313 uint16x4_t vdst16_hi = vorr_u16(vtmp_lo, vtmp_hi);

	314

	315 STORE_LANE_16(vdst16_lo, 0)

	316 STORE_LANE_16(vdst16_lo, 1)

	317 STORE_LANE_16(vdst16_lo, 2)

	318 STORE_LANE_16(vdst16_lo, 3)

	319 STORE_LANE_16(vdst16_hi, 0)

	320 STORE_LANE_16(vdst16_hi, 1)

	321 STORE_LANE_16(vdst16_hi, 2)

	322 STORE_LANE_16(vdst16_hi, 3)

	323 height -= 8;

	324 }

	325 }

	326 while (height != 0){

	327 uint32_t dst32 = SkExpand_rgb_16(*device) * scale;

	328 *device = SkCompact_rgb_16((src32 + dst32) >> 5);

	329 device = (uint16_t*)((char*)device + deviceRB);

	330 height--;

	331 }

	332 }

	333

	334 #undef LOAD_LANE_16

	335 #undef STORE_LANE_16

OLD	NEW

« no previous file with comments | « src/core/SkBlitter_RGB16.cpp ('k') | no next file » | no next file with comments »