src/opts/SkSwizzler_opts.h - Issue 1577703006: Optimized premultiplying swizzles for NEON

Side by Side Diff: src/opts/SkSwizzler_opts.h

Issue 1577703006: Optimized premultiplying swizzles for NEON (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: Fixed loop bugs Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 /*

	2 * Copyright 2016 Google Inc.

	3 *

	4 * Use of this source code is governed by a BSD-style license that can be

	5 * found in the LICENSE file.

	6 */

	7

	8 #ifndef SkSwizzler_opts_DEFINED

	9 #define SkSwizzler_opts_DEFINED

	10

	11 #include "SkColorPriv.h"

	12

	13 namespace SK_OPTS_NS {

	14

	15 // These variable names in these functions just pretend the input is BGRA.

	16 // They work fine with both RGBA and BGRA.

	17

	18 #if defined(SK_ARM_HAS_NEON)

	19

	20 // Rounded divide by 255, (x + 127) / 255

	21 static uint8x8_t div255_round(uint16x8_t x) {

	22 // result = (x + 127) / 255

	23 // result = (x + 127) / 256 + error1

	24 //

	25 // error1 = (x + 127) / (255 * 256)

	26 // error1 = (x + 127) / (256 * 256) + error2

	27 //

	28 // error2 = (x + 127) / (255 * 256 * 256)

	29 //

	30 // The maximum value of error2 is too small to matter. Thus:

	31 // result = (x + 127) / 256 + (x + 127) / (256 * 256)

	32 // result = ((x + 127) / 256 + x + 127) / 256

	33 // result = (((x + 127) >> 8) + x + 127) >> 8

	34 //

	35 // Use >>> to represent "rounded right shift" which, conveniently,

	36 // NEON supports in one instruction.

	37 // result = ((x >>> 8) + x) >>> 8

	38 //

	39 // Note that the second right shift is actually performed as an

	40 // "add, round, and narrow back to 8-bits" instruction.

	41 return vraddhn_u16(x, vrshrq_n_u16(x, 8));

	42 }

	43

	44 // Scale a byte by another, (x * y + 127) / 255

	45 static uint8x8_t scale(uint8x8_t x, uint8x8_t y) {

	46 return div255_round(vmull_u8(x, y));

	47 }

	48

	49 template <bool kSwapRB>

	50 static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) {

	51 while (count >= 8) {

	52 // Load 8 pixels.

	53 uint8x8x4_t bgra = vld4_u8((const uint8_t*) src);

	54

	55 uint8x8_t b = bgra.val[0],
	mtklein 2016/01/13 21:04:17: It'd be nice to harmonize the order we list things now that we've got the names straight. Up here we write things in b,g,r,a (0->3) order, but in the portable code we write things in a,r,g,b (3->0) order. Either works for me, but as you might guess, I like a,r,g,b (3->0). | msarett 2016/01/13 21:30:27 (in reply to mtklein, 2016/01/13 21:04:17): Done.
	56 g = bgra.val[1],

	57 r = bgra.val[2],

	58 a = bgra.val[3];

	59

	60 // Premultiply.

	61 b = scale(b, a);

	62 g = scale(g, a);

	63 r = scale(r, a);

	64

	65 // Store 8 premultiplied pixels.

	66 if (kSwapRB) {

	67 bgra.val[0] = r;

	68 bgra.val[1] = g;

	69 bgra.val[2] = b;

	70 } else {

	71 bgra.val[0] = b;

	72 bgra.val[1] = g;

	73 bgra.val[2] = r;

	74 }

	75 vst4_u8((uint8_t*) dst, bgra);

	76 src += 8;

	77 dst += 8;

	78 count -= 8;

	79 }

	80

	81 while (count --> 0) {
	mtklein 2016/01/13 21:04:17: This loop can't be correct. It's not looking at kSwapRB. (We can't call SkPremultiplyARGBInline... that's always returning SkPMColor order.) Why don't we pull the portable code up top as premul_xxxa_portable, etc., then from in here you can do your NEON loop, then just:
	    // Call portable code to finish up the tail of [0,8) pixels.
	    auto proc = kSwapRB ? premul_swaprb_xxxa_portable : premul_xxxa_portable;
	    proc(dst, src, count);
	msarett 2016/01/13 21:30:27 (in reply to mtklein, 2016/01/13 21:04:17): Done.
	82 const uint8_t* src8 = (const uint8_t*) src;

	83 *dst++ = SkPremultiplyARGBInline(src8[3], src8[0], src8[1], src8[2]);

	84 src++;

	85 }

	86 }

	87

	88 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {

	89 premul_xxxa_should_swaprb<false>(dst, src, count);

	90 }

	91

	92 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {

	93 premul_xxxa_should_swaprb<true>(dst, src, count);

	94 }

	95

	96 #else

	97

	98 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {

	99 for (int i = 0; i < count; i++) {

	100 uint8_t a = src[i] >> 24,

	101 r = src[i] >> 16,

	102 g = src[i] >> 8,

	103 b = src[i] >> 0;

	104 r = (r*a+127)/255;

	105 g = (g*a+127)/255;

	106 b = (b*a+127)/255;

	107 dst[i] = (uint32_t)a << 24

	108 | (uint32_t)r << 16

	109 | (uint32_t)g << 8

	110 | (uint32_t)b << 0;

	111 }

	112 }

	113

	114 static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {

	115 for (int i = 0; i < count; i++) {

	116 uint8_t a = src[i] >> 24,

	117 r = src[i] >> 16,

	118 g = src[i] >> 8,

	119 b = src[i] >> 0;

	120 r = (r*a+127)/255;

	121 g = (g*a+127)/255;

	122 b = (b*a+127)/255;

	123 dst[i] = (uint32_t)a << 24

	124 | (uint32_t)b << 16

	125 | (uint32_t)g << 8

	126 | (uint32_t)r << 0;

	127 }

	128 }

	129

	130 #endif

	131

	132 static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {

	133 for (int i = 0; i < count; i++) {

	134 uint8_t a = src[i] >> 24,

	135 r = src[i] >> 16,

	136 g = src[i] >> 8,

	137 b = src[i] >> 0;

	138 dst[i] = (uint32_t)a << 24

	139 | (uint32_t)b << 16

	140 | (uint32_t)g << 8

	141 | (uint32_t)r << 0;

	142 }

	143 }

	144

	145 }

	146

	147 #endif // SkSwizzler_opts_DEFINED

OLD	NEW

« no previous file with comments | « src/opts/SkOpts_neon.cpp ('k') | no next file » | no next file with comments »