src/opts/SkSwizzler_opts.h - Issue 1577703006: Optimized premultiplying swizzles for NEON

Side by Side Diff: src/opts/SkSwizzler_opts.h

Issue 1577703006: Optimized premultiplying swizzles for NEON (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: Response to comments Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 /*

	2 * Copyright 2016 Google Inc.

	3 *

	4 * Use of this source code is governed by a BSD-style license that can be

	5 * found in the LICENSE file.

	6 */

	7

	8 #ifndef SkSwizzler_opts_DEFINED

	9 #define SkSwizzler_opts_DEFINED

	10

	11 #include "SkColorPriv.h"

	12

	13 namespace SK_OPTS_NS {

	14

	15 // These variable names in these functions just pretend the input is RGBA.
	mtklein 2016/01/13 17:04:37: You're going to the dark side! BGRA is the format to prefer, because it matches SkColor. Written as a literal, it's 0xAARRGGBB, which is what we're used to writing and breaking apart with shifts. I admit this is pretty arbitrary, but the arbitrariness is why I'd like to stick to the convention of writing things as if they were BGRA. It's nice to hop back and forth between files without twisting your brain. msarett 2016/01/13 18:52:05: SGTM. I just wanted NEON and default to be the same. Didn't know it mattered which one I picked :).
	16 // They work fine with both RGBA and BGRA.

	17

	18 #if defined(SK_ARM_HAS_NEON)

	19

	20 // Rounded divide by 255, (x + 127) / 255
	mtklein 2016/01/13 17:04:36: add a comma after the first 255? msarett 2016/01/13 18:52:05: Done.
	21 // result = (x + 127) / 255
	mtklein 2016/01/13 17:04:36: Let's move the rest of these implementation-focused comments to just inside the function body. msarett 2016/01/13 18:52:05: Done.
	22 // result = (x + 127) / 256 + error1

	23 //

	24 // error1 = (x + 127) / (255 * 256)

	25 // error1 = (x + 127) / (256 * 256) + error2

	26 //

	27 // error2 = (x + 127) / (255 * 256 * 256)

	28 //

	29 // The maximum value of error2 is too small to matter. Thus:

	30 // result = (x + 127) / 256 + (x + 127) / (256 * 256)

	31 // result = ((x + 127) / 256 + x + 127) / 256

	32 // result = (((x + 127) >> 8) + x + 127) >> 8

	33 //

	34 // Use >>> to represent "rounded right shift" which, conveniently,

	35 // NEON supports in one instruction.
	mtklein 2016/01/13 17:04:37: ...supports in one instruction. ? msarett 2016/01/13 18:52:05: Done.
	36 // result = ((x >>> 8) + x) >>> 8

	37 //

	38 // Note that the second right shift is actually performed as an

	39 // "add, round, and narrow back to 8-bits" instruction.

	40 static uint8x8_t div255_round(uint16x8_t x) {

	41 return vraddhn_u16(x, vrshrq_n_u16(x, 8));

	42 }

	43

	44 // Scale a byte by another, (x * y + 127) / 255
	mtklein 2016/01/13 17:04:37: comma after another? msarett 2016/01/13 18:52:05: Done.
	45 static uint8x8_t scale(uint8x8_t x, uint8x8_t y) {

	46 return div255_round(vmull_u8(x, y));

	47 }

	48

	49 template <bool kSwapRB>

	50 static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {

	51 int i = 0;

	52 uint8_t* dst8 = (uint8_t*) dst;

	53 const uint8_t* src8 = (const uint8_t*) src;
	mtklein 2016/01/13 17:04:36 Generally I prefer to cast the pointers inside eac Generally I prefer to cast the pointers inside each load and store, so that all the constants we use match up. It takes thought to see that src8 += 32 is correct, but while src += 8 is a bit clearer. msarett 2016/01/13 18:52:05 Done. Show quoted text On 2016/01/13 17:04:36, mtklein wrote: > Generally I prefer to cast the pointers inside each load and store, so that all > the constants we use match up. It takes thought to see that src8 += 32 is > correct, but while src += 8 is a bit clearer. Done.
	54 for (; i < count; i += 8) {
	mtklein 2016/01/13 17:04:36 I think this walks i too far for, e.g. count == 15 I think this walks i too far for, e.g. count == 15. I've found the safest way to get these things right is to decrement count: while (count >= 8) { count -= 8; } while (count > 0) { count--; } I further usually write that last one as while (count --> 0) { } because it both looks cute and is a pretty strong indication that count will not be referred to inside the loop. msarett 2016/01/13 18:52:05 Done. Apologies for the messy job porting this co Show quoted text On 2016/01/13 17:04:36, mtklein wrote: > I think this walks i too far for, e.g. count == 15. > > I've found the safest way to get these things right is to decrement count: > > while (count >= 8) { > > count -= 8; > } > while (count > 0) { > > count--; > } > > I further usually write that last one as > > while (count --> 0) { > > } > > because it both looks cute and is a pretty strong indication that count will not > be referred to inside the loop. Done. Apologies for the messy job porting this code. I've verified that perf is unaffected and tested again (this time with the opts turned on :)).
	55 // Load 8 pixels.

	56 uint8x8x4_t rgba = vld4_u8(src8);

	57

	58 uint8x8_t r = rgba.val[0],

	59 g = rgba.val[1],

	60 b = rgba.val[2],

	61 a = rgba.val[3];

	62

	63 // Premultiply.

	64 r = scale(r, a);

	65 g = scale(g, a);

	66 b = scale(b, a);

	67

	68 // Store 8 premultiplied pixels.

	69 if (kSwapRB) {

	70 rgba.val[0] = b;

	71 rgba.val[1] = g;

	72 rgba.val[2] = r;

	73 } else {

	74 rgba.val[0] = r;

	75 rgba.val[1] = g;

	76 rgba.val[2] = b;

	77 }

	78 vst4_u8(dst8, rgba);

	79 src8 += 32;

	80 dst8 += 32;

	81 }

	82

	83 dst = (uint32_t*) dst8;

	84 for (; i < count; i++) {

	85 dst[i] = SkPremultiplyARGBInline(src8[3], src8[0], src8[1], src8[2]);
	mtklein 2016/01/13 17:04:37 dst[i] = ... ... dst++ this steps dst forward twi dst[i] = ... ... dst++ this steps dst forward twice. I've found the most foolproof way to write these methods never needs an 'i': decrement count, increment dst and src. msarett 2016/01/13 18:52:05 Done. Show quoted text On 2016/01/13 17:04:37, mtklein wrote: > dst[i] = ... > ... > dst++ > > this steps dst forward twice. > > I've found the most foolproof way to write these methods never needs an 'i': > decrement count, increment dst and src. Done.
	86 src8 += 4;

	87 dst++;

	88 }

	89 }

	90

	91 #else

	92

	// Portable fallback: premultiplies `count` 32-bit pixels from src into dst.
	// Bits 24-31 of each pixel are treated as alpha; the three lower bytes are
	// scaled by alpha with a rounded divide by 255, and alpha is copied as is.
	// When kSwapRB is true, the bytes at bits 0-7 and bits 16-23 are exchanged
	// in the output, the same exchange swaprb_xxxa performs; when false every
	// channel stays in its original position.
	template <bool kSwapRB>
	static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {
	    for (int i = 0; i < count; i++) {
	        uint8_t a = src[i] >> 24,
	                b = src[i] >> 16,
	                g = src[i] >>  8,
	                r = src[i] >>  0;

	        // Rounded x*a/255 for each color channel.
	        b = (b*a+127)/255;
	        g = (g*a+127)/255;
	        r = (r*a+127)/255;

	        if (kSwapRB) {
	            // Swapped: r moves up to bits 16-23, b moves down to bits 0-7.
	            dst[i] = (uint32_t)a << 24
	                   | (uint32_t)r << 16
	                   | (uint32_t)g <<  8
	                   | (uint32_t)b <<  0;
	        } else {
	            // Not swapped: every channel returns to where it was read from.
	            dst[i] = (uint32_t)a << 24
	                   | (uint32_t)b << 16
	                   | (uint32_t)g <<  8
	                   | (uint32_t)r <<  0;
	        }
	    }
	}

	116

	117 #endif

	118

	// Copies `count` 32-bit pixels from src to dst, exchanging the byte at
	// bits 0-7 with the byte at bits 16-23 of every pixel. The bytes at
	// bits 8-15 and bits 24-31 are copied through unchanged.
	static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {
	    for (; count > 0; --count, ++src, ++dst) {
	        const uint32_t pixel = *src;

	        // Alpha (bits 24-31) and green (bits 8-15) keep their positions.
	        const uint32_t kept = pixel & 0xFF00FF00u;
	        // The low byte moves up to bits 16-23 ...
	        const uint32_t low_to_high = (pixel & 0xFFu) << 16;
	        // ... and the byte from bits 16-23 moves down to bits 0-7.
	        const uint32_t high_to_low = (pixel >> 16) & 0xFFu;

	        *dst = kept | low_to_high | high_to_low;
	    }
	}

	131

	132 }

	133

	134 #endif // SkSwizzler_opts_DEFINED

OLD	NEW

« src/core/SkOpts.cpp ('K') | « src/opts/SkOpts_neon.cpp ('k') | no next file » | no next file with comments »