Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(834)

Unified Diff: src/opts/SkBlitRow_opts_arm_neon.cpp

Issue 177963003: ARM Skia NEON patches - 25 - S32A_D565_Opaque_Dither clean/bugfix/speed (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: Add ignored-tests.txt Created 6 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« expectations/gm/ignored-tests.txt ('K') | « expectations/gm/ignored-tests.txt ('k') | no next file » | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/opts/SkBlitRow_opts_arm_neon.cpp
diff --git a/src/opts/SkBlitRow_opts_arm_neon.cpp b/src/opts/SkBlitRow_opts_arm_neon.cpp
index 07570fac6aa0810911a646bc1bf1f21a56bd145a..67b42c9e267113f64029542bd74009a5e72544af 100644
--- a/src/opts/SkBlitRow_opts_arm_neon.cpp
+++ b/src/opts/SkBlitRow_opts_arm_neon.cpp
@@ -970,9 +970,8 @@ void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
#define UNROLL 8
if (count >= UNROLL) {
- uint8x8_t dbase;
-#if defined(DEBUG_OPAQUE_DITHER)
+#if defined(DEBUG_OPAQUE_DITHER)
uint16_t tmpbuf[UNROLL];
int td[UNROLL];
int tdv[UNROLL];
@@ -983,6 +982,7 @@ void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
int noisy = 0;
#endif
+ uint8x8_t dbase;
const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
dbase = vld1_u8(dstart);
@@ -991,27 +991,27 @@ void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
uint16x8_t dst8, scale8, alpha8;
uint16x8_t dst_r, dst_g, dst_b;
-#if defined(DEBUG_OPAQUE_DITHER)
- /* calculate 8 elements worth into a temp buffer */
- {
- int my_y = y;
- int my_x = x;
- SkPMColor* my_src = (SkPMColor*)src;
- uint16_t* my_dst = dst;
- int i;
-
- DITHER_565_SCAN(my_y);
- for(i=0;i<UNROLL;i++) {
+#if defined(DEBUG_OPAQUE_DITHER)
+ // calculate 8 elements worth into a temp buffer
+ {
+ int my_y = y;
+ int my_x = x;
+ SkPMColor* my_src = (SkPMColor*)src;
+ uint16_t* my_dst = dst;
+ int i;
+
+ DITHER_565_SCAN(my_y);
+ for(i = 0; i < UNROLL; i++) {
SkPMColor c = *my_src++;
SkPMColorAssert(c);
if (c) {
unsigned a = SkGetPackedA32(c);
int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
- tdv[i] = DITHER_VALUE(my_x);
- ta[i] = a;
- tap[i] = SkAlpha255To256(a);
- td[i] = d;
+ tdv[i] = DITHER_VALUE(my_x);
+ ta[i] = a;
+ tap[i] = SkAlpha255To256(a);
+ td[i] = d;
unsigned sr = SkGetPackedR32(c);
unsigned sg = SkGetPackedG32(c);
@@ -1025,147 +1025,126 @@ void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
// now src and dst expanded are in g:11 r:10 x:1 b:10
tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
- td[i] = d;
-
+ td[i] = d;
} else {
- tmpbuf[i] = *my_dst;
- ta[i] = tdv[i] = td[i] = 0xbeef;
- }
- in_dst[i] = *my_dst;
+ tmpbuf[i] = *my_dst;
+ ta[i] = tdv[i] = td[i] = 0xbeef;
+ }
+ in_dst[i] = *my_dst;
my_dst += 1;
DITHER_INC_X(my_x);
- }
- }
+ }
+ }
#endif
- /* source is in ABGR */
+
{
register uint8x8_t d0 asm("d0");
register uint8x8_t d1 asm("d1");
register uint8x8_t d2 asm("d2");
register uint8x8_t d3 asm("d3");
- asm ("vld4.8 {d0-d3},[%4] /* r=%P0 g=%P1 b=%P2 a=%P3 */"
- : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
- : "r" (src)
- );
+ asm ("vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */"
+ : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
+ :
+ );
+#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
+ sr = d2; sg = d1; sb = d0; sa = d3;
+#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
sr = d0; sg = d1; sb = d2; sa = d3;
+#endif
}
- /* calculate 'd', which will be 0..7 */
- /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */
-#if defined(SK_BUILD_FOR_ANDROID)
- /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
- alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1));
-#else
- alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7));
-#endif
- alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase));
- d = vshrn_n_u16(alpha8, 8); /* narrowing too */
+ /* calculate 'd', which will be 0..7
+ * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
+ */
+ alpha8 = vmovl_u8(dbase);
+ alpha8 = vmlal_u8(alpha8, sa, dbase);
+ d = vshrn_n_u16(alpha8, 8); // narrowing too
- /* sr = sr - (sr>>5) + d */
+ // sr = sr - (sr>>5) + d
/* watching for 8-bit overflow. d is 0..7; risky range of
* sr is >248; and then (sr>>5) is 7 so it offsets 'd';
- * safe as long as we do ((sr-sr>>5) + d) */
+ * safe as long as we do ((sr-sr>>5) + d)
+ */
sr = vsub_u8(sr, vshr_n_u8(sr, 5));
sr = vadd_u8(sr, d);
- /* sb = sb - (sb>>5) + d */
+ // sb = sb - (sb>>5) + d
sb = vsub_u8(sb, vshr_n_u8(sb, 5));
sb = vadd_u8(sb, d);
- /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
+ // sg = sg - (sg>>6) + d>>1; similar logic for overflows
sg = vsub_u8(sg, vshr_n_u8(sg, 6));
sg = vadd_u8(sg, vshr_n_u8(d,1));
- /* need to pick up 8 dst's -- at 16 bits each, 128 bits */
+ // need to pick up 8 dst's -- at 16 bits each, 128 bits
dst8 = vld1q_u16(dst);
- dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F));
- dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F));
- dst_r = vshrq_n_u16(dst8,11); /* clearing hi bits */
-
- /* blend */
-#if 1
- /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
- /* originally 255-sa + 1 */
+ dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));
+ dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS);
+ dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT); // clearing hi bits
+
+ // blend
scale8 = vsubw_u8(vdupq_n_u16(256), sa);
-#else
- scale8 = vsubw_u8(vdupq_n_u16(255), sa);
- scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7));
-#endif
-#if 1
- /* combine the addq and mul, save 3 insns */
+ // combine the addq and mul, save 3 insns
scale8 = vshrq_n_u16(scale8, 3);
dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
-#else
- /* known correct, but +3 insns over above */
- scale8 = vshrq_n_u16(scale8, 3);
- dst_b = vmulq_u16(dst_b, scale8);
- dst_g = vmulq_u16(dst_g, scale8);
- dst_r = vmulq_u16(dst_r, scale8);
-
- /* combine */
- /* NB: vshll widens, need to preserve those bits */
- dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2));
- dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3));
- dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2));
-#endif
- /* repack to store */
- dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F));
+ // repack to store
+ dst8 = vshrq_n_u16(dst_b, 5);
dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);
vst1q_u16(dst, dst8);
-#if defined(DEBUG_OPAQUE_DITHER)
- /* verify my 8 elements match the temp buffer */
- {
- int i, bad=0;
- static int invocation;
-
- for (i=0;i<UNROLL;i++)
- if (tmpbuf[i] != dst[i]) bad=1;
- if (bad) {
- SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
- invocation, offset);
- SkDebugf(" alpha 0x%x\n", alpha);
- for (i=0;i<UNROLL;i++)
- SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
- i, ((tmpbuf[i] != dst[i])?"BAD":"got"),
- dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]);
-
- showme16("alpha8", &alpha8, sizeof(alpha8));
- showme16("scale8", &scale8, sizeof(scale8));
- showme8("d", &d, sizeof(d));
- showme16("dst8", &dst8, sizeof(dst8));
- showme16("dst_b", &dst_b, sizeof(dst_b));
- showme16("dst_g", &dst_g, sizeof(dst_g));
- showme16("dst_r", &dst_r, sizeof(dst_r));
- showme8("sb", &sb, sizeof(sb));
- showme8("sg", &sg, sizeof(sg));
- showme8("sr", &sr, sizeof(sr));
-
- /* cop out */
- return;
- }
- offset += UNROLL;
- invocation++;
- }
-#endif
+#if defined(DEBUG_OPAQUE_DITHER)
+ // verify my 8 elements match the temp buffer
+ {
+ int i, bad=0;
+ static int invocation;
- dst += UNROLL;
- src += UNROLL;
+ for (i = 0; i < UNROLL; i++) {
+ if (tmpbuf[i] != dst[i]) {
+ bad=1;
+ }
+ }
+ if (bad) {
+ SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
+ invocation, offset);
+ SkDebugf(" alpha 0x%x\n", alpha);
+ for (i = 0; i < UNROLL; i++)
+ SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
+ i, ((tmpbuf[i] != dst[i])?"BAD":"got"), dst[i], tmpbuf[i],
+ in_dst[i], src[i-8], td[i], tdv[i], tap[i], ta[i]);
+
+ showme16("alpha8", &alpha8, sizeof(alpha8));
+ showme16("scale8", &scale8, sizeof(scale8));
+ showme8("d", &d, sizeof(d));
+ showme16("dst8", &dst8, sizeof(dst8));
+ showme16("dst_b", &dst_b, sizeof(dst_b));
+ showme16("dst_g", &dst_g, sizeof(dst_g));
+ showme16("dst_r", &dst_r, sizeof(dst_r));
+ showme8("sb", &sb, sizeof(sb));
+ showme8("sg", &sg, sizeof(sg));
+ showme8("sr", &sr, sizeof(sr));
+
+ return;
+ }
+ offset += UNROLL;
+ invocation++;
+ }
+#endif
+ dst += UNROLL;
count -= UNROLL;
- /* skip x += UNROLL, since it's unchanged mod-4 */
+ // skip x += UNROLL, since it's unchanged mod-4
} while (count >= UNROLL);
}
#undef UNROLL
- /* residuals */
+ // residuals
if (count > 0) {
DITHER_565_SCAN(y);
do {
« expectations/gm/ignored-tests.txt ('K') | « expectations/gm/ignored-tests.txt ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698