Index: src/opts/SkBlitRow_opts_arm_neon.cpp |
diff --git a/src/opts/SkBlitRow_opts_arm_neon.cpp b/src/opts/SkBlitRow_opts_arm_neon.cpp |
index 07570fac6aa0810911a646bc1bf1f21a56bd145a..67b42c9e267113f64029542bd74009a5e72544af 100644 |
--- a/src/opts/SkBlitRow_opts_arm_neon.cpp |
+++ b/src/opts/SkBlitRow_opts_arm_neon.cpp |
@@ -970,9 +970,8 @@ void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, |
#define UNROLL 8 |
if (count >= UNROLL) { |
- uint8x8_t dbase; |
-#if defined(DEBUG_OPAQUE_DITHER) |
+#if defined(DEBUG_OPAQUE_DITHER) |
uint16_t tmpbuf[UNROLL]; |
int td[UNROLL]; |
int tdv[UNROLL]; |
@@ -983,6 +982,7 @@ void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, |
int noisy = 0; |
#endif |
+ uint8x8_t dbase; |
const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; |
dbase = vld1_u8(dstart); |
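For reference, a sketch of why the 8-byte dbase load is safe for any x (the row layout is inferred from the indexing above; the concrete values below are illustrative, not the real table):

    // Each 12-byte row of gDitherMatrix_Neon repeats its 4 dither values three
    // times, so a vld1_u8 starting at (x & 3) never leaves the row and yields
    // the 4-entry pattern in cyclic order.
    static const uint8_t kRowSketch[12] = { 0, 4, 1, 5,  0, 4, 1, 5,  0, 4, 1, 5 };
    // e.g. x & 3 == 2:  vld1_u8(&kRowSketch[2]) -> { 1, 5, 0, 4, 1, 5, 0, 4 }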
@@ -991,27 +991,27 @@ void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, |
uint16x8_t dst8, scale8, alpha8; |
uint16x8_t dst_r, dst_g, dst_b; |
-#if defined(DEBUG_OPAQUE_DITHER) |
- /* calculate 8 elements worth into a temp buffer */ |
- { |
- int my_y = y; |
- int my_x = x; |
- SkPMColor* my_src = (SkPMColor*)src; |
- uint16_t* my_dst = dst; |
- int i; |
- |
- DITHER_565_SCAN(my_y); |
- for(i=0;i<UNROLL;i++) { |
+#if defined(DEBUG_OPAQUE_DITHER) |
+ // calculate 8 elements' worth into a temp buffer |
+ { |
+ int my_y = y; |
+ int my_x = x; |
+ SkPMColor* my_src = (SkPMColor*)src; |
+ uint16_t* my_dst = dst; |
+ int i; |
+ |
+ DITHER_565_SCAN(my_y); |
+ for (i = 0; i < UNROLL; i++) { |
SkPMColor c = *my_src++; |
SkPMColorAssert(c); |
if (c) { |
unsigned a = SkGetPackedA32(c); |
int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a)); |
- tdv[i] = DITHER_VALUE(my_x); |
- ta[i] = a; |
- tap[i] = SkAlpha255To256(a); |
- td[i] = d; |
+ tdv[i] = DITHER_VALUE(my_x); |
+ ta[i] = a; |
+ tap[i] = SkAlpha255To256(a); |
+ td[i] = d; |
unsigned sr = SkGetPackedR32(c); |
unsigned sg = SkGetPackedG32(c); |
@@ -1025,147 +1025,126 @@ void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, |
dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); |
// now src and dst expanded are in g:11 r:10 x:1 b:10 |
tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); |
- td[i] = d; |
- |
+ td[i] = d; |
} else { |
- tmpbuf[i] = *my_dst; |
- ta[i] = tdv[i] = td[i] = 0xbeef; |
- } |
- in_dst[i] = *my_dst; |
+ tmpbuf[i] = *my_dst; |
+ ta[i] = tdv[i] = td[i] = 0xbeef; |
+ } |
+ in_dst[i] = *my_dst; |
my_dst += 1; |
DITHER_INC_X(my_x); |
- } |
- } |
+ } |
+ } |
#endif |
- /* source is in ABGR */ |
+ |
{ |
register uint8x8_t d0 asm("d0"); |
register uint8x8_t d1 asm("d1"); |
register uint8x8_t d2 asm("d2"); |
register uint8x8_t d3 asm("d3"); |
- asm ("vld4.8 {d0-d3},[%4] /* r=%P0 g=%P1 b=%P2 a=%P3 */" |
- : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3) |
- : "r" (src) |
- ); |
+ asm ("vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" |
+ : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src) |
+ : |
+ ); |
+#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) |
+ sr = d2; sg = d1; sb = d0; sa = d3; |
+#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) |
sr = d0; sg = d1; sb = d2; sa = d3; |
+#endif |
} |
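A scalar sketch of what the inline asm plus the byte-order #if accomplish (not the shipped code; the SkGetPacked*32 macros fold in the byte-order choice): vld4.8 de-interleaves 8 source pixels into per-channel registers, and the trailing '!' advances src by 8 pixels, which is why the C-level src += UNROLL later in the loop is gone.

    uint8_t r8[8], g8[8], b8[8], a8[8];
    for (int i = 0; i < 8; i++) {      // vld4.8 {d0-d3},[%[src]]! in scalar form
        SkPMColor c = src[i];
        r8[i] = SkGetPackedR32(c);
        g8[i] = SkGetPackedG32(c);
        b8[i] = SkGetPackedB32(c);
        a8[i] = SkGetPackedA32(c);
    }
    src += 8;                          // done by the post-increment in the asm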
- /* calculate 'd', which will be 0..7 */ |
- /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */ |
-#if defined(SK_BUILD_FOR_ANDROID) |
- /* SkAlpha255To256() semantic a+1 vs a+a>>7 */ |
- alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1)); |
-#else |
- alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7)); |
-#endif |
- alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase)); |
- d = vshrn_n_u16(alpha8, 8); /* narrowing too */ |
+ /* calculate 'd', which will be 0..7 |
+ * dbase[] is 0..7; alpha is 0..256; 16 bits suffice |
+ */ |
+ alpha8 = vmovl_u8(dbase); |
+ alpha8 = vmlal_u8(alpha8, sa, dbase); |
+ d = vshrn_n_u16(alpha8, 8); // narrowing too |
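The vmovl/vmlal pair folds the old widen-add-multiply sequence into one multiply-accumulate: per lane it computes dbase + sa*dbase == dbase*(sa + 1), and the narrowing shift by 8 then matches the scalar SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a)) under the a+1 semantic. A minimal scalar sketch (the helper name is illustrative):

    static inline unsigned dither_d(unsigned a /*0..255*/, unsigned dither /*0..7*/) {
        return (dither * (a + 1)) >> 8;   // always lands back in 0..7
    }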
- /* sr = sr - (sr>>5) + d */ |
+ // sr = sr - (sr>>5) + d |
/* watching for 8-bit overflow. d is 0..7; risky range of |
* sr is >248; and then (sr>>5) is 7 so it offsets 'd'; |
- * safe as long as we do ((sr-sr>>5) + d) */ |
+ * safe as long as we do ((sr-sr>>5) + d) |
+ */ |
sr = vsub_u8(sr, vshr_n_u8(sr, 5)); |
sr = vadd_u8(sr, d); |
- /* sb = sb - (sb>>5) + d */ |
+ // sb = sb - (sb>>5) + d |
sb = vsub_u8(sb, vshr_n_u8(sb, 5)); |
sb = vadd_u8(sb, d); |
- /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */ |
+ // sg = sg - (sg>>6) + d>>1; similar logic for overflows |
sg = vsub_u8(sg, vshr_n_u8(sg, 6)); |
sg = vadd_u8(sg, vshr_n_u8(d,1)); |
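A scalar sketch of the reduction above (helper names are illustrative, not Skia API); the point of the overflow comment is that the subtraction is applied before the dither is added, so the 8-bit lanes cannot wrap:

    static inline uint8_t reduce_rb(uint8_t v, uint8_t d /*0..7*/) {
        // for v > 248, (v >> 5) == 7 >= d, so v - (v >> 5) + d <= 255
        return (uint8_t)(v - (v >> 5) + d);        // later consumed as a 5-bit field
    }
    static inline uint8_t reduce_g(uint8_t v, uint8_t d /*0..7*/) {
        return (uint8_t)(v - (v >> 6) + (d >> 1)); // green keeps 6 bits, so half the dither
    }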
- /* need to pick up 8 dst's -- at 16 bits each, 128 bits */ |
+ // need to pick up 8 dst's -- at 16 bits each, 128 bits |
dst8 = vld1q_u16(dst); |
- dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F)); |
- dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F)); |
- dst_r = vshrq_n_u16(dst8,11); /* clearing hi bits */ |
- |
- /* blend */ |
-#if 1 |
- /* SkAlpha255To256() semantic a+1 vs a+a>>7 */ |
- /* originally 255-sa + 1 */ |
+ dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK)); |
+ dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS); |
+ dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT); // clearing hi bits |
+ |
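The green extraction above avoids materialising a 0x3F mask: shifting each 16-bit lane left by SK_R16_BITS discards the red field, and shifting right by SK_R16_BITS + SK_B16_BITS discards blue and re-aligns the six green bits. Per-pixel sketch for the standard 565 layout (the function name is illustrative):

    static inline void unpack565(uint16_t d, uint16_t* r, uint16_t* g, uint16_t* b) {
        *b = d & 0x1F;                  // SK_B16_MASK
        *g = (uint16_t)(d << 5) >> 10;  // << SK_R16_BITS, then >> (SK_R16_BITS + SK_B16_BITS)
        *r = d >> 11;                   // SK_R16_SHIFT
    }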
+ // blend |
scale8 = vsubw_u8(vdupq_n_u16(256), sa); |
-#else |
- scale8 = vsubw_u8(vdupq_n_u16(255), sa); |
- scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7)); |
-#endif |
-#if 1 |
- /* combine the addq and mul, save 3 insns */ |
+ // combine the addq and mul, save 3 insns |
scale8 = vshrq_n_u16(scale8, 3); |
dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8); |
dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8); |
dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8); |
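Per-lane view of the fused blend (a sketch with illustrative names): scale is (256 - alpha) >> 3, the dithered source channel is widened and pre-shifted by vshll so that the single >> 5 in the repack below yields the final 5- or 6-bit field, and vmlaq performs the multiply and the add together, which is where the "save 3 insns" comes from.

    static inline uint16_t blend_lane(uint8_t s, uint16_t dchan,
                                      unsigned a, int shift /* 2 for r/b, 3 for g */) {
        uint16_t scale = (uint16_t)((256 - a) >> 3);                 // vsubw_u8 + vshrq_n_u16
        return (uint16_t)(((uint16_t)s << shift) + dchan * scale);   // vshll_n_u8 + vmlaq_u16
    }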
-#else |
- /* known correct, but +3 insns over above */ |
- scale8 = vshrq_n_u16(scale8, 3); |
- dst_b = vmulq_u16(dst_b, scale8); |
- dst_g = vmulq_u16(dst_g, scale8); |
- dst_r = vmulq_u16(dst_r, scale8); |
- |
- /* combine */ |
- /* NB: vshll widens, need to preserve those bits */ |
- dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2)); |
- dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3)); |
- dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2)); |
-#endif |
- /* repack to store */ |
- dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F)); |
+ // repack to store |
+ dst8 = vshrq_n_u16(dst_b, 5); |
dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5); |
dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11); |
vst1q_u16(dst, dst8); |
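The repack leans on vsli's shift-left-and-insert semantics: each insert replaces only the bits at and above the shift amount and preserves the low bits that already hold the previous field, so no masking of dst8 is needed. Per-pixel sketch (the function name is illustrative):

    static inline uint16_t repack565(uint16_t wr, uint16_t wg, uint16_t wb) {
        uint16_t out = (uint16_t)(wb >> 5);                               // 5-bit blue
        out = (uint16_t)((out & 0x001F) | (uint16_t)((wg >> 5) << 5));    // vsli #5 keeps bits 0..4
        out = (uint16_t)((out & 0x07FF) | (uint16_t)((wr >> 5) << 11));   // vsli #11 keeps bits 0..10
        return out;
    }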
-#if defined(DEBUG_OPAQUE_DITHER) |
- /* verify my 8 elements match the temp buffer */ |
- { |
- int i, bad=0; |
- static int invocation; |
- |
- for (i=0;i<UNROLL;i++) |
- if (tmpbuf[i] != dst[i]) bad=1; |
- if (bad) { |
- SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n", |
- invocation, offset); |
- SkDebugf(" alpha 0x%x\n", alpha); |
- for (i=0;i<UNROLL;i++) |
- SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n", |
- i, ((tmpbuf[i] != dst[i])?"BAD":"got"), |
- dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]); |
- |
- showme16("alpha8", &alpha8, sizeof(alpha8)); |
- showme16("scale8", &scale8, sizeof(scale8)); |
- showme8("d", &d, sizeof(d)); |
- showme16("dst8", &dst8, sizeof(dst8)); |
- showme16("dst_b", &dst_b, sizeof(dst_b)); |
- showme16("dst_g", &dst_g, sizeof(dst_g)); |
- showme16("dst_r", &dst_r, sizeof(dst_r)); |
- showme8("sb", &sb, sizeof(sb)); |
- showme8("sg", &sg, sizeof(sg)); |
- showme8("sr", &sr, sizeof(sr)); |
- |
- /* cop out */ |
- return; |
- } |
- offset += UNROLL; |
- invocation++; |
- } |
-#endif |
+#if defined(DEBUG_OPAQUE_DITHER) |
+ // verify my 8 elements match the temp buffer |
+ { |
+ int i, bad = 0; |
+ static int invocation; |
- dst += UNROLL; |
- src += UNROLL; |
+ for (i = 0; i < UNROLL; i++) { |
+ if (tmpbuf[i] != dst[i]) { |
+ bad = 1; |
+ } |
+ } |
+ if (bad) { |
+ SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n", |
+ invocation, offset); |
+ SkDebugf(" alpha 0x%x\n", alpha); |
+ for (i = 0; i < UNROLL; i++) |
+ SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n", |
+ i, ((tmpbuf[i] != dst[i])?"BAD":"got"), dst[i], tmpbuf[i], |
+ in_dst[i], src[i-8], td[i], tdv[i], tap[i], ta[i]); |
+ |
+ showme16("alpha8", &alpha8, sizeof(alpha8)); |
+ showme16("scale8", &scale8, sizeof(scale8)); |
+ showme8("d", &d, sizeof(d)); |
+ showme16("dst8", &dst8, sizeof(dst8)); |
+ showme16("dst_b", &dst_b, sizeof(dst_b)); |
+ showme16("dst_g", &dst_g, sizeof(dst_g)); |
+ showme16("dst_r", &dst_r, sizeof(dst_r)); |
+ showme8("sb", &sb, sizeof(sb)); |
+ showme8("sg", &sg, sizeof(sg)); |
+ showme8("sr", &sr, sizeof(sr)); |
+ |
+ return; |
+ } |
+ offset += UNROLL; |
+ invocation++; |
+ } |
+#endif |
+ dst += UNROLL; |
count -= UNROLL; |
- /* skip x += UNROLL, since it's unchanged mod-4 */ |
+ // skip x += UNROLL, since it's unchanged mod-4 |
} while (count >= UNROLL); |
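Two loop-footer notes: src is no longer advanced here because the post-incrementing vld4.8 already moved it eight pixels per pass, and x still does not need to advance because the dither matrix is indexed with x & 3 and UNROLL is a multiple of 4. A compile-time sketch of that invariant:

    static_assert(UNROLL % 4 == 0, "(x + UNROLL) & 3 == (x & 3), so x can stay put");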
} |
#undef UNROLL |
- /* residuals */ |
+ // residuals |
if (count > 0) { |
DITHER_565_SCAN(y); |
do { |