Chromium Code Reviews| Index: src/opts/SkBlitRow_opts_arm_neon.cpp |
| diff --git a/src/opts/SkBlitRow_opts_arm_neon.cpp b/src/opts/SkBlitRow_opts_arm_neon.cpp |
| index 7868108378bbab635ac8fa99d4b8fb458db775de..39606fc6677cd620e0ef7354ca910dd1b7cc8962 100644 |
| --- a/src/opts/SkBlitRow_opts_arm_neon.cpp |
| +++ b/src/opts/SkBlitRow_opts_arm_neon.cpp |
| @@ -1033,13 +1033,6 @@ void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, |
| /////////////////////////////////////////////////////////////////////////////// |
| -/* 2009/10/27: RBE says "a work in progress"; debugging says ok; |
| - * speedup untested, but ARM version is 26 insns/iteration and |
| - * this NEON version is 21 insns/iteration-of-8 (2.62insns/element) |
| - * which is 10x the native version; that's pure instruction counts, |
| - * not accounting for any instruction or memory latencies. |
| - */ |
| - |
| #undef DEBUG_S32_OPAQUE_DITHER |
| void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst, |
| @@ -1065,11 +1058,17 @@ void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst, |
| register uint8x8_t d2 asm("d2"); |
| register uint8x8_t d3 asm("d3"); |
| - asm ("vld4.8 {d0-d3},[%4] /* r=%P0 g=%P1 b=%P2 a=%P3 */" |
| - : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3) |
| - : "r" (src) |
| - ); |
| - sr = d0; sg = d1; sb = d2; |
| + asm ( |
| + "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" |
| + : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) |
| + : |
| + ); |
| + sg = d1; |
| +#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) |
| + sr = d2; sb = d0; |
| +#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) |
| + sr = d0; sb = d2; |
| +#endif |
| } |
| /* XXX: if we want to prefetch, hide it in the above asm() |
| * using the gcc __builtin_prefetch(), the prefetch will |
| @@ -1087,13 +1086,13 @@ void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst, |
| /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */ |
| sg = vsub_u8(sg, vshr_n_u8(sg, 6)); |
| - dg = vaddl_u8(sg, vshr_n_u8(d,1)); |
| + dg = vaddl_u8(sg, vshr_n_u8(d, 1)); |
| /* XXX: check that the "d>>1" here is hoisted */ |
| /* pack high bits of each into 565 format (rgb, b is lsb) */ |
| dst8 = vshrq_n_u16(db, 3); |
| dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5); |
| - dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr,3), 11); |
| + dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11); |
| /* store it */ |
| vst1q_u16(dst, dst8); |
| @@ -1104,7 +1103,7 @@ void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst, |
| int i, myx = x, myy = y; |
| DITHER_565_SCAN(myy); |
| for (i=0;i<UNROLL;i++) { |
| - SkPMColor c = src[i]; |
| + SkPMColor c = src[i-8]; |
|
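mtklein (2013/09/12 20:07:36):
Can you tack on something like
// The '!' in th… [the rest of this review comment is truncated in this capture]
[Editor's note: the request appears to be for a comment explaining that the '!' (writeback) on the vld4.8 load above has already advanced src past the pixels just loaded, which is why this debug check reads src[i-8] instead of src[i].]
kevin.petit.not.used.account (2013/09/13 12:08:27):
Done.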
|
| unsigned dither = DITHER_VALUE(myx); |
| uint16_t val = SkDitherRGB32To565(c, dither); |
| if (val != dst[i]) { |
| @@ -1117,7 +1116,6 @@ void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst, |
| #endif |
| dst += UNROLL; |
| - src += UNROLL; |
|
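mtklein (2013/09/12 20:07:36):
Maybe even duplicate the same note here about why… [the rest of this review comment is truncated in this capture]
[Editor's note: presumably this asks for a note on why "src += UNROLL" is removed here — in this patch the vld4.8 load uses '!' writeback, so src is already advanced by the asm and a second increment would skip pixels.]
kevin.petit.not.used.account (2013/09/13 12:08:27):
Done.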
|
| count -= UNROLL; |
| x += UNROLL; /* probably superfluous */ |
| } |