Chromium Code Reviews| Index: src/opts/SkBlitRow_opts_arm_neon.cpp |
| diff --git a/src/opts/SkBlitRow_opts_arm_neon.cpp b/src/opts/SkBlitRow_opts_arm_neon.cpp |
| index 6ec1b4f78786a4f75434c39c2e71b17ed3d0c22f..01a6a2aa74593e0b25c7b90367bb9c3041790b60 100644 |
| --- a/src/opts/SkBlitRow_opts_arm_neon.cpp |
| +++ b/src/opts/SkBlitRow_opts_arm_neon.cpp |
| @@ -17,7 +17,53 @@ |
| #include "SkColor_opts_neon.h" |
| #include <arm_neon.h> |
| -#ifdef SK_CPU_ARM32 |
| +#ifdef SK_CPU_ARM64 |
| +static inline uint8x8x4_t sk_vld4_u8_arm64_3(const SkPMColor* SK_RESTRICT & src) { |
| + uint8x8x4_t vsrc; |
| + uint8x8_t vsrc_0, vsrc_1, vsrc_2; |
| + |
| + asm ( |
| + "ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n" |
| + "mov %[vsrc0].8b, v0.8b \t\n" |
| + "mov %[vsrc1].8b, v1.8b \t\n" |
| + "mov %[vsrc2].8b, v2.8b \t\n" |
| + : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1), |
| + [vsrc2] "=w" (vsrc_2), [src] "+&r" (src) |
| + : : "v0", "v1", "v2", "v3" |
| + ); |
| + |
| + vsrc.val[0] = vsrc_0; |
| + vsrc.val[1] = vsrc_1; |
| + vsrc.val[2] = vsrc_2; |
| + |
| + return vsrc; |
| +} |
| + |
| +static inline uint8x8x4_t sk_vld4_u8_arm64_4(const SkPMColor* SK_RESTRICT & src) { |
| + uint8x8x4_t vsrc; |
| + uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3; |
| + |
| + asm ( |
| + "ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n" |
| + "mov %[vsrc0].8b, v0.8b \t\n" |
| + "mov %[vsrc1].8b, v1.8b \t\n" |
| + "mov %[vsrc2].8b, v2.8b \t\n" |
| + "mov %[vsrc3].8b, v3.8b \t\n" |
| + : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1), |
| + [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3), |
| + [src] "+&r" (src) |
| + : : "v0", "v1", "v2", "v3" |
| + ); |
| + |
| + vsrc.val[0] = vsrc_0; |
| + vsrc.val[1] = vsrc_1; |
| + vsrc.val[2] = vsrc_2; |
| + vsrc.val[3] = vsrc_3; |
| + |
| + return vsrc; |
| +} |
| +#endif |
| + |
| void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, |
| const SkPMColor* SK_RESTRICT src, int count, |
| U8CPU alpha, int /*x*/, int /*y*/) { |
| @@ -28,7 +74,12 @@ void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, |
| uint16x8_t vdst; |
| // Load |
| +#ifdef SK_CPU_ARM64 |
| + vsrc = sk_vld4_u8_arm64_3(src); |
| +#else |
| vsrc = vld4_u8((uint8_t*)src); |
| + src += 8; |
| +#endif |
| // Convert src to 565 |
| vdst = SkPixel32ToPixel16_neon8(vsrc); |
| @@ -38,7 +89,6 @@ void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, |
| // Prepare next iteration |
| dst += 8; |
| - src += 8; |
| count -= 8; |
| }; |
| @@ -64,11 +114,14 @@ void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst, |
| vmask_blue = vmovq_n_u16(0x1F); |
| while (count >= 8) { |
| + uint8x8x4_t vsrc; |
| uint16x8_t vdst, vdst_r, vdst_g, vdst_b; |
| uint16x8_t vres_r, vres_g, vres_b; |
| - uint8x8_t vsrc_r, vsrc_g, vsrc_b; |
| // Load src |
| +#ifdef SK_CPU_ARM64 |
| + vsrc = sk_vld4_u8_arm64_3(src); |
| +#else |
| { |
| register uint8x8_t d0 asm("d0"); |
| register uint8x8_t d1 asm("d1"); |
| @@ -80,13 +133,11 @@ void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst, |
| : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) |
| : |
| ); |
| - vsrc_g = d1; |
| -#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) |
| - vsrc_r = d2; vsrc_b = d0; |
| -#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) |
| - vsrc_r = d0; vsrc_b = d2; |
| -#endif |
| + vsrc.val[0] = d0; |
| + vsrc.val[1] = d1; |
| + vsrc.val[2] = d2; |
| } |
| +#endif |
| // Load and unpack dst |
| vdst = vld1q_u16(dst); |
| @@ -95,15 +146,15 @@ void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst, |
| vdst_r = vshrq_n_u16(vdst, 6+5); // extract red |
| vdst_g = vshrq_n_u16(vdst_g, 5+5); // extract green |
| - // Shift src to 565 |
| - vsrc_r = vshr_n_u8(vsrc_r, 3); // shift red to 565 range |
| - vsrc_g = vshr_n_u8(vsrc_g, 2); // shift green to 565 range |
| - vsrc_b = vshr_n_u8(vsrc_b, 3); // shift blue to 565 range |
| + // Shift src to 565 range |
| + vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 3); |
| + vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 2); |
| + vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 3); |
| // Scale src - dst |
| - vres_r = vmovl_u8(vsrc_r) - vdst_r; |
| - vres_g = vmovl_u8(vsrc_g) - vdst_g; |
| - vres_b = vmovl_u8(vsrc_b) - vdst_b; |
| + vres_r = vmovl_u8(vsrc.val[NEON_R]) - vdst_r; |
| + vres_g = vmovl_u8(vsrc.val[NEON_G]) - vdst_g; |
| + vres_b = vmovl_u8(vsrc.val[NEON_B]) - vdst_b; |
| vres_r = vshrq_n_u16(vres_r * vscale, 8); |
| vres_g = vshrq_n_u16(vres_g * vscale, 8); |
| @@ -136,6 +187,7 @@ void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst, |
| } |
| } |
| +#ifdef SK_CPU_ARM32 |
| void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, |
| const SkPMColor* SK_RESTRICT src, int count, |
| U8CPU alpha, int /*x*/, int /*y*/) { |
| @@ -313,6 +365,7 @@ void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, |
| ); |
| } |
| } |
| +#endif |
| static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) { |
| prod += vdupq_n_u16(128); |
| @@ -346,6 +399,9 @@ void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, |
| // load pixels |
| vdst = vld1q_u16(dst); |
| +#ifdef SK_CPU_ARM64 |
| + vsrc = sk_vld4_u8_arm64_4(src); |
| +#else |
| #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) |
|
mtklein
2014/06/06 14:41:17
Think it makes sense to follow up and do the same
|
| asm ( |
| "vld4.u8 %h[vsrc], [%[src]]!" |
| @@ -369,6 +425,7 @@ void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, |
| vsrc.val[2] = d2; |
| vsrc.val[3] = d3; |
| #endif |
| +#endif // #ifdef SK_CPU_ARM64 |
| // deinterleave dst |
| @@ -468,6 +525,7 @@ void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src, |
| do { |
| + uint8x8x4_t vsrc; |
| uint8x8_t vsrc_r, vsrc_g, vsrc_b; |
| uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b; |
| uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b; |
| @@ -478,6 +536,9 @@ void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src, |
| int8x8_t vres8_r, vres8_g, vres8_b; |
| // Load source and add dither |
| +#ifdef SK_CPU_ARM64 |
| + vsrc = sk_vld4_u8_arm64_3(src); |
| +#else |
| { |
| register uint8x8_t d0 asm("d0"); |
| register uint8x8_t d1 asm("d1"); |
| @@ -485,17 +546,18 @@ void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src, |
| register uint8x8_t d3 asm("d3"); |
| asm ( |
| - "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" |
| + "vld4.8 {d0-d3},[%[src]]! " |
| : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) |
| : |
| ); |
| - vsrc_g = d1; |
| -#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) |
| - vsrc_r = d2; vsrc_b = d0; |
| -#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) |
| - vsrc_r = d0; vsrc_b = d2; |
| -#endif |
| + vsrc.val[0] = d0; |
| + vsrc.val[1] = d1; |
| + vsrc.val[2] = d2; |
| } |
| +#endif |
| + vsrc_r = vsrc.val[NEON_R]; |
| + vsrc_g = vsrc.val[NEON_G]; |
| + vsrc_b = vsrc.val[NEON_B]; |
| vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6 |
| vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5 |
| @@ -576,7 +638,6 @@ void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src, |
| } while (--count != 0); |
| } |
| } |
| -#endif |
| void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, |
| const SkPMColor* SK_RESTRICT src, |
| @@ -1047,6 +1108,7 @@ static void showme16(char *str, void *p, int len) |
| SkDebugf("%s\n", buf); |
| } |
| #endif |
| +#endif // #ifdef SK_CPU_ARM32 |
| void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, |
| const SkPMColor* SK_RESTRICT src, |
| @@ -1073,6 +1135,7 @@ void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, |
| dbase = vld1_u8(dstart); |
| do { |
| + uint8x8x4_t vsrc; |
| uint8x8_t sr, sg, sb, sa, d; |
| uint16x8_t dst8, scale8, alpha8; |
| uint16x8_t dst_r, dst_g, dst_b; |
| @@ -1123,23 +1186,29 @@ void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, |
| } |
| #endif |
| - |
| +#ifdef SK_CPU_ARM64 |
| + vsrc = sk_vld4_u8_arm64_4(src); |
| +#else |
| { |
| register uint8x8_t d0 asm("d0"); |
| register uint8x8_t d1 asm("d1"); |
| register uint8x8_t d2 asm("d2"); |
| register uint8x8_t d3 asm("d3"); |
| - asm ("vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" |
| + asm ("vld4.8 {d0-d3},[%[src]]! " |
| : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src) |
| : |
| ); |
| -#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) |
| - sr = d2; sg = d1; sb = d0; sa = d3; |
| -#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) |
| - sr = d0; sg = d1; sb = d2; sa = d3; |
| -#endif |
| + vsrc.val[0] = d0; |
| + vsrc.val[1] = d1; |
| + vsrc.val[2] = d2; |
| + vsrc.val[3] = d3; |
| } |
| +#endif |
| + sa = vsrc.val[NEON_A]; |
| + sr = vsrc.val[NEON_R]; |
| + sg = vsrc.val[NEON_G]; |
| + sb = vsrc.val[NEON_B]; |
| /* calculate 'd', which will be 0..7 |
| * dbase[] is 0..7; alpha is 0..256; 16 bits suffice |
| @@ -1283,7 +1352,11 @@ void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst, |
| uint8x8_t sr, sg, sb; |
| uint16x8_t dr, dg, db; |
| uint16x8_t dst8; |
| + uint8x8x4_t vsrc; |
| +#ifdef SK_CPU_ARM64 |
| + vsrc = sk_vld4_u8_arm64_3(src); |
| +#else |
| { |
| register uint8x8_t d0 asm("d0"); |
| register uint8x8_t d1 asm("d1"); |
| @@ -1291,17 +1364,19 @@ void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst, |
| register uint8x8_t d3 asm("d3"); |
| asm ( |
| - "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" |
| + "vld4.8 {d0-d3},[%[src]]! " |
| : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) |
| : |
| ); |
| - sg = d1; |
| -#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) |
| - sr = d2; sb = d0; |
| -#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) |
| - sr = d0; sb = d2; |
| -#endif |
| + vsrc.val[0] = d0; |
| + vsrc.val[1] = d1; |
| + vsrc.val[2] = d2; |
| } |
| +#endif |
| + sr = vsrc.val[NEON_R]; |
| + sg = vsrc.val[NEON_G]; |
| + sb = vsrc.val[NEON_B]; |
| + |
| /* XXX: if we want to prefetch, hide it in the above asm() |
| * using the gcc __builtin_prefetch(), the prefetch will |
| * fall to the bottom of the loop -- it won't stick up |
| @@ -1369,7 +1444,6 @@ void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst, |
| } while (--count != 0); |
| } |
| } |
| -#endif |
| void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count, |
| SkPMColor color) { |
| @@ -1475,11 +1549,14 @@ void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count, |
| /////////////////////////////////////////////////////////////////////////////// |
| const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { |
| -#ifdef SK_CPU_ARM32 |
| // no dither |
| S32_D565_Opaque_neon, |
| S32_D565_Blend_neon, |
| +#ifdef SK_CPU_ARM32 |
| S32A_D565_Opaque_neon, |
| +#else |
| + NULL, |
| +#endif |
| S32A_D565_Blend_neon, |
| // dither |
| @@ -1487,10 +1564,6 @@ const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { |
| S32_D565_Blend_Dither_neon, |
| S32A_D565_Opaque_Dither_neon, |
| NULL, // S32A_D565_Blend_Dither |
| -#else |
| - NULL, NULL, NULL, NULL, |
| - NULL, NULL, NULL, NULL |
| -#endif |
| }; |
| const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { |