Chromium Code Reviews | Index: src/opts/SkBlitRow_opts_arm_neon.cpp |
| diff --git a/src/opts/SkBlitRow_opts_arm_neon.cpp b/src/opts/SkBlitRow_opts_arm_neon.cpp |
| index 7868108378bbab635ac8fa99d4b8fb458db775de..15627b70dc5f2e5d50e50c1495ab15e32b9ac2c4 100644 |
| --- a/src/opts/SkBlitRow_opts_arm_neon.cpp |
| +++ b/src/opts/SkBlitRow_opts_arm_neon.cpp |
| @@ -15,9 +15,45 @@ |
| #include "SkUtils.h" |
| #include "SkCachePreload_arm.h" |
| - |
| +#include "SkColor_opts_neon.h" |
| #include <arm_neon.h> |
| +void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, |
| + const SkPMColor* SK_RESTRICT src, int count, |
| + U8CPU alpha, int /*x*/, int /*y*/) { |
| + SkASSERT(255 == alpha); |
| + |
| + while (count >= 8) { |
| + uint8x8x4_t vsrc; |
| + uint16x8_t vdst; |
| + |
| + // Load |
| + vsrc = vld4_u8((uint8_t*)src); |
| + |
| + // Convert src to 565 |
| + vdst = vshll_n_u8(vsrc.val[NEON_R], 8); |
| + vdst = vsriq_n_u16(vdst, vshll_n_u8(vsrc.val[NEON_G], 8), 5); |
| + vdst = vsriq_n_u16(vdst, vshll_n_u8(vsrc.val[NEON_B], 8), 5+6); |
| + |
| + // Store |
| + vst1q_u16(dst, vdst); |
| + |
| + // Prepare next iteration |
| + dst += 8; |
| + src += 8; |
| + count -= 8; |
| + }; |
| + |
| + // Leftovers |
| + while (count > 0) { |
| + SkPMColor c = *src++; |
| + SkPMColorAssert(c); |
| + *dst = SkPixel32ToPixel16_ToU16(c); |
| + dst += 1; |
|
mtklein (2013/09/20 13:36:08):
For symmetry, I'd go with `dst++` here instead of `dst += 1`.
kevin.petit.not.used.account (2013/09/20 14:41:56):
Done.
|
| + count--; |
| + }; |
| +} |
| + |
| void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, |
| const SkPMColor* SK_RESTRICT src, int count, |
| U8CPU alpha, int /*x*/, int /*y*/) { |
| @@ -1240,10 +1276,10 @@ void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count, |
| const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { |
| // no dither |
| - // NOTE: For the two functions below, we don't have a special version |
| - // that assumes that each source pixel is opaque. But our S32A is |
| - // still faster than the default, so use it. |
| - S32A_D565_Opaque_neon, // really S32_D565_Opaque |
| + // NOTE: For the S32_D565_Blend function below, we don't have a special |
| + // version that assumes that each source pixel is opaque. But our |
| + // S32A is still faster than the default, so use it. |
| + S32_D565_Opaque_neon, |
| S32A_D565_Blend_neon, // really S32_D565_Blend |
| S32A_D565_Opaque_neon, |
| S32A_D565_Blend_neon, |