| Index: src/opts/SkBlitRow_opts_arm_neon.cpp
|
| diff --git a/src/opts/SkBlitRow_opts_arm_neon.cpp b/src/opts/SkBlitRow_opts_arm_neon.cpp
|
| index 705ee998ddcfd6e64cb011c71cf7d8f25bbc0cd0..ffa0a8b3e41f981bc3499cf0b3322c3f89210fe9 100644
|
| --- a/src/opts/SkBlitRow_opts_arm_neon.cpp
|
| +++ b/src/opts/SkBlitRow_opts_arm_neon.cpp
|
| @@ -15,9 +15,45 @@
|
| #include "SkUtils.h"
|
|
|
| #include "SkCachePreload_arm.h"
|
| -
|
| +#include "SkColor_opts_neon.h"
|
| #include <arm_neon.h>
|
|
|
| +// Converts count 32-bit SkPMColor pixels from src into 16-bit 565 pixels in dst.
|
| +// alpha must be 255 (asserted); the x and y arguments are ignored.
|
| +void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
|
| +                          const SkPMColor* SK_RESTRICT src, int count,
|
| +                          U8CPU alpha, int /*x*/, int /*y*/) {
|
| +    SkASSERT(255 == alpha);
|
| +
|
| +    // Vector path: 8 pixels per iteration.
|
| +    while (count >= 8) {
|
| +        // De-interleaving load: val[k] receives byte k of each of the 8 pixels.
|
| +        // reinterpret_cast keeps the const qualifier that src carries.
|
| +        const uint8x8x4_t vsrc = vld4_u8(reinterpret_cast<const uint8_t*>(src));
|
| +
|
| +        // Widen R into the high byte, then shift-right-insert G and B below it,
|
| +        // leaving the top 5 bits of R, top 6 bits of G and top 5 bits of B.
|
| +        uint16x8_t vdst = vshll_n_u8(vsrc.val[NEON_R], 8);
|
| +        vdst = vsriq_n_u16(vdst, vshll_n_u8(vsrc.val[NEON_G], 8), 5);
|
| +        vdst = vsriq_n_u16(vdst, vshll_n_u8(vsrc.val[NEON_B], 8), 5+6);
|
| +
|
| +        vst1q_u16(dst, vdst);
|
| +
|
| +        dst += 8;
|
| +        src += 8;
|
| +        count -= 8;
|
| +    }
|
| +
|
| +    // Scalar tail for the remaining (fewer than 8) pixels.
|
| +    while (count > 0) {
|
| +        SkPMColor c = *src++;
|
| +        SkPMColorAssert(c);
|
| +        *dst = SkPixel32ToPixel16_ToU16(c);
|
| +        dst++;
|
| +        count--;
|
| +    }
|
| +}
|
| +
|
| void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
|
| const SkPMColor* SK_RESTRICT src, int count,
|
| U8CPU alpha, int /*x*/, int /*y*/) {
|
| @@ -1330,10 +1366,10 @@ void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
|
|
|
| const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
|
| // no dither
|
| - // NOTE: For the two functions below, we don't have a special version
|
| - // that assumes that each source pixel is opaque. But our S32A is
|
| - // still faster than the default, so use it.
|
| - S32A_D565_Opaque_neon, // really S32_D565_Opaque
|
| + // NOTE: For the S32_D565_Blend function below, we don't have a special
|
| + // version that assumes that each source pixel is opaque. But our
|
| + // S32A is still faster than the default, so use it.
|
| + S32_D565_Opaque_neon,
|
| S32A_D565_Blend_neon, // really S32_D565_Blend
|
| S32A_D565_Opaque_neon,
|
| S32A_D565_Blend_neon,
|
|
|