| Index: src/opts/SkBlitRow_opts_arm_neon.cpp
|
| diff --git a/src/opts/SkBlitRow_opts_arm_neon.cpp b/src/opts/SkBlitRow_opts_arm_neon.cpp
|
| index 950e4f71d124e6c6682082913f5252924bc52e4b..9503323d87a42900bd248e1a649f571acf90fbf9 100644
|
| --- a/src/opts/SkBlitRow_opts_arm_neon.cpp
|
| +++ b/src/opts/SkBlitRow_opts_arm_neon.cpp
|
| @@ -1384,84 +1384,88 @@ void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
|
| unsigned colorA = SkGetPackedA32(color);
|
| if (255 == colorA) {
|
| sk_memset32(dst, color, count);
|
| - } else {
|
| - unsigned scale = 256 - SkAlpha255To256(colorA);
|
| + return;
|
| + }
|
|
|
| - if (count >= 8) {
|
| - // at the end of this assembly, count will have been decremented
|
| - // to a negative value. That is, if count mod 8 = x, it will be
|
| - // -8 +x coming out.
|
| - asm volatile (
|
| - PLD128(src, 0)
|
| -
|
| - "vdup.32 q0, %[color] \n\t"
|
| -
|
| - PLD128(src, 128)
|
| -
|
| - // scale numerical interval [0-255], so load as 8 bits
|
| - "vdup.8 d2, %[scale] \n\t"
|
| -
|
| - PLD128(src, 256)
|
| -
|
| - "subs %[count], %[count], #8 \n\t"
|
| -
|
| - PLD128(src, 384)
|
| -
|
| - "Loop_Color32: \n\t"
|
| -
|
| - // load src color, 8 pixels, 4 64 bit registers
|
| - // (and increment src).
|
| - "vld1.32 {d4-d7}, [%[src]]! \n\t"
|
| -
|
| - PLD128(src, 384)
|
| -
|
| - // multiply long by scale, 64 bits at a time,
|
| - // destination into a 128 bit register.
|
| - "vmull.u8 q4, d4, d2 \n\t"
|
| - "vmull.u8 q5, d5, d2 \n\t"
|
| - "vmull.u8 q6, d6, d2 \n\t"
|
| - "vmull.u8 q7, d7, d2 \n\t"
|
| -
|
| - // shift the 128 bit registers, containing the 16
|
| - // bit scaled values back to 8 bits, narrowing the
|
| - // results to 64 bit registers.
|
| - "vshrn.i16 d8, q4, #8 \n\t"
|
| - "vshrn.i16 d9, q5, #8 \n\t"
|
| - "vshrn.i16 d10, q6, #8 \n\t"
|
| - "vshrn.i16 d11, q7, #8 \n\t"
|
| -
|
| - // adding back the color, using 128 bit registers.
|
| - "vadd.i8 q6, q4, q0 \n\t"
|
| - "vadd.i8 q7, q5, q0 \n\t"
|
| -
|
| - // store back the 8 calculated pixels (2 128 bit
|
| - // registers), and increment dst.
|
| - "vst1.32 {d12-d15}, [%[dst]]! \n\t"
|
| -
|
| - "subs %[count], %[count], #8 \n\t"
|
| - "bge Loop_Color32 \n\t"
|
| - : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
|
| - : [color] "r" (color), [scale] "r" (scale)
|
| - : "cc", "memory",
|
| - "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
|
| - "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15"
|
| - );
|
| - // At this point, if we went through the inline assembly, count is
|
| - // a negative value:
|
| - // if the value is -8, there is no pixel left to process.
|
| - // if the value is -7, there is one pixel left to process
|
| - // ...
|
| - // And'ing it with 7 will give us the number of pixels
|
| - // left to process.
|
| - count = count & 0x7;
|
| - }
|
| + unsigned scale = 256 - SkAlpha255To256(colorA);
|
|
|
| - while (count > 0) {
|
| - *dst = color + SkAlphaMulQ(*src, scale);
|
| - src += 1;
|
| - dst += 1;
|
| - count--;
|
| - }
|
| + if (count >= 8) {
|
| + uint32x4_t vcolor;
|
| + uint8x8_t vscale;
|
| +
|
| + vcolor = vdupq_n_u32(color);
|
| +
|
| + // scale numerical interval [0-255], so load as 8 bits
|
| + vscale = vdup_n_u8(scale);
|
| +
|
| + do {
|
| + // load src color, 8 pixels, 4 64 bit registers
|
| + // (and increment src).
|
| + uint32x2x4_t vsrc;
|
| +#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
|
| + asm (
|
| + "vld1.32 %h[vsrc], [%[src]]!"
|
| + : [vsrc] "=w" (vsrc), [src] "+r" (src)
|
| + : :
|
| + );
|
| +#else // (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
|
| + vsrc.val[0] = vld1_u32(src);
|
| + vsrc.val[1] = vld1_u32(src+2);
|
| + vsrc.val[2] = vld1_u32(src+4);
|
| + vsrc.val[3] = vld1_u32(src+6);
|
| + src += 8;
|
| +#endif
|
| +
|
| + // multiply long by scale, 64 bits at a time,
|
| + // destination into a 128 bit register.
|
| + uint16x8x4_t vtmp;
|
| + vtmp.val[0] = vmull_u8(vreinterpret_u8_u32(vsrc.val[0]), vscale);
|
| + vtmp.val[1] = vmull_u8(vreinterpret_u8_u32(vsrc.val[1]), vscale);
|
| + vtmp.val[2] = vmull_u8(vreinterpret_u8_u32(vsrc.val[2]), vscale);
|
| + vtmp.val[3] = vmull_u8(vreinterpret_u8_u32(vsrc.val[3]), vscale);
|
| +
|
| + // shift the 128 bit registers, containing the 16
|
| + // bit scaled values back to 8 bits, narrowing the
|
| + // results to 64 bit registers.
|
| + uint8x16x2_t vres;
|
| + vres.val[0] = vcombine_u8(
|
| + vshrn_n_u16(vtmp.val[0], 8),
|
| + vshrn_n_u16(vtmp.val[1], 8));
|
| + vres.val[1] = vcombine_u8(
|
| + vshrn_n_u16(vtmp.val[2], 8),
|
| + vshrn_n_u16(vtmp.val[3], 8));
|
| +
|
| + // adding back the color, using 128 bit registers.
|
| + uint32x4x2_t vdst;
|
| + vdst.val[0] = vreinterpretq_u32_u8(vres.val[0] +
|
| + vreinterpretq_u8_u32(vcolor));
|
| + vdst.val[1] = vreinterpretq_u32_u8(vres.val[1] +
|
| + vreinterpretq_u8_u32(vcolor));
|
| +
|
| + // store back the 8 calculated pixels (2 128 bit
|
| + // registers), and increment dst.
|
| +#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
|
| + asm (
|
| + "vst1.32 %h[vdst], [%[dst]]!"
|
| + : [dst] "+r" (dst)
|
| + : [vdst] "w" (vdst)
|
| + : "memory"
|
| + );
|
| +#else // (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
|
| + vst1q_u32(dst, vdst.val[0]);
|
| + vst1q_u32(dst+4, vdst.val[1]);
|
| + dst += 8;
|
| +#endif
|
| + count -= 8;
|
| +
|
| + } while (count >= 8);
|
| + }
|
| +
|
| + while (count > 0) {
|
| + *dst = color + SkAlphaMulQ(*src, scale);
|
| + src += 1;
|
| + dst += 1;
|
| + count--;
|
| }
|
| }
|
|
|
|
|