src/opts/SkXfermode_opts_arm_neon.cpp - Issue 216113005: Revert of ARM Skia NEON patches - 35 - First AArch64 support

Unified Diff: src/opts/SkXfermode_opts_arm_neon.cpp

Issue 216113005: Revert of ARM Skia NEON patches - 35 - First AArch64 support (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: Created 6 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/opts/SkXfermode_opts_arm_neon.cpp

diff --git a/src/opts/SkXfermode_opts_arm_neon.cpp b/src/opts/SkXfermode_opts_arm_neon.cpp

index 70e92af66bc548f9f42c0ecc70b8f8513794d096..17a82f554b6fb51a08bdf8fa63d3a5b003fae34b 100644

--- a/src/opts/SkXfermode_opts_arm_neon.cpp

+++ b/src/opts/SkXfermode_opts_arm_neon.cpp

@@ -41,13 +41,8 @@

static inline uint8x8_t SkDiv255Round_neon8_32_8(int32x4_t p1, int32x4_t p2) {

uint16x8_t tmp;

-#ifdef SK_CPU_ARM64

- tmp = vmovn_high_u32(vmovn_u32(vreinterpretq_u32_s32(p1)),

- vreinterpretq_u32_s32(p2));

-#else

tmp = vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(p1)),

vmovn_u32(vreinterpretq_u32_s32(p2)));

-#endif

tmp += vdupq_n_u16(128);

tmp += vshrq_n_u16(tmp, 8);

@@ -71,11 +66,7 @@

// Test if <= 0

cmp1 = vcleq_s32(val1, vdupq_n_s32(0));

cmp2 = vcleq_s32(val2, vdupq_n_s32(0));

-#ifdef SK_CPU_ARM64

- cmp16 = vmovn_high_u32(vmovn_u32(cmp1), cmp2);

-#else

cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2));

-#endif

cmp8_1 = vmovn_u16(cmp16);

// Init to zero

@@ -84,11 +75,7 @@

// Test if >= 255*255

cmp1 = vcgeq_s32(val1, vdupq_n_s32(255*255));

cmp2 = vcgeq_s32(val2, vdupq_n_s32(255*255));

-#ifdef SK_CPU_ARM64

- cmp16 = vmovn_high_u32(vmovn_u32(cmp1), cmp2);

-#else

cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2));

-#endif

cmp8 = vmovn_u16(cmp16);

// Insert 255 where true

@@ -422,19 +409,11 @@

if (overlay) {

dc2 = vshll_n_u8(dc, 1);

scdc2_1 = vmull_u16(vget_low_u16(dc2), vget_low_u16(vmovl_u8(sc)));

-#ifdef SK_CPU_ARM64

- scdc2_2 = vmull_high_u16(dc2, vmovl_u8(sc));

-#else

scdc2_2 = vmull_u16(vget_high_u16(dc2), vget_high_u16(vmovl_u8(sc)));

-#endif

} else {

sc2 = vshll_n_u8(sc, 1);

scdc2_1 = vmull_u16(vget_low_u16(sc2), vget_low_u16(vmovl_u8(dc)));

-#ifdef SK_CPU_ARM64

- scdc2_2 = vmull_high_u16(sc2, vmovl_u8(dc));

-#else

scdc2_2 = vmull_u16(vget_high_u16(sc2), vget_high_u16(vmovl_u8(dc)));

-#endif

}

// Calc COM

@@ -442,20 +421,12 @@

com1 = vreinterpretq_s32_u32(

vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc)));

com2 = vreinterpretq_s32_u32(

-#ifdef SK_CPU_ARM64

- vmull_high_u16(const255, sc_plus_dc));

-#else

vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc)));

-#endif

// Calc SUB

int32x4_t sub1, sub2;

sub1 = vreinterpretq_s32_u32(vaddl_u16(vget_low_u16(scda), vget_low_u16(dcsa)));

-#ifdef SK_CPU_ARM64

- sub2 = vreinterpretq_s32_u32(vaddl_high_u16(scda, dcsa));

-#else

sub2 = vreinterpretq_s32_u32(vaddl_u16(vget_high_u16(scda), vget_high_u16(dcsa)));

-#endif

sub1 = vsubq_s32(sub1, vreinterpretq_s32_u32(scdc2_1));

sub2 = vsubq_s32(sub2, vreinterpretq_s32_u32(scdc2_2));

@@ -473,14 +444,10 @@

int32x4_t val2_1, val2_2;

uint32x4_t cmp1, cmp2;

- // Doing a signed lengthening allows to save a few instructions

- // thanks to sign extension.

- cmp1 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_low_u16(cmp))));

-#ifdef SK_CPU_ARM64

- cmp2 = vreinterpretq_u32_s32(vmovl_high_s16(vreinterpretq_s16_u16(cmp)));

-#else

- cmp2 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_high_u16(cmp))));

-#endif

+ cmp1 = vmovl_u16(vget_low_u16(cmp));

+ cmp1 |= vshlq_n_u32(cmp1, 16);

+ cmp2 = vmovl_u16(vget_high_u16(cmp));

+ cmp2 |= vshlq_n_u32(cmp2, 16);

// Calc COM - SUB

val1_1 = com1 - sub1;

@@ -491,11 +458,7 @@

val2_2 = com2 + sub2;

val2_1 = vsubq_s32(val2_1, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sada))));

-#ifdef SK_CPU_ARM64

- val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_high_u16(sada)));

-#else

val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sada))));

-#endif

// Insert where needed

val1_1 = vbslq_s32(cmp1, val1_1, val2_1);

@@ -665,19 +628,11 @@

term1_1 = vreinterpretq_s32_u32(

vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc)));

term1_2 = vreinterpretq_s32_u32(

-#ifdef SK_CPU_ARM64

- vmull_high_u16(const255, sc_plus_dc));

-#else

vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc)));

-#endif

/* Calc the second term */

term2_1 = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(scdc), 1));

-#ifdef SK_CPU_ARM64

- term2_2 = vreinterpretq_s32_u32(vshll_high_n_u16(scdc, 1));

-#else

term2_2 = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(scdc), 1));

-#endif

return clamp_div255round_simd8_32(term1_1 - term2_1, term1_2 - term2_2);

}

@@ -706,18 +661,10 @@

scdc = vmull_u8(sc, dc);

val1 = vaddl_u16(vget_low_u16(t1), vget_low_u16(t2));

-#ifdef SK_CPU_ARM64

- val2 = vaddl_high_u16(t1, t2);

-#else

val2 = vaddl_u16(vget_high_u16(t1), vget_high_u16(t2));

-#endif

val1 = vaddw_u16(val1, vget_low_u16(scdc));

-#ifdef SK_CPU_ARM64

- val2 = vaddw_high_u16(val2, scdc);

-#else

val2 = vaddw_u16(val2, vget_high_u16(scdc));

-#endif

return clamp_div255round_simd8_32(

vreinterpretq_s32_u32(val1), vreinterpretq_s32_u32(val2));

@@ -761,10 +708,6 @@

while (count >= 8) {

uint8x8x4_t vsrc, vdst, vres;

-#ifdef SK_CPU_ARM64

- vsrc = vld4_u8((uint8_t*)src);

- vdst = vld4_u8((uint8_t*)dst);

-#else

#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))

asm volatile (

"vld4.u8 %h[vsrc], [%[src]]! \t\n"

@@ -797,7 +740,6 @@

vsrc.val[2] = d2; vdst.val[2] = d6;

vsrc.val[3] = d3; vdst.val[3] = d7;

#endif

-#endif // #ifdef SK_CPU_ARM64

vres = procSIMD(vsrc, vdst);

@@ -805,9 +747,6 @@

count -= 8;

dst += 8;

-#ifdef SK_CPU_ARM64

- src += 8;

-#endif

}

// Leftovers

for (int i = 0; i < count; i++) {

@@ -844,9 +783,6 @@

vdst = vld1q_u16(dst);

-#ifdef SK_CPU_ARM64

- vsrc = vld4_u8((uint8_t*)src);

-#else

#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))

asm volatile (

"vld4.u8 %h[vsrc], [%[src]]! \t\n"

@@ -870,7 +806,6 @@

vsrc.val[2] = d2;

vsrc.val[3] = d3;

#endif

-#endif // #ifdef SK_CPU_ARM64

vdst32 = SkPixel16ToPixel32_neon8(vdst);

vres = procSIMD(vsrc, vdst32);

@@ -880,9 +815,6 @@

count -= 8;

dst += 8;

-#ifdef SK_CPU_ARM64

- src += 8;

-#endif

}

for (int i = 0; i < count; i++) {

SkPMColor dstC = SkPixel16ToPixel32(dst[i]);

« no previous file with comments | « previous file: src/opts/SkBitmapProcState_opts_arm.cpp (keyboard shortcut 'k') | no next file » | no next file with comments »