OLD | NEW |
1 #include "SkXfermode.h" | 1 #include "SkXfermode.h" |
2 #include "SkXfermode_proccoeff.h" | 2 #include "SkXfermode_proccoeff.h" |
3 #include "SkColorPriv.h" | 3 #include "SkColorPriv.h" |
4 | 4 |
5 #include <arm_neon.h> | 5 #include <arm_neon.h> |
6 #include "SkColor_opts_neon.h" | 6 #include "SkColor_opts_neon.h" |
7 #include "SkXfermode_opts_arm_neon.h" | 7 #include "SkXfermode_opts_arm_neon.h" |
8 | 8 |
9 #define SkAlphaMulAlpha(a, b) SkMulDiv255Round(a, b) | 9 #define SkAlphaMulAlpha(a, b) SkMulDiv255Round(a, b) |
10 | 10 |
(...skipping 23 matching lines...)
34 ret = vaddq_u16(ret, vshrq_n_u16(ret, 8)); | 34 ret = vaddq_u16(ret, vshrq_n_u16(ret, 8)); |
35 | 35 |
36 ret = vshrq_n_u16(ret, 8); | 36 ret = vshrq_n_u16(ret, 8); |
37 | 37 |
38 return ret; | 38 return ret; |
39 } | 39 } |
40 | 40 |
41 static inline uint8x8_t SkDiv255Round_neon8_32_8(int32x4_t p1, int32x4_t p2) { | 41 static inline uint8x8_t SkDiv255Round_neon8_32_8(int32x4_t p1, int32x4_t p2) { |
42 uint16x8_t tmp; | 42 uint16x8_t tmp; |
43 | 43 |
| 44 #ifdef SK_CPU_ARM64 |
| 45 tmp = vmovn_high_u32(vmovn_u32(vreinterpretq_u32_s32(p1)), |
| 46 vreinterpretq_u32_s32(p2)); |
| 47 #else |
44 tmp = vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(p1)), | 48 tmp = vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(p1)), |
45 vmovn_u32(vreinterpretq_u32_s32(p2))); | 49 vmovn_u32(vreinterpretq_u32_s32(p2))); |
| 50 #endif |
46 | 51 |
47 tmp += vdupq_n_u16(128); | 52 tmp += vdupq_n_u16(128); |
48 tmp += vshrq_n_u16(tmp, 8); | 53 tmp += vshrq_n_u16(tmp, 8); |
49 | 54 |
50 return vshrn_n_u16(tmp, 8); | 55 return vshrn_n_u16(tmp, 8); |
51 } | 56 } |
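The recurring transformation in this patch is the narrowing step: the 32-bit build narrows each uint32x4_t half separately and pastes the halves together with vcombine_u16, while the AArch64 build uses vmovn_high_u32 to narrow the second vector straight into the upper lanes of the result. A minimal standalone sketch of the two equivalent forms, using plain intrinsics and the standard __aarch64__ macro instead of Skia's SK_CPU_ARM64:

    #include <arm_neon.h>

    // Narrow two uint32x4_t vectors into one uint16x8_t.

    // 32-bit NEON: narrow each half to 16x4, then combine into a q-register.
    static inline uint16x8_t narrow_pair_a32(uint32x4_t lo, uint32x4_t hi) {
        return vcombine_u16(vmovn_u32(lo), vmovn_u32(hi));
    }

    #if defined(__aarch64__)
    // AArch64: XTN2 via vmovn_high_u32 narrows 'hi' directly into the upper
    // half of the result, saving the separate combine.
    static inline uint16x8_t narrow_pair_a64(uint32x4_t lo, uint32x4_t hi) {
        return vmovn_high_u32(vmovn_u32(lo), hi);
    }
    #endif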
52 | 57 |
53 static inline uint16x8_t SkDiv255Round_neon8_16_16(uint16x8_t prod) { | 58 static inline uint16x8_t SkDiv255Round_neon8_16_16(uint16x8_t prod) { |
54 prod += vdupq_n_u16(128); | 59 prod += vdupq_n_u16(128); |
55 prod += vshrq_n_u16(prod, 8); | 60 prod += vshrq_n_u16(prod, 8); |
56 | 61 |
57 return vshrq_n_u16(prod, 8); | 62 return vshrq_n_u16(prod, 8); |
58 } | 63 } |
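Both Div255Round helpers implement the same rounding divide-by-255 on widened lanes: add 128, add the high byte back in, shift right by 8. A scalar restatement (illustrative name, not part of the patch) that matches the SkMulDiv255Round contract for any product of two 8-bit values:

    #include <stdint.h>

    // (x + 128 + ((x + 128) >> 8)) >> 8 == round(x / 255.0) for 0 <= x <= 255*255.
    static inline uint8_t div255_round_scalar(uint32_t x) {
        x += 128;
        return (uint8_t)((x + (x >> 8)) >> 8);
    }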
59 | 64 |
60 static inline uint8x8_t clamp_div255round_simd8_32(int32x4_t val1, int32x4_t val2) { | 65 static inline uint8x8_t clamp_div255round_simd8_32(int32x4_t val1, int32x4_t val2) { |
61 uint8x8_t ret; | 66 uint8x8_t ret; |
62 uint32x4_t cmp1, cmp2; | 67 uint32x4_t cmp1, cmp2; |
63 uint16x8_t cmp16; | 68 uint16x8_t cmp16; |
64 uint8x8_t cmp8, cmp8_1; | 69 uint8x8_t cmp8, cmp8_1; |
65 | 70 |
66 // Test if <= 0 | 71 // Test if <= 0 |
67 cmp1 = vcleq_s32(val1, vdupq_n_s32(0)); | 72 cmp1 = vcleq_s32(val1, vdupq_n_s32(0)); |
68 cmp2 = vcleq_s32(val2, vdupq_n_s32(0)); | 73 cmp2 = vcleq_s32(val2, vdupq_n_s32(0)); |
| 74 #ifdef SK_CPU_ARM64 |
| 75 cmp16 = vmovn_high_u32(vmovn_u32(cmp1), cmp2); |
| 76 #else |
69 cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2)); | 77 cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2)); |
| 78 #endif |
70 cmp8_1 = vmovn_u16(cmp16); | 79 cmp8_1 = vmovn_u16(cmp16); |
71 | 80 |
72 // Init to zero | 81 // Init to zero |
73 ret = vdup_n_u8(0); | 82 ret = vdup_n_u8(0); |
74 | 83 |
75 // Test if >= 255*255 | 84 // Test if >= 255*255 |
76 cmp1 = vcgeq_s32(val1, vdupq_n_s32(255*255)); | 85 cmp1 = vcgeq_s32(val1, vdupq_n_s32(255*255)); |
77 cmp2 = vcgeq_s32(val2, vdupq_n_s32(255*255)); | 86 cmp2 = vcgeq_s32(val2, vdupq_n_s32(255*255)); |
| 87 #ifdef SK_CPU_ARM64 |
| 88 cmp16 = vmovn_high_u32(vmovn_u32(cmp1), cmp2); |
| 89 #else |
78 cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2)); | 90 cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2)); |
| 91 #endif |
79 cmp8 = vmovn_u16(cmp16); | 92 cmp8 = vmovn_u16(cmp16); |
80 | 93 |
81 // Insert 255 where true | 94 // Insert 255 where true |
82 ret = vbsl_u8(cmp8, vdup_n_u8(255), ret); | 95 ret = vbsl_u8(cmp8, vdup_n_u8(255), ret); |
83 | 96 |
84 // Calc SkDiv255Round | 97 // Calc SkDiv255Round |
85 uint8x8_t div = SkDiv255Round_neon8_32_8(val1, val2); | 98 uint8x8_t div = SkDiv255Round_neon8_32_8(val1, val2); |
86 | 99 |
87 // Insert where false and previous test false | 100 // Insert where false and previous test false |
88 cmp8 = cmp8 | cmp8_1; | 101 cmp8 = cmp8 | cmp8_1; |
(...skipping 313 matching lines...)
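The tail of clamp_div255round_simd8_32 falls inside the collapsed region, but the comparisons above give its shape: lanes at or below zero become 0, lanes at or above 255*255 become 255, and everything else goes through the rounded divide. A per-lane scalar sketch (illustrative helper, mirroring the visible tests):

    #include <stdint.h>

    // Per-lane behaviour of clamp_div255round_simd8_32, written as scalar C.
    static inline uint8_t clamp_div255round_scalar(int32_t prod) {
        if (prod <= 0) {
            return 0;                  // vcleq_s32 against 0 -> insert 0
        }
        if (prod >= 255 * 255) {
            return 255;                // vcgeq_s32 against 255*255 -> insert 255
        }
        uint32_t x = (uint32_t)prod + 128;
        return (uint8_t)((x + (x >> 8)) >> 8);   // SkDiv255Round
    }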
402 uint16x8_t scda = vmull_u8(sc, da); | 415 uint16x8_t scda = vmull_u8(sc, da); |
403 uint16x8_t dcsa = vmull_u8(dc, sa); | 416 uint16x8_t dcsa = vmull_u8(dc, sa); |
404 uint16x8_t sada = vmull_u8(sa, da); | 417 uint16x8_t sada = vmull_u8(sa, da); |
405 | 418 |
406 // Prepare non-common subexpressions | 419 // Prepare non-common subexpressions |
407 uint16x8_t dc2, sc2; | 420 uint16x8_t dc2, sc2; |
408 uint32x4_t scdc2_1, scdc2_2; | 421 uint32x4_t scdc2_1, scdc2_2; |
409 if (overlay) { | 422 if (overlay) { |
410 dc2 = vshll_n_u8(dc, 1); | 423 dc2 = vshll_n_u8(dc, 1); |
411 scdc2_1 = vmull_u16(vget_low_u16(dc2), vget_low_u16(vmovl_u8(sc))); | 424 scdc2_1 = vmull_u16(vget_low_u16(dc2), vget_low_u16(vmovl_u8(sc))); |
| 425 #ifdef SK_CPU_ARM64 |
| 426 scdc2_2 = vmull_high_u16(dc2, vmovl_u8(sc)); |
| 427 #else |
412 scdc2_2 = vmull_u16(vget_high_u16(dc2), vget_high_u16(vmovl_u8(sc))); | 428 scdc2_2 = vmull_u16(vget_high_u16(dc2), vget_high_u16(vmovl_u8(sc))); |
| 429 #endif |
413 } else { | 430 } else { |
414 sc2 = vshll_n_u8(sc, 1); | 431 sc2 = vshll_n_u8(sc, 1); |
415 scdc2_1 = vmull_u16(vget_low_u16(sc2), vget_low_u16(vmovl_u8(dc))); | 432 scdc2_1 = vmull_u16(vget_low_u16(sc2), vget_low_u16(vmovl_u8(dc))); |
| 433 #ifdef SK_CPU_ARM64 |
| 434 scdc2_2 = vmull_high_u16(sc2, vmovl_u8(dc)); |
| 435 #else |
416 scdc2_2 = vmull_u16(vget_high_u16(sc2), vget_high_u16(vmovl_u8(dc))); | 436 scdc2_2 = vmull_u16(vget_high_u16(sc2), vget_high_u16(vmovl_u8(dc))); |
| 437 #endif |
417 } | 438 } |
418 | 439 |
419 // Calc COM | 440 // Calc COM |
420 int32x4_t com1, com2; | 441 int32x4_t com1, com2; |
421 com1 = vreinterpretq_s32_u32( | 442 com1 = vreinterpretq_s32_u32( |
422 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc))); | 443 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc))); |
423 com2 = vreinterpretq_s32_u32( | 444 com2 = vreinterpretq_s32_u32( |
| 445 #ifdef SK_CPU_ARM64 |
| 446 vmull_high_u16(const255, sc_plus_dc)); |
| 447 #else |
424 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc))); | 448 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc))); |
| 449 #endif |
425 | 450 |
426 // Calc SUB | 451 // Calc SUB |
427 int32x4_t sub1, sub2; | 452 int32x4_t sub1, sub2; |
428 sub1 = vreinterpretq_s32_u32(vaddl_u16(vget_low_u16(scda), vget_low_u16(dcsa))); | 453 sub1 = vreinterpretq_s32_u32(vaddl_u16(vget_low_u16(scda), vget_low_u16(dcsa))); |
| 454 #ifdef SK_CPU_ARM64 |
| 455 sub2 = vreinterpretq_s32_u32(vaddl_high_u16(scda, dcsa)); |
| 456 #else |
429 sub2 = vreinterpretq_s32_u32(vaddl_u16(vget_high_u16(scda), vget_high_u16(dcsa))); | 457 sub2 = vreinterpretq_s32_u32(vaddl_u16(vget_high_u16(scda), vget_high_u16(dcsa))); |
| 458 #endif |
430 sub1 = vsubq_s32(sub1, vreinterpretq_s32_u32(scdc2_1)); | 459 sub1 = vsubq_s32(sub1, vreinterpretq_s32_u32(scdc2_1)); |
431 sub2 = vsubq_s32(sub2, vreinterpretq_s32_u32(scdc2_2)); | 460 sub2 = vsubq_s32(sub2, vreinterpretq_s32_u32(scdc2_2)); |
432 | 461 |
433 // Compare 2*dc <= da | 462 // Compare 2*dc <= da |
434 uint16x8_t cmp; | 463 uint16x8_t cmp; |
435 | 464 |
436 if (overlay) { | 465 if (overlay) { |
437 cmp = vcleq_u16(dc2, vmovl_u8(da)); | 466 cmp = vcleq_u16(dc2, vmovl_u8(da)); |
438 } else { | 467 } else { |
439 cmp = vcleq_u16(sc2, vmovl_u8(sa)); | 468 cmp = vcleq_u16(sc2, vmovl_u8(sa)); |
440 } | 469 } |
441 | 470 |
442 // Prepare variables | 471 // Prepare variables |
443 int32x4_t val1_1, val1_2; | 472 int32x4_t val1_1, val1_2; |
444 int32x4_t val2_1, val2_2; | 473 int32x4_t val2_1, val2_2; |
445 uint32x4_t cmp1, cmp2; | 474 uint32x4_t cmp1, cmp2; |
446 | 475 |
447 cmp1 = vmovl_u16(vget_low_u16(cmp)); | 476 // Doing a signed lengthening saves a few instructions |
448 cmp1 |= vshlq_n_u32(cmp1, 16); | 477 // thanks to sign extension. |
449 cmp2 = vmovl_u16(vget_high_u16(cmp)); | 478 cmp1 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_low_u16(cmp)))); |
450 cmp2 |= vshlq_n_u32(cmp2, 16); | 479 #ifdef SK_CPU_ARM64 |
| 480 cmp2 = vreinterpretq_u32_s32(vmovl_high_s16(vreinterpretq_s16_u16(cmp))); |
| 481 #else |
| 482 cmp2 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_high_u16(cmp)))); |
| 483 #endif |
451 | 484 |
452 // Calc COM - SUB | 485 // Calc COM - SUB |
453 val1_1 = com1 - sub1; | 486 val1_1 = com1 - sub1; |
454 val1_2 = com2 - sub2; | 487 val1_2 = com2 - sub2; |
455 | 488 |
456 // Calc COM + SUB - sa*da | 489 // Calc COM + SUB - sa*da |
457 val2_1 = com1 + sub1; | 490 val2_1 = com1 + sub1; |
458 val2_2 = com2 + sub2; | 491 val2_2 = com2 + sub2; |
459 | 492 |
460 val2_1 = vsubq_s32(val2_1, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sada)))); | 493 val2_1 = vsubq_s32(val2_1, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sada)))); |
| 494 #ifdef SK_CPU_ARM64 |
| 495 val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_high_u16(sada))); |
| 496 #else |
461 val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sada)))); | 497 val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sada)))); |
| 498 #endif |
462 | 499 |
463 // Insert where needed | 500 // Insert where needed |
464 val1_1 = vbslq_s32(cmp1, val1_1, val2_1); | 501 val1_1 = vbslq_s32(cmp1, val1_1, val2_1); |
465 val1_2 = vbslq_s32(cmp2, val1_2, val2_2); | 502 val1_2 = vbslq_s32(cmp2, val1_2, val2_2); |
466 | 503 |
467 // Call the clamp_div255round function | 504 // Call the clamp_div255round function |
468 return clamp_div255round_simd8_32(val1_1, val1_2); | 505 return clamp_div255round_simd8_32(val1_1, val1_2); |
469 } | 506 } |
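Expanding the COM, SUB and sa*da terms shows what each selected lane evaluates to per channel. A scalar restatement of the blend computed above (a sketch under the vector code's variable names, using the illustrative clamp_div255round_scalar helper from earlier):

    // overlay == true selects the overlay variant, false the hardlight variant,
    // matching the flag used by the vector routine.
    static inline uint8_t overlay_hardlight_scalar(int sc, int dc, int sa, int da,
                                                   bool overlay) {
        const int com   = 255 * (sc + dc);                   // COM
        const int scdc2 = 2 * sc * dc;                       // same product either way
        const int sub   = sc * da + dc * sa - scdc2;         // SUB
        const bool take_first = overlay ? (2 * dc <= da) : (2 * sc <= sa);
        const int val = take_first ? (com - sub)             // COM - SUB
                                   : (com + sub - sa * da);  // COM + SUB - sa*da
        return clamp_div255round_scalar(val);
    }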
470 | 507 |
471 static inline uint8x8_t overlay_color(uint8x8_t sc, uint8x8_t dc, | 508 static inline uint8x8_t overlay_color(uint8x8_t sc, uint8x8_t dc, |
(...skipping 149 matching lines...)
621 sc_plus_dc = vaddl_u8(sc, dc); | 658 sc_plus_dc = vaddl_u8(sc, dc); |
622 scdc = vmull_u8(sc, dc); | 659 scdc = vmull_u8(sc, dc); |
623 | 660 |
624 /* Prepare constants */ | 661 /* Prepare constants */ |
625 const255 = vdupq_n_u16(255); | 662 const255 = vdupq_n_u16(255); |
626 | 663 |
627 /* Calc the first term */ | 664 /* Calc the first term */ |
628 term1_1 = vreinterpretq_s32_u32( | 665 term1_1 = vreinterpretq_s32_u32( |
629 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc))); | 666 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc))); |
630 term1_2 = vreinterpretq_s32_u32( | 667 term1_2 = vreinterpretq_s32_u32( |
| 668 #ifdef SK_CPU_ARM64 |
| 669 vmull_high_u16(const255, sc_plus_dc)); |
| 670 #else |
631 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc))); | 671 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc))); |
| 672 #endif |
632 | 673 |
633 /* Calc the second term */ | 674 /* Calc the second term */ |
634 term2_1 = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(scdc), 1)); | 675 term2_1 = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(scdc), 1)); |
| 676 #ifdef SK_CPU_ARM64 |
| 677 term2_2 = vreinterpretq_s32_u32(vshll_high_n_u16(scdc, 1)); |
| 678 #else |
635 term2_2 = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(scdc), 1)); | 679 term2_2 = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(scdc), 1)); |
| 680 #endif |
636 | 681 |
637 return clamp_div255round_simd8_32(term1_1 - term2_1, term1_2 - term2_2); | 682 return clamp_div255round_simd8_32(term1_1 - term2_1, term1_2 - term2_2); |
638 } | 683 } |
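exclusion_color reduces to a short per-channel expression; the alpha channels are not needed for these two terms. A scalar sketch of the same computation:

    // 255*(sc + dc) - 2*sc*dc, clamped and divided by 255 with rounding.
    static inline uint8_t exclusion_scalar(int sc, int dc) {
        const int term1 = 255 * (sc + dc);
        const int term2 = 2 * sc * dc;
        return clamp_div255round_scalar(term1 - term2);
    }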
639 | 684 |
640 uint8x8x4_t exclusion_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | 685 uint8x8x4_t exclusion_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { |
641 uint8x8x4_t ret; | 686 uint8x8x4_t ret; |
642 | 687 |
643 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); | 688 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); |
644 ret.val[NEON_R] = exclusion_color(src.val[NEON_R], dst.val[NEON_R], | 689 ret.val[NEON_R] = exclusion_color(src.val[NEON_R], dst.val[NEON_R], |
645 src.val[NEON_A], dst.val[NEON_A]); | 690 src.val[NEON_A], dst.val[NEON_A]); |
646 ret.val[NEON_G] = exclusion_color(src.val[NEON_G], dst.val[NEON_G], | 691 ret.val[NEON_G] = exclusion_color(src.val[NEON_G], dst.val[NEON_G], |
647 src.val[NEON_A], dst.val[NEON_A]); | 692 src.val[NEON_A], dst.val[NEON_A]); |
648 ret.val[NEON_B] = exclusion_color(src.val[NEON_B], dst.val[NEON_B], | 693 ret.val[NEON_B] = exclusion_color(src.val[NEON_B], dst.val[NEON_B], |
649 src.val[NEON_A], dst.val[NEON_A]); | 694 src.val[NEON_A], dst.val[NEON_A]); |
650 | 695 |
651 return ret; | 696 return ret; |
652 } | 697 } |
653 | 698 |
654 static inline uint8x8_t blendfunc_multiply_color(uint8x8_t sc, uint8x8_t dc, | 699 static inline uint8x8_t blendfunc_multiply_color(uint8x8_t sc, uint8x8_t dc, |
655 uint8x8_t sa, uint8x8_t da) { | 700 uint8x8_t sa, uint8x8_t da) { |
656 uint32x4_t val1, val2; | 701 uint32x4_t val1, val2; |
657 uint16x8_t scdc, t1, t2; | 702 uint16x8_t scdc, t1, t2; |
658 | 703 |
659 t1 = vmull_u8(sc, vdup_n_u8(255) - da); | 704 t1 = vmull_u8(sc, vdup_n_u8(255) - da); |
660 t2 = vmull_u8(dc, vdup_n_u8(255) - sa); | 705 t2 = vmull_u8(dc, vdup_n_u8(255) - sa); |
661 scdc = vmull_u8(sc, dc); | 706 scdc = vmull_u8(sc, dc); |
662 | 707 |
663 val1 = vaddl_u16(vget_low_u16(t1), vget_low_u16(t2)); | 708 val1 = vaddl_u16(vget_low_u16(t1), vget_low_u16(t2)); |
| 709 #ifdef SK_CPU_ARM64 |
| 710 val2 = vaddl_high_u16(t1, t2); |
| 711 #else |
664 val2 = vaddl_u16(vget_high_u16(t1), vget_high_u16(t2)); | 712 val2 = vaddl_u16(vget_high_u16(t1), vget_high_u16(t2)); |
| 713 #endif |
665 | 714 |
666 val1 = vaddw_u16(val1, vget_low_u16(scdc)); | 715 val1 = vaddw_u16(val1, vget_low_u16(scdc)); |
| 716 #ifdef SK_CPU_ARM64 |
| 717 val2 = vaddw_high_u16(val2, scdc); |
| 718 #else |
667 val2 = vaddw_u16(val2, vget_high_u16(scdc)); | 719 val2 = vaddw_u16(val2, vget_high_u16(scdc)); |
| 720 #endif |
668 | 721 |
669 return clamp_div255round_simd8_32( | 722 return clamp_div255round_simd8_32( |
670 vreinterpretq_s32_u32(val1), vreinterpretq_s32_u32(val2)); | 723 vreinterpretq_s32_u32(val1), vreinterpretq_s32_u32(val2)); |
671 } | 724 } |
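blendfunc_multiply_color accumulates three widened products before the shared clamp-and-divide. The per-channel scalar form it corresponds to:

    // sc*(255 - da) + dc*(255 - sa) + sc*dc, then the rounded divide by 255.
    static inline uint8_t multiply_scalar(int sc, int dc, int sa, int da) {
        const int val = sc * (255 - da) + dc * (255 - sa) + sc * dc;
        return clamp_div255round_scalar(val);
    }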
672 | 725 |
673 uint8x8x4_t multiply_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | 726 uint8x8x4_t multiply_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { |
674 uint8x8x4_t ret; | 727 uint8x8x4_t ret; |
675 | 728 |
676 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); | 729 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); |
677 ret.val[NEON_R] = blendfunc_multiply_color(src.val[NEON_R], dst.val[NEON_R], | 730 ret.val[NEON_R] = blendfunc_multiply_color(src.val[NEON_R], dst.val[NEON_R], |
(...skipping 23 matching lines...)
701 | 754 |
702 SkXfermodeProc proc = this->getProc(); | 755 SkXfermodeProc proc = this->getProc(); |
703 SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD); | 756 SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD); |
704 SkASSERT(procSIMD != NULL); | 757 SkASSERT(procSIMD != NULL); |
705 | 758 |
706 if (NULL == aa) { | 759 if (NULL == aa) { |
707 // Unrolled NEON code | 760 // Unrolled NEON code |
708 while (count >= 8) { | 761 while (count >= 8) { |
709 uint8x8x4_t vsrc, vdst, vres; | 762 uint8x8x4_t vsrc, vdst, vres; |
710 | 763 |
| 764 #ifdef SK_CPU_ARM64 |
| 765 vsrc = vld4_u8((uint8_t*)src); |
| 766 vdst = vld4_u8((uint8_t*)dst); |
| 767 #else |
711 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) | 768 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) |
712 asm volatile ( | 769 asm volatile ( |
713 "vld4.u8 %h[vsrc], [%[src]]! \t\n" | 770 "vld4.u8 %h[vsrc], [%[src]]! \t\n" |
714 "vld4.u8 %h[vdst], [%[dst]] \t\n" | 771 "vld4.u8 %h[vdst], [%[dst]] \t\n" |
715 : [vsrc] "=w" (vsrc), [vdst] "=w" (vdst), [src] "+&r" (src) | 772 : [vsrc] "=w" (vsrc), [vdst] "=w" (vdst), [src] "+&r" (src) |
716 : [dst] "r" (dst) | 773 : [dst] "r" (dst) |
717 : | 774 : |
718 ); | 775 ); |
719 #else | 776 #else |
720 register uint8x8_t d0 asm("d0"); | 777 register uint8x8_t d0 asm("d0"); |
(...skipping 12 matching lines...)
733 "=w" (d4), "=w" (d5), "=w" (d6), "=w" (d7), | 790 "=w" (d4), "=w" (d5), "=w" (d6), "=w" (d7), |
734 [src] "+&r" (src) | 791 [src] "+&r" (src) |
735 : [dst] "r" (dst) | 792 : [dst] "r" (dst) |
736 : | 793 : |
737 ); | 794 ); |
738 vsrc.val[0] = d0; vdst.val[0] = d4; | 795 vsrc.val[0] = d0; vdst.val[0] = d4; |
739 vsrc.val[1] = d1; vdst.val[1] = d5; | 796 vsrc.val[1] = d1; vdst.val[1] = d5; |
740 vsrc.val[2] = d2; vdst.val[2] = d6; | 797 vsrc.val[2] = d2; vdst.val[2] = d6; |
741 vsrc.val[3] = d3; vdst.val[3] = d7; | 798 vsrc.val[3] = d3; vdst.val[3] = d7; |
742 #endif | 799 #endif |
| 800 #endif // #ifdef SK_CPU_ARM64 |
743 | 801 |
744 vres = procSIMD(vsrc, vdst); | 802 vres = procSIMD(vsrc, vdst); |
745 | 803 |
746 vst4_u8((uint8_t*)dst, vres); | 804 vst4_u8((uint8_t*)dst, vres); |
747 | 805 |
748 count -= 8; | 806 count -= 8; |
749 dst += 8; | 807 dst += 8; |
| 808 #ifdef SK_CPU_ARM64 |
| 809 src += 8; |
| 810 #endif |
750 } | 811 } |
751 // Leftovers | 812 // Leftovers |
752 for (int i = 0; i < count; i++) { | 813 for (int i = 0; i < count; i++) { |
753 dst[i] = proc(src[i], dst[i]); | 814 dst[i] = proc(src[i], dst[i]); |
754 } | 815 } |
755 } else { | 816 } else { |
756 for (int i = count - 1; i >= 0; --i) { | 817 for (int i = count - 1; i >= 0; --i) { |
757 unsigned a = aa[i]; | 818 unsigned a = aa[i]; |
758 if (0 != a) { | 819 if (0 != a) { |
759 SkPMColor dstC = dst[i]; | 820 SkPMColor dstC = dst[i]; |
(...skipping 16 matching lines...)
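For reference, the 8-pixel loop in xfer32() above condenses to the following on the AArch64 path, where plain intrinsics replace the inline asm and both pointers are advanced explicitly (the 32-bit asm folds the source post-increment into the vld4.u8). This is a sketch of the control flow under the surrounding names (src, dst, count, procSIMD), not a drop-in replacement:

    while (count >= 8) {
        // vld4 deinterleaves 8 SkPMColors into four uint8x8_t channel vectors.
        uint8x8x4_t vsrc = vld4_u8((const uint8_t*)src);
        uint8x8x4_t vdst = vld4_u8((uint8_t*)dst);

        uint8x8x4_t vres = procSIMD(vsrc, vdst);   // per-channel blend

        vst4_u8((uint8_t*)dst, vres);              // re-interleave and store

        src += 8;
        dst += 8;
        count -= 8;
    }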
776 SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD); | 837 SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD); |
777 SkASSERT(procSIMD != NULL); | 838 SkASSERT(procSIMD != NULL); |
778 | 839 |
779 if (NULL == aa) { | 840 if (NULL == aa) { |
780 while(count >= 8) { | 841 while(count >= 8) { |
781 uint16x8_t vdst, vres16; | 842 uint16x8_t vdst, vres16; |
782 uint8x8x4_t vdst32, vsrc, vres; | 843 uint8x8x4_t vdst32, vsrc, vres; |
783 | 844 |
784 vdst = vld1q_u16(dst); | 845 vdst = vld1q_u16(dst); |
785 | 846 |
| 847 #ifdef SK_CPU_ARM64 |
| 848 vsrc = vld4_u8((uint8_t*)src); |
| 849 #else |
786 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) | 850 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) |
787 asm volatile ( | 851 asm volatile ( |
788 "vld4.u8 %h[vsrc], [%[src]]! \t\n" | 852 "vld4.u8 %h[vsrc], [%[src]]! \t\n" |
789 : [vsrc] "=w" (vsrc), [src] "+&r" (src) | 853 : [vsrc] "=w" (vsrc), [src] "+&r" (src) |
790 : : | 854 : : |
791 ); | 855 ); |
792 #else | 856 #else |
793 register uint8x8_t d0 asm("d0"); | 857 register uint8x8_t d0 asm("d0"); |
794 register uint8x8_t d1 asm("d1"); | 858 register uint8x8_t d1 asm("d1"); |
795 register uint8x8_t d2 asm("d2"); | 859 register uint8x8_t d2 asm("d2"); |
796 register uint8x8_t d3 asm("d3"); | 860 register uint8x8_t d3 asm("d3"); |
797 | 861 |
798 asm volatile ( | 862 asm volatile ( |
799 "vld4.u8 {d0-d3},[%[src]]!;" | 863 "vld4.u8 {d0-d3},[%[src]]!;" |
800 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), | 864 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), |
801 [src] "+&r" (src) | 865 [src] "+&r" (src) |
802 : : | 866 : : |
803 ); | 867 ); |
804 vsrc.val[0] = d0; | 868 vsrc.val[0] = d0; |
805 vsrc.val[1] = d1; | 869 vsrc.val[1] = d1; |
806 vsrc.val[2] = d2; | 870 vsrc.val[2] = d2; |
807 vsrc.val[3] = d3; | 871 vsrc.val[3] = d3; |
808 #endif | 872 #endif |
| 873 #endif // #ifdef SK_CPU_ARM64 |
809 | 874 |
810 vdst32 = SkPixel16ToPixel32_neon8(vdst); | 875 vdst32 = SkPixel16ToPixel32_neon8(vdst); |
811 vres = procSIMD(vsrc, vdst32); | 876 vres = procSIMD(vsrc, vdst32); |
812 vres16 = SkPixel32ToPixel16_neon8(vres); | 877 vres16 = SkPixel32ToPixel16_neon8(vres); |
813 | 878 |
814 vst1q_u16(dst, vres16); | 879 vst1q_u16(dst, vres16); |
815 | 880 |
816 count -= 8; | 881 count -= 8; |
817 dst += 8; | 882 dst += 8; |
| 883 #ifdef SK_CPU_ARM64 |
| 884 src += 8; |
| 885 #endif |
818 } | 886 } |
819 for (int i = 0; i < count; i++) { | 887 for (int i = 0; i < count; i++) { |
820 SkPMColor dstC = SkPixel16ToPixel32(dst[i]); | 888 SkPMColor dstC = SkPixel16ToPixel32(dst[i]); |
821 dst[i] = SkPixel32ToPixel16_ToU16(proc(src[i], dstC)); | 889 dst[i] = SkPixel32ToPixel16_ToU16(proc(src[i], dstC)); |
822 } | 890 } |
823 } else { | 891 } else { |
824 for (int i = count - 1; i >= 0; --i) { | 892 for (int i = count - 1; i >= 0; --i) { |
825 unsigned a = aa[i]; | 893 unsigned a = aa[i]; |
826 if (0 != a) { | 894 if (0 != a) { |
827 SkPMColor dstC = SkPixel16ToPixel32(dst[i]); | 895 SkPMColor dstC = SkPixel16ToPixel32(dst[i]); |
(...skipping 100 matching lines...)
928 | 996 |
929 if (procSIMD != NULL) { | 997 if (procSIMD != NULL) { |
930 return SkNEW_ARGS(SkNEONProcCoeffXfermode, (rec, mode, procSIMD)); | 998 return SkNEW_ARGS(SkNEONProcCoeffXfermode, (rec, mode, procSIMD)); |
931 } | 999 } |
932 return NULL; | 1000 return NULL; |
933 } | 1001 } |
934 | 1002 |
935 SkXfermodeProc SkPlatformXfermodeProcFactory_impl_neon(SkXfermode::Mode mode) { | 1003 SkXfermodeProc SkPlatformXfermodeProcFactory_impl_neon(SkXfermode::Mode mode) { |
936 return gNEONXfermodeProcs1[mode]; | 1004 return gNEONXfermodeProcs1[mode]; |
937 } | 1005 } |