src/opts/SkXfermode_opts_arm_neon.cpp - Issue 216113005: Revert of ARM Skia NEON patches - 35 - First AArch64 support

Side by Side Diff: src/opts/SkXfermode_opts_arm_neon.cpp

Issue 216113005: Revert of ARM Skia NEON patches - 35 - First AArch64 support (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: Created 6 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 #include "SkXfermode.h"	1 #include "SkXfermode.h"

2 #include "SkXfermode_proccoeff.h"	2 #include "SkXfermode_proccoeff.h"

3 #include "SkColorPriv.h"	3 #include "SkColorPriv.h"

4	4

5 #include <arm_neon.h>	5 #include <arm_neon.h>

6 #include "SkColor_opts_neon.h"	6 #include "SkColor_opts_neon.h"

7 #include "SkXfermode_opts_arm_neon.h"	7 #include "SkXfermode_opts_arm_neon.h"

8	8

9 #define SkAlphaMulAlpha(a, b) SkMulDiv255Round(a, b)	9 #define SkAlphaMulAlpha(a, b) SkMulDiv255Round(a, b)

10	10

(...skipping 23 matching lines...) Expand all Loading...
34 ret = vaddq_u16(ret, vshrq_n_u16(ret, 8));	34 ret = vaddq_u16(ret, vshrq_n_u16(ret, 8));

35	35

36 ret = vshrq_n_u16(ret, 8);	36 ret = vshrq_n_u16(ret, 8);

37	37

38 return ret;	38 return ret;

39 }	39 }

40	40

41 static inline uint8x8_t SkDiv255Round_neon8_32_8(int32x4_t p1, int32x4_t p2) {	41 static inline uint8x8_t SkDiv255Round_neon8_32_8(int32x4_t p1, int32x4_t p2) {

42 uint16x8_t tmp;	42 uint16x8_t tmp;

43	43

44 #ifdef SK_CPU_ARM64

45 tmp = vmovn_high_u32(vmovn_u32(vreinterpretq_u32_s32(p1)),

46 vreinterpretq_u32_s32(p2));

47 #else

48 tmp = vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(p1)),	44 tmp = vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(p1)),

49 vmovn_u32(vreinterpretq_u32_s32(p2)));	45 vmovn_u32(vreinterpretq_u32_s32(p2)));

50 #endif

51	46

52 tmp += vdupq_n_u16(128);	47 tmp += vdupq_n_u16(128);

53 tmp += vshrq_n_u16(tmp, 8);	48 tmp += vshrq_n_u16(tmp, 8);

54	49

55 return vshrn_n_u16(tmp, 8);	50 return vshrn_n_u16(tmp, 8);

56 }	51 }

57	52

58 static inline uint16x8_t SkDiv255Round_neon8_16_16(uint16x8_t prod) {	53 static inline uint16x8_t SkDiv255Round_neon8_16_16(uint16x8_t prod) {

59 prod += vdupq_n_u16(128);	54 prod += vdupq_n_u16(128);

60 prod += vshrq_n_u16(prod, 8);	55 prod += vshrq_n_u16(prod, 8);

61	56

62 return vshrq_n_u16(prod, 8);	57 return vshrq_n_u16(prod, 8);

63 }	58 }

64	59

65 static inline uint8x8_t clamp_div255round_simd8_32(int32x4_t val1, int32x4_t val 2) {	60 static inline uint8x8_t clamp_div255round_simd8_32(int32x4_t val1, int32x4_t val 2) {

66 uint8x8_t ret;	61 uint8x8_t ret;

67 uint32x4_t cmp1, cmp2;	62 uint32x4_t cmp1, cmp2;

68 uint16x8_t cmp16;	63 uint16x8_t cmp16;

69 uint8x8_t cmp8, cmp8_1;	64 uint8x8_t cmp8, cmp8_1;

70	65

71 // Test if <= 0	66 // Test if <= 0

72 cmp1 = vcleq_s32(val1, vdupq_n_s32(0));	67 cmp1 = vcleq_s32(val1, vdupq_n_s32(0));

73 cmp2 = vcleq_s32(val2, vdupq_n_s32(0));	68 cmp2 = vcleq_s32(val2, vdupq_n_s32(0));

74 #ifdef SK_CPU_ARM64

75 cmp16 = vmovn_high_u32(vmovn_u32(cmp1), cmp2);

76 #else

77 cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2));	69 cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2));

78 #endif

79 cmp8_1 = vmovn_u16(cmp16);	70 cmp8_1 = vmovn_u16(cmp16);

80	71

81 // Init to zero	72 // Init to zero

82 ret = vdup_n_u8(0);	73 ret = vdup_n_u8(0);

83	74

84 // Test if >= 255*255	75 // Test if >= 255*255

85 cmp1 = vcgeq_s32(val1, vdupq_n_s32(255*255));	76 cmp1 = vcgeq_s32(val1, vdupq_n_s32(255*255));

86 cmp2 = vcgeq_s32(val2, vdupq_n_s32(255*255));	77 cmp2 = vcgeq_s32(val2, vdupq_n_s32(255*255));

87 #ifdef SK_CPU_ARM64

88 cmp16 = vmovn_high_u32(vmovn_u32(cmp1), cmp2);

89 #else

90 cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2));	78 cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2));

91 #endif

92 cmp8 = vmovn_u16(cmp16);	79 cmp8 = vmovn_u16(cmp16);

93	80

94 // Insert 255 where true	81 // Insert 255 where true

95 ret = vbsl_u8(cmp8, vdup_n_u8(255), ret);	82 ret = vbsl_u8(cmp8, vdup_n_u8(255), ret);

96	83

97 // Calc SkDiv255Round	84 // Calc SkDiv255Round

98 uint8x8_t div = SkDiv255Round_neon8_32_8(val1, val2);	85 uint8x8_t div = SkDiv255Round_neon8_32_8(val1, val2);

99	86

100 // Insert where false and previous test false	87 // Insert where false and previous test false

101 cmp8 = cmp8 \| cmp8_1;	88 cmp8 = cmp8 \| cmp8_1;

(...skipping 313 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
415 uint16x8_t scda = vmull_u8(sc, da);	402 uint16x8_t scda = vmull_u8(sc, da);

416 uint16x8_t dcsa = vmull_u8(dc, sa);	403 uint16x8_t dcsa = vmull_u8(dc, sa);

417 uint16x8_t sada = vmull_u8(sa, da);	404 uint16x8_t sada = vmull_u8(sa, da);

418	405

419 // Prepare non common subexpressions	406 // Prepare non common subexpressions

420 uint16x8_t dc2, sc2;	407 uint16x8_t dc2, sc2;

421 uint32x4_t scdc2_1, scdc2_2;	408 uint32x4_t scdc2_1, scdc2_2;

422 if (overlay) {	409 if (overlay) {

423 dc2 = vshll_n_u8(dc, 1);	410 dc2 = vshll_n_u8(dc, 1);

424 scdc2_1 = vmull_u16(vget_low_u16(dc2), vget_low_u16(vmovl_u8(sc)));	411 scdc2_1 = vmull_u16(vget_low_u16(dc2), vget_low_u16(vmovl_u8(sc)));

425 #ifdef SK_CPU_ARM64

426 scdc2_2 = vmull_high_u16(dc2, vmovl_u8(sc));

427 #else

428 scdc2_2 = vmull_u16(vget_high_u16(dc2), vget_high_u16(vmovl_u8(sc)));	412 scdc2_2 = vmull_u16(vget_high_u16(dc2), vget_high_u16(vmovl_u8(sc)));

429 #endif

430 } else {	413 } else {

431 sc2 = vshll_n_u8(sc, 1);	414 sc2 = vshll_n_u8(sc, 1);

432 scdc2_1 = vmull_u16(vget_low_u16(sc2), vget_low_u16(vmovl_u8(dc)));	415 scdc2_1 = vmull_u16(vget_low_u16(sc2), vget_low_u16(vmovl_u8(dc)));

433 #ifdef SK_CPU_ARM64

434 scdc2_2 = vmull_high_u16(sc2, vmovl_u8(dc));

435 #else

436 scdc2_2 = vmull_u16(vget_high_u16(sc2), vget_high_u16(vmovl_u8(dc)));	416 scdc2_2 = vmull_u16(vget_high_u16(sc2), vget_high_u16(vmovl_u8(dc)));

437 #endif

438 }	417 }

439	418

440 // Calc COM	419 // Calc COM

441 int32x4_t com1, com2;	420 int32x4_t com1, com2;

442 com1 = vreinterpretq_s32_u32(	421 com1 = vreinterpretq_s32_u32(

443 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc)));	422 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc)));

444 com2 = vreinterpretq_s32_u32(	423 com2 = vreinterpretq_s32_u32(

445 #ifdef SK_CPU_ARM64

446 vmull_high_u16(const255, sc_plus_dc));

447 #else

448 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc)));	424 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc)));

449 #endif

450	425

451 // Calc SUB	426 // Calc SUB

452 int32x4_t sub1, sub2;	427 int32x4_t sub1, sub2;

453 sub1 = vreinterpretq_s32_u32(vaddl_u16(vget_low_u16(scda), vget_low_u16(dcsa )));	428 sub1 = vreinterpretq_s32_u32(vaddl_u16(vget_low_u16(scda), vget_low_u16(dcsa )));

454 #ifdef SK_CPU_ARM64

455 sub2 = vreinterpretq_s32_u32(vaddl_high_u16(scda, dcsa));

456 #else

457 sub2 = vreinterpretq_s32_u32(vaddl_u16(vget_high_u16(scda), vget_high_u16(dc sa)));	429 sub2 = vreinterpretq_s32_u32(vaddl_u16(vget_high_u16(scda), vget_high_u16(dc sa)));

458 #endif

459 sub1 = vsubq_s32(sub1, vreinterpretq_s32_u32(scdc2_1));	430 sub1 = vsubq_s32(sub1, vreinterpretq_s32_u32(scdc2_1));

460 sub2 = vsubq_s32(sub2, vreinterpretq_s32_u32(scdc2_2));	431 sub2 = vsubq_s32(sub2, vreinterpretq_s32_u32(scdc2_2));

461	432

462 // Compare 2*dc <= da	433 // Compare 2*dc <= da

463 uint16x8_t cmp;	434 uint16x8_t cmp;

464	435

465 if (overlay) {	436 if (overlay) {

466 cmp = vcleq_u16(dc2, vmovl_u8(da));	437 cmp = vcleq_u16(dc2, vmovl_u8(da));

467 } else {	438 } else {

468 cmp = vcleq_u16(sc2, vmovl_u8(sa));	439 cmp = vcleq_u16(sc2, vmovl_u8(sa));

469 }	440 }

470	441

471 // Prepare variables	442 // Prepare variables

472 int32x4_t val1_1, val1_2;	443 int32x4_t val1_1, val1_2;

473 int32x4_t val2_1, val2_2;	444 int32x4_t val2_1, val2_2;

474 uint32x4_t cmp1, cmp2;	445 uint32x4_t cmp1, cmp2;

475	446

476 // Doing a signed lengthening allows to save a few instructions	447 cmp1 = vmovl_u16(vget_low_u16(cmp));

477 // thanks to sign extension.	448 cmp1 \|= vshlq_n_u32(cmp1, 16);

478 cmp1 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_low_u16(cmp ))));	449 cmp2 = vmovl_u16(vget_high_u16(cmp));

479 #ifdef SK_CPU_ARM64	450 cmp2 \|= vshlq_n_u32(cmp2, 16);

480 cmp2 = vreinterpretq_u32_s32(vmovl_high_s16(vreinterpretq_s16_u16(cmp)));

481 #else

482 cmp2 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_high_u16(cm p))));

483 #endif

484	451

485 // Calc COM - SUB	452 // Calc COM - SUB

486 val1_1 = com1 - sub1;	453 val1_1 = com1 - sub1;

487 val1_2 = com2 - sub2;	454 val1_2 = com2 - sub2;

488	455

489 // Calc COM + SUB - sa*da	456 // Calc COM + SUB - sa*da

490 val2_1 = com1 + sub1;	457 val2_1 = com1 + sub1;

491 val2_2 = com2 + sub2;	458 val2_2 = com2 + sub2;

492	459

493 val2_1 = vsubq_s32(val2_1, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sada ))));	460 val2_1 = vsubq_s32(val2_1, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sada ))));

494 #ifdef SK_CPU_ARM64

495 val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_high_u16(sada)));

496 #else

497 val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sad a))));	461 val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sad a))));

498 #endif

499	462

500 // Insert where needed	463 // Insert where needed

501 val1_1 = vbslq_s32(cmp1, val1_1, val2_1);	464 val1_1 = vbslq_s32(cmp1, val1_1, val2_1);

502 val1_2 = vbslq_s32(cmp2, val1_2, val2_2);	465 val1_2 = vbslq_s32(cmp2, val1_2, val2_2);

503	466

504 // Call the clamp_div255round function	467 // Call the clamp_div255round function

505 return clamp_div255round_simd8_32(val1_1, val1_2);	468 return clamp_div255round_simd8_32(val1_1, val1_2);

506 }	469 }

507	470

508 static inline uint8x8_t overlay_color(uint8x8_t sc, uint8x8_t dc,	471 static inline uint8x8_t overlay_color(uint8x8_t sc, uint8x8_t dc,

(...skipping 149 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
658 sc_plus_dc = vaddl_u8(sc, dc);	621 sc_plus_dc = vaddl_u8(sc, dc);

659 scdc = vmull_u8(sc, dc);	622 scdc = vmull_u8(sc, dc);

660	623

661 /* Prepare constants */	624 /* Prepare constants */

662 const255 = vdupq_n_u16(255);	625 const255 = vdupq_n_u16(255);

663	626

664 /* Calc the first term */	627 /* Calc the first term */

665 term1_1 = vreinterpretq_s32_u32(	628 term1_1 = vreinterpretq_s32_u32(

666 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc)));	629 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc)));

667 term1_2 = vreinterpretq_s32_u32(	630 term1_2 = vreinterpretq_s32_u32(

668 #ifdef SK_CPU_ARM64

669 vmull_high_u16(const255, sc_plus_dc));

670 #else

671 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc)));	631 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc)));

672 #endif

673	632

674 /* Calc the second term */	633 /* Calc the second term */

675 term2_1 = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(scdc), 1));	634 term2_1 = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(scdc), 1));

676 #ifdef SK_CPU_ARM64

677 term2_2 = vreinterpretq_s32_u32(vshll_high_n_u16(scdc, 1));

678 #else

679 term2_2 = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(scdc), 1));	635 term2_2 = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(scdc), 1));

680 #endif

681	636

682 return clamp_div255round_simd8_32(term1_1 - term2_1, term1_2 - term2_2);	637 return clamp_div255round_simd8_32(term1_1 - term2_1, term1_2 - term2_2);

683 }	638 }

684	639

685 uint8x8x4_t exclusion_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {	640 uint8x8x4_t exclusion_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {

686 uint8x8x4_t ret;	641 uint8x8x4_t ret;

687	642

688 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);	643 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);

689 ret.val[NEON_R] = exclusion_color(src.val[NEON_R], dst.val[NEON_R],	644 ret.val[NEON_R] = exclusion_color(src.val[NEON_R], dst.val[NEON_R],

690 src.val[NEON_A], dst.val[NEON_A]);	645 src.val[NEON_A], dst.val[NEON_A]);

691 ret.val[NEON_G] = exclusion_color(src.val[NEON_G], dst.val[NEON_G],	646 ret.val[NEON_G] = exclusion_color(src.val[NEON_G], dst.val[NEON_G],

692 src.val[NEON_A], dst.val[NEON_A]);	647 src.val[NEON_A], dst.val[NEON_A]);

693 ret.val[NEON_B] = exclusion_color(src.val[NEON_B], dst.val[NEON_B],	648 ret.val[NEON_B] = exclusion_color(src.val[NEON_B], dst.val[NEON_B],

694 src.val[NEON_A], dst.val[NEON_A]);	649 src.val[NEON_A], dst.val[NEON_A]);

695	650

696 return ret;	651 return ret;

697 }	652 }

698	653

699 static inline uint8x8_t blendfunc_multiply_color(uint8x8_t sc, uint8x8_t dc,	654 static inline uint8x8_t blendfunc_multiply_color(uint8x8_t sc, uint8x8_t dc,

700 uint8x8_t sa, uint8x8_t da) {	655 uint8x8_t sa, uint8x8_t da) {

701 uint32x4_t val1, val2;	656 uint32x4_t val1, val2;

702 uint16x8_t scdc, t1, t2;	657 uint16x8_t scdc, t1, t2;

703	658

704 t1 = vmull_u8(sc, vdup_n_u8(255) - da);	659 t1 = vmull_u8(sc, vdup_n_u8(255) - da);

705 t2 = vmull_u8(dc, vdup_n_u8(255) - sa);	660 t2 = vmull_u8(dc, vdup_n_u8(255) - sa);

706 scdc = vmull_u8(sc, dc);	661 scdc = vmull_u8(sc, dc);

707	662

708 val1 = vaddl_u16(vget_low_u16(t1), vget_low_u16(t2));	663 val1 = vaddl_u16(vget_low_u16(t1), vget_low_u16(t2));

709 #ifdef SK_CPU_ARM64

710 val2 = vaddl_high_u16(t1, t2);

711 #else

712 val2 = vaddl_u16(vget_high_u16(t1), vget_high_u16(t2));	664 val2 = vaddl_u16(vget_high_u16(t1), vget_high_u16(t2));

713 #endif

714	665

715 val1 = vaddw_u16(val1, vget_low_u16(scdc));	666 val1 = vaddw_u16(val1, vget_low_u16(scdc));

716 #ifdef SK_CPU_ARM64

717 val2 = vaddw_high_u16(val2, scdc);

718 #else

719 val2 = vaddw_u16(val2, vget_high_u16(scdc));	667 val2 = vaddw_u16(val2, vget_high_u16(scdc));

720 #endif

721	668

722 return clamp_div255round_simd8_32(	669 return clamp_div255round_simd8_32(

723 vreinterpretq_s32_u32(val1), vreinterpretq_s32_u32(val2));	670 vreinterpretq_s32_u32(val1), vreinterpretq_s32_u32(val2));

724 }	671 }

725	672

726 uint8x8x4_t multiply_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {	673 uint8x8x4_t multiply_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {

727 uint8x8x4_t ret;	674 uint8x8x4_t ret;

728	675

729 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);	676 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);

730 ret.val[NEON_R] = blendfunc_multiply_color(src.val[NEON_R], dst.val[NEON_R],	677 ret.val[NEON_R] = blendfunc_multiply_color(src.val[NEON_R], dst.val[NEON_R],

(...skipping 23 matching lines...) Expand all Loading...
754	701

755 SkXfermodeProc proc = this->getProc();	702 SkXfermodeProc proc = this->getProc();

756 SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD );	703 SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD );

757 SkASSERT(procSIMD != NULL);	704 SkASSERT(procSIMD != NULL);

758	705

759 if (NULL == aa) {	706 if (NULL == aa) {

760 // Unrolled NEON code	707 // Unrolled NEON code

761 while (count >= 8) {	708 while (count >= 8) {

762 uint8x8x4_t vsrc, vdst, vres;	709 uint8x8x4_t vsrc, vdst, vres;

763	710

764 #ifdef SK_CPU_ARM64

765 vsrc = vld4_u8((uint8_t*)src);

766 vdst = vld4_u8((uint8_t*)dst);

767 #else

768 #if (__GNUC__ > 4) \|\| ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))	711 #if (__GNUC__ > 4) \|\| ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))

769 asm volatile (	712 asm volatile (

770 "vld4.u8 %h[vsrc], [%[src]]! \t\n"	713 "vld4.u8 %h[vsrc], [%[src]]! \t\n"

771 "vld4.u8 %h[vdst], [%[dst]] \t\n"	714 "vld4.u8 %h[vdst], [%[dst]] \t\n"

772 : [vsrc] "=w" (vsrc), [vdst] "=w" (vdst), [src] "+&r" (src)	715 : [vsrc] "=w" (vsrc), [vdst] "=w" (vdst), [src] "+&r" (src)

773 : [dst] "r" (dst)	716 : [dst] "r" (dst)

774 :	717 :

775 );	718 );

776 #else	719 #else

777 register uint8x8_t d0 asm("d0");	720 register uint8x8_t d0 asm("d0");

(...skipping 12 matching lines...) Expand all Loading...
790 "=w" (d4), "=w" (d5), "=w" (d6), "=w" (d7),	733 "=w" (d4), "=w" (d5), "=w" (d6), "=w" (d7),

791 [src] "+&r" (src)	734 [src] "+&r" (src)

792 : [dst] "r" (dst)	735 : [dst] "r" (dst)

793 :	736 :

794 );	737 );

795 vsrc.val[0] = d0; vdst.val[0] = d4;	738 vsrc.val[0] = d0; vdst.val[0] = d4;

796 vsrc.val[1] = d1; vdst.val[1] = d5;	739 vsrc.val[1] = d1; vdst.val[1] = d5;

797 vsrc.val[2] = d2; vdst.val[2] = d6;	740 vsrc.val[2] = d2; vdst.val[2] = d6;

798 vsrc.val[3] = d3; vdst.val[3] = d7;	741 vsrc.val[3] = d3; vdst.val[3] = d7;

799 #endif	742 #endif

800 #endif // #ifdef SK_CPU_ARM64

801	743

802 vres = procSIMD(vsrc, vdst);	744 vres = procSIMD(vsrc, vdst);

803	745

804 vst4_u8((uint8_t*)dst, vres);	746 vst4_u8((uint8_t*)dst, vres);

805	747

806 count -= 8;	748 count -= 8;

807 dst += 8;	749 dst += 8;

808 #ifdef SK_CPU_ARM64

809 src += 8;

810 #endif

811 }	750 }

812 // Leftovers	751 // Leftovers

813 for (int i = 0; i < count; i++) {	752 for (int i = 0; i < count; i++) {

814 dst[i] = proc(src[i], dst[i]);	753 dst[i] = proc(src[i], dst[i]);

815 }	754 }

816 } else {	755 } else {

817 for (int i = count - 1; i >= 0; --i) {	756 for (int i = count - 1; i >= 0; --i) {

818 unsigned a = aa[i];	757 unsigned a = aa[i];

819 if (0 != a) {	758 if (0 != a) {

820 SkPMColor dstC = dst[i];	759 SkPMColor dstC = dst[i];

(...skipping 16 matching lines...) Expand all Loading...
837 SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD );	776 SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD );

838 SkASSERT(procSIMD != NULL);	777 SkASSERT(procSIMD != NULL);

839	778

840 if (NULL == aa) {	779 if (NULL == aa) {

841 while(count >= 8) {	780 while(count >= 8) {

842 uint16x8_t vdst, vres16;	781 uint16x8_t vdst, vres16;

843 uint8x8x4_t vdst32, vsrc, vres;	782 uint8x8x4_t vdst32, vsrc, vres;

844	783

845 vdst = vld1q_u16(dst);	784 vdst = vld1q_u16(dst);

846	785

847 #ifdef SK_CPU_ARM64

848 vsrc = vld4_u8((uint8_t*)src);

849 #else

850 #if (__GNUC__ > 4) \|\| ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))	786 #if (__GNUC__ > 4) \|\| ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))

851 asm volatile (	787 asm volatile (

852 "vld4.u8 %h[vsrc], [%[src]]! \t\n"	788 "vld4.u8 %h[vsrc], [%[src]]! \t\n"

853 : [vsrc] "=w" (vsrc), [src] "+&r" (src)	789 : [vsrc] "=w" (vsrc), [src] "+&r" (src)

854 : :	790 : :

855 );	791 );

856 #else	792 #else

857 register uint8x8_t d0 asm("d0");	793 register uint8x8_t d0 asm("d0");

858 register uint8x8_t d1 asm("d1");	794 register uint8x8_t d1 asm("d1");

859 register uint8x8_t d2 asm("d2");	795 register uint8x8_t d2 asm("d2");

860 register uint8x8_t d3 asm("d3");	796 register uint8x8_t d3 asm("d3");

861	797

862 asm volatile (	798 asm volatile (

863 "vld4.u8 {d0-d3},[%[src]]!;"	799 "vld4.u8 {d0-d3},[%[src]]!;"

864 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),	800 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),

865 [src] "+&r" (src)	801 [src] "+&r" (src)

866 : :	802 : :

867 );	803 );

868 vsrc.val[0] = d0;	804 vsrc.val[0] = d0;

869 vsrc.val[1] = d1;	805 vsrc.val[1] = d1;

870 vsrc.val[2] = d2;	806 vsrc.val[2] = d2;

871 vsrc.val[3] = d3;	807 vsrc.val[3] = d3;

872 #endif	808 #endif

873 #endif // #ifdef SK_CPU_ARM64

874	809

875 vdst32 = SkPixel16ToPixel32_neon8(vdst);	810 vdst32 = SkPixel16ToPixel32_neon8(vdst);

876 vres = procSIMD(vsrc, vdst32);	811 vres = procSIMD(vsrc, vdst32);

877 vres16 = SkPixel32ToPixel16_neon8(vres);	812 vres16 = SkPixel32ToPixel16_neon8(vres);

878	813

879 vst1q_u16(dst, vres16);	814 vst1q_u16(dst, vres16);

880	815

881 count -= 8;	816 count -= 8;

882 dst += 8;	817 dst += 8;

883 #ifdef SK_CPU_ARM64

884 src += 8;

885 #endif

886 }	818 }

887 for (int i = 0; i < count; i++) {	819 for (int i = 0; i < count; i++) {

888 SkPMColor dstC = SkPixel16ToPixel32(dst[i]);	820 SkPMColor dstC = SkPixel16ToPixel32(dst[i]);

889 dst[i] = SkPixel32ToPixel16_ToU16(proc(src[i], dstC));	821 dst[i] = SkPixel32ToPixel16_ToU16(proc(src[i], dstC));

890 }	822 }

891 } else {	823 } else {

892 for (int i = count - 1; i >= 0; --i) {	824 for (int i = count - 1; i >= 0; --i) {

893 unsigned a = aa[i];	825 unsigned a = aa[i];

894 if (0 != a) {	826 if (0 != a) {

895 SkPMColor dstC = SkPixel16ToPixel32(dst[i]);	827 SkPMColor dstC = SkPixel16ToPixel32(dst[i]);

(...skipping 100 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
996	928

997 if (procSIMD != NULL) {	929 if (procSIMD != NULL) {

998 return SkNEW_ARGS(SkNEONProcCoeffXfermode, (rec, mode, procSIMD));	930 return SkNEW_ARGS(SkNEONProcCoeffXfermode, (rec, mode, procSIMD));

999 }	931 }

1000 return NULL;	932 return NULL;

1001 }	933 }

1002	934

1003 SkXfermodeProc SkPlatformXfermodeProcFactory_impl_neon(SkXfermode::Mode mode) {	935 SkXfermodeProc SkPlatformXfermodeProcFactory_impl_neon(SkXfermode::Mode mode) {

1004 return gNEONXfermodeProcs1[mode];	936 return gNEONXfermodeProcs1[mode];

1005 }	937 }

OLD	NEW

« no previous file with comments | « src/opts/SkBitmapProcState_opts_arm.cpp ('k') | no next file » | no next file with comments »