OLD | NEW |
1 #include "SkXfermode.h" | 1 #include "SkXfermode.h" |
2 #include "SkXfermode_proccoeff.h" | 2 #include "SkXfermode_proccoeff.h" |
3 #include "SkColorPriv.h" | 3 #include "SkColorPriv.h" |
4 | 4 |
5 #include <arm_neon.h> | 5 #include <arm_neon.h> |
6 #include "SkColor_opts_neon.h" | 6 #include "SkColor_opts_neon.h" |
7 #include "SkXfermode_opts_arm_neon.h" | 7 #include "SkXfermode_opts_arm_neon.h" |
8 | 8 |
9 #define SkAlphaMulAlpha(a, b) SkMulDiv255Round(a, b) | 9 #define SkAlphaMulAlpha(a, b) SkMulDiv255Round(a, b) |
10 | 10 |
(...skipping 23 matching lines...) Expand all Loading... |
34 ret = vaddq_u16(ret, vshrq_n_u16(ret, 8)); | 34 ret = vaddq_u16(ret, vshrq_n_u16(ret, 8)); |
35 | 35 |
36 ret = vshrq_n_u16(ret, 8); | 36 ret = vshrq_n_u16(ret, 8); |
37 | 37 |
38 return ret; | 38 return ret; |
39 } | 39 } |
40 | 40 |
41 static inline uint8x8_t SkDiv255Round_neon8_32_8(int32x4_t p1, int32x4_t p2) { | 41 static inline uint8x8_t SkDiv255Round_neon8_32_8(int32x4_t p1, int32x4_t p2) { |
42 uint16x8_t tmp; | 42 uint16x8_t tmp; |
43 | 43 |
44 #ifdef SK_CPU_ARM64 | |
45 tmp = vmovn_high_u32(vmovn_u32(vreinterpretq_u32_s32(p1)), | |
46 vreinterpretq_u32_s32(p2)); | |
47 #else | |
48 tmp = vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(p1)), | 44 tmp = vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(p1)), |
49 vmovn_u32(vreinterpretq_u32_s32(p2))); | 45 vmovn_u32(vreinterpretq_u32_s32(p2))); |
50 #endif | |
51 | 46 |
52 tmp += vdupq_n_u16(128); | 47 tmp += vdupq_n_u16(128); |
53 tmp += vshrq_n_u16(tmp, 8); | 48 tmp += vshrq_n_u16(tmp, 8); |
54 | 49 |
55 return vshrn_n_u16(tmp, 8); | 50 return vshrn_n_u16(tmp, 8); |
56 } | 51 } |
57 | 52 |
58 static inline uint16x8_t SkDiv255Round_neon8_16_16(uint16x8_t prod) { | 53 static inline uint16x8_t SkDiv255Round_neon8_16_16(uint16x8_t prod) { |
59 prod += vdupq_n_u16(128); | 54 prod += vdupq_n_u16(128); |
60 prod += vshrq_n_u16(prod, 8); | 55 prod += vshrq_n_u16(prod, 8); |
61 | 56 |
62 return vshrq_n_u16(prod, 8); | 57 return vshrq_n_u16(prod, 8); |
63 } | 58 } |
64 | 59 |
65 static inline uint8x8_t clamp_div255round_simd8_32(int32x4_t val1, int32x4_t val
2) { | 60 static inline uint8x8_t clamp_div255round_simd8_32(int32x4_t val1, int32x4_t val
2) { |
66 uint8x8_t ret; | 61 uint8x8_t ret; |
67 uint32x4_t cmp1, cmp2; | 62 uint32x4_t cmp1, cmp2; |
68 uint16x8_t cmp16; | 63 uint16x8_t cmp16; |
69 uint8x8_t cmp8, cmp8_1; | 64 uint8x8_t cmp8, cmp8_1; |
70 | 65 |
71 // Test if <= 0 | 66 // Test if <= 0 |
72 cmp1 = vcleq_s32(val1, vdupq_n_s32(0)); | 67 cmp1 = vcleq_s32(val1, vdupq_n_s32(0)); |
73 cmp2 = vcleq_s32(val2, vdupq_n_s32(0)); | 68 cmp2 = vcleq_s32(val2, vdupq_n_s32(0)); |
74 #ifdef SK_CPU_ARM64 | |
75 cmp16 = vmovn_high_u32(vmovn_u32(cmp1), cmp2); | |
76 #else | |
77 cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2)); | 69 cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2)); |
78 #endif | |
79 cmp8_1 = vmovn_u16(cmp16); | 70 cmp8_1 = vmovn_u16(cmp16); |
80 | 71 |
81 // Init to zero | 72 // Init to zero |
82 ret = vdup_n_u8(0); | 73 ret = vdup_n_u8(0); |
83 | 74 |
84 // Test if >= 255*255 | 75 // Test if >= 255*255 |
85 cmp1 = vcgeq_s32(val1, vdupq_n_s32(255*255)); | 76 cmp1 = vcgeq_s32(val1, vdupq_n_s32(255*255)); |
86 cmp2 = vcgeq_s32(val2, vdupq_n_s32(255*255)); | 77 cmp2 = vcgeq_s32(val2, vdupq_n_s32(255*255)); |
87 #ifdef SK_CPU_ARM64 | |
88 cmp16 = vmovn_high_u32(vmovn_u32(cmp1), cmp2); | |
89 #else | |
90 cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2)); | 78 cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2)); |
91 #endif | |
92 cmp8 = vmovn_u16(cmp16); | 79 cmp8 = vmovn_u16(cmp16); |
93 | 80 |
94 // Insert 255 where true | 81 // Insert 255 where true |
95 ret = vbsl_u8(cmp8, vdup_n_u8(255), ret); | 82 ret = vbsl_u8(cmp8, vdup_n_u8(255), ret); |
96 | 83 |
97 // Calc SkDiv255Round | 84 // Calc SkDiv255Round |
98 uint8x8_t div = SkDiv255Round_neon8_32_8(val1, val2); | 85 uint8x8_t div = SkDiv255Round_neon8_32_8(val1, val2); |
99 | 86 |
100 // Insert where false and previous test false | 87 // Insert where false and previous test false |
101 cmp8 = cmp8 | cmp8_1; | 88 cmp8 = cmp8 | cmp8_1; |
(...skipping 313 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
415 uint16x8_t scda = vmull_u8(sc, da); | 402 uint16x8_t scda = vmull_u8(sc, da); |
416 uint16x8_t dcsa = vmull_u8(dc, sa); | 403 uint16x8_t dcsa = vmull_u8(dc, sa); |
417 uint16x8_t sada = vmull_u8(sa, da); | 404 uint16x8_t sada = vmull_u8(sa, da); |
418 | 405 |
419 // Prepare non common subexpressions | 406 // Prepare non common subexpressions |
420 uint16x8_t dc2, sc2; | 407 uint16x8_t dc2, sc2; |
421 uint32x4_t scdc2_1, scdc2_2; | 408 uint32x4_t scdc2_1, scdc2_2; |
422 if (overlay) { | 409 if (overlay) { |
423 dc2 = vshll_n_u8(dc, 1); | 410 dc2 = vshll_n_u8(dc, 1); |
424 scdc2_1 = vmull_u16(vget_low_u16(dc2), vget_low_u16(vmovl_u8(sc))); | 411 scdc2_1 = vmull_u16(vget_low_u16(dc2), vget_low_u16(vmovl_u8(sc))); |
425 #ifdef SK_CPU_ARM64 | |
426 scdc2_2 = vmull_high_u16(dc2, vmovl_u8(sc)); | |
427 #else | |
428 scdc2_2 = vmull_u16(vget_high_u16(dc2), vget_high_u16(vmovl_u8(sc))); | 412 scdc2_2 = vmull_u16(vget_high_u16(dc2), vget_high_u16(vmovl_u8(sc))); |
429 #endif | |
430 } else { | 413 } else { |
431 sc2 = vshll_n_u8(sc, 1); | 414 sc2 = vshll_n_u8(sc, 1); |
432 scdc2_1 = vmull_u16(vget_low_u16(sc2), vget_low_u16(vmovl_u8(dc))); | 415 scdc2_1 = vmull_u16(vget_low_u16(sc2), vget_low_u16(vmovl_u8(dc))); |
433 #ifdef SK_CPU_ARM64 | |
434 scdc2_2 = vmull_high_u16(sc2, vmovl_u8(dc)); | |
435 #else | |
436 scdc2_2 = vmull_u16(vget_high_u16(sc2), vget_high_u16(vmovl_u8(dc))); | 416 scdc2_2 = vmull_u16(vget_high_u16(sc2), vget_high_u16(vmovl_u8(dc))); |
437 #endif | |
438 } | 417 } |
439 | 418 |
440 // Calc COM | 419 // Calc COM |
441 int32x4_t com1, com2; | 420 int32x4_t com1, com2; |
442 com1 = vreinterpretq_s32_u32( | 421 com1 = vreinterpretq_s32_u32( |
443 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc))); | 422 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc))); |
444 com2 = vreinterpretq_s32_u32( | 423 com2 = vreinterpretq_s32_u32( |
445 #ifdef SK_CPU_ARM64 | |
446 vmull_high_u16(const255, sc_plus_dc)); | |
447 #else | |
448 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc))); | 424 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc))); |
449 #endif | |
450 | 425 |
451 // Calc SUB | 426 // Calc SUB |
452 int32x4_t sub1, sub2; | 427 int32x4_t sub1, sub2; |
453 sub1 = vreinterpretq_s32_u32(vaddl_u16(vget_low_u16(scda), vget_low_u16(dcsa
))); | 428 sub1 = vreinterpretq_s32_u32(vaddl_u16(vget_low_u16(scda), vget_low_u16(dcsa
))); |
454 #ifdef SK_CPU_ARM64 | |
455 sub2 = vreinterpretq_s32_u32(vaddl_high_u16(scda, dcsa)); | |
456 #else | |
457 sub2 = vreinterpretq_s32_u32(vaddl_u16(vget_high_u16(scda), vget_high_u16(dc
sa))); | 429 sub2 = vreinterpretq_s32_u32(vaddl_u16(vget_high_u16(scda), vget_high_u16(dc
sa))); |
458 #endif | |
459 sub1 = vsubq_s32(sub1, vreinterpretq_s32_u32(scdc2_1)); | 430 sub1 = vsubq_s32(sub1, vreinterpretq_s32_u32(scdc2_1)); |
460 sub2 = vsubq_s32(sub2, vreinterpretq_s32_u32(scdc2_2)); | 431 sub2 = vsubq_s32(sub2, vreinterpretq_s32_u32(scdc2_2)); |
461 | 432 |
462 // Compare 2*dc <= da | 433 // Compare 2*dc <= da |
463 uint16x8_t cmp; | 434 uint16x8_t cmp; |
464 | 435 |
465 if (overlay) { | 436 if (overlay) { |
466 cmp = vcleq_u16(dc2, vmovl_u8(da)); | 437 cmp = vcleq_u16(dc2, vmovl_u8(da)); |
467 } else { | 438 } else { |
468 cmp = vcleq_u16(sc2, vmovl_u8(sa)); | 439 cmp = vcleq_u16(sc2, vmovl_u8(sa)); |
469 } | 440 } |
470 | 441 |
471 // Prepare variables | 442 // Prepare variables |
472 int32x4_t val1_1, val1_2; | 443 int32x4_t val1_1, val1_2; |
473 int32x4_t val2_1, val2_2; | 444 int32x4_t val2_1, val2_2; |
474 uint32x4_t cmp1, cmp2; | 445 uint32x4_t cmp1, cmp2; |
475 | 446 |
476 // Doing a signed lengthening allows to save a few instructions | 447 cmp1 = vmovl_u16(vget_low_u16(cmp)); |
477 // thanks to sign extension. | 448 cmp1 |= vshlq_n_u32(cmp1, 16); |
478 cmp1 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_low_u16(cmp
)))); | 449 cmp2 = vmovl_u16(vget_high_u16(cmp)); |
479 #ifdef SK_CPU_ARM64 | 450 cmp2 |= vshlq_n_u32(cmp2, 16); |
480 cmp2 = vreinterpretq_u32_s32(vmovl_high_s16(vreinterpretq_s16_u16(cmp))); | |
481 #else | |
482 cmp2 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_high_u16(cm
p)))); | |
483 #endif | |
484 | 451 |
485 // Calc COM - SUB | 452 // Calc COM - SUB |
486 val1_1 = com1 - sub1; | 453 val1_1 = com1 - sub1; |
487 val1_2 = com2 - sub2; | 454 val1_2 = com2 - sub2; |
488 | 455 |
489 // Calc COM + SUB - sa*da | 456 // Calc COM + SUB - sa*da |
490 val2_1 = com1 + sub1; | 457 val2_1 = com1 + sub1; |
491 val2_2 = com2 + sub2; | 458 val2_2 = com2 + sub2; |
492 | 459 |
493 val2_1 = vsubq_s32(val2_1, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sada
)))); | 460 val2_1 = vsubq_s32(val2_1, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sada
)))); |
494 #ifdef SK_CPU_ARM64 | |
495 val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_high_u16(sada))); | |
496 #else | |
497 val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sad
a)))); | 461 val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sad
a)))); |
498 #endif | |
499 | 462 |
500 // Insert where needed | 463 // Insert where needed |
501 val1_1 = vbslq_s32(cmp1, val1_1, val2_1); | 464 val1_1 = vbslq_s32(cmp1, val1_1, val2_1); |
502 val1_2 = vbslq_s32(cmp2, val1_2, val2_2); | 465 val1_2 = vbslq_s32(cmp2, val1_2, val2_2); |
503 | 466 |
504 // Call the clamp_div255round function | 467 // Call the clamp_div255round function |
505 return clamp_div255round_simd8_32(val1_1, val1_2); | 468 return clamp_div255round_simd8_32(val1_1, val1_2); |
506 } | 469 } |
507 | 470 |
508 static inline uint8x8_t overlay_color(uint8x8_t sc, uint8x8_t dc, | 471 static inline uint8x8_t overlay_color(uint8x8_t sc, uint8x8_t dc, |
(...skipping 149 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
658 sc_plus_dc = vaddl_u8(sc, dc); | 621 sc_plus_dc = vaddl_u8(sc, dc); |
659 scdc = vmull_u8(sc, dc); | 622 scdc = vmull_u8(sc, dc); |
660 | 623 |
661 /* Prepare constants */ | 624 /* Prepare constants */ |
662 const255 = vdupq_n_u16(255); | 625 const255 = vdupq_n_u16(255); |
663 | 626 |
664 /* Calc the first term */ | 627 /* Calc the first term */ |
665 term1_1 = vreinterpretq_s32_u32( | 628 term1_1 = vreinterpretq_s32_u32( |
666 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc))); | 629 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc))); |
667 term1_2 = vreinterpretq_s32_u32( | 630 term1_2 = vreinterpretq_s32_u32( |
668 #ifdef SK_CPU_ARM64 | |
669 vmull_high_u16(const255, sc_plus_dc)); | |
670 #else | |
671 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc))); | 631 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc))); |
672 #endif | |
673 | 632 |
674 /* Calc the second term */ | 633 /* Calc the second term */ |
675 term2_1 = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(scdc), 1)); | 634 term2_1 = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(scdc), 1)); |
676 #ifdef SK_CPU_ARM64 | |
677 term2_2 = vreinterpretq_s32_u32(vshll_high_n_u16(scdc, 1)); | |
678 #else | |
679 term2_2 = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(scdc), 1)); | 635 term2_2 = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(scdc), 1)); |
680 #endif | |
681 | 636 |
682 return clamp_div255round_simd8_32(term1_1 - term2_1, term1_2 - term2_2); | 637 return clamp_div255round_simd8_32(term1_1 - term2_1, term1_2 - term2_2); |
683 } | 638 } |
684 | 639 |
685 uint8x8x4_t exclusion_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | 640 uint8x8x4_t exclusion_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { |
686 uint8x8x4_t ret; | 641 uint8x8x4_t ret; |
687 | 642 |
688 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); | 643 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); |
689 ret.val[NEON_R] = exclusion_color(src.val[NEON_R], dst.val[NEON_R], | 644 ret.val[NEON_R] = exclusion_color(src.val[NEON_R], dst.val[NEON_R], |
690 src.val[NEON_A], dst.val[NEON_A]); | 645 src.val[NEON_A], dst.val[NEON_A]); |
691 ret.val[NEON_G] = exclusion_color(src.val[NEON_G], dst.val[NEON_G], | 646 ret.val[NEON_G] = exclusion_color(src.val[NEON_G], dst.val[NEON_G], |
692 src.val[NEON_A], dst.val[NEON_A]); | 647 src.val[NEON_A], dst.val[NEON_A]); |
693 ret.val[NEON_B] = exclusion_color(src.val[NEON_B], dst.val[NEON_B], | 648 ret.val[NEON_B] = exclusion_color(src.val[NEON_B], dst.val[NEON_B], |
694 src.val[NEON_A], dst.val[NEON_A]); | 649 src.val[NEON_A], dst.val[NEON_A]); |
695 | 650 |
696 return ret; | 651 return ret; |
697 } | 652 } |
698 | 653 |
699 static inline uint8x8_t blendfunc_multiply_color(uint8x8_t sc, uint8x8_t dc, | 654 static inline uint8x8_t blendfunc_multiply_color(uint8x8_t sc, uint8x8_t dc, |
700 uint8x8_t sa, uint8x8_t da) { | 655 uint8x8_t sa, uint8x8_t da) { |
701 uint32x4_t val1, val2; | 656 uint32x4_t val1, val2; |
702 uint16x8_t scdc, t1, t2; | 657 uint16x8_t scdc, t1, t2; |
703 | 658 |
704 t1 = vmull_u8(sc, vdup_n_u8(255) - da); | 659 t1 = vmull_u8(sc, vdup_n_u8(255) - da); |
705 t2 = vmull_u8(dc, vdup_n_u8(255) - sa); | 660 t2 = vmull_u8(dc, vdup_n_u8(255) - sa); |
706 scdc = vmull_u8(sc, dc); | 661 scdc = vmull_u8(sc, dc); |
707 | 662 |
708 val1 = vaddl_u16(vget_low_u16(t1), vget_low_u16(t2)); | 663 val1 = vaddl_u16(vget_low_u16(t1), vget_low_u16(t2)); |
709 #ifdef SK_CPU_ARM64 | |
710 val2 = vaddl_high_u16(t1, t2); | |
711 #else | |
712 val2 = vaddl_u16(vget_high_u16(t1), vget_high_u16(t2)); | 664 val2 = vaddl_u16(vget_high_u16(t1), vget_high_u16(t2)); |
713 #endif | |
714 | 665 |
715 val1 = vaddw_u16(val1, vget_low_u16(scdc)); | 666 val1 = vaddw_u16(val1, vget_low_u16(scdc)); |
716 #ifdef SK_CPU_ARM64 | |
717 val2 = vaddw_high_u16(val2, scdc); | |
718 #else | |
719 val2 = vaddw_u16(val2, vget_high_u16(scdc)); | 667 val2 = vaddw_u16(val2, vget_high_u16(scdc)); |
720 #endif | |
721 | 668 |
722 return clamp_div255round_simd8_32( | 669 return clamp_div255round_simd8_32( |
723 vreinterpretq_s32_u32(val1), vreinterpretq_s32_u32(val2)); | 670 vreinterpretq_s32_u32(val1), vreinterpretq_s32_u32(val2)); |
724 } | 671 } |
725 | 672 |
726 uint8x8x4_t multiply_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | 673 uint8x8x4_t multiply_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { |
727 uint8x8x4_t ret; | 674 uint8x8x4_t ret; |
728 | 675 |
729 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); | 676 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); |
730 ret.val[NEON_R] = blendfunc_multiply_color(src.val[NEON_R], dst.val[NEON_R], | 677 ret.val[NEON_R] = blendfunc_multiply_color(src.val[NEON_R], dst.val[NEON_R], |
(...skipping 23 matching lines...) Expand all Loading... |
754 | 701 |
755 SkXfermodeProc proc = this->getProc(); | 702 SkXfermodeProc proc = this->getProc(); |
756 SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD
); | 703 SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD
); |
757 SkASSERT(procSIMD != NULL); | 704 SkASSERT(procSIMD != NULL); |
758 | 705 |
759 if (NULL == aa) { | 706 if (NULL == aa) { |
760 // Unrolled NEON code | 707 // Unrolled NEON code |
761 while (count >= 8) { | 708 while (count >= 8) { |
762 uint8x8x4_t vsrc, vdst, vres; | 709 uint8x8x4_t vsrc, vdst, vres; |
763 | 710 |
764 #ifdef SK_CPU_ARM64 | |
765 vsrc = vld4_u8((uint8_t*)src); | |
766 vdst = vld4_u8((uint8_t*)dst); | |
767 #else | |
768 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) | 711 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) |
769 asm volatile ( | 712 asm volatile ( |
770 "vld4.u8 %h[vsrc], [%[src]]! \t\n" | 713 "vld4.u8 %h[vsrc], [%[src]]! \t\n" |
771 "vld4.u8 %h[vdst], [%[dst]] \t\n" | 714 "vld4.u8 %h[vdst], [%[dst]] \t\n" |
772 : [vsrc] "=w" (vsrc), [vdst] "=w" (vdst), [src] "+&r" (src) | 715 : [vsrc] "=w" (vsrc), [vdst] "=w" (vdst), [src] "+&r" (src) |
773 : [dst] "r" (dst) | 716 : [dst] "r" (dst) |
774 : | 717 : |
775 ); | 718 ); |
776 #else | 719 #else |
777 register uint8x8_t d0 asm("d0"); | 720 register uint8x8_t d0 asm("d0"); |
(...skipping 12 matching lines...) Expand all Loading... |
790 "=w" (d4), "=w" (d5), "=w" (d6), "=w" (d7), | 733 "=w" (d4), "=w" (d5), "=w" (d6), "=w" (d7), |
791 [src] "+&r" (src) | 734 [src] "+&r" (src) |
792 : [dst] "r" (dst) | 735 : [dst] "r" (dst) |
793 : | 736 : |
794 ); | 737 ); |
795 vsrc.val[0] = d0; vdst.val[0] = d4; | 738 vsrc.val[0] = d0; vdst.val[0] = d4; |
796 vsrc.val[1] = d1; vdst.val[1] = d5; | 739 vsrc.val[1] = d1; vdst.val[1] = d5; |
797 vsrc.val[2] = d2; vdst.val[2] = d6; | 740 vsrc.val[2] = d2; vdst.val[2] = d6; |
798 vsrc.val[3] = d3; vdst.val[3] = d7; | 741 vsrc.val[3] = d3; vdst.val[3] = d7; |
799 #endif | 742 #endif |
800 #endif // #ifdef SK_CPU_ARM64 | |
801 | 743 |
802 vres = procSIMD(vsrc, vdst); | 744 vres = procSIMD(vsrc, vdst); |
803 | 745 |
804 vst4_u8((uint8_t*)dst, vres); | 746 vst4_u8((uint8_t*)dst, vres); |
805 | 747 |
806 count -= 8; | 748 count -= 8; |
807 dst += 8; | 749 dst += 8; |
808 #ifdef SK_CPU_ARM64 | |
809 src += 8; | |
810 #endif | |
811 } | 750 } |
812 // Leftovers | 751 // Leftovers |
813 for (int i = 0; i < count; i++) { | 752 for (int i = 0; i < count; i++) { |
814 dst[i] = proc(src[i], dst[i]); | 753 dst[i] = proc(src[i], dst[i]); |
815 } | 754 } |
816 } else { | 755 } else { |
817 for (int i = count - 1; i >= 0; --i) { | 756 for (int i = count - 1; i >= 0; --i) { |
818 unsigned a = aa[i]; | 757 unsigned a = aa[i]; |
819 if (0 != a) { | 758 if (0 != a) { |
820 SkPMColor dstC = dst[i]; | 759 SkPMColor dstC = dst[i]; |
(...skipping 16 matching lines...) Expand all Loading... |
837 SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD
); | 776 SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD
); |
838 SkASSERT(procSIMD != NULL); | 777 SkASSERT(procSIMD != NULL); |
839 | 778 |
840 if (NULL == aa) { | 779 if (NULL == aa) { |
841 while(count >= 8) { | 780 while(count >= 8) { |
842 uint16x8_t vdst, vres16; | 781 uint16x8_t vdst, vres16; |
843 uint8x8x4_t vdst32, vsrc, vres; | 782 uint8x8x4_t vdst32, vsrc, vres; |
844 | 783 |
845 vdst = vld1q_u16(dst); | 784 vdst = vld1q_u16(dst); |
846 | 785 |
847 #ifdef SK_CPU_ARM64 | |
848 vsrc = vld4_u8((uint8_t*)src); | |
849 #else | |
850 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) | 786 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) |
851 asm volatile ( | 787 asm volatile ( |
852 "vld4.u8 %h[vsrc], [%[src]]! \t\n" | 788 "vld4.u8 %h[vsrc], [%[src]]! \t\n" |
853 : [vsrc] "=w" (vsrc), [src] "+&r" (src) | 789 : [vsrc] "=w" (vsrc), [src] "+&r" (src) |
854 : : | 790 : : |
855 ); | 791 ); |
856 #else | 792 #else |
857 register uint8x8_t d0 asm("d0"); | 793 register uint8x8_t d0 asm("d0"); |
858 register uint8x8_t d1 asm("d1"); | 794 register uint8x8_t d1 asm("d1"); |
859 register uint8x8_t d2 asm("d2"); | 795 register uint8x8_t d2 asm("d2"); |
860 register uint8x8_t d3 asm("d3"); | 796 register uint8x8_t d3 asm("d3"); |
861 | 797 |
862 asm volatile ( | 798 asm volatile ( |
863 "vld4.u8 {d0-d3},[%[src]]!;" | 799 "vld4.u8 {d0-d3},[%[src]]!;" |
864 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), | 800 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), |
865 [src] "+&r" (src) | 801 [src] "+&r" (src) |
866 : : | 802 : : |
867 ); | 803 ); |
868 vsrc.val[0] = d0; | 804 vsrc.val[0] = d0; |
869 vsrc.val[1] = d1; | 805 vsrc.val[1] = d1; |
870 vsrc.val[2] = d2; | 806 vsrc.val[2] = d2; |
871 vsrc.val[3] = d3; | 807 vsrc.val[3] = d3; |
872 #endif | 808 #endif |
873 #endif // #ifdef SK_CPU_ARM64 | |
874 | 809 |
875 vdst32 = SkPixel16ToPixel32_neon8(vdst); | 810 vdst32 = SkPixel16ToPixel32_neon8(vdst); |
876 vres = procSIMD(vsrc, vdst32); | 811 vres = procSIMD(vsrc, vdst32); |
877 vres16 = SkPixel32ToPixel16_neon8(vres); | 812 vres16 = SkPixel32ToPixel16_neon8(vres); |
878 | 813 |
879 vst1q_u16(dst, vres16); | 814 vst1q_u16(dst, vres16); |
880 | 815 |
881 count -= 8; | 816 count -= 8; |
882 dst += 8; | 817 dst += 8; |
883 #ifdef SK_CPU_ARM64 | |
884 src += 8; | |
885 #endif | |
886 } | 818 } |
887 for (int i = 0; i < count; i++) { | 819 for (int i = 0; i < count; i++) { |
888 SkPMColor dstC = SkPixel16ToPixel32(dst[i]); | 820 SkPMColor dstC = SkPixel16ToPixel32(dst[i]); |
889 dst[i] = SkPixel32ToPixel16_ToU16(proc(src[i], dstC)); | 821 dst[i] = SkPixel32ToPixel16_ToU16(proc(src[i], dstC)); |
890 } | 822 } |
891 } else { | 823 } else { |
892 for (int i = count - 1; i >= 0; --i) { | 824 for (int i = count - 1; i >= 0; --i) { |
893 unsigned a = aa[i]; | 825 unsigned a = aa[i]; |
894 if (0 != a) { | 826 if (0 != a) { |
895 SkPMColor dstC = SkPixel16ToPixel32(dst[i]); | 827 SkPMColor dstC = SkPixel16ToPixel32(dst[i]); |
(...skipping 100 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
996 | 928 |
997 if (procSIMD != NULL) { | 929 if (procSIMD != NULL) { |
998 return SkNEW_ARGS(SkNEONProcCoeffXfermode, (rec, mode, procSIMD)); | 930 return SkNEW_ARGS(SkNEONProcCoeffXfermode, (rec, mode, procSIMD)); |
999 } | 931 } |
1000 return NULL; | 932 return NULL; |
1001 } | 933 } |
1002 | 934 |
1003 SkXfermodeProc SkPlatformXfermodeProcFactory_impl_neon(SkXfermode::Mode mode) { | 935 SkXfermodeProc SkPlatformXfermodeProcFactory_impl_neon(SkXfermode::Mode mode) { |
1004 return gNEONXfermodeProcs1[mode]; | 936 return gNEONXfermodeProcs1[mode]; |
1005 } | 937 } |
OLD | NEW |