OLD | NEW |
1 #include "SkXfermode.h" | 1 #include "SkXfermode.h" |
2 #include "SkXfermode_proccoeff.h" | 2 #include "SkXfermode_proccoeff.h" |
3 #include "SkColorPriv.h" | 3 #include "SkColorPriv.h" |
4 | 4 |
5 #include <arm_neon.h> | 5 #include <arm_neon.h> |
6 #include "SkColor_opts_neon.h" | 6 #include "SkColor_opts_neon.h" |
7 #include "SkXfermode_opts_arm_neon.h" | 7 #include "SkXfermode_opts_arm_neon.h" |
8 | 8 |
9 #define SkAlphaMulAlpha(a, b) SkMulDiv255Round(a, b) | 9 #define SkAlphaMulAlpha(a, b) SkMulDiv255Round(a, b) |
10 | 10 |
(...skipping 23 matching lines...)
34 ret = vaddq_u16(ret, vshrq_n_u16(ret, 8)); | 34 ret = vaddq_u16(ret, vshrq_n_u16(ret, 8)); |
35 | 35 |
36 ret = vshrq_n_u16(ret, 8); | 36 ret = vshrq_n_u16(ret, 8); |
37 | 37 |
38 return ret; | 38 return ret; |
39 } | 39 } |
40 | 40 |
41 static inline uint8x8_t SkDiv255Round_neon8_32_8(int32x4_t p1, int32x4_t p2) { | 41 static inline uint8x8_t SkDiv255Round_neon8_32_8(int32x4_t p1, int32x4_t p2) { |
42 uint16x8_t tmp; | 42 uint16x8_t tmp; |
43 | 43 |
| 44 #ifdef SK_CPU_ARM64 |
| 45 tmp = vmovn_high_u32(vmovn_u32(vreinterpretq_u32_s32(p1)), |
| 46 vreinterpretq_u32_s32(p2)); |
| 47 #else |
44 tmp = vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(p1)), | 48 tmp = vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(p1)), |
45 vmovn_u32(vreinterpretq_u32_s32(p2))); | 49 vmovn_u32(vreinterpretq_u32_s32(p2))); |
| 50 #endif |
46 | 51 |
47 tmp += vdupq_n_u16(128); | 52 tmp += vdupq_n_u16(128); |
48 tmp += vshrq_n_u16(tmp, 8); | 53 tmp += vshrq_n_u16(tmp, 8); |
49 | 54 |
50 return vshrn_n_u16(tmp, 8); | 55 return vshrn_n_u16(tmp, 8); |
51 } | 56 } |
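The recurring transformation in this patch is the narrowing step: the 32-bit build narrows each uint32x4_t half separately and pastes the halves together with vcombine_u16, while the AArch64 build uses vmovn_high_u32 to narrow the second vector straight into the upper lanes of the result. A minimal standalone sketch of the two equivalent forms, using plain intrinsics and the standard __aarch64__ macro instead of Skia's SK_CPU_ARM64:

    #include <arm_neon.h>

    // Narrow two uint32x4_t vectors into one uint16x8_t.

    // 32-bit NEON: narrow each half to 16x4, then combine into a q-register.
    static inline uint16x8_t narrow_pair_a32(uint32x4_t lo, uint32x4_t hi) {
        return vcombine_u16(vmovn_u32(lo), vmovn_u32(hi));
    }

    #if defined(__aarch64__)
    // AArch64: XTN2 via vmovn_high_u32 narrows 'hi' directly into the upper
    // half of the result, saving the separate combine.
    static inline uint16x8_t narrow_pair_a64(uint32x4_t lo, uint32x4_t hi) {
        return vmovn_high_u32(vmovn_u32(lo), hi);
    }
    #endif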
52 | 57 |
53 static inline uint16x8_t SkDiv255Round_neon8_16_16(uint16x8_t prod) { | 58 static inline uint16x8_t SkDiv255Round_neon8_16_16(uint16x8_t prod) { |
54 prod += vdupq_n_u16(128); | 59 prod += vdupq_n_u16(128); |
55 prod += vshrq_n_u16(prod, 8); | 60 prod += vshrq_n_u16(prod, 8); |
56 | 61 |
57 return vshrq_n_u16(prod, 8); | 62 return vshrq_n_u16(prod, 8); |
58 } | 63 } |
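Both Div255Round helpers implement the same rounding divide-by-255 on widened lanes: add 128, add the high byte back in, shift right by 8. A scalar restatement (illustrative name, not part of the patch) that matches the SkMulDiv255Round contract for any product of two 8-bit values:

    #include <stdint.h>

    // (x + 128 + ((x + 128) >> 8)) >> 8 == round(x / 255.0) for 0 <= x <= 255*255.
    static inline uint8_t div255_round_scalar(uint32_t x) {
        x += 128;
        return (uint8_t)((x + (x >> 8)) >> 8);
    }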
59 | 64 |
60 static inline uint8x8_t clamp_div255round_simd8_32(int32x4_t val1, int32x4_t val2) { | 65 static inline uint8x8_t clamp_div255round_simd8_32(int32x4_t val1, int32x4_t val2) { |
61 uint8x8_t ret; | 66 uint8x8_t ret; |
62 uint32x4_t cmp1, cmp2; | 67 uint32x4_t cmp1, cmp2; |
63 uint16x8_t cmp16; | 68 uint16x8_t cmp16; |
64 uint8x8_t cmp8, cmp8_1; | 69 uint8x8_t cmp8, cmp8_1; |
65 | 70 |
66 // Test if <= 0 | 71 // Test if <= 0 |
67 cmp1 = vcleq_s32(val1, vdupq_n_s32(0)); | 72 cmp1 = vcleq_s32(val1, vdupq_n_s32(0)); |
68 cmp2 = vcleq_s32(val2, vdupq_n_s32(0)); | 73 cmp2 = vcleq_s32(val2, vdupq_n_s32(0)); |
| 74 #ifdef SK_CPU_ARM64 |
| 75 cmp16 = vmovn_high_u32(vmovn_u32(cmp1), cmp2); |
| 76 #else |
69 cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2)); | 77 cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2)); |
| 78 #endif |
70 cmp8_1 = vmovn_u16(cmp16); | 79 cmp8_1 = vmovn_u16(cmp16); |
71 | 80 |
72 // Init to zero | 81 // Init to zero |
73 ret = vdup_n_u8(0); | 82 ret = vdup_n_u8(0); |
74 | 83 |
75 // Test if >= 255*255 | 84 // Test if >= 255*255 |
76 cmp1 = vcgeq_s32(val1, vdupq_n_s32(255*255)); | 85 cmp1 = vcgeq_s32(val1, vdupq_n_s32(255*255)); |
77 cmp2 = vcgeq_s32(val2, vdupq_n_s32(255*255)); | 86 cmp2 = vcgeq_s32(val2, vdupq_n_s32(255*255)); |
| 87 #ifdef SK_CPU_ARM64 |
| 88 cmp16 = vmovn_high_u32(vmovn_u32(cmp1), cmp2); |
| 89 #else |
78 cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2)); | 90 cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2)); |
| 91 #endif |
79 cmp8 = vmovn_u16(cmp16); | 92 cmp8 = vmovn_u16(cmp16); |
80 | 93 |
81 // Insert 255 where true | 94 // Insert 255 where true |
82 ret = vbsl_u8(cmp8, vdup_n_u8(255), ret); | 95 ret = vbsl_u8(cmp8, vdup_n_u8(255), ret); |
83 | 96 |
84 // Calc SkDiv255Round | 97 // Calc SkDiv255Round |
85 uint8x8_t div = SkDiv255Round_neon8_32_8(val1, val2); | 98 uint8x8_t div = SkDiv255Round_neon8_32_8(val1, val2); |
86 | 99 |
87 // Insert where false and previous test false | 100 // Insert where false and previous test false |
88 cmp8 = cmp8 | cmp8_1; | 101 cmp8 = cmp8 | cmp8_1; |
(...skipping 313 matching lines...)
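The tail of clamp_div255round_simd8_32 falls inside the collapsed region, but the comparisons above give its shape: lanes at or below zero become 0, lanes at or above 255*255 become 255, and everything else goes through the rounded divide. A per-lane scalar sketch (illustrative helper, mirroring the visible tests):

    #include <stdint.h>

    // Per-lane behaviour of clamp_div255round_simd8_32, written as scalar C.
    static inline uint8_t clamp_div255round_scalar(int32_t prod) {
        if (prod <= 0) {
            return 0;                  // vcleq_s32 against 0 -> insert 0
        }
        if (prod >= 255 * 255) {
            return 255;                // vcgeq_s32 against 255*255 -> insert 255
        }
        uint32_t x = (uint32_t)prod + 128;
        return (uint8_t)((x + (x >> 8)) >> 8);   // SkDiv255Round
    }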
402 uint16x8_t scda = vmull_u8(sc, da); | 415 uint16x8_t scda = vmull_u8(sc, da); |
403 uint16x8_t dcsa = vmull_u8(dc, sa); | 416 uint16x8_t dcsa = vmull_u8(dc, sa); |
404 uint16x8_t sada = vmull_u8(sa, da); | 417 uint16x8_t sada = vmull_u8(sa, da); |
405 | 418 |
406 // Prepare non-common subexpressions | 419 // Prepare non-common subexpressions |
407 uint16x8_t dc2, sc2; | 420 uint16x8_t dc2, sc2; |
408 uint32x4_t scdc2_1, scdc2_2; | 421 uint32x4_t scdc2_1, scdc2_2; |
409 if (overlay) { | 422 if (overlay) { |
410 dc2 = vshll_n_u8(dc, 1); | 423 dc2 = vshll_n_u8(dc, 1); |
411 scdc2_1 = vmull_u16(vget_low_u16(dc2), vget_low_u16(vmovl_u8(sc))); | 424 scdc2_1 = vmull_u16(vget_low_u16(dc2), vget_low_u16(vmovl_u8(sc))); |
| 425 #ifdef SK_CPU_ARM64 |
| 426 scdc2_2 = vmull_high_u16(dc2, vmovl_u8(sc)); |
| 427 #else |
412 scdc2_2 = vmull_u16(vget_high_u16(dc2), vget_high_u16(vmovl_u8(sc))); | 428 scdc2_2 = vmull_u16(vget_high_u16(dc2), vget_high_u16(vmovl_u8(sc))); |
| 429 #endif |
413 } else { | 430 } else { |
414 sc2 = vshll_n_u8(sc, 1); | 431 sc2 = vshll_n_u8(sc, 1); |
415 scdc2_1 = vmull_u16(vget_low_u16(sc2), vget_low_u16(vmovl_u8(dc))); | 432 scdc2_1 = vmull_u16(vget_low_u16(sc2), vget_low_u16(vmovl_u8(dc))); |
| 433 #ifdef SK_CPU_ARM64 |
| 434 scdc2_2 = vmull_high_u16(sc2, vmovl_u8(dc)); |
| 435 #else |
416 scdc2_2 = vmull_u16(vget_high_u16(sc2), vget_high_u16(vmovl_u8(dc))); | 436 scdc2_2 = vmull_u16(vget_high_u16(sc2), vget_high_u16(vmovl_u8(dc))); |
| 437 #endif |
417 } | 438 } |
418 | 439 |
419 // Calc COM | 440 // Calc COM |
420 int32x4_t com1, com2; | 441 int32x4_t com1, com2; |
421 com1 = vreinterpretq_s32_u32( | 442 com1 = vreinterpretq_s32_u32( |
422 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc))); | 443 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc))); |
423 com2 = vreinterpretq_s32_u32( | 444 com2 = vreinterpretq_s32_u32( |
| 445 #ifdef SK_CPU_ARM64 |
| 446 vmull_high_u16(const255, sc_plus_dc)); |
| 447 #else |
424 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc))); | 448 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc))); |
| 449 #endif |
425 | 450 |
426 // Calc SUB | 451 // Calc SUB |
427 int32x4_t sub1, sub2; | 452 int32x4_t sub1, sub2; |
428 sub1 = vreinterpretq_s32_u32(vaddl_u16(vget_low_u16(scda), vget_low_u16(dcsa))); | 453 sub1 = vreinterpretq_s32_u32(vaddl_u16(vget_low_u16(scda), vget_low_u16(dcsa))); |
| 454 #ifdef SK_CPU_ARM64 |
| 455 sub2 = vreinterpretq_s32_u32(vaddl_high_u16(scda, dcsa)); |
| 456 #else |
429 sub2 = vreinterpretq_s32_u32(vaddl_u16(vget_high_u16(scda), vget_high_u16(dcsa))); | 457 sub2 = vreinterpretq_s32_u32(vaddl_u16(vget_high_u16(scda), vget_high_u16(dcsa))); |
| 458 #endif |
430 sub1 = vsubq_s32(sub1, vreinterpretq_s32_u32(scdc2_1)); | 459 sub1 = vsubq_s32(sub1, vreinterpretq_s32_u32(scdc2_1)); |
431 sub2 = vsubq_s32(sub2, vreinterpretq_s32_u32(scdc2_2)); | 460 sub2 = vsubq_s32(sub2, vreinterpretq_s32_u32(scdc2_2)); |
432 | 461 |
433 // Compare 2*dc <= da | 462 // Compare 2*dc <= da |
434 uint16x8_t cmp; | 463 uint16x8_t cmp; |
435 | 464 |
436 if (overlay) { | 465 if (overlay) { |
437 cmp = vcleq_u16(dc2, vmovl_u8(da)); | 466 cmp = vcleq_u16(dc2, vmovl_u8(da)); |
438 } else { | 467 } else { |
439 cmp = vcleq_u16(sc2, vmovl_u8(sa)); | 468 cmp = vcleq_u16(sc2, vmovl_u8(sa)); |
440 } | 469 } |
441 | 470 |
442 // Prepare variables | 471 // Prepare variables |
443 int32x4_t val1_1, val1_2; | 472 int32x4_t val1_1, val1_2; |
444 int32x4_t val2_1, val2_2; | 473 int32x4_t val2_1, val2_2; |
445 uint32x4_t cmp1, cmp2; | 474 uint32x4_t cmp1, cmp2; |
446 | 475 |
447 cmp1 = vmovl_u16(vget_low_u16(cmp)); | 476 // Doing a signed lengthening saves a few instructions |
448 cmp1 |= vshlq_n_u32(cmp1, 16); | 477 // thanks to sign extension. |
449 cmp2 = vmovl_u16(vget_high_u16(cmp)); | 478 cmp1 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_low_u16(cmp)))); |
450 cmp2 |= vshlq_n_u32(cmp2, 16); | 479 #ifdef SK_CPU_ARM64 |
| 480 cmp2 = vreinterpretq_u32_s32(vmovl_high_s16(vreinterpretq_s16_u16(cmp))); |
| 481 #else |
| 482 cmp2 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_high_u16(cmp)))); |
| 483 #endif |
451 | 484 |
452 // Calc COM - SUB | 485 // Calc COM - SUB |
453 val1_1 = com1 - sub1; | 486 val1_1 = com1 - sub1; |
454 val1_2 = com2 - sub2; | 487 val1_2 = com2 - sub2; |
455 | 488 |
456 // Calc COM + SUB - sa*da | 489 // Calc COM + SUB - sa*da |
457 val2_1 = com1 + sub1; | 490 val2_1 = com1 + sub1; |
458 val2_2 = com2 + sub2; | 491 val2_2 = com2 + sub2; |
459 | 492 |
460 val2_1 = vsubq_s32(val2_1, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sada)))); | 493 val2_1 = vsubq_s32(val2_1, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sada)))); |
| 494 #ifdef SK_CPU_ARM64 |
| 495 val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_high_u16(sada))); |
| 496 #else |
461 val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sada)))); | 497 val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sada)))); |
| 498 #endif |
462 | 499 |
463 // Insert where needed | 500 // Insert where needed |
464 val1_1 = vbslq_s32(cmp1, val1_1, val2_1); | 501 val1_1 = vbslq_s32(cmp1, val1_1, val2_1); |
465 val1_2 = vbslq_s32(cmp2, val1_2, val2_2); | 502 val1_2 = vbslq_s32(cmp2, val1_2, val2_2); |
466 | 503 |
467 // Call the clamp_div255round function | 504 // Call the clamp_div255round function |
468 return clamp_div255round_simd8_32(val1_1, val1_2); | 505 return clamp_div255round_simd8_32(val1_1, val1_2); |
469 } | 506 } |
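Expanding the COM, SUB and sa*da terms shows what each selected lane evaluates to per channel. A scalar restatement of the blend computed above (a sketch under the vector code's variable names, using the illustrative clamp_div255round_scalar helper from earlier):

    // overlay == true selects the overlay variant, false the hardlight variant,
    // matching the flag used by the vector routine.
    static inline uint8_t overlay_hardlight_scalar(int sc, int dc, int sa, int da,
                                                   bool overlay) {
        const int com   = 255 * (sc + dc);                   // COM
        const int scdc2 = 2 * sc * dc;                       // same product either way
        const int sub   = sc * da + dc * sa - scdc2;         // SUB
        const bool take_first = overlay ? (2 * dc <= da) : (2 * sc <= sa);
        const int val = take_first ? (com - sub)             // COM - SUB
                                   : (com + sub - sa * da);  // COM + SUB - sa*da
        return clamp_div255round_scalar(val);
    }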
470 | 507 |
471 static inline uint8x8_t overlay_color(uint8x8_t sc, uint8x8_t dc, | 508 static inline uint8x8_t overlay_color(uint8x8_t sc, uint8x8_t dc, |
(...skipping 149 matching lines...)
621 sc_plus_dc = vaddl_u8(sc, dc); | 658 sc_plus_dc = vaddl_u8(sc, dc); |
622 scdc = vmull_u8(sc, dc); | 659 scdc = vmull_u8(sc, dc); |
623 | 660 |
624 /* Prepare constants */ | 661 /* Prepare constants */ |
625 const255 = vdupq_n_u16(255); | 662 const255 = vdupq_n_u16(255); |
626 | 663 |
627 /* Calc the first term */ | 664 /* Calc the first term */ |
628 term1_1 = vreinterpretq_s32_u32( | 665 term1_1 = vreinterpretq_s32_u32( |
629 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc))); | 666 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc))); |
630 term1_2 = vreinterpretq_s32_u32( | 667 term1_2 = vreinterpretq_s32_u32( |
| 668 #ifdef SK_CPU_ARM64 |
| 669 vmull_high_u16(const255, sc_plus_dc)); |
| 670 #else |
631 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc))); | 671 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc))); |
| 672 #endif |
632 | 673 |
633 /* Calc the second term */ | 674 /* Calc the second term */ |
634 term2_1 = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(scdc), 1)); | 675 term2_1 = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(scdc), 1)); |
| 676 #ifdef SK_CPU_ARM64 |
| 677 term2_2 = vreinterpretq_s32_u32(vshll_high_n_u16(scdc, 1)); |
| 678 #else |
635 term2_2 = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(scdc), 1)); | 679 term2_2 = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(scdc), 1)); |
| 680 #endif |
636 | 681 |
637 return clamp_div255round_simd8_32(term1_1 - term2_1, term1_2 - term2_2); | 682 return clamp_div255round_simd8_32(term1_1 - term2_1, term1_2 - term2_2); |
638 } | 683 } |
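exclusion_color reduces to a short per-channel expression; the alpha channels are not needed for these two terms. A scalar sketch of the same computation:

    // 255*(sc + dc) - 2*sc*dc, clamped and divided by 255 with rounding.
    static inline uint8_t exclusion_scalar(int sc, int dc) {
        const int term1 = 255 * (sc + dc);
        const int term2 = 2 * sc * dc;
        return clamp_div255round_scalar(term1 - term2);
    }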
639 | 684 |
640 uint8x8x4_t exclusion_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | 685 uint8x8x4_t exclusion_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { |
641 uint8x8x4_t ret; | 686 uint8x8x4_t ret; |
642 | 687 |
643 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); | 688 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); |
644 ret.val[NEON_R] = exclusion_color(src.val[NEON_R], dst.val[NEON_R], | 689 ret.val[NEON_R] = exclusion_color(src.val[NEON_R], dst.val[NEON_R], |
645 src.val[NEON_A], dst.val[NEON_A]); | 690 src.val[NEON_A], dst.val[NEON_A]); |
646 ret.val[NEON_G] = exclusion_color(src.val[NEON_G], dst.val[NEON_G], | 691 ret.val[NEON_G] = exclusion_color(src.val[NEON_G], dst.val[NEON_G], |
647 src.val[NEON_A], dst.val[NEON_A]); | 692 src.val[NEON_A], dst.val[NEON_A]); |
648 ret.val[NEON_B] = exclusion_color(src.val[NEON_B], dst.val[NEON_B], | 693 ret.val[NEON_B] = exclusion_color(src.val[NEON_B], dst.val[NEON_B], |
649 src.val[NEON_A], dst.val[NEON_A]); | 694 src.val[NEON_A], dst.val[NEON_A]); |
650 | 695 |
651 return ret; | 696 return ret; |
652 } | 697 } |
653 | 698 |
654 static inline uint8x8_t blendfunc_multiply_color(uint8x8_t sc, uint8x8_t dc, | 699 static inline uint8x8_t blendfunc_multiply_color(uint8x8_t sc, uint8x8_t dc, |
655 uint8x8_t sa, uint8x8_t da) { | 700 uint8x8_t sa, uint8x8_t da) { |
656 uint32x4_t val1, val2; | 701 uint32x4_t val1, val2; |
657 uint16x8_t scdc, t1, t2; | 702 uint16x8_t scdc, t1, t2; |
658 | 703 |
659 t1 = vmull_u8(sc, vdup_n_u8(255) - da); | 704 t1 = vmull_u8(sc, vdup_n_u8(255) - da); |
660 t2 = vmull_u8(dc, vdup_n_u8(255) - sa); | 705 t2 = vmull_u8(dc, vdup_n_u8(255) - sa); |
661 scdc = vmull_u8(sc, dc); | 706 scdc = vmull_u8(sc, dc); |
662 | 707 |
663 val1 = vaddl_u16(vget_low_u16(t1), vget_low_u16(t2)); | 708 val1 = vaddl_u16(vget_low_u16(t1), vget_low_u16(t2)); |
| 709 #ifdef SK_CPU_ARM64 |
| 710 val2 = vaddl_high_u16(t1, t2); |
| 711 #else |
664 val2 = vaddl_u16(vget_high_u16(t1), vget_high_u16(t2)); | 712 val2 = vaddl_u16(vget_high_u16(t1), vget_high_u16(t2)); |
| 713 #endif |
665 | 714 |
666 val1 = vaddw_u16(val1, vget_low_u16(scdc)); | 715 val1 = vaddw_u16(val1, vget_low_u16(scdc)); |
| 716 #ifdef SK_CPU_ARM64 |
| 717 val2 = vaddw_high_u16(val2, scdc); |
| 718 #else |
667 val2 = vaddw_u16(val2, vget_high_u16(scdc)); | 719 val2 = vaddw_u16(val2, vget_high_u16(scdc)); |
| 720 #endif |
668 | 721 |
669 return clamp_div255round_simd8_32( | 722 return clamp_div255round_simd8_32( |
670 vreinterpretq_s32_u32(val1), vreinterpretq_s32_u32(val2)); | 723 vreinterpretq_s32_u32(val1), vreinterpretq_s32_u32(val2)); |
671 } | 724 } |
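blendfunc_multiply_color accumulates three widened products before the shared clamp-and-divide. The per-channel scalar form it corresponds to:

    // sc*(255 - da) + dc*(255 - sa) + sc*dc, then the rounded divide by 255.
    static inline uint8_t multiply_scalar(int sc, int dc, int sa, int da) {
        const int val = sc * (255 - da) + dc * (255 - sa) + sc * dc;
        return clamp_div255round_scalar(val);
    }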
672 | 725 |
673 uint8x8x4_t multiply_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | 726 uint8x8x4_t multiply_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { |
674 uint8x8x4_t ret; | 727 uint8x8x4_t ret; |
675 | 728 |
676 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); | 729 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); |
677 ret.val[NEON_R] = blendfunc_multiply_color(src.val[NEON_R], dst.val[NEON_R], | 730 ret.val[NEON_R] = blendfunc_multiply_color(src.val[NEON_R], dst.val[NEON_R], |
(...skipping 23 matching lines...)
701 | 754 |
702 SkXfermodeProc proc = this->getProc(); | 755 SkXfermodeProc proc = this->getProc(); |
703 SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD); | 756 SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD); |
704 SkASSERT(procSIMD != NULL); | 757 SkASSERT(procSIMD != NULL); |
705 | 758 |
706 if (NULL == aa) { | 759 if (NULL == aa) { |
707 // Unrolled NEON code | 760 // Unrolled NEON code |
708 while (count >= 8) { | 761 while (count >= 8) { |
709 uint8x8x4_t vsrc, vdst, vres; | 762 uint8x8x4_t vsrc, vdst, vres; |
710 | 763 |
| 764 #ifdef SK_CPU_ARM64 |
| 765 vsrc = vld4_u8((uint8_t*)src); |
| 766 vdst = vld4_u8((uint8_t*)dst); |
| 767 #else |
711 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) | 768 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) |
712 asm volatile ( | 769 asm volatile ( |
713 "vld4.u8 %h[vsrc], [%[src]]! \t\n" | 770 "vld4.u8 %h[vsrc], [%[src]]! \t\n" |
714 "vld4.u8 %h[vdst], [%[dst]] \t\n" | 771 "vld4.u8 %h[vdst], [%[dst]] \t\n" |
715 : [vsrc] "=w" (vsrc), [vdst] "=w" (vdst), [src] "+&r" (src) | 772 : [vsrc] "=w" (vsrc), [vdst] "=w" (vdst), [src] "+&r" (src) |
716 : [dst] "r" (dst) | 773 : [dst] "r" (dst) |
717 : | 774 : |
718 ); | 775 ); |
719 #else | 776 #else |
720 register uint8x8_t d0 asm("d0"); | 777 register uint8x8_t d0 asm("d0"); |
(...skipping 12 matching lines...)
733 "=w" (d4), "=w" (d5), "=w" (d6), "=w" (d7), | 790 "=w" (d4), "=w" (d5), "=w" (d6), "=w" (d7), |
734 [src] "+&r" (src) | 791 [src] "+&r" (src) |
735 : [dst] "r" (dst) | 792 : [dst] "r" (dst) |
736 : | 793 : |
737 ); | 794 ); |
738 vsrc.val[0] = d0; vdst.val[0] = d4; | 795 vsrc.val[0] = d0; vdst.val[0] = d4; |
739 vsrc.val[1] = d1; vdst.val[1] = d5; | 796 vsrc.val[1] = d1; vdst.val[1] = d5; |
740 vsrc.val[2] = d2; vdst.val[2] = d6; | 797 vsrc.val[2] = d2; vdst.val[2] = d6; |
741 vsrc.val[3] = d3; vdst.val[3] = d7; | 798 vsrc.val[3] = d3; vdst.val[3] = d7; |
742 #endif | 799 #endif |
| 800 #endif // #ifdef SK_CPU_ARM64 |
743 | 801 |
744 vres = procSIMD(vsrc, vdst); | 802 vres = procSIMD(vsrc, vdst); |
745 | 803 |
746 vst4_u8((uint8_t*)dst, vres); | 804 vst4_u8((uint8_t*)dst, vres); |
747 | 805 |
748 count -= 8; | 806 count -= 8; |
749 dst += 8; | 807 dst += 8; |
| 808 #ifdef SK_CPU_ARM64 |
| 809 src += 8; |
| 810 #endif |
750 } | 811 } |
751 // Leftovers | 812 // Leftovers |
752 for (int i = 0; i < count; i++) { | 813 for (int i = 0; i < count; i++) { |
753 dst[i] = proc(src[i], dst[i]); | 814 dst[i] = proc(src[i], dst[i]); |
754 } | 815 } |
755 } else { | 816 } else { |
756 for (int i = count - 1; i >= 0; --i) { | 817 for (int i = count - 1; i >= 0; --i) { |
757 unsigned a = aa[i]; | 818 unsigned a = aa[i]; |
758 if (0 != a) { | 819 if (0 != a) { |
759 SkPMColor dstC = dst[i]; | 820 SkPMColor dstC = dst[i]; |
(...skipping 16 matching lines...)
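For reference, the 8-pixel loop in xfer32() above condenses to the following on the AArch64 path, where plain intrinsics replace the inline asm and both pointers are advanced explicitly (the 32-bit asm folds the source post-increment into the vld4.u8). This is a sketch of the control flow under the surrounding names (src, dst, count, procSIMD), not a drop-in replacement:

    while (count >= 8) {
        // vld4 deinterleaves 8 SkPMColors into four uint8x8_t channel vectors.
        uint8x8x4_t vsrc = vld4_u8((const uint8_t*)src);
        uint8x8x4_t vdst = vld4_u8((uint8_t*)dst);

        uint8x8x4_t vres = procSIMD(vsrc, vdst);   // per-channel blend

        vst4_u8((uint8_t*)dst, vres);              // re-interleave and store

        src += 8;
        dst += 8;
        count -= 8;
    }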
776 SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD); | 837 SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD); |
777 SkASSERT(procSIMD != NULL); | 838 SkASSERT(procSIMD != NULL); |
778 | 839 |
779 if (NULL == aa) { | 840 if (NULL == aa) { |
780 while(count >= 8) { | 841 while(count >= 8) { |
781 uint16x8_t vdst, vres16; | 842 uint16x8_t vdst, vres16; |
782 uint8x8x4_t vdst32, vsrc, vres; | 843 uint8x8x4_t vdst32, vsrc, vres; |
783 | 844 |
784 vdst = vld1q_u16(dst); | 845 vdst = vld1q_u16(dst); |
785 | 846 |
| 847 #ifdef SK_CPU_ARM64 |
| 848 vsrc = vld4_u8((uint8_t*)src); |
| 849 #else |
786 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) | 850 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) |
787 asm volatile ( | 851 asm volatile ( |
788 "vld4.u8 %h[vsrc], [%[src]]! \t\n" | 852 "vld4.u8 %h[vsrc], [%[src]]! \t\n" |
789 : [vsrc] "=w" (vsrc), [src] "+&r" (src) | 853 : [vsrc] "=w" (vsrc), [src] "+&r" (src) |
790 : : | 854 : : |
791 ); | 855 ); |
792 #else | 856 #else |
793 register uint8x8_t d0 asm("d0"); | 857 register uint8x8_t d0 asm("d0"); |
794 register uint8x8_t d1 asm("d1"); | 858 register uint8x8_t d1 asm("d1"); |
795 register uint8x8_t d2 asm("d2"); | 859 register uint8x8_t d2 asm("d2"); |
796 register uint8x8_t d3 asm("d3"); | 860 register uint8x8_t d3 asm("d3"); |
797 | 861 |
798 asm volatile ( | 862 asm volatile ( |
799 "vld4.u8 {d0-d3},[%[src]]!;" | 863 "vld4.u8 {d0-d3},[%[src]]!;" |
800 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), | 864 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), |
801 [src] "+&r" (src) | 865 [src] "+&r" (src) |
802 : : | 866 : : |
803 ); | 867 ); |
804 vsrc.val[0] = d0; | 868 vsrc.val[0] = d0; |
805 vsrc.val[1] = d1; | 869 vsrc.val[1] = d1; |
806 vsrc.val[2] = d2; | 870 vsrc.val[2] = d2; |
807 vsrc.val[3] = d3; | 871 vsrc.val[3] = d3; |
808 #endif | 872 #endif |
| 873 #endif // #ifdef SK_CPU_ARM64 |
809 | 874 |
810 vdst32 = SkPixel16ToPixel32_neon8(vdst); | 875 vdst32 = SkPixel16ToPixel32_neon8(vdst); |
811 vres = procSIMD(vsrc, vdst32); | 876 vres = procSIMD(vsrc, vdst32); |
812 vres16 = SkPixel32ToPixel16_neon8(vres); | 877 vres16 = SkPixel32ToPixel16_neon8(vres); |
813 | 878 |
814 vst1q_u16(dst, vres16); | 879 vst1q_u16(dst, vres16); |
815 | 880 |
816 count -= 8; | 881 count -= 8; |
817 dst += 8; | 882 dst += 8; |
| 883 #ifdef SK_CPU_ARM64 |
| 884 src += 8; |
| 885 #endif |
818 } | 886 } |
819 for (int i = 0; i < count; i++) { | 887 for (int i = 0; i < count; i++) { |
820 SkPMColor dstC = SkPixel16ToPixel32(dst[i]); | 888 SkPMColor dstC = SkPixel16ToPixel32(dst[i]); |
821 dst[i] = SkPixel32ToPixel16_ToU16(proc(src[i], dstC)); | 889 dst[i] = SkPixel32ToPixel16_ToU16(proc(src[i], dstC)); |
822 } | 890 } |
823 } else { | 891 } else { |
824 for (int i = count - 1; i >= 0; --i) { | 892 for (int i = count - 1; i >= 0; --i) { |
825 unsigned a = aa[i]; | 893 unsigned a = aa[i]; |
826 if (0 != a) { | 894 if (0 != a) { |
827 SkPMColor dstC = SkPixel16ToPixel32(dst[i]); | 895 SkPMColor dstC = SkPixel16ToPixel32(dst[i]); |
(...skipping 100 matching lines...)
928 | 996 |
929 if (procSIMD != NULL) { | 997 if (procSIMD != NULL) { |
930 return SkNEW_ARGS(SkNEONProcCoeffXfermode, (rec, mode, procSIMD)); | 998 return SkNEW_ARGS(SkNEONProcCoeffXfermode, (rec, mode, procSIMD)); |
931 } | 999 } |
932 return NULL; | 1000 return NULL; |
933 } | 1001 } |
934 | 1002 |
935 SkXfermodeProc SkPlatformXfermodeProcFactory_impl_neon(SkXfermode::Mode mode) { | 1003 SkXfermodeProc SkPlatformXfermodeProcFactory_impl_neon(SkXfermode::Mode mode) { |
936 return gNEONXfermodeProcs1[mode]; | 1004 return gNEONXfermodeProcs1[mode]; |
937 } | 1005 } |