| OLD | NEW |
| 1 #include "SkXfermode.h" | 1 #include "SkXfermode.h" |
| 2 #include "SkXfermode_proccoeff.h" | 2 #include "SkXfermode_proccoeff.h" |
| 3 #include "SkColorPriv.h" | 3 #include "SkColorPriv.h" |
| 4 | 4 |
| 5 #include <arm_neon.h> | 5 #include <arm_neon.h> |
| 6 #include "SkColor_opts_neon.h" | 6 #include "SkColor_opts_neon.h" |
| 7 #include "SkXfermode_opts_arm_neon.h" | 7 #include "SkXfermode_opts_arm_neon.h" |
| 8 | 8 |
| 9 #define SkAlphaMulAlpha(a, b) SkMulDiv255Round(a, b) | 9 #define SkAlphaMulAlpha(a, b) SkMulDiv255Round(a, b) |
| 10 | 10 |
| (...skipping 23 matching lines...) |
| 34 ret = vaddq_u16(ret, vshrq_n_u16(ret, 8)); | 34 ret = vaddq_u16(ret, vshrq_n_u16(ret, 8)); |
| 35 | 35 |
| 36 ret = vshrq_n_u16(ret, 8); | 36 ret = vshrq_n_u16(ret, 8); |
| 37 | 37 |
| 38 return ret; | 38 return ret; |
| 39 } | 39 } |
| 40 | 40 |
| 41 static inline uint8x8_t SkDiv255Round_neon8_32_8(int32x4_t p1, int32x4_t p2) { | 41 static inline uint8x8_t SkDiv255Round_neon8_32_8(int32x4_t p1, int32x4_t p2) { |
| 42 uint16x8_t tmp; | 42 uint16x8_t tmp; |
| 43 | 43 |
| 44 #ifdef SK_CPU_ARM64 |
| 45 tmp = vmovn_high_u32(vmovn_u32(vreinterpretq_u32_s32(p1)), |
| 46 vreinterpretq_u32_s32(p2)); |
| 47 #else |
| 44 tmp = vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(p1)), | 48 tmp = vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(p1)), |
| 45 vmovn_u32(vreinterpretq_u32_s32(p2))); | 49 vmovn_u32(vreinterpretq_u32_s32(p2))); |
| 50 #endif |
| 46 | 51 |
| 47 tmp += vdupq_n_u16(128); | 52 tmp += vdupq_n_u16(128); |
| 48 tmp += vshrq_n_u16(tmp, 8); | 53 tmp += vshrq_n_u16(tmp, 8); |
| 49 | 54 |
| 50 return vshrn_n_u16(tmp, 8); | 55 return vshrn_n_u16(tmp, 8); |
| 51 } | 56 } |
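The recurring change in this patch is the AArch64-only narrowing idiom introduced at new lines 44-47: replacing two narrows plus a combine with `vmovn_high_u32`. A minimal sketch of the equivalence (ARM targets with NEON only; helper names here are illustrative, not part of the patch):

```cpp
#include <arm_neon.h>

// armv7 form: two XTN narrows plus an explicit register combine.
static inline uint16x8_t narrow_pair_armv7(uint32x4_t p1, uint32x4_t p2) {
    return vcombine_u16(vmovn_u32(p1), vmovn_u32(p2));
}

#ifdef __aarch64__
// AArch64 form: vmovn_high_u32 (XTN2) narrows p2 straight into the upper
// half of the vector that already holds the narrowed p1, saving the combine.
static inline uint16x8_t narrow_pair_arm64(uint32x4_t p1, uint32x4_t p2) {
    return vmovn_high_u32(vmovn_u32(p1), p2);
}
#endif
```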
| 52 | 57 |
| 53 static inline uint16x8_t SkDiv255Round_neon8_16_16(uint16x8_t prod) { | 58 static inline uint16x8_t SkDiv255Round_neon8_16_16(uint16x8_t prod) { |
| 54 prod += vdupq_n_u16(128); | 59 prod += vdupq_n_u16(128); |
| 55 prod += vshrq_n_u16(prod, 8); | 60 prod += vshrq_n_u16(prod, 8); |
| 56 | 61 |
| 57 return vshrq_n_u16(prod, 8); | 62 return vshrq_n_u16(prod, 8); |
| 58 } | 63 } |
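Both SkDiv255Round helpers vectorize the same round-to-nearest divide by 255; a scalar model of one lane (the function name is illustrative):

```cpp
#include <cstdint>

// (x + 128 + ((x + 128) >> 8)) >> 8 equals the rounded quotient x / 255 for any
// x in [0, 255*255], which is what the vector code above computes lane-wise.
static inline uint8_t div255_round_scalar(uint32_t x) {
    x += 128;
    x += x >> 8;
    return static_cast<uint8_t>(x >> 8);
}
```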
| 59 | 64 |
| 60 static inline uint8x8_t clamp_div255round_simd8_32(int32x4_t val1, int32x4_t val2) { | 65 static inline uint8x8_t clamp_div255round_simd8_32(int32x4_t val1, int32x4_t val2) { |
| 61 uint8x8_t ret; | 66 uint8x8_t ret; |
| 62 uint32x4_t cmp1, cmp2; | 67 uint32x4_t cmp1, cmp2; |
| 63 uint16x8_t cmp16; | 68 uint16x8_t cmp16; |
| 64 uint8x8_t cmp8, cmp8_1; | 69 uint8x8_t cmp8, cmp8_1; |
| 65 | 70 |
| 66 // Test if <= 0 | 71 // Test if <= 0 |
| 67 cmp1 = vcleq_s32(val1, vdupq_n_s32(0)); | 72 cmp1 = vcleq_s32(val1, vdupq_n_s32(0)); |
| 68 cmp2 = vcleq_s32(val2, vdupq_n_s32(0)); | 73 cmp2 = vcleq_s32(val2, vdupq_n_s32(0)); |
| 74 #ifdef SK_CPU_ARM64 |
| 75 cmp16 = vmovn_high_u32(vmovn_u32(cmp1), cmp2); |
| 76 #else |
| 69 cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2)); | 77 cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2)); |
| 78 #endif |
| 70 cmp8_1 = vmovn_u16(cmp16); | 79 cmp8_1 = vmovn_u16(cmp16); |
| 71 | 80 |
| 72 // Init to zero | 81 // Init to zero |
| 73 ret = vdup_n_u8(0); | 82 ret = vdup_n_u8(0); |
| 74 | 83 |
| 75 // Test if >= 255*255 | 84 // Test if >= 255*255 |
| 76 cmp1 = vcgeq_s32(val1, vdupq_n_s32(255*255)); | 85 cmp1 = vcgeq_s32(val1, vdupq_n_s32(255*255)); |
| 77 cmp2 = vcgeq_s32(val2, vdupq_n_s32(255*255)); | 86 cmp2 = vcgeq_s32(val2, vdupq_n_s32(255*255)); |
| 87 #ifdef SK_CPU_ARM64 |
| 88 cmp16 = vmovn_high_u32(vmovn_u32(cmp1), cmp2); |
| 89 #else |
| 78 cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2)); | 90 cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2)); |
| 91 #endif |
| 79 cmp8 = vmovn_u16(cmp16); | 92 cmp8 = vmovn_u16(cmp16); |
| 80 | 93 |
| 81 // Insert 255 where true | 94 // Insert 255 where true |
| 82 ret = vbsl_u8(cmp8, vdup_n_u8(255), ret); | 95 ret = vbsl_u8(cmp8, vdup_n_u8(255), ret); |
| 83 | 96 |
| 84 // Calc SkDiv255Round | 97 // Calc SkDiv255Round |
| 85 uint8x8_t div = SkDiv255Round_neon8_32_8(val1, val2); | 98 uint8x8_t div = SkDiv255Round_neon8_32_8(val1, val2); |
| 86 | 99 |
| 87 // Insert where false and previous test false | 100 // Insert where false and previous test false |
| 88 cmp8 = cmp8 | cmp8_1; | 101 cmp8 = cmp8 | cmp8_1; |
| (...skipping 313 matching lines...) |
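The visible comparisons (<= 0 and >= 255*255) plus the bit-selects implement a clamped rounding divide; a scalar sketch of what one lane of clamp_div255round_simd8_32 computes (illustrative name, the vectorized tail is in the skipped lines):

```cpp
#include <cstdint>

// Scalar model of one lane: negative products clamp to 0, products of
// 255*255 or more clamp to 255, everything else gets the rounded divide by 255.
static inline uint8_t clamp_div255round_scalar(int32_t prod) {
    if (prod <= 0) {
        return 0;
    }
    if (prod >= 255 * 255) {
        return 255;
    }
    prod += 128;
    prod += prod >> 8;
    return static_cast<uint8_t>(prod >> 8);
}
```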
| 402 uint16x8_t scda = vmull_u8(sc, da); | 415 uint16x8_t scda = vmull_u8(sc, da); |
| 403 uint16x8_t dcsa = vmull_u8(dc, sa); | 416 uint16x8_t dcsa = vmull_u8(dc, sa); |
| 404 uint16x8_t sada = vmull_u8(sa, da); | 417 uint16x8_t sada = vmull_u8(sa, da); |
| 405 | 418 |
| 406 // Prepare non common subexpressions | 419 // Prepare non common subexpressions |
| 407 uint16x8_t dc2, sc2; | 420 uint16x8_t dc2, sc2; |
| 408 uint32x4_t scdc2_1, scdc2_2; | 421 uint32x4_t scdc2_1, scdc2_2; |
| 409 if (overlay) { | 422 if (overlay) { |
| 410 dc2 = vshll_n_u8(dc, 1); | 423 dc2 = vshll_n_u8(dc, 1); |
| 411 scdc2_1 = vmull_u16(vget_low_u16(dc2), vget_low_u16(vmovl_u8(sc))); | 424 scdc2_1 = vmull_u16(vget_low_u16(dc2), vget_low_u16(vmovl_u8(sc))); |
| 425 #ifdef SK_CPU_ARM64 |
| 426 scdc2_2 = vmull_high_u16(dc2, vmovl_u8(sc)); |
| 427 #else |
| 412 scdc2_2 = vmull_u16(vget_high_u16(dc2), vget_high_u16(vmovl_u8(sc))); | 428 scdc2_2 = vmull_u16(vget_high_u16(dc2), vget_high_u16(vmovl_u8(sc))); |
| 429 #endif |
| 413 } else { | 430 } else { |
| 414 sc2 = vshll_n_u8(sc, 1); | 431 sc2 = vshll_n_u8(sc, 1); |
| 415 scdc2_1 = vmull_u16(vget_low_u16(sc2), vget_low_u16(vmovl_u8(dc))); | 432 scdc2_1 = vmull_u16(vget_low_u16(sc2), vget_low_u16(vmovl_u8(dc))); |
| 433 #ifdef SK_CPU_ARM64 |
| 434 scdc2_2 = vmull_high_u16(sc2, vmovl_u8(dc)); |
| 435 #else |
| 416 scdc2_2 = vmull_u16(vget_high_u16(sc2), vget_high_u16(vmovl_u8(dc))); | 436 scdc2_2 = vmull_u16(vget_high_u16(sc2), vget_high_u16(vmovl_u8(dc))); |
| 437 #endif |
| 417 } | 438 } |
| 418 | 439 |
| 419 // Calc COM | 440 // Calc COM |
| 420 int32x4_t com1, com2; | 441 int32x4_t com1, com2; |
| 421 com1 = vreinterpretq_s32_u32( | 442 com1 = vreinterpretq_s32_u32( |
| 422 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc))); | 443 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc))); |
| 423 com2 = vreinterpretq_s32_u32( | 444 com2 = vreinterpretq_s32_u32( |
| 445 #ifdef SK_CPU_ARM64 |
| 446 vmull_high_u16(const255, sc_plus_dc)); |
| 447 #else |
| 424 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc))); | 448 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc))); |
| 449 #endif |
| 425 | 450 |
| 426 // Calc SUB | 451 // Calc SUB |
| 427 int32x4_t sub1, sub2; | 452 int32x4_t sub1, sub2; |
| 428 sub1 = vreinterpretq_s32_u32(vaddl_u16(vget_low_u16(scda), vget_low_u16(dcsa))); | 453 sub1 = vreinterpretq_s32_u32(vaddl_u16(vget_low_u16(scda), vget_low_u16(dcsa))); |
| 454 #ifdef SK_CPU_ARM64 |
| 455 sub2 = vreinterpretq_s32_u32(vaddl_high_u16(scda, dcsa)); |
| 456 #else |
| 429 sub2 = vreinterpretq_s32_u32(vaddl_u16(vget_high_u16(scda), vget_high_u16(dcsa))); | 457 sub2 = vreinterpretq_s32_u32(vaddl_u16(vget_high_u16(scda), vget_high_u16(dcsa))); |
| 458 #endif |
| 430 sub1 = vsubq_s32(sub1, vreinterpretq_s32_u32(scdc2_1)); | 459 sub1 = vsubq_s32(sub1, vreinterpretq_s32_u32(scdc2_1)); |
| 431 sub2 = vsubq_s32(sub2, vreinterpretq_s32_u32(scdc2_2)); | 460 sub2 = vsubq_s32(sub2, vreinterpretq_s32_u32(scdc2_2)); |
| 432 | 461 |
| 433 // Compare 2*dc <= da | 462 // Compare 2*dc <= da |
| 434 uint16x8_t cmp; | 463 uint16x8_t cmp; |
| 435 | 464 |
| 436 if (overlay) { | 465 if (overlay) { |
| 437 cmp = vcleq_u16(dc2, vmovl_u8(da)); | 466 cmp = vcleq_u16(dc2, vmovl_u8(da)); |
| 438 } else { | 467 } else { |
| 439 cmp = vcleq_u16(sc2, vmovl_u8(sa)); | 468 cmp = vcleq_u16(sc2, vmovl_u8(sa)); |
| 440 } | 469 } |
| 441 | 470 |
| 442 // Prepare variables | 471 // Prepare variables |
| 443 int32x4_t val1_1, val1_2; | 472 int32x4_t val1_1, val1_2; |
| 444 int32x4_t val2_1, val2_2; | 473 int32x4_t val2_1, val2_2; |
| 445 uint32x4_t cmp1, cmp2; | 474 uint32x4_t cmp1, cmp2; |
| 446 | 475 |
| 447 cmp1 = vmovl_u16(vget_low_u16(cmp)); | 476 // Doing a signed lengthening allows us to save a few instructions |
| 448 cmp1 |= vshlq_n_u32(cmp1, 16); | 477 // thanks to sign extension. |
| 449 cmp2 = vmovl_u16(vget_high_u16(cmp)); | 478 cmp1 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_low_u16(cmp)))); |
| 450 cmp2 |= vshlq_n_u32(cmp2, 16); | 479 #ifdef SK_CPU_ARM64 |
| 480 cmp2 = vreinterpretq_u32_s32(vmovl_high_s16(vreinterpretq_s16_u16(cmp))); |
| 481 #else |
| 482 cmp2 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_high_u16(cmp)))); |
| 483 #endif |
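The new comment above is the key point of this hunk: a 16-bit comparison mask is all-zeros or all-ones, so a signed widen alone produces the 32-bit select mask that the old code needed an extra shift-and-OR to build. A scalar sketch of the two forms (names are illustrative):

```cpp
#include <cstdint>

// Old approach: zero-extend the 0x0000/0xFFFF mask, then OR in a copy
// shifted left by 16 to fill the upper half.
static inline uint32_t widen_mask_unsigned(uint16_t m) {
    uint32_t w = m;
    return w | (w << 16);
}

// New approach: sign-extend, which turns 0xFFFF into 0xFFFFFFFF (and leaves
// 0x0000 as 0x00000000) in a single widening move.
static inline uint32_t widen_mask_signed(uint16_t m) {
    return static_cast<uint32_t>(static_cast<int32_t>(static_cast<int16_t>(m)));
}
```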
| 451 | 484 |
| 452 // Calc COM - SUB | 485 // Calc COM - SUB |
| 453 val1_1 = com1 - sub1; | 486 val1_1 = com1 - sub1; |
| 454 val1_2 = com2 - sub2; | 487 val1_2 = com2 - sub2; |
| 455 | 488 |
| 456 // Calc COM + SUB - sa*da | 489 // Calc COM + SUB - sa*da |
| 457 val2_1 = com1 + sub1; | 490 val2_1 = com1 + sub1; |
| 458 val2_2 = com2 + sub2; | 491 val2_2 = com2 + sub2; |
| 459 | 492 |
| 460 val2_1 = vsubq_s32(val2_1, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sada)))); | 493 val2_1 = vsubq_s32(val2_1, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sada)))); |
| 494 #ifdef SK_CPU_ARM64 |
| 495 val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_high_u16(sada))); |
| 496 #else |
| 461 val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sada)))); | 497 val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sada)))); |
| 498 #endif |
| 462 | 499 |
| 463 // Insert where needed | 500 // Insert where needed |
| 464 val1_1 = vbslq_s32(cmp1, val1_1, val2_1); | 501 val1_1 = vbslq_s32(cmp1, val1_1, val2_1); |
| 465 val1_2 = vbslq_s32(cmp2, val1_2, val2_2); | 502 val1_2 = vbslq_s32(cmp2, val1_2, val2_2); |
| 466 | 503 |
| 467 // Call the clamp_div255round function | 504 // Call the clamp_div255round function |
| 468 return clamp_div255round_simd8_32(val1_1, val1_2); | 505 return clamp_div255round_simd8_32(val1_1, val1_2); |
| 469 } | 506 } |
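For reference, COM = 255*(sc + dc), SUB = sc*da + dc*sa - 2*sc*dc, and the select on 2*sc <= sa (or 2*dc <= da for overlay) reproduce the per-channel hard-light blend. A scalar sketch consistent with those terms (helper names are illustrative; overlay is the same blend with source and destination swapped):

```cpp
#include <cstdint>

static inline uint8_t clamp_div255round_scalar(int32_t prod) {
    if (prod <= 0) return 0;
    if (prod >= 255 * 255) return 255;
    prod += 128;
    prod += prod >> 8;
    return static_cast<uint8_t>(prod >> 8);
}

// COM - SUB          == 2*sc*dc + sc*(255 - da) + dc*(255 - sa)
// COM + SUB - sa*da  == sa*da - 2*(da - dc)*(sa - sc) + sc*(255 - da) + dc*(255 - sa)
static inline uint8_t hardlight_byte_scalar(int sc, int dc, int sa, int da) {
    int rc;
    if (2 * sc <= sa) {
        rc = 2 * sc * dc;
    } else {
        rc = sa * da - 2 * (da - dc) * (sa - sc);
    }
    return clamp_div255round_scalar(rc + sc * (255 - da) + dc * (255 - sa));
}
```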
| 470 | 507 |
| 471 static inline uint8x8_t overlay_color(uint8x8_t sc, uint8x8_t dc, | 508 static inline uint8x8_t overlay_color(uint8x8_t sc, uint8x8_t dc, |
| (...skipping 149 matching lines...) |
| 621 sc_plus_dc = vaddl_u8(sc, dc); | 658 sc_plus_dc = vaddl_u8(sc, dc); |
| 622 scdc = vmull_u8(sc, dc); | 659 scdc = vmull_u8(sc, dc); |
| 623 | 660 |
| 624 /* Prepare constants */ | 661 /* Prepare constants */ |
| 625 const255 = vdupq_n_u16(255); | 662 const255 = vdupq_n_u16(255); |
| 626 | 663 |
| 627 /* Calc the first term */ | 664 /* Calc the first term */ |
| 628 term1_1 = vreinterpretq_s32_u32( | 665 term1_1 = vreinterpretq_s32_u32( |
| 629 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc))); | 666 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc))); |
| 630 term1_2 = vreinterpretq_s32_u32( | 667 term1_2 = vreinterpretq_s32_u32( |
| 668 #ifdef SK_CPU_ARM64 |
| 669 vmull_high_u16(const255, sc_plus_dc)); |
| 670 #else |
| 631 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc))); | 671 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc))); |
| 672 #endif |
| 632 | 673 |
| 633 /* Calc the second term */ | 674 /* Calc the second term */ |
| 634 term2_1 = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(scdc), 1)); | 675 term2_1 = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(scdc), 1)); |
| 676 #ifdef SK_CPU_ARM64 |
| 677 term2_2 = vreinterpretq_s32_u32(vshll_high_n_u16(scdc, 1)); |
| 678 #else |
| 635 term2_2 = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(scdc), 1)); | 679 term2_2 = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(scdc), 1)); |
| 680 #endif |
| 636 | 681 |
| 637 return clamp_div255round_simd8_32(term1_1 - term2_1, term1_2 - term2_2); | 682 return clamp_div255round_simd8_32(term1_1 - term2_1, term1_2 - term2_2); |
| 638 } | 683 } |
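term1 - term2 above works out to 255*(sc + dc) - 2*sc*dc; a scalar sketch of one exclusion channel (illustrative name):

```cpp
#include <cstdint>

// Exclusion per channel: (255*(sc + dc) - 2*sc*dc) / 255, rounded and clamped,
// matching the term1 - term2 expression fed to clamp_div255round_simd8_32 above.
static inline uint8_t exclusion_byte_scalar(int sc, int dc) {
    int r = 255 * (sc + dc) - 2 * sc * dc;
    if (r <= 0) return 0;
    if (r >= 255 * 255) return 255;
    r += 128;
    r += r >> 8;
    return static_cast<uint8_t>(r >> 8);
}
```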
| 639 | 684 |
| 640 uint8x8x4_t exclusion_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | 685 uint8x8x4_t exclusion_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { |
| 641 uint8x8x4_t ret; | 686 uint8x8x4_t ret; |
| 642 | 687 |
| 643 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); | 688 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); |
| 644 ret.val[NEON_R] = exclusion_color(src.val[NEON_R], dst.val[NEON_R], | 689 ret.val[NEON_R] = exclusion_color(src.val[NEON_R], dst.val[NEON_R], |
| 645 src.val[NEON_A], dst.val[NEON_A]); | 690 src.val[NEON_A], dst.val[NEON_A]); |
| 646 ret.val[NEON_G] = exclusion_color(src.val[NEON_G], dst.val[NEON_G], | 691 ret.val[NEON_G] = exclusion_color(src.val[NEON_G], dst.val[NEON_G], |
| 647 src.val[NEON_A], dst.val[NEON_A]); | 692 src.val[NEON_A], dst.val[NEON_A]); |
| 648 ret.val[NEON_B] = exclusion_color(src.val[NEON_B], dst.val[NEON_B], | 693 ret.val[NEON_B] = exclusion_color(src.val[NEON_B], dst.val[NEON_B], |
| 649 src.val[NEON_A], dst.val[NEON_A]); | 694 src.val[NEON_A], dst.val[NEON_A]); |
| 650 | 695 |
| 651 return ret; | 696 return ret; |
| 652 } | 697 } |
| 653 | 698 |
| 654 static inline uint8x8_t blendfunc_multiply_color(uint8x8_t sc, uint8x8_t dc, | 699 static inline uint8x8_t blendfunc_multiply_color(uint8x8_t sc, uint8x8_t dc, |
| 655 uint8x8_t sa, uint8x8_t da) { | 700 uint8x8_t sa, uint8x8_t da) { |
| 656 uint32x4_t val1, val2; | 701 uint32x4_t val1, val2; |
| 657 uint16x8_t scdc, t1, t2; | 702 uint16x8_t scdc, t1, t2; |
| 658 | 703 |
| 659 t1 = vmull_u8(sc, vdup_n_u8(255) - da); | 704 t1 = vmull_u8(sc, vdup_n_u8(255) - da); |
| 660 t2 = vmull_u8(dc, vdup_n_u8(255) - sa); | 705 t2 = vmull_u8(dc, vdup_n_u8(255) - sa); |
| 661 scdc = vmull_u8(sc, dc); | 706 scdc = vmull_u8(sc, dc); |
| 662 | 707 |
| 663 val1 = vaddl_u16(vget_low_u16(t1), vget_low_u16(t2)); | 708 val1 = vaddl_u16(vget_low_u16(t1), vget_low_u16(t2)); |
| 709 #ifdef SK_CPU_ARM64 |
| 710 val2 = vaddl_high_u16(t1, t2); |
| 711 #else |
| 664 val2 = vaddl_u16(vget_high_u16(t1), vget_high_u16(t2)); | 712 val2 = vaddl_u16(vget_high_u16(t1), vget_high_u16(t2)); |
| 713 #endif |
| 665 | 714 |
| 666 val1 = vaddw_u16(val1, vget_low_u16(scdc)); | 715 val1 = vaddw_u16(val1, vget_low_u16(scdc)); |
| 716 #ifdef SK_CPU_ARM64 |
| 717 val2 = vaddw_high_u16(val2, scdc); |
| 718 #else |
| 667 val2 = vaddw_u16(val2, vget_high_u16(scdc)); | 719 val2 = vaddw_u16(val2, vget_high_u16(scdc)); |
| 720 #endif |
| 668 | 721 |
| 669 return clamp_div255round_simd8_32( | 722 return clamp_div255round_simd8_32( |
| 670 vreinterpretq_s32_u32(val1), vreinterpretq_s32_u32(val2)); | 723 vreinterpretq_s32_u32(val1), vreinterpretq_s32_u32(val2)); |
| 671 } | 724 } |
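The three products above (t1, t2, scdc) sum to the SVG multiply blend; a scalar sketch of one channel (illustrative name):

```cpp
#include <cstdint>

// Multiply per channel: sc*(255 - da) + dc*(255 - sa) + sc*dc, then the same
// clamped rounding divide by 255 used by the other blend helpers.
static inline uint8_t multiply_byte_scalar(int sc, int dc, int sa, int da) {
    int r = sc * (255 - da) + dc * (255 - sa) + sc * dc;
    if (r <= 0) return 0;
    if (r >= 255 * 255) return 255;
    r += 128;
    r += r >> 8;
    return static_cast<uint8_t>(r >> 8);
}
```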
| 672 | 725 |
| 673 uint8x8x4_t multiply_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | 726 uint8x8x4_t multiply_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { |
| 674 uint8x8x4_t ret; | 727 uint8x8x4_t ret; |
| 675 | 728 |
| 676 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); | 729 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); |
| 677 ret.val[NEON_R] = blendfunc_multiply_color(src.val[NEON_R], dst.val[NEON_R], | 730 ret.val[NEON_R] = blendfunc_multiply_color(src.val[NEON_R], dst.val[NEON_R], |
| (...skipping 23 matching lines...) |
| 701 | 754 |
| 702 SkXfermodeProc proc = this->getProc(); | 755 SkXfermodeProc proc = this->getProc(); |
| 703 SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD); | 756 SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD); |
| 704 SkASSERT(procSIMD != NULL); | 757 SkASSERT(procSIMD != NULL); |
| 705 | 758 |
| 706 if (NULL == aa) { | 759 if (NULL == aa) { |
| 707 // Unrolled NEON code | 760 // Unrolled NEON code |
| 708 while (count >= 8) { | 761 while (count >= 8) { |
| 709 uint8x8x4_t vsrc, vdst, vres; | 762 uint8x8x4_t vsrc, vdst, vres; |
| 710 | 763 |
| 764 #ifdef SK_CPU_ARM64 |
| 765 vsrc = vld4_u8((uint8_t*)src); |
| 766 vdst = vld4_u8((uint8_t*)dst); |
| 767 #else |
| 711 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) | 768 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) |
| 712 asm volatile ( | 769 asm volatile ( |
| 713 "vld4.u8 %h[vsrc], [%[src]]! \t\n" | 770 "vld4.u8 %h[vsrc], [%[src]]! \t\n" |
| 714 "vld4.u8 %h[vdst], [%[dst]] \t\n" | 771 "vld4.u8 %h[vdst], [%[dst]] \t\n" |
| 715 : [vsrc] "=w" (vsrc), [vdst] "=w" (vdst), [src] "+&r" (src) | 772 : [vsrc] "=w" (vsrc), [vdst] "=w" (vdst), [src] "+&r" (src) |
| 716 : [dst] "r" (dst) | 773 : [dst] "r" (dst) |
| 717 : | 774 : |
| 718 ); | 775 ); |
| 719 #else | 776 #else |
| 720 register uint8x8_t d0 asm("d0"); | 777 register uint8x8_t d0 asm("d0"); |
| (...skipping 12 matching lines...) |
| 733 "=w" (d4), "=w" (d5), "=w" (d6), "=w" (d7), | 790 "=w" (d4), "=w" (d5), "=w" (d6), "=w" (d7), |
| 734 [src] "+&r" (src) | 791 [src] "+&r" (src) |
| 735 : [dst] "r" (dst) | 792 : [dst] "r" (dst) |
| 736 : | 793 : |
| 737 ); | 794 ); |
| 738 vsrc.val[0] = d0; vdst.val[0] = d4; | 795 vsrc.val[0] = d0; vdst.val[0] = d4; |
| 739 vsrc.val[1] = d1; vdst.val[1] = d5; | 796 vsrc.val[1] = d1; vdst.val[1] = d5; |
| 740 vsrc.val[2] = d2; vdst.val[2] = d6; | 797 vsrc.val[2] = d2; vdst.val[2] = d6; |
| 741 vsrc.val[3] = d3; vdst.val[3] = d7; | 798 vsrc.val[3] = d3; vdst.val[3] = d7; |
| 742 #endif | 799 #endif |
| 800 #endif // #ifdef SK_CPU_ARM64 |
| 743 | 801 |
| 744 vres = procSIMD(vsrc, vdst); | 802 vres = procSIMD(vsrc, vdst); |
| 745 | 803 |
| 746 vst4_u8((uint8_t*)dst, vres); | 804 vst4_u8((uint8_t*)dst, vres); |
| 747 | 805 |
| 748 count -= 8; | 806 count -= 8; |
| 749 dst += 8; | 807 dst += 8; |
| 808 #ifdef SK_CPU_ARM64 |
| 809 src += 8; |
| 810 #endif |
| 750 } | 811 } |
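The extra `src += 8` in the AArch64 branch compensates for the pointer post-increment that the armv7 inline asm performs via `[%[src]]!`; a minimal sketch of the intrinsic load-and-advance (ARM targets only, assuming 8 RGBA pixels per iteration; the helper name is illustrative):

```cpp
#include <arm_neon.h>
#include <cstdint>

// vld4_u8 de-interleaves 8 RGBA pixels into four uint8x8_t lanes but does not
// touch the pointer, so the caller bumps it explicitly by 8 pixels.
static inline uint8x8x4_t load8_pixels_and_advance(const uint32_t*& src) {
    uint8x8x4_t v = vld4_u8(reinterpret_cast<const uint8_t*>(src));
    src += 8;
    return v;
}
```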
| 751 // Leftovers | 812 // Leftovers |
| 752 for (int i = 0; i < count; i++) { | 813 for (int i = 0; i < count; i++) { |
| 753 dst[i] = proc(src[i], dst[i]); | 814 dst[i] = proc(src[i], dst[i]); |
| 754 } | 815 } |
| 755 } else { | 816 } else { |
| 756 for (int i = count - 1; i >= 0; --i) { | 817 for (int i = count - 1; i >= 0; --i) { |
| 757 unsigned a = aa[i]; | 818 unsigned a = aa[i]; |
| 758 if (0 != a) { | 819 if (0 != a) { |
| 759 SkPMColor dstC = dst[i]; | 820 SkPMColor dstC = dst[i]; |
| (...skipping 16 matching lines...) |
| 776 SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD); | 837 SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD); |
| 777 SkASSERT(procSIMD != NULL); | 838 SkASSERT(procSIMD != NULL); |
| 778 | 839 |
| 779 if (NULL == aa) { | 840 if (NULL == aa) { |
| 780 while(count >= 8) { | 841 while(count >= 8) { |
| 781 uint16x8_t vdst, vres16; | 842 uint16x8_t vdst, vres16; |
| 782 uint8x8x4_t vdst32, vsrc, vres; | 843 uint8x8x4_t vdst32, vsrc, vres; |
| 783 | 844 |
| 784 vdst = vld1q_u16(dst); | 845 vdst = vld1q_u16(dst); |
| 785 | 846 |
| 847 #ifdef SK_CPU_ARM64 |
| 848 vsrc = vld4_u8((uint8_t*)src); |
| 849 #else |
| 786 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) | 850 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) |
| 787 asm volatile ( | 851 asm volatile ( |
| 788 "vld4.u8 %h[vsrc], [%[src]]! \t\n" | 852 "vld4.u8 %h[vsrc], [%[src]]! \t\n" |
| 789 : [vsrc] "=w" (vsrc), [src] "+&r" (src) | 853 : [vsrc] "=w" (vsrc), [src] "+&r" (src) |
| 790 : : | 854 : : |
| 791 ); | 855 ); |
| 792 #else | 856 #else |
| 793 register uint8x8_t d0 asm("d0"); | 857 register uint8x8_t d0 asm("d0"); |
| 794 register uint8x8_t d1 asm("d1"); | 858 register uint8x8_t d1 asm("d1"); |
| 795 register uint8x8_t d2 asm("d2"); | 859 register uint8x8_t d2 asm("d2"); |
| 796 register uint8x8_t d3 asm("d3"); | 860 register uint8x8_t d3 asm("d3"); |
| 797 | 861 |
| 798 asm volatile ( | 862 asm volatile ( |
| 799 "vld4.u8 {d0-d3},[%[src]]!;" | 863 "vld4.u8 {d0-d3},[%[src]]!;" |
| 800 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), | 864 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), |
| 801 [src] "+&r" (src) | 865 [src] "+&r" (src) |
| 802 : : | 866 : : |
| 803 ); | 867 ); |
| 804 vsrc.val[0] = d0; | 868 vsrc.val[0] = d0; |
| 805 vsrc.val[1] = d1; | 869 vsrc.val[1] = d1; |
| 806 vsrc.val[2] = d2; | 870 vsrc.val[2] = d2; |
| 807 vsrc.val[3] = d3; | 871 vsrc.val[3] = d3; |
| 808 #endif | 872 #endif |
| 873 #endif // #ifdef SK_CPU_ARM64 |
| 809 | 874 |
| 810 vdst32 = SkPixel16ToPixel32_neon8(vdst); | 875 vdst32 = SkPixel16ToPixel32_neon8(vdst); |
| 811 vres = procSIMD(vsrc, vdst32); | 876 vres = procSIMD(vsrc, vdst32); |
| 812 vres16 = SkPixel32ToPixel16_neon8(vres); | 877 vres16 = SkPixel32ToPixel16_neon8(vres); |
| 813 | 878 |
| 814 vst1q_u16(dst, vres16); | 879 vst1q_u16(dst, vres16); |
| 815 | 880 |
| 816 count -= 8; | 881 count -= 8; |
| 817 dst += 8; | 882 dst += 8; |
| 883 #ifdef SK_CPU_ARM64 |
| 884 src += 8; |
| 885 #endif |
| 818 } | 886 } |
| 819 for (int i = 0; i < count; i++) { | 887 for (int i = 0; i < count; i++) { |
| 820 SkPMColor dstC = SkPixel16ToPixel32(dst[i]); | 888 SkPMColor dstC = SkPixel16ToPixel32(dst[i]); |
| 821 dst[i] = SkPixel32ToPixel16_ToU16(proc(src[i], dstC)); | 889 dst[i] = SkPixel32ToPixel16_ToU16(proc(src[i], dstC)); |
| 822 } | 890 } |
| 823 } else { | 891 } else { |
| 824 for (int i = count - 1; i >= 0; --i) { | 892 for (int i = count - 1; i >= 0; --i) { |
| 825 unsigned a = aa[i]; | 893 unsigned a = aa[i]; |
| 826 if (0 != a) { | 894 if (0 != a) { |
| 827 SkPMColor dstC = SkPixel16ToPixel32(dst[i]); | 895 SkPMColor dstC = SkPixel16ToPixel32(dst[i]); |
| (...skipping 100 matching lines...) |
| 928 | 996 |
| 929 if (procSIMD != NULL) { | 997 if (procSIMD != NULL) { |
| 930 return SkNEW_ARGS(SkNEONProcCoeffXfermode, (rec, mode, procSIMD)); | 998 return SkNEW_ARGS(SkNEONProcCoeffXfermode, (rec, mode, procSIMD)); |
| 931 } | 999 } |
| 932 return NULL; | 1000 return NULL; |
| 933 } | 1001 } |
| 934 | 1002 |
| 935 SkXfermodeProc SkPlatformXfermodeProcFactory_impl_neon(SkXfermode::Mode mode) { | 1003 SkXfermodeProc SkPlatformXfermodeProcFactory_impl_neon(SkXfermode::Mode mode) { |
| 936 return gNEONXfermodeProcs1[mode]; | 1004 return gNEONXfermodeProcs1[mode]; |
| 937 } | 1005 } |