| OLD | NEW |
| 1 #include "SkXfermode.h" | 1 #include "SkXfermode.h" |
| 2 #include "SkXfermode_proccoeff.h" | 2 #include "SkXfermode_proccoeff.h" |
| 3 #include "SkColorPriv.h" | 3 #include "SkColorPriv.h" |
| 4 | 4 |
| 5 #include <arm_neon.h> | 5 #include <arm_neon.h> |
| 6 #include "SkColor_opts_neon.h" | 6 #include "SkColor_opts_neon.h" |
| 7 #include "SkXfermode_opts_arm_neon.h" | 7 #include "SkXfermode_opts_arm_neon.h" |
| 8 | 8 |
| 9 #define SkAlphaMulAlpha(a, b) SkMulDiv255Round(a, b) | 9 #define SkAlphaMulAlpha(a, b) SkMulDiv255Round(a, b) |
| 10 | 10 |
| (...skipping 23 matching lines...) |
| 34 ret = vaddq_u16(ret, vshrq_n_u16(ret, 8)); | 34 ret = vaddq_u16(ret, vshrq_n_u16(ret, 8)); |
| 35 | 35 |
| 36 ret = vshrq_n_u16(ret, 8); | 36 ret = vshrq_n_u16(ret, 8); |
| 37 | 37 |
| 38 return ret; | 38 return ret; |
| 39 } | 39 } |
| 40 | 40 |
| 41 static inline uint8x8_t SkDiv255Round_neon8_32_8(int32x4_t p1, int32x4_t p2) { | 41 static inline uint8x8_t SkDiv255Round_neon8_32_8(int32x4_t p1, int32x4_t p2) { |
| 42 uint16x8_t tmp; | 42 uint16x8_t tmp; |
| 43 | 43 |
| 44 #ifdef SK_CPU_ARM64 | |
| 45 tmp = vmovn_high_u32(vmovn_u32(vreinterpretq_u32_s32(p1)), | |
| 46 vreinterpretq_u32_s32(p2)); | |
| 47 #else | |
| 48 tmp = vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(p1)), | 44 tmp = vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(p1)), |
| 49 vmovn_u32(vreinterpretq_u32_s32(p2))); | 45 vmovn_u32(vreinterpretq_u32_s32(p2))); |
| 50 #endif | |
| 51 | 46 |
| 52 tmp += vdupq_n_u16(128); | 47 tmp += vdupq_n_u16(128); |
| 53 tmp += vshrq_n_u16(tmp, 8); | 48 tmp += vshrq_n_u16(tmp, 8); |
| 54 | 49 |
| 55 return vshrn_n_u16(tmp, 8); | 50 return vshrn_n_u16(tmp, 8); |
| 56 } | 51 } |
| 57 | 52 |
| 58 static inline uint16x8_t SkDiv255Round_neon8_16_16(uint16x8_t prod) { | 53 static inline uint16x8_t SkDiv255Round_neon8_16_16(uint16x8_t prod) { |
| 59 prod += vdupq_n_u16(128); | 54 prod += vdupq_n_u16(128); |
| 60 prod += vshrq_n_u16(prod, 8); | 55 prod += vshrq_n_u16(prod, 8); |
| 61 | 56 |
| 62 return vshrq_n_u16(prod, 8); | 57 return vshrq_n_u16(prod, 8); |
| 63 } | 58 } |
| 64 | 59 |
| 65 static inline uint8x8_t clamp_div255round_simd8_32(int32x4_t val1, int32x4_t val2) { | 60 static inline uint8x8_t clamp_div255round_simd8_32(int32x4_t val1, int32x4_t val2) { |
| 66 uint8x8_t ret; | 61 uint8x8_t ret; |
| 67 uint32x4_t cmp1, cmp2; | 62 uint32x4_t cmp1, cmp2; |
| 68 uint16x8_t cmp16; | 63 uint16x8_t cmp16; |
| 69 uint8x8_t cmp8, cmp8_1; | 64 uint8x8_t cmp8, cmp8_1; |
| 70 | 65 |
| 71 // Test if <= 0 | 66 // Test if <= 0 |
| 72 cmp1 = vcleq_s32(val1, vdupq_n_s32(0)); | 67 cmp1 = vcleq_s32(val1, vdupq_n_s32(0)); |
| 73 cmp2 = vcleq_s32(val2, vdupq_n_s32(0)); | 68 cmp2 = vcleq_s32(val2, vdupq_n_s32(0)); |
| 74 #ifdef SK_CPU_ARM64 | |
| 75 cmp16 = vmovn_high_u32(vmovn_u32(cmp1), cmp2); | |
| 76 #else | |
| 77 cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2)); | 69 cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2)); |
| 78 #endif | |
| 79 cmp8_1 = vmovn_u16(cmp16); | 70 cmp8_1 = vmovn_u16(cmp16); |
| 80 | 71 |
| 81 // Init to zero | 72 // Init to zero |
| 82 ret = vdup_n_u8(0); | 73 ret = vdup_n_u8(0); |
| 83 | 74 |
| 84 // Test if >= 255*255 | 75 // Test if >= 255*255 |
| 85 cmp1 = vcgeq_s32(val1, vdupq_n_s32(255*255)); | 76 cmp1 = vcgeq_s32(val1, vdupq_n_s32(255*255)); |
| 86 cmp2 = vcgeq_s32(val2, vdupq_n_s32(255*255)); | 77 cmp2 = vcgeq_s32(val2, vdupq_n_s32(255*255)); |
| 87 #ifdef SK_CPU_ARM64 | |
| 88 cmp16 = vmovn_high_u32(vmovn_u32(cmp1), cmp2); | |
| 89 #else | |
| 90 cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2)); | 78 cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2)); |
| 91 #endif | |
| 92 cmp8 = vmovn_u16(cmp16); | 79 cmp8 = vmovn_u16(cmp16); |
| 93 | 80 |
| 94 // Insert 255 where true | 81 // Insert 255 where true |
| 95 ret = vbsl_u8(cmp8, vdup_n_u8(255), ret); | 82 ret = vbsl_u8(cmp8, vdup_n_u8(255), ret); |
| 96 | 83 |
| 97 // Calc SkDiv255Round | 84 // Calc SkDiv255Round |
| 98 uint8x8_t div = SkDiv255Round_neon8_32_8(val1, val2); | 85 uint8x8_t div = SkDiv255Round_neon8_32_8(val1, val2); |
| 99 | 86 |
| 100 // Insert where false and previous test false | 87 // Insert where false and previous test false |
| 101 cmp8 = cmp8 | cmp8_1; | 88 cmp8 = cmp8 | cmp8_1; |
| (...skipping 313 matching lines...) |
| 415 uint16x8_t scda = vmull_u8(sc, da); | 402 uint16x8_t scda = vmull_u8(sc, da); |
| 416 uint16x8_t dcsa = vmull_u8(dc, sa); | 403 uint16x8_t dcsa = vmull_u8(dc, sa); |
| 417 uint16x8_t sada = vmull_u8(sa, da); | 404 uint16x8_t sada = vmull_u8(sa, da); |
| 418 | 405 |
| 419 // Prepare non common subexpressions | 406 // Prepare non common subexpressions |
| 420 uint16x8_t dc2, sc2; | 407 uint16x8_t dc2, sc2; |
| 421 uint32x4_t scdc2_1, scdc2_2; | 408 uint32x4_t scdc2_1, scdc2_2; |
| 422 if (overlay) { | 409 if (overlay) { |
| 423 dc2 = vshll_n_u8(dc, 1); | 410 dc2 = vshll_n_u8(dc, 1); |
| 424 scdc2_1 = vmull_u16(vget_low_u16(dc2), vget_low_u16(vmovl_u8(sc))); | 411 scdc2_1 = vmull_u16(vget_low_u16(dc2), vget_low_u16(vmovl_u8(sc))); |
| 425 #ifdef SK_CPU_ARM64 | |
| 426 scdc2_2 = vmull_high_u16(dc2, vmovl_u8(sc)); | |
| 427 #else | |
| 428 scdc2_2 = vmull_u16(vget_high_u16(dc2), vget_high_u16(vmovl_u8(sc))); | 412 scdc2_2 = vmull_u16(vget_high_u16(dc2), vget_high_u16(vmovl_u8(sc))); |
| 429 #endif | |
| 430 } else { | 413 } else { |
| 431 sc2 = vshll_n_u8(sc, 1); | 414 sc2 = vshll_n_u8(sc, 1); |
| 432 scdc2_1 = vmull_u16(vget_low_u16(sc2), vget_low_u16(vmovl_u8(dc))); | 415 scdc2_1 = vmull_u16(vget_low_u16(sc2), vget_low_u16(vmovl_u8(dc))); |
| 433 #ifdef SK_CPU_ARM64 | |
| 434 scdc2_2 = vmull_high_u16(sc2, vmovl_u8(dc)); | |
| 435 #else | |
| 436 scdc2_2 = vmull_u16(vget_high_u16(sc2), vget_high_u16(vmovl_u8(dc))); | 416 scdc2_2 = vmull_u16(vget_high_u16(sc2), vget_high_u16(vmovl_u8(dc))); |
| 437 #endif | |
| 438 } | 417 } |
| 439 | 418 |
| 440 // Calc COM | 419 // Calc COM |
| 441 int32x4_t com1, com2; | 420 int32x4_t com1, com2; |
| 442 com1 = vreinterpretq_s32_u32( | 421 com1 = vreinterpretq_s32_u32( |
| 443 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc))); | 422 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc))); |
| 444 com2 = vreinterpretq_s32_u32( | 423 com2 = vreinterpretq_s32_u32( |
| 445 #ifdef SK_CPU_ARM64 | |
| 446 vmull_high_u16(const255, sc_plus_dc)); | |
| 447 #else | |
| 448 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc))); | 424 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc))); |
| 449 #endif | |
| 450 | 425 |
| 451 // Calc SUB | 426 // Calc SUB |
| 452 int32x4_t sub1, sub2; | 427 int32x4_t sub1, sub2; |
| 453 sub1 = vreinterpretq_s32_u32(vaddl_u16(vget_low_u16(scda), vget_low_u16(dcsa))); | 428 sub1 = vreinterpretq_s32_u32(vaddl_u16(vget_low_u16(scda), vget_low_u16(dcsa))); |
| 454 #ifdef SK_CPU_ARM64 | |
| 455 sub2 = vreinterpretq_s32_u32(vaddl_high_u16(scda, dcsa)); | |
| 456 #else | |
| 457 sub2 = vreinterpretq_s32_u32(vaddl_u16(vget_high_u16(scda), vget_high_u16(dcsa))); | 429 sub2 = vreinterpretq_s32_u32(vaddl_u16(vget_high_u16(scda), vget_high_u16(dcsa))); |
| 458 #endif | |
| 459 sub1 = vsubq_s32(sub1, vreinterpretq_s32_u32(scdc2_1)); | 430 sub1 = vsubq_s32(sub1, vreinterpretq_s32_u32(scdc2_1)); |
| 460 sub2 = vsubq_s32(sub2, vreinterpretq_s32_u32(scdc2_2)); | 431 sub2 = vsubq_s32(sub2, vreinterpretq_s32_u32(scdc2_2)); |
| 461 | 432 |
| 462 // Compare 2*dc <= da | 433 // Compare 2*dc <= da |
| 463 uint16x8_t cmp; | 434 uint16x8_t cmp; |
| 464 | 435 |
| 465 if (overlay) { | 436 if (overlay) { |
| 466 cmp = vcleq_u16(dc2, vmovl_u8(da)); | 437 cmp = vcleq_u16(dc2, vmovl_u8(da)); |
| 467 } else { | 438 } else { |
| 468 cmp = vcleq_u16(sc2, vmovl_u8(sa)); | 439 cmp = vcleq_u16(sc2, vmovl_u8(sa)); |
| 469 } | 440 } |
| 470 | 441 |
| 471 // Prepare variables | 442 // Prepare variables |
| 472 int32x4_t val1_1, val1_2; | 443 int32x4_t val1_1, val1_2; |
| 473 int32x4_t val2_1, val2_2; | 444 int32x4_t val2_1, val2_2; |
| 474 uint32x4_t cmp1, cmp2; | 445 uint32x4_t cmp1, cmp2; |
| 475 | 446 |
| 476 // Doing a signed lengthening allows to save a few instructions | 447 cmp1 = vmovl_u16(vget_low_u16(cmp)); |
| 477 // thanks to sign extension. | 448 cmp1 |= vshlq_n_u32(cmp1, 16); |
| 478 cmp1 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_low_u16(cmp)))); | 449 cmp2 = vmovl_u16(vget_high_u16(cmp)); |
| 479 #ifdef SK_CPU_ARM64 | 450 cmp2 |= vshlq_n_u32(cmp2, 16); |
| 480 cmp2 = vreinterpretq_u32_s32(vmovl_high_s16(vreinterpretq_s16_u16(cmp))); | |
| 481 #else | |
| 482 cmp2 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_high_u16(cmp)))); | |
| 483 #endif | |
| 484 | 451 |
| 485 // Calc COM - SUB | 452 // Calc COM - SUB |
| 486 val1_1 = com1 - sub1; | 453 val1_1 = com1 - sub1; |
| 487 val1_2 = com2 - sub2; | 454 val1_2 = com2 - sub2; |
| 488 | 455 |
| 489 // Calc COM + SUB - sa*da | 456 // Calc COM + SUB - sa*da |
| 490 val2_1 = com1 + sub1; | 457 val2_1 = com1 + sub1; |
| 491 val2_2 = com2 + sub2; | 458 val2_2 = com2 + sub2; |
| 492 | 459 |
| 493 val2_1 = vsubq_s32(val2_1, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sada)))); | 460 val2_1 = vsubq_s32(val2_1, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sada)))); |
| 494 #ifdef SK_CPU_ARM64 | |
| 495 val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_high_u16(sada))); | |
| 496 #else | |
| 497 val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sada)))); | 461 val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sada)))); |
| 498 #endif | |
| 499 | 462 |
| 500 // Insert where needed | 463 // Insert where needed |
| 501 val1_1 = vbslq_s32(cmp1, val1_1, val2_1); | 464 val1_1 = vbslq_s32(cmp1, val1_1, val2_1); |
| 502 val1_2 = vbslq_s32(cmp2, val1_2, val2_2); | 465 val1_2 = vbslq_s32(cmp2, val1_2, val2_2); |
| 503 | 466 |
| 504 // Call the clamp_div255round function | 467 // Call the clamp_div255round function |
| 505 return clamp_div255round_simd8_32(val1_1, val1_2); | 468 return clamp_div255round_simd8_32(val1_1, val1_2); |
| 506 } | 469 } |
| 507 | 470 |
| 508 static inline uint8x8_t overlay_color(uint8x8_t sc, uint8x8_t dc, | 471 static inline uint8x8_t overlay_color(uint8x8_t sc, uint8x8_t dc, |
| (...skipping 149 matching lines...) |
| 658 sc_plus_dc = vaddl_u8(sc, dc); | 621 sc_plus_dc = vaddl_u8(sc, dc); |
| 659 scdc = vmull_u8(sc, dc); | 622 scdc = vmull_u8(sc, dc); |
| 660 | 623 |
| 661 /* Prepare constants */ | 624 /* Prepare constants */ |
| 662 const255 = vdupq_n_u16(255); | 625 const255 = vdupq_n_u16(255); |
| 663 | 626 |
| 664 /* Calc the first term */ | 627 /* Calc the first term */ |
| 665 term1_1 = vreinterpretq_s32_u32( | 628 term1_1 = vreinterpretq_s32_u32( |
| 666 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc))); | 629 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc))); |
| 667 term1_2 = vreinterpretq_s32_u32( | 630 term1_2 = vreinterpretq_s32_u32( |
| 668 #ifdef SK_CPU_ARM64 | |
| 669 vmull_high_u16(const255, sc_plus_dc)); | |
| 670 #else | |
| 671 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc))); | 631 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc))); |
| 672 #endif | |
| 673 | 632 |
| 674 /* Calc the second term */ | 633 /* Calc the second term */ |
| 675 term2_1 = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(scdc), 1)); | 634 term2_1 = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(scdc), 1)); |
| 676 #ifdef SK_CPU_ARM64 | |
| 677 term2_2 = vreinterpretq_s32_u32(vshll_high_n_u16(scdc, 1)); | |
| 678 #else | |
| 679 term2_2 = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(scdc), 1)); | 635 term2_2 = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(scdc), 1)); |
| 680 #endif | |
| 681 | 636 |
| 682 return clamp_div255round_simd8_32(term1_1 - term2_1, term1_2 - term2_2); | 637 return clamp_div255round_simd8_32(term1_1 - term2_1, term1_2 - term2_2); |
| 683 } | 638 } |
| 684 | 639 |
| 685 uint8x8x4_t exclusion_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | 640 uint8x8x4_t exclusion_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { |
| 686 uint8x8x4_t ret; | 641 uint8x8x4_t ret; |
| 687 | 642 |
| 688 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); | 643 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); |
| 689 ret.val[NEON_R] = exclusion_color(src.val[NEON_R], dst.val[NEON_R], | 644 ret.val[NEON_R] = exclusion_color(src.val[NEON_R], dst.val[NEON_R], |
| 690 src.val[NEON_A], dst.val[NEON_A]); | 645 src.val[NEON_A], dst.val[NEON_A]); |
| 691 ret.val[NEON_G] = exclusion_color(src.val[NEON_G], dst.val[NEON_G], | 646 ret.val[NEON_G] = exclusion_color(src.val[NEON_G], dst.val[NEON_G], |
| 692 src.val[NEON_A], dst.val[NEON_A]); | 647 src.val[NEON_A], dst.val[NEON_A]); |
| 693 ret.val[NEON_B] = exclusion_color(src.val[NEON_B], dst.val[NEON_B], | 648 ret.val[NEON_B] = exclusion_color(src.val[NEON_B], dst.val[NEON_B], |
| 694 src.val[NEON_A], dst.val[NEON_A]); | 649 src.val[NEON_A], dst.val[NEON_A]); |
| 695 | 650 |
| 696 return ret; | 651 return ret; |
| 697 } | 652 } |
| 698 | 653 |
| 699 static inline uint8x8_t blendfunc_multiply_color(uint8x8_t sc, uint8x8_t dc, | 654 static inline uint8x8_t blendfunc_multiply_color(uint8x8_t sc, uint8x8_t dc, |
| 700 uint8x8_t sa, uint8x8_t da) { | 655 uint8x8_t sa, uint8x8_t da) { |
| 701 uint32x4_t val1, val2; | 656 uint32x4_t val1, val2; |
| 702 uint16x8_t scdc, t1, t2; | 657 uint16x8_t scdc, t1, t2; |
| 703 | 658 |
| 704 t1 = vmull_u8(sc, vdup_n_u8(255) - da); | 659 t1 = vmull_u8(sc, vdup_n_u8(255) - da); |
| 705 t2 = vmull_u8(dc, vdup_n_u8(255) - sa); | 660 t2 = vmull_u8(dc, vdup_n_u8(255) - sa); |
| 706 scdc = vmull_u8(sc, dc); | 661 scdc = vmull_u8(sc, dc); |
| 707 | 662 |
| 708 val1 = vaddl_u16(vget_low_u16(t1), vget_low_u16(t2)); | 663 val1 = vaddl_u16(vget_low_u16(t1), vget_low_u16(t2)); |
| 709 #ifdef SK_CPU_ARM64 | |
| 710 val2 = vaddl_high_u16(t1, t2); | |
| 711 #else | |
| 712 val2 = vaddl_u16(vget_high_u16(t1), vget_high_u16(t2)); | 664 val2 = vaddl_u16(vget_high_u16(t1), vget_high_u16(t2)); |
| 713 #endif | |
| 714 | 665 |
| 715 val1 = vaddw_u16(val1, vget_low_u16(scdc)); | 666 val1 = vaddw_u16(val1, vget_low_u16(scdc)); |
| 716 #ifdef SK_CPU_ARM64 | |
| 717 val2 = vaddw_high_u16(val2, scdc); | |
| 718 #else | |
| 719 val2 = vaddw_u16(val2, vget_high_u16(scdc)); | 667 val2 = vaddw_u16(val2, vget_high_u16(scdc)); |
| 720 #endif | |
| 721 | 668 |
| 722 return clamp_div255round_simd8_32( | 669 return clamp_div255round_simd8_32( |
| 723 vreinterpretq_s32_u32(val1), vreinterpretq_s32_u32(val2)); | 670 vreinterpretq_s32_u32(val1), vreinterpretq_s32_u32(val2)); |
| 724 } | 671 } |
| 725 | 672 |
| 726 uint8x8x4_t multiply_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { | 673 uint8x8x4_t multiply_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { |
| 727 uint8x8x4_t ret; | 674 uint8x8x4_t ret; |
| 728 | 675 |
| 729 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); | 676 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); |
| 730 ret.val[NEON_R] = blendfunc_multiply_color(src.val[NEON_R], dst.val[NEON_R], | 677 ret.val[NEON_R] = blendfunc_multiply_color(src.val[NEON_R], dst.val[NEON_R], |
| (...skipping 23 matching lines...) |
| 754 | 701 |
| 755 SkXfermodeProc proc = this->getProc(); | 702 SkXfermodeProc proc = this->getProc(); |
| 756 SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD); | 703 SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD); |
| 757 SkASSERT(procSIMD != NULL); | 704 SkASSERT(procSIMD != NULL); |
| 758 | 705 |
| 759 if (NULL == aa) { | 706 if (NULL == aa) { |
| 760 // Unrolled NEON code | 707 // Unrolled NEON code |
| 761 while (count >= 8) { | 708 while (count >= 8) { |
| 762 uint8x8x4_t vsrc, vdst, vres; | 709 uint8x8x4_t vsrc, vdst, vres; |
| 763 | 710 |
| 764 #ifdef SK_CPU_ARM64 | |
| 765 vsrc = vld4_u8((uint8_t*)src); | |
| 766 vdst = vld4_u8((uint8_t*)dst); | |
| 767 #else | |
| 768 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) | 711 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) |
| 769 asm volatile ( | 712 asm volatile ( |
| 770 "vld4.u8 %h[vsrc], [%[src]]! \t\n" | 713 "vld4.u8 %h[vsrc], [%[src]]! \t\n" |
| 771 "vld4.u8 %h[vdst], [%[dst]] \t\n" | 714 "vld4.u8 %h[vdst], [%[dst]] \t\n" |
| 772 : [vsrc] "=w" (vsrc), [vdst] "=w" (vdst), [src] "+&r" (src) | 715 : [vsrc] "=w" (vsrc), [vdst] "=w" (vdst), [src] "+&r" (src) |
| 773 : [dst] "r" (dst) | 716 : [dst] "r" (dst) |
| 774 : | 717 : |
| 775 ); | 718 ); |
| 776 #else | 719 #else |
| 777 register uint8x8_t d0 asm("d0"); | 720 register uint8x8_t d0 asm("d0"); |
| (...skipping 12 matching lines...) |
| 790 "=w" (d4), "=w" (d5), "=w" (d6), "=w" (d7), | 733 "=w" (d4), "=w" (d5), "=w" (d6), "=w" (d7), |
| 791 [src] "+&r" (src) | 734 [src] "+&r" (src) |
| 792 : [dst] "r" (dst) | 735 : [dst] "r" (dst) |
| 793 : | 736 : |
| 794 ); | 737 ); |
| 795 vsrc.val[0] = d0; vdst.val[0] = d4; | 738 vsrc.val[0] = d0; vdst.val[0] = d4; |
| 796 vsrc.val[1] = d1; vdst.val[1] = d5; | 739 vsrc.val[1] = d1; vdst.val[1] = d5; |
| 797 vsrc.val[2] = d2; vdst.val[2] = d6; | 740 vsrc.val[2] = d2; vdst.val[2] = d6; |
| 798 vsrc.val[3] = d3; vdst.val[3] = d7; | 741 vsrc.val[3] = d3; vdst.val[3] = d7; |
| 799 #endif | 742 #endif |
| 800 #endif // #ifdef SK_CPU_ARM64 | |
| 801 | 743 |
| 802 vres = procSIMD(vsrc, vdst); | 744 vres = procSIMD(vsrc, vdst); |
| 803 | 745 |
| 804 vst4_u8((uint8_t*)dst, vres); | 746 vst4_u8((uint8_t*)dst, vres); |
| 805 | 747 |
| 806 count -= 8; | 748 count -= 8; |
| 807 dst += 8; | 749 dst += 8; |
| 808 #ifdef SK_CPU_ARM64 | |
| 809 src += 8; | |
| 810 #endif | |
| 811 } | 750 } |
| 812 // Leftovers | 751 // Leftovers |
| 813 for (int i = 0; i < count; i++) { | 752 for (int i = 0; i < count; i++) { |
| 814 dst[i] = proc(src[i], dst[i]); | 753 dst[i] = proc(src[i], dst[i]); |
| 815 } | 754 } |
| 816 } else { | 755 } else { |
| 817 for (int i = count - 1; i >= 0; --i) { | 756 for (int i = count - 1; i >= 0; --i) { |
| 818 unsigned a = aa[i]; | 757 unsigned a = aa[i]; |
| 819 if (0 != a) { | 758 if (0 != a) { |
| 820 SkPMColor dstC = dst[i]; | 759 SkPMColor dstC = dst[i]; |
| (...skipping 16 matching lines...) |
| 837 SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD); | 776 SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD); |
| 838 SkASSERT(procSIMD != NULL); | 777 SkASSERT(procSIMD != NULL); |
| 839 | 778 |
| 840 if (NULL == aa) { | 779 if (NULL == aa) { |
| 841 while(count >= 8) { | 780 while(count >= 8) { |
| 842 uint16x8_t vdst, vres16; | 781 uint16x8_t vdst, vres16; |
| 843 uint8x8x4_t vdst32, vsrc, vres; | 782 uint8x8x4_t vdst32, vsrc, vres; |
| 844 | 783 |
| 845 vdst = vld1q_u16(dst); | 784 vdst = vld1q_u16(dst); |
| 846 | 785 |
| 847 #ifdef SK_CPU_ARM64 | |
| 848 vsrc = vld4_u8((uint8_t*)src); | |
| 849 #else | |
| 850 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) | 786 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) |
| 851 asm volatile ( | 787 asm volatile ( |
| 852 "vld4.u8 %h[vsrc], [%[src]]! \t\n" | 788 "vld4.u8 %h[vsrc], [%[src]]! \t\n" |
| 853 : [vsrc] "=w" (vsrc), [src] "+&r" (src) | 789 : [vsrc] "=w" (vsrc), [src] "+&r" (src) |
| 854 : : | 790 : : |
| 855 ); | 791 ); |
| 856 #else | 792 #else |
| 857 register uint8x8_t d0 asm("d0"); | 793 register uint8x8_t d0 asm("d0"); |
| 858 register uint8x8_t d1 asm("d1"); | 794 register uint8x8_t d1 asm("d1"); |
| 859 register uint8x8_t d2 asm("d2"); | 795 register uint8x8_t d2 asm("d2"); |
| 860 register uint8x8_t d3 asm("d3"); | 796 register uint8x8_t d3 asm("d3"); |
| 861 | 797 |
| 862 asm volatile ( | 798 asm volatile ( |
| 863 "vld4.u8 {d0-d3},[%[src]]!;" | 799 "vld4.u8 {d0-d3},[%[src]]!;" |
| 864 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), | 800 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), |
| 865 [src] "+&r" (src) | 801 [src] "+&r" (src) |
| 866 : : | 802 : : |
| 867 ); | 803 ); |
| 868 vsrc.val[0] = d0; | 804 vsrc.val[0] = d0; |
| 869 vsrc.val[1] = d1; | 805 vsrc.val[1] = d1; |
| 870 vsrc.val[2] = d2; | 806 vsrc.val[2] = d2; |
| 871 vsrc.val[3] = d3; | 807 vsrc.val[3] = d3; |
| 872 #endif | 808 #endif |
| 873 #endif // #ifdef SK_CPU_ARM64 | |
| 874 | 809 |
| 875 vdst32 = SkPixel16ToPixel32_neon8(vdst); | 810 vdst32 = SkPixel16ToPixel32_neon8(vdst); |
| 876 vres = procSIMD(vsrc, vdst32); | 811 vres = procSIMD(vsrc, vdst32); |
| 877 vres16 = SkPixel32ToPixel16_neon8(vres); | 812 vres16 = SkPixel32ToPixel16_neon8(vres); |
| 878 | 813 |
| 879 vst1q_u16(dst, vres16); | 814 vst1q_u16(dst, vres16); |
| 880 | 815 |
| 881 count -= 8; | 816 count -= 8; |
| 882 dst += 8; | 817 dst += 8; |
| 883 #ifdef SK_CPU_ARM64 | |
| 884 src += 8; | |
| 885 #endif | |
| 886 } | 818 } |
| 887 for (int i = 0; i < count; i++) { | 819 for (int i = 0; i < count; i++) { |
| 888 SkPMColor dstC = SkPixel16ToPixel32(dst[i]); | 820 SkPMColor dstC = SkPixel16ToPixel32(dst[i]); |
| 889 dst[i] = SkPixel32ToPixel16_ToU16(proc(src[i], dstC)); | 821 dst[i] = SkPixel32ToPixel16_ToU16(proc(src[i], dstC)); |
| 890 } | 822 } |
| 891 } else { | 823 } else { |
| 892 for (int i = count - 1; i >= 0; --i) { | 824 for (int i = count - 1; i >= 0; --i) { |
| 893 unsigned a = aa[i]; | 825 unsigned a = aa[i]; |
| 894 if (0 != a) { | 826 if (0 != a) { |
| 895 SkPMColor dstC = SkPixel16ToPixel32(dst[i]); | 827 SkPMColor dstC = SkPixel16ToPixel32(dst[i]); |
| (...skipping 100 matching lines...) |
| 996 | 928 |
| 997 if (procSIMD != NULL) { | 929 if (procSIMD != NULL) { |
| 998 return SkNEW_ARGS(SkNEONProcCoeffXfermode, (rec, mode, procSIMD)); | 930 return SkNEW_ARGS(SkNEONProcCoeffXfermode, (rec, mode, procSIMD)); |
| 999 } | 931 } |
| 1000 return NULL; | 932 return NULL; |
| 1001 } | 933 } |
| 1002 | 934 |
| 1003 SkXfermodeProc SkPlatformXfermodeProcFactory_impl_neon(SkXfermode::Mode mode) { | 935 SkXfermodeProc SkPlatformXfermodeProcFactory_impl_neon(SkXfermode::Mode mode) { |
| 1004 return gNEONXfermodeProcs1[mode]; | 936 return gNEONXfermodeProcs1[mode]; |
| 1005 } | 937 } |
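Note (reviewer sketch, not part of the patch): the change removes the SK_CPU_ARM64-only intrinsic paths (vmovn_high_u32, vmull_high_u16, vaddl_high_u16, vmovl_high_u16, and the direct vld4_u8 loads) in favor of the portable split-halves sequences, and it rebuilds the full-width comparison masks with an unsigned widen plus a shifted OR instead of the old signed lengthening. A minimal standalone example of why those two rewrites produce the same values, assuming an ARM NEON toolchain; the helper names below are illustrative and not from Skia:

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

// Kept portable form: narrow each uint32x4_t half to uint16x4_t, then combine.
// The removed ARM64-only vmovn_high_u32(vmovn_u32(lo), hi) yields the same
// uint16x8_t result.
static inline uint16x8_t narrow_pair_portable(uint32x4_t lo, uint32x4_t hi) {
    return vcombine_u16(vmovn_u32(lo), vmovn_u32(hi));
}

// New mask widening: each comparison lane is 0x0000 or 0xFFFF, so widening to
// 32 bits and OR-ing in a copy shifted left by 16 gives 0x00000000/0xFFFFFFFF,
// the same full-width select mask the old sign-extension produced for vbslq_s32.
static inline uint32x4_t widen_mask(uint16x4_t cmp_half) {
    uint32x4_t m = vmovl_u16(cmp_half);
    return vorrq_u32(m, vshlq_n_u32(m, 16));
}

int main(void) {
    const uint32_t lo_in[4] = {1, 2, 3, 4};
    const uint32_t hi_in[4] = {5, 6, 7, 8};
    uint16_t narrowed[8];
    vst1q_u16(narrowed, narrow_pair_portable(vld1q_u32(lo_in), vld1q_u32(hi_in)));
    for (int i = 0; i < 8; i++) printf("%u ", narrowed[i]);   // prints 1..8
    printf("\n");

    const uint16_t cmp_in[4] = {0x0000, 0xFFFF, 0xFFFF, 0x0000};
    uint32_t mask[4];
    vst1q_u32(mask, widen_mask(vld1_u16(cmp_in)));
    for (int i = 0; i < 4; i++) printf("0x%08x ", mask[i]);   // all-zero or all-one lanes
    printf("\n");
    return 0;
}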