OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include "SkBlitRow_opts_arm_neon.h" | 8 #include "SkBlitRow_opts_arm_neon.h" |
9 | 9 |
10 #include "SkBlitMask.h" | 10 #include "SkBlitMask.h" |
(...skipping 447 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
458 SkPMColorAssert(c); | 458 SkPMColorAssert(c); |
459 if (c) { | 459 if (c) { |
460 *dst = SkSrcOver32To16(c, *dst); | 460 *dst = SkSrcOver32To16(c, *dst); |
461 } | 461 } |
462 dst += 1; | 462 dst += 1; |
463 } while (--count != 0); | 463 } while (--count != 0); |
464 } | 464 } |
465 } | 465 } |
466 #endif // #ifdef SK_CPU_ARM32 | 466 #endif // #ifdef SK_CPU_ARM32 |
467 | 467 |
| 468 static uint32_t pmcolor_to_expand16(SkPMColor c) { |
| 469 unsigned r = SkGetPackedR32(c); |
| 470 unsigned g = SkGetPackedG32(c); |
| 471 unsigned b = SkGetPackedB32(c); |
| 472 return (g << 24) | (r << 13) | (b << 2); |
| 473 } |
| 474 |
| 475 void Color32A_D565_neon(uint16_t dst[], SkPMColor src[], int count, int x, int y
) { |
| 476 uint32_t src_expand; |
| 477 unsigned scale; |
| 478 |
| 479 if (count <= 0) return; |
| 480 SkASSERT(((size_t)dst & 0x01) == 0); |
| 481 |
| 482 /* |
| 483 * This preamble code is in order to make dst aligned to 8 bytes |
| 484 * in the next mutiple bytes read & write access. |
| 485 */ |
| 486 src_expand = pmcolor_to_expand16(*src); |
| 487 scale = SkAlpha255To256(0xFF - SkGetPackedA32(*src)) >> 3; |
| 488 |
| 489 #define DST_ALIGN 8 |
| 490 |
| 491 /* |
| 492 * preamble_size is in byte, meantime, this blend32_16_row_neon updates 2 by
tes at a time. |
| 493 */ |
| 494 int preamble_size = (DST_ALIGN - (size_t)dst) & (DST_ALIGN - 1); |
| 495 |
| 496 for (int i = 0; i < preamble_size; i+=2, dst++) { |
| 497 uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale; |
| 498 *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5); |
| 499 if (--count == 0) |
| 500 break; |
| 501 } |
| 502 #ifdef SK_CPU_ARM64 |
| 503 asm ( |
| 504 "lsr x2, %[count], #4 \n\t" |
| 505 "mov x1, x2 \n\t" //
calc. count>>4 |
| 506 "cbz x1, back \n\t" //
if (count>>4) == 0, exit |
| 507 "ld4 {v24.8b, v25.8b, v26.8b, v27.8b}, [%[src]] \n\t" //
load eight src ABGR32 pixels |
| 508 |
| 509 "uxtl v4.8h, v24.8b \n\t" //
widen red to 16 bits |
| 510 "uxtl v5.8h, v25.8b \n\t" //
widen green to 16 bits |
| 511 "uxtl v6.8h, v26.8b \n\t" //
widen blue to 16 bits |
| 512 |
| 513 "shl v4.8h, v4.8h, #2 \n\t" //
src red = src_red << 2 (later will do >> 5 to make 5 bit red) |
| 514 "shl v5.8h, v5.8h, #3 \n\t" //
src grn = src_grn << 3 (later will do >> 5 to make 6 bit grn) |
| 515 "shl v6.8h, v6.8h, #2 \n\t" //
src blu = src_blu << 2 (later will do >> 5 to make 5 bit blu) |
| 516 |
| 517 "movi v21.8h, #1, lsl#8 \n\t" //
set up constant 256 (1<<8) |
| 518 "uxtl v14.8h, v27.8b \n\t" //
widen alpha to 16 bits |
| 519 "sub v14.8h, v21.8h, v14.8h \n\t" //
256 - sa |
| 520 "ushr v14.8h, v14.8h, #3 \n\t" //
(256 - sa) >> 3 |
| 521 |
| 522 "front: \n\t" |
| 523 "ld1 {v0.8h, v1.8h}, [%[dst]] \n\t" //
load sixteen dst RGB565 pixels |
| 524 //set PREFETCH_DISTANCE to 128 |
| 525 "prfum pldl1keep, [%[dst], #128] \n\t" |
| 526 |
| 527 "subs x1, x1, #1 \n\t" //
decrement loop counter |
| 528 |
| 529 "shl v9.8h, v0.8h, #5 \n\t" //
shift green to top of lanes |
| 530 |
| 531 "shl v10.8h, v0.8h, #11 \n\t" //
shift blue to top of lanes |
| 532 "ushr v10.8h, v10.8h, #11 \n\t" //
extract blue |
| 533 |
| 534 "ushr v8.8h, v0.8h, #11 \n\t" //
extract red |
| 535 "ushr v9.8h, v9.8h, #10 \n\t" //
extract green |
| 536 |
| 537 "shl v3.8h, v1.8h, #5 \n\t" //
shift green to top of lanes |
| 538 |
| 539 "shl v7.8h, v1.8h, #11 \n\t" //
shift blue to top of lanes |
| 540 "ushr v7.8h, v7.8h, #11 \n\t" //
extract blue |
| 541 |
| 542 "ushr v2.8h, v1.8h, #11 \n\t" //
extract red |
| 543 "ushr v3.8h, v3.8h, #10 \n\t" //
extract green |
| 544 |
| 545 //If we use src in mla, directly, vd.8h is updated by mla, so
need to calculate src in next loop again, which requires 12 cycles. |
| 546 //Instead, copy src to other registers and use them as a mla d
estination. |
| 547 //6 shl commands are needed, but we don't need to update src.
In total, will get 6 cycle benefit in each loop. |
| 548 |
| 549 "shl v15.8h, v4.8h, #0 \n\t" //
copy dst 0 red result |
| 550 "shl v16.8h, v5.8h, #0 \n\t" //
copy dst 0 grn result |
| 551 "shl v17.8h, v6.8h, #0 \n\t" //
copy dst 0 blu result |
| 552 "mla v15.8h, v8.8h, v14.8h \n\t" //
dst 0 red result = dst_red * dst_scale |
| 553 "mla v16.8h, v9.8h, v14.8h \n\t" //
dst 0 grn result = dst_grn * dst_scale |
| 554 "mla v17.8h, v10.8h, v14.8h \n\t" //
dst 0 blu result = dst_blu * dst_scale |
| 555 |
| 556 "shl v20.8h, v4.8h, #0 \n\t" //
copy dst 1 red result |
| 557 "shl v19.8h, v5.8h, #0 \n\t" //
copy dst 1 grn result |
| 558 "shl v18.8h, v6.8h, #0 \n\t" //
copy dst 1 blu result |
| 559 "mla v20.8h, v2.8h, v14.8h \n\t" //
dst 1 red result = dst_red * dst_scale |
| 560 "mla v19.8h, v3.8h, v14.8h \n\t" //
dst 1 grn result = dst_grn * dst_scale |
| 561 "mla v18.8h, v7.8h, v14.8h \n\t" //
dst 1 blu result = dst_blu * dst_scale |
| 562 |
| 563 "ushr v15.8h, v15.8h, #5 \n\t" //
dst 0 red result >> 5 |
| 564 "ushr v16.8h, v16.8h, #5 \n\t" //
dst 0 grn result >> 5 |
| 565 "ushr v17.8h, v17.8h, #5 \n\t" //
dst 0 blu result >> 5 |
| 566 |
| 567 "ushr v20.8h, v20.8h, #5 \n\t" //
dst 1 red result >> 5 |
| 568 "ushr v19.8h, v19.8h, #5 \n\t" //
dst 1 grn result >> 5 |
| 569 "ushr v18.8h, v18.8h, #5 \n\t" //
dst 1 blu result >> 5 |
| 570 |
| 571 "sli v17.8h, v16.8h, #5 \n\t" //
dst 0 insert green into blue |
| 572 "sli v17.8h, v15.8h, #11 \n\t" //
dst 0 insert red into green/blue |
| 573 |
| 574 "sli v18.8h, v19.8h, #5 \n\t" //
dst 1 insert green into blue |
| 575 "sli v18.8h, v20.8h, #11 \n\t" //
dst 1 insert red into green/blue |
| 576 |
| 577 "st1 {v17.8h, v18.8h}, [%[dst]], #32 \n\t" //
write pixel back to dst 0 and dst 1, update ptr |
| 578 "cbnz x1, front \n\t" //
if counter !=0, loop |
| 579 "back: \n\t" //
exit |
| 580 |
| 581 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count) |
| 582 : : "x1", "x2", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
, "v8", "v9", "v10", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", |
| 583 "v24", "v25", "v26", "v27" |
| 584 ); |
| 585 #else |
| 586 asm volatile ( |
| 587 "movs r4, %[count], lsr #4 \n\t" // cal
c. count>>4 |
| 588 "beq 2f \n\t" // if
(count>>4) == 0, exit |
| 589 "vmov.u16 q15, #0x1f \n\t" // set
up blue mask |
| 590 "vld4.u8 {d24, d25, d26, d27}, [%[src]] \n\t" // loa
d eight src ABGR32 pixels |
| 591 |
| 592 "vmov r5, r6, d24 \n\t" // sav
e src red in r5, r6 |
| 593 "vmov r7, r8, d25 \n\t" // sav
e src green in r7, r8 |
| 594 "vmov r9, r10, d26 \n\t" // sav
e src blue in r9, r10 |
| 595 "vmov r11, r12, d27 \n\t" // sav
e src alpha in r11, r12 |
| 596 |
| 597 |
| 598 "1: \n\t" |
| 599 "vld1.u16 {d0, d1, d2, d3}, [%[dst]] \n\t" // loa
d sixteen dst RGB565 pixels |
| 600 //set PREFETCH_DISTANCE to 128 |
| 601 "pld [%[dst], #128] \n\t" |
| 602 |
| 603 "subs r4, r4, #1 \n\t" // dec
rement loop counter |
| 604 |
| 605 "vmov d24, r5, r6 \n\t" // src
red to d24 |
| 606 "vmov d25, r7, r8 \n\t" // src
green to d25 |
| 607 "vmov d26, r9, r10 \n\t" // src
blue to d26 |
| 608 "vmov d27, r11, r12 \n\t" // src
alpha to d27 |
| 609 |
| 610 "vmov.u16 q3, #256 \n\t" // set
up constant |
| 611 "vmovl.u8 q14, d27 \n\t" // wid
en alpha to 16 bits |
| 612 // dst_scale = q14 |
| 613 "vsub.u16 q14, q3, q14 \n\t" // 256
- sa |
| 614 "vshr.u16 q14, q14, #3 \n\t" // (25
6 - sa) >> 3 |
| 615 |
| 616 |
| 617 // dst_0_rgb = {q8, q9, q10} |
| 618 "vshl.u16 q9, q0, #5 \n\t" // shi
ft green to top of lanes |
| 619 "vand q10, q0, q15 \n\t" // ext
ract blue |
| 620 "vshr.u16 q8, q0, #11 \n\t" // ext
ract red |
| 621 "vshr.u16 q9, q9, #10 \n\t" // ext
ract green |
| 622 |
| 623 //use q3 for dst_1 green. In the next loop, needs to set q3 to
256 again. |
| 624 // dst_1_rgb = {q2, q3, q7} |
| 625 "vshl.u16 q3, q1, #5 \n\t" // shi
ft green to top of lanes |
| 626 "vand q7, q1, q15 \n\t" // ext
ract blue |
| 627 "vshr.u16 q2, q1, #11 \n\t" // ext
ract red |
| 628 "vshr.u16 q3, q3, #10 \n\t" // ext
ract green |
| 629 |
| 630 // srcrgba = {q4, q5, q6, q14}, alpha calculation is done alre
ady in above. |
| 631 // q4, q5, q6 will have each channel's result of dst_1_rgb. |
| 632 "vmovl.u8 q4, d24 \n\t" // wid
en red to 16 bits |
| 633 "vmovl.u8 q5, d25 \n\t" // wid
en green to 16 bits |
| 634 "vmovl.u8 q6, d26 \n\t" // wid
en blue to 16 bits |
| 635 |
| 636 // srcrgba = {q11, q12, q13, q14}, alpha calculation is done a
lready in above. |
| 637 // q11, q12, q13 will have each channel's result of dst_0_rgb. |
| 638 "vmovl.u8 q11, d24 \n\t" // wid
en red to 16 bits |
| 639 "vmovl.u8 q12, d25 \n\t" // wid
en green to 16 bits |
| 640 "vmovl.u8 q13, d26 \n\t" // wid
en blue to 16 bits |
| 641 |
| 642 "vshl.u16 q11, q11, #2 \n\t" // dst
0 red result = src_red << 2 (later will do >> 5 to make 5 bit red) |
| 643 "vshl.u16 q12, q12, #3 \n\t" // dst
0 grn result = src_grn << 3 (later will do >> 5 to make 6 bit grn) |
| 644 "vshl.u16 q13, q13, #2 \n\t" // dst
0 blu result = src_blu << 2 (later will do >> 5 to make 5 bit blu) |
| 645 |
| 646 "vshl.u16 q4, q4, #2 \n\t" // dst
1 red result = src_red << 2 (later will do >> 5 to make 5 bit red) |
| 647 "vshl.u16 q5, q5, #3 \n\t" // dst
1 grn result = src_grn << 3 (later will do >> 5 to make 6 bit grn) |
| 648 "vshl.u16 q6, q6, #2 \n\t" // dst
1 blu result = src_blu << 2 (later will do >> 5 to make 5 bit blu) |
| 649 |
| 650 "vmla.u16 q11, q8, q14 \n\t" // dst
0 red result += dst_red * dst_scale |
| 651 "vmla.u16 q12, q9, q14 \n\t" // dst
0 grn result += dst_grn * dst_scale |
| 652 "vmla.u16 q13, q10, q14 \n\t" // dst
0 blu result += dst_blu * dst_scale |
| 653 |
| 654 "vmla.u16 q4, q2, q14 \n\t" // dst
1 red result += dst_red * dst_scale |
| 655 "vmla.u16 q5, q3, q14 \n\t" // dst
1 grn result += dst_grn * dst_scale |
| 656 "vmla.u16 q6, q7, q14 \n\t" // dst
1 blu result += dst_blu * dst_scale |
| 657 |
| 658 "vshr.u16 q11, q11, #5 \n\t" // dst
0 red result >> 5 |
| 659 "vshr.u16 q12, q12, #5 \n\t" // dst
0 grn result >> 5 |
| 660 "vshr.u16 q13, q13, #5 \n\t" // dst
0 blu result >> 5 |
| 661 |
| 662 "vshr.u16 q4, q4, #5 \n\t" // dst
1 red result >> 5 |
| 663 "vshr.u16 q5, q5, #5 \n\t" // dst
1 grn result >> 5 |
| 664 "vshr.u16 q14, q6, #5 \n\t" // dst
1 blu result >> 5 |
| 665 |
| 666 "vsli.u16 q13, q12, #5 \n\t" // dst
0 insert green into blue |
| 667 "vsli.u16 q13, q11, #11 \n\t" // dst
0 insert red into green/blue |
| 668 |
| 669 "vsli.u16 q14, q5, #5 \n\t" // dst
1 insert green into blue |
| 670 "vsli.u16 q14, q4, #11 \n\t" // dst
1 insert red into green/blue |
| 671 |
| 672 "vst1.16 {d26, d27, d28, d29}, [%[dst]]! \n\t" // wri
te pixel back to dst 0 and dst 1, update ptr |
| 673 |
| 674 "bne 1b \n\t" // if
counter != 0, loop |
| 675 "2: \n\t" // exi
t |
| 676 |
| 677 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count) |
| 678 : |
| 679 : "cc", "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "
r11", "r12", |
| 680 "d0", "d1", "d2", "d3", "d4", "d5", "d
6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", |
| 681 "d17", "d18", "d19", "d20", "d21", "d2
2", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31" |
| 682 ); |
| 683 #endif |
| 684 count &= 0xF; |
| 685 if (count > 0) { |
| 686 do { |
| 687 uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale; |
| 688 *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5); |
| 689 dst += 1; |
| 690 } while (--count != 0); |
| 691 } |
| 692 } |
| 693 |
468 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) { | 694 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) { |
469 prod += vdupq_n_u16(128); | 695 prod += vdupq_n_u16(128); |
470 prod += vshrq_n_u16(prod, 8); | 696 prod += vshrq_n_u16(prod, 8); |
471 return vshrq_n_u16(prod, 8); | 697 return vshrq_n_u16(prod, 8); |
472 } | 698 } |
473 | 699 |
474 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, | 700 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, |
475 const SkPMColor* SK_RESTRICT src, int count, | 701 const SkPMColor* SK_RESTRICT src, int count, |
476 U8CPU alpha, int /*x*/, int /*y*/) { | 702 U8CPU alpha, int /*x*/, int /*y*/) { |
477 SkASSERT(255 > alpha); | 703 SkASSERT(255 > alpha); |
(...skipping 1180 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1658 // https://code.google.com/p/skia/issues/detail?id=2797 | 1884 // https://code.google.com/p/skia/issues/detail?id=2797 |
1659 #endif | 1885 #endif |
1660 | 1886 |
1661 // dither | 1887 // dither |
1662 S32_D565_Opaque_Dither_neon, | 1888 S32_D565_Opaque_Dither_neon, |
1663 S32_D565_Blend_Dither_neon, | 1889 S32_D565_Blend_Dither_neon, |
1664 S32A_D565_Opaque_Dither_neon, | 1890 S32A_D565_Opaque_Dither_neon, |
1665 NULL, // S32A_D565_Blend_Dither | 1891 NULL, // S32A_D565_Blend_Dither |
1666 }; | 1892 }; |
1667 | 1893 |
| 1894 const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[] = { |
| 1895 #if 0 |
| 1896 Color32_D565_neon, |
| 1897 Color32A_D565_neon, |
| 1898 Color32_D565_Dither_neon, |
| 1899 Color32A_D565_Dither_neon |
| 1900 #else |
| 1901 // TODO: stop cheating and fill in the above specializations! |
| 1902 Color32A_D565_neon, |
| 1903 Color32A_D565_neon, |
| 1904 Color32A_D565_neon, |
| 1905 Color32A_D565_neon, |
| 1906 #endif |
| 1907 }; |
| 1908 |
1668 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { | 1909 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { |
1669 NULL, // S32_Opaque, | 1910 NULL, // S32_Opaque, |
1670 S32_Blend_BlitRow32_neon, // S32_Blend, | 1911 S32_Blend_BlitRow32_neon, // S32_Blend, |
1671 /* | 1912 /* |
1672 * We have two choices for S32A_Opaque procs. The one reads the src alpha | 1913 * We have two choices for S32A_Opaque procs. The one reads the src alpha |
1673 * value and attempts to optimize accordingly. The optimization is | 1914 * value and attempts to optimize accordingly. The optimization is |
1674 * sensitive to the source content and is not a win in all cases. For | 1915 * sensitive to the source content and is not a win in all cases. For |
1675 * example, if there are a lot of transitions between the alpha states, | 1916 * example, if there are a lot of transitions between the alpha states, |
1676 * the performance will almost certainly be worse. However, for many | 1917 * the performance will almost certainly be worse. However, for many |
1677 * common cases the performance is equivalent or better than the standard | 1918 * common cases the performance is equivalent or better than the standard |
1678 * case where we do not inspect the src alpha. | 1919 * case where we do not inspect the src alpha. |
1679 */ | 1920 */ |
1680 #if SK_A32_SHIFT == 24 | 1921 #if SK_A32_SHIFT == 24 |
1681 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor | 1922 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor |
1682 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, | 1923 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, |
1683 #else | 1924 #else |
1684 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, | 1925 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, |
1685 #endif | 1926 #endif |
1686 #ifdef SK_CPU_ARM32 | 1927 #ifdef SK_CPU_ARM32 |
1687 S32A_Blend_BlitRow32_neon // S32A_Blend | 1928 S32A_Blend_BlitRow32_neon // S32A_Blend |
1688 #else | 1929 #else |
1689 NULL | 1930 NULL |
1690 #endif | 1931 #endif |
1691 }; | 1932 }; |
OLD | NEW |