Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(249)

Side by Side Diff: src/opts/SkBlitRow_opts_arm_neon.cpp

Issue 847363002: skia: blend32_16_row for neon version (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: Created 5 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 * Copyright 2012 The Android Open Source Project 2 * Copyright 2012 The Android Open Source Project
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #include "SkBlitRow_opts_arm_neon.h" 8 #include "SkBlitRow_opts_arm_neon.h"
9 9
10 #include "SkBlitMask.h" 10 #include "SkBlitMask.h"
(...skipping 447 matching lines...) Expand 10 before | Expand all | Expand 10 after
458 SkPMColorAssert(c); 458 SkPMColorAssert(c);
459 if (c) { 459 if (c) {
460 *dst = SkSrcOver32To16(c, *dst); 460 *dst = SkSrcOver32To16(c, *dst);
461 } 461 }
462 dst += 1; 462 dst += 1;
463 } while (--count != 0); 463 } while (--count != 0);
464 } 464 }
465 } 465 }
466 #endif // #ifdef SK_CPU_ARM32 466 #endif // #ifdef SK_CPU_ARM32
467 467
468 static uint32_t pmcolor_to_expand16(SkPMColor c) {
469 unsigned r = SkGetPackedR32(c);
470 unsigned g = SkGetPackedG32(c);
471 unsigned b = SkGetPackedB32(c);
472 return (g << 24) | (r << 13) | (b << 2);
473 }
474
475 void Color32A_D565_neon(uint16_t dst[], SkPMColor src[], int count, int x, int y ) {
476 uint32_t src_expand;
477 unsigned scale;
478
479 if (count <= 0) return;
480 SkASSERT(((size_t)dst & 0x01) == 0);
481
482 /*
483 * This preamble code is in order to make dst aligned to 8 bytes
484 * in the next mutiple bytes read & write access.
485 */
486 src_expand = pmcolor_to_expand16(*src);
487 scale = SkAlpha255To256(0xFF - SkGetPackedA32(*src)) >> 3;
488
489 #define DST_ALIGN 8
490
491 /*
492 * preamble_size is in byte, meantime, this blend32_16_row_neon updates 2 by tes at a time.
493 */
494 int preamble_size = (DST_ALIGN - (size_t)dst) & (DST_ALIGN - 1);
495
496 for (int i = 0; i < preamble_size; i+=2, dst++) {
497 uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
498 *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
499 if (--count == 0)
500 break;
501 }
502 #ifdef SK_CPU_ARM64
503 asm (
504 "lsr x2, %[count], #4 \n\t"
505 "mov x1, x2 \n\t" // calc. count>>4
506 "cbz x1, back \n\t" // if (count>>4) == 0, exit
507 "ld4 {v24.8b, v25.8b, v26.8b, v27.8b}, [%[src]] \n\t" // load eight src ABGR32 pixels
508
509 "uxtl v4.8h, v24.8b \n\t" // widen red to 16 bits
510 "uxtl v5.8h, v25.8b \n\t" // widen green to 16 bits
511 "uxtl v6.8h, v26.8b \n\t" // widen blue to 16 bits
512
513 "shl v4.8h, v4.8h, #2 \n\t" // src red = src_red << 2 (later will do >> 5 to make 5 bit red)
514 "shl v5.8h, v5.8h, #3 \n\t" // src grn = src_grn << 3 (later will do >> 5 to make 6 bit grn)
515 "shl v6.8h, v6.8h, #2 \n\t" // src blu = src_blu << 2 (later will do >> 5 to make 5 bit blu)
516
517 "movi v21.8h, #1, lsl#8 \n\t" // set up constant 256 (1<<8)
518 "uxtl v14.8h, v27.8b \n\t" // widen alpha to 16 bits
519 "sub v14.8h, v21.8h, v14.8h \n\t" // 256 - sa
520 "ushr v14.8h, v14.8h, #3 \n\t" // (256 - sa) >> 3
521
522 "front: \n\t"
523 "ld1 {v0.8h, v1.8h}, [%[dst]] \n\t" // load sixteen dst RGB565 pixels
524 //set PREFETCH_DISTANCE to 128
525 "prfum pldl1keep, [%[dst], #128] \n\t"
526
527 "subs x1, x1, #1 \n\t" // decrement loop counter
528
529 "shl v9.8h, v0.8h, #5 \n\t" // shift green to top of lanes
530
531 "shl v10.8h, v0.8h, #11 \n\t" // shift blue to top of lanes
532 "ushr v10.8h, v10.8h, #11 \n\t" // extract blue
533
534 "ushr v8.8h, v0.8h, #11 \n\t" // extract red
535 "ushr v9.8h, v9.8h, #10 \n\t" // extract green
536
537 "shl v3.8h, v1.8h, #5 \n\t" // shift green to top of lanes
538
539 "shl v7.8h, v1.8h, #11 \n\t" // shift blue to top of lanes
540 "ushr v7.8h, v7.8h, #11 \n\t" // extract blue
541
542 "ushr v2.8h, v1.8h, #11 \n\t" // extract red
543 "ushr v3.8h, v3.8h, #10 \n\t" // extract green
544
545 //If we use src in mla, directly, vd.8h is updated by mla, so need to calculate src in next loop again, which requires 12 cycles.
546 //Instead, copy src to other registers and use them as a mla d estination.
547 //6 shl commands are needed, but we don't need to update src. In total, will get 6 cycle benefit in each loop.
548
549 "shl v15.8h, v4.8h, #0 \n\t" // copy dst 0 red result
550 "shl v16.8h, v5.8h, #0 \n\t" // copy dst 0 grn result
551 "shl v17.8h, v6.8h, #0 \n\t" // copy dst 0 blu result
552 "mla v15.8h, v8.8h, v14.8h \n\t" // dst 0 red result = dst_red * dst_scale
553 "mla v16.8h, v9.8h, v14.8h \n\t" // dst 0 grn result = dst_grn * dst_scale
554 "mla v17.8h, v10.8h, v14.8h \n\t" // dst 0 blu result = dst_blu * dst_scale
555
556 "shl v20.8h, v4.8h, #0 \n\t" // copy dst 1 red result
557 "shl v19.8h, v5.8h, #0 \n\t" // copy dst 1 grn result
558 "shl v18.8h, v6.8h, #0 \n\t" // copy dst 1 blu result
559 "mla v20.8h, v2.8h, v14.8h \n\t" // dst 1 red result = dst_red * dst_scale
560 "mla v19.8h, v3.8h, v14.8h \n\t" // dst 1 grn result = dst_grn * dst_scale
561 "mla v18.8h, v7.8h, v14.8h \n\t" // dst 1 blu result = dst_blu * dst_scale
562
563 "ushr v15.8h, v15.8h, #5 \n\t" // dst 0 red result >> 5
564 "ushr v16.8h, v16.8h, #5 \n\t" // dst 0 grn result >> 5
565 "ushr v17.8h, v17.8h, #5 \n\t" // dst 0 blu result >> 5
566
567 "ushr v20.8h, v20.8h, #5 \n\t" // dst 1 red result >> 5
568 "ushr v19.8h, v19.8h, #5 \n\t" // dst 1 grn result >> 5
569 "ushr v18.8h, v18.8h, #5 \n\t" // dst 1 blu result >> 5
570
571 "sli v17.8h, v16.8h, #5 \n\t" // dst 0 insert green into blue
572 "sli v17.8h, v15.8h, #11 \n\t" // dst 0 insert red into green/blue
573
574 "sli v18.8h, v19.8h, #5 \n\t" // dst 1 insert green into blue
575 "sli v18.8h, v20.8h, #11 \n\t" // dst 1 insert red into green/blue
576
577 "st1 {v17.8h, v18.8h}, [%[dst]], #32 \n\t" // write pixel back to dst 0 and dst 1, update ptr
578 "cbnz x1, front \n\t" // if counter !=0, loop
579 "back: \n\t" // exit
580
581 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
582 : : "x1", "x2", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" , "v8", "v9", "v10", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",
583 "v24", "v25", "v26", "v27"
584 );
585 #else
586 asm volatile (
587 "movs r4, %[count], lsr #4 \n\t" // cal c. count>>4
588 "beq 2f \n\t" // if (count>>4) == 0, exit
589 "vmov.u16 q15, #0x1f \n\t" // set up blue mask
590 "vld4.u8 {d24, d25, d26, d27}, [%[src]] \n\t" // loa d eight src ABGR32 pixels
591
592 "vmov r5, r6, d24 \n\t" // sav e src red in r5, r6
593 "vmov r7, r8, d25 \n\t" // sav e src green in r7, r8
594 "vmov r9, r10, d26 \n\t" // sav e src blue in r9, r10
595 "vmov r11, r12, d27 \n\t" // sav e src alpha in r11, r12
596
597
598 "1: \n\t"
599 "vld1.u16 {d0, d1, d2, d3}, [%[dst]] \n\t" // loa d sixteen dst RGB565 pixels
600 //set PREFETCH_DISTANCE to 128
601 "pld [%[dst], #128] \n\t"
602
603 "subs r4, r4, #1 \n\t" // dec rement loop counter
604
605 "vmov d24, r5, r6 \n\t" // src red to d24
606 "vmov d25, r7, r8 \n\t" // src green to d25
607 "vmov d26, r9, r10 \n\t" // src blue to d26
608 "vmov d27, r11, r12 \n\t" // src alpha to d27
609
610 "vmov.u16 q3, #256 \n\t" // set up constant
611 "vmovl.u8 q14, d27 \n\t" // wid en alpha to 16 bits
612 // dst_scale = q14
613 "vsub.u16 q14, q3, q14 \n\t" // 256 - sa
614 "vshr.u16 q14, q14, #3 \n\t" // (25 6 - sa) >> 3
615
616
617 // dst_0_rgb = {q8, q9, q10}
618 "vshl.u16 q9, q0, #5 \n\t" // shi ft green to top of lanes
619 "vand q10, q0, q15 \n\t" // ext ract blue
620 "vshr.u16 q8, q0, #11 \n\t" // ext ract red
621 "vshr.u16 q9, q9, #10 \n\t" // ext ract green
622
623 //use q3 for dst_1 green. In the next loop, needs to set q3 to 256 again.
624 // dst_1_rgb = {q2, q3, q7}
625 "vshl.u16 q3, q1, #5 \n\t" // shi ft green to top of lanes
626 "vand q7, q1, q15 \n\t" // ext ract blue
627 "vshr.u16 q2, q1, #11 \n\t" // ext ract red
628 "vshr.u16 q3, q3, #10 \n\t" // ext ract green
629
630 // srcrgba = {q4, q5, q6, q14}, alpha calculation is done alre ady in above.
631 // q4, q5, q6 will have each channel's result of dst_1_rgb.
632 "vmovl.u8 q4, d24 \n\t" // wid en red to 16 bits
633 "vmovl.u8 q5, d25 \n\t" // wid en green to 16 bits
634 "vmovl.u8 q6, d26 \n\t" // wid en blue to 16 bits
635
636 // srcrgba = {q11, q12, q13, q14}, alpha calculation is done a lready in above.
637 // q11, q12, q13 will have each channel's result of dst_0_rgb.
638 "vmovl.u8 q11, d24 \n\t" // wid en red to 16 bits
639 "vmovl.u8 q12, d25 \n\t" // wid en green to 16 bits
640 "vmovl.u8 q13, d26 \n\t" // wid en blue to 16 bits
641
642 "vshl.u16 q11, q11, #2 \n\t" // dst 0 red result = src_red << 2 (later will do >> 5 to make 5 bit red)
643 "vshl.u16 q12, q12, #3 \n\t" // dst 0 grn result = src_grn << 3 (later will do >> 5 to make 6 bit grn)
644 "vshl.u16 q13, q13, #2 \n\t" // dst 0 blu result = src_blu << 2 (later will do >> 5 to make 5 bit blu)
645
646 "vshl.u16 q4, q4, #2 \n\t" // dst 1 red result = src_red << 2 (later will do >> 5 to make 5 bit red)
647 "vshl.u16 q5, q5, #3 \n\t" // dst 1 grn result = src_grn << 3 (later will do >> 5 to make 6 bit grn)
648 "vshl.u16 q6, q6, #2 \n\t" // dst 1 blu result = src_blu << 2 (later will do >> 5 to make 5 bit blu)
649
650 "vmla.u16 q11, q8, q14 \n\t" // dst 0 red result += dst_red * dst_scale
651 "vmla.u16 q12, q9, q14 \n\t" // dst 0 grn result += dst_grn * dst_scale
652 "vmla.u16 q13, q10, q14 \n\t" // dst 0 blu result += dst_blu * dst_scale
653
654 "vmla.u16 q4, q2, q14 \n\t" // dst 1 red result += dst_red * dst_scale
655 "vmla.u16 q5, q3, q14 \n\t" // dst 1 grn result += dst_grn * dst_scale
656 "vmla.u16 q6, q7, q14 \n\t" // dst 1 blu result += dst_blu * dst_scale
657
658 "vshr.u16 q11, q11, #5 \n\t" // dst 0 red result >> 5
659 "vshr.u16 q12, q12, #5 \n\t" // dst 0 grn result >> 5
660 "vshr.u16 q13, q13, #5 \n\t" // dst 0 blu result >> 5
661
662 "vshr.u16 q4, q4, #5 \n\t" // dst 1 red result >> 5
663 "vshr.u16 q5, q5, #5 \n\t" // dst 1 grn result >> 5
664 "vshr.u16 q14, q6, #5 \n\t" // dst 1 blu result >> 5
665
666 "vsli.u16 q13, q12, #5 \n\t" // dst 0 insert green into blue
667 "vsli.u16 q13, q11, #11 \n\t" // dst 0 insert red into green/blue
668
669 "vsli.u16 q14, q5, #5 \n\t" // dst 1 insert green into blue
670 "vsli.u16 q14, q4, #11 \n\t" // dst 1 insert red into green/blue
671
672 "vst1.16 {d26, d27, d28, d29}, [%[dst]]! \n\t" // wri te pixel back to dst 0 and dst 1, update ptr
673
674 "bne 1b \n\t" // if counter != 0, loop
675 "2: \n\t" // exi t
676
677 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
678 :
679 : "cc", "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", " r11", "r12",
680 "d0", "d1", "d2", "d3", "d4", "d5", "d 6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16",
681 "d17", "d18", "d19", "d20", "d21", "d2 2", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
682 );
683 #endif
684 count &= 0xF;
685 if (count > 0) {
686 do {
687 uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
688 *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
689 dst += 1;
690 } while (--count != 0);
691 }
692 }
693
468 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) { 694 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
469 prod += vdupq_n_u16(128); 695 prod += vdupq_n_u16(128);
470 prod += vshrq_n_u16(prod, 8); 696 prod += vshrq_n_u16(prod, 8);
471 return vshrq_n_u16(prod, 8); 697 return vshrq_n_u16(prod, 8);
472 } 698 }
473 699
474 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, 700 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
475 const SkPMColor* SK_RESTRICT src, int count, 701 const SkPMColor* SK_RESTRICT src, int count,
476 U8CPU alpha, int /*x*/, int /*y*/) { 702 U8CPU alpha, int /*x*/, int /*y*/) {
477 SkASSERT(255 > alpha); 703 SkASSERT(255 > alpha);
(...skipping 1180 matching lines...) Expand 10 before | Expand all | Expand 10 after
1658 // https://code.google.com/p/skia/issues/detail?id=2797 1884 // https://code.google.com/p/skia/issues/detail?id=2797
1659 #endif 1885 #endif
1660 1886
1661 // dither 1887 // dither
1662 S32_D565_Opaque_Dither_neon, 1888 S32_D565_Opaque_Dither_neon,
1663 S32_D565_Blend_Dither_neon, 1889 S32_D565_Blend_Dither_neon,
1664 S32A_D565_Opaque_Dither_neon, 1890 S32A_D565_Opaque_Dither_neon,
1665 NULL, // S32A_D565_Blend_Dither 1891 NULL, // S32A_D565_Blend_Dither
1666 }; 1892 };
1667 1893
1894 const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[] = {
1895 #if 0
1896 Color32_D565_neon,
1897 Color32A_D565_neon,
1898 Color32_D565_Dither_neon,
1899 Color32A_D565_Dither_neon
1900 #else
1901 // TODO: stop cheating and fill in the above specializations!
1902 Color32A_D565_neon,
1903 Color32A_D565_neon,
1904 Color32A_D565_neon,
1905 Color32A_D565_neon,
1906 #endif
1907 };
1908
1668 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { 1909 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
1669 NULL, // S32_Opaque, 1910 NULL, // S32_Opaque,
1670 S32_Blend_BlitRow32_neon, // S32_Blend, 1911 S32_Blend_BlitRow32_neon, // S32_Blend,
1671 /* 1912 /*
1672 * We have two choices for S32A_Opaque procs. The one reads the src alpha 1913 * We have two choices for S32A_Opaque procs. The one reads the src alpha
1673 * value and attempts to optimize accordingly. The optimization is 1914 * value and attempts to optimize accordingly. The optimization is
1674 * sensitive to the source content and is not a win in all cases. For 1915 * sensitive to the source content and is not a win in all cases. For
1675 * example, if there are a lot of transitions between the alpha states, 1916 * example, if there are a lot of transitions between the alpha states,
1676 * the performance will almost certainly be worse. However, for many 1917 * the performance will almost certainly be worse. However, for many
1677 * common cases the performance is equivalent or better than the standard 1918 * common cases the performance is equivalent or better than the standard
1678 * case where we do not inspect the src alpha. 1919 * case where we do not inspect the src alpha.
1679 */ 1920 */
1680 #if SK_A32_SHIFT == 24 1921 #if SK_A32_SHIFT == 24
1681 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor 1922 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor
1682 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, 1923 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,
1683 #else 1924 #else
1684 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, 1925 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,
1685 #endif 1926 #endif
1686 #ifdef SK_CPU_ARM32 1927 #ifdef SK_CPU_ARM32
1687 S32A_Blend_BlitRow32_neon // S32A_Blend 1928 S32A_Blend_BlitRow32_neon // S32A_Blend
1688 #else 1929 #else
1689 NULL 1930 NULL
1690 #endif 1931 #endif
1691 }; 1932 };
OLDNEW
« src/core/SkBlitter_RGB16.cpp ('K') | « src/opts/SkBlitRow_opts_arm_neon.h ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698