Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(169)

Side by Side Diff: src/opts/SkBlitRow_opts_arm_neon.cpp

Issue 845293002: skia: blend32_16_row for neon version (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: Created 5 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/opts/SkBlitRow_opts_arm_neon.h ('k') | src/opts/SkBlitRow_opts_mips_dsp.cpp » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2012 The Android Open Source Project 2 * Copyright 2012 The Android Open Source Project
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #include "SkBlitRow_opts_arm_neon.h" 8 #include "SkBlitRow_opts_arm_neon.h"
9 9
10 #include "SkBlitMask.h" 10 #include "SkBlitMask.h"
(...skipping 447 matching lines...) Expand 10 before | Expand all | Expand 10 after
458 SkPMColorAssert(c); 458 SkPMColorAssert(c);
459 if (c) { 459 if (c) {
460 *dst = SkSrcOver32To16(c, *dst); 460 *dst = SkSrcOver32To16(c, *dst);
461 } 461 }
462 dst += 1; 462 dst += 1;
463 } while (--count != 0); 463 } while (--count != 0);
464 } 464 }
465 } 465 }
466 #endif // #ifdef SK_CPU_ARM32 466 #endif // #ifdef SK_CPU_ARM32
467 467
468 static uint32_t pmcolor_to_expand16(SkPMColor c) {
469 unsigned r = SkGetPackedR32(c);
470 unsigned g = SkGetPackedG32(c);
471 unsigned b = SkGetPackedB32(c);
472 return (g << 24) | (r << 13) | (b << 2);
473 }
474
475 void blend32_16_row_neon(const SkPMColor* SK_RESTRICT src,
476 uint16_t dst[], int count) {
477 uint32_t src_expand;
478 unsigned scale;
479
480 if (count <= 0) return;
481 SkASSERT(((size_t)dst & 0x01) == 0);
482
483 /*
484 * This preamble code is in order to make dst aligned to 8 bytes
485 * in the next mutiple bytes read & write access.
486 */
487 src_expand = pmcolor_to_expand16(*src);
488 scale = SkAlpha255To256(0xFF - SkGetPackedA32(*src)) >> 3;
489
490 #define DST_ALIGN 8
491
492 /*
493 * preamble_size is in byte, meantime, this blend32_16_row_neon updates 2 by tes at a time.
494 */
495 int preamble_size = (DST_ALIGN - (size_t)dst) & (DST_ALIGN - 1);
496
497 for (int i = 0; i < preamble_size; i+=2, dst++) {
498 uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
499 *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
500 if (--count == 0)
501 break;
502 }
503 #ifdef SK_CPU_ARM64
504 asm (
505 "lsr x2, %[count], #4 \n\t"
506 "mov x1, x2 \n\t" // calc. count>>4
507 "cbz x1, back \n\t" // if (count>>4) == 0, exit
508 "ld4 {v24.8b, v25.8b, v26.8b, v27.8b}, [%[src]] \n\t" // load eight src ABGR32 pixels
509
510 "uxtl v4.8h, v24.8b \n\t" // widen red to 16 bits
511 "uxtl v5.8h, v25.8b \n\t" // widen green to 16 bits
512 "uxtl v6.8h, v26.8b \n\t" // widen blue to 16 bits
513
514 "shl v4.8h, v4.8h, #2 \n\t" // src red = src_red << 2 (later will do >> 5 to make 5 bit red)
515 "shl v5.8h, v5.8h, #3 \n\t" // src grn = src_grn << 3 (later will do >> 5 to make 6 bit grn)
516 "shl v6.8h, v6.8h, #2 \n\t" // src blu = src_blu << 2 (later will do >> 5 to make 5 bit blu)
517
518 "movi v21.8h, #1, lsl#8 \n\t" // set up constant 256 (1<<8)
519 "uxtl v14.8h, v27.8b \n\t" // widen alpha to 16 bits
520 "sub v14.8h, v21.8h, v14.8h \n\t" // 256 - sa
521 "ushr v14.8h, v14.8h, #3 \n\t" // (256 - sa) >> 3
522
523 "front: \n\t"
524 "ld1 {v0.8h, v1.8h}, [%[dst]] \n\t" // load sixteen dst RGB565 pixels
525 //set PREFETCH_DISTANCE to 128
526 "prfum pldl1keep, [%[dst], #128] \n\t"
527
528 "subs x1, x1, #1 \n\t" // decrement loop counter
529
530 "shl v9.8h, v0.8h, #5 \n\t" // shift green to top of lanes
531
532 "shl v10.8h, v0.8h, #11 \n\t" // shift blue to top of lanes
533 "ushr v10.8h, v10.8h, #11 \n\t" // extract blue
534
535 "ushr v8.8h, v0.8h, #11 \n\t" // extract red
536 "ushr v9.8h, v9.8h, #10 \n\t" // extract green
537
538 "shl v3.8h, v1.8h, #5 \n\t" // shift green to top of lanes
539
540 "shl v7.8h, v1.8h, #11 \n\t" // shift blue to top of lanes
541 "ushr v7.8h, v7.8h, #11 \n\t" // extract blue
542
543 "ushr v2.8h, v1.8h, #11 \n\t" // extract red
544 "ushr v3.8h, v3.8h, #10 \n\t" // extract green
545
546 //If we use src in mla, directly, vd.8h is updated by mla, so need to calculate src in next loop again, which requires 12 cycles.
547 //Instead, copy src to other registers and use them as a mla d estination.
548 //6 shl commands are needed, but we don't need to update src. In total, will get 6 cycle benefit in each loop.
549
550 "shl v15.8h, v4.8h, #0 \n\t" // copy dst 0 red result
551 "shl v16.8h, v5.8h, #0 \n\t" // copy dst 0 grn result
552 "shl v17.8h, v6.8h, #0 \n\t" // copy dst 0 blu result
553 "mla v15.8h, v8.8h, v14.8h \n\t" // dst 0 red result = dst_red * dst_scale
554 "mla v16.8h, v9.8h, v14.8h \n\t" // dst 0 grn result = dst_grn * dst_scale
555 "mla v17.8h, v10.8h, v14.8h \n\t" // dst 0 blu result = dst_blu * dst_scale
556
557 "shl v20.8h, v4.8h, #0 \n\t" // copy dst 1 red result
558 "shl v19.8h, v5.8h, #0 \n\t" // copy dst 1 grn result
559 "shl v18.8h, v6.8h, #0 \n\t" // copy dst 1 blu result
560 "mla v20.8h, v2.8h, v14.8h \n\t" // dst 1 red result = dst_red * dst_scale
561 "mla v19.8h, v3.8h, v14.8h \n\t" // dst 1 grn result = dst_grn * dst_scale
562 "mla v18.8h, v7.8h, v14.8h \n\t" // dst 1 blu result = dst_blu * dst_scale
563
564 "ushr v15.8h, v15.8h, #5 \n\t" // dst 0 red result >> 5
565 "ushr v16.8h, v16.8h, #5 \n\t" // dst 0 grn result >> 5
566 "ushr v17.8h, v17.8h, #5 \n\t" // dst 0 blu result >> 5
567
568 "ushr v20.8h, v20.8h, #5 \n\t" // dst 1 red result >> 5
569 "ushr v19.8h, v19.8h, #5 \n\t" // dst 1 grn result >> 5
570 "ushr v18.8h, v18.8h, #5 \n\t" // dst 1 blu result >> 5
571
572 "sli v17.8h, v16.8h, #5 \n\t" // dst 0 insert green into blue
573 "sli v17.8h, v15.8h, #11 \n\t" // dst 0 insert red into green/blue
574
575 "sli v18.8h, v19.8h, #5 \n\t" // dst 1 insert green into blue
576 "sli v18.8h, v20.8h, #11 \n\t" // dst 1 insert red into green/blue
577
578 "st1 {v17.8h, v18.8h}, [%[dst]], #32 \n\t" // write pixel back to dst 0 and dst 1, update ptr
579 "cbnz x1, front \n\t" // if counter !=0, loop
580 "back: \n\t" // exit
581
582 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
583 : : "x1", "x2", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" , "v8", "v9", "v10", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",
584 "v24", "v25", "v26", "v27"
585 );
586 #else
587 asm volatile (
588 "movs r4, %[count], lsr #4 \n\t" // cal c. count>>4
589 "beq 2f \n\t" // if (count>>4) == 0, exit
590 "vmov.u16 q15, #0x1f \n\t" // set up blue mask
591 "vld4.u8 {d24, d25, d26, d27}, [%[src]] \n\t" // loa d eight src ABGR32 pixels
592
593 "vmov r5, r6, d24 \n\t" // sav e src red in r5, r6
594 "vmov r7, r8, d25 \n\t" // sav e src green in r7, r8
595 "vmov r9, r10, d26 \n\t" // sav e src blue in r9, r10
596 "vmov r11, r12, d27 \n\t" // sav e src alpha in r11, r12
597
598
599 "1: \n\t"
600 "vld1.u16 {d0, d1, d2, d3}, [%[dst]] \n\t" // loa d sixteen dst RGB565 pixels
601 //set PREFETCH_DISTANCE to 128
602 "pld [%[dst], #128] \n\t"
603
604 "subs r4, r4, #1 \n\t" // dec rement loop counter
605
606 "vmov d24, r5, r6 \n\t" // src red to d24
607 "vmov d25, r7, r8 \n\t" // src green to d25
608 "vmov d26, r9, r10 \n\t" // src blue to d26
609 "vmov d27, r11, r12 \n\t" // src alpha to d27
610
611 "vmov.u16 q3, #256 \n\t" // set up constant
612 "vmovl.u8 q14, d27 \n\t" // wid en alpha to 16 bits
613 // dst_scale = q14
614 "vsub.u16 q14, q3, q14 \n\t" // 256 - sa
615 "vshr.u16 q14, q14, #3 \n\t" // (25 6 - sa) >> 3
616
617
618 // dst_0_rgb = {q8, q9, q10}
619 "vshl.u16 q9, q0, #5 \n\t" // shi ft green to top of lanes
620 "vand q10, q0, q15 \n\t" // ext ract blue
621 "vshr.u16 q8, q0, #11 \n\t" // ext ract red
622 "vshr.u16 q9, q9, #10 \n\t" // ext ract green
623
624 //use q3 for dst_1 green. In the next loop, needs to set q3 to 256 again.
625 // dst_1_rgb = {q2, q3, q7}
626 "vshl.u16 q3, q1, #5 \n\t" // shi ft green to top of lanes
627 "vand q7, q1, q15 \n\t" // ext ract blue
628 "vshr.u16 q2, q1, #11 \n\t" // ext ract red
629 "vshr.u16 q3, q3, #10 \n\t" // ext ract green
630
631 // srcrgba = {q4, q5, q6, q14}, alpha calculation is done alre ady in above.
632 // q4, q5, q6 will have each channel's result of dst_1_rgb.
633 "vmovl.u8 q4, d24 \n\t" // wid en red to 16 bits
634 "vmovl.u8 q5, d25 \n\t" // wid en green to 16 bits
635 "vmovl.u8 q6, d26 \n\t" // wid en blue to 16 bits
636
637 // srcrgba = {q11, q12, q13, q14}, alpha calculation is done a lready in above.
638 // q11, q12, q13 will have each channel's result of dst_0_rgb.
639 "vmovl.u8 q11, d24 \n\t" // wid en red to 16 bits
640 "vmovl.u8 q12, d25 \n\t" // wid en green to 16 bits
641 "vmovl.u8 q13, d26 \n\t" // wid en blue to 16 bits
642
643 "vshl.u16 q11, q11, #2 \n\t" // dst 0 red result = src_red << 2 (later will do >> 5 to make 5 bit red)
644 "vshl.u16 q12, q12, #3 \n\t" // dst 0 grn result = src_grn << 3 (later will do >> 5 to make 6 bit grn)
645 "vshl.u16 q13, q13, #2 \n\t" // dst 0 blu result = src_blu << 2 (later will do >> 5 to make 5 bit blu)
646
647 "vshl.u16 q4, q4, #2 \n\t" // dst 1 red result = src_red << 2 (later will do >> 5 to make 5 bit red)
648 "vshl.u16 q5, q5, #3 \n\t" // dst 1 grn result = src_grn << 3 (later will do >> 5 to make 6 bit grn)
649 "vshl.u16 q6, q6, #2 \n\t" // dst 1 blu result = src_blu << 2 (later will do >> 5 to make 5 bit blu)
650
651 "vmla.u16 q11, q8, q14 \n\t" // dst 0 red result += dst_red * dst_scale
652 "vmla.u16 q12, q9, q14 \n\t" // dst 0 grn result += dst_grn * dst_scale
653 "vmla.u16 q13, q10, q14 \n\t" // dst 0 blu result += dst_blu * dst_scale
654
655 "vmla.u16 q4, q2, q14 \n\t" // dst 1 red result += dst_red * dst_scale
656 "vmla.u16 q5, q3, q14 \n\t" // dst 1 grn result += dst_grn * dst_scale
657 "vmla.u16 q6, q7, q14 \n\t" // dst 1 blu result += dst_blu * dst_scale
658
659 "vshr.u16 q11, q11, #5 \n\t" // dst 0 red result >> 5
660 "vshr.u16 q12, q12, #5 \n\t" // dst 0 grn result >> 5
661 "vshr.u16 q13, q13, #5 \n\t" // dst 0 blu result >> 5
662
663 "vshr.u16 q4, q4, #5 \n\t" // dst 1 red result >> 5
664 "vshr.u16 q5, q5, #5 \n\t" // dst 1 grn result >> 5
665 "vshr.u16 q14, q6, #5 \n\t" // dst 1 blu result >> 5
666
667 "vsli.u16 q13, q12, #5 \n\t" // dst 0 insert green into blue
668 "vsli.u16 q13, q11, #11 \n\t" // dst 0 insert red into green/blue
669
670 "vsli.u16 q14, q5, #5 \n\t" // dst 1 insert green into blue
671 "vsli.u16 q14, q4, #11 \n\t" // dst 1 insert red into green/blue
672
673 "vst1.16 {d26, d27, d28, d29}, [%[dst]]! \n\t" // wri te pixel back to dst 0 and dst 1, update ptr
674
675 "bne 1b \n\t" // if counter != 0, loop
676 "2: \n\t" // exi t
677
678 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
679 :
680 : "cc", "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", " r11", "r12",
681 "d0", "d1", "d2", "d3", "d4", "d5", "d 6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16",
682 "d17", "d18", "d19", "d20", "d21", "d2 2", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
683 );
684 #endif
685 count &= 0xF;
686 if (count > 0) {
687 do {
688 uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
689 *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
690 dst += 1;
691 } while (--count != 0);
692 }
693 }
694
468 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) { 695 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
469 prod += vdupq_n_u16(128); 696 prod += vdupq_n_u16(128);
470 prod += vshrq_n_u16(prod, 8); 697 prod += vshrq_n_u16(prod, 8);
471 return vshrq_n_u16(prod, 8); 698 return vshrq_n_u16(prod, 8);
472 } 699 }
473 700
474 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, 701 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
475 const SkPMColor* SK_RESTRICT src, int count, 702 const SkPMColor* SK_RESTRICT src, int count,
476 U8CPU alpha, int /*x*/, int /*y*/) { 703 U8CPU alpha, int /*x*/, int /*y*/) {
477 SkASSERT(255 > alpha); 704 SkASSERT(255 > alpha);
(...skipping 1204 matching lines...) Expand 10 before | Expand all | Expand 10 after
1682 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, 1909 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,
1683 #else 1910 #else
1684 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, 1911 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,
1685 #endif 1912 #endif
1686 #ifdef SK_CPU_ARM32 1913 #ifdef SK_CPU_ARM32
1687 S32A_Blend_BlitRow32_neon // S32A_Blend 1914 S32A_Blend_BlitRow32_neon // S32A_Blend
1688 #else 1915 #else
1689 NULL 1916 NULL
1690 #endif 1917 #endif
1691 }; 1918 };
OLDNEW
« no previous file with comments | « src/opts/SkBlitRow_opts_arm_neon.h ('k') | src/opts/SkBlitRow_opts_mips_dsp.cpp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698