OLD | NEW |
---|---|
1 /* | 1 /* |
2 * Copyright 2014 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2014 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 545 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
556 [kYToRgb]"r"(&yuvconstants->kYToRgb) | 556 [kYToRgb]"r"(&yuvconstants->kYToRgb) |
557 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 557 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
558 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 558 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
559 ); | 559 ); |
560 } | 560 } |
561 | 561 |
562 void YUY2ToARGBRow_NEON(const uint8* src_yuy2, | 562 void YUY2ToARGBRow_NEON(const uint8* src_yuy2, |
563 uint8* dst_argb, | 563 uint8* dst_argb, |
564 const struct YuvConstants* yuvconstants, | 564 const struct YuvConstants* yuvconstants, |
565 int width) { | 565 int width) { |
566 int64 width64 = (int64)(width); | |
567 asm volatile ( | 566 asm volatile ( |
568 YUVTORGB_SETUP | 567 YUVTORGB_SETUP |
569 "movi v23.8b, #255 \n" | 568 "movi v23.8b, #255 \n" |
570 "1: \n" | 569 "1: \n" |
571 READYUY2 | 570 READYUY2 |
572 YUVTORGB(v22, v21, v20) | 571 YUVTORGB(v22, v21, v20) |
573 "subs %w2, %w2, #8 \n" | 572 "subs %w2, %w2, #8 \n" |
574 MEMACCESS(1) | 573 MEMACCESS(1) |
575 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" | 574 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" |
576 "b.gt 1b \n" | 575 "b.gt 1b \n" |
577 : "+r"(src_yuy2), // %0 | 576 : "+r"(src_yuy2), // %0 |
578 "+r"(dst_argb), // %1 | 577 "+r"(dst_argb), // %1 |
579 "+r"(width64) // %2 | 578 "+r"(width) // %2 |
580 : [kUVToRB]"r"(&yuvconstants->kUVToRB), | 579 : [kUVToRB]"r"(&yuvconstants->kUVToRB), |
581 [kUVToG]"r"(&yuvconstants->kUVToG), | 580 [kUVToG]"r"(&yuvconstants->kUVToG), |
582 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), | 581 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), |
583 [kYToRgb]"r"(&yuvconstants->kYToRgb) | 582 [kYToRgb]"r"(&yuvconstants->kYToRgb) |
584 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 583 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
585 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 584 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
586 ); | 585 ); |
587 } | 586 } |
588 | 587 |
589 void UYVYToARGBRow_NEON(const uint8* src_uyvy, | 588 void UYVYToARGBRow_NEON(const uint8* src_uyvy, |
590 uint8* dst_argb, | 589 uint8* dst_argb, |
591 const struct YuvConstants* yuvconstants, | 590 const struct YuvConstants* yuvconstants, |
592 int width) { | 591 int width) { |
593 int64 width64 = (int64)(width); | |
594 asm volatile ( | 592 asm volatile ( |
595 YUVTORGB_SETUP | 593 YUVTORGB_SETUP |
596 "movi v23.8b, #255 \n" | 594 "movi v23.8b, #255 \n" |
597 "1: \n" | 595 "1: \n" |
598 READUYVY | 596 READUYVY |
599 YUVTORGB(v22, v21, v20) | 597 YUVTORGB(v22, v21, v20) |
600 "subs %w2, %w2, #8 \n" | 598 "subs %w2, %w2, #8 \n" |
601 MEMACCESS(1) | 599 MEMACCESS(1) |
602 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" | 600 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" |
603 "b.gt 1b \n" | 601 "b.gt 1b \n" |
604 : "+r"(src_uyvy), // %0 | 602 : "+r"(src_uyvy), // %0 |
605 "+r"(dst_argb), // %1 | 603 "+r"(dst_argb), // %1 |
606 "+r"(width64) // %2 | 604 "+r"(width) // %2 |
607 : [kUVToRB]"r"(&yuvconstants->kUVToRB), | 605 : [kUVToRB]"r"(&yuvconstants->kUVToRB), |
608 [kUVToG]"r"(&yuvconstants->kUVToG), | 606 [kUVToG]"r"(&yuvconstants->kUVToG), |
609 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), | 607 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), |
610 [kYToRgb]"r"(&yuvconstants->kYToRgb) | 608 [kYToRgb]"r"(&yuvconstants->kYToRgb) |
611 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 609 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
612 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 610 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
613 ); | 611 ); |
614 } | 612 } |
615 | 613 |
616 // Reads 16 pairs of UV and writes even values to dst_u and odd to dst_v. | 614 // Reads 16 pairs of UV and writes even values to dst_u and odd to dst_v. |
(...skipping 84 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
701 "st1 {v0.16b}, [%0], #16 \n" // store | 699 "st1 {v0.16b}, [%0], #16 \n" // store |
702 "b.gt 1b \n" | 700 "b.gt 1b \n" |
703 : "+r"(dst), // %0 | 701 : "+r"(dst), // %0 |
704 "+r"(count) // %1 | 702 "+r"(count) // %1 |
705 : "r"(v32) // %2 | 703 : "r"(v32) // %2 |
706 : "cc", "memory", "v0" | 704 : "cc", "memory", "v0" |
707 ); | 705 ); |
708 } | 706 } |
709 | 707 |
710 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { | 708 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { |
711 int64 width64 = (int64) width; | 709 src += width - 16; |
712 asm volatile ( | 710 asm volatile ( |
713 // Start at end of source row. | |
714 "add %0, %0, %2 \n" | |
715 "sub %0, %0, #16 \n" | |
716 | |
717 "1: \n" | 711 "1: \n" |
718 MEMACCESS(0) | 712 MEMACCESS(0) |
719 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 | 713 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 |
720 "subs %2, %2, #16 \n" // 16 pixels per loop. | 714 "subs %w2, %w2, #16 \n" // 16 pixels per loop. |
721 "rev64 v0.16b, v0.16b \n" | 715 "rev64 v0.16b, v0.16b \n" |
722 MEMACCESS(1) | 716 MEMACCESS(1) |
723 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 | 717 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 |
724 MEMACCESS(1) | 718 MEMACCESS(1) |
725 "st1 {v0.D}[0], [%1], #8 \n" | 719 "st1 {v0.D}[0], [%1], #8 \n" |
726 "b.gt 1b \n" | 720 "b.gt 1b \n" |
727 : "+r"(src), // %0 | 721 : "+&r"(src), // %0 |
728 "+r"(dst), // %1 | 722 "+&r"(dst), // %1 |
729 "+r"(width64) // %2 | 723 "+&r"(width) // %2 |
730 : "r"((ptrdiff_t)-16) // %3 | 724 : "r"((ptrdiff_t)-16) // %3 |
731 : "cc", "memory", "v0" | 725 : "cc", "memory", "v0" |
732 ); | 726 ); |
733 } | 727 } |
734 | 728 |
729 // TODO(fbarchard): Consider single rev64 | |
735 void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, | 730 void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, |
736 int width) { | 731 int width) { |
737 int64 width64 = (int64) width; | 732 src_uv += width * 2 - 16; |
738 asm volatile ( | 733 asm volatile ( |
739 // Start at end of source row. | |
740 "add %0, %0, %3, lsl #1 \n" | |
741 "sub %0, %0, #16 \n" | |
742 | |
743 "1: \n" | 734 "1: \n" |
744 MEMACCESS(0) | 735 MEMACCESS(0) |
745 "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 | 736 "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 |
746 "subs %3, %3, #8 \n" // 8 pixels per loop. | 737 "subs %w3, %w3, #8 \n" // 8 pixels per loop. |
747 "rev64 v0.8b, v0.8b \n" | 738 "rev64 v0.8b, v0.8b \n" |
748 "rev64 v1.8b, v1.8b \n" | 739 "rev64 v1.8b, v1.8b \n" |
749 MEMACCESS(1) | 740 MEMACCESS(1) |
750 "st1 {v0.8b}, [%1], #8 \n" // dst += 8 | 741 "st1 {v0.8b}, [%1], #8 \n" // dst += 8 |
751 MEMACCESS(2) | 742 MEMACCESS(2) |
752 "st1 {v1.8b}, [%2], #8 \n" | 743 "st1 {v1.8b}, [%2], #8 \n" |
753 "b.gt 1b \n" | 744 "b.gt 1b \n" |
754 : "+r"(src_uv), // %0 | 745 : "+&r"(src_uv), // %0 |
755 "+r"(dst_u), // %1 | 746 "+&r"(dst_u), // %1 |
756 "+r"(dst_v), // %2 | 747 "+&r"(dst_v), // %2 |
757 "+r"(width64) // %3 | 748 "+&r"(width) // %3 |
758 : "r"((ptrdiff_t)-16) // %4 | 749 : "r"((ptrdiff_t)-16) // %4 |
759 : "cc", "memory", "v0", "v1" | 750 : "cc", "memory", "v0", "v1" |
760 ); | 751 ); |
761 } | 752 } |
762 | 753 |
763 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { | 754 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { |
764 int64 width64 = (int64) width; | 755 // Start at end of source row. |
756 src += width * 4 - 16; | |
fbarchard1
2016/06/07 22:24:50
Code generated is:
0000000000000000 <ARGBMirrorRo
| |
765 asm volatile ( | 757 asm volatile ( |
766 // Start at end of source row. | |
767 "add %0, %0, %2, lsl #2 \n" | |
768 "sub %0, %0, #16 \n" | |
769 | |
770 "1: \n" | 758 "1: \n" |
771 MEMACCESS(0) | 759 MEMACCESS(0) |
772 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 | 760 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 |
773 "subs %2, %2, #4 \n" // 4 pixels per loop. | 761 "subs %w2, %w2, #4 \n" // 4 pixels per loop. |
774 "rev64 v0.4s, v0.4s \n" | 762 "rev64 v0.4s, v0.4s \n" |
775 MEMACCESS(1) | 763 MEMACCESS(1) |
776 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 | 764 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 |
777 MEMACCESS(1) | 765 MEMACCESS(1) |
778 "st1 {v0.D}[0], [%1], #8 \n" | 766 "st1 {v0.D}[0], [%1], #8 \n" |
779 "b.gt 1b \n" | 767 "b.gt 1b \n" |
780 : "+r"(src), // %0 | 768 : "+&r"(src), // %0 |
781 "+r"(dst), // %1 | 769 "+&r"(dst), // %1 |
782 "+r"(width64) // %2 | 770 "+&r"(width) // %2 |
783 : "r"((ptrdiff_t)-16) // %3 | 771 : "r"((ptrdiff_t)-16) // %3 |
784 : "cc", "memory", "v0" | 772 : "cc", "memory", "v0" |
785 ); | 773 ); |
786 } | 774 } |
787 | 775 |
788 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) { | 776 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) { |
789 asm volatile ( | 777 asm volatile ( |
790 "movi v4.8b, #255 \n" // Alpha | 778 "movi v4.8b, #255 \n" // Alpha |
791 "1: \n" | 779 "1: \n" |
792 MEMACCESS(0) | 780 MEMACCESS(0) |
793 "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. | 781 "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. |
794 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 782 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
795 MEMACCESS(1) | 783 MEMACCESS(1) |
796 "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels | 784 "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels |
797 "b.gt 1b \n" | 785 "b.gt 1b \n" |
798 : "+r"(src_rgb24), // %0 | 786 : "+r"(src_rgb24), // %0 |
799 "+r"(dst_argb), // %1 | 787 "+r"(dst_argb), // %1 |
800 "+r"(width) // %2 | 788 "+r"(width) // %2 |
801 : | 789 : |
802 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List | 790 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List |
803 ); | 791 ); |
804 } | 792 } |
805 | 793 |
806 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) { | 794 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) { |
807 asm volatile ( | 795 asm volatile ( |
808 "movi v5.8b, #255 \n" // Alpha | 796 "movi v5.8b, #255 \n" // Alpha |
809 "1: \n" | 797 "1: \n" |
810 MEMACCESS(0) | 798 MEMACCESS(0) |
811 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b | 799 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b |
812 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 800 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
813 "orr v3.8b, v1.8b, v1.8b \n" // move g | 801 "orr v3.8b, v1.8b, v1.8b \n" // move g |
814 "orr v4.8b, v0.8b, v0.8b \n" // move r | 802 "orr v4.8b, v0.8b, v0.8b \n" // move r |
815 MEMACCESS(1) | 803 MEMACCESS(1) |
816 "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a | 804 "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a |
817 "b.gt 1b \n" | 805 "b.gt 1b \n" |
818 : "+r"(src_raw), // %0 | 806 : "+r"(src_raw), // %0 |
819 "+r"(dst_argb), // %1 | 807 "+r"(dst_argb), // %1 |
820 "+r"(width) // %2 | 808 "+r"(width) // %2 |
821 : | 809 : |
822 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List | 810 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List |
823 ); | 811 ); |
824 } | 812 } |
825 | 813 |
826 void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { | 814 void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { |
827 asm volatile ( | 815 asm volatile ( |
828 "1: \n" | 816 "1: \n" |
829 MEMACCESS(0) | 817 MEMACCESS(0) |
830 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b | 818 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b |
(...skipping 1977 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
2808 "r"(6LL) // %5 | 2796 "r"(6LL) // %5 |
2809 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List | 2797 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
2810 ); | 2798 ); |
2811 } | 2799 } |
2812 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) | 2800 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) |
2813 | 2801 |
2814 #ifdef __cplusplus | 2802 #ifdef __cplusplus |
2815 } // extern "C" | 2803 } // extern "C" |
2816 } // namespace libyuv | 2804 } // namespace libyuv |
2817 #endif | 2805 #endif |
OLD | NEW |