| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2014 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2014 The LibYuv Project Authors. All rights reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 545 matching lines...) |
| 556 [kYToRgb]"r"(&yuvconstants->kYToRgb) | 556 [kYToRgb]"r"(&yuvconstants->kYToRgb) |
| 557 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 557 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
| 558 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 558 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
| 559 ); | 559 ); |
| 560 } | 560 } |
| 561 | 561 |
| 562 void YUY2ToARGBRow_NEON(const uint8* src_yuy2, | 562 void YUY2ToARGBRow_NEON(const uint8* src_yuy2, |
| 563 uint8* dst_argb, | 563 uint8* dst_argb, |
| 564 const struct YuvConstants* yuvconstants, | 564 const struct YuvConstants* yuvconstants, |
| 565 int width) { | 565 int width) { |
| 566 int64 width64 = (int64)(width); | |
| 567 asm volatile ( | 566 asm volatile ( |
| 568 YUVTORGB_SETUP | 567 YUVTORGB_SETUP |
| 569 "movi v23.8b, #255 \n" | 568 "movi v23.8b, #255 \n" |
| 570 "1: \n" | 569 "1: \n" |
| 571 READYUY2 | 570 READYUY2 |
| 572 YUVTORGB(v22, v21, v20) | 571 YUVTORGB(v22, v21, v20) |
| 573 "subs %w2, %w2, #8 \n" | 572 "subs %w2, %w2, #8 \n" |
| 574 MEMACCESS(1) | 573 MEMACCESS(1) |
| 575 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" | 574 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" |
| 576 "b.gt 1b \n" | 575 "b.gt 1b \n" |
| 577 : "+r"(src_yuy2), // %0 | 576 : "+r"(src_yuy2), // %0 |
| 578 "+r"(dst_argb), // %1 | 577 "+r"(dst_argb), // %1 |
| 579 "+r"(width64) // %2 | 578 "+r"(width) // %2 |
| 580 : [kUVToRB]"r"(&yuvconstants->kUVToRB), | 579 : [kUVToRB]"r"(&yuvconstants->kUVToRB), |
| 581 [kUVToG]"r"(&yuvconstants->kUVToG), | 580 [kUVToG]"r"(&yuvconstants->kUVToG), |
| 582 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), | 581 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), |
| 583 [kYToRgb]"r"(&yuvconstants->kYToRgb) | 582 [kYToRgb]"r"(&yuvconstants->kYToRgb) |
| 584 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 583 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
| 585 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 584 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
| 586 ); | 585 ); |
| 587 } | 586 } |
| 588 | 587 |
| 589 void UYVYToARGBRow_NEON(const uint8* src_uyvy, | 588 void UYVYToARGBRow_NEON(const uint8* src_uyvy, |
| 590 uint8* dst_argb, | 589 uint8* dst_argb, |
| 591 const struct YuvConstants* yuvconstants, | 590 const struct YuvConstants* yuvconstants, |
| 592 int width) { | 591 int width) { |
| 593 int64 width64 = (int64)(width); | |
| 594 asm volatile ( | 592 asm volatile ( |
| 595 YUVTORGB_SETUP | 593 YUVTORGB_SETUP |
| 596 "movi v23.8b, #255 \n" | 594 "movi v23.8b, #255 \n" |
| 597 "1: \n" | 595 "1: \n" |
| 598 READUYVY | 596 READUYVY |
| 599 YUVTORGB(v22, v21, v20) | 597 YUVTORGB(v22, v21, v20) |
| 600 "subs %w2, %w2, #8 \n" | 598 "subs %w2, %w2, #8 \n" |
| 601 MEMACCESS(1) | 599 MEMACCESS(1) |
| 602 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" | 600 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" |
| 603 "b.gt 1b \n" | 601 "b.gt 1b \n" |
| 604 : "+r"(src_uyvy), // %0 | 602 : "+r"(src_uyvy), // %0 |
| 605 "+r"(dst_argb), // %1 | 603 "+r"(dst_argb), // %1 |
| 606 "+r"(width64) // %2 | 604 "+r"(width) // %2 |
| 607 : [kUVToRB]"r"(&yuvconstants->kUVToRB), | 605 : [kUVToRB]"r"(&yuvconstants->kUVToRB), |
| 608 [kUVToG]"r"(&yuvconstants->kUVToG), | 606 [kUVToG]"r"(&yuvconstants->kUVToG), |
| 609 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), | 607 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), |
| 610 [kYToRgb]"r"(&yuvconstants->kYToRgb) | 608 [kYToRgb]"r"(&yuvconstants->kYToRgb) |
| 611 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 609 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
| 612 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 610 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
| 613 ); | 611 ); |
| 614 } | 612 } |
| 615 | 613 |
| 616 // Reads 16 pairs of UV and writes even values to dst_u and odd to dst_v. | 614 // Reads 16 pairs of UV and writes even values to dst_u and odd to dst_v. |
| (...skipping 57 matching lines...) |
| 674 : // Input registers | 672 : // Input registers |
| 675 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List | 673 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
| 676 ); | 674 ); |
| 677 } | 675 } |
| 678 | 676 |
| 679 // SetRow writes 'count' bytes using an 8 bit value repeated. | 677 // SetRow writes 'count' bytes using an 8 bit value repeated. |
| 680 void SetRow_NEON(uint8* dst, uint8 v8, int count) { | 678 void SetRow_NEON(uint8* dst, uint8 v8, int count) { |
| 681 asm volatile ( | 679 asm volatile ( |
| 682 "dup v0.16b, %w2 \n" // duplicate 16 bytes | 680 "dup v0.16b, %w2 \n" // duplicate 16 bytes |
| 683 "1: \n" | 681 "1: \n" |
| 684 "subs %w1, %w1, #16 \n" // 16 bytes per loop | 682 "subs %w1, %w1, #16 \n" // 16 bytes per loop |
| 685 MEMACCESS(0) | 683 MEMACCESS(0) |
| 686 "st1 {v0.16b}, [%0], #16 \n" // store | 684 "st1 {v0.16b}, [%0], #16 \n" // store |
| 687 "b.gt 1b \n" | 685 "b.gt 1b \n" |
| 688 : "+r"(dst), // %0 | 686 : "+r"(dst), // %0 |
| 689 "+r"(count) // %1 | 687 "+r"(count) // %1 |
| 690 : "r"(v8) // %2 | 688 : "r"(v8) // %2 |
| 691 : "cc", "memory", "v0" | 689 : "cc", "memory", "v0" |
| 692 ); | 690 ); |
| 693 } | 691 } |
| 694 | 692 |
| 695 void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) { | 693 void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) { |
| 696 asm volatile ( | 694 asm volatile ( |
| 697 "dup v0.4s, %w2 \n" // duplicate 4 ints | 695 "dup v0.4s, %w2 \n" // duplicate 4 ints |
| 698 "1: \n" | 696 "1: \n" |
| 699 "subs %w1, %w1, #4 \n" // 4 ints per loop | 697 "subs %w1, %w1, #4 \n" // 4 ints per loop |
| 700 MEMACCESS(0) | 698 MEMACCESS(0) |
| 701 "st1 {v0.16b}, [%0], #16 \n" // store | 699 "st1 {v0.16b}, [%0], #16 \n" // store |
| 702 "b.gt 1b \n" | 700 "b.gt 1b \n" |
| 703 : "+r"(dst), // %0 | 701 : "+r"(dst), // %0 |
| 704 "+r"(count) // %1 | 702 "+r"(count) // %1 |
| 705 : "r"(v32) // %2 | 703 : "r"(v32) // %2 |
| 706 : "cc", "memory", "v0" | 704 : "cc", "memory", "v0" |
| 707 ); | 705 ); |
| 708 } | 706 } |
| 709 | 707 |
| 710 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { | 708 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { |
| 711 int64 width64 = (int64) width; | |
| 712 asm volatile ( | 709 asm volatile ( |
| 713 // Start at end of source row. | 710 // Start at end of source row. |
| 714 "add %0, %0, %2 \n" | 711 "add %0, %0, %w2, sxtw \n" |
| 715 "sub %0, %0, #16 \n" | 712 "sub %0, %0, #16 \n" |
| 716 | |
| 717 "1: \n" | 713 "1: \n" |
| 718 MEMACCESS(0) | 714 MEMACCESS(0) |
| 719 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 | 715 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 |
| 720 "subs %2, %2, #16 \n" // 16 pixels per loop. | 716 "subs %w2, %w2, #16 \n" // 16 pixels per loop. |
| 721 "rev64 v0.16b, v0.16b \n" | 717 "rev64 v0.16b, v0.16b \n" |
| 722 MEMACCESS(1) | 718 MEMACCESS(1) |
| 723 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 | 719 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 |
| 724 MEMACCESS(1) | 720 MEMACCESS(1) |
| 725 "st1 {v0.D}[0], [%1], #8 \n" | 721 "st1 {v0.D}[0], [%1], #8 \n" |
| 726 "b.gt 1b \n" | 722 "b.gt 1b \n" |
| 727 : "+r"(src), // %0 | 723 : "+r"(src), // %0 |
| 728 "+r"(dst), // %1 | 724 "+r"(dst), // %1 |
| 729 "+r"(width64) // %2 | 725 "+r"(width) // %2 |
| 730 : "r"((ptrdiff_t)-16) // %3 | 726 : "r"((ptrdiff_t)-16) // %3 |
| 731 : "cc", "memory", "v0" | 727 : "cc", "memory", "v0" |
| 732 ); | 728 ); |
| 733 } | 729 } |
| 734 | 730 |
| 735 void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, | 731 void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, |
| 736 int width) { | 732 int width) { |
| 737 int64 width64 = (int64) width; | |
| 738 asm volatile ( | 733 asm volatile ( |
| 739 // Start at end of source row. | 734 // Start at end of source row. |
| 740 "add %0, %0, %3, lsl #1 \n" | 735 "add %0, %0, %w3, sxtw #1 \n" |
| 741 "sub %0, %0, #16 \n" | 736 "sub %0, %0, #16 \n" |
| 742 | |
| 743 "1: \n" | 737 "1: \n" |
| 744 MEMACCESS(0) | 738 MEMACCESS(0) |
| 745 "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 | 739 "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 |
| 746 "subs %3, %3, #8 \n" // 8 pixels per loop. | 740 "subs %w3, %w3, #8 \n" // 8 pixels per loop. |
| 747 "rev64 v0.8b, v0.8b \n" | 741 "rev64 v0.8b, v0.8b \n" |
| 748 "rev64 v1.8b, v1.8b \n" | 742 "rev64 v1.8b, v1.8b \n" |
| 749 MEMACCESS(1) | 743 MEMACCESS(1) |
| 750 "st1 {v0.8b}, [%1], #8 \n" // dst += 8 | 744 "st1 {v0.8b}, [%1], #8 \n" // dst += 8 |
| 751 MEMACCESS(2) | 745 MEMACCESS(2) |
| 752 "st1 {v1.8b}, [%2], #8 \n" | 746 "st1 {v1.8b}, [%2], #8 \n" |
| 753 "b.gt 1b \n" | 747 "b.gt 1b \n" |
| 754 : "+r"(src_uv), // %0 | 748 : "+r"(src_uv), // %0 |
| 755 "+r"(dst_u), // %1 | 749 "+r"(dst_u), // %1 |
| 756 "+r"(dst_v), // %2 | 750 "+r"(dst_v), // %2 |
| 757 "+r"(width64) // %3 | 751 "+r"(width) // %3 |
| 758 : "r"((ptrdiff_t)-16) // %4 | 752 : "r"((ptrdiff_t)-16) // %4 |
| 759 : "cc", "memory", "v0", "v1" | 753 : "cc", "memory", "v0", "v1" |
| 760 ); | 754 ); |
| 761 } | 755 } |
| 762 | 756 |
| 763 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { | 757 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { |
| 764 int64 width64 = (int64) width; | |
| 765 asm volatile ( | 758 asm volatile ( |
| 766 // Start at end of source row. | 759 // Start at end of source row. |
| 767 "add %0, %0, %2, lsl #2 \n" | 760 "add %0, %0, %w2, sxtw #2 \n" |
| 768 "sub %0, %0, #16 \n" | 761 "sub %0, %0, #16 \n" |
| 769 | |
| 770 "1: \n" | 762 "1: \n" |
| 771 MEMACCESS(0) | 763 MEMACCESS(0) |
| 772 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 | 764 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 |
| 773 "subs %2, %2, #4 \n" // 4 pixels per loop. | 765 "subs %w2, %w2, #4 \n" // 4 pixels per loop. |
| 774 "rev64 v0.4s, v0.4s \n" | 766 "rev64 v0.4s, v0.4s \n" |
| 775 MEMACCESS(1) | 767 MEMACCESS(1) |
| 776 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 | 768 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 |
| 777 MEMACCESS(1) | 769 MEMACCESS(1) |
| 778 "st1 {v0.D}[0], [%1], #8 \n" | 770 "st1 {v0.D}[0], [%1], #8 \n" |
| 779 "b.gt 1b \n" | 771 "b.gt 1b \n" |
| 780 : "+r"(src), // %0 | 772 : "+r"(src), // %0 |
| 781 "+r"(dst), // %1 | 773 "+r"(dst), // %1 |
| 782 "+r"(width64) // %2 | 774 "+r"(width) // %2 |
| 783 : "r"((ptrdiff_t)-16) // %3 | 775 : "r"((ptrdiff_t)-16) // %3 |
| 784 : "cc", "memory", "v0" | 776 : "cc", "memory", "v0" |
| 785 ); | 777 ); |
| 786 } | 778 } |
| 787 | 779 |
| 788 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) { | 780 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) { |
| 789 asm volatile ( | 781 asm volatile ( |
| 790 "movi v4.8b, #255 \n" // Alpha | 782 "movi v4.8b, #255 \n" // Alpha |
| 791 "1: \n" | 783 "1: \n" |
| 792 MEMACCESS(0) | 784 MEMACCESS(0) |
| 793 "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. | 785 "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. |
| 794 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 786 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 795 MEMACCESS(1) | 787 MEMACCESS(1) |
| 796 "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels | 788 "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels |
| 797 "b.gt 1b \n" | 789 "b.gt 1b \n" |
| 798 : "+r"(src_rgb24), // %0 | 790 : "+r"(src_rgb24), // %0 |
| 799 "+r"(dst_argb), // %1 | 791 "+r"(dst_argb), // %1 |
| 800 "+r"(width) // %2 | 792 "+r"(width) // %2 |
| 801 : | 793 : |
| 802 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List | 794 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List |
| 803 ); | 795 ); |
| 804 } | 796 } |
| 805 | 797 |
| 806 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) { | 798 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) { |
| 807 asm volatile ( | 799 asm volatile ( |
| 808 "movi v5.8b, #255 \n" // Alpha | 800 "movi v5.8b, #255 \n" // Alpha |
| 809 "1: \n" | 801 "1: \n" |
| 810 MEMACCESS(0) | 802 MEMACCESS(0) |
| 811 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b | 803 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b |
| 812 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 804 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 813 "orr v3.8b, v1.8b, v1.8b \n" // move g | 805 "orr v3.8b, v1.8b, v1.8b \n" // move g |
| 814 "orr v4.8b, v0.8b, v0.8b \n" // move r | 806 "orr v4.8b, v0.8b, v0.8b \n" // move r |
| 815 MEMACCESS(1) | 807 MEMACCESS(1) |
| 816 "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a | 808 "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a |
| 817 "b.gt 1b \n" | 809 "b.gt 1b \n" |
| 818 : "+r"(src_raw), // %0 | 810 : "+r"(src_raw), // %0 |
| 819 "+r"(dst_argb), // %1 | 811 "+r"(dst_argb), // %1 |
| 820 "+r"(width) // %2 | 812 "+r"(width) // %2 |
| 821 : | 813 : |
| 822 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List | 814 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List |
| 823 ); | 815 ); |
| 824 } | 816 } |
| 825 | 817 |
| 826 void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { | 818 void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { |
| 827 asm volatile ( | 819 asm volatile ( |
| 828 "1: \n" | 820 "1: \n" |
| 829 MEMACCESS(0) | 821 MEMACCESS(0) |
| 830 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b | 822 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b |
| (...skipping 1977 matching lines...) |
| 2808 "r"(6LL) // %5 | 2800 "r"(6LL) // %5 |
| 2809 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List | 2801 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
| 2810 ); | 2802 ); |
| 2811 } | 2803 } |
| 2812 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) | 2804 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) |
| 2813 | 2805 |
| 2814 #ifdef __cplusplus | 2806 #ifdef __cplusplus |
| 2815 } // extern "C" | 2807 } // extern "C" |
| 2816 } // namespace libyuv | 2808 } // namespace libyuv |
| 2817 #endif | 2809 #endif |
| OLD | NEW |