source/row_neon64.cc - Issue 2043073003: neon64 use width int directly.

Side by Side Diff: source/row_neon64.cc

Issue 2043073003: neon64 use width int directly. (Closed) Base URL: https://chromium.googlesource.com/libyuv/libyuv@master

Patch Set: "remove trailing tab" (created 4 years, 6 months ago)

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright 2014 The LibYuv Project Authors. All rights reserved.	2 * Copyright 2014 The LibYuv Project Authors. All rights reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

(...skipping 545 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
556 [kYToRgb]"r"(&yuvconstants->kYToRgb)	556 [kYToRgb]"r"(&yuvconstants->kYToRgb)

557 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",	557 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",

558 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"	558 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"

559 );	559 );

560 }	560 }

561	561

562 void YUY2ToARGBRow_NEON(const uint8* src_yuy2,	562 void YUY2ToARGBRow_NEON(const uint8* src_yuy2,

563 uint8* dst_argb,	563 uint8* dst_argb,

564 const struct YuvConstants* yuvconstants,	564 const struct YuvConstants* yuvconstants,

565 int width) {	565 int width) {

566 int64 width64 = (int64)(width);

567 asm volatile (	566 asm volatile (

568 YUVTORGB_SETUP	567 YUVTORGB_SETUP

569 "movi v23.8b, #255 \n"	568 "movi v23.8b, #255 \n"

570 "1: \n"	569 "1: \n"

571 READYUY2	570 READYUY2

572 YUVTORGB(v22, v21, v20)	571 YUVTORGB(v22, v21, v20)

573 "subs %w2, %w2, #8 \n"	572 "subs %w2, %w2, #8 \n"

574 MEMACCESS(1)	573 MEMACCESS(1)

575 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"	574 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"

576 "b.gt 1b \n"	575 "b.gt 1b \n"

577 : "+r"(src_yuy2), // %0	576 : "+r"(src_yuy2), // %0

578 "+r"(dst_argb), // %1	577 "+r"(dst_argb), // %1

579 "+r"(width64) // %2	578 "+r"(width) // %2

580 : [kUVToRB]"r"(&yuvconstants->kUVToRB),	579 : [kUVToRB]"r"(&yuvconstants->kUVToRB),

581 [kUVToG]"r"(&yuvconstants->kUVToG),	580 [kUVToG]"r"(&yuvconstants->kUVToG),

582 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),	581 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),

583 [kYToRgb]"r"(&yuvconstants->kYToRgb)	582 [kYToRgb]"r"(&yuvconstants->kYToRgb)

584 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",	583 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",

585 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"	584 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"

586 );	585 );

587 }	586 }

588	587

589 void UYVYToARGBRow_NEON(const uint8* src_uyvy,	588 void UYVYToARGBRow_NEON(const uint8* src_uyvy,

590 uint8* dst_argb,	589 uint8* dst_argb,

591 const struct YuvConstants* yuvconstants,	590 const struct YuvConstants* yuvconstants,

592 int width) {	591 int width) {

593 int64 width64 = (int64)(width);

594 asm volatile (	592 asm volatile (

595 YUVTORGB_SETUP	593 YUVTORGB_SETUP

596 "movi v23.8b, #255 \n"	594 "movi v23.8b, #255 \n"

597 "1: \n"	595 "1: \n"

598 READUYVY	596 READUYVY

599 YUVTORGB(v22, v21, v20)	597 YUVTORGB(v22, v21, v20)

600 "subs %w2, %w2, #8 \n"	598 "subs %w2, %w2, #8 \n"

601 MEMACCESS(1)	599 MEMACCESS(1)

602 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"	600 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"

603 "b.gt 1b \n"	601 "b.gt 1b \n"

604 : "+r"(src_uyvy), // %0	602 : "+r"(src_uyvy), // %0

605 "+r"(dst_argb), // %1	603 "+r"(dst_argb), // %1

606 "+r"(width64) // %2	604 "+r"(width) // %2

607 : [kUVToRB]"r"(&yuvconstants->kUVToRB),	605 : [kUVToRB]"r"(&yuvconstants->kUVToRB),

608 [kUVToG]"r"(&yuvconstants->kUVToG),	606 [kUVToG]"r"(&yuvconstants->kUVToG),

609 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),	607 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),

610 [kYToRgb]"r"(&yuvconstants->kYToRgb)	608 [kYToRgb]"r"(&yuvconstants->kYToRgb)

611 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",	609 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",

612 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"	610 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"

613 );	611 );

614 }	612 }

615	613

616 // Reads 16 pairs of UV and writes even values to dst_u and odd to dst_v.	614 // Reads 16 pairs of UV and writes even values to dst_u and odd to dst_v.

(...skipping 57 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
674 : // Input registers	672 : // Input registers

675 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List	673 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List

676 );	674 );

677 }	675 }

678	676

679 // SetRow writes 'count' bytes using an 8 bit value repeated.	677 // SetRow writes 'count' bytes using an 8 bit value repeated.

680 void SetRow_NEON(uint8* dst, uint8 v8, int count) {	678 void SetRow_NEON(uint8* dst, uint8 v8, int count) {

681 asm volatile (	679 asm volatile (

682 "dup v0.16b, %w2 \n" // duplicate 16 bytes	680 "dup v0.16b, %w2 \n" // duplicate 16 bytes

683 "1: \n"	681 "1: \n"

684 "subs %w1, %w1, #16 \n" // 16 bytes per loop	682 "subs %w1, %w1, #16 \n" // 16 bytes per loop

685 MEMACCESS(0)	683 MEMACCESS(0)

686 "st1 {v0.16b}, [%0], #16 \n" // store	684 "st1 {v0.16b}, [%0], #16 \n" // store

687 "b.gt 1b \n"	685 "b.gt 1b \n"

688 : "+r"(dst), // %0	686 : "+r"(dst), // %0

689 "+r"(count) // %1	687 "+r"(count) // %1

690 : "r"(v8) // %2	688 : "r"(v8) // %2

691 : "cc", "memory", "v0"	689 : "cc", "memory", "v0"

692 );	690 );

693 }	691 }

694	692

695 void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {	693 void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {

696 asm volatile (	694 asm volatile (

697 "dup v0.4s, %w2 \n" // duplicate 4 ints	695 "dup v0.4s, %w2 \n" // duplicate 4 ints

698 "1: \n"	696 "1: \n"

699 "subs %w1, %w1, #4 \n" // 4 ints per loop	697 "subs %w1, %w1, #4 \n" // 4 ints per loop

700 MEMACCESS(0)	698 MEMACCESS(0)

701 "st1 {v0.16b}, [%0], #16 \n" // store	699 "st1 {v0.16b}, [%0], #16 \n" // store

702 "b.gt 1b \n"	700 "b.gt 1b \n"

703 : "+r"(dst), // %0	701 : "+r"(dst), // %0

704 "+r"(count) // %1	702 "+r"(count) // %1

705 : "r"(v32) // %2	703 : "r"(v32) // %2

706 : "cc", "memory", "v0"	704 : "cc", "memory", "v0"

707 );	705 );

708 }	706 }

709	707

710 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {	708 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {

711 int64 width64 = (int64) width;

712 asm volatile (	709 asm volatile (

713 // Start at end of source row.	710 // Start at end of source row.

714 "add %0, %0, %2 \n"	711 "add %0, %0, %w2, sxtw \n"

715 "sub %0, %0, #16 \n"	712 "sub %0, %0, #16 \n"

716

717 "1: \n"	713 "1: \n"

718 MEMACCESS(0)	714 MEMACCESS(0)

719 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16	715 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16

720 "subs %2, %2, #16 \n" // 16 pixels per loop.	716 "subs %w2, %w2, #16 \n" // 16 pixels per loop.

721 "rev64 v0.16b, v0.16b \n"	717 "rev64 v0.16b, v0.16b \n"

722 MEMACCESS(1)	718 MEMACCESS(1)

723 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16	719 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16

724 MEMACCESS(1)	720 MEMACCESS(1)

725 "st1 {v0.D}[0], [%1], #8 \n"	721 "st1 {v0.D}[0], [%1], #8 \n"

726 "b.gt 1b \n"	722 "b.gt 1b \n"

727 : "+r"(src), // %0	723 : "+r"(src), // %0

728 "+r"(dst), // %1	724 "+r"(dst), // %1

729 "+r"(width64) // %2	725 "+r"(width) // %2

730 : "r"((ptrdiff_t)-16) // %3	726 : "r"((ptrdiff_t)-16) // %3

731 : "cc", "memory", "v0"	727 : "cc", "memory", "v0"

732 );	728 );

733 }	729 }

734	730

735 void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,	731 void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,

736 int width) {	732 int width) {

737 int64 width64 = (int64) width;

738 asm volatile (	733 asm volatile (

739 // Start at end of source row.	734 // Start at end of source row.

740 "add %0, %0, %3, lsl #1 \n"	735 "add %0, %0, %w3, sxtw #1 \n"

741 "sub %0, %0, #16 \n"	736 "sub %0, %0, #16 \n"

742

743 "1: \n"	737 "1: \n"

744 MEMACCESS(0)	738 MEMACCESS(0)

745 "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16	739 "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16

746 "subs %3, %3, #8 \n" // 8 pixels per loop.	740 "subs %w3, %w3, #8 \n" // 8 pixels per loop.

747 "rev64 v0.8b, v0.8b \n"	741 "rev64 v0.8b, v0.8b \n"

748 "rev64 v1.8b, v1.8b \n"	742 "rev64 v1.8b, v1.8b \n"

749 MEMACCESS(1)	743 MEMACCESS(1)

750 "st1 {v0.8b}, [%1], #8 \n" // dst += 8	744 "st1 {v0.8b}, [%1], #8 \n" // dst += 8

751 MEMACCESS(2)	745 MEMACCESS(2)

752 "st1 {v1.8b}, [%2], #8 \n"	746 "st1 {v1.8b}, [%2], #8 \n"

753 "b.gt 1b \n"	747 "b.gt 1b \n"

754 : "+r"(src_uv), // %0	748 : "+r"(src_uv), // %0

755 "+r"(dst_u), // %1	749 "+r"(dst_u), // %1

756 "+r"(dst_v), // %2	750 "+r"(dst_v), // %2

757 "+r"(width64) // %3	751 "+r"(width) // %3

758 : "r"((ptrdiff_t)-16) // %4	752 : "r"((ptrdiff_t)-16) // %4

759 : "cc", "memory", "v0", "v1"	753 : "cc", "memory", "v0", "v1"

760 );	754 );

761 }	755 }

762	756

763 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {	757 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {

764 int64 width64 = (int64) width;

765 asm volatile (	758 asm volatile (

766 // Start at end of source row.	759 // Start at end of source row.

767 "add %0, %0, %2, lsl #2 \n"	760 "add %0, %0, %w2, sxtw #2 \n"

768 "sub %0, %0, #16 \n"	761 "sub %0, %0, #16 \n"

769

770 "1: \n"	762 "1: \n"

771 MEMACCESS(0)	763 MEMACCESS(0)

772 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16	764 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16

773 "subs %2, %2, #4 \n" // 4 pixels per loop.	765 "subs %w2, %w2, #4 \n" // 4 pixels per loop.

774 "rev64 v0.4s, v0.4s \n"	766 "rev64 v0.4s, v0.4s \n"

775 MEMACCESS(1)	767 MEMACCESS(1)

776 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16	768 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16

777 MEMACCESS(1)	769 MEMACCESS(1)

778 "st1 {v0.D}[0], [%1], #8 \n"	770 "st1 {v0.D}[0], [%1], #8 \n"

779 "b.gt 1b \n"	771 "b.gt 1b \n"

780 : "+r"(src), // %0	772 : "+r"(src), // %0

781 "+r"(dst), // %1	773 "+r"(dst), // %1

782 "+r"(width64) // %2	774 "+r"(width) // %2

783 : "r"((ptrdiff_t)-16) // %3	775 : "r"((ptrdiff_t)-16) // %3

784 : "cc", "memory", "v0"	776 : "cc", "memory", "v0"

785 );	777 );

786 }	778 }

787	779

788 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {	780 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {

789 asm volatile (	781 asm volatile (

790 "movi v4.8b, #255 \n" // Alpha	782 "movi v4.8b, #255 \n" // Alpha

791 "1: \n"	783 "1: \n"

792 MEMACCESS(0)	784 MEMACCESS(0)

793 "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.	785 "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.

794 "subs %w2, %w2, #8 \n" // 8 processed per loop.	786 "subs %w2, %w2, #8 \n" // 8 processed per loop.

795 MEMACCESS(1)	787 MEMACCESS(1)

796 "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels	788 "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels

797 "b.gt 1b \n"	789 "b.gt 1b \n"

798 : "+r"(src_rgb24), // %0	790 : "+r"(src_rgb24), // %0

799 "+r"(dst_argb), // %1	791 "+r"(dst_argb), // %1

800 "+r"(width) // %2	792 "+r"(width) // %2

801 :	793 :

802 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List	794 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List

803 );	795 );

804 }	796 }

805	797

806 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {	798 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {

807 asm volatile (	799 asm volatile (

808 "movi v5.8b, #255 \n" // Alpha	800 "movi v5.8b, #255 \n" // Alpha

809 "1: \n"	801 "1: \n"

810 MEMACCESS(0)	802 MEMACCESS(0)

811 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b	803 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b

812 "subs %w2, %w2, #8 \n" // 8 processed per loop.	804 "subs %w2, %w2, #8 \n" // 8 processed per loop.

813 "orr v3.8b, v1.8b, v1.8b \n" // move g	805 "orr v3.8b, v1.8b, v1.8b \n" // move g

814 "orr v4.8b, v0.8b, v0.8b \n" // move r	806 "orr v4.8b, v0.8b, v0.8b \n" // move r

815 MEMACCESS(1)	807 MEMACCESS(1)

816 "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a	808 "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a

817 "b.gt 1b \n"	809 "b.gt 1b \n"

818 : "+r"(src_raw), // %0	810 : "+r"(src_raw), // %0

819 "+r"(dst_argb), // %1	811 "+r"(dst_argb), // %1

820 "+r"(width) // %2	812 "+r"(width) // %2

821 :	813 :

822 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List	814 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List

823 );	815 );

824 }	816 }

825	817

826 void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {	818 void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {

827 asm volatile (	819 asm volatile (

828 "1: \n"	820 "1: \n"

829 MEMACCESS(0)	821 MEMACCESS(0)

830 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b	822 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b

(...skipping 1977 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
2808 "r"(6LL) // %5	2800 "r"(6LL) // %5

2809 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List	2801 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List

2810 );	2802 );

2811 }	2803 }

2812 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)	2804 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)

2813	2805

2814 #ifdef __cplusplus	2806 #ifdef __cplusplus

2815 } // extern "C"	2807 } // extern "C"

2816 } // namespace libyuv	2808 } // namespace libyuv

2817 #endif	2809 #endif

OLD	NEW

« no previous file with comments | « include/libyuv/version.h ('k') | no next file » | no next file with comments »