source/row_neon64.cc - Issue 2043073003: neon64 use width int directly.

Side by Side Diff: source/row_neon64.cc

Issue 2043073003: neon64 use width int directly. (Closed) Base URL: https://chromium.googlesource.com/libyuv/libyuv@master

Patch Set: bump version Created 4 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright 2014 The LibYuv Project Authors. All rights reserved.	2 * Copyright 2014 The LibYuv Project Authors. All rights reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

(...skipping 545 matching lines...) Expand 10 before | Expand all | Expand 10 after
556 [kYToRgb]"r"(&yuvconstants->kYToRgb)	556 [kYToRgb]"r"(&yuvconstants->kYToRgb)

557 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",	557 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",

558 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"	558 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"

559 );	559 );

560 }	560 }

561	561

562 void YUY2ToARGBRow_NEON(const uint8* src_yuy2,	562 void YUY2ToARGBRow_NEON(const uint8* src_yuy2,

563 uint8* dst_argb,	563 uint8* dst_argb,

564 const struct YuvConstants* yuvconstants,	564 const struct YuvConstants* yuvconstants,

565 int width) {	565 int width) {

566 int64 width64 = (int64)(width);

567 asm volatile (	566 asm volatile (

568 YUVTORGB_SETUP	567 YUVTORGB_SETUP

569 "movi v23.8b, #255 \n"	568 "movi v23.8b, #255 \n"

570 "1: \n"	569 "1: \n"

571 READYUY2	570 READYUY2

572 YUVTORGB(v22, v21, v20)	571 YUVTORGB(v22, v21, v20)

573 "subs %w2, %w2, #8 \n"	572 "subs %w2, %w2, #8 \n"

574 MEMACCESS(1)	573 MEMACCESS(1)

575 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"	574 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"

576 "b.gt 1b \n"	575 "b.gt 1b \n"

577 : "+r"(src_yuy2), // %0	576 : "+r"(src_yuy2), // %0

578 "+r"(dst_argb), // %1	577 "+r"(dst_argb), // %1

579 "+r"(width64) // %2	578 "+r"(width) // %2

580 : [kUVToRB]"r"(&yuvconstants->kUVToRB),	579 : [kUVToRB]"r"(&yuvconstants->kUVToRB),

581 [kUVToG]"r"(&yuvconstants->kUVToG),	580 [kUVToG]"r"(&yuvconstants->kUVToG),

582 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),	581 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),

583 [kYToRgb]"r"(&yuvconstants->kYToRgb)	582 [kYToRgb]"r"(&yuvconstants->kYToRgb)

584 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",	583 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",

585 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"	584 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"

586 );	585 );

587 }	586 }

588	587

589 void UYVYToARGBRow_NEON(const uint8* src_uyvy,	588 void UYVYToARGBRow_NEON(const uint8* src_uyvy,

590 uint8* dst_argb,	589 uint8* dst_argb,

591 const struct YuvConstants* yuvconstants,	590 const struct YuvConstants* yuvconstants,

592 int width) {	591 int width) {

593 int64 width64 = (int64)(width);

594 asm volatile (	592 asm volatile (

595 YUVTORGB_SETUP	593 YUVTORGB_SETUP

596 "movi v23.8b, #255 \n"	594 "movi v23.8b, #255 \n"

597 "1: \n"	595 "1: \n"

598 READUYVY	596 READUYVY

599 YUVTORGB(v22, v21, v20)	597 YUVTORGB(v22, v21, v20)

600 "subs %w2, %w2, #8 \n"	598 "subs %w2, %w2, #8 \n"

601 MEMACCESS(1)	599 MEMACCESS(1)

602 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"	600 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"

603 "b.gt 1b \n"	601 "b.gt 1b \n"

604 : "+r"(src_uyvy), // %0	602 : "+r"(src_uyvy), // %0

605 "+r"(dst_argb), // %1	603 "+r"(dst_argb), // %1

606 "+r"(width64) // %2	604 "+r"(width) // %2

607 : [kUVToRB]"r"(&yuvconstants->kUVToRB),	605 : [kUVToRB]"r"(&yuvconstants->kUVToRB),

608 [kUVToG]"r"(&yuvconstants->kUVToG),	606 [kUVToG]"r"(&yuvconstants->kUVToG),

609 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),	607 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),

610 [kYToRgb]"r"(&yuvconstants->kYToRgb)	608 [kYToRgb]"r"(&yuvconstants->kYToRgb)

611 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",	609 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",

612 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"	610 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"

613 );	611 );

614 }	612 }

615	613

616 // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.	614 // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.

(...skipping 84 matching lines...) Expand 10 before | Expand all | Expand 10 after
701 "st1 {v0.16b}, [%0], #16 \n" // store	699 "st1 {v0.16b}, [%0], #16 \n" // store

702 "b.gt 1b \n"	700 "b.gt 1b \n"

703 : "+r"(dst), // %0	701 : "+r"(dst), // %0

704 "+r"(count) // %1	702 "+r"(count) // %1

705 : "r"(v32) // %2	703 : "r"(v32) // %2

706 : "cc", "memory", "v0"	704 : "cc", "memory", "v0"

707 );	705 );

708 }	706 }

709	707

710 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {	708 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {

711 int64 width64 = (int64) width;	709 src += width - 16;

712 asm volatile (	710 asm volatile (

713 // Start at end of source row.

714 "add %0, %0, %2 \n"

715 "sub %0, %0, #16 \n"

716

717 "1: \n"	711 "1: \n"

718 MEMACCESS(0)	712 MEMACCESS(0)

719 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16	713 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16

720 "subs %2, %2, #16 \n" // 16 pixels per loop.	714 "subs %w2, %w2, #16 \n" // 16 pixels per loop.

721 "rev64 v0.16b, v0.16b \n"	715 "rev64 v0.16b, v0.16b \n"

722 MEMACCESS(1)	716 MEMACCESS(1)

723 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16	717 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16

724 MEMACCESS(1)	718 MEMACCESS(1)

725 "st1 {v0.D}[0], [%1], #8 \n"	719 "st1 {v0.D}[0], [%1], #8 \n"

726 "b.gt 1b \n"	720 "b.gt 1b \n"

727 : "+r"(src), // %0	721 : "+&r"(src), // %0

728 "+r"(dst), // %1	722 "+&r"(dst), // %1

729 "+r"(width64) // %2	723 "+&r"(width) // %2

730 : "r"((ptrdiff_t)-16) // %3	724 : "r"((ptrdiff_t)-16) // %3

731 : "cc", "memory", "v0"	725 : "cc", "memory", "v0"

732 );	726 );

733 }	727 }

734	728

	729 // TODO(fbarchard): Consider single rev64

735 void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,	730 void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,

736 int width) {	731 int width) {

737 int64 width64 = (int64) width;	732 src_uv += width * 2 - 16;

738 asm volatile (	733 asm volatile (

739 // Start at end of source row.

740 "add %0, %0, %3, lsl #1 \n"

741 "sub %0, %0, #16 \n"

742

743 "1: \n"	734 "1: \n"

744 MEMACCESS(0)	735 MEMACCESS(0)

745 "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16	736 "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16

746 "subs %3, %3, #8 \n" // 8 pixels per loop.	737 "subs %w3, %w3, #8 \n" // 8 pixels per loop.

747 "rev64 v0.8b, v0.8b \n"	738 "rev64 v0.8b, v0.8b \n"

748 "rev64 v1.8b, v1.8b \n"	739 "rev64 v1.8b, v1.8b \n"

749 MEMACCESS(1)	740 MEMACCESS(1)

750 "st1 {v0.8b}, [%1], #8 \n" // dst += 8	741 "st1 {v0.8b}, [%1], #8 \n" // dst += 8

751 MEMACCESS(2)	742 MEMACCESS(2)

752 "st1 {v1.8b}, [%2], #8 \n"	743 "st1 {v1.8b}, [%2], #8 \n"

753 "b.gt 1b \n"	744 "b.gt 1b \n"

754 : "+r"(src_uv), // %0	745 : "+&r"(src_uv), // %0

755 "+r"(dst_u), // %1	746 "+&r"(dst_u), // %1

756 "+r"(dst_v), // %2	747 "+&r"(dst_v), // %2

757 "+r"(width64) // %3	748 "+&r"(width) // %3

758 : "r"((ptrdiff_t)-16) // %4	749 : "r"((ptrdiff_t)-16) // %4

759 : "cc", "memory", "v0", "v1"	750 : "cc", "memory", "v0", "v1"

760 );	751 );

761 }	752 }

762	753

763 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {	754 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {

764 int64 width64 = (int64) width;	755 // Start at end of source row.

	756 src += width * 4 - 16;
	fbarchard1 2016/06/07 22:24:50
	Code generated is:
	0000000000000000 <ARGBMirrorRow_NEON>:
	   0: 51001043  sub   w3, w2, #0x4
	   4: 531e7463  lsl   w3, w3, #2
	   8: 8b23c000  add   x0, x0, w3, sxtw
	   c: 928001e3  mov   x3, #0xfffffffffffffff0  // #-16
	  10: 4cc37000  ld1   {v0.16b}, [x0], x3
	  14: 71001042  subs  w2, w2, #0x4
	  18: 4ea00800  rev64 v0.4s, v0.4s
	  1c: 4d9f8420  st1   {v0.d}[1], [x1], #8
	  20: 0d9f8420  st1   {v0.d}[0], [x1], #8
	  24: 54ffff6c  b.gt  10 <ARGBMirrorRow_NEON+0x10>
	  28: d65f03c0  ret
	Could
	   4: 531e7463  lsl   w3, w3, #2
	   8: 8b23c000  add   x0, x0, w3, sxtw
	be replaced with the following?
	   8: 8b23c000  add   x0, x0, w3, sxtw #2
765 asm volatile (	757 asm volatile (

766 // Start at end of source row.

767 "add %0, %0, %2, lsl #2 \n"

768 "sub %0, %0, #16 \n"

769

770 "1: \n"	758 "1: \n"

771 MEMACCESS(0)	759 MEMACCESS(0)

772 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16	760 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16

773 "subs %2, %2, #4 \n" // 4 pixels per loop.	761 "subs %w2, %w2, #4 \n" // 4 pixels per loop.

774 "rev64 v0.4s, v0.4s \n"	762 "rev64 v0.4s, v0.4s \n"

775 MEMACCESS(1)	763 MEMACCESS(1)

776 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16	764 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16

777 MEMACCESS(1)	765 MEMACCESS(1)

778 "st1 {v0.D}[0], [%1], #8 \n"	766 "st1 {v0.D}[0], [%1], #8 \n"

779 "b.gt 1b \n"	767 "b.gt 1b \n"

780 : "+r"(src), // %0	768 : "+&r"(src), // %0

781 "+r"(dst), // %1	769 "+&r"(dst), // %1

782 "+r"(width64) // %2	770 "+&r"(width) // %2

783 : "r"((ptrdiff_t)-16) // %3	771 : "r"((ptrdiff_t)-16) // %3

784 : "cc", "memory", "v0"	772 : "cc", "memory", "v0"

785 );	773 );

786 }	774 }

787	775

788 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {	776 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {

789 asm volatile (	777 asm volatile (

790 "movi v4.8b, #255 \n" // Alpha	778 "movi v4.8b, #255 \n" // Alpha

791 "1: \n"	779 "1: \n"

792 MEMACCESS(0)	780 MEMACCESS(0)

793 "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.	781 "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.

794 "subs %w2, %w2, #8 \n" // 8 processed per loop.	782 "subs %w2, %w2, #8 \n" // 8 processed per loop.

795 MEMACCESS(1)	783 MEMACCESS(1)

796 "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels	784 "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels

797 "b.gt 1b \n"	785 "b.gt 1b \n"

798 : "+r"(src_rgb24), // %0	786 : "+r"(src_rgb24), // %0

799 "+r"(dst_argb), // %1	787 "+r"(dst_argb), // %1

800 "+r"(width) // %2	788 "+r"(width) // %2

801 :	789 :

802 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List	790 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List

803 );	791 );

804 }	792 }

805	793

806 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {	794 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {

807 asm volatile (	795 asm volatile (

808 "movi v5.8b, #255 \n" // Alpha	796 "movi v5.8b, #255 \n" // Alpha

809 "1: \n"	797 "1: \n"

810 MEMACCESS(0)	798 MEMACCESS(0)

811 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b	799 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b

812 "subs %w2, %w2, #8 \n" // 8 processed per loop.	800 "subs %w2, %w2, #8 \n" // 8 processed per loop.

813 "orr v3.8b, v1.8b, v1.8b \n" // move g	801 "orr v3.8b, v1.8b, v1.8b \n" // move g

814 "orr v4.8b, v0.8b, v0.8b \n" // move r	802 "orr v4.8b, v0.8b, v0.8b \n" // move r

815 MEMACCESS(1)	803 MEMACCESS(1)

816 "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a	804 "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a

817 "b.gt 1b \n"	805 "b.gt 1b \n"

818 : "+r"(src_raw), // %0	806 : "+r"(src_raw), // %0

819 "+r"(dst_argb), // %1	807 "+r"(dst_argb), // %1

820 "+r"(width) // %2	808 "+r"(width) // %2

821 :	809 :

822 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List	810 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List

823 );	811 );

824 }	812 }

825	813

826 void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {	814 void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {

827 asm volatile (	815 asm volatile (

828 "1: \n"	816 "1: \n"

829 MEMACCESS(0)	817 MEMACCESS(0)

830 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b	818 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b

(...skipping 1977 matching lines...) Expand 10 before | Expand all | Expand 10 after
2808 "r"(6LL) // %5	2796 "r"(6LL) // %5

2809 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List	2797 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List

2810 );	2798 );

2811 }	2799 }

2812 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)	2800 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)

2813	2801

2814 #ifdef __cplusplus	2802 #ifdef __cplusplus

2815 } // extern "C"	2803 } // extern "C"

2816 } // namespace libyuv	2804 } // namespace libyuv

2817 #endif	2805 #endif

OLD	NEW

« no previous file with comments | « include/libyuv/version.h ('k') | no next file » | no next file with comments »