source/libvpx/third_party/libyuv/source/scale_neon.cc - Issue 1302353004: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/third_party/libyuv/source/scale_neon.cc

Issue 1302353004: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master

Patch Set: Created 5 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.	2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

(...skipping 25 matching lines...) Expand all Loading...
36 "vst1.8 {q1}, [%1]! \n" // store odd pixels	36 "vst1.8 {q1}, [%1]! \n" // store odd pixels

37 "bgt 1b \n"	37 "bgt 1b \n"

38 : "+r"(src_ptr), // %0	38 : "+r"(src_ptr), // %0

39 "+r"(dst), // %1	39 "+r"(dst), // %1

40 "+r"(dst_width) // %2	40 "+r"(dst_width) // %2

41 :	41 :

42 : "q0", "q1" // Clobber List	42 : "q0", "q1" // Clobber List

43 );	43 );

44 }	44 }

45	45

	46 // Read 32x1 average down and write 16x1.

	47 void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,

	48 uint8* dst, int dst_width) {

	49 asm volatile (

	50 ".p2align 2 \n"

	51 "1: \n"

	52 MEMACCESS(0)

	53 "vld1.8 {q0, q1}, [%0]! \n" // load pixels and post inc

	54 "subs %2, %2, #16 \n" // 16 processed per loop

	55 "vpaddl.u8 q0, q0 \n" // add adjacent

	56 "vpaddl.u8 q1, q1 \n"

	57 "vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack

	58 "vrshrn.u16 d1, q1, #1 \n"

	59 MEMACCESS(1)

	60 "vst1.8 {q0}, [%1]! \n"

	61 "bgt 1b \n"

	62 : "+r"(src_ptr), // %0

	63 "+r"(dst), // %1

	64 "+r"(dst_width) // %2

	65 :

	66 : "q0", "q1" // Clobber List

	67 );

	68 }

	69

46 // Read 32x2 average down and write 16x1.	70 // Read 32x2 average down and write 16x1.

47 void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,	71 void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,

48 uint8* dst, int dst_width) {	72 uint8* dst, int dst_width) {

49 asm volatile (	73 asm volatile (

50 // change the stride to row 2 pointer	74 // change the stride to row 2 pointer

51 "add %1, %0 \n"	75 "add %1, %0 \n"

52 ".p2align 2 \n"	76 ".p2align 2 \n"

53 "1: \n"	77 "1: \n"

54 MEMACCESS(0)	78 MEMACCESS(0)

55 "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc	79 "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc

(...skipping 454 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
510 : "+r"(src_ptr), // %0	534 : "+r"(src_ptr), // %0

511 "+r"(dst_ptr), // %1	535 "+r"(dst_ptr), // %1

512 "+r"(dst_width), // %2	536 "+r"(dst_width), // %2

513 "+r"(src_stride) // %3	537 "+r"(src_stride) // %3

514 : "r"(&kMult38_Div6), // %4	538 : "r"(&kMult38_Div6), // %4

515 "r"(&kShuf38_2) // %5	539 "r"(&kShuf38_2) // %5

516 : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"	540 : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"

517 );	541 );

518 }	542 }

519	543

	544 void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,

	545 uint16* dst_ptr, int src_width, int src_height) {

	546 const uint8* src_tmp = NULL;

	547 asm volatile (

	548 ".p2align 2 \n"

	549 "1: \n"

	550 "mov %0, %1 \n"

	551 "mov r12, %5 \n"

	552 "veor q2, q2, q2 \n"

	553 "veor q3, q3, q3 \n"

	554 "2: \n"

	555 // load 16 pixels into q0

	556 MEMACCESS(0)

	557 "vld1.8 {q0}, [%0], %3 \n"

	558 "vaddw.u8 q3, q3, d1 \n"

	559 "vaddw.u8 q2, q2, d0 \n"

	560 "subs r12, r12, #1 \n"

	561 "bgt 2b \n"

	562 MEMACCESS(2)

	563 "vst1.16 {q2, q3}, [%2]! \n" // store pixels

	564 "add %1, %1, #16 \n"

	565 "subs %4, %4, #16 \n" // 16 processed per loop

	566 "bgt 1b \n"

	567 : "+r"(src_tmp), // %0

	568 "+r"(src_ptr), // %1

	569 "+r"(dst_ptr), // %2

	570 "+r"(src_stride), // %3

	571 "+r"(src_width), // %4

	572 "+r"(src_height) // %5

	573 :

	574 : "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List

	575 );

	576 }

	577

	578 // TODO(Yang Zhang): Investigate less load instructions for

	579 // the x/dx stepping

	580 #define LOAD2_DATA8_LANE(n) \

	581 "lsr %5, %3, #16 \n" \

	582 "add %6, %1, %5 \n" \

	583 "add %3, %3, %4 \n" \

	584 MEMACCESS(6) \

	585 "vld2.8 {d6["#n"], d7["#n"]}, [%6] \n"

	586

	587 void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,

	588 int dst_width, int x, int dx) {

	589 int dx_offset[4] = {0, 1, 2, 3};

	590 int* tmp = dx_offset;

	591 const uint8* src_tmp = src_ptr;

	592 asm volatile (

	593 ".p2align 2 \n"

	594 "vdup.32 q0, %3 \n" // x

	595 "vdup.32 q1, %4 \n" // dx

	596 "vld1.32 {q2}, [%5] \n" // 0 1 2 3

	597 "vshl.i32 q3, q1, #2 \n" // 4 * dx

	598 "vmul.s32 q1, q1, q2 \n"

	599 // x , x + 1 * dx, x + 2 * dx, x + 3 * dx

	600 "vadd.s32 q1, q1, q0 \n"

	601 // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx

	602 "vadd.s32 q2, q1, q3 \n"

	603 "vshl.i32 q0, q3, #1 \n" // 8 * dx

	604 "1: \n"

	605 LOAD2_DATA8_LANE(0)

	606 LOAD2_DATA8_LANE(1)

	607 LOAD2_DATA8_LANE(2)

	608 LOAD2_DATA8_LANE(3)

	609 LOAD2_DATA8_LANE(4)

	610 LOAD2_DATA8_LANE(5)

	611 LOAD2_DATA8_LANE(6)

	612 LOAD2_DATA8_LANE(7)

	613 "vmov q10, q1 \n"

	614 "vmov q11, q2 \n"

	615 "vuzp.16 q10, q11 \n"

	616 "vmovl.u8 q8, d6 \n"

	617 "vmovl.u8 q9, d7 \n"

	618 "vsubl.s16 q11, d18, d16 \n"

	619 "vsubl.s16 q12, d19, d17 \n"

	620 "vmovl.u16 q13, d20 \n"

	621 "vmovl.u16 q10, d21 \n"

	622 "vmul.s32 q11, q11, q13 \n"

	623 "vmul.s32 q12, q12, q10 \n"

	624 "vshrn.s32 d18, q11, #16 \n"

	625 "vshrn.s32 d19, q12, #16 \n"

	626 "vadd.s16 q8, q8, q9 \n"

	627 "vmovn.s16 d6, q8 \n"

	628

	629 MEMACCESS(0)

	630 "vst1.8 {d6}, [%0]! \n" // store pixels

	631 "vadd.s32 q1, q1, q0 \n"

	632 "vadd.s32 q2, q2, q0 \n"

	633 "subs %2, %2, #8 \n" // 8 processed per loop

	634 "bgt 1b \n"

	635 : "+r"(dst_ptr), // %0

	636 "+r"(src_ptr), // %1

	637 "+r"(dst_width), // %2

	638 "+r"(x), // %3

	639 "+r"(dx), // %4

	640 "+r"(tmp), // %5

	641 "+r"(src_tmp) // %6

	642 :

	643 : "memory", "cc", "q0", "q1", "q2", "q3",

	644 "q8", "q9", "q10", "q11", "q12", "q13"

	645 );

	646 }

	647

	648 #undef LOAD2_DATA8_LANE

	649

520 // 16x2 -> 16x1	650 // 16x2 -> 16x1

521 void ScaleFilterRows_NEON(uint8* dst_ptr,	651 void ScaleFilterRows_NEON(uint8* dst_ptr,

522 const uint8* src_ptr, ptrdiff_t src_stride,	652 const uint8* src_ptr, ptrdiff_t src_stride,

523 int dst_width, int source_y_fraction) {	653 int dst_width, int source_y_fraction) {

524 asm volatile (	654 asm volatile (

525 "cmp %4, #0 \n"	655 "cmp %4, #0 \n"

526 "beq 100f \n"	656 "beq 100f \n"

527 "add %2, %1 \n"	657 "add %2, %1 \n"

528 "cmp %4, #64 \n"	658 "cmp %4, #64 \n"

529 "beq 75f \n"	659 "beq 75f \n"

(...skipping 103 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
633 "vst1.8 {q3}, [%1]! \n"	763 "vst1.8 {q3}, [%1]! \n"

634 "bgt 1b \n"	764 "bgt 1b \n"

635 : "+r"(src_ptr), // %0	765 : "+r"(src_ptr), // %0

636 "+r"(dst), // %1	766 "+r"(dst), // %1

637 "+r"(dst_width) // %2	767 "+r"(dst_width) // %2

638 :	768 :

639 : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List	769 : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List

640 );	770 );

641 }	771 }

642	772

	773 void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,

	774 uint8* dst_argb, int dst_width) {

	775 asm volatile (

	776 ".p2align 2 \n"

	777 "1: \n"

	778 MEMACCESS(0)

	779 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.

	780 MEMACCESS(0)

	781 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.

	782 "subs %2, %2, #8 \n" // 8 processed per loop

	783 "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.

	784 "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.

	785 "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.

	786 "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.

	787 "vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack

	788 "vrshrn.u16 d1, q1, #1 \n"

	789 "vrshrn.u16 d2, q2, #1 \n"

	790 "vrshrn.u16 d3, q3, #1 \n"

	791 MEMACCESS(1)

	792 "vst4.8 {d0, d1, d2, d3}, [%1]! \n"

	793 "bgt 1b \n"

	794 : "+r"(src_argb), // %0

	795 "+r"(dst_argb), // %1

	796 "+r"(dst_width) // %2

	797 :

	798 : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List

	799 );

	800 }

	801

643 void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,	802 void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,

644 uint8* dst, int dst_width) {	803 uint8* dst, int dst_width) {

645 asm volatile (	804 asm volatile (

646 // change the stride to row 2 pointer	805 // change the stride to row 2 pointer

647 "add %1, %1, %0 \n"	806 "add %1, %1, %0 \n"

648 ".p2align 2 \n"	807 ".p2align 2 \n"

649 "1: \n"	808 "1: \n"

650 MEMACCESS(0)	809 MEMACCESS(0)

651 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.	810 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.

652 MEMACCESS(0)	811 MEMACCESS(0)

(...skipping 97 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
750 "bgt 1b \n"	909 "bgt 1b \n"

751 : "+r"(src_argb), // %0	910 : "+r"(src_argb), // %0

752 "+r"(src_stride), // %1	911 "+r"(src_stride), // %1

753 "+r"(dst_argb), // %2	912 "+r"(dst_argb), // %2

754 "+r"(dst_width) // %3	913 "+r"(dst_width) // %3

755 : "r"(src_stepx) // %4	914 : "r"(src_stepx) // %4

756 : "memory", "cc", "r12", "q0", "q1", "q2", "q3"	915 : "memory", "cc", "r12", "q0", "q1", "q2", "q3"

757 );	916 );

758 }	917 }

759	918

	919 // TODO(Yang Zhang): Investigate less load instructions for

	920 // the x/dx stepping

	921 #define LOAD1_DATA32_LANE(dn, n) \

	922 "lsr %5, %3, #16 \n" \

	923 "add %6, %1, %5, lsl #2 \n" \

	924 "add %3, %3, %4 \n" \

	925 MEMACCESS(6) \

	926 "vld1.32 {"#dn"["#n"]}, [%6] \n"

	927

	928 void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,

	929 int dst_width, int x, int dx) {

	930 int tmp = 0;

	931 const uint8* src_tmp = src_argb;

	932 asm volatile (

	933 ".p2align 2 \n"

	934 "1: \n"

	935 LOAD1_DATA32_LANE(d0, 0)

	936 LOAD1_DATA32_LANE(d0, 1)

	937 LOAD1_DATA32_LANE(d1, 0)

	938 LOAD1_DATA32_LANE(d1, 1)

	939 LOAD1_DATA32_LANE(d2, 0)

	940 LOAD1_DATA32_LANE(d2, 1)

	941 LOAD1_DATA32_LANE(d3, 0)

	942 LOAD1_DATA32_LANE(d3, 1)

	943

	944 MEMACCESS(0)

	945 "vst1.32 {q0, q1}, [%0]! \n" // store pixels

	946 "subs %2, %2, #8 \n" // 8 processed per loop

	947 "bgt 1b \n"

	948 : "+r"(dst_argb), // %0

	949 "+r"(src_argb), // %1

	950 "+r"(dst_width), // %2

	951 "+r"(x), // %3

	952 "+r"(dx), // %4

	953 "+r"(tmp), // %5

	954 "+r"(src_tmp) // %6

	955 :

	956 : "memory", "cc", "q0", "q1"

	957 );

	958 }

	959

	960 #undef LOAD1_DATA32_LANE

	961

	962 // TODO(Yang Zhang): Investigate less load instructions for

	963 // the x/dx stepping

	964 #define LOAD2_DATA32_LANE(dn1, dn2, n) \

	965 "lsr %5, %3, #16 \n" \

	966 "add %6, %1, %5, lsl #2 \n" \

	967 "add %3, %3, %4 \n" \

	968 MEMACCESS(6) \

	969 "vld2.32 {"#dn1"["#n"], "#dn2"["#n"]}, [%6] \n"

	970

	971 void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,

	972 int dst_width, int x, int dx) {

	973 int dx_offset[4] = {0, 1, 2, 3};

	974 int* tmp = dx_offset;

	975 const uint8* src_tmp = src_argb;

	976 asm volatile (

	977 ".p2align 2 \n"

	978 "vdup.32 q0, %3 \n" // x

	979 "vdup.32 q1, %4 \n" // dx

	980 "vld1.32 {q2}, [%5] \n" // 0 1 2 3

	981 "vshl.i32 q9, q1, #2 \n" // 4 * dx

	982 "vmul.s32 q1, q1, q2 \n"

	983 "vmov.i8 q3, #0x7f \n" // 0x7F

	984 "vmov.i16 q15, #0x7f \n" // 0x7F

	985 // x , x + 1 * dx, x + 2 * dx, x + 3 * dx

	986 "vadd.s32 q8, q1, q0 \n"

	987 "1: \n"

	988 // d0, d1: a

	989 // d2, d3: b

	990 LOAD2_DATA32_LANE(d0, d2, 0)

	991 LOAD2_DATA32_LANE(d0, d2, 1)

	992 LOAD2_DATA32_LANE(d1, d3, 0)

	993 LOAD2_DATA32_LANE(d1, d3, 1)

	994 "vshrn.i32 d22, q8, #9 \n"

	995 "vand.16 d22, d22, d30 \n"

	996 "vdup.8 d24, d22[0] \n"

	997 "vdup.8 d25, d22[2] \n"

	998 "vdup.8 d26, d22[4] \n"

	999 "vdup.8 d27, d22[6] \n"

	1000 "vext.8 d4, d24, d25, #4 \n"

	1001 "vext.8 d5, d26, d27, #4 \n" // f

	1002 "veor.8 q10, q2, q3 \n" // 0x7f ^ f

	1003 "vmull.u8 q11, d0, d20 \n"

	1004 "vmull.u8 q12, d1, d21 \n"

	1005 "vmull.u8 q13, d2, d4 \n"

	1006 "vmull.u8 q14, d3, d5 \n"

	1007 "vadd.i16 q11, q11, q13 \n"

	1008 "vadd.i16 q12, q12, q14 \n"

	1009 "vshrn.i16 d0, q11, #7 \n"

	1010 "vshrn.i16 d1, q12, #7 \n"

	1011

	1012 MEMACCESS(0)

	1013 "vst1.32 {d0, d1}, [%0]! \n" // store pixels

	1014 "vadd.s32 q8, q8, q9 \n"

	1015 "subs %2, %2, #4 \n" // 4 processed per loop

	1016 "bgt 1b \n"

	1017 : "+r"(dst_argb), // %0

	1018 "+r"(src_argb), // %1

	1019 "+r"(dst_width), // %2

	1020 "+r"(x), // %3

	1021 "+r"(dx), // %4

	1022 "+r"(tmp), // %5

	1023 "+r"(src_tmp) // %6

	1024 :

	1025 : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9",

	1026 "q10", "q11", "q12", "q13", "q14", "q15"

	1027 );

	1028 }

	1029

	1030 #undef LOAD2_DATA32_LANE

	1031

760 #endif // defined(__ARM_NEON__) && !defined(__aarch64__)	1032 #endif // defined(__ARM_NEON__) && !defined(__aarch64__)

761	1033

762 #ifdef __cplusplus	1034 #ifdef __cplusplus

763 } // extern "C"	1035 } // extern "C"

764 } // namespace libyuv	1036 } // namespace libyuv

765 #endif	1037 #endif

OLD	NEW

« no previous file with comments | « source/libvpx/third_party/libyuv/source/scale_gcc.cc ('k') | source/libvpx/third_party/libyuv/source/scale_neon64.cc » ('j') | no next file with comments »