| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 514 matching lines...) | |
| 525 "+r"(dst_width), // %2 | 525 "+r"(dst_width), // %2 |
| 526 "+r"(src_stride) // %3 | 526 "+r"(src_stride) // %3 |
| 527 : "r"(&kMult38_Div6), // %4 | 527 : "r"(&kMult38_Div6), // %4 |
| 528 "r"(&kShuf38_2) // %5 | 528 "r"(&kShuf38_2) // %5 |
| 529 : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc" | 529 : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc" |
| 530 ); | 530 ); |
| 531 } | 531 } |
| 532 | 532 |
| 533 void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, | 533 void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, |
| 534 uint16* dst_ptr, int src_width, int src_height) { | 534 uint16* dst_ptr, int src_width, int src_height) { |
| 535 const uint8* src_tmp = NULL; | 535 const uint8* src_tmp; |
| 536 asm volatile ( | 536 asm volatile ( |
| 537 "1: \n" | 537 "1: \n" |
| 538 "mov %0, %1 \n" | 538 "mov %0, %1 \n" |
| 539 "mov r12, %5 \n" | 539 "mov r12, %5 \n" |
| 540 "veor q2, q2, q2 \n" | 540 "veor q2, q2, q2 \n" |
| 541 "veor q3, q3, q3 \n" | 541 "veor q3, q3, q3 \n" |
| 542 "2: \n" | 542 "2: \n" |
| 543 // load 16 pixels into q0 | 543 // load 16 pixels into q0 |
| 544 MEMACCESS(0) | 544 MEMACCESS(0) |
| 545 "vld1.8 {q0}, [%0], %3 \n" | 545 "vld1.8 {q0}, [%0], %3 \n" |
| 546 "vaddw.u8 q3, q3, d1 \n" | 546 "vaddw.u8 q3, q3, d1 \n" |
| 547 "vaddw.u8 q2, q2, d0 \n" | 547 "vaddw.u8 q2, q2, d0 \n" |
| 548 "subs r12, r12, #1 \n" | 548 "subs r12, r12, #1 \n" |
| 549 "bgt 2b \n" | 549 "bgt 2b \n" |
| 550 MEMACCESS(2) | 550 MEMACCESS(2) |
| 551 "vst1.16 {q2, q3}, [%2]! \n" // store pixels | 551 "vst1.16 {q2, q3}, [%2]! \n" // store pixels |
| 552 "add %1, %1, #16 \n" | 552 "add %1, %1, #16 \n" |
| 553 "subs %4, %4, #16 \n" // 16 processed per loop | 553 "subs %4, %4, #16 \n" // 16 processed per loop |
| 554 "bgt 1b \n" | 554 "bgt 1b \n" |
| 555 : "+r"(src_tmp), // %0 | 555 : "=&r"(src_tmp), // %0 |
| 556 "+r"(src_ptr), // %1 | 556 "+r"(src_ptr), // %1 |
| 557 "+r"(dst_ptr), // %2 | 557 "+r"(dst_ptr), // %2 |
| 558 "+r"(src_stride), // %3 | 558 "+r"(src_stride), // %3 |
| 559 "+r"(src_width), // %4 | 559 "+r"(src_width), // %4 |
| 560 "+r"(src_height) // %5 | 560 "+r"(src_height) // %5 |
| 561 : | 561 : |
| 562 : "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List | 562 : "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List |
| 563 ); | 563 ); |
| 564 } | 564 } |
| 565 | 565 |
| 566 // TODO(Yang Zhang): Investigate fewer load instructions for | 566 // TODO(Yang Zhang): Investigate fewer load instructions for |
| 567 // the x/dx stepping | 567 // the x/dx stepping |
| 568 #define LOAD2_DATA8_LANE(n) \ | 568 #define LOAD2_DATA8_LANE(n) \ |
| 569 "lsr %5, %3, #16 \n" \ | 569 "lsr %5, %3, #16 \n" \ |
| 570 "add %6, %1, %5 \n" \ | 570 "add %6, %1, %5 \n" \ |
| (...skipping 331 matching lines...) | |
| 902 // the x/dx stepping | 902 // the x/dx stepping |
| 903 #define LOAD1_DATA32_LANE(dn, n) \ | 903 #define LOAD1_DATA32_LANE(dn, n) \ |
| 904 "lsr %5, %3, #16 \n" \ | 904 "lsr %5, %3, #16 \n" \ |
| 905 "add %6, %1, %5, lsl #2 \n" \ | 905 "add %6, %1, %5, lsl #2 \n" \ |
| 906 "add %3, %3, %4 \n" \ | 906 "add %3, %3, %4 \n" \ |
| 907 MEMACCESS(6) \ | 907 MEMACCESS(6) \ |
| 908 "vld1.32 {"#dn"["#n"]}, [%6] \n" | 908 "vld1.32 {"#dn"["#n"]}, [%6] \n" |
| 909 | 909 |
| 910 void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, | 910 void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, |
| 911 int dst_width, int x, int dx) { | 911 int dst_width, int x, int dx) { |
| 912 int tmp = 0; | 912 int tmp; |
| 913 const uint8* src_tmp = src_argb; | 913 const uint8* src_tmp = src_argb; |
| 914 asm volatile ( | 914 asm volatile ( |
| 915 "1: \n" | 915 "1: \n" |
| 916 LOAD1_DATA32_LANE(d0, 0) | 916 LOAD1_DATA32_LANE(d0, 0) |
| 917 LOAD1_DATA32_LANE(d0, 1) | 917 LOAD1_DATA32_LANE(d0, 1) |
| 918 LOAD1_DATA32_LANE(d1, 0) | 918 LOAD1_DATA32_LANE(d1, 0) |
| 919 LOAD1_DATA32_LANE(d1, 1) | 919 LOAD1_DATA32_LANE(d1, 1) |
| 920 LOAD1_DATA32_LANE(d2, 0) | 920 LOAD1_DATA32_LANE(d2, 0) |
| 921 LOAD1_DATA32_LANE(d2, 1) | 921 LOAD1_DATA32_LANE(d2, 1) |
| 922 LOAD1_DATA32_LANE(d3, 0) | 922 LOAD1_DATA32_LANE(d3, 0) |
| 923 LOAD1_DATA32_LANE(d3, 1) | 923 LOAD1_DATA32_LANE(d3, 1) |
| 924 | 924 |
| 925 MEMACCESS(0) | 925 MEMACCESS(0) |
| 926 "vst1.32 {q0, q1}, [%0]! \n" // store pixels | 926 "vst1.32 {q0, q1}, [%0]! \n" // store pixels |
| 927 "subs %2, %2, #8 \n" // 8 processed per loop | 927 "subs %2, %2, #8 \n" // 8 processed per loop |
| 928 "bgt 1b \n" | 928 "bgt 1b \n" |
| 929 : "+r"(dst_argb), // %0 | 929 : "+r"(dst_argb), // %0 |
| 930 "+r"(src_argb), // %1 | 930 "+r"(src_argb), // %1 |
| 931 "+r"(dst_width), // %2 | 931 "+r"(dst_width), // %2 |
| 932 "+r"(x), // %3 | 932 "+r"(x), // %3 |
| 933 "+r"(dx), // %4 | 933 "+r"(dx), // %4 |
| 934 "+r"(tmp), // %5 | 934 "=&r"(tmp), // %5 |
| 935 "+r"(src_tmp) // %6 | 935 "+r"(src_tmp) // %6 |
| 936 : | 936 : |
| 937 : "memory", "cc", "q0", "q1" | 937 : "memory", "cc", "q0", "q1" |
| 938 ); | 938 ); |
| 939 } | 939 } |
| 940 | 940 |
| 941 #undef LOAD1_DATA32_LANE | 941 #undef LOAD1_DATA32_LANE |
| 942 | 942 |
| 943 // TODO(Yang Zhang): Investigate fewer load instructions for | 943 // TODO(Yang Zhang): Investigate fewer load instructions for |
| 944 // the x/dx stepping | 944 // the x/dx stepping |
| 945 #define LOAD2_DATA32_LANE(dn1, dn2, n) \ | 945 #define LOAD2_DATA32_LANE(dn1, dn2, n) \ |
| (...skipping 62 matching lines...) | |
| 1008 } | 1008 } |
| 1009 | 1009 |
| 1010 #undef LOAD2_DATA32_LANE | 1010 #undef LOAD2_DATA32_LANE |
| 1011 | 1011 |
| 1012 #endif // defined(__ARM_NEON__) && !defined(__aarch64__) | 1012 #endif // defined(__ARM_NEON__) && !defined(__aarch64__) |
| 1013 | 1013 |
| 1014 #ifdef __cplusplus | 1014 #ifdef __cplusplus |
| 1015 } // extern "C" | 1015 } // extern "C" |
| 1016 } // namespace libyuv | 1016 } // namespace libyuv |
| 1017 #endif | 1017 #endif |
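Note on the change above: the dummy initializers (`const uint8* src_tmp = NULL;`, `int tmp = 0;`) are dropped and the corresponding asm operands move from the read-write constraint `"+r"` to the write-only, early-clobber constraint `"=&r"`, which tells the compiler the asm writes the register before it has finished consuming the inputs. A minimal sketch of the pattern, not taken from the patch (the function name and the simplified loop are illustrative only):

```c
/* Sketch only: a simplified row-accumulation loop in the style of
 * ScaleAddRows_NEON, showing an "=&r" scratch operand with no dummy
 * initializer.  SumRows8_Sketch is a hypothetical name, not libyuv API. */
#include <stdint.h>
#include <stddef.h>

#if defined(__ARM_NEON__) && !defined(__aarch64__)
// Sums 'height' rows of 8 bytes at 'src' (rows 'stride' apart) into dst[0..7].
static void SumRows8_Sketch(const uint8_t* src, ptrdiff_t stride,
                            uint16_t* dst, int height) {
  const uint8_t* row;  // scratch; no "= NULL" needed with "=&r"
  asm volatile (
    "veor       q2, q2, q2                     \n"  // accumulator = 0
    "mov        %0, %2                         \n"  // row = src (first write to %0)
    "1:                                        \n"
    "vld1.8     {d0}, [%0], %3                 \n"  // load 8 pixels, advance a row
    "vaddw.u8   q2, q2, d0                     \n"  // widen and accumulate
    "subs       %1, %1, #1                     \n"
    "bgt        1b                             \n"
    "vst1.16    {q2}, [%4]                     \n"  // store 8 uint16 sums
    : "=&r"(row),     // %0: write-only scratch; '&' keeps it off input registers
      "+r"(height)    // %1: decremented by the loop
    : "r"(src),       // %2
      "r"(stride),    // %3
      "r"(dst)        // %4
    : "memory", "cc", "q0", "q2"
  );
}
#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
```

Without the `&`, the compiler would be free to place the scratch operand in the same register as one of the inputs, which the first `mov` would then clobber; plain `"=r"` is not enough when the asm writes an output before its last use of an input.

For context on the unchanged `LOAD*_DATA*_LANE` macros and the TODO about the x/dx stepping: `x` and `dx` are 16.16 fixed-point column coordinates, so each lane load shifts `x` right by 16 to get the source column, scales by the pixel size, and then advances `x` by `dx`. A scalar sketch of what ScaleARGBCols_NEON computes (hypothetical helper name; libyuv's own C fallback lives elsewhere):

```c
#include <stdint.h>

// Scalar sketch of the 16.16 fixed-point column stepping done per lane by
// LOAD1_DATA32_LANE: pick the pixel at (x >> 16), then advance x by dx.
static void ScaleARGBCols_Sketch(uint8_t* dst_argb, const uint8_t* src_argb,
                                 int dst_width, int x, int dx) {
  const uint32_t* src = (const uint32_t*)src_argb;  // one ARGB pixel = 4 bytes
  uint32_t* dst = (uint32_t*)dst_argb;
  for (int j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];  // integer part of the fixed-point coordinate
    x += dx;                // step to the next source column
  }
}
```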