source/scale_neon64.cc - Issue 1895743008: Remove initialize to zero on output variables for inline.

Side by Side Diff: source/scale_neon64.cc

Issue 1895743008: Remove initialize to zero on output variables for inline. (Closed) Base URL: https://chromium.googlesource.com/libyuv/libyuv@master

Patch Set: use early write for all outputs to avoid them being reassigned to input. Created 4 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright 2014 The LibYuv Project Authors. All rights reserved.	2 * Copyright 2014 The LibYuv Project Authors. All rights reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

(...skipping 529 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
540 "+r"(dst_width) // %3	540 "+r"(dst_width) // %3

541 : "r"(&kMult38_Div6), // %4	541 : "r"(&kMult38_Div6), // %4

542 "r"(&kShuf38_2) // %5	542 "r"(&kShuf38_2) // %5

543 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",	543 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",

544 "v18", "v19", "v30", "v31", "memory", "cc"	544 "v18", "v19", "v30", "v31", "memory", "cc"

545 );	545 );

546 }	546 }

547	547

548 void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,	548 void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,

549 uint16* dst_ptr, int src_width, int src_height) {	549 uint16* dst_ptr, int src_width, int src_height) {

550 const uint8* src_tmp = NULL;	550 const uint8* src_tmp;

551 asm volatile (	551 asm volatile (

552 "1: \n"	552 "1: \n"

553 "mov %0, %1 \n"	553 "mov %0, %1 \n"

554 "mov w12, %w5 \n"	554 "mov w12, %w5 \n"

555 "eor v2.16b, v2.16b, v2.16b \n"	555 "eor v2.16b, v2.16b, v2.16b \n"

556 "eor v3.16b, v3.16b, v3.16b \n"	556 "eor v3.16b, v3.16b, v3.16b \n"

557 "2: \n"	557 "2: \n"

558 // load 16 pixels into q0	558 // load 16 pixels into q0

559 MEMACCESS(0)	559 MEMACCESS(0)

560 "ld1 {v0.16b}, [%0], %3 \n"	560 "ld1 {v0.16b}, [%0], %3 \n"

561 "uaddw2 v3.8h, v3.8h, v0.16b \n"	561 "uaddw2 v3.8h, v3.8h, v0.16b \n"

562 "uaddw v2.8h, v2.8h, v0.8b \n"	562 "uaddw v2.8h, v2.8h, v0.8b \n"

563 "subs w12, w12, #1 \n"	563 "subs w12, w12, #1 \n"

564 "b.gt 2b \n"	564 "b.gt 2b \n"

565 MEMACCESS(2)	565 MEMACCESS(2)

566 "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels	566 "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels

567 "add %1, %1, #16 \n"	567 "add %1, %1, #16 \n"

568 "subs %w4, %w4, #16 \n" // 16 processed per loop	568 "subs %w4, %w4, #16 \n" // 16 processed per loop

569 "b.gt 1b \n"	569 "b.gt 1b \n"

570 : "+r"(src_tmp), // %0	570 : "=&r"(src_tmp), // %0

571 "+r"(src_ptr), // %1	571 "+r"(src_ptr), // %1

572 "+r"(dst_ptr), // %2	572 "+r"(dst_ptr), // %2

573 "+r"(src_stride), // %3	573 "+r"(src_stride), // %3

574 "+r"(src_width), // %4	574 "+r"(src_width), // %4

575 "+r"(src_height) // %5	575 "+r"(src_height) // %5

576 :	576 :

577 : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List	577 : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List

578 );	578 );

579 }	579 }

580	580

581 // TODO(Yang Zhang): Investigate less load instructions for	581 // TODO(Yang Zhang): Investigate less load instructions for

582 // the x/dx stepping	582 // the x/dx stepping

583 #define LOAD2_DATA8_LANE(n) \	583 #define LOAD2_DATA8_LANE(n) \

584 "lsr %5, %3, #16 \n" \	584 "lsr %5, %3, #16 \n" \

585 "add %6, %1, %5 \n" \	585 "add %6, %1, %5 \n" \

(...skipping 338 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
924 "add %3, %3, %4 \n" \	924 "add %3, %3, %4 \n" \

925 MEMACCESS(6) \	925 MEMACCESS(6) \

926 "ld1 {"#vn".s}["#n"], [%6] \n"	926 "ld1 {"#vn".s}["#n"], [%6] \n"

927	927

928 void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,	928 void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,

929 int dst_width, int x, int dx) {	929 int dst_width, int x, int dx) {

930 const uint8* src_tmp = src_argb;	930 const uint8* src_tmp = src_argb;

931 int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning.	931 int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning.

932 int64 x64 = (int64) x;	932 int64 x64 = (int64) x;

933 int64 dx64 = (int64) dx;	933 int64 dx64 = (int64) dx;

934 int64 tmp64 = 0;	934 int64 tmp64;

935 asm volatile (	935 asm volatile (

936 "1: \n"	936 "1: \n"

937 LOAD1_DATA32_LANE(v0, 0)	937 LOAD1_DATA32_LANE(v0, 0)

938 LOAD1_DATA32_LANE(v0, 1)	938 LOAD1_DATA32_LANE(v0, 1)

939 LOAD1_DATA32_LANE(v0, 2)	939 LOAD1_DATA32_LANE(v0, 2)

940 LOAD1_DATA32_LANE(v0, 3)	940 LOAD1_DATA32_LANE(v0, 3)

941 LOAD1_DATA32_LANE(v1, 0)	941 LOAD1_DATA32_LANE(v1, 0)

942 LOAD1_DATA32_LANE(v1, 1)	942 LOAD1_DATA32_LANE(v1, 1)

943 LOAD1_DATA32_LANE(v1, 2)	943 LOAD1_DATA32_LANE(v1, 2)

944 LOAD1_DATA32_LANE(v1, 3)	944 LOAD1_DATA32_LANE(v1, 3)

945	945

946 MEMACCESS(0)	946 MEMACCESS(0)

947 "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels	947 "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels

948 "subs %w2, %w2, #8 \n" // 8 processed per loop	948 "subs %w2, %w2, #8 \n" // 8 processed per loop

949 "b.gt 1b \n"	949 "b.gt 1b \n"

950 : "+r"(dst_argb), // %0	950 : "+r"(dst_argb), // %0

951 "+r"(src_argb), // %1	951 "+r"(src_argb), // %1

952 "+r"(dst_width64), // %2	952 "+r"(dst_width64), // %2

953 "+r"(x64), // %3	953 "+r"(x64), // %3

954 "+r"(dx64), // %4	954 "+r"(dx64), // %4

955 "+r"(tmp64), // %5	955 "=&r"(tmp64), // %5

956 "+r"(src_tmp) // %6	956 "+r"(src_tmp) // %6

957 :	957 :

958 : "memory", "cc", "v0", "v1"	958 : "memory", "cc", "v0", "v1"

959 );	959 );

960 }	960 }

961	961

962 #undef LOAD1_DATA32_LANE	962 #undef LOAD1_DATA32_LANE

963	963

964 // TODO(Yang Zhang): Investigate less load instructions for	964 // TODO(Yang Zhang): Investigate less load instructions for

965 // the x/dx stepping	965 // the x/dx stepping

966 #define LOAD2_DATA32_LANE(vn1, vn2, n) \	966 #define LOAD2_DATA32_LANE(vn1, vn2, n) \

(...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
1033 }	1033 }

1034	1034

1035 #undef LOAD2_DATA32_LANE	1035 #undef LOAD2_DATA32_LANE

1036	1036

1037 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)	1037 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)

1038	1038

1039 #ifdef __cplusplus	1039 #ifdef __cplusplus

1040 } // extern "C"	1040 } // extern "C"

1041 } // namespace libyuv	1041 } // namespace libyuv

1042 #endif	1042 #endif

OLD	NEW

« no previous file with comments | « source/scale_neon.cc ('k') | no next file » | no next file with comments »