Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(776)

Side by Side Diff: source/scale_neon64.cc

Issue 1895743008: Remove initialize to zero on output variables for inline. (Closed) Base URL: https://chromium.googlesource.com/libyuv/libyuv@master
Patch Set: use early write for all outputs to avoid them being reassigned to input Created 4 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/scale_neon.cc ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 /* 1 /*
2 * Copyright 2014 The LibYuv Project Authors. All rights reserved. 2 * Copyright 2014 The LibYuv Project Authors. All rights reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 529 matching lines...) Expand 10 before | Expand all | Expand 10 after
540 "+r"(dst_width) // %3 540 "+r"(dst_width) // %3
541 : "r"(&kMult38_Div6), // %4 541 : "r"(&kMult38_Div6), // %4
542 "r"(&kShuf38_2) // %5 542 "r"(&kShuf38_2) // %5
543 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", 543 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
544 "v18", "v19", "v30", "v31", "memory", "cc" 544 "v18", "v19", "v30", "v31", "memory", "cc"
545 ); 545 );
546 } 546 }
547 547
548 void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, 548 void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
549 uint16* dst_ptr, int src_width, int src_height) { 549 uint16* dst_ptr, int src_width, int src_height) {
550 const uint8* src_tmp = NULL; 550 const uint8* src_tmp;
551 asm volatile ( 551 asm volatile (
552 "1: \n" 552 "1: \n"
553 "mov %0, %1 \n" 553 "mov %0, %1 \n"
554 "mov w12, %w5 \n" 554 "mov w12, %w5 \n"
555 "eor v2.16b, v2.16b, v2.16b \n" 555 "eor v2.16b, v2.16b, v2.16b \n"
556 "eor v3.16b, v3.16b, v3.16b \n" 556 "eor v3.16b, v3.16b, v3.16b \n"
557 "2: \n" 557 "2: \n"
558 // load 16 pixels into q0 558 // load 16 pixels into q0
559 MEMACCESS(0) 559 MEMACCESS(0)
560 "ld1 {v0.16b}, [%0], %3 \n" 560 "ld1 {v0.16b}, [%0], %3 \n"
561 "uaddw2 v3.8h, v3.8h, v0.16b \n" 561 "uaddw2 v3.8h, v3.8h, v0.16b \n"
562 "uaddw v2.8h, v2.8h, v0.8b \n" 562 "uaddw v2.8h, v2.8h, v0.8b \n"
563 "subs w12, w12, #1 \n" 563 "subs w12, w12, #1 \n"
564 "b.gt 2b \n" 564 "b.gt 2b \n"
565 MEMACCESS(2) 565 MEMACCESS(2)
566 "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels 566 "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels
567 "add %1, %1, #16 \n" 567 "add %1, %1, #16 \n"
568 "subs %w4, %w4, #16 \n" // 16 processed per loop 568 "subs %w4, %w4, #16 \n" // 16 processed per loop
569 "b.gt 1b \n" 569 "b.gt 1b \n"
570 : "+r"(src_tmp), // %0 570 : "=&r"(src_tmp), // %0
571 "+r"(src_ptr), // %1 571 "+r"(src_ptr), // %1
572 "+r"(dst_ptr), // %2 572 "+r"(dst_ptr), // %2
573 "+r"(src_stride), // %3 573 "+r"(src_stride), // %3
574 "+r"(src_width), // %4 574 "+r"(src_width), // %4
575 "+r"(src_height) // %5 575 "+r"(src_height) // %5
576 : 576 :
577 : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List 577 : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List
578 ); 578 );
579 } 579 }
580 580
581 // TODO(Yang Zhang): Investigate less load instructions for 581 // TODO(Yang Zhang): Investigate less load instructions for
582 // the x/dx stepping 582 // the x/dx stepping
583 #define LOAD2_DATA8_LANE(n) \ 583 #define LOAD2_DATA8_LANE(n) \
584 "lsr %5, %3, #16 \n" \ 584 "lsr %5, %3, #16 \n" \
585 "add %6, %1, %5 \n" \ 585 "add %6, %1, %5 \n" \
(...skipping 338 matching lines...) Expand 10 before | Expand all | Expand 10 after
924 "add %3, %3, %4 \n" \ 924 "add %3, %3, %4 \n" \
925 MEMACCESS(6) \ 925 MEMACCESS(6) \
926 "ld1 {"#vn".s}["#n"], [%6] \n" 926 "ld1 {"#vn".s}["#n"], [%6] \n"
927 927
928 void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, 928 void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
929 int dst_width, int x, int dx) { 929 int dst_width, int x, int dx) {
930 const uint8* src_tmp = src_argb; 930 const uint8* src_tmp = src_argb;
931 int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. 931 int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning.
932 int64 x64 = (int64) x; 932 int64 x64 = (int64) x;
933 int64 dx64 = (int64) dx; 933 int64 dx64 = (int64) dx;
934 int64 tmp64 = 0; 934 int64 tmp64;
935 asm volatile ( 935 asm volatile (
936 "1: \n" 936 "1: \n"
937 LOAD1_DATA32_LANE(v0, 0) 937 LOAD1_DATA32_LANE(v0, 0)
938 LOAD1_DATA32_LANE(v0, 1) 938 LOAD1_DATA32_LANE(v0, 1)
939 LOAD1_DATA32_LANE(v0, 2) 939 LOAD1_DATA32_LANE(v0, 2)
940 LOAD1_DATA32_LANE(v0, 3) 940 LOAD1_DATA32_LANE(v0, 3)
941 LOAD1_DATA32_LANE(v1, 0) 941 LOAD1_DATA32_LANE(v1, 0)
942 LOAD1_DATA32_LANE(v1, 1) 942 LOAD1_DATA32_LANE(v1, 1)
943 LOAD1_DATA32_LANE(v1, 2) 943 LOAD1_DATA32_LANE(v1, 2)
944 LOAD1_DATA32_LANE(v1, 3) 944 LOAD1_DATA32_LANE(v1, 3)
945 945
946 MEMACCESS(0) 946 MEMACCESS(0)
947 "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels 947 "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
948 "subs %w2, %w2, #8 \n" // 8 processed per loop 948 "subs %w2, %w2, #8 \n" // 8 processed per loop
949 "b.gt 1b \n" 949 "b.gt 1b \n"
950 : "+r"(dst_argb), // %0 950 : "+r"(dst_argb), // %0
951 "+r"(src_argb), // %1 951 "+r"(src_argb), // %1
952 "+r"(dst_width64), // %2 952 "+r"(dst_width64), // %2
953 "+r"(x64), // %3 953 "+r"(x64), // %3
954 "+r"(dx64), // %4 954 "+r"(dx64), // %4
955 "+r"(tmp64), // %5 955 "=&r"(tmp64), // %5
956 "+r"(src_tmp) // %6 956 "+r"(src_tmp) // %6
957 : 957 :
958 : "memory", "cc", "v0", "v1" 958 : "memory", "cc", "v0", "v1"
959 ); 959 );
960 } 960 }
961 961
962 #undef LOAD1_DATA32_LANE 962 #undef LOAD1_DATA32_LANE
963 963
964 // TODO(Yang Zhang): Investigate less load instructions for 964 // TODO(Yang Zhang): Investigate less load instructions for
965 // the x/dx stepping 965 // the x/dx stepping
966 #define LOAD2_DATA32_LANE(vn1, vn2, n) \ 966 #define LOAD2_DATA32_LANE(vn1, vn2, n) \
(...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after
1033 } 1033 }
1034 1034
1035 #undef LOAD2_DATA32_LANE 1035 #undef LOAD2_DATA32_LANE
1036 1036
1037 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) 1037 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
1038 1038
1039 #ifdef __cplusplus 1039 #ifdef __cplusplus
1040 } // extern "C" 1040 } // extern "C"
1041 } // namespace libyuv 1041 } // namespace libyuv
1042 #endif 1042 #endif
OLD | NEW
« no previous file with comments | « source/scale_neon.cc ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698