Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(81)

Side by Side Diff: source/scale_neon.cc

Issue 1895743008: Remove zero-initialization of output variables used by inline assembly. (Closed) Base URL: https://chromium.googlesource.com/libyuv/libyuv@master
Patch Set: use early-clobber ("=&r") constraints for all outputs to avoid them being assigned the same register as an input. Created 4 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/scale_gcc.cc ('k') | source/scale_neon64.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 514 matching lines...) Expand 10 before | Expand all | Expand 10 after
525 "+r"(dst_width), // %2 525 "+r"(dst_width), // %2
526 "+r"(src_stride) // %3 526 "+r"(src_stride) // %3
527 : "r"(&kMult38_Div6), // %4 527 : "r"(&kMult38_Div6), // %4
528 "r"(&kShuf38_2) // %5 528 "r"(&kShuf38_2) // %5
529 : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc" 529 : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
530 ); 530 );
531 } 531 }
532 532
533 void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, 533 void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
534 uint16* dst_ptr, int src_width, int src_height) { 534 uint16* dst_ptr, int src_width, int src_height) {
535 const uint8* src_tmp = NULL; 535 const uint8* src_tmp;
536 asm volatile ( 536 asm volatile (
537 "1: \n" 537 "1: \n"
538 "mov %0, %1 \n" 538 "mov %0, %1 \n"
539 "mov r12, %5 \n" 539 "mov r12, %5 \n"
540 "veor q2, q2, q2 \n" 540 "veor q2, q2, q2 \n"
541 "veor q3, q3, q3 \n" 541 "veor q3, q3, q3 \n"
542 "2: \n" 542 "2: \n"
543 // load 16 pixels into q0 543 // load 16 pixels into q0
544 MEMACCESS(0) 544 MEMACCESS(0)
545 "vld1.8 {q0}, [%0], %3 \n" 545 "vld1.8 {q0}, [%0], %3 \n"
546 "vaddw.u8 q3, q3, d1 \n" 546 "vaddw.u8 q3, q3, d1 \n"
547 "vaddw.u8 q2, q2, d0 \n" 547 "vaddw.u8 q2, q2, d0 \n"
548 "subs r12, r12, #1 \n" 548 "subs r12, r12, #1 \n"
549 "bgt 2b \n" 549 "bgt 2b \n"
550 MEMACCESS(2) 550 MEMACCESS(2)
551 "vst1.16 {q2, q3}, [%2]! \n" // store pixels 551 "vst1.16 {q2, q3}, [%2]! \n" // store pixels
552 "add %1, %1, #16 \n" 552 "add %1, %1, #16 \n"
553 "subs %4, %4, #16 \n" // 16 processed per loop 553 "subs %4, %4, #16 \n" // 16 processed per loop
554 "bgt 1b \n" 554 "bgt 1b \n"
555 : "+r"(src_tmp), // %0 555 : "=&r"(src_tmp), // %0
556 "+r"(src_ptr), // %1 556 "+r"(src_ptr), // %1
557 "+r"(dst_ptr), // %2 557 "+r"(dst_ptr), // %2
558 "+r"(src_stride), // %3 558 "+r"(src_stride), // %3
559 "+r"(src_width), // %4 559 "+r"(src_width), // %4
560 "+r"(src_height) // %5 560 "+r"(src_height) // %5
561 : 561 :
562 : "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List 562 : "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List
563 ); 563 );
564 } 564 }
565 565
566 // TODO(Yang Zhang): Investigate fewer load instructions for 566 // TODO(Yang Zhang): Investigate fewer load instructions for
567 // the x/dx stepping 567 // the x/dx stepping
568 #define LOAD2_DATA8_LANE(n) \ 568 #define LOAD2_DATA8_LANE(n) \
569 "lsr %5, %3, #16 \n" \ 569 "lsr %5, %3, #16 \n" \
570 "add %6, %1, %5 \n" \ 570 "add %6, %1, %5 \n" \
(...skipping 331 matching lines...) Expand 10 before | Expand all | Expand 10 after
902 // the x/dx stepping 902 // the x/dx stepping
903 #define LOAD1_DATA32_LANE(dn, n) \ 903 #define LOAD1_DATA32_LANE(dn, n) \
904 "lsr %5, %3, #16 \n" \ 904 "lsr %5, %3, #16 \n" \
905 "add %6, %1, %5, lsl #2 \n" \ 905 "add %6, %1, %5, lsl #2 \n" \
906 "add %3, %3, %4 \n" \ 906 "add %3, %3, %4 \n" \
907 MEMACCESS(6) \ 907 MEMACCESS(6) \
908 "vld1.32 {"#dn"["#n"]}, [%6] \n" 908 "vld1.32 {"#dn"["#n"]}, [%6] \n"
909 909
910 void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, 910 void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
911 int dst_width, int x, int dx) { 911 int dst_width, int x, int dx) {
912 int tmp = 0; 912 int tmp;
913 const uint8* src_tmp = src_argb; 913 const uint8* src_tmp = src_argb;
914 asm volatile ( 914 asm volatile (
915 "1: \n" 915 "1: \n"
916 LOAD1_DATA32_LANE(d0, 0) 916 LOAD1_DATA32_LANE(d0, 0)
917 LOAD1_DATA32_LANE(d0, 1) 917 LOAD1_DATA32_LANE(d0, 1)
918 LOAD1_DATA32_LANE(d1, 0) 918 LOAD1_DATA32_LANE(d1, 0)
919 LOAD1_DATA32_LANE(d1, 1) 919 LOAD1_DATA32_LANE(d1, 1)
920 LOAD1_DATA32_LANE(d2, 0) 920 LOAD1_DATA32_LANE(d2, 0)
921 LOAD1_DATA32_LANE(d2, 1) 921 LOAD1_DATA32_LANE(d2, 1)
922 LOAD1_DATA32_LANE(d3, 0) 922 LOAD1_DATA32_LANE(d3, 0)
923 LOAD1_DATA32_LANE(d3, 1) 923 LOAD1_DATA32_LANE(d3, 1)
924 924
925 MEMACCESS(0) 925 MEMACCESS(0)
926 "vst1.32 {q0, q1}, [%0]! \n" // store pixels 926 "vst1.32 {q0, q1}, [%0]! \n" // store pixels
927 "subs %2, %2, #8 \n" // 8 processed per loop 927 "subs %2, %2, #8 \n" // 8 processed per loop
928 "bgt 1b \n" 928 "bgt 1b \n"
929 : "+r"(dst_argb), // %0 929 : "+r"(dst_argb), // %0
930 "+r"(src_argb), // %1 930 "+r"(src_argb), // %1
931 "+r"(dst_width), // %2 931 "+r"(dst_width), // %2
932 "+r"(x), // %3 932 "+r"(x), // %3
933 "+r"(dx), // %4 933 "+r"(dx), // %4
934 "+r"(tmp), // %5 934 "=&r"(tmp), // %5
935 "+r"(src_tmp) // %6 935 "+r"(src_tmp) // %6
936 : 936 :
937 : "memory", "cc", "q0", "q1" 937 : "memory", "cc", "q0", "q1"
938 ); 938 );
939 } 939 }
940 940
941 #undef LOAD1_DATA32_LANE 941 #undef LOAD1_DATA32_LANE
942 942
943 // TODO(Yang Zhang): Investigate fewer load instructions for 943 // TODO(Yang Zhang): Investigate fewer load instructions for
944 // the x/dx stepping 944 // the x/dx stepping
945 #define LOAD2_DATA32_LANE(dn1, dn2, n) \ 945 #define LOAD2_DATA32_LANE(dn1, dn2, n) \
(...skipping 62 matching lines...) Expand 10 before | Expand all | Expand 10 after
1008 } 1008 }
1009 1009
1010 #undef LOAD2_DATA32_LANE 1010 #undef LOAD2_DATA32_LANE
1011 1011
1012 #endif // defined(__ARM_NEON__) && !defined(__aarch64__) 1012 #endif // defined(__ARM_NEON__) && !defined(__aarch64__)
1013 1013
1014 #ifdef __cplusplus 1014 #ifdef __cplusplus
1015 } // extern "C" 1015 } // extern "C"
1016 } // namespace libyuv 1016 } // namespace libyuv
1017 #endif 1017 #endif
OLDNEW
« no previous file with comments | « source/scale_gcc.cc ('k') | source/scale_neon64.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698