OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 514 matching lines...)
525 "+r"(dst_width), // %2 | 525 "+r"(dst_width), // %2 |
526 "+r"(src_stride) // %3 | 526 "+r"(src_stride) // %3 |
527 : "r"(&kMult38_Div6), // %4 | 527 : "r"(&kMult38_Div6), // %4 |
528 "r"(&kShuf38_2) // %5 | 528 "r"(&kShuf38_2) // %5 |
529 : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc" | 529 : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc" |
530 ); | 530 ); |
531 } | 531 } |
532 | 532 |
533 void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, | 533 void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, |
534 uint16* dst_ptr, int src_width, int src_height) { | 534 uint16* dst_ptr, int src_width, int src_height) { |
535 const uint8* src_tmp = NULL; | 535 const uint8* src_tmp; |
536 asm volatile ( | 536 asm volatile ( |
537 "1: \n" | 537 "1: \n" |
538 "mov %0, %1 \n" | 538 "mov %0, %1 \n" |
539 "mov r12, %5 \n" | 539 "mov r12, %5 \n" |
540 "veor q2, q2, q2 \n" | 540 "veor q2, q2, q2 \n" |
541 "veor q3, q3, q3 \n" | 541 "veor q3, q3, q3 \n" |
542 "2: \n" | 542 "2: \n" |
543 // load 16 pixels into q0 | 543 // load 16 pixels into q0 |
544 MEMACCESS(0) | 544 MEMACCESS(0) |
545 "vld1.8 {q0}, [%0], %3 \n" | 545 "vld1.8 {q0}, [%0], %3 \n" |
546 "vaddw.u8 q3, q3, d1 \n" | 546 "vaddw.u8 q3, q3, d1 \n" |
547 "vaddw.u8 q2, q2, d0 \n" | 547 "vaddw.u8 q2, q2, d0 \n" |
548 "subs r12, r12, #1 \n" | 548 "subs r12, r12, #1 \n" |
549 "bgt 2b \n" | 549 "bgt 2b \n" |
550 MEMACCESS(2) | 550 MEMACCESS(2) |
551 "vst1.16 {q2, q3}, [%2]! \n" // store pixels | 551 "vst1.16 {q2, q3}, [%2]! \n" // store pixels |
552 "add %1, %1, #16 \n" | 552 "add %1, %1, #16 \n" |
553 "subs %4, %4, #16 \n" // 16 processed per loop | 553 "subs %4, %4, #16 \n" // 16 processed per loop |
554 "bgt 1b \n" | 554 "bgt 1b \n" |
555 : "+r"(src_tmp), // %0 | 555 : "=&r"(src_tmp), // %0 |
556 "+r"(src_ptr), // %1 | 556 "+r"(src_ptr), // %1 |
557 "+r"(dst_ptr), // %2 | 557 "+r"(dst_ptr), // %2 |
558 "+r"(src_stride), // %3 | 558 "+r"(src_stride), // %3 |
559 "+r"(src_width), // %4 | 559 "+r"(src_width), // %4 |
560 "+r"(src_height) // %5 | 560 "+r"(src_height) // %5 |
561 : | 561 : |
562 : "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List | 562 : "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List |
563 ); | 563 ); |
564 } | 564 } |
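The only functional change in this hunk is the constraint on the scratch pointer src_tmp: under the old read-write "+r" constraint the variable had to hold a defined value before the asm statement (hence the dropped "= NULL"), whereas the write-only, early-clobber "=&r" constraint tells the compiler that operand %0 is written by the asm before its inputs are consumed, so no dummy initializer is needed and the register is kept distinct from the input operands. For orientation, a hypothetical scalar sketch of what the kernel computes; the function name and stdint typedefs are mine, not part of the patch:

  #include <stddef.h>   /* ptrdiff_t */
  #include <stdint.h>   /* uint8_t/uint16_t stand in for libyuv's uint8/uint16 */

  /* Scalar equivalent of ScaleAddRows_NEON: for each of src_width columns,
   * sum src_height bytes down the column into a 16-bit accumulator.
   * The NEON version handles 16 columns per pass in q2/q3. */
  static void ScaleAddRowsSketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                 uint16_t* dst_ptr, int src_width,
                                 int src_height) {
    int x, y;
    for (x = 0; x < src_width; ++x) {
      /* src_tmp plays the role of asm operand %0: it is written here before
       * being read, which is exactly what "=&r" expresses. */
      const uint8_t* src_tmp = src_ptr + x;
      uint16_t sum = 0;
      for (y = 0; y < src_height; ++y) {
        sum = (uint16_t)(sum + *src_tmp);  /* vaddw.u8 widening accumulate */
        src_tmp += src_stride;             /* advance one row, as the post-incremented vld1.8 does */
      }
      dst_ptr[x] = sum;
    }
  }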
565 | 565 |
566 // TODO(Yang Zhang): Investigate less load instructions for | 566 // TODO(Yang Zhang): Investigate less load instructions for |
567 // the x/dx stepping | 567 // the x/dx stepping |
568 #define LOAD2_DATA8_LANE(n) \ | 568 #define LOAD2_DATA8_LANE(n) \ |
569 "lsr %5, %3, #16 \n" \ | 569 "lsr %5, %3, #16 \n" \ |
570 "add %6, %1, %5 \n" \ | 570 "add %6, %1, %5 \n" \ |
(...skipping 331 matching lines...)
902 // the x/dx stepping | 902 // the x/dx stepping |
903 #define LOAD1_DATA32_LANE(dn, n) \ | 903 #define LOAD1_DATA32_LANE(dn, n) \ |
904 "lsr %5, %3, #16 \n" \ | 904 "lsr %5, %3, #16 \n" \ |
905 "add %6, %1, %5, lsl #2 \n" \ | 905 "add %6, %1, %5, lsl #2 \n" \ |
906 "add %3, %3, %4 \n" \ | 906 "add %3, %3, %4 \n" \ |
907 MEMACCESS(6) \ | 907 MEMACCESS(6) \ |
908 "vld1.32 {"#dn"["#n"]}, [%6] \n" | 908 "vld1.32 {"#dn"["#n"]}, [%6] \n" |
909 | 909 |
910 void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, | 910 void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, |
911 int dst_width, int x, int dx) { | 911 int dst_width, int x, int dx) { |
912 int tmp = 0; | 912 int tmp; |
913 const uint8* src_tmp = src_argb; | 913 const uint8* src_tmp = src_argb; |
914 asm volatile ( | 914 asm volatile ( |
915 "1: \n" | 915 "1: \n" |
916 LOAD1_DATA32_LANE(d0, 0) | 916 LOAD1_DATA32_LANE(d0, 0) |
917 LOAD1_DATA32_LANE(d0, 1) | 917 LOAD1_DATA32_LANE(d0, 1) |
918 LOAD1_DATA32_LANE(d1, 0) | 918 LOAD1_DATA32_LANE(d1, 0) |
919 LOAD1_DATA32_LANE(d1, 1) | 919 LOAD1_DATA32_LANE(d1, 1) |
920 LOAD1_DATA32_LANE(d2, 0) | 920 LOAD1_DATA32_LANE(d2, 0) |
921 LOAD1_DATA32_LANE(d2, 1) | 921 LOAD1_DATA32_LANE(d2, 1) |
922 LOAD1_DATA32_LANE(d3, 0) | 922 LOAD1_DATA32_LANE(d3, 0) |
923 LOAD1_DATA32_LANE(d3, 1) | 923 LOAD1_DATA32_LANE(d3, 1) |
924 | 924 |
925 MEMACCESS(0) | 925 MEMACCESS(0) |
926 "vst1.32 {q0, q1}, [%0]! \n" // store pixels | 926 "vst1.32 {q0, q1}, [%0]! \n" // store pixels |
927 "subs %2, %2, #8 \n" // 8 processed per loop | 927 "subs %2, %2, #8 \n" // 8 processed per loop |
928 "bgt 1b \n" | 928 "bgt 1b \n" |
929 : "+r"(dst_argb), // %0 | 929 : "+r"(dst_argb), // %0 |
930 "+r"(src_argb), // %1 | 930 "+r"(src_argb), // %1 |
931 "+r"(dst_width), // %2 | 931 "+r"(dst_width), // %2 |
932 "+r"(x), // %3 | 932 "+r"(x), // %3 |
933 "+r"(dx), // %4 | 933 "+r"(dx), // %4 |
934 "+r"(tmp), // %5 | 934 "=&r"(tmp), // %5 |
935 "+r"(src_tmp) // %6 | 935 "+r"(src_tmp) // %6 |
936 : | 936 : |
937 : "memory", "cc", "q0", "q1" | 937 : "memory", "cc", "q0", "q1" |
938 ); | 938 ); |
939 } | 939 } |
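As with src_tmp above, tmp is only ever written inside the asm (it holds x >> 16 for the lane loads), so its "= 0" initializer is dropped and operand %5 becomes a write-only early-clobber "=&r" output. For reference, a hypothetical scalar sketch of the x/dx stepping that LOAD1_DATA32_LANE performs; the pointer parameters are widened to uint32_t here so one element equals one ARGB pixel, which is not how the real uint8* signature is typed:

  #include <stdint.h>

  /* x and dx are 16.16 fixed point: each output pixel copies the source
   * pixel at index x >> 16 and then advances x by dx.  The NEON code
   * gathers eight such pixels into q0/q1 per loop iteration. */
  static void ScaleARGBColsSketch(uint32_t* dst_argb, const uint32_t* src_argb,
                                  int dst_width, int x, int dx) {
    int j;
    for (j = 0; j < dst_width; ++j) {
      dst_argb[j] = src_argb[x >> 16];  /* lsr %5, %3, #16 ; add %6, %1, %5, lsl #2 */
      x += dx;                          /* add %3, %3, %4 */
    }
  }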
940 | 940 |
941 #undef LOAD1_DATA32_LANE | 941 #undef LOAD1_DATA32_LANE |
942 | 942 |
943 // TODO(Yang Zhang): Investigate less load instructions for | 943 // TODO(Yang Zhang): Investigate less load instructions for |
944 // the x/dx stepping | 944 // the x/dx stepping |
945 #define LOAD2_DATA32_LANE(dn1, dn2, n) \ | 945 #define LOAD2_DATA32_LANE(dn1, dn2, n) \ |
(...skipping 62 matching lines...)
1008 } | 1008 } |
1009 | 1009 |
1010 #undef LOAD2_DATA32_LANE | 1010 #undef LOAD2_DATA32_LANE |
1011 | 1011 |
1012 #endif // defined(__ARM_NEON__) && !defined(__aarch64__) | 1012 #endif // defined(__ARM_NEON__) && !defined(__aarch64__) |
1013 | 1013 |
1014 #ifdef __cplusplus | 1014 #ifdef __cplusplus |
1015 } // extern "C" | 1015 } // extern "C" |
1016 } // namespace libyuv | 1016 } // namespace libyuv |
1017 #endif | 1017 #endif |