| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 514 matching lines...) | |
| 525 "+r"(dst_width), // %2 | 525 "+r"(dst_width), // %2 |
| 526 "+r"(src_stride) // %3 | 526 "+r"(src_stride) // %3 |
| 527 : "r"(&kMult38_Div6), // %4 | 527 : "r"(&kMult38_Div6), // %4 |
| 528 "r"(&kShuf38_2) // %5 | 528 "r"(&kShuf38_2) // %5 |
| 529 : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc" | 529 : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc" |
| 530 ); | 530 ); |
| 531 } | 531 } |
| 532 | 532 |
| 533 void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, | 533 void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, |
| 534 uint16* dst_ptr, int src_width, int src_height) { | 534 uint16* dst_ptr, int src_width, int src_height) { |
| 535 const uint8* src_tmp = NULL; | 535 const uint8* src_tmp; |
| 536 asm volatile ( | 536 asm volatile ( |
| 537 "1: \n" | 537 "1: \n" |
| 538 "mov %0, %1 \n" | 538 "mov %0, %1 \n" |
| 539 "mov r12, %5 \n" | 539 "mov r12, %5 \n" |
| 540 "veor q2, q2, q2 \n" | 540 "veor q2, q2, q2 \n" |
| 541 "veor q3, q3, q3 \n" | 541 "veor q3, q3, q3 \n" |
| 542 "2: \n" | 542 "2: \n" |
| 543 // load 16 pixels into q0 | 543 // load 16 pixels into q0 |
| 544 MEMACCESS(0) | 544 MEMACCESS(0) |
| 545 "vld1.8 {q0}, [%0], %3 \n" | 545 "vld1.8 {q0}, [%0], %3 \n" |
| 546 "vaddw.u8 q3, q3, d1 \n" | 546 "vaddw.u8 q3, q3, d1 \n" |
| 547 "vaddw.u8 q2, q2, d0 \n" | 547 "vaddw.u8 q2, q2, d0 \n" |
| 548 "subs r12, r12, #1 \n" | 548 "subs r12, r12, #1 \n" |
| 549 "bgt 2b \n" | 549 "bgt 2b \n" |
| 550 MEMACCESS(2) | 550 MEMACCESS(2) |
| 551 "vst1.16 {q2, q3}, [%2]! \n" // store pixels | 551 "vst1.16 {q2, q3}, [%2]! \n" // store pixels |
| 552 "add %1, %1, #16 \n" | 552 "add %1, %1, #16 \n" |
| 553 "subs %4, %4, #16 \n" // 16 processed per loop | 553 "subs %4, %4, #16 \n" // 16 processed per loop |
| 554 "bgt 1b \n" | 554 "bgt 1b \n" |
| 555 : "+r"(src_tmp), // %0 | 555 : "=&r"(src_tmp), // %0 |
| 556 "+r"(src_ptr), // %1 | 556 "+r"(src_ptr), // %1 |
| 557 "+r"(dst_ptr), // %2 | 557 "+r"(dst_ptr), // %2 |
| 558 "+r"(src_stride), // %3 | 558 "+r"(src_stride), // %3 |
| 559 "+r"(src_width), // %4 | 559 "+r"(src_width), // %4 |
| 560 "+r"(src_height) // %5 | 560 "+r"(src_height) // %5 |
| 561 : | 561 : |
| 562 : "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List | 562 : "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List |
| 563 ); | 563 ); |
| 564 } | 564 } |
| 565 | 565 |
| 566 // TODO(Yang Zhang): Investigate fewer load instructions for | 566 // TODO(Yang Zhang): Investigate fewer load instructions for |
| 567 // the x/dx stepping | 567 // the x/dx stepping |
| 568 #define LOAD2_DATA8_LANE(n) \ | 568 #define LOAD2_DATA8_LANE(n) \ |
| 569 "lsr %5, %3, #16 \n" \ | 569 "lsr %5, %3, #16 \n" \ |
| 570 "add %6, %1, %5 \n" \ | 570 "add %6, %1, %5 \n" \ |
| (...skipping 331 matching lines...) | |
| 902 // the x/dx stepping | 902 // the x/dx stepping |
| 903 #define LOAD1_DATA32_LANE(dn, n) \ | 903 #define LOAD1_DATA32_LANE(dn, n) \ |
| 904 "lsr %5, %3, #16 \n" \ | 904 "lsr %5, %3, #16 \n" \ |
| 905 "add %6, %1, %5, lsl #2 \n" \ | 905 "add %6, %1, %5, lsl #2 \n" \ |
| 906 "add %3, %3, %4 \n" \ | 906 "add %3, %3, %4 \n" \ |
| 907 MEMACCESS(6) \ | 907 MEMACCESS(6) \ |
| 908 "vld1.32 {"#dn"["#n"]}, [%6] \n" | 908 "vld1.32 {"#dn"["#n"]}, [%6] \n" |
| 909 | 909 |
| 910 void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, | 910 void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, |
| 911 int dst_width, int x, int dx) { | 911 int dst_width, int x, int dx) { |
| 912 int tmp = 0; | 912 int tmp; |
| 913 const uint8* src_tmp = src_argb; | 913 const uint8* src_tmp = src_argb; |
| 914 asm volatile ( | 914 asm volatile ( |
| 915 "1: \n" | 915 "1: \n" |
| 916 LOAD1_DATA32_LANE(d0, 0) | 916 LOAD1_DATA32_LANE(d0, 0) |
| 917 LOAD1_DATA32_LANE(d0, 1) | 917 LOAD1_DATA32_LANE(d0, 1) |
| 918 LOAD1_DATA32_LANE(d1, 0) | 918 LOAD1_DATA32_LANE(d1, 0) |
| 919 LOAD1_DATA32_LANE(d1, 1) | 919 LOAD1_DATA32_LANE(d1, 1) |
| 920 LOAD1_DATA32_LANE(d2, 0) | 920 LOAD1_DATA32_LANE(d2, 0) |
| 921 LOAD1_DATA32_LANE(d2, 1) | 921 LOAD1_DATA32_LANE(d2, 1) |
| 922 LOAD1_DATA32_LANE(d3, 0) | 922 LOAD1_DATA32_LANE(d3, 0) |
| 923 LOAD1_DATA32_LANE(d3, 1) | 923 LOAD1_DATA32_LANE(d3, 1) |
| 924 | 924 |
| 925 MEMACCESS(0) | 925 MEMACCESS(0) |
| 926 "vst1.32 {q0, q1}, [%0]! \n" // store pixels | 926 "vst1.32 {q0, q1}, [%0]! \n" // store pixels |
| 927 "subs %2, %2, #8 \n" // 8 processed per loop | 927 "subs %2, %2, #8 \n" // 8 processed per loop |
| 928 "bgt 1b \n" | 928 "bgt 1b \n" |
| 929 : "+r"(dst_argb), // %0 | 929 : "+r"(dst_argb), // %0 |
| 930 "+r"(src_argb), // %1 | 930 "+r"(src_argb), // %1 |
| 931 "+r"(dst_width), // %2 | 931 "+r"(dst_width), // %2 |
| 932 "+r"(x), // %3 | 932 "+r"(x), // %3 |
| 933 "+r"(dx), // %4 | 933 "+r"(dx), // %4 |
| 934 "+r"(tmp), // %5 | 934 "=&r"(tmp), // %5 |
| 935 "+r"(src_tmp) // %6 | 935 "+r"(src_tmp) // %6 |
| 936 : | 936 : |
| 937 : "memory", "cc", "q0", "q1" | 937 : "memory", "cc", "q0", "q1" |
| 938 ); | 938 ); |
| 939 } | 939 } |
| 940 | 940 |
| 941 #undef LOAD1_DATA32_LANE | 941 #undef LOAD1_DATA32_LANE |
| 942 | 942 |
| 943 // TODO(Yang Zhang): Investigate fewer load instructions for | 943 // TODO(Yang Zhang): Investigate fewer load instructions for |
| 944 // the x/dx stepping | 944 // the x/dx stepping |
| 945 #define LOAD2_DATA32_LANE(dn1, dn2, n) \ | 945 #define LOAD2_DATA32_LANE(dn1, dn2, n) \ |
| (...skipping 62 matching lines...) | |
| 1008 } | 1008 } |
| 1009 | 1009 |
| 1010 #undef LOAD2_DATA32_LANE | 1010 #undef LOAD2_DATA32_LANE |
| 1011 | 1011 |
| 1012 #endif // defined(__ARM_NEON__) && !defined(__aarch64__) | 1012 #endif // defined(__ARM_NEON__) && !defined(__aarch64__) |
| 1013 | 1013 |
| 1014 #ifdef __cplusplus | 1014 #ifdef __cplusplus |
| 1015 } // extern "C" | 1015 } // extern "C" |
| 1016 } // namespace libyuv | 1016 } // namespace libyuv |
| 1017 #endif | 1017 #endif |
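Note on the change above: the dummy initializers (`const uint8* src_tmp = NULL;`, `int tmp = 0;`) are dropped and the corresponding asm operands move from the read-write constraint `"+r"` to the write-only, early-clobber constraint `"=&r"`, which tells the compiler the asm writes the register before it has finished consuming the inputs. A minimal sketch of the pattern, not taken from the patch (the function name and the simplified loop are illustrative only):

```c
/* Sketch only: a simplified row-accumulation loop in the style of
 * ScaleAddRows_NEON, showing an "=&r" scratch operand with no dummy
 * initializer.  SumRows8_Sketch is a hypothetical name, not libyuv API. */
#include <stdint.h>
#include <stddef.h>

#if defined(__ARM_NEON__) && !defined(__aarch64__)
// Sums 'height' rows of 8 bytes at 'src' (rows 'stride' apart) into dst[0..7].
static void SumRows8_Sketch(const uint8_t* src, ptrdiff_t stride,
                            uint16_t* dst, int height) {
  const uint8_t* row;  // scratch; no "= NULL" needed with "=&r"
  asm volatile (
    "veor       q2, q2, q2                     \n"  // accumulator = 0
    "mov        %0, %2                         \n"  // row = src (first write to %0)
    "1:                                        \n"
    "vld1.8     {d0}, [%0], %3                 \n"  // load 8 pixels, advance a row
    "vaddw.u8   q2, q2, d0                     \n"  // widen and accumulate
    "subs       %1, %1, #1                     \n"
    "bgt        1b                             \n"
    "vst1.16    {q2}, [%4]                     \n"  // store 8 uint16 sums
    : "=&r"(row),     // %0: write-only scratch; '&' keeps it off input registers
      "+r"(height)    // %1: decremented by the loop
    : "r"(src),       // %2
      "r"(stride),    // %3
      "r"(dst)        // %4
    : "memory", "cc", "q0", "q2"
  );
}
#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
```

Without the `&`, the compiler would be free to place the scratch operand in the same register as one of the inputs, which the first `mov` would then clobber; plain `"=r"` is not enough when the asm writes an output before its last use of an input.

For context on the unchanged `LOAD*_DATA*_LANE` macros and the TODO about the x/dx stepping: `x` and `dx` are 16.16 fixed-point column coordinates, so each lane load shifts `x` right by 16 to get the source column, scales by the pixel size, and then advances `x` by `dx`. A scalar sketch of what ScaleARGBCols_NEON computes (hypothetical helper name; libyuv's own C fallback lives elsewhere):

```c
#include <stdint.h>

// Scalar sketch of the 16.16 fixed-point column stepping done per lane by
// LOAD1_DATA32_LANE: pick the pixel at (x >> 16), then advance x by dx.
static void ScaleARGBCols_Sketch(uint8_t* dst_argb, const uint8_t* src_argb,
                                 int dst_width, int x, int dx) {
  const uint32_t* src = (const uint32_t*)src_argb;  // one ARGB pixel = 4 bytes
  uint32_t* dst = (uint32_t*)dst_argb;
  for (int j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];  // integer part of the fixed-point coordinate
    x += dx;                // step to the next source column
  }
}
```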