| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2014 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2014 The LibYuv Project Authors. All rights reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 529 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 540 "+r"(dst_width) // %3 | 540 "+r"(dst_width) // %3 |
| 541 : "r"(&kMult38_Div6), // %4 | 541 : "r"(&kMult38_Div6), // %4 |
| 542 "r"(&kShuf38_2) // %5 | 542 "r"(&kShuf38_2) // %5 |
| 543 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", | 543 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", |
| 544 "v18", "v19", "v30", "v31", "memory", "cc" | 544 "v18", "v19", "v30", "v31", "memory", "cc" |
| 545 ); | 545 ); |
| 546 } | 546 } |
| 547 | 547 |
// Review note: each row below is one line of a side-by-side diff, OLD | NEW.
// ScaleAddRows_NEON sums src_height rows of bytes column by column and
// writes the per-column sums to dst_ptr as 16-bit values, 16 columns at a
// time. Both loops test their counter after the body, so each body runs at
// least once and src_width is consumed in whole steps of 16.
// The only change in NEW: src_tmp loses its NULL initializer and its operand
// constraint changes from read-write (+r) to write-only early-clobber (=&r).
| 548 void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, | 548 void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, |
| 549 uint16* dst_ptr, int src_width, int src_height) { | 549 uint16* dst_ptr, int src_width, int src_height) { |
| 550 const uint8* src_tmp = NULL; | 550 const uint8* src_tmp; |
| 551 asm volatile ( | 551 asm volatile ( |
// Outer loop (label 1): restart the row walk at the current column group.
// %0 (src_tmp) is overwritten from %1 here before it is ever read, which is
// why a write-only constraint is sufficient for it.
| 552 "1: \n" | 552 "1: \n" |
| 553 "mov %0, %1 \n" | 553 "mov %0, %1 \n" |
// w12 is the inner-loop row counter, reloaded from src_height (%w5).
| 554 "mov w12, %w5 \n" | 554 "mov w12, %w5 \n" |
// Zero the two 8x16-bit accumulators (v2: columns 0-7, v3: columns 8-15).
| 555 "eor v2.16b, v2.16b, v2.16b \n" | 555 "eor v2.16b, v2.16b, v2.16b \n" |
| 556 "eor v3.16b, v3.16b, v3.16b \n" | 556 "eor v3.16b, v3.16b, v3.16b \n" |
// Inner loop (label 2): one source row per iteration.
| 557 "2: \n" | 557 "2: \n" |
| 558 // load 16 pixels into q0 | 558 // load 16 pixels into q0 |
| 559 MEMACCESS(0) | 559 MEMACCESS(0) |
// Post-increment by src_stride (%3) moves %0 to the same columns, next row.
| 560 "ld1 {v0.16b}, [%0], %3 \n" | 560 "ld1 {v0.16b}, [%0], %3 \n" |
// Widening adds: upper 8 bytes of v0 into v3, lower 8 bytes into v2.
| 561 "uaddw2 v3.8h, v3.8h, v0.16b \n" | 561 "uaddw2 v3.8h, v3.8h, v0.16b \n" |
| 562 "uaddw v2.8h, v2.8h, v0.8b \n" | 562 "uaddw v2.8h, v2.8h, v0.8b \n" |
| 563 "subs w12, w12, #1 \n" | 563 "subs w12, w12, #1 \n" |
| 564 "b.gt 2b \n" | 564 "b.gt 2b \n" |
| 565 MEMACCESS(2) | 565 MEMACCESS(2) |
// Write the 16 column sums (32 bytes) and advance dst_ptr past them.
| 566 "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels | 566 "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels |
// Step src_ptr to the next group of 16 columns in the first row.
| 567 "add %1, %1, #16 \n" | 567 "add %1, %1, #16 \n" |
| 568 "subs %w4, %w4, #16 \n" // 16 processed per loop | 568 "subs %w4, %w4, #16 \n" // 16 processed per loop |
| 569 "b.gt 1b \n" | 569 "b.gt 1b \n" |
// Operand list. In NEW, %0 is early-clobber because the asm writes it (the
// mov above) while the other operands are still needed afterwards, so the
// compiler must not place it in a register shared with any of them.
| 570 : "+r"(src_tmp), // %0 | 570 : "=&r"(src_tmp), // %0 |
| 571 "+r"(src_ptr), // %1 | 571 "+r"(src_ptr), // %1 |
| 572 "+r"(dst_ptr), // %2 | 572 "+r"(dst_ptr), // %2 |
| 573 "+r"(src_stride), // %3 | 573 "+r"(src_stride), // %3 |
| 574 "+r"(src_width), // %4 | 574 "+r"(src_width), // %4 |
| 575 "+r"(src_height) // %5 | 575 "+r"(src_height) // %5 |
| 576 : | 576 : |
| 577 : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List | 577 : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List |
| 578 ); | 578 ); |
| 579 } | 579 } |
| 580 | 580 |
| 581 // TODO(Yang Zhang): Investigate fewer load instructions for | 581 // TODO(Yang Zhang): Investigate fewer load instructions for |
| 582 // the x/dx stepping | 582 // the x/dx stepping |
| 583 #define LOAD2_DATA8_LANE(n) \ | 583 #define LOAD2_DATA8_LANE(n) \ |
| 584 "lsr %5, %3, #16 \n" \ | 584 "lsr %5, %3, #16 \n" \ |
| 585 "add %6, %1, %5 \n" \ | 585 "add %6, %1, %5 \n" \ |
| (...skipping 338 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 924 "add %3, %3, %4 \n" \ | 924 "add %3, %3, %4 \n" \ |
| 925 MEMACCESS(6) \ | 925 MEMACCESS(6) \ |
| 926 "ld1 {"#vn".s}["#n"], [%6] \n" | 926 "ld1 {"#vn".s}["#n"], [%6] \n" |
| 927 | 927 |
// Review note: each row below is one line of a side-by-side diff, OLD | NEW.
// ScaleARGBCols_NEON fills dst_argb eight 32-bit pixels per loop iteration:
// eight LOAD1_DATA32_LANE expansions gather one lane each into v0 and v1,
// then both vectors (32 bytes) are stored. The loop tests its counter after
// the body, so the body runs at least once.
// The only change in NEW: tmp64 loses its zero initializer and its operand
// constraint changes from read-write (+r) to write-only early-clobber (=&r).
| 928 void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, | 928 void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, |
| 929 int dst_width, int x, int dx) { | 929 int dst_width, int x, int dx) { |
| 930 const uint8* src_tmp = src_argb; | 930 const uint8* src_tmp = src_argb; |
// The int arguments are copied into 64-bit locals before being bound as
// asm operands.
| 931 int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. | 931 int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. |
| 932 int64 x64 = (int64) x; | 932 int64 x64 = (int64) x; |
| 933 int64 dx64 = (int64) dx; | 933 int64 dx64 = (int64) dx; |
// NOTE(review): a write-only tmp64 is only valid if LOAD1_DATA32_LANE writes
// %5 before reading it; the first lines of that macro are outside this view.
// Confirm against the macro definition.
| 934 int64 tmp64 = 0; | 934 int64 tmp64; |
| 935 asm volatile ( | 935 asm volatile ( |
| 936 "1: \n" | 936 "1: \n" |
// Lanes 0-3 of v0, then lanes 0-3 of v1. The visible tail of the macro
// advances %3 by %4 (x += dx) and loads one 32-bit lane from [%6].
| 937 LOAD1_DATA32_LANE(v0, 0) | 937 LOAD1_DATA32_LANE(v0, 0) |
| 938 LOAD1_DATA32_LANE(v0, 1) | 938 LOAD1_DATA32_LANE(v0, 1) |
| 939 LOAD1_DATA32_LANE(v0, 2) | 939 LOAD1_DATA32_LANE(v0, 2) |
| 940 LOAD1_DATA32_LANE(v0, 3) | 940 LOAD1_DATA32_LANE(v0, 3) |
| 941 LOAD1_DATA32_LANE(v1, 0) | 941 LOAD1_DATA32_LANE(v1, 0) |
| 942 LOAD1_DATA32_LANE(v1, 1) | 942 LOAD1_DATA32_LANE(v1, 1) |
| 943 LOAD1_DATA32_LANE(v1, 2) | 943 LOAD1_DATA32_LANE(v1, 2) |
| 944 LOAD1_DATA32_LANE(v1, 3) | 944 LOAD1_DATA32_LANE(v1, 3) |
| 945 | 945 |
| 946 MEMACCESS(0) | 946 MEMACCESS(0) |
| 947 "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels | 947 "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels |
| 948 "subs %w2, %w2, #8 \n" // 8 processed per loop | 948 "subs %w2, %w2, #8 \n" // 8 processed per loop |
| 949 "b.gt 1b \n" | 949 "b.gt 1b \n" |
// Operand list; only %5 (tmp64) differs between OLD and NEW.
| 950 : "+r"(dst_argb), // %0 | 950 : "+r"(dst_argb), // %0 |
| 951 "+r"(src_argb), // %1 | 951 "+r"(src_argb), // %1 |
| 952 "+r"(dst_width64), // %2 | 952 "+r"(dst_width64), // %2 |
| 953 "+r"(x64), // %3 | 953 "+r"(x64), // %3 |
| 954 "+r"(dx64), // %4 | 954 "+r"(dx64), // %4 |
| 955 "+r"(tmp64), // %5 | 955 "=&r"(tmp64), // %5 |
| 956 "+r"(src_tmp) // %6 | 956 "+r"(src_tmp) // %6 |
| 957 : | 957 : |
| 958 : "memory", "cc", "v0", "v1" | 958 : "memory", "cc", "v0", "v1" |
| 959 ); | 959 ); |
| 960 } | 960 } |
| 961 | 961 |
| 962 #undef LOAD1_DATA32_LANE | 962 #undef LOAD1_DATA32_LANE |
| 963 | 963 |
| 964 // TODO(Yang Zhang): Investigate fewer load instructions for | 964 // TODO(Yang Zhang): Investigate fewer load instructions for |
| 965 // the x/dx stepping | 965 // the x/dx stepping |
| 966 #define LOAD2_DATA32_LANE(vn1, vn2, n) \ | 966 #define LOAD2_DATA32_LANE(vn1, vn2, n) \ |
| (...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1033 } | 1033 } |
| 1034 | 1034 |
| 1035 #undef LOAD2_DATA32_LANE | 1035 #undef LOAD2_DATA32_LANE |
| 1036 | 1036 |
| 1037 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) | 1037 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) |
| 1038 | 1038 |
| 1039 #ifdef __cplusplus | 1039 #ifdef __cplusplus |
| 1040 } // extern "C" | 1040 } // extern "C" |
| 1041 } // namespace libyuv | 1041 } // namespace libyuv |
| 1042 #endif | 1042 #endif |
| OLD | NEW |