OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2014 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2014 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 529 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
540 "+r"(dst_width) // %3 | 540 "+r"(dst_width) // %3 |
541 : "r"(&kMult38_Div6), // %4 | 541 : "r"(&kMult38_Div6), // %4 |
542 "r"(&kShuf38_2) // %5 | 542 "r"(&kShuf38_2) // %5 |
543 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", | 543 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", |
544 "v18", "v19", "v30", "v31", "memory", "cc" | 544 "v18", "v19", "v30", "v31", "memory", "cc" |
545 ); | 545 ); |
546 } | 546 } |
547 | 547 |
// ScaleAddRows_NEON -- AArch64 NEON inline assembly, shown here as a
// side-by-side diff (OLD column | NEW column).
//
// What the visible instructions do:
//   * Outer loop "1:" handles one group of 16 source bytes per pass. It copies
//     src_ptr (%1) into the scratch pointer src_tmp (%0), copies src_height
//     (%w5) into the counter w12, and zeroes the accumulators v2 and v3.
//   * Inner loop "2:" loads 16 bytes from [%0] and post-increments %0 by
//     src_stride (%3), adds the low 8 bytes into the 16-bit lanes of v2 (uaddw)
//     and the high 8 bytes into the 16-bit lanes of v3 (uaddw2), then
//     decrements w12 and repeats while it is > 0.
//   * After the inner loop, v2/v3 (32 bytes = sixteen 16-bit sums) are stored
//     to dst_ptr (%2), which is post-incremented by 32; src_ptr is advanced by
//     16 and src_width (%w4) is reduced by 16, repeating while it is > 0.
// Both loop conditions are evaluated after the loop body, so each body runs at
// least once whatever the incoming src_height / src_width are.
//
// The only differences between the two columns are on src_tmp: the NEW column
// drops the NULL initializer and changes the operand constraint from "+r"
// (read-write) to "=&r" (write-only, early-clobber). The first instruction
// that touches %0 is "mov %0, %1", which writes it, so the incoming value of
// src_tmp is never read by the asm.
//
// NOTE(review): src_stride (%3) and src_height (%5) are declared "+r" although
// the visible instructions only read them, and "v1" is listed as clobbered
// although no visible instruction references it.
548 void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, | 548 void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, |
549 uint16* dst_ptr, int src_width, int src_height) { | 549 uint16* dst_ptr, int src_width, int src_height) { |
// Scratch row pointer; the asm reloads it from src_ptr at the top of every
// outer-loop pass.
550 const uint8* src_tmp = NULL; | 550 const uint8* src_tmp; |
551 asm volatile ( | 551 asm volatile ( |
// Outer loop: start of one 16-byte-wide group.
552 "1: \n" | 552 "1: \n" |
553 "mov %0, %1 \n" | 553 "mov %0, %1 \n" |
554 "mov w12, %w5 \n" | 554 "mov w12, %w5 \n" |
// Clear the two 8x16-bit accumulators.
555 "eor v2.16b, v2.16b, v2.16b \n" | 555 "eor v2.16b, v2.16b, v2.16b \n" |
556 "eor v3.16b, v3.16b, v3.16b \n" | 556 "eor v3.16b, v3.16b, v3.16b \n" |
// Inner loop: one row per pass (the load below targets v0).
557 "2: \n" | 557 "2: \n" |
558 // load 16 pixels into q0 | 558 // load 16 pixels into q0 |
559 MEMACCESS(0) | 559 MEMACCESS(0) |
560 "ld1 {v0.16b}, [%0], %3 \n" | 560 "ld1 {v0.16b}, [%0], %3 \n" |
561 "uaddw2 v3.8h, v3.8h, v0.16b \n" | 561 "uaddw2 v3.8h, v3.8h, v0.16b \n" |
562 "uaddw v2.8h, v2.8h, v0.8b \n" | 562 "uaddw v2.8h, v2.8h, v0.8b \n" |
563 "subs w12, w12, #1 \n" | 563 "subs w12, w12, #1 \n" |
564 "b.gt 2b \n" | 564 "b.gt 2b \n" |
565 MEMACCESS(2) | 565 MEMACCESS(2) |
566 "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels | 566 "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels |
567 "add %1, %1, #16 \n" | 567 "add %1, %1, #16 \n" |
568 "subs %w4, %w4, #16 \n" // 16 processed per loop | 568 "subs %w4, %w4, #16 \n" // 16 processed per loop |
569 "b.gt 1b \n" | 569 "b.gt 1b \n" |
// Output operands. %0 is the operand this diff changes ("+r" -> "=&r").
570 : "+r"(src_tmp), // %0 | 570 : "=&r"(src_tmp), // %0 |
571 "+r"(src_ptr), // %1 | 571 "+r"(src_ptr), // %1 |
572 "+r"(dst_ptr), // %2 | 572 "+r"(dst_ptr), // %2 |
573 "+r"(src_stride), // %3 | 573 "+r"(src_stride), // %3 |
574 "+r"(src_width), // %4 | 574 "+r"(src_width), // %4 |
575 "+r"(src_height) // %5 | 575 "+r"(src_height) // %5 |
// No input-only operands.
576 : | 576 : |
577 : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List | 577 : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List |
578 ); | 578 ); |
579 } | 579 } |
580 | 580 |
581 // TODO(Yang Zhang): Investigate less load instructions for | 581 // TODO(Yang Zhang): Investigate less load instructions for |
582 // the x/dx stepping | 582 // the x/dx stepping |
583 #define LOAD2_DATA8_LANE(n) \ | 583 #define LOAD2_DATA8_LANE(n) \ |
584 "lsr %5, %3, #16 \n" \ | 584 "lsr %5, %3, #16 \n" \ |
585 "add %6, %1, %5 \n" \ | 585 "add %6, %1, %5 \n" \ |
(...skipping 338 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
924 "add %3, %3, %4 \n" \ | 924 "add %3, %3, %4 \n" \ |
925 MEMACCESS(6) \ | 925 MEMACCESS(6) \ |
926 "ld1 {"#vn".s}["#n"], [%6] \n" | 926 "ld1 {"#vn".s}["#n"], [%6] \n" |
927 | 927 |
// ScaleARGBCols_NEON -- AArch64 NEON inline assembly, shown here as a
// side-by-side diff (OLD column | NEW column).
//
// What the visible code does:
//   * dst_width, x and dx are widened to int64 locals before being bound as
//     asm operands (%2, %3, %4).
//   * Each pass of loop "1:" invokes LOAD1_DATA32_LANE eight times, filling
//     lanes 0..3 of v0 and then lanes 0..3 of v1, stores v0/v1 (32 bytes) to
//     dst_argb (%0) with a post-increment of 32, subtracts 8 from the width
//     counter (%w2) and repeats while it is > 0. The condition is tested after
//     the body, so the body runs at least once.
//   * Only the tail of a lane-load macro is visible just above this function
//     (presumably LOAD1_DATA32_LANE): it adds %4 to %3 and loads one 32-bit
//     lane from the address in %6. How %5 and %6 are computed is on lines
//     outside this view.
//
// The only differences between the two columns are on tmp64: the NEW column
// drops the "= 0" initializer and changes the operand constraint from "+r"
// (read-write) to "=&r" (write-only, early-clobber).
// NOTE(review): that change is only valid if the macro writes %5 before
// reading it -- confirm against the full LOAD1_DATA32_LANE definition.
928 void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, | 928 void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, |
929 int dst_width, int x, int dx) { | 929 int dst_width, int x, int dx) { |
// Bound as operand %6, the address register of the lane load.
930 const uint8* src_tmp = src_argb; | 930 const uint8* src_tmp = src_argb; |
931 int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. | 931 int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. |
932 int64 x64 = (int64) x; | 932 int64 x64 = (int64) x; |
933 int64 dx64 = (int64) dx; | 933 int64 dx64 = (int64) dx; |
// Scratch operand %5; this is the declaration the diff changes.
934 int64 tmp64 = 0; | 934 int64 tmp64; |
935 asm volatile ( | 935 asm volatile ( |
936 "1: \n" | 936 "1: \n" |
// Eight single-lane loads: four into v0, then four into v1.
937 LOAD1_DATA32_LANE(v0, 0) | 937 LOAD1_DATA32_LANE(v0, 0) |
938 LOAD1_DATA32_LANE(v0, 1) | 938 LOAD1_DATA32_LANE(v0, 1) |
939 LOAD1_DATA32_LANE(v0, 2) | 939 LOAD1_DATA32_LANE(v0, 2) |
940 LOAD1_DATA32_LANE(v0, 3) | 940 LOAD1_DATA32_LANE(v0, 3) |
941 LOAD1_DATA32_LANE(v1, 0) | 941 LOAD1_DATA32_LANE(v1, 0) |
942 LOAD1_DATA32_LANE(v1, 1) | 942 LOAD1_DATA32_LANE(v1, 1) |
943 LOAD1_DATA32_LANE(v1, 2) | 943 LOAD1_DATA32_LANE(v1, 2) |
944 LOAD1_DATA32_LANE(v1, 3) | 944 LOAD1_DATA32_LANE(v1, 3) |
945 | 945 |
946 MEMACCESS(0) | 946 MEMACCESS(0) |
947 "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels | 947 "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels |
948 "subs %w2, %w2, #8 \n" // 8 processed per loop | 948 "subs %w2, %w2, #8 \n" // 8 processed per loop |
949 "b.gt 1b \n" | 949 "b.gt 1b \n" |
// Output operands. %5 is the operand this diff changes ("+r" -> "=&r").
950 : "+r"(dst_argb), // %0 | 950 : "+r"(dst_argb), // %0 |
951 "+r"(src_argb), // %1 | 951 "+r"(src_argb), // %1 |
952 "+r"(dst_width64), // %2 | 952 "+r"(dst_width64), // %2 |
953 "+r"(x64), // %3 | 953 "+r"(x64), // %3 |
954 "+r"(dx64), // %4 | 954 "+r"(dx64), // %4 |
955 "+r"(tmp64), // %5 | 955 "=&r"(tmp64), // %5 |
956 "+r"(src_tmp) // %6 | 956 "+r"(src_tmp) // %6 |
// No input-only operands.
957 : | 957 : |
958 : "memory", "cc", "v0", "v1" | 958 : "memory", "cc", "v0", "v1" |
959 ); | 959 ); |
960 } | 960 } |
961 | 961 |
962 #undef LOAD1_DATA32_LANE | 962 #undef LOAD1_DATA32_LANE |
963 | 963 |
964 // TODO(Yang Zhang): Investigate less load instructions for | 964 // TODO(Yang Zhang): Investigate less load instructions for |
965 // the x/dx stepping | 965 // the x/dx stepping |
966 #define LOAD2_DATA32_LANE(vn1, vn2, n) \ | 966 #define LOAD2_DATA32_LANE(vn1, vn2, n) \ |
(...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1033 } | 1033 } |
1034 | 1034 |
1035 #undef LOAD2_DATA32_LANE | 1035 #undef LOAD2_DATA32_LANE |
1036 | 1036 |
1037 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) | 1037 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) |
1038 | 1038 |
1039 #ifdef __cplusplus | 1039 #ifdef __cplusplus |
1040 } // extern "C" | 1040 } // extern "C" |
1041 } // namespace libyuv | 1041 } // namespace libyuv |
1042 #endif | 1042 #endif |
OLD | NEW |