OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2014 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2014 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 569 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
580 | 580 |
581 // TODO(Yang Zhang): Investigate using fewer load instructions for | 581 // TODO(Yang Zhang): Investigate using fewer load instructions for |
582 // the x/dx stepping. | 582 // the x/dx stepping. |
// LOAD2_DATA8_LANE(n): inline-asm fragment that fetches one pair of adjacent
// source bytes into lane n of v4/v5 and then advances the position operand.
//   %5 = %3 >> 16   (%3 is x, per the "dup v0.4s, %w3" comment in
//                    ScaleFilterCols_NEON; the shift presumably extracts the
//                    integer part of a 16.16 fixed-point position - confirm)
//   %6 = %1 + %5    (%1 is presumably the source pointer; the operand list
//                    is not visible in this view - confirm)
//   %3 = %3 + %4    (%4 is presumably dx - confirm)
//   ld2 loads the byte at [%6] into lane n of v4 and the byte at [%6 + 1]
//   into lane n of v5.
583 #define LOAD2_DATA8_LANE(n) \ | 583 #define LOAD2_DATA8_LANE(n) \ |
584 "lsr %5, %3, #16 \n" \ | 584 "lsr %5, %3, #16 \n" \ |
585 "add %6, %1, %5 \n" \ | 585 "add %6, %1, %5 \n" \ |
586 "add %3, %3, %4 \n" \ | 586 "add %3, %3, %4 \n" \ |
587 MEMACCESS(6) \ | 587 MEMACCESS(6) \ |
588 "ld2 {v4.b, v5.b}["#n"], [%6] \n" | 588 "ld2 {v4.b, v5.b}["#n"], [%6] \n" |
589 | 589 |
| 590 // The NEON version mimics this formula (from row_common.cc): |
| 591 // #define BLENDER(a, b, f) (uint8)((int)(a) + \ |
| 592 // ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) |
| 593 |
590 void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, | 594 void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, |
591 int dst_width, int x, int dx) { | 595 int dst_width, int x, int dx) { |
592 int dx_offset[4] = {0, 1, 2, 3}; | 596 int dx_offset[4] = {0, 1, 2, 3}; |
593 int* tmp = dx_offset; | 597 int* tmp = dx_offset; |
594 const uint8* src_tmp = src_ptr; | 598 const uint8* src_tmp = src_ptr; |
595 int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. | 599 int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. |
596 int64 x64 = (int64) x; | 600 int64 x64 = (int64) x; |
597 int64 dx64 = (int64) dx; | 601 int64 dx64 = (int64) dx; |
598 asm volatile ( | 602 asm volatile ( |
599 "dup v0.4s, %w3 \n" // x | 603 "dup v0.4s, %w3 \n" // x |
(...skipping 19 matching lines...) Expand all Loading... |
619 "mov v7.16b, v2.16b \n" | 623 "mov v7.16b, v2.16b \n" |
620 "uzp1 v6.8h, v6.8h, v7.8h \n" | 624 "uzp1 v6.8h, v6.8h, v7.8h \n" |
621 "ushll v4.8h, v4.8b, #0 \n" | 625 "ushll v4.8h, v4.8b, #0 \n" |
622 "ushll v5.8h, v5.8b, #0 \n" | 626 "ushll v5.8h, v5.8b, #0 \n" |
623 "ssubl v16.4s, v5.4h, v4.4h \n" | 627 "ssubl v16.4s, v5.4h, v4.4h \n" |
624 "ssubl2 v17.4s, v5.8h, v4.8h \n" | 628 "ssubl2 v17.4s, v5.8h, v4.8h \n" |
625 "ushll v7.4s, v6.4h, #0 \n" | 629 "ushll v7.4s, v6.4h, #0 \n" |
626 "ushll2 v6.4s, v6.8h, #0 \n" | 630 "ushll2 v6.4s, v6.8h, #0 \n" |
627 "mul v16.4s, v16.4s, v7.4s \n" | 631 "mul v16.4s, v16.4s, v7.4s \n" |
628 "mul v17.4s, v17.4s, v6.4s \n" | 632 "mul v17.4s, v17.4s, v6.4s \n" |
629 "rshrn v6.4h, v16.4s, #16 \n" | 633 "rshrn v6.4h, v16.4s, #16 \n" |
630 "rshrn2 v6.8h, v17.4s, #16 \n" | 634 "rshrn2 v6.8h, v17.4s, #16 \n" |
631 "add v4.8h, v4.8h, v6.8h \n" | 635 "add v4.8h, v4.8h, v6.8h \n" |
632 "xtn v4.8b, v4.8h \n" | 636 "xtn v4.8b, v4.8h \n" |
633 | 637 |
634 MEMACCESS(0) | 638 MEMACCESS(0) |
635 "st1 {v4.8b}, [%0], #8 \n" // store pixels | 639 "st1 {v4.8b}, [%0], #8 \n" // store pixels |
636 "add v1.4s, v1.4s, v0.4s \n" | 640 "add v1.4s, v1.4s, v0.4s \n" |
637 "add v2.4s, v2.4s, v0.4s \n" | 641 "add v2.4s, v2.4s, v0.4s \n" |
638 "subs %w2, %w2, #8 \n" // 8 processed per loop | 642 "subs %w2, %w2, #8 \n" // 8 processed per loop |
639 "b.gt 1b \n" | 643 "b.gt 1b \n" |
640 : "+r"(dst_ptr), // %0 | 644 : "+r"(dst_ptr), // %0 |
(...skipping 392 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1033 } | 1037 } |
1034 | 1038 |
1035 #undef LOAD2_DATA32_LANE | 1039 #undef LOAD2_DATA32_LANE |
1036 | 1040 |
1037 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) | 1041 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) |
1038 | 1042 |
1039 #ifdef __cplusplus | 1043 #ifdef __cplusplus |
1040 } // extern "C" | 1044 } // extern "C" |
1041 } // namespace libyuv | 1045 } // namespace libyuv |
1042 #endif | 1046 #endif |
OLD | NEW |