| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2014 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2014 The LibYuv Project Authors. All rights reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 569 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 580 | 580 |
| 581 // TODO(Yang Zhang): Investigate fewer load instructions for | 581 // TODO(Yang Zhang): Investigate fewer load instructions for |
| 582 // the x/dx stepping | 582 // the x/dx stepping |
| 583 #define LOAD2_DATA8_LANE(n) \ | 583 #define LOAD2_DATA8_LANE(n) \ |
| 584 "lsr %5, %3, #16 \n" \ | 584 "lsr %5, %3, #16 \n" \ |
| 585 "add %6, %1, %5 \n" \ | 585 "add %6, %1, %5 \n" \ |
| 586 "add %3, %3, %4 \n" \ | 586 "add %3, %3, %4 \n" \ |
| 587 MEMACCESS(6) \ | 587 MEMACCESS(6) \ |
| 588 "ld2 {v4.b, v5.b}["#n"], [%6] \n" | 588 "ld2 {v4.b, v5.b}["#n"], [%6] \n" |
| 589 | 589 |
| 590 // The NEON version mimics this formula (from row_common.cc): |
| 591 // #define BLENDER(a, b, f) (uint8)((int)(a) + \ |
| 592 // ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) |
| 593 |
| 590 void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, | 594 void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, |
| 591 int dst_width, int x, int dx) { | 595 int dst_width, int x, int dx) { |
| 592 int dx_offset[4] = {0, 1, 2, 3}; | 596 int dx_offset[4] = {0, 1, 2, 3}; |
| 593 int* tmp = dx_offset; | 597 int* tmp = dx_offset; |
| 594 const uint8* src_tmp = src_ptr; | 598 const uint8* src_tmp = src_ptr; |
| 595 int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. | 599 int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. |
| 596 int64 x64 = (int64) x; | 600 int64 x64 = (int64) x; |
| 597 int64 dx64 = (int64) dx; | 601 int64 dx64 = (int64) dx; |
| 598 asm volatile ( | 602 asm volatile ( |
| 599 "dup v0.4s, %w3 \n" // x | 603 "dup v0.4s, %w3 \n" // x |
| (...skipping 19 matching lines...) Expand all Loading... |
| 619 "mov v7.16b, v2.16b \n" | 623 "mov v7.16b, v2.16b \n" |
| 620 "uzp1 v6.8h, v6.8h, v7.8h \n" | 624 "uzp1 v6.8h, v6.8h, v7.8h \n" |
| 621 "ushll v4.8h, v4.8b, #0 \n" | 625 "ushll v4.8h, v4.8b, #0 \n" |
| 622 "ushll v5.8h, v5.8b, #0 \n" | 626 "ushll v5.8h, v5.8b, #0 \n" |
| 623 "ssubl v16.4s, v5.4h, v4.4h \n" | 627 "ssubl v16.4s, v5.4h, v4.4h \n" |
| 624 "ssubl2 v17.4s, v5.8h, v4.8h \n" | 628 "ssubl2 v17.4s, v5.8h, v4.8h \n" |
| 625 "ushll v7.4s, v6.4h, #0 \n" | 629 "ushll v7.4s, v6.4h, #0 \n" |
| 626 "ushll2 v6.4s, v6.8h, #0 \n" | 630 "ushll2 v6.4s, v6.8h, #0 \n" |
| 627 "mul v16.4s, v16.4s, v7.4s \n" | 631 "mul v16.4s, v16.4s, v7.4s \n" |
| 628 "mul v17.4s, v17.4s, v6.4s \n" | 632 "mul v17.4s, v17.4s, v6.4s \n" |
| 629 "rshrn v6.4h, v16.4s, #16 \n" | 633 "rshrn v6.4h, v16.4s, #16 \n" |
| 630 "rshrn2 v6.8h, v17.4s, #16 \n" | 634 "rshrn2 v6.8h, v17.4s, #16 \n" |
| 631 "add v4.8h, v4.8h, v6.8h \n" | 635 "add v4.8h, v4.8h, v6.8h \n" |
| 632 "xtn v4.8b, v4.8h \n" | 636 "xtn v4.8b, v4.8h \n" |
| 633 | 637 |
| 634 MEMACCESS(0) | 638 MEMACCESS(0) |
| 635 "st1 {v4.8b}, [%0], #8 \n" // store pixels | 639 "st1 {v4.8b}, [%0], #8 \n" // store pixels |
| 636 "add v1.4s, v1.4s, v0.4s \n" | 640 "add v1.4s, v1.4s, v0.4s \n" |
| 637 "add v2.4s, v2.4s, v0.4s \n" | 641 "add v2.4s, v2.4s, v0.4s \n" |
| 638 "subs %w2, %w2, #8 \n" // 8 processed per loop | 642 "subs %w2, %w2, #8 \n" // 8 processed per loop |
| 639 "b.gt 1b \n" | 643 "b.gt 1b \n" |
| 640 : "+r"(dst_ptr), // %0 | 644 : "+r"(dst_ptr), // %0 |
| (...skipping 392 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1033 } | 1037 } |
| 1034 | 1038 |
| 1035 #undef LOAD2_DATA32_LANE | 1039 #undef LOAD2_DATA32_LANE |
| 1036 | 1040 |
| 1037 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) | 1041 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) |
| 1038 | 1042 |
| 1039 #ifdef __cplusplus | 1043 #ifdef __cplusplus |
| 1040 } // extern "C" | 1044 } // extern "C" |
| 1041 } // namespace libyuv | 1045 } // namespace libyuv |
| 1042 #endif | 1046 #endif |
| OLD | NEW |