| OLD | NEW | 
|---|
| 1 /* | 1 /* | 
| 2  *  Copyright 2014 The LibYuv Project Authors. All rights reserved. | 2  *  Copyright 2014 The LibYuv Project Authors. All rights reserved. | 
| 3  * | 3  * | 
| 4  *  Use of this source code is governed by a BSD-style license | 4  *  Use of this source code is governed by a BSD-style license | 
| 5  *  that can be found in the LICENSE file in the root of the source | 5  *  that can be found in the LICENSE file in the root of the source | 
| 6  *  tree. An additional intellectual property rights grant can be found | 6  *  tree. An additional intellectual property rights grant can be found | 
| 7  *  in the file PATENTS. All contributing project authors may | 7  *  in the file PATENTS. All contributing project authors may | 
| 8  *  be found in the AUTHORS file in the root of the source tree. | 8  *  be found in the AUTHORS file in the root of the source tree. | 
| 9  */ | 9  */ | 
| 10 | 10 | 
| (...skipping 569 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
| 580 | 580 | 
| 581 // TODO(Yang Zhang): Investigate fewer load instructions for | 581 // TODO(Yang Zhang): Investigate fewer load instructions for | 
| 582 // the x/dx stepping. Each use loads the two bytes at [%1 + (%3 >> 16)] into lane n of v4/v5, then %3 += %4. | 582 // the x/dx stepping. Each use loads the two bytes at [%1 + (%3 >> 16)] into lane n of v4/v5, then %3 += %4. | 
| 583 #define LOAD2_DATA8_LANE(n)                                    \ | 583 #define LOAD2_DATA8_LANE(n)                                    \ | 
| 584     "lsr        %5, %3, #16                    \n"             \ | 584     "lsr        %5, %3, #16                    \n"             \ | 
| 585     "add        %6, %1, %5                    \n"              \ | 585     "add        %6, %1, %5                    \n"              \ | 
| 586     "add        %3, %3, %4                     \n"             \ | 586     "add        %3, %3, %4                     \n"             \ | 
| 587     MEMACCESS(6)                                               \ | 587     MEMACCESS(6)                                               \ | 
| 588     "ld2        {v4.b, v5.b}["#n"], [%6]      \n" | 588     "ld2        {v4.b, v5.b}["#n"], [%6]      \n" | 
| 589 | 589 | 
|  | 590 // The NEON version mimics this formula (from row_common.cc): | 
|  | 591 // #define BLENDER(a, b, f) (uint8)((int)(a) + \ | 
|  | 592 //    ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) | 
|  | 593 | 
| 590 void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, | 594 void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, | 
| 591                           int dst_width, int x, int dx) { | 595                           int dst_width, int x, int dx) { | 
| 592   int dx_offset[4] = {0, 1, 2, 3}; | 596   int dx_offset[4] = {0, 1, 2, 3}; | 
| 593   int* tmp = dx_offset; | 597   int* tmp = dx_offset; | 
| 594   const uint8* src_tmp = src_ptr; | 598   const uint8* src_tmp = src_ptr; | 
| 595   int64 dst_width64 = (int64) dst_width;  // Work around ios 64 bit warning. | 599   int64 dst_width64 = (int64) dst_width;  // Work around ios 64 bit warning. | 
| 596   int64 x64 = (int64) x; | 600   int64 x64 = (int64) x; | 
| 597   int64 dx64 = (int64) dx; | 601   int64 dx64 = (int64) dx; | 
| 598   asm volatile ( | 602   asm volatile ( | 
| 599     "dup        v0.4s, %w3                     \n"  // x | 603     "dup        v0.4s, %w3                     \n"  // x | 
| (...skipping 19 matching lines...) Expand all  Loading... | 
| 619     "mov       v7.16b, v2.16b                  \n" | 623     "mov       v7.16b, v2.16b                  \n" | 
| 620     "uzp1      v6.8h, v6.8h, v7.8h             \n" | 624     "uzp1      v6.8h, v6.8h, v7.8h             \n" | 
| 621     "ushll     v4.8h, v4.8b, #0                \n" | 625     "ushll     v4.8h, v4.8b, #0                \n" | 
| 622     "ushll     v5.8h, v5.8b, #0                \n" | 626     "ushll     v5.8h, v5.8b, #0                \n" | 
| 623     "ssubl     v16.4s, v5.4h, v4.4h            \n" | 627     "ssubl     v16.4s, v5.4h, v4.4h            \n" | 
| 624     "ssubl2    v17.4s, v5.8h, v4.8h            \n" | 628     "ssubl2    v17.4s, v5.8h, v4.8h            \n" | 
| 625     "ushll     v7.4s, v6.4h, #0                \n" | 629     "ushll     v7.4s, v6.4h, #0                \n" | 
| 626     "ushll2    v6.4s, v6.8h, #0                \n" | 630     "ushll2    v6.4s, v6.8h, #0                \n" | 
| 627     "mul       v16.4s, v16.4s, v7.4s           \n" | 631     "mul       v16.4s, v16.4s, v7.4s           \n" | 
| 628     "mul       v17.4s, v17.4s, v6.4s           \n" | 632     "mul       v17.4s, v17.4s, v6.4s           \n" | 
| 629     "rshrn      v6.4h, v16.4s, #16             \n" | 633     "rshrn     v6.4h, v16.4s, #16              \n" | 
| 630     "rshrn2     v6.8h, v17.4s, #16             \n" | 634     "rshrn2    v6.8h, v17.4s, #16              \n" | 
| 631     "add       v4.8h, v4.8h, v6.8h             \n" | 635     "add       v4.8h, v4.8h, v6.8h             \n" | 
| 632     "xtn       v4.8b, v4.8h                    \n" | 636     "xtn       v4.8b, v4.8h                    \n" | 
| 633 | 637 | 
| 634     MEMACCESS(0) | 638     MEMACCESS(0) | 
| 635     "st1       {v4.8b}, [%0], #8               \n"  // store pixels | 639     "st1       {v4.8b}, [%0], #8               \n"  // store pixels | 
| 636     "add       v1.4s, v1.4s, v0.4s             \n" | 640     "add       v1.4s, v1.4s, v0.4s             \n" | 
| 637     "add       v2.4s, v2.4s, v0.4s             \n" | 641     "add       v2.4s, v2.4s, v0.4s             \n" | 
| 638     "subs      %w2, %w2, #8                    \n"  // 8 processed per loop | 642     "subs      %w2, %w2, #8                    \n"  // 8 processed per loop | 
| 639     "b.gt      1b                              \n" | 643     "b.gt      1b                              \n" | 
| 640   : "+r"(dst_ptr),          // %0 | 644   : "+r"(dst_ptr),          // %0 | 
| (...skipping 392 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
| 1033 } | 1037 } | 
| 1034 | 1038 | 
| 1035 #undef LOAD2_DATA32_LANE | 1039 #undef LOAD2_DATA32_LANE | 
| 1036 | 1040 | 
| 1037 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) | 1041 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) | 
| 1038 | 1042 | 
| 1039 #ifdef __cplusplus | 1043 #ifdef __cplusplus | 
| 1040 }  // extern "C" | 1044 }  // extern "C" | 
| 1041 }  // namespace libyuv | 1045 }  // namespace libyuv | 
| 1042 #endif | 1046 #endif | 
| OLD | NEW | 
|---|