| OLD | NEW | 
|    1 /* |    1 /* | 
|    2  *  Copyright 2014 The LibYuv Project Authors. All rights reserved. |    2  *  Copyright 2014 The LibYuv Project Authors. All rights reserved. | 
|    3  * |    3  * | 
|    4  *  Use of this source code is governed by a BSD-style license |    4  *  Use of this source code is governed by a BSD-style license | 
|    5  *  that can be found in the LICENSE file in the root of the source |    5  *  that can be found in the LICENSE file in the root of the source | 
|    6  *  tree. An additional intellectual property rights grant can be found |    6  *  tree. An additional intellectual property rights grant can be found | 
|    7  *  in the file PATENTS. All contributing project authors may |    7  *  in the file PATENTS. All contributing project authors may | 
|    8  *  be found in the AUTHORS file in the root of the source tree. |    8  *  be found in the AUTHORS file in the root of the source tree. | 
|    9  */ |    9  */ | 
|   10  |   10  | 
 |   11 #include "libyuv/scale.h" | 
|   11 #include "libyuv/row.h" |   12 #include "libyuv/row.h" | 
 |   13 #include "libyuv/scale_row.h" | 
|   12  |   14  | 
|   13 #ifdef __cplusplus |   15 #ifdef __cplusplus | 
|   14 namespace libyuv { |   16 namespace libyuv { | 
|   15 extern "C" { |   17 extern "C" { | 
|   16 #endif |   18 #endif | 
|   17  |   19  | 
|   18 // This module is for GCC Neon. |   20 // This module is for GCC Neon armv8 64 bit. | 
|   19 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) |   21 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) | 
|   20 #ifdef HAS_SCALEROWDOWN2_NEON |   22  | 
|   21 // Read 32x1 throw away even pixels, and write 16x1. |   23 // Read 32x1 throw away even pixels, and write 16x1. | 
|   22 void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, |   24 void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, | 
|   23                         uint8* dst, int dst_width) { |   25                         uint8* dst, int dst_width) { | 
|   24   asm volatile ( |   26   asm volatile ( | 
|   25     ".p2align   2                              \n" |  | 
|   26   "1:                                          \n" |   27   "1:                                          \n" | 
|   27     // load even pixels into q0, odd into q1 |   28     // load even pixels into v0, odd into v1 | 
|   28     MEMACCESS(0) |   29     MEMACCESS(0) | 
|   29     "vld2.8     {q0, q1}, [%0]!                \n" |   30     "ld2        {v0.16b,v1.16b}, [%0], #32    \n" | 
|   30     "subs       %2, %2, #16                    \n"  // 16 processed per loop |   31     "subs       %2, %2, #16                    \n"  // 16 processed per loop | 
|   31     MEMACCESS(1) |   32     MEMACCESS(1) | 
|   32     "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels |   33     "st1        {v1.16b}, [%1], #16            \n"  // store odd pixels | 
|   33     "bgt        1b                             \n" |   34     "b.gt       1b                             \n" | 
|   34   : "+r"(src_ptr),          // %0 |   35   : "+r"(src_ptr),          // %0 | 
|   35     "+r"(dst),              // %1 |   36     "+r"(dst),              // %1 | 
|   36     "+r"(dst_width)         // %2 |   37     "+r"(dst_width)         // %2 | 
|   37   : |   38   : | 
|   38   : "q0", "q1"              // Clobber List |   39   : "v0", "v1"              // Clobber List | 
|   39   ); |   40   ); | 
|   40 } |   41 } | 
|   41 #endif //HAS_SCALEROWDOWN2_NEON |  | 
|   42  |   42  | 
|   43 #ifdef HAS_SCALEROWDOWN2_NEON |  | 
|   44 // Read 32x2 average down and write 16x1. |   43 // Read 32x2 average down and write 16x1. | 
|   45 void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, |   44 void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, | 
|   46                            uint8* dst, int dst_width) { |   45                            uint8* dst, int dst_width) { | 
|   47   asm volatile ( |   46   asm volatile ( | 
|   48     // change the stride to row 2 pointer |   47     // change the stride to row 2 pointer | 
|   49     "add        %1, %0                         \n" |   48     "add        %1, %1, %0                     \n" | 
|   50     ".p2align   2                              \n" |  | 
|   51   "1:                                          \n" |   49   "1:                                          \n" | 
|   52     MEMACCESS(0) |   50     MEMACCESS(0) | 
|   53     "vld1.8     {q0, q1}, [%0]!                \n"  // load row 1 and post inc |   51     "ld1        {v0.16b,v1.16b}, [%0], #32    \n"  // load row 1 and post inc | 
|   54     MEMACCESS(1) |   52     MEMACCESS(1) | 
|   55     "vld1.8     {q2, q3}, [%1]!                \n"  // load row 2 and post inc |   53     "ld1        {v2.16b, v3.16b}, [%1], #32    \n"  // load row 2 and post inc | 
|   56     "subs       %3, %3, #16                    \n"  // 16 processed per loop |   54     "subs       %3, %3, #16                    \n"  // 16 processed per loop | 
|   57     "vpaddl.u8  q0, q0                         \n"  // row 1 add adjacent |   55     "uaddlp     v0.8h, v0.16b                  \n"  // row 1 add adjacent | 
|   58     "vpaddl.u8  q1, q1                         \n" |   56     "uaddlp     v1.8h, v1.16b                  \n" | 
|   59     "vpadal.u8  q0, q2                         \n"  // row 2 add adjacent + row1 |   57     "uadalp     v0.8h, v2.16b                  \n"  // row 2 add adjacent + row1 | 
|   60     "vpadal.u8  q1, q3                         \n" |   58     "uadalp     v1.8h, v3.16b                  \n" | 
|   61     "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack |   59     "rshrn      v0.8b, v0.8h, #2               \n"  // downshift, round and pack | 
|   62     "vrshrn.u16 d1, q1, #2                     \n" |   60     "rshrn2     v0.16b, v1.8h, #2              \n" | 
|   63     MEMACCESS(2) |   61     MEMACCESS(2) | 
|   64     "vst1.8     {q0}, [%2]!                    \n" |   62     "st1        {v0.16b}, [%2], #16            \n" | 
|   65     "bgt        1b                             \n" |   63     "b.gt       1b                             \n" | 
|   66   : "+r"(src_ptr),          // %0 |   64   : "+r"(src_ptr),          // %0 | 
|   67     "+r"(src_stride),       // %1 |   65     "+r"(src_stride),       // %1 | 
|   68     "+r"(dst),              // %2 |   66     "+r"(dst),              // %2 | 
|   69     "+r"(dst_width)         // %3 |   67     "+r"(dst_width)         // %3 | 
|   70   : |   68   : | 
|   71   : "q0", "q1", "q2", "q3"     // Clobber List |   69   : "v0", "v1", "v2", "v3"     // Clobber List | 
|   72   ); |   70   ); | 
|   73 } |   71 } | 
|   74 #endif //HAS_SCALEROWDOWN2_NEON |  | 
|   75  |   72  | 
|   76 #ifdef HAS_SCALEROWDOWN4_NEON |  | 
|   77 void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, |   73 void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, | 
|   78                         uint8* dst_ptr, int dst_width) { |   74                         uint8* dst_ptr, int dst_width) { | 
|   79   asm volatile ( |   75   asm volatile ( | 
|   80     ".p2align   2                              \n" |  | 
|   81   "1:                                          \n" |   76   "1:                                          \n" | 
|   82     MEMACCESS(0) |   77     MEMACCESS(0) | 
|   83     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n" // src line 0 |   78     "ld4     {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32          \n"  // src line 0 | 
|   84     "subs       %2, %2, #8                     \n" // 8 processed per loop |   79     "subs       %2, %2, #8                     \n"  // 8 processed per loop | 
|   85     MEMACCESS(1) |   80     MEMACCESS(1) | 
|   86     "vst1.8     {d2}, [%1]!                    \n" |   81     "st1     {v2.8b}, [%1], #8                 \n" | 
|   87     "bgt        1b                             \n" |   82     "b.gt       1b                             \n" | 
|   88   : "+r"(src_ptr),          // %0 |   83   : "+r"(src_ptr),          // %0 | 
|   89     "+r"(dst_ptr),          // %1 |   84     "+r"(dst_ptr),          // %1 | 
|   90     "+r"(dst_width)         // %2 |   85     "+r"(dst_width)         // %2 | 
|   91   : |   86   : | 
|   92   : "q0", "q1", "memory", "cc" |   87   : "v0", "v1", "v2", "v3", "memory", "cc" | 
|   93   ); |   88   ); | 
|   94 } |   89 } | 
|   95 #endif //HAS_SCALEROWDOWN4_NEON |  | 
|   96  |   90  | 
|   97 #ifdef HAS_SCALEROWDOWN4_NEON |  | 
|   98 void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, |   91 void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, | 
|   99                            uint8* dst_ptr, int dst_width) { |   92                            uint8* dst_ptr, int dst_width) { | 
|  100   const uint8* src_ptr1 = src_ptr + src_stride; |   93   const uint8* src_ptr1 = src_ptr + src_stride; | 
|  101   const uint8* src_ptr2 = src_ptr + src_stride * 2; |   94   const uint8* src_ptr2 = src_ptr + src_stride * 2; | 
|  102   const uint8* src_ptr3 = src_ptr + src_stride * 3; |   95   const uint8* src_ptr3 = src_ptr + src_stride * 3; | 
|  103 asm volatile ( |   96 asm volatile ( | 
|  104     ".p2align   2                              \n" |  | 
|  105   "1:                                          \n" |   97   "1:                                          \n" | 
|  106     MEMACCESS(0) |   98     MEMACCESS(0) | 
|  107     "vld1.8     {q0}, [%0]!                    \n"   // load up 16x4 |   99     "ld1     {v0.16b}, [%0], #16               \n"   // load up 16x4 | 
|  108     MEMACCESS(3) |  100     MEMACCESS(3) | 
|  109     "vld1.8     {q1}, [%3]!                    \n" |  101     "ld1     {v1.16b}, [%2], #16               \n" | 
|  110     MEMACCESS(4) |  102     MEMACCESS(4) | 
|  111     "vld1.8     {q2}, [%4]!                    \n" |  103     "ld1     {v2.16b}, [%3], #16               \n" | 
|  112     MEMACCESS(5) |  104     MEMACCESS(5) | 
|  113     "vld1.8     {q3}, [%5]!                    \n" |  105     "ld1     {v3.16b}, [%4], #16               \n" | 
|  114     "subs       %2, %2, #4                     \n" |  106     "subs    %5, %5, #4                        \n" | 
|  115     "vpaddl.u8  q0, q0                         \n" |  107     "uaddlp  v0.8h, v0.16b                     \n" | 
|  116     "vpadal.u8  q0, q1                         \n" |  108     "uadalp  v0.8h, v1.16b                     \n" | 
|  117     "vpadal.u8  q0, q2                         \n" |  109     "uadalp  v0.8h, v2.16b                     \n" | 
|  118     "vpadal.u8  q0, q3                         \n" |  110     "uadalp  v0.8h, v3.16b                     \n" | 
|  119     "vpaddl.u16 q0, q0                         \n" |  111     "addp    v0.8h, v0.8h, v0.8h               \n" | 
|  120     "vrshrn.u32 d0, q0, #4                     \n"   // divide by 16 w/rounding |  112     "rshrn   v0.8b, v0.8h, #4                  \n"   // divide by 16 w/rounding | 
|  121     "vmovn.u16  d0, q0                         \n" |  | 
|  122     MEMACCESS(1) |  113     MEMACCESS(1) | 
|  123     "vst1.32    {d0[0]}, [%1]!                 \n" |  114     "st1    {v0.s}[0], [%1], #4                \n" | 
|  124     "bgt        1b                             \n" |  115     "b.gt       1b                             \n" | 
|  125   : "+r"(src_ptr),   // %0 |  116   : "+r"(src_ptr),   // %0 | 
|  126     "+r"(dst_ptr),   // %1 |  117     "+r"(dst_ptr),   // %1 | 
|  127     "+r"(dst_width), // %2 |  118     "+r"(src_ptr1),  // %2 | 
|  128     "+r"(src_ptr1),  // %3 |  119     "+r"(src_ptr2),  // %3 | 
|  129     "+r"(src_ptr2),  // %4 |  120     "+r"(src_ptr3),  // %4 | 
|  130     "+r"(src_ptr3)   // %5 |  121     "+r"(dst_width)  // %5 | 
|  131   : |  122   : | 
|  132   : "q0", "q1", "q2", "q3", "memory", "cc" |  123   : "v0", "v1", "v2", "v3", "memory", "cc" | 
|  133   ); |  124   ); | 
|  134 } |  125 } | 
|  135 #endif //HAS_SCALEROWDOWN4_NEON |  | 
|  136  |  126  | 
|  137 #ifdef HAS_SCALEROWDOWN34_NEON |  | 
|  138 // Down scale from 4 to 3 pixels. Use the neon multilane read/write |  127 // Down scale from 4 to 3 pixels. Use the neon multilane read/write | 
|  139 // to load up every 4th pixel into 4 different registers. |  128 // to load up every 4th pixel into 4 different registers. | 
|  140 // Point samples 32 pixels to 24 pixels. |  129 // Point samples 32 pixels to 24 pixels. | 
|  141 void ScaleRowDown34_NEON(const uint8* src_ptr, |  130 void ScaleRowDown34_NEON(const uint8* src_ptr, | 
|  142                          ptrdiff_t src_stride, |  131                          ptrdiff_t src_stride, | 
|  143                          uint8* dst_ptr, int dst_width) { |  132                          uint8* dst_ptr, int dst_width) { | 
|  144   asm volatile ( |  133   asm volatile ( | 
|  145     ".p2align   2                              \n" |  134   "1:                                                  \n" | 
|  146   "1:                                          \n" |  | 
|  147     MEMACCESS(0) |  135     MEMACCESS(0) | 
|  148     "vld4.8     {d0, d1, d2, d3}, [%0]!      \n" // src line 0 |  136     "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"  // src line 0 | 
|  149     "subs       %2, %2, #24                  \n" |  137     "subs      %2, %2, #24                             \n" | 
|  150     "vmov       d2, d3                       \n" // order d0, d1, d2 |  138     "orr       v2.16b, v3.16b, v3.16b                  \n"  // order v0, v1, v2 | 
|  151     MEMACCESS(1) |  139     MEMACCESS(1) | 
|  152     "vst3.8     {d0, d1, d2}, [%1]!          \n" |  140     "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24                \n" | 
|  153     "bgt        1b                           \n" |  141     "b.gt      1b                                      \n" | 
|  154   : "+r"(src_ptr),          // %0 |  142   : "+r"(src_ptr),          // %0 | 
|  155     "+r"(dst_ptr),          // %1 |  143     "+r"(dst_ptr),          // %1 | 
|  156     "+r"(dst_width)         // %2 |  144     "+r"(dst_width)         // %2 | 
|  157   : |  145   : | 
|  158   : "d0", "d1", "d2", "d3", "memory", "cc" |  146   : "v0", "v1", "v2", "v3", "memory", "cc" | 
|  159   ); |  147   ); | 
|  160 } |  148 } | 
|  161 #endif //HAS_SCALEROWDOWN34_NEON |  | 
|  162  |  149  | 
|  163 #ifdef HAS_SCALEROWDOWN34_NEON |  | 
|  164 void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, |  150 void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, | 
|  165                                ptrdiff_t src_stride, |  151                                ptrdiff_t src_stride, | 
|  166                                uint8* dst_ptr, int dst_width) { |  152                                uint8* dst_ptr, int dst_width) { | 
|  167   asm volatile ( |  153   asm volatile ( | 
|  168     "vmov.u8    d24, #3                        \n" |  154     "movi      v20.8b, #3                              \n" | 
|  169     "add        %3, %0                         \n" |  155     "add       %3, %3, %0                              \n" | 
|  170     ".p2align   2                              \n" |  156   "1:                                                  \n" | 
|  171   "1:                                          \n" |  | 
|  172     MEMACCESS(0) |  157     MEMACCESS(0) | 
|  173     "vld4.8       {d0, d1, d2, d3}, [%0]!      \n" // src line 0 |  158     "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"  // src line 0 | 
|  174     MEMACCESS(3) |  159     MEMACCESS(3) | 
|  175     "vld4.8       {d4, d5, d6, d7}, [%3]!      \n" // src line 1 |  160     "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32                \n"  // src line 1 | 
|  176     "subs         %2, %2, #24                  \n" |  161     "subs         %2, %2, #24                          \n" | 
|  177  |  162  | 
|  178     // filter src line 0 with src line 1 |  163     // filter src line 0 with src line 1 | 
|  179     // expand chars to shorts to allow for room |  164     // expand chars to shorts to allow for room | 
|  180     // when adding lines together |  165     // when adding lines together | 
|  181     "vmovl.u8     q8, d4                       \n" |  166     "ushll     v16.8h, v4.8b, #0                       \n" | 
|  182     "vmovl.u8     q9, d5                       \n" |  167     "ushll     v17.8h, v5.8b, #0                       \n" | 
|  183     "vmovl.u8     q10, d6                      \n" |  168     "ushll     v18.8h, v6.8b, #0                       \n" | 
|  184     "vmovl.u8     q11, d7                      \n" |  169     "ushll     v19.8h, v7.8b, #0                       \n" | 
|  185  |  170  | 
|  186     // 3 * line_0 + line_1 |  171     // 3 * line_0 + line_1 | 
|  187     "vmlal.u8     q8, d0, d24                  \n" |  172     "umlal     v16.8h, v0.8b, v20.8b                   \n" | 
|  188     "vmlal.u8     q9, d1, d24                  \n" |  173     "umlal     v17.8h, v1.8b, v20.8b                   \n" | 
|  189     "vmlal.u8     q10, d2, d24                 \n" |  174     "umlal     v18.8h, v2.8b, v20.8b                   \n" | 
|  190     "vmlal.u8     q11, d3, d24                 \n" |  175     "umlal     v19.8h, v3.8b, v20.8b                   \n" | 
|  191  |  176  | 
|  192     // (3 * line_0 + line_1) >> 2 |  177     // (3 * line_0 + line_1) >> 2 | 
|  193     "vqrshrn.u16  d0, q8, #2                   \n" |  178     "uqrshrn   v0.8b, v16.8h, #2                       \n" | 
|  194     "vqrshrn.u16  d1, q9, #2                   \n" |  179     "uqrshrn   v1.8b, v17.8h, #2                       \n" | 
|  195     "vqrshrn.u16  d2, q10, #2                  \n" |  180     "uqrshrn   v2.8b, v18.8h, #2                       \n" | 
|  196     "vqrshrn.u16  d3, q11, #2                  \n" |  181     "uqrshrn   v3.8b, v19.8h, #2                       \n" | 
|  197  |  182  | 
|  198     // a0 = (src[0] * 3 + s[1] * 1) >> 2 |  183     // a0 = (src[0] * 3 + s[1] * 1) >> 2 | 
|  199     "vmovl.u8     q8, d1                       \n" |  184     "ushll     v16.8h, v1.8b, #0                       \n" | 
|  200     "vmlal.u8     q8, d0, d24                  \n" |  185     "umlal     v16.8h, v0.8b, v20.8b                   \n" | 
|  201     "vqrshrn.u16  d0, q8, #2                   \n" |  186     "uqrshrn   v0.8b, v16.8h, #2                       \n" | 
|  202  |  187  | 
|  203     // a1 = (src[1] * 1 + s[2] * 1) >> 1 |  188     // a1 = (src[1] * 1 + s[2] * 1) >> 1 | 
|  204     "vrhadd.u8    d1, d1, d2                   \n" |  189     "urhadd    v1.8b, v1.8b, v2.8b                     \n" | 
|  205  |  190  | 
|  206     // a2 = (src[2] * 1 + s[3] * 3) >> 2 |  191     // a2 = (src[2] * 1 + s[3] * 3) >> 2 | 
|  207     "vmovl.u8     q8, d2                       \n" |  192     "ushll     v16.8h, v2.8b, #0                       \n" | 
|  208     "vmlal.u8     q8, d3, d24                  \n" |  193     "umlal     v16.8h, v3.8b, v20.8b                   \n" | 
|  209     "vqrshrn.u16  d2, q8, #2                   \n" |  194     "uqrshrn   v2.8b, v16.8h, #2                       \n" | 
|  210  |  195  | 
|  211     MEMACCESS(1) |  196     MEMACCESS(1) | 
|  212     "vst3.8       {d0, d1, d2}, [%1]!          \n" |  197     "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24                \n" | 
|  213  |  198  | 
|  214     "bgt          1b                           \n" |  199     "b.gt      1b                                      \n" | 
|  215   : "+r"(src_ptr),          // %0 |  200   : "+r"(src_ptr),          // %0 | 
|  216     "+r"(dst_ptr),          // %1 |  201     "+r"(dst_ptr),          // %1 | 
|  217     "+r"(dst_width),        // %2 |  202     "+r"(dst_width),        // %2 | 
|  218     "+r"(src_stride)        // %3 |  203     "+r"(src_stride)        // %3 | 
|  219   : |  204   : | 
|  220   : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc" |  205   : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", | 
 |  206     "v20", "memory", "cc" | 
|  221   ); |  207   ); | 
|  222 } |  208 } | 
|  223 #endif //ScaleRowDown34_0_Box_NEON |  | 
|  224  |  209  | 
|  225 #ifdef HAS_SCALEROWDOWN34_NEON |  | 
|  226 void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, |  210 void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, | 
|  227                                ptrdiff_t src_stride, |  211                                ptrdiff_t src_stride, | 
|  228                                uint8* dst_ptr, int dst_width) { |  212                                uint8* dst_ptr, int dst_width) { | 
|  229   asm volatile ( |  213   asm volatile ( | 
|  230     "vmov.u8    d24, #3                        \n" |  214     "movi      v20.8b, #3                              \n" | 
|  231     "add        %3, %0                         \n" |  215     "add       %3, %3, %0                              \n" | 
|  232     ".p2align   2                              \n" |  216   "1:                                                  \n" | 
|  233   "1:                                          \n" |  | 
|  234     MEMACCESS(0) |  217     MEMACCESS(0) | 
|  235     "vld4.8       {d0, d1, d2, d3}, [%0]!      \n" // src line 0 |  218     "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"  // src line 0 | 
|  236     MEMACCESS(3) |  219     MEMACCESS(3) | 
|  237     "vld4.8       {d4, d5, d6, d7}, [%3]!      \n" // src line 1 |  220     "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32                \n"  // src line 1 | 
|  238     "subs         %2, %2, #24                  \n" |  221     "subs         %2, %2, #24                          \n" | 
|  239     // average src line 0 with src line 1 |  222     // average src line 0 with src line 1 | 
|  240     "vrhadd.u8    q0, q0, q2                   \n" |  223     "urhadd    v0.8b, v0.8b, v4.8b                     \n" | 
|  241     "vrhadd.u8    q1, q1, q3                   \n" |  224     "urhadd    v1.8b, v1.8b, v5.8b                     \n" | 
 |  225     "urhadd    v2.8b, v2.8b, v6.8b                     \n" | 
 |  226     "urhadd    v3.8b, v3.8b, v7.8b                     \n" | 
|  242  |  227  | 
|  243     // a0 = (src[0] * 3 + s[1] * 1) >> 2 |  228     // a0 = (src[0] * 3 + s[1] * 1) >> 2 | 
|  244     "vmovl.u8     q3, d1                       \n" |  229     "ushll     v4.8h, v1.8b, #0                        \n" | 
|  245     "vmlal.u8     q3, d0, d24                  \n" |  230     "umlal     v4.8h, v0.8b, v20.8b                    \n" | 
|  246     "vqrshrn.u16  d0, q3, #2                   \n" |  231     "uqrshrn   v0.8b, v4.8h, #2                        \n" | 
|  247  |  232  | 
|  248     // a1 = (src[1] * 1 + s[2] * 1) >> 1 |  233     // a1 = (src[1] * 1 + s[2] * 1) >> 1 | 
|  249     "vrhadd.u8    d1, d1, d2                   \n" |  234     "urhadd    v1.8b, v1.8b, v2.8b                     \n" | 
|  250  |  235  | 
|  251     // a2 = (src[2] * 1 + s[3] * 3) >> 2 |  236     // a2 = (src[2] * 1 + s[3] * 3) >> 2 | 
|  252     "vmovl.u8     q3, d2                       \n" |  237     "ushll     v4.8h, v2.8b, #0                        \n" | 
|  253     "vmlal.u8     q3, d3, d24                  \n" |  238     "umlal     v4.8h, v3.8b, v20.8b                    \n" | 
|  254     "vqrshrn.u16  d2, q3, #2                   \n" |  239     "uqrshrn   v2.8b, v4.8h, #2                        \n" | 
|  255  |  240  | 
|  256     MEMACCESS(1) |  241     MEMACCESS(1) | 
|  257     "vst3.8       {d0, d1, d2}, [%1]!          \n" |  242     "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24                \n" | 
|  258     "bgt          1b                           \n" |  243     "b.gt      1b                                      \n" | 
|  259   : "+r"(src_ptr),          // %0 |  244   : "+r"(src_ptr),          // %0 | 
|  260     "+r"(dst_ptr),          // %1 |  245     "+r"(dst_ptr),          // %1 | 
|  261     "+r"(dst_width),        // %2 |  246     "+r"(dst_width),        // %2 | 
|  262     "+r"(src_stride)        // %3 |  247     "+r"(src_stride)        // %3 | 
|  263   : |  248   : | 
|  264   : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc" |  249   : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc" | 
|  265   ); |  250   ); | 
|  266 } |  251 } | 
|  267 #endif //HAS_SCALEROWDOWN34_NEON |  | 
|  268  |  252  | 
|  269 #ifdef HAS_SCALEROWDOWN38_NEON |  | 
|  270 #define HAS_SCALEROWDOWN38_NEON |  | 
|  271 static uvec8 kShuf38 = |  253 static uvec8 kShuf38 = | 
|  272   { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 }; |  254   { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 }; | 
|  273 static uvec8 kShuf38_2 = |  255 static uvec8 kShuf38_2 = | 
|  274   { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 }; |  256   { 0, 16, 32, 2, 18, 33, 4, 20, 34, 6, 22, 35, 0, 0, 0, 0 }; | 
|  275 static vec16 kMult38_Div6 = |  257 static vec16 kMult38_Div6 = | 
|  276   { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, |  258   { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, | 
|  277     65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 }; |  259     65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 }; | 
|  278 static vec16 kMult38_Div9 = |  260 static vec16 kMult38_Div9 = | 
|  279   { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, |  261   { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, | 
|  280     65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 }; |  262     65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 }; | 
|  281  |  263  | 
|  282 // 32 -> 12 |  264 // 32 -> 12 | 
|  283 void ScaleRowDown38_NEON(const uint8* src_ptr, |  265 void ScaleRowDown38_NEON(const uint8* src_ptr, | 
|  284                          ptrdiff_t src_stride, |  266                          ptrdiff_t src_stride, | 
|  285                          uint8* dst_ptr, int dst_width) { |  267                          uint8* dst_ptr, int dst_width) { | 
|  286   asm volatile ( |  268   asm volatile ( | 
|  287     MEMACCESS(3) |  269     MEMACCESS(3) | 
|  288     "vld1.8     {q3}, [%3]                     \n" |  270     "ld1       {v3.16b}, [%3]                          \n" | 
|  289     ".p2align   2                              \n" |  271   "1:                                                  \n" | 
|  290   "1:                                          \n" |  272     MEMACCESS(0) | 
|  291     MEMACCESS(0) |  273     "ld1       {v0.16b,v1.16b}, [%0], #32             \n" | 
|  292     "vld1.8     {d0, d1, d2, d3}, [%0]!        \n" |  274     "subs      %2, %2, #12                             \n" | 
|  293     "subs       %2, %2, #12                    \n" |  275     "tbl       v2.16b, {v0.16b,v1.16b}, v3.16b        \n" | 
|  294     "vtbl.u8    d4, {d0, d1, d2, d3}, d6       \n" |  276     MEMACCESS(1) | 
|  295     "vtbl.u8    d5, {d0, d1, d2, d3}, d7       \n" |  277     "st1       {v2.8b}, [%1], #8                       \n" | 
|  296     MEMACCESS(1) |  278     MEMACCESS(1) | 
|  297     "vst1.8     {d4}, [%1]!                    \n" |  279     "st1       {v2.s}[2], [%1], #4                     \n" | 
|  298     MEMACCESS(1) |  280     "b.gt      1b                                      \n" | 
|  299     "vst1.32    {d5[0]}, [%1]!                 \n" |  | 
|  300     "bgt        1b                             \n" |  | 
|  301   : "+r"(src_ptr),          // %0 |  281   : "+r"(src_ptr),          // %0 | 
|  302     "+r"(dst_ptr),          // %1 |  282     "+r"(dst_ptr),          // %1 | 
|  303     "+r"(dst_width)         // %2 |  283     "+r"(dst_width)         // %2 | 
|  304   : "r"(&kShuf38)           // %3 |  284   : "r"(&kShuf38)           // %3 | 
|  305   : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc" |  285   : "v0", "v1", "v2", "v3", "memory", "cc" | 
|  306   ); |  286   ); | 
|  307 } |  287 } | 
|  308  |  288  | 
|  309 #endif //HAS_SCALEROWDOWN38_NEON |  | 
|  310  |  | 
|  311 #ifdef HAS_SCALEROWDOWN38_NEON |  | 
|  312 // 32x3 -> 12x1 |  289 // 32x3 -> 12x1 | 
|  313 void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, |  290 void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, | 
|  314                                       ptrdiff_t src_stride, |  291                                       ptrdiff_t src_stride, | 
|  315                                       uint8* dst_ptr, int dst_width) { |  292                                       uint8* dst_ptr, int dst_width) { | 
|  316   const uint8* src_ptr1 = src_ptr + src_stride * 2; |  293   const uint8* src_ptr1 = src_ptr + src_stride * 2; | 
 |  294   ptrdiff_t tmp_src_stride = src_stride; | 
|  317  |  295  | 
|  318   asm volatile ( |  296   asm volatile ( | 
|  319     MEMACCESS(5) |  297     MEMACCESS(5) | 
|  320     "vld1.16    {q13}, [%5]                    \n" |  298     "ld1       {v29.8h}, [%5]                          \n" | 
|  321     MEMACCESS(6) |  299     MEMACCESS(6) | 
|  322     "vld1.8     {q14}, [%6]                    \n" |  300     "ld1       {v30.16b}, [%6]                         \n" | 
|  323     MEMACCESS(7) |  301     MEMACCESS(7) | 
|  324     "vld1.8     {q15}, [%7]                    \n" |  302     "ld1       {v31.8h}, [%7]                          \n" | 
|  325     "add        %3, %0                         \n" |  303     "add       %2, %2, %0                              \n" | 
|  326     ".p2align   2                              \n" |  304   "1:                                                  \n" | 
|  327   "1:                                          \n" |  305  | 
|  328  |  306     // 00 40 01 41 02 42 03 43 | 
|  329     // d0 = 00 40 01 41 02 42 03 43 |  307     // 10 50 11 51 12 52 13 53 | 
|  330     // d1 = 10 50 11 51 12 52 13 53 |  308     // 20 60 21 61 22 62 23 63 | 
|  331     // d2 = 20 60 21 61 22 62 23 63 |  309     // 30 70 31 71 32 72 33 73 | 
|  332     // d3 = 30 70 31 71 32 72 33 73 |  310     MEMACCESS(0) | 
|  333     MEMACCESS(0) |  311     "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n" | 
|  334     "vld4.8       {d0, d1, d2, d3}, [%0]!      \n" |  | 
|  335     MEMACCESS(3) |  312     MEMACCESS(3) | 
|  336     "vld4.8       {d4, d5, d6, d7}, [%3]!      \n" |  313     "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32                \n" | 
|  337     MEMACCESS(4) |  314     MEMACCESS(4) | 
|  338     "vld4.8       {d16, d17, d18, d19}, [%4]!  \n" |  315     "ld4       {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32              \n" | 
|  339     "subs         %2, %2, #12                  \n" |  316     "subs      %4, %4, #12                             \n" | 
|  340  |  317  | 
|  341     // Shuffle the input data around to get align the data |  318     // Shuffle the input data around to get align the data | 
|  342     //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 |  319     //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 | 
|  343     // d0 = 00 10 01 11 02 12 03 13 |  320     // 00 10 01 11 02 12 03 13 | 
|  344     // d1 = 40 50 41 51 42 52 43 53 |  321     // 40 50 41 51 42 52 43 53 | 
|  345     "vtrn.u8      d0, d1                       \n" |  322     "trn1      v20.8b, v0.8b, v1.8b                    \n" | 
|  346     "vtrn.u8      d4, d5                       \n" |  323     "trn2      v21.8b, v0.8b, v1.8b                    \n" | 
|  347     "vtrn.u8      d16, d17                     \n" |  324     "trn1      v22.8b, v4.8b, v5.8b                    \n" | 
|  348  |  325     "trn2      v23.8b, v4.8b, v5.8b                    \n" | 
|  349     // d2 = 20 30 21 31 22 32 23 33 |  326     "trn1      v24.8b, v16.8b, v17.8b                  \n" | 
|  350     // d3 = 60 70 61 71 62 72 63 73 |  327     "trn2      v25.8b, v16.8b, v17.8b                  \n" | 
|  351     "vtrn.u8      d2, d3                       \n" |  328  | 
|  352     "vtrn.u8      d6, d7                       \n" |  329     // 20 30 21 31 22 32 23 33 | 
|  353     "vtrn.u8      d18, d19                     \n" |  330     // 60 70 61 71 62 72 63 73 | 
|  354  |  331     "trn1      v0.8b, v2.8b, v3.8b                     \n" | 
|  355     // d0 = 00+10 01+11 02+12 03+13 |  332     "trn2      v1.8b, v2.8b, v3.8b                     \n" | 
|  356     // d2 = 40+50 41+51 42+52 43+53 |  333     "trn1      v4.8b, v6.8b, v7.8b                     \n" | 
|  357     "vpaddl.u8    q0, q0                       \n" |  334     "trn2      v5.8b, v6.8b, v7.8b                     \n" | 
|  358     "vpaddl.u8    q2, q2                       \n" |  335     "trn1      v16.8b, v18.8b, v19.8b                  \n" | 
|  359     "vpaddl.u8    q8, q8                       \n" |  336     "trn2      v17.8b, v18.8b, v19.8b                  \n" | 
|  360  |  337  | 
|  361     // d3 = 60+70 61+71 62+72 63+73 |  338     // 00+10 01+11 02+12 03+13 | 
|  362     "vpaddl.u8    d3, d3                       \n" |  339     // 40+50 41+51 42+52 43+53 | 
|  363     "vpaddl.u8    d7, d7                       \n" |  340     "uaddlp    v20.4h, v20.8b                          \n" | 
|  364     "vpaddl.u8    d19, d19                     \n" |  341     "uaddlp    v21.4h, v21.8b                          \n" | 
 |  342     "uaddlp    v22.4h, v22.8b                          \n" | 
 |  343     "uaddlp    v23.4h, v23.8b                          \n" | 
 |  344     "uaddlp    v24.4h, v24.8b                          \n" | 
 |  345     "uaddlp    v25.4h, v25.8b                          \n" | 
 |  346  | 
 |  347     // 60+70 61+71 62+72 63+73 | 
 |  348     "uaddlp    v1.4h, v1.8b                            \n" | 
 |  349     "uaddlp    v5.4h, v5.8b                            \n" | 
 |  350     "uaddlp    v17.4h, v17.8b                          \n" | 
|  365  |  351  | 
|  366     // combine source lines |  352     // combine source lines | 
|  367     "vadd.u16     q0, q2                       \n" |  353     "add       v20.4h, v20.4h, v22.4h                  \n" | 
|  368     "vadd.u16     q0, q8                       \n" |  354     "add       v21.4h, v21.4h, v23.4h                  \n" | 
|  369     "vadd.u16     d4, d3, d7                   \n" |  355     "add       v20.4h, v20.4h, v24.4h                  \n" | 
|  370     "vadd.u16     d4, d19                      \n" |  356     "add       v21.4h, v21.4h, v25.4h                  \n" | 
 |  357     "add       v2.4h, v1.4h, v5.4h                     \n" | 
 |  358     "add       v2.4h, v2.4h, v17.4h                    \n" | 
|  371  |  359  | 
|  372     // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] |  360     // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] | 
|  373     //             + s[6 + st * 1] + s[7 + st * 1] |  361     //             + s[6 + st * 1] + s[7 + st * 1] | 
|  374     //             + s[6 + st * 2] + s[7 + st * 2]) / 6 |  362     //             + s[6 + st * 2] + s[7 + st * 2]) / 6 | 
|  375     "vqrdmulh.s16 q2, q2, q13                  \n" |  363     "sqrdmulh  v2.8h, v2.8h, v29.8h                    \n" | 
|  376     "vmovn.u16    d4, q2                       \n" |  364     "xtn       v2.8b,  v2.8h                           \n" | 
|  377  |  365  | 
|  378     // Shuffle 2,3 reg around so that 2 can be added to the |  366     // Shuffle 2,3 reg around so that 2 can be added to the | 
|  379     //  0,1 reg and 3 can be added to the 4,5 reg. This |  367     //  0,1 reg and 3 can be added to the 4,5 reg. This | 
|  380     //  requires expanding from u8 to u16 as the 0,1 and 4,5 |  368     //  requires expanding from u8 to u16 as the 0,1 and 4,5 | 
|  381     //  registers are already expanded. Then do transposes |  369     //  registers are already expanded. Then do transposes | 
|  382     //  to get aligned. |  370     //  to get aligned. | 
|  383     // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 |  371     // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 | 
|  384     "vmovl.u8     q1, d2                       \n" |  372     "ushll     v16.8h, v16.8b, #0                      \n" | 
|  385     "vmovl.u8     q3, d6                       \n" |  373     "uaddl     v0.8h, v0.8b, v4.8b                     \n" | 
|  386     "vmovl.u8     q9, d18                      \n" |  | 
|  387  |  374  | 
|  388     // combine source lines |  375     // combine source lines | 
|  389     "vadd.u16     q1, q3                       \n" |  376     "add       v0.8h, v0.8h, v16.8h                    \n" | 
|  390     "vadd.u16     q1, q9                       \n" |  377  | 
|  391  |  378     // xx 20 xx 21 xx 22 xx 23 | 
|  392     // d4 = xx 20 xx 30 xx 22 xx 32 |  379     // xx 30 xx 31 xx 32 xx 33 | 
|  393     // d5 = xx 21 xx 31 xx 23 xx 33 |  380     "trn1      v1.8h, v0.8h, v0.8h                     \n" | 
|  394     "vtrn.u32     d2, d3                       \n" |  381     "trn2      v4.8h, v0.8h, v0.8h                     \n" | 
|  395  |  382     "xtn       v0.4h, v1.4s                            \n" | 
|  396     // d4 = xx 20 xx 21 xx 22 xx 23 |  383     "xtn       v4.4h, v4.4s                            \n" | 
|  397     // d5 = xx 30 xx 31 xx 32 xx 33 |  | 
|  398     "vtrn.u16     d2, d3                       \n" |  | 
|  399  |  384  | 
|  400     // 0+1+2, 3+4+5 |  385     // 0+1+2, 3+4+5 | 
|  401     "vadd.u16     q0, q1                       \n" |  386     "add       v20.8h, v20.8h, v0.8h                   \n" | 
 |  387     "add       v21.8h, v21.8h, v4.8h                   \n" | 
|  402  |  388  | 
|  403     // Need to divide, but can't downshift as the the value |  389     // Need to divide, but can't downshift as the the value | 
|  404     //  isn't a power of 2. So multiply by 65536 / n |  390     //  isn't a power of 2. So multiply by 65536 / n | 
|  405     //  and take the upper 16 bits. |  391     //  and take the upper 16 bits. | 
|  406     "vqrdmulh.s16 q0, q0, q15                  \n" |  392     "sqrdmulh  v0.8h, v20.8h, v31.8h                   \n" | 
 |  393     "sqrdmulh  v1.8h, v21.8h, v31.8h                   \n" | 
|  407  |  394  | 
|  408     // Align for table lookup, vtbl requires registers to |  395     // Align for table lookup, vtbl requires registers to | 
|  409     //  be adjacent |  396     //  be adjacent | 
|  410     "vmov.u8      d2, d4                       \n" |  397     "tbl       v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n" | 
|  411  |  398  | 
|  412     "vtbl.u8      d3, {d0, d1, d2}, d28        \n" |  399     MEMACCESS(1) | 
|  413     "vtbl.u8      d4, {d0, d1, d2}, d29        \n" |  400     "st1       {v3.8b}, [%1], #8                       \n" | 
|  414  |  401     MEMACCESS(1) | 
|  415     MEMACCESS(1) |  402     "st1       {v3.s}[2], [%1], #4                     \n" | 
|  416     "vst1.8       {d3}, [%1]!                  \n" |  403     "b.gt      1b                                      \n" | 
|  417     MEMACCESS(1) |  | 
|  418     "vst1.32      {d4[0]}, [%1]!               \n" |  | 
|  419     "bgt          1b                           \n" |  | 
|  420   : "+r"(src_ptr),          // %0 |  404   : "+r"(src_ptr),          // %0 | 
|  421     "+r"(dst_ptr),          // %1 |  405     "+r"(dst_ptr),          // %1 | 
|  422     "+r"(dst_width),        // %2 |  406     "+r"(tmp_src_stride),   // %2 | 
|  423     "+r"(src_stride),       // %3 |  407     "+r"(src_ptr1),         // %3 | 
|  424     "+r"(src_ptr1)          // %4 |  408     "+r"(dst_width)         // %4 | 
|  425   : "r"(&kMult38_Div6),     // %5 |  409   : "r"(&kMult38_Div6),     // %5 | 
|  426     "r"(&kShuf38_2),        // %6 |  410     "r"(&kShuf38_2),        // %6 | 
|  427     "r"(&kMult38_Div9)      // %7 |  411     "r"(&kMult38_Div9)      // %7 | 
|  428   : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc" |  412   : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", | 
|  429   ); |  413     "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", | 
|  430 } |  414     "v30", "v31", "memory", "cc" | 
|  431 #endif //HAS_SCALEROWDOWN38_NEON |  415   ); | 
|  432  |  416 } | 
|  433 #ifdef HAS_SCALEROWDOWN38_NEON |  417  | 
// 32x2 -> 12x1
// Box-filters two source rows down to 3/8 width: each iteration consumes
// 32 source pixels from each of the two rows and emits 12 destination
// pixels, so dst_width must be a multiple of 12 (see "subs ... #12").
// v30 <- kMult38_Div6 (fixed-point multipliers used in place of division),
// v31 <- kShuf38_2 (byte-shuffle pattern for the final tbl).
void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  // TODO(fbarchard): use src_stride directly for clang 3.5+.
  ptrdiff_t tmp_src_stride = src_stride;
  asm volatile (
    MEMACCESS(4)
    "ld1       {v30.8h}, [%4]                          \n"
    MEMACCESS(5)
    "ld1       {v31.16b}, [%5]                         \n"
    // %2 becomes a pointer to the second source row (src_ptr + src_stride).
    "add       %2, %2, %0                              \n"
  "1:                                                  \n"

    // ld4 de-interleaves each row into 4 registers of every-4th bytes:
    // 00 40 01 41 02 42 03 43
    // 10 50 11 51 12 52 13 53
    // 20 60 21 61 22 62 23 63
    // 30 70 31 71 32 72 33 73
    MEMACCESS(0)
    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"
    MEMACCESS(3)
    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32                \n"
    "subs      %3, %3, #12                             \n"

    // Shuffle the input data around to align the data
    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
    // 00 10 01 11 02 12 03 13
    // 40 50 41 51 42 52 43 53
    "trn1      v16.8b, v0.8b, v1.8b                    \n"
    "trn2      v17.8b, v0.8b, v1.8b                    \n"
    "trn1      v18.8b, v4.8b, v5.8b                    \n"
    "trn2      v19.8b, v4.8b, v5.8b                    \n"

    // 20 30 21 31 22 32 23 33
    // 60 70 61 71 62 72 63 73
    "trn1      v0.8b, v2.8b, v3.8b                     \n"
    "trn2      v1.8b, v2.8b, v3.8b                     \n"
    "trn1      v4.8b, v6.8b, v7.8b                     \n"
    "trn2      v5.8b, v6.8b, v7.8b                     \n"

    // Pairwise-widen to u16 sums:
    // 00+10 01+11 02+12 03+13
    // 40+50 41+51 42+52 43+53
    "uaddlp    v16.4h, v16.8b                          \n"
    "uaddlp    v17.4h, v17.8b                          \n"
    "uaddlp    v18.4h, v18.8b                          \n"
    "uaddlp    v19.4h, v19.8b                          \n"

    // 60+70 61+71 62+72 63+73
    "uaddlp    v1.4h, v1.8b                            \n"
    "uaddlp    v5.4h, v5.8b                            \n"

    // combine source lines
    "add       v16.4h, v16.4h, v18.4h                  \n"
    "add       v17.4h, v17.4h, v19.4h                  \n"
    "add       v2.4h, v1.4h, v5.4h                     \n"

    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
    // (rounding narrow with shift by 2 divides the 4-pixel sum by 4)
    "uqrshrn   v2.8b, v2.8h, #2                        \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    //  0,1 reg and 3 can be added to the 4,5 reg. This
    //  requires expanding from u8 to u16 as the 0,1 and 4,5
    //  registers are already expanded. Then do transposes
    //  to get aligned.
    // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33

    // combine source lines (widening add of the two rows' 2,3 lanes)
    "uaddl     v0.8h, v0.8b, v4.8b                     \n"

    // xx 20 xx 21 xx 22 xx 23
    // xx 30 xx 31 xx 32 xx 33
    "trn1      v1.8h, v0.8h, v0.8h                     \n"
    "trn2      v4.8h, v0.8h, v0.8h                     \n"
    "xtn       v0.4h, v1.4s                            \n"
    "xtn       v4.4h, v4.4s                            \n"

    // 0+1+2, 3+4+5
    "add       v16.8h, v16.8h, v0.8h                   \n"
    "add       v17.8h, v17.8h, v4.8h                   \n"

    // Need to divide, but can't downshift as the value
    //  isn't a power of 2. So multiply by 65536 / n
    //  and take the upper 16 bits.
    "sqrdmulh  v0.8h, v16.8h, v30.8h                   \n"
    "sqrdmulh  v1.8h, v17.8h, v30.8h                   \n"

    // Align for table lookup; tbl requires its source registers to
    //  be adjacent (v0..v2 here).
    "tbl       v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"

    // Store 12 output bytes: 8 then 4.
    MEMACCESS(1)
    "st1       {v3.8b}, [%1], #8                       \n"
    MEMACCESS(1)
    "st1       {v3.s}[2], [%1], #4                     \n"
    "b.gt      1b                                      \n"
  : "+r"(src_ptr),         // %0
    "+r"(dst_ptr),         // %1
    "+r"(tmp_src_stride),  // %2
    "+r"(dst_width)        // %3
  : "r"(&kMult38_Div6),    // %4
    "r"(&kShuf38_2)        // %5
  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
    "v18", "v19", "v30", "v31", "memory", "cc"
  );
}
|  535  |  | 
|  536 #if 0 |  | 
// 16x2 -> 16x1
// Vertically blends two source rows, 16 pixels per iteration, weighting the
// second row by source_y_fraction/256 and the first by (256-fraction)/256.
// Fractions 0, 64, 128 and 192 take dedicated fast paths (copy and
// rounding-halving-add blends); anything else uses the general multiply path.
// After the loop the final byte is stored once more at [%0] (one byte past
// the last written pixel) — NOTE(review): presumably callers allocate for
// this trailing byte; confirm against call sites.
void ScaleFilterRows_NEON(uint8* dst_ptr,
                          const uint8* src_ptr, ptrdiff_t src_stride,
                          int dst_width, int source_y_fraction) {
    // Weight of the first row, in 1/256 units.
    int y_fraction = 256 - source_y_fraction;
  asm volatile (
    "cmp          %4, #0                       \n"
    "b.eq         100f                         \n"
    // %2 becomes a pointer to the second source row.
    "add          %2, %2, %1                   \n"
    "cmp          %4, #64                      \n"
    "b.eq         75f                          \n"
    "cmp          %4, #128                     \n"
    "b.eq         50f                          \n"
    "cmp          %4, #192                     \n"
    "b.eq         25f                          \n"

    // v5 = second-row weight, v4 = first-row weight (both per-byte).
    "dup          v5.8b, %w4                   \n"
    "dup          v4.8b, %w5                   \n"
    // General purpose row blend.
  "1:                                          \n"
    MEMACCESS(1)
    "ld1          {v0.16b}, [%1], #16          \n"
    MEMACCESS(2)
    "ld1          {v1.16b}, [%2], #16          \n"
    "subs         %3, %3, #16                  \n"
    "umull        v6.8h, v0.8b, v4.8b          \n"
    "umull2       v7.8h, v0.16b, v4.16b        \n"
    "umlal        v6.8h, v1.8b, v5.8b          \n"
    "umlal2       v7.8h, v1.16b, v5.16b        \n"
    // Divide the weighted sum by 256 with rounding.
    "rshrn        v0.8b, v6.8h, #8             \n"
    "rshrn2       v0.16b, v7.8h, #8            \n"
    MEMACCESS(0)
    "st1          {v0.16b}, [%0], #16          \n"
    "b.gt         1b                           \n"
    "b            99f                          \n"

    // Blend 25 / 75: two rounding-halving adds => (row0 + 3*row1)/4.
  "25:                                         \n"
    MEMACCESS(1)
    "ld1          {v0.16b}, [%1], #16          \n"
    MEMACCESS(2)
    "ld1          {v1.16b}, [%2], #16          \n"
    "subs         %3, %3, #16                  \n"
    "urhadd       v0.16b, v0.16b, v1.16b       \n"
    "urhadd       v0.16b, v0.16b, v1.16b       \n"
    MEMACCESS(0)
    "st1          {v0.16b}, [%0], #16          \n"
    "b.gt         25b                          \n"
    "b            99f                          \n"

    // Blend 50 / 50.
  "50:                                         \n"
    MEMACCESS(1)
    "ld1          {v0.16b}, [%1], #16          \n"
    MEMACCESS(2)
    "ld1          {v1.16b}, [%2], #16          \n"
    "subs         %3, %3, #16                  \n"
    "urhadd       v0.16b, v0.16b, v1.16b       \n"
    MEMACCESS(0)
    "st1          {v0.16b}, [%0], #16          \n"
    "b.gt         50b                          \n"
    "b            99f                          \n"

    // Blend 75 / 25: rows swapped relative to the 25/75 path.
  "75:                                         \n"
    MEMACCESS(1)
    "ld1          {v1.16b}, [%1], #16          \n"
    MEMACCESS(2)
    "ld1          {v0.16b}, [%2], #16          \n"
    "subs         %3, %3, #16                  \n"
    "urhadd       v0.16b, v0.16b, v1.16b       \n"
    "urhadd       v0.16b, v0.16b, v1.16b       \n"
    MEMACCESS(0)
    "st1          {v0.16b}, [%0], #16          \n"
    "b.gt         75b                          \n"
    "b            99f                          \n"

    // Blend 100 / 0 - Copy row unchanged.
  "100:                                        \n"
    MEMACCESS(1)
    "ld1          {v0.16b}, [%1], #16          \n"
    "subs         %3, %3, #16                  \n"
    MEMACCESS(0)
    "st1          {v0.16b}, [%0], #16          \n"
    "b.gt         100b                         \n"

  "99:                                         \n"
    // Replicate the last output byte one position past the row.
    MEMACCESS(0)
    "st1          {v0.b}[15], [%0]             \n"
  : "+r"(dst_ptr),          // %0
    "+r"(src_ptr),          // %1
    "+r"(src_stride),       // %2
    "+r"(dst_width),        // %3
    "+r"(source_y_fraction),// %4
    "+r"(y_fraction)        // %5
  :
  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"
  );
}
|  635 #endif //0 |  624  | 
|  636  |  | 
|  637 #ifdef HAS_SCALEARGBROWDOWN2_NEON |  | 
// 2x horizontal ARGB downsample by point sampling: reads 16 ARGB pixels per
// iteration and writes every second one (8 pixels), so dst_width must be a
// multiple of 8.  src_stride is unused (single-row operation).
void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst, int dst_width) {
  asm volatile (
  "1:                                          \n"
    // load even pixels into q0, odd into q1
    // (ld2 on 32-bit lanes de-interleaves alternating ARGB pixels)
    MEMACCESS (0)
    "ld2        {v0.4s, v1.4s}, [%0], #32      \n"
    MEMACCESS (0)
    "ld2        {v2.4s, v3.4s}, [%0], #32      \n"
    "subs       %2, %2, #8                     \n"  // 8 processed per loop
    MEMACCESS (1)
    "st1        {v1.16b}, [%1], #16            \n"  // store odd pixels
    MEMACCESS (1)
    "st1        {v3.16b}, [%1], #16            \n"
    "b.gt       1b                             \n"
  : "+r" (src_ptr),          // %0
    "+r" (dst),              // %1
    "+r" (dst_width)         // %2
  :
  : "memory", "cc", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
|  661 #endif //HAS_SCALEARGBROWDOWN2_NEON |  647  | 
|  662  |  | 
|  663 #ifdef HAS_SCALEARGBROWDOWN2_NEON |  | 
// 2x2 box-filter ARGB downsample: averages each 2x2 block of source pixels
// (two rows) into one destination pixel.  Processes 16 source pixels per row
// and emits 8 destination pixels per iteration, so dst_width must be a
// multiple of 8.
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst, int dst_width) {
  asm volatile (
    // change the stride to row 2 pointer
    "add        %1, %1, %0                     \n"
  "1:                                          \n"
    MEMACCESS (0)
    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64   \n"  // load 16 ARGB pixels.
    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
    "uaddlp     v3.8h, v3.16b                  \n"  // A 16 bytes -> 8 shorts.
    MEMACCESS (1)
    "ld4        {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n"  // load 16 more ARGB pixels.
    "uadalp     v0.8h, v16.16b                 \n"  // B 16 bytes -> 8 shorts.
    "uadalp     v1.8h, v17.16b                 \n"  // G 16 bytes -> 8 shorts.
    "uadalp     v2.8h, v18.16b                 \n"  // R 16 bytes -> 8 shorts.
    "uadalp     v3.8h, v19.16b                 \n"  // A 16 bytes -> 8 shorts.
    "rshrn      v0.8b, v0.8h, #2               \n"  // downshift, round and pack
    "rshrn      v1.8b, v1.8h, #2               \n"
    "rshrn      v2.8b, v2.8h, #2               \n"
    "rshrn      v3.8b, v3.8h, #2               \n"
    MEMACCESS (2)
    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32     \n"
    "b.gt       1b                             \n"
  : "+r" (src_ptr),          // %0
    "+r" (src_stride),       // %1
    "+r" (dst),              // %2
    "+r" (dst_width)         // %3
  :
  : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"
  );
}
|  703 #endif //HAS_SCALEARGBROWDOWN2_NEON |  682  | 
|  704  |  | 
|  705 #ifdef HAS_SCALEARGBROWDOWNEVEN_NEON |  | 
|  706 // Reads 4 pixels at a time. |  683 // Reads 4 pixels at a time. | 
|  707 // Alignment requirement: src_argb 4 byte aligned. |  684 // Alignment requirement: src_argb 4 byte aligned. | 
|  708 void ScaleARGBRowDownEven_NEON(const uint8* src_argb,  ptrdiff_t src_stride, |  685 void ScaleARGBRowDownEven_NEON(const uint8* src_argb,  ptrdiff_t src_stride, | 
|  709                                int src_stepx, uint8* dst_argb, int dst_width) { |  686                                int src_stepx, uint8* dst_argb, int dst_width) { | 
|  710   asm volatile ( |  687   asm volatile ( | 
|  711     "mov        r12, %3, lsl #2                \n" |  | 
|  712     ".p2align   2                              \n" |  | 
|  713   "1:                                          \n" |  688   "1:                                          \n" | 
|  714     MEMACCESS(0) |  689     MEMACCESS(0) | 
|  715     "vld1.32    {d0[0]}, [%0], r12             \n" |  690     "ld1        {v0.s}[0], [%0], %3            \n" | 
|  716     MEMACCESS(0) |  691     MEMACCESS(0) | 
|  717     "vld1.32    {d0[1]}, [%0], r12             \n" |  692     "ld1        {v0.s}[1], [%0], %3            \n" | 
|  718     MEMACCESS(0) |  693     MEMACCESS(0) | 
|  719     "vld1.32    {d1[0]}, [%0], r12             \n" |  694     "ld1        {v0.s}[2], [%0], %3            \n" | 
|  720     MEMACCESS(0) |  695     MEMACCESS(0) | 
|  721     "vld1.32    {d1[1]}, [%0], r12             \n" |  696     "ld1        {v0.s}[3], [%0], %3            \n" | 
|  722     "subs       %2, %2, #4                     \n"  // 4 pixels per loop. |  697     "subs       %2, %2, #4                     \n"  // 4 pixels per loop. | 
|  723     MEMACCESS(1) |  698     MEMACCESS(1) | 
|  724     "vst1.8     {q0}, [%1]!                    \n" |  699     "st1        {v0.16b}, [%1], #16            \n" | 
|  725     "bgt        1b                             \n" |  700     "b.gt       1b                             \n" | 
|  726   : "+r"(src_argb),    // %0 |  701   : "+r"(src_argb),    // %0 | 
|  727     "+r"(dst_argb),    // %1 |  702     "+r"(dst_argb),    // %1 | 
|  728     "+r"(dst_width)    // %2 |  703     "+r"(dst_width)    // %2 | 
|  729   : "r"(src_stepx)     // %3 |  704   : "r"(static_cast<ptrdiff_t>(src_stepx * 4)) // %3 | 
|  730   : "memory", "cc", "r12", "q0" |  705   : "memory", "cc", "v0" | 
|  731   ); |  706   ); | 
|  732 } |  707 } | 
|  733 #endif //HAS_SCALEARGBROWDOWNEVEN_NEON |  708  | 
|  734  |  | 
|  735 #ifdef HAS_SCALEARGBROWDOWNEVEN_NEON |  | 
|  736 // Reads 4 pixels at a time. |  709 // Reads 4 pixels at a time. | 
|  737 // Alignment requirement: src_argb 4 byte aligned. |  710 // Alignment requirement: src_argb 4 byte aligned. | 
 |  711 // TODO, might be worth another optimization pass in future. | 
 |  712 // It could be upgraded to 8 pixels at a time to start with. | 
|  738 void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, |  713 void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, | 
|  739                                   int src_stepx, |  714                                   int src_stepx, | 
|  740                                   uint8* dst_argb, int dst_width) { |  715                                   uint8* dst_argb, int dst_width) { | 
|  741   asm volatile ( |  716   asm volatile ( | 
|  742     "mov        r12, %4, lsl #2                \n" |  | 
|  743     "add        %1, %1, %0                     \n" |  717     "add        %1, %1, %0                     \n" | 
|  744     ".p2align   2                              \n" |  | 
|  745   "1:                                          \n" |  718   "1:                                          \n" | 
|  746     MEMACCESS(0) |  719     MEMACCESS(0) | 
|  747     "vld1.8     {d0}, [%0], r12                \n"  // Read 4 2x2 blocks -> 2x1 |  720     "ld1     {v0.8b}, [%0], %4                 \n"  // Read 4 2x2 blocks -> 2x1 | 
|  748     MEMACCESS(1) |  721     MEMACCESS(1) | 
|  749     "vld1.8     {d1}, [%1], r12                \n" |  722     "ld1     {v1.8b}, [%1], %4                 \n" | 
|  750     MEMACCESS(0) |  723     MEMACCESS(0) | 
|  751     "vld1.8     {d2}, [%0], r12                \n" |  724     "ld1     {v2.8b}, [%0], %4                 \n" | 
|  752     MEMACCESS(1) |  725     MEMACCESS(1) | 
|  753     "vld1.8     {d3}, [%1], r12                \n" |  726     "ld1     {v3.8b}, [%1], %4                 \n" | 
|  754     MEMACCESS(0) |  727     MEMACCESS(0) | 
|  755     "vld1.8     {d4}, [%0], r12                \n" |  728     "ld1     {v4.8b}, [%0], %4                 \n" | 
|  756     MEMACCESS(1) |  729     MEMACCESS(1) | 
|  757     "vld1.8     {d5}, [%1], r12                \n" |  730     "ld1     {v5.8b}, [%1], %4                 \n" | 
|  758     MEMACCESS(0) |  731     MEMACCESS(0) | 
|  759     "vld1.8     {d6}, [%0], r12                \n" |  732     "ld1     {v6.8b}, [%0], %4                 \n" | 
|  760     MEMACCESS(1) |  733     MEMACCESS(1) | 
|  761     "vld1.8     {d7}, [%1], r12                \n" |  734     "ld1     {v7.8b}, [%1], %4                 \n" | 
|  762     "vaddl.u8   q0, d0, d1                     \n" |  735     "uaddl   v0.8h, v0.8b, v1.8b               \n" | 
|  763     "vaddl.u8   q1, d2, d3                     \n" |  736     "uaddl   v2.8h, v2.8b, v3.8b               \n" | 
|  764     "vaddl.u8   q2, d4, d5                     \n" |  737     "uaddl   v4.8h, v4.8b, v5.8b               \n" | 
|  765     "vaddl.u8   q3, d6, d7                     \n" |  738     "uaddl   v6.8h, v6.8b, v7.8b               \n" | 
|  766     "vswp.8     d1, d2                         \n"  // ab_cd -> ac_bd |  739     "mov     v16.d[1], v0.d[1]                 \n"  // ab_cd -> ac_bd | 
|  767     "vswp.8     d5, d6                         \n"  // ef_gh -> eg_fh |  740     "mov     v0.d[1], v2.d[0]                  \n" | 
|  768     "vadd.u16   q0, q0, q1                     \n"  // (a+b)_(c+d) |  741     "mov     v2.d[0], v16.d[1]                 \n" | 
|  769     "vadd.u16   q2, q2, q3                     \n"  // (e+f)_(g+h) |  742     "mov     v16.d[1], v4.d[1]                 \n"  // ef_gh -> eg_fh | 
|  770     "vrshrn.u16 d0, q0, #2                     \n"  // first 2 pixels. |  743     "mov     v4.d[1], v6.d[0]                  \n" | 
|  771     "vrshrn.u16 d1, q2, #2                     \n"  // next 2 pixels. |  744     "mov     v6.d[0], v16.d[1]                 \n" | 
 |  745     "add     v0.8h, v0.8h, v2.8h               \n"  // (a+b)_(c+d) | 
 |  746     "add     v4.8h, v4.8h, v6.8h               \n"  // (e+f)_(g+h) | 
 |  747     "rshrn   v0.8b, v0.8h, #2                  \n"  // first 2 pixels. | 
 |  748     "rshrn2  v0.16b, v4.8h, #2                 \n"  // next 2 pixels. | 
|  772     "subs       %3, %3, #4                     \n"  // 4 pixels per loop. |  749     "subs       %3, %3, #4                     \n"  // 4 pixels per loop. | 
|  773     MEMACCESS(2) |  750     MEMACCESS(2) | 
|  774     "vst1.8     {q0}, [%2]!                    \n" |  751     "st1     {v0.16b}, [%2], #16               \n" | 
|  775     "bgt        1b                             \n" |  752     "b.gt       1b                             \n" | 
|  776   : "+r"(src_argb),    // %0 |  753   : "+r"(src_argb),    // %0 | 
|  777     "+r"(src_stride),  // %1 |  754     "+r"(src_stride),  // %1 | 
|  778     "+r"(dst_argb),    // %2 |  755     "+r"(dst_argb),    // %2 | 
|  779     "+r"(dst_width)    // %3 |  756     "+r"(dst_width)    // %3 | 
|  780   : "r"(src_stepx)     // %4 |  757   : "r"(src_stepx * 4) // %4 | 
|  781   : "memory", "cc", "r12", "q0", "q1", "q2", "q3" |  758   : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" | 
|  782   ); |  759   ); | 
|  783 } |  760 } | 
|  784 #endif  // HAS_SCALEARGBROWDOWNEVEN_NEON |  761 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) | 
|  785 #endif  // __aarch64__ |  | 
|  786  |  762  | 
|  787 #ifdef __cplusplus |  763 #ifdef __cplusplus | 
|  788 }  // extern "C" |  764 }  // extern "C" | 
|  789 }  // namespace libyuv |  765 }  // namespace libyuv | 
|  790 #endif |  766 #endif | 
| OLD | NEW |