| OLD | NEW | 
|     1 /* |     1 /* | 
|     2  *  Copyright 2011 The LibYuv Project Authors. All rights reserved. |     2  *  Copyright 2014 The LibYuv Project Authors. All rights reserved. | 
|     3  * |     3  * | 
|     4  *  Use of this source code is governed by a BSD-style license |     4  *  Use of this source code is governed by a BSD-style license | 
|     5  *  that can be found in the LICENSE file in the root of the source |     5  *  that can be found in the LICENSE file in the root of the source | 
|     6  *  tree. An additional intellectual property rights grant can be found |     6  *  tree. An additional intellectual property rights grant can be found | 
|     7  *  in the file PATENTS. All contributing project authors may |     7  *  in the file PATENTS. All contributing project authors may | 
|     8  *  be found in the AUTHORS file in the root of the source tree. |     8  *  be found in the AUTHORS file in the root of the source tree. | 
|     9  */ |     9  */ | 
|    10  |    10  | 
|    11 #include "libyuv/row.h" |    11 #include "libyuv/row.h" | 
|    12  |    12  | 
|    13 #ifdef __cplusplus |    13 #ifdef __cplusplus | 
|    14 namespace libyuv { |    14 namespace libyuv { | 
|    15 extern "C" { |    15 extern "C" { | 
|    16 #endif |    16 #endif | 
|    17  |    17  | 
|    18 // This module is for GCC Neon |    18 // This module is for GCC Neon armv8 64 bit. | 
|    19 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) |    19 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) | 
|    20  |    20  | 
|    21 // Read 8 Y, 4 U and 4 V from 422 |    21 // Read 8 Y, 4 U and 4 V from 422 | 
|    22 #define READYUV422                                                             \ |    22 #define READYUV422                                                             \ | 
|    23     MEMACCESS(0)                                                               \ |    23     MEMACCESS(0)                                                               \ | 
|    24     "vld1.8     {d0}, [%0]!                    \n"                             \ |    24     "ld1        {v0.8b}, [%0], #8              \n"                             \ | 
|    25     MEMACCESS(1)                                                               \ |    25     MEMACCESS(1)                                                               \ | 
|    26     "vld1.32    {d2[0]}, [%1]!                 \n"                             \ |    26     "ld1        {v1.s}[0], [%1], #4            \n"                             \ | 
|    27     MEMACCESS(2)                                                               \ |    27     MEMACCESS(2)                                                               \ | 
|    28     "vld1.32    {d2[1]}, [%2]!                 \n" |    28     "ld1        {v1.s}[1], [%2], #4            \n" | 
|    29  |    29  | 
|    30 // Read 8 Y, 2 U and 2 V from 422 |    30 // Read 8 Y, 2 U and 2 V from 411 | 
|    31 #define READYUV411                                                             \ |    31 #define READYUV411                                                             \ | 
|    32     MEMACCESS(0)                                                               \ |    32     MEMACCESS(0)                                                               \ | 
|    33     "vld1.8     {d0}, [%0]!                    \n"                             \ |    33     "ld1        {v0.8b}, [%0], #8              \n"                             \ | 
|    34     MEMACCESS(1)                                                               \ |    34     MEMACCESS(1)                                                               \ | 
|    35     "vld1.16    {d2[0]}, [%1]!                 \n"                             \ |    35     "ld1        {v2.h}[0], [%1], #2            \n"                             \ | 
|    36     MEMACCESS(2)                                                               \ |    36     MEMACCESS(2)                                                               \ | 
|    37     "vld1.16    {d2[1]}, [%2]!                 \n"                             \ |    37     "ld1        {v2.h}[1], [%2], #2            \n"                             \ | 
|    38     "vmov.u8    d3, d2                         \n"                             \ |    38     "zip1       v1.8b, v2.8b, v2.8b            \n" | 
|    39     "vzip.u8    d2, d3                         \n" |  | 
|    40  |    39  | 
|    41 // Read 8 Y, 8 U and 8 V from 444 |    40 // Read 8 Y, 8 U and 8 V from 444 | 
|    42 #define READYUV444                                                             \ |    41 #define READYUV444                                                             \ | 
|    43     MEMACCESS(0)                                                               \ |    42     MEMACCESS(0)                                                               \ | 
|    44     "vld1.8     {d0}, [%0]!                    \n"                             \ |    43     "ld1        {v0.8b}, [%0], #8              \n"                             \ | 
|    45     MEMACCESS(1)                                                               \ |    44     MEMACCESS(1)                                                               \ | 
|    46     "vld1.8     {d2}, [%1]!                    \n"                             \ |    45     "ld1        {v1.d}[0], [%1], #8            \n"                             \ | 
|    47     MEMACCESS(2)                                                               \ |    46     MEMACCESS(2)                                                               \ | 
|    48     "vld1.8     {d3}, [%2]!                    \n"                             \ |    47     "ld1        {v1.d}[1], [%2], #8            \n"                             \ | 
|    49     "vpaddl.u8  q1, q1                         \n"                             \ |    48     "uaddlp     v1.8h, v1.16b                  \n"                             \ | 
|    50     "vrshrn.u16 d2, q1, #1                     \n" |    49     "rshrn      v1.8b, v1.8h, #1               \n" | 
|    51  |    50  | 
|    52 // Read 8 Y, and set 4 U and 4 V to 128 |    51 // Read 8 Y, and set 4 U and 4 V to 128 | 
|    53 #define READYUV400                                                             \ |    52 #define READYUV400                                                             \ | 
|    54     MEMACCESS(0)                                                               \ |    53     MEMACCESS(0)                                                               \ | 
|    55     "vld1.8     {d0}, [%0]!                    \n"                             \ |    54     "ld1        {v0.8b}, [%0], #8              \n"                             \ | 
|    56     "vmov.u8    d2, #128                       \n" |    55     "movi       v1.8b , #128                   \n" | 
|    57  |    56  | 
|    58 // Read 8 Y and 4 UV from NV12 |    57 // Read 8 Y and 4 UV from NV12 | 
|    59 #define READNV12                                                               \ |    58 #define READNV12                                                               \ | 
|    60     MEMACCESS(0)                                                               \ |    59     MEMACCESS(0)                                                               \ | 
|    61     "vld1.8     {d0}, [%0]!                    \n"                             \ |    60     "ld1        {v0.8b}, [%0], #8              \n"                             \ | 
|    62     MEMACCESS(1)                                                               \ |    61     MEMACCESS(1)                                                               \ | 
|    63     "vld1.8     {d2}, [%1]!                    \n"                             \ |    62     "ld1        {v2.8b}, [%1], #8              \n"                             \ | 
|    64     "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\ |    63     "uzp1       v1.8b, v2.8b, v2.8b            \n"                             \ | 
|    65     "vuzp.u8    d2, d3                         \n"                             \ |    64     "uzp2       v3.8b, v2.8b, v2.8b            \n"                             \ | 
|    66     "vtrn.u32   d2, d3                         \n" |    65     "ins        v1.s[1], v3.s[0]               \n" | 
|    67  |    66  | 
|    68 // Read 8 Y and 4 VU from NV21 |    67 // Read 8 Y and 4 VU from NV21 | 
|    69 #define READNV21                                                               \ |    68 #define READNV21                                                               \ | 
|    70     MEMACCESS(0)                                                               \ |    69     MEMACCESS(0)                                                               \ | 
|    71     "vld1.8     {d0}, [%0]!                    \n"                             \ |    70     "ld1        {v0.8b}, [%0], #8              \n"                             \ | 
|    72     MEMACCESS(1)                                                               \ |    71     MEMACCESS(1)                                                               \ | 
|    73     "vld1.8     {d2}, [%1]!                    \n"                             \ |    72     "ld1        {v2.8b}, [%1], #8              \n"                             \ | 
|    74     "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\ |    73     "uzp1       v3.8b, v2.8b, v2.8b            \n"                             \ | 
|    75     "vuzp.u8    d3, d2                         \n"                             \ |    74     "uzp2       v1.8b, v2.8b, v2.8b            \n"                             \ | 
|    76     "vtrn.u32   d2, d3                         \n" |    75     "ins        v1.s[1], v3.s[0]               \n" | 
|    77  |    76  | 
|    78 // Read 8 YUY2 |    77 // Read 8 YUY2 | 
|    79 #define READYUY2                                                               \ |    78 #define READYUY2                                                               \ | 
|    80     MEMACCESS(0)                                                               \ |    79     MEMACCESS(0)                                                               \ | 
|    81     "vld2.8     {d0, d2}, [%0]!                \n"                             \ |    80     "ld2        {v0.8b, v1.8b}, [%0], #16      \n"                             \ | 
|    82     "vmov.u8    d3, d2                         \n"                             \ |    81     "uzp2       v3.8b, v1.8b, v1.8b            \n"                             \ | 
|    83     "vuzp.u8    d2, d3                         \n"                             \ |    82     "uzp1       v1.8b, v1.8b, v1.8b            \n"                             \ | 
|    84     "vtrn.u32   d2, d3                         \n" |    83     "ins        v1.s[1], v3.s[0]               \n" | 
|    85  |    84  | 
|    86 // Read 8 UYVY |    85 // Read 8 UYVY | 
|    87 #define READUYVY                                                               \ |    86 #define READUYVY                                                               \ | 
|    88     MEMACCESS(0)                                                               \ |    87     MEMACCESS(0)                                                               \ | 
|    89     "vld2.8     {d2, d3}, [%0]!                \n"                             \ |    88     "ld2        {v2.8b, v3.8b}, [%0], #16      \n"                             \ | 
|    90     "vmov.u8    d0, d3                         \n"                             \ |    89     "orr        v0.8b, v3.8b, v3.8b            \n"                             \ | 
|    91     "vmov.u8    d3, d2                         \n"                             \ |    90     "uzp1       v1.8b, v2.8b, v2.8b            \n"                             \ | 
|    92     "vuzp.u8    d2, d3                         \n"                             \ |    91     "uzp2       v3.8b, v2.8b, v2.8b            \n"                             \ | 
|    93     "vtrn.u32   d2, d3                         \n" |    92     "ins        v1.s[1], v3.s[0]               \n" | 
|    94  |    93  | 
|    95 #define YUV422TORGB                                                            \ |    94 #define YUV422TORGB_SETUP_REG                                                  \ | 
|    96     "veor.u8    d2, d26                        \n"/*subtract 128 from u and v*/\ |    95     "ld1r       {v24.8h}, [%[kUVBiasBGR]], #2  \n"                             \ | 
|    97     "vmull.s8   q8, d2, d24                    \n"/*  u/v B/R component      */\ |    96     "ld1r       {v25.8h}, [%[kUVBiasBGR]], #2  \n"                             \ | 
|    98     "vmull.s8   q9, d2, d25                    \n"/*  u/v G component        */\ |    97     "ld1r       {v26.8h}, [%[kUVBiasBGR]]      \n"                             \ | 
|    99     "vmov.u8    d1, #0                         \n"/*  split odd/even y apart */\ |    98     "ld1r       {v31.4s}, [%[kYToRgb]]         \n"                             \ | 
|   100     "vtrn.u8    d0, d1                         \n"                             \ |    99     "movi       v27.8h, #128                   \n"                             \ | 
|   101     "vsub.s16   q0, q0, q15                    \n"/*  offset y               */\ |   100     "movi       v28.8h, #102                   \n"                             \ | 
|   102     "vmul.s16   q0, q0, q14                    \n"                             \ |   101     "movi       v29.8h, #25                    \n"                             \ | 
|   103     "vadd.s16   d18, d19                       \n"                             \ |   102     "movi       v30.8h, #52                    \n" | 
|   104     "vqadd.s16  d20, d0, d16                   \n" /* B */                     \ |  | 
|   105     "vqadd.s16  d21, d1, d16                   \n"                             \ |  | 
|   106     "vqadd.s16  d22, d0, d17                   \n" /* R */                     \ |  | 
|   107     "vqadd.s16  d23, d1, d17                   \n"                             \ |  | 
|   108     "vqadd.s16  d16, d0, d18                   \n" /* G */                     \ |  | 
|   109     "vqadd.s16  d17, d1, d18                   \n"                             \ |  | 
|   110     "vqshrun.s16 d0, q10, #6                   \n" /* B */                     \ |  | 
|   111     "vqshrun.s16 d1, q11, #6                   \n" /* G */                     \ |  | 
|   112     "vqshrun.s16 d2, q8, #6                    \n" /* R */                     \ |  | 
|   113     "vmovl.u8   q10, d0                        \n"/*  set up for reinterleave*/\ |  | 
|   114     "vmovl.u8   q11, d1                        \n"                             \ |  | 
|   115     "vmovl.u8   q8, d2                         \n"                             \ |  | 
|   116     "vtrn.u8    d20, d21                       \n"                             \ |  | 
|   117     "vtrn.u8    d22, d23                       \n"                             \ |  | 
|   118     "vtrn.u8    d16, d17                       \n"                             \ |  | 
|   119     "vmov.u8    d21, d16                       \n" |  | 
|   120  |   103  | 
|   121 static vec8 kUVToRB  = { 127, 127, 127, 127, 102, 102, 102, 102, |   104 #define YUV422TORGB(vR, vG, vB)                                                \ | 
|   122                          0, 0, 0, 0, 0, 0, 0, 0 }; |   105     "uxtl       v0.8h, v0.8b                   \n" /* Extract Y    */          \ | 
|   123 static vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52, |   106     "shll       v2.8h, v1.8b, #8               \n" /* Replicate UV */          \ | 
|   124                        0, 0, 0, 0, 0, 0, 0, 0 }; |   107     "ushll2     v3.4s, v0.8h, #0               \n" /* Y */                     \ | 
 |   108     "ushll      v0.4s, v0.4h, #0               \n"                             \ | 
 |   109     "mul        v3.4s, v3.4s, v31.4s           \n"                             \ | 
 |   110     "mul        v0.4s, v0.4s, v31.4s           \n"                             \ | 
 |   111     "sqshrun    v0.4h, v0.4s, #16              \n"                             \ | 
 |   112     "sqshrun2   v0.8h, v3.4s, #16              \n" /* Y */                     \ | 
 |   113     "uaddw      v1.8h, v2.8h, v1.8b            \n" /* Replicate UV */          \ | 
 |   114     "mov        v2.d[0], v1.d[1]               \n" /* Extract V */             \ | 
 |   115     "uxtl       v2.8h, v2.8b                   \n"                             \ | 
 |   116     "uxtl       v1.8h, v1.8b                   \n" /* Extract U */             \ | 
 |   117     "mul        v3.8h, v1.8h, v27.8h           \n"                             \ | 
 |   118     "mul        v5.8h, v1.8h, v29.8h           \n"                             \ | 
 |   119     "mul        v6.8h, v2.8h, v30.8h           \n"                             \ | 
 |   120     "mul        v7.8h, v2.8h, v28.8h           \n"                             \ | 
 |   121     "sqadd      v6.8h, v6.8h, v5.8h            \n"                             \ | 
 |   122     "sqadd      " #vB ".8h, v24.8h, v0.8h      \n" /* B */                     \ | 
 |   123     "sqadd      " #vG ".8h, v25.8h, v0.8h      \n" /* G */                     \ | 
 |   124     "sqadd      " #vR ".8h, v26.8h, v0.8h      \n" /* R */                     \ | 
 |   125     "sqadd      " #vB ".8h, " #vB ".8h, v3.8h  \n" /* B */                     \ | 
 |   126     "sqsub      " #vG ".8h, " #vG ".8h, v6.8h  \n" /* G */                     \ | 
 |   127     "sqadd      " #vR ".8h, " #vR ".8h, v7.8h  \n" /* R */                     \ | 
 |   128     "sqshrun    " #vB ".8b, " #vB ".8h, #6     \n" /* B */                     \ | 
 |   129     "sqshrun    " #vG ".8b, " #vG ".8h, #6     \n" /* G */                     \ | 
 |   130     "sqshrun    " #vR ".8b, " #vR ".8h, #6     \n" /* R */                     \ | 
 |   131  | 
 |   132 // YUV to RGB conversion constants. | 
 |   133 // Y contribution to R,G,B.  Scale and bias. | 
 |   134 #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ | 
 |   135 #define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */ | 
 |   136  | 
 |   137 // U and V contributions to R,G,B. | 
 |   138 #define UB -128 /* -min(128, round(2.018 * 64)) */ | 
 |   139 #define UG 25 /* -round(-0.391 * 64) */ | 
 |   140 #define VG 52 /* -round(-0.813 * 64) */ | 
 |   141 #define VR -102 /* -round(1.596 * 64) */ | 
 |   142  | 
 |   143 // Bias values to subtract 16 from Y and 128 from U and V. | 
 |   144 #define BB (UB * 128            - YGB) | 
 |   145 #define BG (UG * 128 + VG * 128 - YGB) | 
 |   146 #define BR            (VR * 128 - YGB) | 
 |   147  | 
 |   148 static vec16 kUVBiasBGR = { BB, BG, BR, 0, 0, 0, 0, 0 }; | 
 |   149 static vec32 kYToRgb = { 0x0101 * YG, 0, 0, 0 }; | 
 |   150  | 
 |   151 #undef YG | 
 |   152 #undef YGB | 
 |   153 #undef UB | 
 |   154 #undef UG | 
 |   155 #undef VG | 
 |   156 #undef VR | 
 |   157 #undef BB | 
 |   158 #undef BG | 
 |   159 #undef BR | 
 |   160  | 
 |   161 #define RGBTOUV_SETUP_REG                                                      \ | 
 |   162     "movi       v20.8h, #56, lsl #0  \n"  /* UB/VR coefficient (0.875) / 2 */  \ | 
 |   163     "movi       v21.8h, #37, lsl #0  \n"  /* UG coefficient (-0.5781) / 2  */  \ | 
 |   164     "movi       v22.8h, #19, lsl #0  \n"  /* UR coefficient (-0.2969) / 2  */  \ | 
 |   165     "movi       v23.8h, #9,  lsl #0  \n"  /* VB coefficient (-0.1406) / 2  */  \ | 
 |   166     "movi       v24.8h, #47, lsl #0  \n"  /* VG coefficient (-0.7344) / 2  */  \ | 
 |   167     "movi       v25.16b, #0x80       \n"  /* 128.5 (0x8080 in 16-bit)      */ | 
 |   168  | 
|   125  |   169  | 
|   126 #ifdef HAS_I444TOARGBROW_NEON |   170 #ifdef HAS_I444TOARGBROW_NEON | 
|   127 void I444ToARGBRow_NEON(const uint8* src_y, |   171 void I444ToARGBRow_NEON(const uint8* src_y, | 
|   128                         const uint8* src_u, |   172                         const uint8* src_u, | 
|   129                         const uint8* src_v, |   173                         const uint8* src_v, | 
|   130                         uint8* dst_argb, |   174                         uint8* dst_argb, | 
|   131                         int width) { |   175                         int width) { | 
|   132   asm volatile ( |   176   asm volatile ( | 
|   133     MEMACCESS(5) |   177     YUV422TORGB_SETUP_REG | 
|   134     "vld1.8     {d24}, [%5]                    \n" |  | 
|   135     MEMACCESS(6) |  | 
|   136     "vld1.8     {d25}, [%6]                    \n" |  | 
|   137     "vmov.u8    d26, #128                      \n" |  | 
|   138     "vmov.u16   q14, #74                       \n" |  | 
|   139     "vmov.u16   q15, #16                       \n" |  | 
|   140     ".p2align   2                              \n" |  | 
|   141   "1:                                          \n" |   178   "1:                                          \n" | 
|   142     READYUV444 |   179     READYUV444 | 
|   143     YUV422TORGB |   180     YUV422TORGB(v22, v21, v20) | 
|   144     "subs       %4, %4, #8                     \n" |   181     "subs       %4, %4, #8                     \n" | 
|   145     "vmov.u8    d23, #255                      \n" |   182     "movi       v23.8b, #255                   \n" /* A */ | 
|   146     MEMACCESS(3) |   183     MEMACCESS(3) | 
|   147     "vst4.8     {d20, d21, d22, d23}, [%3]!    \n" |   184     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" | 
|   148     "bgt        1b                             \n" |   185     "b.gt       1b                             \n" | 
|   149     : "+r"(src_y),     // %0 |   186     : "+r"(src_y),     // %0 | 
|   150       "+r"(src_u),     // %1 |   187       "+r"(src_u),     // %1 | 
|   151       "+r"(src_v),     // %2 |   188       "+r"(src_v),     // %2 | 
|   152       "+r"(dst_argb),  // %3 |   189       "+r"(dst_argb),  // %3 | 
|   153       "+r"(width)      // %4 |   190       "+r"(width)      // %4 | 
|   154     : "r"(&kUVToRB),   // %5 |   191     : [kUVBiasBGR]"r"(&kUVBiasBGR), | 
|   155       "r"(&kUVToG)     // %6 |   192       [kYToRgb]"r"(&kYToRgb) | 
|   156     : "cc", "memory", "q0", "q1", "q2", "q3", |   193     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 
|   157       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |   194       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 
|   158   ); |   195   ); | 
|   159 } |   196 } | 
|   160 #endif  // HAS_I444TOARGBROW_NEON |   197 #endif  // HAS_I444TOARGBROW_NEON | 
|   161  |   198  | 
|   162 #ifdef HAS_I422TOARGBROW_NEON |   199 #ifdef HAS_I422TOARGBROW_NEON | 
|   163 void I422ToARGBRow_NEON(const uint8* src_y, |   200 void I422ToARGBRow_NEON(const uint8* src_y, | 
|   164                         const uint8* src_u, |   201                         const uint8* src_u, | 
|   165                         const uint8* src_v, |   202                         const uint8* src_v, | 
|   166                         uint8* dst_argb, |   203                         uint8* dst_argb, | 
|   167                         int width) { |   204                         int width) { | 
|   168   asm volatile ( |   205   asm volatile ( | 
|   169     MEMACCESS(5) |   206     YUV422TORGB_SETUP_REG | 
|   170     "vld1.8     {d24}, [%5]                    \n" |  | 
|   171     MEMACCESS(6) |  | 
|   172     "vld1.8     {d25}, [%6]                    \n" |  | 
|   173     "vmov.u8    d26, #128                      \n" |  | 
|   174     "vmov.u16   q14, #74                       \n" |  | 
|   175     "vmov.u16   q15, #16                       \n" |  | 
|   176     ".p2align   2                              \n" |  | 
|   177   "1:                                          \n" |   207   "1:                                          \n" | 
|   178     READYUV422 |   208     READYUV422 | 
|   179     YUV422TORGB |   209     YUV422TORGB(v22, v21, v20) | 
|   180     "subs       %4, %4, #8                     \n" |   210     "subs       %4, %4, #8                     \n" | 
|   181     "vmov.u8    d23, #255                      \n" |   211     "movi       v23.8b, #255                   \n" /* A */ | 
|   182     MEMACCESS(3) |   212     MEMACCESS(3) | 
|   183     "vst4.8     {d20, d21, d22, d23}, [%3]!    \n" |   213     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n" | 
|   184     "bgt        1b                             \n" |   214     "b.gt       1b                             \n" | 
|   185     : "+r"(src_y),     // %0 |   215     : "+r"(src_y),     // %0 | 
|   186       "+r"(src_u),     // %1 |   216       "+r"(src_u),     // %1 | 
|   187       "+r"(src_v),     // %2 |   217       "+r"(src_v),     // %2 | 
|   188       "+r"(dst_argb),  // %3 |   218       "+r"(dst_argb),  // %3 | 
|   189       "+r"(width)      // %4 |   219       "+r"(width)      // %4 | 
|   190     : "r"(&kUVToRB),   // %5 |   220     : [kUVBiasBGR]"r"(&kUVBiasBGR), | 
|   191       "r"(&kUVToG)     // %6 |   221       [kYToRgb]"r"(&kYToRgb) | 
|   192     : "cc", "memory", "q0", "q1", "q2", "q3", |   222     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 
|   193       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |   223       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 
|   194   ); |   224   ); | 
|   195 } |   225 } | 
|   196 #endif  // HAS_I422TOARGBROW_NEON |   226 #endif  // HAS_I422TOARGBROW_NEON | 
|   197  |   227  | 
|   198 #ifdef HAS_I411TOARGBROW_NEON |   228 #ifdef HAS_I411TOARGBROW_NEON | 
|   199 void I411ToARGBRow_NEON(const uint8* src_y, |   229 void I411ToARGBRow_NEON(const uint8* src_y, | 
|   200                         const uint8* src_u, |   230                         const uint8* src_u, | 
|   201                         const uint8* src_v, |   231                         const uint8* src_v, | 
|   202                         uint8* dst_argb, |   232                         uint8* dst_argb, | 
|   203                         int width) { |   233                         int width) { | 
|   204   asm volatile ( |   234   asm volatile ( | 
|   205     MEMACCESS(5) |   235     YUV422TORGB_SETUP_REG | 
|   206     "vld1.8     {d24}, [%5]                    \n" |  | 
|   207     MEMACCESS(6) |  | 
|   208     "vld1.8     {d25}, [%6]                    \n" |  | 
|   209     "vmov.u8    d26, #128                      \n" |  | 
|   210     "vmov.u16   q14, #74                       \n" |  | 
|   211     "vmov.u16   q15, #16                       \n" |  | 
|   212     ".p2align   2                              \n" |  | 
|   213   "1:                                          \n" |   236   "1:                                          \n" | 
|   214     READYUV411 |   237     READYUV411 | 
|   215     YUV422TORGB |   238     YUV422TORGB(v22, v21, v20) | 
|   216     "subs       %4, %4, #8                     \n" |   239     "subs       %4, %4, #8                     \n" | 
|   217     "vmov.u8    d23, #255                      \n" |   240     "movi       v23.8b, #255                   \n" /* A */ | 
|   218     MEMACCESS(3) |   241     MEMACCESS(3) | 
|   219     "vst4.8     {d20, d21, d22, d23}, [%3]!    \n" |   242     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n" | 
|   220     "bgt        1b                             \n" |   243     "b.gt       1b                             \n" | 
|   221     : "+r"(src_y),     // %0 |   244     : "+r"(src_y),     // %0 | 
|   222       "+r"(src_u),     // %1 |   245       "+r"(src_u),     // %1 | 
|   223       "+r"(src_v),     // %2 |   246       "+r"(src_v),     // %2 | 
|   224       "+r"(dst_argb),  // %3 |   247       "+r"(dst_argb),  // %3 | 
|   225       "+r"(width)      // %4 |   248       "+r"(width)      // %4 | 
|   226     : "r"(&kUVToRB),   // %5 |   249     : [kUVBiasBGR]"r"(&kUVBiasBGR), | 
|   227       "r"(&kUVToG)     // %6 |   250       [kYToRgb]"r"(&kYToRgb) | 
|   228     : "cc", "memory", "q0", "q1", "q2", "q3", |   251     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 
|   229       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |   252       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 
|   230   ); |   253   ); | 
|   231 } |   254 } | 
|   232 #endif  // HAS_I411TOARGBROW_NEON |   255 #endif  // HAS_I411TOARGBROW_NEON | 
|   233  |   256  | 
|   234 #ifdef HAS_I422TOBGRAROW_NEON |   257 #ifdef HAS_I422TOBGRAROW_NEON | 
|   235 void I422ToBGRARow_NEON(const uint8* src_y, |   258 void I422ToBGRARow_NEON(const uint8* src_y, | 
|   236                         const uint8* src_u, |   259                         const uint8* src_u, | 
|   237                         const uint8* src_v, |   260                         const uint8* src_v, | 
|   238                         uint8* dst_bgra, |   261                         uint8* dst_bgra, | 
|   239                         int width) { |   262                         int width) { | 
|   240   asm volatile ( |   263   asm volatile ( | 
|   241     MEMACCESS(5) |   264     YUV422TORGB_SETUP_REG | 
|   242     "vld1.8     {d24}, [%5]                    \n" |  | 
|   243     MEMACCESS(6) |  | 
|   244     "vld1.8     {d25}, [%6]                    \n" |  | 
|   245     "vmov.u8    d26, #128                      \n" |  | 
|   246     "vmov.u16   q14, #74                       \n" |  | 
|   247     "vmov.u16   q15, #16                       \n" |  | 
|   248     ".p2align   2                              \n" |  | 
|   249   "1:                                          \n" |   265   "1:                                          \n" | 
|   250     READYUV422 |   266     READYUV422 | 
|   251     YUV422TORGB |   267     YUV422TORGB(v21, v22, v23) | 
|   252     "subs       %4, %4, #8                     \n" |   268     "subs       %4, %4, #8                     \n" | 
|   253     "vswp.u8    d20, d22                       \n" |   269     "movi       v20.8b, #255                   \n" /* A */ | 
|   254     "vmov.u8    d19, #255                      \n" |  | 
|   255     MEMACCESS(3) |   270     MEMACCESS(3) | 
|   256     "vst4.8     {d19, d20, d21, d22}, [%3]!    \n" |   271     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n" | 
|   257     "bgt        1b                             \n" |   272     "b.gt       1b                             \n" | 
|   258     : "+r"(src_y),     // %0 |   273     : "+r"(src_y),     // %0 | 
|   259       "+r"(src_u),     // %1 |   274       "+r"(src_u),     // %1 | 
|   260       "+r"(src_v),     // %2 |   275       "+r"(src_v),     // %2 | 
|   261       "+r"(dst_bgra),  // %3 |   276       "+r"(dst_bgra),  // %3 | 
|   262       "+r"(width)      // %4 |   277       "+r"(width)      // %4 | 
|   263     : "r"(&kUVToRB),   // %5 |   278     : [kUVBiasBGR]"r"(&kUVBiasBGR), | 
|   264       "r"(&kUVToG)     // %6 |   279       [kYToRgb]"r"(&kYToRgb) | 
|   265     : "cc", "memory", "q0", "q1", "q2", "q3", |   280     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 
|   266       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |   281       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 
|   267   ); |   282   ); | 
|   268 } |   283 } | 
|   269 #endif  // HAS_I422TOBGRAROW_NEON |   284 #endif  // HAS_I422TOBGRAROW_NEON | 
|   270  |   285  | 
|   271 #ifdef HAS_I422TOABGRROW_NEON |   286 #ifdef HAS_I422TOABGRROW_NEON | 
|   272 void I422ToABGRRow_NEON(const uint8* src_y, |   287 void I422ToABGRRow_NEON(const uint8* src_y, | 
|   273                         const uint8* src_u, |   288                         const uint8* src_u, | 
|   274                         const uint8* src_v, |   289                         const uint8* src_v, | 
|   275                         uint8* dst_abgr, |   290                         uint8* dst_abgr, | 
|   276                         int width) { |   291                         int width) { | 
|   277   asm volatile ( |   292   asm volatile ( | 
|   278     MEMACCESS(5) |   293     YUV422TORGB_SETUP_REG | 
|   279     "vld1.8     {d24}, [%5]                    \n" |  | 
|   280     MEMACCESS(6) |  | 
|   281     "vld1.8     {d25}, [%6]                    \n" |  | 
|   282     "vmov.u8    d26, #128                      \n" |  | 
|   283     "vmov.u16   q14, #74                       \n" |  | 
|   284     "vmov.u16   q15, #16                       \n" |  | 
|   285     ".p2align   2                              \n" |  | 
|   286   "1:                                          \n" |   294   "1:                                          \n" | 
|   287     READYUV422 |   295     READYUV422 | 
|   288     YUV422TORGB |   296     YUV422TORGB(v20, v21, v22) | 
|   289     "subs       %4, %4, #8                     \n" |   297     "subs       %4, %4, #8                     \n" | 
|   290     "vswp.u8    d20, d22                       \n" |   298     "movi       v23.8b, #255                   \n" /* A */ | 
|   291     "vmov.u8    d23, #255                      \n" |  | 
|   292     MEMACCESS(3) |   299     MEMACCESS(3) | 
|   293     "vst4.8     {d20, d21, d22, d23}, [%3]!    \n" |   300     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n" | 
|   294     "bgt        1b                             \n" |   301     "b.gt       1b                             \n" | 
|   295     : "+r"(src_y),     // %0 |   302     : "+r"(src_y),     // %0 | 
|   296       "+r"(src_u),     // %1 |   303       "+r"(src_u),     // %1 | 
|   297       "+r"(src_v),     // %2 |   304       "+r"(src_v),     // %2 | 
|   298       "+r"(dst_abgr),  // %3 |   305       "+r"(dst_abgr),  // %3 | 
|   299       "+r"(width)      // %4 |   306       "+r"(width)      // %4 | 
|   300     : "r"(&kUVToRB),   // %5 |   307     : [kUVBiasBGR]"r"(&kUVBiasBGR), | 
|   301       "r"(&kUVToG)     // %6 |   308       [kYToRgb]"r"(&kYToRgb) | 
|   302     : "cc", "memory", "q0", "q1", "q2", "q3", |   309     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 
|   303       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |   310       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 
|   304   ); |   311   ); | 
|   305 } |   312 } | 
|   306 #endif  // HAS_I422TOABGRROW_NEON |   313 #endif  // HAS_I422TOABGRROW_NEON | 
|   307  |   314  | 
|   308 #ifdef HAS_I422TORGBAROW_NEON |   315 #ifdef HAS_I422TORGBAROW_NEON | 
|   309 void I422ToRGBARow_NEON(const uint8* src_y, |   316 void I422ToRGBARow_NEON(const uint8* src_y, | 
|   310                         const uint8* src_u, |   317                         const uint8* src_u, | 
|   311                         const uint8* src_v, |   318                         const uint8* src_v, | 
|   312                         uint8* dst_rgba, |   319                         uint8* dst_rgba, | 
|   313                         int width) { |   320                         int width) { | 
|   314   asm volatile ( |   321   asm volatile ( | 
|   315     MEMACCESS(5) |   322     YUV422TORGB_SETUP_REG | 
|   316     "vld1.8     {d24}, [%5]                    \n" |  | 
|   317     MEMACCESS(6) |  | 
|   318     "vld1.8     {d25}, [%6]                    \n" |  | 
|   319     "vmov.u8    d26, #128                      \n" |  | 
|   320     "vmov.u16   q14, #74                       \n" |  | 
|   321     "vmov.u16   q15, #16                       \n" |  | 
|   322     ".p2align   2                              \n" |  | 
|   323   "1:                                          \n" |   323   "1:                                          \n" | 
|   324     READYUV422 |   324     READYUV422 | 
|   325     YUV422TORGB |   325     YUV422TORGB(v23, v22, v21) | 
|   326     "subs       %4, %4, #8                     \n" |   326     "subs       %4, %4, #8                     \n" | 
|   327     "vmov.u8    d19, #255                      \n" |   327     "movi       v20.8b, #255                   \n" /* A */ | 
|   328     MEMACCESS(3) |   328     MEMACCESS(3) | 
|   329     "vst4.8     {d19, d20, d21, d22}, [%3]!    \n" |   329     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n" | 
|   330     "bgt        1b                             \n" |   330     "b.gt       1b                             \n" | 
|   331     : "+r"(src_y),     // %0 |   331     : "+r"(src_y),     // %0 | 
|   332       "+r"(src_u),     // %1 |   332       "+r"(src_u),     // %1 | 
|   333       "+r"(src_v),     // %2 |   333       "+r"(src_v),     // %2 | 
|   334       "+r"(dst_rgba),  // %3 |   334       "+r"(dst_rgba),  // %3 | 
|   335       "+r"(width)      // %4 |   335       "+r"(width)      // %4 | 
|   336     : "r"(&kUVToRB),   // %5 |   336     : [kUVBiasBGR]"r"(&kUVBiasBGR), | 
|   337       "r"(&kUVToG)     // %6 |   337       [kYToRgb]"r"(&kYToRgb) | 
|   338     : "cc", "memory", "q0", "q1", "q2", "q3", |   338     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 
|   339       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |   339       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 
|   340   ); |   340   ); | 
|   341 } |   341 } | 
|   342 #endif  // HAS_I422TORGBAROW_NEON |   342 #endif  // HAS_I422TORGBAROW_NEON | 
|   343  |   343  | 
|   344 #ifdef HAS_I422TORGB24ROW_NEON |   344 #ifdef HAS_I422TORGB24ROW_NEON | 
|   345 void I422ToRGB24Row_NEON(const uint8* src_y, |   345 void I422ToRGB24Row_NEON(const uint8* src_y, | 
|   346                          const uint8* src_u, |   346                          const uint8* src_u, | 
|   347                          const uint8* src_v, |   347                          const uint8* src_v, | 
|   348                          uint8* dst_rgb24, |   348                          uint8* dst_rgb24, | 
|   349                          int width) { |   349                          int width) { | 
|   350   asm volatile ( |   350   asm volatile ( | 
|   351     MEMACCESS(5) |   351     YUV422TORGB_SETUP_REG | 
|   352     "vld1.8     {d24}, [%5]                    \n" |  | 
|   353     MEMACCESS(6) |  | 
|   354     "vld1.8     {d25}, [%6]                    \n" |  | 
|   355     "vmov.u8    d26, #128                      \n" |  | 
|   356     "vmov.u16   q14, #74                       \n" |  | 
|   357     "vmov.u16   q15, #16                       \n" |  | 
|   358     ".p2align   2                              \n" |  | 
|   359   "1:                                          \n" |   352   "1:                                          \n" | 
|   360     READYUV422 |   353     READYUV422 | 
|   361     YUV422TORGB |   354     YUV422TORGB(v22, v21, v20) | 
|   362     "subs       %4, %4, #8                     \n" |   355     "subs       %4, %4, #8                     \n" | 
|   363     MEMACCESS(3) |   356     MEMACCESS(3) | 
|   364     "vst3.8     {d20, d21, d22}, [%3]!         \n" |   357     "st3        {v20.8b,v21.8b,v22.8b}, [%3], #24     \n" | 
|   365     "bgt        1b                             \n" |   358     "b.gt       1b                             \n" | 
|   366     : "+r"(src_y),      // %0 |   359     : "+r"(src_y),     // %0 | 
|   367       "+r"(src_u),      // %1 |   360       "+r"(src_u),     // %1 | 
|   368       "+r"(src_v),      // %2 |   361       "+r"(src_v),     // %2 | 
|   369       "+r"(dst_rgb24),  // %3 |   362       "+r"(dst_rgb24), // %3 | 
|   370       "+r"(width)       // %4 |   363       "+r"(width)      // %4 | 
|   371     : "r"(&kUVToRB),    // %5 |   364     : [kUVBiasBGR]"r"(&kUVBiasBGR), | 
|   372       "r"(&kUVToG)      // %6 |   365       [kYToRgb]"r"(&kYToRgb) | 
|   373     : "cc", "memory", "q0", "q1", "q2", "q3", |   366     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 
|   374       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |   367       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 
|   375   ); |   368   ); | 
|   376 } |   369 } | 
|   377 #endif  // HAS_I422TORGB24ROW_NEON |   370 #endif  // HAS_I422TORGB24ROW_NEON | 
|   378  |   371  | 
|   379 #ifdef HAS_I422TORAWROW_NEON |   372 #ifdef HAS_I422TORAWROW_NEON | 
|   380 void I422ToRAWRow_NEON(const uint8* src_y, |   373 void I422ToRAWRow_NEON(const uint8* src_y, | 
|   381                        const uint8* src_u, |   374                        const uint8* src_u, | 
|   382                        const uint8* src_v, |   375                        const uint8* src_v, | 
|   383                        uint8* dst_raw, |   376                        uint8* dst_raw, | 
|   384                        int width) { |   377                        int width) { | 
|   385   asm volatile ( |   378   asm volatile ( | 
|   386     MEMACCESS(5) |   379     YUV422TORGB_SETUP_REG | 
|   387     "vld1.8     {d24}, [%5]                    \n" |  | 
|   388     MEMACCESS(6) |  | 
|   389     "vld1.8     {d25}, [%6]                    \n" |  | 
|   390     "vmov.u8    d26, #128                      \n" |  | 
|   391     "vmov.u16   q14, #74                       \n" |  | 
|   392     "vmov.u16   q15, #16                       \n" |  | 
|   393     ".p2align   2                              \n" |  | 
|   394   "1:                                          \n" |   380   "1:                                          \n" | 
|   395     READYUV422 |   381     READYUV422 | 
|   396     YUV422TORGB |   382     YUV422TORGB(v20, v21, v22) | 
|   397     "subs       %4, %4, #8                     \n" |   383     "subs       %4, %4, #8                     \n" | 
|   398     "vswp.u8    d20, d22                       \n" |  | 
|   399     MEMACCESS(3) |   384     MEMACCESS(3) | 
|   400     "vst3.8     {d20, d21, d22}, [%3]!         \n" |   385     "st3        {v20.8b,v21.8b,v22.8b}, [%3], #24     \n" | 
|   401     "bgt        1b                             \n" |   386     "b.gt       1b                             \n" | 
|   402     : "+r"(src_y),    // %0 |   387     : "+r"(src_y),     // %0 | 
|   403       "+r"(src_u),    // %1 |   388       "+r"(src_u),     // %1 | 
|   404       "+r"(src_v),    // %2 |   389       "+r"(src_v),     // %2 | 
|   405       "+r"(dst_raw),  // %3 |   390       "+r"(dst_raw),   // %3 | 
|   406       "+r"(width)     // %4 |   391       "+r"(width)      // %4 | 
|   407     : "r"(&kUVToRB),  // %5 |   392     : [kUVBiasBGR]"r"(&kUVBiasBGR), | 
|   408       "r"(&kUVToG)    // %6 |   393       [kYToRgb]"r"(&kYToRgb) | 
|   409     : "cc", "memory", "q0", "q1", "q2", "q3", |   394     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 
|   410       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |   395       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 
|   411   ); |   396   ); | 
|   412 } |   397 } | 
|   413 #endif  // HAS_I422TORAWROW_NEON |   398 #endif  // HAS_I422TORAWROW_NEON | 
|   414  |   399  | 
|   415 #define ARGBTORGB565                                                           \ |   400 #define ARGBTORGB565                                                           \ | 
|   416     "vshr.u8    d20, d20, #3                   \n"  /* B                    */ \ |   401     "shll       v0.8h,  v22.8b, #8             \n"  /* R                    */ \ | 
|   417     "vshr.u8    d21, d21, #2                   \n"  /* G                    */ \ |   402     "shll       v20.8h, v20.8b, #8             \n"  /* B                    */ \ | 
|   418     "vshr.u8    d22, d22, #3                   \n"  /* R                    */ \ |   403     "shll       v21.8h, v21.8b, #8             \n"  /* G                    */ \ | 
|   419     "vmovl.u8   q8, d20                        \n"  /* B                    */ \ |   404     "sri        v0.8h,  v21.8h, #5             \n"  /* RG                   */ \ | 
|   420     "vmovl.u8   q9, d21                        \n"  /* G                    */ \ |   405     "sri        v0.8h,  v20.8h, #11            \n"  /* RGB                  */ | 
|   421     "vmovl.u8   q10, d22                       \n"  /* R                    */ \ |  | 
|   422     "vshl.u16   q9, q9, #5                     \n"  /* G                    */ \ |  | 
|   423     "vshl.u16   q10, q10, #11                  \n"  /* R                    */ \ |  | 
|   424     "vorr       q0, q8, q9                     \n"  /* BG                   */ \ |  | 
|   425     "vorr       q0, q0, q10                    \n"  /* BGR                  */ |  | 
|   426  |   406  | 
|   427 #ifdef HAS_I422TORGB565ROW_NEON |   407 #ifdef HAS_I422TORGB565ROW_NEON | 
|   428 void I422ToRGB565Row_NEON(const uint8* src_y, |   408 void I422ToRGB565Row_NEON(const uint8* src_y, | 
|   429                           const uint8* src_u, |   409                           const uint8* src_u, | 
|   430                           const uint8* src_v, |   410                           const uint8* src_v, | 
|   431                           uint8* dst_rgb565, |   411                           uint8* dst_rgb565, | 
|   432                           int width) { |   412                           int width) { | 
|   433   asm volatile ( |   413   asm volatile ( | 
|   434     MEMACCESS(5) |   414     YUV422TORGB_SETUP_REG | 
|   435     "vld1.8     {d24}, [%5]                    \n" |  | 
|   436     MEMACCESS(6) |  | 
|   437     "vld1.8     {d25}, [%6]                    \n" |  | 
|   438     "vmov.u8    d26, #128                      \n" |  | 
|   439     "vmov.u16   q14, #74                       \n" |  | 
|   440     "vmov.u16   q15, #16                       \n" |  | 
|   441     ".p2align   2                              \n" |  | 
|   442   "1:                                          \n" |   415   "1:                                          \n" | 
|   443     READYUV422 |   416     READYUV422 | 
|   444     YUV422TORGB |   417     YUV422TORGB(v22, v21, v20) | 
|   445     "subs       %4, %4, #8                     \n" |   418     "subs       %4, %4, #8                     \n" | 
|   446     ARGBTORGB565 |   419     ARGBTORGB565 | 
|   447     MEMACCESS(3) |   420     MEMACCESS(3) | 
|   448     "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels RGB565. |   421     "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels RGB565. | 
|   449     "bgt        1b                             \n" |   422     "b.gt       1b                             \n" | 
|   450     : "+r"(src_y),    // %0 |   423     : "+r"(src_y),    // %0 | 
|   451       "+r"(src_u),    // %1 |   424       "+r"(src_u),    // %1 | 
|   452       "+r"(src_v),    // %2 |   425       "+r"(src_v),    // %2 | 
|   453       "+r"(dst_rgb565),  // %3 |   426       "+r"(dst_rgb565),  // %3 | 
|   454       "+r"(width)     // %4 |   427       "+r"(width)     // %4 | 
|   455     : "r"(&kUVToRB),  // %5 |   428     : [kUVBiasBGR]"r"(&kUVBiasBGR), | 
|   456       "r"(&kUVToG)    // %6 |   429       [kYToRgb]"r"(&kYToRgb) | 
|   457     : "cc", "memory", "q0", "q1", "q2", "q3", |   430     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 
|   458       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |   431       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 
|   459   ); |   432   ); | 
|   460 } |   433 } | 
|   461 #endif  // HAS_I422TORGB565ROW_NEON |   434 #endif  // HAS_I422TORGB565ROW_NEON | 
|   462  |   435  | 
|   463 #define ARGBTOARGB1555                                                         \ |   436 #define ARGBTOARGB1555                                                         \ | 
|   464     "vshr.u8    q10, q10, #3                   \n"  /* B                    */ \ |   437     "shll       v0.8h,  v23.8b, #8             \n"  /* A                    */ \ | 
|   465     "vshr.u8    d22, d22, #3                   \n"  /* R                    */ \ |   438     "shll       v22.8h, v22.8b, #8             \n"  /* R                    */ \ | 
|   466     "vshr.u8    d23, d23, #7                   \n"  /* A                    */ \ |   439     "shll       v20.8h, v20.8b, #8             \n"  /* B                    */ \ | 
|   467     "vmovl.u8   q8, d20                        \n"  /* B                    */ \ |   440     "shll       v21.8h, v21.8b, #8             \n"  /* G                    */ \ | 
|   468     "vmovl.u8   q9, d21                        \n"  /* G                    */ \ |   441     "sri        v0.8h,  v22.8h, #1             \n"  /* AR                   */ \ | 
|   469     "vmovl.u8   q10, d22                       \n"  /* R                    */ \ |   442     "sri        v0.8h,  v21.8h, #6             \n"  /* ARG                  */ \ | 
|   470     "vmovl.u8   q11, d23                       \n"  /* A                    */ \ |   443     "sri        v0.8h,  v20.8h, #11            \n"  /* ARGB                 */ | 
|   471     "vshl.u16   q9, q9, #5                     \n"  /* G                    */ \ |  | 
|   472     "vshl.u16   q10, q10, #10                  \n"  /* R                    */ \ |  | 
|   473     "vshl.u16   q11, q11, #15                  \n"  /* A                    */ \ |  | 
|   474     "vorr       q0, q8, q9                     \n"  /* BG                   */ \ |  | 
|   475     "vorr       q1, q10, q11                   \n"  /* RA                   */ \ |  | 
|   476     "vorr       q0, q0, q1                     \n"  /* BGRA                 */ |  | 
|   477  |   444  | 
|   478 #ifdef HAS_I422TOARGB1555ROW_NEON |   445 #ifdef HAS_I422TOARGB1555ROW_NEON | 
|   479 void I422ToARGB1555Row_NEON(const uint8* src_y, |   446 void I422ToARGB1555Row_NEON(const uint8* src_y, | 
|   480                             const uint8* src_u, |   447                             const uint8* src_u, | 
|   481                             const uint8* src_v, |   448                             const uint8* src_v, | 
|   482                             uint8* dst_argb1555, |   449                             uint8* dst_argb1555, | 
|   483                             int width) { |   450                             int width) { | 
|   484   asm volatile ( |   451   asm volatile ( | 
|   485     MEMACCESS(5) |   452     YUV422TORGB_SETUP_REG | 
|   486     "vld1.8     {d24}, [%5]                    \n" |  | 
|   487     MEMACCESS(6) |  | 
|   488     "vld1.8     {d25}, [%6]                    \n" |  | 
|   489     "vmov.u8    d26, #128                      \n" |  | 
|   490     "vmov.u16   q14, #74                       \n" |  | 
|   491     "vmov.u16   q15, #16                       \n" |  | 
|   492     ".p2align   2                              \n" |  | 
|   493   "1:                                          \n" |   453   "1:                                          \n" | 
|   494     READYUV422 |   454     READYUV422 | 
|   495     YUV422TORGB |   455     YUV422TORGB(v22, v21, v20) | 
|   496     "subs       %4, %4, #8                     \n" |   456     "subs       %4, %4, #8                     \n" | 
|   497     "vmov.u8    d23, #255                      \n" |   457     "movi       v23.8b, #255                   \n" | 
|   498     ARGBTOARGB1555 |   458     ARGBTOARGB1555 | 
|   499     MEMACCESS(3) |   459     MEMACCESS(3) | 
|   500     "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB1555. |   460     "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels ARGB1555. | 
|   501     "bgt        1b                             \n" |   461     "b.gt       1b                             \n" | 
|   502     : "+r"(src_y),    // %0 |   462     : "+r"(src_y),    // %0 | 
|   503       "+r"(src_u),    // %1 |   463       "+r"(src_u),    // %1 | 
|   504       "+r"(src_v),    // %2 |   464       "+r"(src_v),    // %2 | 
|   505       "+r"(dst_argb1555),  // %3 |   465       "+r"(dst_argb1555),  // %3 | 
|   506       "+r"(width)     // %4 |   466       "+r"(width)     // %4 | 
|   507     : "r"(&kUVToRB),  // %5 |   467     : [kUVBiasBGR]"r"(&kUVBiasBGR), | 
|   508       "r"(&kUVToG)    // %6 |   468       [kYToRgb]"r"(&kYToRgb) | 
|   509     : "cc", "memory", "q0", "q1", "q2", "q3", |   469     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 
|   510       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |   470       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 
|   511   ); |   471   ); | 
|   512 } |   472 } | 
|   513 #endif  // HAS_I422TOARGB1555ROW_NEON |   473 #endif  // HAS_I422TOARGB1555ROW_NEON | 
|   514  |   474  | 
|   515 #define ARGBTOARGB4444                                                         \ |   475 #define ARGBTOARGB4444                                                         \ | 
|   516     "vshr.u8    d20, d20, #4                   \n"  /* B                    */ \ |   476     /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f        */ \ | 
|   517     "vbic.32    d21, d21, d4                   \n"  /* G                    */ \ |   477     "ushr       v20.8b, v20.8b, #4             \n"  /* B                    */ \ | 
|   518     "vshr.u8    d22, d22, #4                   \n"  /* R                    */ \ |   478     "bic        v21.8b, v21.8b, v4.8b          \n"  /* G                    */ \ | 
|   519     "vbic.32    d23, d23, d4                   \n"  /* A                    */ \ |   479     "ushr       v22.8b, v22.8b, #4             \n"  /* R                    */ \ | 
|   520     "vorr       d0, d20, d21                   \n"  /* BG                   */ \ |   480     "bic        v23.8b, v23.8b, v4.8b          \n"  /* A                    */ \ | 
|   521     "vorr       d1, d22, d23                   \n"  /* RA                   */ \ |   481     "orr        v0.8b,  v20.8b, v21.8b         \n"  /* BG                   */ \ | 
|   522     "vzip.u8    d0, d1                         \n"  /* BGRA                 */ |   482     "orr        v1.8b,  v22.8b, v23.8b         \n"  /* RA                   */ \ | 
 |   483     "zip1       v0.16b, v0.16b, v1.16b         \n"  /* BGRA                 */ | 
|   523  |   484  | 
|   524 #ifdef HAS_I422TOARGB4444ROW_NEON |   485 #ifdef HAS_I422TOARGB4444ROW_NEON | 
|   525 void I422ToARGB4444Row_NEON(const uint8* src_y, |   486 void I422ToARGB4444Row_NEON(const uint8* src_y, | 
|   526                             const uint8* src_u, |   487                             const uint8* src_u, | 
|   527                             const uint8* src_v, |   488                             const uint8* src_v, | 
|   528                             uint8* dst_argb4444, |   489                             uint8* dst_argb4444, | 
|   529                             int width) { |   490                             int width) { | 
|   530   asm volatile ( |   491   asm volatile ( | 
|   531     MEMACCESS(5) |   492     YUV422TORGB_SETUP_REG | 
|   532     "vld1.8     {d24}, [%5]                    \n" |   493     "movi       v4.16b, #0x0f                  \n"  // bits to clear with bic. | 
|   533     MEMACCESS(6) |  | 
|   534     "vld1.8     {d25}, [%6]                    \n" |  | 
|   535     "vmov.u8    d26, #128                      \n" |  | 
|   536     "vmov.u16   q14, #74                       \n" |  | 
|   537     "vmov.u16   q15, #16                       \n" |  | 
|   538     "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic. |  | 
|   539     ".p2align   2                              \n" |  | 
|   540   "1:                                          \n" |   494   "1:                                          \n" | 
|   541     READYUV422 |   495     READYUV422 | 
|   542     YUV422TORGB |   496     YUV422TORGB(v22, v21, v20) | 
|   543     "subs       %4, %4, #8                     \n" |   497     "subs       %4, %4, #8                     \n" | 
|   544     "vmov.u8    d23, #255                      \n" |   498     "movi       v23.8b, #255                   \n" | 
|   545     ARGBTOARGB4444 |   499     ARGBTOARGB4444 | 
|   546     MEMACCESS(3) |   500     MEMACCESS(3) | 
|   547     "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB4444. |   501     "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels ARGB4444. | 
|   548     "bgt        1b                             \n" |   502     "b.gt       1b                             \n" | 
|   549     : "+r"(src_y),    // %0 |   503     : "+r"(src_y),    // %0 | 
|   550       "+r"(src_u),    // %1 |   504       "+r"(src_u),    // %1 | 
|   551       "+r"(src_v),    // %2 |   505       "+r"(src_v),    // %2 | 
|   552       "+r"(dst_argb4444),  // %3 |   506       "+r"(dst_argb4444),  // %3 | 
|   553       "+r"(width)     // %4 |   507       "+r"(width)     // %4 | 
|   554     : "r"(&kUVToRB),  // %5 |   508     : [kUVBiasBGR]"r"(&kUVBiasBGR), | 
|   555       "r"(&kUVToG)    // %6 |   509       [kYToRgb]"r"(&kYToRgb) | 
|   556     : "cc", "memory", "q0", "q1", "q2", "q3", |   510     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 
|   557       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |   511       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 
|   558   ); |   512   ); | 
|   559 } |   513 } | 
|   560 #endif  // HAS_I422TOARGB4444ROW_NEON |   514 #endif  // HAS_I422TOARGB4444ROW_NEON | 
|   561  |   515  | 
|   562 #ifdef HAS_YTOARGBROW_NEON |   516 #ifdef HAS_YTOARGBROW_NEON | 
|   563 void YToARGBRow_NEON(const uint8* src_y, |   517 void YToARGBRow_NEON(const uint8* src_y, | 
|   564                      uint8* dst_argb, |   518                      uint8* dst_argb, | 
|   565                      int width) { |   519                      int width) { | 
|   566   asm volatile ( |   520   asm volatile ( | 
|   567     MEMACCESS(3) |   521     YUV422TORGB_SETUP_REG | 
|   568     "vld1.8     {d24}, [%3]                    \n" |  | 
|   569     MEMACCESS(4) |  | 
|   570     "vld1.8     {d25}, [%4]                    \n" |  | 
|   571     "vmov.u8    d26, #128                      \n" |  | 
|   572     "vmov.u16   q14, #74                       \n" |  | 
|   573     "vmov.u16   q15, #16                       \n" |  | 
|   574     ".p2align   2                              \n" |  | 
|   575   "1:                                          \n" |   522   "1:                                          \n" | 
|   576     READYUV400 |   523     READYUV400 | 
|   577     YUV422TORGB |   524     YUV422TORGB(v22, v21, v20) | 
|   578     "subs       %2, %2, #8                     \n" |   525     "subs       %2, %2, #8                     \n" | 
|   579     "vmov.u8    d23, #255                      \n" |   526     "movi       v23.8b, #255                   \n" | 
|   580     MEMACCESS(1) |   527     MEMACCESS(1) | 
|   581     "vst4.8     {d20, d21, d22, d23}, [%1]!    \n" |   528     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n" | 
|   582     "bgt        1b                             \n" |   529     "b.gt       1b                             \n" | 
|   583     : "+r"(src_y),     // %0 |   530     : "+r"(src_y),     // %0 | 
|   584       "+r"(dst_argb),  // %1 |   531       "+r"(dst_argb),  // %1 | 
|   585       "+r"(width)      // %2 |   532       "+r"(width)      // %2 | 
|   586     : "r"(&kUVToRB),   // %3 |   533     : [kUVBiasBGR]"r"(&kUVBiasBGR), | 
|   587       "r"(&kUVToG)     // %4 |   534       [kYToRgb]"r"(&kYToRgb) | 
|   588     : "cc", "memory", "q0", "q1", "q2", "q3", |   535     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 
|   589       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |   536       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 
|   590   ); |   537   ); | 
|   591 } |   538 } | 
|   592 #endif  // HAS_YTOARGBROW_NEON |   539 #endif  // HAS_YTOARGBROW_NEON | 
|   593  |   540  | 
|   594 #ifdef HAS_I400TOARGBROW_NEON |   541 #ifdef HAS_I400TOARGBROW_NEON | 
|   595 void I400ToARGBRow_NEON(const uint8* src_y, |   542 void I400ToARGBRow_NEON(const uint8* src_y, | 
|   596                         uint8* dst_argb, |   543                         uint8* dst_argb, | 
|   597                         int width) { |   544                         int width) { | 
|   598   asm volatile ( |   545   asm volatile ( | 
|   599     ".p2align   2                              \n" |   546     "movi       v23.8b, #255                   \n" | 
|   600     "vmov.u8    d23, #255                      \n" |  | 
|   601   "1:                                          \n" |   547   "1:                                          \n" | 
|   602     MEMACCESS(0) |   548     MEMACCESS(0) | 
|   603     "vld1.8     {d20}, [%0]!                   \n" |   549     "ld1        {v20.8b}, [%0], #8             \n" | 
|   604     "vmov       d21, d20                       \n" |   550     "orr        v21.8b, v20.8b, v20.8b         \n" | 
|   605     "vmov       d22, d20                       \n" |   551     "orr        v22.8b, v20.8b, v20.8b         \n" | 
|   606     "subs       %2, %2, #8                     \n" |   552     "subs       %2, %2, #8                     \n" | 
|   607     MEMACCESS(1) |   553     MEMACCESS(1) | 
|   608     "vst4.8     {d20, d21, d22, d23}, [%1]!    \n" |   554     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n" | 
|   609     "bgt        1b                             \n" |   555     "b.gt       1b                             \n" | 
|   610     : "+r"(src_y),     // %0 |   556     : "+r"(src_y),     // %0 | 
|   611       "+r"(dst_argb),  // %1 |   557       "+r"(dst_argb),  // %1 | 
|   612       "+r"(width)      // %2 |   558       "+r"(width)      // %2 | 
|   613     : |   559     : | 
|   614     : "cc", "memory", "d20", "d21", "d22", "d23" |   560     : "cc", "memory", "v20", "v21", "v22", "v23" | 
|   615   ); |   561   ); | 
|   616 } |   562 } | 
|   617 #endif  // HAS_I400TOARGBROW_NEON |   563 #endif  // HAS_I400TOARGBROW_NEON | 
|   618  |   564  | 
|   619 #ifdef HAS_NV12TOARGBROW_NEON |   565 #ifdef HAS_NV12TOARGBROW_NEON | 
|   620 void NV12ToARGBRow_NEON(const uint8* src_y, |   566 void NV12ToARGBRow_NEON(const uint8* src_y, | 
|   621                         const uint8* src_uv, |   567                         const uint8* src_uv, | 
|   622                         uint8* dst_argb, |   568                         uint8* dst_argb, | 
|   623                         int width) { |   569                         int width) { | 
|   624   asm volatile ( |   570   asm volatile ( | 
|   625     MEMACCESS(4) |   571     YUV422TORGB_SETUP_REG | 
|   626     "vld1.8     {d24}, [%4]                    \n" |  | 
|   627     MEMACCESS(5) |  | 
|   628     "vld1.8     {d25}, [%5]                    \n" |  | 
|   629     "vmov.u8    d26, #128                      \n" |  | 
|   630     "vmov.u16   q14, #74                       \n" |  | 
|   631     "vmov.u16   q15, #16                       \n" |  | 
|   632     ".p2align   2                              \n" |  | 
|   633   "1:                                          \n" |   572   "1:                                          \n" | 
|   634     READNV12 |   573     READNV12 | 
|   635     YUV422TORGB |   574     YUV422TORGB(v22, v21, v20) | 
|   636     "subs       %3, %3, #8                     \n" |   575     "subs       %3, %3, #8                     \n" | 
|   637     "vmov.u8    d23, #255                      \n" |   576     "movi       v23.8b, #255                   \n" | 
|   638     MEMACCESS(2) |   577     MEMACCESS(2) | 
|   639     "vst4.8     {d20, d21, d22, d23}, [%2]!    \n" |   578     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32     \n" | 
|   640     "bgt        1b                             \n" |   579     "b.gt       1b                             \n" | 
|   641     : "+r"(src_y),     // %0 |   580     : "+r"(src_y),     // %0 | 
|   642       "+r"(src_uv),    // %1 |   581       "+r"(src_uv),    // %1 | 
|   643       "+r"(dst_argb),  // %2 |   582       "+r"(dst_argb),  // %2 | 
|   644       "+r"(width)      // %3 |   583       "+r"(width)      // %3 | 
|   645     : "r"(&kUVToRB),   // %4 |   584     : [kUVBiasBGR]"r"(&kUVBiasBGR), | 
|   646       "r"(&kUVToG)     // %5 |   585       [kYToRgb]"r"(&kYToRgb) | 
|   647     : "cc", "memory", "q0", "q1", "q2", "q3", |   586     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 
|   648       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |   587       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 
|   649   ); |   588   ); | 
|   650 } |   589 } | 
|   651 #endif  // HAS_NV12TOARGBROW_NEON |   590 #endif  // HAS_NV12TOARGBROW_NEON | 
|   652  |   591  | 
|   653 #ifdef HAS_NV21TOARGBROW_NEON |   592 #ifdef HAS_NV21TOARGBROW_NEON | 
|   654 void NV21ToARGBRow_NEON(const uint8* src_y, |   593 void NV21ToARGBRow_NEON(const uint8* src_y, | 
|   655                         const uint8* src_uv, |   594                         const uint8* src_uv, | 
|   656                         uint8* dst_argb, |   595                         uint8* dst_argb, | 
|   657                         int width) { |   596                         int width) { | 
|   658   asm volatile ( |   597   asm volatile ( | 
|   659     MEMACCESS(4) |   598     YUV422TORGB_SETUP_REG | 
|   660     "vld1.8     {d24}, [%4]                    \n" |  | 
|   661     MEMACCESS(5) |  | 
|   662     "vld1.8     {d25}, [%5]                    \n" |  | 
|   663     "vmov.u8    d26, #128                      \n" |  | 
|   664     "vmov.u16   q14, #74                       \n" |  | 
|   665     "vmov.u16   q15, #16                       \n" |  | 
|   666     ".p2align   2                              \n" |  | 
|   667   "1:                                          \n" |   599   "1:                                          \n" | 
|   668     READNV21 |   600     READNV21 | 
|   669     YUV422TORGB |   601     YUV422TORGB(v22, v21, v20) | 
|   670     "subs       %3, %3, #8                     \n" |   602     "subs       %3, %3, #8                     \n" | 
|   671     "vmov.u8    d23, #255                      \n" |   603     "movi       v23.8b, #255                   \n" | 
|   672     MEMACCESS(2) |   604     MEMACCESS(2) | 
|   673     "vst4.8     {d20, d21, d22, d23}, [%2]!    \n" |   605     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32     \n" | 
|   674     "bgt        1b                             \n" |   606     "b.gt       1b                             \n" | 
|   675     : "+r"(src_y),     // %0 |   607     : "+r"(src_y),     // %0 | 
|   676       "+r"(src_uv),    // %1 |   608       "+r"(src_uv),    // %1 | 
|   677       "+r"(dst_argb),  // %2 |   609       "+r"(dst_argb),  // %2 | 
|   678       "+r"(width)      // %3 |   610       "+r"(width)      // %3 | 
|   679     : "r"(&kUVToRB),   // %4 |   611     : [kUVBiasBGR]"r"(&kUVBiasBGR), | 
|   680       "r"(&kUVToG)     // %5 |   612       [kYToRgb]"r"(&kYToRgb) | 
|   681     : "cc", "memory", "q0", "q1", "q2", "q3", |   613     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 
|   682       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |   614       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 
|   683   ); |   615   ); | 
|   684 } |   616 } | 
|   685 #endif  // HAS_NV21TOARGBROW_NEON |   617 #endif  // HAS_NV21TOARGBROW_NEON | 
|   686  |   618  | 
|   687 #ifdef HAS_NV12TORGB565ROW_NEON |   619 #ifdef HAS_NV12TORGB565ROW_NEON | 
|   688 void NV12ToRGB565Row_NEON(const uint8* src_y, |   620 void NV12ToRGB565Row_NEON(const uint8* src_y, | 
|   689                           const uint8* src_uv, |   621                           const uint8* src_uv, | 
|   690                           uint8* dst_rgb565, |   622                           uint8* dst_rgb565, | 
|   691                           int width) { |   623                           int width) { | 
|   692   asm volatile ( |   624   asm volatile ( | 
|   693     MEMACCESS(4) |   625     YUV422TORGB_SETUP_REG | 
|   694     "vld1.8     {d24}, [%4]                    \n" |  | 
|   695     MEMACCESS(5) |  | 
|   696     "vld1.8     {d25}, [%5]                    \n" |  | 
|   697     "vmov.u8    d26, #128                      \n" |  | 
|   698     "vmov.u16   q14, #74                       \n" |  | 
|   699     "vmov.u16   q15, #16                       \n" |  | 
|   700     ".p2align   2                              \n" |  | 
|   701   "1:                                          \n" |   626   "1:                                          \n" | 
|   702     READNV12 |   627     READNV12 | 
|   703     YUV422TORGB |   628     YUV422TORGB(v22, v21, v20) | 
|   704     "subs       %3, %3, #8                     \n" |   629     "subs       %3, %3, #8                     \n" | 
|   705     ARGBTORGB565 |   630     ARGBTORGB565 | 
|   706     MEMACCESS(2) |   631     MEMACCESS(2) | 
|   707     "vst1.8     {q0}, [%2]!                    \n"  // store 8 pixels RGB565. |   632     "st1        {v0.8h}, [%2], 16              \n"  // store 8 pixels RGB565. | 
|   708     "bgt        1b                             \n" |   633     "b.gt       1b                             \n" | 
|   709     : "+r"(src_y),     // %0 |   634     : "+r"(src_y),     // %0 | 
|   710       "+r"(src_uv),    // %1 |   635       "+r"(src_uv),    // %1 | 
|   711       "+r"(dst_rgb565),  // %2 |   636       "+r"(dst_rgb565),  // %2 | 
|   712       "+r"(width)      // %3 |   637       "+r"(width)      // %3 | 
|   713     : "r"(&kUVToRB),   // %4 |   638     : [kUVBiasBGR]"r"(&kUVBiasBGR), | 
|   714       "r"(&kUVToG)     // %5 |   639       [kYToRgb]"r"(&kYToRgb) | 
|   715     : "cc", "memory", "q0", "q1", "q2", "q3", |   640     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 
|   716       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |   641       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 
|   717   ); |   642   ); | 
|   718 } |   643 } | 
|   719 #endif  // HAS_NV12TORGB565ROW_NEON |   644 #endif  // HAS_NV12TORGB565ROW_NEON | 
|   720  |   645  | 
|   721 #ifdef HAS_NV21TORGB565ROW_NEON |   646 #ifdef HAS_NV21TORGB565ROW_NEON | 
|   722 void NV21ToRGB565Row_NEON(const uint8* src_y, |   647 void NV21ToRGB565Row_NEON(const uint8* src_y, | 
|   723                           const uint8* src_uv, |   648                           const uint8* src_uv, | 
|   724                           uint8* dst_rgb565, |   649                           uint8* dst_rgb565, | 
|   725                           int width) { |   650                           int width) { | 
|   726   asm volatile ( |   651   asm volatile ( | 
|   727     MEMACCESS(4) |   652     YUV422TORGB_SETUP_REG | 
|   728     "vld1.8     {d24}, [%4]                    \n" |  | 
|   729     MEMACCESS(5) |  | 
|   730     "vld1.8     {d25}, [%5]                    \n" |  | 
|   731     "vmov.u8    d26, #128                      \n" |  | 
|   732     "vmov.u16   q14, #74                       \n" |  | 
|   733     "vmov.u16   q15, #16                       \n" |  | 
|   734     ".p2align   2                              \n" |  | 
|   735   "1:                                          \n" |   653   "1:                                          \n" | 
|   736     READNV21 |   654     READNV21 | 
|   737     YUV422TORGB |   655     YUV422TORGB(v22, v21, v20) | 
|   738     "subs       %3, %3, #8                     \n" |   656     "subs       %3, %3, #8                     \n" | 
|   739     ARGBTORGB565 |   657     ARGBTORGB565 | 
|   740     MEMACCESS(2) |   658     MEMACCESS(2) | 
|   741     "vst1.8     {q0}, [%2]!                    \n"  // store 8 pixels RGB565. |   659     "st1        {v0.8h}, [%2], 16              \n"  // store 8 pixels RGB565. | 
|   742     "bgt        1b                             \n" |   660     "b.gt       1b                             \n" | 
|   743     : "+r"(src_y),     // %0 |   661     : "+r"(src_y),     // %0 | 
|   744       "+r"(src_uv),    // %1 |   662       "+r"(src_uv),    // %1 | 
|   745       "+r"(dst_rgb565),  // %2 |   663       "+r"(dst_rgb565),  // %2 | 
|   746       "+r"(width)      // %3 |   664       "+r"(width)      // %3 | 
|   747     : "r"(&kUVToRB),   // %4 |   665     : [kUVBiasBGR]"r"(&kUVBiasBGR), | 
|   748       "r"(&kUVToG)     // %5 |   666       [kYToRgb]"r"(&kYToRgb) | 
|   749     : "cc", "memory", "q0", "q1", "q2", "q3", |   667     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 
|   750       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |   668       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 
|   751   ); |   669   ); | 
|   752 } |   670 } | 
|   753 #endif  // HAS_NV21TORGB565ROW_NEON |   671 #endif  // HAS_NV21TORGB565ROW_NEON | 
|   754  |   672  | 
|   755 #ifdef HAS_YUY2TOARGBROW_NEON |   673 #ifdef HAS_YUY2TOARGBROW_NEON | 
|   756 void YUY2ToARGBRow_NEON(const uint8* src_yuy2, |   674 void YUY2ToARGBRow_NEON(const uint8* src_yuy2, | 
|   757                         uint8* dst_argb, |   675                         uint8* dst_argb, | 
|   758                         int width) { |   676                         int width) { | 
|   759   asm volatile ( |   677   asm volatile ( | 
|   760     MEMACCESS(3) |   678     YUV422TORGB_SETUP_REG | 
|   761     "vld1.8     {d24}, [%3]                    \n" |  | 
|   762     MEMACCESS(4) |  | 
|   763     "vld1.8     {d25}, [%4]                    \n" |  | 
|   764     "vmov.u8    d26, #128                      \n" |  | 
|   765     "vmov.u16   q14, #74                       \n" |  | 
|   766     "vmov.u16   q15, #16                       \n" |  | 
|   767     ".p2align   2                              \n" |  | 
|   768   "1:                                          \n" |   679   "1:                                          \n" | 
|   769     READYUY2 |   680     READYUY2 | 
|   770     YUV422TORGB |   681     YUV422TORGB(v22, v21, v20) | 
|   771     "subs       %2, %2, #8                     \n" |   682     "subs       %2, %2, #8                     \n" | 
|   772     "vmov.u8    d23, #255                      \n" |   683     "movi       v23.8b, #255                   \n" | 
|   773     MEMACCESS(1) |   684     MEMACCESS(1) | 
|   774     "vst4.8     {d20, d21, d22, d23}, [%1]!    \n" |   685     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32      \n" | 
|   775     "bgt        1b                             \n" |   686     "b.gt       1b                             \n" | 
|   776     : "+r"(src_yuy2),  // %0 |   687     : "+r"(src_yuy2),  // %0 | 
|   777       "+r"(dst_argb),  // %1 |   688       "+r"(dst_argb),  // %1 | 
|   778       "+r"(width)      // %2 |   689       "+r"(width)      // %2 | 
|   779     : "r"(&kUVToRB),   // %3 |   690     : [kUVBiasBGR]"r"(&kUVBiasBGR), | 
|   780       "r"(&kUVToG)     // %4 |   691       [kYToRgb]"r"(&kYToRgb) | 
|   781     : "cc", "memory", "q0", "q1", "q2", "q3", |   692     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 
|   782       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |   693       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 
|   783   ); |   694   ); | 
|   784 } |   695 } | 
|   785 #endif  // HAS_YUY2TOARGBROW_NEON |   696 #endif  // HAS_YUY2TOARGBROW_NEON | 
|   786  |   697  | 
|   787 #ifdef HAS_UYVYTOARGBROW_NEON |   698 #ifdef HAS_UYVYTOARGBROW_NEON | 
|   788 void UYVYToARGBRow_NEON(const uint8* src_uyvy, |   699 void UYVYToARGBRow_NEON(const uint8* src_uyvy, | 
|   789                         uint8* dst_argb, |   700                         uint8* dst_argb, | 
|   790                         int width) { |   701                         int width) { | 
|   791   asm volatile ( |   702   asm volatile ( | 
|   792     MEMACCESS(3) |   703     YUV422TORGB_SETUP_REG | 
|   793     "vld1.8     {d24}, [%3]                    \n" |  | 
|   794     MEMACCESS(4) |  | 
|   795     "vld1.8     {d25}, [%4]                    \n" |  | 
|   796     "vmov.u8    d26, #128                      \n" |  | 
|   797     "vmov.u16   q14, #74                       \n" |  | 
|   798     "vmov.u16   q15, #16                       \n" |  | 
|   799     ".p2align   2                              \n" |  | 
|   800   "1:                                          \n" |   704   "1:                                          \n" | 
|   801     READUYVY |   705     READUYVY | 
|   802     YUV422TORGB |   706     YUV422TORGB(v22, v21, v20) | 
|   803     "subs       %2, %2, #8                     \n" |   707     "subs       %2, %2, #8                     \n" | 
|   804     "vmov.u8    d23, #255                      \n" |   708     "movi       v23.8b, #255                   \n" | 
|   805     MEMACCESS(1) |   709     MEMACCESS(1) | 
|   806     "vst4.8     {d20, d21, d22, d23}, [%1]!    \n" |   710     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32      \n" | 
|   807     "bgt        1b                             \n" |   711     "b.gt       1b                             \n" | 
|   808     : "+r"(src_uyvy),  // %0 |   712     : "+r"(src_uyvy),  // %0 | 
|   809       "+r"(dst_argb),  // %1 |   713       "+r"(dst_argb),  // %1 | 
|   810       "+r"(width)      // %2 |   714       "+r"(width)      // %2 | 
|   811     : "r"(&kUVToRB),   // %3 |   715     : [kUVBiasBGR]"r"(&kUVBiasBGR), | 
|   812       "r"(&kUVToG)     // %4 |   716       [kYToRgb]"r"(&kYToRgb) | 
|   813     : "cc", "memory", "q0", "q1", "q2", "q3", |   717     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 
|   814       "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |   718       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 
|   815   ); |   719   ); | 
|   816 } |   720 } | 
|   817 #endif  // HAS_UYVYTOARGBROW_NEON |   721 #endif  // HAS_UYVYTOARGBROW_NEON | 
|   818  |   722  | 
|   819 // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. |   723 // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. | 
|   820 #ifdef HAS_SPLITUVROW_NEON |   724 #ifdef HAS_SPLITUVROW_NEON | 
|   821 void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, |   725 void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, | 
|   822                      int width) { |   726                      int width) { | 
|   823   asm volatile ( |   727   asm volatile ( | 
|   824     ".p2align   2                              \n" |  | 
|   825   "1:                                          \n" |   728   "1:                                          \n" | 
|   826     MEMACCESS(0) |   729     MEMACCESS(0) | 
|   827     "ld2        {v0.16b, v1.16b}, [%0], #32    \n"  // load 16 pairs of UV |   730     "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pairs of UV | 
|   828     "subs       %3, %3, #16                    \n"  // 16 processed per loop |   731     "subs       %3, %3, #16                    \n"  // 16 processed per loop | 
|   829     MEMACCESS(1) |   732     MEMACCESS(1) | 
|   830     "st1        {v0.16b}, [%1], #16            \n"  // store U |   733     "st1        {v0.16b}, [%1], #16            \n"  // store U | 
|   831     MEMACCESS(2) |   734     MEMACCESS(2) | 
|   832     "st1        {v1.16b}, [%2], #16            \n"  // store V |   735     "st1        {v1.16b}, [%2], #16            \n"  // store V | 
|   833     "bgt        1b                             \n" |   736     "b.gt       1b                             \n" | 
|   834     : "+r"(src_uv),  // %0 |   737     : "+r"(src_uv),  // %0 | 
|   835       "+r"(dst_u),   // %1 |   738       "+r"(dst_u),   // %1 | 
|   836       "+r"(dst_v),   // %2 |   739       "+r"(dst_v),   // %2 | 
|   837       "+r"(width)    // %3  // Output registers |   740       "+r"(width)    // %3  // Output registers | 
|   838     :                       // Input registers |   741     :                       // Input registers | 
|   839     : "cc", "memory", "v0", "v1"  // Clobber List |   742     : "cc", "memory", "v0", "v1"  // Clobber List | 
|   840   ); |   743   ); | 
|   841 } |   744 } | 
|   842 #endif  // HAS_SPLITUVROW_NEON |   745 #endif  // HAS_SPLITUVROW_NEON | 
|   843  |   746  | 
|   844 // Reads 16 U's and V's and writes out 16 pairs of UV. |   747 // Reads 16 U's and V's and writes out 16 pairs of UV. | 
|   845 #ifdef HAS_MERGEUVROW_NEON |   748 #ifdef HAS_MERGEUVROW_NEON | 
|   846 void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, |   749 void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, | 
|   847                      int width) { |   750                      int width) { | 
|   848   asm volatile ( |   751   asm volatile ( | 
|   849     ".p2align   2                              \n" |  | 
|   850   "1:                                          \n" |   752   "1:                                          \n" | 
|   851     MEMACCESS(0) |   753     MEMACCESS(0) | 
|   852     "ld1        {v0.16b}, [%0], #16            \n"  // load U |   754     "ld1        {v0.16b}, [%0], #16            \n"  // load U | 
|   853     MEMACCESS(1) |   755     MEMACCESS(1) | 
|   854     "ld1        {v1.16b}, [%1], #16            \n"  // load V |   756     "ld1        {v1.16b}, [%1], #16            \n"  // load V | 
|   855     "subs       %3, %3, #16                    \n"  // 16 processed per loop |   757     "subs       %3, %3, #16                    \n"  // 16 processed per loop | 
|   856     MEMACCESS(2) |   758     MEMACCESS(2) | 
|   857     "st2        {v0.16b, v1.16b}, [%2], #32    \n"  // store 16 pairs of UV |   759     "st2        {v0.16b,v1.16b}, [%2], #32     \n"  // store 16 pairs of UV | 
|   858     "bgt        1b                             \n" |   760     "b.gt       1b                             \n" | 
|   859     : |   761     : | 
|   860       "+r"(src_u),   // %0 |   762       "+r"(src_u),   // %0 | 
|   861       "+r"(src_v),   // %1 |   763       "+r"(src_v),   // %1 | 
|   862       "+r"(dst_uv),  // %2 |   764       "+r"(dst_uv),  // %2 | 
|   863       "+r"(width)    // %3  // Output registers |   765       "+r"(width)    // %3  // Output registers | 
|   864     :                       // Input registers |   766     :                       // Input registers | 
|   865     : "cc", "memory", "v0", "v1"  // Clobber List |   767     : "cc", "memory", "v0", "v1"  // Clobber List | 
|   866   ); |   768   ); | 
|   867 } |   769 } | 
|   868 #endif  // HAS_MERGEUVROW_NEON |   770 #endif  // HAS_MERGEUVROW_NEON | 
|   869  |   771  | 
|   870 // Copy multiple of 32.  vld4.8  allow unaligned and is fastest on a15. |   772 // Copy multiple of 32.  vld4.8  allow unaligned and is fastest on a15. | 
|   871 #ifdef HAS_COPYROW_NEON |   773 #ifdef HAS_COPYROW_NEON | 
|   872 void CopyRow_NEON(const uint8* src, uint8* dst, int count) { |   774 void CopyRow_NEON(const uint8* src, uint8* dst, int count) { | 
|   873   asm volatile ( |   775   asm volatile ( | 
|   874     ".p2align   2                              \n" |  | 
|   875   "1:                                          \n" |   776   "1:                                          \n" | 
|   876     MEMACCESS(0) |   777     MEMACCESS(0) | 
|   877     "ld1        {v0.8b-v3.8b}, [%0], #32       \n"  // load 32 |   778     "ld1        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32       \n"  // load 32 | 
|   878     "subs       %2, %2, #32                    \n"  // 32 processed per loop |   779     "subs       %2, %2, #32                    \n"  // 32 processed per loop | 
|   879     MEMACCESS(1) |   780     MEMACCESS(1) | 
|   880     "st1        {v0.8b-v3.8b}, [%1], #32       \n"  // store 32 |   781     "st1        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32       \n"  // store 32 | 
|   881     "bgt        1b                             \n" |   782     "b.gt       1b                             \n" | 
|   882   : "+r"(src),   // %0 |   783   : "+r"(src),   // %0 | 
|   883     "+r"(dst),   // %1 |   784     "+r"(dst),   // %1 | 
|   884     "+r"(count)  // %2  // Output registers |   785     "+r"(count)  // %2  // Output registers | 
|   885   :                     // Input registers |   786   :                     // Input registers | 
|   886   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List |   787   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List | 
|   887   ); |   788   ); | 
|   888 } |   789 } | 
|   889 #endif  // HAS_COPYROW_NEON |   790 #endif  // HAS_COPYROW_NEON | 
|   890  |   791  | 
|   891 // SetRow8 writes 'count' bytes using a 32 bit value repeated. |   792 // SetRow writes 'count' bytes using an 8 bit value repeated. | 
|   892 #ifdef HAS_SETROW_NEON |   793 void SetRow_NEON(uint8* dst, uint8 v8, int count) { | 
|   893 void SetRow_NEON(uint8* dst, uint32 v32, int count) { |  | 
|   894   asm volatile ( |   794   asm volatile ( | 
|   895     "dup        v0.4s, %w2                     \n"  // duplicate 4 ints |   795     "dup        v0.16b, %w2                    \n"  // duplicate 16 bytes | 
|   896     "1:                                        \n" |   796   "1:                                          \n" | 
|   897     "subs      %1, %1, #16                     \n"  // 16 bytes per loop |   797     "subs      %1, %1, #16                     \n"  // 16 bytes per loop | 
|   898     MEMACCESS(0) |   798     MEMACCESS(0) | 
|   899     "st1        {v0.16b}, [%0], #16            \n"  // store |   799     "st1        {v0.16b}, [%0], #16            \n"  // store | 
|   900     "bgt       1b                              \n" |   800     "b.gt      1b                              \n" | 
 |   801   : "+r"(dst),   // %0 | 
 |   802     "+r"(count)  // %1 | 
 |   803   : "r"(v8)      // %2 | 
 |   804   : "cc", "memory", "v0" | 
 |   805   ); | 
 |   806 } | 
 |   807  | 
 |   808 void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) { | 
 |   809   asm volatile ( | 
 |   810     "dup        v0.4s, %w2                     \n"  // duplicate 4 ints | 
 |   811   "1:                                          \n" | 
 |   812     "subs      %1, %1, #4                      \n"  // 4 ints per loop | 
 |   813     MEMACCESS(0) | 
 |   814     "st1        {v0.16b}, [%0], #16            \n"  // store | 
 |   815     "b.gt      1b                              \n" | 
|   901   : "+r"(dst),   // %0 |   816   : "+r"(dst),   // %0 | 
|   902     "+r"(count)  // %1 |   817     "+r"(count)  // %1 | 
|   903   : "r"(v32)     // %2 |   818   : "r"(v32)     // %2 | 
|   904   : "cc", "memory", "v0" |   819   : "cc", "memory", "v0" | 
|   905   ); |   820   ); | 
|   906 } |   821 } | 
|   907 #endif  // HAS_SETROW_NEON |  | 
|   908  |  | 
|   909 // TODO(fbarchard): Make fully assembler |  | 
|   910 // SetRow32 writes 'count' words using a 32 bit value repeated. |  | 
|   911 #ifdef HAS_ARGBSETROWS_NEON |  | 
|   912 void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width, |  | 
|   913                       int dst_stride, int height) { |  | 
|   914   for (int y = 0; y < height; ++y) { |  | 
|   915     SetRow_NEON(dst, v32, width << 2); |  | 
|   916     dst += dst_stride; |  | 
|   917   } |  | 
|   918 } |  | 
|   919 #endif  // HAS_ARGBSETROWS_NEON |  | 
|   920  |   822  | 
|   921 #ifdef HAS_MIRRORROW_NEON |   823 #ifdef HAS_MIRRORROW_NEON | 
|   922 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { |   824 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { | 
|   923   asm volatile ( |   825   asm volatile ( | 
|   924     // Start at end of source row. |   826     // Start at end of source row. | 
|   925     "add        %0, %0, %2                     \n" |   827     "add        %0, %0, %2                     \n" | 
|   926     "sub        %0, %0, #16                    \n" |   828     "sub        %0, %0, #16                    \n" | 
|   927  |   829  | 
|   928     ".p2align   2                              \n" |  | 
|   929   "1:                                          \n" |   830   "1:                                          \n" | 
|   930     MEMACCESS(0) |   831     MEMACCESS(0) | 
|   931     "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16 |   832     "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16 | 
|   932     "subs       %2, %2, #16                    \n"  // 16 pixels per loop. |   833     "subs       %2, %2, #16                    \n"  // 16 pixels per loop. | 
|   933     "rev64      v0.16b, v0.16b                 \n" |   834     "rev64      v0.16b, v0.16b                 \n" | 
|   934     MEMACCESS(1) |   835     MEMACCESS(1) | 
|   935     "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16 |   836     "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16 | 
|   936     MEMACCESS(1) |   837     MEMACCESS(1) | 
|   937     "st1        {v0.D}[0], [%1], #8            \n" |   838     "st1        {v0.D}[0], [%1], #8            \n" | 
|   938     "bgt        1b                             \n" |   839     "b.gt       1b                             \n" | 
|   939   : "+r"(src),   // %0 |   840   : "+r"(src),   // %0 | 
|   940     "+r"(dst),   // %1 |   841     "+r"(dst),   // %1 | 
|   941     "+r"(width)  // %2 |   842     "+r"(width)  // %2 | 
|   942   : "r"((ptrdiff_t)-16)    // %3 |   843   : "r"((ptrdiff_t)-16)    // %3 | 
|   943   : "cc", "memory", "v0" |   844   : "cc", "memory", "v0" | 
|   944   ); |   845   ); | 
|   945 } |   846 } | 
|   946 #endif  // HAS_MIRRORROW_NEON |   847 #endif  // HAS_MIRRORROW_NEON | 
|   947  |   848  | 
|   948 #ifdef HAS_MIRRORUVROW_NEON |   849 #ifdef HAS_MIRRORUVROW_NEON | 
|   949 void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, |   850 void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, | 
|   950                       int width) { |   851                       int width) { | 
|   951   asm volatile ( |   852   asm volatile ( | 
|   952     // Start at end of source row. |   853     // Start at end of source row. | 
|   953     "add        %0, %0, %3, lsl #1             \n" |   854     "add        %0, %0, %3, lsl #1             \n" | 
|   954     "sub        %0, %0, #16                    \n" |   855     "sub        %0, %0, #16                    \n" | 
|   955  |   856  | 
|   956     ".p2align   2                              \n" |  | 
|   957   "1:                                          \n" |   857   "1:                                          \n" | 
|   958     MEMACCESS(0) |   858     MEMACCESS(0) | 
|   959     "ld2        {v0.8b, v1.8b}, [%0], %4       \n"  // src -= 16 |   859     "ld2        {v0.8b, v1.8b}, [%0], %4       \n"  // src -= 16 | 
|   960     "subs       %3, %3, #8                     \n"  // 8 pixels per loop. |   860     "subs       %3, %3, #8                     \n"  // 8 pixels per loop. | 
|   961     "rev64      v0.8b, v0.8b                   \n" |   861     "rev64      v0.8b, v0.8b                   \n" | 
|   962     "rev64      v1.8b, v1.8b                   \n" |   862     "rev64      v1.8b, v1.8b                   \n" | 
|   963     MEMACCESS(1) |   863     MEMACCESS(1) | 
|   964     "st1        {v0.8b}, [%1], #8               \n"  // dst += 8 |   864     "st1        {v0.8b}, [%1], #8              \n"  // dst += 8 | 
|   965     MEMACCESS(2) |   865     MEMACCESS(2) | 
|   966     "st1        {v1.8b}, [%2], #8               \n" |   866     "st1        {v1.8b}, [%2], #8              \n" | 
|   967     "bgt        1b                             \n" |   867     "b.gt       1b                             \n" | 
|   968   : "+r"(src_uv),  // %0 |   868   : "+r"(src_uv),  // %0 | 
|   969     "+r"(dst_u),   // %1 |   869     "+r"(dst_u),   // %1 | 
|   970     "+r"(dst_v),   // %2 |   870     "+r"(dst_v),   // %2 | 
|   971     "+r"(width)    // %3 |   871     "+r"(width)    // %3 | 
|   972   : "r"((ptrdiff_t)-16)      // %4 |   872   : "r"((ptrdiff_t)-16)      // %4 | 
|   973   : "cc", "memory", "v0", "v1" |   873   : "cc", "memory", "v0", "v1" | 
|   974   ); |   874   ); | 
|   975 } |   875 } | 
|   976 #endif  // HAS_MIRRORUVROW_NEON |   876 #endif  // HAS_MIRRORUVROW_NEON | 
|   977  |   877  | 
|   978 #ifdef HAS_ARGBMIRRORROW_NEON |   878 #ifdef HAS_ARGBMIRRORROW_NEON | 
|   979 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { |   879 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { | 
|   980   asm volatile ( |   880   asm volatile ( | 
|   981     // Start at end of source row. |   881     // Start at end of source row. | 
|   982     "add        %0, %0, %2, lsl #2             \n" |   882     "add        %0, %0, %2, lsl #2             \n" | 
|   983     "sub        %0, %0, #16                    \n" |   883     "sub        %0, %0, #16                    \n" | 
|   984  |   884  | 
|   985     ".p2align   2                              \n" |  | 
|   986   "1:                                          \n" |   885   "1:                                          \n" | 
|   987     MEMACCESS(0) |   886     MEMACCESS(0) | 
|   988     "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16 |   887     "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16 | 
|   989     "subs       %2, %2, #4                     \n"  // 4 pixels per loop. |   888     "subs       %2, %2, #4                     \n"  // 4 pixels per loop. | 
|   990     "rev64      v0.4s, v0.4s                   \n" |   889     "rev64      v0.4s, v0.4s                   \n" | 
|   991     MEMACCESS(1) |   890     MEMACCESS(1) | 
|   992     "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16 |   891     "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16 | 
|   993     MEMACCESS(1) |   892     MEMACCESS(1) | 
|   994     "st1        {v0.D}[0], [%1], #8            \n" |   893     "st1        {v0.D}[0], [%1], #8            \n" | 
|   995     "bgt        1b                             \n" |   894     "b.gt       1b                             \n" | 
|   996   : "+r"(src),   // %0 |   895   : "+r"(src),   // %0 | 
|   997     "+r"(dst),   // %1 |   896     "+r"(dst),   // %1 | 
|   998     "+r"(width)  // %2 |   897     "+r"(width)  // %2 | 
|   999   : "r"((ptrdiff_t)-16)    // %3 |   898   : "r"((ptrdiff_t)-16)    // %3 | 
|  1000   : "cc", "memory", "v0" |   899   : "cc", "memory", "v0" | 
|  1001   ); |   900   ); | 
|  1002 } |   901 } | 
|  1003 #endif  // HAS_ARGBMIRRORROW_NEON |   902 #endif  // HAS_ARGBMIRRORROW_NEON | 
|  1004  |   903  | 
|  1005 #ifdef HAS_RGB24TOARGBROW_NEON |   904 #ifdef HAS_RGB24TOARGBROW_NEON | 
|  1006 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { |   905 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { | 
|  1007   asm volatile ( |   906   asm volatile ( | 
|  1008     "movi       v4.8b, #255                    \n"  // Alpha |   907     "movi       v4.8b, #255                    \n"  // Alpha | 
|  1009     ".p2align   2                              \n" |  | 
|  1010   "1:                                          \n" |   908   "1:                                          \n" | 
|  1011     MEMACCESS(0) |   909     MEMACCESS(0) | 
|  1012     "ld3        {v1.8b-v3.8b}, [%0], #24       \n"  // load 8 pixels of RGB24. |   910     "ld3        {v1.8b,v2.8b,v3.8b}, [%0], #24 \n"  // load 8 pixels of RGB24. | 
|  1013     "subs       %2, %2, #8                     \n"  // 8 processed per loop. |   911     "subs       %2, %2, #8                     \n"  // 8 processed per loop. | 
|  1014     MEMACCESS(1) |   912     MEMACCESS(1) | 
|  1015     "st4        {v1.8b-v4.8b}, [%1], #32       \n"  // store 8 pixels of ARGB. |   913     "st4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n"  // store 8 ARGB pixels | 
|  1016     "bgt        1b                             \n" |   914     "b.gt       1b                             \n" | 
|  1017   : "+r"(src_rgb24),  // %0 |   915   : "+r"(src_rgb24),  // %0 | 
|  1018     "+r"(dst_argb),   // %1 |   916     "+r"(dst_argb),   // %1 | 
|  1019     "+r"(pix)         // %2 |   917     "+r"(pix)         // %2 | 
|  1020   : |   918   : | 
|  1021   : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List |   919   : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List | 
|  1022   ); |   920   ); | 
|  1023 } |   921 } | 
|  1024 #endif  // HAS_RGB24TOARGBROW_NEON |   922 #endif  // HAS_RGB24TOARGBROW_NEON | 
|  1025  |   923  | 
|  1026 #ifdef HAS_RAWTOARGBROW_NEON |   924 #ifdef HAS_RAWTOARGBROW_NEON | 
|  1027 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { |   925 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { | 
|  1028   asm volatile ( |   926   asm volatile ( | 
|  1029     "movi       v5.8b, #255                    \n"  // Alpha |   927     "movi       v5.8b, #255                    \n"  // Alpha | 
|  1030     ".p2align   2                              \n" |  | 
|  1031   "1:                                          \n" |   928   "1:                                          \n" | 
|  1032     MEMACCESS(0) |   929     MEMACCESS(0) | 
|  1033     "ld3        {v0.8b-v2.8b}, [%0], #24       \n"  // read r g b |   930     "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b | 
|  1034     "subs       %2, %2, #8                     \n"  // 8 processed per loop. |   931     "subs       %2, %2, #8                     \n"  // 8 processed per loop. | 
|  1035     "mov        v3.8b, v1.8b                   \n"  // move g |   932     "orr        v3.8b, v1.8b, v1.8b            \n"  // move g | 
|  1036     "mov        v4.8b, v0.8b                   \n"  // move r |   933     "orr        v4.8b, v0.8b, v0.8b            \n"  // move r | 
|  1037     MEMACCESS(1) |   934     MEMACCESS(1) | 
|  1038     "st4        {v2.8b-v5.8b}, [%1], #32       \n"  // store b g r a |   935     "st4        {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n"  // store b g r a | 
|  1039     "bgt        1b                             \n" |   936     "b.gt       1b                             \n" | 
|  1040   : "+r"(src_raw),   // %0 |   937   : "+r"(src_raw),   // %0 | 
|  1041     "+r"(dst_argb),  // %1 |   938     "+r"(dst_argb),  // %1 | 
|  1042     "+r"(pix)        // %2 |   939     "+r"(pix)        // %2 | 
|  1043   : |   940   : | 
|  1044   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"  // Clobber List |   941   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"  // Clobber List | 
|  1045   ); |   942   ); | 
|  1046 } |   943 } | 
|  1047 #endif  // HAS_RAWTOARGBROW_NEON |   944 #endif  // HAS_RAWTOARGBROW_NEON | 
|  1048  |   945  | 
|  1049 #define RGB565TOARGB                                                           \ |   946 #define RGB565TOARGB                                                           \ | 
|  1050     "vshrn.u16  d6, q0, #5                     \n"  /* G xxGGGGGG           */ \ |   947     "shrn       v6.8b, v0.8h, #5               \n"  /* G xxGGGGGG           */ \ | 
|  1051     "vuzp.u8    d0, d1                         \n"  /* d0 xxxBBBBB RRRRRxxx */ \ |   948     "shl        v6.8b, v6.8b, #2               \n"  /* G GGGGGG00 upper 6   */ \ | 
|  1052     "vshl.u8    d6, d6, #2                     \n"  /* G GGGGGG00 upper 6   */ \ |   949     "ushr       v4.8b, v6.8b, #6               \n"  /* G 000000GG lower 2   */ \ | 
|  1053     "vshr.u8    d1, d1, #3                     \n"  /* R 000RRRRR lower 5   */ \ |   950     "orr        v1.8b, v4.8b, v6.8b            \n"  /* G                    */ \ | 
|  1054     "vshl.u8    q0, q0, #3                     \n"  /* B,R BBBBB000 upper 5 */ \ |   951     "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \ | 
|  1055     "vshr.u8    q2, q0, #5                     \n"  /* B,R 00000BBB lower 3 */ \ |   952     "ushr       v0.8h, v0.8h, #11              \n"  /* R 000RRRRR           */ \ | 
|  1056     "vorr.u8    d0, d0, d4                     \n"  /* B                    */ \ |   953     "xtn2       v2.16b,v0.8h                   \n"  /* R in upper part      */ \ | 
|  1057     "vshr.u8    d4, d6, #6                     \n"  /* G 000000GG lower 2   */ \ |   954     "shl        v2.16b, v2.16b, #3             \n"  /* R,B BBBBB000 upper 5 */ \ | 
|  1058     "vorr.u8    d2, d1, d5                     \n"  /* R                    */ \ |   955     "ushr       v0.16b, v2.16b, #5             \n"  /* R,B 00000BBB lower 3 */ \ | 
|  1059     "vorr.u8    d1, d4, d6                     \n"  /* G                    */ |   956     "orr        v0.16b, v0.16b, v2.16b         \n"  /* R,B                  */ \ | 
 |   957     "dup        v2.2D, v0.D[1]                 \n"  /* R                    */ | 
|  1060  |   958  | 
|  1061 #ifdef HAS_RGB565TOARGBROW_NEON |   959 #ifdef HAS_RGB565TOARGBROW_NEON | 
|  1062 void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { |   960 void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { | 
|  1063   asm volatile ( |   961   asm volatile ( | 
|  1064     "vmov.u8    d3, #255                       \n"  // Alpha |   962     "movi       v3.8b, #255                    \n"  // Alpha | 
|  1065     ".p2align   2                              \n" |  | 
|  1066   "1:                                          \n" |   963   "1:                                          \n" | 
|  1067     MEMACCESS(0) |   964     MEMACCESS(0) | 
|  1068     "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels. |   965     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels. | 
|  1069     "subs       %2, %2, #8                     \n"  // 8 processed per loop. |   966     "subs       %2, %2, #8                     \n"  // 8 processed per loop. | 
|  1070     RGB565TOARGB |   967     RGB565TOARGB | 
|  1071     MEMACCESS(1) |   968     MEMACCESS(1) | 
|  1072     "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB. |   969     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels | 
|  1073     "bgt        1b                             \n" |   970     "b.gt       1b                             \n" | 
|  1074   : "+r"(src_rgb565),  // %0 |   971   : "+r"(src_rgb565),  // %0 | 
|  1075     "+r"(dst_argb),    // %1 |   972     "+r"(dst_argb),    // %1 | 
|  1076     "+r"(pix)          // %2 |   973     "+r"(pix)          // %2 | 
|  1077   : |   974   : | 
|  1078   : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List |   975   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6"  // Clobber List | 
|  1079   ); |   976   ); | 
|  1080 } |   977 } | 
|  1081 #endif  // HAS_RGB565TOARGBROW_NEON |   978 #endif  // HAS_RGB565TOARGBROW_NEON | 
|  1082  |   979  | 
|  1083 #define ARGB1555TOARGB                                                         \ |   980 #define ARGB1555TOARGB                                                         \ | 
|  1084     "vshrn.u16  d7, q0, #8                     \n"  /* A Arrrrrxx           */ \ |   981     "ushr       v2.8h, v0.8h, #10              \n"  /* R xxxRRRRR           */ \ | 
|  1085     "vshr.u8    d6, d7, #2                     \n"  /* R xxxRRRRR           */ \ |   982     "shl        v2.8h, v2.8h, #3               \n"  /* R RRRRR000 upper 5   */ \ | 
|  1086     "vshrn.u16  d5, q0, #5                     \n"  /* G xxxGGGGG           */ \ |   983     "xtn        v3.8b, v2.8h                   \n"  /* RRRRR000 AAAAAAAA    */ \ | 
|  1087     "vmovn.u16  d4, q0                         \n"  /* B xxxBBBBB           */ \ |   984                                                                                \ | 
|  1088     "vshr.u8    d7, d7, #7                     \n"  /* A 0000000A           */ \ |   985     "sshr       v2.8h, v0.8h, #15              \n"  /* A AAAAAAAA           */ \ | 
|  1089     "vneg.s8    d7, d7                         \n"  /* A AAAAAAAA upper 8   */ \ |   986     "xtn2       v3.16b, v2.8h                  \n"                             \ | 
|  1090     "vshl.u8    d6, d6, #3                     \n"  /* R RRRRR000 upper 5   */ \ |   987                                                                                \ | 
|  1091     "vshr.u8    q1, q3, #5                     \n"  /* R,A 00000RRR lower 3 */ \ |   988     "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \ | 
|  1092     "vshl.u8    q0, q2, #3                     \n"  /* B,G BBBBB000 upper 5 */ \ |   989     "shrn2      v2.16b,v0.8h, #5               \n"  /* G xxxGGGGG           */ \ | 
|  1093     "vshr.u8    q2, q0, #5                     \n"  /* B,G 00000BBB lower 3 */ \ |   990                                                                                \ | 
|  1094     "vorr.u8    q1, q1, q3                     \n"  /* R,A                  */ \ |   991     "ushr       v1.16b, v3.16b, #5             \n"  /* R,A 00000RRR lower 3 */ \ | 
|  1095     "vorr.u8    q0, q0, q2                     \n"  /* B,G                  */ \ |   992     "shl        v0.16b, v2.16b, #3             \n"  /* B,G BBBBB000 upper 5 */ \ | 
 |   993     "ushr       v2.16b, v0.16b, #5             \n"  /* B,G 00000BBB lower 3 */ \ | 
 |   994                                                                                \ | 
 |   995     "orr        v0.16b, v0.16b, v2.16b         \n"  /* B,G                  */ \ | 
 |   996     "orr        v2.16b, v1.16b, v3.16b         \n"  /* R,A                  */ \ | 
 |   997     "dup        v1.2D, v0.D[1]                 \n"                             \ | 
 |   998     "dup        v3.2D, v2.D[1]                 \n" | 
|  1096  |   999  | 
|  1097 // RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. |  1000 // RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. | 
|  1098 #define RGB555TOARGB                                                           \ |  1001 #define RGB555TOARGB                                                           \ | 
|  1099     "vshrn.u16  d6, q0, #5                     \n"  /* G xxxGGGGG           */ \ |  1002     "ushr       v2.8h, v0.8h, #10              \n"  /* R xxxRRRRR           */ \ | 
|  1100     "vuzp.u8    d0, d1                         \n"  /* d0 xxxBBBBB xRRRRRxx */ \ |  1003     "shl        v2.8h, v2.8h, #3               \n"  /* R RRRRR000 upper 5   */ \ | 
|  1101     "vshl.u8    d6, d6, #3                     \n"  /* G GGGGG000 upper 5   */ \ |  1004     "xtn        v3.8b, v2.8h                   \n"  /* RRRRR000             */ \ | 
|  1102     "vshr.u8    d1, d1, #2                     \n"  /* R 00xRRRRR lower 5   */ \ |  1005                                                                                \ | 
|  1103     "vshl.u8    q0, q0, #3                     \n"  /* B,R BBBBB000 upper 5 */ \ |  1006     "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \ | 
|  1104     "vshr.u8    q2, q0, #5                     \n"  /* B,R 00000BBB lower 3 */ \ |  1007     "shrn2      v2.16b,v0.8h, #5               \n"  /* G xxxGGGGG           */ \ | 
|  1105     "vorr.u8    d0, d0, d4                     \n"  /* B                    */ \ |  1008                                                                                \ | 
|  1106     "vshr.u8    d4, d6, #5                     \n"  /* G 00000GGG lower 3   */ \ |  1009     "ushr       v1.16b, v3.16b, #5             \n"  /* R   00000RRR lower 3 */ \ | 
|  1107     "vorr.u8    d2, d1, d5                     \n"  /* R                    */ \ |  1010     "shl        v0.16b, v2.16b, #3             \n"  /* B,G BBBBB000 upper 5 */ \ | 
|  1108     "vorr.u8    d1, d4, d6                     \n"  /* G                    */ |  1011     "ushr       v2.16b, v0.16b, #5             \n"  /* B,G 00000BBB lower 3 */ \ | 
 |  1012                                                                                \ | 
 |  1013     "orr        v0.16b, v0.16b, v2.16b         \n"  /* B,G                  */ \ | 
 |  1014     "orr        v2.16b, v1.16b, v3.16b         \n"  /* R                    */ \ | 
 |  1015     "dup        v1.2D, v0.D[1]                 \n"  /* G */                    \ | 
|  1109  |  1016  | 
|  1110 #ifdef HAS_ARGB1555TOARGBROW_NEON |  1017 #ifdef HAS_ARGB1555TOARGBROW_NEON | 
|  1111 void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, |  1018 void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, | 
|  1112                             int pix) { |  1019                             int pix) { | 
|  1113   asm volatile ( |  1020   asm volatile ( | 
|  1114     "vmov.u8    d3, #255                       \n"  // Alpha |  1021     "movi       v3.8b, #255                    \n"  // Alpha | 
|  1115     ".p2align   2                              \n" |  | 
|  1116   "1:                                          \n" |  1022   "1:                                          \n" | 
|  1117     MEMACCESS(0) |  1023     MEMACCESS(0) | 
|  1118     "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels. |  1024     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels. | 
|  1119     "subs       %2, %2, #8                     \n"  // 8 processed per loop. |  1025     "subs       %2, %2, #8                     \n"  // 8 processed per loop. | 
|  1120     ARGB1555TOARGB |  1026     ARGB1555TOARGB | 
|  1121     MEMACCESS(1) |  1027     MEMACCESS(1) | 
|  1122     "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB. |  1028     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels | 
|  1123     "bgt        1b                             \n" |  1029     "b.gt       1b                             \n" | 
|  1124   : "+r"(src_argb1555),  // %0 |  1030   : "+r"(src_argb1555),  // %0 | 
|  1125     "+r"(dst_argb),    // %1 |  1031     "+r"(dst_argb),    // %1 | 
|  1126     "+r"(pix)          // %2 |  1032     "+r"(pix)          // %2 | 
|  1127   : |  1033   : | 
|  1128   : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List |  1034   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List | 
|  1129   ); |  1035   ); | 
|  1130 } |  1036 } | 
|  1131 #endif  // HAS_ARGB1555TOARGBROW_NEON |  1037 #endif  // HAS_ARGB1555TOARGBROW_NEON | 
|  1132  |  1038  | 
|  1133 #define ARGB4444TOARGB                                                         \ |  1039 #define ARGB4444TOARGB                                                         \ | 
|  1134     "vuzp.u8    d0, d1                         \n"  /* d0 BG, d1 RA         */ \ |  1040     "shrn       v1.8b,  v0.8h, #8              \n"  /* v1(l) AR             */ \ | 
|  1135     "vshl.u8    q2, q0, #4                     \n"  /* B,R BBBB0000         */ \ |  1041     "xtn2       v1.16b, v0.8h                  \n"  /* v1(h) GB             */ \ | 
|  1136     "vshr.u8    q1, q0, #4                     \n"  /* G,A 0000GGGG         */ \ |  1042     "shl        v2.16b, v1.16b, #4             \n"  /* B,R BBBB0000         */ \ | 
|  1137     "vshr.u8    q0, q2, #4                     \n"  /* B,R 0000BBBB         */ \ |  1043     "ushr       v3.16b, v1.16b, #4             \n"  /* G,A 0000GGGG         */ \ | 
|  1138     "vorr.u8    q0, q0, q2                     \n"  /* B,R BBBBBBBB         */ \ |  1044     "ushr       v0.16b, v2.16b, #4             \n"  /* B,R 0000BBBB         */ \ | 
|  1139     "vshl.u8    q2, q1, #4                     \n"  /* G,A GGGG0000         */ \ |  1045     "shl        v1.16b, v3.16b, #4             \n"  /* G,A GGGG0000         */ \ | 
|  1140     "vorr.u8    q1, q1, q2                     \n"  /* G,A GGGGGGGG         */ \ |  1046     "orr        v2.16b, v0.16b, v2.16b         \n"  /* B,R BBBBBBBB         */ \ | 
|  1141     "vswp.u8    d1, d2                         \n"  /* B,R,G,A -> B,G,R,A   */ |  1047     "orr        v3.16b, v1.16b, v3.16b         \n"  /* G,A GGGGGGGG         */ \ | 
 |  1048     "dup        v0.2D, v2.D[1]                 \n"                             \ | 
 |  1049     "dup        v1.2D, v3.D[1]                 \n" | 
|  1142  |  1050  | 
|  1143 #ifdef HAS_ARGB4444TOARGBROW_NEON |  1051 #ifdef HAS_ARGB4444TOARGBROW_NEON | 
|  1144 void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, |  1052 void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, | 
|  1145                             int pix) { |  1053                             int pix) { | 
|  1146   asm volatile ( |  1054   asm volatile ( | 
|  1147     "vmov.u8    d3, #255                       \n"  // Alpha |  | 
|  1148     ".p2align   2                              \n" |  | 
|  1149   "1:                                          \n" |  1055   "1:                                          \n" | 
|  1150     MEMACCESS(0) |  1056     MEMACCESS(0) | 
|  1151     "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels. |  1057     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels. | 
|  1152     "subs       %2, %2, #8                     \n"  // 8 processed per loop. |  1058     "subs       %2, %2, #8                     \n"  // 8 processed per loop. | 
|  1153     ARGB4444TOARGB |  1059     ARGB4444TOARGB | 
|  1154     MEMACCESS(1) |  1060     MEMACCESS(1) | 
|  1155     "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB. |  1061     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels | 
|  1156     "bgt        1b                             \n" |  1062     "b.gt       1b                             \n" | 
|  1157   : "+r"(src_argb4444),  // %0 |  1063   : "+r"(src_argb4444),  // %0 | 
|  1158     "+r"(dst_argb),    // %1 |  1064     "+r"(dst_argb),    // %1 | 
|  1159     "+r"(pix)          // %2 |  1065     "+r"(pix)          // %2 | 
|  1160   : |  1066   : | 
|  1161   : "cc", "memory", "q0", "q1", "q2"  // Clobber List |  1067   : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List | 
|  1162   ); |  1068   ); | 
|  1163 } |  1069 } | 
|  1164 #endif  // HAS_ARGB4444TOARGBROW_NEON |  1070 #endif  // HAS_ARGB4444TOARGBROW_NEON | 
|  1165  |  1071  | 
|  1166 #ifdef HAS_ARGBTORGB24ROW_NEON |  1072 #ifdef HAS_ARGBTORGB24ROW_NEON | 
|  1167 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { |  1073 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { | 
|  1168   asm volatile ( |  1074   asm volatile ( | 
|  1169     ".p2align   2                              \n" |  | 
|  1170   "1:                                          \n" |  1075   "1:                                          \n" | 
|  1171     MEMACCESS(0) |  1076     MEMACCESS(0) | 
|  1172     "ld4        {v1.8b-v4.8b}, [%0], #32       \n"  // load 8 pixels of ARGB. |  1077     "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load 8 ARGB pixels | 
|  1173     "subs       %2, %2, #8                     \n"  // 8 processed per loop. |  1078     "subs       %2, %2, #8                     \n"  // 8 processed per loop. | 
|  1174     MEMACCESS(1) |  1079     MEMACCESS(1) | 
|  1175     "st3        {v1.8b-v3.8b}, [%1], #24       \n"  // store 8 pixels of RGB24. |  1080     "st3        {v1.8b,v2.8b,v3.8b}, [%1], #24 \n"  // store 8 pixels of RGB24. | 
|  1176     "bgt        1b                             \n" |  1081     "b.gt       1b                             \n" | 
|  1177   : "+r"(src_argb),   // %0 |  1082   : "+r"(src_argb),   // %0 | 
|  1178     "+r"(dst_rgb24),  // %1 |  1083     "+r"(dst_rgb24),  // %1 | 
|  1179     "+r"(pix)         // %2 |  1084     "+r"(pix)         // %2 | 
|  1180   : |  1085   : | 
|  1181   : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List |  1086   : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List | 
|  1182   ); |  1087   ); | 
|  1183 } |  1088 } | 
|  1184 #endif  // HAS_ARGBTORGB24ROW_NEON |  1089 #endif  // HAS_ARGBTORGB24ROW_NEON | 
|  1185  |  1090  | 
|  1186 #ifdef HAS_ARGBTORAWROW_NEON |  1091 #ifdef HAS_ARGBTORAWROW_NEON | 
|  1187 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { |  1092 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { | 
|  1188   asm volatile ( |  1093   asm volatile ( | 
|  1189     ".p2align   2                              \n" |  | 
|  1190   "1:                                          \n" |  1094   "1:                                          \n" | 
|  1191     MEMACCESS(0) |  1095     MEMACCESS(0) | 
|  1192     "ld4        {v1.8b-v4.8b}, [%0], #32       \n"  // load b g r a |  1096     "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load b g r a | 
|  1193     "subs       %2, %2, #8                     \n"  // 8 processed per loop. |  1097     "subs       %2, %2, #8                     \n"  // 8 processed per loop. | 
|  1194     "mov        v4.8b, v2.8b                   \n"  // mov g |  1098     "orr        v4.8b, v2.8b, v2.8b            \n"  // mov g | 
|  1195     "mov        v5.8b, v1.8b                   \n"  // mov b |  1099     "orr        v5.8b, v1.8b, v1.8b            \n"  // mov b | 
|  1196     MEMACCESS(1) |  1100     MEMACCESS(1) | 
|  1197     "st3        {v3.8b-v5.8b}, [%1], #24       \n"  // store r g b |  1101     "st3        {v3.8b,v4.8b,v5.8b}, [%1], #24 \n"  // store r g b | 
|  1198     "bgt        1b                             \n" |  1102     "b.gt       1b                             \n" | 
|  1199   : "+r"(src_argb),  // %0 |  1103   : "+r"(src_argb),  // %0 | 
|  1200     "+r"(dst_raw),   // %1 |  1104     "+r"(dst_raw),   // %1 | 
|  1201     "+r"(pix)        // %2 |  1105     "+r"(pix)        // %2 | 
|  1202   : |  1106   : | 
|  1203   : "cc", "memory", "v1", "v2", "v3", "v4", "v5"  // Clobber List |  1107   : "cc", "memory", "v1", "v2", "v3", "v4", "v5"  // Clobber List | 
|  1204   ); |  1108   ); | 
|  1205 } |  1109 } | 
|  1206 #endif  // HAS_ARGBTORAWROW_NEON |  1110 #endif  // HAS_ARGBTORAWROW_NEON | 
|  1207  |  1111  | 
|  1208 #ifdef HAS_YUY2TOYROW_NEON |  1112 #ifdef HAS_YUY2TOYROW_NEON | 
|  1209 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { |  1113 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { | 
|  1210   asm volatile ( |  1114   asm volatile ( | 
|  1211     ".p2align   2                              \n" |  | 
|  1212   "1:                                          \n" |  1115   "1:                                          \n" | 
|  1213     MEMACCESS(0) |  1116     MEMACCESS(0) | 
|  1214     "ld2        {v0.16b, v1.16b}, [%0], #32    \n"  // load 16 pixels of YUY2. |  1117     "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of YUY2. | 
|  1215     "subs       %2, %2, #16                    \n"  // 16 processed per loop. |  1118     "subs       %2, %2, #16                    \n"  // 16 processed per loop. | 
|  1216     MEMACCESS(1) |  1119     MEMACCESS(1) | 
|  1217     "st1        {v0.16b}, [%1], #16            \n"  // store 16 pixels of Y. |  1120     "st1        {v0.16b}, [%1], #16            \n"  // store 16 pixels of Y. | 
|  1218     "bgt        1b                             \n" |  1121     "b.gt       1b                             \n" | 
|  1219   : "+r"(src_yuy2),  // %0 |  1122   : "+r"(src_yuy2),  // %0 | 
|  1220     "+r"(dst_y),     // %1 |  1123     "+r"(dst_y),     // %1 | 
|  1221     "+r"(pix)        // %2 |  1124     "+r"(pix)        // %2 | 
|  1222   : |  1125   : | 
|  1223   : "cc", "memory", "v0", "v1"  // Clobber List |  1126   : "cc", "memory", "v0", "v1"  // Clobber List | 
|  1224   ); |  1127   ); | 
|  1225 } |  1128 } | 
|  1226 #endif  // HAS_YUY2TOYROW_NEON |  1129 #endif  // HAS_YUY2TOYROW_NEON | 
|  1227  |  1130  | 
|  1228 #ifdef HAS_UYVYTOYROW_NEON |  1131 #ifdef HAS_UYVYTOYROW_NEON | 
|  1229 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { |  1132 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { | 
|  1230   asm volatile ( |  1133   asm volatile ( | 
|  1231     ".p2align   2                              \n" |  | 
|  1232   "1:                                          \n" |  1134   "1:                                          \n" | 
|  1233     MEMACCESS(0) |  1135     MEMACCESS(0) | 
|  1234     "ld2        {v0.16b, v1.16b}, [%0], #32    \n"  // load 16 pixels of UYVY. |  1136     "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of UYVY. | 
|  1235     "subs       %2, %2, #16                    \n"  // 16 processed per loop. |  1137     "subs       %2, %2, #16                    \n"  // 16 processed per loop. | 
|  1236     MEMACCESS(1) |  1138     MEMACCESS(1) | 
|  1237     "st1        {v1.16b}, [%1], #16            \n"  // store 16 pixels of Y. |  1139     "st1        {v1.16b}, [%1], #16            \n"  // store 16 pixels of Y. | 
|  1238     "bgt        1b                             \n" |  1140     "b.gt       1b                             \n" | 
|  1239   : "+r"(src_uyvy),  // %0 |  1141   : "+r"(src_uyvy),  // %0 | 
|  1240     "+r"(dst_y),     // %1 |  1142     "+r"(dst_y),     // %1 | 
|  1241     "+r"(pix)        // %2 |  1143     "+r"(pix)        // %2 | 
|  1242   : |  1144   : | 
|  1243   : "cc", "memory", "v0", "v1"  // Clobber List |  1145   : "cc", "memory", "v0", "v1"  // Clobber List | 
|  1244   ); |  1146   ); | 
|  1245 } |  1147 } | 
|  1246 #endif  // HAS_UYVYTOYROW_NEON |  1148 #endif  // HAS_UYVYTOYROW_NEON | 
|  1247  |  1149  | 
|  1248 #ifdef HAS_YUY2TOUV422ROW_NEON |  1150 #ifdef HAS_YUY2TOUV422ROW_NEON | 
|  1249 void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, |  1151 void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, | 
|  1250                          int pix) { |  1152                          int pix) { | 
|  1251   asm volatile ( |  1153   asm volatile ( | 
|  1252     ".p2align   2                              \n" |  | 
|  1253   "1:                                          \n" |  1154   "1:                                          \n" | 
|  1254     MEMACCESS(0) |  1155     MEMACCESS(0) | 
|  1255     "ld4        {v0.8b-v3.8b}, [%0], #32       \n"  // load 16 pixels of YUY2. |  1156     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 YUY2 pixels | 
|  1256     "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs. |  1157     "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs. | 
|  1257     MEMACCESS(1) |  1158     MEMACCESS(1) | 
|  1258     "st1        {v1.8b}, [%1], #8              \n"  // store 8 U. |  1159     "st1        {v1.8b}, [%1], #8              \n"  // store 8 U. | 
|  1259     MEMACCESS(2) |  1160     MEMACCESS(2) | 
|  1260     "st1        {v3.8b}, [%2], #8              \n"  // store 8 V. |  1161     "st1        {v3.8b}, [%2], #8              \n"  // store 8 V. | 
|  1261     "bgt        1b                             \n" |  1162     "b.gt       1b                             \n" | 
|  1262   : "+r"(src_yuy2),  // %0 |  1163   : "+r"(src_yuy2),  // %0 | 
|  1263     "+r"(dst_u),     // %1 |  1164     "+r"(dst_u),     // %1 | 
|  1264     "+r"(dst_v),     // %2 |  1165     "+r"(dst_v),     // %2 | 
|  1265     "+r"(pix)        // %3 |  1166     "+r"(pix)        // %3 | 
|  1266   : |  1167   : | 
|  1267   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List |  1168   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List | 
|  1268   ); |  1169   ); | 
|  1269 } |  1170 } | 
|  1270 #endif  // HAS_YUY2TOUV422ROW_NEON |  1171 #endif  // HAS_YUY2TOUV422ROW_NEON | 
|  1271  |  1172  | 
|  1272 #ifdef HAS_UYVYTOUV422ROW_NEON |  1173 #ifdef HAS_UYVYTOUV422ROW_NEON | 
|  1273 void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, |  1174 void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, | 
|  1274                          int pix) { |  1175                          int pix) { | 
|  1275   asm volatile ( |  1176   asm volatile ( | 
|  1276     ".p2align   2                              \n" |  | 
|  1277   "1:                                          \n" |  1177   "1:                                          \n" | 
|  1278     MEMACCESS(0) |  1178     MEMACCESS(0) | 
|  1279     "ld4        {v0.8b-v3.8b}, [%0], #32       \n"  // load 16 pixels of UYVY. |  1179     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 UYVY pixels | 
|  1280     "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs. |  1180     "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs. | 
|  1281     MEMACCESS(1) |  1181     MEMACCESS(1) | 
|  1282     "st1        {v0.8b}, [%1], #8              \n"  // store 8 U. |  1182     "st1        {v0.8b}, [%1], #8              \n"  // store 8 U. | 
|  1283     MEMACCESS(2) |  1183     MEMACCESS(2) | 
|  1284     "st1        {v2.8b}, [%2], #8              \n"  // store 8 V. |  1184     "st1        {v2.8b}, [%2], #8              \n"  // store 8 V. | 
|  1285     "bgt        1b                             \n" |  1185     "b.gt       1b                             \n" | 
|  1286   : "+r"(src_uyvy),  // %0 |  1186   : "+r"(src_uyvy),  // %0 | 
|  1287     "+r"(dst_u),     // %1 |  1187     "+r"(dst_u),     // %1 | 
|  1288     "+r"(dst_v),     // %2 |  1188     "+r"(dst_v),     // %2 | 
|  1289     "+r"(pix)        // %3 |  1189     "+r"(pix)        // %3 | 
|  1290   : |  1190   : | 
|  1291   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List |  1191   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List | 
|  1292   ); |  1192   ); | 
|  1293 } |  1193 } | 
|  1294 #endif  // HAS_UYVYTOUV422ROW_NEON |  1194 #endif  // HAS_UYVYTOUV422ROW_NEON | 
|  1295  |  1195  | 
|  1296 #ifdef HAS_YUY2TOUVROW_NEON |  1196 #ifdef HAS_YUY2TOUVROW_NEON | 
|  1297 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, |  1197 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, | 
|  1298                       uint8* dst_u, uint8* dst_v, int pix) { |  1198                       uint8* dst_u, uint8* dst_v, int pix) { | 
 |  1199   const uint8* src_yuy2b = src_yuy2 + stride_yuy2; | 
|  1299   asm volatile ( |  1200   asm volatile ( | 
|  1300     "add        %x1, %x0, %w1, sxtw            \n"  // stride + src_yuy2 |  | 
|  1301     ".p2align   2                              \n" |  | 
|  1302   "1:                                          \n" |  1201   "1:                                          \n" | 
|  1303     MEMACCESS(0) |  1202     MEMACCESS(0) | 
|  1304     "ld4        {v0.8b-v3.8b}, [%0], #32       \n"  // load 16 pixels of YUY2. |  1203     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels | 
|  1305     "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs. |  1204     "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs. | 
|  1306     MEMACCESS(1) |  1205     MEMACCESS(1) | 
|  1307     "ld4        {v4.8b-v7.8b}, [%1], #32       \n"  // load next row YUY2. |  1206     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row | 
|  1308     "urhadd     v1.8b, v1.8b, v5.8b            \n"  // average rows of U |  1207     "urhadd     v1.8b, v1.8b, v5.8b            \n"  // average rows of U | 
|  1309     "urhadd     v3.8b, v3.8b, v7.8b            \n"  // average rows of V |  1208     "urhadd     v3.8b, v3.8b, v7.8b            \n"  // average rows of V | 
|  1310     MEMACCESS(2) |  1209     MEMACCESS(2) | 
|  1311     "st1        {v1.8b}, [%2], #8              \n"  // store 8 U. |  1210     "st1        {v1.8b}, [%2], #8              \n"  // store 8 U. | 
|  1312     MEMACCESS(3) |  1211     MEMACCESS(3) | 
|  1313     "st1        {v3.8b}, [%3], #8              \n"  // store 8 V. |  1212     "st1        {v3.8b}, [%3], #8              \n"  // store 8 V. | 
|  1314     "bgt        1b                             \n" |  1213     "b.gt       1b                             \n" | 
|  1315   : "+r"(src_yuy2),     // %0 |  1214   : "+r"(src_yuy2),     // %0 | 
|  1316     "+r"(stride_yuy2),  // %1 |  1215     "+r"(src_yuy2b),    // %1 | 
|  1317     "+r"(dst_u),        // %2 |  1216     "+r"(dst_u),        // %2 | 
|  1318     "+r"(dst_v),        // %3 |  1217     "+r"(dst_v),        // %3 | 
|  1319     "+r"(pix)           // %4 |  1218     "+r"(pix)           // %4 | 
|  1320   : |  1219   : | 
|  1321   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"  // Clobber L
      ist |  1220   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", | 
 |  1221     "v5", "v6", "v7"  // Clobber List | 
|  1322   ); |  1222   ); | 
|  1323 } |  1223 } | 
|  1324 #endif  // HAS_YUY2TOUVROW_NEON |  1224 #endif  // HAS_YUY2TOUVROW_NEON | 
|  1325  |  1225  | 
|  1326 #ifdef HAS_UYVYTOUVROW_NEON |  1226 #ifdef HAS_UYVYTOUVROW_NEON | 
|  1327 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, |  1227 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, | 
|  1328                       uint8* dst_u, uint8* dst_v, int pix) { |  1228                       uint8* dst_u, uint8* dst_v, int pix) { | 
 |  1229   const uint8* src_uyvyb = src_uyvy + stride_uyvy; | 
|  1329   asm volatile ( |  1230   asm volatile ( | 
|  1330     "add        %x1, %x0, %w1, sxtw            \n"  // stride + src_uyvy |  | 
|  1331     ".p2align   2                              \n" |  | 
|  1332   "1:                                          \n" |  1231   "1:                                          \n" | 
|  1333     MEMACCESS(0) |  1232     MEMACCESS(0) | 
|  1334     "ld4        {v0.8b-v3.8b}, [%0], #32       \n"  // load 16 pixels of UYVY. |  1233     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels | 
|  1335     "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs. |  1234     "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs. | 
|  1336     MEMACCESS(1) |  1235     MEMACCESS(1) | 
|  1337     "ld4        {v4.8b-v7.8b}, [%1], #32       \n"  // load next row UYVY. |  1236     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row | 
|  1338     "urhadd     v0.8b, v0.8b, v4.8b            \n"  // average rows of U |  1237     "urhadd     v0.8b, v0.8b, v4.8b            \n"  // average rows of U | 
|  1339     "urhadd     v2.8b, v2.8b, v6.8b            \n"  // average rows of V |  1238     "urhadd     v2.8b, v2.8b, v6.8b            \n"  // average rows of V | 
|  1340     MEMACCESS(2) |  1239     MEMACCESS(2) | 
|  1341     "st1        {v0.8b}, [%2], #8              \n"  // store 8 U. |  1240     "st1        {v0.8b}, [%2], #8              \n"  // store 8 U. | 
|  1342     MEMACCESS(3) |  1241     MEMACCESS(3) | 
|  1343     "st1        {v2.8b}, [%3], #8              \n"  // store 8 V. |  1242     "st1        {v2.8b}, [%3], #8              \n"  // store 8 V. | 
|  1344     "bgt        1b                             \n" |  1243     "b.gt       1b                             \n" | 
|  1345   : "+r"(src_uyvy),     // %0 |  1244   : "+r"(src_uyvy),     // %0 | 
|  1346     "+r"(stride_uyvy),  // %1 |  1245     "+r"(src_uyvyb),    // %1 | 
|  1347     "+r"(dst_u),        // %2 |  1246     "+r"(dst_u),        // %2 | 
|  1348     "+r"(dst_v),        // %3 |  1247     "+r"(dst_v),        // %3 | 
|  1349     "+r"(pix)           // %4 |  1248     "+r"(pix)           // %4 | 
|  1350   : |  1249   : | 
|  1351   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"  // Clobber L
      ist |  1250   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", | 
 |  1251     "v5", "v6", "v7"  // Clobber List | 
|  1352   ); |  1252   ); | 
|  1353 } |  1253 } | 
|  1354 #endif  // HAS_UYVYTOUVROW_NEON |  1254 #endif  // HAS_UYVYTOUVROW_NEON | 
|  1355  |  1255  | 
|  1356 #ifdef HAS_HALFROW_NEON |  | 
|  1357 void HalfRow_NEON(const uint8* src_uv, int src_uv_stride, |  | 
|  1358                   uint8* dst_uv, int pix) { |  | 
|  1359   asm volatile ( |  | 
|  1360     // change the stride to row 2 pointer |  | 
|  1361     "add        %x1, %x0, %w1, sxtw            \n" |  | 
|  1362   "1:                                          \n" |  | 
|  1363     MEMACCESS(0) |  | 
|  1364     "ld1        {v0.16b}, [%0], #16            \n"  // load row 1 16 pixels. |  | 
|  1365     "subs       %3, %3, #16                    \n"  // 16 processed per loop |  | 
|  1366     MEMACCESS(1) |  | 
|  1367     "ld1        {v1.16b}, [%1], #16            \n"  // load row 2 16 pixels. |  | 
|  1368     "urhadd     v0.16b, v0.16b, v1.16b         \n"  // average row 1 and 2 |  | 
|  1369     MEMACCESS(2) |  | 
|  1370     "st1        {v0.16b}, [%2], #16            \n" |  | 
|  1371     "bgt        1b                             \n" |  | 
|  1372   : "+r"(src_uv),         // %0 |  | 
|  1373     "+r"(src_uv_stride),  // %1 |  | 
|  1374     "+r"(dst_uv),         // %2 |  | 
|  1375     "+r"(pix)             // %3 |  | 
|  1376   : |  | 
|  1377   : "cc", "memory", "v0", "v1"  // Clobber List |  | 
|  1378   ); |  | 
|  1379 } |  | 
|  1380 #endif  // HAS_HALFROW_NEON |  | 
|  1381  |  | 
|  1382 // Select 2 channels from ARGB on alternating pixels.  e.g.  BGBGBGBG |  | 
|  1383 #ifdef HAS_ARGBTOBAYERROW_NEON |  | 
|  1384 void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, |  | 
|  1385                          uint32 selector, int pix) { |  | 
|  1386   asm volatile ( |  | 
|  1387     "mov        v2.s[0], %w3                   \n"  // selector |  | 
|  1388   "1:                                          \n" |  | 
|  1389     MEMACCESS(0) |  | 
|  1390     "ld1        {v0.16b, v1.16b}, [%0], 32     \n"  // load row 8 pixels. |  | 
|  1391     "subs       %2, %2, #8                     \n"  // 8 processed per loop |  | 
|  1392     "tbl        v4.8b, {v0.16b}, v2.8b         \n"  // look up 4 pixels |  | 
|  1393     "tbl        v5.8b, {v1.16b}, v2.8b         \n"  // look up 4 pixels |  | 
|  1394     "trn1       v4.4s, v4.4s, v5.4s            \n"  // combine 8 pixels |  | 
|  1395     MEMACCESS(1) |  | 
|  1396     "st1        {v4.8b}, [%1], #8              \n"  // store 8. |  | 
|  1397     "bgt        1b                             \n" |  | 
|  1398   : "+r"(src_argb),   // %0 |  | 
|  1399     "+r"(dst_bayer),  // %1 |  | 
|  1400     "+r"(pix)         // %2 |  | 
|  1401   : "r"(selector)     // %3 |  | 
|  1402   : "cc", "memory", "v0", "v1", "v2", "v4", "v5"   // Clobber List |  | 
|  1403   ); |  | 
|  1404 } |  | 
|  1405 #endif  // HAS_ARGBTOBAYERROW_NEON |  | 
|  1406  |  | 
|  1407 // Select G channels from ARGB.  e.g.  GGGGGGGG |  1256 // Select G channels from ARGB.  e.g.  GGGGGGGG | 
|  1408 #ifdef HAS_ARGBTOBAYERGGROW_NEON |  1257 #ifdef HAS_ARGBTOBAYERGGROW_NEON | 
|  1409 void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, |  1258 void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, | 
|  1410                            uint32 /*selector*/, int pix) { |  1259                            uint32 /*selector*/, int pix) { | 
|  1411   asm volatile ( |  1260   asm volatile ( | 
|  1412   "1:                                          \n" |  1261   "1:                                          \n" | 
|  1413     MEMACCESS(0) |  1262     MEMACCESS(0) | 
|  1414     "ld4        {v0.8b-v3.8b}, [%0], #32       \n"  // load row 8 pixels. |  1263     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load row 8 pixels | 
|  1415     "subs       %2, %2, #8                     \n"  // 8 processed per loop |  1264     "subs       %2, %2, #8                     \n"  // 8 processed per loop | 
|  1416     MEMACCESS(1) |  1265     MEMACCESS(1) | 
|  1417     "st1        {v1.8b}, [%1], #8              \n"  // store 8 G's. |  1266     "st1        {v1.8b}, [%1], #8              \n"  // store 8 G's. | 
|  1418     "bgt        1b                             \n" |  1267     "b.gt       1b                             \n" | 
|  1419   : "+r"(src_argb),   // %0 |  1268   : "+r"(src_argb),   // %0 | 
|  1420     "+r"(dst_bayer),  // %1 |  1269     "+r"(dst_bayer),  // %1 | 
|  1421     "+r"(pix)         // %2 |  1270     "+r"(pix)         // %2 | 
|  1422   : |  1271   : | 
|  1423   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List |  1272   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List | 
|  1424   ); |  1273   ); | 
|  1425 } |  1274 } | 
|  1426 #endif  // HAS_ARGBTOBAYERGGROW_NEON |  1275 #endif  // HAS_ARGBTOBAYERGGROW_NEON | 
|  1427  |  1276  | 
|  1428 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. |  1277 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. | 
|  1429 #ifdef HAS_ARGBSHUFFLEROW_NEON |  1278 #ifdef HAS_ARGBSHUFFLEROW_NEON | 
|  1430 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, |  1279 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, | 
|  1431                          const uint8* shuffler, int pix) { |  1280                          const uint8* shuffler, int pix) { | 
|  1432   asm volatile ( |  1281   asm volatile ( | 
|  1433     MEMACCESS(3) |  1282     MEMACCESS(3) | 
|  1434     "ld1        {v2.16b}, [%3]                 \n"  // shuffler |  1283     "ld1        {v2.16b}, [%3]                 \n"  // shuffler | 
|  1435   "1:                                          \n" |  1284   "1:                                          \n" | 
|  1436     MEMACCESS(0) |  1285     MEMACCESS(0) | 
|  1437     "ld1        {v0.16b}, [%0], #16            \n"  // load 4 pixels. |  1286     "ld1        {v0.16b}, [%0], #16            \n"  // load 4 pixels. | 
|  1438     "subs       %2, %2, #4                     \n"  // 4 processed per loop |  1287     "subs       %2, %2, #4                     \n"  // 4 processed per loop | 
|  1439     "tbl        v1.16b, {v0.16b}, v2.16b       \n"  // look up 4 pixels |  1288     "tbl        v1.16b, {v0.16b}, v2.16b       \n"  // look up 4 pixels | 
|  1440     MEMACCESS(1) |  1289     MEMACCESS(1) | 
|  1441     "st1        {v1.16b}, [%1], #16            \n"  // store 4. |  1290     "st1        {v1.16b}, [%1], #16            \n"  // store 4. | 
|  1442     "bgt        1b                             \n" |  1291     "b.gt       1b                             \n" | 
|  1443   : "+r"(src_argb),  // %0 |  1292   : "+r"(src_argb),  // %0 | 
|  1444     "+r"(dst_argb),  // %1 |  1293     "+r"(dst_argb),  // %1 | 
|  1445     "+r"(pix)        // %2 |  1294     "+r"(pix)        // %2 | 
|  1446   : "r"(shuffler)    // %3 |  1295   : "r"(shuffler)    // %3 | 
|  1447   : "cc", "memory", "v0", "v1", "v2"  // Clobber List |  1296   : "cc", "memory", "v0", "v1", "v2"  // Clobber List | 
|  1448   ); |  1297   ); | 
|  1449 } |  1298 } | 
|  1450 #endif  // HAS_ARGBSHUFFLEROW_NEON |  1299 #endif  // HAS_ARGBSHUFFLEROW_NEON | 
|  1451  |  1300  | 
|  1452 #ifdef HAS_I422TOYUY2ROW_NEON |  1301 #ifdef HAS_I422TOYUY2ROW_NEON | 
|  1453 void I422ToYUY2Row_NEON(const uint8* src_y, |  1302 void I422ToYUY2Row_NEON(const uint8* src_y, | 
|  1454                         const uint8* src_u, |  1303                         const uint8* src_u, | 
|  1455                         const uint8* src_v, |  1304                         const uint8* src_v, | 
|  1456                         uint8* dst_yuy2, int width) { |  1305                         uint8* dst_yuy2, int width) { | 
|  1457   asm volatile ( |  1306   asm volatile ( | 
|  1458     ".p2align   2                              \n" |  | 
|  1459   "1:                                          \n" |  1307   "1:                                          \n" | 
|  1460     MEMACCESS(0) |  1308     MEMACCESS(0) | 
|  1461     "ld2        {v0.8b, v1.8b}, [%0], #16      \n"  // load 16 Ys |  1309     "ld2        {v0.8b, v1.8b}, [%0], #16      \n"  // load 16 Ys | 
|  1462     "mov        v2.8b, v1.8b                   \n" |  1310     "orr        v2.8b, v1.8b, v1.8b            \n" | 
|  1463     MEMACCESS(1) |  1311     MEMACCESS(1) | 
|  1464     "ld1        {v1.8b}, [%1], #8              \n"  // load 8 Us |  1312     "ld1        {v1.8b}, [%1], #8              \n"  // load 8 Us | 
|  1465     MEMACCESS(2) |  1313     MEMACCESS(2) | 
|  1466     "ld1        {v3.8b}, [%2], #8              \n"  // load 8 Vs |  1314     "ld1        {v3.8b}, [%2], #8              \n"  // load 8 Vs | 
|  1467     "subs       %4, %4, #16                    \n"  // 16 pixels |  1315     "subs       %4, %4, #16                    \n"  // 16 pixels | 
|  1468     MEMACCESS(3) |  1316     MEMACCESS(3) | 
|  1469     "st4        {v0.8b-v3.8b}, [%3], #32       \n"  // Store 8 YUY2/16 pixels. |  1317     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels. | 
|  1470     "bgt        1b                             \n" |  1318     "b.gt       1b                             \n" | 
|  1471   : "+r"(src_y),     // %0 |  1319   : "+r"(src_y),     // %0 | 
|  1472     "+r"(src_u),     // %1 |  1320     "+r"(src_u),     // %1 | 
|  1473     "+r"(src_v),     // %2 |  1321     "+r"(src_v),     // %2 | 
|  1474     "+r"(dst_yuy2),  // %3 |  1322     "+r"(dst_yuy2),  // %3 | 
|  1475     "+r"(width)      // %4 |  1323     "+r"(width)      // %4 | 
|  1476   : |  1324   : | 
|  1477   : "cc", "memory", "v0", "v1", "v2", "v3" |  1325   : "cc", "memory", "v0", "v1", "v2", "v3" | 
|  1478   ); |  1326   ); | 
|  1479 } |  1327 } | 
|  1480 #endif  // HAS_I422TOYUY2ROW_NEON |  1328 #endif  // HAS_I422TOYUY2ROW_NEON | 
|  1481  |  1329  | 
|  1482 #ifdef HAS_I422TOUYVYROW_NEON |  1330 #ifdef HAS_I422TOUYVYROW_NEON | 
|  1483 void I422ToUYVYRow_NEON(const uint8* src_y, |  1331 void I422ToUYVYRow_NEON(const uint8* src_y, | 
|  1484                         const uint8* src_u, |  1332                         const uint8* src_u, | 
|  1485                         const uint8* src_v, |  1333                         const uint8* src_v, | 
|  1486                         uint8* dst_uyvy, int width) { |  1334                         uint8* dst_uyvy, int width) { | 
|  1487   asm volatile ( |  1335   asm volatile ( | 
|  1488     ".p2align   2                              \n" |  | 
|  1489   "1:                                          \n" |  1336   "1:                                          \n" | 
|  1490     MEMACCESS(0) |  1337     MEMACCESS(0) | 
|  1491     "ld2        {v1.8b, v2.8b}, [%0], #16      \n"  // load 16 Ys |  1338     "ld2        {v1.8b,v2.8b}, [%0], #16       \n"  // load 16 Ys | 
|  1492     "mov        v3.8b, v2.8b                   \n" |  1339     "orr        v3.8b, v2.8b, v2.8b            \n" | 
|  1493     MEMACCESS(1) |  1340     MEMACCESS(1) | 
|  1494     "ld1        {v0.8b}, [%1], #8              \n"  // load 8 Us |  1341     "ld1        {v0.8b}, [%1], #8              \n"  // load 8 Us | 
|  1495     MEMACCESS(2) |  1342     MEMACCESS(2) | 
|  1496     "ld1        {v2.8b}, [%2], #8              \n"  // load 8 Vs |  1343     "ld1        {v2.8b}, [%2], #8              \n"  // load 8 Vs | 
|  1497     "subs       %4, %4, #16                    \n"  // 16 pixels |  1344     "subs       %4, %4, #16                    \n"  // 16 pixels | 
|  1498     MEMACCESS(3) |  1345     MEMACCESS(3) | 
|  1499     "st4        {v0.8b-v3.8b}, [%3], #32       \n"  // Store 8 UYVY/16 pixels. |  1346     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels. | 
|  1500     "bgt        1b                             \n" |  1347     "b.gt       1b                             \n" | 
|  1501   : "+r"(src_y),     // %0 |  1348   : "+r"(src_y),     // %0 | 
|  1502     "+r"(src_u),     // %1 |  1349     "+r"(src_u),     // %1 | 
|  1503     "+r"(src_v),     // %2 |  1350     "+r"(src_v),     // %2 | 
|  1504     "+r"(dst_uyvy),  // %3 |  1351     "+r"(dst_uyvy),  // %3 | 
|  1505     "+r"(width)      // %4 |  1352     "+r"(width)      // %4 | 
|  1506   : |  1353   : | 
|  1507   : "cc", "memory", "v0", "v1", "v2", "v3" |  1354   : "cc", "memory", "v0", "v1", "v2", "v3" | 
|  1508   ); |  1355   ); | 
|  1509 } |  1356 } | 
|  1510 #endif  // HAS_I422TOUYVYROW_NEON |  1357 #endif  // HAS_I422TOUYVYROW_NEON | 
|  1511  |  1358  | 
|  1512 #ifdef HAS_ARGBTORGB565ROW_NEON |  1359 #ifdef HAS_ARGBTORGB565ROW_NEON | 
|  1513 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { |  1360 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { | 
|  1514   asm volatile ( |  1361   asm volatile ( | 
|  1515     ".p2align   2                              \n" |  | 
|  1516   "1:                                          \n" |  1362   "1:                                          \n" | 
|  1517     MEMACCESS(0) |  1363     MEMACCESS(0) | 
|  1518     "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB. |  1364     "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels | 
|  1519     "subs       %2, %2, #8                     \n"  // 8 processed per loop. |  1365     "subs       %2, %2, #8                     \n"  // 8 processed per loop. | 
|  1520     ARGBTORGB565 |  1366     ARGBTORGB565 | 
|  1521     MEMACCESS(1) |  1367     MEMACCESS(1) | 
|  1522     "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels RGB565. |  1368     "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels RGB565. | 
|  1523     "bgt        1b                             \n" |  1369     "b.gt       1b                             \n" | 
|  1524   : "+r"(src_argb),  // %0 |  1370   : "+r"(src_argb),  // %0 | 
|  1525     "+r"(dst_rgb565),  // %1 |  1371     "+r"(dst_rgb565),  // %1 | 
|  1526     "+r"(pix)        // %2 |  1372     "+r"(pix)        // %2 | 
|  1527   : |  1373   : | 
|  1528   : "cc", "memory", "q0", "q8", "q9", "q10", "q11" |  1374   : "cc", "memory", "v0", "v20", "v21", "v22", "v23" | 
|  1529   ); |  1375   ); | 
|  1530 } |  1376 } | 
|  1531 #endif  // HAS_ARGBTORGB565ROW_NEON |  1377 #endif  // HAS_ARGBTORGB565ROW_NEON | 
|  1532  |  1378  | 
|  1533 #ifdef HAS_ARGBTOARGB1555ROW_NEON |  1379 #ifdef HAS_ARGBTOARGB1555ROW_NEON | 
|  1534 void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, |  1380 void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, | 
|  1535                             int pix) { |  1381                             int pix) { | 
|  1536   asm volatile ( |  1382   asm volatile ( | 
|  1537     ".p2align   2                              \n" |  | 
|  1538   "1:                                          \n" |  1383   "1:                                          \n" | 
|  1539     MEMACCESS(0) |  1384     MEMACCESS(0) | 
|  1540     "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB. |  1385     "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels | 
|  1541     "subs       %2, %2, #8                     \n"  // 8 processed per loop. |  1386     "subs       %2, %2, #8                     \n"  // 8 processed per loop. | 
|  1542     ARGBTOARGB1555 |  1387     ARGBTOARGB1555 | 
|  1543     MEMACCESS(1) |  1388     MEMACCESS(1) | 
|  1544     "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels ARGB1555. |  1389     "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels ARGB1555. | 
|  1545     "bgt        1b                             \n" |  1390     "b.gt       1b                             \n" | 
|  1546   : "+r"(src_argb),  // %0 |  1391   : "+r"(src_argb),  // %0 | 
|  1547     "+r"(dst_argb1555),  // %1 |  1392     "+r"(dst_argb1555),  // %1 | 
|  1548     "+r"(pix)        // %2 |  1393     "+r"(pix)        // %2 | 
|  1549   : |  1394   : | 
|  1550   : "cc", "memory", "q0", "q8", "q9", "q10", "q11" |  1395   : "cc", "memory", "v0", "v20", "v21", "v22", "v23" | 
|  1551   ); |  1396   ); | 
|  1552 } |  1397 } | 
|  1553 #endif  // HAS_ARGBTOARGB1555ROW_NEON |  1398 #endif  // HAS_ARGBTOARGB1555ROW_NEON | 
|  1554  |  1399  | 
|  1555 #ifdef HAS_ARGBTOARGB4444ROW_NEON |  1400 #ifdef HAS_ARGBTOARGB4444ROW_NEON | 
|  1556 void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, |  1401 void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, | 
|  1557                             int pix) { |  1402                             int pix) { | 
|  1558   asm volatile ( |  1403   asm volatile ( | 
|  1559     "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic. |  1404     "movi       v4.16b, #0x0f                  \n"  // bits to clear with vbic. | 
|  1560     ".p2align   2                              \n" |  | 
|  1561   "1:                                          \n" |  1405   "1:                                          \n" | 
|  1562     MEMACCESS(0) |  1406     MEMACCESS(0) | 
|  1563     "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB. |  1407     "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels | 
|  1564     "subs       %2, %2, #8                     \n"  // 8 processed per loop. |  1408     "subs       %2, %2, #8                     \n"  // 8 processed per loop. | 
|  1565     ARGBTOARGB4444 |  1409     ARGBTOARGB4444 | 
|  1566     MEMACCESS(1) |  1410     MEMACCESS(1) | 
|  1567     "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels ARGB4444. |  1411     "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels ARGB4444. | 
|  1568     "bgt        1b                             \n" |  1412     "b.gt       1b                             \n" | 
|  1569   : "+r"(src_argb),      // %0 |  1413   : "+r"(src_argb),      // %0 | 
|  1570     "+r"(dst_argb4444),  // %1 |  1414     "+r"(dst_argb4444),  // %1 | 
|  1571     "+r"(pix)            // %2 |  1415     "+r"(pix)            // %2 | 
|  1572   : |  1416   : | 
|  1573   : "cc", "memory", "q0", "q8", "q9", "q10", "q11" |  1417   : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23" | 
|  1574   ); |  1418   ); | 
|  1575 } |  1419 } | 
|  1576 #endif  // HAS_ARGBTOARGB4444ROW_NEON |  1420 #endif  // HAS_ARGBTOARGB4444ROW_NEON | 
|  1577  |  1421  | 
|  1578 #ifdef HAS_ARGBTOYROW_NEON |  1422 #ifdef HAS_ARGBTOYROW_NEON | 
|  1579 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { |  1423 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { | 
|  1580   asm volatile ( |  1424   asm volatile ( | 
|  1581     "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient |  1425     "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient | 
|  1582     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient |  1426     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient | 
|  1583     "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient |  1427     "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient | 
|  1584     "movi       v7.8b, #16                     \n"  // Add 16 constant |  1428     "movi       v7.8b, #16                     \n"  // Add 16 constant | 
|  1585     ".p2align   2                              \n" |  | 
|  1586   "1:                                          \n" |  1429   "1:                                          \n" | 
|  1587     MEMACCESS(0) |  1430     MEMACCESS(0) | 
|  1588     "ld4        {v0.8b-v3.8b}, [%0], #32       \n"  // load 8 ARGB pixels. |  1431     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels. | 
|  1589     "subs       %2, %2, #8                     \n"  // 8 processed per loop. |  1432     "subs       %2, %2, #8                     \n"  // 8 processed per loop. | 
|  1590     "umull      v3.8h, v0.8b, v4.8b            \n"  // B |  1433     "umull      v3.8h, v0.8b, v4.8b            \n"  // B | 
|  1591     "umlal      v3.8h, v1.8b, v5.8b            \n"  // G |  1434     "umlal      v3.8h, v1.8b, v5.8b            \n"  // G | 
|  1592     "umlal      v3.8h, v2.8b, v6.8b            \n"  // R |  1435     "umlal      v3.8h, v2.8b, v6.8b            \n"  // R | 
|  1593     "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y |  1436     "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y | 
|  1594     "uqadd      v0.8b, v0.8b, v7.8b            \n" |  1437     "uqadd      v0.8b, v0.8b, v7.8b            \n" | 
|  1595     MEMACCESS(1) |  1438     MEMACCESS(1) | 
|  1596     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y. |  1439     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y. | 
|  1597     "bgt        1b                             \n" |  1440     "b.gt       1b                             \n" | 
|  1598   : "+r"(src_argb),  // %0 |  1441   : "+r"(src_argb),  // %0 | 
|  1599     "+r"(dst_y),     // %1 |  1442     "+r"(dst_y),     // %1 | 
|  1600     "+r"(pix)        // %2 |  1443     "+r"(pix)        // %2 | 
|  1601   : |  1444   : | 
|  1602   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" |  1445   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" | 
|  1603   ); |  1446   ); | 
|  1604 } |  1447 } | 
|  1605 #endif  // HAS_ARGBTOYROW_NEON |  1448 #endif  // HAS_ARGBTOYROW_NEON | 
|  1606  |  1449  | 
|  1607 #ifdef HAS_ARGBTOYJROW_NEON |  1450 #ifdef HAS_ARGBTOYJROW_NEON | 
|  1608 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { |  1451 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { | 
|  1609   asm volatile ( |  1452   asm volatile ( | 
|  1610     "movi       v4.8b, #15                     \n"  // B * 0.11400 coefficient |  1453     "movi       v4.8b, #15                     \n"  // B * 0.11400 coefficient | 
|  1611     "movi       v5.8b, #75                     \n"  // G * 0.58700 coefficient |  1454     "movi       v5.8b, #75                     \n"  // G * 0.58700 coefficient | 
|  1612     "movi       v6.8b, #38                     \n"  // R * 0.29900 coefficient |  1455     "movi       v6.8b, #38                     \n"  // R * 0.29900 coefficient | 
|  1613     ".p2align   2                              \n" |  | 
|  1614   "1:                                          \n" |  1456   "1:                                          \n" | 
|  1615     MEMACCESS(0) |  1457     MEMACCESS(0) | 
|  1616     "ld4        {v0.8b-v3.8b}, [%0], #32       \n"  // load 8 ARGB pixels. |  1458     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels. | 
|  1617     "subs       %2, %2, #8                     \n"  // 8 processed per loop. |  1459     "subs       %2, %2, #8                     \n"  // 8 processed per loop. | 
|  1618     "umull      v3.8h, v0.8b, v4.8b            \n"  // B |  1460     "umull      v3.8h, v0.8b, v4.8b            \n"  // B | 
|  1619     "umlal      v3.8h, v1.8b, v5.8b            \n"  // G |  1461     "umlal      v3.8h, v1.8b, v5.8b            \n"  // G | 
|  1620     "umlal      v3.8h, v2.8b, v6.8b            \n"  // R |  1462     "umlal      v3.8h, v2.8b, v6.8b            \n"  // R | 
|  1621     "sqrshrun   v0.8b, v3.8h, #7               \n"  // 15 bit to 8 bit Y |  1463     "sqrshrun   v0.8b, v3.8h, #7               \n"  // 15 bit to 8 bit Y | 
|  1622     MEMACCESS(1) |  1464     MEMACCESS(1) | 
|  1623     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y. |  1465     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y. | 
|  1624     "bgt        1b                             \n" |  1466     "b.gt       1b                             \n" | 
|  1625   : "+r"(src_argb),  // %0 |  1467   : "+r"(src_argb),  // %0 | 
|  1626     "+r"(dst_y),     // %1 |  1468     "+r"(dst_y),     // %1 | 
|  1627     "+r"(pix)        // %2 |  1469     "+r"(pix)        // %2 | 
|  1628   : |  1470   : | 
|  1629   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" |  1471   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" | 
|  1630   ); |  1472   ); | 
|  1631 } |  1473 } | 
|  1632 #endif  // HAS_ARGBTOYJROW_NEON |  1474 #endif  // HAS_ARGBTOYJROW_NEON | 
|  1633  |  1475  | 
|  1634 // 8x1 pixels. |  1476 // 8x1 pixels. | 
|  1635 #ifdef HAS_ARGBTOUV444ROW_NEON |  1477 #ifdef HAS_ARGBTOUV444ROW_NEON | 
|  1636 void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, |  1478 void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, | 
|  1637                          int pix) { |  1479                          int pix) { | 
|  1638   asm volatile ( |  1480   asm volatile ( | 
|  1639     "vmov.u8    d24, #112                      \n"  // UB / VR 0.875 coefficient |  1481     "movi       v24.8b, #112                   \n"  // UB / VR 0.875 coefficient | 
|  1640     "vmov.u8    d25, #74                       \n"  // UG -0.5781 coefficient |  1482     "movi       v25.8b, #74                    \n"  // UG -0.5781 coefficient | 
|  1641     "vmov.u8    d26, #38                       \n"  // UR -0.2969 coefficient |  1483     "movi       v26.8b, #38                    \n"  // UR -0.2969 coefficient | 
|  1642     "vmov.u8    d27, #18                       \n"  // VB -0.1406 coefficient |  1484     "movi       v27.8b, #18                    \n"  // VB -0.1406 coefficient | 
|  1643     "vmov.u8    d28, #94                       \n"  // VG -0.7344 coefficient |  1485     "movi       v28.8b, #94                    \n"  // VG -0.7344 coefficient | 
|  1644     "vmov.u16   q15, #0x8080                   \n"  // 128.5 |  1486     "movi       v29.16b,#0x80                  \n"  // 128.5 | 
|  1645     ".p2align   2                              \n" |  | 
|  1646   "1:                                          \n" |  1487   "1:                                          \n" | 
|  1647     MEMACCESS(0) |  1488     MEMACCESS(0) | 
|  1648     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels. |  1489     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels. | 
|  1649     "subs       %3, %3, #8                     \n"  // 8 processed per loop. |  1490     "subs       %3, %3, #8                     \n"  // 8 processed per loop. | 
|  1650     "vmull.u8   q2, d0, d24                    \n"  // B |  1491     "umull      v4.8h, v0.8b, v24.8b           \n"  // B | 
|  1651     "vmlsl.u8   q2, d1, d25                    \n"  // G |  1492     "umlsl      v4.8h, v1.8b, v25.8b           \n"  // G | 
|  1652     "vmlsl.u8   q2, d2, d26                    \n"  // R |  1493     "umlsl      v4.8h, v2.8b, v26.8b           \n"  // R | 
|  1653     "vadd.u16   q2, q2, q15                    \n"  // +128 -> unsigned |  1494     "add        v4.8h, v4.8h, v29.8h           \n"  // +128 -> unsigned | 
|  1654  |  1495  | 
|  1655     "vmull.u8   q3, d2, d24                    \n"  // R |  1496     "umull      v3.8h, v2.8b, v24.8b           \n"  // R | 
|  1656     "vmlsl.u8   q3, d1, d28                    \n"  // G |  1497     "umlsl      v3.8h, v1.8b, v28.8b           \n"  // G | 
|  1657     "vmlsl.u8   q3, d0, d27                    \n"  // B |  1498     "umlsl      v3.8h, v0.8b, v27.8b           \n"  // B | 
|  1658     "vadd.u16   q3, q3, q15                    \n"  // +128 -> unsigned |  1499     "add        v3.8h, v3.8h, v29.8h           \n"  // +128 -> unsigned | 
|  1659  |  1500  | 
|  1660     "vqshrn.u16  d0, q2, #8                    \n"  // 16 bit to 8 bit U |  1501     "uqshrn     v0.8b, v4.8h, #8               \n"  // 16 bit to 8 bit U | 
|  1661     "vqshrn.u16  d1, q3, #8                    \n"  // 16 bit to 8 bit V |  1502     "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V | 
|  1662  |  1503  | 
|  1663     MEMACCESS(1) |  1504     MEMACCESS(1) | 
|  1664     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U. |  1505     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U. | 
|  1665     MEMACCESS(2) |  1506     MEMACCESS(2) | 
|  1666     "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V. |  1507     "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V. | 
|  1667     "bgt        1b                             \n" |  1508     "b.gt       1b                             \n" | 
|  1668   : "+r"(src_argb),  // %0 |  1509   : "+r"(src_argb),  // %0 | 
|  1669     "+r"(dst_u),     // %1 |  1510     "+r"(dst_u),     // %1 | 
|  1670     "+r"(dst_v),     // %2 |  1511     "+r"(dst_v),     // %2 | 
|  1671     "+r"(pix)        // %3 |  1512     "+r"(pix)        // %3 | 
|  1672   : |  1513   : | 
|  1673   : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15" |  1514   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", | 
 |  1515     "v24", "v25", "v26", "v27", "v28", "v29" | 
|  1674   ); |  1516   ); | 
|  1675 } |  1517 } | 
|  1676 #endif  // HAS_ARGBTOUV444ROW_NEON |  1518 #endif  // HAS_ARGBTOUV444ROW_NEON | 
|  1677  |  1519  | 
|  1678 // 16x1 pixels -> 8x1.  pix is number of argb pixels. e.g. 16. |  1520 // 16x1 pixels -> 8x1.  pix is number of argb pixels. e.g. 16. | 
|  1679 #ifdef HAS_ARGBTOUV422ROW_NEON |  1521 #ifdef HAS_ARGBTOUV422ROW_NEON | 
|  1680 void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, |  1522 void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, | 
|  1681                          int pix) { |  1523                          int pix) { | 
|  1682   asm volatile ( |  1524   asm volatile ( | 
|  1683     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient |  1525     RGBTOUV_SETUP_REG | 
|  1684     "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient |  | 
|  1685     "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient |  | 
|  1686     "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient |  | 
|  1687     "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient |  | 
|  1688     "vmov.u16   q15, #0x8080                   \n"  // 128.5 |  | 
|  1689     ".p2align   2                              \n" |  | 
|  1690   "1:                                          \n" |  1526   "1:                                          \n" | 
|  1691     MEMACCESS(0) |  1527     MEMACCESS(0) | 
|  1692     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels. |  1528     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels. | 
|  1693     MEMACCESS(0) |  | 
|  1694     "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels. |  | 
|  1695  |  1529  | 
|  1696     "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts. |  1530     "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts. | 
|  1697     "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts. |  1531     "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts. | 
|  1698     "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts. |  1532     "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts. | 
|  1699  |  1533  | 
|  1700     "subs       %3, %3, #16                    \n"  // 16 processed per loop. |  1534     "subs       %3, %3, #16                    \n"  // 16 processed per loop. | 
|  1701     "vmul.s16   q8, q0, q10                    \n"  // B |  1535     "mul        v3.8h, v0.8h, v20.8h           \n"  // B | 
|  1702     "vmls.s16   q8, q1, q11                    \n"  // G |  1536     "mls        v3.8h, v1.8h, v21.8h           \n"  // G | 
|  1703     "vmls.s16   q8, q2, q12                    \n"  // R |  1537     "mls        v3.8h, v2.8h, v22.8h           \n"  // R | 
|  1704     "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned |  1538     "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned | 
|  1705  |  1539  | 
|  1706     "vmul.s16   q9, q2, q10                    \n"  // R |  1540     "mul        v4.8h, v2.8h, v20.8h           \n"  // R | 
|  1707     "vmls.s16   q9, q1, q14                    \n"  // G |  1541     "mls        v4.8h, v1.8h, v24.8h           \n"  // G | 
|  1708     "vmls.s16   q9, q0, q13                    \n"  // B |  1542     "mls        v4.8h, v0.8h, v23.8h           \n"  // B | 
|  1709     "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned |  1543     "add        v4.8h, v4.8h, v25.8h           \n"  // +128 -> unsigned | 
|  1710  |  1544  | 
|  1711     "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U |  1545     "uqshrn     v0.8b, v3.8h, #8               \n"  // 16 bit to 8 bit U | 
|  1712     "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V |  1546     "uqshrn     v1.8b, v4.8h, #8               \n"  // 16 bit to 8 bit V | 
|  1713  |  1547  | 
|  1714     MEMACCESS(1) |  1548     MEMACCESS(1) | 
|  1715     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U. |  1549     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U. | 
|  1716     MEMACCESS(2) |  1550     MEMACCESS(2) | 
|  1717     "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V. |  1551     "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V. | 
|  1718     "bgt        1b                             \n" |  1552     "b.gt       1b                             \n" | 
|  1719   : "+r"(src_argb),  // %0 |  1553   : "+r"(src_argb),  // %0 | 
|  1720     "+r"(dst_u),     // %1 |  1554     "+r"(dst_u),     // %1 | 
|  1721     "+r"(dst_v),     // %2 |  1555     "+r"(dst_v),     // %2 | 
|  1722     "+r"(pix)        // %3 |  1556     "+r"(pix)        // %3 | 
|  1723   : |  1557   : | 
|  1724   : "cc", "memory", "q0", "q1", "q2", "q3", |  1558   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 
|  1725     "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |  1559     "v20", "v21", "v22", "v23", "v24", "v25" | 
|  1726   ); |  1560   ); | 
|  1727 } |  1561 } | 
|  1728 #endif  // HAS_ARGBTOUV422ROW_NEON |  1562 #endif  // HAS_ARGBTOUV422ROW_NEON | 
|  1729  |  1563  | 
|  1730 // 32x1 pixels -> 8x1.  pix is number of argb pixels. e.g. 32. |  1564 // 32x1 pixels -> 8x1.  pix is number of argb pixels. e.g. 32. | 
|  1731 #ifdef HAS_ARGBTOUV411ROW_NEON |  1565 #ifdef HAS_ARGBTOUV411ROW_NEON | 
|  1732 void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, |  1566 void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, | 
|  1733                          int pix) { |  1567                          int pix) { | 
|  1734   asm volatile ( |  1568   asm volatile ( | 
|  1735     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient |  1569     RGBTOUV_SETUP_REG | 
|  1736     "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient |  | 
|  1737     "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient |  | 
|  1738     "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient |  | 
|  1739     "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient |  | 
|  1740     "vmov.u16   q15, #0x8080                   \n"  // 128.5 |  | 
|  1741     ".p2align   2                              \n" |  | 
|  1742   "1:                                          \n" |  1570   "1:                                          \n" | 
|  1743     MEMACCESS(0) |  1571     MEMACCESS(0) | 
|  1744     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels. |  1572     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels. | 
 |  1573     "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts. | 
 |  1574     "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts. | 
 |  1575     "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts. | 
|  1745     MEMACCESS(0) |  1576     MEMACCESS(0) | 
|  1746     "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels. |  1577     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n"  // load next 16. | 
|  1747     "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts. |  1578     "uaddlp     v4.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts. | 
|  1748     "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts. |  1579     "uaddlp     v5.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts. | 
|  1749     "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts. |  1580     "uaddlp     v6.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts. | 
|  1750     MEMACCESS(0) |  | 
|  1751     "vld4.8     {d8, d10, d12, d14}, [%0]!     \n"  // load 8 more ARGB pixels. |  | 
|  1752     MEMACCESS(0) |  | 
|  1753     "vld4.8     {d9, d11, d13, d15}, [%0]!     \n"  // load last 8 ARGB pixels. |  | 
|  1754     "vpaddl.u8  q4, q4                         \n"  // B 16 bytes -> 8 shorts. |  | 
|  1755     "vpaddl.u8  q5, q5                         \n"  // G 16 bytes -> 8 shorts. |  | 
|  1756     "vpaddl.u8  q6, q6                         \n"  // R 16 bytes -> 8 shorts. |  | 
|  1757  |  1581  | 
|  1758     "vpadd.u16  d0, d0, d1                     \n"  // B 16 shorts -> 8 shorts. |  1582     "addp       v0.8h, v0.8h, v4.8h            \n"  // B 16 shorts -> 8 shorts. | 
|  1759     "vpadd.u16  d1, d8, d9                     \n"  // B |  1583     "addp       v1.8h, v1.8h, v5.8h            \n"  // G 16 shorts -> 8 shorts. | 
|  1760     "vpadd.u16  d2, d2, d3                     \n"  // G 16 shorts -> 8 shorts. |  1584     "addp       v2.8h, v2.8h, v6.8h            \n"  // R 16 shorts -> 8 shorts. | 
|  1761     "vpadd.u16  d3, d10, d11                   \n"  // G |  | 
|  1762     "vpadd.u16  d4, d4, d5                     \n"  // R 16 shorts -> 8 shorts. |  | 
|  1763     "vpadd.u16  d5, d12, d13                   \n"  // R |  | 
|  1764  |  1585  | 
|  1765     "vrshr.u16  q0, q0, #1                     \n"  // 2x average |  1586     "urshr      v0.8h, v0.8h, #1               \n"  // 2x average | 
|  1766     "vrshr.u16  q1, q1, #1                     \n" |  1587     "urshr      v1.8h, v1.8h, #1               \n" | 
|  1767     "vrshr.u16  q2, q2, #1                     \n" |  1588     "urshr      v2.8h, v2.8h, #1               \n" | 
|  1768  |  1589  | 
|  1769     "subs       %3, %3, #32                    \n"  // 32 processed per loop. |  1590     "subs       %3, %3, #32                    \n"  // 32 processed per loop. | 
|  1770     "vmul.s16   q8, q0, q10                    \n"  // B |  1591     "mul        v3.8h, v0.8h, v20.8h           \n"  // B | 
|  1771     "vmls.s16   q8, q1, q11                    \n"  // G |  1592     "mls        v3.8h, v1.8h, v21.8h           \n"  // G | 
|  1772     "vmls.s16   q8, q2, q12                    \n"  // R |  1593     "mls        v3.8h, v2.8h, v22.8h           \n"  // R | 
|  1773     "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned |  1594     "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned | 
|  1774     "vmul.s16   q9, q2, q10                    \n"  // R |  1595     "mul        v4.8h, v2.8h, v20.8h           \n"  // R | 
|  1775     "vmls.s16   q9, q1, q14                    \n"  // G |  1596     "mls        v4.8h, v1.8h, v24.8h           \n"  // G | 
|  1776     "vmls.s16   q9, q0, q13                    \n"  // B |  1597     "mls        v4.8h, v0.8h, v23.8h           \n"  // B | 
|  1777     "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned |  1598     "add        v4.8h, v4.8h, v25.8h           \n"  // +128 -> unsigned | 
|  1778     "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U |  1599     "uqshrn     v0.8b, v3.8h, #8               \n"  // 16 bit to 8 bit U | 
|  1779     "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V |  1600     "uqshrn     v1.8b, v4.8h, #8               \n"  // 16 bit to 8 bit V | 
|  1780     MEMACCESS(1) |  1601     MEMACCESS(1) | 
|  1781     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U. |  1602     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U. | 
|  1782     MEMACCESS(2) |  1603     MEMACCESS(2) | 
|  1783     "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V. |  1604     "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V. | 
|  1784     "bgt        1b                             \n" |  1605     "b.gt       1b                             \n" | 
|  1785   : "+r"(src_argb),  // %0 |  1606   : "+r"(src_argb),  // %0 | 
|  1786     "+r"(dst_u),     // %1 |  1607     "+r"(dst_u),     // %1 | 
|  1787     "+r"(dst_v),     // %2 |  1608     "+r"(dst_v),     // %2 | 
|  1788     "+r"(pix)        // %3 |  1609     "+r"(pix)        // %3 | 
|  1789   : |  1610   : | 
|  1790   : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", |  1611   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 
|  1791     "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |  1612     "v20", "v21", "v22", "v23", "v24", "v25" | 
|  1792   ); |  1613   ); | 
|  1793 } |  1614 } | 
|  1794 #endif  // HAS_ARGBTOUV411ROW_NEON |  1615 #endif  // HAS_ARGBTOUV411ROW_NEON | 
|  1795  |  1616  | 
|  1796 // 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16. |  1617 // 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16. | 
|  1797 #define RGBTOUV(QB, QG, QR) \ |  1618 #define RGBTOUV(QB, QG, QR) \ | 
|  1798     "vmul.s16   q8, " #QB ", q10               \n"  /* B                    */ \ |  1619     "mul        v3.8h, " #QB ",v20.8h          \n"  /* B                    */ \ | 
|  1799     "vmls.s16   q8, " #QG ", q11               \n"  /* G                    */ \ |  1620     "mul        v4.8h, " #QR ",v20.8h          \n"  /* R                    */ \ | 
|  1800     "vmls.s16   q8, " #QR ", q12               \n"  /* R                    */ \ |  1621     "mls        v3.8h, " #QG ",v21.8h          \n"  /* G                    */ \ | 
|  1801     "vadd.u16   q8, q8, q15                    \n"  /* +128 -> unsigned     */ \ |  1622     "mls        v4.8h, " #QG ",v24.8h          \n"  /* G                    */ \ | 
|  1802     "vmul.s16   q9, " #QR ", q10               \n"  /* R                    */ \ |  1623     "mls        v3.8h, " #QR ",v22.8h          \n"  /* R                    */ \ | 
|  1803     "vmls.s16   q9, " #QG ", q14               \n"  /* G                    */ \ |  1624     "mls        v4.8h, " #QB ",v23.8h          \n"  /* B                    */ \ | 
|  1804     "vmls.s16   q9, " #QB ", q13               \n"  /* B                    */ \ |  1625     "add        v3.8h, v3.8h, v25.8h           \n"  /* +128 -> unsigned     */ \ | 
|  1805     "vadd.u16   q9, q9, q15                    \n"  /* +128 -> unsigned     */ \ |  1626     "add        v4.8h, v4.8h, v25.8h           \n"  /* +128 -> unsigned     */ \ | 
|  1806     "vqshrn.u16  d0, q8, #8                    \n"  /* 16 bit to 8 bit U    */ \ |  1627     "uqshrn     v0.8b, v3.8h, #8               \n"  /* 16 bit to 8 bit U    */ \ | 
|  1807     "vqshrn.u16  d1, q9, #8                    \n"  /* 16 bit to 8 bit V    */ |  1628     "uqshrn     v1.8b, v4.8h, #8               \n"  /* 16 bit to 8 bit V    */ | 
|  1808  |  1629  | 
|  1809 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. |  1630 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. | 
 |  1631 // TODO(fbarchard): consider ptrdiff_t for all strides. | 
 |  1632  | 
|  1810 #ifdef HAS_ARGBTOUVROW_NEON |  1633 #ifdef HAS_ARGBTOUVROW_NEON | 
|  1811 void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, |  1634 void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, | 
|  1812                       uint8* dst_u, uint8* dst_v, int pix) { |  1635                       uint8* dst_u, uint8* dst_v, int pix) { | 
 |  1636   const uint8* src_argb_1 = src_argb + src_stride_argb; | 
|  1813   asm volatile ( |  1637   asm volatile ( | 
|  1814     "add        %1, %0, %1                     \n"  // src_stride + src_argb |  1638     RGBTOUV_SETUP_REG | 
|  1815     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient |  | 
|  1816     "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient |  | 
|  1817     "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient |  | 
|  1818     "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient |  | 
|  1819     "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient |  | 
|  1820     "vmov.u16   q15, #0x8080                   \n"  // 128.5 |  | 
|  1821     ".p2align   2                              \n" |  | 
|  1822   "1:                                          \n" |  1639   "1:                                          \n" | 
|  1823     MEMACCESS(0) |  1640     MEMACCESS(0) | 
|  1824     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels. |  1641     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels. | 
|  1825     MEMACCESS(0) |  1642     "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts. | 
|  1826     "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels. |  1643     "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts. | 
|  1827     "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts. |  1644     "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts. | 
|  1828     "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts. |  1645  | 
|  1829     "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts. |  | 
|  1830     MEMACCESS(1) |  1646     MEMACCESS(1) | 
|  1831     "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ARGB pixels. |  1647     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16 | 
|  1832     MEMACCESS(1) |  1648     "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts. | 
|  1833     "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ARGB pixels. |  1649     "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts. | 
|  1834     "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts. |  1650     "uadalp     v2.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts. | 
|  1835     "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts. |  | 
|  1836     "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts. |  | 
|  1837  |  1651  | 
|  1838     "vrshr.u16  q0, q0, #1                     \n"  // 2x average |  1652     "urshr      v0.8h, v0.8h, #1               \n"  // 2x average | 
|  1839     "vrshr.u16  q1, q1, #1                     \n" |  1653     "urshr      v1.8h, v1.8h, #1               \n" | 
|  1840     "vrshr.u16  q2, q2, #1                     \n" |  1654     "urshr      v2.8h, v2.8h, #1               \n" | 
|  1841  |  1655  | 
|  1842     "subs       %4, %4, #16                    \n"  // 32 processed per loop. |  1656     "subs       %4, %4, #16                    \n"  // 32 processed per loop. | 
|  1843     RGBTOUV(q0, q1, q2) |  1657     RGBTOUV(v0.8h, v1.8h, v2.8h) | 
|  1844     MEMACCESS(2) |  1658     MEMACCESS(2) | 
|  1845     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U. |  1659     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U. | 
|  1846     MEMACCESS(3) |  1660     MEMACCESS(3) | 
|  1847     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V. |  1661     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V. | 
|  1848     "bgt        1b                             \n" |  1662     "b.gt       1b                             \n" | 
|  1849   : "+r"(src_argb),  // %0 |  1663   : "+r"(src_argb),  // %0 | 
|  1850     "+r"(src_stride_argb),  // %1 |  1664     "+r"(src_argb_1),  // %1 | 
|  1851     "+r"(dst_u),     // %2 |  1665     "+r"(dst_u),     // %2 | 
|  1852     "+r"(dst_v),     // %3 |  1666     "+r"(dst_v),     // %3 | 
|  1853     "+r"(pix)        // %4 |  1667     "+r"(pix)        // %4 | 
|  1854   : |  1668   : | 
|  1855   : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", |  1669   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 
|  1856     "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |  1670     "v20", "v21", "v22", "v23", "v24", "v25" | 
|  1857   ); |  1671   ); | 
|  1858 } |  1672 } | 
|  1859 #endif  // HAS_ARGBTOUVROW_NEON |  1673 #endif  // HAS_ARGBTOUVROW_NEON | 
|  1860  |  1674  | 
|  1861 // TODO(fbarchard): Subsample match C code. |  1675 // TODO(fbarchard): Subsample match C code. | 
|  1862 #ifdef HAS_ARGBTOUVJROW_NEON |  1676 #ifdef HAS_ARGBTOUVJROW_NEON | 
|  1863 void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, |  1677 void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, | 
|  1864                        uint8* dst_u, uint8* dst_v, int pix) { |  1678                        uint8* dst_u, uint8* dst_v, int pix) { | 
 |  1679   const uint8* src_argb_1 = src_argb + src_stride_argb; | 
|  1865   asm volatile ( |  1680   asm volatile ( | 
|  1866     "add        %1, %0, %1                     \n"  // src_stride + src_argb |  1681     "movi       v20.8h, #63, lsl #0            \n"  // UB/VR coeff (0.500) / 2 | 
|  1867     "vmov.s16   q10, #127 / 2                  \n"  // UB / VR 0.500 coefficient |  1682     "movi       v21.8h, #42, lsl #0            \n"  // UG coeff (-0.33126) / 2 | 
|  1868     "vmov.s16   q11, #84 / 2                   \n"  // UG -0.33126 coefficient |  1683     "movi       v22.8h, #21, lsl #0            \n"  // UR coeff (-0.16874) / 2 | 
|  1869     "vmov.s16   q12, #43 / 2                   \n"  // UR -0.16874 coefficient |  1684     "movi       v23.8h, #10, lsl #0            \n"  // VB coeff (-0.08131) / 2 | 
|  1870     "vmov.s16   q13, #20 / 2                   \n"  // VB -0.08131 coefficient |  1685     "movi       v24.8h, #53, lsl #0            \n"  // VG coeff (-0.41869) / 2 | 
|  1871     "vmov.s16   q14, #107 / 2                  \n"  // VG -0.41869 coefficient |  1686     "movi       v25.16b, #0x80                 \n"  // 128.5 (0x8080 in 16-bit) | 
|  1872     "vmov.u16   q15, #0x8080                   \n"  // 128.5 |  | 
|  1873     ".p2align   2                              \n" |  | 
|  1874   "1:                                          \n" |  1687   "1:                                          \n" | 
|  1875     MEMACCESS(0) |  1688     MEMACCESS(0) | 
|  1876     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels. |  1689     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels. | 
|  1877     MEMACCESS(0) |  1690     "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts. | 
|  1878     "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels. |  1691     "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts. | 
|  1879     "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts. |  1692     "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts. | 
|  1880     "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts. |  | 
|  1881     "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts. |  | 
|  1882     MEMACCESS(1) |  1693     MEMACCESS(1) | 
|  1883     "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ARGB pixels. |  1694     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64  \n"  // load next 16 | 
|  1884     MEMACCESS(1) |  1695     "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts. | 
|  1885     "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ARGB pixels. |  1696     "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts. | 
|  1886     "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts. |  1697     "uadalp     v2.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts. | 
|  1887     "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts. |  | 
|  1888     "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts. |  | 
|  1889  |  1698  | 
|  1890     "vrshr.u16  q0, q0, #1                     \n"  // 2x average |  1699     "urshr      v0.8h, v0.8h, #1               \n"  // 2x average | 
|  1891     "vrshr.u16  q1, q1, #1                     \n" |  1700     "urshr      v1.8h, v1.8h, #1               \n" | 
|  1892     "vrshr.u16  q2, q2, #1                     \n" |  1701     "urshr      v2.8h, v2.8h, #1               \n" | 
|  1893  |  1702  | 
|  1894     "subs       %4, %4, #16                    \n"  // 32 processed per loop. |  1703     "subs       %4, %4, #16                    \n"  // 32 processed per loop. | 
|  1895     RGBTOUV(q0, q1, q2) |  1704     RGBTOUV(v0.8h, v1.8h, v2.8h) | 
|  1896     MEMACCESS(2) |  1705     MEMACCESS(2) | 
|  1897     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U. |  1706     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U. | 
|  1898     MEMACCESS(3) |  1707     MEMACCESS(3) | 
|  1899     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V. |  1708     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V. | 
|  1900     "bgt        1b                             \n" |  1709     "b.gt       1b                             \n" | 
|  1901   : "+r"(src_argb),  // %0 |  1710   : "+r"(src_argb),  // %0 | 
|  1902     "+r"(src_stride_argb),  // %1 |  1711     "+r"(src_argb_1),  // %1 | 
|  1903     "+r"(dst_u),     // %2 |  1712     "+r"(dst_u),     // %2 | 
|  1904     "+r"(dst_v),     // %3 |  1713     "+r"(dst_v),     // %3 | 
|  1905     "+r"(pix)        // %4 |  1714     "+r"(pix)        // %4 | 
|  1906   : |  1715   : | 
|  1907   : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", |  1716   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 
|  1908     "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |  1717     "v20", "v21", "v22", "v23", "v24", "v25" | 
|  1909   ); |  1718   ); | 
|  1910 } |  1719 } | 
|  1911 #endif  // HAS_ARGBTOUVJROW_NEON |  1720 #endif  // HAS_ARGBTOUVJROW_NEON | 
|  1912  |  1721  | 
|  1913 #ifdef HAS_BGRATOUVROW_NEON |  1722 #ifdef HAS_BGRATOUVROW_NEON | 
|  1914 void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, |  1723 void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, | 
|  1915                       uint8* dst_u, uint8* dst_v, int pix) { |  1724                       uint8* dst_u, uint8* dst_v, int pix) { | 
 |  1725   const uint8* src_bgra_1 = src_bgra + src_stride_bgra; | 
|  1916   asm volatile ( |  1726   asm volatile ( | 
|  1917     "add        %1, %0, %1                     \n"  // src_stride + src_bgra |  1727     RGBTOUV_SETUP_REG | 
|  1918     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient |  | 
|  1919     "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient |  | 
|  1920     "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient |  | 
|  1921     "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient |  | 
|  1922     "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient |  | 
|  1923     "vmov.u16   q15, #0x8080                   \n"  // 128.5 |  | 
|  1924     ".p2align   2                              \n" |  | 
|  1925   "1:                                          \n" |  1728   "1:                                          \n" | 
|  1926     MEMACCESS(0) |  1729     MEMACCESS(0) | 
|  1927     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 BGRA pixels. |  1730     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels. | 
|  1928     MEMACCESS(0) |  1731     "uaddlp     v0.8h, v3.16b                  \n"  // B 16 bytes -> 8 shorts. | 
|  1929     "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 BGRA pixels. |  1732     "uaddlp     v3.8h, v2.16b                  \n"  // G 16 bytes -> 8 shorts. | 
|  1930     "vpaddl.u8  q3, q3                         \n"  // B 16 bytes -> 8 shorts. |  1733     "uaddlp     v2.8h, v1.16b                  \n"  // R 16 bytes -> 8 shorts. | 
|  1931     "vpaddl.u8  q2, q2                         \n"  // G 16 bytes -> 8 shorts. |  | 
|  1932     "vpaddl.u8  q1, q1                         \n"  // R 16 bytes -> 8 shorts. |  | 
|  1933     MEMACCESS(1) |  1734     MEMACCESS(1) | 
|  1934     "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more BGRA pixels. |  1735     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more | 
|  1935     MEMACCESS(1) |  1736     "uadalp     v0.8h, v7.16b                  \n"  // B 16 bytes -> 8 shorts. | 
|  1936     "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 BGRA pixels. |  1737     "uadalp     v3.8h, v6.16b                  \n"  // G 16 bytes -> 8 shorts. | 
|  1937     "vpadal.u8  q3, q7                         \n"  // B 16 bytes -> 8 shorts. |  1738     "uadalp     v2.8h, v5.16b                  \n"  // R 16 bytes -> 8 shorts. | 
|  1938     "vpadal.u8  q2, q6                         \n"  // G 16 bytes -> 8 shorts. |  | 
|  1939     "vpadal.u8  q1, q5                         \n"  // R 16 bytes -> 8 shorts. |  | 
|  1940  |  1739  | 
|  1941     "vrshr.u16  q1, q1, #1                     \n"  // 2x average |  1740     "urshr      v0.8h, v0.8h, #1               \n"  // 2x average | 
|  1942     "vrshr.u16  q2, q2, #1                     \n" |  1741     "urshr      v1.8h, v3.8h, #1               \n" | 
|  1943     "vrshr.u16  q3, q3, #1                     \n" |  1742     "urshr      v2.8h, v2.8h, #1               \n" | 
|  1944  |  1743  | 
|  1945     "subs       %4, %4, #16                    \n"  // 32 processed per loop. |  1744     "subs       %4, %4, #16                    \n"  // 32 processed per loop. | 
|  1946     RGBTOUV(q3, q2, q1) |  1745     RGBTOUV(v0.8h, v1.8h, v2.8h) | 
|  1947     MEMACCESS(2) |  1746     MEMACCESS(2) | 
|  1948     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U. |  1747     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U. | 
|  1949     MEMACCESS(3) |  1748     MEMACCESS(3) | 
|  1950     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V. |  1749     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V. | 
|  1951     "bgt        1b                             \n" |  1750     "b.gt       1b                             \n" | 
|  1952   : "+r"(src_bgra),  // %0 |  1751   : "+r"(src_bgra),  // %0 | 
|  1953     "+r"(src_stride_bgra),  // %1 |  1752     "+r"(src_bgra_1),  // %1 | 
|  1954     "+r"(dst_u),     // %2 |  1753     "+r"(dst_u),     // %2 | 
|  1955     "+r"(dst_v),     // %3 |  1754     "+r"(dst_v),     // %3 | 
|  1956     "+r"(pix)        // %4 |  1755     "+r"(pix)        // %4 | 
|  1957   : |  1756   : | 
|  1958   : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", |  1757   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 
|  1959     "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |  1758     "v20", "v21", "v22", "v23", "v24", "v25" | 
|  1960   ); |  1759   ); | 
|  1961 } |  1760 } | 
|  1962 #endif  // HAS_BGRATOUVROW_NEON |  1761 #endif  // HAS_BGRATOUVROW_NEON | 
|  1963  |  1762  | 
|  1964 #ifdef HAS_ABGRTOUVROW_NEON |  1763 #ifdef HAS_ABGRTOUVROW_NEON | 
|  1965 void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, |  1764 void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, | 
|  1966                       uint8* dst_u, uint8* dst_v, int pix) { |  1765                       uint8* dst_u, uint8* dst_v, int pix) { | 
 |  1766   const uint8* src_abgr_1 = src_abgr + src_stride_abgr; | 
|  1967   asm volatile ( |  1767   asm volatile ( | 
|  1968     "add        %1, %0, %1                     \n"  // src_stride + src_abgr |  1768     RGBTOUV_SETUP_REG  // NOTE(review): macro defined outside this view; presumably loads the U/V coefficients - confirm | 
|  1969     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient |  | 
|  1970     "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient |  | 
|  1971     "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient |  | 
|  1972     "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient |  | 
|  1973     "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient |  | 
|  1974     "vmov.u16   q15, #0x8080                   \n"  // 128.5 |  | 
|  1975     ".p2align   2                              \n" |  | 
|  1976   "1:                                          \n" |  1769   "1:                                          \n" | 
|  1977     MEMACCESS(0) |  1770     MEMACCESS(0) | 
|  1978     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ABGR pixels. |  1771     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 ABGR pixels from the first row (%0), de-interleaved. | 
|  1979     MEMACCESS(0) |  1772     "uaddlp     v3.8h, v2.16b                  \n"  // B: pairwise add 16 bytes -> 8 shorts. | 
|  1980     "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ABGR pixels. |  1773     "uaddlp     v2.8h, v1.16b                  \n"  // G: pairwise add 16 bytes -> 8 shorts. | 
|  1981     "vpaddl.u8  q2, q2                         \n"  // B 16 bytes -> 8 shorts. |  1774     "uaddlp     v1.8h, v0.16b                  \n"  // R: pairwise add 16 bytes -> 8 shorts. | 
|  1982     "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts. |  | 
|  1983     "vpaddl.u8  q0, q0                         \n"  // R 16 bytes -> 8 shorts. |  | 
|  1984     MEMACCESS(1) |  1775     MEMACCESS(1) | 
|  1985     "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ABGR pixels. |  1776     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 ABGR pixels from the second row (%1). | 
|  1986     MEMACCESS(1) |  1777     "uadalp     v3.8h, v6.16b                  \n"  // B: accumulate second-row pairs (2x2 sum). | 
|  1987     "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ABGR pixels. |  1778     "uadalp     v2.8h, v5.16b                  \n"  // G: accumulate second-row pairs (2x2 sum). | 
|  1988     "vpadal.u8  q2, q6                         \n"  // B 16 bytes -> 8 shorts. |  1779     "uadalp     v1.8h, v4.16b                  \n"  // R: accumulate second-row pairs (2x2 sum). | 
|  1989     "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts. |  | 
|  1990     "vpadal.u8  q0, q4                         \n"  // R 16 bytes -> 8 shorts. |  | 
|  1991  |  1780  | 
|  1992     "vrshr.u16  q0, q0, #1                     \n"  // 2x average |  1781     "urshr      v0.8h, v3.8h, #1               \n"  // 2x average: rounding halve of the 4-sample sum (B -> v0) | 
|  1993     "vrshr.u16  q1, q1, #1                     \n" |  1782     "urshr      v2.8h, v2.8h, #1               \n" | 
|  1994     "vrshr.u16  q2, q2, #1                     \n" |  1783     "urshr      v1.8h, v1.8h, #1               \n" | 
|  1995  |  1784  | 
|  1996     "subs       %4, %4, #16                    \n"  // 32 processed per loop. |  1785     "subs       %4, %4, #16                    \n"  // 16 pixels per row x 2 rows = 32 processed per loop. | 
|  1997     RGBTOUV(q2, q1, q0) |  1786     RGBTOUV(v0.8h, v2.8h, v1.8h) | 
|  1998     MEMACCESS(2) |  1787     MEMACCESS(2) | 
|  1999     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U. |  1788     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U. | 
|  2000     MEMACCESS(3) |  1789     MEMACCESS(3) | 
|  2001     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V. |  1790     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V. | 
|  2002     "bgt        1b                             \n" |  1791     "b.gt       1b                             \n" | 
|  2003   : "+r"(src_abgr),  // %0 |  1792   : "+r"(src_abgr),  // %0 | 
|  2004     "+r"(src_stride_abgr),  // %1 |  1793     "+r"(src_abgr_1),  // %1 (second-row pointer computed in C above) | 
|  2005     "+r"(dst_u),     // %2 |  1794     "+r"(dst_u),     // %2 | 
|  2006     "+r"(dst_v),     // %3 |  1795     "+r"(dst_v),     // %3 | 
|  2007     "+r"(pix)        // %4 |  1796     "+r"(pix)        // %4 | 
|  2008   : |  1797   : | 
|  2009   : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", |  1798   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 
|  2010     "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |  1799     "v20", "v21", "v22", "v23", "v24", "v25" | 
|  2011   ); |  1800   ); | 
|  2012 } |  1801 } | 
|  2013 #endif  // HAS_ABGRTOUVROW_NEON |  1802 #endif  // HAS_ABGRTOUVROW_NEON | 
|  2014  |  1803  | 
|  2015 #ifdef HAS_RGBATOUVROW_NEON |  1804 #ifdef HAS_RGBATOUVROW_NEON | 
|  2016 void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, |  1805 void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, | 
|  2017                       uint8* dst_u, uint8* dst_v, int pix) { |  1806                       uint8* dst_u, uint8* dst_v, int pix) { | 
 |  1807   const uint8* src_rgba_1 = src_rgba + src_stride_rgba; | 
|  2018   asm volatile ( |  1808   asm volatile ( | 
|  2019     "add        %1, %0, %1                     \n"  // src_stride + src_rgba |  1809     RGBTOUV_SETUP_REG  // NOTE(review): macro defined outside this view; presumably loads the U/V coefficients - confirm | 
|  2020     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient |  | 
|  2021     "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient |  | 
|  2022     "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient |  | 
|  2023     "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient |  | 
|  2024     "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient |  | 
|  2025     "vmov.u16   q15, #0x8080                   \n"  // 128.5 |  | 
|  2026     ".p2align   2                              \n" |  | 
|  2027   "1:                                          \n" |  1810   "1:                                          \n" | 
|  2028     MEMACCESS(0) |  1811     MEMACCESS(0) | 
|  2029     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 RGBA pixels. |  1812     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 RGBA pixels from the first row (%0), de-interleaved. | 
|  2030     MEMACCESS(0) |  1813     "uaddlp     v0.8h, v1.16b                  \n"  // B: pairwise add 16 bytes -> 8 shorts. | 
|  2031     "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 RGBA pixels. |  1814     "uaddlp     v1.8h, v2.16b                  \n"  // G: pairwise add 16 bytes -> 8 shorts. | 
|  2032     "vpaddl.u8  q0, q1                         \n"  // B 16 bytes -> 8 shorts. |  1815     "uaddlp     v2.8h, v3.16b                  \n"  // R: pairwise add 16 bytes -> 8 shorts. | 
|  2033     "vpaddl.u8  q1, q2                         \n"  // G 16 bytes -> 8 shorts. |  | 
|  2034     "vpaddl.u8  q2, q3                         \n"  // R 16 bytes -> 8 shorts. |  | 
|  2035     MEMACCESS(1) |  1816     MEMACCESS(1) | 
|  2036     "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more RGBA pixels. |  1817     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 RGBA pixels from the second row (%1). | 
|  2037     MEMACCESS(1) |  1818     "uadalp     v0.8h, v5.16b                  \n"  // B: accumulate second-row pairs (2x2 sum). | 
|  2038     "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 RGBA pixels. |  1819     "uadalp     v1.8h, v6.16b                  \n"  // G: accumulate second-row pairs (2x2 sum). | 
|  2039     "vpadal.u8  q0, q5                         \n"  // B 16 bytes -> 8 shorts. |  1820     "uadalp     v2.8h, v7.16b                  \n"  // R: accumulate second-row pairs (2x2 sum). | 
|  2040     "vpadal.u8  q1, q6                         \n"  // G 16 bytes -> 8 shorts. |  | 
|  2041     "vpadal.u8  q2, q7                         \n"  // R 16 bytes -> 8 shorts. |  | 
|  2042  |  1821  | 
|  2043     "vrshr.u16  q0, q0, #1                     \n"  // 2x average |  1822     "urshr      v0.8h, v0.8h, #1               \n"  // 2x average: rounding halve of the 4-sample sum | 
|  2044     "vrshr.u16  q1, q1, #1                     \n" |  1823     "urshr      v1.8h, v1.8h, #1               \n" | 
|  2045     "vrshr.u16  q2, q2, #1                     \n" |  1824     "urshr      v2.8h, v2.8h, #1               \n" | 
|  2046  |  1825  | 
|  2047     "subs       %4, %4, #16                    \n"  // 32 processed per loop. |  1826     "subs       %4, %4, #16                    \n"  // 16 pixels per row x 2 rows = 32 processed per loop. | 
|  2048     RGBTOUV(q0, q1, q2) |  1827     RGBTOUV(v0.8h, v1.8h, v2.8h) | 
|  2049     MEMACCESS(2) |  1828     MEMACCESS(2) | 
|  2050     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U. |  1829     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U. | 
|  2051     MEMACCESS(3) |  1830     MEMACCESS(3) | 
|  2052     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V. |  1831     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V. | 
|  2053     "bgt        1b                             \n" |  1832     "b.gt       1b                             \n" | 
|  2054   : "+r"(src_rgba),  // %0 |  1833   : "+r"(src_rgba),  // %0 | 
|  2055     "+r"(src_stride_rgba),  // %1 |  1834     "+r"(src_rgba_1),  // %1 (second-row pointer computed in C above) | 
|  2056     "+r"(dst_u),     // %2 |  1835     "+r"(dst_u),     // %2 | 
|  2057     "+r"(dst_v),     // %3 |  1836     "+r"(dst_v),     // %3 | 
|  2058     "+r"(pix)        // %4 |  1837     "+r"(pix)        // %4 | 
|  2059   : |  1838   : | 
|  2060   : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", |  1839   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 
|  2061     "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |  1840     "v20", "v21", "v22", "v23", "v24", "v25" | 
|  2062   ); |  1841   ); | 
|  2063 } |  1842 } | 
|  2064 #endif  // HAS_RGBATOUVROW_NEON |  1843 #endif  // HAS_RGBATOUVROW_NEON | 
|  2065  |  1844  | 
|  2066 #ifdef HAS_RGB24TOUVROW_NEON |  1845 #ifdef HAS_RGB24TOUVROW_NEON | 
|  2067 void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, |  1846 void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, | 
|  2068                        uint8* dst_u, uint8* dst_v, int pix) { |  1847                        uint8* dst_u, uint8* dst_v, int pix) { | 
 |  1848   const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24; | 
|  2069   asm volatile ( |  1849   asm volatile ( | 
|  2070     "add        %1, %0, %1                     \n"  // src_stride + src_rgb24 |  1850     RGBTOUV_SETUP_REG  // NOTE(review): macro defined outside this view; presumably loads the U/V coefficients - confirm | 
|  2071     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient |  | 
|  2072     "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient |  | 
|  2073     "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient |  | 
|  2074     "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient |  | 
|  2075     "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient |  | 
|  2076     "vmov.u16   q15, #0x8080                   \n"  // 128.5 |  | 
|  2077     ".p2align   2                              \n" |  | 
|  2078   "1:                                          \n" |  1851   "1:                                          \n" | 
|  2079     MEMACCESS(0) |  1852     MEMACCESS(0) | 
|  2080     "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RGB24 pixels. |  1853     "ld3        {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 RGB24 pixels from the first row (%0), de-interleaved. | 
|  2081     MEMACCESS(0) |  1854     "uaddlp     v0.8h, v0.16b                  \n"  // B: pairwise add 16 bytes -> 8 shorts. | 
|  2082     "vld3.8     {d1, d3, d5}, [%0]!            \n"  // load next 8 RGB24 pixels. |  1855     "uaddlp     v1.8h, v1.16b                  \n"  // G: pairwise add 16 bytes -> 8 shorts. | 
|  2083     "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts. |  1856     "uaddlp     v2.8h, v2.16b                  \n"  // R: pairwise add 16 bytes -> 8 shorts. | 
|  2084     "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts. |  | 
|  2085     "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts. |  | 
|  2086     MEMACCESS(1) |  1857     MEMACCESS(1) | 
|  2087     "vld3.8     {d8, d10, d12}, [%1]!          \n"  // load 8 more RGB24 pixels. |  1858     "ld3        {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 16 RGB24 pixels from the second row (%1). | 
|  2088     MEMACCESS(1) |  1859     "uadalp     v0.8h, v4.16b                  \n"  // B: accumulate second-row pairs (2x2 sum). | 
|  2089     "vld3.8     {d9, d11, d13}, [%1]!          \n"  // load last 8 RGB24 pixels. |  1860     "uadalp     v1.8h, v5.16b                  \n"  // G: accumulate second-row pairs (2x2 sum). | 
|  2090     "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts. |  1861     "uadalp     v2.8h, v6.16b                  \n"  // R: accumulate second-row pairs (2x2 sum). | 
|  2091     "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts. |  | 
|  2092     "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts. |  | 
|  2093  |  1862  | 
|  2094     "vrshr.u16  q0, q0, #1                     \n"  // 2x average |  1863     "urshr      v0.8h, v0.8h, #1               \n"  // 2x average: rounding halve of the 4-sample sum | 
|  2095     "vrshr.u16  q1, q1, #1                     \n" |  1864     "urshr      v1.8h, v1.8h, #1               \n" | 
|  2096     "vrshr.u16  q2, q2, #1                     \n" |  1865     "urshr      v2.8h, v2.8h, #1               \n" | 
|  2097  |  1866  | 
|  2098     "subs       %4, %4, #16                    \n"  // 32 processed per loop. |  1867     "subs       %4, %4, #16                    \n"  // 16 pixels per row x 2 rows = 32 processed per loop. | 
|  2099     RGBTOUV(q0, q1, q2) |  1868     RGBTOUV(v0.8h, v1.8h, v2.8h) | 
|  2100     MEMACCESS(2) |  1869     MEMACCESS(2) | 
|  2101     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U. |  1870     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U. | 
|  2102     MEMACCESS(3) |  1871     MEMACCESS(3) | 
|  2103     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V. |  1872     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V. | 
|  2104     "bgt        1b                             \n" |  1873     "b.gt       1b                             \n" | 
|  2105   : "+r"(src_rgb24),  // %0 |  1874   : "+r"(src_rgb24),  // %0 | 
|  2106     "+r"(src_stride_rgb24),  // %1 |  1875     "+r"(src_rgb24_1),  // %1 (second-row pointer computed in C above) | 
|  2107     "+r"(dst_u),     // %2 |  1876     "+r"(dst_u),     // %2 | 
|  2108     "+r"(dst_v),     // %3 |  1877     "+r"(dst_v),     // %3 | 
|  2109     "+r"(pix)        // %4 |  1878     "+r"(pix)        // %4 | 
|  2110   : |  1879   : | 
|  2111   : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", |  1880   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 
|  2112     "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |  1881     "v20", "v21", "v22", "v23", "v24", "v25" | 
|  2113   ); |  1882   ); | 
|  2114 } |  1883 } | 
|  2115 #endif  // HAS_RGB24TOUVROW_NEON |  1884 #endif  // HAS_RGB24TOUVROW_NEON | 
|  2116  |  1885  | 
|  2117 #ifdef HAS_RAWTOUVROW_NEON |  1886 #ifdef HAS_RAWTOUVROW_NEON | 
|  2118 void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, |  1887 void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, | 
|  2119                      uint8* dst_u, uint8* dst_v, int pix) { |  1888                      uint8* dst_u, uint8* dst_v, int pix) { | 
 |  1889   const uint8* src_raw_1 = src_raw + src_stride_raw; | 
|  2120   asm volatile ( |  1890   asm volatile ( | 
|  2121     "add        %1, %0, %1                     \n"  // src_stride + src_raw |  1891     RGBTOUV_SETUP_REG  // NOTE(review): macro defined outside this view; presumably loads the U/V coefficients - confirm | 
|  2122     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient |  | 
|  2123     "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient |  | 
|  2124     "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient |  | 
|  2125     "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient |  | 
|  2126     "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient |  | 
|  2127     "vmov.u16   q15, #0x8080                   \n"  // 128.5 |  | 
|  2128     ".p2align   2                              \n" |  | 
|  2129   "1:                                          \n" |  1892   "1:                                          \n" | 
|  2130     MEMACCESS(0) |  1893     MEMACCESS(0) | 
|  2131     "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RAW pixels. |  1894     "ld3        {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 RAW pixels from the first row (%0), de-interleaved. | 
|  2132     MEMACCESS(0) |  1895     "uaddlp     v2.8h, v2.16b                  \n"  // B: pairwise add 16 bytes -> 8 shorts. | 
|  2133     "vld3.8     {d1, d3, d5}, [%0]!            \n"  // load next 8 RAW pixels. |  1896     "uaddlp     v1.8h, v1.16b                  \n"  // G: pairwise add 16 bytes -> 8 shorts. | 
|  2134     "vpaddl.u8  q2, q2                         \n"  // B 16 bytes -> 8 shorts. |  1897     "uaddlp     v0.8h, v0.16b                  \n"  // R: pairwise add 16 bytes -> 8 shorts. | 
|  2135     "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts. |  | 
|  2136     "vpaddl.u8  q0, q0                         \n"  // R 16 bytes -> 8 shorts. |  | 
|  2137     MEMACCESS(1) |  1898     MEMACCESS(1) | 
|  2138     "vld3.8     {d8, d10, d12}, [%1]!          \n"  // load 8 more RAW pixels. |  1899     "ld3        {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 16 RAW pixels from the second row (%1). | 
|  2139     MEMACCESS(1) |  1900     "uadalp     v2.8h, v6.16b                  \n"  // B: accumulate second-row pairs (2x2 sum). | 
|  2140     "vld3.8     {d9, d11, d13}, [%1]!          \n"  // load last 8 RAW pixels. |  1901     "uadalp     v1.8h, v5.16b                  \n"  // G: accumulate second-row pairs (2x2 sum). | 
|  2141     "vpadal.u8  q2, q6                         \n"  // B 16 bytes -> 8 shorts. |  1902     "uadalp     v0.8h, v4.16b                  \n"  // R: accumulate second-row pairs (2x2 sum). | 
|  2142     "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts. |  | 
|  2143     "vpadal.u8  q0, q4                         \n"  // R 16 bytes -> 8 shorts. |  | 
|  2144  |  1903  | 
|  2145     "vrshr.u16  q0, q0, #1                     \n"  // 2x average |  1904     "urshr      v2.8h, v2.8h, #1               \n"  // 2x average: rounding halve of the 4-sample sum | 
|  2146     "vrshr.u16  q1, q1, #1                     \n" |  1905     "urshr      v1.8h, v1.8h, #1               \n" | 
|  2147     "vrshr.u16  q2, q2, #1                     \n" |  1906     "urshr      v0.8h, v0.8h, #1               \n" | 
|  2148  |  1907  | 
|  2149     "subs       %4, %4, #16                    \n"  // 32 processed per loop. |  1908     "subs       %4, %4, #16                    \n"  // 16 pixels per row x 2 rows = 32 processed per loop. | 
|  2150     RGBTOUV(q2, q1, q0) |  1909     RGBTOUV(v2.8h, v1.8h, v0.8h) | 
|  2151     MEMACCESS(2) |  1910     MEMACCESS(2) | 
|  2152     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U. |  1911     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U. | 
|  2153     MEMACCESS(3) |  1912     MEMACCESS(3) | 
|  2154     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V. |  1913     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V. | 
|  2155     "bgt        1b                             \n" |  1914     "b.gt       1b                             \n" | 
|  2156   : "+r"(src_raw),  // %0 |  1915   : "+r"(src_raw),  // %0 | 
|  2157     "+r"(src_stride_raw),  // %1 |  1916     "+r"(src_raw_1),  // %1 (second-row pointer computed in C above) | 
|  2158     "+r"(dst_u),     // %2 |  1917     "+r"(dst_u),     // %2 | 
|  2159     "+r"(dst_v),     // %3 |  1918     "+r"(dst_v),     // %3 | 
|  2160     "+r"(pix)        // %4 |  1919     "+r"(pix)        // %4 | 
|  2161   : |  1920   : | 
|  2162   : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", |  1921   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 
|  2163     "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |  1922     "v20", "v21", "v22", "v23", "v24", "v25" | 
|  2164   ); |  1923   ); | 
|  2165 } |  1924 } | 
|  2166 #endif  // HAS_RAWTOUVROW_NEON |  1925 #endif  // HAS_RAWTOUVROW_NEON | 
|  2167  |  1926  | 
|  2168 // 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16. |  1927 // 16x2 pixels -> 8x1.  pix is the number of RGB565 pixels per row, e.g. 16. | 
|  2169 #ifdef HAS_RGB565TOUVROW_NEON |  1928 #ifdef HAS_RGB565TOUVROW_NEON | 
|  2170 void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, |  1929 void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, | 
|  2171                         uint8* dst_u, uint8* dst_v, int pix) { |  1930                         uint8* dst_u, uint8* dst_v, int pix) { | 
 |  1931   const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565; | 
|  2172   asm volatile ( |  1932   asm volatile ( | 
|  2173     "add        %1, %0, %1                     \n"  // src_stride + src_argb |  1933     "movi       v22.8h, #56, lsl #0            \n"  // UB / VR coeff (0.875) / 2; constants live in v22-v27 because v16-v21 accumulate below | 
|  2174     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient |  1934     "movi       v23.8h, #37, lsl #0            \n"  // UG coeff (-0.5781) / 2 | 
|  2175     "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient |  1935     "movi       v24.8h, #19, lsl #0            \n"  // UR coeff (-0.2969) / 2 | 
|  2176     "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient |  1936     "movi       v25.8h, #9 , lsl #0            \n"  // VB coeff (-0.1406) / 2 | 
|  2177     "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient |  1937     "movi       v26.8h, #47, lsl #0            \n"  // VG coeff (-0.7344) / 2 | 
|  2178     "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient |  1938     "movi       v27.16b, #0x80                 \n"  // 128.5 (0x8080 in 16-bit) | 
|  2179     "vmov.u16   q15, #0x8080                   \n"  // 128.5 |  | 
|  2180     ".p2align   2                              \n" |  | 
|  2181   "1:                                          \n" |  1939   "1:                                          \n" | 
|  2182     MEMACCESS(0) |  1940     MEMACCESS(0) | 
|  2183     "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels. |  1941     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels (16 bytes) from the first row. | 
|  2184     RGB565TOARGB |  1942     RGB565TOARGB | 
|  2185     "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts. |  1943     "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts (pixels 0-7). | 
|  2186     "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts. |  1944     "uaddlp     v18.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts (pixels 0-7). | 
|  2187     "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts. |  1945     "uaddlp     v20.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts (pixels 0-7). | 
|  2188     MEMACCESS(0) |  1946     MEMACCESS(0) | 
|  2189     "vld1.8     {q0}, [%0]!                    \n"  // next 8 RGB565 pixels. |  1947     "ld1        {v0.16b}, [%0], #16            \n"  // next 8 RGB565 pixels of the first row. | 
|  2190     RGB565TOARGB |  1948     RGB565TOARGB | 
|  2191     "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts. |  1949     "uaddlp     v17.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts (pixels 8-15). | 
|  2192     "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts. |  1950     "uaddlp     v19.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts (pixels 8-15). | 
|  2193     "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts. |  1951     "uaddlp     v21.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts (pixels 8-15). | 
|  2194  |  1952  | 
|  2195     MEMACCESS(1) |  1953     MEMACCESS(1) | 
|  2196     "vld1.8     {q0}, [%1]!                    \n"  // load 8 RGB565 pixels. |  1954     "ld1        {v0.16b}, [%1], #16            \n"  // load 8 RGB565 pixels from the second row (%1). | 
|  2197     RGB565TOARGB |  1955     RGB565TOARGB | 
|  2198     "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts. |  1956     "uadalp     v16.4h, v0.8b                  \n"  // B: accumulate second-row pairs (pixels 0-7). | 
|  2199     "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts. |  1957     "uadalp     v18.4h, v1.8b                  \n"  // G: accumulate second-row pairs (pixels 0-7). | 
|  2200     "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts. |  1958     "uadalp     v20.4h, v2.8b                  \n"  // R: accumulate second-row pairs (pixels 0-7). | 
|  2201     MEMACCESS(1) |  1959     MEMACCESS(1) | 
|  2202     "vld1.8     {q0}, [%1]!                    \n"  // next 8 RGB565 pixels. |  1960     "ld1        {v0.16b}, [%1], #16            \n"  // next 8 RGB565 pixels of the second row. | 
|  2203     RGB565TOARGB |  1961     RGB565TOARGB | 
|  2204     "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts. |  1962     "uadalp     v17.4h, v0.8b                  \n"  // B: accumulate second-row pairs (pixels 8-15). | 
|  2205     "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts. |  1963     "uadalp     v19.4h, v1.8b                  \n"  // G: accumulate second-row pairs (pixels 8-15). | 
|  2206     "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts. |  1964     "uadalp     v21.4h, v2.8b                  \n"  // R: accumulate second-row pairs (pixels 8-15). | 
|  2207  |  1965  | 
|  2208     "vrshr.u16  q4, q4, #1                     \n"  // 2x average |  1966     "ins        v16.D[1], v17.D[0]             \n" | 
|  2209     "vrshr.u16  q5, q5, #1                     \n" |  1967     "ins        v18.D[1], v19.D[0]             \n" | 
|  2210     "vrshr.u16  q6, q6, #1                     \n" |  1968     "ins        v20.D[1], v21.D[0]             \n" | 
 |  1969  | 
 |  1970     "urshr      v4.8h, v16.8h, #1              \n"  // 2x average of the B sums; the ins rows above merged both 4-short halves into one 8h register | 
 |  1971     "urshr      v5.8h, v18.8h, #1              \n" | 
 |  1972     "urshr      v6.8h, v20.8h, #1              \n" | 
|  2211  |  1973  | 
|  2212     "subs       %4, %4, #16                    \n"  // 16 processed per loop. |  1974     "subs       %4, %4, #16                    \n"  // 16 pixels per row processed per loop. | 
|  2213     "vmul.s16   q8, q4, q10                    \n"  // B |  1975     "mul        v16.8h, v4.8h, v22.8h          \n"  // U  = B * UB | 
|  2214     "vmls.s16   q8, q5, q11                    \n"  // G |  1976     "mls        v16.8h, v5.8h, v23.8h          \n"  // U -= G * UG | 
|  2215     "vmls.s16   q8, q6, q12                    \n"  // R |  1977     "mls        v16.8h, v6.8h, v24.8h          \n"  // U -= R * UR | 
|  2216     "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned |  1978     "add        v16.8h, v16.8h, v27.8h         \n"  // +128 -> unsigned | 
|  2217     "vmul.s16   q9, q6, q10                    \n"  // R |  1979     "mul        v17.8h, v6.8h, v22.8h          \n"  // V  = R * VR | 
|  2218     "vmls.s16   q9, q5, q14                    \n"  // G |  1980     "mls        v17.8h, v5.8h, v26.8h          \n"  // V -= G * VG | 
|  2219     "vmls.s16   q9, q4, q13                    \n"  // B |  1981     "mls        v17.8h, v4.8h, v25.8h          \n"  // V -= B * VB | 
|  2220     "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned |  1982     "add        v17.8h, v17.8h, v27.8h         \n"  // +128 -> unsigned | 
|  2221     "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U |  1983     "uqshrn     v0.8b, v16.8h, #8              \n"  // 16 bit to 8 bit U | 
|  2222     "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V |  1984     "uqshrn     v1.8b, v17.8h, #8              \n"  // 16 bit to 8 bit V | 
|  2223     MEMACCESS(2) |  1985     MEMACCESS(2) | 
|  2224     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U. |  1986     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U. | 
|  2225     MEMACCESS(3) |  1987     MEMACCESS(3) | 
|  2226     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V. |  1988     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V. | 
|  2227     "bgt        1b                             \n" |  1989     "b.gt       1b                             \n" | 
|  2228   : "+r"(src_rgb565),  // %0 |  1990   : "+r"(src_rgb565),  // %0 | 
|  2229     "+r"(src_stride_rgb565),  // %1 |  1991     "+r"(src_rgb565_1),  // %1 (second-row pointer computed in C above) | 
|  2230     "+r"(dst_u),     // %2 |  1992     "+r"(dst_u),     // %2 | 
|  2231     "+r"(dst_v),     // %3 |  1993     "+r"(dst_v),     // %3 | 
|  2232     "+r"(pix)        // %4 |  1994     "+r"(pix)        // %4 | 
|  2233   : |  1995   : | 
|  2234   : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", |  1996   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 
|  2235     "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |  1997     "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", | 
 |  1998     "v25", "v26", "v27" | 
|  2236   ); |  1999   ); | 
|  2237 } |  2000 } | 
|  2238 #endif  // HAS_RGB565TOUVROW_NEON |  2001 #endif  // HAS_RGB565TOUVROW_NEON | 
|  2239  |  2002  | 
|  2240 // 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16. |  2003 // 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16. | 
|  2241 #ifdef HAS_ARGB1555TOUVROW_NEON |  2004 #ifdef HAS_ARGB1555TOUVROW_NEON | 
|  2242 void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, |  2005 void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, | 
|  2243                         uint8* dst_u, uint8* dst_v, int pix) { |  2006                         uint8* dst_u, uint8* dst_v, int pix) { | 
 |  2007   const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555; | 
|  2244   asm volatile ( |  2008   asm volatile ( | 
|  2245     "add        %1, %0, %1                     \n"  // src_stride + src_argb |  2009     RGBTOUV_SETUP_REG | 
|  2246     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient |  | 
|  2247     "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient |  | 
|  2248     "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient |  | 
|  2249     "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient |  | 
|  2250     "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient |  | 
|  2251     "vmov.u16   q15, #0x8080                   \n"  // 128.5 |  | 
|  2252     ".p2align   2                              \n" |  | 
|  2253   "1:                                          \n" |  2010   "1:                                          \n" | 
|  2254     MEMACCESS(0) |  2011     MEMACCESS(0) | 
|  2255     "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels. |  2012     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels. | 
|  2256     RGB555TOARGB |  2013     RGB555TOARGB | 
|  2257     "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts. |  2014     "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts. | 
|  2258     "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts. |  2015     "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts. | 
|  2259     "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts. |  2016     "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts. | 
|  2260     MEMACCESS(0) |  2017     MEMACCESS(0) | 
|  2261     "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB1555 pixels. |  2018     "ld1        {v0.16b}, [%0], #16            \n"  // next 8 ARGB1555 pixels. | 
|  2262     RGB555TOARGB |  2019     RGB555TOARGB | 
|  2263     "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts. |  2020     "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts. | 
|  2264     "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts. |  2021     "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts. | 
|  2265     "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts. |  2022     "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts. | 
|  2266  |  2023  | 
|  2267     MEMACCESS(1) |  2024     MEMACCESS(1) | 
|  2268     "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB1555 pixels. |  2025     "ld1        {v0.16b}, [%1], #16            \n"  // load 8 ARGB1555 pixels. | 
|  2269     RGB555TOARGB |  2026     RGB555TOARGB | 
|  2270     "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts. |  2027     "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts. | 
|  2271     "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts. |  2028     "uadalp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts. | 
|  2272     "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts. |  2029     "uadalp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts. | 
|  2273     MEMACCESS(1) |  2030     MEMACCESS(1) | 
|  2274     "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB1555 pixels. |  2031     "ld1        {v0.16b}, [%1], #16            \n"  // next 8 ARGB1555 pixels. | 
|  2275     RGB555TOARGB |  2032     RGB555TOARGB | 
|  2276     "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts. |  2033     "uadalp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts. | 
|  2277     "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts. |  2034     "uadalp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts. | 
|  2278     "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts. |  2035     "uadalp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts. | 
|  2279  |  2036  | 
|  2280     "vrshr.u16  q4, q4, #1                     \n"  // 2x average |  2037     "ins        v16.D[1], v26.D[0]             \n" | 
|  2281     "vrshr.u16  q5, q5, #1                     \n" |  2038     "ins        v17.D[1], v27.D[0]             \n" | 
|  2282     "vrshr.u16  q6, q6, #1                     \n" |  2039     "ins        v18.D[1], v28.D[0]             \n" | 
 |  2040  | 
 |  2041     "urshr      v4.8h, v16.8h, #1              \n"  // 2x average | 
 |  2042     "urshr      v5.8h, v17.8h, #1              \n" | 
 |  2043     "urshr      v6.8h, v18.8h, #1              \n" | 
|  2283  |  2044  | 
|  2284     "subs       %4, %4, #16                    \n"  // 16 processed per loop. |  2045     "subs       %4, %4, #16                    \n"  // 16 processed per loop. | 
|  2285     "vmul.s16   q8, q4, q10                    \n"  // B |  2046     "mul        v2.8h, v4.8h, v20.8h           \n"  // B | 
|  2286     "vmls.s16   q8, q5, q11                    \n"  // G |  2047     "mls        v2.8h, v5.8h, v21.8h           \n"  // G | 
|  2287     "vmls.s16   q8, q6, q12                    \n"  // R |  2048     "mls        v2.8h, v6.8h, v22.8h           \n"  // R | 
|  2288     "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned |  2049     "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned | 
|  2289     "vmul.s16   q9, q6, q10                    \n"  // R |  2050     "mul        v3.8h, v6.8h, v20.8h           \n"  // R | 
|  2290     "vmls.s16   q9, q5, q14                    \n"  // G |  2051     "mls        v3.8h, v5.8h, v24.8h           \n"  // G | 
|  2291     "vmls.s16   q9, q4, q13                    \n"  // B |  2052     "mls        v3.8h, v4.8h, v23.8h           \n"  // B | 
|  2292     "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned |  2053     "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned | 
|  2293     "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U |  2054     "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U | 
|  2294     "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V |  2055     "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V | 
|  2295     MEMACCESS(2) |  2056     MEMACCESS(2) | 
|  2296     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U. |  2057     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U. | 
|  2297     MEMACCESS(3) |  2058     MEMACCESS(3) | 
|  2298     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V. |  2059     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V. | 
|  2299     "bgt        1b                             \n" |  2060     "b.gt       1b                             \n" | 
|  2300   : "+r"(src_argb1555),  // %0 |  2061   : "+r"(src_argb1555),  // %0 | 
|  2301     "+r"(src_stride_argb1555),  // %1 |  2062     "+r"(src_argb1555_1),  // %1 | 
|  2302     "+r"(dst_u),     // %2 |  2063     "+r"(dst_u),     // %2 | 
|  2303     "+r"(dst_v),     // %3 |  2064     "+r"(dst_v),     // %3 | 
|  2304     "+r"(pix)        // %4 |  2065     "+r"(pix)        // %4 | 
|  2305   : |  2066   : | 
|  2306   : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", |  2067   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", | 
|  2307     "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |  2068     "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", | 
 |  2069     "v26", "v27", "v28" | 
|  2308   ); |  2070   ); | 
|  2309 } |  2071 } | 
|  2310 #endif  // HAS_ARGB1555TOUVROW_NEON |  2072 #endif  // HAS_ARGB1555TOUVROW_NEON | 
|  2311  |  2073  | 
|  2312 // 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16. |  2074 // 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16. | 
|  2313 #ifdef HAS_ARGB4444TOUVROW_NEON |  2075 #ifdef HAS_ARGB4444TOUVROW_NEON | 
|  2314 void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, |  2076 void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, | 
|  2315                           uint8* dst_u, uint8* dst_v, int pix) { |  2077                           uint8* dst_u, uint8* dst_v, int pix) { | 
 |  2078   const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444; | 
|  2316   asm volatile ( |  2079   asm volatile ( | 
|  2317     "add        %1, %0, %1                     \n"  // src_stride + src_argb |  2080     RGBTOUV_SETUP_REG | 
|  2318     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient |  | 
|  2319     "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient |  | 
|  2320     "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient |  | 
|  2321     "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient |  | 
|  2322     "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient |  | 
|  2323     "vmov.u16   q15, #0x8080                   \n"  // 128.5 |  | 
|  2324     ".p2align   2                              \n" |  | 
|  2325   "1:                                          \n" |  2081   "1:                                          \n" | 
|  2326     MEMACCESS(0) |  2082     MEMACCESS(0) | 
|  2327     "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels. |  2083     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels. | 
|  2328     ARGB4444TOARGB |  2084     ARGB4444TOARGB | 
|  2329     "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts. |  2085     "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts. | 
|  2330     "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts. |  2086     "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts. | 
|  2331     "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts. |  2087     "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts. | 
|  2332     MEMACCESS(0) |  2088     MEMACCESS(0) | 
|  2333     "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB4444 pixels. |  2089     "ld1        {v0.16b}, [%0], #16            \n"  // next 8 ARGB4444 pixels. | 
|  2334     ARGB4444TOARGB |  2090     ARGB4444TOARGB | 
|  2335     "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts. |  2091     "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts. | 
|  2336     "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts. |  2092     "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts. | 
|  2337     "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts. |  2093     "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts. | 
|  2338  |  2094  | 
|  2339     MEMACCESS(1) |  2095     MEMACCESS(1) | 
|  2340     "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB4444 pixels. |  2096     "ld1        {v0.16b}, [%1], #16            \n"  // load 8 ARGB4444 pixels. | 
|  2341     ARGB4444TOARGB |  2097     ARGB4444TOARGB | 
|  2342     "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts. |  2098     "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts. | 
|  2343     "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts. |  2099     "uadalp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts. | 
|  2344     "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts. |  2100     "uadalp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts. | 
|  2345     MEMACCESS(1) |  2101     MEMACCESS(1) | 
|  2346     "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB4444 pixels. |  2102     "ld1        {v0.16b}, [%1], #16            \n"  // next 8 ARGB4444 pixels. | 
|  2347     ARGB4444TOARGB |  2103     ARGB4444TOARGB | 
|  2348     "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts. |  2104     "uadalp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts. | 
|  2349     "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts. |  2105     "uadalp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts. | 
|  2350     "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts. |  2106     "uadalp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts. | 
|  2351  |  2107  | 
|  2352     "vrshr.u16  q4, q4, #1                     \n"  // 2x average |  2108     "ins        v16.D[1], v26.D[0]             \n" | 
|  2353     "vrshr.u16  q5, q5, #1                     \n" |  2109     "ins        v17.D[1], v27.D[0]             \n" | 
|  2354     "vrshr.u16  q6, q6, #1                     \n" |  2110     "ins        v18.D[1], v28.D[0]             \n" | 
 |  2111  | 
 |  2112     "urshr      v4.8h, v16.8h, #1              \n"  // 2x average | 
 |  2113     "urshr      v5.8h, v17.8h, #1              \n" | 
 |  2114     "urshr      v6.8h, v18.8h, #1              \n" | 
|  2355  |  2115  | 
|  2356     "subs       %4, %4, #16                    \n"  // 16 processed per loop. |  2116     "subs       %4, %4, #16                    \n"  // 16 processed per loop. | 
|  2357     "vmul.s16   q8, q4, q10                    \n"  // B |  2117     "mul        v2.8h, v4.8h, v20.8h           \n"  // B | 
|  2358     "vmls.s16   q8, q5, q11                    \n"  // G |  2118     "mls        v2.8h, v5.8h, v21.8h           \n"  // G | 
|  2359     "vmls.s16   q8, q6, q12                    \n"  // R |  2119     "mls        v2.8h, v6.8h, v22.8h           \n"  // R | 
|  2360     "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned |  2120     "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned | 
|  2361     "vmul.s16   q9, q6, q10                    \n"  // R |  2121     "mul        v3.8h, v6.8h, v20.8h           \n"  // R | 
|  2362     "vmls.s16   q9, q5, q14                    \n"  // G |  2122     "mls        v3.8h, v5.8h, v24.8h           \n"  // G | 
|  2363     "vmls.s16   q9, q4, q13                    \n"  // B |  2123     "mls        v3.8h, v4.8h, v23.8h           \n"  // B | 
|  2364     "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned |  2124     "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned | 
|  2365     "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U |  2125     "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U | 
|  2366     "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V |  2126     "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V | 
|  2367     MEMACCESS(2) |  2127     MEMACCESS(2) | 
|  2368     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U. |  2128     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U. | 
|  2369     MEMACCESS(3) |  2129     MEMACCESS(3) | 
|  2370     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V. |  2130     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V. | 
|  2371     "bgt        1b                             \n" |  2131     "b.gt       1b                             \n" | 
|  2372   : "+r"(src_argb4444),  // %0 |  2132   : "+r"(src_argb4444),  // %0 | 
|  2373     "+r"(src_stride_argb4444),  // %1 |  2133     "+r"(src_argb4444_1),  // %1 | 
|  2374     "+r"(dst_u),     // %2 |  2134     "+r"(dst_u),     // %2 | 
|  2375     "+r"(dst_v),     // %3 |  2135     "+r"(dst_v),     // %3 | 
|  2376     "+r"(pix)        // %4 |  2136     "+r"(pix)        // %4 | 
|  2377   : |  2137   : | 
|  2378   : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", |  2138   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", | 
|  2379     "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |  2139     "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", | 
 |  2140     "v26", "v27", "v28" | 
 |  2141  | 
|  2380   ); |  2142   ); | 
|  2381 } |  2143 } | 
|  2382 #endif  // HAS_ARGB4444TOUVROW_NEON |  2144 #endif  // HAS_ARGB4444TOUVROW_NEON | 
|  2383  |  2145  | 
|  2384 #ifdef HAS_RGB565TOYROW_NEON |  2146 #ifdef HAS_RGB565TOYROW_NEON | 
|  2385 void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { |  2147 void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { | 
|  2386   asm volatile ( |  2148   asm volatile ( | 
|  2387     "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient |  2149     "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient | 
|  2388     "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient |  2150     "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient | 
|  2389     "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient |  2151     "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient | 
|  2390     "vmov.u8    d27, #16                       \n"  // Add 16 constant |  2152     "movi       v27.8b, #16                    \n"  // Add 16 constant | 
|  2391     ".p2align   2                              \n" |  | 
|  2392   "1:                                          \n" |  2153   "1:                                          \n" | 
|  2393     MEMACCESS(0) |  2154     MEMACCESS(0) | 
|  2394     "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels. |  2155     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels. | 
|  2395     "subs       %2, %2, #8                     \n"  // 8 processed per loop. |  2156     "subs       %2, %2, #8                     \n"  // 8 processed per loop. | 
|  2396     RGB565TOARGB |  2157     RGB565TOARGB | 
|  2397     "vmull.u8   q2, d0, d24                    \n"  // B |  2158     "umull      v3.8h, v0.8b, v24.8b           \n"  // B | 
|  2398     "vmlal.u8   q2, d1, d25                    \n"  // G |  2159     "umlal      v3.8h, v1.8b, v25.8b           \n"  // G | 
|  2399     "vmlal.u8   q2, d2, d26                    \n"  // R |  2160     "umlal      v3.8h, v2.8b, v26.8b           \n"  // R | 
|  2400     "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y |  2161     "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y | 
|  2401     "vqadd.u8   d0, d27                        \n" |  2162     "uqadd      v0.8b, v0.8b, v27.8b           \n" | 
|  2402     MEMACCESS(1) |  2163     MEMACCESS(1) | 
|  2403     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y. |  2164     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y. | 
|  2404     "bgt        1b                             \n" |  2165     "b.gt       1b                             \n" | 
|  2405   : "+r"(src_rgb565),  // %0 |  2166   : "+r"(src_rgb565),  // %0 | 
|  2406     "+r"(dst_y),       // %1 |  2167     "+r"(dst_y),       // %1 | 
|  2407     "+r"(pix)          // %2 |  2168     "+r"(pix)          // %2 | 
|  2408   : |  2169   : | 
|  2409   : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" |  2170   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", | 
 |  2171     "v24", "v25", "v26", "v27" | 
|  2410   ); |  2172   ); | 
|  2411 } |  2173 } | 
|  2412 #endif  // HAS_RGB565TOYROW_NEON |  2174 #endif  // HAS_RGB565TOYROW_NEON | 
|  2413  |  2175  | 
|  2414 #ifdef HAS_ARGB1555TOYROW_NEON |  2176 #ifdef HAS_ARGB1555TOYROW_NEON | 
|  2415 void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { |  2177 void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { | 
|  2416   asm volatile ( |  2178   asm volatile ( | 
|  2417     "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient |  2179     "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient | 
|  2418     "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient |  2180     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient | 
|  2419     "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient |  2181     "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient | 
|  2420     "vmov.u8    d27, #16                       \n"  // Add 16 constant |  2182     "movi       v7.8b, #16                     \n"  // Add 16 constant | 
|  2421     ".p2align   2                              \n" |  | 
|  2422   "1:                                          \n" |  2183   "1:                                          \n" | 
|  2423     MEMACCESS(0) |  2184     MEMACCESS(0) | 
|  2424     "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels. |  2185     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels. | 
|  2425     "subs       %2, %2, #8                     \n"  // 8 processed per loop. |  2186     "subs       %2, %2, #8                     \n"  // 8 processed per loop. | 
|  2426     ARGB1555TOARGB |  2187     ARGB1555TOARGB | 
|  2427     "vmull.u8   q2, d0, d24                    \n"  // B |  2188     "umull      v3.8h, v0.8b, v4.8b            \n"  // B | 
|  2428     "vmlal.u8   q2, d1, d25                    \n"  // G |  2189     "umlal      v3.8h, v1.8b, v5.8b            \n"  // G | 
|  2429     "vmlal.u8   q2, d2, d26                    \n"  // R |  2190     "umlal      v3.8h, v2.8b, v6.8b            \n"  // R | 
|  2430     "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y |  2191     "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y | 
|  2431     "vqadd.u8   d0, d27                        \n" |  2192     "uqadd      v0.8b, v0.8b, v7.8b            \n" | 
|  2432     MEMACCESS(1) |  2193     MEMACCESS(1) | 
|  2433     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y. |  2194     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y. | 
|  2434     "bgt        1b                             \n" |  2195     "b.gt       1b                             \n" | 
|  2435   : "+r"(src_argb1555),  // %0 |  2196   : "+r"(src_argb1555),  // %0 | 
|  2436     "+r"(dst_y),         // %1 |  2197     "+r"(dst_y),         // %1 | 
|  2437     "+r"(pix)            // %2 |  2198     "+r"(pix)            // %2 | 
|  2438   : |  2199   : | 
|  2439   : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" |  2200   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" | 
|  2440   ); |  2201   ); | 
|  2441 } |  2202 } | 
|  2442 #endif  // HAS_ARGB1555TOYROW_NEON |  2203 #endif  // HAS_ARGB1555TOYROW_NEON | 
|  2443  |  2204  | 
|  2444 #ifdef HAS_ARGB4444TOYROW_NEON |  2205 #ifdef HAS_ARGB4444TOYROW_NEON | 
|  2445 void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { |  2206 void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { | 
|  2446   asm volatile ( |  2207   asm volatile ( | 
|  2447     "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient |  2208     "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient | 
|  2448     "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient |  2209     "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient | 
|  2449     "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient |  2210     "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient | 
|  2450     "vmov.u8    d27, #16                       \n"  // Add 16 constant |  2211     "movi       v27.8b, #16                    \n"  // Add 16 constant | 
|  2451     ".p2align   2                              \n" |  | 
|  2452   "1:                                          \n" |  2212   "1:                                          \n" | 
|  2453     MEMACCESS(0) |  2213     MEMACCESS(0) | 
|  2454     "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels. |  2214     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels. | 
|  2455     "subs       %2, %2, #8                     \n"  // 8 processed per loop. |  2215     "subs       %2, %2, #8                     \n"  // 8 processed per loop. | 
|  2456     ARGB4444TOARGB |  2216     ARGB4444TOARGB | 
|  2457     "vmull.u8   q2, d0, d24                    \n"  // B |  2217     "umull      v3.8h, v0.8b, v24.8b           \n"  // B | 
|  2458     "vmlal.u8   q2, d1, d25                    \n"  // G |  2218     "umlal      v3.8h, v1.8b, v25.8b           \n"  // G | 
|  2459     "vmlal.u8   q2, d2, d26                    \n"  // R |  2219     "umlal      v3.8h, v2.8b, v26.8b           \n"  // R | 
|  2460     "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y |  2220     "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y | 
|  2461     "vqadd.u8   d0, d27                        \n" |  2221     "uqadd      v0.8b, v0.8b, v27.8b           \n" | 
|  2462     MEMACCESS(1) |  2222     MEMACCESS(1) | 
|  2463     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y. |  2223     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y. | 
|  2464     "bgt        1b                             \n" |  2224     "b.gt       1b                             \n" | 
|  2465   : "+r"(src_argb4444),  // %0 |  2225   : "+r"(src_argb4444),  // %0 | 
|  2466     "+r"(dst_y),         // %1 |  2226     "+r"(dst_y),         // %1 | 
|  2467     "+r"(pix)            // %2 |  2227     "+r"(pix)            // %2 | 
|  2468   : |  2228   : | 
|  2469   : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" |  2229   : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27" | 
|  2470   ); |  2230   ); | 
|  2471 } |  2231 } | 
|  2472 #endif  // HAS_ARGB4444TOYROW_NEON |  2232 #endif  // HAS_ARGB4444TOYROW_NEON | 
|  2473  |  2233  | 
|  2474 #ifdef HAS_BGRATOYROW_NEON |  2234 #ifdef HAS_BGRATOYROW_NEON | 
|  2475 void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { |  2235 void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { | 
|  2476   asm volatile ( |  2236   asm volatile ( | 
|  2477     "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient |  2237     "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient | 
|  2478     "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient |  2238     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient | 
|  2479     "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient |  2239     "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient | 
|  2480     "vmov.u8    d7, #16                        \n"  // Add 16 constant |  2240     "movi       v7.8b, #16                     \n"  // Add 16 constant | 
|  2481     ".p2align   2                              \n" |  | 
|  2482   "1:                                          \n" |  2241   "1:                                          \n" | 
|  2483     MEMACCESS(0) |  2242     MEMACCESS(0) | 
|  2484     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of BGRA. |  2243     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels. | 
|  2485     "subs       %2, %2, #8                     \n"  // 8 processed per loop. |  2244     "subs       %2, %2, #8                     \n"  // 8 processed per loop. | 
|  2486     "vmull.u8   q8, d1, d4                     \n"  // R |  2245     "umull      v16.8h, v1.8b, v4.8b           \n"  // R | 
|  2487     "vmlal.u8   q8, d2, d5                     \n"  // G |  2246     "umlal      v16.8h, v2.8b, v5.8b           \n"  // G | 
|  2488     "vmlal.u8   q8, d3, d6                     \n"  // B |  2247     "umlal      v16.8h, v3.8b, v6.8b           \n"  // B | 
|  2489     "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y |  2248     "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y | 
|  2490     "vqadd.u8   d0, d7                         \n" |  2249     "uqadd      v0.8b, v0.8b, v7.8b            \n" | 
|  2491     MEMACCESS(1) |  2250     MEMACCESS(1) | 
|  2492     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y. |  2251     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y. | 
|  2493     "bgt        1b                             \n" |  2252     "b.gt       1b                             \n" | 
|  2494   : "+r"(src_bgra),  // %0 |  2253   : "+r"(src_bgra),  // %0 | 
|  2495     "+r"(dst_y),     // %1 |  2254     "+r"(dst_y),     // %1 | 
|  2496     "+r"(pix)        // %2 |  2255     "+r"(pix)        // %2 | 
|  2497   : |  2256   : | 
|  2498   : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" |  2257   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" | 
|  2499   ); |  2258   ); | 
|  2500 } |  2259 } | 
|  2501 #endif  // HAS_BGRATOYROW_NEON |  2260 #endif  // HAS_BGRATOYROW_NEON | 
|  2502  |  2261  | 
|  2503 #ifdef HAS_ABGRTOYROW_NEON |  2262 #ifdef HAS_ABGRTOYROW_NEON | 
|  2504 void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { |  2263 void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { | 
|  2505   asm volatile ( |  2264   asm volatile ( | 
|  2506     "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient |  2265     "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient | 
|  2507     "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient |  2266     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient | 
|  2508     "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient |  2267     "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient | 
|  2509     "vmov.u8    d7, #16                        \n"  // Add 16 constant |  2268     "movi       v7.8b, #16                     \n"  // Add 16 constant | 
|  2510     ".p2align   2                              \n" |  | 
|  2511   "1:                                          \n" |  2269   "1:                                          \n" | 
|  2512     MEMACCESS(0) |  2270     MEMACCESS(0) | 
|  2513     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ABGR. |  2271     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels. | 
|  2514     "subs       %2, %2, #8                     \n"  // 8 processed per loop. |  2272     "subs       %2, %2, #8                     \n"  // 8 processed per loop. | 
|  2515     "vmull.u8   q8, d0, d4                     \n"  // R |  2273     "umull      v16.8h, v0.8b, v4.8b           \n"  // R | 
|  2516     "vmlal.u8   q8, d1, d5                     \n"  // G |  2274     "umlal      v16.8h, v1.8b, v5.8b           \n"  // G | 
|  2517     "vmlal.u8   q8, d2, d6                     \n"  // B |  2275     "umlal      v16.8h, v2.8b, v6.8b           \n"  // B | 
|  2518     "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y |  2276     "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y | 
|  2519     "vqadd.u8   d0, d7                         \n" |  2277     "uqadd      v0.8b, v0.8b, v7.8b            \n" | 
|  2520     MEMACCESS(1) |  2278     MEMACCESS(1) | 
|  2521     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y. |  2279     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y. | 
|  2522     "bgt        1b                             \n" |  2280     "b.gt       1b                             \n" | 
|  2523   : "+r"(src_abgr),  // %0 |  2281   : "+r"(src_abgr),  // %0 | 
|  2524     "+r"(dst_y),  // %1 |  2282     "+r"(dst_y),     // %1 | 
|  2525     "+r"(pix)        // %2 |  2283     "+r"(pix)        // %2 | 
|  2526   : |  2284   : | 
|  2527   : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" |  2285   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" | 
|  2528   ); |  2286   ); | 
|  2529 } |  2287 } | 
|  2530 #endif  // HAS_ABGRTOYROW_NEON |  2288 #endif  // HAS_ABGRTOYROW_NEON | 
|  2531  |  2289  | 
|  2532 #ifdef HAS_RGBATOYROW_NEON |  2290 #ifdef HAS_RGBATOYROW_NEON | 
|  2533 void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { |  2291 void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { | 
|  2534   asm volatile ( |  2292   asm volatile ( | 
|  2535     "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient |  2293     "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient | 
|  2536     "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient |  2294     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient | 
|  2537     "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient |  2295     "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient | 
|  2538     "vmov.u8    d7, #16                        \n"  // Add 16 constant |  2296     "movi       v7.8b, #16                     \n"  // Add 16 constant | 
|  2539     ".p2align   2                              \n" |  | 
|  2540   "1:                                          \n" |  2297   "1:                                          \n" | 
|  2541     MEMACCESS(0) |  2298     MEMACCESS(0) | 
|  2542     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of RGBA. |  2299     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels. | 
|  2543     "subs       %2, %2, #8                     \n"  // 8 processed per loop. |  2300     "subs       %2, %2, #8                     \n"  // 8 processed per loop. | 
|  2544     "vmull.u8   q8, d1, d4                     \n"  // B |  2301     "umull      v16.8h, v1.8b, v4.8b           \n"  // B | 
|  2545     "vmlal.u8   q8, d2, d5                     \n"  // G |  2302     "umlal      v16.8h, v2.8b, v5.8b           \n"  // G | 
|  2546     "vmlal.u8   q8, d3, d6                     \n"  // R |  2303     "umlal      v16.8h, v3.8b, v6.8b           \n"  // R | 
|  2547     "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y |  2304     "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y | 
|  2548     "vqadd.u8   d0, d7                         \n" |  2305     "uqadd      v0.8b, v0.8b, v7.8b            \n" | 
|  2549     MEMACCESS(1) |  2306     MEMACCESS(1) | 
|  2550     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y. |  2307     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y. | 
|  2551     "bgt        1b                             \n" |  2308     "b.gt       1b                             \n" | 
|  2552   : "+r"(src_rgba),  // %0 |  2309   : "+r"(src_rgba),  // %0 | 
|  2553     "+r"(dst_y),  // %1 |  2310     "+r"(dst_y),     // %1 | 
|  2554     "+r"(pix)        // %2 |  2311     "+r"(pix)        // %2 | 
|  2555   : |  2312   : | 
|  2556   : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" |  2313   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" | 
|  2557   ); |  2314   ); | 
|  2558 } |  2315 } | 
|  2559 #endif  // HAS_RGBATOYROW_NEON |  2316 #endif  // HAS_RGBATOYROW_NEON | 
|  2560  |  2317  | 
|  2561 #ifdef HAS_RGB24TOYROW_NEON |  2318 #ifdef HAS_RGB24TOYROW_NEON | 
|  2562 void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { |  2319 void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { | 
|  2563   asm volatile ( |  2320   asm volatile ( | 
|  2564     "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient |  2321     "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient | 
|  2565     "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient |  2322     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient | 
|  2566     "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient |  2323     "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient | 
|  2567     "vmov.u8    d7, #16                        \n"  // Add 16 constant |  2324     "movi       v7.8b, #16                     \n"  // Add 16 constant | 
|  2568     ".p2align   2                              \n" |  | 
|  2569   "1:                                          \n" |  2325   "1:                                          \n" | 
|  2570     MEMACCESS(0) |  2326     MEMACCESS(0) | 
|  2571     "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RGB24. |  2327     "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels. | 
|  2572     "subs       %2, %2, #8                     \n"  // 8 processed per loop. |  2328     "subs       %2, %2, #8                     \n"  // 8 processed per loop. | 
|  2573     "vmull.u8   q8, d0, d4                     \n"  // B |  2329     "umull      v16.8h, v0.8b, v4.8b           \n"  // B | 
|  2574     "vmlal.u8   q8, d1, d5                     \n"  // G |  2330     "umlal      v16.8h, v1.8b, v5.8b           \n"  // G | 
|  2575     "vmlal.u8   q8, d2, d6                     \n"  // R |  2331     "umlal      v16.8h, v2.8b, v6.8b           \n"  // R | 
|  2576     "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y |  2332     "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y | 
|  2577     "vqadd.u8   d0, d7                         \n" |  2333     "uqadd      v0.8b, v0.8b, v7.8b            \n" | 
|  2578     MEMACCESS(1) |  2334     MEMACCESS(1) | 
|  2579     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y. |  2335     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y. | 
|  2580     "bgt        1b                             \n" |  2336     "b.gt       1b                             \n" | 
|  2581   : "+r"(src_rgb24),  // %0 |  2337   : "+r"(src_rgb24),  // %0 | 
|  2582     "+r"(dst_y),  // %1 |  2338     "+r"(dst_y),      // %1 | 
|  2583     "+r"(pix)        // %2 |  2339     "+r"(pix)         // %2 | 
|  2584   : |  2340   : | 
|  2585   : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" |  2341   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" | 
|  2586   ); |  2342   ); | 
|  2587 } |  2343 } | 
|  2588 #endif  // HAS_RGB24TOYROW_NEON |  2344 #endif  // HAS_RGB24TOYROW_NEON | 
|  2589  |  2345  | 
|  2590 #ifdef HAS_RAWTOYROW_NEON |  2346 #ifdef HAS_RAWTOYROW_NEON | 
|  2591 void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { |  2347 void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { | 
|  2592   asm volatile ( |  2348   asm volatile ( | 
|  2593     "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient |  2349     "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient | 
|  2594     "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient |  2350     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient | 
|  2595     "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient |  2351     "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient | 
|  2596     "vmov.u8    d7, #16                        \n"  // Add 16 constant |  2352     "movi       v7.8b, #16                     \n"  // Add 16 constant | 
|  2597     ".p2align   2                              \n" |  | 
|  2598   "1:                                          \n" |  2353   "1:                                          \n" | 
|  2599     MEMACCESS(0) |  2354     MEMACCESS(0) | 
|  2600     "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RAW. |  2355     "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels. | 
|  2601     "subs       %2, %2, #8                     \n"  // 8 processed per loop. |  2356     "subs       %2, %2, #8                     \n"  // 8 processed per loop. | 
|  2602     "vmull.u8   q8, d0, d4                     \n"  // R |  2357     "umull      v16.8h, v0.8b, v4.8b           \n"  // R | 
|  2603     "vmlal.u8   q8, d1, d5                     \n"  // G |  2358     "umlal      v16.8h, v1.8b, v5.8b           \n"  // G | 
|  2604     "vmlal.u8   q8, d2, d6                     \n"  // B |  2359     "umlal      v16.8h, v2.8b, v6.8b           \n"  // B | 
|  2605     "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y |  2360     "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y | 
|  2606     "vqadd.u8   d0, d7                         \n" |  2361     "uqadd      v0.8b, v0.8b, v7.8b            \n" | 
|  2607     MEMACCESS(1) |  2362     MEMACCESS(1) | 
|  2608     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y. |  2363     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y. | 
|  2609     "bgt        1b                             \n" |  2364     "b.gt       1b                             \n" | 
|  2610   : "+r"(src_raw),  // %0 |  2365   : "+r"(src_raw),  // %0 | 
|  2611     "+r"(dst_y),  // %1 |  2366     "+r"(dst_y),    // %1 | 
|  2612     "+r"(pix)        // %2 |  2367     "+r"(pix)       // %2 | 
|  2613   : |  2368   : | 
|  2614   : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" |  2369   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" | 
|  2615   ); |  2370   ); | 
|  2616 } |  2371 } | 
|  2617 #endif  // HAS_RAWTOYROW_NEON |  2372 #endif  // HAS_RAWTOYROW_NEON | 
|  2618  |  2373  | 
|  2619 // Bilinear filter 16x2 -> 16x1 |  2374 // Bilinear filter 16x2 -> 16x1 | 
|  2620 #ifdef HAS_INTERPOLATEROW_NEON |  2375 #ifdef HAS_INTERPOLATEROW_NEON | 
|  2621 void InterpolateRow_NEON(uint8* dst_ptr, |  2376 void InterpolateRow_NEON(uint8* dst_ptr, | 
|  2622                          const uint8* src_ptr, ptrdiff_t src_stride, |  2377                          const uint8* src_ptr, ptrdiff_t src_stride, | 
|  2623                          int dst_width, int source_y_fraction) { |  2378                          int dst_width, int source_y_fraction) { | 
 |  2379   int y1_fraction = source_y_fraction; | 
 |  2380   int y0_fraction = 256 - y1_fraction; | 
 |  2381   const uint8* src_ptr1 = src_ptr + src_stride; | 
|  2624   asm volatile ( |  2382   asm volatile ( | 
|  2625     "cmp        %4, #0                         \n" |  2383     "cmp        %4, #0                         \n" | 
|  2626     "beq        100f                           \n" |  2384     "b.eq       100f                           \n" | 
|  2627     "add        %2, %1                         \n" |  | 
|  2628     "cmp        %4, #64                        \n" |  2385     "cmp        %4, #64                        \n" | 
|  2629     "beq        75f                            \n" |  2386     "b.eq       75f                            \n" | 
|  2630     "cmp        %4, #128                       \n" |  2387     "cmp        %4, #128                       \n" | 
|  2631     "beq        50f                            \n" |  2388     "b.eq       50f                            \n" | 
|  2632     "cmp        %4, #192                       \n" |  2389     "cmp        %4, #192                       \n" | 
|  2633     "beq        25f                            \n" |  2390     "b.eq       25f                            \n" | 
|  2634  |  2391  | 
|  2635     "vdup.8     d5, %4                         \n" |  2392     "dup        v5.16b, %w4                    \n" | 
|  2636     "rsb        %4, #256                       \n" |  2393     "dup        v4.16b, %w5                    \n" | 
|  2637     "vdup.8     d4, %4                         \n" |  | 
|  2638     // General purpose row blend. |  2394     // General purpose row blend. | 
|  2639   "1:                                          \n" |  2395   "1:                                          \n" | 
|  2640     MEMACCESS(1) |  2396     MEMACCESS(1) | 
|  2641     "vld1.8     {q0}, [%1]!                    \n" |  2397     "ld1        {v0.16b}, [%1], #16            \n" | 
|  2642     MEMACCESS(2) |  2398     MEMACCESS(2) | 
|  2643     "vld1.8     {q1}, [%2]!                    \n" |  2399     "ld1        {v1.16b}, [%2], #16            \n" | 
|  2644     "subs       %3, %3, #16                    \n" |  2400     "subs       %3, %3, #16                    \n" | 
|  2645     "vmull.u8   q13, d0, d4                    \n" |  2401     "umull      v2.8h, v0.8b,  v4.8b           \n" | 
|  2646     "vmull.u8   q14, d1, d4                    \n" |  2402     "umull2     v3.8h, v0.16b, v4.16b          \n" | 
|  2647     "vmlal.u8   q13, d2, d5                    \n" |  2403     "umlal      v2.8h, v1.8b,  v5.8b           \n" | 
|  2648     "vmlal.u8   q14, d3, d5                    \n" |  2404     "umlal2     v3.8h, v1.16b, v5.16b          \n" | 
|  2649     "vrshrn.u16 d0, q13, #8                    \n" |  2405     "rshrn      v0.8b,  v2.8h, #8              \n" | 
|  2650     "vrshrn.u16 d1, q14, #8                    \n" |  2406     "rshrn2     v0.16b, v3.8h, #8              \n" | 
|  2651     MEMACCESS(0) |  2407     MEMACCESS(0) | 
|  2652     "vst1.8     {q0}, [%0]!                    \n" |  2408     "st1        {v0.16b}, [%0], #16            \n" | 
|  2653     "bgt        1b                             \n" |  2409     "b.gt       1b                             \n" | 
|  2654     "b          99f                            \n" |  2410     "b          99f                            \n" | 
|  2655  |  2411  | 
|  2656     // Blend 25 / 75. |  2412     // Blend 25 / 75. | 
|  2657   "25:                                         \n" |  2413   "25:                                         \n" | 
|  2658     MEMACCESS(1) |  2414     MEMACCESS(1) | 
|  2659     "vld1.8     {q0}, [%1]!                    \n" |  2415     "ld1        {v0.16b}, [%1], #16            \n" | 
|  2660     MEMACCESS(2) |  2416     MEMACCESS(2) | 
|  2661     "vld1.8     {q1}, [%2]!                    \n" |  2417     "ld1        {v1.16b}, [%2], #16            \n" | 
|  2662     "subs       %3, %3, #16                    \n" |  2418     "subs       %3, %3, #16                    \n" | 
|  2663     "vrhadd.u8  q0, q1                         \n" |  2419     "urhadd     v0.16b, v0.16b, v1.16b         \n" | 
|  2664     "vrhadd.u8  q0, q1                         \n" |  2420     "urhadd     v0.16b, v0.16b, v1.16b         \n" | 
|  2665     MEMACCESS(0) |  2421     MEMACCESS(0) | 
|  2666     "vst1.8     {q0}, [%0]!                    \n" |  2422     "st1        {v0.16b}, [%0], #16            \n" | 
|  2667     "bgt        25b                            \n" |  2423     "b.gt       25b                            \n" | 
|  2668     "b          99f                            \n" |  2424     "b          99f                            \n" | 
|  2669  |  2425  | 
|  2670     // Blend 50 / 50. |  2426     // Blend 50 / 50. | 
|  2671   "50:                                         \n" |  2427   "50:                                         \n" | 
|  2672     MEMACCESS(1) |  2428     MEMACCESS(1) | 
|  2673     "vld1.8     {q0}, [%1]!                    \n" |  2429     "ld1        {v0.16b}, [%1], #16            \n" | 
|  2674     MEMACCESS(2) |  2430     MEMACCESS(2) | 
|  2675     "vld1.8     {q1}, [%2]!                    \n" |  2431     "ld1        {v1.16b}, [%2], #16            \n" | 
|  2676     "subs       %3, %3, #16                    \n" |  2432     "subs       %3, %3, #16                    \n" | 
|  2677     "vrhadd.u8  q0, q1                         \n" |  2433     "urhadd     v0.16b, v0.16b, v1.16b         \n" | 
|  2678     MEMACCESS(0) |  2434     MEMACCESS(0) | 
|  2679     "vst1.8     {q0}, [%0]!                    \n" |  2435     "st1        {v0.16b}, [%0], #16            \n" | 
|  2680     "bgt        50b                            \n" |  2436     "b.gt       50b                            \n" | 
|  2681     "b          99f                            \n" |  2437     "b          99f                            \n" | 
|  2682  |  2438  | 
|  2683     // Blend 75 / 25. |  2439     // Blend 75 / 25. | 
|  2684   "75:                                         \n" |  2440   "75:                                         \n" | 
|  2685     MEMACCESS(1) |  2441     MEMACCESS(1) | 
|  2686     "vld1.8     {q1}, [%1]!                    \n" |  2442     "ld1        {v1.16b}, [%1], #16            \n" | 
|  2687     MEMACCESS(2) |  2443     MEMACCESS(2) | 
|  2688     "vld1.8     {q0}, [%2]!                    \n" |  2444     "ld1        {v0.16b}, [%2], #16            \n" | 
|  2689     "subs       %3, %3, #16                    \n" |  2445     "subs       %3, %3, #16                    \n" | 
|  2690     "vrhadd.u8  q0, q1                         \n" |  2446     "urhadd     v0.16b, v0.16b, v1.16b         \n" | 
|  2691     "vrhadd.u8  q0, q1                         \n" |  2447     "urhadd     v0.16b, v0.16b, v1.16b         \n" | 
|  2692     MEMACCESS(0) |  2448     MEMACCESS(0) | 
|  2693     "vst1.8     {q0}, [%0]!                    \n" |  2449     "st1        {v0.16b}, [%0], #16            \n" | 
|  2694     "bgt        75b                            \n" |  2450     "b.gt       75b                            \n" | 
|  2695     "b          99f                            \n" |  2451     "b          99f                            \n" | 
|  2696  |  2452  | 
|  2697     // Blend 100 / 0 - Copy row unchanged. |  2453     // Blend 100 / 0 - Copy row unchanged. | 
|  2698   "100:                                        \n" |  2454   "100:                                        \n" | 
|  2699     MEMACCESS(1) |  2455     MEMACCESS(1) | 
|  2700     "vld1.8     {q0}, [%1]!                    \n" |  2456     "ld1        {v0.16b}, [%1], #16            \n" | 
|  2701     "subs       %3, %3, #16                    \n" |  2457     "subs       %3, %3, #16                    \n" | 
|  2702     MEMACCESS(0) |  2458     MEMACCESS(0) | 
|  2703     "vst1.8     {q0}, [%0]!                    \n" |  2459     "st1        {v0.16b}, [%0], #16            \n" | 
|  2704     "bgt        100b                           \n" |  2460     "b.gt       100b                           \n" | 
|  2705  |  2461  | 
|  2706   "99:                                         \n" |  2462   "99:                                         \n" | 
|  2707   : "+r"(dst_ptr),          // %0 |  2463   : "+r"(dst_ptr),          // %0 | 
|  2708     "+r"(src_ptr),          // %1 |  2464     "+r"(src_ptr),          // %1 | 
|  2709     "+r"(src_stride),       // %2 |  2465     "+r"(src_ptr1),         // %2 | 
|  2710     "+r"(dst_width),        // %3 |  2466     "+r"(dst_width),        // %3 | 
|  2711     "+r"(source_y_fraction) // %4 |  2467     "+r"(y1_fraction),      // %4 | 
 |  2468     "+r"(y0_fraction)       // %5 | 
|  2712   : |  2469   : | 
|  2713   : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14" |  2470   : "cc", "memory", "v0", "v1", "v3", "v4", "v5" | 
|  2714   ); |  2471   ); | 
|  2715 } |  2472 } | 
|  2716 #endif  // HAS_INTERPOLATEROW_NEON |  2473 #endif  // HAS_INTERPOLATEROW_NEON | 
|  2717  |  2474  | 
|  2718 // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr |  2475 // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr | 
|  2719 #ifdef HAS_ARGBBLENDROW_NEON |  2476 #ifdef HAS_ARGBBLENDROW_NEON | 
|  2720 void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, |  2477 void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, | 
|  2721                        uint8* dst_argb, int width) { |  2478                        uint8* dst_argb, int width) { | 
|  2722   asm volatile ( |  2479   asm volatile ( | 
|  2723     "subs       %3, #8                         \n" |  2480     "subs       %3, %3, #8                     \n" | 
|  2724     "blt        89f                            \n" |  2481     "b.lt       89f                            \n" | 
|  2725     // Blend 8 pixels. |  2482     // Blend 8 pixels. | 
|  2726   "8:                                          \n" |  2483   "8:                                          \n" | 
|  2727     MEMACCESS(0) |  2484     MEMACCESS(0) | 
|  2728     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB0. |  2485     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB0 pixels | 
|  2729     MEMACCESS(1) |  2486     MEMACCESS(1) | 
|  2730     "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 pixels of ARGB1. |  2487     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 ARGB1 pixels | 
|  2731     "subs       %3, %3, #8                     \n"  // 8 processed per loop. |  2488     "subs       %3, %3, #8                     \n"  // 8 processed per loop. | 
|  2732     "vmull.u8   q10, d4, d3                    \n"  // db * a |  2489     "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a | 
|  2733     "vmull.u8   q11, d5, d3                    \n"  // dg * a |  2490     "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a | 
|  2734     "vmull.u8   q12, d6, d3                    \n"  // dr * a |  2491     "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a | 
|  2735     "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8 |  2492     "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8 | 
|  2736     "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8 |  2493     "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8 | 
|  2737     "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8 |  2494     "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8 | 
|  2738     "vqsub.u8   q2, q2, q10                    \n"  // dbg - dbg * a / 256 |  2495     "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256) | 
|  2739     "vqsub.u8   d6, d6, d22                    \n"  // dr - dr * a / 256 |  2496     "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256) | 
|  2740     "vqadd.u8   q0, q0, q2                     \n"  // + sbg |  2497     "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256) | 
|  2741     "vqadd.u8   d2, d2, d6                     \n"  // + sr |  2498     "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb | 
|  2742     "vmov.u8    d3, #255                       \n"  // a = 255 |  2499     "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg | 
 |  2500     "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr | 
 |  2501     "movi       v3.8b, #255                    \n"  // a = 255 | 
|  2743     MEMACCESS(2) |  2502     MEMACCESS(2) | 
|  2744     "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 pixels of ARGB. |  2503     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels | 
|  2745     "bge        8b                             \n" |  2504     "b.ge       8b                             \n" | 
|  2746  |  2505  | 
|  2747   "89:                                         \n" |  2506   "89:                                         \n" | 
|  2748     "adds       %3, #8-1                       \n" |  2507     "adds       %3, %3, #8-1                   \n" | 
|  2749     "blt        99f                            \n" |  2508     "b.lt       99f                            \n" | 
|  2750  |  2509  | 
|  2751     // Blend 1 pixel. |  2510     // Blend 1 pixel. | 
|  2752   "1:                                          \n" |  2511   "1:                                          \n" | 
|  2753     MEMACCESS(0) |  2512     MEMACCESS(0) | 
|  2754     "vld4.8     {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n"  // load 1 pixel ARGB0. |  2513     "ld4        {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n"  // load 1 pixel ARGB0. | 
|  2755     MEMACCESS(1) |  2514     MEMACCESS(1) | 
|  2756     "vld4.8     {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n"  // load 1 pixel ARGB1. |  2515     "ld4        {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n"  // load 1 pixel ARGB1. | 
|  2757     "subs       %3, %3, #1                     \n"  // 1 processed per loop. |  2516     "subs       %3, %3, #1                     \n"  // 1 processed per loop. | 
|  2758     "vmull.u8   q10, d4, d3                    \n"  // db * a |  2517     "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a | 
|  2759     "vmull.u8   q11, d5, d3                    \n"  // dg * a |  2518     "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a | 
|  2760     "vmull.u8   q12, d6, d3                    \n"  // dr * a |  2519     "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a | 
|  2761     "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8 |  2520     "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8 | 
|  2762     "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8 |  2521     "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8 | 
|  2763     "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8 |  2522     "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8 | 
|  2764     "vqsub.u8   q2, q2, q10                    \n"  // dbg - dbg * a / 256 |  2523     "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256) | 
|  2765     "vqsub.u8   d6, d6, d22                    \n"  // dr - dr * a / 256 |  2524     "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256) | 
|  2766     "vqadd.u8   q0, q0, q2                     \n"  // + sbg |  2525     "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256) | 
|  2767     "vqadd.u8   d2, d2, d6                     \n"  // + sr |  2526     "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb | 
|  2768     "vmov.u8    d3, #255                       \n"  // a = 255 |  2527     "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg | 
 |  2528     "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr | 
 |  2529     "movi       v3.8b, #255                    \n"  // a = 255 | 
|  2769     MEMACCESS(2) |  2530     MEMACCESS(2) | 
|  2770     "vst4.8     {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n"  // store 1 pixel. |  2531     "st4        {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n"  // store 1 pixel. | 
|  2771     "bge        1b                             \n" |  2532     "b.ge       1b                             \n" | 
|  2772  |  2533  | 
|  2773   "99:                                         \n" |  2534   "99:                                         \n" | 
|  2774  |  2535  | 
|  2775   : "+r"(src_argb0),    // %0 |  2536   : "+r"(src_argb0),    // %0 | 
|  2776     "+r"(src_argb1),    // %1 |  2537     "+r"(src_argb1),    // %1 | 
|  2777     "+r"(dst_argb),     // %2 |  2538     "+r"(dst_argb),     // %2 | 
|  2778     "+r"(width)         // %3 |  2539     "+r"(width)         // %3 | 
|  2779   : |  2540   : | 
|  2780   : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12" |  2541   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 
 |  2542     "v16", "v17", "v18" | 
|  2781   ); |  2543   ); | 
|  2782 } |  2544 } | 
|  2783 #endif  // HAS_ARGBBLENDROW_NEON |  2545 #endif  // HAS_ARGBBLENDROW_NEON | 
|  2784  |  2546  | 
|  2785 // Attenuate 8 pixels at a time. |  2547 // Attenuate 8 pixels at a time. | 
|  2786 #ifdef HAS_ARGBATTENUATEROW_NEON |  2548 #ifdef HAS_ARGBATTENUATEROW_NEON | 
|  2787 void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { |  2549 void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { | 
|  2788   asm volatile ( |  2550   asm volatile ( | 
|  2789     // Attenuate 8 pixels. |  2551     // Attenuate 8 pixels. | 
|  2790   "1:                                          \n" |  2552   "1:                                          \n" | 
|  2791     MEMACCESS(0) |  2553     MEMACCESS(0) | 
|  2792     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB. |  2554     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels | 
|  2793     "subs       %2, %2, #8                     \n"  // 8 processed per loop. |  2555     "subs       %2, %2, #8                     \n"  // 8 processed per loop. | 
|  2794     "vmull.u8   q10, d0, d3                    \n"  // b * a |  2556     "umull      v4.8h, v0.8b, v3.8b            \n"  // b * a | 
|  2795     "vmull.u8   q11, d1, d3                    \n"  // g * a |  2557     "umull      v5.8h, v1.8b, v3.8b            \n"  // g * a | 
|  2796     "vmull.u8   q12, d2, d3                    \n"  // r * a |  2558     "umull      v6.8h, v2.8b, v3.8b            \n"  // r * a | 
|  2797     "vqrshrn.u16 d0, q10, #8                   \n"  // b >>= 8 |  2559     "uqrshrn    v0.8b, v4.8h, #8               \n"  // b >>= 8 | 
|  2798     "vqrshrn.u16 d1, q11, #8                   \n"  // g >>= 8 |  2560     "uqrshrn    v1.8b, v5.8h, #8               \n"  // g >>= 8 | 
|  2799     "vqrshrn.u16 d2, q12, #8                   \n"  // r >>= 8 |  2561     "uqrshrn    v2.8b, v6.8h, #8               \n"  // r >>= 8 | 
|  2800     MEMACCESS(1) |  2562     MEMACCESS(1) | 
|  2801     "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB. |  2563     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels | 
|  2802     "bgt        1b                             \n" |  2564     "b.gt       1b                             \n" | 
|  2803   : "+r"(src_argb),   // %0 |  2565   : "+r"(src_argb),   // %0 | 
|  2804     "+r"(dst_argb),   // %1 |  2566     "+r"(dst_argb),   // %1 | 
|  2805     "+r"(width)       // %2 |  2567     "+r"(width)       // %2 | 
|  2806   : |  2568   : | 
|  2807   : "cc", "memory", "q0", "q1", "q10", "q11", "q12" |  2569   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" | 
|  2808   ); |  2570   ); | 
|  2809 } |  2571 } | 
|  2810 #endif  // HAS_ARGBATTENUATEROW_NEON |  2572 #endif  // HAS_ARGBATTENUATEROW_NEON | 
|  2811  |  2573  | 
|  2812 // Quantize 8 ARGB pixels (32 bytes). |  2574 // Quantize 8 ARGB pixels (32 bytes). | 
|  2813 // dst = (dst * scale >> 16) * interval_size + interval_offset; |  2575 // dst = (dst * scale >> 16) * interval_size + interval_offset; | 
|  2814 #ifdef HAS_ARGBQUANTIZEROW_NEON |  2576 #ifdef HAS_ARGBQUANTIZEROW_NEON | 
|  2815 void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, |  2577 void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, | 
|  2816                           int interval_offset, int width) { |  2578                           int interval_offset, int width) { | 
|  2817   asm volatile ( |  2579   asm volatile ( | 
|  2818     "vdup.u16   q8, %2                         \n" |  2580     "dup        v4.8h, %w2                     \n" | 
|  2819     "vshr.u16   q8, q8, #1                     \n"  // scale >>= 1 |  2581     "ushr       v4.8h, v4.8h, #1               \n"  // scale >>= 1 | 
|  2820     "vdup.u16   q9, %3                         \n"  // interval multiply. |  2582     "dup        v5.8h, %w3                     \n"  // interval multiply. | 
|  2821     "vdup.u16   q10, %4                        \n"  // interval add |  2583     "dup        v6.8h, %w4                     \n"  // interval add | 
|  2822  |  2584  | 
|  2823     // 8 pixel loop. |  2585     // 8 pixel loop. | 
|  2824     ".p2align   2                              \n" |  | 
|  2825   "1:                                          \n" |  2586   "1:                                          \n" | 
|  2826     MEMACCESS(0) |  2587     MEMACCESS(0) | 
|  2827     "vld4.8     {d0, d2, d4, d6}, [%0]         \n"  // load 8 pixels of ARGB. |  2588     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0]  \n"  // load 8 pixels of ARGB. | 
|  2828     "subs       %1, %1, #8                     \n"  // 8 processed per loop. |  2589     "subs       %1, %1, #8                     \n"  // 8 processed per loop. | 
|  2829     "vmovl.u8   q0, d0                         \n"  // b (0 .. 255) |  2590     "uxtl       v0.8h, v0.8b                   \n"  // b (0 .. 255) | 
|  2830     "vmovl.u8   q1, d2                         \n" |  2591     "uxtl       v1.8h, v1.8b                   \n" | 
|  2831     "vmovl.u8   q2, d4                         \n" |  2592     "uxtl       v2.8h, v2.8b                   \n" | 
|  2832     "vqdmulh.s16 q0, q0, q8                    \n"  // b * scale |  2593     "sqdmulh    v0.8h, v0.8h, v4.8h            \n"  // b * scale | 
|  2833     "vqdmulh.s16 q1, q1, q8                    \n"  // g |  2594     "sqdmulh    v1.8h, v1.8h, v4.8h            \n"  // g | 
|  2834     "vqdmulh.s16 q2, q2, q8                    \n"  // r |  2595     "sqdmulh    v2.8h, v2.8h, v4.8h            \n"  // r | 
|  2835     "vmul.u16   q0, q0, q9                     \n"  // b * interval_size |  2596     "mul        v0.8h, v0.8h, v5.8h            \n"  // b * interval_size | 
|  2836     "vmul.u16   q1, q1, q9                     \n"  // g |  2597     "mul        v1.8h, v1.8h, v5.8h            \n"  // g | 
|  2837     "vmul.u16   q2, q2, q9                     \n"  // r |  2598     "mul        v2.8h, v2.8h, v5.8h            \n"  // r | 
|  2838     "vadd.u16   q0, q0, q10                    \n"  // b + interval_offset |  2599     "add        v0.8h, v0.8h, v6.8h            \n"  // b + interval_offset | 
|  2839     "vadd.u16   q1, q1, q10                    \n"  // g |  2600     "add        v1.8h, v1.8h, v6.8h            \n"  // g | 
|  2840     "vadd.u16   q2, q2, q10                    \n"  // r |  2601     "add        v2.8h, v2.8h, v6.8h            \n"  // r | 
|  2841     "vqmovn.u16 d0, q0                         \n" |  2602     "uqxtn      v0.8b, v0.8h                   \n" | 
|  2842     "vqmovn.u16 d2, q1                         \n" |  2603     "uqxtn      v1.8b, v1.8h                   \n" | 
|  2843     "vqmovn.u16 d4, q2                         \n" |  2604     "uqxtn      v2.8b, v2.8h                   \n" | 
|  2844     MEMACCESS(0) |  2605     MEMACCESS(0) | 
|  2845     "vst4.8     {d0, d2, d4, d6}, [%0]!        \n"  // store 8 pixels of ARGB. |  2606     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 ARGB pixels | 
|  2846     "bgt        1b                             \n" |  2607     "b.gt       1b                             \n" | 
|  2847   : "+r"(dst_argb),       // %0 |  2608   : "+r"(dst_argb),       // %0 | 
|  2848     "+r"(width)           // %1 |  2609     "+r"(width)           // %1 | 
|  2849   : "r"(scale),           // %2 |  2610   : "r"(scale),           // %2 | 
|  2850     "r"(interval_size),   // %3 |  2611     "r"(interval_size),   // %3 | 
|  2851     "r"(interval_offset)  // %4 |  2612     "r"(interval_offset)  // %4 | 
|  2852   : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10" |  2613   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" | 
|  2853   ); |  2614   ); | 
|  2854 } |  2615 } | 
|  2855 #endif  // HAS_ARGBQUANTIZEROW_NEON |  2616 #endif  // HAS_ARGBQUANTIZEROW_NEON | 
|  2856  |  2617  | 
|  2857 // Shade 8 pixels at a time by specified value. |  2618 // Shade 8 pixels at a time by specified value. | 
|  2858 // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8. |  2619 // NOTE sqrdmulh v4.8h, v4.8h, v0.h[0] must use a scalar register from v0 to v15. | 
|  2859 // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. |  2620 // Rounding in sqrdmulh does +1 to high if high bit of low s16 is set. | 
|  2860 #ifdef HAS_ARGBSHADEROW_NEON |  2621 #ifdef HAS_ARGBSHADEROW_NEON | 
|  2861 void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, |  2622 void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, | 
|  2862                        uint32 value) { |  2623                        uint32 value) { | 
|  2863   asm volatile ( |  2624   asm volatile ( | 
|  2864     "vdup.u32   q0, %3                         \n"  // duplicate scale value. |  2625     "dup        v0.4s, %w3                     \n"  // duplicate scale value. | 
|  2865     "vzip.u8    d0, d1                         \n"  // d0 aarrggbb. |  2626     "zip1       v0.8b, v0.8b, v0.8b            \n"  // v0.8b aarrggbb. | 
|  2866     "vshr.u16   q0, q0, #1                     \n"  // scale / 2. |  2627     "ushr       v0.8h, v0.8h, #1               \n"  // scale / 2. | 
|  2867  |  2628  | 
|  2868     // 8 pixel loop. |  2629     // 8 pixel loop. | 
|  2869     ".p2align   2                              \n" |  | 
|  2870   "1:                                          \n" |  2630   "1:                                          \n" | 
|  2871     MEMACCESS(0) |  2631     MEMACCESS(0) | 
|  2872     "vld4.8     {d20, d22, d24, d26}, [%0]!    \n"  // load 8 pixels of ARGB. |  2632     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n"  // load 8 ARGB pixels. | 
|  2873     "subs       %2, %2, #8                     \n"  // 8 processed per loop. |  2633     "subs       %2, %2, #8                     \n"  // 8 processed per loop. | 
|  2874     "vmovl.u8   q10, d20                       \n"  // b (0 .. 255) |  2634     "uxtl       v4.8h, v4.8b                   \n"  // b (0 .. 255) | 
|  2875     "vmovl.u8   q11, d22                       \n" |  2635     "uxtl       v5.8h, v5.8b                   \n" | 
|  2876     "vmovl.u8   q12, d24                       \n" |  2636     "uxtl       v6.8h, v6.8b                   \n" | 
|  2877     "vmovl.u8   q13, d26                       \n" |  2637     "uxtl       v7.8h, v7.8b                   \n" | 
|  2878     "vqrdmulh.s16 q10, q10, d0[0]              \n"  // b * scale * 2 |  2638     "sqrdmulh   v4.8h, v4.8h, v0.h[0]          \n"  // b * scale * 2 | 
|  2879     "vqrdmulh.s16 q11, q11, d0[1]              \n"  // g |  2639     "sqrdmulh   v5.8h, v5.8h, v0.h[1]          \n"  // g | 
|  2880     "vqrdmulh.s16 q12, q12, d0[2]              \n"  // r |  2640     "sqrdmulh   v6.8h, v6.8h, v0.h[2]          \n"  // r | 
|  2881     "vqrdmulh.s16 q13, q13, d0[3]              \n"  // a |  2641     "sqrdmulh   v7.8h, v7.8h, v0.h[3]          \n"  // a | 
|  2882     "vqmovn.u16 d20, q10                       \n" |  2642     "uqxtn      v4.8b, v4.8h                   \n" | 
|  2883     "vqmovn.u16 d22, q11                       \n" |  2643     "uqxtn      v5.8b, v5.8h                   \n" | 
|  2884     "vqmovn.u16 d24, q12                       \n" |  2644     "uqxtn      v6.8b, v6.8h                   \n" | 
|  2885     "vqmovn.u16 d26, q13                       \n" |  2645     "uqxtn      v7.8b, v7.8h                   \n" | 
|  2886     MEMACCESS(1) |  2646     MEMACCESS(1) | 
|  2887     "vst4.8     {d20, d22, d24, d26}, [%1]!    \n"  // store 8 pixels of ARGB. |  2647     "st4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // store 8 ARGB pixels | 
|  2888     "bgt        1b                             \n" |  2648     "b.gt       1b                             \n" | 
|  2889   : "+r"(src_argb),       // %0 |  2649   : "+r"(src_argb),       // %0 | 
|  2890     "+r"(dst_argb),       // %1 |  2650     "+r"(dst_argb),       // %1 | 
|  2891     "+r"(width)           // %2 |  2651     "+r"(width)           // %2 | 
|  2892   : "r"(value)            // %3 |  2652   : "r"(value)            // %3 | 
|  2893   : "cc", "memory", "q0", "q10", "q11", "q12", "q13" |  2653   : "cc", "memory", "v0", "v4", "v5", "v6", "v7" | 
|  2894   ); |  2654   ); | 
|  2895 } |  2655 } | 
|  2896 #endif  // HAS_ARGBSHADEROW_NEON |  2656 #endif  // HAS_ARGBSHADEROW_NEON | 
|  2897  |  2657  | 
|  2898 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels |  2658 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels | 
|  2899 // Similar to ARGBToYJ but stores ARGB. |  2659 // Similar to ARGBToYJ but stores ARGB. | 
|  2900 // C code is (15 * b + 75 * g + 38 * r + 64) >> 7; |  2660 // C code is (15 * b + 75 * g + 38 * r + 64) >> 7; | 
|  2901 #ifdef HAS_ARGBGRAYROW_NEON |  2661 #ifdef HAS_ARGBGRAYROW_NEON | 
|  2902 void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { |  2662 void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { | 
|  2903   asm volatile ( |  2663   asm volatile ( | 
|  2904     "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient |  2664     "movi       v24.8b, #15                    \n"  // B * 0.11400 coefficient | 
|  2905     "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient |  2665     "movi       v25.8b, #75                    \n"  // G * 0.58700 coefficient | 
|  2906     "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient |  2666     "movi       v26.8b, #38                    \n"  // R * 0.29900 coefficient | 
|  2907     ".p2align   2                              \n" |  | 
|  2908   "1:                                          \n" |  2667   "1:                                          \n" | 
|  2909     MEMACCESS(0) |  2668     MEMACCESS(0) | 
|  2910     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels. |  2669     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels. | 
|  2911     "subs       %2, %2, #8                     \n"  // 8 processed per loop. |  2670     "subs       %2, %2, #8                     \n"  // 8 processed per loop. | 
|  2912     "vmull.u8   q2, d0, d24                    \n"  // B |  2671     "umull      v4.8h, v0.8b, v24.8b           \n"  // B | 
|  2913     "vmlal.u8   q2, d1, d25                    \n"  // G |  2672     "umlal      v4.8h, v1.8b, v25.8b           \n"  // G | 
|  2914     "vmlal.u8   q2, d2, d26                    \n"  // R |  2673     "umlal      v4.8h, v2.8b, v26.8b           \n"  // R | 
|  2915     "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit B |  2674     "sqrshrun   v0.8b, v4.8h, #7               \n"  // 15 bit to 8 bit B | 
|  2916     "vmov       d1, d0                         \n"  // G |  2675     "orr        v1.8b, v0.8b, v0.8b            \n"  // G | 
|  2917     "vmov       d2, d0                         \n"  // R |  2676     "orr        v2.8b, v0.8b, v0.8b            \n"  // R | 
|  2918     MEMACCESS(1) |  2677     MEMACCESS(1) | 
|  2919     "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 ARGB pixels. |  2678     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 pixels. | 
|  2920     "bgt        1b                             \n" |  2679     "b.gt       1b                             \n" | 
|  2921   : "+r"(src_argb),  // %0 |  2680   : "+r"(src_argb),  // %0 | 
|  2922     "+r"(dst_argb),  // %1 |  2681     "+r"(dst_argb),  // %1 | 
|  2923     "+r"(width)      // %2 |  2682     "+r"(width)      // %2 | 
|  2924   : |  2683   : | 
|  2925   : "cc", "memory", "q0", "q1", "q2", "q12", "q13" |  2684   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26" | 
|  2926   ); |  2685   ); | 
|  2927 } |  2686 } | 
|  2928 #endif  // HAS_ARGBGRAYROW_NEON |  2687 #endif  // HAS_ARGBGRAYROW_NEON | 
|  2929  |  2688  | 
|  2930 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. |  2689 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. | 
|  2931 //    b = (r * 35 + g * 68 + b * 17) >> 7 |  2690 //    b = (r * 35 + g * 68 + b * 17) >> 7 | 
|  2932 //    g = (r * 45 + g * 88 + b * 22) >> 7 |  2691 //    g = (r * 45 + g * 88 + b * 22) >> 7 | 
|  2933 //    r = (r * 50 + g * 98 + b * 24) >> 7 |  2692 //    r = (r * 50 + g * 98 + b * 24) >> 7 | 
|  2934  |  2693  | 
|  2935 #ifdef HAS_ARGBSEPIAROW_NEON |  2694 #ifdef HAS_ARGBSEPIAROW_NEON | 
|  2936 void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { |  2695 void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { | 
|  2937   asm volatile ( |  2696   asm volatile ( | 
|  2938     "vmov.u8    d20, #17                       \n"  // BB coefficient |  2697     "movi       v20.8b, #17                    \n"  // BB coefficient | 
|  2939     "vmov.u8    d21, #68                       \n"  // BG coefficient |  2698     "movi       v21.8b, #68                    \n"  // BG coefficient | 
|  2940     "vmov.u8    d22, #35                       \n"  // BR coefficient |  2699     "movi       v22.8b, #35                    \n"  // BR coefficient | 
|  2941     "vmov.u8    d24, #22                       \n"  // GB coefficient |  2700     "movi       v24.8b, #22                    \n"  // GB coefficient | 
|  2942     "vmov.u8    d25, #88                       \n"  // GG coefficient |  2701     "movi       v25.8b, #88                    \n"  // GG coefficient | 
|  2943     "vmov.u8    d26, #45                       \n"  // GR coefficient |  2702     "movi       v26.8b, #45                    \n"  // GR coefficient | 
|  2944     "vmov.u8    d28, #24                       \n"  // BB coefficient |  2703     "movi       v28.8b, #24                    \n"  // RB coefficient | 
|  2945     "vmov.u8    d29, #98                       \n"  // BG coefficient |  2704     "movi       v29.8b, #98                    \n"  // RG coefficient | 
|  2946     "vmov.u8    d30, #50                       \n"  // BR coefficient |  2705     "movi       v30.8b, #50                    \n"  // RR coefficient | 
|  2947     ".p2align   2                              \n" |  | 
|  2948   "1:                                          \n" |  2706   "1:                                          \n" | 
|  2949     MEMACCESS(0) |  2707     MEMACCESS(0) | 
|  2950     "vld4.8     {d0, d1, d2, d3}, [%0]         \n"  // load 8 ARGB pixels. |  2708     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n"  // load 8 ARGB pixels. | 
|  2951     "subs       %1, %1, #8                     \n"  // 8 processed per loop. |  2709     "subs       %1, %1, #8                     \n"  // 8 processed per loop. | 
|  2952     "vmull.u8   q2, d0, d20                    \n"  // B to Sepia B |  2710     "umull      v4.8h, v0.8b, v20.8b           \n"  // B to Sepia B | 
|  2953     "vmlal.u8   q2, d1, d21                    \n"  // G |  2711     "umlal      v4.8h, v1.8b, v21.8b           \n"  // G | 
|  2954     "vmlal.u8   q2, d2, d22                    \n"  // R |  2712     "umlal      v4.8h, v2.8b, v22.8b           \n"  // R | 
|  2955     "vmull.u8   q3, d0, d24                    \n"  // B to Sepia G |  2713     "umull      v5.8h, v0.8b, v24.8b           \n"  // B to Sepia G | 
|  2956     "vmlal.u8   q3, d1, d25                    \n"  // G |  2714     "umlal      v5.8h, v1.8b, v25.8b           \n"  // G | 
|  2957     "vmlal.u8   q3, d2, d26                    \n"  // R |  2715     "umlal      v5.8h, v2.8b, v26.8b           \n"  // R | 
|  2958     "vmull.u8   q8, d0, d28                    \n"  // B to Sepia R |  2716     "umull      v6.8h, v0.8b, v28.8b           \n"  // B to Sepia R | 
|  2959     "vmlal.u8   q8, d1, d29                    \n"  // G |  2717     "umlal      v6.8h, v1.8b, v29.8b           \n"  // G | 
|  2960     "vmlal.u8   q8, d2, d30                    \n"  // R |  2718     "umlal      v6.8h, v2.8b, v30.8b           \n"  // R | 
|  2961     "vqshrn.u16 d0, q2, #7                     \n"  // 16 bit to 8 bit B |  2719     "uqshrn     v0.8b, v4.8h, #7               \n"  // 16 bit to 8 bit B | 
|  2962     "vqshrn.u16 d1, q3, #7                     \n"  // 16 bit to 8 bit G |  2720     "uqshrn     v1.8b, v5.8h, #7               \n"  // 16 bit to 8 bit G | 
|  2963     "vqshrn.u16 d2, q8, #7                     \n"  // 16 bit to 8 bit R |  2721     "uqshrn     v2.8b, v6.8h, #7               \n"  // 16 bit to 8 bit R | 
|  2964     MEMACCESS(0) |  2722     MEMACCESS(0) | 
|  2965     "vst4.8     {d0, d1, d2, d3}, [%0]!        \n"  // store 8 ARGB pixels. |  2723     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 pixels. | 
|  2966     "bgt        1b                             \n" |  2724     "b.gt       1b                             \n" | 
|  2967   : "+r"(dst_argb),  // %0 |  2725   : "+r"(dst_argb),  // %0 | 
|  2968     "+r"(width)      // %1 |  2726     "+r"(width)      // %1 | 
|  2969   : |  2727   : | 
|  2970   : "cc", "memory", "q0", "q1", "q2", "q3", |  2728   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 
|  2971     "q10", "q11", "q12", "q13", "q14", "q15" |  2729     "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30" | 
|  2972   ); |  2730   ); | 
|  2973 } |  2731 } | 
|  2974 #endif  // HAS_ARGBSEPIAROW_NEON |  2732 #endif  // HAS_ARGBSEPIAROW_NEON | 
|  2975  |  2733  | 
|  2976 // Tranform 8 ARGB pixels (32 bytes) with color matrix. |  2734 // Transform 8 ARGB pixels (32 bytes) with color matrix. | 
|  2977 // TODO(fbarchard): Was same as Sepia except matrix is provided.  This function |  2735 // TODO(fbarchard): Was same as Sepia except matrix is provided.  This function | 
|  2978 // needs to saturate.  Consider doing a non-saturating version. |  2736 // needs to saturate.  Consider doing a non-saturating version. | 
|  2979 #ifdef HAS_ARGBCOLORMATRIXROW_NEON |  2737 #ifdef HAS_ARGBCOLORMATRIXROW_NEON | 
|  2980 void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, |  2738 void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, | 
|  2981                              const int8* matrix_argb, int width) { |  2739                              const int8* matrix_argb, int width) { | 
|  2982   asm volatile ( |  2740   asm volatile ( | 
|  2983     MEMACCESS(3) |  2741     MEMACCESS(3) | 
|  2984     "vld1.8     {q2}, [%3]                     \n"  // load 3 ARGB vectors. |  2742     "ld1        {v2.16b}, [%3]                 \n"  // load 4 ARGB coefficient vectors (16 bytes). | 
|  2985     "vmovl.s8   q0, d4                         \n"  // B,G coefficients s16. |  2743     "sxtl       v0.8h, v2.8b                   \n"  // B,G coefficients s16. | 
|  2986     "vmovl.s8   q1, d5                         \n"  // R,A coefficients s16. |  2744     "sxtl2      v1.8h, v2.16b                  \n"  // R,A coefficients s16. | 
|  2987  |  2745  | 
|  2988     ".p2align   2                              \n" |  | 
|  2989   "1:                                          \n" |  2746   "1:                                          \n" | 
|  2990     MEMACCESS(0) |  2747     MEMACCESS(0) | 
|  2991     "vld4.8     {d16, d18, d20, d22}, [%0]!    \n"  // load 8 ARGB pixels. |  2748     "ld4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n"  // load 8 pixels. | 
|  2992     "subs       %2, %2, #8                     \n"  // 8 processed per loop. |  2749     "subs       %2, %2, #8                     \n"  // 8 processed per loop. | 
|  2993     "vmovl.u8   q8, d16                        \n"  // b (0 .. 255) 16 bit |  2750     "uxtl       v16.8h, v16.8b                 \n"  // b (0 .. 255) 16 bit | 
|  2994     "vmovl.u8   q9, d18                        \n"  // g |  2751     "uxtl       v17.8h, v17.8b                 \n"  // g | 
|  2995     "vmovl.u8   q10, d20                       \n"  // r |  2752     "uxtl       v18.8h, v18.8b                 \n"  // r | 
|  2996     "vmovl.u8   q15, d22                       \n"  // a |  2753     "uxtl       v19.8h, v19.8b                 \n"  // a | 
|  2997     "vmul.s16   q12, q8, d0[0]                 \n"  // B = B * Matrix B |  2754     "mul        v22.8h, v16.8h, v0.h[0]        \n"  // B = B * Matrix B | 
|  2998     "vmul.s16   q13, q8, d1[0]                 \n"  // G = B * Matrix G |  2755     "mul        v23.8h, v16.8h, v0.h[4]        \n"  // G = B * Matrix G | 
|  2999     "vmul.s16   q14, q8, d2[0]                 \n"  // R = B * Matrix R |  2756     "mul        v24.8h, v16.8h, v1.h[0]        \n"  // R = B * Matrix R | 
|  3000     "vmul.s16   q15, q8, d3[0]                 \n"  // A = B * Matrix A |  2757     "mul        v25.8h, v16.8h, v1.h[4]        \n"  // A = B * Matrix A | 
|  3001     "vmul.s16   q4, q9, d0[1]                  \n"  // B += G * Matrix B |  2758     "mul        v4.8h, v17.8h, v0.h[1]         \n"  // B += G * Matrix B | 
|  3002     "vmul.s16   q5, q9, d1[1]                  \n"  // G += G * Matrix G |  2759     "mul        v5.8h, v17.8h, v0.h[5]         \n"  // G += G * Matrix G | 
|  3003     "vmul.s16   q6, q9, d2[1]                  \n"  // R += G * Matrix R |  2760     "mul        v6.8h, v17.8h, v1.h[1]         \n"  // R += G * Matrix R | 
|  3004     "vmul.s16   q7, q9, d3[1]                  \n"  // A += G * Matrix A |  2761     "mul        v7.8h, v17.8h, v1.h[5]         \n"  // A += G * Matrix A | 
|  3005     "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B |  2762     "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B | 
|  3006     "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G |  2763     "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G | 
|  3007     "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R |  2764     "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R | 
|  3008     "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A |  2765     "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A | 
|  3009     "vmul.s16   q4, q10, d0[2]                 \n"  // B += R * Matrix B |  2766     "mul        v4.8h, v18.8h, v0.h[2]         \n"  // B += R * Matrix B | 
|  3010     "vmul.s16   q5, q10, d1[2]                 \n"  // G += R * Matrix G |  2767     "mul        v5.8h, v18.8h, v0.h[6]         \n"  // G += R * Matrix G | 
|  3011     "vmul.s16   q6, q10, d2[2]                 \n"  // R += R * Matrix R |  2768     "mul        v6.8h, v18.8h, v1.h[2]         \n"  // R += R * Matrix R | 
|  3012     "vmul.s16   q7, q10, d3[2]                 \n"  // A += R * Matrix A |  2769     "mul        v7.8h, v18.8h, v1.h[6]         \n"  // A += R * Matrix A | 
|  3013     "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B |  2770     "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B | 
|  3014     "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G |  2771     "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G | 
|  3015     "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R |  2772     "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R | 
|  3016     "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A |  2773     "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A | 
|  3017     "vmul.s16   q4, q15, d0[3]                 \n"  // B += A * Matrix B |  2774     "mul        v4.8h, v19.8h, v0.h[3]         \n"  // B += A * Matrix B | 
|  3018     "vmul.s16   q5, q15, d1[3]                 \n"  // G += A * Matrix G |  2775     "mul        v5.8h, v19.8h, v0.h[7]         \n"  // G += A * Matrix G | 
|  3019     "vmul.s16   q6, q15, d2[3]                 \n"  // R += A * Matrix R |  2776     "mul        v6.8h, v19.8h, v1.h[3]         \n"  // R += A * Matrix R | 
|  3020     "vmul.s16   q7, q15, d3[3]                 \n"  // A += A * Matrix A |  2777     "mul        v7.8h, v19.8h, v1.h[7]         \n"  // A += A * Matrix A | 
|  3021     "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B |  2778     "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B | 
|  3022     "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G |  2779     "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G | 
|  3023     "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R |  2780     "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R | 
|  3024     "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A |  2781     "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A | 
|  3025     "vqshrun.s16 d16, q12, #6                  \n"  // 16 bit to 8 bit B |  2782     "sqshrun    v16.8b, v22.8h, #6             \n"  // 16 bit to 8 bit B | 
|  3026     "vqshrun.s16 d18, q13, #6                  \n"  // 16 bit to 8 bit G |  2783     "sqshrun    v17.8b, v23.8h, #6             \n"  // 16 bit to 8 bit G | 
|  3027     "vqshrun.s16 d20, q14, #6                  \n"  // 16 bit to 8 bit R |  2784     "sqshrun    v18.8b, v24.8h, #6             \n"  // 16 bit to 8 bit R | 
|  3028     "vqshrun.s16 d22, q15, #6                  \n"  // 16 bit to 8 bit A |  2785     "sqshrun    v19.8b, v25.8h, #6             \n"  // 16 bit to 8 bit A | 
|  3029     MEMACCESS(1) |  2786     MEMACCESS(1) | 
|  3030     "vst4.8     {d16, d18, d20, d22}, [%1]!    \n"  // store 8 ARGB pixels. |  2787     "st4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n"  // store 8 pixels. | 
|  3031     "bgt        1b                             \n" |  2788     "b.gt       1b                             \n" | 
|  3032   : "+r"(src_argb),   // %0 |  2789   : "+r"(src_argb),   // %0 | 
|  3033     "+r"(dst_argb),   // %1 |  2790     "+r"(dst_argb),   // %1 | 
|  3034     "+r"(width)       // %2 |  2791     "+r"(width)       // %2 | 
|  3035   : "r"(matrix_argb)  // %3 |  2792   : "r"(matrix_argb)  // %3 | 
|  3036   : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", |  2793   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", | 
|  3037     "q10", "q11", "q12", "q13", "q14", "q15" |  2794     "v18", "v19", "v22", "v23", "v24", "v25" | 
|  3038   ); |  2795   ); | 
|  3039 } |  2796 } | 
|  3040 #endif  // HAS_ARGBCOLORMATRIXROW_NEON |  2797 #endif  // HAS_ARGBCOLORMATRIXROW_NEON | 
|  3041  |  2798  | 
|  3042 // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. |  2799 // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. | 
|  3043 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. |  2800 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. | 
|  3044 #ifdef HAS_ARGBMULTIPLYROW_NEON |  2801 #ifdef HAS_ARGBMULTIPLYROW_NEON | 
|  3045 void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, |  2802 void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, | 
|  3046                           uint8* dst_argb, int width) { |  2803                           uint8* dst_argb, int width) { | 
|  3047   asm volatile ( |  2804   asm volatile ( | 
|  3048     // 8 pixel loop. |  2805     // 8 pixel loop. | 
|  3049     ".p2align   2                              \n" |  | 
|  3050   "1:                                          \n" |  2806   "1:                                          \n" | 
|  3051     MEMACCESS(0) |  2807     MEMACCESS(0) | 
|  3052     "ld4        {v0.8b-v3.8b}, [%0], #32       \n"  // load 8 ARGB pixels. |  2808     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels. | 
|  3053     MEMACCESS(1) |  2809     MEMACCESS(1) | 
|  3054     "ld4        {v4.8b-v7.8b}, [%1], #32       \n"  // load 8 more ARGB pixels. |  2810     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels. | 
|  3055     "subs       %3, %3, #8                     \n"  // 8 processed per loop. |  2811     "subs       %3, %3, #8                     \n"  // 8 processed per loop. | 
|  3056     "umull      v0.8h, v0.8b, v4.8b            \n"  // multiply B |  2812     "umull      v0.8h, v0.8b, v4.8b            \n"  // multiply B | 
|  3057     "umull      v1.8h, v1.8b, v5.8b            \n"  // multiply G |  2813     "umull      v1.8h, v1.8b, v5.8b            \n"  // multiply G | 
|  3058     "umull      v2.8h, v2.8b, v6.8b            \n"  // multiply R |  2814     "umull      v2.8h, v2.8b, v6.8b            \n"  // multiply R | 
|  3059     "umull      v3.8h, v3.8b, v7.8b            \n"  // multiply A |  2815     "umull      v3.8h, v3.8b, v7.8b            \n"  // multiply A | 
|  3060     "rshrn      v0.8b, v0.8h, #8               \n"  // 16 bit to 8 bit B |  2816     "rshrn      v0.8b, v0.8h, #8               \n"  // 16 bit to 8 bit B | 
|  3061     "rshrn      v1.8b, v1.8h, #8               \n"  // 16 bit to 8 bit G |  2817     "rshrn      v1.8b, v1.8h, #8               \n"  // 16 bit to 8 bit G | 
|  3062     "rshrn      v2.8b, v2.8h, #8               \n"  // 16 bit to 8 bit R |  2818     "rshrn      v2.8b, v2.8h, #8               \n"  // 16 bit to 8 bit R | 
|  3063     "rshrn      v3.8b, v3.8h, #8               \n"  // 16 bit to 8 bit A |  2819     "rshrn      v3.8b, v3.8h, #8               \n"  // 16 bit to 8 bit A | 
|  3064     MEMACCESS(2) |  2820     MEMACCESS(2) | 
|  3065     "st4        {v0.8b-v3.8b}, [%2], #32       \n"  // store 8 ARGB pixels. |  2821     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels | 
|  3066     "bgt        1b                             \n" |  2822     "b.gt       1b                             \n" | 
|  3067  |  2823  | 
|  3068   : "+r"(src_argb0),  // %0 |  2824   : "+r"(src_argb0),  // %0 | 
|  3069     "+r"(src_argb1),  // %1 |  2825     "+r"(src_argb1),  // %1 | 
|  3070     "+r"(dst_argb),   // %2 |  2826     "+r"(dst_argb),   // %2 | 
|  3071     "+r"(width)       // %3 |  2827     "+r"(width)       // %3 | 
|  3072   : |  2828   : | 
|  3073   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" |  2829   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" | 
|  3074   ); |  2830   ); | 
|  3075 } |  2831 } | 
|  3076 #endif  // HAS_ARGBMULTIPLYROW_NEON |  2832 #endif  // HAS_ARGBMULTIPLYROW_NEON | 
|  3077  |  2833  | 
|  3078 // Add 2 rows of ARGB pixels together, 8 pixels at a time. |  2834 // Add 2 rows of ARGB pixels together, 8 pixels at a time. | 
|  3079 #ifdef HAS_ARGBADDROW_NEON |  2835 #ifdef HAS_ARGBADDROW_NEON | 
|  3080 void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, |  2836 void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, | 
|  3081                      uint8* dst_argb, int width) { |  2837                      uint8* dst_argb, int width) { | 
|  3082   asm volatile ( |  2838   asm volatile ( | 
|  3083     // 8 pixel loop. |  2839     // 8 pixel loop. | 
|  3084     ".p2align   2                              \n" |  | 
|  3085   "1:                                          \n" |  2840   "1:                                          \n" | 
|  3086     MEMACCESS(0) |  2841     MEMACCESS(0) | 
|  3087     "ld4        {v0.8b-v3.8b}, [%0], #32       \n"  // load 8 ARGB pixels. |  2842     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels. | 
|  3088     MEMACCESS(1) |  2843     MEMACCESS(1) | 
|  3089     "ld4        {v4.8b-v7.8b}, [%1], #32       \n"  // load 8 more ARGB pixels. |  2844     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels. | 
|  3090     "subs       %3, %3, #8                     \n"  // 8 processed per loop. |  2845     "subs       %3, %3, #8                     \n"  // 8 processed per loop. | 
|  3091     "uqadd      v0.8b, v0.8b, v4.8b            \n" |  2846     "uqadd      v0.8b, v0.8b, v4.8b            \n" | 
|  3092     "uqadd      v1.8b, v1.8b, v5.8b            \n" |  2847     "uqadd      v1.8b, v1.8b, v5.8b            \n" | 
|  3093     "uqadd      v2.8b, v2.8b, v6.8b            \n" |  2848     "uqadd      v2.8b, v2.8b, v6.8b            \n" | 
|  3094     "uqadd      v3.8b, v3.8b, v7.8b            \n" |  2849     "uqadd      v3.8b, v3.8b, v7.8b            \n" | 
|  3095     MEMACCESS(2) |  2850     MEMACCESS(2) | 
|  3096     "st4        {v0.8b-v3.8b}, [%2], #32       \n"  // store 8 ARGB pixels. |  2851     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels | 
|  3097     "bgt        1b                             \n" |  2852     "b.gt       1b                             \n" | 
|  3098  |  2853  | 
|  3099   : "+r"(src_argb0),  // %0 |  2854   : "+r"(src_argb0),  // %0 | 
|  3100     "+r"(src_argb1),  // %1 |  2855     "+r"(src_argb1),  // %1 | 
|  3101     "+r"(dst_argb),   // %2 |  2856     "+r"(dst_argb),   // %2 | 
|  3102     "+r"(width)       // %3 |  2857     "+r"(width)       // %3 | 
|  3103   : |  2858   : | 
|  3104   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" |  2859   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" | 
|  3105   ); |  2860   ); | 
|  3106 } |  2861 } | 
|  3107 #endif  // HAS_ARGBADDROW_NEON |  2862 #endif  // HAS_ARGBADDROW_NEON | 
|  3108  |  2863  | 
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
// dst_argb = saturate(src_argb0 - src_argb1) for every 8 bit channel.
// The loop handles 8 pixels (32 bytes) per iteration and repeats while the
// remaining count is positive, so a width that is not a multiple of 8 is
// effectively rounded up to the next multiple of 8.
#ifdef HAS_ARGBSUBTRACTROW_NEON
void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    // 8 pixel loop.
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
    MEMACCESS(1)
    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
    // width is a 32 bit int: use the W view of its register so the flags are
    // not computed from the unspecified upper 32 bits of the X register.
    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
    "uqsub      v0.8b, v0.8b, v4.8b            \n"  // unsigned saturating sub,
    "uqsub      v1.8b, v1.8b, v5.8b            \n"  // one channel plane per
    "uqsub      v2.8b, v2.8b, v6.8b            \n"  // register.
    "uqsub      v3.8b, v3.8b, v7.8b            \n"
    MEMACCESS(2)
    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
    "b.gt       1b                             \n"

  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
  );
}
#endif  // HAS_ARGBSUBTRACTROW_NEON
|  3139  |  2893  | 
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
// Sobel is the unsigned saturating sum of the two inputs. 8 pixels are
// produced per iteration and the loop repeats while the remaining count is
// positive, so width is effectively rounded up to a multiple of 8.
#ifdef HAS_SOBELROW_NEON
void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
                   uint8* dst_argb, int width) {
  asm volatile (
    "movi       v3.8b, #255                    \n"  // alpha
    // 8 pixel loop.
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v0.8b}, [%0], #8              \n"  // load 8 sobelx.
    MEMACCESS(1)
    "ld1        {v1.8b}, [%1], #8              \n"  // load 8 sobely.
    // width is a 32 bit int: use the W view of its register so the flags are
    // not computed from the unspecified upper 32 bits of the X register.
    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
    "uqadd      v0.8b, v0.8b, v1.8b            \n"  // add
    "orr        v1.8b, v0.8b, v0.8b            \n"  // copy sobel to G
    "orr        v2.8b, v0.8b, v0.8b            \n"  // copy sobel to R
    MEMACCESS(2)
    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
    "b.gt       1b                             \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3"
  );
}
#endif  // HAS_SOBELROW_NEON
|  3173  |  2926  | 
// Adds Sobel X and Sobel Y and stores Sobel into plane.
// dst_y = saturate(src_sobelx + src_sobely). 16 pixels are produced per
// iteration and the loop repeats while the remaining count is positive, so
// width is effectively rounded up to a multiple of 16.
#ifdef HAS_SOBELTOPLANEROW_NEON
void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
                          uint8* dst_y, int width) {
  asm volatile (
    // 16 pixel loop.
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v0.16b}, [%0], #16            \n"  // load 16 sobelx.
    MEMACCESS(1)
    "ld1        {v1.16b}, [%1], #16            \n"  // load 16 sobely.
    // width is a 32 bit int: use the W view of its register so the flags are
    // not computed from the unspecified upper 32 bits of the X register.
    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop.
    "uqadd      v0.16b, v0.16b, v1.16b         \n"  // add
    MEMACCESS(2)
    "st1        {v0.16b}, [%2], #16            \n"  // store 16 pixels.
    "b.gt       1b                             \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_y),       // %2
    "+r"(width)        // %3
  :
  : "cc", "memory", "v0", "v1"
  );
}
#endif  // HAS_SOBELTOPLANEROW_NEON
|  3200  |  2952  | 
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
// Sobel is the unsigned saturating sum of Sobel X and Sobel Y. 8 pixels are
// produced per iteration and the loop repeats while the remaining count is
// positive, so width is effectively rounded up to a multiple of 8.
#ifdef HAS_SOBELXYROW_NEON
void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
                     uint8* dst_argb, int width) {
  asm volatile (
    "movi       v3.8b, #255                    \n"  // alpha
    // 8 pixel loop.
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v2.8b}, [%0], #8              \n"  // load 8 sobelx.
    MEMACCESS(1)
    "ld1        {v0.8b}, [%1], #8              \n"  // load 8 sobely.
    // width is a 32 bit int: use the W view of its register so the flags are
    // not computed from the unspecified upper 32 bits of the X register.
    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
    "uqadd      v1.8b, v0.8b, v2.8b            \n"  // add
    MEMACCESS(2)
    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
    "b.gt       1b                             \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3"
  );
}
#endif  // HAS_SOBELXYROW_NEON
|  3232  |  2983  | 
// SobelX as a matrix is
// -1  0  1
// -2  0  2
// -1  0  1
// For each output pixel i this computes
//   |(y0[i] - y0[i+2]) + 2 * (y1[i] - y1[i+2]) + (y2[i] - y2[i+2])|
// saturated to 8 bits. The differences are taken left minus right; the sign
// does not matter because the absolute value is stored.
// 8 pixels are produced per iteration; each iteration reads 8 bytes at
// offset 0 and 8 bytes at offset 2 from every source row, i.e. 2 bytes past
// the 8 pixels being produced. The loop repeats while the remaining count is
// positive, so width is effectively rounded up to a multiple of 8.
#ifdef HAS_SOBELXROW_NEON
void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
                    const uint8* src_y2, uint8* dst_sobelx, int width) {
  asm volatile (
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v0.8b}, [%0],%5               \n"  // top
    MEMACCESS(0)
    "ld1        {v1.8b}, [%0],%6               \n"
    "usubl      v0.8h, v0.8b, v1.8b            \n"
    MEMACCESS(1)
    "ld1        {v2.8b}, [%1],%5               \n"  // center * 2
    MEMACCESS(1)
    "ld1        {v3.8b}, [%1],%6               \n"
    "usubl      v1.8h, v2.8b, v3.8b            \n"
    "add        v0.8h, v0.8h, v1.8h            \n"
    "add        v0.8h, v0.8h, v1.8h            \n"
    MEMACCESS(2)
    "ld1        {v2.8b}, [%2],%5               \n"  // bottom
    MEMACCESS(2)
    "ld1        {v3.8b}, [%2],%6               \n"
    // width is a 32 bit int: use the W view of its register so the flags are
    // not computed from the unspecified upper 32 bits of the X register.
    "subs       %w4, %w4, #8                   \n"  // 8 pixels
    "usubl      v1.8h, v2.8b, v3.8b            \n"
    "add        v0.8h, v0.8h, v1.8h            \n"
    "abs        v0.8h, v0.8h                   \n"
    "uqxtn      v0.8b, v0.8h                   \n"
    MEMACCESS(3)
    "st1        {v0.8b}, [%3], #8              \n"  // store 8 sobelx
    "b.gt       1b                             \n"
  : "+r"(src_y0),      // %0
    "+r"(src_y1),      // %1
    "+r"(src_y2),      // %2
    "+r"(dst_sobelx),  // %3
    "+r"(width)        // %4
  // Post-index strides are used as X registers, so pass 64 bit values.
  // 2 + 6 advances each source pointer by 8 bytes per iteration.
  : "r"(2LL),          // %5
    "r"(6LL)           // %6
  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
#endif  // HAS_SOBELXROW_NEON
|  3278  |  3028  | 
// SobelY as a matrix is
// -1 -2 -1
//  0  0  0
//  1  2  1
// For each output pixel i this computes
//   |(y0[i] - y1[i]) + 2 * (y0[i+1] - y1[i+1]) + (y0[i+2] - y1[i+2])|
// saturated to 8 bits. The differences are taken src_y0 minus src_y1; the
// sign does not matter because the absolute value is stored.
// 8 pixels are produced per iteration; each iteration reads 8 bytes at
// offsets 0, 1 and 2 from both source rows, i.e. 2 bytes past the 8 pixels
// being produced. The loop repeats while the remaining count is positive, so
// width is effectively rounded up to a multiple of 8.
#ifdef HAS_SOBELYROW_NEON
void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
                    uint8* dst_sobely, int width) {
  asm volatile (
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v0.8b}, [%0],%4               \n"  // left
    MEMACCESS(1)
    "ld1        {v1.8b}, [%1],%4               \n"
    "usubl      v0.8h, v0.8b, v1.8b            \n"
    MEMACCESS(0)
    "ld1        {v2.8b}, [%0],%4               \n"  // center * 2
    MEMACCESS(1)
    "ld1        {v3.8b}, [%1],%4               \n"
    "usubl      v1.8h, v2.8b, v3.8b            \n"
    "add        v0.8h, v0.8h, v1.8h            \n"
    "add        v0.8h, v0.8h, v1.8h            \n"
    MEMACCESS(0)
    "ld1        {v2.8b}, [%0],%5               \n"  // right
    MEMACCESS(1)
    "ld1        {v3.8b}, [%1],%5               \n"
    // width is a 32 bit int: use the W view of its register so the flags are
    // not computed from the unspecified upper 32 bits of the X register.
    "subs       %w3, %w3, #8                   \n"  // 8 pixels
    "usubl      v1.8h, v2.8b, v3.8b            \n"
    "add        v0.8h, v0.8h, v1.8h            \n"
    "abs        v0.8h, v0.8h                   \n"
    "uqxtn      v0.8b, v0.8h                   \n"
    MEMACCESS(2)
    "st1        {v0.8b}, [%2], #8              \n"  // store 8 sobely
    "b.gt       1b                             \n"
  : "+r"(src_y0),      // %0
    "+r"(src_y1),      // %1
    "+r"(dst_sobely),  // %2
    "+r"(width)        // %3
  // Post-index strides are used as X registers, so pass 64 bit values.
  // 1 + 1 + 6 advances each source pointer by 8 bytes per iteration.
  : "r"(1LL),          // %4
    "r"(6LL)           // %5
  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
#endif  // HAS_SOBELYROW_NEON
|  3323 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) |  3072 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) | 
|  3324  |  3073  | 
|  3325 #ifdef __cplusplus |  3074 #ifdef __cplusplus | 
|  3326 }  // extern "C" |  3075 }  // extern "C" | 
|  3327 }  // namespace libyuv |  3076 }  // namespace libyuv | 
|  3328 #endif |  3077 #endif | 
| OLD | NEW |