Chromium Code Reviews

Side by Side Diff: source/libvpx/third_party/libyuv/source/row_neon64.cc

Issue 996503002: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 9 months ago
OLD | NEW
1 /* 1 /*
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. 2 * Copyright 2014 The LibYuv Project Authors. All rights reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
11 #include "libyuv/row.h" 11 #include "libyuv/row.h"
12 12
13 #ifdef __cplusplus 13 #ifdef __cplusplus
14 namespace libyuv { 14 namespace libyuv {
15 extern "C" { 15 extern "C" {
16 #endif 16 #endif
17 17
18 // This module is for GCC Neon 18 // This module is for GCC Neon armv8 64 bit.
19 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) 19 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
20 20
21 // Read 8 Y, 4 U and 4 V from 422 21 // Read 8 Y, 4 U and 4 V from 422
22 #define READYUV422 \ 22 #define READYUV422 \
23 MEMACCESS(0) \ 23 MEMACCESS(0) \
24 "vld1.8 {d0}, [%0]! \n" \ 24 "ld1 {v0.8b}, [%0], #8 \n" \
25 MEMACCESS(1) \ 25 MEMACCESS(1) \
26 "vld1.32 {d2[0]}, [%1]! \n" \ 26 "ld1 {v1.s}[0], [%1], #4 \n" \
27 MEMACCESS(2) \ 27 MEMACCESS(2) \
28 "vld1.32 {d2[1]}, [%2]! \n" 28 "ld1 {v1.s}[1], [%2], #4 \n"
29 29
30 // Read 8 Y, 2 U and 2 V from 411 30 // Read 8 Y, 2 U and 2 V from 411
31 #define READYUV411 \ 31 #define READYUV411 \
32 MEMACCESS(0) \ 32 MEMACCESS(0) \
33 "vld1.8 {d0}, [%0]! \n" \ 33 "ld1 {v0.8b}, [%0], #8 \n" \
34 MEMACCESS(1) \ 34 MEMACCESS(1) \
35 "vld1.16 {d2[0]}, [%1]! \n" \ 35 "ld1 {v2.h}[0], [%1], #2 \n" \
36 MEMACCESS(2) \ 36 MEMACCESS(2) \
37 "vld1.16 {d2[1]}, [%2]! \n" \ 37 "ld1 {v2.h}[1], [%2], #2 \n" \
38 "vmov.u8 d3, d2 \n" \ 38 "zip1 v1.8b, v2.8b, v2.8b \n"
39 "vzip.u8 d2, d3 \n"
40 39
41 // Read 8 Y, 8 U and 8 V from 444 40 // Read 8 Y, 8 U and 8 V from 444
42 #define READYUV444 \ 41 #define READYUV444 \
43 MEMACCESS(0) \ 42 MEMACCESS(0) \
44 "vld1.8 {d0}, [%0]! \n" \ 43 "ld1 {v0.8b}, [%0], #8 \n" \
45 MEMACCESS(1) \ 44 MEMACCESS(1) \
46 "vld1.8 {d2}, [%1]! \n" \ 45 "ld1 {v1.d}[0], [%1], #8 \n" \
47 MEMACCESS(2) \ 46 MEMACCESS(2) \
48 "vld1.8 {d3}, [%2]! \n" \ 47 "ld1 {v1.d}[1], [%2], #8 \n" \
49 "vpaddl.u8 q1, q1 \n" \ 48 "uaddlp v1.8h, v1.16b \n" \
50 "vrshrn.u16 d2, q1, #1 \n" 49 "rshrn v1.8b, v1.8h, #1 \n"
51 50
52 // Read 8 Y, and set 4 U and 4 V to 128 51 // Read 8 Y, and set 4 U and 4 V to 128
53 #define READYUV400 \ 52 #define READYUV400 \
54 MEMACCESS(0) \ 53 MEMACCESS(0) \
55 "vld1.8 {d0}, [%0]! \n" \ 54 "ld1 {v0.8b}, [%0], #8 \n" \
56 "vmov.u8 d2, #128 \n" 55 "movi v1.8b , #128 \n"
57 56
58 // Read 8 Y and 4 UV from NV12 57 // Read 8 Y and 4 UV from NV12
59 #define READNV12 \ 58 #define READNV12 \
60 MEMACCESS(0) \ 59 MEMACCESS(0) \
61 "vld1.8 {d0}, [%0]! \n" \ 60 "ld1 {v0.8b}, [%0], #8 \n" \
62 MEMACCESS(1) \ 61 MEMACCESS(1) \
63 "vld1.8 {d2}, [%1]! \n" \ 62 "ld1 {v2.8b}, [%1], #8 \n" \
64 "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ 63 "uzp1 v1.8b, v2.8b, v2.8b \n" \
65 "vuzp.u8 d2, d3 \n" \ 64 "uzp2 v3.8b, v2.8b, v2.8b \n" \
66 "vtrn.u32 d2, d3 \n" 65 "ins v1.s[1], v3.s[0] \n"
67 66
68 // Read 8 Y and 4 VU from NV21 67 // Read 8 Y and 4 VU from NV21
69 #define READNV21 \ 68 #define READNV21 \
70 MEMACCESS(0) \ 69 MEMACCESS(0) \
71 "vld1.8 {d0}, [%0]! \n" \ 70 "ld1 {v0.8b}, [%0], #8 \n" \
72 MEMACCESS(1) \ 71 MEMACCESS(1) \
73 "vld1.8 {d2}, [%1]! \n" \ 72 "ld1 {v2.8b}, [%1], #8 \n" \
74 "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ 73 "uzp1 v3.8b, v2.8b, v2.8b \n" \
75 "vuzp.u8 d3, d2 \n" \ 74 "uzp2 v1.8b, v2.8b, v2.8b \n" \
76 "vtrn.u32 d2, d3 \n" 75 "ins v1.s[1], v3.s[0] \n"
77 76
78 // Read 8 YUY2 77 // Read 8 YUY2
79 #define READYUY2 \ 78 #define READYUY2 \
80 MEMACCESS(0) \ 79 MEMACCESS(0) \
81 "vld2.8 {d0, d2}, [%0]! \n" \ 80 "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \
82 "vmov.u8 d3, d2 \n" \ 81 "uzp2 v3.8b, v1.8b, v1.8b \n" \
83 "vuzp.u8 d2, d3 \n" \ 82 "uzp1 v1.8b, v1.8b, v1.8b \n" \
84 "vtrn.u32 d2, d3 \n" 83 "ins v1.s[1], v3.s[0] \n"
85 84
86 // Read 8 UYVY 85 // Read 8 UYVY
87 #define READUYVY \ 86 #define READUYVY \
88 MEMACCESS(0) \ 87 MEMACCESS(0) \
89 "vld2.8 {d2, d3}, [%0]! \n" \ 88 "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \
90 "vmov.u8 d0, d3 \n" \ 89 "orr v0.8b, v3.8b, v3.8b \n" \
91 "vmov.u8 d3, d2 \n" \ 90 "uzp1 v1.8b, v2.8b, v2.8b \n" \
92 "vuzp.u8 d2, d3 \n" \ 91 "uzp2 v3.8b, v2.8b, v2.8b \n" \
93 "vtrn.u32 d2, d3 \n" 92 "ins v1.s[1], v3.s[0] \n"
94 93
95 #define YUV422TORGB \ 94 #define YUV422TORGB_SETUP_REG \
96 "veor.u8 d2, d26 \n"/*subtract 128 from u and v*/\ 95 "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \
97 "vmull.s8 q8, d2, d24 \n"/* u/v B/R component */\ 96 "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \
98 "vmull.s8 q9, d2, d25 \n"/* u/v G component */\ 97 "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \
99 "vmov.u8 d1, #0 \n"/* split odd/even y apart */\ 98 "ld1r {v31.4s}, [%[kYToRgb]] \n" \
100 "vtrn.u8 d0, d1 \n" \ 99 "movi v27.8h, #128 \n" \
101 "vsub.s16 q0, q0, q15 \n"/* offset y */\ 100 "movi v28.8h, #102 \n" \
102 "vmul.s16 q0, q0, q14 \n" \ 101 "movi v29.8h, #25 \n" \
103 "vadd.s16 d18, d19 \n" \ 102 "movi v30.8h, #52 \n"
104 "vqadd.s16 d20, d0, d16 \n" /* B */ \
105 "vqadd.s16 d21, d1, d16 \n" \
106 "vqadd.s16 d22, d0, d17 \n" /* R */ \
107 "vqadd.s16 d23, d1, d17 \n" \
108 "vqadd.s16 d16, d0, d18 \n" /* G */ \
109 "vqadd.s16 d17, d1, d18 \n" \
110 "vqshrun.s16 d0, q10, #6 \n" /* B */ \
111 "vqshrun.s16 d1, q11, #6 \n" /* G */ \
112 "vqshrun.s16 d2, q8, #6 \n" /* R */ \
113 "vmovl.u8 q10, d0 \n"/* set up for reinterleave*/\
114 "vmovl.u8 q11, d1 \n" \
115 "vmovl.u8 q8, d2 \n" \
116 "vtrn.u8 d20, d21 \n" \
117 "vtrn.u8 d22, d23 \n" \
118 "vtrn.u8 d16, d17 \n" \
119 "vmov.u8 d21, d16 \n"
120 103
121 static vec8 kUVToRB = { 127, 127, 127, 127, 102, 102, 102, 102, 104 #define YUV422TORGB(vR, vG, vB) \
122 0, 0, 0, 0, 0, 0, 0, 0 }; 105 "uxtl v0.8h, v0.8b \n" /* Extract Y */ \
123 static vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52, 106 "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \
124 0, 0, 0, 0, 0, 0, 0, 0 }; 107 "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \
108 "ushll v0.4s, v0.4h, #0 \n" \
109 "mul v3.4s, v3.4s, v31.4s \n" \
110 "mul v0.4s, v0.4s, v31.4s \n" \
111 "sqshrun v0.4h, v0.4s, #16 \n" \
112 "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \
113 "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \
114 "mov v2.d[0], v1.d[1] \n" /* Extract V */ \
115 "uxtl v2.8h, v2.8b \n" \
116 "uxtl v1.8h, v1.8b \n" /* Extract U */ \
117 "mul v3.8h, v1.8h, v27.8h \n" \
118 "mul v5.8h, v1.8h, v29.8h \n" \
119 "mul v6.8h, v2.8h, v30.8h \n" \
120 "mul v7.8h, v2.8h, v28.8h \n" \
121 "sqadd v6.8h, v6.8h, v5.8h \n" \
122 "sqadd " #vB ".8h, v24.8h, v0.8h \n" /* B */ \
123 "sqadd " #vG ".8h, v25.8h, v0.8h \n" /* G */ \
124 "sqadd " #vR ".8h, v26.8h, v0.8h \n" /* R */ \
125 "sqadd " #vB ".8h, " #vB ".8h, v3.8h \n" /* B */ \
126 "sqsub " #vG ".8h, " #vG ".8h, v6.8h \n" /* G */ \
127 "sqadd " #vR ".8h, " #vR ".8h, v7.8h \n" /* R */ \
128 "sqshrun " #vB ".8b, " #vB ".8h, #6 \n" /* B */ \
129 "sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */ \
130 "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ \
131
132 // YUV to RGB conversion constants.
133 // Y contribution to R,G,B. Scale and bias.
134 #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
135 #define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
136
137 // U and V contributions to R,G,B.
138 #define UB -128 /* -min(128, round(2.018 * 64)) */
139 #define UG 25 /* -round(-0.391 * 64) */
140 #define VG 52 /* -round(-0.813 * 64) */
141 #define VR -102 /* -round(1.596 * 64) */
142
143 // Bias values to subtract 16 from Y and 128 from U and V.
144 #define BB (UB * 128 - YGB)
145 #define BG (UG * 128 + VG * 128 - YGB)
146 #define BR (VR * 128 - YGB)
147
148 static vec16 kUVBiasBGR = { BB, BG, BR, 0, 0, 0, 0, 0 };
149 static vec32 kYToRgb = { 0x0101 * YG, 0, 0, 0 };
150
151 #undef YG
152 #undef YGB
153 #undef UB
154 #undef UG
155 #undef VG
156 #undef VR
157 #undef BB
158 #undef BG
159 #undef BR
160
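Putting the constants and the YUV422TORGB macro together, a hedged scalar sketch of the per-pixel fixed-point math (the function and clamp helper are hypothetical; the numbers are the YG/UB/UG/VG/VR and BB/BG/BR values defined above):

// y1 mirrors the mul-by-kYToRgb / sqshrun #16 pair: roughly 1.164 * 64 * Y.
// Each channel then adds its bias and UV term and shifts right by 6.
static uint8 Clamp0To255(int v) {
  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void YuvPixel_C(int y, int u, int v, uint8* b, uint8* g, uint8* r) {
  int y1 = (y * 0x0101 * 18997) >> 16;                      // YG = 18997
  *b = Clamp0To255((y1 + 128 * u - 17544) >> 6);            // BB = -128*128 - 1160
  *g = Clamp0To255((y1 - (25 * u + 52 * v) + 8696) >> 6);   // BG = 25*128 + 52*128 - 1160
  *r = Clamp0To255((y1 + 102 * v - 14216) >> 6);            // BR = -102*128 - 1160
}

The NEON code saturates at every sqadd/sqsub/sqshrun step; the sketch approximates that with a single final clamp.
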
161 #define RGBTOUV_SETUP_REG \
162 "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \
163 "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \
164 "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \
165 "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \
166 "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \
167 "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */
168
125 169
126 #ifdef HAS_I444TOARGBROW_NEON 170 #ifdef HAS_I444TOARGBROW_NEON
127 void I444ToARGBRow_NEON(const uint8* src_y, 171 void I444ToARGBRow_NEON(const uint8* src_y,
128 const uint8* src_u, 172 const uint8* src_u,
129 const uint8* src_v, 173 const uint8* src_v,
130 uint8* dst_argb, 174 uint8* dst_argb,
131 int width) { 175 int width) {
132 asm volatile ( 176 asm volatile (
133 MEMACCESS(5) 177 YUV422TORGB_SETUP_REG
134 "vld1.8 {d24}, [%5] \n"
135 MEMACCESS(6)
136 "vld1.8 {d25}, [%6] \n"
137 "vmov.u8 d26, #128 \n"
138 "vmov.u16 q14, #74 \n"
139 "vmov.u16 q15, #16 \n"
140 ".p2align 2 \n"
141 "1: \n" 178 "1: \n"
142 READYUV444 179 READYUV444
143 YUV422TORGB 180 YUV422TORGB(v22, v21, v20)
144 "subs %4, %4, #8 \n" 181 "subs %4, %4, #8 \n"
145 "vmov.u8 d23, #255 \n" 182 "movi v23.8b, #255 \n" /* A */
146 MEMACCESS(3) 183 MEMACCESS(3)
147 "vst4.8 {d20, d21, d22, d23}, [%3]! \n" 184 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
148 "bgt 1b \n" 185 "b.gt 1b \n"
149 : "+r"(src_y), // %0 186 : "+r"(src_y), // %0
150 "+r"(src_u), // %1 187 "+r"(src_u), // %1
151 "+r"(src_v), // %2 188 "+r"(src_v), // %2
152 "+r"(dst_argb), // %3 189 "+r"(dst_argb), // %3
153 "+r"(width) // %4 190 "+r"(width) // %4
154 : "r"(&kUVToRB), // %5 191 : [kUVBiasBGR]"r"(&kUVBiasBGR),
155 "r"(&kUVToG) // %6 192 [kYToRgb]"r"(&kYToRgb)
156 : "cc", "memory", "q0", "q1", "q2", "q3", 193 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
157 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 194 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
158 ); 195 );
159 } 196 }
160 #endif // HAS_I444TOARGBROW_NEON 197 #endif // HAS_I444TOARGBROW_NEON
161 198
162 #ifdef HAS_I422TOARGBROW_NEON 199 #ifdef HAS_I422TOARGBROW_NEON
163 void I422ToARGBRow_NEON(const uint8* src_y, 200 void I422ToARGBRow_NEON(const uint8* src_y,
164 const uint8* src_u, 201 const uint8* src_u,
165 const uint8* src_v, 202 const uint8* src_v,
166 uint8* dst_argb, 203 uint8* dst_argb,
167 int width) { 204 int width) {
168 asm volatile ( 205 asm volatile (
169 MEMACCESS(5) 206 YUV422TORGB_SETUP_REG
170 "vld1.8 {d24}, [%5] \n"
171 MEMACCESS(6)
172 "vld1.8 {d25}, [%6] \n"
173 "vmov.u8 d26, #128 \n"
174 "vmov.u16 q14, #74 \n"
175 "vmov.u16 q15, #16 \n"
176 ".p2align 2 \n"
177 "1: \n" 207 "1: \n"
178 READYUV422 208 READYUV422
179 YUV422TORGB 209 YUV422TORGB(v22, v21, v20)
180 "subs %4, %4, #8 \n" 210 "subs %4, %4, #8 \n"
181 "vmov.u8 d23, #255 \n" 211 "movi v23.8b, #255 \n" /* A */
182 MEMACCESS(3) 212 MEMACCESS(3)
183 "vst4.8 {d20, d21, d22, d23}, [%3]! \n" 213 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
184 "bgt 1b \n" 214 "b.gt 1b \n"
185 : "+r"(src_y), // %0 215 : "+r"(src_y), // %0
186 "+r"(src_u), // %1 216 "+r"(src_u), // %1
187 "+r"(src_v), // %2 217 "+r"(src_v), // %2
188 "+r"(dst_argb), // %3 218 "+r"(dst_argb), // %3
189 "+r"(width) // %4 219 "+r"(width) // %4
190 : "r"(&kUVToRB), // %5 220 : [kUVBiasBGR]"r"(&kUVBiasBGR),
191 "r"(&kUVToG) // %6 221 [kYToRgb]"r"(&kYToRgb)
192 : "cc", "memory", "q0", "q1", "q2", "q3", 222 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
193 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 223 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
194 ); 224 );
195 } 225 }
196 #endif // HAS_I422TOARGBROW_NEON 226 #endif // HAS_I422TOARGBROW_NEON
197 227
198 #ifdef HAS_I411TOARGBROW_NEON 228 #ifdef HAS_I411TOARGBROW_NEON
199 void I411ToARGBRow_NEON(const uint8* src_y, 229 void I411ToARGBRow_NEON(const uint8* src_y,
200 const uint8* src_u, 230 const uint8* src_u,
201 const uint8* src_v, 231 const uint8* src_v,
202 uint8* dst_argb, 232 uint8* dst_argb,
203 int width) { 233 int width) {
204 asm volatile ( 234 asm volatile (
205 MEMACCESS(5) 235 YUV422TORGB_SETUP_REG
206 "vld1.8 {d24}, [%5] \n"
207 MEMACCESS(6)
208 "vld1.8 {d25}, [%6] \n"
209 "vmov.u8 d26, #128 \n"
210 "vmov.u16 q14, #74 \n"
211 "vmov.u16 q15, #16 \n"
212 ".p2align 2 \n"
213 "1: \n" 236 "1: \n"
214 READYUV411 237 READYUV411
215 YUV422TORGB 238 YUV422TORGB(v22, v21, v20)
216 "subs %4, %4, #8 \n" 239 "subs %4, %4, #8 \n"
217 "vmov.u8 d23, #255 \n" 240 "movi v23.8b, #255 \n" /* A */
218 MEMACCESS(3) 241 MEMACCESS(3)
219 "vst4.8 {d20, d21, d22, d23}, [%3]! \n" 242 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
220 "bgt 1b \n" 243 "b.gt 1b \n"
221 : "+r"(src_y), // %0 244 : "+r"(src_y), // %0
222 "+r"(src_u), // %1 245 "+r"(src_u), // %1
223 "+r"(src_v), // %2 246 "+r"(src_v), // %2
224 "+r"(dst_argb), // %3 247 "+r"(dst_argb), // %3
225 "+r"(width) // %4 248 "+r"(width) // %4
226 : "r"(&kUVToRB), // %5 249 : [kUVBiasBGR]"r"(&kUVBiasBGR),
227 "r"(&kUVToG) // %6 250 [kYToRgb]"r"(&kYToRgb)
228 : "cc", "memory", "q0", "q1", "q2", "q3", 251 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
229 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 252 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
230 ); 253 );
231 } 254 }
232 #endif // HAS_I411TOARGBROW_NEON 255 #endif // HAS_I411TOARGBROW_NEON
233 256
234 #ifdef HAS_I422TOBGRAROW_NEON 257 #ifdef HAS_I422TOBGRAROW_NEON
235 void I422ToBGRARow_NEON(const uint8* src_y, 258 void I422ToBGRARow_NEON(const uint8* src_y,
236 const uint8* src_u, 259 const uint8* src_u,
237 const uint8* src_v, 260 const uint8* src_v,
238 uint8* dst_bgra, 261 uint8* dst_bgra,
239 int width) { 262 int width) {
240 asm volatile ( 263 asm volatile (
241 MEMACCESS(5) 264 YUV422TORGB_SETUP_REG
242 "vld1.8 {d24}, [%5] \n"
243 MEMACCESS(6)
244 "vld1.8 {d25}, [%6] \n"
245 "vmov.u8 d26, #128 \n"
246 "vmov.u16 q14, #74 \n"
247 "vmov.u16 q15, #16 \n"
248 ".p2align 2 \n"
249 "1: \n" 265 "1: \n"
250 READYUV422 266 READYUV422
251 YUV422TORGB 267 YUV422TORGB(v21, v22, v23)
252 "subs %4, %4, #8 \n" 268 "subs %4, %4, #8 \n"
253 "vswp.u8 d20, d22 \n" 269 "movi v20.8b, #255 \n" /* A */
254 "vmov.u8 d19, #255 \n"
255 MEMACCESS(3) 270 MEMACCESS(3)
256 "vst4.8 {d19, d20, d21, d22}, [%3]! \n" 271 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
257 "bgt 1b \n" 272 "b.gt 1b \n"
258 : "+r"(src_y), // %0 273 : "+r"(src_y), // %0
259 "+r"(src_u), // %1 274 "+r"(src_u), // %1
260 "+r"(src_v), // %2 275 "+r"(src_v), // %2
261 "+r"(dst_bgra), // %3 276 "+r"(dst_bgra), // %3
262 "+r"(width) // %4 277 "+r"(width) // %4
263 : "r"(&kUVToRB), // %5 278 : [kUVBiasBGR]"r"(&kUVBiasBGR),
264 "r"(&kUVToG) // %6 279 [kYToRgb]"r"(&kYToRgb)
265 : "cc", "memory", "q0", "q1", "q2", "q3", 280 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
266 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 281 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
267 ); 282 );
268 } 283 }
269 #endif // HAS_I422TOBGRAROW_NEON 284 #endif // HAS_I422TOBGRAROW_NEON
270 285
271 #ifdef HAS_I422TOABGRROW_NEON 286 #ifdef HAS_I422TOABGRROW_NEON
272 void I422ToABGRRow_NEON(const uint8* src_y, 287 void I422ToABGRRow_NEON(const uint8* src_y,
273 const uint8* src_u, 288 const uint8* src_u,
274 const uint8* src_v, 289 const uint8* src_v,
275 uint8* dst_abgr, 290 uint8* dst_abgr,
276 int width) { 291 int width) {
277 asm volatile ( 292 asm volatile (
278 MEMACCESS(5) 293 YUV422TORGB_SETUP_REG
279 "vld1.8 {d24}, [%5] \n"
280 MEMACCESS(6)
281 "vld1.8 {d25}, [%6] \n"
282 "vmov.u8 d26, #128 \n"
283 "vmov.u16 q14, #74 \n"
284 "vmov.u16 q15, #16 \n"
285 ".p2align 2 \n"
286 "1: \n" 294 "1: \n"
287 READYUV422 295 READYUV422
288 YUV422TORGB 296 YUV422TORGB(v20, v21, v22)
289 "subs %4, %4, #8 \n" 297 "subs %4, %4, #8 \n"
290 "vswp.u8 d20, d22 \n" 298 "movi v23.8b, #255 \n" /* A */
291 "vmov.u8 d23, #255 \n"
292 MEMACCESS(3) 299 MEMACCESS(3)
293 "vst4.8 {d20, d21, d22, d23}, [%3]! \n" 300 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
294 "bgt 1b \n" 301 "b.gt 1b \n"
295 : "+r"(src_y), // %0 302 : "+r"(src_y), // %0
296 "+r"(src_u), // %1 303 "+r"(src_u), // %1
297 "+r"(src_v), // %2 304 "+r"(src_v), // %2
298 "+r"(dst_abgr), // %3 305 "+r"(dst_abgr), // %3
299 "+r"(width) // %4 306 "+r"(width) // %4
300 : "r"(&kUVToRB), // %5 307 : [kUVBiasBGR]"r"(&kUVBiasBGR),
301 "r"(&kUVToG) // %6 308 [kYToRgb]"r"(&kYToRgb)
302 : "cc", "memory", "q0", "q1", "q2", "q3", 309 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
303 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 310 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
304 ); 311 );
305 } 312 }
306 #endif // HAS_I422TOABGRROW_NEON 313 #endif // HAS_I422TOABGRROW_NEON
307 314
308 #ifdef HAS_I422TORGBAROW_NEON 315 #ifdef HAS_I422TORGBAROW_NEON
309 void I422ToRGBARow_NEON(const uint8* src_y, 316 void I422ToRGBARow_NEON(const uint8* src_y,
310 const uint8* src_u, 317 const uint8* src_u,
311 const uint8* src_v, 318 const uint8* src_v,
312 uint8* dst_rgba, 319 uint8* dst_rgba,
313 int width) { 320 int width) {
314 asm volatile ( 321 asm volatile (
315 MEMACCESS(5) 322 YUV422TORGB_SETUP_REG
316 "vld1.8 {d24}, [%5] \n"
317 MEMACCESS(6)
318 "vld1.8 {d25}, [%6] \n"
319 "vmov.u8 d26, #128 \n"
320 "vmov.u16 q14, #74 \n"
321 "vmov.u16 q15, #16 \n"
322 ".p2align 2 \n"
323 "1: \n" 323 "1: \n"
324 READYUV422 324 READYUV422
325 YUV422TORGB 325 YUV422TORGB(v23, v22, v21)
326 "subs %4, %4, #8 \n" 326 "subs %4, %4, #8 \n"
327 "vmov.u8 d19, #255 \n" 327 "movi v20.8b, #255 \n" /* A */
328 MEMACCESS(3) 328 MEMACCESS(3)
329 "vst4.8 {d19, d20, d21, d22}, [%3]! \n" 329 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
330 "bgt 1b \n" 330 "b.gt 1b \n"
331 : "+r"(src_y), // %0 331 : "+r"(src_y), // %0
332 "+r"(src_u), // %1 332 "+r"(src_u), // %1
333 "+r"(src_v), // %2 333 "+r"(src_v), // %2
334 "+r"(dst_rgba), // %3 334 "+r"(dst_rgba), // %3
335 "+r"(width) // %4 335 "+r"(width) // %4
336 : "r"(&kUVToRB), // %5 336 : [kUVBiasBGR]"r"(&kUVBiasBGR),
337 "r"(&kUVToG) // %6 337 [kYToRgb]"r"(&kYToRgb)
338 : "cc", "memory", "q0", "q1", "q2", "q3", 338 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
339 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 339 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
340 ); 340 );
341 } 341 }
342 #endif // HAS_I422TORGBAROW_NEON 342 #endif // HAS_I422TORGBAROW_NEON
343 343
344 #ifdef HAS_I422TORGB24ROW_NEON 344 #ifdef HAS_I422TORGB24ROW_NEON
345 void I422ToRGB24Row_NEON(const uint8* src_y, 345 void I422ToRGB24Row_NEON(const uint8* src_y,
346 const uint8* src_u, 346 const uint8* src_u,
347 const uint8* src_v, 347 const uint8* src_v,
348 uint8* dst_rgb24, 348 uint8* dst_rgb24,
349 int width) { 349 int width) {
350 asm volatile ( 350 asm volatile (
351 MEMACCESS(5) 351 YUV422TORGB_SETUP_REG
352 "vld1.8 {d24}, [%5] \n"
353 MEMACCESS(6)
354 "vld1.8 {d25}, [%6] \n"
355 "vmov.u8 d26, #128 \n"
356 "vmov.u16 q14, #74 \n"
357 "vmov.u16 q15, #16 \n"
358 ".p2align 2 \n"
359 "1: \n" 352 "1: \n"
360 READYUV422 353 READYUV422
361 YUV422TORGB 354 YUV422TORGB(v22, v21, v20)
362 "subs %4, %4, #8 \n" 355 "subs %4, %4, #8 \n"
363 MEMACCESS(3) 356 MEMACCESS(3)
364 "vst3.8 {d20, d21, d22}, [%3]! \n" 357 "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
365 "bgt 1b \n" 358 "b.gt 1b \n"
366 : "+r"(src_y), // %0 359 : "+r"(src_y), // %0
367 "+r"(src_u), // %1 360 "+r"(src_u), // %1
368 "+r"(src_v), // %2 361 "+r"(src_v), // %2
369 "+r"(dst_rgb24), // %3 362 "+r"(dst_rgb24), // %3
370 "+r"(width) // %4 363 "+r"(width) // %4
371 : "r"(&kUVToRB), // %5 364 : [kUVBiasBGR]"r"(&kUVBiasBGR),
372 "r"(&kUVToG) // %6 365 [kYToRgb]"r"(&kYToRgb)
373 : "cc", "memory", "q0", "q1", "q2", "q3", 366 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
374 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 367 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
375 ); 368 );
376 } 369 }
377 #endif // HAS_I422TORGB24ROW_NEON 370 #endif // HAS_I422TORGB24ROW_NEON
378 371
379 #ifdef HAS_I422TORAWROW_NEON 372 #ifdef HAS_I422TORAWROW_NEON
380 void I422ToRAWRow_NEON(const uint8* src_y, 373 void I422ToRAWRow_NEON(const uint8* src_y,
381 const uint8* src_u, 374 const uint8* src_u,
382 const uint8* src_v, 375 const uint8* src_v,
383 uint8* dst_raw, 376 uint8* dst_raw,
384 int width) { 377 int width) {
385 asm volatile ( 378 asm volatile (
386 MEMACCESS(5) 379 YUV422TORGB_SETUP_REG
387 "vld1.8 {d24}, [%5] \n"
388 MEMACCESS(6)
389 "vld1.8 {d25}, [%6] \n"
390 "vmov.u8 d26, #128 \n"
391 "vmov.u16 q14, #74 \n"
392 "vmov.u16 q15, #16 \n"
393 ".p2align 2 \n"
394 "1: \n" 380 "1: \n"
395 READYUV422 381 READYUV422
396 YUV422TORGB 382 YUV422TORGB(v20, v21, v22)
397 "subs %4, %4, #8 \n" 383 "subs %4, %4, #8 \n"
398 "vswp.u8 d20, d22 \n"
399 MEMACCESS(3) 384 MEMACCESS(3)
400 "vst3.8 {d20, d21, d22}, [%3]! \n" 385 "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
401 "bgt 1b \n" 386 "b.gt 1b \n"
402 : "+r"(src_y), // %0 387 : "+r"(src_y), // %0
403 "+r"(src_u), // %1 388 "+r"(src_u), // %1
404 "+r"(src_v), // %2 389 "+r"(src_v), // %2
405 "+r"(dst_raw), // %3 390 "+r"(dst_raw), // %3
406 "+r"(width) // %4 391 "+r"(width) // %4
407 : "r"(&kUVToRB), // %5 392 : [kUVBiasBGR]"r"(&kUVBiasBGR),
408 "r"(&kUVToG) // %6 393 [kYToRgb]"r"(&kYToRgb)
409 : "cc", "memory", "q0", "q1", "q2", "q3", 394 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
410 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 395 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
411 ); 396 );
412 } 397 }
413 #endif // HAS_I422TORAWROW_NEON 398 #endif // HAS_I422TORAWROW_NEON
414 399
415 #define ARGBTORGB565 \ 400 #define ARGBTORGB565 \
416 "vshr.u8 d20, d20, #3 \n" /* B */ \ 401 "shll v0.8h, v22.8b, #8 \n" /* R */ \
417 "vshr.u8 d21, d21, #2 \n" /* G */ \ 402 "shll v20.8h, v20.8b, #8 \n" /* B */ \
418 "vshr.u8 d22, d22, #3 \n" /* R */ \ 403 "shll v21.8h, v21.8b, #8 \n" /* G */ \
419 "vmovl.u8 q8, d20 \n" /* B */ \ 404 "sri v0.8h, v21.8h, #5 \n" /* RG */ \
420 "vmovl.u8 q9, d21 \n" /* G */ \ 405 "sri v0.8h, v20.8h, #11 \n" /* RGB */
421 "vmovl.u8 q10, d22 \n" /* R */ \
422 "vshl.u16 q9, q9, #5 \n" /* G */ \
423 "vshl.u16 q10, q10, #11 \n" /* R */ \
424 "vorr q0, q8, q9 \n" /* BG */ \
425 "vorr q0, q0, q10 \n" /* BGR */
426 406
427 #ifdef HAS_I422TORGB565ROW_NEON 407 #ifdef HAS_I422TORGB565ROW_NEON
428 void I422ToRGB565Row_NEON(const uint8* src_y, 408 void I422ToRGB565Row_NEON(const uint8* src_y,
429 const uint8* src_u, 409 const uint8* src_u,
430 const uint8* src_v, 410 const uint8* src_v,
431 uint8* dst_rgb565, 411 uint8* dst_rgb565,
432 int width) { 412 int width) {
433 asm volatile ( 413 asm volatile (
434 MEMACCESS(5) 414 YUV422TORGB_SETUP_REG
435 "vld1.8 {d24}, [%5] \n"
436 MEMACCESS(6)
437 "vld1.8 {d25}, [%6] \n"
438 "vmov.u8 d26, #128 \n"
439 "vmov.u16 q14, #74 \n"
440 "vmov.u16 q15, #16 \n"
441 ".p2align 2 \n"
442 "1: \n" 415 "1: \n"
443 READYUV422 416 READYUV422
444 YUV422TORGB 417 YUV422TORGB(v22, v21, v20)
445 "subs %4, %4, #8 \n" 418 "subs %4, %4, #8 \n"
446 ARGBTORGB565 419 ARGBTORGB565
447 MEMACCESS(3) 420 MEMACCESS(3)
448 "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565. 421 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
449 "bgt 1b \n" 422 "b.gt 1b \n"
450 : "+r"(src_y), // %0 423 : "+r"(src_y), // %0
451 "+r"(src_u), // %1 424 "+r"(src_u), // %1
452 "+r"(src_v), // %2 425 "+r"(src_v), // %2
453 "+r"(dst_rgb565), // %3 426 "+r"(dst_rgb565), // %3
454 "+r"(width) // %4 427 "+r"(width) // %4
455 : "r"(&kUVToRB), // %5 428 : [kUVBiasBGR]"r"(&kUVBiasBGR),
456 "r"(&kUVToG) // %6 429 [kYToRgb]"r"(&kYToRgb)
457 : "cc", "memory", "q0", "q1", "q2", "q3", 430 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
458 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 431 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
459 ); 432 );
460 } 433 }
461 #endif // HAS_I422TORGB565ROW_NEON 434 #endif // HAS_I422TORGB565ROW_NEON
462 435
463 #define ARGBTOARGB1555 \ 436 #define ARGBTOARGB1555 \
464 "vshr.u8 q10, q10, #3 \n" /* B */ \ 437 "shll v0.8h, v23.8b, #8 \n" /* A */ \
465 "vshr.u8 d22, d22, #3 \n" /* R */ \ 438 "shll v22.8h, v22.8b, #8 \n" /* R */ \
466 "vshr.u8 d23, d23, #7 \n" /* A */ \ 439 "shll v20.8h, v20.8b, #8 \n" /* B */ \
467 "vmovl.u8 q8, d20 \n" /* B */ \ 440 "shll v21.8h, v21.8b, #8 \n" /* G */ \
468 "vmovl.u8 q9, d21 \n" /* G */ \ 441 "sri v0.8h, v22.8h, #1 \n" /* AR */ \
469 "vmovl.u8 q10, d22 \n" /* R */ \ 442 "sri v0.8h, v21.8h, #6 \n" /* ARG */ \
470 "vmovl.u8 q11, d23 \n" /* A */ \ 443 "sri v0.8h, v20.8h, #11 \n" /* ARGB */
471 "vshl.u16 q9, q9, #5 \n" /* G */ \
472 "vshl.u16 q10, q10, #10 \n" /* R */ \
473 "vshl.u16 q11, q11, #15 \n" /* A */ \
474 "vorr q0, q8, q9 \n" /* BG */ \
475 "vorr q1, q10, q11 \n" /* RA */ \
476 "vorr q0, q0, q1 \n" /* BGRA */
477 444
478 #ifdef HAS_I422TOARGB1555ROW_NEON 445 #ifdef HAS_I422TOARGB1555ROW_NEON
479 void I422ToARGB1555Row_NEON(const uint8* src_y, 446 void I422ToARGB1555Row_NEON(const uint8* src_y,
480 const uint8* src_u, 447 const uint8* src_u,
481 const uint8* src_v, 448 const uint8* src_v,
482 uint8* dst_argb1555, 449 uint8* dst_argb1555,
483 int width) { 450 int width) {
484 asm volatile ( 451 asm volatile (
485 MEMACCESS(5) 452 YUV422TORGB_SETUP_REG
486 "vld1.8 {d24}, [%5] \n"
487 MEMACCESS(6)
488 "vld1.8 {d25}, [%6] \n"
489 "vmov.u8 d26, #128 \n"
490 "vmov.u16 q14, #74 \n"
491 "vmov.u16 q15, #16 \n"
492 ".p2align 2 \n"
493 "1: \n" 453 "1: \n"
494 READYUV422 454 READYUV422
495 YUV422TORGB 455 YUV422TORGB(v22, v21, v20)
496 "subs %4, %4, #8 \n" 456 "subs %4, %4, #8 \n"
497 "vmov.u8 d23, #255 \n" 457 "movi v23.8b, #255 \n"
498 ARGBTOARGB1555 458 ARGBTOARGB1555
499 MEMACCESS(3) 459 MEMACCESS(3)
500 "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB1555. 460 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
501 "bgt 1b \n" 461 "b.gt 1b \n"
502 : "+r"(src_y), // %0 462 : "+r"(src_y), // %0
503 "+r"(src_u), // %1 463 "+r"(src_u), // %1
504 "+r"(src_v), // %2 464 "+r"(src_v), // %2
505 "+r"(dst_argb1555), // %3 465 "+r"(dst_argb1555), // %3
506 "+r"(width) // %4 466 "+r"(width) // %4
507 : "r"(&kUVToRB), // %5 467 : [kUVBiasBGR]"r"(&kUVBiasBGR),
508 "r"(&kUVToG) // %6 468 [kYToRgb]"r"(&kYToRgb)
509 : "cc", "memory", "q0", "q1", "q2", "q3", 469 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
510 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 470 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
511 ); 471 );
512 } 472 }
513 #endif // HAS_I422TOARGB1555ROW_NEON 473 #endif // HAS_I422TOARGB1555ROW_NEON
514 474
515 #define ARGBTOARGB4444 \ 475 #define ARGBTOARGB4444 \
516 "vshr.u8 d20, d20, #4 \n" /* B */ \ 476 /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \
517 "vbic.32 d21, d21, d4 \n" /* G */ \ 477 "ushr v20.8b, v20.8b, #4 \n" /* B */ \
518 "vshr.u8 d22, d22, #4 \n" /* R */ \ 478 "bic v21.8b, v21.8b, v4.8b \n" /* G */ \
519 "vbic.32 d23, d23, d4 \n" /* A */ \ 479 "ushr v22.8b, v22.8b, #4 \n" /* R */ \
520 "vorr d0, d20, d21 \n" /* BG */ \ 480 "bic v23.8b, v23.8b, v4.8b \n" /* A */ \
521 "vorr d1, d22, d23 \n" /* RA */ \ 481 "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \
522 "vzip.u8 d0, d1 \n" /* BGRA */ 482 "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \
483 "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */
523 484
524 #ifdef HAS_I422TOARGB4444ROW_NEON 485 #ifdef HAS_I422TOARGB4444ROW_NEON
525 void I422ToARGB4444Row_NEON(const uint8* src_y, 486 void I422ToARGB4444Row_NEON(const uint8* src_y,
526 const uint8* src_u, 487 const uint8* src_u,
527 const uint8* src_v, 488 const uint8* src_v,
528 uint8* dst_argb4444, 489 uint8* dst_argb4444,
529 int width) { 490 int width) {
530 asm volatile ( 491 asm volatile (
531 MEMACCESS(5) 492 YUV422TORGB_SETUP_REG
532 "vld1.8 {d24}, [%5] \n" 493 "movi v4.16b, #0x0f \n" // bits to clear with vbic.
533 MEMACCESS(6)
534 "vld1.8 {d25}, [%6] \n"
535 "vmov.u8 d26, #128 \n"
536 "vmov.u16 q14, #74 \n"
537 "vmov.u16 q15, #16 \n"
538 "vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
539 ".p2align 2 \n"
540 "1: \n" 494 "1: \n"
541 READYUV422 495 READYUV422
542 YUV422TORGB 496 YUV422TORGB(v22, v21, v20)
543 "subs %4, %4, #8 \n" 497 "subs %4, %4, #8 \n"
544 "vmov.u8 d23, #255 \n" 498 "movi v23.8b, #255 \n"
545 ARGBTOARGB4444 499 ARGBTOARGB4444
546 MEMACCESS(3) 500 MEMACCESS(3)
547 "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB4444. 501 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444.
548 "bgt 1b \n" 502 "b.gt 1b \n"
549 : "+r"(src_y), // %0 503 : "+r"(src_y), // %0
550 "+r"(src_u), // %1 504 "+r"(src_u), // %1
551 "+r"(src_v), // %2 505 "+r"(src_v), // %2
552 "+r"(dst_argb4444), // %3 506 "+r"(dst_argb4444), // %3
553 "+r"(width) // %4 507 "+r"(width) // %4
554 : "r"(&kUVToRB), // %5 508 : [kUVBiasBGR]"r"(&kUVBiasBGR),
555 "r"(&kUVToG) // %6 509 [kYToRgb]"r"(&kYToRgb)
556 : "cc", "memory", "q0", "q1", "q2", "q3", 510 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
557 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 511 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
558 ); 512 );
559 } 513 }
560 #endif // HAS_I422TOARGB4444ROW_NEON 514 #endif // HAS_I422TOARGB4444ROW_NEON
561 515
562 #ifdef HAS_YTOARGBROW_NEON 516 #ifdef HAS_YTOARGBROW_NEON
563 void YToARGBRow_NEON(const uint8* src_y, 517 void YToARGBRow_NEON(const uint8* src_y,
564 uint8* dst_argb, 518 uint8* dst_argb,
565 int width) { 519 int width) {
566 asm volatile ( 520 asm volatile (
567 MEMACCESS(3) 521 YUV422TORGB_SETUP_REG
568 "vld1.8 {d24}, [%3] \n"
569 MEMACCESS(4)
570 "vld1.8 {d25}, [%4] \n"
571 "vmov.u8 d26, #128 \n"
572 "vmov.u16 q14, #74 \n"
573 "vmov.u16 q15, #16 \n"
574 ".p2align 2 \n"
575 "1: \n" 522 "1: \n"
576 READYUV400 523 READYUV400
577 YUV422TORGB 524 YUV422TORGB(v22, v21, v20)
578 "subs %2, %2, #8 \n" 525 "subs %2, %2, #8 \n"
579 "vmov.u8 d23, #255 \n" 526 "movi v23.8b, #255 \n"
580 MEMACCESS(1) 527 MEMACCESS(1)
581 "vst4.8 {d20, d21, d22, d23}, [%1]! \n" 528 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
582 "bgt 1b \n" 529 "b.gt 1b \n"
583 : "+r"(src_y), // %0 530 : "+r"(src_y), // %0
584 "+r"(dst_argb), // %1 531 "+r"(dst_argb), // %1
585 "+r"(width) // %2 532 "+r"(width) // %2
586 : "r"(&kUVToRB), // %3 533 : [kUVBiasBGR]"r"(&kUVBiasBGR),
587 "r"(&kUVToG) // %4 534 [kYToRgb]"r"(&kYToRgb)
588 : "cc", "memory", "q0", "q1", "q2", "q3", 535 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
589 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 536 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
590 ); 537 );
591 } 538 }
592 #endif // HAS_YTOARGBROW_NEON 539 #endif // HAS_YTOARGBROW_NEON
593 540
594 #ifdef HAS_I400TOARGBROW_NEON 541 #ifdef HAS_I400TOARGBROW_NEON
595 void I400ToARGBRow_NEON(const uint8* src_y, 542 void I400ToARGBRow_NEON(const uint8* src_y,
596 uint8* dst_argb, 543 uint8* dst_argb,
597 int width) { 544 int width) {
598 asm volatile ( 545 asm volatile (
599 ".p2align 2 \n" 546 "movi v23.8b, #255 \n"
600 "vmov.u8 d23, #255 \n"
601 "1: \n" 547 "1: \n"
602 MEMACCESS(0) 548 MEMACCESS(0)
603 "vld1.8 {d20}, [%0]! \n" 549 "ld1 {v20.8b}, [%0], #8 \n"
604 "vmov d21, d20 \n" 550 "orr v21.8b, v20.8b, v20.8b \n"
605 "vmov d22, d20 \n" 551 "orr v22.8b, v20.8b, v20.8b \n"
606 "subs %2, %2, #8 \n" 552 "subs %2, %2, #8 \n"
607 MEMACCESS(1) 553 MEMACCESS(1)
608 "vst4.8 {d20, d21, d22, d23}, [%1]! \n" 554 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
609 "bgt 1b \n" 555 "b.gt 1b \n"
610 : "+r"(src_y), // %0 556 : "+r"(src_y), // %0
611 "+r"(dst_argb), // %1 557 "+r"(dst_argb), // %1
612 "+r"(width) // %2 558 "+r"(width) // %2
613 : 559 :
614 : "cc", "memory", "d20", "d21", "d22", "d23" 560 : "cc", "memory", "v20", "v21", "v22", "v23"
615 ); 561 );
616 } 562 }
617 #endif // HAS_I400TOARGBROW_NEON 563 #endif // HAS_I400TOARGBROW_NEON
618 564
619 #ifdef HAS_NV12TOARGBROW_NEON 565 #ifdef HAS_NV12TOARGBROW_NEON
620 void NV12ToARGBRow_NEON(const uint8* src_y, 566 void NV12ToARGBRow_NEON(const uint8* src_y,
621 const uint8* src_uv, 567 const uint8* src_uv,
622 uint8* dst_argb, 568 uint8* dst_argb,
623 int width) { 569 int width) {
624 asm volatile ( 570 asm volatile (
625 MEMACCESS(4) 571 YUV422TORGB_SETUP_REG
626 "vld1.8 {d24}, [%4] \n"
627 MEMACCESS(5)
628 "vld1.8 {d25}, [%5] \n"
629 "vmov.u8 d26, #128 \n"
630 "vmov.u16 q14, #74 \n"
631 "vmov.u16 q15, #16 \n"
632 ".p2align 2 \n"
633 "1: \n" 572 "1: \n"
634 READNV12 573 READNV12
635 YUV422TORGB 574 YUV422TORGB(v22, v21, v20)
636 "subs %3, %3, #8 \n" 575 "subs %3, %3, #8 \n"
637 "vmov.u8 d23, #255 \n" 576 "movi v23.8b, #255 \n"
638 MEMACCESS(2) 577 MEMACCESS(2)
639 "vst4.8 {d20, d21, d22, d23}, [%2]! \n" 578 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
640 "bgt 1b \n" 579 "b.gt 1b \n"
641 : "+r"(src_y), // %0 580 : "+r"(src_y), // %0
642 "+r"(src_uv), // %1 581 "+r"(src_uv), // %1
643 "+r"(dst_argb), // %2 582 "+r"(dst_argb), // %2
644 "+r"(width) // %3 583 "+r"(width) // %3
645 : "r"(&kUVToRB), // %4 584 : [kUVBiasBGR]"r"(&kUVBiasBGR),
646 "r"(&kUVToG) // %5 585 [kYToRgb]"r"(&kYToRgb)
647 : "cc", "memory", "q0", "q1", "q2", "q3", 586 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
648 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 587 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
649 ); 588 );
650 } 589 }
651 #endif // HAS_NV12TOARGBROW_NEON 590 #endif // HAS_NV12TOARGBROW_NEON
652 591
653 #ifdef HAS_NV21TOARGBROW_NEON 592 #ifdef HAS_NV21TOARGBROW_NEON
654 void NV21ToARGBRow_NEON(const uint8* src_y, 593 void NV21ToARGBRow_NEON(const uint8* src_y,
655 const uint8* src_uv, 594 const uint8* src_uv,
656 uint8* dst_argb, 595 uint8* dst_argb,
657 int width) { 596 int width) {
658 asm volatile ( 597 asm volatile (
659 MEMACCESS(4) 598 YUV422TORGB_SETUP_REG
660 "vld1.8 {d24}, [%4] \n"
661 MEMACCESS(5)
662 "vld1.8 {d25}, [%5] \n"
663 "vmov.u8 d26, #128 \n"
664 "vmov.u16 q14, #74 \n"
665 "vmov.u16 q15, #16 \n"
666 ".p2align 2 \n"
667 "1: \n" 599 "1: \n"
668 READNV21 600 READNV21
669 YUV422TORGB 601 YUV422TORGB(v22, v21, v20)
670 "subs %3, %3, #8 \n" 602 "subs %3, %3, #8 \n"
671 "vmov.u8 d23, #255 \n" 603 "movi v23.8b, #255 \n"
672 MEMACCESS(2) 604 MEMACCESS(2)
673 "vst4.8 {d20, d21, d22, d23}, [%2]! \n" 605 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
674 "bgt 1b \n" 606 "b.gt 1b \n"
675 : "+r"(src_y), // %0 607 : "+r"(src_y), // %0
676 "+r"(src_uv), // %1 608 "+r"(src_uv), // %1
677 "+r"(dst_argb), // %2 609 "+r"(dst_argb), // %2
678 "+r"(width) // %3 610 "+r"(width) // %3
679 : "r"(&kUVToRB), // %4 611 : [kUVBiasBGR]"r"(&kUVBiasBGR),
680 "r"(&kUVToG) // %5 612 [kYToRgb]"r"(&kYToRgb)
681 : "cc", "memory", "q0", "q1", "q2", "q3", 613 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
682 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 614 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
683 ); 615 );
684 } 616 }
685 #endif // HAS_NV21TOARGBROW_NEON 617 #endif // HAS_NV21TOARGBROW_NEON
686 618
687 #ifdef HAS_NV12TORGB565ROW_NEON 619 #ifdef HAS_NV12TORGB565ROW_NEON
688 void NV12ToRGB565Row_NEON(const uint8* src_y, 620 void NV12ToRGB565Row_NEON(const uint8* src_y,
689 const uint8* src_uv, 621 const uint8* src_uv,
690 uint8* dst_rgb565, 622 uint8* dst_rgb565,
691 int width) { 623 int width) {
692 asm volatile ( 624 asm volatile (
693 MEMACCESS(4) 625 YUV422TORGB_SETUP_REG
694 "vld1.8 {d24}, [%4] \n"
695 MEMACCESS(5)
696 "vld1.8 {d25}, [%5] \n"
697 "vmov.u8 d26, #128 \n"
698 "vmov.u16 q14, #74 \n"
699 "vmov.u16 q15, #16 \n"
700 ".p2align 2 \n"
701 "1: \n" 626 "1: \n"
702 READNV12 627 READNV12
703 YUV422TORGB 628 YUV422TORGB(v22, v21, v20)
704 "subs %3, %3, #8 \n" 629 "subs %3, %3, #8 \n"
705 ARGBTORGB565 630 ARGBTORGB565
706 MEMACCESS(2) 631 MEMACCESS(2)
707 "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. 632 "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565.
708 "bgt 1b \n" 633 "b.gt 1b \n"
709 : "+r"(src_y), // %0 634 : "+r"(src_y), // %0
710 "+r"(src_uv), // %1 635 "+r"(src_uv), // %1
711 "+r"(dst_rgb565), // %2 636 "+r"(dst_rgb565), // %2
712 "+r"(width) // %3 637 "+r"(width) // %3
713 : "r"(&kUVToRB), // %4 638 : [kUVBiasBGR]"r"(&kUVBiasBGR),
714 "r"(&kUVToG) // %5 639 [kYToRgb]"r"(&kYToRgb)
715 : "cc", "memory", "q0", "q1", "q2", "q3", 640 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
716 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 641 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
717 ); 642 );
718 } 643 }
719 #endif // HAS_NV12TORGB565ROW_NEON 644 #endif // HAS_NV12TORGB565ROW_NEON
720 645
721 #ifdef HAS_NV21TORGB565ROW_NEON 646 #ifdef HAS_NV21TORGB565ROW_NEON
722 void NV21ToRGB565Row_NEON(const uint8* src_y, 647 void NV21ToRGB565Row_NEON(const uint8* src_y,
723 const uint8* src_uv, 648 const uint8* src_uv,
724 uint8* dst_rgb565, 649 uint8* dst_rgb565,
725 int width) { 650 int width) {
726 asm volatile ( 651 asm volatile (
727 MEMACCESS(4) 652 YUV422TORGB_SETUP_REG
728 "vld1.8 {d24}, [%4] \n"
729 MEMACCESS(5)
730 "vld1.8 {d25}, [%5] \n"
731 "vmov.u8 d26, #128 \n"
732 "vmov.u16 q14, #74 \n"
733 "vmov.u16 q15, #16 \n"
734 ".p2align 2 \n"
735 "1: \n" 653 "1: \n"
736 READNV21 654 READNV21
737 YUV422TORGB 655 YUV422TORGB(v22, v21, v20)
738 "subs %3, %3, #8 \n" 656 "subs %3, %3, #8 \n"
739 ARGBTORGB565 657 ARGBTORGB565
740 MEMACCESS(2) 658 MEMACCESS(2)
741 "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. 659 "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565.
742 "bgt 1b \n" 660 "b.gt 1b \n"
743 : "+r"(src_y), // %0 661 : "+r"(src_y), // %0
744 "+r"(src_uv), // %1 662 "+r"(src_uv), // %1
745 "+r"(dst_rgb565), // %2 663 "+r"(dst_rgb565), // %2
746 "+r"(width) // %3 664 "+r"(width) // %3
747 : "r"(&kUVToRB), // %4 665 : [kUVBiasBGR]"r"(&kUVBiasBGR),
748 "r"(&kUVToG) // %5 666 [kYToRgb]"r"(&kYToRgb)
749 : "cc", "memory", "q0", "q1", "q2", "q3", 667 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
750 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 668 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
751 ); 669 );
752 } 670 }
753 #endif // HAS_NV21TORGB565ROW_NEON 671 #endif // HAS_NV21TORGB565ROW_NEON
754 672
755 #ifdef HAS_YUY2TOARGBROW_NEON 673 #ifdef HAS_YUY2TOARGBROW_NEON
756 void YUY2ToARGBRow_NEON(const uint8* src_yuy2, 674 void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
757 uint8* dst_argb, 675 uint8* dst_argb,
758 int width) { 676 int width) {
759 asm volatile ( 677 asm volatile (
760 MEMACCESS(3) 678 YUV422TORGB_SETUP_REG
761 "vld1.8 {d24}, [%3] \n"
762 MEMACCESS(4)
763 "vld1.8 {d25}, [%4] \n"
764 "vmov.u8 d26, #128 \n"
765 "vmov.u16 q14, #74 \n"
766 "vmov.u16 q15, #16 \n"
767 ".p2align 2 \n"
768 "1: \n" 679 "1: \n"
769 READYUY2 680 READYUY2
770 YUV422TORGB 681 YUV422TORGB(v22, v21, v20)
771 "subs %2, %2, #8 \n" 682 "subs %2, %2, #8 \n"
772 "vmov.u8 d23, #255 \n" 683 "movi v23.8b, #255 \n"
773 MEMACCESS(1) 684 MEMACCESS(1)
774 "vst4.8 {d20, d21, d22, d23}, [%1]! \n" 685 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
775 "bgt 1b \n" 686 "b.gt 1b \n"
776 : "+r"(src_yuy2), // %0 687 : "+r"(src_yuy2), // %0
777 "+r"(dst_argb), // %1 688 "+r"(dst_argb), // %1
778 "+r"(width) // %2 689 "+r"(width) // %2
779 : "r"(&kUVToRB), // %3 690 : [kUVBiasBGR]"r"(&kUVBiasBGR),
780 "r"(&kUVToG) // %4 691 [kYToRgb]"r"(&kYToRgb)
781 : "cc", "memory", "q0", "q1", "q2", "q3", 692 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
782 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 693 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
783 ); 694 );
784 } 695 }
785 #endif // HAS_YUY2TOARGBROW_NEON 696 #endif // HAS_YUY2TOARGBROW_NEON
786 697
787 #ifdef HAS_UYVYTOARGBROW_NEON 698 #ifdef HAS_UYVYTOARGBROW_NEON
788 void UYVYToARGBRow_NEON(const uint8* src_uyvy, 699 void UYVYToARGBRow_NEON(const uint8* src_uyvy,
789 uint8* dst_argb, 700 uint8* dst_argb,
790 int width) { 701 int width) {
791 asm volatile ( 702 asm volatile (
792 MEMACCESS(3) 703 YUV422TORGB_SETUP_REG
793 "vld1.8 {d24}, [%3] \n"
794 MEMACCESS(4)
795 "vld1.8 {d25}, [%4] \n"
796 "vmov.u8 d26, #128 \n"
797 "vmov.u16 q14, #74 \n"
798 "vmov.u16 q15, #16 \n"
799 ".p2align 2 \n"
800 "1: \n" 704 "1: \n"
801 READUYVY 705 READUYVY
802 YUV422TORGB 706 YUV422TORGB(v22, v21, v20)
803 "subs %2, %2, #8 \n" 707 "subs %2, %2, #8 \n"
804 "vmov.u8 d23, #255 \n" 708 "movi v23.8b, #255 \n"
805 MEMACCESS(1) 709 MEMACCESS(1)
806 "vst4.8 {d20, d21, d22, d23}, [%1]! \n" 710 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"
807 "bgt 1b \n" 711 "b.gt 1b \n"
808 : "+r"(src_uyvy), // %0 712 : "+r"(src_uyvy), // %0
809 "+r"(dst_argb), // %1 713 "+r"(dst_argb), // %1
810 "+r"(width) // %2 714 "+r"(width) // %2
811 : "r"(&kUVToRB), // %3 715 : [kUVBiasBGR]"r"(&kUVBiasBGR),
812 "r"(&kUVToG) // %4 716 [kYToRgb]"r"(&kYToRgb)
813 : "cc", "memory", "q0", "q1", "q2", "q3", 717 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
814 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 718 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
815 ); 719 );
816 } 720 }
817 #endif // HAS_UYVYTOARGBROW_NEON 721 #endif // HAS_UYVYTOARGBROW_NEON
818 722
819 // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. 723 // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
820 #ifdef HAS_SPLITUVROW_NEON 724 #ifdef HAS_SPLITUVROW_NEON
821 void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, 725 void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
822 int width) { 726 int width) {
823 asm volatile ( 727 asm volatile (
824 ".p2align 2 \n"
825 "1: \n" 728 "1: \n"
826 MEMACCESS(0) 729 MEMACCESS(0)
827 "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pairs of UV 730 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
828 "subs %3, %3, #16 \n" // 16 processed per loop 731 "subs %3, %3, #16 \n" // 16 processed per loop
829 MEMACCESS(1) 732 MEMACCESS(1)
830 "st1 {v0.16b}, [%1], #16 \n" // store U 733 "st1 {v0.16b}, [%1], #16 \n" // store U
831 MEMACCESS(2) 734 MEMACCESS(2)
832 "st1 {v1.16b}, [%2], #16 \n" // store V 735 "st1 {v1.16b}, [%2], #16 \n" // store V
833 "bgt 1b \n" 736 "b.gt 1b \n"
834 : "+r"(src_uv), // %0 737 : "+r"(src_uv), // %0
835 "+r"(dst_u), // %1 738 "+r"(dst_u), // %1
836 "+r"(dst_v), // %2 739 "+r"(dst_v), // %2
837 "+r"(width) // %3 // Output registers 740 "+r"(width) // %3 // Output registers
838 : // Input registers 741 : // Input registers
839 : "cc", "memory", "v0", "v1" // Clobber List 742 : "cc", "memory", "v0", "v1" // Clobber List
840 ); 743 );
841 } 744 }
842 #endif // HAS_SPLITUVROW_NEON 745 #endif // HAS_SPLITUVROW_NEON
843 746
844 // Reads 16 U's and V's and writes out 16 pairs of UV. 747 // Reads 16 U's and V's and writes out 16 pairs of UV.
845 #ifdef HAS_MERGEUVROW_NEON 748 #ifdef HAS_MERGEUVROW_NEON
846 void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, 749 void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
847 int width) { 750 int width) {
848 asm volatile ( 751 asm volatile (
849 ".p2align 2 \n"
850 "1: \n" 752 "1: \n"
851 MEMACCESS(0) 753 MEMACCESS(0)
852 "ld1 {v0.16b}, [%0], #16 \n" // load U 754 "ld1 {v0.16b}, [%0], #16 \n" // load U
853 MEMACCESS(1) 755 MEMACCESS(1)
854 "ld1 {v1.16b}, [%1], #16 \n" // load V 756 "ld1 {v1.16b}, [%1], #16 \n" // load V
855 "subs %3, %3, #16 \n" // 16 processed per loop 757 "subs %3, %3, #16 \n" // 16 processed per loop
856 MEMACCESS(2) 758 MEMACCESS(2)
857 "st2 {v0.16b, v1.16b}, [%2], #32 \n" // store 16 pairs of UV 759 "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
858 "bgt 1b \n" 760 "b.gt 1b \n"
859 : 761 :
860 "+r"(src_u), // %0 762 "+r"(src_u), // %0
861 "+r"(src_v), // %1 763 "+r"(src_v), // %1
862 "+r"(dst_uv), // %2 764 "+r"(dst_uv), // %2
863 "+r"(width) // %3 // Output registers 765 "+r"(width) // %3 // Output registers
864 : // Input registers 766 : // Input registers
865 : "cc", "memory", "v0", "v1" // Clobber List 767 : "cc", "memory", "v0", "v1" // Clobber List
866 ); 768 );
867 } 769 }
868 #endif // HAS_MERGEUVROW_NEON 770 #endif // HAS_MERGEUVROW_NEON
869 771
870 // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. 772 // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15.
871 #ifdef HAS_COPYROW_NEON 773 #ifdef HAS_COPYROW_NEON
872 void CopyRow_NEON(const uint8* src, uint8* dst, int count) { 774 void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
873 asm volatile ( 775 asm volatile (
874 ".p2align 2 \n"
875 "1: \n" 776 "1: \n"
876 MEMACCESS(0) 777 MEMACCESS(0)
877 "ld1 {v0.8b-v3.8b}, [%0], #32 \n" // load 32 778 "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32
878 "subs %2, %2, #32 \n" // 32 processed per loop 779 "subs %2, %2, #32 \n" // 32 processed per loop
879 MEMACCESS(1) 780 MEMACCESS(1)
880 "st1 {v0.8b-v3.8b}, [%1], #32 \n" // store 32 781 "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32
881 "bgt 1b \n" 782 "b.gt 1b \n"
882 : "+r"(src), // %0 783 : "+r"(src), // %0
883 "+r"(dst), // %1 784 "+r"(dst), // %1
884 "+r"(count) // %2 // Output registers 785 "+r"(count) // %2 // Output registers
885 : // Input registers 786 : // Input registers
886 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 787 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
887 ); 788 );
888 } 789 }
889 #endif // HAS_COPYROW_NEON 790 #endif // HAS_COPYROW_NEON
890 791
891 // SetRow8 writes 'count' bytes using a 32 bit value repeated. 792 // SetRow writes 'count' bytes using an 8 bit value repeated.
892 #ifdef HAS_SETROW_NEON 793 void SetRow_NEON(uint8* dst, uint8 v8, int count) {
893 void SetRow_NEON(uint8* dst, uint32 v32, int count) {
894 asm volatile ( 794 asm volatile (
895 "dup v0.4s, %w2 \n" // duplicate 4 ints 795 "dup v0.16b, %w2 \n" // duplicate 16 bytes
896 "1: \n" 796 "1: \n"
897 "subs %1, %1, #16 \n" // 16 bytes per loop 797 "subs %1, %1, #16 \n" // 16 bytes per loop
898 MEMACCESS(0) 798 MEMACCESS(0)
899 "st1 {v0.16b}, [%0], #16 \n" // store 799 "st1 {v0.16b}, [%0], #16 \n" // store
900 "bgt 1b \n" 800 "b.gt 1b \n"
801 : "+r"(dst), // %0
802 "+r"(count) // %1
803 : "r"(v8) // %2
804 : "cc", "memory", "v0"
805 );
806 }
807
808 void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
809 asm volatile (
810 "dup v0.4s, %w2 \n" // duplicate 4 ints
811 "1: \n"
812 "subs %1, %1, #4 \n" // 4 ints per loop
813 MEMACCESS(0)
814 "st1 {v0.16b}, [%0], #16 \n" // store
815 "b.gt 1b \n"
901 : "+r"(dst), // %0 816 : "+r"(dst), // %0
902 "+r"(count) // %1 817 "+r"(count) // %1
903 : "r"(v32) // %2 818 : "r"(v32) // %2
904 : "cc", "memory", "v0" 819 : "cc", "memory", "v0"
905 ); 820 );
906 } 821 }
907 #endif // HAS_SETROW_NEON
908
909 // TODO(fbarchard): Make fully assembler
910 // SetRow32 writes 'count' words using a 32 bit value repeated.
911 #ifdef HAS_ARGBSETROWS_NEON
912 void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width,
913 int dst_stride, int height) {
914 for (int y = 0; y < height; ++y) {
915 SetRow_NEON(dst, v32, width << 2);
916 dst += dst_stride;
917 }
918 }
919 #endif // HAS_ARGBSETROWS_NEON
920 822
921 #ifdef HAS_MIRRORROW_NEON 823 #ifdef HAS_MIRRORROW_NEON
922 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { 824 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
923 asm volatile ( 825 asm volatile (
924 // Start at end of source row. 826 // Start at end of source row.
925 "add %0, %0, %2 \n" 827 "add %0, %0, %2 \n"
926 "sub %0, %0, #16 \n" 828 "sub %0, %0, #16 \n"
927 829
928 ".p2align 2 \n"
929 "1: \n" 830 "1: \n"
930 MEMACCESS(0) 831 MEMACCESS(0)
931 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 832 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
932 "subs %2, %2, #16 \n" // 16 pixels per loop. 833 "subs %2, %2, #16 \n" // 16 pixels per loop.
933 "rev64 v0.16b, v0.16b \n" 834 "rev64 v0.16b, v0.16b \n"
934 MEMACCESS(1) 835 MEMACCESS(1)
935 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 836 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
936 MEMACCESS(1) 837 MEMACCESS(1)
937 "st1 {v0.D}[0], [%1], #8 \n" 838 "st1 {v0.D}[0], [%1], #8 \n"
938 "bgt 1b \n" 839 "b.gt 1b \n"
939 : "+r"(src), // %0 840 : "+r"(src), // %0
940 "+r"(dst), // %1 841 "+r"(dst), // %1
941 "+r"(width) // %2 842 "+r"(width) // %2
942 : "r"((ptrdiff_t)-16) // %3 843 : "r"((ptrdiff_t)-16) // %3
943 : "cc", "memory", "v0" 844 : "cc", "memory", "v0"
944 ); 845 );
945 } 846 }
946 #endif // HAS_MIRRORROW_NEON 847 #endif // HAS_MIRRORROW_NEON
947 848
948 #ifdef HAS_MIRRORUVROW_NEON 849 #ifdef HAS_MIRRORUVROW_NEON
949 void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, 850 void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
950 int width) { 851 int width) {
951 asm volatile ( 852 asm volatile (
952 // Start at end of source row. 853 // Start at end of source row.
953 "add %0, %0, %3, lsl #1 \n" 854 "add %0, %0, %3, lsl #1 \n"
954 "sub %0, %0, #16 \n" 855 "sub %0, %0, #16 \n"
955 856
956 ".p2align 2 \n"
957 "1: \n" 857 "1: \n"
958 MEMACCESS(0) 858 MEMACCESS(0)
959 "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 859 "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
960 "subs %3, %3, #8 \n" // 8 pixels per loop. 860 "subs %3, %3, #8 \n" // 8 pixels per loop.
961 "rev64 v0.8b, v0.8b \n" 861 "rev64 v0.8b, v0.8b \n"
962 "rev64 v1.8b, v1.8b \n" 862 "rev64 v1.8b, v1.8b \n"
963 MEMACCESS(1) 863 MEMACCESS(1)
964 "st1 {v0.8b}, [%1], #8 \n" // dst += 8 864 "st1 {v0.8b}, [%1], #8 \n" // dst += 8
965 MEMACCESS(2) 865 MEMACCESS(2)
966 "st1 {v1.8b}, [%2], #8 \n" 866 "st1 {v1.8b}, [%2], #8 \n"
967 "bgt 1b \n" 867 "b.gt 1b \n"
968 : "+r"(src_uv), // %0 868 : "+r"(src_uv), // %0
969 "+r"(dst_u), // %1 869 "+r"(dst_u), // %1
970 "+r"(dst_v), // %2 870 "+r"(dst_v), // %2
971 "+r"(width) // %3 871 "+r"(width) // %3
972 : "r"((ptrdiff_t)-16) // %4 872 : "r"((ptrdiff_t)-16) // %4
973 : "cc", "memory", "v0", "v1" 873 : "cc", "memory", "v0", "v1"
974 ); 874 );
975 } 875 }
976 #endif // HAS_MIRRORUVROW_NEON 876 #endif // HAS_MIRRORUVROW_NEON
977 877
978 #ifdef HAS_ARGBMIRRORROW_NEON 878 #ifdef HAS_ARGBMIRRORROW_NEON
979 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { 879 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
980 asm volatile ( 880 asm volatile (
981 // Start at end of source row. 881 // Start at end of source row.
982 "add %0, %0, %2, lsl #2 \n" 882 "add %0, %0, %2, lsl #2 \n"
983 "sub %0, %0, #16 \n" 883 "sub %0, %0, #16 \n"
984 884
985 ".p2align 2 \n"
986 "1: \n" 885 "1: \n"
987 MEMACCESS(0) 886 MEMACCESS(0)
988 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 887 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
989 "subs %2, %2, #4 \n" // 4 pixels per loop. 888 "subs %2, %2, #4 \n" // 4 pixels per loop.
990 "rev64 v0.4s, v0.4s \n" 889 "rev64 v0.4s, v0.4s \n"
991 MEMACCESS(1) 890 MEMACCESS(1)
992 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 891 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
993 MEMACCESS(1) 892 MEMACCESS(1)
994 "st1 {v0.D}[0], [%1], #8 \n" 893 "st1 {v0.D}[0], [%1], #8 \n"
995 "bgt 1b \n" 894 "b.gt 1b \n"
996 : "+r"(src), // %0 895 : "+r"(src), // %0
997 "+r"(dst), // %1 896 "+r"(dst), // %1
998 "+r"(width) // %2 897 "+r"(width) // %2
999 : "r"((ptrdiff_t)-16) // %3 898 : "r"((ptrdiff_t)-16) // %3
1000 : "cc", "memory", "v0" 899 : "cc", "memory", "v0"
1001 ); 900 );
1002 } 901 }
1003 #endif // HAS_ARGBMIRRORROW_NEON 902 #endif // HAS_ARGBMIRRORROW_NEON
1004 903
1005 #ifdef HAS_RGB24TOARGBROW_NEON 904 #ifdef HAS_RGB24TOARGBROW_NEON
1006 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { 905 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
1007 asm volatile ( 906 asm volatile (
1008 "movi v4.8b, #255 \n" // Alpha 907 "movi v4.8b, #255 \n" // Alpha
1009 ".p2align 2 \n"
1010 "1: \n" 908 "1: \n"
1011 MEMACCESS(0) 909 MEMACCESS(0)
1012 "ld3 {v1.8b-v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. 910 "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
1013 "subs %2, %2, #8 \n" // 8 processed per loop. 911 "subs %2, %2, #8 \n" // 8 processed per loop.
1014 MEMACCESS(1) 912 MEMACCESS(1)
1015 "st4 {v1.8b-v4.8b}, [%1], #32 \n" // store 8 pixels of ARGB. 913 "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels
1016 "bgt 1b \n" 914 "b.gt 1b \n"
1017 : "+r"(src_rgb24), // %0 915 : "+r"(src_rgb24), // %0
1018 "+r"(dst_argb), // %1 916 "+r"(dst_argb), // %1
1019 "+r"(pix) // %2 917 "+r"(pix) // %2
1020 : 918 :
1021 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List 919 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
1022 ); 920 );
1023 } 921 }
1024 #endif // HAS_RGB24TOARGBROW_NEON 922 #endif // HAS_RGB24TOARGBROW_NEON
1025 923
1026 #ifdef HAS_RAWTOARGBROW_NEON 924 #ifdef HAS_RAWTOARGBROW_NEON
1027 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { 925 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
1028 asm volatile ( 926 asm volatile (
1029 "movi v5.8b, #255 \n" // Alpha 927 "movi v5.8b, #255 \n" // Alpha
1030 ".p2align 2 \n"
1031 "1: \n" 928 "1: \n"
1032 MEMACCESS(0) 929 MEMACCESS(0)
1033 "ld3 {v0.8b-v2.8b}, [%0], #24 \n" // read r g b 930 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
1034 "subs %2, %2, #8 \n" // 8 processed per loop. 931 "subs %2, %2, #8 \n" // 8 processed per loop.
1035 "mov v3.8b, v1.8b \n" // move g 932 "orr v3.8b, v1.8b, v1.8b \n" // move g
1036 "mov v4.8b, v0.8b \n" // move r 933 "orr v4.8b, v0.8b, v0.8b \n" // move r
1037 MEMACCESS(1) 934 MEMACCESS(1)
1038 "st4 {v2.8b-v5.8b}, [%1], #32 \n" // store b g r a 935 "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
1039 "bgt 1b \n" 936 "b.gt 1b \n"
1040 : "+r"(src_raw), // %0 937 : "+r"(src_raw), // %0
1041 "+r"(dst_argb), // %1 938 "+r"(dst_argb), // %1
1042 "+r"(pix) // %2 939 "+r"(pix) // %2
1043 : 940 :
1044 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List 941 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
1045 ); 942 );
1046 } 943 }
1047 #endif // HAS_RAWTOARGBROW_NEON 944 #endif // HAS_RAWTOARGBROW_NEON
1048 945
1049 #define RGB565TOARGB \ 946 #define RGB565TOARGB \
1050 "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \ 947 "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \
1051 "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \ 948 "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \
1052 "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \ 949 "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \
1053 "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \ 950 "orr v1.8b, v4.8b, v6.8b \n" /* G */ \
1054 "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ 951 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
1055 "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ 952 "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \
1056 "vorr.u8 d0, d0, d4 \n" /* B */ \ 953 "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \
1057 "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \ 954 "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \
1058 "vorr.u8 d2, d1, d5 \n" /* R */ \ 955 "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \
1059 "vorr.u8 d1, d4, d6 \n" /* G */ 956 "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \
957 "dup v2.2D, v0.D[1] \n" /* R */
1060 958
1061 #ifdef HAS_RGB565TOARGBROW_NEON 959 #ifdef HAS_RGB565TOARGBROW_NEON
1062 void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { 960 void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
1063 asm volatile ( 961 asm volatile (
1064 "vmov.u8 d3, #255 \n" // Alpha 962 "movi v3.8b, #255 \n" // Alpha
1065 ".p2align 2 \n"
1066 "1: \n" 963 "1: \n"
1067 MEMACCESS(0) 964 MEMACCESS(0)
1068 "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. 965 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
1069 "subs %2, %2, #8 \n" // 8 processed per loop. 966 "subs %2, %2, #8 \n" // 8 processed per loop.
1070 RGB565TOARGB 967 RGB565TOARGB
1071 MEMACCESS(1) 968 MEMACCESS(1)
1072 "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. 969 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
1073 "bgt 1b \n" 970 "b.gt 1b \n"
1074 : "+r"(src_rgb565), // %0 971 : "+r"(src_rgb565), // %0
1075 "+r"(dst_argb), // %1 972 "+r"(dst_argb), // %1
1076 "+r"(pix) // %2 973 "+r"(pix) // %2
1077 : 974 :
1078 : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List 975 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List
1079 ); 976 );
1080 } 977 }
1081 #endif // HAS_RGB565TOARGBROW_NEON 978 #endif // HAS_RGB565TOARGBROW_NEON
1082 979
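The RGB565TOARGB macro expands each 5/6-bit field to 8 bits by shifting the field to the top of the byte and OR-ing its own high bits back in, so a full-scale field maps to 0xFF. A hedged scalar sketch of what one pixel goes through (naming is mine):

#include <stdint.h>

void RGB565ToARGBRow_C_sketch(const uint8_t* src_rgb565, uint8_t* dst_argb,
                              int width) {
  for (int x = 0; x < width; ++x) {
    uint16_t p = (uint16_t)(src_rgb565[0] | (src_rgb565[1] << 8));
    uint8_t b5 = p & 0x1f;
    uint8_t g6 = (p >> 5) & 0x3f;
    uint8_t r5 = (p >> 11) & 0x1f;
    dst_argb[0] = (uint8_t)((b5 << 3) | (b5 >> 2));  // replicate top bits
    dst_argb[1] = (uint8_t)((g6 << 2) | (g6 >> 4));
    dst_argb[2] = (uint8_t)((r5 << 3) | (r5 >> 2));
    dst_argb[3] = 255;
    src_rgb565 += 2;
    dst_argb += 4;
  }
}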
1083 #define ARGB1555TOARGB \ 980 #define ARGB1555TOARGB \
1084 "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \ 981 "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
1085 "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \ 982 "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
1086 "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \ 983 "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \
1087 "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \ 984 \
1088 "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \ 985 "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \
1089 "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \ 986 "xtn2 v3.16b, v2.8h \n" \
1090 "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \ 987 \
1091 "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \ 988 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
1092 "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \ 989 "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
1093 "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \ 990 \
1094 "vorr.u8 q1, q1, q3 \n" /* R,A */ \ 991 "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \
1095 "vorr.u8 q0, q0, q2 \n" /* B,G */ \ 992 "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
993 "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
994 \
995 "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
996 "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \
997 "dup v1.2D, v0.D[1] \n" \
998 "dup v3.2D, v2.D[1] \n"
1096 999
1097 // RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. 1000 // RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
1098 #define RGB555TOARGB \ 1001 #define RGB555TOARGB \
1099 "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \ 1002 "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
1100 "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \ 1003 "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
1101 "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \ 1004 "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \
1102 "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \ 1005 \
1103 "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ 1006 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
1104 "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ 1007 "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
1105 "vorr.u8 d0, d0, d4 \n" /* B */ \ 1008 \
1106 "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \ 1009 "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \
1107 "vorr.u8 d2, d1, d5 \n" /* R */ \ 1010 "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
1108 "vorr.u8 d1, d4, d6 \n" /* G */ 1011 "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
1012 \
1013 "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
1014 "orr v2.16b, v1.16b, v3.16b \n" /* R */ \
1015 "dup v1.2D, v0.D[1] \n" /* G */ \
1109 1016
1110 #ifdef HAS_ARGB1555TOARGBROW_NEON 1017 #ifdef HAS_ARGB1555TOARGBROW_NEON
1111 void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, 1018 void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
1112 int pix) { 1019 int pix) {
1113 asm volatile ( 1020 asm volatile (
1114 "vmov.u8 d3, #255 \n" // Alpha 1021 "movi v3.8b, #255 \n" // Alpha
1115 ".p2align 2 \n"
1116 "1: \n" 1022 "1: \n"
1117 MEMACCESS(0) 1023 MEMACCESS(0)
1118 "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. 1024 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
1119 "subs %2, %2, #8 \n" // 8 processed per loop. 1025 "subs %2, %2, #8 \n" // 8 processed per loop.
1120 ARGB1555TOARGB 1026 ARGB1555TOARGB
1121 MEMACCESS(1) 1027 MEMACCESS(1)
1122 "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. 1028 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
1123 "bgt 1b \n" 1029 "b.gt 1b \n"
1124 : "+r"(src_argb1555), // %0 1030 : "+r"(src_argb1555), // %0
1125 "+r"(dst_argb), // %1 1031 "+r"(dst_argb), // %1
1126 "+r"(pix) // %2 1032 "+r"(pix) // %2
1127 : 1033 :
1128 : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List 1034 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
1129 ); 1035 );
1130 } 1036 }
1131 #endif // HAS_ARGB1555TOARGBROW_NEON 1037 #endif // HAS_ARGB1555TOARGBROW_NEON
1132 1038
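ARGB1555TOARGB does the same 5-bit expansion for B, G and R, and turns the single alpha bit into 0x00 or 0xFF via the arithmetic shift (sshr #15). Scalar sketch, with my own naming:

#include <stdint.h>

void ARGB1555ToARGBRow_C_sketch(const uint8_t* src_argb1555,
                                uint8_t* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    uint16_t p = (uint16_t)(src_argb1555[0] | (src_argb1555[1] << 8));
    uint8_t b5 = p & 0x1f;
    uint8_t g5 = (p >> 5) & 0x1f;
    uint8_t r5 = (p >> 10) & 0x1f;
    dst_argb[0] = (uint8_t)((b5 << 3) | (b5 >> 2));
    dst_argb[1] = (uint8_t)((g5 << 3) | (g5 >> 2));
    dst_argb[2] = (uint8_t)((r5 << 3) | (r5 >> 2));
    dst_argb[3] = (p & 0x8000) ? 255 : 0;  // 1-bit alpha replicated to 8 bits
    src_argb1555 += 2;
    dst_argb += 4;
  }
}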
1133 #define ARGB4444TOARGB \ 1039 #define ARGB4444TOARGB \
1134 "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \ 1040 "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \
1135 "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \ 1041 "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \
1136 "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \ 1042 "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \
1137 "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \ 1043 "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \
1138 "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \ 1044 "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \
1139 "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \ 1045 "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \
1140 "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \ 1046 "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \
1141 "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */ 1047 "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \
1048 "dup v0.2D, v2.D[1] \n" \
1049 "dup v1.2D, v3.D[1] \n"
1142 1050
1143 #ifdef HAS_ARGB4444TOARGBROW_NEON 1051 #ifdef HAS_ARGB4444TOARGBROW_NEON
1144 void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, 1052 void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
1145 int pix) { 1053 int pix) {
1146 asm volatile ( 1054 asm volatile (
1147 "vmov.u8 d3, #255 \n" // Alpha
1148 ".p2align 2 \n"
1149 "1: \n" 1055 "1: \n"
1150 MEMACCESS(0) 1056 MEMACCESS(0)
1151 "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. 1057 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
1152 "subs %2, %2, #8 \n" // 8 processed per loop. 1058 "subs %2, %2, #8 \n" // 8 processed per loop.
1153 ARGB4444TOARGB 1059 ARGB4444TOARGB
1154 MEMACCESS(1) 1060 MEMACCESS(1)
1155 "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. 1061 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
1156 "bgt 1b \n" 1062 "b.gt 1b \n"
1157 : "+r"(src_argb4444), // %0 1063 : "+r"(src_argb4444), // %0
1158 "+r"(dst_argb), // %1 1064 "+r"(dst_argb), // %1
1159 "+r"(pix) // %2 1065 "+r"(pix) // %2
1160 : 1066 :
1161 : "cc", "memory", "q0", "q1", "q2" // Clobber List 1067 : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
1162 ); 1068 );
1163 } 1069 }
1164 #endif // HAS_ARGB4444TOARGBROW_NEON 1070 #endif // HAS_ARGB4444TOARGBROW_NEON
1165 1071
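For ARGB4444 the expansion is a nibble duplicate, (n << 4) | n, which the macro builds out of shl/ushr/orr. Sketch (names mine):

#include <stdint.h>

void ARGB4444ToARGBRow_C_sketch(const uint8_t* src_argb4444,
                                uint8_t* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    uint8_t b4 = src_argb4444[0] & 0x0f;   // byte 0 holds G|B
    uint8_t g4 = src_argb4444[0] >> 4;
    uint8_t r4 = src_argb4444[1] & 0x0f;   // byte 1 holds A|R
    uint8_t a4 = src_argb4444[1] >> 4;
    dst_argb[0] = (uint8_t)((b4 << 4) | b4);
    dst_argb[1] = (uint8_t)((g4 << 4) | g4);
    dst_argb[2] = (uint8_t)((r4 << 4) | r4);
    dst_argb[3] = (uint8_t)((a4 << 4) | a4);
    src_argb4444 += 2;
    dst_argb += 4;
  }
}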
1166 #ifdef HAS_ARGBTORGB24ROW_NEON 1072 #ifdef HAS_ARGBTORGB24ROW_NEON
1167 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { 1073 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
1168 asm volatile ( 1074 asm volatile (
1169 ".p2align 2 \n"
1170 "1: \n" 1075 "1: \n"
1171 MEMACCESS(0) 1076 MEMACCESS(0)
1172 "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load 8 pixels of ARGB. 1077 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels
1173 "subs %2, %2, #8 \n" // 8 processed per loop. 1078 "subs %2, %2, #8 \n" // 8 processed per loop.
1174 MEMACCESS(1) 1079 MEMACCESS(1)
1175 "st3 {v1.8b-v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. 1080 "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24.
1176 "bgt 1b \n" 1081 "b.gt 1b \n"
1177 : "+r"(src_argb), // %0 1082 : "+r"(src_argb), // %0
1178 "+r"(dst_rgb24), // %1 1083 "+r"(dst_rgb24), // %1
1179 "+r"(pix) // %2 1084 "+r"(pix) // %2
1180 : 1085 :
1181 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List 1086 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
1182 ); 1087 );
1183 } 1088 }
1184 #endif // HAS_ARGBTORGB24ROW_NEON 1089 #endif // HAS_ARGBTORGB24ROW_NEON
1185 1090
1186 #ifdef HAS_ARGBTORAWROW_NEON 1091 #ifdef HAS_ARGBTORAWROW_NEON
1187 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { 1092 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
1188 asm volatile ( 1093 asm volatile (
1189 ".p2align 2 \n"
1190 "1: \n" 1094 "1: \n"
1191 MEMACCESS(0) 1095 MEMACCESS(0)
1192 "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load b g r a 1096 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
1193 "subs %2, %2, #8 \n" // 8 processed per loop. 1097 "subs %2, %2, #8 \n" // 8 processed per loop.
1194 "mov v4.8b, v2.8b \n" // mov g 1098 "orr v4.8b, v2.8b, v2.8b \n" // mov g
1195 "mov v5.8b, v1.8b \n" // mov b 1099 "orr v5.8b, v1.8b, v1.8b \n" // mov b
1196 MEMACCESS(1) 1100 MEMACCESS(1)
1197 "st3 {v3.8b-v5.8b}, [%1], #24 \n" // store r g b 1101 "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
1198 "bgt 1b \n" 1102 "b.gt 1b \n"
1199 : "+r"(src_argb), // %0 1103 : "+r"(src_argb), // %0
1200 "+r"(dst_raw), // %1 1104 "+r"(dst_raw), // %1
1201 "+r"(pix) // %2 1105 "+r"(pix) // %2
1202 : 1106 :
1203 : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List 1107 : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List
1204 ); 1108 );
1205 } 1109 }
1206 #endif // HAS_ARGBTORAWROW_NEON 1110 #endif // HAS_ARGBTORAWROW_NEON
1207 1111
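Both ARGBToRGB24Row_NEON and ARGBToRAWRow_NEON just drop alpha; RGB24 keeps the in-memory B,G,R order while RAW reverses it to R,G,B (hence the two register copies before the st3). Scalar sketch of the RAW case (names mine):

#include <stdint.h>

void ARGBToRAWRow_C_sketch(const uint8_t* src_argb, uint8_t* dst_raw,
                           int width) {
  for (int x = 0; x < width; ++x) {
    dst_raw[0] = src_argb[2];  // R
    dst_raw[1] = src_argb[1];  // G
    dst_raw[2] = src_argb[0];  // B
    src_argb += 4;             // alpha in src_argb[3] is discarded
    dst_raw += 3;
  }
}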
1208 #ifdef HAS_YUY2TOYROW_NEON 1112 #ifdef HAS_YUY2TOYROW_NEON
1209 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { 1113 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
1210 asm volatile ( 1114 asm volatile (
1211 ".p2align 2 \n"
1212 "1: \n" 1115 "1: \n"
1213 MEMACCESS(0) 1116 MEMACCESS(0)
1214 "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. 1117 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
1215 "subs %2, %2, #16 \n" // 16 processed per loop. 1118 "subs %2, %2, #16 \n" // 16 processed per loop.
1216 MEMACCESS(1) 1119 MEMACCESS(1)
1217 "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. 1120 "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
1218 "bgt 1b \n" 1121 "b.gt 1b \n"
1219 : "+r"(src_yuy2), // %0 1122 : "+r"(src_yuy2), // %0
1220 "+r"(dst_y), // %1 1123 "+r"(dst_y), // %1
1221 "+r"(pix) // %2 1124 "+r"(pix) // %2
1222 : 1125 :
1223 : "cc", "memory", "v0", "v1" // Clobber List 1126 : "cc", "memory", "v0", "v1" // Clobber List
1224 ); 1127 );
1225 } 1128 }
1226 #endif // HAS_YUY2TOYROW_NEON 1129 #endif // HAS_YUY2TOYROW_NEON
1227 1130
1228 #ifdef HAS_UYVYTOYROW_NEON 1131 #ifdef HAS_UYVYTOYROW_NEON
1229 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { 1132 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
1230 asm volatile ( 1133 asm volatile (
1231 ".p2align 2 \n"
1232 "1: \n" 1134 "1: \n"
1233 MEMACCESS(0) 1135 MEMACCESS(0)
1234 "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. 1136 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
1235 "subs %2, %2, #16 \n" // 16 processed per loop. 1137 "subs %2, %2, #16 \n" // 16 processed per loop.
1236 MEMACCESS(1) 1138 MEMACCESS(1)
1237 "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. 1139 "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
1238 "bgt 1b \n" 1140 "b.gt 1b \n"
1239 : "+r"(src_uyvy), // %0 1141 : "+r"(src_uyvy), // %0
1240 "+r"(dst_y), // %1 1142 "+r"(dst_y), // %1
1241 "+r"(pix) // %2 1143 "+r"(pix) // %2
1242 : 1144 :
1243 : "cc", "memory", "v0", "v1" // Clobber List 1145 : "cc", "memory", "v0", "v1" // Clobber List
1244 ); 1146 );
1245 } 1147 }
1246 #endif // HAS_UYVYTOYROW_NEON 1148 #endif // HAS_UYVYTOYROW_NEON
1247 1149
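YUY2 packs pixels as Y0,U,Y1,V and UYVY as U,Y0,V,Y1, so extracting luma is just taking the even or odd bytes, which the ld2 deinterleave gives the code for free. Sketch (name mine):

#include <stdint.h>

void YUY2ToYRow_C_sketch(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  for (int x = 0; x < width; ++x) {
    dst_y[x] = src_yuy2[2 * x];  // for UYVY, read src[2 * x + 1] instead
  }
}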
1248 #ifdef HAS_YUY2TOUV422ROW_NEON 1150 #ifdef HAS_YUY2TOUV422ROW_NEON
1249 void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, 1151 void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
1250 int pix) { 1152 int pix) {
1251 asm volatile ( 1153 asm volatile (
1252 ".p2align 2 \n"
1253 "1: \n" 1154 "1: \n"
1254 MEMACCESS(0) 1155 MEMACCESS(0)
1255 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2. 1156 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels
1256 "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. 1157 "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
1257 MEMACCESS(1) 1158 MEMACCESS(1)
1258 "st1 {v1.8b}, [%1], #8 \n" // store 8 U. 1159 "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
1259 MEMACCESS(2) 1160 MEMACCESS(2)
1260 "st1 {v3.8b}, [%2], #8 \n" // store 8 V. 1161 "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
1261 "bgt 1b \n" 1162 "b.gt 1b \n"
1262 : "+r"(src_yuy2), // %0 1163 : "+r"(src_yuy2), // %0
1263 "+r"(dst_u), // %1 1164 "+r"(dst_u), // %1
1264 "+r"(dst_v), // %2 1165 "+r"(dst_v), // %2
1265 "+r"(pix) // %3 1166 "+r"(pix) // %3
1266 : 1167 :
1267 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 1168 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
1268 ); 1169 );
1269 } 1170 }
1270 #endif // HAS_YUY2TOUV422ROW_NEON 1171 #endif // HAS_YUY2TOUV422ROW_NEON
1271 1172
1272 #ifdef HAS_UYVYTOUV422ROW_NEON 1173 #ifdef HAS_UYVYTOUV422ROW_NEON
1273 void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, 1174 void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
1274 int pix) { 1175 int pix) {
1275 asm volatile ( 1176 asm volatile (
1276 ".p2align 2 \n"
1277 "1: \n" 1177 "1: \n"
1278 MEMACCESS(0) 1178 MEMACCESS(0)
1279 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY. 1179 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels
1280 "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. 1180 "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
1281 MEMACCESS(1) 1181 MEMACCESS(1)
1282 "st1 {v0.8b}, [%1], #8 \n" // store 8 U. 1182 "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
1283 MEMACCESS(2) 1183 MEMACCESS(2)
1284 "st1 {v2.8b}, [%2], #8 \n" // store 8 V. 1184 "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
1285 "bgt 1b \n" 1185 "b.gt 1b \n"
1286 : "+r"(src_uyvy), // %0 1186 : "+r"(src_uyvy), // %0
1287 "+r"(dst_u), // %1 1187 "+r"(dst_u), // %1
1288 "+r"(dst_v), // %2 1188 "+r"(dst_v), // %2
1289 "+r"(pix) // %3 1189 "+r"(pix) // %3
1290 : 1190 :
1291 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 1191 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
1292 ); 1192 );
1293 } 1193 }
1294 #endif // HAS_UYVYTOUV422ROW_NEON 1194 #endif // HAS_UYVYTOUV422ROW_NEON
1295 1195
1296 #ifdef HAS_YUY2TOUVROW_NEON 1196 #ifdef HAS_YUY2TOUVROW_NEON
1297 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, 1197 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
1298 uint8* dst_u, uint8* dst_v, int pix) { 1198 uint8* dst_u, uint8* dst_v, int pix) {
1199 const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
1299 asm volatile ( 1200 asm volatile (
1300 "add %x1, %x0, %w1, sxtw \n" // stride + src_yuy2
1301 ".p2align 2 \n"
1302 "1: \n" 1201 "1: \n"
1303 MEMACCESS(0) 1202 MEMACCESS(0)
1304 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2. 1203 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
1305 "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. 1204 "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
1306 MEMACCESS(1) 1205 MEMACCESS(1)
1307 "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row YUY2. 1206 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
1308 "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U 1207 "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
1309 "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V 1208 "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
1310 MEMACCESS(2) 1209 MEMACCESS(2)
1311 "st1 {v1.8b}, [%2], #8 \n" // store 8 U. 1210 "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
1312 MEMACCESS(3) 1211 MEMACCESS(3)
1313 "st1 {v3.8b}, [%3], #8 \n" // store 8 V. 1212 "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
1314 "bgt 1b \n" 1213 "b.gt 1b \n"
1315 : "+r"(src_yuy2), // %0 1214 : "+r"(src_yuy2), // %0
1316 "+r"(stride_yuy2), // %1 1215 "+r"(src_yuy2b), // %1
1317 "+r"(dst_u), // %2 1216 "+r"(dst_u), // %2
1318 "+r"(dst_v), // %3 1217 "+r"(dst_v), // %3
1319 "+r"(pix) // %4 1218 "+r"(pix) // %4
1320 : 1219 :
1321 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List 1220 : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
1221 "v5", "v6", "v7" // Clobber List
1322 ); 1222 );
1323 } 1223 }
1324 #endif // HAS_YUY2TOUVROW_NEON 1224 #endif // HAS_YUY2TOUVROW_NEON
1325 1225
1326 #ifdef HAS_UYVYTOUVROW_NEON 1226 #ifdef HAS_UYVYTOUVROW_NEON
1327 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, 1227 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
1328 uint8* dst_u, uint8* dst_v, int pix) { 1228 uint8* dst_u, uint8* dst_v, int pix) {
1229 const uint8* src_uyvyb = src_uyvy + stride_uyvy;
1329 asm volatile ( 1230 asm volatile (
1330 "add %x1, %x0, %w1, sxtw \n" // stride + src_uyvy
1331 ".p2align 2 \n"
1332 "1: \n" 1231 "1: \n"
1333 MEMACCESS(0) 1232 MEMACCESS(0)
1334 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY. 1233 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
1335 "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. 1234 "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
1336 MEMACCESS(1) 1235 MEMACCESS(1)
1337 "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row UYVY. 1236 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
1338 "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U 1237 "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
1339 "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V 1238 "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
1340 MEMACCESS(2) 1239 MEMACCESS(2)
1341 "st1 {v0.8b}, [%2], #8 \n" // store 8 U. 1240 "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
1342 MEMACCESS(3) 1241 MEMACCESS(3)
1343 "st1 {v2.8b}, [%3], #8 \n" // store 8 V. 1242 "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
1344 "bgt 1b \n" 1243 "b.gt 1b \n"
1345 : "+r"(src_uyvy), // %0 1244 : "+r"(src_uyvy), // %0
1346 "+r"(stride_uyvy), // %1 1245 "+r"(src_uyvyb), // %1
1347 "+r"(dst_u), // %2 1246 "+r"(dst_u), // %2
1348 "+r"(dst_v), // %3 1247 "+r"(dst_v), // %3
1349 "+r"(pix) // %4 1248 "+r"(pix) // %4
1350 : 1249 :
1351 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List 1250 : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
1251 "v5", "v6", "v7" // Clobber List
1352 ); 1252 );
1353 } 1253 }
1354 #endif // HAS_UYVYTOUVROW_NEON 1254 #endif // HAS_UYVYTOUVROW_NEON
1355 1255
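The full-height UV variants average chroma from two rows with urhadd, a rounded halving add, i.e. (a + b + 1) >> 1. A scalar sketch of YUY2ToUVRow (assuming width is even, as the NEON path does; names mine):

#include <stdint.h>

void YUY2ToUVRow_C_sketch(const uint8_t* src_yuy2, int stride_yuy2,
                          uint8_t* dst_u, uint8_t* dst_v, int width) {
  const uint8_t* next = src_yuy2 + stride_yuy2;
  for (int x = 0; x < width; x += 2) {  // one U/V pair per two pixels
    dst_u[x / 2] = (uint8_t)((src_yuy2[2 * x + 1] + next[2 * x + 1] + 1) >> 1);
    dst_v[x / 2] = (uint8_t)((src_yuy2[2 * x + 3] + next[2 * x + 3] + 1) >> 1);
  }
}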
1356 #ifdef HAS_HALFROW_NEON
1357 void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
1358 uint8* dst_uv, int pix) {
1359 asm volatile (
1360 // change the stride to row 2 pointer
1361 "add %x1, %x0, %w1, sxtw \n"
1362 "1: \n"
1363 MEMACCESS(0)
1364 "ld1 {v0.16b}, [%0], #16 \n" // load row 1 16 pixels.
1365 "subs %3, %3, #16 \n" // 16 processed per loop
1366 MEMACCESS(1)
1367 "ld1 {v1.16b}, [%1], #16 \n" // load row 2 16 pixels.
1368 "urhadd v0.16b, v0.16b, v1.16b \n" // average row 1 and 2
1369 MEMACCESS(2)
1370 "st1 {v0.16b}, [%2], #16 \n"
1371 "bgt 1b \n"
1372 : "+r"(src_uv), // %0
1373 "+r"(src_uv_stride), // %1
1374 "+r"(dst_uv), // %2
1375 "+r"(pix) // %3
1376 :
1377 : "cc", "memory", "v0", "v1" // Clobber List
1378 );
1379 }
1380 #endif // HAS_HALFROW_NEON
1381
1382 // Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG
1383 #ifdef HAS_ARGBTOBAYERROW_NEON
1384 void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
1385 uint32 selector, int pix) {
1386 asm volatile (
1387 "mov v2.s[0], %w3 \n" // selector
1388 "1: \n"
1389 MEMACCESS(0)
1390 "ld1 {v0.16b, v1.16b}, [%0], 32 \n" // load row 8 pixels.
1391 "subs %2, %2, #8 \n" // 8 processed per loop
1392 "tbl v4.8b, {v0.16b}, v2.8b \n" // look up 4 pixels
1393 "tbl v5.8b, {v1.16b}, v2.8b \n" // look up 4 pixels
1394 "trn1 v4.4s, v4.4s, v5.4s \n" // combine 8 pixels
1395 MEMACCESS(1)
1396 "st1 {v4.8b}, [%1], #8 \n" // store 8.
1397 "bgt 1b \n"
1398 : "+r"(src_argb), // %0
1399 "+r"(dst_bayer), // %1
1400 "+r"(pix) // %2
1401 : "r"(selector) // %3
1402 : "cc", "memory", "v0", "v1", "v2", "v4", "v5" // Clobber List
1403 );
1404 }
1405 #endif // HAS_ARGBTOBAYERROW_NEON
1406
1407 // Select G channels from ARGB. e.g. GGGGGGGG 1256 // Select G channels from ARGB. e.g. GGGGGGGG
1408 #ifdef HAS_ARGBTOBAYERGGROW_NEON 1257 #ifdef HAS_ARGBTOBAYERGGROW_NEON
1409 void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, 1258 void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
1410 uint32 /*selector*/, int pix) { 1259 uint32 /*selector*/, int pix) {
1411 asm volatile ( 1260 asm volatile (
1412 "1: \n" 1261 "1: \n"
1413 MEMACCESS(0) 1262 MEMACCESS(0)
1414 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load row 8 pixels. 1263 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load row 8 pixels
1415 "subs %2, %2, #8 \n" // 8 processed per loop 1264 "subs %2, %2, #8 \n" // 8 processed per loop
1416 MEMACCESS(1) 1265 MEMACCESS(1)
1417 "st1 {v1.8b}, [%1], #8 \n" // store 8 G's. 1266 "st1 {v1.8b}, [%1], #8 \n" // store 8 G's.
1418 "bgt 1b \n" 1267 "b.gt 1b \n"
1419 : "+r"(src_argb), // %0 1268 : "+r"(src_argb), // %0
1420 "+r"(dst_bayer), // %1 1269 "+r"(dst_bayer), // %1
1421 "+r"(pix) // %2 1270 "+r"(pix) // %2
1422 : 1271 :
1423 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 1272 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
1424 ); 1273 );
1425 } 1274 }
1426 #endif // HAS_ARGBTOBAYERGGROW_NEON 1275 #endif // HAS_ARGBTOBAYERGGROW_NEON
1427 1276
1428 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 1277 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
1429 #ifdef HAS_ARGBSHUFFLEROW_NEON 1278 #ifdef HAS_ARGBSHUFFLEROW_NEON
1430 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, 1279 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
1431 const uint8* shuffler, int pix) { 1280 const uint8* shuffler, int pix) {
1432 asm volatile ( 1281 asm volatile (
1433 MEMACCESS(3) 1282 MEMACCESS(3)
1434 "ld1 {v2.16b}, [%3] \n" // shuffler 1283 "ld1 {v2.16b}, [%3] \n" // shuffler
1435 "1: \n" 1284 "1: \n"
1436 MEMACCESS(0) 1285 MEMACCESS(0)
1437 "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. 1286 "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
1438 "subs %2, %2, #4 \n" // 4 processed per loop 1287 "subs %2, %2, #4 \n" // 4 processed per loop
1439 "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels 1288 "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
1440 MEMACCESS(1) 1289 MEMACCESS(1)
1441 "st1 {v1.16b}, [%1], #16 \n" // store 4. 1290 "st1 {v1.16b}, [%1], #16 \n" // store 4.
1442 "bgt 1b \n" 1291 "b.gt 1b \n"
1443 : "+r"(src_argb), // %0 1292 : "+r"(src_argb), // %0
1444 "+r"(dst_argb), // %1 1293 "+r"(dst_argb), // %1
1445 "+r"(pix) // %2 1294 "+r"(pix) // %2
1446 : "r"(shuffler) // %3 1295 : "r"(shuffler) // %3
1447 : "cc", "memory", "v0", "v1", "v2" // Clobber List 1296 : "cc", "memory", "v0", "v1", "v2" // Clobber List
1448 ); 1297 );
1449 } 1298 }
1450 #endif // HAS_ARGBSHUFFLEROW_NEON 1299 #endif // HAS_ARGBSHUFFLEROW_NEON
1451 1300
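The tbl instruction makes ARGBShuffleRow_NEON a pure byte gather: within each 16-byte group (4 pixels), output byte i is input byte shuffler[i] of that group. Scalar sketch under the same assumption the NEON path makes, that width is a multiple of 4:

#include <stdint.h>

void ARGBShuffleRow_C_sketch(const uint8_t* src_argb, uint8_t* dst_argb,
                             const uint8_t* shuffler, int width) {
  for (int x = 0; x < width; x += 4) {  // 4 ARGB pixels = 16 bytes per step
    for (int i = 0; i < 16; ++i) {
      dst_argb[i] = src_argb[shuffler[i]];
    }
    src_argb += 16;
    dst_argb += 16;
  }
}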
1452 #ifdef HAS_I422TOYUY2ROW_NEON 1301 #ifdef HAS_I422TOYUY2ROW_NEON
1453 void I422ToYUY2Row_NEON(const uint8* src_y, 1302 void I422ToYUY2Row_NEON(const uint8* src_y,
1454 const uint8* src_u, 1303 const uint8* src_u,
1455 const uint8* src_v, 1304 const uint8* src_v,
1456 uint8* dst_yuy2, int width) { 1305 uint8* dst_yuy2, int width) {
1457 asm volatile ( 1306 asm volatile (
1458 ".p2align 2 \n"
1459 "1: \n" 1307 "1: \n"
1460 MEMACCESS(0) 1308 MEMACCESS(0)
1461 "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys 1309 "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
1462 "mov v2.8b, v1.8b \n" 1310 "orr v2.8b, v1.8b, v1.8b \n"
1463 MEMACCESS(1) 1311 MEMACCESS(1)
1464 "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us 1312 "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
1465 MEMACCESS(2) 1313 MEMACCESS(2)
1466 "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs 1314 "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
1467 "subs %4, %4, #16 \n" // 16 pixels 1315 "subs %4, %4, #16 \n" // 16 pixels
1468 MEMACCESS(3) 1316 MEMACCESS(3)
1469 "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 YUY2/16 pixels. 1317 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
1470 "bgt 1b \n" 1318 "b.gt 1b \n"
1471 : "+r"(src_y), // %0 1319 : "+r"(src_y), // %0
1472 "+r"(src_u), // %1 1320 "+r"(src_u), // %1
1473 "+r"(src_v), // %2 1321 "+r"(src_v), // %2
1474 "+r"(dst_yuy2), // %3 1322 "+r"(dst_yuy2), // %3
1475 "+r"(width) // %4 1323 "+r"(width) // %4
1476 : 1324 :
1477 : "cc", "memory", "v0", "v1", "v2", "v3" 1325 : "cc", "memory", "v0", "v1", "v2", "v3"
1478 ); 1326 );
1479 } 1327 }
1480 #endif // HAS_I422TOYUY2ROW_NEON 1328 #endif // HAS_I422TOYUY2ROW_NEON
1481 1329
1482 #ifdef HAS_I422TOUYVYROW_NEON 1330 #ifdef HAS_I422TOUYVYROW_NEON
1483 void I422ToUYVYRow_NEON(const uint8* src_y, 1331 void I422ToUYVYRow_NEON(const uint8* src_y,
1484 const uint8* src_u, 1332 const uint8* src_u,
1485 const uint8* src_v, 1333 const uint8* src_v,
1486 uint8* dst_uyvy, int width) { 1334 uint8* dst_uyvy, int width) {
1487 asm volatile ( 1335 asm volatile (
1488 ".p2align 2 \n"
1489 "1: \n" 1336 "1: \n"
1490 MEMACCESS(0) 1337 MEMACCESS(0)
1491 "ld2 {v1.8b, v2.8b}, [%0], #16 \n" // load 16 Ys 1338 "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
1492 "mov v3.8b, v2.8b \n" 1339 "orr v3.8b, v2.8b, v2.8b \n"
1493 MEMACCESS(1) 1340 MEMACCESS(1)
1494 "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us 1341 "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
1495 MEMACCESS(2) 1342 MEMACCESS(2)
1496 "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs 1343 "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
1497 "subs %4, %4, #16 \n" // 16 pixels 1344 "subs %4, %4, #16 \n" // 16 pixels
1498 MEMACCESS(3) 1345 MEMACCESS(3)
1499 "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 UYVY/16 pixels. 1346 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
1500 "bgt 1b \n" 1347 "b.gt 1b \n"
1501 : "+r"(src_y), // %0 1348 : "+r"(src_y), // %0
1502 "+r"(src_u), // %1 1349 "+r"(src_u), // %1
1503 "+r"(src_v), // %2 1350 "+r"(src_v), // %2
1504 "+r"(dst_uyvy), // %3 1351 "+r"(dst_uyvy), // %3
1505 "+r"(width) // %4 1352 "+r"(width) // %4
1506 : 1353 :
1507 : "cc", "memory", "v0", "v1", "v2", "v3" 1354 : "cc", "memory", "v0", "v1", "v2", "v3"
1508 ); 1355 );
1509 } 1356 }
1510 #endif // HAS_I422TOUYVYROW_NEON 1357 #endif // HAS_I422TOUYVYROW_NEON
1511 1358
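I422ToYUY2Row_NEON and I422ToUYVYRow_NEON interleave one U and one V with every two Y samples; the orr copy duplicates the odd-Y register so st4 can write four lanes. Scalar sketch for the YUY2 ordering (UYVY just swaps the luma and chroma byte positions; names mine):

#include <stdint.h>

void I422ToYUY2Row_C_sketch(const uint8_t* src_y, const uint8_t* src_u,
                            const uint8_t* src_v, uint8_t* dst_yuy2,
                            int width) {
  for (int x = 0; x < width; x += 2) {
    dst_yuy2[0] = src_y[0];
    dst_yuy2[1] = src_u[0];
    dst_yuy2[2] = src_y[1];
    dst_yuy2[3] = src_v[0];
    src_y += 2;
    src_u += 1;
    src_v += 1;
    dst_yuy2 += 4;
  }
}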
1512 #ifdef HAS_ARGBTORGB565ROW_NEON 1359 #ifdef HAS_ARGBTORGB565ROW_NEON
1513 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { 1360 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
1514 asm volatile ( 1361 asm volatile (
1515 ".p2align 2 \n"
1516 "1: \n" 1362 "1: \n"
1517 MEMACCESS(0) 1363 MEMACCESS(0)
1518 "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. 1364 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
1519 "subs %2, %2, #8 \n" // 8 processed per loop. 1365 "subs %2, %2, #8 \n" // 8 processed per loop.
1520 ARGBTORGB565 1366 ARGBTORGB565
1521 MEMACCESS(1) 1367 MEMACCESS(1)
1522 "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565. 1368 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
1523 "bgt 1b \n" 1369 "b.gt 1b \n"
1524 : "+r"(src_argb), // %0 1370 : "+r"(src_argb), // %0
1525 "+r"(dst_rgb565), // %1 1371 "+r"(dst_rgb565), // %1
1526 "+r"(pix) // %2 1372 "+r"(pix) // %2
1527 : 1373 :
1528 : "cc", "memory", "q0", "q8", "q9", "q10", "q11" 1374 : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
1529 ); 1375 );
1530 } 1376 }
1531 #endif // HAS_ARGBTORGB565ROW_NEON 1377 #endif // HAS_ARGBTORGB565ROW_NEON
1532 1378
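The ARGBTORGB565 macro used above is defined earlier in the file and not part of this hunk; a truncating 8-to-5/6/5 pack is assumed in the hedged scalar sketch below (names mine):

#include <stdint.h>

void ARGBToRGB565Row_C_sketch(const uint8_t* src_argb, uint8_t* dst_rgb565,
                              int width) {
  for (int x = 0; x < width; ++x) {
    uint16_t b = src_argb[0] >> 3;
    uint16_t g = src_argb[1] >> 2;
    uint16_t r = src_argb[2] >> 3;
    uint16_t p = (uint16_t)(b | (g << 5) | (r << 11));
    dst_rgb565[0] = (uint8_t)(p & 0xff);  // little-endian 16-bit store
    dst_rgb565[1] = (uint8_t)(p >> 8);
    src_argb += 4;
    dst_rgb565 += 2;
  }
}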
1533 #ifdef HAS_ARGBTOARGB1555ROW_NEON 1379 #ifdef HAS_ARGBTOARGB1555ROW_NEON
1534 void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, 1380 void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
1535 int pix) { 1381 int pix) {
1536 asm volatile ( 1382 asm volatile (
1537 ".p2align 2 \n"
1538 "1: \n" 1383 "1: \n"
1539 MEMACCESS(0) 1384 MEMACCESS(0)
1540 "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. 1385 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
1541 "subs %2, %2, #8 \n" // 8 processed per loop. 1386 "subs %2, %2, #8 \n" // 8 processed per loop.
1542 ARGBTOARGB1555 1387 ARGBTOARGB1555
1543 MEMACCESS(1) 1388 MEMACCESS(1)
1544 "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555. 1389 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555.
1545 "bgt 1b \n" 1390 "b.gt 1b \n"
1546 : "+r"(src_argb), // %0 1391 : "+r"(src_argb), // %0
1547 "+r"(dst_argb1555), // %1 1392 "+r"(dst_argb1555), // %1
1548 "+r"(pix) // %2 1393 "+r"(pix) // %2
1549 : 1394 :
1550 : "cc", "memory", "q0", "q8", "q9", "q10", "q11" 1395 : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
1551 ); 1396 );
1552 } 1397 }
1553 #endif // HAS_ARGBTOARGB1555ROW_NEON 1398 #endif // HAS_ARGBTOARGB1555ROW_NEON
1554 1399
1555 #ifdef HAS_ARGBTOARGB4444ROW_NEON 1400 #ifdef HAS_ARGBTOARGB4444ROW_NEON
1556 void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, 1401 void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
1557 int pix) { 1402 int pix) {
1558 asm volatile ( 1403 asm volatile (
1559 "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. 1404 "movi v4.16b, #0x0f \n" // bits to clear with vbic.
1560 ".p2align 2 \n"
1561 "1: \n" 1405 "1: \n"
1562 MEMACCESS(0) 1406 MEMACCESS(0)
1563 "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. 1407 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
1564 "subs %2, %2, #8 \n" // 8 processed per loop. 1408 "subs %2, %2, #8 \n" // 8 processed per loop.
1565 ARGBTOARGB4444 1409 ARGBTOARGB4444
1566 MEMACCESS(1) 1410 MEMACCESS(1)
1567 "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444. 1411 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444.
1568 "bgt 1b \n" 1412 "b.gt 1b \n"
1569 : "+r"(src_argb), // %0 1413 : "+r"(src_argb), // %0
1570 "+r"(dst_argb4444), // %1 1414 "+r"(dst_argb4444), // %1
1571 "+r"(pix) // %2 1415 "+r"(pix) // %2
1572 : 1416 :
1573 : "cc", "memory", "q0", "q8", "q9", "q10", "q11" 1417 : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"
1574 ); 1418 );
1575 } 1419 }
1576 #endif // HAS_ARGBTOARGB4444ROW_NEON 1420 #endif // HAS_ARGBTOARGB4444ROW_NEON
1577 1421
1578 #ifdef HAS_ARGBTOYROW_NEON 1422 #ifdef HAS_ARGBTOYROW_NEON
1579 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { 1423 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
1580 asm volatile ( 1424 asm volatile (
1581 "movi v4.8b, #13 \n" // B * 0.1016 coefficient 1425 "movi v4.8b, #13 \n" // B * 0.1016 coefficient
1582 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 1426 "movi v5.8b, #65 \n" // G * 0.5078 coefficient
1583 "movi v6.8b, #33 \n" // R * 0.2578 coefficient 1427 "movi v6.8b, #33 \n" // R * 0.2578 coefficient
1584 "movi v7.8b, #16 \n" // Add 16 constant 1428 "movi v7.8b, #16 \n" // Add 16 constant
1585 ".p2align 2 \n"
1586 "1: \n" 1429 "1: \n"
1587 MEMACCESS(0) 1430 MEMACCESS(0)
1588 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 1431 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
1589 "subs %2, %2, #8 \n" // 8 processed per loop. 1432 "subs %2, %2, #8 \n" // 8 processed per loop.
1590 "umull v3.8h, v0.8b, v4.8b \n" // B 1433 "umull v3.8h, v0.8b, v4.8b \n" // B
1591 "umlal v3.8h, v1.8b, v5.8b \n" // G 1434 "umlal v3.8h, v1.8b, v5.8b \n" // G
1592 "umlal v3.8h, v2.8b, v6.8b \n" // R 1435 "umlal v3.8h, v2.8b, v6.8b \n" // R
1593 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y 1436 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
1594 "uqadd v0.8b, v0.8b, v7.8b \n" 1437 "uqadd v0.8b, v0.8b, v7.8b \n"
1595 MEMACCESS(1) 1438 MEMACCESS(1)
1596 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 1439 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
1597 "bgt 1b \n" 1440 "b.gt 1b \n"
1598 : "+r"(src_argb), // %0 1441 : "+r"(src_argb), // %0
1599 "+r"(dst_y), // %1 1442 "+r"(dst_y), // %1
1600 "+r"(pix) // %2 1443 "+r"(pix) // %2
1601 : 1444 :
1602 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" 1445 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
1603 ); 1446 );
1604 } 1447 }
1605 #endif // HAS_ARGBTOYROW_NEON 1448 #endif // HAS_ARGBTOYROW_NEON
1606 1449
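The 13/65/33 constants are BT.601 luma weights scaled by 128 (hence the 0.1016/0.5078/0.2578 comments), sqrshrun #7 is the rounding shift back down, and uqadd adds the +16 offset with saturation. Per-pixel sketch (name mine):

#include <stdint.h>

uint8_t ARGBToY_C_sketch(uint8_t b, uint8_t g, uint8_t r) {
  int y = (13 * b + 65 * g + 33 * r + 64) >> 7;  // sqrshrun #7: rounding shift
  y += 16;                                       // uqadd with v7 (= 16)
  return (uint8_t)(y > 255 ? 255 : y);           // saturate like uqadd
}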
1607 #ifdef HAS_ARGBTOYJROW_NEON 1450 #ifdef HAS_ARGBTOYJROW_NEON
1608 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { 1451 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
1609 asm volatile ( 1452 asm volatile (
1610 "movi v4.8b, #15 \n" // B * 0.11400 coefficient 1453 "movi v4.8b, #15 \n" // B * 0.11400 coefficient
1611 "movi v5.8b, #75 \n" // G * 0.58700 coefficient 1454 "movi v5.8b, #75 \n" // G * 0.58700 coefficient
1612 "movi v6.8b, #38 \n" // R * 0.29900 coefficient 1455 "movi v6.8b, #38 \n" // R * 0.29900 coefficient
1613 ".p2align 2 \n"
1614 "1: \n" 1456 "1: \n"
1615 MEMACCESS(0) 1457 MEMACCESS(0)
1616 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 1458 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
1617 "subs %2, %2, #8 \n" // 8 processed per loop. 1459 "subs %2, %2, #8 \n" // 8 processed per loop.
1618 "umull v3.8h, v0.8b, v4.8b \n" // B 1460 "umull v3.8h, v0.8b, v4.8b \n" // B
1619 "umlal v3.8h, v1.8b, v5.8b \n" // G 1461 "umlal v3.8h, v1.8b, v5.8b \n" // G
1620 "umlal v3.8h, v2.8b, v6.8b \n" // R 1462 "umlal v3.8h, v2.8b, v6.8b \n" // R
1621 "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y 1463 "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y
1622 MEMACCESS(1) 1464 MEMACCESS(1)
1623 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 1465 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
1624 "bgt 1b \n" 1466 "b.gt 1b \n"
1625 : "+r"(src_argb), // %0 1467 : "+r"(src_argb), // %0
1626 "+r"(dst_y), // %1 1468 "+r"(dst_y), // %1
1627 "+r"(pix) // %2 1469 "+r"(pix) // %2
1628 : 1470 :
1629 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" 1471 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
1630 ); 1472 );
1631 } 1473 }
1632 #endif // HAS_ARGBTOYJROW_NEON 1474 #endif // HAS_ARGBTOYJROW_NEON
1633 1475
1634 // 8x1 pixels. 1476 // 8x1 pixels.
1635 #ifdef HAS_ARGBTOUV444ROW_NEON 1477 #ifdef HAS_ARGBTOUV444ROW_NEON
1636 void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, 1478 void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
1637 int pix) { 1479 int pix) {
1638 asm volatile ( 1480 asm volatile (
1639 "vmov.u8 d24, #112 \n" // UB / VR 0.875 coefficient 1481 "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient
1640 "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient 1482 "movi v25.8b, #74 \n" // UG -0.5781 coefficient
1641 "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient 1483 "movi v26.8b, #38 \n" // UR -0.2969 coefficient
1642 "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient 1484 "movi v27.8b, #18 \n" // VB -0.1406 coefficient
1643 "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient 1485 "movi v28.8b, #94 \n" // VG -0.7344 coefficient
1644 "vmov.u16 q15, #0x8080 \n" // 128.5 1486 "movi v29.16b,#0x80 \n" // 128.5
1645 ".p2align 2 \n"
1646 "1: \n" 1487 "1: \n"
1647 MEMACCESS(0) 1488 MEMACCESS(0)
1648 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. 1489 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
1649 "subs %3, %3, #8 \n" // 8 processed per loop. 1490 "subs %3, %3, #8 \n" // 8 processed per loop.
1650 "vmull.u8 q2, d0, d24 \n" // B 1491 "umull v4.8h, v0.8b, v24.8b \n" // B
1651 "vmlsl.u8 q2, d1, d25 \n" // G 1492 "umlsl v4.8h, v1.8b, v25.8b \n" // G
1652 "vmlsl.u8 q2, d2, d26 \n" // R 1493 "umlsl v4.8h, v2.8b, v26.8b \n" // R
1653 "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned 1494 "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned
1654 1495
1655 "vmull.u8 q3, d2, d24 \n" // R 1496 "umull v3.8h, v2.8b, v24.8b \n" // R
1656 "vmlsl.u8 q3, d1, d28 \n" // G 1497 "umlsl v3.8h, v1.8b, v28.8b \n" // G
1657 "vmlsl.u8 q3, d0, d27 \n" // B 1498 "umlsl v3.8h, v0.8b, v27.8b \n" // B
1658 "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned 1499 "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned
1659 1500
1660 "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U 1501 "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U
1661 "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V 1502 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
1662 1503
1663 MEMACCESS(1) 1504 MEMACCESS(1)
1664 "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. 1505 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
1665 MEMACCESS(2) 1506 MEMACCESS(2)
1666 "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. 1507 "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
1667 "bgt 1b \n" 1508 "b.gt 1b \n"
1668 : "+r"(src_argb), // %0 1509 : "+r"(src_argb), // %0
1669 "+r"(dst_u), // %1 1510 "+r"(dst_u), // %1
1670 "+r"(dst_v), // %2 1511 "+r"(dst_v), // %2
1671 "+r"(pix) // %3 1512 "+r"(pix) // %3
1672 : 1513 :
1673 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15" 1514 : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
1515 "v24", "v25", "v26", "v27", "v28", "v29"
1674 ); 1516 );
1675 } 1517 }
1676 #endif // HAS_ARGBTOUV444ROW_NEON 1518 #endif // HAS_ARGBTOUV444ROW_NEON
1677 1519
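The chroma math here is U = (112*B - 74*G - 38*R + 0x8080) >> 8 and V = (112*R - 94*G - 18*B + 0x8080) >> 8; the 0x8080 bias comes from loading #0x80 into every byte of v29 and reading it as halfwords. Per-pixel sketch (names mine); with 8-bit inputs the result stays within [16, 240], so the clamps mirror uqshrn but never fire:

#include <stdint.h>

void ARGBToUV444_C_sketch(uint8_t b, uint8_t g, uint8_t r,
                          uint8_t* u, uint8_t* v) {
  int tu = (112 * b - 74 * g - 38 * r + 0x8080) >> 8;
  int tv = (112 * r - 94 * g - 18 * b + 0x8080) >> 8;
  *u = (uint8_t)(tu < 0 ? 0 : (tu > 255 ? 255 : tu));
  *v = (uint8_t)(tv < 0 ? 0 : (tv > 255 ? 255 : tv));
}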
1678 // 16x1 pixels -> 8x1. pix is number of argb pixels. e.g. 16. 1520 // 16x1 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
1679 #ifdef HAS_ARGBTOUV422ROW_NEON 1521 #ifdef HAS_ARGBTOUV422ROW_NEON
1680 void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, 1522 void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
1681 int pix) { 1523 int pix) {
1682 asm volatile ( 1524 asm volatile (
1683 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient 1525 RGBTOUV_SETUP_REG
1684 "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
1685 "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
1686 "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
1687 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
1688 "vmov.u16 q15, #0x8080 \n" // 128.5
1689 ".p2align 2 \n"
1690 "1: \n" 1526 "1: \n"
1691 MEMACCESS(0) 1527 MEMACCESS(0)
1692 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. 1528 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
1693 MEMACCESS(0)
1694 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
1695 1529
1696 "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. 1530 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
1697 "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. 1531 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
1698 "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. 1532 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
1699 1533
1700 "subs %3, %3, #16 \n" // 16 processed per loop. 1534 "subs %3, %3, #16 \n" // 16 processed per loop.
1701 "vmul.s16 q8, q0, q10 \n" // B 1535 "mul v3.8h, v0.8h, v20.8h \n" // B
1702 "vmls.s16 q8, q1, q11 \n" // G 1536 "mls v3.8h, v1.8h, v21.8h \n" // G
1703 "vmls.s16 q8, q2, q12 \n" // R 1537 "mls v3.8h, v2.8h, v22.8h \n" // R
1704 "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned 1538 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
1705 1539
1706 "vmul.s16 q9, q2, q10 \n" // R 1540 "mul v4.8h, v2.8h, v20.8h \n" // R
1707 "vmls.s16 q9, q1, q14 \n" // G 1541 "mls v4.8h, v1.8h, v24.8h \n" // G
1708 "vmls.s16 q9, q0, q13 \n" // B 1542 "mls v4.8h, v0.8h, v23.8h \n" // B
1709 "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned 1543 "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned
1710 1544
1711 "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U 1545 "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U
1712 "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V 1546 "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V
1713 1547
1714 MEMACCESS(1) 1548 MEMACCESS(1)
1715 "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. 1549 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
1716 MEMACCESS(2) 1550 MEMACCESS(2)
1717 "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. 1551 "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
1718 "bgt 1b \n" 1552 "b.gt 1b \n"
1719 : "+r"(src_argb), // %0 1553 : "+r"(src_argb), // %0
1720 "+r"(dst_u), // %1 1554 "+r"(dst_u), // %1
1721 "+r"(dst_v), // %2 1555 "+r"(dst_v), // %2
1722 "+r"(pix) // %3 1556 "+r"(pix) // %3
1723 : 1557 :
1724 : "cc", "memory", "q0", "q1", "q2", "q3", 1558 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1725 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 1559 "v20", "v21", "v22", "v23", "v24", "v25"
1726 ); 1560 );
1727 } 1561 }
1728 #endif // HAS_ARGBTOUV422ROW_NEON 1562 #endif // HAS_ARGBTOUV422ROW_NEON
1729 1563
1730 // 32x1 pixels -> 8x1. pix is number of argb pixels. e.g. 32. 1564 // 32x1 pixels -> 8x1. pix is number of argb pixels. e.g. 32.
1731 #ifdef HAS_ARGBTOUV411ROW_NEON 1565 #ifdef HAS_ARGBTOUV411ROW_NEON
1732 void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, 1566 void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
1733 int pix) { 1567 int pix) {
1734 asm volatile ( 1568 asm volatile (
1735 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient 1569 RGBTOUV_SETUP_REG
1736 "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
1737 "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
1738 "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
1739 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
1740 "vmov.u16 q15, #0x8080 \n" // 128.5
1741 ".p2align 2 \n"
1742 "1: \n" 1570 "1: \n"
1743 MEMACCESS(0) 1571 MEMACCESS(0)
1744 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. 1572 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
1573 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
1574 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
1575 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
1745 MEMACCESS(0) 1576 MEMACCESS(0)
1746 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. 1577 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n" // load next 16.
1747 "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. 1578 "uaddlp v4.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
1748 "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. 1579 "uaddlp v5.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
1749 "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. 1580 "uaddlp v6.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
1750 MEMACCESS(0)
1751 "vld4.8 {d8, d10, d12, d14}, [%0]! \n" // load 8 more ARGB pixels.
1752 MEMACCESS(0)
1753 "vld4.8 {d9, d11, d13, d15}, [%0]! \n" // load last 8 ARGB pixels.
1754 "vpaddl.u8 q4, q4 \n" // B 16 bytes -> 8 shorts.
1755 "vpaddl.u8 q5, q5 \n" // G 16 bytes -> 8 shorts.
1756 "vpaddl.u8 q6, q6 \n" // R 16 bytes -> 8 shorts.
1757 1581
1758 "vpadd.u16 d0, d0, d1 \n" // B 16 shorts -> 8 shorts. 1582 "addp v0.8h, v0.8h, v4.8h \n" // B 16 shorts -> 8 shorts.
1759 "vpadd.u16 d1, d8, d9 \n" // B 1583 "addp v1.8h, v1.8h, v5.8h \n" // G 16 shorts -> 8 shorts.
1760 "vpadd.u16 d2, d2, d3 \n" // G 16 shorts -> 8 shorts. 1584 "addp v2.8h, v2.8h, v6.8h \n" // R 16 shorts -> 8 shorts.
1761 "vpadd.u16 d3, d10, d11 \n" // G
1762 "vpadd.u16 d4, d4, d5 \n" // R 16 shorts -> 8 shorts.
1763 "vpadd.u16 d5, d12, d13 \n" // R
1764 1585
1765 "vrshr.u16 q0, q0, #1 \n" // 2x average 1586 "urshr v0.8h, v0.8h, #1 \n" // 2x average
1766 "vrshr.u16 q1, q1, #1 \n" 1587 "urshr v1.8h, v1.8h, #1 \n"
1767 "vrshr.u16 q2, q2, #1 \n" 1588 "urshr v2.8h, v2.8h, #1 \n"
1768 1589
1769 "subs %3, %3, #32 \n" // 32 processed per loop. 1590 "subs %3, %3, #32 \n" // 32 processed per loop.
1770 "vmul.s16 q8, q0, q10 \n" // B 1591 "mul v3.8h, v0.8h, v20.8h \n" // B
1771 "vmls.s16 q8, q1, q11 \n" // G 1592 "mls v3.8h, v1.8h, v21.8h \n" // G
1772 "vmls.s16 q8, q2, q12 \n" // R 1593 "mls v3.8h, v2.8h, v22.8h \n" // R
1773 "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned 1594 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
1774 "vmul.s16 q9, q2, q10 \n" // R 1595 "mul v4.8h, v2.8h, v20.8h \n" // R
1775 "vmls.s16 q9, q1, q14 \n" // G 1596 "mls v4.8h, v1.8h, v24.8h \n" // G
1776 "vmls.s16 q9, q0, q13 \n" // B 1597 "mls v4.8h, v0.8h, v23.8h \n" // B
1777 "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned 1598 "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned
1778 "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U 1599 "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U
1779 "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V 1600 "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V
1780 MEMACCESS(1) 1601 MEMACCESS(1)
1781 "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. 1602 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
1782 MEMACCESS(2) 1603 MEMACCESS(2)
1783 "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. 1604 "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
1784 "bgt 1b \n" 1605 "b.gt 1b \n"
1785 : "+r"(src_argb), // %0 1606 : "+r"(src_argb), // %0
1786 "+r"(dst_u), // %1 1607 "+r"(dst_u), // %1
1787 "+r"(dst_v), // %2 1608 "+r"(dst_v), // %2
1788 "+r"(pix) // %3 1609 "+r"(pix) // %3
1789 : 1610 :
1790 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", 1611 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1791 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 1612 "v20", "v21", "v22", "v23", "v24", "v25"
1792 ); 1613 );
1793 } 1614 }
1794 #endif // HAS_ARGBTOUV411ROW_NEON 1615 #endif // HAS_ARGBTOUV411ROW_NEON
1795 1616
1796 // 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. 1617 // 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
1797 #define RGBTOUV(QB, QG, QR) \ 1618 #define RGBTOUV(QB, QG, QR) \
1798 "vmul.s16 q8, " #QB ", q10 \n" /* B */ \ 1619 "mul v3.8h, " #QB ",v20.8h \n" /* B */ \
1799 "vmls.s16 q8, " #QG ", q11 \n" /* G */ \ 1620 "mul v4.8h, " #QR ",v20.8h \n" /* R */ \
1800 "vmls.s16 q8, " #QR ", q12 \n" /* R */ \ 1621 "mls v3.8h, " #QG ",v21.8h \n" /* G */ \
1801 "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \ 1622 "mls v4.8h, " #QG ",v24.8h \n" /* G */ \
1802 "vmul.s16 q9, " #QR ", q10 \n" /* R */ \ 1623 "mls v3.8h, " #QR ",v22.8h \n" /* R */ \
1803 "vmls.s16 q9, " #QG ", q14 \n" /* G */ \ 1624 "mls v4.8h, " #QB ",v23.8h \n" /* B */ \
1804 "vmls.s16 q9, " #QB ", q13 \n" /* B */ \ 1625 "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \
1805 "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \ 1626 "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \
1806 "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \ 1627 "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \
1807 "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */ 1628 "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */
1808 1629
1809 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. 1630 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
1631 // TODO(fbarchard): consider ptrdiff_t for all strides.
1632
1810 #ifdef HAS_ARGBTOUVROW_NEON 1633 #ifdef HAS_ARGBTOUVROW_NEON
1811 void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, 1634 void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
1812 uint8* dst_u, uint8* dst_v, int pix) { 1635 uint8* dst_u, uint8* dst_v, int pix) {
1636 const uint8* src_argb_1 = src_argb + src_stride_argb;
1813 asm volatile ( 1637 asm volatile (
1814 "add %1, %0, %1 \n" // src_stride + src_argb 1638 RGBTOUV_SETUP_REG
1815 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
1816 "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
1817 "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
1818 "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
1819 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
1820 "vmov.u16 q15, #0x8080 \n" // 128.5
1821 ".p2align 2 \n"
1822 "1: \n" 1639 "1: \n"
1823 MEMACCESS(0) 1640 MEMACCESS(0)
1824 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. 1641 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
1825 MEMACCESS(0) 1642 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
1826 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. 1643 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
1827 "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. 1644 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
1828 "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. 1645
1829 "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
1830 MEMACCESS(1) 1646 MEMACCESS(1)
1831 "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. 1647 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
1832 MEMACCESS(1) 1648 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
1833 "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. 1649 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
1834 "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. 1650 "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
1835 "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
1836 "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
1837 1651
1838 "vrshr.u16 q0, q0, #1 \n" // 2x average 1652 "urshr v0.8h, v0.8h, #1 \n" // 2x average
1839 "vrshr.u16 q1, q1, #1 \n" 1653 "urshr v1.8h, v1.8h, #1 \n"
1840 "vrshr.u16 q2, q2, #1 \n" 1654 "urshr v2.8h, v2.8h, #1 \n"
1841 1655
1842 "subs %4, %4, #16 \n" // 32 processed per loop. 1656 "subs %4, %4, #16 \n" // 32 processed per loop.
1843 RGBTOUV(q0, q1, q2) 1657 RGBTOUV(v0.8h, v1.8h, v2.8h)
1844 MEMACCESS(2) 1658 MEMACCESS(2)
1845 "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. 1659 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1846 MEMACCESS(3) 1660 MEMACCESS(3)
1847 "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 1661 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1848 "bgt 1b \n" 1662 "b.gt 1b \n"
1849 : "+r"(src_argb), // %0 1663 : "+r"(src_argb), // %0
1850 "+r"(src_stride_argb), // %1 1664 "+r"(src_argb_1), // %1
1851 "+r"(dst_u), // %2 1665 "+r"(dst_u), // %2
1852 "+r"(dst_v), // %3 1666 "+r"(dst_v), // %3
1853 "+r"(pix) // %4 1667 "+r"(pix) // %4
1854 : 1668 :
1855 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", 1669 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1856 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 1670 "v20", "v21", "v22", "v23", "v24", "v25"
1857 ); 1671 );
1858 } 1672 }
1859 #endif // HAS_ARGBTOUVROW_NEON 1673 #endif // HAS_ARGBTOUVROW_NEON
1860 1674
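ARGBToUVRow_NEON subsamples 2x2: uaddlp/uadalp sum each horizontal pair across the two rows and urshr #1 halves the sum, so the coefficients loaded by RGBTOUV_SETUP_REG (not visible in this hunk) are pre-halved. The sketch below averages the block first and then applies the same weights as the 444 case above, which is assumed to be numerically equivalent up to rounding (names mine):

#include <stdint.h>

// One U/V output per 2x2 block of ARGB pixels (B,G,R,A byte order).
// row0/row1 each point at two horizontally adjacent pixels.
void ARGBToUV_2x2_C_sketch(const uint8_t* row0, const uint8_t* row1,
                           uint8_t* u, uint8_t* v) {
  int b = (row0[0] + row0[4] + row1[0] + row1[4] + 2) >> 2;
  int g = (row0[1] + row0[5] + row1[1] + row1[5] + 2) >> 2;
  int r = (row0[2] + row0[6] + row1[2] + row1[6] + 2) >> 2;
  *u = (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
  *v = (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
}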
1861 // TODO(fbarchard): Subsample match C code. 1675 // TODO(fbarchard): Subsample match C code.
1862 #ifdef HAS_ARGBTOUVJROW_NEON 1676 #ifdef HAS_ARGBTOUVJROW_NEON
1863 void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, 1677 void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
1864 uint8* dst_u, uint8* dst_v, int pix) { 1678 uint8* dst_u, uint8* dst_v, int pix) {
1679 const uint8* src_argb_1 = src_argb + src_stride_argb;
1865 asm volatile ( 1680 asm volatile (
1866 "add %1, %0, %1 \n" // src_stride + src_argb 1681 "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
1867 "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient 1682 "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
1868 "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient 1683 "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
1869 "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient 1684 "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
1870 "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient 1685 "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
1871 "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient 1686 "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
1872 "vmov.u16 q15, #0x8080 \n" // 128.5
1873 ".p2align 2 \n"
1874 "1: \n" 1687 "1: \n"
1875 MEMACCESS(0) 1688 MEMACCESS(0)
1876 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. 1689 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
1877 MEMACCESS(0) 1690 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
1878 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. 1691 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
1879 "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. 1692 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
1880 "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
1881 "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
1882 MEMACCESS(1) 1693 MEMACCESS(1)
1883 "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. 1694 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
1884 MEMACCESS(1) 1695 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
1885 "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. 1696 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
1886 "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. 1697 "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
1887 "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
1888 "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
1889 1698
1890 "vrshr.u16 q0, q0, #1 \n" // 2x average 1699 "urshr v0.8h, v0.8h, #1 \n" // 2x average
1891 "vrshr.u16 q1, q1, #1 \n" 1700 "urshr v1.8h, v1.8h, #1 \n"
1892 "vrshr.u16 q2, q2, #1 \n" 1701 "urshr v2.8h, v2.8h, #1 \n"
1893 1702
1894 "subs %4, %4, #16 \n" // 32 processed per loop. 1703 "subs %4, %4, #16 \n" // 32 processed per loop.
1895 RGBTOUV(q0, q1, q2) 1704 RGBTOUV(v0.8h, v1.8h, v2.8h)
1896 MEMACCESS(2) 1705 MEMACCESS(2)
1897 "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. 1706 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1898 MEMACCESS(3) 1707 MEMACCESS(3)
1899 "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 1708 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1900 "bgt 1b \n" 1709 "b.gt 1b \n"
1901 : "+r"(src_argb), // %0 1710 : "+r"(src_argb), // %0
1902 "+r"(src_stride_argb), // %1 1711 "+r"(src_argb_1), // %1
1903 "+r"(dst_u), // %2 1712 "+r"(dst_u), // %2
1904 "+r"(dst_v), // %3 1713 "+r"(dst_v), // %3
1905 "+r"(pix) // %4 1714 "+r"(pix) // %4
1906 : 1715 :
1907 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", 1716 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1908 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 1717 "v20", "v21", "v22", "v23", "v24", "v25"
1909 ); 1718 );
1910 } 1719 }
1911 #endif // HAS_ARGBTOUVJROW_NEON 1720 #endif // HAS_ARGBTOUVJROW_NEON
1912 1721
1913 #ifdef HAS_BGRATOUVROW_NEON 1722 #ifdef HAS_BGRATOUVROW_NEON
1914 void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, 1723 void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
1915 uint8* dst_u, uint8* dst_v, int pix) { 1724 uint8* dst_u, uint8* dst_v, int pix) {
1725 const uint8* src_bgra_1 = src_bgra + src_stride_bgra;
1916 asm volatile ( 1726 asm volatile (
1917 "add %1, %0, %1 \n" // src_stride + src_bgra 1727 RGBTOUV_SETUP_REG
1918 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
1919 "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
1920 "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
1921 "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
1922 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
1923 "vmov.u16 q15, #0x8080 \n" // 128.5
1924 ".p2align 2 \n"
1925 "1: \n" 1728 "1: \n"
1926 MEMACCESS(0) 1729 MEMACCESS(0)
1927 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. 1730 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
1928 MEMACCESS(0) 1731 "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
1929 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. 1732 "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
1930 "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts. 1733 "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts.
1931 "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts.
1932 "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts.
1933 MEMACCESS(1) 1734 MEMACCESS(1)
1934 "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels. 1735 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more
1935 MEMACCESS(1) 1736 "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts.
1936 "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels. 1737 "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
1937 "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts. 1738 "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts.
1938 "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts.
1939 "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts.
1940 1739
1941 "vrshr.u16 q1, q1, #1 \n" // 2x average 1740 "urshr v0.8h, v0.8h, #1 \n" // 2x average
1942 "vrshr.u16 q2, q2, #1 \n" 1741 "urshr v1.8h, v3.8h, #1 \n"
1943 "vrshr.u16 q3, q3, #1 \n" 1742 "urshr v2.8h, v2.8h, #1 \n"
1944 1743
1945 "subs %4, %4, #16 \n" // 32 processed per loop. 1744 "subs %4, %4, #16 \n" // 32 processed per loop.
1946 RGBTOUV(q3, q2, q1) 1745 RGBTOUV(v0.8h, v1.8h, v2.8h)
1947 MEMACCESS(2) 1746 MEMACCESS(2)
1948 "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. 1747 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1949 MEMACCESS(3) 1748 MEMACCESS(3)
1950 "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 1749 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1951 "bgt 1b \n" 1750 "b.gt 1b \n"
1952 : "+r"(src_bgra), // %0 1751 : "+r"(src_bgra), // %0
1953 "+r"(src_stride_bgra), // %1 1752 "+r"(src_bgra_1), // %1
1954 "+r"(dst_u), // %2 1753 "+r"(dst_u), // %2
1955 "+r"(dst_v), // %3 1754 "+r"(dst_v), // %3
1956 "+r"(pix) // %4 1755 "+r"(pix) // %4
1957 : 1756 :
1958 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", 1757 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1959 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 1758 "v20", "v21", "v22", "v23", "v24", "v25"
1960 ); 1759 );
1961 } 1760 }
1962 #endif // HAS_BGRATOUVROW_NEON 1761 #endif // HAS_BGRATOUVROW_NEON
1963 1762
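All of the *ToUVRow_NEON kernels in this hunk feed the same fixed-point math: uaddlp pairwise-sums 16 bytes of a channel, uadalp accumulates the matching row below, and urshr #1 leaves twice the 2x2 average, which RGBTOUV multiplies by coefficients pre-divided by 2 (112/2, 74/2, 38/2, 18/2, 94/2) and biases by 0x8080 before a saturating narrow. A rough scalar model of one U/V output, with made-up names and uint8 standing in for libyuv's typedef (illustrative only, not part of the patch):

// Scalar sketch of the RGBTOUV math for one 2x2 block of pixels.
static void RgbToUVReference(const uint8 b[4], const uint8 g[4], const uint8 r[4],
                             uint8* dst_u, uint8* dst_v) {
  // uaddlp/uadalp sum the 2x2 block; urshr #1 keeps 2x the average, rounded.
  int b2 = (b[0] + b[1] + b[2] + b[3] + 1) >> 1;
  int g2 = (g[0] + g[1] + g[2] + g[3] + 1) >> 1;
  int r2 = (r[0] + r[1] + r[2] + r[3] + 1) >> 1;
  // Halved coefficients (56 = 112/2, 37 = 74/2, ...); 0x8080 is +128 plus 0.5.
  int u = b2 * 56 - g2 * 37 - r2 * 19 + 0x8080;  // stays in 0..0xffff for 8-bit input
  int v = r2 * 56 - g2 * 47 - b2 * 9  + 0x8080;
  *dst_u = (uint8)(u >> 8);  // uqshrn #8: take the high byte with unsigned saturation
  *dst_v = (uint8)(v >> 8);
}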
1964 #ifdef HAS_ABGRTOUVROW_NEON 1763 #ifdef HAS_ABGRTOUVROW_NEON
1965 void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, 1764 void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
1966 uint8* dst_u, uint8* dst_v, int pix) { 1765 uint8* dst_u, uint8* dst_v, int pix) {
1766 const uint8* src_abgr_1 = src_abgr + src_stride_abgr;
1967 asm volatile ( 1767 asm volatile (
1968 "add %1, %0, %1 \n" // src_stride + src_abgr 1768 RGBTOUV_SETUP_REG
1969 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
1970 "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
1971 "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
1972 "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
1973 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
1974 "vmov.u16 q15, #0x8080 \n" // 128.5
1975 ".p2align 2 \n"
1976 "1: \n" 1769 "1: \n"
1977 MEMACCESS(0) 1770 MEMACCESS(0)
1978 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. 1771 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
1979 MEMACCESS(0) 1772 "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
1980 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. 1773 "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
1981 "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. 1774 "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
1982 "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
1983 "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
1984 MEMACCESS(1) 1775 MEMACCESS(1)
1985 "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. 1776 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
1986 MEMACCESS(1) 1777 "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
1987 "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. 1778 "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
1988 "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. 1779 "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
1989 "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
1990 "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
1991 1780
1992 "vrshr.u16 q0, q0, #1 \n" // 2x average 1781 "urshr v0.8h, v3.8h, #1 \n" // 2x average
1993 "vrshr.u16 q1, q1, #1 \n" 1782 "urshr v2.8h, v2.8h, #1 \n"
1994 "vrshr.u16 q2, q2, #1 \n" 1783 "urshr v1.8h, v1.8h, #1 \n"
1995 1784
1996 "subs %4, %4, #16 \n" // 32 processed per loop. 1785 "subs %4, %4, #16 \n" // 32 processed per loop.
1997 RGBTOUV(q2, q1, q0) 1786 RGBTOUV(v0.8h, v2.8h, v1.8h)
1998 MEMACCESS(2) 1787 MEMACCESS(2)
1999 "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. 1788 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
2000 MEMACCESS(3) 1789 MEMACCESS(3)
2001 "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 1790 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
2002 "bgt 1b \n" 1791 "b.gt 1b \n"
2003 : "+r"(src_abgr), // %0 1792 : "+r"(src_abgr), // %0
2004 "+r"(src_stride_abgr), // %1 1793 "+r"(src_abgr_1), // %1
2005 "+r"(dst_u), // %2 1794 "+r"(dst_u), // %2
2006 "+r"(dst_v), // %3 1795 "+r"(dst_v), // %3
2007 "+r"(pix) // %4 1796 "+r"(pix) // %4
2008 : 1797 :
2009 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", 1798 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2010 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 1799 "v20", "v21", "v22", "v23", "v24", "v25"
2011 ); 1800 );
2012 } 1801 }
2013 #endif // HAS_ABGRTOUVROW_NEON 1802 #endif // HAS_ABGRTOUVROW_NEON
2014 1803
2015 #ifdef HAS_RGBATOUVROW_NEON 1804 #ifdef HAS_RGBATOUVROW_NEON
2016 void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, 1805 void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
2017 uint8* dst_u, uint8* dst_v, int pix) { 1806 uint8* dst_u, uint8* dst_v, int pix) {
1807 const uint8* src_rgba_1 = src_rgba + src_stride_rgba;
2018 asm volatile ( 1808 asm volatile (
2019 "add %1, %0, %1 \n" // src_stride + src_rgba 1809 RGBTOUV_SETUP_REG
2020 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
2021 "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
2022 "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
2023 "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
2024 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
2025 "vmov.u16 q15, #0x8080 \n" // 128.5
2026 ".p2align 2 \n"
2027 "1: \n" 1810 "1: \n"
2028 MEMACCESS(0) 1811 MEMACCESS(0)
2029 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. 1812 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
2030 MEMACCESS(0) 1813 "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts.
2031 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. 1814 "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
2032 "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts. 1815 "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts.
2033 "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts.
2034 "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts.
2035 MEMACCESS(1) 1816 MEMACCESS(1)
2036 "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels. 1817 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
2037 MEMACCESS(1) 1818 "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts.
2038 "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels. 1819 "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
2039 "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts. 1820 "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts.
2040 "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts.
2041 "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts.
2042 1821
2043 "vrshr.u16 q0, q0, #1 \n" // 2x average 1822 "urshr v0.8h, v0.8h, #1 \n" // 2x average
2044 "vrshr.u16 q1, q1, #1 \n" 1823 "urshr v1.8h, v1.8h, #1 \n"
2045 "vrshr.u16 q2, q2, #1 \n" 1824 "urshr v2.8h, v2.8h, #1 \n"
2046 1825
2047 "subs %4, %4, #16 \n" // 32 processed per loop. 1826 "subs %4, %4, #16 \n" // 32 processed per loop.
2048 RGBTOUV(q0, q1, q2) 1827 RGBTOUV(v0.8h, v1.8h, v2.8h)
2049 MEMACCESS(2) 1828 MEMACCESS(2)
2050 "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. 1829 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
2051 MEMACCESS(3) 1830 MEMACCESS(3)
2052 "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 1831 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
2053 "bgt 1b \n" 1832 "b.gt 1b \n"
2054 : "+r"(src_rgba), // %0 1833 : "+r"(src_rgba), // %0
2055 "+r"(src_stride_rgba), // %1 1834 "+r"(src_rgba_1), // %1
2056 "+r"(dst_u), // %2 1835 "+r"(dst_u), // %2
2057 "+r"(dst_v), // %3 1836 "+r"(dst_v), // %3
2058 "+r"(pix) // %4 1837 "+r"(pix) // %4
2059 : 1838 :
2060 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", 1839 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2061 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 1840 "v20", "v21", "v22", "v23", "v24", "v25"
2062 ); 1841 );
2063 } 1842 }
2064 #endif // HAS_RGBATOUVROW_NEON 1843 #endif // HAS_RGBATOUVROW_NEON
2065 1844
2066 #ifdef HAS_RGB24TOUVROW_NEON 1845 #ifdef HAS_RGB24TOUVROW_NEON
2067 void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, 1846 void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
2068 uint8* dst_u, uint8* dst_v, int pix) { 1847 uint8* dst_u, uint8* dst_v, int pix) {
1848 const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
2069 asm volatile ( 1849 asm volatile (
2070 "add %1, %0, %1 \n" // src_stride + src_rgb24 1850 RGBTOUV_SETUP_REG
2071 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
2072 "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
2073 "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
2074 "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
2075 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
2076 "vmov.u16 q15, #0x8080 \n" // 128.5
2077 ".p2align 2 \n"
2078 "1: \n" 1851 "1: \n"
2079 MEMACCESS(0) 1852 MEMACCESS(0)
2080 "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. 1853 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
2081 MEMACCESS(0) 1854 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
2082 "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. 1855 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
2083 "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. 1856 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
2084 "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
2085 "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
2086 MEMACCESS(1) 1857 MEMACCESS(1)
2087 "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels. 1858 "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more.
2088 MEMACCESS(1) 1859 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
2089 "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels. 1860 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
2090 "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. 1861 "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
2091 "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
2092 "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
2093 1862
2094 "vrshr.u16 q0, q0, #1 \n" // 2x average 1863 "urshr v0.8h, v0.8h, #1 \n" // 2x average
2095 "vrshr.u16 q1, q1, #1 \n" 1864 "urshr v1.8h, v1.8h, #1 \n"
2096 "vrshr.u16 q2, q2, #1 \n" 1865 "urshr v2.8h, v2.8h, #1 \n"
2097 1866
2098 "subs %4, %4, #16 \n" // 32 processed per loop. 1867 "subs %4, %4, #16 \n" // 32 processed per loop.
2099 RGBTOUV(q0, q1, q2) 1868 RGBTOUV(v0.8h, v1.8h, v2.8h)
2100 MEMACCESS(2) 1869 MEMACCESS(2)
2101 "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. 1870 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
2102 MEMACCESS(3) 1871 MEMACCESS(3)
2103 "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 1872 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
2104 "bgt 1b \n" 1873 "b.gt 1b \n"
2105 : "+r"(src_rgb24), // %0 1874 : "+r"(src_rgb24), // %0
2106 "+r"(src_stride_rgb24), // %1 1875 "+r"(src_rgb24_1), // %1
2107 "+r"(dst_u), // %2 1876 "+r"(dst_u), // %2
2108 "+r"(dst_v), // %3 1877 "+r"(dst_v), // %3
2109 "+r"(pix) // %4 1878 "+r"(pix) // %4
2110 : 1879 :
2111 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", 1880 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2112 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 1881 "v20", "v21", "v22", "v23", "v24", "v25"
2113 ); 1882 );
2114 } 1883 }
2115 #endif // HAS_RGB24TOUVROW_NEON 1884 #endif // HAS_RGB24TOUVROW_NEON
2116 1885
2117 #ifdef HAS_RAWTOUVROW_NEON 1886 #ifdef HAS_RAWTOUVROW_NEON
2118 void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, 1887 void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
2119 uint8* dst_u, uint8* dst_v, int pix) { 1888 uint8* dst_u, uint8* dst_v, int pix) {
1889 const uint8* src_raw_1 = src_raw + src_stride_raw;
2120 asm volatile ( 1890 asm volatile (
2121 "add %1, %0, %1 \n" // src_stride + src_raw 1891 RGBTOUV_SETUP_REG
2122 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
2123 "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
2124 "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
2125 "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
2126 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
2127 "vmov.u16 q15, #0x8080 \n" // 128.5
2128 ".p2align 2 \n"
2129 "1: \n" 1892 "1: \n"
2130 MEMACCESS(0) 1893 MEMACCESS(0)
2131 "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. 1894 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels.
2132 MEMACCESS(0) 1895 "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
2133 "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. 1896 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
2134 "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. 1897 "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
2135 "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
2136 "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
2137 MEMACCESS(1) 1898 MEMACCESS(1)
2138 "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels. 1899 "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels
2139 MEMACCESS(1) 1900 "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
2140 "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels. 1901 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
2141 "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. 1902 "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
2142 "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
2143 "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
2144 1903
2145 "vrshr.u16 q0, q0, #1 \n" // 2x average 1904 "urshr v2.8h, v2.8h, #1 \n" // 2x average
2146 "vrshr.u16 q1, q1, #1 \n" 1905 "urshr v1.8h, v1.8h, #1 \n"
2147 "vrshr.u16 q2, q2, #1 \n" 1906 "urshr v0.8h, v0.8h, #1 \n"
2148 1907
2149 "subs %4, %4, #16 \n" // 32 processed per loop. 1908 "subs %4, %4, #16 \n" // 32 processed per loop.
2150 RGBTOUV(q2, q1, q0) 1909 RGBTOUV(v2.8h, v1.8h, v0.8h)
2151 MEMACCESS(2) 1910 MEMACCESS(2)
2152 "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. 1911 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
2153 MEMACCESS(3) 1912 MEMACCESS(3)
2154 "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 1913 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
2155 "bgt 1b \n" 1914 "b.gt 1b \n"
2156 : "+r"(src_raw), // %0 1915 : "+r"(src_raw), // %0
2157 "+r"(src_stride_raw), // %1 1916 "+r"(src_raw_1), // %1
2158 "+r"(dst_u), // %2 1917 "+r"(dst_u), // %2
2159 "+r"(dst_v), // %3 1918 "+r"(dst_v), // %3
2160 "+r"(pix) // %4 1919 "+r"(pix) // %4
2161 : 1920 :
2162 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", 1921 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2163 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 1922 "v20", "v21", "v22", "v23", "v24", "v25"
2164 ); 1923 );
2165 } 1924 }
2166 #endif // HAS_RAWTOUVROW_NEON 1925 #endif // HAS_RAWTOUVROW_NEON
2167 1926
2168 // 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. 1927 // 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
2169 #ifdef HAS_RGB565TOUVROW_NEON 1928 #ifdef HAS_RGB565TOUVROW_NEON
2170 void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, 1929 void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
2171 uint8* dst_u, uint8* dst_v, int pix) { 1930 uint8* dst_u, uint8* dst_v, int pix) {
1931 const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
2172 asm volatile ( 1932 asm volatile (
2173 "add %1, %0, %1 \n" // src_stride + src_argb 1933 "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2
2174 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient 1934 "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2
2175 "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient 1935 "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2
2176 "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient 1936 "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2
2177 "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient 1937 "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2
2178 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient 1938 "movi v27.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
2179 "vmov.u16 q15, #0x8080 \n" // 128.5
2180 ".p2align 2 \n"
2181 "1: \n" 1939 "1: \n"
2182 MEMACCESS(0) 1940 MEMACCESS(0)
2183 "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. 1941 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
2184 RGB565TOARGB 1942 RGB565TOARGB
2185 "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. 1943 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
2186 "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. 1944 "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
2187 "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. 1945 "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
2188 MEMACCESS(0) 1946 MEMACCESS(0)
2189 "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels. 1947 "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels.
2190 RGB565TOARGB 1948 RGB565TOARGB
2191 "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. 1949 "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
2192 "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. 1950 "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
2193 "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. 1951 "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
2194 1952
2195 MEMACCESS(1) 1953 MEMACCESS(1)
2196 "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels. 1954 "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels.
2197 RGB565TOARGB 1955 RGB565TOARGB
2198 "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. 1956 "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
2199 "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. 1957 "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
2200 "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. 1958 "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
2201 MEMACCESS(1) 1959 MEMACCESS(1)
2202 "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels. 1960 "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels.
2203 RGB565TOARGB 1961 RGB565TOARGB
2204 "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. 1962 "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
2205 "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. 1963 "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
2206 "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. 1964 "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
2207 1965
2208 "vrshr.u16 q4, q4, #1 \n" // 2x average 1966 "ins v16.D[1], v17.D[0] \n"
2209 "vrshr.u16 q5, q5, #1 \n" 1967 "ins v18.D[1], v19.D[0] \n"
2210 "vrshr.u16 q6, q6, #1 \n" 1968 "ins v20.D[1], v21.D[0] \n"
1969
1970 "urshr v4.8h, v16.8h, #1 \n" // 2x average
1971 "urshr v5.8h, v18.8h, #1 \n"
1972 "urshr v6.8h, v20.8h, #1 \n"
2211 1973
2212 "subs %4, %4, #16 \n" // 16 processed per loop. 1974 "subs %4, %4, #16 \n" // 16 processed per loop.
2213 "vmul.s16 q8, q4, q10 \n" // B 1975 "mul v16.8h, v4.8h, v22.8h \n" // B
2214 "vmls.s16 q8, q5, q11 \n" // G 1976 "mls v16.8h, v5.8h, v23.8h \n" // G
2215 "vmls.s16 q8, q6, q12 \n" // R 1977 "mls v16.8h, v6.8h, v24.8h \n" // R
2216 "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned 1978 "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned
2217 "vmul.s16 q9, q6, q10 \n" // R 1979 "mul v17.8h, v6.8h, v22.8h \n" // R
2218 "vmls.s16 q9, q5, q14 \n" // G 1980 "mls v17.8h, v5.8h, v26.8h \n" // G
2219 "vmls.s16 q9, q4, q13 \n" // B 1981 "mls v17.8h, v4.8h, v25.8h \n" // B
2220 "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned 1982 "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned
2221 "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U 1983 "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U
2222 "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V 1984 "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V
2223 MEMACCESS(2) 1985 MEMACCESS(2)
2224 "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. 1986 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
2225 MEMACCESS(3) 1987 MEMACCESS(3)
2226 "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 1988 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
2227 "bgt 1b \n" 1989 "b.gt 1b \n"
2228 : "+r"(src_rgb565), // %0 1990 : "+r"(src_rgb565), // %0
2229 "+r"(src_stride_rgb565), // %1 1991 "+r"(src_rgb565_1), // %1
2230 "+r"(dst_u), // %2 1992 "+r"(dst_u), // %2
2231 "+r"(dst_v), // %3 1993 "+r"(dst_v), // %3
2232 "+r"(pix) // %4 1994 "+r"(pix) // %4
2233 : 1995 :
2234 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", 1996 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2235 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 1997 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
1998 "v25", "v26", "v27"
2236 ); 1999 );
2237 } 2000 }
2238 #endif // HAS_RGB565TOUVROW_NEON 2001 #endif // HAS_RGB565TOUVROW_NEON
2239 2002
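RGB565TOARGB (defined elsewhere in this file) widens the packed 5/6/5 fields to 8 bits by shifting each field up and or-ing its top bits back into the low bits, so a full-scale field maps to 255; the ins vN.D[1] moves above then glue the two 4h half-rows into a single 8h vector before the averaging. A scalar sketch of that expansion, under the assumption that the aarch64 macro mirrors the armv7 one (helper name is hypothetical):

// Hypothetical scalar equivalent of RGB565TOARGB for a single pixel.
static void Rgb565ToBGR8(uint16 rgb565, uint8* b, uint8* g, uint8* r) {
  uint8 b5 = rgb565 & 0x1f;           // bits 4:0
  uint8 g6 = (rgb565 >> 5) & 0x3f;    // bits 10:5
  uint8 r5 = (rgb565 >> 11) & 0x1f;   // bits 15:11
  *b = (uint8)((b5 << 3) | (b5 >> 2));  // replicate top 3 bits into the low bits
  *g = (uint8)((g6 << 2) | (g6 >> 4));  // replicate top 2 bits
  *r = (uint8)((r5 << 3) | (r5 >> 2));
}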
2240 // 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. 2003 // 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
2241 #ifdef HAS_ARGB1555TOUVROW_NEON 2004 #ifdef HAS_ARGB1555TOUVROW_NEON
2242 void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, 2005 void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
2243 uint8* dst_u, uint8* dst_v, int pix) { 2006 uint8* dst_u, uint8* dst_v, int pix) {
2007 const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
2244 asm volatile ( 2008 asm volatile (
2245 "add %1, %0, %1 \n" // src_stride + src_argb 2009 RGBTOUV_SETUP_REG
2246 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
2247 "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
2248 "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
2249 "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
2250 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
2251 "vmov.u16 q15, #0x8080 \n" // 128.5
2252 ".p2align 2 \n"
2253 "1: \n" 2010 "1: \n"
2254 MEMACCESS(0) 2011 MEMACCESS(0)
2255 "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. 2012 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
2256 RGB555TOARGB 2013 RGB555TOARGB
2257 "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. 2014 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
2258 "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. 2015 "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
2259 "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. 2016 "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
2260 MEMACCESS(0) 2017 MEMACCESS(0)
2261 "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels. 2018 "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels.
2262 RGB555TOARGB 2019 RGB555TOARGB
2263 "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. 2020 "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
2264 "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. 2021 "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
2265 "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. 2022 "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
2266 2023
2267 MEMACCESS(1) 2024 MEMACCESS(1)
2268 "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels. 2025 "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels.
2269 RGB555TOARGB 2026 RGB555TOARGB
2270 "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. 2027 "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
2271 "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. 2028 "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
2272 "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. 2029 "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
2273 MEMACCESS(1) 2030 MEMACCESS(1)
2274 "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels. 2031 "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels.
2275 RGB555TOARGB 2032 RGB555TOARGB
2276 "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. 2033 "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
2277 "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. 2034 "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
2278 "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. 2035 "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
2279 2036
2280 "vrshr.u16 q4, q4, #1 \n" // 2x average 2037 "ins v16.D[1], v26.D[0] \n"
2281 "vrshr.u16 q5, q5, #1 \n" 2038 "ins v17.D[1], v27.D[0] \n"
2282 "vrshr.u16 q6, q6, #1 \n" 2039 "ins v18.D[1], v28.D[0] \n"
2040
2041 "urshr v4.8h, v16.8h, #1 \n" // 2x average
2042 "urshr v5.8h, v17.8h, #1 \n"
2043 "urshr v6.8h, v18.8h, #1 \n"
2283 2044
2284 "subs %4, %4, #16 \n" // 16 processed per loop. 2045 "subs %4, %4, #16 \n" // 16 processed per loop.
2285 "vmul.s16 q8, q4, q10 \n" // B 2046 "mul v2.8h, v4.8h, v20.8h \n" // B
2286 "vmls.s16 q8, q5, q11 \n" // G 2047 "mls v2.8h, v5.8h, v21.8h \n" // G
2287 "vmls.s16 q8, q6, q12 \n" // R 2048 "mls v2.8h, v6.8h, v22.8h \n" // R
2288 "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned 2049 "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
2289 "vmul.s16 q9, q6, q10 \n" // R 2050 "mul v3.8h, v6.8h, v20.8h \n" // R
2290 "vmls.s16 q9, q5, q14 \n" // G 2051 "mls v3.8h, v5.8h, v24.8h \n" // G
2291 "vmls.s16 q9, q4, q13 \n" // B 2052 "mls v3.8h, v4.8h, v23.8h \n" // B
2292 "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned 2053 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
2293 "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U 2054 "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
2294 "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V 2055 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
2295 MEMACCESS(2) 2056 MEMACCESS(2)
2296 "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. 2057 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
2297 MEMACCESS(3) 2058 MEMACCESS(3)
2298 "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 2059 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
2299 "bgt 1b \n" 2060 "b.gt 1b \n"
2300 : "+r"(src_argb1555), // %0 2061 : "+r"(src_argb1555), // %0
2301 "+r"(src_stride_argb1555), // %1 2062 "+r"(src_argb1555_1), // %1
2302 "+r"(dst_u), // %2 2063 "+r"(dst_u), // %2
2303 "+r"(dst_v), // %3 2064 "+r"(dst_v), // %3
2304 "+r"(pix) // %4 2065 "+r"(pix) // %4
2305 : 2066 :
2306 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", 2067 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
2307 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 2068 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
2069 "v26", "v27", "v28"
2308 ); 2070 );
2309 } 2071 }
2310 #endif // HAS_ARGB1555TOUVROW_NEON 2072 #endif // HAS_ARGB1555TOUVROW_NEON
2311 2073
2312 // 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. 2074 // 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
2313 #ifdef HAS_ARGB4444TOUVROW_NEON 2075 #ifdef HAS_ARGB4444TOUVROW_NEON
2314 void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, 2076 void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
2315 uint8* dst_u, uint8* dst_v, int pix) { 2077 uint8* dst_u, uint8* dst_v, int pix) {
2078 const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
2316 asm volatile ( 2079 asm volatile (
2317 "add %1, %0, %1 \n" // src_stride + src_argb 2080 RGBTOUV_SETUP_REG
2318 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
2319 "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
2320 "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
2321 "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
2322 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
2323 "vmov.u16 q15, #0x8080 \n" // 128.5
2324 ".p2align 2 \n"
2325 "1: \n" 2081 "1: \n"
2326 MEMACCESS(0) 2082 MEMACCESS(0)
2327 "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. 2083 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
2328 ARGB4444TOARGB 2084 ARGB4444TOARGB
2329 "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. 2085 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
2330 "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. 2086 "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
2331 "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. 2087 "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
2332 MEMACCESS(0) 2088 MEMACCESS(0)
2333 "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels. 2089 "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels.
2334 ARGB4444TOARGB 2090 ARGB4444TOARGB
2335 "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. 2091 "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
2336 "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. 2092 "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
2337 "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. 2093 "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
2338 2094
2339 MEMACCESS(1) 2095 MEMACCESS(1)
2340 "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels. 2096 "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels.
2341 ARGB4444TOARGB 2097 ARGB4444TOARGB
2342 "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. 2098 "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
2343 "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. 2099 "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
2344 "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. 2100 "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
2345 MEMACCESS(1) 2101 MEMACCESS(1)
2346 "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels. 2102 "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels.
2347 ARGB4444TOARGB 2103 ARGB4444TOARGB
2348 "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. 2104 "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
2349 "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. 2105 "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
2350 "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. 2106 "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
2351 2107
2352 "vrshr.u16 q4, q4, #1 \n" // 2x average 2108 "ins v16.D[1], v26.D[0] \n"
2353 "vrshr.u16 q5, q5, #1 \n" 2109 "ins v17.D[1], v27.D[0] \n"
2354 "vrshr.u16 q6, q6, #1 \n" 2110 "ins v18.D[1], v28.D[0] \n"
2111
2112 "urshr v4.8h, v16.8h, #1 \n" // 2x average
2113 "urshr v5.8h, v17.8h, #1 \n"
2114 "urshr v6.8h, v18.8h, #1 \n"
2355 2115
2356 "subs %4, %4, #16 \n" // 16 processed per loop. 2116 "subs %4, %4, #16 \n" // 16 processed per loop.
2357 "vmul.s16 q8, q4, q10 \n" // B 2117 "mul v2.8h, v4.8h, v20.8h \n" // B
2358 "vmls.s16 q8, q5, q11 \n" // G 2118 "mls v2.8h, v5.8h, v21.8h \n" // G
2359 "vmls.s16 q8, q6, q12 \n" // R 2119 "mls v2.8h, v6.8h, v22.8h \n" // R
2360 "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned 2120 "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
2361 "vmul.s16 q9, q6, q10 \n" // R 2121 "mul v3.8h, v6.8h, v20.8h \n" // R
2362 "vmls.s16 q9, q5, q14 \n" // G 2122 "mls v3.8h, v5.8h, v24.8h \n" // G
2363 "vmls.s16 q9, q4, q13 \n" // B 2123 "mls v3.8h, v4.8h, v23.8h \n" // B
2364 "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned 2124 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
2365 "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U 2125 "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
2366 "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V 2126 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
2367 MEMACCESS(2) 2127 MEMACCESS(2)
2368 "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. 2128 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
2369 MEMACCESS(3) 2129 MEMACCESS(3)
2370 "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 2130 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
2371 "bgt 1b \n" 2131 "b.gt 1b \n"
2372 : "+r"(src_argb4444), // %0 2132 : "+r"(src_argb4444), // %0
2373 "+r"(src_stride_argb4444), // %1 2133 "+r"(src_argb4444_1), // %1
2374 "+r"(dst_u), // %2 2134 "+r"(dst_u), // %2
2375 "+r"(dst_v), // %3 2135 "+r"(dst_v), // %3
2376 "+r"(pix) // %4 2136 "+r"(pix) // %4
2377 : 2137 :
2378 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", 2138 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
2379 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 2139 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
2140 "v26", "v27", "v28"
2141
2380 ); 2142 );
2381 } 2143 }
2382 #endif // HAS_ARGB4444TOUVROW_NEON 2144 #endif // HAS_ARGB4444TOUVROW_NEON
2383 2145
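ARGB4444TOARGB widens each 4-bit field by replicating the nibble into both halves of the byte, i.e. n -> (n << 4) | n, so 0xF becomes 0xFF. A scalar sketch, assuming the usual libyuv ARGB4444 layout with B in the low nibble and A in the high nibble of the 16-bit value (helper name is made up):

// Hypothetical scalar equivalent of ARGB4444TOARGB for one pixel.
static void Argb4444ToBGR8(uint16 argb4444, uint8* b, uint8* g, uint8* r) {
  uint8 b4 = argb4444 & 0x0f;
  uint8 g4 = (argb4444 >> 4) & 0x0f;
  uint8 r4 = (argb4444 >> 8) & 0x0f;
  *b = (uint8)(b4 * 0x11);  // (n << 4) | n
  *g = (uint8)(g4 * 0x11);
  *r = (uint8)(r4 * 0x11);
}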
2384 #ifdef HAS_RGB565TOYROW_NEON 2146 #ifdef HAS_RGB565TOYROW_NEON
2385 void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { 2147 void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
2386 asm volatile ( 2148 asm volatile (
2387 "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient 2149 "movi v24.8b, #13 \n" // B * 0.1016 coefficient
2388 "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient 2150 "movi v25.8b, #65 \n" // G * 0.5078 coefficient
2389 "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient 2151 "movi v26.8b, #33 \n" // R * 0.2578 coefficient
2390 "vmov.u8 d27, #16 \n" // Add 16 constant 2152 "movi v27.8b, #16 \n" // Add 16 constant
2391 ".p2align 2 \n"
2392 "1: \n" 2153 "1: \n"
2393 MEMACCESS(0) 2154 MEMACCESS(0)
2394 "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. 2155 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
2395 "subs %2, %2, #8 \n" // 8 processed per loop. 2156 "subs %2, %2, #8 \n" // 8 processed per loop.
2396 RGB565TOARGB 2157 RGB565TOARGB
2397 "vmull.u8 q2, d0, d24 \n" // B 2158 "umull v3.8h, v0.8b, v24.8b \n" // B
2398 "vmlal.u8 q2, d1, d25 \n" // G 2159 "umlal v3.8h, v1.8b, v25.8b \n" // G
2399 "vmlal.u8 q2, d2, d26 \n" // R 2160 "umlal v3.8h, v2.8b, v26.8b \n" // R
2400 "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y 2161 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
2401 "vqadd.u8 d0, d27 \n" 2162 "uqadd v0.8b, v0.8b, v27.8b \n"
2402 MEMACCESS(1) 2163 MEMACCESS(1)
2403 "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 2164 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2404 "bgt 1b \n" 2165 "b.gt 1b \n"
2405 : "+r"(src_rgb565), // %0 2166 : "+r"(src_rgb565), // %0
2406 "+r"(dst_y), // %1 2167 "+r"(dst_y), // %1
2407 "+r"(pix) // %2 2168 "+r"(pix) // %2
2408 : 2169 :
2409 : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" 2170 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6",
2171 "v24", "v25", "v26", "v27"
2410 ); 2172 );
2411 } 2173 }
2412 #endif // HAS_RGB565TOYROW_NEON 2174 #endif // HAS_RGB565TOYROW_NEON
2413 2175
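RGB565ToYRow_NEON above and the remaining *ToYRow_NEON kernels below all compute the same weighted sum with the 13/65/33 coefficients; only the load instruction and the channel order differ. In scalar terms the arithmetic is roughly the following (illustrative helper, not part of the patch; uint8 is libyuv's typedef):

// Scalar sketch of the Y computation used by the *ToYRow_NEON kernels.
static uint8 RgbToYReference(uint8 b, uint8 g, uint8 r) {
  int y = 13 * b + 65 * g + 33 * r;  // umull/umlal accumulate in 16 bits
  y = (y + 64) >> 7;                 // sqrshrun #7: rounding shift, saturating narrow
  y += 16;                           // uqadd of the constant-16 vector
  return (uint8)(y > 255 ? 255 : y);
}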
2414 #ifdef HAS_ARGB1555TOYROW_NEON 2176 #ifdef HAS_ARGB1555TOYROW_NEON
2415 void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { 2177 void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
2416 asm volatile ( 2178 asm volatile (
2417 "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient 2179 "movi v4.8b, #13 \n" // B * 0.1016 coefficient
2418 "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient 2180 "movi v5.8b, #65 \n" // G * 0.5078 coefficient
2419 "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient 2181 "movi v6.8b, #33 \n" // R * 0.2578 coefficient
2420 "vmov.u8 d27, #16 \n" // Add 16 constant 2182 "movi v7.8b, #16 \n" // Add 16 constant
2421 ".p2align 2 \n"
2422 "1: \n" 2183 "1: \n"
2423 MEMACCESS(0) 2184 MEMACCESS(0)
2424 "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. 2185 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
2425 "subs %2, %2, #8 \n" // 8 processed per loop. 2186 "subs %2, %2, #8 \n" // 8 processed per loop.
2426 ARGB1555TOARGB 2187 ARGB1555TOARGB
2427 "vmull.u8 q2, d0, d24 \n" // B 2188 "umull v3.8h, v0.8b, v4.8b \n" // B
2428 "vmlal.u8 q2, d1, d25 \n" // G 2189 "umlal v3.8h, v1.8b, v5.8b \n" // G
2429 "vmlal.u8 q2, d2, d26 \n" // R 2190 "umlal v3.8h, v2.8b, v6.8b \n" // R
2430 "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y 2191 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
2431 "vqadd.u8 d0, d27 \n" 2192 "uqadd v0.8b, v0.8b, v7.8b \n"
2432 MEMACCESS(1) 2193 MEMACCESS(1)
2433 "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 2194 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2434 "bgt 1b \n" 2195 "b.gt 1b \n"
2435 : "+r"(src_argb1555), // %0 2196 : "+r"(src_argb1555), // %0
2436 "+r"(dst_y), // %1 2197 "+r"(dst_y), // %1
2437 "+r"(pix) // %2 2198 "+r"(pix) // %2
2438 : 2199 :
2439 : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" 2200 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
2440 ); 2201 );
2441 } 2202 }
2442 #endif // HAS_ARGB1555TOYROW_NEON 2203 #endif // HAS_ARGB1555TOYROW_NEON
2443 2204
2444 #ifdef HAS_ARGB4444TOYROW_NEON 2205 #ifdef HAS_ARGB4444TOYROW_NEON
2445 void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { 2206 void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
2446 asm volatile ( 2207 asm volatile (
2447 "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient 2208 "movi v24.8b, #13 \n" // B * 0.1016 coefficient
2448 "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient 2209 "movi v25.8b, #65 \n" // G * 0.5078 coefficient
2449 "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient 2210 "movi v26.8b, #33 \n" // R * 0.2578 coefficient
2450 "vmov.u8 d27, #16 \n" // Add 16 constant 2211 "movi v27.8b, #16 \n" // Add 16 constant
2451 ".p2align 2 \n"
2452 "1: \n" 2212 "1: \n"
2453 MEMACCESS(0) 2213 MEMACCESS(0)
2454 "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. 2214 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
2455 "subs %2, %2, #8 \n" // 8 processed per loop. 2215 "subs %2, %2, #8 \n" // 8 processed per loop.
2456 ARGB4444TOARGB 2216 ARGB4444TOARGB
2457 "vmull.u8 q2, d0, d24 \n" // B 2217 "umull v3.8h, v0.8b, v24.8b \n" // B
2458 "vmlal.u8 q2, d1, d25 \n" // G 2218 "umlal v3.8h, v1.8b, v25.8b \n" // G
2459 "vmlal.u8 q2, d2, d26 \n" // R 2219 "umlal v3.8h, v2.8b, v26.8b \n" // R
2460 "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y 2220 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
2461 "vqadd.u8 d0, d27 \n" 2221 "uqadd v0.8b, v0.8b, v27.8b \n"
2462 MEMACCESS(1) 2222 MEMACCESS(1)
2463 "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 2223 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2464 "bgt 1b \n" 2224 "b.gt 1b \n"
2465 : "+r"(src_argb4444), // %0 2225 : "+r"(src_argb4444), // %0
2466 "+r"(dst_y), // %1 2226 "+r"(dst_y), // %1
2467 "+r"(pix) // %2 2227 "+r"(pix) // %2
2468 : 2228 :
2469 : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" 2229 : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"
2470 ); 2230 );
2471 } 2231 }
2472 #endif // HAS_ARGB4444TOYROW_NEON 2232 #endif // HAS_ARGB4444TOYROW_NEON
2473 2233
2474 #ifdef HAS_BGRATOYROW_NEON 2234 #ifdef HAS_BGRATOYROW_NEON
2475 void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { 2235 void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
2476 asm volatile ( 2236 asm volatile (
2477 "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient 2237 "movi v4.8b, #33 \n" // R * 0.2578 coefficient
2478 "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient 2238 "movi v5.8b, #65 \n" // G * 0.5078 coefficient
2479 "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient 2239 "movi v6.8b, #13 \n" // B * 0.1016 coefficient
2480 "vmov.u8 d7, #16 \n" // Add 16 constant 2240 "movi v7.8b, #16 \n" // Add 16 constant
2481 ".p2align 2 \n"
2482 "1: \n" 2241 "1: \n"
2483 MEMACCESS(0) 2242 MEMACCESS(0)
2484 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. 2243 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
2485 "subs %2, %2, #8 \n" // 8 processed per loop. 2244 "subs %2, %2, #8 \n" // 8 processed per loop.
2486 "vmull.u8 q8, d1, d4 \n" // R 2245 "umull v16.8h, v1.8b, v4.8b \n" // R
2487 "vmlal.u8 q8, d2, d5 \n" // G 2246 "umlal v16.8h, v2.8b, v5.8b \n" // G
2488 "vmlal.u8 q8, d3, d6 \n" // B 2247 "umlal v16.8h, v3.8b, v6.8b \n" // B
2489 "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y 2248 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
2490 "vqadd.u8 d0, d7 \n" 2249 "uqadd v0.8b, v0.8b, v7.8b \n"
2491 MEMACCESS(1) 2250 MEMACCESS(1)
2492 "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 2251 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2493 "bgt 1b \n" 2252 "b.gt 1b \n"
2494 : "+r"(src_bgra), // %0 2253 : "+r"(src_bgra), // %0
2495 "+r"(dst_y), // %1 2254 "+r"(dst_y), // %1
2496 "+r"(pix) // %2 2255 "+r"(pix) // %2
2497 : 2256 :
2498 : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" 2257 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2499 ); 2258 );
2500 } 2259 }
2501 #endif // HAS_BGRATOYROW_NEON 2260 #endif // HAS_BGRATOYROW_NEON
2502 2261
2503 #ifdef HAS_ABGRTOYROW_NEON 2262 #ifdef HAS_ABGRTOYROW_NEON
2504 void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { 2263 void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
2505 asm volatile ( 2264 asm volatile (
2506 "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient 2265 "movi v4.8b, #33 \n" // R * 0.2578 coefficient
2507 "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient 2266 "movi v5.8b, #65 \n" // G * 0.5078 coefficient
2508 "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient 2267 "movi v6.8b, #13 \n" // B * 0.1016 coefficient
2509 "vmov.u8 d7, #16 \n" // Add 16 constant 2268 "movi v7.8b, #16 \n" // Add 16 constant
2510 ".p2align 2 \n"
2511 "1: \n" 2269 "1: \n"
2512 MEMACCESS(0) 2270 MEMACCESS(0)
2513 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. 2271 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
2514 "subs %2, %2, #8 \n" // 8 processed per loop. 2272 "subs %2, %2, #8 \n" // 8 processed per loop.
2515 "vmull.u8 q8, d0, d4 \n" // R 2273 "umull v16.8h, v0.8b, v4.8b \n" // R
2516 "vmlal.u8 q8, d1, d5 \n" // G 2274 "umlal v16.8h, v1.8b, v5.8b \n" // G
2517 "vmlal.u8 q8, d2, d6 \n" // B 2275 "umlal v16.8h, v2.8b, v6.8b \n" // B
2518 "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y 2276 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
2519 "vqadd.u8 d0, d7 \n" 2277 "uqadd v0.8b, v0.8b, v7.8b \n"
2520 MEMACCESS(1) 2278 MEMACCESS(1)
2521 "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 2279 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2522 "bgt 1b \n" 2280 "b.gt 1b \n"
2523 : "+r"(src_abgr), // %0 2281 : "+r"(src_abgr), // %0
2524 "+r"(dst_y), // %1 2282 "+r"(dst_y), // %1
2525 "+r"(pix) // %2 2283 "+r"(pix) // %2
2526 : 2284 :
2527 : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" 2285 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2528 ); 2286 );
2529 } 2287 }
2530 #endif // HAS_ABGRTOYROW_NEON 2288 #endif // HAS_ABGRTOYROW_NEON
2531 2289
2532 #ifdef HAS_RGBATOYROW_NEON 2290 #ifdef HAS_RGBATOYROW_NEON
2533 void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { 2291 void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
2534 asm volatile ( 2292 asm volatile (
2535 "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient 2293 "movi v4.8b, #13 \n" // B * 0.1016 coefficient
2536 "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient 2294 "movi v5.8b, #65 \n" // G * 0.5078 coefficient
2537 "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient 2295 "movi v6.8b, #33 \n" // R * 0.2578 coefficient
2538 "vmov.u8 d7, #16 \n" // Add 16 constant 2296 "movi v7.8b, #16 \n" // Add 16 constant
2539 ".p2align 2 \n"
2540 "1: \n" 2297 "1: \n"
2541 MEMACCESS(0) 2298 MEMACCESS(0)
2542 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. 2299 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
2543 "subs %2, %2, #8 \n" // 8 processed per loop. 2300 "subs %2, %2, #8 \n" // 8 processed per loop.
2544 "vmull.u8 q8, d1, d4 \n" // B 2301 "umull v16.8h, v1.8b, v4.8b \n" // B
2545 "vmlal.u8 q8, d2, d5 \n" // G 2302 "umlal v16.8h, v2.8b, v5.8b \n" // G
2546 "vmlal.u8 q8, d3, d6 \n" // R 2303 "umlal v16.8h, v3.8b, v6.8b \n" // R
2547 "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y 2304 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
2548 "vqadd.u8 d0, d7 \n" 2305 "uqadd v0.8b, v0.8b, v7.8b \n"
2549 MEMACCESS(1) 2306 MEMACCESS(1)
2550 "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 2307 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2551 "bgt 1b \n" 2308 "b.gt 1b \n"
2552 : "+r"(src_rgba), // %0 2309 : "+r"(src_rgba), // %0
2553 "+r"(dst_y), // %1 2310 "+r"(dst_y), // %1
2554 "+r"(pix) // %2 2311 "+r"(pix) // %2
2555 : 2312 :
2556 : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" 2313 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2557 ); 2314 );
2558 } 2315 }
2559 #endif // HAS_RGBATOYROW_NEON 2316 #endif // HAS_RGBATOYROW_NEON
2560 2317
2561 #ifdef HAS_RGB24TOYROW_NEON 2318 #ifdef HAS_RGB24TOYROW_NEON
2562 void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { 2319 void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
2563 asm volatile ( 2320 asm volatile (
2564 "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient 2321 "movi v4.8b, #13 \n" // B * 0.1016 coefficient
2565 "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient 2322 "movi v5.8b, #65 \n" // G * 0.5078 coefficient
2566 "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient 2323 "movi v6.8b, #33 \n" // R * 0.2578 coefficient
2567 "vmov.u8 d7, #16 \n" // Add 16 constant 2324 "movi v7.8b, #16 \n" // Add 16 constant
2568 ".p2align 2 \n"
2569 "1: \n" 2325 "1: \n"
2570 MEMACCESS(0) 2326 MEMACCESS(0)
2571 "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. 2327 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
2572 "subs %2, %2, #8 \n" // 8 processed per loop. 2328 "subs %2, %2, #8 \n" // 8 processed per loop.
2573 "vmull.u8 q8, d0, d4 \n" // B 2329 "umull v16.8h, v0.8b, v4.8b \n" // B
2574 "vmlal.u8 q8, d1, d5 \n" // G 2330 "umlal v16.8h, v1.8b, v5.8b \n" // G
2575 "vmlal.u8 q8, d2, d6 \n" // R 2331 "umlal v16.8h, v2.8b, v6.8b \n" // R
2576 "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y 2332 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
2577 "vqadd.u8 d0, d7 \n" 2333 "uqadd v0.8b, v0.8b, v7.8b \n"
2578 MEMACCESS(1) 2334 MEMACCESS(1)
2579 "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 2335 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2580 "bgt 1b \n" 2336 "b.gt 1b \n"
2581 : "+r"(src_rgb24), // %0 2337 : "+r"(src_rgb24), // %0
2582 "+r"(dst_y), // %1 2338 "+r"(dst_y), // %1
2583 "+r"(pix) // %2 2339 "+r"(pix) // %2
2584 : 2340 :
2585 : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" 2341 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2586 ); 2342 );
2587 } 2343 }
2588 #endif // HAS_RGB24TOYROW_NEON 2344 #endif // HAS_RGB24TOYROW_NEON
2589 2345
2590 #ifdef HAS_RAWTOYROW_NEON 2346 #ifdef HAS_RAWTOYROW_NEON
2591 void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { 2347 void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
2592 asm volatile ( 2348 asm volatile (
2593 "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient 2349 "movi v4.8b, #33 \n" // R * 0.2578 coefficient
2594 "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient 2350 "movi v5.8b, #65 \n" // G * 0.5078 coefficient
2595 "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient 2351 "movi v6.8b, #13 \n" // B * 0.1016 coefficient
2596 "vmov.u8 d7, #16 \n" // Add 16 constant 2352 "movi v7.8b, #16 \n" // Add 16 constant
2597 ".p2align 2 \n"
2598 "1: \n" 2353 "1: \n"
2599 MEMACCESS(0) 2354 MEMACCESS(0)
2600 "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. 2355 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
2601 "subs %2, %2, #8 \n" // 8 processed per loop. 2356 "subs %2, %2, #8 \n" // 8 processed per loop.
2602 "vmull.u8 q8, d0, d4 \n" // B 2357 "umull v16.8h, v0.8b, v4.8b \n" // B
2603 "vmlal.u8 q8, d1, d5 \n" // G 2358 "umlal v16.8h, v1.8b, v5.8b \n" // G
2604 "vmlal.u8 q8, d2, d6 \n" // R 2359 "umlal v16.8h, v2.8b, v6.8b \n" // R
2605 "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y 2360 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
2606 "vqadd.u8 d0, d7 \n" 2361 "uqadd v0.8b, v0.8b, v7.8b \n"
2607 MEMACCESS(1) 2362 MEMACCESS(1)
2608 "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 2363 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2609 "bgt 1b \n" 2364 "b.gt 1b \n"
2610 : "+r"(src_raw), // %0 2365 : "+r"(src_raw), // %0
2611 "+r"(dst_y), // %1 2366 "+r"(dst_y), // %1
2612 "+r"(pix) // %2 2367 "+r"(pix) // %2
2613 : 2368 :
2614 : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" 2369 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2615 ); 2370 );
2616 } 2371 }
2617 #endif // HAS_RAWTOYROW_NEON 2372 #endif // HAS_RAWTOYROW_NEON
2618 2373
2619 // Bilinear filter 16x2 -> 16x1 2374 // Bilinear filter 16x2 -> 16x1
2620 #ifdef HAS_INTERPOLATEROW_NEON 2375 #ifdef HAS_INTERPOLATEROW_NEON
2621 void InterpolateRow_NEON(uint8* dst_ptr, 2376 void InterpolateRow_NEON(uint8* dst_ptr,
2622 const uint8* src_ptr, ptrdiff_t src_stride, 2377 const uint8* src_ptr, ptrdiff_t src_stride,
2623 int dst_width, int source_y_fraction) { 2378 int dst_width, int source_y_fraction) {
2379 int y1_fraction = source_y_fraction;
2380 int y0_fraction = 256 - y1_fraction;
2381 const uint8* src_ptr1 = src_ptr + src_stride;
2624 asm volatile ( 2382 asm volatile (
2625 "cmp %4, #0 \n" 2383 "cmp %4, #0 \n"
2626 "beq 100f \n" 2384 "b.eq 100f \n"
2627 "add %2, %1 \n"
2628 "cmp %4, #64 \n" 2385 "cmp %4, #64 \n"
2629 "beq 75f \n" 2386 "b.eq 75f \n"
2630 "cmp %4, #128 \n" 2387 "cmp %4, #128 \n"
2631 "beq 50f \n" 2388 "b.eq 50f \n"
2632 "cmp %4, #192 \n" 2389 "cmp %4, #192 \n"
2633 "beq 25f \n" 2390 "b.eq 25f \n"
2634 2391
2635 "vdup.8 d5, %4 \n" 2392 "dup v5.16b, %w4 \n"
2636 "rsb %4, #256 \n" 2393 "dup v4.16b, %w5 \n"
2637 "vdup.8 d4, %4 \n"
2638 // General purpose row blend. 2394 // General purpose row blend.
2639 "1: \n" 2395 "1: \n"
2640 MEMACCESS(1) 2396 MEMACCESS(1)
2641 "vld1.8 {q0}, [%1]! \n" 2397 "ld1 {v0.16b}, [%1], #16 \n"
2642 MEMACCESS(2) 2398 MEMACCESS(2)
2643 "vld1.8 {q1}, [%2]! \n" 2399 "ld1 {v1.16b}, [%2], #16 \n"
2644 "subs %3, %3, #16 \n" 2400 "subs %3, %3, #16 \n"
2645 "vmull.u8 q13, d0, d4 \n" 2401 "umull v2.8h, v0.8b, v4.8b \n"
2646 "vmull.u8 q14, d1, d4 \n" 2402 "umull2 v3.8h, v0.16b, v4.16b \n"
2647 "vmlal.u8 q13, d2, d5 \n" 2403 "umlal v2.8h, v1.8b, v5.8b \n"
2648 "vmlal.u8 q14, d3, d5 \n" 2404 "umlal2 v3.8h, v1.16b, v5.16b \n"
2649 "vrshrn.u16 d0, q13, #8 \n" 2405 "rshrn v0.8b, v2.8h, #8 \n"
2650 "vrshrn.u16 d1, q14, #8 \n" 2406 "rshrn2 v0.16b, v3.8h, #8 \n"
2651 MEMACCESS(0) 2407 MEMACCESS(0)
2652 "vst1.8 {q0}, [%0]! \n" 2408 "st1 {v0.16b}, [%0], #16 \n"
2653 "bgt 1b \n" 2409 "b.gt 1b \n"
2654 "b 99f \n" 2410 "b 99f \n"
2655 2411
2656 // Blend 25 / 75. 2412 // Blend 25 / 75.
2657 "25: \n" 2413 "25: \n"
2658 MEMACCESS(1) 2414 MEMACCESS(1)
2659 "vld1.8 {q0}, [%1]! \n" 2415 "ld1 {v0.16b}, [%1], #16 \n"
2660 MEMACCESS(2) 2416 MEMACCESS(2)
2661 "vld1.8 {q1}, [%2]! \n" 2417 "ld1 {v1.16b}, [%2], #16 \n"
2662 "subs %3, %3, #16 \n" 2418 "subs %3, %3, #16 \n"
2663 "vrhadd.u8 q0, q1 \n" 2419 "urhadd v0.16b, v0.16b, v1.16b \n"
2664 "vrhadd.u8 q0, q1 \n" 2420 "urhadd v0.16b, v0.16b, v1.16b \n"
2665 MEMACCESS(0) 2421 MEMACCESS(0)
2666 "vst1.8 {q0}, [%0]! \n" 2422 "st1 {v0.16b}, [%0], #16 \n"
2667 "bgt 25b \n" 2423 "b.gt 25b \n"
2668 "b 99f \n" 2424 "b 99f \n"
2669 2425
2670 // Blend 50 / 50. 2426 // Blend 50 / 50.
2671 "50: \n" 2427 "50: \n"
2672 MEMACCESS(1) 2428 MEMACCESS(1)
2673 "vld1.8 {q0}, [%1]! \n" 2429 "ld1 {v0.16b}, [%1], #16 \n"
2674 MEMACCESS(2) 2430 MEMACCESS(2)
2675 "vld1.8 {q1}, [%2]! \n" 2431 "ld1 {v1.16b}, [%2], #16 \n"
2676 "subs %3, %3, #16 \n" 2432 "subs %3, %3, #16 \n"
2677 "vrhadd.u8 q0, q1 \n" 2433 "urhadd v0.16b, v0.16b, v1.16b \n"
2678 MEMACCESS(0) 2434 MEMACCESS(0)
2679 "vst1.8 {q0}, [%0]! \n" 2435 "st1 {v0.16b}, [%0], #16 \n"
2680 "bgt 50b \n" 2436 "b.gt 50b \n"
2681 "b 99f \n" 2437 "b 99f \n"
2682 2438
2683 // Blend 75 / 25. 2439 // Blend 75 / 25.
2684 "75: \n" 2440 "75: \n"
2685 MEMACCESS(1) 2441 MEMACCESS(1)
2686 "vld1.8 {q1}, [%1]! \n" 2442 "ld1 {v1.16b}, [%1], #16 \n"
2687 MEMACCESS(2) 2443 MEMACCESS(2)
2688 "vld1.8 {q0}, [%2]! \n" 2444 "ld1 {v0.16b}, [%2], #16 \n"
2689 "subs %3, %3, #16 \n" 2445 "subs %3, %3, #16 \n"
2690 "vrhadd.u8 q0, q1 \n" 2446 "urhadd v0.16b, v0.16b, v1.16b \n"
2691 "vrhadd.u8 q0, q1 \n" 2447 "urhadd v0.16b, v0.16b, v1.16b \n"
2692 MEMACCESS(0) 2448 MEMACCESS(0)
2693 "vst1.8 {q0}, [%0]! \n" 2449 "st1 {v0.16b}, [%0], #16 \n"
2694 "bgt 75b \n" 2450 "b.gt 75b \n"
2695 "b 99f \n" 2451 "b 99f \n"
2696 2452
2697 // Blend 100 / 0 - Copy row unchanged. 2453 // Blend 100 / 0 - Copy row unchanged.
2698 "100: \n" 2454 "100: \n"
2699 MEMACCESS(1) 2455 MEMACCESS(1)
2700 "vld1.8 {q0}, [%1]! \n" 2456 "ld1 {v0.16b}, [%1], #16 \n"
2701 "subs %3, %3, #16 \n" 2457 "subs %3, %3, #16 \n"
2702 MEMACCESS(0) 2458 MEMACCESS(0)
2703 "vst1.8 {q0}, [%0]! \n" 2459 "st1 {v0.16b}, [%0], #16 \n"
2704 "bgt 100b \n" 2460 "b.gt 100b \n"
2705 2461
2706 "99: \n" 2462 "99: \n"
2707 : "+r"(dst_ptr), // %0 2463 : "+r"(dst_ptr), // %0
2708 "+r"(src_ptr), // %1 2464 "+r"(src_ptr), // %1
2709 "+r"(src_stride), // %2 2465 "+r"(src_ptr1), // %2
2710 "+r"(dst_width), // %3 2466 "+r"(dst_width), // %3
2711 "+r"(source_y_fraction) // %4 2467 "+r"(y1_fraction), // %4
2468 "+r"(y0_fraction) // %5
2712 : 2469 :
2713 : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14" 2470 : "cc", "memory", "v0", "v1", "v3", "v4", "v5"
2714 ); 2471 );
2715 } 2472 }
2716 #endif // HAS_INTERPOLATEROW_NEON 2473 #endif // HAS_INTERPOLATEROW_NEON
2717 2474
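The general path of InterpolateRow_NEON blends each byte of the two rows with weights y0_fraction = 256 - source_y_fraction and y1_fraction = source_y_fraction, then narrows with rounding by 8; the 64, 128 and 192 special cases use one or two rounding halving adds (urhadd) instead, the chained pair approximating a 3:1 weighting. A per-byte scalar sketch of the general case (names are illustrative):

// Scalar sketch of the general row blend in InterpolateRow_NEON.
static uint8 InterpolateByte(uint8 row0, uint8 row1, int source_y_fraction) {
  int f1 = source_y_fraction;   // weight of the second row, 0..255
  int f0 = 256 - f1;            // weight of the first row
  return (uint8)((row0 * f0 + row1 * f1 + 128) >> 8);  // rshrn #8 rounds
}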
2718 // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr 2475 // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
2719 #ifdef HAS_ARGBBLENDROW_NEON 2476 #ifdef HAS_ARGBBLENDROW_NEON
2720 void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, 2477 void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
2721 uint8* dst_argb, int width) { 2478 uint8* dst_argb, int width) {
2722 asm volatile ( 2479 asm volatile (
2723 "subs %3, #8 \n" 2480 "subs %3, %3, #8 \n"
2724 "blt 89f \n" 2481 "b.lt 89f \n"
2725 // Blend 8 pixels. 2482 // Blend 8 pixels.
2726 "8: \n" 2483 "8: \n"
2727 MEMACCESS(0) 2484 MEMACCESS(0)
2728 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. 2485 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels
2729 MEMACCESS(1) 2486 MEMACCESS(1)
2730 "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. 2487 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 pixels
2731 "subs %3, %3, #8 \n" // 8 processed per loop. 2488 "subs %3, %3, #8 \n" // 8 processed per loop.
2732 "vmull.u8 q10, d4, d3 \n" // db * a 2489 "umull v16.8h, v4.8b, v3.8b \n" // db * a
2733 "vmull.u8 q11, d5, d3 \n" // dg * a 2490 "umull v17.8h, v5.8b, v3.8b \n" // dg * a
2734 "vmull.u8 q12, d6, d3 \n" // dr * a 2491 "umull v18.8h, v6.8b, v3.8b \n" // dr * a
2735 "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 2492 "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
2736 "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 2493 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
2737 "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 2494 "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
2738 "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 2495 "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
2739 "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 2496 "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
2740 "vqadd.u8 q0, q0, q2 \n" // + sbg 2497 "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
2741 "vqadd.u8 d2, d2, d6 \n" // + sr 2498 "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
2742 "vmov.u8 d3, #255 \n" // a = 255 2499 "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
2500 "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
2501 "movi v3.8b, #255 \n" // a = 255
2743 MEMACCESS(2) 2502 MEMACCESS(2)
2744 "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. 2503 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
2745 "bge 8b \n" 2504 "b.ge 8b \n"
2746 2505
2747 "89: \n" 2506 "89: \n"
2748 "adds %3, #8-1 \n" 2507 "adds %3, %3, #8-1 \n"
2749 "blt 99f \n" 2508 "b.lt 99f \n"
2750 2509
2751 // Blend 1 pixels. 2510 // Blend 1 pixels.
2752 "1: \n" 2511 "1: \n"
2753 MEMACCESS(0) 2512 MEMACCESS(0)
2754 "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. 2513 "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
2755 MEMACCESS(1) 2514 MEMACCESS(1)
2756 "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. 2515 "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
2757 "subs %3, %3, #1 \n" // 1 processed per loop. 2516 "subs %3, %3, #1 \n" // 1 processed per loop.
2758 "vmull.u8 q10, d4, d3 \n" // db * a 2517 "umull v16.8h, v4.8b, v3.8b \n" // db * a
2759 "vmull.u8 q11, d5, d3 \n" // dg * a 2518 "umull v17.8h, v5.8b, v3.8b \n" // dg * a
2760 "vmull.u8 q12, d6, d3 \n" // dr * a 2519 "umull v18.8h, v6.8b, v3.8b \n" // dr * a
2761 "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 2520 "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
2762 "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 2521 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
2763 "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 2522 "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
2764 "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 2523 "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
2765 "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 2524 "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
2766 "vqadd.u8 q0, q0, q2 \n" // + sbg 2525 "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
2767 "vqadd.u8 d2, d2, d6 \n" // + sr 2526 "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
2768 "vmov.u8 d3, #255 \n" // a = 255 2527 "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
2528 "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
2529 "movi v3.8b, #255 \n" // a = 255
2769 MEMACCESS(2) 2530 MEMACCESS(2)
2770 "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel. 2531 "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel.
2771 "bge 1b \n" 2532 "b.ge 1b \n"
2772 2533
2773 "99: \n" 2534 "99: \n"
2774 2535
2775 : "+r"(src_argb0), // %0 2536 : "+r"(src_argb0), // %0
2776 "+r"(src_argb1), // %1 2537 "+r"(src_argb1), // %1
2777 "+r"(dst_argb), // %2 2538 "+r"(dst_argb), // %2
2778 "+r"(width) // %3 2539 "+r"(width) // %3
2779 : 2540 :
2780 : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12" 2541 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2542 "v16", "v17", "v18"
2781 ); 2543 );
2782 } 2544 }
2783 #endif // HAS_ARGBBLENDROW_NEON 2545 #endif // HAS_ARGBBLENDROW_NEON
2784 2546
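ARGBBlendRow_NEON applies the formula in the comment before it to each color channel: the destination channel is reduced by round(dst * sa / 256) with a saturating subtract, the source channel is then added with saturation, and the output alpha is forced to 255. A per-channel scalar sketch (helper name is not from the patch):

// Scalar sketch of one channel of the ARGB "over" blend above.
static uint8 BlendChannel(uint8 src, uint8 src_a, uint8 dst) {
  int scaled = (dst * src_a + 128) >> 8;  // umull + uqrshrn #8
  int d = dst - scaled;                   // uqsub would clamp at 0
  int out = src + d;                      // uqadd clamps at 255
  return (uint8)(out > 255 ? 255 : out);
}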
2785 // Attenuate 8 pixels at a time. 2547 // Attenuate 8 pixels at a time.
2786 #ifdef HAS_ARGBATTENUATEROW_NEON 2548 #ifdef HAS_ARGBATTENUATEROW_NEON
2787 void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { 2549 void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
2788 asm volatile ( 2550 asm volatile (
2789 // Attenuate 8 pixels. 2551 // Attenuate 8 pixels.
2790 "1: \n" 2552 "1: \n"
2791 MEMACCESS(0) 2553 MEMACCESS(0)
2792 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. 2554 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels
2793 "subs %2, %2, #8 \n" // 8 processed per loop. 2555 "subs %2, %2, #8 \n" // 8 processed per loop.
2794 "vmull.u8 q10, d0, d3 \n" // b * a 2556 "umull v4.8h, v0.8b, v3.8b \n" // b * a
2795 "vmull.u8 q11, d1, d3 \n" // g * a 2557 "umull v5.8h, v1.8b, v3.8b \n" // g * a
2796 "vmull.u8 q12, d2, d3 \n" // r * a 2558 "umull v6.8h, v2.8b, v3.8b \n" // r * a
2797 "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8 2559 "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8
2798 "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8 2560 "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8
2799 "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8 2561 "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
2800 MEMACCESS(1) 2562 MEMACCESS(1)
2801 "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. 2563 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
2802 "bgt 1b \n" 2564 "b.gt 1b \n"
2803 : "+r"(src_argb), // %0 2565 : "+r"(src_argb), // %0
2804 "+r"(dst_argb), // %1 2566 "+r"(dst_argb), // %1
2805 "+r"(width) // %2 2567 "+r"(width) // %2
2806 : 2568 :
2807 : "cc", "memory", "q0", "q1", "q10", "q11", "q12" 2569 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
2808 ); 2570 );
2809 } 2571 }
2810 #endif // HAS_ARGBATTENUATEROW_NEON 2572 #endif // HAS_ARGBATTENUATEROW_NEON
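A scalar sketch of ARGBAttenuateRow's per-pixel math, for reference while reviewing the NEON port (the helper name is illustrative, not libyuv's C fallback):

#include <stdint.h>

// Each of B, G, R is multiplied by the pixel's alpha and narrowed with
// rounding (uqrshrn #8): c = (c * a + 128) >> 8.  Alpha passes through.
static void AttenuatePixel_C(uint8_t bgra[4]) {
  uint32_t a = bgra[3];
  for (int i = 0; i < 3; ++i) {
    bgra[i] = (uint8_t)(((uint32_t)bgra[i] * a + 128) >> 8);
  }
}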
2811 2573
2812 // Quantize 8 ARGB pixels (32 bytes). 2574 // Quantize 8 ARGB pixels (32 bytes).
2813 // dst = (dst * scale >> 16) * interval_size + interval_offset; 2575 // dst = (dst * scale >> 16) * interval_size + interval_offset;
2814 #ifdef HAS_ARGBQUANTIZEROW_NEON 2576 #ifdef HAS_ARGBQUANTIZEROW_NEON
2815 void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, 2577 void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
2816 int interval_offset, int width) { 2578 int interval_offset, int width) {
2817 asm volatile ( 2579 asm volatile (
2818 "vdup.u16 q8, %2 \n" 2580 "dup v4.8h, %w2 \n"
2819 "vshr.u16 q8, q8, #1 \n" // scale >>= 1 2581 "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1
2820 "vdup.u16 q9, %3 \n" // interval multiply. 2582 "dup v5.8h, %w3 \n" // interval multiply.
2821 "vdup.u16 q10, %4 \n" // interval add 2583 "dup v6.8h, %w4 \n" // interval add
2822 2584
2823 // 8 pixel loop. 2585 // 8 pixel loop.
2824 ".p2align 2 \n"
2825 "1: \n" 2586 "1: \n"
2826 MEMACCESS(0) 2587 MEMACCESS(0)
2827 "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. 2588 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of ARGB.
2828 "subs %1, %1, #8 \n" // 8 processed per loop. 2589 "subs %1, %1, #8 \n" // 8 processed per loop.
2829 "vmovl.u8 q0, d0 \n" // b (0 .. 255) 2590 "uxtl v0.8h, v0.8b \n" // b (0 .. 255)
2830 "vmovl.u8 q1, d2 \n" 2591 "uxtl v1.8h, v1.8b \n"
2831 "vmovl.u8 q2, d4 \n" 2592 "uxtl v2.8h, v2.8b \n"
2832 "vqdmulh.s16 q0, q0, q8 \n" // b * scale 2593 "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale
2833 "vqdmulh.s16 q1, q1, q8 \n" // g 2594 "sqdmulh v1.8h, v1.8h, v4.8h \n" // g
2834 "vqdmulh.s16 q2, q2, q8 \n" // r 2595 "sqdmulh v2.8h, v2.8h, v4.8h \n" // r
2835 "vmul.u16 q0, q0, q9 \n" // b * interval_size 2596 "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size
2836 "vmul.u16 q1, q1, q9 \n" // g 2597 "mul v1.8h, v1.8h, v5.8h \n" // g
2837 "vmul.u16 q2, q2, q9 \n" // r 2598 "mul v2.8h, v2.8h, v5.8h \n" // r
2838 "vadd.u16 q0, q0, q10 \n" // b + interval_offset 2599 "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset
2839 "vadd.u16 q1, q1, q10 \n" // g 2600 "add v1.8h, v1.8h, v6.8h \n" // g
2840 "vadd.u16 q2, q2, q10 \n" // r 2601 "add v2.8h, v2.8h, v6.8h \n" // r
2841 "vqmovn.u16 d0, q0 \n" 2602 "uqxtn v0.8b, v0.8h \n"
2842 "vqmovn.u16 d2, q1 \n" 2603 "uqxtn v1.8b, v1.8h \n"
2843 "vqmovn.u16 d4, q2 \n" 2604 "uqxtn v2.8b, v2.8h \n"
2844 MEMACCESS(0) 2605 MEMACCESS(0)
2845 "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB. 2606 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB pixels
2846 "bgt 1b \n" 2607 "b.gt 1b \n"
2847 : "+r"(dst_argb), // %0 2608 : "+r"(dst_argb), // %0
2848 "+r"(width) // %1 2609 "+r"(width) // %1
2849 : "r"(scale), // %2 2610 : "r"(scale), // %2
2850 "r"(interval_size), // %3 2611 "r"(interval_size), // %3
2851 "r"(interval_offset) // %4 2612 "r"(interval_offset) // %4
2852 : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10" 2613 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
2853 ); 2614 );
2854 } 2615 }
2855 #endif // HAS_ARGBQUANTIZEROW_NEON 2616 #endif // HAS_ARGBQUANTIZEROW_NEON
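The quantize formula quoted in the comment above, written out as a scalar C sketch (illustrative name; the NEON path approximates the >> 16 by applying sqdmulh to scale / 2):

#include <stdint.h>

// dst = (dst * scale >> 16) * interval_size + interval_offset for B, G, R;
// alpha is stored back unchanged, as in the NEON loop.
static void QuantizePixel_C(uint8_t bgra[4], int scale, int interval_size,
                            int interval_offset) {
  for (int i = 0; i < 3; ++i) {
    int v = ((bgra[i] * scale) >> 16) * interval_size + interval_offset;
    bgra[i] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));  // uqxtn saturates
  }
}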
2856 2617
2857 // Shade 8 pixels at a time by specified value. 2618 // Shade 8 pixels at a time by specified value.
2858 // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8. 2619 // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
2859 // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. 2620 // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
2860 #ifdef HAS_ARGBSHADEROW_NEON 2621 #ifdef HAS_ARGBSHADEROW_NEON
2861 void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, 2622 void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
2862 uint32 value) { 2623 uint32 value) {
2863 asm volatile ( 2624 asm volatile (
2864 "vdup.u32 q0, %3 \n" // duplicate scale value. 2625 "dup v0.4s, %w3 \n" // duplicate scale value.
2865 "vzip.u8 d0, d1 \n" // d0 aarrggbb. 2626 "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb.
2866 "vshr.u16 q0, q0, #1 \n" // scale / 2. 2627 "ushr v0.8h, v0.8h, #1 \n" // scale / 2.
2867 2628
2868 // 8 pixel loop. 2629 // 8 pixel loop.
2869 ".p2align 2 \n"
2870 "1: \n" 2630 "1: \n"
2871 MEMACCESS(0) 2631 MEMACCESS(0)
2872 "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. 2632 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB pixels.
2873 "subs %2, %2, #8 \n" // 8 processed per loop. 2633 "subs %2, %2, #8 \n" // 8 processed per loop.
2874 "vmovl.u8 q10, d20 \n" // b (0 .. 255) 2634 "uxtl v4.8h, v4.8b \n" // b (0 .. 255)
2875 "vmovl.u8 q11, d22 \n" 2635 "uxtl v5.8h, v5.8b \n"
2876 "vmovl.u8 q12, d24 \n" 2636 "uxtl v6.8h, v6.8b \n"
2877 "vmovl.u8 q13, d26 \n" 2637 "uxtl v7.8h, v7.8b \n"
2878 "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2 2638 "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2
2879 "vqrdmulh.s16 q11, q11, d0[1] \n" // g 2639 "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g
2880 "vqrdmulh.s16 q12, q12, d0[2] \n" // r 2640 "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r
2881 "vqrdmulh.s16 q13, q13, d0[3] \n" // a 2641 "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a
2882 "vqmovn.u16 d20, q10 \n" 2642 "uqxtn v4.8b, v4.8h \n"
2883 "vqmovn.u16 d22, q11 \n" 2643 "uqxtn v5.8b, v5.8h \n"
2884 "vqmovn.u16 d24, q12 \n" 2644 "uqxtn v6.8b, v6.8h \n"
2885 "vqmovn.u16 d26, q13 \n" 2645 "uqxtn v7.8b, v7.8h \n"
2886 MEMACCESS(1) 2646 MEMACCESS(1)
2887 "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB. 2647 "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB pixels
2888 "bgt 1b \n" 2648 "b.gt 1b \n"
2889 : "+r"(src_argb), // %0 2649 : "+r"(src_argb), // %0
2890 "+r"(dst_argb), // %1 2650 "+r"(dst_argb), // %1
2891 "+r"(width) // %2 2651 "+r"(width) // %2
2892 : "r"(value) // %3 2652 : "r"(value) // %3
2893 : "cc", "memory", "q0", "q10", "q11", "q12", "q13" 2653 : "cc", "memory", "v0", "v4", "v5", "v6", "v7"
2894 ); 2654 );
2895 } 2655 }
2896 #endif // HAS_ARGBSHADEROW_NEON 2656 #endif // HAS_ARGBSHADEROW_NEON
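A scalar approximation of ARGBShadeRow, for reference (hypothetical helper; the sqrdmulh rounding in the NEON code can differ in the low bit):

#include <stdint.h>

// Each channel is scaled by the matching byte of 'value', roughly c * v / 255.
// The NEON code builds v * 0x0101, halves it, then uses sqrdmulh (doubling,
// rounding multiply-high); mirrored here as (c * v * 0x0101 + 0x8000) >> 16.
static void ShadePixel_C(uint8_t bgra[4], uint32_t value) {
  for (int i = 0; i < 4; ++i) {
    uint32_t v = (value >> (8 * i)) & 0xff;
    uint32_t c = ((uint32_t)bgra[i] * v * 0x0101 + 0x8000) >> 16;
    bgra[i] = (uint8_t)(c > 255 ? 255 : c);
  }
}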
2897 2657
2898 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels 2658 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
2899 // Similar to ARGBToYJ but stores ARGB. 2659 // Similar to ARGBToYJ but stores ARGB.
2900 // C code is (15 * b + 75 * g + 38 * r + 64) >> 7; 2660 // C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
2901 #ifdef HAS_ARGBGRAYROW_NEON 2661 #ifdef HAS_ARGBGRAYROW_NEON
2902 void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { 2662 void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
2903 asm volatile ( 2663 asm volatile (
2904 "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient 2664 "movi v24.8b, #15 \n" // B * 0.11400 coefficient
2905 "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient 2665 "movi v25.8b, #75 \n" // G * 0.58700 coefficient
2906 "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient 2666 "movi v26.8b, #38 \n" // R * 0.29900 coefficient
2907 ".p2align 2 \n"
2908 "1: \n" 2667 "1: \n"
2909 MEMACCESS(0) 2668 MEMACCESS(0)
2910 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. 2669 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
2911 "subs %2, %2, #8 \n" // 8 processed per loop. 2670 "subs %2, %2, #8 \n" // 8 processed per loop.
2912 "vmull.u8 q2, d0, d24 \n" // B 2671 "umull v4.8h, v0.8b, v24.8b \n" // B
2913 "vmlal.u8 q2, d1, d25 \n" // G 2672 "umlal v4.8h, v1.8b, v25.8b \n" // G
2914 "vmlal.u8 q2, d2, d26 \n" // R 2673 "umlal v4.8h, v2.8b, v26.8b \n" // R
2915 "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B 2674 "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B
2916 "vmov d1, d0 \n" // G 2675 "orr v1.8b, v0.8b, v0.8b \n" // G
2917 "vmov d2, d0 \n" // R 2676 "orr v2.8b, v0.8b, v0.8b \n" // R
2918 MEMACCESS(1) 2677 MEMACCESS(1)
2919 "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels. 2678 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
2920 "bgt 1b \n" 2679 "b.gt 1b \n"
2921 : "+r"(src_argb), // %0 2680 : "+r"(src_argb), // %0
2922 "+r"(dst_argb), // %1 2681 "+r"(dst_argb), // %1
2923 "+r"(width) // %2 2682 "+r"(width) // %2
2924 : 2683 :
2925 : "cc", "memory", "q0", "q1", "q2", "q12", "q13" 2684 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"
2926 ); 2685 );
2927 } 2686 }
2928 #endif // HAS_ARGBGRAYROW_NEON 2687 #endif // HAS_ARGBGRAYROW_NEON
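For reference, the gray formula from the comment above as scalar C (illustrative name):

#include <stdint.h>

// y = (15 * b + 75 * g + 38 * r + 64) >> 7, replicated into B, G and R;
// alpha (bgra[3]) is stored unchanged.
static void GrayPixel_C(uint8_t bgra[4]) {
  uint32_t y = (15 * bgra[0] + 75 * bgra[1] + 38 * bgra[2] + 64) >> 7;
  bgra[0] = bgra[1] = bgra[2] = (uint8_t)y;
}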
2929 2688
2930 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. 2689 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
2931 // b = (r * 35 + g * 68 + b * 17) >> 7 2690 // b = (r * 35 + g * 68 + b * 17) >> 7
2932 // g = (r * 45 + g * 88 + b * 22) >> 7 2691 // g = (r * 45 + g * 88 + b * 22) >> 7
2933 // r = (r * 50 + g * 98 + b * 24) >> 7 2692 // r = (r * 50 + g * 98 + b * 24) >> 7
2934 2693
2935 #ifdef HAS_ARGBSEPIAROW_NEON 2694 #ifdef HAS_ARGBSEPIAROW_NEON
2936 void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { 2695 void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
2937 asm volatile ( 2696 asm volatile (
2938 "vmov.u8 d20, #17 \n" // BB coefficient 2697 "movi v20.8b, #17 \n" // BB coefficient
2939 "vmov.u8 d21, #68 \n" // BG coefficient 2698 "movi v21.8b, #68 \n" // BG coefficient
2940 "vmov.u8 d22, #35 \n" // BR coefficient 2699 "movi v22.8b, #35 \n" // BR coefficient
2941 "vmov.u8 d24, #22 \n" // GB coefficient 2700 "movi v24.8b, #22 \n" // GB coefficient
2942 "vmov.u8 d25, #88 \n" // GG coefficient 2701 "movi v25.8b, #88 \n" // GG coefficient
2943 "vmov.u8 d26, #45 \n" // GR coefficient 2702 "movi v26.8b, #45 \n" // GR coefficient
2944 "vmov.u8 d28, #24 \n" // BB coefficient 2703 "movi v28.8b, #24 \n" // BB coefficient
2945 "vmov.u8 d29, #98 \n" // BG coefficient 2704 "movi v29.8b, #98 \n" // BG coefficient
2946 "vmov.u8 d30, #50 \n" // BR coefficient 2705 "movi v30.8b, #50 \n" // BR coefficient
2947 ".p2align 2 \n"
2948 "1: \n" 2706 "1: \n"
2949 MEMACCESS(0) 2707 MEMACCESS(0)
2950 "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. 2708 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
2951 "subs %1, %1, #8 \n" // 8 processed per loop. 2709 "subs %1, %1, #8 \n" // 8 processed per loop.
2952 "vmull.u8 q2, d0, d20 \n" // B to Sepia B 2710 "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
2953 "vmlal.u8 q2, d1, d21 \n" // G 2711 "umlal v4.8h, v1.8b, v21.8b \n" // G
2954 "vmlal.u8 q2, d2, d22 \n" // R 2712 "umlal v4.8h, v2.8b, v22.8b \n" // R
2955 "vmull.u8 q3, d0, d24 \n" // B to Sepia G 2713 "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
2956 "vmlal.u8 q3, d1, d25 \n" // G 2714 "umlal v5.8h, v1.8b, v25.8b \n" // G
2957 "vmlal.u8 q3, d2, d26 \n" // R 2715 "umlal v5.8h, v2.8b, v26.8b \n" // R
2958 "vmull.u8 q8, d0, d28 \n" // B to Sepia R 2716 "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
2959 "vmlal.u8 q8, d1, d29 \n" // G 2717 "umlal v6.8h, v1.8b, v29.8b \n" // G
2960 "vmlal.u8 q8, d2, d30 \n" // R 2718 "umlal v6.8h, v2.8b, v30.8b \n" // R
2961 "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B 2719 "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B
2962 "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G 2720 "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
2963 "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R 2721 "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
2964 MEMACCESS(0) 2722 MEMACCESS(0)
2965 "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels. 2723 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
2966 "bgt 1b \n" 2724 "b.gt 1b \n"
2967 : "+r"(dst_argb), // %0 2725 : "+r"(dst_argb), // %0
2968 "+r"(width) // %1 2726 "+r"(width) // %1
2969 : 2727 :
2970 : "cc", "memory", "q0", "q1", "q2", "q3", 2728 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2971 "q10", "q11", "q12", "q13", "q14", "q15" 2729 "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"
2972 ); 2730 );
2973 } 2731 }
2974 #endif // HAS_ARGBSEPIAROW_NEON 2732 #endif // HAS_ARGBSEPIAROW_NEON
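The three sepia formulas above as a scalar sketch (hypothetical helpers; uqshrn #7 in the NEON code saturates, so the clamp is kept here as well):

#include <stdint.h>

static uint8_t SepiaClamp255(uint32_t v) { return v > 255 ? 255 : (uint8_t)v; }

// In-place sepia of one B,G,R,A pixel; alpha is untouched.
static void SepiaPixel_C(uint8_t bgra[4]) {
  uint32_t b = bgra[0], g = bgra[1], r = bgra[2];
  bgra[0] = SepiaClamp255((r * 35 + g * 68 + b * 17) >> 7);  // new B
  bgra[1] = SepiaClamp255((r * 45 + g * 88 + b * 22) >> 7);  // new G
  bgra[2] = SepiaClamp255((r * 50 + g * 98 + b * 24) >> 7);  // new R
}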
2975 2733
2976 // Transform 8 ARGB pixels (32 bytes) with color matrix. 2734 // Transform 8 ARGB pixels (32 bytes) with color matrix.
2977 // TODO(fbarchard): Was same as Sepia except matrix is provided. This function 2735 // TODO(fbarchard): Was same as Sepia except matrix is provided. This function
2978 // needs to saturate. Consider doing a non-saturating version. 2736 // needs to saturate. Consider doing a non-saturating version.
2979 #ifdef HAS_ARGBCOLORMATRIXROW_NEON 2737 #ifdef HAS_ARGBCOLORMATRIXROW_NEON
2980 void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, 2738 void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
2981 const int8* matrix_argb, int width) { 2739 const int8* matrix_argb, int width) {
2982 asm volatile ( 2740 asm volatile (
2983 MEMACCESS(3) 2741 MEMACCESS(3)
2984 "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. 2742 "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors.
2985 "vmovl.s8 q0, d4 \n" // B,G coefficients s16. 2743 "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
2986 "vmovl.s8 q1, d5 \n" // R,A coefficients s16. 2744 "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
2987 2745
2988 ".p2align 2 \n"
2989 "1: \n" 2746 "1: \n"
2990 MEMACCESS(0) 2747 MEMACCESS(0)
2991 "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. 2748 "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 pixels.
2992 "subs %2, %2, #8 \n" // 8 processed per loop. 2749 "subs %2, %2, #8 \n" // 8 processed per loop.
2993 "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit 2750 "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
2994 "vmovl.u8 q9, d18 \n" // g 2751 "uxtl v17.8h, v17.8b \n" // g
2995 "vmovl.u8 q10, d20 \n" // r 2752 "uxtl v18.8h, v18.8b \n" // r
2996 "vmovl.u8 q15, d22 \n" // a 2753 "uxtl v19.8h, v19.8b \n" // a
2997 "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B 2754 "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B
2998 "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G 2755 "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G
2999 "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R 2756 "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R
3000 "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A 2757 "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A
3001 "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B 2758 "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B
3002 "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G 2759 "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G
3003 "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R 2760 "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R
3004 "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A 2761 "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A
3005 "vqadd.s16 q12, q12, q4 \n" // Accumulate B 2762 "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
3006 "vqadd.s16 q13, q13, q5 \n" // Accumulate G 2763 "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
3007 "vqadd.s16 q14, q14, q6 \n" // Accumulate R 2764 "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
3008 "vqadd.s16 q15, q15, q7 \n" // Accumulate A 2765 "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
3009 "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B 2766 "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B
3010 "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G 2767 "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G
3011 "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R 2768 "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R
3012 "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A 2769 "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A
3013 "vqadd.s16 q12, q12, q4 \n" // Accumulate B 2770 "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
3014 "vqadd.s16 q13, q13, q5 \n" // Accumulate G 2771 "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
3015 "vqadd.s16 q14, q14, q6 \n" // Accumulate R 2772 "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
3016 "vqadd.s16 q15, q15, q7 \n" // Accumulate A 2773 "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
3017 "vmul.s16 q4, q15, d0[3] \n" // B += A * Matrix B 2774 "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B
3018 "vmul.s16 q5, q15, d1[3] \n" // G += A * Matrix G 2775 "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G
3019 "vmul.s16 q6, q15, d2[3] \n" // R += A * Matrix R 2776 "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R
3020 "vmul.s16 q7, q15, d3[3] \n" // A += A * Matrix A 2777 "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A
3021 "vqadd.s16 q12, q12, q4 \n" // Accumulate B 2778 "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
3022 "vqadd.s16 q13, q13, q5 \n" // Accumulate G 2779 "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
3023 "vqadd.s16 q14, q14, q6 \n" // Accumulate R 2780 "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
3024 "vqadd.s16 q15, q15, q7 \n" // Accumulate A 2781 "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
3025 "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B 2782 "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B
3026 "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G 2783 "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
3027 "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R 2784 "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
3028 "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A 2785 "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
3029 MEMACCESS(1) 2786 MEMACCESS(1)
3030 "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels. 2787 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 pixels.
3031 "bgt 1b \n" 2788 "b.gt 1b \n"
3032 : "+r"(src_argb), // %0 2789 : "+r"(src_argb), // %0
3033 "+r"(dst_argb), // %1 2790 "+r"(dst_argb), // %1
3034 "+r"(width) // %2 2791 "+r"(width) // %2
3035 : "r"(matrix_argb) // %3 2792 : "r"(matrix_argb) // %3
3036 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", 2793 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17" ,
3037 "q10", "q11", "q12", "q13", "q14", "q15" 2794 "v18", "v19", "v22", "v23", "v24", "v25"
3038 ); 2795 );
3039 } 2796 }
3040 #endif // HAS_ARGBCOLORMATRIXROW_NEON 2797 #endif // HAS_ARGBCOLORMATRIXROW_NEON
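A scalar sketch of the color-matrix transform, useful when checking the v0.h[n]/v1.h[n] lane picks above (illustrative name; it ignores the intermediate 16-bit saturation that the NEON sqadd steps apply):

#include <stdint.h>

// matrix_argb holds 16 signed bytes: one 4-coefficient row per output
// channel, rows in B, G, R, A order.  Results are shifted down by 6 bits
// and saturated to 0..255, matching sqshrun #6.
static void ColorMatrixPixel_C(const uint8_t src[4], uint8_t dst[4],
                               const int8_t matrix_argb[16]) {
  for (int j = 0; j < 4; ++j) {
    int32_t sum = 0;
    for (int i = 0; i < 4; ++i) {
      sum += (int32_t)src[i] * matrix_argb[j * 4 + i];
    }
    int32_t v = sum >> 6;  // arithmetic shift assumed
    dst[j] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }
}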
3041 2798
3042 // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. 2799 // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
3043 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. 2800 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
3044 #ifdef HAS_ARGBMULTIPLYROW_NEON 2801 #ifdef HAS_ARGBMULTIPLYROW_NEON
3045 void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, 2802 void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
3046 uint8* dst_argb, int width) { 2803 uint8* dst_argb, int width) {
3047 asm volatile ( 2804 asm volatile (
3048 // 8 pixel loop. 2805 // 8 pixel loop.
3049 ".p2align 2 \n"
3050 "1: \n" 2806 "1: \n"
3051 MEMACCESS(0) 2807 MEMACCESS(0)
3052 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 2808 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
3053 MEMACCESS(1) 2809 MEMACCESS(1)
3054 "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels. 2810 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
3055 "subs %3, %3, #8 \n" // 8 processed per loop. 2811 "subs %3, %3, #8 \n" // 8 processed per loop.
3056 "umull v0.8h, v0.8b, v4.8b \n" // multiply B 2812 "umull v0.8h, v0.8b, v4.8b \n" // multiply B
3057 "umull v1.8h, v1.8b, v5.8b \n" // multiply G 2813 "umull v1.8h, v1.8b, v5.8b \n" // multiply G
3058 "umull v2.8h, v2.8b, v6.8b \n" // multiply R 2814 "umull v2.8h, v2.8b, v6.8b \n" // multiply R
3059 "umull v3.8h, v3.8b, v7.8b \n" // multiply A 2815 "umull v3.8h, v3.8b, v7.8b \n" // multiply A
3060 "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B 2816 "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B
3061 "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G 2817 "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G
3062 "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R 2818 "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
3063 "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A 2819 "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
3064 MEMACCESS(2) 2820 MEMACCESS(2)
3065 "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. 2821 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
3066 "bgt 1b \n" 2822 "b.gt 1b \n"
3067 2823
3068 : "+r"(src_argb0), // %0 2824 : "+r"(src_argb0), // %0
3069 "+r"(src_argb1), // %1 2825 "+r"(src_argb1), // %1
3070 "+r"(dst_argb), // %2 2826 "+r"(dst_argb), // %2
3071 "+r"(width) // %3 2827 "+r"(width) // %3
3072 : 2828 :
3073 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" 2829 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
3074 ); 2830 );
3075 } 2831 }
3076 #endif // HAS_ARGBMULTIPLYROW_NEON 2832 #endif // HAS_ARGBMULTIPLYROW_NEON
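For reference, the per-channel math of ARGBMultiplyRow as a scalar sketch (illustrative name):

#include <stdint.h>

// Widen, multiply, then narrow with rounding (rshrn #8): roughly
// p0 * p1 / 256 per channel, alpha included.
static void MultiplyPixel_C(const uint8_t p0[4], const uint8_t p1[4],
                            uint8_t dst[4]) {
  for (int i = 0; i < 4; ++i) {
    dst[i] = (uint8_t)(((uint32_t)p0[i] * p1[i] + 128) >> 8);
  }
}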
3077 2833
3078 // Add 2 rows of ARGB pixels together, 8 pixels at a time. 2834 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
3079 #ifdef HAS_ARGBADDROW_NEON 2835 #ifdef HAS_ARGBADDROW_NEON
3080 void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, 2836 void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
3081 uint8* dst_argb, int width) { 2837 uint8* dst_argb, int width) {
3082 asm volatile ( 2838 asm volatile (
3083 // 8 pixel loop. 2839 // 8 pixel loop.
3084 ".p2align 2 \n"
3085 "1: \n" 2840 "1: \n"
3086 MEMACCESS(0) 2841 MEMACCESS(0)
3087 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 2842 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
3088 MEMACCESS(1) 2843 MEMACCESS(1)
3089 "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels. 2844 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
3090 "subs %3, %3, #8 \n" // 8 processed per loop. 2845 "subs %3, %3, #8 \n" // 8 processed per loop.
3091 "uqadd v0.8b, v0.8b, v4.8b \n" 2846 "uqadd v0.8b, v0.8b, v4.8b \n"
3092 "uqadd v1.8b, v1.8b, v5.8b \n" 2847 "uqadd v1.8b, v1.8b, v5.8b \n"
3093 "uqadd v2.8b, v2.8b, v6.8b \n" 2848 "uqadd v2.8b, v2.8b, v6.8b \n"
3094 "uqadd v3.8b, v3.8b, v7.8b \n" 2849 "uqadd v3.8b, v3.8b, v7.8b \n"
3095 MEMACCESS(2) 2850 MEMACCESS(2)
3096 "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. 2851 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
3097 "bgt 1b \n" 2852 "b.gt 1b \n"
3098 2853
3099 : "+r"(src_argb0), // %0 2854 : "+r"(src_argb0), // %0
3100 "+r"(src_argb1), // %1 2855 "+r"(src_argb1), // %1
3101 "+r"(dst_argb), // %2 2856 "+r"(dst_argb), // %2
3102 "+r"(width) // %3 2857 "+r"(width) // %3
3103 : 2858 :
3104 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" 2859 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
3105 ); 2860 );
3106 } 2861 }
3107 #endif // HAS_ARGBADDROW_NEON 2862 #endif // HAS_ARGBADDROW_NEON
3108 2863
3109 // Subtract 2 rows of ARGB pixels, 8 pixels at a time. 2864 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
3110 #ifdef HAS_ARGBSUBTRACTROW_NEON 2865 #ifdef HAS_ARGBSUBTRACTROW_NEON
3111 void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, 2866 void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
3112 uint8* dst_argb, int width) { 2867 uint8* dst_argb, int width) {
3113 asm volatile ( 2868 asm volatile (
3114 // 8 pixel loop. 2869 // 8 pixel loop.
3115 ".p2align 2 \n"
3116 "1: \n" 2870 "1: \n"
3117 MEMACCESS(0) 2871 MEMACCESS(0)
3118 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 2872 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
3119 MEMACCESS(1) 2873 MEMACCESS(1)
3120 "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels. 2874 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
3121 "subs %3, %3, #8 \n" // 8 processed per loop. 2875 "subs %3, %3, #8 \n" // 8 processed per loop.
3122 "uqsub v0.8b, v0.8b, v4.8b \n" 2876 "uqsub v0.8b, v0.8b, v4.8b \n"
3123 "uqsub v1.8b, v1.8b, v5.8b \n" 2877 "uqsub v1.8b, v1.8b, v5.8b \n"
3124 "uqsub v2.8b, v2.8b, v6.8b \n" 2878 "uqsub v2.8b, v2.8b, v6.8b \n"
3125 "uqsub v3.8b, v3.8b, v7.8b \n" 2879 "uqsub v3.8b, v3.8b, v7.8b \n"
3126 MEMACCESS(2) 2880 MEMACCESS(2)
3127 "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. 2881 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
3128 "bgt 1b \n" 2882 "b.gt 1b \n"
3129 2883
3130 : "+r"(src_argb0), // %0 2884 : "+r"(src_argb0), // %0
3131 "+r"(src_argb1), // %1 2885 "+r"(src_argb1), // %1
3132 "+r"(dst_argb), // %2 2886 "+r"(dst_argb), // %2
3133 "+r"(width) // %3 2887 "+r"(width) // %3
3134 : 2888 :
3135 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" 2889 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
3136 ); 2890 );
3137 } 2891 }
3138 #endif // HAS_ARGBSUBTRACTROW_NEON 2892 #endif // HAS_ARGBSUBTRACTROW_NEON
3139 2893
3140 // Adds Sobel X and Sobel Y and stores Sobel into ARGB. 2894 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
3141 // A = 255 2895 // A = 255
3142 // R = Sobel 2896 // R = Sobel
3143 // G = Sobel 2897 // G = Sobel
3144 // B = Sobel 2898 // B = Sobel
3145 #ifdef HAS_SOBELROW_NEON 2899 #ifdef HAS_SOBELROW_NEON
3146 void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, 2900 void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
3147 uint8* dst_argb, int width) { 2901 uint8* dst_argb, int width) {
3148 asm volatile ( 2902 asm volatile (
3149 "movi v3.8b, #255 \n" // alpha 2903 "movi v3.8b, #255 \n" // alpha
3150 // 8 pixel loop. 2904 // 8 pixel loop.
3151 ".p2align 2 \n"
3152 "1: \n" 2905 "1: \n"
3153 MEMACCESS(0) 2906 MEMACCESS(0)
3154 "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. 2907 "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
3155 MEMACCESS(1) 2908 MEMACCESS(1)
3156 "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. 2909 "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
3157 "subs %3, %3, #8 \n" // 8 processed per loop. 2910 "subs %3, %3, #8 \n" // 8 processed per loop.
3158 "uqadd v0.8b, v0.8b, v1.8b \n" // add 2911 "uqadd v0.8b, v0.8b, v1.8b \n" // add
3159 "mov v1.8b, v0.8b \n" 2912 "orr v1.8b, v0.8b, v0.8b \n"
3160 "mov v2.8b, v0.8b \n" 2913 "orr v2.8b, v0.8b, v0.8b \n"
3161 MEMACCESS(2) 2914 MEMACCESS(2)
3162 "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. 2915 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
3163 "bgt 1b \n" 2916 "b.gt 1b \n"
3164 : "+r"(src_sobelx), // %0 2917 : "+r"(src_sobelx), // %0
3165 "+r"(src_sobely), // %1 2918 "+r"(src_sobely), // %1
3166 "+r"(dst_argb), // %2 2919 "+r"(dst_argb), // %2
3167 "+r"(width) // %3 2920 "+r"(width) // %3
3168 : 2921 :
3169 : "cc", "memory", "v0", "v1", "v2", "v3" 2922 : "cc", "memory", "v0", "v1", "v2", "v3"
3170 ); 2923 );
3171 } 2924 }
3172 #endif // HAS_SOBELROW_NEON 2925 #endif // HAS_SOBELROW_NEON
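A scalar sketch of SobelRow's output packing (hypothetical helper; byte order is B,G,R,A):

#include <stdint.h>

// Saturating add of the two gradients, replicated into B, G and R, with A = 255.
static void SobelPixel_C(uint8_t sobelx, uint8_t sobely, uint8_t argb[4]) {
  uint32_t s = (uint32_t)sobelx + sobely;
  uint8_t v = s > 255 ? 255 : (uint8_t)s;
  argb[0] = v;
  argb[1] = v;
  argb[2] = v;
  argb[3] = 255;
}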
3173 2926
3174 // Adds Sobel X and Sobel Y and stores Sobel into plane. 2927 // Adds Sobel X and Sobel Y and stores Sobel into plane.
3175 #ifdef HAS_SOBELTOPLANEROW_NEON 2928 #ifdef HAS_SOBELTOPLANEROW_NEON
3176 void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, 2929 void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
3177 uint8* dst_y, int width) { 2930 uint8* dst_y, int width) {
3178 asm volatile ( 2931 asm volatile (
3179 // 16 pixel loop. 2932 // 16 pixel loop.
3180 ".p2align 2 \n"
3181 "1: \n" 2933 "1: \n"
3182 MEMACCESS(0) 2934 MEMACCESS(0)
3183 "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. 2935 "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
3184 MEMACCESS(1) 2936 MEMACCESS(1)
3185 "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. 2937 "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
3186 "subs %3, %3, #16 \n" // 16 processed per loop. 2938 "subs %3, %3, #16 \n" // 16 processed per loop.
3187 "uqadd v0.16b, v0.16b, v1.16b \n" // add 2939 "uqadd v0.16b, v0.16b, v1.16b \n" // add
3188 MEMACCESS(2) 2940 MEMACCESS(2)
3189 "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. 2941 "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
3190 "bgt 1b \n" 2942 "b.gt 1b \n"
3191 : "+r"(src_sobelx), // %0 2943 : "+r"(src_sobelx), // %0
3192 "+r"(src_sobely), // %1 2944 "+r"(src_sobely), // %1
3193 "+r"(dst_y), // %2 2945 "+r"(dst_y), // %2
3194 "+r"(width) // %3 2946 "+r"(width) // %3
3195 : 2947 :
3196 : "cc", "memory", "v0", "v1" 2948 : "cc", "memory", "v0", "v1"
3197 ); 2949 );
3198 } 2950 }
3199 #endif // HAS_SOBELTOPLANEROW_NEON 2951 #endif // HAS_SOBELTOPLANEROW_NEON
3200 2952
3201 // Mixes Sobel X, Sobel Y and Sobel into ARGB. 2953 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
3202 // A = 255 2954 // A = 255
3203 // R = Sobel X 2955 // R = Sobel X
3204 // G = Sobel 2956 // G = Sobel
3205 // B = Sobel Y 2957 // B = Sobel Y
3206 #ifdef HAS_SOBELXYROW_NEON 2958 #ifdef HAS_SOBELXYROW_NEON
3207 void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, 2959 void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
3208 uint8* dst_argb, int width) { 2960 uint8* dst_argb, int width) {
3209 asm volatile ( 2961 asm volatile (
3210 "movi v3.8b, #255 \n" // alpha 2962 "movi v3.8b, #255 \n" // alpha
3211 // 8 pixel loop. 2963 // 8 pixel loop.
3212 ".p2align 2 \n"
3213 "1: \n" 2964 "1: \n"
3214 MEMACCESS(0) 2965 MEMACCESS(0)
3215 "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. 2966 "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
3216 MEMACCESS(1) 2967 MEMACCESS(1)
3217 "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. 2968 "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
3218 "subs %3, %3, #8 \n" // 8 processed per loop. 2969 "subs %3, %3, #8 \n" // 8 processed per loop.
3219 "uqadd v1.8b, v0.8b, v2.8b \n" // add 2970 "uqadd v1.8b, v0.8b, v2.8b \n" // add
3220 MEMACCESS(2) 2971 MEMACCESS(2)
3221 "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. 2972 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
3222 "bgt 1b \n" 2973 "b.gt 1b \n"
3223 : "+r"(src_sobelx), // %0 2974 : "+r"(src_sobelx), // %0
3224 "+r"(src_sobely), // %1 2975 "+r"(src_sobely), // %1
3225 "+r"(dst_argb), // %2 2976 "+r"(dst_argb), // %2
3226 "+r"(width) // %3 2977 "+r"(width) // %3
3227 : 2978 :
3228 : "cc", "memory", "v0", "v1", "v2", "v3" 2979 : "cc", "memory", "v0", "v1", "v2", "v3"
3229 ); 2980 );
3230 } 2981 }
3231 #endif // HAS_SOBELXYROW_NEON 2982 #endif // HAS_SOBELXYROW_NEON
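The channel mapping of SobelXYRow as a scalar sketch (hypothetical helper), matching the st4 order above: v0 = sobely, v1 = sum, v2 = sobelx, v3 = 255.

#include <stdint.h>

// B = Sobel Y, G = saturating sum of both gradients, R = Sobel X, A = 255.
static void SobelXYPixel_C(uint8_t sobelx, uint8_t sobely, uint8_t argb[4]) {
  uint32_t s = (uint32_t)sobelx + sobely;
  argb[0] = sobely;
  argb[1] = s > 255 ? 255 : (uint8_t)s;
  argb[2] = sobelx;
  argb[3] = 255;
}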
3232 2983
3233 // SobelX as a matrix is 2984 // SobelX as a matrix is
3234 // -1 0 1 2985 // -1 0 1
3235 // -2 0 2 2986 // -2 0 2
3236 // -1 0 1 2987 // -1 0 1
3237 #ifdef HAS_SOBELXROW_NEON 2988 #ifdef HAS_SOBELXROW_NEON
3238 void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, 2989 void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
3239 const uint8* src_y2, uint8* dst_sobelx, int width) { 2990 const uint8* src_y2, uint8* dst_sobelx, int width) {
3240 asm volatile ( 2991 asm volatile (
3241 ".p2align 2 \n"
3242 "1: \n" 2992 "1: \n"
3243 MEMACCESS(0) 2993 MEMACCESS(0)
3244 "ld1 {v0.8b}, [%0],%5 \n" // top 2994 "ld1 {v0.8b}, [%0],%5 \n" // top
3245 MEMACCESS(0) 2995 MEMACCESS(0)
3246 "ld1 {v1.8b}, [%0],%6 \n" 2996 "ld1 {v1.8b}, [%0],%6 \n"
3247 "usubl v0.8h, v0.8b, v1.8b \n" 2997 "usubl v0.8h, v0.8b, v1.8b \n"
3248 MEMACCESS(1) 2998 MEMACCESS(1)
3249 "ld1 {v2.8b}, [%1],%5 \n" // center * 2 2999 "ld1 {v2.8b}, [%1],%5 \n" // center * 2
3250 MEMACCESS(1) 3000 MEMACCESS(1)
3251 "ld1 {v3.8b}, [%1],%6 \n" 3001 "ld1 {v3.8b}, [%1],%6 \n"
3252 "usubl v1.8h, v2.8b, v3.8b \n" 3002 "usubl v1.8h, v2.8b, v3.8b \n"
3253 "add v0.8h, v0.8h, v1.8h \n" 3003 "add v0.8h, v0.8h, v1.8h \n"
3254 "add v0.8h, v0.8h, v1.8h \n" 3004 "add v0.8h, v0.8h, v1.8h \n"
3255 MEMACCESS(2) 3005 MEMACCESS(2)
3256 "ld1 {v2.8b}, [%2],%5 \n" // bottom 3006 "ld1 {v2.8b}, [%2],%5 \n" // bottom
3257 MEMACCESS(2) 3007 MEMACCESS(2)
3258 "ld1 {v3.8b}, [%2],%6 \n" 3008 "ld1 {v3.8b}, [%2],%6 \n"
3259 "subs %4, %4, #8 \n" // 8 pixels 3009 "subs %4, %4, #8 \n" // 8 pixels
3260 "usubl v1.8h, v2.8b, v3.8b \n" 3010 "usubl v1.8h, v2.8b, v3.8b \n"
3261 "add v0.8h, v0.8h, v1.8h \n" 3011 "add v0.8h, v0.8h, v1.8h \n"
3262 "abs v0.8h, v0.8h \n" 3012 "abs v0.8h, v0.8h \n"
3263 "uqxtn v0.8b, v0.8h \n" 3013 "uqxtn v0.8b, v0.8h \n"
3264 MEMACCESS(3) 3014 MEMACCESS(3)
3265 "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx 3015 "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
3266 "bgt 1b \n" 3016 "b.gt 1b \n"
3267 : "+r"(src_y0), // %0 3017 : "+r"(src_y0), // %0
3268 "+r"(src_y1), // %1 3018 "+r"(src_y1), // %1
3269 "+r"(src_y2), // %2 3019 "+r"(src_y2), // %2
3270 "+r"(dst_sobelx), // %3 3020 "+r"(dst_sobelx), // %3
3271 "+r"(width) // %4 3021 "+r"(width) // %4
3272 : "r"(2), // %5 3022 : "r"(2), // %5
3273 "r"(6) // %6 3023 "r"(6) // %6
3274 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 3024 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
3275 ); 3025 );
3276 } 3026 }
3277 #endif // HAS_SOBELXROW_NEON 3027 #endif // HAS_SOBELXROW_NEON
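A scalar sketch of what the SobelX loop computes, for checking the post-increment trick (%5 = 2 and %6 = 6 give columns i and i+2 of each row). The name is illustrative and edge handling is left to the caller, as in the NEON code.

#include <stdint.h>

// |(y0[i]-y0[i+2]) + 2*(y1[i]-y1[i+2]) + (y2[i]-y2[i+2])|, saturated to 8 bits.
static void SobelXRow_C_sketch(const uint8_t* y0, const uint8_t* y1,
                               const uint8_t* y2, uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    int s = (y0[i] - y0[i + 2]) + 2 * (y1[i] - y1[i + 2]) +
            (y2[i] - y2[i + 2]);
    if (s < 0) s = -s;
    dst[i] = (uint8_t)(s > 255 ? 255 : s);
  }
}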
3278 3028
3279 // SobelY as a matrix is 3029 // SobelY as a matrix is
3280 // -1 -2 -1 3030 // -1 -2 -1
3281 // 0 0 0 3031 // 0 0 0
3282 // 1 2 1 3032 // 1 2 1
3283 #ifdef HAS_SOBELYROW_NEON 3033 #ifdef HAS_SOBELYROW_NEON
3284 void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, 3034 void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
3285 uint8* dst_sobely, int width) { 3035 uint8* dst_sobely, int width) {
3286 asm volatile ( 3036 asm volatile (
3287 ".p2align 2 \n"
3288 "1: \n" 3037 "1: \n"
3289 MEMACCESS(0) 3038 MEMACCESS(0)
3290 "ld1 {v0.8b}, [%0],%4 \n" // left 3039 "ld1 {v0.8b}, [%0],%4 \n" // left
3291 MEMACCESS(1) 3040 MEMACCESS(1)
3292 "ld1 {v1.8b}, [%1],%4 \n" 3041 "ld1 {v1.8b}, [%1],%4 \n"
3293 "usubl v0.8h, v0.8b, v1.8b \n" 3042 "usubl v0.8h, v0.8b, v1.8b \n"
3294 MEMACCESS(0) 3043 MEMACCESS(0)
3295 "ld1 {v2.8b}, [%0],%4 \n" // center * 2 3044 "ld1 {v2.8b}, [%0],%4 \n" // center * 2
3296 MEMACCESS(1) 3045 MEMACCESS(1)
3297 "ld1 {v3.8b}, [%1],%4 \n" 3046 "ld1 {v3.8b}, [%1],%4 \n"
3298 "usubl v1.8h, v2.8b, v3.8b \n" 3047 "usubl v1.8h, v2.8b, v3.8b \n"
3299 "add v0.8h, v0.8h, v1.8h \n" 3048 "add v0.8h, v0.8h, v1.8h \n"
3300 "add v0.8h, v0.8h, v1.8h \n" 3049 "add v0.8h, v0.8h, v1.8h \n"
3301 MEMACCESS(0) 3050 MEMACCESS(0)
3302 "ld1 {v2.8b}, [%0],%5 \n" // right 3051 "ld1 {v2.8b}, [%0],%5 \n" // right
3303 MEMACCESS(1) 3052 MEMACCESS(1)
3304 "ld1 {v3.8b}, [%1],%5 \n" 3053 "ld1 {v3.8b}, [%1],%5 \n"
3305 "subs %3, %3, #8 \n" // 8 pixels 3054 "subs %3, %3, #8 \n" // 8 pixels
3306 "usubl v1.8h, v2.8b, v3.8b \n" 3055 "usubl v1.8h, v2.8b, v3.8b \n"
3307 "add v0.8h, v0.8h, v1.8h \n" 3056 "add v0.8h, v0.8h, v1.8h \n"
3308 "abs v0.8h, v0.8h \n" 3057 "abs v0.8h, v0.8h \n"
3309 "uqxtn v0.8b, v0.8h \n" 3058 "uqxtn v0.8b, v0.8h \n"
3310 MEMACCESS(2) 3059 MEMACCESS(2)
3311 "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely 3060 "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
3312 "bgt 1b \n" 3061 "b.gt 1b \n"
3313 : "+r"(src_y0), // %0 3062 : "+r"(src_y0), // %0
3314 "+r"(src_y1), // %1 3063 "+r"(src_y1), // %1
3315 "+r"(dst_sobely), // %2 3064 "+r"(dst_sobely), // %2
3316 "+r"(width) // %3 3065 "+r"(width) // %3
3317 : "r"(1), // %4 3066 : "r"(1), // %4
3318 "r"(6) // %5 3067 "r"(6) // %5
3319 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 3068 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
3320 ); 3069 );
3321 } 3070 }
3322 #endif // HAS_SOBELYROW_NEON 3071 #endif // HAS_SOBELYROW_NEON
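And the matching SobelY sketch (illustrative name): only two source rows are read, with columns i, i+1 (weight 2) and i+2, mirroring the %4 = 1 and %5 = 6 post-increments above.

#include <stdint.h>

// |(y0[i]-y1[i]) + 2*(y0[i+1]-y1[i+1]) + (y0[i+2]-y1[i+2])|, saturated to 8 bits.
static void SobelYRow_C_sketch(const uint8_t* y0, const uint8_t* y1,
                               uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    int s = (y0[i] - y1[i]) + 2 * (y0[i + 1] - y1[i + 1]) +
            (y0[i + 2] - y1[i + 2]);
    if (s < 0) s = -s;
    dst[i] = (uint8_t)(s > 255 ? 255 : s);
  }
}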
3323 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) 3072 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
3324 3073
3325 #ifdef __cplusplus 3074 #ifdef __cplusplus
3326 } // extern "C" 3075 } // extern "C"
3327 } // namespace libyuv 3076 } // namespace libyuv
3328 #endif 3077 #endif