Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(14)

Side by Side Diff: source/row_neon64.cc

Issue 2044223002: Remove ifdefs for neon in row_neon*.cc (Closed) Base URL: https://chromium.googlesource.com/libyuv/libyuv@master
Patch Set: Created 4 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/row_neon.cc ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2014 The LibYuv Project Authors. All rights reserved. 2 * Copyright 2014 The LibYuv Project Authors. All rights reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 109 matching lines...) Expand 10 before | Expand all | Expand 10 after
120 "sqadd " #vB ".8h, v24.8h, v0.8h \n" /* B */ \ 120 "sqadd " #vB ".8h, v24.8h, v0.8h \n" /* B */ \
121 "sqadd " #vG ".8h, v25.8h, v0.8h \n" /* G */ \ 121 "sqadd " #vG ".8h, v25.8h, v0.8h \n" /* G */ \
122 "sqadd " #vR ".8h, v26.8h, v0.8h \n" /* R */ \ 122 "sqadd " #vR ".8h, v26.8h, v0.8h \n" /* R */ \
123 "sqadd " #vB ".8h, " #vB ".8h, v3.8h \n" /* B */ \ 123 "sqadd " #vB ".8h, " #vB ".8h, v3.8h \n" /* B */ \
124 "sqsub " #vG ".8h, " #vG ".8h, v6.8h \n" /* G */ \ 124 "sqsub " #vG ".8h, " #vG ".8h, v6.8h \n" /* G */ \
125 "sqadd " #vR ".8h, " #vR ".8h, v7.8h \n" /* R */ \ 125 "sqadd " #vR ".8h, " #vR ".8h, v7.8h \n" /* R */ \
126 "sqshrun " #vB ".8b, " #vB ".8h, #6 \n" /* B */ \ 126 "sqshrun " #vB ".8b, " #vB ".8h, #6 \n" /* B */ \
127 "sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */ \ 127 "sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */ \
128 "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ \ 128 "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ \
129 129
130 #ifdef HAS_I444TOARGBROW_NEON
131 void I444ToARGBRow_NEON(const uint8* src_y, 130 void I444ToARGBRow_NEON(const uint8* src_y,
132 const uint8* src_u, 131 const uint8* src_u,
133 const uint8* src_v, 132 const uint8* src_v,
134 uint8* dst_argb, 133 uint8* dst_argb,
135 const struct YuvConstants* yuvconstants, 134 const struct YuvConstants* yuvconstants,
136 int width) { 135 int width) {
137 asm volatile ( 136 asm volatile (
138 YUVTORGB_SETUP 137 YUVTORGB_SETUP
139 "movi v23.8b, #255 \n" /* A */ 138 "movi v23.8b, #255 \n" /* A */
140 "1: \n" 139 "1: \n"
141 READYUV444 140 READYUV444
142 YUVTORGB(v22, v21, v20) 141 YUVTORGB(v22, v21, v20)
143 "subs %w4, %w4, #8 \n" 142 "subs %w4, %w4, #8 \n"
144 MEMACCESS(3) 143 MEMACCESS(3)
145 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" 144 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
146 "b.gt 1b \n" 145 "b.gt 1b \n"
147 : "+r"(src_y), // %0 146 : "+r"(src_y), // %0
148 "+r"(src_u), // %1 147 "+r"(src_u), // %1
149 "+r"(src_v), // %2 148 "+r"(src_v), // %2
150 "+r"(dst_argb), // %3 149 "+r"(dst_argb), // %3
151 "+r"(width) // %4 150 "+r"(width) // %4
152 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 151 : [kUVToRB]"r"(&yuvconstants->kUVToRB),
153 [kUVToG]"r"(&yuvconstants->kUVToG), 152 [kUVToG]"r"(&yuvconstants->kUVToG),
154 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 153 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
155 [kYToRgb]"r"(&yuvconstants->kYToRgb) 154 [kYToRgb]"r"(&yuvconstants->kYToRgb)
156 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 155 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
157 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 156 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
158 ); 157 );
159 } 158 }
160 #endif // HAS_I444TOARGBROW_NEON
161 159
162 #ifdef HAS_I422TOARGBROW_NEON
163 void I422ToARGBRow_NEON(const uint8* src_y, 160 void I422ToARGBRow_NEON(const uint8* src_y,
164 const uint8* src_u, 161 const uint8* src_u,
165 const uint8* src_v, 162 const uint8* src_v,
166 uint8* dst_argb, 163 uint8* dst_argb,
167 const struct YuvConstants* yuvconstants, 164 const struct YuvConstants* yuvconstants,
168 int width) { 165 int width) {
169 asm volatile ( 166 asm volatile (
170 YUVTORGB_SETUP 167 YUVTORGB_SETUP
171 "movi v23.8b, #255 \n" /* A */ 168 "movi v23.8b, #255 \n" /* A */
172 "1: \n" 169 "1: \n"
173 READYUV422 170 READYUV422
174 YUVTORGB(v22, v21, v20) 171 YUVTORGB(v22, v21, v20)
175 "subs %w4, %w4, #8 \n" 172 "subs %w4, %w4, #8 \n"
176 MEMACCESS(3) 173 MEMACCESS(3)
177 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" 174 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
178 "b.gt 1b \n" 175 "b.gt 1b \n"
179 : "+r"(src_y), // %0 176 : "+r"(src_y), // %0
180 "+r"(src_u), // %1 177 "+r"(src_u), // %1
181 "+r"(src_v), // %2 178 "+r"(src_v), // %2
182 "+r"(dst_argb), // %3 179 "+r"(dst_argb), // %3
183 "+r"(width) // %4 180 "+r"(width) // %4
184 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 181 : [kUVToRB]"r"(&yuvconstants->kUVToRB),
185 [kUVToG]"r"(&yuvconstants->kUVToG), 182 [kUVToG]"r"(&yuvconstants->kUVToG),
186 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 183 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
187 [kYToRgb]"r"(&yuvconstants->kYToRgb) 184 [kYToRgb]"r"(&yuvconstants->kYToRgb)
188 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 185 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
189 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 186 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
190 ); 187 );
191 } 188 }
192 #endif // HAS_I422TOARGBROW_NEON
193 189
194 #ifdef HAS_I422ALPHATOARGBROW_NEON
195 void I422AlphaToARGBRow_NEON(const uint8* src_y, 190 void I422AlphaToARGBRow_NEON(const uint8* src_y,
196 const uint8* src_u, 191 const uint8* src_u,
197 const uint8* src_v, 192 const uint8* src_v,
198 const uint8* src_a, 193 const uint8* src_a,
199 uint8* dst_argb, 194 uint8* dst_argb,
200 const struct YuvConstants* yuvconstants, 195 const struct YuvConstants* yuvconstants,
201 int width) { 196 int width) {
202 asm volatile ( 197 asm volatile (
203 YUVTORGB_SETUP 198 YUVTORGB_SETUP
204 "1: \n" 199 "1: \n"
(...skipping 12 matching lines...) Expand all
217 "+r"(dst_argb), // %4 212 "+r"(dst_argb), // %4
218 "+r"(width) // %5 213 "+r"(width) // %5
219 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 214 : [kUVToRB]"r"(&yuvconstants->kUVToRB),
220 [kUVToG]"r"(&yuvconstants->kUVToG), 215 [kUVToG]"r"(&yuvconstants->kUVToG),
221 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 216 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
222 [kYToRgb]"r"(&yuvconstants->kYToRgb) 217 [kYToRgb]"r"(&yuvconstants->kYToRgb)
223 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 218 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
224 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 219 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
225 ); 220 );
226 } 221 }
227 #endif // HAS_I422ALPHATOARGBROW_NEON
228 222
229 #ifdef HAS_I411TOARGBROW_NEON
230 void I411ToARGBRow_NEON(const uint8* src_y, 223 void I411ToARGBRow_NEON(const uint8* src_y,
231 const uint8* src_u, 224 const uint8* src_u,
232 const uint8* src_v, 225 const uint8* src_v,
233 uint8* dst_argb, 226 uint8* dst_argb,
234 const struct YuvConstants* yuvconstants, 227 const struct YuvConstants* yuvconstants,
235 int width) { 228 int width) {
236 asm volatile ( 229 asm volatile (
237 YUVTORGB_SETUP 230 YUVTORGB_SETUP
238 "movi v23.8b, #255 \n" /* A */ 231 "movi v23.8b, #255 \n" /* A */
239 "1: \n" 232 "1: \n"
240 READYUV411 233 READYUV411
241 YUVTORGB(v22, v21, v20) 234 YUVTORGB(v22, v21, v20)
242 "subs %w4, %w4, #8 \n" 235 "subs %w4, %w4, #8 \n"
243 MEMACCESS(3) 236 MEMACCESS(3)
244 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" 237 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
245 "b.gt 1b \n" 238 "b.gt 1b \n"
246 : "+r"(src_y), // %0 239 : "+r"(src_y), // %0
247 "+r"(src_u), // %1 240 "+r"(src_u), // %1
248 "+r"(src_v), // %2 241 "+r"(src_v), // %2
249 "+r"(dst_argb), // %3 242 "+r"(dst_argb), // %3
250 "+r"(width) // %4 243 "+r"(width) // %4
251 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 244 : [kUVToRB]"r"(&yuvconstants->kUVToRB),
252 [kUVToG]"r"(&yuvconstants->kUVToG), 245 [kUVToG]"r"(&yuvconstants->kUVToG),
253 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 246 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
254 [kYToRgb]"r"(&yuvconstants->kYToRgb) 247 [kYToRgb]"r"(&yuvconstants->kYToRgb)
255 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 248 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
256 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 249 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
257 ); 250 );
258 } 251 }
259 #endif // HAS_I411TOARGBROW_NEON
260 252
261 #ifdef HAS_I422TORGBAROW_NEON
262 void I422ToRGBARow_NEON(const uint8* src_y, 253 void I422ToRGBARow_NEON(const uint8* src_y,
263 const uint8* src_u, 254 const uint8* src_u,
264 const uint8* src_v, 255 const uint8* src_v,
265 uint8* dst_rgba, 256 uint8* dst_rgba,
266 const struct YuvConstants* yuvconstants, 257 const struct YuvConstants* yuvconstants,
267 int width) { 258 int width) {
268 asm volatile ( 259 asm volatile (
269 YUVTORGB_SETUP 260 YUVTORGB_SETUP
270 "movi v20.8b, #255 \n" /* A */ 261 "movi v20.8b, #255 \n" /* A */
271 "1: \n" 262 "1: \n"
272 READYUV422 263 READYUV422
273 YUVTORGB(v23, v22, v21) 264 YUVTORGB(v23, v22, v21)
274 "subs %w4, %w4, #8 \n" 265 "subs %w4, %w4, #8 \n"
275 MEMACCESS(3) 266 MEMACCESS(3)
276 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" 267 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
277 "b.gt 1b \n" 268 "b.gt 1b \n"
278 : "+r"(src_y), // %0 269 : "+r"(src_y), // %0
279 "+r"(src_u), // %1 270 "+r"(src_u), // %1
280 "+r"(src_v), // %2 271 "+r"(src_v), // %2
281 "+r"(dst_rgba), // %3 272 "+r"(dst_rgba), // %3
282 "+r"(width) // %4 273 "+r"(width) // %4
283 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 274 : [kUVToRB]"r"(&yuvconstants->kUVToRB),
284 [kUVToG]"r"(&yuvconstants->kUVToG), 275 [kUVToG]"r"(&yuvconstants->kUVToG),
285 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 276 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
286 [kYToRgb]"r"(&yuvconstants->kYToRgb) 277 [kYToRgb]"r"(&yuvconstants->kYToRgb)
287 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 278 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
288 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 279 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
289 ); 280 );
290 } 281 }
291 #endif // HAS_I422TORGBAROW_NEON
292 282
293 #ifdef HAS_I422TORGB24ROW_NEON
294 void I422ToRGB24Row_NEON(const uint8* src_y, 283 void I422ToRGB24Row_NEON(const uint8* src_y,
295 const uint8* src_u, 284 const uint8* src_u,
296 const uint8* src_v, 285 const uint8* src_v,
297 uint8* dst_rgb24, 286 uint8* dst_rgb24,
298 const struct YuvConstants* yuvconstants, 287 const struct YuvConstants* yuvconstants,
299 int width) { 288 int width) {
300 asm volatile ( 289 asm volatile (
301 YUVTORGB_SETUP 290 YUVTORGB_SETUP
302 "1: \n" 291 "1: \n"
303 READYUV422 292 READYUV422
304 YUVTORGB(v22, v21, v20) 293 YUVTORGB(v22, v21, v20)
305 "subs %w4, %w4, #8 \n" 294 "subs %w4, %w4, #8 \n"
306 MEMACCESS(3) 295 MEMACCESS(3)
307 "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" 296 "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
308 "b.gt 1b \n" 297 "b.gt 1b \n"
309 : "+r"(src_y), // %0 298 : "+r"(src_y), // %0
310 "+r"(src_u), // %1 299 "+r"(src_u), // %1
311 "+r"(src_v), // %2 300 "+r"(src_v), // %2
312 "+r"(dst_rgb24), // %3 301 "+r"(dst_rgb24), // %3
313 "+r"(width) // %4 302 "+r"(width) // %4
314 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 303 : [kUVToRB]"r"(&yuvconstants->kUVToRB),
315 [kUVToG]"r"(&yuvconstants->kUVToG), 304 [kUVToG]"r"(&yuvconstants->kUVToG),
316 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 305 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
317 [kYToRgb]"r"(&yuvconstants->kYToRgb) 306 [kYToRgb]"r"(&yuvconstants->kYToRgb)
318 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 307 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
319 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 308 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
320 ); 309 );
321 } 310 }
322 #endif // HAS_I422TORGB24ROW_NEON
323 311
324 #define ARGBTORGB565 \ 312 #define ARGBTORGB565 \
325 "shll v0.8h, v22.8b, #8 \n" /* R */ \ 313 "shll v0.8h, v22.8b, #8 \n" /* R */ \
326 "shll v21.8h, v21.8b, #8 \n" /* G */ \ 314 "shll v21.8h, v21.8b, #8 \n" /* G */ \
327 "shll v20.8h, v20.8b, #8 \n" /* B */ \ 315 "shll v20.8h, v20.8b, #8 \n" /* B */ \
328 "sri v0.8h, v21.8h, #5 \n" /* RG */ \ 316 "sri v0.8h, v21.8h, #5 \n" /* RG */ \
329 "sri v0.8h, v20.8h, #11 \n" /* RGB */ 317 "sri v0.8h, v20.8h, #11 \n" /* RGB */
330 318
331 #ifdef HAS_I422TORGB565ROW_NEON
332 void I422ToRGB565Row_NEON(const uint8* src_y, 319 void I422ToRGB565Row_NEON(const uint8* src_y,
333 const uint8* src_u, 320 const uint8* src_u,
334 const uint8* src_v, 321 const uint8* src_v,
335 uint8* dst_rgb565, 322 uint8* dst_rgb565,
336 const struct YuvConstants* yuvconstants, 323 const struct YuvConstants* yuvconstants,
337 int width) { 324 int width) {
338 asm volatile ( 325 asm volatile (
339 YUVTORGB_SETUP 326 YUVTORGB_SETUP
340 "1: \n" 327 "1: \n"
341 READYUV422 328 READYUV422
342 YUVTORGB(v22, v21, v20) 329 YUVTORGB(v22, v21, v20)
343 "subs %w4, %w4, #8 \n" 330 "subs %w4, %w4, #8 \n"
344 ARGBTORGB565 331 ARGBTORGB565
345 MEMACCESS(3) 332 MEMACCESS(3)
346 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. 333 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
347 "b.gt 1b \n" 334 "b.gt 1b \n"
348 : "+r"(src_y), // %0 335 : "+r"(src_y), // %0
349 "+r"(src_u), // %1 336 "+r"(src_u), // %1
350 "+r"(src_v), // %2 337 "+r"(src_v), // %2
351 "+r"(dst_rgb565), // %3 338 "+r"(dst_rgb565), // %3
352 "+r"(width) // %4 339 "+r"(width) // %4
353 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 340 : [kUVToRB]"r"(&yuvconstants->kUVToRB),
354 [kUVToG]"r"(&yuvconstants->kUVToG), 341 [kUVToG]"r"(&yuvconstants->kUVToG),
355 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 342 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
356 [kYToRgb]"r"(&yuvconstants->kYToRgb) 343 [kYToRgb]"r"(&yuvconstants->kYToRgb)
357 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 344 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
358 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 345 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
359 ); 346 );
360 } 347 }
361 #endif // HAS_I422TORGB565ROW_NEON
362 348
363 #define ARGBTOARGB1555 \ 349 #define ARGBTOARGB1555 \
364 "shll v0.8h, v23.8b, #8 \n" /* A */ \ 350 "shll v0.8h, v23.8b, #8 \n" /* A */ \
365 "shll v22.8h, v22.8b, #8 \n" /* R */ \ 351 "shll v22.8h, v22.8b, #8 \n" /* R */ \
366 "shll v21.8h, v21.8b, #8 \n" /* G */ \ 352 "shll v21.8h, v21.8b, #8 \n" /* G */ \
367 "shll v20.8h, v20.8b, #8 \n" /* B */ \ 353 "shll v20.8h, v20.8b, #8 \n" /* B */ \
368 "sri v0.8h, v22.8h, #1 \n" /* AR */ \ 354 "sri v0.8h, v22.8h, #1 \n" /* AR */ \
369 "sri v0.8h, v21.8h, #6 \n" /* ARG */ \ 355 "sri v0.8h, v21.8h, #6 \n" /* ARG */ \
370 "sri v0.8h, v20.8h, #11 \n" /* ARGB */ 356 "sri v0.8h, v20.8h, #11 \n" /* ARGB */
371 357
372 #ifdef HAS_I422TOARGB1555ROW_NEON
373 void I422ToARGB1555Row_NEON(const uint8* src_y, 358 void I422ToARGB1555Row_NEON(const uint8* src_y,
374 const uint8* src_u, 359 const uint8* src_u,
375 const uint8* src_v, 360 const uint8* src_v,
376 uint8* dst_argb1555, 361 uint8* dst_argb1555,
377 const struct YuvConstants* yuvconstants, 362 const struct YuvConstants* yuvconstants,
378 int width) { 363 int width) {
379 asm volatile ( 364 asm volatile (
380 YUVTORGB_SETUP 365 YUVTORGB_SETUP
381 "movi v23.8b, #255 \n" 366 "movi v23.8b, #255 \n"
382 "1: \n" 367 "1: \n"
(...skipping 10 matching lines...) Expand all
393 "+r"(dst_argb1555), // %3 378 "+r"(dst_argb1555), // %3
394 "+r"(width) // %4 379 "+r"(width) // %4
395 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 380 : [kUVToRB]"r"(&yuvconstants->kUVToRB),
396 [kUVToG]"r"(&yuvconstants->kUVToG), 381 [kUVToG]"r"(&yuvconstants->kUVToG),
397 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 382 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
398 [kYToRgb]"r"(&yuvconstants->kYToRgb) 383 [kYToRgb]"r"(&yuvconstants->kYToRgb)
399 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 384 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
400 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 385 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
401 ); 386 );
402 } 387 }
403 #endif // HAS_I422TOARGB1555ROW_NEON
404 388
405 #define ARGBTOARGB4444 \ 389 #define ARGBTOARGB4444 \
406 /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \ 390 /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \
407 "ushr v20.8b, v20.8b, #4 \n" /* B */ \ 391 "ushr v20.8b, v20.8b, #4 \n" /* B */ \
408 "bic v21.8b, v21.8b, v4.8b \n" /* G */ \ 392 "bic v21.8b, v21.8b, v4.8b \n" /* G */ \
409 "ushr v22.8b, v22.8b, #4 \n" /* R */ \ 393 "ushr v22.8b, v22.8b, #4 \n" /* R */ \
410 "bic v23.8b, v23.8b, v4.8b \n" /* A */ \ 394 "bic v23.8b, v23.8b, v4.8b \n" /* A */ \
411 "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \ 395 "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \
412 "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \ 396 "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \
413 "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */ 397 "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */
414 398
415 #ifdef HAS_I422TOARGB4444ROW_NEON
416 void I422ToARGB4444Row_NEON(const uint8* src_y, 399 void I422ToARGB4444Row_NEON(const uint8* src_y,
417 const uint8* src_u, 400 const uint8* src_u,
418 const uint8* src_v, 401 const uint8* src_v,
419 uint8* dst_argb4444, 402 uint8* dst_argb4444,
420 const struct YuvConstants* yuvconstants, 403 const struct YuvConstants* yuvconstants,
421 int width) { 404 int width) {
422 asm volatile ( 405 asm volatile (
423 YUVTORGB_SETUP 406 YUVTORGB_SETUP
424 "movi v4.16b, #0x0f \n" // bits to clear with vbic. 407 "movi v4.16b, #0x0f \n" // bits to clear with vbic.
425 "1: \n" 408 "1: \n"
(...skipping 11 matching lines...) Expand all
437 "+r"(dst_argb4444), // %3 420 "+r"(dst_argb4444), // %3
438 "+r"(width) // %4 421 "+r"(width) // %4
439 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 422 : [kUVToRB]"r"(&yuvconstants->kUVToRB),
440 [kUVToG]"r"(&yuvconstants->kUVToG), 423 [kUVToG]"r"(&yuvconstants->kUVToG),
441 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 424 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
442 [kYToRgb]"r"(&yuvconstants->kYToRgb) 425 [kYToRgb]"r"(&yuvconstants->kYToRgb)
443 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 426 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
444 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 427 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
445 ); 428 );
446 } 429 }
447 #endif // HAS_I422TOARGB4444ROW_NEON
448 430
449 #ifdef HAS_I400TOARGBROW_NEON
450 void I400ToARGBRow_NEON(const uint8* src_y, 431 void I400ToARGBRow_NEON(const uint8* src_y,
451 uint8* dst_argb, 432 uint8* dst_argb,
452 int width) { 433 int width) {
453 asm volatile ( 434 asm volatile (
454 YUVTORGB_SETUP 435 YUVTORGB_SETUP
455 "movi v23.8b, #255 \n" 436 "movi v23.8b, #255 \n"
456 "1: \n" 437 "1: \n"
457 READYUV400 438 READYUV400
458 YUVTORGB(v22, v21, v20) 439 YUVTORGB(v22, v21, v20)
459 "subs %w2, %w2, #8 \n" 440 "subs %w2, %w2, #8 \n"
460 MEMACCESS(1) 441 MEMACCESS(1)
461 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" 442 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
462 "b.gt 1b \n" 443 "b.gt 1b \n"
463 : "+r"(src_y), // %0 444 : "+r"(src_y), // %0
464 "+r"(dst_argb), // %1 445 "+r"(dst_argb), // %1
465 "+r"(width) // %2 446 "+r"(width) // %2
466 : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB), 447 : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),
467 [kUVToG]"r"(&kYuvI601Constants.kUVToG), 448 [kUVToG]"r"(&kYuvI601Constants.kUVToG),
468 [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR), 449 [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),
469 [kYToRgb]"r"(&kYuvI601Constants.kYToRgb) 450 [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)
470 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 451 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
471 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 452 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
472 ); 453 );
473 } 454 }
474 #endif // HAS_I400TOARGBROW_NEON
475 455
476 #ifdef HAS_J400TOARGBROW_NEON
477 void J400ToARGBRow_NEON(const uint8* src_y, 456 void J400ToARGBRow_NEON(const uint8* src_y,
478 uint8* dst_argb, 457 uint8* dst_argb,
479 int width) { 458 int width) {
480 asm volatile ( 459 asm volatile (
481 "movi v23.8b, #255 \n" 460 "movi v23.8b, #255 \n"
482 "1: \n" 461 "1: \n"
483 MEMACCESS(0) 462 MEMACCESS(0)
484 "ld1 {v20.8b}, [%0], #8 \n" 463 "ld1 {v20.8b}, [%0], #8 \n"
485 "orr v21.8b, v20.8b, v20.8b \n" 464 "orr v21.8b, v20.8b, v20.8b \n"
486 "orr v22.8b, v20.8b, v20.8b \n" 465 "orr v22.8b, v20.8b, v20.8b \n"
487 "subs %w2, %w2, #8 \n" 466 "subs %w2, %w2, #8 \n"
488 MEMACCESS(1) 467 MEMACCESS(1)
489 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" 468 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
490 "b.gt 1b \n" 469 "b.gt 1b \n"
491 : "+r"(src_y), // %0 470 : "+r"(src_y), // %0
492 "+r"(dst_argb), // %1 471 "+r"(dst_argb), // %1
493 "+r"(width) // %2 472 "+r"(width) // %2
494 : 473 :
495 : "cc", "memory", "v20", "v21", "v22", "v23" 474 : "cc", "memory", "v20", "v21", "v22", "v23"
496 ); 475 );
497 } 476 }
498 #endif // HAS_J400TOARGBROW_NEON
499 477
500 #ifdef HAS_NV12TOARGBROW_NEON
501 void NV12ToARGBRow_NEON(const uint8* src_y, 478 void NV12ToARGBRow_NEON(const uint8* src_y,
502 const uint8* src_uv, 479 const uint8* src_uv,
503 uint8* dst_argb, 480 uint8* dst_argb,
504 const struct YuvConstants* yuvconstants, 481 const struct YuvConstants* yuvconstants,
505 int width) { 482 int width) {
506 asm volatile ( 483 asm volatile (
507 YUVTORGB_SETUP 484 YUVTORGB_SETUP
508 "movi v23.8b, #255 \n" 485 "movi v23.8b, #255 \n"
509 "1: \n" 486 "1: \n"
510 READNV12 487 READNV12
511 YUVTORGB(v22, v21, v20) 488 YUVTORGB(v22, v21, v20)
512 "subs %w3, %w3, #8 \n" 489 "subs %w3, %w3, #8 \n"
513 MEMACCESS(2) 490 MEMACCESS(2)
514 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" 491 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
515 "b.gt 1b \n" 492 "b.gt 1b \n"
516 : "+r"(src_y), // %0 493 : "+r"(src_y), // %0
517 "+r"(src_uv), // %1 494 "+r"(src_uv), // %1
518 "+r"(dst_argb), // %2 495 "+r"(dst_argb), // %2
519 "+r"(width) // %3 496 "+r"(width) // %3
520 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 497 : [kUVToRB]"r"(&yuvconstants->kUVToRB),
521 [kUVToG]"r"(&yuvconstants->kUVToG), 498 [kUVToG]"r"(&yuvconstants->kUVToG),
522 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 499 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
523 [kYToRgb]"r"(&yuvconstants->kYToRgb) 500 [kYToRgb]"r"(&yuvconstants->kYToRgb)
524 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 501 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
525 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 502 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
526 ); 503 );
527 } 504 }
528 #endif // HAS_NV12TOARGBROW_NEON
529 505
530 #ifdef HAS_NV12TOARGBROW_NEON
531 void NV21ToARGBRow_NEON(const uint8* src_y, 506 void NV21ToARGBRow_NEON(const uint8* src_y,
532 const uint8* src_vu, 507 const uint8* src_vu,
533 uint8* dst_argb, 508 uint8* dst_argb,
534 const struct YuvConstants* yuvconstants, 509 const struct YuvConstants* yuvconstants,
535 int width) { 510 int width) {
536 asm volatile ( 511 asm volatile (
537 YUVTORGB_SETUP 512 YUVTORGB_SETUP
538 "movi v23.8b, #255 \n" 513 "movi v23.8b, #255 \n"
539 "1: \n" 514 "1: \n"
540 READNV21 515 READNV21
541 YUVTORGB(v22, v21, v20) 516 YUVTORGB(v22, v21, v20)
542 "subs %w3, %w3, #8 \n" 517 "subs %w3, %w3, #8 \n"
543 MEMACCESS(2) 518 MEMACCESS(2)
544 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" 519 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
545 "b.gt 1b \n" 520 "b.gt 1b \n"
546 : "+r"(src_y), // %0 521 : "+r"(src_y), // %0
547 "+r"(src_vu), // %1 522 "+r"(src_vu), // %1
548 "+r"(dst_argb), // %2 523 "+r"(dst_argb), // %2
549 "+r"(width) // %3 524 "+r"(width) // %3
550 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 525 : [kUVToRB]"r"(&yuvconstants->kUVToRB),
551 [kUVToG]"r"(&yuvconstants->kUVToG), 526 [kUVToG]"r"(&yuvconstants->kUVToG),
552 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 527 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
553 [kYToRgb]"r"(&yuvconstants->kYToRgb) 528 [kYToRgb]"r"(&yuvconstants->kYToRgb)
554 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 529 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
555 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 530 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
556 ); 531 );
557 } 532 }
558 #endif // HAS_NV12TOARGBROW_NEON
559 533
560 #ifdef HAS_NV12TORGB565ROW_NEON
561 void NV12ToRGB565Row_NEON(const uint8* src_y, 534 void NV12ToRGB565Row_NEON(const uint8* src_y,
562 const uint8* src_uv, 535 const uint8* src_uv,
563 uint8* dst_rgb565, 536 uint8* dst_rgb565,
564 const struct YuvConstants* yuvconstants, 537 const struct YuvConstants* yuvconstants,
565 int width) { 538 int width) {
566 asm volatile ( 539 asm volatile (
567 YUVTORGB_SETUP 540 YUVTORGB_SETUP
568 "1: \n" 541 "1: \n"
569 READNV12 542 READNV12
570 YUVTORGB(v22, v21, v20) 543 YUVTORGB(v22, v21, v20)
571 "subs %w3, %w3, #8 \n" 544 "subs %w3, %w3, #8 \n"
572 ARGBTORGB565 545 ARGBTORGB565
573 MEMACCESS(2) 546 MEMACCESS(2)
574 "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565. 547 "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565.
575 "b.gt 1b \n" 548 "b.gt 1b \n"
576 : "+r"(src_y), // %0 549 : "+r"(src_y), // %0
577 "+r"(src_uv), // %1 550 "+r"(src_uv), // %1
578 "+r"(dst_rgb565), // %2 551 "+r"(dst_rgb565), // %2
579 "+r"(width) // %3 552 "+r"(width) // %3
580 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 553 : [kUVToRB]"r"(&yuvconstants->kUVToRB),
581 [kUVToG]"r"(&yuvconstants->kUVToG), 554 [kUVToG]"r"(&yuvconstants->kUVToG),
582 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 555 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
583 [kYToRgb]"r"(&yuvconstants->kYToRgb) 556 [kYToRgb]"r"(&yuvconstants->kYToRgb)
584 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 557 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
585 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 558 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
586 ); 559 );
587 } 560 }
588 #endif // HAS_NV12TORGB565ROW_NEON
589 561
590 #ifdef HAS_YUY2TOARGBROW_NEON
591 void YUY2ToARGBRow_NEON(const uint8* src_yuy2, 562 void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
592 uint8* dst_argb, 563 uint8* dst_argb,
593 const struct YuvConstants* yuvconstants, 564 const struct YuvConstants* yuvconstants,
594 int width) { 565 int width) {
595 int64 width64 = (int64)(width); 566 int64 width64 = (int64)(width);
596 asm volatile ( 567 asm volatile (
597 YUVTORGB_SETUP 568 YUVTORGB_SETUP
598 "movi v23.8b, #255 \n" 569 "movi v23.8b, #255 \n"
599 "1: \n" 570 "1: \n"
600 READYUY2 571 READYUY2
601 YUVTORGB(v22, v21, v20) 572 YUVTORGB(v22, v21, v20)
602 "subs %w2, %w2, #8 \n" 573 "subs %w2, %w2, #8 \n"
603 MEMACCESS(1) 574 MEMACCESS(1)
604 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" 575 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
605 "b.gt 1b \n" 576 "b.gt 1b \n"
606 : "+r"(src_yuy2), // %0 577 : "+r"(src_yuy2), // %0
607 "+r"(dst_argb), // %1 578 "+r"(dst_argb), // %1
608 "+r"(width64) // %2 579 "+r"(width64) // %2
609 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 580 : [kUVToRB]"r"(&yuvconstants->kUVToRB),
610 [kUVToG]"r"(&yuvconstants->kUVToG), 581 [kUVToG]"r"(&yuvconstants->kUVToG),
611 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 582 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
612 [kYToRgb]"r"(&yuvconstants->kYToRgb) 583 [kYToRgb]"r"(&yuvconstants->kYToRgb)
613 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 584 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
614 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 585 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
615 ); 586 );
616 } 587 }
617 #endif // HAS_YUY2TOARGBROW_NEON
618 588
619 #ifdef HAS_UYVYTOARGBROW_NEON
620 void UYVYToARGBRow_NEON(const uint8* src_uyvy, 589 void UYVYToARGBRow_NEON(const uint8* src_uyvy,
621 uint8* dst_argb, 590 uint8* dst_argb,
622 const struct YuvConstants* yuvconstants, 591 const struct YuvConstants* yuvconstants,
623 int width) { 592 int width) {
624 int64 width64 = (int64)(width); 593 int64 width64 = (int64)(width);
625 asm volatile ( 594 asm volatile (
626 YUVTORGB_SETUP 595 YUVTORGB_SETUP
627 "movi v23.8b, #255 \n" 596 "movi v23.8b, #255 \n"
628 "1: \n" 597 "1: \n"
629 READUYVY 598 READUYVY
630 YUVTORGB(v22, v21, v20) 599 YUVTORGB(v22, v21, v20)
631 "subs %w2, %w2, #8 \n" 600 "subs %w2, %w2, #8 \n"
632 MEMACCESS(1) 601 MEMACCESS(1)
633 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" 602 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"
634 "b.gt 1b \n" 603 "b.gt 1b \n"
635 : "+r"(src_uyvy), // %0 604 : "+r"(src_uyvy), // %0
636 "+r"(dst_argb), // %1 605 "+r"(dst_argb), // %1
637 "+r"(width64) // %2 606 "+r"(width64) // %2
638 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 607 : [kUVToRB]"r"(&yuvconstants->kUVToRB),
639 [kUVToG]"r"(&yuvconstants->kUVToG), 608 [kUVToG]"r"(&yuvconstants->kUVToG),
640 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 609 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
641 [kYToRgb]"r"(&yuvconstants->kYToRgb) 610 [kYToRgb]"r"(&yuvconstants->kYToRgb)
642 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 611 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
643 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 612 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
644 ); 613 );
645 } 614 }
646 #endif // HAS_UYVYTOARGBROW_NEON
647 615
648 // Reads 16 pairs of UV and writes even values to dst_u and odd to dst_v. 616 // Reads 16 pairs of UV and writes even values to dst_u and odd to dst_v.
649 #ifdef HAS_SPLITUVROW_NEON
650 void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, 617 void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
651 int width) { 618 int width) {
652 asm volatile ( 619 asm volatile (
653 "1: \n" 620 "1: \n"
654 MEMACCESS(0) 621 MEMACCESS(0)
655 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV 622 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
656 "subs %w3, %w3, #16 \n" // 16 processed per loop 623 "subs %w3, %w3, #16 \n" // 16 processed per loop
657 MEMACCESS(1) 624 MEMACCESS(1)
658 "st1 {v0.16b}, [%1], #16 \n" // store U 625 "st1 {v0.16b}, [%1], #16 \n" // store U
659 MEMACCESS(2) 626 MEMACCESS(2)
660 "st1 {v1.16b}, [%2], #16 \n" // store V 627 "st1 {v1.16b}, [%2], #16 \n" // store V
661 "b.gt 1b \n" 628 "b.gt 1b \n"
662 : "+r"(src_uv), // %0 629 : "+r"(src_uv), // %0
663 "+r"(dst_u), // %1 630 "+r"(dst_u), // %1
664 "+r"(dst_v), // %2 631 "+r"(dst_v), // %2
665 "+r"(width) // %3 // Output registers 632 "+r"(width) // %3 // Output registers
666 : // Input registers 633 : // Input registers
667 : "cc", "memory", "v0", "v1" // Clobber List 634 : "cc", "memory", "v0", "v1" // Clobber List
668 ); 635 );
669 } 636 }
670 #endif // HAS_SPLITUVROW_NEON
671 637
672 // Reads 16 U's and V's and writes out 16 pairs of UV. 638 // Reads 16 U's and V's and writes out 16 pairs of UV.
673 #ifdef HAS_MERGEUVROW_NEON
674 void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, 639 void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
675 int width) { 640 int width) {
676 asm volatile ( 641 asm volatile (
677 "1: \n" 642 "1: \n"
678 MEMACCESS(0) 643 MEMACCESS(0)
679 "ld1 {v0.16b}, [%0], #16 \n" // load U 644 "ld1 {v0.16b}, [%0], #16 \n" // load U
680 MEMACCESS(1) 645 MEMACCESS(1)
681 "ld1 {v1.16b}, [%1], #16 \n" // load V 646 "ld1 {v1.16b}, [%1], #16 \n" // load V
682 "subs %w3, %w3, #16 \n" // 16 processed per loop 647 "subs %w3, %w3, #16 \n" // 16 processed per loop
683 MEMACCESS(2) 648 MEMACCESS(2)
684 "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV 649 "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
685 "b.gt 1b \n" 650 "b.gt 1b \n"
686 : 651 :
687 "+r"(src_u), // %0 652 "+r"(src_u), // %0
688 "+r"(src_v), // %1 653 "+r"(src_v), // %1
689 "+r"(dst_uv), // %2 654 "+r"(dst_uv), // %2
690 "+r"(width) // %3 // Output registers 655 "+r"(width) // %3 // Output registers
691 : // Input registers 656 : // Input registers
692 : "cc", "memory", "v0", "v1" // Clobber List 657 : "cc", "memory", "v0", "v1" // Clobber List
693 ); 658 );
694 } 659 }
695 #endif // HAS_MERGEUVROW_NEON
696 660
697 // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. 661 // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15.
698 #ifdef HAS_COPYROW_NEON
699 void CopyRow_NEON(const uint8* src, uint8* dst, int count) { 662 void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
700 asm volatile ( 663 asm volatile (
701 "1: \n" 664 "1: \n"
702 MEMACCESS(0) 665 MEMACCESS(0)
703 "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32 666 "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32
704 "subs %w2, %w2, #32 \n" // 32 processed per loop 667 "subs %w2, %w2, #32 \n" // 32 processed per loop
705 MEMACCESS(1) 668 MEMACCESS(1)
706 "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32 669 "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32
707 "b.gt 1b \n" 670 "b.gt 1b \n"
708 : "+r"(src), // %0 671 : "+r"(src), // %0
709 "+r"(dst), // %1 672 "+r"(dst), // %1
710 "+r"(count) // %2 // Output registers 673 "+r"(count) // %2 // Output registers
711 : // Input registers 674 : // Input registers
712 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 675 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
713 ); 676 );
714 } 677 }
715 #endif // HAS_COPYROW_NEON
716 678
717 // SetRow writes 'count' bytes using an 8 bit value repeated. 679 // SetRow writes 'count' bytes using an 8 bit value repeated.
718 void SetRow_NEON(uint8* dst, uint8 v8, int count) { 680 void SetRow_NEON(uint8* dst, uint8 v8, int count) {
719 asm volatile ( 681 asm volatile (
720 "dup v0.16b, %w2 \n" // duplicate 16 bytes 682 "dup v0.16b, %w2 \n" // duplicate 16 bytes
721 "1: \n" 683 "1: \n"
722 "subs %w1, %w1, #16 \n" // 16 bytes per loop 684 "subs %w1, %w1, #16 \n" // 16 bytes per loop
723 MEMACCESS(0) 685 MEMACCESS(0)
724 "st1 {v0.16b}, [%0], #16 \n" // store 686 "st1 {v0.16b}, [%0], #16 \n" // store
725 "b.gt 1b \n" 687 "b.gt 1b \n"
(...skipping 12 matching lines...) Expand all
738 MEMACCESS(0) 700 MEMACCESS(0)
739 "st1 {v0.16b}, [%0], #16 \n" // store 701 "st1 {v0.16b}, [%0], #16 \n" // store
740 "b.gt 1b \n" 702 "b.gt 1b \n"
741 : "+r"(dst), // %0 703 : "+r"(dst), // %0
742 "+r"(count) // %1 704 "+r"(count) // %1
743 : "r"(v32) // %2 705 : "r"(v32) // %2
744 : "cc", "memory", "v0" 706 : "cc", "memory", "v0"
745 ); 707 );
746 } 708 }
747 709
748 #ifdef HAS_MIRRORROW_NEON
749 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { 710 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
750 int64 width64 = (int64) width; 711 int64 width64 = (int64) width;
751 asm volatile ( 712 asm volatile (
752 // Start at end of source row. 713 // Start at end of source row.
753 "add %0, %0, %2 \n" 714 "add %0, %0, %2 \n"
754 "sub %0, %0, #16 \n" 715 "sub %0, %0, #16 \n"
755 716
756 "1: \n" 717 "1: \n"
757 MEMACCESS(0) 718 MEMACCESS(0)
758 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 719 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
759 "subs %2, %2, #16 \n" // 16 pixels per loop. 720 "subs %2, %2, #16 \n" // 16 pixels per loop.
760 "rev64 v0.16b, v0.16b \n" 721 "rev64 v0.16b, v0.16b \n"
761 MEMACCESS(1) 722 MEMACCESS(1)
762 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 723 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
763 MEMACCESS(1) 724 MEMACCESS(1)
764 "st1 {v0.D}[0], [%1], #8 \n" 725 "st1 {v0.D}[0], [%1], #8 \n"
765 "b.gt 1b \n" 726 "b.gt 1b \n"
766 : "+r"(src), // %0 727 : "+r"(src), // %0
767 "+r"(dst), // %1 728 "+r"(dst), // %1
768 "+r"(width64) // %2 729 "+r"(width64) // %2
769 : "r"((ptrdiff_t)-16) // %3 730 : "r"((ptrdiff_t)-16) // %3
770 : "cc", "memory", "v0" 731 : "cc", "memory", "v0"
771 ); 732 );
772 } 733 }
773 #endif // HAS_MIRRORROW_NEON
774 734
775 #ifdef HAS_MIRRORUVROW_NEON
776 void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, 735 void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
777 int width) { 736 int width) {
778 int64 width64 = (int64) width; 737 int64 width64 = (int64) width;
779 asm volatile ( 738 asm volatile (
780 // Start at end of source row. 739 // Start at end of source row.
781 "add %0, %0, %3, lsl #1 \n" 740 "add %0, %0, %3, lsl #1 \n"
782 "sub %0, %0, #16 \n" 741 "sub %0, %0, #16 \n"
783 742
784 "1: \n" 743 "1: \n"
785 MEMACCESS(0) 744 MEMACCESS(0)
786 "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 745 "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
787 "subs %3, %3, #8 \n" // 8 pixels per loop. 746 "subs %3, %3, #8 \n" // 8 pixels per loop.
788 "rev64 v0.8b, v0.8b \n" 747 "rev64 v0.8b, v0.8b \n"
789 "rev64 v1.8b, v1.8b \n" 748 "rev64 v1.8b, v1.8b \n"
790 MEMACCESS(1) 749 MEMACCESS(1)
791 "st1 {v0.8b}, [%1], #8 \n" // dst += 8 750 "st1 {v0.8b}, [%1], #8 \n" // dst += 8
792 MEMACCESS(2) 751 MEMACCESS(2)
793 "st1 {v1.8b}, [%2], #8 \n" 752 "st1 {v1.8b}, [%2], #8 \n"
794 "b.gt 1b \n" 753 "b.gt 1b \n"
795 : "+r"(src_uv), // %0 754 : "+r"(src_uv), // %0
796 "+r"(dst_u), // %1 755 "+r"(dst_u), // %1
797 "+r"(dst_v), // %2 756 "+r"(dst_v), // %2
798 "+r"(width64) // %3 757 "+r"(width64) // %3
799 : "r"((ptrdiff_t)-16) // %4 758 : "r"((ptrdiff_t)-16) // %4
800 : "cc", "memory", "v0", "v1" 759 : "cc", "memory", "v0", "v1"
801 ); 760 );
802 } 761 }
803 #endif // HAS_MIRRORUVROW_NEON
804 762
805 #ifdef HAS_ARGBMIRRORROW_NEON
806 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { 763 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
807 int64 width64 = (int64) width; 764 int64 width64 = (int64) width;
808 asm volatile ( 765 asm volatile (
809 // Start at end of source row. 766 // Start at end of source row.
810 "add %0, %0, %2, lsl #2 \n" 767 "add %0, %0, %2, lsl #2 \n"
811 "sub %0, %0, #16 \n" 768 "sub %0, %0, #16 \n"
812 769
813 "1: \n" 770 "1: \n"
814 MEMACCESS(0) 771 MEMACCESS(0)
815 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 772 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
816 "subs %2, %2, #4 \n" // 4 pixels per loop. 773 "subs %2, %2, #4 \n" // 4 pixels per loop.
817 "rev64 v0.4s, v0.4s \n" 774 "rev64 v0.4s, v0.4s \n"
818 MEMACCESS(1) 775 MEMACCESS(1)
819 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 776 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
820 MEMACCESS(1) 777 MEMACCESS(1)
821 "st1 {v0.D}[0], [%1], #8 \n" 778 "st1 {v0.D}[0], [%1], #8 \n"
822 "b.gt 1b \n" 779 "b.gt 1b \n"
823 : "+r"(src), // %0 780 : "+r"(src), // %0
824 "+r"(dst), // %1 781 "+r"(dst), // %1
825 "+r"(width64) // %2 782 "+r"(width64) // %2
826 : "r"((ptrdiff_t)-16) // %3 783 : "r"((ptrdiff_t)-16) // %3
827 : "cc", "memory", "v0" 784 : "cc", "memory", "v0"
828 ); 785 );
829 } 786 }
830 #endif // HAS_ARGBMIRRORROW_NEON
831 787
832 #ifdef HAS_RGB24TOARGBROW_NEON
833 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) { 788 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
834 asm volatile ( 789 asm volatile (
835 "movi v4.8b, #255 \n" // Alpha 790 "movi v4.8b, #255 \n" // Alpha
836 "1: \n" 791 "1: \n"
837 MEMACCESS(0) 792 MEMACCESS(0)
838 "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. 793 "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
839 "subs %w2, %w2, #8 \n" // 8 processed per loop. 794 "subs %w2, %w2, #8 \n" // 8 processed per loop.
840 MEMACCESS(1) 795 MEMACCESS(1)
841 "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels 796 "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels
842 "b.gt 1b \n" 797 "b.gt 1b \n"
843 : "+r"(src_rgb24), // %0 798 : "+r"(src_rgb24), // %0
844 "+r"(dst_argb), // %1 799 "+r"(dst_argb), // %1
845 "+r"(width) // %2 800 "+r"(width) // %2
846 : 801 :
847 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List 802 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
848 ); 803 );
849 } 804 }
850 #endif // HAS_RGB24TOARGBROW_NEON
851 805
852 #ifdef HAS_RAWTOARGBROW_NEON
853 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) { 806 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
854 asm volatile ( 807 asm volatile (
855 "movi v5.8b, #255 \n" // Alpha 808 "movi v5.8b, #255 \n" // Alpha
856 "1: \n" 809 "1: \n"
857 MEMACCESS(0) 810 MEMACCESS(0)
858 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b 811 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
859 "subs %w2, %w2, #8 \n" // 8 processed per loop. 812 "subs %w2, %w2, #8 \n" // 8 processed per loop.
860 "orr v3.8b, v1.8b, v1.8b \n" // move g 813 "orr v3.8b, v1.8b, v1.8b \n" // move g
861 "orr v4.8b, v0.8b, v0.8b \n" // move r 814 "orr v4.8b, v0.8b, v0.8b \n" // move r
862 MEMACCESS(1) 815 MEMACCESS(1)
863 "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a 816 "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
864 "b.gt 1b \n" 817 "b.gt 1b \n"
865 : "+r"(src_raw), // %0 818 : "+r"(src_raw), // %0
866 "+r"(dst_argb), // %1 819 "+r"(dst_argb), // %1
867 "+r"(width) // %2 820 "+r"(width) // %2
868 : 821 :
869 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List 822 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
870 ); 823 );
871 } 824 }
872 #endif // HAS_RAWTOARGBROW_NEON
873 825
874 void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { 826 void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
875 asm volatile ( 827 asm volatile (
876 "1: \n" 828 "1: \n"
877 MEMACCESS(0) 829 MEMACCESS(0)
878 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b 830 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
879 "subs %w2, %w2, #8 \n" // 8 processed per loop. 831 "subs %w2, %w2, #8 \n" // 8 processed per loop.
880 "orr v3.8b, v1.8b, v1.8b \n" // move g 832 "orr v3.8b, v1.8b, v1.8b \n" // move g
881 "orr v4.8b, v0.8b, v0.8b \n" // move r 833 "orr v4.8b, v0.8b, v0.8b \n" // move r
882 MEMACCESS(1) 834 MEMACCESS(1)
(...skipping 13 matching lines...) Expand all
896 "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \ 848 "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \
897 "orr v1.8b, v4.8b, v6.8b \n" /* G */ \ 849 "orr v1.8b, v4.8b, v6.8b \n" /* G */ \
898 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ 850 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
899 "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \ 851 "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \
900 "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \ 852 "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \
901 "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \ 853 "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \
902 "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \ 854 "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \
903 "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ 855 "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \
904 "dup v2.2D, v0.D[1] \n" /* R */ 856 "dup v2.2D, v0.D[1] \n" /* R */
905 857
906 #ifdef HAS_RGB565TOARGBROW_NEON
907 void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) { 858 void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
908 asm volatile ( 859 asm volatile (
909 "movi v3.8b, #255 \n" // Alpha 860 "movi v3.8b, #255 \n" // Alpha
910 "1: \n" 861 "1: \n"
911 MEMACCESS(0) 862 MEMACCESS(0)
912 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. 863 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
913 "subs %w2, %w2, #8 \n" // 8 processed per loop. 864 "subs %w2, %w2, #8 \n" // 8 processed per loop.
914 RGB565TOARGB 865 RGB565TOARGB
915 MEMACCESS(1) 866 MEMACCESS(1)
916 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels 867 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
917 "b.gt 1b \n" 868 "b.gt 1b \n"
918 : "+r"(src_rgb565), // %0 869 : "+r"(src_rgb565), // %0
919 "+r"(dst_argb), // %1 870 "+r"(dst_argb), // %1
920 "+r"(width) // %2 871 "+r"(width) // %2
921 : 872 :
922 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List 873 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List
923 ); 874 );
924 } 875 }
925 #endif // HAS_RGB565TOARGBROW_NEON
926 876
927 #define ARGB1555TOARGB \ 877 #define ARGB1555TOARGB \
928 "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ 878 "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
929 "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ 879 "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
930 "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \ 880 "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \
931 \ 881 \
932 "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \ 882 "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \
933 "xtn2 v3.16b, v2.8h \n" \ 883 "xtn2 v3.16b, v2.8h \n" \
934 \ 884 \
935 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ 885 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
(...skipping 18 matching lines...) Expand all
954 "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ 904 "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
955 \ 905 \
956 "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \ 906 "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \
957 "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ 907 "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
958 "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ 908 "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
959 \ 909 \
960 "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ 910 "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
961 "orr v2.16b, v1.16b, v3.16b \n" /* R */ \ 911 "orr v2.16b, v1.16b, v3.16b \n" /* R */ \
962 "dup v1.2D, v0.D[1] \n" /* G */ \ 912 "dup v1.2D, v0.D[1] \n" /* G */ \
963 913
964 #ifdef HAS_ARGB1555TOARGBROW_NEON
965 void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, 914 void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
966 int width) { 915 int width) {
967 asm volatile ( 916 asm volatile (
968 "movi v3.8b, #255 \n" // Alpha 917 "movi v3.8b, #255 \n" // Alpha
969 "1: \n" 918 "1: \n"
970 MEMACCESS(0) 919 MEMACCESS(0)
971 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. 920 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
972 "subs %w2, %w2, #8 \n" // 8 processed per loop. 921 "subs %w2, %w2, #8 \n" // 8 processed per loop.
973 ARGB1555TOARGB 922 ARGB1555TOARGB
974 MEMACCESS(1) 923 MEMACCESS(1)
975 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels 924 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
976 "b.gt 1b \n" 925 "b.gt 1b \n"
977 : "+r"(src_argb1555), // %0 926 : "+r"(src_argb1555), // %0
978 "+r"(dst_argb), // %1 927 "+r"(dst_argb), // %1
979 "+r"(width) // %2 928 "+r"(width) // %2
980 : 929 :
981 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 930 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
982 ); 931 );
983 } 932 }
984 #endif // HAS_ARGB1555TOARGBROW_NEON
985 933
986 #define ARGB4444TOARGB \ 934 #define ARGB4444TOARGB \
987 "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \ 935 "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \
988 "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \ 936 "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \
989 "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \ 937 "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \
990 "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \ 938 "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \
991 "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \ 939 "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \
992 "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \ 940 "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \
993 "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \ 941 "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \
994 "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \ 942 "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \
995 "dup v0.2D, v2.D[1] \n" \ 943 "dup v0.2D, v2.D[1] \n" \
996 "dup v1.2D, v3.D[1] \n" 944 "dup v1.2D, v3.D[1] \n"
997 945
998 #ifdef HAS_ARGB4444TOARGBROW_NEON
999 void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, 946 void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
1000 int width) { 947 int width) {
1001 asm volatile ( 948 asm volatile (
1002 "1: \n" 949 "1: \n"
1003 MEMACCESS(0) 950 MEMACCESS(0)
1004 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. 951 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
1005 "subs %w2, %w2, #8 \n" // 8 processed per loop. 952 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1006 ARGB4444TOARGB 953 ARGB4444TOARGB
1007 MEMACCESS(1) 954 MEMACCESS(1)
1008 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels 955 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
1009 "b.gt 1b \n" 956 "b.gt 1b \n"
1010 : "+r"(src_argb4444), // %0 957 : "+r"(src_argb4444), // %0
1011 "+r"(dst_argb), // %1 958 "+r"(dst_argb), // %1
1012 "+r"(width) // %2 959 "+r"(width) // %2
1013 : 960 :
1014 : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List 961 : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
1015 ); 962 );
1016 } 963 }
1017 #endif // HAS_ARGB4444TOARGBROW_NEON
1018 964
1019 #ifdef HAS_ARGBTORGB24ROW_NEON
1020 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) { 965 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
1021 asm volatile ( 966 asm volatile (
1022 "1: \n" 967 "1: \n"
1023 MEMACCESS(0) 968 MEMACCESS(0)
1024 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels 969 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels
1025 "subs %w2, %w2, #8 \n" // 8 processed per loop. 970 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1026 MEMACCESS(1) 971 MEMACCESS(1)
1027 "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. 972 "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24.
1028 "b.gt 1b \n" 973 "b.gt 1b \n"
1029 : "+r"(src_argb), // %0 974 : "+r"(src_argb), // %0
1030 "+r"(dst_rgb24), // %1 975 "+r"(dst_rgb24), // %1
1031 "+r"(width) // %2 976 "+r"(width) // %2
1032 : 977 :
1033 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List 978 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
1034 ); 979 );
1035 } 980 }
1036 #endif // HAS_ARGBTORGB24ROW_NEON
1037 981
1038 #ifdef HAS_ARGBTORAWROW_NEON
1039 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) { 982 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
1040 asm volatile ( 983 asm volatile (
1041 "1: \n" 984 "1: \n"
1042 MEMACCESS(0) 985 MEMACCESS(0)
1043 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a 986 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
1044 "subs %w2, %w2, #8 \n" // 8 processed per loop. 987 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1045 "orr v4.8b, v2.8b, v2.8b \n" // mov g 988 "orr v4.8b, v2.8b, v2.8b \n" // mov g
1046 "orr v5.8b, v1.8b, v1.8b \n" // mov b 989 "orr v5.8b, v1.8b, v1.8b \n" // mov b
1047 MEMACCESS(1) 990 MEMACCESS(1)
1048 "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b 991 "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
1049 "b.gt 1b \n" 992 "b.gt 1b \n"
1050 : "+r"(src_argb), // %0 993 : "+r"(src_argb), // %0
1051 "+r"(dst_raw), // %1 994 "+r"(dst_raw), // %1
1052 "+r"(width) // %2 995 "+r"(width) // %2
1053 : 996 :
1054 : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List 997 : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List
1055 ); 998 );
1056 } 999 }
1057 #endif // HAS_ARGBTORAWROW_NEON
1058 1000
1059 #ifdef HAS_YUY2TOYROW_NEON
1060 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) { 1001 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
1061 asm volatile ( 1002 asm volatile (
1062 "1: \n" 1003 "1: \n"
1063 MEMACCESS(0) 1004 MEMACCESS(0)
1064 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. 1005 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
1065 "subs %w2, %w2, #16 \n" // 16 processed per loop. 1006 "subs %w2, %w2, #16 \n" // 16 processed per loop.
1066 MEMACCESS(1) 1007 MEMACCESS(1)
1067 "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. 1008 "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
1068 "b.gt 1b \n" 1009 "b.gt 1b \n"
1069 : "+r"(src_yuy2), // %0 1010 : "+r"(src_yuy2), // %0
1070 "+r"(dst_y), // %1 1011 "+r"(dst_y), // %1
1071 "+r"(width) // %2 1012 "+r"(width) // %2
1072 : 1013 :
1073 : "cc", "memory", "v0", "v1" // Clobber List 1014 : "cc", "memory", "v0", "v1" // Clobber List
1074 ); 1015 );
1075 } 1016 }
1076 #endif // HAS_YUY2TOYROW_NEON
1077 1017
1078 #ifdef HAS_UYVYTOYROW_NEON
1079 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) { 1018 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
1080 asm volatile ( 1019 asm volatile (
1081 "1: \n" 1020 "1: \n"
1082 MEMACCESS(0) 1021 MEMACCESS(0)
1083 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. 1022 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
1084 "subs %w2, %w2, #16 \n" // 16 processed per loop. 1023 "subs %w2, %w2, #16 \n" // 16 processed per loop.
1085 MEMACCESS(1) 1024 MEMACCESS(1)
1086 "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. 1025 "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
1087 "b.gt 1b \n" 1026 "b.gt 1b \n"
1088 : "+r"(src_uyvy), // %0 1027 : "+r"(src_uyvy), // %0
1089 "+r"(dst_y), // %1 1028 "+r"(dst_y), // %1
1090 "+r"(width) // %2 1029 "+r"(width) // %2
1091 : 1030 :
1092 : "cc", "memory", "v0", "v1" // Clobber List 1031 : "cc", "memory", "v0", "v1" // Clobber List
1093 ); 1032 );
1094 } 1033 }
1095 #endif // HAS_UYVYTOYROW_NEON
1096 1034
1097 #ifdef HAS_YUY2TOUV422ROW_NEON
1098 void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, 1035 void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
1099 int width) { 1036 int width) {
1100 asm volatile ( 1037 asm volatile (
1101 "1: \n" 1038 "1: \n"
1102 MEMACCESS(0) 1039 MEMACCESS(0)
1103 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels 1040 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels
1104 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. 1041 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
1105 MEMACCESS(1) 1042 MEMACCESS(1)
1106 "st1 {v1.8b}, [%1], #8 \n" // store 8 U. 1043 "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
1107 MEMACCESS(2) 1044 MEMACCESS(2)
1108 "st1 {v3.8b}, [%2], #8 \n" // store 8 V. 1045 "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
1109 "b.gt 1b \n" 1046 "b.gt 1b \n"
1110 : "+r"(src_yuy2), // %0 1047 : "+r"(src_yuy2), // %0
1111 "+r"(dst_u), // %1 1048 "+r"(dst_u), // %1
1112 "+r"(dst_v), // %2 1049 "+r"(dst_v), // %2
1113 "+r"(width) // %3 1050 "+r"(width) // %3
1114 : 1051 :
1115 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 1052 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
1116 ); 1053 );
1117 } 1054 }
1118 #endif // HAS_YUY2TOUV422ROW_NEON
1119 1055
1120 #ifdef HAS_UYVYTOUV422ROW_NEON
1121 void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, 1056 void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
1122 int width) { 1057 int width) {
1123 asm volatile ( 1058 asm volatile (
1124 "1: \n" 1059 "1: \n"
1125 MEMACCESS(0) 1060 MEMACCESS(0)
1126 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels 1061 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels
1127 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. 1062 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
1128 MEMACCESS(1) 1063 MEMACCESS(1)
1129 "st1 {v0.8b}, [%1], #8 \n" // store 8 U. 1064 "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
1130 MEMACCESS(2) 1065 MEMACCESS(2)
1131 "st1 {v2.8b}, [%2], #8 \n" // store 8 V. 1066 "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
1132 "b.gt 1b \n" 1067 "b.gt 1b \n"
1133 : "+r"(src_uyvy), // %0 1068 : "+r"(src_uyvy), // %0
1134 "+r"(dst_u), // %1 1069 "+r"(dst_u), // %1
1135 "+r"(dst_v), // %2 1070 "+r"(dst_v), // %2
1136 "+r"(width) // %3 1071 "+r"(width) // %3
1137 : 1072 :
1138 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 1073 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
1139 ); 1074 );
1140 } 1075 }
1141 #endif // HAS_UYVYTOUV422ROW_NEON
1142 1076
1143 #ifdef HAS_YUY2TOUVROW_NEON
1144 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, 1077 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
1145 uint8* dst_u, uint8* dst_v, int width) { 1078 uint8* dst_u, uint8* dst_v, int width) {
1146 const uint8* src_yuy2b = src_yuy2 + stride_yuy2; 1079 const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
1147 asm volatile ( 1080 asm volatile (
1148 "1: \n" 1081 "1: \n"
1149 MEMACCESS(0) 1082 MEMACCESS(0)
1150 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels 1083 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
1151 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. 1084 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
1152 MEMACCESS(1) 1085 MEMACCESS(1)
1153 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row 1086 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
1154 "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U 1087 "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
1155 "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V 1088 "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
1156 MEMACCESS(2) 1089 MEMACCESS(2)
1157 "st1 {v1.8b}, [%2], #8 \n" // store 8 U. 1090 "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
1158 MEMACCESS(3) 1091 MEMACCESS(3)
1159 "st1 {v3.8b}, [%3], #8 \n" // store 8 V. 1092 "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
1160 "b.gt 1b \n" 1093 "b.gt 1b \n"
1161 : "+r"(src_yuy2), // %0 1094 : "+r"(src_yuy2), // %0
1162 "+r"(src_yuy2b), // %1 1095 "+r"(src_yuy2b), // %1
1163 "+r"(dst_u), // %2 1096 "+r"(dst_u), // %2
1164 "+r"(dst_v), // %3 1097 "+r"(dst_v), // %3
1165 "+r"(width) // %4 1098 "+r"(width) // %4
1166 : 1099 :
1167 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", 1100 : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
1168 "v5", "v6", "v7" // Clobber List 1101 "v5", "v6", "v7" // Clobber List
1169 ); 1102 );
1170 } 1103 }
1171 #endif // HAS_YUY2TOUVROW_NEON
1172 1104
1173 #ifdef HAS_UYVYTOUVROW_NEON
1174 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, 1105 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
1175 uint8* dst_u, uint8* dst_v, int width) { 1106 uint8* dst_u, uint8* dst_v, int width) {
1176 const uint8* src_uyvyb = src_uyvy + stride_uyvy; 1107 const uint8* src_uyvyb = src_uyvy + stride_uyvy;
1177 asm volatile ( 1108 asm volatile (
1178 "1: \n" 1109 "1: \n"
1179 MEMACCESS(0) 1110 MEMACCESS(0)
1180 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels 1111 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
1181 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. 1112 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
1182 MEMACCESS(1) 1113 MEMACCESS(1)
1183 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row 1114 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
1184 "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U 1115 "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
1185 "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V 1116 "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
1186 MEMACCESS(2) 1117 MEMACCESS(2)
1187 "st1 {v0.8b}, [%2], #8 \n" // store 8 U. 1118 "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
1188 MEMACCESS(3) 1119 MEMACCESS(3)
1189 "st1 {v2.8b}, [%3], #8 \n" // store 8 V. 1120 "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
1190 "b.gt 1b \n" 1121 "b.gt 1b \n"
1191 : "+r"(src_uyvy), // %0 1122 : "+r"(src_uyvy), // %0
1192 "+r"(src_uyvyb), // %1 1123 "+r"(src_uyvyb), // %1
1193 "+r"(dst_u), // %2 1124 "+r"(dst_u), // %2
1194 "+r"(dst_v), // %3 1125 "+r"(dst_v), // %3
1195 "+r"(width) // %4 1126 "+r"(width) // %4
1196 : 1127 :
1197 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", 1128 : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
1198 "v5", "v6", "v7" // Clobber List 1129 "v5", "v6", "v7" // Clobber List
1199 ); 1130 );
1200 } 1131 }
1201 #endif // HAS_UYVYTOUVROW_NEON
1202 1132
1203 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 1133 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
1204 #ifdef HAS_ARGBSHUFFLEROW_NEON
1205 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, 1134 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
1206 const uint8* shuffler, int width) { 1135 const uint8* shuffler, int width) {
1207 asm volatile ( 1136 asm volatile (
1208 MEMACCESS(3) 1137 MEMACCESS(3)
1209 "ld1 {v2.16b}, [%3] \n" // shuffler 1138 "ld1 {v2.16b}, [%3] \n" // shuffler
1210 "1: \n" 1139 "1: \n"
1211 MEMACCESS(0) 1140 MEMACCESS(0)
1212 "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. 1141 "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
1213 "subs %w2, %w2, #4 \n" // 4 processed per loop 1142 "subs %w2, %w2, #4 \n" // 4 processed per loop
1214 "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels 1143 "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
1215 MEMACCESS(1) 1144 MEMACCESS(1)
1216 "st1 {v1.16b}, [%1], #16 \n" // store 4. 1145 "st1 {v1.16b}, [%1], #16 \n" // store 4.
1217 "b.gt 1b \n" 1146 "b.gt 1b \n"
1218 : "+r"(src_argb), // %0 1147 : "+r"(src_argb), // %0
1219 "+r"(dst_argb), // %1 1148 "+r"(dst_argb), // %1
1220 "+r"(width) // %2 1149 "+r"(width) // %2
1221 : "r"(shuffler) // %3 1150 : "r"(shuffler) // %3
1222 : "cc", "memory", "v0", "v1", "v2" // Clobber List 1151 : "cc", "memory", "v0", "v1", "v2" // Clobber List
1223 ); 1152 );
1224 } 1153 }
1225 #endif // HAS_ARGBSHUFFLEROW_NEON
1226 1154
1227 #ifdef HAS_I422TOYUY2ROW_NEON
1228 void I422ToYUY2Row_NEON(const uint8* src_y, 1155 void I422ToYUY2Row_NEON(const uint8* src_y,
1229 const uint8* src_u, 1156 const uint8* src_u,
1230 const uint8* src_v, 1157 const uint8* src_v,
1231 uint8* dst_yuy2, int width) { 1158 uint8* dst_yuy2, int width) {
1232 asm volatile ( 1159 asm volatile (
1233 "1: \n" 1160 "1: \n"
1234 MEMACCESS(0) 1161 MEMACCESS(0)
1235 "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys 1162 "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
1236 "orr v2.8b, v1.8b, v1.8b \n" 1163 "orr v2.8b, v1.8b, v1.8b \n"
1237 MEMACCESS(1) 1164 MEMACCESS(1)
1238 "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us 1165 "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
1239 MEMACCESS(2) 1166 MEMACCESS(2)
1240 "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs 1167 "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
1241 "subs %w4, %w4, #16 \n" // 16 pixels 1168 "subs %w4, %w4, #16 \n" // 16 pixels
1242 MEMACCESS(3) 1169 MEMACCESS(3)
1243 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. 1170 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
1244 "b.gt 1b \n" 1171 "b.gt 1b \n"
1245 : "+r"(src_y), // %0 1172 : "+r"(src_y), // %0
1246 "+r"(src_u), // %1 1173 "+r"(src_u), // %1
1247 "+r"(src_v), // %2 1174 "+r"(src_v), // %2
1248 "+r"(dst_yuy2), // %3 1175 "+r"(dst_yuy2), // %3
1249 "+r"(width) // %4 1176 "+r"(width) // %4
1250 : 1177 :
1251 : "cc", "memory", "v0", "v1", "v2", "v3" 1178 : "cc", "memory", "v0", "v1", "v2", "v3"
1252 ); 1179 );
1253 } 1180 }
1254 #endif // HAS_I422TOYUY2ROW_NEON
1255 1181
1256 #ifdef HAS_I422TOUYVYROW_NEON
1257 void I422ToUYVYRow_NEON(const uint8* src_y, 1182 void I422ToUYVYRow_NEON(const uint8* src_y,
1258 const uint8* src_u, 1183 const uint8* src_u,
1259 const uint8* src_v, 1184 const uint8* src_v,
1260 uint8* dst_uyvy, int width) { 1185 uint8* dst_uyvy, int width) {
1261 asm volatile ( 1186 asm volatile (
1262 "1: \n" 1187 "1: \n"
1263 MEMACCESS(0) 1188 MEMACCESS(0)
1264 "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys 1189 "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
1265 "orr v3.8b, v2.8b, v2.8b \n" 1190 "orr v3.8b, v2.8b, v2.8b \n"
1266 MEMACCESS(1) 1191 MEMACCESS(1)
1267 "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us 1192 "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
1268 MEMACCESS(2) 1193 MEMACCESS(2)
1269 "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs 1194 "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
1270 "subs %w4, %w4, #16 \n" // 16 pixels 1195 "subs %w4, %w4, #16 \n" // 16 pixels
1271 MEMACCESS(3) 1196 MEMACCESS(3)
1272 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. 1197 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
1273 "b.gt 1b \n" 1198 "b.gt 1b \n"
1274 : "+r"(src_y), // %0 1199 : "+r"(src_y), // %0
1275 "+r"(src_u), // %1 1200 "+r"(src_u), // %1
1276 "+r"(src_v), // %2 1201 "+r"(src_v), // %2
1277 "+r"(dst_uyvy), // %3 1202 "+r"(dst_uyvy), // %3
1278 "+r"(width) // %4 1203 "+r"(width) // %4
1279 : 1204 :
1280 : "cc", "memory", "v0", "v1", "v2", "v3" 1205 : "cc", "memory", "v0", "v1", "v2", "v3"
1281 ); 1206 );
1282 } 1207 }
1283 #endif // HAS_I422TOUYVYROW_NEON
1284 1208
1285 #ifdef HAS_ARGBTORGB565ROW_NEON
1286 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) { 1209 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
1287 asm volatile ( 1210 asm volatile (
1288 "1: \n" 1211 "1: \n"
1289 MEMACCESS(0) 1212 MEMACCESS(0)
1290 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels 1213 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
1291 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1214 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1292 ARGBTORGB565 1215 ARGBTORGB565
1293 MEMACCESS(1) 1216 MEMACCESS(1)
1294 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. 1217 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
1295 "b.gt 1b \n" 1218 "b.gt 1b \n"
1296 : "+r"(src_argb), // %0 1219 : "+r"(src_argb), // %0
1297 "+r"(dst_rgb565), // %1 1220 "+r"(dst_rgb565), // %1
1298 "+r"(width) // %2 1221 "+r"(width) // %2
1299 : 1222 :
1300 : "cc", "memory", "v0", "v20", "v21", "v22", "v23" 1223 : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
1301 ); 1224 );
1302 } 1225 }
1303 #endif // HAS_ARGBTORGB565ROW_NEON
1304 1226
1305 #ifdef HAS_ARGBTORGB565DITHERROW_NEON
1306 void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, 1227 void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
1307 const uint32 dither4, int width) { 1228 const uint32 dither4, int width) {
1308 asm volatile ( 1229 asm volatile (
1309 "dup v1.4s, %w2 \n" // dither4 1230 "dup v1.4s, %w2 \n" // dither4
1310 "1: \n" 1231 "1: \n"
1311 MEMACCESS(1) 1232 MEMACCESS(1)
1312 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels 1233 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels
1313 "subs %w3, %w3, #8 \n" // 8 processed per loop. 1234 "subs %w3, %w3, #8 \n" // 8 processed per loop.
1314 "uqadd v20.8b, v20.8b, v1.8b \n" 1235 "uqadd v20.8b, v20.8b, v1.8b \n"
1315 "uqadd v21.8b, v21.8b, v1.8b \n" 1236 "uqadd v21.8b, v21.8b, v1.8b \n"
1316 "uqadd v22.8b, v22.8b, v1.8b \n" 1237 "uqadd v22.8b, v22.8b, v1.8b \n"
1317 ARGBTORGB565 1238 ARGBTORGB565
1318 MEMACCESS(0) 1239 MEMACCESS(0)
1319 "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565. 1240 "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565.
1320 "b.gt 1b \n" 1241 "b.gt 1b \n"
1321 : "+r"(dst_rgb) // %0 1242 : "+r"(dst_rgb) // %0
1322 : "r"(src_argb), // %1 1243 : "r"(src_argb), // %1
1323 "r"(dither4), // %2 1244 "r"(dither4), // %2
1324 "r"(width) // %3 1245 "r"(width) // %3
1325 : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23" 1246 : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"
1326 ); 1247 );
1327 } 1248 }
1328 #endif // HAS_ARGBTORGB565ROW_NEON
1329 1249
1330 #ifdef HAS_ARGBTOARGB1555ROW_NEON
1331 void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, 1250 void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
1332 int width) { 1251 int width) {
1333 asm volatile ( 1252 asm volatile (
1334 "1: \n" 1253 "1: \n"
1335 MEMACCESS(0) 1254 MEMACCESS(0)
1336 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels 1255 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
1337 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1256 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1338 ARGBTOARGB1555 1257 ARGBTOARGB1555
1339 MEMACCESS(1) 1258 MEMACCESS(1)
1340 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555. 1259 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555.
1341 "b.gt 1b \n" 1260 "b.gt 1b \n"
1342 : "+r"(src_argb), // %0 1261 : "+r"(src_argb), // %0
1343 "+r"(dst_argb1555), // %1 1262 "+r"(dst_argb1555), // %1
1344 "+r"(width) // %2 1263 "+r"(width) // %2
1345 : 1264 :
1346 : "cc", "memory", "v0", "v20", "v21", "v22", "v23" 1265 : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
1347 ); 1266 );
1348 } 1267 }
1349 #endif // HAS_ARGBTOARGB1555ROW_NEON
1350 1268
1351 #ifdef HAS_ARGBTOARGB4444ROW_NEON
1352 void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, 1269 void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
1353 int width) { 1270 int width) {
1354 asm volatile ( 1271 asm volatile (
1355 "movi v4.16b, #0x0f \n" // bits to clear with vbic. 1272 "movi v4.16b, #0x0f \n" // bits to clear with vbic.
1356 "1: \n" 1273 "1: \n"
1357 MEMACCESS(0) 1274 MEMACCESS(0)
1358 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels 1275 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
1359 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1276 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1360 ARGBTOARGB4444 1277 ARGBTOARGB4444
1361 MEMACCESS(1) 1278 MEMACCESS(1)
1362 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444. 1279 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444.
1363 "b.gt 1b \n" 1280 "b.gt 1b \n"
1364 : "+r"(src_argb), // %0 1281 : "+r"(src_argb), // %0
1365 "+r"(dst_argb4444), // %1 1282 "+r"(dst_argb4444), // %1
1366 "+r"(width) // %2 1283 "+r"(width) // %2
1367 : 1284 :
1368 : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23" 1285 : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"
1369 ); 1286 );
1370 } 1287 }
1371 #endif // HAS_ARGBTOARGB4444ROW_NEON
1372 1288
1373 #ifdef HAS_ARGBTOYROW_NEON
1374 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { 1289 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
1375 asm volatile ( 1290 asm volatile (
1376 "movi v4.8b, #13 \n" // B * 0.1016 coefficient 1291 "movi v4.8b, #13 \n" // B * 0.1016 coefficient
1377 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 1292 "movi v5.8b, #65 \n" // G * 0.5078 coefficient
1378 "movi v6.8b, #33 \n" // R * 0.2578 coefficient 1293 "movi v6.8b, #33 \n" // R * 0.2578 coefficient
1379 "movi v7.8b, #16 \n" // Add 16 constant 1294 "movi v7.8b, #16 \n" // Add 16 constant
1380 "1: \n" 1295 "1: \n"
1381 MEMACCESS(0) 1296 MEMACCESS(0)
1382 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 1297 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
1383 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1298 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1384 "umull v3.8h, v0.8b, v4.8b \n" // B 1299 "umull v3.8h, v0.8b, v4.8b \n" // B
1385 "umlal v3.8h, v1.8b, v5.8b \n" // G 1300 "umlal v3.8h, v1.8b, v5.8b \n" // G
1386 "umlal v3.8h, v2.8b, v6.8b \n" // R 1301 "umlal v3.8h, v2.8b, v6.8b \n" // R
1387 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y 1302 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
1388 "uqadd v0.8b, v0.8b, v7.8b \n" 1303 "uqadd v0.8b, v0.8b, v7.8b \n"
1389 MEMACCESS(1) 1304 MEMACCESS(1)
1390 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 1305 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
1391 "b.gt 1b \n" 1306 "b.gt 1b \n"
1392 : "+r"(src_argb), // %0 1307 : "+r"(src_argb), // %0
1393 "+r"(dst_y), // %1 1308 "+r"(dst_y), // %1
1394 "+r"(width) // %2 1309 "+r"(width) // %2
1395 : 1310 :
1396 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" 1311 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
1397 ); 1312 );
1398 } 1313 }
1399 #endif // HAS_ARGBTOYROW_NEON
1400 1314
1401 #ifdef HAS_ARGBEXTRACTALPHAROW_NEON
1402 void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) { 1315 void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {
1403 asm volatile ( 1316 asm volatile (
1404 "1: \n" 1317 "1: \n"
1405 MEMACCESS(0) 1318 MEMACCESS(0)
1406 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16 pixels 1319 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16 pixels
1407 "subs %w2, %w2, #16 \n" // 16 processed per loop 1320 "subs %w2, %w2, #16 \n" // 16 processed per loop
1408 MEMACCESS(1) 1321 MEMACCESS(1)
1409 "st1 {v3.16b}, [%1], #16 \n" // store 16 A's. 1322 "st1 {v3.16b}, [%1], #16 \n" // store 16 A's.
1410 "b.gt 1b \n" 1323 "b.gt 1b \n"
1411 : "+r"(src_argb), // %0 1324 : "+r"(src_argb), // %0
1412 "+r"(dst_a), // %1 1325 "+r"(dst_a), // %1
1413 "+r"(width) // %2 1326 "+r"(width) // %2
1414 : 1327 :
1415 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 1328 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
1416 ); 1329 );
1417 } 1330 }
1418 #endif // HAS_ARGBEXTRACTALPHAROW_NEON
1419 1331
1420 #ifdef HAS_ARGBTOYJROW_NEON
1421 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { 1332 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
1422 asm volatile ( 1333 asm volatile (
1423 "movi v4.8b, #15 \n" // B * 0.11400 coefficient 1334 "movi v4.8b, #15 \n" // B * 0.11400 coefficient
1424 "movi v5.8b, #75 \n" // G * 0.58700 coefficient 1335 "movi v5.8b, #75 \n" // G * 0.58700 coefficient
1425 "movi v6.8b, #38 \n" // R * 0.29900 coefficient 1336 "movi v6.8b, #38 \n" // R * 0.29900 coefficient
1426 "1: \n" 1337 "1: \n"
1427 MEMACCESS(0) 1338 MEMACCESS(0)
1428 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 1339 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
1429 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1340 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1430 "umull v3.8h, v0.8b, v4.8b \n" // B 1341 "umull v3.8h, v0.8b, v4.8b \n" // B
1431 "umlal v3.8h, v1.8b, v5.8b \n" // G 1342 "umlal v3.8h, v1.8b, v5.8b \n" // G
1432 "umlal v3.8h, v2.8b, v6.8b \n" // R 1343 "umlal v3.8h, v2.8b, v6.8b \n" // R
1433 "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y 1344 "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y
1434 MEMACCESS(1) 1345 MEMACCESS(1)
1435 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 1346 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
1436 "b.gt 1b \n" 1347 "b.gt 1b \n"
1437 : "+r"(src_argb), // %0 1348 : "+r"(src_argb), // %0
1438 "+r"(dst_y), // %1 1349 "+r"(dst_y), // %1
1439 "+r"(width) // %2 1350 "+r"(width) // %2
1440 : 1351 :
1441 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" 1352 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
1442 ); 1353 );
1443 } 1354 }
1444 #endif // HAS_ARGBTOYJROW_NEON
1445 1355
1446 // 8x1 pixels. 1356 // 8x1 pixels.
1447 #ifdef HAS_ARGBTOUV444ROW_NEON
1448 void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, 1357 void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
1449 int width) { 1358 int width) {
1450 asm volatile ( 1359 asm volatile (
1451 "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient 1360 "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient
1452 "movi v25.8b, #74 \n" // UG -0.5781 coefficient 1361 "movi v25.8b, #74 \n" // UG -0.5781 coefficient
1453 "movi v26.8b, #38 \n" // UR -0.2969 coefficient 1362 "movi v26.8b, #38 \n" // UR -0.2969 coefficient
1454 "movi v27.8b, #18 \n" // VB -0.1406 coefficient 1363 "movi v27.8b, #18 \n" // VB -0.1406 coefficient
1455 "movi v28.8b, #94 \n" // VG -0.7344 coefficient 1364 "movi v28.8b, #94 \n" // VG -0.7344 coefficient
1456 "movi v29.16b,#0x80 \n" // 128.5 1365 "movi v29.16b,#0x80 \n" // 128.5
1457 "1: \n" 1366 "1: \n"
(...skipping 20 matching lines...) Expand all
1478 "b.gt 1b \n" 1387 "b.gt 1b \n"
1479 : "+r"(src_argb), // %0 1388 : "+r"(src_argb), // %0
1480 "+r"(dst_u), // %1 1389 "+r"(dst_u), // %1
1481 "+r"(dst_v), // %2 1390 "+r"(dst_v), // %2
1482 "+r"(width) // %3 1391 "+r"(width) // %3
1483 : 1392 :
1484 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", 1393 : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
1485 "v24", "v25", "v26", "v27", "v28", "v29" 1394 "v24", "v25", "v26", "v27", "v28", "v29"
1486 ); 1395 );
1487 } 1396 }
1488 #endif // HAS_ARGBTOUV444ROW_NEON
1489 1397
1490 #define RGBTOUV_SETUP_REG \ 1398 #define RGBTOUV_SETUP_REG \
1491 "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \ 1399 "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \
1492 "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \ 1400 "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \
1493 "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \ 1401 "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \
1494 "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \ 1402 "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \
1495 "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \ 1403 "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \
1496 "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ 1404 "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */
1497 1405
1498 // 32x1 pixels -> 8x1. width is number of argb pixels. e.g. 32. 1406 // 32x1 pixels -> 8x1. width is number of argb pixels. e.g. 32.
1499 #ifdef HAS_ARGBTOUV411ROW_NEON
1500 void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, 1407 void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
1501 int width) { 1408 int width) {
1502 asm volatile ( 1409 asm volatile (
1503 RGBTOUV_SETUP_REG 1410 RGBTOUV_SETUP_REG
1504 "1: \n" 1411 "1: \n"
1505 MEMACCESS(0) 1412 MEMACCESS(0)
1506 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. 1413 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
1507 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. 1414 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
1508 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. 1415 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
1509 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. 1416 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
(...skipping 29 matching lines...) Expand all
1539 "b.gt 1b \n" 1446 "b.gt 1b \n"
1540 : "+r"(src_argb), // %0 1447 : "+r"(src_argb), // %0
1541 "+r"(dst_u), // %1 1448 "+r"(dst_u), // %1
1542 "+r"(dst_v), // %2 1449 "+r"(dst_v), // %2
1543 "+r"(width) // %3 1450 "+r"(width) // %3
1544 : 1451 :
1545 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 1452 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1546 "v20", "v21", "v22", "v23", "v24", "v25" 1453 "v20", "v21", "v22", "v23", "v24", "v25"
1547 ); 1454 );
1548 } 1455 }
1549 #endif // HAS_ARGBTOUV411ROW_NEON
1550 1456
1551 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. 1457 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
1552 #define RGBTOUV(QB, QG, QR) \ 1458 #define RGBTOUV(QB, QG, QR) \
1553 "mul v3.8h, " #QB ",v20.8h \n" /* B */ \ 1459 "mul v3.8h, " #QB ",v20.8h \n" /* B */ \
1554 "mul v4.8h, " #QR ",v20.8h \n" /* R */ \ 1460 "mul v4.8h, " #QR ",v20.8h \n" /* R */ \
1555 "mls v3.8h, " #QG ",v21.8h \n" /* G */ \ 1461 "mls v3.8h, " #QG ",v21.8h \n" /* G */ \
1556 "mls v4.8h, " #QG ",v24.8h \n" /* G */ \ 1462 "mls v4.8h, " #QG ",v24.8h \n" /* G */ \
1557 "mls v3.8h, " #QR ",v22.8h \n" /* R */ \ 1463 "mls v3.8h, " #QR ",v22.8h \n" /* R */ \
1558 "mls v4.8h, " #QB ",v23.8h \n" /* B */ \ 1464 "mls v4.8h, " #QB ",v23.8h \n" /* B */ \
1559 "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ 1465 "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \
1560 "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ 1466 "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \
1561 "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \ 1467 "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \
1562 "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */ 1468 "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */
1563 1469
1564 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. 1470 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
1565 // TODO(fbarchard): consider ptrdiff_t for all strides. 1471 // TODO(fbarchard): consider ptrdiff_t for all strides.
1566 1472
1567 #ifdef HAS_ARGBTOUVROW_NEON
1568 void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, 1473 void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
1569 uint8* dst_u, uint8* dst_v, int width) { 1474 uint8* dst_u, uint8* dst_v, int width) {
1570 const uint8* src_argb_1 = src_argb + src_stride_argb; 1475 const uint8* src_argb_1 = src_argb + src_stride_argb;
1571 asm volatile ( 1476 asm volatile (
1572 RGBTOUV_SETUP_REG 1477 RGBTOUV_SETUP_REG
1573 "1: \n" 1478 "1: \n"
1574 MEMACCESS(0) 1479 MEMACCESS(0)
1575 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. 1480 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
1576 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. 1481 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
1577 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. 1482 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
(...skipping 19 matching lines...) Expand all
1597 : "+r"(src_argb), // %0 1502 : "+r"(src_argb), // %0
1598 "+r"(src_argb_1), // %1 1503 "+r"(src_argb_1), // %1
1599 "+r"(dst_u), // %2 1504 "+r"(dst_u), // %2
1600 "+r"(dst_v), // %3 1505 "+r"(dst_v), // %3
1601 "+r"(width) // %4 1506 "+r"(width) // %4
1602 : 1507 :
1603 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 1508 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1604 "v20", "v21", "v22", "v23", "v24", "v25" 1509 "v20", "v21", "v22", "v23", "v24", "v25"
1605 ); 1510 );
1606 } 1511 }
1607 #endif // HAS_ARGBTOUVROW_NEON
1608 1512
1609 // TODO(fbarchard): Subsample match C code. 1513 // TODO(fbarchard): Subsample match C code.
1610 #ifdef HAS_ARGBTOUVJROW_NEON
1611 void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, 1514 void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
1612 uint8* dst_u, uint8* dst_v, int width) { 1515 uint8* dst_u, uint8* dst_v, int width) {
1613 const uint8* src_argb_1 = src_argb + src_stride_argb; 1516 const uint8* src_argb_1 = src_argb + src_stride_argb;
1614 asm volatile ( 1517 asm volatile (
1615 "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 1518 "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
1616 "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 1519 "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
1617 "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 1520 "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
1618 "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 1521 "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
1619 "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 1522 "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
1620 "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) 1523 "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
(...skipping 23 matching lines...) Expand all
1644 : "+r"(src_argb), // %0 1547 : "+r"(src_argb), // %0
1645 "+r"(src_argb_1), // %1 1548 "+r"(src_argb_1), // %1
1646 "+r"(dst_u), // %2 1549 "+r"(dst_u), // %2
1647 "+r"(dst_v), // %3 1550 "+r"(dst_v), // %3
1648 "+r"(width) // %4 1551 "+r"(width) // %4
1649 : 1552 :
1650 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 1553 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1651 "v20", "v21", "v22", "v23", "v24", "v25" 1554 "v20", "v21", "v22", "v23", "v24", "v25"
1652 ); 1555 );
1653 } 1556 }
1654 #endif // HAS_ARGBTOUVJROW_NEON
1655 1557
1656 #ifdef HAS_BGRATOUVROW_NEON
1657 void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, 1558 void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
1658 uint8* dst_u, uint8* dst_v, int width) { 1559 uint8* dst_u, uint8* dst_v, int width) {
1659 const uint8* src_bgra_1 = src_bgra + src_stride_bgra; 1560 const uint8* src_bgra_1 = src_bgra + src_stride_bgra;
1660 asm volatile ( 1561 asm volatile (
1661 RGBTOUV_SETUP_REG 1562 RGBTOUV_SETUP_REG
1662 "1: \n" 1563 "1: \n"
1663 MEMACCESS(0) 1564 MEMACCESS(0)
1664 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. 1565 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
1665 "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts. 1566 "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
1666 "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts. 1567 "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
(...skipping 18 matching lines...) Expand all
1685 : "+r"(src_bgra), // %0 1586 : "+r"(src_bgra), // %0
1686 "+r"(src_bgra_1), // %1 1587 "+r"(src_bgra_1), // %1
1687 "+r"(dst_u), // %2 1588 "+r"(dst_u), // %2
1688 "+r"(dst_v), // %3 1589 "+r"(dst_v), // %3
1689 "+r"(width) // %4 1590 "+r"(width) // %4
1690 : 1591 :
1691 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 1592 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1692 "v20", "v21", "v22", "v23", "v24", "v25" 1593 "v20", "v21", "v22", "v23", "v24", "v25"
1693 ); 1594 );
1694 } 1595 }
1695 #endif // HAS_BGRATOUVROW_NEON
1696 1596
1697 #ifdef HAS_ABGRTOUVROW_NEON
1698 void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, 1597 void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
1699 uint8* dst_u, uint8* dst_v, int width) { 1598 uint8* dst_u, uint8* dst_v, int width) {
1700 const uint8* src_abgr_1 = src_abgr + src_stride_abgr; 1599 const uint8* src_abgr_1 = src_abgr + src_stride_abgr;
1701 asm volatile ( 1600 asm volatile (
1702 RGBTOUV_SETUP_REG 1601 RGBTOUV_SETUP_REG
1703 "1: \n" 1602 "1: \n"
1704 MEMACCESS(0) 1603 MEMACCESS(0)
1705 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. 1604 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
1706 "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. 1605 "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
1707 "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts. 1606 "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
(...skipping 18 matching lines...) Expand all
1726 : "+r"(src_abgr), // %0 1625 : "+r"(src_abgr), // %0
1727 "+r"(src_abgr_1), // %1 1626 "+r"(src_abgr_1), // %1
1728 "+r"(dst_u), // %2 1627 "+r"(dst_u), // %2
1729 "+r"(dst_v), // %3 1628 "+r"(dst_v), // %3
1730 "+r"(width) // %4 1629 "+r"(width) // %4
1731 : 1630 :
1732 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 1631 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1733 "v20", "v21", "v22", "v23", "v24", "v25" 1632 "v20", "v21", "v22", "v23", "v24", "v25"
1734 ); 1633 );
1735 } 1634 }
1736 #endif // HAS_ABGRTOUVROW_NEON
1737 1635
1738 #ifdef HAS_RGBATOUVROW_NEON
1739 void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, 1636 void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
1740 uint8* dst_u, uint8* dst_v, int width) { 1637 uint8* dst_u, uint8* dst_v, int width) {
1741 const uint8* src_rgba_1 = src_rgba + src_stride_rgba; 1638 const uint8* src_rgba_1 = src_rgba + src_stride_rgba;
1742 asm volatile ( 1639 asm volatile (
1743 RGBTOUV_SETUP_REG 1640 RGBTOUV_SETUP_REG
1744 "1: \n" 1641 "1: \n"
1745 MEMACCESS(0) 1642 MEMACCESS(0)
1746 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. 1643 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
1747 "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. 1644 "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts.
1748 "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts. 1645 "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
(...skipping 18 matching lines...) Expand all
1767 : "+r"(src_rgba), // %0 1664 : "+r"(src_rgba), // %0
1768 "+r"(src_rgba_1), // %1 1665 "+r"(src_rgba_1), // %1
1769 "+r"(dst_u), // %2 1666 "+r"(dst_u), // %2
1770 "+r"(dst_v), // %3 1667 "+r"(dst_v), // %3
1771 "+r"(width) // %4 1668 "+r"(width) // %4
1772 : 1669 :
1773 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 1670 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1774 "v20", "v21", "v22", "v23", "v24", "v25" 1671 "v20", "v21", "v22", "v23", "v24", "v25"
1775 ); 1672 );
1776 } 1673 }
1777 #endif // HAS_RGBATOUVROW_NEON
1778 1674
1779 #ifdef HAS_RGB24TOUVROW_NEON
1780 void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, 1675 void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
1781 uint8* dst_u, uint8* dst_v, int width) { 1676 uint8* dst_u, uint8* dst_v, int width) {
1782 const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24; 1677 const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
1783 asm volatile ( 1678 asm volatile (
1784 RGBTOUV_SETUP_REG 1679 RGBTOUV_SETUP_REG
1785 "1: \n" 1680 "1: \n"
1786 MEMACCESS(0) 1681 MEMACCESS(0)
1787 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. 1682 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
1788 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. 1683 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
1789 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. 1684 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
(...skipping 18 matching lines...) Expand all
1808 : "+r"(src_rgb24), // %0 1703 : "+r"(src_rgb24), // %0
1809 "+r"(src_rgb24_1), // %1 1704 "+r"(src_rgb24_1), // %1
1810 "+r"(dst_u), // %2 1705 "+r"(dst_u), // %2
1811 "+r"(dst_v), // %3 1706 "+r"(dst_v), // %3
1812 "+r"(width) // %4 1707 "+r"(width) // %4
1813 : 1708 :
1814 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 1709 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1815 "v20", "v21", "v22", "v23", "v24", "v25" 1710 "v20", "v21", "v22", "v23", "v24", "v25"
1816 ); 1711 );
1817 } 1712 }
1818 #endif // HAS_RGB24TOUVROW_NEON
1819 1713
1820 #ifdef HAS_RAWTOUVROW_NEON
1821 void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, 1714 void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
1822 uint8* dst_u, uint8* dst_v, int width) { 1715 uint8* dst_u, uint8* dst_v, int width) {
1823 const uint8* src_raw_1 = src_raw + src_stride_raw; 1716 const uint8* src_raw_1 = src_raw + src_stride_raw;
1824 asm volatile ( 1717 asm volatile (
1825 RGBTOUV_SETUP_REG 1718 RGBTOUV_SETUP_REG
1826 "1: \n" 1719 "1: \n"
1827 MEMACCESS(0) 1720 MEMACCESS(0)
1828 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels. 1721 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels.
1829 "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. 1722 "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
1830 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. 1723 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
(...skipping 18 matching lines...) Expand all
1849 : "+r"(src_raw), // %0 1742 : "+r"(src_raw), // %0
1850 "+r"(src_raw_1), // %1 1743 "+r"(src_raw_1), // %1
1851 "+r"(dst_u), // %2 1744 "+r"(dst_u), // %2
1852 "+r"(dst_v), // %3 1745 "+r"(dst_v), // %3
1853 "+r"(width) // %4 1746 "+r"(width) // %4
1854 : 1747 :
1855 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 1748 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1856 "v20", "v21", "v22", "v23", "v24", "v25" 1749 "v20", "v21", "v22", "v23", "v24", "v25"
1857 ); 1750 );
1858 } 1751 }
1859 #endif // HAS_RAWTOUVROW_NEON
1860 1752
1861 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. 1753 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
1862 #ifdef HAS_RGB565TOUVROW_NEON
1863 void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, 1754 void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
1864 uint8* dst_u, uint8* dst_v, int width) { 1755 uint8* dst_u, uint8* dst_v, int width) {
1865 const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565; 1756 const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
1866 asm volatile ( 1757 asm volatile (
1867 "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2 1758 "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2
1868 "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2 1759 "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2
1869 "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2 1760 "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2
1870 "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2 1761 "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2
1871 "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2 1762 "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2
1872 "movi v27.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) 1763 "movi v27.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
(...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after
1925 "+r"(src_rgb565_1), // %1 1816 "+r"(src_rgb565_1), // %1
1926 "+r"(dst_u), // %2 1817 "+r"(dst_u), // %2
1927 "+r"(dst_v), // %3 1818 "+r"(dst_v), // %3
1928 "+r"(width) // %4 1819 "+r"(width) // %4
1929 : 1820 :
1930 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 1821 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1931 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", 1822 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
1932 "v25", "v26", "v27" 1823 "v25", "v26", "v27"
1933 ); 1824 );
1934 } 1825 }
1935 #endif // HAS_RGB565TOUVROW_NEON
1936 1826
1937 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. 1827 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
1938 #ifdef HAS_ARGB1555TOUVROW_NEON
1939 void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, 1828 void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
1940 uint8* dst_u, uint8* dst_v, int width) { 1829 uint8* dst_u, uint8* dst_v, int width) {
1941 const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555; 1830 const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
1942 asm volatile ( 1831 asm volatile (
1943 RGBTOUV_SETUP_REG 1832 RGBTOUV_SETUP_REG
1944 "1: \n" 1833 "1: \n"
1945 MEMACCESS(0) 1834 MEMACCESS(0)
1946 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. 1835 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
1947 RGB555TOARGB 1836 RGB555TOARGB
1948 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 1837 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
(...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after
1996 "+r"(src_argb1555_1), // %1 1885 "+r"(src_argb1555_1), // %1
1997 "+r"(dst_u), // %2 1886 "+r"(dst_u), // %2
1998 "+r"(dst_v), // %3 1887 "+r"(dst_v), // %3
1999 "+r"(width) // %4 1888 "+r"(width) // %4
2000 : 1889 :
2001 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", 1890 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
2002 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", 1891 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
2003 "v26", "v27", "v28" 1892 "v26", "v27", "v28"
2004 ); 1893 );
2005 } 1894 }
2006 #endif // HAS_ARGB1555TOUVROW_NEON
2007 1895
2008 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. 1896 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
2009 #ifdef HAS_ARGB4444TOUVROW_NEON
2010 void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, 1897 void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
2011 uint8* dst_u, uint8* dst_v, int width) { 1898 uint8* dst_u, uint8* dst_v, int width) {
2012 const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444; 1899 const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
2013 asm volatile ( 1900 asm volatile (
2014 RGBTOUV_SETUP_REG 1901 RGBTOUV_SETUP_REG
2015 "1: \n" 1902 "1: \n"
2016 MEMACCESS(0) 1903 MEMACCESS(0)
2017 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. 1904 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
2018 ARGB4444TOARGB 1905 ARGB4444TOARGB
2019 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 1906 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after
2068 "+r"(dst_u), // %2 1955 "+r"(dst_u), // %2
2069 "+r"(dst_v), // %3 1956 "+r"(dst_v), // %3
2070 "+r"(width) // %4 1957 "+r"(width) // %4
2071 : 1958 :
2072 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", 1959 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
2073 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", 1960 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
2074 "v26", "v27", "v28" 1961 "v26", "v27", "v28"
2075 1962
2076 ); 1963 );
2077 } 1964 }
2078 #endif // HAS_ARGB4444TOUVROW_NEON
2079 1965
2080 #ifdef HAS_RGB565TOYROW_NEON
2081 void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) { 1966 void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
2082 asm volatile ( 1967 asm volatile (
2083 "movi v24.8b, #13 \n" // B * 0.1016 coefficient 1968 "movi v24.8b, #13 \n" // B * 0.1016 coefficient
2084 "movi v25.8b, #65 \n" // G * 0.5078 coefficient 1969 "movi v25.8b, #65 \n" // G * 0.5078 coefficient
2085 "movi v26.8b, #33 \n" // R * 0.2578 coefficient 1970 "movi v26.8b, #33 \n" // R * 0.2578 coefficient
2086 "movi v27.8b, #16 \n" // Add 16 constant 1971 "movi v27.8b, #16 \n" // Add 16 constant
2087 "1: \n" 1972 "1: \n"
2088 MEMACCESS(0) 1973 MEMACCESS(0)
2089 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. 1974 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
2090 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1975 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2091 RGB565TOARGB 1976 RGB565TOARGB
2092 "umull v3.8h, v0.8b, v24.8b \n" // B 1977 "umull v3.8h, v0.8b, v24.8b \n" // B
2093 "umlal v3.8h, v1.8b, v25.8b \n" // G 1978 "umlal v3.8h, v1.8b, v25.8b \n" // G
2094 "umlal v3.8h, v2.8b, v26.8b \n" // R 1979 "umlal v3.8h, v2.8b, v26.8b \n" // R
2095 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y 1980 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
2096 "uqadd v0.8b, v0.8b, v27.8b \n" 1981 "uqadd v0.8b, v0.8b, v27.8b \n"
2097 MEMACCESS(1) 1982 MEMACCESS(1)
2098 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 1983 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2099 "b.gt 1b \n" 1984 "b.gt 1b \n"
2100 : "+r"(src_rgb565), // %0 1985 : "+r"(src_rgb565), // %0
2101 "+r"(dst_y), // %1 1986 "+r"(dst_y), // %1
2102 "+r"(width) // %2 1987 "+r"(width) // %2
2103 : 1988 :
2104 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", 1989 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6",
2105 "v24", "v25", "v26", "v27" 1990 "v24", "v25", "v26", "v27"
2106 ); 1991 );
2107 } 1992 }
2108 #endif // HAS_RGB565TOYROW_NEON
2109 1993
2110 #ifdef HAS_ARGB1555TOYROW_NEON
2111 void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) { 1994 void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
2112 asm volatile ( 1995 asm volatile (
2113 "movi v4.8b, #13 \n" // B * 0.1016 coefficient 1996 "movi v4.8b, #13 \n" // B * 0.1016 coefficient
2114 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 1997 "movi v5.8b, #65 \n" // G * 0.5078 coefficient
2115 "movi v6.8b, #33 \n" // R * 0.2578 coefficient 1998 "movi v6.8b, #33 \n" // R * 0.2578 coefficient
2116 "movi v7.8b, #16 \n" // Add 16 constant 1999 "movi v7.8b, #16 \n" // Add 16 constant
2117 "1: \n" 2000 "1: \n"
2118 MEMACCESS(0) 2001 MEMACCESS(0)
2119 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. 2002 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
2120 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2003 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2121 ARGB1555TOARGB 2004 ARGB1555TOARGB
2122 "umull v3.8h, v0.8b, v4.8b \n" // B 2005 "umull v3.8h, v0.8b, v4.8b \n" // B
2123 "umlal v3.8h, v1.8b, v5.8b \n" // G 2006 "umlal v3.8h, v1.8b, v5.8b \n" // G
2124 "umlal v3.8h, v2.8b, v6.8b \n" // R 2007 "umlal v3.8h, v2.8b, v6.8b \n" // R
2125 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y 2008 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
2126 "uqadd v0.8b, v0.8b, v7.8b \n" 2009 "uqadd v0.8b, v0.8b, v7.8b \n"
2127 MEMACCESS(1) 2010 MEMACCESS(1)
2128 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 2011 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2129 "b.gt 1b \n" 2012 "b.gt 1b \n"
2130 : "+r"(src_argb1555), // %0 2013 : "+r"(src_argb1555), // %0
2131 "+r"(dst_y), // %1 2014 "+r"(dst_y), // %1
2132 "+r"(width) // %2 2015 "+r"(width) // %2
2133 : 2016 :
2134 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" 2017 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
2135 ); 2018 );
2136 } 2019 }
2137 #endif // HAS_ARGB1555TOYROW_NEON
2138 2020
2139 #ifdef HAS_ARGB4444TOYROW_NEON
2140 void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) { 2021 void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
2141 asm volatile ( 2022 asm volatile (
2142 "movi v24.8b, #13 \n" // B * 0.1016 coefficient 2023 "movi v24.8b, #13 \n" // B * 0.1016 coefficient
2143 "movi v25.8b, #65 \n" // G * 0.5078 coefficient 2024 "movi v25.8b, #65 \n" // G * 0.5078 coefficient
2144 "movi v26.8b, #33 \n" // R * 0.2578 coefficient 2025 "movi v26.8b, #33 \n" // R * 0.2578 coefficient
2145 "movi v27.8b, #16 \n" // Add 16 constant 2026 "movi v27.8b, #16 \n" // Add 16 constant
2146 "1: \n" 2027 "1: \n"
2147 MEMACCESS(0) 2028 MEMACCESS(0)
2148 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. 2029 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
2149 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2030 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2150 ARGB4444TOARGB 2031 ARGB4444TOARGB
2151 "umull v3.8h, v0.8b, v24.8b \n" // B 2032 "umull v3.8h, v0.8b, v24.8b \n" // B
2152 "umlal v3.8h, v1.8b, v25.8b \n" // G 2033 "umlal v3.8h, v1.8b, v25.8b \n" // G
2153 "umlal v3.8h, v2.8b, v26.8b \n" // R 2034 "umlal v3.8h, v2.8b, v26.8b \n" // R
2154 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y 2035 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
2155 "uqadd v0.8b, v0.8b, v27.8b \n" 2036 "uqadd v0.8b, v0.8b, v27.8b \n"
2156 MEMACCESS(1) 2037 MEMACCESS(1)
2157 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 2038 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2158 "b.gt 1b \n" 2039 "b.gt 1b \n"
2159 : "+r"(src_argb4444), // %0 2040 : "+r"(src_argb4444), // %0
2160 "+r"(dst_y), // %1 2041 "+r"(dst_y), // %1
2161 "+r"(width) // %2 2042 "+r"(width) // %2
2162 : 2043 :
2163 : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27" 2044 : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"
2164 ); 2045 );
2165 } 2046 }
2166 #endif // HAS_ARGB4444TOYROW_NEON
2167 2047
2168 #ifdef HAS_BGRATOYROW_NEON
2169 void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) { 2048 void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
2170 asm volatile ( 2049 asm volatile (
2171 "movi v4.8b, #33 \n" // R * 0.2578 coefficient 2050 "movi v4.8b, #33 \n" // R * 0.2578 coefficient
2172 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 2051 "movi v5.8b, #65 \n" // G * 0.5078 coefficient
2173 "movi v6.8b, #13 \n" // B * 0.1016 coefficient 2052 "movi v6.8b, #13 \n" // B * 0.1016 coefficient
2174 "movi v7.8b, #16 \n" // Add 16 constant 2053 "movi v7.8b, #16 \n" // Add 16 constant
2175 "1: \n" 2054 "1: \n"
2176 MEMACCESS(0) 2055 MEMACCESS(0)
2177 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. 2056 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
2178 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2057 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2179 "umull v16.8h, v1.8b, v4.8b \n" // R 2058 "umull v16.8h, v1.8b, v4.8b \n" // R
2180 "umlal v16.8h, v2.8b, v5.8b \n" // G 2059 "umlal v16.8h, v2.8b, v5.8b \n" // G
2181 "umlal v16.8h, v3.8b, v6.8b \n" // B 2060 "umlal v16.8h, v3.8b, v6.8b \n" // B
2182 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y 2061 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
2183 "uqadd v0.8b, v0.8b, v7.8b \n" 2062 "uqadd v0.8b, v0.8b, v7.8b \n"
2184 MEMACCESS(1) 2063 MEMACCESS(1)
2185 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 2064 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2186 "b.gt 1b \n" 2065 "b.gt 1b \n"
2187 : "+r"(src_bgra), // %0 2066 : "+r"(src_bgra), // %0
2188 "+r"(dst_y), // %1 2067 "+r"(dst_y), // %1
2189 "+r"(width) // %2 2068 "+r"(width) // %2
2190 : 2069 :
2191 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" 2070 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2192 ); 2071 );
2193 } 2072 }
2194 #endif // HAS_BGRATOYROW_NEON
2195 2073
2196 #ifdef HAS_ABGRTOYROW_NEON
2197 void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) { 2074 void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
2198 asm volatile ( 2075 asm volatile (
2199 "movi v4.8b, #33 \n" // R * 0.2578 coefficient 2076 "movi v4.8b, #33 \n" // R * 0.2578 coefficient
2200 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 2077 "movi v5.8b, #65 \n" // G * 0.5078 coefficient
2201 "movi v6.8b, #13 \n" // B * 0.1016 coefficient 2078 "movi v6.8b, #13 \n" // B * 0.1016 coefficient
2202 "movi v7.8b, #16 \n" // Add 16 constant 2079 "movi v7.8b, #16 \n" // Add 16 constant
2203 "1: \n" 2080 "1: \n"
2204 MEMACCESS(0) 2081 MEMACCESS(0)
2205 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. 2082 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
2206 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2083 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2207 "umull v16.8h, v0.8b, v4.8b \n" // R 2084 "umull v16.8h, v0.8b, v4.8b \n" // R
2208 "umlal v16.8h, v1.8b, v5.8b \n" // G 2085 "umlal v16.8h, v1.8b, v5.8b \n" // G
2209 "umlal v16.8h, v2.8b, v6.8b \n" // B 2086 "umlal v16.8h, v2.8b, v6.8b \n" // B
2210 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y 2087 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
2211 "uqadd v0.8b, v0.8b, v7.8b \n" 2088 "uqadd v0.8b, v0.8b, v7.8b \n"
2212 MEMACCESS(1) 2089 MEMACCESS(1)
2213 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 2090 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2214 "b.gt 1b \n" 2091 "b.gt 1b \n"
2215 : "+r"(src_abgr), // %0 2092 : "+r"(src_abgr), // %0
2216 "+r"(dst_y), // %1 2093 "+r"(dst_y), // %1
2217 "+r"(width) // %2 2094 "+r"(width) // %2
2218 : 2095 :
2219 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" 2096 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2220 ); 2097 );
2221 } 2098 }
2222 #endif // HAS_ABGRTOYROW_NEON
2223 2099
2224 #ifdef HAS_RGBATOYROW_NEON
2225 void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) { 2100 void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
2226 asm volatile ( 2101 asm volatile (
2227 "movi v4.8b, #13 \n" // B * 0.1016 coefficient 2102 "movi v4.8b, #13 \n" // B * 0.1016 coefficient
2228 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 2103 "movi v5.8b, #65 \n" // G * 0.5078 coefficient
2229 "movi v6.8b, #33 \n" // R * 0.2578 coefficient 2104 "movi v6.8b, #33 \n" // R * 0.2578 coefficient
2230 "movi v7.8b, #16 \n" // Add 16 constant 2105 "movi v7.8b, #16 \n" // Add 16 constant
2231 "1: \n" 2106 "1: \n"
2232 MEMACCESS(0) 2107 MEMACCESS(0)
2233 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. 2108 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
2234 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2109 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2235 "umull v16.8h, v1.8b, v4.8b \n" // B 2110 "umull v16.8h, v1.8b, v4.8b \n" // B
2236 "umlal v16.8h, v2.8b, v5.8b \n" // G 2111 "umlal v16.8h, v2.8b, v5.8b \n" // G
2237 "umlal v16.8h, v3.8b, v6.8b \n" // R 2112 "umlal v16.8h, v3.8b, v6.8b \n" // R
2238 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y 2113 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
2239 "uqadd v0.8b, v0.8b, v7.8b \n" 2114 "uqadd v0.8b, v0.8b, v7.8b \n"
2240 MEMACCESS(1) 2115 MEMACCESS(1)
2241 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 2116 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2242 "b.gt 1b \n" 2117 "b.gt 1b \n"
2243 : "+r"(src_rgba), // %0 2118 : "+r"(src_rgba), // %0
2244 "+r"(dst_y), // %1 2119 "+r"(dst_y), // %1
2245 "+r"(width) // %2 2120 "+r"(width) // %2
2246 : 2121 :
2247 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" 2122 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2248 ); 2123 );
2249 } 2124 }
2250 #endif // HAS_RGBATOYROW_NEON
2251 2125
2252 #ifdef HAS_RGB24TOYROW_NEON
2253 void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) { 2126 void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
2254 asm volatile ( 2127 asm volatile (
2255 "movi v4.8b, #13 \n" // B * 0.1016 coefficient 2128 "movi v4.8b, #13 \n" // B * 0.1016 coefficient
2256 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 2129 "movi v5.8b, #65 \n" // G * 0.5078 coefficient
2257 "movi v6.8b, #33 \n" // R * 0.2578 coefficient 2130 "movi v6.8b, #33 \n" // R * 0.2578 coefficient
2258 "movi v7.8b, #16 \n" // Add 16 constant 2131 "movi v7.8b, #16 \n" // Add 16 constant
2259 "1: \n" 2132 "1: \n"
2260 MEMACCESS(0) 2133 MEMACCESS(0)
2261 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. 2134 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
2262 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2135 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2263 "umull v16.8h, v0.8b, v4.8b \n" // B 2136 "umull v16.8h, v0.8b, v4.8b \n" // B
2264 "umlal v16.8h, v1.8b, v5.8b \n" // G 2137 "umlal v16.8h, v1.8b, v5.8b \n" // G
2265 "umlal v16.8h, v2.8b, v6.8b \n" // R 2138 "umlal v16.8h, v2.8b, v6.8b \n" // R
2266 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y 2139 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
2267 "uqadd v0.8b, v0.8b, v7.8b \n" 2140 "uqadd v0.8b, v0.8b, v7.8b \n"
2268 MEMACCESS(1) 2141 MEMACCESS(1)
2269 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 2142 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2270 "b.gt 1b \n" 2143 "b.gt 1b \n"
2271 : "+r"(src_rgb24), // %0 2144 : "+r"(src_rgb24), // %0
2272 "+r"(dst_y), // %1 2145 "+r"(dst_y), // %1
2273 "+r"(width) // %2 2146 "+r"(width) // %2
2274 : 2147 :
2275 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" 2148 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2276 ); 2149 );
2277 } 2150 }
2278 #endif // HAS_RGB24TOYROW_NEON
2279 2151
2280 #ifdef HAS_RAWTOYROW_NEON
2281 void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) { 2152 void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
2282 asm volatile ( 2153 asm volatile (
2283 "movi v4.8b, #33 \n" // R * 0.2578 coefficient 2154 "movi v4.8b, #33 \n" // R * 0.2578 coefficient
2284 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 2155 "movi v5.8b, #65 \n" // G * 0.5078 coefficient
2285 "movi v6.8b, #13 \n" // B * 0.1016 coefficient 2156 "movi v6.8b, #13 \n" // B * 0.1016 coefficient
2286 "movi v7.8b, #16 \n" // Add 16 constant 2157 "movi v7.8b, #16 \n" // Add 16 constant
2287 "1: \n" 2158 "1: \n"
2288 MEMACCESS(0) 2159 MEMACCESS(0)
2289 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. 2160 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
2290 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2161 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2291 "umull v16.8h, v0.8b, v4.8b \n" // R 2162 "umull v16.8h, v0.8b, v4.8b \n" // R
2292 "umlal v16.8h, v1.8b, v5.8b \n" // G 2163 "umlal v16.8h, v1.8b, v5.8b \n" // G
2293 "umlal v16.8h, v2.8b, v6.8b \n" // B 2164 "umlal v16.8h, v2.8b, v6.8b \n" // B
2294 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y 2165 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
2295 "uqadd v0.8b, v0.8b, v7.8b \n" 2166 "uqadd v0.8b, v0.8b, v7.8b \n"
2296 MEMACCESS(1) 2167 MEMACCESS(1)
2297 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 2168 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2298 "b.gt 1b \n" 2169 "b.gt 1b \n"
2299 : "+r"(src_raw), // %0 2170 : "+r"(src_raw), // %0
2300 "+r"(dst_y), // %1 2171 "+r"(dst_y), // %1
2301 "+r"(width) // %2 2172 "+r"(width) // %2
2302 : 2173 :
2303 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" 2174 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2304 ); 2175 );
2305 } 2176 }
2306 #endif // HAS_RAWTOYROW_NEON
2307 2177
2308 // Bilinear filter 16x2 -> 16x1 2178 // Bilinear filter 16x2 -> 16x1
2309 #ifdef HAS_INTERPOLATEROW_NEON
2310 void InterpolateRow_NEON(uint8* dst_ptr, 2179 void InterpolateRow_NEON(uint8* dst_ptr,
2311 const uint8* src_ptr, ptrdiff_t src_stride, 2180 const uint8* src_ptr, ptrdiff_t src_stride,
2312 int dst_width, int source_y_fraction) { 2181 int dst_width, int source_y_fraction) {
2313 int y1_fraction = source_y_fraction; 2182 int y1_fraction = source_y_fraction;
2314 int y0_fraction = 256 - y1_fraction; 2183 int y0_fraction = 256 - y1_fraction;
2315 const uint8* src_ptr1 = src_ptr + src_stride; 2184 const uint8* src_ptr1 = src_ptr + src_stride;
2316 asm volatile ( 2185 asm volatile (
2317 "cmp %w4, #0 \n" 2186 "cmp %w4, #0 \n"
2318 "b.eq 100f \n" 2187 "b.eq 100f \n"
2319 "cmp %w4, #128 \n" 2188 "cmp %w4, #128 \n"
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after
2365 : "+r"(dst_ptr), // %0 2234 : "+r"(dst_ptr), // %0
2366 "+r"(src_ptr), // %1 2235 "+r"(src_ptr), // %1
2367 "+r"(src_ptr1), // %2 2236 "+r"(src_ptr1), // %2
2368 "+r"(dst_width), // %3 2237 "+r"(dst_width), // %3
2369 "+r"(y1_fraction), // %4 2238 "+r"(y1_fraction), // %4
2370 "+r"(y0_fraction) // %5 2239 "+r"(y0_fraction) // %5
2371 : 2240 :
2372 : "cc", "memory", "v0", "v1", "v3", "v4", "v5" 2241 : "cc", "memory", "v0", "v1", "v3", "v4", "v5"
2373 ); 2242 );
2374 } 2243 }
2375 #endif // HAS_INTERPOLATEROW_NEON
2376 2244
2377 // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr 2245 // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
2378 #ifdef HAS_ARGBBLENDROW_NEON
2379 void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, 2246 void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
2380 uint8* dst_argb, int width) { 2247 uint8* dst_argb, int width) {
2381 asm volatile ( 2248 asm volatile (
2382 "subs %w3, %w3, #8 \n" 2249 "subs %w3, %w3, #8 \n"
2383 "b.lt 89f \n" 2250 "b.lt 89f \n"
2384 // Blend 8 pixels. 2251 // Blend 8 pixels.
2385 "8: \n" 2252 "8: \n"
2386 MEMACCESS(0) 2253 MEMACCESS(0)
2387 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels 2254 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels
2388 MEMACCESS(1) 2255 MEMACCESS(1)
(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after
2437 2304
2438 : "+r"(src_argb0), // %0 2305 : "+r"(src_argb0), // %0
2439 "+r"(src_argb1), // %1 2306 "+r"(src_argb1), // %1
2440 "+r"(dst_argb), // %2 2307 "+r"(dst_argb), // %2
2441 "+r"(width) // %3 2308 "+r"(width) // %3
2442 : 2309 :
2443 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 2310 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2444 "v16", "v17", "v18" 2311 "v16", "v17", "v18"
2445 ); 2312 );
2446 } 2313 }
2447 #endif // HAS_ARGBBLENDROW_NEON
2448 2314
2449 // Attenuate 8 pixels at a time. 2315 // Attenuate 8 pixels at a time.
2450 #ifdef HAS_ARGBATTENUATEROW_NEON
2451 void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { 2316 void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
2452 asm volatile ( 2317 asm volatile (
2453 // Attenuate 8 pixels. 2318 // Attenuate 8 pixels.
2454 "1: \n" 2319 "1: \n"
2455 MEMACCESS(0) 2320 MEMACCESS(0)
2456 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels 2321 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels
2457 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2322 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2458 "umull v4.8h, v0.8b, v3.8b \n" // b * a 2323 "umull v4.8h, v0.8b, v3.8b \n" // b * a
2459 "umull v5.8h, v1.8b, v3.8b \n" // g * a 2324 "umull v5.8h, v1.8b, v3.8b \n" // g * a
2460 "umull v6.8h, v2.8b, v3.8b \n" // r * a 2325 "umull v6.8h, v2.8b, v3.8b \n" // r * a
2461 "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 2326 "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8
2462 "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 2327 "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8
2463 "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 2328 "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
2464 MEMACCESS(1) 2329 MEMACCESS(1)
2465 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels 2330 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
2466 "b.gt 1b \n" 2331 "b.gt 1b \n"
2467 : "+r"(src_argb), // %0 2332 : "+r"(src_argb), // %0
2468 "+r"(dst_argb), // %1 2333 "+r"(dst_argb), // %1
2469 "+r"(width) // %2 2334 "+r"(width) // %2
2470 : 2335 :
2471 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" 2336 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
2472 ); 2337 );
2473 } 2338 }
2474 #endif // HAS_ARGBATTENUATEROW_NEON
2475 2339
2476 // Quantize 8 ARGB pixels (32 bytes). 2340 // Quantize 8 ARGB pixels (32 bytes).
2477 // dst = (dst * scale >> 16) * interval_size + interval_offset; 2341 // dst = (dst * scale >> 16) * interval_size + interval_offset;
2478 #ifdef HAS_ARGBQUANTIZEROW_NEON
2479 void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, 2342 void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
2480 int interval_offset, int width) { 2343 int interval_offset, int width) {
2481 asm volatile ( 2344 asm volatile (
2482 "dup v4.8h, %w2 \n" 2345 "dup v4.8h, %w2 \n"
2483 "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 2346 "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1
2484 "dup v5.8h, %w3 \n" // interval multiply. 2347 "dup v5.8h, %w3 \n" // interval multiply.
2485 "dup v6.8h, %w4 \n" // interval add 2348 "dup v6.8h, %w4 \n" // interval add
2486 2349
2487 // 8 pixel loop. 2350 // 8 pixel loop.
2488 "1: \n" 2351 "1: \n"
(...skipping 19 matching lines...) Expand all
2508 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB pixels 2371 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB pixels
2509 "b.gt 1b \n" 2372 "b.gt 1b \n"
2510 : "+r"(dst_argb), // %0 2373 : "+r"(dst_argb), // %0
2511 "+r"(width) // %1 2374 "+r"(width) // %1
2512 : "r"(scale), // %2 2375 : "r"(scale), // %2
2513 "r"(interval_size), // %3 2376 "r"(interval_size), // %3
2514 "r"(interval_offset) // %4 2377 "r"(interval_offset) // %4
2515 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" 2378 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
2516 ); 2379 );
2517 } 2380 }
2518 #endif // HAS_ARGBQUANTIZEROW_NEON
2519 2381
2520 // Shade 8 pixels at a time by specified value. 2382 // Shade 8 pixels at a time by specified value.
2521 // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8. 2383 // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
2522 // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. 2384 // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
2523 #ifdef HAS_ARGBSHADEROW_NEON
2524 void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, 2385 void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
2525 uint32 value) { 2386 uint32 value) {
2526 asm volatile ( 2387 asm volatile (
2527 "dup v0.4s, %w3 \n" // duplicate scale value. 2388 "dup v0.4s, %w3 \n" // duplicate scale value.
2528 "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb. 2389 "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb.
2529 "ushr v0.8h, v0.8h, #1 \n" // scale / 2. 2390 "ushr v0.8h, v0.8h, #1 \n" // scale / 2.
2530 2391
2531 // 8 pixel loop. 2392 // 8 pixel loop.
2532 "1: \n" 2393 "1: \n"
2533 MEMACCESS(0) 2394 MEMACCESS(0)
(...skipping 14 matching lines...) Expand all
2548 MEMACCESS(1) 2409 MEMACCESS(1)
2549 "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB pixels 2410 "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB pixels
2550 "b.gt 1b \n" 2411 "b.gt 1b \n"
2551 : "+r"(src_argb), // %0 2412 : "+r"(src_argb), // %0
2552 "+r"(dst_argb), // %1 2413 "+r"(dst_argb), // %1
2553 "+r"(width) // %2 2414 "+r"(width) // %2
2554 : "r"(value) // %3 2415 : "r"(value) // %3
2555 : "cc", "memory", "v0", "v4", "v5", "v6", "v7" 2416 : "cc", "memory", "v0", "v4", "v5", "v6", "v7"
2556 ); 2417 );
2557 } 2418 }
2558 #endif // HAS_ARGBSHADEROW_NEON
2559 2419
2560 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels 2420 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
2561 // Similar to ARGBToYJ but stores ARGB. 2421 // Similar to ARGBToYJ but stores ARGB.
2562 // C code is (15 * b + 75 * g + 38 * r + 64) >> 7; 2422 // C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
2563 #ifdef HAS_ARGBGRAYROW_NEON
2564 void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { 2423 void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
2565 asm volatile ( 2424 asm volatile (
2566 "movi v24.8b, #15 \n" // B * 0.11400 coefficient 2425 "movi v24.8b, #15 \n" // B * 0.11400 coefficient
2567 "movi v25.8b, #75 \n" // G * 0.58700 coefficient 2426 "movi v25.8b, #75 \n" // G * 0.58700 coefficient
2568 "movi v26.8b, #38 \n" // R * 0.29900 coefficient 2427 "movi v26.8b, #38 \n" // R * 0.29900 coefficient
2569 "1: \n" 2428 "1: \n"
2570 MEMACCESS(0) 2429 MEMACCESS(0)
2571 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 2430 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
2572 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2431 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2573 "umull v4.8h, v0.8b, v24.8b \n" // B 2432 "umull v4.8h, v0.8b, v24.8b \n" // B
2574 "umlal v4.8h, v1.8b, v25.8b \n" // G 2433 "umlal v4.8h, v1.8b, v25.8b \n" // G
2575 "umlal v4.8h, v2.8b, v26.8b \n" // R 2434 "umlal v4.8h, v2.8b, v26.8b \n" // R
2576 "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B 2435 "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B
2577 "orr v1.8b, v0.8b, v0.8b \n" // G 2436 "orr v1.8b, v0.8b, v0.8b \n" // G
2578 "orr v2.8b, v0.8b, v0.8b \n" // R 2437 "orr v2.8b, v0.8b, v0.8b \n" // R
2579 MEMACCESS(1) 2438 MEMACCESS(1)
2580 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels. 2439 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
2581 "b.gt 1b \n" 2440 "b.gt 1b \n"
2582 : "+r"(src_argb), // %0 2441 : "+r"(src_argb), // %0
2583 "+r"(dst_argb), // %1 2442 "+r"(dst_argb), // %1
2584 "+r"(width) // %2 2443 "+r"(width) // %2
2585 : 2444 :
2586 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26" 2445 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"
2587 ); 2446 );
2588 } 2447 }
2589 #endif // HAS_ARGBGRAYROW_NEON
2590 2448
2591 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. 2449 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
2592 // b = (r * 35 + g * 68 + b * 17) >> 7 2450 // b = (r * 35 + g * 68 + b * 17) >> 7
2593 // g = (r * 45 + g * 88 + b * 22) >> 7 2451 // g = (r * 45 + g * 88 + b * 22) >> 7
2594 // r = (r * 50 + g * 98 + b * 24) >> 7 2452 // r = (r * 50 + g * 98 + b * 24) >> 7
2595 2453
2596 #ifdef HAS_ARGBSEPIAROW_NEON
2597 void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { 2454 void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
2598 asm volatile ( 2455 asm volatile (
2599 "movi v20.8b, #17 \n" // BB coefficient 2456 "movi v20.8b, #17 \n" // BB coefficient
2600 "movi v21.8b, #68 \n" // BG coefficient 2457 "movi v21.8b, #68 \n" // BG coefficient
2601 "movi v22.8b, #35 \n" // BR coefficient 2458 "movi v22.8b, #35 \n" // BR coefficient
2602 "movi v24.8b, #22 \n" // GB coefficient 2459 "movi v24.8b, #22 \n" // GB coefficient
2603 "movi v25.8b, #88 \n" // GG coefficient 2460 "movi v25.8b, #88 \n" // GG coefficient
2604 "movi v26.8b, #45 \n" // GR coefficient 2461 "movi v26.8b, #45 \n" // GR coefficient
2605 "movi v28.8b, #24 \n" // RB coefficient 2462 "movi v28.8b, #24 \n" // RB coefficient
2606 "movi v29.8b, #98 \n" // RG coefficient 2463 "movi v29.8b, #98 \n" // RG coefficient
(...skipping 17 matching lines...) Expand all
2624 MEMACCESS(0) 2481 MEMACCESS(0)
2625 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels. 2482 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
2626 "b.gt 1b \n" 2483 "b.gt 1b \n"
2627 : "+r"(dst_argb), // %0 2484 : "+r"(dst_argb), // %0
2628 "+r"(width) // %1 2485 "+r"(width) // %1
2629 : 2486 :
2630 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 2487 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2631 "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30" 2488 "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"
2632 ); 2489 );
2633 } 2490 }
2634 #endif // HAS_ARGBSEPIAROW_NEON
2635 2491
2636 // Transform 8 ARGB pixels (32 bytes) with color matrix. 2492 // Transform 8 ARGB pixels (32 bytes) with color matrix.
2637 // TODO(fbarchard): Was same as Sepia except matrix is provided. This function 2493 // TODO(fbarchard): Was same as Sepia except matrix is provided. This function
2638 // needs to saturate. Consider doing a non-saturating version. 2494 // needs to saturate. Consider doing a non-saturating version.
2639 #ifdef HAS_ARGBCOLORMATRIXROW_NEON
2640 void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, 2495 void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
2641 const int8* matrix_argb, int width) { 2496 const int8* matrix_argb, int width) {
2642 asm volatile ( 2497 asm volatile (
2643 MEMACCESS(3) 2498 MEMACCESS(3)
2644 "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors. 2499 "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors.
2645 "sxtl v0.8h, v2.8b \n" // B,G coefficients s16. 2500 "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
2646 "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. 2501 "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
2647 2502
2648 "1: \n" 2503 "1: \n"
2649 MEMACCESS(0) 2504 MEMACCESS(0)
(...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after
2689 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 pixels. 2544 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 pixels.
2690 "b.gt 1b \n" 2545 "b.gt 1b \n"
2691 : "+r"(src_argb), // %0 2546 : "+r"(src_argb), // %0
2692 "+r"(dst_argb), // %1 2547 "+r"(dst_argb), // %1
2693 "+r"(width) // %2 2548 "+r"(width) // %2
2694 : "r"(matrix_argb) // %3 2549 : "r"(matrix_argb) // %3
2695 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17" , 2550 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17" ,
2696 "v18", "v19", "v22", "v23", "v24", "v25" 2551 "v18", "v19", "v22", "v23", "v24", "v25"
2697 ); 2552 );
2698 } 2553 }
2699 #endif // HAS_ARGBCOLORMATRIXROW_NEON
2700 2554
2701 // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. 2555 // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
2702 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. 2556 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
2703 #ifdef HAS_ARGBMULTIPLYROW_NEON
2704 void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, 2557 void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
2705 uint8* dst_argb, int width) { 2558 uint8* dst_argb, int width) {
2706 asm volatile ( 2559 asm volatile (
2707 // 8 pixel loop. 2560 // 8 pixel loop.
2708 "1: \n" 2561 "1: \n"
2709 MEMACCESS(0) 2562 MEMACCESS(0)
2710 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 2563 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
2711 MEMACCESS(1) 2564 MEMACCESS(1)
2712 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. 2565 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
2713 "subs %w3, %w3, #8 \n" // 8 processed per loop. 2566 "subs %w3, %w3, #8 \n" // 8 processed per loop.
(...skipping 10 matching lines...) Expand all
2724 "b.gt 1b \n" 2577 "b.gt 1b \n"
2725 2578
2726 : "+r"(src_argb0), // %0 2579 : "+r"(src_argb0), // %0
2727 "+r"(src_argb1), // %1 2580 "+r"(src_argb1), // %1
2728 "+r"(dst_argb), // %2 2581 "+r"(dst_argb), // %2
2729 "+r"(width) // %3 2582 "+r"(width) // %3
2730 : 2583 :
2731 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" 2584 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
2732 ); 2585 );
2733 } 2586 }
2734 #endif // HAS_ARGBMULTIPLYROW_NEON
2735 2587
2736 // Add 2 rows of ARGB pixels together, 8 pixels at a time. 2588 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
2737 #ifdef HAS_ARGBADDROW_NEON
2738 void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, 2589 void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
2739 uint8* dst_argb, int width) { 2590 uint8* dst_argb, int width) {
2740 asm volatile ( 2591 asm volatile (
2741 // 8 pixel loop. 2592 // 8 pixel loop.
2742 "1: \n" 2593 "1: \n"
2743 MEMACCESS(0) 2594 MEMACCESS(0)
2744 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 2595 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
2745 MEMACCESS(1) 2596 MEMACCESS(1)
2746 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. 2597 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
2747 "subs %w3, %w3, #8 \n" // 8 processed per loop. 2598 "subs %w3, %w3, #8 \n" // 8 processed per loop.
2748 "uqadd v0.8b, v0.8b, v4.8b \n" 2599 "uqadd v0.8b, v0.8b, v4.8b \n"
2749 "uqadd v1.8b, v1.8b, v5.8b \n" 2600 "uqadd v1.8b, v1.8b, v5.8b \n"
2750 "uqadd v2.8b, v2.8b, v6.8b \n" 2601 "uqadd v2.8b, v2.8b, v6.8b \n"
2751 "uqadd v3.8b, v3.8b, v7.8b \n" 2602 "uqadd v3.8b, v3.8b, v7.8b \n"
2752 MEMACCESS(2) 2603 MEMACCESS(2)
2753 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels 2604 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
2754 "b.gt 1b \n" 2605 "b.gt 1b \n"
2755 2606
2756 : "+r"(src_argb0), // %0 2607 : "+r"(src_argb0), // %0
2757 "+r"(src_argb1), // %1 2608 "+r"(src_argb1), // %1
2758 "+r"(dst_argb), // %2 2609 "+r"(dst_argb), // %2
2759 "+r"(width) // %3 2610 "+r"(width) // %3
2760 : 2611 :
2761 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" 2612 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
2762 ); 2613 );
2763 } 2614 }
2764 #endif // HAS_ARGBADDROW_NEON
2765 2615
2766 // Subtract 2 rows of ARGB pixels, 8 pixels at a time. 2616 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
2767 #ifdef HAS_ARGBSUBTRACTROW_NEON
2768 void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, 2617 void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
2769 uint8* dst_argb, int width) { 2618 uint8* dst_argb, int width) {
2770 asm volatile ( 2619 asm volatile (
2771 // 8 pixel loop. 2620 // 8 pixel loop.
2772 "1: \n" 2621 "1: \n"
2773 MEMACCESS(0) 2622 MEMACCESS(0)
2774 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 2623 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
2775 MEMACCESS(1) 2624 MEMACCESS(1)
2776 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. 2625 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
2777 "subs %w3, %w3, #8 \n" // 8 processed per loop. 2626 "subs %w3, %w3, #8 \n" // 8 processed per loop.
2778 "uqsub v0.8b, v0.8b, v4.8b \n" 2627 "uqsub v0.8b, v0.8b, v4.8b \n"
2779 "uqsub v1.8b, v1.8b, v5.8b \n" 2628 "uqsub v1.8b, v1.8b, v5.8b \n"
2780 "uqsub v2.8b, v2.8b, v6.8b \n" 2629 "uqsub v2.8b, v2.8b, v6.8b \n"
2781 "uqsub v3.8b, v3.8b, v7.8b \n" 2630 "uqsub v3.8b, v3.8b, v7.8b \n"
2782 MEMACCESS(2) 2631 MEMACCESS(2)
2783 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels 2632 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
2784 "b.gt 1b \n" 2633 "b.gt 1b \n"
2785 2634
2786 : "+r"(src_argb0), // %0 2635 : "+r"(src_argb0), // %0
2787 "+r"(src_argb1), // %1 2636 "+r"(src_argb1), // %1
2788 "+r"(dst_argb), // %2 2637 "+r"(dst_argb), // %2
2789 "+r"(width) // %3 2638 "+r"(width) // %3
2790 : 2639 :
2791 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" 2640 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
2792 ); 2641 );
2793 } 2642 }
2794 #endif // HAS_ARGBSUBTRACTROW_NEON
2795 2643
2796 // Adds Sobel X and Sobel Y and stores Sobel into ARGB. 2644 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
2797 // A = 255 2645 // A = 255
2798 // R = Sobel 2646 // R = Sobel
2799 // G = Sobel 2647 // G = Sobel
2800 // B = Sobel 2648 // B = Sobel
2801 #ifdef HAS_SOBELROW_NEON
2802 void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, 2649 void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
2803 uint8* dst_argb, int width) { 2650 uint8* dst_argb, int width) {
2804 asm volatile ( 2651 asm volatile (
2805 "movi v3.8b, #255 \n" // alpha 2652 "movi v3.8b, #255 \n" // alpha
2806 // 8 pixel loop. 2653 // 8 pixel loop.
2807 "1: \n" 2654 "1: \n"
2808 MEMACCESS(0) 2655 MEMACCESS(0)
2809 "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. 2656 "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
2810 MEMACCESS(1) 2657 MEMACCESS(1)
2811 "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. 2658 "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
2812 "subs %w3, %w3, #8 \n" // 8 processed per loop. 2659 "subs %w3, %w3, #8 \n" // 8 processed per loop.
2813 "uqadd v0.8b, v0.8b, v1.8b \n" // add 2660 "uqadd v0.8b, v0.8b, v1.8b \n" // add
2814 "orr v1.8b, v0.8b, v0.8b \n" 2661 "orr v1.8b, v0.8b, v0.8b \n"
2815 "orr v2.8b, v0.8b, v0.8b \n" 2662 "orr v2.8b, v0.8b, v0.8b \n"
2816 MEMACCESS(2) 2663 MEMACCESS(2)
2817 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels 2664 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
2818 "b.gt 1b \n" 2665 "b.gt 1b \n"
2819 : "+r"(src_sobelx), // %0 2666 : "+r"(src_sobelx), // %0
2820 "+r"(src_sobely), // %1 2667 "+r"(src_sobely), // %1
2821 "+r"(dst_argb), // %2 2668 "+r"(dst_argb), // %2
2822 "+r"(width) // %3 2669 "+r"(width) // %3
2823 : 2670 :
2824 : "cc", "memory", "v0", "v1", "v2", "v3" 2671 : "cc", "memory", "v0", "v1", "v2", "v3"
2825 ); 2672 );
2826 } 2673 }
2827 #endif // HAS_SOBELROW_NEON
2828 2674
2829 // Adds Sobel X and Sobel Y and stores Sobel into plane. 2675 // Adds Sobel X and Sobel Y and stores Sobel into plane.
2830 #ifdef HAS_SOBELTOPLANEROW_NEON
2831 void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, 2676 void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
2832 uint8* dst_y, int width) { 2677 uint8* dst_y, int width) {
2833 asm volatile ( 2678 asm volatile (
2834 // 16 pixel loop. 2679 // 16 pixel loop.
2835 "1: \n" 2680 "1: \n"
2836 MEMACCESS(0) 2681 MEMACCESS(0)
2837 "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. 2682 "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
2838 MEMACCESS(1) 2683 MEMACCESS(1)
2839 "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. 2684 "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
2840 "subs %w3, %w3, #16 \n" // 16 processed per loop. 2685 "subs %w3, %w3, #16 \n" // 16 processed per loop.
2841 "uqadd v0.16b, v0.16b, v1.16b \n" // add 2686 "uqadd v0.16b, v0.16b, v1.16b \n" // add
2842 MEMACCESS(2) 2687 MEMACCESS(2)
2843 "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. 2688 "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
2844 "b.gt 1b \n" 2689 "b.gt 1b \n"
2845 : "+r"(src_sobelx), // %0 2690 : "+r"(src_sobelx), // %0
2846 "+r"(src_sobely), // %1 2691 "+r"(src_sobely), // %1
2847 "+r"(dst_y), // %2 2692 "+r"(dst_y), // %2
2848 "+r"(width) // %3 2693 "+r"(width) // %3
2849 : 2694 :
2850 : "cc", "memory", "v0", "v1" 2695 : "cc", "memory", "v0", "v1"
2851 ); 2696 );
2852 } 2697 }
2853 #endif // HAS_SOBELTOPLANEROW_NEON
2854 2698
2855 // Mixes Sobel X, Sobel Y and Sobel into ARGB. 2699 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
2856 // A = 255 2700 // A = 255
2857 // R = Sobel X 2701 // R = Sobel X
2858 // G = Sobel 2702 // G = Sobel
2859 // B = Sobel Y 2703 // B = Sobel Y
2860 #ifdef HAS_SOBELXYROW_NEON
2861 void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, 2704 void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
2862 uint8* dst_argb, int width) { 2705 uint8* dst_argb, int width) {
2863 asm volatile ( 2706 asm volatile (
2864 "movi v3.8b, #255 \n" // alpha 2707 "movi v3.8b, #255 \n" // alpha
2865 // 8 pixel loop. 2708 // 8 pixel loop.
2866 "1: \n" 2709 "1: \n"
2867 MEMACCESS(0) 2710 MEMACCESS(0)
2868 "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. 2711 "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
2869 MEMACCESS(1) 2712 MEMACCESS(1)
2870 "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. 2713 "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
2871 "subs %w3, %w3, #8 \n" // 8 processed per loop. 2714 "subs %w3, %w3, #8 \n" // 8 processed per loop.
2872 "uqadd v1.8b, v0.8b, v2.8b \n" // add 2715 "uqadd v1.8b, v0.8b, v2.8b \n" // add
2873 MEMACCESS(2) 2716 MEMACCESS(2)
2874 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels 2717 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
2875 "b.gt 1b \n" 2718 "b.gt 1b \n"
2876 : "+r"(src_sobelx), // %0 2719 : "+r"(src_sobelx), // %0
2877 "+r"(src_sobely), // %1 2720 "+r"(src_sobely), // %1
2878 "+r"(dst_argb), // %2 2721 "+r"(dst_argb), // %2
2879 "+r"(width) // %3 2722 "+r"(width) // %3
2880 : 2723 :
2881 : "cc", "memory", "v0", "v1", "v2", "v3" 2724 : "cc", "memory", "v0", "v1", "v2", "v3"
2882 ); 2725 );
2883 } 2726 }
2884 #endif // HAS_SOBELXYROW_NEON
2885 2727
2886 // SobelX as a matrix is 2728 // SobelX as a matrix is
2887 // -1 0 1 2729 // -1 0 1
2888 // -2 0 2 2730 // -2 0 2
2889 // -1 0 1 2731 // -1 0 1
2890 #ifdef HAS_SOBELXROW_NEON
2891 void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, 2732 void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
2892 const uint8* src_y2, uint8* dst_sobelx, int width) { 2733 const uint8* src_y2, uint8* dst_sobelx, int width) {
2893 asm volatile ( 2734 asm volatile (
2894 "1: \n" 2735 "1: \n"
2895 MEMACCESS(0) 2736 MEMACCESS(0)
2896 "ld1 {v0.8b}, [%0],%5 \n" // top 2737 "ld1 {v0.8b}, [%0],%5 \n" // top
2897 MEMACCESS(0) 2738 MEMACCESS(0)
2898 "ld1 {v1.8b}, [%0],%6 \n" 2739 "ld1 {v1.8b}, [%0],%6 \n"
2899 "usubl v0.8h, v0.8b, v1.8b \n" 2740 "usubl v0.8h, v0.8b, v1.8b \n"
2900 MEMACCESS(1) 2741 MEMACCESS(1)
(...skipping 18 matching lines...) Expand all
2919 : "+r"(src_y0), // %0 2760 : "+r"(src_y0), // %0
2920 "+r"(src_y1), // %1 2761 "+r"(src_y1), // %1
2921 "+r"(src_y2), // %2 2762 "+r"(src_y2), // %2
2922 "+r"(dst_sobelx), // %3 2763 "+r"(dst_sobelx), // %3
2923 "+r"(width) // %4 2764 "+r"(width) // %4
2924 : "r"(2LL), // %5 2765 : "r"(2LL), // %5
2925 "r"(6LL) // %6 2766 "r"(6LL) // %6
2926 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 2767 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
2927 ); 2768 );
2928 } 2769 }
2929 #endif // HAS_SOBELXROW_NEON
2930 2770
2931 // SobelY as a matrix is 2771 // SobelY as a matrix is
2932 // -1 -2 -1 2772 // -1 -2 -1
2933 // 0 0 0 2773 // 0 0 0
2934 // 1 2 1 2774 // 1 2 1
2935 #ifdef HAS_SOBELYROW_NEON
2936 void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, 2775 void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
2937 uint8* dst_sobely, int width) { 2776 uint8* dst_sobely, int width) {
2938 asm volatile ( 2777 asm volatile (
2939 "1: \n" 2778 "1: \n"
2940 MEMACCESS(0) 2779 MEMACCESS(0)
2941 "ld1 {v0.8b}, [%0],%4 \n" // left 2780 "ld1 {v0.8b}, [%0],%4 \n" // left
2942 MEMACCESS(1) 2781 MEMACCESS(1)
2943 "ld1 {v1.8b}, [%1],%4 \n" 2782 "ld1 {v1.8b}, [%1],%4 \n"
2944 "usubl v0.8h, v0.8b, v1.8b \n" 2783 "usubl v0.8h, v0.8b, v1.8b \n"
2945 MEMACCESS(0) 2784 MEMACCESS(0)
(...skipping 17 matching lines...) Expand all
2963 "b.gt 1b \n" 2802 "b.gt 1b \n"
2964 : "+r"(src_y0), // %0 2803 : "+r"(src_y0), // %0
2965 "+r"(src_y1), // %1 2804 "+r"(src_y1), // %1
2966 "+r"(dst_sobely), // %2 2805 "+r"(dst_sobely), // %2
2967 "+r"(width) // %3 2806 "+r"(width) // %3
2968 : "r"(1LL), // %4 2807 : "r"(1LL), // %4
2969 "r"(6LL) // %5 2808 "r"(6LL) // %5
2970 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 2809 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
2971 ); 2810 );
2972 } 2811 }
2973 #endif // HAS_SOBELYROW_NEON
2974 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) 2812 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
2975 2813
2976 #ifdef __cplusplus 2814 #ifdef __cplusplus
2977 } // extern "C" 2815 } // extern "C"
2978 } // namespace libyuv 2816 } // namespace libyuv
2979 #endif 2817 #endif
OLDNEW
« no previous file with comments | « source/row_neon.cc ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698