Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(20)

Side by Side Diff: source/libvpx/third_party/libyuv/source/row_neon64.cc

Issue 1302353004: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 * Copyright 2014 The LibYuv Project Authors. All rights reserved. 2 * Copyright 2014 The LibYuv Project Authors. All rights reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 160 matching lines...) Expand 10 before | Expand all | Expand 10 after
171 void I444ToARGBRow_NEON(const uint8* src_y, 171 void I444ToARGBRow_NEON(const uint8* src_y,
172 const uint8* src_u, 172 const uint8* src_u,
173 const uint8* src_v, 173 const uint8* src_v,
174 uint8* dst_argb, 174 uint8* dst_argb,
175 int width) { 175 int width) {
176 asm volatile ( 176 asm volatile (
177 YUV422TORGB_SETUP_REG 177 YUV422TORGB_SETUP_REG
178 "1: \n" 178 "1: \n"
179 READYUV444 179 READYUV444
180 YUV422TORGB(v22, v21, v20) 180 YUV422TORGB(v22, v21, v20)
181 "subs %4, %4, #8 \n" 181 "subs %w4, %w4, #8 \n"
182 "movi v23.8b, #255 \n" /* A */ 182 "movi v23.8b, #255 \n" /* A */
183 MEMACCESS(3) 183 MEMACCESS(3)
184 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" 184 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
185 "b.gt 1b \n" 185 "b.gt 1b \n"
186 : "+r"(src_y), // %0 186 : "+r"(src_y), // %0
187 "+r"(src_u), // %1 187 "+r"(src_u), // %1
188 "+r"(src_v), // %2 188 "+r"(src_v), // %2
189 "+r"(dst_argb), // %3 189 "+r"(dst_argb), // %3
190 "+r"(width) // %4 190 "+r"(width) // %4
191 : [kUVBiasBGR]"r"(&kUVBiasBGR), 191 : [kUVBiasBGR]"r"(&kUVBiasBGR),
192 [kYToRgb]"r"(&kYToRgb) 192 [kYToRgb]"r"(&kYToRgb)
193 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 193 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
194 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 194 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
195 ); 195 );
196 } 196 }
197 #endif // HAS_I444TOARGBROW_NEON 197 #endif // HAS_I444TOARGBROW_NEON
198 198
199 #ifdef HAS_I422TOARGBROW_NEON 199 #ifdef HAS_I422TOARGBROW_NEON
200 void I422ToARGBRow_NEON(const uint8* src_y, 200 void I422ToARGBRow_NEON(const uint8* src_y,
201 const uint8* src_u, 201 const uint8* src_u,
202 const uint8* src_v, 202 const uint8* src_v,
203 uint8* dst_argb, 203 uint8* dst_argb,
204 int width) { 204 int width) {
205 asm volatile ( 205 asm volatile (
206 YUV422TORGB_SETUP_REG 206 YUV422TORGB_SETUP_REG
207 "1: \n" 207 "1: \n"
208 READYUV422 208 READYUV422
209 YUV422TORGB(v22, v21, v20) 209 YUV422TORGB(v22, v21, v20)
210 "subs %4, %4, #8 \n" 210 "subs %w4, %w4, #8 \n"
211 "movi v23.8b, #255 \n" /* A */ 211 "movi v23.8b, #255 \n" /* A */
212 MEMACCESS(3) 212 MEMACCESS(3)
213 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" 213 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
214 "b.gt 1b \n" 214 "b.gt 1b \n"
215 : "+r"(src_y), // %0 215 : "+r"(src_y), // %0
216 "+r"(src_u), // %1 216 "+r"(src_u), // %1
217 "+r"(src_v), // %2 217 "+r"(src_v), // %2
218 "+r"(dst_argb), // %3 218 "+r"(dst_argb), // %3
219 "+r"(width) // %4 219 "+r"(width) // %4
220 : [kUVBiasBGR]"r"(&kUVBiasBGR), 220 : [kUVBiasBGR]"r"(&kUVBiasBGR),
221 [kYToRgb]"r"(&kYToRgb) 221 [kYToRgb]"r"(&kYToRgb)
222 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 222 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
223 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 223 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
224 ); 224 );
225 } 225 }
226 #endif // HAS_I422TOARGBROW_NEON 226 #endif // HAS_I422TOARGBROW_NEON
227 227
228 #ifdef HAS_I411TOARGBROW_NEON 228 #ifdef HAS_I411TOARGBROW_NEON
229 void I411ToARGBRow_NEON(const uint8* src_y, 229 void I411ToARGBRow_NEON(const uint8* src_y,
230 const uint8* src_u, 230 const uint8* src_u,
231 const uint8* src_v, 231 const uint8* src_v,
232 uint8* dst_argb, 232 uint8* dst_argb,
233 int width) { 233 int width) {
234 asm volatile ( 234 asm volatile (
235 YUV422TORGB_SETUP_REG 235 YUV422TORGB_SETUP_REG
236 "1: \n" 236 "1: \n"
237 READYUV411 237 READYUV411
238 YUV422TORGB(v22, v21, v20) 238 YUV422TORGB(v22, v21, v20)
239 "subs %4, %4, #8 \n" 239 "subs %w4, %w4, #8 \n"
240 "movi v23.8b, #255 \n" /* A */ 240 "movi v23.8b, #255 \n" /* A */
241 MEMACCESS(3) 241 MEMACCESS(3)
242 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" 242 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
243 "b.gt 1b \n" 243 "b.gt 1b \n"
244 : "+r"(src_y), // %0 244 : "+r"(src_y), // %0
245 "+r"(src_u), // %1 245 "+r"(src_u), // %1
246 "+r"(src_v), // %2 246 "+r"(src_v), // %2
247 "+r"(dst_argb), // %3 247 "+r"(dst_argb), // %3
248 "+r"(width) // %4 248 "+r"(width) // %4
249 : [kUVBiasBGR]"r"(&kUVBiasBGR), 249 : [kUVBiasBGR]"r"(&kUVBiasBGR),
250 [kYToRgb]"r"(&kYToRgb) 250 [kYToRgb]"r"(&kYToRgb)
251 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 251 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
252 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 252 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
253 ); 253 );
254 } 254 }
255 #endif // HAS_I411TOARGBROW_NEON 255 #endif // HAS_I411TOARGBROW_NEON
256 256
257 #ifdef HAS_I422TOBGRAROW_NEON 257 #ifdef HAS_I422TOBGRAROW_NEON
258 void I422ToBGRARow_NEON(const uint8* src_y, 258 void I422ToBGRARow_NEON(const uint8* src_y,
259 const uint8* src_u, 259 const uint8* src_u,
260 const uint8* src_v, 260 const uint8* src_v,
261 uint8* dst_bgra, 261 uint8* dst_bgra,
262 int width) { 262 int width) {
263 asm volatile ( 263 asm volatile (
264 YUV422TORGB_SETUP_REG 264 YUV422TORGB_SETUP_REG
265 "1: \n" 265 "1: \n"
266 READYUV422 266 READYUV422
267 YUV422TORGB(v21, v22, v23) 267 YUV422TORGB(v21, v22, v23)
268 "subs %4, %4, #8 \n" 268 "subs %w4, %w4, #8 \n"
269 "movi v20.8b, #255 \n" /* A */ 269 "movi v20.8b, #255 \n" /* A */
270 MEMACCESS(3) 270 MEMACCESS(3)
271 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" 271 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
272 "b.gt 1b \n" 272 "b.gt 1b \n"
273 : "+r"(src_y), // %0 273 : "+r"(src_y), // %0
274 "+r"(src_u), // %1 274 "+r"(src_u), // %1
275 "+r"(src_v), // %2 275 "+r"(src_v), // %2
276 "+r"(dst_bgra), // %3 276 "+r"(dst_bgra), // %3
277 "+r"(width) // %4 277 "+r"(width) // %4
278 : [kUVBiasBGR]"r"(&kUVBiasBGR), 278 : [kUVBiasBGR]"r"(&kUVBiasBGR),
279 [kYToRgb]"r"(&kYToRgb) 279 [kYToRgb]"r"(&kYToRgb)
280 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 280 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
281 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 281 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
282 ); 282 );
283 } 283 }
284 #endif // HAS_I422TOBGRAROW_NEON 284 #endif // HAS_I422TOBGRAROW_NEON
285 285
286 #ifdef HAS_I422TOABGRROW_NEON 286 #ifdef HAS_I422TOABGRROW_NEON
287 void I422ToABGRRow_NEON(const uint8* src_y, 287 void I422ToABGRRow_NEON(const uint8* src_y,
288 const uint8* src_u, 288 const uint8* src_u,
289 const uint8* src_v, 289 const uint8* src_v,
290 uint8* dst_abgr, 290 uint8* dst_abgr,
291 int width) { 291 int width) {
292 asm volatile ( 292 asm volatile (
293 YUV422TORGB_SETUP_REG 293 YUV422TORGB_SETUP_REG
294 "1: \n" 294 "1: \n"
295 READYUV422 295 READYUV422
296 YUV422TORGB(v20, v21, v22) 296 YUV422TORGB(v20, v21, v22)
297 "subs %4, %4, #8 \n" 297 "subs %w4, %w4, #8 \n"
298 "movi v23.8b, #255 \n" /* A */ 298 "movi v23.8b, #255 \n" /* A */
299 MEMACCESS(3) 299 MEMACCESS(3)
300 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" 300 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
301 "b.gt 1b \n" 301 "b.gt 1b \n"
302 : "+r"(src_y), // %0 302 : "+r"(src_y), // %0
303 "+r"(src_u), // %1 303 "+r"(src_u), // %1
304 "+r"(src_v), // %2 304 "+r"(src_v), // %2
305 "+r"(dst_abgr), // %3 305 "+r"(dst_abgr), // %3
306 "+r"(width) // %4 306 "+r"(width) // %4
307 : [kUVBiasBGR]"r"(&kUVBiasBGR), 307 : [kUVBiasBGR]"r"(&kUVBiasBGR),
308 [kYToRgb]"r"(&kYToRgb) 308 [kYToRgb]"r"(&kYToRgb)
309 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 309 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
310 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 310 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
311 ); 311 );
312 } 312 }
313 #endif // HAS_I422TOABGRROW_NEON 313 #endif // HAS_I422TOABGRROW_NEON
314 314
315 #ifdef HAS_I422TORGBAROW_NEON 315 #ifdef HAS_I422TORGBAROW_NEON
316 void I422ToRGBARow_NEON(const uint8* src_y, 316 void I422ToRGBARow_NEON(const uint8* src_y,
317 const uint8* src_u, 317 const uint8* src_u,
318 const uint8* src_v, 318 const uint8* src_v,
319 uint8* dst_rgba, 319 uint8* dst_rgba,
320 int width) { 320 int width) {
321 asm volatile ( 321 asm volatile (
322 YUV422TORGB_SETUP_REG 322 YUV422TORGB_SETUP_REG
323 "1: \n" 323 "1: \n"
324 READYUV422 324 READYUV422
325 YUV422TORGB(v23, v22, v21) 325 YUV422TORGB(v23, v22, v21)
326 "subs %4, %4, #8 \n" 326 "subs %w4, %w4, #8 \n"
327 "movi v20.8b, #255 \n" /* A */ 327 "movi v20.8b, #255 \n" /* A */
328 MEMACCESS(3) 328 MEMACCESS(3)
329 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" 329 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
330 "b.gt 1b \n" 330 "b.gt 1b \n"
331 : "+r"(src_y), // %0 331 : "+r"(src_y), // %0
332 "+r"(src_u), // %1 332 "+r"(src_u), // %1
333 "+r"(src_v), // %2 333 "+r"(src_v), // %2
334 "+r"(dst_rgba), // %3 334 "+r"(dst_rgba), // %3
335 "+r"(width) // %4 335 "+r"(width) // %4
336 : [kUVBiasBGR]"r"(&kUVBiasBGR), 336 : [kUVBiasBGR]"r"(&kUVBiasBGR),
337 [kYToRgb]"r"(&kYToRgb) 337 [kYToRgb]"r"(&kYToRgb)
338 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 338 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
339 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 339 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
340 ); 340 );
341 } 341 }
342 #endif // HAS_I422TORGBAROW_NEON 342 #endif // HAS_I422TORGBAROW_NEON
343 343
344 #ifdef HAS_I422TORGB24ROW_NEON 344 #ifdef HAS_I422TORGB24ROW_NEON
345 void I422ToRGB24Row_NEON(const uint8* src_y, 345 void I422ToRGB24Row_NEON(const uint8* src_y,
346 const uint8* src_u, 346 const uint8* src_u,
347 const uint8* src_v, 347 const uint8* src_v,
348 uint8* dst_rgb24, 348 uint8* dst_rgb24,
349 int width) { 349 int width) {
350 asm volatile ( 350 asm volatile (
351 YUV422TORGB_SETUP_REG 351 YUV422TORGB_SETUP_REG
352 "1: \n" 352 "1: \n"
353 READYUV422 353 READYUV422
354 YUV422TORGB(v22, v21, v20) 354 YUV422TORGB(v22, v21, v20)
355 "subs %4, %4, #8 \n" 355 "subs %w4, %w4, #8 \n"
356 MEMACCESS(3) 356 MEMACCESS(3)
357 "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" 357 "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
358 "b.gt 1b \n" 358 "b.gt 1b \n"
359 : "+r"(src_y), // %0 359 : "+r"(src_y), // %0
360 "+r"(src_u), // %1 360 "+r"(src_u), // %1
361 "+r"(src_v), // %2 361 "+r"(src_v), // %2
362 "+r"(dst_rgb24), // %3 362 "+r"(dst_rgb24), // %3
363 "+r"(width) // %4 363 "+r"(width) // %4
364 : [kUVBiasBGR]"r"(&kUVBiasBGR), 364 : [kUVBiasBGR]"r"(&kUVBiasBGR),
365 [kYToRgb]"r"(&kYToRgb) 365 [kYToRgb]"r"(&kYToRgb)
366 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 366 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
367 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 367 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
368 ); 368 );
369 } 369 }
370 #endif // HAS_I422TORGB24ROW_NEON 370 #endif // HAS_I422TORGB24ROW_NEON
371 371
372 #ifdef HAS_I422TORAWROW_NEON 372 #ifdef HAS_I422TORAWROW_NEON
373 void I422ToRAWRow_NEON(const uint8* src_y, 373 void I422ToRAWRow_NEON(const uint8* src_y,
374 const uint8* src_u, 374 const uint8* src_u,
375 const uint8* src_v, 375 const uint8* src_v,
376 uint8* dst_raw, 376 uint8* dst_raw,
377 int width) { 377 int width) {
378 asm volatile ( 378 asm volatile (
379 YUV422TORGB_SETUP_REG 379 YUV422TORGB_SETUP_REG
380 "1: \n" 380 "1: \n"
381 READYUV422 381 READYUV422
382 YUV422TORGB(v20, v21, v22) 382 YUV422TORGB(v20, v21, v22)
383 "subs %4, %4, #8 \n" 383 "subs %w4, %w4, #8 \n"
384 MEMACCESS(3) 384 MEMACCESS(3)
385 "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" 385 "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
386 "b.gt 1b \n" 386 "b.gt 1b \n"
387 : "+r"(src_y), // %0 387 : "+r"(src_y), // %0
388 "+r"(src_u), // %1 388 "+r"(src_u), // %1
389 "+r"(src_v), // %2 389 "+r"(src_v), // %2
390 "+r"(dst_raw), // %3 390 "+r"(dst_raw), // %3
391 "+r"(width) // %4 391 "+r"(width) // %4
392 : [kUVBiasBGR]"r"(&kUVBiasBGR), 392 : [kUVBiasBGR]"r"(&kUVBiasBGR),
393 [kYToRgb]"r"(&kYToRgb) 393 [kYToRgb]"r"(&kYToRgb)
(...skipping 14 matching lines...) Expand all
408 void I422ToRGB565Row_NEON(const uint8* src_y, 408 void I422ToRGB565Row_NEON(const uint8* src_y,
409 const uint8* src_u, 409 const uint8* src_u,
410 const uint8* src_v, 410 const uint8* src_v,
411 uint8* dst_rgb565, 411 uint8* dst_rgb565,
412 int width) { 412 int width) {
413 asm volatile ( 413 asm volatile (
414 YUV422TORGB_SETUP_REG 414 YUV422TORGB_SETUP_REG
415 "1: \n" 415 "1: \n"
416 READYUV422 416 READYUV422
417 YUV422TORGB(v22, v21, v20) 417 YUV422TORGB(v22, v21, v20)
418 "subs %4, %4, #8 \n" 418 "subs %w4, %w4, #8 \n"
419 ARGBTORGB565 419 ARGBTORGB565
420 MEMACCESS(3) 420 MEMACCESS(3)
421 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. 421 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
422 "b.gt 1b \n" 422 "b.gt 1b \n"
423 : "+r"(src_y), // %0 423 : "+r"(src_y), // %0
424 "+r"(src_u), // %1 424 "+r"(src_u), // %1
425 "+r"(src_v), // %2 425 "+r"(src_v), // %2
426 "+r"(dst_rgb565), // %3 426 "+r"(dst_rgb565), // %3
427 "+r"(width) // %4 427 "+r"(width) // %4
428 : [kUVBiasBGR]"r"(&kUVBiasBGR), 428 : [kUVBiasBGR]"r"(&kUVBiasBGR),
(...skipping 17 matching lines...) Expand all
446 void I422ToARGB1555Row_NEON(const uint8* src_y, 446 void I422ToARGB1555Row_NEON(const uint8* src_y,
447 const uint8* src_u, 447 const uint8* src_u,
448 const uint8* src_v, 448 const uint8* src_v,
449 uint8* dst_argb1555, 449 uint8* dst_argb1555,
450 int width) { 450 int width) {
451 asm volatile ( 451 asm volatile (
452 YUV422TORGB_SETUP_REG 452 YUV422TORGB_SETUP_REG
453 "1: \n" 453 "1: \n"
454 READYUV422 454 READYUV422
455 YUV422TORGB(v22, v21, v20) 455 YUV422TORGB(v22, v21, v20)
456 "subs %4, %4, #8 \n" 456 "subs %w4, %w4, #8 \n"
457 "movi v23.8b, #255 \n" 457 "movi v23.8b, #255 \n"
458 ARGBTOARGB1555 458 ARGBTOARGB1555
459 MEMACCESS(3) 459 MEMACCESS(3)
460 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB1555. 460 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB1555.
461 "b.gt 1b \n" 461 "b.gt 1b \n"
462 : "+r"(src_y), // %0 462 : "+r"(src_y), // %0
463 "+r"(src_u), // %1 463 "+r"(src_u), // %1
464 "+r"(src_v), // %2 464 "+r"(src_v), // %2
465 "+r"(dst_argb1555), // %3 465 "+r"(dst_argb1555), // %3
466 "+r"(width) // %4 466 "+r"(width) // %4
(...skipping 20 matching lines...) Expand all
487 const uint8* src_u, 487 const uint8* src_u,
488 const uint8* src_v, 488 const uint8* src_v,
489 uint8* dst_argb4444, 489 uint8* dst_argb4444,
490 int width) { 490 int width) {
491 asm volatile ( 491 asm volatile (
492 YUV422TORGB_SETUP_REG 492 YUV422TORGB_SETUP_REG
493 "movi v4.16b, #0x0f \n" // bits to clear with vbic. 493 "movi v4.16b, #0x0f \n" // bits to clear with vbic.
494 "1: \n" 494 "1: \n"
495 READYUV422 495 READYUV422
496 YUV422TORGB(v22, v21, v20) 496 YUV422TORGB(v22, v21, v20)
497 "subs %4, %4, #8 \n" 497 "subs %w4, %w4, #8 \n"
498 "movi v23.8b, #255 \n" 498 "movi v23.8b, #255 \n"
499 ARGBTOARGB4444 499 ARGBTOARGB4444
500 MEMACCESS(3) 500 MEMACCESS(3)
501 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444. 501 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444.
502 "b.gt 1b \n" 502 "b.gt 1b \n"
503 : "+r"(src_y), // %0 503 : "+r"(src_y), // %0
504 "+r"(src_u), // %1 504 "+r"(src_u), // %1
505 "+r"(src_v), // %2 505 "+r"(src_v), // %2
506 "+r"(dst_argb4444), // %3 506 "+r"(dst_argb4444), // %3
507 "+r"(width) // %4 507 "+r"(width) // %4
508 : [kUVBiasBGR]"r"(&kUVBiasBGR), 508 : [kUVBiasBGR]"r"(&kUVBiasBGR),
509 [kYToRgb]"r"(&kYToRgb) 509 [kYToRgb]"r"(&kYToRgb)
510 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 510 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
511 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 511 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
512 ); 512 );
513 } 513 }
514 #endif // HAS_I422TOARGB4444ROW_NEON 514 #endif // HAS_I422TOARGB4444ROW_NEON
515 515
516 #ifdef HAS_YTOARGBROW_NEON 516 #ifdef HAS_I400TOARGBROW_NEON
517 void YToARGBRow_NEON(const uint8* src_y, 517 void I400ToARGBRow_NEON(const uint8* src_y,
518 uint8* dst_argb, 518 uint8* dst_argb,
519 int width) { 519 int width) {
520 int64 width64 = (int64)(width);
520 asm volatile ( 521 asm volatile (
521 YUV422TORGB_SETUP_REG 522 YUV422TORGB_SETUP_REG
522 "1: \n" 523 "1: \n"
523 READYUV400 524 READYUV400
524 YUV422TORGB(v22, v21, v20) 525 YUV422TORGB(v22, v21, v20)
525 "subs %2, %2, #8 \n" 526 "subs %w2, %w2, #8 \n"
526 "movi v23.8b, #255 \n" 527 "movi v23.8b, #255 \n"
527 MEMACCESS(1) 528 MEMACCESS(1)
528 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" 529 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
529 "b.gt 1b \n" 530 "b.gt 1b \n"
530 : "+r"(src_y), // %0 531 : "+r"(src_y), // %0
531 "+r"(dst_argb), // %1 532 "+r"(dst_argb), // %1
532 "+r"(width) // %2 533 "+r"(width64) // %2
533 : [kUVBiasBGR]"r"(&kUVBiasBGR), 534 : [kUVBiasBGR]"r"(&kUVBiasBGR),
534 [kYToRgb]"r"(&kYToRgb) 535 [kYToRgb]"r"(&kYToRgb)
535 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 536 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
536 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 537 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
537 ); 538 );
538 } 539 }
539 #endif // HAS_YTOARGBROW_NEON 540 #endif // HAS_I400TOARGBROW_NEON
540 541
541 #ifdef HAS_I400TOARGBROW_NEON 542 #ifdef HAS_J400TOARGBROW_NEON
542 void I400ToARGBRow_NEON(const uint8* src_y, 543 void J400ToARGBRow_NEON(const uint8* src_y,
543 uint8* dst_argb, 544 uint8* dst_argb,
544 int width) { 545 int width) {
545 asm volatile ( 546 asm volatile (
546 "movi v23.8b, #255 \n" 547 "movi v23.8b, #255 \n"
547 "1: \n" 548 "1: \n"
548 MEMACCESS(0) 549 MEMACCESS(0)
549 "ld1 {v20.8b}, [%0], #8 \n" 550 "ld1 {v20.8b}, [%0], #8 \n"
550 "orr v21.8b, v20.8b, v20.8b \n" 551 "orr v21.8b, v20.8b, v20.8b \n"
551 "orr v22.8b, v20.8b, v20.8b \n" 552 "orr v22.8b, v20.8b, v20.8b \n"
552 "subs %2, %2, #8 \n" 553 "subs %w2, %w2, #8 \n"
553 MEMACCESS(1) 554 MEMACCESS(1)
554 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" 555 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
555 "b.gt 1b \n" 556 "b.gt 1b \n"
556 : "+r"(src_y), // %0 557 : "+r"(src_y), // %0
557 "+r"(dst_argb), // %1 558 "+r"(dst_argb), // %1
558 "+r"(width) // %2 559 "+r"(width) // %2
559 : 560 :
560 : "cc", "memory", "v20", "v21", "v22", "v23" 561 : "cc", "memory", "v20", "v21", "v22", "v23"
561 ); 562 );
562 } 563 }
563 #endif // HAS_I400TOARGBROW_NEON 564 #endif // HAS_J400TOARGBROW_NEON
564 565
565 #ifdef HAS_NV12TOARGBROW_NEON 566 #ifdef HAS_NV12TOARGBROW_NEON
566 void NV12ToARGBRow_NEON(const uint8* src_y, 567 void NV12ToARGBRow_NEON(const uint8* src_y,
567 const uint8* src_uv, 568 const uint8* src_uv,
568 uint8* dst_argb, 569 uint8* dst_argb,
569 int width) { 570 int width) {
570 asm volatile ( 571 asm volatile (
571 YUV422TORGB_SETUP_REG 572 YUV422TORGB_SETUP_REG
572 "1: \n" 573 "1: \n"
573 READNV12 574 READNV12
574 YUV422TORGB(v22, v21, v20) 575 YUV422TORGB(v22, v21, v20)
575 "subs %3, %3, #8 \n" 576 "subs %w3, %w3, #8 \n"
576 "movi v23.8b, #255 \n" 577 "movi v23.8b, #255 \n"
577 MEMACCESS(2) 578 MEMACCESS(2)
578 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" 579 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
579 "b.gt 1b \n" 580 "b.gt 1b \n"
580 : "+r"(src_y), // %0 581 : "+r"(src_y), // %0
581 "+r"(src_uv), // %1 582 "+r"(src_uv), // %1
582 "+r"(dst_argb), // %2 583 "+r"(dst_argb), // %2
583 "+r"(width) // %3 584 "+r"(width) // %3
584 : [kUVBiasBGR]"r"(&kUVBiasBGR), 585 : [kUVBiasBGR]"r"(&kUVBiasBGR),
585 [kYToRgb]"r"(&kYToRgb) 586 [kYToRgb]"r"(&kYToRgb)
586 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 587 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
587 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 588 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
588 ); 589 );
589 } 590 }
590 #endif // HAS_NV12TOARGBROW_NEON 591 #endif // HAS_NV12TOARGBROW_NEON
591 592
592 #ifdef HAS_NV21TOARGBROW_NEON 593 #ifdef HAS_NV21TOARGBROW_NEON
593 void NV21ToARGBRow_NEON(const uint8* src_y, 594 void NV21ToARGBRow_NEON(const uint8* src_y,
594 const uint8* src_uv, 595 const uint8* src_uv,
595 uint8* dst_argb, 596 uint8* dst_argb,
596 int width) { 597 int width) {
597 asm volatile ( 598 asm volatile (
598 YUV422TORGB_SETUP_REG 599 YUV422TORGB_SETUP_REG
599 "1: \n" 600 "1: \n"
600 READNV21 601 READNV21
601 YUV422TORGB(v22, v21, v20) 602 YUV422TORGB(v22, v21, v20)
602 "subs %3, %3, #8 \n" 603 "subs %w3, %w3, #8 \n"
603 "movi v23.8b, #255 \n" 604 "movi v23.8b, #255 \n"
604 MEMACCESS(2) 605 MEMACCESS(2)
605 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" 606 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
606 "b.gt 1b \n" 607 "b.gt 1b \n"
607 : "+r"(src_y), // %0 608 : "+r"(src_y), // %0
608 "+r"(src_uv), // %1 609 "+r"(src_uv), // %1
609 "+r"(dst_argb), // %2 610 "+r"(dst_argb), // %2
610 "+r"(width) // %3 611 "+r"(width) // %3
611 : [kUVBiasBGR]"r"(&kUVBiasBGR), 612 : [kUVBiasBGR]"r"(&kUVBiasBGR),
612 [kYToRgb]"r"(&kYToRgb) 613 [kYToRgb]"r"(&kYToRgb)
613 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 614 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
614 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 615 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
615 ); 616 );
616 } 617 }
617 #endif // HAS_NV21TOARGBROW_NEON 618 #endif // HAS_NV21TOARGBROW_NEON
618 619
619 #ifdef HAS_NV12TORGB565ROW_NEON 620 #ifdef HAS_NV12TORGB565ROW_NEON
620 void NV12ToRGB565Row_NEON(const uint8* src_y, 621 void NV12ToRGB565Row_NEON(const uint8* src_y,
621 const uint8* src_uv, 622 const uint8* src_uv,
622 uint8* dst_rgb565, 623 uint8* dst_rgb565,
623 int width) { 624 int width) {
624 asm volatile ( 625 asm volatile (
625 YUV422TORGB_SETUP_REG 626 YUV422TORGB_SETUP_REG
626 "1: \n" 627 "1: \n"
627 READNV12 628 READNV12
628 YUV422TORGB(v22, v21, v20) 629 YUV422TORGB(v22, v21, v20)
629 "subs %3, %3, #8 \n" 630 "subs %w3, %w3, #8 \n"
630 ARGBTORGB565 631 ARGBTORGB565
631 MEMACCESS(2) 632 MEMACCESS(2)
632 "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565. 633 "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565.
633 "b.gt 1b \n" 634 "b.gt 1b \n"
634 : "+r"(src_y), // %0 635 : "+r"(src_y), // %0
635 "+r"(src_uv), // %1 636 "+r"(src_uv), // %1
636 "+r"(dst_rgb565), // %2 637 "+r"(dst_rgb565), // %2
637 "+r"(width) // %3 638 "+r"(width) // %3
638 : [kUVBiasBGR]"r"(&kUVBiasBGR), 639 : [kUVBiasBGR]"r"(&kUVBiasBGR),
639 [kYToRgb]"r"(&kYToRgb) 640 [kYToRgb]"r"(&kYToRgb)
640 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 641 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
641 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 642 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
642 ); 643 );
643 } 644 }
644 #endif // HAS_NV12TORGB565ROW_NEON 645 #endif // HAS_NV12TORGB565ROW_NEON
645 646
646 #ifdef HAS_NV21TORGB565ROW_NEON 647 #ifdef HAS_NV21TORGB565ROW_NEON
647 void NV21ToRGB565Row_NEON(const uint8* src_y, 648 void NV21ToRGB565Row_NEON(const uint8* src_y,
648 const uint8* src_uv, 649 const uint8* src_uv,
649 uint8* dst_rgb565, 650 uint8* dst_rgb565,
650 int width) { 651 int width) {
651 asm volatile ( 652 asm volatile (
652 YUV422TORGB_SETUP_REG 653 YUV422TORGB_SETUP_REG
653 "1: \n" 654 "1: \n"
654 READNV21 655 READNV21
655 YUV422TORGB(v22, v21, v20) 656 YUV422TORGB(v22, v21, v20)
656 "subs %3, %3, #8 \n" 657 "subs %w3, %w3, #8 \n"
657 ARGBTORGB565 658 ARGBTORGB565
658 MEMACCESS(2) 659 MEMACCESS(2)
659 "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565. 660 "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565.
660 "b.gt 1b \n" 661 "b.gt 1b \n"
661 : "+r"(src_y), // %0 662 : "+r"(src_y), // %0
662 "+r"(src_uv), // %1 663 "+r"(src_uv), // %1
663 "+r"(dst_rgb565), // %2 664 "+r"(dst_rgb565), // %2
664 "+r"(width) // %3 665 "+r"(width) // %3
665 : [kUVBiasBGR]"r"(&kUVBiasBGR), 666 : [kUVBiasBGR]"r"(&kUVBiasBGR),
666 [kYToRgb]"r"(&kYToRgb) 667 [kYToRgb]"r"(&kYToRgb)
667 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 668 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
668 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 669 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
669 ); 670 );
670 } 671 }
671 #endif // HAS_NV21TORGB565ROW_NEON 672 #endif // HAS_NV21TORGB565ROW_NEON
672 673
673 #ifdef HAS_YUY2TOARGBROW_NEON 674 #ifdef HAS_YUY2TOARGBROW_NEON
674 void YUY2ToARGBRow_NEON(const uint8* src_yuy2, 675 void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
675 uint8* dst_argb, 676 uint8* dst_argb,
676 int width) { 677 int width) {
678 int64 width64 = (int64)(width);
677 asm volatile ( 679 asm volatile (
678 YUV422TORGB_SETUP_REG 680 YUV422TORGB_SETUP_REG
679 "1: \n" 681 "1: \n"
680 READYUY2 682 READYUY2
681 YUV422TORGB(v22, v21, v20) 683 YUV422TORGB(v22, v21, v20)
682 "subs %2, %2, #8 \n" 684 "subs %w2, %w2, #8 \n"
683 "movi v23.8b, #255 \n" 685 "movi v23.8b, #255 \n"
684 MEMACCESS(1) 686 MEMACCESS(1)
685 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" 687 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
686 "b.gt 1b \n" 688 "b.gt 1b \n"
687 : "+r"(src_yuy2), // %0 689 : "+r"(src_yuy2), // %0
688 "+r"(dst_argb), // %1 690 "+r"(dst_argb), // %1
689 "+r"(width) // %2 691 "+r"(width64) // %2
690 : [kUVBiasBGR]"r"(&kUVBiasBGR), 692 : [kUVBiasBGR]"r"(&kUVBiasBGR),
691 [kYToRgb]"r"(&kYToRgb) 693 [kYToRgb]"r"(&kYToRgb)
692 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 694 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
693 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 695 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
694 ); 696 );
695 } 697 }
696 #endif // HAS_YUY2TOARGBROW_NEON 698 #endif // HAS_YUY2TOARGBROW_NEON
697 699
698 #ifdef HAS_UYVYTOARGBROW_NEON 700 #ifdef HAS_UYVYTOARGBROW_NEON
699 void UYVYToARGBRow_NEON(const uint8* src_uyvy, 701 void UYVYToARGBRow_NEON(const uint8* src_uyvy,
700 uint8* dst_argb, 702 uint8* dst_argb,
701 int width) { 703 int width) {
704 int64 width64 = (int64)(width);
702 asm volatile ( 705 asm volatile (
703 YUV422TORGB_SETUP_REG 706 YUV422TORGB_SETUP_REG
704 "1: \n" 707 "1: \n"
705 READUYVY 708 READUYVY
706 YUV422TORGB(v22, v21, v20) 709 YUV422TORGB(v22, v21, v20)
707 "subs %2, %2, #8 \n" 710 "subs %w2, %w2, #8 \n"
708 "movi v23.8b, #255 \n" 711 "movi v23.8b, #255 \n"
709 MEMACCESS(1) 712 MEMACCESS(1)
710 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" 713 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"
711 "b.gt 1b \n" 714 "b.gt 1b \n"
712 : "+r"(src_uyvy), // %0 715 : "+r"(src_uyvy), // %0
713 "+r"(dst_argb), // %1 716 "+r"(dst_argb), // %1
714 "+r"(width) // %2 717 "+r"(width64) // %2
715 : [kUVBiasBGR]"r"(&kUVBiasBGR), 718 : [kUVBiasBGR]"r"(&kUVBiasBGR),
716 [kYToRgb]"r"(&kYToRgb) 719 [kYToRgb]"r"(&kYToRgb)
717 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 720 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
718 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 721 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
719 ); 722 );
720 } 723 }
721 #endif // HAS_UYVYTOARGBROW_NEON 724 #endif // HAS_UYVYTOARGBROW_NEON
722 725
723 // Reads 16 pairs of UV and writes even values to dst_u and odd to dst_v. 726 // Reads 16 pairs of UV and writes even values to dst_u and odd to dst_v.
724 #ifdef HAS_SPLITUVROW_NEON 727 #ifdef HAS_SPLITUVROW_NEON
725 void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, 728 void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
726 int width) { 729 int width) {
727 asm volatile ( 730 asm volatile (
728 "1: \n" 731 "1: \n"
729 MEMACCESS(0) 732 MEMACCESS(0)
730 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV 733 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
731 "subs %3, %3, #16 \n" // 16 processed per loop 734 "subs %w3, %w3, #16 \n" // 16 processed per loop
732 MEMACCESS(1) 735 MEMACCESS(1)
733 "st1 {v0.16b}, [%1], #16 \n" // store U 736 "st1 {v0.16b}, [%1], #16 \n" // store U
734 MEMACCESS(2) 737 MEMACCESS(2)
735 "st1 {v1.16b}, [%2], #16 \n" // store V 738 "st1 {v1.16b}, [%2], #16 \n" // store V
736 "b.gt 1b \n" 739 "b.gt 1b \n"
737 : "+r"(src_uv), // %0 740 : "+r"(src_uv), // %0
738 "+r"(dst_u), // %1 741 "+r"(dst_u), // %1
739 "+r"(dst_v), // %2 742 "+r"(dst_v), // %2
740 "+r"(width) // %3 // Output registers 743 "+r"(width) // %3 // Output registers
741 : // Input registers 744 : // Input registers
742 : "cc", "memory", "v0", "v1" // Clobber List 745 : "cc", "memory", "v0", "v1" // Clobber List
743 ); 746 );
744 } 747 }
745 #endif // HAS_SPLITUVROW_NEON 748 #endif // HAS_SPLITUVROW_NEON
746 749
747 // Reads 16 U's and V's and writes out 16 pairs of UV. 750 // Reads 16 U's and V's and writes out 16 pairs of UV.
748 #ifdef HAS_MERGEUVROW_NEON 751 #ifdef HAS_MERGEUVROW_NEON
749 void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, 752 void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
750 int width) { 753 int width) {
751 asm volatile ( 754 asm volatile (
752 "1: \n" 755 "1: \n"
753 MEMACCESS(0) 756 MEMACCESS(0)
754 "ld1 {v0.16b}, [%0], #16 \n" // load U 757 "ld1 {v0.16b}, [%0], #16 \n" // load U
755 MEMACCESS(1) 758 MEMACCESS(1)
756 "ld1 {v1.16b}, [%1], #16 \n" // load V 759 "ld1 {v1.16b}, [%1], #16 \n" // load V
757 "subs %3, %3, #16 \n" // 16 processed per loop 760 "subs %w3, %w3, #16 \n" // 16 processed per loop
758 MEMACCESS(2) 761 MEMACCESS(2)
759 "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV 762 "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
760 "b.gt 1b \n" 763 "b.gt 1b \n"
761 : 764 :
762 "+r"(src_u), // %0 765 "+r"(src_u), // %0
763 "+r"(src_v), // %1 766 "+r"(src_v), // %1
764 "+r"(dst_uv), // %2 767 "+r"(dst_uv), // %2
765 "+r"(width) // %3 // Output registers 768 "+r"(width) // %3 // Output registers
766 : // Input registers 769 : // Input registers
767 : "cc", "memory", "v0", "v1" // Clobber List 770 : "cc", "memory", "v0", "v1" // Clobber List
768 ); 771 );
769 } 772 }
770 #endif // HAS_MERGEUVROW_NEON 773 #endif // HAS_MERGEUVROW_NEON
771 774
772 // Copy multiple of 32. vld4.8 allows unaligned and is fastest on a15. 775 // Copy multiple of 32. vld4.8 allows unaligned and is fastest on a15.
773 #ifdef HAS_COPYROW_NEON 776 #ifdef HAS_COPYROW_NEON
774 void CopyRow_NEON(const uint8* src, uint8* dst, int count) { 777 void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
775 asm volatile ( 778 asm volatile (
776 "1: \n" 779 "1: \n"
777 MEMACCESS(0) 780 MEMACCESS(0)
778 "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32 781 "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32
779 "subs %2, %2, #32 \n" // 32 processed per loop 782 "subs %w2, %w2, #32 \n" // 32 processed per loop
780 MEMACCESS(1) 783 MEMACCESS(1)
781 "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32 784 "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32
782 "b.gt 1b \n" 785 "b.gt 1b \n"
783 : "+r"(src), // %0 786 : "+r"(src), // %0
784 "+r"(dst), // %1 787 "+r"(dst), // %1
785 "+r"(count) // %2 // Output registers 788 "+r"(count) // %2 // Output registers
786 : // Input registers 789 : // Input registers
787 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 790 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
788 ); 791 );
789 } 792 }
790 #endif // HAS_COPYROW_NEON 793 #endif // HAS_COPYROW_NEON
791 794
792 // SetRow writes 'count' bytes using an 8 bit value repeated. 795 // SetRow writes 'count' bytes using an 8 bit value repeated.
793 void SetRow_NEON(uint8* dst, uint8 v8, int count) { 796 void SetRow_NEON(uint8* dst, uint8 v8, int count) {
794 asm volatile ( 797 asm volatile (
795 "dup v0.16b, %w2 \n" // duplicate 16 bytes 798 "dup v0.16b, %w2 \n" // duplicate 16 bytes
796 "1: \n" 799 "1: \n"
797 "subs %1, %1, #16 \n" // 16 bytes per loop 800 "subs %w1, %w1, #16 \n" // 16 bytes per loop
798 MEMACCESS(0) 801 MEMACCESS(0)
799 "st1 {v0.16b}, [%0], #16 \n" // store 802 "st1 {v0.16b}, [%0], #16 \n" // store
800 "b.gt 1b \n" 803 "b.gt 1b \n"
801 : "+r"(dst), // %0 804 : "+r"(dst), // %0
802 "+r"(count) // %1 805 "+r"(count) // %1
803 : "r"(v8) // %2 806 : "r"(v8) // %2
804 : "cc", "memory", "v0" 807 : "cc", "memory", "v0"
805 ); 808 );
806 } 809 }
807 810
808 void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) { 811 void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
809 asm volatile ( 812 asm volatile (
810 "dup v0.4s, %w2 \n" // duplicate 4 ints 813 "dup v0.4s, %w2 \n" // duplicate 4 ints
811 "1: \n" 814 "1: \n"
812 "subs %1, %1, #4 \n" // 4 ints per loop 815 "subs %w1, %w1, #4 \n" // 4 ints per loop
813 MEMACCESS(0) 816 MEMACCESS(0)
814 "st1 {v0.16b}, [%0], #16 \n" // store 817 "st1 {v0.16b}, [%0], #16 \n" // store
815 "b.gt 1b \n" 818 "b.gt 1b \n"
816 : "+r"(dst), // %0 819 : "+r"(dst), // %0
817 "+r"(count) // %1 820 "+r"(count) // %1
818 : "r"(v32) // %2 821 : "r"(v32) // %2
819 : "cc", "memory", "v0" 822 : "cc", "memory", "v0"
820 ); 823 );
821 } 824 }
822 825
823 #ifdef HAS_MIRRORROW_NEON 826 #ifdef HAS_MIRRORROW_NEON
824 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { 827 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
828 int64 width64 = (int64) width;
825 asm volatile ( 829 asm volatile (
826 // Start at end of source row. 830 // Start at end of source row.
827 "add %0, %0, %2 \n" 831 "add %0, %0, %2 \n"
828 "sub %0, %0, #16 \n" 832 "sub %0, %0, #16 \n"
829 833
830 "1: \n" 834 "1: \n"
831 MEMACCESS(0) 835 MEMACCESS(0)
832 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 836 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
833 "subs %2, %2, #16 \n" // 16 pixels per loop. 837 "subs %2, %2, #16 \n" // 16 pixels per loop.
834 "rev64 v0.16b, v0.16b \n" 838 "rev64 v0.16b, v0.16b \n"
835 MEMACCESS(1) 839 MEMACCESS(1)
836 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 840 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
837 MEMACCESS(1) 841 MEMACCESS(1)
838 "st1 {v0.D}[0], [%1], #8 \n" 842 "st1 {v0.D}[0], [%1], #8 \n"
839 "b.gt 1b \n" 843 "b.gt 1b \n"
840 : "+r"(src), // %0 844 : "+r"(src), // %0
841 "+r"(dst), // %1 845 "+r"(dst), // %1
842 "+r"(width) // %2 846 "+r"(width64) // %2
843 : "r"((ptrdiff_t)-16) // %3 847 : "r"((ptrdiff_t)-16) // %3
844 : "cc", "memory", "v0" 848 : "cc", "memory", "v0"
845 ); 849 );
846 } 850 }
847 #endif // HAS_MIRRORROW_NEON 851 #endif // HAS_MIRRORROW_NEON
848 852
849 #ifdef HAS_MIRRORUVROW_NEON 853 #ifdef HAS_MIRRORUVROW_NEON
850 void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, 854 void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
851 int width) { 855 int width) {
856 int64 width64 = (int64) width;
852 asm volatile ( 857 asm volatile (
853 // Start at end of source row. 858 // Start at end of source row.
854 "add %0, %0, %3, lsl #1 \n" 859 "add %0, %0, %3, lsl #1 \n"
855 "sub %0, %0, #16 \n" 860 "sub %0, %0, #16 \n"
856 861
857 "1: \n" 862 "1: \n"
858 MEMACCESS(0) 863 MEMACCESS(0)
859 "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 864 "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
860 "subs %3, %3, #8 \n" // 8 pixels per loop. 865 "subs %3, %3, #8 \n" // 8 pixels per loop.
861 "rev64 v0.8b, v0.8b \n" 866 "rev64 v0.8b, v0.8b \n"
862 "rev64 v1.8b, v1.8b \n" 867 "rev64 v1.8b, v1.8b \n"
863 MEMACCESS(1) 868 MEMACCESS(1)
864 "st1 {v0.8b}, [%1], #8 \n" // dst += 8 869 "st1 {v0.8b}, [%1], #8 \n" // dst += 8
865 MEMACCESS(2) 870 MEMACCESS(2)
866 "st1 {v1.8b}, [%2], #8 \n" 871 "st1 {v1.8b}, [%2], #8 \n"
867 "b.gt 1b \n" 872 "b.gt 1b \n"
868 : "+r"(src_uv), // %0 873 : "+r"(src_uv), // %0
869 "+r"(dst_u), // %1 874 "+r"(dst_u), // %1
870 "+r"(dst_v), // %2 875 "+r"(dst_v), // %2
871 "+r"(width) // %3 876 "+r"(width64) // %3
872 : "r"((ptrdiff_t)-16) // %4 877 : "r"((ptrdiff_t)-16) // %4
873 : "cc", "memory", "v0", "v1" 878 : "cc", "memory", "v0", "v1"
874 ); 879 );
875 } 880 }
876 #endif // HAS_MIRRORUVROW_NEON 881 #endif // HAS_MIRRORUVROW_NEON
877 882
878 #ifdef HAS_ARGBMIRRORROW_NEON 883 #ifdef HAS_ARGBMIRRORROW_NEON
879 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { 884 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
885 int64 width64 = (int64) width;
880 asm volatile ( 886 asm volatile (
881 // Start at end of source row. 887 // Start at end of source row.
882 "add %0, %0, %2, lsl #2 \n" 888 "add %0, %0, %2, lsl #2 \n"
883 "sub %0, %0, #16 \n" 889 "sub %0, %0, #16 \n"
884 890
885 "1: \n" 891 "1: \n"
886 MEMACCESS(0) 892 MEMACCESS(0)
887 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 893 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
888 "subs %2, %2, #4 \n" // 4 pixels per loop. 894 "subs %2, %2, #4 \n" // 4 pixels per loop.
889 "rev64 v0.4s, v0.4s \n" 895 "rev64 v0.4s, v0.4s \n"
890 MEMACCESS(1) 896 MEMACCESS(1)
891 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 897 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
892 MEMACCESS(1) 898 MEMACCESS(1)
893 "st1 {v0.D}[0], [%1], #8 \n" 899 "st1 {v0.D}[0], [%1], #8 \n"
894 "b.gt 1b \n" 900 "b.gt 1b \n"
895 : "+r"(src), // %0 901 : "+r"(src), // %0
896 "+r"(dst), // %1 902 "+r"(dst), // %1
897 "+r"(width) // %2 903 "+r"(width64) // %2
898 : "r"((ptrdiff_t)-16) // %3 904 : "r"((ptrdiff_t)-16) // %3
899 : "cc", "memory", "v0" 905 : "cc", "memory", "v0"
900 ); 906 );
901 } 907 }
902 #endif // HAS_ARGBMIRRORROW_NEON 908 #endif // HAS_ARGBMIRRORROW_NEON
903 909
904 #ifdef HAS_RGB24TOARGBROW_NEON 910 #ifdef HAS_RGB24TOARGBROW_NEON
905 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { 911 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
906 asm volatile ( 912 asm volatile (
907 "movi v4.8b, #255 \n" // Alpha 913 "movi v4.8b, #255 \n" // Alpha
908 "1: \n" 914 "1: \n"
909 MEMACCESS(0) 915 MEMACCESS(0)
910 "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. 916 "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
911 "subs %2, %2, #8 \n" // 8 processed per loop. 917 "subs %w2, %w2, #8 \n" // 8 processed per loop.
912 MEMACCESS(1) 918 MEMACCESS(1)
913 "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels 919 "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels
914 "b.gt 1b \n" 920 "b.gt 1b \n"
915 : "+r"(src_rgb24), // %0 921 : "+r"(src_rgb24), // %0
916 "+r"(dst_argb), // %1 922 "+r"(dst_argb), // %1
917 "+r"(pix) // %2 923 "+r"(pix) // %2
918 : 924 :
919 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List 925 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
920 ); 926 );
921 } 927 }
922 #endif // HAS_RGB24TOARGBROW_NEON 928 #endif // HAS_RGB24TOARGBROW_NEON
923 929
924 #ifdef HAS_RAWTOARGBROW_NEON 930 #ifdef HAS_RAWTOARGBROW_NEON
925 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { 931 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
926 asm volatile ( 932 asm volatile (
927 "movi v5.8b, #255 \n" // Alpha 933 "movi v5.8b, #255 \n" // Alpha
928 "1: \n" 934 "1: \n"
929 MEMACCESS(0) 935 MEMACCESS(0)
930 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b 936 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
931 "subs %2, %2, #8 \n" // 8 processed per loop. 937 "subs %w2, %w2, #8 \n" // 8 processed per loop.
932 "orr v3.8b, v1.8b, v1.8b \n" // move g 938 "orr v3.8b, v1.8b, v1.8b \n" // move g
933 "orr v4.8b, v0.8b, v0.8b \n" // move r 939 "orr v4.8b, v0.8b, v0.8b \n" // move r
934 MEMACCESS(1) 940 MEMACCESS(1)
935 "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a 941 "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
936 "b.gt 1b \n" 942 "b.gt 1b \n"
937 : "+r"(src_raw), // %0 943 : "+r"(src_raw), // %0
938 "+r"(dst_argb), // %1 944 "+r"(dst_argb), // %1
939 "+r"(pix) // %2 945 "+r"(pix) // %2
940 : 946 :
941 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List 947 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
(...skipping 14 matching lines...) Expand all
956 "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ 962 "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \
957 "dup v2.2D, v0.D[1] \n" /* R */ 963 "dup v2.2D, v0.D[1] \n" /* R */
958 964
959 #ifdef HAS_RGB565TOARGBROW_NEON 965 #ifdef HAS_RGB565TOARGBROW_NEON
960 void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { 966 void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
961 asm volatile ( 967 asm volatile (
962 "movi v3.8b, #255 \n" // Alpha 968 "movi v3.8b, #255 \n" // Alpha
963 "1: \n" 969 "1: \n"
964 MEMACCESS(0) 970 MEMACCESS(0)
965 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. 971 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
966 "subs %2, %2, #8 \n" // 8 processed per loop. 972 "subs %w2, %w2, #8 \n" // 8 processed per loop.
967 RGB565TOARGB 973 RGB565TOARGB
968 MEMACCESS(1) 974 MEMACCESS(1)
969 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels 975 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
970 "b.gt 1b \n" 976 "b.gt 1b \n"
971 : "+r"(src_rgb565), // %0 977 : "+r"(src_rgb565), // %0
972 "+r"(dst_argb), // %1 978 "+r"(dst_argb), // %1
973 "+r"(pix) // %2 979 "+r"(pix) // %2
974 : 980 :
975 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List 981 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List
976 ); 982 );
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after
1015 "dup v1.2D, v0.D[1] \n" /* G */ \ 1021 "dup v1.2D, v0.D[1] \n" /* G */ \
1016 1022
1017 #ifdef HAS_ARGB1555TOARGBROW_NEON 1023 #ifdef HAS_ARGB1555TOARGBROW_NEON
1018 void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, 1024 void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
1019 int pix) { 1025 int pix) {
1020 asm volatile ( 1026 asm volatile (
1021 "movi v3.8b, #255 \n" // Alpha 1027 "movi v3.8b, #255 \n" // Alpha
1022 "1: \n" 1028 "1: \n"
1023 MEMACCESS(0) 1029 MEMACCESS(0)
1024 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. 1030 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
1025 "subs %2, %2, #8 \n" // 8 processed per loop. 1031 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1026 ARGB1555TOARGB 1032 ARGB1555TOARGB
1027 MEMACCESS(1) 1033 MEMACCESS(1)
1028 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels 1034 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
1029 "b.gt 1b \n" 1035 "b.gt 1b \n"
1030 : "+r"(src_argb1555), // %0 1036 : "+r"(src_argb1555), // %0
1031 "+r"(dst_argb), // %1 1037 "+r"(dst_argb), // %1
1032 "+r"(pix) // %2 1038 "+r"(pix) // %2
1033 : 1039 :
1034 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 1040 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
1035 ); 1041 );
(...skipping 12 matching lines...) Expand all
1048 "dup v0.2D, v2.D[1] \n" \ 1054 "dup v0.2D, v2.D[1] \n" \
1049 "dup v1.2D, v3.D[1] \n" 1055 "dup v1.2D, v3.D[1] \n"
1050 1056
1051 #ifdef HAS_ARGB4444TOARGBROW_NEON 1057 #ifdef HAS_ARGB4444TOARGBROW_NEON
1052 void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, 1058 void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
1053 int pix) { 1059 int pix) {
1054 asm volatile ( 1060 asm volatile (
1055 "1: \n" 1061 "1: \n"
1056 MEMACCESS(0) 1062 MEMACCESS(0)
1057 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. 1063 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
1058 "subs %2, %2, #8 \n" // 8 processed per loop. 1064 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1059 ARGB4444TOARGB 1065 ARGB4444TOARGB
1060 MEMACCESS(1) 1066 MEMACCESS(1)
1061 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels 1067 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
1062 "b.gt 1b \n" 1068 "b.gt 1b \n"
1063 : "+r"(src_argb4444), // %0 1069 : "+r"(src_argb4444), // %0
1064 "+r"(dst_argb), // %1 1070 "+r"(dst_argb), // %1
1065 "+r"(pix) // %2 1071 "+r"(pix) // %2
1066 : 1072 :
1067 : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List 1073 : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
1068 ); 1074 );
1069 } 1075 }
1070 #endif // HAS_ARGB4444TOARGBROW_NEON 1076 #endif // HAS_ARGB4444TOARGBROW_NEON
1071 1077
1072 #ifdef HAS_ARGBTORGB24ROW_NEON 1078 #ifdef HAS_ARGBTORGB24ROW_NEON
1073 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { 1079 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
1074 asm volatile ( 1080 asm volatile (
1075 "1: \n" 1081 "1: \n"
1076 MEMACCESS(0) 1082 MEMACCESS(0)
1077 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels 1083 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels
1078 "subs %2, %2, #8 \n" // 8 processed per loop. 1084 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1079 MEMACCESS(1) 1085 MEMACCESS(1)
1080 "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. 1086 "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24.
1081 "b.gt 1b \n" 1087 "b.gt 1b \n"
1082 : "+r"(src_argb), // %0 1088 : "+r"(src_argb), // %0
1083 "+r"(dst_rgb24), // %1 1089 "+r"(dst_rgb24), // %1
1084 "+r"(pix) // %2 1090 "+r"(pix) // %2
1085 : 1091 :
1086 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List 1092 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
1087 ); 1093 );
1088 } 1094 }
1089 #endif // HAS_ARGBTORGB24ROW_NEON 1095 #endif // HAS_ARGBTORGB24ROW_NEON
1090 1096
1091 #ifdef HAS_ARGBTORAWROW_NEON 1097 #ifdef HAS_ARGBTORAWROW_NEON
1092 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { 1098 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
1093 asm volatile ( 1099 asm volatile (
1094 "1: \n" 1100 "1: \n"
1095 MEMACCESS(0) 1101 MEMACCESS(0)
1096 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a 1102 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
1097 "subs %2, %2, #8 \n" // 8 processed per loop. 1103 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1098 "orr v4.8b, v2.8b, v2.8b \n" // mov g 1104 "orr v4.8b, v2.8b, v2.8b \n" // mov g
1099 "orr v5.8b, v1.8b, v1.8b \n" // mov b 1105 "orr v5.8b, v1.8b, v1.8b \n" // mov b
1100 MEMACCESS(1) 1106 MEMACCESS(1)
1101 "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b 1107 "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
1102 "b.gt 1b \n" 1108 "b.gt 1b \n"
1103 : "+r"(src_argb), // %0 1109 : "+r"(src_argb), // %0
1104 "+r"(dst_raw), // %1 1110 "+r"(dst_raw), // %1
1105 "+r"(pix) // %2 1111 "+r"(pix) // %2
1106 : 1112 :
1107 : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List 1113 : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List
1108 ); 1114 );
1109 } 1115 }
1110 #endif // HAS_ARGBTORAWROW_NEON 1116 #endif // HAS_ARGBTORAWROW_NEON
1111 1117
1112 #ifdef HAS_YUY2TOYROW_NEON 1118 #ifdef HAS_YUY2TOYROW_NEON
1113 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { 1119 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
1114 asm volatile ( 1120 asm volatile (
1115 "1: \n" 1121 "1: \n"
1116 MEMACCESS(0) 1122 MEMACCESS(0)
1117 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. 1123 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
1118 "subs %2, %2, #16 \n" // 16 processed per loop. 1124 "subs %w2, %w2, #16 \n" // 16 processed per loop.
1119 MEMACCESS(1) 1125 MEMACCESS(1)
1120 "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. 1126 "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
1121 "b.gt 1b \n" 1127 "b.gt 1b \n"
1122 : "+r"(src_yuy2), // %0 1128 : "+r"(src_yuy2), // %0
1123 "+r"(dst_y), // %1 1129 "+r"(dst_y), // %1
1124 "+r"(pix) // %2 1130 "+r"(pix) // %2
1125 : 1131 :
1126 : "cc", "memory", "v0", "v1" // Clobber List 1132 : "cc", "memory", "v0", "v1" // Clobber List
1127 ); 1133 );
1128 } 1134 }
1129 #endif // HAS_YUY2TOYROW_NEON 1135 #endif // HAS_YUY2TOYROW_NEON
1130 1136
1131 #ifdef HAS_UYVYTOYROW_NEON 1137 #ifdef HAS_UYVYTOYROW_NEON
1132 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { 1138 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
1133 asm volatile ( 1139 asm volatile (
1134 "1: \n" 1140 "1: \n"
1135 MEMACCESS(0) 1141 MEMACCESS(0)
1136 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. 1142 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
1137 "subs %2, %2, #16 \n" // 16 processed per loop. 1143 "subs %w2, %w2, #16 \n" // 16 processed per loop.
1138 MEMACCESS(1) 1144 MEMACCESS(1)
1139 "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. 1145 "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
1140 "b.gt 1b \n" 1146 "b.gt 1b \n"
1141 : "+r"(src_uyvy), // %0 1147 : "+r"(src_uyvy), // %0
1142 "+r"(dst_y), // %1 1148 "+r"(dst_y), // %1
1143 "+r"(pix) // %2 1149 "+r"(pix) // %2
1144 : 1150 :
1145 : "cc", "memory", "v0", "v1" // Clobber List 1151 : "cc", "memory", "v0", "v1" // Clobber List
1146 ); 1152 );
1147 } 1153 }
1148 #endif // HAS_UYVYTOYROW_NEON 1154 #endif // HAS_UYVYTOYROW_NEON
1149 1155
1150 #ifdef HAS_YUY2TOUV422ROW_NEON 1156 #ifdef HAS_YUY2TOUV422ROW_NEON
1151 void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, 1157 void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
1152 int pix) { 1158 int pix) {
1153 asm volatile ( 1159 asm volatile (
1154 "1: \n" 1160 "1: \n"
1155 MEMACCESS(0) 1161 MEMACCESS(0)
1156 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels 1162 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels
1157 "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. 1163 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
1158 MEMACCESS(1) 1164 MEMACCESS(1)
1159 "st1 {v1.8b}, [%1], #8 \n" // store 8 U. 1165 "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
1160 MEMACCESS(2) 1166 MEMACCESS(2)
1161 "st1 {v3.8b}, [%2], #8 \n" // store 8 V. 1167 "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
1162 "b.gt 1b \n" 1168 "b.gt 1b \n"
1163 : "+r"(src_yuy2), // %0 1169 : "+r"(src_yuy2), // %0
1164 "+r"(dst_u), // %1 1170 "+r"(dst_u), // %1
1165 "+r"(dst_v), // %2 1171 "+r"(dst_v), // %2
1166 "+r"(pix) // %3 1172 "+r"(pix) // %3
1167 : 1173 :
1168 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 1174 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
1169 ); 1175 );
1170 } 1176 }
1171 #endif // HAS_YUY2TOUV422ROW_NEON 1177 #endif // HAS_YUY2TOUV422ROW_NEON
1172 1178
1173 #ifdef HAS_UYVYTOUV422ROW_NEON 1179 #ifdef HAS_UYVYTOUV422ROW_NEON
1174 void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, 1180 void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
1175 int pix) { 1181 int pix) {
1176 asm volatile ( 1182 asm volatile (
1177 "1: \n" 1183 "1: \n"
1178 MEMACCESS(0) 1184 MEMACCESS(0)
1179 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels 1185 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels
1180 "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. 1186 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
1181 MEMACCESS(1) 1187 MEMACCESS(1)
1182 "st1 {v0.8b}, [%1], #8 \n" // store 8 U. 1188 "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
1183 MEMACCESS(2) 1189 MEMACCESS(2)
1184 "st1 {v2.8b}, [%2], #8 \n" // store 8 V. 1190 "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
1185 "b.gt 1b \n" 1191 "b.gt 1b \n"
1186 : "+r"(src_uyvy), // %0 1192 : "+r"(src_uyvy), // %0
1187 "+r"(dst_u), // %1 1193 "+r"(dst_u), // %1
1188 "+r"(dst_v), // %2 1194 "+r"(dst_v), // %2
1189 "+r"(pix) // %3 1195 "+r"(pix) // %3
1190 : 1196 :
1191 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 1197 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
1192 ); 1198 );
1193 } 1199 }
1194 #endif // HAS_UYVYTOUV422ROW_NEON 1200 #endif // HAS_UYVYTOUV422ROW_NEON
1195 1201
1196 #ifdef HAS_YUY2TOUVROW_NEON 1202 #ifdef HAS_YUY2TOUVROW_NEON
1197 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, 1203 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
1198 uint8* dst_u, uint8* dst_v, int pix) { 1204 uint8* dst_u, uint8* dst_v, int pix) {
1199 const uint8* src_yuy2b = src_yuy2 + stride_yuy2; 1205 const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
1200 asm volatile ( 1206 asm volatile (
1201 "1: \n" 1207 "1: \n"
1202 MEMACCESS(0) 1208 MEMACCESS(0)
1203 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels 1209 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
1204 "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. 1210 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
1205 MEMACCESS(1) 1211 MEMACCESS(1)
1206 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row 1212 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
1207 "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U 1213 "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
1208 "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V 1214 "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
1209 MEMACCESS(2) 1215 MEMACCESS(2)
1210 "st1 {v1.8b}, [%2], #8 \n" // store 8 U. 1216 "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
1211 MEMACCESS(3) 1217 MEMACCESS(3)
1212 "st1 {v3.8b}, [%3], #8 \n" // store 8 V. 1218 "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
1213 "b.gt 1b \n" 1219 "b.gt 1b \n"
1214 : "+r"(src_yuy2), // %0 1220 : "+r"(src_yuy2), // %0
1215 "+r"(src_yuy2b), // %1 1221 "+r"(src_yuy2b), // %1
1216 "+r"(dst_u), // %2 1222 "+r"(dst_u), // %2
1217 "+r"(dst_v), // %3 1223 "+r"(dst_v), // %3
1218 "+r"(pix) // %4 1224 "+r"(pix) // %4
1219 : 1225 :
1220 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", 1226 : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
1221 "v5", "v6", "v7" // Clobber List 1227 "v5", "v6", "v7" // Clobber List
1222 ); 1228 );
1223 } 1229 }
1224 #endif // HAS_YUY2TOUVROW_NEON 1230 #endif // HAS_YUY2TOUVROW_NEON
1225 1231
1226 #ifdef HAS_UYVYTOUVROW_NEON 1232 #ifdef HAS_UYVYTOUVROW_NEON
1227 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, 1233 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
1228 uint8* dst_u, uint8* dst_v, int pix) { 1234 uint8* dst_u, uint8* dst_v, int pix) {
1229 const uint8* src_uyvyb = src_uyvy + stride_uyvy; 1235 const uint8* src_uyvyb = src_uyvy + stride_uyvy;
1230 asm volatile ( 1236 asm volatile (
1231 "1: \n" 1237 "1: \n"
1232 MEMACCESS(0) 1238 MEMACCESS(0)
1233 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels 1239 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
1234 "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. 1240 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
1235 MEMACCESS(1) 1241 MEMACCESS(1)
1236 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row 1242 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
1237 "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U 1243 "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
1238 "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V 1244 "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
1239 MEMACCESS(2) 1245 MEMACCESS(2)
1240 "st1 {v0.8b}, [%2], #8 \n" // store 8 U. 1246 "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
1241 MEMACCESS(3) 1247 MEMACCESS(3)
1242 "st1 {v2.8b}, [%3], #8 \n" // store 8 V. 1248 "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
1243 "b.gt 1b \n" 1249 "b.gt 1b \n"
1244 : "+r"(src_uyvy), // %0 1250 : "+r"(src_uyvy), // %0
1245 "+r"(src_uyvyb), // %1 1251 "+r"(src_uyvyb), // %1
1246 "+r"(dst_u), // %2 1252 "+r"(dst_u), // %2
1247 "+r"(dst_v), // %3 1253 "+r"(dst_v), // %3
1248 "+r"(pix) // %4 1254 "+r"(pix) // %4
1249 : 1255 :
1250 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", 1256 : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
1251 "v5", "v6", "v7" // Clobber List 1257 "v5", "v6", "v7" // Clobber List
1252 ); 1258 );
1253 } 1259 }
1254 #endif // HAS_UYVYTOUVROW_NEON 1260 #endif // HAS_UYVYTOUVROW_NEON
1255 1261
1256 // Select G channels from ARGB. e.g. GGGGGGGG
1257 #ifdef HAS_ARGBTOBAYERGGROW_NEON
1258 void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
1259 uint32 /*selector*/, int pix) {
1260 asm volatile (
1261 "1: \n"
1262 MEMACCESS(0)
1263 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load row 8 pixels
1264 "subs %2, %2, #8 \n" // 8 processed per loop
1265 MEMACCESS(1)
1266 "st1 {v1.8b}, [%1], #8 \n" // store 8 G's.
1267 "b.gt 1b \n"
1268 : "+r"(src_argb), // %0
1269 "+r"(dst_bayer), // %1
1270 "+r"(pix) // %2
1271 :
1272 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
1273 );
1274 }
1275 #endif // HAS_ARGBTOBAYERGGROW_NEON
1276
1277 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 1262 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
1278 #ifdef HAS_ARGBSHUFFLEROW_NEON 1263 #ifdef HAS_ARGBSHUFFLEROW_NEON
1279 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, 1264 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
1280 const uint8* shuffler, int pix) { 1265 const uint8* shuffler, int pix) {
1281 asm volatile ( 1266 asm volatile (
1282 MEMACCESS(3) 1267 MEMACCESS(3)
1283 "ld1 {v2.16b}, [%3] \n" // shuffler 1268 "ld1 {v2.16b}, [%3] \n" // shuffler
1284 "1: \n" 1269 "1: \n"
1285 MEMACCESS(0) 1270 MEMACCESS(0)
1286 "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. 1271 "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
1287 "subs %2, %2, #4 \n" // 4 processed per loop 1272 "subs %w2, %w2, #4 \n" // 4 processed per loop
1288 "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels 1273 "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
1289 MEMACCESS(1) 1274 MEMACCESS(1)
1290 "st1 {v1.16b}, [%1], #16 \n" // store 4. 1275 "st1 {v1.16b}, [%1], #16 \n" // store 4.
1291 "b.gt 1b \n" 1276 "b.gt 1b \n"
1292 : "+r"(src_argb), // %0 1277 : "+r"(src_argb), // %0
1293 "+r"(dst_argb), // %1 1278 "+r"(dst_argb), // %1
1294 "+r"(pix) // %2 1279 "+r"(pix) // %2
1295 : "r"(shuffler) // %3 1280 : "r"(shuffler) // %3
1296 : "cc", "memory", "v0", "v1", "v2" // Clobber List 1281 : "cc", "memory", "v0", "v1", "v2" // Clobber List
1297 ); 1282 );
1298 } 1283 }
1299 #endif // HAS_ARGBSHUFFLEROW_NEON 1284 #endif // HAS_ARGBSHUFFLEROW_NEON
1300 1285
1301 #ifdef HAS_I422TOYUY2ROW_NEON 1286 #ifdef HAS_I422TOYUY2ROW_NEON
1302 void I422ToYUY2Row_NEON(const uint8* src_y, 1287 void I422ToYUY2Row_NEON(const uint8* src_y,
1303 const uint8* src_u, 1288 const uint8* src_u,
1304 const uint8* src_v, 1289 const uint8* src_v,
1305 uint8* dst_yuy2, int width) { 1290 uint8* dst_yuy2, int width) {
1306 asm volatile ( 1291 asm volatile (
1307 "1: \n" 1292 "1: \n"
1308 MEMACCESS(0) 1293 MEMACCESS(0)
1309 "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys 1294 "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
1310 "orr v2.8b, v1.8b, v1.8b \n" 1295 "orr v2.8b, v1.8b, v1.8b \n"
1311 MEMACCESS(1) 1296 MEMACCESS(1)
1312 "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us 1297 "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
1313 MEMACCESS(2) 1298 MEMACCESS(2)
1314 "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs 1299 "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
1315 "subs %4, %4, #16 \n" // 16 pixels 1300 "subs %w4, %w4, #16 \n" // 16 pixels
1316 MEMACCESS(3) 1301 MEMACCESS(3)
1317 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. 1302 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
1318 "b.gt 1b \n" 1303 "b.gt 1b \n"
1319 : "+r"(src_y), // %0 1304 : "+r"(src_y), // %0
1320 "+r"(src_u), // %1 1305 "+r"(src_u), // %1
1321 "+r"(src_v), // %2 1306 "+r"(src_v), // %2
1322 "+r"(dst_yuy2), // %3 1307 "+r"(dst_yuy2), // %3
1323 "+r"(width) // %4 1308 "+r"(width) // %4
1324 : 1309 :
1325 : "cc", "memory", "v0", "v1", "v2", "v3" 1310 : "cc", "memory", "v0", "v1", "v2", "v3"
1326 ); 1311 );
1327 } 1312 }
1328 #endif // HAS_I422TOYUY2ROW_NEON 1313 #endif // HAS_I422TOYUY2ROW_NEON
1329 1314
1330 #ifdef HAS_I422TOUYVYROW_NEON 1315 #ifdef HAS_I422TOUYVYROW_NEON
1331 void I422ToUYVYRow_NEON(const uint8* src_y, 1316 void I422ToUYVYRow_NEON(const uint8* src_y,
1332 const uint8* src_u, 1317 const uint8* src_u,
1333 const uint8* src_v, 1318 const uint8* src_v,
1334 uint8* dst_uyvy, int width) { 1319 uint8* dst_uyvy, int width) {
1335 asm volatile ( 1320 asm volatile (
1336 "1: \n" 1321 "1: \n"
1337 MEMACCESS(0) 1322 MEMACCESS(0)
1338 "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys 1323 "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
1339 "orr v3.8b, v2.8b, v2.8b \n" 1324 "orr v3.8b, v2.8b, v2.8b \n"
1340 MEMACCESS(1) 1325 MEMACCESS(1)
1341 "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us 1326 "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
1342 MEMACCESS(2) 1327 MEMACCESS(2)
1343 "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs 1328 "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
1344 "subs %4, %4, #16 \n" // 16 pixels 1329 "subs %w4, %w4, #16 \n" // 16 pixels
1345 MEMACCESS(3) 1330 MEMACCESS(3)
1346 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. 1331 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
1347 "b.gt 1b \n" 1332 "b.gt 1b \n"
1348 : "+r"(src_y), // %0 1333 : "+r"(src_y), // %0
1349 "+r"(src_u), // %1 1334 "+r"(src_u), // %1
1350 "+r"(src_v), // %2 1335 "+r"(src_v), // %2
1351 "+r"(dst_uyvy), // %3 1336 "+r"(dst_uyvy), // %3
1352 "+r"(width) // %4 1337 "+r"(width) // %4
1353 : 1338 :
1354 : "cc", "memory", "v0", "v1", "v2", "v3" 1339 : "cc", "memory", "v0", "v1", "v2", "v3"
1355 ); 1340 );
1356 } 1341 }
1357 #endif // HAS_I422TOUYVYROW_NEON 1342 #endif // HAS_I422TOUYVYROW_NEON
1358 1343
1359 #ifdef HAS_ARGBTORGB565ROW_NEON 1344 #ifdef HAS_ARGBTORGB565ROW_NEON
// ARGBToRGB565Row_NEON: converts a row of 32-bit ARGB pixels to 16-bit RGB565.
//   src_argb   - source row; 32 bytes (8 pixels) are read per iteration.
//   dst_rgb565 - destination row; 16 bytes (8 pixels) are written per iteration.
//   pix        - pixel count; reduced by 8 each pass, and the loop repeats while
//                the result is > 0. At least one group of 8 is therefore always
//                processed, and a count that is not a multiple of 8 rounds up.
// The actual channel packing is done by the ARGBTORGB565 macro (defined outside
// this excerpt); it is expected to leave the packed pixels in v0, which is the
// register stored afterwards.
// Review note: every row of this side-by-side diff shows the old text followed
// by the new text. The only change in this function is "%2" -> "%w2" on the
// subs row, so the counter is addressed through its 32-bit W register name,
// matching the int type of pix.
1360 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { 1345 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
1361 asm volatile ( 1346 asm volatile (
1362 "1: \n" 1347 "1: \n"
1363 MEMACCESS(0) 1348 MEMACCESS(0)
1364 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels 1349 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
// subs sets the condition flags consumed by the b.gt at the bottom of the loop;
// the instructions in between must therefore leave the flags untouched.
1365 "subs %2, %2, #8 \n" // 8 processed per loop. 1350 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1366 ARGBTORGB565 1351 ARGBTORGB565
1367 MEMACCESS(1) 1352 MEMACCESS(1)
1368 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. 1353 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
1369 "b.gt 1b \n" 1354 "b.gt 1b \n"
// All three operands are read-write ("+r") because the asm post-increments both
// pointers and decrements the counter.
1370 : "+r"(src_argb), // %0 1355 : "+r"(src_argb), // %0
1371 "+r"(dst_rgb565), // %1 1356 "+r"(dst_rgb565), // %1
1372 "+r"(pix) // %2 1357 "+r"(pix) // %2
1373 : 1358 :
1374 : "cc", "memory", "v0", "v20", "v21", "v22", "v23" 1359 : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
1375 ); 1360 );
1376 } 1361 }
1377 #endif // HAS_ARGBTORGB565ROW_NEON 1362 #endif // HAS_ARGBTORGB565ROW_NEON
1378 1363
#ifdef HAS_ARGBTORGB565DITHERROW_NEON
// ARGBToRGB565DitherRow_NEON: converts a row of 32-bit ARGB pixels to 16-bit
// RGB565, adding a dither value to the colour channels before packing.
//   src_argb - source row; 32 bytes (8 pixels) are read per iteration.
//   dst_rgb  - destination row; 16 bytes (8 pixels) are written per iteration.
//   dither4  - four 8-bit dither values packed in one 32-bit word. The word is
//              replicated across v1, so pixel i of each group of 8 receives
//              byte (i % 4) of dither4.
//   width    - pixel count; reduced by 8 each pass, and the loop repeats while
//              the result is > 0. At least one group of 8 is therefore always
//              processed, and a count that is not a multiple of 8 rounds up.
// The dither is applied with a saturating unsigned add (uqadd) to v20-v22, the
// first three deinterleaved channels (B, G, R by the channel order the sibling
// row functions document for ld4); v23 is not dithered. Packing is done by the
// ARGBTORGB565 macro, which is defined outside this block and is expected to
// leave the packed pixels in v0.
//
// Operand constraints: src_argb and width are declared read-write ("+r")
// because the asm post-increments the source pointer and decrements the
// counter. Declaring them as plain inputs, as this block previously did, tells
// the compiler those registers are unchanged after the asm, which is not true.
void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
                                const uint32 dither4, int width) {
  asm volatile (
    "dup        v1.4s, %w3                     \n"  // dither4
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    "uqadd      v20.8b, v20.8b, v1.8b          \n"
    "uqadd      v21.8b, v21.8b, v1.8b          \n"
    "uqadd      v22.8b, v22.8b, v1.8b          \n"
    ARGBTORGB565
    MEMACCESS(1)
    "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels RGB565.
    "b.gt       1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_rgb),   // %1
    "+r"(width)      // %2
  : "r"(dither4)     // %3
  : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"
  );
}
#endif  // HAS_ARGBTORGB565DITHERROW_NEON
1388
1379 #ifdef HAS_ARGBTOARGB1555ROW_NEON 1389 #ifdef HAS_ARGBTOARGB1555ROW_NEON
// ARGBToARGB1555Row_NEON: converts a row of 32-bit ARGB pixels to 16-bit
// ARGB1555.
//   src_argb     - source row; 32 bytes (8 pixels) are read per iteration.
//   dst_argb1555 - destination row; 16 bytes (8 pixels) are written per
//                  iteration.
//   pix          - pixel count; reduced by 8 each pass, and the loop repeats
//                  while the result is > 0. At least one group of 8 is always
//                  processed, and a count that is not a multiple of 8 rounds up.
// The bit packing is done by the ARGBTOARGB1555 macro (defined outside this
// excerpt); it is expected to leave the packed pixels in v0, which is the
// register stored afterwards.
// Review note: every row of this side-by-side diff shows the old text followed
// by the new text. The only change in this function is "%2" -> "%w2" on the
// subs row (32-bit W register name for the int counter).
1380 void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, 1390 void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
1381 int pix) { 1391 int pix) {
1382 asm volatile ( 1392 asm volatile (
1383 "1: \n" 1393 "1: \n"
1384 MEMACCESS(0) 1394 MEMACCESS(0)
1385 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels 1395 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
// subs sets the flags that the closing b.gt tests.
1386 "subs %2, %2, #8 \n" // 8 processed per loop. 1396 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1387 ARGBTOARGB1555 1397 ARGBTOARGB1555
1388 MEMACCESS(1) 1398 MEMACCESS(1)
1389 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555. 1399 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555.
1390 "b.gt 1b \n" 1400 "b.gt 1b \n"
// Read-write operands: both pointers are post-incremented and the counter is
// decremented inside the asm.
1391 : "+r"(src_argb), // %0 1401 : "+r"(src_argb), // %0
1392 "+r"(dst_argb1555), // %1 1402 "+r"(dst_argb1555), // %1
1393 "+r"(pix) // %2 1403 "+r"(pix) // %2
1394 : 1404 :
1395 : "cc", "memory", "v0", "v20", "v21", "v22", "v23" 1405 : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
1396 ); 1406 );
1397 } 1407 }
1398 #endif // HAS_ARGBTOARGB1555ROW_NEON 1408 #endif // HAS_ARGBTOARGB1555ROW_NEON
1399 1409
1400 #ifdef HAS_ARGBTOARGB4444ROW_NEON 1410 #ifdef HAS_ARGBTOARGB4444ROW_NEON
// ARGBToARGB4444Row_NEON: converts a row of 32-bit ARGB pixels to 16-bit
// ARGB4444.
//   src_argb     - source row; 32 bytes (8 pixels) are read per iteration.
//   dst_argb4444 - destination row; 16 bytes (8 pixels) are written per
//                  iteration.
//   pix          - pixel count; reduced by 8 each pass, and the loop repeats
//                  while the result is > 0. At least one group of 8 is always
//                  processed, and a count that is not a multiple of 8 rounds up.
// v4 is loaded once, before the loop, with 0x0f in every byte; per the inline
// comment it is the bit mask consumed by the ARGBTOARGB4444 macro. That macro
// is defined outside this excerpt and is expected to leave the packed pixels in
// v0 (v1 appears in the clobber list, presumably as its scratch register).
// Review note: every row of this side-by-side diff shows the old text followed
// by the new text. The only change in this function is "%2" -> "%w2" on the
// subs row (32-bit W register name for the int counter).
1401 void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, 1411 void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
1402 int pix) { 1412 int pix) {
1403 asm volatile ( 1413 asm volatile (
1404 "movi v4.16b, #0x0f \n" // bits to clear with vbic. 1414 "movi v4.16b, #0x0f \n" // bits to clear with vbic.
1405 "1: \n" 1415 "1: \n"
1406 MEMACCESS(0) 1416 MEMACCESS(0)
1407 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels 1417 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
// subs sets the flags that the closing b.gt tests.
1408 "subs %2, %2, #8 \n" // 8 processed per loop. 1418 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1409 ARGBTOARGB4444 1419 ARGBTOARGB4444
1410 MEMACCESS(1) 1420 MEMACCESS(1)
1411 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444. 1421 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444.
1412 "b.gt 1b \n" 1422 "b.gt 1b \n"
// Read-write operands: both pointers are post-incremented and the counter is
// decremented inside the asm.
1413 : "+r"(src_argb), // %0 1423 : "+r"(src_argb), // %0
1414 "+r"(dst_argb4444), // %1 1424 "+r"(dst_argb4444), // %1
1415 "+r"(pix) // %2 1425 "+r"(pix) // %2
1416 : 1426 :
1417 : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23" 1427 : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"
1418 ); 1428 );
1419 } 1429 }
1420 #endif // HAS_ARGBTOARGB4444ROW_NEON 1430 #endif // HAS_ARGBTOARGB4444ROW_NEON
1421 1431
1422 #ifdef HAS_ARGBTOYROW_NEON 1432 #ifdef HAS_ARGBTOYROW_NEON
// ARGBToYRow_NEON: computes one 8-bit luma (Y) value per 32-bit ARGB pixel.
//   src_argb - source row; 32 bytes (8 pixels) are read per iteration.
//   dst_y    - destination row; 8 bytes (8 Y values) are written per iteration.
//   pix      - pixel count; reduced by 8 each pass, and the loop repeats while
//              the result is > 0. At least one group of 8 is always processed,
//              and a count that is not a multiple of 8 rounds up.
// Per pixel the code evaluates
//   Y = sat8( round((13*B + 65*G + 33*R) / 128) + 16 )
// using a widening multiply-accumulate into 16-bit lanes, a rounding narrowing
// shift by 7, and a saturating add of the constant 16. The alpha channel (v3
// after the load) is not used; v3 is reused as the 16-bit accumulator.
// Review note: every row of this side-by-side diff shows the old text followed
// by the new text. The only change in this function is "%2" -> "%w2" on the
// subs row (32-bit W register name for the int counter).
1423 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { 1433 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
1424 asm volatile ( 1434 asm volatile (
// Loop-invariant coefficients, set up once before the loop.
1425 "movi v4.8b, #13 \n" // B * 0.1016 coefficient 1435 "movi v4.8b, #13 \n" // B * 0.1016 coefficient
1426 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 1436 "movi v5.8b, #65 \n" // G * 0.5078 coefficient
1427 "movi v6.8b, #33 \n" // R * 0.2578 coefficient 1437 "movi v6.8b, #33 \n" // R * 0.2578 coefficient
1428 "movi v7.8b, #16 \n" // Add 16 constant 1438 "movi v7.8b, #16 \n" // Add 16 constant
1429 "1: \n" 1439 "1: \n"
1430 MEMACCESS(0) 1440 MEMACCESS(0)
1431 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 1441 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
// subs sets the flags that the closing b.gt tests; the vector instructions in
// between do not modify them.
1432 "subs %2, %2, #8 \n" // 8 processed per loop. 1442 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1433 "umull v3.8h, v0.8b, v4.8b \n" // B 1443 "umull v3.8h, v0.8b, v4.8b \n" // B
1434 "umlal v3.8h, v1.8b, v5.8b \n" // G 1444 "umlal v3.8h, v1.8b, v5.8b \n" // G
1435 "umlal v3.8h, v2.8b, v6.8b \n" // R 1445 "umlal v3.8h, v2.8b, v6.8b \n" // R
1436 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y 1446 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
1437 "uqadd v0.8b, v0.8b, v7.8b \n" 1447 "uqadd v0.8b, v0.8b, v7.8b \n"
1438 MEMACCESS(1) 1448 MEMACCESS(1)
1439 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 1449 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
1440 "b.gt 1b \n" 1450 "b.gt 1b \n"
// Read-write operands: both pointers are post-incremented and the counter is
// decremented inside the asm.
1441 : "+r"(src_argb), // %0 1451 : "+r"(src_argb), // %0
1442 "+r"(dst_y), // %1 1452 "+r"(dst_y), // %1
1443 "+r"(pix) // %2 1453 "+r"(pix) // %2
1444 : 1454 :
1445 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" 1455 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
1446 ); 1456 );
1447 } 1457 }
1448 #endif // HAS_ARGBTOYROW_NEON 1458 #endif // HAS_ARGBTOYROW_NEON
1449 1459
1450 #ifdef HAS_ARGBTOYJROW_NEON 1460 #ifdef HAS_ARGBTOYJROW_NEON
1451 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { 1461 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
1452 asm volatile ( 1462 asm volatile (
1453 "movi v4.8b, #15 \n" // B * 0.11400 coefficient 1463 "movi v4.8b, #15 \n" // B * 0.11400 coefficient
1454 "movi v5.8b, #75 \n" // G * 0.58700 coefficient 1464 "movi v5.8b, #75 \n" // G * 0.58700 coefficient
1455 "movi v6.8b, #38 \n" // R * 0.29900 coefficient 1465 "movi v6.8b, #38 \n" // R * 0.29900 coefficient
1456 "1: \n" 1466 "1: \n"
1457 MEMACCESS(0) 1467 MEMACCESS(0)
1458 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 1468 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
1459 "subs %2, %2, #8 \n" // 8 processed per loop. 1469 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1460 "umull v3.8h, v0.8b, v4.8b \n" // B 1470 "umull v3.8h, v0.8b, v4.8b \n" // B
1461 "umlal v3.8h, v1.8b, v5.8b \n" // G 1471 "umlal v3.8h, v1.8b, v5.8b \n" // G
1462 "umlal v3.8h, v2.8b, v6.8b \n" // R 1472 "umlal v3.8h, v2.8b, v6.8b \n" // R
1463 "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y 1473 "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y
1464 MEMACCESS(1) 1474 MEMACCESS(1)
1465 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 1475 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
1466 "b.gt 1b \n" 1476 "b.gt 1b \n"
1467 : "+r"(src_argb), // %0 1477 : "+r"(src_argb), // %0
1468 "+r"(dst_y), // %1 1478 "+r"(dst_y), // %1
1469 "+r"(pix) // %2 1479 "+r"(pix) // %2
(...skipping 10 matching lines...) Expand all
1480 asm volatile ( 1490 asm volatile (
1481 "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient 1491 "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient
1482 "movi v25.8b, #74 \n" // UG -0.5781 coefficient 1492 "movi v25.8b, #74 \n" // UG -0.5781 coefficient
1483 "movi v26.8b, #38 \n" // UR -0.2969 coefficient 1493 "movi v26.8b, #38 \n" // UR -0.2969 coefficient
1484 "movi v27.8b, #18 \n" // VB -0.1406 coefficient 1494 "movi v27.8b, #18 \n" // VB -0.1406 coefficient
1485 "movi v28.8b, #94 \n" // VG -0.7344 coefficient 1495 "movi v28.8b, #94 \n" // VG -0.7344 coefficient
1486 "movi v29.16b,#0x80 \n" // 128.5 1496 "movi v29.16b,#0x80 \n" // 128.5
1487 "1: \n" 1497 "1: \n"
1488 MEMACCESS(0) 1498 MEMACCESS(0)
1489 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 1499 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
1490 "subs %3, %3, #8 \n" // 8 processed per loop. 1500 "subs %w3, %w3, #8 \n" // 8 processed per loop.
1491 "umull v4.8h, v0.8b, v24.8b \n" // B 1501 "umull v4.8h, v0.8b, v24.8b \n" // B
1492 "umlsl v4.8h, v1.8b, v25.8b \n" // G 1502 "umlsl v4.8h, v1.8b, v25.8b \n" // G
1493 "umlsl v4.8h, v2.8b, v26.8b \n" // R 1503 "umlsl v4.8h, v2.8b, v26.8b \n" // R
1494 "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned 1504 "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned
1495 1505
1496 "umull v3.8h, v2.8b, v24.8b \n" // R 1506 "umull v3.8h, v2.8b, v24.8b \n" // R
1497 "umlsl v3.8h, v1.8b, v28.8b \n" // G 1507 "umlsl v3.8h, v1.8b, v28.8b \n" // G
1498 "umlsl v3.8h, v0.8b, v27.8b \n" // B 1508 "umlsl v3.8h, v0.8b, v27.8b \n" // B
1499 "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned 1509 "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned
1500 1510
(...skipping 23 matching lines...) Expand all
1524 asm volatile ( 1534 asm volatile (
1525 RGBTOUV_SETUP_REG 1535 RGBTOUV_SETUP_REG
1526 "1: \n" 1536 "1: \n"
1527 MEMACCESS(0) 1537 MEMACCESS(0)
1528 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. 1538 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
1529 1539
1530 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. 1540 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
1531 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. 1541 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
1532 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. 1542 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
1533 1543
1534 "subs %3, %3, #16 \n" // 16 processed per loop. 1544 "subs %w3, %w3, #16 \n" // 16 processed per loop.
1535 "mul v3.8h, v0.8h, v20.8h \n" // B 1545 "mul v3.8h, v0.8h, v20.8h \n" // B
1536 "mls v3.8h, v1.8h, v21.8h \n" // G 1546 "mls v3.8h, v1.8h, v21.8h \n" // G
1537 "mls v3.8h, v2.8h, v22.8h \n" // R 1547 "mls v3.8h, v2.8h, v22.8h \n" // R
1538 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned 1548 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
1539 1549
1540 "mul v4.8h, v2.8h, v20.8h \n" // R 1550 "mul v4.8h, v2.8h, v20.8h \n" // R
1541 "mls v4.8h, v1.8h, v24.8h \n" // G 1551 "mls v4.8h, v1.8h, v24.8h \n" // G
1542 "mls v4.8h, v0.8h, v23.8h \n" // B 1552 "mls v4.8h, v0.8h, v23.8h \n" // B
1543 "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned 1553 "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned
1544 1554
(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after
1580 "uaddlp v6.8h, v6.16b \n" // R 16 bytes -> 8 shorts. 1590 "uaddlp v6.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
1581 1591
1582 "addp v0.8h, v0.8h, v4.8h \n" // B 16 shorts -> 8 shorts. 1592 "addp v0.8h, v0.8h, v4.8h \n" // B 16 shorts -> 8 shorts.
1583 "addp v1.8h, v1.8h, v5.8h \n" // G 16 shorts -> 8 shorts. 1593 "addp v1.8h, v1.8h, v5.8h \n" // G 16 shorts -> 8 shorts.
1584 "addp v2.8h, v2.8h, v6.8h \n" // R 16 shorts -> 8 shorts. 1594 "addp v2.8h, v2.8h, v6.8h \n" // R 16 shorts -> 8 shorts.
1585 1595
1586 "urshr v0.8h, v0.8h, #1 \n" // 2x average 1596 "urshr v0.8h, v0.8h, #1 \n" // 2x average
1587 "urshr v1.8h, v1.8h, #1 \n" 1597 "urshr v1.8h, v1.8h, #1 \n"
1588 "urshr v2.8h, v2.8h, #1 \n" 1598 "urshr v2.8h, v2.8h, #1 \n"
1589 1599
1590 "subs %3, %3, #32 \n" // 32 processed per loop. 1600 "subs %w3, %w3, #32 \n" // 32 processed per loop.
1591 "mul v3.8h, v0.8h, v20.8h \n" // B 1601 "mul v3.8h, v0.8h, v20.8h \n" // B
1592 "mls v3.8h, v1.8h, v21.8h \n" // G 1602 "mls v3.8h, v1.8h, v21.8h \n" // G
1593 "mls v3.8h, v2.8h, v22.8h \n" // R 1603 "mls v3.8h, v2.8h, v22.8h \n" // R
1594 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned 1604 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
1595 "mul v4.8h, v2.8h, v20.8h \n" // R 1605 "mul v4.8h, v2.8h, v20.8h \n" // R
1596 "mls v4.8h, v1.8h, v24.8h \n" // G 1606 "mls v4.8h, v1.8h, v24.8h \n" // G
1597 "mls v4.8h, v0.8h, v23.8h \n" // B 1607 "mls v4.8h, v0.8h, v23.8h \n" // B
1598 "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned 1608 "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned
1599 "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U 1609 "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U
1600 "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V 1610 "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after
1646 MEMACCESS(1) 1656 MEMACCESS(1)
1647 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 1657 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
1648 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. 1658 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
1649 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. 1659 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
1650 "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. 1660 "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
1651 1661
1652 "urshr v0.8h, v0.8h, #1 \n" // 2x average 1662 "urshr v0.8h, v0.8h, #1 \n" // 2x average
1653 "urshr v1.8h, v1.8h, #1 \n" 1663 "urshr v1.8h, v1.8h, #1 \n"
1654 "urshr v2.8h, v2.8h, #1 \n" 1664 "urshr v2.8h, v2.8h, #1 \n"
1655 1665
1656 "subs %4, %4, #16 \n" // 32 processed per loop. 1666 "subs %w4, %w4, #16 \n" // 32 processed per loop.
1657 RGBTOUV(v0.8h, v1.8h, v2.8h) 1667 RGBTOUV(v0.8h, v1.8h, v2.8h)
1658 MEMACCESS(2) 1668 MEMACCESS(2)
1659 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 1669 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1660 MEMACCESS(3) 1670 MEMACCESS(3)
1661 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 1671 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1662 "b.gt 1b \n" 1672 "b.gt 1b \n"
1663 : "+r"(src_argb), // %0 1673 : "+r"(src_argb), // %0
1664 "+r"(src_argb_1), // %1 1674 "+r"(src_argb_1), // %1
1665 "+r"(dst_u), // %2 1675 "+r"(dst_u), // %2
1666 "+r"(dst_v), // %3 1676 "+r"(dst_v), // %3
(...skipping 26 matching lines...) Expand all
1693 MEMACCESS(1) 1703 MEMACCESS(1)
1694 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 1704 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
1695 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. 1705 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
1696 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. 1706 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
1697 "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. 1707 "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
1698 1708
1699 "urshr v0.8h, v0.8h, #1 \n" // 2x average 1709 "urshr v0.8h, v0.8h, #1 \n" // 2x average
1700 "urshr v1.8h, v1.8h, #1 \n" 1710 "urshr v1.8h, v1.8h, #1 \n"
1701 "urshr v2.8h, v2.8h, #1 \n" 1711 "urshr v2.8h, v2.8h, #1 \n"
1702 1712
1703 "subs %4, %4, #16 \n" // 32 processed per loop. 1713 "subs %w4, %w4, #16 \n" // 32 processed per loop.
1704 RGBTOUV(v0.8h, v1.8h, v2.8h) 1714 RGBTOUV(v0.8h, v1.8h, v2.8h)
1705 MEMACCESS(2) 1715 MEMACCESS(2)
1706 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 1716 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1707 MEMACCESS(3) 1717 MEMACCESS(3)
1708 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 1718 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1709 "b.gt 1b \n" 1719 "b.gt 1b \n"
1710 : "+r"(src_argb), // %0 1720 : "+r"(src_argb), // %0
1711 "+r"(src_argb_1), // %1 1721 "+r"(src_argb_1), // %1
1712 "+r"(dst_u), // %2 1722 "+r"(dst_u), // %2
1713 "+r"(dst_v), // %3 1723 "+r"(dst_v), // %3
(...skipping 20 matching lines...) Expand all
1734 MEMACCESS(1) 1744 MEMACCESS(1)
1735 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more 1745 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more
1736 "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts. 1746 "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts.
1737 "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts. 1747 "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
1738 "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts. 1748 "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts.
1739 1749
1740 "urshr v0.8h, v0.8h, #1 \n" // 2x average 1750 "urshr v0.8h, v0.8h, #1 \n" // 2x average
1741 "urshr v1.8h, v3.8h, #1 \n" 1751 "urshr v1.8h, v3.8h, #1 \n"
1742 "urshr v2.8h, v2.8h, #1 \n" 1752 "urshr v2.8h, v2.8h, #1 \n"
1743 1753
1744 "subs %4, %4, #16 \n" // 32 processed per loop. 1754 "subs %w4, %w4, #16 \n" // 32 processed per loop.
1745 RGBTOUV(v0.8h, v1.8h, v2.8h) 1755 RGBTOUV(v0.8h, v1.8h, v2.8h)
1746 MEMACCESS(2) 1756 MEMACCESS(2)
1747 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 1757 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1748 MEMACCESS(3) 1758 MEMACCESS(3)
1749 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 1759 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1750 "b.gt 1b \n" 1760 "b.gt 1b \n"
1751 : "+r"(src_bgra), // %0 1761 : "+r"(src_bgra), // %0
1752 "+r"(src_bgra_1), // %1 1762 "+r"(src_bgra_1), // %1
1753 "+r"(dst_u), // %2 1763 "+r"(dst_u), // %2
1754 "+r"(dst_v), // %3 1764 "+r"(dst_v), // %3
(...skipping 20 matching lines...) Expand all
1775 MEMACCESS(1) 1785 MEMACCESS(1)
1776 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. 1786 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
1777 "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts. 1787 "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
1778 "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts. 1788 "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
1779 "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts. 1789 "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
1780 1790
1781 "urshr v0.8h, v3.8h, #1 \n" // 2x average 1791 "urshr v0.8h, v3.8h, #1 \n" // 2x average
1782 "urshr v2.8h, v2.8h, #1 \n" 1792 "urshr v2.8h, v2.8h, #1 \n"
1783 "urshr v1.8h, v1.8h, #1 \n" 1793 "urshr v1.8h, v1.8h, #1 \n"
1784 1794
1785 "subs %4, %4, #16 \n" // 32 processed per loop. 1795 "subs %w4, %w4, #16 \n" // 32 processed per loop.
1786 RGBTOUV(v0.8h, v2.8h, v1.8h) 1796 RGBTOUV(v0.8h, v2.8h, v1.8h)
1787 MEMACCESS(2) 1797 MEMACCESS(2)
1788 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 1798 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1789 MEMACCESS(3) 1799 MEMACCESS(3)
1790 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 1800 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1791 "b.gt 1b \n" 1801 "b.gt 1b \n"
1792 : "+r"(src_abgr), // %0 1802 : "+r"(src_abgr), // %0
1793 "+r"(src_abgr_1), // %1 1803 "+r"(src_abgr_1), // %1
1794 "+r"(dst_u), // %2 1804 "+r"(dst_u), // %2
1795 "+r"(dst_v), // %3 1805 "+r"(dst_v), // %3
(...skipping 20 matching lines...) Expand all
1816 MEMACCESS(1) 1826 MEMACCESS(1)
1817 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. 1827 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
1818 "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts. 1828 "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts.
1819 "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts. 1829 "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
1820 "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts. 1830 "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts.
1821 1831
1822 "urshr v0.8h, v0.8h, #1 \n" // 2x average 1832 "urshr v0.8h, v0.8h, #1 \n" // 2x average
1823 "urshr v1.8h, v1.8h, #1 \n" 1833 "urshr v1.8h, v1.8h, #1 \n"
1824 "urshr v2.8h, v2.8h, #1 \n" 1834 "urshr v2.8h, v2.8h, #1 \n"
1825 1835
1826 "subs %4, %4, #16 \n" // 32 processed per loop. 1836 "subs %w4, %w4, #16 \n" // 32 processed per loop.
1827 RGBTOUV(v0.8h, v1.8h, v2.8h) 1837 RGBTOUV(v0.8h, v1.8h, v2.8h)
1828 MEMACCESS(2) 1838 MEMACCESS(2)
1829 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 1839 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1830 MEMACCESS(3) 1840 MEMACCESS(3)
1831 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 1841 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1832 "b.gt 1b \n" 1842 "b.gt 1b \n"
1833 : "+r"(src_rgba), // %0 1843 : "+r"(src_rgba), // %0
1834 "+r"(src_rgba_1), // %1 1844 "+r"(src_rgba_1), // %1
1835 "+r"(dst_u), // %2 1845 "+r"(dst_u), // %2
1836 "+r"(dst_v), // %3 1846 "+r"(dst_v), // %3
(...skipping 20 matching lines...) Expand all
1857 MEMACCESS(1) 1867 MEMACCESS(1)
1858 "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more. 1868 "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more.
1859 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. 1869 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
1860 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. 1870 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
1861 "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. 1871 "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
1862 1872
1863 "urshr v0.8h, v0.8h, #1 \n" // 2x average 1873 "urshr v0.8h, v0.8h, #1 \n" // 2x average
1864 "urshr v1.8h, v1.8h, #1 \n" 1874 "urshr v1.8h, v1.8h, #1 \n"
1865 "urshr v2.8h, v2.8h, #1 \n" 1875 "urshr v2.8h, v2.8h, #1 \n"
1866 1876
1867 "subs %4, %4, #16 \n" // 32 processed per loop. 1877 "subs %w4, %w4, #16 \n" // 32 processed per loop.
1868 RGBTOUV(v0.8h, v1.8h, v2.8h) 1878 RGBTOUV(v0.8h, v1.8h, v2.8h)
1869 MEMACCESS(2) 1879 MEMACCESS(2)
1870 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 1880 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1871 MEMACCESS(3) 1881 MEMACCESS(3)
1872 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 1882 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1873 "b.gt 1b \n" 1883 "b.gt 1b \n"
1874 : "+r"(src_rgb24), // %0 1884 : "+r"(src_rgb24), // %0
1875 "+r"(src_rgb24_1), // %1 1885 "+r"(src_rgb24_1), // %1
1876 "+r"(dst_u), // %2 1886 "+r"(dst_u), // %2
1877 "+r"(dst_v), // %3 1887 "+r"(dst_v), // %3
(...skipping 20 matching lines...) Expand all
1898 MEMACCESS(1) 1908 MEMACCESS(1)
1899 "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels 1909 "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels
1900 "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts. 1910 "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
1901 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. 1911 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
1902 "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts. 1912 "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
1903 1913
1904 "urshr v2.8h, v2.8h, #1 \n" // 2x average 1914 "urshr v2.8h, v2.8h, #1 \n" // 2x average
1905 "urshr v1.8h, v1.8h, #1 \n" 1915 "urshr v1.8h, v1.8h, #1 \n"
1906 "urshr v0.8h, v0.8h, #1 \n" 1916 "urshr v0.8h, v0.8h, #1 \n"
1907 1917
1908 "subs %4, %4, #16 \n" // 32 processed per loop. 1918 "subs %w4, %w4, #16 \n" // 32 processed per loop.
1909 RGBTOUV(v2.8h, v1.8h, v0.8h) 1919 RGBTOUV(v2.8h, v1.8h, v0.8h)
1910 MEMACCESS(2) 1920 MEMACCESS(2)
1911 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 1921 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1912 MEMACCESS(3) 1922 MEMACCESS(3)
1913 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 1923 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1914 "b.gt 1b \n" 1924 "b.gt 1b \n"
1915 : "+r"(src_raw), // %0 1925 : "+r"(src_raw), // %0
1916 "+r"(src_raw_1), // %1 1926 "+r"(src_raw_1), // %1
1917 "+r"(dst_u), // %2 1927 "+r"(dst_u), // %2
1918 "+r"(dst_v), // %3 1928 "+r"(dst_v), // %3
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after
1964 "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 1974 "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
1965 1975
1966 "ins v16.D[1], v17.D[0] \n" 1976 "ins v16.D[1], v17.D[0] \n"
1967 "ins v18.D[1], v19.D[0] \n" 1977 "ins v18.D[1], v19.D[0] \n"
1968 "ins v20.D[1], v21.D[0] \n" 1978 "ins v20.D[1], v21.D[0] \n"
1969 1979
1970 "urshr v4.8h, v16.8h, #1 \n" // 2x average 1980 "urshr v4.8h, v16.8h, #1 \n" // 2x average
1971 "urshr v5.8h, v18.8h, #1 \n" 1981 "urshr v5.8h, v18.8h, #1 \n"
1972 "urshr v6.8h, v20.8h, #1 \n" 1982 "urshr v6.8h, v20.8h, #1 \n"
1973 1983
1974 "subs %4, %4, #16 \n" // 16 processed per loop. 1984 "subs %w4, %w4, #16 \n" // 16 processed per loop.
1975 "mul v16.8h, v4.8h, v22.8h \n" // B 1985 "mul v16.8h, v4.8h, v22.8h \n" // B
1976 "mls v16.8h, v5.8h, v23.8h \n" // G 1986 "mls v16.8h, v5.8h, v23.8h \n" // G
1977 "mls v16.8h, v6.8h, v24.8h \n" // R 1987 "mls v16.8h, v6.8h, v24.8h \n" // R
1978 "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned 1988 "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned
1979 "mul v17.8h, v6.8h, v22.8h \n" // R 1989 "mul v17.8h, v6.8h, v22.8h \n" // R
1980 "mls v17.8h, v5.8h, v26.8h \n" // G 1990 "mls v17.8h, v5.8h, v26.8h \n" // G
1981 "mls v17.8h, v4.8h, v25.8h \n" // B 1991 "mls v17.8h, v4.8h, v25.8h \n" // B
1982 "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned 1992 "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned
1983 "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U 1993 "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U
1984 "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V 1994 "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V
(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after
2035 "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 2045 "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
2036 2046
2037 "ins v16.D[1], v26.D[0] \n" 2047 "ins v16.D[1], v26.D[0] \n"
2038 "ins v17.D[1], v27.D[0] \n" 2048 "ins v17.D[1], v27.D[0] \n"
2039 "ins v18.D[1], v28.D[0] \n" 2049 "ins v18.D[1], v28.D[0] \n"
2040 2050
2041 "urshr v4.8h, v16.8h, #1 \n" // 2x average 2051 "urshr v4.8h, v16.8h, #1 \n" // 2x average
2042 "urshr v5.8h, v17.8h, #1 \n" 2052 "urshr v5.8h, v17.8h, #1 \n"
2043 "urshr v6.8h, v18.8h, #1 \n" 2053 "urshr v6.8h, v18.8h, #1 \n"
2044 2054
2045 "subs %4, %4, #16 \n" // 16 processed per loop. 2055 "subs %w4, %w4, #16 \n" // 16 processed per loop.
2046 "mul v2.8h, v4.8h, v20.8h \n" // B 2056 "mul v2.8h, v4.8h, v20.8h \n" // B
2047 "mls v2.8h, v5.8h, v21.8h \n" // G 2057 "mls v2.8h, v5.8h, v21.8h \n" // G
2048 "mls v2.8h, v6.8h, v22.8h \n" // R 2058 "mls v2.8h, v6.8h, v22.8h \n" // R
2049 "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned 2059 "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
2050 "mul v3.8h, v6.8h, v20.8h \n" // R 2060 "mul v3.8h, v6.8h, v20.8h \n" // R
2051 "mls v3.8h, v5.8h, v24.8h \n" // G 2061 "mls v3.8h, v5.8h, v24.8h \n" // G
2052 "mls v3.8h, v4.8h, v23.8h \n" // B 2062 "mls v3.8h, v4.8h, v23.8h \n" // B
2053 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned 2063 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
2054 "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U 2064 "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
2055 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V 2065 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after
2106 "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 2116 "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
2107 2117
2108 "ins v16.D[1], v26.D[0] \n" 2118 "ins v16.D[1], v26.D[0] \n"
2109 "ins v17.D[1], v27.D[0] \n" 2119 "ins v17.D[1], v27.D[0] \n"
2110 "ins v18.D[1], v28.D[0] \n" 2120 "ins v18.D[1], v28.D[0] \n"
2111 2121
2112 "urshr v4.8h, v16.8h, #1 \n" // 2x average 2122 "urshr v4.8h, v16.8h, #1 \n" // 2x average
2113 "urshr v5.8h, v17.8h, #1 \n" 2123 "urshr v5.8h, v17.8h, #1 \n"
2114 "urshr v6.8h, v18.8h, #1 \n" 2124 "urshr v6.8h, v18.8h, #1 \n"
2115 2125
2116 "subs %4, %4, #16 \n" // 16 processed per loop. 2126 "subs %w4, %w4, #16 \n" // 16 processed per loop.
2117 "mul v2.8h, v4.8h, v20.8h \n" // B 2127 "mul v2.8h, v4.8h, v20.8h \n" // B
2118 "mls v2.8h, v5.8h, v21.8h \n" // G 2128 "mls v2.8h, v5.8h, v21.8h \n" // G
2119 "mls v2.8h, v6.8h, v22.8h \n" // R 2129 "mls v2.8h, v6.8h, v22.8h \n" // R
2120 "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned 2130 "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
2121 "mul v3.8h, v6.8h, v20.8h \n" // R 2131 "mul v3.8h, v6.8h, v20.8h \n" // R
2122 "mls v3.8h, v5.8h, v24.8h \n" // G 2132 "mls v3.8h, v5.8h, v24.8h \n" // G
2123 "mls v3.8h, v4.8h, v23.8h \n" // B 2133 "mls v3.8h, v4.8h, v23.8h \n" // B
2124 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned 2134 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
2125 "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U 2135 "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
2126 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V 2136 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
(...skipping 19 matching lines...) Expand all
2146 #ifdef HAS_RGB565TOYROW_NEON 2156 #ifdef HAS_RGB565TOYROW_NEON
// RGB565ToYRow_NEON: computes one 8-bit luma (Y) value per 16-bit RGB565 pixel.
//   src_rgb565 - source row; 16 bytes (8 pixels) are read per iteration.
//   dst_y      - destination row; 8 bytes (8 Y values) are written per
//                iteration.
//   pix        - pixel count; reduced by 8 each pass, and the loop repeats while
//                the result is > 0. At least one group of 8 is always
//                processed, and a count that is not a multiple of 8 rounds up.
// The RGB565TOARGB macro (defined outside this excerpt) unpacks the pixels; the
// code that follows reads B, G and R from v0, v1 and v2 and evaluates
//   Y = sat8( round((13*B + 65*G + 33*R) / 128) + 16 ).
// NOTE(review): v4 and v6 appear in the clobber list although this function
// does not name them directly; presumably they are scratch registers of
// RGB565TOARGB - confirm against the macro definition.
// Review note: every row of this side-by-side diff shows the old text followed
// by the new text. The only change in this function is "%2" -> "%w2" on the
// subs row (32-bit W register name for the int counter).
2147 void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { 2157 void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
2148 asm volatile ( 2158 asm volatile (
// Loop-invariant coefficients, set up once before the loop.
2149 "movi v24.8b, #13 \n" // B * 0.1016 coefficient 2159 "movi v24.8b, #13 \n" // B * 0.1016 coefficient
2150 "movi v25.8b, #65 \n" // G * 0.5078 coefficient 2160 "movi v25.8b, #65 \n" // G * 0.5078 coefficient
2151 "movi v26.8b, #33 \n" // R * 0.2578 coefficient 2161 "movi v26.8b, #33 \n" // R * 0.2578 coefficient
2152 "movi v27.8b, #16 \n" // Add 16 constant 2162 "movi v27.8b, #16 \n" // Add 16 constant
2153 "1: \n" 2163 "1: \n"
2154 MEMACCESS(0) 2164 MEMACCESS(0)
2155 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. 2165 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
// subs sets the flags that the closing b.gt tests.
2156 "subs %2, %2, #8 \n" // 8 processed per loop. 2166 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2157 RGB565TOARGB 2167 RGB565TOARGB
2158 "umull v3.8h, v0.8b, v24.8b \n" // B 2168 "umull v3.8h, v0.8b, v24.8b \n" // B
2159 "umlal v3.8h, v1.8b, v25.8b \n" // G 2169 "umlal v3.8h, v1.8b, v25.8b \n" // G
2160 "umlal v3.8h, v2.8b, v26.8b \n" // R 2170 "umlal v3.8h, v2.8b, v26.8b \n" // R
2161 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y 2171 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
2162 "uqadd v0.8b, v0.8b, v27.8b \n" 2172 "uqadd v0.8b, v0.8b, v27.8b \n"
2163 MEMACCESS(1) 2173 MEMACCESS(1)
2164 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 2174 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2165 "b.gt 1b \n" 2175 "b.gt 1b \n"
// Read-write operands: both pointers are post-incremented and the counter is
// decremented inside the asm.
2166 : "+r"(src_rgb565), // %0 2176 : "+r"(src_rgb565), // %0
2167 "+r"(dst_y), // %1 2177 "+r"(dst_y), // %1
2168 "+r"(pix) // %2 2178 "+r"(pix) // %2
2169 : 2179 :
2170 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", 2180 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6",
2171 "v24", "v25", "v26", "v27" 2181 "v24", "v25", "v26", "v27"
2172 ); 2182 );
2173 } 2183 }
2174 #endif // HAS_RGB565TOYROW_NEON 2184 #endif // HAS_RGB565TOYROW_NEON
2175 2185
2176 #ifdef HAS_ARGB1555TOYROW_NEON 2186 #ifdef HAS_ARGB1555TOYROW_NEON
// ARGB1555ToYRow_NEON: computes one 8-bit luma (Y) value per 16-bit ARGB1555
// pixel.
//   src_argb1555 - source row; 16 bytes (8 pixels) are read per iteration.
//   dst_y        - destination row; 8 bytes (8 Y values) are written per
//                  iteration.
//   pix          - pixel count; reduced by 8 each pass, and the loop repeats
//                  while the result is > 0. At least one group of 8 is always
//                  processed, and a count that is not a multiple of 8 rounds up.
// The ARGB1555TOARGB macro (defined outside this excerpt) unpacks the pixels;
// the code that follows reads B, G and R from v0, v1 and v2 and evaluates
//   Y = sat8( round((13*B + 65*G + 33*R) / 128) + 16 ).
// NOTE(review): the coefficients live in v4-v7 here, whereas the RGB565 variant
// keeps them in v24-v27. This is only correct if ARGB1555TOARGB does not use
// v4-v7 as scratch - confirm against the macro definition.
// Review note: every row of this side-by-side diff shows the old text followed
// by the new text. The only change in this function is "%2" -> "%w2" on the
// subs row (32-bit W register name for the int counter).
2177 void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { 2187 void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
2178 asm volatile ( 2188 asm volatile (
// Loop-invariant coefficients, set up once before the loop.
2179 "movi v4.8b, #13 \n" // B * 0.1016 coefficient 2189 "movi v4.8b, #13 \n" // B * 0.1016 coefficient
2180 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 2190 "movi v5.8b, #65 \n" // G * 0.5078 coefficient
2181 "movi v6.8b, #33 \n" // R * 0.2578 coefficient 2191 "movi v6.8b, #33 \n" // R * 0.2578 coefficient
2182 "movi v7.8b, #16 \n" // Add 16 constant 2192 "movi v7.8b, #16 \n" // Add 16 constant
2183 "1: \n" 2193 "1: \n"
2184 MEMACCESS(0) 2194 MEMACCESS(0)
2185 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. 2195 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
// subs sets the flags that the closing b.gt tests.
2186 "subs %2, %2, #8 \n" // 8 processed per loop. 2196 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2187 ARGB1555TOARGB 2197 ARGB1555TOARGB
2188 "umull v3.8h, v0.8b, v4.8b \n" // B 2198 "umull v3.8h, v0.8b, v4.8b \n" // B
2189 "umlal v3.8h, v1.8b, v5.8b \n" // G 2199 "umlal v3.8h, v1.8b, v5.8b \n" // G
2190 "umlal v3.8h, v2.8b, v6.8b \n" // R 2200 "umlal v3.8h, v2.8b, v6.8b \n" // R
2191 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y 2201 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
2192 "uqadd v0.8b, v0.8b, v7.8b \n" 2202 "uqadd v0.8b, v0.8b, v7.8b \n"
2193 MEMACCESS(1) 2203 MEMACCESS(1)
2194 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 2204 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2195 "b.gt 1b \n" 2205 "b.gt 1b \n"
// Read-write operands: both pointers are post-incremented and the counter is
// decremented inside the asm.
2196 : "+r"(src_argb1555), // %0 2206 : "+r"(src_argb1555), // %0
2197 "+r"(dst_y), // %1 2207 "+r"(dst_y), // %1
2198 "+r"(pix) // %2 2208 "+r"(pix) // %2
2199 : 2209 :
2200 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" 2210 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
2201 ); 2211 );
2202 } 2212 }
2203 #endif // HAS_ARGB1555TOYROW_NEON 2213 #endif // HAS_ARGB1555TOYROW_NEON
2204 2214
#ifdef HAS_ARGB4444TOYROW_NEON
// Converts one row of ARGB4444 pixels (2 bytes per pixel) to 8-bit luma.
//
//   src_argb4444  source row; 16 bytes are read per loop iteration.
//   dst_y         destination row; 8 bytes are written per loop iteration.
//   pix           pixel count. The loop body runs at least once and handles
//                 8 pixels per pass, so a positive multiple of 8 is expected
//                 (presumably ensured by the caller - not visible here).
//
// Per pixel: Y = sat_u8(round((13*B + 65*G + 33*R) / 128)) + 16, saturating.
// ARGB4444TOARGB is defined elsewhere in this file; the code after it relies
// on the macro leaving B, G, R bytes in v0, v1, v2.
void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
  asm volatile (
    // Broadcast the luma weights and the bias to every byte lane.
    "movi       v24.8b, #13                       \n"  // weight applied to B
    "movi       v25.8b, #65                       \n"  // weight applied to G
    "movi       v26.8b, #33                       \n"  // weight applied to R
    "movi       v27.8b, #16                       \n"  // bias added to Y
  "1:                                             \n"
    MEMACCESS(0)
    "ld1        {v0.16b}, [%[src]], #16           \n"  // fetch 8 pixels.
    "subs       %w[count], %w[count], #8          \n"  // count -= 8, sets flags.
    ARGB4444TOARGB
    "umull      v3.8h, v0.8b, v24.8b              \n"  // acc  = B * 13
    "umlal      v3.8h, v1.8b, v25.8b              \n"  // acc += G * 65
    "umlal      v3.8h, v2.8b, v26.8b              \n"  // acc += R * 33
    "sqrshrun   v0.8b, v3.8h, #7                  \n"  // round, >> 7, narrow.
    "uqadd      v0.8b, v0.8b, v27.8b              \n"  // + 16 with saturation.
    MEMACCESS(1)
    "st1        {v0.8b}, [%[dst]], #8             \n"  // emit 8 Y bytes.
    "b.gt       1b                                \n"  // repeat while count > 0.
  : [src]   "+r"(src_argb4444),  // %0
    [dst]   "+r"(dst_y),         // %1
    [count] "+r"(pix)            // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"
  );
}
#endif  // HAS_ARGB4444TOYROW_NEON
2233 2243
#ifdef HAS_BGRATOYROW_NEON
// Converts one row of 4-byte BGRA pixels to 8-bit luma.
//
//   src_bgra  source row; 32 bytes are read per loop iteration.
//   dst_y     destination row; 8 bytes are written per loop iteration.
//   pix       pixel count. The loop body runs at least once and handles
//             8 pixels per pass, so a positive multiple of 8 is expected
//             (presumably ensured by the caller - not visible here).
//
// ld4 splits each pixel's four bytes across v0..v3. Bytes 1, 2 and 3 are
// weighted as R, G and B respectively; byte 0 (v0) does not contribute.
// Per pixel: Y = sat_u8(round((33*R + 65*G + 13*B) / 128)) + 16, saturating.
void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
  asm volatile (
    // Broadcast the luma weights and the bias to every byte lane.
    "movi       v4.8b, #33                        \n"  // weight applied to R
    "movi       v5.8b, #65                        \n"  // weight applied to G
    "movi       v6.8b, #13                        \n"  // weight applied to B
    "movi       v7.8b, #16                        \n"  // bias added to Y
  "1:                                             \n"
    MEMACCESS(0)
    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%[src]], #32 \n"  // fetch 8 pixels.
    "subs       %w[count], %w[count], #8          \n"  // count -= 8, sets flags.
    "umull      v16.8h, v1.8b, v4.8b              \n"  // acc  = R * 33
    "umlal      v16.8h, v2.8b, v5.8b              \n"  // acc += G * 65
    "umlal      v16.8h, v3.8b, v6.8b              \n"  // acc += B * 13
    "sqrshrun   v0.8b, v16.8h, #7                 \n"  // round, >> 7, narrow.
    "uqadd      v0.8b, v0.8b, v7.8b               \n"  // + 16 with saturation.
    MEMACCESS(1)
    "st1        {v0.8b}, [%[dst]], #8             \n"  // emit 8 Y bytes.
    "b.gt       1b                                \n"  // repeat while count > 0.
  : [src]   "+r"(src_bgra),  // %0
    [dst]   "+r"(dst_y),     // %1
    [count] "+r"(pix)        // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}
#endif  // HAS_BGRATOYROW_NEON
2261 2271
#ifdef HAS_ABGRTOYROW_NEON
// Converts one row of 4-byte ABGR pixels to 8-bit luma.
//
//   src_abgr  source row; 32 bytes are read per loop iteration.
//   dst_y     destination row; 8 bytes are written per loop iteration.
//   pix       pixel count. The loop body runs at least once and handles
//             8 pixels per pass, so a positive multiple of 8 is expected
//             (presumably ensured by the caller - not visible here).
//
// ld4 splits each pixel's four bytes across v0..v3. Bytes 0, 1 and 2 are
// weighted as R, G and B respectively; byte 3 (v3) does not contribute.
// Per pixel: Y = sat_u8(round((33*R + 65*G + 13*B) / 128)) + 16, saturating.
void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
  asm volatile (
    // Broadcast the luma weights and the bias to every byte lane.
    "movi       v4.8b, #33                        \n"  // weight applied to R
    "movi       v5.8b, #65                        \n"  // weight applied to G
    "movi       v6.8b, #13                        \n"  // weight applied to B
    "movi       v7.8b, #16                        \n"  // bias added to Y
  "1:                                             \n"
    MEMACCESS(0)
    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%[src]], #32 \n"  // fetch 8 pixels.
    "subs       %w[count], %w[count], #8          \n"  // count -= 8, sets flags.
    "umull      v16.8h, v0.8b, v4.8b              \n"  // acc  = R * 33
    "umlal      v16.8h, v1.8b, v5.8b              \n"  // acc += G * 65
    "umlal      v16.8h, v2.8b, v6.8b              \n"  // acc += B * 13
    "sqrshrun   v0.8b, v16.8h, #7                 \n"  // round, >> 7, narrow.
    "uqadd      v0.8b, v0.8b, v7.8b               \n"  // + 16 with saturation.
    MEMACCESS(1)
    "st1        {v0.8b}, [%[dst]], #8             \n"  // emit 8 Y bytes.
    "b.gt       1b                                \n"  // repeat while count > 0.
  : [src]   "+r"(src_abgr),  // %0
    [dst]   "+r"(dst_y),     // %1
    [count] "+r"(pix)        // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}
#endif  // HAS_ABGRTOYROW_NEON
2289 2299
#ifdef HAS_RGBATOYROW_NEON
// Converts one row of 4-byte RGBA pixels to 8-bit luma.
//
//   src_rgba  source row; 32 bytes are read per loop iteration.
//   dst_y     destination row; 8 bytes are written per loop iteration.
//   pix       pixel count. The loop body runs at least once and handles
//             8 pixels per pass, so a positive multiple of 8 is expected
//             (presumably ensured by the caller - not visible here).
//
// ld4 splits each pixel's four bytes across v0..v3. Bytes 1, 2 and 3 are
// weighted as B, G and R respectively; byte 0 (v0) does not contribute.
// Per pixel: Y = sat_u8(round((13*B + 65*G + 33*R) / 128)) + 16, saturating.
void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
  asm volatile (
    // Broadcast the luma weights and the bias to every byte lane.
    "movi       v4.8b, #13                        \n"  // weight applied to B
    "movi       v5.8b, #65                        \n"  // weight applied to G
    "movi       v6.8b, #33                        \n"  // weight applied to R
    "movi       v7.8b, #16                        \n"  // bias added to Y
  "1:                                             \n"
    MEMACCESS(0)
    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%[src]], #32 \n"  // fetch 8 pixels.
    "subs       %w[count], %w[count], #8          \n"  // count -= 8, sets flags.
    "umull      v16.8h, v1.8b, v4.8b              \n"  // acc  = B * 13
    "umlal      v16.8h, v2.8b, v5.8b              \n"  // acc += G * 65
    "umlal      v16.8h, v3.8b, v6.8b              \n"  // acc += R * 33
    "sqrshrun   v0.8b, v16.8h, #7                 \n"  // round, >> 7, narrow.
    "uqadd      v0.8b, v0.8b, v7.8b               \n"  // + 16 with saturation.
    MEMACCESS(1)
    "st1        {v0.8b}, [%[dst]], #8             \n"  // emit 8 Y bytes.
    "b.gt       1b                                \n"  // repeat while count > 0.
  : [src]   "+r"(src_rgba),  // %0
    [dst]   "+r"(dst_y),     // %1
    [count] "+r"(pix)        // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}
#endif  // HAS_RGBATOYROW_NEON
2317 2327
#ifdef HAS_RGB24TOYROW_NEON
// Converts one row of 3-byte RGB24 pixels to 8-bit luma.
//
//   src_rgb24  source row; 24 bytes are read per loop iteration.
//   dst_y      destination row; 8 bytes are written per loop iteration.
//   pix        pixel count. The loop body runs at least once and handles
//              8 pixels per pass, so a positive multiple of 8 is expected
//              (presumably ensured by the caller - not visible here).
//
// ld3 splits each pixel's three bytes across v0..v2, which are weighted as
// B, G and R respectively.
// Per pixel: Y = sat_u8(round((13*B + 65*G + 33*R) / 128)) + 16, saturating.
void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
  asm volatile (
    // Broadcast the luma weights and the bias to every byte lane.
    "movi       v4.8b, #13                        \n"  // weight applied to B
    "movi       v5.8b, #65                        \n"  // weight applied to G
    "movi       v6.8b, #33                        \n"  // weight applied to R
    "movi       v7.8b, #16                        \n"  // bias added to Y
  "1:                                             \n"
    MEMACCESS(0)
    "ld3        {v0.8b,v1.8b,v2.8b}, [%[src]], #24 \n"  // fetch 8 pixels.
    "subs       %w[count], %w[count], #8          \n"  // count -= 8, sets flags.
    "umull      v16.8h, v0.8b, v4.8b              \n"  // acc  = B * 13
    "umlal      v16.8h, v1.8b, v5.8b              \n"  // acc += G * 65
    "umlal      v16.8h, v2.8b, v6.8b              \n"  // acc += R * 33
    "sqrshrun   v0.8b, v16.8h, #7                 \n"  // round, >> 7, narrow.
    "uqadd      v0.8b, v0.8b, v7.8b               \n"  // + 16 with saturation.
    MEMACCESS(1)
    "st1        {v0.8b}, [%[dst]], #8             \n"  // emit 8 Y bytes.
    "b.gt       1b                                \n"  // repeat while count > 0.
  : [src]   "+r"(src_rgb24),  // %0
    [dst]   "+r"(dst_y),      // %1
    [count] "+r"(pix)         // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}
#endif  // HAS_RGB24TOYROW_NEON
2345 2355
#ifdef HAS_RAWTOYROW_NEON
// Converts one row of 3-byte RAW pixels to 8-bit luma.
//
//   src_raw  source row; 24 bytes are read per loop iteration.
//   dst_y    destination row; 8 bytes are written per loop iteration.
//   pix      pixel count. The loop body runs at least once and handles
//            8 pixels per pass, so a positive multiple of 8 is expected
//            (presumably ensured by the caller - not visible here).
//
// ld3 splits each pixel's three bytes across v0..v2, which are weighted as
// R, G and B respectively (v0 is multiplied by the R weight, v2 by the B
// weight).
// Per pixel: Y = sat_u8(round((33*R + 65*G + 13*B) / 128)) + 16, saturating.
void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
  asm volatile (
    // Broadcast the luma weights and the bias to every byte lane.
    "movi       v4.8b, #33                        \n"  // weight applied to R
    "movi       v5.8b, #65                        \n"  // weight applied to G
    "movi       v6.8b, #13                        \n"  // weight applied to B
    "movi       v7.8b, #16                        \n"  // bias added to Y
  "1:                                             \n"
    MEMACCESS(0)
    "ld3        {v0.8b,v1.8b,v2.8b}, [%[src]], #24 \n"  // fetch 8 pixels.
    "subs       %w[count], %w[count], #8          \n"  // count -= 8, sets flags.
    "umull      v16.8h, v0.8b, v4.8b              \n"  // acc  = R * 33
    "umlal      v16.8h, v1.8b, v5.8b              \n"  // acc += G * 65
    "umlal      v16.8h, v2.8b, v6.8b              \n"  // acc += B * 13
    "sqrshrun   v0.8b, v16.8h, #7                 \n"  // round, >> 7, narrow.
    "uqadd      v0.8b, v0.8b, v7.8b               \n"  // + 16 with saturation.
    MEMACCESS(1)
    "st1        {v0.8b}, [%[dst]], #8             \n"  // emit 8 Y bytes.
    "b.gt       1b                                \n"  // repeat while count > 0.
  : [src]   "+r"(src_raw),  // %0
    [dst]   "+r"(dst_y),    // %1
    [count] "+r"(pix)       // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}
#endif  // HAS_RAWTOYROW_NEON
2373 2383
// Bilinear filter 16x2 -> 16x1
#ifdef HAS_INTERPOLATEROW_NEON
// Blends two vertically adjacent rows of bytes into one output row.
//
//   dst_ptr            output row; 16 bytes are written per loop iteration.
//   src_ptr            first input row.
//   src_stride         byte offset from src_ptr to the second input row.
//   dst_width          number of bytes to produce. Every path runs its loop
//                      body at least once and handles 16 bytes per pass, so
//                      a positive multiple of 16 is expected (presumably
//                      ensured by the caller - not visible here).
//   source_y_fraction  weight of the second row out of 256. Only the low
//                      8 bits are broadcast into the vector weights, so
//                      values in [0, 255] are expected.
//
// Dispatch on source_y_fraction:
//     0  -> copy the first row unchanged (label 100).
//    64  -> ~75% first row / 25% second row via two rounding averages.
//   128  -> 50 / 50 via one rounding average.
//   192  -> ~25% first row / 75% second row via two rounding averages.
//   else -> dst = (row0 * (256 - f) + row1 * f + 128) >> 8.
void InterpolateRow_NEON(uint8* dst_ptr,
                         const uint8* src_ptr, ptrdiff_t src_stride,
                         int dst_width, int source_y_fraction) {
  int y1_fraction = source_y_fraction;
  int y0_fraction = 256 - y1_fraction;
  const uint8* src_ptr1 = src_ptr + src_stride;
  asm volatile (
    "cmp        %w4, #0                        \n"
    "b.eq       100f                           \n"
    "cmp        %w4, #64                       \n"
    "b.eq       75f                            \n"
    "cmp        %w4, #128                      \n"
    "b.eq       50f                            \n"
    "cmp        %w4, #192                      \n"
    "b.eq       25f                            \n"

    "dup        v5.16b, %w4                    \n"  // weight of second row
    "dup        v4.16b, %w5                    \n"  // weight of first row
    // General purpose row blend.
  "1:                                          \n"
    MEMACCESS(1)
    "ld1        {v0.16b}, [%1], #16            \n"
    MEMACCESS(2)
    "ld1        {v1.16b}, [%2], #16            \n"
    "subs       %w3, %w3, #16                  \n"
    "umull      v2.8h, v0.8b,  v4.8b           \n"  // low 8 lanes, row0 * y0
    "umull2     v3.8h, v0.16b, v4.16b          \n"  // high 8 lanes
    "umlal      v2.8h, v1.8b,  v5.8b           \n"  // += row1 * y1
    "umlal2     v3.8h, v1.16b, v5.16b          \n"
    "rshrn      v0.8b,  v2.8h, #8              \n"  // rounding >> 8, narrow
    "rshrn2     v0.16b, v3.8h, #8              \n"
    MEMACCESS(0)
    "st1        {v0.16b}, [%0], #16            \n"
    "b.gt       1b                             \n"
    "b          99f                            \n"

    // Blend 25 / 75.
  "25:                                         \n"
    MEMACCESS(1)
    "ld1        {v0.16b}, [%1], #16            \n"
    MEMACCESS(2)
    "ld1        {v1.16b}, [%2], #16            \n"
    "subs       %w3, %w3, #16                  \n"
    "urhadd     v0.16b, v0.16b, v1.16b         \n"  // avg(row0, row1)
    "urhadd     v0.16b, v0.16b, v1.16b         \n"  // avg(that, row1)
    MEMACCESS(0)
    "st1        {v0.16b}, [%0], #16            \n"
    "b.gt       25b                            \n"
    "b          99f                            \n"

    // Blend 50 / 50.
  "50:                                         \n"
    MEMACCESS(1)
    "ld1        {v0.16b}, [%1], #16            \n"
    MEMACCESS(2)
    "ld1        {v1.16b}, [%2], #16            \n"
    "subs       %w3, %w3, #16                  \n"
    "urhadd     v0.16b, v0.16b, v1.16b         \n"
    MEMACCESS(0)
    "st1        {v0.16b}, [%0], #16            \n"
    "b.gt       50b                            \n"
    "b          99f                            \n"

    // Blend 75 / 25.
  "75:                                         \n"
    MEMACCESS(1)
    "ld1        {v1.16b}, [%1], #16            \n"  // row0 goes to v1 here
    MEMACCESS(2)
    "ld1        {v0.16b}, [%2], #16            \n"
    "subs       %w3, %w3, #16                  \n"
    "urhadd     v0.16b, v0.16b, v1.16b         \n"  // avg(row1, row0)
    "urhadd     v0.16b, v0.16b, v1.16b         \n"  // avg(that, row0)
    MEMACCESS(0)
    "st1        {v0.16b}, [%0], #16            \n"
    "b.gt       75b                            \n"
    "b          99f                            \n"

    // Blend 100 / 0 - Copy row unchanged.
  "100:                                        \n"
    MEMACCESS(1)
    "ld1        {v0.16b}, [%1], #16            \n"
    "subs       %w3, %w3, #16                  \n"
    MEMACCESS(0)
    "st1        {v0.16b}, [%0], #16            \n"
    "b.gt       100b                           \n"

  "99:                                         \n"
  : "+r"(dst_ptr),          // %0
    "+r"(src_ptr),          // %1
    "+r"(src_ptr1),         // %2
    "+r"(dst_width),        // %3
    "+r"(y1_fraction),      // %4
    "+r"(y0_fraction)       // %5
  :
  // v2 is written by the general blend path, so it must be declared here.
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"
  );
}
#endif  // HAS_INTERPOLATEROW_NEON
2474 2484
2475 // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr 2485 // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
2476 #ifdef HAS_ARGBBLENDROW_NEON 2486 #ifdef HAS_ARGBBLENDROW_NEON
2477 void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, 2487 void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
2478 uint8* dst_argb, int width) { 2488 uint8* dst_argb, int width) {
2479 asm volatile ( 2489 asm volatile (
2480 "subs %3, %3, #8 \n" 2490 "subs %w3, %w3, #8 \n"
2481 "b.lt 89f \n" 2491 "b.lt 89f \n"
2482 // Blend 8 pixels. 2492 // Blend 8 pixels.
2483 "8: \n" 2493 "8: \n"
2484 MEMACCESS(0) 2494 MEMACCESS(0)
2485 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels 2495 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels
2486 MEMACCESS(1) 2496 MEMACCESS(1)
2487 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 pixels 2497 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 pixels
2488 "subs %3, %3, #8 \n" // 8 processed per loop. 2498 "subs %w3, %w3, #8 \n" // 8 processed per loop.
2489 "umull v16.8h, v4.8b, v3.8b \n" // db * a 2499 "umull v16.8h, v4.8b, v3.8b \n" // db * a
2490 "umull v17.8h, v5.8b, v3.8b \n" // dg * a 2500 "umull v17.8h, v5.8b, v3.8b \n" // dg * a
2491 "umull v18.8h, v6.8b, v3.8b \n" // dr * a 2501 "umull v18.8h, v6.8b, v3.8b \n" // dr * a
2492 "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 2502 "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
2493 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 2503 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
2494 "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 2504 "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
2495 "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) 2505 "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
2496 "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) 2506 "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
2497 "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) 2507 "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
2498 "uqadd v0.8b, v0.8b, v4.8b \n" // + sb 2508 "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
2499 "uqadd v1.8b, v1.8b, v5.8b \n" // + sg 2509 "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
2500 "uqadd v2.8b, v2.8b, v6.8b \n" // + sr 2510 "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
2501 "movi v3.8b, #255 \n" // a = 255 2511 "movi v3.8b, #255 \n" // a = 255
2502 MEMACCESS(2) 2512 MEMACCESS(2)
2503 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels 2513 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
2504 "b.ge 8b \n" 2514 "b.ge 8b \n"
2505 2515
2506 "89: \n" 2516 "89: \n"
2507 "adds %3, %3, #8-1 \n" 2517 "adds %w3, %w3, #8-1 \n"
2508 "b.lt 99f \n" 2518 "b.lt 99f \n"
2509 2519
2510 // Blend 1 pixels. 2520 // Blend 1 pixels.
2511 "1: \n" 2521 "1: \n"
2512 MEMACCESS(0) 2522 MEMACCESS(0)
2513 "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0. 2523 "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
2514 MEMACCESS(1) 2524 MEMACCESS(1)
2515 "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1. 2525 "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
2516 "subs %3, %3, #1 \n" // 1 processed per loop. 2526 "subs %w3, %w3, #1 \n" // 1 processed per loop.
2517 "umull v16.8h, v4.8b, v3.8b \n" // db * a 2527 "umull v16.8h, v4.8b, v3.8b \n" // db * a
2518 "umull v17.8h, v5.8b, v3.8b \n" // dg * a 2528 "umull v17.8h, v5.8b, v3.8b \n" // dg * a
2519 "umull v18.8h, v6.8b, v3.8b \n" // dr * a 2529 "umull v18.8h, v6.8b, v3.8b \n" // dr * a
2520 "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 2530 "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
2521 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 2531 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
2522 "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 2532 "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
2523 "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) 2533 "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
2524 "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) 2534 "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
2525 "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) 2535 "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
2526 "uqadd v0.8b, v0.8b, v4.8b \n" // + sb 2536 "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
(...skipping 18 matching lines...) Expand all
2545 #endif // HAS_ARGBBLENDROW_NEON 2555 #endif // HAS_ARGBBLENDROW_NEON
2546 2556
2547 // Attenuate 8 pixels at a time. 2557 // Attenuate 8 pixels at a time.
2548 #ifdef HAS_ARGBATTENUATEROW_NEON 2558 #ifdef HAS_ARGBATTENUATEROW_NEON
2549 void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { 2559 void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
2550 asm volatile ( 2560 asm volatile (
2551 // Attenuate 8 pixels. 2561 // Attenuate 8 pixels.
2552 "1: \n" 2562 "1: \n"
2553 MEMACCESS(0) 2563 MEMACCESS(0)
2554 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels 2564 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels
2555 "subs %2, %2, #8 \n" // 8 processed per loop. 2565 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2556 "umull v4.8h, v0.8b, v3.8b \n" // b * a 2566 "umull v4.8h, v0.8b, v3.8b \n" // b * a
2557 "umull v5.8h, v1.8b, v3.8b \n" // g * a 2567 "umull v5.8h, v1.8b, v3.8b \n" // g * a
2558 "umull v6.8h, v2.8b, v3.8b \n" // r * a 2568 "umull v6.8h, v2.8b, v3.8b \n" // r * a
2559 "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 2569 "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8
2560 "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 2570 "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8
2561 "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 2571 "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
2562 MEMACCESS(1) 2572 MEMACCESS(1)
2563 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels 2573 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
2564 "b.gt 1b \n" 2574 "b.gt 1b \n"
2565 : "+r"(src_argb), // %0 2575 : "+r"(src_argb), // %0
(...skipping 13 matching lines...) Expand all
2579 asm volatile ( 2589 asm volatile (
2580 "dup v4.8h, %w2 \n" 2590 "dup v4.8h, %w2 \n"
2581 "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 2591 "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1
2582 "dup v5.8h, %w3 \n" // interval multiply. 2592 "dup v5.8h, %w3 \n" // interval multiply.
2583 "dup v6.8h, %w4 \n" // interval add 2593 "dup v6.8h, %w4 \n" // interval add
2584 2594
2585 // 8 pixel loop. 2595 // 8 pixel loop.
2586 "1: \n" 2596 "1: \n"
2587 MEMACCESS(0) 2597 MEMACCESS(0)
2588 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of ARGB. 2598 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of ARGB.
2589 "subs %1, %1, #8 \n" // 8 processed per loop. 2599 "subs %w1, %w1, #8 \n" // 8 processed per loop.
2590 "uxtl v0.8h, v0.8b \n" // b (0 .. 255) 2600 "uxtl v0.8h, v0.8b \n" // b (0 .. 255)
2591 "uxtl v1.8h, v1.8b \n" 2601 "uxtl v1.8h, v1.8b \n"
2592 "uxtl v2.8h, v2.8b \n" 2602 "uxtl v2.8h, v2.8b \n"
2593 "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale 2603 "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale
2594 "sqdmulh v1.8h, v1.8h, v4.8h \n" // g 2604 "sqdmulh v1.8h, v1.8h, v4.8h \n" // g
2595 "sqdmulh v2.8h, v2.8h, v4.8h \n" // r 2605 "sqdmulh v2.8h, v2.8h, v4.8h \n" // r
2596 "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size 2606 "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size
2597 "mul v1.8h, v1.8h, v5.8h \n" // g 2607 "mul v1.8h, v1.8h, v5.8h \n" // g
2598 "mul v2.8h, v2.8h, v5.8h \n" // r 2608 "mul v2.8h, v2.8h, v5.8h \n" // r
2599 "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset 2609 "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset
(...skipping 23 matching lines...) Expand all
2623 uint32 value) { 2633 uint32 value) {
2624 asm volatile ( 2634 asm volatile (
2625 "dup v0.4s, %w3 \n" // duplicate scale value. 2635 "dup v0.4s, %w3 \n" // duplicate scale value.
2626 "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb. 2636 "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb.
2627 "ushr v0.8h, v0.8h, #1 \n" // scale / 2. 2637 "ushr v0.8h, v0.8h, #1 \n" // scale / 2.
2628 2638
2629 // 8 pixel loop. 2639 // 8 pixel loop.
2630 "1: \n" 2640 "1: \n"
2631 MEMACCESS(0) 2641 MEMACCESS(0)
2632 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB pixels. 2642 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB pixels.
2633 "subs %2, %2, #8 \n" // 8 processed per loop. 2643 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2634 "uxtl v4.8h, v4.8b \n" // b (0 .. 255) 2644 "uxtl v4.8h, v4.8b \n" // b (0 .. 255)
2635 "uxtl v5.8h, v5.8b \n" 2645 "uxtl v5.8h, v5.8b \n"
2636 "uxtl v6.8h, v6.8b \n" 2646 "uxtl v6.8h, v6.8b \n"
2637 "uxtl v7.8h, v7.8b \n" 2647 "uxtl v7.8h, v7.8b \n"
2638 "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2 2648 "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2
2639 "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g 2649 "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g
2640 "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r 2650 "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r
2641 "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a 2651 "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a
2642 "uqxtn v4.8b, v4.8h \n" 2652 "uqxtn v4.8b, v4.8h \n"
2643 "uqxtn v5.8b, v5.8h \n" 2653 "uqxtn v5.8b, v5.8h \n"
(...skipping 16 matching lines...) Expand all
2660 // C code is (15 * b + 75 * g + 38 * r + 64) >> 7; 2670 // C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
2661 #ifdef HAS_ARGBGRAYROW_NEON 2671 #ifdef HAS_ARGBGRAYROW_NEON
2662 void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { 2672 void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
2663 asm volatile ( 2673 asm volatile (
2664 "movi v24.8b, #15 \n" // B * 0.11400 coefficient 2674 "movi v24.8b, #15 \n" // B * 0.11400 coefficient
2665 "movi v25.8b, #75 \n" // G * 0.58700 coefficient 2675 "movi v25.8b, #75 \n" // G * 0.58700 coefficient
2666 "movi v26.8b, #38 \n" // R * 0.29900 coefficient 2676 "movi v26.8b, #38 \n" // R * 0.29900 coefficient
2667 "1: \n" 2677 "1: \n"
2668 MEMACCESS(0) 2678 MEMACCESS(0)
2669 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 2679 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
2670 "subs %2, %2, #8 \n" // 8 processed per loop. 2680 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2671 "umull v4.8h, v0.8b, v24.8b \n" // B 2681 "umull v4.8h, v0.8b, v24.8b \n" // B
2672 "umlal v4.8h, v1.8b, v25.8b \n" // G 2682 "umlal v4.8h, v1.8b, v25.8b \n" // G
2673 "umlal v4.8h, v2.8b, v26.8b \n" // R 2683 "umlal v4.8h, v2.8b, v26.8b \n" // R
2674 "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B 2684 "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B
2675 "orr v1.8b, v0.8b, v0.8b \n" // G 2685 "orr v1.8b, v0.8b, v0.8b \n" // G
2676 "orr v2.8b, v0.8b, v0.8b \n" // R 2686 "orr v2.8b, v0.8b, v0.8b \n" // R
2677 MEMACCESS(1) 2687 MEMACCESS(1)
2678 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels. 2688 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
2679 "b.gt 1b \n" 2689 "b.gt 1b \n"
2680 : "+r"(src_argb), // %0 2690 : "+r"(src_argb), // %0
(...skipping 18 matching lines...) Expand all
2699 "movi v22.8b, #35 \n" // BR coefficient 2709 "movi v22.8b, #35 \n" // BR coefficient
2700 "movi v24.8b, #22 \n" // GB coefficient 2710 "movi v24.8b, #22 \n" // GB coefficient
2701 "movi v25.8b, #88 \n" // GG coefficient 2711 "movi v25.8b, #88 \n" // GG coefficient
2702 "movi v26.8b, #45 \n" // GR coefficient 2712 "movi v26.8b, #45 \n" // GR coefficient
2703 "movi v28.8b, #24 \n" // BB coefficient 2713 "movi v28.8b, #24 \n" // BB coefficient
2704 "movi v29.8b, #98 \n" // BG coefficient 2714 "movi v29.8b, #98 \n" // BG coefficient
2705 "movi v30.8b, #50 \n" // BR coefficient 2715 "movi v30.8b, #50 \n" // BR coefficient
2706 "1: \n" 2716 "1: \n"
2707 MEMACCESS(0) 2717 MEMACCESS(0)
2708 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. 2718 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
2709 "subs %1, %1, #8 \n" // 8 processed per loop. 2719 "subs %w1, %w1, #8 \n" // 8 processed per loop.
2710 "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B 2720 "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
2711 "umlal v4.8h, v1.8b, v21.8b \n" // G 2721 "umlal v4.8h, v1.8b, v21.8b \n" // G
2712 "umlal v4.8h, v2.8b, v22.8b \n" // R 2722 "umlal v4.8h, v2.8b, v22.8b \n" // R
2713 "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G 2723 "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
2714 "umlal v5.8h, v1.8b, v25.8b \n" // G 2724 "umlal v5.8h, v1.8b, v25.8b \n" // G
2715 "umlal v5.8h, v2.8b, v26.8b \n" // R 2725 "umlal v5.8h, v2.8b, v26.8b \n" // R
2716 "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R 2726 "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
2717 "umlal v6.8h, v1.8b, v29.8b \n" // G 2727 "umlal v6.8h, v1.8b, v29.8b \n" // G
2718 "umlal v6.8h, v2.8b, v30.8b \n" // R 2728 "umlal v6.8h, v2.8b, v30.8b \n" // R
2719 "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B 2729 "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B
(...skipping 19 matching lines...) Expand all
2739 const int8* matrix_argb, int width) { 2749 const int8* matrix_argb, int width) {
2740 asm volatile ( 2750 asm volatile (
2741 MEMACCESS(3) 2751 MEMACCESS(3)
2742 "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors. 2752 "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors.
2743 "sxtl v0.8h, v2.8b \n" // B,G coefficients s16. 2753 "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
2744 "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. 2754 "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
2745 2755
2746 "1: \n" 2756 "1: \n"
2747 MEMACCESS(0) 2757 MEMACCESS(0)
2748 "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 pixels. 2758 "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 pixels.
2749 "subs %2, %2, #8 \n" // 8 processed per loop. 2759 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2750 "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit 2760 "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
2751 "uxtl v17.8h, v17.8b \n" // g 2761 "uxtl v17.8h, v17.8b \n" // g
2752 "uxtl v18.8h, v18.8b \n" // r 2762 "uxtl v18.8h, v18.8b \n" // r
2753 "uxtl v19.8h, v19.8b \n" // a 2763 "uxtl v19.8h, v19.8b \n" // a
2754 "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B 2764 "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B
2755 "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G 2765 "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G
2756 "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R 2766 "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R
2757 "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A 2767 "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A
2758 "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B 2768 "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B
2759 "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G 2769 "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after
2801 #ifdef HAS_ARGBMULTIPLYROW_NEON 2811 #ifdef HAS_ARGBMULTIPLYROW_NEON
2802 void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, 2812 void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
2803 uint8* dst_argb, int width) { 2813 uint8* dst_argb, int width) {
2804 asm volatile ( 2814 asm volatile (
2805 // 8 pixel loop. 2815 // 8 pixel loop.
2806 "1: \n" 2816 "1: \n"
2807 MEMACCESS(0) 2817 MEMACCESS(0)
2808 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 2818 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
2809 MEMACCESS(1) 2819 MEMACCESS(1)
2810 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. 2820 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
2811 "subs %3, %3, #8 \n" // 8 processed per loop. 2821 "subs %w3, %w3, #8 \n" // 8 processed per loop.
2812 "umull v0.8h, v0.8b, v4.8b \n" // multiply B 2822 "umull v0.8h, v0.8b, v4.8b \n" // multiply B
2813 "umull v1.8h, v1.8b, v5.8b \n" // multiply G 2823 "umull v1.8h, v1.8b, v5.8b \n" // multiply G
2814 "umull v2.8h, v2.8b, v6.8b \n" // multiply R 2824 "umull v2.8h, v2.8b, v6.8b \n" // multiply R
2815 "umull v3.8h, v3.8b, v7.8b \n" // multiply A 2825 "umull v3.8h, v3.8b, v7.8b \n" // multiply A
2816 "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B 2826 "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B
2817 "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G 2827 "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G
2818 "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R 2828 "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
2819 "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A 2829 "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
2820 MEMACCESS(2) 2830 MEMACCESS(2)
2821 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels 2831 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
(...skipping 13 matching lines...) Expand all
2835 #ifdef HAS_ARGBADDROW_NEON 2845 #ifdef HAS_ARGBADDROW_NEON
2836 void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, 2846 void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
2837 uint8* dst_argb, int width) { 2847 uint8* dst_argb, int width) {
2838 asm volatile ( 2848 asm volatile (
2839 // 8 pixel loop. 2849 // 8 pixel loop.
2840 "1: \n" 2850 "1: \n"
2841 MEMACCESS(0) 2851 MEMACCESS(0)
2842 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 2852 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
2843 MEMACCESS(1) 2853 MEMACCESS(1)
2844 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. 2854 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
2845 "subs %3, %3, #8 \n" // 8 processed per loop. 2855 "subs %w3, %w3, #8 \n" // 8 processed per loop.
2846 "uqadd v0.8b, v0.8b, v4.8b \n" 2856 "uqadd v0.8b, v0.8b, v4.8b \n"
2847 "uqadd v1.8b, v1.8b, v5.8b \n" 2857 "uqadd v1.8b, v1.8b, v5.8b \n"
2848 "uqadd v2.8b, v2.8b, v6.8b \n" 2858 "uqadd v2.8b, v2.8b, v6.8b \n"
2849 "uqadd v3.8b, v3.8b, v7.8b \n" 2859 "uqadd v3.8b, v3.8b, v7.8b \n"
2850 MEMACCESS(2) 2860 MEMACCESS(2)
2851 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels 2861 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
2852 "b.gt 1b \n" 2862 "b.gt 1b \n"
2853 2863
2854 : "+r"(src_argb0), // %0 2864 : "+r"(src_argb0), // %0
2855 "+r"(src_argb1), // %1 2865 "+r"(src_argb1), // %1
2856 "+r"(dst_argb), // %2 2866 "+r"(dst_argb), // %2
2857 "+r"(width) // %3 2867 "+r"(width) // %3
2858 : 2868 :
2859 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" 2869 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
2860 ); 2870 );
2861 } 2871 }
2862 #endif // HAS_ARGBADDROW_NEON 2872 #endif // HAS_ARGBADDROW_NEON
2863 2873
2864 // Subtract 2 rows of ARGB pixels, 8 pixels at a time. 2874 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
2865 #ifdef HAS_ARGBSUBTRACTROW_NEON 2875 #ifdef HAS_ARGBSUBTRACTROW_NEON
2866 void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, 2876 void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
2867 uint8* dst_argb, int width) { 2877 uint8* dst_argb, int width) {
2868 asm volatile ( 2878 asm volatile (
2869 // 8 pixel loop. 2879 // 8 pixel loop.
2870 "1: \n" 2880 "1: \n"
2871 MEMACCESS(0) 2881 MEMACCESS(0)
2872 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 2882 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
2873 MEMACCESS(1) 2883 MEMACCESS(1)
2874 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. 2884 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
2875 "subs %3, %3, #8 \n" // 8 processed per loop. 2885 "subs %w3, %w3, #8 \n" // 8 processed per loop.
2876 "uqsub v0.8b, v0.8b, v4.8b \n" 2886 "uqsub v0.8b, v0.8b, v4.8b \n"
2877 "uqsub v1.8b, v1.8b, v5.8b \n" 2887 "uqsub v1.8b, v1.8b, v5.8b \n"
2878 "uqsub v2.8b, v2.8b, v6.8b \n" 2888 "uqsub v2.8b, v2.8b, v6.8b \n"
2879 "uqsub v3.8b, v3.8b, v7.8b \n" 2889 "uqsub v3.8b, v3.8b, v7.8b \n"
2880 MEMACCESS(2) 2890 MEMACCESS(2)
2881 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels 2891 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
2882 "b.gt 1b \n" 2892 "b.gt 1b \n"
2883 2893
2884 : "+r"(src_argb0), // %0 2894 : "+r"(src_argb0), // %0
2885 "+r"(src_argb1), // %1 2895 "+r"(src_argb1), // %1
(...skipping 14 matching lines...) Expand all
2900 void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, 2910 void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
2901 uint8* dst_argb, int width) { 2911 uint8* dst_argb, int width) {
2902 asm volatile ( 2912 asm volatile (
2903 "movi v3.8b, #255 \n" // alpha 2913 "movi v3.8b, #255 \n" // alpha
2904 // 8 pixel loop. 2914 // 8 pixel loop.
2905 "1: \n" 2915 "1: \n"
2906 MEMACCESS(0) 2916 MEMACCESS(0)
2907 "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. 2917 "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
2908 MEMACCESS(1) 2918 MEMACCESS(1)
2909 "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. 2919 "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
2910 "subs %3, %3, #8 \n" // 8 processed per loop. 2920 "subs %w3, %w3, #8 \n" // 8 processed per loop.
2911 "uqadd v0.8b, v0.8b, v1.8b \n" // add 2921 "uqadd v0.8b, v0.8b, v1.8b \n" // add
2912 "orr v1.8b, v0.8b, v0.8b \n" 2922 "orr v1.8b, v0.8b, v0.8b \n"
2913 "orr v2.8b, v0.8b, v0.8b \n" 2923 "orr v2.8b, v0.8b, v0.8b \n"
2914 MEMACCESS(2) 2924 MEMACCESS(2)
2915 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels 2925 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
2916 "b.gt 1b \n" 2926 "b.gt 1b \n"
2917 : "+r"(src_sobelx), // %0 2927 : "+r"(src_sobelx), // %0
2918 "+r"(src_sobely), // %1 2928 "+r"(src_sobely), // %1
2919 "+r"(dst_argb), // %2 2929 "+r"(dst_argb), // %2
2920 "+r"(width) // %3 2930 "+r"(width) // %3
2921 : 2931 :
2922 : "cc", "memory", "v0", "v1", "v2", "v3" 2932 : "cc", "memory", "v0", "v1", "v2", "v3"
2923 ); 2933 );
2924 } 2934 }
2925 #endif // HAS_SOBELROW_NEON 2935 #endif // HAS_SOBELROW_NEON
2926 2936
2927 // Adds Sobel X and Sobel Y and stores Sobel into plane. 2937 // Adds Sobel X and Sobel Y and stores Sobel into plane.
2928 #ifdef HAS_SOBELTOPLANEROW_NEON 2938 #ifdef HAS_SOBELTOPLANEROW_NEON
2929 void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, 2939 void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
2930 uint8* dst_y, int width) { 2940 uint8* dst_y, int width) {
2931 asm volatile ( 2941 asm volatile (
2932 // 16 pixel loop. 2942 // 16 pixel loop.
2933 "1: \n" 2943 "1: \n"
2934 MEMACCESS(0) 2944 MEMACCESS(0)
2935 "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. 2945 "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
2936 MEMACCESS(1) 2946 MEMACCESS(1)
2937 "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. 2947 "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
2938 "subs %3, %3, #16 \n" // 16 processed per loop. 2948 "subs %w3, %w3, #16 \n" // 16 processed per loop.
2939 "uqadd v0.16b, v0.16b, v1.16b \n" // add 2949 "uqadd v0.16b, v0.16b, v1.16b \n" // add
2940 MEMACCESS(2) 2950 MEMACCESS(2)
2941 "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. 2951 "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
2942 "b.gt 1b \n" 2952 "b.gt 1b \n"
2943 : "+r"(src_sobelx), // %0 2953 : "+r"(src_sobelx), // %0
2944 "+r"(src_sobely), // %1 2954 "+r"(src_sobely), // %1
2945 "+r"(dst_y), // %2 2955 "+r"(dst_y), // %2
2946 "+r"(width) // %3 2956 "+r"(width) // %3
2947 : 2957 :
2948 : "cc", "memory", "v0", "v1" 2958 : "cc", "memory", "v0", "v1"
(...skipping 10 matching lines...) Expand all
2959 void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, 2969 void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
2960 uint8* dst_argb, int width) { 2970 uint8* dst_argb, int width) {
2961 asm volatile ( 2971 asm volatile (
2962 "movi v3.8b, #255 \n" // alpha 2972 "movi v3.8b, #255 \n" // alpha
2963 // 8 pixel loop. 2973 // 8 pixel loop.
2964 "1: \n" 2974 "1: \n"
2965 MEMACCESS(0) 2975 MEMACCESS(0)
2966 "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. 2976 "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
2967 MEMACCESS(1) 2977 MEMACCESS(1)
2968 "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. 2978 "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
2969 "subs %3, %3, #8 \n" // 8 processed per loop. 2979 "subs %w3, %w3, #8 \n" // 8 processed per loop.
2970 "uqadd v1.8b, v0.8b, v2.8b \n" // add 2980 "uqadd v1.8b, v0.8b, v2.8b \n" // add
2971 MEMACCESS(2) 2981 MEMACCESS(2)
2972 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels 2982 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
2973 "b.gt 1b \n" 2983 "b.gt 1b \n"
2974 : "+r"(src_sobelx), // %0 2984 : "+r"(src_sobelx), // %0
2975 "+r"(src_sobely), // %1 2985 "+r"(src_sobely), // %1
2976 "+r"(dst_argb), // %2 2986 "+r"(dst_argb), // %2
2977 "+r"(width) // %3 2987 "+r"(width) // %3
2978 : 2988 :
2979 : "cc", "memory", "v0", "v1", "v2", "v3" 2989 : "cc", "memory", "v0", "v1", "v2", "v3"
(...skipping 19 matching lines...) Expand all
2999 "ld1 {v2.8b}, [%1],%5 \n" // center * 2 3009 "ld1 {v2.8b}, [%1],%5 \n" // center * 2
3000 MEMACCESS(1) 3010 MEMACCESS(1)
3001 "ld1 {v3.8b}, [%1],%6 \n" 3011 "ld1 {v3.8b}, [%1],%6 \n"
3002 "usubl v1.8h, v2.8b, v3.8b \n" 3012 "usubl v1.8h, v2.8b, v3.8b \n"
3003 "add v0.8h, v0.8h, v1.8h \n" 3013 "add v0.8h, v0.8h, v1.8h \n"
3004 "add v0.8h, v0.8h, v1.8h \n" 3014 "add v0.8h, v0.8h, v1.8h \n"
3005 MEMACCESS(2) 3015 MEMACCESS(2)
3006 "ld1 {v2.8b}, [%2],%5 \n" // bottom 3016 "ld1 {v2.8b}, [%2],%5 \n" // bottom
3007 MEMACCESS(2) 3017 MEMACCESS(2)
3008 "ld1 {v3.8b}, [%2],%6 \n" 3018 "ld1 {v3.8b}, [%2],%6 \n"
3009 "subs %4, %4, #8 \n" // 8 pixels 3019 "subs %w4, %w4, #8 \n" // 8 pixels
3010 "usubl v1.8h, v2.8b, v3.8b \n" 3020 "usubl v1.8h, v2.8b, v3.8b \n"
3011 "add v0.8h, v0.8h, v1.8h \n" 3021 "add v0.8h, v0.8h, v1.8h \n"
3012 "abs v0.8h, v0.8h \n" 3022 "abs v0.8h, v0.8h \n"
3013 "uqxtn v0.8b, v0.8h \n" 3023 "uqxtn v0.8b, v0.8h \n"
3014 MEMACCESS(3) 3024 MEMACCESS(3)
3015 "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx 3025 "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
3016 "b.gt 1b \n" 3026 "b.gt 1b \n"
3017 : "+r"(src_y0), // %0 3027 : "+r"(src_y0), // %0
3018 "+r"(src_y1), // %1 3028 "+r"(src_y1), // %1
3019 "+r"(src_y2), // %2 3029 "+r"(src_y2), // %2
3020 "+r"(dst_sobelx), // %3 3030 "+r"(dst_sobelx), // %3
3021 "+r"(width) // %4 3031 "+r"(width) // %4
3022 : "r"(2), // %5 3032 : "r"(2LL), // %5
3023 "r"(6) // %6 3033 "r"(6LL) // %6
3024 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 3034 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
3025 ); 3035 );
3026 } 3036 }
3027 #endif // HAS_SOBELXROW_NEON 3037 #endif // HAS_SOBELXROW_NEON
3028 3038
3029 // SobelY as a matrix is 3039 // SobelY as a matrix is
3030 // -1 -2 -1 3040 // -1 -2 -1
3031 // 0 0 0 3041 // 0 0 0
3032 // 1 2 1 3042 // 1 2 1
3033 #ifdef HAS_SOBELYROW_NEON 3043 #ifdef HAS_SOBELYROW_NEON
(...skipping 10 matching lines...) Expand all
3044 "ld1 {v2.8b}, [%0],%4 \n" // center * 2 3054 "ld1 {v2.8b}, [%0],%4 \n" // center * 2
3045 MEMACCESS(1) 3055 MEMACCESS(1)
3046 "ld1 {v3.8b}, [%1],%4 \n" 3056 "ld1 {v3.8b}, [%1],%4 \n"
3047 "usubl v1.8h, v2.8b, v3.8b \n" 3057 "usubl v1.8h, v2.8b, v3.8b \n"
3048 "add v0.8h, v0.8h, v1.8h \n" 3058 "add v0.8h, v0.8h, v1.8h \n"
3049 "add v0.8h, v0.8h, v1.8h \n" 3059 "add v0.8h, v0.8h, v1.8h \n"
3050 MEMACCESS(0) 3060 MEMACCESS(0)
3051 "ld1 {v2.8b}, [%0],%5 \n" // right 3061 "ld1 {v2.8b}, [%0],%5 \n" // right
3052 MEMACCESS(1) 3062 MEMACCESS(1)
3053 "ld1 {v3.8b}, [%1],%5 \n" 3063 "ld1 {v3.8b}, [%1],%5 \n"
3054 "subs %3, %3, #8 \n" // 8 pixels 3064 "subs %w3, %w3, #8 \n" // 8 pixels
3055 "usubl v1.8h, v2.8b, v3.8b \n" 3065 "usubl v1.8h, v2.8b, v3.8b \n"
3056 "add v0.8h, v0.8h, v1.8h \n" 3066 "add v0.8h, v0.8h, v1.8h \n"
3057 "abs v0.8h, v0.8h \n" 3067 "abs v0.8h, v0.8h \n"
3058 "uqxtn v0.8b, v0.8h \n" 3068 "uqxtn v0.8b, v0.8h \n"
3059 MEMACCESS(2) 3069 MEMACCESS(2)
3060 "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely 3070 "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
3061 "b.gt 1b \n" 3071 "b.gt 1b \n"
3062 : "+r"(src_y0), // %0 3072 : "+r"(src_y0), // %0
3063 "+r"(src_y1), // %1 3073 "+r"(src_y1), // %1
3064 "+r"(dst_sobely), // %2 3074 "+r"(dst_sobely), // %2
3065 "+r"(width) // %3 3075 "+r"(width) // %3
3066 : "r"(1), // %4 3076 : "r"(1LL), // %4
3067 "r"(6) // %5 3077 "r"(6LL) // %5
3068 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 3078 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
3069 ); 3079 );
3070 } 3080 }
3071 #endif // HAS_SOBELYROW_NEON 3081 #endif // HAS_SOBELYROW_NEON
3072 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) 3082 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
3073 3083
3074 #ifdef __cplusplus 3084 #ifdef __cplusplus
3075 } // extern "C" 3085 } // extern "C"
3076 } // namespace libyuv 3086 } // namespace libyuv
3077 #endif 3087 #endif
OLDNEW
« no previous file with comments | « source/libvpx/third_party/libyuv/source/row_neon.cc ('k') | source/libvpx/third_party/libyuv/source/row_posix.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698