OLD | NEW |
1 ; | 1 ; |
2 ; Copyright (c) 2014 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
3 ; | 3 ; |
4 ; Use of this source code is governed by a BSD-style license | 4 ; Use of this source code is governed by a BSD-style license |
5 ; that can be found in the LICENSE file in the root of the source | 5 ; that can be found in the LICENSE file in the root of the source |
6 ; tree. An additional intellectual property rights grant can be found | 6 ; tree. An additional intellectual property rights grant can be found |
7 ; in the file PATENTS. All contributing project authors may | 7 ; in the file PATENTS. All contributing project authors may |
8 ; be found in the AUTHORS file in the root of the source tree. | 8 ; be found in the AUTHORS file in the root of the source tree. |
9 ; | 9 ; |
10 | 10 |
(...skipping 327 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
338 ; const uint8_t *above, | 338 ; const uint8_t *above, |
339 ; const uint8_t *left) | 339 ; const uint8_t *left) |
340 ; r0 uint8_t *dst | 340 ; r0 uint8_t *dst |
341 ; r1 ptrdiff_t y_stride | 341 ; r1 ptrdiff_t y_stride |
342 ; r2 const uint8_t *above | 342 ; r2 const uint8_t *above |
343 ; r3 const uint8_t *left | 343 ; r3 const uint8_t *left |
344 | 344 |
345 |vp9_tm_predictor_8x8_neon| PROC | 345 |vp9_tm_predictor_8x8_neon| PROC |
346 ; Load ytop_left = above[-1]; | 346 ; Load ytop_left = above[-1]; |
347 sub r12, r2, #1 | 347 sub r12, r2, #1 |
348 ldrb r12, [r12] | 348 vld1.8 {d0[]}, [r12] |
349 vdup.u8 d0, r12 | |
350 | 349 |
351 ; preload 8 left | 350 ; preload 8 left |
352 vld1.8 {d30}, [r3] | 351 vld1.8 {d30}, [r3] |
353 | 352 |
354 ; Load above 8 pixels | 353 ; Load above 8 pixels |
355 vld1.64 {d2}, [r2] | 354 vld1.64 {d2}, [r2] |
356 | 355 |
357 vmovl.u8 q10, d30 | 356 vmovl.u8 q10, d30 |
358 | 357 |
359 ; Compute above - ytop_left | 358 ; Compute above - ytop_left |
(...skipping 51 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
411 ; const uint8_t *above, | 410 ; const uint8_t *above, |
412 ; const uint8_t *left) | 411 ; const uint8_t *left) |
413 ; r0 uint8_t *dst | 412 ; r0 uint8_t *dst |
414 ; r1 ptrdiff_t y_stride | 413 ; r1 ptrdiff_t y_stride |
415 ; r2 const uint8_t *above | 414 ; r2 const uint8_t *above |
416 ; r3 const uint8_t *left | 415 ; r3 const uint8_t *left |
417 | 416 |
418 |vp9_tm_predictor_16x16_neon| PROC | 417 |vp9_tm_predictor_16x16_neon| PROC |
419 ; Load ytop_left = above[-1]; | 418 ; Load ytop_left = above[-1]; |
420 sub r12, r2, #1 | 419 sub r12, r2, #1 |
421 ldrb r12, [r12] | 420 vld1.8 {d0[]}, [r12] |
422 vdup.u8 q0, r12 | |
423 | 421 |
424 ; Load above 16 pixels | 422 ; Load above 16 pixels |
425 vld1.8 {q1}, [r2] | 423 vld1.8 {q1}, [r2] |
426 | 424 |
427 ; preload 8 left pixels into d18 | 425 ; preload 8 left pixels into d18 |
428 vld1.8 {d18}, [r3]! | 426 vld1.8 {d18}, [r3]! |
429 | 427 |
430 ; Compute above - ytop_left | 428 ; Compute above - ytop_left |
431 vsubl.u8 q2, d2, d0 | 429 vsubl.u8 q2, d2, d0 |
432 vsubl.u8 q3, d3, d1 | 430 vsubl.u8 q3, d3, d0 |
433 | 431 |
434 vmovl.u8 q10, d18 | 432 vmovl.u8 q10, d18 |
435 | 433 |
436 ; Load left row by row and compute left + (above - ytop_left) | 434 ; Load left row by row and compute left + (above - ytop_left) |
437 ; Process 8 rows in each single loop and loop 2 times to process 16 rows. | 435 ; Process 8 rows in each single loop and loop 2 times to process 16 rows. |
438 mov r2, #2 | 436 mov r2, #2 |
439 | 437 |
440 loop_16x16_neon | 438 loop_16x16_neon |
441 ; Process two rows. | 439 ; Process two rows. |
442 vdup.16 q0, d20[0] | 440 vdup.16 q0, d20[0] |
(...skipping 62 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
505 ; const uint8_t *above, | 503 ; const uint8_t *above, |
506 ; const uint8_t *left) | 504 ; const uint8_t *left) |
507 ; r0 uint8_t *dst | 505 ; r0 uint8_t *dst |
508 ; r1 ptrdiff_t y_stride | 506 ; r1 ptrdiff_t y_stride |
509 ; r2 const uint8_t *above | 507 ; r2 const uint8_t *above |
510 ; r3 const uint8_t *left | 508 ; r3 const uint8_t *left |
511 | 509 |
512 |vp9_tm_predictor_32x32_neon| PROC | 510 |vp9_tm_predictor_32x32_neon| PROC |
513 ; Load ytop_left = above[-1]; | 511 ; Load ytop_left = above[-1]; |
514 sub r12, r2, #1 | 512 sub r12, r2, #1 |
515 ldrb r12, [r12] | 513 vld1.8 {d0[]}, [r12] |
516 vdup.u8 q0, r12 | |
517 | 514 |
518 ; Load above 32 pixels | 515 ; Load above 32 pixels |
519 vld1.8 {q1}, [r2]! | 516 vld1.8 {q1}, [r2]! |
520 vld1.8 {q2}, [r2] | 517 vld1.8 {q2}, [r2] |
521 | 518 |
522 ; preload 8 left pixels | 519 ; preload 8 left pixels |
523 vld1.8 {d26}, [r3]! | 520 vld1.8 {d26}, [r3]! |
524 | 521 |
525 ; Compute above - ytop_left | 522 ; Compute above - ytop_left |
526 vsubl.u8 q8, d2, d0 | 523 vsubl.u8 q8, d2, d0 |
527 vsubl.u8 q9, d3, d1 | 524 vsubl.u8 q9, d3, d0 |
528 vsubl.u8 q10, d4, d0 | 525 vsubl.u8 q10, d4, d0 |
529 vsubl.u8 q11, d5, d1 | 526 vsubl.u8 q11, d5, d0 |
530 | 527 |
531 vmovl.u8 q3, d26 | 528 vmovl.u8 q3, d26 |
532 | 529 |
533 ; Load left row by row and compute left + (above - ytop_left) | 530 ; Load left row by row and compute left + (above - ytop_left) |
534 ; Process 8 rows in each single loop and loop 4 times to process 32 rows. | 531 ; Process 8 rows in each single loop and loop 4 times to process 32 rows. |
535 mov r2, #4 | 532 mov r2, #4 |
536 | 533 |
537 loop_32x32_neon | 534 loop_32x32_neon |
538 ; Process two rows. | 535 ; Process two rows. |
539 vdup.16 q0, d6[0] | 536 vdup.16 q0, d6[0] |
(...skipping 85 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
625 vmovl.u8 q3, d0 | 622 vmovl.u8 q3, d0 |
626 vst1.64 {d24-d27}, [r0], r1 | 623 vst1.64 {d24-d27}, [r0], r1 |
627 | 624 |
628 subs r2, r2, #1 | 625 subs r2, r2, #1 |
629 bgt loop_32x32_neon | 626 bgt loop_32x32_neon |
630 | 627 |
631 bx lr | 628 bx lr |
632 ENDP ; |vp9_tm_predictor_32x32_neon| | 629 ENDP ; |vp9_tm_predictor_32x32_neon| |
633 | 630 |
634 END | 631 END |
OLD | NEW |