OLD | NEW |
1 ; | 1 ; |
2 ; Copyright (c) 2014 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
3 ; | 3 ; |
4 ; Use of this source code is governed by a BSD-style license | 4 ; Use of this source code is governed by a BSD-style license |
5 ; that can be found in the LICENSE file in the root of the source | 5 ; that can be found in the LICENSE file in the root of the source |
6 ; tree. An additional intellectual property rights grant can be found | 6 ; tree. An additional intellectual property rights grant can be found |
7 ; in the file PATENTS. All contributing project authors may | 7 ; in the file PATENTS. All contributing project authors may |
8 ; be found in the AUTHORS file in the root of the source tree. | 8 ; be found in the AUTHORS file in the root of the source tree. |
9 ; | 9 ; |
10 | 10 |
(...skipping 331 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
342 ; r2 const uint8_t *above | 342 ; r2 const uint8_t *above |
343 ; r3 const uint8_t *left | 343 ; r3 const uint8_t *left |
344 | 344 |
345 |vp9_tm_predictor_8x8_neon| PROC | 345 |vp9_tm_predictor_8x8_neon| PROC |
346 ; Load ytop_left = above[-1]; | 346 ; Load ytop_left = above[-1]; |
347 sub r12, r2, #1 | 347 sub r12, r2, #1 |
348 ldrb r12, [r12] | 348 ldrb r12, [r12] |
349 vdup.u8 d0, r12 | 349 vdup.u8 d0, r12 |
350 | 350 |
351 ; preload 8 left | 351 ; preload 8 left |
352 vld1.8 d30, [r3] | 352 vld1.8 {d30}, [r3] |
353 | 353 |
354 ; Load above 8 pixels | 354 ; Load above 8 pixels |
355 vld1.64 {d2}, [r2] | 355 vld1.64 {d2}, [r2] |
356 | 356 |
357 vmovl.u8 q10, d30 | 357 vmovl.u8 q10, d30 |
358 | 358 |
359 ; Compute above - ytop_left | 359 ; Compute above - ytop_left |
360 vsubl.u8 q3, d2, d0 | 360 vsubl.u8 q3, d2, d0 |
361 | 361 |
362 ; Load left row by row and compute left + (above - ytop_left) | 362 ; Load left row by row and compute left + (above - ytop_left) |
(...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
415 ; r2 const uint8_t *above | 415 ; r2 const uint8_t *above |
416 ; r3 const uint8_t *left | 416 ; r3 const uint8_t *left |
417 | 417 |
418 |vp9_tm_predictor_16x16_neon| PROC | 418 |vp9_tm_predictor_16x16_neon| PROC |
419 ; Load ytop_left = above[-1]; | 419 ; Load ytop_left = above[-1]; |
420 sub r12, r2, #1 | 420 sub r12, r2, #1 |
421 ldrb r12, [r12] | 421 ldrb r12, [r12] |
422 vdup.u8 q0, r12 | 422 vdup.u8 q0, r12 |
423 | 423 |
424 ; Load above 16 pixels | 424 ; Load above 16 pixels |
425 vld1.8 q1, [r2] | 425 vld1.8 {q1}, [r2] |
426 | 426 |
427 ; preload 8 left pixels into d18 | 427 ; preload 8 left pixels into d18 |
428 vld1.8 d18, [r3]! | 428 vld1.8 {d18}, [r3]! |
429 | 429 |
430 ; Compute above - ytop_left | 430 ; Compute above - ytop_left |
431 vsubl.u8 q2, d2, d0 | 431 vsubl.u8 q2, d2, d0 |
432 vsubl.u8 q3, d3, d1 | 432 vsubl.u8 q3, d3, d1 |
433 | 433 |
434 vmovl.u8 q10, d18 | 434 vmovl.u8 q10, d18 |
435 | 435 |
436 ; Load left row by row and compute left + (above - ytop_left) | 436 ; Load left row by row and compute left + (above - ytop_left) |
437 ; Process 8 rows in each single loop and loop 2 times to process 16 rows. | 437 ; Process 8 rows in each single loop and loop 2 times to process 16 rows. |
438 mov r2, #2 | 438 mov r2, #2 |
(...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
485 vadd.s16 q1, q0, q2 | 485 vadd.s16 q1, q0, q2 |
486 vadd.s16 q0, q0, q3 | 486 vadd.s16 q0, q0, q3 |
487 vadd.s16 q11, q8, q2 | 487 vadd.s16 q11, q8, q2 |
488 vadd.s16 q8, q8, q3 | 488 vadd.s16 q8, q8, q3 |
489 vqshrun.s16 d2, q1, #0 | 489 vqshrun.s16 d2, q1, #0 |
490 vqshrun.s16 d3, q0, #0 | 490 vqshrun.s16 d3, q0, #0 |
491 vqshrun.s16 d22, q11, #0 | 491 vqshrun.s16 d22, q11, #0 |
492 vqshrun.s16 d23, q8, #0 | 492 vqshrun.s16 d23, q8, #0 |
493 vdup.16 q0, d20[2] | 493 vdup.16 q0, d20[2] |
494 vdup.16 q8, d20[3] | 494 vdup.16 q8, d20[3] |
495 vld1.8 d18, [r3]! ; preload 8 left into r12 | 495 vld1.8 {d18}, [r3]! ; preload 8 left into r12 |
496 vmovl.u8 q10, d18 | 496 vmovl.u8 q10, d18 |
497 vst1.64 {d2,d3}, [r0], r1 | 497 vst1.64 {d2,d3}, [r0], r1 |
498 vst1.64 {d22,d23}, [r0], r1 | 498 vst1.64 {d22,d23}, [r0], r1 |
499 | 499 |
500 subs r2, r2, #1 | 500 subs r2, r2, #1 |
501 bgt loop_16x16_neon | 501 bgt loop_16x16_neon |
502 | 502 |
503 bx lr | 503 bx lr |
504 ENDP ; |vp9_tm_predictor_16x16_neon| | 504 ENDP ; |vp9_tm_predictor_16x16_neon| |
505 | 505 |
506 ;void vp9_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride, | 506 ;void vp9_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride, |
507 ; const uint8_t *above, | 507 ; const uint8_t *above, |
508 ; const uint8_t *left) | 508 ; const uint8_t *left) |
509 ; r0 uint8_t *dst | 509 ; r0 uint8_t *dst |
510 ; r1 ptrdiff_t y_stride | 510 ; r1 ptrdiff_t y_stride |
511 ; r2 const uint8_t *above | 511 ; r2 const uint8_t *above |
512 ; r3 const uint8_t *left | 512 ; r3 const uint8_t *left |
513 | 513 |
514 |vp9_tm_predictor_32x32_neon| PROC | 514 |vp9_tm_predictor_32x32_neon| PROC |
515 ; Load ytop_left = above[-1]; | 515 ; Load ytop_left = above[-1]; |
516 sub r12, r2, #1 | 516 sub r12, r2, #1 |
517 ldrb r12, [r12] | 517 ldrb r12, [r12] |
518 vdup.u8 q0, r12 | 518 vdup.u8 q0, r12 |
519 | 519 |
520 ; Load above 32 pixels | 520 ; Load above 32 pixels |
521 vld1.8 q1, [r2]! | 521 vld1.8 {q1}, [r2]! |
522 vld1.8 q2, [r2] | 522 vld1.8 {q2}, [r2] |
523 | 523 |
524 ; preload 8 left pixels | 524 ; preload 8 left pixels |
525 vld1.8 d26, [r3]! | 525 vld1.8 {d26}, [r3]! |
526 | 526 |
527 ; Compute above - ytop_left | 527 ; Compute above - ytop_left |
528 vsubl.u8 q8, d2, d0 | 528 vsubl.u8 q8, d2, d0 |
529 vsubl.u8 q9, d3, d1 | 529 vsubl.u8 q9, d3, d1 |
530 vsubl.u8 q10, d4, d0 | 530 vsubl.u8 q10, d4, d0 |
531 vsubl.u8 q11, d5, d1 | 531 vsubl.u8 q11, d5, d1 |
532 | 532 |
533 vmovl.u8 q3, d26 | 533 vmovl.u8 q3, d26 |
534 | 534 |
535 ; Load left row by row and compute left + (above - ytop_left) | 535 ; Load left row by row and compute left + (above - ytop_left) |
(...skipping 78 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
614 vqshrun.s16 d1, q13, #0 | 614 vqshrun.s16 d1, q13, #0 |
615 vadd.s16 q12, q2, q8 | 615 vadd.s16 q12, q2, q8 |
616 vadd.s16 q13, q2, q9 | 616 vadd.s16 q13, q2, q9 |
617 vqshrun.s16 d2, q14, #0 | 617 vqshrun.s16 d2, q14, #0 |
618 vqshrun.s16 d3, q15, #0 | 618 vqshrun.s16 d3, q15, #0 |
619 vadd.s16 q14, q2, q10 | 619 vadd.s16 q14, q2, q10 |
620 vadd.s16 q15, q2, q11 | 620 vadd.s16 q15, q2, q11 |
621 vst1.64 {d0-d3}, [r0], r1 | 621 vst1.64 {d0-d3}, [r0], r1 |
622 vqshrun.s16 d24, q12, #0 | 622 vqshrun.s16 d24, q12, #0 |
623 vqshrun.s16 d25, q13, #0 | 623 vqshrun.s16 d25, q13, #0 |
624 vld1.8 d0, [r3]! ; preload 8 left pixels | 624 vld1.8 {d0}, [r3]! ; preload 8 left pixels |
625 vqshrun.s16 d26, q14, #0 | 625 vqshrun.s16 d26, q14, #0 |
626 vqshrun.s16 d27, q15, #0 | 626 vqshrun.s16 d27, q15, #0 |
627 vmovl.u8 q3, d0 | 627 vmovl.u8 q3, d0 |
628 vst1.64 {d24-d27}, [r0], r1 | 628 vst1.64 {d24-d27}, [r0], r1 |
629 | 629 |
630 subs r2, r2, #1 | 630 subs r2, r2, #1 |
631 bgt loop_32x32_neon | 631 bgt loop_32x32_neon |
632 | 632 |
633 bx lr | 633 bx lr |
634 ENDP ; |vp9_tm_predictor_32x32_neon| | 634 ENDP ; |vp9_tm_predictor_32x32_neon| |
635 | 635 |
636 END | 636 END |
OLD | NEW |