| OLD | NEW |
| 1 ; | 1 ; |
| 2 ; Copyright (c) 2014 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
| 3 ; | 3 ; |
| 4 ; Use of this source code is governed by a BSD-style license | 4 ; Use of this source code is governed by a BSD-style license |
| 5 ; that can be found in the LICENSE file in the root of the source | 5 ; that can be found in the LICENSE file in the root of the source |
| 6 ; tree. An additional intellectual property rights grant can be found | 6 ; tree. An additional intellectual property rights grant can be found |
| 7 ; in the file PATENTS. All contributing project authors may | 7 ; in the file PATENTS. All contributing project authors may |
| 8 ; be found in the AUTHORS file in the root of the source tree. | 8 ; be found in the AUTHORS file in the root of the source tree. |
| 9 ; | 9 ; |
| 10 | 10 |
| (...skipping 331 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 342 ; r2 const uint8_t *above | 342 ; r2 const uint8_t *above |
| 343 ; r3 const uint8_t *left | 343 ; r3 const uint8_t *left |
| 344 | 344 |
| 345 |vp9_tm_predictor_8x8_neon| PROC | 345 |vp9_tm_predictor_8x8_neon| PROC |
| 346 ; Load ytop_left = above[-1]; | 346 ; Load ytop_left = above[-1]; |
| 347 sub r12, r2, #1 | 347 sub r12, r2, #1 |
| 348 ldrb r12, [r12] | 348 ldrb r12, [r12] |
| 349 vdup.u8 d0, r12 | 349 vdup.u8 d0, r12 |
| 350 | 350 |
| 351 ; preload 8 left | 351 ; preload 8 left |
| 352 vld1.8 d30, [r3] | 352 vld1.8 {d30}, [r3] |
| 353 | 353 |
| 354 ; Load above 8 pixels | 354 ; Load above 8 pixels |
| 355 vld1.64 {d2}, [r2] | 355 vld1.64 {d2}, [r2] |
| 356 | 356 |
| 357 vmovl.u8 q10, d30 | 357 vmovl.u8 q10, d30 |
| 358 | 358 |
| 359 ; Compute above - ytop_left | 359 ; Compute above - ytop_left |
| 360 vsubl.u8 q3, d2, d0 | 360 vsubl.u8 q3, d2, d0 |
| 361 | 361 |
| 362 ; Load left row by row and compute left + (above - ytop_left) | 362 ; Load left row by row and compute left + (above - ytop_left) |
| (...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 415 ; r2 const uint8_t *above | 415 ; r2 const uint8_t *above |
| 416 ; r3 const uint8_t *left | 416 ; r3 const uint8_t *left |
| 417 | 417 |
| 418 |vp9_tm_predictor_16x16_neon| PROC | 418 |vp9_tm_predictor_16x16_neon| PROC |
| 419 ; Load ytop_left = above[-1]; | 419 ; Load ytop_left = above[-1]; |
| 420 sub r12, r2, #1 | 420 sub r12, r2, #1 |
| 421 ldrb r12, [r12] | 421 ldrb r12, [r12] |
| 422 vdup.u8 q0, r12 | 422 vdup.u8 q0, r12 |
| 423 | 423 |
| 424 ; Load above 8 pixels | 424 ; Load above 16 pixels |
| 425 vld1.8 q1, [r2] | 425 vld1.8 {q1}, [r2] |
| 426 | 426 |
| 427 ; preload 8 left into r12 | 427 ; preload 8 left pixels into d18 |
| 428 vld1.8 d18, [r3]! | 428 vld1.8 {d18}, [r3]! |
| 429 | 429 |
| 430 ; Compute above - ytop_left | 430 ; Compute above - ytop_left |
| 431 vsubl.u8 q2, d2, d0 | 431 vsubl.u8 q2, d2, d0 |
| 432 vsubl.u8 q3, d3, d1 | 432 vsubl.u8 q3, d3, d1 |
| 433 | 433 |
| 434 vmovl.u8 q10, d18 | 434 vmovl.u8 q10, d18 |
| 435 | 435 |
| 436 ; Load left row by row and compute left + (above - ytop_left) | 436 ; Load left row by row and compute left + (above - ytop_left) |
| 437 ; Process 8 rows in each single loop and loop 2 times to process 16 rows. | 437 ; Process 8 rows in each single loop and loop 2 times to process 16 rows. |
| 438 mov r2, #2 | 438 mov r2, #2 |
| (...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 485 vadd.s16 q1, q0, q2 | 485 vadd.s16 q1, q0, q2 |
| 486 vadd.s16 q0, q0, q3 | 486 vadd.s16 q0, q0, q3 |
| 487 vadd.s16 q11, q8, q2 | 487 vadd.s16 q11, q8, q2 |
| 488 vadd.s16 q8, q8, q3 | 488 vadd.s16 q8, q8, q3 |
| 489 vqshrun.s16 d2, q1, #0 | 489 vqshrun.s16 d2, q1, #0 |
| 490 vqshrun.s16 d3, q0, #0 | 490 vqshrun.s16 d3, q0, #0 |
| 491 vqshrun.s16 d22, q11, #0 | 491 vqshrun.s16 d22, q11, #0 |
| 492 vqshrun.s16 d23, q8, #0 | 492 vqshrun.s16 d23, q8, #0 |
| 493 vdup.16 q0, d20[2] | 493 vdup.16 q0, d20[2] |
| 494 vdup.16 q8, d20[3] | 494 vdup.16 q8, d20[3] |
| 495 vld1.8 d18, [r3]! ; preload 8 left into r12 | 495 vld1.8 {d18}, [r3]! ; preload 8 left pixels into d18 |
| 496 vmovl.u8 q10, d18 | 496 vmovl.u8 q10, d18 |
| 497 vst1.64 {d2,d3}, [r0], r1 | 497 vst1.64 {d2,d3}, [r0], r1 |
| 498 vst1.64 {d22,d23}, [r0], r1 | 498 vst1.64 {d22,d23}, [r0], r1 |
| 499 | 499 |
| 500 subs r2, r2, #1 | 500 subs r2, r2, #1 |
| 501 bgt loop_16x16_neon | 501 bgt loop_16x16_neon |
| 502 | 502 |
| 503 bx lr | 503 bx lr |
| 504 ENDP ; |vp9_tm_predictor_16x16_neon| | 504 ENDP ; |vp9_tm_predictor_16x16_neon| |
| 505 | 505 |
| 506 ;void vp9_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride, | 506 ;void vp9_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride, |
| 507 ; const uint8_t *above, | 507 ; const uint8_t *above, |
| 508 ; const uint8_t *left) | 508 ; const uint8_t *left) |
| 509 ; r0 uint8_t *dst | 509 ; r0 uint8_t *dst |
| 510 ; r1 ptrdiff_t y_stride | 510 ; r1 ptrdiff_t y_stride |
| 511 ; r2 const uint8_t *above | 511 ; r2 const uint8_t *above |
| 512 ; r3 const uint8_t *left | 512 ; r3 const uint8_t *left |
| 513 | 513 |
| 514 |vp9_tm_predictor_32x32_neon| PROC | 514 |vp9_tm_predictor_32x32_neon| PROC |
| 515 ; Load ytop_left = above[-1]; | 515 ; Load ytop_left = above[-1]; |
| 516 sub r12, r2, #1 | 516 sub r12, r2, #1 |
| 517 ldrb r12, [r12] | 517 ldrb r12, [r12] |
| 518 vdup.u8 q0, r12 | 518 vdup.u8 q0, r12 |
| 519 | 519 |
| 520 ; Load above 32 pixels | 520 ; Load above 32 pixels |
| 521 vld1.8 q1, [r2]! | 521 vld1.8 {q1}, [r2]! |
| 522 vld1.8 q2, [r2] | 522 vld1.8 {q2}, [r2] |
| 523 | 523 |
| 524 ; preload 8 left pixels | 524 ; preload 8 left pixels |
| 525 vld1.8 d26, [r3]! | 525 vld1.8 {d26}, [r3]! |
| 526 | 526 |
| 527 ; Compute above - ytop_left | 527 ; Compute above - ytop_left |
| 528 vsubl.u8 q8, d2, d0 | 528 vsubl.u8 q8, d2, d0 |
| 529 vsubl.u8 q9, d3, d1 | 529 vsubl.u8 q9, d3, d1 |
| 530 vsubl.u8 q10, d4, d0 | 530 vsubl.u8 q10, d4, d0 |
| 531 vsubl.u8 q11, d5, d1 | 531 vsubl.u8 q11, d5, d1 |
| 532 | 532 |
| 533 vmovl.u8 q3, d26 | 533 vmovl.u8 q3, d26 |
| 534 | 534 |
| 535 ; Load left row by row and compute left + (above - ytop_left) | 535 ; Load left row by row and compute left + (above - ytop_left) |
| (...skipping 78 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 614 vqshrun.s16 d1, q13, #0 | 614 vqshrun.s16 d1, q13, #0 |
| 615 vadd.s16 q12, q2, q8 | 615 vadd.s16 q12, q2, q8 |
| 616 vadd.s16 q13, q2, q9 | 616 vadd.s16 q13, q2, q9 |
| 617 vqshrun.s16 d2, q14, #0 | 617 vqshrun.s16 d2, q14, #0 |
| 618 vqshrun.s16 d3, q15, #0 | 618 vqshrun.s16 d3, q15, #0 |
| 619 vadd.s16 q14, q2, q10 | 619 vadd.s16 q14, q2, q10 |
| 620 vadd.s16 q15, q2, q11 | 620 vadd.s16 q15, q2, q11 |
| 621 vst1.64 {d0-d3}, [r0], r1 | 621 vst1.64 {d0-d3}, [r0], r1 |
| 622 vqshrun.s16 d24, q12, #0 | 622 vqshrun.s16 d24, q12, #0 |
| 623 vqshrun.s16 d25, q13, #0 | 623 vqshrun.s16 d25, q13, #0 |
| 624 vld1.8 d0, [r3]! ; preload 8 left pixels | 624 vld1.8 {d0}, [r3]! ; preload 8 left pixels |
| 625 vqshrun.s16 d26, q14, #0 | 625 vqshrun.s16 d26, q14, #0 |
| 626 vqshrun.s16 d27, q15, #0 | 626 vqshrun.s16 d27, q15, #0 |
| 627 vmovl.u8 q3, d0 | 627 vmovl.u8 q3, d0 |
| 628 vst1.64 {d24-d27}, [r0], r1 | 628 vst1.64 {d24-d27}, [r0], r1 |
| 629 | 629 |
| 630 subs r2, r2, #1 | 630 subs r2, r2, #1 |
| 631 bgt loop_32x32_neon | 631 bgt loop_32x32_neon |
| 632 | 632 |
| 633 bx lr | 633 bx lr |
| 634 ENDP ; |vp9_tm_predictor_32x32_neon| | 634 ENDP ; |vp9_tm_predictor_32x32_neon| |
| 635 | 635 |
| 636 END | 636 END |
| OLD | NEW |