OLD | NEW |
1 ; | 1 ; |
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 ; | 3 ; |
4 ; Use of this source code is governed by a BSD-style license | 4 ; Use of this source code is governed by a BSD-style license |
5 ; that can be found in the LICENSE file in the root of the source | 5 ; that can be found in the LICENSE file in the root of the source |
6 ; tree. An additional intellectual property rights grant can be found | 6 ; tree. An additional intellectual property rights grant can be found |
7 ; in the file PATENTS. All contributing project authors may | 7 ; in the file PATENTS. All contributing project authors may |
8 ; be found in the AUTHORS file in the root of the source tree. | 8 ; be found in the AUTHORS file in the root of the source tree. |
9 ; | 9 ; |
10 | 10 |
11 | 11 |
12 %include "vpx_ports/x86_abi_support.asm" | 12 %include "vpx_ports/x86_abi_support.asm" |
13 | 13 |
14 ;unsigned int vp8_get_mb_ss_mmx( short *src_ptr ) | 14 ;unsigned int vpx_get_mb_ss_mmx( short *src_ptr ) |
15 global sym(vp8_get_mb_ss_mmx) PRIVATE | 15 global sym(vpx_get_mb_ss_mmx) PRIVATE |
16 sym(vp8_get_mb_ss_mmx): | 16 sym(vpx_get_mb_ss_mmx): |
17 push rbp | 17 push rbp |
18 mov rbp, rsp | 18 mov rbp, rsp |
19 SHADOW_ARGS_TO_STACK 7 | 19 SHADOW_ARGS_TO_STACK 7 |
20 GET_GOT rbx | 20 GET_GOT rbx |
21 push rsi | 21 push rsi |
22 push rdi | 22 push rdi |
23 sub rsp, 8 | 23 sub rsp, 8 |
24 ; end prolog | 24 ; end prolog |
25 | 25 |
26 mov rax, arg(0) ;src_ptr | 26 mov rax, arg(0) ;src_ptr |
(...skipping 29 matching lines...) Expand all Loading... |
56 ; begin epilog | 56 ; begin epilog |
57 add rsp, 8 | 57 add rsp, 8 |
58 pop rdi | 58 pop rdi |
59 pop rsi | 59 pop rsi |
60 RESTORE_GOT | 60 RESTORE_GOT |
61 UNSHADOW_ARGS | 61 UNSHADOW_ARGS |
62 pop rbp | 62 pop rbp |
63 ret | 63 ret |
64 | 64 |
65 | 65 |
66 ;unsigned int vp8_get8x8var_mmx | 66 ;void vpx_get8x8var_mmx |
67 ;( | 67 ;( |
68 ; unsigned char *src_ptr, | 68 ; unsigned char *src_ptr, |
69 ; int source_stride, | 69 ; int source_stride, |
70 ; unsigned char *ref_ptr, | 70 ; unsigned char *ref_ptr, |
71 ; int recon_stride, | 71 ; int recon_stride, |
72 ; unsigned int *SSE, | 72 ; unsigned int *SSE, |
73 ; int *Sum | 73 ; int *Sum |
74 ;) | 74 ;) |
75 global sym(vp8_get8x8var_mmx) PRIVATE | 75 global sym(vpx_get8x8var_mmx) PRIVATE |
76 sym(vp8_get8x8var_mmx): | 76 sym(vpx_get8x8var_mmx): |
77 push rbp | 77 push rbp |
78 mov rbp, rsp | 78 mov rbp, rsp |
79 SHADOW_ARGS_TO_STACK 6 | 79 SHADOW_ARGS_TO_STACK 6 |
80 push rsi | 80 push rsi |
81 push rdi | 81 push rdi |
82 push rbx | 82 push rbx |
83 sub rsp, 16 | 83 sub rsp, 16 |
84 ; end prolog | 84 ; end prolog |
85 | 85 |
86 | 86 |
(...skipping 216 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
303 add rsp, 16 | 303 add rsp, 16 |
304 pop rbx | 304 pop rbx |
305 pop rdi | 305 pop rdi |
306 pop rsi | 306 pop rsi |
307 UNSHADOW_ARGS | 307 UNSHADOW_ARGS |
308 pop rbp | 308 pop rbp |
309 ret | 309 ret |
310 | 310 |
311 | 311 |
312 | 312 |
313 ;unsigned int | 313 ;void |
314 ;vp8_get4x4var_mmx | 314 ;vpx_get4x4var_mmx |
315 ;( | 315 ;( |
316 ; unsigned char *src_ptr, | 316 ; unsigned char *src_ptr, |
317 ; int source_stride, | 317 ; int source_stride, |
318 ; unsigned char *ref_ptr, | 318 ; unsigned char *ref_ptr, |
319 ; int recon_stride, | 319 ; int recon_stride, |
320 ; unsigned int *SSE, | 320 ; unsigned int *SSE, |
321 ; int *Sum | 321 ; int *Sum |
322 ;) | 322 ;) |
323 global sym(vp8_get4x4var_mmx) PRIVATE | 323 global sym(vpx_get4x4var_mmx) PRIVATE |
324 sym(vp8_get4x4var_mmx): | 324 sym(vpx_get4x4var_mmx): |
325 push rbp | 325 push rbp |
326 mov rbp, rsp | 326 mov rbp, rsp |
327 SHADOW_ARGS_TO_STACK 6 | 327 SHADOW_ARGS_TO_STACK 6 |
328 push rsi | 328 push rsi |
329 push rdi | 329 push rdi |
330 push rbx | 330 push rbx |
331 sub rsp, 16 | 331 sub rsp, 16 |
332 ; end prolog | 332 ; end prolog |
333 | 333 |
334 | 334 |
(...skipping 80 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
415 | 415 |
416 | 416 |
417 ; begin epilog | 417 ; begin epilog |
418 add rsp, 16 | 418 add rsp, 16 |
419 pop rbx | 419 pop rbx |
420 pop rdi | 420 pop rdi |
421 pop rsi | 421 pop rsi |
422 UNSHADOW_ARGS | 422 UNSHADOW_ARGS |
423 pop rbp | 423 pop rbp |
424 ret | 424 ret |
425 | |
426 | |
427 | |
428 ;unsigned int | |
429 ;vp8_get4x4sse_cs_mmx | |
430 ;( | |
431 ; unsigned char *src_ptr, | |
432 ; int source_stride, | |
433 ; unsigned char *ref_ptr, | |
434 ; int recon_stride | |
435 ;) | |
; Returns the sum of squared differences between the 4x4 byte block at | |
; src_ptr (rows source_stride bytes apart) and the 4x4 byte block at | |
; ref_ptr (rows recon_stride bytes apart). The four rows are unrolled. | |
; Register roles: rax = current src row, rbx = current ref row, | |
; rcx/rdx = sign-extended strides, mm6 = zero, mm7 = two 32-bit partial sums. | |
; rsi and rdi are saved and restored although the body never uses them. | |
; No emms is executed here; the MMX state is left as-is on return. | |
436 global sym(vp8_get4x4sse_cs_mmx) PRIVATE | |
437 sym(vp8_get4x4sse_cs_mmx): | |
438 push rbp | |
439 mov rbp, rsp | |
440 SHADOW_ARGS_TO_STACK 4 | |
441 push rsi | |
442 push rdi | |
443 push rbx | |
444 ; end prolog | |
445 | |
446 | |
447 pxor mm6, mm6 ; mm6 = 0: zero source for the byte->word unpacks | |
448 pxor mm7, mm7 ; mm7 = 0: accumulator for the squared differences | |
449 | |
450 mov rax, arg(0) ;[src_ptr] ; Load base addresses | |
451 mov rbx, arg(2) ;[ref_ptr] | |
452 movsxd rcx, dword ptr arg(1) ;[source_stride] | |
453 movsxd rdx, dword ptr arg(3) ;[recon_stride] | |
454 ; Row 1 | |
455 movd mm0, [rax] ; load 4 src bytes (movd reads 32 bits, not 64) | |
456 movd mm1, [rbx] ; load 4 ref bytes | |
457 punpcklbw mm0, mm6 ; widen the 4 src bytes to 16-bit lanes | |
458 punpcklbw mm1, mm6 | |
459 psubsw mm0, mm1 ; mm0 = src - ref per 16-bit lane | |
460 pmaddwd mm0, mm0 ; square each lane, add adjacent pairs -> 2 dwords | |
461 add rbx,rdx ; advance ref pointer to the next row | |
462 add rax,rcx ; advance src pointer to the next row | |
463 movd mm1, [rbx] ; preload the 4 ref bytes of row 2 | |
464 paddd mm7, mm0 ; accumulate in mm7 | |
465 | |
466 ; Row 2 | |
467 movd mm0, [rax] ; load 4 src bytes | |
468 punpcklbw mm0, mm6 ; widen the 4 src bytes to 16-bit lanes | |
469 punpcklbw mm1, mm6 | |
470 psubsw mm0, mm1 ; mm0 = src - ref per 16-bit lane | |
471 pmaddwd mm0, mm0 ; square each lane, add adjacent pairs -> 2 dwords | |
472 add rbx,rdx ; advance ref pointer to the next row | |
473 add rax,rcx ; advance src pointer to the next row | |
474 movd mm1, [rbx] ; preload the 4 ref bytes of row 3 | |
475 paddd mm7, mm0 ; accumulate in mm7 | |
476 | |
477 ; Row 3 | |
478 movd mm0, [rax] ; load 4 src bytes | |
479 punpcklbw mm1, mm6 | |
480 punpcklbw mm0, mm6 ; widen the 4 src bytes to 16-bit lanes | |
481 psubsw mm0, mm1 ; mm0 = src - ref per 16-bit lane | |
482 | |
483 pmaddwd mm0, mm0 ; square each lane, add adjacent pairs -> 2 dwords | |
484 add rbx,rdx ; advance ref pointer to the next row | |
485 add rax,rcx ; advance src pointer to the next row | |
486 movd mm1, [rbx] ; preload the 4 ref bytes of row 4 | |
487 paddd mm7, mm0 ; accumulate in mm7 | |
488 | |
489 ; Row 4 | |
490 movd mm0, [rax] ; load 4 src bytes | |
491 punpcklbw mm0, mm6 ; widen the 4 src bytes to 16-bit lanes | |
492 punpcklbw mm1, mm6 | |
493 psubsw mm0, mm1 ; mm0 = src - ref per 16-bit lane | |
494 pmaddwd mm0, mm0 ; square each lane, add adjacent pairs -> 2 dwords | |
495 paddd mm7, mm0 ; accumulate in mm7 | |
496 | |
497 movq mm0, mm7 ; fold the two partial sums: after the shift/add below the low dword of mm0 is the total | |
498 psrlq mm7, 32 | |
499 | |
500 paddd mm0, mm7 | |
501 movq rax, mm0 | |
; NOTE(review): the high dword of mm0 still holds the upper partial sum, so | |
; only the low 32 bits of the value moved to rax are the result (the declared | |
; return type is unsigned int). | |
502 | |
503 | |
504 ; begin epilog | |
505 pop rbx | |
506 pop rdi | |
507 pop rsi | |
508 UNSHADOW_ARGS | |
509 pop rbp | |
510 ret | |
511 | |
; Arithmetic right shift applied after every 2-tap filter pass below; the | |
; matching rounding term (64) is read from mmx_bi_rd. | |
512 %define mmx_filter_shift 7 | |
513 | |
514 ;void vp8_filter_block2d_bil4x4_var_mmx | |
515 ;( | |
516 ; unsigned char *ref_ptr, | |
517 ; int ref_pixels_per_line, | |
518 ; unsigned char *src_ptr, | |
519 ; int src_pixels_per_line, | |
520 ; unsigned short *HFilter, | |
521 ; unsigned short *VFilter, | |
522 ; int *sum, | |
523 ; unsigned int *sumsquared | |
524 ;) | |
; Applies a 2-tap horizontal then 2-tap vertical filter to a 4-pixel-wide, | |
; 4-row block read from ref_ptr, subtracts the 4x4 block at src_ptr, and | |
; stores the sum of the differences to *sum and the sum of their squares | |
; to *sumsquared. | |
; Reads 5 rows of 5 bytes from ref_ptr (one extra row and one extra column). | |
; Each filter is read as two groups of four 16-bit words: [ptr] multiplies | |
; the current pixel/row and [ptr+8] the next pixel/row. | |
; Register roles: rsi = ref row, rdi = src row, rax = HFilter, rdx = VFilter, | |
; rcx = rows left, mm0 = zero, mm5 = previous horizontally filtered row, | |
; mm6 = 16-bit lane sums of differences, mm7 = 32-bit sums of squares. | |
; No emms is executed here; the MMX state is left as-is on return. | |
525 global sym(vp8_filter_block2d_bil4x4_var_mmx) PRIVATE | |
526 sym(vp8_filter_block2d_bil4x4_var_mmx): | |
527 push rbp | |
528 mov rbp, rsp | |
529 SHADOW_ARGS_TO_STACK 8 | |
530 GET_GOT rbx | |
531 push rsi | |
532 push rdi | |
533 sub rsp, 16 | |
534 ; end prolog | |
535 | |
536 | |
537 pxor mm6, mm6 ; mm6 = 0: 16-bit lane sums of the differences | |
538 pxor mm7, mm7 ; mm7 = 0: 32-bit sums of the squared differences | |
539 | |
540 mov rax, arg(4) ;HFilter ; rax -> horizontal filter words | |
541 mov rdx, arg(5) ;VFilter ; rdx -> vertical filter words | |
542 | |
543 mov rsi, arg(0) ;ref_ptr ; rsi walks the reference rows | |
544 mov rdi, arg(2) ;src_ptr ; rdi walks the source rows | |
545 | |
546 mov rcx, 4 ; rcx = number of output rows | |
547 pxor mm0, mm0 ; mm0 = 0 for byte->word unpacking | |
548 | |
549 movd mm1, [rsi] ; first ref row: pixels x .. x+3 | |
550 movd mm3, [rsi+1] ; same row, one pixel to the right: x+1 .. x+4 | |
551 | |
552 punpcklbw mm1, mm0 ; widen to 16-bit lanes | |
553 pmullw mm1, [rax] ; * first group of HFilter words | |
554 | |
555 punpcklbw mm3, mm0 ; widen to 16-bit lanes | |
556 pmullw mm3, [rax+8] ; * second group of HFilter words | |
557 | |
558 paddw mm1, mm3 ; sum of the two horizontal products | |
559 paddw mm1, [GLOBAL(mmx_bi_rd)] ; + rounding term (64 per lane) | |
560 | |
561 psraw mm1, mmx_filter_shift ; >> 7: horizontally filtered first row | |
562 movq mm5, mm1 | |
563 | |
564 %if ABI_IS_32BIT | |
565 add rsi, dword ptr arg(1) ;ref_pixels_per_line ; step to the next ref row | |
566 %else | |
567 movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; sign-extend the stride first | |
568 add rsi, r8 | |
569 %endif | |
570 | |
571 .filter_block2d_bil4x4_var_mmx_loop: | |
572 | |
573 movd mm1, [rsi] ; next ref row: pixels x .. x+3 | |
574 movd mm3, [rsi+1] ; same row, one pixel to the right | |
575 | |
576 punpcklbw mm1, mm0 ; widen to 16-bit lanes | |
577 pmullw mm1, [rax] ; * first group of HFilter words | |
578 | |
579 punpcklbw mm3, mm0 ; widen to 16-bit lanes | |
580 pmullw mm3, [rax+8] ; * second group of HFilter words | |
581 | |
582 paddw mm1, mm3 ; sum of the two horizontal products | |
583 paddw mm1, [GLOBAL(mmx_bi_rd)] ; + rounding term | |
584 | |
585 psraw mm1, mmx_filter_shift ; mm1 = horizontally filtered current row | |
586 movq mm3, mm5 ; mm3 = horizontally filtered previous row | |
587 | |
588 movq mm5, mm1 ; keep the current row for the next iteration | |
589 pmullw mm3, [rdx] ; previous row * first group of VFilter words | |
590 | |
591 pmullw mm1, [rdx+8] ; current row * second group of VFilter words | |
592 paddw mm1, mm3 ; sum of the two vertical products | |
593 | |
594 | |
595 paddw mm1, [GLOBAL(mmx_bi_rd)] ; + rounding term | |
596 psraw mm1, mmx_filter_shift ; mm1 = fully filtered row (4 x 16-bit) | |
597 | |
598 movd mm3, [rdi] ; load 4 src bytes | |
599 punpcklbw mm3, mm0 ; widen to 16-bit lanes | |
600 | |
601 psubw mm1, mm3 ; mm1 = filtered - src | |
602 paddw mm6, mm1 ; accumulate the differences per lane | |
603 | |
604 pmaddwd mm1, mm1 ; square each lane, add adjacent pairs -> 2 dwords | |
605 paddd mm7, mm1 ; accumulate the squared differences | |
606 | |
607 %if ABI_IS_32BIT | |
608 add rsi, dword ptr arg(1) ;ref_pixels_per_line
; | |
609 add rdi, dword ptr arg(3) ;src_pixels_per_line
; | |
610 %else | |
611 movsxd r8, dword ptr arg(1) ;ref_pixels_per_line | |
612 movsxd r9, dword ptr arg(3) ;src_pixels_per_line | |
613 add rsi, r8 | |
614 add rdi, r9 | |
615 %endif | |
616 sub rcx, 1 ; one row done | |
617 jnz .filter_block2d_bil4x4_var_mmx_loop ; repeat until all 4 rows are processed | |
618 | |
619 | |
620 pxor mm3, mm3 ; clear the unpack destinations | |
621 pxor mm2, mm2 ; | |
622 | |
623 punpcklwd mm2, mm6 ; lanes 0,1 of the sums into the high halves of two dwords | |
624 punpckhwd mm3, mm6 ; lanes 2,3 of the sums into the high halves of two dwords | |
625 | |
626 paddd mm2, mm3 ; add the pairs (values still scaled by 1 << 16) | |
627 movq mm6, mm2 ; | |
628 | |
629 psrlq mm6, 32 ; bring the high dword down | |
630 paddd mm2, mm6 ; low dword = total sum << 16 | |
631 | |
632 psrad mm2, 16 ; arithmetic shift restores the signed total (exact while it fits in 16 bits) | |
633 movq mm4, mm7 ; | |
634 | |
635 psrlq mm4, 32 ; high partial sum of squares | |
636 paddd mm4, mm7 ; low dword = total sum of squares | |
637 | |
638 mov rdi, arg(6) ;sum | |
639 mov rsi, arg(7) ;sumsquared | |
640 | |
641 movd dword ptr [rdi], mm2 ; *sum = sum of differences | |
642 movd dword ptr [rsi], mm4 ; *sumsquared = sum of squared differences | |
643 | |
644 | |
645 | |
646 ; begin epilog | |
647 add rsp, 16 | |
648 pop rdi | |
649 pop rsi | |
650 RESTORE_GOT | |
651 UNSHADOW_ARGS | |
652 pop rbp | |
653 ret | |
654 | |
655 | |
656 | |
657 | |
658 ;void vp8_filter_block2d_bil_var_mmx | |
659 ;( | |
660 ; unsigned char *ref_ptr, | |
661 ; int ref_pixels_per_line, | |
662 ; unsigned char *src_ptr, | |
663 ; int src_pixels_per_line, | |
664 ; unsigned int Height, | |
665 ; unsigned short *HFilter, | |
666 ; unsigned short *VFilter, | |
667 ; int *sum, | |
668 ; unsigned int *sumsquared | |
669 ;) | |
; 8-pixel-wide variant: applies a 2-tap horizontal then 2-tap vertical | |
; filter to Height rows read from ref_ptr, subtracts the 8-wide rows at | |
; src_ptr, and stores the sum of the differences to *sum and the sum of | |
; their squares to *sumsquared. | |
; Reads Height + 1 rows of 9 bytes from ref_ptr. | |
; Each filter is read as two groups of four 16-bit words at [ptr] and [ptr+8]. | |
; The loop tests its counter only after the body, so Height must be >= 1. | |
; Register roles: rsi = ref row, rdi = src row, rax = HFilter, rdx = VFilter, | |
; rcx = rows left, mm0 = zero, mm5 = previous horizontally filtered row | |
; (packed to 8 bytes), mm6 = 16-bit lane sums, mm7 = 32-bit sums of squares. | |
; No emms is executed here; the MMX state is left as-is on return. | |
670 global sym(vp8_filter_block2d_bil_var_mmx) PRIVATE | |
671 sym(vp8_filter_block2d_bil_var_mmx): | |
672 push rbp | |
673 mov rbp, rsp | |
674 SHADOW_ARGS_TO_STACK 9 | |
675 GET_GOT rbx | |
676 push rsi | |
677 push rdi | |
678 sub rsp, 16 | |
679 ; end prolog | |
680 | |
681 pxor mm6, mm6 ; mm6 = 0: 16-bit lane sums of the differences | |
682 pxor mm7, mm7 ; mm7 = 0: 32-bit sums of the squared differences | |
683 mov rax, arg(5) ;HFilter ; rax -> horizontal filter words | |
684 | |
685 mov rdx, arg(6) ;VFilter ; rdx -> vertical filter words | |
686 mov rsi, arg(0) ;ref_ptr ; rsi walks the reference rows | |
687 | |
688 mov rdi, arg(2) ;src_ptr ; rdi walks the source rows | |
689 movsxd rcx, dword ptr arg(4) ;Height ; row counter; NOTE(review): sign-extended although declared unsigned | |
690 | |
691 pxor mm0, mm0 ; mm0 = 0 for byte->word unpacking | |
692 movq mm1, [rsi] ; first ref row: pixels x .. x+7 | |
693 | |
694 movq mm3, [rsi+1] ; same row, one pixel to the right: x+1 .. x+8 | |
695 movq mm2, mm1 ; copy for the high 4 pixels | |
696 | |
697 movq mm4, mm3 ; copy for the high 4 pixels | |
698 punpcklbw mm1, mm0 ; low 4 pixels -> 16-bit lanes | |
699 | |
700 punpckhbw mm2, mm0 ; high 4 pixels -> 16-bit lanes | |
701 pmullw mm1, [rax] ; low half * first group of HFilter words | |
702 | |
703 pmullw mm2, [rax] ; high half * first group of HFilter words | |
704 punpcklbw mm3, mm0 ; shifted low 4 pixels -> 16-bit lanes | |
705 | |
706 punpckhbw mm4, mm0 ; shifted high 4 pixels -> 16-bit lanes | |
707 pmullw mm3, [rax+8] ; shifted low half * second group of HFilter words | |
708 | |
709 pmullw mm4, [rax+8] ; shifted high half * second group of HFilter words | |
710 paddw mm1, mm3 ; low half: sum of the two products | |
711 | |
712 paddw mm2, mm4 ; high half: sum of the two products | |
713 paddw mm1, [GLOBAL(mmx_bi_rd)] ; + rounding term (64 per lane) | |
714 | |
715 psraw mm1, mmx_filter_shift ; >> 7 | |
716 paddw mm2, [GLOBAL(mmx_bi_rd)] ; + rounding term | |
717 | |
718 psraw mm2, mmx_filter_shift ; >> 7 | |
719 movq mm5, mm1 | |
720 | |
721 packuswb mm5, mm2 ; mm5 = filtered first row packed back to 8 bytes | |
722 %if ABI_IS_32BIT | |
723 add rsi, dword ptr arg(1) ;ref_pixels_per_line | |
724 %else | |
725 movsxd r8, dword ptr arg(1) ;ref_pixels_per_line | |
726 add rsi, r8 | |
727 %endif | |
728 | |
729 .filter_block2d_bil_var_mmx_loop: | |
730 | |
731 movq mm1, [rsi] ; next ref row: pixels x .. x+7 | |
732 movq mm3, [rsi+1] ; same row, one pixel to the right | |
733 | |
734 movq mm2, mm1 ; copies for the high 4 pixels | |
735 movq mm4, mm3 ; | |
736 | |
737 punpcklbw mm1, mm0 ; low 4 pixels -> 16-bit lanes | |
738 punpckhbw mm2, mm0 ; high 4 pixels -> 16-bit lanes | |
739 | |
740 pmullw mm1, [rax] ; * first group of HFilter words | |
741 pmullw mm2, [rax] ; | |
742 | |
743 punpcklbw mm3, mm0 ; shifted low 4 pixels -> 16-bit lanes | |
744 punpckhbw mm4, mm0 ; shifted high 4 pixels -> 16-bit lanes | |
745 | |
746 pmullw mm3, [rax+8] ; * second group of HFilter words | |
747 pmullw mm4, [rax+8] ; | |
748 | |
749 paddw mm1, mm3 ; sum of the two horizontal products (low half) | |
750 paddw mm2, mm4 ; sum of the two horizontal products (high half) | |
751 | |
752 paddw mm1, [GLOBAL(mmx_bi_rd)] ; + rounding term | |
753 psraw mm1, mmx_filter_shift ; mm1 = horizontally filtered current row, low half | |
754 | |
755 paddw mm2, [GLOBAL(mmx_bi_rd)] ; + rounding term | |
756 psraw mm2, mmx_filter_shift ; mm2 = horizontally filtered current row, high half | |
757 | |
758 movq mm3, mm5 ; fetch the previous filtered row (packed bytes) | |
759 movq mm4, mm5 ; | |
760 | |
761 punpcklbw mm3, mm0 ; previous row, low half -> 16-bit lanes | |
762 punpckhbw mm4, mm0 ; previous row, high half -> 16-bit lanes | |
763 | |
764 movq mm5, mm1 ; keep the current row for the next iteration... | |
765 packuswb mm5, mm2 ; ...packed back to 8 bytes | |
766 | |
767 pmullw mm3, [rdx] ; previous row * first group of VFilter words | |
768 pmullw mm4, [rdx] ; | |
769 | |
770 pmullw mm1, [rdx+8] ; current row * second group of VFilter words | |
771 pmullw mm2, [rdx+8] ; | |
772 | |
773 paddw mm1, mm3 ; sum of the two vertical products (low half) | |
774 paddw mm2, mm4 ; sum of the two vertical products (high half) | |
775 | |
776 paddw mm1, [GLOBAL(mmx_bi_rd)] ; + rounding term | |
777 paddw mm2, [GLOBAL(mmx_bi_rd)] ; | |
778 | |
779 psraw mm1, mmx_filter_shift ; mm1:mm2 = fully filtered row (8 x 16-bit) | |
780 psraw mm2, mmx_filter_shift ; | |
781 | |
782 movq mm3, [rdi] ; load 8 src bytes | |
783 movq mm4, mm3 ; | |
784 | |
785 punpcklbw mm3, mm0 ; src low half -> 16-bit lanes | |
786 punpckhbw mm4, mm0 ; src high half -> 16-bit lanes | |
787 | |
788 psubw mm1, mm3 ; filtered - src (low half) | |
789 psubw mm2, mm4 ; filtered - src (high half) | |
790 | |
791 paddw mm6, mm1 ; accumulate the differences (low half) | |
792 pmaddwd mm1, mm1 ; square each lane, add adjacent pairs -> 2 dwords | |
793 | |
794 paddw mm6, mm2 ; accumulate the differences (high half) | |
795 pmaddwd mm2, mm2 ; square each lane, add adjacent pairs -> 2 dwords | |
796 | |
797 paddd mm7, mm1 ; accumulate the squared differences | |
798 paddd mm7, mm2 ; | |
799 | |
800 %if ABI_IS_32BIT | |
801 add rsi, dword ptr arg(1) ;ref_pixels_per_line
; | |
802 add rdi, dword ptr arg(3) ;src_pixels_per_line
; | |
803 %else | |
804 movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
; | |
805 movsxd r9, dword ptr arg(3) ;src_pixels_per_line
; | |
806 add rsi, r8 | |
807 add rdi, r9 | |
808 %endif | |
809 sub rcx, 1 ; one row done | |
810 jnz .filter_block2d_bil_var_mmx_loop ; repeat until rcx reaches zero | |
811 | |
812 | |
813 pxor mm3, mm3 ; clear the unpack destinations | |
814 pxor mm2, mm2 ; | |
815 | |
816 punpcklwd mm2, mm6 ; lanes 0,1 of the sums into the high halves of two dwords | |
817 punpckhwd mm3, mm6 ; lanes 2,3 of the sums into the high halves of two dwords | |
818 | |
819 paddd mm2, mm3 ; add the pairs (values still scaled by 1 << 16) | |
820 movq mm6, mm2 ; | |
821 | |
822 psrlq mm6, 32 ; bring the high dword down | |
823 paddd mm2, mm6 ; low dword = total sum << 16 | |
824 | |
825 psrad mm2, 16 ; arithmetic shift restores the signed total (exact while it fits in 16 bits) | |
826 movq mm4, mm7 ; | |
827 | |
828 psrlq mm4, 32 ; high partial sum of squares | |
829 paddd mm4, mm7 ; low dword = total sum of squares | |
830 | |
831 mov rdi, arg(7) ;sum | |
832 mov rsi, arg(8) ;sumsquared | |
833 | |
834 movd dword ptr [rdi], mm2 ; *sum = sum of differences | |
835 movd dword ptr [rsi], mm4 ; *sumsquared = sum of squared differences | |
836 | |
837 ; begin epilog | |
838 add rsp, 16 | |
839 pop rdi | |
840 pop rsi | |
841 RESTORE_GOT | |
842 UNSHADOW_ARGS | |
843 pop rbp | |
844 ret | |
845 | |
846 | |
847 SECTION_RODATA | |
848 ;short mmx_bi_rd[4] = { 64, 64, 64, 64}; | |
; Rounding term for the bilinear filter passes: 64 == 1 << (mmx_filter_shift - 1). | |
; It is added to every 16-bit lane right before each psraw by mmx_filter_shift. | |
849 align 16 | |
850 mmx_bi_rd: | |
851 times 4 dw 64 | |
OLD | NEW |