OLD | NEW |
1 ; | 1 ; |
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 ; | 3 ; |
4 ; Use of this source code is governed by a BSD-style license | 4 ; Use of this source code is governed by a BSD-style license |
5 ; that can be found in the LICENSE file in the root of the source | 5 ; that can be found in the LICENSE file in the root of the source |
6 ; tree. An additional intellectual property rights grant can be found | 6 ; tree. An additional intellectual property rights grant can be found |
7 ; in the file PATENTS. All contributing project authors may | 7 ; in the file PATENTS. All contributing project authors may |
8 ; be found in the AUTHORS file in the root of the source tree. | 8 ; be found in the AUTHORS file in the root of the source tree. |
9 ; | 9 ; |
10 | 10 |
(...skipping 382 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
393 ; begin epilog | 393 ; begin epilog |
394 add rsp, 16 | 394 add rsp, 16 |
395 pop rdi | 395 pop rdi |
396 pop rsi | 396 pop rsi |
397 RESTORE_GOT | 397 RESTORE_GOT |
398 RESTORE_XMM | 398 RESTORE_XMM |
399 UNSHADOW_ARGS | 399 UNSHADOW_ARGS |
400 pop rbp | 400 pop rbp |
401 ret | 401 ret |
402 | 402 |
403 ;void vp9_filter_block2d_bil_var_sse2 | |
404 ;( | |
405 ; unsigned char *ref_ptr, | |
406 ; int ref_pixels_per_line, | |
407 ; unsigned char *src_ptr, | |
408 ; int src_pixels_per_line, | |
409 ; unsigned int Height, | |
410 ; int xoffset, | |
411 ; int yoffset, | |
412 ; int *sum, | |
413 ; unsigned int *sumsquared | |
414 ; | |
415 ;) | |
; Processes Height rows of 8 pixels. The ref rows are filtered with the two-tap | |
; entries of bilinear_filters_sse2 selected by xoffset (horizontal pass) and | |
; yoffset (vertical pass); the matching src row is subtracted from the result. | |
; The sum of the differences is stored to *sum and the sum of their squares to | |
; *sumsquared. A zero offset skips that pass; both zero compares raw pixels. | |
; Register roles: xmm0 = 0, xmm4 = rounding (xmm_bi_rd), xmm6 = sum of | |
; differences (words), xmm7 = sum of squared differences (dwords). | |
416 global sym(vp9_filter_block2d_bil_var_sse2) PRIVATE | |
417 sym(vp9_filter_block2d_bil_var_sse2): | |
418 push rbp | |
419 mov rbp, rsp | |
420 SHADOW_ARGS_TO_STACK 9 | |
421 SAVE_XMM 7 | |
422 GET_GOT rbx | |
423 push rsi | |
424 push rdi | |
425 push rbx | |
426 ; end prolog | |
427 | |
428 pxor xmm6, xmm6 ; xmm6 = 0: sum of differences (words) | |
429 pxor xmm7, xmm7 ; xmm7 = 0: sum of squared differences (dwords) | |
430 | |
431 lea rsi, [GLOBAL(xmm_bi_rd)] ; rounding | |
432 movdqa xmm4, XMMWORD PTR [rsi] | |
433 | |
434 lea rcx, [GLOBAL(bilinear_filters_sse2)] | |
435 movsxd rax, dword ptr arg(5) ; xoffset | |
436 | |
437 cmp rax, 0 ; skip first_pass f
ilter if xoffset=0 | |
438 je filter_block2d_bil_var_sse2_sp_only | |
439 | |
440 shl rax, 5 ; point to filter c
oeff with xoffset | |
441 lea rax, [rax + rcx] ; HFilter | |
442 | |
443 movsxd rdx, dword ptr arg(6) ; yoffset | |
444 | |
445 cmp rdx, 0 ; skip second_pass
filter if yoffset=0 | |
446 je filter_block2d_bil_var_sse2_fp_only | |
447 | |
448 shl rdx, 5 | |
449 lea rdx, [rdx + rcx] ; VFilter | |
450 | |
451 mov rsi, arg(0) ;ref_ptr | |
452 mov rdi, arg(2) ;src_ptr | |
453 movsxd rcx, dword ptr arg(4) ;Height | |
454 | |
455 pxor xmm0, xmm0 ; xmm0 = 0, used to widen bytes to words | |
456 movq xmm1, QWORD PTR [rsi] ; 8 ref pixels of the first row | |
457 movq xmm3, QWORD PTR [rsi+1] ; the same row, one pixel to the right | |
458 | |
459 punpcklbw xmm1, xmm0 ; bytes -> words | |
460 pmullw xmm1, [rax] ; * first horizontal tap | |
461 punpcklbw xmm3, xmm0 | |
462 pmullw xmm3, [rax+16] ; * second horizontal tap | |
463 | |
464 paddw xmm1, xmm3 ; add both taps | |
465 paddw xmm1, xmm4 ; + rounding | |
466 psraw xmm1, xmm_filter_shift ; scale back down | |
467 movdqa xmm5, xmm1 | |
468 | |
469 movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line | |
470 lea rsi, [rsi + rbx] | |
471 %if ABI_IS_32BIT=0 | |
472 movsxd r9, dword ptr arg(3) ;src_pixels_per_line | |
473 %endif | |
474 | |
475 filter_block2d_bil_var_sse2_loop: | |
476 movq xmm1, QWORD PTR [rsi] ; 8 ref pixels of the next row | |
477 movq xmm3, QWORD PTR [rsi+1] ; the same row, one pixel to the right | |
478 | |
479 punpcklbw xmm1, xmm0 ; bytes -> words | |
480 pmullw xmm1, [rax] ; * first horizontal tap | |
481 punpcklbw xmm3, xmm0 ; bytes -> words | |
482 pmullw xmm3, [rax+16] ; * second horizontal tap | |
483 | |
484 paddw xmm1, xmm3 ; add both taps | |
485 paddw xmm1, xmm4 ; + rounding | |
486 psraw xmm1, xmm_filter_shift ; xmm1 = horizontally filtered current row | |
487 | |
488 movdqa xmm3, xmm5 ; xmm3 = horizontally filtered previous row | |
489 movdqa xmm5, xmm1 ; keep the current row for the next iteration | |
490 | |
491 pmullw xmm3, [rdx] ; previous row * first vertical tap | |
492 pmullw xmm1, [rdx+16] ; current row * second vertical tap | |
493 paddw xmm1, xmm3 ; add both taps | |
494 paddw xmm1, xmm4 ; + rounding | |
495 psraw xmm1, xmm_filter_shift ; xmm1 = fully filtered row | |
496 | |
497 movq xmm3, QWORD PTR [rdi] ; 8 src pixels | |
498 punpcklbw xmm3, xmm0 ; bytes -> words | |
499 | |
500 psubw xmm1, xmm3 ; diff = filtered ref - src | |
501 paddw xmm6, xmm1 ; accumulate diff | |
502 | |
503 pmaddwd xmm1, xmm1 ; diff*diff, adjacent pairs summed to dwords | |
504 paddd xmm7, xmm1 ; accumulate squared diff | |
505 | |
506 lea rsi, [rsi + rbx] ;ref_pixels_per_lin
e | |
507 %if ABI_IS_32BIT | |
508 add rdi, dword ptr arg(3) ;src_pixels_per_lin
e | |
509 %else | |
510 lea rdi, [rdi + r9] | |
511 %endif | |
512 | |
513 sub rcx, 1 ; one row done | |
514 jnz filter_block2d_bil_var_sse2_loop ; loop while rows remain | |
515 | |
516 jmp filter_block2d_bil_variance | |
517 | |
518 filter_block2d_bil_var_sse2_sp_only: | |
519 movsxd rdx, dword ptr arg(6) ; yoffset | |
520 | |
521 cmp rdx, 0 ; skip all if both
xoffset=0 and yoffset=0 | |
522 je filter_block2d_bil_var_sse2_full_pixel | |
523 | |
524 shl rdx, 5 | |
525 lea rdx, [rdx + rcx] ; VFilter | |
526 | |
527 mov rsi, arg(0) ;ref_ptr | |
528 mov rdi, arg(2) ;src_ptr | |
529 movsxd rcx, dword ptr arg(4) ;Height | |
530 movsxd rax, dword ptr arg(1) ;ref_pixels_per_lin
e | |
531 | |
532 pxor xmm0, xmm0 ; xmm0 = 0, used to widen bytes to words | |
533 movq xmm1, QWORD PTR [rsi] ; 8 ref pixels of the first row | |
534 punpcklbw xmm1, xmm0 ; bytes -> words | |
535 | |
536 movsxd rbx, dword ptr arg(3) ;src_pixels_per_lin
e | |
537 lea rsi, [rsi + rax] | |
538 | |
539 filter_block2d_bil_sp_only_loop: | |
540 movq xmm3, QWORD PTR [rsi] ; 8 ref pixels of the next row | |
541 punpcklbw xmm3, xmm0 ; bytes -> words | |
542 movdqa xmm5, xmm3 | |
543 | |
544 pmullw xmm1, [rdx] ; previous row * first vertical tap | |
545 pmullw xmm3, [rdx+16] ; current row * second vertical tap | |
546 paddw xmm1, xmm3 ; add both taps | |
547 paddw xmm1, xmm4 ; + rounding | |
548 psraw xmm1, xmm_filter_shift ; xmm1 = vertically filtered row | |
549 | |
550 movq xmm3, QWORD PTR [rdi] ; 8 src pixels | |
551 punpcklbw xmm3, xmm0 ; bytes -> words | |
552 | |
553 psubw xmm1, xmm3 ; diff = filtered ref - src | |
554 paddw xmm6, xmm1 ; accumulate diff | |
555 | |
556 pmaddwd xmm1, xmm1 ; diff*diff, adjacent pairs summed to dwords | |
557 paddd xmm7, xmm1 ; accumulate squared diff | |
558 | |
559 movdqa xmm1, xmm5 ; current row becomes the previous row | |
560 lea rsi, [rsi + rax] ;ref_pixels_per_lin
e | |
561 lea rdi, [rdi + rbx] ;src_pixels_per_lin
e | |
562 | |
563 sub rcx, 1 ; one row done | |
564 jnz filter_block2d_bil_sp_only_loop ; loop while rows remain | |
565 | |
566 jmp filter_block2d_bil_variance | |
567 | |
568 filter_block2d_bil_var_sse2_full_pixel: | |
569 mov rsi, arg(0) ;ref_ptr | |
570 mov rdi, arg(2) ;src_ptr | |
571 movsxd rcx, dword ptr arg(4) ;Height | |
572 movsxd rax, dword ptr arg(1) ;ref_pixels_per_lin
e | |
573 movsxd rbx, dword ptr arg(3) ;src_pixels_per_lin
e | |
574 pxor xmm0, xmm0 ; xmm0 = 0, used to widen bytes to words | |
575 | |
576 filter_block2d_bil_full_pixel_loop: | |
577 movq xmm1, QWORD PTR [rsi] ; 8 ref pixels, unfiltered | |
578 punpcklbw xmm1, xmm0 ; bytes -> words | |
579 | |
580 movq xmm2, QWORD PTR [rdi] ; 8 src pixels | |
581 punpcklbw xmm2, xmm0 ; bytes -> words | |
582 | |
583 psubw xmm1, xmm2 ; diff = ref - src | |
584 paddw xmm6, xmm1 ; accumulate diff | |
585 | |
586 pmaddwd xmm1, xmm1 ; diff*diff, adjacent pairs summed to dwords | |
587 paddd xmm7, xmm1 ; accumulate squared diff | |
588 | |
589 lea rsi, [rsi + rax] ;ref_pixels_per_lin
e | |
590 lea rdi, [rdi + rbx] ;src_pixels_per_lin
e | |
591 | |
592 sub rcx, 1 ; one row done | |
593 jnz filter_block2d_bil_full_pixel_loop ; loop while rows remain | |
594 | |
595 jmp filter_block2d_bil_variance | |
596 | |
597 filter_block2d_bil_var_sse2_fp_only: | |
598 mov rsi, arg(0) ;ref_ptr | |
599 mov rdi, arg(2) ;src_ptr | |
600 movsxd rcx, dword ptr arg(4) ;Height | |
601 movsxd rdx, dword ptr arg(1) ;ref_pixels_per_lin
e | |
602 | |
603 pxor xmm0, xmm0 ; xmm0 = 0, used to widen bytes to words | |
604 movsxd rbx, dword ptr arg(3) ;src_pixels_per_lin
e | |
605 | |
606 filter_block2d_bil_fp_only_loop: | |
607 movq xmm1, QWORD PTR [rsi] ; 8 ref pixels of this row | |
608 movq xmm3, QWORD PTR [rsi+1] ; the same row, one pixel to the right | |
609 | |
610 punpcklbw xmm1, xmm0 ; bytes -> words | |
611 pmullw xmm1, [rax] ; * first horizontal tap (rax still holds HFilter) | |
612 punpcklbw xmm3, xmm0 ; bytes -> words | |
613 pmullw xmm3, [rax+16] ; * second horizontal tap | |
614 | |
615 paddw xmm1, xmm3 ; add both taps | |
616 paddw xmm1, xmm4 ; + rounding | |
617 psraw xmm1, xmm_filter_shift ; xmm1 = horizontally filtered row | |
618 | |
619 movq xmm3, QWORD PTR [rdi] ; 8 src pixels | |
620 punpcklbw xmm3, xmm0 ; bytes -> words | |
621 | |
622 psubw xmm1, xmm3 ; diff = filtered ref - src | |
623 paddw xmm6, xmm1 ; accumulate diff | |
624 | |
625 pmaddwd xmm1, xmm1 ; diff*diff, adjacent pairs summed to dwords | |
626 paddd xmm7, xmm1 ; accumulate squared diff | |
627 lea rsi, [rsi + rdx] | |
628 lea rdi, [rdi + rbx] ;src_pixels_per_lin
e | |
629 | |
630 sub rcx, 1 ; one row done | |
631 jnz filter_block2d_bil_fp_only_loop ; loop while rows remain | |
632 | |
633 jmp filter_block2d_bil_variance | |
634 | |
635 filter_block2d_bil_variance: | |
; All four paths arrive here with the totals spread across the lanes of xmm6/xmm7. | |
; NOTE(review): MMX registers are used below and no emms is issued in this | |
; function - confirm that callers reset the x87/MMX state afterwards. | |
636 movdq2q mm6, xmm6 ; low 4 word sums | |
637 movdq2q mm7, xmm7 ; low 2 dword sums of squares | |
638 | |
639 psrldq xmm6, 8 | |
640 psrldq xmm7, 8 | |
641 | |
642 movdq2q mm2, xmm6 | |
643 movdq2q mm3, xmm7 | |
644 | |
645 paddw mm6, mm2 | |
646 paddd mm7, mm3 | |
647 | |
648 pxor mm3, mm3 ; mm3 = 0 | |
649 pxor mm2, mm2 ; mm2 = 0 | |
650 | |
651 punpcklwd mm2, mm6 ; low 2 words of mm6 into the high halves of 2 dwords | |
652 punpckhwd mm3, mm6 ; high 2 words of mm6 into the high halves of 2 dwords | |
653 | |
654 paddd mm2, mm3 ; two partial sums, each still shifted left by 16 | |
655 movq mm6, mm2 ; copy so the upper dword can be moved down | |
656 | |
657 psrlq mm6, 32 ; upper dword -> lower dword | |
658 paddd mm2, mm6 ; low dword = total sum << 16 | |
659 | |
660 psrad mm2, 16 ; arithmetic shift restores the signed sum | |
661 movq mm4, mm7 ; copy the two dword sums of squares | |
662 | |
663 psrlq mm4, 32 ; upper dword -> lower dword | |
664 paddd mm4, mm7 ; low dword = total sum of squares | |
665 | |
666 mov rsi, arg(7) ; sum | |
667 mov rdi, arg(8) ; sumsquared | |
668 | |
669 movd [rsi], mm2 ; xsum | |
670 movd [rdi], mm4 ; xxsum | |
671 | |
672 ; begin epilog | |
673 pop rbx | |
674 pop rdi | |
675 pop rsi | |
676 RESTORE_GOT | |
677 RESTORE_XMM | |
678 UNSHADOW_ARGS | |
679 pop rbp | |
680 ret | |
681 | |
682 | |
683 ;void vp9_half_horiz_vert_variance8x_h_sse2 | 403 ;void vp9_half_horiz_vert_variance8x_h_sse2 |
684 ;( | 404 ;( |
685 ; unsigned char *ref_ptr, | 405 ; unsigned char *ref_ptr, |
686 ; int ref_pixels_per_line, | 406 ; int ref_pixels_per_line, |
687 ; unsigned char *src_ptr, | 407 ; unsigned char *src_ptr, |
688 ; int src_pixels_per_line, | 408 ; int src_pixels_per_line, |
689 ; unsigned int Height, | 409 ; unsigned int Height, |
690 ; int *sum, | 410 ; int *sum, |
691 ; unsigned int *sumsquared | 411 ; unsigned int *sumsquared |
692 ;) | 412 ;) |
(...skipping 102 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
795 | 515 |
796 ; begin epilog | 516 ; begin epilog |
797 pop rdi | 517 pop rdi |
798 pop rsi | 518 pop rsi |
799 RESTORE_GOT | 519 RESTORE_GOT |
800 RESTORE_XMM | 520 RESTORE_XMM |
801 UNSHADOW_ARGS | 521 UNSHADOW_ARGS |
802 pop rbp | 522 pop rbp |
803 ret | 523 ret |
804 | 524 |
805 ;void vp9_half_horiz_vert_variance16x_h_sse2 | |
806 ;( | |
807 ; unsigned char *ref_ptr, | |
808 ; int ref_pixels_per_line, | |
809 ; unsigned char *src_ptr, | |
810 ; int src_pixels_per_line, | |
811 ; unsigned int Height, | |
812 ; int *sum, | |
813 ; unsigned int *sumsquared | |
814 ;) | |
; Processes Height rows of 16 pixels. Each ref pixel is averaged (pavgb) with its | |
; right-hand neighbour, and that row is then averaged with the same result from | |
; the following row. The src row is subtracted; the sum of the differences is | |
; stored to *sum and the sum of their squares to *sumsquared. | |
815 global sym(vp9_half_horiz_vert_variance16x_h_sse2) PRIVATE | |
816 sym(vp9_half_horiz_vert_variance16x_h_sse2): | |
817 push rbp | |
818 mov rbp, rsp | |
819 SHADOW_ARGS_TO_STACK 7 | |
820 SAVE_XMM 7 | |
821 GET_GOT rbx | |
822 push rsi | |
823 push rdi | |
824 ; end prolog | |
825 | |
826 pxor xmm6, xmm6 ; error accumulator | |
827 pxor xmm7, xmm7 ; sse accumulator | |
828 mov rsi, arg(0) ;ref_ptr ; | |
829 | |
830 mov rdi, arg(2) ;src_ptr ; | |
831 movsxd rcx, dword ptr arg(4) ;Height ; | |
832 movsxd rax, dword ptr arg(1) ;ref_pixels_per_line | |
833 movsxd rdx, dword ptr arg(3) ;src_pixels_per_line | |
834 | |
835 pxor xmm0, xmm0 ; xmm0 = 0, used to widen bytes to words | |
836 | |
837 movdqu xmm5, XMMWORD PTR [rsi] | |
838 movdqu xmm3, XMMWORD PTR [rsi+1] | |
839 pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,x
mm3) horizontal line 1 | |
840 | |
841 lea rsi, [rsi + rax] | |
842 | |
843 .half_horiz_vert_variance16x_h_1: | |
844 movdqu xmm1, XMMWORD PTR [rsi] ; 16 ref pixels of the next row | |
845 movdqu xmm2, XMMWORD PTR [rsi+1] ; the same row, one pixel to the right | |
846 pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,x
mm3) horizontal line i+1 | |
847 | |
848 pavgb xmm5, xmm1 ; xmm = vertical av
erage of the above | |
849 | |
850 movdqa xmm4, xmm5 | |
851 punpcklbw xmm5, xmm0 ; xmm5 = words of a
bove | |
852 punpckhbw xmm4, xmm0 | |
853 | |
854 movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..
d7 | |
855 punpcklbw xmm3, xmm0 ; xmm3 = words of a
bove | |
856 psubw xmm5, xmm3 ; xmm5 -= xmm3 | |
857 | |
858 movq xmm3, QWORD PTR [rdi+8] | |
859 punpcklbw xmm3, xmm0 | |
860 psubw xmm4, xmm3 | |
861 | |
862 paddw xmm6, xmm5 ; xmm6 += accumulat
ed column differences | |
863 paddw xmm6, xmm4 | |
864 pmaddwd xmm5, xmm5 ; squared diffs, adjacent pairs summed to dwords | |
865 pmaddwd xmm4, xmm4 | |
866 paddd xmm7, xmm5 ; xmm7 += accumulat
ed square column differences | |
867 paddd xmm7, xmm4 | |
868 | |
869 movdqa xmm5, xmm1 ; save xmm1 for use
on the next row | |
870 | |
871 lea rsi, [rsi + rax] | |
872 lea rdi, [rdi + rdx] | |
873 | |
874 sub rcx, 1 ; one row done | |
875 jnz .half_horiz_vert_variance16x_h_1 ; loop while rows remain | |
876 | |
; Reduction: the 8 word sums in xmm6 are sign-extended to dwords (unpack into the | |
; high half of each dword, then psrad 16) and added down to the low dword of xmm0; | |
; the 4 dword sums of squares in xmm7 are added down to the low dword of xmm6. | |
877 pxor xmm1, xmm1 | |
878 pxor xmm5, xmm5 | |
879 | |
880 punpcklwd xmm0, xmm6 | |
881 punpckhwd xmm1, xmm6 | |
882 psrad xmm0, 16 | |
883 psrad xmm1, 16 | |
884 paddd xmm0, xmm1 | |
885 movdqa xmm1, xmm0 | |
886 | |
887 movdqa xmm6, xmm7 | |
888 punpckldq xmm6, xmm5 | |
889 punpckhdq xmm7, xmm5 | |
890 paddd xmm6, xmm7 | |
891 | |
892 punpckldq xmm0, xmm5 | |
893 punpckhdq xmm1, xmm5 | |
894 paddd xmm0, xmm1 | |
895 | |
896 movdqa xmm7, xmm6 | |
897 movdqa xmm1, xmm0 | |
898 | |
899 psrldq xmm7, 8 | |
900 psrldq xmm1, 8 | |
901 | |
902 paddd xmm6, xmm7 | |
903 paddd xmm0, xmm1 | |
904 | |
905 mov rsi, arg(5) ;[Sum] | |
906 mov rdi, arg(6) ;[SSE] | |
907 | |
908 movd [rsi], xmm0 | |
909 movd [rdi], xmm6 | |
910 | |
911 ; begin epilog | |
912 pop rdi | |
913 pop rsi | |
914 RESTORE_GOT | |
915 RESTORE_XMM | |
916 UNSHADOW_ARGS | |
917 pop rbp | |
918 ret | |
919 | |
920 | |
921 ;void vp9_half_vert_variance8x_h_sse2 | 525 ;void vp9_half_vert_variance8x_h_sse2 |
922 ;( | 526 ;( |
923 ; unsigned char *ref_ptr, | 527 ; unsigned char *ref_ptr, |
924 ; int ref_pixels_per_line, | 528 ; int ref_pixels_per_line, |
925 ; unsigned char *src_ptr, | 529 ; unsigned char *src_ptr, |
926 ; int src_pixels_per_line, | 530 ; int src_pixels_per_line, |
927 ; unsigned int Height, | 531 ; unsigned int Height, |
928 ; int *sum, | 532 ; int *sum, |
929 ; unsigned int *sumsquared | 533 ; unsigned int *sumsquared |
930 ;) | 534 ;) |
(...skipping 87 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1018 | 622 |
1019 ; begin epilog | 623 ; begin epilog |
1020 pop rdi | 624 pop rdi |
1021 pop rsi | 625 pop rsi |
1022 RESTORE_GOT | 626 RESTORE_GOT |
1023 RESTORE_XMM | 627 RESTORE_XMM |
1024 UNSHADOW_ARGS | 628 UNSHADOW_ARGS |
1025 pop rbp | 629 pop rbp |
1026 ret | 630 ret |
1027 | 631 |
1028 ;void vp9_half_vert_variance16x_h_sse2 | |
1029 ;( | |
1030 ; unsigned char *ref_ptr, | |
1031 ; int ref_pixels_per_line, | |
1032 ; unsigned char *src_ptr, | |
1033 ; int src_pixels_per_line, | |
1034 ; unsigned int Height, | |
1035 ; int *sum, | |
1036 ; unsigned int *sumsquared | |
1037 ;) | |
; Processes Height rows of 16 pixels. Each ref row is averaged (pavgb) with the | |
; row below it and the src row is subtracted. The sum of the differences is | |
; stored to *sum and the sum of their squares to *sumsquared. | |
1038 global sym(vp9_half_vert_variance16x_h_sse2) PRIVATE | |
1039 sym(vp9_half_vert_variance16x_h_sse2): | |
1040 push rbp | |
1041 mov rbp, rsp | |
1042 SHADOW_ARGS_TO_STACK 7 | |
1043 SAVE_XMM 7 | |
1044 GET_GOT rbx | |
1045 push rsi | |
1046 push rdi | |
1047 ; end prolog | |
1048 | |
1049 pxor xmm6, xmm6 ; error accumulator | |
1050 pxor xmm7, xmm7 ; sse accumulator | |
1051 mov rsi, arg(0) ;ref_ptr | |
1052 | |
1053 mov rdi, arg(2) ;src_ptr | |
1054 movsxd rcx, dword ptr arg(4) ;Height | |
1055 movsxd rax, dword ptr arg(1) ;ref_pixels_per_line | |
1056 movsxd rdx, dword ptr arg(3) ;src_pixels_per_line | |
1057 | |
1058 movdqu xmm5, XMMWORD PTR [rsi] | |
1059 lea rsi, [rsi + rax ] | |
1060 pxor xmm0, xmm0 | |
1061 | |
1062 .half_vert_variance16x_h_1: | |
1063 movdqu xmm3, XMMWORD PTR [rsi] | |
1064 | |
1065 pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,x
mm3) | |
1066 movdqa xmm4, xmm5 | |
1067 punpcklbw xmm5, xmm0 | |
1068 punpckhbw xmm4, xmm0 | |
1069 | |
1070 movq xmm2, QWORD PTR [rdi] | |
1071 punpcklbw xmm2, xmm0 | |
1072 psubw xmm5, xmm2 | |
1073 movq xmm2, QWORD PTR [rdi+8] | |
1074 punpcklbw xmm2, xmm0 | |
1075 psubw xmm4, xmm2 | |
1076 | |
1077 paddw xmm6, xmm5 ; xmm6 += accumulat
ed column differences | |
1078 paddw xmm6, xmm4 | |
1079 pmaddwd xmm5, xmm5 ; squared diffs, adjacent pairs summed to dwords | |
1080 pmaddwd xmm4, xmm4 | |
1081 paddd xmm7, xmm5 ; xmm7 += accumulat
ed square column differences | |
1082 paddd xmm7, xmm4 | |
1083 | |
; The row just loaded (xmm3) becomes the upper row of the next iteration. | |
1084 movdqa xmm5, xmm3 | |
1085 | |
1086 lea rsi, [rsi + rax] | |
1087 lea rdi, [rdi + rdx] | |
1088 | |
1089 sub rcx, 1 | |
1090 jnz .half_vert_variance16x_h_1 | |
1091 | |
; Reduction: the 8 word sums in xmm6 are sign-extended to dwords (unpack into the | |
; high half of each dword, then psrad 16) and added down to the low dword of xmm0; | |
; the 4 dword sums of squares in xmm7 are added down to the low dword of xmm6. | |
1092 pxor xmm1, xmm1 | |
1093 pxor xmm5, xmm5 | |
1094 | |
1095 punpcklwd xmm0, xmm6 | |
1096 punpckhwd xmm1, xmm6 | |
1097 psrad xmm0, 16 | |
1098 psrad xmm1, 16 | |
1099 paddd xmm0, xmm1 | |
1100 movdqa xmm1, xmm0 | |
1101 | |
1102 movdqa xmm6, xmm7 | |
1103 punpckldq xmm6, xmm5 | |
1104 punpckhdq xmm7, xmm5 | |
1105 paddd xmm6, xmm7 | |
1106 | |
1107 punpckldq xmm0, xmm5 | |
1108 punpckhdq xmm1, xmm5 | |
1109 paddd xmm0, xmm1 | |
1110 | |
1111 movdqa xmm7, xmm6 | |
1112 movdqa xmm1, xmm0 | |
1113 | |
1114 psrldq xmm7, 8 | |
1115 psrldq xmm1, 8 | |
1116 | |
1117 paddd xmm6, xmm7 | |
1118 paddd xmm0, xmm1 | |
1119 | |
1120 mov rsi, arg(5) ;[Sum] | |
1121 mov rdi, arg(6) ;[SSE] | |
1122 | |
1123 movd [rsi], xmm0 | |
1124 movd [rdi], xmm6 | |
1125 | |
1126 ; begin epilog | |
1127 pop rdi | |
1128 pop rsi | |
1129 RESTORE_GOT | |
1130 RESTORE_XMM | |
1131 UNSHADOW_ARGS | |
1132 pop rbp | |
1133 ret | |
1134 | |
1135 | 632 |
1136 ;void vp9_half_horiz_variance8x_h_sse2 | 633 ;void vp9_half_horiz_variance8x_h_sse2 |
1137 ;( | 634 ;( |
1138 ; unsigned char *ref_ptr, | 635 ; unsigned char *ref_ptr, |
1139 ; int ref_pixels_per_line, | 636 ; int ref_pixels_per_line, |
1140 ; unsigned char *src_ptr, | 637 ; unsigned char *src_ptr, |
1141 ; int src_pixels_per_line, | 638 ; int src_pixels_per_line, |
1142 ; unsigned int Height, | 639 ; unsigned int Height, |
1143 ; int *sum, | 640 ; int *sum, |
1144 ; unsigned int *sumsquared | 641 ; unsigned int *sumsquared |
(...skipping 86 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1231 | 728 |
1232 ; begin epilog | 729 ; begin epilog |
1233 pop rdi | 730 pop rdi |
1234 pop rsi | 731 pop rsi |
1235 RESTORE_GOT | 732 RESTORE_GOT |
1236 RESTORE_XMM | 733 RESTORE_XMM |
1237 UNSHADOW_ARGS | 734 UNSHADOW_ARGS |
1238 pop rbp | 735 pop rbp |
1239 ret | 736 ret |
1240 | 737 |
1241 ;void vp9_half_horiz_variance16x_h_sse2 | |
1242 ;( | |
1243 ; unsigned char *ref_ptr, | |
1244 ; int ref_pixels_per_line, | |
1245 ; unsigned char *src_ptr, | |
1246 ; int src_pixels_per_line, | |
1247 ; unsigned int Height, | |
1248 ; int *sum, | |
1249 ; unsigned int *sumsquared | |
1250 ;) | |
; Processes Height rows of 16 pixels. Each ref pixel is averaged (pavgb) with its | |
; right-hand neighbour and the src row is subtracted. The sum of the differences | |
; is stored to *sum and the sum of their squares to *sumsquared. | |
1251 global sym(vp9_half_horiz_variance16x_h_sse2) PRIVATE | |
1252 sym(vp9_half_horiz_variance16x_h_sse2): | |
1253 push rbp | |
1254 mov rbp, rsp | |
1255 SHADOW_ARGS_TO_STACK 7 | |
1256 SAVE_XMM 7 | |
1257 GET_GOT rbx | |
1258 push rsi | |
1259 push rdi | |
1260 ; end prolog | |
1261 | |
1262 pxor xmm6, xmm6 ; error accumulator | |
1263 pxor xmm7, xmm7 ; sse accumulator | |
1264 mov rsi, arg(0) ;ref_ptr ; | |
1265 | |
1266 mov rdi, arg(2) ;src_ptr ; | |
1267 movsxd rcx, dword ptr arg(4) ;Height ; | |
1268 movsxd rax, dword ptr arg(1) ;ref_pixels_per_line | |
1269 movsxd rdx, dword ptr arg(3) ;src_pixels_per_line | |
1270 | |
1271 pxor xmm0, xmm0 ; xmm0 = 0, used to widen bytes to words | |
1272 | |
1273 .half_horiz_variance16x_h_1: | |
1274 movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2
..s15 | |
1275 movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3
..s16 | |
1276 | |
1277 pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,x
mm3) | |
1278 movdqa xmm1, xmm5 | |
1279 punpcklbw xmm5, xmm0 ; xmm5 = words of a
bove | |
1280 punpckhbw xmm1, xmm0 | |
1281 | |
1282 movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..
d7 | |
1283 punpcklbw xmm3, xmm0 ; xmm3 = words of a
bove | |
1284 movq xmm2, QWORD PTR [rdi+8] | |
1285 punpcklbw xmm2, xmm0 | |
1286 | |
1287 psubw xmm5, xmm3 ; xmm5 -= xmm3 | |
1288 psubw xmm1, xmm2 | |
1289 paddw xmm6, xmm5 ; xmm6 += accumulat
ed column differences | |
1290 paddw xmm6, xmm1 | |
1291 pmaddwd xmm5, xmm5 ; squared diffs, adjacent pairs summed to dwords | |
1292 pmaddwd xmm1, xmm1 | |
1293 paddd xmm7, xmm5 ; xmm7 += accumulat
ed square column differences | |
1294 paddd xmm7, xmm1 | |
1295 | |
1296 lea rsi, [rsi + rax] | |
1297 lea rdi, [rdi + rdx] | |
1298 | |
1299 sub rcx, 1 ; one row done | |
1300 jnz .half_horiz_variance16x_h_1 ; loop while rows remain | |
1301 | |
; Reduction: the 8 word sums in xmm6 are sign-extended to dwords (unpack into the | |
; high half of each dword, then psrad 16) and added down to the low dword of xmm0; | |
; the 4 dword sums of squares in xmm7 are added down to the low dword of xmm6. | |
1302 pxor xmm1, xmm1 | |
1303 pxor xmm5, xmm5 | |
1304 | |
1305 punpcklwd xmm0, xmm6 | |
1306 punpckhwd xmm1, xmm6 | |
1307 psrad xmm0, 16 | |
1308 psrad xmm1, 16 | |
1309 paddd xmm0, xmm1 | |
1310 movdqa xmm1, xmm0 | |
1311 | |
1312 movdqa xmm6, xmm7 | |
1313 punpckldq xmm6, xmm5 | |
1314 punpckhdq xmm7, xmm5 | |
1315 paddd xmm6, xmm7 | |
1316 | |
1317 punpckldq xmm0, xmm5 | |
1318 punpckhdq xmm1, xmm5 | |
1319 paddd xmm0, xmm1 | |
1320 | |
1321 movdqa xmm7, xmm6 | |
1322 movdqa xmm1, xmm0 | |
1323 | |
1324 psrldq xmm7, 8 | |
1325 psrldq xmm1, 8 | |
1326 | |
1327 paddd xmm6, xmm7 | |
1328 paddd xmm0, xmm1 | |
1329 | |
1330 mov rsi, arg(5) ;[Sum] | |
1331 mov rdi, arg(6) ;[SSE] | |
1332 | |
1333 movd [rsi], xmm0 | |
1334 movd [rdi], xmm6 | |
1335 | |
1336 ; begin epilog | |
1337 pop rdi | |
1338 pop rsi | |
1339 RESTORE_GOT | |
1340 RESTORE_XMM | |
1341 UNSHADOW_ARGS | |
1342 pop rbp | |
1343 ret | |
1344 | 738 |
1345 SECTION_RODATA | 739 SECTION_RODATA |
1346 ; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64}; | 740 ; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64}; |
1347 align 16 | 741 align 16 |
1348 xmm_bi_rd: | 742 xmm_bi_rd: |
1349 times 8 dw 64 | 743 times 8 dw 64 |
1350 align 16 | 744 align 16 |
; bilinear_filters_sse2: 16 rows of 16 words (32 bytes per row). Each row holds
; 8 copies of the first tap followed by 8 copies of the second tap; the two taps
; of every row add up to 128, and the first tap drops by 8 from one row to the next.
1351 bilinear_filters_sse2: | 745 bilinear_filters_sse2: |
1352 dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 | 746 dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 |
1353 dw 120, 120, 120, 120, 120, 120, 120, 120, 8, 8, 8, 8, 8, 8, 8, 8 | 747 dw 120, 120, 120, 120, 120, 120, 120, 120, 8, 8, 8, 8, 8, 8, 8, 8 |
1354 dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 | 748 dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 |
1355 dw 104, 104, 104, 104, 104, 104, 104, 104, 24, 24, 24, 24, 24, 24, 24, 24 | 749 dw 104, 104, 104, 104, 104, 104, 104, 104, 24, 24, 24, 24, 24, 24, 24, 24 |
1356 dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 | 750 dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 |
1357 dw 88, 88, 88, 88, 88, 88, 88, 88, 40, 40, 40, 40, 40, 40, 40, 40 | 751 dw 88, 88, 88, 88, 88, 88, 88, 88, 40, 40, 40, 40, 40, 40, 40, 40 |
1358 dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 | 752 dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 |
1359 dw 72, 72, 72, 72, 72, 72, 72, 72, 56, 56, 56, 56, 56, 56, 56, 56 | 753 dw 72, 72, 72, 72, 72, 72, 72, 72, 56, 56, 56, 56, 56, 56, 56, 56 |
1360 dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 | 754 dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 |
1361 dw 56, 56, 56, 56, 56, 56, 56, 56, 72, 72, 72, 72, 72, 72, 72, 72 | 755 dw 56, 56, 56, 56, 56, 56, 56, 56, 72, 72, 72, 72, 72, 72, 72, 72 |
1362 dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 | 756 dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 |
1363 dw 40, 40, 40, 40, 40, 40, 40, 40, 88, 88, 88, 88, 88, 88, 88, 88 | 757 dw 40, 40, 40, 40, 40, 40, 40, 40, 88, 88, 88, 88, 88, 88, 88, 88 |
1364 dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 | 758 dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 |
1365 dw 24, 24, 24, 24, 24, 24, 24, 24, 104, 104, 104, 104, 104, 104, 104, 104 | 759 dw 24, 24, 24, 24, 24, 24, 24, 24, 104, 104, 104, 104, 104, 104, 104, 104 |
1366 dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 | 760 dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 |
1367 dw 8, 8, 8, 8, 8, 8, 8, 8, 120, 120, 120, 120, 120, 120, 120, 120 | 761 dw 8, 8, 8, 8, 8, 8, 8, 8, 120, 120, 120, 120, 120, 120, 120, 120 |
OLD | NEW |