Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(248)

Side by Side Diff: source/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm

Issue 11974002: libvpx: Pull from upstream (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 7 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 ; 1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ; 3 ;
4 ; Use of this source code is governed by a BSD-style license 4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source 5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found 6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may 7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree. 8 ; be found in the AUTHORS file in the root of the source tree.
9 ; 9 ;
10 10
(...skipping 382 matching lines...) Expand 10 before | Expand all | Expand 10 after
393 ; begin epilog 393 ; begin epilog
394 add rsp, 16 394 add rsp, 16
395 pop rdi 395 pop rdi
396 pop rsi 396 pop rsi
397 RESTORE_GOT 397 RESTORE_GOT
398 RESTORE_XMM 398 RESTORE_XMM
399 UNSHADOW_ARGS 399 UNSHADOW_ARGS
400 pop rbp 400 pop rbp
401 ret 401 ret
402 402
403 ;void vp9_filter_block2d_bil_var_sse2
404 ;(
405 ; unsigned char *ref_ptr,
406 ; int ref_pixels_per_line,
407 ; unsigned char *src_ptr,
408 ; int src_pixels_per_line,
409 ; unsigned int Height,
410 ; int xoffset,
411 ; int yoffset,
412 ; int *sum,
413 ; unsigned int *sumsquared
414 ;
415 ;)
; Filters 8-pixel-wide rows of ref_ptr with the bilinear taps selected by
; xoffset (horizontal pass) and yoffset (vertical pass), subtracts the matching
; src_ptr rows, and stores the sum of the differences to *sum and the sum of the
; squared differences to *sumsquared. A zero offset skips the corresponding
; pass, giving four code paths: both passes, vertical only (sp_only),
; horizontal only (fp_only) and no filtering (full_pixel).
; The horizontal pass also reads the byte at [rsi+8] of every row, and the
; vertical pass reads Height+1 rows of ref_ptr.
416 global sym(vp9_filter_block2d_bil_var_sse2) PRIVATE
417 sym(vp9_filter_block2d_bil_var_sse2):
418 push rbp
419 mov rbp, rsp
420 SHADOW_ARGS_TO_STACK 9
421 SAVE_XMM 7
422 GET_GOT rbx
423 push rsi
424 push rdi
425 push rbx ; rbx is reused below as a row stride
426 ; end prolog
427
428 pxor xmm6, xmm6 ; xmm6 = running sum of differences (8 words)
429 pxor xmm7, xmm7 ; xmm7 = running sum of squared differences (4 dwords)
430
431 lea rsi, [GLOBAL(xmm_bi_rd)] ; rounding
432 movdqa xmm4, XMMWORD PTR [rsi] ; xmm4 = rounding constant added before every shift
433
434 lea rcx, [GLOBAL(bilinear_filters_sse2)] ; rcx = base of the filter table
435 movsxd rax, dword ptr arg(5) ; xoffset
436
437 cmp rax, 0 ; skip first_pass filter if xoffset=0
438 je filter_block2d_bil_var_sse2_sp_only
439
440 shl rax, 5 ; point to filter coeff with xoffset (32 bytes per table entry)
441 lea rax, [rax + rcx] ; HFilter
442
443 movsxd rdx, dword ptr arg(6) ; yoffset
444
445 cmp rdx, 0 ; skip second_pass filter if yoffset=0
446 je filter_block2d_bil_var_sse2_fp_only
447
448 shl rdx, 5 ; same 32-byte entry scaling as for xoffset
449 lea rdx, [rdx + rcx] ; VFilter
450
451 mov rsi, arg(0) ;ref_ptr
452 mov rdi, arg(2) ;src_ptr
453 movsxd rcx, dword ptr arg(4) ;Height (rcx is now the row counter)
454
455 pxor xmm0, xmm0 ; xmm0 = 0, used to widen bytes to words
456 movq xmm1, QWORD PTR [rsi] ; first ref row, pixels 0..7
457 movq xmm3, QWORD PTR [rsi+1] ; first ref row, pixels 1..8
458
459 punpcklbw xmm1, xmm0 ; bytes -> words
460 pmullw xmm1, [rax] ; * first horizontal tap
461 punpcklbw xmm3, xmm0 ; bytes -> words
462 pmullw xmm3, [rax+16] ; * second horizontal tap
463
464 paddw xmm1, xmm3 ;
465 paddw xmm1, xmm4 ; + rounding
466 psraw xmm1, xmm_filter_shift ; xmm_filter_shift is defined outside this view
467 movdqa xmm5, xmm1 ; xmm5 = horizontally filtered previous row
468
469 movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line
470 lea rsi, [rsi + rbx]
471 %if ABI_IS_32BIT=0
472 movsxd r9, dword ptr arg(3) ;src_pixels_per_line
473 %endif
474
475 filter_block2d_bil_var_sse2_loop: ; one iteration per output row
476 movq xmm1, QWORD PTR [rsi] ; current ref row, pixels 0..7
477 movq xmm3, QWORD PTR [rsi+1] ; current ref row, pixels 1..8
478
479 punpcklbw xmm1, xmm0 ;
480 pmullw xmm1, [rax] ; * first horizontal tap
481 punpcklbw xmm3, xmm0 ;
482 pmullw xmm3, [rax+16] ; * second horizontal tap
483
484 paddw xmm1, xmm3 ;
485 paddw xmm1, xmm4 ; + rounding
486 psraw xmm1, xmm_filter_shift ; xmm1 = horizontally filtered current row
487
488 movdqa xmm3, xmm5 ; xmm3 = filtered previous row
489 movdqa xmm5, xmm1 ; keep the current row for the next iteration
490
491 pmullw xmm3, [rdx] ; previous row * first vertical tap
492 pmullw xmm1, [rdx+16] ; current row * second vertical tap
493 paddw xmm1, xmm3 ;
494 paddw xmm1, xmm4 ; + rounding
495 psraw xmm1, xmm_filter_shift ; xmm1 = two-pass filtered row
496
497 movq xmm3, QWORD PTR [rdi] ; src row, pixels 0..7
498 punpcklbw xmm3, xmm0 ;
499
500 psubw xmm1, xmm3 ; difference = filtered ref - src
501 paddw xmm6, xmm1 ; accumulate differences
502
503 pmaddwd xmm1, xmm1 ; square each difference, adding adjacent pairs into dwords
504 paddd xmm7, xmm1 ; accumulate squared differences
505
506 lea rsi, [rsi + rbx] ;ref_pixels_per_line
507 %if ABI_IS_32BIT
508 add rdi, dword ptr arg(3) ;src_pixels_per_line (read from the stack; r9 is only loaded when ABI_IS_32BIT=0)
509 %else
510 lea rdi, [rdi + r9]
511 %endif
512
513 sub rcx, 1 ;
514 jnz filter_block2d_bil_var_sse2_loop ;
515
516 jmp filter_block2d_bil_variance
517
518 filter_block2d_bil_var_sse2_sp_only: ; xoffset == 0: vertical filter only
519 movsxd rdx, dword ptr arg(6) ; yoffset
520
521 cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0
522 je filter_block2d_bil_var_sse2_full_pixel
523
524 shl rdx, 5
525 lea rdx, [rdx + rcx] ; VFilter
526
527 mov rsi, arg(0) ;ref_ptr
528 mov rdi, arg(2) ;src_ptr
529 movsxd rcx, dword ptr arg(4) ;Height
530 movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
531
532 pxor xmm0, xmm0 ; xmm0 = 0, used to widen bytes to words
533 movq xmm1, QWORD PTR [rsi] ; first ref row
534 punpcklbw xmm1, xmm0 ; xmm1 = previous row as words
535
536 movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
537 lea rsi, [rsi + rax]
538
539 filter_block2d_bil_sp_only_loop:
540 movq xmm3, QWORD PTR [rsi] ; current ref row
541 punpcklbw xmm3, xmm0 ;
542 movdqa xmm5, xmm3 ; save the unfiltered current row
543
544 pmullw xmm1, [rdx] ; previous row * first vertical tap
545 pmullw xmm3, [rdx+16] ; current row * second vertical tap
546 paddw xmm1, xmm3 ;
547 paddw xmm1, xmm4 ; + rounding
548 psraw xmm1, xmm_filter_shift ;
549
550 movq xmm3, QWORD PTR [rdi] ; src row
551 punpcklbw xmm3, xmm0 ;
552
553 psubw xmm1, xmm3 ; difference = filtered ref - src
554 paddw xmm6, xmm1 ; accumulate differences
555
556 pmaddwd xmm1, xmm1 ; squared differences
557 paddd xmm7, xmm1 ; accumulate squared differences
558
559 movdqa xmm1, xmm5 ; current row becomes the previous row
560 lea rsi, [rsi + rax] ;ref_pixels_per_line
561 lea rdi, [rdi + rbx] ;src_pixels_per_line
562
563 sub rcx, 1 ;
564 jnz filter_block2d_bil_sp_only_loop ;
565
566 jmp filter_block2d_bil_variance
567
568 filter_block2d_bil_var_sse2_full_pixel: ; xoffset == 0 and yoffset == 0: no filtering
569 mov rsi, arg(0) ;ref_ptr
570 mov rdi, arg(2) ;src_ptr
571 movsxd rcx, dword ptr arg(4) ;Height
572 movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
573 movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
574 pxor xmm0, xmm0 ; xmm0 = 0, used to widen bytes to words
575
576 filter_block2d_bil_full_pixel_loop:
577 movq xmm1, QWORD PTR [rsi] ; ref row
578 punpcklbw xmm1, xmm0 ;
579
580 movq xmm2, QWORD PTR [rdi] ; src row
581 punpcklbw xmm2, xmm0 ;
582
583 psubw xmm1, xmm2 ; difference = ref - src
584 paddw xmm6, xmm1 ; accumulate differences
585
586 pmaddwd xmm1, xmm1 ; squared differences
587 paddd xmm7, xmm1 ; accumulate squared differences
588
589 lea rsi, [rsi + rax] ;ref_pixels_per_line
590 lea rdi, [rdi + rbx] ;src_pixels_per_line
591
592 sub rcx, 1 ;
593 jnz filter_block2d_bil_full_pixel_loop ;
594
595 jmp filter_block2d_bil_variance
596
597 filter_block2d_bil_var_sse2_fp_only: ; yoffset == 0: horizontal filter only (rax = HFilter)
598 mov rsi, arg(0) ;ref_ptr
599 mov rdi, arg(2) ;src_ptr
600 movsxd rcx, dword ptr arg(4) ;Height
601 movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
602
603 pxor xmm0, xmm0 ; xmm0 = 0, used to widen bytes to words
604 movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
605
606 filter_block2d_bil_fp_only_loop:
607 movq xmm1, QWORD PTR [rsi] ; ref row, pixels 0..7
608 movq xmm3, QWORD PTR [rsi+1] ; ref row, pixels 1..8
609
610 punpcklbw xmm1, xmm0 ;
611 pmullw xmm1, [rax] ; * first horizontal tap
612 punpcklbw xmm3, xmm0 ;
613 pmullw xmm3, [rax+16] ; * second horizontal tap
614
615 paddw xmm1, xmm3 ;
616 paddw xmm1, xmm4 ; + rounding
617 psraw xmm1, xmm_filter_shift ;
618
619 movq xmm3, QWORD PTR [rdi] ; src row
620 punpcklbw xmm3, xmm0 ;
621
622 psubw xmm1, xmm3 ; difference = filtered ref - src
623 paddw xmm6, xmm1 ; accumulate differences
624
625 pmaddwd xmm1, xmm1 ; squared differences
626 paddd xmm7, xmm1 ; accumulate squared differences
627 lea rsi, [rsi + rdx] ;ref_pixels_per_line
628 lea rdi, [rdi + rbx] ;src_pixels_per_line
629
630 sub rcx, 1 ;
631 jnz filter_block2d_bil_fp_only_loop ;
632
633 jmp filter_block2d_bil_variance
634
635 filter_block2d_bil_variance: ; common tail: reduce xmm6/xmm7 to scalars and store them
636 movdq2q mm6, xmm6 ; low four word sums
637 movdq2q mm7, xmm7 ; low two dword sums
638
639 psrldq xmm6, 8 ; bring the high halves down
640 psrldq xmm7, 8
641
642 movdq2q mm2, xmm6 ; high four word sums
643 movdq2q mm3, xmm7 ; high two dword sums
644
645 paddw mm6, mm2 ; mm6 = four partial sums of differences
646 paddd mm7, mm3 ; mm7 = two partial sums of squares
647
648 pxor mm3, mm3 ;
649 pxor mm2, mm2 ;
650
651 punpcklwd mm2, mm6 ; each word lands in the upper half of a dword
652 punpckhwd mm3, mm6 ;
653
654 paddd mm2, mm3 ;
655 movq mm6, mm2 ;
656
657 psrlq mm6, 32 ;
658 paddd mm2, mm6 ; low dword = total of differences << 16
659
660 psrad mm2, 16 ; arithmetic shift restores the signed total
661 movq mm4, mm7 ;
662
663 psrlq mm4, 32 ;
664 paddd mm4, mm7 ; low dword = total of squared differences
665
666 mov rsi, arg(7) ; sum
667 mov rdi, arg(8) ; sumsquared
668
669 movd [rsi], mm2 ; xsum
670 movd [rdi], mm4 ; xxsum
671 ; NOTE(review): MMX registers are used above and no emms is issued before returning - confirm callers clear the MMX state
672 ; begin epilog
673 pop rbx
674 pop rdi
675 pop rsi
676 RESTORE_GOT
677 RESTORE_XMM
678 UNSHADOW_ARGS
679 pop rbp
680 ret
681
682
683 ;void vp9_half_horiz_vert_variance8x_h_sse2 403 ;void vp9_half_horiz_vert_variance8x_h_sse2
684 ;( 404 ;(
685 ; unsigned char *ref_ptr, 405 ; unsigned char *ref_ptr,
686 ; int ref_pixels_per_line, 406 ; int ref_pixels_per_line,
687 ; unsigned char *src_ptr, 407 ; unsigned char *src_ptr,
688 ; int src_pixels_per_line, 408 ; int src_pixels_per_line,
689 ; unsigned int Height, 409 ; unsigned int Height,
690 ; int *sum, 410 ; int *sum,
691 ; unsigned int *sumsquared 411 ; unsigned int *sumsquared
692 ;) 412 ;)
(...skipping 102 matching lines...) Expand 10 before | Expand all | Expand 10 after
795 515
796 ; begin epilog 516 ; begin epilog
797 pop rdi 517 pop rdi
798 pop rsi 518 pop rsi
799 RESTORE_GOT 519 RESTORE_GOT
800 RESTORE_XMM 520 RESTORE_XMM
801 UNSHADOW_ARGS 521 UNSHADOW_ARGS
802 pop rbp 522 pop rbp
803 ret 523 ret
804 524
805 ;void vp9_half_horiz_vert_variance16x_h_sse2
806 ;(
807 ; unsigned char *ref_ptr,
808 ; int ref_pixels_per_line,
809 ; unsigned char *src_ptr,
810 ; int src_pixels_per_line,
811 ; unsigned int Height,
812 ; int *sum,
813 ; unsigned int *sumsquared
814 ;)
; For 16-pixel-wide rows: averages each ref pixel with its right neighbour
; (pavgb), averages that result with the same average of the next row, subtracts
; the src row, and stores the sum of differences to *sum and the sum of squared
; differences to *sumsquared. Reads 17 bytes from each of Height+1 ref rows.
815 global sym(vp9_half_horiz_vert_variance16x_h_sse2) PRIVATE
816 sym(vp9_half_horiz_vert_variance16x_h_sse2):
817 push rbp
818 mov rbp, rsp
819 SHADOW_ARGS_TO_STACK 7
820 SAVE_XMM 7
821 GET_GOT rbx
822 push rsi
823 push rdi
824 ; end prolog
825
826 pxor xmm6, xmm6 ; error accumulator
827 pxor xmm7, xmm7 ; sse accumulator
828 mov rsi, arg(0) ;ref_ptr ;
829
830 mov rdi, arg(2) ;src_ptr ;
831 movsxd rcx, dword ptr arg(4) ;Height ;
832 movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
833 movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
834
835 pxor xmm0, xmm0 ; xmm0 = 0, used to widen bytes to words
836
837 movdqu xmm5, XMMWORD PTR [rsi] ; first ref row, pixels 0..15
838 movdqu xmm3, XMMWORD PTR [rsi+1] ; first ref row, pixels 1..16
839 pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3) horizontal line 1
840
841 lea rsi, [rsi + rax]
842
843 .half_horiz_vert_variance16x_h_1:
844 movdqu xmm1, XMMWORD PTR [rsi] ; next ref row, pixels 0..15
845 movdqu xmm2, XMMWORD PTR [rsi+1] ; next ref row, pixels 1..16
846 pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm2) horizontal line i+1
847
848 pavgb xmm5, xmm1 ; xmm5 = vertical average of the two horizontal averages
849
850 movdqa xmm4, xmm5 ; copy so the high 8 pixels can be widened separately
851 punpcklbw xmm5, xmm0 ; xmm5 = low 8 pixels as words
852 punpckhbw xmm4, xmm0 ; xmm4 = high 8 pixels as words
853
854 movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
855 punpcklbw xmm3, xmm0 ; xmm3 = words of above
856 psubw xmm5, xmm3 ; xmm5 -= xmm3
857
858 movq xmm3, QWORD PTR [rdi+8] ; xmm3 = d8..d15
859 punpcklbw xmm3, xmm0
860 psubw xmm4, xmm3 ; xmm4 -= xmm3
861
862 paddw xmm6, xmm5 ; xmm6 += accumulated column differences
863 paddw xmm6, xmm4
864 pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
865 pmaddwd xmm4, xmm4
866 paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
867 paddd xmm7, xmm4
868
869 movdqa xmm5, xmm1 ; save xmm1 for use on the next row
870
871 lea rsi, [rsi + rax]
872 lea rdi, [rdi + rdx]
873
874 sub rcx, 1 ;
875 jnz .half_horiz_vert_variance16x_h_1 ;
876
877 pxor xmm1, xmm1
878 pxor xmm5, xmm5
879
880 punpcklwd xmm0, xmm6 ; xmm0 is still zero: each word lands in the upper half of a dword
881 punpckhwd xmm1, xmm6
882 psrad xmm0, 16 ; arithmetic shift sign-extends the word sums to dwords
883 psrad xmm1, 16
884 paddd xmm0, xmm1
885 movdqa xmm1, xmm0
886
887 movdqa xmm6, xmm7
888 punpckldq xmm6, xmm5 ; interleave the squared-difference dwords with zero
889 punpckhdq xmm7, xmm5
890 paddd xmm6, xmm7
891
892 punpckldq xmm0, xmm5 ; same folding for the difference sums
893 punpckhdq xmm1, xmm5
894 paddd xmm0, xmm1
895
896 movdqa xmm7, xmm6
897 movdqa xmm1, xmm0
898
899 psrldq xmm7, 8 ; bring the high qword down
900 psrldq xmm1, 8
901
902 paddd xmm6, xmm7 ; low dword of xmm6 = total of squared differences
903 paddd xmm0, xmm1 ; low dword of xmm0 = total of differences
904
905 mov rsi, arg(5) ;[Sum]
906 mov rdi, arg(6) ;[SSE]
907
908 movd [rsi], xmm0
909 movd [rdi], xmm6
910
911 ; begin epilog
912 pop rdi
913 pop rsi
914 RESTORE_GOT
915 RESTORE_XMM
916 UNSHADOW_ARGS
917 pop rbp
918 ret
919
920
921 ;void vp9_half_vert_variance8x_h_sse2 525 ;void vp9_half_vert_variance8x_h_sse2
922 ;( 526 ;(
923 ; unsigned char *ref_ptr, 527 ; unsigned char *ref_ptr,
924 ; int ref_pixels_per_line, 528 ; int ref_pixels_per_line,
925 ; unsigned char *src_ptr, 529 ; unsigned char *src_ptr,
926 ; int src_pixels_per_line, 530 ; int src_pixels_per_line,
927 ; unsigned int Height, 531 ; unsigned int Height,
928 ; int *sum, 532 ; int *sum,
929 ; unsigned int *sumsquared 533 ; unsigned int *sumsquared
930 ;) 534 ;)
(...skipping 87 matching lines...) Expand 10 before | Expand all | Expand 10 after
1018 622
1019 ; begin epilog 623 ; begin epilog
1020 pop rdi 624 pop rdi
1021 pop rsi 625 pop rsi
1022 RESTORE_GOT 626 RESTORE_GOT
1023 RESTORE_XMM 627 RESTORE_XMM
1024 UNSHADOW_ARGS 628 UNSHADOW_ARGS
1025 pop rbp 629 pop rbp
1026 ret 630 ret
1027 631
1028 ;void vp9_half_vert_variance16x_h_sse2
1029 ;(
1030 ; unsigned char *ref_ptr,
1031 ; int ref_pixels_per_line,
1032 ; unsigned char *src_ptr,
1033 ; int src_pixels_per_line,
1034 ; unsigned int Height,
1035 ; int *sum,
1036 ; unsigned int *sumsquared
1037 ;)
; For 16-pixel-wide rows: averages each ref row with the row below it (pavgb),
; subtracts the src row, and stores the sum of differences to *sum and the sum
; of squared differences to *sumsquared. Reads Height+1 rows of ref_ptr.
1038 global sym(vp9_half_vert_variance16x_h_sse2) PRIVATE
1039 sym(vp9_half_vert_variance16x_h_sse2):
1040 push rbp
1041 mov rbp, rsp
1042 SHADOW_ARGS_TO_STACK 7
1043 SAVE_XMM 7
1044 GET_GOT rbx
1045 push rsi
1046 push rdi
1047 ; end prolog
1048
1049 pxor xmm6, xmm6 ; error accumulator
1050 pxor xmm7, xmm7 ; sse accumulator
1051 mov rsi, arg(0) ;ref_ptr
1052
1053 mov rdi, arg(2) ;src_ptr
1054 movsxd rcx, dword ptr arg(4) ;Height
1055 movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
1056 movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
1057
1058 movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = first ref row
1059 lea rsi, [rsi + rax ]
1060 pxor xmm0, xmm0 ; xmm0 = 0, used to widen bytes to words
1061
1062 .half_vert_variance16x_h_1:
1063 movdqu xmm3, XMMWORD PTR [rsi] ; xmm3 = next ref row
1064
1065 pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3)
1066 movdqa xmm4, xmm5 ; copy so the high 8 pixels can be widened separately
1067 punpcklbw xmm5, xmm0 ; xmm5 = low 8 pixels as words
1068 punpckhbw xmm4, xmm0 ; xmm4 = high 8 pixels as words
1069
1070 movq xmm2, QWORD PTR [rdi] ; src pixels 0..7
1071 punpcklbw xmm2, xmm0
1072 psubw xmm5, xmm2
1073 movq xmm2, QWORD PTR [rdi+8] ; src pixels 8..15
1074 punpcklbw xmm2, xmm0
1075 psubw xmm4, xmm2
1076
1077 paddw xmm6, xmm5 ; xmm6 += accumulated column differences
1078 paddw xmm6, xmm4
1079 pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
1080 pmaddwd xmm4, xmm4
1081 paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
1082 paddd xmm7, xmm4
1083
1084 movdqa xmm5, xmm3 ; the row just read becomes the upper row of the next pair
1085
1086 lea rsi, [rsi + rax]
1087 lea rdi, [rdi + rdx]
1088
1089 sub rcx, 1
1090 jnz .half_vert_variance16x_h_1
1091
1092 pxor xmm1, xmm1
1093 pxor xmm5, xmm5
1094
1095 punpcklwd xmm0, xmm6 ; xmm0 is still zero: each word lands in the upper half of a dword
1096 punpckhwd xmm1, xmm6
1097 psrad xmm0, 16 ; arithmetic shift sign-extends the word sums to dwords
1098 psrad xmm1, 16
1099 paddd xmm0, xmm1
1100 movdqa xmm1, xmm0
1101
1102 movdqa xmm6, xmm7
1103 punpckldq xmm6, xmm5 ; interleave the squared-difference dwords with zero
1104 punpckhdq xmm7, xmm5
1105 paddd xmm6, xmm7
1106
1107 punpckldq xmm0, xmm5 ; same folding for the difference sums
1108 punpckhdq xmm1, xmm5
1109 paddd xmm0, xmm1
1110
1111 movdqa xmm7, xmm6
1112 movdqa xmm1, xmm0
1113
1114 psrldq xmm7, 8 ; bring the high qword down
1115 psrldq xmm1, 8
1116
1117 paddd xmm6, xmm7 ; low dword of xmm6 = total of squared differences
1118 paddd xmm0, xmm1 ; low dword of xmm0 = total of differences
1119
1120 mov rsi, arg(5) ;[Sum]
1121 mov rdi, arg(6) ;[SSE]
1122
1123 movd [rsi], xmm0
1124 movd [rdi], xmm6
1125
1126 ; begin epilog
1127 pop rdi
1128 pop rsi
1129 RESTORE_GOT
1130 RESTORE_XMM
1131 UNSHADOW_ARGS
1132 pop rbp
1133 ret
1134
1135 632
1136 ;void vp9_half_horiz_variance8x_h_sse2 633 ;void vp9_half_horiz_variance8x_h_sse2
1137 ;( 634 ;(
1138 ; unsigned char *ref_ptr, 635 ; unsigned char *ref_ptr,
1139 ; int ref_pixels_per_line, 636 ; int ref_pixels_per_line,
1140 ; unsigned char *src_ptr, 637 ; unsigned char *src_ptr,
1141 ; int src_pixels_per_line, 638 ; int src_pixels_per_line,
1142 ; unsigned int Height, 639 ; unsigned int Height,
1143 ; int *sum, 640 ; int *sum,
1144 ; unsigned int *sumsquared 641 ; unsigned int *sumsquared
(...skipping 86 matching lines...) Expand 10 before | Expand all | Expand 10 after
1231 728
1232 ; begin epilog 729 ; begin epilog
1233 pop rdi 730 pop rdi
1234 pop rsi 731 pop rsi
1235 RESTORE_GOT 732 RESTORE_GOT
1236 RESTORE_XMM 733 RESTORE_XMM
1237 UNSHADOW_ARGS 734 UNSHADOW_ARGS
1238 pop rbp 735 pop rbp
1239 ret 736 ret
1240 737
1241 ;void vp9_half_horiz_variance16x_h_sse2
1242 ;(
1243 ; unsigned char *ref_ptr,
1244 ; int ref_pixels_per_line,
1245 ; unsigned char *src_ptr,
1246 ; int src_pixels_per_line,
1247 ; unsigned int Height,
1248 ; int *sum,
1249 ; unsigned int *sumsquared
1250 ;)
; For 16-pixel-wide rows: averages each ref pixel with its right neighbour
; (pavgb), subtracts the src row, and stores the sum of differences to *sum and
; the sum of squared differences to *sumsquared. Reads 17 bytes per ref row.
1251 global sym(vp9_half_horiz_variance16x_h_sse2) PRIVATE
1252 sym(vp9_half_horiz_variance16x_h_sse2):
1253 push rbp
1254 mov rbp, rsp
1255 SHADOW_ARGS_TO_STACK 7
1256 SAVE_XMM 7
1257 GET_GOT rbx
1258 push rsi
1259 push rdi
1260 ; end prolog
1261
1262 pxor xmm6, xmm6 ; error accumulator
1263 pxor xmm7, xmm7 ; sse accumulator
1264 mov rsi, arg(0) ;ref_ptr ;
1265
1266 mov rdi, arg(2) ;src_ptr ;
1267 movsxd rcx, dword ptr arg(4) ;Height ;
1268 movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
1269 movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
1270
1271 pxor xmm0, xmm0 ; xmm0 = 0, used to widen bytes to words
1272
1273 .half_horiz_variance16x_h_1:
1274 movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2..s15
1275 movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16
1276
1277 pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3)
1278 movdqa xmm1, xmm5 ; copy so the high 8 pixels can be widened separately
1279 punpcklbw xmm5, xmm0 ; xmm5 = words of above (low 8 pixels)
1280 punpckhbw xmm1, xmm0 ; xmm1 = words of above (high 8 pixels)
1281
1282 movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
1283 punpcklbw xmm3, xmm0 ; xmm3 = words of above
1284 movq xmm2, QWORD PTR [rdi+8] ; xmm2 = d8..d15
1285 punpcklbw xmm2, xmm0
1286
1287 psubw xmm5, xmm3 ; xmm5 -= xmm3
1288 psubw xmm1, xmm2 ; xmm1 -= xmm2
1289 paddw xmm6, xmm5 ; xmm6 += accumulated column differences
1290 paddw xmm6, xmm1
1291 pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
1292 pmaddwd xmm1, xmm1
1293 paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
1294 paddd xmm7, xmm1
1295
1296 lea rsi, [rsi + rax]
1297 lea rdi, [rdi + rdx]
1298
1299 sub rcx, 1 ;
1300 jnz .half_horiz_variance16x_h_1 ;
1301
1302 pxor xmm1, xmm1
1303 pxor xmm5, xmm5
1304
1305 punpcklwd xmm0, xmm6 ; xmm0 is still zero: each word lands in the upper half of a dword
1306 punpckhwd xmm1, xmm6
1307 psrad xmm0, 16 ; arithmetic shift sign-extends the word sums to dwords
1308 psrad xmm1, 16
1309 paddd xmm0, xmm1
1310 movdqa xmm1, xmm0
1311
1312 movdqa xmm6, xmm7
1313 punpckldq xmm6, xmm5 ; interleave the squared-difference dwords with zero
1314 punpckhdq xmm7, xmm5
1315 paddd xmm6, xmm7
1316
1317 punpckldq xmm0, xmm5 ; same folding for the difference sums
1318 punpckhdq xmm1, xmm5
1319 paddd xmm0, xmm1
1320
1321 movdqa xmm7, xmm6
1322 movdqa xmm1, xmm0
1323
1324 psrldq xmm7, 8 ; bring the high qword down
1325 psrldq xmm1, 8
1326
1327 paddd xmm6, xmm7 ; low dword of xmm6 = total of squared differences
1328 paddd xmm0, xmm1 ; low dword of xmm0 = total of differences
1329
1330 mov rsi, arg(5) ;[Sum]
1331 mov rdi, arg(6) ;[SSE]
1332
1333 movd [rsi], xmm0
1334 movd [rdi], xmm6
1335
1336 ; begin epilog
1337 pop rdi
1338 pop rsi
1339 RESTORE_GOT
1340 RESTORE_XMM
1341 UNSHADOW_ARGS
1342 pop rbp
1343 ret
1344 738
1345 SECTION_RODATA 739 SECTION_RODATA
1346 ; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64}; 740 ; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
1347 align 16 741 align 16
1348 xmm_bi_rd: 742 xmm_bi_rd: ; rounding constant: eight words of 64
1349 times 8 dw 64 743 times 8 dw 64
1350 align 16 744 align 16
1351 bilinear_filters_sse2: 745 bilinear_filters_sse2: ; 16 entries of 32 bytes: eight copies of the first tap, then eight copies of the second tap; each pair sums to 128
1352 dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 746 dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0
1353 dw 120, 120, 120, 120, 120, 120, 120, 120, 8, 8, 8, 8, 8, 8, 8, 8 747 dw 120, 120, 120, 120, 120, 120, 120, 120, 8, 8, 8, 8, 8, 8, 8, 8
1354 dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 748 dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
1355 dw 104, 104, 104, 104, 104, 104, 104, 104, 24, 24, 24, 24, 24, 24, 24, 24 749 dw 104, 104, 104, 104, 104, 104, 104, 104, 24, 24, 24, 24, 24, 24, 24, 24
1356 dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 750 dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
1357 dw 88, 88, 88, 88, 88, 88, 88, 88, 40, 40, 40, 40, 40, 40, 40, 40 751 dw 88, 88, 88, 88, 88, 88, 88, 88, 40, 40, 40, 40, 40, 40, 40, 40
1358 dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 752 dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
1359 dw 72, 72, 72, 72, 72, 72, 72, 72, 56, 56, 56, 56, 56, 56, 56, 56 753 dw 72, 72, 72, 72, 72, 72, 72, 72, 56, 56, 56, 56, 56, 56, 56, 56
1360 dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 754 dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
1361 dw 56, 56, 56, 56, 56, 56, 56, 56, 72, 72, 72, 72, 72, 72, 72, 72 755 dw 56, 56, 56, 56, 56, 56, 56, 56, 72, 72, 72, 72, 72, 72, 72, 72
1362 dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 756 dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
1363 dw 40, 40, 40, 40, 40, 40, 40, 40, 88, 88, 88, 88, 88, 88, 88, 88 757 dw 40, 40, 40, 40, 40, 40, 40, 40, 88, 88, 88, 88, 88, 88, 88, 88
1364 dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 758 dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
1365 dw 24, 24, 24, 24, 24, 24, 24, 24, 104, 104, 104, 104, 104, 104, 104, 104 759 dw 24, 24, 24, 24, 24, 24, 24, 24, 104, 104, 104, 104, 104, 104, 104, 104
1366 dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 760 dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
1367 dw 8, 8, 8, 8, 8, 8, 8, 8, 120, 120, 120, 120, 120, 120, 120, 120 761 dw 8, 8, 8, 8, 8, 8, 8, 8, 120, 120, 120, 120, 120, 120, 120, 120
OLDNEW
« no previous file with comments | « source/libvpx/vp9/encoder/vp9_variance_c.c ('k') | source/libvpx/vp9/encoder/x86/vp9_x86_csystemdependent.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698