Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(18)

Side by Side Diff: source/libvpx/vpx_dsp/x86/variance_impl_mmx.asm

Issue 1162573005: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 ; 1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ; 3 ;
4 ; Use of this source code is governed by a BSD-style license 4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source 5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found 6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may 7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree. 8 ; be found in the AUTHORS file in the root of the source tree.
9 ; 9 ;
10 10
11 11
12 %include "vpx_ports/x86_abi_support.asm" 12 %include "vpx_ports/x86_abi_support.asm"
13 13
14 ;unsigned int vp8_get_mb_ss_mmx( short *src_ptr ) 14 ;unsigned int vpx_get_mb_ss_mmx( short *src_ptr )
15 global sym(vp8_get_mb_ss_mmx) PRIVATE 15 global sym(vpx_get_mb_ss_mmx) PRIVATE
16 sym(vp8_get_mb_ss_mmx): 16 sym(vpx_get_mb_ss_mmx):
17 push rbp 17 push rbp
18 mov rbp, rsp 18 mov rbp, rsp
19 SHADOW_ARGS_TO_STACK 7 19 SHADOW_ARGS_TO_STACK 7
20 GET_GOT rbx 20 GET_GOT rbx
21 push rsi 21 push rsi
22 push rdi 22 push rdi
23 sub rsp, 8 23 sub rsp, 8
24 ; end prolog 24 ; end prolog
25 25
26 mov rax, arg(0) ;src_ptr 26 mov rax, arg(0) ;src_ptr
(...skipping 29 matching lines...) Expand all
56 ; begin epilog 56 ; begin epilog
57 add rsp, 8 57 add rsp, 8
58 pop rdi 58 pop rdi
59 pop rsi 59 pop rsi
60 RESTORE_GOT 60 RESTORE_GOT
61 UNSHADOW_ARGS 61 UNSHADOW_ARGS
62 pop rbp 62 pop rbp
63 ret 63 ret
64 64
65 65
66 ;unsigned int vp8_get8x8var_mmx 66 ;void vpx_get8x8var_mmx
67 ;( 67 ;(
68 ; unsigned char *src_ptr, 68 ; unsigned char *src_ptr,
69 ; int source_stride, 69 ; int source_stride,
70 ; unsigned char *ref_ptr, 70 ; unsigned char *ref_ptr,
71 ; int recon_stride, 71 ; int recon_stride,
72 ; unsigned int *SSE, 72 ; unsigned int *SSE,
73 ; int *Sum 73 ; int *Sum
74 ;) 74 ;)
75 global sym(vp8_get8x8var_mmx) PRIVATE 75 global sym(vpx_get8x8var_mmx) PRIVATE
76 sym(vp8_get8x8var_mmx): 76 sym(vpx_get8x8var_mmx):
77 push rbp 77 push rbp
78 mov rbp, rsp 78 mov rbp, rsp
79 SHADOW_ARGS_TO_STACK 6 79 SHADOW_ARGS_TO_STACK 6
80 push rsi 80 push rsi
81 push rdi 81 push rdi
82 push rbx 82 push rbx
83 sub rsp, 16 83 sub rsp, 16
84 ; end prolog 84 ; end prolog
85 85
86 86
(...skipping 216 matching lines...) Expand 10 before | Expand all | Expand 10 after
303 add rsp, 16 303 add rsp, 16
304 pop rbx 304 pop rbx
305 pop rdi 305 pop rdi
306 pop rsi 306 pop rsi
307 UNSHADOW_ARGS 307 UNSHADOW_ARGS
308 pop rbp 308 pop rbp
309 ret 309 ret
310 310
311 311
312 312
313 ;unsigned int 313 ;void
314 ;vp8_get4x4var_mmx 314 ;vpx_get4x4var_mmx
315 ;( 315 ;(
316 ; unsigned char *src_ptr, 316 ; unsigned char *src_ptr,
317 ; int source_stride, 317 ; int source_stride,
318 ; unsigned char *ref_ptr, 318 ; unsigned char *ref_ptr,
319 ; int recon_stride, 319 ; int recon_stride,
320 ; unsigned int *SSE, 320 ; unsigned int *SSE,
321 ; int *Sum 321 ; int *Sum
322 ;) 322 ;)
323 global sym(vp8_get4x4var_mmx) PRIVATE 323 global sym(vpx_get4x4var_mmx) PRIVATE
324 sym(vp8_get4x4var_mmx): 324 sym(vpx_get4x4var_mmx):
325 push rbp 325 push rbp
326 mov rbp, rsp 326 mov rbp, rsp
327 SHADOW_ARGS_TO_STACK 6 327 SHADOW_ARGS_TO_STACK 6
328 push rsi 328 push rsi
329 push rdi 329 push rdi
330 push rbx 330 push rbx
331 sub rsp, 16 331 sub rsp, 16
332 ; end prolog 332 ; end prolog
333 333
334 334
(...skipping 80 matching lines...) Expand 10 before | Expand all | Expand 10 after
415 415
416 416
417 ; begin epilog 417 ; begin epilog
418 add rsp, 16 418 add rsp, 16
419 pop rbx 419 pop rbx
420 pop rdi 420 pop rdi
421 pop rsi 421 pop rsi
422 UNSHADOW_ARGS 422 UNSHADOW_ARGS
423 pop rbp 423 pop rbp
424 ret 424 ret
425
426
427
428 ;unsigned int
429 ;vp8_get4x4sse_cs_mmx
430 ;(
431 ; unsigned char *src_ptr, ; arg(0): first block, four bytes read per row
432 ; int source_stride, ; arg(1): byte step between src_ptr rows
433 ; unsigned char *ref_ptr, ; arg(2): second block, four bytes read per row
434 ; int recon_stride ; arg(3): byte step between ref_ptr rows
435 ;)
436 global sym(vp8_get4x4sse_cs_mmx) PRIVATE
437 sym(vp8_get4x4sse_cs_mmx):
438 push rbp
439 mov rbp, rsp
440 SHADOW_ARGS_TO_STACK 4
441 push rsi
442 push rdi
443 push rbx
444 ; end prolog
445 ; Sums (src - ref)^2 over four rows of four bytes each and leaves the total in rax.
446 ; Register roles: rax = src row, rbx = ref row, rcx/rdx = strides, mm6 = zero, mm7 = two dword partial sums.
447 pxor mm6, mm6 ; Blank mm6 (zero source for the byte->word unpacks)
448 pxor mm7, mm7 ; Blank mm7 (squared-difference accumulator)
449
450 mov rax, arg(0) ;[src_ptr] ; Load base addresses
451 mov rbx, arg(2) ;[ref_ptr]
452 movsxd rcx, dword ptr arg(1) ;[source_stride] ; sign-extended to pointer width
453 movsxd rdx, dword ptr arg(3) ;[recon_stride] ; sign-extended to pointer width
454 ; Row 1
455 movd mm0, [rax] ; Load four src bytes into the low half of mm0
456 movd mm1, [rbx] ; Load four ref bytes into the low half of mm1
457 punpcklbw mm0, mm6 ; unpack to higher precision (bytes -> words)
458 punpcklbw mm1, mm6
459 psubsw mm0, mm1 ; A-B per word (signed, saturating) to MM0
460 pmaddwd mm0, mm0 ; square each word and add adjacent pairs -> two dwords
461 add rbx,rdx ; Inc pointer into ref data
462 add rax,rcx ; Inc pointer into the new data
463 movd mm1, [rbx] ; Pre-load the next row's four ref bytes into mm1
464 paddd mm7, mm0 ; accumulate in mm7
465
466 ; Row 2
467 movd mm0, [rax] ; Load four src bytes into the low half of mm0
468 punpcklbw mm0, mm6 ; unpack to higher precision (bytes -> words)
469 punpcklbw mm1, mm6
470 psubsw mm0, mm1 ; A-B per word (signed, saturating) to MM0
471 pmaddwd mm0, mm0 ; square each word and add adjacent pairs -> two dwords
472 add rbx,rdx ; Inc pointer into ref data
473 add rax,rcx ; Inc pointer into the new data
474 movd mm1, [rbx] ; Pre-load the next row's four ref bytes into mm1
475 paddd mm7, mm0 ; accumulate in mm7
476
477 ; Row 3
478 movd mm0, [rax] ; Load four src bytes into the low half of mm0
479 punpcklbw mm1, mm6
480 punpcklbw mm0, mm6 ; unpack to higher precision (bytes -> words)
481 psubsw mm0, mm1 ; A-B per word (signed, saturating) to MM0
482
483 pmaddwd mm0, mm0 ; square each word and add adjacent pairs -> two dwords
484 add rbx,rdx ; Inc pointer into ref data
485 add rax,rcx ; Inc pointer into the new data
486 movd mm1, [rbx] ; Pre-load the last row's four ref bytes into mm1
487 paddd mm7, mm0 ; accumulate in mm7
488
489 ; Row 4
490 movd mm0, [rax] ; Load four src bytes into the low half of mm0
491 punpcklbw mm0, mm6 ; unpack to higher precision (bytes -> words)
492 punpcklbw mm1, mm6
493 psubsw mm0, mm1 ; A-B per word (signed, saturating) to MM0
494 pmaddwd mm0, mm0 ; square each word and add adjacent pairs -> two dwords
495 paddd mm7, mm0 ; accumulate in mm7
496
497 movq mm0, mm7 ; copy both dword partial sums
498 psrlq mm7, 32 ; bring the high partial sum down to the low dword
499
500 paddd mm0, mm7 ; low dword of mm0 = total of all 16 squared differences
501 movq rax, mm0 ; return value; NOTE(review): the high dword of mm0 still holds a partial sum, so only the low 32 bits are the total
502
503
504 ; begin epilog
505 pop rbx
506 pop rdi
507 pop rsi
508 UNSHADOW_ARGS
509 pop rbp
510 ret
511
512 %define mmx_filter_shift 7
513 ; mmx_filter_shift is the right shift applied after each filter pass below; each shift is preceded by adding the mmx_bi_rd words.
514 ;void vp8_filter_block2d_bil4x4_var_mmx
515 ;(
516 ; unsigned char *ref_ptr, ; arg(0): block that gets filtered (five rows are read, each at +0 and +1)
517 ; int ref_pixels_per_line, ; arg(1): byte step between ref_ptr rows
518 ; unsigned char *src_ptr, ; arg(2): block the filtered result is compared with (four rows of four bytes)
519 ; int src_pixels_per_line, ; arg(3): byte step between src_ptr rows
520 ; unsigned short *HFilter, ; arg(4): first-pass taps, read as four words at +0 and four words at +8
521 ; unsigned short *VFilter, ; arg(5): second-pass taps, read with the same layout
522 ; int *sum, ; arg(6): receives the sum of (filtered - src)
523 ; unsigned int *sumsquared ; arg(7): receives the sum of (filtered - src)^2
524 ;)
525 global sym(vp8_filter_block2d_bil4x4_var_mmx) PRIVATE
526 sym(vp8_filter_block2d_bil4x4_var_mmx):
527 push rbp
528 mov rbp, rsp
529 SHADOW_ARGS_TO_STACK 8
530 GET_GOT rbx
531 push rsi
532 push rdi
533 sub rsp, 16
534 ; end prolog
535 ; Register roles: rsi = ref row, rdi = src row, rax = HFilter, rdx = VFilter, rcx = rows left.
536 ; MMX roles: mm0 = zero, mm5 = previous first-pass row, mm6 = word sums of differences, mm7 = dword sums of squares.
537 pxor mm6, mm6 ; clear the difference accumulator
538 pxor mm7, mm7 ; clear the squared-difference accumulator
539
540 mov rax, arg(4) ;HFilter ;
541 mov rdx, arg(5) ;VFilter ;
542
543 mov rsi, arg(0) ;ref_ptr ;
544 mov rdi, arg(2) ;src_ptr ;
545
546 mov rcx, 4 ; loop counter: four output rows
547 pxor mm0, mm0 ; zero register for byte->word unpacking
548
549 movd mm1, [rsi] ; four ref bytes of the first row
550 movd mm3, [rsi+1] ; the same row, one byte further on
551
552 punpcklbw mm1, mm0 ; bytes -> words
553 pmullw mm1, [rax] ; scale by the first horizontal tap
554
555 punpcklbw mm3, mm0 ; bytes -> words
556 pmullw mm3, [rax+8] ; scale by the second horizontal tap
557
558 paddw mm1, mm3 ; combine both taps
559 paddw mm1, [GLOBAL(mmx_bi_rd)] ; add the rounding term (64 in every word)
560
561 psraw mm1, mmx_filter_shift ; scale back down
562 movq mm5, mm1 ; keep as the "previous row" for the first loop iteration
563
564 %if ABI_IS_32BIT
565 add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
566 %else
567 movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
568 add rsi, r8
569 %endif
570
571 .filter_block2d_bil4x4_var_mmx_loop:
572
573 movd mm1, [rsi] ; four ref bytes of the current row
574 movd mm3, [rsi+1] ; the same row, one byte further on
575
576 punpcklbw mm1, mm0 ; bytes -> words
577 pmullw mm1, [rax] ; scale by the first horizontal tap
578
579 punpcklbw mm3, mm0 ; bytes -> words
580 pmullw mm3, [rax+8] ; scale by the second horizontal tap
581
582 paddw mm1, mm3 ; combine both taps
583 paddw mm1, [GLOBAL(mmx_bi_rd)] ; add the rounding term
584
585 psraw mm1, mmx_filter_shift ; mm1 = first-pass result for the current row
586 movq mm3, mm5 ; mm3 = first-pass result for the previous row
587
588 movq mm5, mm1 ; remember the current row for the next iteration
589 pmullw mm3, [rdx] ; previous row * first vertical tap
590
591 pmullw mm1, [rdx+8] ; current row * second vertical tap
592 paddw mm1, mm3 ; combine both taps
593
594
595 paddw mm1, [GLOBAL(mmx_bi_rd)] ; add the rounding term
596 psraw mm1, mmx_filter_shift ; mm1 = second-pass (final) filtered row
597
598 movd mm3, [rdi] ; four src bytes of the matching row
599 punpcklbw mm3, mm0 ; bytes -> words
600
601 psubw mm1, mm3 ; difference = filtered - src
602 paddw mm6, mm1 ; add the differences to the word accumulator
603
604 pmaddwd mm1, mm1 ; square each difference and add adjacent pairs
605 paddd mm7, mm1 ; add to the dword accumulator
606
607 %if ABI_IS_32BIT
608 add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
609 add rdi, dword ptr arg(3) ;src_pixels_per_line ;
610 %else
611 movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
612 movsxd r9, dword ptr arg(3) ;src_pixels_per_line
613 add rsi, r8
614 add rdi, r9
615 %endif
616 sub rcx, 1 ; one row done
617 jnz .filter_block2d_bil4x4_var_mmx_loop ; repeat until all four rows are processed
618
619
620 pxor mm3, mm3 ;
621 pxor mm2, mm2 ;
622
623 punpcklwd mm2, mm6 ; low two word sums -> upper halves of two dwords
624 punpckhwd mm3, mm6 ; high two word sums -> upper halves of two dwords
625
626 paddd mm2, mm3 ; add the two pairs
627 movq mm6, mm2 ;
628
629 psrlq mm6, 32 ; bring the high dword down
630 paddd mm2, mm6 ; low dword = total of the four word sums, still shifted left by 16
631
632 psrad mm2, 16 ; arithmetic shift back, giving a sign-extended total
633 movq mm4, mm7 ;
634
635 psrlq mm4, 32 ; high dword of the squared sums
636 paddd mm4, mm7 ; low dword = total of the squared differences
637
638 mov rdi, arg(6) ;sum
639 mov rsi, arg(7) ;sumsquared
640
641 movd dword ptr [rdi], mm2 ; *sum = total difference
642 movd dword ptr [rsi], mm4 ; *sumsquared = total squared difference
643
644
645
646 ; begin epilog
647 add rsp, 16
648 pop rdi
649 pop rsi
650 RESTORE_GOT
651 UNSHADOW_ARGS
652 pop rbp
653 ret
654
655
656
657
658 ;void vp8_filter_block2d_bil_var_mmx
659 ;(
660 ; unsigned char *ref_ptr, ; arg(0): block that gets filtered (Height + 1 rows are read, each at +0 and +1)
661 ; int ref_pixels_per_line, ; arg(1): byte step between ref_ptr rows
662 ; unsigned char *src_ptr, ; arg(2): block the filtered result is compared with (eight bytes per row)
663 ; int src_pixels_per_line, ; arg(3): byte step between src_ptr rows
664 ; unsigned int Height, ; arg(4): number of output rows; the loop tests its count at the bottom
665 ; unsigned short *HFilter, ; arg(5): first-pass taps, read as four words at +0 and four words at +8
666 ; unsigned short *VFilter, ; arg(6): second-pass taps, read with the same layout
667 ; int *sum, ; arg(7): receives the sum of (filtered - src)
668 ; unsigned int *sumsquared ; arg(8): receives the sum of (filtered - src)^2
669 ;)
670 global sym(vp8_filter_block2d_bil_var_mmx) PRIVATE
671 sym(vp8_filter_block2d_bil_var_mmx):
672 push rbp
673 mov rbp, rsp
674 SHADOW_ARGS_TO_STACK 9
675 GET_GOT rbx
676 push rsi
677 push rdi
678 sub rsp, 16
679 ; end prolog
680 ; MMX roles: mm0 = zero, mm5 = previous first-pass row packed to bytes, mm6 = word sums of differences, mm7 = dword sums of squares.
681 pxor mm6, mm6 ; clear the difference accumulator
682 pxor mm7, mm7 ; clear the squared-difference accumulator
683 mov rax, arg(5) ;HFilter ;
684
685 mov rdx, arg(6) ;VFilter ;
686 mov rsi, arg(0) ;ref_ptr ;
687
688 mov rdi, arg(2) ;src_ptr ;
689 movsxd rcx, dword ptr arg(4) ;Height ; NOTE(review): the loop body runs before the count is tested, so a Height of 0 is not guarded
690
691 pxor mm0, mm0 ; zero register for byte->word unpacking
692 movq mm1, [rsi] ; eight ref bytes of the first row
693
694 movq mm3, [rsi+1] ; the same row, one byte further on
695 movq mm2, mm1 ; copy for the high four bytes
696
697 movq mm4, mm3 ; copy for the high four bytes
698 punpcklbw mm1, mm0 ; low four bytes -> words
699
700 punpckhbw mm2, mm0 ; high four bytes -> words
701 pmullw mm1, [rax] ; low half * first horizontal tap
702
703 pmullw mm2, [rax] ; high half * first horizontal tap
704 punpcklbw mm3, mm0 ; low four shifted bytes -> words
705
706 punpckhbw mm4, mm0 ; high four shifted bytes -> words
707 pmullw mm3, [rax+8] ; low half * second horizontal tap
708
709 pmullw mm4, [rax+8] ; high half * second horizontal tap
710 paddw mm1, mm3 ; combine taps, low half
711
712 paddw mm2, mm4 ; combine taps, high half
713 paddw mm1, [GLOBAL(mmx_bi_rd)] ; add the rounding term (64 in every word)
714
715 psraw mm1, mmx_filter_shift ; scale back down, low half
716 paddw mm2, [GLOBAL(mmx_bi_rd)] ; add the rounding term
717
718 psraw mm2, mmx_filter_shift ; scale back down, high half
719 movq mm5, mm1
720
721 packuswb mm5, mm2 ; mm5 = first-pass row packed back to eight bytes (unsigned saturation)
722 %if ABI_IS_32BIT
723 add rsi, dword ptr arg(1) ;ref_pixels_per_line
724 %else
725 movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
726 add rsi, r8
727 %endif
728
729 .filter_block2d_bil_var_mmx_loop:
730
731 movq mm1, [rsi] ; eight ref bytes of the current row
732 movq mm3, [rsi+1] ; the same row, one byte further on
733
734 movq mm2, mm1 ; copy for the high four bytes
735 movq mm4, mm3 ; copy for the high four bytes
736
737 punpcklbw mm1, mm0 ; low four bytes -> words
738 punpckhbw mm2, mm0 ; high four bytes -> words
739
740 pmullw mm1, [rax] ; low half * first horizontal tap
741 pmullw mm2, [rax] ; high half * first horizontal tap
742
743 punpcklbw mm3, mm0 ; low four shifted bytes -> words
744 punpckhbw mm4, mm0 ; high four shifted bytes -> words
745
746 pmullw mm3, [rax+8] ; low half * second horizontal tap
747 pmullw mm4, [rax+8] ; high half * second horizontal tap
748
749 paddw mm1, mm3 ; combine taps, low half
750 paddw mm2, mm4 ; combine taps, high half
751
752 paddw mm1, [GLOBAL(mmx_bi_rd)] ; add the rounding term
753 psraw mm1, mmx_filter_shift ; mm1 = first-pass result, low half of the current row
754
755 paddw mm2, [GLOBAL(mmx_bi_rd)] ; add the rounding term
756 psraw mm2, mmx_filter_shift ; mm2 = first-pass result, high half of the current row
757
758 movq mm3, mm5 ; fetch the packed previous row
759 movq mm4, mm5 ;
760
761 punpcklbw mm3, mm0 ; previous row, low half -> words
762 punpckhbw mm4, mm0 ; previous row, high half -> words
763
764 movq mm5, mm1 ;
765 packuswb mm5, mm2 ; remember the current row (packed) for the next iteration
766
767 pmullw mm3, [rdx] ; previous row * first vertical tap, low half
768 pmullw mm4, [rdx] ; previous row * first vertical tap, high half
769
770 pmullw mm1, [rdx+8] ; current row * second vertical tap, low half
771 pmullw mm2, [rdx+8] ; current row * second vertical tap, high half
772
773 paddw mm1, mm3 ; combine taps, low half
774 paddw mm2, mm4 ; combine taps, high half
775
776 paddw mm1, [GLOBAL(mmx_bi_rd)] ; add the rounding term
777 paddw mm2, [GLOBAL(mmx_bi_rd)] ; add the rounding term
778
779 psraw mm1, mmx_filter_shift ; mm1 = second-pass (final) result, low half
780 psraw mm2, mmx_filter_shift ; mm2 = second-pass (final) result, high half
781
782 movq mm3, [rdi] ; eight src bytes of the matching row
783 movq mm4, mm3 ;
784
785 punpcklbw mm3, mm0 ; src low half -> words
786 punpckhbw mm4, mm0 ; src high half -> words
787
788 psubw mm1, mm3 ; difference = filtered - src, low half
789 psubw mm2, mm4 ; difference = filtered - src, high half
790
791 paddw mm6, mm1 ; add low-half differences to the word accumulator
792 pmaddwd mm1, mm1 ; square and pair-add the low-half differences
793
794 paddw mm6, mm2 ; add high-half differences to the word accumulator
795 pmaddwd mm2, mm2 ; square and pair-add the high-half differences
796
797 paddd mm7, mm1 ; add to the dword accumulator
798 paddd mm7, mm2 ; add to the dword accumulator
799
800 %if ABI_IS_32BIT
801 add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
802 add rdi, dword ptr arg(3) ;src_pixels_per_line ;
803 %else
804 movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
805 movsxd r9, dword ptr arg(3) ;src_pixels_per_line ;
806 add rsi, r8
807 add rdi, r9
808 %endif
809 sub rcx, 1 ; one row done
810 jnz .filter_block2d_bil_var_mmx_loop ; repeat until the row count reaches zero
811
812
813 pxor mm3, mm3 ;
814 pxor mm2, mm2 ;
815
816 punpcklwd mm2, mm6 ; low two word sums -> upper halves of two dwords
817 punpckhwd mm3, mm6 ; high two word sums -> upper halves of two dwords
818
819 paddd mm2, mm3 ; add the two pairs
820 movq mm6, mm2 ;
821
822 psrlq mm6, 32 ; bring the high dword down
823 paddd mm2, mm6 ; low dword = total of the four word sums, still shifted left by 16
824
825 psrad mm2, 16 ; arithmetic shift back, giving a sign-extended total
826 movq mm4, mm7 ;
827
828 psrlq mm4, 32 ; high dword of the squared sums
829 paddd mm4, mm7 ; low dword = total of the squared differences
830
831 mov rdi, arg(7) ;sum
832 mov rsi, arg(8) ;sumsquared
833
834 movd dword ptr [rdi], mm2 ; *sum = total difference
835 movd dword ptr [rsi], mm4 ; *sumsquared = total squared difference
836
837 ; begin epilog
838 add rsp, 16
839 pop rdi
840 pop rsi
841 RESTORE_GOT
842 UNSHADOW_ARGS
843 pop rbp
844 ret
845
846
847 SECTION_RODATA
848 ;short mmx_bi_rd[4] = { 64, 64, 64, 64};
849 align 16
850 mmx_bi_rd: ; rounding term added to each word before every psraw by mmx_filter_shift
851 times 4 dw 64 ; 64 = 1 << (mmx_filter_shift - 1), i.e. half of the divisor 128
OLDNEW
« no previous file with comments | « source/libvpx/vpx_dsp/x86/variance_impl_avx2.c ('k') | source/libvpx/vpx_dsp/x86/variance_mmx.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698