| OLD | NEW | 
|---|
| 1 ; | 1 ; | 
| 2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 
| 3 ; | 3 ; | 
| 4 ;  Use of this source code is governed by a BSD-style license | 4 ;  Use of this source code is governed by a BSD-style license | 
| 5 ;  that can be found in the LICENSE file in the root of the source | 5 ;  that can be found in the LICENSE file in the root of the source | 
| 6 ;  tree. An additional intellectual property rights grant can be found | 6 ;  tree. An additional intellectual property rights grant can be found | 
| 7 ;  in the file PATENTS.  All contributing project authors may | 7 ;  in the file PATENTS.  All contributing project authors may | 
| 8 ;  be found in the AUTHORS file in the root of the source tree. | 8 ;  be found in the AUTHORS file in the root of the source tree. | 
| 9 ; | 9 ; | 
| 10 | 10 | 
| 11 %include "vpx_ports/x86_abi_support.asm" | 11 %include "vpx_ports/x86_abi_support.asm" | 
| 12 | 12 | 
| 13 %macro STACK_FRAME_CREATE_X3 0 | 13 %macro STACK_FRAME_CREATE_X3 0 | 
| 14 %if ABI_IS_32BIT | 14 %if ABI_IS_32BIT | 
| 15   %define     src_ptr       rsi | 15   %define     src_ptr       rsi | 
| 16   %define     src_stride    rax | 16   %define     src_stride    rax | 
| 17   %define     ref_ptr       rdi | 17   %define     ref_ptr       rdi | 
| 18   %define     ref_stride    rdx | 18   %define     ref_stride    rdx | 
| 19   %define     end_ptr       rcx | 19   %define     end_ptr       rcx | 
| 20   %define     ret_var       rbx | 20   %define     ret_var       rbx | 
| 21   %define     result_ptr    arg(4) | 21   %define     result_ptr    arg(4) | 
| 22   %define     max_err       arg(4) | 22   %define     max_err       arg(4) | 
|  | 23   %define     height        dword ptr arg(4)    ; 5th argument (int), used as a 32-bit memory operand | 
| 23     push        rbp | 24     push        rbp | 
| 24     mov         rbp,        rsp | 25     mov         rbp,        rsp | 
| 25     push        rsi | 26     push        rsi | 
| 26     push        rdi | 27     push        rdi | 
| 27     push        rbx | 28     push        rbx | 
| 28 | 29 | 
| 29     mov         rsi,        arg(0)              ; src_ptr | 30     mov         rsi,        arg(0)              ; src_ptr | 
| 30     mov         rdi,        arg(2)              ; ref_ptr | 31     mov         rdi,        arg(2)              ; ref_ptr | 
| 31 | 32 | 
| 32     movsxd      rax,        dword ptr arg(1)    ; src_stride | 33     movsxd      rax,        dword ptr arg(1)    ; src_stride | 
| 33     movsxd      rdx,        dword ptr arg(3)    ; ref_stride | 34     movsxd      rdx,        dword ptr arg(3)    ; ref_stride | 
| 34 %else | 35 %else | 
| 35   %ifidn __OUTPUT_FORMAT__,x64 | 36   %ifidn __OUTPUT_FORMAT__,x64 | 
|  | 37     SAVE_XMM 7, u                                   ; Win64: xmm6/xmm7 are callee-saved; presumably preserved here, undone by RESTORE_XMM | 
| 36     %define     src_ptr     rcx | 38     %define     src_ptr     rcx | 
| 37     %define     src_stride  rdx | 39     %define     src_stride  rdx | 
| 38     %define     ref_ptr     r8 | 40     %define     ref_ptr     r8 | 
| 39     %define     ref_stride  r9 | 41     %define     ref_stride  r9 | 
| 40     %define     end_ptr     r10 | 42     %define     end_ptr     r10 | 
| 41     %define     ret_var     r11 | 43     %define     ret_var     r11 | 
| 42     %define     result_ptr  [rsp+8+4*8] | 44     %define     result_ptr  [rsp+xmm_stack_space+8+4*8] | 
| 43     %define     max_err     [rsp+8+4*8] | 45     %define     max_err     [rsp+xmm_stack_space+8+4*8] | 
|  | 46     %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]   ; 5th argument: same stack slot as result_ptr/max_err, read as 32 bits | 
| 44   %else | 47   %else | 
| 45     %define     src_ptr     rdi | 48     %define     src_ptr     rdi | 
| 46     %define     src_stride  rsi | 49     %define     src_stride  rsi | 
| 47     %define     ref_ptr     rdx | 50     %define     ref_ptr     rdx | 
| 48     %define     ref_stride  rcx | 51     %define     ref_stride  rcx | 
| 49     %define     end_ptr     r9 | 52     %define     end_ptr     r9 | 
| 50     %define     ret_var     r10 | 53     %define     ret_var     r10 | 
| 51     %define     result_ptr  r8 | 54     %define     result_ptr  r8 | 
| 52     %define     max_err     r8 | 55     %define     max_err     r8 | 
|  | 56     %define     height      r8d                 ; int argument: only the low 32 bits of r8 are defined, so never use the full r8 | 
| 53   %endif | 57   %endif | 
| 54 %endif | 58 %endif | 
| 55 | 59 | 
| 56 %endmacro | 60 %endmacro | 
| 57 | 61 | 
| 58 %macro STACK_FRAME_DESTROY_X3 0 | 62 %macro STACK_FRAME_DESTROY_X3 0 | 
| 59   %define     src_ptr | 63   %define     src_ptr | 
| 60   %define     src_stride | 64   %define     src_stride | 
| 61   %define     ref_ptr | 65   %define     ref_ptr | 
| 62   %define     ref_stride | 66   %define     ref_stride | 
| 63   %define     end_ptr | 67   %define     end_ptr | 
| 64   %define     ret_var | 68   %define     ret_var | 
| 65   %define     result_ptr | 69   %define     result_ptr | 
| 66   %define     max_err | 70   %define     max_err | 
|  | 71   %define     height                            ; reset to empty like the aliases above, so it does not leak past this function | 
| 67 | 72 | 
| 68 %if ABI_IS_32BIT | 73 %if ABI_IS_32BIT | 
| 69     pop         rbx | 74     pop         rbx | 
| 70     pop         rdi | 75     pop         rdi | 
| 71     pop         rsi | 76     pop         rsi | 
| 72     pop         rbp | 77     pop         rbp | 
| 73 %else | 78 %else | 
| 74   %ifidn __OUTPUT_FORMAT__,x64 | 79   %ifidn __OUTPUT_FORMAT__,x64 | 
|  | 80     RESTORE_XMM                                     ; Win64 only: counterpart of SAVE_XMM in STACK_FRAME_CREATE_X3 | 
| 75   %endif | 81   %endif | 
| 76 %endif | 82 %endif | 
| 77     ret | 83     ret | 
| 78 %endmacro | 84 %endmacro | 
| 79 | 85 | 
| 80 %macro STACK_FRAME_CREATE_X4 0 | 86 %macro STACK_FRAME_CREATE_X4 0 | 
| 81 %if ABI_IS_32BIT | 87 %if ABI_IS_32BIT | 
| 82   %define     src_ptr       rsi | 88   %define     src_ptr       rsi | 
| 83   %define     src_stride    rax | 89   %define     src_stride    rax | 
| 84   %define     r0_ptr        rcx | 90   %define     r0_ptr        rcx | 
| (...skipping 14 matching lines...) Expand all  Loading... | 
| 99     LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi | 105     LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi | 
| 100 | 106 | 
| 101     mov         rsi,        arg(0)              ; src_ptr | 107     mov         rsi,        arg(0)              ; src_ptr | 
| 102 | 108 | 
| 103     movsxd      rbx,        dword ptr arg(1)    ; src_stride | 109     movsxd      rbx,        dword ptr arg(1)    ; src_stride | 
| 104     movsxd      rbp,        dword ptr arg(3)    ; ref_stride | 110     movsxd      rbp,        dword ptr arg(3)    ; ref_stride | 
| 105 | 111 | 
| 106     xchg        rbx,        rax | 112     xchg        rbx,        rax | 
| 107 %else | 113 %else | 
| 108   %ifidn __OUTPUT_FORMAT__,x64 | 114   %ifidn __OUTPUT_FORMAT__,x64 | 
|  | 115     SAVE_XMM 7, u | 
| 109     %define     src_ptr     rcx | 116     %define     src_ptr     rcx | 
| 110     %define     src_stride  rdx | 117     %define     src_stride  rdx | 
| 111     %define     r0_ptr      rsi | 118     %define     r0_ptr      rsi | 
| 112     %define     r1_ptr      r10 | 119     %define     r1_ptr      r10 | 
| 113     %define     r2_ptr      r11 | 120     %define     r2_ptr      r11 | 
| 114     %define     r3_ptr      r8 | 121     %define     r3_ptr      r8 | 
| 115     %define     ref_stride  r9 | 122     %define     ref_stride  r9 | 
| 116     %define     result_ptr  [rsp+16+4*8] | 123     %define     result_ptr  [rsp+xmm_stack_space+16+4*8] | 
| 117     push        rsi | 124     push        rsi | 
| 118 | 125 | 
| 119     LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr | 126     LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr | 
| 120   %else | 127   %else | 
| 121     %define     src_ptr     rdi | 128     %define     src_ptr     rdi | 
| 122     %define     src_stride  rsi | 129     %define     src_stride  rsi | 
| 123     %define     r0_ptr      r9 | 130     %define     r0_ptr      r9 | 
| 124     %define     r1_ptr      r10 | 131     %define     r1_ptr      r10 | 
| 125     %define     r2_ptr      r11 | 132     %define     r2_ptr      r11 | 
| 126     %define     r3_ptr      rdx | 133     %define     r3_ptr      rdx | 
| (...skipping 17 matching lines...) Expand all  Loading... | 
| 144   %define     result_ptr | 151   %define     result_ptr | 
| 145 | 152 | 
| 146 %if ABI_IS_32BIT | 153 %if ABI_IS_32BIT | 
| 147     pop         rbx | 154     pop         rbx | 
| 148     pop         rdi | 155     pop         rdi | 
| 149     pop         rsi | 156     pop         rsi | 
| 150     pop         rbp | 157     pop         rbp | 
| 151 %else | 158 %else | 
| 152   %ifidn __OUTPUT_FORMAT__,x64 | 159   %ifidn __OUTPUT_FORMAT__,x64 | 
| 153     pop         rsi | 160     pop         rsi | 
|  | 161     RESTORE_XMM | 
| 154   %endif | 162   %endif | 
| 155 %endif | 163 %endif | 
| 156     ret | 164     ret | 
| 157 %endmacro | 165 %endmacro | 
| 158 | 166 | 
| 159 %macro PROCESS_16X2X3 5 | 167 %macro PROCESS_16X2X3 5 | 
| 160 %if %1==0 | 168 %if %1==0 | 
| 161         movdqa          xmm0,       XMMWORD PTR [%2] | 169         movdqa          xmm0,       XMMWORD PTR [%2] | 
| 162         lddqu           xmm5,       XMMWORD PTR [%3] | 170         lddqu           xmm5,       XMMWORD PTR [%3] | 
| 163         lddqu           xmm6,       XMMWORD PTR [%3+1] | 171         lddqu           xmm6,       XMMWORD PTR [%3+1] | 
| (...skipping 457 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
| 621         sub             end_ptr,     1 | 629         sub             end_ptr,     1 | 
| 622         jne             .vp8_sad16x16_sse3_loop | 630         jne             .vp8_sad16x16_sse3_loop | 
| 623 | 631 | 
| 624         movq            xmm0,       xmm7 | 632         movq            xmm0,       xmm7 | 
| 625         psrldq          xmm7,       8 | 633         psrldq          xmm7,       8 | 
| 626         paddw           xmm0,       xmm7 | 634         paddw           xmm0,       xmm7 | 
| 627         movq            rax,        xmm0 | 635         movq            rax,        xmm0 | 
| 628 | 636 | 
| 629     STACK_FRAME_DESTROY_X3 | 637     STACK_FRAME_DESTROY_X3 | 
| 630 | 638 | 
|  | 639 ;void vp8_copy32xn_sse3( | 
|  | 640 ;    unsigned char *src_ptr, | 
|  | 641 ;    int  src_stride, | 
|  | 642 ;    unsigned char *dst_ptr, | 
|  | 643 ;    int  dst_stride, | 
|  | 644 ;    int height); | 
|  | 645 ; Copies height rows of 32 bytes each from src_ptr to dst_ptr. Loads are | 
|  | 646 ; unaligned (movdqu); stores use movdqa, so dst rows must be 16-byte aligned. | 
|  | 647 global sym(vp8_copy32xn_sse3) | 
|  | 648 sym(vp8_copy32xn_sse3): | 
|  | 649 | 
|  | 650     STACK_FRAME_CREATE_X3 | 
|  | 651 | 
|  | 652         ; Fewer than 4 rows (or none): bypass the 4-row unrolled loop. | 
|  | 653         cmp             height,     4 | 
|  | 654         jl              .copy_tail_rows | 
|  | 655 | 
|  | 656 .block_copy_sse3_loopx4: | 
|  | 657         lea             end_ptr,    [src_ptr+src_stride*2]  ; end_ptr = src row 2 | 
|  | 658         movdqu          xmm0,       XMMWORD PTR [src_ptr] | 
|  | 659         movdqu          xmm1,       XMMWORD PTR [src_ptr + 16] | 
|  | 660         movdqu          xmm2,       XMMWORD PTR [src_ptr + src_stride] | 
|  | 661         movdqu          xmm3,       XMMWORD PTR [src_ptr + src_stride + 16] | 
|  | 662         movdqu          xmm4,       XMMWORD PTR [end_ptr] | 
|  | 663         movdqu          xmm5,       XMMWORD PTR [end_ptr + 16] | 
|  | 664         movdqu          xmm6,       XMMWORD PTR [end_ptr + src_stride] | 
|  | 665         movdqu          xmm7,       XMMWORD PTR [end_ptr + src_stride + 16] | 
|  | 666         lea             src_ptr,    [src_ptr+src_stride*4] | 
|  | 667         lea             end_ptr,    [ref_ptr+ref_stride*2]  ; end_ptr = dst row 2 (ref_ptr/ref_stride hold dst here) | 
|  | 668         movdqa          XMMWORD PTR [ref_ptr], xmm0 | 
|  | 669         movdqa          XMMWORD PTR [ref_ptr + 16], xmm1 | 
|  | 670         movdqa          XMMWORD PTR [ref_ptr + ref_stride], xmm2 | 
|  | 671         movdqa          XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3 | 
|  | 672         movdqa          XMMWORD PTR [end_ptr], xmm4 | 
|  | 673         movdqa          XMMWORD PTR [end_ptr + 16], xmm5 | 
|  | 674         movdqa          XMMWORD PTR [end_ptr + ref_stride], xmm6 | 
|  | 675         movdqa          XMMWORD PTR [end_ptr + ref_stride + 16], xmm7 | 
|  | 676         lea             ref_ptr,    [ref_ptr+ref_stride*4] | 
|  | 677 | 
|  | 678         sub             height,     4 | 
|  | 679         cmp             height,     4 | 
|  | 680         jge             .block_copy_sse3_loopx4 | 
|  | 681 | 
|  | 682 .copy_tail_rows: | 
|  | 683         ; Copy any remaining rows one at a time; stop if none are left. | 
|  | 684         cmp             height,     0 | 
|  | 685         jle             .copy_is_done | 
|  | 686 | 
|  | 687 .block_copy_sse3_loop: | 
|  | 688         movdqu          xmm0,       XMMWORD PTR [src_ptr] | 
|  | 689         movdqu          xmm1,       XMMWORD PTR [src_ptr + 16] | 
|  | 690         lea             src_ptr,    [src_ptr+src_stride] | 
|  | 691         movdqa          XMMWORD PTR [ref_ptr], xmm0 | 
|  | 692         movdqa          XMMWORD PTR [ref_ptr + 16], xmm1 | 
|  | 693         lea             ref_ptr,    [ref_ptr+ref_stride] | 
|  | 694 | 
|  | 695         sub             height,     1 | 
|  | 696         jne             .block_copy_sse3_loop | 
|  | 697 .copy_is_done: | 
|  | 698     STACK_FRAME_DESTROY_X3 | 
|  | 699 | 
| 631 ;void vp8_sad16x16x4d_sse3( | 700 ;void vp8_sad16x16x4d_sse3( | 
| 632 ;    unsigned char *src_ptr, | 701 ;    unsigned char *src_ptr, | 
| 633 ;    int  src_stride, | 702 ;    int  src_stride, | 
| 634 ;    unsigned char *ref_ptr_base, | 703 ;    unsigned char *ref_ptr_base, | 
| 635 ;    int  ref_stride, | 704 ;    int  ref_stride, | 
| 636 ;    int  *results) | 705 ;    int  *results) | 
| 637 global sym(vp8_sad16x16x4d_sse3) | 706 global sym(vp8_sad16x16x4d_sse3) | 
| 638 sym(vp8_sad16x16x4d_sse3): | 707 sym(vp8_sad16x16x4d_sse3): | 
| 639 | 708 | 
| 640     STACK_FRAME_CREATE_X4 | 709     STACK_FRAME_CREATE_X4 | 
| (...skipping 240 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
| 881         punpcklbw       mm2,        mm1 | 950         punpcklbw       mm2,        mm1 | 
| 882 | 951 | 
| 883         movd            [rsi+8],    mm7 | 952         movd            [rsi+8],    mm7 | 
| 884         psadbw          mm2,        mm0 | 953         psadbw          mm2,        mm0 | 
| 885 | 954 | 
| 886         paddw           mm2,        mm6 | 955         paddw           mm2,        mm6 | 
| 887         movd            [rsi+12],   mm2 | 956         movd            [rsi+12],   mm2 | 
| 888 | 957 | 
| 889 | 958 | 
| 890     STACK_FRAME_DESTROY_X4 | 959     STACK_FRAME_DESTROY_X4 | 
|  | 960 | 
| OLD | NEW | 
|---|