| OLD | NEW | 
|---|---|
| 1 ; | 1 ; | 
| 2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 
| 3 ; | 3 ; | 
| 4 ;  Use of this source code is governed by a BSD-style license | 4 ;  Use of this source code is governed by a BSD-style license | 
| 5 ;  that can be found in the LICENSE file in the root of the source | 5 ;  that can be found in the LICENSE file in the root of the source | 
| 6 ;  tree. An additional intellectual property rights grant can be found | 6 ;  tree. An additional intellectual property rights grant can be found | 
| 7 ;  in the file PATENTS.  All contributing project authors may | 7 ;  in the file PATENTS.  All contributing project authors may | 
| 8 ;  be found in the AUTHORS file in the root of the source tree. | 8 ;  be found in the AUTHORS file in the root of the source tree. | 
| 9 ; | 9 ; | 
| 10 | 10 | 
| 11 %include "vpx_ports/x86_abi_support.asm" | 11 %include "vpx_ports/x86_abi_support.asm" | 
| 12 | 12 | 
| 13 ; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr | 13 ; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr | 
| 14 %macro TABULATE_SSIM 0 | 14 %macro TABULATE_SSIM 0 | 
| 15         paddusw         xmm15, xmm3  ; sum_s | 15         paddusw         xmm15, xmm3  ; sum_s | 
| 16         paddusw         xmm14, xmm4  ; sum_r | 16         paddusw         xmm14, xmm4  ; sum_r | 
| 17         movdqa          xmm1, xmm3 | 17         movdqa          xmm1, xmm3 | 
| 18         pmaddwd         xmm1, xmm1 | 18         pmaddwd         xmm1, xmm1 | 
| 19         paddq           xmm13, xmm1 ; sum_sq_s | 19         paddd           xmm13, xmm1 ; sum_sq_s | 
| 20         movdqa          xmm2, xmm4 | 20         movdqa          xmm2, xmm4 | 
| 21         pmaddwd         xmm2, xmm2 | 21         pmaddwd         xmm2, xmm2 | 
| 22         paddq           xmm12, xmm2 ; sum_sq_r | 22         paddd           xmm12, xmm2 ; sum_sq_r | 
| 23         pmaddwd         xmm3, xmm4 | 23         pmaddwd         xmm3, xmm4 | 
| 24         paddq           xmm11, xmm3  ; sum_sxr | 24         paddd           xmm11, xmm3  ; sum_sxr | 
| 25 %endmacro | 25 %endmacro | 
| 26 | 26 | 
| 27 ; Sum across the register %1 starting with q words | 27 ; Sum across the register %1 starting with q words | 
| 28 %macro SUM_ACROSS_Q 1 | 28 %macro SUM_ACROSS_Q 1 | 
| 29         movdqa          xmm2,%1 | 29         movdqa          xmm2,%1 | 
| 30         punpckldq       %1,xmm0 | 30         punpckldq       %1,xmm0 | 
| 31         punpckhdq       xmm2,xmm0 | 31         punpckhdq       xmm2,xmm0 | 
| 32         paddq           %1,xmm2 | 32         paddq           %1,xmm2 | 
| 33         movdqa          xmm2,%1 | 33         movdqa          xmm2,%1 | 
| 34         punpcklqdq      %1,xmm0 | 34         punpcklqdq      %1,xmm0 | 
| (...skipping 24 matching lines...) Expand all  Loading... | 
| 59 ; ( calling app will initialize to 0 ) could easily fit everything in sse2 | 59 ; ( calling app will initialize to 0 ) could easily fit everything in sse2 | 
| 60 ; without too much hassle, and can probably do better estimates with psadbw | 60 ; without too much hassle, and can probably do better estimates with psadbw | 
| 61 ; or pavgb. At this point this is just meant to be a first pass for calculating | 61 ; or pavgb. At this point this is just meant to be a first pass for calculating | 
| 62 ; all the parms needed for 16x16 ssim so we can play with dssim as distortion | 62 ; all the parms needed for 16x16 ssim so we can play with dssim as distortion | 
| 63 ; in mode selection code. | 63 ; in mode selection code. | 
| 64 global sym(vp8_ssim_parms_16x16_sse3) | 64 global sym(vp8_ssim_parms_16x16_sse3) | 
| 65 sym(vp8_ssim_parms_16x16_sse3): | 65 sym(vp8_ssim_parms_16x16_sse3): | 
| 66     push        rbp | 66     push        rbp | 
| 67     mov         rbp, rsp | 67     mov         rbp, rsp | 
| 68     SHADOW_ARGS_TO_STACK 9 | 68     SHADOW_ARGS_TO_STACK 9 | 
|  | 69     SAVE_XMM 15 | 
| 69     push        rsi | 70     push        rsi | 
| 70     push        rdi | 71     push        rdi | 
| 71     ; end prolog | 72     ; end prolog | 
| 72 | 73 | 
| 73     mov             rsi,        arg(0) ;s | 74     mov             rsi,        arg(0) ;s | 
| 74     mov             rcx,        arg(1) ;sp | 75     mov             rcx,        arg(1) ;sp | 
| 75     mov             rdi,        arg(2) ;r | 76     mov             rdi,        arg(2) ;r | 
| 76     mov             rax,        arg(3) ;rp | 77     mov             rax,        arg(3) ;rp | 
| 77 | 78 | 
| 78     pxor            xmm0, xmm0 | 79     pxor            xmm0, xmm0 | 
| (...skipping 29 matching lines...) Expand all  Loading... | 
| 108     dec             rdx        ; counter | 109     dec             rdx        ; counter | 
| 109     jnz NextRow | 110     jnz NextRow | 
| 110 | 111 | 
| 111     SUM_ACROSS_W    xmm15 | 112     SUM_ACROSS_W    xmm15 | 
| 112     SUM_ACROSS_W    xmm14 | 113     SUM_ACROSS_W    xmm14 | 
| 113     SUM_ACROSS_Q    xmm13 | 114     SUM_ACROSS_Q    xmm13 | 
| 114     SUM_ACROSS_Q    xmm12 | 115     SUM_ACROSS_Q    xmm12 | 
| 115     SUM_ACROSS_Q    xmm11 | 116     SUM_ACROSS_Q    xmm11 | 
| 116 | 117 | 
| 117     mov             rdi,arg(4) | 118     mov             rdi,arg(4) | 
| 118     movq            [rdi], xmm15; | 119     movd            [rdi], xmm15; | 
| 119     mov             rdi,arg(5) | 120     mov             rdi,arg(5) | 
| 120     movq            [rdi], xmm14; | 121     movd            [rdi], xmm14; | 
| 121     mov             rdi,arg(6) | 122     mov             rdi,arg(6) | 
| 122     movq            [rdi], xmm13; | 123     movd            [rdi], xmm13; | 
| 123     mov             rdi,arg(7) | 124     mov             rdi,arg(7) | 
| 124     movq            [rdi], xmm12; | 125     movd            [rdi], xmm12; | 
| 125     mov             rdi,arg(8) | 126     mov             rdi,arg(8) | 
| 126     movq            [rdi], xmm11; | 127     movd            [rdi], xmm11; | 
| 127 | 128 | 
| 128     ; begin epilog | 129     ; begin epilog | 
| 129     pop         rdi | 130     pop         rdi | 
| 130     pop         rsi | 131     pop         rsi | 
|  | 132     RESTORE_XMM | 
| 131     UNSHADOW_ARGS | 133     UNSHADOW_ARGS | 
| 132     pop         rbp | 134     pop         rbp | 
| 133     ret | 135     ret | 
| 134 | 136 | 
| 135 ;void vp8_ssim_parms_8x8_sse3( | 137 ;void vp8_ssim_parms_8x8_sse3( | 
| 136 ;    unsigned char *s, | 138 ;    unsigned char *s, | 
| 137 ;    int sp, | 139 ;    int sp, | 
| 138 ;    unsigned char *r, | 140 ;    unsigned char *r, | 
| 139 ;    int rp, | 141 ;    int rp, | 
| 140 ;    unsigned long *sum_s, | 142 ;    unsigned long *sum_s, | 
| 141 ;    unsigned long *sum_r, | 143 ;    unsigned long *sum_r, | 
| 142 ;    unsigned long *sum_sq_s, | 144 ;    unsigned long *sum_sq_s, | 
| 143 ;    unsigned long *sum_sq_r, | 145 ;    unsigned long *sum_sq_r, | 
| 144 ;    unsigned long *sum_sxr); | 146 ;    unsigned long *sum_sxr); | 
| 145 ; | 147 ; | 
| 146 ; TODO: Use parm passing through structure, probably don't need the pxors | 148 ; TODO: Use parm passing through structure, probably don't need the pxors | 
| 147 ; ( calling app will initialize to 0 ) could easily fit everything in sse2 | 149 ; ( calling app will initialize to 0 ) could easily fit everything in sse2 | 
| 148 ; without too much hassle, and can probably do better estimates with psadbw | 150 ; without too much hassle, and can probably do better estimates with psadbw | 
| 149 ; or pavgb. At this point this is just meant to be a first pass for calculating | 151 ; or pavgb. At this point this is just meant to be a first pass for calculating | 
| 150 ; all the parms needed for 16x16 ssim so we can play with dssim as distortion | 152 ; all the parms needed for 16x16 ssim so we can play with dssim as distortion | 
| 151 ; in mode selection code. | 153 ; in mode selection code. | 
| 152 global sym(vp8_ssim_parms_8x8_sse3) | 154 global sym(vp8_ssim_parms_8x8_sse3) | 
| 153 sym(vp8_ssim_parms_8x8_sse3): | 155 sym(vp8_ssim_parms_8x8_sse3): | 
| 154     push        rbp | 156     push        rbp | 
| 155     mov         rbp, rsp | 157     mov         rbp, rsp | 
| 156     SHADOW_ARGS_TO_STACK 9 | 158     SHADOW_ARGS_TO_STACK 9 | 
|  | 159     SAVE_XMM 15 | 
| 157     push        rsi | 160     push        rsi | 
| 158     push        rdi | 161     push        rdi | 
| 159     ; end prolog | 162     ; end prolog | 
| 160 | 163 | 
| 161     mov             rsi,        arg(0) ;s | 164     mov             rsi,        arg(0) ;s | 
| 162     mov             rcx,        arg(1) ;sp | 165     mov             rcx,        arg(1) ;sp | 
| 163     mov             rdi,        arg(2) ;r | 166     mov             rdi,        arg(2) ;r | 
| 164     mov             rax,        arg(3) ;rp | 167     mov             rax,        arg(3) ;rp | 
| 165 | 168 | 
| 166     pxor            xmm0, xmm0 | 169     pxor            xmm0, xmm0 | 
| 167     pxor            xmm15,xmm15  ;sum_s | 170     pxor            xmm15,xmm15  ;sum_s | 
| 168     pxor            xmm14,xmm14  ;sum_r | 171     pxor            xmm14,xmm14  ;sum_r | 
| 169     pxor            xmm13,xmm13  ;sum_sq_s | 172     pxor            xmm13,xmm13  ;sum_sq_s | 
| 170     pxor            xmm12,xmm12  ;sum_sq_r | 173     pxor            xmm12,xmm12  ;sum_sq_r | 
| 171     pxor            xmm11,xmm11  ;sum_sxr | 174     pxor            xmm11,xmm11  ;sum_sxr | 
| 172 | 175 | 
| 173     mov             rdx, 8      ;row counter | 176     mov             rdx, 8      ;row counter | 
| 174 NextRow2: | 177 NextRow2: | 
| 175 | 178 | 
| 176     ;grab source and reference pixels | 179     ;grab source and reference pixels | 
| 177     movq            xmm5, [rsi] | 180     movq            xmm3, [rsi] | 
| 178     movq            xmm6, [rdi] | 181     movq            xmm4, [rdi] | 
| 179 |  | 
| 180     movdqa          xmm3, xmm5 |  | 
| 181     movdqa          xmm4, xmm6 |  | 
| 182     punpcklbw       xmm3, xmm0 ; low_s | 182     punpcklbw       xmm3, xmm0 ; low_s | 
| 183     punpcklbw       xmm4, xmm0 ; low_r | 183     punpcklbw       xmm4, xmm0 ; low_r | 
| 184 | 184 | 
| 185     TABULATE_SSIM | 185     TABULATE_SSIM | 
| 186 | 186 | 
| 187     add             rsi, rcx   ; next s row | 187     add             rsi, rcx   ; next s row | 
| 188     add             rdi, rax   ; next r row | 188     add             rdi, rax   ; next r row | 
| 189 | 189 | 
| 190     dec             rdx        ; counter | 190     dec             rdx        ; counter | 
| 191     jnz NextRow2 | 191     jnz NextRow2 | 
| 192 | 192 | 
| 193     SUM_ACROSS_W    xmm15 | 193     SUM_ACROSS_W    xmm15 | 
| 194     SUM_ACROSS_W    xmm14 | 194     SUM_ACROSS_W    xmm14 | 
| 195     SUM_ACROSS_Q    xmm13 | 195     SUM_ACROSS_Q    xmm13 | 
| 196     SUM_ACROSS_Q    xmm12 | 196     SUM_ACROSS_Q    xmm12 | 
| 197     SUM_ACROSS_Q    xmm11 | 197     SUM_ACROSS_Q    xmm11 | 
| 198 | 198 | 
| 199     mov             rdi,arg(4) | 199     mov             rdi,arg(4) | 
| 200     movq            [rdi], xmm15; | 200     movd            [rdi], xmm15; | 
| 201     mov             rdi,arg(5) | 201     mov             rdi,arg(5) | 
| 202     movq            [rdi], xmm14; | 202     movd            [rdi], xmm14; | 
| 203     mov             rdi,arg(6) | 203     mov             rdi,arg(6) | 
| 204     movq            [rdi], xmm13; | 204     movd            [rdi], xmm13; | 
| 205     mov             rdi,arg(7) | 205     mov             rdi,arg(7) | 
| 206     movq            [rdi], xmm12; | 206     movd            [rdi], xmm12; | 
| 207     mov             rdi,arg(8) | 207     mov             rdi,arg(8) | 
| 208     movq            [rdi], xmm11; | 208     movd            [rdi], xmm11; | 
| 209 | 209 | 
| 210     ; begin epilog | 210     ; begin epilog | 
| 211     pop         rdi | 211     pop         rdi | 
| 212     pop         rsi | 212     pop         rsi | 
|  | 213     RESTORE_XMM | 
| 213     UNSHADOW_ARGS | 214     UNSHADOW_ARGS | 
| 214     pop         rbp | 215     pop         rbp | 
| 215     ret | 216     ret | 
| OLD | NEW | 
|---|