| OLD | NEW |
| 1 ; | 1 ; |
| 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| 3 ; | 3 ; |
| 4 ; Use of this source code is governed by a BSD-style license | 4 ; Use of this source code is governed by a BSD-style license |
| 5 ; that can be found in the LICENSE file in the root of the source | 5 ; that can be found in the LICENSE file in the root of the source |
| 6 ; tree. An additional intellectual property rights grant can be found | 6 ; tree. An additional intellectual property rights grant can be found |
| 7 ; in the file PATENTS. All contributing project authors may | 7 ; in the file PATENTS. All contributing project authors may |
| 8 ; be found in the AUTHORS file in the root of the source tree. | 8 ; be found in the AUTHORS file in the root of the source tree. |
| 9 ; | 9 ; |
| 10 | 10 |
| 11 %include "vpx_ports/x86_abi_support.asm" | 11 %include "vpx_ports/x86_abi_support.asm" |
| 12 | 12 |
| 13 ; TABULATE_SSIM - adds the pixels held in xmm3 (s) and xmm4 (r), as 16-bit lanes, to the running sums: xmm15 = sum_s, xmm14 = sum_r, xmm13 = sum_sq_s, xmm12 = sum_sq_r, xmm11 = sum_sxr. Clobbers xmm1, xmm2 and xmm3. | 13 ; TABULATE_SSIM - adds the pixels held in xmm3 (s) and xmm4 (r), as 16-bit lanes, to the running sums: xmm15 = sum_s, xmm14 = sum_r, xmm13 = sum_sq_s, xmm12 = sum_sq_r, xmm11 = sum_sxr. Clobbers xmm1, xmm2 and xmm3. |
| 14 %macro TABULATE_SSIM 0 | 14 %macro TABULATE_SSIM 0 |
| 15 paddusw xmm15, xmm3 ; sum_s += s (unsigned saturating add per 16-bit lane) | 15 paddusw xmm15, xmm3 ; sum_s += s (unsigned saturating add per 16-bit lane) |
| 16 paddusw xmm14, xmm4 ; sum_r += r (unsigned saturating add per 16-bit lane) | 16 paddusw xmm14, xmm4 ; sum_r += r (unsigned saturating add per 16-bit lane) |
| 17 movdqa xmm1, xmm3 | 17 movdqa xmm1, xmm3 |
| 18 pmaddwd xmm1, xmm1 | 18 pmaddwd xmm1, xmm1 |
| 19 paddq xmm13, xmm1 ; sum_sq_s | 19 paddd xmm13, xmm1 ; sum_sq_s += s*s (pmaddwd pair sums); added per 32-bit lane to match pmaddwd's dword results |
| 20 movdqa xmm2, xmm4 | 20 movdqa xmm2, xmm4 |
| 21 pmaddwd xmm2, xmm2 | 21 pmaddwd xmm2, xmm2 |
| 22 paddq xmm12, xmm2 ; sum_sq_r | 22 paddd xmm12, xmm2 ; sum_sq_r += r*r (pmaddwd pair sums); added per 32-bit lane to match pmaddwd's dword results |
| 23 pmaddwd xmm3, xmm4 | 23 pmaddwd xmm3, xmm4 |
| 24 paddq xmm11, xmm3 ; sum_sxr | 24 paddd xmm11, xmm3 ; sum_sxr += s*r (pmaddwd pair sums); added per 32-bit lane; xmm3 no longer holds s after this |
| 25 %endmacro | 25 %endmacro |
| 26 | 26 |
| 27 ; Sum across the register %1 starting with q words | 27 ; Sum across the register %1 starting with q words |
| 28 %macro SUM_ACROSS_Q 1 | 28 %macro SUM_ACROSS_Q 1 |
| 29 movdqa xmm2,%1 | 29 movdqa xmm2,%1 |
| 30 punpckldq %1,xmm0 | 30 punpckldq %1,xmm0 |
| 31 punpckhdq xmm2,xmm0 | 31 punpckhdq xmm2,xmm0 |
| 32 paddq %1,xmm2 | 32 paddq %1,xmm2 |
| 33 movdqa xmm2,%1 | 33 movdqa xmm2,%1 |
| 34 punpcklqdq %1,xmm0 | 34 punpcklqdq %1,xmm0 |
| (...skipping 24 matching lines...) Expand all Loading... |
| 59 ; ( calling app will initialize to 0 ) could easily fit everything in sse2 | 59 ; ( calling app will initialize to 0 ) could easily fit everything in sse2 |
| 60 ; without too much hassle, and can probably do better estimates with psadbw | 60 ; without too much hassle, and can probably do better estimates with psadbw |
| 61 ; or pavgb. At this point this is just meant to be first pass for calculating | 61 ; or pavgb. At this point this is just meant to be first pass for calculating |
| 62 ; all the parms needed for 16x16 ssim so we can play with dssim as distortion | 62 ; all the parms needed for 16x16 ssim so we can play with dssim as distortion |
| 63 ; in mode selection code. | 63 ; in mode selection code. |
| 64 global sym(vp8_ssim_parms_16x16_sse3) | 64 global sym(vp8_ssim_parms_16x16_sse3) |
| 65 sym(vp8_ssim_parms_16x16_sse3): | 65 sym(vp8_ssim_parms_16x16_sse3): |
| 66 push rbp | 66 push rbp |
| 67 mov rbp, rsp | 67 mov rbp, rsp |
| 68 SHADOW_ARGS_TO_STACK 9 | 68 SHADOW_ARGS_TO_STACK 9 |
| 69 SAVE_XMM 15 |
| 69 push rsi | 70 push rsi |
| 70 push rdi | 71 push rdi |
| 71 ; end prolog | 72 ; end prolog |
| 72 | 73 |
| 73 mov rsi, arg(0) ;s | 74 mov rsi, arg(0) ;s |
| 74 mov rcx, arg(1) ;sp | 75 mov rcx, arg(1) ;sp |
| 75 mov rdi, arg(2) ;r | 76 mov rdi, arg(2) ;r |
| 76 mov rax, arg(3) ;rp | 77 mov rax, arg(3) ;rp |
| 77 | 78 |
| 78 pxor xmm0, xmm0 | 79 pxor xmm0, xmm0 |
| (...skipping 29 matching lines...) Expand all Loading... |
| 108 dec rdx ; counter | 109 dec rdx ; counter |
| 109 jnz NextRow | 110 jnz NextRow |
| 110 | 111 |
| 111 SUM_ACROSS_W xmm15 | 112 SUM_ACROSS_W xmm15 |
| 112 SUM_ACROSS_W xmm14 | 113 SUM_ACROSS_W xmm14 |
| 113 SUM_ACROSS_Q xmm13 | 114 SUM_ACROSS_Q xmm13 |
| 114 SUM_ACROSS_Q xmm12 | 115 SUM_ACROSS_Q xmm12 |
| 115 SUM_ACROSS_Q xmm11 | 116 SUM_ACROSS_Q xmm11 |
| 116 | 117 |
| 117 mov rdi,arg(4) | 118 mov rdi,arg(4) |
| 118 movq [rdi], xmm15; | 119 movd [rdi], xmm15; |
| 119 mov rdi,arg(5) | 120 mov rdi,arg(5) |
| 120 movq [rdi], xmm14; | 121 movd [rdi], xmm14; |
| 121 mov rdi,arg(6) | 122 mov rdi,arg(6) |
| 122 movq [rdi], xmm13; | 123 movd [rdi], xmm13; |
| 123 mov rdi,arg(7) | 124 mov rdi,arg(7) |
| 124 movq [rdi], xmm12; | 125 movd [rdi], xmm12; |
| 125 mov rdi,arg(8) | 126 mov rdi,arg(8) |
| 126 movq [rdi], xmm11; | 127 movd [rdi], xmm11; |
| 127 | 128 |
| 128 ; begin epilog | 129 ; begin epilog |
| 129 pop rdi | 130 pop rdi |
| 130 pop rsi | 131 pop rsi |
| 132 RESTORE_XMM |
| 131 UNSHADOW_ARGS | 133 UNSHADOW_ARGS |
| 132 pop rbp | 134 pop rbp |
| 133 ret | 135 ret |
| 134 | 136 |
| 135 ;void vp8_ssim_parms_8x8_sse3( | 137 ;void vp8_ssim_parms_8x8_sse3( |
| 136 ; unsigned char *s, - source block; 8 bytes are read per row | 138 ; unsigned char *s, - source block; 8 bytes are read per row |
| 137 ; int sp, - added to s after each row | 139 ; int sp, - added to s after each row |
| 138 ; unsigned char *r, - reference block; 8 bytes are read per row | 140 ; unsigned char *r, - reference block; 8 bytes are read per row |
| 139 ; int rp, - added to r after each row | 141 ; int rp, - added to r after each row |
| 140 ; unsigned long *sum_s, - out: written from xmm15 | 142 ; unsigned long *sum_s, - out: written from xmm15 |
| 141 ; unsigned long *sum_r, - out: written from xmm14 | 143 ; unsigned long *sum_r, - out: written from xmm14 |
| 142 ; unsigned long *sum_sq_s, - out: written from xmm13 | 144 ; unsigned long *sum_sq_s, - out: written from xmm13 |
| 143 ; unsigned long *sum_sq_r, - out: written from xmm12 | 145 ; unsigned long *sum_sq_r, - out: written from xmm12 |
| 144 ; unsigned long *sum_sxr); - out: written from xmm11 | 146 ; unsigned long *sum_sxr); - out: written from xmm11 |
| 145 ; | 147 ; NOTE(review): each result is stored with movd (32 bits); where unsigned long is 64 bits the upper half of each output is not written here - confirm callers zero-initialise them. |
| 146 ; TODO: Use parm passing through structure, probably don't need the pxors | 148 ; TODO: Use parm passing through structure, probably don't need the pxors |
| 147 ; ( calling app will initialize to 0 ) could easily fit everything in sse2 | 149 ; ( calling app will initialize to 0 ) could easily fit everything in sse2 |
| 148 ; without too much hassle, and can probably do better estimates with psadbw | 150 ; without too much hassle, and can probably do better estimates with psadbw |
| 149 ; or pavgb. At this point this is just meant to be first pass for calculating | 151 ; or pavgb. At this point this is just meant to be first pass for calculating |
| 150 ; all the parms needed for 16x16 ssim so we can play with dssim as distortion | 152 ; all the parms needed for 16x16 ssim so we can play with dssim as distortion |
| 151 ; in mode selection code. | 153 ; in mode selection code. |
| 152 global sym(vp8_ssim_parms_8x8_sse3) | 154 global sym(vp8_ssim_parms_8x8_sse3) |
| 153 sym(vp8_ssim_parms_8x8_sse3): | 155 sym(vp8_ssim_parms_8x8_sse3): |
| 154 push rbp | 156 push rbp |
| 155 mov rbp, rsp | 157 mov rbp, rsp |
| 156 SHADOW_ARGS_TO_STACK 9 | 158 SHADOW_ARGS_TO_STACK 9 |
| 159 SAVE_XMM 15 |
| 157 push rsi | 160 push rsi |
| 158 push rdi | 161 push rdi |
| 159 ; end prolog | 162 ; end prolog. NOTE(review): SAVE_XMM 15 / RESTORE_XMM come from x86_abi_support.asm (not shown); presumably they preserve the xmm registers this routine overwrites (xmm11-xmm15) where the ABI requires it - confirm. |
| 160 | 163 |
| 161 mov rsi, arg(0) ;rsi = s | 164 mov rsi, arg(0) ;rsi = s |
| 162 mov rcx, arg(1) ;rcx = sp. NOTE(review): sp is declared int but read as a full 64-bit value - confirm the upper bits are valid | 165 mov rcx, arg(1) ;rcx = sp. NOTE(review): sp is declared int but read as a full 64-bit value - confirm the upper bits are valid |
| 163 mov rdi, arg(2) ;rdi = r | 166 mov rdi, arg(2) ;rdi = r |
| 164 mov rax, arg(3) ;rax = rp. NOTE(review): rp is declared int but read as a full 64-bit value - confirm the upper bits are valid | 167 mov rax, arg(3) ;rax = rp. NOTE(review): rp is declared int but read as a full 64-bit value - confirm the upper bits are valid |
| 165 | 168 |
| 166 pxor xmm0, xmm0 | 169 pxor xmm0, xmm0 |
| 167 pxor xmm15,xmm15 ;sum_s accumulator = 0 | 170 pxor xmm15,xmm15 ;sum_s accumulator = 0 |
| 168 pxor xmm14,xmm14 ;sum_r accumulator = 0 | 171 pxor xmm14,xmm14 ;sum_r accumulator = 0 |
| 169 pxor xmm13,xmm13 ;sum_sq_s accumulator = 0 | 172 pxor xmm13,xmm13 ;sum_sq_s accumulator = 0 |
| 170 pxor xmm12,xmm12 ;sum_sq_r accumulator = 0 | 173 pxor xmm12,xmm12 ;sum_sq_r accumulator = 0 |
| 171 pxor xmm11,xmm11 ;sum_sxr accumulator = 0 | 174 pxor xmm11,xmm11 ;sum_sxr accumulator = 0 |
| 172 | 175 |
| 173 mov rdx, 8 ;row counter: 8 rows | 176 mov rdx, 8 ;row counter: 8 rows |
| 174 NextRow2: | 177 NextRow2: |
| 175 | 178 |
| 176 ;load 8 source and 8 reference pixels (one 64-bit movq each) | 179 ;load 8 source and 8 reference pixels (one 64-bit movq each) |
| 177 movq xmm5, [rsi] | 180 movq xmm3, [rsi] |
| 178 movq xmm6, [rdi] | 181 movq xmm4, [rdi] |
| 179 | |
| 180 movdqa xmm3, xmm5 | |
| 181 movdqa xmm4, xmm6 | |
| 182 punpcklbw xmm3, xmm0 ; low_s: bytes widened to 16-bit lanes by interleaving with the zero register xmm0 | 182 punpcklbw xmm3, xmm0 ; low_s: bytes widened to 16-bit lanes by interleaving with the zero register xmm0 |
| 183 punpcklbw xmm4, xmm0 ; low_r: bytes widened to 16-bit lanes by interleaving with the zero register xmm0 | 183 punpcklbw xmm4, xmm0 ; low_r: bytes widened to 16-bit lanes by interleaving with the zero register xmm0 |
| 184 | 184 |
| 185 TABULATE_SSIM | 185 TABULATE_SSIM |
| 186 | 186 |
| 187 add rsi, rcx ; next s row | 187 add rsi, rcx ; next s row |
| 188 add rdi, rax ; next r row | 188 add rdi, rax ; next r row |
| 189 | 189 |
| 190 dec rdx ; counter | 190 dec rdx ; counter |
| 191 jnz NextRow2 | 191 jnz NextRow2 |
| 192 | 192 |
| 193 SUM_ACROSS_W xmm15 | 193 SUM_ACROSS_W xmm15 |
| 194 SUM_ACROSS_W xmm14 | 194 SUM_ACROSS_W xmm14 |
| 195 SUM_ACROSS_Q xmm13 | 195 SUM_ACROSS_Q xmm13 |
| 196 SUM_ACROSS_Q xmm12 | 196 SUM_ACROSS_Q xmm12 |
| 197 SUM_ACROSS_Q xmm11 | 197 SUM_ACROSS_Q xmm11 |
| 198 | 198 |
| 199 mov rdi,arg(4) | 199 mov rdi,arg(4) |
| 200 movq [rdi], xmm15; | 200 movd [rdi], xmm15; *sum_s = low 32 bits of xmm15 |
| 201 mov rdi,arg(5) | 201 mov rdi,arg(5) |
| 202 movq [rdi], xmm14; | 202 movd [rdi], xmm14; *sum_r = low 32 bits of xmm14 |
| 203 mov rdi,arg(6) | 203 mov rdi,arg(6) |
| 204 movq [rdi], xmm13; | 204 movd [rdi], xmm13; *sum_sq_s = low 32 bits of xmm13 |
| 205 mov rdi,arg(7) | 205 mov rdi,arg(7) |
| 206 movq [rdi], xmm12; | 206 movd [rdi], xmm12; *sum_sq_r = low 32 bits of xmm12 |
| 207 mov rdi,arg(8) | 207 mov rdi,arg(8) |
| 208 movq [rdi], xmm11; | 208 movd [rdi], xmm11; *sum_sxr = low 32 bits of xmm11 |
| 209 | 209 |
| 210 ; begin epilog | 210 ; begin epilog |
| 211 pop rdi | 211 pop rdi |
| 212 pop rsi | 212 pop rsi |
| 213 RESTORE_XMM |
| 213 UNSHADOW_ARGS | 214 UNSHADOW_ARGS |
| 214 pop rbp | 215 pop rbp |
| 215 ret | 216 ret |
| OLD | NEW |