OLD | NEW |
(Empty) | |
| 1 ; |
| 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| 3 ; |
| 4 ; Use of this source code is governed by a BSD-style license |
| 5 ; that can be found in the LICENSE file in the root of the source |
| 6 ; tree. An additional intellectual property rights grant can be found |
| 7 ; in the file PATENTS. All contributing project authors may |
| 8 ; be found in the AUTHORS file in the root of the source tree. |
| 9 ; |
| 10 |
| 11 %include "vpx_ports/x86_abi_support.asm" |
| 12 |
;void vpx_half_horiz_vert_variance16x_h_sse2(unsigned char *ref,
;                                            int ref_stride,
;                                            unsigned char *src,
;                                            int src_stride,
;                                            unsigned int height,
;                                            int *sum,
;                                            unsigned int *sumsquared)
;
; For each of `height` rows, builds a 16-pixel prediction from ref by
; averaging horizontally adjacent bytes (pavgb of [p] and [p+1]) and then
; averaging that result with the same quantity from the next ref row.
; The 16 src bytes of the row are subtracted from the prediction; the
; differences are summed into *sum and their squares into *sumsquared.
;
; Memory touched: 17 bytes per ref row, height + 1 ref rows; 16 bytes per
; src row, height rows.
;
; Register roles inside the loop:
;   rsi  = ref row pointer (one row ahead of rdi)   rax = ref_stride
;   rdi  = src row pointer                          rdx = src_stride
;   rcx  = rows remaining
;   xmm0 = all zero (used for byte -> word unpacking)
;   xmm5 = horizontal average of the previous ref row
;   xmm6 = running sum of differences, 8 x 16-bit lanes (paddw wraps; each
;          lane receives two differences in [-255, 255] per row)
;   xmm7 = running sum of squared differences, 4 x 32-bit lanes
;
; The loop is bottom-tested, so height is expected to be non-zero.
global sym(vpx_half_horiz_vert_variance16x_h_sse2) PRIVATE
sym(vpx_half_horiz_vert_variance16x_h_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 7
        SAVE_XMM 7
        GET_GOT     rbx
        push        rsi
        push        rdi
        ; end prolog

        pxor        xmm6, xmm6                      ; sum accumulator = 0
        pxor        xmm7, xmm7                      ; sse accumulator = 0
        mov         rsi, arg(0)                     ; ref

        mov         rdi, arg(2)                     ; src
        movsxd      rcx, dword ptr arg(4)           ; height
        movsxd      rax, dword ptr arg(1)           ; ref_stride
        movsxd      rdx, dword ptr arg(3)           ; src_stride

        pxor        xmm0, xmm0                      ; zero for unpacking

        ; Prime xmm5 with the horizontal average of ref row 0.
        movdqu      xmm5, XMMWORD PTR [rsi]
        movdqu      xmm3, XMMWORD PTR [rsi+1]
        pavgb       xmm5, xmm3                      ; xmm5 = avg(row[x], row[x+1])

        lea         rsi, [rsi + rax]                ; ref -> row 1

.next_row:
        ; Horizontal average of the ref row below the current one.
        movdqu      xmm1, XMMWORD PTR [rsi]
        movdqu      xmm2, XMMWORD PTR [rsi+1]
        pavgb       xmm1, xmm2                      ; xmm1 = avg(xmm1, xmm2)

        pavgb       xmm5, xmm1                      ; vertical average of the two rows

        ; Widen the 16 predicted bytes to words: xmm5 = low 8, xmm4 = high 8.
        movdqa      xmm4, xmm5
        punpcklbw   xmm5, xmm0
        punpckhbw   xmm4, xmm0

        ; Subtract src bytes 0..7 and 8..15 (widened the same way).
        movq        xmm3, QWORD PTR [rdi]
        punpcklbw   xmm3, xmm0
        psubw       xmm5, xmm3

        movq        xmm3, QWORD PTR [rdi+8]
        punpcklbw   xmm3, xmm0
        psubw       xmm4, xmm3

        ; Accumulate the differences, then their squares.
        paddw       xmm6, xmm5
        paddw       xmm6, xmm4
        pmaddwd     xmm5, xmm5                      ; pairwise d*d + d*d -> dwords
        pmaddwd     xmm4, xmm4
        paddd       xmm7, xmm5
        paddd       xmm7, xmm4

        movdqa      xmm5, xmm1                      ; this row's horizontal avg feeds the next row

        lea         rsi, [rsi + rax]
        lea         rdi, [rdi + rdx]

        sub         rcx, 1
        jnz         .next_row

        pxor        xmm1, xmm1
        pxor        xmm5, xmm5                      ; xmm5 = zero from here on

        ; ---- reduce the eight 16-bit sums in xmm6 to one 32-bit value ----
        punpcklwd   xmm0, xmm6                      ; each word lands in the high half of a dword
        punpckhwd   xmm1, xmm6
        psrad       xmm0, 16                        ; arithmetic shift = sign-extend to 32 bits
        psrad       xmm1, 16
        paddd       xmm0, xmm1                      ; 4 partial sums
        movdqa      xmm1, xmm0
        punpckldq   xmm0, xmm5
        punpckhdq   xmm1, xmm5
        paddd       xmm0, xmm1                      ; 2 partial sums (dwords 0 and 2)
        movdqa      xmm1, xmm0
        psrldq      xmm1, 8
        paddd       xmm0, xmm1                      ; low dword = total sum

        ; ---- reduce the four 32-bit squares in xmm7 to one value ----
        movdqa      xmm6, xmm7
        punpckldq   xmm6, xmm5
        punpckhdq   xmm7, xmm5
        paddd       xmm6, xmm7                      ; 2 partial sums (dwords 0 and 2)
        movdqa      xmm7, xmm6
        psrldq      xmm7, 8
        paddd       xmm6, xmm7                      ; low dword = total sse

        mov         rsi, arg(5)                     ; sum
        mov         rdi, arg(6)                     ; sumsquared

        movd        [rsi], xmm0
        movd        [rdi], xmm6

        ; begin epilog
        pop         rdi
        pop         rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret
| 124 |
| 125 |
;void vpx_half_vert_variance16x_h_sse2(unsigned char *ref,
;                                      int ref_stride,
;                                      unsigned char *src,
;                                      int src_stride,
;                                      unsigned int height,
;                                      int *sum,
;                                      unsigned int *sumsquared)
;
; For each of `height` rows, builds a 16-pixel prediction as the pavgb
; (rounded byte average) of a ref row and the ref row below it, subtracts
; the 16 src bytes of the row, and accumulates the differences into *sum
; and their squares into *sumsquared.
;
; Memory touched: 16 bytes per ref row, height + 1 ref rows; 16 bytes per
; src row, height rows.
;
; Register roles inside the loop:
;   rsi  = ref row pointer (one row ahead of rdi)   rax = ref_stride
;   rdi  = src row pointer                          rdx = src_stride
;   rcx  = rows remaining
;   xmm0 = all zero (used for byte -> word unpacking)
;   xmm5 = previous ref row
;   xmm6 = running sum of differences, 8 x 16-bit lanes (paddw wraps)
;   xmm7 = running sum of squared differences, 4 x 32-bit lanes
;
; The loop is bottom-tested, so height is expected to be non-zero.
global sym(vpx_half_vert_variance16x_h_sse2) PRIVATE
sym(vpx_half_vert_variance16x_h_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 7
        SAVE_XMM 7
        GET_GOT     rbx
        push        rsi
        push        rdi
        ; end prolog

        pxor        xmm6, xmm6                      ; sum accumulator = 0
        pxor        xmm7, xmm7                      ; sse accumulator = 0
        mov         rsi, arg(0)                     ; ref

        mov         rdi, arg(2)                     ; src
        movsxd      rcx, dword ptr arg(4)           ; height
        movsxd      rax, dword ptr arg(1)           ; ref_stride
        movsxd      rdx, dword ptr arg(3)           ; src_stride

        movdqu      xmm5, XMMWORD PTR [rsi]         ; ref row 0
        lea         rsi, [rsi + rax]                ; ref -> row 1
        pxor        xmm0, xmm0                      ; zero for unpacking

.next_row:
        movdqu      xmm3, XMMWORD PTR [rsi]         ; ref row below the current one

        pavgb       xmm5, xmm3                      ; xmm5 = avg(xmm5, xmm3)

        ; Widen the 16 predicted bytes to words: xmm5 = low 8, xmm4 = high 8.
        movdqa      xmm4, xmm5
        punpcklbw   xmm5, xmm0
        punpckhbw   xmm4, xmm0

        ; Subtract src bytes 0..7 and 8..15 (widened the same way).
        movq        xmm2, QWORD PTR [rdi]
        punpcklbw   xmm2, xmm0
        psubw       xmm5, xmm2
        movq        xmm2, QWORD PTR [rdi+8]
        punpcklbw   xmm2, xmm0
        psubw       xmm4, xmm2

        ; Accumulate the differences, then their squares.
        paddw       xmm6, xmm5
        paddw       xmm6, xmm4
        pmaddwd     xmm5, xmm5                      ; pairwise d*d + d*d -> dwords
        pmaddwd     xmm4, xmm4
        paddd       xmm7, xmm5
        paddd       xmm7, xmm4

        movdqa      xmm5, xmm3                      ; keep this ref row for the next iteration

        lea         rsi, [rsi + rax]
        lea         rdi, [rdi + rdx]

        sub         rcx, 1
        jnz         .next_row

        pxor        xmm1, xmm1
        pxor        xmm5, xmm5                      ; xmm5 = zero from here on

        ; ---- reduce the eight 16-bit sums in xmm6 to one 32-bit value ----
        punpcklwd   xmm0, xmm6                      ; each word lands in the high half of a dword
        punpckhwd   xmm1, xmm6
        psrad       xmm0, 16                        ; arithmetic shift = sign-extend to 32 bits
        psrad       xmm1, 16
        paddd       xmm0, xmm1                      ; 4 partial sums
        movdqa      xmm1, xmm0
        punpckldq   xmm0, xmm5
        punpckhdq   xmm1, xmm5
        paddd       xmm0, xmm1                      ; 2 partial sums (dwords 0 and 2)
        movdqa      xmm1, xmm0
        psrldq      xmm1, 8
        paddd       xmm0, xmm1                      ; low dword = total sum

        ; ---- reduce the four 32-bit squares in xmm7 to one value ----
        movdqa      xmm6, xmm7
        punpckldq   xmm6, xmm5
        punpckhdq   xmm7, xmm5
        paddd       xmm6, xmm7                      ; 2 partial sums (dwords 0 and 2)
        movdqa      xmm7, xmm6
        psrldq      xmm7, 8
        paddd       xmm6, xmm7                      ; low dword = total sse

        mov         rsi, arg(5)                     ; sum
        mov         rdi, arg(6)                     ; sumsquared

        movd        [rsi], xmm0
        movd        [rdi], xmm6

        ; begin epilog
        pop         rdi
        pop         rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret
| 229 |
| 230 |
;void vpx_half_horiz_variance16x_h_sse2(unsigned char *ref,
;                                       int ref_stride,
;                                       unsigned char *src,
;                                       int src_stride,
;                                       unsigned int height,
;                                       int *sum,
;                                       unsigned int *sumsquared)
;
; For each of `height` rows, builds a 16-pixel prediction as the pavgb
; (rounded byte average) of ref[x] and ref[x+1], subtracts the 16 src
; bytes of the row, and accumulates the differences into *sum and their
; squares into *sumsquared.
;
; Memory touched: 17 bytes per ref row and 16 bytes per src row, height
; rows each.
;
; Register roles inside the loop:
;   rsi  = ref row pointer          rax = ref_stride
;   rdi  = src row pointer          rdx = src_stride
;   rcx  = rows remaining
;   xmm0 = all zero (used for byte -> word unpacking)
;   xmm6 = running sum of differences, 8 x 16-bit lanes (paddw wraps)
;   xmm7 = running sum of squared differences, 4 x 32-bit lanes
;
; The loop is bottom-tested, so height is expected to be non-zero.
global sym(vpx_half_horiz_variance16x_h_sse2) PRIVATE
sym(vpx_half_horiz_variance16x_h_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 7
        SAVE_XMM 7
        GET_GOT     rbx
        push        rsi
        push        rdi
        ; end prolog

        pxor        xmm6, xmm6                      ; sum accumulator = 0
        pxor        xmm7, xmm7                      ; sse accumulator = 0
        mov         rsi, arg(0)                     ; ref

        mov         rdi, arg(2)                     ; src
        movsxd      rcx, dword ptr arg(4)           ; height
        movsxd      rax, dword ptr arg(1)           ; ref_stride
        movsxd      rdx, dword ptr arg(3)           ; src_stride

        pxor        xmm0, xmm0                      ; zero for unpacking

.next_row:
        movdqu      xmm5, XMMWORD PTR [rsi]         ; ref bytes 0..15
        movdqu      xmm3, XMMWORD PTR [rsi+1]       ; ref bytes 1..16

        pavgb       xmm5, xmm3                      ; xmm5 = avg(xmm5, xmm3)

        ; Widen the 16 predicted bytes to words: xmm5 = low 8, xmm1 = high 8.
        movdqa      xmm1, xmm5
        punpcklbw   xmm5, xmm0
        punpckhbw   xmm1, xmm0

        ; Widen src bytes 0..7 into xmm3 and 8..15 into xmm2.
        movq        xmm3, QWORD PTR [rdi]
        punpcklbw   xmm3, xmm0
        movq        xmm2, QWORD PTR [rdi+8]
        punpcklbw   xmm2, xmm0

        psubw       xmm5, xmm3                      ; prediction - src, low half
        psubw       xmm1, xmm2                      ; prediction - src, high half

        ; Accumulate the differences, then their squares.
        paddw       xmm6, xmm5
        paddw       xmm6, xmm1
        pmaddwd     xmm5, xmm5                      ; pairwise d*d + d*d -> dwords
        pmaddwd     xmm1, xmm1
        paddd       xmm7, xmm5
        paddd       xmm7, xmm1

        lea         rsi, [rsi + rax]
        lea         rdi, [rdi + rdx]

        sub         rcx, 1
        jnz         .next_row

        pxor        xmm1, xmm1
        pxor        xmm5, xmm5                      ; xmm5 = zero from here on

        ; ---- reduce the eight 16-bit sums in xmm6 to one 32-bit value ----
        punpcklwd   xmm0, xmm6                      ; each word lands in the high half of a dword
        punpckhwd   xmm1, xmm6
        psrad       xmm0, 16                        ; arithmetic shift = sign-extend to 32 bits
        psrad       xmm1, 16
        paddd       xmm0, xmm1                      ; 4 partial sums
        movdqa      xmm1, xmm0
        punpckldq   xmm0, xmm5
        punpckhdq   xmm1, xmm5
        paddd       xmm0, xmm1                      ; 2 partial sums (dwords 0 and 2)
        movdqa      xmm1, xmm0
        psrldq      xmm1, 8
        paddd       xmm0, xmm1                      ; low dword = total sum

        ; ---- reduce the four 32-bit squares in xmm7 to one value ----
        movdqa      xmm6, xmm7
        punpckldq   xmm6, xmm5
        punpckhdq   xmm7, xmm5
        paddd       xmm6, xmm7                      ; 2 partial sums (dwords 0 and 2)
        movdqa      xmm7, xmm6
        psrldq      xmm7, 8
        paddd       xmm6, xmm7                      ; low dword = total sse

        mov         rsi, arg(5)                     ; sum
        mov         rdi, arg(6)                     ; sumsquared

        movd        [rsi], xmm0
        movd        [rdi], xmm6

        ; begin epilog
        pop         rdi
        pop         rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret
| 331 |
SECTION_RODATA
; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
; Eight 16-bit words of 64, 16-byte aligned.
; NOTE(review): 64 is half of the 128 that each tap pair below sums to, so
; this is presumably the rounding term for those filters - confirm at the
; use site (not visible in this part of the file).
align 16
xmm_bi_rd:
        times 8 dw 64

; Eight rows of 16 words each, 16-byte aligned. Every row holds eight
; copies of a first tap followed by eight copies of a second tap; the two
; taps of a row always sum to 128, and the first tap steps down by 16 per
; row (128, 112, ... 16).
align 16
vpx_bilinear_filters_sse2:
        times 8 dw 128
        times 8 dw 0

        times 8 dw 112
        times 8 dw 16

        times 8 dw 96
        times 8 dw 32

        times 8 dw 80
        times 8 dw 48

        times 8 dw 64
        times 8 dw 64

        times 8 dw 48
        times 8 dw 80

        times 8 dw 32
        times 8 dw 96

        times 8 dw 16
        times 8 dw 112
OLD | NEW |