| OLD | NEW |
| (Empty) |
| 1 ; | |
| 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | |
| 3 ; | |
| 4 ; Use of this source code is governed by a BSD-style license | |
| 5 ; that can be found in the LICENSE file in the root of the source | |
| 6 ; tree. An additional intellectual property rights grant can be found | |
| 7 ; in the file PATENTS. All contributing project authors may | |
| 8 ; be found in the AUTHORS file in the root of the source tree. | |
| 9 ; | |
| 10 | |
| 11 | |
| 12 %include "vpx_ports/x86_abi_support.asm" | |
| 13 | |
;unsigned int vp9_get_mb_ss_sse2
;(
;    short *src_ptr
;)
;
; Returns the sum of squares of the 256 signed 16-bit values that start at
; src_ptr (8 loop passes x 64 bytes per pass).
;
; src_ptr must be 16-byte aligned: every load below is a movdqa.
; The result is produced in the low 32 bits of rax; the upper half of rax
; holds a partial sum left over from the horizontal reduction.
;
; sym(), arg(), SHADOW_ARGS_TO_STACK, GET_GOT, RESTORE_GOT and UNSHADOW_ARGS
; are provided by vpx_ports/x86_abi_support.asm.
global sym(vp9_get_mb_ss_sse2) PRIVATE
sym(vp9_get_mb_ss_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 1
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp,        16
    ; end prolog

    mov         rax,        arg(0)          ;[src_ptr]
    mov         rcx,        8               ; passes left; 64 bytes each
    pxor        xmm4,       xmm4            ; xmm4 = four running dword sums

.NEXTROW:
    ; Load 32 coefficients (4 x 8 words).
    movdqa      xmm0,       [rax]
    movdqa      xmm1,       [rax+16]
    movdqa      xmm2,       [rax+32]
    movdqa      xmm3,       [rax+48]

    ; pmaddwd x,x squares each word and adds adjacent pairs -> 4 dwords.
    pmaddwd     xmm0,       xmm0
    pmaddwd     xmm1,       xmm1
    pmaddwd     xmm2,       xmm2
    pmaddwd     xmm3,       xmm3

    paddd       xmm0,       xmm1
    paddd       xmm2,       xmm3
    paddd       xmm4,       xmm0
    paddd       xmm4,       xmm2

    add         rax,        0x40            ; advance 64 bytes
    dec         rcx
    ; dec leaves CF untouched, so the loop must branch on ZF alone (jnz).
    ; A ja here would also depend on the carry left by the add above.
    jnz         .NEXTROW

    ; Horizontal add of the four dword lanes into lane 0.
    movdqa      xmm3,       xmm4
    psrldq      xmm4,       8
    paddd       xmm4,       xmm3            ; lanes 0,1 = (0+2),(1+3)
    movdqa      xmm3,       xmm4
    psrldq      xmm4,       4
    paddd       xmm4,       xmm3            ; lane 0 = total
    movq        rax,        xmm4            ; result in eax

    ; begin epilog
    add         rsp,        16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
| 70 | |
| 71 | |
;unsigned int vp9_get16x16var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; Walks 16 rows of 16 bytes from src_ptr and ref_ptr (unaligned loads) and
; stores two results through the output pointers:
;   *Sum = sum of (src - ref) over all 256 pixels
;   *SSE = sum of (src - ref)^2 over all 256 pixels
; No result is explicitly placed in rax by this routine; callers should read
; the values back through SSE and Sum.
;
; Register roles inside the loop:
;   rsi / rdi   current src / ref row
;   rax / rdx   src / ref stride (sign-extended from 32 bits)
;   rcx         rows remaining
;   xmm0        all zero (unpack helper)
;   xmm6        four dword partial sums of squared differences
;   xmm7        eight word partial sums of differences; each lane receives
;               32 values in [-255, 255], so it cannot overflow 16 bits
global sym(vp9_get16x16var_sse2) PRIVATE
sym(vp9_get16x16var_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)              ;[src_ptr]
    movsxd      rax,        DWORD PTR arg(1)    ;[source_stride]
    mov         rdi,        arg(2)              ;[ref_ptr]
    movsxd      rdx,        DWORD PTR arg(3)    ;[recon_stride]

    ; Prefetch the first eight rows of the source block ...
    lea         rcx,        [rax+rax*2]         ; rcx = 3 * source_stride
    prefetcht0  [rsi]
    prefetcht0  [rsi+rax]
    prefetcht0  [rsi+rax*2]
    prefetcht0  [rsi+rcx]
    lea         rbx,        [rsi+rax*4]         ; rbx = row 4 of src
    prefetcht0  [rbx]
    prefetcht0  [rbx+rax]
    prefetcht0  [rbx+rax*2]
    prefetcht0  [rbx+rcx]

    ; ... and of the reference block.
    lea         rcx,        [rdx+rdx*2]         ; rcx = 3 * recon_stride
    prefetcht0  [rdi]
    prefetcht0  [rdi+rdx]
    prefetcht0  [rdi+rdx*2]
    prefetcht0  [rdi+rcx]
    lea         rbx,        [rdi+rdx*4]         ; rbx = row 4 of ref
    prefetcht0  [rbx]
    prefetcht0  [rbx+rdx]
    prefetcht0  [rbx+rdx*2]
    prefetcht0  [rbx+rcx]

    pxor        xmm0,       xmm0                ; zero, used when unpacking
    pxor        xmm6,       xmm6                ; squared-difference sums
    pxor        xmm7,       xmm7                ; difference sums
    mov         rcx,        16                  ; row counter

.next_row:
    ; Source row: widen 16 bytes to 2 x 8 words (xmm1 = low, xmm3 = high).
    movdqu      xmm1,       XMMWORD PTR [rsi]
    movdqa      xmm3,       xmm1
    punpcklbw   xmm1,       xmm0
    punpckhbw   xmm3,       xmm0

    ; Reference row: same split (xmm2 = low, xmm4 = high).
    movdqu      xmm2,       XMMWORD PTR [rdi]
    movdqa      xmm4,       xmm2
    punpcklbw   xmm2,       xmm0
    punpckhbw   xmm4,       xmm0

    ; Pull in the rows eight strides ahead of the current ones.
    prefetcht0  [rsi+rax*8]
    prefetcht0  [rdi+rdx*8]

    ; Low half: difference, then its sum and its squares.
    psubw       xmm1,       xmm2
    paddw       xmm7,       xmm1
    pmaddwd     xmm1,       xmm1
    paddd       xmm6,       xmm1

    ; High half: same again.
    psubw       xmm3,       xmm4
    paddw       xmm7,       xmm3
    pmaddwd     xmm3,       xmm3
    paddd       xmm6,       xmm3

    add         rsi,        rax
    add         rdi,        rdx

    dec         rcx
    jnz         .next_row

    ; ---- reduce the four SSE dwords in xmm6 to one (result in xmm1) ----
    movdqa      xmm1,       xmm6
    movdqa      xmm2,       xmm1
    punpckldq   xmm1,       xmm0                ; s0, 0, s1, 0
    punpckhdq   xmm2,       xmm0                ; s2, 0, s3, 0
    paddd       xmm1,       xmm2
    movdqa      xmm2,       xmm1
    psrldq      xmm1,       8
    paddd       xmm1,       xmm2                ; lane 0 = total SSE

    ; ---- reduce the eight Sum words in xmm7 to one (result in xmm7) ----
    ; Placing each word in the upper half of a dword and shifting right
    ; arithmetically by 16 sign-extends it to 32 bits.
    pxor        xmm6,       xmm6
    pxor        xmm5,       xmm5
    punpcklwd   xmm6,       xmm7
    punpckhwd   xmm5,       xmm7
    psrad       xmm6,       16
    psrad       xmm5,       16
    paddd       xmm6,       xmm5                ; four signed dword sums
    movdqa      xmm7,       xmm6
    punpckldq   xmm6,       xmm0
    punpckhdq   xmm7,       xmm0
    paddd       xmm6,       xmm7
    movdqa      xmm7,       xmm6
    psrldq      xmm6,       8
    paddd       xmm7,       xmm6                ; lane 0 = total Sum

    ; Store Sum first, then SSE.
    mov         rax,        arg(5)              ;[Sum]
    mov         rdi,        arg(4)              ;[SSE]
    movd        DWORD PTR [rax], xmm7
    movd        DWORD PTR [rdi], xmm1

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 212 | |
| 213 | |
| 214 | |
| 215 | |
;unsigned int vp9_get8x8var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; Processes 8 rows of 8 bytes from src_ptr and ref_ptr and stores
;   *Sum = sum of (src - ref) over all 64 pixels
;   *SSE = sum of (src - ref)^2 over all 64 pixels
; through the output pointers.
;
; Register roles:
;   rsi / rdi   src / ref row pointer (advanced two rows at a time)
;   rax / rdx   src / ref stride (sign-extended from 32 bits)
;   xmm0        all zero (unpack helper)
;   xmm1        four dword partial sums of squared differences
;   xmm7        eight word partial sums of differences

; VAR8X8_ACCUM_ROW src_addr, ref_addr
; Folds one 8-pixel row into the running totals.
;   in:       xmm0 = 0, xmm1 = SSE accumulator, xmm7 = Sum accumulator
;   clobbers: xmm2, xmm3
%macro VAR8X8_ACCUM_ROW 2
    movq        xmm2,       QWORD PTR [%1]
    movq        xmm3,       QWORD PTR [%2]

    punpcklbw   xmm2,       xmm0            ; bytes -> words
    punpcklbw   xmm3,       xmm0

    psubsw      xmm2,       xmm3            ; xmm2 = src - ref
    paddw       xmm7,       xmm2            ; Sum += diff

    pmaddwd     xmm2,       xmm2            ; diff^2, adjacent pairs added
    paddd       xmm1,       xmm2            ; SSE += diff^2
%endmacro

global sym(vp9_get8x8var_sse2) PRIVATE
sym(vp9_get8x8var_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp,        16
    ; end prolog

    mov         rsi,        arg(0)              ;[src_ptr]
    movsxd      rax,        DWORD PTR arg(1)    ;[source_stride]
    mov         rdi,        arg(2)              ;[ref_ptr]
    movsxd      rdx,        DWORD PTR arg(3)    ;[recon_stride]

    pxor        xmm0,       xmm0                ; zero, used when unpacking
    pxor        xmm7,       xmm7                ; difference sums

    ; Row 0 is handled inline so that its squares initialise xmm1.
    movq        xmm1,       QWORD PTR [rsi]
    movq        xmm2,       QWORD PTR [rdi]

    punpcklbw   xmm1,       xmm0
    punpcklbw   xmm2,       xmm0

    psubsw      xmm1,       xmm2
    paddw       xmm7,       xmm1

    pmaddwd     xmm1,       xmm1

    ; Rows 1 and 2.
    VAR8X8_ACCUM_ROW rsi + rax, rdi + rdx
    VAR8X8_ACCUM_ROW rsi + rax * 2, rdi + rdx * 2

    ; Rows 3 and 4.
    lea         rsi,        [rsi + rax * 2]
    lea         rdi,        [rdi + rdx * 2]
    VAR8X8_ACCUM_ROW rsi + rax, rdi + rdx
    VAR8X8_ACCUM_ROW rsi + rax * 2, rdi + rdx * 2

    ; Rows 5 and 6.
    lea         rsi,        [rsi + rax * 2]
    lea         rdi,        [rdi + rdx * 2]
    VAR8X8_ACCUM_ROW rsi + rax, rdi + rdx
    VAR8X8_ACCUM_ROW rsi + rax * 2, rdi + rdx * 2

    ; Row 7.
    lea         rsi,        [rsi + rax * 2]
    lea         rdi,        [rdi + rdx * 2]
    VAR8X8_ACCUM_ROW rsi + rax, rdi + rdx

    ; ---- reduce the eight Sum words in xmm7 to one word ----
    ; The words are spread out with zero fill and combined with 16-bit
    ; adds, so only the low word of the final lane is meaningful. The
    ; total of 64 differences in [-255, 255] fits in a signed 16-bit value.
    movdqa      xmm6,       xmm7
    punpcklwd   xmm6,       xmm0
    punpckhwd   xmm7,       xmm0
    paddw       xmm6,       xmm7
    movdqa      xmm7,       xmm6
    punpckldq   xmm6,       xmm0
    punpckhdq   xmm7,       xmm0
    paddw       xmm6,       xmm7
    movdqa      xmm7,       xmm6
    psrldq      xmm6,       8
    paddw       xmm7,       xmm6                ; low word = total Sum

    ; ---- reduce the four SSE dwords in xmm1 to one ----
    movdqa      xmm2,       xmm1
    punpckldq   xmm1,       xmm0
    punpckhdq   xmm2,       xmm0
    paddd       xmm1,       xmm2
    movdqa      xmm2,       xmm1
    psrldq      xmm1,       8
    paddd       xmm1,       xmm2                ; lane 0 = total SSE

    ; Sign-extend the 16-bit Sum before storing it as an int.
    movq        rdx,        xmm7
    movsx       rcx,        dx

    ; Store Sum first, then SSE.
    mov         rax,        arg(5)              ;[Sum]
    mov         rdi,        arg(4)              ;[SSE]
    mov         dword ptr [rax], ecx
    movd        DWORD PTR [rdi], xmm1

    ; begin epilog
    add         rsp,        16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 400 | |
| 401 | |
| OLD | NEW |