;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

; Right shift applied (after adding the xmm_bi_rd rounding constant) at the
; end of each bilinear filter pass in vp9_filter_block2d_bil_var_sse2.
; Each tap pair in bilinear_filters_sse2 sums to 128 = 1 << 7.
%define xmm_filter_shift            7

;unsigned int vp9_get_mb_ss_sse2
;(
;    short *src_ptr
;)
;
; Returns the sum of squares of 256 consecutive 16-bit values starting at
; src_ptr (8 loop iterations x 64 bytes).
;
; src_ptr is read with movdqa, so it must be 16-byte aligned.
; The result is moved to rax with a 64-bit movq; only the low 32 bits hold
; the total, matching the unsigned int return type.
global sym(vp9_get_mb_ss_sse2)
sym(vp9_get_mb_ss_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 1
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

    mov         rax, arg(0)             ;[src_ptr]
    mov         rcx, 8                  ; 8 iterations of 32 words each
    pxor        xmm4, xmm4              ; xmm4 = 4 x int32 partial sums

.NEXTROW:
    movdqa      xmm0, [rax]
    movdqa      xmm1, [rax+16]
    movdqa      xmm2, [rax+32]
    movdqa      xmm3, [rax+48]
    pmaddwd     xmm0, xmm0              ; square each word, add adjacent pairs
    pmaddwd     xmm1, xmm1
    pmaddwd     xmm2, xmm2
    pmaddwd     xmm3, xmm3

    paddd       xmm0, xmm1
    paddd       xmm2, xmm3
    paddd       xmm4, xmm0
    paddd       xmm4, xmm2

    add         rax, 0x40
    dec         rcx
    ; dec leaves CF untouched, so the loop must branch on ZF only.
    ; (The previous "ja" also tested the stale CF left by the add above.)
    jnz         .NEXTROW

    ; horizontal add of the four dword lanes into the low dword
    movdqa      xmm3, xmm4
    psrldq      xmm4, 8
    paddd       xmm4, xmm3
    movdqa      xmm3, xmm4
    psrldq      xmm4, 4
    paddd       xmm4, xmm3
    movq        rax, xmm4               ; low 32 bits = sum of squares


    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;unsigned int vp9_get16x16var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; For a 16x16 block of bytes computes, over all 256 positions,
;     *Sum = sum(src - ref)
;     *SSE = sum((src - ref)^2)
; Rows are loaded with movdqu, so no alignment is required.
;
; NOTE(review): no return value is set up explicitly; at ret, rax still
; holds the Sum pointer. Callers should rely on *SSE / *Sum only.
global sym(vp9_get16x16var_sse2)
sym(vp9_get16x16var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)             ;[src_ptr]
    mov         rdi, arg(2)             ;[ref_ptr]

    movsxd      rax, DWORD PTR arg(1)   ;[source_stride]
    movsxd      rdx, DWORD PTR arg(3)   ;[recon_stride]

    ; Prefetch data: first 8 rows of the source block
    lea         rcx, [rax+rax*2]        ; rcx = 3 * source_stride
    prefetcht0  [rsi]
    prefetcht0  [rsi+rax]
    prefetcht0  [rsi+rax*2]
    prefetcht0  [rsi+rcx]
    lea         rbx, [rsi+rax*4]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rax]
    prefetcht0  [rbx+rax*2]
    prefetcht0  [rbx+rcx]

    ; ... and first 8 rows of the reference block
    lea         rcx, [rdx+rdx*2]        ; rcx = 3 * recon_stride
    prefetcht0  [rdi]
    prefetcht0  [rdi+rdx]
    prefetcht0  [rdi+rdx*2]
    prefetcht0  [rdi+rcx]
    lea         rbx, [rdi+rdx*4]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rdx]
    prefetcht0  [rbx+rdx*2]
    prefetcht0  [rbx+rcx]

    pxor        xmm0, xmm0              ; clear xmm0 for unpack
    pxor        xmm7, xmm7              ; clear xmm7 for accumulating diffs

    pxor        xmm6, xmm6              ; clear xmm6 for accumulating sse
    mov         rcx, 16                 ; row counter

.var16loop:
    movdqu      xmm1, XMMWORD PTR [rsi]
    movdqu      xmm2, XMMWORD PTR [rdi]

    prefetcht0  [rsi+rax*8]             ; row 8 lines ahead
    prefetcht0  [rdi+rdx*8]

    movdqa      xmm3, xmm1
    movdqa      xmm4, xmm2


    punpcklbw   xmm1, xmm0              ; src bytes 0-7  -> words
    punpckhbw   xmm3, xmm0              ; src bytes 8-15 -> words

    punpcklbw   xmm2, xmm0              ; ref bytes 0-7  -> words
    punpckhbw   xmm4, xmm0              ; ref bytes 8-15 -> words


    psubw       xmm1, xmm2              ; src - ref
    psubw       xmm3, xmm4

    paddw       xmm7, xmm1              ; accumulate diffs (8 x int16)
    pmaddwd     xmm1, xmm1              ; squared diffs, pairs added

    paddw       xmm7, xmm3
    pmaddwd     xmm3, xmm3

    paddd       xmm6, xmm1              ; accumulate sse (4 x int32)
    paddd       xmm6, xmm3

    add         rsi, rax
    add         rdi, rdx

    sub         rcx, 1
    jnz         .var16loop


    movdqa      xmm1, xmm6              ; xmm1 = sse partial sums
    pxor        xmm6, xmm6

    ; sign-extend the 8 word sums in xmm7 to dwords: place each word in
    ; the upper half of a dword, then arithmetic-shift right by 16
    pxor        xmm5, xmm5
    punpcklwd   xmm6, xmm7

    punpckhwd   xmm5, xmm7
    psrad       xmm5, 16

    psrad       xmm6, 16
    paddd       xmm6, xmm5              ; xmm6 = 4 x int32 diff sums

    ; horizontal add of the 4 dword lanes of xmm1 (sse) and xmm6 (sum)
    movdqa      xmm2, xmm1
    punpckldq   xmm1, xmm0

    punpckhdq   xmm2, xmm0
    movdqa      xmm7, xmm6

    paddd       xmm1, xmm2
    punpckldq   xmm6, xmm0

    punpckhdq   xmm7, xmm0
    paddd       xmm6, xmm7

    movdqa      xmm2, xmm1
    movdqa      xmm7, xmm6

    psrldq      xmm1, 8
    psrldq      xmm6, 8

    paddd       xmm7, xmm6              ; low dword = total sum
    paddd       xmm1, xmm2              ; low dword = total sse

    mov         rax, arg(5)             ;[Sum]
    mov         rdi, arg(4)             ;[SSE]

    movd        DWORD PTR [rax], xmm7
    movd        DWORD PTR [rdi], xmm1


    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret




;unsigned int vp9_get8x8var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; For an 8x8 block of bytes computes, over all 64 positions,
;     *Sum = sum(src - ref)
;     *SSE = sum((src - ref)^2)
; The eight rows are processed fully unrolled.
;
; The difference sum is kept in 16-bit lanes throughout and only the low
; word is sign-extended at the end; this is exact because the 64
; differences of byte values cannot exceed the int16 range in total.
;
; NOTE(review): no return value is set up explicitly; at ret, rax still
; holds the Sum pointer.
global sym(vp9_get8x8var_sse2)
sym(vp9_get8x8var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

    mov         rsi, arg(0)             ;[src_ptr]
    mov         rdi, arg(2)             ;[ref_ptr]

    movsxd      rax, DWORD PTR arg(1)   ;[source_stride]
    movsxd      rdx, DWORD PTR arg(3)   ;[recon_stride]

    pxor        xmm0, xmm0              ; clear xmm0 for unpack
    pxor        xmm7, xmm7              ; clear xmm7 for accumulating diffs

    ; row 0 (xmm1 also becomes the sse accumulator)
    movq        xmm1, QWORD PTR [rsi]
    movq        xmm2, QWORD PTR [rdi]

    punpcklbw   xmm1, xmm0
    punpcklbw   xmm2, xmm0

    psubsw      xmm1, xmm2
    paddw       xmm7, xmm1

    pmaddwd     xmm1, xmm1

    ; row 1
    movq        xmm2, QWORD PTR[rsi + rax]
    movq        xmm3, QWORD PTR[rdi + rdx]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2


    ; row 2
    movq        xmm2, QWORD PTR[rsi + rax * 2]
    movq        xmm3, QWORD PTR[rdi + rdx * 2]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2


    ; advance two rows; row 3
    lea         rsi, [rsi + rax * 2]
    lea         rdi, [rdi + rdx * 2]
    movq        xmm2, QWORD PTR[rsi + rax]
    movq        xmm3, QWORD PTR[rdi + rdx]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    ; row 4
    movq        xmm2, QWORD PTR[rsi + rax *2]
    movq        xmm3, QWORD PTR[rdi + rdx *2]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2


    ; advance two rows; row 5
    lea         rsi, [rsi + rax * 2]
    lea         rdi, [rdi + rdx * 2]


    movq        xmm2, QWORD PTR[rsi + rax]
    movq        xmm3, QWORD PTR[rdi + rdx]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    ; row 6
    movq        xmm2, QWORD PTR[rsi + rax *2]
    movq        xmm3, QWORD PTR[rdi + rdx *2]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2


    ; advance two rows; row 7
    lea         rsi, [rsi + rax * 2]
    lea         rdi, [rdi + rdx * 2]

    movq        xmm2, QWORD PTR[rsi + rax]
    movq        xmm3, QWORD PTR[rdi + rdx]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2


    ; Horizontal reduction. xmm7 (8 word diff sums) is folded with paddw,
    ; i.e. modulo 2^16, into its low word; xmm1 (4 dword sse sums) is
    ; folded with paddd into its low dword.
    movdqa      xmm6, xmm7
    punpcklwd   xmm6, xmm0

    punpckhwd   xmm7, xmm0
    movdqa      xmm2, xmm1

    paddw       xmm6, xmm7
    punpckldq   xmm1, xmm0

    punpckhdq   xmm2, xmm0
    movdqa      xmm7, xmm6

    paddd       xmm1, xmm2
    punpckldq   xmm6, xmm0

    punpckhdq   xmm7, xmm0
    paddw       xmm6, xmm7

    movdqa      xmm2, xmm1
    movdqa      xmm7, xmm6

    psrldq      xmm1, 8
    psrldq      xmm6, 8

    paddw       xmm7, xmm6
    paddd       xmm1, xmm2

    mov         rax, arg(5)             ;[Sum]
    mov         rdi, arg(4)             ;[SSE]

    movq        rdx, xmm7
    movsx       rcx, dx                 ; sign-extend the 16-bit total

    mov         dword ptr [rax], ecx
    movd        DWORD PTR [rdi], xmm1

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_filter_block2d_bil_var_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int  xoffset,
;    int  yoffset,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Applies a bilinear filter to an 8-pixel-wide, Height-row block at ref_ptr
; and accumulates, against the unfiltered block at src_ptr,
;     *sum        = sum(filtered_ref - src)
;     *sumsquared = sum((filtered_ref - src)^2)
;
; xoffset / yoffset select a 32-byte entry of bilinear_filters_sse2
; (index << 5). One of four code paths is taken:
;     xoffset != 0, yoffset != 0 : horizontal then vertical pass
;     xoffset == 0, yoffset != 0 : vertical pass only      (sp_only)
;     xoffset != 0, yoffset == 0 : horizontal pass only    (fp_only)
;     xoffset == 0, yoffset == 0 : plain difference        (full_pixel)
; Paths with a horizontal pass read 9 bytes per reference row ([rsi+1]);
; paths with a vertical pass read Height+1 reference rows.
;
; The final reduction folds the difference sum in 16-bit lanes, so *sum is
; the sign-extended 16-bit total.
; The reduction uses MMX registers (mm2-mm4, mm6, mm7) and no emms is
; executed here; the caller has to clear the MMX state before any x87 use.
global sym(vp9_filter_block2d_bil_var_sse2)
sym(vp9_filter_block2d_bil_var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    pxor        xmm6, xmm6              ; xmm6 = diff accumulator (8 x int16)
    pxor        xmm7, xmm7              ; xmm7 = sse accumulator (4 x int32)

    lea         rsi, [GLOBAL(xmm_bi_rd)]            ; rounding
    movdqa      xmm4, XMMWORD PTR [rsi]

    lea         rcx, [GLOBAL(bilinear_filters_sse2)]
    movsxd      rax, dword ptr arg(5)   ; xoffset

    test        rax, rax                ; skip first_pass filter if xoffset=0
    jz          filter_block2d_bil_var_sse2_sp_only

    shl         rax, 5                  ; point to filter coeff with xoffset
    lea         rax, [rax + rcx]        ; HFilter

    movsxd      rdx, dword ptr arg(6)   ; yoffset

    test        rdx, rdx                ; skip second_pass filter if yoffset=0
    jz          filter_block2d_bil_var_sse2_fp_only

    shl         rdx, 5
    lea         rdx, [rdx + rcx]        ; VFilter

    mov         rsi, arg(0)             ;ref_ptr
    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height

    ; horizontally filter row 0; kept in xmm5 as the "previous row"
    pxor        xmm0, xmm0
    movq        xmm1, QWORD PTR [rsi]
    movq        xmm3, QWORD PTR [rsi+1]

    punpcklbw   xmm1, xmm0
    pmullw      xmm1, [rax]
    punpcklbw   xmm3, xmm0
    pmullw      xmm3, [rax+16]

    paddw       xmm1, xmm3
    paddw       xmm1, xmm4
    psraw       xmm1, xmm_filter_shift
    movdqa      xmm5, xmm1

    movsxd      rbx, dword ptr arg(1)   ;ref_pixels_per_line
    lea         rsi, [rsi + rbx]
%if ABI_IS_32BIT=0
    movsxd      r9, dword ptr arg(3)    ;src_pixels_per_line
%endif

filter_block2d_bil_var_sse2_loop:
    ; horizontally filter the next reference row
    movq        xmm1, QWORD PTR [rsi]
    movq        xmm3, QWORD PTR [rsi+1]

    punpcklbw   xmm1, xmm0
    pmullw      xmm1, [rax]
    punpcklbw   xmm3, xmm0
    pmullw      xmm3, [rax+16]

    paddw       xmm1, xmm3
    paddw       xmm1, xmm4
    psraw       xmm1, xmm_filter_shift

    movdqa      xmm3, xmm5              ; xmm3 = previous filtered row
    movdqa      xmm5, xmm1              ; remember this row for next pass

    ; vertical filter between previous and current row
    pmullw      xmm3, [rdx]
    pmullw      xmm1, [rdx+16]
    paddw       xmm1, xmm3
    paddw       xmm1, xmm4
    psraw       xmm1, xmm_filter_shift

    movq        xmm3, QWORD PTR [rdi]
    punpcklbw   xmm3, xmm0

    psubw       xmm1, xmm3              ; filtered_ref - src
    paddw       xmm6, xmm1

    pmaddwd     xmm1, xmm1
    paddd       xmm7, xmm1

    lea         rsi, [rsi + rbx]        ;ref_pixels_per_line
%if ABI_IS_32BIT
    add         rdi, dword ptr arg(3)   ;src_pixels_per_line
%else
    lea         rdi, [rdi + r9]
%endif

    sub         rcx, 1
    jnz         filter_block2d_bil_var_sse2_loop

    jmp         filter_block2d_bil_variance

filter_block2d_bil_var_sse2_sp_only:
    movsxd      rdx, dword ptr arg(6)   ; yoffset

    test        rdx, rdx                ; skip all if both xoffset=0 and yoffset=0
    jz          filter_block2d_bil_var_sse2_full_pixel

    shl         rdx, 5
    lea         rdx, [rdx + rcx]        ; VFilter (rcx still = table base)

    mov         rsi, arg(0)             ;ref_ptr
    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height
    movsxd      rax, dword ptr arg(1)   ;ref_pixels_per_line

    pxor        xmm0, xmm0
    movq        xmm1, QWORD PTR [rsi]   ; xmm1 = row 0 as words
    punpcklbw   xmm1, xmm0

    movsxd      rbx, dword ptr arg(3)   ;src_pixels_per_line
    lea         rsi, [rsi + rax]

filter_block2d_bil_sp_only_loop:
    movq        xmm3, QWORD PTR [rsi]
    punpcklbw   xmm3, xmm0
    movdqa      xmm5, xmm3              ; keep unfiltered row for next pass

    pmullw      xmm1, [rdx]
    pmullw      xmm3, [rdx+16]
    paddw       xmm1, xmm3
    paddw       xmm1, xmm4
    psraw       xmm1, xmm_filter_shift

    movq        xmm3, QWORD PTR [rdi]
    punpcklbw   xmm3, xmm0

    psubw       xmm1, xmm3              ; filtered_ref - src
    paddw       xmm6, xmm1

    pmaddwd     xmm1, xmm1
    paddd       xmm7, xmm1

    movdqa      xmm1, xmm5
    lea         rsi, [rsi + rax]        ;ref_pixels_per_line
    lea         rdi, [rdi + rbx]        ;src_pixels_per_line

    sub         rcx, 1
    jnz         filter_block2d_bil_sp_only_loop

    jmp         filter_block2d_bil_variance

filter_block2d_bil_var_sse2_full_pixel:
    mov         rsi, arg(0)             ;ref_ptr
    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height
    movsxd      rax, dword ptr arg(1)   ;ref_pixels_per_line
    movsxd      rbx, dword ptr arg(3)   ;src_pixels_per_line
    pxor        xmm0, xmm0

filter_block2d_bil_full_pixel_loop:
    movq        xmm1, QWORD PTR [rsi]
    punpcklbw   xmm1, xmm0

    movq        xmm2, QWORD PTR [rdi]
    punpcklbw   xmm2, xmm0

    psubw       xmm1, xmm2              ; ref - src
    paddw       xmm6, xmm1

    pmaddwd     xmm1, xmm1
    paddd       xmm7, xmm1

    lea         rsi, [rsi + rax]        ;ref_pixels_per_line
    lea         rdi, [rdi + rbx]        ;src_pixels_per_line

    sub         rcx, 1
    jnz         filter_block2d_bil_full_pixel_loop

    jmp         filter_block2d_bil_variance

filter_block2d_bil_var_sse2_fp_only:
    ; rax = HFilter on entry
    mov         rsi, arg(0)             ;ref_ptr
    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height
    movsxd      rdx, dword ptr arg(1)   ;ref_pixels_per_line

    pxor        xmm0, xmm0
    movsxd      rbx, dword ptr arg(3)   ;src_pixels_per_line

filter_block2d_bil_fp_only_loop:
    movq        xmm1, QWORD PTR [rsi]
    movq        xmm3, QWORD PTR [rsi+1]

    punpcklbw   xmm1, xmm0
    pmullw      xmm1, [rax]
    punpcklbw   xmm3, xmm0
    pmullw      xmm3, [rax+16]

    paddw       xmm1, xmm3
    paddw       xmm1, xmm4
    psraw       xmm1, xmm_filter_shift

    movq        xmm3, QWORD PTR [rdi]
    punpcklbw   xmm3, xmm0

    psubw       xmm1, xmm3              ; filtered_ref - src
    paddw       xmm6, xmm1

    pmaddwd     xmm1, xmm1
    paddd       xmm7, xmm1
    lea         rsi, [rsi + rdx]
    lea         rdi, [rdi + rbx]        ;src_pixels_per_line

    sub         rcx, 1
    jnz         filter_block2d_bil_fp_only_loop

    ; falls through to the shared reduction below

filter_block2d_bil_variance:
    ; fold the upper 64 bits of each accumulator onto the lower 64 bits
    movdq2q     mm6, xmm6
    movdq2q     mm7, xmm7

    psrldq      xmm6, 8
    psrldq      xmm7, 8

    movdq2q     mm2, xmm6
    movdq2q     mm3, xmm7

    paddw       mm6, mm2                ; 4 word diff sums
    paddd       mm7, mm3                ; 2 dword sse sums

    pxor        mm3, mm3
    pxor        mm2, mm2

    ; place each word sum in the upper half of a dword, add them up,
    ; then shift right arithmetically to get a sign-extended total
    punpcklwd   mm2, mm6
    punpckhwd   mm3, mm6

    paddd       mm2, mm3
    movq        mm6, mm2

    psrlq       mm6, 32
    paddd       mm2, mm6

    psrad       mm2, 16
    movq        mm4, mm7

    psrlq       mm4, 32
    paddd       mm4, mm7

    mov         rsi, arg(7)             ; sum
    mov         rdi, arg(8)             ; sumsquared

    movd        [rsi], mm2              ; xsum
    movd        [rdi], mm4              ; xxsum

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp9_half_horiz_vert_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Half-pixel (horizontal + vertical) variance for an 8-wide, Height-row
; block. The prediction for each pixel is
;     pavgb(pavgb(r[y][x], r[y][x+1]), pavgb(r[y+1][x], r[y+1][x+1]))
; and the function stores
;     *sum        = sum(pred - src)
;     *sumsquared = sum((pred - src)^2)
; Reads 9 bytes per reference row and Height+1 reference rows.
;
; The difference sum is folded in 16-bit lanes, so *sum is the
; sign-extended 16-bit total.
; The reduction uses MMX registers and no emms is executed here; the
; caller has to clear the MMX state before any x87 use.
global sym(vp9_half_horiz_vert_variance8x_h_sse2)
sym(vp9_half_horiz_vert_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(1)    ;ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)    ;src_pixels_per_line
%endif

    pxor        xmm6, xmm6              ; error accumulator
    pxor        xmm7, xmm7              ; sse accumulator
    mov         rsi, arg(0)             ;ref_ptr

    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height

    pxor        xmm0, xmm0

    movq        xmm5, QWORD PTR [rsi]   ; xmm5 = s0,s1,s2..s7
    movq        xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s8
    pavgb       xmm5, xmm3              ; xmm5 = avg(xmm5,xmm3) horizontal line 1

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)   ;ref_pixels_per_line ; next source
%else
    add         rsi, r8
%endif

.half_horiz_vert_variance8x_h_1:

    movq        xmm1, QWORD PTR [rsi]
    movq        xmm2, QWORD PTR [rsi+1]
    pavgb       xmm1, xmm2              ; xmm1 = avg(xmm1,xmm2) horizontal line i+1

    pavgb       xmm5, xmm1              ; xmm5 = vertical average of the above
    punpcklbw   xmm5, xmm0              ; xmm5 = words of above

    movq        xmm3, QWORD PTR [rdi]   ; xmm3 = d0,d1,d2..d7
    punpcklbw   xmm3, xmm0              ; xmm3 = words of above

    psubw       xmm5, xmm3              ; xmm5 -= xmm3
    paddw       xmm6, xmm5              ; xmm6 += accumulated column differences
    pmaddwd     xmm5, xmm5              ; xmm5 *= xmm5
    paddd       xmm7, xmm5              ; xmm7 += accumulated square column differences

    movdqa      xmm5, xmm1              ; save xmm1 for use on the next row

%if ABI_IS_32BIT
    add         esi, dword ptr arg(1)   ;ref_pixels_per_line ; next source
    add         edi, dword ptr arg(3)   ;src_pixels_per_line ; next destination
%else
    add         rsi, r8
    add         rdi, r9
%endif

    sub         rcx, 1
    jnz         .half_horiz_vert_variance8x_h_1

    ; fold the upper 64 bits of each accumulator onto the lower 64 bits
    movdq2q     mm6, xmm6
    movdq2q     mm7, xmm7

    psrldq      xmm6, 8
    psrldq      xmm7, 8

    movdq2q     mm2, xmm6
    movdq2q     mm3, xmm7

    paddw       mm6, mm2
    paddd       mm7, mm3

    pxor        mm3, mm3
    pxor        mm2, mm2

    ; word sums -> upper halves of dwords, add, arithmetic shift back
    punpcklwd   mm2, mm6
    punpckhwd   mm3, mm6

    paddd       mm2, mm3
    movq        mm6, mm2

    psrlq       mm6, 32
    paddd       mm2, mm6

    psrad       mm2, 16
    movq        mm4, mm7

    psrlq       mm4, 32
    paddd       mm4, mm7

    mov         rsi, arg(5)             ; sum
    mov         rdi, arg(6)             ; sumsquared

    movd        [rsi], mm2
    movd        [rdi], mm4


    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_half_horiz_vert_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Half-pixel (horizontal + vertical) variance for a 16-wide, Height-row
; block. The prediction for each pixel is
;     pavgb(pavgb(r[y][x], r[y][x+1]), pavgb(r[y+1][x], r[y+1][x+1]))
; and the function stores
;     *sum        = sum(pred - src)
;     *sumsquared = sum((pred - src)^2)
; Reads 17 bytes per reference row and Height+1 reference rows.
; Differences are accumulated in 16-bit lanes and widened to 32 bits only
; in the final reduction.
global sym(vp9_half_horiz_vert_variance16x_h_sse2)
sym(vp9_half_horiz_vert_variance16x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    pxor        xmm6, xmm6              ; error accumulator
    pxor        xmm7, xmm7              ; sse accumulator
    mov         rsi, arg(0)             ;ref_ptr

    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height
    movsxd      rax, dword ptr arg(1)   ;ref_pixels_per_line
    movsxd      rdx, dword ptr arg(3)   ;src_pixels_per_line

    pxor        xmm0, xmm0

    movdqu      xmm5, XMMWORD PTR [rsi]
    movdqu      xmm3, XMMWORD PTR [rsi+1]
    pavgb       xmm5, xmm3              ; xmm5 = avg(xmm5,xmm3) horizontal line 1

    lea         rsi, [rsi + rax]

.half_horiz_vert_variance16x_h_1:
    movdqu      xmm1, XMMWORD PTR [rsi]
    movdqu      xmm2, XMMWORD PTR [rsi+1]
    pavgb       xmm1, xmm2              ; xmm1 = avg(xmm1,xmm2) horizontal line i+1

    pavgb       xmm5, xmm1              ; xmm5 = vertical average of the above

    movdqa      xmm4, xmm5
    punpcklbw   xmm5, xmm0              ; xmm5 = words of pixels 0-7
    punpckhbw   xmm4, xmm0              ; xmm4 = words of pixels 8-15

    movq        xmm3, QWORD PTR [rdi]   ; xmm3 = d0,d1,d2..d7
    punpcklbw   xmm3, xmm0              ; xmm3 = words of above
    psubw       xmm5, xmm3              ; xmm5 -= xmm3

    movq        xmm3, QWORD PTR [rdi+8] ; xmm3 = d8..d15
    punpcklbw   xmm3, xmm0
    psubw       xmm4, xmm3

    paddw       xmm6, xmm5              ; xmm6 += accumulated column differences
    paddw       xmm6, xmm4
    pmaddwd     xmm5, xmm5              ; xmm5 *= xmm5
    pmaddwd     xmm4, xmm4
    paddd       xmm7, xmm5              ; xmm7 += accumulated square column differences
    paddd       xmm7, xmm4

    movdqa      xmm5, xmm1              ; save xmm1 for use on the next row

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]

    sub         rcx, 1
    jnz         .half_horiz_vert_variance16x_h_1

    pxor        xmm1, xmm1
    pxor        xmm5, xmm5

    ; sign-extend the 8 word sums in xmm6 to dwords (xmm0 is still zero):
    ; word -> upper half of dword, then arithmetic shift right by 16
    punpcklwd   xmm0, xmm6
    punpckhwd   xmm1, xmm6
    psrad       xmm0, 16
    psrad       xmm1, 16
    paddd       xmm0, xmm1
    movdqa      xmm1, xmm0

    ; horizontal add of the 4 dword lanes: xmm6 <- sse, xmm0 <- sum
    movdqa      xmm6, xmm7
    punpckldq   xmm6, xmm5
    punpckhdq   xmm7, xmm5
    paddd       xmm6, xmm7

    punpckldq   xmm0, xmm5
    punpckhdq   xmm1, xmm5
    paddd       xmm0, xmm1

    movdqa      xmm7, xmm6
    movdqa      xmm1, xmm0

    psrldq      xmm7, 8
    psrldq      xmm1, 8

    paddd       xmm6, xmm7
    paddd       xmm0, xmm1

    mov         rsi, arg(5)             ;[Sum]
    mov         rdi, arg(6)             ;[SSE]

    movd        [rsi], xmm0
    movd        [rdi], xmm6

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp9_half_vert_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Half-pixel (vertical) variance for an 8-wide, Height-row block.
; The prediction for each pixel is pavgb(r[y][x], r[y+1][x]) and the
; function stores
;     *sum        = sum(pred - src)
;     *sumsquared = sum((pred - src)^2)
; Reads Height+1 reference rows.
;
; The difference sum is folded in 16-bit lanes, so *sum is the
; sign-extended 16-bit total.
; The reduction uses MMX registers and no emms is executed here; the
; caller has to clear the MMX state before any x87 use.
global sym(vp9_half_vert_variance8x_h_sse2)
sym(vp9_half_vert_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(1)    ;ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)    ;src_pixels_per_line
%endif

    pxor        xmm6, xmm6              ; error accumulator
    pxor        xmm7, xmm7              ; sse accumulator
    mov         rsi, arg(0)             ;ref_ptr

    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height
    movsxd      rax, dword ptr arg(1)   ;ref_pixels_per_line

    pxor        xmm0, xmm0
.half_vert_variance8x_h_1:
    movq        xmm5, QWORD PTR [rsi]       ; xmm5 = row i,   pixels 0..7
    movq        xmm3, QWORD PTR [rsi+rax]   ; xmm3 = row i+1, pixels 0..7

    pavgb       xmm5, xmm3              ; xmm5 = avg(xmm5,xmm3)
    punpcklbw   xmm5, xmm0              ; xmm5 = words of above

    movq        xmm3, QWORD PTR [rdi]   ; xmm3 = d0,d1,d2..d7
    punpcklbw   xmm3, xmm0              ; xmm3 = words of above

    psubw       xmm5, xmm3              ; xmm5 -= xmm3
    paddw       xmm6, xmm5              ; xmm6 += accumulated column differences
    pmaddwd     xmm5, xmm5              ; xmm5 *= xmm5
    paddd       xmm7, xmm5              ; xmm7 += accumulated square column differences

%if ABI_IS_32BIT
    add         esi, dword ptr arg(1)   ;ref_pixels_per_line ; next source
    add         edi, dword ptr arg(3)   ;src_pixels_per_line ; next destination
%else
    add         rsi, r8
    add         rdi, r9
%endif

    sub         rcx, 1
    jnz         .half_vert_variance8x_h_1

    ; fold the upper 64 bits of each accumulator onto the lower 64 bits
    movdq2q     mm6, xmm6
    movdq2q     mm7, xmm7

    psrldq      xmm6, 8
    psrldq      xmm7, 8

    movdq2q     mm2, xmm6
    movdq2q     mm3, xmm7

    paddw       mm6, mm2
    paddd       mm7, mm3

    pxor        mm3, mm3
    pxor        mm2, mm2

    ; word sums -> upper halves of dwords, add, arithmetic shift back
    punpcklwd   mm2, mm6
    punpckhwd   mm3, mm6

    paddd       mm2, mm3
    movq        mm6, mm2

    psrlq       mm6, 32
    paddd       mm2, mm6

    psrad       mm2, 16
    movq        mm4, mm7

    psrlq       mm4, 32
    paddd       mm4, mm7

    mov         rsi, arg(5)             ; sum
    mov         rdi, arg(6)             ; sumsquared

    movd        [rsi], mm2
    movd        [rdi], mm4


    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_half_vert_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Half-pixel (vertical) variance for a 16-wide, Height-row block.
; The prediction for each pixel is pavgb(r[y][x], r[y+1][x]) and the
; function stores
;     *sum        = sum(pred - src)
;     *sumsquared = sum((pred - src)^2)
; Reads Height+1 reference rows.
; Differences are accumulated in 16-bit lanes and widened to 32 bits only
; in the final reduction.
global sym(vp9_half_vert_variance16x_h_sse2)
sym(vp9_half_vert_variance16x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    pxor        xmm6, xmm6              ; error accumulator
    pxor        xmm7, xmm7              ; sse accumulator
    mov         rsi, arg(0)             ;ref_ptr

    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height
    movsxd      rax, dword ptr arg(1)   ;ref_pixels_per_line
    movsxd      rdx, dword ptr arg(3)   ;src_pixels_per_line

    movdqu      xmm5, XMMWORD PTR [rsi] ; xmm5 = row 0
    lea         rsi, [rsi + rax]
    pxor        xmm0, xmm0

.half_vert_variance16x_h_1:
    movdqu      xmm3, XMMWORD PTR [rsi] ; xmm3 = row i+1

    pavgb       xmm5, xmm3              ; xmm5 = avg(xmm5,xmm3)
    movdqa      xmm4, xmm5
    punpcklbw   xmm5, xmm0              ; xmm5 = words of pixels 0-7
    punpckhbw   xmm4, xmm0              ; xmm4 = words of pixels 8-15

    movq        xmm2, QWORD PTR [rdi]   ; src pixels 0-7
    punpcklbw   xmm2, xmm0
    psubw       xmm5, xmm2
    movq        xmm2, QWORD PTR [rdi+8] ; src pixels 8-15
    punpcklbw   xmm2, xmm0
    psubw       xmm4, xmm2

    paddw       xmm6, xmm5              ; xmm6 += accumulated column differences
    paddw       xmm6, xmm4
    pmaddwd     xmm5, xmm5              ; xmm5 *= xmm5
    pmaddwd     xmm4, xmm4
    paddd       xmm7, xmm5              ; xmm7 += accumulated square column differences
    paddd       xmm7, xmm4

    movdqa      xmm5, xmm3              ; current row becomes previous row

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]

    sub         rcx, 1
    jnz         .half_vert_variance16x_h_1

    pxor        xmm1, xmm1
    pxor        xmm5, xmm5

    ; sign-extend the 8 word sums in xmm6 to dwords (xmm0 is still zero):
    ; word -> upper half of dword, then arithmetic shift right by 16
    punpcklwd   xmm0, xmm6
    punpckhwd   xmm1, xmm6
    psrad       xmm0, 16
    psrad       xmm1, 16
    paddd       xmm0, xmm1
    movdqa      xmm1, xmm0

    ; horizontal add of the 4 dword lanes: xmm6 <- sse, xmm0 <- sum
    movdqa      xmm6, xmm7
    punpckldq   xmm6, xmm5
    punpckhdq   xmm7, xmm5
    paddd       xmm6, xmm7

    punpckldq   xmm0, xmm5
    punpckhdq   xmm1, xmm5
    paddd       xmm0, xmm1

    movdqa      xmm7, xmm6
    movdqa      xmm1, xmm0

    psrldq      xmm7, 8
    psrldq      xmm1, 8

    paddd       xmm6, xmm7
    paddd       xmm0, xmm1

    mov         rsi, arg(5)             ;[Sum]
    mov         rdi, arg(6)             ;[SSE]

    movd        [rsi], xmm0
    movd        [rdi], xmm6

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp9_half_horiz_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Half-pixel (horizontal) variance for an 8-wide, Height-row block.
; The prediction for each pixel is pavgb(r[y][x], r[y][x+1]) and the
; function stores
;     *sum        = sum(pred - src)
;     *sumsquared = sum((pred - src)^2)
; Reads 9 bytes per reference row.
;
; The difference sum is folded in 16-bit lanes, so *sum is the
; sign-extended 16-bit total.
; The reduction uses MMX registers and no emms is executed here; the
; caller has to clear the MMX state before any x87 use.
global sym(vp9_half_horiz_variance8x_h_sse2)
sym(vp9_half_horiz_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(1)    ;ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)    ;src_pixels_per_line
%endif

    pxor        xmm6, xmm6              ; error accumulator
    pxor        xmm7, xmm7              ; sse accumulator
    mov         rsi, arg(0)             ;ref_ptr

    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height

    pxor        xmm0, xmm0
.half_horiz_variance8x_h_1:
    movq        xmm5, QWORD PTR [rsi]   ; xmm5 = s0,s1,s2..s7
    movq        xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s8

    pavgb       xmm5, xmm3              ; xmm5 = avg(xmm5,xmm3)
    punpcklbw   xmm5, xmm0              ; xmm5 = words of above

    movq        xmm3, QWORD PTR [rdi]   ; xmm3 = d0,d1,d2..d7
    punpcklbw   xmm3, xmm0              ; xmm3 = words of above

    psubw       xmm5, xmm3              ; xmm5 -= xmm3
    paddw       xmm6, xmm5              ; xmm6 += accumulated column differences
    pmaddwd     xmm5, xmm5              ; xmm5 *= xmm5
    paddd       xmm7, xmm5              ; xmm7 += accumulated square column differences

%if ABI_IS_32BIT
    add         esi, dword ptr arg(1)   ;ref_pixels_per_line ; next source
    add         edi, dword ptr arg(3)   ;src_pixels_per_line ; next destination
%else
    add         rsi, r8
    add         rdi, r9
%endif
    sub         rcx, 1
    jnz         .half_horiz_variance8x_h_1

    ; fold the upper 64 bits of each accumulator onto the lower 64 bits
    movdq2q     mm6, xmm6
    movdq2q     mm7, xmm7

    psrldq      xmm6, 8
    psrldq      xmm7, 8

    movdq2q     mm2, xmm6
    movdq2q     mm3, xmm7

    paddw       mm6, mm2
    paddd       mm7, mm3

    pxor        mm3, mm3
    pxor        mm2, mm2

    ; word sums -> upper halves of dwords, add, arithmetic shift back
    punpcklwd   mm2, mm6
    punpckhwd   mm3, mm6

    paddd       mm2, mm3
    movq        mm6, mm2

    psrlq       mm6, 32
    paddd       mm2, mm6

    psrad       mm2, 16
    movq        mm4, mm7

    psrlq       mm4, 32
    paddd       mm4, mm7

    mov         rsi, arg(5)             ; sum
    mov         rdi, arg(6)             ; sumsquared

    movd        [rsi], mm2
    movd        [rdi], mm4


    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_half_horiz_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Half-pixel (horizontal) variance for a 16-wide, Height-row block.
; The prediction for each pixel is pavgb(r[y][x], r[y][x+1]) and the
; function stores
;     *sum        = sum(pred - src)
;     *sumsquared = sum((pred - src)^2)
; Reads 17 bytes per reference row.
; Differences are accumulated in 16-bit lanes and widened to 32 bits only
; in the final reduction.
global sym(vp9_half_horiz_variance16x_h_sse2)
sym(vp9_half_horiz_variance16x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    pxor        xmm6, xmm6              ; error accumulator
    pxor        xmm7, xmm7              ; sse accumulator
    mov         rsi, arg(0)             ;ref_ptr

    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height
    movsxd      rax, dword ptr arg(1)   ;ref_pixels_per_line
    movsxd      rdx, dword ptr arg(3)   ;src_pixels_per_line

    pxor        xmm0, xmm0

.half_horiz_variance16x_h_1:
    movdqu      xmm5, XMMWORD PTR [rsi]     ; xmm5 = s0,s1,s2..s15
    movdqu      xmm3, XMMWORD PTR [rsi+1]   ; xmm3 = s1,s2,s3..s16

    pavgb       xmm5, xmm3              ; xmm5 = avg(xmm5,xmm3)
    movdqa      xmm1, xmm5
    punpcklbw   xmm5, xmm0              ; xmm5 = words of pixels 0-7
    punpckhbw   xmm1, xmm0              ; xmm1 = words of pixels 8-15

    movq        xmm3, QWORD PTR [rdi]   ; xmm3 = d0,d1,d2..d7
    punpcklbw   xmm3, xmm0              ; xmm3 = words of above
    movq        xmm2, QWORD PTR [rdi+8] ; xmm2 = d8..d15
    punpcklbw   xmm2, xmm0

    psubw       xmm5, xmm3              ; xmm5 -= xmm3
    psubw       xmm1, xmm2
    paddw       xmm6, xmm5              ; xmm6 += accumulated column differences
    paddw       xmm6, xmm1
    pmaddwd     xmm5, xmm5              ; xmm5 *= xmm5
    pmaddwd     xmm1, xmm1
    paddd       xmm7, xmm5              ; xmm7 += accumulated square column differences
    paddd       xmm7, xmm1

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]

    sub         rcx, 1
    jnz         .half_horiz_variance16x_h_1

    pxor        xmm1, xmm1
    pxor        xmm5, xmm5

    ; sign-extend the 8 word sums in xmm6 to dwords (xmm0 is still zero):
    ; word -> upper half of dword, then arithmetic shift right by 16
    punpcklwd   xmm0, xmm6
    punpckhwd   xmm1, xmm6
    psrad       xmm0, 16
    psrad       xmm1, 16
    paddd       xmm0, xmm1
    movdqa      xmm1, xmm0

    ; horizontal add of the 4 dword lanes: xmm6 <- sse, xmm0 <- sum
    movdqa      xmm6, xmm7
    punpckldq   xmm6, xmm5
    punpckhdq   xmm7, xmm5
    paddd       xmm6, xmm7

    punpckldq   xmm0, xmm5
    punpckhdq   xmm1, xmm5
    paddd       xmm0, xmm1

    movdqa      xmm7, xmm6
    movdqa      xmm1, xmm0

    psrldq      xmm7, 8
    psrldq      xmm1, 8

    paddd       xmm6, xmm7
    paddd       xmm0, xmm1

    mov         rsi, arg(5)             ;[Sum]
    mov         rdi, arg(6)             ;[SSE]

    movd        [rsi], xmm0
    movd        [rdi], xmm6

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
; Rounding constant added before the xmm_filter_shift right shift.
; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
align 16
xmm_bi_rd:
    times 8 dw 64

; Bilinear filter taps, 16 entries of 32 bytes each (indexed as offset << 5
; by vp9_filter_block2d_bil_var_sse2). Each entry holds the first tap
; replicated 8 times followed by the second tap replicated 8 times, so
; both halves can be used directly as pmullw operands at [entry] and
; [entry+16]. The two taps of every entry sum to 128.
align 16
bilinear_filters_sse2:
    dw 128, 128, 128, 128, 128, 128, 128, 128,   0,   0,   0,   0,   0,   0,   0,   0
    dw 120, 120, 120, 120, 120, 120, 120, 120,   8,   8,   8,   8,   8,   8,   8,   8
    dw 112, 112, 112, 112, 112, 112, 112, 112,  16,  16,  16,  16,  16,  16,  16,  16
    dw 104, 104, 104, 104, 104, 104, 104, 104,  24,  24,  24,  24,  24,  24,  24,  24
    dw  96,  96,  96,  96,  96,  96,  96,  96,  32,  32,  32,  32,  32,  32,  32,  32
    dw  88,  88,  88,  88,  88,  88,  88,  88,  40,  40,  40,  40,  40,  40,  40,  40
    dw  80,  80,  80,  80,  80,  80,  80,  80,  48,  48,  48,  48,  48,  48,  48,  48
    dw  72,  72,  72,  72,  72,  72,  72,  72,  56,  56,  56,  56,  56,  56,  56,  56
    dw  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64
    dw  56,  56,  56,  56,  56,  56,  56,  56,  72,  72,  72,  72,  72,  72,  72,  72
    dw  48,  48,  48,  48,  48,  48,  48,  48,  80,  80,  80,  80,  80,  80,  80,  80
    dw  40,  40,  40,  40,  40,  40,  40,  40,  88,  88,  88,  88,  88,  88,  88,  88
    dw  32,  32,  32,  32,  32,  32,  32,  32,  96,  96,  96,  96,  96,  96,  96,  96
    dw  24,  24,  24,  24,  24,  24,  24,  24, 104, 104, 104, 104, 104, 104, 104, 104
    dw  16,  16,  16,  16,  16,  16,  16,  16, 112, 112, 112, 112, 112, 112, 112, 112
    dw   8,   8,   8,   8,   8,   8,   8,   8, 120, 120, 120, 120, 120, 120, 120, 120