| OLD | NEW |
| 1 ; | 1 ; |
| 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| 3 ; | 3 ; |
| 4 ; Use of this source code is governed by a BSD-style license | 4 ; Use of this source code is governed by a BSD-style license |
| 5 ; that can be found in the LICENSE file in the root of the source | 5 ; that can be found in the LICENSE file in the root of the source |
| 6 ; tree. An additional intellectual property rights grant can be found | 6 ; tree. An additional intellectual property rights grant can be found |
| 7 ; in the file PATENTS. All contributing project authors may | 7 ; in the file PATENTS. All contributing project authors may |
| 8 ; be found in the AUTHORS file in the root of the source tree. | 8 ; be found in the AUTHORS file in the root of the source tree. |
| 9 ; | 9 ; |
| 10 | 10 |
| 11 | 11 |
| 12 %include "vpx_ports/x86_abi_support.asm" | 12 %include "vpx_ports/x86_abi_support.asm" |
| 13 | 13 |
| 14 ;unsigned int vp8_sad16x16_wmt( | 14 ;unsigned int vp8_sad16x16_wmt( |
| 15 ; unsigned char *src_ptr, | 15 ; unsigned char *src_ptr, |
| 16 ; int src_stride, | 16 ; int src_stride, |
| 17 ; unsigned char *ref_ptr, | 17 ; unsigned char *ref_ptr, |
| 18 ; int ref_stride) | 18 ; int ref_stride) |
| 19 global sym(vp8_sad16x16_wmt) | 19 global sym(vp8_sad16x16_wmt) |
| 20 sym(vp8_sad16x16_wmt): | 20 sym(vp8_sad16x16_wmt): |
| 21 push rbp | 21 push rbp |
| 22 mov rbp, rsp | 22 mov rbp, rsp |
| 23 SHADOW_ARGS_TO_STACK 4 | 23 SHADOW_ARGS_TO_STACK 4 |
| 24 SAVE_XMM 6 |
| 24 push rsi | 25 push rsi |
| 25 push rdi | 26 push rdi |
| 26 ; end prolog | 27 ; end prolog |
| 27 | 28 |
| 28 mov rsi, arg(0) ;src_ptr | 29 mov rsi, arg(0) ;src_ptr |
| 29 mov rdi, arg(2) ;ref_ptr | 30 mov rdi, arg(2) ;ref_ptr |
| 30 | 31 |
| 31 movsxd rax, dword ptr arg(1) ;src_stride | 32 movsxd rax, dword ptr arg(1) ;src_stride |
| 32 movsxd rdx, dword ptr arg(3) ;ref_stride | 33 movsxd rdx, dword ptr arg(3) ;ref_stride |
| 33 | 34 |
| 34 lea rcx, [rsi+rax*8] | 35 lea rcx, [rsi+rax*8] |
| 35 | 36 |
| 36 lea rcx, [rcx+rax*8] | 37 lea rcx, [rcx+rax*8] |
| 37 pxor xmm7, xmm7 | 38 pxor xmm6, xmm6 |
| 38 | 39 |
| 39 x16x16sad_wmt_loop: | 40 x16x16sad_wmt_loop: |
| 40 | 41 |
| 41 movq xmm0, QWORD PTR [rsi] | 42 movq xmm0, QWORD PTR [rsi] |
| 42 movq xmm2, QWORD PTR [rsi+8] | 43 movq xmm2, QWORD PTR [rsi+8] |
| 43 | 44 |
| 44 movq xmm1, QWORD PTR [rdi] | 45 movq xmm1, QWORD PTR [rdi] |
| 45 movq xmm3, QWORD PTR [rdi+8] | 46 movq xmm3, QWORD PTR [rdi+8] |
| 46 | 47 |
| 47 movq xmm4, QWORD PTR [rsi+rax] | 48 movq xmm4, QWORD PTR [rsi+rax] |
| 48 movq xmm5, QWORD PTR [rdi+rdx] | 49 movq xmm5, QWORD PTR [rdi+rdx] |
| 49 | 50 |
| 50 | 51 |
| 51 punpcklbw xmm0, xmm2 | 52 punpcklbw xmm0, xmm2 |
| 52 punpcklbw xmm1, xmm3 | 53 punpcklbw xmm1, xmm3 |
| 53 | 54 |
| 54 psadbw xmm0, xmm1 | 55 psadbw xmm0, xmm1 |
| 55 movq xmm6, QWORD PTR [rsi+rax+8] | 56 movq xmm2, QWORD PTR [rsi+rax+8] |
| 56 | 57 |
| 57 movq xmm3, QWORD PTR [rdi+rdx+8] | 58 movq xmm3, QWORD PTR [rdi+rdx+8] |
| 58 lea rsi, [rsi+rax*2] | 59 lea rsi, [rsi+rax*2] |
| 59 | 60 |
| 60 lea rdi, [rdi+rdx*2] | 61 lea rdi, [rdi+rdx*2] |
| 61 punpcklbw xmm4, xmm6 | 62 punpcklbw xmm4, xmm2 |
| 62 | 63 |
| 63 punpcklbw xmm5, xmm3 | 64 punpcklbw xmm5, xmm3 |
| 64 psadbw xmm4, xmm5 | 65 psadbw xmm4, xmm5 |
| 65 | 66 |
| 66 paddw xmm7, xmm0 | 67 paddw xmm6, xmm0 |
| 67 paddw xmm7, xmm4 | 68 paddw xmm6, xmm4 |
| 68 | 69 |
| 69 cmp rsi, rcx | 70 cmp rsi, rcx |
| 70 jne x16x16sad_wmt_loop | 71 jne x16x16sad_wmt_loop |
| 71 | 72 |
| 72 movq xmm0, xmm7 | 73 movq xmm0, xmm6 |
| 73 psrldq xmm7, 8 | 74 psrldq xmm6, 8 |
| 74 | 75 |
| 75 paddw xmm0, xmm7 | 76 paddw xmm0, xmm6 |
| 76 movq rax, xmm0 | 77 movq rax, xmm0 |
| 77 | 78 |
| 78 ; begin epilog | 79 ; begin epilog |
| 79 pop rdi | 80 pop rdi |
| 80 pop rsi | 81 pop rsi |
| 82 RESTORE_XMM |
| 81 UNSHADOW_ARGS | 83 UNSHADOW_ARGS |
| 82 pop rbp | 84 pop rbp |
| 83 ret | 85 ret |
| 84 | 86 |
| 85 ;unsigned int vp8_sad8x16_wmt( | 87 ;unsigned int vp8_sad8x16_wmt( |
| 86 ; unsigned char *src_ptr, | 88 ; unsigned char *src_ptr, |
| 87 ; int src_stride, | 89 ; int src_stride, |
| 88 ; unsigned char *ref_ptr, | 90 ; unsigned char *ref_ptr, |
| 89 ; int ref_stride, | 91 ; int ref_stride, |
| 90 ; int max_err) | 92 ; int max_err) |
| (...skipping 14 matching lines...) |
| 105 movsxd rdx, dword ptr arg(3) ;ref_stride | 107 movsxd rdx, dword ptr arg(3) ;ref_stride |
| 106 | 108 |
| 107 lea rcx, [rsi+rbx*8] | 109 lea rcx, [rsi+rbx*8] |
| 108 | 110 |
| 109 lea rcx, [rcx+rbx*8] | 111 lea rcx, [rcx+rbx*8] |
| 110 pxor mm7, mm7 | 112 pxor mm7, mm7 |
| 111 | 113 |
| 112 x8x16sad_wmt_loop: | 114 x8x16sad_wmt_loop: |
| 113 | 115 |
| 114 movq rax, mm7 | 116 movq rax, mm7 |
| 115 cmp rax, arg(4) | 117 cmp eax, arg(4) |
| 116 jg x8x16sad_wmt_early_exit | 118 jg x8x16sad_wmt_early_exit |
| 117 | 119 |
| 118 movq mm0, QWORD PTR [rsi] | 120 movq mm0, QWORD PTR [rsi] |
| 119 movq mm1, QWORD PTR [rdi] | 121 movq mm1, QWORD PTR [rdi] |
| 120 | 122 |
| 121 movq mm2, QWORD PTR [rsi+rbx] | 123 movq mm2, QWORD PTR [rsi+rbx] |
| 122 movq mm3, QWORD PTR [rdi+rdx] | 124 movq mm3, QWORD PTR [rdi+rdx] |
| 123 | 125 |
| 124 psadbw mm0, mm1 | 126 psadbw mm0, mm1 |
| 125 psadbw mm2, mm3 | 127 psadbw mm2, mm3 |
| (...skipping 40 matching lines...) |
| 166 | 168 |
| 167 movsxd rbx, dword ptr arg(1) ;src_stride | 169 movsxd rbx, dword ptr arg(1) ;src_stride |
| 168 movsxd rdx, dword ptr arg(3) ;ref_stride | 170 movsxd rdx, dword ptr arg(3) ;ref_stride |
| 169 | 171 |
| 170 lea rcx, [rsi+rbx*8] | 172 lea rcx, [rsi+rbx*8] |
| 171 pxor mm7, mm7 | 173 pxor mm7, mm7 |
| 172 | 174 |
| 173 x8x8sad_wmt_loop: | 175 x8x8sad_wmt_loop: |
| 174 | 176 |
| 175 movq rax, mm7 | 177 movq rax, mm7 |
| 176 cmp rax, arg(4) | 178 cmp eax, arg(4) |
| 177 jg x8x8sad_wmt_early_exit | 179 jg x8x8sad_wmt_early_exit |
| 178 | 180 |
| 179 movq mm0, QWORD PTR [rsi] | 181 movq mm0, QWORD PTR [rsi] |
| 180 movq mm1, QWORD PTR [rdi] | 182 movq mm1, QWORD PTR [rdi] |
| 181 | 183 |
| 182 psadbw mm0, mm1 | 184 psadbw mm0, mm1 |
| 183 lea rsi, [rsi+rbx] | 185 lea rsi, [rsi+rbx] |
| 184 | 186 |
| 185 add rdi, rdx | 187 add rdi, rdx |
| 186 paddw mm7, mm0 | 188 paddw mm7, mm0 |
| (...skipping 88 matching lines...) |
| 275 | 277 |
| 276 movsxd rbx, dword ptr arg(1) ;src_stride | 278 movsxd rbx, dword ptr arg(1) ;src_stride |
| 277 movsxd rdx, dword ptr arg(3) ;ref_stride | 279 movsxd rdx, dword ptr arg(3) ;ref_stride |
| 278 | 280 |
| 279 lea rcx, [rsi+rbx*8] | 281 lea rcx, [rsi+rbx*8] |
| 280 pxor mm7, mm7 | 282 pxor mm7, mm7 |
| 281 | 283 |
| 282 x16x8sad_wmt_loop: | 284 x16x8sad_wmt_loop: |
| 283 | 285 |
| 284 movq rax, mm7 | 286 movq rax, mm7 |
| 285 cmp rax, arg(4) | 287 cmp eax, arg(4) |
| 286 jg x16x8sad_wmt_early_exit | 288 jg x16x8sad_wmt_early_exit |
| 287 | 289 |
| 288 movq mm0, QWORD PTR [rsi] | 290 movq mm0, QWORD PTR [rsi] |
| 289 movq mm2, QWORD PTR [rsi+8] | 291 movq mm2, QWORD PTR [rsi+8] |
| 290 | 292 |
| 291 movq mm1, QWORD PTR [rdi] | 293 movq mm1, QWORD PTR [rdi] |
| 292 movq mm3, QWORD PTR [rdi+8] | 294 movq mm3, QWORD PTR [rdi+8] |
| 293 | 295 |
| 294 movq mm4, QWORD PTR [rsi+rbx] | 296 movq mm4, QWORD PTR [rsi+rbx] |
| 295 movq mm5, QWORD PTR [rdi+rdx] | 297 movq mm5, QWORD PTR [rdi+rdx] |
| (...skipping 23 matching lines...) |
| 319 | 321 |
| 320 x16x8sad_wmt_early_exit: | 322 x16x8sad_wmt_early_exit: |
| 321 | 323 |
| 322 ; begin epilog | 324 ; begin epilog |
| 323 pop rdi | 325 pop rdi |
| 324 pop rsi | 326 pop rsi |
| 325 pop rbx | 327 pop rbx |
| 326 UNSHADOW_ARGS | 328 UNSHADOW_ARGS |
| 327 pop rbp | 329 pop rbp |
| 328 ret | 330 ret |
| 331 |
| 332 ;void vp8_copy32xn_sse2( |
| 333 ; unsigned char *src_ptr, |
| 334 ; int src_stride, |
| 335 ; unsigned char *dst_ptr, |
| 336 ; int dst_stride, |
| 337 ; int height); |
| 338 global sym(vp8_copy32xn_sse2) |
| 339 sym(vp8_copy32xn_sse2): |
| 340 push rbp |
| 341 mov rbp, rsp |
| 342 SHADOW_ARGS_TO_STACK 5 |
| 343 SAVE_XMM 7 |
| 344 push rsi |
| 345 push rdi |
| 346 ; end prolog |
| 347 |
| 348 mov rsi, arg(0) ;src_ptr |
| 349 mov rdi, arg(2) ;dst_ptr |
| 350 |
| 351 movsxd rax, dword ptr arg(1) ;src_stride |
| 352 movsxd rdx, dword ptr arg(3) ;dst_stride |
| 353 movsxd rcx, dword ptr arg(4) ;height |
| 354 |
| 355 block_copy_sse2_loopx4: |
| 356 movdqu xmm0, XMMWORD PTR [rsi] |
| 357 movdqu xmm1, XMMWORD PTR [rsi + 16] |
| 358 movdqu xmm2, XMMWORD PTR [rsi + rax] |
| 359 movdqu xmm3, XMMWORD PTR [rsi + rax + 16] |
| 360 |
| 361 lea rsi, [rsi+rax*2] |
| 362 |
| 363 movdqu xmm4, XMMWORD PTR [rsi] |
| 364 movdqu xmm5, XMMWORD PTR [rsi + 16] |
| 365 movdqu xmm6, XMMWORD PTR [rsi + rax] |
| 366 movdqu xmm7, XMMWORD PTR [rsi + rax + 16] |
| 367 |
| 368 lea rsi, [rsi+rax*2] |
| 369 |
| 370 movdqa XMMWORD PTR [rdi], xmm0 |
| 371 movdqa XMMWORD PTR [rdi + 16], xmm1 |
| 372 movdqa XMMWORD PTR [rdi + rdx], xmm2 |
| 373 movdqa XMMWORD PTR [rdi + rdx + 16], xmm3 |
| 374 |
| 375 lea rdi, [rdi+rdx*2] |
| 376 |
| 377 movdqa XMMWORD PTR [rdi], xmm4 |
| 378 movdqa XMMWORD PTR [rdi + 16], xmm5 |
| 379 movdqa XMMWORD PTR [rdi + rdx], xmm6 |
| 380 movdqa XMMWORD PTR [rdi + rdx + 16], xmm7 |
| 381 |
| 382 lea rdi, [rdi+rdx*2] |
| 383 |
| 384 sub rcx, 4 |
| 385 cmp rcx, 4 |
| 386 jge block_copy_sse2_loopx4 |
| 387 |
| 388 cmp rcx, 0 |
| 389 je copy_is_done |
| 390 |
| 391 block_copy_sse2_loop: |
| 392 movdqu xmm0, XMMWORD PTR [rsi] |
| 393 movdqu xmm1, XMMWORD PTR [rsi + 16] |
| 394 lea rsi, [rsi+rax] |
| 395 |
| 396 movdqa XMMWORD PTR [rdi], xmm0 |
| 397 movdqa XMMWORD PTR [rdi + 16], xmm1 |
| 398 lea rdi, [rdi+rdx] |
| 399 |
| 400 sub rcx, 1 |
| 401 jne block_copy_sse2_loop |
| 402 |
| 403 copy_is_done: |
| 404 ; begin epilog |
| 405 pop rdi |
| 406 pop rsi |
| 407 RESTORE_XMM |
| 408 UNSHADOW_ARGS |
| 409 pop rbp |
| 410 ret |
| OLD | NEW |