OLD | NEW |
(Empty) | |
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
| 10 |
| 11 |
| 12 %include "vpx_ports/x86_abi_support.asm" |
;-----------------------------------------------------------------------------
; void vp9_recon2b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
;
; For each of four rows: widens 8 bytes from s to 16-bit words, adds the
; matching 8 words from q with signed saturation, narrows back to bytes with
; unsigned saturation and stores the 8 result bytes to d.
;
;   s      : read as 8-byte rows at byte offsets 0, 8, 16 and 24
;   q      : read as 16-byte rows at byte offsets 0, 16, 32 and 48; used as a
;            128-bit memory operand of paddsw, so it must be 16-byte aligned
;   d      : rows written at d, d+stride, d+2*stride and d+3*stride
;   stride : sign-extended from 32 bits
;
; Registers used: rax, rdx, xmm0-xmm4; rsi and rdi are saved and restored.
;-----------------------------------------------------------------------------
global sym(vp9_recon2b_sse2)
sym(vp9_recon2b_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                     ; s
    mov         rdi, arg(2)                     ; d
    mov         rdx, arg(1)                     ; q
    movsxd      rax, dword ptr arg(3)           ; stride
    pxor        xmm0, xmm0                      ; zero: high bytes when widening

    ; row 0 -> d
    movq        xmm1, MMWORD PTR [rsi]
    punpcklbw   xmm1, xmm0                      ; bytes -> words
    paddsw      xmm1, XMMWORD PTR [rdx]
    packuswb    xmm1, xmm0                      ; words -> bytes, unsigned saturation
    movq        MMWORD PTR [rdi], xmm1

    ; row 1 -> d + stride
    movq        xmm2, MMWORD PTR [rsi+8]
    punpcklbw   xmm2, xmm0
    paddsw      xmm2, XMMWORD PTR [rdx+16]
    packuswb    xmm2, xmm0                      ; words -> bytes, unsigned saturation
    movq        MMWORD PTR [rdi+rax], xmm2

    ; row 2 -> d + 2*stride
    movq        xmm3, MMWORD PTR [rsi+16]
    punpcklbw   xmm3, xmm0
    paddsw      xmm3, XMMWORD PTR [rdx+32]
    packuswb    xmm3, xmm0                      ; words -> bytes, unsigned saturation
    movq        MMWORD PTR [rdi+rax*2], xmm3

    ; row 3 -> d + 3*stride (rdi is advanced by one stride first, because
    ; an index scale of 3 cannot be encoded)
    add         rdi, rax
    movq        xmm4, MMWORD PTR [rsi+24]
    punpcklbw   xmm4, xmm0
    paddsw      xmm4, XMMWORD PTR [rdx+48]
    packuswb    xmm4, xmm0                      ; words -> bytes, unsigned saturation
    movq        MMWORD PTR [rdi+rax*2], xmm4

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
| 62 |
| 63 |
;-----------------------------------------------------------------------------
; void vp9_recon4b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
;
; For each of four rows: widens 16 bytes from s to 16-bit words, adds the
; matching 16 words from q with signed saturation, narrows back to bytes with
; unsigned saturation and stores the 16 result bytes to d.
;
;   s      : read as 16-byte rows at byte offsets 0, 16, 32 and 48 (movdqa)
;   q      : read as 32-byte rows at byte offsets 0, 32, 64 and 96 (paddsw
;            memory operands)
;   d      : rows written at d, d+stride, d+2*stride and d+3*stride (movdqa)
;   stride : sign-extended from 32 bits
;
; Every access is an aligned 128-bit access, so s and q must be 16-byte
; aligned, and d and stride must keep each destination row 16-byte aligned.
;
; Registers used: rax, rdx, xmm0-xmm7; rsi and rdi are saved and restored.
; SAVE_XMM 7 / RESTORE_XMM bracket the use of xmm6 and xmm7.
;-----------------------------------------------------------------------------
global sym(vp9_recon4b_sse2)
sym(vp9_recon4b_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                     ; s
    mov         rdi, arg(2)                     ; d
    mov         rdx, arg(1)                     ; q
    movsxd      rax, dword ptr arg(3)           ; stride
    pxor        xmm0, xmm0                      ; zero: high bytes when widening

    ; row 0 -> d
    movdqa      xmm1, XMMWORD PTR [rsi]
    movdqa      xmm5, xmm1
    punpcklbw   xmm1, xmm0                      ; low 8 bytes  -> words
    punpckhbw   xmm5, xmm0                      ; high 8 bytes -> words
    paddsw      xmm1, XMMWORD PTR [rdx]
    paddsw      xmm5, XMMWORD PTR [rdx+16]
    packuswb    xmm1, xmm5                      ; words -> bytes, unsigned saturation
    movdqa      XMMWORD PTR [rdi], xmm1

    ; row 1 -> d + stride
    movdqa      xmm2, XMMWORD PTR [rsi+16]
    movdqa      xmm6, xmm2
    punpcklbw   xmm2, xmm0
    punpckhbw   xmm6, xmm0
    paddsw      xmm2, XMMWORD PTR [rdx+32]
    paddsw      xmm6, XMMWORD PTR [rdx+48]
    packuswb    xmm2, xmm6                      ; words -> bytes, unsigned saturation
    movdqa      XMMWORD PTR [rdi+rax], xmm2

    ; row 2 -> d + 2*stride
    movdqa      xmm3, XMMWORD PTR [rsi+32]
    movdqa      xmm7, xmm3
    punpcklbw   xmm3, xmm0
    punpckhbw   xmm7, xmm0
    paddsw      xmm3, XMMWORD PTR [rdx+64]
    paddsw      xmm7, XMMWORD PTR [rdx+80]
    packuswb    xmm3, xmm7                      ; words -> bytes, unsigned saturation
    movdqa      XMMWORD PTR [rdi+rax*2], xmm3

    ; row 3 -> d + 3*stride (rdi is advanced by one stride first, because
    ; an index scale of 3 cannot be encoded)
    add         rdi, rax
    movdqa      xmm4, XMMWORD PTR [rsi+48]
    movdqa      xmm5, xmm4
    punpcklbw   xmm4, xmm0
    punpckhbw   xmm5, xmm0
    paddsw      xmm4, XMMWORD PTR [rdx+96]
    paddsw      xmm5, XMMWORD PTR [rdx+112]
    packuswb    xmm4, xmm5                      ; words -> bytes, unsigned saturation
    movdqa      XMMWORD PTR [rdi+rax*2], xmm4

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 127 |
| 128 |
;-----------------------------------------------------------------------------
; void vp9_copy_mem16x16_sse2(
;     unsigned char *src,
;     int src_stride,
;     unsigned char *dst,
;     int dst_stride
; )
;
; Copies a block of 16 rows x 16 bytes from src to dst.
;
;   src : loaded with movdqu, so it may be unaligned
;   dst : stored with movdqa, so dst and dst_stride must keep every
;         destination row 16-byte aligned
;
; Rows are moved three at a time; loads for the next group are interleaved
; with the stores of the current one. The instruction order is kept exactly
; as written.
;
; Registers used: rax (src_stride), rcx (dst_stride), xmm0-xmm5;
; rsi and rdi are saved and restored.
;-----------------------------------------------------------------------------
global sym(vp9_copy_mem16x16_sse2)
sym(vp9_copy_mem16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                     ; src
    movdqu      xmm0, [rsi]                     ; row 0

    movsxd      rax, dword ptr arg(1)           ; src_stride
    mov         rdi, arg(2)                     ; dst

    movdqu      xmm1, [rsi+rax]                 ; row 1
    movdqu      xmm2, [rsi+rax*2]               ; row 2

    movsxd      rcx, dword ptr arg(3)           ; dst_stride
    lea         rsi, [rsi+rax*2]

    movdqa      [rdi], xmm0                     ; row 0
    add         rsi, rax                        ; rsi -> src row 3

    movdqa      [rdi+rcx], xmm1                 ; row 1
    movdqa      [rdi+rcx*2], xmm2               ; row 2

    lea         rdi, [rdi+rcx*2]
    movdqu      xmm3, [rsi]                     ; row 3

    add         rdi, rcx                        ; rdi -> dst row 3
    movdqu      xmm4, [rsi+rax]                 ; row 4

    movdqu      xmm5, [rsi+rax*2]               ; row 5
    lea         rsi, [rsi+rax*2]

    movdqa      [rdi], xmm3                     ; row 3
    add         rsi, rax                        ; rsi -> src row 6

    movdqa      [rdi+rcx], xmm4                 ; row 4
    movdqa      [rdi+rcx*2], xmm5               ; row 5

    lea         rdi, [rdi+rcx*2]
    movdqu      xmm0, [rsi]                     ; row 6

    add         rdi, rcx                        ; rdi -> dst row 6
    movdqu      xmm1, [rsi+rax]                 ; row 7

    movdqu      xmm2, [rsi+rax*2]               ; row 8
    lea         rsi, [rsi+rax*2]

    movdqa      [rdi], xmm0                     ; row 6
    add         rsi, rax                        ; rsi -> src row 9

    movdqa      [rdi+rcx], xmm1                 ; row 7

    movdqa      [rdi+rcx*2], xmm2               ; row 8
    movdqu      xmm3, [rsi]                     ; row 9

    movdqu      xmm4, [rsi+rax]                 ; row 10
    lea         rdi, [rdi+rcx*2]

    add         rdi, rcx                        ; rdi -> dst row 9
    movdqu      xmm5, [rsi+rax*2]               ; row 11

    lea         rsi, [rsi+rax*2]
    movdqa      [rdi], xmm3                     ; row 9

    add         rsi, rax                        ; rsi -> src row 12
    movdqa      [rdi+rcx], xmm4                 ; row 10

    movdqa      [rdi+rcx*2], xmm5               ; row 11
    movdqu      xmm0, [rsi]                     ; row 12

    lea         rdi, [rdi+rcx*2]
    movdqu      xmm1, [rsi+rax]                 ; row 13

    add         rdi, rcx                        ; rdi -> dst row 12
    movdqu      xmm2, [rsi+rax*2]               ; row 14

    lea         rsi, [rsi+rax*2]                ; rsi -> src row 14
    movdqa      [rdi], xmm0                     ; row 12

    movdqa      [rdi+rcx], xmm1                 ; row 13
    movdqa      [rdi+rcx*2], xmm2               ; row 14

    movdqu      xmm3, [rsi+rax]                 ; row 15
    lea         rdi, [rdi+rcx*2]                ; rdi -> dst row 14

    movdqa      [rdi+rcx], xmm3                 ; row 15

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
| 232 |
| 233 |
;-----------------------------------------------------------------------------
; void vp9_intra_pred_uv_dc_mmx2(
;     unsigned char *dst,
;     int dst_stride,
;     unsigned char *src,
;     int src_stride
; )
;
; DC prediction for an 8x8 block. Fills all 64 bytes of dst with
;     (sum of the 8 bytes at src - src_stride
;      + sum of the 8 bytes at src - 1 + r*src_stride, r = 0..7
;      + 8) >> 4
;
; Registers used: rax, rcx, rdx, mm0, mm1; rsi and rdi are saved and restored.
; NOTE(review): MMX registers are used and no emms is issued here; presumably
; the caller clears the MMX state - confirm.
;-----------------------------------------------------------------------------
global sym(vp9_intra_pred_uv_dc_mmx2)
sym(vp9_intra_pred_uv_dc_mmx2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

    ; from top: mm1 = sum of the 8 bytes in the row above the block
    mov         rsi, arg(2)                     ; src
    movsxd      rax, dword ptr arg(3)           ; src_stride
    sub         rsi, rax                        ; rsi -> row above
    pxor        mm0, mm0
    movq        mm1, [rsi]
    psadbw      mm1, mm0                        ; sum of absolute differences vs. zero

    ; from left: ecx = sum of the 8 bytes in the column left of the block
    dec         rsi                             ; rsi -> top-left neighbour
    lea         rdi, [rax*3]                    ; rdi = 3 * src_stride
    movzx       ecx, byte [rsi+rax]             ; left of row 0
    movzx       edx, byte [rsi+rax*2]           ; left of row 1
    add         ecx, edx
    movzx       edx, byte [rsi+rdi]             ; left of row 2
    add         ecx, edx
    lea         rsi, [rsi+rax*4]                ; rsi -> left of row 3
    movzx       edx, byte [rsi]                 ; left of row 3
    add         ecx, edx
    movzx       edx, byte [rsi+rax]             ; left of row 4
    add         ecx, edx
    movzx       edx, byte [rsi+rax*2]           ; left of row 5
    add         ecx, edx
    movzx       edx, byte [rsi+rdi]             ; left of row 6
    add         ecx, edx
    movzx       edx, byte [rsi+rax*4]           ; left of row 7
    add         ecx, edx

    ; add up: edx = (top + left + 8) >> 4, then broadcast to 8 bytes
    pextrw      edx, mm1, 0x0
    lea         edx, [edx+ecx+8]
    sar         edx, 4
    movd        mm1, edx
    pshufw      mm1, mm1, 0x0                   ; replicate into 4 words
    packuswb    mm1, mm1                        ; 4 words -> 8 identical bytes

    ; write out
    mov         rdi, arg(0)                     ; dst
    movsxd      rcx, dword ptr arg(1)           ; dst_stride
    lea         rax, [rcx*3]                    ; rax = 3 * dst_stride

    movq        [rdi      ], mm1
    movq        [rdi+rcx  ], mm1
    movq        [rdi+rcx*2], mm1
    movq        [rdi+rax  ], mm1
    lea         rdi, [rdi+rcx*4]
    movq        [rdi      ], mm1
    movq        [rdi+rcx  ], mm1
    movq        [rdi+rcx*2], mm1
    movq        [rdi+rax  ], mm1

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
| 306 |
;-----------------------------------------------------------------------------
; void vp9_intra_pred_uv_dctop_mmx2(
;     unsigned char *dst,
;     int dst_stride,
;     unsigned char *src,
;     int src_stride
; )
;
; DC prediction for an 8x8 block using only the row above. Fills all 64
; bytes of dst with
;     (sum of the 8 bytes at src - src_stride + 4) >> 3
;
; Registers used: rax, rcx, mm0, mm1; rsi and rdi are saved and restored.
; GET_GOT rbx / RESTORE_GOT bracket the body for the GLOBAL(dc_4) access.
; NOTE(review): MMX registers are used and no emms is issued here; presumably
; the caller clears the MMX state - confirm.
;-----------------------------------------------------------------------------
global sym(vp9_intra_pred_uv_dctop_mmx2)
sym(vp9_intra_pred_uv_dctop_mmx2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; from top: mm1 = sum of the 8 bytes in the row above the block
    mov         rsi, arg(2)                     ; src
    movsxd      rax, dword ptr arg(3)           ; src_stride
    sub         rsi, rax                        ; rsi -> row above
    pxor        mm0, mm0
    movq        mm1, [rsi]
    psadbw      mm1, mm0                        ; sum of absolute differences vs. zero

    ; add up: (sum + 4) >> 3, then broadcast to 8 bytes
    paddw       mm1, [GLOBAL(dc_4)]
    psraw       mm1, 3
    pshufw      mm1, mm1, 0x0                   ; replicate into 4 words
    packuswb    mm1, mm1                        ; 4 words -> 8 identical bytes

    ; write out
    mov         rdi, arg(0)                     ; dst
    movsxd      rcx, dword ptr arg(1)           ; dst_stride
    lea         rax, [rcx*3]                    ; rax = 3 * dst_stride

    movq        [rdi      ], mm1
    movq        [rdi+rcx  ], mm1
    movq        [rdi+rcx*2], mm1
    movq        [rdi+rax  ], mm1
    lea         rdi, [rdi+rcx*4]
    movq        [rdi      ], mm1
    movq        [rdi+rcx  ], mm1
    movq        [rdi+rcx*2], mm1
    movq        [rdi+rax  ], mm1

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
| 359 |
;-----------------------------------------------------------------------------
; void vp9_intra_pred_uv_dcleft_mmx2(
;     unsigned char *dst,
;     int dst_stride,
;     unsigned char *src,
;     int src_stride
; )
;
; DC prediction for an 8x8 block using only the column to the left. Fills
; all 64 bytes of dst with
;     (sum of the 8 bytes at src - 1 + r*src_stride, r = 0..7, + 4) >> 3
;
; Registers used: rax, rcx, rdx, mm1; rsi and rdi are saved and restored.
; NOTE(review): MMX registers are used and no emms is issued here; presumably
; the caller clears the MMX state - confirm.
;-----------------------------------------------------------------------------
global sym(vp9_intra_pred_uv_dcleft_mmx2)
sym(vp9_intra_pred_uv_dcleft_mmx2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

    ; from left: accumulate the 8 bytes in the column left of the block
    mov         rsi, arg(2)                     ; src
    movsxd      rax, dword ptr arg(3)           ; src_stride
    dec         rsi                             ; rsi -> left of row 0
    lea         rdi, [rax*3]                    ; rdi = 3 * src_stride
    movzx       ecx, byte [rsi]                 ; left of row 0
    movzx       edx, byte [rsi+rax]             ; left of row 1
    add         ecx, edx
    movzx       edx, byte [rsi+rax*2]           ; left of row 2
    add         ecx, edx
    movzx       edx, byte [rsi+rdi]             ; left of row 3
    add         ecx, edx
    lea         rsi, [rsi+rax*4]                ; rsi -> left of row 4
    movzx       edx, byte [rsi]                 ; left of row 4
    add         ecx, edx
    movzx       edx, byte [rsi+rax]             ; left of row 5
    add         ecx, edx
    movzx       edx, byte [rsi+rax*2]           ; left of row 6
    add         ecx, edx
    movzx       edx, byte [rsi+rdi]             ; left of row 7

    ; add up: the final add is folded together with the rounding term,
    ; edx = (sum + 4) >> 3, then broadcast to 8 bytes
    lea         edx, [ecx+edx+4]
    shr         edx, 3
    movd        mm1, edx
    pshufw      mm1, mm1, 0x0                   ; replicate into 4 words
    packuswb    mm1, mm1                        ; 4 words -> 8 identical bytes

    ; write out
    mov         rdi, arg(0)                     ; dst
    movsxd      rcx, dword ptr arg(1)           ; dst_stride
    lea         rax, [rcx*3]                    ; rax = 3 * dst_stride

    movq        [rdi      ], mm1
    movq        [rdi+rcx  ], mm1
    movq        [rdi+rcx*2], mm1
    movq        [rdi+rax  ], mm1
    lea         rdi, [rdi+rcx*4]
    movq        [rdi      ], mm1
    movq        [rdi+rcx  ], mm1
    movq        [rdi+rcx*2], mm1
    movq        [rdi+rax  ], mm1

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
| 424 |
;-----------------------------------------------------------------------------
; void vp9_intra_pred_uv_dc128_mmx(
;     unsigned char *dst,
;     int dst_stride,
;     unsigned char *src,
;     int src_stride
; )
;
; Fills an 8x8 block at dst with the constant byte value 128 (dc_128).
; src and src_stride are not read by this function.
;
; Registers used: rax, rcx, rdx, mm1.
; GET_GOT rbx / RESTORE_GOT bracket the body for the GLOBAL(dc_128) access.
; NOTE(review): MMX registers are used and no emms is issued here; presumably
; the caller clears the MMX state - confirm.
;-----------------------------------------------------------------------------
global sym(vp9_intra_pred_uv_dc128_mmx)
sym(vp9_intra_pred_uv_dc128_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    GET_GOT     rbx
    ; end prolog

    ; write out
    movq        mm1, [GLOBAL(dc_128)]           ; 8 bytes of 128
    mov         rax, arg(0)                     ; dst
    movsxd      rdx, dword ptr arg(1)           ; dst_stride
    lea         rcx, [rdx*3]                    ; rcx = 3 * dst_stride

    movq        [rax      ], mm1
    movq        [rax+rdx  ], mm1
    movq        [rax+rdx*2], mm1
    movq        [rax+rcx  ], mm1
    lea         rax, [rax+rdx*4]
    movq        [rax      ], mm1
    movq        [rax+rdx  ], mm1
    movq        [rax+rdx*2], mm1
    movq        [rax+rcx  ], mm1

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
| 460 |
;-----------------------------------------------------------------------------
; void vp9_intra_pred_uv_tm_sse2(
;     unsigned char *dst,
;     int dst_stride,
;     unsigned char *src,
;     int src_stride
; )
; void vp9_intra_pred_uv_tm_ssse3(   ; same parameters
;
; Macro parameter %1 is the suffix (sse2 or ssse3); one function is emitted
; per instantiation below.
;
; For an 8x8 block, with top[c] the byte at src - src_stride + c, left[r]
; the byte at src - 1 + r*src_stride and topleft the byte at
; src - src_stride - 1, writes
;     dst[r][c] = saturate_u8(left[r] + top[c] - topleft)
; Two rows are produced per loop iteration, four iterations in total.
;
; The two variants differ only in how a single byte is broadcast to eight
; 16-bit words: sse2 uses punpcklbw/pshuflw/punpcklqdq, ssse3 uses pshufb
; with the dc_1024 control mask held in xmm2.
;
; Registers used: rax, rcx, rdx, xmm0-xmm3, xmm5; rsi and rdi are saved
; and restored. GET_GOT rbx / RESTORE_GOT bracket the body.
;-----------------------------------------------------------------------------
%macro vp9_intra_pred_uv_tm 1
global sym(vp9_intra_pred_uv_tm_%1)
sym(vp9_intra_pred_uv_tm_%1):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; read top row
    mov         edx, 4                          ; loop counter: 4 x 2 rows
    mov         rsi, arg(2)                     ; src
    movsxd      rax, dword ptr arg(3)           ; src_stride
    sub         rsi, rax                        ; rsi -> row above
    pxor        xmm0, xmm0
%ifidn %1, ssse3
    movdqa      xmm2, [GLOBAL(dc_1024)]         ; pshufb byte-broadcast mask
%endif
    movq        xmm1, [rsi]
    punpcklbw   xmm1, xmm0                      ; xmm1 = top[0..7] as words

    ; set up left ptrs and subtract topleft
    movd        xmm3, [rsi-1]                   ; low byte = topleft
    lea         rsi, [rsi+rax-1]                ; rsi -> left of row 0
%ifidn %1, sse2
    punpcklbw   xmm3, xmm0
    pshuflw     xmm3, xmm3, 0x0
    punpcklqdq  xmm3, xmm3
%else
    pshufb      xmm3, xmm2
%endif
    psubw       xmm1, xmm3                      ; xmm1 = top[c] - topleft

    ; set up dest ptrs
    mov         rdi, arg(0)                     ; dst
    movsxd      rcx, dword ptr arg(1)           ; dst_stride

.vp9_intra_pred_uv_tm_%1_loop:
    movd        xmm3, [rsi]                     ; low byte = left of this row
    movd        xmm5, [rsi+rax]                 ; low byte = left of next row
%ifidn %1, sse2
    punpcklbw   xmm3, xmm0
    punpcklbw   xmm5, xmm0
    pshuflw     xmm3, xmm3, 0x0
    pshuflw     xmm5, xmm5, 0x0
    punpcklqdq  xmm3, xmm3
    punpcklqdq  xmm5, xmm5
%else
    pshufb      xmm3, xmm2
    pshufb      xmm5, xmm2
%endif
    paddw       xmm3, xmm1
    paddw       xmm5, xmm1
    packuswb    xmm3, xmm5                      ; low 8 bytes: this row, high 8: next row
    movq        [rdi    ], xmm3
    movhps      [rdi+rcx], xmm3
    lea         rsi, [rsi+rax*2]
    lea         rdi, [rdi+rcx*2]
    dec         edx
    jnz         .vp9_intra_pred_uv_tm_%1_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
%endmacro

vp9_intra_pred_uv_tm sse2
vp9_intra_pred_uv_tm ssse3
| 541 |
;-----------------------------------------------------------------------------
; void vp9_intra_pred_uv_ve_mmx(
;     unsigned char *dst,
;     int dst_stride,
;     unsigned char *src,
;     int src_stride
; )
;
; Vertical prediction for an 8x8 block: copies the 8 bytes at
; src - src_stride into each of the 8 rows of dst.
;
; Registers used: rax, rcx, rdx, mm1.
; NOTE(review): MMX registers are used and no emms is issued here; presumably
; the caller clears the MMX state - confirm.
;-----------------------------------------------------------------------------
global sym(vp9_intra_pred_uv_ve_mmx)
sym(vp9_intra_pred_uv_ve_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    ; end prolog

    ; read from top
    mov         rax, arg(2)                     ; src
    movsxd      rdx, dword ptr arg(3)           ; src_stride
    sub         rax, rdx                        ; rax -> row above
    movq        mm1, [rax]

    ; write out
    mov         rax, arg(0)                     ; dst
    movsxd      rdx, dword ptr arg(1)           ; dst_stride
    lea         rcx, [rdx*3]                    ; rcx = 3 * dst_stride

    movq        [rax      ], mm1
    movq        [rax+rdx  ], mm1
    movq        [rax+rdx*2], mm1
    movq        [rax+rcx  ], mm1
    lea         rax, [rax+rdx*4]
    movq        [rax      ], mm1
    movq        [rax+rdx  ], mm1
    movq        [rax+rdx*2], mm1
    movq        [rax+rcx  ], mm1

    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret
| 580 |
;-----------------------------------------------------------------------------
; void vp9_intra_pred_uv_ho_mmx2(
;     unsigned char *dst,
;     int dst_stride,
;     unsigned char *src,
;     int src_stride
; )
; void vp9_intra_pred_uv_ho_ssse3(   ; same parameters
;
; Macro parameter %1 is the suffix (mmx2 or ssse3); one function is emitted
; per instantiation below.
;
; Horizontal prediction for an 8x8 block: every row r of dst is filled with
; the byte at src - 1 + r*src_stride.
;
;   mmx2  : loop of 4 iterations, two rows each; the byte is broadcast with
;           punpcklbw + pshufw.
;   ssse3 : straight-line, four rows per half; two left bytes are
;           interleaved and expanded to two rows by pshufb with the
;           dc_00001111 mask. rbx doubles as 3 * src_stride after the mask
;           has been loaded, so it is pushed explicitly when GET_GOT does not
;           save it itself (GET_GOT_SAVE_ARG undefined).
;
; Registers used: rax, rcx, rdx, plus mm0/mm1 (mmx2) or rbx and
; xmm0-xmm4 (ssse3); rsi and rdi are saved and restored.
; NOTE(review): the mmx2 variant uses MMX registers and issues no emms;
; presumably the caller clears the MMX state - confirm.
;-----------------------------------------------------------------------------
%macro vp9_intra_pred_uv_ho 1
global sym(vp9_intra_pred_uv_ho_%1)
sym(vp9_intra_pred_uv_ho_%1):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
%ifidn %1, ssse3
%ifndef GET_GOT_SAVE_ARG
    push        rbx                             ; rbx is overwritten below
%endif
    GET_GOT     rbx
%endif
    ; end prolog

    ; read from left and write out
%ifidn %1, mmx2
    mov         edx, 4                          ; loop counter: 4 x 2 rows
%endif
    mov         rsi, arg(2)                     ; src
    movsxd      rax, dword ptr arg(3)           ; src_stride
    mov         rdi, arg(0)                     ; dst
    movsxd      rcx, dword ptr arg(1)           ; dst_stride
%ifidn %1, ssse3
    lea         rdx, [rcx*3]                    ; rdx = 3 * dst_stride
    movdqa      xmm2, [GLOBAL(dc_00001111)]     ; pshufb mask: 8 x byte 0, 8 x byte 1
    lea         rbx, [rax*3]                    ; rbx = 3 * src_stride (GOT use is over)
%endif
    dec         rsi                             ; rsi -> left of row 0
%ifidn %1, mmx2
.vp9_intra_pred_uv_ho_%1_loop:
    movd        mm0, [rsi]                      ; low byte = left of this row
    movd        mm1, [rsi+rax]                  ; low byte = left of next row
    punpcklbw   mm0, mm0                        ; duplicate byte into a word
    punpcklbw   mm1, mm1
    pshufw      mm0, mm0, 0x0                   ; replicate word 0 -> 8 identical bytes
    pshufw      mm1, mm1, 0x0
    movq        [rdi    ], mm0
    movq        [rdi+rcx], mm1
    lea         rsi, [rsi+rax*2]
    lea         rdi, [rdi+rcx*2]
    dec         edx
    jnz         .vp9_intra_pred_uv_ho_%1_loop
%else
    ; rows 0-3
    movd        xmm0, [rsi]
    movd        xmm3, [rsi+rax]
    movd        xmm1, [rsi+rax*2]
    movd        xmm4, [rsi+rbx]
    punpcklbw   xmm0, xmm3                      ; byte 0 = row 0, byte 1 = row 1
    punpcklbw   xmm1, xmm4                      ; byte 0 = row 2, byte 1 = row 3
    pshufb      xmm0, xmm2
    pshufb      xmm1, xmm2
    movq        [rdi      ], xmm0
    movhps      [rdi+rcx  ], xmm0
    movq        [rdi+rcx*2], xmm1
    movhps      [rdi+rdx  ], xmm1
    lea         rsi, [rsi+rax*4]
    lea         rdi, [rdi+rcx*4]
    ; rows 4-7
    movd        xmm0, [rsi]
    movd        xmm3, [rsi+rax]
    movd        xmm1, [rsi+rax*2]
    movd        xmm4, [rsi+rbx]
    punpcklbw   xmm0, xmm3
    punpcklbw   xmm1, xmm4
    pshufb      xmm0, xmm2
    pshufb      xmm1, xmm2
    movq        [rdi      ], xmm0
    movhps      [rdi+rcx  ], xmm0
    movq        [rdi+rcx*2], xmm1
    movhps      [rdi+rdx  ], xmm1
%endif

    ; begin epilog
%ifidn %1, ssse3
    RESTORE_GOT
%ifndef GET_GOT_SAVE_ARG
    pop         rbx
%endif
%endif
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
%endmacro

vp9_intra_pred_uv_ho mmx2
vp9_intra_pred_uv_ho ssse3
| 676 |
SECTION_RODATA
; 8 bytes of 128: fill value loaded by vp9_intra_pred_uv_dc128_mmx.
dc_128:
    times 8 db 128
; 4 words of 4: rounding term added before the >> 3 in
; vp9_intra_pred_uv_dctop_mmx2.
dc_4:
    times 4 dw 4
; pshufb control for vp9_intra_pred_uv_tm_ssse3: every word selects source
; byte 0 for its low byte and source byte 4 for its high byte. The source is
; loaded with movd, so byte 4 is zero and the result is byte 0 zero-extended
; into all 8 words. Loaded with movdqa, hence the 16-byte alignment.
align 16
dc_1024:
    times 8 dw 0x400
; pshufb control for vp9_intra_pred_uv_ho_ssse3: the low 8 result bytes copy
; source byte 0 and the high 8 copy source byte 1. Loaded with movdqa, hence
; the 16-byte alignment.
align 16
dc_00001111:
    times 8 db 0
    times 8 db 1
OLD | NEW |