| OLD | NEW | 
|---|
| 1 ; | 1 ; | 
| 2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 
| 3 ; | 3 ; | 
| 4 ;  Use of this source code is governed by a BSD-style license | 4 ;  Use of this source code is governed by a BSD-style license | 
| 5 ;  that can be found in the LICENSE file in the root of the source | 5 ;  that can be found in the LICENSE file in the root of the source | 
| 6 ;  tree. An additional intellectual property rights grant can be found | 6 ;  tree. An additional intellectual property rights grant can be found | 
| 7 ;  in the file PATENTS.  All contributing project authors may | 7 ;  in the file PATENTS.  All contributing project authors may | 
| 8 ;  be found in the AUTHORS file in the root of the source tree. | 8 ;  be found in the AUTHORS file in the root of the source tree. | 
| 9 ; | 9 ; | 
| 10 | 10 | 
| (...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
| 60     pop         rbp | 60     pop         rbp | 
| 61     ret | 61     ret | 
| 62 | 62 | 
| 63 | 63 | 
| 64 ;void vp8_recon4b_sse2(unsigned char *s, short *q, unsigned char *d, int stride) | 64 ;void vp8_recon4b_sse2(unsigned char *s, short *q, unsigned char *d, int stride) | 
| 65 global sym(vp8_recon4b_sse2) | 65 global sym(vp8_recon4b_sse2) | 
| 66 sym(vp8_recon4b_sse2): | 66 sym(vp8_recon4b_sse2): | 
| 67     push        rbp | 67     push        rbp | 
| 68     mov         rbp, rsp | 68     mov         rbp, rsp | 
| 69     SHADOW_ARGS_TO_STACK 4 | 69     SHADOW_ARGS_TO_STACK 4 | 
| 70     SAVE_XMM | 70     SAVE_XMM 7 | 
| 71     push        rsi | 71     push        rsi | 
| 72     push        rdi | 72     push        rdi | 
| 73     ; end prolog | 73     ; end prolog | 
| 74 | 74 | 
| 75         mov         rsi,        arg(0) ;s | 75         mov         rsi,        arg(0) ;s | 
| 76         mov         rdi,        arg(2) ;d | 76         mov         rdi,        arg(2) ;d | 
| 77         mov         rdx,        arg(1) ;q | 77         mov         rdx,        arg(1) ;q | 
| 78         movsxd      rax,        dword ptr arg(3) ;stride | 78         movsxd      rax,        dword ptr arg(3) ;stride | 
| 79         pxor        xmm0,       xmm0 | 79         pxor        xmm0,       xmm0 | 
| 80 | 80 | 
| (...skipping 141 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
| 222         lea         rdi,        [rdi+rcx*2] | 222         lea         rdi,        [rdi+rcx*2] | 
| 223 | 223 | 
| 224         movdqa      [rdi+rcx],  xmm3 | 224         movdqa      [rdi+rcx],  xmm3 | 
| 225 | 225 | 
| 226     ; begin epilog | 226     ; begin epilog | 
| 227     pop rdi | 227     pop rdi | 
| 228     pop rsi | 228     pop rsi | 
| 229     UNSHADOW_ARGS | 229     UNSHADOW_ARGS | 
| 230     pop         rbp | 230     pop         rbp | 
| 231     ret | 231     ret | 
|  | 232 | 
|  | 233 | 
|  | 234 ;void vp8_intra_pred_uv_dc_mmx2( | 
|  | 235 ;    unsigned char *dst,   - arg(0): start of the 8 output rows; 8 bytes are written per row | 
|  | 236 ;    int dst_stride,       - arg(1): byte offset from one output row to the next | 
|  | 237 ;    unsigned char *src,   - arg(2): the 8 bytes at src - src_stride and the bytes at src - 1 + k*src_stride (k = 0..7) are read | 
|  | 238 ;    int src_stride        - arg(3): byte offset from one source row to the next | 
|  | 239 ;    )  Every output byte is (sum of those 16 bytes + 8) >> 4. Uses mm0, mm1, rax, rcx, rdx; no emms is issued here. | 
|  | 240 global sym(vp8_intra_pred_uv_dc_mmx2) | 
|  | 241 sym(vp8_intra_pred_uv_dc_mmx2): | 
|  | 242     push        rbp | 
|  | 243     mov         rbp, rsp | 
|  | 244     SHADOW_ARGS_TO_STACK 4 | 
|  | 245     push        rsi | 
|  | 246     push        rdi | 
|  | 247     ; end prolog (rsi and rdi are saved because they are used as scratch below) | 
|  | 248 | 
|  | 249     ; from top: rsi = src - src_stride; psadbw against the zeroed mm0 leaves the sum of those 8 bytes in mm1 | 
|  | 250     mov         rsi,        arg(2) ;src | 
|  | 251     movsxd      rax,        dword ptr arg(3) ;src_stride, sign-extended to 64 bits | 
|  | 252     sub         rsi,        rax | 
|  | 253     pxor        mm0,        mm0 | 
|  | 254     movq        mm1,        [rsi] | 
|  | 255     psadbw      mm1,        mm0 | 
|  | 256 | 
|  | 257     ; from left: after the dec rsi = src - src_stride - 1, so [rsi+rax] is the byte left of row 0; rdi = 3*src_stride; ecx accumulates the left bytes of rows 0..7 | 
|  | 258     dec         rsi | 
|  | 259     lea         rdi,        [rax*3] | 
|  | 260     movzx       ecx,        byte [rsi+rax] | 
|  | 261     movzx       edx,        byte [rsi+rax*2] | 
|  | 262     add         ecx,        edx | 
|  | 263     movzx       edx,        byte [rsi+rdi] | 
|  | 264     add         ecx,        edx | 
|  | 265     lea         rsi,        [rsi+rax*4] | 
|  | 266     movzx       edx,        byte [rsi] | 
|  | 267     add         ecx,        edx | 
|  | 268     movzx       edx,        byte [rsi+rax] | 
|  | 269     add         ecx,        edx | 
|  | 270     movzx       edx,        byte [rsi+rax*2] | 
|  | 271     add         ecx,        edx | 
|  | 272     movzx       edx,        byte [rsi+rdi] | 
|  | 273     add         ecx,        edx | 
|  | 274     movzx       edx,        byte [rsi+rax*4] | 
|  | 275     add         ecx,        edx | 
|  | 276 | 
|  | 277     ; add up: edx = (top sum taken from the low word of mm1 + left sum + 8) >> 4, then broadcast to 4 words and packed so all 8 bytes of mm1 hold it | 
|  | 278     pextrw      edx,        mm1, 0x0 | 
|  | 279     lea         edx,        [edx+ecx+8] | 
|  | 280     sar         edx,        4 | 
|  | 281     movd        mm1,        edx | 
|  | 282     pshufw      mm1,        mm1, 0x0 | 
|  | 283     packuswb    mm1,        mm1 | 
|  | 284 | 
|  | 285     ; write out: store mm1 to 8 consecutive rows; rax = 3*dst_stride reaches rows 3 and 7 | 
|  | 286     mov         rdi,        arg(0) ;dst | 
|  | 287     movsxd      rcx,        dword ptr arg(1) ;dst_stride, sign-extended to 64 bits | 
|  | 288     lea         rax,        [rcx*3] | 
|  | 289 | 
|  | 290     movq [rdi      ],       mm1 | 
|  | 291     movq [rdi+rcx  ],       mm1 | 
|  | 292     movq [rdi+rcx*2],       mm1 | 
|  | 293     movq [rdi+rax  ],       mm1 | 
|  | 294     lea         rdi,        [rdi+rcx*4] | 
|  | 295     movq [rdi      ],       mm1 | 
|  | 296     movq [rdi+rcx  ],       mm1 | 
|  | 297     movq [rdi+rcx*2],       mm1 | 
|  | 298     movq [rdi+rax  ],       mm1 | 
|  | 299 | 
|  | 300     ; begin epilog (pops mirror the prolog pushes in reverse order) | 
|  | 301     pop         rdi | 
|  | 302     pop         rsi | 
|  | 303     UNSHADOW_ARGS | 
|  | 304     pop         rbp | 
|  | 305     ret | 
|  | 306 | 
|  | 307 ;void vp8_intra_pred_uv_dctop_mmx2( | 
|  | 308 ;    unsigned char *dst,   - arg(0): start of the 8 output rows; 8 bytes are written per row | 
|  | 309 ;    int dst_stride,       - arg(1): byte offset from one output row to the next | 
|  | 310 ;    unsigned char *src,   - arg(2): only the 8 bytes at src - src_stride are read | 
|  | 311 ;    int src_stride        - arg(3): byte offset from one source row to the next | 
|  | 312 ;    )  Every output byte is (sum of those 8 bytes + 4) >> 3. Uses mm0, mm1, rax, rcx; no emms is issued here. | 
|  | 313 global sym(vp8_intra_pred_uv_dctop_mmx2) | 
|  | 314 sym(vp8_intra_pred_uv_dctop_mmx2): | 
|  | 315     push        rbp | 
|  | 316     mov         rbp, rsp | 
|  | 317     SHADOW_ARGS_TO_STACK 4 | 
|  | 318     GET_GOT     rbx | 
|  | 319     push        rsi | 
|  | 320     push        rdi | 
|  | 321     ; end prolog (GET_GOT is paired with RESTORE_GOT in the epilog; it is needed for the GLOBAL(dc_4) access) | 
|  | 322 | 
|  | 323     ; from top: rsi = src - src_stride; psadbw against the zeroed mm0 leaves the sum of those 8 bytes in mm1 | 
|  | 324     mov         rsi,        arg(2) ;src | 
|  | 325     movsxd      rax,        dword ptr arg(3) ;src_stride, sign-extended to 64 bits | 
|  | 326     sub         rsi,        rax | 
|  | 327     pxor        mm0,        mm0 | 
|  | 328     movq        mm1,        [rsi] | 
|  | 329     psadbw      mm1,        mm0 | 
|  | 330 | 
|  | 331     ; add up: add the rounding constant 4 from dc_4, shift right by 3, broadcast the low word and pack so all 8 bytes of mm1 hold the result | 
|  | 332     paddw       mm1,        [GLOBAL(dc_4)] | 
|  | 333     psraw       mm1,        3 | 
|  | 334     pshufw      mm1,        mm1, 0x0 | 
|  | 335     packuswb    mm1,        mm1 | 
|  | 336 | 
|  | 337     ; write out: store mm1 to 8 consecutive rows; rax = 3*dst_stride reaches rows 3 and 7 | 
|  | 338     mov         rdi,        arg(0) ;dst | 
|  | 339     movsxd      rcx,        dword ptr arg(1) ;dst_stride, sign-extended to 64 bits | 
|  | 340     lea         rax,        [rcx*3] | 
|  | 341 | 
|  | 342     movq [rdi      ],       mm1 | 
|  | 343     movq [rdi+rcx  ],       mm1 | 
|  | 344     movq [rdi+rcx*2],       mm1 | 
|  | 345     movq [rdi+rax  ],       mm1 | 
|  | 346     lea         rdi,        [rdi+rcx*4] | 
|  | 347     movq [rdi      ],       mm1 | 
|  | 348     movq [rdi+rcx  ],       mm1 | 
|  | 349     movq [rdi+rcx*2],       mm1 | 
|  | 350     movq [rdi+rax  ],       mm1 | 
|  | 351 | 
|  | 352     ; begin epilog (pops and RESTORE_GOT mirror the prolog in reverse order) | 
|  | 353     pop         rdi | 
|  | 354     pop         rsi | 
|  | 355     RESTORE_GOT | 
|  | 356     UNSHADOW_ARGS | 
|  | 357     pop         rbp | 
|  | 358     ret | 
|  | 359 | 
|  | 360 ;void vp8_intra_pred_uv_dcleft_mmx2( | 
|  | 361 ;    unsigned char *dst,   - arg(0): start of the 8 output rows; 8 bytes are written per row | 
|  | 362 ;    int dst_stride,       - arg(1): byte offset from one output row to the next | 
|  | 363 ;    unsigned char *src,   - arg(2): only the bytes at src - 1 + k*src_stride (k = 0..7) are read | 
|  | 364 ;    int src_stride        - arg(3): byte offset from one source row to the next | 
|  | 365 ;    )  Every output byte is (sum of those 8 bytes + 4) >> 3. Uses mm1, rax, rcx, rdx; no emms is issued here. | 
|  | 366 global sym(vp8_intra_pred_uv_dcleft_mmx2) | 
|  | 367 sym(vp8_intra_pred_uv_dcleft_mmx2): | 
|  | 368     push        rbp | 
|  | 369     mov         rbp, rsp | 
|  | 370     SHADOW_ARGS_TO_STACK 4 | 
|  | 371     push        rsi | 
|  | 372     push        rdi | 
|  | 373     ; end prolog (rsi and rdi are saved because they are used as scratch below) | 
|  | 374 | 
|  | 375     ; from left: rsi = src - 1; rdi = 3*src_stride; ecx accumulates the left bytes of rows 0..6, and the final lea adds row 7 plus the rounding constant 4 into edx | 
|  | 376     mov         rsi,        arg(2) ;src | 
|  | 377     movsxd      rax,        dword ptr arg(3) ;src_stride, sign-extended to 64 bits | 
|  | 378     dec         rsi | 
|  | 379     lea         rdi,        [rax*3] | 
|  | 380     movzx       ecx,        byte [rsi] | 
|  | 381     movzx       edx,        byte [rsi+rax] | 
|  | 382     add         ecx,        edx | 
|  | 383     movzx       edx,        byte [rsi+rax*2] | 
|  | 384     add         ecx,        edx | 
|  | 385     movzx       edx,        byte [rsi+rdi] | 
|  | 386     add         ecx,        edx | 
|  | 387     lea         rsi,        [rsi+rax*4] | 
|  | 388     movzx       edx,        byte [rsi] | 
|  | 389     add         ecx,        edx | 
|  | 390     movzx       edx,        byte [rsi+rax] | 
|  | 391     add         ecx,        edx | 
|  | 392     movzx       edx,        byte [rsi+rax*2] | 
|  | 393     add         ecx,        edx | 
|  | 394     movzx       edx,        byte [rsi+rdi] | 
|  | 395     lea         edx,        [ecx+edx+4] | 
|  | 396 | 
|  | 397     ; add up: edx = (left sum + 4) >> 3, then broadcast to 4 words and packed so all 8 bytes of mm1 hold it | 
|  | 398     shr         edx,        3 | 
|  | 399     movd        mm1,        edx | 
|  | 400     pshufw      mm1,        mm1, 0x0 | 
|  | 401     packuswb    mm1,        mm1 | 
|  | 402 | 
|  | 403     ; write out: store mm1 to 8 consecutive rows; rax = 3*dst_stride reaches rows 3 and 7 | 
|  | 404     mov         rdi,        arg(0) ;dst | 
|  | 405     movsxd      rcx,        dword ptr arg(1) ;dst_stride, sign-extended to 64 bits | 
|  | 406     lea         rax,        [rcx*3] | 
|  | 407 | 
|  | 408     movq [rdi      ],       mm1 | 
|  | 409     movq [rdi+rcx  ],       mm1 | 
|  | 410     movq [rdi+rcx*2],       mm1 | 
|  | 411     movq [rdi+rax  ],       mm1 | 
|  | 412     lea         rdi,        [rdi+rcx*4] | 
|  | 413     movq [rdi      ],       mm1 | 
|  | 414     movq [rdi+rcx  ],       mm1 | 
|  | 415     movq [rdi+rcx*2],       mm1 | 
|  | 416     movq [rdi+rax  ],       mm1 | 
|  | 417 | 
|  | 418     ; begin epilog (pops mirror the prolog pushes in reverse order) | 
|  | 419     pop         rdi | 
|  | 420     pop         rsi | 
|  | 421     UNSHADOW_ARGS | 
|  | 422     pop         rbp | 
|  | 423     ret | 
|  | 424 | 
|  | 425 ;void vp8_intra_pred_uv_dc128_mmx( | 
|  | 426 ;    unsigned char *dst,   - arg(0): start of the 8 output rows; 8 bytes are written per row | 
|  | 427 ;    int dst_stride,       - arg(1): byte offset from one output row to the next | 
|  | 428 ;    unsigned char *src,   - arg(2): not read by this routine | 
|  | 429 ;    int src_stride        - arg(3): not read by this routine | 
|  | 430 ;    )  Every output row receives the 8-byte constant stored at dc_128. Uses mm1, rax, rcx, rdx; no emms is issued here. | 
|  | 431 global sym(vp8_intra_pred_uv_dc128_mmx) | 
|  | 432 sym(vp8_intra_pred_uv_dc128_mmx): | 
|  | 433     push        rbp | 
|  | 434     mov         rbp, rsp | 
|  | 435     SHADOW_ARGS_TO_STACK 4 | 
|  | 436     GET_GOT     rbx | 
|  | 437     ; end prolog (GET_GOT is paired with RESTORE_GOT in the epilog; it is needed for the GLOBAL(dc_128) access) | 
|  | 438 | 
|  | 439     ; write out: mm1 = the 8 bytes at dc_128, stored to 8 consecutive rows; rcx = 3*dst_stride reaches rows 3 and 7 | 
|  | 440     movq        mm1,        [GLOBAL(dc_128)] | 
|  | 441     mov         rax,        arg(0) ;dst | 
|  | 442     movsxd      rdx,        dword ptr arg(1) ;dst_stride, sign-extended to 64 bits | 
|  | 443     lea         rcx,        [rdx*3] | 
|  | 444 | 
|  | 445     movq [rax      ],       mm1 | 
|  | 446     movq [rax+rdx  ],       mm1 | 
|  | 447     movq [rax+rdx*2],       mm1 | 
|  | 448     movq [rax+rcx  ],       mm1 | 
|  | 449     lea         rax,        [rax+rdx*4] | 
|  | 450     movq [rax      ],       mm1 | 
|  | 451     movq [rax+rdx  ],       mm1 | 
|  | 452     movq [rax+rdx*2],       mm1 | 
|  | 453     movq [rax+rcx  ],       mm1 | 
|  | 454 | 
|  | 455     ; begin epilog (no rsi/rdi pops: this routine never pushed them) | 
|  | 456     RESTORE_GOT | 
|  | 457     UNSHADOW_ARGS | 
|  | 458     pop         rbp | 
|  | 459     ret | 
|  | 460 | 
|  | 461 ;void vp8_intra_pred_uv_tm_sse2(   - and vp8_intra_pred_uv_tm_ssse3: the macro below is instantiated once for each suffix | 
|  | 462 ;    unsigned char *dst,   - arg(0): start of the 8 output rows; 8 bytes are written per row | 
|  | 463 ;    int dst_stride,       - arg(1): byte offset from one output row to the next | 
|  | 464 ;    unsigned char *src,   - arg(2): the top row at src - src_stride, the byte just before it, and the left bytes at src - 1 + k*src_stride are used | 
|  | 465 ;    int src_stride        - arg(3): byte offset from one source row to the next | 
|  | 466 ;    )  Output byte (row, col) = left[row] + top[col] - topleft, computed in 16-bit words and saturated to 0..255 by packuswb. | 
|  | 467 %macro vp8_intra_pred_uv_tm 1 | 
|  | 468 global sym(vp8_intra_pred_uv_tm_%1) | 
|  | 469 sym(vp8_intra_pred_uv_tm_%1): | 
|  | 470     push        rbp | 
|  | 471     mov         rbp, rsp | 
|  | 472     SHADOW_ARGS_TO_STACK 4 | 
|  | 473     GET_GOT     rbx | 
|  | 474     push        rsi | 
|  | 475     push        rdi | 
|  | 476     ; end prolog (only xmm0-xmm5 are used below) | 
|  | 477 | 
|  | 478     ; read top row: edx = loop count (4 passes of 2 rows); rsi = src - src_stride; xmm1 = the 8 top bytes zero-extended to words; the ssse3 build also loads the dc_1024 shuffle mask into xmm2 | 
|  | 479     mov         edx,        4 | 
|  | 480     mov         rsi,        arg(2) ;src | 
|  | 481     movsxd      rax,        dword ptr arg(3) ;src_stride, sign-extended to 64 bits | 
|  | 482     sub         rsi,        rax | 
|  | 483     pxor        xmm0,       xmm0 | 
|  | 484 %ifidn %1, ssse3 | 
|  | 485     movdqa      xmm2,       [GLOBAL(dc_1024)] | 
|  | 486 %endif | 
|  | 487     movq        xmm1,       [rsi] | 
|  | 488     punpcklbw   xmm1,       xmm0 | 
|  | 489 | 
|  | 490     ; set up left ptrs and subtract topleft: xmm3 = the byte at [rsi-1] replicated as 8 words (ssse3: each 0x0400 mask word picks source byte 0 and the byte 4 that movd zero-filled); rsi becomes src - 1; xmm1 = top - topleft | 
|  | 491     movd        xmm3,       [rsi-1] | 
|  | 492     lea         rsi,        [rsi+rax-1] | 
|  | 493 %ifidn %1, sse2 | 
|  | 494     punpcklbw   xmm3,       xmm0 | 
|  | 495     pshuflw     xmm3,       xmm3, 0x0 | 
|  | 496     punpcklqdq  xmm3,       xmm3 | 
|  | 497 %else | 
|  | 498     pshufb      xmm3,       xmm2 | 
|  | 499 %endif | 
|  | 500     psubw       xmm1,       xmm3 | 
|  | 501 | 
|  | 502     ; set up dest ptrs; each loop pass then replicates the left bytes of two rows into words, adds xmm1, packs with unsigned saturation and stores 8 bytes to each of the two rows | 
|  | 503     mov         rdi,        arg(0) ;dst | 
|  | 504     movsxd      rcx,        dword ptr arg(1) ;dst_stride, sign-extended to 64 bits | 
|  | 505 | 
|  | 506 vp8_intra_pred_uv_tm_%1_loop: | 
|  | 507     movd        xmm3,       [rsi] | 
|  | 508     movd        xmm5,       [rsi+rax] | 
|  | 509 %ifidn %1, sse2 | 
|  | 510     punpcklbw   xmm3,       xmm0 | 
|  | 511     punpcklbw   xmm5,       xmm0 | 
|  | 512     pshuflw     xmm3,       xmm3, 0x0 | 
|  | 513     pshuflw     xmm5,       xmm5, 0x0 | 
|  | 514     punpcklqdq  xmm3,       xmm3 | 
|  | 515     punpcklqdq  xmm5,       xmm5 | 
|  | 516 %else | 
|  | 517     pshufb      xmm3,       xmm2 | 
|  | 518     pshufb      xmm5,       xmm2 | 
|  | 519 %endif | 
|  | 520     paddw       xmm3,       xmm1 | 
|  | 521     paddw       xmm5,       xmm1 | 
|  | 522     packuswb    xmm3,       xmm5 | 
|  | 523     movq  [rdi    ],        xmm3 | 
|  | 524     movhps[rdi+rcx],        xmm3 | 
|  | 525     lea         rsi,        [rsi+rax*2] | 
|  | 526     lea         rdi,        [rdi+rcx*2] | 
|  | 527     dec         edx | 
|  | 528     jnz vp8_intra_pred_uv_tm_%1_loop | 
|  | 529 | 
|  | 530     ; begin epilog (pops and RESTORE_GOT mirror the prolog in reverse order) | 
|  | 531     pop         rdi | 
|  | 532     pop         rsi | 
|  | 533     RESTORE_GOT | 
|  | 534     UNSHADOW_ARGS | 
|  | 535     pop         rbp | 
|  | 536     ret | 
|  | 537 %endmacro | 
|  | 538 | 
|  | 539 vp8_intra_pred_uv_tm sse2 | 
|  | 540 vp8_intra_pred_uv_tm ssse3 | 
|  | 541 | 
|  | 542 ;void vp8_intra_pred_uv_ve_mmx( | 
|  | 543 ;    unsigned char *dst,   - arg(0): start of the 8 output rows; 8 bytes are written per row | 
|  | 544 ;    int dst_stride,       - arg(1): byte offset from one output row to the next | 
|  | 545 ;    unsigned char *src,   - arg(2): only the 8 bytes at src - src_stride are read | 
|  | 546 ;    int src_stride        - arg(3): byte offset from one source row to the next | 
|  | 547 ;    )  Copies those 8 bytes unchanged into each of the 8 output rows. Uses mm1, rax, rcx, rdx; no emms is issued here. | 
|  | 548 global sym(vp8_intra_pred_uv_ve_mmx) | 
|  | 549 sym(vp8_intra_pred_uv_ve_mmx): | 
|  | 550     push        rbp | 
|  | 551     mov         rbp, rsp | 
|  | 552     SHADOW_ARGS_TO_STACK 4 | 
|  | 553     ; end prolog (nothing besides rbp is pushed) | 
|  | 554 | 
|  | 555     ; read from top: rax = src - src_stride; mm1 = the 8 bytes there | 
|  | 556     mov         rax,        arg(2) ;src | 
|  | 557     movsxd      rdx,        dword ptr arg(3) ;src_stride, sign-extended to 64 bits | 
|  | 558     sub         rax,        rdx | 
|  | 559     movq        mm1,        [rax] | 
|  | 560 | 
|  | 561     ; write out: rax and rdx are reloaded with dst and dst_stride; rcx = 3*dst_stride reaches rows 3 and 7 | 
|  | 562     mov         rax,        arg(0) ;dst | 
|  | 563     movsxd      rdx,        dword ptr arg(1) ;dst_stride, sign-extended to 64 bits | 
|  | 564     lea         rcx,        [rdx*3] | 
|  | 565 | 
|  | 566     movq [rax      ],       mm1 | 
|  | 567     movq [rax+rdx  ],       mm1 | 
|  | 568     movq [rax+rdx*2],       mm1 | 
|  | 569     movq [rax+rcx  ],       mm1 | 
|  | 570     lea         rax,        [rax+rdx*4] | 
|  | 571     movq [rax      ],       mm1 | 
|  | 572     movq [rax+rdx  ],       mm1 | 
|  | 573     movq [rax+rdx*2],       mm1 | 
|  | 574     movq [rax+rcx  ],       mm1 | 
|  | 575 | 
|  | 576     ; begin epilog (no register pops: only rbp was pushed) | 
|  | 577     UNSHADOW_ARGS | 
|  | 578     pop         rbp | 
|  | 579     ret | 
|  | 580 | 
|  | 581 ;void vp8_intra_pred_uv_ho_mmx2(   - and vp8_intra_pred_uv_ho_ssse3: the macro below is instantiated once for each suffix | 
|  | 582 ;    unsigned char *dst,   - arg(0): start of the 8 output rows; 8 bytes are written per row | 
|  | 583 ;    int dst_stride,       - arg(1): byte offset from one output row to the next | 
|  | 584 ;    unsigned char *src,   - arg(2): only the left bytes at src - 1 + k*src_stride (k = 0..7) are used; each is fetched with a 4-byte movd | 
|  | 585 ;    int src_stride        - arg(3): byte offset from one source row to the next | 
|  | 586 ;    )  Output row k is filled with 8 copies of the byte at src - 1 + k*src_stride. The mmx2 build issues no emms. | 
|  | 587 %macro vp8_intra_pred_uv_ho 1 | 
|  | 588 global sym(vp8_intra_pred_uv_ho_%1) | 
|  | 589 sym(vp8_intra_pred_uv_ho_%1): | 
|  | 590     push        rbp | 
|  | 591     mov         rbp, rsp | 
|  | 592     SHADOW_ARGS_TO_STACK 4 | 
|  | 593     push        rsi | 
|  | 594     push        rdi | 
|  | 595 %ifidn %1, ssse3 | 
|  | 596 %ifndef GET_GOT_SAVE_ARG | 
|  | 597     push        rbx | 
|  | 598 %endif | 
|  | 599     GET_GOT     rbx | 
|  | 600 %endif | 
|  | 601     ; end prolog (ssse3 only: rbx is pushed by hand unless GET_GOT_SAVE_ARG is defined; presumably GET_GOT saves it itself in that case - confirm in the ABI macro definitions) | 
|  | 602 | 
|  | 603     ; read from left and write out: mmx2 loops 4 times over row pairs (counter in edx); ssse3 is unrolled as two groups of four rows with rdx = 3*dst_stride and rbx = 3*src_stride (rbx is overwritten only after the GLOBAL(dc_00001111) load), its xmm2 mask spreading byte 0 over the low 8 bytes and byte 1 over the high 8 bytes | 
|  | 604 %ifidn %1, mmx2 | 
|  | 605     mov         edx,        4 | 
|  | 606 %endif | 
|  | 607     mov         rsi,        arg(2) ;src | 
|  | 608     movsxd      rax,        dword ptr arg(3) ;src_stride, sign-extended to 64 bits | 
|  | 609     mov         rdi,        arg(0) ;dst | 
|  | 610     movsxd      rcx,        dword ptr arg(1) ;dst_stride, sign-extended to 64 bits | 
|  | 611 %ifidn %1, ssse3 | 
|  | 612     lea         rdx,        [rcx*3] | 
|  | 613     movdqa      xmm2,       [GLOBAL(dc_00001111)] | 
|  | 614     lea         rbx,        [rax*3] | 
|  | 615 %endif | 
|  | 616     dec         rsi | 
|  | 617 %ifidn %1, mmx2 | 
|  | 618 vp8_intra_pred_uv_ho_%1_loop: | 
|  | 619     movd        mm0,        [rsi] | 
|  | 620     movd        mm1,        [rsi+rax] | 
|  | 621     punpcklbw   mm0,        mm0 | 
|  | 622     punpcklbw   mm1,        mm1 | 
|  | 623     pshufw      mm0,        mm0, 0x0 | 
|  | 624     pshufw      mm1,        mm1, 0x0 | 
|  | 625     movq  [rdi    ],        mm0 | 
|  | 626     movq  [rdi+rcx],        mm1 | 
|  | 627     lea         rsi,        [rsi+rax*2] | 
|  | 628     lea         rdi,        [rdi+rcx*2] | 
|  | 629     dec         edx | 
|  | 630     jnz vp8_intra_pred_uv_ho_%1_loop | 
|  | 631 %else | 
|  | 632     movd        xmm0,       [rsi] | 
|  | 633     movd        xmm3,       [rsi+rax] | 
|  | 634     movd        xmm1,       [rsi+rax*2] | 
|  | 635     movd        xmm4,       [rsi+rbx] | 
|  | 636     punpcklbw   xmm0,       xmm3 | 
|  | 637     punpcklbw   xmm1,       xmm4 | 
|  | 638     pshufb      xmm0,       xmm2 | 
|  | 639     pshufb      xmm1,       xmm2 | 
|  | 640     movq   [rdi    ],       xmm0 | 
|  | 641     movhps [rdi+rcx],       xmm0 | 
|  | 642     movq [rdi+rcx*2],       xmm1 | 
|  | 643     movhps [rdi+rdx],       xmm1 | 
|  | 644     lea         rsi,        [rsi+rax*4] | 
|  | 645     lea         rdi,        [rdi+rcx*4] | 
|  | 646     movd        xmm0,       [rsi] | 
|  | 647     movd        xmm3,       [rsi+rax] | 
|  | 648     movd        xmm1,       [rsi+rax*2] | 
|  | 649     movd        xmm4,       [rsi+rbx] | 
|  | 650     punpcklbw   xmm0,       xmm3 | 
|  | 651     punpcklbw   xmm1,       xmm4 | 
|  | 652     pshufb      xmm0,       xmm2 | 
|  | 653     pshufb      xmm1,       xmm2 | 
|  | 654     movq   [rdi    ],       xmm0 | 
|  | 655     movhps [rdi+rcx],       xmm0 | 
|  | 656     movq [rdi+rcx*2],       xmm1 | 
|  | 657     movhps [rdi+rdx],       xmm1 | 
|  | 658 %endif | 
|  | 659 | 
|  | 660     ; begin epilog (undoes the prolog in reverse order, including the conditional rbx pop for ssse3) | 
|  | 661 %ifidn %1, ssse3 | 
|  | 662     RESTORE_GOT | 
|  | 663 %ifndef GET_GOT_SAVE_ARG | 
|  | 664     pop         rbx | 
|  | 665 %endif | 
|  | 666 %endif | 
|  | 667     pop         rdi | 
|  | 668     pop         rsi | 
|  | 669     UNSHADOW_ARGS | 
|  | 670     pop         rbp | 
|  | 671     ret | 
|  | 672 %endmacro | 
|  | 673 | 
|  | 674 vp8_intra_pred_uv_ho mmx2 | 
|  | 675 vp8_intra_pred_uv_ho ssse3 | 
|  | 676 | 
|  | 677 SECTION_RODATA | 
|  | 678 dc_128:                     ; 8 bytes of 0x80, loaded whole by vp8_intra_pred_uv_dc128_mmx | 
|  | 679     times 8 db 0x80 | 
|  | 680 dc_4:                       ; 4 words of 4, the rounding term added by vp8_intra_pred_uv_dctop_mmx2 before its shift by 3 | 
|  | 681     times 4 dw 0x0004 | 
|  | 682 align 16 | 
|  | 683 dc_1024:                    ; pshufb mask of the ssse3 tm variant: byte pair 00,04 repeated 8 times; loaded with movdqa, hence the alignment | 
|  | 684     times 8 dw 0x0400 | 
|  | 685 align 16 | 
|  | 686 dc_00001111:                ; pshufb mask of the ssse3 ho variant: source byte 0 to the low 8 bytes, source byte 1 to the high 8 bytes | 
|  | 687     times 8 db 0x00 | 
|  | 688     times 8 db 0x01 | 
| OLD | NEW | 
|---|