| OLD | NEW | 
|---|---|
| 1 ; | 1 ; | 
| 2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 
| 3 ; | 3 ; | 
| 4 ;  Use of this source code is governed by a BSD-style license | 4 ;  Use of this source code is governed by a BSD-style license | 
| 5 ;  that can be found in the LICENSE file in the root of the source | 5 ;  that can be found in the LICENSE file in the root of the source | 
| 6 ;  tree. An additional intellectual property rights grant can be found | 6 ;  tree. An additional intellectual property rights grant can be found | 
| 7 ;  in the file PATENTS.  All contributing project authors may | 7 ;  in the file PATENTS.  All contributing project authors may | 
| 8 ;  be found in the AUTHORS file in the root of the source tree. | 8 ;  be found in the AUTHORS file in the root of the source tree. | 
| 9 ; | 9 ; | 
| 10 | 10 | 
| (...skipping 14 matching lines...) |  | 
| 25 sym(idct_dequant_0_2x_sse2): | 25 sym(idct_dequant_0_2x_sse2): | 
| 26     push        rbp | 26     push        rbp | 
| 27     mov         rbp, rsp | 27     mov         rbp, rsp | 
| 28     SHADOW_ARGS_TO_STACK 6 | 28     SHADOW_ARGS_TO_STACK 6 | 
| 29     GET_GOT     rbx | 29     GET_GOT     rbx | 
| 30     ; end prolog | 30     ; end prolog | 
| 31 | 31 | 
| 32         mov         rdx,            arg(1) ; dequant | 32         mov         rdx,            arg(1) ; dequant | 
| 33         mov         rax,            arg(0) ; qcoeff | 33         mov         rax,            arg(0) ; qcoeff | 
| 34 | 34 | 
| 35     ; Zero out xmm7, for use unpacking |  | 
| 36         pxor        xmm7,           xmm7 |  | 
| 37 |  | 
| 38         movd        xmm4,           [rax] | 35         movd        xmm4,           [rax] | 
| 39         movd        xmm5,           [rdx] | 36         movd        xmm5,           [rdx] | 
| 40 | 37 | 
| 41         pinsrw      xmm4,           [rax+32],   4 | 38         pinsrw      xmm4,           [rax+32],   4 | 
| 42         pinsrw      xmm5,           [rdx],      4 | 39         pinsrw      xmm5,           [rdx],      4 | 
| 43 | 40 | 
| 44         pmullw      xmm4,           xmm5 | 41         pmullw      xmm4,           xmm5 | 
| 45 | 42 | 
|  | 43     ; Zero out xmm5, for use unpacking | 
|  | 44         pxor        xmm5,           xmm5 | 
|  | 45 | 
| 46     ; clear coeffs | 46     ; clear coeffs | 
| 47         movd        [rax],          xmm7 | 47         movd        [rax],          xmm5 | 
| 48         movd        [rax+32],       xmm7 | 48         movd        [rax+32],       xmm5 | 
| 49 ;pshufb | 49 ;pshufb | 
| 50         pshuflw     xmm4,           xmm4,       00000000b | 50         pshuflw     xmm4,           xmm4,       00000000b | 
| 51         pshufhw     xmm4,           xmm4,       00000000b | 51         pshufhw     xmm4,           xmm4,       00000000b | 
| 52 | 52 | 
| 53         mov         rax,            arg(2) ; pre | 53         mov         rax,            arg(2) ; pre | 
| 54         paddw       xmm4,           [GLOBAL(fours)] | 54         paddw       xmm4,           [GLOBAL(fours)] | 
| 55 | 55 | 
| 56         movsxd      rcx,            dword ptr arg(5) ; blk_stride | 56         movsxd      rcx,            dword ptr arg(5) ; blk_stride | 
| 57         psraw       xmm4,           3 | 57         psraw       xmm4,           3 | 
| 58 | 58 | 
| 59         movq        xmm0,           [rax] | 59         movq        xmm0,           [rax] | 
| 60         movq        xmm1,           [rax+rcx] | 60         movq        xmm1,           [rax+rcx] | 
| 61         movq        xmm2,           [rax+2*rcx] | 61         movq        xmm2,           [rax+2*rcx] | 
| 62         lea         rcx,            [3*rcx] | 62         lea         rcx,            [3*rcx] | 
| 63         movq        xmm3,           [rax+rcx] | 63         movq        xmm3,           [rax+rcx] | 
| 64 | 64 | 
| 65         punpcklbw   xmm0,           xmm7 | 65         punpcklbw   xmm0,           xmm5 | 
| 66         punpcklbw   xmm1,           xmm7 | 66         punpcklbw   xmm1,           xmm5 | 
| 67         punpcklbw   xmm2,           xmm7 | 67         punpcklbw   xmm2,           xmm5 | 
| 68         punpcklbw   xmm3,           xmm7 | 68         punpcklbw   xmm3,           xmm5 | 
| 69 | 69 | 
| 70         mov         rax,            arg(3) ; dst | 70         mov         rax,            arg(3) ; dst | 
| 71         movsxd      rdx,            dword ptr arg(4) ; dst_stride | 71         movsxd      rdx,            dword ptr arg(4) ; dst_stride | 
| 72 | 72 | 
| 73     ; Add to predict buffer | 73     ; Add to predict buffer | 
| 74         paddw       xmm0,           xmm4 | 74         paddw       xmm0,           xmm4 | 
| 75         paddw       xmm1,           xmm4 | 75         paddw       xmm1,           xmm4 | 
| 76         paddw       xmm2,           xmm4 | 76         paddw       xmm2,           xmm4 | 
| 77         paddw       xmm3,           xmm4 | 77         paddw       xmm3,           xmm4 | 
| 78 | 78 | 
| 79     ; pack up before storing | 79     ; pack up before storing | 
| 80         packuswb    xmm0,           xmm7 | 80         packuswb    xmm0,           xmm5 | 
| 81         packuswb    xmm1,           xmm7 | 81         packuswb    xmm1,           xmm5 | 
| 82         packuswb    xmm2,           xmm7 | 82         packuswb    xmm2,           xmm5 | 
| 83         packuswb    xmm3,           xmm7 | 83         packuswb    xmm3,           xmm5 | 
| 84 | 84 | 
| 85     ; store blocks back out | 85     ; store blocks back out | 
| 86         movq        [rax],          xmm0 | 86         movq        [rax],          xmm0 | 
| 87         movq        [rax + rdx],    xmm1 | 87         movq        [rax + rdx],    xmm1 | 
| 88 | 88 | 
| 89         lea         rax,            [rax + 2*rdx] | 89         lea         rax,            [rax + 2*rdx] | 
| 90 | 90 | 
| 91         movq        [rax],          xmm2 | 91         movq        [rax],          xmm2 | 
| 92         movq        [rax + rdx],    xmm3 | 92         movq        [rax + rdx],    xmm3 | 
| 93 | 93 | 
| 94     ; begin epilog | 94     ; begin epilog | 
| 95     RESTORE_GOT | 95     RESTORE_GOT | 
| 96     UNSHADOW_ARGS | 96     UNSHADOW_ARGS | 
| 97     pop         rbp | 97     pop         rbp | 
| 98     ret | 98     ret | 
| 99 | 99 | 
| 100 global sym(idct_dequant_full_2x_sse2) | 100 global sym(idct_dequant_full_2x_sse2) | 
| 101 sym(idct_dequant_full_2x_sse2): | 101 sym(idct_dequant_full_2x_sse2): | 
| 102     push        rbp | 102     push        rbp | 
| 103     mov         rbp, rsp | 103     mov         rbp, rsp | 
| 104     SHADOW_ARGS_TO_STACK 7 | 104     SHADOW_ARGS_TO_STACK 7 | 
|  | 105     SAVE_XMM 7 | 
| 105     GET_GOT     rbx | 106     GET_GOT     rbx | 
| 106     push        rsi | 107     push        rsi | 
| 107     push        rdi | 108     push        rdi | 
| 108     ; end prolog | 109     ; end prolog | 
| 109 | 110 | 
| 110     ; special case when 2 blocks have 0 or 1 coeffs | 111     ; special case when 2 blocks have 0 or 1 coeffs | 
| 111     ; dc is set as first coeff, so no need to load qcoeff | 112     ; dc is set as first coeff, so no need to load qcoeff | 
| 112         mov         rax,            arg(0) ; qcoeff | 113         mov         rax,            arg(0) ; qcoeff | 
| 113         mov         rsi,            arg(2) ; pre | 114         mov         rsi,            arg(2) ; pre | 
| 114         mov         rdi,            arg(3) ; dst | 115         mov         rdi,            arg(3) ; dst | 
| (...skipping 225 matching lines...) |  | 
| 340 | 341 | 
| 341         lea         rdi,            [rdi + 2*rdx] | 342         lea         rdi,            [rdi + 2*rdx] | 
| 342 | 343 | 
| 343         movq        [rdi],          xmm2 | 344         movq        [rdi],          xmm2 | 
| 344         movq        [rdi + rdx],    xmm3 | 345         movq        [rdi + rdx],    xmm3 | 
| 345 | 346 | 
| 346     ; begin epilog | 347     ; begin epilog | 
| 347     pop         rdi | 348     pop         rdi | 
| 348     pop         rsi | 349     pop         rsi | 
| 349     RESTORE_GOT | 350     RESTORE_GOT | 
|  | 351     RESTORE_XMM | 
| 350     UNSHADOW_ARGS | 352     UNSHADOW_ARGS | 
| 351     pop         rbp | 353     pop         rbp | 
| 352     ret | 354     ret | 
| 353 | 355 | 
| 354 ;void idct_dequant_dc_0_2x_sse2 | 356 ;void idct_dequant_dc_0_2x_sse2 | 
| 355 ; ( | 357 ; ( | 
| 356 ;   short *qcoeff       - 0 | 358 ;   short *qcoeff       - 0 | 
| 357 ;   short *dequant      - 1 | 359 ;   short *dequant      - 1 | 
| 358 ;   unsigned char *pre  - 2 | 360 ;   unsigned char *pre  - 2 | 
| 359 ;   unsigned char *dst  - 3 | 361 ;   unsigned char *dst  - 3 | 
| (...skipping 10 matching lines...) |  | 
| 370     push        rdi | 372     push        rdi | 
| 371     ; end prolog | 373     ; end prolog | 
| 372 | 374 | 
| 373     ; special case when 2 blocks have 0 or 1 coeffs | 375     ; special case when 2 blocks have 0 or 1 coeffs | 
| 374     ; dc is set as first coeff, so no need to load qcoeff | 376     ; dc is set as first coeff, so no need to load qcoeff | 
| 375         mov         rax,            arg(0) ; qcoeff | 377         mov         rax,            arg(0) ; qcoeff | 
| 376         mov         rsi,            arg(2) ; pre | 378         mov         rsi,            arg(2) ; pre | 
| 377         mov         rdi,            arg(3) ; dst | 379         mov         rdi,            arg(3) ; dst | 
| 378         mov         rdx,            arg(5) ; dc | 380         mov         rdx,            arg(5) ; dc | 
| 379 | 381 | 
| 380     ; Zero out xmm7, for use unpacking | 382     ; Zero out xmm5, for use unpacking | 
| 381         pxor        xmm7,           xmm7 | 383         pxor        xmm5,           xmm5 | 
| 382 | 384 | 
| 383     ; load up 2 dc words here == 2*16 = doubleword | 385     ; load up 2 dc words here == 2*16 = doubleword | 
| 384         movd        xmm4,           [rdx] | 386         movd        xmm4,           [rdx] | 
| 385 | 387 | 
| 386     ; Load up predict blocks | 388     ; Load up predict blocks | 
| 387         movq        xmm0,           [rsi] | 389         movq        xmm0,           [rsi] | 
| 388         movq        xmm1,           [rsi+16] | 390         movq        xmm1,           [rsi+16] | 
| 389         movq        xmm2,           [rsi+32] | 391         movq        xmm2,           [rsi+32] | 
| 390         movq        xmm3,           [rsi+48] | 392         movq        xmm3,           [rsi+48] | 
| 391 | 393 | 
| 392     ; Duplicate and expand dc across | 394     ; Duplicate and expand dc across | 
| 393         punpcklwd   xmm4,           xmm4 | 395         punpcklwd   xmm4,           xmm4 | 
| 394         punpckldq   xmm4,           xmm4 | 396         punpckldq   xmm4,           xmm4 | 
| 395 | 397 | 
| 396     ; Rounding to dequant and downshift | 398     ; Rounding to dequant and downshift | 
| 397         paddw       xmm4,           [GLOBAL(fours)] | 399         paddw       xmm4,           [GLOBAL(fours)] | 
| 398         psraw       xmm4,           3 | 400         psraw       xmm4,           3 | 
| 399 | 401 | 
| 400     ; Predict buffer needs to be expanded from bytes to words | 402     ; Predict buffer needs to be expanded from bytes to words | 
| 401         punpcklbw   xmm0,           xmm7 | 403         punpcklbw   xmm0,           xmm5 | 
| 402         punpcklbw   xmm1,           xmm7 | 404         punpcklbw   xmm1,           xmm5 | 
| 403         punpcklbw   xmm2,           xmm7 | 405         punpcklbw   xmm2,           xmm5 | 
| 404         punpcklbw   xmm3,           xmm7 | 406         punpcklbw   xmm3,           xmm5 | 
| 405 | 407 | 
| 406     ; Add to predict buffer | 408     ; Add to predict buffer | 
| 407         paddw       xmm0,           xmm4 | 409         paddw       xmm0,           xmm4 | 
| 408         paddw       xmm1,           xmm4 | 410         paddw       xmm1,           xmm4 | 
| 409         paddw       xmm2,           xmm4 | 411         paddw       xmm2,           xmm4 | 
| 410         paddw       xmm3,           xmm4 | 412         paddw       xmm3,           xmm4 | 
| 411 | 413 | 
| 412     ; pack up before storing | 414     ; pack up before storing | 
| 413         packuswb    xmm0,           xmm7 | 415         packuswb    xmm0,           xmm5 | 
| 414         packuswb    xmm1,           xmm7 | 416         packuswb    xmm1,           xmm5 | 
| 415         packuswb    xmm2,           xmm7 | 417         packuswb    xmm2,           xmm5 | 
| 416         packuswb    xmm3,           xmm7 | 418         packuswb    xmm3,           xmm5 | 
| 417 | 419 | 
| 418     ; Load destination stride before writing out, | 420     ; Load destination stride before writing out, | 
| 419     ;   doesn't need to persist | 421     ;   doesn't need to persist | 
| 420         movsxd      rdx,            dword ptr arg(4) ; dst_stride | 422         movsxd      rdx,            dword ptr arg(4) ; dst_stride | 
| 421 | 423 | 
| 422     ; store blocks back out | 424     ; store blocks back out | 
| 423         movq        [rdi],          xmm0 | 425         movq        [rdi],          xmm0 | 
| 424         movq        [rdi + rdx],    xmm1 | 426         movq        [rdi + rdx],    xmm1 | 
| 425 | 427 | 
| 426         lea         rdi,            [rdi + 2*rdx] | 428         lea         rdi,            [rdi + 2*rdx] | 
| 427 | 429 | 
| 428         movq        [rdi],          xmm2 | 430         movq        [rdi],          xmm2 | 
| 429         movq        [rdi + rdx],    xmm3 | 431         movq        [rdi + rdx],    xmm3 | 
| 430 | 432 | 
| 431     ; begin epilog | 433     ; begin epilog | 
| 432     pop         rdi | 434     pop         rdi | 
| 433     pop         rsi | 435     pop         rsi | 
| 434     RESTORE_GOT | 436     RESTORE_GOT | 
| 435     UNSHADOW_ARGS | 437     UNSHADOW_ARGS | 
| 436     pop         rbp | 438     pop         rbp | 
| 437     ret | 439     ret | 
| 438 | 440 | 
| 439 global sym(idct_dequant_dc_full_2x_sse2) | 441 global sym(idct_dequant_dc_full_2x_sse2) | 
| 440 sym(idct_dequant_dc_full_2x_sse2): | 442 sym(idct_dequant_dc_full_2x_sse2): | 
| 441     push        rbp | 443     push        rbp | 
| 442     mov         rbp, rsp | 444     mov         rbp, rsp | 
| 443     SHADOW_ARGS_TO_STACK 7 | 445     SHADOW_ARGS_TO_STACK 7 | 
|  | 446     SAVE_XMM 7 | 
| 444     GET_GOT     rbx | 447     GET_GOT     rbx | 
| 445     push        rsi | 448     push        rsi | 
| 446     push        rdi | 449     push        rdi | 
| 447     ; end prolog | 450     ; end prolog | 
| 448 | 451 | 
| 449     ; special case when 2 blocks have 0 or 1 coeffs | 452     ; special case when 2 blocks have 0 or 1 coeffs | 
| 450     ; dc is set as first coeff, so no need to load qcoeff | 453     ; dc is set as first coeff, so no need to load qcoeff | 
| 451         mov         rax,            arg(0) ; qcoeff | 454         mov         rax,            arg(0) ; qcoeff | 
| 452         mov         rsi,            arg(2) ; pre | 455         mov         rsi,            arg(2) ; pre | 
| 453         mov         rdi,            arg(3) ; dst | 456         mov         rdi,            arg(3) ; dst | 
| (...skipping 231 matching lines...) |  | 
| 685         lea         rdi,            [rdi + 2*rdx] | 688         lea         rdi,            [rdi + 2*rdx] | 
| 686 | 689 | 
| 687         movq        [rdi],          xmm2 | 690         movq        [rdi],          xmm2 | 
| 688         movq        [rdi + rdx],    xmm3 | 691         movq        [rdi + rdx],    xmm3 | 
| 689 | 692 | 
| 690 | 693 | 
| 691     ; begin epilog | 694     ; begin epilog | 
| 692     pop         rdi | 695     pop         rdi | 
| 693     pop         rsi | 696     pop         rsi | 
| 694     RESTORE_GOT | 697     RESTORE_GOT | 
|  | 698     RESTORE_XMM | 
| 695     UNSHADOW_ARGS | 699     UNSHADOW_ARGS | 
| 696     pop         rbp | 700     pop         rbp | 
| 697     ret | 701     ret | 
| 698 | 702 | 
| 699 SECTION_RODATA | 703 SECTION_RODATA | 
| 700 align 16 | 704 align 16 | 
| 701 fours: | 705 fours: | 
| 702     times 8 dw 0x0004 | 706     times 8 dw 0x0004 | 
| 703 align 16 | 707 align 16 | 
| 704 x_s1sqr2: | 708 x_s1sqr2: | 
| 705     times 8 dw 0x8A8C | 709     times 8 dw 0x8A8C | 
| 706 align 16 | 710 align 16 | 
| 707 x_c1sqr2less1: | 711 x_c1sqr2less1: | 
| 708     times 8 dw 0x4E7B | 712     times 8 dw 0x4E7B | 
| OLD | NEW | 
|---|