| OLD | NEW |
| 1 ;****************************************************************************** | 1 ;****************************************************************************** |
| 2 ;* VP8 MMXEXT optimizations | 2 ;* VP8 MMXEXT optimizations |
| 3 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> | 3 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> |
| 4 ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com> | 4 ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com> |
| 5 ;* | 5 ;* |
| 6 ;* This file is part of FFmpeg. | 6 ;* This file is part of FFmpeg. |
| 7 ;* | 7 ;* |
| 8 ;* FFmpeg is free software; you can redistribute it and/or | 8 ;* FFmpeg is free software; you can redistribute it and/or |
| 9 ;* modify it under the terms of the GNU Lesser General Public | 9 ;* modify it under the terms of the GNU Lesser General Public |
| 10 ;* License as published by the Free Software Foundation; either | 10 ;* License as published by the Free Software Foundation; either |
| (...skipping 193 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 204 paddsw m0, m1 | 204 paddsw m0, m1 |
| 205 paddsw m0, m2 | 205 paddsw m0, m2 |
| 206 paddsw m0, [pw_64] | 206 paddsw m0, [pw_64] |
| 207 psraw m0, 7 | 207 psraw m0, 7 |
| 208 packuswb m0, m0 | 208 packuswb m0, m0 |
| 209 movh [r0], m0 ; store | 209 movh [r0], m0 ; store |
| 210 | 210 |
| 211 ; go to next line | 211 ; go to next line |
| 212 add r0, r1 | 212 add r0, r1 |
| 213 add r2, r3 | 213 add r2, r3 |
| 214 dec r4 ; next row | 214 dec r4d ; next row |
| 215 jg .nextrow | 215 jg .nextrow |
| 216 REP_RET | 216 REP_RET |
| 217 | 217 |
| 218 cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3 | 218 cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3 |
| 219 shl r5d, 4 | 219 shl r5d, 4 |
| 220 mova m2, [pw_64] | 220 mova m2, [pw_64] |
| 221 mova m3, [filter_h2_shuf] | 221 mova m3, [filter_h2_shuf] |
| 222 mova m4, [filter_h4_shuf] | 222 mova m4, [filter_h4_shuf] |
| 223 %ifdef PIC | 223 %ifdef PIC |
| 224 lea r11, [fourtap_filter_hb_m] | 224 lea r11, [fourtap_filter_hb_m] |
| (...skipping 10 matching lines...) Expand all Loading... |
| 235 pmaddubsw m1, m6 | 235 pmaddubsw m1, m6 |
| 236 paddsw m0, m2 | 236 paddsw m0, m2 |
| 237 paddsw m0, m1 | 237 paddsw m0, m1 |
| 238 psraw m0, 7 | 238 psraw m0, 7 |
| 239 packuswb m0, m0 | 239 packuswb m0, m0 |
| 240 movh [r0], m0 ; store | 240 movh [r0], m0 ; store |
| 241 | 241 |
| 242 ; go to next line | 242 ; go to next line |
| 243 add r0, r1 | 243 add r0, r1 |
| 244 add r2, r3 | 244 add r2, r3 |
| 245 dec r4 ; next row | 245 dec r4d ; next row |
| 246 jg .nextrow | 246 jg .nextrow |
| 247 REP_RET | 247 REP_RET |
| 248 | 248 |
| 249 cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2 | 249 cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2 |
| 250 shl r6d, 4 | 250 shl r6d, 4 |
| 251 %ifdef PIC | 251 %ifdef PIC |
| 252 lea r11, [fourtap_filter_hb_m] | 252 lea r11, [fourtap_filter_hb_m] |
| 253 %endif | 253 %endif |
| 254 mova m5, [fourtap_filter_hb+r6-16] | 254 mova m5, [fourtap_filter_hb+r6-16] |
| 255 mova m6, [fourtap_filter_hb+r6] | 255 mova m6, [fourtap_filter_hb+r6] |
| (...skipping 18 matching lines...) Expand all Loading... |
| 274 paddsw m4, m2 | 274 paddsw m4, m2 |
| 275 mova m2, m3 | 275 mova m2, m3 |
| 276 paddsw m4, m7 | 276 paddsw m4, m7 |
| 277 psraw m4, 7 | 277 psraw m4, 7 |
| 278 packuswb m4, m4 | 278 packuswb m4, m4 |
| 279 movh [r0], m4 | 279 movh [r0], m4 |
| 280 | 280 |
| 281 ; go to next line | 281 ; go to next line |
| 282 add r0, r1 | 282 add r0, r1 |
| 283 add r2, r3 | 283 add r2, r3 |
| 284 dec r4 ; next row | 284 dec r4d ; next row |
| 285 jg .nextrow | 285 jg .nextrow |
| 286 REP_RET | 286 REP_RET |
| 287 | 287 |
| 288 cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2 | 288 cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2 |
| 289 lea r6d, [r6*3] | 289 lea r6d, [r6*3] |
| 290 %ifdef PIC | 290 %ifdef PIC |
| 291 lea r11, [sixtap_filter_hb_m] | 291 lea r11, [sixtap_filter_hb_m] |
| 292 %endif | 292 %endif |
| 293 lea r6, [sixtap_filter_hb+r6*8] | 293 lea r6, [sixtap_filter_hb+r6*8] |
| 294 | 294 |
| (...skipping 26 matching lines...) Expand all Loading... |
| 321 mova m2, m3 | 321 mova m2, m3 |
| 322 psraw m6, 7 | 322 psraw m6, 7 |
| 323 mova m3, m4 | 323 mova m3, m4 |
| 324 packuswb m6, m6 | 324 packuswb m6, m6 |
| 325 mova m4, m5 | 325 mova m4, m5 |
| 326 movh [r0], m6 | 326 movh [r0], m6 |
| 327 | 327 |
| 328 ; go to next line | 328 ; go to next line |
| 329 add r0, r1 | 329 add r0, r1 |
| 330 add r2, r3 | 330 add r2, r3 |
| 331 dec r4 ; next row | 331 dec r4d ; next row |
| 332 jg .nextrow | 332 jg .nextrow |
| 333 REP_RET | 333 REP_RET |
| 334 %endmacro | 334 %endmacro |
| 335 | 335 |
| 336 INIT_MMX | 336 INIT_MMX |
| 337 FILTER_SSSE3 4, 0, 0 | 337 FILTER_SSSE3 4, 0, 0 |
| 338 INIT_XMM | 338 INIT_XMM |
| 339 FILTER_SSSE3 8, 8, 7 | 339 FILTER_SSSE3 8, 8, 7 |
| 340 | 340 |
| 341 ; 4x4 block, H-only 4-tap filter | 341 ; 4x4 block, H-only 4-tap filter |
| (...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 374 ; merge two sets of 2 pixels into one set of 4, round/clip/store | 374 ; merge two sets of 2 pixels into one set of 4, round/clip/store |
| 375 packssdw mm3, mm0 ; merge dword->word (4px) | 375 packssdw mm3, mm0 ; merge dword->word (4px) |
| 376 paddsw mm3, mm7 ; rounding | 376 paddsw mm3, mm7 ; rounding |
| 377 psraw mm3, 7 | 377 psraw mm3, 7 |
| 378 packuswb mm3, mm6 ; clip and word->bytes | 378 packuswb mm3, mm6 ; clip and word->bytes |
| 379 movd [r0], mm3 ; store | 379 movd [r0], mm3 ; store |
| 380 | 380 |
| 381 ; go to next line | 381 ; go to next line |
| 382 add r0, r1 | 382 add r0, r1 |
| 383 add r2, r3 | 383 add r2, r3 |
| 384 dec r4 ; next row | 384 dec r4d ; next row |
| 385 jg .nextrow | 385 jg .nextrow |
| 386 REP_RET | 386 REP_RET |
| 387 | 387 |
| 388 ; 4x4 block, H-only 6-tap filter | 388 ; 4x4 block, H-only 6-tap filter |
| 389 cglobal put_vp8_epel4_h6_mmxext, 6, 6 | 389 cglobal put_vp8_epel4_h6_mmxext, 6, 6 |
| 390 lea r5d, [r5*3] | 390 lea r5d, [r5*3] |
| 391 %ifdef PIC | 391 %ifdef PIC |
| 392 lea r11, [sixtap_filter_hw_m] | 392 lea r11, [sixtap_filter_hw_m] |
| 393 %endif | 393 %endif |
| 394 movq mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words | 394 movq mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words |
| (...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 431 ; merge two sets of 2 pixels into one set of 4, round/clip/store | 431 ; merge two sets of 2 pixels into one set of 4, round/clip/store |
| 432 packssdw mm1, mm0 ; merge dword->word (4px) | 432 packssdw mm1, mm0 ; merge dword->word (4px) |
| 433 paddsw mm1, mm7 ; rounding | 433 paddsw mm1, mm7 ; rounding |
| 434 psraw mm1, 7 | 434 psraw mm1, 7 |
| 435 packuswb mm1, mm3 ; clip and word->bytes | 435 packuswb mm1, mm3 ; clip and word->bytes |
| 436 movd [r0], mm1 ; store | 436 movd [r0], mm1 ; store |
| 437 | 437 |
| 438 ; go to next line | 438 ; go to next line |
| 439 add r0, r1 | 439 add r0, r1 |
| 440 add r2, r3 | 440 add r2, r3 |
| 441 dec r4 ; next row | 441 dec r4d ; next row |
| 442 jg .nextrow | 442 jg .nextrow |
| 443 REP_RET | 443 REP_RET |
| 444 | 444 |
| 445 INIT_XMM | 445 INIT_XMM |
| 446 cglobal put_vp8_epel8_h4_sse2, 6, 6, 10 | 446 cglobal put_vp8_epel8_h4_sse2, 6, 6, 10 |
| 447 shl r5d, 5 | 447 shl r5d, 5 |
| 448 %ifdef PIC | 448 %ifdef PIC |
| 449 lea r11, [fourtap_filter_v_m] | 449 lea r11, [fourtap_filter_v_m] |
| 450 %endif | 450 %endif |
| 451 lea r5, [fourtap_filter_v+r5-32] | 451 lea r5, [fourtap_filter_v+r5-32] |
| (...skipping 27 matching lines...) Expand all Loading... |
| 479 paddsw m2, m3 | 479 paddsw m2, m3 |
| 480 paddsw m0, m2 | 480 paddsw m0, m2 |
| 481 paddsw m0, m4 | 481 paddsw m0, m4 |
| 482 psraw m0, 7 | 482 psraw m0, 7 |
| 483 packuswb m0, m7 | 483 packuswb m0, m7 |
| 484 movh [r0], m0 ; store | 484 movh [r0], m0 ; store |
| 485 | 485 |
| 486 ; go to next line | 486 ; go to next line |
| 487 add r0, r1 | 487 add r0, r1 |
| 488 add r2, r3 | 488 add r2, r3 |
| 489 dec r4 ; next row | 489 dec r4d ; next row |
| 490 jg .nextrow | 490 jg .nextrow |
| 491 REP_RET | 491 REP_RET |
| 492 | 492 |
| 493 cglobal put_vp8_epel8_h6_sse2, 6, 6, 14 | 493 cglobal put_vp8_epel8_h6_sse2, 6, 6, 14 |
| 494 lea r5d, [r5*3] | 494 lea r5d, [r5*3] |
| 495 shl r5d, 4 | 495 shl r5d, 4 |
| 496 %ifdef PIC | 496 %ifdef PIC |
| 497 lea r11, [sixtap_filter_v_m] | 497 lea r11, [sixtap_filter_v_m] |
| 498 %endif | 498 %endif |
| 499 lea r5, [sixtap_filter_v+r5-96] | 499 lea r5, [sixtap_filter_v+r5-96] |
| (...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 541 paddsw m0, m3 | 541 paddsw m0, m3 |
| 542 paddsw m0, m1 | 542 paddsw m0, m1 |
| 543 paddsw m0, m6 | 543 paddsw m0, m6 |
| 544 psraw m0, 7 | 544 psraw m0, 7 |
| 545 packuswb m0, m7 | 545 packuswb m0, m7 |
| 546 movh [r0], m0 ; store | 546 movh [r0], m0 ; store |
| 547 | 547 |
| 548 ; go to next line | 548 ; go to next line |
| 549 add r0, r1 | 549 add r0, r1 |
| 550 add r2, r3 | 550 add r2, r3 |
| 551 dec r4 ; next row | 551 dec r4d ; next row |
| 552 jg .nextrow | 552 jg .nextrow |
| 553 REP_RET | 553 REP_RET |
| 554 | 554 |
| 555 %macro FILTER_V 3 | 555 %macro FILTER_V 3 |
| 556 ; 4x4 block, V-only 4-tap filter | 556 ; 4x4 block, V-only 4-tap filter |
| 557 cglobal put_vp8_epel%2_v4_%1, 7, 7, %3 | 557 cglobal put_vp8_epel%2_v4_%1, 7, 7, %3 |
| 558 shl r6d, 5 | 558 shl r6d, 5 |
| 559 %ifdef PIC | 559 %ifdef PIC |
| 560 lea r11, [fourtap_filter_v_m] | 560 lea r11, [fourtap_filter_v_m] |
| 561 %endif | 561 %endif |
| (...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 594 | 594 |
| 595 ; round/clip/store | 595 ; round/clip/store |
| 596 paddsw m4, m6 | 596 paddsw m4, m6 |
| 597 psraw m4, 7 | 597 psraw m4, 7 |
| 598 packuswb m4, m7 | 598 packuswb m4, m7 |
| 599 movh [r0], m4 | 599 movh [r0], m4 |
| 600 | 600 |
| 601 ; go to next line | 601 ; go to next line |
| 602 add r0, r1 | 602 add r0, r1 |
| 603 add r2, r3 | 603 add r2, r3 |
| 604 dec r4 ; next row | 604 dec r4d ; next row |
| 605 jg .nextrow | 605 jg .nextrow |
| 606 REP_RET | 606 REP_RET |
| 607 | 607 |
| 608 | 608 |
| 609 ; 4x4 block, V-only 6-tap filter | 609 ; 4x4 block, V-only 6-tap filter |
| 610 cglobal put_vp8_epel%2_v6_%1, 7, 7, %3 | 610 cglobal put_vp8_epel%2_v6_%1, 7, 7, %3 |
| 611 shl r6d, 4 | 611 shl r6d, 4 |
| 612 lea r6, [r6*3] | 612 lea r6, [r6*3] |
| 613 %ifdef PIC | 613 %ifdef PIC |
| 614 lea r11, [sixtap_filter_v_m] | 614 lea r11, [sixtap_filter_v_m] |
| (...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 659 | 659 |
| 660 ; round/clip/store | 660 ; round/clip/store |
| 661 paddsw m6, [pw_64] | 661 paddsw m6, [pw_64] |
| 662 psraw m6, 7 | 662 psraw m6, 7 |
| 663 packuswb m6, m7 | 663 packuswb m6, m7 |
| 664 movh [r0], m6 | 664 movh [r0], m6 |
| 665 | 665 |
| 666 ; go to next line | 666 ; go to next line |
| 667 add r0, r1 | 667 add r0, r1 |
| 668 add r2, r3 | 668 add r2, r3 |
| 669 dec r4 ; next row | 669 dec r4d ; next row |
| 670 jg .nextrow | 670 jg .nextrow |
| 671 REP_RET | 671 REP_RET |
| 672 %endmacro | 672 %endmacro |
| 673 | 673 |
| 674 INIT_MMX | 674 INIT_MMX |
| 675 FILTER_V mmxext, 4, 0 | 675 FILTER_V mmxext, 4, 0 |
| 676 INIT_XMM | 676 INIT_XMM |
| 677 FILTER_V sse2, 8, 8 | 677 FILTER_V sse2, 8, 8 |
| 678 | 678 |
| 679 %macro FILTER_BILINEAR 3 | 679 %macro FILTER_BILINEAR 3 |
| (...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 711 movh [r0+r1*0], m0 | 711 movh [r0+r1*0], m0 |
| 712 movh [r0+r1*1], m2 | 712 movh [r0+r1*1], m2 |
| 713 %else | 713 %else |
| 714 packuswb m0, m2 | 714 packuswb m0, m2 |
| 715 movh [r0+r1*0], m0 | 715 movh [r0+r1*0], m0 |
| 716 movhps [r0+r1*1], m0 | 716 movhps [r0+r1*1], m0 |
| 717 %endif | 717 %endif |
| 718 | 718 |
| 719 lea r0, [r0+r1*2] | 719 lea r0, [r0+r1*2] |
| 720 lea r2, [r2+r3*2] | 720 lea r2, [r2+r3*2] |
| 721 sub r4, 2 | 721 sub r4d, 2 |
| 722 jg .nextrow | 722 jg .nextrow |
| 723 REP_RET | 723 REP_RET |
| 724 | 724 |
| 725 cglobal put_vp8_bilinear%2_h_%1, 7,7,%3 | 725 cglobal put_vp8_bilinear%2_h_%1, 7,7,%3 |
| 726 mov r6d, 8*16 | 726 mov r6d, 8*16 |
| 727 shl r5d, 4 | 727 shl r5d, 4 |
| 728 sub r6d, r5d | 728 sub r6d, r5d |
| 729 %ifdef PIC | 729 %ifdef PIC |
| 730 lea r11, [bilinear_filter_vw_m] | 730 lea r11, [bilinear_filter_vw_m] |
| 731 %endif | 731 %endif |
| (...skipping 25 matching lines...) Expand all Loading... |
| 757 movh [r0+r1*0], m0 | 757 movh [r0+r1*0], m0 |
| 758 movh [r0+r1*1], m2 | 758 movh [r0+r1*1], m2 |
| 759 %else | 759 %else |
| 760 packuswb m0, m2 | 760 packuswb m0, m2 |
| 761 movh [r0+r1*0], m0 | 761 movh [r0+r1*0], m0 |
| 762 movhps [r0+r1*1], m0 | 762 movhps [r0+r1*1], m0 |
| 763 %endif | 763 %endif |
| 764 | 764 |
| 765 lea r0, [r0+r1*2] | 765 lea r0, [r0+r1*2] |
| 766 lea r2, [r2+r3*2] | 766 lea r2, [r2+r3*2] |
| 767 sub r4, 2 | 767 sub r4d, 2 |
| 768 jg .nextrow | 768 jg .nextrow |
| 769 REP_RET | 769 REP_RET |
| 770 %endmacro | 770 %endmacro |
| 771 | 771 |
| 772 INIT_MMX | 772 INIT_MMX |
| 773 FILTER_BILINEAR mmxext, 4, 0 | 773 FILTER_BILINEAR mmxext, 4, 0 |
| 774 INIT_XMM | 774 INIT_XMM |
| 775 FILTER_BILINEAR sse2, 8, 7 | 775 FILTER_BILINEAR sse2, 8, 7 |
| 776 | 776 |
| 777 %macro FILTER_BILINEAR_SSSE3 1 | 777 %macro FILTER_BILINEAR_SSSE3 1 |
| (...skipping 22 matching lines...) Expand all Loading... |
| 800 movh [r0+r1*0], m0 | 800 movh [r0+r1*0], m0 |
| 801 movh [r0+r1*1], m1 | 801 movh [r0+r1*1], m1 |
| 802 %else | 802 %else |
| 803 packuswb m0, m1 | 803 packuswb m0, m1 |
| 804 movh [r0+r1*0], m0 | 804 movh [r0+r1*0], m0 |
| 805 movhps [r0+r1*1], m0 | 805 movhps [r0+r1*1], m0 |
| 806 %endif | 806 %endif |
| 807 | 807 |
| 808 lea r0, [r0+r1*2] | 808 lea r0, [r0+r1*2] |
| 809 lea r2, [r2+r3*2] | 809 lea r2, [r2+r3*2] |
| 810 sub r4, 2 | 810 sub r4d, 2 |
| 811 jg .nextrow | 811 jg .nextrow |
| 812 REP_RET | 812 REP_RET |
| 813 | 813 |
| 814 cglobal put_vp8_bilinear%1_h_ssse3, 7,7 | 814 cglobal put_vp8_bilinear%1_h_ssse3, 7,7 |
| 815 shl r5d, 4 | 815 shl r5d, 4 |
| 816 %ifdef PIC | 816 %ifdef PIC |
| 817 lea r11, [bilinear_filter_vb_m] | 817 lea r11, [bilinear_filter_vb_m] |
| 818 %endif | 818 %endif |
| 819 pxor m4, m4 | 819 pxor m4, m4 |
| 820 mova m2, [filter_h2_shuf] | 820 mova m2, [filter_h2_shuf] |
| (...skipping 15 matching lines...) Expand all Loading... |
| 836 movh [r0+r1*0], m0 | 836 movh [r0+r1*0], m0 |
| 837 movh [r0+r1*1], m1 | 837 movh [r0+r1*1], m1 |
| 838 %else | 838 %else |
| 839 packuswb m0, m1 | 839 packuswb m0, m1 |
| 840 movh [r0+r1*0], m0 | 840 movh [r0+r1*0], m0 |
| 841 movhps [r0+r1*1], m0 | 841 movhps [r0+r1*1], m0 |
| 842 %endif | 842 %endif |
| 843 | 843 |
| 844 lea r0, [r0+r1*2] | 844 lea r0, [r0+r1*2] |
| 845 lea r2, [r2+r3*2] | 845 lea r2, [r2+r3*2] |
| 846 sub r4, 2 | 846 sub r4d, 2 |
| 847 jg .nextrow | 847 jg .nextrow |
| 848 REP_RET | 848 REP_RET |
| 849 %endmacro | 849 %endmacro |
| 850 | 850 |
| 851 INIT_MMX | 851 INIT_MMX |
| 852 FILTER_BILINEAR_SSSE3 4 | 852 FILTER_BILINEAR_SSSE3 4 |
| 853 INIT_XMM | 853 INIT_XMM |
| 854 FILTER_BILINEAR_SSSE3 8 | 854 FILTER_BILINEAR_SSSE3 8 |
| 855 | 855 |
| 856 cglobal put_vp8_pixels8_mmx, 5,5 | 856 cglobal put_vp8_pixels8_mmx, 5,5 |
| (...skipping 478 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1335 %if %10 == 16 | 1335 %if %10 == 16 |
| 1336 movd [%6+%9*4], m%3 | 1336 movd [%6+%9*4], m%3 |
| 1337 %endif | 1337 %endif |
| 1338 movd [%7+%9], m%4 | 1338 movd [%7+%9], m%4 |
| 1339 | 1339 |
| 1340 ; write dwords 2 | 1340 ; write dwords 2 |
| 1341 psrldq m%1, 4 | 1341 psrldq m%1, 4 |
| 1342 psrldq m%2, 4 | 1342 psrldq m%2, 4 |
| 1343 %if %10 == 8 | 1343 %if %10 == 8 |
| 1344 movd [%5+%8*2], m%1 | 1344 movd [%5+%8*2], m%1 |
| 1345 movd %5, m%3 | 1345 movd %5d, m%3 |
| 1346 %endif | 1346 %endif |
| 1347 psrldq m%3, 4 | 1347 psrldq m%3, 4 |
| 1348 psrldq m%4, 4 | 1348 psrldq m%4, 4 |
| 1349 %if %10 == 16 | 1349 %if %10 == 16 |
| 1350 movd [%5+%8*2], m%1 | 1350 movd [%5+%8*2], m%1 |
| 1351 %endif | 1351 %endif |
| 1352 movd [%6+%9], m%2 | 1352 movd [%6+%9], m%2 |
| 1353 movd [%7+%8*2], m%3 | 1353 movd [%7+%8*2], m%3 |
| 1354 movd [%7+%9*2], m%4 | 1354 movd [%7+%9*2], m%4 |
| 1355 add %7, %9 | 1355 add %7, %9 |
| (...skipping 16 matching lines...) Expand all Loading... |
| 1372 | 1372 |
| 1373 ; write 4 or 8 words in the mmx/xmm registers as 8 lines | 1373 ; write 4 or 8 words in the mmx/xmm registers as 8 lines |
| 1374 ; 1 and 2 are the registers to write, this can be the same (for SSE2) | 1374 ; 1 and 2 are the registers to write, this can be the same (for SSE2) |
| 1375 ; for pre-SSE4: | 1375 ; for pre-SSE4: |
| 1376 ; 3 is a general-purpose register that we will clobber | 1376 ; 3 is a general-purpose register that we will clobber |
| 1377 ; for SSE4: | 1377 ; for SSE4: |
| 1378 ; 3 is a pointer to the destination's 5th line | 1378 ; 3 is a pointer to the destination's 5th line |
| 1379 ; 4 is a pointer to the destination's 4th line | 1379 ; 4 is a pointer to the destination's 4th line |
| 1380 ; 5/6 is -stride and +stride | 1380 ; 5/6 is -stride and +stride |
| 1381 %macro WRITE_2x4W 6 | 1381 %macro WRITE_2x4W 6 |
| 1382 movd %3, %1 | 1382 movd %3d, %1 |
| 1383 punpckhdq %1, %1 | 1383 punpckhdq %1, %1 |
| 1384 mov [%4+%5*4], %3w | 1384 mov [%4+%5*4], %3w |
| 1385 shr %3, 16 | 1385 shr %3, 16 |
| 1386 add %4, %6 | 1386 add %4, %6 |
| 1387 mov [%4+%5*4], %3w | 1387 mov [%4+%5*4], %3w |
| 1388 | 1388 |
| 1389 movd %3, %1 | 1389 movd %3d, %1 |
| 1390 add %4, %5 | 1390 add %4, %5 |
| 1391 mov [%4+%5*2], %3w | 1391 mov [%4+%5*2], %3w |
| 1392 shr %3, 16 | 1392 shr %3, 16 |
| 1393 mov [%4+%5 ], %3w | 1393 mov [%4+%5 ], %3w |
| 1394 | 1394 |
| 1395 movd %3, %2 | 1395 movd %3d, %2 |
| 1396 punpckhdq %2, %2 | 1396 punpckhdq %2, %2 |
| 1397 mov [%4 ], %3w | 1397 mov [%4 ], %3w |
| 1398 shr %3, 16 | 1398 shr %3, 16 |
| 1399 mov [%4+%6 ], %3w | 1399 mov [%4+%6 ], %3w |
| 1400 | 1400 |
| 1401 movd %3, %2 | 1401 movd %3d, %2 |
| 1402 add %4, %6 | 1402 add %4, %6 |
| 1403 mov [%4+%6 ], %3w | 1403 mov [%4+%6 ], %3w |
| 1404 shr %3, 16 | 1404 shr %3, 16 |
| 1405 mov [%4+%6*2], %3w | 1405 mov [%4+%6*2], %3w |
| 1406 add %4, %5 | 1406 add %4, %5 |
| 1407 %endmacro | 1407 %endmacro |
| 1408 | 1408 |
| 1409 %macro WRITE_8W_SSE2 5 | 1409 %macro WRITE_8W_SSE2 5 |
| 1410 movd %2, %1 | 1410 movd %2d, %1 |
| 1411 psrldq %1, 4 | 1411 psrldq %1, 4 |
| 1412 mov [%3+%4*4], %2w | 1412 mov [%3+%4*4], %2w |
| 1413 shr %2, 16 | 1413 shr %2, 16 |
| 1414 add %3, %5 | 1414 add %3, %5 |
| 1415 mov [%3+%4*4], %2w | 1415 mov [%3+%4*4], %2w |
| 1416 | 1416 |
| 1417 movd %2, %1 | 1417 movd %2d, %1 |
| 1418 psrldq %1, 4 | 1418 psrldq %1, 4 |
| 1419 add %3, %4 | 1419 add %3, %4 |
| 1420 mov [%3+%4*2], %2w | 1420 mov [%3+%4*2], %2w |
| 1421 shr %2, 16 | 1421 shr %2, 16 |
| 1422 mov [%3+%4 ], %2w | 1422 mov [%3+%4 ], %2w |
| 1423 | 1423 |
| 1424 movd %2, %1 | 1424 movd %2d, %1 |
| 1425 psrldq %1, 4 | 1425 psrldq %1, 4 |
| 1426 mov [%3 ], %2w | 1426 mov [%3 ], %2w |
| 1427 shr %2, 16 | 1427 shr %2, 16 |
| 1428 mov [%3+%5 ], %2w | 1428 mov [%3+%5 ], %2w |
| 1429 | 1429 |
| 1430 movd %2, %1 | 1430 movd %2d, %1 |
| 1431 add %3, %5 | 1431 add %3, %5 |
| 1432 mov [%3+%5 ], %2w | 1432 mov [%3+%5 ], %2w |
| 1433 shr %2, 16 | 1433 shr %2, 16 |
| 1434 mov [%3+%5*2], %2w | 1434 mov [%3+%5*2], %2w |
| 1435 %endmacro | 1435 %endmacro |
| 1436 | 1436 |
| 1437 %macro WRITE_8W_SSE4 5 | 1437 %macro WRITE_8W_SSE4 5 |
| 1438 pextrw [%3+%4*4], %1, 0 | 1438 pextrw [%3+%4*4], %1, 0 |
| 1439 pextrw [%2+%4*4], %1, 1 | 1439 pextrw [%2+%4*4], %1, 1 |
| 1440 pextrw [%3+%4*2], %1, 2 | 1440 pextrw [%3+%4*2], %1, 2 |
| 1441 pextrw [%3+%4 ], %1, 3 | 1441 pextrw [%3+%4 ], %1, 3 |
| 1442 pextrw [%3 ], %1, 4 | 1442 pextrw [%3 ], %1, 4 |
| 1443 pextrw [%2 ], %1, 5 | 1443 pextrw [%2 ], %1, 5 |
| 1444 pextrw [%2+%5 ], %1, 6 | 1444 pextrw [%2+%5 ], %1, 6 |
| 1445 pextrw [%2+%5*2], %1, 7 | 1445 pextrw [%2+%5*2], %1, 7 |
| 1446 %endmacro | 1446 %endmacro |
| 1447 | 1447 |
| 1448 %macro SPLATB_REG_MMX 2-3 | 1448 %macro SPLATB_REG_MMX 2-3 |
| 1449 movd %1, %2 | 1449 movd %1, %2d |
| 1450 punpcklbw %1, %1 | 1450 punpcklbw %1, %1 |
| 1451 punpcklwd %1, %1 | 1451 punpcklwd %1, %1 |
| 1452 punpckldq %1, %1 | 1452 punpckldq %1, %1 |
| 1453 %endmacro | 1453 %endmacro |
| 1454 | 1454 |
| 1455 %macro SPLATB_REG_MMXEXT 2-3 | 1455 %macro SPLATB_REG_MMXEXT 2-3 |
| 1456 movd %1, %2 | 1456 movd %1, %2d |
| 1457 punpcklbw %1, %1 | 1457 punpcklbw %1, %1 |
| 1458 pshufw %1, %1, 0x0 | 1458 pshufw %1, %1, 0x0 |
| 1459 %endmacro | 1459 %endmacro |
| 1460 | 1460 |
| 1461 %macro SPLATB_REG_SSE2 2-3 | 1461 %macro SPLATB_REG_SSE2 2-3 |
| 1462 movd %1, %2 | 1462 movd %1, %2d |
| 1463 punpcklbw %1, %1 | 1463 punpcklbw %1, %1 |
| 1464 pshuflw %1, %1, 0x0 | 1464 pshuflw %1, %1, 0x0 |
| 1465 punpcklqdq %1, %1 | 1465 punpcklqdq %1, %1 |
| 1466 %endmacro | 1466 %endmacro |
| 1467 | 1467 |
| 1468 %macro SPLATB_REG_SSSE3 3 | 1468 %macro SPLATB_REG_SSSE3 3 |
| 1469 movd %1, %2 | 1469 movd %1, %2d |
| 1470 pshufb %1, %3 | 1470 pshufb %1, %3 |
| 1471 %endmacro | 1471 %endmacro |
| 1472 | 1472 |
| 1473 %macro SIMPLE_LOOPFILTER 3 | 1473 %macro SIMPLE_LOOPFILTER 4 |
| 1474 cglobal vp8_%2_loop_filter_simple_%1, 3, %3 | 1474 cglobal vp8_%2_loop_filter_simple_%1, 3, %3, %4 |
| 1475 %if mmsize == 8 ; mmx/mmxext | 1475 %if mmsize == 8 ; mmx/mmxext |
| 1476 mov r3, 2 | 1476 mov r3, 2 |
| 1477 %endif | 1477 %endif |
| 1478 %ifnidn %1, sse2 | 1478 %ifnidn %1, sse2 |
| 1479 %if mmsize == 16 | 1479 %if mmsize == 16 |
| 1480 pxor m0, m0 | 1480 pxor m0, m0 |
| 1481 %endif | 1481 %endif |
| 1482 %endif | 1482 %endif |
| 1483 SPLATB_REG m7, r2, m0 ; splat "flim" into register | 1483 SPLATB_REG m7, r2, m0 ; splat "flim" into register |
| 1484 | 1484 |
| (...skipping 120 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1605 dec r3 | 1605 dec r3 |
| 1606 jg .next8px | 1606 jg .next8px |
| 1607 REP_RET | 1607 REP_RET |
| 1608 %else ; sse2 | 1608 %else ; sse2 |
| 1609 RET | 1609 RET |
| 1610 %endif | 1610 %endif |
| 1611 %endmacro | 1611 %endmacro |
| 1612 | 1612 |
| 1613 INIT_MMX | 1613 INIT_MMX |
| 1614 %define SPLATB_REG SPLATB_REG_MMX | 1614 %define SPLATB_REG SPLATB_REG_MMX |
| 1615 SIMPLE_LOOPFILTER mmx, v, 4 | 1615 SIMPLE_LOOPFILTER mmx, v, 4, 0 |
| 1616 SIMPLE_LOOPFILTER mmx, h, 5 | 1616 SIMPLE_LOOPFILTER mmx, h, 5, 0 |
| 1617 %define SPLATB_REG SPLATB_REG_MMXEXT | 1617 %define SPLATB_REG SPLATB_REG_MMXEXT |
| 1618 SIMPLE_LOOPFILTER mmxext, v, 4 | 1618 SIMPLE_LOOPFILTER mmxext, v, 4, 0 |
| 1619 SIMPLE_LOOPFILTER mmxext, h, 5 | 1619 SIMPLE_LOOPFILTER mmxext, h, 5, 0 |
| 1620 INIT_XMM | 1620 INIT_XMM |
| 1621 %define SPLATB_REG SPLATB_REG_SSE2 | 1621 %define SPLATB_REG SPLATB_REG_SSE2 |
| 1622 %define WRITE_8W WRITE_8W_SSE2 | 1622 %define WRITE_8W WRITE_8W_SSE2 |
| 1623 SIMPLE_LOOPFILTER sse2, v, 3 | 1623 SIMPLE_LOOPFILTER sse2, v, 3, 8 |
| 1624 SIMPLE_LOOPFILTER sse2, h, 5 | 1624 SIMPLE_LOOPFILTER sse2, h, 5, 8 |
| 1625 %define SPLATB_REG SPLATB_REG_SSSE3 | 1625 %define SPLATB_REG SPLATB_REG_SSSE3 |
| 1626 SIMPLE_LOOPFILTER ssse3, v, 3 | 1626 SIMPLE_LOOPFILTER ssse3, v, 3, 8 |
| 1627 SIMPLE_LOOPFILTER ssse3, h, 5 | 1627 SIMPLE_LOOPFILTER ssse3, h, 5, 8 |
| 1628 %define WRITE_8W WRITE_8W_SSE4 | 1628 %define WRITE_8W WRITE_8W_SSE4 |
| 1629 SIMPLE_LOOPFILTER sse4, h, 5 | 1629 SIMPLE_LOOPFILTER sse4, h, 5, 8 |
| 1630 | 1630 |
| 1631 ;----------------------------------------------------------------------------- | 1631 ;----------------------------------------------------------------------------- |
| 1632 ; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride, | 1632 ; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride, |
| 1633 ; int flimE, int flimI, int hev_thr); | 1633 ; int flimE, int flimI, int hev_thr); |
| 1634 ;----------------------------------------------------------------------------- | 1634 ;----------------------------------------------------------------------------- |
| 1635 | 1635 |
| 1636 %macro INNER_LOOPFILTER 5 | 1636 %macro INNER_LOOPFILTER 5 |
| 1637 %if %4 == 8 ; chroma | 1637 %if %4 == 8 ; chroma |
| 1638 cglobal vp8_%2_loop_filter8uv_inner_%1, 6, %3, %5 | 1638 cglobal vp8_%2_loop_filter8uv_inner_%1, 6, %3, %5 |
| 1639 %define dst8_reg r1 | 1639 %define dst8_reg r1 |
| (...skipping 1209 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2849 MBEDGE_LOOPFILTER ssse3, v, 6, 8, 15 | 2849 MBEDGE_LOOPFILTER ssse3, v, 6, 8, 15 |
| 2850 MBEDGE_LOOPFILTER ssse3, h, 6, 8, 15 | 2850 MBEDGE_LOOPFILTER ssse3, h, 6, 8, 15 |
| 2851 | 2851 |
| 2852 %define WRITE_8W WRITE_8W_SSE4 | 2852 %define WRITE_8W WRITE_8W_SSE4 |
| 2853 %ifdef m8 | 2853 %ifdef m8 |
| 2854 MBEDGE_LOOPFILTER sse4, h, 5, 16, 15 | 2854 MBEDGE_LOOPFILTER sse4, h, 5, 16, 15 |
| 2855 %else | 2855 %else |
| 2856 MBEDGE_LOOPFILTER sse4, h, 6, 16, 15 | 2856 MBEDGE_LOOPFILTER sse4, h, 6, 16, 15 |
| 2857 %endif | 2857 %endif |
| 2858 MBEDGE_LOOPFILTER sse4, h, 6, 8, 15 | 2858 MBEDGE_LOOPFILTER sse4, h, 6, 8, 15 |
| OLD | NEW |