OLD | NEW |
1 ; | 1 ; |
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 ; | 3 ; |
4 ; Use of this source code is governed by a BSD-style license | 4 ; Use of this source code is governed by a BSD-style license |
5 ; that can be found in the LICENSE file in the root of the source | 5 ; that can be found in the LICENSE file in the root of the source |
6 ; tree. An additional intellectual property rights grant can be found | 6 ; tree. An additional intellectual property rights grant can be found |
7 ; in the file PATENTS. All contributing project authors may | 7 ; in the file PATENTS. All contributing project authors may |
8 ; be found in the AUTHORS file in the root of the source tree. | 8 ; be found in the AUTHORS file in the root of the source tree. |
9 ; | 9 ; |
10 | 10 |
11 %include "third_party/x86inc/x86inc.asm" | 11 %include "third_party/x86inc/x86inc.asm" |
12 | 12 |
13 SECTION_RODATA | 13 SECTION_RODATA |
14 | 14 |
15 pb_1: times 16 db 1 | 15 pb_1: times 16 db 1 |
16 pw_2: times 8 dw 2 | 16 sh_b01234577: db 0, 1, 2, 3, 4, 5, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 |
17 pb_7m1: times 8 db 7, -1 | 17 sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 |
18 pb_15: times 16 db 15 | |
19 | |
20 sh_b01234577: db 0, 1, 2, 3, 4, 5, 7, 7 | |
21 sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7 | |
22 sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 | 18 sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 |
23 sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7 | 19 sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7 |
24 sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 | 20 sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 |
25 sh_b2345677777777777: db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 | 21 sh_b2345677777777777: db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 |
26 sh_b2w01234577: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 7, -1, 7, -1 | |
27 sh_b2w12345677: db 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 7, -1 | |
28 sh_b2w23456777: db 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 7, -1, 7, -1 | |
29 sh_b2w01234567: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1 | |
30 sh_b2w12345678: db 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 8, -1 | |
31 sh_b2w23456789: db 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 8, -1, 9, -1 | |
32 sh_b2w89abcdef: db 8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1 | |
33 sh_b2w9abcdeff: db 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1, 15, -1 | |
34 sh_b2wabcdefff: db 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1, 15, -1, 15, -1 | |
35 sh_b123456789abcdeff: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15 | 22 sh_b123456789abcdeff: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15 |
36 sh_b23456789abcdefff: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15 | 23 sh_b23456789abcdefff: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15 |
| ; Byte-shuffle masks used as the control operand of pshufb below. The label |
| ; suffix spells, one hex digit per destination byte, the source byte index |
| ; that is selected; masks that name fewer than 16 bytes are padded with |
| ; index 0 so every table is a full 16 bytes. |
| 24 sh_b32104567: db 3, 2, 1, 0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0 |
| 25 sh_b8091a2b345: db 8, 0, 9, 1, 10, 2, 11, 3, 4, 5, 0, 0, 0, 0, 0, 0 |
| 26 sh_b76543210: db 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| 27 sh_b65432108: db 6, 5, 4, 3, 2, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0 |
| 28 sh_b54321089: db 5, 4, 3, 2, 1, 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0 |
| 29 sh_b89abcdef: db 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 |
| 30 sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 |
| 31 sh_b1233: db 1, 2, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
| 32 sh_b2333: db 2, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
37 | 33 |
38 SECTION .text | 34 SECTION .text |
39 | 35 |
40 INIT_MMX ssse3 | 36 INIT_MMX ssse3 |
41 cglobal h_predictor_4x4, 2, 4, 3, dst, stride, line, left | 37 cglobal h_predictor_4x4, 2, 4, 3, dst, stride, line, left |
42 movifnidn leftq, leftmp | 38 movifnidn leftq, leftmp |
43 add leftq, 4 | 39 add leftq, 4 |
44 mov lineq, -2 | 40 mov lineq, -2 |
45 pxor m0, m0 | 41 pxor m0, m0 |
46 .loop: | 42 .loop: |
(...skipping 251 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
298 mova [dstq +strideq*2+16], m4 | 294 mova [dstq +strideq*2+16], m4 |
299 mova [dstq +stride3q +16], m4 | 295 mova [dstq +stride3q +16], m4 |
300 lea dstq, [dstq +strideq*4] | 296 lea dstq, [dstq +strideq*4] |
301 mova [dstq +16], m4 | 297 mova [dstq +16], m4 |
302 mova [dstq +strideq +16], m4 | 298 mova [dstq +strideq +16], m4 |
303 mova [dstq +strideq*2+16], m4 | 299 mova [dstq +strideq*2+16], m4 |
304 mova [dstq +stride3q +16], m4 | 300 mova [dstq +stride3q +16], m4 |
305 | 301 |
306 RESTORE_GOT | 302 RESTORE_GOT |
307 RET | 303 RET |
| 304 |
| 305 ; ------------------------------------------ |
| 306 ; input: x, y, z, result |
| 307 ; |
| ; Per-byte 3-tap filter: %4 = (%1 + 2*%2 + %3 + 2) >> 2 |
| ; %1 = x and %2 = y are only read, %3 = z is overwritten (used as scratch), |
| ; %4 receives the result. |
| ; |
| 308 ; trick from pascal |
| 309 ; (x+2y+z+2)>>2 can be calculated as: |
| 310 ; result = avg(x,z) |
| 311 ; result -= xor(x,z) & 1 |
| 312 ; result = avg(result,y) |
| ; pavgb rounds up, so the middle step removes that rounding bit again and |
| ; leaves floor((x+z)/2); the final rounding average with y is then exact. |
| 313 ; ------------------------------------------ |
| 314 %macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4 |
| 315 pavgb %4, %1, %3 ; %4 = (x + z + 1) >> 1 |
| 316 pxor %3, %1 ; bit 0 of %3 = 1 where x + z is odd |
| 317 pand %3, [GLOBAL(pb_1)] ; keep only that rounding bit |
| 318 psubb %4, %3 ; %4 = (x + z) >> 1 |
| 319 pavgb %4, %2 ; %4 = (((x + z) >> 1) + y + 1) >> 1 |
| 320 %endmacro |
| 321 |
| ; d63_predictor_4x4(dst, stride, above) |
| ; Writes four 4-byte rows at dst, stride bytes apart, computed from the 8 |
| ; bytes loaded at above. Rows 0 and 2 are the 2-tap average of neighbouring |
| ; above bytes, rows 1 and 3 the 3-tap (x+2y+z+2)>>2 filter; rows 2/3 are |
| ; rows 0/1 advanced by one byte. |
| 322 INIT_XMM ssse3 |
| 323 cglobal d63_predictor_4x4, 3, 4, 5, dst, stride, above, goffset |
| 324 GET_GOT goffsetq |
| 325 |
| 326 movq m3, [aboveq] ; above[0..7] |
| 327 pshufb m1, m3, [GLOBAL(sh_b23456777)] ; above[i+2], last byte repeated |
| 328 pshufb m2, m3, [GLOBAL(sh_b12345677)] ; above[i+1], last byte repeated |
| 329 |
| 330 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4 ; m4 = 3-tap rows (m1 clobbered) |
| 331 pavgb m3, m2 ; m3 = 2-tap rows |
| 332 |
| 333 ; store 4 lines |
| 334 movd [dstq ], m3 |
| 335 movd [dstq+strideq], m4 |
| 336 lea dstq, [dstq+strideq*2] |
| 337 psrldq m3, 1 ; advance both filtered rows by one byte |
| 338 psrldq m4, 1 |
| 339 movd [dstq ], m3 |
| 340 movd [dstq+strideq], m4 |
| 341 RESTORE_GOT |
| 342 RET |
| 343 |
| ; d63_predictor_8x8(dst, stride, above) |
| ; Writes eight 8-byte rows at dst from the 8 bytes loaded at above. Even rows |
| ; hold the 2-tap average, odd rows the 3-tap filter; each following row pair |
| ; is the previous pair advanced by one byte. The 16-byte shuffle masks repeat |
| ; byte 7 into the upper half so the byte shifts keep feeding that value in. |
| 344 INIT_XMM ssse3 |
| 345 cglobal d63_predictor_8x8, 3, 4, 5, dst, stride, above, goffset |
| 346 GET_GOT goffsetq |
| 347 |
| 348 movq m3, [aboveq] |
| 349 DEFINE_ARGS dst, stride, stride3 ; above is dead from here on, its register becomes stride3 |
| 350 lea stride3q, [strideq*3] |
| 351 pshufb m1, m3, [GLOBAL(sh_b2345677777777777)] ; above[i+2] |
| 352 pshufb m0, m3, [GLOBAL(sh_b0123456777777777)] ; above[i] |
| 353 pshufb m2, m3, [GLOBAL(sh_b1234567777777777)] ; above[i+1] |
| ; NOTE(review): m3 ends up identical to m0 (same mask, same source) and the |
| ; macro below does not modify its first operand - one shuffle looks redundant. |
| 354 pshufb m3, [GLOBAL(sh_b0123456777777777)] |
| 355 |
| 356 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m4 ; m4 = 3-tap rows (m1 clobbered) |
| 357 pavgb m3, m2 ; m3 = 2-tap rows |
| 358 |
| 359 ; store 4 lines |
| 360 movq [dstq ], m3 |
| 361 movq [dstq+strideq], m4 |
| 362 psrldq m3, 1 |
| 363 psrldq m4, 1 |
| 364 movq [dstq+strideq*2], m3 |
| 365 movq [dstq+stride3q ], m4 |
| 366 lea dstq, [dstq+strideq*4] |
| 367 psrldq m3, 1 |
| 368 psrldq m4, 1 |
| 369 |
| 370 ; store 4 lines |
| 371 movq [dstq ], m3 |
| 372 movq [dstq+strideq], m4 |
| 373 psrldq m3, 1 |
| 374 psrldq m4, 1 |
| 375 movq [dstq+strideq*2], m3 |
| 376 movq [dstq+stride3q ], m4 |
| 377 RESTORE_GOT |
| 378 RET |
| 379 |
| ; d63_predictor_16x16(dst, stride, above) |
| ; Writes sixteen 16-byte rows at dst from the 16 bytes loaded (aligned) at |
| ; above. m0 holds the 2-tap row, m4 the 3-tap row; the loop stores four rows |
| ; per iteration and advances both registers by one byte between row pairs. |
| 380 INIT_XMM ssse3 |
| 381 cglobal d63_predictor_16x16, 3, 5, 5, dst, stride, above, line, goffset |
| 382 GET_GOT goffsetq |
| 383 |
| 384 mova m0, [aboveq] |
| 385 DEFINE_ARGS dst, stride, stride3, line |
| 386 lea stride3q, [strideq*3] |
| 387 mova m1, [GLOBAL(sh_b123456789abcdeff)] ; "advance by one byte, repeat last byte" mask |
| 388 pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)] ; above[i+2] |
| 389 pshufb m3, m0, m1 ; above[i+1] |
| 390 |
| 391 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m3, m2, m4 ; m4 = 3-tap row (m2 clobbered) |
| 392 pavgb m0, m3 ; m0 = 2-tap row |
| 393 |
| 394 mov lined, 4 ; 4 iterations x 4 rows |
| 395 .loop: |
| 396 mova [dstq ], m0 |
| 397 mova [dstq+strideq ], m4 |
| 398 pshufb m0, m1 |
| 399 pshufb m4, m1 |
| 400 mova [dstq+strideq*2], m0 |
| 401 mova [dstq+stride3q ], m4 |
| 402 pshufb m0, m1 |
| 403 pshufb m4, m1 |
| 404 lea dstq, [dstq+strideq*4] |
| 405 dec lined |
| 406 jnz .loop |
| 407 RESTORE_GOT |
| 408 REP_RET |
| 409 |
| ; d63_predictor_32x32(dst, stride, above) |
| ; Writes thirty-two 32-byte rows at dst from the 32 bytes loaded (aligned) at |
| ; above. Each row is kept as a low/high 16-byte pair: m0/m7 for the 2-tap |
| ; row and m2/m4 for the 3-tap row. Between row pairs the low half is advanced |
| ; with palignr (pulling a byte from the high half) and the high half with a |
| ; pshufb that repeats its last byte. |
| 410 INIT_XMM ssse3 |
| 411 cglobal d63_predictor_32x32, 3, 5, 8, dst, stride, above, line, goffset |
| 412 GET_GOT goffsetq |
| 413 |
| 414 mova m0, [aboveq] ; above[0..15] |
| 415 mova m7, [aboveq+16] ; above[16..31] |
| 416 DEFINE_ARGS dst, stride, stride3, line |
| 417 mova m1, [GLOBAL(sh_b123456789abcdeff)] ; "advance by one byte, repeat last byte" mask |
| 418 lea stride3q, [strideq*3] |
| 419 pshufb m2, m7, [GLOBAL(sh_b23456789abcdefff)] |
| 420 pshufb m3, m7, m1 |
| 421 |
| 422 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m2, m4 ; m4 = 3-tap, high half |
| 423 palignr m6, m7, m0, 1 ; above[1..16] |
| 424 palignr m5, m7, m0, 2 ; above[2..17] |
| 425 pavgb m7, m3 ; m7 = 2-tap, high half |
| 426 |
| 427 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m6, m5, m2 ; m2 = 3-tap, low half |
| 428 pavgb m0, m6 ; m0 = 2-tap, low half |
| 429 |
| 430 mov lined, 8 ; 8 iterations x 4 rows |
| 431 .loop: |
| 432 mova [dstq ], m0 |
| 433 mova [dstq +16], m7 |
| 434 mova [dstq+strideq ], m2 |
| 435 mova [dstq+strideq +16], m4 |
| 436 palignr m3, m7, m0, 1 ; low halves advanced by one byte |
| 437 palignr m5, m4, m2, 1 |
| 438 pshufb m7, m1 ; high halves advanced by one byte |
| 439 pshufb m4, m1 |
| 440 |
| 441 mova [dstq+strideq*2 ], m3 |
| 442 mova [dstq+strideq*2+16], m7 |
| 443 mova [dstq+stride3q ], m5 |
| 444 mova [dstq+stride3q +16], m4 |
| 445 palignr m0, m7, m3, 1 |
| 446 palignr m2, m4, m5, 1 |
| 447 pshufb m7, m1 |
| 448 pshufb m4, m1 |
| 449 lea dstq, [dstq+strideq*4] |
| 450 dec lined |
| 451 jnz .loop |
| 452 RESTORE_GOT |
| 453 REP_RET |
| 454 |
| ; d153_predictor_4x4(dst, stride, above, left) |
| ; Writes four 4-byte rows at dst from 4 bytes at left and 4 bytes starting at |
| ; above-1 (top-left, then the top row). The rows are built bottom-up: the |
| ; interleaved 2-tap/3-tap left-edge values are followed by the 3-tap top |
| ; values, and each higher row is that vector advanced by two bytes. |
| 455 INIT_XMM ssse3 |
| 456 cglobal d153_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset |
| 457 GET_GOT goffsetq |
| 458 movd m0, [leftq] ; l1, l2, l3, l4 |
| 459 movd m1, [aboveq-1] ; tl, t1, t2, t3 |
| 460 punpckldq m0, m1 ; l1, l2, l3, l4, tl, t1, t2, t3 |
| 461 pshufb m0, [GLOBAL(sh_b32104567)]; l4, l3, l2, l1, tl, t1, t2, t3 |
| 462 psrldq m1, m0, 1 ; l3, l2, l1, tl, t1, t2, t3 |
| 463 psrldq m2, m0, 2 ; l2, l1, tl, t1, t2, t3 |
| 464 ; comments below are for a predictor like this |
| 465 ; A1 B1 C1 D1 |
| 466 ; A2 B2 A1 B1 |
| 467 ; A3 B3 A2 B2 |
| 468 ; A4 B4 A3 B3 |
| 469 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; 3-tap avg B4 B3 B2 B1 C1 D1 |
| 470 pavgb m1, m0 ; 2-tap avg A4 A3 A2 A1 |
| 471 |
| 472 punpcklqdq m3, m1 ; B4 B3 B2 B1 C1 D1 x x A4 A3 A2 A1 .. |
| 473 |
| 474 DEFINE_ARGS dst, stride, stride3 |
| 475 lea stride3q, [strideq*3] |
| 476 pshufb m3, [GLOBAL(sh_b8091a2b345)] ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 .. |
| 477 movd [dstq+stride3q ], m3 |
| 478 psrldq m3, 2 ; A3 B3 A2 B2 A1 B1 C1 D1 .. |
| 479 movd [dstq+strideq*2], m3 |
| 480 psrldq m3, 2 ; A2 B2 A1 B1 C1 D1 .. |
| 481 movd [dstq+strideq ], m3 |
| 482 psrldq m3, 2 ; A1 B1 C1 D1 .. |
| 483 movd [dstq ], m3 |
| 484 RESTORE_GOT |
| 485 RET |
| 486 |
| ; d153_predictor_8x8(dst, stride, above, left) |
| ; Writes eight 8-byte rows at dst from 8 bytes at left and 8 bytes starting |
| ; at above-1 (top-left, then the top row). m6 collects the interleaved |
| ; 2-tap (A) / 3-tap (B) left-edge values in bottom-to-top order, m7 the 3-tap |
| ; top-row values; rows are then cut out of that sequence two bytes apart. |
| 487 INIT_XMM ssse3 |
| 488 cglobal d153_predictor_8x8, 4, 5, 8, dst, stride, above, left, goffset |
| 489 GET_GOT goffsetq |
| 490 movq m0, [leftq] ; [0- 7] l1-8 [byte] |
| 491 movhps m0, [aboveq-1] ; [8-15] tl, t1-7 [byte] |
| 492 pshufb m1, m0, [GLOBAL(sh_b76543210)] ; l8-1 [byte] |
| 493 pshufb m2, m0, [GLOBAL(sh_b65432108)] ; l7-1,tl [byte] |
| 494 pshufb m3, m0, [GLOBAL(sh_b54321089)] ; l6-1,tl,t1 [byte] |
| 495 pshufb m0, [GLOBAL(sh_b89abcdef)] ; tl,t1-7 [byte] |
| 496 psrldq m4, m0, 1 ; t1-7 [byte] |
| 497 psrldq m5, m0, 2 ; t2-7 [byte] |
| 498 ; comments below are for a predictor like this |
| 499 ; A1 B1 C1 D1 E1 F1 G1 H1 |
| 500 ; A2 B2 A1 B1 C1 D1 E1 F1 |
| 501 ; A3 B3 A2 B2 A1 B1 C1 D1 |
| 502 ; A4 B4 A3 B3 A2 B2 A1 B1 |
| 503 ; A5 B5 A4 B4 A3 B3 A2 B2 |
| 504 ; A6 B6 A5 B5 A4 B4 A3 B3 |
| 505 ; A7 B7 A6 B6 A5 B5 A4 B4 |
| 506 ; A8 B8 A7 B7 A6 B6 A5 B5 |
| 507 pavgb m6, m1, m2 ; 2-tap avg A8-A1 |
| 508 |
| 509 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m4, m5, m7 ; 3-tap avg C-H1 |
| 510 |
| 511 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m2, m3, m0 ; 3-tap avg B8-1 |
| 512 |
| 513 punpcklbw m6, m0 ; A-B8, A-B7 ... A-B2, A-B1 |
| 514 |
| 515 DEFINE_ARGS dst, stride, stride3 |
| 516 lea stride3q, [strideq*3] |
| 517 |
| 518 movhps [dstq+stride3q], m6 ; A-B4, A-B3, A-B2, A-B1 |
| 519 palignr m0, m7, m6, 10 ; A-B3, A-B2, A-B1, C-H1 |
| 520 movq [dstq+strideq*2], m0 |
| 521 psrldq m0, 2 ; A-B2, A-B1, C-H1 |
| 522 movq [dstq+strideq ], m0 |
| 523 psrldq m0, 2 ; A-H1 |
| 524 movq [dstq ], m0 |
| 525 lea dstq, [dstq+strideq*4] |
| 526 movq [dstq+stride3q ], m6 ; A-B8, A-B7, A-B6, A-B5 |
| 527 psrldq m6, 2 ; A-B7, A-B6, A-B5, A-B4 |
| 528 movq [dstq+strideq*2], m6 |
| 529 psrldq m6, 2 ; A-B6, A-B5, A-B4, A-B3 |
| 530 movq [dstq+strideq ], m6 |
| 531 psrldq m6, 2 ; A-B5, A-B4, A-B3, A-B2 |
| 532 movq [dstq ], m6 |
| 533 RESTORE_GOT |
| 534 RET |
| 535 |
| ; d153_predictor_16x16(dst, stride, above, left) |
| ; Writes sixteen 16-byte rows at dst from 16 bytes at left (aligned load) and |
| ; 16 bytes starting at above-1 (top-left, then the top row). The interleaved |
| ; 2-tap (A) / 3-tap (B) left-edge values are held bottom-to-top in m4:m6 and |
| ; the 3-tap top-row values in m1; every row is a 16-byte window cut out of |
| ; that sequence with palignr, two bytes further along for each row upwards. |
| 536 INIT_XMM ssse3 |
| 537 cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset |
| 538 GET_GOT goffsetq |
| 539 mova m0, [leftq] |
| 540 movu m7, [aboveq-1] |
| 541 ; comments below are for a predictor like this |
| 542 ; A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1 |
| 543 ; A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 |
| 544 ; A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 |
| 545 ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 |
| 546 ; A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 |
| 547 ; A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 |
| 548 ; A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 |
| 549 ; A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 |
| 550 ; A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 |
| 551 ; Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 |
| 552 ; Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 |
| 553 ; Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 |
| 554 ; Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 |
| 555 ; Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 |
| 556 ; Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 |
| 557 ; Ag Bg Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 |
| 558 pshufb m6, m7, [GLOBAL(sh_bfedcba9876543210)] ; reversed top row: byte 15 = tl, byte 14 = t1 |
| 559 palignr m5, m0, m6, 15 ; tl, l1 .. l15 |
| 560 palignr m3, m0, m6, 14 ; t1, tl, l1 .. l14 |
| 561 |
| 562 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B1-Bg |
| 564 pavgb m5, m0 ; A1 - Ag |
| 565 |
| 566 punpcklbw m0, m4, m5 ; A-B8 ... A-B1 |
| 567 punpckhbw m4, m5 ; A-B9 ... A-Bg |
| 568 |
| 569 pshufb m3, m7, [GLOBAL(sh_b123456789abcdeff)] |
| 570 pshufb m5, m7, [GLOBAL(sh_b23456789abcdefff)] |
| 571 |
| 572 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg C1-P1 |
| 573 |
| 574 pshufb m6, m0, [GLOBAL(sh_bfedcba9876543210)] ; byte-reverse so rows 8..1 run low to high |
| 575 DEFINE_ARGS dst, stride, stride3 |
| 576 lea stride3q, [strideq*3] |
| ; rows 0-7: windows over m1:m6 |
| 577 palignr m2, m1, m6, 14 |
| 578 mova [dstq ], m2 |
| 579 palignr m2, m1, m6, 12 |
| 580 mova [dstq+strideq ], m2 |
| 581 palignr m2, m1, m6, 10 |
| 582 mova [dstq+strideq*2], m2 |
| 583 palignr m2, m1, m6, 8 |
| 584 mova [dstq+stride3q ], m2 |
| 585 lea dstq, [dstq+strideq*4] |
| 586 palignr m2, m1, m6, 6 |
| 587 mova [dstq ], m2 |
| 588 palignr m2, m1, m6, 4 |
| 589 mova [dstq+strideq ], m2 |
| 590 palignr m2, m1, m6, 2 |
| 591 mova [dstq+strideq*2], m2 |
| 592 pshufb m4, [GLOBAL(sh_bfedcba9876543210)] ; byte-reverse the rows 9..16 half as well |
| 593 mova [dstq+stride3q ], m6 |
| 594 lea dstq, [dstq+strideq*4] |
| 595 |
| ; rows 8-15: windows over m6:m4 |
| 596 palignr m2, m6, m4, 14 |
| 597 mova [dstq ], m2 |
| 598 palignr m2, m6, m4, 12 |
| 599 mova [dstq+strideq ], m2 |
| 600 palignr m2, m6, m4, 10 |
| 601 mova [dstq+strideq*2], m2 |
| 602 palignr m2, m6, m4, 8 |
| 603 mova [dstq+stride3q ], m2 |
| 604 lea dstq, [dstq+strideq*4] |
| 605 palignr m2, m6, m4, 6 |
| 606 mova [dstq ], m2 |
| 607 palignr m2, m6, m4, 4 |
| 608 mova [dstq+strideq ], m2 |
| 609 palignr m2, m6, m4, 2 |
| 610 mova [dstq+strideq*2], m2 |
| 611 mova [dstq+stride3q ], m4 |
| 612 RESTORE_GOT |
| 613 RET |
| 614 |
| ; d153_predictor_32x32(dst, stride, above, left) |
| ; Writes thirty-two 32-byte rows at dst from 32 bytes at left (aligned loads) |
| ; and 32 bytes starting at above-1 (top-left, then the top row). |
| ; Rows 0-15 are built from left[0..15] plus the filtered top row (m1 low, |
| ; m2 high); left[] is then re-read to build the interleaved values for |
| ; left[16..31], which feed rows 16-31. Every row is a pair of 16-byte |
| ; palignr windows over that sequence, two bytes further along for each row upwards. |
| 615 INIT_XMM ssse3 |
| 616 cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset |
| 617 GET_GOT goffsetq |
| 618 mova m0, [leftq] |
| 619 movu m7, [aboveq-1] |
| 620 movu m1, [aboveq+15] |
| 621 |
| 622 pshufb m4, m1, [GLOBAL(sh_b123456789abcdeff)] |
| 623 pshufb m6, m1, [GLOBAL(sh_b23456789abcdefff)] |
| 624 |
| 625 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2 ; 3-tap avg above [high] |
| 626 |
| 627 palignr m3, m1, m7, 1 |
| 628 palignr m5, m1, m7, 2 |
| 629 |
| 630 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg above [low] |
| 631 |
| 632 pshufb m7, [GLOBAL(sh_bfedcba9876543210)] ; reversed top row: byte 15 = tl, byte 14 = t1 |
| 633 palignr m5, m0, m7, 15 ; tl, l1 .. l15 |
| 634 palignr m3, m0, m7, 14 ; t1, tl, l1 .. l14 |
| 635 |
| 636 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B1-Bg |
| 637 pavgb m5, m0 ; A1 - Ag |
| 638 punpcklbw m6, m4, m5 ; A-B8 ... A-B1 |
| 639 punpckhbw m4, m5 ; A-B9 ... A-Bg |
| 640 pshufb m6, [GLOBAL(sh_bfedcba9876543210)] |
| 641 pshufb m4, [GLOBAL(sh_bfedcba9876543210)] |
| 642 |
| 643 DEFINE_ARGS dst, stride, stride3, left, line |
| 644 lea stride3q, [strideq*3] |
| 645 |
| ; rows 0-7: low half from m1:m6, high half from m2:m1 |
| 646 palignr m5, m2, m1, 14 |
| 647 palignr m7, m1, m6, 14 |
| 648 mova [dstq ], m7 |
| 649 mova [dstq+16 ], m5 |
| 650 palignr m5, m2, m1, 12 |
| 651 palignr m7, m1, m6, 12 |
| 652 mova [dstq+strideq ], m7 |
| 653 mova [dstq+strideq+16 ], m5 |
| 654 palignr m5, m2, m1, 10 |
| 655 palignr m7, m1, m6, 10 |
| 656 mova [dstq+strideq*2 ], m7 |
| 657 mova [dstq+strideq*2+16], m5 |
| 658 palignr m5, m2, m1, 8 |
| 659 palignr m7, m1, m6, 8 |
| 660 mova [dstq+stride3q ], m7 |
| 661 mova [dstq+stride3q+16 ], m5 |
| 662 lea dstq, [dstq+strideq*4] |
| 663 palignr m5, m2, m1, 6 |
| 664 palignr m7, m1, m6, 6 |
| 665 mova [dstq ], m7 |
| 666 mova [dstq+16 ], m5 |
| 667 palignr m5, m2, m1, 4 |
| 668 palignr m7, m1, m6, 4 |
| 669 mova [dstq+strideq ], m7 |
| 670 mova [dstq+strideq+16 ], m5 |
| 671 palignr m5, m2, m1, 2 |
| 672 palignr m7, m1, m6, 2 |
| 673 mova [dstq+strideq*2 ], m7 |
| 674 mova [dstq+strideq*2+16], m5 |
| 675 mova [dstq+stride3q ], m6 |
| 676 mova [dstq+stride3q+16 ], m1 |
| 677 lea dstq, [dstq+strideq*4] |
| 678 |
| ; rows 8-15: low half from m6:m4, high half from m1:m6 |
| 679 palignr m5, m1, m6, 14 |
| 680 palignr m3, m6, m4, 14 |
| 681 mova [dstq ], m3 |
| 682 mova [dstq+16 ], m5 |
| 683 palignr m5, m1, m6, 12 |
| 684 palignr m3, m6, m4, 12 |
| 685 mova [dstq+strideq ], m3 |
| 686 mova [dstq+strideq+16 ], m5 |
| 687 palignr m5, m1, m6, 10 |
| 688 palignr m3, m6, m4, 10 |
| 689 mova [dstq+strideq*2 ], m3 |
| 690 mova [dstq+strideq*2+16], m5 |
| 691 palignr m5, m1, m6, 8 |
| 692 palignr m3, m6, m4, 8 |
| 693 mova [dstq+stride3q ], m3 |
| 694 mova [dstq+stride3q+16 ], m5 |
| 695 lea dstq, [dstq+strideq*4] |
| 696 palignr m5, m1, m6, 6 |
| 697 palignr m3, m6, m4, 6 |
| 698 mova [dstq ], m3 |
| 699 mova [dstq+16 ], m5 |
| 700 palignr m5, m1, m6, 4 |
| 701 palignr m3, m6, m4, 4 |
| 702 mova [dstq+strideq ], m3 |
| 703 mova [dstq+strideq+16 ], m5 |
| 704 palignr m5, m1, m6, 2 |
| 705 palignr m3, m6, m4, 2 |
| 706 mova [dstq+strideq*2 ], m3 |
| 707 mova [dstq+strideq*2+16], m5 |
| 708 mova [dstq+stride3q ], m4 |
| 709 mova [dstq+stride3q+16 ], m6 |
| 710 lea dstq, [dstq+strideq*4] |
| 711 |
| ; second half: re-read left[] and build the A/B values for left[16..31] |
| 712 mova m7, [leftq] |
| 713 mova m3, [leftq+16] |
| 714 palignr m5, m3, m7, 15 ; left[15..30] |
| 715 palignr m0, m3, m7, 14 ; left[14..29] |
| 716 |
| 717 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2 ; 3-tap avg Bh - |
| 718 pavgb m5, m3 ; Ah - |
| 719 punpcklbw m3, m2, m5 ; A-B8 ... A-B1 |
| 720 punpckhbw m2, m5 ; A-B9 ... A-Bg |
| 721 pshufb m3, [GLOBAL(sh_bfedcba9876543210)] |
| 722 pshufb m2, [GLOBAL(sh_bfedcba9876543210)] |
| 723 |
| ; rows 16-23: low half from m4:m3, high half from m6:m4 |
| 724 palignr m7, m6, m4, 14 |
| 725 palignr m0, m4, m3, 14 |
| 726 mova [dstq ], m0 |
| 727 mova [dstq+16 ], m7 |
| 728 palignr m7, m6, m4, 12 |
| 729 palignr m0, m4, m3, 12 |
| 730 mova [dstq+strideq ], m0 |
| 731 mova [dstq+strideq+16 ], m7 |
| 732 palignr m7, m6, m4, 10 |
| 733 palignr m0, m4, m3, 10 |
| 734 mova [dstq+strideq*2 ], m0 |
| 735 mova [dstq+strideq*2+16], m7 |
| 736 palignr m7, m6, m4, 8 |
| 737 palignr m0, m4, m3, 8 |
| 738 mova [dstq+stride3q ], m0 |
| 739 mova [dstq+stride3q+16 ], m7 |
| 740 lea dstq, [dstq+strideq*4] |
| 741 palignr m7, m6, m4, 6 |
| 742 palignr m0, m4, m3, 6 |
| 743 mova [dstq ], m0 |
| 744 mova [dstq+16 ], m7 |
| 745 palignr m7, m6, m4, 4 |
| 746 palignr m0, m4, m3, 4 |
| 747 mova [dstq+strideq ], m0 |
| 748 mova [dstq+strideq+16 ], m7 |
| 749 palignr m7, m6, m4, 2 |
| 750 palignr m0, m4, m3, 2 |
| 751 mova [dstq+strideq*2 ], m0 |
| 752 mova [dstq+strideq*2+16], m7 |
| 753 mova [dstq+stride3q ], m3 |
| 754 mova [dstq+stride3q+16 ], m4 |
| 755 lea dstq, [dstq+strideq*4] |
| 756 |
| ; rows 24-31: low half from m3:m2, high half from m4:m3 |
| 757 palignr m7, m4, m3, 14 |
| 758 palignr m0, m3, m2, 14 |
| 759 mova [dstq ], m0 |
| 760 mova [dstq+16 ], m7 |
| 761 palignr m7, m4, m3, 12 |
| 762 palignr m0, m3, m2, 12 |
| 763 mova [dstq+strideq ], m0 |
| 764 mova [dstq+strideq+16 ], m7 |
| 765 palignr m7, m4, m3, 10 |
| 766 palignr m0, m3, m2, 10 |
| 767 mova [dstq+strideq*2 ], m0 |
| 768 mova [dstq+strideq*2+16], m7 |
| 769 palignr m7, m4, m3, 8 |
| 770 palignr m0, m3, m2, 8 |
| 771 mova [dstq+stride3q ], m0 |
| 772 mova [dstq+stride3q+16 ], m7 |
| 773 lea dstq, [dstq+strideq*4] |
| 774 palignr m7, m4, m3, 6 |
| 775 palignr m0, m3, m2, 6 |
| 776 mova [dstq ], m0 |
| 777 mova [dstq+16 ], m7 |
| 778 palignr m7, m4, m3, 4 |
| 779 palignr m0, m3, m2, 4 |
| 780 mova [dstq+strideq ], m0 |
| 781 mova [dstq+strideq+16 ], m7 |
| 782 palignr m7, m4, m3, 2 |
| 783 palignr m0, m3, m2, 2 |
| 784 mova [dstq+strideq*2 ], m0 |
| 785 mova [dstq+strideq*2+16], m7 |
| 786 mova [dstq+stride3q ], m2 |
| 787 mova [dstq+stride3q+16 ], m3 |
| 788 |
| 789 RESTORE_GOT |
| 790 RET |
| 791 |
| ; d207_predictor_4x4(dst, stride, unused, left) |
| ; Writes four 4-byte rows at dst from the 4 bytes loaded at left; the third |
| ; argument is not read. 2-tap and 3-tap averages of neighbouring left bytes |
| ; are interleaved, each row starts two bytes further into that sequence and |
| ; the last row is the final 16-bit pair repeated. |
| 792 INIT_MMX ssse3 |
| 793 cglobal d207_predictor_4x4, 4, 5, 4, dst, stride, unused, left, goffset |
| 794 GET_GOT goffsetq |
| 795 movd m0, [leftq] ; abcd [byte] |
| 796 pshufb m1, m0, [GLOBAL(sh_b1233)] ; bcdd [byte] |
| 797 pshufb m3, m0, [GLOBAL(sh_b2333)] ; cddd |
| 798 |
| 799 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m3, m2 ; m2 = 3-tap values (m3 clobbered) |
| 800 pavgb m1, m0 ; ab, bc, cd, d [byte] |
| 801 |
| 802 punpcklbw m1, m2 ; ab, a2bc, bc, b2cd, cd, c3d, d, d |
| 803 movd [dstq ], m1 |
| 804 psrlq m1, 16 ; bc, b2cd, cd, c3d, d, d |
| 805 movd [dstq+strideq], m1 |
| 806 lea dstq, [dstq+strideq*2] |
| 807 psrlq m1, 16 ; cd, c3d, d, d |
| 808 movd [dstq ], m1 |
| 809 pshufw m1, m1, q1111 ; d, d, d, d |
| 810 movd [dstq+strideq], m1 |
| 811 RESTORE_GOT |
| 812 RET |
| 813 |
| ; d207_predictor_8x8(dst, stride, <unused>, left) |
| ; Writes eight 8-byte rows at dst from the 8 bytes loaded at left. The third |
| ; argument's register is only used as scratch for stride*3. 2-tap and 3-tap |
| ; averages are interleaved in m0 and each row starts two bytes further into |
| ; that sequence. |
| 814 INIT_XMM ssse3 |
| 815 cglobal d207_predictor_8x8, 4, 5, 4, dst, stride, stride3, left, goffset |
| 816 GET_GOT goffsetq |
| 817 movq m3, [leftq] ; abcdefgh [byte] |
| 818 lea stride3q, [strideq*3] |
| 819 |
| 820 pshufb m1, m3, [GLOBAL(sh_b2345677777777777)] ; left[i+2], last byte repeated |
| 821 pshufb m0, m3, [GLOBAL(sh_b0123456777777777)] ; left[i] |
| 822 pshufb m2, m3, [GLOBAL(sh_b1234567777777777)] ; left[i+1], last byte repeated |
| 823 |
| 824 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m3 ; m3 = 3-tap values (m1 clobbered) |
| 825 pavgb m0, m2 ; m0 = 2-tap values |
| 826 punpcklbw m0, m3 ; interleaved output |
| 827 |
| 828 movq [dstq ], m0 |
| 829 psrldq m0, 2 |
| 830 movq [dstq+strideq ], m0 |
| 831 psrldq m0, 2 |
| 832 movq [dstq+strideq*2], m0 |
| 833 psrldq m0, 2 |
| 834 movq [dstq+stride3q ], m0 |
| 835 lea dstq, [dstq+strideq*4] |
| ; broadcast the lowest word of the high half so the remaining shifts pull in copies of it |
| 836 pshufhw m0, m0, q0000 ; de, d2ef, ef, e2fg, fg, f2gh, gh, g3h, 8xh |
| 837 psrldq m0, 2 |
| 838 movq [dstq ], m0 |
| 839 psrldq m0, 2 |
| 840 movq [dstq+strideq ], m0 |
| 841 psrldq m0, 2 |
| 842 movq [dstq+strideq*2], m0 |
| 843 psrldq m0, 2 |
| 844 movq [dstq+stride3q ], m0 |
| 845 RESTORE_GOT |
| 846 RET |
| 847 |
| ; d207_predictor_16x16(dst, stride, <unused>, left) |
| ; Writes sixteen 16-byte rows at dst from the 16 bytes loaded (aligned) at |
| ; left. The third argument's register is only used as scratch for stride*3. |
| ; Interleaved 2-tap/3-tap values live in m1 (low) and m4 (high): rows 0-7 are |
| ; palignr windows over m4:m1, rows 8-15 come from m4 alone, advanced two |
| ; bytes per row with a shuffle that repeats its last byte. |
| 848 INIT_XMM ssse3 |
| 849 cglobal d207_predictor_16x16, 4, 5, 5, dst, stride, stride3, left, goffset |
| 850 GET_GOT goffsetq |
| 851 lea stride3q, [strideq*3] |
| 852 mova m0, [leftq] ; abcdefghijklmnop [byte] |
| 853 pshufb m1, m0, [GLOBAL(sh_b123456789abcdeff)] ; bcdefghijklmnopp |
| 854 pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)] |
| 855 |
| 856 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; m3 = 3-tap values (m2 clobbered) |
| 857 pavgb m1, m0 ; ab, bc, cd .. no, op, pp [byte] |
| 858 |
| 859 punpckhbw m4, m1, m3 ; interleaved input |
| 860 punpcklbw m1, m3 ; interleaved output |
| 861 mova [dstq ], m1 |
| 862 palignr m3, m4, m1, 2 |
| 863 mova [dstq+strideq ], m3 |
| 864 palignr m3, m4, m1, 4 |
| 865 mova [dstq+strideq*2], m3 |
| 866 palignr m3, m4, m1, 6 |
| 867 mova [dstq+stride3q ], m3 |
| 868 lea dstq, [dstq+strideq*4] |
| 869 palignr m3, m4, m1, 8 |
| 870 mova [dstq ], m3 |
| 871 palignr m3, m4, m1, 10 |
| 872 mova [dstq+strideq ], m3 |
| 873 palignr m3, m4, m1, 12 |
| 874 mova [dstq+strideq*2], m3 |
| 875 palignr m3, m4, m1, 14 |
| 876 mova [dstq+stride3q ], m3 |
| 877 DEFINE_ARGS dst, stride, stride3, line ; left is dead, reuse its register as the loop counter |
| 878 mov lined, 2 ; 2 iterations x 4 rows |
| 879 mova m0, [GLOBAL(sh_b23456789abcdefff)] ; "advance by two bytes, repeat last byte" mask |
| 880 .loop: |
| 881 lea dstq, [dstq+strideq*4] |
| 882 mova [dstq ], m4 |
| 883 pshufb m4, m0 |
| 884 mova [dstq+strideq ], m4 |
| 885 pshufb m4, m0 |
| 886 mova [dstq+strideq*2], m4 |
| 887 pshufb m4, m0 |
| 888 mova [dstq+stride3q ], m4 |
| 889 pshufb m4, m0 |
| 890 dec lined |
| 891 jnz .loop |
| 892 RESTORE_GOT |
| 893 REP_RET |
| 894 |
| ; d207_predictor_32x32(dst, stride, <unused>, left) |
| ; Writes thirty-two 32-byte rows at dst from the 32 bytes loaded (aligned) at |
| ; left. The third argument's register is only used as scratch for stride*3. |
| ; Interleaved 2-tap/3-tap values are held in four registers (m1, m6, m2, m7, |
| ; in that order). The right half of row r equals the left half of row r+8, |
| ; so both are written from the same register through dstq+16 and dst8q. |
| 895 INIT_XMM ssse3 |
| 896 cglobal d207_predictor_32x32, 4, 5, 8, dst, stride, stride3, left, goffset |
| 897 GET_GOT goffsetq |
| 898 lea stride3q, [strideq*3] |
| 899 mova m1, [leftq] ; 0-15 [byte] |
| 900 mova m2, [leftq+16] ; 16-31 [byte] |
| 901 pshufb m0, m2, [GLOBAL(sh_b23456789abcdefff)] |
| 902 pshufb m4, m2, [GLOBAL(sh_b123456789abcdeff)] |
| 903 |
| 904 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m2, m4, m0, m3 |
| 905 palignr m6, m2, m1, 1 |
| 906 palignr m5, m2, m1, 2 |
| 907 pavgb m2, m4 ; high 16px even lines |
| 908 |
| 909 X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m6, m5, m0 |
| 910 pavgb m1, m6 ; low 16px even lines |
| 911 |
| 912 punpckhbw m6, m1, m0 ; interleaved output 2 |
| 913 punpcklbw m1, m0 ; interleaved output 1 |
| 914 |
| 915 punpckhbw m7, m2, m3 ; interleaved output 4 |
| 916 punpcklbw m2, m3 ; interleaved output 3 |
| 917 |
| 918 ; output 1st 8 lines (and half of 2nd 8 lines) |
| 919 DEFINE_ARGS dst, stride, stride3, dst8 ; left is dead, reuse its register as a second row pointer |
| 920 lea dst8q, [dstq+strideq*8] |
| 921 mova [dstq ], m1 |
| 922 mova [dstq +16], m6 |
| 923 mova [dst8q ], m6 |
| 924 palignr m0, m6, m1, 2 |
| 925 palignr m4, m2, m6, 2 |
| 926 mova [dstq +strideq ], m0 |
| 927 mova [dstq +strideq +16], m4 |
| 928 mova [dst8q+strideq ], m4 |
| 929 palignr m0, m6, m1, 4 |
| 930 palignr m4, m2, m6, 4 |
| 931 mova [dstq +strideq*2 ], m0 |
| 932 mova [dstq +strideq*2+16], m4 |
| 933 mova [dst8q+strideq*2 ], m4 |
| 934 palignr m0, m6, m1, 6 |
| 935 palignr m4, m2, m6, 6 |
| 936 mova [dstq +stride3q ], m0 |
| 937 mova [dstq +stride3q +16], m4 |
| 938 mova [dst8q+stride3q ], m4 |
| 939 lea dstq, [dstq +strideq*4] |
| 940 lea dst8q, [dst8q+strideq*4] |
| 941 palignr m0, m6, m1, 8 |
| 942 palignr m4, m2, m6, 8 |
| 943 mova [dstq ], m0 |
| 944 mova [dstq +16], m4 |
| 945 mova [dst8q ], m4 |
| 946 palignr m0, m6, m1, 10 |
| 947 palignr m4, m2, m6, 10 |
| 948 mova [dstq +strideq ], m0 |
| 949 mova [dstq +strideq +16], m4 |
| 950 mova [dst8q+strideq ], m4 |
| 951 palignr m0, m6, m1, 12 |
| 952 palignr m4, m2, m6, 12 |
| 953 mova [dstq +strideq*2 ], m0 |
| 954 mova [dstq +strideq*2+16], m4 |
| 955 mova [dst8q+strideq*2 ], m4 |
| 956 palignr m0, m6, m1, 14 |
| 957 palignr m4, m2, m6, 14 |
| 958 mova [dstq +stride3q ], m0 |
| 959 mova [dstq +stride3q +16], m4 |
| 960 mova [dst8q+stride3q ], m4 |
| 961 lea dstq, [dstq+strideq*4] |
| 962 lea dst8q, [dst8q+strideq*4] |
| 963 |
| 964 ; output 2nd half of 2nd 8 lines and half of 3rd 8 lines |
| 965 mova [dstq +16], m2 |
| 966 mova [dst8q ], m2 |
| 967 palignr m4, m7, m2, 2 |
| 968 mova [dstq +strideq +16], m4 |
| 969 mova [dst8q+strideq ], m4 |
| 970 palignr m4, m7, m2, 4 |
| 971 mova [dstq +strideq*2+16], m4 |
| 972 mova [dst8q+strideq*2 ], m4 |
| 973 palignr m4, m7, m2, 6 |
| 974 mova [dstq +stride3q +16], m4 |
| 975 mova [dst8q+stride3q ], m4 |
| 976 lea dstq, [dstq+strideq*4] |
| 977 lea dst8q, [dst8q+strideq*4] |
| 978 palignr m4, m7, m2, 8 |
| 979 mova [dstq +16], m4 |
| 980 mova [dst8q ], m4 |
| 981 palignr m4, m7, m2, 10 |
| 982 mova [dstq +strideq +16], m4 |
| 983 mova [dst8q+strideq ], m4 |
| 984 palignr m4, m7, m2, 12 |
| 985 mova [dstq +strideq*2+16], m4 |
| 986 mova [dst8q+strideq*2 ], m4 |
| 987 palignr m4, m7, m2, 14 |
| 988 mova [dstq +stride3q +16], m4 |
| 989 mova [dst8q+stride3q ], m4 |
| 990 lea dstq, [dstq+strideq*4] |
| 991 lea dst8q, [dst8q+strideq*4] |
| 992 |
| 993 ; output 2nd half of 3rd 8 lines and half of 4th 8 lines |
| ; constants must be addressed through GLOBAL() between GET_GOT and |
| ; RESTORE_GOT, like every other table reference in this file |
| 994 mova m0, [GLOBAL(sh_b23456789abcdefff)] |
| 995 mova [dstq +16], m7 |
| 996 mova [dst8q ], m7 |
| 997 pshufb m7, m0 |
| 998 mova [dstq +strideq +16], m7 |
| 999 mova [dst8q+strideq ], m7 |
| 1000 pshufb m7, m0 |
| 1001 mova [dstq +strideq*2+16], m7 |
| 1002 mova [dst8q+strideq*2 ], m7 |
| 1003 pshufb m7, m0 |
| 1004 mova [dstq +stride3q +16], m7 |
| 1005 mova [dst8q+stride3q ], m7 |
| 1006 pshufb m7, m0 |
| 1007 lea dstq, [dstq+strideq*4] |
| 1008 lea dst8q, [dst8q+strideq*4] |
| 1009 mova [dstq +16], m7 |
| 1010 mova [dst8q ], m7 |
| 1011 pshufb m7, m0 |
| 1012 mova [dstq +strideq +16], m7 |
| 1013 mova [dst8q+strideq ], m7 |
| 1014 pshufb m7, m0 |
| 1015 mova [dstq +strideq*2+16], m7 |
| 1016 mova [dst8q+strideq*2 ], m7 |
| 1017 pshufb m7, m0 |
| 1018 mova [dstq +stride3q +16], m7 |
| 1019 mova [dst8q+stride3q ], m7 |
| 1020 pshufb m7, m0 |
| 1021 lea dstq, [dstq+strideq*4] |
| 1022 |
| 1023 ; output last half of 4th 8 lines |
| 1024 mova [dstq +16], m7 |
| 1025 mova [dstq +strideq +16], m7 |
| 1026 mova [dstq +strideq*2+16], m7 |
| 1027 mova [dstq +stride3q +16], m7 |
| 1028 lea dstq, [dstq+strideq*4] |
| 1029 mova [dstq +16], m7 |
| 1030 mova [dstq +strideq +16], m7 |
| 1031 mova [dstq +strideq*2+16], m7 |
| 1032 mova [dstq +stride3q +16], m7 |
| 1033 |
| 1034 ; done! |
| 1035 RESTORE_GOT |
| 1036 RET |
OLD | NEW |