| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 #include <assert.h> | 11 #include "vp9/common/x86/vp9_idct_intrin_sse2.h" |
| 12 #include <emmintrin.h> // SSE2 | |
| 13 #include "./vpx_config.h" | |
| 14 #include "vpx/vpx_integer.h" | |
| 15 #include "vp9/common/vp9_common.h" | |
| 16 #include "vp9/common/vp9_idct.h" | |
| 17 | 12 |
| 18 #define RECON_AND_STORE4X4(dest, in_x) \ | 13 #define RECON_AND_STORE4X4(dest, in_x) \ |
| 19 { \ | 14 { \ |
| 20 __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ | 15 __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ |
| 21 d0 = _mm_unpacklo_epi8(d0, zero); \ | 16 d0 = _mm_unpacklo_epi8(d0, zero); \ |
| 22 d0 = _mm_add_epi16(in_x, d0); \ | 17 d0 = _mm_add_epi16(in_x, d0); \ |
| 23 d0 = _mm_packus_epi16(d0, d0); \ | 18 d0 = _mm_packus_epi16(d0, d0); \ |
| 24 *(int *)dest = _mm_cvtsi128_si32(d0); \ | 19 *(int *)dest = _mm_cvtsi128_si32(d0); \ |
| 25 dest += stride; \ | 20 dest += stride; \ |
| 26 } | 21 } |
| (...skipping 346 matching lines...) | (...skipping 346 matching lines...) |
| 373 const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ | 368 const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ |
| 374 const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ | 369 const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ |
| 375 const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ | 370 const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ |
| 376 \ | 371 \ |
| 377 out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ | 372 out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ |
| 378 out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ | 373 out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ |
| 379 out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ | 374 out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ |
| 380 out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ | 375 out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ |
| 381 } | 376 } |
| 382 | 377 |
| 383 #define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \ | |
| 384 { \ | |
| 385 const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ | |
| 386 const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ | |
| 387 \ | |
| 388 in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \ | |
| 389 in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \ | |
| 390 } | |
| 391 | |
| 392 #define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \ | 378 #define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \ |
| 393 { \ | 379 { \ |
| 394 const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ | 380 const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ |
| 395 const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ | 381 const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ |
| 396 out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ | 382 out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ |
| 397 out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ | 383 out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ |
| 398 } | 384 } |
| 399 | 385 |
| 400 // Define Macro for multiplying elements by constants and adding them together. | 386 // Define Macro for multiplying elements by constants and adding them together. |
| 401 #define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \ | 387 #define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \ |
| (...skipping 118 matching lines...) | (...skipping 118 matching lines...) |
| 520 out0 = _mm_adds_epi16(stp1_0, stp2_7); \ | 506 out0 = _mm_adds_epi16(stp1_0, stp2_7); \ |
| 521 out1 = _mm_adds_epi16(stp1_1, stp1_6); \ | 507 out1 = _mm_adds_epi16(stp1_1, stp1_6); \ |
| 522 out2 = _mm_adds_epi16(stp1_2, stp1_5); \ | 508 out2 = _mm_adds_epi16(stp1_2, stp1_5); \ |
| 523 out3 = _mm_adds_epi16(stp1_3, stp2_4); \ | 509 out3 = _mm_adds_epi16(stp1_3, stp2_4); \ |
| 524 out4 = _mm_subs_epi16(stp1_3, stp2_4); \ | 510 out4 = _mm_subs_epi16(stp1_3, stp2_4); \ |
| 525 out5 = _mm_subs_epi16(stp1_2, stp1_5); \ | 511 out5 = _mm_subs_epi16(stp1_2, stp1_5); \ |
| 526 out6 = _mm_subs_epi16(stp1_1, stp1_6); \ | 512 out6 = _mm_subs_epi16(stp1_1, stp1_6); \ |
| 527 out7 = _mm_subs_epi16(stp1_0, stp2_7); \ | 513 out7 = _mm_subs_epi16(stp1_0, stp2_7); \ |
| 528 } | 514 } |
| 529 | 515 |
| 530 #define RECON_AND_STORE(dest, in_x) \ | |
| 531 { \ | |
| 532 __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ | |
| 533 d0 = _mm_unpacklo_epi8(d0, zero); \ | |
| 534 d0 = _mm_add_epi16(in_x, d0); \ | |
| 535 d0 = _mm_packus_epi16(d0, d0); \ | |
| 536 _mm_storel_epi64((__m128i *)(dest), d0); \ | |
| 537 dest += stride; \ | |
| 538 } | |
| 539 | |
| 540 void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) { | 516 void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
| 541 const __m128i zero = _mm_setzero_si128(); | 517 const __m128i zero = _mm_setzero_si128(); |
| 542 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 518 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
| 543 const __m128i final_rounding = _mm_set1_epi16(1<<4); | 519 const __m128i final_rounding = _mm_set1_epi16(1<<4); |
| 544 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); | 520 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); |
| 545 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); | 521 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); |
| 546 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); | 522 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); |
| 547 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); | 523 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); |
| 548 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); | 524 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); |
| 549 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 525 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
| (...skipping 70 matching lines...) | (...skipping 70 matching lines...) |
| 620 RECON_AND_STORE(dest, dc_value); | 596 RECON_AND_STORE(dest, dc_value); |
| 621 RECON_AND_STORE(dest, dc_value); | 597 RECON_AND_STORE(dest, dc_value); |
| 622 RECON_AND_STORE(dest, dc_value); | 598 RECON_AND_STORE(dest, dc_value); |
| 623 RECON_AND_STORE(dest, dc_value); | 599 RECON_AND_STORE(dest, dc_value); |
| 624 RECON_AND_STORE(dest, dc_value); | 600 RECON_AND_STORE(dest, dc_value); |
| 625 RECON_AND_STORE(dest, dc_value); | 601 RECON_AND_STORE(dest, dc_value); |
| 626 RECON_AND_STORE(dest, dc_value); | 602 RECON_AND_STORE(dest, dc_value); |
| 627 RECON_AND_STORE(dest, dc_value); | 603 RECON_AND_STORE(dest, dc_value); |
| 628 } | 604 } |
| 629 | 605 |
| 630 // perform 8x8 transpose | |
| 631 static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { | |
| 632 const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); | |
| 633 const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); | |
| 634 const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); | |
| 635 const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); | |
| 636 const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); | |
| 637 const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); | |
| 638 const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); | |
| 639 const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); | |
| 640 | |
| 641 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); | |
| 642 const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); | |
| 643 const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); | |
| 644 const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); | |
| 645 const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); | |
| 646 const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); | |
| 647 const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); | |
| 648 const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); | |
| 649 | |
| 650 res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); | |
| 651 res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); | |
| 652 res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); | |
| 653 res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); | |
| 654 res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); | |
| 655 res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); | |
| 656 res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); | |
| 657 res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); | |
| 658 } | |
| 659 | |
| 660 static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) { | |
| 661 const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); | |
| 662 const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); | |
| 663 const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); | |
| 664 const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); | |
| 665 | |
| 666 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); | |
| 667 const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); | |
| 668 const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); | |
| 669 const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); | |
| 670 | |
| 671 out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4); | |
| 672 out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4); | |
| 673 out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6); | |
| 674 out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6); | |
| 675 } | |
| 676 | |
| 677 static void idct8_sse2(__m128i *in) { | 606 static void idct8_sse2(__m128i *in) { |
| 678 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 607 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
| 679 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); | 608 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); |
| 680 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); | 609 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); |
| 681 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); | 610 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); |
| 682 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); | 611 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); |
| 683 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); | 612 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); |
| 684 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 613 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
| 685 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); | 614 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
| 686 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); | 615 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); |
| (...skipping 879 matching lines...) | (...skipping 879 matching lines...) |
| 1566 RECON_AND_STORE(dest, dc_value); | 1495 RECON_AND_STORE(dest, dc_value); |
| 1567 RECON_AND_STORE(dest, dc_value); | 1496 RECON_AND_STORE(dest, dc_value); |
| 1568 RECON_AND_STORE(dest, dc_value); | 1497 RECON_AND_STORE(dest, dc_value); |
| 1569 RECON_AND_STORE(dest, dc_value); | 1498 RECON_AND_STORE(dest, dc_value); |
| 1570 RECON_AND_STORE(dest, dc_value); | 1499 RECON_AND_STORE(dest, dc_value); |
| 1571 RECON_AND_STORE(dest, dc_value); | 1500 RECON_AND_STORE(dest, dc_value); |
| 1572 dest += 8 - (stride * 16); | 1501 dest += 8 - (stride * 16); |
| 1573 } | 1502 } |
| 1574 } | 1503 } |
| 1575 | 1504 |
| 1576 static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { | |
| 1577 __m128i tbuf[8]; | |
| 1578 array_transpose_8x8(res0, res0); | |
| 1579 array_transpose_8x8(res1, tbuf); | |
| 1580 array_transpose_8x8(res0 + 8, res1); | |
| 1581 array_transpose_8x8(res1 + 8, res1 + 8); | |
| 1582 | |
| 1583 res0[8] = tbuf[0]; | |
| 1584 res0[9] = tbuf[1]; | |
| 1585 res0[10] = tbuf[2]; | |
| 1586 res0[11] = tbuf[3]; | |
| 1587 res0[12] = tbuf[4]; | |
| 1588 res0[13] = tbuf[5]; | |
| 1589 res0[14] = tbuf[6]; | |
| 1590 res0[15] = tbuf[7]; | |
| 1591 } | |
| 1592 | |
| 1593 static void iadst16_8col(__m128i *in) { | 1505 static void iadst16_8col(__m128i *in) { |
| 1594 // perform 16x16 1-D ADST for 8 columns | 1506 // perform 16x16 1-D ADST for 8 columns |
| 1595 __m128i s[16], x[16], u[32], v[32]; | 1507 __m128i s[16], x[16], u[32], v[32]; |
| 1596 const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); | 1508 const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); |
| 1597 const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); | 1509 const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); |
| 1598 const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); | 1510 const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); |
| 1599 const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); | 1511 const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); |
| 1600 const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); | 1512 const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); |
| 1601 const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); | 1513 const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); |
| 1602 const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); | 1514 const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); |
| (...skipping 806 matching lines...) | (...skipping 806 matching lines...) |
| 2409 idct16_8col(in0); | 2321 idct16_8col(in0); |
| 2410 idct16_8col(in1); | 2322 idct16_8col(in1); |
| 2411 } | 2323 } |
| 2412 | 2324 |
| 2413 static void iadst16_sse2(__m128i *in0, __m128i *in1) { | 2325 static void iadst16_sse2(__m128i *in0, __m128i *in1) { |
| 2414 array_transpose_16x16(in0, in1); | 2326 array_transpose_16x16(in0, in1); |
| 2415 iadst16_8col(in0); | 2327 iadst16_8col(in0); |
| 2416 iadst16_8col(in1); | 2328 iadst16_8col(in1); |
| 2417 } | 2329 } |
| 2418 | 2330 |
| 2419 static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) { | |
| 2420 in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16)); | |
| 2421 in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16)); | |
| 2422 in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16)); | |
| 2423 in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16)); | |
| 2424 in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16)); | |
| 2425 in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16)); | |
| 2426 in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16)); | |
| 2427 in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16)); | |
| 2428 | |
| 2429 in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16)); | |
| 2430 in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16)); | |
| 2431 in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16)); | |
| 2432 in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16)); | |
| 2433 in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16)); | |
| 2434 in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16)); | |
| 2435 in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16)); | |
| 2436 in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16)); | |
| 2437 } | |
| 2438 | |
| 2439 static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { | |
| 2440 const __m128i final_rounding = _mm_set1_epi16(1<<5); | |
| 2441 const __m128i zero = _mm_setzero_si128(); | |
| 2442 // Final rounding and shift | |
| 2443 in[0] = _mm_adds_epi16(in[0], final_rounding); | |
| 2444 in[1] = _mm_adds_epi16(in[1], final_rounding); | |
| 2445 in[2] = _mm_adds_epi16(in[2], final_rounding); | |
| 2446 in[3] = _mm_adds_epi16(in[3], final_rounding); | |
| 2447 in[4] = _mm_adds_epi16(in[4], final_rounding); | |
| 2448 in[5] = _mm_adds_epi16(in[5], final_rounding); | |
| 2449 in[6] = _mm_adds_epi16(in[6], final_rounding); | |
| 2450 in[7] = _mm_adds_epi16(in[7], final_rounding); | |
| 2451 in[8] = _mm_adds_epi16(in[8], final_rounding); | |
| 2452 in[9] = _mm_adds_epi16(in[9], final_rounding); | |
| 2453 in[10] = _mm_adds_epi16(in[10], final_rounding); | |
| 2454 in[11] = _mm_adds_epi16(in[11], final_rounding); | |
| 2455 in[12] = _mm_adds_epi16(in[12], final_rounding); | |
| 2456 in[13] = _mm_adds_epi16(in[13], final_rounding); | |
| 2457 in[14] = _mm_adds_epi16(in[14], final_rounding); | |
| 2458 in[15] = _mm_adds_epi16(in[15], final_rounding); | |
| 2459 | |
| 2460 in[0] = _mm_srai_epi16(in[0], 6); | |
| 2461 in[1] = _mm_srai_epi16(in[1], 6); | |
| 2462 in[2] = _mm_srai_epi16(in[2], 6); | |
| 2463 in[3] = _mm_srai_epi16(in[3], 6); | |
| 2464 in[4] = _mm_srai_epi16(in[4], 6); | |
| 2465 in[5] = _mm_srai_epi16(in[5], 6); | |
| 2466 in[6] = _mm_srai_epi16(in[6], 6); | |
| 2467 in[7] = _mm_srai_epi16(in[7], 6); | |
| 2468 in[8] = _mm_srai_epi16(in[8], 6); | |
| 2469 in[9] = _mm_srai_epi16(in[9], 6); | |
| 2470 in[10] = _mm_srai_epi16(in[10], 6); | |
| 2471 in[11] = _mm_srai_epi16(in[11], 6); | |
| 2472 in[12] = _mm_srai_epi16(in[12], 6); | |
| 2473 in[13] = _mm_srai_epi16(in[13], 6); | |
| 2474 in[14] = _mm_srai_epi16(in[14], 6); | |
| 2475 in[15] = _mm_srai_epi16(in[15], 6); | |
| 2476 | |
| 2477 RECON_AND_STORE(dest, in[0]); | |
| 2478 RECON_AND_STORE(dest, in[1]); | |
| 2479 RECON_AND_STORE(dest, in[2]); | |
| 2480 RECON_AND_STORE(dest, in[3]); | |
| 2481 RECON_AND_STORE(dest, in[4]); | |
| 2482 RECON_AND_STORE(dest, in[5]); | |
| 2483 RECON_AND_STORE(dest, in[6]); | |
| 2484 RECON_AND_STORE(dest, in[7]); | |
| 2485 RECON_AND_STORE(dest, in[8]); | |
| 2486 RECON_AND_STORE(dest, in[9]); | |
| 2487 RECON_AND_STORE(dest, in[10]); | |
| 2488 RECON_AND_STORE(dest, in[11]); | |
| 2489 RECON_AND_STORE(dest, in[12]); | |
| 2490 RECON_AND_STORE(dest, in[13]); | |
| 2491 RECON_AND_STORE(dest, in[14]); | |
| 2492 RECON_AND_STORE(dest, in[15]); | |
| 2493 } | |
| 2494 | |
| 2495 void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride, | 2331 void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride, |
| 2496 int tx_type) { | 2332 int tx_type) { |
| 2497 __m128i in0[16], in1[16]; | 2333 __m128i in0[16], in1[16]; |
| 2498 | 2334 |
| 2499 load_buffer_8x16(input, in0); | 2335 load_buffer_8x16(input, in0); |
| 2500 input += 8; | 2336 input += 8; |
| 2501 load_buffer_8x16(input, in1); | 2337 load_buffer_8x16(input, in1); |
| 2502 | 2338 |
| 2503 switch (tx_type) { | 2339 switch (tx_type) { |
| 2504 case 0: // DCT_DCT | 2340 case 0: // DCT_DCT |
| (...skipping 1644 matching lines...) | (...skipping 1644 matching lines...) |
| 4149 RECON_AND_STORE(dest, dc_value); | 3985 RECON_AND_STORE(dest, dc_value); |
| 4150 RECON_AND_STORE(dest, dc_value); | 3986 RECON_AND_STORE(dest, dc_value); |
| 4151 RECON_AND_STORE(dest, dc_value); | 3987 RECON_AND_STORE(dest, dc_value); |
| 4152 RECON_AND_STORE(dest, dc_value); | 3988 RECON_AND_STORE(dest, dc_value); |
| 4153 RECON_AND_STORE(dest, dc_value); | 3989 RECON_AND_STORE(dest, dc_value); |
| 4154 RECON_AND_STORE(dest, dc_value); | 3990 RECON_AND_STORE(dest, dc_value); |
| 4155 RECON_AND_STORE(dest, dc_value); | 3991 RECON_AND_STORE(dest, dc_value); |
| 4156 dest += 8 - (stride * 32); | 3992 dest += 8 - (stride * 32); |
| 4157 } | 3993 } |
| 4158 } | 3994 } |
| OLD | NEW |