OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include <assert.h> | 11 #include "vp9/common/x86/vp9_idct_intrin_sse2.h" |
12 #include <emmintrin.h> // SSE2 | |
13 #include "./vpx_config.h" | |
14 #include "vpx/vpx_integer.h" | |
15 #include "vp9/common/vp9_common.h" | |
16 #include "vp9/common/vp9_idct.h" | |
17 | 12 |
18 #define RECON_AND_STORE4X4(dest, in_x) \ | 13 #define RECON_AND_STORE4X4(dest, in_x) \ |
19 { \ | 14 { \ |
20 __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ | 15 __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ |
21 d0 = _mm_unpacklo_epi8(d0, zero); \ | 16 d0 = _mm_unpacklo_epi8(d0, zero); \ |
22 d0 = _mm_add_epi16(in_x, d0); \ | 17 d0 = _mm_add_epi16(in_x, d0); \ |
23 d0 = _mm_packus_epi16(d0, d0); \ | 18 d0 = _mm_packus_epi16(d0, d0); \ |
24 *(int *)dest = _mm_cvtsi128_si32(d0); \ | 19 *(int *)dest = _mm_cvtsi128_si32(d0); \ |
25 dest += stride; \ | 20 dest += stride; \ |
26 } | 21 } |
(...skipping 346 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
373 const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ | 368 const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ |
374 const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ | 369 const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ |
375 const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ | 370 const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ |
376 \ | 371 \ |
377 out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ | 372 out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ |
378 out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ | 373 out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ |
379 out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ | 374 out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ |
380 out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ | 375 out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ |
381 } | 376 } |
382 | 377 |
383 #define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \ | |
384 { \ | |
385 const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ | |
386 const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ | |
387 \ | |
388 in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \ | |
389 in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \ | |
390 } | |
391 | |
392 #define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \ | 378 #define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \ |
393 { \ | 379 { \ |
394 const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ | 380 const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ |
395 const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ | 381 const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ |
396 out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ | 382 out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ |
397 out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ | 383 out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ |
398 } | 384 } |
399 | 385 |
400 // Define Macro for multiplying elements by constants and adding them together. | 386 // Define Macro for multiplying elements by constants and adding them together. |
401 #define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \ | 387 #define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \ |
(...skipping 118 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
520 out0 = _mm_adds_epi16(stp1_0, stp2_7); \ | 506 out0 = _mm_adds_epi16(stp1_0, stp2_7); \ |
521 out1 = _mm_adds_epi16(stp1_1, stp1_6); \ | 507 out1 = _mm_adds_epi16(stp1_1, stp1_6); \ |
522 out2 = _mm_adds_epi16(stp1_2, stp1_5); \ | 508 out2 = _mm_adds_epi16(stp1_2, stp1_5); \ |
523 out3 = _mm_adds_epi16(stp1_3, stp2_4); \ | 509 out3 = _mm_adds_epi16(stp1_3, stp2_4); \ |
524 out4 = _mm_subs_epi16(stp1_3, stp2_4); \ | 510 out4 = _mm_subs_epi16(stp1_3, stp2_4); \ |
525 out5 = _mm_subs_epi16(stp1_2, stp1_5); \ | 511 out5 = _mm_subs_epi16(stp1_2, stp1_5); \ |
526 out6 = _mm_subs_epi16(stp1_1, stp1_6); \ | 512 out6 = _mm_subs_epi16(stp1_1, stp1_6); \ |
527 out7 = _mm_subs_epi16(stp1_0, stp2_7); \ | 513 out7 = _mm_subs_epi16(stp1_0, stp2_7); \ |
528 } | 514 } |
529 | 515 |
530 #define RECON_AND_STORE(dest, in_x) \ | |
531 { \ | |
532 __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ | |
533 d0 = _mm_unpacklo_epi8(d0, zero); \ | |
534 d0 = _mm_add_epi16(in_x, d0); \ | |
535 d0 = _mm_packus_epi16(d0, d0); \ | |
536 _mm_storel_epi64((__m128i *)(dest), d0); \ | |
537 dest += stride; \ | |
538 } | |
539 | |
540 void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) { | 516 void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
541 const __m128i zero = _mm_setzero_si128(); | 517 const __m128i zero = _mm_setzero_si128(); |
542 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 518 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
543 const __m128i final_rounding = _mm_set1_epi16(1<<4); | 519 const __m128i final_rounding = _mm_set1_epi16(1<<4); |
544 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); | 520 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); |
545 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); | 521 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); |
546 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); | 522 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); |
547 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); | 523 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); |
548 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); | 524 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); |
549 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 525 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
(...skipping 70 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
620 RECON_AND_STORE(dest, dc_value); | 596 RECON_AND_STORE(dest, dc_value); |
621 RECON_AND_STORE(dest, dc_value); | 597 RECON_AND_STORE(dest, dc_value); |
622 RECON_AND_STORE(dest, dc_value); | 598 RECON_AND_STORE(dest, dc_value); |
623 RECON_AND_STORE(dest, dc_value); | 599 RECON_AND_STORE(dest, dc_value); |
624 RECON_AND_STORE(dest, dc_value); | 600 RECON_AND_STORE(dest, dc_value); |
625 RECON_AND_STORE(dest, dc_value); | 601 RECON_AND_STORE(dest, dc_value); |
626 RECON_AND_STORE(dest, dc_value); | 602 RECON_AND_STORE(dest, dc_value); |
627 RECON_AND_STORE(dest, dc_value); | 603 RECON_AND_STORE(dest, dc_value); |
628 } | 604 } |
629 | 605 |
630 // perform 8x8 transpose | |
631 static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { | |
632 const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); | |
633 const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); | |
634 const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); | |
635 const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); | |
636 const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); | |
637 const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); | |
638 const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); | |
639 const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); | |
640 | |
641 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); | |
642 const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); | |
643 const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); | |
644 const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); | |
645 const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); | |
646 const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); | |
647 const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); | |
648 const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); | |
649 | |
650 res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); | |
651 res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); | |
652 res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); | |
653 res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); | |
654 res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); | |
655 res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); | |
656 res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); | |
657 res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); | |
658 } | |
659 | |
660 static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) { | |
661 const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); | |
662 const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); | |
663 const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); | |
664 const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); | |
665 | |
666 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); | |
667 const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); | |
668 const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); | |
669 const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); | |
670 | |
671 out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4); | |
672 out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4); | |
673 out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6); | |
674 out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6); | |
675 } | |
676 | |
677 static void idct8_sse2(__m128i *in) { | 606 static void idct8_sse2(__m128i *in) { |
678 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 607 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
679 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); | 608 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); |
680 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); | 609 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); |
681 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); | 610 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); |
682 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); | 611 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); |
683 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); | 612 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); |
684 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 613 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
685 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); | 614 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
686 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); | 615 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); |
(...skipping 879 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1566 RECON_AND_STORE(dest, dc_value); | 1495 RECON_AND_STORE(dest, dc_value); |
1567 RECON_AND_STORE(dest, dc_value); | 1496 RECON_AND_STORE(dest, dc_value); |
1568 RECON_AND_STORE(dest, dc_value); | 1497 RECON_AND_STORE(dest, dc_value); |
1569 RECON_AND_STORE(dest, dc_value); | 1498 RECON_AND_STORE(dest, dc_value); |
1570 RECON_AND_STORE(dest, dc_value); | 1499 RECON_AND_STORE(dest, dc_value); |
1571 RECON_AND_STORE(dest, dc_value); | 1500 RECON_AND_STORE(dest, dc_value); |
1572 dest += 8 - (stride * 16); | 1501 dest += 8 - (stride * 16); |
1573 } | 1502 } |
1574 } | 1503 } |
1575 | 1504 |
1576 static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { | |
1577 __m128i tbuf[8]; | |
1578 array_transpose_8x8(res0, res0); | |
1579 array_transpose_8x8(res1, tbuf); | |
1580 array_transpose_8x8(res0 + 8, res1); | |
1581 array_transpose_8x8(res1 + 8, res1 + 8); | |
1582 | |
1583 res0[8] = tbuf[0]; | |
1584 res0[9] = tbuf[1]; | |
1585 res0[10] = tbuf[2]; | |
1586 res0[11] = tbuf[3]; | |
1587 res0[12] = tbuf[4]; | |
1588 res0[13] = tbuf[5]; | |
1589 res0[14] = tbuf[6]; | |
1590 res0[15] = tbuf[7]; | |
1591 } | |
1592 | |
1593 static void iadst16_8col(__m128i *in) { | 1505 static void iadst16_8col(__m128i *in) { |
1594 // perform 16x16 1-D ADST for 8 columns | 1506 // perform 16x16 1-D ADST for 8 columns |
1595 __m128i s[16], x[16], u[32], v[32]; | 1507 __m128i s[16], x[16], u[32], v[32]; |
1596 const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); | 1508 const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); |
1597 const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); | 1509 const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); |
1598 const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); | 1510 const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); |
1599 const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); | 1511 const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); |
1600 const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); | 1512 const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); |
1601 const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); | 1513 const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); |
1602 const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); | 1514 const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); |
(...skipping 806 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2409 idct16_8col(in0); | 2321 idct16_8col(in0); |
2410 idct16_8col(in1); | 2322 idct16_8col(in1); |
2411 } | 2323 } |
2412 | 2324 |
2413 static void iadst16_sse2(__m128i *in0, __m128i *in1) { | 2325 static void iadst16_sse2(__m128i *in0, __m128i *in1) { |
2414 array_transpose_16x16(in0, in1); | 2326 array_transpose_16x16(in0, in1); |
2415 iadst16_8col(in0); | 2327 iadst16_8col(in0); |
2416 iadst16_8col(in1); | 2328 iadst16_8col(in1); |
2417 } | 2329 } |
2418 | 2330 |
2419 static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) { | |
2420 in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16)); | |
2421 in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16)); | |
2422 in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16)); | |
2423 in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16)); | |
2424 in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16)); | |
2425 in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16)); | |
2426 in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16)); | |
2427 in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16)); | |
2428 | |
2429 in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16)); | |
2430 in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16)); | |
2431 in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16)); | |
2432 in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16)); | |
2433 in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16)); | |
2434 in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16)); | |
2435 in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16)); | |
2436 in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16)); | |
2437 } | |
2438 | |
2439 static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { | |
2440 const __m128i final_rounding = _mm_set1_epi16(1<<5); | |
2441 const __m128i zero = _mm_setzero_si128(); | |
2442 // Final rounding and shift | |
2443 in[0] = _mm_adds_epi16(in[0], final_rounding); | |
2444 in[1] = _mm_adds_epi16(in[1], final_rounding); | |
2445 in[2] = _mm_adds_epi16(in[2], final_rounding); | |
2446 in[3] = _mm_adds_epi16(in[3], final_rounding); | |
2447 in[4] = _mm_adds_epi16(in[4], final_rounding); | |
2448 in[5] = _mm_adds_epi16(in[5], final_rounding); | |
2449 in[6] = _mm_adds_epi16(in[6], final_rounding); | |
2450 in[7] = _mm_adds_epi16(in[7], final_rounding); | |
2451 in[8] = _mm_adds_epi16(in[8], final_rounding); | |
2452 in[9] = _mm_adds_epi16(in[9], final_rounding); | |
2453 in[10] = _mm_adds_epi16(in[10], final_rounding); | |
2454 in[11] = _mm_adds_epi16(in[11], final_rounding); | |
2455 in[12] = _mm_adds_epi16(in[12], final_rounding); | |
2456 in[13] = _mm_adds_epi16(in[13], final_rounding); | |
2457 in[14] = _mm_adds_epi16(in[14], final_rounding); | |
2458 in[15] = _mm_adds_epi16(in[15], final_rounding); | |
2459 | |
2460 in[0] = _mm_srai_epi16(in[0], 6); | |
2461 in[1] = _mm_srai_epi16(in[1], 6); | |
2462 in[2] = _mm_srai_epi16(in[2], 6); | |
2463 in[3] = _mm_srai_epi16(in[3], 6); | |
2464 in[4] = _mm_srai_epi16(in[4], 6); | |
2465 in[5] = _mm_srai_epi16(in[5], 6); | |
2466 in[6] = _mm_srai_epi16(in[6], 6); | |
2467 in[7] = _mm_srai_epi16(in[7], 6); | |
2468 in[8] = _mm_srai_epi16(in[8], 6); | |
2469 in[9] = _mm_srai_epi16(in[9], 6); | |
2470 in[10] = _mm_srai_epi16(in[10], 6); | |
2471 in[11] = _mm_srai_epi16(in[11], 6); | |
2472 in[12] = _mm_srai_epi16(in[12], 6); | |
2473 in[13] = _mm_srai_epi16(in[13], 6); | |
2474 in[14] = _mm_srai_epi16(in[14], 6); | |
2475 in[15] = _mm_srai_epi16(in[15], 6); | |
2476 | |
2477 RECON_AND_STORE(dest, in[0]); | |
2478 RECON_AND_STORE(dest, in[1]); | |
2479 RECON_AND_STORE(dest, in[2]); | |
2480 RECON_AND_STORE(dest, in[3]); | |
2481 RECON_AND_STORE(dest, in[4]); | |
2482 RECON_AND_STORE(dest, in[5]); | |
2483 RECON_AND_STORE(dest, in[6]); | |
2484 RECON_AND_STORE(dest, in[7]); | |
2485 RECON_AND_STORE(dest, in[8]); | |
2486 RECON_AND_STORE(dest, in[9]); | |
2487 RECON_AND_STORE(dest, in[10]); | |
2488 RECON_AND_STORE(dest, in[11]); | |
2489 RECON_AND_STORE(dest, in[12]); | |
2490 RECON_AND_STORE(dest, in[13]); | |
2491 RECON_AND_STORE(dest, in[14]); | |
2492 RECON_AND_STORE(dest, in[15]); | |
2493 } | |
2494 | |
2495 void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride, | 2331 void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride, |
2496 int tx_type) { | 2332 int tx_type) { |
2497 __m128i in0[16], in1[16]; | 2333 __m128i in0[16], in1[16]; |
2498 | 2334 |
2499 load_buffer_8x16(input, in0); | 2335 load_buffer_8x16(input, in0); |
2500 input += 8; | 2336 input += 8; |
2501 load_buffer_8x16(input, in1); | 2337 load_buffer_8x16(input, in1); |
2502 | 2338 |
2503 switch (tx_type) { | 2339 switch (tx_type) { |
2504 case 0: // DCT_DCT | 2340 case 0: // DCT_DCT |
(...skipping 1644 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4149 RECON_AND_STORE(dest, dc_value); | 3985 RECON_AND_STORE(dest, dc_value); |
4150 RECON_AND_STORE(dest, dc_value); | 3986 RECON_AND_STORE(dest, dc_value); |
4151 RECON_AND_STORE(dest, dc_value); | 3987 RECON_AND_STORE(dest, dc_value); |
4152 RECON_AND_STORE(dest, dc_value); | 3988 RECON_AND_STORE(dest, dc_value); |
4153 RECON_AND_STORE(dest, dc_value); | 3989 RECON_AND_STORE(dest, dc_value); |
4154 RECON_AND_STORE(dest, dc_value); | 3990 RECON_AND_STORE(dest, dc_value); |
4155 RECON_AND_STORE(dest, dc_value); | 3991 RECON_AND_STORE(dest, dc_value); |
4156 dest += 8 - (stride * 32); | 3992 dest += 8 - (stride * 32); |
4157 } | 3993 } |
4158 } | 3994 } |
OLD | NEW |