OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 505 matching lines...)
516 in3 = _mm_adds_epi16(stp1_3, stp2_4); \ | 516 in3 = _mm_adds_epi16(stp1_3, stp2_4); \ |
517 in4 = _mm_subs_epi16(stp1_3, stp2_4); \ | 517 in4 = _mm_subs_epi16(stp1_3, stp2_4); \ |
518 in5 = _mm_subs_epi16(stp1_2, stp1_5); \ | 518 in5 = _mm_subs_epi16(stp1_2, stp1_5); \ |
519 in6 = _mm_subs_epi16(stp1_1, stp1_6); \ | 519 in6 = _mm_subs_epi16(stp1_1, stp1_6); \ |
520 in7 = _mm_subs_epi16(stp1_0, stp2_7); | 520 in7 = _mm_subs_epi16(stp1_0, stp2_7); |
521 | 521 |
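The visible tail of this macro is the closing butterfly of the 8-point IDCT: each output pairs a stage value with its mirror, one saturating sum and one saturating difference. A minimal scalar sketch of the pattern, where s[], out[], and saturate16() are hypothetical stand-ins for the stp1_*/stp2_* registers and the _mm_adds_epi16/_mm_subs_epi16 intrinsics:

    /* in0..in7 above follow out[k] = s[k] + s[7-k], out[7-k] = s[k] - s[7-k]. */
    for (k = 0; k < 4; ++k) {
      out[k]     = saturate16(s[k] + s[7 - k]);
      out[7 - k] = saturate16(s[k] - s[7 - k]);
    }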
522 #define RECON_AND_STORE(dest, in_x) \ | 522 #define RECON_AND_STORE(dest, in_x) \ |
523 { \ | 523 { \ |
524 __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ | 524 __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ |
525 d0 = _mm_unpacklo_epi8(d0, zero); \ | 525 d0 = _mm_unpacklo_epi8(d0, zero); \ |
526 in_x = _mm_add_epi16(in_x, d0); \ | 526 d0 = _mm_add_epi16(in_x, d0); \ |
527 in_x = _mm_packus_epi16(in_x, in_x); \ | 527 d0 = _mm_packus_epi16(d0, d0); \ |
528 _mm_storel_epi64((__m128i *)(dest), in_x); \ | 528 _mm_storel_epi64((__m128i *)(dest), d0); \ |
529 dest += stride; \ | 529 dest += stride; \ |
530 } | 530 } |
531 | 531 |
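The rewritten RECON_AND_STORE is the key fix in this hunk: the add and the pack now go through the scratch register d0, so the macro no longer clobbers its in_x argument. That is what makes the new DC-only functions below legal, since they pass the same register (dc_value) for every row; with the old macro, the first invocation would have packed dc_value down to saturated 8-bit pairs and corrupted all later rows. One invocation, modeled in scalar C (recon_row is a hypothetical name):

    /* Add one 16-bit residual row to the 8-bit prediction and clamp,
     * leaving the residual untouched (mirrors the packus saturation). */
    static void recon_row(uint8_t *dest, const int16_t *in) {
      int j, v;
      for (j = 0; j < 8; ++j) {
        v = dest[j] + in[j];
        dest[j] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
      }
    }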
532 void vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) { | 532 void vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) { |
533 const __m128i zero = _mm_setzero_si128(); | 533 const __m128i zero = _mm_setzero_si128(); |
534 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 534 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
535 const __m128i final_rounding = _mm_set1_epi16(1<<4); | 535 const __m128i final_rounding = _mm_set1_epi16(1<<4); |
536 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); | 536 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); |
537 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); | 537 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); |
538 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); | 538 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); |
(...skipping 51 matching lines...)
590 RECON_AND_STORE(dest, in0); | 590 RECON_AND_STORE(dest, in0); |
591 RECON_AND_STORE(dest, in1); | 591 RECON_AND_STORE(dest, in1); |
592 RECON_AND_STORE(dest, in2); | 592 RECON_AND_STORE(dest, in2); |
593 RECON_AND_STORE(dest, in3); | 593 RECON_AND_STORE(dest, in3); |
594 RECON_AND_STORE(dest, in4); | 594 RECON_AND_STORE(dest, in4); |
595 RECON_AND_STORE(dest, in5); | 595 RECON_AND_STORE(dest, in5); |
596 RECON_AND_STORE(dest, in6); | 596 RECON_AND_STORE(dest, in6); |
597 RECON_AND_STORE(dest, in7); | 597 RECON_AND_STORE(dest, in7); |
598 } | 598 } |
599 | 599 |
| 600 void vp9_short_idct8x8_1_add_sse2(int16_t *input, uint8_t *dest, int stride) { |
| 601 __m128i dc_value; |
| 602 const __m128i zero = _mm_setzero_si128(); |
| 603 int a; |
| 604 |
| 605 a = dct_const_round_shift(input[0] * cospi_16_64); |
| 606 a = dct_const_round_shift(a * cospi_16_64); |
| 607 a = ROUND_POWER_OF_TWO(a, 5); |
| 608 |
| 609 dc_value = _mm_set1_epi16(a); |
| 610 |
| 611 RECON_AND_STORE(dest, dc_value); |
| 612 RECON_AND_STORE(dest, dc_value); |
| 613 RECON_AND_STORE(dest, dc_value); |
| 614 RECON_AND_STORE(dest, dc_value); |
| 615 RECON_AND_STORE(dest, dc_value); |
| 616 RECON_AND_STORE(dest, dc_value); |
| 617 RECON_AND_STORE(dest, dc_value); |
| 618 RECON_AND_STORE(dest, dc_value); |
| 619 } |
| 620 |
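When only the DC coefficient is nonzero, every output pixel of the inverse transform receives the same value, so the function computes it once and broadcasts it with _mm_set1_epi16: one multiply by cospi_16_64 with dct_const_round_shift per 1-D pass, then the final shift by 5 that the full 8x8 path applies via final_rounding = 1<<4. A scalar reference for the same path, assuming dct_const_round_shift, ROUND_POWER_OF_TWO, and clip_pixel behave as in the vp9 common headers:

    void short_idct8x8_1_add_c(int16_t *input, uint8_t *dest, int stride) {
      int r, c;
      int a = dct_const_round_shift(input[0] * cospi_16_64);  /* row pass */
      a = dct_const_round_shift(a * cospi_16_64);          /* column pass */
      a = ROUND_POWER_OF_TWO(a, 5);                        /* final shift */
      for (r = 0; r < 8; ++r) {
        for (c = 0; c < 8; ++c)
          dest[c] = clip_pixel(dest[c] + a);
        dest += stride;
      }
    }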
600 // perform 8x8 transpose | 621 // perform 8x8 transpose |
601 static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { | 622 static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { |
602 const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); | 623 const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); |
603 const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); | 624 const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); |
604 const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); | 625 const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); |
605 const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); | 626 const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); |
606 const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); | 627 const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); |
607 const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); | 628 const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); |
608 const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); | 629 const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); |
609 const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); | 630 const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); |
(...skipping 832 matching lines...)
1442 RECON_AND_STORE(dest, in12); | 1463 RECON_AND_STORE(dest, in12); |
1443 RECON_AND_STORE(dest, in13); | 1464 RECON_AND_STORE(dest, in13); |
1444 RECON_AND_STORE(dest, in14); | 1465 RECON_AND_STORE(dest, in14); |
1445 RECON_AND_STORE(dest, in15); | 1466 RECON_AND_STORE(dest, in15); |
1446 | 1467 |
1447 dest += 8 - (stride * 16); | 1468 dest += 8 - (stride * 16); |
1448 } | 1469 } |
1449 } | 1470 } |
1450 } | 1471 } |
1451 | 1472 |
| 1473 void vp9_short_idct16x16_1_add_sse2(int16_t *input, uint8_t *dest, int stride) { |
| 1474 __m128i dc_value; |
| 1475 const __m128i zero = _mm_setzero_si128(); |
| 1476 int a, i; |
| 1477 |
| 1478 a = dct_const_round_shift(input[0] * cospi_16_64); |
| 1479 a = dct_const_round_shift(a * cospi_16_64); |
| 1480 a = ROUND_POWER_OF_TWO(a, 6); |
| 1481 |
| 1482 dc_value = _mm_set1_epi16(a); |
| 1483 |
| 1484 for (i = 0; i < 2; ++i) { |
| 1485 RECON_AND_STORE(dest, dc_value); |
| 1486 RECON_AND_STORE(dest, dc_value); |
| 1487 RECON_AND_STORE(dest, dc_value); |
| 1488 RECON_AND_STORE(dest, dc_value); |
| 1489 RECON_AND_STORE(dest, dc_value); |
| 1490 RECON_AND_STORE(dest, dc_value); |
| 1491 RECON_AND_STORE(dest, dc_value); |
| 1492 RECON_AND_STORE(dest, dc_value); |
| 1493 RECON_AND_STORE(dest, dc_value); |
| 1494 RECON_AND_STORE(dest, dc_value); |
| 1495 RECON_AND_STORE(dest, dc_value); |
| 1496 RECON_AND_STORE(dest, dc_value); |
| 1497 RECON_AND_STORE(dest, dc_value); |
| 1498 RECON_AND_STORE(dest, dc_value); |
| 1499 RECON_AND_STORE(dest, dc_value); |
| 1500 RECON_AND_STORE(dest, dc_value); |
| 1501 dest += 8 - (stride * 16); |
| 1502 } |
| 1503 } |
| 1504 |
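The 16x16 variant differs from the 8x8 one in two ways: the final shift is 6 rather than 5, and a 16-pixel row does not fit in one 8-lane store, so the block is covered in two passes of 8 columns by 16 rows. Each RECON_AND_STORE advances dest by stride; dest += 8 - (stride * 16) then rewinds to the top row and steps 8 pixels right for the second half. The traversal, sketched:

    /* Two 8-wide column passes cover the 16x16 block. */
    for (half = 0; half < 2; ++half) {
      for (row = 0; row < 16; ++row) {
        /* store 8 reconstructed pixels, then dest += stride */
      }
      dest += 8 - stride * 16;  /* back to the top, right half */
    }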
1452 static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { | 1505 static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { |
1453 __m128i tbuf[8]; | 1506 __m128i tbuf[8]; |
1454 array_transpose_8x8(res0, res0); | 1507 array_transpose_8x8(res0, res0); |
1455 array_transpose_8x8(res1, tbuf); | 1508 array_transpose_8x8(res1, tbuf); |
1456 array_transpose_8x8(res0 + 8, res1); | 1509 array_transpose_8x8(res0 + 8, res1); |
1457 array_transpose_8x8(res1 + 8, res1 + 8); | 1510 array_transpose_8x8(res1 + 8, res1 + 8); |
1458 | 1511 |
1459 res0[8] = tbuf[0]; | 1512 res0[8] = tbuf[0]; |
1460 res0[9] = tbuf[1]; | 1513 res0[9] = tbuf[1]; |
1461 res0[10] = tbuf[2]; | 1514 res0[10] = tbuf[2]; |
(...skipping 1291 matching lines...)
2753 RECON_AND_STORE(dest, in11); | 2806 RECON_AND_STORE(dest, in11); |
2754 RECON_AND_STORE(dest, in12); | 2807 RECON_AND_STORE(dest, in12); |
2755 RECON_AND_STORE(dest, in13); | 2808 RECON_AND_STORE(dest, in13); |
2756 RECON_AND_STORE(dest, in14); | 2809 RECON_AND_STORE(dest, in14); |
2757 RECON_AND_STORE(dest, in15); | 2810 RECON_AND_STORE(dest, in15); |
2758 | 2811 |
2759 dest += 8 - (stride * 16); | 2812 dest += 8 - (stride * 16); |
2760 } | 2813 } |
2761 } | 2814 } |
2762 | 2815 |
| 2816 #define LOAD_DQCOEFF(reg, input) \ |
| 2817 { \ |
| 2818 reg = _mm_load_si128((__m128i *) input); \ |
| 2819 input += 8; \ |
| 2820 } |
| 2821 |
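LOAD_DQCOEFF folds the pointer advance into the load, so the 32 loads that follow walk input in order and leave it 256 coefficients further on; this replaces both the per-load offsets (input + 8 * n) and the trailing input += 256 of the old code. For illustration:

    __m128i r0, r1;
    LOAD_DQCOEFF(r0, input);  /* r0 = input[0..7],  input += 8 */
    LOAD_DQCOEFF(r1, input);  /* r1 = input[8..15], input += 8 */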
2763 void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) { | 2822 void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) { |
2764 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 2823 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
2765 const __m128i final_rounding = _mm_set1_epi16(1<<5); | 2824 const __m128i final_rounding = _mm_set1_epi16(1<<5); |
2766 | 2825 |
2767 // idct constants for each stage | 2826 // idct constants for each stage |
2768 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); | 2827 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); |
2769 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); | 2828 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); |
2770 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); | 2829 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); |
2771 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); | 2830 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); |
2772 const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); | 2831 const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); |
(...skipping 47 matching lines...)
2820 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, | 2879 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, |
2821 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, | 2880 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, |
2822 stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, | 2881 stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, |
2823 stp1_30, stp1_31; | 2882 stp1_30, stp1_31; |
2824 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, | 2883 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, |
2825 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, | 2884 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, |
2826 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, | 2885 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, |
2827 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, | 2886 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, |
2828 stp2_30, stp2_31; | 2887 stp2_30, stp2_31; |
2829 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; | 2888 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
2830 int i, j; | 2889 int i, j, i32; |
| 2890 __m128i zero_idx[16]; |
| 2891 int zero_flag[2]; |
2831 | 2892 |
2832 // We work on an 8x32 block each time, and loop 8 times for the 2-D 32x32 idct. | 2893 // We work on an 8x32 block each time, and loop 8 times for the 2-D 32x32 idct. |
2833 for (i = 0; i < 8; i++) { | 2894 for (i = 0; i < 8; i++) { |
| 2895 i32 = (i << 5); |
2834 if (i < 4) { | 2896 if (i < 4) { |
2835 // First 1-D idct | 2897 // First 1-D idct |
2836 // Load input data. | 2898 // Load input data. |
2837 in0 = _mm_load_si128((__m128i *)input); | 2899 LOAD_DQCOEFF(in0, input); |
2838 in8 = _mm_load_si128((__m128i *)(input + 8 * 1)); | 2900 LOAD_DQCOEFF(in8, input); |
2839 in16 = _mm_load_si128((__m128i *)(input + 8 * 2)); | 2901 LOAD_DQCOEFF(in16, input); |
2840 in24 = _mm_load_si128((__m128i *)(input + 8 * 3)); | 2902 LOAD_DQCOEFF(in24, input); |
2841 in1 = _mm_load_si128((__m128i *)(input + 8 * 4)); | 2903 LOAD_DQCOEFF(in1, input); |
2842 in9 = _mm_load_si128((__m128i *)(input + 8 * 5)); | 2904 LOAD_DQCOEFF(in9, input); |
2843 in17 = _mm_load_si128((__m128i *)(input + 8 * 6)); | 2905 LOAD_DQCOEFF(in17, input); |
2844 in25 = _mm_load_si128((__m128i *)(input + 8 * 7)); | 2906 LOAD_DQCOEFF(in25, input); |
2845 in2 = _mm_load_si128((__m128i *)(input + 8 * 8)); | 2907 LOAD_DQCOEFF(in2, input); |
2846 in10 = _mm_load_si128((__m128i *)(input + 8 * 9)); | 2908 LOAD_DQCOEFF(in10, input); |
2847 in18 = _mm_load_si128((__m128i *)(input + 8 * 10)); | 2909 LOAD_DQCOEFF(in18, input); |
2848 in26 = _mm_load_si128((__m128i *)(input + 8 * 11)); | 2910 LOAD_DQCOEFF(in26, input); |
2849 in3 = _mm_load_si128((__m128i *)(input + 8 * 12)); | 2911 LOAD_DQCOEFF(in3, input); |
2850 in11 = _mm_load_si128((__m128i *)(input + 8 * 13)); | 2912 LOAD_DQCOEFF(in11, input); |
2851 in19 = _mm_load_si128((__m128i *)(input + 8 * 14)); | 2913 LOAD_DQCOEFF(in19, input); |
2852 in27 = _mm_load_si128((__m128i *)(input + 8 * 15)); | 2914 LOAD_DQCOEFF(in27, input); |
2853 | 2915 |
2854 in4 = _mm_load_si128((__m128i *)(input + 8 * 16)); | 2916 LOAD_DQCOEFF(in4, input); |
2855 in12 = _mm_load_si128((__m128i *)(input + 8 * 17)); | 2917 LOAD_DQCOEFF(in12, input); |
2856 in20 = _mm_load_si128((__m128i *)(input + 8 * 18)); | 2918 LOAD_DQCOEFF(in20, input); |
2857 in28 = _mm_load_si128((__m128i *)(input + 8 * 19)); | 2919 LOAD_DQCOEFF(in28, input); |
2858 in5 = _mm_load_si128((__m128i *)(input + 8 * 20)); | 2920 LOAD_DQCOEFF(in5, input); |
2859 in13 = _mm_load_si128((__m128i *)(input + 8 * 21)); | 2921 LOAD_DQCOEFF(in13, input); |
2860 in21 = _mm_load_si128((__m128i *)(input + 8 * 22)); | 2922 LOAD_DQCOEFF(in21, input); |
2861 in29 = _mm_load_si128((__m128i *)(input + 8 * 23)); | 2923 LOAD_DQCOEFF(in29, input); |
2862 in6 = _mm_load_si128((__m128i *)(input + 8 * 24)); | 2924 LOAD_DQCOEFF(in6, input); |
2863 in14 = _mm_load_si128((__m128i *)(input + 8 * 25)); | 2925 LOAD_DQCOEFF(in14, input); |
2864 in22 = _mm_load_si128((__m128i *)(input + 8 * 26)); | 2926 LOAD_DQCOEFF(in22, input); |
2865 in30 = _mm_load_si128((__m128i *)(input + 8 * 27)); | 2927 LOAD_DQCOEFF(in30, input); |
2866 in7 = _mm_load_si128((__m128i *)(input + 8 * 28)); | 2928 LOAD_DQCOEFF(in7, input); |
2867 in15 = _mm_load_si128((__m128i *)(input + 8 * 29)); | 2929 LOAD_DQCOEFF(in15, input); |
2868 in23 = _mm_load_si128((__m128i *)(input + 8 * 30)); | 2930 LOAD_DQCOEFF(in23, input); |
2869 in31 = _mm_load_si128((__m128i *)(input + 8 * 31)); | 2931 LOAD_DQCOEFF(in31, input); |
2870 | 2932 |
2871 input += 256; | 2933 // checking if all entries are zero |
| 2934 zero_idx[0] = _mm_or_si128(in0, in1); |
| 2935 zero_idx[1] = _mm_or_si128(in2, in3); |
| 2936 zero_idx[2] = _mm_or_si128(in4, in5); |
| 2937 zero_idx[3] = _mm_or_si128(in6, in7); |
| 2938 zero_idx[4] = _mm_or_si128(in8, in9); |
| 2939 zero_idx[5] = _mm_or_si128(in10, in11); |
| 2940 zero_idx[6] = _mm_or_si128(in12, in13); |
| 2941 zero_idx[7] = _mm_or_si128(in14, in15); |
| 2942 zero_idx[8] = _mm_or_si128(in16, in17); |
| 2943 zero_idx[9] = _mm_or_si128(in18, in19); |
| 2944 zero_idx[10] = _mm_or_si128(in20, in21); |
| 2945 zero_idx[11] = _mm_or_si128(in22, in23); |
| 2946 zero_idx[12] = _mm_or_si128(in24, in25); |
| 2947 zero_idx[13] = _mm_or_si128(in26, in27); |
| 2948 zero_idx[14] = _mm_or_si128(in28, in29); |
| 2949 zero_idx[15] = _mm_or_si128(in30, in31); |
| 2950 |
| 2951 zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]); |
| 2952 zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]); |
| 2953 zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]); |
| 2954 zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]); |
| 2955 zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]); |
| 2956 zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]); |
| 2957 zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]); |
| 2958 zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]); |
| 2959 |
| 2960 zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]); |
| 2961 zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]); |
| 2962 zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]); |
| 2963 zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]); |
| 2964 zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]); |
| 2965 zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]); |
| 2966 zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]); |
| 2967 |
| 2968 zero_idx[0] = _mm_unpackhi_epi64(zero_idx[14], zero_idx[14]); |
| 2969 zero_idx[1] = _mm_or_si128(zero_idx[0], zero_idx[14]); |
| 2970 zero_idx[2] = _mm_srli_epi64(zero_idx[1], 32); |
| 2971 zero_flag[0] = _mm_cvtsi128_si32(zero_idx[1]); |
| 2972 zero_flag[1] = _mm_cvtsi128_si32(zero_idx[2]); |
| 2973 |
| 2974 if (!zero_flag[0] && !zero_flag[1]) { |
| 2975 col[i32 + 0] = _mm_setzero_si128(); |
| 2976 col[i32 + 1] = _mm_setzero_si128(); |
| 2977 col[i32 + 2] = _mm_setzero_si128(); |
| 2978 col[i32 + 3] = _mm_setzero_si128(); |
| 2979 col[i32 + 4] = _mm_setzero_si128(); |
| 2980 col[i32 + 5] = _mm_setzero_si128(); |
| 2981 col[i32 + 6] = _mm_setzero_si128(); |
| 2982 col[i32 + 7] = _mm_setzero_si128(); |
| 2983 col[i32 + 8] = _mm_setzero_si128(); |
| 2984 col[i32 + 9] = _mm_setzero_si128(); |
| 2985 col[i32 + 10] = _mm_setzero_si128(); |
| 2986 col[i32 + 11] = _mm_setzero_si128(); |
| 2987 col[i32 + 12] = _mm_setzero_si128(); |
| 2988 col[i32 + 13] = _mm_setzero_si128(); |
| 2989 col[i32 + 14] = _mm_setzero_si128(); |
| 2990 col[i32 + 15] = _mm_setzero_si128(); |
| 2991 col[i32 + 16] = _mm_setzero_si128(); |
| 2992 col[i32 + 17] = _mm_setzero_si128(); |
| 2993 col[i32 + 18] = _mm_setzero_si128(); |
| 2994 col[i32 + 19] = _mm_setzero_si128(); |
| 2995 col[i32 + 20] = _mm_setzero_si128(); |
| 2996 col[i32 + 21] = _mm_setzero_si128(); |
| 2997 col[i32 + 22] = _mm_setzero_si128(); |
| 2998 col[i32 + 23] = _mm_setzero_si128(); |
| 2999 col[i32 + 24] = _mm_setzero_si128(); |
| 3000 col[i32 + 25] = _mm_setzero_si128(); |
| 3001 col[i32 + 26] = _mm_setzero_si128(); |
| 3002 col[i32 + 27] = _mm_setzero_si128(); |
| 3003 col[i32 + 28] = _mm_setzero_si128(); |
| 3004 col[i32 + 29] = _mm_setzero_si128(); |
| 3005 col[i32 + 30] = _mm_setzero_si128(); |
| 3006 col[i32 + 31] = _mm_setzero_si128(); |
| 3007 continue; |
| 3008 } |
2872 | 3009 |
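The block above is an OR-reduction: a tree of _mm_or_si128 operations folds the 32 input registers into one, the high 64-bit half is folded onto the low half, and the two surviving 32-bit words come out as zero_flag[0] and zero_flag[1]. If both are zero, the whole 8x32 slice is zero, and because the transform is linear its output is zero too, so the code writes zeroed columns and skips the butterflies. The same test in scalar form, over a hypothetical block[] holding the slice:

    /* Skip-if-zero: an all-zero input slice transforms to all zeros. */
    int16_t acc = 0;
    int k;
    for (k = 0; k < 8 * 32; ++k)
      acc |= block[k];
    if (acc == 0) {
      /* zero the 32 output columns for this slice and continue */
    }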
2873 // Transpose 32x8 block to 8x32 block | 3010 // Transpose 32x8 block to 8x32 block |
2874 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, | 3011 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, |
2875 in4, in5, in6, in7); | 3012 in4, in5, in6, in7); |
2876 TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, | 3013 TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, |
2877 in10, in11, in12, in13, in14, in15); | 3014 in10, in11, in12, in13, in14, in15); |
2878 TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17, | 3015 TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17, |
2879 in18, in19, in20, in21, in22, in23); | 3016 in18, in19, in20, in21, in22, in23); |
2880 TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25, | 3017 TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25, |
2881 in26, in27, in28, in29, in30, in31); | 3018 in26, in27, in28, in29, in30, in31); |
(...skipping 350 matching lines...)
3232 | 3369 |
3233 stp1_28 = stp2_28; | 3370 stp1_28 = stp2_28; |
3234 stp1_29 = stp2_29; | 3371 stp1_29 = stp2_29; |
3235 stp1_30 = stp2_30; | 3372 stp1_30 = stp2_30; |
3236 stp1_31 = stp2_31; | 3373 stp1_31 = stp2_31; |
3237 } | 3374 } |
3238 | 3375 |
3239 // final stage | 3376 // final stage |
3240 if (i < 4) { | 3377 if (i < 4) { |
3241 // 1_D: Store 32 intermediate results for each 8x32 block. | 3378 // 1_D: Store 32 intermediate results for each 8x32 block. |
3242 col[i * 32 + 0] = _mm_add_epi16(stp1_0, stp1_31); | 3379 col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); |
3243 col[i * 32 + 1] = _mm_add_epi16(stp1_1, stp1_30); | 3380 col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); |
3244 col[i * 32 + 2] = _mm_add_epi16(stp1_2, stp1_29); | 3381 col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); |
3245 col[i * 32 + 3] = _mm_add_epi16(stp1_3, stp1_28); | 3382 col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); |
3246 col[i * 32 + 4] = _mm_add_epi16(stp1_4, stp1_27); | 3383 col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); |
3247 col[i * 32 + 5] = _mm_add_epi16(stp1_5, stp1_26); | 3384 col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); |
3248 col[i * 32 + 6] = _mm_add_epi16(stp1_6, stp1_25); | 3385 col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); |
3249 col[i * 32 + 7] = _mm_add_epi16(stp1_7, stp1_24); | 3386 col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); |
3250 col[i * 32 + 8] = _mm_add_epi16(stp1_8, stp1_23); | 3387 col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); |
3251 col[i * 32 + 9] = _mm_add_epi16(stp1_9, stp1_22); | 3388 col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); |
3252 col[i * 32 + 10] = _mm_add_epi16(stp1_10, stp1_21); | 3389 col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); |
3253 col[i * 32 + 11] = _mm_add_epi16(stp1_11, stp1_20); | 3390 col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); |
3254 col[i * 32 + 12] = _mm_add_epi16(stp1_12, stp1_19); | 3391 col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); |
3255 col[i * 32 + 13] = _mm_add_epi16(stp1_13, stp1_18); | 3392 col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); |
3256 col[i * 32 + 14] = _mm_add_epi16(stp1_14, stp1_17); | 3393 col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); |
3257 col[i * 32 + 15] = _mm_add_epi16(stp1_15, stp1_16); | 3394 col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); |
3258 col[i * 32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); | 3395 col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); |
3259 col[i * 32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); | 3396 col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); |
3260 col[i * 32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); | 3397 col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); |
3261 col[i * 32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); | 3398 col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); |
3262 col[i * 32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); | 3399 col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); |
3263 col[i * 32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); | 3400 col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); |
3264 col[i * 32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); | 3401 col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); |
3265 col[i * 32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); | 3402 col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); |
3266 col[i * 32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); | 3403 col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); |
3267 col[i * 32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); | 3404 col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); |
3268 col[i * 32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); | 3405 col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); |
3269 col[i * 32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); | 3406 col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); |
3270 col[i * 32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); | 3407 col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); |
3271 col[i * 32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); | 3408 col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); |
3272 col[i * 32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); | 3409 col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); |
3273 col[i * 32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); | 3410 col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); |
3274 } else { | 3411 } else { |
3275 const __m128i zero = _mm_setzero_si128(); | 3412 const __m128i zero = _mm_setzero_si128(); |
3276 | 3413 |
3277 // 2_D: Calculate the results and store them to destination. | 3414 // 2_D: Calculate the results and store them to destination. |
3278 in0 = _mm_add_epi16(stp1_0, stp1_31); | 3415 in0 = _mm_add_epi16(stp1_0, stp1_31); |
3279 in1 = _mm_add_epi16(stp1_1, stp1_30); | 3416 in1 = _mm_add_epi16(stp1_1, stp1_30); |
3280 in2 = _mm_add_epi16(stp1_2, stp1_29); | 3417 in2 = _mm_add_epi16(stp1_2, stp1_29); |
3281 in3 = _mm_add_epi16(stp1_3, stp1_28); | 3418 in3 = _mm_add_epi16(stp1_3, stp1_28); |
3282 in4 = _mm_add_epi16(stp1_4, stp1_27); | 3419 in4 = _mm_add_epi16(stp1_4, stp1_27); |
3283 in5 = _mm_add_epi16(stp1_5, stp1_26); | 3420 in5 = _mm_add_epi16(stp1_5, stp1_26); |
(...skipping 121 matching lines...)
3405 RECON_AND_STORE(dest, in27); | 3542 RECON_AND_STORE(dest, in27); |
3406 RECON_AND_STORE(dest, in28); | 3543 RECON_AND_STORE(dest, in28); |
3407 RECON_AND_STORE(dest, in29); | 3544 RECON_AND_STORE(dest, in29); |
3408 RECON_AND_STORE(dest, in30); | 3545 RECON_AND_STORE(dest, in30); |
3409 RECON_AND_STORE(dest, in31); | 3546 RECON_AND_STORE(dest, in31); |
3410 | 3547 |
3411 dest += 8 - (stride * 32); | 3548 dest += 8 - (stride * 32); |
3412 } | 3549 } |
3413 } | 3550 } |
3414 } | 3551 } |
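Overall shape of the loop: iterations 0 through 3 run the first (row) 1-D transform and park 32 vectors per 8x32 slice in col[], indexed by the hoisted i32 = i << 5 that replaces the old i * 32 multiplications; iterations 4 through 7 run the second (column) 1-D transform over those intermediates and reconstruct into dest. Schematically:

    for (i = 0; i < 8; i++) {
      i32 = i << 5;  /* hoisted i * 32 */
      if (i < 4) {
        /* 1st 1-D pass: load (with skip-if-zero), transpose,
         * butterflies, store to col[i32 .. i32 + 31] */
      } else {
        /* 2nd 1-D pass: butterflies over the col[] intermediates,
         * final rounding, RECON_AND_STORE, dest += 8 - (stride * 32) */
      }
    }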