Chromium Code Reviews

Diff: source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c

Issue 23600008: libvpx: Pull from upstream (Closed)
Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 7 years, 3 months ago
/*
 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

(...skipping 505 matching lines...)

  in3 = _mm_adds_epi16(stp1_3, stp2_4); \
  in4 = _mm_subs_epi16(stp1_3, stp2_4); \
  in5 = _mm_subs_epi16(stp1_2, stp1_5); \
  in6 = _mm_subs_epi16(stp1_1, stp1_6); \
  in7 = _mm_subs_epi16(stp1_0, stp2_7);

#define RECON_AND_STORE(dest, in_x) \
  { \
    __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
    d0 = _mm_unpacklo_epi8(d0, zero); \
-    in_x = _mm_add_epi16(in_x, d0); \
-    in_x = _mm_packus_epi16(in_x, in_x); \
-    _mm_storel_epi64((__m128i *)(dest), in_x); \
+    d0 = _mm_add_epi16(in_x, d0); \
+    d0 = _mm_packus_epi16(d0, d0); \
+    _mm_storel_epi64((__m128i *)(dest), d0); \
    dest += stride; \
  }
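
Note: the macro now accumulates into the scratch register d0 instead of clobbering in_x, so the caller's vector survives the call. That is what lets the new DC-only functions below pass the same dc_value to RECON_AND_STORE eight or sixteen times. A scalar sketch of one invocation, assuming <stdint.h> (recon_and_store_row is a hypothetical helper, for illustration only):

    /* Widen each destination byte, add the 16-bit residual, saturate
       to [0, 255], and store the 8 pixels of one row. */
    static void recon_and_store_row(uint8_t *dest, const int16_t *in) {
      int j;
      for (j = 0; j < 8; ++j) {
        int v = dest[j] + in[j];
        dest[j] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
      }
    }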
531 531
532 void vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) { 532 void vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
533 const __m128i zero = _mm_setzero_si128(); 533 const __m128i zero = _mm_setzero_si128();
534 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 534 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
535 const __m128i final_rounding = _mm_set1_epi16(1<<4); 535 const __m128i final_rounding = _mm_set1_epi16(1<<4);
536 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 536 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
537 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 537 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
538 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); 538 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
(...skipping 51 matching lines...) Expand 10 before | Expand all | Expand 10 after
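
Note: pair_set_epi16(a, b) builds the interleaved constant {a, b, a, b, a, b, a, b}, so a single _mm_madd_epi16 against inputs interleaved as {x0, y0, x1, y1, ...} computes a*x + b*y in each 32-bit lane, one butterfly rotation per instruction. A minimal sketch of how stg1_0 is consumed, assuming the usual libvpx rounding/DCT_CONST_BITS definitions (x_reg/y_reg are stand-ins for two input rows):

    __m128i xy = _mm_unpacklo_epi16(x_reg, y_reg);  /* {x0,y0,x1,y1,...} */
    __m128i t = _mm_madd_epi16(xy, stg1_0);  /* cospi_28_64*x - cospi_4_64*y */
    t = _mm_srai_epi32(_mm_add_epi32(t, rounding), DCT_CONST_BITS);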

  RECON_AND_STORE(dest, in0);
  RECON_AND_STORE(dest, in1);
  RECON_AND_STORE(dest, in2);
  RECON_AND_STORE(dest, in3);
  RECON_AND_STORE(dest, in4);
  RECON_AND_STORE(dest, in5);
  RECON_AND_STORE(dest, in6);
  RECON_AND_STORE(dest, in7);
}

+void vp9_short_idct8x8_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+  __m128i dc_value;
+  const __m128i zero = _mm_setzero_si128();
+  int a;
+
+  a = dct_const_round_shift(input[0] * cospi_16_64);
+  a = dct_const_round_shift(a * cospi_16_64);
+  a = ROUND_POWER_OF_TWO(a, 5);
+
+  dc_value = _mm_set1_epi16(a);
+
+  RECON_AND_STORE(dest, dc_value);
+  RECON_AND_STORE(dest, dc_value);
+  RECON_AND_STORE(dest, dc_value);
+  RECON_AND_STORE(dest, dc_value);
+  RECON_AND_STORE(dest, dc_value);
+  RECON_AND_STORE(dest, dc_value);
+  RECON_AND_STORE(dest, dc_value);
+  RECON_AND_STORE(dest, dc_value);
+}
+
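Note: the new _1 variant serves blocks where only the DC coefficient survived quantization. The inverse transform of a DC-only block is a constant, so the two dct_const_round_shift(... * cospi_16_64) steps apply the row- and column-pass scaling and ROUND_POWER_OF_TWO(a, 5) is the 8x8 transform's final right shift; reconstruction then reduces to adding one constant to every pixel. A scalar sketch of the equivalent loop, assuming the usual clip_pixel clamp to [0, 255] from vp9_common.h:

    int r, c;
    for (r = 0; r < 8; ++r) {
      for (c = 0; c < 8; ++c)
        dest[c] = clip_pixel(dest[c] + a);
      dest += stride;
    }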

// perform 8x8 transpose
static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);

(...skipping 832 matching lines...)
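
Note: the transpose is a three-stage unpack network: the epi16 unpacks above interleave 16-bit lanes, a second stage at 32-bit granularity interleaves lane pairs, and a final 64-bit stage completes the 8x8 transpose. First stage on two rows, for illustration:

    /* a = {a0..a7}, b = {b0..b7}, 16-bit lanes */
    __m128i lo = _mm_unpacklo_epi16(a, b);  /* {a0,b0,a1,b1,a2,b2,a3,b3} */
    __m128i hi = _mm_unpackhi_epi16(a, b);  /* {a4,b4,a5,b5,a6,b6,a7,b7} */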

      RECON_AND_STORE(dest, in12);
      RECON_AND_STORE(dest, in13);
      RECON_AND_STORE(dest, in14);
      RECON_AND_STORE(dest, in15);

      dest += 8 - (stride * 16);
    }
  }
}

+void vp9_short_idct16x16_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+  __m128i dc_value;
+  const __m128i zero = _mm_setzero_si128();
+  int a, i;
+
+  a = dct_const_round_shift(input[0] * cospi_16_64);
+  a = dct_const_round_shift(a * cospi_16_64);
+  a = ROUND_POWER_OF_TWO(a, 6);
+
+  dc_value = _mm_set1_epi16(a);
+
+  for (i = 0; i < 2; ++i) {
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    dest += 8 - (stride * 16);
+  }
+}
+
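Note: same DC-only idea as the 8x8 variant, with the final shift at 6 for the 16x16 transform and the block covered as two 8-pixel-wide column halves. After the 16 stores of one half, dest has advanced by 16 * stride, so the rewind lands exactly at the top of the other half:

    dest_end  = dest_start + 16 * stride    /* after 16 RECON_AND_STOREs  */
    dest_next = dest_end + 8 - 16 * stride  /* = dest_start + 8: row 0,   */
                                            /*   columns 8..15            */
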
static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
  __m128i tbuf[8];
  array_transpose_8x8(res0, res0);
  array_transpose_8x8(res1, tbuf);
  array_transpose_8x8(res0 + 8, res1);
  array_transpose_8x8(res1 + 8, res1 + 8);

  res0[8] = tbuf[0];
  res0[9] = tbuf[1];
  res0[10] = tbuf[2];

(...skipping 1291 matching lines...)
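
Note: with the 16x16 block stored as two 16x8 halves (res0 = left eight columns, sub-blocks A00 over A10; res1 = right eight columns, A01 over A11), the routine applies the block identity

    [ A00 A01 ]^T   [ A00^T  A10^T ]
    [ A10 A11 ]   = [ A01^T  A11^T ]

so the off-diagonal 8x8 sub-blocks swap places: tbuf parks A01^T while res1 is overwritten with A10^T, then tbuf is copied into res0[8..15].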

    RECON_AND_STORE(dest, in11);
    RECON_AND_STORE(dest, in12);
    RECON_AND_STORE(dest, in13);
    RECON_AND_STORE(dest, in14);
    RECON_AND_STORE(dest, in15);

    dest += 8 - (stride * 16);
  }
}

+#define LOAD_DQCOEFF(reg, input) \
+  { \
+    reg = _mm_load_si128((__m128i *) input); \
+    input += 8; \
+  } \
+
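Note: the macro folds the pointer advance into the load, so the 32 loads in vp9_short_idct32x32_add_sse2 below drop their explicit input + 8 * k offsets and the per-iteration input += 256 bump disappears. Usage sketch:

    __m128i r0, r1;
    LOAD_DQCOEFF(r0, input);  /* r0 = input[0..7];  input += 8 */
    LOAD_DQCOEFF(r1, input);  /* r1 = input[8..15]; input += 8 */
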
void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1<<5);

  // idct constants for each stage
  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);

(...skipping 47 matching lines...)
          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
          stp1_30, stp1_31;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
          stp2_30, stp2_31;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i, j;
+  int i, j, i32;
+  __m128i zero_idx[16];
+  int zero_flag[2];

  // We work on a 8x32 block each time, and loop 8 times for 2-D 32x32 idct.
  for (i = 0; i < 8; i++) {
+    i32 = (i << 5);
    if (i < 4) {
      // First 1-D idct
      // Load input data.
-      in0 = _mm_load_si128((__m128i *)input);
-      in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
-      in16 = _mm_load_si128((__m128i *)(input + 8 * 2));
-      in24 = _mm_load_si128((__m128i *)(input + 8 * 3));
-      in1 = _mm_load_si128((__m128i *)(input + 8 * 4));
-      in9 = _mm_load_si128((__m128i *)(input + 8 * 5));
-      in17 = _mm_load_si128((__m128i *)(input + 8 * 6));
-      in25 = _mm_load_si128((__m128i *)(input + 8 * 7));
-      in2 = _mm_load_si128((__m128i *)(input + 8 * 8));
-      in10 = _mm_load_si128((__m128i *)(input + 8 * 9));
-      in18 = _mm_load_si128((__m128i *)(input + 8 * 10));
-      in26 = _mm_load_si128((__m128i *)(input + 8 * 11));
-      in3 = _mm_load_si128((__m128i *)(input + 8 * 12));
-      in11 = _mm_load_si128((__m128i *)(input + 8 * 13));
-      in19 = _mm_load_si128((__m128i *)(input + 8 * 14));
-      in27 = _mm_load_si128((__m128i *)(input + 8 * 15));
+      LOAD_DQCOEFF(in0, input);
+      LOAD_DQCOEFF(in8, input);
+      LOAD_DQCOEFF(in16, input);
+      LOAD_DQCOEFF(in24, input);
+      LOAD_DQCOEFF(in1, input);
+      LOAD_DQCOEFF(in9, input);
+      LOAD_DQCOEFF(in17, input);
+      LOAD_DQCOEFF(in25, input);
+      LOAD_DQCOEFF(in2, input);
+      LOAD_DQCOEFF(in10, input);
+      LOAD_DQCOEFF(in18, input);
+      LOAD_DQCOEFF(in26, input);
+      LOAD_DQCOEFF(in3, input);
+      LOAD_DQCOEFF(in11, input);
+      LOAD_DQCOEFF(in19, input);
+      LOAD_DQCOEFF(in27, input);

-      in4 = _mm_load_si128((__m128i *)(input + 8 * 16));
-      in12 = _mm_load_si128((__m128i *)(input + 8 * 17));
-      in20 = _mm_load_si128((__m128i *)(input + 8 * 18));
-      in28 = _mm_load_si128((__m128i *)(input + 8 * 19));
-      in5 = _mm_load_si128((__m128i *)(input + 8 * 20));
-      in13 = _mm_load_si128((__m128i *)(input + 8 * 21));
-      in21 = _mm_load_si128((__m128i *)(input + 8 * 22));
-      in29 = _mm_load_si128((__m128i *)(input + 8 * 23));
-      in6 = _mm_load_si128((__m128i *)(input + 8 * 24));
-      in14 = _mm_load_si128((__m128i *)(input + 8 * 25));
-      in22 = _mm_load_si128((__m128i *)(input + 8 * 26));
-      in30 = _mm_load_si128((__m128i *)(input + 8 * 27));
-      in7 = _mm_load_si128((__m128i *)(input + 8 * 28));
-      in15 = _mm_load_si128((__m128i *)(input + 8 * 29));
-      in23 = _mm_load_si128((__m128i *)(input + 8 * 30));
-      in31 = _mm_load_si128((__m128i *)(input + 8 * 31));
+      LOAD_DQCOEFF(in4, input);
+      LOAD_DQCOEFF(in12, input);
+      LOAD_DQCOEFF(in20, input);
+      LOAD_DQCOEFF(in28, input);
+      LOAD_DQCOEFF(in5, input);
+      LOAD_DQCOEFF(in13, input);
+      LOAD_DQCOEFF(in21, input);
+      LOAD_DQCOEFF(in29, input);
+      LOAD_DQCOEFF(in6, input);
+      LOAD_DQCOEFF(in14, input);
+      LOAD_DQCOEFF(in22, input);
+      LOAD_DQCOEFF(in30, input);
+      LOAD_DQCOEFF(in7, input);
+      LOAD_DQCOEFF(in15, input);
+      LOAD_DQCOEFF(in23, input);
+      LOAD_DQCOEFF(in31, input);

-      input += 256;
+      // checking if all entries are zero
+      zero_idx[0] = _mm_or_si128(in0, in1);
+      zero_idx[1] = _mm_or_si128(in2, in3);
+      zero_idx[2] = _mm_or_si128(in4, in5);
+      zero_idx[3] = _mm_or_si128(in6, in7);
+      zero_idx[4] = _mm_or_si128(in8, in9);
+      zero_idx[5] = _mm_or_si128(in10, in11);
+      zero_idx[6] = _mm_or_si128(in12, in13);
+      zero_idx[7] = _mm_or_si128(in14, in15);
+      zero_idx[8] = _mm_or_si128(in16, in17);
+      zero_idx[9] = _mm_or_si128(in18, in19);
+      zero_idx[10] = _mm_or_si128(in20, in21);
+      zero_idx[11] = _mm_or_si128(in22, in23);
+      zero_idx[12] = _mm_or_si128(in24, in25);
+      zero_idx[13] = _mm_or_si128(in26, in27);
+      zero_idx[14] = _mm_or_si128(in28, in29);
+      zero_idx[15] = _mm_or_si128(in30, in31);
+
+      zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
+      zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
+      zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
+      zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
+      zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
+      zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
+      zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
+      zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
+
+      zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
+      zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
+      zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
+      zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
+      zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
+      zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
+      zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
+
+      zero_idx[0] = _mm_unpackhi_epi64(zero_idx[14], zero_idx[14]);
+      zero_idx[1] = _mm_or_si128(zero_idx[0], zero_idx[14]);
+      zero_idx[2] = _mm_srli_epi64(zero_idx[1], 32);
+      zero_flag[0] = _mm_cvtsi128_si32(zero_idx[1]);
+      zero_flag[1] = _mm_cvtsi128_si32(zero_idx[2]);
+
+      if (!zero_flag[0] && !zero_flag[1]) {
+        col[i32 + 0] = _mm_setzero_si128();
+        col[i32 + 1] = _mm_setzero_si128();
+        col[i32 + 2] = _mm_setzero_si128();
+        col[i32 + 3] = _mm_setzero_si128();
+        col[i32 + 4] = _mm_setzero_si128();
+        col[i32 + 5] = _mm_setzero_si128();
+        col[i32 + 6] = _mm_setzero_si128();
+        col[i32 + 7] = _mm_setzero_si128();
+        col[i32 + 8] = _mm_setzero_si128();
+        col[i32 + 9] = _mm_setzero_si128();
+        col[i32 + 10] = _mm_setzero_si128();
+        col[i32 + 11] = _mm_setzero_si128();
+        col[i32 + 12] = _mm_setzero_si128();
+        col[i32 + 13] = _mm_setzero_si128();
+        col[i32 + 14] = _mm_setzero_si128();
+        col[i32 + 15] = _mm_setzero_si128();
+        col[i32 + 16] = _mm_setzero_si128();
+        col[i32 + 17] = _mm_setzero_si128();
+        col[i32 + 18] = _mm_setzero_si128();
+        col[i32 + 19] = _mm_setzero_si128();
+        col[i32 + 20] = _mm_setzero_si128();
+        col[i32 + 21] = _mm_setzero_si128();
+        col[i32 + 22] = _mm_setzero_si128();
+        col[i32 + 23] = _mm_setzero_si128();
+        col[i32 + 24] = _mm_setzero_si128();
+        col[i32 + 25] = _mm_setzero_si128();
+        col[i32 + 26] = _mm_setzero_si128();
+        col[i32 + 27] = _mm_setzero_si128();
+        col[i32 + 28] = _mm_setzero_si128();
+        col[i32 + 29] = _mm_setzero_si128();
+        col[i32 + 30] = _mm_setzero_si128();
+        col[i32 + 31] = _mm_setzero_si128();
+        continue;
+      }
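
Note: the OR tree collapses the 32 input registers in five levels; _mm_unpackhi_epi64 plus the final OR folds the two 64-bit halves of the last register together, and _mm_srli_epi64 / _mm_cvtsi128_si32 extract its two 32-bit halves. Both flags are zero exactly when all 256 coefficients of this 8x32 slice are zero, so the first pass can clear the slice's col[] range and skip the transform entirely. Scalar sketch of the same test, assuming the slice were flattened into int16_t s[256]:

    int k, nonzero = 0;
    for (k = 0; k < 256; ++k)
      nonzero |= s[k];
    if (!nonzero) {
      /* zero col[i32 + 0 .. i32 + 31] and continue */
    }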

      // Transpose 32x8 block to 8x32 block
      TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                    in4, in5, in6, in7);
      TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
                    in10, in11, in12, in13, in14, in15);
      TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17,
                    in18, in19, in20, in21, in22, in23);
      TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25,
                    in26, in27, in28, in29, in30, in31);

(...skipping 350 matching lines...)

      stp1_28 = stp2_28;
      stp1_29 = stp2_29;
      stp1_30 = stp2_30;
      stp1_31 = stp2_31;
    }

    // final stage
    if (i < 4) {
      // 1_D: Store 32 intermediate results for each 8x32 block.
-      col[i * 32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
-      col[i * 32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
-      col[i * 32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
-      col[i * 32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
-      col[i * 32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
-      col[i * 32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
-      col[i * 32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
-      col[i * 32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
-      col[i * 32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
-      col[i * 32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
-      col[i * 32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
-      col[i * 32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
-      col[i * 32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
-      col[i * 32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
-      col[i * 32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
-      col[i * 32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
-      col[i * 32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
-      col[i * 32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
-      col[i * 32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
-      col[i * 32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
-      col[i * 32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
-      col[i * 32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
-      col[i * 32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
-      col[i * 32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
-      col[i * 32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
-      col[i * 32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
-      col[i * 32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
-      col[i * 32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
-      col[i * 32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
-      col[i * 32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
-      col[i * 32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
-      col[i * 32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
+      col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
+      col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
+      col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
+      col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
+      col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
+      col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
+      col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
+      col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
+      col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
+      col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
+      col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
+      col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
+      col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
+      col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
+      col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
+      col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
+      col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
+      col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
+      col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
+      col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
+      col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
+      col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
+      col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
+      col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
+      col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
+      col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
+      col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
+      col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
+      col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
+      col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
+      col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
+      col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
    } else {
      const __m128i zero = _mm_setzero_si128();

      // 2_D: Calculate the results and store them to destination.
      in0 = _mm_add_epi16(stp1_0, stp1_31);
      in1 = _mm_add_epi16(stp1_1, stp1_30);
      in2 = _mm_add_epi16(stp1_2, stp1_29);
      in3 = _mm_add_epi16(stp1_3, stp1_28);
      in4 = _mm_add_epi16(stp1_4, stp1_27);
      in5 = _mm_add_epi16(stp1_5, stp1_26);

(...skipping 121 matching lines...)

      RECON_AND_STORE(dest, in27);
      RECON_AND_STORE(dest, in28);
      RECON_AND_STORE(dest, in29);
      RECON_AND_STORE(dest, in30);
      RECON_AND_STORE(dest, in31);

      dest += 8 - (stride * 32);
    }
  }
}
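
Note: both passes end with the same symmetric 32-point butterfly, which the code unrolls over the 32 named stp1_* registers. Indexing sketch of the first-pass store, with stp1[] as a hypothetical array view of those registers:

    int k;
    for (k = 0; k < 16; ++k) {
      col[i32 + k]      = _mm_add_epi16(stp1[k], stp1[31 - k]);
      col[i32 + 31 - k] = _mm_sub_epi16(stp1[k], stp1[31 - k]);
    }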