OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include <assert.h> | 11 #include <assert.h> |
12 #include <emmintrin.h> // SSE2 | 12 #include <emmintrin.h> // SSE2 |
| 13 |
| 14 #include "./vp9_rtcd.h" |
13 #include "vp9/common/vp9_idct.h" // for cospi constants | 15 #include "vp9/common/vp9_idct.h" // for cospi constants |
14 #include "vp9/encoder/vp9_dct.h" | 16 #include "vp9/encoder/vp9_dct.h" |
15 #include "vp9/encoder/x86/vp9_dct_sse2.h" | 17 #include "vp9/encoder/x86/vp9_dct_sse2.h" |
16 #include "vpx_ports/mem.h" | 18 #include "vpx_ports/mem.h" |
17 | 19 |
18 void vp9_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) { | 20 void vp9_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) { |
19 __m128i in0, in1; | 21 __m128i in0, in1; |
20 __m128i tmp; | 22 __m128i tmp; |
21 const __m128i zero = _mm_setzero_si128(); | 23 const __m128i zero = _mm_setzero_si128(); |
22 in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); | 24 in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); |
(...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
89 res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1); | 91 res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1); |
90 res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1); | 92 res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1); |
91 | 93 |
92 // 00 10 20 30 01 11 21 31 | 94 // 00 10 20 30 01 11 21 31 |
93 // 02 12 22 32 03 13 23 33 | 95 // 02 12 22 32 03 13 23 33 |
94 // only use the first 4 16-bit integers | 96 // only use the first 4 16-bit integers |
95 res[1] = _mm_unpackhi_epi64(res[0], res[0]); | 97 res[1] = _mm_unpackhi_epi64(res[0], res[0]); |
96 res[3] = _mm_unpackhi_epi64(res[2], res[2]); | 98 res[3] = _mm_unpackhi_epi64(res[2], res[2]); |
97 } | 99 } |
98 | 100 |
99 void fdct4_sse2(__m128i *in) { | 101 static void fdct4_sse2(__m128i *in) { |
100 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); | 102 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); |
101 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 103 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
102 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); | 104 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); |
103 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); | 105 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
104 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); | 106 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); |
105 | 107 |
106 __m128i u[4], v[4]; | 108 __m128i u[4], v[4]; |
107 u[0]=_mm_unpacklo_epi16(in[0], in[1]); | 109 u[0]=_mm_unpacklo_epi16(in[0], in[1]); |
108 u[1]=_mm_unpacklo_epi16(in[3], in[2]); | 110 u[1]=_mm_unpacklo_epi16(in[3], in[2]); |
109 | 111 |
(...skipping 12 matching lines...) Expand all Loading... |
122 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); | 124 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); |
123 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); | 125 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); |
124 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); | 126 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); |
125 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); | 127 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); |
126 | 128 |
127 in[0] = _mm_packs_epi32(u[0], u[1]); | 129 in[0] = _mm_packs_epi32(u[0], u[1]); |
128 in[1] = _mm_packs_epi32(u[2], u[3]); | 130 in[1] = _mm_packs_epi32(u[2], u[3]); |
129 transpose_4x4(in); | 131 transpose_4x4(in); |
130 } | 132 } |
131 | 133 |
132 void fadst4_sse2(__m128i *in) { | 134 static void fadst4_sse2(__m128i *in) { |
133 const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9); | 135 const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9); |
134 const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9); | 136 const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9); |
135 const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9); | 137 const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9); |
136 const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9); | 138 const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9); |
137 const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9); | 139 const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9); |
138 const __m128i kZero = _mm_set1_epi16(0); | 140 const __m128i kZero = _mm_set1_epi16(0); |
139 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); | 141 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); |
140 __m128i u[8], v[8]; | 142 __m128i u[8], v[8]; |
141 __m128i in7 = _mm_add_epi16(in[0], in[1]); | 143 __m128i in7 = _mm_add_epi16(in[0], in[1]); |
142 | 144 |
(...skipping 681 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
824 // 00 10 20 30 40 50 60 70 | 826 // 00 10 20 30 40 50 60 70 |
825 // 01 11 21 31 41 51 61 71 | 827 // 01 11 21 31 41 51 61 71 |
826 // 02 12 22 32 42 52 62 72 | 828 // 02 12 22 32 42 52 62 72 |
827 // 03 13 23 33 43 53 63 73 | 829 // 03 13 23 33 43 53 63 73 |
828 // 04 14 24 34 44 54 64 74 | 830 // 04 14 24 34 44 54 64 74 |
829 // 05 15 25 35 45 55 65 75 | 831 // 05 15 25 35 45 55 65 75 |
830 // 06 16 26 36 46 56 66 76 | 832 // 06 16 26 36 46 56 66 76 |
831 // 07 17 27 37 47 57 67 77 | 833 // 07 17 27 37 47 57 67 77 |
832 } | 834 } |
833 | 835 |
834 void fdct8_sse2(__m128i *in) { | 836 static void fdct8_sse2(__m128i *in) { |
835 // constants | 837 // constants |
836 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); | 838 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); |
837 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 839 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
838 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); | 840 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); |
839 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); | 841 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
840 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); | 842 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); |
841 const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); | 843 const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); |
842 const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); | 844 const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); |
843 const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); | 845 const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); |
844 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); | 846 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); |
(...skipping 119 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
964 | 966 |
965 in[1] = _mm_packs_epi32(v0, v1); | 967 in[1] = _mm_packs_epi32(v0, v1); |
966 in[3] = _mm_packs_epi32(v4, v5); | 968 in[3] = _mm_packs_epi32(v4, v5); |
967 in[5] = _mm_packs_epi32(v2, v3); | 969 in[5] = _mm_packs_epi32(v2, v3); |
968 in[7] = _mm_packs_epi32(v6, v7); | 970 in[7] = _mm_packs_epi32(v6, v7); |
969 | 971 |
970 // transpose | 972 // transpose |
971 array_transpose_8x8(in, in); | 973 array_transpose_8x8(in, in); |
972 } | 974 } |
973 | 975 |
974 void fadst8_sse2(__m128i *in) { | 976 static void fadst8_sse2(__m128i *in) { |
975 // Constants | 977 // Constants |
976 const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); | 978 const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); |
977 const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); | 979 const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); |
978 const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); | 980 const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); |
979 const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); | 981 const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); |
980 const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); | 982 const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); |
981 const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); | 983 const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); |
982 const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); | 984 const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); |
983 const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); | 985 const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); |
984 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); | 986 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); |
(...skipping 361 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1346 } | 1348 } |
1347 | 1349 |
1348 static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) { | 1350 static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) { |
1349 // perform rounding operations | 1351 // perform rounding operations |
1350 right_shift_8x8(res0, 2); | 1352 right_shift_8x8(res0, 2); |
1351 right_shift_8x8(res0 + 8, 2); | 1353 right_shift_8x8(res0 + 8, 2); |
1352 right_shift_8x8(res1, 2); | 1354 right_shift_8x8(res1, 2); |
1353 right_shift_8x8(res1 + 8, 2); | 1355 right_shift_8x8(res1 + 8, 2); |
1354 } | 1356 } |
1355 | 1357 |
1356 void fdct16_8col(__m128i *in) { | 1358 static void fdct16_8col(__m128i *in) { |
1357 // perform 16x16 1-D DCT for 8 columns | 1359 // perform 16x16 1-D DCT for 8 columns |
1358 __m128i i[8], s[8], p[8], t[8], u[16], v[16]; | 1360 __m128i i[8], s[8], p[8], t[8], u[16], v[16]; |
1359 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); | 1361 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); |
1360 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 1362 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
1361 const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); | 1363 const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); |
1362 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); | 1364 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); |
1363 const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64); | 1365 const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64); |
1364 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); | 1366 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
1365 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); | 1367 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); |
1366 const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); | 1368 const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); |
(...skipping 301 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1668 in[1] = _mm_packs_epi32(v[0], v[1]); | 1670 in[1] = _mm_packs_epi32(v[0], v[1]); |
1669 in[9] = _mm_packs_epi32(v[2], v[3]); | 1671 in[9] = _mm_packs_epi32(v[2], v[3]); |
1670 in[5] = _mm_packs_epi32(v[4], v[5]); | 1672 in[5] = _mm_packs_epi32(v[4], v[5]); |
1671 in[13] = _mm_packs_epi32(v[6], v[7]); | 1673 in[13] = _mm_packs_epi32(v[6], v[7]); |
1672 in[3] = _mm_packs_epi32(v[8], v[9]); | 1674 in[3] = _mm_packs_epi32(v[8], v[9]); |
1673 in[11] = _mm_packs_epi32(v[10], v[11]); | 1675 in[11] = _mm_packs_epi32(v[10], v[11]); |
1674 in[7] = _mm_packs_epi32(v[12], v[13]); | 1676 in[7] = _mm_packs_epi32(v[12], v[13]); |
1675 in[15] = _mm_packs_epi32(v[14], v[15]); | 1677 in[15] = _mm_packs_epi32(v[14], v[15]); |
1676 } | 1678 } |
1677 | 1679 |
1678 void fadst16_8col(__m128i *in) { | 1680 static void fadst16_8col(__m128i *in) { |
1679 // perform 16x16 1-D ADST for 8 columns | 1681 // perform 16x16 1-D ADST for 8 columns |
1680 __m128i s[16], x[16], u[32], v[32]; | 1682 __m128i s[16], x[16], u[32], v[32]; |
1681 const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); | 1683 const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); |
1682 const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); | 1684 const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); |
1683 const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); | 1685 const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); |
1684 const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); | 1686 const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); |
1685 const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); | 1687 const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); |
1686 const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); | 1688 const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); |
1687 const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); | 1689 const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); |
1688 const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); | 1690 const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); |
(...skipping 449 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2138 in[8] = _mm_packs_epi32(v[2], v[3]); | 2140 in[8] = _mm_packs_epi32(v[2], v[3]); |
2139 in[9] = _mm_packs_epi32(v[10], v[11]); | 2141 in[9] = _mm_packs_epi32(v[10], v[11]); |
2140 in[10] = _mm_packs_epi32(v[14], v[15]); | 2142 in[10] = _mm_packs_epi32(v[14], v[15]); |
2141 in[11] = _mm_packs_epi32(v[6], v[7]); | 2143 in[11] = _mm_packs_epi32(v[6], v[7]); |
2142 in[12] = s[5]; | 2144 in[12] = s[5]; |
2143 in[13] = _mm_sub_epi16(kZero, s[13]); | 2145 in[13] = _mm_sub_epi16(kZero, s[13]); |
2144 in[14] = s[9]; | 2146 in[14] = s[9]; |
2145 in[15] = _mm_sub_epi16(kZero, s[1]); | 2147 in[15] = _mm_sub_epi16(kZero, s[1]); |
2146 } | 2148 } |
2147 | 2149 |
2148 void fdct16_sse2(__m128i *in0, __m128i *in1) { | 2150 static void fdct16_sse2(__m128i *in0, __m128i *in1) { |
2149 fdct16_8col(in0); | 2151 fdct16_8col(in0); |
2150 fdct16_8col(in1); | 2152 fdct16_8col(in1); |
2151 array_transpose_16x16(in0, in1); | 2153 array_transpose_16x16(in0, in1); |
2152 } | 2154 } |
2153 | 2155 |
2154 void fadst16_sse2(__m128i *in0, __m128i *in1) { | 2156 static void fadst16_sse2(__m128i *in0, __m128i *in1) { |
2155 fadst16_8col(in0); | 2157 fadst16_8col(in0); |
2156 fadst16_8col(in1); | 2158 fadst16_8col(in1); |
2157 array_transpose_16x16(in0, in1); | 2159 array_transpose_16x16(in0, in1); |
2158 } | 2160 } |
2159 | 2161 |
2160 void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, | 2162 void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, |
2161 int stride, int tx_type) { | 2163 int stride, int tx_type) { |
2162 __m128i in0[16], in1[16]; | 2164 __m128i in0[16], in1[16]; |
2163 | 2165 |
2164 switch (tx_type) { | 2166 switch (tx_type) { |
(...skipping 162 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2327 for (i = 0; i < 8; ++i) { | 2329 for (i = 0; i < 8; ++i) { |
2328 for (j = 0; j < 8; ++j) | 2330 for (j = 0; j < 8; ++j) |
2329 temp_in[j] = out[j + i * 8]; | 2331 temp_in[j] = out[j + i * 8]; |
2330 ht.rows(temp_in, temp_out); | 2332 ht.rows(temp_in, temp_out); |
2331 for (j = 0; j < 8; ++j) | 2333 for (j = 0; j < 8; ++j) |
2332 output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1; | 2334 output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1; |
2333 } | 2335 } |
2334 } | 2336 } |
2335 } | 2337 } |
2336 | 2338 |
2337 void vp9_highbd_fht16x16_sse2(int16_t *input, tran_low_t *output, | 2339 void vp9_highbd_fht16x16_sse2(const int16_t *input, tran_low_t *output, |
2338 int stride, int tx_type) { | 2340 int stride, int tx_type) { |
2339 if (tx_type == DCT_DCT) { | 2341 if (tx_type == DCT_DCT) { |
2340 vp9_highbd_fdct16x16_sse2(input, output, stride); | 2342 vp9_highbd_fdct16x16_sse2(input, output, stride); |
2341 } else { | 2343 } else { |
2342 tran_low_t out[256]; | 2344 tran_low_t out[256]; |
2343 tran_low_t *outptr = &out[0]; | 2345 tran_low_t *outptr = &out[0]; |
2344 int i, j; | 2346 int i, j; |
2345 tran_low_t temp_in[16], temp_out[16]; | 2347 tran_low_t temp_in[16], temp_out[16]; |
2346 const transform_2d ht = FHT_16[tx_type]; | 2348 const transform_2d ht = FHT_16[tx_type]; |
2347 | 2349 |
(...skipping 13 matching lines...) Expand all Loading... |
2361 ht.rows(temp_in, temp_out); | 2363 ht.rows(temp_in, temp_out); |
2362 for (j = 0; j < 16; ++j) | 2364 for (j = 0; j < 16; ++j) |
2363 output[j + i * 16] = temp_out[j]; | 2365 output[j + i * 16] = temp_out[j]; |
2364 } | 2366 } |
2365 } | 2367 } |
2366 } | 2368 } |
2367 #endif // CONFIG_VP9_HIGHBITDEPTH | 2369 #endif // CONFIG_VP9_HIGHBITDEPTH |
2368 | 2370 |
2369 /* | 2371 /* |
2370 * The DCTnxn functions are defined using the macros below. The main code for | 2372 * The DCTnxn functions are defined using the macros below. The main code for |
2371 * them is in separate files (vp9/encoder/x86/vp9_dct_impl_sse2.c & | 2373 * them is in separate files (vp9/encoder/x86/vp9_dct_sse2_impl.h & |
2372 * vp9/encoder/x86/vp9_dct32x32_sse2.c) which are used by both the 8 bit code | 2374 * vp9/encoder/x86/vp9_dct32x32_sse2_impl.h) which are used by both the 8 bit code |
2373 * and the high bit depth code. | 2375 * and the high bit depth code. |
2374 */ | 2376 */ |
2375 | 2377 |
2376 #define DCT_HIGH_BIT_DEPTH 0 | 2378 #define DCT_HIGH_BIT_DEPTH 0 |
2377 | 2379 |
2378 #define FDCT4x4_2D vp9_fdct4x4_sse2 | 2380 #define FDCT4x4_2D vp9_fdct4x4_sse2 |
2379 #define FDCT8x8_2D vp9_fdct8x8_sse2 | 2381 #define FDCT8x8_2D vp9_fdct8x8_sse2 |
2380 #define FDCT16x16_2D vp9_fdct16x16_sse2 | 2382 #define FDCT16x16_2D vp9_fdct16x16_sse2 |
2381 #include "vp9/encoder/x86/vp9_dct_impl_sse2.c" | 2383 #include "vp9/encoder/x86/vp9_dct_sse2_impl.h" |
2382 #undef FDCT4x4_2D | 2384 #undef FDCT4x4_2D |
2383 #undef FDCT8x8_2D | 2385 #undef FDCT8x8_2D |
2384 #undef FDCT16x16_2D | 2386 #undef FDCT16x16_2D |
2385 | 2387 |
2386 #define FDCT32x32_2D vp9_fdct32x32_rd_sse2 | 2388 #define FDCT32x32_2D vp9_fdct32x32_rd_sse2 |
2387 #define FDCT32x32_HIGH_PRECISION 0 | 2389 #define FDCT32x32_HIGH_PRECISION 0 |
2388 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" | 2390 #include "vp9/encoder/x86/vp9_dct32x32_sse2_impl.h" |
2389 #undef FDCT32x32_2D | 2391 #undef FDCT32x32_2D |
2390 #undef FDCT32x32_HIGH_PRECISION | 2392 #undef FDCT32x32_HIGH_PRECISION |
2391 | 2393 |
2392 #define FDCT32x32_2D vp9_fdct32x32_sse2 | 2394 #define FDCT32x32_2D vp9_fdct32x32_sse2 |
2393 #define FDCT32x32_HIGH_PRECISION 1 | 2395 #define FDCT32x32_HIGH_PRECISION 1 |
2394 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT | 2396 #include "vp9/encoder/x86/vp9_dct32x32_sse2_impl.h" // NOLINT |
2395 #undef FDCT32x32_2D | 2397 #undef FDCT32x32_2D |
2396 #undef FDCT32x32_HIGH_PRECISION | 2398 #undef FDCT32x32_HIGH_PRECISION |
2397 | 2399 |
2398 #undef DCT_HIGH_BIT_DEPTH | 2400 #undef DCT_HIGH_BIT_DEPTH |
2399 | 2401 |
2400 | 2402 |
2401 #if CONFIG_VP9_HIGHBITDEPTH | 2403 #if CONFIG_VP9_HIGHBITDEPTH |
2402 | 2404 |
2403 #define DCT_HIGH_BIT_DEPTH 1 | 2405 #define DCT_HIGH_BIT_DEPTH 1 |
2404 | 2406 |
2405 #define FDCT4x4_2D vp9_highbd_fdct4x4_sse2 | 2407 #define FDCT4x4_2D vp9_highbd_fdct4x4_sse2 |
2406 #define FDCT8x8_2D vp9_highbd_fdct8x8_sse2 | 2408 #define FDCT8x8_2D vp9_highbd_fdct8x8_sse2 |
2407 #define FDCT16x16_2D vp9_highbd_fdct16x16_sse2 | 2409 #define FDCT16x16_2D vp9_highbd_fdct16x16_sse2 |
2408 #include "vp9/encoder/x86/vp9_dct_impl_sse2.c" // NOLINT | 2410 #include "vp9/encoder/x86/vp9_dct_sse2_impl.h" // NOLINT |
2409 #undef FDCT4x4_2D | 2411 #undef FDCT4x4_2D |
2410 #undef FDCT8x8_2D | 2412 #undef FDCT8x8_2D |
2411 #undef FDCT16x16_2D | 2413 #undef FDCT16x16_2D |
2412 | 2414 |
2413 #define FDCT32x32_2D vp9_highbd_fdct32x32_rd_sse2 | 2415 #define FDCT32x32_2D vp9_highbd_fdct32x32_rd_sse2 |
2414 #define FDCT32x32_HIGH_PRECISION 0 | 2416 #define FDCT32x32_HIGH_PRECISION 0 |
2415 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT | 2417 #include "vp9/encoder/x86/vp9_dct32x32_sse2_impl.h" // NOLINT |
2416 #undef FDCT32x32_2D | 2418 #undef FDCT32x32_2D |
2417 #undef FDCT32x32_HIGH_PRECISION | 2419 #undef FDCT32x32_HIGH_PRECISION |
2418 | 2420 |
2419 #define FDCT32x32_2D vp9_highbd_fdct32x32_sse2 | 2421 #define FDCT32x32_2D vp9_highbd_fdct32x32_sse2 |
2420 #define FDCT32x32_HIGH_PRECISION 1 | 2422 #define FDCT32x32_HIGH_PRECISION 1 |
2421 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT | 2423 #include "vp9/encoder/x86/vp9_dct32x32_sse2_impl.h" // NOLINT |
2422 #undef FDCT32x32_2D | 2424 #undef FDCT32x32_2D |
2423 #undef FDCT32x32_HIGH_PRECISION | 2425 #undef FDCT32x32_HIGH_PRECISION |
2424 | 2426 |
2425 #undef DCT_HIGH_BIT_DEPTH | 2427 #undef DCT_HIGH_BIT_DEPTH |
2426 | 2428 |
2427 #endif // CONFIG_VP9_HIGHBITDEPTH | 2429 #endif // CONFIG_VP9_HIGHBITDEPTH |
OLD | NEW |