OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include <assert.h> | 11 #include <assert.h> |
12 #include <emmintrin.h> // SSE2 | 12 #include <emmintrin.h> // SSE2 |
13 #include "./vpx_config.h" | 13 #include "./vpx_config.h" |
14 #include "vpx/vpx_integer.h" | 14 #include "vpx/vpx_integer.h" |
15 #include "vp9/common/vp9_common.h" | 15 #include "vp9/common/vp9_common.h" |
16 #include "vp9/common/vp9_idct.h" | 16 #include "vp9/common/vp9_idct.h" |
17 | 17 |
| 18 #define RECON_AND_STORE4X4(dest, in_x) \ |
| 19 { \ |
| 20 __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ |
| 21 d0 = _mm_unpacklo_epi8(d0, zero); \ |
| 22 d0 = _mm_add_epi16(in_x, d0); \ |
| 23 d0 = _mm_packus_epi16(d0, d0); \ |
| 24 *(int *)dest = _mm_cvtsi128_si32(d0); \ |
| 25 dest += stride; \ |
| 26 } |
| 27 |
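For readers new to the intrinsics, a minimal scalar sketch of what one
invocation of the hoisted macro does (recon_row4 is a hypothetical name;
clip_pixel from vp9_common.h provides the same saturation that
_mm_packus_epi16 performs in the vector version):

  /* Read 4 destination bytes, add the low 4 lanes of the 16-bit residual,
   * saturate back to [0, 255], and write the 4 bytes back. The macro then
   * advances dest by one stride. */
  static void recon_row4(uint8_t *dest, const int16_t *residual) {
    int i;
    for (i = 0; i < 4; ++i)
      dest[i] = clip_pixel(dest[i] + residual[i]);
  }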
18 void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { | 28 void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
19 const __m128i zero = _mm_setzero_si128(); | 29 const __m128i zero = _mm_setzero_si128(); |
20 const __m128i eight = _mm_set1_epi16(8); | 30 const __m128i eight = _mm_set1_epi16(8); |
21 const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64, | 31 const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64, |
22 (int16_t)cospi_16_64, (int16_t)-cospi_16_64, | 32 (int16_t)cospi_16_64, (int16_t)-cospi_16_64, |
23 (int16_t)cospi_24_64, (int16_t)-cospi_8_64, | 33 (int16_t)cospi_24_64, (int16_t)-cospi_8_64, |
24 (int16_t)cospi_8_64, (int16_t)cospi_24_64); | 34 (int16_t)cospi_8_64, (int16_t)cospi_24_64); |
25 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 35 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
26 __m128i input0, input1, input2, input3; | 36 __m128i input0, input1, input2, input3; |
27 | 37 |
28 // Rows | 38 // Rows |
29 input0 = _mm_loadl_epi64((const __m128i *)input); | 39 input0 = _mm_load_si128((const __m128i *)input); |
30 input1 = _mm_loadl_epi64((const __m128i *)(input + 4)); | 40 input2 = _mm_load_si128((const __m128i *)(input + 8)); |
31 input2 = _mm_loadl_epi64((const __m128i *)(input + 8)); | |
32 input3 = _mm_loadl_epi64((const __m128i *)(input + 12)); | |
33 | 41 |
34 // Construct i3, i1, i3, i1, i2, i0, i2, i0 | 42 // Construct i3, i1, i3, i1, i2, i0, i2, i0 |
35 input0 = _mm_shufflelo_epi16(input0, 0xd8); | 43 input0 = _mm_shufflelo_epi16(input0, 0xd8); |
36 input1 = _mm_shufflelo_epi16(input1, 0xd8); | 44 input0 = _mm_shufflehi_epi16(input0, 0xd8); |
37 input2 = _mm_shufflelo_epi16(input2, 0xd8); | 45 input2 = _mm_shufflelo_epi16(input2, 0xd8); |
38 input3 = _mm_shufflelo_epi16(input3, 0xd8); | 46 input2 = _mm_shufflehi_epi16(input2, 0xd8); |
39 | 47 |
| 48 input1 = _mm_unpackhi_epi32(input0, input0); |
40 input0 = _mm_unpacklo_epi32(input0, input0); | 49 input0 = _mm_unpacklo_epi32(input0, input0); |
41 input1 = _mm_unpacklo_epi32(input1, input1); | 50 input3 = _mm_unpackhi_epi32(input2, input2); |
42 input2 = _mm_unpacklo_epi32(input2, input2); | 51 input2 = _mm_unpacklo_epi32(input2, input2); |
43 input3 = _mm_unpacklo_epi32(input3, input3); | |
44 | 52 |
45 // Stage 1 | 53 // Stage 1 |
46 input0 = _mm_madd_epi16(input0, cst); | 54 input0 = _mm_madd_epi16(input0, cst); |
47 input1 = _mm_madd_epi16(input1, cst); | 55 input1 = _mm_madd_epi16(input1, cst); |
48 input2 = _mm_madd_epi16(input2, cst); | 56 input2 = _mm_madd_epi16(input2, cst); |
49 input3 = _mm_madd_epi16(input3, cst); | 57 input3 = _mm_madd_epi16(input3, cst); |
50 | 58 |
51 input0 = _mm_add_epi32(input0, rounding); | 59 input0 = _mm_add_epi32(input0, rounding); |
52 input1 = _mm_add_epi32(input1, rounding); | 60 input1 = _mm_add_epi32(input1, rounding); |
53 input2 = _mm_add_epi32(input2, rounding); | 61 input2 = _mm_add_epi32(input2, rounding); |
54 input3 = _mm_add_epi32(input3, rounding); | 62 input3 = _mm_add_epi32(input3, rounding); |
55 | 63 |
56 input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); | 64 input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); |
57 input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); | 65 input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); |
58 input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); | 66 input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); |
59 input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); | 67 input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); |
60 | 68 |
61 // Stage 2 | 69 // Stage 2 |
62 input0 = _mm_packs_epi32(input0, zero); | 70 input0 = _mm_packs_epi32(input0, input1); |
63 input1 = _mm_packs_epi32(input1, zero); | 71 input1 = _mm_packs_epi32(input2, input3); |
64 input2 = _mm_packs_epi32(input2, zero); | |
65 input3 = _mm_packs_epi32(input3, zero); | |
66 | 72 |
67 // Transpose | 73 // Transpose |
68 input1 = _mm_unpacklo_epi16(input0, input1); | 74 input2 = _mm_unpacklo_epi16(input0, input1); |
69 input3 = _mm_unpacklo_epi16(input2, input3); | 75 input3 = _mm_unpackhi_epi16(input0, input1); |
70 input0 = _mm_unpacklo_epi32(input1, input3); | 76 input0 = _mm_unpacklo_epi32(input2, input3); |
71 input1 = _mm_unpackhi_epi32(input1, input3); | 77 input1 = _mm_unpackhi_epi32(input2, input3); |
72 | 78 |
73 // Switch columns 2 and 3, and then we get: | 79 // Switch columns 2 and 3, and then we get:
74 // input2: column 1, column 0; input3: column 2, column 3. | 80 // input2: column 1, column 0; input3: column 2, column 3.
75 input1 = _mm_shuffle_epi32(input1, 0x4e); | 81 input1 = _mm_shuffle_epi32(input1, 0x4e); |
76 input2 = _mm_add_epi16(input0, input1); | 82 input2 = _mm_add_epi16(input0, input1); |
77 input3 = _mm_sub_epi16(input0, input1); | 83 input3 = _mm_sub_epi16(input0, input1); |
78 | 84 |
79 // Columns | 85 // Columns |
80 // Construct i3, i1, i3, i1, i2, i0, i2, i0 | 86 // Construct i3, i1, i3, i1, i2, i0, i2, i0 |
81 input0 = _mm_shufflelo_epi16(input2, 0xd8); | 87 input0 = _mm_unpacklo_epi32(input2, input2); |
82 input1 = _mm_shufflehi_epi16(input2, 0xd8); | 88 input1 = _mm_unpackhi_epi32(input2, input2); |
83 input2 = _mm_shufflehi_epi16(input3, 0xd8); | 89 input2 = _mm_unpackhi_epi32(input3, input3); |
84 input3 = _mm_shufflelo_epi16(input3, 0xd8); | |
85 | |
86 input0 = _mm_unpacklo_epi32(input0, input0); | |
87 input1 = _mm_unpackhi_epi32(input1, input1); | |
88 input2 = _mm_unpackhi_epi32(input2, input2); | |
89 input3 = _mm_unpacklo_epi32(input3, input3); | 90 input3 = _mm_unpacklo_epi32(input3, input3); |
90 | 91 |
91 // Stage 1 | 92 // Stage 1 |
92 input0 = _mm_madd_epi16(input0, cst); | 93 input0 = _mm_madd_epi16(input0, cst); |
93 input1 = _mm_madd_epi16(input1, cst); | 94 input1 = _mm_madd_epi16(input1, cst); |
94 input2 = _mm_madd_epi16(input2, cst); | 95 input2 = _mm_madd_epi16(input2, cst); |
95 input3 = _mm_madd_epi16(input3, cst); | 96 input3 = _mm_madd_epi16(input3, cst); |
96 | 97 |
97 input0 = _mm_add_epi32(input0, rounding); | 98 input0 = _mm_add_epi32(input0, rounding); |
98 input1 = _mm_add_epi32(input1, rounding); | 99 input1 = _mm_add_epi32(input1, rounding); |
99 input2 = _mm_add_epi32(input2, rounding); | 100 input2 = _mm_add_epi32(input2, rounding); |
100 input3 = _mm_add_epi32(input3, rounding); | 101 input3 = _mm_add_epi32(input3, rounding); |
101 | 102 |
102 input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); | 103 input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); |
103 input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); | 104 input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); |
104 input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); | 105 input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); |
105 input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); | 106 input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); |
106 | 107 |
107 // Stage 2 | 108 // Stage 2 |
108 input0 = _mm_packs_epi32(input0, zero); | 109 input0 = _mm_packs_epi32(input0, input2); |
109 input1 = _mm_packs_epi32(input1, zero); | 110 input1 = _mm_packs_epi32(input1, input3); |
110 input2 = _mm_packs_epi32(input2, zero); | |
111 input3 = _mm_packs_epi32(input3, zero); | |
112 | 111 |
113 // Transpose | 112 // Transpose |
114 input1 = _mm_unpacklo_epi16(input0, input1); | 113 input2 = _mm_unpacklo_epi16(input0, input1); |
115 input3 = _mm_unpacklo_epi16(input2, input3); | 114 input3 = _mm_unpackhi_epi16(input0, input1); |
116 input0 = _mm_unpacklo_epi32(input1, input3); | 115 input0 = _mm_unpacklo_epi32(input2, input3); |
117 input1 = _mm_unpackhi_epi32(input1, input3); | 116 input1 = _mm_unpackhi_epi32(input2, input3); |
118 | 117 |
119 // Switch columns 2 and 3, and then we get: | 118 // Switch columns 2 and 3, and then we get:
120 // input2: column 1, column 0; input3: column 2, column 3. | 119 // input2: column 1, column 0; input3: column 2, column 3.
121 input1 = _mm_shuffle_epi32(input1, 0x4e); | 120 input1 = _mm_shuffle_epi32(input1, 0x4e); |
122 input2 = _mm_add_epi16(input0, input1); | 121 input2 = _mm_add_epi16(input0, input1); |
123 input3 = _mm_sub_epi16(input0, input1); | 122 input3 = _mm_sub_epi16(input0, input1); |
124 | 123 |
125 // Final round and shift | 124 // Final round and shift |
126 input2 = _mm_add_epi16(input2, eight); | 125 input2 = _mm_add_epi16(input2, eight); |
127 input3 = _mm_add_epi16(input3, eight); | 126 input3 = _mm_add_epi16(input3, eight); |
128 | 127 |
129 input2 = _mm_srai_epi16(input2, 4); | 128 input2 = _mm_srai_epi16(input2, 4); |
130 input3 = _mm_srai_epi16(input3, 4); | 129 input3 = _mm_srai_epi16(input3, 4); |
131 | 130 |
132 #define RECON_AND_STORE4X4(dest, in_x) \ | 131 // Reconstruction and Store |
133 { \ | 132 { |
134 __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ | 133 __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); |
135 d0 = _mm_unpacklo_epi8(d0, zero); \ | 134 __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); |
136 d0 = _mm_add_epi16(in_x, d0); \ | 135 d0 = _mm_unpacklo_epi32(d0, |
137 d0 = _mm_packus_epi16(d0, d0); \ | 136 _mm_cvtsi32_si128(*(const int *) (dest + stride))); |
138 *(int *)dest = _mm_cvtsi128_si32(d0); \ | 137 d2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128( |
139 dest += stride; \ | 138 *(const int *) (dest + stride * 3)), d2); |
| 139 d0 = _mm_unpacklo_epi8(d0, zero); |
| 140 d2 = _mm_unpacklo_epi8(d2, zero); |
| 141 d0 = _mm_add_epi16(d0, input2); |
| 142 d2 = _mm_add_epi16(d2, input3); |
| 143 d0 = _mm_packus_epi16(d0, d2); |
| 144 // store row 0
| 145 *(int *)dest = _mm_cvtsi128_si32(d0); |
| 146 // store row 1
| 147 d0 = _mm_srli_si128(d0, 4); |
| 148 *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); |
| 149 // store row 3
| 150 d0 = _mm_srli_si128(d0, 4); |
| 151 *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); |
| 152 // store row 2
| 153 d0 = _mm_srli_si128(d0, 4); |
| 154 *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); |
140 } | 155 } |
141 | |
142 input0 = _mm_srli_si128(input2, 8); | |
143 input1 = _mm_srli_si128(input3, 8); | |
144 | |
145 RECON_AND_STORE4X4(dest, input2); | |
146 RECON_AND_STORE4X4(dest, input0); | |
147 RECON_AND_STORE4X4(dest, input1); | |
148 RECON_AND_STORE4X4(dest, input3); | |
149 } | 156 } |
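The batched store above replaces four RECON_AND_STORE4X4 invocations with a
single gather, one _mm_packus_epi16, and four 4-byte scatters. The row order
is the subtle part; a lane sketch of why the stores hit rows 0, 1, 3, 2:

  /* After the final butterfly, input2 = rows 0,1 and input3 = rows 3,2
   * (the 0x4e shuffle left input1 with its 64-bit halves swapped). d0
   * gathers dest rows 0,1 and d2 gathers dest rows 3,2 in matching lane
   * order, so the packed result is d0 = row0 | row1 | row3 | row2 and the
   * stores go to dest, dest + stride, dest + stride * 3, dest + stride * 2. */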
150 | 157 |
151 void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { | 158 void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
152 __m128i dc_value; | 159 __m128i dc_value; |
153 const __m128i zero = _mm_setzero_si128(); | 160 const __m128i zero = _mm_setzero_si128(); |
154 int a; | 161 int a; |
155 | 162 |
156 a = dct_const_round_shift(input[0] * cospi_16_64); | 163 a = dct_const_round_shift(input[0] * cospi_16_64); |
157 a = dct_const_round_shift(a * cospi_16_64); | 164 a = dct_const_round_shift(a * cospi_16_64); |
158 a = ROUND_POWER_OF_TWO(a, 4); | 165 a = ROUND_POWER_OF_TWO(a, 4); |
159 | 166 |
160 dc_value = _mm_set1_epi16(a); | 167 dc_value = _mm_set1_epi16(a); |
161 | 168 |
162 RECON_AND_STORE4X4(dest, dc_value); | 169 RECON_AND_STORE4X4(dest, dc_value); |
163 RECON_AND_STORE4X4(dest, dc_value); | 170 RECON_AND_STORE4X4(dest, dc_value); |
164 RECON_AND_STORE4X4(dest, dc_value); | 171 RECON_AND_STORE4X4(dest, dc_value); |
165 RECON_AND_STORE4X4(dest, dc_value); | 172 RECON_AND_STORE4X4(dest, dc_value); |
166 } | 173 } |
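Since only input[0] is nonzero on this path, both 1-D passes collapse to a
multiply by cospi_16_64 with DCT rounding, and every output pixel receives
the same offset. A scalar sketch of the whole function, assuming clip_pixel
from vp9_common.h for the saturation:

  static void idct4x4_1_add_sketch(const int16_t *input, uint8_t *dest,
                                   int stride) {
    int r, c;
    int a = dct_const_round_shift(input[0] * cospi_16_64);
    a = dct_const_round_shift(a * cospi_16_64);
    a = ROUND_POWER_OF_TWO(a, 4);  /* same (x + 8) >> 4 as the full path */
    for (r = 0; r < 4; ++r) {
      for (c = 0; c < 4; ++c)
        dest[c] = clip_pixel(dest[c] + a);
      dest += stride;
    }
  }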
167 | 174 |
168 static INLINE void transpose_4x4(__m128i *res) { | 175 static INLINE void transpose_4x4(__m128i *res) { |
169 const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); | 176 const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); |
170 const __m128i tr0_1 = _mm_unpacklo_epi16(res[2], res[3]); | 177 const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); |
171 res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1); | |
172 res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1); | |
173 | 178 |
174 res[1] = _mm_unpackhi_epi64(res[0], res[0]); | 179 res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1); |
175 res[3] = _mm_unpackhi_epi64(res[2], res[2]); | 180 res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1); |
176 } | 181 } |
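The rewritten transpose keeps the whole 4x4 tile in two registers: res[0]
carries rows 0-1 and res[1] rows 2-3, four 16-bit lanes per row. A lane
trace (a..d are rows 0..3, digits are column indices) confirms the result:

  /* entry:  res[0] = a0 a1 a2 a3 b0 b1 b2 b3
   *         res[1] = c0 c1 c2 c3 d0 d1 d2 d3
   * tr0_0 = a0 c0 a1 c1 a2 c2 a3 c3     (unpacklo res[0], res[1])
   * tr0_1 = b0 d0 b1 d1 b2 d2 b3 d3     (unpackhi res[0], res[1])
   * exit:   res[0] = a0 b0 c0 d0 a1 b1 c1 d1   (transposed rows 0, 1)
   *         res[1] = a2 b2 c2 d2 a3 b3 c3 d3   (transposed rows 2, 3) */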
177 | 182 |
178 static void idct4_1d_sse2(__m128i *in) { | 183 static void idct4_1d_sse2(__m128i *in) { |
179 const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); | 184 const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); |
180 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 185 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
181 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); | 186 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
182 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); | 187 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); |
183 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); | 188 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); |
184 __m128i u[8], v[8]; | 189 __m128i u[8], v[8]; |
185 | 190 |
186 transpose_4x4(in); | 191 transpose_4x4(in); |
187 // stage 1 | 192 // stage 1 |
188 u[0] = _mm_unpacklo_epi16(in[0], in[2]); | 193 u[0] = _mm_unpacklo_epi16(in[0], in[1]); |
189 u[1] = _mm_unpacklo_epi16(in[1], in[3]); | 194 u[1] = _mm_unpackhi_epi16(in[0], in[1]); |
190 v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); | 195 v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); |
191 v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16); | 196 v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16); |
192 v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08); | 197 v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08); |
193 v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24); | 198 v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24); |
194 | 199 |
195 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); | 200 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); |
196 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); | 201 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); |
197 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); | 202 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); |
198 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); | 203 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); |
199 | 204 |
200 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); | 205 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); |
201 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); | 206 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); |
202 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); | 207 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); |
203 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); | 208 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); |
204 | 209 |
205 u[0] = _mm_packs_epi32(v[0], v[2]); | 210 u[0] = _mm_packs_epi32(v[0], v[1]); |
206 u[1] = _mm_packs_epi32(v[1], v[3]); | 211 u[1] = _mm_packs_epi32(v[3], v[2]); |
207 u[2] = _mm_unpackhi_epi64(u[0], u[0]); | |
208 u[3] = _mm_unpackhi_epi64(u[1], u[1]); | |
209 | 212 |
210 // stage 2 | 213 // stage 2 |
211 in[0] = _mm_add_epi16(u[0], u[3]); | 214 in[0] = _mm_add_epi16(u[0], u[1]); |
212 in[1] = _mm_add_epi16(u[1], u[2]); | 215 in[1] = _mm_sub_epi16(u[0], u[1]); |
213 in[2] = _mm_sub_epi16(u[1], u[2]); | 216 in[1] = _mm_shuffle_epi32(in[1], 0x4E); |
214 in[3] = _mm_sub_epi16(u[0], u[3]); | |
215 } | 217 } |
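The madd constant pairs map one-to-one onto the scalar 4-point IDCT
butterflies. A sketch of that reference, using only constants and helpers
already included here (idct4_1d_sketch is a hypothetical name following the
shape of the C code in vp9_idct.c):

  static void idct4_1d_sketch(const int16_t *input, int16_t *output) {
    int16_t step[4];
    /* stage 1: rotate the even pair (in0, in2) and the odd pair (in1, in3) */
    step[0] = dct_const_round_shift((input[0] + input[2]) * cospi_16_64);
    step[1] = dct_const_round_shift((input[0] - input[2]) * cospi_16_64);
    step[2] = dct_const_round_shift(input[1] * cospi_24_64 -
                                    input[3] * cospi_8_64);
    step[3] = dct_const_round_shift(input[1] * cospi_8_64 +
                                    input[3] * cospi_24_64);
    /* stage 2: final butterflies; in the SSE2 code the 0x4E shuffle puts
     * the (out2, out3) half back in row order after the subtract */
    output[0] = step[0] + step[3];
    output[1] = step[1] + step[2];
    output[2] = step[1] - step[2];
    output[3] = step[0] - step[3];
  }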
216 | 218 |
217 static void iadst4_1d_sse2(__m128i *in) { | 219 static void iadst4_1d_sse2(__m128i *in) { |
218 const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9); | 220 const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9); |
219 const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9); | 221 const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9); |
220 const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9); | 222 const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9); |
221 const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9); | 223 const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9); |
222 const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9); | 224 const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9); |
223 const __m128i kZero = _mm_set1_epi16(0); | 225 const __m128i kZero = _mm_set1_epi16(0); |
224 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); | 226 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); |
225 __m128i u[8], v[8], in7; | 227 __m128i u[8], v[8], in7; |
226 | 228 |
227 transpose_4x4(in); | 229 transpose_4x4(in); |
228 in7 = _mm_add_epi16(in[0], in[3]); | 230 in7 = _mm_srli_si128(in[1], 8); |
229 in7 = _mm_sub_epi16(in7, in[2]); | 231 in7 = _mm_add_epi16(in7, in[0]); |
| 232 in7 = _mm_sub_epi16(in7, in[1]); |
230 | 233 |
231 u[0] = _mm_unpacklo_epi16(in[0], in[2]); | 234 u[0] = _mm_unpacklo_epi16(in[0], in[1]); |
232 u[1] = _mm_unpacklo_epi16(in[1], in[3]); | 235 u[1] = _mm_unpackhi_epi16(in[0], in[1]); |
233 u[2] = _mm_unpacklo_epi16(in7, kZero); | 236 u[2] = _mm_unpacklo_epi16(in7, kZero); |
234 u[3] = _mm_unpacklo_epi16(in[1], kZero); | 237 u[3] = _mm_unpackhi_epi16(in[0], kZero); |
235 | 238 |
236 v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3 | 239 v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3 |
237 v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5 | 240 v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5 |
238 v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2 | 241 v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2 |
239 v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4 | 242 v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4 |
240 v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6 | 243 v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6 |
241 v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2 | 244 v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2 |
242 | 245 |
243 u[0] = _mm_add_epi32(v[0], v[1]); | 246 u[0] = _mm_add_epi32(v[0], v[1]); |
244 u[1] = _mm_add_epi32(v[3], v[4]); | 247 u[1] = _mm_add_epi32(v[3], v[4]); |
245 u[2] = v[2]; | 248 u[2] = v[2]; |
246 u[3] = _mm_add_epi32(u[0], u[1]); | 249 u[3] = _mm_add_epi32(u[0], u[1]); |
247 u[4] = _mm_slli_epi32(v[5], 2); | 250 u[4] = _mm_slli_epi32(v[5], 2); |
248 u[5] = _mm_add_epi32(u[3], v[5]); | 251 u[5] = _mm_add_epi32(u[3], v[5]); |
249 u[6] = _mm_sub_epi32(u[5], u[4]); | 252 u[6] = _mm_sub_epi32(u[5], u[4]); |
250 | 253 |
251 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); | 254 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); |
252 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); | 255 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); |
253 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); | 256 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); |
254 v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); | 257 v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); |
255 | 258 |
256 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); | 259 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); |
257 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); | 260 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); |
258 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); | 261 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); |
259 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); | 262 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); |
260 | 263 |
261 in[0] = _mm_packs_epi32(u[0], u[2]); | 264 in[0] = _mm_packs_epi32(u[0], u[1]); |
262 in[1] = _mm_packs_epi32(u[1], u[3]); | 265 in[1] = _mm_packs_epi32(u[2], u[3]); |
263 in[2] = _mm_unpackhi_epi64(in[0], in[0]); | |
264 in[3] = _mm_unpackhi_epi64(in[1], in[1]); | |
265 } | 266 } |
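The in7 sequence at the top is the vector form of the IADST cross term
s7 = x0 - x2 + x3, computed for all four columns at once. A lane trace of
the low half (r0..r3 are the transposed rows, i.e. x0..x3 per column):

  /* _mm_srli_si128(in[1], 8)  ->  r3
   * + in[0]                   ->  r3 + r0
   * - in[1]                   ->  r3 + r0 - r2
   * Only the low four lanes are consumed (u[2] pairs in7 with kZero). */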
266 | 267 |
267 void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, | 268 void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, |
268 int tx_type) { | 269 int tx_type) { |
269 __m128i in[4]; | 270 __m128i in[2]; |
270 const __m128i zero = _mm_setzero_si128(); | 271 const __m128i zero = _mm_setzero_si128(); |
271 const __m128i eight = _mm_set1_epi16(8); | 272 const __m128i eight = _mm_set1_epi16(8); |
272 | 273 |
273 in[0] = _mm_loadl_epi64((const __m128i *)input); | 274 in[0] = _mm_loadu_si128((const __m128i *)(input));
274 in[1] = _mm_loadl_epi64((const __m128i *)(input + 4)); | 275 in[1] = _mm_loadu_si128((const __m128i *)(input + 8));
275 in[2] = _mm_loadl_epi64((const __m128i *)(input + 8)); | |
276 in[3] = _mm_loadl_epi64((const __m128i *)(input + 12)); | |
277 | 276 |
278 switch (tx_type) { | 277 switch (tx_type) { |
279 case 0: // DCT_DCT | 278 case 0: // DCT_DCT |
280 idct4_1d_sse2(in); | 279 idct4_1d_sse2(in); |
281 idct4_1d_sse2(in); | 280 idct4_1d_sse2(in); |
282 break; | 281 break; |
283 case 1: // ADST_DCT | 282 case 1: // ADST_DCT |
284 idct4_1d_sse2(in); | 283 idct4_1d_sse2(in); |
285 iadst4_1d_sse2(in); | 284 iadst4_1d_sse2(in); |
286 break; | 285 break; |
287 case 2: // DCT_ADST | 286 case 2: // DCT_ADST |
288 iadst4_1d_sse2(in); | 287 iadst4_1d_sse2(in); |
289 idct4_1d_sse2(in); | 288 idct4_1d_sse2(in); |
290 break; | 289 break; |
291 case 3: // ADST_ADST | 290 case 3: // ADST_ADST |
292 iadst4_1d_sse2(in); | 291 iadst4_1d_sse2(in); |
293 iadst4_1d_sse2(in); | 292 iadst4_1d_sse2(in); |
294 break; | 293 break; |
295 default: | 294 default: |
296 assert(0); | 295 assert(0); |
297 break; | 296 break; |
298 } | 297 } |
299 | 298 |
300 // Final round and shift | 299 // Final round and shift |
301 in[0] = _mm_add_epi16(in[0], eight); | 300 in[0] = _mm_add_epi16(in[0], eight); |
302 in[1] = _mm_add_epi16(in[1], eight); | 301 in[1] = _mm_add_epi16(in[1], eight); |
303 in[2] = _mm_add_epi16(in[2], eight); | |
304 in[3] = _mm_add_epi16(in[3], eight); | |
305 | 302 |
306 in[0] = _mm_srai_epi16(in[0], 4); | 303 in[0] = _mm_srai_epi16(in[0], 4); |
307 in[1] = _mm_srai_epi16(in[1], 4); | 304 in[1] = _mm_srai_epi16(in[1], 4); |
308 in[2] = _mm_srai_epi16(in[2], 4); | |
309 in[3] = _mm_srai_epi16(in[3], 4); | |
310 | 305 |
311 RECON_AND_STORE4X4(dest, in[0]); | 306 // Reconstruction and Store |
312 RECON_AND_STORE4X4(dest, in[1]); | 307 { |
313 RECON_AND_STORE4X4(dest, in[2]); | 308 __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); |
314 RECON_AND_STORE4X4(dest, in[3]); | 309 __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); |
| 310 d0 = _mm_unpacklo_epi32(d0, |
| 311 _mm_cvtsi32_si128(*(const int *) (dest + stride))); |
| 312 d2 = _mm_unpacklo_epi32(d2, _mm_cvtsi32_si128( |
| 313 *(const int *) (dest + stride * 3))); |
| 314 d0 = _mm_unpacklo_epi8(d0, zero); |
| 315 d2 = _mm_unpacklo_epi8(d2, zero); |
| 316 d0 = _mm_add_epi16(d0, in[0]); |
| 317 d2 = _mm_add_epi16(d2, in[1]); |
| 318 d0 = _mm_packus_epi16(d0, d2); |
| 319 // store result[0] |
| 320 *(int *)dest = _mm_cvtsi128_si32(d0); |
| 321 // store result[1] |
| 322 d0 = _mm_srli_si128(d0, 4); |
| 323 *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); |
| 324 // store result[2] |
| 325 d0 = _mm_srli_si128(d0, 4); |
| 326 *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); |
| 327 // store result[3] |
| 328 d0 = _mm_srli_si128(d0, 4); |
| 329 *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); |
| 330 } |
315 } | 331 } |
316 | 332 |
317 #define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \ | 333 #define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \ |
318 out0, out1, out2, out3, out4, out5, out6, out7) \ | 334 out0, out1, out2, out3, out4, out5, out6, out7) \ |
319 { \ | 335 { \ |
320 const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ | 336 const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ |
321 const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ | 337 const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ |
322 const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ | 338 const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ |
323 const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ | 339 const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ |
324 const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ | 340 const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ |
(...skipping 83 matching lines...)
408 tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ | 424 tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ |
409 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ | 425 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ |
410 tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ | 426 tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ |
411 \ | 427 \ |
412 res0 = _mm_packs_epi32(tmp0, tmp1); \ | 428 res0 = _mm_packs_epi32(tmp0, tmp1); \ |
413 res1 = _mm_packs_epi32(tmp2, tmp3); \ | 429 res1 = _mm_packs_epi32(tmp2, tmp3); \ |
414 res2 = _mm_packs_epi32(tmp4, tmp5); \ | 430 res2 = _mm_packs_epi32(tmp4, tmp5); \ |
415 res3 = _mm_packs_epi32(tmp6, tmp7); \ | 431 res3 = _mm_packs_epi32(tmp6, tmp7); \ |
416 } | 432 } |
417 | 433 |
| 434 #define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \ |
| 435 { \ |
| 436 tmp0 = _mm_madd_epi16(lo_0, cst0); \ |
| 437 tmp1 = _mm_madd_epi16(hi_0, cst0); \ |
| 438 tmp2 = _mm_madd_epi16(lo_0, cst1); \ |
| 439 tmp3 = _mm_madd_epi16(hi_0, cst1); \ |
| 440 \ |
| 441 tmp0 = _mm_add_epi32(tmp0, rounding); \ |
| 442 tmp1 = _mm_add_epi32(tmp1, rounding); \ |
| 443 tmp2 = _mm_add_epi32(tmp2, rounding); \ |
| 444 tmp3 = _mm_add_epi32(tmp3, rounding); \ |
| 445 \ |
| 446 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ |
| 447 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ |
| 448 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ |
| 449 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ |
| 450 \ |
| 451 res0 = _mm_packs_epi32(tmp0, tmp1); \ |
| 452 res1 = _mm_packs_epi32(tmp2, tmp3); \ |
| 453 } |
| 454 |
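MULTIPLICATION_AND_ADD_2 is the two-output variant of MULTIPLICATION_AND_ADD
for stages that need a single interleaved input pair; it assumes tmp0..tmp3
and rounding are in scope at the expansion site. Per output lane it computes
a rounded fixed-point rotation, sketched here (butterfly_lane is a
hypothetical helper):

  /* _mm_madd_epi16 forms a * c0 + b * c1 on each adjacent pair of 16-bit
   * lanes; the rounding add and arithmetic shift bring the 32-bit product
   * back to 16-bit precision. */
  static int16_t butterfly_lane(int16_t a, int16_t b, int16_t c0, int16_t c1) {
    return dct_const_round_shift(a * c0 + b * c1);
  }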
418 #define IDCT8_1D \ | 455 #define IDCT8_1D \ |
419 /* Stage1 */ \ | 456 /* Stage1 */ \ |
420 { \ | 457 { \ |
421 const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ | 458 const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ |
422 const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \ | 459 const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \ |
423 const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \ | 460 const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \ |
424 const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \ | 461 const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \ |
425 \ | 462 \ |
426 MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \ | 463 MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \ |
427 stg1_1, stg1_2, stg1_3, stp1_4, \ | 464 stg1_1, stg1_2, stg1_3, stp1_4, \ |
(...skipping 178 matching lines...)
606 res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); | 643 res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); |
607 res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); | 644 res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); |
608 res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); | 645 res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); |
609 res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); | 646 res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); |
610 res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); | 647 res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); |
611 res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); | 648 res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); |
612 res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); | 649 res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); |
613 res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); | 650 res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); |
614 } | 651 } |
615 | 652 |
| 653 static INLINE void array_transpose_4X8(__m128i *in, __m128i *out) {
| 654 const __m128i zero = _mm_setzero_si128(); |
| 655 const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); |
| 656 const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); |
| 657 const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); |
| 658 const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); |
| 659 |
| 660 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); |
| 661 const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); |
| 662 const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); |
| 663 const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); |
| 664 |
| 665 out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4); |
| 666 out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4); |
| 667 out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6); |
| 668 out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6); |
| 669 out[4] = out[5] = out[6] = out[7] = zero; |
| 670 } |
| 671 |
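array_transpose_4X8 serves the reduced-coefficient 16x16 path: each input
register carries one 4-wide row in its low 64 bits, the unpack ladder
produces the four transposed 8-lane rows, and out[4..7] are cleared since
the corresponding columns are known to be zero on that path. A shape sketch:

  /* in[k]  = rk0 rk1 rk2 rk3 xx xx xx xx   (k = 0..7, high half unused)
   * out[0] = r00 r10 r20 r30 r40 r50 r60 r70
   * out[1] = r01 r11 r21 r31 r41 r51 r61 r71
   * out[2] = r02 r12 r22 r32 r42 r52 r62 r72
   * out[3] = r03 r13 r23 r33 r43 r53 r63 r73
   * out[4..7] = 0 */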
616 static void idct8_1d_sse2(__m128i *in) { | 672 static void idct8_1d_sse2(__m128i *in) { |
617 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 673 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
618 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); | 674 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); |
619 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); | 675 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); |
620 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); | 676 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); |
621 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); | 677 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); |
622 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); | 678 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); |
623 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 679 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
624 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); | 680 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
625 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); | 681 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); |
(...skipping 469 matching lines...)
1095 RECON_AND_STORE(dest, in3); | 1151 RECON_AND_STORE(dest, in3); |
1096 RECON_AND_STORE(dest, in4); | 1152 RECON_AND_STORE(dest, in4); |
1097 RECON_AND_STORE(dest, in5); | 1153 RECON_AND_STORE(dest, in5); |
1098 RECON_AND_STORE(dest, in6); | 1154 RECON_AND_STORE(dest, in6); |
1099 RECON_AND_STORE(dest, in7); | 1155 RECON_AND_STORE(dest, in7); |
1100 } | 1156 } |
1101 | 1157 |
1102 #define IDCT16_1D \ | 1158 #define IDCT16_1D \ |
1103 /* Stage2 */ \ | 1159 /* Stage2 */ \ |
1104 { \ | 1160 { \ |
1105 const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \ | 1161 const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \ |
1106 const __m128i hi_1_15 = _mm_unpackhi_epi16(in1, in15); \ | 1162 const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \ |
1107 const __m128i lo_9_7 = _mm_unpacklo_epi16(in9, in7); \ | 1163 const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \ |
1108 const __m128i hi_9_7 = _mm_unpackhi_epi16(in9, in7); \ | 1164 const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \ |
1109 const __m128i lo_5_11 = _mm_unpacklo_epi16(in5, in11); \ | 1165 const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \ |
1110 const __m128i hi_5_11 = _mm_unpackhi_epi16(in5, in11); \ | 1166 const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \ |
1111 const __m128i lo_13_3 = _mm_unpacklo_epi16(in13, in3); \ | 1167 const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \ |
1112 const __m128i hi_13_3 = _mm_unpackhi_epi16(in13, in3); \ | 1168 const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \ |
1113 \ | 1169 \ |
1114 MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \ | 1170 MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \ |
1115 stg2_0, stg2_1, stg2_2, stg2_3, \ | 1171 stg2_0, stg2_1, stg2_2, stg2_3, \ |
1116 stp2_8, stp2_15, stp2_9, stp2_14) \ | 1172 stp2_8, stp2_15, stp2_9, stp2_14) \ |
1117 \ | 1173 \ |
1118 MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \ | 1174 MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \ |
1119 stg2_4, stg2_5, stg2_6, stg2_7, \ | 1175 stg2_4, stg2_5, stg2_6, stg2_7, \ |
1120 stp2_10, stp2_13, stp2_11, stp2_12) \ | 1176 stp2_10, stp2_13, stp2_11, stp2_12) \ |
1121 } \ | 1177 } \ |
1122 \ | 1178 \ |
1123 /* Stage3 */ \ | 1179 /* Stage3 */ \ |
1124 { \ | 1180 { \ |
1125 const __m128i lo_2_14 = _mm_unpacklo_epi16(in2, in14); \ | 1181 const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \ |
1126 const __m128i hi_2_14 = _mm_unpackhi_epi16(in2, in14); \ | 1182 const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \ |
1127 const __m128i lo_10_6 = _mm_unpacklo_epi16(in10, in6); \ | 1183 const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \ |
1128 const __m128i hi_10_6 = _mm_unpackhi_epi16(in10, in6); \ | 1184 const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \ |
1129 \ | 1185 \ |
1130 MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \ | 1186 MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \ |
1131 stg3_0, stg3_1, stg3_2, stg3_3, \ | 1187 stg3_0, stg3_1, stg3_2, stg3_3, \ |
1132 stp1_4, stp1_7, stp1_5, stp1_6) \ | 1188 stp1_4, stp1_7, stp1_5, stp1_6) \ |
1133 \ | 1189 \ |
1134 stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \ | 1190 stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \ |
1135 stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ | 1191 stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ |
1136 stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ | 1192 stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ |
1137 stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ | 1193 stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ |
1138 \ | 1194 \ |
1139 stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \ | 1195 stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \ |
1140 stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ | 1196 stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ |
1141 stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ | 1197 stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ |
1142 stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ | 1198 stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ |
1143 } \ | 1199 } \ |
1144 \ | 1200 \ |
1145 /* Stage4 */ \ | 1201 /* Stage4 */ \ |
1146 { \ | 1202 { \ |
1147 const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); \ | 1203 const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \ |
1148 const __m128i hi_0_8 = _mm_unpackhi_epi16(in0, in8); \ | 1204 const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \ |
1149 const __m128i lo_4_12 = _mm_unpacklo_epi16(in4, in12); \ | 1205 const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \ |
1150 const __m128i hi_4_12 = _mm_unpackhi_epi16(in4, in12); \ | 1206 const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \ |
1151 \ | 1207 \ |
1152 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ | 1208 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ |
1153 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ | 1209 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ |
1154 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ | 1210 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ |
1155 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ | 1211 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ |
1156 \ | 1212 \ |
1157 MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \ | 1213 MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \ |
1158 stg4_0, stg4_1, stg4_2, stg4_3, \ | 1214 stg4_0, stg4_1, stg4_2, stg4_3, \ |
1159 stp2_0, stp2_1, stp2_2, stp2_3) \ | 1215 stp2_0, stp2_1, stp2_2, stp2_3) \ |
1160 \ | 1216 \ |
(...skipping 91 matching lines...)
1252 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 1308 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
1253 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); | 1309 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
1254 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); | 1310 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); |
1255 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); | 1311 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
1256 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); | 1312 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); |
1257 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); | 1313 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); |
1258 const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); | 1314 const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
1259 | 1315 |
1260 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); | 1316 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); |
1261 | 1317 |
1262 __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero, | 1318 __m128i in[16], l[16], r[16], *curr1; |
1263 in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero, | |
1264 in10 = zero, in11 = zero, in12 = zero, in13 = zero, | |
1265 in14 = zero, in15 = zero; | |
1266 __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero, | |
1267 l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero, | |
1268 l12 = zero, l13 = zero, l14 = zero, l15 = zero; | |
1269 __m128i r0 = zero, r1 = zero, r2 = zero, r3 = zero, r4 = zero, r5 = zero, | |
1270 r6 = zero, r7 = zero, r8 = zero, r9 = zero, r10 = zero, r11 = zero, | |
1271 r12 = zero, r13 = zero, r14 = zero, r15 = zero; | |
1272 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, | 1319 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, |
1273 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, | 1320 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, |
1274 stp1_8_0, stp1_12_0; | 1321 stp1_8_0, stp1_12_0; |
1275 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, | 1322 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, |
1276 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; | 1323 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; |
1277 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; | 1324 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
1278 int i; | 1325 int i; |
1279 | 1326 |
1280 // We work on a 8x16 block each time, and loop 4 times for 2-D 16x16 idct. | 1327 curr1 = l; |
1281 for (i = 0; i < 4; i++) { | 1328 for (i = 0; i < 2; i++) { |
1282 // 1-D idct | 1329 // 1-D idct |
1283 if (i < 2) { | |
1284 if (i == 1) input += 128; | |
1285 | 1330 |
1286 // Load input data. | 1331 // Load input data. |
1287 in0 = _mm_load_si128((const __m128i *)input); | 1332 in[0] = _mm_load_si128((const __m128i *)input); |
1288 in8 = _mm_load_si128((const __m128i *)(input + 8 * 1)); | 1333 in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1)); |
1289 in1 = _mm_load_si128((const __m128i *)(input + 8 * 2)); | 1334 in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2)); |
1290 in9 = _mm_load_si128((const __m128i *)(input + 8 * 3)); | 1335 in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3)); |
1291 in2 = _mm_load_si128((const __m128i *)(input + 8 * 4)); | 1336 in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4)); |
1292 in10 = _mm_load_si128((const __m128i *)(input + 8 * 5)); | 1337 in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5)); |
1293 in3 = _mm_load_si128((const __m128i *)(input + 8 * 6)); | 1338 in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6)); |
1294 in11 = _mm_load_si128((const __m128i *)(input + 8 * 7)); | 1339 in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7)); |
1295 in4 = _mm_load_si128((const __m128i *)(input + 8 * 8)); | 1340 in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8)); |
1296 in12 = _mm_load_si128((const __m128i *)(input + 8 * 9)); | 1341 in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9)); |
1297 in5 = _mm_load_si128((const __m128i *)(input + 8 * 10)); | 1342 in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10)); |
1298 in13 = _mm_load_si128((const __m128i *)(input + 8 * 11)); | 1343 in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11)); |
1299 in6 = _mm_load_si128((const __m128i *)(input + 8 * 12)); | 1344 in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12)); |
1300 in14 = _mm_load_si128((const __m128i *)(input + 8 * 13)); | 1345 in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13)); |
1301 in7 = _mm_load_si128((const __m128i *)(input + 8 * 14)); | 1346 in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14)); |
1302 in15 = _mm_load_si128((const __m128i *)(input + 8 * 15)); | 1347 in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15)); |
1303 | 1348 |
1304 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, | 1349 array_transpose_8x8(in, in); |
1305 in4, in5, in6, in7); | 1350 array_transpose_8x8(in+8, in+8); |
1306 TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, | |
1307 in10, in11, in12, in13, in14, in15); | |
1308 } | |
1309 | 1351 |
1310 if (i == 2) { | 1352 IDCT16_1D |
1311 TRANSPOSE_8X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4, | |
1312 in5, in6, in7); | |
1313 TRANSPOSE_8X8(r0, r1, r2, r3, r4, r5, r6, r7, in8, in9, in10, in11, in12, | |
1314 in13, in14, in15); | |
1315 } | |
1316 | 1353 |
1317 if (i == 3) { | 1354 // Stage7 |
1318 TRANSPOSE_8X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3, | 1355 curr1[0] = _mm_add_epi16(stp2_0, stp1_15); |
1319 in4, in5, in6, in7); | 1356 curr1[1] = _mm_add_epi16(stp2_1, stp1_14); |
1320 TRANSPOSE_8X8(r8, r9, r10, r11, r12, r13, r14, r15, in8, in9, in10, in11, | 1357 curr1[2] = _mm_add_epi16(stp2_2, stp2_13); |
1321 in12, in13, in14, in15); | 1358 curr1[3] = _mm_add_epi16(stp2_3, stp2_12); |
1322 } | 1359 curr1[4] = _mm_add_epi16(stp2_4, stp2_11); |
| 1360 curr1[5] = _mm_add_epi16(stp2_5, stp2_10); |
| 1361 curr1[6] = _mm_add_epi16(stp2_6, stp1_9); |
| 1362 curr1[7] = _mm_add_epi16(stp2_7, stp1_8); |
| 1363 curr1[8] = _mm_sub_epi16(stp2_7, stp1_8); |
| 1364 curr1[9] = _mm_sub_epi16(stp2_6, stp1_9); |
| 1365 curr1[10] = _mm_sub_epi16(stp2_5, stp2_10); |
| 1366 curr1[11] = _mm_sub_epi16(stp2_4, stp2_11); |
| 1367 curr1[12] = _mm_sub_epi16(stp2_3, stp2_12); |
| 1368 curr1[13] = _mm_sub_epi16(stp2_2, stp2_13); |
| 1369 curr1[14] = _mm_sub_epi16(stp2_1, stp1_14); |
| 1370 curr1[15] = _mm_sub_epi16(stp2_0, stp1_15); |
1323 | 1371 |
1324 IDCT16_1D | 1372 curr1 = r; |
| 1373 input += 128; |
| 1374 } |
| 1375 for (i = 0; i < 2; i++) { |
| 1376 // 1-D idct |
| 1377 array_transpose_8x8(l+i*8, in); |
| 1378 array_transpose_8x8(r+i*8, in+8); |
1325 | 1379 |
1326 // Stage7 | 1380 IDCT16_1D |
1327 if (i == 0) { | 1381 |
1328 // Left 8x16 | |
1329 l0 = _mm_add_epi16(stp2_0, stp1_15); | |
1330 l1 = _mm_add_epi16(stp2_1, stp1_14); | |
1331 l2 = _mm_add_epi16(stp2_2, stp2_13); | |
1332 l3 = _mm_add_epi16(stp2_3, stp2_12); | |
1333 l4 = _mm_add_epi16(stp2_4, stp2_11); | |
1334 l5 = _mm_add_epi16(stp2_5, stp2_10); | |
1335 l6 = _mm_add_epi16(stp2_6, stp1_9); | |
1336 l7 = _mm_add_epi16(stp2_7, stp1_8); | |
1337 l8 = _mm_sub_epi16(stp2_7, stp1_8); | |
1338 l9 = _mm_sub_epi16(stp2_6, stp1_9); | |
1339 l10 = _mm_sub_epi16(stp2_5, stp2_10); | |
1340 l11 = _mm_sub_epi16(stp2_4, stp2_11); | |
1341 l12 = _mm_sub_epi16(stp2_3, stp2_12); | |
1342 l13 = _mm_sub_epi16(stp2_2, stp2_13); | |
1343 l14 = _mm_sub_epi16(stp2_1, stp1_14); | |
1344 l15 = _mm_sub_epi16(stp2_0, stp1_15); | |
1345 } else if (i == 1) { | |
1346 // Right 8x16 | |
1347 r0 = _mm_add_epi16(stp2_0, stp1_15); | |
1348 r1 = _mm_add_epi16(stp2_1, stp1_14); | |
1349 r2 = _mm_add_epi16(stp2_2, stp2_13); | |
1350 r3 = _mm_add_epi16(stp2_3, stp2_12); | |
1351 r4 = _mm_add_epi16(stp2_4, stp2_11); | |
1352 r5 = _mm_add_epi16(stp2_5, stp2_10); | |
1353 r6 = _mm_add_epi16(stp2_6, stp1_9); | |
1354 r7 = _mm_add_epi16(stp2_7, stp1_8); | |
1355 r8 = _mm_sub_epi16(stp2_7, stp1_8); | |
1356 r9 = _mm_sub_epi16(stp2_6, stp1_9); | |
1357 r10 = _mm_sub_epi16(stp2_5, stp2_10); | |
1358 r11 = _mm_sub_epi16(stp2_4, stp2_11); | |
1359 r12 = _mm_sub_epi16(stp2_3, stp2_12); | |
1360 r13 = _mm_sub_epi16(stp2_2, stp2_13); | |
1361 r14 = _mm_sub_epi16(stp2_1, stp1_14); | |
1362 r15 = _mm_sub_epi16(stp2_0, stp1_15); | |
1363 } else { | |
1364 // 2-D | 1382 // 2-D |
1365 in0 = _mm_add_epi16(stp2_0, stp1_15); | 1383 in[0] = _mm_add_epi16(stp2_0, stp1_15); |
1366 in1 = _mm_add_epi16(stp2_1, stp1_14); | 1384 in[1] = _mm_add_epi16(stp2_1, stp1_14); |
1367 in2 = _mm_add_epi16(stp2_2, stp2_13); | 1385 in[2] = _mm_add_epi16(stp2_2, stp2_13); |
1368 in3 = _mm_add_epi16(stp2_3, stp2_12); | 1386 in[3] = _mm_add_epi16(stp2_3, stp2_12); |
1369 in4 = _mm_add_epi16(stp2_4, stp2_11); | 1387 in[4] = _mm_add_epi16(stp2_4, stp2_11); |
1370 in5 = _mm_add_epi16(stp2_5, stp2_10); | 1388 in[5] = _mm_add_epi16(stp2_5, stp2_10); |
1371 in6 = _mm_add_epi16(stp2_6, stp1_9); | 1389 in[6] = _mm_add_epi16(stp2_6, stp1_9); |
1372 in7 = _mm_add_epi16(stp2_7, stp1_8); | 1390 in[7] = _mm_add_epi16(stp2_7, stp1_8); |
1373 in8 = _mm_sub_epi16(stp2_7, stp1_8); | 1391 in[8] = _mm_sub_epi16(stp2_7, stp1_8); |
1374 in9 = _mm_sub_epi16(stp2_6, stp1_9); | 1392 in[9] = _mm_sub_epi16(stp2_6, stp1_9); |
1375 in10 = _mm_sub_epi16(stp2_5, stp2_10); | 1393 in[10] = _mm_sub_epi16(stp2_5, stp2_10); |
1376 in11 = _mm_sub_epi16(stp2_4, stp2_11); | 1394 in[11] = _mm_sub_epi16(stp2_4, stp2_11); |
1377 in12 = _mm_sub_epi16(stp2_3, stp2_12); | 1395 in[12] = _mm_sub_epi16(stp2_3, stp2_12); |
1378 in13 = _mm_sub_epi16(stp2_2, stp2_13); | 1396 in[13] = _mm_sub_epi16(stp2_2, stp2_13); |
1379 in14 = _mm_sub_epi16(stp2_1, stp1_14); | 1397 in[14] = _mm_sub_epi16(stp2_1, stp1_14); |
1380 in15 = _mm_sub_epi16(stp2_0, stp1_15); | 1398 in[15] = _mm_sub_epi16(stp2_0, stp1_15); |
1381 | 1399 |
1382 // Final rounding and shift | 1400 // Final rounding and shift |
1383 in0 = _mm_adds_epi16(in0, final_rounding); | 1401 in[0] = _mm_adds_epi16(in[0], final_rounding); |
1384 in1 = _mm_adds_epi16(in1, final_rounding); | 1402 in[1] = _mm_adds_epi16(in[1], final_rounding); |
1385 in2 = _mm_adds_epi16(in2, final_rounding); | 1403 in[2] = _mm_adds_epi16(in[2], final_rounding); |
1386 in3 = _mm_adds_epi16(in3, final_rounding); | 1404 in[3] = _mm_adds_epi16(in[3], final_rounding); |
1387 in4 = _mm_adds_epi16(in4, final_rounding); | 1405 in[4] = _mm_adds_epi16(in[4], final_rounding); |
1388 in5 = _mm_adds_epi16(in5, final_rounding); | 1406 in[5] = _mm_adds_epi16(in[5], final_rounding); |
1389 in6 = _mm_adds_epi16(in6, final_rounding); | 1407 in[6] = _mm_adds_epi16(in[6], final_rounding); |
1390 in7 = _mm_adds_epi16(in7, final_rounding); | 1408 in[7] = _mm_adds_epi16(in[7], final_rounding); |
1391 in8 = _mm_adds_epi16(in8, final_rounding); | 1409 in[8] = _mm_adds_epi16(in[8], final_rounding); |
1392 in9 = _mm_adds_epi16(in9, final_rounding); | 1410 in[9] = _mm_adds_epi16(in[9], final_rounding); |
1393 in10 = _mm_adds_epi16(in10, final_rounding); | 1411 in[10] = _mm_adds_epi16(in[10], final_rounding); |
1394 in11 = _mm_adds_epi16(in11, final_rounding); | 1412 in[11] = _mm_adds_epi16(in[11], final_rounding); |
1395 in12 = _mm_adds_epi16(in12, final_rounding); | 1413 in[12] = _mm_adds_epi16(in[12], final_rounding); |
1396 in13 = _mm_adds_epi16(in13, final_rounding); | 1414 in[13] = _mm_adds_epi16(in[13], final_rounding); |
1397 in14 = _mm_adds_epi16(in14, final_rounding); | 1415 in[14] = _mm_adds_epi16(in[14], final_rounding); |
1398 in15 = _mm_adds_epi16(in15, final_rounding); | 1416 in[15] = _mm_adds_epi16(in[15], final_rounding); |
1399 | 1417 |
1400 in0 = _mm_srai_epi16(in0, 6); | 1418 in[0] = _mm_srai_epi16(in[0], 6); |
1401 in1 = _mm_srai_epi16(in1, 6); | 1419 in[1] = _mm_srai_epi16(in[1], 6); |
1402 in2 = _mm_srai_epi16(in2, 6); | 1420 in[2] = _mm_srai_epi16(in[2], 6); |
1403 in3 = _mm_srai_epi16(in3, 6); | 1421 in[3] = _mm_srai_epi16(in[3], 6); |
1404 in4 = _mm_srai_epi16(in4, 6); | 1422 in[4] = _mm_srai_epi16(in[4], 6); |
1405 in5 = _mm_srai_epi16(in5, 6); | 1423 in[5] = _mm_srai_epi16(in[5], 6); |
1406 in6 = _mm_srai_epi16(in6, 6); | 1424 in[6] = _mm_srai_epi16(in[6], 6); |
1407 in7 = _mm_srai_epi16(in7, 6); | 1425 in[7] = _mm_srai_epi16(in[7], 6); |
1408 in8 = _mm_srai_epi16(in8, 6); | 1426 in[8] = _mm_srai_epi16(in[8], 6); |
1409 in9 = _mm_srai_epi16(in9, 6); | 1427 in[9] = _mm_srai_epi16(in[9], 6); |
1410 in10 = _mm_srai_epi16(in10, 6); | 1428 in[10] = _mm_srai_epi16(in[10], 6); |
1411 in11 = _mm_srai_epi16(in11, 6); | 1429 in[11] = _mm_srai_epi16(in[11], 6); |
1412 in12 = _mm_srai_epi16(in12, 6); | 1430 in[12] = _mm_srai_epi16(in[12], 6); |
1413 in13 = _mm_srai_epi16(in13, 6); | 1431 in[13] = _mm_srai_epi16(in[13], 6); |
1414 in14 = _mm_srai_epi16(in14, 6); | 1432 in[14] = _mm_srai_epi16(in[14], 6); |
1415 in15 = _mm_srai_epi16(in15, 6); | 1433 in[15] = _mm_srai_epi16(in[15], 6); |
1416 | 1434 |
1417 RECON_AND_STORE(dest, in0); | 1435 RECON_AND_STORE(dest, in[0]); |
1418 RECON_AND_STORE(dest, in1); | 1436 RECON_AND_STORE(dest, in[1]); |
1419 RECON_AND_STORE(dest, in2); | 1437 RECON_AND_STORE(dest, in[2]); |
1420 RECON_AND_STORE(dest, in3); | 1438 RECON_AND_STORE(dest, in[3]); |
1421 RECON_AND_STORE(dest, in4); | 1439 RECON_AND_STORE(dest, in[4]); |
1422 RECON_AND_STORE(dest, in5); | 1440 RECON_AND_STORE(dest, in[5]); |
1423 RECON_AND_STORE(dest, in6); | 1441 RECON_AND_STORE(dest, in[6]); |
1424 RECON_AND_STORE(dest, in7); | 1442 RECON_AND_STORE(dest, in[7]); |
1425 RECON_AND_STORE(dest, in8); | 1443 RECON_AND_STORE(dest, in[8]); |
1426 RECON_AND_STORE(dest, in9); | 1444 RECON_AND_STORE(dest, in[9]); |
1427 RECON_AND_STORE(dest, in10); | 1445 RECON_AND_STORE(dest, in[10]); |
1428 RECON_AND_STORE(dest, in11); | 1446 RECON_AND_STORE(dest, in[11]); |
1429 RECON_AND_STORE(dest, in12); | 1447 RECON_AND_STORE(dest, in[12]); |
1430 RECON_AND_STORE(dest, in13); | 1448 RECON_AND_STORE(dest, in[13]); |
1431 RECON_AND_STORE(dest, in14); | 1449 RECON_AND_STORE(dest, in[14]); |
1432 RECON_AND_STORE(dest, in15); | 1450 RECON_AND_STORE(dest, in[15]); |
1433 | 1451 |
1434 dest += 8 - (stride * 16); | 1452 dest += 8 - (stride * 16); |
1435 } | |
1436 } | 1453 } |
1437 } | 1454 } |
1438 | 1455 |
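The restructure replaces the old four-iteration loop (with i-dependent
branches) by two explicit two-iteration passes. A control-flow sketch of the
new function body:

  /* pass 1 (rows):    for each 8x16 half of the input coefficients:
   *                     load 16 vectors, transpose both 8x8 blocks,
   *                     IDCT16_1D, stage-7 add/sub into l[] or r[];
   * pass 2 (columns): for each 8-column half of the output:
   *                     transpose 8x8 blocks of l[]/r[] into in[],
   *                     IDCT16_1D, stage-7, add final_rounding and
   *                     shift right by 6, then reconstruct and store. */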
1439 void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { | 1456 void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
1440 __m128i dc_value; | 1457 __m128i dc_value; |
1441 const __m128i zero = _mm_setzero_si128(); | 1458 const __m128i zero = _mm_setzero_si128(); |
1442 int a, i; | 1459 int a, i; |
1443 | 1460 |
1444 a = dct_const_round_shift(input[0] * cospi_16_64); | 1461 a = dct_const_round_shift(input[0] * cospi_16_64); |
1445 a = dct_const_round_shift(a * cospi_16_64); | 1462 a = dct_const_round_shift(a * cospi_16_64); |
(...skipping 999 matching lines...)
2445 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); | 2462 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); |
2446 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 2463 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
2447 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); | 2464 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
2448 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); | 2465 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); |
2449 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); | 2466 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
2450 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); | 2467 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); |
2451 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); | 2468 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); |
2452 const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); | 2469 const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
2453 | 2470 |
2454 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); | 2471 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); |
2455 | 2472 __m128i in[16], l[16]; |
2456 __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero, | |
2457 in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero, | |
2458 in10 = zero, in11 = zero, in12 = zero, in13 = zero, | |
2459 in14 = zero, in15 = zero; | |
2460 __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero, | |
2461 l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero, | |
2462 l12 = zero, l13 = zero, l14 = zero, l15 = zero; | |
2463 | |
2464 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, | 2473 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, |
2465 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, | 2474 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, |
2466 stp1_8_0, stp1_12_0; | 2475 stp1_8_0, stp1_12_0; |
2467 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, | 2476 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, |
2468 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; | 2477 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; |
2469 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; | 2478 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
2470 int i; | 2479 int i; |
| 2480 in[4] = in[5] = in[6] = in[7] = in[12] = in[13] = in[14] = in[15] = zero; |
2471 // 1-D idct. Load input data. | 2481 // 1-D idct. Load input data. |
2472 in0 = _mm_load_si128((const __m128i *)input); | 2482 in[0] = _mm_load_si128((const __m128i *)input); |
2473 in8 = _mm_load_si128((const __m128i *)(input + 8 * 1)); | 2483 in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1)); |
2474 in1 = _mm_load_si128((const __m128i *)(input + 8 * 2)); | 2484 in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2)); |
2475 in9 = _mm_load_si128((const __m128i *)(input + 8 * 3)); | 2485 in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3)); |
2476 in2 = _mm_load_si128((const __m128i *)(input + 8 * 4)); | 2486 in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4)); |
2477 in10 = _mm_load_si128((const __m128i *)(input + 8 * 5)); | 2487 in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5)); |
2478 in3 = _mm_load_si128((const __m128i *)(input + 8 * 6)); | 2488 in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6)); |
2479 in11 = _mm_load_si128((const __m128i *)(input + 8 * 7)); | 2489 in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7)); |
2480 | 2490 |
2481 TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3); | 2491 TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1], in[2], in[3]); |
2482 TRANSPOSE_8X4(in8, in9, in10, in11, in8, in9, in10, in11); | 2492 TRANSPOSE_8X4(in[8], in[9], in[10], in[11], in[8], in[9], in[10], in[11]); |
2483 | 2493 |
2484 // Stage2 | 2494 // Stage2 |
2485 { | 2495 { |
2486 const __m128i lo_1_15 = _mm_unpackhi_epi16(in0, in11); | 2496 const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], in[11]); |
2487 const __m128i lo_9_7 = _mm_unpackhi_epi16(in8, in3); | 2497 const __m128i lo_9_7 = _mm_unpackhi_epi16(in[8], in[3]); |
2488 const __m128i lo_5_11 = _mm_unpackhi_epi16(in2, in9); | 2498 const __m128i lo_5_11 = _mm_unpackhi_epi16(in[2], in[9]); |
2489 const __m128i lo_13_3 = _mm_unpackhi_epi16(in10, in1); | 2499 const __m128i lo_13_3 = _mm_unpackhi_epi16(in[10], in[1]); |
2490 | 2500 |
2491 tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); | 2501 tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); |
2492 tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); | 2502 tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); |
2493 tmp4 = _mm_madd_epi16(lo_9_7, stg2_2); | 2503 tmp4 = _mm_madd_epi16(lo_9_7, stg2_2); |
2494 tmp6 = _mm_madd_epi16(lo_9_7, stg2_3); | 2504 tmp6 = _mm_madd_epi16(lo_9_7, stg2_3); |
2495 tmp1 = _mm_madd_epi16(lo_5_11, stg2_4); | 2505 tmp1 = _mm_madd_epi16(lo_5_11, stg2_4); |
2496 tmp3 = _mm_madd_epi16(lo_5_11, stg2_5); | 2506 tmp3 = _mm_madd_epi16(lo_5_11, stg2_5); |
2497 tmp5 = _mm_madd_epi16(lo_13_3, stg2_6); | 2507 tmp5 = _mm_madd_epi16(lo_13_3, stg2_6); |
2498 tmp7 = _mm_madd_epi16(lo_13_3, stg2_7); | 2508 tmp7 = _mm_madd_epi16(lo_13_3, stg2_7); |
2499 | 2509 |
(...skipping 21 matching lines...) |
2521 stp2_14 = _mm_packs_epi32(tmp6, zero); | 2531 stp2_14 = _mm_packs_epi32(tmp6, zero); |
2522 | 2532 |
2523 stp2_10 = _mm_packs_epi32(tmp1, zero); | 2533 stp2_10 = _mm_packs_epi32(tmp1, zero); |
2524 stp2_13 = _mm_packs_epi32(tmp3, zero); | 2534 stp2_13 = _mm_packs_epi32(tmp3, zero); |
2525 stp2_11 = _mm_packs_epi32(tmp5, zero); | 2535 stp2_11 = _mm_packs_epi32(tmp5, zero); |
2526 stp2_12 = _mm_packs_epi32(tmp7, zero); | 2536 stp2_12 = _mm_packs_epi32(tmp7, zero); |
2527 } | 2537 } |
2528 | 2538 |
2529 // Stage3 | 2539 // Stage3 |
2530 { | 2540 { |
2531 const __m128i lo_2_14 = _mm_unpacklo_epi16(in1, in11); | 2541 const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], in[11]); |
2532 const __m128i lo_10_6 = _mm_unpacklo_epi16(in9, in3); | 2542 const __m128i lo_10_6 = _mm_unpacklo_epi16(in[9], in[3]); |
2533 | 2543 |
2534 tmp0 = _mm_madd_epi16(lo_2_14, stg3_0); | 2544 tmp0 = _mm_madd_epi16(lo_2_14, stg3_0); |
2535 tmp2 = _mm_madd_epi16(lo_2_14, stg3_1); | 2545 tmp2 = _mm_madd_epi16(lo_2_14, stg3_1); |
2536 tmp4 = _mm_madd_epi16(lo_10_6, stg3_2); | 2546 tmp4 = _mm_madd_epi16(lo_10_6, stg3_2); |
2537 tmp6 = _mm_madd_epi16(lo_10_6, stg3_3); | 2547 tmp6 = _mm_madd_epi16(lo_10_6, stg3_3); |
2538 | 2548 |
2539 tmp0 = _mm_add_epi32(tmp0, rounding); | 2549 tmp0 = _mm_add_epi32(tmp0, rounding); |
2540 tmp2 = _mm_add_epi32(tmp2, rounding); | 2550 tmp2 = _mm_add_epi32(tmp2, rounding); |
2541 tmp4 = _mm_add_epi32(tmp4, rounding); | 2551 tmp4 = _mm_add_epi32(tmp4, rounding); |
2542 tmp6 = _mm_add_epi32(tmp6, rounding); | 2552 tmp6 = _mm_add_epi32(tmp6, rounding); |
(...skipping 14 matching lines...) |
2557 stp1_11 = _mm_add_epi16(stp2_11, stp2_10); | 2567 stp1_11 = _mm_add_epi16(stp2_11, stp2_10); |
2558 | 2568 |
2559 stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); | 2569 stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); |
2560 stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); | 2570 stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); |
2561 stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); | 2571 stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); |
2562 stp1_15 = _mm_add_epi16(stp2_15, stp2_14); | 2572 stp1_15 = _mm_add_epi16(stp2_15, stp2_14); |
2563 } | 2573 } |
2564 | 2574 |
2565 // Stage4 | 2575 // Stage4 |
2566 { | 2576 { |
2567 const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); | 2577 const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); |
2568 const __m128i lo_4_12 = _mm_unpacklo_epi16(in2, in10); | 2578 const __m128i lo_4_12 = _mm_unpacklo_epi16(in[2], in[10]); |
2569 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); | 2579 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); |
2570 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); | 2580 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); |
2571 | 2581 |
2572 tmp0 = _mm_madd_epi16(lo_0_8, stg4_0); | 2582 tmp0 = _mm_madd_epi16(lo_0_8, stg4_0); |
2573 tmp2 = _mm_madd_epi16(lo_0_8, stg4_1); | 2583 tmp2 = _mm_madd_epi16(lo_0_8, stg4_1); |
2574 tmp4 = _mm_madd_epi16(lo_4_12, stg4_2); | 2584 tmp4 = _mm_madd_epi16(lo_4_12, stg4_2); |
2575 tmp6 = _mm_madd_epi16(lo_4_12, stg4_3); | 2585 tmp6 = _mm_madd_epi16(lo_4_12, stg4_3); |
2576 tmp1 = _mm_madd_epi16(lo_9_14, stg4_4); | 2586 tmp1 = _mm_madd_epi16(lo_9_14, stg4_4); |
2577 tmp3 = _mm_madd_epi16(lo_9_14, stg4_5); | 2587 tmp3 = _mm_madd_epi16(lo_9_14, stg4_5); |
2578 tmp5 = _mm_madd_epi16(lo_10_13, stg4_6); | 2588 tmp5 = _mm_madd_epi16(lo_10_13, stg4_6); |
(...skipping 88 matching lines...) |
2667 stp2_1 = _mm_add_epi16(stp1_1, stp1_6); | 2677 stp2_1 = _mm_add_epi16(stp1_1, stp1_6); |
2668 stp2_2 = _mm_add_epi16(stp1_2, stp1_5); | 2678 stp2_2 = _mm_add_epi16(stp1_2, stp1_5); |
2669 stp2_3 = _mm_add_epi16(stp1_3, stp2_4); | 2679 stp2_3 = _mm_add_epi16(stp1_3, stp2_4); |
2670 stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); | 2680 stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); |
2671 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); | 2681 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); |
2672 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); | 2682 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); |
2673 stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); | 2683 stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); |
2674 } | 2684 } |
2675 | 2685 |
2676 // Stage7. Left 8x16 only. | 2686 // Stage7. Left 8x16 only. |
2677 l0 = _mm_add_epi16(stp2_0, stp1_15); | 2687 l[0] = _mm_add_epi16(stp2_0, stp1_15); |
2678 l1 = _mm_add_epi16(stp2_1, stp1_14); | 2688 l[1] = _mm_add_epi16(stp2_1, stp1_14); |
2679 l2 = _mm_add_epi16(stp2_2, stp2_13); | 2689 l[2] = _mm_add_epi16(stp2_2, stp2_13); |
2680 l3 = _mm_add_epi16(stp2_3, stp2_12); | 2690 l[3] = _mm_add_epi16(stp2_3, stp2_12); |
2681 l4 = _mm_add_epi16(stp2_4, stp2_11); | 2691 l[4] = _mm_add_epi16(stp2_4, stp2_11); |
2682 l5 = _mm_add_epi16(stp2_5, stp2_10); | 2692 l[5] = _mm_add_epi16(stp2_5, stp2_10); |
2683 l6 = _mm_add_epi16(stp2_6, stp1_9); | 2693 l[6] = _mm_add_epi16(stp2_6, stp1_9); |
2684 l7 = _mm_add_epi16(stp2_7, stp1_8); | 2694 l[7] = _mm_add_epi16(stp2_7, stp1_8); |
2685 l8 = _mm_sub_epi16(stp2_7, stp1_8); | 2695 l[8] = _mm_sub_epi16(stp2_7, stp1_8); |
2686 l9 = _mm_sub_epi16(stp2_6, stp1_9); | 2696 l[9] = _mm_sub_epi16(stp2_6, stp1_9); |
2687 l10 = _mm_sub_epi16(stp2_5, stp2_10); | 2697 l[10] = _mm_sub_epi16(stp2_5, stp2_10); |
2688 l11 = _mm_sub_epi16(stp2_4, stp2_11); | 2698 l[11] = _mm_sub_epi16(stp2_4, stp2_11); |
2689 l12 = _mm_sub_epi16(stp2_3, stp2_12); | 2699 l[12] = _mm_sub_epi16(stp2_3, stp2_12); |
2690 l13 = _mm_sub_epi16(stp2_2, stp2_13); | 2700 l[13] = _mm_sub_epi16(stp2_2, stp2_13); |
2691 l14 = _mm_sub_epi16(stp2_1, stp1_14); | 2701 l[14] = _mm_sub_epi16(stp2_1, stp1_14); |
2692 l15 = _mm_sub_epi16(stp2_0, stp1_15); | 2702 l[15] = _mm_sub_epi16(stp2_0, stp1_15); |
2693 | 2703 |
2694 // 2-D idct. We do 2 8x16 blocks. | 2704 // 2-D idct. We do 2 8x16 blocks. |
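// An __m128i holds eight 16-bit lanes, so one pass can only cover 8 of
// the 16 columns at a time; hence the two iterations below, one per
// 8x16 half of the block.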
2695 for (i = 0; i < 2; i++) { | 2705 for (i = 0; i < 2; i++) { |
2696 if (i == 0) | 2706 array_transpose_4X8(l + 8*i, in); |
2697 TRANSPOSE_4X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4, | 2707 in[8] = in[9] = in[10] = in[11] = in[12] = in[13] = in[14] = in[15] = zero; |
2698 in5, in6, in7); | |
2699 | |
2700 if (i == 1) | |
2701 TRANSPOSE_4X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3, | |
2702 in4, in5, in6, in7); | |
2703 | |
2704 in8 = in9 = in10 = in11 = in12 = in13 = in14 = in15 = zero; | |
2705 | 2708 |
2706 IDCT16_1D | 2709 IDCT16_1D |
2707 | 2710 |
2708 // Stage7 | 2711 // Stage7 |
2709 in0 = _mm_add_epi16(stp2_0, stp1_15); | 2712 in[0] = _mm_add_epi16(stp2_0, stp1_15); |
2710 in1 = _mm_add_epi16(stp2_1, stp1_14); | 2713 in[1] = _mm_add_epi16(stp2_1, stp1_14); |
2711 in2 = _mm_add_epi16(stp2_2, stp2_13); | 2714 in[2] = _mm_add_epi16(stp2_2, stp2_13); |
2712 in3 = _mm_add_epi16(stp2_3, stp2_12); | 2715 in[3] = _mm_add_epi16(stp2_3, stp2_12); |
2713 in4 = _mm_add_epi16(stp2_4, stp2_11); | 2716 in[4] = _mm_add_epi16(stp2_4, stp2_11); |
2714 in5 = _mm_add_epi16(stp2_5, stp2_10); | 2717 in[5] = _mm_add_epi16(stp2_5, stp2_10); |
2715 in6 = _mm_add_epi16(stp2_6, stp1_9); | 2718 in[6] = _mm_add_epi16(stp2_6, stp1_9); |
2716 in7 = _mm_add_epi16(stp2_7, stp1_8); | 2719 in[7] = _mm_add_epi16(stp2_7, stp1_8); |
2717 in8 = _mm_sub_epi16(stp2_7, stp1_8); | 2720 in[8] = _mm_sub_epi16(stp2_7, stp1_8); |
2718 in9 = _mm_sub_epi16(stp2_6, stp1_9); | 2721 in[9] = _mm_sub_epi16(stp2_6, stp1_9); |
2719 in10 = _mm_sub_epi16(stp2_5, stp2_10); | 2722 in[10] = _mm_sub_epi16(stp2_5, stp2_10); |
2720 in11 = _mm_sub_epi16(stp2_4, stp2_11); | 2723 in[11] = _mm_sub_epi16(stp2_4, stp2_11); |
2721 in12 = _mm_sub_epi16(stp2_3, stp2_12); | 2724 in[12] = _mm_sub_epi16(stp2_3, stp2_12); |
2722 in13 = _mm_sub_epi16(stp2_2, stp2_13); | 2725 in[13] = _mm_sub_epi16(stp2_2, stp2_13); |
2723 in14 = _mm_sub_epi16(stp2_1, stp1_14); | 2726 in[14] = _mm_sub_epi16(stp2_1, stp1_14); |
2724 in15 = _mm_sub_epi16(stp2_0, stp1_15); | 2727 in[15] = _mm_sub_epi16(stp2_0, stp1_15); |
2725 | 2728 |
2726 // Final rounding and shift | 2729 // Final rounding and shift |
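// Adding final_rounding (presumably _mm_set1_epi16(1 << 5)) before the
// arithmetic shift by 6 implements round-to-nearest division by 64,
// removing the 2^6 scale the transform pair leaves on the residual.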
2727 in0 = _mm_adds_epi16(in0, final_rounding); | 2730 in[0] = _mm_adds_epi16(in[0], final_rounding); |
2728 in1 = _mm_adds_epi16(in1, final_rounding); | 2731 in[1] = _mm_adds_epi16(in[1], final_rounding); |
2729 in2 = _mm_adds_epi16(in2, final_rounding); | 2732 in[2] = _mm_adds_epi16(in[2], final_rounding); |
2730 in3 = _mm_adds_epi16(in3, final_rounding); | 2733 in[3] = _mm_adds_epi16(in[3], final_rounding); |
2731 in4 = _mm_adds_epi16(in4, final_rounding); | 2734 in[4] = _mm_adds_epi16(in[4], final_rounding); |
2732 in5 = _mm_adds_epi16(in5, final_rounding); | 2735 in[5] = _mm_adds_epi16(in[5], final_rounding); |
2733 in6 = _mm_adds_epi16(in6, final_rounding); | 2736 in[6] = _mm_adds_epi16(in[6], final_rounding); |
2734 in7 = _mm_adds_epi16(in7, final_rounding); | 2737 in[7] = _mm_adds_epi16(in[7], final_rounding); |
2735 in8 = _mm_adds_epi16(in8, final_rounding); | 2738 in[8] = _mm_adds_epi16(in[8], final_rounding); |
2736 in9 = _mm_adds_epi16(in9, final_rounding); | 2739 in[9] = _mm_adds_epi16(in[9], final_rounding); |
2737 in10 = _mm_adds_epi16(in10, final_rounding); | 2740 in[10] = _mm_adds_epi16(in[10], final_rounding); |
2738 in11 = _mm_adds_epi16(in11, final_rounding); | 2741 in[11] = _mm_adds_epi16(in[11], final_rounding); |
2739 in12 = _mm_adds_epi16(in12, final_rounding); | 2742 in[12] = _mm_adds_epi16(in[12], final_rounding); |
2740 in13 = _mm_adds_epi16(in13, final_rounding); | 2743 in[13] = _mm_adds_epi16(in[13], final_rounding); |
2741 in14 = _mm_adds_epi16(in14, final_rounding); | 2744 in[14] = _mm_adds_epi16(in[14], final_rounding); |
2742 in15 = _mm_adds_epi16(in15, final_rounding); | 2745 in[15] = _mm_adds_epi16(in[15], final_rounding); |
2743 | 2746 |
2744 in0 = _mm_srai_epi16(in0, 6); | 2747 in[0] = _mm_srai_epi16(in[0], 6); |
2745 in1 = _mm_srai_epi16(in1, 6); | 2748 in[1] = _mm_srai_epi16(in[1], 6); |
2746 in2 = _mm_srai_epi16(in2, 6); | 2749 in[2] = _mm_srai_epi16(in[2], 6); |
2747 in3 = _mm_srai_epi16(in3, 6); | 2750 in[3] = _mm_srai_epi16(in[3], 6); |
2748 in4 = _mm_srai_epi16(in4, 6); | 2751 in[4] = _mm_srai_epi16(in[4], 6); |
2749 in5 = _mm_srai_epi16(in5, 6); | 2752 in[5] = _mm_srai_epi16(in[5], 6); |
2750 in6 = _mm_srai_epi16(in6, 6); | 2753 in[6] = _mm_srai_epi16(in[6], 6); |
2751 in7 = _mm_srai_epi16(in7, 6); | 2754 in[7] = _mm_srai_epi16(in[7], 6); |
2752 in8 = _mm_srai_epi16(in8, 6); | 2755 in[8] = _mm_srai_epi16(in[8], 6); |
2753 in9 = _mm_srai_epi16(in9, 6); | 2756 in[9] = _mm_srai_epi16(in[9], 6); |
2754 in10 = _mm_srai_epi16(in10, 6); | 2757 in[10] = _mm_srai_epi16(in[10], 6); |
2755 in11 = _mm_srai_epi16(in11, 6); | 2758 in[11] = _mm_srai_epi16(in[11], 6); |
2756 in12 = _mm_srai_epi16(in12, 6); | 2759 in[12] = _mm_srai_epi16(in[12], 6); |
2757 in13 = _mm_srai_epi16(in13, 6); | 2760 in[13] = _mm_srai_epi16(in[13], 6); |
2758 in14 = _mm_srai_epi16(in14, 6); | 2761 in[14] = _mm_srai_epi16(in[14], 6); |
2759 in15 = _mm_srai_epi16(in15, 6); | 2762 in[15] = _mm_srai_epi16(in[15], 6); |
2760 | 2763 |
2761 RECON_AND_STORE(dest, in0); | 2764 RECON_AND_STORE(dest, in[0]); |
2762 RECON_AND_STORE(dest, in1); | 2765 RECON_AND_STORE(dest, in[1]); |
2763 RECON_AND_STORE(dest, in2); | 2766 RECON_AND_STORE(dest, in[2]); |
2764 RECON_AND_STORE(dest, in3); | 2767 RECON_AND_STORE(dest, in[3]); |
2765 RECON_AND_STORE(dest, in4); | 2768 RECON_AND_STORE(dest, in[4]); |
2766 RECON_AND_STORE(dest, in5); | 2769 RECON_AND_STORE(dest, in[5]); |
2767 RECON_AND_STORE(dest, in6); | 2770 RECON_AND_STORE(dest, in[6]); |
2768 RECON_AND_STORE(dest, in7); | 2771 RECON_AND_STORE(dest, in[7]); |
2769 RECON_AND_STORE(dest, in8); | 2772 RECON_AND_STORE(dest, in[8]); |
2770 RECON_AND_STORE(dest, in9); | 2773 RECON_AND_STORE(dest, in[9]); |
2771 RECON_AND_STORE(dest, in10); | 2774 RECON_AND_STORE(dest, in[10]); |
2772 RECON_AND_STORE(dest, in11); | 2775 RECON_AND_STORE(dest, in[11]); |
2773 RECON_AND_STORE(dest, in12); | 2776 RECON_AND_STORE(dest, in[12]); |
2774 RECON_AND_STORE(dest, in13); | 2777 RECON_AND_STORE(dest, in[13]); |
2775 RECON_AND_STORE(dest, in14); | 2778 RECON_AND_STORE(dest, in[14]); |
2776 RECON_AND_STORE(dest, in15); | 2779 RECON_AND_STORE(dest, in[15]); |
2777 | 2780 |
2778 dest += 8 - (stride * 16); | 2781 dest += 8 - (stride * 16); |
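// After the 16 RECON_AND_STORE calls above (each assumed to advance
// dest by one stride), dest sits 16 rows below its starting point;
// adding 8 - (stride * 16) rewinds to the top row and moves 8 pixels
// right, so the second iteration reconstructs columns 8..15.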
2779 } | 2782 } |
2780 } | 2783 } |
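/* For reference, the SIMD flow above is the usual separable 2-D inverse
 * transform. A minimal scalar sketch, assuming a 1-D helper idct16_1d()
 * (hypothetical name) plus the clip_pixel() and ROUND_POWER_OF_TWO()
 * helpers from vp9_common.h:
 *
 *   void idct16x16_ref(const int16_t *in, uint8_t *dest, int stride) {
 *     int16_t tmp[16 * 16], col[16];
 *     int r, c;
 *     for (r = 0; r < 16; ++r)                      // pass 1: rows
 *       idct16_1d(in + 16 * r, tmp + 16 * r);
 *     for (c = 0; c < 16; ++c) {                    // pass 2: columns
 *       for (r = 0; r < 16; ++r) col[r] = tmp[16 * r + c];
 *       idct16_1d(col, col);
 *       for (r = 0; r < 16; ++r)                    // round, shift, add
 *         dest[r * stride + c] = clip_pixel(dest[r * stride + c] +
 *                                           ROUND_POWER_OF_TWO(col[r], 6));
 *     }
 *   }
 */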
2781 | 2784 |
2782 #define LOAD_DQCOEFF(reg, input) \ | 2785 #define LOAD_DQCOEFF(reg, input) \ |
2783 { \ | 2786 { \ |
2784 reg = _mm_load_si128((const __m128i *) input); \ | 2787 reg = _mm_load_si128((const __m128i *) input); \ |
2785 input += 8; \ | 2788 input += 8; \ |
2786 } \ | 2789 } \ |
2787 | 2790 |
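// LOAD_DQCOEFF both loads 8 coefficients and advances the input
// pointer, so consecutive invocations walk the dequantized buffer
// linearly; the scattered destination indices in the 32x32 load
// sequence below (in[0], in[8], in[16], in[24], ...) drop each 8-lane
// quarter of a row where the 8x8 transposes expect it.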
| 2791 #define IDCT32_1D_34 \ |
| 2792 /* Stage1 */ \ |
| 2793 { \ |
| 2794 const __m128i zero = _mm_setzero_si128(); \ |
| 2795 const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \ |
| 2796 const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \ |
| 2797 \ |
| 2798 const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \ |
| 2799 const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \ |
| 2800 \ |
| 2801 const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \ |
| 2802 const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \ |
| 2803 \ |
| 2804 const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \ |
| 2805 const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \ |
| 2806 \ |
| 2807 MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \ |
| 2808 stg1_1, stp1_16, stp1_31); \ |
| 2809 MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \ |
| 2810 stg1_7, stp1_19, stp1_28); \ |
| 2811 MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \ |
| 2812 stg1_9, stp1_20, stp1_27); \ |
| 2813 MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \ |
| 2814 stg1_15, stp1_23, stp1_24); \ |
| 2815 } \ |
| 2816 \ |
| 2817 /* Stage2 */ \ |
| 2818 { \ |
| 2819 const __m128i zero = _mm_setzero_si128(); \ |
| 2820 const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \ |
| 2821 const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \ |
| 2822 \ |
| 2823 const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \ |
| 2824 const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \ |
| 2825 \ |
| 2826 MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \ |
| 2827 stg2_1, stp2_8, stp2_15); \ |
| 2828 MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \ |
| 2829 stg2_7, stp2_11, stp2_12); \ |
| 2830 \ |
| 2831 stp2_16 = stp1_16; \ |
| 2832 stp2_19 = stp1_19; \ |
| 2833 \ |
| 2834 stp2_20 = stp1_20; \ |
| 2835 stp2_23 = stp1_23; \ |
| 2836 \ |
| 2837 stp2_24 = stp1_24; \ |
| 2838 stp2_27 = stp1_27; \ |
| 2839 \ |
| 2840 stp2_28 = stp1_28; \ |
| 2841 stp2_31 = stp1_31; \ |
| 2842 } \ |
| 2843 \ |
| 2844 /* Stage3 */ \ |
| 2845 { \ |
| 2846 const __m128i zero = _mm_setzero_si128(); \ |
| 2847 const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \ |
| 2848 const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \ |
| 2849 \ |
| 2850 const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \ |
| 2851 const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \ |
| 2852 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \ |
| 2853 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \ |
| 2854 \ |
| 2855 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \ |
| 2856 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \ |
| 2857 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \ |
| 2858 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp1_24); \ |
| 2859 \ |
| 2860 MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \ |
| 2861 stg3_1, stp1_4, stp1_7); \ |
| 2862 \ |
| 2863 stp1_8 = stp2_8; \ |
| 2864 stp1_11 = stp2_11; \ |
| 2865 stp1_12 = stp2_12; \ |
| 2866 stp1_15 = stp2_15; \ |
| 2867 \ |
| 2868 MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ |
| 2869 stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \ |
| 2870 stp1_18, stp1_29) \ |
| 2871 MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ |
| 2872 stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \ |
| 2873 stp1_22, stp1_25) \ |
| 2874 \ |
| 2875 stp1_16 = stp2_16; \ |
| 2876 stp1_31 = stp2_31; \ |
| 2877 stp1_19 = stp2_19; \ |
| 2878 stp1_20 = stp2_20; \ |
| 2879 stp1_23 = stp2_23; \ |
| 2880 stp1_24 = stp2_24; \ |
| 2881 stp1_27 = stp2_27; \ |
| 2882 stp1_28 = stp2_28; \ |
| 2883 } \ |
| 2884 \ |
| 2885 /* Stage4 */ \ |
| 2886 { \ |
| 2887 const __m128i zero = _mm_setzero_si128(); \ |
| 2888 const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \ |
| 2889 const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \ |
| 2890 \ |
| 2891 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \ |
| 2892 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \ |
| 2893 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \ |
| 2894 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \ |
| 2895 \ |
| 2896 MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \ |
| 2897 stg4_1, stp2_0, stp2_1); \ |
| 2898 \ |
| 2899 stp2_4 = stp1_4; \ |
| 2900 stp2_5 = stp1_4; \ |
| 2901 stp2_6 = stp1_7; \ |
| 2902 stp2_7 = stp1_7; \ |
| 2903 \ |
| 2904 MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ |
| 2905 stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \ |
| 2906 stp2_10, stp2_13) \ |
| 2907 \ |
| 2908 stp2_8 = stp1_8; \ |
| 2909 stp2_15 = stp1_15; \ |
| 2910 stp2_11 = stp1_11; \ |
| 2911 stp2_12 = stp1_12; \ |
| 2912 \ |
| 2913 stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ |
| 2914 stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ |
| 2915 stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ |
| 2916 stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ |
| 2917 stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ |
| 2918 stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ |
| 2919 stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ |
| 2920 stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ |
| 2921 \ |
| 2922 stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ |
| 2923 stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ |
| 2924 stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ |
| 2925 stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ |
| 2926 stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ |
| 2927 stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ |
| 2928 stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ |
| 2929 stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ |
| 2930 } \ |
| 2931 \ |
| 2932 /* Stage5 */ \ |
| 2933 { \ |
| 2934 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ |
| 2935 const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ |
| 2936 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ |
| 2937 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ |
| 2938 \ |
| 2939 const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ |
| 2940 const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ |
| 2941 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ |
| 2942 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ |
| 2943 \ |
| 2944 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ |
| 2945 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ |
| 2946 \ |
| 2947 stp1_0 = stp2_0; \ |
| 2948 stp1_1 = stp2_1; \ |
| 2949 stp1_2 = stp2_1; \ |
| 2950 stp1_3 = stp2_0; \ |
| 2951 \ |
| 2952 tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ |
| 2953 tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ |
| 2954 tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ |
| 2955 tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ |
| 2956 \ |
| 2957 tmp0 = _mm_add_epi32(tmp0, rounding); \ |
| 2958 tmp1 = _mm_add_epi32(tmp1, rounding); \ |
| 2959 tmp2 = _mm_add_epi32(tmp2, rounding); \ |
| 2960 tmp3 = _mm_add_epi32(tmp3, rounding); \ |
| 2961 \ |
| 2962 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ |
| 2963 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ |
| 2964 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ |
| 2965 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ |
| 2966 \ |
| 2967 stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ |
| 2968 stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ |
| 2969 \ |
| 2970 stp1_4 = stp2_4; \ |
| 2971 stp1_7 = stp2_7; \ |
| 2972 \ |
| 2973 stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ |
| 2974 stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ |
| 2975 stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ |
| 2976 stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ |
| 2977 stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ |
| 2978 stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ |
| 2979 stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ |
| 2980 stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ |
| 2981 \ |
| 2982 stp1_16 = stp2_16; \ |
| 2983 stp1_17 = stp2_17; \ |
| 2984 \ |
| 2985 MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ |
| 2986 stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \ |
| 2987 stp1_19, stp1_28) \ |
| 2988 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ |
| 2989 stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \ |
| 2990 stp1_21, stp1_26) \ |
| 2991 \ |
| 2992 stp1_22 = stp2_22; \ |
| 2993 stp1_23 = stp2_23; \ |
| 2994 stp1_24 = stp2_24; \ |
| 2995 stp1_25 = stp2_25; \ |
| 2996 stp1_30 = stp2_30; \ |
| 2997 stp1_31 = stp2_31; \ |
| 2998 } \ |
| 2999 \ |
| 3000 /* Stage6 */ \ |
| 3001 { \ |
| 3002 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ |
| 3003 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ |
| 3004 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ |
| 3005 const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ |
| 3006 \ |
| 3007 stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ |
| 3008 stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ |
| 3009 stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ |
| 3010 stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ |
| 3011 stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ |
| 3012 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ |
| 3013 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ |
| 3014 stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ |
| 3015 \ |
| 3016 stp2_8 = stp1_8; \ |
| 3017 stp2_9 = stp1_9; \ |
| 3018 stp2_14 = stp1_14; \ |
| 3019 stp2_15 = stp1_15; \ |
| 3020 \ |
| 3021 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ |
| 3022 stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \ |
| 3023 stp2_13, stp2_11, stp2_12) \ |
| 3024 \ |
| 3025 stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ |
| 3026 stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ |
| 3027 stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ |
| 3028 stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ |
| 3029 stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ |
| 3030 stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ |
| 3031 stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ |
| 3032 stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ |
| 3033 \ |
| 3034 stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ |
| 3035 stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ |
| 3036 stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ |
| 3037 stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ |
| 3038 stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ |
| 3039 stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ |
| 3040 stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ |
| 3041 stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ |
| 3042 } \ |
| 3043 \ |
| 3044 /* Stage7 */ \ |
| 3045 { \ |
| 3046 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ |
| 3047 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ |
| 3048 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ |
| 3049 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ |
| 3050 \ |
| 3051 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ |
| 3052 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ |
| 3053 const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ |
| 3054 const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ |
| 3055 \ |
| 3056 stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ |
| 3057 stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ |
| 3058 stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ |
| 3059 stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ |
| 3060 stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ |
| 3061 stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ |
| 3062 stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ |
| 3063 stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ |
| 3064 stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ |
| 3065 stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ |
| 3066 stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ |
| 3067 stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ |
| 3068 stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ |
| 3069 stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ |
| 3070 stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ |
| 3071 stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ |
| 3072 \ |
| 3073 stp1_16 = stp2_16; \ |
| 3074 stp1_17 = stp2_17; \ |
| 3075 stp1_18 = stp2_18; \ |
| 3076 stp1_19 = stp2_19; \ |
| 3077 \ |
| 3078 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ |
| 3079 stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \ |
| 3080 stp1_21, stp1_26) \ |
| 3081 MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ |
| 3082 stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ |
| 3083 stp1_23, stp1_24) \ |
| 3084 \ |
| 3085 stp1_28 = stp2_28; \ |
| 3086 stp1_29 = stp2_29; \ |
| 3087 stp1_30 = stp2_30; \ |
| 3088 stp1_31 = stp2_31; \ |
| 3089 } |
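// IDCT32_1D_34 handles the at-most-34-nonzero-coefficient case: all
// nonzero coefficients then fit in the top-left 8x8, so after the row
// pass rows 8..31 of the intermediate stay zero and this column pass
// reads only in[0..7], unpacking against a zero register for the rest.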
| 3090 |
| 3091 |
2788 #define IDCT32_1D \ | 3092 #define IDCT32_1D \ |
2789 /* Stage1 */ \ | 3093 /* Stage1 */ \ |
2790 { \ | 3094 { \ |
2791 const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31); \ | 3095 const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \ |
2792 const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31); \ | 3096 const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \ |
2793 const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15); \ | 3097 const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \ |
2794 const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15); \ | 3098 const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \ |
2795 \ | 3099 \ |
2796 const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23); \ | 3100 const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \ |
2797 const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23); \ | 3101 const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \ |
2798 const __m128i lo_25_7 = _mm_unpacklo_epi16(in25, in7); \ | 3102 const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); \ |
2799 const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7); \ | 3103 const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \ |
2800 \ | 3104 \ |
2801 const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27); \ | 3105 const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \ |
2802 const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27); \ | 3106 const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \ |
2803 const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11); \ | 3107 const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \ |
2804 const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11); \ | 3108 const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \ |
2805 \ | 3109 \ |
2806 const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19); \ | 3110 const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \ |
2807 const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19); \ | 3111 const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \ |
2808 const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3); \ | 3112 const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \ |
2809 const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3); \ | 3113 const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \ |
2810 \ | 3114 \ |
2811 MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \ | 3115 MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \ |
2812 stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \ | 3116 stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \ |
2813 stp1_17, stp1_30) \ | 3117 stp1_17, stp1_30) \ |
2814 MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \ | 3118 MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \ |
2815 stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \ | 3119 stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \ |
2816 stp1_19, stp1_28) \ | 3120 stp1_19, stp1_28) \ |
2817 MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \ | 3121 MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \ |
2818 stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \ | 3122 stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \ |
2819 stp1_21, stp1_26) \ | 3123 stp1_21, stp1_26) \ |
2820 MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \ | 3124 MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \ |
2821 stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \ | 3125 stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \ |
2822 stp1_23, stp1_24) \ | 3126 stp1_23, stp1_24) \ |
2823 } \ | 3127 } \ |
2824 \ | 3128 \ |
2825 /* Stage2 */ \ | 3129 /* Stage2 */ \ |
2826 { \ | 3130 { \ |
2827 const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30); \ | 3131 const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \ |
2828 const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30); \ | 3132 const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \ |
2829 const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14); \ | 3133 const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \ |
2830 const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14); \ | 3134 const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \ |
2831 \ | 3135 \ |
2832 const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22); \ | 3136 const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \ |
2833 const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22); \ | 3137 const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \ |
2834 const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6); \ | 3138 const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \ |
2835 const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6); \ | 3139 const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \ |
2836 \ | 3140 \ |
2837 MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \ | 3141 MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \ |
2838 stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \ | 3142 stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \ |
2839 stp2_14) \ | 3143 stp2_14) \ |
2840 MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \ | 3144 MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \ |
2841 stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \ | 3145 stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \ |
2842 stp2_11, stp2_12) \ | 3146 stp2_11, stp2_12) \ |
2843 \ | 3147 \ |
2844 stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \ | 3148 stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \ |
2845 stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \ | 3149 stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \ |
(...skipping 11 matching lines...) |
2857 stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \ | 3161 stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \ |
2858 \ | 3162 \ |
2859 stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \ | 3163 stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \ |
2860 stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \ | 3164 stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \ |
2861 stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \ | 3165 stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \ |
2862 stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \ | 3166 stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \ |
2863 } \ | 3167 } \ |
2864 \ | 3168 \ |
2865 /* Stage3 */ \ | 3169 /* Stage3 */ \ |
2866 { \ | 3170 { \ |
2867 const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28); \ | 3171 const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \ |
2868 const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28); \ | 3172 const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \ |
2869 const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12); \ | 3173 const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \ |
2870 const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12); \ | 3174 const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \ |
2871 \ | 3175 \ |
2872 const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \ | 3176 const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \ |
2873 const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \ | 3177 const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \ |
2874 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ | 3178 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ |
2875 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ | 3179 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ |
2876 \ | 3180 \ |
2877 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ | 3181 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ |
2878 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ | 3182 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ |
2879 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ | 3183 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ |
2880 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ | 3184 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ |
(...skipping 23 matching lines...) |
2904 stp1_19 = stp2_19; \ | 3208 stp1_19 = stp2_19; \ |
2905 stp1_20 = stp2_20; \ | 3209 stp1_20 = stp2_20; \ |
2906 stp1_23 = stp2_23; \ | 3210 stp1_23 = stp2_23; \ |
2907 stp1_24 = stp2_24; \ | 3211 stp1_24 = stp2_24; \ |
2908 stp1_27 = stp2_27; \ | 3212 stp1_27 = stp2_27; \ |
2909 stp1_28 = stp2_28; \ | 3213 stp1_28 = stp2_28; \ |
2910 } \ | 3214 } \ |
2911 \ | 3215 \ |
2912 /* Stage4 */ \ | 3216 /* Stage4 */ \ |
2913 { \ | 3217 { \ |
2914 const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16); \ | 3218 const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \ |
2915 const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16); \ | 3219 const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \ |
2916 const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24); \ | 3220 const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \ |
2917 const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24); \ | 3221 const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \ |
2918 \ | 3222 \ |
2919 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ | 3223 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ |
2920 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ | 3224 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ |
2921 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ | 3225 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ |
2922 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ | 3226 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ |
2923 \ | 3227 \ |
2924 MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \ | 3228 MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \ |
2925 stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \ | 3229 stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \ |
2926 stp2_2, stp2_3) \ | 3230 stp2_2, stp2_3) \ |
2927 \ | 3231 \ |
(...skipping 236 matching lines...) |
3164 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); | 3468 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); |
3165 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 3469 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
3166 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); | 3470 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
3167 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); | 3471 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); |
3168 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); | 3472 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
3169 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); | 3473 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); |
3170 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); | 3474 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); |
3171 | 3475 |
3172 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); | 3476 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); |
3173 | 3477 |
3174 __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, | 3478 __m128i in[32], col[32]; |
3175 in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23, | |
3176 in24, in25, in26, in27, in28, in29, in30, in31; | |
3177 __m128i col[128]; | |
3178 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, | 3479 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, |
3179 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, | 3480 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, |
3180 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, | 3481 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, |
3181 stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, | 3482 stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, |
3182 stp1_30, stp1_31; | 3483 stp1_30, stp1_31; |
3183 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, | 3484 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, |
3184 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, | 3485 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, |
3185 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, | 3486 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, |
3186 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, | 3487 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, |
3187 stp2_30, stp2_31; | 3488 stp2_30, stp2_31; |
3188 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; | 3489 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
3189 int i, j, i32; | 3490 int i; |
3190 | 3491 // Load input data. |
3191 // We work on a 8x32 block each time, and loop 8 times for 2-D 32x32 idct. | 3492 LOAD_DQCOEFF(in[0], input); |
3192 for (i = 0; i < 8; i++) { | 3493 LOAD_DQCOEFF(in[8], input); |
3193 i32 = (i << 5); | 3494 LOAD_DQCOEFF(in[16], input); |
3194 if (i == 0) { | 3495 LOAD_DQCOEFF(in[24], input); |
3195 // First 1-D idct: first 8 rows | 3496 LOAD_DQCOEFF(in[1], input); |
3196 // Load input data. | 3497 LOAD_DQCOEFF(in[9], input); |
3197 LOAD_DQCOEFF(in0, input); | 3498 LOAD_DQCOEFF(in[17], input); |
3198 LOAD_DQCOEFF(in8, input); | 3499 LOAD_DQCOEFF(in[25], input); |
3199 LOAD_DQCOEFF(in16, input); | 3500 LOAD_DQCOEFF(in[2], input); |
3200 LOAD_DQCOEFF(in24, input); | 3501 LOAD_DQCOEFF(in[10], input); |
3201 LOAD_DQCOEFF(in1, input); | 3502 LOAD_DQCOEFF(in[18], input); |
3202 LOAD_DQCOEFF(in9, input); | 3503 LOAD_DQCOEFF(in[26], input); |
3203 LOAD_DQCOEFF(in17, input); | 3504 LOAD_DQCOEFF(in[3], input); |
3204 LOAD_DQCOEFF(in25, input); | 3505 LOAD_DQCOEFF(in[11], input); |
3205 LOAD_DQCOEFF(in2, input); | 3506 LOAD_DQCOEFF(in[19], input); |
3206 LOAD_DQCOEFF(in10, input); | 3507 LOAD_DQCOEFF(in[27], input); |
3207 LOAD_DQCOEFF(in18, input); | 3508 |
3208 LOAD_DQCOEFF(in26, input); | 3509 LOAD_DQCOEFF(in[4], input); |
3209 LOAD_DQCOEFF(in3, input); | 3510 LOAD_DQCOEFF(in[12], input); |
3210 LOAD_DQCOEFF(in11, input); | 3511 LOAD_DQCOEFF(in[20], input); |
3211 LOAD_DQCOEFF(in19, input); | 3512 LOAD_DQCOEFF(in[28], input); |
3212 LOAD_DQCOEFF(in27, input); | 3513 LOAD_DQCOEFF(in[5], input); |
3213 | 3514 LOAD_DQCOEFF(in[13], input); |
3214 LOAD_DQCOEFF(in4, input); | 3515 LOAD_DQCOEFF(in[21], input); |
3215 LOAD_DQCOEFF(in12, input); | 3516 LOAD_DQCOEFF(in[29], input); |
3216 LOAD_DQCOEFF(in20, input); | 3517 LOAD_DQCOEFF(in[6], input); |
3217 LOAD_DQCOEFF(in28, input); | 3518 LOAD_DQCOEFF(in[14], input); |
3218 LOAD_DQCOEFF(in5, input); | 3519 LOAD_DQCOEFF(in[22], input); |
3219 LOAD_DQCOEFF(in13, input); | 3520 LOAD_DQCOEFF(in[30], input); |
3220 LOAD_DQCOEFF(in21, input); | 3521 LOAD_DQCOEFF(in[7], input); |
3221 LOAD_DQCOEFF(in29, input); | 3522 LOAD_DQCOEFF(in[15], input); |
3222 LOAD_DQCOEFF(in6, input); | 3523 LOAD_DQCOEFF(in[23], input); |
3223 LOAD_DQCOEFF(in14, input); | 3524 LOAD_DQCOEFF(in[31], input); |
3224 LOAD_DQCOEFF(in22, input); | 3525 |
3225 LOAD_DQCOEFF(in30, input); | 3526 array_transpose_8x8(in, in); |
3226 LOAD_DQCOEFF(in7, input); | 3527 array_transpose_8x8(in+8, in+8); |
3227 LOAD_DQCOEFF(in15, input); | 3528 array_transpose_8x8(in+16, in+16); |
3228 LOAD_DQCOEFF(in23, input); | 3529 array_transpose_8x8(in+24, in+24); |
3229 LOAD_DQCOEFF(in31, input); | 3530 |
3230 | 3531 IDCT32_1D |
| 3532 |
| 3533 // 1-D: Store 32 intermediate results for each 8x32 block. |
| 3534 col[0] = _mm_add_epi16(stp1_0, stp1_31); |
| 3535 col[1] = _mm_add_epi16(stp1_1, stp1_30); |
| 3536 col[2] = _mm_add_epi16(stp1_2, stp1_29); |
| 3537 col[3] = _mm_add_epi16(stp1_3, stp1_28); |
| 3538 col[4] = _mm_add_epi16(stp1_4, stp1_27); |
| 3539 col[5] = _mm_add_epi16(stp1_5, stp1_26); |
| 3540 col[6] = _mm_add_epi16(stp1_6, stp1_25); |
| 3541 col[7] = _mm_add_epi16(stp1_7, stp1_24); |
| 3542 col[8] = _mm_add_epi16(stp1_8, stp1_23); |
| 3543 col[9] = _mm_add_epi16(stp1_9, stp1_22); |
| 3544 col[10] = _mm_add_epi16(stp1_10, stp1_21); |
| 3545 col[11] = _mm_add_epi16(stp1_11, stp1_20); |
| 3546 col[12] = _mm_add_epi16(stp1_12, stp1_19); |
| 3547 col[13] = _mm_add_epi16(stp1_13, stp1_18); |
| 3548 col[14] = _mm_add_epi16(stp1_14, stp1_17); |
| 3549 col[15] = _mm_add_epi16(stp1_15, stp1_16); |
| 3550 col[16] = _mm_sub_epi16(stp1_15, stp1_16); |
| 3551 col[17] = _mm_sub_epi16(stp1_14, stp1_17); |
| 3552 col[18] = _mm_sub_epi16(stp1_13, stp1_18); |
| 3553 col[19] = _mm_sub_epi16(stp1_12, stp1_19); |
| 3554 col[20] = _mm_sub_epi16(stp1_11, stp1_20); |
| 3555 col[21] = _mm_sub_epi16(stp1_10, stp1_21); |
| 3556 col[22] = _mm_sub_epi16(stp1_9, stp1_22); |
| 3557 col[23] = _mm_sub_epi16(stp1_8, stp1_23); |
| 3558 col[24] = _mm_sub_epi16(stp1_7, stp1_24); |
| 3559 col[25] = _mm_sub_epi16(stp1_6, stp1_25); |
| 3560 col[26] = _mm_sub_epi16(stp1_5, stp1_26); |
| 3561 col[27] = _mm_sub_epi16(stp1_4, stp1_27); |
| 3562 col[28] = _mm_sub_epi16(stp1_3, stp1_28); |
| 3563 col[29] = _mm_sub_epi16(stp1_2, stp1_29); |
| 3564 col[30] = _mm_sub_epi16(stp1_1, stp1_30); |
| 3565 col[31] = _mm_sub_epi16(stp1_0, stp1_31); |
| 3566 for (i = 0; i < 4; i++) { |
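// Second (column) 1-D pass: 4 iterations x 8 columns covers all 32
// columns; each iteration transposes one 8x8 block of the intermediate.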
| 3567 const __m128i zero = _mm_setzero_si128(); |
3231 // Transpose 32x8 block to 8x32 block | 3568 // Transpose 32x8 block to 8x32 block |
3232 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, | 3569 array_transpose_8x8(col+i*8, in); |
3233 in4, in5, in6, in7); | 3570 IDCT32_1D_34 |
3234 TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, | |
3235 in10, in11, in12, in13, in14, in15); | |
3236 TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17, | |
3237 in18, in19, in20, in21, in22, in23); | |
3238 TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25, | |
3239 in26, in27, in28, in29, in30, in31); | |
3240 } else if (i < 4) { | |
3241 // First 1-D idct: next 24 zero-coeff rows | |
3242 col[i32 + 0] = _mm_setzero_si128(); | |
3243 col[i32 + 1] = _mm_setzero_si128(); | |
3244 col[i32 + 2] = _mm_setzero_si128(); | |
3245 col[i32 + 3] = _mm_setzero_si128(); | |
3246 col[i32 + 4] = _mm_setzero_si128(); | |
3247 col[i32 + 5] = _mm_setzero_si128(); | |
3248 col[i32 + 6] = _mm_setzero_si128(); | |
3249 col[i32 + 7] = _mm_setzero_si128(); | |
3250 col[i32 + 8] = _mm_setzero_si128(); | |
3251 col[i32 + 9] = _mm_setzero_si128(); | |
3252 col[i32 + 10] = _mm_setzero_si128(); | |
3253 col[i32 + 11] = _mm_setzero_si128(); | |
3254 col[i32 + 12] = _mm_setzero_si128(); | |
3255 col[i32 + 13] = _mm_setzero_si128(); | |
3256 col[i32 + 14] = _mm_setzero_si128(); | |
3257 col[i32 + 15] = _mm_setzero_si128(); | |
3258 col[i32 + 16] = _mm_setzero_si128(); | |
3259 col[i32 + 17] = _mm_setzero_si128(); | |
3260 col[i32 + 18] = _mm_setzero_si128(); | |
3261 col[i32 + 19] = _mm_setzero_si128(); | |
3262 col[i32 + 20] = _mm_setzero_si128(); | |
3263 col[i32 + 21] = _mm_setzero_si128(); | |
3264 col[i32 + 22] = _mm_setzero_si128(); | |
3265 col[i32 + 23] = _mm_setzero_si128(); | |
3266 col[i32 + 24] = _mm_setzero_si128(); | |
3267 col[i32 + 25] = _mm_setzero_si128(); | |
3268 col[i32 + 26] = _mm_setzero_si128(); | |
3269 col[i32 + 27] = _mm_setzero_si128(); | |
3270 col[i32 + 28] = _mm_setzero_si128(); | |
3271 col[i32 + 29] = _mm_setzero_si128(); | |
3272 col[i32 + 30] = _mm_setzero_si128(); | |
3273 col[i32 + 31] = _mm_setzero_si128(); | |
3274 continue; | |
3275 } else { | |
3276 // Second 1-D idct | |
3277 j = i - 4; | |
3278 | |
3279 // Transpose 32x8 block to 8x32 block | |
3280 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], | |
3281 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], | |
3282 col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4, | |
3283 in5, in6, in7); | |
3284 j += 4; | |
3285 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], | |
3286 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], | |
3287 col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10, | |
3288 in11, in12, in13, in14, in15); | |
3289 j += 4; | |
3290 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], | |
3291 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], | |
3292 col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18, | |
3293 in19, in20, in21, in22, in23); | |
3294 j += 4; | |
3295 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], | |
3296 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], | |
3297 col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27, | |
3298 in28, in29, in30, in31); | |
3299 } | |
3300 | |
3301 IDCT32_1D | |
3302 | |
3303 // final stage | |
3304 if (i < 4) { | |
3305 // 1_D: Store 32 intermediate results for each 8x32 block. | |
3306 col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); | |
3307 col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); | |
3308 col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); | |
3309 col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); | |
3310 col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); | |
3311 col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); | |
3312 col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); | |
3313 col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); | |
3314 col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); | |
3315 col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); | |
3316 col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); | |
3317 col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); | |
3318 col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); | |
3319 col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); | |
3320 col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); | |
3321 col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); | |
3322 col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); | |
3323 col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); | |
3324 col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); | |
3325 col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); | |
3326 col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); | |
3327 col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); | |
3328 col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); | |
3329 col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); | |
3330 col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); | |
3331 col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); | |
3332 col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); | |
3333 col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); | |
3334 col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); | |
3335 col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); | |
3336 col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); | |
3337 col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); | |
3338 } else { | |
3339 const __m128i zero = _mm_setzero_si128(); | |
3340 | 3571 |
3341 // 2-D: Calculate the results and store them to destination. | 3572 // 2-D: Calculate the results and store them to destination. |
3342 in0 = _mm_add_epi16(stp1_0, stp1_31); | 3573 in[0] = _mm_add_epi16(stp1_0, stp1_31); |
3343 in1 = _mm_add_epi16(stp1_1, stp1_30); | 3574 in[1] = _mm_add_epi16(stp1_1, stp1_30); |
3344 in2 = _mm_add_epi16(stp1_2, stp1_29); | 3575 in[2] = _mm_add_epi16(stp1_2, stp1_29); |
3345 in3 = _mm_add_epi16(stp1_3, stp1_28); | 3576 in[3] = _mm_add_epi16(stp1_3, stp1_28); |
3346 in4 = _mm_add_epi16(stp1_4, stp1_27); | 3577 in[4] = _mm_add_epi16(stp1_4, stp1_27); |
3347 in5 = _mm_add_epi16(stp1_5, stp1_26); | 3578 in[5] = _mm_add_epi16(stp1_5, stp1_26); |
3348 in6 = _mm_add_epi16(stp1_6, stp1_25); | 3579 in[6] = _mm_add_epi16(stp1_6, stp1_25); |
3349 in7 = _mm_add_epi16(stp1_7, stp1_24); | 3580 in[7] = _mm_add_epi16(stp1_7, stp1_24); |
3350 in8 = _mm_add_epi16(stp1_8, stp1_23); | 3581 in[8] = _mm_add_epi16(stp1_8, stp1_23); |
3351 in9 = _mm_add_epi16(stp1_9, stp1_22); | 3582 in[9] = _mm_add_epi16(stp1_9, stp1_22); |
3352 in10 = _mm_add_epi16(stp1_10, stp1_21); | 3583 in[10] = _mm_add_epi16(stp1_10, stp1_21); |
3353 in11 = _mm_add_epi16(stp1_11, stp1_20); | 3584 in[11] = _mm_add_epi16(stp1_11, stp1_20); |
3354 in12 = _mm_add_epi16(stp1_12, stp1_19); | 3585 in[12] = _mm_add_epi16(stp1_12, stp1_19); |
3355 in13 = _mm_add_epi16(stp1_13, stp1_18); | 3586 in[13] = _mm_add_epi16(stp1_13, stp1_18); |
3356 in14 = _mm_add_epi16(stp1_14, stp1_17); | 3587 in[14] = _mm_add_epi16(stp1_14, stp1_17); |
3357 in15 = _mm_add_epi16(stp1_15, stp1_16); | 3588 in[15] = _mm_add_epi16(stp1_15, stp1_16); |
3358 in16 = _mm_sub_epi16(stp1_15, stp1_16); | 3589 in[16] = _mm_sub_epi16(stp1_15, stp1_16); |
3359 in17 = _mm_sub_epi16(stp1_14, stp1_17); | 3590 in[17] = _mm_sub_epi16(stp1_14, stp1_17); |
3360 in18 = _mm_sub_epi16(stp1_13, stp1_18); | 3591 in[18] = _mm_sub_epi16(stp1_13, stp1_18); |
3361 in19 = _mm_sub_epi16(stp1_12, stp1_19); | 3592 in[19] = _mm_sub_epi16(stp1_12, stp1_19); |
3362 in20 = _mm_sub_epi16(stp1_11, stp1_20); | 3593 in[20] = _mm_sub_epi16(stp1_11, stp1_20); |
3363 in21 = _mm_sub_epi16(stp1_10, stp1_21); | 3594 in[21] = _mm_sub_epi16(stp1_10, stp1_21); |
3364 in22 = _mm_sub_epi16(stp1_9, stp1_22); | 3595 in[22] = _mm_sub_epi16(stp1_9, stp1_22); |
3365 in23 = _mm_sub_epi16(stp1_8, stp1_23); | 3596 in[23] = _mm_sub_epi16(stp1_8, stp1_23); |
3366 in24 = _mm_sub_epi16(stp1_7, stp1_24); | 3597 in[24] = _mm_sub_epi16(stp1_7, stp1_24); |
3367 in25 = _mm_sub_epi16(stp1_6, stp1_25); | 3598 in[25] = _mm_sub_epi16(stp1_6, stp1_25); |
3368 in26 = _mm_sub_epi16(stp1_5, stp1_26); | 3599 in[26] = _mm_sub_epi16(stp1_5, stp1_26); |
3369 in27 = _mm_sub_epi16(stp1_4, stp1_27); | 3600 in[27] = _mm_sub_epi16(stp1_4, stp1_27); |
3370 in28 = _mm_sub_epi16(stp1_3, stp1_28); | 3601 in[28] = _mm_sub_epi16(stp1_3, stp1_28); |
3371 in29 = _mm_sub_epi16(stp1_2, stp1_29); | 3602 in[29] = _mm_sub_epi16(stp1_2, stp1_29); |
3372 in30 = _mm_sub_epi16(stp1_1, stp1_30); | 3603 in[30] = _mm_sub_epi16(stp1_1, stp1_30); |
3373 in31 = _mm_sub_epi16(stp1_0, stp1_31); | 3604 in[31] = _mm_sub_epi16(stp1_0, stp1_31); |
3374 | 3605 |
3375 // Final rounding and shift | 3606 // Final rounding and shift |
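// As in the 16x16 path: (x + 32) >> 6 rounds to nearest when dividing
// out the transform's 2^6 scale.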
3376 in0 = _mm_adds_epi16(in0, final_rounding); | 3607 in[0] = _mm_adds_epi16(in[0], final_rounding); |
3377 in1 = _mm_adds_epi16(in1, final_rounding); | 3608 in[1] = _mm_adds_epi16(in[1], final_rounding); |
3378 in2 = _mm_adds_epi16(in2, final_rounding); | 3609 in[2] = _mm_adds_epi16(in[2], final_rounding); |
3379 in3 = _mm_adds_epi16(in3, final_rounding); | 3610 in[3] = _mm_adds_epi16(in[3], final_rounding); |
3380 in4 = _mm_adds_epi16(in4, final_rounding); | 3611 in[4] = _mm_adds_epi16(in[4], final_rounding); |
3381 in5 = _mm_adds_epi16(in5, final_rounding); | 3612 in[5] = _mm_adds_epi16(in[5], final_rounding); |
3382 in6 = _mm_adds_epi16(in6, final_rounding); | 3613 in[6] = _mm_adds_epi16(in[6], final_rounding); |
3383 in7 = _mm_adds_epi16(in7, final_rounding); | 3614 in[7] = _mm_adds_epi16(in[7], final_rounding); |
3384 in8 = _mm_adds_epi16(in8, final_rounding); | 3615 in[8] = _mm_adds_epi16(in[8], final_rounding); |
3385 in9 = _mm_adds_epi16(in9, final_rounding); | 3616 in[9] = _mm_adds_epi16(in[9], final_rounding); |
3386 in10 = _mm_adds_epi16(in10, final_rounding); | 3617 in[10] = _mm_adds_epi16(in[10], final_rounding); |
3387 in11 = _mm_adds_epi16(in11, final_rounding); | 3618 in[11] = _mm_adds_epi16(in[11], final_rounding); |
3388 in12 = _mm_adds_epi16(in12, final_rounding); | 3619 in[12] = _mm_adds_epi16(in[12], final_rounding); |
3389 in13 = _mm_adds_epi16(in13, final_rounding); | 3620 in[13] = _mm_adds_epi16(in[13], final_rounding); |
3390 in14 = _mm_adds_epi16(in14, final_rounding); | 3621 in[14] = _mm_adds_epi16(in[14], final_rounding); |
3391 in15 = _mm_adds_epi16(in15, final_rounding); | 3622 in[15] = _mm_adds_epi16(in[15], final_rounding); |
3392 in16 = _mm_adds_epi16(in16, final_rounding); | 3623 in[16] = _mm_adds_epi16(in[16], final_rounding); |
3393 in17 = _mm_adds_epi16(in17, final_rounding); | 3624 in[17] = _mm_adds_epi16(in[17], final_rounding); |
3394 in18 = _mm_adds_epi16(in18, final_rounding); | 3625 in[18] = _mm_adds_epi16(in[18], final_rounding); |
3395 in19 = _mm_adds_epi16(in19, final_rounding); | 3626 in[19] = _mm_adds_epi16(in[19], final_rounding); |
3396 in20 = _mm_adds_epi16(in20, final_rounding); | 3627 in[20] = _mm_adds_epi16(in[20], final_rounding); |
3397 in21 = _mm_adds_epi16(in21, final_rounding); | 3628 in[21] = _mm_adds_epi16(in[21], final_rounding); |
3398 in22 = _mm_adds_epi16(in22, final_rounding); | 3629 in[22] = _mm_adds_epi16(in[22], final_rounding); |
3399 in23 = _mm_adds_epi16(in23, final_rounding); | 3630 in[23] = _mm_adds_epi16(in[23], final_rounding); |
3400 in24 = _mm_adds_epi16(in24, final_rounding); | 3631 in[24] = _mm_adds_epi16(in[24], final_rounding); |
3401 in25 = _mm_adds_epi16(in25, final_rounding); | 3632 in[25] = _mm_adds_epi16(in[25], final_rounding); |
3402 in26 = _mm_adds_epi16(in26, final_rounding); | 3633 in[26] = _mm_adds_epi16(in[26], final_rounding); |
3403 in27 = _mm_adds_epi16(in27, final_rounding); | 3634 in[27] = _mm_adds_epi16(in[27], final_rounding); |
3404 in28 = _mm_adds_epi16(in28, final_rounding); | 3635 in[28] = _mm_adds_epi16(in[28], final_rounding); |
3405 in29 = _mm_adds_epi16(in29, final_rounding); | 3636 in[29] = _mm_adds_epi16(in[29], final_rounding); |
3406 in30 = _mm_adds_epi16(in30, final_rounding); | 3637 in[30] = _mm_adds_epi16(in[30], final_rounding); |
3407 in31 = _mm_adds_epi16(in31, final_rounding); | 3638 in[31] = _mm_adds_epi16(in[31], final_rounding); |
3408 | 3639 |
3409 in0 = _mm_srai_epi16(in0, 6); | 3640 in[0] = _mm_srai_epi16(in[0], 6); |
3410 in1 = _mm_srai_epi16(in1, 6); | 3641 in[1] = _mm_srai_epi16(in[1], 6); |
3411 in2 = _mm_srai_epi16(in2, 6); | 3642 in[2] = _mm_srai_epi16(in[2], 6); |
3412 in3 = _mm_srai_epi16(in3, 6); | 3643 in[3] = _mm_srai_epi16(in[3], 6); |
3413 in4 = _mm_srai_epi16(in4, 6); | 3644 in[4] = _mm_srai_epi16(in[4], 6); |
3414 in5 = _mm_srai_epi16(in5, 6); | 3645 in[5] = _mm_srai_epi16(in[5], 6); |
3415 in6 = _mm_srai_epi16(in6, 6); | 3646 in[6] = _mm_srai_epi16(in[6], 6); |
3416 in7 = _mm_srai_epi16(in7, 6); | 3647 in[7] = _mm_srai_epi16(in[7], 6); |
3417 in8 = _mm_srai_epi16(in8, 6); | 3648 in[8] = _mm_srai_epi16(in[8], 6); |
3418 in9 = _mm_srai_epi16(in9, 6); | 3649 in[9] = _mm_srai_epi16(in[9], 6); |
3419 in10 = _mm_srai_epi16(in10, 6); | 3650 in[10] = _mm_srai_epi16(in[10], 6); |
3420 in11 = _mm_srai_epi16(in11, 6); | 3651 in[11] = _mm_srai_epi16(in[11], 6); |
3421 in12 = _mm_srai_epi16(in12, 6); | 3652 in[12] = _mm_srai_epi16(in[12], 6); |
3422 in13 = _mm_srai_epi16(in13, 6); | 3653 in[13] = _mm_srai_epi16(in[13], 6); |
3423 in14 = _mm_srai_epi16(in14, 6); | 3654 in[14] = _mm_srai_epi16(in[14], 6); |
3424 in15 = _mm_srai_epi16(in15, 6); | 3655 in[15] = _mm_srai_epi16(in[15], 6); |
3425 in16 = _mm_srai_epi16(in16, 6); | 3656 in[16] = _mm_srai_epi16(in[16], 6); |
3426 in17 = _mm_srai_epi16(in17, 6); | 3657 in[17] = _mm_srai_epi16(in[17], 6); |
3427 in18 = _mm_srai_epi16(in18, 6); | 3658 in[18] = _mm_srai_epi16(in[18], 6); |
3428 in19 = _mm_srai_epi16(in19, 6); | 3659 in[19] = _mm_srai_epi16(in[19], 6); |
3429 in20 = _mm_srai_epi16(in20, 6); | 3660 in[20] = _mm_srai_epi16(in[20], 6); |
3430 in21 = _mm_srai_epi16(in21, 6); | 3661 in[21] = _mm_srai_epi16(in[21], 6); |
3431 in22 = _mm_srai_epi16(in22, 6); | 3662 in[22] = _mm_srai_epi16(in[22], 6); |
3432 in23 = _mm_srai_epi16(in23, 6); | 3663 in[23] = _mm_srai_epi16(in[23], 6); |
3433 in24 = _mm_srai_epi16(in24, 6); | 3664 in[24] = _mm_srai_epi16(in[24], 6); |
3434 in25 = _mm_srai_epi16(in25, 6); | 3665 in[25] = _mm_srai_epi16(in[25], 6); |
3435 in26 = _mm_srai_epi16(in26, 6); | 3666 in[26] = _mm_srai_epi16(in[26], 6); |
3436 in27 = _mm_srai_epi16(in27, 6); | 3667 in[27] = _mm_srai_epi16(in[27], 6); |
3437 in28 = _mm_srai_epi16(in28, 6); | 3668 in[28] = _mm_srai_epi16(in[28], 6); |
3438 in29 = _mm_srai_epi16(in29, 6); | 3669 in[29] = _mm_srai_epi16(in[29], 6); |
3439 in30 = _mm_srai_epi16(in30, 6); | 3670 in[30] = _mm_srai_epi16(in[30], 6); |
3440 in31 = _mm_srai_epi16(in31, 6); | 3671 in[31] = _mm_srai_epi16(in[31], 6); |
3441 | 3672 |
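The two unrolled blocks above implement round-to-nearest division by 64: after both 1-D passes the residuals carry six fractional bits, so each lane gets a bias of 1 << 5 (the final_rounding constant) before the arithmetic shift. A per-lane scalar equivalent (requires <stdint.h>; the saturating add is what _mm_adds_epi16 provides):

    static int16_t round_shift_6(int16_t v) {
      int32_t t = (int32_t)v + 32;       /* bias = 1 << 5 */
      if (t > INT16_MAX) t = INT16_MAX;  /* _mm_adds_epi16 saturates; only the */
      if (t < INT16_MIN) t = INT16_MIN;  /* high side can trigger for bias +32 */
      return (int16_t)(t >> 6);          /* _mm_srai_epi16(v, 6) */
    }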
3442 RECON_AND_STORE(dest, in0); | 3673 RECON_AND_STORE(dest, in[0]); |
3443 RECON_AND_STORE(dest, in1); | 3674 RECON_AND_STORE(dest, in[1]); |
3444 RECON_AND_STORE(dest, in2); | 3675 RECON_AND_STORE(dest, in[2]); |
3445 RECON_AND_STORE(dest, in3); | 3676 RECON_AND_STORE(dest, in[3]); |
3446 RECON_AND_STORE(dest, in4); | 3677 RECON_AND_STORE(dest, in[4]); |
3447 RECON_AND_STORE(dest, in5); | 3678 RECON_AND_STORE(dest, in[5]); |
3448 RECON_AND_STORE(dest, in6); | 3679 RECON_AND_STORE(dest, in[6]); |
3449 RECON_AND_STORE(dest, in7); | 3680 RECON_AND_STORE(dest, in[7]); |
3450 RECON_AND_STORE(dest, in8); | 3681 RECON_AND_STORE(dest, in[8]); |
3451 RECON_AND_STORE(dest, in9); | 3682 RECON_AND_STORE(dest, in[9]); |
3452 RECON_AND_STORE(dest, in10); | 3683 RECON_AND_STORE(dest, in[10]); |
3453 RECON_AND_STORE(dest, in11); | 3684 RECON_AND_STORE(dest, in[11]); |
3454 RECON_AND_STORE(dest, in12); | 3685 RECON_AND_STORE(dest, in[12]); |
3455 RECON_AND_STORE(dest, in13); | 3686 RECON_AND_STORE(dest, in[13]); |
3456 RECON_AND_STORE(dest, in14); | 3687 RECON_AND_STORE(dest, in[14]); |
3457 RECON_AND_STORE(dest, in15); | 3688 RECON_AND_STORE(dest, in[15]); |
3458 RECON_AND_STORE(dest, in16); | 3689 RECON_AND_STORE(dest, in[16]); |
3459 RECON_AND_STORE(dest, in17); | 3690 RECON_AND_STORE(dest, in[17]); |
3460 RECON_AND_STORE(dest, in18); | 3691 RECON_AND_STORE(dest, in[18]); |
3461 RECON_AND_STORE(dest, in19); | 3692 RECON_AND_STORE(dest, in[19]); |
3462 RECON_AND_STORE(dest, in20); | 3693 RECON_AND_STORE(dest, in[20]); |
3463 RECON_AND_STORE(dest, in21); | 3694 RECON_AND_STORE(dest, in[21]); |
3464 RECON_AND_STORE(dest, in22); | 3695 RECON_AND_STORE(dest, in[22]); |
3465 RECON_AND_STORE(dest, in23); | 3696 RECON_AND_STORE(dest, in[23]); |
3466 RECON_AND_STORE(dest, in24); | 3697 RECON_AND_STORE(dest, in[24]); |
3467 RECON_AND_STORE(dest, in25); | 3698 RECON_AND_STORE(dest, in[25]); |
3468 RECON_AND_STORE(dest, in26); | 3699 RECON_AND_STORE(dest, in[26]); |
3469 RECON_AND_STORE(dest, in27); | 3700 RECON_AND_STORE(dest, in[27]); |
3470 RECON_AND_STORE(dest, in28); | 3701 RECON_AND_STORE(dest, in[28]); |
3471 RECON_AND_STORE(dest, in29); | 3702 RECON_AND_STORE(dest, in[29]); |
3472 RECON_AND_STORE(dest, in30); | 3703 RECON_AND_STORE(dest, in[30]); |
3473 RECON_AND_STORE(dest, in31); | 3704 RECON_AND_STORE(dest, in[31]); |
3474 | 3705 |
3475 dest += 8 - (stride * 32); | 3706 dest += 8 - (stride * 32); |
3476 } | 3707 } |
3477 } | 3708 } |
3478 } | |
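RECON_AND_STORE, used in the store runs above and below, is defined earlier in the file and not visible in this hunk. Presumably it parallels the 4x4 variant but reconstructs a full 8-pixel row; a sketch under that assumption (the _SKETCH name is illustrative):

    /* Assumes a __m128i zero and an int stride in scope, like the real macro. */
    #define RECON_AND_STORE_SKETCH(dest, in_x) \
    { \
      __m128i d0 = _mm_loadl_epi64((const __m128i *)(dest)); /* 8 x u8 */ \
      d0 = _mm_unpacklo_epi8(d0, zero);  /* widen to s16 */ \
      d0 = _mm_add_epi16(in_x, d0);      /* add residual */ \
      d0 = _mm_packus_epi16(d0, d0);     /* saturate to [0, 255] */ \
      _mm_storel_epi64((__m128i *)(dest), d0); \
      dest += stride;                    /* advance one row */ \
    }

Because each invocation advances dest by stride, the dest += 8 - (stride * 32) after a run of 32 stores rewinds to the top row and moves on to the next 8-column strip.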
3479 | 3709 |
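The rewritten vp9_idct32x32_1024_add_sse2 below keeps the two-pass structure but replaces the in0..in31 scalars with an in[32] array, which lets the per-strip transposes become loops over array_transpose_8x8. A structural sketch of the new control flow (details elided; IDCT32_1D and the helpers are defined earlier in the file):

    /* Pass 1 (rows): process four 8x32 strips into 128 column vectors. */
    for (i = 0; i < 4; i++) {
      /* Load 32 vectors of dqcoeffs; if the strip is all zero, store   */
      /* zeroed col[] entries and continue; otherwise transpose the     */
      /* four 8x8 sub-blocks, run IDCT32_1D, and write                  */
      /* col[i32 + k] = stp1_k +/- stp1_(31 - k).                       */
    }
    /* Pass 2 (columns): read the strips back, finish, and reconstruct. */
    for (i = 0; i < 4; i++) {
      /* Transpose from col[], run IDCT32_1D, form in[k] from stp1_*,   */
      /* round (add 1 << 5, shift right 6), then RECON_AND_STORE the    */
      /* 32 rows of the strip into dest.                                */
    }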
3480 void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, | 3710 void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, |
3481 int stride) { | 3711 int stride) { |
3482 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 3712 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
3483 const __m128i final_rounding = _mm_set1_epi16(1<<5); | 3713 const __m128i final_rounding = _mm_set1_epi16(1<<5); |
3484 | 3714 |
3485 // idct constants for each stage | 3715 // idct constants for each stage |
3486 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); | 3716 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); |
3487 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); | 3717 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); |
3488 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); | 3718 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); |
(...skipping 34 matching lines...)
3523 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); | 3753 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); |
3524 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 3754 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
3525 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); | 3755 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
3526 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); | 3756 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); |
3527 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); | 3757 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
3528 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); | 3758 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); |
3529 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); | 3759 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); |
3530 | 3760 |
3531 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); | 3761 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); |
3532 | 3762 |
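Each stg*_* constant above interleaves two cosines so that a single _mm_madd_epi16 evaluates one term of a plane rotation. IDCT32_1D (a macro defined earlier in the file) chains such butterflies stage by stage; a self-contained sketch of one butterfly, assuming vp9's DCT_CONST_BITS == 14 (hence DCT_CONST_ROUNDING == 1 << 13):

    /* out0[k] = round((in0[k]*c0 + in1[k]*c1) / 2^14), with (c0, c1) from */
    /* cst0 = pair_set_epi16(c0, c1); out1 likewise from cst1.             */
    static void butterfly_sse2(__m128i in0, __m128i in1,
                               __m128i cst0, __m128i cst1,
                               __m128i *out0, __m128i *out1) {
      const __m128i rnd = _mm_set1_epi32(1 << 13);     /* DCT_CONST_ROUNDING */
      const __m128i lo = _mm_unpacklo_epi16(in0, in1); /* pair the lanes */
      const __m128i hi = _mm_unpackhi_epi16(in0, in1);
      __m128i t0 = _mm_madd_epi16(lo, cst0);           /* 32-bit products */
      __m128i t1 = _mm_madd_epi16(hi, cst0);
      __m128i t2 = _mm_madd_epi16(lo, cst1);
      __m128i t3 = _mm_madd_epi16(hi, cst1);
      t0 = _mm_srai_epi32(_mm_add_epi32(t0, rnd), 14);
      t1 = _mm_srai_epi32(_mm_add_epi32(t1, rnd), 14);
      t2 = _mm_srai_epi32(_mm_add_epi32(t2, rnd), 14);
      t3 = _mm_srai_epi32(_mm_add_epi32(t3, rnd), 14);
      *out0 = _mm_packs_epi32(t0, t1);                 /* saturate to int16 */
      *out1 = _mm_packs_epi32(t2, t3);
    }

For instance, the stage-1 rotation of in[1] against in[31] with stg1_0/stg1_1 presumably reduces to butterfly_sse2(in[1], in[31], stg1_0, stg1_1, &stp1_16, &stp1_31).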
3533 __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, | 3763 __m128i in[32], col[128], zero_idx[16]; |
3534 in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23, | |
3535 in24, in25, in26, in27, in28, in29, in30, in31; | |
3536 __m128i col[128]; | |
3537 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, | 3764 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, |
3538 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, | 3765 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, |
3539 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, | 3766 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, |
3540 stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, | 3767 stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, |
3541 stp1_30, stp1_31; | 3768 stp1_30, stp1_31; |
3542 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, | 3769 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, |
3543 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, | 3770 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, |
3544 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, | 3771 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, |
3545 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, | 3772 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, |
3546 stp2_30, stp2_31; | 3773 stp2_30, stp2_31; |
3547 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; | 3774 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
3548 int i, j, i32; | 3775 int i, j, i32; |
3549 __m128i zero_idx[16]; | |
3550 int zero_flag[2]; | 3776 int zero_flag[2]; |
3551 | 3777 |
3552 // We work on a 8x32 block each time, and loop 8 times for 2-D 32x32 idct. | 3778 for (i = 0; i < 4; i++) { |
3553 for (i = 0; i < 8; i++) { | |
3554 i32 = (i << 5); | 3779 i32 = (i << 5); |
3555 if (i < 4) { | |
3556 // First 1-D idct | 3780 // First 1-D idct |
3557 // Load input data. | 3781 // Load input data. |
3558 LOAD_DQCOEFF(in0, input); | 3782 LOAD_DQCOEFF(in[0], input); |
3559 LOAD_DQCOEFF(in8, input); | 3783 LOAD_DQCOEFF(in[8], input); |
3560 LOAD_DQCOEFF(in16, input); | 3784 LOAD_DQCOEFF(in[16], input); |
3561 LOAD_DQCOEFF(in24, input); | 3785 LOAD_DQCOEFF(in[24], input); |
3562 LOAD_DQCOEFF(in1, input); | 3786 LOAD_DQCOEFF(in[1], input); |
3563 LOAD_DQCOEFF(in9, input); | 3787 LOAD_DQCOEFF(in[9], input); |
3564 LOAD_DQCOEFF(in17, input); | 3788 LOAD_DQCOEFF(in[17], input); |
3565 LOAD_DQCOEFF(in25, input); | 3789 LOAD_DQCOEFF(in[25], input); |
3566 LOAD_DQCOEFF(in2, input); | 3790 LOAD_DQCOEFF(in[2], input); |
3567 LOAD_DQCOEFF(in10, input); | 3791 LOAD_DQCOEFF(in[10], input); |
3568 LOAD_DQCOEFF(in18, input); | 3792 LOAD_DQCOEFF(in[18], input); |
3569 LOAD_DQCOEFF(in26, input); | 3793 LOAD_DQCOEFF(in[26], input); |
3570 LOAD_DQCOEFF(in3, input); | 3794 LOAD_DQCOEFF(in[3], input); |
3571 LOAD_DQCOEFF(in11, input); | 3795 LOAD_DQCOEFF(in[11], input); |
3572 LOAD_DQCOEFF(in19, input); | 3796 LOAD_DQCOEFF(in[19], input); |
3573 LOAD_DQCOEFF(in27, input); | 3797 LOAD_DQCOEFF(in[27], input); |
3574 | 3798 |
3575 LOAD_DQCOEFF(in4, input); | 3799 LOAD_DQCOEFF(in[4], input); |
3576 LOAD_DQCOEFF(in12, input); | 3800 LOAD_DQCOEFF(in[12], input); |
3577 LOAD_DQCOEFF(in20, input); | 3801 LOAD_DQCOEFF(in[20], input); |
3578 LOAD_DQCOEFF(in28, input); | 3802 LOAD_DQCOEFF(in[28], input); |
3579 LOAD_DQCOEFF(in5, input); | 3803 LOAD_DQCOEFF(in[5], input); |
3580 LOAD_DQCOEFF(in13, input); | 3804 LOAD_DQCOEFF(in[13], input); |
3581 LOAD_DQCOEFF(in21, input); | 3805 LOAD_DQCOEFF(in[21], input); |
3582 LOAD_DQCOEFF(in29, input); | 3806 LOAD_DQCOEFF(in[29], input); |
3583 LOAD_DQCOEFF(in6, input); | 3807 LOAD_DQCOEFF(in[6], input); |
3584 LOAD_DQCOEFF(in14, input); | 3808 LOAD_DQCOEFF(in[14], input); |
3585 LOAD_DQCOEFF(in22, input); | 3809 LOAD_DQCOEFF(in[22], input); |
3586 LOAD_DQCOEFF(in30, input); | 3810 LOAD_DQCOEFF(in[30], input); |
3587 LOAD_DQCOEFF(in7, input); | 3811 LOAD_DQCOEFF(in[7], input); |
3588 LOAD_DQCOEFF(in15, input); | 3812 LOAD_DQCOEFF(in[15], input); |
3589 LOAD_DQCOEFF(in23, input); | 3813 LOAD_DQCOEFF(in[23], input); |
3590 LOAD_DQCOEFF(in31, input); | 3814 LOAD_DQCOEFF(in[31], input); |
3591 | 3815 |
3592 // checking if all entries are zero | 3816 // checking if all entries are zero |
3593 zero_idx[0] = _mm_or_si128(in0, in1); | 3817 zero_idx[0] = _mm_or_si128(in[0], in[1]); |
3594 zero_idx[1] = _mm_or_si128(in2, in3); | 3818 zero_idx[1] = _mm_or_si128(in[2], in[3]); |
3595 zero_idx[2] = _mm_or_si128(in4, in5); | 3819 zero_idx[2] = _mm_or_si128(in[4], in[5]); |
3596 zero_idx[3] = _mm_or_si128(in6, in7); | 3820 zero_idx[3] = _mm_or_si128(in[6], in[7]); |
3597 zero_idx[4] = _mm_or_si128(in8, in9); | 3821 zero_idx[4] = _mm_or_si128(in[8], in[9]); |
3598 zero_idx[5] = _mm_or_si128(in10, in11); | 3822 zero_idx[5] = _mm_or_si128(in[10], in[11]); |
3599 zero_idx[6] = _mm_or_si128(in12, in13); | 3823 zero_idx[6] = _mm_or_si128(in[12], in[13]); |
3600 zero_idx[7] = _mm_or_si128(in14, in15); | 3824 zero_idx[7] = _mm_or_si128(in[14], in[15]); |
3601 zero_idx[8] = _mm_or_si128(in16, in17); | 3825 zero_idx[8] = _mm_or_si128(in[16], in[17]); |
3602 zero_idx[9] = _mm_or_si128(in18, in19); | 3826 zero_idx[9] = _mm_or_si128(in[18], in[19]); |
3603 zero_idx[10] = _mm_or_si128(in20, in21); | 3827 zero_idx[10] = _mm_or_si128(in[20], in[21]); |
3604 zero_idx[11] = _mm_or_si128(in22, in23); | 3828 zero_idx[11] = _mm_or_si128(in[22], in[23]); |
3605 zero_idx[12] = _mm_or_si128(in24, in25); | 3829 zero_idx[12] = _mm_or_si128(in[24], in[25]); |
3606 zero_idx[13] = _mm_or_si128(in26, in27); | 3830 zero_idx[13] = _mm_or_si128(in[26], in[27]); |
3607 zero_idx[14] = _mm_or_si128(in28, in29); | 3831 zero_idx[14] = _mm_or_si128(in[28], in[29]); |
3608 zero_idx[15] = _mm_or_si128(in30, in31); | 3832 zero_idx[15] = _mm_or_si128(in[30], in[31]); |
3609 | 3833 |
3610 zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]); | 3834 zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]); |
3611 zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]); | 3835 zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]); |
3612 zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]); | 3836 zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]); |
3613 zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]); | 3837 zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]); |
3614 zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]); | 3838 zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]); |
3615 zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]); | 3839 zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]); |
3616 zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]); | 3840 zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]); |
3617 zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]); | 3841 zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]); |
3618 | 3842 |
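The lines skipped below finish this reduction and derive the scalar zero_flag values declared above; the exact intrinsics are elided, but the idea is one more OR level plus an extraction, e.g. (an illustrative sketch, not necessarily the literal code):

    __m128i acc = _mm_or_si128(_mm_or_si128(zero_idx[0], zero_idx[1]),
                               _mm_or_si128(zero_idx[2], zero_idx[3]));
    acc = _mm_or_si128(acc,
                       _mm_or_si128(_mm_or_si128(zero_idx[4], zero_idx[5]),
                                    _mm_or_si128(zero_idx[6], zero_idx[7])));
    /* 0xFFFF iff every byte of acc is zero, i.e. the whole strip is zero. */
    zero_flag[0] = (_mm_movemask_epi8(_mm_cmpeq_epi8(acc, _mm_setzero_si128()))
                    == 0xFFFF);

When the strip is all zero, the code stores zeroed col[] entries (visible below) and continues, skipping the 1-D transform entirely.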
(...skipping 41 matching lines...)
3660 col[i32 + 26] = _mm_setzero_si128(); | 3884 col[i32 + 26] = _mm_setzero_si128(); |
3661 col[i32 + 27] = _mm_setzero_si128(); | 3885 col[i32 + 27] = _mm_setzero_si128(); |
3662 col[i32 + 28] = _mm_setzero_si128(); | 3886 col[i32 + 28] = _mm_setzero_si128(); |
3663 col[i32 + 29] = _mm_setzero_si128(); | 3887 col[i32 + 29] = _mm_setzero_si128(); |
3664 col[i32 + 30] = _mm_setzero_si128(); | 3888 col[i32 + 30] = _mm_setzero_si128(); |
3665 col[i32 + 31] = _mm_setzero_si128(); | 3889 col[i32 + 31] = _mm_setzero_si128(); |
3666 continue; | 3890 continue; |
3667 } | 3891 } |
3668 | 3892 |
3669 // Transpose 32x8 block to 8x32 block | 3893 // Transpose 32x8 block to 8x32 block |
3670 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, | 3894 array_transpose_8x8(in, in); |
3671 in4, in5, in6, in7); | 3895 array_transpose_8x8(in+8, in+8); |
3672 TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, | 3896 array_transpose_8x8(in+16, in+16); |
3673 in10, in11, in12, in13, in14, in15); | 3897 array_transpose_8x8(in+24, in+24); |
3674 TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17, | |
3675 in18, in19, in20, in21, in22, in23); | |
3676 TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25, | |
3677 in26, in27, in28, in29, in30, in31); | |
3678 } else { | |
3679 // Second 1-D idct | |
3680 j = i - 4; | |
3681 | 3898 |
3682 // Transpose 32x8 block to 8x32 block | 3899 IDCT32_1D |
3683 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], | |
3684 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], | |
3685 col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4, | |
3686 in5, in6, in7); | |
3687 j += 4; | |
3688 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], | |
3689 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], | |
3690 col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10, | |
3691 in11, in12, in13, in14, in15); | |
3692 j += 4; | |
3693 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], | |
3694 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], | |
3695 col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18, | |
3696 in19, in20, in21, in22, in23); | |
3697 j += 4; | |
3698 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], | |
3699 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], | |
3700 col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27, | |
3701 in28, in29, in30, in31); | |
3702 } | |
3703 | 3900 |
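array_transpose_8x8, introduced on the new side, folds each old four-output TRANSPOSE_8X8 invocation into one call per 8x8 sub-block of the strip. Its definition is earlier in the file; an in-place-safe sketch of an SSE2 8x8 transpose of int16 rows (the real helper may differ in detail):

    static void transpose_8x8_sketch(const __m128i *in, __m128i *out) {
      /* Stage 1: interleave 16-bit lanes of adjacent rows. */
      const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
      const __m128i a1 = _mm_unpackhi_epi16(in[0], in[1]);
      const __m128i a2 = _mm_unpacklo_epi16(in[2], in[3]);
      const __m128i a3 = _mm_unpackhi_epi16(in[2], in[3]);
      const __m128i a4 = _mm_unpacklo_epi16(in[4], in[5]);
      const __m128i a5 = _mm_unpackhi_epi16(in[4], in[5]);
      const __m128i a6 = _mm_unpacklo_epi16(in[6], in[7]);
      const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]);
      /* Stage 2: interleave 32-bit pairs. */
      const __m128i b0 = _mm_unpacklo_epi32(a0, a2);
      const __m128i b1 = _mm_unpackhi_epi32(a0, a2);
      const __m128i b2 = _mm_unpacklo_epi32(a1, a3);
      const __m128i b3 = _mm_unpackhi_epi32(a1, a3);
      const __m128i b4 = _mm_unpacklo_epi32(a4, a6);
      const __m128i b5 = _mm_unpackhi_epi32(a4, a6);
      const __m128i b6 = _mm_unpacklo_epi32(a5, a7);
      const __m128i b7 = _mm_unpackhi_epi32(a5, a7);
      /* Stage 3: interleave 64-bit halves; out[k] is input column k. */
      out[0] = _mm_unpacklo_epi64(b0, b4);
      out[1] = _mm_unpackhi_epi64(b0, b4);
      out[2] = _mm_unpacklo_epi64(b1, b5);
      out[3] = _mm_unpackhi_epi64(b1, b5);
      out[4] = _mm_unpacklo_epi64(b2, b6);
      out[5] = _mm_unpackhi_epi64(b2, b6);
      out[6] = _mm_unpacklo_epi64(b3, b7);
      out[7] = _mm_unpackhi_epi64(b3, b7);
    }

All reads land in locals before any write, so calling it with out == in, as the new code does with array_transpose_8x8(in, in), is safe.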
3704 IDCT32_1D | |
3705 | |
3706 // final stage | |
3707 if (i < 4) { | |
3708 // 1_D: Store 32 intermediate results for each 8x32 block. | 3901 // 1_D: Store 32 intermediate results for each 8x32 block. |
3709 col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); | 3902 col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); |
3710 col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); | 3903 col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); |
3711 col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); | 3904 col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); |
3712 col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); | 3905 col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); |
3713 col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); | 3906 col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); |
3714 col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); | 3907 col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); |
3715 col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); | 3908 col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); |
3716 col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); | 3909 col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); |
3717 col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); | 3910 col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); |
(...skipping 13 matching lines...)
3731 col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); | 3924 col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); |
3732 col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); | 3925 col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); |
3733 col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); | 3926 col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); |
3734 col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); | 3927 col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); |
3735 col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); | 3928 col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); |
3736 col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); | 3929 col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); |
3737 col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); | 3930 col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); |
3738 col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); | 3931 col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); |
3739 col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); | 3932 col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); |
3740 col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); | 3933 col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); |
3741 } else { | 3934 } |
| 3935 for (i = 0; i < 4; i++) { |
3742 const __m128i zero = _mm_setzero_si128(); | 3936 const __m128i zero = _mm_setzero_si128(); |
| 3937 // Second 1-D idct |
| 3938 j = i << 3; |
| 3939 |
| 3940 // Transpose 32x8 block to 8x32 block |
| 3941 array_transpose_8x8(col+j, in); |
| 3942 array_transpose_8x8(col+j+32, in+8); |
| 3943 array_transpose_8x8(col+j+64, in+16); |
| 3944 array_transpose_8x8(col+j+96, in+24); |
| 3945 |
| 3946 IDCT32_1D |
3743 | 3947 |
3744 // 2_D: Calculate the results and store them to destination. | 3948 // 2_D: Calculate the results and store them to destination. |
3745 in0 = _mm_add_epi16(stp1_0, stp1_31); | 3949 in[0] = _mm_add_epi16(stp1_0, stp1_31); |
3746 in1 = _mm_add_epi16(stp1_1, stp1_30); | 3950 in[1] = _mm_add_epi16(stp1_1, stp1_30); |
3747 in2 = _mm_add_epi16(stp1_2, stp1_29); | 3951 in[2] = _mm_add_epi16(stp1_2, stp1_29); |
3748 in3 = _mm_add_epi16(stp1_3, stp1_28); | 3952 in[3] = _mm_add_epi16(stp1_3, stp1_28); |
3749 in4 = _mm_add_epi16(stp1_4, stp1_27); | 3953 in[4] = _mm_add_epi16(stp1_4, stp1_27); |
3750 in5 = _mm_add_epi16(stp1_5, stp1_26); | 3954 in[5] = _mm_add_epi16(stp1_5, stp1_26); |
3751 in6 = _mm_add_epi16(stp1_6, stp1_25); | 3955 in[6] = _mm_add_epi16(stp1_6, stp1_25); |
3752 in7 = _mm_add_epi16(stp1_7, stp1_24); | 3956 in[7] = _mm_add_epi16(stp1_7, stp1_24); |
3753 in8 = _mm_add_epi16(stp1_8, stp1_23); | 3957 in[8] = _mm_add_epi16(stp1_8, stp1_23); |
3754 in9 = _mm_add_epi16(stp1_9, stp1_22); | 3958 in[9] = _mm_add_epi16(stp1_9, stp1_22); |
3755 in10 = _mm_add_epi16(stp1_10, stp1_21); | 3959 in[10] = _mm_add_epi16(stp1_10, stp1_21); |
3756 in11 = _mm_add_epi16(stp1_11, stp1_20); | 3960 in[11] = _mm_add_epi16(stp1_11, stp1_20); |
3757 in12 = _mm_add_epi16(stp1_12, stp1_19); | 3961 in[12] = _mm_add_epi16(stp1_12, stp1_19); |
3758 in13 = _mm_add_epi16(stp1_13, stp1_18); | 3962 in[13] = _mm_add_epi16(stp1_13, stp1_18); |
3759 in14 = _mm_add_epi16(stp1_14, stp1_17); | 3963 in[14] = _mm_add_epi16(stp1_14, stp1_17); |
3760 in15 = _mm_add_epi16(stp1_15, stp1_16); | 3964 in[15] = _mm_add_epi16(stp1_15, stp1_16); |
3761 in16 = _mm_sub_epi16(stp1_15, stp1_16); | 3965 in[16] = _mm_sub_epi16(stp1_15, stp1_16); |
3762 in17 = _mm_sub_epi16(stp1_14, stp1_17); | 3966 in[17] = _mm_sub_epi16(stp1_14, stp1_17); |
3763 in18 = _mm_sub_epi16(stp1_13, stp1_18); | 3967 in[18] = _mm_sub_epi16(stp1_13, stp1_18); |
3764 in19 = _mm_sub_epi16(stp1_12, stp1_19); | 3968 in[19] = _mm_sub_epi16(stp1_12, stp1_19); |
3765 in20 = _mm_sub_epi16(stp1_11, stp1_20); | 3969 in[20] = _mm_sub_epi16(stp1_11, stp1_20); |
3766 in21 = _mm_sub_epi16(stp1_10, stp1_21); | 3970 in[21] = _mm_sub_epi16(stp1_10, stp1_21); |
3767 in22 = _mm_sub_epi16(stp1_9, stp1_22); | 3971 in[22] = _mm_sub_epi16(stp1_9, stp1_22); |
3768 in23 = _mm_sub_epi16(stp1_8, stp1_23); | 3972 in[23] = _mm_sub_epi16(stp1_8, stp1_23); |
3769 in24 = _mm_sub_epi16(stp1_7, stp1_24); | 3973 in[24] = _mm_sub_epi16(stp1_7, stp1_24); |
3770 in25 = _mm_sub_epi16(stp1_6, stp1_25); | 3974 in[25] = _mm_sub_epi16(stp1_6, stp1_25); |
3771 in26 = _mm_sub_epi16(stp1_5, stp1_26); | 3975 in[26] = _mm_sub_epi16(stp1_5, stp1_26); |
3772 in27 = _mm_sub_epi16(stp1_4, stp1_27); | 3976 in[27] = _mm_sub_epi16(stp1_4, stp1_27); |
3773 in28 = _mm_sub_epi16(stp1_3, stp1_28); | 3977 in[28] = _mm_sub_epi16(stp1_3, stp1_28); |
3774 in29 = _mm_sub_epi16(stp1_2, stp1_29); | 3978 in[29] = _mm_sub_epi16(stp1_2, stp1_29); |
3775 in30 = _mm_sub_epi16(stp1_1, stp1_30); | 3979 in[30] = _mm_sub_epi16(stp1_1, stp1_30); |
3776 in31 = _mm_sub_epi16(stp1_0, stp1_31); | 3980 in[31] = _mm_sub_epi16(stp1_0, stp1_31); |
3777 | 3981 |
3778 // Final rounding and shift | 3982 // Final rounding and shift |
3779 in0 = _mm_adds_epi16(in0, final_rounding); | 3983 in[0] = _mm_adds_epi16(in[0], final_rounding); |
3780 in1 = _mm_adds_epi16(in1, final_rounding); | 3984 in[1] = _mm_adds_epi16(in[1], final_rounding); |
3781 in2 = _mm_adds_epi16(in2, final_rounding); | 3985 in[2] = _mm_adds_epi16(in[2], final_rounding); |
3782 in3 = _mm_adds_epi16(in3, final_rounding); | 3986 in[3] = _mm_adds_epi16(in[3], final_rounding); |
3783 in4 = _mm_adds_epi16(in4, final_rounding); | 3987 in[4] = _mm_adds_epi16(in[4], final_rounding); |
3784 in5 = _mm_adds_epi16(in5, final_rounding); | 3988 in[5] = _mm_adds_epi16(in[5], final_rounding); |
3785 in6 = _mm_adds_epi16(in6, final_rounding); | 3989 in[6] = _mm_adds_epi16(in[6], final_rounding); |
3786 in7 = _mm_adds_epi16(in7, final_rounding); | 3990 in[7] = _mm_adds_epi16(in[7], final_rounding); |
3787 in8 = _mm_adds_epi16(in8, final_rounding); | 3991 in[8] = _mm_adds_epi16(in[8], final_rounding); |
3788 in9 = _mm_adds_epi16(in9, final_rounding); | 3992 in[9] = _mm_adds_epi16(in[9], final_rounding); |
3789 in10 = _mm_adds_epi16(in10, final_rounding); | 3993 in[10] = _mm_adds_epi16(in[10], final_rounding); |
3790 in11 = _mm_adds_epi16(in11, final_rounding); | 3994 in[11] = _mm_adds_epi16(in[11], final_rounding); |
3791 in12 = _mm_adds_epi16(in12, final_rounding); | 3995 in[12] = _mm_adds_epi16(in[12], final_rounding); |
3792 in13 = _mm_adds_epi16(in13, final_rounding); | 3996 in[13] = _mm_adds_epi16(in[13], final_rounding); |
3793 in14 = _mm_adds_epi16(in14, final_rounding); | 3997 in[14] = _mm_adds_epi16(in[14], final_rounding); |
3794 in15 = _mm_adds_epi16(in15, final_rounding); | 3998 in[15] = _mm_adds_epi16(in[15], final_rounding); |
3795 in16 = _mm_adds_epi16(in16, final_rounding); | 3999 in[16] = _mm_adds_epi16(in[16], final_rounding); |
3796 in17 = _mm_adds_epi16(in17, final_rounding); | 4000 in[17] = _mm_adds_epi16(in[17], final_rounding); |
3797 in18 = _mm_adds_epi16(in18, final_rounding); | 4001 in[18] = _mm_adds_epi16(in[18], final_rounding); |
3798 in19 = _mm_adds_epi16(in19, final_rounding); | 4002 in[19] = _mm_adds_epi16(in[19], final_rounding); |
3799 in20 = _mm_adds_epi16(in20, final_rounding); | 4003 in[20] = _mm_adds_epi16(in[20], final_rounding); |
3800 in21 = _mm_adds_epi16(in21, final_rounding); | 4004 in[21] = _mm_adds_epi16(in[21], final_rounding); |
3801 in22 = _mm_adds_epi16(in22, final_rounding); | 4005 in[22] = _mm_adds_epi16(in[22], final_rounding); |
3802 in23 = _mm_adds_epi16(in23, final_rounding); | 4006 in[23] = _mm_adds_epi16(in[23], final_rounding); |
3803 in24 = _mm_adds_epi16(in24, final_rounding); | 4007 in[24] = _mm_adds_epi16(in[24], final_rounding); |
3804 in25 = _mm_adds_epi16(in25, final_rounding); | 4008 in[25] = _mm_adds_epi16(in[25], final_rounding); |
3805 in26 = _mm_adds_epi16(in26, final_rounding); | 4009 in[26] = _mm_adds_epi16(in[26], final_rounding); |
3806 in27 = _mm_adds_epi16(in27, final_rounding); | 4010 in[27] = _mm_adds_epi16(in[27], final_rounding); |
3807 in28 = _mm_adds_epi16(in28, final_rounding); | 4011 in[28] = _mm_adds_epi16(in[28], final_rounding); |
3808 in29 = _mm_adds_epi16(in29, final_rounding); | 4012 in[29] = _mm_adds_epi16(in[29], final_rounding); |
3809 in30 = _mm_adds_epi16(in30, final_rounding); | 4013 in[30] = _mm_adds_epi16(in[30], final_rounding); |
3810 in31 = _mm_adds_epi16(in31, final_rounding); | 4014 in[31] = _mm_adds_epi16(in[31], final_rounding); |
3811 | 4015 |
3812 in0 = _mm_srai_epi16(in0, 6); | 4016 in[0] = _mm_srai_epi16(in[0], 6); |
3813 in1 = _mm_srai_epi16(in1, 6); | 4017 in[1] = _mm_srai_epi16(in[1], 6); |
3814 in2 = _mm_srai_epi16(in2, 6); | 4018 in[2] = _mm_srai_epi16(in[2], 6); |
3815 in3 = _mm_srai_epi16(in3, 6); | 4019 in[3] = _mm_srai_epi16(in[3], 6); |
3816 in4 = _mm_srai_epi16(in4, 6); | 4020 in[4] = _mm_srai_epi16(in[4], 6); |
3817 in5 = _mm_srai_epi16(in5, 6); | 4021 in[5] = _mm_srai_epi16(in[5], 6); |
3818 in6 = _mm_srai_epi16(in6, 6); | 4022 in[6] = _mm_srai_epi16(in[6], 6); |
3819 in7 = _mm_srai_epi16(in7, 6); | 4023 in[7] = _mm_srai_epi16(in[7], 6); |
3820 in8 = _mm_srai_epi16(in8, 6); | 4024 in[8] = _mm_srai_epi16(in[8], 6); |
3821 in9 = _mm_srai_epi16(in9, 6); | 4025 in[9] = _mm_srai_epi16(in[9], 6); |
3822 in10 = _mm_srai_epi16(in10, 6); | 4026 in[10] = _mm_srai_epi16(in[10], 6); |
3823 in11 = _mm_srai_epi16(in11, 6); | 4027 in[11] = _mm_srai_epi16(in[11], 6); |
3824 in12 = _mm_srai_epi16(in12, 6); | 4028 in[12] = _mm_srai_epi16(in[12], 6); |
3825 in13 = _mm_srai_epi16(in13, 6); | 4029 in[13] = _mm_srai_epi16(in[13], 6); |
3826 in14 = _mm_srai_epi16(in14, 6); | 4030 in[14] = _mm_srai_epi16(in[14], 6); |
3827 in15 = _mm_srai_epi16(in15, 6); | 4031 in[15] = _mm_srai_epi16(in[15], 6); |
3828 in16 = _mm_srai_epi16(in16, 6); | 4032 in[16] = _mm_srai_epi16(in[16], 6); |
3829 in17 = _mm_srai_epi16(in17, 6); | 4033 in[17] = _mm_srai_epi16(in[17], 6); |
3830 in18 = _mm_srai_epi16(in18, 6); | 4034 in[18] = _mm_srai_epi16(in[18], 6); |
3831 in19 = _mm_srai_epi16(in19, 6); | 4035 in[19] = _mm_srai_epi16(in[19], 6); |
3832 in20 = _mm_srai_epi16(in20, 6); | 4036 in[20] = _mm_srai_epi16(in[20], 6); |
3833 in21 = _mm_srai_epi16(in21, 6); | 4037 in[21] = _mm_srai_epi16(in[21], 6); |
3834 in22 = _mm_srai_epi16(in22, 6); | 4038 in[22] = _mm_srai_epi16(in[22], 6); |
3835 in23 = _mm_srai_epi16(in23, 6); | 4039 in[23] = _mm_srai_epi16(in[23], 6); |
3836 in24 = _mm_srai_epi16(in24, 6); | 4040 in[24] = _mm_srai_epi16(in[24], 6); |
3837 in25 = _mm_srai_epi16(in25, 6); | 4041 in[25] = _mm_srai_epi16(in[25], 6); |
3838 in26 = _mm_srai_epi16(in26, 6); | 4042 in[26] = _mm_srai_epi16(in[26], 6); |
3839 in27 = _mm_srai_epi16(in27, 6); | 4043 in[27] = _mm_srai_epi16(in[27], 6); |
3840 in28 = _mm_srai_epi16(in28, 6); | 4044 in[28] = _mm_srai_epi16(in[28], 6); |
3841 in29 = _mm_srai_epi16(in29, 6); | 4045 in[29] = _mm_srai_epi16(in[29], 6); |
3842 in30 = _mm_srai_epi16(in30, 6); | 4046 in[30] = _mm_srai_epi16(in[30], 6); |
3843 in31 = _mm_srai_epi16(in31, 6); | 4047 in[31] = _mm_srai_epi16(in[31], 6); |
3844 | 4048 |
3845 RECON_AND_STORE(dest, in0); | 4049 RECON_AND_STORE(dest, in[0]); |
3846 RECON_AND_STORE(dest, in1); | 4050 RECON_AND_STORE(dest, in[1]); |
3847 RECON_AND_STORE(dest, in2); | 4051 RECON_AND_STORE(dest, in[2]); |
3848 RECON_AND_STORE(dest, in3); | 4052 RECON_AND_STORE(dest, in[3]); |
3849 RECON_AND_STORE(dest, in4); | 4053 RECON_AND_STORE(dest, in[4]); |
3850 RECON_AND_STORE(dest, in5); | 4054 RECON_AND_STORE(dest, in[5]); |
3851 RECON_AND_STORE(dest, in6); | 4055 RECON_AND_STORE(dest, in[6]); |
3852 RECON_AND_STORE(dest, in7); | 4056 RECON_AND_STORE(dest, in[7]); |
3853 RECON_AND_STORE(dest, in8); | 4057 RECON_AND_STORE(dest, in[8]); |
3854 RECON_AND_STORE(dest, in9); | 4058 RECON_AND_STORE(dest, in[9]); |
3855 RECON_AND_STORE(dest, in10); | 4059 RECON_AND_STORE(dest, in[10]); |
3856 RECON_AND_STORE(dest, in11); | 4060 RECON_AND_STORE(dest, in[11]); |
3857 RECON_AND_STORE(dest, in12); | 4061 RECON_AND_STORE(dest, in[12]); |
3858 RECON_AND_STORE(dest, in13); | 4062 RECON_AND_STORE(dest, in[13]); |
3859 RECON_AND_STORE(dest, in14); | 4063 RECON_AND_STORE(dest, in[14]); |
3860 RECON_AND_STORE(dest, in15); | 4064 RECON_AND_STORE(dest, in[15]); |
3861 RECON_AND_STORE(dest, in16); | 4065 RECON_AND_STORE(dest, in[16]); |
3862 RECON_AND_STORE(dest, in17); | 4066 RECON_AND_STORE(dest, in[17]); |
3863 RECON_AND_STORE(dest, in18); | 4067 RECON_AND_STORE(dest, in[18]); |
3864 RECON_AND_STORE(dest, in19); | 4068 RECON_AND_STORE(dest, in[19]); |
3865 RECON_AND_STORE(dest, in20); | 4069 RECON_AND_STORE(dest, in[20]); |
3866 RECON_AND_STORE(dest, in21); | 4070 RECON_AND_STORE(dest, in[21]); |
3867 RECON_AND_STORE(dest, in22); | 4071 RECON_AND_STORE(dest, in[22]); |
3868 RECON_AND_STORE(dest, in23); | 4072 RECON_AND_STORE(dest, in[23]); |
3869 RECON_AND_STORE(dest, in24); | 4073 RECON_AND_STORE(dest, in[24]); |
3870 RECON_AND_STORE(dest, in25); | 4074 RECON_AND_STORE(dest, in[25]); |
3871 RECON_AND_STORE(dest, in26); | 4075 RECON_AND_STORE(dest, in[26]); |
3872 RECON_AND_STORE(dest, in27); | 4076 RECON_AND_STORE(dest, in[27]); |
3873 RECON_AND_STORE(dest, in28); | 4077 RECON_AND_STORE(dest, in[28]); |
3874 RECON_AND_STORE(dest, in29); | 4078 RECON_AND_STORE(dest, in[29]); |
3875 RECON_AND_STORE(dest, in30); | 4079 RECON_AND_STORE(dest, in[30]); |
3876 RECON_AND_STORE(dest, in31); | 4080 RECON_AND_STORE(dest, in[31]); |
3877 | 4081 |
3878 dest += 8 - (stride * 32); | 4082 dest += 8 - (stride * 32); |
3879 } | 4083 } |
3880 } | |
3881 } //NOLINT | 4084 } //NOLINT |
3882 | 4085 |
3883 void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { | 4086 void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
3884 __m128i dc_value; | 4087 __m128i dc_value; |
3885 const __m128i zero = _mm_setzero_si128(); | 4088 const __m128i zero = _mm_setzero_si128(); |
3886 int a, i; | 4089 int a, i; |
3887 | 4090 |
3888 a = dct_const_round_shift(input[0] * cospi_16_64); | 4091 a = dct_const_round_shift(input[0] * cospi_16_64); |
3889 a = dct_const_round_shift(a * cospi_16_64); | 4092 a = dct_const_round_shift(a * cospi_16_64); |
3890 a = ROUND_POWER_OF_TWO(a, 6); | 4093 a = ROUND_POWER_OF_TWO(a, 6); |
(...skipping 29 matching lines...)
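The skipped lines above presumably unroll the remainder of the store loop; the visible tail implies this rolled-up equivalent, in which RECON_AND_STORE adds the broadcast DC value to 8 pixels and steps dest down one row:

    for (i = 0; i < 4; ++i) {             /* four 8-column strips */
      int r;
      for (r = 0; r < 32; ++r)
        RECON_AND_STORE(dest, dc_value);  /* dest += stride inside the macro */
      dest += 8 - (stride * 32);          /* back to row 0, over 8 columns */
    }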
3920 RECON_AND_STORE(dest, dc_value); | 4123 RECON_AND_STORE(dest, dc_value); |
3921 RECON_AND_STORE(dest, dc_value); | 4124 RECON_AND_STORE(dest, dc_value); |
3922 RECON_AND_STORE(dest, dc_value); | 4125 RECON_AND_STORE(dest, dc_value); |
3923 RECON_AND_STORE(dest, dc_value); | 4126 RECON_AND_STORE(dest, dc_value); |
3924 RECON_AND_STORE(dest, dc_value); | 4127 RECON_AND_STORE(dest, dc_value); |
3925 RECON_AND_STORE(dest, dc_value); | 4128 RECON_AND_STORE(dest, dc_value); |
3926 RECON_AND_STORE(dest, dc_value); | 4129 RECON_AND_STORE(dest, dc_value); |
3927 dest += 8 - (stride * 32); | 4130 dest += 8 - (stride * 32); |
3928 } | 4131 } |
3929 } | 4132 } |