/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>  // SSE2
#include "vp9/common/vp9_idct.h"  // for cospi constants
#include "vp9/encoder/vp9_dct.h"
#include "vp9/encoder/x86/vp9_dct_sse2.h"
#include "vpx_ports/mem.h"

#if DCT_HIGH_BIT_DEPTH
#define ADD_EPI16 _mm_adds_epi16
#define SUB_EPI16 _mm_subs_epi16

#else
#define ADD_EPI16 _mm_add_epi16
#define SUB_EPI16 _mm_sub_epi16
#endif
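
// Note: in the high-bit-depth build the saturating forms are used so that a
// 16-bit overflow clamps to INT16_MIN/INT16_MAX instead of wrapping; the
// check_epi16_overflow_*() helpers used below can then detect the saturated
// values and trigger a fall-back to the bit-exact C implementation.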

void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
  // This 2D transform implements 4 vertical 1D transforms followed
  // by 4 horizontal 1D transforms. The multiplies and adds are as given
  // by Chen, Smith and Fralick ('77). The commands for moving the data
  // around have been minimized by hand.
  // For the purposes of the comments, the 16 inputs are referred to as i0
  // through iF (in raster order), intermediate variables are a0, b0, c0
  // through f, and correspond to the in-place computations mapped to input
  // locations. The outputs, o0 through oF, are labeled according to the
  // output locations.

  // Constants
  // These are the coefficients used for the multiplies.
  // In the comments, pN means cos(N pi / 64) and mN is -cos(N pi / 64),
  // where cospi_N_64 = cos(N pi / 64).
  const __m128i k__cospi_A = _mm_setr_epi16(cospi_16_64, cospi_16_64,
                                            cospi_16_64, cospi_16_64,
                                            cospi_16_64, -cospi_16_64,
                                            cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_B = _mm_setr_epi16(cospi_16_64, -cospi_16_64,
                                            cospi_16_64, -cospi_16_64,
                                            cospi_16_64, cospi_16_64,
                                            cospi_16_64, cospi_16_64);
  const __m128i k__cospi_C = _mm_setr_epi16(cospi_8_64, cospi_24_64,
                                            cospi_8_64, cospi_24_64,
                                            cospi_24_64, -cospi_8_64,
                                            cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_D = _mm_setr_epi16(cospi_24_64, -cospi_8_64,
                                            cospi_24_64, -cospi_8_64,
                                            cospi_8_64, cospi_24_64,
                                            cospi_8_64, cospi_24_64);
  const __m128i k__cospi_E = _mm_setr_epi16(cospi_16_64, cospi_16_64,
                                            cospi_16_64, cospi_16_64,
                                            cospi_16_64, cospi_16_64,
                                            cospi_16_64, cospi_16_64);
  const __m128i k__cospi_F = _mm_setr_epi16(cospi_16_64, -cospi_16_64,
                                            cospi_16_64, -cospi_16_64,
                                            cospi_16_64, -cospi_16_64,
                                            cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_G = _mm_setr_epi16(cospi_8_64, cospi_24_64,
                                            cospi_8_64, cospi_24_64,
                                            -cospi_8_64, -cospi_24_64,
                                            -cospi_8_64, -cospi_24_64);
  const __m128i k__cospi_H = _mm_setr_epi16(cospi_24_64, -cospi_8_64,
                                            cospi_24_64, -cospi_8_64,
                                            -cospi_24_64, cospi_8_64,
                                            -cospi_24_64, cospi_8_64);
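  // Each of these vectors is consumed by _mm_madd_epi16, which multiplies
  // eight 16-bit lanes pairwise and sums adjacent products into four 32-bit
  // lanes: with inputs interleaved as [x0 y0 x1 y1 ...] and a constant of
  // [c d c d ...], one madd yields x*c + y*d for four positions at once.
  // E.g. lane 0 of _mm_madd_epi16(t0, k__cospi_A) below is a0*p16 + a4*p16,
  // the b0 term.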

  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  // This second rounding constant saves doing some extra adds at the end
  const __m128i k__DCT_CONST_ROUNDING2 =
      _mm_set1_epi32(DCT_CONST_ROUNDING + (DCT_CONST_ROUNDING << 1));
  const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2;
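  // The folded rounding is bit-exact with the C code's two rounding steps:
  //   (((x + ROUNDING) >> DCT_CONST_BITS) + 1) >> 2
  //     == (x + ROUNDING + (1 << DCT_CONST_BITS)) >> (DCT_CONST_BITS + 2)
  //     == (x + 3 * ROUNDING) >> DCT_CONST_BITS2
  // since ROUNDING (= DCT_CONST_ROUNDING) is 1 << (DCT_CONST_BITS - 1).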
  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
  __m128i in0, in1;
#if DCT_HIGH_BIT_DEPTH
  __m128i cmp0, cmp1;
  int test, overflow;
#endif

  // Load inputs.
  in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
  in1 = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *)
                                                (input + 2 * stride)));
  in0 = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *)
                                                (input + 3 * stride)));
  // in0 = [i0 i1 i2 i3 iC iD iE iF]
  // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
#if DCT_HIGH_BIT_DEPTH
  // Check that the inputs are small enough to use the optimised code path.
  cmp0 = _mm_xor_si128(_mm_cmpgt_epi16(in0, _mm_set1_epi16(0x3ff)),
                       _mm_cmplt_epi16(in0, _mm_set1_epi16(0xfc00)));
  cmp1 = _mm_xor_si128(_mm_cmpgt_epi16(in1, _mm_set1_epi16(0x3ff)),
                       _mm_cmplt_epi16(in1, _mm_set1_epi16(0xfc00)));
  test = _mm_movemask_epi8(_mm_or_si128(cmp0, cmp1));
  if (test) {
    vp9_highbd_fdct4x4_c(input, output, stride);
    return;
  }
#endif  // DCT_HIGH_BIT_DEPTH
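  // The test above passes only if every coefficient lies in [-1024, 1023]:
  // after the << 4 below, values stay within +/-16384, so the 16-bit lanes
  // cannot overflow in the first add/subtract stage; larger residuals
  // (possible with 10/12-bit sources) take the C path instead.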

  // multiply by 16 to give some extra precision
  in0 = _mm_slli_epi16(in0, 4);
  in1 = _mm_slli_epi16(in1, 4);
  // if (i == 0 && input[0]) input[0] += 1;
  // add 1 to the upper left pixel if it is non-zero, which helps reduce
  // the round-trip error
  {
    // The mask will only contain whether the first value is zero; all
    // other comparisons will fail as something shifted by 4 (above << 4)
    // can never be equal to one. To increment in the non-zero case, we
    // add the mask and one for the first element:
    // - if zero, mask = -1, v = v - 1 + 1 = v
    // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
    __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
    in0 = _mm_add_epi16(in0, mask);
    in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
  }
  // There are 4 total stages, alternating between an add/subtract stage
  // and a multiply-and-add stage.
  {
    // Stage 1: Add/subtract

    // in0 = [i0 i1 i2 i3 iC iD iE iF]
    // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
    const __m128i r0 = _mm_unpacklo_epi16(in0, in1);
    const __m128i r1 = _mm_unpackhi_epi16(in0, in1);
    // r0 = [i0 i4 i1 i5 i2 i6 i3 i7]
    // r1 = [iC i8 iD i9 iE iA iF iB]
    const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4);
    const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4);
    // r2 = [i0 i4 i1 i5 i3 i7 i2 i6]
    // r3 = [iC i8 iD i9 iF iB iE iA]
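    // _mm_shuffle_epi32 copies source dword ((imm >> (2 * i)) & 3) into
    // result dword i, so 0xB4 (0b10110100) selects dwords [0, 1, 3, 2]:
    // the top two 32-bit pairs are swapped so that the add/subtract below
    // pairs each value with the operand the vertical butterflies need.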

    const __m128i t0 = _mm_add_epi16(r2, r3);
    const __m128i t1 = _mm_sub_epi16(r2, r3);
    // t0 = [a0 a4 a1 a5 a3 a7 a2 a6]
    // t1 = [aC a8 aD a9 aF aB aE aA]

    // Stage 2: multiply by constants (which gets us into 32 bits).
    // The constants needed here are:
    // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16]
    // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16]
    // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08]
    // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24]
    const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A);
    const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B);
    const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C);
    const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D);
    // Then add and right-shift to get back to 16-bit range
    const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
    const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
    const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
    const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
    const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
    const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
    const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
    const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
    // w0 = [b0 b1 b7 b6]
    // w1 = [b8 b9 bF bE]
    // w2 = [b4 b5 b3 b2]
    // w3 = [bC bD bB bA]
    const __m128i x0 = _mm_packs_epi32(w0, w1);
    const __m128i x1 = _mm_packs_epi32(w2, w3);
#if DCT_HIGH_BIT_DEPTH
    overflow = check_epi16_overflow_x2(&x0, &x1);
    if (overflow) {
      vp9_highbd_fdct4x4_c(input, output, stride);
      return;
    }
#endif  // DCT_HIGH_BIT_DEPTH
    // x0 = [b0 b1 b7 b6 b8 b9 bF bE]
    // x1 = [b4 b5 b3 b2 bC bD bB bA]
    in0 = _mm_shuffle_epi32(x0, 0xD8);
    in1 = _mm_shuffle_epi32(x1, 0x8D);
    // in0 = [b0 b1 b8 b9 b7 b6 bF bE]
    // in1 = [b3 b2 bB bA b4 b5 bC bD]
  }
  {
    // vertical DCTs finished. Now we do the horizontal DCTs.
    // Stage 3: Add/subtract

    const __m128i t0 = ADD_EPI16(in0, in1);
    const __m128i t1 = SUB_EPI16(in0, in1);
    // t0 = [c0 c1 c8 c9 c4 c5 cC cD]
    // t1 = [c3 c2 cB cA -c7 -c6 -cF -cE]
#if DCT_HIGH_BIT_DEPTH
    overflow = check_epi16_overflow_x2(&t0, &t1);
    if (overflow) {
      vp9_highbd_fdct4x4_c(input, output, stride);
      return;
    }
#endif  // DCT_HIGH_BIT_DEPTH

    // Stage 4: multiply by constants (which gets us into 32 bits).
    {
      // The constants needed here are:
      // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16]
      // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16]
      // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24]
      // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08]
      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E);
      const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F);
      const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G);
      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H);
      // Then add and right-shift to get back to 16-bit range, but this
      // also folds in the final right-shift to save operations. This
      // unusual rounding operation maintains bit-accurate compatibility
      // with the C version of this function, which has two rounding steps
      // in a row.
      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2);
      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2);
      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2);
      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2);
      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2);
      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2);
      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2);
      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2);
      // w0 = [o0 o4 o8 oC]
      // w1 = [o2 o6 oA oE]
      // w2 = [o1 o5 o9 oD]
      // w3 = [o3 o7 oB oF]
      // remember the o's are numbered according to the correct output location
      const __m128i x0 = _mm_packs_epi32(w0, w1);
      const __m128i x1 = _mm_packs_epi32(w2, w3);
#if DCT_HIGH_BIT_DEPTH
      overflow = check_epi16_overflow_x2(&x0, &x1);
      if (overflow) {
        vp9_highbd_fdct4x4_c(input, output, stride);
        return;
      }
#endif  // DCT_HIGH_BIT_DEPTH
      {
        // x0 = [o0 o4 o8 oC o2 o6 oA oE]
        // x1 = [o1 o5 o9 oD o3 o7 oB oF]
        const __m128i y0 = _mm_unpacklo_epi16(x0, x1);
        const __m128i y1 = _mm_unpackhi_epi16(x0, x1);
        // y0 = [o0 o1 o4 o5 o8 o9 oC oD]
        // y1 = [o2 o3 o6 o7 oA oB oE oF]
        in0 = _mm_unpacklo_epi32(y0, y1);
        // in0 = [o0 o1 o2 o3 o4 o5 o6 o7]
        in1 = _mm_unpackhi_epi32(y0, y1);
        // in1 = [o8 o9 oA oB oC oD oE oF]
      }
    }
  }
  // Post-condition: (v + 1) >> 2 has been incorporated into the previous
  // add and right-shift commands. Only 2 store instructions are needed
  // because rows 1 and 3 are stored immediately after rows 0 and 2.
  storeu_output(&in0, output + 0 * 4);
  storeu_output(&in1, output + 2 * 4);
}


void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
  int pass;
  // Constants
  // In one case the constants are all the same; in all other cases a pair
  // of constants is needed, repeated four times, which is done by
  // constructing the 32-bit constant corresponding to that pair.
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
#if DCT_HIGH_BIT_DEPTH
  int overflow;
#endif
  // Load input
  __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
  __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
  __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
  __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
  __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
  __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
  __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
  __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
  // Pre-condition input (shift by two)
  in0 = _mm_slli_epi16(in0, 2);
  in1 = _mm_slli_epi16(in1, 2);
  in2 = _mm_slli_epi16(in2, 2);
  in3 = _mm_slli_epi16(in3, 2);
  in4 = _mm_slli_epi16(in4, 2);
  in5 = _mm_slli_epi16(in5, 2);
  in6 = _mm_slli_epi16(in6, 2);
  in7 = _mm_slli_epi16(in7, 2);
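  // Together with the divide-by-two post-condition at the end of this
  // function, the << 2 pre-scaling reproduces the fixed-point scaling of
  // the C fdct8x8: two extra precision bits are carried through both
  // passes, with one bit removed again at the end.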

  // We do two passes, first the columns, then the rows. The results of the
  // first pass are transposed so that the same column code can be reused. The
  // results of the second pass are also transposed so that the rows (processed
  // as columns) are put back in row positions.
  for (pass = 0; pass < 2; pass++) {
    // To store results of each pass before the transpose.
    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
    // Add/subtract
    const __m128i q0 = ADD_EPI16(in0, in7);
    const __m128i q1 = ADD_EPI16(in1, in6);
    const __m128i q2 = ADD_EPI16(in2, in5);
    const __m128i q3 = ADD_EPI16(in3, in4);
    const __m128i q4 = SUB_EPI16(in3, in4);
    const __m128i q5 = SUB_EPI16(in2, in5);
    const __m128i q6 = SUB_EPI16(in1, in6);
    const __m128i q7 = SUB_EPI16(in0, in7);
#if DCT_HIGH_BIT_DEPTH
    if (pass == 1) {
      overflow = check_epi16_overflow_x8(&q0, &q1, &q2, &q3,
                                         &q4, &q5, &q6, &q7);
      if (overflow) {
        vp9_highbd_fdct8x8_c(input, output, stride);
        return;
      }
    }
#endif  // DCT_HIGH_BIT_DEPTH
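    // Only the second pass can overflow here: first-pass inputs are raw
    // residuals pre-scaled by << 2, which still sum within 16 bits, while
    // second-pass values can already be near full range.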
    // Work on first four results
    {
      // Add/subtract
      const __m128i r0 = ADD_EPI16(q0, q3);
      const __m128i r1 = ADD_EPI16(q1, q2);
      const __m128i r2 = SUB_EPI16(q1, q2);
      const __m128i r3 = SUB_EPI16(q0, q3);
#if DCT_HIGH_BIT_DEPTH
      overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
      if (overflow) {
        vp9_highbd_fdct8x8_c(input, output, stride);
        return;
      }
#endif  // DCT_HIGH_BIT_DEPTH
      // Interleave to do the multiply by constants which gets us into 32 bits
      {
        const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
        const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
        const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
        const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
        const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
        const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
        const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
        const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
        const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
        const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
        const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
        const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
        // dct_const_round_shift
        const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
        const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
        const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
        const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
        const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
        const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
        const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
        const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
        const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
        const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
        const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
        const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
        const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
        const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
        const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
        const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
        // Combine
        res0 = _mm_packs_epi32(w0, w1);
        res4 = _mm_packs_epi32(w2, w3);
        res2 = _mm_packs_epi32(w4, w5);
        res6 = _mm_packs_epi32(w6, w7);
#if DCT_HIGH_BIT_DEPTH
        overflow = check_epi16_overflow_x4(&res0, &res4, &res2, &res6);
        if (overflow) {
          vp9_highbd_fdct8x8_c(input, output, stride);
          return;
        }
#endif  // DCT_HIGH_BIT_DEPTH
      }
    }
    // Work on next four results
    {
      // Interleave to do the multiply by constants which gets us into 32 bits
      const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
      const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
      const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
      const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
      const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
      const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
      // dct_const_round_shift
      const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
      const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
      const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
      const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
      const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
      const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
      const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
      const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
      // Combine
      const __m128i r0 = _mm_packs_epi32(s0, s1);
      const __m128i r1 = _mm_packs_epi32(s2, s3);
#if DCT_HIGH_BIT_DEPTH
      overflow = check_epi16_overflow_x2(&r0, &r1);
      if (overflow) {
        vp9_highbd_fdct8x8_c(input, output, stride);
        return;
      }
#endif  // DCT_HIGH_BIT_DEPTH
      {
        // Add/subtract
        const __m128i x0 = ADD_EPI16(q4, r0);
        const __m128i x1 = SUB_EPI16(q4, r0);
        const __m128i x2 = SUB_EPI16(q7, r1);
        const __m128i x3 = ADD_EPI16(q7, r1);
#if DCT_HIGH_BIT_DEPTH
        overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
        if (overflow) {
          vp9_highbd_fdct8x8_c(input, output, stride);
          return;
        }
#endif  // DCT_HIGH_BIT_DEPTH
        // Interleave to do the multiply by constants which gets us into 32 bits
        {
          const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
          const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
          const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
          const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
          const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
          const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
          const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
          const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
          const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
          const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
          // dct_const_round_shift
          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
          const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
          const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
          const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
          const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
          const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
          const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
          const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
          const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
          // Combine
          res1 = _mm_packs_epi32(w0, w1);
          res7 = _mm_packs_epi32(w2, w3);
          res5 = _mm_packs_epi32(w4, w5);
          res3 = _mm_packs_epi32(w6, w7);
#if DCT_HIGH_BIT_DEPTH
          overflow = check_epi16_overflow_x4(&res1, &res7, &res5, &res3);
          if (overflow) {
            vp9_highbd_fdct8x8_c(input, output, stride);
            return;
          }
#endif  // DCT_HIGH_BIT_DEPTH
        }
      }
    }
    // Transpose the 8x8.
    {
      // 00 01 02 03 04 05 06 07
      // 10 11 12 13 14 15 16 17
      // 20 21 22 23 24 25 26 27
      // 30 31 32 33 34 35 36 37
      // 40 41 42 43 44 45 46 47
      // 50 51 52 53 54 55 56 57
      // 60 61 62 63 64 65 66 67
      // 70 71 72 73 74 75 76 77
      const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
      const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
      const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
      const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
      const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
      const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
      const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
      const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
      // 00 10 01 11 02 12 03 13
      // 20 30 21 31 22 32 23 33
      // 04 14 05 15 06 16 07 17
      // 24 34 25 35 26 36 27 37
      // 40 50 41 51 42 52 43 53
      // 60 70 61 71 62 72 63 73
      // 44 54 45 55 46 56 47 57
      // 64 74 65 75 66 76 67 77
      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
      const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
      const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
      const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
      const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
      const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
      const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
      const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
      // 00 10 20 30 01 11 21 31
      // 40 50 60 70 41 51 61 71
      // 02 12 22 32 03 13 23 33
      // 42 52 62 72 43 53 63 73
      // 04 14 24 34 05 15 25 35
      // 44 54 64 74 45 55 65 75
      // 06 16 26 36 07 17 27 37
      // 46 56 66 76 47 57 67 77
      in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
      in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
      in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
      in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
      in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
      in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
      in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
      in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
      // 00 10 20 30 40 50 60 70
      // 01 11 21 31 41 51 61 71
      // 02 12 22 32 42 52 62 72
      // 03 13 23 33 43 53 63 73
      // 04 14 24 34 44 54 64 74
      // 05 15 25 35 45 55 65 75
      // 06 16 26 36 46 56 66 76
      // 07 17 27 37 47 57 67 77
    }
  }
  // Post-condition output and store it
  {
    // Post-condition (division by two)
    // division by two of 16-bit signed numbers using shifts:
    // n / 2 = (n - (n >> 15)) >> 1
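    // An arithmetic shift alone would round toward minus infinity;
    // subtracting the sign word (0 or -1, from the >> 15) first biases
    // negative values up by one, so the result rounds toward zero, exactly
    // matching C integer division by 2.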
    const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
    const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
    const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
    const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
    const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
    const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
    const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
    const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
    in0 = _mm_sub_epi16(in0, sign_in0);
    in1 = _mm_sub_epi16(in1, sign_in1);
    in2 = _mm_sub_epi16(in2, sign_in2);
    in3 = _mm_sub_epi16(in3, sign_in3);
    in4 = _mm_sub_epi16(in4, sign_in4);
    in5 = _mm_sub_epi16(in5, sign_in5);
    in6 = _mm_sub_epi16(in6, sign_in6);
    in7 = _mm_sub_epi16(in7, sign_in7);
    in0 = _mm_srai_epi16(in0, 1);
    in1 = _mm_srai_epi16(in1, 1);
    in2 = _mm_srai_epi16(in2, 1);
    in3 = _mm_srai_epi16(in3, 1);
    in4 = _mm_srai_epi16(in4, 1);
    in5 = _mm_srai_epi16(in5, 1);
    in6 = _mm_srai_epi16(in6, 1);
    in7 = _mm_srai_epi16(in7, 1);
    // store results
    store_output(&in0, (output + 0 * 8));
    store_output(&in1, (output + 1 * 8));
    store_output(&in2, (output + 2 * 8));
    store_output(&in3, (output + 3 * 8));
    store_output(&in4, (output + 4 * 8));
    store_output(&in5, (output + 5 * 8));
    store_output(&in6, (output + 6 * 8));
    store_output(&in7, (output + 7 * 8));
  }
}

void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
  // The 2D transform is done with two passes which are actually pretty
  // similar. In the first one, we transform the columns and transpose
  // the results. In the second one, we transform the rows. Because the
  // first pass's results are transposed, the second pass operates on the
  // transposed rows as if they were columns, and its results are
  // transposed again so everything ends up back in row positions.
  int pass;
  // We need an intermediate buffer between passes.
  DECLARE_ALIGNED(16, int16_t, intermediate[256]);
  const int16_t *in = input;
  int16_t *out0 = intermediate;
  tran_low_t *out1 = output;
  // Constants
  // In one case the constants are all the same; in all other cases a pair
  // of constants is needed, repeated four times, which is done by
  // constructing the 32-bit constant corresponding to that pair.
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i kOne = _mm_set1_epi16(1);
  // Do the two transform/transpose passes
  for (pass = 0; pass < 2; ++pass) {
    // We process eight columns (transposed rows in second pass) at a time.
    int column_start;
#if DCT_HIGH_BIT_DEPTH
    int overflow;
#endif
    for (column_start = 0; column_start < 16; column_start += 8) {
      __m128i in00, in01, in02, in03, in04, in05, in06, in07;
      __m128i in08, in09, in10, in11, in12, in13, in14, in15;
      __m128i input0, input1, input2, input3, input4, input5, input6, input7;
      __m128i step1_0, step1_1, step1_2, step1_3;
      __m128i step1_4, step1_5, step1_6, step1_7;
      __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
      __m128i step3_0, step3_1, step3_2, step3_3;
      __m128i step3_4, step3_5, step3_6, step3_7;
      __m128i res00, res01, res02, res03, res04, res05, res06, res07;
      __m128i res08, res09, res10, res11, res12, res13, res14, res15;
      // Load and pre-condition input.
      if (0 == pass) {
        in00 = _mm_load_si128((const __m128i *)(in + 0 * stride));
        in01 = _mm_load_si128((const __m128i *)(in + 1 * stride));
        in02 = _mm_load_si128((const __m128i *)(in + 2 * stride));
        in03 = _mm_load_si128((const __m128i *)(in + 3 * stride));
        in04 = _mm_load_si128((const __m128i *)(in + 4 * stride));
        in05 = _mm_load_si128((const __m128i *)(in + 5 * stride));
        in06 = _mm_load_si128((const __m128i *)(in + 6 * stride));
        in07 = _mm_load_si128((const __m128i *)(in + 7 * stride));
        in08 = _mm_load_si128((const __m128i *)(in + 8 * stride));
        in09 = _mm_load_si128((const __m128i *)(in + 9 * stride));
        in10 = _mm_load_si128((const __m128i *)(in + 10 * stride));
        in11 = _mm_load_si128((const __m128i *)(in + 11 * stride));
        in12 = _mm_load_si128((const __m128i *)(in + 12 * stride));
        in13 = _mm_load_si128((const __m128i *)(in + 13 * stride));
        in14 = _mm_load_si128((const __m128i *)(in + 14 * stride));
        in15 = _mm_load_si128((const __m128i *)(in + 15 * stride));
        // x = x << 2
        in00 = _mm_slli_epi16(in00, 2);
        in01 = _mm_slli_epi16(in01, 2);
        in02 = _mm_slli_epi16(in02, 2);
        in03 = _mm_slli_epi16(in03, 2);
        in04 = _mm_slli_epi16(in04, 2);
        in05 = _mm_slli_epi16(in05, 2);
        in06 = _mm_slli_epi16(in06, 2);
        in07 = _mm_slli_epi16(in07, 2);
        in08 = _mm_slli_epi16(in08, 2);
        in09 = _mm_slli_epi16(in09, 2);
        in10 = _mm_slli_epi16(in10, 2);
        in11 = _mm_slli_epi16(in11, 2);
        in12 = _mm_slli_epi16(in12, 2);
        in13 = _mm_slli_epi16(in13, 2);
        in14 = _mm_slli_epi16(in14, 2);
        in15 = _mm_slli_epi16(in15, 2);
      } else {
        in00 = _mm_load_si128((const __m128i *)(in + 0 * 16));
        in01 = _mm_load_si128((const __m128i *)(in + 1 * 16));
        in02 = _mm_load_si128((const __m128i *)(in + 2 * 16));
        in03 = _mm_load_si128((const __m128i *)(in + 3 * 16));
        in04 = _mm_load_si128((const __m128i *)(in + 4 * 16));
        in05 = _mm_load_si128((const __m128i *)(in + 5 * 16));
        in06 = _mm_load_si128((const __m128i *)(in + 6 * 16));
        in07 = _mm_load_si128((const __m128i *)(in + 7 * 16));
        in08 = _mm_load_si128((const __m128i *)(in + 8 * 16));
        in09 = _mm_load_si128((const __m128i *)(in + 9 * 16));
        in10 = _mm_load_si128((const __m128i *)(in + 10 * 16));
        in11 = _mm_load_si128((const __m128i *)(in + 11 * 16));
        in12 = _mm_load_si128((const __m128i *)(in + 12 * 16));
        in13 = _mm_load_si128((const __m128i *)(in + 13 * 16));
        in14 = _mm_load_si128((const __m128i *)(in + 14 * 16));
        in15 = _mm_load_si128((const __m128i *)(in + 15 * 16));
        // x = (x + 1) >> 2
        in00 = _mm_add_epi16(in00, kOne);
        in01 = _mm_add_epi16(in01, kOne);
        in02 = _mm_add_epi16(in02, kOne);
        in03 = _mm_add_epi16(in03, kOne);
        in04 = _mm_add_epi16(in04, kOne);
        in05 = _mm_add_epi16(in05, kOne);
        in06 = _mm_add_epi16(in06, kOne);
        in07 = _mm_add_epi16(in07, kOne);
        in08 = _mm_add_epi16(in08, kOne);
        in09 = _mm_add_epi16(in09, kOne);
        in10 = _mm_add_epi16(in10, kOne);
        in11 = _mm_add_epi16(in11, kOne);
        in12 = _mm_add_epi16(in12, kOne);
        in13 = _mm_add_epi16(in13, kOne);
        in14 = _mm_add_epi16(in14, kOne);
        in15 = _mm_add_epi16(in15, kOne);
        in00 = _mm_srai_epi16(in00, 2);
        in01 = _mm_srai_epi16(in01, 2);
        in02 = _mm_srai_epi16(in02, 2);
        in03 = _mm_srai_epi16(in03, 2);
        in04 = _mm_srai_epi16(in04, 2);
        in05 = _mm_srai_epi16(in05, 2);
        in06 = _mm_srai_epi16(in06, 2);
        in07 = _mm_srai_epi16(in07, 2);
        in08 = _mm_srai_epi16(in08, 2);
        in09 = _mm_srai_epi16(in09, 2);
        in10 = _mm_srai_epi16(in10, 2);
        in11 = _mm_srai_epi16(in11, 2);
        in12 = _mm_srai_epi16(in12, 2);
        in13 = _mm_srai_epi16(in13, 2);
        in14 = _mm_srai_epi16(in14, 2);
        in15 = _mm_srai_epi16(in15, 2);
      }
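      // Note the two pre-conditions above: pass 0 scales the raw input up
      // by << 2 (as in the other transform sizes), while pass 1 rounds the
      // intermediate results with (x + 1) >> 2 to keep the row transform
      // within 16-bit range.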
      in += 8;
      // Calculate input for the first 8 results.
      {
        input0 = ADD_EPI16(in00, in15);
        input1 = ADD_EPI16(in01, in14);
        input2 = ADD_EPI16(in02, in13);
        input3 = ADD_EPI16(in03, in12);
        input4 = ADD_EPI16(in04, in11);
        input5 = ADD_EPI16(in05, in10);
        input6 = ADD_EPI16(in06, in09);
        input7 = ADD_EPI16(in07, in08);
#if DCT_HIGH_BIT_DEPTH
        overflow = check_epi16_overflow_x8(&input0, &input1, &input2, &input3,
                                           &input4, &input5, &input6, &input7);
        if (overflow) {
          vp9_highbd_fdct16x16_c(input, output, stride);
          return;
        }
#endif  // DCT_HIGH_BIT_DEPTH
      }
      // Calculate input for the next 8 results.
      {
        step1_0 = SUB_EPI16(in07, in08);
        step1_1 = SUB_EPI16(in06, in09);
        step1_2 = SUB_EPI16(in05, in10);
        step1_3 = SUB_EPI16(in04, in11);
        step1_4 = SUB_EPI16(in03, in12);
        step1_5 = SUB_EPI16(in02, in13);
        step1_6 = SUB_EPI16(in01, in14);
        step1_7 = SUB_EPI16(in00, in15);
#if DCT_HIGH_BIT_DEPTH
        overflow = check_epi16_overflow_x8(&step1_0, &step1_1,
                                           &step1_2, &step1_3,
                                           &step1_4, &step1_5,
                                           &step1_6, &step1_7);
        if (overflow) {
          vp9_highbd_fdct16x16_c(input, output, stride);
          return;
        }
#endif  // DCT_HIGH_BIT_DEPTH
      }
      // Work on the first eight values; fdct8(input, even_results);
      {
        // Add/subtract
        const __m128i q0 = ADD_EPI16(input0, input7);
        const __m128i q1 = ADD_EPI16(input1, input6);
        const __m128i q2 = ADD_EPI16(input2, input5);
        const __m128i q3 = ADD_EPI16(input3, input4);
        const __m128i q4 = SUB_EPI16(input3, input4);
        const __m128i q5 = SUB_EPI16(input2, input5);
        const __m128i q6 = SUB_EPI16(input1, input6);
        const __m128i q7 = SUB_EPI16(input0, input7);
#if DCT_HIGH_BIT_DEPTH
        overflow = check_epi16_overflow_x8(&q0, &q1, &q2, &q3,
                                           &q4, &q5, &q6, &q7);
        if (overflow) {
          vp9_highbd_fdct16x16_c(input, output, stride);
          return;
        }
#endif  // DCT_HIGH_BIT_DEPTH
        // Work on first four results
        {
          // Add/subtract
          const __m128i r0 = ADD_EPI16(q0, q3);
          const __m128i r1 = ADD_EPI16(q1, q2);
          const __m128i r2 = SUB_EPI16(q1, q2);
          const __m128i r3 = SUB_EPI16(q0, q3);
#if DCT_HIGH_BIT_DEPTH
          overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
          if (overflow) {
            vp9_highbd_fdct16x16_c(input, output, stride);
            return;
          }
#endif  // DCT_HIGH_BIT_DEPTH
          // Interleave to do the multiply by constants which gets us
          // into 32 bits.
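          // mult_round_shift() (see vp9_dct_sse2.h) bundles the sequence
          // written out long-hand in FDCT8x8 above: two _mm_madd_epi16
          // against the given constant pair, the rounding add, the
          // arithmetic shift by DCT_CONST_BITS, and a pack back down to
          // eight 16-bit results.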
          {
            const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
            const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
            const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
            const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
            res00 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
            res08 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
            res04 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
            res12 = mult_round_shift(&t2, &t3, &k__cospi_m08_p24,
                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
#if DCT_HIGH_BIT_DEPTH
            overflow = check_epi16_overflow_x4(&res00, &res08, &res04, &res12);
            if (overflow) {
              vp9_highbd_fdct16x16_c(input, output, stride);
              return;
            }
#endif  // DCT_HIGH_BIT_DEPTH
          }
        }
        // Work on next four results
        {
          // Interleave to do the multiply by constants which gets us
          // into 32 bits.
          const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
          const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
          const __m128i r0 = mult_round_shift(&d0, &d1, &k__cospi_p16_m16,
                                              &k__DCT_CONST_ROUNDING,
                                              DCT_CONST_BITS);
          const __m128i r1 = mult_round_shift(&d0, &d1, &k__cospi_p16_p16,
                                              &k__DCT_CONST_ROUNDING,
                                              DCT_CONST_BITS);
#if DCT_HIGH_BIT_DEPTH
          overflow = check_epi16_overflow_x2(&r0, &r1);
          if (overflow) {
            vp9_highbd_fdct16x16_c(input, output, stride);
            return;
          }
#endif  // DCT_HIGH_BIT_DEPTH
          {
            // Add/subtract
            const __m128i x0 = ADD_EPI16(q4, r0);
            const __m128i x1 = SUB_EPI16(q4, r0);
            const __m128i x2 = SUB_EPI16(q7, r1);
            const __m128i x3 = ADD_EPI16(q7, r1);
#if DCT_HIGH_BIT_DEPTH
            overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
            if (overflow) {
              vp9_highbd_fdct16x16_c(input, output, stride);
              return;
            }
#endif  // DCT_HIGH_BIT_DEPTH
            // Interleave to do the multiply by constants which gets us
            // into 32 bits.
            {
              const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
              const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
              const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
              const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
              res02 = mult_round_shift(&t0, &t1, &k__cospi_p28_p04,
                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
              res14 = mult_round_shift(&t0, &t1, &k__cospi_m04_p28,
                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
              res10 = mult_round_shift(&t2, &t3, &k__cospi_p12_p20,
                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
              res06 = mult_round_shift(&t2, &t3, &k__cospi_m20_p12,
                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
#if DCT_HIGH_BIT_DEPTH
              overflow = check_epi16_overflow_x4(&res02, &res14,
                                                 &res10, &res06);
              if (overflow) {
                vp9_highbd_fdct16x16_c(input, output, stride);
                return;
              }
#endif  // DCT_HIGH_BIT_DEPTH
            }
          }
        }
      }
      // Work on the next eight values; step1 -> odd_results
      {
        // step 2
        {
          const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
          const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
          const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
          const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
          step2_2 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
          step2_3 = mult_round_shift(&t2, &t3, &k__cospi_p16_m16,
                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
          step2_5 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
          step2_4 = mult_round_shift(&t2, &t3, &k__cospi_p16_p16,
                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
#if DCT_HIGH_BIT_DEPTH
          overflow = check_epi16_overflow_x4(&step2_2, &step2_3, &step2_5,
                                             &step2_4);
          if (overflow) {
            vp9_highbd_fdct16x16_c(input, output, stride);
            return;
          }
#endif  // DCT_HIGH_BIT_DEPTH
        }
        // step 3
        {
          step3_0 = ADD_EPI16(step1_0, step2_3);
          step3_1 = ADD_EPI16(step1_1, step2_2);
          step3_2 = SUB_EPI16(step1_1, step2_2);
          step3_3 = SUB_EPI16(step1_0, step2_3);
          step3_4 = SUB_EPI16(step1_7, step2_4);
          step3_5 = SUB_EPI16(step1_6, step2_5);
          step3_6 = ADD_EPI16(step1_6, step2_5);
          step3_7 = ADD_EPI16(step1_7, step2_4);
#if DCT_HIGH_BIT_DEPTH
          overflow = check_epi16_overflow_x8(&step3_0, &step3_1,
                                             &step3_2, &step3_3,
                                             &step3_4, &step3_5,
                                             &step3_6, &step3_7);
          if (overflow) {
            vp9_highbd_fdct16x16_c(input, output, stride);
            return;
          }
#endif  // DCT_HIGH_BIT_DEPTH
        }
        // step 4
        {
          const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
          const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
          const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
          const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
          step2_1 = mult_round_shift(&t0, &t1, &k__cospi_m08_p24,
                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
          step2_2 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
          step2_6 = mult_round_shift(&t0, &t1, &k__cospi_p24_p08,
                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
          step2_5 = mult_round_shift(&t2, &t3, &k__cospi_p08_m24,
                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
#if DCT_HIGH_BIT_DEPTH
          overflow = check_epi16_overflow_x4(&step2_1, &step2_2, &step2_6,
                                             &step2_5);
          if (overflow) {
            vp9_highbd_fdct16x16_c(input, output, stride);
            return;
          }
#endif  // DCT_HIGH_BIT_DEPTH
        }
        // step 5
        {
          step1_0 = ADD_EPI16(step3_0, step2_1);
          step1_1 = SUB_EPI16(step3_0, step2_1);
          step1_2 = ADD_EPI16(step3_3, step2_2);
          step1_3 = SUB_EPI16(step3_3, step2_2);
          step1_4 = SUB_EPI16(step3_4, step2_5);
          step1_5 = ADD_EPI16(step3_4, step2_5);
          step1_6 = SUB_EPI16(step3_7, step2_6);
          step1_7 = ADD_EPI16(step3_7, step2_6);
#if DCT_HIGH_BIT_DEPTH
          overflow = check_epi16_overflow_x8(&step1_0, &step1_1,
                                             &step1_2, &step1_3,
                                             &step1_4, &step1_5,
                                             &step1_6, &step1_7);
          if (overflow) {
            vp9_highbd_fdct16x16_c(input, output, stride);
            return;
          }
#endif  // DCT_HIGH_BIT_DEPTH
        }
        // step 6
        {
          const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
          const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
          const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
          const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
          res01 = mult_round_shift(&t0, &t1, &k__cospi_p30_p02,
                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
          res09 = mult_round_shift(&t2, &t3, &k__cospi_p14_p18,
                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
          res15 = mult_round_shift(&t0, &t1, &k__cospi_m02_p30,
                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
          res07 = mult_round_shift(&t2, &t3, &k__cospi_m18_p14,
                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
#if DCT_HIGH_BIT_DEPTH
          overflow = check_epi16_overflow_x4(&res01, &res09, &res15, &res07);
          if (overflow) {
            vp9_highbd_fdct16x16_c(input, output, stride);
            return;
          }
#endif  // DCT_HIGH_BIT_DEPTH
        }
        {
          const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
          const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
          const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
          const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
          res05 = mult_round_shift(&t0, &t1, &k__cospi_p22_p10,
                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
          res13 = mult_round_shift(&t2, &t3, &k__cospi_p06_p26,
                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
          res11 = mult_round_shift(&t0, &t1, &k__cospi_m10_p22,
                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
          res03 = mult_round_shift(&t2, &t3, &k__cospi_m26_p06,
                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
#if DCT_HIGH_BIT_DEPTH
          overflow = check_epi16_overflow_x4(&res05, &res13, &res11, &res03);
          if (overflow) {
            vp9_highbd_fdct16x16_c(input, output, stride);
            return;
          }
#endif  // DCT_HIGH_BIT_DEPTH
        }
      }
      // Transpose the results, do it as two 8x8 transposes.
      transpose_and_output8x8(&res00, &res01, &res02, &res03,
                              &res04, &res05, &res06, &res07,
                              pass, out0, out1);
      transpose_and_output8x8(&res08, &res09, &res10, &res11,
                              &res12, &res13, &res14, &res15,
                              pass, out0 + 8, out1 + 8);
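      // Each call transposes one 8x8 quadrant of the strip; pass 0 writes
      // int16_t rows into intermediate[] via out0, pass 1 writes tran_low_t
      // results via out1, which is why both pointers are threaded through.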
      if (pass == 0) {
        out0 += 8 * 16;
      } else {
        out1 += 8 * 16;
      }
    }
    // Setup in/out for next pass.
    in = intermediate;
  }
}

#undef ADD_EPI16
#undef SUB_EPI16