OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include "vp9/common/x86/vp9_idct_intrin_sse2.h" | 11 #include "vp9/common/x86/vp9_idct_intrin_sse2.h" |
12 #include "vp9/common/vp9_idct.h" | 12 #include "vp9/common/vp9_idct.h" |
13 | 13 |
14 #define RECON_AND_STORE4X4(dest, in_x) \ | 14 #define RECON_AND_STORE4X4(dest, in_x) \ |
15 { \ | 15 { \ |
16 __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ | 16 __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ |
17 d0 = _mm_unpacklo_epi8(d0, zero); \ | 17 d0 = _mm_unpacklo_epi8(d0, zero); \ |
18 d0 = _mm_add_epi16(in_x, d0); \ | 18 d0 = _mm_add_epi16(in_x, d0); \ |
19 d0 = _mm_packus_epi16(d0, d0); \ | 19 d0 = _mm_packus_epi16(d0, d0); \ |
20 *(int *)dest = _mm_cvtsi128_si32(d0); \ | 20 *(int *)(dest) = _mm_cvtsi128_si32(d0); \ |
21 dest += stride; \ | |
22 } | 21 } |
23 | 22 |
24 void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { | 23 void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
25 const __m128i zero = _mm_setzero_si128(); | 24 const __m128i zero = _mm_setzero_si128(); |
26 const __m128i eight = _mm_set1_epi16(8); | 25 const __m128i eight = _mm_set1_epi16(8); |
27 const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64, | 26 const __m128i cst = _mm_setr_epi16( |
28 (int16_t)cospi_16_64, (int16_t)-cospi_16_64, | 27 (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64, |
29 (int16_t)cospi_24_64, (int16_t)-cospi_8_64, | 28 (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64, |
30 (int16_t)cospi_8_64, (int16_t)cospi_24_64); | 29 (int16_t)cospi_8_64, (int16_t)cospi_24_64); |
31 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 30 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
32 __m128i input0, input1, input2, input3; | 31 __m128i input0, input1, input2, input3; |
33 | 32 |
34 // Rows | 33 // Rows |
35 input0 = _mm_load_si128((const __m128i *)input); | 34 input0 = _mm_load_si128((const __m128i *)input); |
36 input2 = _mm_load_si128((const __m128i *)(input + 8)); | 35 input2 = _mm_load_si128((const __m128i *)(input + 8)); |
37 | 36 |
38 // Construct i3, i1, i3, i1, i2, i0, i2, i0 | 37 // Construct i3, i1, i3, i1, i2, i0, i2, i0 |
39 input0 = _mm_shufflelo_epi16(input0, 0xd8); | 38 input0 = _mm_shufflelo_epi16(input0, 0xd8); |
40 input0 = _mm_shufflehi_epi16(input0, 0xd8); | 39 input0 = _mm_shufflehi_epi16(input0, 0xd8); |
(...skipping 78 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
119 | 118 |
120 // Final round and shift | 119 // Final round and shift |
121 input2 = _mm_add_epi16(input2, eight); | 120 input2 = _mm_add_epi16(input2, eight); |
122 input3 = _mm_add_epi16(input3, eight); | 121 input3 = _mm_add_epi16(input3, eight); |
123 | 122 |
124 input2 = _mm_srai_epi16(input2, 4); | 123 input2 = _mm_srai_epi16(input2, 4); |
125 input3 = _mm_srai_epi16(input3, 4); | 124 input3 = _mm_srai_epi16(input3, 4); |
126 | 125 |
127 // Reconstruction and Store | 126 // Reconstruction and Store |
128 { | 127 { |
129 __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); | 128 __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); |
130 __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); | 129 __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); |
131 d0 = _mm_unpacklo_epi32(d0, | 130 d0 = _mm_unpacklo_epi32(d0, |
132 _mm_cvtsi32_si128(*(const int *) (dest + stride))); | 131 _mm_cvtsi32_si128(*(const int *)(dest + stride))); |
133 d2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128( | 132 d2 = _mm_unpacklo_epi32( |
134 *(const int *) (dest + stride * 3)), d2); | 133 _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2); |
135 d0 = _mm_unpacklo_epi8(d0, zero); | 134 d0 = _mm_unpacklo_epi8(d0, zero); |
136 d2 = _mm_unpacklo_epi8(d2, zero); | 135 d2 = _mm_unpacklo_epi8(d2, zero); |
137 d0 = _mm_add_epi16(d0, input2); | 136 d0 = _mm_add_epi16(d0, input2); |
138 d2 = _mm_add_epi16(d2, input3); | 137 d2 = _mm_add_epi16(d2, input3); |
139 d0 = _mm_packus_epi16(d0, d2); | 138 d0 = _mm_packus_epi16(d0, d2); |
140 // store input0 | 139 // store input0 |
141 *(int *)dest = _mm_cvtsi128_si32(d0); | 140 *(int *)dest = _mm_cvtsi128_si32(d0); |
142 // store input1 | 141 // store input1 |
143 d0 = _mm_srli_si128(d0, 4); | 142 d0 = _mm_srli_si128(d0, 4); |
144 *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); | 143 *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); |
145 // store input2 | 144 // store input2 |
146 d0 = _mm_srli_si128(d0, 4); | 145 d0 = _mm_srli_si128(d0, 4); |
147 *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); | 146 *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); |
148 // store input3 | 147 // store input3 |
149 d0 = _mm_srli_si128(d0, 4); | 148 d0 = _mm_srli_si128(d0, 4); |
150 *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); | 149 *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); |
151 } | 150 } |
152 } | 151 } |
153 | 152 |
154 void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { | 153 void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
155 __m128i dc_value; | 154 __m128i dc_value; |
156 const __m128i zero = _mm_setzero_si128(); | 155 const __m128i zero = _mm_setzero_si128(); |
157 int a; | 156 int a; |
158 | 157 |
159 a = dct_const_round_shift(input[0] * cospi_16_64); | 158 a = dct_const_round_shift(input[0] * cospi_16_64); |
160 a = dct_const_round_shift(a * cospi_16_64); | 159 a = dct_const_round_shift(a * cospi_16_64); |
161 a = ROUND_POWER_OF_TWO(a, 4); | 160 a = ROUND_POWER_OF_TWO(a, 4); |
162 | 161 |
163 dc_value = _mm_set1_epi16(a); | 162 dc_value = _mm_set1_epi16(a); |
164 | 163 |
165 RECON_AND_STORE4X4(dest, dc_value); | 164 RECON_AND_STORE4X4(dest + 0 * stride, dc_value); |
166 RECON_AND_STORE4X4(dest, dc_value); | 165 RECON_AND_STORE4X4(dest + 1 * stride, dc_value); |
167 RECON_AND_STORE4X4(dest, dc_value); | 166 RECON_AND_STORE4X4(dest + 2 * stride, dc_value); |
168 RECON_AND_STORE4X4(dest, dc_value); | 167 RECON_AND_STORE4X4(dest + 3 * stride, dc_value); |
169 } | 168 } |
170 | 169 |
171 static INLINE void transpose_4x4(__m128i *res) { | 170 static INLINE void transpose_4x4(__m128i *res) { |
172 const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); | 171 const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); |
173 const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); | 172 const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); |
174 | 173 |
175 res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1); | 174 res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1); |
176 res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1); | 175 res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1); |
177 } | 176 } |
178 | 177 |
(...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
260 in[0] = _mm_packs_epi32(u[0], u[1]); | 259 in[0] = _mm_packs_epi32(u[0], u[1]); |
261 in[1] = _mm_packs_epi32(u[2], u[3]); | 260 in[1] = _mm_packs_epi32(u[2], u[3]); |
262 } | 261 } |
263 | 262 |
264 void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, | 263 void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, |
265 int tx_type) { | 264 int tx_type) { |
266 __m128i in[2]; | 265 __m128i in[2]; |
267 const __m128i zero = _mm_setzero_si128(); | 266 const __m128i zero = _mm_setzero_si128(); |
268 const __m128i eight = _mm_set1_epi16(8); | 267 const __m128i eight = _mm_set1_epi16(8); |
269 | 268 |
270 in[0]= _mm_loadu_si128((const __m128i *)(input)); | 269 in[0] = _mm_loadu_si128((const __m128i *)(input)); |
271 in[1]= _mm_loadu_si128((const __m128i *)(input + 8)); | 270 in[1] = _mm_loadu_si128((const __m128i *)(input + 8)); |
272 | 271 |
273 switch (tx_type) { | 272 switch (tx_type) { |
274 case 0: // DCT_DCT | 273 case 0: // DCT_DCT |
275 idct4_sse2(in); | 274 idct4_sse2(in); |
276 idct4_sse2(in); | 275 idct4_sse2(in); |
277 break; | 276 break; |
278 case 1: // ADST_DCT | 277 case 1: // ADST_DCT |
279 idct4_sse2(in); | 278 idct4_sse2(in); |
280 iadst4_sse2(in); | 279 iadst4_sse2(in); |
281 break; | 280 break; |
(...skipping 12 matching lines...) Expand all Loading... |
294 | 293 |
295 // Final round and shift | 294 // Final round and shift |
296 in[0] = _mm_add_epi16(in[0], eight); | 295 in[0] = _mm_add_epi16(in[0], eight); |
297 in[1] = _mm_add_epi16(in[1], eight); | 296 in[1] = _mm_add_epi16(in[1], eight); |
298 | 297 |
299 in[0] = _mm_srai_epi16(in[0], 4); | 298 in[0] = _mm_srai_epi16(in[0], 4); |
300 in[1] = _mm_srai_epi16(in[1], 4); | 299 in[1] = _mm_srai_epi16(in[1], 4); |
301 | 300 |
302 // Reconstruction and Store | 301 // Reconstruction and Store |
303 { | 302 { |
304 __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); | 303 __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); |
305 __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); | 304 __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); |
306 d0 = _mm_unpacklo_epi32(d0, | 305 d0 = _mm_unpacklo_epi32(d0, |
307 _mm_cvtsi32_si128(*(const int *) (dest + stride))); | 306 _mm_cvtsi32_si128(*(const int *)(dest + stride))); |
308 d2 = _mm_unpacklo_epi32(d2, _mm_cvtsi32_si128( | 307 d2 = _mm_unpacklo_epi32( |
309 *(const int *) (dest + stride * 3))); | 308 d2, _mm_cvtsi32_si128(*(const int *)(dest + stride * 3))); |
310 d0 = _mm_unpacklo_epi8(d0, zero); | 309 d0 = _mm_unpacklo_epi8(d0, zero); |
311 d2 = _mm_unpacklo_epi8(d2, zero); | 310 d2 = _mm_unpacklo_epi8(d2, zero); |
312 d0 = _mm_add_epi16(d0, in[0]); | 311 d0 = _mm_add_epi16(d0, in[0]); |
313 d2 = _mm_add_epi16(d2, in[1]); | 312 d2 = _mm_add_epi16(d2, in[1]); |
314 d0 = _mm_packus_epi16(d0, d2); | 313 d0 = _mm_packus_epi16(d0, d2); |
315 // store result[0] | 314 // store result[0] |
316 *(int *)dest = _mm_cvtsi128_si32(d0); | 315 *(int *)dest = _mm_cvtsi128_si32(d0); |
317 // store result[1] | 316 // store result[1] |
318 d0 = _mm_srli_si128(d0, 4); | 317 d0 = _mm_srli_si128(d0, 4); |
319 *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); | 318 *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); |
320 // store result[2] | 319 // store result[2] |
321 d0 = _mm_srli_si128(d0, 4); | 320 d0 = _mm_srli_si128(d0, 4); |
322 *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); | 321 *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); |
323 // store result[3] | 322 // store result[3] |
324 d0 = _mm_srli_si128(d0, 4); | 323 d0 = _mm_srli_si128(d0, 4); |
325 *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); | 324 *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); |
326 } | 325 } |
327 } | 326 } |
328 | 327 |
329 #define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \ | 328 #define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \ |
330 out0, out1, out2, out3, out4, out5, out6, out7) \ | 329 out0, out1, out2, out3, out4, out5, out6, out7) \ |
331 { \ | 330 { \ |
332 const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ | 331 const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ |
333 const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ | 332 const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ |
334 const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ | 333 const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ |
335 const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ | 334 const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ |
(...skipping 174 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
510 out3 = _mm_adds_epi16(stp1_3, stp2_4); \ | 509 out3 = _mm_adds_epi16(stp1_3, stp2_4); \ |
511 out4 = _mm_subs_epi16(stp1_3, stp2_4); \ | 510 out4 = _mm_subs_epi16(stp1_3, stp2_4); \ |
512 out5 = _mm_subs_epi16(stp1_2, stp1_5); \ | 511 out5 = _mm_subs_epi16(stp1_2, stp1_5); \ |
513 out6 = _mm_subs_epi16(stp1_1, stp1_6); \ | 512 out6 = _mm_subs_epi16(stp1_1, stp1_6); \ |
514 out7 = _mm_subs_epi16(stp1_0, stp2_7); \ | 513 out7 = _mm_subs_epi16(stp1_0, stp2_7); \ |
515 } | 514 } |
516 | 515 |
517 void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) { | 516 void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
518 const __m128i zero = _mm_setzero_si128(); | 517 const __m128i zero = _mm_setzero_si128(); |
519 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 518 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
520 const __m128i final_rounding = _mm_set1_epi16(1<<4); | 519 const __m128i final_rounding = _mm_set1_epi16(1 << 4); |
521 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); | 520 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); |
522 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); | 521 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); |
523 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); | 522 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); |
524 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); | 523 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); |
525 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); | 524 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); |
526 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 525 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
527 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); | 526 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
528 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); | 527 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); |
529 | 528 |
530 __m128i in0, in1, in2, in3, in4, in5, in6, in7; | 529 __m128i in0, in1, in2, in3, in4, in5, in6, in7; |
(...skipping 13 matching lines...) Expand all Loading... |
544 in7 = _mm_load_si128((const __m128i *)(input + 8 * 7)); | 543 in7 = _mm_load_si128((const __m128i *)(input + 8 * 7)); |
545 | 544 |
546 // 2-D | 545 // 2-D |
547 for (i = 0; i < 2; i++) { | 546 for (i = 0; i < 2; i++) { |
548 // 8x8 Transpose is copied from vp9_fdct8x8_sse2() | 547 // 8x8 Transpose is copied from vp9_fdct8x8_sse2() |
549 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, | 548 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, |
550 in0, in1, in2, in3, in4, in5, in6, in7); | 549 in0, in1, in2, in3, in4, in5, in6, in7); |
551 | 550 |
552 // 4-stage 1D idct8x8 | 551 // 4-stage 1D idct8x8 |
553 IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, | 552 IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, |
554 in0, in1, in2, in3, in4, in5, in6, in7); | 553 in0, in1, in2, in3, in4, in5, in6, in7); |
555 } | 554 } |
556 | 555 |
557 // Final rounding and shift | 556 // Final rounding and shift |
558 in0 = _mm_adds_epi16(in0, final_rounding); | 557 in0 = _mm_adds_epi16(in0, final_rounding); |
559 in1 = _mm_adds_epi16(in1, final_rounding); | 558 in1 = _mm_adds_epi16(in1, final_rounding); |
560 in2 = _mm_adds_epi16(in2, final_rounding); | 559 in2 = _mm_adds_epi16(in2, final_rounding); |
561 in3 = _mm_adds_epi16(in3, final_rounding); | 560 in3 = _mm_adds_epi16(in3, final_rounding); |
562 in4 = _mm_adds_epi16(in4, final_rounding); | 561 in4 = _mm_adds_epi16(in4, final_rounding); |
563 in5 = _mm_adds_epi16(in5, final_rounding); | 562 in5 = _mm_adds_epi16(in5, final_rounding); |
564 in6 = _mm_adds_epi16(in6, final_rounding); | 563 in6 = _mm_adds_epi16(in6, final_rounding); |
565 in7 = _mm_adds_epi16(in7, final_rounding); | 564 in7 = _mm_adds_epi16(in7, final_rounding); |
566 | 565 |
567 in0 = _mm_srai_epi16(in0, 5); | 566 in0 = _mm_srai_epi16(in0, 5); |
568 in1 = _mm_srai_epi16(in1, 5); | 567 in1 = _mm_srai_epi16(in1, 5); |
569 in2 = _mm_srai_epi16(in2, 5); | 568 in2 = _mm_srai_epi16(in2, 5); |
570 in3 = _mm_srai_epi16(in3, 5); | 569 in3 = _mm_srai_epi16(in3, 5); |
571 in4 = _mm_srai_epi16(in4, 5); | 570 in4 = _mm_srai_epi16(in4, 5); |
572 in5 = _mm_srai_epi16(in5, 5); | 571 in5 = _mm_srai_epi16(in5, 5); |
573 in6 = _mm_srai_epi16(in6, 5); | 572 in6 = _mm_srai_epi16(in6, 5); |
574 in7 = _mm_srai_epi16(in7, 5); | 573 in7 = _mm_srai_epi16(in7, 5); |
575 | 574 |
576 RECON_AND_STORE(dest, in0); | 575 RECON_AND_STORE(dest + 0 * stride, in0); |
577 RECON_AND_STORE(dest, in1); | 576 RECON_AND_STORE(dest + 1 * stride, in1); |
578 RECON_AND_STORE(dest, in2); | 577 RECON_AND_STORE(dest + 2 * stride, in2); |
579 RECON_AND_STORE(dest, in3); | 578 RECON_AND_STORE(dest + 3 * stride, in3); |
580 RECON_AND_STORE(dest, in4); | 579 RECON_AND_STORE(dest + 4 * stride, in4); |
581 RECON_AND_STORE(dest, in5); | 580 RECON_AND_STORE(dest + 5 * stride, in5); |
582 RECON_AND_STORE(dest, in6); | 581 RECON_AND_STORE(dest + 6 * stride, in6); |
583 RECON_AND_STORE(dest, in7); | 582 RECON_AND_STORE(dest + 7 * stride, in7); |
584 } | 583 } |
585 | 584 |
586 void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { | 585 void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
587 __m128i dc_value; | 586 __m128i dc_value; |
588 const __m128i zero = _mm_setzero_si128(); | 587 const __m128i zero = _mm_setzero_si128(); |
589 int a; | 588 int a; |
590 | 589 |
591 a = dct_const_round_shift(input[0] * cospi_16_64); | 590 a = dct_const_round_shift(input[0] * cospi_16_64); |
592 a = dct_const_round_shift(a * cospi_16_64); | 591 a = dct_const_round_shift(a * cospi_16_64); |
593 a = ROUND_POWER_OF_TWO(a, 5); | 592 a = ROUND_POWER_OF_TWO(a, 5); |
594 | 593 |
595 dc_value = _mm_set1_epi16(a); | 594 dc_value = _mm_set1_epi16(a); |
596 | 595 |
597 RECON_AND_STORE(dest, dc_value); | 596 RECON_AND_STORE(dest + 0 * stride, dc_value); |
598 RECON_AND_STORE(dest, dc_value); | 597 RECON_AND_STORE(dest + 1 * stride, dc_value); |
599 RECON_AND_STORE(dest, dc_value); | 598 RECON_AND_STORE(dest + 2 * stride, dc_value); |
600 RECON_AND_STORE(dest, dc_value); | 599 RECON_AND_STORE(dest + 3 * stride, dc_value); |
601 RECON_AND_STORE(dest, dc_value); | 600 RECON_AND_STORE(dest + 4 * stride, dc_value); |
602 RECON_AND_STORE(dest, dc_value); | 601 RECON_AND_STORE(dest + 5 * stride, dc_value); |
603 RECON_AND_STORE(dest, dc_value); | 602 RECON_AND_STORE(dest + 6 * stride, dc_value); |
604 RECON_AND_STORE(dest, dc_value); | 603 RECON_AND_STORE(dest + 7 * stride, dc_value); |
605 } | 604 } |
606 | 605 |
607 static void idct8_sse2(__m128i *in) { | 606 static void idct8_sse2(__m128i *in) { |
608 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 607 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
609 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); | 608 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); |
610 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); | 609 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); |
611 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); | 610 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); |
612 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); | 611 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); |
613 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); | 612 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); |
614 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 613 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
615 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); | 614 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
616 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); | 615 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); |
617 | 616 |
618 __m128i in0, in1, in2, in3, in4, in5, in6, in7; | 617 __m128i in0, in1, in2, in3, in4, in5, in6, in7; |
619 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; | 618 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; |
620 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; | 619 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; |
621 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; | 620 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
622 | 621 |
623 // 8x8 Transpose is copied from vp9_fdct8x8_sse2() | 622 // 8x8 Transpose is copied from vp9_fdct8x8_sse2() |
624 TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], | 623 TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], |
625 in0, in1, in2, in3, in4, in5, in6, in7); | 624 in0, in1, in2, in3, in4, in5, in6, in7); |
626 | 625 |
627 // 4-stage 1D idct8x8 | 626 // 4-stage 1D idct8x8 |
628 IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, | 627 IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, |
629 in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]); | 628 in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]); |
630 } | 629 } |
631 | 630 |
632 static void iadst8_sse2(__m128i *in) { | 631 static void iadst8_sse2(__m128i *in) { |
633 const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); | 632 const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); |
634 const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); | 633 const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); |
635 const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); | 634 const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); |
636 const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); | 635 const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); |
637 const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); | 636 const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); |
638 const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); | 637 const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); |
639 const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); | 638 const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); |
640 const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); | 639 const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); |
641 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); | 640 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); |
642 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); | 641 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
643 const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); | 642 const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); |
644 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 643 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
645 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); | 644 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); |
646 const __m128i k__const_0 = _mm_set1_epi16(0); | 645 const __m128i k__const_0 = _mm_set1_epi16(0); |
647 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); | 646 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); |
648 | 647 |
649 __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; | 648 __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; |
650 __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; | 649 __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; |
651 __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; | 650 __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; |
652 __m128i s0, s1, s2, s3, s4, s5, s6, s7; | 651 __m128i s0, s1, s2, s3, s4, s5, s6, s7; |
653 __m128i in0, in1, in2, in3, in4, in5, in6, in7; | 652 __m128i in0, in1, in2, in3, in4, in5, in6, in7; |
654 | 653 |
655 // transpose | 654 // transpose |
656 array_transpose_8x8(in, in); | 655 array_transpose_8x8(in, in); |
657 | 656 |
658 // properly aligned for butterfly input | 657 // properly aligned for butterfly input |
659 in0 = in[7]; | 658 in0 = in[7]; |
660 in1 = in[0]; | 659 in1 = in[0]; |
661 in2 = in[5]; | 660 in2 = in[5]; |
662 in3 = in[2]; | 661 in3 = in[2]; |
663 in4 = in[3]; | 662 in4 = in[3]; |
664 in5 = in[4]; | 663 in5 = in[4]; |
665 in6 = in[1]; | 664 in6 = in[1]; |
666 in7 = in[6]; | 665 in7 = in[6]; |
667 | 666 |
668 // column transformation | 667 // column transformation |
669 // stage 1 | 668 // stage 1 |
670 // interleave and multiply/add into 32-bit integer | 669 // interleave and multiply/add into 32-bit integer |
671 s0 = _mm_unpacklo_epi16(in0, in1); | 670 s0 = _mm_unpacklo_epi16(in0, in1); |
672 s1 = _mm_unpackhi_epi16(in0, in1); | 671 s1 = _mm_unpackhi_epi16(in0, in1); |
673 s2 = _mm_unpacklo_epi16(in2, in3); | 672 s2 = _mm_unpacklo_epi16(in2, in3); |
674 s3 = _mm_unpackhi_epi16(in2, in3); | 673 s3 = _mm_unpackhi_epi16(in2, in3); |
675 s4 = _mm_unpacklo_epi16(in4, in5); | 674 s4 = _mm_unpacklo_epi16(in4, in5); |
676 s5 = _mm_unpackhi_epi16(in4, in5); | 675 s5 = _mm_unpackhi_epi16(in4, in5); |
(...skipping 173 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
850 in[0] = s0; | 849 in[0] = s0; |
851 in[1] = _mm_sub_epi16(k__const_0, s4); | 850 in[1] = _mm_sub_epi16(k__const_0, s4); |
852 in[2] = s6; | 851 in[2] = s6; |
853 in[3] = _mm_sub_epi16(k__const_0, s2); | 852 in[3] = _mm_sub_epi16(k__const_0, s2); |
854 in[4] = s3; | 853 in[4] = s3; |
855 in[5] = _mm_sub_epi16(k__const_0, s7); | 854 in[5] = _mm_sub_epi16(k__const_0, s7); |
856 in[6] = s5; | 855 in[6] = s5; |
857 in[7] = _mm_sub_epi16(k__const_0, s1); | 856 in[7] = _mm_sub_epi16(k__const_0, s1); |
858 } | 857 } |
859 | 858 |
860 | |
861 void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride, | 859 void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride, |
862 int tx_type) { | 860 int tx_type) { |
863 __m128i in[8]; | 861 __m128i in[8]; |
864 const __m128i zero = _mm_setzero_si128(); | 862 const __m128i zero = _mm_setzero_si128(); |
865 const __m128i final_rounding = _mm_set1_epi16(1<<4); | 863 const __m128i final_rounding = _mm_set1_epi16(1 << 4); |
866 | 864 |
867 // load input data | 865 // load input data |
868 in[0] = _mm_load_si128((const __m128i *)input); | 866 in[0] = _mm_load_si128((const __m128i *)input); |
869 in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1)); | 867 in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1)); |
870 in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2)); | 868 in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2)); |
871 in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3)); | 869 in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3)); |
872 in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4)); | 870 in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4)); |
873 in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5)); | 871 in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5)); |
874 in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6)); | 872 in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6)); |
875 in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7)); | 873 in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7)); |
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
908 | 906 |
909 in[0] = _mm_srai_epi16(in[0], 5); | 907 in[0] = _mm_srai_epi16(in[0], 5); |
910 in[1] = _mm_srai_epi16(in[1], 5); | 908 in[1] = _mm_srai_epi16(in[1], 5); |
911 in[2] = _mm_srai_epi16(in[2], 5); | 909 in[2] = _mm_srai_epi16(in[2], 5); |
912 in[3] = _mm_srai_epi16(in[3], 5); | 910 in[3] = _mm_srai_epi16(in[3], 5); |
913 in[4] = _mm_srai_epi16(in[4], 5); | 911 in[4] = _mm_srai_epi16(in[4], 5); |
914 in[5] = _mm_srai_epi16(in[5], 5); | 912 in[5] = _mm_srai_epi16(in[5], 5); |
915 in[6] = _mm_srai_epi16(in[6], 5); | 913 in[6] = _mm_srai_epi16(in[6], 5); |
916 in[7] = _mm_srai_epi16(in[7], 5); | 914 in[7] = _mm_srai_epi16(in[7], 5); |
917 | 915 |
918 RECON_AND_STORE(dest, in[0]); | 916 RECON_AND_STORE(dest + 0 * stride, in[0]); |
919 RECON_AND_STORE(dest, in[1]); | 917 RECON_AND_STORE(dest + 1 * stride, in[1]); |
920 RECON_AND_STORE(dest, in[2]); | 918 RECON_AND_STORE(dest + 2 * stride, in[2]); |
921 RECON_AND_STORE(dest, in[3]); | 919 RECON_AND_STORE(dest + 3 * stride, in[3]); |
922 RECON_AND_STORE(dest, in[4]); | 920 RECON_AND_STORE(dest + 4 * stride, in[4]); |
923 RECON_AND_STORE(dest, in[5]); | 921 RECON_AND_STORE(dest + 5 * stride, in[5]); |
924 RECON_AND_STORE(dest, in[6]); | 922 RECON_AND_STORE(dest + 6 * stride, in[6]); |
925 RECON_AND_STORE(dest, in[7]); | 923 RECON_AND_STORE(dest + 7 * stride, in[7]); |
926 } | 924 } |
927 | 925 |
928 void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) { | 926 void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
929 const __m128i zero = _mm_setzero_si128(); | 927 const __m128i zero = _mm_setzero_si128(); |
930 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 928 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
931 const __m128i final_rounding = _mm_set1_epi16(1<<4); | 929 const __m128i final_rounding = _mm_set1_epi16(1 << 4); |
932 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); | 930 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); |
933 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); | 931 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); |
934 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); | 932 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); |
935 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); | 933 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); |
936 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); | 934 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); |
937 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 935 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
938 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); | 936 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
939 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); | 937 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); |
940 const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); | 938 const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); |
941 | 939 |
942 __m128i in0, in1, in2, in3, in4, in5, in6, in7; | 940 __m128i in0, in1, in2, in3, in4, in5, in6, in7; |
943 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; | 941 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; |
944 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; | 942 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; |
945 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; | 943 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
946 | 944 |
947 // Rows. Load 4-row input data. | 945 // Rows. Load 4-row input data. |
948 in0 = _mm_load_si128((const __m128i *)input); | 946 in0 = _mm_load_si128((const __m128i *)input); |
949 in1 = _mm_load_si128((const __m128i *)(input + 8 * 1)); | 947 in1 = _mm_load_si128((const __m128i *)(input + 8 * 1)); |
950 in2 = _mm_load_si128((const __m128i *)(input + 8 * 2)); | 948 in2 = _mm_load_si128((const __m128i *)(input + 8 * 2)); |
951 in3 = _mm_load_si128((const __m128i *)(input + 8 * 3)); | 949 in3 = _mm_load_si128((const __m128i *)(input + 8 * 3)); |
952 | 950 |
953 // 8x4 Transpose | 951 // 8x4 Transpose |
954 TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1); | 952 TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1); |
955 // Stage1 | 953 // Stage1 |
956 { //NOLINT | 954 { |
957 const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero); | 955 const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero); |
958 const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero); | 956 const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero); |
959 | 957 |
960 tmp0 = _mm_madd_epi16(lo_17, stg1_0); | 958 tmp0 = _mm_madd_epi16(lo_17, stg1_0); |
961 tmp2 = _mm_madd_epi16(lo_17, stg1_1); | 959 tmp2 = _mm_madd_epi16(lo_17, stg1_1); |
962 tmp4 = _mm_madd_epi16(lo_35, stg1_2); | 960 tmp4 = _mm_madd_epi16(lo_35, stg1_2); |
963 tmp6 = _mm_madd_epi16(lo_35, stg1_3); | 961 tmp6 = _mm_madd_epi16(lo_35, stg1_3); |
964 | 962 |
965 tmp0 = _mm_add_epi32(tmp0, rounding); | 963 tmp0 = _mm_add_epi32(tmp0, rounding); |
966 tmp2 = _mm_add_epi32(tmp2, rounding); | 964 tmp2 = _mm_add_epi32(tmp2, rounding); |
967 tmp4 = _mm_add_epi32(tmp4, rounding); | 965 tmp4 = _mm_add_epi32(tmp4, rounding); |
968 tmp6 = _mm_add_epi32(tmp6, rounding); | 966 tmp6 = _mm_add_epi32(tmp6, rounding); |
969 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); | 967 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); |
970 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); | 968 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); |
971 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); | 969 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); |
972 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); | 970 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); |
973 | 971 |
974 stp1_4 = _mm_packs_epi32(tmp0, tmp2); | 972 stp1_4 = _mm_packs_epi32(tmp0, tmp2); |
975 stp1_5 = _mm_packs_epi32(tmp4, tmp6); | 973 stp1_5 = _mm_packs_epi32(tmp4, tmp6); |
976 } | 974 } |
977 | 975 |
978 // Stage2 | 976 // Stage2 |
979 { //NOLINT | 977 { |
980 const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero); | 978 const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero); |
981 const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero); | 979 const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero); |
982 | 980 |
983 tmp0 = _mm_madd_epi16(lo_04, stg2_0); | 981 tmp0 = _mm_madd_epi16(lo_04, stg2_0); |
984 tmp2 = _mm_madd_epi16(lo_04, stg2_1); | 982 tmp2 = _mm_madd_epi16(lo_04, stg2_1); |
985 tmp4 = _mm_madd_epi16(lo_26, stg2_2); | 983 tmp4 = _mm_madd_epi16(lo_26, stg2_2); |
986 tmp6 = _mm_madd_epi16(lo_26, stg2_3); | 984 tmp6 = _mm_madd_epi16(lo_26, stg2_3); |
987 | 985 |
988 tmp0 = _mm_add_epi32(tmp0, rounding); | 986 tmp0 = _mm_add_epi32(tmp0, rounding); |
989 tmp2 = _mm_add_epi32(tmp2, rounding); | 987 tmp2 = _mm_add_epi32(tmp2, rounding); |
990 tmp4 = _mm_add_epi32(tmp4, rounding); | 988 tmp4 = _mm_add_epi32(tmp4, rounding); |
991 tmp6 = _mm_add_epi32(tmp6, rounding); | 989 tmp6 = _mm_add_epi32(tmp6, rounding); |
992 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); | 990 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); |
993 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); | 991 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); |
994 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); | 992 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); |
995 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); | 993 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); |
996 | 994 |
997 stp2_0 = _mm_packs_epi32(tmp0, tmp2); | 995 stp2_0 = _mm_packs_epi32(tmp0, tmp2); |
998 stp2_2 = _mm_packs_epi32(tmp6, tmp4); | 996 stp2_2 = _mm_packs_epi32(tmp6, tmp4); |
999 | 997 |
1000 tmp0 = _mm_adds_epi16(stp1_4, stp1_5); | 998 tmp0 = _mm_adds_epi16(stp1_4, stp1_5); |
1001 tmp1 = _mm_subs_epi16(stp1_4, stp1_5); | 999 tmp1 = _mm_subs_epi16(stp1_4, stp1_5); |
1002 | 1000 |
1003 stp2_4 = tmp0; | 1001 stp2_4 = tmp0; |
1004 stp2_5 = _mm_unpacklo_epi64(tmp1, zero); | 1002 stp2_5 = _mm_unpacklo_epi64(tmp1, zero); |
1005 stp2_6 = _mm_unpackhi_epi64(tmp1, zero); | 1003 stp2_6 = _mm_unpackhi_epi64(tmp1, zero); |
1006 } | 1004 } |
1007 | 1005 |
1008 // Stage3 | 1006 // Stage3 |
1009 { //NOLINT | 1007 { |
1010 const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6); | 1008 const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6); |
1011 | 1009 |
1012 tmp4 = _mm_adds_epi16(stp2_0, stp2_2); | 1010 tmp4 = _mm_adds_epi16(stp2_0, stp2_2); |
1013 tmp6 = _mm_subs_epi16(stp2_0, stp2_2); | 1011 tmp6 = _mm_subs_epi16(stp2_0, stp2_2); |
1014 | 1012 |
1015 stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4); | 1013 stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4); |
1016 stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4); | 1014 stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4); |
1017 | 1015 |
1018 tmp0 = _mm_madd_epi16(lo_56, stg3_0); | 1016 tmp0 = _mm_madd_epi16(lo_56, stg3_0); |
1019 tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0 | 1017 tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0 |
1020 | 1018 |
1021 tmp0 = _mm_add_epi32(tmp0, rounding); | 1019 tmp0 = _mm_add_epi32(tmp0, rounding); |
1022 tmp2 = _mm_add_epi32(tmp2, rounding); | 1020 tmp2 = _mm_add_epi32(tmp2, rounding); |
1023 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); | 1021 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); |
1024 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); | 1022 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); |
1025 | 1023 |
1026 stp1_5 = _mm_packs_epi32(tmp0, tmp2); | 1024 stp1_5 = _mm_packs_epi32(tmp0, tmp2); |
1027 } | 1025 } |
1028 | 1026 |
1029 // Stage4 | 1027 // Stage4 |
1030 tmp0 = _mm_adds_epi16(stp1_3, stp2_4); | 1028 tmp0 = _mm_adds_epi16(stp1_3, stp2_4); |
1031 tmp1 = _mm_adds_epi16(stp1_2, stp1_5); | 1029 tmp1 = _mm_adds_epi16(stp1_2, stp1_5); |
1032 tmp2 = _mm_subs_epi16(stp1_3, stp2_4); | 1030 tmp2 = _mm_subs_epi16(stp1_3, stp2_4); |
1033 tmp3 = _mm_subs_epi16(stp1_2, stp1_5); | 1031 tmp3 = _mm_subs_epi16(stp1_2, stp1_5); |
1034 | 1032 |
1035 TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) | 1033 TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) |
1036 | 1034 |
1037 IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, | 1035 IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, |
1038 in0, in1, in2, in3, in4, in5, in6, in7); | 1036 in0, in1, in2, in3, in4, in5, in6, in7); |
1039 // Final rounding and shift | 1037 // Final rounding and shift |
1040 in0 = _mm_adds_epi16(in0, final_rounding); | 1038 in0 = _mm_adds_epi16(in0, final_rounding); |
1041 in1 = _mm_adds_epi16(in1, final_rounding); | 1039 in1 = _mm_adds_epi16(in1, final_rounding); |
1042 in2 = _mm_adds_epi16(in2, final_rounding); | 1040 in2 = _mm_adds_epi16(in2, final_rounding); |
1043 in3 = _mm_adds_epi16(in3, final_rounding); | 1041 in3 = _mm_adds_epi16(in3, final_rounding); |
1044 in4 = _mm_adds_epi16(in4, final_rounding); | 1042 in4 = _mm_adds_epi16(in4, final_rounding); |
1045 in5 = _mm_adds_epi16(in5, final_rounding); | 1043 in5 = _mm_adds_epi16(in5, final_rounding); |
1046 in6 = _mm_adds_epi16(in6, final_rounding); | 1044 in6 = _mm_adds_epi16(in6, final_rounding); |
1047 in7 = _mm_adds_epi16(in7, final_rounding); | 1045 in7 = _mm_adds_epi16(in7, final_rounding); |
1048 | 1046 |
1049 in0 = _mm_srai_epi16(in0, 5); | 1047 in0 = _mm_srai_epi16(in0, 5); |
1050 in1 = _mm_srai_epi16(in1, 5); | 1048 in1 = _mm_srai_epi16(in1, 5); |
1051 in2 = _mm_srai_epi16(in2, 5); | 1049 in2 = _mm_srai_epi16(in2, 5); |
1052 in3 = _mm_srai_epi16(in3, 5); | 1050 in3 = _mm_srai_epi16(in3, 5); |
1053 in4 = _mm_srai_epi16(in4, 5); | 1051 in4 = _mm_srai_epi16(in4, 5); |
1054 in5 = _mm_srai_epi16(in5, 5); | 1052 in5 = _mm_srai_epi16(in5, 5); |
1055 in6 = _mm_srai_epi16(in6, 5); | 1053 in6 = _mm_srai_epi16(in6, 5); |
1056 in7 = _mm_srai_epi16(in7, 5); | 1054 in7 = _mm_srai_epi16(in7, 5); |
1057 | 1055 |
1058 RECON_AND_STORE(dest, in0); | 1056 RECON_AND_STORE(dest + 0 * stride, in0); |
1059 RECON_AND_STORE(dest, in1); | 1057 RECON_AND_STORE(dest + 1 * stride, in1); |
1060 RECON_AND_STORE(dest, in2); | 1058 RECON_AND_STORE(dest + 2 * stride, in2); |
1061 RECON_AND_STORE(dest, in3); | 1059 RECON_AND_STORE(dest + 3 * stride, in3); |
1062 RECON_AND_STORE(dest, in4); | 1060 RECON_AND_STORE(dest + 4 * stride, in4); |
1063 RECON_AND_STORE(dest, in5); | 1061 RECON_AND_STORE(dest + 5 * stride, in5); |
1064 RECON_AND_STORE(dest, in6); | 1062 RECON_AND_STORE(dest + 6 * stride, in6); |
1065 RECON_AND_STORE(dest, in7); | 1063 RECON_AND_STORE(dest + 7 * stride, in7); |
1066 } | 1064 } |
1067 | 1065 |
1068 #define IDCT16 \ | 1066 #define IDCT16 \ |
1069 /* Stage2 */ \ | 1067 /* Stage2 */ \ |
1070 { \ | 1068 { \ |
1071 const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \ | 1069 const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \ |
1072 const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \ | 1070 const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \ |
1073 const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \ | 1071 const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \ |
1074 const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \ | 1072 const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \ |
1075 const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \ | 1073 const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \ |
(...skipping 222 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1298 stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ | 1296 stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ |
1299 \ | 1297 \ |
1300 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ | 1298 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ |
1301 stg6_0, stg4_0, stg6_0, stg4_0, \ | 1299 stg6_0, stg4_0, stg6_0, stg4_0, \ |
1302 stp2_10, stp2_13, stp2_11, stp2_12) \ | 1300 stp2_10, stp2_13, stp2_11, stp2_12) \ |
1303 } | 1301 } |
1304 | 1302 |
1305 void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, | 1303 void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, |
1306 int stride) { | 1304 int stride) { |
1307 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 1305 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
1308 const __m128i final_rounding = _mm_set1_epi16(1<<5); | 1306 const __m128i final_rounding = _mm_set1_epi16(1 << 5); |
1309 const __m128i zero = _mm_setzero_si128(); | 1307 const __m128i zero = _mm_setzero_si128(); |
1310 | 1308 |
1311 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); | 1309 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); |
1312 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); | 1310 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); |
1313 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); | 1311 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); |
1314 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); | 1312 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); |
1315 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); | 1313 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); |
1316 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); | 1314 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); |
1317 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); | 1315 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); |
1318 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); | 1316 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); |
(...skipping 18 matching lines...) Expand all Loading... |
1337 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, | 1335 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, |
1338 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, | 1336 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, |
1339 stp1_8_0, stp1_12_0; | 1337 stp1_8_0, stp1_12_0; |
1340 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, | 1338 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, |
1341 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; | 1339 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; |
1342 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; | 1340 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
1343 int i; | 1341 int i; |
1344 | 1342 |
1345 curr1 = l; | 1343 curr1 = l; |
1346 for (i = 0; i < 2; i++) { | 1344 for (i = 0; i < 2; i++) { |
1347 // 1-D idct | 1345 // 1-D idct |
1348 | 1346 |
1349 // Load input data. | 1347 // Load input data. |
1350 in[0] = _mm_load_si128((const __m128i *)input); | 1348 in[0] = _mm_load_si128((const __m128i *)input); |
1351 in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1)); | 1349 in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1)); |
1352 in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2)); | 1350 in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2)); |
1353 in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3)); | 1351 in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3)); |
1354 in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4)); | 1352 in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4)); |
1355 in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5)); | 1353 in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5)); |
1356 in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6)); | 1354 in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6)); |
1357 in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7)); | 1355 in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7)); |
1358 in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8)); | 1356 in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8)); |
1359 in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9)); | 1357 in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9)); |
1360 in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10)); | 1358 in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10)); |
1361 in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11)); | 1359 in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11)); |
1362 in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12)); | 1360 in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12)); |
1363 in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13)); | 1361 in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13)); |
1364 in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14)); | 1362 in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14)); |
1365 in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15)); | 1363 in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15)); |
1366 | 1364 |
1367 array_transpose_8x8(in, in); | 1365 array_transpose_8x8(in, in); |
1368 array_transpose_8x8(in+8, in+8); | 1366 array_transpose_8x8(in + 8, in + 8); |
1369 | 1367 |
1370 IDCT16 | 1368 IDCT16 |
1371 | 1369 |
1372 // Stage7 | 1370 // Stage7 |
1373 curr1[0] = _mm_add_epi16(stp2_0, stp1_15); | 1371 curr1[0] = _mm_add_epi16(stp2_0, stp1_15); |
1374 curr1[1] = _mm_add_epi16(stp2_1, stp1_14); | 1372 curr1[1] = _mm_add_epi16(stp2_1, stp1_14); |
1375 curr1[2] = _mm_add_epi16(stp2_2, stp2_13); | 1373 curr1[2] = _mm_add_epi16(stp2_2, stp2_13); |
1376 curr1[3] = _mm_add_epi16(stp2_3, stp2_12); | 1374 curr1[3] = _mm_add_epi16(stp2_3, stp2_12); |
1377 curr1[4] = _mm_add_epi16(stp2_4, stp2_11); | 1375 curr1[4] = _mm_add_epi16(stp2_4, stp2_11); |
1378 curr1[5] = _mm_add_epi16(stp2_5, stp2_10); | 1376 curr1[5] = _mm_add_epi16(stp2_5, stp2_10); |
1379 curr1[6] = _mm_add_epi16(stp2_6, stp1_9); | 1377 curr1[6] = _mm_add_epi16(stp2_6, stp1_9); |
1380 curr1[7] = _mm_add_epi16(stp2_7, stp1_8); | 1378 curr1[7] = _mm_add_epi16(stp2_7, stp1_8); |
1381 curr1[8] = _mm_sub_epi16(stp2_7, stp1_8); | 1379 curr1[8] = _mm_sub_epi16(stp2_7, stp1_8); |
1382 curr1[9] = _mm_sub_epi16(stp2_6, stp1_9); | 1380 curr1[9] = _mm_sub_epi16(stp2_6, stp1_9); |
1383 curr1[10] = _mm_sub_epi16(stp2_5, stp2_10); | 1381 curr1[10] = _mm_sub_epi16(stp2_5, stp2_10); |
1384 curr1[11] = _mm_sub_epi16(stp2_4, stp2_11); | 1382 curr1[11] = _mm_sub_epi16(stp2_4, stp2_11); |
1385 curr1[12] = _mm_sub_epi16(stp2_3, stp2_12); | 1383 curr1[12] = _mm_sub_epi16(stp2_3, stp2_12); |
1386 curr1[13] = _mm_sub_epi16(stp2_2, stp2_13); | 1384 curr1[13] = _mm_sub_epi16(stp2_2, stp2_13); |
1387 curr1[14] = _mm_sub_epi16(stp2_1, stp1_14); | 1385 curr1[14] = _mm_sub_epi16(stp2_1, stp1_14); |
1388 curr1[15] = _mm_sub_epi16(stp2_0, stp1_15); | 1386 curr1[15] = _mm_sub_epi16(stp2_0, stp1_15); |
1389 | 1387 |
1390 curr1 = r; | 1388 curr1 = r; |
1391 input += 128; | 1389 input += 128; |
1392 } | 1390 } |
1393 for (i = 0; i < 2; i++) { | 1391 for (i = 0; i < 2; i++) { |
1394 // 1-D idct | 1392 int j; |
1395 array_transpose_8x8(l+i*8, in); | 1393 // 1-D idct |
1396 array_transpose_8x8(r+i*8, in+8); | 1394 array_transpose_8x8(l + i * 8, in); |
| 1395 array_transpose_8x8(r + i * 8, in + 8); |
1397 | 1396 |
1398 IDCT16 | 1397 IDCT16 |
1399 | 1398 |
1400 // 2-D | 1399 // 2-D |
1401 in[0] = _mm_add_epi16(stp2_0, stp1_15); | 1400 in[0] = _mm_add_epi16(stp2_0, stp1_15); |
1402 in[1] = _mm_add_epi16(stp2_1, stp1_14); | 1401 in[1] = _mm_add_epi16(stp2_1, stp1_14); |
1403 in[2] = _mm_add_epi16(stp2_2, stp2_13); | 1402 in[2] = _mm_add_epi16(stp2_2, stp2_13); |
1404 in[3] = _mm_add_epi16(stp2_3, stp2_12); | 1403 in[3] = _mm_add_epi16(stp2_3, stp2_12); |
1405 in[4] = _mm_add_epi16(stp2_4, stp2_11); | 1404 in[4] = _mm_add_epi16(stp2_4, stp2_11); |
1406 in[5] = _mm_add_epi16(stp2_5, stp2_10); | 1405 in[5] = _mm_add_epi16(stp2_5, stp2_10); |
1407 in[6] = _mm_add_epi16(stp2_6, stp1_9); | 1406 in[6] = _mm_add_epi16(stp2_6, stp1_9); |
1408 in[7] = _mm_add_epi16(stp2_7, stp1_8); | 1407 in[7] = _mm_add_epi16(stp2_7, stp1_8); |
1409 in[8] = _mm_sub_epi16(stp2_7, stp1_8); | 1408 in[8] = _mm_sub_epi16(stp2_7, stp1_8); |
1410 in[9] = _mm_sub_epi16(stp2_6, stp1_9); | 1409 in[9] = _mm_sub_epi16(stp2_6, stp1_9); |
1411 in[10] = _mm_sub_epi16(stp2_5, stp2_10); | 1410 in[10] = _mm_sub_epi16(stp2_5, stp2_10); |
1412 in[11] = _mm_sub_epi16(stp2_4, stp2_11); | 1411 in[11] = _mm_sub_epi16(stp2_4, stp2_11); |
1413 in[12] = _mm_sub_epi16(stp2_3, stp2_12); | 1412 in[12] = _mm_sub_epi16(stp2_3, stp2_12); |
1414 in[13] = _mm_sub_epi16(stp2_2, stp2_13); | 1413 in[13] = _mm_sub_epi16(stp2_2, stp2_13); |
1415 in[14] = _mm_sub_epi16(stp2_1, stp1_14); | 1414 in[14] = _mm_sub_epi16(stp2_1, stp1_14); |
1416 in[15] = _mm_sub_epi16(stp2_0, stp1_15); | 1415 in[15] = _mm_sub_epi16(stp2_0, stp1_15); |
1417 | 1416 |
| 1417 for (j = 0; j < 16; ++j) { |
1418 // Final rounding and shift | 1418 // Final rounding and shift |
1419 in[0] = _mm_adds_epi16(in[0], final_rounding); | 1419 in[j] = _mm_adds_epi16(in[j], final_rounding); |
1420 in[1] = _mm_adds_epi16(in[1], final_rounding); | 1420 in[j] = _mm_srai_epi16(in[j], 6); |
1421 in[2] = _mm_adds_epi16(in[2], final_rounding); | 1421 RECON_AND_STORE(dest + j * stride, in[j]); |
1422 in[3] = _mm_adds_epi16(in[3], final_rounding); | 1422 } |
1423 in[4] = _mm_adds_epi16(in[4], final_rounding); | |
1424 in[5] = _mm_adds_epi16(in[5], final_rounding); | |
1425 in[6] = _mm_adds_epi16(in[6], final_rounding); | |
1426 in[7] = _mm_adds_epi16(in[7], final_rounding); | |
1427 in[8] = _mm_adds_epi16(in[8], final_rounding); | |
1428 in[9] = _mm_adds_epi16(in[9], final_rounding); | |
1429 in[10] = _mm_adds_epi16(in[10], final_rounding); | |
1430 in[11] = _mm_adds_epi16(in[11], final_rounding); | |
1431 in[12] = _mm_adds_epi16(in[12], final_rounding); | |
1432 in[13] = _mm_adds_epi16(in[13], final_rounding); | |
1433 in[14] = _mm_adds_epi16(in[14], final_rounding); | |
1434 in[15] = _mm_adds_epi16(in[15], final_rounding); | |
1435 | 1423 |
1436 in[0] = _mm_srai_epi16(in[0], 6); | 1424 dest += 8; |
1437 in[1] = _mm_srai_epi16(in[1], 6); | |
1438 in[2] = _mm_srai_epi16(in[2], 6); | |
1439 in[3] = _mm_srai_epi16(in[3], 6); | |
1440 in[4] = _mm_srai_epi16(in[4], 6); | |
1441 in[5] = _mm_srai_epi16(in[5], 6); | |
1442 in[6] = _mm_srai_epi16(in[6], 6); | |
1443 in[7] = _mm_srai_epi16(in[7], 6); | |
1444 in[8] = _mm_srai_epi16(in[8], 6); | |
1445 in[9] = _mm_srai_epi16(in[9], 6); | |
1446 in[10] = _mm_srai_epi16(in[10], 6); | |
1447 in[11] = _mm_srai_epi16(in[11], 6); | |
1448 in[12] = _mm_srai_epi16(in[12], 6); | |
1449 in[13] = _mm_srai_epi16(in[13], 6); | |
1450 in[14] = _mm_srai_epi16(in[14], 6); | |
1451 in[15] = _mm_srai_epi16(in[15], 6); | |
1452 | |
1453 RECON_AND_STORE(dest, in[0]); | |
1454 RECON_AND_STORE(dest, in[1]); | |
1455 RECON_AND_STORE(dest, in[2]); | |
1456 RECON_AND_STORE(dest, in[3]); | |
1457 RECON_AND_STORE(dest, in[4]); | |
1458 RECON_AND_STORE(dest, in[5]); | |
1459 RECON_AND_STORE(dest, in[6]); | |
1460 RECON_AND_STORE(dest, in[7]); | |
1461 RECON_AND_STORE(dest, in[8]); | |
1462 RECON_AND_STORE(dest, in[9]); | |
1463 RECON_AND_STORE(dest, in[10]); | |
1464 RECON_AND_STORE(dest, in[11]); | |
1465 RECON_AND_STORE(dest, in[12]); | |
1466 RECON_AND_STORE(dest, in[13]); | |
1467 RECON_AND_STORE(dest, in[14]); | |
1468 RECON_AND_STORE(dest, in[15]); | |
1469 | |
1470 dest += 8 - (stride * 16); | |
1471 } | 1425 } |
1472 } | 1426 } |
1473 | 1427 |
// NOTE: this listing is a side-by-side diff. On every line the old revision
// is to the left of the column separator and the new revision to the right.
//
// DC-only 16x16 inverse transform plus reconstruction: the only coefficient
// read is input[0]. It is passed through dct_const_round_shift() twice, each
// time multiplied by cospi_16_64, then through ROUND_POWER_OF_TWO(a, 6), and
// the result is broadcast to all eight 16-bit lanes of dc_value. That vector
// is handed to RECON_AND_STORE for 16 rows, in two passes that are 8 bytes
// apart horizontally.
//
// input:  coefficient buffer; only element 0 is used here.
// dest:   destination pixel buffer, updated in place via RECON_AND_STORE.
// stride: byte distance between consecutive rows of dest.
//
// NOTE(review): RECON_AND_STORE is defined outside this view. Presumably it
// loads 8 pixels, adds the 16-bit values with unsigned saturation and stores
// them back (analogous to RECON_AND_STORE4X4) -- confirm in the header.
1474 void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { | 1428 void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
1475 __m128i dc_value; | 1429 __m128i dc_value; |
// zero is not referenced directly in this function; presumably it is picked
// up by name inside RECON_AND_STORE, as RECON_AND_STORE4X4 does -- confirm.
1476 const __m128i zero = _mm_setzero_si128(); | 1430 const __m128i zero = _mm_setzero_si128(); |
1477 int a, i; | 1431 int a, i; |
1478 | 1432 |
// Derive the single residual value from the DC coefficient.
1479 a = dct_const_round_shift(input[0] * cospi_16_64); | 1433 a = dct_const_round_shift(input[0] * cospi_16_64); |
1480 a = dct_const_round_shift(a * cospi_16_64); | 1434 a = dct_const_round_shift(a * cospi_16_64); |
1481 a = ROUND_POWER_OF_TWO(a, 6); | 1435 a = ROUND_POWER_OF_TWO(a, 6); |
1482 | 1436 |
// Replicate the value into every 16-bit lane.
1483 dc_value = _mm_set1_epi16(a); | 1437 dc_value = _mm_set1_epi16(a); |
1484 | 1438 |
// Two passes: each issues 16 row stores, then dest moves 8 bytes to the right.
1485 for (i = 0; i < 2; ++i) { | 1439 for (i = 0; i < 2; ++i) { |
1486 RECON_AND_STORE(dest, dc_value); | 1440 RECON_AND_STORE(dest + 0 * stride, dc_value); |
1487 RECON_AND_STORE(dest, dc_value); | 1441 RECON_AND_STORE(dest + 1 * stride, dc_value); |
1488 RECON_AND_STORE(dest, dc_value); | 1442 RECON_AND_STORE(dest + 2 * stride, dc_value); |
1489 RECON_AND_STORE(dest, dc_value); | 1443 RECON_AND_STORE(dest + 3 * stride, dc_value); |
1490 RECON_AND_STORE(dest, dc_value); | 1444 RECON_AND_STORE(dest + 4 * stride, dc_value); |
1491 RECON_AND_STORE(dest, dc_value); | 1445 RECON_AND_STORE(dest + 5 * stride, dc_value); |
1492 RECON_AND_STORE(dest, dc_value); | 1446 RECON_AND_STORE(dest + 6 * stride, dc_value); |
1493 RECON_AND_STORE(dest, dc_value); | 1447 RECON_AND_STORE(dest + 7 * stride, dc_value); |
1494 RECON_AND_STORE(dest, dc_value); | 1448 RECON_AND_STORE(dest + 8 * stride, dc_value); |
1495 RECON_AND_STORE(dest, dc_value); | 1449 RECON_AND_STORE(dest + 9 * stride, dc_value); |
1496 RECON_AND_STORE(dest, dc_value); | 1450 RECON_AND_STORE(dest + 10 * stride, dc_value); |
1497 RECON_AND_STORE(dest, dc_value); | 1451 RECON_AND_STORE(dest + 11 * stride, dc_value); |
1498 RECON_AND_STORE(dest, dc_value); | 1452 RECON_AND_STORE(dest + 12 * stride, dc_value); |
1499 RECON_AND_STORE(dest, dc_value); | 1453 RECON_AND_STORE(dest + 13 * stride, dc_value); |
1500 RECON_AND_STORE(dest, dc_value); | 1454 RECON_AND_STORE(dest + 14 * stride, dc_value); |
1501 RECON_AND_STORE(dest, dc_value); | 1455 RECON_AND_STORE(dest + 15 * stride, dc_value); |
// Old revision: dest is rewound by 16 rows while stepping 8 bytes right,
// presumably because the old macro advanced dest by stride on every call (the
// old RECON_AND_STORE4X4 did exactly that) -- confirm. New revision: the row
// offset is passed explicitly to each call, so only the 8-byte step remains.
1502 dest += 8 - (stride * 16); | 1456 dest += 8; |
1503 } | 1457 } |
1504 } | 1458 } |
1505 | 1459 |
1506 static void iadst16_8col(__m128i *in) { | 1460 static void iadst16_8col(__m128i *in) { |
1507 // perform 16x16 1-D ADST for 8 columns | 1461 // perform 16x16 1-D ADST for 8 columns |
1508 __m128i s[16], x[16], u[32], v[32]; | 1462 __m128i s[16], x[16], u[32], v[32]; |
1509 const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); | 1463 const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); |
1510 const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); | 1464 const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); |
1511 const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); | 1465 const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); |
1512 const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); | 1466 const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); |
(...skipping 847 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2360 } | 2314 } |
2361 | 2315 |
2362 write_buffer_8x16(dest, in0, stride); | 2316 write_buffer_8x16(dest, in0, stride); |
2363 dest += 8; | 2317 dest += 8; |
2364 write_buffer_8x16(dest, in1, stride); | 2318 write_buffer_8x16(dest, in1, stride); |
2365 } | 2319 } |
2366 | 2320 |
2367 void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, | 2321 void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, |
2368 int stride) { | 2322 int stride) { |
2369 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 2323 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
2370 const __m128i final_rounding = _mm_set1_epi16(1<<5); | 2324 const __m128i final_rounding = _mm_set1_epi16(1 << 5); |
2371 const __m128i zero = _mm_setzero_si128(); | 2325 const __m128i zero = _mm_setzero_si128(); |
2372 | 2326 |
2373 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); | 2327 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); |
2374 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); | 2328 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); |
2375 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); | 2329 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); |
2376 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); | 2330 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); |
2377 | 2331 |
2378 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); | 2332 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); |
2379 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); | 2333 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); |
2380 | 2334 |
(...skipping 18 matching lines...) Expand all Loading... |
2399 in[0] = _mm_load_si128((const __m128i *)input); | 2353 in[0] = _mm_load_si128((const __m128i *)input); |
2400 in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2)); | 2354 in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2)); |
2401 in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4)); | 2355 in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4)); |
2402 in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6)); | 2356 in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6)); |
2403 | 2357 |
2404 TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]); | 2358 TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]); |
2405 | 2359 |
2406 // Stage2 | 2360 // Stage2 |
2407 { | 2361 { |
2408 const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero); | 2362 const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero); |
2409 const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]); | 2363 const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]); |
2410 | 2364 |
2411 tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); | 2365 tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); |
2412 tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); | 2366 tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); |
2413 tmp5 = _mm_madd_epi16(lo_13_3, stg2_6); | 2367 tmp5 = _mm_madd_epi16(lo_13_3, stg2_6); |
2414 tmp7 = _mm_madd_epi16(lo_13_3, stg2_7); | 2368 tmp7 = _mm_madd_epi16(lo_13_3, stg2_7); |
2415 | 2369 |
2416 tmp0 = _mm_add_epi32(tmp0, rounding); | 2370 tmp0 = _mm_add_epi32(tmp0, rounding); |
2417 tmp2 = _mm_add_epi32(tmp2, rounding); | 2371 tmp2 = _mm_add_epi32(tmp2, rounding); |
2418 tmp5 = _mm_add_epi32(tmp5, rounding); | 2372 tmp5 = _mm_add_epi32(tmp5, rounding); |
2419 tmp7 = _mm_add_epi32(tmp7, rounding); | 2373 tmp7 = _mm_add_epi32(tmp7, rounding); |
(...skipping 140 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2560 l[9] = _mm_sub_epi16(stp2_6, stp1_9); | 2514 l[9] = _mm_sub_epi16(stp2_6, stp1_9); |
2561 l[10] = _mm_sub_epi16(stp2_5, stp2_10); | 2515 l[10] = _mm_sub_epi16(stp2_5, stp2_10); |
2562 l[11] = _mm_sub_epi16(stp2_4, stp2_11); | 2516 l[11] = _mm_sub_epi16(stp2_4, stp2_11); |
2563 l[12] = _mm_sub_epi16(stp2_3, stp2_12); | 2517 l[12] = _mm_sub_epi16(stp2_3, stp2_12); |
2564 l[13] = _mm_sub_epi16(stp2_2, stp2_13); | 2518 l[13] = _mm_sub_epi16(stp2_2, stp2_13); |
2565 l[14] = _mm_sub_epi16(stp2_1, stp1_14); | 2519 l[14] = _mm_sub_epi16(stp2_1, stp1_14); |
2566 l[15] = _mm_sub_epi16(stp2_0, stp1_15); | 2520 l[15] = _mm_sub_epi16(stp2_0, stp1_15); |
2567 | 2521 |
2568 // Second 1-D inverse transform, performed per 8x16 block | 2522 // Second 1-D inverse transform, performed per 8x16 block |
2569 for (i = 0; i < 2; i++) { | 2523 for (i = 0; i < 2; i++) { |
2570 array_transpose_4X8(l + 8*i, in); | 2524 int j; |
| 2525 array_transpose_4X8(l + 8 * i, in); |
2571 | 2526 |
2572 IDCT16_10 | 2527 IDCT16_10 |
2573 | 2528 |
2574 // Stage7 | 2529 // Stage7 |
2575 in[0] = _mm_add_epi16(stp2_0, stp1_15); | 2530 in[0] = _mm_add_epi16(stp2_0, stp1_15); |
2576 in[1] = _mm_add_epi16(stp2_1, stp1_14); | 2531 in[1] = _mm_add_epi16(stp2_1, stp1_14); |
2577 in[2] = _mm_add_epi16(stp2_2, stp2_13); | 2532 in[2] = _mm_add_epi16(stp2_2, stp2_13); |
2578 in[3] = _mm_add_epi16(stp2_3, stp2_12); | 2533 in[3] = _mm_add_epi16(stp2_3, stp2_12); |
2579 in[4] = _mm_add_epi16(stp2_4, stp2_11); | 2534 in[4] = _mm_add_epi16(stp2_4, stp2_11); |
2580 in[5] = _mm_add_epi16(stp2_5, stp2_10); | 2535 in[5] = _mm_add_epi16(stp2_5, stp2_10); |
2581 in[6] = _mm_add_epi16(stp2_6, stp1_9); | 2536 in[6] = _mm_add_epi16(stp2_6, stp1_9); |
2582 in[7] = _mm_add_epi16(stp2_7, stp1_8); | 2537 in[7] = _mm_add_epi16(stp2_7, stp1_8); |
2583 in[8] = _mm_sub_epi16(stp2_7, stp1_8); | 2538 in[8] = _mm_sub_epi16(stp2_7, stp1_8); |
2584 in[9] = _mm_sub_epi16(stp2_6, stp1_9); | 2539 in[9] = _mm_sub_epi16(stp2_6, stp1_9); |
2585 in[10] = _mm_sub_epi16(stp2_5, stp2_10); | 2540 in[10] = _mm_sub_epi16(stp2_5, stp2_10); |
2586 in[11] = _mm_sub_epi16(stp2_4, stp2_11); | 2541 in[11] = _mm_sub_epi16(stp2_4, stp2_11); |
2587 in[12] = _mm_sub_epi16(stp2_3, stp2_12); | 2542 in[12] = _mm_sub_epi16(stp2_3, stp2_12); |
2588 in[13] = _mm_sub_epi16(stp2_2, stp2_13); | 2543 in[13] = _mm_sub_epi16(stp2_2, stp2_13); |
2589 in[14] = _mm_sub_epi16(stp2_1, stp1_14); | 2544 in[14] = _mm_sub_epi16(stp2_1, stp1_14); |
2590 in[15] = _mm_sub_epi16(stp2_0, stp1_15); | 2545 in[15] = _mm_sub_epi16(stp2_0, stp1_15); |
2591 | 2546 |
2592 // Final rounding and shift | 2547 for (j = 0; j < 16; ++j) { |
2593 in[0] = _mm_adds_epi16(in[0], final_rounding); | 2548 // Final rounding and shift |
2594 in[1] = _mm_adds_epi16(in[1], final_rounding); | 2549 in[j] = _mm_adds_epi16(in[j], final_rounding); |
2595 in[2] = _mm_adds_epi16(in[2], final_rounding); | 2550 in[j] = _mm_srai_epi16(in[j], 6); |
2596 in[3] = _mm_adds_epi16(in[3], final_rounding); | 2551 RECON_AND_STORE(dest + j * stride, in[j]); |
2597 in[4] = _mm_adds_epi16(in[4], final_rounding); | 2552 } |
2598 in[5] = _mm_adds_epi16(in[5], final_rounding); | |
2599 in[6] = _mm_adds_epi16(in[6], final_rounding); | |
2600 in[7] = _mm_adds_epi16(in[7], final_rounding); | |
2601 in[8] = _mm_adds_epi16(in[8], final_rounding); | |
2602 in[9] = _mm_adds_epi16(in[9], final_rounding); | |
2603 in[10] = _mm_adds_epi16(in[10], final_rounding); | |
2604 in[11] = _mm_adds_epi16(in[11], final_rounding); | |
2605 in[12] = _mm_adds_epi16(in[12], final_rounding); | |
2606 in[13] = _mm_adds_epi16(in[13], final_rounding); | |
2607 in[14] = _mm_adds_epi16(in[14], final_rounding); | |
2608 in[15] = _mm_adds_epi16(in[15], final_rounding); | |
2609 | 2553 |
2610 in[0] = _mm_srai_epi16(in[0], 6); | 2554 dest += 8; |
2611 in[1] = _mm_srai_epi16(in[1], 6); | |
2612 in[2] = _mm_srai_epi16(in[2], 6); | |
2613 in[3] = _mm_srai_epi16(in[3], 6); | |
2614 in[4] = _mm_srai_epi16(in[4], 6); | |
2615 in[5] = _mm_srai_epi16(in[5], 6); | |
2616 in[6] = _mm_srai_epi16(in[6], 6); | |
2617 in[7] = _mm_srai_epi16(in[7], 6); | |
2618 in[8] = _mm_srai_epi16(in[8], 6); | |
2619 in[9] = _mm_srai_epi16(in[9], 6); | |
2620 in[10] = _mm_srai_epi16(in[10], 6); | |
2621 in[11] = _mm_srai_epi16(in[11], 6); | |
2622 in[12] = _mm_srai_epi16(in[12], 6); | |
2623 in[13] = _mm_srai_epi16(in[13], 6); | |
2624 in[14] = _mm_srai_epi16(in[14], 6); | |
2625 in[15] = _mm_srai_epi16(in[15], 6); | |
2626 | |
2627 RECON_AND_STORE(dest, in[0]); | |
2628 RECON_AND_STORE(dest, in[1]); | |
2629 RECON_AND_STORE(dest, in[2]); | |
2630 RECON_AND_STORE(dest, in[3]); | |
2631 RECON_AND_STORE(dest, in[4]); | |
2632 RECON_AND_STORE(dest, in[5]); | |
2633 RECON_AND_STORE(dest, in[6]); | |
2634 RECON_AND_STORE(dest, in[7]); | |
2635 RECON_AND_STORE(dest, in[8]); | |
2636 RECON_AND_STORE(dest, in[9]); | |
2637 RECON_AND_STORE(dest, in[10]); | |
2638 RECON_AND_STORE(dest, in[11]); | |
2639 RECON_AND_STORE(dest, in[12]); | |
2640 RECON_AND_STORE(dest, in[13]); | |
2641 RECON_AND_STORE(dest, in[14]); | |
2642 RECON_AND_STORE(dest, in[15]); | |
2643 | |
2644 dest += 8 - (stride * 16); | |
2645 } | 2555 } |
2646 } | 2556 } |
2647 | 2557 |
2648 #define LOAD_DQCOEFF(reg, input) \ | 2558 #define LOAD_DQCOEFF(reg, input) \ |
2649 { \ | 2559 { \ |
2650 reg = _mm_load_si128((const __m128i *) input); \ | 2560 reg = _mm_load_si128((const __m128i *) input); \ |
2651 input += 8; \ | 2561 input += 8; \ |
2652 } \ | 2562 } \ |
2653 | 2563 |
2654 #define IDCT32_34 \ | 2564 #define IDCT32_34 \ |
(...skipping 624 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3279 stp1_23, stp1_24) \ | 3189 stp1_23, stp1_24) \ |
3280 \ | 3190 \ |
3281 stp1_28 = stp2_28; \ | 3191 stp1_28 = stp2_28; \ |
3282 stp1_29 = stp2_29; \ | 3192 stp1_29 = stp2_29; \ |
3283 stp1_30 = stp2_30; \ | 3193 stp1_30 = stp2_30; \ |
3284 stp1_31 = stp2_31; \ | 3194 stp1_31 = stp2_31; \ |
3285 } | 3195 } |
3286 | 3196 |
3287 // Only upper-left 8x8 has non-zero coeff | 3197 // Only upper-left 8x8 has non-zero coeff |
3288 void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, | 3198 void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, |
3289 int stride) { | 3199 int stride) { |
3290 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 3200 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
3291 const __m128i final_rounding = _mm_set1_epi16(1<<5); | 3201 const __m128i final_rounding = _mm_set1_epi16(1<<5); |
3292 | 3202 |
3293 // idct constants for each stage | 3203 // idct constants for each stage |
3294 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); | 3204 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); |
3295 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); | 3205 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); |
3296 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); | 3206 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); |
3297 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); | 3207 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); |
3298 const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); | 3208 const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); |
3299 const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); | 3209 const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); |
(...skipping 80 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3380 LOAD_DQCOEFF(in[6], input); | 3290 LOAD_DQCOEFF(in[6], input); |
3381 LOAD_DQCOEFF(in[14], input); | 3291 LOAD_DQCOEFF(in[14], input); |
3382 LOAD_DQCOEFF(in[22], input); | 3292 LOAD_DQCOEFF(in[22], input); |
3383 LOAD_DQCOEFF(in[30], input); | 3293 LOAD_DQCOEFF(in[30], input); |
3384 LOAD_DQCOEFF(in[7], input); | 3294 LOAD_DQCOEFF(in[7], input); |
3385 LOAD_DQCOEFF(in[15], input); | 3295 LOAD_DQCOEFF(in[15], input); |
3386 LOAD_DQCOEFF(in[23], input); | 3296 LOAD_DQCOEFF(in[23], input); |
3387 LOAD_DQCOEFF(in[31], input); | 3297 LOAD_DQCOEFF(in[31], input); |
3388 | 3298 |
3389 array_transpose_8x8(in, in); | 3299 array_transpose_8x8(in, in); |
3390 array_transpose_8x8(in+8, in+8); | 3300 array_transpose_8x8(in + 8, in + 8); |
3391 array_transpose_8x8(in+16, in+16); | 3301 array_transpose_8x8(in + 16, in + 16); |
3392 array_transpose_8x8(in+24, in+24); | 3302 array_transpose_8x8(in + 24, in + 24); |
3393 | 3303 |
3394 IDCT32 | 3304 IDCT32 |
3395 | 3305 |
3396 // 1_D: Store 32 intermediate results for each 8x32 block. | 3306 // 1_D: Store 32 intermediate results for each 8x32 block. |
3397 col[0] = _mm_add_epi16(stp1_0, stp1_31); | 3307 col[0] = _mm_add_epi16(stp1_0, stp1_31); |
3398 col[1] = _mm_add_epi16(stp1_1, stp1_30); | 3308 col[1] = _mm_add_epi16(stp1_1, stp1_30); |
3399 col[2] = _mm_add_epi16(stp1_2, stp1_29); | 3309 col[2] = _mm_add_epi16(stp1_2, stp1_29); |
3400 col[3] = _mm_add_epi16(stp1_3, stp1_28); | 3310 col[3] = _mm_add_epi16(stp1_3, stp1_28); |
3401 col[4] = _mm_add_epi16(stp1_4, stp1_27); | 3311 col[4] = _mm_add_epi16(stp1_4, stp1_27); |
3402 col[5] = _mm_add_epi16(stp1_5, stp1_26); | 3312 col[5] = _mm_add_epi16(stp1_5, stp1_26); |
(...skipping 17 matching lines...) Expand all Loading... |
3420 col[23] = _mm_sub_epi16(stp1_8, stp1_23); | 3330 col[23] = _mm_sub_epi16(stp1_8, stp1_23); |
3421 col[24] = _mm_sub_epi16(stp1_7, stp1_24); | 3331 col[24] = _mm_sub_epi16(stp1_7, stp1_24); |
3422 col[25] = _mm_sub_epi16(stp1_6, stp1_25); | 3332 col[25] = _mm_sub_epi16(stp1_6, stp1_25); |
3423 col[26] = _mm_sub_epi16(stp1_5, stp1_26); | 3333 col[26] = _mm_sub_epi16(stp1_5, stp1_26); |
3424 col[27] = _mm_sub_epi16(stp1_4, stp1_27); | 3334 col[27] = _mm_sub_epi16(stp1_4, stp1_27); |
3425 col[28] = _mm_sub_epi16(stp1_3, stp1_28); | 3335 col[28] = _mm_sub_epi16(stp1_3, stp1_28); |
3426 col[29] = _mm_sub_epi16(stp1_2, stp1_29); | 3336 col[29] = _mm_sub_epi16(stp1_2, stp1_29); |
3427 col[30] = _mm_sub_epi16(stp1_1, stp1_30); | 3337 col[30] = _mm_sub_epi16(stp1_1, stp1_30); |
3428 col[31] = _mm_sub_epi16(stp1_0, stp1_31); | 3338 col[31] = _mm_sub_epi16(stp1_0, stp1_31); |
3429 for (i = 0; i < 4; i++) { | 3339 for (i = 0; i < 4; i++) { |
3430 const __m128i zero = _mm_setzero_si128(); | 3340 int j; |
3431 // Transpose 32x8 block to 8x32 block | 3341 const __m128i zero = _mm_setzero_si128(); |
3432 array_transpose_8x8(col+i*8, in); | 3342 // Transpose 32x8 block to 8x32 block |
3433 IDCT32_34 | 3343 array_transpose_8x8(col + i * 8, in); |
| 3344 IDCT32_34 |
3434 | 3345 |
3435 // 2_D: Calculate the results and store them to destination. | 3346 // 2_D: Calculate the results and store them to destination. |
3436 in[0] = _mm_add_epi16(stp1_0, stp1_31); | 3347 in[0] = _mm_add_epi16(stp1_0, stp1_31); |
3437 in[1] = _mm_add_epi16(stp1_1, stp1_30); | 3348 in[1] = _mm_add_epi16(stp1_1, stp1_30); |
3438 in[2] = _mm_add_epi16(stp1_2, stp1_29); | 3349 in[2] = _mm_add_epi16(stp1_2, stp1_29); |
3439 in[3] = _mm_add_epi16(stp1_3, stp1_28); | 3350 in[3] = _mm_add_epi16(stp1_3, stp1_28); |
3440 in[4] = _mm_add_epi16(stp1_4, stp1_27); | 3351 in[4] = _mm_add_epi16(stp1_4, stp1_27); |
3441 in[5] = _mm_add_epi16(stp1_5, stp1_26); | 3352 in[5] = _mm_add_epi16(stp1_5, stp1_26); |
3442 in[6] = _mm_add_epi16(stp1_6, stp1_25); | 3353 in[6] = _mm_add_epi16(stp1_6, stp1_25); |
3443 in[7] = _mm_add_epi16(stp1_7, stp1_24); | 3354 in[7] = _mm_add_epi16(stp1_7, stp1_24); |
3444 in[8] = _mm_add_epi16(stp1_8, stp1_23); | 3355 in[8] = _mm_add_epi16(stp1_8, stp1_23); |
3445 in[9] = _mm_add_epi16(stp1_9, stp1_22); | 3356 in[9] = _mm_add_epi16(stp1_9, stp1_22); |
3446 in[10] = _mm_add_epi16(stp1_10, stp1_21); | 3357 in[10] = _mm_add_epi16(stp1_10, stp1_21); |
3447 in[11] = _mm_add_epi16(stp1_11, stp1_20); | 3358 in[11] = _mm_add_epi16(stp1_11, stp1_20); |
3448 in[12] = _mm_add_epi16(stp1_12, stp1_19); | 3359 in[12] = _mm_add_epi16(stp1_12, stp1_19); |
3449 in[13] = _mm_add_epi16(stp1_13, stp1_18); | 3360 in[13] = _mm_add_epi16(stp1_13, stp1_18); |
3450 in[14] = _mm_add_epi16(stp1_14, stp1_17); | 3361 in[14] = _mm_add_epi16(stp1_14, stp1_17); |
3451 in[15] = _mm_add_epi16(stp1_15, stp1_16); | 3362 in[15] = _mm_add_epi16(stp1_15, stp1_16); |
3452 in[16] = _mm_sub_epi16(stp1_15, stp1_16); | 3363 in[16] = _mm_sub_epi16(stp1_15, stp1_16); |
3453 in[17] = _mm_sub_epi16(stp1_14, stp1_17); | 3364 in[17] = _mm_sub_epi16(stp1_14, stp1_17); |
3454 in[18] = _mm_sub_epi16(stp1_13, stp1_18); | 3365 in[18] = _mm_sub_epi16(stp1_13, stp1_18); |
3455 in[19] = _mm_sub_epi16(stp1_12, stp1_19); | 3366 in[19] = _mm_sub_epi16(stp1_12, stp1_19); |
3456 in[20] = _mm_sub_epi16(stp1_11, stp1_20); | 3367 in[20] = _mm_sub_epi16(stp1_11, stp1_20); |
3457 in[21] = _mm_sub_epi16(stp1_10, stp1_21); | 3368 in[21] = _mm_sub_epi16(stp1_10, stp1_21); |
3458 in[22] = _mm_sub_epi16(stp1_9, stp1_22); | 3369 in[22] = _mm_sub_epi16(stp1_9, stp1_22); |
3459 in[23] = _mm_sub_epi16(stp1_8, stp1_23); | 3370 in[23] = _mm_sub_epi16(stp1_8, stp1_23); |
3460 in[24] = _mm_sub_epi16(stp1_7, stp1_24); | 3371 in[24] = _mm_sub_epi16(stp1_7, stp1_24); |
3461 in[25] = _mm_sub_epi16(stp1_6, stp1_25); | 3372 in[25] = _mm_sub_epi16(stp1_6, stp1_25); |
3462 in[26] = _mm_sub_epi16(stp1_5, stp1_26); | 3373 in[26] = _mm_sub_epi16(stp1_5, stp1_26); |
3463 in[27] = _mm_sub_epi16(stp1_4, stp1_27); | 3374 in[27] = _mm_sub_epi16(stp1_4, stp1_27); |
3464 in[28] = _mm_sub_epi16(stp1_3, stp1_28); | 3375 in[28] = _mm_sub_epi16(stp1_3, stp1_28); |
3465 in[29] = _mm_sub_epi16(stp1_2, stp1_29); | 3376 in[29] = _mm_sub_epi16(stp1_2, stp1_29); |
3466 in[30] = _mm_sub_epi16(stp1_1, stp1_30); | 3377 in[30] = _mm_sub_epi16(stp1_1, stp1_30); |
3467 in[31] = _mm_sub_epi16(stp1_0, stp1_31); | 3378 in[31] = _mm_sub_epi16(stp1_0, stp1_31); |
3468 | 3379 |
| 3380 for (j = 0; j < 32; ++j) { |
3469 // Final rounding and shift | 3381 // Final rounding and shift |
3470 in[0] = _mm_adds_epi16(in[0], final_rounding); | 3382 in[j] = _mm_adds_epi16(in[j], final_rounding); |
3471 in[1] = _mm_adds_epi16(in[1], final_rounding); | 3383 in[j] = _mm_srai_epi16(in[j], 6); |
3472 in[2] = _mm_adds_epi16(in[2], final_rounding); | 3384 RECON_AND_STORE(dest + j * stride, in[j]); |
3473 in[3] = _mm_adds_epi16(in[3], final_rounding); | 3385 } |
3474 in[4] = _mm_adds_epi16(in[4], final_rounding); | |
3475 in[5] = _mm_adds_epi16(in[5], final_rounding); | |
3476 in[6] = _mm_adds_epi16(in[6], final_rounding); | |
3477 in[7] = _mm_adds_epi16(in[7], final_rounding); | |
3478 in[8] = _mm_adds_epi16(in[8], final_rounding); | |
3479 in[9] = _mm_adds_epi16(in[9], final_rounding); | |
3480 in[10] = _mm_adds_epi16(in[10], final_rounding); | |
3481 in[11] = _mm_adds_epi16(in[11], final_rounding); | |
3482 in[12] = _mm_adds_epi16(in[12], final_rounding); | |
3483 in[13] = _mm_adds_epi16(in[13], final_rounding); | |
3484 in[14] = _mm_adds_epi16(in[14], final_rounding); | |
3485 in[15] = _mm_adds_epi16(in[15], final_rounding); | |
3486 in[16] = _mm_adds_epi16(in[16], final_rounding); | |
3487 in[17] = _mm_adds_epi16(in[17], final_rounding); | |
3488 in[18] = _mm_adds_epi16(in[18], final_rounding); | |
3489 in[19] = _mm_adds_epi16(in[19], final_rounding); | |
3490 in[20] = _mm_adds_epi16(in[20], final_rounding); | |
3491 in[21] = _mm_adds_epi16(in[21], final_rounding); | |
3492 in[22] = _mm_adds_epi16(in[22], final_rounding); | |
3493 in[23] = _mm_adds_epi16(in[23], final_rounding); | |
3494 in[24] = _mm_adds_epi16(in[24], final_rounding); | |
3495 in[25] = _mm_adds_epi16(in[25], final_rounding); | |
3496 in[26] = _mm_adds_epi16(in[26], final_rounding); | |
3497 in[27] = _mm_adds_epi16(in[27], final_rounding); | |
3498 in[28] = _mm_adds_epi16(in[28], final_rounding); | |
3499 in[29] = _mm_adds_epi16(in[29], final_rounding); | |
3500 in[30] = _mm_adds_epi16(in[30], final_rounding); | |
3501 in[31] = _mm_adds_epi16(in[31], final_rounding); | |
3502 | 3386 |
3503 in[0] = _mm_srai_epi16(in[0], 6); | 3387 dest += 8; |
3504 in[1] = _mm_srai_epi16(in[1], 6); | |
3505 in[2] = _mm_srai_epi16(in[2], 6); | |
3506 in[3] = _mm_srai_epi16(in[3], 6); | |
3507 in[4] = _mm_srai_epi16(in[4], 6); | |
3508 in[5] = _mm_srai_epi16(in[5], 6); | |
3509 in[6] = _mm_srai_epi16(in[6], 6); | |
3510 in[7] = _mm_srai_epi16(in[7], 6); | |
3511 in[8] = _mm_srai_epi16(in[8], 6); | |
3512 in[9] = _mm_srai_epi16(in[9], 6); | |
3513 in[10] = _mm_srai_epi16(in[10], 6); | |
3514 in[11] = _mm_srai_epi16(in[11], 6); | |
3515 in[12] = _mm_srai_epi16(in[12], 6); | |
3516 in[13] = _mm_srai_epi16(in[13], 6); | |
3517 in[14] = _mm_srai_epi16(in[14], 6); | |
3518 in[15] = _mm_srai_epi16(in[15], 6); | |
3519 in[16] = _mm_srai_epi16(in[16], 6); | |
3520 in[17] = _mm_srai_epi16(in[17], 6); | |
3521 in[18] = _mm_srai_epi16(in[18], 6); | |
3522 in[19] = _mm_srai_epi16(in[19], 6); | |
3523 in[20] = _mm_srai_epi16(in[20], 6); | |
3524 in[21] = _mm_srai_epi16(in[21], 6); | |
3525 in[22] = _mm_srai_epi16(in[22], 6); | |
3526 in[23] = _mm_srai_epi16(in[23], 6); | |
3527 in[24] = _mm_srai_epi16(in[24], 6); | |
3528 in[25] = _mm_srai_epi16(in[25], 6); | |
3529 in[26] = _mm_srai_epi16(in[26], 6); | |
3530 in[27] = _mm_srai_epi16(in[27], 6); | |
3531 in[28] = _mm_srai_epi16(in[28], 6); | |
3532 in[29] = _mm_srai_epi16(in[29], 6); | |
3533 in[30] = _mm_srai_epi16(in[30], 6); | |
3534 in[31] = _mm_srai_epi16(in[31], 6); | |
3535 | |
3536 RECON_AND_STORE(dest, in[0]); | |
3537 RECON_AND_STORE(dest, in[1]); | |
3538 RECON_AND_STORE(dest, in[2]); | |
3539 RECON_AND_STORE(dest, in[3]); | |
3540 RECON_AND_STORE(dest, in[4]); | |
3541 RECON_AND_STORE(dest, in[5]); | |
3542 RECON_AND_STORE(dest, in[6]); | |
3543 RECON_AND_STORE(dest, in[7]); | |
3544 RECON_AND_STORE(dest, in[8]); | |
3545 RECON_AND_STORE(dest, in[9]); | |
3546 RECON_AND_STORE(dest, in[10]); | |
3547 RECON_AND_STORE(dest, in[11]); | |
3548 RECON_AND_STORE(dest, in[12]); | |
3549 RECON_AND_STORE(dest, in[13]); | |
3550 RECON_AND_STORE(dest, in[14]); | |
3551 RECON_AND_STORE(dest, in[15]); | |
3552 RECON_AND_STORE(dest, in[16]); | |
3553 RECON_AND_STORE(dest, in[17]); | |
3554 RECON_AND_STORE(dest, in[18]); | |
3555 RECON_AND_STORE(dest, in[19]); | |
3556 RECON_AND_STORE(dest, in[20]); | |
3557 RECON_AND_STORE(dest, in[21]); | |
3558 RECON_AND_STORE(dest, in[22]); | |
3559 RECON_AND_STORE(dest, in[23]); | |
3560 RECON_AND_STORE(dest, in[24]); | |
3561 RECON_AND_STORE(dest, in[25]); | |
3562 RECON_AND_STORE(dest, in[26]); | |
3563 RECON_AND_STORE(dest, in[27]); | |
3564 RECON_AND_STORE(dest, in[28]); | |
3565 RECON_AND_STORE(dest, in[29]); | |
3566 RECON_AND_STORE(dest, in[30]); | |
3567 RECON_AND_STORE(dest, in[31]); | |
3568 | |
3569 dest += 8 - (stride * 32); | |
3570 } | |
3571 } | 3388 } |
| 3389 } |
3572 | 3390 |
3573 void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, | 3391 void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, |
3574 int stride) { | 3392 int stride) { |
3575 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 3393 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
3576 const __m128i final_rounding = _mm_set1_epi16(1<<5); | 3394 const __m128i final_rounding = _mm_set1_epi16(1 << 5); |
3577 const __m128i zero = _mm_setzero_si128(); | 3395 const __m128i zero = _mm_setzero_si128(); |
3578 | 3396 |
3579 // idct constants for each stage | 3397 // idct constants for each stage |
3580 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); | 3398 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); |
3581 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); | 3399 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); |
3582 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); | 3400 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); |
3583 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); | 3401 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); |
3584 const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); | 3402 const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); |
3585 const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); | 3403 const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); |
3586 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); | 3404 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); |
(...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3633 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, | 3451 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, |
3634 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, | 3452 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, |
3635 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, | 3453 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, |
3636 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, | 3454 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, |
3637 stp2_30, stp2_31; | 3455 stp2_30, stp2_31; |
3638 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; | 3456 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
3639 int i, j, i32; | 3457 int i, j, i32; |
3640 | 3458 |
3641 for (i = 0; i < 4; i++) { | 3459 for (i = 0; i < 4; i++) { |
3642 i32 = (i << 5); | 3460 i32 = (i << 5); |
3643 // First 1-D idct | 3461 // First 1-D idct |
3644 // Load input data. | 3462 // Load input data. |
3645 LOAD_DQCOEFF(in[0], input); | 3463 LOAD_DQCOEFF(in[0], input); |
3646 LOAD_DQCOEFF(in[8], input); | 3464 LOAD_DQCOEFF(in[8], input); |
3647 LOAD_DQCOEFF(in[16], input); | 3465 LOAD_DQCOEFF(in[16], input); |
3648 LOAD_DQCOEFF(in[24], input); | 3466 LOAD_DQCOEFF(in[24], input); |
3649 LOAD_DQCOEFF(in[1], input); | 3467 LOAD_DQCOEFF(in[1], input); |
3650 LOAD_DQCOEFF(in[9], input); | 3468 LOAD_DQCOEFF(in[9], input); |
3651 LOAD_DQCOEFF(in[17], input); | 3469 LOAD_DQCOEFF(in[17], input); |
3652 LOAD_DQCOEFF(in[25], input); | 3470 LOAD_DQCOEFF(in[25], input); |
3653 LOAD_DQCOEFF(in[2], input); | 3471 LOAD_DQCOEFF(in[2], input); |
3654 LOAD_DQCOEFF(in[10], input); | 3472 LOAD_DQCOEFF(in[10], input); |
3655 LOAD_DQCOEFF(in[18], input); | 3473 LOAD_DQCOEFF(in[18], input); |
3656 LOAD_DQCOEFF(in[26], input); | 3474 LOAD_DQCOEFF(in[26], input); |
3657 LOAD_DQCOEFF(in[3], input); | 3475 LOAD_DQCOEFF(in[3], input); |
3658 LOAD_DQCOEFF(in[11], input); | 3476 LOAD_DQCOEFF(in[11], input); |
3659 LOAD_DQCOEFF(in[19], input); | 3477 LOAD_DQCOEFF(in[19], input); |
3660 LOAD_DQCOEFF(in[27], input); | 3478 LOAD_DQCOEFF(in[27], input); |
3661 | 3479 |
3662 LOAD_DQCOEFF(in[4], input); | 3480 LOAD_DQCOEFF(in[4], input); |
3663 LOAD_DQCOEFF(in[12], input); | 3481 LOAD_DQCOEFF(in[12], input); |
3664 LOAD_DQCOEFF(in[20], input); | 3482 LOAD_DQCOEFF(in[20], input); |
3665 LOAD_DQCOEFF(in[28], input); | 3483 LOAD_DQCOEFF(in[28], input); |
3666 LOAD_DQCOEFF(in[5], input); | 3484 LOAD_DQCOEFF(in[5], input); |
3667 LOAD_DQCOEFF(in[13], input); | 3485 LOAD_DQCOEFF(in[13], input); |
3668 LOAD_DQCOEFF(in[21], input); | 3486 LOAD_DQCOEFF(in[21], input); |
3669 LOAD_DQCOEFF(in[29], input); | 3487 LOAD_DQCOEFF(in[29], input); |
3670 LOAD_DQCOEFF(in[6], input); | 3488 LOAD_DQCOEFF(in[6], input); |
3671 LOAD_DQCOEFF(in[14], input); | 3489 LOAD_DQCOEFF(in[14], input); |
3672 LOAD_DQCOEFF(in[22], input); | 3490 LOAD_DQCOEFF(in[22], input); |
3673 LOAD_DQCOEFF(in[30], input); | 3491 LOAD_DQCOEFF(in[30], input); |
3674 LOAD_DQCOEFF(in[7], input); | 3492 LOAD_DQCOEFF(in[7], input); |
3675 LOAD_DQCOEFF(in[15], input); | 3493 LOAD_DQCOEFF(in[15], input); |
3676 LOAD_DQCOEFF(in[23], input); | 3494 LOAD_DQCOEFF(in[23], input); |
3677 LOAD_DQCOEFF(in[31], input); | 3495 LOAD_DQCOEFF(in[31], input); |
3678 | 3496 |
3679 // checking if all entries are zero | 3497 // checking if all entries are zero |
3680 zero_idx[0] = _mm_or_si128(in[0], in[1]); | 3498 zero_idx[0] = _mm_or_si128(in[0], in[1]); |
3681 zero_idx[1] = _mm_or_si128(in[2], in[3]); | 3499 zero_idx[1] = _mm_or_si128(in[2], in[3]); |
3682 zero_idx[2] = _mm_or_si128(in[4], in[5]); | 3500 zero_idx[2] = _mm_or_si128(in[4], in[5]); |
3683 zero_idx[3] = _mm_or_si128(in[6], in[7]); | 3501 zero_idx[3] = _mm_or_si128(in[6], in[7]); |
3684 zero_idx[4] = _mm_or_si128(in[8], in[9]); | 3502 zero_idx[4] = _mm_or_si128(in[8], in[9]); |
3685 zero_idx[5] = _mm_or_si128(in[10], in[11]); | 3503 zero_idx[5] = _mm_or_si128(in[10], in[11]); |
3686 zero_idx[6] = _mm_or_si128(in[12], in[13]); | 3504 zero_idx[6] = _mm_or_si128(in[12], in[13]); |
3687 zero_idx[7] = _mm_or_si128(in[14], in[15]); | 3505 zero_idx[7] = _mm_or_si128(in[14], in[15]); |
3688 zero_idx[8] = _mm_or_si128(in[16], in[17]); | 3506 zero_idx[8] = _mm_or_si128(in[16], in[17]); |
3689 zero_idx[9] = _mm_or_si128(in[18], in[19]); | 3507 zero_idx[9] = _mm_or_si128(in[18], in[19]); |
3690 zero_idx[10] = _mm_or_si128(in[20], in[21]); | 3508 zero_idx[10] = _mm_or_si128(in[20], in[21]); |
3691 zero_idx[11] = _mm_or_si128(in[22], in[23]); | 3509 zero_idx[11] = _mm_or_si128(in[22], in[23]); |
3692 zero_idx[12] = _mm_or_si128(in[24], in[25]); | 3510 zero_idx[12] = _mm_or_si128(in[24], in[25]); |
3693 zero_idx[13] = _mm_or_si128(in[26], in[27]); | 3511 zero_idx[13] = _mm_or_si128(in[26], in[27]); |
3694 zero_idx[14] = _mm_or_si128(in[28], in[29]); | 3512 zero_idx[14] = _mm_or_si128(in[28], in[29]); |
3695 zero_idx[15] = _mm_or_si128(in[30], in[31]); | 3513 zero_idx[15] = _mm_or_si128(in[30], in[31]); |
3696 | 3514 |
3697 zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]); | 3515 zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]); |
3698 zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]); | 3516 zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]); |
3699 zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]); | 3517 zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]); |
3700 zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]); | 3518 zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]); |
3701 zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]); | 3519 zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]); |
3702 zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]); | 3520 zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]); |
3703 zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]); | 3521 zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]); |
3704 zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]); | 3522 zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]); |
3705 | 3523 |
3706 zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]); | 3524 zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]); |
3707 zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]); | 3525 zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]); |
3708 zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]); | 3526 zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]); |
3709 zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]); | 3527 zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]); |
3710 zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]); | 3528 zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]); |
3711 zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]); | 3529 zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]); |
3712 zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]); | 3530 zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]); |
3713 | 3531 |
3714 if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) { | 3532 if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) { |
3715 col[i32 + 0] = _mm_setzero_si128(); | 3533 col[i32 + 0] = _mm_setzero_si128(); |
3716 col[i32 + 1] = _mm_setzero_si128(); | 3534 col[i32 + 1] = _mm_setzero_si128(); |
3717 col[i32 + 2] = _mm_setzero_si128(); | 3535 col[i32 + 2] = _mm_setzero_si128(); |
3718 col[i32 + 3] = _mm_setzero_si128(); | 3536 col[i32 + 3] = _mm_setzero_si128(); |
3719 col[i32 + 4] = _mm_setzero_si128(); | 3537 col[i32 + 4] = _mm_setzero_si128(); |
3720 col[i32 + 5] = _mm_setzero_si128(); | 3538 col[i32 + 5] = _mm_setzero_si128(); |
3721 col[i32 + 6] = _mm_setzero_si128(); | 3539 col[i32 + 6] = _mm_setzero_si128(); |
3722 col[i32 + 7] = _mm_setzero_si128(); | 3540 col[i32 + 7] = _mm_setzero_si128(); |
3723 col[i32 + 8] = _mm_setzero_si128(); | 3541 col[i32 + 8] = _mm_setzero_si128(); |
3724 col[i32 + 9] = _mm_setzero_si128(); | 3542 col[i32 + 9] = _mm_setzero_si128(); |
3725 col[i32 + 10] = _mm_setzero_si128(); | 3543 col[i32 + 10] = _mm_setzero_si128(); |
3726 col[i32 + 11] = _mm_setzero_si128(); | 3544 col[i32 + 11] = _mm_setzero_si128(); |
3727 col[i32 + 12] = _mm_setzero_si128(); | 3545 col[i32 + 12] = _mm_setzero_si128(); |
3728 col[i32 + 13] = _mm_setzero_si128(); | 3546 col[i32 + 13] = _mm_setzero_si128(); |
3729 col[i32 + 14] = _mm_setzero_si128(); | 3547 col[i32 + 14] = _mm_setzero_si128(); |
3730 col[i32 + 15] = _mm_setzero_si128(); | 3548 col[i32 + 15] = _mm_setzero_si128(); |
3731 col[i32 + 16] = _mm_setzero_si128(); | 3549 col[i32 + 16] = _mm_setzero_si128(); |
3732 col[i32 + 17] = _mm_setzero_si128(); | 3550 col[i32 + 17] = _mm_setzero_si128(); |
3733 col[i32 + 18] = _mm_setzero_si128(); | 3551 col[i32 + 18] = _mm_setzero_si128(); |
3734 col[i32 + 19] = _mm_setzero_si128(); | 3552 col[i32 + 19] = _mm_setzero_si128(); |
3735 col[i32 + 20] = _mm_setzero_si128(); | 3553 col[i32 + 20] = _mm_setzero_si128(); |
3736 col[i32 + 21] = _mm_setzero_si128(); | 3554 col[i32 + 21] = _mm_setzero_si128(); |
3737 col[i32 + 22] = _mm_setzero_si128(); | 3555 col[i32 + 22] = _mm_setzero_si128(); |
3738 col[i32 + 23] = _mm_setzero_si128(); | 3556 col[i32 + 23] = _mm_setzero_si128(); |
3739 col[i32 + 24] = _mm_setzero_si128(); | 3557 col[i32 + 24] = _mm_setzero_si128(); |
3740 col[i32 + 25] = _mm_setzero_si128(); | 3558 col[i32 + 25] = _mm_setzero_si128(); |
3741 col[i32 + 26] = _mm_setzero_si128(); | 3559 col[i32 + 26] = _mm_setzero_si128(); |
3742 col[i32 + 27] = _mm_setzero_si128(); | 3560 col[i32 + 27] = _mm_setzero_si128(); |
3743 col[i32 + 28] = _mm_setzero_si128(); | 3561 col[i32 + 28] = _mm_setzero_si128(); |
3744 col[i32 + 29] = _mm_setzero_si128(); | 3562 col[i32 + 29] = _mm_setzero_si128(); |
3745 col[i32 + 30] = _mm_setzero_si128(); | 3563 col[i32 + 30] = _mm_setzero_si128(); |
3746 col[i32 + 31] = _mm_setzero_si128(); | 3564 col[i32 + 31] = _mm_setzero_si128(); |
3747 continue; | 3565 continue; |
3748 } | |
3749 | |
3750 // Transpose 32x8 block to 8x32 block | |
3751 array_transpose_8x8(in, in); | |
3752 array_transpose_8x8(in+8, in+8); | |
3753 array_transpose_8x8(in+16, in+16); | |
3754 array_transpose_8x8(in+24, in+24); | |
3755 | |
3756 IDCT32 | |
3757 | |
3758 // 1_D: Store 32 intermediate results for each 8x32 block. | |
3759 col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); | |
3760 col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); | |
3761 col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); | |
3762 col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); | |
3763 col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); | |
3764 col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); | |
3765 col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); | |
3766 col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); | |
3767 col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); | |
3768 col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); | |
3769 col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); | |
3770 col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); | |
3771 col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); | |
3772 col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); | |
3773 col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); | |
3774 col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); | |
3775 col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); | |
3776 col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); | |
3777 col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); | |
3778 col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); | |
3779 col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); | |
3780 col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); | |
3781 col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); | |
3782 col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); | |
3783 col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); | |
3784 col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); | |
3785 col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); | |
3786 col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); | |
3787 col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); | |
3788 col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); | |
3789 col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); | |
3790 col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); | |
3791 } | 3566 } |
| 3567 |
| 3568 // Transpose 32x8 block to 8x32 block |
| 3569 array_transpose_8x8(in, in); |
| 3570 array_transpose_8x8(in + 8, in + 8); |
| 3571 array_transpose_8x8(in + 16, in + 16); |
| 3572 array_transpose_8x8(in + 24, in + 24); |
| 3573 |
| 3574 IDCT32 |
| 3575 |
| 3576 // 1_D: Store 32 intermediate results for each 8x32 block. |
| 3577 col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); |
| 3578 col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); |
| 3579 col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); |
| 3580 col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); |
| 3581 col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); |
| 3582 col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); |
| 3583 col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); |
| 3584 col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); |
| 3585 col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); |
| 3586 col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); |
| 3587 col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); |
| 3588 col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); |
| 3589 col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); |
| 3590 col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); |
| 3591 col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); |
| 3592 col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); |
| 3593 col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); |
| 3594 col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); |
| 3595 col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); |
| 3596 col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); |
| 3597 col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); |
| 3598 col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); |
| 3599 col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); |
| 3600 col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); |
| 3601 col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); |
| 3602 col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); |
| 3603 col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); |
| 3604 col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); |
| 3605 col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); |
| 3606 col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); |
| 3607 col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); |
| 3608 col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); |
| 3609 } |
3792 for (i = 0; i < 4; i++) { | 3610 for (i = 0; i < 4; i++) { |
3793 // Second 1-D idct | 3611 // Second 1-D idct |
3794 j = i << 3; | 3612 j = i << 3; |
3795 | 3613 |
3796 // Transpose 32x8 block to 8x32 block | 3614 // Transpose 32x8 block to 8x32 block |
3797 array_transpose_8x8(col+j, in); | 3615 array_transpose_8x8(col + j, in); |
3798 array_transpose_8x8(col+j+32, in+8); | 3616 array_transpose_8x8(col + j + 32, in + 8); |
3799 array_transpose_8x8(col+j+64, in+16); | 3617 array_transpose_8x8(col + j + 64, in + 16); |
3800 array_transpose_8x8(col+j+96, in+24); | 3618 array_transpose_8x8(col + j + 96, in + 24); |
3801 | 3619 |
3802 IDCT32 | 3620 IDCT32 |
3803 | 3621 |
3804 // 2_D: Calculate the results and store them to destination. | 3622 // 2_D: Calculate the results and store them to destination. |
3805 in[0] = _mm_add_epi16(stp1_0, stp1_31); | 3623 in[0] = _mm_add_epi16(stp1_0, stp1_31); |
3806 in[1] = _mm_add_epi16(stp1_1, stp1_30); | 3624 in[1] = _mm_add_epi16(stp1_1, stp1_30); |
3807 in[2] = _mm_add_epi16(stp1_2, stp1_29); | 3625 in[2] = _mm_add_epi16(stp1_2, stp1_29); |
3808 in[3] = _mm_add_epi16(stp1_3, stp1_28); | 3626 in[3] = _mm_add_epi16(stp1_3, stp1_28); |
3809 in[4] = _mm_add_epi16(stp1_4, stp1_27); | 3627 in[4] = _mm_add_epi16(stp1_4, stp1_27); |
3810 in[5] = _mm_add_epi16(stp1_5, stp1_26); | 3628 in[5] = _mm_add_epi16(stp1_5, stp1_26); |
3811 in[6] = _mm_add_epi16(stp1_6, stp1_25); | 3629 in[6] = _mm_add_epi16(stp1_6, stp1_25); |
3812 in[7] = _mm_add_epi16(stp1_7, stp1_24); | 3630 in[7] = _mm_add_epi16(stp1_7, stp1_24); |
3813 in[8] = _mm_add_epi16(stp1_8, stp1_23); | 3631 in[8] = _mm_add_epi16(stp1_8, stp1_23); |
3814 in[9] = _mm_add_epi16(stp1_9, stp1_22); | 3632 in[9] = _mm_add_epi16(stp1_9, stp1_22); |
3815 in[10] = _mm_add_epi16(stp1_10, stp1_21); | 3633 in[10] = _mm_add_epi16(stp1_10, stp1_21); |
3816 in[11] = _mm_add_epi16(stp1_11, stp1_20); | 3634 in[11] = _mm_add_epi16(stp1_11, stp1_20); |
3817 in[12] = _mm_add_epi16(stp1_12, stp1_19); | 3635 in[12] = _mm_add_epi16(stp1_12, stp1_19); |
3818 in[13] = _mm_add_epi16(stp1_13, stp1_18); | 3636 in[13] = _mm_add_epi16(stp1_13, stp1_18); |
3819 in[14] = _mm_add_epi16(stp1_14, stp1_17); | 3637 in[14] = _mm_add_epi16(stp1_14, stp1_17); |
3820 in[15] = _mm_add_epi16(stp1_15, stp1_16); | 3638 in[15] = _mm_add_epi16(stp1_15, stp1_16); |
3821 in[16] = _mm_sub_epi16(stp1_15, stp1_16); | 3639 in[16] = _mm_sub_epi16(stp1_15, stp1_16); |
3822 in[17] = _mm_sub_epi16(stp1_14, stp1_17); | 3640 in[17] = _mm_sub_epi16(stp1_14, stp1_17); |
3823 in[18] = _mm_sub_epi16(stp1_13, stp1_18); | 3641 in[18] = _mm_sub_epi16(stp1_13, stp1_18); |
3824 in[19] = _mm_sub_epi16(stp1_12, stp1_19); | 3642 in[19] = _mm_sub_epi16(stp1_12, stp1_19); |
3825 in[20] = _mm_sub_epi16(stp1_11, stp1_20); | 3643 in[20] = _mm_sub_epi16(stp1_11, stp1_20); |
3826 in[21] = _mm_sub_epi16(stp1_10, stp1_21); | 3644 in[21] = _mm_sub_epi16(stp1_10, stp1_21); |
3827 in[22] = _mm_sub_epi16(stp1_9, stp1_22); | 3645 in[22] = _mm_sub_epi16(stp1_9, stp1_22); |
3828 in[23] = _mm_sub_epi16(stp1_8, stp1_23); | 3646 in[23] = _mm_sub_epi16(stp1_8, stp1_23); |
3829 in[24] = _mm_sub_epi16(stp1_7, stp1_24); | 3647 in[24] = _mm_sub_epi16(stp1_7, stp1_24); |
3830 in[25] = _mm_sub_epi16(stp1_6, stp1_25); | 3648 in[25] = _mm_sub_epi16(stp1_6, stp1_25); |
3831 in[26] = _mm_sub_epi16(stp1_5, stp1_26); | 3649 in[26] = _mm_sub_epi16(stp1_5, stp1_26); |
3832 in[27] = _mm_sub_epi16(stp1_4, stp1_27); | 3650 in[27] = _mm_sub_epi16(stp1_4, stp1_27); |
3833 in[28] = _mm_sub_epi16(stp1_3, stp1_28); | 3651 in[28] = _mm_sub_epi16(stp1_3, stp1_28); |
3834 in[29] = _mm_sub_epi16(stp1_2, stp1_29); | 3652 in[29] = _mm_sub_epi16(stp1_2, stp1_29); |
3835 in[30] = _mm_sub_epi16(stp1_1, stp1_30); | 3653 in[30] = _mm_sub_epi16(stp1_1, stp1_30); |
3836 in[31] = _mm_sub_epi16(stp1_0, stp1_31); | 3654 in[31] = _mm_sub_epi16(stp1_0, stp1_31); |
3837 | 3655 |
| 3656 for (j = 0; j < 32; ++j) { |
3838 // Final rounding and shift | 3657 // Final rounding and shift |
3839 in[0] = _mm_adds_epi16(in[0], final_rounding); | 3658 in[j] = _mm_adds_epi16(in[j], final_rounding); |
3840 in[1] = _mm_adds_epi16(in[1], final_rounding); | 3659 in[j] = _mm_srai_epi16(in[j], 6); |
3841 in[2] = _mm_adds_epi16(in[2], final_rounding); | 3660 RECON_AND_STORE(dest + j * stride, in[j]); |
3842 in[3] = _mm_adds_epi16(in[3], final_rounding); | |
3843 in[4] = _mm_adds_epi16(in[4], final_rounding); | |
3844 in[5] = _mm_adds_epi16(in[5], final_rounding); | |
3845 in[6] = _mm_adds_epi16(in[6], final_rounding); | |
3846 in[7] = _mm_adds_epi16(in[7], final_rounding); | |
3847 in[8] = _mm_adds_epi16(in[8], final_rounding); | |
3848 in[9] = _mm_adds_epi16(in[9], final_rounding); | |
3849 in[10] = _mm_adds_epi16(in[10], final_rounding); | |
3850 in[11] = _mm_adds_epi16(in[11], final_rounding); | |
3851 in[12] = _mm_adds_epi16(in[12], final_rounding); | |
3852 in[13] = _mm_adds_epi16(in[13], final_rounding); | |
3853 in[14] = _mm_adds_epi16(in[14], final_rounding); | |
3854 in[15] = _mm_adds_epi16(in[15], final_rounding); | |
3855 in[16] = _mm_adds_epi16(in[16], final_rounding); | |
3856 in[17] = _mm_adds_epi16(in[17], final_rounding); | |
3857 in[18] = _mm_adds_epi16(in[18], final_rounding); | |
3858 in[19] = _mm_adds_epi16(in[19], final_rounding); | |
3859 in[20] = _mm_adds_epi16(in[20], final_rounding); | |
3860 in[21] = _mm_adds_epi16(in[21], final_rounding); | |
3861 in[22] = _mm_adds_epi16(in[22], final_rounding); | |
3862 in[23] = _mm_adds_epi16(in[23], final_rounding); | |
3863 in[24] = _mm_adds_epi16(in[24], final_rounding); | |
3864 in[25] = _mm_adds_epi16(in[25], final_rounding); | |
3865 in[26] = _mm_adds_epi16(in[26], final_rounding); | |
3866 in[27] = _mm_adds_epi16(in[27], final_rounding); | |
3867 in[28] = _mm_adds_epi16(in[28], final_rounding); | |
3868 in[29] = _mm_adds_epi16(in[29], final_rounding); | |
3869 in[30] = _mm_adds_epi16(in[30], final_rounding); | |
3870 in[31] = _mm_adds_epi16(in[31], final_rounding); | |
3871 | |
3872 in[0] = _mm_srai_epi16(in[0], 6); | |
3873 in[1] = _mm_srai_epi16(in[1], 6); | |
3874 in[2] = _mm_srai_epi16(in[2], 6); | |
3875 in[3] = _mm_srai_epi16(in[3], 6); | |
3876 in[4] = _mm_srai_epi16(in[4], 6); | |
3877 in[5] = _mm_srai_epi16(in[5], 6); | |
3878 in[6] = _mm_srai_epi16(in[6], 6); | |
3879 in[7] = _mm_srai_epi16(in[7], 6); | |
3880 in[8] = _mm_srai_epi16(in[8], 6); | |
3881 in[9] = _mm_srai_epi16(in[9], 6); | |
3882 in[10] = _mm_srai_epi16(in[10], 6); | |
3883 in[11] = _mm_srai_epi16(in[11], 6); | |
3884 in[12] = _mm_srai_epi16(in[12], 6); | |
3885 in[13] = _mm_srai_epi16(in[13], 6); | |
3886 in[14] = _mm_srai_epi16(in[14], 6); | |
3887 in[15] = _mm_srai_epi16(in[15], 6); | |
3888 in[16] = _mm_srai_epi16(in[16], 6); | |
3889 in[17] = _mm_srai_epi16(in[17], 6); | |
3890 in[18] = _mm_srai_epi16(in[18], 6); | |
3891 in[19] = _mm_srai_epi16(in[19], 6); | |
3892 in[20] = _mm_srai_epi16(in[20], 6); | |
3893 in[21] = _mm_srai_epi16(in[21], 6); | |
3894 in[22] = _mm_srai_epi16(in[22], 6); | |
3895 in[23] = _mm_srai_epi16(in[23], 6); | |
3896 in[24] = _mm_srai_epi16(in[24], 6); | |
3897 in[25] = _mm_srai_epi16(in[25], 6); | |
3898 in[26] = _mm_srai_epi16(in[26], 6); | |
3899 in[27] = _mm_srai_epi16(in[27], 6); | |
3900 in[28] = _mm_srai_epi16(in[28], 6); | |
3901 in[29] = _mm_srai_epi16(in[29], 6); | |
3902 in[30] = _mm_srai_epi16(in[30], 6); | |
3903 in[31] = _mm_srai_epi16(in[31], 6); | |
3904 | |
3905 RECON_AND_STORE(dest, in[0]); | |
3906 RECON_AND_STORE(dest, in[1]); | |
3907 RECON_AND_STORE(dest, in[2]); | |
3908 RECON_AND_STORE(dest, in[3]); | |
3909 RECON_AND_STORE(dest, in[4]); | |
3910 RECON_AND_STORE(dest, in[5]); | |
3911 RECON_AND_STORE(dest, in[6]); | |
3912 RECON_AND_STORE(dest, in[7]); | |
3913 RECON_AND_STORE(dest, in[8]); | |
3914 RECON_AND_STORE(dest, in[9]); | |
3915 RECON_AND_STORE(dest, in[10]); | |
3916 RECON_AND_STORE(dest, in[11]); | |
3917 RECON_AND_STORE(dest, in[12]); | |
3918 RECON_AND_STORE(dest, in[13]); | |
3919 RECON_AND_STORE(dest, in[14]); | |
3920 RECON_AND_STORE(dest, in[15]); | |
3921 RECON_AND_STORE(dest, in[16]); | |
3922 RECON_AND_STORE(dest, in[17]); | |
3923 RECON_AND_STORE(dest, in[18]); | |
3924 RECON_AND_STORE(dest, in[19]); | |
3925 RECON_AND_STORE(dest, in[20]); | |
3926 RECON_AND_STORE(dest, in[21]); | |
3927 RECON_AND_STORE(dest, in[22]); | |
3928 RECON_AND_STORE(dest, in[23]); | |
3929 RECON_AND_STORE(dest, in[24]); | |
3930 RECON_AND_STORE(dest, in[25]); | |
3931 RECON_AND_STORE(dest, in[26]); | |
3932 RECON_AND_STORE(dest, in[27]); | |
3933 RECON_AND_STORE(dest, in[28]); | |
3934 RECON_AND_STORE(dest, in[29]); | |
3935 RECON_AND_STORE(dest, in[30]); | |
3936 RECON_AND_STORE(dest, in[31]); | |
3937 | |
3938 dest += 8 - (stride * 32); | |
3939 } | 3661 } |
3940 } //NOLINT | 3662 |
| 3663 dest += 8; |
| 3664 } |
| 3665 } |
3941 | 3666 |
3942 void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { | 3667 void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
3943 __m128i dc_value; | 3668 __m128i dc_value; |
3944 const __m128i zero = _mm_setzero_si128(); | 3669 const __m128i zero = _mm_setzero_si128(); |
3945 int a, i; | 3670 int a, i; |
3946 | 3671 |
3947 a = dct_const_round_shift(input[0] * cospi_16_64); | 3672 a = dct_const_round_shift(input[0] * cospi_16_64); |
3948 a = dct_const_round_shift(a * cospi_16_64); | 3673 a = dct_const_round_shift(a * cospi_16_64); |
3949 a = ROUND_POWER_OF_TWO(a, 6); | 3674 a = ROUND_POWER_OF_TWO(a, 6); |
3950 | 3675 |
3951 dc_value = _mm_set1_epi16(a); | 3676 dc_value = _mm_set1_epi16(a); |
3952 | 3677 |
3953 for (i = 0; i < 4; ++i) { | 3678 for (i = 0; i < 4; ++i) { |
3954 RECON_AND_STORE(dest, dc_value); | 3679 int j; |
3955 RECON_AND_STORE(dest, dc_value); | 3680 for (j = 0; j < 32; ++j) { |
3956 RECON_AND_STORE(dest, dc_value); | 3681 RECON_AND_STORE(dest + j * stride, dc_value); |
3957 RECON_AND_STORE(dest, dc_value); | 3682 } |
3958 RECON_AND_STORE(dest, dc_value); | 3683 dest += 8; |
3959 RECON_AND_STORE(dest, dc_value); | |
3960 RECON_AND_STORE(dest, dc_value); | |
3961 RECON_AND_STORE(dest, dc_value); | |
3962 RECON_AND_STORE(dest, dc_value); | |
3963 RECON_AND_STORE(dest, dc_value); | |
3964 RECON_AND_STORE(dest, dc_value); | |
3965 RECON_AND_STORE(dest, dc_value); | |
3966 RECON_AND_STORE(dest, dc_value); | |
3967 RECON_AND_STORE(dest, dc_value); | |
3968 RECON_AND_STORE(dest, dc_value); | |
3969 RECON_AND_STORE(dest, dc_value); | |
3970 RECON_AND_STORE(dest, dc_value); | |
3971 RECON_AND_STORE(dest, dc_value); | |
3972 RECON_AND_STORE(dest, dc_value); | |
3973 RECON_AND_STORE(dest, dc_value); | |
3974 RECON_AND_STORE(dest, dc_value); | |
3975 RECON_AND_STORE(dest, dc_value); | |
3976 RECON_AND_STORE(dest, dc_value); | |
3977 RECON_AND_STORE(dest, dc_value); | |
3978 RECON_AND_STORE(dest, dc_value); | |
3979 RECON_AND_STORE(dest, dc_value); | |
3980 RECON_AND_STORE(dest, dc_value); | |
3981 RECON_AND_STORE(dest, dc_value); | |
3982 RECON_AND_STORE(dest, dc_value); | |
3983 RECON_AND_STORE(dest, dc_value); | |
3984 RECON_AND_STORE(dest, dc_value); | |
3985 RECON_AND_STORE(dest, dc_value); | |
3986 dest += 8 - (stride * 32); | |
3987 } | 3684 } |
3988 } | 3685 } |
3989 | 3686 |
3990 #if CONFIG_VP9_HIGHBITDEPTH | 3687 #if CONFIG_VP9_HIGHBITDEPTH |
3991 static INLINE __m128i clamp_high_sse2(__m128i value, int bd) { | 3688 static INLINE __m128i clamp_high_sse2(__m128i value, int bd) { |
3992 __m128i ubounded, retval; | 3689 __m128i ubounded, retval; |
3993 const __m128i zero = _mm_set1_epi16(0); | 3690 const __m128i zero = _mm_set1_epi16(0); |
3994 const __m128i one = _mm_set1_epi16(1); | 3691 const __m128i one = _mm_set1_epi16(1); |
3995 const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one); | 3692 const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one); |
3996 ubounded = _mm_cmpgt_epi16(value, max); | 3693 ubounded = _mm_cmpgt_epi16(value, max); |
3997 retval = _mm_andnot_si128(ubounded, value); | 3694 retval = _mm_andnot_si128(ubounded, value); |
3998 ubounded = _mm_and_si128(ubounded, max); | 3695 ubounded = _mm_and_si128(ubounded, max); |
3999 retval = _mm_or_si128(retval, ubounded); | 3696 retval = _mm_or_si128(retval, ubounded); |
4000 retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero)); | 3697 retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero)); |
4001 return retval; | 3698 return retval; |
4002 } | 3699 } |
4003 | 3700 |
4004 void vp9_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8, | 3701 void vp9_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8, |
4005 int stride, int bd) { | 3702 int stride, int bd) { |
4006 tran_low_t out[4 * 4]; | 3703 tran_low_t out[4 * 4]; |
4007 tran_low_t *outptr = out; | 3704 tran_low_t *outptr = out; |
4008 int i, j; | 3705 int i, j; |
4009 __m128i inptr[4]; | 3706 __m128i inptr[4]; |
4010 __m128i sign_bits[2]; | 3707 __m128i sign_bits[2]; |
4011 __m128i temp_mm, min_input, max_input; | 3708 __m128i temp_mm, min_input, max_input; |
4012 int test; | 3709 int test; |
4013 uint16_t * dest = CONVERT_TO_SHORTPTR(dest8); | 3710 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
4014 int optimised_cols = 0; | 3711 int optimised_cols = 0; |
4015 const __m128i zero = _mm_set1_epi16(0); | 3712 const __m128i zero = _mm_set1_epi16(0); |
4016 const __m128i eight = _mm_set1_epi16(8); | 3713 const __m128i eight = _mm_set1_epi16(8); |
4017 const __m128i max = _mm_set1_epi16(12043); | 3714 const __m128i max = _mm_set1_epi16(12043); |
4018 const __m128i min = _mm_set1_epi16(-12043); | 3715 const __m128i min = _mm_set1_epi16(-12043); |
4019 // Load input into __m128i | 3716 // Load input into __m128i |
4020 inptr[0] = _mm_loadu_si128((const __m128i *)input); | 3717 inptr[0] = _mm_loadu_si128((const __m128i *)input); |
4021 inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4)); | 3718 inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4)); |
4022 inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8)); | 3719 inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8)); |
4023 inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12)); | 3720 inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12)); |
(...skipping 22 matching lines...) Expand all Loading... |
4046 test = _mm_movemask_epi8(temp_mm); | 3743 test = _mm_movemask_epi8(temp_mm); |
4047 | 3744 |
4048 if (test) { | 3745 if (test) { |
4049 transpose_4x4(inptr); | 3746 transpose_4x4(inptr); |
4050 sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero); | 3747 sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero); |
4051 sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero); | 3748 sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero); |
4052 inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]); | 3749 inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]); |
4053 inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]); | 3750 inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]); |
4054 inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]); | 3751 inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]); |
4055 inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]); | 3752 inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]); |
4056 _mm_storeu_si128((__m128i*)outptr, inptr[0]); | 3753 _mm_storeu_si128((__m128i *)outptr, inptr[0]); |
4057 _mm_storeu_si128((__m128i*)(outptr + 4), inptr[1]); | 3754 _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]); |
4058 _mm_storeu_si128((__m128i*)(outptr + 8), inptr[2]); | 3755 _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]); |
4059 _mm_storeu_si128((__m128i*)(outptr + 12), inptr[3]); | 3756 _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]); |
4060 } else { | 3757 } else { |
4061 // Set to use the optimised transform for the column | 3758 // Set to use the optimised transform for the column |
4062 optimised_cols = 1; | 3759 optimised_cols = 1; |
4063 } | 3760 } |
4064 } else { | 3761 } else { |
4065 // Run the un-optimised row transform | 3762 // Run the un-optimised row transform |
4066 for (i = 0; i < 4; ++i) { | 3763 for (i = 0; i < 4; ++i) { |
4067 vp9_highbd_idct4(input, outptr, bd); | 3764 vp9_highbd_idct4(input, outptr, bd); |
4068 input += 4; | 3765 input += 4; |
4069 outptr += 4; | 3766 outptr += 4; |
4070 } | 3767 } |
4071 } | 3768 } |
4072 | 3769 |
4073 if (optimised_cols) { | 3770 if (optimised_cols) { |
4074 idct4_sse2(inptr); | 3771 idct4_sse2(inptr); |
4075 | 3772 |
4076 // Final round and shift | 3773 // Final round and shift |
4077 inptr[0] = _mm_add_epi16(inptr[0], eight); | 3774 inptr[0] = _mm_add_epi16(inptr[0], eight); |
4078 inptr[1] = _mm_add_epi16(inptr[1], eight); | 3775 inptr[1] = _mm_add_epi16(inptr[1], eight); |
4079 | 3776 |
4080 inptr[0] = _mm_srai_epi16(inptr[0], 4); | 3777 inptr[0] = _mm_srai_epi16(inptr[0], 4); |
4081 inptr[1] = _mm_srai_epi16(inptr[1], 4); | 3778 inptr[1] = _mm_srai_epi16(inptr[1], 4); |
4082 | 3779 |
4083 // Reconstruction and Store | 3780 // Reconstruction and Store |
4084 { | 3781 { |
4085 __m128i d0 = _mm_loadl_epi64((const __m128i *)dest); | 3782 __m128i d0 = _mm_loadl_epi64((const __m128i *)dest); |
4086 __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2)); | 3783 __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2)); |
4087 d0 = _mm_unpacklo_epi64(d0, | 3784 d0 = _mm_unpacklo_epi64( |
4088 _mm_loadl_epi64((const __m128i *)(dest + stride))); | 3785 d0, _mm_loadl_epi64((const __m128i *)(dest + stride))); |
4089 d2 = _mm_unpacklo_epi64(d2, | 3786 d2 = _mm_unpacklo_epi64( |
4090 _mm_loadl_epi64((const __m128i *)(dest + stride * 3))); | 3787 d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3))); |
4091 d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd); | 3788 d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd); |
4092 d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd); | 3789 d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd); |
4093 // store input0 | 3790 // store input0 |
4094 _mm_storel_epi64((__m128i *)dest, d0); | 3791 _mm_storel_epi64((__m128i *)dest, d0); |
4095 // store input1 | 3792 // store input1 |
4096 d0 = _mm_srli_si128(d0, 8); | 3793 d0 = _mm_srli_si128(d0, 8); |
4097 _mm_storel_epi64((__m128i *)(dest + stride), d0); | 3794 _mm_storel_epi64((__m128i *)(dest + stride), d0); |
4098 // store input2 | 3795 // store input2 |
4099 _mm_storel_epi64((__m128i *)(dest + stride * 2), d2); | 3796 _mm_storel_epi64((__m128i *)(dest + stride * 2), d2); |
4100 // store input3 | 3797 // store input3 |
(...skipping 10 matching lines...) Expand all Loading... |
4111 vp9_highbd_idct4(temp_in, temp_out, bd); | 3808 vp9_highbd_idct4(temp_in, temp_out, bd); |
4112 for (j = 0; j < 4; ++j) { | 3809 for (j = 0; j < 4; ++j) { |
4113 dest[j * stride + i] = highbd_clip_pixel_add( | 3810 dest[j * stride + i] = highbd_clip_pixel_add( |
4114 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); | 3811 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); |
4115 } | 3812 } |
4116 } | 3813 } |
4117 } | 3814 } |
4118 } | 3815 } |
4119 | 3816 |
4120 void vp9_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8, | 3817 void vp9_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8, |
4121 int stride, int bd) { | 3818 int stride, int bd) { |
4122 tran_low_t out[8 * 8]; | 3819 tran_low_t out[8 * 8]; |
4123 tran_low_t *outptr = out; | 3820 tran_low_t *outptr = out; |
4124 int i, j, test; | 3821 int i, j, test; |
4125 __m128i inptr[8]; | 3822 __m128i inptr[8]; |
4126 __m128i min_input, max_input, temp1, temp2, sign_bits; | 3823 __m128i min_input, max_input, temp1, temp2, sign_bits; |
4127 uint16_t * dest = CONVERT_TO_SHORTPTR(dest8); | 3824 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
4128 const __m128i zero = _mm_set1_epi16(0); | 3825 const __m128i zero = _mm_set1_epi16(0); |
4129 const __m128i sixteen = _mm_set1_epi16(16); | 3826 const __m128i sixteen = _mm_set1_epi16(16); |
4130 const __m128i max = _mm_set1_epi16(6201); | 3827 const __m128i max = _mm_set1_epi16(6201); |
4131 const __m128i min = _mm_set1_epi16(-6201); | 3828 const __m128i min = _mm_set1_epi16(-6201); |
4132 int optimised_cols = 0; | 3829 int optimised_cols = 0; |
4133 | 3830 |
4134 // Load input into __m128i & pack to 16 bits | 3831 // Load input into __m128i & pack to 16 bits |
4135 for (i = 0; i < 8; i++) { | 3832 for (i = 0; i < 8; i++) { |
4136 temp1 = _mm_loadu_si128((const __m128i *)(input + 8*i)); | 3833 temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); |
4137 temp2 = _mm_loadu_si128((const __m128i *)(input + 8*i + 4)); | 3834 temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); |
4138 inptr[i] = _mm_packs_epi32(temp1, temp2); | 3835 inptr[i] = _mm_packs_epi32(temp1, temp2); |
4139 } | 3836 } |
4140 | 3837 |
4141 // Find the min & max for the row transform | 3838 // Find the min & max for the row transform |
4142 max_input = _mm_max_epi16(inptr[0], inptr[1]); | 3839 max_input = _mm_max_epi16(inptr[0], inptr[1]); |
4143 min_input = _mm_min_epi16(inptr[0], inptr[1]); | 3840 min_input = _mm_min_epi16(inptr[0], inptr[1]); |
4144 for (i = 2; i < 8; i++) { | 3841 for (i = 2; i < 8; i++) { |
4145 max_input = _mm_max_epi16(max_input, inptr[i]); | 3842 max_input = _mm_max_epi16(max_input, inptr[i]); |
4146 min_input = _mm_min_epi16(min_input, inptr[i]); | 3843 min_input = _mm_min_epi16(min_input, inptr[i]); |
4147 } | 3844 } |
(...skipping 17 matching lines...) Expand all Loading... |
4165 min_input = _mm_cmplt_epi16(min_input, min); | 3862 min_input = _mm_cmplt_epi16(min_input, min); |
4166 temp1 = _mm_or_si128(max_input, min_input); | 3863 temp1 = _mm_or_si128(max_input, min_input); |
4167 test = _mm_movemask_epi8(temp1); | 3864 test = _mm_movemask_epi8(temp1); |
4168 | 3865 |
4169 if (test) { | 3866 if (test) { |
4170 array_transpose_8x8(inptr, inptr); | 3867 array_transpose_8x8(inptr, inptr); |
4171 for (i = 0; i < 8; i++) { | 3868 for (i = 0; i < 8; i++) { |
4172 sign_bits = _mm_cmplt_epi16(inptr[i], zero); | 3869 sign_bits = _mm_cmplt_epi16(inptr[i], zero); |
4173 temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); | 3870 temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); |
4174 temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); | 3871 temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); |
4175 _mm_storeu_si128((__m128i*)(outptr + 4*(2*i+1)), temp1); | 3872 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); |
4176 _mm_storeu_si128((__m128i*)(outptr + 4*(2*i)), temp2); | 3873 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); |
4177 } | 3874 } |
4178 } else { | 3875 } else { |
4179 // Set to use the optimised transform for the column | 3876 // Set to use the optimised transform for the column |
4180 optimised_cols = 1; | 3877 optimised_cols = 1; |
4181 } | 3878 } |
4182 } else { | 3879 } else { |
4183 // Run the un-optimised row transform | 3880 // Run the un-optimised row transform |
4184 for (i = 0; i < 8; ++i) { | 3881 for (i = 0; i < 8; ++i) { |
4185 vp9_highbd_idct8(input, outptr, bd); | 3882 vp9_highbd_idct8(input, outptr, bd); |
4186 input += 8; | 3883 input += 8; |
(...skipping 25 matching lines...) Expand all Loading... |
4212 vp9_highbd_idct8(temp_in, temp_out, bd); | 3909 vp9_highbd_idct8(temp_in, temp_out, bd); |
4213 for (j = 0; j < 8; ++j) { | 3910 for (j = 0; j < 8; ++j) { |
4214 dest[j * stride + i] = highbd_clip_pixel_add( | 3911 dest[j * stride + i] = highbd_clip_pixel_add( |
4215 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); | 3912 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); |
4216 } | 3913 } |
4217 } | 3914 } |
4218 } | 3915 } |
4219 } | 3916 } |
4220 | 3917 |
4221 void vp9_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8, | 3918 void vp9_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8, |
4222 int stride, int bd) { | 3919 int stride, int bd) { |
4223 tran_low_t out[8 * 8] = { 0 }; | 3920 tran_low_t out[8 * 8] = { 0 }; |
4224 tran_low_t *outptr = out; | 3921 tran_low_t *outptr = out; |
4225 int i, j, test; | 3922 int i, j, test; |
4226 __m128i inptr[8]; | 3923 __m128i inptr[8]; |
4227 __m128i min_input, max_input, temp1, temp2, sign_bits; | 3924 __m128i min_input, max_input, temp1, temp2, sign_bits; |
4228 uint16_t * dest = CONVERT_TO_SHORTPTR(dest8); | 3925 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
4229 const __m128i zero = _mm_set1_epi16(0); | 3926 const __m128i zero = _mm_set1_epi16(0); |
4230 const __m128i sixteen = _mm_set1_epi16(16); | 3927 const __m128i sixteen = _mm_set1_epi16(16); |
4231 const __m128i max = _mm_set1_epi16(6201); | 3928 const __m128i max = _mm_set1_epi16(6201); |
4232 const __m128i min = _mm_set1_epi16(-6201); | 3929 const __m128i min = _mm_set1_epi16(-6201); |
4233 int optimised_cols = 0; | 3930 int optimised_cols = 0; |
4234 | 3931 |
4235 // Load input into __m128i & pack to 16 bits | 3932 // Load input into __m128i & pack to 16 bits |
4236 for (i = 0; i < 8; i++) { | 3933 for (i = 0; i < 8; i++) { |
4237 temp1 = _mm_loadu_si128((const __m128i *)(input + 8*i)); | 3934 temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); |
4238 temp2 = _mm_loadu_si128((const __m128i *)(input + 8*i + 4)); | 3935 temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); |
4239 inptr[i] = _mm_packs_epi32(temp1, temp2); | 3936 inptr[i] = _mm_packs_epi32(temp1, temp2); |
4240 } | 3937 } |
4241 | 3938 |
4242 // Find the min & max for the row transform | 3939 // Find the min & max for the row transform |
4243 // only first 4 row has non-zero coefs | 3940 // only first 4 row has non-zero coefs |
4244 max_input = _mm_max_epi16(inptr[0], inptr[1]); | 3941 max_input = _mm_max_epi16(inptr[0], inptr[1]); |
4245 min_input = _mm_min_epi16(inptr[0], inptr[1]); | 3942 min_input = _mm_min_epi16(inptr[0], inptr[1]); |
4246 for (i = 2; i < 4; i++) { | 3943 for (i = 2; i < 4; i++) { |
4247 max_input = _mm_max_epi16(max_input, inptr[i]); | 3944 max_input = _mm_max_epi16(max_input, inptr[i]); |
4248 min_input = _mm_min_epi16(min_input, inptr[i]); | 3945 min_input = _mm_min_epi16(min_input, inptr[i]); |
(...skipping 20 matching lines...) Expand all Loading... |
4269 temp1 = _mm_or_si128(max_input, min_input); | 3966 temp1 = _mm_or_si128(max_input, min_input); |
4270 test = _mm_movemask_epi8(temp1); | 3967 test = _mm_movemask_epi8(temp1); |
4271 | 3968 |
4272 if (test) { | 3969 if (test) { |
4273 // Use fact only first 4 rows contain non-zero coeffs | 3970 // Use fact only first 4 rows contain non-zero coeffs |
4274 array_transpose_4X8(inptr, inptr); | 3971 array_transpose_4X8(inptr, inptr); |
4275 for (i = 0; i < 4; i++) { | 3972 for (i = 0; i < 4; i++) { |
4276 sign_bits = _mm_cmplt_epi16(inptr[i], zero); | 3973 sign_bits = _mm_cmplt_epi16(inptr[i], zero); |
4277 temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); | 3974 temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); |
4278 temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); | 3975 temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); |
4279 _mm_storeu_si128((__m128i*)(outptr + 4*(2*i+1)), temp1); | 3976 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); |
4280 _mm_storeu_si128((__m128i*)(outptr + 4*(2*i)), temp2); | 3977 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); |
4281 } | 3978 } |
4282 } else { | 3979 } else { |
4283 // Set to use the optimised transform for the column | 3980 // Set to use the optimised transform for the column |
4284 optimised_cols = 1; | 3981 optimised_cols = 1; |
4285 } | 3982 } |
4286 } else { | 3983 } else { |
4287 // Run the un-optimised row transform | 3984 // Run the un-optimised row transform |
4288 for (i = 0; i < 4; ++i) { | 3985 for (i = 0; i < 4; ++i) { |
4289 vp9_highbd_idct8(input, outptr, bd); | 3986 vp9_highbd_idct8(input, outptr, bd); |
4290 input += 8; | 3987 input += 8; |
(...skipping 25 matching lines...) Expand all Loading... |
4316 vp9_highbd_idct8(temp_in, temp_out, bd); | 4013 vp9_highbd_idct8(temp_in, temp_out, bd); |
4317 for (j = 0; j < 8; ++j) { | 4014 for (j = 0; j < 8; ++j) { |
4318 dest[j * stride + i] = highbd_clip_pixel_add( | 4015 dest[j * stride + i] = highbd_clip_pixel_add( |
4319 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); | 4016 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); |
4320 } | 4017 } |
4321 } | 4018 } |
4322 } | 4019 } |
4323 } | 4020 } |
4324 | 4021 |
4325 void vp9_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8, | 4022 void vp9_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8, |
4326 int stride, int bd) { | 4023 int stride, int bd) { |
4327 tran_low_t out[16 * 16]; | 4024 tran_low_t out[16 * 16]; |
4328 tran_low_t *outptr = out; | 4025 tran_low_t *outptr = out; |
4329 int i, j, test; | 4026 int i, j, test; |
4330 __m128i inptr[32]; | 4027 __m128i inptr[32]; |
4331 __m128i min_input, max_input, temp1, temp2, sign_bits; | 4028 __m128i min_input, max_input, temp1, temp2, sign_bits; |
4332 uint16_t * dest = CONVERT_TO_SHORTPTR(dest8); | 4029 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
4333 const __m128i zero = _mm_set1_epi16(0); | 4030 const __m128i zero = _mm_set1_epi16(0); |
4334 const __m128i rounding = _mm_set1_epi16(32); | 4031 const __m128i rounding = _mm_set1_epi16(32); |
4335 const __m128i max = _mm_set1_epi16(3155); | 4032 const __m128i max = _mm_set1_epi16(3155); |
4336 const __m128i min = _mm_set1_epi16(-3155); | 4033 const __m128i min = _mm_set1_epi16(-3155); |
4337 int optimised_cols = 0; | 4034 int optimised_cols = 0; |
4338 | 4035 |
4339 // Load input into __m128i & pack to 16 bits | 4036 // Load input into __m128i & pack to 16 bits |
4340 for (i = 0; i < 16; i++) { | 4037 for (i = 0; i < 16; i++) { |
4341 temp1 = _mm_loadu_si128((const __m128i *)(input + 16*i)); | 4038 temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); |
4342 temp2 = _mm_loadu_si128((const __m128i *)(input + 16*i + 4)); | 4039 temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); |
4343 inptr[i] = _mm_packs_epi32(temp1, temp2); | 4040 inptr[i] = _mm_packs_epi32(temp1, temp2); |
4344 temp1 = _mm_loadu_si128((const __m128i *)(input + 16*i + 8)); | 4041 temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); |
4345 temp2 = _mm_loadu_si128((const __m128i *)(input + 16*i + 12)); | 4042 temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); |
4346 inptr[i + 16] = _mm_packs_epi32(temp1, temp2); | 4043 inptr[i + 16] = _mm_packs_epi32(temp1, temp2); |
4347 } | 4044 } |
4348 | 4045 |
4349 // Find the min & max for the row transform | 4046 // Find the min & max for the row transform |
4350 max_input = _mm_max_epi16(inptr[0], inptr[1]); | 4047 max_input = _mm_max_epi16(inptr[0], inptr[1]); |
4351 min_input = _mm_min_epi16(inptr[0], inptr[1]); | 4048 min_input = _mm_min_epi16(inptr[0], inptr[1]); |
4352 for (i = 2; i < 32; i++) { | 4049 for (i = 2; i < 32; i++) { |
4353 max_input = _mm_max_epi16(max_input, inptr[i]); | 4050 max_input = _mm_max_epi16(max_input, inptr[i]); |
4354 min_input = _mm_min_epi16(min_input, inptr[i]); | 4051 min_input = _mm_min_epi16(min_input, inptr[i]); |
4355 } | 4052 } |
(...skipping 15 matching lines...) Expand all Loading... |
4371 } | 4068 } |
4372 max_input = _mm_cmpgt_epi16(max_input, max); | 4069 max_input = _mm_cmpgt_epi16(max_input, max); |
4373 min_input = _mm_cmplt_epi16(min_input, min); | 4070 min_input = _mm_cmplt_epi16(min_input, min); |
4374 temp1 = _mm_or_si128(max_input, min_input); | 4071 temp1 = _mm_or_si128(max_input, min_input); |
4375 test = _mm_movemask_epi8(temp1); | 4072 test = _mm_movemask_epi8(temp1); |
4376 | 4073 |
4377 if (test) { | 4074 if (test) { |
4378 array_transpose_16x16(inptr, inptr + 16); | 4075 array_transpose_16x16(inptr, inptr + 16); |
4379 for (i = 0; i < 16; i++) { | 4076 for (i = 0; i < 16; i++) { |
4380 sign_bits = _mm_cmplt_epi16(inptr[i], zero); | 4077 sign_bits = _mm_cmplt_epi16(inptr[i], zero); |
4381 temp1 = _mm_unpacklo_epi16(inptr[i ], sign_bits); | 4078 temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); |
4382 temp2 = _mm_unpackhi_epi16(inptr[i ], sign_bits); | 4079 temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); |
4383 _mm_storeu_si128((__m128i*)(outptr + 4*(i*4)), temp1); | 4080 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); |
4384 _mm_storeu_si128((__m128i*)(outptr + 4*(i*4+1)), temp2); | 4081 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); |
4385 sign_bits = _mm_cmplt_epi16(inptr[i+16], zero); | 4082 sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); |
4386 temp1 = _mm_unpacklo_epi16(inptr[i+16], sign_bits); | 4083 temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); |
4387 temp2 = _mm_unpackhi_epi16(inptr[i+16], sign_bits); | 4084 temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); |
4388 _mm_storeu_si128((__m128i*)(outptr + 4*(i*4+2)), temp1); | 4085 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); |
4389 _mm_storeu_si128((__m128i*)(outptr + 4*(i*4+3)), temp2); | 4086 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); |
4390 } | 4087 } |
4391 } else { | 4088 } else { |
4392 // Set to use the optimised transform for the column | 4089 // Set to use the optimised transform for the column |
4393 optimised_cols = 1; | 4090 optimised_cols = 1; |
4394 } | 4091 } |
4395 } else { | 4092 } else { |
4396 // Run the un-optimised row transform | 4093 // Run the un-optimised row transform |
4397 for (i = 0; i < 16; ++i) { | 4094 for (i = 0; i < 16; ++i) { |
4398 vp9_highbd_idct16(input, outptr, bd); | 4095 vp9_highbd_idct16(input, outptr, bd); |
4399 input += 16; | 4096 input += 16; |
(...skipping 30 matching lines...) Expand all Loading... |
4430 vp9_highbd_idct16(temp_in, temp_out, bd); | 4127 vp9_highbd_idct16(temp_in, temp_out, bd); |
4431 for (j = 0; j < 16; ++j) { | 4128 for (j = 0; j < 16; ++j) { |
4432 dest[j * stride + i] = highbd_clip_pixel_add( | 4129 dest[j * stride + i] = highbd_clip_pixel_add( |
4433 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); | 4130 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); |
4434 } | 4131 } |
4435 } | 4132 } |
4436 } | 4133 } |
4437 } | 4134 } |
4438 | 4135 |
4439 void vp9_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8, | 4136 void vp9_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8, |
4440 int stride, int bd) { | 4137 int stride, int bd) { |
4441 tran_low_t out[16 * 16] = { 0 }; | 4138 tran_low_t out[16 * 16] = { 0 }; |
4442 tran_low_t *outptr = out; | 4139 tran_low_t *outptr = out; |
4443 int i, j, test; | 4140 int i, j, test; |
4444 __m128i inptr[32]; | 4141 __m128i inptr[32]; |
4445 __m128i min_input, max_input, temp1, temp2, sign_bits; | 4142 __m128i min_input, max_input, temp1, temp2, sign_bits; |
4446 uint16_t * dest = CONVERT_TO_SHORTPTR(dest8); | 4143 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
4447 const __m128i zero = _mm_set1_epi16(0); | 4144 const __m128i zero = _mm_set1_epi16(0); |
4448 const __m128i rounding = _mm_set1_epi16(32); | 4145 const __m128i rounding = _mm_set1_epi16(32); |
4449 const __m128i max = _mm_set1_epi16(3155); | 4146 const __m128i max = _mm_set1_epi16(3155); |
4450 const __m128i min = _mm_set1_epi16(-3155); | 4147 const __m128i min = _mm_set1_epi16(-3155); |
4451 int optimised_cols = 0; | 4148 int optimised_cols = 0; |
4452 | 4149 |
4453 // Load input into __m128i & pack to 16 bits | 4150 // Load input into __m128i & pack to 16 bits |
4454 for (i = 0; i < 16; i++) { | 4151 for (i = 0; i < 16; i++) { |
4455 temp1 = _mm_loadu_si128((const __m128i *)(input + 16*i)); | 4152 temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); |
4456 temp2 = _mm_loadu_si128((const __m128i *)(input + 16*i + 4)); | 4153 temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); |
4457 inptr[i] = _mm_packs_epi32(temp1, temp2); | 4154 inptr[i] = _mm_packs_epi32(temp1, temp2); |
4458 temp1 = _mm_loadu_si128((const __m128i *)(input + 16*i + 8)); | 4155 temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); |
4459 temp2 = _mm_loadu_si128((const __m128i *)(input + 16*i + 12)); | 4156 temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); |
4460 inptr[i + 16] = _mm_packs_epi32(temp1, temp2); | 4157 inptr[i + 16] = _mm_packs_epi32(temp1, temp2); |
4461 } | 4158 } |
4462 | 4159 |
4463 // Find the min & max for the row transform | 4160 // Find the min & max for the row transform |
4464 // Since all non-zero dct coefficients are in upper-left 4x4 area, | 4161 // Since all non-zero dct coefficients are in upper-left 4x4 area, |
4465 // we only need to consider first 4 rows here. | 4162 // we only need to consider first 4 rows here. |
4466 max_input = _mm_max_epi16(inptr[0], inptr[1]); | 4163 max_input = _mm_max_epi16(inptr[0], inptr[1]); |
4467 min_input = _mm_min_epi16(inptr[0], inptr[1]); | 4164 min_input = _mm_min_epi16(inptr[0], inptr[1]); |
4468 for (i = 2; i < 4; i++) { | 4165 for (i = 2; i < 4; i++) { |
4469 max_input = _mm_max_epi16(max_input, inptr[i]); | 4166 max_input = _mm_max_epi16(max_input, inptr[i]); |
(...skipping 20 matching lines...) Expand all Loading... |
4490 min_input = _mm_cmplt_epi16(min_input, min); | 4187 min_input = _mm_cmplt_epi16(min_input, min); |
4491 temp1 = _mm_or_si128(max_input, min_input); | 4188 temp1 = _mm_or_si128(max_input, min_input); |
4492 test = _mm_movemask_epi8(temp1); | 4189 test = _mm_movemask_epi8(temp1); |
4493 | 4190 |
4494 if (test) { | 4191 if (test) { |
4495 // Use fact only first 4 rows contain non-zero coeffs | 4192 // Use fact only first 4 rows contain non-zero coeffs |
4496 array_transpose_8x8(inptr, inptr); | 4193 array_transpose_8x8(inptr, inptr); |
4497 array_transpose_8x8(inptr + 8, inptr + 16); | 4194 array_transpose_8x8(inptr + 8, inptr + 16); |
4498 for (i = 0; i < 4; i++) { | 4195 for (i = 0; i < 4; i++) { |
4499 sign_bits = _mm_cmplt_epi16(inptr[i], zero); | 4196 sign_bits = _mm_cmplt_epi16(inptr[i], zero); |
4500 temp1 = _mm_unpacklo_epi16(inptr[i ], sign_bits); | 4197 temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); |
4501 temp2 = _mm_unpackhi_epi16(inptr[i ], sign_bits); | 4198 temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); |
4502 _mm_storeu_si128((__m128i*)(outptr + 4*(i*4)), temp1); | 4199 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); |
4503 _mm_storeu_si128((__m128i*)(outptr + 4*(i*4+1)), temp2); | 4200 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); |
4504 sign_bits = _mm_cmplt_epi16(inptr[i+16], zero); | 4201 sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); |
4505 temp1 = _mm_unpacklo_epi16(inptr[i+16], sign_bits); | 4202 temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); |
4506 temp2 = _mm_unpackhi_epi16(inptr[i+16], sign_bits); | 4203 temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); |
4507 _mm_storeu_si128((__m128i*)(outptr + 4*(i*4+2)), temp1); | 4204 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); |
4508 _mm_storeu_si128((__m128i*)(outptr + 4*(i*4+3)), temp2); | 4205 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); |
4509 } | 4206 } |
4510 } else { | 4207 } else { |
4511 // Set to use the optimised transform for the column | 4208 // Set to use the optimised transform for the column |
4512 optimised_cols = 1; | 4209 optimised_cols = 1; |
4513 } | 4210 } |
4514 } else { | 4211 } else { |
4515 // Run the un-optimised row transform | 4212 // Run the un-optimised row transform |
4516 for (i = 0; i < 4; ++i) { | 4213 for (i = 0; i < 4; ++i) { |
4517 vp9_highbd_idct16(input, outptr, bd); | 4214 vp9_highbd_idct16(input, outptr, bd); |
4518 input += 16; | 4215 input += 16; |
(...skipping 30 matching lines...) Expand all Loading... |
4549 vp9_highbd_idct16(temp_in, temp_out, bd); | 4246 vp9_highbd_idct16(temp_in, temp_out, bd); |
4550 for (j = 0; j < 16; ++j) { | 4247 for (j = 0; j < 16; ++j) { |
4551 dest[j * stride + i] = highbd_clip_pixel_add( | 4248 dest[j * stride + i] = highbd_clip_pixel_add( |
4552 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); | 4249 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); |
4553 } | 4250 } |
4554 } | 4251 } |
4555 } | 4252 } |
4556 } | 4253 } |
4557 | 4254 |
4558 #endif // CONFIG_VP9_HIGHBITDEPTH | 4255 #endif // CONFIG_VP9_HIGHBITDEPTH |
OLD | NEW |