Chromium Code Reviews

Side by Side Diff: source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c

Issue 1124333011: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: only update to last night's LKGR Created 5 years, 7 months ago
OLD | NEW
1 /* 1 /*
2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
11 #include "vp9/common/x86/vp9_idct_intrin_sse2.h" 11 #include "vp9/common/x86/vp9_idct_intrin_sse2.h"
12 #include "vp9/common/vp9_idct.h" 12 #include "vp9/common/vp9_idct.h"
13 13
14 #define RECON_AND_STORE4X4(dest, in_x) \ 14 #define RECON_AND_STORE4X4(dest, in_x) \
15 { \ 15 { \
16 __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ 16 __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
17 d0 = _mm_unpacklo_epi8(d0, zero); \ 17 d0 = _mm_unpacklo_epi8(d0, zero); \
18 d0 = _mm_add_epi16(in_x, d0); \ 18 d0 = _mm_add_epi16(in_x, d0); \
19 d0 = _mm_packus_epi16(d0, d0); \ 19 d0 = _mm_packus_epi16(d0, d0); \
20 *(int *)dest = _mm_cvtsi128_si32(d0); \ 20 *(int *)(dest) = _mm_cvtsi128_si32(d0); \
21 dest += stride; \
22 } 21 }
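
The new side of this macro drops the `dest += stride;` side effect; callers now pass the row address explicitly (e.g. `dest + 2 * stride`, as in vp9_idct4x4_1_add_sse2 below), so the macro no longer mutates its argument. A minimal scalar sketch of what one invocation computes; recon_row4 is a hypothetical illustration name, not part of libvpx:

    #include <stdint.h>

    /* Scalar sketch of RECON_AND_STORE4X4(dest + row * stride, in_x):
       widen four pixels to 16 bits (the unpacklo with zero), add the
       residual, and clamp back to [0, 255] (the packus step). */
    static void recon_row4(uint8_t *dest, const int16_t *residual) {
      int i;
      for (i = 0; i < 4; ++i) {
        const int v = dest[i] + residual[i];
        dest[i] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
      }
    }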
23 22
24 void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { 23 void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
25 const __m128i zero = _mm_setzero_si128(); 24 const __m128i zero = _mm_setzero_si128();
26 const __m128i eight = _mm_set1_epi16(8); 25 const __m128i eight = _mm_set1_epi16(8);
27 const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64, 26 const __m128i cst = _mm_setr_epi16(
28 (int16_t)cospi_16_64, (int16_t)-cospi_16_64, 27 (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64,
29 (int16_t)cospi_24_64, (int16_t)-cospi_8_64, 28 (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
30 (int16_t)cospi_8_64, (int16_t)cospi_24_64); 29 (int16_t)cospi_8_64, (int16_t)cospi_24_64);
31 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 30 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
32 __m128i input0, input1, input2, input3; 31 __m128i input0, input1, input2, input3;
33 32
34 // Rows 33 // Rows
35 input0 = _mm_load_si128((const __m128i *)input); 34 input0 = _mm_load_si128((const __m128i *)input);
36 input2 = _mm_load_si128((const __m128i *)(input + 8)); 35 input2 = _mm_load_si128((const __m128i *)(input + 8));
37 36
38 // Construct i3, i1, i3, i1, i2, i0, i2, i0 37 // Construct i3, i1, i3, i1, i2, i0, i2, i0
39 input0 = _mm_shufflelo_epi16(input0, 0xd8); 38 input0 = _mm_shufflelo_epi16(input0, 0xd8);
40 input0 = _mm_shufflehi_epi16(input0, 0xd8); 39 input0 = _mm_shufflehi_epi16(input0, 0xd8);
(...skipping 78 matching lines...)
119 118
120 // Final round and shift 119 // Final round and shift
121 input2 = _mm_add_epi16(input2, eight); 120 input2 = _mm_add_epi16(input2, eight);
122 input3 = _mm_add_epi16(input3, eight); 121 input3 = _mm_add_epi16(input3, eight);
123 122
124 input2 = _mm_srai_epi16(input2, 4); 123 input2 = _mm_srai_epi16(input2, 4);
125 input3 = _mm_srai_epi16(input3, 4); 124 input3 = _mm_srai_epi16(input3, 4);
126 125
127 // Reconstruction and Store 126 // Reconstruction and Store
128 { 127 {
129 __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); 128 __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
130 __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); 129 __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
131 d0 = _mm_unpacklo_epi32(d0, 130 d0 = _mm_unpacklo_epi32(d0,
132 _mm_cvtsi32_si128(*(const int *) (dest + stride))); 131 _mm_cvtsi32_si128(*(const int *)(dest + stride)));
133 d2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128( 132 d2 = _mm_unpacklo_epi32(
134 *(const int *) (dest + stride * 3)), d2); 133 _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
135 d0 = _mm_unpacklo_epi8(d0, zero); 134 d0 = _mm_unpacklo_epi8(d0, zero);
136 d2 = _mm_unpacklo_epi8(d2, zero); 135 d2 = _mm_unpacklo_epi8(d2, zero);
137 d0 = _mm_add_epi16(d0, input2); 136 d0 = _mm_add_epi16(d0, input2);
138 d2 = _mm_add_epi16(d2, input3); 137 d2 = _mm_add_epi16(d2, input3);
139 d0 = _mm_packus_epi16(d0, d2); 138 d0 = _mm_packus_epi16(d0, d2);
140 // store input0 139 // store input0
141 *(int *)dest = _mm_cvtsi128_si32(d0); 140 *(int *)dest = _mm_cvtsi128_si32(d0);
142 // store input1 141 // store input1
143 d0 = _mm_srli_si128(d0, 4); 142 d0 = _mm_srli_si128(d0, 4);
144 *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); 143 *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
145 // store input2 144 // store input2
146 d0 = _mm_srli_si128(d0, 4); 145 d0 = _mm_srli_si128(d0, 4);
147 *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); 146 *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
148 // store input3 147 // store input3
149 d0 = _mm_srli_si128(d0, 4); 148 d0 = _mm_srli_si128(d0, 4);
150 *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); 149 *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
151 } 150 }
152 } 151 }
153 152
154 void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { 153 void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
155 __m128i dc_value; 154 __m128i dc_value;
156 const __m128i zero = _mm_setzero_si128(); 155 const __m128i zero = _mm_setzero_si128();
157 int a; 156 int a;
158 157
159 a = dct_const_round_shift(input[0] * cospi_16_64); 158 a = dct_const_round_shift(input[0] * cospi_16_64);
160 a = dct_const_round_shift(a * cospi_16_64); 159 a = dct_const_round_shift(a * cospi_16_64);
161 a = ROUND_POWER_OF_TWO(a, 4); 160 a = ROUND_POWER_OF_TWO(a, 4);
162 161
163 dc_value = _mm_set1_epi16(a); 162 dc_value = _mm_set1_epi16(a);
164 163
165 RECON_AND_STORE4X4(dest, dc_value); 164 RECON_AND_STORE4X4(dest + 0 * stride, dc_value);
166 RECON_AND_STORE4X4(dest, dc_value); 165 RECON_AND_STORE4X4(dest + 1 * stride, dc_value);
167 RECON_AND_STORE4X4(dest, dc_value); 166 RECON_AND_STORE4X4(dest + 2 * stride, dc_value);
168 RECON_AND_STORE4X4(dest, dc_value); 167 RECON_AND_STORE4X4(dest + 3 * stride, dc_value);
169 } 168 }
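
The DC-only path above is pure scalar math until the final splat-and-add: the single DC coefficient is scaled by cospi_16_64 twice (once per 1-D pass), rounded, and added to every pixel. A scalar sketch, assuming DCT_CONST_BITS == 14, DCT_CONST_ROUNDING == 1 << 13, and cospi_16_64 == 11585 as defined in vp9_idct.h; the function name is hypothetical:

    #include <stdint.h>

    /* Scalar sketch of vp9_idct4x4_1_add_sse2. */
    static void idct4x4_dc_add_sketch(const int16_t *input, uint8_t *dest,
                                      int stride) {
      int r, c;
      int a = (input[0] * 11585 + (1 << 13)) >> 14;  /* row pass */
      a = (a * 11585 + (1 << 13)) >> 14;             /* column pass */
      a = (a + 8) >> 4;                  /* ROUND_POWER_OF_TWO(a, 4) */
      for (r = 0; r < 4; ++r)
        for (c = 0; c < 4; ++c) {
          const int v = dest[r * stride + c] + a;
          dest[r * stride + c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
        }
    }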
170 169
171 static INLINE void transpose_4x4(__m128i *res) { 170 static INLINE void transpose_4x4(__m128i *res) {
172 const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); 171 const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
173 const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); 172 const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
174 173
175 res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1); 174 res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
176 res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1); 175 res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
177 } 176 }
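
For reference, transpose_4x4 transposes a 4x4 tile of int16 values in two interleave stages. A lane-movement sketch, assuming res[0] holds rows 0-1 and res[1] holds rows 2-3 (element a[r][c] written as arc):

    /* tr0_0 = unpacklo(res[0], res[1]) = a00 a20 a01 a21 a02 a22 a03 a23
       tr0_1 = unpackhi(res[0], res[1]) = a10 a30 a11 a31 a12 a32 a13 a33
       res[0] = unpacklo(tr0_0, tr0_1)  = a00 a10 a20 a30 a01 a11 a21 a31
       res[1] = unpackhi(tr0_0, tr0_1)  = a02 a12 a22 a32 a03 a13 a23 a33
       i.e. the four columns come out as the four rows. */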
178 177
(...skipping 81 matching lines...)
260 in[0] = _mm_packs_epi32(u[0], u[1]); 259 in[0] = _mm_packs_epi32(u[0], u[1]);
261 in[1] = _mm_packs_epi32(u[2], u[3]); 260 in[1] = _mm_packs_epi32(u[2], u[3]);
262 } 261 }
263 262
264 void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, 263 void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
265 int tx_type) { 264 int tx_type) {
266 __m128i in[2]; 265 __m128i in[2];
267 const __m128i zero = _mm_setzero_si128(); 266 const __m128i zero = _mm_setzero_si128();
268 const __m128i eight = _mm_set1_epi16(8); 267 const __m128i eight = _mm_set1_epi16(8);
269 268
270 in[0]= _mm_loadu_si128((const __m128i *)(input)); 269 in[0] = _mm_loadu_si128((const __m128i *)(input));
271 in[1]= _mm_loadu_si128((const __m128i *)(input + 8)); 270 in[1] = _mm_loadu_si128((const __m128i *)(input + 8));
272 271
273 switch (tx_type) { 272 switch (tx_type) {
274 case 0: // DCT_DCT 273 case 0: // DCT_DCT
275 idct4_sse2(in); 274 idct4_sse2(in);
276 idct4_sse2(in); 275 idct4_sse2(in);
277 break; 276 break;
278 case 1: // ADST_DCT 277 case 1: // ADST_DCT
279 idct4_sse2(in); 278 idct4_sse2(in);
280 iadst4_sse2(in); 279 iadst4_sse2(in);
281 break; 280 break;
(...skipping 12 matching lines...)
294 293
295 // Final round and shift 294 // Final round and shift
296 in[0] = _mm_add_epi16(in[0], eight); 295 in[0] = _mm_add_epi16(in[0], eight);
297 in[1] = _mm_add_epi16(in[1], eight); 296 in[1] = _mm_add_epi16(in[1], eight);
298 297
299 in[0] = _mm_srai_epi16(in[0], 4); 298 in[0] = _mm_srai_epi16(in[0], 4);
300 in[1] = _mm_srai_epi16(in[1], 4); 299 in[1] = _mm_srai_epi16(in[1], 4);
301 300
302 // Reconstruction and Store 301 // Reconstruction and Store
303 { 302 {
304 __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); 303 __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
305 __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); 304 __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
306 d0 = _mm_unpacklo_epi32(d0, 305 d0 = _mm_unpacklo_epi32(d0,
307 _mm_cvtsi32_si128(*(const int *) (dest + stride))); 306 _mm_cvtsi32_si128(*(const int *)(dest + stride)));
308 d2 = _mm_unpacklo_epi32(d2, _mm_cvtsi32_si128( 307 d2 = _mm_unpacklo_epi32(
309 *(const int *) (dest + stride * 3))); 308 d2, _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)));
310 d0 = _mm_unpacklo_epi8(d0, zero); 309 d0 = _mm_unpacklo_epi8(d0, zero);
311 d2 = _mm_unpacklo_epi8(d2, zero); 310 d2 = _mm_unpacklo_epi8(d2, zero);
312 d0 = _mm_add_epi16(d0, in[0]); 311 d0 = _mm_add_epi16(d0, in[0]);
313 d2 = _mm_add_epi16(d2, in[1]); 312 d2 = _mm_add_epi16(d2, in[1]);
314 d0 = _mm_packus_epi16(d0, d2); 313 d0 = _mm_packus_epi16(d0, d2);
315 // store result[0] 314 // store result[0]
316 *(int *)dest = _mm_cvtsi128_si32(d0); 315 *(int *)dest = _mm_cvtsi128_si32(d0);
317 // store result[1] 316 // store result[1]
318 d0 = _mm_srli_si128(d0, 4); 317 d0 = _mm_srli_si128(d0, 4);
319 *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); 318 *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
320 // store result[2] 319 // store result[2]
321 d0 = _mm_srli_si128(d0, 4); 320 d0 = _mm_srli_si128(d0, 4);
322 *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); 321 *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
323 // store result[3] 322 // store result[3]
324 d0 = _mm_srli_si128(d0, 4); 323 d0 = _mm_srli_si128(d0, 4);
325 *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); 324 *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
326 } 325 }
327 } 326 }
328 327
329 #define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \ 328 #define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
330 out0, out1, out2, out3, out4, out5, out6, out7) \ 329 out0, out1, out2, out3, out4, out5, out6, out7) \
331 { \ 330 { \
332 const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ 331 const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
333 const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ 332 const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
334 const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ 333 const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
335 const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ 334 const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
(...skipping 174 matching lines...)
510 out3 = _mm_adds_epi16(stp1_3, stp2_4); \ 509 out3 = _mm_adds_epi16(stp1_3, stp2_4); \
511 out4 = _mm_subs_epi16(stp1_3, stp2_4); \ 510 out4 = _mm_subs_epi16(stp1_3, stp2_4); \
512 out5 = _mm_subs_epi16(stp1_2, stp1_5); \ 511 out5 = _mm_subs_epi16(stp1_2, stp1_5); \
513 out6 = _mm_subs_epi16(stp1_1, stp1_6); \ 512 out6 = _mm_subs_epi16(stp1_1, stp1_6); \
514 out7 = _mm_subs_epi16(stp1_0, stp2_7); \ 513 out7 = _mm_subs_epi16(stp1_0, stp2_7); \
515 } 514 }
516 515
517 void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) { 516 void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
518 const __m128i zero = _mm_setzero_si128(); 517 const __m128i zero = _mm_setzero_si128();
519 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 518 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
520 const __m128i final_rounding = _mm_set1_epi16(1<<4); 519 const __m128i final_rounding = _mm_set1_epi16(1 << 4);
521 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 520 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
522 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 521 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
523 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); 522 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
524 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); 523 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
525 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 524 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
526 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 525 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
527 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 526 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
528 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); 527 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
529 528
530 __m128i in0, in1, in2, in3, in4, in5, in6, in7; 529 __m128i in0, in1, in2, in3, in4, in5, in6, in7;
(...skipping 13 matching lines...)
544 in7 = _mm_load_si128((const __m128i *)(input + 8 * 7)); 543 in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));
545 544
546 // 2-D 545 // 2-D
547 for (i = 0; i < 2; i++) { 546 for (i = 0; i < 2; i++) {
548 // 8x8 Transpose is copied from vp9_fdct8x8_sse2() 547 // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
549 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, 548 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7,
550 in0, in1, in2, in3, in4, in5, in6, in7); 549 in0, in1, in2, in3, in4, in5, in6, in7);
551 550
552 // 4-stage 1D idct8x8 551 // 4-stage 1D idct8x8
553 IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, 552 IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
554 in0, in1, in2, in3, in4, in5, in6, in7); 553 in0, in1, in2, in3, in4, in5, in6, in7);
555 } 554 }
556 555
557 // Final rounding and shift 556 // Final rounding and shift
558 in0 = _mm_adds_epi16(in0, final_rounding); 557 in0 = _mm_adds_epi16(in0, final_rounding);
559 in1 = _mm_adds_epi16(in1, final_rounding); 558 in1 = _mm_adds_epi16(in1, final_rounding);
560 in2 = _mm_adds_epi16(in2, final_rounding); 559 in2 = _mm_adds_epi16(in2, final_rounding);
561 in3 = _mm_adds_epi16(in3, final_rounding); 560 in3 = _mm_adds_epi16(in3, final_rounding);
562 in4 = _mm_adds_epi16(in4, final_rounding); 561 in4 = _mm_adds_epi16(in4, final_rounding);
563 in5 = _mm_adds_epi16(in5, final_rounding); 562 in5 = _mm_adds_epi16(in5, final_rounding);
564 in6 = _mm_adds_epi16(in6, final_rounding); 563 in6 = _mm_adds_epi16(in6, final_rounding);
565 in7 = _mm_adds_epi16(in7, final_rounding); 564 in7 = _mm_adds_epi16(in7, final_rounding);
566 565
567 in0 = _mm_srai_epi16(in0, 5); 566 in0 = _mm_srai_epi16(in0, 5);
568 in1 = _mm_srai_epi16(in1, 5); 567 in1 = _mm_srai_epi16(in1, 5);
569 in2 = _mm_srai_epi16(in2, 5); 568 in2 = _mm_srai_epi16(in2, 5);
570 in3 = _mm_srai_epi16(in3, 5); 569 in3 = _mm_srai_epi16(in3, 5);
571 in4 = _mm_srai_epi16(in4, 5); 570 in4 = _mm_srai_epi16(in4, 5);
572 in5 = _mm_srai_epi16(in5, 5); 571 in5 = _mm_srai_epi16(in5, 5);
573 in6 = _mm_srai_epi16(in6, 5); 572 in6 = _mm_srai_epi16(in6, 5);
574 in7 = _mm_srai_epi16(in7, 5); 573 in7 = _mm_srai_epi16(in7, 5);
575 574
576 RECON_AND_STORE(dest, in0); 575 RECON_AND_STORE(dest + 0 * stride, in0);
577 RECON_AND_STORE(dest, in1); 576 RECON_AND_STORE(dest + 1 * stride, in1);
578 RECON_AND_STORE(dest, in2); 577 RECON_AND_STORE(dest + 2 * stride, in2);
579 RECON_AND_STORE(dest, in3); 578 RECON_AND_STORE(dest + 3 * stride, in3);
580 RECON_AND_STORE(dest, in4); 579 RECON_AND_STORE(dest + 4 * stride, in4);
581 RECON_AND_STORE(dest, in5); 580 RECON_AND_STORE(dest + 5 * stride, in5);
582 RECON_AND_STORE(dest, in6); 581 RECON_AND_STORE(dest + 6 * stride, in6);
583 RECON_AND_STORE(dest, in7); 582 RECON_AND_STORE(dest + 7 * stride, in7);
584 } 583 }
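
Throughout these stages the file leans on one SSE2 butterfly idiom: _mm_unpacklo_epi16/_mm_unpackhi_epi16 interleave two operand vectors into (a, b) word pairs, _mm_madd_epi16 against a pair_set_epi16(c0, c1) constant produces a*c0 + b*c1 per pair, and the 32-bit result is rounded and shifted back down. A scalar sketch of one output lane, assuming DCT_CONST_BITS == 14 (so DCT_CONST_ROUNDING == 1 << 13) as in vp9_idct.h:

    #include <stdint.h>

    /* One butterfly lane; the _mm_packs_epi32 step in the real code also
       saturates to int16, which this sketch omits. */
    static inline int16_t butterfly_lane(int16_t a, int16_t b,
                                         int16_t c0, int16_t c1) {
      const int32_t v = a * c0 + b * c1;  /* what _mm_madd_epi16 computes */
      return (int16_t)((v + (1 << 13)) >> 14);  /* round + DCT_CONST_BITS */
    }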
585 584
586 void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { 585 void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
587 __m128i dc_value; 586 __m128i dc_value;
588 const __m128i zero = _mm_setzero_si128(); 587 const __m128i zero = _mm_setzero_si128();
589 int a; 588 int a;
590 589
591 a = dct_const_round_shift(input[0] * cospi_16_64); 590 a = dct_const_round_shift(input[0] * cospi_16_64);
592 a = dct_const_round_shift(a * cospi_16_64); 591 a = dct_const_round_shift(a * cospi_16_64);
593 a = ROUND_POWER_OF_TWO(a, 5); 592 a = ROUND_POWER_OF_TWO(a, 5);
594 593
595 dc_value = _mm_set1_epi16(a); 594 dc_value = _mm_set1_epi16(a);
596 595
597 RECON_AND_STORE(dest, dc_value); 596 RECON_AND_STORE(dest + 0 * stride, dc_value);
598 RECON_AND_STORE(dest, dc_value); 597 RECON_AND_STORE(dest + 1 * stride, dc_value);
599 RECON_AND_STORE(dest, dc_value); 598 RECON_AND_STORE(dest + 2 * stride, dc_value);
600 RECON_AND_STORE(dest, dc_value); 599 RECON_AND_STORE(dest + 3 * stride, dc_value);
601 RECON_AND_STORE(dest, dc_value); 600 RECON_AND_STORE(dest + 4 * stride, dc_value);
602 RECON_AND_STORE(dest, dc_value); 601 RECON_AND_STORE(dest + 5 * stride, dc_value);
603 RECON_AND_STORE(dest, dc_value); 602 RECON_AND_STORE(dest + 6 * stride, dc_value);
604 RECON_AND_STORE(dest, dc_value); 603 RECON_AND_STORE(dest + 7 * stride, dc_value);
605 } 604 }
606 605
607 static void idct8_sse2(__m128i *in) { 606 static void idct8_sse2(__m128i *in) {
608 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 607 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
609 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 608 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
610 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 609 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
611 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); 610 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
612 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); 611 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
613 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 612 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
614 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 613 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
615 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 614 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
616 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); 615 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
617 616
618 __m128i in0, in1, in2, in3, in4, in5, in6, in7; 617 __m128i in0, in1, in2, in3, in4, in5, in6, in7;
619 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; 618 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
620 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; 619 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
621 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 620 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
622 621
623 // 8x8 Transpose is copied from vp9_fdct8x8_sse2() 622 // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
624 TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], 623 TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
625 in0, in1, in2, in3, in4, in5, in6, in7); 624 in0, in1, in2, in3, in4, in5, in6, in7);
626 625
627 // 4-stage 1D idct8x8 626 // 4-stage 1D idct8x8
628 IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, 627 IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
629 in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]); 628 in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]);
630 } 629 }
631 630
632 static void iadst8_sse2(__m128i *in) { 631 static void iadst8_sse2(__m128i *in) {
633 const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); 632 const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
634 const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); 633 const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
635 const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); 634 const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
636 const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); 635 const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
637 const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); 636 const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
638 const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); 637 const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
639 const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); 638 const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
640 const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); 639 const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
641 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); 640 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
642 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); 641 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
643 const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); 642 const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
644 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 643 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
645 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); 644 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
646 const __m128i k__const_0 = _mm_set1_epi16(0); 645 const __m128i k__const_0 = _mm_set1_epi16(0);
647 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 646 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
648 647
649 __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; 648 __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
650 __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; 649 __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
651 __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; 650 __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
652 __m128i s0, s1, s2, s3, s4, s5, s6, s7; 651 __m128i s0, s1, s2, s3, s4, s5, s6, s7;
653 __m128i in0, in1, in2, in3, in4, in5, in6, in7; 652 __m128i in0, in1, in2, in3, in4, in5, in6, in7;
654 653
655 // transpose 654 // transpose
656 array_transpose_8x8(in, in); 655 array_transpose_8x8(in, in);
657 656
658 // properly aligned for butterfly input 657 // properly aligned for butterfly input
659 in0 = in[7]; 658 in0 = in[7];
660 in1 = in[0]; 659 in1 = in[0];
661 in2 = in[5]; 660 in2 = in[5];
662 in3 = in[2]; 661 in3 = in[2];
663 in4 = in[3]; 662 in4 = in[3];
664 in5 = in[4]; 663 in5 = in[4];
665 in6 = in[1]; 664 in6 = in[1];
666 in7 = in[6]; 665 in7 = in[6];
667 666
668 // column transformation 667 // column transformation
669 // stage 1 668 // stage 1
670 // interleave and multiply/add into 32-bit integer 669 // interleave and multiply/add into 32-bit integer
671 s0 = _mm_unpacklo_epi16(in0, in1); 670 s0 = _mm_unpacklo_epi16(in0, in1);
672 s1 = _mm_unpackhi_epi16(in0, in1); 671 s1 = _mm_unpackhi_epi16(in0, in1);
673 s2 = _mm_unpacklo_epi16(in2, in3); 672 s2 = _mm_unpacklo_epi16(in2, in3);
674 s3 = _mm_unpackhi_epi16(in2, in3); 673 s3 = _mm_unpackhi_epi16(in2, in3);
675 s4 = _mm_unpacklo_epi16(in4, in5); 674 s4 = _mm_unpacklo_epi16(in4, in5);
676 s5 = _mm_unpackhi_epi16(in4, in5); 675 s5 = _mm_unpackhi_epi16(in4, in5);
(...skipping 173 matching lines...)
850 in[0] = s0; 849 in[0] = s0;
851 in[1] = _mm_sub_epi16(k__const_0, s4); 850 in[1] = _mm_sub_epi16(k__const_0, s4);
852 in[2] = s6; 851 in[2] = s6;
853 in[3] = _mm_sub_epi16(k__const_0, s2); 852 in[3] = _mm_sub_epi16(k__const_0, s2);
854 in[4] = s3; 853 in[4] = s3;
855 in[5] = _mm_sub_epi16(k__const_0, s7); 854 in[5] = _mm_sub_epi16(k__const_0, s7);
856 in[6] = s5; 855 in[6] = s5;
857 in[7] = _mm_sub_epi16(k__const_0, s1); 856 in[7] = _mm_sub_epi16(k__const_0, s1);
858 } 857 }
859 858
860
861 void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride, 859 void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
862 int tx_type) { 860 int tx_type) {
863 __m128i in[8]; 861 __m128i in[8];
864 const __m128i zero = _mm_setzero_si128(); 862 const __m128i zero = _mm_setzero_si128();
865 const __m128i final_rounding = _mm_set1_epi16(1<<4); 863 const __m128i final_rounding = _mm_set1_epi16(1 << 4);
866 864
867 // load input data 865 // load input data
868 in[0] = _mm_load_si128((const __m128i *)input); 866 in[0] = _mm_load_si128((const __m128i *)input);
869 in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1)); 867 in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1));
870 in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2)); 868 in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2));
871 in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3)); 869 in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3));
872 in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4)); 870 in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4));
873 in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5)); 871 in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5));
874 in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6)); 872 in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6));
875 in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7)); 873 in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7));
(...skipping 32 matching lines...)
908 906
909 in[0] = _mm_srai_epi16(in[0], 5); 907 in[0] = _mm_srai_epi16(in[0], 5);
910 in[1] = _mm_srai_epi16(in[1], 5); 908 in[1] = _mm_srai_epi16(in[1], 5);
911 in[2] = _mm_srai_epi16(in[2], 5); 909 in[2] = _mm_srai_epi16(in[2], 5);
912 in[3] = _mm_srai_epi16(in[3], 5); 910 in[3] = _mm_srai_epi16(in[3], 5);
913 in[4] = _mm_srai_epi16(in[4], 5); 911 in[4] = _mm_srai_epi16(in[4], 5);
914 in[5] = _mm_srai_epi16(in[5], 5); 912 in[5] = _mm_srai_epi16(in[5], 5);
915 in[6] = _mm_srai_epi16(in[6], 5); 913 in[6] = _mm_srai_epi16(in[6], 5);
916 in[7] = _mm_srai_epi16(in[7], 5); 914 in[7] = _mm_srai_epi16(in[7], 5);
917 915
918 RECON_AND_STORE(dest, in[0]); 916 RECON_AND_STORE(dest + 0 * stride, in[0]);
919 RECON_AND_STORE(dest, in[1]); 917 RECON_AND_STORE(dest + 1 * stride, in[1]);
920 RECON_AND_STORE(dest, in[2]); 918 RECON_AND_STORE(dest + 2 * stride, in[2]);
921 RECON_AND_STORE(dest, in[3]); 919 RECON_AND_STORE(dest + 3 * stride, in[3]);
922 RECON_AND_STORE(dest, in[4]); 920 RECON_AND_STORE(dest + 4 * stride, in[4]);
923 RECON_AND_STORE(dest, in[5]); 921 RECON_AND_STORE(dest + 5 * stride, in[5]);
924 RECON_AND_STORE(dest, in[6]); 922 RECON_AND_STORE(dest + 6 * stride, in[6]);
925 RECON_AND_STORE(dest, in[7]); 923 RECON_AND_STORE(dest + 7 * stride, in[7]);
926 } 924 }
927 925
928 void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) { 926 void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
929 const __m128i zero = _mm_setzero_si128(); 927 const __m128i zero = _mm_setzero_si128();
930 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 928 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
931 const __m128i final_rounding = _mm_set1_epi16(1<<4); 929 const __m128i final_rounding = _mm_set1_epi16(1 << 4);
932 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 930 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
933 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 931 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
934 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); 932 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
935 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); 933 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
936 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 934 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
937 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 935 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
938 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 936 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
939 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); 937 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
940 const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); 938 const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
941 939
942 __m128i in0, in1, in2, in3, in4, in5, in6, in7; 940 __m128i in0, in1, in2, in3, in4, in5, in6, in7;
943 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; 941 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
944 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; 942 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
945 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 943 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
946 944
947 // Rows. Load 4-row input data. 945 // Rows. Load 4-row input data.
948 in0 = _mm_load_si128((const __m128i *)input); 946 in0 = _mm_load_si128((const __m128i *)input);
949 in1 = _mm_load_si128((const __m128i *)(input + 8 * 1)); 947 in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
950 in2 = _mm_load_si128((const __m128i *)(input + 8 * 2)); 948 in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
951 in3 = _mm_load_si128((const __m128i *)(input + 8 * 3)); 949 in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
952 950
953 // 8x4 Transpose 951 // 8x4 Transpose
954 TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1); 952 TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
955 // Stage1 953 // Stage1
956 { //NOLINT 954 {
957 const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero); 955 const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
958 const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero); 956 const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);
959 957
960 tmp0 = _mm_madd_epi16(lo_17, stg1_0); 958 tmp0 = _mm_madd_epi16(lo_17, stg1_0);
961 tmp2 = _mm_madd_epi16(lo_17, stg1_1); 959 tmp2 = _mm_madd_epi16(lo_17, stg1_1);
962 tmp4 = _mm_madd_epi16(lo_35, stg1_2); 960 tmp4 = _mm_madd_epi16(lo_35, stg1_2);
963 tmp6 = _mm_madd_epi16(lo_35, stg1_3); 961 tmp6 = _mm_madd_epi16(lo_35, stg1_3);
964 962
965 tmp0 = _mm_add_epi32(tmp0, rounding); 963 tmp0 = _mm_add_epi32(tmp0, rounding);
966 tmp2 = _mm_add_epi32(tmp2, rounding); 964 tmp2 = _mm_add_epi32(tmp2, rounding);
967 tmp4 = _mm_add_epi32(tmp4, rounding); 965 tmp4 = _mm_add_epi32(tmp4, rounding);
968 tmp6 = _mm_add_epi32(tmp6, rounding); 966 tmp6 = _mm_add_epi32(tmp6, rounding);
969 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); 967 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
970 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 968 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
971 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); 969 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
972 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); 970 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
973 971
974 stp1_4 = _mm_packs_epi32(tmp0, tmp2); 972 stp1_4 = _mm_packs_epi32(tmp0, tmp2);
975 stp1_5 = _mm_packs_epi32(tmp4, tmp6); 973 stp1_5 = _mm_packs_epi32(tmp4, tmp6);
976 } 974 }
977 975
978 // Stage2 976 // Stage2
979 { //NOLINT 977 {
980 const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero); 978 const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
981 const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero); 979 const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);
982 980
983 tmp0 = _mm_madd_epi16(lo_04, stg2_0); 981 tmp0 = _mm_madd_epi16(lo_04, stg2_0);
984 tmp2 = _mm_madd_epi16(lo_04, stg2_1); 982 tmp2 = _mm_madd_epi16(lo_04, stg2_1);
985 tmp4 = _mm_madd_epi16(lo_26, stg2_2); 983 tmp4 = _mm_madd_epi16(lo_26, stg2_2);
986 tmp6 = _mm_madd_epi16(lo_26, stg2_3); 984 tmp6 = _mm_madd_epi16(lo_26, stg2_3);
987 985
988 tmp0 = _mm_add_epi32(tmp0, rounding); 986 tmp0 = _mm_add_epi32(tmp0, rounding);
989 tmp2 = _mm_add_epi32(tmp2, rounding); 987 tmp2 = _mm_add_epi32(tmp2, rounding);
990 tmp4 = _mm_add_epi32(tmp4, rounding); 988 tmp4 = _mm_add_epi32(tmp4, rounding);
991 tmp6 = _mm_add_epi32(tmp6, rounding); 989 tmp6 = _mm_add_epi32(tmp6, rounding);
992 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); 990 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
993 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 991 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
994 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); 992 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
995 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); 993 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
996 994
997 stp2_0 = _mm_packs_epi32(tmp0, tmp2); 995 stp2_0 = _mm_packs_epi32(tmp0, tmp2);
998 stp2_2 = _mm_packs_epi32(tmp6, tmp4); 996 stp2_2 = _mm_packs_epi32(tmp6, tmp4);
999 997
1000 tmp0 = _mm_adds_epi16(stp1_4, stp1_5); 998 tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
1001 tmp1 = _mm_subs_epi16(stp1_4, stp1_5); 999 tmp1 = _mm_subs_epi16(stp1_4, stp1_5);
1002 1000
1003 stp2_4 = tmp0; 1001 stp2_4 = tmp0;
1004 stp2_5 = _mm_unpacklo_epi64(tmp1, zero); 1002 stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
1005 stp2_6 = _mm_unpackhi_epi64(tmp1, zero); 1003 stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
1006 } 1004 }
1007 1005
1008 // Stage3 1006 // Stage3
1009 { //NOLINT 1007 {
1010 const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6); 1008 const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
1011 1009
1012 tmp4 = _mm_adds_epi16(stp2_0, stp2_2); 1010 tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
1013 tmp6 = _mm_subs_epi16(stp2_0, stp2_2); 1011 tmp6 = _mm_subs_epi16(stp2_0, stp2_2);
1014 1012
1015 stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4); 1013 stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
1016 stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4); 1014 stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);
1017 1015
1018 tmp0 = _mm_madd_epi16(lo_56, stg3_0); 1016 tmp0 = _mm_madd_epi16(lo_56, stg3_0);
1019 tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0 1017 tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0
1020 1018
1021 tmp0 = _mm_add_epi32(tmp0, rounding); 1019 tmp0 = _mm_add_epi32(tmp0, rounding);
1022 tmp2 = _mm_add_epi32(tmp2, rounding); 1020 tmp2 = _mm_add_epi32(tmp2, rounding);
1023 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); 1021 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
1024 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 1022 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
1025 1023
1026 stp1_5 = _mm_packs_epi32(tmp0, tmp2); 1024 stp1_5 = _mm_packs_epi32(tmp0, tmp2);
1027 } 1025 }
1028 1026
1029 // Stage4 1027 // Stage4
1030 tmp0 = _mm_adds_epi16(stp1_3, stp2_4); 1028 tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
1031 tmp1 = _mm_adds_epi16(stp1_2, stp1_5); 1029 tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
1032 tmp2 = _mm_subs_epi16(stp1_3, stp2_4); 1030 tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
1033 tmp3 = _mm_subs_epi16(stp1_2, stp1_5); 1031 tmp3 = _mm_subs_epi16(stp1_2, stp1_5);
1034 1032
1035 TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) 1033 TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
1036 1034
1037 IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, 1035 IDCT8(in0, in1, in2, in3, zero, zero, zero, zero,
1038 in0, in1, in2, in3, in4, in5, in6, in7); 1036 in0, in1, in2, in3, in4, in5, in6, in7);
1039 // Final rounding and shift 1037 // Final rounding and shift
1040 in0 = _mm_adds_epi16(in0, final_rounding); 1038 in0 = _mm_adds_epi16(in0, final_rounding);
1041 in1 = _mm_adds_epi16(in1, final_rounding); 1039 in1 = _mm_adds_epi16(in1, final_rounding);
1042 in2 = _mm_adds_epi16(in2, final_rounding); 1040 in2 = _mm_adds_epi16(in2, final_rounding);
1043 in3 = _mm_adds_epi16(in3, final_rounding); 1041 in3 = _mm_adds_epi16(in3, final_rounding);
1044 in4 = _mm_adds_epi16(in4, final_rounding); 1042 in4 = _mm_adds_epi16(in4, final_rounding);
1045 in5 = _mm_adds_epi16(in5, final_rounding); 1043 in5 = _mm_adds_epi16(in5, final_rounding);
1046 in6 = _mm_adds_epi16(in6, final_rounding); 1044 in6 = _mm_adds_epi16(in6, final_rounding);
1047 in7 = _mm_adds_epi16(in7, final_rounding); 1045 in7 = _mm_adds_epi16(in7, final_rounding);
1048 1046
1049 in0 = _mm_srai_epi16(in0, 5); 1047 in0 = _mm_srai_epi16(in0, 5);
1050 in1 = _mm_srai_epi16(in1, 5); 1048 in1 = _mm_srai_epi16(in1, 5);
1051 in2 = _mm_srai_epi16(in2, 5); 1049 in2 = _mm_srai_epi16(in2, 5);
1052 in3 = _mm_srai_epi16(in3, 5); 1050 in3 = _mm_srai_epi16(in3, 5);
1053 in4 = _mm_srai_epi16(in4, 5); 1051 in4 = _mm_srai_epi16(in4, 5);
1054 in5 = _mm_srai_epi16(in5, 5); 1052 in5 = _mm_srai_epi16(in5, 5);
1055 in6 = _mm_srai_epi16(in6, 5); 1053 in6 = _mm_srai_epi16(in6, 5);
1056 in7 = _mm_srai_epi16(in7, 5); 1054 in7 = _mm_srai_epi16(in7, 5);
1057 1055
1058 RECON_AND_STORE(dest, in0); 1056 RECON_AND_STORE(dest + 0 * stride, in0);
1059 RECON_AND_STORE(dest, in1); 1057 RECON_AND_STORE(dest + 1 * stride, in1);
1060 RECON_AND_STORE(dest, in2); 1058 RECON_AND_STORE(dest + 2 * stride, in2);
1061 RECON_AND_STORE(dest, in3); 1059 RECON_AND_STORE(dest + 3 * stride, in3);
1062 RECON_AND_STORE(dest, in4); 1060 RECON_AND_STORE(dest + 4 * stride, in4);
1063 RECON_AND_STORE(dest, in5); 1061 RECON_AND_STORE(dest + 5 * stride, in5);
1064 RECON_AND_STORE(dest, in6); 1062 RECON_AND_STORE(dest + 6 * stride, in6);
1065 RECON_AND_STORE(dest, in7); 1063 RECON_AND_STORE(dest + 7 * stride, in7);
1066 } 1064 }
1067 1065
1068 #define IDCT16 \ 1066 #define IDCT16 \
1069 /* Stage2 */ \ 1067 /* Stage2 */ \
1070 { \ 1068 { \
1071 const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \ 1069 const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
1072 const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \ 1070 const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
1073 const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \ 1071 const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \
1074 const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \ 1072 const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \
1075 const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \ 1073 const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
(...skipping 222 matching lines...)
1298 stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ 1296 stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
1299 \ 1297 \
1300 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ 1298 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
1301 stg6_0, stg4_0, stg6_0, stg4_0, \ 1299 stg6_0, stg4_0, stg6_0, stg4_0, \
1302 stp2_10, stp2_13, stp2_11, stp2_12) \ 1300 stp2_10, stp2_13, stp2_11, stp2_12) \
1303 } 1301 }
1304 1302
1305 void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, 1303 void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
1306 int stride) { 1304 int stride) {
1307 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 1305 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
1308 const __m128i final_rounding = _mm_set1_epi16(1<<5); 1306 const __m128i final_rounding = _mm_set1_epi16(1 << 5);
1309 const __m128i zero = _mm_setzero_si128(); 1307 const __m128i zero = _mm_setzero_si128();
1310 1308
1311 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); 1309 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
1312 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); 1310 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
1313 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); 1311 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
1314 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); 1312 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
1315 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); 1313 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
1316 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); 1314 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
1317 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); 1315 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
1318 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); 1316 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
(...skipping 18 matching lines...)
1337 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, 1335 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
1338 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, 1336 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
1339 stp1_8_0, stp1_12_0; 1337 stp1_8_0, stp1_12_0;
1340 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, 1338 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
1341 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; 1339 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
1342 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 1340 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1343 int i; 1341 int i;
1344 1342
1345 curr1 = l; 1343 curr1 = l;
1346 for (i = 0; i < 2; i++) { 1344 for (i = 0; i < 2; i++) {
1347 // 1-D idct 1345 // 1-D idct
1348 1346
1349 // Load input data. 1347 // Load input data.
1350 in[0] = _mm_load_si128((const __m128i *)input); 1348 in[0] = _mm_load_si128((const __m128i *)input);
1351 in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1)); 1349 in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
1352 in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2)); 1350 in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
1353 in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3)); 1351 in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
1354 in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4)); 1352 in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
1355 in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5)); 1353 in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
1356 in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6)); 1354 in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
1357 in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7)); 1355 in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
1358 in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8)); 1356 in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8));
1359 in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9)); 1357 in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9));
1360 in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10)); 1358 in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10));
1361 in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11)); 1359 in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11));
1362 in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12)); 1360 in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12));
1363 in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13)); 1361 in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13));
1364 in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14)); 1362 in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14));
1365 in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15)); 1363 in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15));
1366 1364
1367 array_transpose_8x8(in, in); 1365 array_transpose_8x8(in, in);
1368 array_transpose_8x8(in+8, in+8); 1366 array_transpose_8x8(in + 8, in + 8);
1369 1367
1370 IDCT16 1368 IDCT16
1371 1369
1372 // Stage7 1370 // Stage7
1373 curr1[0] = _mm_add_epi16(stp2_0, stp1_15); 1371 curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
1374 curr1[1] = _mm_add_epi16(stp2_1, stp1_14); 1372 curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
1375 curr1[2] = _mm_add_epi16(stp2_2, stp2_13); 1373 curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
1376 curr1[3] = _mm_add_epi16(stp2_3, stp2_12); 1374 curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
1377 curr1[4] = _mm_add_epi16(stp2_4, stp2_11); 1375 curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
1378 curr1[5] = _mm_add_epi16(stp2_5, stp2_10); 1376 curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
1379 curr1[6] = _mm_add_epi16(stp2_6, stp1_9); 1377 curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
1380 curr1[7] = _mm_add_epi16(stp2_7, stp1_8); 1378 curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
1381 curr1[8] = _mm_sub_epi16(stp2_7, stp1_8); 1379 curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
1382 curr1[9] = _mm_sub_epi16(stp2_6, stp1_9); 1380 curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
1383 curr1[10] = _mm_sub_epi16(stp2_5, stp2_10); 1381 curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
1384 curr1[11] = _mm_sub_epi16(stp2_4, stp2_11); 1382 curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
1385 curr1[12] = _mm_sub_epi16(stp2_3, stp2_12); 1383 curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
1386 curr1[13] = _mm_sub_epi16(stp2_2, stp2_13); 1384 curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
1387 curr1[14] = _mm_sub_epi16(stp2_1, stp1_14); 1385 curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
1388 curr1[15] = _mm_sub_epi16(stp2_0, stp1_15); 1386 curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
1389 1387
1390 curr1 = r; 1388 curr1 = r;
1391 input += 128; 1389 input += 128;
1392 } 1390 }
1393 for (i = 0; i < 2; i++) { 1391 for (i = 0; i < 2; i++) {
1394 // 1-D idct 1392 int j;
1395 array_transpose_8x8(l+i*8, in); 1393 // 1-D idct
1396 array_transpose_8x8(r+i*8, in+8); 1394 array_transpose_8x8(l + i * 8, in);
1395 array_transpose_8x8(r + i * 8, in + 8);
1397 1396
1398 IDCT16 1397 IDCT16
1399 1398
1400 // 2-D 1399 // 2-D
1401 in[0] = _mm_add_epi16(stp2_0, stp1_15); 1400 in[0] = _mm_add_epi16(stp2_0, stp1_15);
1402 in[1] = _mm_add_epi16(stp2_1, stp1_14); 1401 in[1] = _mm_add_epi16(stp2_1, stp1_14);
1403 in[2] = _mm_add_epi16(stp2_2, stp2_13); 1402 in[2] = _mm_add_epi16(stp2_2, stp2_13);
1404 in[3] = _mm_add_epi16(stp2_3, stp2_12); 1403 in[3] = _mm_add_epi16(stp2_3, stp2_12);
1405 in[4] = _mm_add_epi16(stp2_4, stp2_11); 1404 in[4] = _mm_add_epi16(stp2_4, stp2_11);
1406 in[5] = _mm_add_epi16(stp2_5, stp2_10); 1405 in[5] = _mm_add_epi16(stp2_5, stp2_10);
1407 in[6] = _mm_add_epi16(stp2_6, stp1_9); 1406 in[6] = _mm_add_epi16(stp2_6, stp1_9);
1408 in[7] = _mm_add_epi16(stp2_7, stp1_8); 1407 in[7] = _mm_add_epi16(stp2_7, stp1_8);
1409 in[8] = _mm_sub_epi16(stp2_7, stp1_8); 1408 in[8] = _mm_sub_epi16(stp2_7, stp1_8);
1410 in[9] = _mm_sub_epi16(stp2_6, stp1_9); 1409 in[9] = _mm_sub_epi16(stp2_6, stp1_9);
1411 in[10] = _mm_sub_epi16(stp2_5, stp2_10); 1410 in[10] = _mm_sub_epi16(stp2_5, stp2_10);
1412 in[11] = _mm_sub_epi16(stp2_4, stp2_11); 1411 in[11] = _mm_sub_epi16(stp2_4, stp2_11);
1413 in[12] = _mm_sub_epi16(stp2_3, stp2_12); 1412 in[12] = _mm_sub_epi16(stp2_3, stp2_12);
1414 in[13] = _mm_sub_epi16(stp2_2, stp2_13); 1413 in[13] = _mm_sub_epi16(stp2_2, stp2_13);
1415 in[14] = _mm_sub_epi16(stp2_1, stp1_14); 1414 in[14] = _mm_sub_epi16(stp2_1, stp1_14);
1416 in[15] = _mm_sub_epi16(stp2_0, stp1_15); 1415 in[15] = _mm_sub_epi16(stp2_0, stp1_15);
1417 1416
1417 for (j = 0; j < 16; ++j) {
1418 // Final rounding and shift 1418 // Final rounding and shift
1419 in[0] = _mm_adds_epi16(in[0], final_rounding); 1419 in[j] = _mm_adds_epi16(in[j], final_rounding);
1420 in[1] = _mm_adds_epi16(in[1], final_rounding); 1420 in[j] = _mm_srai_epi16(in[j], 6);
1421 in[2] = _mm_adds_epi16(in[2], final_rounding); 1421 RECON_AND_STORE(dest + j * stride, in[j]);
1422 in[3] = _mm_adds_epi16(in[3], final_rounding); 1422 }
1423 in[4] = _mm_adds_epi16(in[4], final_rounding);
1424 in[5] = _mm_adds_epi16(in[5], final_rounding);
1425 in[6] = _mm_adds_epi16(in[6], final_rounding);
1426 in[7] = _mm_adds_epi16(in[7], final_rounding);
1427 in[8] = _mm_adds_epi16(in[8], final_rounding);
1428 in[9] = _mm_adds_epi16(in[9], final_rounding);
1429 in[10] = _mm_adds_epi16(in[10], final_rounding);
1430 in[11] = _mm_adds_epi16(in[11], final_rounding);
1431 in[12] = _mm_adds_epi16(in[12], final_rounding);
1432 in[13] = _mm_adds_epi16(in[13], final_rounding);
1433 in[14] = _mm_adds_epi16(in[14], final_rounding);
1434 in[15] = _mm_adds_epi16(in[15], final_rounding);
1435 1423
1436 in[0] = _mm_srai_epi16(in[0], 6); 1424 dest += 8;
1437 in[1] = _mm_srai_epi16(in[1], 6);
1438 in[2] = _mm_srai_epi16(in[2], 6);
1439 in[3] = _mm_srai_epi16(in[3], 6);
1440 in[4] = _mm_srai_epi16(in[4], 6);
1441 in[5] = _mm_srai_epi16(in[5], 6);
1442 in[6] = _mm_srai_epi16(in[6], 6);
1443 in[7] = _mm_srai_epi16(in[7], 6);
1444 in[8] = _mm_srai_epi16(in[8], 6);
1445 in[9] = _mm_srai_epi16(in[9], 6);
1446 in[10] = _mm_srai_epi16(in[10], 6);
1447 in[11] = _mm_srai_epi16(in[11], 6);
1448 in[12] = _mm_srai_epi16(in[12], 6);
1449 in[13] = _mm_srai_epi16(in[13], 6);
1450 in[14] = _mm_srai_epi16(in[14], 6);
1451 in[15] = _mm_srai_epi16(in[15], 6);
1452
1453 RECON_AND_STORE(dest, in[0]);
1454 RECON_AND_STORE(dest, in[1]);
1455 RECON_AND_STORE(dest, in[2]);
1456 RECON_AND_STORE(dest, in[3]);
1457 RECON_AND_STORE(dest, in[4]);
1458 RECON_AND_STORE(dest, in[5]);
1459 RECON_AND_STORE(dest, in[6]);
1460 RECON_AND_STORE(dest, in[7]);
1461 RECON_AND_STORE(dest, in[8]);
1462 RECON_AND_STORE(dest, in[9]);
1463 RECON_AND_STORE(dest, in[10]);
1464 RECON_AND_STORE(dest, in[11]);
1465 RECON_AND_STORE(dest, in[12]);
1466 RECON_AND_STORE(dest, in[13]);
1467 RECON_AND_STORE(dest, in[14]);
1468 RECON_AND_STORE(dest, in[15]);
1469
1470 dest += 8 - (stride * 16);
1471 } 1425 }
1472 } 1426 }
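
The new side also changes the destination bookkeeping: the old code relied on RECON_AND_STORE advancing dest by stride on every call and then rewound with dest += 8 - (stride * 16); the new code indexes dest + j * stride explicitly, never mutates dest, and steps dest += 8 to move to the right 8-column half. A sketch of the equivalence, under the assumption that both variants write the same addresses:

    /* Old: call j writes dest0 + j * stride as a side effect of the
       previous calls, and dest ends at dest0 + 16 * stride, hence the
       rewind dest += 8 - (stride * 16).
       New: call j writes dest0 + j * stride directly; dest still equals
       dest0, so dest += 8 selects the next 8-pixel-wide column half. */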
1473 1427
1474 void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { 1428 void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
1475 __m128i dc_value; 1429 __m128i dc_value;
1476 const __m128i zero = _mm_setzero_si128(); 1430 const __m128i zero = _mm_setzero_si128();
1477 int a, i; 1431 int a, i;
1478 1432
1479 a = dct_const_round_shift(input[0] * cospi_16_64); 1433 a = dct_const_round_shift(input[0] * cospi_16_64);
1480 a = dct_const_round_shift(a * cospi_16_64); 1434 a = dct_const_round_shift(a * cospi_16_64);
1481 a = ROUND_POWER_OF_TWO(a, 6); 1435 a = ROUND_POWER_OF_TWO(a, 6);
1482 1436
1483 dc_value = _mm_set1_epi16(a); 1437 dc_value = _mm_set1_epi16(a);
1484 1438
1485 for (i = 0; i < 2; ++i) { 1439 for (i = 0; i < 2; ++i) {
1486 RECON_AND_STORE(dest, dc_value); 1440 RECON_AND_STORE(dest + 0 * stride, dc_value);
1487 RECON_AND_STORE(dest, dc_value); 1441 RECON_AND_STORE(dest + 1 * stride, dc_value);
1488 RECON_AND_STORE(dest, dc_value); 1442 RECON_AND_STORE(dest + 2 * stride, dc_value);
1489 RECON_AND_STORE(dest, dc_value); 1443 RECON_AND_STORE(dest + 3 * stride, dc_value);
1490 RECON_AND_STORE(dest, dc_value); 1444 RECON_AND_STORE(dest + 4 * stride, dc_value);
1491 RECON_AND_STORE(dest, dc_value); 1445 RECON_AND_STORE(dest + 5 * stride, dc_value);
1492 RECON_AND_STORE(dest, dc_value); 1446 RECON_AND_STORE(dest + 6 * stride, dc_value);
1493 RECON_AND_STORE(dest, dc_value); 1447 RECON_AND_STORE(dest + 7 * stride, dc_value);
1494 RECON_AND_STORE(dest, dc_value); 1448 RECON_AND_STORE(dest + 8 * stride, dc_value);
1495 RECON_AND_STORE(dest, dc_value); 1449 RECON_AND_STORE(dest + 9 * stride, dc_value);
1496 RECON_AND_STORE(dest, dc_value); 1450 RECON_AND_STORE(dest + 10 * stride, dc_value);
1497 RECON_AND_STORE(dest, dc_value); 1451 RECON_AND_STORE(dest + 11 * stride, dc_value);
1498 RECON_AND_STORE(dest, dc_value); 1452 RECON_AND_STORE(dest + 12 * stride, dc_value);
1499 RECON_AND_STORE(dest, dc_value); 1453 RECON_AND_STORE(dest + 13 * stride, dc_value);
1500 RECON_AND_STORE(dest, dc_value); 1454 RECON_AND_STORE(dest + 14 * stride, dc_value);
1501 RECON_AND_STORE(dest, dc_value); 1455 RECON_AND_STORE(dest + 15 * stride, dc_value);
1502 dest += 8 - (stride * 16); 1456 dest += 8;
1503 } 1457 }
1504 } 1458 }
1505 1459
1506 static void iadst16_8col(__m128i *in) { 1460 static void iadst16_8col(__m128i *in) {
1507 // perform 16x16 1-D ADST for 8 columns 1461 // perform 16x16 1-D ADST for 8 columns
1508 __m128i s[16], x[16], u[32], v[32]; 1462 __m128i s[16], x[16], u[32], v[32];
1509 const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); 1463 const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
1510 const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); 1464 const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
1511 const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); 1465 const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
1512 const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); 1466 const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
(...skipping 847 matching lines...)
2360 } 2314 }
2361 2315
2362 write_buffer_8x16(dest, in0, stride); 2316 write_buffer_8x16(dest, in0, stride);
2363 dest += 8; 2317 dest += 8;
2364 write_buffer_8x16(dest, in1, stride); 2318 write_buffer_8x16(dest, in1, stride);
2365 } 2319 }
2366 2320
2367 void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, 2321 void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
2368 int stride) { 2322 int stride) {
2369 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 2323 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
2370 const __m128i final_rounding = _mm_set1_epi16(1<<5); 2324 const __m128i final_rounding = _mm_set1_epi16(1 << 5);
2371 const __m128i zero = _mm_setzero_si128(); 2325 const __m128i zero = _mm_setzero_si128();
2372 2326
2373 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); 2327 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
2374 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); 2328 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
2375 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); 2329 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
2376 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); 2330 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
2377 2331
2378 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 2332 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
2379 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 2333 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
2380 2334
(...skipping 18 matching lines...)
2399 in[0] = _mm_load_si128((const __m128i *)input); 2353 in[0] = _mm_load_si128((const __m128i *)input);
2400 in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2)); 2354 in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
2401 in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4)); 2355 in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
2402 in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6)); 2356 in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
2403 2357
2404 TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]); 2358 TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
2405 2359
2406 // Stage2 2360 // Stage2
2407 { 2361 {
2408 const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero); 2362 const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
2409 const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]); 2363 const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
2410 2364
2411 tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); 2365 tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
2412 tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); 2366 tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
2413 tmp5 = _mm_madd_epi16(lo_13_3, stg2_6); 2367 tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
2414 tmp7 = _mm_madd_epi16(lo_13_3, stg2_7); 2368 tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
2415 2369
2416 tmp0 = _mm_add_epi32(tmp0, rounding); 2370 tmp0 = _mm_add_epi32(tmp0, rounding);
2417 tmp2 = _mm_add_epi32(tmp2, rounding); 2371 tmp2 = _mm_add_epi32(tmp2, rounding);
2418 tmp5 = _mm_add_epi32(tmp5, rounding); 2372 tmp5 = _mm_add_epi32(tmp5, rounding);
2419 tmp7 = _mm_add_epi32(tmp7, rounding); 2373 tmp7 = _mm_add_epi32(tmp7, rounding);
(...skipping 140 matching lines...)
2560 l[9] = _mm_sub_epi16(stp2_6, stp1_9); 2514 l[9] = _mm_sub_epi16(stp2_6, stp1_9);
2561 l[10] = _mm_sub_epi16(stp2_5, stp2_10); 2515 l[10] = _mm_sub_epi16(stp2_5, stp2_10);
2562 l[11] = _mm_sub_epi16(stp2_4, stp2_11); 2516 l[11] = _mm_sub_epi16(stp2_4, stp2_11);
2563 l[12] = _mm_sub_epi16(stp2_3, stp2_12); 2517 l[12] = _mm_sub_epi16(stp2_3, stp2_12);
2564 l[13] = _mm_sub_epi16(stp2_2, stp2_13); 2518 l[13] = _mm_sub_epi16(stp2_2, stp2_13);
2565 l[14] = _mm_sub_epi16(stp2_1, stp1_14); 2519 l[14] = _mm_sub_epi16(stp2_1, stp1_14);
2566 l[15] = _mm_sub_epi16(stp2_0, stp1_15); 2520 l[15] = _mm_sub_epi16(stp2_0, stp1_15);
2567 2521
2568 // Second 1-D inverse transform, performed per 8x16 block 2522 // Second 1-D inverse transform, performed per 8x16 block
2569 for (i = 0; i < 2; i++) { 2523 for (i = 0; i < 2; i++) {
2570 array_transpose_4X8(l + 8*i, in); 2524 int j;
2525 array_transpose_4X8(l + 8 * i, in);
2571 2526
2572 IDCT16_10 2527 IDCT16_10
2573 2528
2574 // Stage7 2529 // Stage7
2575 in[0] = _mm_add_epi16(stp2_0, stp1_15); 2530 in[0] = _mm_add_epi16(stp2_0, stp1_15);
2576 in[1] = _mm_add_epi16(stp2_1, stp1_14); 2531 in[1] = _mm_add_epi16(stp2_1, stp1_14);
2577 in[2] = _mm_add_epi16(stp2_2, stp2_13); 2532 in[2] = _mm_add_epi16(stp2_2, stp2_13);
2578 in[3] = _mm_add_epi16(stp2_3, stp2_12); 2533 in[3] = _mm_add_epi16(stp2_3, stp2_12);
2579 in[4] = _mm_add_epi16(stp2_4, stp2_11); 2534 in[4] = _mm_add_epi16(stp2_4, stp2_11);
2580 in[5] = _mm_add_epi16(stp2_5, stp2_10); 2535 in[5] = _mm_add_epi16(stp2_5, stp2_10);
2581 in[6] = _mm_add_epi16(stp2_6, stp1_9); 2536 in[6] = _mm_add_epi16(stp2_6, stp1_9);
2582 in[7] = _mm_add_epi16(stp2_7, stp1_8); 2537 in[7] = _mm_add_epi16(stp2_7, stp1_8);
2583 in[8] = _mm_sub_epi16(stp2_7, stp1_8); 2538 in[8] = _mm_sub_epi16(stp2_7, stp1_8);
2584 in[9] = _mm_sub_epi16(stp2_6, stp1_9); 2539 in[9] = _mm_sub_epi16(stp2_6, stp1_9);
2585 in[10] = _mm_sub_epi16(stp2_5, stp2_10); 2540 in[10] = _mm_sub_epi16(stp2_5, stp2_10);
2586 in[11] = _mm_sub_epi16(stp2_4, stp2_11); 2541 in[11] = _mm_sub_epi16(stp2_4, stp2_11);
2587 in[12] = _mm_sub_epi16(stp2_3, stp2_12); 2542 in[12] = _mm_sub_epi16(stp2_3, stp2_12);
2588 in[13] = _mm_sub_epi16(stp2_2, stp2_13); 2543 in[13] = _mm_sub_epi16(stp2_2, stp2_13);
2589 in[14] = _mm_sub_epi16(stp2_1, stp1_14); 2544 in[14] = _mm_sub_epi16(stp2_1, stp1_14);
2590 in[15] = _mm_sub_epi16(stp2_0, stp1_15); 2545 in[15] = _mm_sub_epi16(stp2_0, stp1_15);
2591 2546
2592 // Final rounding and shift 2547 for (j = 0; j < 16; ++j) {
2593 in[0] = _mm_adds_epi16(in[0], final_rounding); 2548 // Final rounding and shift
2594 in[1] = _mm_adds_epi16(in[1], final_rounding); 2549 in[j] = _mm_adds_epi16(in[j], final_rounding);
2595 in[2] = _mm_adds_epi16(in[2], final_rounding); 2550 in[j] = _mm_srai_epi16(in[j], 6);
2596 in[3] = _mm_adds_epi16(in[3], final_rounding); 2551 RECON_AND_STORE(dest + j * stride, in[j]);
2597 in[4] = _mm_adds_epi16(in[4], final_rounding); 2552 }
2598 in[5] = _mm_adds_epi16(in[5], final_rounding);
2599 in[6] = _mm_adds_epi16(in[6], final_rounding);
2600 in[7] = _mm_adds_epi16(in[7], final_rounding);
2601 in[8] = _mm_adds_epi16(in[8], final_rounding);
2602 in[9] = _mm_adds_epi16(in[9], final_rounding);
2603 in[10] = _mm_adds_epi16(in[10], final_rounding);
2604 in[11] = _mm_adds_epi16(in[11], final_rounding);
2605 in[12] = _mm_adds_epi16(in[12], final_rounding);
2606 in[13] = _mm_adds_epi16(in[13], final_rounding);
2607 in[14] = _mm_adds_epi16(in[14], final_rounding);
2608 in[15] = _mm_adds_epi16(in[15], final_rounding);
2609 2553
2610 in[0] = _mm_srai_epi16(in[0], 6); 2554 dest += 8;
2611 in[1] = _mm_srai_epi16(in[1], 6);
2612 in[2] = _mm_srai_epi16(in[2], 6);
2613 in[3] = _mm_srai_epi16(in[3], 6);
2614 in[4] = _mm_srai_epi16(in[4], 6);
2615 in[5] = _mm_srai_epi16(in[5], 6);
2616 in[6] = _mm_srai_epi16(in[6], 6);
2617 in[7] = _mm_srai_epi16(in[7], 6);
2618 in[8] = _mm_srai_epi16(in[8], 6);
2619 in[9] = _mm_srai_epi16(in[9], 6);
2620 in[10] = _mm_srai_epi16(in[10], 6);
2621 in[11] = _mm_srai_epi16(in[11], 6);
2622 in[12] = _mm_srai_epi16(in[12], 6);
2623 in[13] = _mm_srai_epi16(in[13], 6);
2624 in[14] = _mm_srai_epi16(in[14], 6);
2625 in[15] = _mm_srai_epi16(in[15], 6);
2626
2627 RECON_AND_STORE(dest, in[0]);
2628 RECON_AND_STORE(dest, in[1]);
2629 RECON_AND_STORE(dest, in[2]);
2630 RECON_AND_STORE(dest, in[3]);
2631 RECON_AND_STORE(dest, in[4]);
2632 RECON_AND_STORE(dest, in[5]);
2633 RECON_AND_STORE(dest, in[6]);
2634 RECON_AND_STORE(dest, in[7]);
2635 RECON_AND_STORE(dest, in[8]);
2636 RECON_AND_STORE(dest, in[9]);
2637 RECON_AND_STORE(dest, in[10]);
2638 RECON_AND_STORE(dest, in[11]);
2639 RECON_AND_STORE(dest, in[12]);
2640 RECON_AND_STORE(dest, in[13]);
2641 RECON_AND_STORE(dest, in[14]);
2642 RECON_AND_STORE(dest, in[15]);
2643
2644 dest += 8 - (stride * 16);
2645 } 2555 }
2646 } 2556 }
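
The j loop's "final rounding and shift" implements out = (x + 32) >> 6: coefficients leave the second 1-D pass scaled by 64, and the saturating add guards the +32 against 16-bit overflow before the arithmetic shift. A scalar model of one lane, assuming <stdint.h>:

#include <stdint.h>

/* Scalar model of one lane of the final rounding loop (illustrative). */
static int16_t round_shift6(int16_t x) {
  int32_t v = (int32_t)x + 32;       /* final_rounding == 1 << 5 */
  if (v > INT16_MAX) v = INT16_MAX;  /* _mm_adds_epi16 saturates */
  return (int16_t)(v >> 6);          /* matches _mm_srai_epi16(in, 6) */
}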
2647 2557
2648 #define LOAD_DQCOEFF(reg, input) \ 2558 #define LOAD_DQCOEFF(reg, input) \
2649 { \ 2559 { \
2650 reg = _mm_load_si128((const __m128i *) input); \ 2560 reg = _mm_load_si128((const __m128i *) input); \
2651 input += 8; \ 2561 input += 8; \
2652 } \ 2562 } \
2653 2563
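
LOAD_DQCOEFF reads one aligned group of eight 16-bit dequantized coefficients and advances the cursor, which lets the call sites below scatter consecutive memory rows straight into transpose-friendly slots of in[]. Usage sketch (locals hypothetical):

__m128i row0, row1;
const int16_t *p = dqcoeff;  /* hypothetical cursor into the block */
LOAD_DQCOEFF(row0, p);       /* coefficients 0..7; p advances by 8 */
LOAD_DQCOEFF(row1, p);       /* coefficients 8..15 */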
2654 #define IDCT32_34 \ 2564 #define IDCT32_34 \
(...skipping 624 matching lines...)
3279 stp1_23, stp1_24) \ 3189 stp1_23, stp1_24) \
3280 \ 3190 \
3281 stp1_28 = stp2_28; \ 3191 stp1_28 = stp2_28; \
3282 stp1_29 = stp2_29; \ 3192 stp1_29 = stp2_29; \
3283 stp1_30 = stp2_30; \ 3193 stp1_30 = stp2_30; \
3284 stp1_31 = stp2_31; \ 3194 stp1_31 = stp2_31; \
3285 } 3195 }
3286 3196
3287 // Only the upper-left 8x8 block has non-zero coeffs 3197 // Only the upper-left 8x8 block has non-zero coeffs
3288 void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, 3198 void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
3289 int stride) { 3199 int stride) {
3290 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 3200 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
3291 const __m128i final_rounding = _mm_set1_epi16(1<<5); 3201 const __m128i final_rounding = _mm_set1_epi16(1<<5);
3292 3202
3293 // idct constants for each stage 3203 // idct constants for each stage
3294 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); 3204 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
3295 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); 3205 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
3296 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); 3206 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
3297 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); 3207 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
3298 const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); 3208 const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
3299 const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); 3209 const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
(...skipping 80 matching lines...)
3380 LOAD_DQCOEFF(in[6], input); 3290 LOAD_DQCOEFF(in[6], input);
3381 LOAD_DQCOEFF(in[14], input); 3291 LOAD_DQCOEFF(in[14], input);
3382 LOAD_DQCOEFF(in[22], input); 3292 LOAD_DQCOEFF(in[22], input);
3383 LOAD_DQCOEFF(in[30], input); 3293 LOAD_DQCOEFF(in[30], input);
3384 LOAD_DQCOEFF(in[7], input); 3294 LOAD_DQCOEFF(in[7], input);
3385 LOAD_DQCOEFF(in[15], input); 3295 LOAD_DQCOEFF(in[15], input);
3386 LOAD_DQCOEFF(in[23], input); 3296 LOAD_DQCOEFF(in[23], input);
3387 LOAD_DQCOEFF(in[31], input); 3297 LOAD_DQCOEFF(in[31], input);
3388 3298
3389 array_transpose_8x8(in, in); 3299 array_transpose_8x8(in, in);
3390 array_transpose_8x8(in+8, in+8); 3300 array_transpose_8x8(in + 8, in + 8);
3391 array_transpose_8x8(in+16, in+16); 3301 array_transpose_8x8(in + 16, in + 16);
3392 array_transpose_8x8(in+24, in+24); 3302 array_transpose_8x8(in + 24, in + 24);
3393 3303
3394 IDCT32 3304 IDCT32
3395 3305
3396 // 1_D: Store 32 intermediate results for each 8x32 block. 3306 // 1_D: Store 32 intermediate results for each 8x32 block.
3397 col[0] = _mm_add_epi16(stp1_0, stp1_31); 3307 col[0] = _mm_add_epi16(stp1_0, stp1_31);
3398 col[1] = _mm_add_epi16(stp1_1, stp1_30); 3308 col[1] = _mm_add_epi16(stp1_1, stp1_30);
3399 col[2] = _mm_add_epi16(stp1_2, stp1_29); 3309 col[2] = _mm_add_epi16(stp1_2, stp1_29);
3400 col[3] = _mm_add_epi16(stp1_3, stp1_28); 3310 col[3] = _mm_add_epi16(stp1_3, stp1_28);
3401 col[4] = _mm_add_epi16(stp1_4, stp1_27); 3311 col[4] = _mm_add_epi16(stp1_4, stp1_27);
3402 col[5] = _mm_add_epi16(stp1_5, stp1_26); 3312 col[5] = _mm_add_epi16(stp1_5, stp1_26);
(...skipping 17 matching lines...)
3420 col[23] = _mm_sub_epi16(stp1_8, stp1_23); 3330 col[23] = _mm_sub_epi16(stp1_8, stp1_23);
3421 col[24] = _mm_sub_epi16(stp1_7, stp1_24); 3331 col[24] = _mm_sub_epi16(stp1_7, stp1_24);
3422 col[25] = _mm_sub_epi16(stp1_6, stp1_25); 3332 col[25] = _mm_sub_epi16(stp1_6, stp1_25);
3423 col[26] = _mm_sub_epi16(stp1_5, stp1_26); 3333 col[26] = _mm_sub_epi16(stp1_5, stp1_26);
3424 col[27] = _mm_sub_epi16(stp1_4, stp1_27); 3334 col[27] = _mm_sub_epi16(stp1_4, stp1_27);
3425 col[28] = _mm_sub_epi16(stp1_3, stp1_28); 3335 col[28] = _mm_sub_epi16(stp1_3, stp1_28);
3426 col[29] = _mm_sub_epi16(stp1_2, stp1_29); 3336 col[29] = _mm_sub_epi16(stp1_2, stp1_29);
3427 col[30] = _mm_sub_epi16(stp1_1, stp1_30); 3337 col[30] = _mm_sub_epi16(stp1_1, stp1_30);
3428 col[31] = _mm_sub_epi16(stp1_0, stp1_31); 3338 col[31] = _mm_sub_epi16(stp1_0, stp1_31);
3429 for (i = 0; i < 4; i++) { 3339 for (i = 0; i < 4; i++) {
3430 const __m128i zero = _mm_setzero_si128(); 3340 int j;
3431 // Transpose 32x8 block to 8x32 block 3341 const __m128i zero = _mm_setzero_si128();
3432 array_transpose_8x8(col+i*8, in); 3342 // Transpose 32x8 block to 8x32 block
3433 IDCT32_34 3343 array_transpose_8x8(col + i * 8, in);
3344 IDCT32_34
3434 3345
3435 // 2_D: Calculate the results and store them to destination. 3346 // 2_D: Calculate the results and store them to destination.
3436 in[0] = _mm_add_epi16(stp1_0, stp1_31); 3347 in[0] = _mm_add_epi16(stp1_0, stp1_31);
3437 in[1] = _mm_add_epi16(stp1_1, stp1_30); 3348 in[1] = _mm_add_epi16(stp1_1, stp1_30);
3438 in[2] = _mm_add_epi16(stp1_2, stp1_29); 3349 in[2] = _mm_add_epi16(stp1_2, stp1_29);
3439 in[3] = _mm_add_epi16(stp1_3, stp1_28); 3350 in[3] = _mm_add_epi16(stp1_3, stp1_28);
3440 in[4] = _mm_add_epi16(stp1_4, stp1_27); 3351 in[4] = _mm_add_epi16(stp1_4, stp1_27);
3441 in[5] = _mm_add_epi16(stp1_5, stp1_26); 3352 in[5] = _mm_add_epi16(stp1_5, stp1_26);
3442 in[6] = _mm_add_epi16(stp1_6, stp1_25); 3353 in[6] = _mm_add_epi16(stp1_6, stp1_25);
3443 in[7] = _mm_add_epi16(stp1_7, stp1_24); 3354 in[7] = _mm_add_epi16(stp1_7, stp1_24);
3444 in[8] = _mm_add_epi16(stp1_8, stp1_23); 3355 in[8] = _mm_add_epi16(stp1_8, stp1_23);
3445 in[9] = _mm_add_epi16(stp1_9, stp1_22); 3356 in[9] = _mm_add_epi16(stp1_9, stp1_22);
3446 in[10] = _mm_add_epi16(stp1_10, stp1_21); 3357 in[10] = _mm_add_epi16(stp1_10, stp1_21);
3447 in[11] = _mm_add_epi16(stp1_11, stp1_20); 3358 in[11] = _mm_add_epi16(stp1_11, stp1_20);
3448 in[12] = _mm_add_epi16(stp1_12, stp1_19); 3359 in[12] = _mm_add_epi16(stp1_12, stp1_19);
3449 in[13] = _mm_add_epi16(stp1_13, stp1_18); 3360 in[13] = _mm_add_epi16(stp1_13, stp1_18);
3450 in[14] = _mm_add_epi16(stp1_14, stp1_17); 3361 in[14] = _mm_add_epi16(stp1_14, stp1_17);
3451 in[15] = _mm_add_epi16(stp1_15, stp1_16); 3362 in[15] = _mm_add_epi16(stp1_15, stp1_16);
3452 in[16] = _mm_sub_epi16(stp1_15, stp1_16); 3363 in[16] = _mm_sub_epi16(stp1_15, stp1_16);
3453 in[17] = _mm_sub_epi16(stp1_14, stp1_17); 3364 in[17] = _mm_sub_epi16(stp1_14, stp1_17);
3454 in[18] = _mm_sub_epi16(stp1_13, stp1_18); 3365 in[18] = _mm_sub_epi16(stp1_13, stp1_18);
3455 in[19] = _mm_sub_epi16(stp1_12, stp1_19); 3366 in[19] = _mm_sub_epi16(stp1_12, stp1_19);
3456 in[20] = _mm_sub_epi16(stp1_11, stp1_20); 3367 in[20] = _mm_sub_epi16(stp1_11, stp1_20);
3457 in[21] = _mm_sub_epi16(stp1_10, stp1_21); 3368 in[21] = _mm_sub_epi16(stp1_10, stp1_21);
3458 in[22] = _mm_sub_epi16(stp1_9, stp1_22); 3369 in[22] = _mm_sub_epi16(stp1_9, stp1_22);
3459 in[23] = _mm_sub_epi16(stp1_8, stp1_23); 3370 in[23] = _mm_sub_epi16(stp1_8, stp1_23);
3460 in[24] = _mm_sub_epi16(stp1_7, stp1_24); 3371 in[24] = _mm_sub_epi16(stp1_7, stp1_24);
3461 in[25] = _mm_sub_epi16(stp1_6, stp1_25); 3372 in[25] = _mm_sub_epi16(stp1_6, stp1_25);
3462 in[26] = _mm_sub_epi16(stp1_5, stp1_26); 3373 in[26] = _mm_sub_epi16(stp1_5, stp1_26);
3463 in[27] = _mm_sub_epi16(stp1_4, stp1_27); 3374 in[27] = _mm_sub_epi16(stp1_4, stp1_27);
3464 in[28] = _mm_sub_epi16(stp1_3, stp1_28); 3375 in[28] = _mm_sub_epi16(stp1_3, stp1_28);
3465 in[29] = _mm_sub_epi16(stp1_2, stp1_29); 3376 in[29] = _mm_sub_epi16(stp1_2, stp1_29);
3466 in[30] = _mm_sub_epi16(stp1_1, stp1_30); 3377 in[30] = _mm_sub_epi16(stp1_1, stp1_30);
3467 in[31] = _mm_sub_epi16(stp1_0, stp1_31); 3378 in[31] = _mm_sub_epi16(stp1_0, stp1_31);
3468 3379
3380 for (j = 0; j < 32; ++j) {
3469 // Final rounding and shift 3381 // Final rounding and shift
3470 in[0] = _mm_adds_epi16(in[0], final_rounding); 3382 in[j] = _mm_adds_epi16(in[j], final_rounding);
3471 in[1] = _mm_adds_epi16(in[1], final_rounding); 3383 in[j] = _mm_srai_epi16(in[j], 6);
3472 in[2] = _mm_adds_epi16(in[2], final_rounding); 3384 RECON_AND_STORE(dest + j * stride, in[j]);
3473 in[3] = _mm_adds_epi16(in[3], final_rounding); 3385 }
3474 in[4] = _mm_adds_epi16(in[4], final_rounding);
3475 in[5] = _mm_adds_epi16(in[5], final_rounding);
3476 in[6] = _mm_adds_epi16(in[6], final_rounding);
3477 in[7] = _mm_adds_epi16(in[7], final_rounding);
3478 in[8] = _mm_adds_epi16(in[8], final_rounding);
3479 in[9] = _mm_adds_epi16(in[9], final_rounding);
3480 in[10] = _mm_adds_epi16(in[10], final_rounding);
3481 in[11] = _mm_adds_epi16(in[11], final_rounding);
3482 in[12] = _mm_adds_epi16(in[12], final_rounding);
3483 in[13] = _mm_adds_epi16(in[13], final_rounding);
3484 in[14] = _mm_adds_epi16(in[14], final_rounding);
3485 in[15] = _mm_adds_epi16(in[15], final_rounding);
3486 in[16] = _mm_adds_epi16(in[16], final_rounding);
3487 in[17] = _mm_adds_epi16(in[17], final_rounding);
3488 in[18] = _mm_adds_epi16(in[18], final_rounding);
3489 in[19] = _mm_adds_epi16(in[19], final_rounding);
3490 in[20] = _mm_adds_epi16(in[20], final_rounding);
3491 in[21] = _mm_adds_epi16(in[21], final_rounding);
3492 in[22] = _mm_adds_epi16(in[22], final_rounding);
3493 in[23] = _mm_adds_epi16(in[23], final_rounding);
3494 in[24] = _mm_adds_epi16(in[24], final_rounding);
3495 in[25] = _mm_adds_epi16(in[25], final_rounding);
3496 in[26] = _mm_adds_epi16(in[26], final_rounding);
3497 in[27] = _mm_adds_epi16(in[27], final_rounding);
3498 in[28] = _mm_adds_epi16(in[28], final_rounding);
3499 in[29] = _mm_adds_epi16(in[29], final_rounding);
3500 in[30] = _mm_adds_epi16(in[30], final_rounding);
3501 in[31] = _mm_adds_epi16(in[31], final_rounding);
3502 3386
3503 in[0] = _mm_srai_epi16(in[0], 6); 3387 dest += 8;
3504 in[1] = _mm_srai_epi16(in[1], 6);
3505 in[2] = _mm_srai_epi16(in[2], 6);
3506 in[3] = _mm_srai_epi16(in[3], 6);
3507 in[4] = _mm_srai_epi16(in[4], 6);
3508 in[5] = _mm_srai_epi16(in[5], 6);
3509 in[6] = _mm_srai_epi16(in[6], 6);
3510 in[7] = _mm_srai_epi16(in[7], 6);
3511 in[8] = _mm_srai_epi16(in[8], 6);
3512 in[9] = _mm_srai_epi16(in[9], 6);
3513 in[10] = _mm_srai_epi16(in[10], 6);
3514 in[11] = _mm_srai_epi16(in[11], 6);
3515 in[12] = _mm_srai_epi16(in[12], 6);
3516 in[13] = _mm_srai_epi16(in[13], 6);
3517 in[14] = _mm_srai_epi16(in[14], 6);
3518 in[15] = _mm_srai_epi16(in[15], 6);
3519 in[16] = _mm_srai_epi16(in[16], 6);
3520 in[17] = _mm_srai_epi16(in[17], 6);
3521 in[18] = _mm_srai_epi16(in[18], 6);
3522 in[19] = _mm_srai_epi16(in[19], 6);
3523 in[20] = _mm_srai_epi16(in[20], 6);
3524 in[21] = _mm_srai_epi16(in[21], 6);
3525 in[22] = _mm_srai_epi16(in[22], 6);
3526 in[23] = _mm_srai_epi16(in[23], 6);
3527 in[24] = _mm_srai_epi16(in[24], 6);
3528 in[25] = _mm_srai_epi16(in[25], 6);
3529 in[26] = _mm_srai_epi16(in[26], 6);
3530 in[27] = _mm_srai_epi16(in[27], 6);
3531 in[28] = _mm_srai_epi16(in[28], 6);
3532 in[29] = _mm_srai_epi16(in[29], 6);
3533 in[30] = _mm_srai_epi16(in[30], 6);
3534 in[31] = _mm_srai_epi16(in[31], 6);
3535
3536 RECON_AND_STORE(dest, in[0]);
3537 RECON_AND_STORE(dest, in[1]);
3538 RECON_AND_STORE(dest, in[2]);
3539 RECON_AND_STORE(dest, in[3]);
3540 RECON_AND_STORE(dest, in[4]);
3541 RECON_AND_STORE(dest, in[5]);
3542 RECON_AND_STORE(dest, in[6]);
3543 RECON_AND_STORE(dest, in[7]);
3544 RECON_AND_STORE(dest, in[8]);
3545 RECON_AND_STORE(dest, in[9]);
3546 RECON_AND_STORE(dest, in[10]);
3547 RECON_AND_STORE(dest, in[11]);
3548 RECON_AND_STORE(dest, in[12]);
3549 RECON_AND_STORE(dest, in[13]);
3550 RECON_AND_STORE(dest, in[14]);
3551 RECON_AND_STORE(dest, in[15]);
3552 RECON_AND_STORE(dest, in[16]);
3553 RECON_AND_STORE(dest, in[17]);
3554 RECON_AND_STORE(dest, in[18]);
3555 RECON_AND_STORE(dest, in[19]);
3556 RECON_AND_STORE(dest, in[20]);
3557 RECON_AND_STORE(dest, in[21]);
3558 RECON_AND_STORE(dest, in[22]);
3559 RECON_AND_STORE(dest, in[23]);
3560 RECON_AND_STORE(dest, in[24]);
3561 RECON_AND_STORE(dest, in[25]);
3562 RECON_AND_STORE(dest, in[26]);
3563 RECON_AND_STORE(dest, in[27]);
3564 RECON_AND_STORE(dest, in[28]);
3565 RECON_AND_STORE(dest, in[29]);
3566 RECON_AND_STORE(dest, in[30]);
3567 RECON_AND_STORE(dest, in[31]);
3568
3569 dest += 8 - (stride * 32);
3570 }
3571 } 3388 }
3389 }
3572 3390
3573 void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, 3391 void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
3574 int stride) { 3392 int stride) {
3575 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 3393 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
3576 const __m128i final_rounding = _mm_set1_epi16(1<<5); 3394 const __m128i final_rounding = _mm_set1_epi16(1 << 5);
3577 const __m128i zero = _mm_setzero_si128(); 3395 const __m128i zero = _mm_setzero_si128();
3578 3396
3579 // idct constants for each stage 3397 // idct constants for each stage
3580 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); 3398 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
3581 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); 3399 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
3582 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); 3400 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
3583 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); 3401 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
3584 const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); 3402 const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
3585 const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); 3403 const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
3586 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); 3404 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
(...skipping 46 matching lines...)
3633 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, 3451 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
3634 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, 3452 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
3635 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, 3453 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
3636 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, 3454 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
3637 stp2_30, stp2_31; 3455 stp2_30, stp2_31;
3638 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 3456 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3639 int i, j, i32; 3457 int i, j, i32;
3640 3458
3641 for (i = 0; i < 4; i++) { 3459 for (i = 0; i < 4; i++) {
3642 i32 = (i << 5); 3460 i32 = (i << 5);
3643 // First 1-D idct 3461 // First 1-D idct
3644 // Load input data. 3462 // Load input data.
3645 LOAD_DQCOEFF(in[0], input); 3463 LOAD_DQCOEFF(in[0], input);
3646 LOAD_DQCOEFF(in[8], input); 3464 LOAD_DQCOEFF(in[8], input);
3647 LOAD_DQCOEFF(in[16], input); 3465 LOAD_DQCOEFF(in[16], input);
3648 LOAD_DQCOEFF(in[24], input); 3466 LOAD_DQCOEFF(in[24], input);
3649 LOAD_DQCOEFF(in[1], input); 3467 LOAD_DQCOEFF(in[1], input);
3650 LOAD_DQCOEFF(in[9], input); 3468 LOAD_DQCOEFF(in[9], input);
3651 LOAD_DQCOEFF(in[17], input); 3469 LOAD_DQCOEFF(in[17], input);
3652 LOAD_DQCOEFF(in[25], input); 3470 LOAD_DQCOEFF(in[25], input);
3653 LOAD_DQCOEFF(in[2], input); 3471 LOAD_DQCOEFF(in[2], input);
3654 LOAD_DQCOEFF(in[10], input); 3472 LOAD_DQCOEFF(in[10], input);
3655 LOAD_DQCOEFF(in[18], input); 3473 LOAD_DQCOEFF(in[18], input);
3656 LOAD_DQCOEFF(in[26], input); 3474 LOAD_DQCOEFF(in[26], input);
3657 LOAD_DQCOEFF(in[3], input); 3475 LOAD_DQCOEFF(in[3], input);
3658 LOAD_DQCOEFF(in[11], input); 3476 LOAD_DQCOEFF(in[11], input);
3659 LOAD_DQCOEFF(in[19], input); 3477 LOAD_DQCOEFF(in[19], input);
3660 LOAD_DQCOEFF(in[27], input); 3478 LOAD_DQCOEFF(in[27], input);
3661 3479
3662 LOAD_DQCOEFF(in[4], input); 3480 LOAD_DQCOEFF(in[4], input);
3663 LOAD_DQCOEFF(in[12], input); 3481 LOAD_DQCOEFF(in[12], input);
3664 LOAD_DQCOEFF(in[20], input); 3482 LOAD_DQCOEFF(in[20], input);
3665 LOAD_DQCOEFF(in[28], input); 3483 LOAD_DQCOEFF(in[28], input);
3666 LOAD_DQCOEFF(in[5], input); 3484 LOAD_DQCOEFF(in[5], input);
3667 LOAD_DQCOEFF(in[13], input); 3485 LOAD_DQCOEFF(in[13], input);
3668 LOAD_DQCOEFF(in[21], input); 3486 LOAD_DQCOEFF(in[21], input);
3669 LOAD_DQCOEFF(in[29], input); 3487 LOAD_DQCOEFF(in[29], input);
3670 LOAD_DQCOEFF(in[6], input); 3488 LOAD_DQCOEFF(in[6], input);
3671 LOAD_DQCOEFF(in[14], input); 3489 LOAD_DQCOEFF(in[14], input);
3672 LOAD_DQCOEFF(in[22], input); 3490 LOAD_DQCOEFF(in[22], input);
3673 LOAD_DQCOEFF(in[30], input); 3491 LOAD_DQCOEFF(in[30], input);
3674 LOAD_DQCOEFF(in[7], input); 3492 LOAD_DQCOEFF(in[7], input);
3675 LOAD_DQCOEFF(in[15], input); 3493 LOAD_DQCOEFF(in[15], input);
3676 LOAD_DQCOEFF(in[23], input); 3494 LOAD_DQCOEFF(in[23], input);
3677 LOAD_DQCOEFF(in[31], input); 3495 LOAD_DQCOEFF(in[31], input);
3678 3496
3679 // Check whether all entries are zero 3497 // Check whether all entries are zero
3680 zero_idx[0] = _mm_or_si128(in[0], in[1]); 3498 zero_idx[0] = _mm_or_si128(in[0], in[1]);
3681 zero_idx[1] = _mm_or_si128(in[2], in[3]); 3499 zero_idx[1] = _mm_or_si128(in[2], in[3]);
3682 zero_idx[2] = _mm_or_si128(in[4], in[5]); 3500 zero_idx[2] = _mm_or_si128(in[4], in[5]);
3683 zero_idx[3] = _mm_or_si128(in[6], in[7]); 3501 zero_idx[3] = _mm_or_si128(in[6], in[7]);
3684 zero_idx[4] = _mm_or_si128(in[8], in[9]); 3502 zero_idx[4] = _mm_or_si128(in[8], in[9]);
3685 zero_idx[5] = _mm_or_si128(in[10], in[11]); 3503 zero_idx[5] = _mm_or_si128(in[10], in[11]);
3686 zero_idx[6] = _mm_or_si128(in[12], in[13]); 3504 zero_idx[6] = _mm_or_si128(in[12], in[13]);
3687 zero_idx[7] = _mm_or_si128(in[14], in[15]); 3505 zero_idx[7] = _mm_or_si128(in[14], in[15]);
3688 zero_idx[8] = _mm_or_si128(in[16], in[17]); 3506 zero_idx[8] = _mm_or_si128(in[16], in[17]);
3689 zero_idx[9] = _mm_or_si128(in[18], in[19]); 3507 zero_idx[9] = _mm_or_si128(in[18], in[19]);
3690 zero_idx[10] = _mm_or_si128(in[20], in[21]); 3508 zero_idx[10] = _mm_or_si128(in[20], in[21]);
3691 zero_idx[11] = _mm_or_si128(in[22], in[23]); 3509 zero_idx[11] = _mm_or_si128(in[22], in[23]);
3692 zero_idx[12] = _mm_or_si128(in[24], in[25]); 3510 zero_idx[12] = _mm_or_si128(in[24], in[25]);
3693 zero_idx[13] = _mm_or_si128(in[26], in[27]); 3511 zero_idx[13] = _mm_or_si128(in[26], in[27]);
3694 zero_idx[14] = _mm_or_si128(in[28], in[29]); 3512 zero_idx[14] = _mm_or_si128(in[28], in[29]);
3695 zero_idx[15] = _mm_or_si128(in[30], in[31]); 3513 zero_idx[15] = _mm_or_si128(in[30], in[31]);
3696 3514
3697 zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]); 3515 zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
3698 zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]); 3516 zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
3699 zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]); 3517 zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
3700 zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]); 3518 zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
3701 zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]); 3519 zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
3702 zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]); 3520 zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
3703 zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]); 3521 zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
3704 zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]); 3522 zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
3705 3523
3706 zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]); 3524 zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
3707 zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]); 3525 zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
3708 zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]); 3526 zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
3709 zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]); 3527 zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
3710 zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]); 3528 zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
3711 zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]); 3529 zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
3712 zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]); 3530 zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
3713 3531
3714 if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) { 3532 if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
3715 col[i32 + 0] = _mm_setzero_si128(); 3533 col[i32 + 0] = _mm_setzero_si128();
3716 col[i32 + 1] = _mm_setzero_si128(); 3534 col[i32 + 1] = _mm_setzero_si128();
3717 col[i32 + 2] = _mm_setzero_si128(); 3535 col[i32 + 2] = _mm_setzero_si128();
3718 col[i32 + 3] = _mm_setzero_si128(); 3536 col[i32 + 3] = _mm_setzero_si128();
3719 col[i32 + 4] = _mm_setzero_si128(); 3537 col[i32 + 4] = _mm_setzero_si128();
3720 col[i32 + 5] = _mm_setzero_si128(); 3538 col[i32 + 5] = _mm_setzero_si128();
3721 col[i32 + 6] = _mm_setzero_si128(); 3539 col[i32 + 6] = _mm_setzero_si128();
3722 col[i32 + 7] = _mm_setzero_si128(); 3540 col[i32 + 7] = _mm_setzero_si128();
3723 col[i32 + 8] = _mm_setzero_si128(); 3541 col[i32 + 8] = _mm_setzero_si128();
3724 col[i32 + 9] = _mm_setzero_si128(); 3542 col[i32 + 9] = _mm_setzero_si128();
3725 col[i32 + 10] = _mm_setzero_si128(); 3543 col[i32 + 10] = _mm_setzero_si128();
3726 col[i32 + 11] = _mm_setzero_si128(); 3544 col[i32 + 11] = _mm_setzero_si128();
3727 col[i32 + 12] = _mm_setzero_si128(); 3545 col[i32 + 12] = _mm_setzero_si128();
3728 col[i32 + 13] = _mm_setzero_si128(); 3546 col[i32 + 13] = _mm_setzero_si128();
3729 col[i32 + 14] = _mm_setzero_si128(); 3547 col[i32 + 14] = _mm_setzero_si128();
3730 col[i32 + 15] = _mm_setzero_si128(); 3548 col[i32 + 15] = _mm_setzero_si128();
3731 col[i32 + 16] = _mm_setzero_si128(); 3549 col[i32 + 16] = _mm_setzero_si128();
3732 col[i32 + 17] = _mm_setzero_si128(); 3550 col[i32 + 17] = _mm_setzero_si128();
3733 col[i32 + 18] = _mm_setzero_si128(); 3551 col[i32 + 18] = _mm_setzero_si128();
3734 col[i32 + 19] = _mm_setzero_si128(); 3552 col[i32 + 19] = _mm_setzero_si128();
3735 col[i32 + 20] = _mm_setzero_si128(); 3553 col[i32 + 20] = _mm_setzero_si128();
3736 col[i32 + 21] = _mm_setzero_si128(); 3554 col[i32 + 21] = _mm_setzero_si128();
3737 col[i32 + 22] = _mm_setzero_si128(); 3555 col[i32 + 22] = _mm_setzero_si128();
3738 col[i32 + 23] = _mm_setzero_si128(); 3556 col[i32 + 23] = _mm_setzero_si128();
3739 col[i32 + 24] = _mm_setzero_si128(); 3557 col[i32 + 24] = _mm_setzero_si128();
3740 col[i32 + 25] = _mm_setzero_si128(); 3558 col[i32 + 25] = _mm_setzero_si128();
3741 col[i32 + 26] = _mm_setzero_si128(); 3559 col[i32 + 26] = _mm_setzero_si128();
3742 col[i32 + 27] = _mm_setzero_si128(); 3560 col[i32 + 27] = _mm_setzero_si128();
3743 col[i32 + 28] = _mm_setzero_si128(); 3561 col[i32 + 28] = _mm_setzero_si128();
3744 col[i32 + 29] = _mm_setzero_si128(); 3562 col[i32 + 29] = _mm_setzero_si128();
3745 col[i32 + 30] = _mm_setzero_si128(); 3563 col[i32 + 30] = _mm_setzero_si128();
3746 col[i32 + 31] = _mm_setzero_si128(); 3564 col[i32 + 31] = _mm_setzero_si128();
3747 continue; 3565 continue;
3748 }
3749
3750 // Transpose 32x8 block to 8x32 block
3751 array_transpose_8x8(in, in);
3752 array_transpose_8x8(in+8, in+8);
3753 array_transpose_8x8(in+16, in+16);
3754 array_transpose_8x8(in+24, in+24);
3755
3756 IDCT32
3757
3758 // 1_D: Store 32 intermediate results for each 8x32 block.
3759 col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
3760 col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
3761 col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
3762 col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
3763 col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
3764 col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
3765 col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
3766 col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
3767 col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
3768 col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
3769 col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
3770 col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
3771 col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
3772 col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
3773 col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
3774 col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
3775 col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
3776 col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
3777 col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
3778 col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
3779 col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
3780 col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
3781 col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
3782 col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
3783 col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
3784 col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
3785 col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
3786 col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
3787 col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
3788 col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
3789 col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
3790 col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
3791 } 3566 }
3567
3568 // Transpose 32x8 block to 8x32 block
3569 array_transpose_8x8(in, in);
3570 array_transpose_8x8(in + 8, in + 8);
3571 array_transpose_8x8(in + 16, in + 16);
3572 array_transpose_8x8(in + 24, in + 24);
3573
3574 IDCT32
3575
3576 // 1_D: Store 32 intermediate results for each 8x32 block.
3577 col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
3578 col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
3579 col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
3580 col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
3581 col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
3582 col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
3583 col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
3584 col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
3585 col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
3586 col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
3587 col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
3588 col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
3589 col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
3590 col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
3591 col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
3592 col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
3593 col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
3594 col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
3595 col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
3596 col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
3597 col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
3598 col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
3599 col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
3600 col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
3601 col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
3602 col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
3603 col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
3604 col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
3605 col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
3606 col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
3607 col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
3608 col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
3609 }
3792 for (i = 0; i < 4; i++) { 3610 for (i = 0; i < 4; i++) {
3793 // Second 1-D idct 3611 // Second 1-D idct
3794 j = i << 3; 3612 j = i << 3;
3795 3613
3796 // Transpose 32x8 block to 8x32 block 3614 // Transpose 32x8 block to 8x32 block
3797 array_transpose_8x8(col+j, in); 3615 array_transpose_8x8(col + j, in);
3798 array_transpose_8x8(col+j+32, in+8); 3616 array_transpose_8x8(col + j + 32, in + 8);
3799 array_transpose_8x8(col+j+64, in+16); 3617 array_transpose_8x8(col + j + 64, in + 16);
3800 array_transpose_8x8(col+j+96, in+24); 3618 array_transpose_8x8(col + j + 96, in + 24);
3801 3619
3802 IDCT32 3620 IDCT32
3803 3621
3804 // 2_D: Calculate the results and store them to destination. 3622 // 2_D: Calculate the results and store them to destination.
3805 in[0] = _mm_add_epi16(stp1_0, stp1_31); 3623 in[0] = _mm_add_epi16(stp1_0, stp1_31);
3806 in[1] = _mm_add_epi16(stp1_1, stp1_30); 3624 in[1] = _mm_add_epi16(stp1_1, stp1_30);
3807 in[2] = _mm_add_epi16(stp1_2, stp1_29); 3625 in[2] = _mm_add_epi16(stp1_2, stp1_29);
3808 in[3] = _mm_add_epi16(stp1_3, stp1_28); 3626 in[3] = _mm_add_epi16(stp1_3, stp1_28);
3809 in[4] = _mm_add_epi16(stp1_4, stp1_27); 3627 in[4] = _mm_add_epi16(stp1_4, stp1_27);
3810 in[5] = _mm_add_epi16(stp1_5, stp1_26); 3628 in[5] = _mm_add_epi16(stp1_5, stp1_26);
3811 in[6] = _mm_add_epi16(stp1_6, stp1_25); 3629 in[6] = _mm_add_epi16(stp1_6, stp1_25);
3812 in[7] = _mm_add_epi16(stp1_7, stp1_24); 3630 in[7] = _mm_add_epi16(stp1_7, stp1_24);
3813 in[8] = _mm_add_epi16(stp1_8, stp1_23); 3631 in[8] = _mm_add_epi16(stp1_8, stp1_23);
3814 in[9] = _mm_add_epi16(stp1_9, stp1_22); 3632 in[9] = _mm_add_epi16(stp1_9, stp1_22);
3815 in[10] = _mm_add_epi16(stp1_10, stp1_21); 3633 in[10] = _mm_add_epi16(stp1_10, stp1_21);
3816 in[11] = _mm_add_epi16(stp1_11, stp1_20); 3634 in[11] = _mm_add_epi16(stp1_11, stp1_20);
3817 in[12] = _mm_add_epi16(stp1_12, stp1_19); 3635 in[12] = _mm_add_epi16(stp1_12, stp1_19);
3818 in[13] = _mm_add_epi16(stp1_13, stp1_18); 3636 in[13] = _mm_add_epi16(stp1_13, stp1_18);
3819 in[14] = _mm_add_epi16(stp1_14, stp1_17); 3637 in[14] = _mm_add_epi16(stp1_14, stp1_17);
3820 in[15] = _mm_add_epi16(stp1_15, stp1_16); 3638 in[15] = _mm_add_epi16(stp1_15, stp1_16);
3821 in[16] = _mm_sub_epi16(stp1_15, stp1_16); 3639 in[16] = _mm_sub_epi16(stp1_15, stp1_16);
3822 in[17] = _mm_sub_epi16(stp1_14, stp1_17); 3640 in[17] = _mm_sub_epi16(stp1_14, stp1_17);
3823 in[18] = _mm_sub_epi16(stp1_13, stp1_18); 3641 in[18] = _mm_sub_epi16(stp1_13, stp1_18);
3824 in[19] = _mm_sub_epi16(stp1_12, stp1_19); 3642 in[19] = _mm_sub_epi16(stp1_12, stp1_19);
3825 in[20] = _mm_sub_epi16(stp1_11, stp1_20); 3643 in[20] = _mm_sub_epi16(stp1_11, stp1_20);
3826 in[21] = _mm_sub_epi16(stp1_10, stp1_21); 3644 in[21] = _mm_sub_epi16(stp1_10, stp1_21);
3827 in[22] = _mm_sub_epi16(stp1_9, stp1_22); 3645 in[22] = _mm_sub_epi16(stp1_9, stp1_22);
3828 in[23] = _mm_sub_epi16(stp1_8, stp1_23); 3646 in[23] = _mm_sub_epi16(stp1_8, stp1_23);
3829 in[24] = _mm_sub_epi16(stp1_7, stp1_24); 3647 in[24] = _mm_sub_epi16(stp1_7, stp1_24);
3830 in[25] = _mm_sub_epi16(stp1_6, stp1_25); 3648 in[25] = _mm_sub_epi16(stp1_6, stp1_25);
3831 in[26] = _mm_sub_epi16(stp1_5, stp1_26); 3649 in[26] = _mm_sub_epi16(stp1_5, stp1_26);
3832 in[27] = _mm_sub_epi16(stp1_4, stp1_27); 3650 in[27] = _mm_sub_epi16(stp1_4, stp1_27);
3833 in[28] = _mm_sub_epi16(stp1_3, stp1_28); 3651 in[28] = _mm_sub_epi16(stp1_3, stp1_28);
3834 in[29] = _mm_sub_epi16(stp1_2, stp1_29); 3652 in[29] = _mm_sub_epi16(stp1_2, stp1_29);
3835 in[30] = _mm_sub_epi16(stp1_1, stp1_30); 3653 in[30] = _mm_sub_epi16(stp1_1, stp1_30);
3836 in[31] = _mm_sub_epi16(stp1_0, stp1_31); 3654 in[31] = _mm_sub_epi16(stp1_0, stp1_31);
3837 3655
3656 for (j = 0; j < 32; ++j) {
3838 // Final rounding and shift 3657 // Final rounding and shift
3839 in[0] = _mm_adds_epi16(in[0], final_rounding); 3658 in[j] = _mm_adds_epi16(in[j], final_rounding);
3840 in[1] = _mm_adds_epi16(in[1], final_rounding); 3659 in[j] = _mm_srai_epi16(in[j], 6);
3841 in[2] = _mm_adds_epi16(in[2], final_rounding); 3660 RECON_AND_STORE(dest + j * stride, in[j]);
3842 in[3] = _mm_adds_epi16(in[3], final_rounding);
3843 in[4] = _mm_adds_epi16(in[4], final_rounding);
3844 in[5] = _mm_adds_epi16(in[5], final_rounding);
3845 in[6] = _mm_adds_epi16(in[6], final_rounding);
3846 in[7] = _mm_adds_epi16(in[7], final_rounding);
3847 in[8] = _mm_adds_epi16(in[8], final_rounding);
3848 in[9] = _mm_adds_epi16(in[9], final_rounding);
3849 in[10] = _mm_adds_epi16(in[10], final_rounding);
3850 in[11] = _mm_adds_epi16(in[11], final_rounding);
3851 in[12] = _mm_adds_epi16(in[12], final_rounding);
3852 in[13] = _mm_adds_epi16(in[13], final_rounding);
3853 in[14] = _mm_adds_epi16(in[14], final_rounding);
3854 in[15] = _mm_adds_epi16(in[15], final_rounding);
3855 in[16] = _mm_adds_epi16(in[16], final_rounding);
3856 in[17] = _mm_adds_epi16(in[17], final_rounding);
3857 in[18] = _mm_adds_epi16(in[18], final_rounding);
3858 in[19] = _mm_adds_epi16(in[19], final_rounding);
3859 in[20] = _mm_adds_epi16(in[20], final_rounding);
3860 in[21] = _mm_adds_epi16(in[21], final_rounding);
3861 in[22] = _mm_adds_epi16(in[22], final_rounding);
3862 in[23] = _mm_adds_epi16(in[23], final_rounding);
3863 in[24] = _mm_adds_epi16(in[24], final_rounding);
3864 in[25] = _mm_adds_epi16(in[25], final_rounding);
3865 in[26] = _mm_adds_epi16(in[26], final_rounding);
3866 in[27] = _mm_adds_epi16(in[27], final_rounding);
3867 in[28] = _mm_adds_epi16(in[28], final_rounding);
3868 in[29] = _mm_adds_epi16(in[29], final_rounding);
3869 in[30] = _mm_adds_epi16(in[30], final_rounding);
3870 in[31] = _mm_adds_epi16(in[31], final_rounding);
3871
3872 in[0] = _mm_srai_epi16(in[0], 6);
3873 in[1] = _mm_srai_epi16(in[1], 6);
3874 in[2] = _mm_srai_epi16(in[2], 6);
3875 in[3] = _mm_srai_epi16(in[3], 6);
3876 in[4] = _mm_srai_epi16(in[4], 6);
3877 in[5] = _mm_srai_epi16(in[5], 6);
3878 in[6] = _mm_srai_epi16(in[6], 6);
3879 in[7] = _mm_srai_epi16(in[7], 6);
3880 in[8] = _mm_srai_epi16(in[8], 6);
3881 in[9] = _mm_srai_epi16(in[9], 6);
3882 in[10] = _mm_srai_epi16(in[10], 6);
3883 in[11] = _mm_srai_epi16(in[11], 6);
3884 in[12] = _mm_srai_epi16(in[12], 6);
3885 in[13] = _mm_srai_epi16(in[13], 6);
3886 in[14] = _mm_srai_epi16(in[14], 6);
3887 in[15] = _mm_srai_epi16(in[15], 6);
3888 in[16] = _mm_srai_epi16(in[16], 6);
3889 in[17] = _mm_srai_epi16(in[17], 6);
3890 in[18] = _mm_srai_epi16(in[18], 6);
3891 in[19] = _mm_srai_epi16(in[19], 6);
3892 in[20] = _mm_srai_epi16(in[20], 6);
3893 in[21] = _mm_srai_epi16(in[21], 6);
3894 in[22] = _mm_srai_epi16(in[22], 6);
3895 in[23] = _mm_srai_epi16(in[23], 6);
3896 in[24] = _mm_srai_epi16(in[24], 6);
3897 in[25] = _mm_srai_epi16(in[25], 6);
3898 in[26] = _mm_srai_epi16(in[26], 6);
3899 in[27] = _mm_srai_epi16(in[27], 6);
3900 in[28] = _mm_srai_epi16(in[28], 6);
3901 in[29] = _mm_srai_epi16(in[29], 6);
3902 in[30] = _mm_srai_epi16(in[30], 6);
3903 in[31] = _mm_srai_epi16(in[31], 6);
3904
3905 RECON_AND_STORE(dest, in[0]);
3906 RECON_AND_STORE(dest, in[1]);
3907 RECON_AND_STORE(dest, in[2]);
3908 RECON_AND_STORE(dest, in[3]);
3909 RECON_AND_STORE(dest, in[4]);
3910 RECON_AND_STORE(dest, in[5]);
3911 RECON_AND_STORE(dest, in[6]);
3912 RECON_AND_STORE(dest, in[7]);
3913 RECON_AND_STORE(dest, in[8]);
3914 RECON_AND_STORE(dest, in[9]);
3915 RECON_AND_STORE(dest, in[10]);
3916 RECON_AND_STORE(dest, in[11]);
3917 RECON_AND_STORE(dest, in[12]);
3918 RECON_AND_STORE(dest, in[13]);
3919 RECON_AND_STORE(dest, in[14]);
3920 RECON_AND_STORE(dest, in[15]);
3921 RECON_AND_STORE(dest, in[16]);
3922 RECON_AND_STORE(dest, in[17]);
3923 RECON_AND_STORE(dest, in[18]);
3924 RECON_AND_STORE(dest, in[19]);
3925 RECON_AND_STORE(dest, in[20]);
3926 RECON_AND_STORE(dest, in[21]);
3927 RECON_AND_STORE(dest, in[22]);
3928 RECON_AND_STORE(dest, in[23]);
3929 RECON_AND_STORE(dest, in[24]);
3930 RECON_AND_STORE(dest, in[25]);
3931 RECON_AND_STORE(dest, in[26]);
3932 RECON_AND_STORE(dest, in[27]);
3933 RECON_AND_STORE(dest, in[28]);
3934 RECON_AND_STORE(dest, in[29]);
3935 RECON_AND_STORE(dest, in[30]);
3936 RECON_AND_STORE(dest, in[31]);
3937
3938 dest += 8 - (stride * 32);
3939 } 3661 }
3940 } //NOLINT 3662
3663 dest += 8;
3664 }
3665 }
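
The early-out above OR-reduces all 32 coefficient vectors into a single register and compares it with zero; _mm_movemask_epi8 yields 0xFFFF only if every byte compared equal, i.e. the whole 8x32 strip is zero and the column results can be zeroed without running IDCT32. The same test in compact form (a sketch, not the unrolled tree used here):

#include <emmintrin.h>

/* Returns nonzero iff all n vectors are zero (SSE2 only). */
static int all_zero_sse2(const __m128i *v, int n) {
  __m128i acc = v[0];
  int k;
  for (k = 1; k < n; ++k) acc = _mm_or_si128(acc, v[k]);
  return _mm_movemask_epi8(_mm_cmpeq_epi32(acc, _mm_setzero_si128())) ==
         0xFFFF;
}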
3941 3666
3942 void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { 3667 void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
3943 __m128i dc_value; 3668 __m128i dc_value;
3944 const __m128i zero = _mm_setzero_si128(); 3669 const __m128i zero = _mm_setzero_si128();
3945 int a, i; 3670 int a, i;
3946 3671
3947 a = dct_const_round_shift(input[0] * cospi_16_64); 3672 a = dct_const_round_shift(input[0] * cospi_16_64);
3948 a = dct_const_round_shift(a * cospi_16_64); 3673 a = dct_const_round_shift(a * cospi_16_64);
3949 a = ROUND_POWER_OF_TWO(a, 6); 3674 a = ROUND_POWER_OF_TWO(a, 6);
3950 3675
3951 dc_value = _mm_set1_epi16(a); 3676 dc_value = _mm_set1_epi16(a);
3952 3677
3953 for (i = 0; i < 4; ++i) { 3678 for (i = 0; i < 4; ++i) {
3954 RECON_AND_STORE(dest, dc_value); 3679 int j;
3955 RECON_AND_STORE(dest, dc_value); 3680 for (j = 0; j < 32; ++j) {
3956 RECON_AND_STORE(dest, dc_value); 3681 RECON_AND_STORE(dest + j * stride, dc_value);
3957 RECON_AND_STORE(dest, dc_value); 3682 }
3958 RECON_AND_STORE(dest, dc_value); 3683 dest += 8;
3959 RECON_AND_STORE(dest, dc_value);
3960 RECON_AND_STORE(dest, dc_value);
3961 RECON_AND_STORE(dest, dc_value);
3962 RECON_AND_STORE(dest, dc_value);
3963 RECON_AND_STORE(dest, dc_value);
3964 RECON_AND_STORE(dest, dc_value);
3965 RECON_AND_STORE(dest, dc_value);
3966 RECON_AND_STORE(dest, dc_value);
3967 RECON_AND_STORE(dest, dc_value);
3968 RECON_AND_STORE(dest, dc_value);
3969 RECON_AND_STORE(dest, dc_value);
3970 RECON_AND_STORE(dest, dc_value);
3971 RECON_AND_STORE(dest, dc_value);
3972 RECON_AND_STORE(dest, dc_value);
3973 RECON_AND_STORE(dest, dc_value);
3974 RECON_AND_STORE(dest, dc_value);
3975 RECON_AND_STORE(dest, dc_value);
3976 RECON_AND_STORE(dest, dc_value);
3977 RECON_AND_STORE(dest, dc_value);
3978 RECON_AND_STORE(dest, dc_value);
3979 RECON_AND_STORE(dest, dc_value);
3980 RECON_AND_STORE(dest, dc_value);
3981 RECON_AND_STORE(dest, dc_value);
3982 RECON_AND_STORE(dest, dc_value);
3983 RECON_AND_STORE(dest, dc_value);
3984 RECON_AND_STORE(dest, dc_value);
3985 RECON_AND_STORE(dest, dc_value);
3986 dest += 8 - (stride * 32);
3987 } 3684 }
3988 } 3685 }
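
When only the DC coefficient survives, the whole 32x32 inverse transform reduces to one constant: input[0] is scaled by cospi_16_64 once per 1-D pass through dct_const_round_shift, rounded by ROUND_POWER_OF_TWO(a, 6), and added to every pixel. A scalar equivalent, assuming the dct_const_round_shift/ROUND_POWER_OF_TWO/clip_pixel helpers from vp9's common headers:

/* Scalar model of the DC-only 32x32 path (function name hypothetical). */
static void idct32x32_1_add_sketch(const int16_t *input, uint8_t *dest,
                                   int stride) {
  int r, c;
  int a = (int)dct_const_round_shift(input[0] * cospi_16_64);
  a = (int)dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 6);
  for (r = 0; r < 32; ++r)
    for (c = 0; c < 32; ++c)
      dest[r * stride + c] = clip_pixel(dest[r * stride + c] + a);
}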
3989 3686
3990 #if CONFIG_VP9_HIGHBITDEPTH 3687 #if CONFIG_VP9_HIGHBITDEPTH
3991 static INLINE __m128i clamp_high_sse2(__m128i value, int bd) { 3688 static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
3992 __m128i ubounded, retval; 3689 __m128i ubounded, retval;
3993 const __m128i zero = _mm_set1_epi16(0); 3690 const __m128i zero = _mm_set1_epi16(0);
3994 const __m128i one = _mm_set1_epi16(1); 3691 const __m128i one = _mm_set1_epi16(1);
3995 const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one); 3692 const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one);
3996 ubounded = _mm_cmpgt_epi16(value, max); 3693 ubounded = _mm_cmpgt_epi16(value, max);
3997 retval = _mm_andnot_si128(ubounded, value); 3694 retval = _mm_andnot_si128(ubounded, value);
3998 ubounded = _mm_and_si128(ubounded, max); 3695 ubounded = _mm_and_si128(ubounded, max);
3999 retval = _mm_or_si128(retval, ubounded); 3696 retval = _mm_or_si128(retval, ubounded);
4000 retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero)); 3697 retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
4001 return retval; 3698 return retval;
4002 } 3699 }
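
clamp_high_sse2 is a branchless clamp to [0, (1 << bd) - 1]: the cmpgt mask selects max in overflowing lanes via the andnot/and/or triple, and the final and against (retval > 0) zeroes non-positive lanes. Per-lane scalar restatement:

#include <stdint.h>

/* Per-lane behaviour of clamp_high_sse2 (scalar restatement). */
static int16_t clamp_high_scalar(int16_t value, int bd) {
  const int16_t max = (int16_t)((1 << bd) - 1);  /* 255 / 1023 / 4095 */
  if (value > max) value = max;  /* cmpgt + andnot/and/or select */
  if (value <= 0) value = 0;     /* and with the (retval > 0) mask */
  return value;
}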
4003 3700
4004 void vp9_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8, 3701 void vp9_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
4005 int stride, int bd) { 3702 int stride, int bd) {
4006 tran_low_t out[4 * 4]; 3703 tran_low_t out[4 * 4];
4007 tran_low_t *outptr = out; 3704 tran_low_t *outptr = out;
4008 int i, j; 3705 int i, j;
4009 __m128i inptr[4]; 3706 __m128i inptr[4];
4010 __m128i sign_bits[2]; 3707 __m128i sign_bits[2];
4011 __m128i temp_mm, min_input, max_input; 3708 __m128i temp_mm, min_input, max_input;
4012 int test; 3709 int test;
4013 uint16_t * dest = CONVERT_TO_SHORTPTR(dest8); 3710 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
4014 int optimised_cols = 0; 3711 int optimised_cols = 0;
4015 const __m128i zero = _mm_set1_epi16(0); 3712 const __m128i zero = _mm_set1_epi16(0);
4016 const __m128i eight = _mm_set1_epi16(8); 3713 const __m128i eight = _mm_set1_epi16(8);
4017 const __m128i max = _mm_set1_epi16(12043); 3714 const __m128i max = _mm_set1_epi16(12043);
4018 const __m128i min = _mm_set1_epi16(-12043); 3715 const __m128i min = _mm_set1_epi16(-12043);
4019 // Load input into __m128i 3716 // Load input into __m128i
4020 inptr[0] = _mm_loadu_si128((const __m128i *)input); 3717 inptr[0] = _mm_loadu_si128((const __m128i *)input);
4021 inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4)); 3718 inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
4022 inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8)); 3719 inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
4023 inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12)); 3720 inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
(...skipping 22 matching lines...)
4046 test = _mm_movemask_epi8(temp_mm); 3743 test = _mm_movemask_epi8(temp_mm);
4047 3744
4048 if (test) { 3745 if (test) {
4049 transpose_4x4(inptr); 3746 transpose_4x4(inptr);
4050 sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero); 3747 sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
4051 sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero); 3748 sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
4052 inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]); 3749 inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
4053 inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]); 3750 inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
4054 inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]); 3751 inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
4055 inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]); 3752 inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
4056 _mm_storeu_si128((__m128i*)outptr, inptr[0]); 3753 _mm_storeu_si128((__m128i *)outptr, inptr[0]);
4057 _mm_storeu_si128((__m128i*)(outptr + 4), inptr[1]); 3754 _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
4058 _mm_storeu_si128((__m128i*)(outptr + 8), inptr[2]); 3755 _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
4059 _mm_storeu_si128((__m128i*)(outptr + 12), inptr[3]); 3756 _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
4060 } else { 3757 } else {
4061 // Set to use the optimised transform for the column 3758 // Set to use the optimised transform for the column
4062 optimised_cols = 1; 3759 optimised_cols = 1;
4063 } 3760 }
4064 } else { 3761 } else {
4065 // Run the un-optimised row transform 3762 // Run the un-optimised row transform
4066 for (i = 0; i < 4; ++i) { 3763 for (i = 0; i < 4; ++i) {
4067 vp9_highbd_idct4(input, outptr, bd); 3764 vp9_highbd_idct4(input, outptr, bd);
4068 input += 4; 3765 input += 4;
4069 outptr += 4; 3766 outptr += 4;
4070 } 3767 }
4071 } 3768 }
4072 3769
4073 if (optimised_cols) { 3770 if (optimised_cols) {
4074 idct4_sse2(inptr); 3771 idct4_sse2(inptr);
4075 3772
4076 // Final round and shift 3773 // Final round and shift
4077 inptr[0] = _mm_add_epi16(inptr[0], eight); 3774 inptr[0] = _mm_add_epi16(inptr[0], eight);
4078 inptr[1] = _mm_add_epi16(inptr[1], eight); 3775 inptr[1] = _mm_add_epi16(inptr[1], eight);
4079 3776
4080 inptr[0] = _mm_srai_epi16(inptr[0], 4); 3777 inptr[0] = _mm_srai_epi16(inptr[0], 4);
4081 inptr[1] = _mm_srai_epi16(inptr[1], 4); 3778 inptr[1] = _mm_srai_epi16(inptr[1], 4);
4082 3779
4083 // Reconstruction and Store 3780 // Reconstruction and Store
4084 { 3781 {
4085 __m128i d0 = _mm_loadl_epi64((const __m128i *)dest); 3782 __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
4086 __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2)); 3783 __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
4087 d0 = _mm_unpacklo_epi64(d0, 3784 d0 = _mm_unpacklo_epi64(
4088 _mm_loadl_epi64((const __m128i *)(dest + stride))); 3785 d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
4089 d2 = _mm_unpacklo_epi64(d2, 3786 d2 = _mm_unpacklo_epi64(
4090 _mm_loadl_epi64((const __m128i *)(dest + stride * 3))); 3787 d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
4091 d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd); 3788 d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
4092 d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd); 3789 d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
4093 // store input0 3790 // store input0
4094 _mm_storel_epi64((__m128i *)dest, d0); 3791 _mm_storel_epi64((__m128i *)dest, d0);
4095 // store input1 3792 // store input1
4096 d0 = _mm_srli_si128(d0, 8); 3793 d0 = _mm_srli_si128(d0, 8);
4097 _mm_storel_epi64((__m128i *)(dest + stride), d0); 3794 _mm_storel_epi64((__m128i *)(dest + stride), d0);
4098 // store input2 3795 // store input2
4099 _mm_storel_epi64((__m128i *)(dest + stride * 2), d2); 3796 _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
4100 // store input3 3797 // store input3
(...skipping 10 matching lines...)
4111 vp9_highbd_idct4(temp_in, temp_out, bd); 3808 vp9_highbd_idct4(temp_in, temp_out, bd);
4112 for (j = 0; j < 4; ++j) { 3809 for (j = 0; j < 4; ++j) {
4113 dest[j * stride + i] = highbd_clip_pixel_add( 3810 dest[j * stride + i] = highbd_clip_pixel_add(
4114 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); 3811 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
4115 } 3812 }
4116 } 3813 }
4117 } 3814 }
4118 } 3815 }
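
The strategy above (shared by the 8x8 and 16x16 variants below) is: pack the 32-bit coefficients to 16 bits, min/max-reduce the rows, and test against a transform-specific bound (±12043 here, ±6201 for 8x8); if any lane is out of range the 16-bit fast path could overflow, so the block is routed to the high-precision C fallback. The gate in isolation, as a sketch:

#include <emmintrin.h>
#include <stdint.h>

/* Sketch of the range gate; 'bound' is transform-specific. */
static int rows_fit_16bit(const __m128i *in, int n, int16_t bound) {
  const __m128i maxv = _mm_set1_epi16(bound);
  const __m128i minv = _mm_set1_epi16((int16_t)-bound);
  __m128i hi = in[0], lo = in[0];
  int k;
  for (k = 1; k < n; ++k) {
    hi = _mm_max_epi16(hi, in[k]);
    lo = _mm_min_epi16(lo, in[k]);
  }
  hi = _mm_cmpgt_epi16(hi, maxv);  /* any lane above +bound */
  lo = _mm_cmplt_epi16(lo, minv);  /* any lane below -bound */
  return _mm_movemask_epi8(_mm_or_si128(hi, lo)) == 0;
}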
4119 3816
4120 void vp9_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8, 3817 void vp9_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
4121 int stride, int bd) { 3818 int stride, int bd) {
4122 tran_low_t out[8 * 8]; 3819 tran_low_t out[8 * 8];
4123 tran_low_t *outptr = out; 3820 tran_low_t *outptr = out;
4124 int i, j, test; 3821 int i, j, test;
4125 __m128i inptr[8]; 3822 __m128i inptr[8];
4126 __m128i min_input, max_input, temp1, temp2, sign_bits; 3823 __m128i min_input, max_input, temp1, temp2, sign_bits;
4127 uint16_t * dest = CONVERT_TO_SHORTPTR(dest8); 3824 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
4128 const __m128i zero = _mm_set1_epi16(0); 3825 const __m128i zero = _mm_set1_epi16(0);
4129 const __m128i sixteen = _mm_set1_epi16(16); 3826 const __m128i sixteen = _mm_set1_epi16(16);
4130 const __m128i max = _mm_set1_epi16(6201); 3827 const __m128i max = _mm_set1_epi16(6201);
4131 const __m128i min = _mm_set1_epi16(-6201); 3828 const __m128i min = _mm_set1_epi16(-6201);
4132 int optimised_cols = 0; 3829 int optimised_cols = 0;
4133 3830
4134 // Load input into __m128i & pack to 16 bits 3831 // Load input into __m128i & pack to 16 bits
4135 for (i = 0; i < 8; i++) { 3832 for (i = 0; i < 8; i++) {
4136 temp1 = _mm_loadu_si128((const __m128i *)(input + 8*i)); 3833 temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
4137 temp2 = _mm_loadu_si128((const __m128i *)(input + 8*i + 4)); 3834 temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
4138 inptr[i] = _mm_packs_epi32(temp1, temp2); 3835 inptr[i] = _mm_packs_epi32(temp1, temp2);
4139 } 3836 }
4140 3837
4141 // Find the min & max for the row transform 3838 // Find the min & max for the row transform
4142 max_input = _mm_max_epi16(inptr[0], inptr[1]); 3839 max_input = _mm_max_epi16(inptr[0], inptr[1]);
4143 min_input = _mm_min_epi16(inptr[0], inptr[1]); 3840 min_input = _mm_min_epi16(inptr[0], inptr[1]);
4144 for (i = 2; i < 8; i++) { 3841 for (i = 2; i < 8; i++) {
4145 max_input = _mm_max_epi16(max_input, inptr[i]); 3842 max_input = _mm_max_epi16(max_input, inptr[i]);
4146 min_input = _mm_min_epi16(min_input, inptr[i]); 3843 min_input = _mm_min_epi16(min_input, inptr[i]);
4147 } 3844 }
(...skipping 17 matching lines...)
4165 min_input = _mm_cmplt_epi16(min_input, min); 3862 min_input = _mm_cmplt_epi16(min_input, min);
4166 temp1 = _mm_or_si128(max_input, min_input); 3863 temp1 = _mm_or_si128(max_input, min_input);
4167 test = _mm_movemask_epi8(temp1); 3864 test = _mm_movemask_epi8(temp1);
4168 3865
4169 if (test) { 3866 if (test) {
4170 array_transpose_8x8(inptr, inptr); 3867 array_transpose_8x8(inptr, inptr);
4171 for (i = 0; i < 8; i++) { 3868 for (i = 0; i < 8; i++) {
4172 sign_bits = _mm_cmplt_epi16(inptr[i], zero); 3869 sign_bits = _mm_cmplt_epi16(inptr[i], zero);
4173 temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); 3870 temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
4174 temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); 3871 temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
4175 _mm_storeu_si128((__m128i*)(outptr + 4*(2*i+1)), temp1); 3872 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
4176 _mm_storeu_si128((__m128i*)(outptr + 4*(2*i)), temp2); 3873 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
4177 } 3874 }
4178 } else { 3875 } else {
4179 // Set to use the optimised transform for the column 3876 // Set to use the optimised transform for the column
4180 optimised_cols = 1; 3877 optimised_cols = 1;
4181 } 3878 }
4182 } else { 3879 } else {
4183 // Run the un-optimised row transform 3880 // Run the un-optimised row transform
4184 for (i = 0; i < 8; ++i) { 3881 for (i = 0; i < 8; ++i) {
4185 vp9_highbd_idct8(input, outptr, bd); 3882 vp9_highbd_idct8(input, outptr, bd);
4186 input += 8; 3883 input += 8;
(...skipping 25 matching lines...)
4212 vp9_highbd_idct8(temp_in, temp_out, bd); 3909 vp9_highbd_idct8(temp_in, temp_out, bd);
4213 for (j = 0; j < 8; ++j) { 3910 for (j = 0; j < 8; ++j) {
4214 dest[j * stride + i] = highbd_clip_pixel_add( 3911 dest[j * stride + i] = highbd_clip_pixel_add(
4215 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); 3912 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
4216 } 3913 }
4217 } 3914 }
4218 } 3915 }
4219 } 3916 }
4220 3917
4221 void vp9_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8, 3918 void vp9_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
4222 int stride, int bd) { 3919 int stride, int bd) {
4223 tran_low_t out[8 * 8] = { 0 }; 3920 tran_low_t out[8 * 8] = { 0 };
4224 tran_low_t *outptr = out; 3921 tran_low_t *outptr = out;
4225 int i, j, test; 3922 int i, j, test;
4226 __m128i inptr[8]; 3923 __m128i inptr[8];
4227 __m128i min_input, max_input, temp1, temp2, sign_bits; 3924 __m128i min_input, max_input, temp1, temp2, sign_bits;
4228 uint16_t * dest = CONVERT_TO_SHORTPTR(dest8); 3925 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
4229 const __m128i zero = _mm_set1_epi16(0); 3926 const __m128i zero = _mm_set1_epi16(0);
4230 const __m128i sixteen = _mm_set1_epi16(16); 3927 const __m128i sixteen = _mm_set1_epi16(16);
4231 const __m128i max = _mm_set1_epi16(6201); 3928 const __m128i max = _mm_set1_epi16(6201);
4232 const __m128i min = _mm_set1_epi16(-6201); 3929 const __m128i min = _mm_set1_epi16(-6201);
4233 int optimised_cols = 0; 3930 int optimised_cols = 0;
4234 3931
4235 // Load input into __m128i & pack to 16 bits 3932 // Load input into __m128i & pack to 16 bits
4236 for (i = 0; i < 8; i++) { 3933 for (i = 0; i < 8; i++) {
4237 temp1 = _mm_loadu_si128((const __m128i *)(input + 8*i)); 3934 temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
4238 temp2 = _mm_loadu_si128((const __m128i *)(input + 8*i + 4)); 3935 temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
4239 inptr[i] = _mm_packs_epi32(temp1, temp2); 3936 inptr[i] = _mm_packs_epi32(temp1, temp2);
4240 } 3937 }
4241 3938
4242 // Find the min & max for the row transform 3939 // Find the min & max for the row transform
4243 // only the first 4 rows have non-zero coeffs 3940 // only the first 4 rows have non-zero coeffs
4244 max_input = _mm_max_epi16(inptr[0], inptr[1]); 3941 max_input = _mm_max_epi16(inptr[0], inptr[1]);
4245 min_input = _mm_min_epi16(inptr[0], inptr[1]); 3942 min_input = _mm_min_epi16(inptr[0], inptr[1]);
4246 for (i = 2; i < 4; i++) { 3943 for (i = 2; i < 4; i++) {
4247 max_input = _mm_max_epi16(max_input, inptr[i]); 3944 max_input = _mm_max_epi16(max_input, inptr[i]);
4248 min_input = _mm_min_epi16(min_input, inptr[i]); 3945 min_input = _mm_min_epi16(min_input, inptr[i]);
(...skipping 20 matching lines...)
4269 temp1 = _mm_or_si128(max_input, min_input); 3966 temp1 = _mm_or_si128(max_input, min_input);
4270 test = _mm_movemask_epi8(temp1); 3967 test = _mm_movemask_epi8(temp1);
4271 3968
4272 if (test) { 3969 if (test) {
4273 // Use the fact that only the first 4 rows contain non-zero coeffs 3970 // Use the fact that only the first 4 rows contain non-zero coeffs
4274 array_transpose_4X8(inptr, inptr); 3971 array_transpose_4X8(inptr, inptr);
4275 for (i = 0; i < 4; i++) { 3972 for (i = 0; i < 4; i++) {
4276 sign_bits = _mm_cmplt_epi16(inptr[i], zero); 3973 sign_bits = _mm_cmplt_epi16(inptr[i], zero);
4277 temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); 3974 temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
4278 temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); 3975 temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
4279 _mm_storeu_si128((__m128i*)(outptr + 4*(2*i+1)), temp1); 3976 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
4280 _mm_storeu_si128((__m128i*)(outptr + 4*(2*i)), temp2); 3977 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
4281 } 3978 }
4282 } else { 3979 } else {
4283 // Set to use the optimised transform for the column 3980 // Set to use the optimised transform for the column
4284 optimised_cols = 1; 3981 optimised_cols = 1;
4285 } 3982 }
4286 } else { 3983 } else {
4287 // Run the un-optimised row transform 3984 // Run the un-optimised row transform
4288 for (i = 0; i < 4; ++i) { 3985 for (i = 0; i < 4; ++i) {
4289 vp9_highbd_idct8(input, outptr, bd); 3986 vp9_highbd_idct8(input, outptr, bd);
4290 input += 8; 3987 input += 8;
(...skipping 25 matching lines...)
4316 vp9_highbd_idct8(temp_in, temp_out, bd); 4013 vp9_highbd_idct8(temp_in, temp_out, bd);
4317 for (j = 0; j < 8; ++j) { 4014 for (j = 0; j < 8; ++j) {
4318 dest[j * stride + i] = highbd_clip_pixel_add( 4015 dest[j * stride + i] = highbd_clip_pixel_add(
4319 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); 4016 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
4320 } 4017 }
4321 } 4018 }
4322 } 4019 }
4323 } 4020 }
4324 4021
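Reviewer note: when the range test fires, the rows are transposed and widened back to 32-bit tran_low_t before falling through to the high-precision column pass. The widening relies on a two's-complement identity: interleaving a vector with its own sign mask (_mm_cmplt_epi16 against zero yields 0xFFFF for negative lanes) is exactly a sign extension from epi16 to epi32. A self-contained sketch, with the helper name being mine:

#include <emmintrin.h>
#include <stdint.h>

/* Sign-extend the 8 int16 lanes of v to int32 and store them to out[0..7]. */
static void widen_row(__m128i v, int32_t *out) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i sign = _mm_cmplt_epi16(v, zero);     /* 0xFFFF where v < 0 */
  const __m128i lo32 = _mm_unpacklo_epi16(v, sign);  /* elements 0..3 as int32 */
  const __m128i hi32 = _mm_unpackhi_epi16(v, sign);  /* elements 4..7 as int32 */
  _mm_storeu_si128((__m128i *)(out + 0), lo32);
  _mm_storeu_si128((__m128i *)(out + 4), hi32);
}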
4325 void vp9_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8, 4022 void vp9_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8,
4326 int stride, int bd) { 4023 int stride, int bd) {
4327 tran_low_t out[16 * 16]; 4024 tran_low_t out[16 * 16];
4328 tran_low_t *outptr = out; 4025 tran_low_t *outptr = out;
4329 int i, j, test; 4026 int i, j, test;
4330 __m128i inptr[32]; 4027 __m128i inptr[32];
4331 __m128i min_input, max_input, temp1, temp2, sign_bits; 4028 __m128i min_input, max_input, temp1, temp2, sign_bits;
4332 uint16_t * dest = CONVERT_TO_SHORTPTR(dest8); 4029 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
4333 const __m128i zero = _mm_set1_epi16(0); 4030 const __m128i zero = _mm_set1_epi16(0);
4334 const __m128i rounding = _mm_set1_epi16(32); 4031 const __m128i rounding = _mm_set1_epi16(32);
4335 const __m128i max = _mm_set1_epi16(3155); 4032 const __m128i max = _mm_set1_epi16(3155);
4336 const __m128i min = _mm_set1_epi16(-3155); 4033 const __m128i min = _mm_set1_epi16(-3155);
4337 int optimised_cols = 0; 4034 int optimised_cols = 0;
4338 4035
4339 // Load input into __m128i & pack to 16 bits 4036 // Load input into __m128i & pack to 16 bits
4340 for (i = 0; i < 16; i++) { 4037 for (i = 0; i < 16; i++) {
4341 temp1 = _mm_loadu_si128((const __m128i *)(input + 16*i)); 4038 temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
4342 temp2 = _mm_loadu_si128((const __m128i *)(input + 16*i + 4)); 4039 temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
4343 inptr[i] = _mm_packs_epi32(temp1, temp2); 4040 inptr[i] = _mm_packs_epi32(temp1, temp2);
4344 temp1 = _mm_loadu_si128((const __m128i *)(input + 16*i + 8)); 4041 temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
4345 temp2 = _mm_loadu_si128((const __m128i *)(input + 16*i + 12)); 4042 temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
4346 inptr[i + 16] = _mm_packs_epi32(temp1, temp2); 4043 inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
4347 } 4044 }
4348 4045
4349 // Find the min & max for the row transform 4046 // Find the min & max for the row transform
4350 max_input = _mm_max_epi16(inptr[0], inptr[1]); 4047 max_input = _mm_max_epi16(inptr[0], inptr[1]);
4351 min_input = _mm_min_epi16(inptr[0], inptr[1]); 4048 min_input = _mm_min_epi16(inptr[0], inptr[1]);
4352 for (i = 2; i < 32; i++) { 4049 for (i = 2; i < 32; i++) {
4353 max_input = _mm_max_epi16(max_input, inptr[i]); 4050 max_input = _mm_max_epi16(max_input, inptr[i]);
4354 min_input = _mm_min_epi16(min_input, inptr[i]); 4051 min_input = _mm_min_epi16(min_input, inptr[i]);
4355 } 4052 }
(...skipping 15 matching lines...)
4371 } 4068 }
4372 max_input = _mm_cmpgt_epi16(max_input, max); 4069 max_input = _mm_cmpgt_epi16(max_input, max);
4373 min_input = _mm_cmplt_epi16(min_input, min); 4070 min_input = _mm_cmplt_epi16(min_input, min);
4374 temp1 = _mm_or_si128(max_input, min_input); 4071 temp1 = _mm_or_si128(max_input, min_input);
4375 test = _mm_movemask_epi8(temp1); 4072 test = _mm_movemask_epi8(temp1);
4376 4073
4377 if (test) { 4074 if (test) {
4378 array_transpose_16x16(inptr, inptr + 16); 4075 array_transpose_16x16(inptr, inptr + 16);
4379 for (i = 0; i < 16; i++) { 4076 for (i = 0; i < 16; i++) {
4380 sign_bits = _mm_cmplt_epi16(inptr[i], zero); 4077 sign_bits = _mm_cmplt_epi16(inptr[i], zero);
4381 temp1 = _mm_unpacklo_epi16(inptr[i ], sign_bits); 4078 temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
4382 temp2 = _mm_unpackhi_epi16(inptr[i ], sign_bits); 4079 temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
4383 _mm_storeu_si128((__m128i*)(outptr + 4*(i*4)), temp1); 4080 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
4384 _mm_storeu_si128((__m128i*)(outptr + 4*(i*4+1)), temp2); 4081 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
4385 sign_bits = _mm_cmplt_epi16(inptr[i+16], zero); 4082 sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
4386 temp1 = _mm_unpacklo_epi16(inptr[i+16], sign_bits); 4083 temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
4387 temp2 = _mm_unpackhi_epi16(inptr[i+16], sign_bits); 4084 temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
4388 _mm_storeu_si128((__m128i*)(outptr + 4*(i*4+2)), temp1); 4085 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
4389 _mm_storeu_si128((__m128i*)(outptr + 4*(i*4+3)), temp2); 4086 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
4390 } 4087 }
4391 } else { 4088 } else {
4392 // Set to use the optimised transform for the column 4089 // Set to use the optimised transform for the column
4393 optimised_cols = 1; 4090 optimised_cols = 1;
4394 } 4091 }
4395 } else { 4092 } else {
4396 // Run the un-optimised row transform 4093 // Run the un-optimised row transform
4397 for (i = 0; i < 16; ++i) { 4094 for (i = 0; i < 16; ++i) {
4398 vp9_highbd_idct16(input, outptr, bd); 4095 vp9_highbd_idct16(input, outptr, bd);
4399 input += 16; 4096 input += 16;
(...skipping 30 matching lines...)
4430 vp9_highbd_idct16(temp_in, temp_out, bd); 4127 vp9_highbd_idct16(temp_in, temp_out, bd);
4431 for (j = 0; j < 16; ++j) { 4128 for (j = 0; j < 16; ++j) {
4432 dest[j * stride + i] = highbd_clip_pixel_add( 4129 dest[j * stride + i] = highbd_clip_pixel_add(
4433 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); 4130 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
4434 } 4131 }
4435 } 4132 }
4436 } 4133 }
4437 } 4134 }
4438 4135
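Reviewer note: the array_transpose_8x8 / array_transpose_4X8 / array_transpose_16x16 helpers used above live in vp9_idct_intrin_sse2.h and are not part of this diff. For readers of this CL, a standard SSE2 transpose of an 8x8 block of 16-bit values, the building block those helpers are made of, looks roughly like the sketch below: three rounds of interleaves at 16-, 32- and 64-bit granularity. Names are mine; this is an illustration, not the library's code:

#include <emmintrin.h>

static void transpose_8x8_sketch(const __m128i *in, __m128i *out) {
  /* Round 1: interleave 16-bit elements of adjacent rows. */
  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); /* 00 10 01 11 02 12 03 13 */
  const __m128i a1 = _mm_unpackhi_epi16(in[0], in[1]);
  const __m128i a2 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i a3 = _mm_unpackhi_epi16(in[2], in[3]);
  const __m128i a4 = _mm_unpacklo_epi16(in[4], in[5]);
  const __m128i a5 = _mm_unpackhi_epi16(in[4], in[5]);
  const __m128i a6 = _mm_unpacklo_epi16(in[6], in[7]);
  const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]);
  /* Round 2: interleave 32-bit pairs. */
  const __m128i b0 = _mm_unpacklo_epi32(a0, a2);       /* 00 10 20 30 01 11 21 31 */
  const __m128i b1 = _mm_unpackhi_epi32(a0, a2);
  const __m128i b2 = _mm_unpacklo_epi32(a1, a3);
  const __m128i b3 = _mm_unpackhi_epi32(a1, a3);
  const __m128i b4 = _mm_unpacklo_epi32(a4, a6);       /* 40 50 60 70 41 51 61 71 */
  const __m128i b5 = _mm_unpackhi_epi32(a4, a6);
  const __m128i b6 = _mm_unpacklo_epi32(a5, a7);
  const __m128i b7 = _mm_unpackhi_epi32(a5, a7);
  /* Round 3: interleave 64-bit halves; each output is one column. */
  out[0] = _mm_unpacklo_epi64(b0, b4);                 /* 00 10 20 30 40 50 60 70 */
  out[1] = _mm_unpackhi_epi64(b0, b4);
  out[2] = _mm_unpacklo_epi64(b1, b5);
  out[3] = _mm_unpackhi_epi64(b1, b5);
  out[4] = _mm_unpacklo_epi64(b2, b6);
  out[5] = _mm_unpackhi_epi64(b2, b6);
  out[6] = _mm_unpacklo_epi64(b3, b7);
  out[7] = _mm_unpackhi_epi64(b3, b7);
}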
4439 void vp9_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8, 4136 void vp9_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
4440 int stride, int bd) { 4137 int stride, int bd) {
4441 tran_low_t out[16 * 16] = { 0 }; 4138 tran_low_t out[16 * 16] = { 0 };
4442 tran_low_t *outptr = out; 4139 tran_low_t *outptr = out;
4443 int i, j, test; 4140 int i, j, test;
4444 __m128i inptr[32]; 4141 __m128i inptr[32];
4445 __m128i min_input, max_input, temp1, temp2, sign_bits; 4142 __m128i min_input, max_input, temp1, temp2, sign_bits;
4446 uint16_t * dest = CONVERT_TO_SHORTPTR(dest8); 4143 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
4447 const __m128i zero = _mm_set1_epi16(0); 4144 const __m128i zero = _mm_set1_epi16(0);
4448 const __m128i rounding = _mm_set1_epi16(32); 4145 const __m128i rounding = _mm_set1_epi16(32);
4449 const __m128i max = _mm_set1_epi16(3155); 4146 const __m128i max = _mm_set1_epi16(3155);
4450 const __m128i min = _mm_set1_epi16(-3155); 4147 const __m128i min = _mm_set1_epi16(-3155);
4451 int optimised_cols = 0; 4148 int optimised_cols = 0;
4452 4149
4453 // Load input into __m128i & pack to 16 bits 4150 // Load input into __m128i & pack to 16 bits
4454 for (i = 0; i < 16; i++) { 4151 for (i = 0; i < 16; i++) {
4455 temp1 = _mm_loadu_si128((const __m128i *)(input + 16*i)); 4152 temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
4456 temp2 = _mm_loadu_si128((const __m128i *)(input + 16*i + 4)); 4153 temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
4457 inptr[i] = _mm_packs_epi32(temp1, temp2); 4154 inptr[i] = _mm_packs_epi32(temp1, temp2);
4458 temp1 = _mm_loadu_si128((const __m128i *)(input + 16*i + 8)); 4155 temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
4459 temp2 = _mm_loadu_si128((const __m128i *)(input + 16*i + 12)); 4156 temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
4460 inptr[i + 16] = _mm_packs_epi32(temp1, temp2); 4157 inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
4461 } 4158 }
4462 4159
4463 // Find the min & max for the row transform 4160 // Find the min & max for the row transform
4464 // Since all non-zero dct coefficients are in upper-left 4x4 area, 4161 // Since all non-zero dct coefficients are in upper-left 4x4 area,
4465 // we only need to consider first 4 rows here. 4162 // we only need to consider first 4 rows here.
4466 max_input = _mm_max_epi16(inptr[0], inptr[1]); 4163 max_input = _mm_max_epi16(inptr[0], inptr[1]);
4467 min_input = _mm_min_epi16(inptr[0], inptr[1]); 4164 min_input = _mm_min_epi16(inptr[0], inptr[1]);
4468 for (i = 2; i < 4; i++) { 4165 for (i = 2; i < 4; i++) {
4469 max_input = _mm_max_epi16(max_input, inptr[i]); 4166 max_input = _mm_max_epi16(max_input, inptr[i]);
(...skipping 20 matching lines...)
4490 min_input = _mm_cmplt_epi16(min_input, min); 4187 min_input = _mm_cmplt_epi16(min_input, min);
4491 temp1 = _mm_or_si128(max_input, min_input); 4188 temp1 = _mm_or_si128(max_input, min_input);
4492 test = _mm_movemask_epi8(temp1); 4189 test = _mm_movemask_epi8(temp1);
4493 4190
4494 if (test) { 4191 if (test) {
4495 // Use the fact that only the first 4 rows contain non-zero coeffs 4192 // Use the fact that only the first 4 rows contain non-zero coeffs
4496 array_transpose_8x8(inptr, inptr); 4193 array_transpose_8x8(inptr, inptr);
4497 array_transpose_8x8(inptr + 8, inptr + 16); 4194 array_transpose_8x8(inptr + 8, inptr + 16);
4498 for (i = 0; i < 4; i++) { 4195 for (i = 0; i < 4; i++) {
4499 sign_bits = _mm_cmplt_epi16(inptr[i], zero); 4196 sign_bits = _mm_cmplt_epi16(inptr[i], zero);
4500 temp1 = _mm_unpacklo_epi16(inptr[i ], sign_bits); 4197 temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
4501 temp2 = _mm_unpackhi_epi16(inptr[i ], sign_bits); 4198 temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
4502 _mm_storeu_si128((__m128i*)(outptr + 4*(i*4)), temp1); 4199 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
4503 _mm_storeu_si128((__m128i*)(outptr + 4*(i*4+1)), temp2); 4200 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
4504 sign_bits = _mm_cmplt_epi16(inptr[i+16], zero); 4201 sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
4505 temp1 = _mm_unpacklo_epi16(inptr[i+16], sign_bits); 4202 temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
4506 temp2 = _mm_unpackhi_epi16(inptr[i+16], sign_bits); 4203 temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
4507 _mm_storeu_si128((__m128i*)(outptr + 4*(i*4+2)), temp1); 4204 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
4508 _mm_storeu_si128((__m128i*)(outptr + 4*(i*4+3)), temp2); 4205 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
4509 } 4206 }
4510 } else { 4207 } else {
4511 // Set to use the optimised transform for the column 4208 // Set to use the optimised transform for the column
4512 optimised_cols = 1; 4209 optimised_cols = 1;
4513 } 4210 }
4514 } else { 4211 } else {
4515 // Run the un-optimised row transform 4212 // Run the un-optimised row transform
4516 for (i = 0; i < 4; ++i) { 4213 for (i = 0; i < 4; ++i) {
4517 vp9_highbd_idct16(input, outptr, bd); 4214 vp9_highbd_idct16(input, outptr, bd);
4518 input += 16; 4215 input += 16;
(...skipping 30 matching lines...)
4549 vp9_highbd_idct16(temp_in, temp_out, bd); 4246 vp9_highbd_idct16(temp_in, temp_out, bd);
4550 for (j = 0; j < 16; ++j) { 4247 for (j = 0; j < 16; ++j) {
4551 dest[j * stride + i] = highbd_clip_pixel_add( 4248 dest[j * stride + i] = highbd_clip_pixel_add(
4552 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); 4249 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
4553 } 4250 }
4554 } 4251 }
4555 } 4252 }
4556 } 4253 }
4557 4254
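Reviewer note: every variant ends with the same scalar reconstruction loop: the column-transform output is rounded by a power of two (shift 5 for the 8x8 paths, 6 for the 16x16 paths) and added to the predictor, clamped to the [0, 2^bd - 1] pixel range. A scalar sketch of those two helpers, paraphrasing the usual libvpx definitions rather than quoting this CL:

#include <stdint.h>

/* Round-half-up divide by 2^n, as the libvpx macro of the same name does. */
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

/* Clamp a reconstructed sample to the valid range for bd-bit content,
 * e.g. [0, 1023] for bd == 10. */
static uint16_t clip_pixel_highbd_sketch(int val, int bd) {
  const int max = (1 << bd) - 1;
  return (uint16_t)(val < 0 ? 0 : (val > max ? max : val));
}

/* The loops above then compute, per sample:
 *   dest[j * stride + i] =
 *       clip_pixel_highbd_sketch(dest[j * stride + i] +
 *                                ROUND_POWER_OF_TWO(temp_out[j], shift), bd);
 */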
4558 #endif // CONFIG_VP9_HIGHBITDEPTH 4255 #endif // CONFIG_VP9_HIGHBITDEPTH
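Reviewer note: for context, callers do not invoke these functions directly on every block; the frame-level code in vp9_idct.c selects a variant from the eob (end-of-block) count, since the _10 variants are only valid when all non-zero coefficients sit in the top-left 4x4 block. A hedged usage sketch, with the dispatch paraphrased from memory and this file's headers assumed for tran_low_t and the prototypes:

/* eob == 1 (DC only) has its own dedicated path, not shown in this diff. */
static void highbd_idct16x16_add_dispatch(const tran_low_t *input,
                                          uint8_t *dest8, int stride,
                                          int eob, int bd) {
  if (eob <= 10) {
    /* Only the top-left 4x4 coefficients can be non-zero. */
    vp9_highbd_idct16x16_10_add_sse2(input, dest8, stride, bd);
  } else {
    vp9_highbd_idct16x16_256_add_sse2(input, dest8, stride, bd);
  }
}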