OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include <assert.h> | 11 #include <assert.h> |
12 #include <emmintrin.h> // SSE2 | 12 #include <emmintrin.h> // SSE2 |
13 #include "./vpx_config.h" | 13 #include "./vpx_config.h" |
14 #include "vpx/vpx_integer.h" | 14 #include "vpx/vpx_integer.h" |
15 #include "vp9/common/vp9_common.h" | 15 #include "vp9/common/vp9_common.h" |
16 #include "vp9/common/vp9_idct.h" | 16 #include "vp9/common/vp9_idct.h" |
17 | 17 |
18 void vp9_short_idct4x4_add_sse2(int16_t *input, uint8_t *dest, int stride) { | 18 void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
19 const __m128i zero = _mm_setzero_si128(); | 19 const __m128i zero = _mm_setzero_si128(); |
20 const __m128i eight = _mm_set1_epi16(8); | 20 const __m128i eight = _mm_set1_epi16(8); |
21 const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64, | 21 const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64, |
22 (int16_t)cospi_16_64, (int16_t)-cospi_16_64, | 22 (int16_t)cospi_16_64, (int16_t)-cospi_16_64, |
23 (int16_t)cospi_24_64, (int16_t)-cospi_8_64, | 23 (int16_t)cospi_24_64, (int16_t)-cospi_8_64, |
24 (int16_t)cospi_8_64, (int16_t)cospi_24_64); | 24 (int16_t)cospi_8_64, (int16_t)cospi_24_64); |
25 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 25 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
26 __m128i input0, input1, input2, input3; | 26 __m128i input0, input1, input2, input3; |
27 | 27 |
28 // Rows | 28 // Rows |
29 input0 = _mm_loadl_epi64((__m128i *)input); | 29 input0 = _mm_loadl_epi64((const __m128i *)input); |
30 input1 = _mm_loadl_epi64((__m128i *)(input + 4)); | 30 input1 = _mm_loadl_epi64((const __m128i *)(input + 4)); |
31 input2 = _mm_loadl_epi64((__m128i *)(input + 8)); | 31 input2 = _mm_loadl_epi64((const __m128i *)(input + 8)); |
32 input3 = _mm_loadl_epi64((__m128i *)(input + 12)); | 32 input3 = _mm_loadl_epi64((const __m128i *)(input + 12)); |
33 | 33 |
34 // Construct i3, i1, i3, i1, i2, i0, i2, i0 | 34 // Construct i3, i1, i3, i1, i2, i0, i2, i0 |
35 input0 = _mm_shufflelo_epi16(input0, 0xd8); | 35 input0 = _mm_shufflelo_epi16(input0, 0xd8); |
36 input1 = _mm_shufflelo_epi16(input1, 0xd8); | 36 input1 = _mm_shufflelo_epi16(input1, 0xd8); |
37 input2 = _mm_shufflelo_epi16(input2, 0xd8); | 37 input2 = _mm_shufflelo_epi16(input2, 0xd8); |
38 input3 = _mm_shufflelo_epi16(input3, 0xd8); | 38 input3 = _mm_shufflelo_epi16(input3, 0xd8); |
39 | 39 |
40 input0 = _mm_unpacklo_epi32(input0, input0); | 40 input0 = _mm_unpacklo_epi32(input0, input0); |
41 input1 = _mm_unpacklo_epi32(input1, input1); | 41 input1 = _mm_unpacklo_epi32(input1, input1); |
42 input2 = _mm_unpacklo_epi32(input2, input2); | 42 input2 = _mm_unpacklo_epi32(input2, input2); |
(...skipping 98 matching lines...) |
141 | 141 |
142 input0 = _mm_srli_si128(input2, 8); | 142 input0 = _mm_srli_si128(input2, 8); |
143 input1 = _mm_srli_si128(input3, 8); | 143 input1 = _mm_srli_si128(input3, 8); |
144 | 144 |
145 RECON_AND_STORE4X4(dest, input2); | 145 RECON_AND_STORE4X4(dest, input2); |
146 RECON_AND_STORE4X4(dest, input0); | 146 RECON_AND_STORE4X4(dest, input0); |
147 RECON_AND_STORE4X4(dest, input1); | 147 RECON_AND_STORE4X4(dest, input1); |
148 RECON_AND_STORE4X4(dest, input3); | 148 RECON_AND_STORE4X4(dest, input3); |
149 } | 149 } |
150 | 150 |
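Side note on the row pass above: _mm_madd_epi16 multiplies adjacent 16-bit pairs and sums each pair into a 32-bit lane, so once the shuffles arrange each register as (i0, i2, i0, i2, i1, i3, i1, i3) low-to-high, a single madd against cst evaluates all four stage-1 products of the 4-point IDCT. Our lane-by-lane reading, not part of the source:

/* _mm_madd_epi16(input0, cst), with input0 = i0 i2 i0 i2 i1 i3 i1 i3:
 *   lane 0: i0 * cospi_16_64 + i2 *  cospi_16_64 = (i0 + i2) * cospi_16_64
 *   lane 1: i0 * cospi_16_64 + i2 * -cospi_16_64 = (i0 - i2) * cospi_16_64
 *   lane 2: i1 * cospi_24_64 + i3 * -cospi_8_64
 *   lane 3: i1 * cospi_8_64  + i3 *  cospi_24_64
 * Adding DCT_CONST_ROUNDING and shifting right by DCT_CONST_BITS then
 * matches the scalar dct_const_round_shift() per lane. */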
151 void vp9_short_idct4x4_1_add_sse2(int16_t *input, uint8_t *dest, int stride) { | 151 void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
152 __m128i dc_value; | 152 __m128i dc_value; |
153 const __m128i zero = _mm_setzero_si128(); | 153 const __m128i zero = _mm_setzero_si128(); |
154 int a; | 154 int a; |
155 | 155 |
156 a = dct_const_round_shift(input[0] * cospi_16_64); | 156 a = dct_const_round_shift(input[0] * cospi_16_64); |
157 a = dct_const_round_shift(a * cospi_16_64); | 157 a = dct_const_round_shift(a * cospi_16_64); |
158 a = ROUND_POWER_OF_TWO(a, 4); | 158 a = ROUND_POWER_OF_TWO(a, 4); |
159 | 159 |
160 dc_value = _mm_set1_epi16(a); | 160 dc_value = _mm_set1_epi16(a); |
161 | 161 |
162 RECON_AND_STORE4X4(dest, dc_value); | 162 RECON_AND_STORE4X4(dest, dc_value); |
163 RECON_AND_STORE4X4(dest, dc_value); | 163 RECON_AND_STORE4X4(dest, dc_value); |
164 RECON_AND_STORE4X4(dest, dc_value); | 164 RECON_AND_STORE4X4(dest, dc_value); |
165 RECON_AND_STORE4X4(dest, dc_value); | 165 RECON_AND_STORE4X4(dest, dc_value); |
166 } | 166 } |
167 | 167 |
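The DC-only path collapses the whole 4x4 transform to one value splatted over the block. A minimal scalar sketch of the same arithmetic, assuming the libvpx helpers dct_const_round_shift(), ROUND_POWER_OF_TWO() and clip_pixel() from the headers included above (the name idct4x4_1_ref is ours):

static void idct4x4_1_ref(const int16_t *input, uint8_t *dest, int stride) {
  int r, c;
  int a = dct_const_round_shift(input[0] * cospi_16_64);  // row pass
  a = dct_const_round_shift(a * cospi_16_64);             // column pass
  a = ROUND_POWER_OF_TWO(a, 4);                           // final scaling
  for (r = 0; r < 4; ++r)
    for (c = 0; c < 4; ++c)
      dest[r * stride + c] = clip_pixel(dest[r * stride + c] + a);
}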
168 void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) { | |
169 const __m128i zero = _mm_setzero_si128(); | |
170 const __m128i c1 = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64, | |
171 (int16_t)cospi_16_64, (int16_t)-cospi_16_64, | |
172 (int16_t)cospi_24_64, (int16_t)-cospi_8_64, | |
173 (int16_t)cospi_8_64, (int16_t)cospi_24_64); | |
174 const __m128i c2 = _mm_setr_epi16(1, 1, 1, 1, 1, -1, 1, -1); | |
175 | |
176 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | |
177 __m128i in, temp; | |
178 | |
179 // Load input data. | |
180 in = _mm_loadl_epi64((__m128i *)input); | |
181 | |
182 // Construct i3, i1, i3, i1, i2, i0, i2, i0 | |
183 in = _mm_shufflelo_epi16(in, 0xd8); | |
184 in = _mm_unpacklo_epi32(in, in); | |
185 | |
186 // Stage 1 | |
187 in = _mm_madd_epi16(in, c1); | |
188 in = _mm_add_epi32(in, rounding); | |
189 in = _mm_srai_epi32(in, DCT_CONST_BITS); | |
190 in = _mm_packs_epi32(in, zero); | |
191 | |
192 // Stage 2 | |
193 temp = _mm_shufflelo_epi16(in, 0x9c); | |
194 in = _mm_shufflelo_epi16(in, 0xc9); | |
195 in = _mm_unpacklo_epi64(temp, in); | |
196 in = _mm_madd_epi16(in, c2); | |
197 in = _mm_packs_epi32(in, zero); | |
198 | |
199 // Store results | |
200 _mm_storel_epi64((__m128i *)output, in); | |
201 } | |
202 | |
203 static INLINE void transpose_4x4(__m128i *res) { | 168 static INLINE void transpose_4x4(__m128i *res) { |
204 const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); | 169 const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); |
205 const __m128i tr0_1 = _mm_unpacklo_epi16(res[2], res[3]); | 170 const __m128i tr0_1 = _mm_unpacklo_epi16(res[2], res[3]); |
206 res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1); | 171 res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1); |
207 res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1); | 172 res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1); |
208 | 173 |
209 res[1] = _mm_unpackhi_epi64(res[0], res[0]); | 174 res[1] = _mm_unpackhi_epi64(res[0], res[0]); |
210 res[3] = _mm_unpackhi_epi64(res[2], res[2]); | 175 res[3] = _mm_unpackhi_epi64(res[2], res[2]); |
211 } | 176 } |
212 | 177 |
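Each 4x4 row sits in the low 64 bits of its register, so the transpose needs only two interleave levels plus a 64-bit peel. One way to trace the data movement, writing the input rows as a, b, c, d:

/* tr0_0 = unpacklo_epi16(res[0], res[1]) = a0 b0 a1 b1 a2 b2 a3 b3
 * tr0_1 = unpacklo_epi16(res[2], res[3]) = c0 d0 c1 d1 c2 d2 c3 d3
 * res[0] = unpacklo_epi32(tr0_0, tr0_1)  = a0 b0 c0 d0 a1 b1 c1 d1
 * res[2] = unpackhi_epi32(tr0_0, tr0_1)  = a2 b2 c2 d2 a3 b3 c3 d3
 * The unpackhi_epi64 calls peel the odd columns into res[1] and res[3],
 * leaving column i of the input in the low 64 bits of res[i]. */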
213 void idct4_1d_sse2(__m128i *in) { | 178 static void idct4_1d_sse2(__m128i *in) { |
214 const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); | 179 const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); |
215 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 180 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
216 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); | 181 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
217 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); | 182 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); |
218 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); | 183 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); |
219 __m128i u[8], v[8]; | 184 __m128i u[8], v[8]; |
220 | 185 |
221 transpose_4x4(in); | 186 transpose_4x4(in); |
222 // stage 1 | 187 // stage 1 |
223 u[0] = _mm_unpacklo_epi16(in[0], in[2]); | 188 u[0] = _mm_unpacklo_epi16(in[0], in[2]); |
(...skipping 18 matching lines...) |
242 u[2] = _mm_unpackhi_epi64(u[0], u[0]); | 207 u[2] = _mm_unpackhi_epi64(u[0], u[0]); |
243 u[3] = _mm_unpackhi_epi64(u[1], u[1]); | 208 u[3] = _mm_unpackhi_epi64(u[1], u[1]); |
244 | 209 |
245 // stage 2 | 210 // stage 2 |
246 in[0] = _mm_add_epi16(u[0], u[3]); | 211 in[0] = _mm_add_epi16(u[0], u[3]); |
247 in[1] = _mm_add_epi16(u[1], u[2]); | 212 in[1] = _mm_add_epi16(u[1], u[2]); |
248 in[2] = _mm_sub_epi16(u[1], u[2]); | 213 in[2] = _mm_sub_epi16(u[1], u[2]); |
249 in[3] = _mm_sub_epi16(u[0], u[3]); | 214 in[3] = _mm_sub_epi16(u[0], u[3]); |
250 } | 215 } |
251 | 216 |
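The two stages in idct4_1d_sse2 mirror the scalar 4-point IDCT. For orientation, a sketch of that reference, following the shape of the C version in vp9_idct.c (illustrative, not a verbatim copy):

static void idct4_1d_ref(const int16_t *input, int16_t *output) {
  int16_t step[4];
  int temp1, temp2;
  // stage 1: even butterfly on inputs 0/2, odd rotation on inputs 1/3
  temp1 = (input[0] + input[2]) * cospi_16_64;
  temp2 = (input[0] - input[2]) * cospi_16_64;
  step[0] = dct_const_round_shift(temp1);
  step[1] = dct_const_round_shift(temp2);
  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
  step[2] = dct_const_round_shift(temp1);
  step[3] = dct_const_round_shift(temp2);
  // stage 2: cross add/subtract
  output[0] = step[0] + step[3];
  output[1] = step[1] + step[2];
  output[2] = step[1] - step[2];
  output[3] = step[0] - step[3];
}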
252 void iadst4_1d_sse2(__m128i *in) { | 217 static void iadst4_1d_sse2(__m128i *in) { |
253 const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9); | 218 const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9); |
254 const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9); | 219 const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9); |
255 const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9); | 220 const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9); |
256 const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9); | 221 const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9); |
257 const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9); | 222 const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9); |
258 const __m128i kZero = _mm_set1_epi16(0); | 223 const __m128i kZero = _mm_set1_epi16(0); |
259 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); | 224 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); |
260 __m128i u[8], v[8], in7; | 225 __m128i u[8], v[8], in7; |
261 | 226 |
262 transpose_4x4(in); | 227 transpose_4x4(in); |
(...skipping 29 matching lines...) |
292 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); | 257 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); |
293 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); | 258 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); |
294 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); | 259 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); |
295 | 260 |
296 in[0] = _mm_packs_epi32(u[0], u[2]); | 261 in[0] = _mm_packs_epi32(u[0], u[2]); |
297 in[1] = _mm_packs_epi32(u[1], u[3]); | 262 in[1] = _mm_packs_epi32(u[1], u[3]); |
298 in[2] = _mm_unpackhi_epi64(in[0], in[0]); | 263 in[2] = _mm_unpackhi_epi64(in[0], in[0]); |
299 in[3] = _mm_unpackhi_epi64(in[1], in[1]); | 264 in[3] = _mm_unpackhi_epi64(in[1], in[1]); |
300 } | 265 } |
301 | 266 |
302 void vp9_short_iht4x4_add_sse2(int16_t *input, uint8_t *dest, int stride, | 267 void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, |
303 int tx_type) { | 268 int tx_type) { |
304 __m128i in[4]; | 269 __m128i in[4]; |
305 const __m128i zero = _mm_setzero_si128(); | 270 const __m128i zero = _mm_setzero_si128(); |
306 const __m128i eight = _mm_set1_epi16(8); | 271 const __m128i eight = _mm_set1_epi16(8); |
307 | 272 |
308 in[0] = _mm_loadl_epi64((__m128i *)input); | 273 in[0] = _mm_loadl_epi64((const __m128i *)input); |
309 in[1] = _mm_loadl_epi64((__m128i *)(input + 4)); | 274 in[1] = _mm_loadl_epi64((const __m128i *)(input + 4)); |
310 in[2] = _mm_loadl_epi64((__m128i *)(input + 8)); | 275 in[2] = _mm_loadl_epi64((const __m128i *)(input + 8)); |
311 in[3] = _mm_loadl_epi64((__m128i *)(input + 12)); | 276 in[3] = _mm_loadl_epi64((const __m128i *)(input + 12)); |
312 | 277 |
313 switch (tx_type) { | 278 switch (tx_type) { |
314 case 0: // DCT_DCT | 279 case 0: // DCT_DCT |
315 idct4_1d_sse2(in); | 280 idct4_1d_sse2(in); |
316 idct4_1d_sse2(in); | 281 idct4_1d_sse2(in); |
317 break; | 282 break; |
318 case 1: // ADST_DCT | 283 case 1: // ADST_DCT |
319 idct4_1d_sse2(in); | 284 idct4_1d_sse2(in); |
320 iadst4_1d_sse2(in); | 285 iadst4_1d_sse2(in); |
321 break; | 286 break; |
(...skipping 121 matching lines...) |
443 tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ | 408 tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ |
444 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ | 409 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ |
445 tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ | 410 tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ |
446 \ | 411 \ |
447 res0 = _mm_packs_epi32(tmp0, tmp1); \ | 412 res0 = _mm_packs_epi32(tmp0, tmp1); \ |
448 res1 = _mm_packs_epi32(tmp2, tmp3); \ | 413 res1 = _mm_packs_epi32(tmp2, tmp3); \ |
449 res2 = _mm_packs_epi32(tmp4, tmp5); \ | 414 res2 = _mm_packs_epi32(tmp4, tmp5); \ |
450 res3 = _mm_packs_epi32(tmp6, tmp7); \ | 415 res3 = _mm_packs_epi32(tmp6, tmp7); \ |
451 } | 416 } |
452 | 417 |
453 #define IDCT8x8_1D \ | 418 #define IDCT8_1D \ |
454 /* Stage1 */ \ | 419 /* Stage1 */ \ |
455 { \ | 420 { \ |
456 const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ | 421 const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ |
457 const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \ | 422 const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \ |
458 const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \ | 423 const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \ |
459 const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \ | 424 const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \ |
460 \ | 425 \ |
461 MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \ | 426 MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \ |
462 stg1_1, stg1_2, stg1_3, stp1_4, \ | 427 stg1_1, stg1_2, stg1_3, stp1_4, \ |
463 stp1_7, stp1_5, stp1_6) \ | 428 stp1_7, stp1_5, stp1_6) \ |
(...skipping 58 matching lines...) |
522 #define RECON_AND_STORE(dest, in_x) \ | 487 #define RECON_AND_STORE(dest, in_x) \ |
523 { \ | 488 { \ |
524 __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ | 489 __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ |
525 d0 = _mm_unpacklo_epi8(d0, zero); \ | 490 d0 = _mm_unpacklo_epi8(d0, zero); \ |
526 d0 = _mm_add_epi16(in_x, d0); \ | 491 d0 = _mm_add_epi16(in_x, d0); \ |
527 d0 = _mm_packus_epi16(d0, d0); \ | 492 d0 = _mm_packus_epi16(d0, d0); \ |
528 _mm_storel_epi64((__m128i *)(dest), d0); \ | 493 _mm_storel_epi64((__m128i *)(dest), d0); \ |
529 dest += stride; \ | 494 dest += stride; \ |
530 } | 495 } |
531 | 496 |
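RECON_AND_STORE is the usual reconstruct step: unpacklo_epi8 against zero widens eight destination pixels to 16 bits, the residual row is added, and packus_epi16 performs the saturating narrow that scalar code spells as clip_pixel(). The same operation in plain C (helper name ours; in_x is the residual row as eight int16 values):

static void recon_and_store_ref(uint8_t *dest, const int16_t *in_x) {
  int j;
  for (j = 0; j < 8; ++j)
    dest[j] = clip_pixel(dest[j] + in_x[j]);
  // the macro additionally advances dest by stride after the store
}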
532 void vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) { | 497 void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
533 const __m128i zero = _mm_setzero_si128(); | 498 const __m128i zero = _mm_setzero_si128(); |
534 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 499 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
535 const __m128i final_rounding = _mm_set1_epi16(1<<4); | 500 const __m128i final_rounding = _mm_set1_epi16(1<<4); |
536 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); | 501 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); |
537 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); | 502 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); |
538 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); | 503 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); |
539 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); | 504 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); |
540 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); | 505 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); |
541 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 506 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
542 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); | 507 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
543 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); | 508 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); |
544 | 509 |
545 __m128i in0, in1, in2, in3, in4, in5, in6, in7; | 510 __m128i in0, in1, in2, in3, in4, in5, in6, in7; |
546 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; | 511 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; |
547 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; | 512 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; |
548 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; | 513 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
549 int i; | 514 int i; |
550 | 515 |
551 // Load input data. | 516 // Load input data. |
552 in0 = _mm_load_si128((__m128i *)input); | 517 in0 = _mm_load_si128((const __m128i *)input); |
553 in1 = _mm_load_si128((__m128i *)(input + 8 * 1)); | 518 in1 = _mm_load_si128((const __m128i *)(input + 8 * 1)); |
554 in2 = _mm_load_si128((__m128i *)(input + 8 * 2)); | 519 in2 = _mm_load_si128((const __m128i *)(input + 8 * 2)); |
555 in3 = _mm_load_si128((__m128i *)(input + 8 * 3)); | 520 in3 = _mm_load_si128((const __m128i *)(input + 8 * 3)); |
556 in4 = _mm_load_si128((__m128i *)(input + 8 * 4)); | 521 in4 = _mm_load_si128((const __m128i *)(input + 8 * 4)); |
557 in5 = _mm_load_si128((__m128i *)(input + 8 * 5)); | 522 in5 = _mm_load_si128((const __m128i *)(input + 8 * 5)); |
558 in6 = _mm_load_si128((__m128i *)(input + 8 * 6)); | 523 in6 = _mm_load_si128((const __m128i *)(input + 8 * 6)); |
559 in7 = _mm_load_si128((__m128i *)(input + 8 * 7)); | 524 in7 = _mm_load_si128((const __m128i *)(input + 8 * 7)); |
560 | 525 |
561 // 2-D | 526 // 2-D |
562 for (i = 0; i < 2; i++) { | 527 for (i = 0; i < 2; i++) { |
563 // 8x8 Transpose is copied from vp9_short_fdct8x8_sse2() | 528 // 8x8 Transpose is copied from vp9_fdct8x8_sse2() |
564 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, | 529 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, |
565 in4, in5, in6, in7); | 530 in4, in5, in6, in7); |
566 | 531 |
567 // 4-stage 1D idct8x8 | 532 // 4-stage 1D idct8x8 |
568 IDCT8x8_1D | 533 IDCT8_1D |
569 } | 534 } |
570 | 535 |
571 // Final rounding and shift | 536 // Final rounding and shift |
572 in0 = _mm_adds_epi16(in0, final_rounding); | 537 in0 = _mm_adds_epi16(in0, final_rounding); |
573 in1 = _mm_adds_epi16(in1, final_rounding); | 538 in1 = _mm_adds_epi16(in1, final_rounding); |
574 in2 = _mm_adds_epi16(in2, final_rounding); | 539 in2 = _mm_adds_epi16(in2, final_rounding); |
575 in3 = _mm_adds_epi16(in3, final_rounding); | 540 in3 = _mm_adds_epi16(in3, final_rounding); |
576 in4 = _mm_adds_epi16(in4, final_rounding); | 541 in4 = _mm_adds_epi16(in4, final_rounding); |
577 in5 = _mm_adds_epi16(in5, final_rounding); | 542 in5 = _mm_adds_epi16(in5, final_rounding); |
578 in6 = _mm_adds_epi16(in6, final_rounding); | 543 in6 = _mm_adds_epi16(in6, final_rounding); |
(...skipping 11 matching lines...) |
590 RECON_AND_STORE(dest, in0); | 555 RECON_AND_STORE(dest, in0); |
591 RECON_AND_STORE(dest, in1); | 556 RECON_AND_STORE(dest, in1); |
592 RECON_AND_STORE(dest, in2); | 557 RECON_AND_STORE(dest, in2); |
593 RECON_AND_STORE(dest, in3); | 558 RECON_AND_STORE(dest, in3); |
594 RECON_AND_STORE(dest, in4); | 559 RECON_AND_STORE(dest, in4); |
595 RECON_AND_STORE(dest, in5); | 560 RECON_AND_STORE(dest, in5); |
596 RECON_AND_STORE(dest, in6); | 561 RECON_AND_STORE(dest, in6); |
597 RECON_AND_STORE(dest, in7); | 562 RECON_AND_STORE(dest, in7); |
598 } | 563 } |
599 | 564 |
600 void vp9_short_idct8x8_1_add_sse2(int16_t *input, uint8_t *dest, int stride) { | 565 void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
601 __m128i dc_value; | 566 __m128i dc_value; |
602 const __m128i zero = _mm_setzero_si128(); | 567 const __m128i zero = _mm_setzero_si128(); |
603 int a; | 568 int a; |
604 | 569 |
605 a = dct_const_round_shift(input[0] * cospi_16_64); | 570 a = dct_const_round_shift(input[0] * cospi_16_64); |
606 a = dct_const_round_shift(a * cospi_16_64); | 571 a = dct_const_round_shift(a * cospi_16_64); |
607 a = ROUND_POWER_OF_TWO(a, 5); | 572 a = ROUND_POWER_OF_TWO(a, 5); |
608 | 573 |
609 dc_value = _mm_set1_epi16(a); | 574 dc_value = _mm_set1_epi16(a); |
610 | 575 |
(...skipping 30 matching lines...) |
641 res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); | 606 res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); |
642 res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); | 607 res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); |
643 res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); | 608 res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); |
644 res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); | 609 res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); |
645 res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); | 610 res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); |
646 res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); | 611 res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); |
647 res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); | 612 res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); |
648 res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); | 613 res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); |
649 } | 614 } |
650 | 615 |
651 void idct8_1d_sse2(__m128i *in) { | 616 static void idct8_1d_sse2(__m128i *in) { |
652 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 617 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
653 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); | 618 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); |
654 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); | 619 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); |
655 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); | 620 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); |
656 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); | 621 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); |
657 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); | 622 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); |
658 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 623 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
659 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); | 624 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
660 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); | 625 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); |
661 | 626 |
662 __m128i in0, in1, in2, in3, in4, in5, in6, in7; | 627 __m128i in0, in1, in2, in3, in4, in5, in6, in7; |
663 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; | 628 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; |
664 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; | 629 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; |
665 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; | 630 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
666 | 631 |
667 in0 = in[0]; | 632 in0 = in[0]; |
668 in1 = in[1]; | 633 in1 = in[1]; |
669 in2 = in[2]; | 634 in2 = in[2]; |
670 in3 = in[3]; | 635 in3 = in[3]; |
671 in4 = in[4]; | 636 in4 = in[4]; |
672 in5 = in[5]; | 637 in5 = in[5]; |
673 in6 = in[6]; | 638 in6 = in[6]; |
674 in7 = in[7]; | 639 in7 = in[7]; |
675 | 640 |
676 // 8x8 Transpose is copied from vp9_short_fdct8x8_sse2() | 641 // 8x8 Transpose is copied from vp9_fdct8x8_sse2() |
677 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, | 642 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, |
678 in4, in5, in6, in7); | 643 in4, in5, in6, in7); |
679 | 644 |
680 // 4-stage 1D idct8x8 | 645 // 4-stage 1D idct8x8 |
681 IDCT8x8_1D | 646 IDCT8_1D |
682 in[0] = in0; | 647 in[0] = in0; |
683 in[1] = in1; | 648 in[1] = in1; |
684 in[2] = in2; | 649 in[2] = in2; |
685 in[3] = in3; | 650 in[3] = in3; |
686 in[4] = in4; | 651 in[4] = in4; |
687 in[5] = in5; | 652 in[5] = in5; |
688 in[6] = in6; | 653 in[6] = in6; |
689 in[7] = in7; | 654 in[7] = in7; |
690 } | 655 } |
691 | 656 |
692 void iadst8_1d_sse2(__m128i *in) { | 657 static void iadst8_1d_sse2(__m128i *in) { |
693 const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); | 658 const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); |
694 const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); | 659 const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); |
695 const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); | 660 const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); |
696 const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); | 661 const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); |
697 const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); | 662 const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); |
698 const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); | 663 const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); |
699 const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); | 664 const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); |
700 const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); | 665 const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); |
701 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); | 666 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); |
702 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); | 667 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
(...skipping 208 matching lines...) |
911 in[1] = _mm_sub_epi16(k__const_0, s4); | 876 in[1] = _mm_sub_epi16(k__const_0, s4); |
912 in[2] = s6; | 877 in[2] = s6; |
913 in[3] = _mm_sub_epi16(k__const_0, s2); | 878 in[3] = _mm_sub_epi16(k__const_0, s2); |
914 in[4] = s3; | 879 in[4] = s3; |
915 in[5] = _mm_sub_epi16(k__const_0, s7); | 880 in[5] = _mm_sub_epi16(k__const_0, s7); |
916 in[6] = s5; | 881 in[6] = s5; |
917 in[7] = _mm_sub_epi16(k__const_0, s1); | 882 in[7] = _mm_sub_epi16(k__const_0, s1); |
918 } | 883 } |
919 | 884 |
920 | 885 |
921 void vp9_short_iht8x8_add_sse2(int16_t *input, uint8_t *dest, int stride, | 886 void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride, |
922 int tx_type) { | 887 int tx_type) { |
923 __m128i in[8]; | 888 __m128i in[8]; |
924 const __m128i zero = _mm_setzero_si128(); | 889 const __m128i zero = _mm_setzero_si128(); |
925 const __m128i final_rounding = _mm_set1_epi16(1<<4); | 890 const __m128i final_rounding = _mm_set1_epi16(1<<4); |
926 | 891 |
927 // load input data | 892 // load input data |
928 in[0] = _mm_load_si128((__m128i *)input); | 893 in[0] = _mm_load_si128((const __m128i *)input); |
929 in[1] = _mm_load_si128((__m128i *)(input + 8 * 1)); | 894 in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1)); |
930 in[2] = _mm_load_si128((__m128i *)(input + 8 * 2)); | 895 in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2)); |
931 in[3] = _mm_load_si128((__m128i *)(input + 8 * 3)); | 896 in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3)); |
932 in[4] = _mm_load_si128((__m128i *)(input + 8 * 4)); | 897 in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4)); |
933 in[5] = _mm_load_si128((__m128i *)(input + 8 * 5)); | 898 in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5)); |
934 in[6] = _mm_load_si128((__m128i *)(input + 8 * 6)); | 899 in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6)); |
935 in[7] = _mm_load_si128((__m128i *)(input + 8 * 7)); | 900 in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7)); |
936 | 901 |
937 switch (tx_type) { | 902 switch (tx_type) { |
938 case 0: // DCT_DCT | 903 case 0: // DCT_DCT |
939 idct8_1d_sse2(in); | 904 idct8_1d_sse2(in); |
940 idct8_1d_sse2(in); | 905 idct8_1d_sse2(in); |
941 break; | 906 break; |
942 case 1: // ADST_DCT | 907 case 1: // ADST_DCT |
943 idct8_1d_sse2(in); | 908 idct8_1d_sse2(in); |
944 iadst8_1d_sse2(in); | 909 iadst8_1d_sse2(in); |
945 break; | 910 break; |
(...skipping 32 matching lines...) |
978 RECON_AND_STORE(dest, in[0]); | 943 RECON_AND_STORE(dest, in[0]); |
979 RECON_AND_STORE(dest, in[1]); | 944 RECON_AND_STORE(dest, in[1]); |
980 RECON_AND_STORE(dest, in[2]); | 945 RECON_AND_STORE(dest, in[2]); |
981 RECON_AND_STORE(dest, in[3]); | 946 RECON_AND_STORE(dest, in[3]); |
982 RECON_AND_STORE(dest, in[4]); | 947 RECON_AND_STORE(dest, in[4]); |
983 RECON_AND_STORE(dest, in[5]); | 948 RECON_AND_STORE(dest, in[5]); |
984 RECON_AND_STORE(dest, in[6]); | 949 RECON_AND_STORE(dest, in[6]); |
985 RECON_AND_STORE(dest, in[7]); | 950 RECON_AND_STORE(dest, in[7]); |
986 } | 951 } |
987 | 952 |
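As in the 4x4 version, each 1-D helper transposes its input before transforming, so two back-to-back calls apply one transform per dimension and the switch just picks the combination tx_type encodes. In comment form:

/* tx_type selects the 1-D pair for the two back-to-back calls:
 *   0 = DCT_DCT, 1 = ADST_DCT; the elided cases continue the pattern
 *   with DCT_ADST and ADST_ADST.
 * Each helper transposes first, so the second call operates on the
 * other dimension of the block. */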
988 void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) { | 953 void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
989 const __m128i zero = _mm_setzero_si128(); | 954 const __m128i zero = _mm_setzero_si128(); |
990 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 955 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
991 const __m128i final_rounding = _mm_set1_epi16(1<<4); | 956 const __m128i final_rounding = _mm_set1_epi16(1<<4); |
992 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); | 957 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); |
993 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); | 958 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); |
994 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); | 959 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); |
995 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); | 960 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); |
996 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); | 961 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); |
997 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 962 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
998 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); | 963 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
999 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); | 964 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); |
1000 const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); | 965 const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); |
1001 | 966 |
1002 __m128i in0, in1, in2, in3, in4, in5, in6, in7; | 967 __m128i in0, in1, in2, in3, in4, in5, in6, in7; |
1003 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; | 968 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; |
1004 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; | 969 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; |
1005 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; | 970 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
1006 | 971 |
1007 // Rows. Load 4-row input data. | 972 // Rows. Load 4-row input data. |
1008 in0 = _mm_load_si128((__m128i *)input); | 973 in0 = _mm_load_si128((const __m128i *)input); |
1009 in1 = _mm_load_si128((__m128i *)(input + 8 * 1)); | 974 in1 = _mm_load_si128((const __m128i *)(input + 8 * 1)); |
1010 in2 = _mm_load_si128((__m128i *)(input + 8 * 2)); | 975 in2 = _mm_load_si128((const __m128i *)(input + 8 * 2)); |
1011 in3 = _mm_load_si128((__m128i *)(input + 8 * 3)); | 976 in3 = _mm_load_si128((const __m128i *)(input + 8 * 3)); |
1012 | 977 |
1013 // 8x4 Transpose | 978 // 8x4 Transpose |
1014 TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3) | 979 TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3) |
1015 | 980 |
1016 // Stage1 | 981 // Stage1 |
1017 { | 982 { //NOLINT |
1018 const __m128i lo_17 = _mm_unpackhi_epi16(in0, in3); | 983 const __m128i lo_17 = _mm_unpackhi_epi16(in0, in3); |
1019 const __m128i lo_35 = _mm_unpackhi_epi16(in1, in2); | 984 const __m128i lo_35 = _mm_unpackhi_epi16(in1, in2); |
1020 | 985 |
1021 tmp0 = _mm_madd_epi16(lo_17, stg1_0); | 986 tmp0 = _mm_madd_epi16(lo_17, stg1_0); |
1022 tmp2 = _mm_madd_epi16(lo_17, stg1_1); | 987 tmp2 = _mm_madd_epi16(lo_17, stg1_1); |
1023 tmp4 = _mm_madd_epi16(lo_35, stg1_2); | 988 tmp4 = _mm_madd_epi16(lo_35, stg1_2); |
1024 tmp6 = _mm_madd_epi16(lo_35, stg1_3); | 989 tmp6 = _mm_madd_epi16(lo_35, stg1_3); |
1025 | 990 |
1026 tmp0 = _mm_add_epi32(tmp0, rounding); | 991 tmp0 = _mm_add_epi32(tmp0, rounding); |
1027 tmp2 = _mm_add_epi32(tmp2, rounding); | 992 tmp2 = _mm_add_epi32(tmp2, rounding); |
1028 tmp4 = _mm_add_epi32(tmp4, rounding); | 993 tmp4 = _mm_add_epi32(tmp4, rounding); |
1029 tmp6 = _mm_add_epi32(tmp6, rounding); | 994 tmp6 = _mm_add_epi32(tmp6, rounding); |
1030 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); | 995 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); |
1031 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); | 996 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); |
1032 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); | 997 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); |
1033 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); | 998 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); |
1034 | 999 |
1035 stp1_4 = _mm_packs_epi32(tmp0, zero); | 1000 stp1_4 = _mm_packs_epi32(tmp0, zero); |
1036 stp1_7 = _mm_packs_epi32(tmp2, zero); | 1001 stp1_7 = _mm_packs_epi32(tmp2, zero); |
1037 stp1_5 = _mm_packs_epi32(tmp4, zero); | 1002 stp1_5 = _mm_packs_epi32(tmp4, zero); |
1038 stp1_6 = _mm_packs_epi32(tmp6, zero); | 1003 stp1_6 = _mm_packs_epi32(tmp6, zero); |
1039 } | 1004 } |
1040 | 1005 |
1041 // Stage2 | 1006 // Stage2 |
1042 { | 1007 { //NOLINT |
1043 const __m128i lo_04 = _mm_unpacklo_epi16(in0, in2); | 1008 const __m128i lo_04 = _mm_unpacklo_epi16(in0, in2); |
1044 const __m128i lo_26 = _mm_unpacklo_epi16(in1, in3); | 1009 const __m128i lo_26 = _mm_unpacklo_epi16(in1, in3); |
1045 | 1010 |
1046 tmp0 = _mm_madd_epi16(lo_04, stg2_0); | 1011 tmp0 = _mm_madd_epi16(lo_04, stg2_0); |
1047 tmp2 = _mm_madd_epi16(lo_04, stg2_1); | 1012 tmp2 = _mm_madd_epi16(lo_04, stg2_1); |
1048 tmp4 = _mm_madd_epi16(lo_26, stg2_2); | 1013 tmp4 = _mm_madd_epi16(lo_26, stg2_2); |
1049 tmp6 = _mm_madd_epi16(lo_26, stg2_3); | 1014 tmp6 = _mm_madd_epi16(lo_26, stg2_3); |
1050 | 1015 |
1051 tmp0 = _mm_add_epi32(tmp0, rounding); | 1016 tmp0 = _mm_add_epi32(tmp0, rounding); |
1052 tmp2 = _mm_add_epi32(tmp2, rounding); | 1017 tmp2 = _mm_add_epi32(tmp2, rounding); |
1053 tmp4 = _mm_add_epi32(tmp4, rounding); | 1018 tmp4 = _mm_add_epi32(tmp4, rounding); |
1054 tmp6 = _mm_add_epi32(tmp6, rounding); | 1019 tmp6 = _mm_add_epi32(tmp6, rounding); |
1055 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); | 1020 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); |
1056 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); | 1021 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); |
1057 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); | 1022 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); |
1058 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); | 1023 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); |
1059 | 1024 |
1060 stp2_0 = _mm_packs_epi32(tmp0, zero); | 1025 stp2_0 = _mm_packs_epi32(tmp0, zero); |
1061 stp2_1 = _mm_packs_epi32(tmp2, zero); | 1026 stp2_1 = _mm_packs_epi32(tmp2, zero); |
1062 stp2_2 = _mm_packs_epi32(tmp4, zero); | 1027 stp2_2 = _mm_packs_epi32(tmp4, zero); |
1063 stp2_3 = _mm_packs_epi32(tmp6, zero); | 1028 stp2_3 = _mm_packs_epi32(tmp6, zero); |
1064 | 1029 |
1065 stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); | 1030 stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); |
1066 stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); | 1031 stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); |
1067 stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); | 1032 stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); |
1068 stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); | 1033 stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); |
1069 } | 1034 } |
1070 | 1035 |
1071 // Stage3 | 1036 // Stage3 |
1072 { | 1037 { //NOLINT |
1073 const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6); | 1038 const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6); |
1074 stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); | 1039 stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); |
1075 stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); | 1040 stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); |
1076 stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); | 1041 stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); |
1077 stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); | 1042 stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); |
1078 | 1043 |
1079 tmp0 = _mm_madd_epi16(lo_56, stg3_0); | 1044 tmp0 = _mm_madd_epi16(lo_56, stg3_0); |
1080 tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0 | 1045 tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0 |
1081 | 1046 |
1082 tmp0 = _mm_add_epi32(tmp0, rounding); | 1047 tmp0 = _mm_add_epi32(tmp0, rounding); |
(...skipping 13 matching lines...) |
1096 in4 = _mm_subs_epi16(stp1_3, stp2_4); | 1061 in4 = _mm_subs_epi16(stp1_3, stp2_4); |
1097 in5 = _mm_subs_epi16(stp1_2, stp1_5); | 1062 in5 = _mm_subs_epi16(stp1_2, stp1_5); |
1098 in6 = _mm_subs_epi16(stp1_1, stp1_6); | 1063 in6 = _mm_subs_epi16(stp1_1, stp1_6); |
1099 in7 = _mm_subs_epi16(stp1_0, stp2_7); | 1064 in7 = _mm_subs_epi16(stp1_0, stp2_7); |
1100 | 1065 |
1101 // Columns. 4x8 Transpose | 1066 // Columns. 4x8 Transpose |
1102 TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, | 1067 TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, |
1103 in4, in5, in6, in7) | 1068 in4, in5, in6, in7) |
1104 | 1069 |
1105 // 1D idct8x8 | 1070 // 1D idct8x8 |
1106 IDCT8x8_1D | 1071 IDCT8_1D |
1107 | 1072 |
1108 // Final rounding and shift | 1073 // Final rounding and shift |
1109 in0 = _mm_adds_epi16(in0, final_rounding); | 1074 in0 = _mm_adds_epi16(in0, final_rounding); |
1110 in1 = _mm_adds_epi16(in1, final_rounding); | 1075 in1 = _mm_adds_epi16(in1, final_rounding); |
1111 in2 = _mm_adds_epi16(in2, final_rounding); | 1076 in2 = _mm_adds_epi16(in2, final_rounding); |
1112 in3 = _mm_adds_epi16(in3, final_rounding); | 1077 in3 = _mm_adds_epi16(in3, final_rounding); |
1113 in4 = _mm_adds_epi16(in4, final_rounding); | 1078 in4 = _mm_adds_epi16(in4, final_rounding); |
1114 in5 = _mm_adds_epi16(in5, final_rounding); | 1079 in5 = _mm_adds_epi16(in5, final_rounding); |
1115 in6 = _mm_adds_epi16(in6, final_rounding); | 1080 in6 = _mm_adds_epi16(in6, final_rounding); |
1116 in7 = _mm_adds_epi16(in7, final_rounding); | 1081 in7 = _mm_adds_epi16(in7, final_rounding); |
(...skipping 10 matching lines...) |
1127 RECON_AND_STORE(dest, in0); | 1092 RECON_AND_STORE(dest, in0); |
1128 RECON_AND_STORE(dest, in1); | 1093 RECON_AND_STORE(dest, in1); |
1129 RECON_AND_STORE(dest, in2); | 1094 RECON_AND_STORE(dest, in2); |
1130 RECON_AND_STORE(dest, in3); | 1095 RECON_AND_STORE(dest, in3); |
1131 RECON_AND_STORE(dest, in4); | 1096 RECON_AND_STORE(dest, in4); |
1132 RECON_AND_STORE(dest, in5); | 1097 RECON_AND_STORE(dest, in5); |
1133 RECON_AND_STORE(dest, in6); | 1098 RECON_AND_STORE(dest, in6); |
1134 RECON_AND_STORE(dest, in7); | 1099 RECON_AND_STORE(dest, in7); |
1135 } | 1100 } |
1136 | 1101 |
1137 #define IDCT16x16_1D \ | 1102 #define IDCT16_1D \ |
1138 /* Stage2 */ \ | 1103 /* Stage2 */ \ |
1139 { \ | 1104 { \ |
1140 const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \ | 1105 const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \ |
1141 const __m128i hi_1_15 = _mm_unpackhi_epi16(in1, in15); \ | 1106 const __m128i hi_1_15 = _mm_unpackhi_epi16(in1, in15); \ |
1142 const __m128i lo_9_7 = _mm_unpacklo_epi16(in9, in7); \ | 1107 const __m128i lo_9_7 = _mm_unpacklo_epi16(in9, in7); \ |
1143 const __m128i hi_9_7 = _mm_unpackhi_epi16(in9, in7); \ | 1108 const __m128i hi_9_7 = _mm_unpackhi_epi16(in9, in7); \ |
1144 const __m128i lo_5_11 = _mm_unpacklo_epi16(in5, in11); \ | 1109 const __m128i lo_5_11 = _mm_unpacklo_epi16(in5, in11); \ |
1145 const __m128i hi_5_11 = _mm_unpackhi_epi16(in5, in11); \ | 1110 const __m128i hi_5_11 = _mm_unpackhi_epi16(in5, in11); \ |
1146 const __m128i lo_13_3 = _mm_unpacklo_epi16(in13, in3); \ | 1111 const __m128i lo_13_3 = _mm_unpacklo_epi16(in13, in3); \ |
1147 const __m128i hi_13_3 = _mm_unpackhi_epi16(in13, in3); \ | 1112 const __m128i hi_13_3 = _mm_unpackhi_epi16(in13, in3); \ |
(...skipping 108 matching lines...) |
1256 stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ | 1221 stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ |
1257 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ | 1222 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ |
1258 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ | 1223 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ |
1259 stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ | 1224 stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ |
1260 \ | 1225 \ |
1261 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ | 1226 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ |
1262 stg6_0, stg4_0, stg6_0, stg4_0, \ | 1227 stg6_0, stg4_0, stg6_0, stg4_0, \ |
1263 stp2_10, stp2_13, stp2_11, stp2_12) \ | 1228 stp2_10, stp2_13, stp2_11, stp2_12) \ |
1264 } | 1229 } |
1265 | 1230 |
1266 void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) { | 1231 void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, |
| 1232 int stride) { |
1267 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 1233 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
1268 const __m128i final_rounding = _mm_set1_epi16(1<<5); | 1234 const __m128i final_rounding = _mm_set1_epi16(1<<5); |
1269 const __m128i zero = _mm_setzero_si128(); | 1235 const __m128i zero = _mm_setzero_si128(); |
1270 | 1236 |
1271 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); | 1237 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); |
1272 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); | 1238 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); |
1273 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); | 1239 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); |
1274 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); | 1240 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); |
1275 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); | 1241 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); |
1276 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); | 1242 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); |
(...skipping 34 matching lines...) |
1311 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; | 1277 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
1312 int i; | 1278 int i; |
1313 | 1279 |
1314 // We work on an 8x16 block each time, and loop 4 times for 2-D 16x16 idct. | 1280 // We work on an 8x16 block each time, and loop 4 times for 2-D 16x16 idct. |
1315 for (i = 0; i < 4; i++) { | 1281 for (i = 0; i < 4; i++) { |
1316 // 1-D idct | 1282 // 1-D idct |
1317 if (i < 2) { | 1283 if (i < 2) { |
1318 if (i == 1) input += 128; | 1284 if (i == 1) input += 128; |
1319 | 1285 |
1320 // Load input data. | 1286 // Load input data. |
1321 in0 = _mm_load_si128((__m128i *)input); | 1287 in0 = _mm_load_si128((const __m128i *)input); |
1322 in8 = _mm_load_si128((__m128i *)(input + 8 * 1)); | 1288 in8 = _mm_load_si128((const __m128i *)(input + 8 * 1)); |
1323 in1 = _mm_load_si128((__m128i *)(input + 8 * 2)); | 1289 in1 = _mm_load_si128((const __m128i *)(input + 8 * 2)); |
1324 in9 = _mm_load_si128((__m128i *)(input + 8 * 3)); | 1290 in9 = _mm_load_si128((const __m128i *)(input + 8 * 3)); |
1325 in2 = _mm_load_si128((__m128i *)(input + 8 * 4)); | 1291 in2 = _mm_load_si128((const __m128i *)(input + 8 * 4)); |
1326 in10 = _mm_load_si128((__m128i *)(input + 8 * 5)); | 1292 in10 = _mm_load_si128((const __m128i *)(input + 8 * 5)); |
1327 in3 = _mm_load_si128((__m128i *)(input + 8 * 6)); | 1293 in3 = _mm_load_si128((const __m128i *)(input + 8 * 6)); |
1328 in11 = _mm_load_si128((__m128i *)(input + 8 * 7)); | 1294 in11 = _mm_load_si128((const __m128i *)(input + 8 * 7)); |
1329 in4 = _mm_load_si128((__m128i *)(input + 8 * 8)); | 1295 in4 = _mm_load_si128((const __m128i *)(input + 8 * 8)); |
1330 in12 = _mm_load_si128((__m128i *)(input + 8 * 9)); | 1296 in12 = _mm_load_si128((const __m128i *)(input + 8 * 9)); |
1331 in5 = _mm_load_si128((__m128i *)(input + 8 * 10)); | 1297 in5 = _mm_load_si128((const __m128i *)(input + 8 * 10)); |
1332 in13 = _mm_load_si128((__m128i *)(input + 8 * 11)); | 1298 in13 = _mm_load_si128((const __m128i *)(input + 8 * 11)); |
1333 in6 = _mm_load_si128((__m128i *)(input + 8 * 12)); | 1299 in6 = _mm_load_si128((const __m128i *)(input + 8 * 12)); |
1334 in14 = _mm_load_si128((__m128i *)(input + 8 * 13)); | 1300 in14 = _mm_load_si128((const __m128i *)(input + 8 * 13)); |
1335 in7 = _mm_load_si128((__m128i *)(input + 8 * 14)); | 1301 in7 = _mm_load_si128((const __m128i *)(input + 8 * 14)); |
1336 in15 = _mm_load_si128((__m128i *)(input + 8 * 15)); | 1302 in15 = _mm_load_si128((const __m128i *)(input + 8 * 15)); |
1337 | 1303 |
1338 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, | 1304 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, |
1339 in4, in5, in6, in7); | 1305 in4, in5, in6, in7); |
1340 TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, | 1306 TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, |
1341 in10, in11, in12, in13, in14, in15); | 1307 in10, in11, in12, in13, in14, in15); |
1342 } | 1308 } |
1343 | 1309 |
1344 if (i == 2) { | 1310 if (i == 2) { |
1345 TRANSPOSE_8X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4, | 1311 TRANSPOSE_8X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4, |
1346 in5, in6, in7); | 1312 in5, in6, in7); |
1347 TRANSPOSE_8X8(r0, r1, r2, r3, r4, r5, r6, r7, in8, in9, in10, in11, in12, | 1313 TRANSPOSE_8X8(r0, r1, r2, r3, r4, r5, r6, r7, in8, in9, in10, in11, in12, |
1348 in13, in14, in15); | 1314 in13, in14, in15); |
1349 } | 1315 } |
1350 | 1316 |
1351 if (i == 3) { | 1317 if (i == 3) { |
1352 TRANSPOSE_8X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3, | 1318 TRANSPOSE_8X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3, |
1353 in4, in5, in6, in7); | 1319 in4, in5, in6, in7); |
1354 TRANSPOSE_8X8(r8, r9, r10, r11, r12, r13, r14, r15, in8, in9, in10, in11, | 1320 TRANSPOSE_8X8(r8, r9, r10, r11, r12, r13, r14, r15, in8, in9, in10, in11, |
1355 in12, in13, in14, in15); | 1321 in12, in13, in14, in15); |
1356 } | 1322 } |
1357 | 1323 |
1358 IDCT16x16_1D | 1324 IDCT16_1D |
1359 | 1325 |
1360 // Stage7 | 1326 // Stage7 |
1361 if (i == 0) { | 1327 if (i == 0) { |
1362 // Left 8x16 | 1328 // Left 8x16 |
1363 l0 = _mm_add_epi16(stp2_0, stp1_15); | 1329 l0 = _mm_add_epi16(stp2_0, stp1_15); |
1364 l1 = _mm_add_epi16(stp2_1, stp1_14); | 1330 l1 = _mm_add_epi16(stp2_1, stp1_14); |
1365 l2 = _mm_add_epi16(stp2_2, stp2_13); | 1331 l2 = _mm_add_epi16(stp2_2, stp2_13); |
1366 l3 = _mm_add_epi16(stp2_3, stp2_12); | 1332 l3 = _mm_add_epi16(stp2_3, stp2_12); |
1367 l4 = _mm_add_epi16(stp2_4, stp2_11); | 1333 l4 = _mm_add_epi16(stp2_4, stp2_11); |
1368 l5 = _mm_add_epi16(stp2_5, stp2_10); | 1334 l5 = _mm_add_epi16(stp2_5, stp2_10); |
(...skipping 94 matching lines...) |
1463 RECON_AND_STORE(dest, in12); | 1429 RECON_AND_STORE(dest, in12); |
1464 RECON_AND_STORE(dest, in13); | 1430 RECON_AND_STORE(dest, in13); |
1465 RECON_AND_STORE(dest, in14); | 1431 RECON_AND_STORE(dest, in14); |
1466 RECON_AND_STORE(dest, in15); | 1432 RECON_AND_STORE(dest, in15); |
1467 | 1433 |
1468 dest += 8 - (stride * 16); | 1434 dest += 8 - (stride * 16); |
1469 } | 1435 } |
1470 } | 1436 } |
1471 } | 1437 } |
1472 | 1438 |
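The loop above deserves a map, since 16x16 does not fit the registers the way 8x8 does; each __m128i holds eight coefficients, so an 8x16 slice is the natural unit. Our reading of the visible control flow:

/* Pass layout of the 2-D 16x16 inverse (one 8x16 slice per iteration):
 *   i == 0: rows 0..7  -> transpose, 1-D idct, park results in l0..l15
 *   i == 1: rows 8..15 -> same, park results in r0..r15
 *   i == 2: transpose l0..l7 / r0..r7, 1-D idct, reconstruct the left
 *           8 columns into dest
 *   i == 3: same with l8..l15 / r8..r15 for the right 8 columns */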
1473 void vp9_short_idct16x16_1_add_sse2(int16_t *input, uint8_t *dest, int stride) { | 1439 void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
1474 __m128i dc_value; | 1440 __m128i dc_value; |
1475 const __m128i zero = _mm_setzero_si128(); | 1441 const __m128i zero = _mm_setzero_si128(); |
1476 int a, i; | 1442 int a, i; |
1477 | 1443 |
1478 a = dct_const_round_shift(input[0] * cospi_16_64); | 1444 a = dct_const_round_shift(input[0] * cospi_16_64); |
1479 a = dct_const_round_shift(a * cospi_16_64); | 1445 a = dct_const_round_shift(a * cospi_16_64); |
1480 a = ROUND_POWER_OF_TWO(a, 6); | 1446 a = ROUND_POWER_OF_TWO(a, 6); |
1481 | 1447 |
1482 dc_value = _mm_set1_epi16(a); | 1448 dc_value = _mm_set1_epi16(a); |
1483 | 1449 |
(...skipping 28 matching lines...) |
1512 res0[8] = tbuf[0]; | 1478 res0[8] = tbuf[0]; |
1513 res0[9] = tbuf[1]; | 1479 res0[9] = tbuf[1]; |
1514 res0[10] = tbuf[2]; | 1480 res0[10] = tbuf[2]; |
1515 res0[11] = tbuf[3]; | 1481 res0[11] = tbuf[3]; |
1516 res0[12] = tbuf[4]; | 1482 res0[12] = tbuf[4]; |
1517 res0[13] = tbuf[5]; | 1483 res0[13] = tbuf[5]; |
1518 res0[14] = tbuf[6]; | 1484 res0[14] = tbuf[6]; |
1519 res0[15] = tbuf[7]; | 1485 res0[15] = tbuf[7]; |
1520 } | 1486 } |
1521 | 1487 |
1522 void iadst16_1d_8col(__m128i *in) { | 1488 static void iadst16_1d_8col(__m128i *in) { |
1523 // perform 16x16 1-D ADST for 8 columns | 1489 // perform 16x16 1-D ADST for 8 columns |
1524 __m128i s[16], x[16], u[32], v[32]; | 1490 __m128i s[16], x[16], u[32], v[32]; |
1525 const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); | 1491 const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); |
1526 const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); | 1492 const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); |
1527 const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); | 1493 const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); |
1528 const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); | 1494 const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); |
1529 const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); | 1495 const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); |
1530 const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); | 1496 const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); |
1531 const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); | 1497 const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); |
1532 const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); | 1498 const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); |
(...skipping 449 matching lines...) |
1982 in[8] = _mm_packs_epi32(v[2], v[3]); | 1948 in[8] = _mm_packs_epi32(v[2], v[3]); |
1983 in[9] = _mm_packs_epi32(v[10], v[11]); | 1949 in[9] = _mm_packs_epi32(v[10], v[11]); |
1984 in[10] = _mm_packs_epi32(v[14], v[15]); | 1950 in[10] = _mm_packs_epi32(v[14], v[15]); |
1985 in[11] = _mm_packs_epi32(v[6], v[7]); | 1951 in[11] = _mm_packs_epi32(v[6], v[7]); |
1986 in[12] = s[5]; | 1952 in[12] = s[5]; |
1987 in[13] = _mm_sub_epi16(kZero, s[13]); | 1953 in[13] = _mm_sub_epi16(kZero, s[13]); |
1988 in[14] = s[9]; | 1954 in[14] = s[9]; |
1989 in[15] = _mm_sub_epi16(kZero, s[1]); | 1955 in[15] = _mm_sub_epi16(kZero, s[1]); |
1990 } | 1956 } |
1991 | 1957 |
1992 void idct16_1d_8col(__m128i *in) { | 1958 static void idct16_1d_8col(__m128i *in) { |
1993 const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); | 1959 const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); |
1994 const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); | 1960 const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); |
1995 const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); | 1961 const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); |
1996 const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); | 1962 const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); |
1997 const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); | 1963 const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); |
1998 const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); | 1964 const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); |
1999 const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); | 1965 const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); |
2000 const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); | 1966 const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); |
2001 const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); | 1967 const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); |
2002 const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); | 1968 const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); |
(...skipping 323 matching lines...) |
2326 in[8] = _mm_sub_epi16(s[7], s[8]); | 2292 in[8] = _mm_sub_epi16(s[7], s[8]); |
2327 in[9] = _mm_sub_epi16(s[6], s[9]); | 2293 in[9] = _mm_sub_epi16(s[6], s[9]); |
2328 in[10] = _mm_sub_epi16(s[5], s[10]); | 2294 in[10] = _mm_sub_epi16(s[5], s[10]); |
2329 in[11] = _mm_sub_epi16(s[4], s[11]); | 2295 in[11] = _mm_sub_epi16(s[4], s[11]); |
2330 in[12] = _mm_sub_epi16(s[3], s[12]); | 2296 in[12] = _mm_sub_epi16(s[3], s[12]); |
2331 in[13] = _mm_sub_epi16(s[2], s[13]); | 2297 in[13] = _mm_sub_epi16(s[2], s[13]); |
2332 in[14] = _mm_sub_epi16(s[1], s[14]); | 2298 in[14] = _mm_sub_epi16(s[1], s[14]); |
2333 in[15] = _mm_sub_epi16(s[0], s[15]); | 2299 in[15] = _mm_sub_epi16(s[0], s[15]); |
2334 } | 2300 } |
2335 | 2301 |
2336 void idct16_1d_sse2(__m128i *in0, __m128i *in1) { | 2302 static void idct16_1d_sse2(__m128i *in0, __m128i *in1) { |
2337 array_transpose_16x16(in0, in1); | 2303 array_transpose_16x16(in0, in1); |
2338 idct16_1d_8col(in0); | 2304 idct16_1d_8col(in0); |
2339 idct16_1d_8col(in1); | 2305 idct16_1d_8col(in1); |
2340 } | 2306 } |
2341 | 2307 |
2342 void iadst16_1d_sse2(__m128i *in0, __m128i *in1) { | 2308 static void iadst16_1d_sse2(__m128i *in0, __m128i *in1) { |
2343 array_transpose_16x16(in0, in1); | 2309 array_transpose_16x16(in0, in1); |
2344 iadst16_1d_8col(in0); | 2310 iadst16_1d_8col(in0); |
2345 iadst16_1d_8col(in1); | 2311 iadst16_1d_8col(in1); |
2346 } | 2312 } |
2347 | 2313 |
2348 static INLINE void load_buffer_8x16(int16_t *input, __m128i *in) { | 2314 static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) { |
2349 in[0] = _mm_load_si128((__m128i *)(input + 0 * 16)); | 2315 in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16)); |
2350 in[1] = _mm_load_si128((__m128i *)(input + 1 * 16)); | 2316 in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16)); |
2351 in[2] = _mm_load_si128((__m128i *)(input + 2 * 16)); | 2317 in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16)); |
2352 in[3] = _mm_load_si128((__m128i *)(input + 3 * 16)); | 2318 in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16)); |
2353 in[4] = _mm_load_si128((__m128i *)(input + 4 * 16)); | 2319 in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16)); |
2354 in[5] = _mm_load_si128((__m128i *)(input + 5 * 16)); | 2320 in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16)); |
2355 in[6] = _mm_load_si128((__m128i *)(input + 6 * 16)); | 2321 in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16)); |
2356 in[7] = _mm_load_si128((__m128i *)(input + 7 * 16)); | 2322 in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16)); |
2357 | 2323 |
2358 in[8] = _mm_load_si128((__m128i *)(input + 8 * 16)); | 2324 in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16)); |
2359 in[9] = _mm_load_si128((__m128i *)(input + 9 * 16)); | 2325 in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16)); |
2360 in[10] = _mm_load_si128((__m128i *)(input + 10 * 16)); | 2326 in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16)); |
2361 in[11] = _mm_load_si128((__m128i *)(input + 11 * 16)); | 2327 in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16)); |
2362 in[12] = _mm_load_si128((__m128i *)(input + 12 * 16)); | 2328 in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16)); |
2363 in[13] = _mm_load_si128((__m128i *)(input + 13 * 16)); | 2329 in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16)); |
2364 in[14] = _mm_load_si128((__m128i *)(input + 14 * 16)); | 2330 in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16)); |
2365 in[15] = _mm_load_si128((__m128i *)(input + 15 * 16)); | 2331 in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16)); |
2366 } | 2332 } |
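// Note: _mm_load_si128 is the aligned load, so load_buffer_8x16 assumes a
// 16-byte-aligned coefficient buffer (libvpx declares its dqcoeff buffers
// with DECLARE_ALIGNED(16, ...)); an unaligned source would need
// _mm_loadu_si128 instead.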
2367 | 2333 |
2368 static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { | 2334 static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { |
2369 const __m128i final_rounding = _mm_set1_epi16(1<<5); | 2335 const __m128i final_rounding = _mm_set1_epi16(1<<5); |
2370 const __m128i zero = _mm_setzero_si128(); | 2336 const __m128i zero = _mm_setzero_si128(); |
2371 // Final rounding and shift | 2337 // Final rounding and shift |
2372 in[0] = _mm_adds_epi16(in[0], final_rounding); | 2338 in[0] = _mm_adds_epi16(in[0], final_rounding); |
2373 in[1] = _mm_adds_epi16(in[1], final_rounding); | 2339 in[1] = _mm_adds_epi16(in[1], final_rounding); |
2374 in[2] = _mm_adds_epi16(in[2], final_rounding); | 2340 in[2] = _mm_adds_epi16(in[2], final_rounding); |
2375 in[3] = _mm_adds_epi16(in[3], final_rounding); | 2341 in[3] = _mm_adds_epi16(in[3], final_rounding); |
(...skipping 38 matching lines...)
2414 RECON_AND_STORE(dest, in[8]); | 2380 RECON_AND_STORE(dest, in[8]); |
2415 RECON_AND_STORE(dest, in[9]); | 2381 RECON_AND_STORE(dest, in[9]); |
2416 RECON_AND_STORE(dest, in[10]); | 2382 RECON_AND_STORE(dest, in[10]); |
2417 RECON_AND_STORE(dest, in[11]); | 2383 RECON_AND_STORE(dest, in[11]); |
2418 RECON_AND_STORE(dest, in[12]); | 2384 RECON_AND_STORE(dest, in[12]); |
2419 RECON_AND_STORE(dest, in[13]); | 2385 RECON_AND_STORE(dest, in[13]); |
2420 RECON_AND_STORE(dest, in[14]); | 2386 RECON_AND_STORE(dest, in[14]); |
2421 RECON_AND_STORE(dest, in[15]); | 2387 RECON_AND_STORE(dest, in[15]); |
2422 } | 2388 } |
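// The 1 << 5 bias plus the (elided) arithmetic shift right by 6 implement
// round-to-nearest division by 64, removing the scaling left by the two 1-D
// passes before RECON_AND_STORE adds the residual into the prediction. A
// scalar model for one pixel (a sketch; clamp() and ROUND_POWER_OF_TWO()
// come from vp9_common.h, and the SIMD path saturates where this clamps):
static uint8_t recon_pixel_sketch(int16_t residual, uint8_t pred) {
  const int r = ROUND_POWER_OF_TWO(residual, 6);  // (residual + 32) >> 6
  return (uint8_t)clamp(pred + r, 0, 255);
}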
2423 | 2389 |
2424 void vp9_short_iht16x16_add_sse2(int16_t *input, uint8_t *dest, int stride, | 2390 void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride, |
2425 int tx_type) { | 2391 int tx_type) { |
2426 __m128i in0[16], in1[16]; | 2392 __m128i in0[16], in1[16]; |
2427 | 2393 |
2428 load_buffer_8x16(input, in0); | 2394 load_buffer_8x16(input, in0); |
2429 input += 8; | 2395 input += 8; |
2430 load_buffer_8x16(input, in1); | 2396 load_buffer_8x16(input, in1); |
2431 | 2397 |
2432 switch (tx_type) { | 2398 switch (tx_type) { |
2433 case 0: // DCT_DCT | 2399 case 0: // DCT_DCT |
2434 idct16_1d_sse2(in0, in1); | 2400 idct16_1d_sse2(in0, in1); |
2435 idct16_1d_sse2(in0, in1); | 2401 idct16_1d_sse2(in0, in1); |
(...skipping 13 matching lines...)
2449 default: | 2415 default: |
2450 assert(0); | 2416 assert(0); |
2451 break; | 2417 break; |
2452 } | 2418 } |
2453 | 2419 |
2454 write_buffer_8x16(dest, in0, stride); | 2420 write_buffer_8x16(dest, in0, stride); |
2455 dest += 8; | 2421 dest += 8; |
2456 write_buffer_8x16(dest, in1, stride); | 2422 write_buffer_8x16(dest, in1, stride); |
2457 } | 2423 } |
2458 | 2424 |
2459 void vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest, | 2425 void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, |
2460 int stride) { | 2426 int stride) { |
2461 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 2427 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
2462 const __m128i final_rounding = _mm_set1_epi16(1<<5); | 2428 const __m128i final_rounding = _mm_set1_epi16(1<<5); |
2463 const __m128i zero = _mm_setzero_si128(); | 2429 const __m128i zero = _mm_setzero_si128(); |
2464 | 2430 |
2465 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); | 2431 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); |
2466 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); | 2432 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); |
2467 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); | 2433 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); |
2468 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); | 2434 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); |
2469 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); | 2435 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); |
2470 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); | 2436 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); |
(...skipping 25 matching lines...)
2496 l12 = zero, l13 = zero, l14 = zero, l15 = zero; | 2462 l12 = zero, l13 = zero, l14 = zero, l15 = zero; |
2497 | 2463 |
2498 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, | 2464 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, |
2499 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, | 2465 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, |
2500 stp1_8_0, stp1_12_0; | 2466 stp1_8_0, stp1_12_0; |
2501 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, | 2467 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, |
2502 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; | 2468 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; |
2503 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; | 2469 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
2504 int i; | 2470 int i; |
2505 // 1-D idct. Load input data. | 2471 // 1-D idct. Load input data. |
2506 in0 = _mm_load_si128((__m128i *)input); | 2472 in0 = _mm_load_si128((const __m128i *)input); |
2507 in8 = _mm_load_si128((__m128i *)(input + 8 * 1)); | 2473 in8 = _mm_load_si128((const __m128i *)(input + 8 * 1)); |
2508 in1 = _mm_load_si128((__m128i *)(input + 8 * 2)); | 2474 in1 = _mm_load_si128((const __m128i *)(input + 8 * 2)); |
2509 in9 = _mm_load_si128((__m128i *)(input + 8 * 3)); | 2475 in9 = _mm_load_si128((const __m128i *)(input + 8 * 3)); |
2510 in2 = _mm_load_si128((__m128i *)(input + 8 * 4)); | 2476 in2 = _mm_load_si128((const __m128i *)(input + 8 * 4)); |
2511 in10 = _mm_load_si128((__m128i *)(input + 8 * 5)); | 2477 in10 = _mm_load_si128((const __m128i *)(input + 8 * 5)); |
2512 in3 = _mm_load_si128((__m128i *)(input + 8 * 6)); | 2478 in3 = _mm_load_si128((const __m128i *)(input + 8 * 6)); |
2513 in11 = _mm_load_si128((__m128i *)(input + 8 * 7)); | 2479 in11 = _mm_load_si128((const __m128i *)(input + 8 * 7)); |
2514 | 2480 |
2515 TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3); | 2481 TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3); |
2516 TRANSPOSE_8X4(in8, in9, in10, in11, in8, in9, in10, in11); | 2482 TRANSPOSE_8X4(in8, in9, in10, in11, in8, in9, in10, in11); |
2517 | 2483 |
2518 // Stage2 | 2484 // Stage2 |
2519 { | 2485 { |
2520 const __m128i lo_1_15 = _mm_unpackhi_epi16(in0, in11); | 2486 const __m128i lo_1_15 = _mm_unpackhi_epi16(in0, in11); |
2521 const __m128i lo_9_7 = _mm_unpackhi_epi16(in8, in3); | 2487 const __m128i lo_9_7 = _mm_unpackhi_epi16(in8, in3); |
2522 const __m128i lo_5_11 = _mm_unpackhi_epi16(in2, in9); | 2488 const __m128i lo_5_11 = _mm_unpackhi_epi16(in2, in9); |
2523 const __m128i lo_13_3 = _mm_unpackhi_epi16(in10, in1); | 2489 const __m128i lo_13_3 = _mm_unpackhi_epi16(in10, in1); |
(...skipping 206 matching lines...)
2730 if (i == 0) | 2696 if (i == 0) |
2731 TRANSPOSE_4X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4, | 2697 TRANSPOSE_4X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4, |
2732 in5, in6, in7); | 2698 in5, in6, in7); |
2733 | 2699 |
2734 if (i == 1) | 2700 if (i == 1) |
2735 TRANSPOSE_4X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3, | 2701 TRANSPOSE_4X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3, |
2736 in4, in5, in6, in7); | 2702 in4, in5, in6, in7); |
2737 | 2703 |
2738 in8 = in9 = in10 = in11 = in12 = in13 = in14 = in15 = zero; | 2704 in8 = in9 = in10 = in11 = in12 = in13 = in14 = in15 = zero; |
2739 | 2705 |
2740 IDCT16x16_1D | 2706 IDCT16_1D |
2741 | 2707 |
2742 // Stage7 | 2708 // Stage7 |
2743 in0 = _mm_add_epi16(stp2_0, stp1_15); | 2709 in0 = _mm_add_epi16(stp2_0, stp1_15); |
2744 in1 = _mm_add_epi16(stp2_1, stp1_14); | 2710 in1 = _mm_add_epi16(stp2_1, stp1_14); |
2745 in2 = _mm_add_epi16(stp2_2, stp2_13); | 2711 in2 = _mm_add_epi16(stp2_2, stp2_13); |
2746 in3 = _mm_add_epi16(stp2_3, stp2_12); | 2712 in3 = _mm_add_epi16(stp2_3, stp2_12); |
2747 in4 = _mm_add_epi16(stp2_4, stp2_11); | 2713 in4 = _mm_add_epi16(stp2_4, stp2_11); |
2748 in5 = _mm_add_epi16(stp2_5, stp2_10); | 2714 in5 = _mm_add_epi16(stp2_5, stp2_10); |
2749 in6 = _mm_add_epi16(stp2_6, stp1_9); | 2715 in6 = _mm_add_epi16(stp2_6, stp1_9); |
2750 in7 = _mm_add_epi16(stp2_7, stp1_8); | 2716 in7 = _mm_add_epi16(stp2_7, stp1_8); |
(...skipping 57 matching lines...)
2808 RECON_AND_STORE(dest, in13); | 2774 RECON_AND_STORE(dest, in13); |
2809 RECON_AND_STORE(dest, in14); | 2775 RECON_AND_STORE(dest, in14); |
2810 RECON_AND_STORE(dest, in15); | 2776 RECON_AND_STORE(dest, in15); |
2811 | 2777 |
2812 dest += 8 - (stride * 16); | 2778 dest += 8 - (stride * 16); |
2813 } | 2779 } |
2814 } | 2780 } |
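// The _10_ suffix marks the reduced path: at most the first 10 coefficients
// of the scan survive dequantization, all inside the top-left 4x4, which is
// why only four partial rows are loaded above and every other lane starts as
// zero. A hypothetical eob-based dispatch (a sketch; the selection logic is
// assumed, and vp9_idct16x16_256_add_sse2 is taken to be the full-size
// kernel per this file's <size>_<coeffs> naming):
static void idct16x16_add_sketch(const int16_t *input, uint8_t *dest,
                                 int stride, int eob) {
  if (eob <= 10)
    vp9_idct16x16_10_add_sse2(input, dest, stride);   // top-left 4x4 only
  else
    vp9_idct16x16_256_add_sse2(input, dest, stride);  // full 16x16
}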
2815 | 2781 |
2816 #define LOAD_DQCOEFF(reg, input) \ | 2782 #define LOAD_DQCOEFF(reg, input) \ |
2817 { \ | 2783 { \ |
2818 reg = _mm_load_si128((__m128i *) input); \ | 2784 reg = _mm_load_si128((const __m128i *)input); \ |
2819 input += 8; \ | 2785 input += 8; \ |
2820 } \ | 2786 } \ |
2821 | 2787 |
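// Usage sketch: each LOAD_DQCOEFF reads eight dequantized coefficients and
// advances the pointer, so consecutive calls stream through the buffer in
// row order; four calls cover one 32-coefficient row, the exact pattern of
// the 32x32 functions below:
//   LOAD_DQCOEFF(in0, input);   // coeffs 0..7, input advances by 8
//   LOAD_DQCOEFF(in8, input);   // coeffs 8..15
//   LOAD_DQCOEFF(in16, input);  // coeffs 16..23
//   LOAD_DQCOEFF(in24, input);  // coeffs 24..31
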
2822 void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) { | 2788 #define IDCT32_1D \ |
| 2789 /* Stage1 */ \ |
| 2790 { \ |
| 2791 const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31); \ |
| 2792 const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31); \ |
| 2793 const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15); \ |
| 2794 const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15); \ |
| 2795 \ |
| 2796 const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23); \ |
| 2797 const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23); \ |
| 2798 const __m128i lo_25_7 = _mm_unpacklo_epi16(in25, in7); \ |
| 2799 const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7); \ |
| 2800 \ |
| 2801 const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27); \ |
| 2802 const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27); \ |
| 2803 const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11); \ |
| 2804 const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11); \ |
| 2805 \ |
| 2806 const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19); \ |
| 2807 const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19); \ |
| 2808 const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3); \ |
| 2809 const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3); \ |
| 2810 \ |
| 2811 MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \ |
| 2812 stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \ |
| 2813 stp1_17, stp1_30) \ |
| 2814 MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \ |
| 2815 stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \ |
| 2816 stp1_19, stp1_28) \ |
| 2817 MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \ |
| 2818 stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \ |
| 2819 stp1_21, stp1_26) \ |
| 2820 MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \ |
| 2821 stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \ |
| 2822 stp1_23, stp1_24) \ |
| 2823 } \ |
| 2824 \ |
| 2825 /* Stage2 */ \ |
| 2826 { \ |
| 2827 const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30); \ |
| 2828 const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30); \ |
| 2829 const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14); \ |
| 2830 const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14); \ |
| 2831 \ |
| 2832 const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22); \ |
| 2833 const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22); \ |
| 2834 const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6); \ |
| 2835 const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6); \ |
| 2836 \ |
| 2837 MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \ |
| 2838 stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \ |
| 2839 stp2_14) \ |
| 2840 MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \ |
| 2841 stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \ |
| 2842 stp2_11, stp2_12) \ |
| 2843 \ |
| 2844 stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \ |
| 2845 stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \ |
| 2846 stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \ |
| 2847 stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \ |
| 2848 \ |
| 2849 stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \ |
| 2850 stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \ |
| 2851 stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \ |
| 2852 stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \ |
| 2853 \ |
| 2854 stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \ |
| 2855 stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \ |
| 2856 stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \ |
| 2857 stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \ |
| 2858 \ |
| 2859 stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \ |
| 2860 stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \ |
| 2861 stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \ |
| 2862 stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \ |
| 2863 } \ |
| 2864 \ |
| 2865 /* Stage3 */ \ |
| 2866 { \ |
| 2867 const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28); \ |
| 2868 const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28); \ |
| 2869 const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12); \ |
| 2870 const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12); \ |
| 2871 \ |
| 2872 const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \ |
| 2873 const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \ |
| 2874 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ |
| 2875 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ |
| 2876 \ |
| 2877 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ |
| 2878 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ |
| 2879 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ |
| 2880 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ |
| 2881 \ |
| 2882 MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \ |
| 2883 stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \ |
| 2884 stp1_6) \ |
| 2885 \ |
| 2886 stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \ |
| 2887 stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ |
| 2888 stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ |
| 2889 stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ |
| 2890 stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \ |
| 2891 stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ |
| 2892 stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ |
| 2893 stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ |
| 2894 \ |
| 2895 MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ |
| 2896 stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \ |
| 2897 stp1_18, stp1_29) \ |
| 2898 MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ |
| 2899 stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \ |
| 2900 stp1_22, stp1_25) \ |
| 2901 \ |
| 2902 stp1_16 = stp2_16; \ |
| 2903 stp1_31 = stp2_31; \ |
| 2904 stp1_19 = stp2_19; \ |
| 2905 stp1_20 = stp2_20; \ |
| 2906 stp1_23 = stp2_23; \ |
| 2907 stp1_24 = stp2_24; \ |
| 2908 stp1_27 = stp2_27; \ |
| 2909 stp1_28 = stp2_28; \ |
| 2910 } \ |
| 2911 \ |
| 2912 /* Stage4 */ \ |
| 2913 { \ |
| 2914 const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16); \ |
| 2915 const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16); \ |
| 2916 const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24); \ |
| 2917 const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24); \ |
| 2918 \ |
| 2919 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ |
| 2920 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ |
| 2921 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ |
| 2922 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ |
| 2923 \ |
| 2924 MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \ |
| 2925 stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \ |
| 2926 stp2_2, stp2_3) \ |
| 2927 \ |
| 2928 stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ |
| 2929 stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ |
| 2930 stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ |
| 2931 stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ |
| 2932 \ |
| 2933 MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ |
| 2934 stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \ |
| 2935 stp2_10, stp2_13) \ |
| 2936 \ |
| 2937 stp2_8 = stp1_8; \ |
| 2938 stp2_15 = stp1_15; \ |
| 2939 stp2_11 = stp1_11; \ |
| 2940 stp2_12 = stp1_12; \ |
| 2941 \ |
| 2942 stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ |
| 2943 stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ |
| 2944 stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ |
| 2945 stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ |
| 2946 stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ |
| 2947 stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ |
| 2948 stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ |
| 2949 stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ |
| 2950 \ |
| 2951 stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ |
| 2952 stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ |
| 2953 stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ |
| 2954 stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ |
| 2955 stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ |
| 2956 stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ |
| 2957 stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ |
| 2958 stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ |
| 2959 } \ |
| 2960 \ |
| 2961 /* Stage5 */ \ |
| 2962 { \ |
| 2963 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ |
| 2964 const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ |
| 2965 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ |
| 2966 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ |
| 2967 \ |
| 2968 const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ |
| 2969 const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ |
| 2970 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ |
| 2971 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ |
| 2972 \ |
| 2973 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ |
| 2974 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ |
| 2975 \ |
| 2976 stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ |
| 2977 stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ |
| 2978 stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ |
| 2979 stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ |
| 2980 \ |
| 2981 tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ |
| 2982 tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ |
| 2983 tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ |
| 2984 tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ |
| 2985 \ |
| 2986 tmp0 = _mm_add_epi32(tmp0, rounding); \ |
| 2987 tmp1 = _mm_add_epi32(tmp1, rounding); \ |
| 2988 tmp2 = _mm_add_epi32(tmp2, rounding); \ |
| 2989 tmp3 = _mm_add_epi32(tmp3, rounding); \ |
| 2990 \ |
| 2991 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ |
| 2992 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ |
| 2993 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ |
| 2994 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ |
| 2995 \ |
| 2996 stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ |
| 2997 stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ |
| 2998 \ |
| 2999 stp1_4 = stp2_4; \ |
| 3000 stp1_7 = stp2_7; \ |
| 3001 \ |
| 3002 stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ |
| 3003 stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ |
| 3004 stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ |
| 3005 stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ |
| 3006 stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ |
| 3007 stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ |
| 3008 stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ |
| 3009 stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ |
| 3010 \ |
| 3011 stp1_16 = stp2_16; \ |
| 3012 stp1_17 = stp2_17; \ |
| 3013 \ |
| 3014 MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ |
| 3015 stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \ |
| 3016 stp1_19, stp1_28) \ |
| 3017 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ |
| 3018 stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \ |
| 3019 stp1_21, stp1_26) \ |
| 3020 \ |
| 3021 stp1_22 = stp2_22; \ |
| 3022 stp1_23 = stp2_23; \ |
| 3023 stp1_24 = stp2_24; \ |
| 3024 stp1_25 = stp2_25; \ |
| 3025 stp1_30 = stp2_30; \ |
| 3026 stp1_31 = stp2_31; \ |
| 3027 } \ |
| 3028 \ |
| 3029 /* Stage6 */ \ |
| 3030 { \ |
| 3031 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ |
| 3032 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ |
| 3033 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ |
| 3034 const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ |
| 3035 \ |
| 3036 stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ |
| 3037 stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ |
| 3038 stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ |
| 3039 stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ |
| 3040 stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ |
| 3041 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ |
| 3042 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ |
| 3043 stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ |
| 3044 \ |
| 3045 stp2_8 = stp1_8; \ |
| 3046 stp2_9 = stp1_9; \ |
| 3047 stp2_14 = stp1_14; \ |
| 3048 stp2_15 = stp1_15; \ |
| 3049 \ |
| 3050 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ |
| 3051 stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \ |
| 3052 stp2_13, stp2_11, stp2_12) \ |
| 3053 \ |
| 3054 stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ |
| 3055 stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ |
| 3056 stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ |
| 3057 stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ |
| 3058 stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ |
| 3059 stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ |
| 3060 stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ |
| 3061 stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ |
| 3062 \ |
| 3063 stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ |
| 3064 stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ |
| 3065 stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ |
| 3066 stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ |
| 3067 stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ |
| 3068 stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ |
| 3069 stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ |
| 3070 stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ |
| 3071 } \ |
| 3072 \ |
| 3073 /* Stage7 */ \ |
| 3074 { \ |
| 3075 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ |
| 3076 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ |
| 3077 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ |
| 3078 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ |
| 3079 \ |
| 3080 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ |
| 3081 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ |
| 3082 const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ |
| 3083 const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ |
| 3084 \ |
| 3085 stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ |
| 3086 stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ |
| 3087 stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ |
| 3088 stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ |
| 3089 stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ |
| 3090 stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ |
| 3091 stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ |
| 3092 stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ |
| 3093 stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ |
| 3094 stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ |
| 3095 stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ |
| 3096 stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ |
| 3097 stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ |
| 3098 stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ |
| 3099 stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ |
| 3100 stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ |
| 3101 \ |
| 3102 stp1_16 = stp2_16; \ |
| 3103 stp1_17 = stp2_17; \ |
| 3104 stp1_18 = stp2_18; \ |
| 3105 stp1_19 = stp2_19; \ |
| 3106 \ |
| 3107 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ |
| 3108 stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \ |
| 3109 stp1_21, stp1_26) \ |
| 3110 MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ |
| 3111 stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ |
| 3112 stp1_23, stp1_24) \ |
| 3113 \ |
| 3114 stp1_28 = stp2_28; \ |
| 3115 stp1_29 = stp2_29; \ |
| 3116 stp1_30 = stp2_30; \ |
| 3117 stp1_31 = stp2_31; \ |
| 3118 } |
| 3119 |
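// Each MULTIPLICATION_AND_ADD above is a fixed-point butterfly:
// _mm_madd_epi16 on the unpacked (a, b) pairs against a pair_set_epi16(c0,
// c1) constant yields a * c0 + b * c1 per 32-bit lane, and DCT_CONST_ROUNDING
// plus the DCT_CONST_BITS shift round it back to 16 bits, the same
// arithmetic as dct_const_round_shift() in vp9_idct.h. Scalar model of one
// output lane (a sketch):
static int16_t butterfly_lane_sketch(int16_t a, int16_t b, int c0, int c1) {
  return (int16_t)dct_const_round_shift(a * c0 + b * c1);
}
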
| 3120 // Only the upper-left 8x8 block has non-zero coefficients. |
| 3121 void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, |
| 3122 int stride) { |
2823 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 3123 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
2824 const __m128i final_rounding = _mm_set1_epi16(1<<5); | 3124 const __m128i final_rounding = _mm_set1_epi16(1<<5); |
2825 | 3125 |
2826 // idct constants for each stage | 3126 // idct constants for each stage |
2827 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); | 3127 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); |
2828 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); | 3128 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); |
2829 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); | 3129 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); |
2830 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); | 3130 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); |
2831 const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); | 3131 const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); |
2832 const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); | 3132 const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); |
(...skipping 47 matching lines...)
2880 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, | 3180 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, |
2881 stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, | 3181 stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, |
2882 stp1_30, stp1_31; | 3182 stp1_30, stp1_31; |
2883 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, | 3183 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, |
2884 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, | 3184 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, |
2885 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, | 3185 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, |
2886 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, | 3186 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, |
2887 stp2_30, stp2_31; | 3187 stp2_30, stp2_31; |
2888 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; | 3188 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
2889 int i, j, i32; | 3189 int i, j, i32; |
| 3190 |
| 3191 // We work on an 8x32 block each time, and loop 8 times for the 2-D 32x32 idct. |
| 3192 for (i = 0; i < 8; i++) { |
| 3193 i32 = (i << 5); |
| 3194 if (i == 0) { |
| 3195 // First 1-D idct: first 8 rows |
| 3196 // Load input data. |
| 3197 LOAD_DQCOEFF(in0, input); |
| 3198 LOAD_DQCOEFF(in8, input); |
| 3199 LOAD_DQCOEFF(in16, input); |
| 3200 LOAD_DQCOEFF(in24, input); |
| 3201 LOAD_DQCOEFF(in1, input); |
| 3202 LOAD_DQCOEFF(in9, input); |
| 3203 LOAD_DQCOEFF(in17, input); |
| 3204 LOAD_DQCOEFF(in25, input); |
| 3205 LOAD_DQCOEFF(in2, input); |
| 3206 LOAD_DQCOEFF(in10, input); |
| 3207 LOAD_DQCOEFF(in18, input); |
| 3208 LOAD_DQCOEFF(in26, input); |
| 3209 LOAD_DQCOEFF(in3, input); |
| 3210 LOAD_DQCOEFF(in11, input); |
| 3211 LOAD_DQCOEFF(in19, input); |
| 3212 LOAD_DQCOEFF(in27, input); |
| 3213 |
| 3214 LOAD_DQCOEFF(in4, input); |
| 3215 LOAD_DQCOEFF(in12, input); |
| 3216 LOAD_DQCOEFF(in20, input); |
| 3217 LOAD_DQCOEFF(in28, input); |
| 3218 LOAD_DQCOEFF(in5, input); |
| 3219 LOAD_DQCOEFF(in13, input); |
| 3220 LOAD_DQCOEFF(in21, input); |
| 3221 LOAD_DQCOEFF(in29, input); |
| 3222 LOAD_DQCOEFF(in6, input); |
| 3223 LOAD_DQCOEFF(in14, input); |
| 3224 LOAD_DQCOEFF(in22, input); |
| 3225 LOAD_DQCOEFF(in30, input); |
| 3226 LOAD_DQCOEFF(in7, input); |
| 3227 LOAD_DQCOEFF(in15, input); |
| 3228 LOAD_DQCOEFF(in23, input); |
| 3229 LOAD_DQCOEFF(in31, input); |
| 3230 |
| 3231 // Transpose 32x8 block to 8x32 block |
| 3232 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, |
| 3233 in4, in5, in6, in7); |
| 3234 TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, |
| 3235 in10, in11, in12, in13, in14, in15); |
| 3236 TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17, |
| 3237 in18, in19, in20, in21, in22, in23); |
| 3238 TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25, |
| 3239 in26, in27, in28, in29, in30, in31); |
| 3240 } else if (i < 4) { |
| 3241 // First 1-D idct: next 24 zero-coeff rows |
| 3242 col[i32 + 0] = _mm_setzero_si128(); |
| 3243 col[i32 + 1] = _mm_setzero_si128(); |
| 3244 col[i32 + 2] = _mm_setzero_si128(); |
| 3245 col[i32 + 3] = _mm_setzero_si128(); |
| 3246 col[i32 + 4] = _mm_setzero_si128(); |
| 3247 col[i32 + 5] = _mm_setzero_si128(); |
| 3248 col[i32 + 6] = _mm_setzero_si128(); |
| 3249 col[i32 + 7] = _mm_setzero_si128(); |
| 3250 col[i32 + 8] = _mm_setzero_si128(); |
| 3251 col[i32 + 9] = _mm_setzero_si128(); |
| 3252 col[i32 + 10] = _mm_setzero_si128(); |
| 3253 col[i32 + 11] = _mm_setzero_si128(); |
| 3254 col[i32 + 12] = _mm_setzero_si128(); |
| 3255 col[i32 + 13] = _mm_setzero_si128(); |
| 3256 col[i32 + 14] = _mm_setzero_si128(); |
| 3257 col[i32 + 15] = _mm_setzero_si128(); |
| 3258 col[i32 + 16] = _mm_setzero_si128(); |
| 3259 col[i32 + 17] = _mm_setzero_si128(); |
| 3260 col[i32 + 18] = _mm_setzero_si128(); |
| 3261 col[i32 + 19] = _mm_setzero_si128(); |
| 3262 col[i32 + 20] = _mm_setzero_si128(); |
| 3263 col[i32 + 21] = _mm_setzero_si128(); |
| 3264 col[i32 + 22] = _mm_setzero_si128(); |
| 3265 col[i32 + 23] = _mm_setzero_si128(); |
| 3266 col[i32 + 24] = _mm_setzero_si128(); |
| 3267 col[i32 + 25] = _mm_setzero_si128(); |
| 3268 col[i32 + 26] = _mm_setzero_si128(); |
| 3269 col[i32 + 27] = _mm_setzero_si128(); |
| 3270 col[i32 + 28] = _mm_setzero_si128(); |
| 3271 col[i32 + 29] = _mm_setzero_si128(); |
| 3272 col[i32 + 30] = _mm_setzero_si128(); |
| 3273 col[i32 + 31] = _mm_setzero_si128(); |
| 3274 continue; |
| 3275 } else { |
| 3276 // Second 1-D idct |
| 3277 j = i - 4; |
| 3278 |
| 3279 // Transpose 32x8 block to 8x32 block |
| 3280 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], |
| 3281 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], |
| 3282 col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4, |
| 3283 in5, in6, in7); |
| 3284 j += 4; |
| 3285 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], |
| 3286 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], |
| 3287 col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10, |
| 3288 in11, in12, in13, in14, in15); |
| 3289 j += 4; |
| 3290 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], |
| 3291 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], |
| 3292 col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18, |
| 3293 in19, in20, in21, in22, in23); |
| 3294 j += 4; |
| 3295 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], |
| 3296 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], |
| 3297 col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27, |
| 3298 in28, in29, in30, in31); |
| 3299 } |
| 3300 |
| 3301 IDCT32_1D |
| 3302 |
| 3303 // Final stage |
| 3304 if (i < 4) { |
| 3305 // 1-D: Store 32 intermediate results for each 8x32 block. |
| 3306 col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); |
| 3307 col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); |
| 3308 col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); |
| 3309 col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); |
| 3310 col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); |
| 3311 col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); |
| 3312 col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); |
| 3313 col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); |
| 3314 col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); |
| 3315 col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); |
| 3316 col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); |
| 3317 col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); |
| 3318 col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); |
| 3319 col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); |
| 3320 col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); |
| 3321 col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); |
| 3322 col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); |
| 3323 col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); |
| 3324 col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); |
| 3325 col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); |
| 3326 col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); |
| 3327 col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); |
| 3328 col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); |
| 3329 col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); |
| 3330 col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); |
| 3331 col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); |
| 3332 col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); |
| 3333 col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); |
| 3334 col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); |
| 3335 col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); |
| 3336 col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); |
| 3337 col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); |
| 3338 } else { |
| 3339 const __m128i zero = _mm_setzero_si128(); |
| 3340 |
| 3341 // 2-D: Calculate the results and store them to the destination. |
| 3342 in0 = _mm_add_epi16(stp1_0, stp1_31); |
| 3343 in1 = _mm_add_epi16(stp1_1, stp1_30); |
| 3344 in2 = _mm_add_epi16(stp1_2, stp1_29); |
| 3345 in3 = _mm_add_epi16(stp1_3, stp1_28); |
| 3346 in4 = _mm_add_epi16(stp1_4, stp1_27); |
| 3347 in5 = _mm_add_epi16(stp1_5, stp1_26); |
| 3348 in6 = _mm_add_epi16(stp1_6, stp1_25); |
| 3349 in7 = _mm_add_epi16(stp1_7, stp1_24); |
| 3350 in8 = _mm_add_epi16(stp1_8, stp1_23); |
| 3351 in9 = _mm_add_epi16(stp1_9, stp1_22); |
| 3352 in10 = _mm_add_epi16(stp1_10, stp1_21); |
| 3353 in11 = _mm_add_epi16(stp1_11, stp1_20); |
| 3354 in12 = _mm_add_epi16(stp1_12, stp1_19); |
| 3355 in13 = _mm_add_epi16(stp1_13, stp1_18); |
| 3356 in14 = _mm_add_epi16(stp1_14, stp1_17); |
| 3357 in15 = _mm_add_epi16(stp1_15, stp1_16); |
| 3358 in16 = _mm_sub_epi16(stp1_15, stp1_16); |
| 3359 in17 = _mm_sub_epi16(stp1_14, stp1_17); |
| 3360 in18 = _mm_sub_epi16(stp1_13, stp1_18); |
| 3361 in19 = _mm_sub_epi16(stp1_12, stp1_19); |
| 3362 in20 = _mm_sub_epi16(stp1_11, stp1_20); |
| 3363 in21 = _mm_sub_epi16(stp1_10, stp1_21); |
| 3364 in22 = _mm_sub_epi16(stp1_9, stp1_22); |
| 3365 in23 = _mm_sub_epi16(stp1_8, stp1_23); |
| 3366 in24 = _mm_sub_epi16(stp1_7, stp1_24); |
| 3367 in25 = _mm_sub_epi16(stp1_6, stp1_25); |
| 3368 in26 = _mm_sub_epi16(stp1_5, stp1_26); |
| 3369 in27 = _mm_sub_epi16(stp1_4, stp1_27); |
| 3370 in28 = _mm_sub_epi16(stp1_3, stp1_28); |
| 3371 in29 = _mm_sub_epi16(stp1_2, stp1_29); |
| 3372 in30 = _mm_sub_epi16(stp1_1, stp1_30); |
| 3373 in31 = _mm_sub_epi16(stp1_0, stp1_31); |
| 3374 |
| 3375 // Final rounding and shift |
| 3376 in0 = _mm_adds_epi16(in0, final_rounding); |
| 3377 in1 = _mm_adds_epi16(in1, final_rounding); |
| 3378 in2 = _mm_adds_epi16(in2, final_rounding); |
| 3379 in3 = _mm_adds_epi16(in3, final_rounding); |
| 3380 in4 = _mm_adds_epi16(in4, final_rounding); |
| 3381 in5 = _mm_adds_epi16(in5, final_rounding); |
| 3382 in6 = _mm_adds_epi16(in6, final_rounding); |
| 3383 in7 = _mm_adds_epi16(in7, final_rounding); |
| 3384 in8 = _mm_adds_epi16(in8, final_rounding); |
| 3385 in9 = _mm_adds_epi16(in9, final_rounding); |
| 3386 in10 = _mm_adds_epi16(in10, final_rounding); |
| 3387 in11 = _mm_adds_epi16(in11, final_rounding); |
| 3388 in12 = _mm_adds_epi16(in12, final_rounding); |
| 3389 in13 = _mm_adds_epi16(in13, final_rounding); |
| 3390 in14 = _mm_adds_epi16(in14, final_rounding); |
| 3391 in15 = _mm_adds_epi16(in15, final_rounding); |
| 3392 in16 = _mm_adds_epi16(in16, final_rounding); |
| 3393 in17 = _mm_adds_epi16(in17, final_rounding); |
| 3394 in18 = _mm_adds_epi16(in18, final_rounding); |
| 3395 in19 = _mm_adds_epi16(in19, final_rounding); |
| 3396 in20 = _mm_adds_epi16(in20, final_rounding); |
| 3397 in21 = _mm_adds_epi16(in21, final_rounding); |
| 3398 in22 = _mm_adds_epi16(in22, final_rounding); |
| 3399 in23 = _mm_adds_epi16(in23, final_rounding); |
| 3400 in24 = _mm_adds_epi16(in24, final_rounding); |
| 3401 in25 = _mm_adds_epi16(in25, final_rounding); |
| 3402 in26 = _mm_adds_epi16(in26, final_rounding); |
| 3403 in27 = _mm_adds_epi16(in27, final_rounding); |
| 3404 in28 = _mm_adds_epi16(in28, final_rounding); |
| 3405 in29 = _mm_adds_epi16(in29, final_rounding); |
| 3406 in30 = _mm_adds_epi16(in30, final_rounding); |
| 3407 in31 = _mm_adds_epi16(in31, final_rounding); |
| 3408 |
| 3409 in0 = _mm_srai_epi16(in0, 6); |
| 3410 in1 = _mm_srai_epi16(in1, 6); |
| 3411 in2 = _mm_srai_epi16(in2, 6); |
| 3412 in3 = _mm_srai_epi16(in3, 6); |
| 3413 in4 = _mm_srai_epi16(in4, 6); |
| 3414 in5 = _mm_srai_epi16(in5, 6); |
| 3415 in6 = _mm_srai_epi16(in6, 6); |
| 3416 in7 = _mm_srai_epi16(in7, 6); |
| 3417 in8 = _mm_srai_epi16(in8, 6); |
| 3418 in9 = _mm_srai_epi16(in9, 6); |
| 3419 in10 = _mm_srai_epi16(in10, 6); |
| 3420 in11 = _mm_srai_epi16(in11, 6); |
| 3421 in12 = _mm_srai_epi16(in12, 6); |
| 3422 in13 = _mm_srai_epi16(in13, 6); |
| 3423 in14 = _mm_srai_epi16(in14, 6); |
| 3424 in15 = _mm_srai_epi16(in15, 6); |
| 3425 in16 = _mm_srai_epi16(in16, 6); |
| 3426 in17 = _mm_srai_epi16(in17, 6); |
| 3427 in18 = _mm_srai_epi16(in18, 6); |
| 3428 in19 = _mm_srai_epi16(in19, 6); |
| 3429 in20 = _mm_srai_epi16(in20, 6); |
| 3430 in21 = _mm_srai_epi16(in21, 6); |
| 3431 in22 = _mm_srai_epi16(in22, 6); |
| 3432 in23 = _mm_srai_epi16(in23, 6); |
| 3433 in24 = _mm_srai_epi16(in24, 6); |
| 3434 in25 = _mm_srai_epi16(in25, 6); |
| 3435 in26 = _mm_srai_epi16(in26, 6); |
| 3436 in27 = _mm_srai_epi16(in27, 6); |
| 3437 in28 = _mm_srai_epi16(in28, 6); |
| 3438 in29 = _mm_srai_epi16(in29, 6); |
| 3439 in30 = _mm_srai_epi16(in30, 6); |
| 3440 in31 = _mm_srai_epi16(in31, 6); |
| 3441 |
| 3442 RECON_AND_STORE(dest, in0); |
| 3443 RECON_AND_STORE(dest, in1); |
| 3444 RECON_AND_STORE(dest, in2); |
| 3445 RECON_AND_STORE(dest, in3); |
| 3446 RECON_AND_STORE(dest, in4); |
| 3447 RECON_AND_STORE(dest, in5); |
| 3448 RECON_AND_STORE(dest, in6); |
| 3449 RECON_AND_STORE(dest, in7); |
| 3450 RECON_AND_STORE(dest, in8); |
| 3451 RECON_AND_STORE(dest, in9); |
| 3452 RECON_AND_STORE(dest, in10); |
| 3453 RECON_AND_STORE(dest, in11); |
| 3454 RECON_AND_STORE(dest, in12); |
| 3455 RECON_AND_STORE(dest, in13); |
| 3456 RECON_AND_STORE(dest, in14); |
| 3457 RECON_AND_STORE(dest, in15); |
| 3458 RECON_AND_STORE(dest, in16); |
| 3459 RECON_AND_STORE(dest, in17); |
| 3460 RECON_AND_STORE(dest, in18); |
| 3461 RECON_AND_STORE(dest, in19); |
| 3462 RECON_AND_STORE(dest, in20); |
| 3463 RECON_AND_STORE(dest, in21); |
| 3464 RECON_AND_STORE(dest, in22); |
| 3465 RECON_AND_STORE(dest, in23); |
| 3466 RECON_AND_STORE(dest, in24); |
| 3467 RECON_AND_STORE(dest, in25); |
| 3468 RECON_AND_STORE(dest, in26); |
| 3469 RECON_AND_STORE(dest, in27); |
| 3470 RECON_AND_STORE(dest, in28); |
| 3471 RECON_AND_STORE(dest, in29); |
| 3472 RECON_AND_STORE(dest, in30); |
| 3473 RECON_AND_STORE(dest, in31); |
| 3474 |
| 3475 dest += 8 - (stride * 32); |
| 3476 } |
| 3477 } |
| 3478 } |
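
// The _34_ suffix records the assumption eob <= 34: every surviving
// coefficient sits in the top-left 8x8, so only iteration i == 0 loads
// input while iterations 1..3 merely zero-fill their col[] strips, which
// the column pass (i >= 4) still consumes in full. col[] holds the row
// pass's output as four 8-column strips of 32 rows each; an indexing sketch
// (hypothetical helper, not in this file):
static __m128i *col_row_sketch(__m128i *col, int strip, int row) {
  return &col[(strip << 5) + row];  // strip in [0, 4), row in [0, 32)
}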
| 3479 |
| 3480 void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, |
| 3481 int stride) { |
| 3482 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
| 3483 const __m128i final_rounding = _mm_set1_epi16(1<<5); |
| 3484 |
| 3485 // idct constants for each stage |
| 3486 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); |
| 3487 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); |
| 3488 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); |
| 3489 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); |
| 3490 const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); |
| 3491 const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); |
| 3492 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); |
| 3493 const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); |
| 3494 const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); |
| 3495 const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); |
| 3496 const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); |
| 3497 const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); |
| 3498 const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); |
| 3499 const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); |
| 3500 const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); |
| 3501 const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); |
| 3502 |
| 3503 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); |
| 3504 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); |
| 3505 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); |
| 3506 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); |
| 3507 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); |
| 3508 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); |
| 3509 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); |
| 3510 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); |
| 3511 |
| 3512 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); |
| 3513 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); |
| 3514 const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); |
| 3515 const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); |
| 3516 const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); |
| 3517 const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); |
| 3518 const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); |
| 3519 const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); |
| 3520 const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); |
| 3521 const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); |
| 3522 |
| 3523 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); |
| 3524 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
| 3525 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
| 3526 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); |
| 3527 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
| 3528 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); |
| 3529 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); |
| 3530 |
| 3531 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); |
| 3532 |
| 3533 __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, |
| 3534 in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23, |
| 3535 in24, in25, in26, in27, in28, in29, in30, in31; |
| 3536 __m128i col[128]; |
| 3537 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, |
| 3538 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, |
| 3539 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, |
| 3540 stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, |
| 3541 stp1_30, stp1_31; |
| 3542 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, |
| 3543 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, |
| 3544 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, |
| 3545 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, |
| 3546 stp2_30, stp2_31; |
| 3547 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
| 3548 int i, j, i32; |
2890 __m128i zero_idx[16]; | 3549 __m128i zero_idx[16]; |
2891 int zero_flag[2]; | 3550 int zero_flag[2]; |
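// zero_idx[] and zero_flag[] are only declared in the lines shown; judging
// by the names, they back an all-zero test in the elided body that lets the
// full-coefficient path skip the 1-D kernel for empty 8x32 strips (an
// inference from the identifiers, not from visible code).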
2892 | 3551 |
2893 // We work on an 8x32 block each time, and loop 8 times for the 2-D 32x32 idct. | 3552 // We work on an 8x32 block each time, and loop 8 times for the 2-D 32x32 idct. |
2894 for (i = 0; i < 8; i++) { | 3553 for (i = 0; i < 8; i++) { |
2895 i32 = (i << 5); | 3554 i32 = (i << 5); |
2896 if (i < 4) { | 3555 if (i < 4) { |
2897 // First 1-D idct | 3556 // First 1-D idct |
2898 // Load input data. | 3557 // Load input data. |
2899 LOAD_DQCOEFF(in0, input); | 3558 LOAD_DQCOEFF(in0, input); |
(...skipping 135 matching lines...)
3035 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], | 3694 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], |
3036 col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18, | 3695 col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18, |
3037 in19, in20, in21, in22, in23); | 3696 in19, in20, in21, in22, in23); |
3038 j += 4; | 3697 j += 4; |
3039 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], | 3698 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], |
3040 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], | 3699 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], |
3041 col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27, | 3700 col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27, |
3042 in28, in29, in30, in31); | 3701 in28, in29, in30, in31); |
3043 } | 3702 } |
3044 | 3703 |
3045 // Stage1 | 3704 IDCT32_1D |
3046 { | |
3047 const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31); | |
3048 const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31); | |
3049 const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15); | |
3050 const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15); | |
3051 | |
3052 const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23); | |
3053 const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23); | |
3054 const __m128i lo_25_7= _mm_unpacklo_epi16(in25, in7); | |
3055 const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7); | |
3056 | |
3057 const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27); | |
3058 const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27); | |
3059 const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11); | |
3060 const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11); | |
3061 | |
3062 const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19); | |
3063 const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19); | |
3064 const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3); | |
3065 const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3); | |
3066 | |
3067 MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, | |
3068 stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, | |
3069 stp1_17, stp1_30) | |
3070 MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, | |
3071 stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, | |
3072 stp1_19, stp1_28) | |
3073 MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, | |
3074 stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, | |
3075 stp1_21, stp1_26) | |
3076 MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, | |
3077 stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, | |
3078 stp1_23, stp1_24) | |
3079 } | |
3080 | |
3081 // Stage2 | |
3082 { | |
3083 const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30); | |
3084 const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30); | |
3085 const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14); | |
3086 const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14); | |
3087 | |
3088 const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22); | |
3089 const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22); | |
3090 const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6); | |
3091 const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6); | |
3092 | |
3093 MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, | |
3094 stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, | |
3095 stp2_14) | |
3096 MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, | |
3097 stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, | |
3098 stp2_11, stp2_12) | |
3099 | |
3100 stp2_16 = _mm_add_epi16(stp1_16, stp1_17); | |
3101 stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); | |
3102 stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); | |
3103 stp2_19 = _mm_add_epi16(stp1_19, stp1_18); | |
3104 | |
3105 stp2_20 = _mm_add_epi16(stp1_20, stp1_21); | |
3106 stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); | |
3107 stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); | |
3108 stp2_23 = _mm_add_epi16(stp1_23, stp1_22); | |
3109 | |
3110 stp2_24 = _mm_add_epi16(stp1_24, stp1_25); | |
3111 stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); | |
3112 stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); | |
3113 stp2_27 = _mm_add_epi16(stp1_27, stp1_26); | |
3114 | |
3115 stp2_28 = _mm_add_epi16(stp1_28, stp1_29); | |
3116 stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); | |
3117 stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); | |
3118 stp2_31 = _mm_add_epi16(stp1_31, stp1_30); | |
3119 } | |
3120 | |
3121 // Stage3 | |
3122 { | |
3123 const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28); | |
3124 const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28); | |
3125 const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12); | |
3126 const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12); | |
3127 | |
3128 const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); | |
3129 const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); | |
3130 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); | |
3131 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); | |
3132 | |
3133 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); | |
3134 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); | |
3135 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); | |
3136 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); | |
3137 | |
3138 MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, | |
3139 stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, | |
3140 stp1_6) | |
3141 | |
3142 stp1_8 = _mm_add_epi16(stp2_8, stp2_9); | |
3143 stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); | |
3144 stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); | |
3145 stp1_11 = _mm_add_epi16(stp2_11, stp2_10); | |
3146 stp1_12 = _mm_add_epi16(stp2_12, stp2_13); | |
3147 stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); | |
3148 stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); | |
3149 stp1_15 = _mm_add_epi16(stp2_15, stp2_14); | |
3150 | |
3151 MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, | |
3152 stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, | |
3153 stp1_18, stp1_29) | |
3154 MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, | |
3155 stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, | |
3156 stp1_22, stp1_25) | |
3157 | |
3158 stp1_16 = stp2_16; | |
3159 stp1_31 = stp2_31; | |
3160 stp1_19 = stp2_19; | |
3161 stp1_20 = stp2_20; | |
3162 stp1_23 = stp2_23; | |
3163 stp1_24 = stp2_24; | |
3164 stp1_27 = stp2_27; | |
3165 stp1_28 = stp2_28; | |
3166 } | |
3167 | |
3168 // Stage4 | |
3169 { | |
3170 const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16); | |
3171 const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16); | |
3172 const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24); | |
3173 const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24); | |
3174 | |
3175 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); | |
3176 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); | |
3177 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); | |
3178 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); | |
3179 | |
3180 MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, | |
3181 stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, | |
3182 stp2_2, stp2_3) | |
3183 | |
3184 stp2_4 = _mm_add_epi16(stp1_4, stp1_5); | |
3185 stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); | |
3186 stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); | |
3187 stp2_7 = _mm_add_epi16(stp1_7, stp1_6); | |
3188 | |
3189 MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, | |
3190 stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, | |
3191 stp2_10, stp2_13) | |
3192 | |
3193 stp2_8 = stp1_8; | |
3194 stp2_15 = stp1_15; | |
3195 stp2_11 = stp1_11; | |
3196 stp2_12 = stp1_12; | |
3197 | |
3198 stp2_16 = _mm_add_epi16(stp1_16, stp1_19); | |
3199 stp2_17 = _mm_add_epi16(stp1_17, stp1_18); | |
3200 stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); | |
3201 stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); | |
3202 stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); | |
3203 stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); | |
3204 stp2_22 = _mm_add_epi16(stp1_22, stp1_21); | |
3205 stp2_23 = _mm_add_epi16(stp1_23, stp1_20); | |
3206 | |
3207 stp2_24 = _mm_add_epi16(stp1_24, stp1_27); | |
3208 stp2_25 = _mm_add_epi16(stp1_25, stp1_26); | |
3209 stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); | |
3210 stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); | |
3211 stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); | |
3212 stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); | |
3213 stp2_30 = _mm_add_epi16(stp1_29, stp1_30); | |
3214 stp2_31 = _mm_add_epi16(stp1_28, stp1_31); | |
3215 } | |
3216 | |
3217 // Stage5 | |
3218 { | |
3219 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); | |
3220 const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); | |
3221 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); | |
3222 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); | |
3223 | |
3224 const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); | |
3225 const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); | |
3226 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); | |
3227 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); | |
3228 | |
3229 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); | |
3230 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); | |
3231 | |
3232 stp1_0 = _mm_add_epi16(stp2_0, stp2_3); | |
3233 stp1_1 = _mm_add_epi16(stp2_1, stp2_2); | |
3234 stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); | |
3235 stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); | |
3236 | |
3237 tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); | |
3238 tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); | |
3239 tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); | |
3240 tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); | |
3241 | |
3242 tmp0 = _mm_add_epi32(tmp0, rounding); | |
3243 tmp1 = _mm_add_epi32(tmp1, rounding); | |
3244 tmp2 = _mm_add_epi32(tmp2, rounding); | |
3245 tmp3 = _mm_add_epi32(tmp3, rounding); | |
3246 | |
3247 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); | |
3248 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); | |
3249 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); | |
3250 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); | |
3251 | |
3252 stp1_5 = _mm_packs_epi32(tmp0, tmp1); | |
3253 stp1_6 = _mm_packs_epi32(tmp2, tmp3); | |
3254 | |
3255 stp1_4 = stp2_4; | |
3256 stp1_7 = stp2_7; | |
3257 | |
3258 stp1_8 = _mm_add_epi16(stp2_8, stp2_11); | |
3259 stp1_9 = _mm_add_epi16(stp2_9, stp2_10); | |
3260 stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); | |
3261 stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); | |
3262 stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); | |
3263 stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); | |
3264 stp1_14 = _mm_add_epi16(stp2_14, stp2_13); | |
3265 stp1_15 = _mm_add_epi16(stp2_15, stp2_12); | |
3266 | |
3267 stp1_16 = stp2_16; | |
3268 stp1_17 = stp2_17; | |
3269 | |
3270 MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, | |
3271 stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, | |
3272 stp1_19, stp1_28) | |
3273 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, | |
3274 stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, | |
3275 stp1_21, stp1_26) | |
3276 | |
3277 stp1_22 = stp2_22; | |
3278 stp1_23 = stp2_23; | |
3279 stp1_24 = stp2_24; | |
3280 stp1_25 = stp2_25; | |
3281 stp1_30 = stp2_30; | |
3282 stp1_31 = stp2_31; | |
3283 } | |
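Everywhere a cospi pair appears (the explicit tmp0-tmp3 sequence above, and each MULTIPLICATION_AND_ADD invocation) the same fixed-point rotation runs: inputs are interleaved as (a, b) pairs so that _mm_madd_epi16 against a packed (c0, c1) constant yields a*c0 + b*c1 per 32-bit lane, which is then rounded and shifted back down to 16 bits. A scalar model of one output lane, assuming the stg4_x/stg6_x constants are (c0, c1) pairs of cospi values set up earlier in the file:

    /* one lane of the rotation; the SIMD path saturates the final pack
     * with _mm_packs_epi32, which this sketch omits */
    static INLINE int16_t rotate_lane(int16_t a, int16_t b,
                                      int16_t c0, int16_t c1) {
      const int32_t t = (int32_t)a * c0 + (int32_t)b * c1;  /* madd */
      return (int16_t)((t + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
    }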
3284 | |
3285 // Stage6 | |
3286 { | |
3287 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); | |
3288 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); | |
3289 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); | |
3290 const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); | |
3291 | |
3292 stp2_0 = _mm_add_epi16(stp1_0, stp1_7); | |
3293 stp2_1 = _mm_add_epi16(stp1_1, stp1_6); | |
3294 stp2_2 = _mm_add_epi16(stp1_2, stp1_5); | |
3295 stp2_3 = _mm_add_epi16(stp1_3, stp1_4); | |
3296 stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); | |
3297 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); | |
3298 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); | |
3299 stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); | |
3300 | |
3301 stp2_8 = stp1_8; | |
3302 stp2_9 = stp1_9; | |
3303 stp2_14 = stp1_14; | |
3304 stp2_15 = stp1_15; | |
3305 | |
3306 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, | |
3307 stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, | |
3308 stp2_13, stp2_11, stp2_12) | |
3309 | |
3310 stp2_16 = _mm_add_epi16(stp1_16, stp1_23); | |
3311 stp2_17 = _mm_add_epi16(stp1_17, stp1_22); | |
3312 stp2_18 = _mm_add_epi16(stp1_18, stp1_21); | |
3313 stp2_19 = _mm_add_epi16(stp1_19, stp1_20); | |
3314 stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); | |
3315 stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); | |
3316 stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); | |
3317 stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); | |
3318 | |
3319 stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); | |
3320 stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); | |
3321 stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); | |
3322 stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); | |
3323 stp2_28 = _mm_add_epi16(stp1_27, stp1_28); | |
3324 stp2_29 = _mm_add_epi16(stp1_26, stp1_29); | |
3325 stp2_30 = _mm_add_epi16(stp1_25, stp1_30); | |
3326 stp2_31 = _mm_add_epi16(stp1_24, stp1_31); | |
3327 } | |
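Stage 6 completes the even half (stp2_0-stp2_7 are the finished 8-point idct butterflies), rotates the middle of the 16-point half (10/13 and 11/12 by the cospi_16_64 pair via MULTIPLICATION_AND_ADD), and folds the 32-point odd half across its midpoint. The odd-half index pattern, as a scalar sketch over hypothetical stp1[]/stp2[] arrays standing in for the named variables:

    for (k = 16; k < 20; ++k) {           /* 16..19 pair with 23..20 */
      stp2[k]      = stp1[k] + stp1[39 - k];
      stp2[39 - k] = stp1[k] - stp1[39 - k];
    }
    for (k = 24; k < 28; ++k) {           /* 24..27 pair with 31..28 */
      stp2[k]      = stp1[55 - k] - stp1[k];
      stp2[55 - k] = stp1[55 - k] + stp1[k];
    }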
3328 | |
3329 // Stage7 | |
3330 { | |
3331 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); | |
3332 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); | |
3333 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); | |
3334 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); | |
3335 | |
3336 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); | |
3337 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); | |
3338 const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); | |
3339 const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); | |
3340 | |
3341 stp1_0 = _mm_add_epi16(stp2_0, stp2_15); | |
3342 stp1_1 = _mm_add_epi16(stp2_1, stp2_14); | |
3343 stp1_2 = _mm_add_epi16(stp2_2, stp2_13); | |
3344 stp1_3 = _mm_add_epi16(stp2_3, stp2_12); | |
3345 stp1_4 = _mm_add_epi16(stp2_4, stp2_11); | |
3346 stp1_5 = _mm_add_epi16(stp2_5, stp2_10); | |
3347 stp1_6 = _mm_add_epi16(stp2_6, stp2_9); | |
3348 stp1_7 = _mm_add_epi16(stp2_7, stp2_8); | |
3349 stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); | |
3350 stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); | |
3351 stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); | |
3352 stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); | |
3353 stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); | |
3354 stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); | |
3355 stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); | |
3356 stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); | |
3357 | |
3358 stp1_16 = stp2_16; | |
3359 stp1_17 = stp2_17; | |
3360 stp1_18 = stp2_18; | |
3361 stp1_19 = stp2_19; | |
3362 | |
3363 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, | |
3364 stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, | |
3365 stp1_21, stp1_26) | |
3366 MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, | |
3367 stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, | |
3368 stp1_23, stp1_24) | |
3369 | |
3370 stp1_28 = stp2_28; | |
3371 stp1_29 = stp2_29; | |
3372 stp1_30 = stp2_30; | |
3373 stp1_31 = stp2_31; | |
3374 } | |
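Stage 7 mirrors this one level up: the even sixteen outputs are finished, 20-27 get the cospi_16_64 rotation, and the final stage below needs only one more butterfly across the full 32 lanes. In scalar form (hypothetical arrays; for the first four 8-column passes the results land in col[] as 1-D intermediates):

    for (k = 0; k < 16; ++k) {
      out[k]      = stp1[k] + stp1[31 - k];
      out[31 - k] = stp1[k] - stp1[31 - k];
    }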
3375 | 3705 |
3376 // final stage | 3706 // final stage |
3377 if (i < 4) { | 3707 if (i < 4) { |
3378 // 1-D: Store the 32 intermediate results for this 8x32 block. | 3708 // 1-D: Store the 32 intermediate results for this 8x32 block. |
3379 col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); | 3709 col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); |
3380 col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); | 3710 col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); |
3381 col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); | 3711 col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); |
3382 col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); | 3712 col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); |
3383 col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); | 3713 col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); |
3384 col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); | 3714 col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); |
(...skipping 156 matching lines...)
3541 RECON_AND_STORE(dest, in26); | 3871 RECON_AND_STORE(dest, in26); |
3542 RECON_AND_STORE(dest, in27); | 3872 RECON_AND_STORE(dest, in27); |
3543 RECON_AND_STORE(dest, in28); | 3873 RECON_AND_STORE(dest, in28); |
3544 RECON_AND_STORE(dest, in29); | 3874 RECON_AND_STORE(dest, in29); |
3545 RECON_AND_STORE(dest, in30); | 3875 RECON_AND_STORE(dest, in30); |
3546 RECON_AND_STORE(dest, in31); | 3876 RECON_AND_STORE(dest, in31); |
3547 | 3877 |
3548 dest += 8 - (stride * 32); | 3878 dest += 8 - (stride * 32); |
3549 } | 3879 } |
3550 } | 3880 } |
| 3881 } //NOLINT |
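RECON_AND_STORE is defined earlier in the file; from its call sites (zero and stride in scope, dest advanced once per call) it evidently loads eight destination pixels, widens them to 16 bits, adds the residue row, saturates back to bytes, stores, and steps dest down one row. A sketch consistent with that behavior, not the macro verbatim:

    #define RECON_AND_STORE_SKETCH(dest, in_x)                        \
      {                                                               \
        __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); /* 8 px */   \
        d0 = _mm_unpacklo_epi8(d0, zero);      /* u8 -> s16 */        \
        d0 = _mm_add_epi16(in_x, d0);          /* add residue */      \
        d0 = _mm_packus_epi16(d0, d0);         /* saturate to u8 */   \
        _mm_storel_epi64((__m128i *)(dest), d0);                      \
        dest += stride;                                               \
      }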
| 3882 |
| 3883 void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
| 3884 __m128i dc_value; |
| 3885 const __m128i zero = _mm_setzero_si128(); |
| 3886 int a, i; |
| 3887 |
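  /* DC-only block: each 1-D 32-point idct pass reduces to a multiply by
   * cospi_16_64 (11585 in Q14, i.e. ~0.7071 * 2^14) followed by
   * dct_const_round_shift; ROUND_POWER_OF_TWO(a, 6) is the same final
   * scaling the full 32x32 path applies before reconstruction. */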
| 3888 a = dct_const_round_shift(input[0] * cospi_16_64); |
| 3889 a = dct_const_round_shift(a * cospi_16_64); |
| 3890 a = ROUND_POWER_OF_TWO(a, 6); |
| 3891 |
| 3892 dc_value = _mm_set1_epi16(a); |
| 3893 |
| 3894 for (i = 0; i < 4; ++i) { |
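    /* Each RECON_AND_STORE adds dc_value to one 8-pixel row and advances
     * dest by stride; the 32 unrolled calls cover one 8-wide column of the
     * 32x32 block, and the adjustment below steps to the next column. */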
| 3895 RECON_AND_STORE(dest, dc_value); |
| 3896 RECON_AND_STORE(dest, dc_value); |
| 3897 RECON_AND_STORE(dest, dc_value); |
| 3898 RECON_AND_STORE(dest, dc_value); |
| 3899 RECON_AND_STORE(dest, dc_value); |
| 3900 RECON_AND_STORE(dest, dc_value); |
| 3901 RECON_AND_STORE(dest, dc_value); |
| 3902 RECON_AND_STORE(dest, dc_value); |
| 3903 RECON_AND_STORE(dest, dc_value); |
| 3904 RECON_AND_STORE(dest, dc_value); |
| 3905 RECON_AND_STORE(dest, dc_value); |
| 3906 RECON_AND_STORE(dest, dc_value); |
| 3907 RECON_AND_STORE(dest, dc_value); |
| 3908 RECON_AND_STORE(dest, dc_value); |
| 3909 RECON_AND_STORE(dest, dc_value); |
| 3910 RECON_AND_STORE(dest, dc_value); |
| 3911 RECON_AND_STORE(dest, dc_value); |
| 3912 RECON_AND_STORE(dest, dc_value); |
| 3913 RECON_AND_STORE(dest, dc_value); |
| 3914 RECON_AND_STORE(dest, dc_value); |
| 3915 RECON_AND_STORE(dest, dc_value); |
| 3916 RECON_AND_STORE(dest, dc_value); |
| 3917 RECON_AND_STORE(dest, dc_value); |
| 3918 RECON_AND_STORE(dest, dc_value); |
| 3919 RECON_AND_STORE(dest, dc_value); |
| 3920 RECON_AND_STORE(dest, dc_value); |
| 3921 RECON_AND_STORE(dest, dc_value); |
| 3922 RECON_AND_STORE(dest, dc_value); |
| 3923 RECON_AND_STORE(dest, dc_value); |
| 3924 RECON_AND_STORE(dest, dc_value); |
| 3925 RECON_AND_STORE(dest, dc_value); |
| 3926 RECON_AND_STORE(dest, dc_value); |
| 3927 dest += 8 - (stride * 32); |
| 3928 } |
3551 } | 3929 } |
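For reference, the whole DC path is equivalent to this scalar sketch (illustrative only, not part of the patch; clip_pixel comes from vp9_common.h):

    static void idct32x32_dc_add_sketch(const int16_t *input, uint8_t *dest,
                                        int stride) {
      int r, c;
      int a = dct_const_round_shift(input[0] * cospi_16_64);  /* pass 1 */
      a = dct_const_round_shift(a * cospi_16_64);             /* pass 2 */
      a = ROUND_POWER_OF_TWO(a, 6);                           /* output scaling */
      for (r = 0; r < 32; ++r)
        for (c = 0; c < 32; ++c)
          dest[r * stride + c] = clip_pixel(dest[r * stride + c] + a);
    }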