OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 40 matching lines...)
51 // For the purposes of the comments, the 16 inputs are referred to as i0 | 51 // For the purposes of the comments, the 16 inputs are referred to as i0 |
52 // through iF (in raster order), intermediate variables are a0, b0, c0 | 52 // through iF (in raster order), intermediate variables are a0, b0, c0 |
53 // through f, and correspond to the in-place computations mapped to input | 53 // through f, and correspond to the in-place computations mapped to input |
54 // locations. The outputs, o0 through oF are labeled according to the | 54 // locations. The outputs, o0 through oF are labeled according to the |
55 // output locations. | 55 // output locations. |
56 | 56 |
57 // Constants | 57 // Constants |
58 // These are the coefficients used for the multiplies. | 58 // These are the coefficients used for the multiplies. |
59 // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64), | 59 // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64), |
60 // where cospi_N_64 = cos(N pi /64) | 60 // where cospi_N_64 = cos(N pi /64) |
61 const __m128i k__cospi_A = _mm_setr_epi16(cospi_16_64, cospi_16_64, | 61 const __m128i k__cospi_A = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64, |
62 cospi_16_64, cospi_16_64, | 62 (int16_t)cospi_16_64, (int16_t)cospi_16_64, |
63 cospi_16_64, -cospi_16_64, | 63 (int16_t)cospi_16_64, (int16_t)-cospi_16_64, |
64 cospi_16_64, -cospi_16_64); | 64 (int16_t)cospi_16_64, (int16_t)-cospi_16_64); |
65 const __m128i k__cospi_B = _mm_setr_epi16(cospi_16_64, -cospi_16_64, | 65 const __m128i k__cospi_B = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)-cospi_16_64, |
66 cospi_16_64, -cospi_16_64, | 66 (int16_t)cospi_16_64, (int16_t)-cospi_16_64, |
67 cospi_16_64, cospi_16_64, | 67 (int16_t)cospi_16_64, (int16_t)cospi_16_64, |
68 cospi_16_64, cospi_16_64); | 68 (int16_t)cospi_16_64, (int16_t)cospi_16_64); |
69 const __m128i k__cospi_C = _mm_setr_epi16(cospi_8_64, cospi_24_64, | 69 const __m128i k__cospi_C = _mm_setr_epi16((int16_t)cospi_8_64, (int16_t)cospi_24_64, |
70 cospi_8_64, cospi_24_64, | 70 (int16_t)cospi_8_64, (int16_t)cospi_24_64, |
71 cospi_24_64, -cospi_8_64, | 71 (int16_t)cospi_24_64, (int16_t)-cospi_8_64, |
72 cospi_24_64, -cospi_8_64); | 72 (int16_t)cospi_24_64, (int16_t)-cospi_8_64); |
73 const __m128i k__cospi_D = _mm_setr_epi16(cospi_24_64, -cospi_8_64, | 73 const __m128i k__cospi_D = _mm_setr_epi16((int16_t)cospi_24_64, (int16_t)-cospi_8_64, |
74 cospi_24_64, -cospi_8_64, | 74 (int16_t)cospi_24_64, (int16_t)-cospi_8_64, |
75 cospi_8_64, cospi_24_64, | 75 (int16_t)cospi_8_64, (int16_t)cospi_24_64, |
76 cospi_8_64, cospi_24_64); | 76 (int16_t)cospi_8_64, (int16_t)cospi_24_64); |
77 const __m128i k__cospi_E = _mm_setr_epi16(cospi_16_64, cospi_16_64, | 77 const __m128i k__cospi_E = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64, |
78 cospi_16_64, cospi_16_64, | 78 (int16_t)cospi_16_64, (int16_t)cospi_16_64, |
79 cospi_16_64, cospi_16_64, | 79 (int16_t)cospi_16_64, (int16_t)cospi_16_64, |
80 cospi_16_64, cospi_16_64); | 80 (int16_t)cospi_16_64, (int16_t)cospi_16_64); |
81 const __m128i k__cospi_F = _mm_setr_epi16(cospi_16_64, -cospi_16_64, | 81 const __m128i k__cospi_F = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)-cospi_16_64, |
82 cospi_16_64, -cospi_16_64, | 82 (int16_t)cospi_16_64, (int16_t)-cospi_16_64, |
83 cospi_16_64, -cospi_16_64, | 83 (int16_t)cospi_16_64, (int16_t)-cospi_16_64, |
84 cospi_16_64, -cospi_16_64); | 84 (int16_t)cospi_16_64, (int16_t)-cospi_16_64); |
85 const __m128i k__cospi_G = _mm_setr_epi16(cospi_8_64, cospi_24_64, | 85 const __m128i k__cospi_G = _mm_setr_epi16((int16_t)cospi_8_64, (int16_t)cospi_24_64, |
86 cospi_8_64, cospi_24_64, | 86 (int16_t)cospi_8_64, (int16_t)cospi_24_64, |
87 -cospi_8_64, -cospi_24_64, | 87 (int16_t)-cospi_8_64, (int16_t)-cospi_24_64, |
88 -cospi_8_64, -cospi_24_64); | 88 (int16_t)-cospi_8_64, (int16_t)-cospi_24_64); |
89 const __m128i k__cospi_H = _mm_setr_epi16(cospi_24_64, -cospi_8_64, | 89 const __m128i k__cospi_H = _mm_setr_epi16((int16_t)cospi_24_64, (int16_t)-cospi_8_64, |
90 cospi_24_64, -cospi_8_64, | 90 (int16_t)cospi_24_64, (int16_t)-cospi_8_64, |
91 -cospi_24_64, cospi_8_64, | 91 (int16_t)-cospi_24_64, (int16_t)cospi_8_64, |
92 -cospi_24_64, cospi_8_64); | 92 (int16_t)-cospi_24_64, (int16_t)cospi_8_64); |
93 | 93 |
94 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); | 94 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); |
95 // This second rounding constant saves doing some extra adds at the end | 95 // This second rounding constant saves doing some extra adds at the end |
96 const __m128i k__DCT_CONST_ROUNDING2 = _mm_set1_epi32(DCT_CONST_ROUNDING | 96 const __m128i k__DCT_CONST_ROUNDING2 = _mm_set1_epi32(DCT_CONST_ROUNDING |
97 +(DCT_CONST_ROUNDING << 1)); | 97 +(DCT_CONST_ROUNDING << 1)); |
98 const int DCT_CONST_BITS2 = DCT_CONST_BITS+2; | 98 const int DCT_CONST_BITS2 = DCT_CONST_BITS+2; |
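The combined constant is just two rounded shifts collapsed into one. A sketch of the identity, assuming DCT_CONST_BITS == 14 (so DCT_CONST_ROUNDING == 1 << 13) and a final (x + 1) >> 2 output rounding as in the scalar 4x4 fDCT:

    /* With arithmetic (flooring) shifts:                              */
    /*   (((x + (1 << 13)) >> 14) + 1) >> 2                            */
    /*     == (x + (1 << 13) + (1 << 14)) >> 16                        */
    /*     == (x + DCT_CONST_ROUNDING + (DCT_CONST_ROUNDING << 1))     */
    /*          >> DCT_CONST_BITS2                                     */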
99 const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); | 99 const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); |
100 const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); | 100 const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); |
101 __m128i in0, in1; | 101 __m128i in0, in1; |
102 | 102 |
(...skipping 186 matching lines...)
289 res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1); | 289 res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1); |
290 | 290 |
291 // 00 10 20 30 01 11 21 31 | 291 // 00 10 20 30 01 11 21 31 |
292 // 02 12 22 32 03 13 23 33 | 292 // 02 12 22 32 03 13 23 33 |
293 // only use the first 4 16-bit integers | 293 // only use the first 4 16-bit integers |
294 res[1] = _mm_unpackhi_epi64(res[0], res[0]); | 294 res[1] = _mm_unpackhi_epi64(res[0], res[0]); |
295 res[3] = _mm_unpackhi_epi64(res[2], res[2]); | 295 res[3] = _mm_unpackhi_epi64(res[2], res[2]); |
296 } | 296 } |
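For reference, the unpack/interleave sequence in transpose_4x4 produces the same permutation as this scalar transpose (an illustrative sketch only, assuming <stdint.h> for int16_t):

    static void transpose_4x4_scalar(const int16_t in[4][4],
                                     int16_t out[4][4]) {
      int r, c;
      for (r = 0; r < 4; ++r)
        for (c = 0; c < 4; ++c)
          out[c][r] = in[r][c];  /* row r of the input becomes column r */
    }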
297 | 297 |
298 void fdct4_sse2(__m128i *in) { | 298 void fdct4_sse2(__m128i *in) { |
299 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); | 299 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); |
300 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 300 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
301 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); | 301 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); |
302 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); | 302 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
303 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); | 303 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); |
304 | 304 |
305 __m128i u[4], v[4]; | 305 __m128i u[4], v[4]; |
306 u[0] = _mm_unpacklo_epi16(in[0], in[1]); | 306 u[0] = _mm_unpacklo_epi16(in[0], in[1]); |
307 u[1] = _mm_unpacklo_epi16(in[3], in[2]); | 307 u[1] = _mm_unpacklo_epi16(in[3], in[2]); |
308 | 308 |
309 v[0] = _mm_add_epi16(u[0], u[1]); | 309 v[0] = _mm_add_epi16(u[0], u[1]); |
(...skipping 16 matching lines...)
326 in[0] = _mm_packs_epi32(u[0], u[1]); | 326 in[0] = _mm_packs_epi32(u[0], u[1]); |
327 in[1] = _mm_packs_epi32(u[2], u[3]); | 327 in[1] = _mm_packs_epi32(u[2], u[3]); |
328 transpose_4x4(in); | 328 transpose_4x4(in); |
329 } | 329 } |
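The pair_set_epi16 constants above feed _mm_madd_epi16 butterflies on interleaved inputs (mostly elided in this view). Per output lane, each butterfly is the scalar operation below; a minimal sketch with a hypothetical helper name, reusing the source's DCT_CONST_* macros:

    static int16_t butterfly_round_shift(int16_t a, int16_t b,
                                         int w0, int w1) {
      const int t = a * w0 + b * w1;  /* the two 16x16 products madd sums */
      /* dct_const_round_shift; the intrinsics then use a saturating pack
         (_mm_packs_epi32) rather than this plain cast */
      return (int16_t)((t + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
    }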
330 | 330 |
331 void fadst4_sse2(__m128i *in) { | 331 void fadst4_sse2(__m128i *in) { |
332 const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9); | 332 const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9); |
333 const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9); | 333 const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9); |
334 const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9); | 334 const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9); |
335 const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9); | 335 const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9); |
336 const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9); | 336 const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9); |
337 const __m128i kZero = _mm_set1_epi16(0); | 337 const __m128i kZero = _mm_set1_epi16(0); |
338 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); | 338 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); |
339 __m128i u[8], v[8]; | 339 __m128i u[8], v[8]; |
340 __m128i in7 = _mm_add_epi16(in[0], in[1]); | 340 __m128i in7 = _mm_add_epi16(in[0], in[1]); |
341 | 341 |
342 u[0] = _mm_unpacklo_epi16(in[0], in[1]); | 342 u[0] = _mm_unpacklo_epi16(in[0], in[1]); |
343 u[1] = _mm_unpacklo_epi16(in[2], in[3]); | 343 u[1] = _mm_unpacklo_epi16(in[2], in[3]); |
344 u[2] = _mm_unpacklo_epi16(in7, kZero); | 344 u[2] = _mm_unpacklo_epi16(in7, kZero); |
345 u[3] = _mm_unpacklo_epi16(in[2], kZero); | 345 u[3] = _mm_unpacklo_epi16(in[2], kZero); |
346 u[4] = _mm_unpacklo_epi16(in[3], kZero); | 346 u[4] = _mm_unpacklo_epi16(in[3], kZero); |
(...skipping 100 matching lines...)
447 in1 = _mm_add_epi32(sum, in0); | 447 in1 = _mm_add_epi32(sum, in0); |
448 _mm_store_si128((__m128i *)(output), in1); | 448 _mm_store_si128((__m128i *)(output), in1); |
449 } | 449 } |
450 | 450 |
451 void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) { | 451 void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) { |
452 int pass; | 452 int pass; |
453 // Constants | 453 // Constants |
454 // When we use them, in one case, they are all the same. In all others | 454 // When we use them, in one case, they are all the same. In all others |
455 // it's a pair of them that we need to repeat four times. This is done | 455 // it's a pair of them that we need to repeat four times. This is done |
456 // by constructing the 32 bit constant corresponding to that pair. | 456 // by constructing the 32 bit constant corresponding to that pair. |
457 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); | 457 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); |
458 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 458 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
459 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); | 459 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); |
460 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); | 460 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
461 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); | 461 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); |
462 const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); | 462 const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); |
463 const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); | 463 const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); |
464 const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); | 464 const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); |
465 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); | 465 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); |
466 // Load input | 466 // Load input |
467 __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); | 467 __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); |
(...skipping 229 matching lines...)
697 _mm_store_si128((__m128i *)(output + 1 * 8), in1); | 697 _mm_store_si128((__m128i *)(output + 1 * 8), in1); |
698 _mm_store_si128((__m128i *)(output + 2 * 8), in2); | 698 _mm_store_si128((__m128i *)(output + 2 * 8), in2); |
699 _mm_store_si128((__m128i *)(output + 3 * 8), in3); | 699 _mm_store_si128((__m128i *)(output + 3 * 8), in3); |
700 _mm_store_si128((__m128i *)(output + 4 * 8), in4); | 700 _mm_store_si128((__m128i *)(output + 4 * 8), in4); |
701 _mm_store_si128((__m128i *)(output + 5 * 8), in5); | 701 _mm_store_si128((__m128i *)(output + 5 * 8), in5); |
702 _mm_store_si128((__m128i *)(output + 6 * 8), in6); | 702 _mm_store_si128((__m128i *)(output + 6 * 8), in6); |
703 _mm_store_si128((__m128i *)(output + 7 * 8), in7); | 703 _mm_store_si128((__m128i *)(output + 7 * 8), in7); |
704 } | 704 } |
705 } | 705 } |
706 | 706 |
| 707 void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, |
| 708 int16_t* coeff_ptr, intptr_t n_coeffs, |
| 709 int skip_block, const int16_t* zbin_ptr, |
| 710 const int16_t* round_ptr, const int16_t* quant_ptr, |
| 711 const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr, |
| 712 int16_t* dqcoeff_ptr, const int16_t* dequant_ptr, |
| 713 int zbin_oq_value, uint16_t* eob_ptr, |
| 714 const int16_t* scan_ptr, |
| 715 const int16_t* iscan_ptr) { |
| 716 __m128i zero; |
| 717 int pass; |
| 718 // Constants |
| 719 // When we use them, in one case, they are all the same. In all others |
| 720 // it's a pair of them that we need to repeat four times. This is done |
| 721 // by constructing the 32 bit constant corresponding to that pair. |
| 722 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); |
| 723 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
| 724 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); |
| 725 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
| 726 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); |
| 727 const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); |
| 728 const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); |
| 729 const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); |
| 730 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); |
| 731 // Load input |
| 732 __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); |
| 733 __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); |
| 734 __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); |
| 735 __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); |
| 736 __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride)); |
| 737 __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride)); |
| 738 __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride)); |
| 739 __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride)); |
| 740 __m128i *in[8]; |
| 741 int index = 0; |
| 742 |
| 743 (void)scan_ptr; |
| 744 (void)zbin_ptr; |
| 745 (void)quant_shift_ptr; |
| 746 (void)zbin_oq_value; |
| 747 (void)coeff_ptr; |
| 748 |
| 749 // Pre-condition input (shift by two) |
| 750 in0 = _mm_slli_epi16(in0, 2); |
| 751 in1 = _mm_slli_epi16(in1, 2); |
| 752 in2 = _mm_slli_epi16(in2, 2); |
| 753 in3 = _mm_slli_epi16(in3, 2); |
| 754 in4 = _mm_slli_epi16(in4, 2); |
| 755 in5 = _mm_slli_epi16(in5, 2); |
| 756 in6 = _mm_slli_epi16(in6, 2); |
| 757 in7 = _mm_slli_epi16(in7, 2); |
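The << 2 pre-scale carries two extra fractional bits through both transform passes; the post-condition block near the end of the function removes one of them again with a rounded divide by two, mirroring the input x4 / output /2 fixed-point scaling of the scalar 8x8 fDCT.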
| 758 |
| 759 in[0] = &in0; |
| 760 in[1] = &in1; |
| 761 in[2] = &in2; |
| 762 in[3] = &in3; |
| 763 in[4] = &in4; |
| 764 in[5] = &in5; |
| 765 in[6] = &in6; |
| 766 in[7] = &in7; |
| 767 |
| 768 // We do two passes, first the columns, then the rows. The results of the |
| 769 // first pass are transposed so that the same column code can be reused. The |
| 770 // results of the second pass are also transposed so that the rows (processed |
| 771 // as columns) are put back in row positions. |
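Schematically, the loop below is the standard separable 2-D transform pattern; a scalar sketch of the same idea (transform1d here is a hypothetical 8-point 1-D transform, not the intrinsics code):

    static void two_pass_8x8(int16_t buf[8][8],
                             void (*transform1d)(int16_t v[8])) {
      int pass, i, j;
      for (pass = 0; pass < 2; ++pass) {
        int16_t tmp[8][8];
        for (i = 0; i < 8; ++i) {
          int16_t v[8];
          for (j = 0; j < 8; ++j) v[j] = buf[j][i];  /* gather column i */
          transform1d(v);
          for (j = 0; j < 8; ++j) tmp[i][j] = v[j];  /* store transposed */
        }
        for (i = 0; i < 8; ++i)
          for (j = 0; j < 8; ++j) buf[i][j] = tmp[i][j];
      }
    }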
| 772 for (pass = 0; pass < 2; pass++) { |
| 773 // To store results of each pass before the transpose. |
| 774 __m128i res0, res1, res2, res3, res4, res5, res6, res7; |
| 775 // Add/subtract |
| 776 const __m128i q0 = _mm_add_epi16(in0, in7); |
| 777 const __m128i q1 = _mm_add_epi16(in1, in6); |
| 778 const __m128i q2 = _mm_add_epi16(in2, in5); |
| 779 const __m128i q3 = _mm_add_epi16(in3, in4); |
| 780 const __m128i q4 = _mm_sub_epi16(in3, in4); |
| 781 const __m128i q5 = _mm_sub_epi16(in2, in5); |
| 782 const __m128i q6 = _mm_sub_epi16(in1, in6); |
| 783 const __m128i q7 = _mm_sub_epi16(in0, in7); |
| 784 // Work on first four results |
| 785 { |
| 786 // Add/subtract |
| 787 const __m128i r0 = _mm_add_epi16(q0, q3); |
| 788 const __m128i r1 = _mm_add_epi16(q1, q2); |
| 789 const __m128i r2 = _mm_sub_epi16(q1, q2); |
| 790 const __m128i r3 = _mm_sub_epi16(q0, q3); |
| 792 // Interleave to do the multiply by constants which gets us into 32 bits |
| 792 const __m128i t0 = _mm_unpacklo_epi16(r0, r1); |
| 793 const __m128i t1 = _mm_unpackhi_epi16(r0, r1); |
| 794 const __m128i t2 = _mm_unpacklo_epi16(r2, r3); |
| 795 const __m128i t3 = _mm_unpackhi_epi16(r2, r3); |
| 796 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); |
| 797 const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); |
| 798 const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); |
| 799 const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); |
| 800 const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); |
| 801 const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); |
| 802 const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); |
| 803 const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); |
| 804 // dct_const_round_shift |
| 805 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); |
| 806 const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); |
| 807 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); |
| 808 const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); |
| 809 const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); |
| 810 const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); |
| 811 const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); |
| 812 const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); |
| 813 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); |
| 814 const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); |
| 815 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); |
| 816 const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); |
| 817 const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); |
| 818 const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); |
| 819 const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); |
| 820 const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); |
| 821 // Combine |
| 822 res0 = _mm_packs_epi32(w0, w1); |
| 823 res4 = _mm_packs_epi32(w2, w3); |
| 824 res2 = _mm_packs_epi32(w4, w5); |
| 825 res6 = _mm_packs_epi32(w6, w7); |
| 826 } |
| 827 // Work on next four results |
| 828 { |
| 829 // Interleave to do the multiply by constants which gets us into 32 bits |
| 830 const __m128i d0 = _mm_unpacklo_epi16(q6, q5); |
| 831 const __m128i d1 = _mm_unpackhi_epi16(q6, q5); |
| 832 const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); |
| 833 const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); |
| 834 const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); |
| 835 const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); |
| 836 // dct_const_round_shift |
| 837 const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); |
| 838 const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); |
| 839 const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); |
| 840 const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); |
| 841 const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); |
| 842 const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); |
| 843 const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); |
| 844 const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); |
| 845 // Combine |
| 846 const __m128i r0 = _mm_packs_epi32(s0, s1); |
| 847 const __m128i r1 = _mm_packs_epi32(s2, s3); |
| 848 // Add/subtract |
| 849 const __m128i x0 = _mm_add_epi16(q4, r0); |
| 850 const __m128i x1 = _mm_sub_epi16(q4, r0); |
| 851 const __m128i x2 = _mm_sub_epi16(q7, r1); |
| 852 const __m128i x3 = _mm_add_epi16(q7, r1); |
| 853 // Interleave to do the multiply by constants which gets us into 32 bits |
| 854 const __m128i t0 = _mm_unpacklo_epi16(x0, x3); |
| 855 const __m128i t1 = _mm_unpackhi_epi16(x0, x3); |
| 856 const __m128i t2 = _mm_unpacklo_epi16(x1, x2); |
| 857 const __m128i t3 = _mm_unpackhi_epi16(x1, x2); |
| 858 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); |
| 859 const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); |
| 860 const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); |
| 861 const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); |
| 862 const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); |
| 863 const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); |
| 864 const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); |
| 865 const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); |
| 866 // dct_const_round_shift |
| 867 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); |
| 868 const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); |
| 869 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); |
| 870 const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); |
| 871 const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); |
| 872 const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); |
| 873 const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); |
| 874 const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); |
| 875 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); |
| 876 const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); |
| 877 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); |
| 878 const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); |
| 879 const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); |
| 880 const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); |
| 881 const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); |
| 882 const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); |
| 883 // Combine |
| 884 res1 = _mm_packs_epi32(w0, w1); |
| 885 res7 = _mm_packs_epi32(w2, w3); |
| 886 res5 = _mm_packs_epi32(w4, w5); |
| 887 res3 = _mm_packs_epi32(w6, w7); |
| 888 } |
| 889 // Transpose the 8x8. |
| 890 { |
| 891 // 00 01 02 03 04 05 06 07 |
| 892 // 10 11 12 13 14 15 16 17 |
| 893 // 20 21 22 23 24 25 26 27 |
| 894 // 30 31 32 33 34 35 36 37 |
| 895 // 40 41 42 43 44 45 46 47 |
| 896 // 50 51 52 53 54 55 56 57 |
| 897 // 60 61 62 63 64 65 66 67 |
| 898 // 70 71 72 73 74 75 76 77 |
| 899 const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); |
| 900 const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3); |
| 901 const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1); |
| 902 const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3); |
| 903 const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5); |
| 904 const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7); |
| 905 const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5); |
| 906 const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7); |
| 907 // 00 10 01 11 02 12 03 13 |
| 908 // 20 30 21 31 22 32 23 33 |
| 909 // 04 14 05 15 06 16 07 17 |
| 910 // 24 34 25 35 26 36 27 37 |
| 911 // 40 50 41 51 42 52 43 53 |
| 912 // 60 70 61 71 62 72 63 73 |
| 913 // 44 54 45 55 46 56 47 57 |
| 914 // 64 74 65 75 66 76 67 77 |
| 915 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); |
| 916 const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); |
| 917 const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); |
| 918 const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); |
| 919 const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); |
| 920 const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); |
| 921 const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); |
| 922 const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); |
| 923 // 00 10 20 30 01 11 21 31 |
| 924 // 40 50 60 70 41 51 61 71 |
| 925 // 02 12 22 32 03 13 23 33 |
| 926 // 42 52 62 72 43 53 63 73 |
| 927 // 04 14 24 34 05 15 25 35 |
| 928 // 44 54 64 74 45 55 65 75 |
| 929 // 06 16 26 36 07 17 27 37 |
| 930 // 46 56 66 76 47 57 67 77 |
| 931 in0 = _mm_unpacklo_epi64(tr1_0, tr1_4); |
| 932 in1 = _mm_unpackhi_epi64(tr1_0, tr1_4); |
| 933 in2 = _mm_unpacklo_epi64(tr1_2, tr1_6); |
| 934 in3 = _mm_unpackhi_epi64(tr1_2, tr1_6); |
| 935 in4 = _mm_unpacklo_epi64(tr1_1, tr1_5); |
| 936 in5 = _mm_unpackhi_epi64(tr1_1, tr1_5); |
| 937 in6 = _mm_unpacklo_epi64(tr1_3, tr1_7); |
| 938 in7 = _mm_unpackhi_epi64(tr1_3, tr1_7); |
| 939 // 00 10 20 30 40 50 60 70 |
| 940 // 01 11 21 31 41 51 61 71 |
| 941 // 02 12 22 32 42 52 62 72 |
| 942 // 03 13 23 33 43 53 63 73 |
| 943 // 04 14 24 34 44 54 64 74 |
| 944 // 05 15 25 35 45 55 65 75 |
| 945 // 06 16 26 36 46 56 66 76 |
| 946 // 07 17 27 37 47 57 67 77 |
| 947 } |
| 948 } |
| 949 // Post-condition output and store it |
| 950 { |
| 951 // Post-condition (division by two) |
| 952 // division by two of a 16-bit signed number using shifts |
| 953 // n / 2 = (n - (n >> 15)) >> 1 |
| 954 const __m128i sign_in0 = _mm_srai_epi16(in0, 15); |
| 955 const __m128i sign_in1 = _mm_srai_epi16(in1, 15); |
| 956 const __m128i sign_in2 = _mm_srai_epi16(in2, 15); |
| 957 const __m128i sign_in3 = _mm_srai_epi16(in3, 15); |
| 958 const __m128i sign_in4 = _mm_srai_epi16(in4, 15); |
| 959 const __m128i sign_in5 = _mm_srai_epi16(in5, 15); |
| 960 const __m128i sign_in6 = _mm_srai_epi16(in6, 15); |
| 961 const __m128i sign_in7 = _mm_srai_epi16(in7, 15); |
| 962 in0 = _mm_sub_epi16(in0, sign_in0); |
| 963 in1 = _mm_sub_epi16(in1, sign_in1); |
| 964 in2 = _mm_sub_epi16(in2, sign_in2); |
| 965 in3 = _mm_sub_epi16(in3, sign_in3); |
| 966 in4 = _mm_sub_epi16(in4, sign_in4); |
| 967 in5 = _mm_sub_epi16(in5, sign_in5); |
| 968 in6 = _mm_sub_epi16(in6, sign_in6); |
| 969 in7 = _mm_sub_epi16(in7, sign_in7); |
| 970 in0 = _mm_srai_epi16(in0, 1); |
| 971 in1 = _mm_srai_epi16(in1, 1); |
| 972 in2 = _mm_srai_epi16(in2, 1); |
| 973 in3 = _mm_srai_epi16(in3, 1); |
| 974 in4 = _mm_srai_epi16(in4, 1); |
| 975 in5 = _mm_srai_epi16(in5, 1); |
| 976 in6 = _mm_srai_epi16(in6, 1); |
| 977 in7 = _mm_srai_epi16(in7, 1); |
| 978 } |
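A quick check of the shift identity above, on a negative sample:

    /* n = -3: n >> 15 == -1, so (n - (n >> 15)) >> 1 == (-2) >> 1 == -1, */
    /* i.e. -3 / 2 truncated toward zero; a bare n >> 1 would give -2.    */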
| 979 |
| 980 iscan_ptr += n_coeffs; |
| 981 qcoeff_ptr += n_coeffs; |
| 982 dqcoeff_ptr += n_coeffs; |
| 983 n_coeffs = -n_coeffs; |
| 984 zero = _mm_setzero_si128(); |
| 985 |
| 986 if (!skip_block) { |
| 987 __m128i eob; |
| 988 __m128i round, quant, dequant; |
| 989 { |
| 990 __m128i coeff0, coeff1; |
| 991 |
| 992 // Setup global values |
| 993 { |
| 994 round = _mm_load_si128((const __m128i*)round_ptr); |
| 995 quant = _mm_load_si128((const __m128i*)quant_ptr); |
| 996 dequant = _mm_load_si128((const __m128i*)dequant_ptr); |
| 997 } |
| 998 |
| 999 { |
| 1000 __m128i coeff0_sign, coeff1_sign; |
| 1001 __m128i qcoeff0, qcoeff1; |
| 1002 __m128i qtmp0, qtmp1; |
| 1003 // Do DC and first 15 AC |
| 1004 coeff0 = *in[0]; |
| 1005 coeff1 = *in[1]; |
| 1006 |
| 1007 // Poor man's sign extract |
| 1008 coeff0_sign = _mm_srai_epi16(coeff0, 15); |
| 1009 coeff1_sign = _mm_srai_epi16(coeff1, 15); |
| 1010 qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); |
| 1011 qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); |
| 1012 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); |
| 1013 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); |
| 1014 |
| 1015 qcoeff0 = _mm_adds_epi16(qcoeff0, round); |
| 1016 round = _mm_unpackhi_epi64(round, round); |
| 1017 qcoeff1 = _mm_adds_epi16(qcoeff1, round); |
| 1018 qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); |
| 1019 quant = _mm_unpackhi_epi64(quant, quant); |
| 1020 qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); |
| 1021 |
| 1022 // Reinsert signs |
| 1023 qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); |
| 1024 qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); |
| 1025 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); |
| 1026 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); |
| 1027 |
| 1028 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0); |
| 1029 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); |
| 1030 |
| 1031 coeff0 = _mm_mullo_epi16(qcoeff0, dequant); |
| 1032 dequant = _mm_unpackhi_epi64(dequant, dequant); |
| 1033 coeff1 = _mm_mullo_epi16(qcoeff1, dequant); |
| 1034 |
| 1035 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0); |
| 1036 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1); |
| 1037 } |
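Per lane, the block above computes the scalar quantization sketched below (the a + round add is saturating in the intrinsics, and the quant_shift refinement of the C quantizer is deliberately unused here, per the (void) casts at the top of the function):

    /* sketch, per coefficient:                                          */
    /*   s       = coeff >> 15;                sign mask: 0 or -1        */
    /*   a       = (coeff ^ s) - s;            |coeff|                   */
    /*   q       = (int16_t)(((a + round) * quant) >> 16);  mulhi_epi16  */
    /*   qcoeff  = (q ^ s) - s;                restore the sign          */
    /*   dqcoeff = qcoeff * dequant;           low 16 bits, mullo_epi16  */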
| 1038 |
| 1039 { |
| 1040 // Scan for eob |
| 1041 __m128i zero_coeff0, zero_coeff1; |
| 1042 __m128i nzero_coeff0, nzero_coeff1; |
| 1043 __m128i iscan0, iscan1; |
| 1044 __m128i eob1; |
| 1045 zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); |
| 1046 zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); |
| 1047 nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); |
| 1048 nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); |
| 1049 iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs)); |
| 1050 iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1); |
| 1051 // Add one to convert from indices to counts |
| 1052 iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); |
| 1053 iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); |
| 1054 eob = _mm_and_si128(iscan0, nzero_coeff0); |
| 1055 eob1 = _mm_and_si128(iscan1, nzero_coeff1); |
| 1056 eob = _mm_max_epi16(eob, eob1); |
| 1057 } |
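The double _mm_cmpeq_epi16 sequence builds an all-ones mask for nonzero dequantized coefficients; subtracting that mask (-1 per lane) from the inverse-scan indices turns 0-based scan positions into 1-based counts, the _mm_and_si128 keeps those counts only in nonzero lanes, and the running _mm_max_epi16 leaves eob one past the last nonzero coefficient in scan order.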
| 1058 n_coeffs += 8 * 2; |
| 1059 } |
| 1060 |
| 1061 // AC only loop |
| 1062 index = 2; |
| 1063 while (n_coeffs < 0) { |
| 1064 __m128i coeff0, coeff1; |
| 1065 { |
| 1066 __m128i coeff0_sign, coeff1_sign; |
| 1067 __m128i qcoeff0, qcoeff1; |
| 1068 __m128i qtmp0, qtmp1; |
| 1069 |
| 1070 coeff0 = *in[index]; |
| 1071 coeff1 = *in[index + 1]; |
| 1072 |
| 1073 // Poor man's sign extract |
| 1074 coeff0_sign = _mm_srai_epi16(coeff0, 15); |
| 1075 coeff1_sign = _mm_srai_epi16(coeff1, 15); |
| 1076 qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); |
| 1077 qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); |
| 1078 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); |
| 1079 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); |
| 1080 |
| 1081 qcoeff0 = _mm_adds_epi16(qcoeff0, round); |
| 1082 qcoeff1 = _mm_adds_epi16(qcoeff1, round); |
| 1083 qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); |
| 1084 qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); |
| 1085 |
| 1086 // Reinsert signs |
| 1087 qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); |
| 1088 qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); |
| 1089 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); |
| 1090 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); |
| 1091 |
| 1092 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0); |
| 1093 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); |
| 1094 |
| 1095 coeff0 = _mm_mullo_epi16(qcoeff0, dequant); |
| 1096 coeff1 = _mm_mullo_epi16(qcoeff1, dequant); |
| 1097 |
| 1098 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0); |
| 1099 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1); |
| 1100 } |
| 1101 |
| 1102 { |
| 1103 // Scan for eob |
| 1104 __m128i zero_coeff0, zero_coeff1; |
| 1105 __m128i nzero_coeff0, nzero_coeff1; |
| 1106 __m128i iscan0, iscan1; |
| 1107 __m128i eob0, eob1; |
| 1108 zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); |
| 1109 zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); |
| 1110 nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); |
| 1111 nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); |
| 1112 iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs)); |
| 1113 iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1); |
| 1114 // Add one to convert from indices to counts |
| 1115 iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); |
| 1116 iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); |
| 1117 eob0 = _mm_and_si128(iscan0, nzero_coeff0); |
| 1118 eob1 = _mm_and_si128(iscan1, nzero_coeff1); |
| 1119 eob0 = _mm_max_epi16(eob0, eob1); |
| 1120 eob = _mm_max_epi16(eob, eob0); |
| 1121 } |
| 1122 n_coeffs += 8 * 2; |
| 1123 index += 2; |
| 1124 } |
| 1125 |
| 1126 // Accumulate EOB |
| 1127 { |
| 1128 __m128i eob_shuffled; |
| 1129 eob_shuffled = _mm_shuffle_epi32(eob, 0xe); |
| 1130 eob = _mm_max_epi16(eob, eob_shuffled); |
| 1131 eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); |
| 1132 eob = _mm_max_epi16(eob, eob_shuffled); |
| 1133 eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); |
| 1134 eob = _mm_max_epi16(eob, eob_shuffled); |
| 1135 *eob_ptr = _mm_extract_epi16(eob, 1); |
| 1136 } |
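The three shuffle/max steps fold the eight 16-bit lanes pairwise (upper vs. lower 64 bits, then lanes 2-3 vs. 0-1, then lane 1 vs. lane 0), leaving the overall maximum in lane 1, which _mm_extract_epi16 reads out as the final eob.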
| 1137 } else { |
| 1138 do { |
| 1139 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero); |
| 1140 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero); |
| 1141 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero); |
| 1142 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero); |
| 1143 n_coeffs += 8 * 2; |
| 1144 } while (n_coeffs < 0); |
| 1145 *eob_ptr = 0; |
| 1146 } |
| 1147 } |
| 1148 |
707 // load 8x8 array | 1149 // load 8x8 array |
708 static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in, | 1150 static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in, |
709 int stride) { | 1151 int stride) { |
710 in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride)); | 1152 in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride)); |
711 in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride)); | 1153 in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride)); |
712 in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride)); | 1154 in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride)); |
713 in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride)); | 1155 in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride)); |
714 in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride)); | 1156 in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride)); |
715 in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride)); | 1157 in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride)); |
716 in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride)); | 1158 in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride)); |
(...skipping 60 matching lines...)
777 _mm_store_si128((__m128i *)(output + 2 * stride), res[2]); | 1219 _mm_store_si128((__m128i *)(output + 2 * stride), res[2]); |
778 _mm_store_si128((__m128i *)(output + 3 * stride), res[3]); | 1220 _mm_store_si128((__m128i *)(output + 3 * stride), res[3]); |
779 _mm_store_si128((__m128i *)(output + 4 * stride), res[4]); | 1221 _mm_store_si128((__m128i *)(output + 4 * stride), res[4]); |
780 _mm_store_si128((__m128i *)(output + 5 * stride), res[5]); | 1222 _mm_store_si128((__m128i *)(output + 5 * stride), res[5]); |
781 _mm_store_si128((__m128i *)(output + 6 * stride), res[6]); | 1223 _mm_store_si128((__m128i *)(output + 6 * stride), res[6]); |
782 _mm_store_si128((__m128i *)(output + 7 * stride), res[7]); | 1224 _mm_store_si128((__m128i *)(output + 7 * stride), res[7]); |
783 } | 1225 } |
784 | 1226 |
785 void fdct8_sse2(__m128i *in) { | 1227 void fdct8_sse2(__m128i *in) { |
786 // constants | 1228 // constants |
787 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); | 1229 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); |
788 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 1230 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
789 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); | 1231 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); |
790 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); | 1232 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
791 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); | 1233 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); |
792 const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); | 1234 const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); |
793 const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); | 1235 const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); |
794 const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); | 1236 const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); |
795 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); | 1237 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); |
796 __m128i u0, u1, u2, u3, u4, u5, u6, u7; | 1238 __m128i u0, u1, u2, u3, u4, u5, u6, u7; |
797 __m128i v0, v1, v2, v3, v4, v5, v6, v7; | 1239 __m128i v0, v1, v2, v3, v4, v5, v6, v7; |
(...skipping 131 matching lines...)
929 const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); | 1371 const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); |
930 const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); | 1372 const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); |
931 const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); | 1373 const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); |
932 const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); | 1374 const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); |
933 const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); | 1375 const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); |
934 const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); | 1376 const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); |
935 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); | 1377 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); |
936 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); | 1378 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
937 const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); | 1379 const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); |
938 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 1380 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
939 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); | 1381 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); |
940 const __m128i k__const_0 = _mm_set1_epi16(0); | 1382 const __m128i k__const_0 = _mm_set1_epi16(0); |
941 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); | 1383 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); |
942 | 1384 |
943 __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; | 1385 __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; |
944 __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; | 1386 __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; |
945 __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; | 1387 __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; |
946 __m128i s0, s1, s2, s3, s4, s5, s6, s7; | 1388 __m128i s0, s1, s2, s3, s4, s5, s6, s7; |
947 __m128i in0, in1, in2, in3, in4, in5, in6, in7; | 1389 __m128i in0, in1, in2, in3, in4, in5, in6, in7; |
948 | 1390 |
949 // properly aligned for butterfly input | 1391 // properly aligned for butterfly input |
(...skipping 314 matching lines...)
1264 // in normal/row positions). | 1706 // in normal/row positions). |
1265 int pass; | 1707 int pass; |
1266 // We need an intermediate buffer between passes. | 1708 // We need an intermediate buffer between passes. |
1267 DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256); | 1709 DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256); |
1268 const int16_t *in = input; | 1710 const int16_t *in = input; |
1269 int16_t *out = intermediate; | 1711 int16_t *out = intermediate; |
1270 // Constants | 1712 // Constants |
1271 // When we use them, in one case, they are all the same. In all others | 1713 // When we use them, in one case, they are all the same. In all others |
1272 // it's a pair of them that we need to repeat four times. This is done | 1714 // it's a pair of them that we need to repeat four times. This is done |
1273 // by constructing the 32 bit constant corresponding to that pair. | 1715 // by constructing the 32 bit constant corresponding to that pair. |
1274 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); | 1716 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); |
1275 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 1717 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
1276 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); | 1718 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); |
1277 const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64); | 1719 const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64); |
1278 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); | 1720 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
1279 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); | 1721 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); |
1280 const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); | 1722 const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); |
1281 const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); | 1723 const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); |
1282 const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); | 1724 const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); |
1283 const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); | 1725 const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); |
1284 const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); | 1726 const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); |
(...skipping 622 matching lines...)
1907 // perform rounding operations | 2349 // perform rounding operations |
1908 right_shift_8x8(res0, 2); | 2350 right_shift_8x8(res0, 2); |
1909 right_shift_8x8(res0 + 8, 2); | 2351 right_shift_8x8(res0 + 8, 2); |
1910 right_shift_8x8(res1, 2); | 2352 right_shift_8x8(res1, 2); |
1911 right_shift_8x8(res1 + 8, 2); | 2353 right_shift_8x8(res1 + 8, 2); |
1912 } | 2354 } |
1913 | 2355 |
1914 void fdct16_8col(__m128i *in) { | 2356 void fdct16_8col(__m128i *in) { |
1915 // perform 16x16 1-D DCT for 8 columns | 2357 // perform 16x16 1-D DCT for 8 columns |
1916 __m128i i[8], s[8], p[8], t[8], u[16], v[16]; | 2358 __m128i i[8], s[8], p[8], t[8], u[16], v[16]; |
1917 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); | 2359 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); |
1918 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 2360 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
1919 const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); | 2361 const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); |
1920 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); | 2362 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); |
1921 const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64); | 2363 const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64); |
1922 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); | 2364 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
1923 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); | 2365 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); |
1924 const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); | 2366 const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); |
1925 const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); | 2367 const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); |
1926 const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); | 2368 const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); |
1927 const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); | 2369 const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); |
(...skipping 326 matching lines...)
2254 const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); | 2696 const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); |
2255 const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); | 2697 const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); |
2256 const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); | 2698 const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); |
2257 const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); | 2699 const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); |
2258 const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); | 2700 const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); |
2259 const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); | 2701 const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); |
2260 const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); | 2702 const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); |
2261 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); | 2703 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); |
2262 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); | 2704 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
2263 const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); | 2705 const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); |
2264 const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64); | 2706 const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64); |
2265 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); | 2707 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); |
2266 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 2708 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
2267 const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); | 2709 const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); |
2268 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); | 2710 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); |
2269 const __m128i kZero = _mm_set1_epi16(0); | 2711 const __m128i kZero = _mm_set1_epi16(0); |
2270 | 2712 |
2271 u[0] = _mm_unpacklo_epi16(in[15], in[0]); | 2713 u[0] = _mm_unpacklo_epi16(in[15], in[0]); |
2272 u[1] = _mm_unpackhi_epi16(in[15], in[0]); | 2714 u[1] = _mm_unpackhi_epi16(in[15], in[0]); |
2273 u[2] = _mm_unpacklo_epi16(in[13], in[2]); | 2715 u[2] = _mm_unpacklo_epi16(in[13], in[2]); |
2274 u[3] = _mm_unpackhi_epi16(in[13], in[2]); | 2716 u[3] = _mm_unpackhi_epi16(in[13], in[2]); |
2275 u[4] = _mm_unpacklo_epi16(in[11], in[4]); | 2717 u[4] = _mm_unpacklo_epi16(in[11], in[4]); |
(...skipping 549 matching lines...)
2825 #define FDCT32x32_HIGH_PRECISION 0 | 3267 #define FDCT32x32_HIGH_PRECISION 0 |
2826 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" | 3268 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" |
2827 #undef FDCT32x32_HIGH_PRECISION | 3269 #undef FDCT32x32_HIGH_PRECISION |
2828 #undef FDCT32x32_2D | 3270 #undef FDCT32x32_2D |
2829 | 3271 |
2830 #define FDCT32x32_2D vp9_fdct32x32_sse2 | 3272 #define FDCT32x32_2D vp9_fdct32x32_sse2 |
2831 #define FDCT32x32_HIGH_PRECISION 1 | 3273 #define FDCT32x32_HIGH_PRECISION 1 |
2832 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT | 3274 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT |
2833 #undef FDCT32x32_HIGH_PRECISION | 3275 #undef FDCT32x32_HIGH_PRECISION |
2834 #undef FDCT32x32_2D | 3276 #undef FDCT32x32_2D |
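The #define/#include pairs above instantiate the same 32x32 transform template twice, once per precision mode. A minimal sketch of that pattern, with hypothetical file and symbol names:

    /* fdct_template.inc -- included once per variant; the includer     */
    /* defines FDCT_NAME and HIGH_PRECISION before each inclusion.      */
    void FDCT_NAME(const int16_t *input, int16_t *output, int stride) {
    #if HIGH_PRECISION
      /* keep 32-bit intermediates between stages */
    #else
      /* round intermediates back down to 16 bits */
    #endif
      (void)input; (void)output; (void)stride;  /* body elided in sketch */
    }

    /* instantiation site: */
    #define FDCT_NAME fdct32x32_rd_variant
    #define HIGH_PRECISION 0
    #include "fdct_template.inc"
    #undef FDCT_NAME
    #undef HIGH_PRECISION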