Index: source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
===================================================================
--- source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c	(revision 219822)
+++ source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c	(working copy)
@@ -523,9 +523,9 @@
   { \
     __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
     d0 = _mm_unpacklo_epi8(d0, zero); \
-    in_x = _mm_add_epi16(in_x, d0); \
-    in_x = _mm_packus_epi16(in_x, in_x); \
-    _mm_storel_epi64((__m128i *)(dest), in_x); \
+    d0 = _mm_add_epi16(in_x, d0); \
+    d0 = _mm_packus_epi16(d0, d0); \
+    _mm_storel_epi64((__m128i *)(dest), d0); \
     dest += stride; \
   }
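
This first hunk fixes a register-clobbering bug in the RECON_AND_STORE macro: the old body wrote the summed result back into its in_x argument, so a caller passing the same __m128i to several invocations (as the DC-only functions added below do with dc_value) would see its input corrupted after the first call. Routing the result through the local d0 leaves in_x read-only. For reference, a scalar sketch of what one invocation does per 8-pixel row; the helper name is illustrative, not part of the patch:

    /* Illustrative scalar equivalent of one RECON_AND_STORE step. */
    static void recon_and_store_row(const int16_t *residual, uint8_t *dest) {
      int k;
      for (k = 0; k < 8; ++k) {
        const int v = dest[k] + residual[k];  /* unpack pixel, add residual */
        /* _mm_packus_epi16 saturates each result to [0, 255] */
        dest[k] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
      }
    }
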
@@ -597,6 +597,27 @@
   RECON_AND_STORE(dest, in7);
 }
 
+void vp9_short_idct8x8_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+  __m128i dc_value;
+  const __m128i zero = _mm_setzero_si128();
+  int a;
+
+  a = dct_const_round_shift(input[0] * cospi_16_64);
+  a = dct_const_round_shift(a * cospi_16_64);
+  a = ROUND_POWER_OF_TWO(a, 5);
+
+  dc_value = _mm_set1_epi16(a);
+
+  RECON_AND_STORE(dest, dc_value);
+  RECON_AND_STORE(dest, dc_value);
+  RECON_AND_STORE(dest, dc_value);
+  RECON_AND_STORE(dest, dc_value);
+  RECON_AND_STORE(dest, dc_value);
+  RECON_AND_STORE(dest, dc_value);
+  RECON_AND_STORE(dest, dc_value);
+  RECON_AND_STORE(dest, dc_value);
+}
+
 // perform 8x8 transpose
 static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
   const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
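
vp9_short_idct8x8_1_add_sse2 handles the common case where only the DC coefficient is nonzero: each 1-D inverse-transform pass then collapses to a multiply by cospi_16_64 with rounding, followed by the final right shift by 5 used for 8x8 blocks, and the resulting constant is splatted across a register and added to all 64 destination pixels through RECON_AND_STORE. A scalar sketch of the same computation, assuming the usual libvpx definitions of dct_const_round_shift, ROUND_POWER_OF_TWO, and cospi_16_64:

    /* Scalar sketch of the DC-only 8x8 path (illustrative only). */
    void idct8x8_1_add_sketch(const int16_t *input, uint8_t *dest, int stride) {
      int r, c;
      int a = dct_const_round_shift(input[0] * cospi_16_64);  /* pass 1 */
      a = dct_const_round_shift(a * cospi_16_64);             /* pass 2 */
      a = ROUND_POWER_OF_TWO(a, 5);                           /* 8x8 final shift */
      for (r = 0; r < 8; ++r) {
        for (c = 0; c < 8; ++c) {
          const int v = dest[r * stride + c] + a;
          dest[r * stride + c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
        }
      }
    }
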
@@ -1449,6 +1470,38 @@
   }
 }
 
+void vp9_short_idct16x16_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+  __m128i dc_value;
+  const __m128i zero = _mm_setzero_si128();
+  int a, i;
+
+  a = dct_const_round_shift(input[0] * cospi_16_64);
+  a = dct_const_round_shift(a * cospi_16_64);
+  a = ROUND_POWER_OF_TWO(a, 6);
+
+  dc_value = _mm_set1_epi16(a);
+
+  for (i = 0; i < 2; ++i) {
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    dest += 8 - (stride * 16);
+  }
+}
+
 static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
   __m128i tbuf[8];
   array_transpose_8x8(res0, res0);
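
The 16x16 variant is the same idea with the wider block's final rounding shift (6 instead of 5) and a two-pass traversal: RECON_AND_STORE writes 8 pixels per call, so the 16-pixel-wide block is covered in two 8-wide halves. After 16 calls, dest has advanced by stride * 16; the adjustment dest += 8 - (stride * 16) rewinds it to the top row and steps 8 pixels right for the second half. The walk, sketched with illustrative loop variables:

    /* Pointer walk of the 16x16 DC-only path (sketch, not patch code). */
    for (i = 0; i < 2; ++i) {            /* two 8-pixel-wide halves */
      for (row = 0; row < 16; ++row) {
        /* RECON_AND_STORE writes 8 pixels, then does dest += stride */
        dest += stride;
      }
      dest += 8 - (stride * 16);         /* back up 16 rows, right by 8 */
    }
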
@@ -2760,6 +2813,12 @@
   }
 }
 
+#define LOAD_DQCOEFF(reg, input) \
+  { \
+    reg = _mm_load_si128((__m128i *) input); \
+    input += 8; \
+  } \
+
 void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<5);
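
LOAD_DQCOEFF bundles each aligned 8-coefficient load with the pointer advance, which lets the 32x32 path below drop both the explicit input + 8 * n offsets and the trailing input += 256 correction. A function-style equivalent, written here only as a sketch (the patch keeps it as a macro):

    /* Hypothetical helper equivalent to LOAD_DQCOEFF: load eight 16-bit
     * dequantized coefficients and advance the read pointer past them. */
    static INLINE __m128i load_dqcoeff(int16_t **input) {
      const __m128i reg = _mm_load_si128((const __m128i *)*input);
      *input += 8;
      return reg;
    }
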
@@ -2827,49 +2886,127 @@
           stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
           stp2_30, stp2_31;
   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i, j;
+  int i, j, i32;
+  __m128i zero_idx[16];
+  int zero_flag[2];
 
   // We work on a 8x32 block each time, and loop 8 times for 2-D 32x32 idct.
   for (i = 0; i < 8; i++) {
+    i32 = (i << 5);
     if (i < 4) {
       // First 1-D idct
       // Load input data.
-      in0 = _mm_load_si128((__m128i *)input);
-      in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
-      in16 = _mm_load_si128((__m128i *)(input + 8 * 2));
-      in24 = _mm_load_si128((__m128i *)(input + 8 * 3));
-      in1 = _mm_load_si128((__m128i *)(input + 8 * 4));
-      in9 = _mm_load_si128((__m128i *)(input + 8 * 5));
-      in17 = _mm_load_si128((__m128i *)(input + 8 * 6));
-      in25 = _mm_load_si128((__m128i *)(input + 8 * 7));
-      in2 = _mm_load_si128((__m128i *)(input + 8 * 8));
-      in10 = _mm_load_si128((__m128i *)(input + 8 * 9));
-      in18 = _mm_load_si128((__m128i *)(input + 8 * 10));
-      in26 = _mm_load_si128((__m128i *)(input + 8 * 11));
-      in3 = _mm_load_si128((__m128i *)(input + 8 * 12));
-      in11 = _mm_load_si128((__m128i *)(input + 8 * 13));
-      in19 = _mm_load_si128((__m128i *)(input + 8 * 14));
-      in27 = _mm_load_si128((__m128i *)(input + 8 * 15));
+      LOAD_DQCOEFF(in0, input);
+      LOAD_DQCOEFF(in8, input);
+      LOAD_DQCOEFF(in16, input);
+      LOAD_DQCOEFF(in24, input);
+      LOAD_DQCOEFF(in1, input);
+      LOAD_DQCOEFF(in9, input);
+      LOAD_DQCOEFF(in17, input);
+      LOAD_DQCOEFF(in25, input);
+      LOAD_DQCOEFF(in2, input);
+      LOAD_DQCOEFF(in10, input);
+      LOAD_DQCOEFF(in18, input);
+      LOAD_DQCOEFF(in26, input);
+      LOAD_DQCOEFF(in3, input);
+      LOAD_DQCOEFF(in11, input);
+      LOAD_DQCOEFF(in19, input);
+      LOAD_DQCOEFF(in27, input);
 
-      in4 = _mm_load_si128((__m128i *)(input + 8 * 16));
-      in12 = _mm_load_si128((__m128i *)(input + 8 * 17));
-      in20 = _mm_load_si128((__m128i *)(input + 8 * 18));
-      in28 = _mm_load_si128((__m128i *)(input + 8 * 19));
-      in5 = _mm_load_si128((__m128i *)(input + 8 * 20));
-      in13 = _mm_load_si128((__m128i *)(input + 8 * 21));
-      in21 = _mm_load_si128((__m128i *)(input + 8 * 22));
-      in29 = _mm_load_si128((__m128i *)(input + 8 * 23));
-      in6 = _mm_load_si128((__m128i *)(input + 8 * 24));
-      in14 = _mm_load_si128((__m128i *)(input + 8 * 25));
-      in22 = _mm_load_si128((__m128i *)(input + 8 * 26));
-      in30 = _mm_load_si128((__m128i *)(input + 8 * 27));
-      in7 = _mm_load_si128((__m128i *)(input + 8 * 28));
-      in15 = _mm_load_si128((__m128i *)(input + 8 * 29));
-      in23 = _mm_load_si128((__m128i *)(input + 8 * 30));
-      in31 = _mm_load_si128((__m128i *)(input + 8 * 31));
+      LOAD_DQCOEFF(in4, input);
+      LOAD_DQCOEFF(in12, input);
+      LOAD_DQCOEFF(in20, input);
+      LOAD_DQCOEFF(in28, input);
+      LOAD_DQCOEFF(in5, input);
+      LOAD_DQCOEFF(in13, input);
+      LOAD_DQCOEFF(in21, input);
+      LOAD_DQCOEFF(in29, input);
+      LOAD_DQCOEFF(in6, input);
+      LOAD_DQCOEFF(in14, input);
+      LOAD_DQCOEFF(in22, input);
+      LOAD_DQCOEFF(in30, input);
+      LOAD_DQCOEFF(in7, input);
+      LOAD_DQCOEFF(in15, input);
+      LOAD_DQCOEFF(in23, input);
+      LOAD_DQCOEFF(in31, input);
 
-      input += 256;
+      // checking if all entries are zero
+      zero_idx[0] = _mm_or_si128(in0, in1);
+      zero_idx[1] = _mm_or_si128(in2, in3);
+      zero_idx[2] = _mm_or_si128(in4, in5);
+      zero_idx[3] = _mm_or_si128(in6, in7);
+      zero_idx[4] = _mm_or_si128(in8, in9);
+      zero_idx[5] = _mm_or_si128(in10, in11);
+      zero_idx[6] = _mm_or_si128(in12, in13);
+      zero_idx[7] = _mm_or_si128(in14, in15);
+      zero_idx[8] = _mm_or_si128(in16, in17);
+      zero_idx[9] = _mm_or_si128(in18, in19);
+      zero_idx[10] = _mm_or_si128(in20, in21);
+      zero_idx[11] = _mm_or_si128(in22, in23);
+      zero_idx[12] = _mm_or_si128(in24, in25);
+      zero_idx[13] = _mm_or_si128(in26, in27);
+      zero_idx[14] = _mm_or_si128(in28, in29);
+      zero_idx[15] = _mm_or_si128(in30, in31);
+
+      zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
+      zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
+      zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
+      zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
+      zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
+      zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
+      zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
+      zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
+
+      zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
+      zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
+      zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
+      zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
+      zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
+      zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
+      zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
+
+      zero_idx[0] = _mm_unpackhi_epi64(zero_idx[14], zero_idx[14]);
+      zero_idx[1] = _mm_or_si128(zero_idx[0], zero_idx[14]);
+      zero_idx[2] = _mm_srli_epi64(zero_idx[1], 32);
+      zero_flag[0] = _mm_cvtsi128_si32(zero_idx[1]);
+      zero_flag[1] = _mm_cvtsi128_si32(zero_idx[2]);
+
+      if (!zero_flag[0] && !zero_flag[1]) {
+        col[i32 + 0] = _mm_setzero_si128();
+        col[i32 + 1] = _mm_setzero_si128();
+        col[i32 + 2] = _mm_setzero_si128();
+        col[i32 + 3] = _mm_setzero_si128();
+        col[i32 + 4] = _mm_setzero_si128();
+        col[i32 + 5] = _mm_setzero_si128();
+        col[i32 + 6] = _mm_setzero_si128();
+        col[i32 + 7] = _mm_setzero_si128();
+        col[i32 + 8] = _mm_setzero_si128();
+        col[i32 + 9] = _mm_setzero_si128();
+        col[i32 + 10] = _mm_setzero_si128();
+        col[i32 + 11] = _mm_setzero_si128();
+        col[i32 + 12] = _mm_setzero_si128();
+        col[i32 + 13] = _mm_setzero_si128();
+        col[i32 + 14] = _mm_setzero_si128();
+        col[i32 + 15] = _mm_setzero_si128();
+        col[i32 + 16] = _mm_setzero_si128();
+        col[i32 + 17] = _mm_setzero_si128();
+        col[i32 + 18] = _mm_setzero_si128();
+        col[i32 + 19] = _mm_setzero_si128();
+        col[i32 + 20] = _mm_setzero_si128();
+        col[i32 + 21] = _mm_setzero_si128();
+        col[i32 + 22] = _mm_setzero_si128();
+        col[i32 + 23] = _mm_setzero_si128();
+        col[i32 + 24] = _mm_setzero_si128();
+        col[i32 + 25] = _mm_setzero_si128();
+        col[i32 + 26] = _mm_setzero_si128();
+        col[i32 + 27] = _mm_setzero_si128();
+        col[i32 + 28] = _mm_setzero_si128();
+        col[i32 + 29] = _mm_setzero_si128();
+        col[i32 + 30] = _mm_setzero_si128();
+        col[i32 + 31] = _mm_setzero_si128();
+        continue;
+      }
+
       // Transpose 32x8 block to 8x32 block
       TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                     in4, in5, in6, in7);
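
The block added here is an early-out for all-zero 8x32 slices, which are common in dequantized 32x32 coefficient data. It ORs the 32 input vectors together in a binary tree (16 ORs, then 8, then 4, 2, 1), folds the surviving 128-bit value down to 64 bits and then two 32-bit halves, and, when both halves are zero, writes zeros to the 32 intermediate columns and skips the first 1-D pass with continue. SSE2 has no whole-register zero test (PTEST only arrived with SSE4.1), hence the fold; the same test as a stand-alone sketch:

    /* Sketch: SSE2 test for an all-zero 128-bit vector. */
    static INLINE int vector_is_zero_sse2(__m128i v) {
      const __m128i hi64 = _mm_unpackhi_epi64(v, v);  /* high 64 -> low 64 */
      const __m128i or64 = _mm_or_si128(hi64, v);     /* fold to 64 bits   */
      const __m128i hi32 = _mm_srli_epi64(or64, 32);  /* high 32 of fold   */
      return !(_mm_cvtsi128_si32(or64) | _mm_cvtsi128_si32(hi32));
    }
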
@@ -3239,38 +3376,38 @@
     // final stage
     if (i < 4) {
       // 1_D: Store 32 intermediate results for each 8x32 block.
-      col[i * 32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
-      col[i * 32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
-      col[i * 32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
-      col[i * 32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
-      col[i * 32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
-      col[i * 32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
-      col[i * 32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
-      col[i * 32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
-      col[i * 32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
-      col[i * 32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
-      col[i * 32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
-      col[i * 32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
-      col[i * 32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
-      col[i * 32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
-      col[i * 32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
-      col[i * 32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
-      col[i * 32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
-      col[i * 32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
-      col[i * 32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
-      col[i * 32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
-      col[i * 32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
-      col[i * 32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
-      col[i * 32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
-      col[i * 32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
-      col[i * 32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
-      col[i * 32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
-      col[i * 32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
-      col[i * 32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
-      col[i * 32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
-      col[i * 32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
-      col[i * 32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
-      col[i * 32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
+      col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
+      col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
+      col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
+      col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
+      col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
+      col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
+      col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
+      col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
+      col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
+      col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
+      col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
+      col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
+      col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
+      col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
+      col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
+      col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
+      col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
+      col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
+      col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
+      col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
+      col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
+      col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
+      col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
+      col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
+      col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
+      col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
+      col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
+      col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
+      col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
+      col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
+      col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
+      col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
     } else {
       const __m128i zero = _mm_setzero_si128();
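
The final hunk is mechanical: the per-store index expression i * 32 is replaced by i32, computed once per outer iteration as i << 5. The stores themselves are the last butterfly stage of the 32-point IDCT; written as a loop over a hypothetical stp1[] array (the real code unrolls it because the 32 values live in named registers):

    /* Equivalent loop form of the final-stage stores (sketch only). */
    for (j = 0; j < 16; ++j) {
      col[i32 + j]      = _mm_add_epi16(stp1[j], stp1[31 - j]);
      col[i32 + 31 - j] = _mm_sub_epi16(stp1[j], stp1[31 - j]);
    }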
|