| Index: source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
|
| ===================================================================
|
| --- source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c (revision 278778)
|
| +++ source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c (working copy)
|
| @@ -12,6 +12,35 @@
|
| #include "vp9/common/vp9_idct.h" // for cospi constants
|
| #include "vpx_ports/mem.h"
|
|
|
| +void vp9_fdct4x4_1_sse2(const int16_t *input, int16_t *output, int stride) {
|
| + __m128i in0, in1;
|
| + __m128i tmp;
|
| + const __m128i zero = _mm_setzero_si128();
|
| + in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
|
| + in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
|
| + in1 = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *)
|
| + (input + 2 * stride)));
|
| + in0 = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *)
|
| + (input + 3 * stride)));
|
| +
|
| + tmp = _mm_add_epi16(in0, in1);
|
| + in0 = _mm_unpacklo_epi16(zero, tmp);
|
| + in1 = _mm_unpackhi_epi16(zero, tmp);
|
| + in0 = _mm_srai_epi32(in0, 16);
|
| + in1 = _mm_srai_epi32(in1, 16);
|
| +
|
| + tmp = _mm_add_epi32(in0, in1);
|
| + in0 = _mm_unpacklo_epi32(tmp, zero);
|
| + in1 = _mm_unpackhi_epi32(tmp, zero);
|
| +
|
| + tmp = _mm_add_epi32(in0, in1);
|
| + in0 = _mm_srli_si128(tmp, 8);
|
| +
|
| + in1 = _mm_add_epi32(tmp, in0);
|
| + in0 = _mm_slli_epi32(in1, 1);
|
| + _mm_store_si128((__m128i *)(output), in0);
|
| +}
|
| +
|
| void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
|
| // This 2D transform implements 4 vertical 1D transforms followed
|
| // by 4 horizontal 1D transforms. The multiplies and adds are as given
|
| @@ -377,6 +406,46 @@
|
| }
|
| }
|
|
|
| +void vp9_fdct8x8_1_sse2(const int16_t *input, int16_t *output, int stride) {
|
| + __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
|
| + __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
|
| + __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
|
| + __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
|
| + __m128i u0, u1, sum;
|
| +
|
| + u0 = _mm_add_epi16(in0, in1);
|
| + u1 = _mm_add_epi16(in2, in3);
|
| +
|
| + in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
|
| + in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
|
| + in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
|
| + in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
|
| +
|
| + sum = _mm_add_epi16(u0, u1);
|
| +
|
| + in0 = _mm_add_epi16(in0, in1);
|
| + in2 = _mm_add_epi16(in2, in3);
|
| + sum = _mm_add_epi16(sum, in0);
|
| +
|
| + u0 = _mm_setzero_si128();
|
| + sum = _mm_add_epi16(sum, in2);
|
| +
|
| + in0 = _mm_unpacklo_epi16(u0, sum);
|
| + in1 = _mm_unpackhi_epi16(u0, sum);
|
| + in0 = _mm_srai_epi32(in0, 16);
|
| + in1 = _mm_srai_epi32(in1, 16);
|
| +
|
| + sum = _mm_add_epi32(in0, in1);
|
| + in0 = _mm_unpacklo_epi32(sum, u0);
|
| + in1 = _mm_unpackhi_epi32(sum, u0);
|
| +
|
| + sum = _mm_add_epi32(in0, in1);
|
| + in0 = _mm_srli_si128(sum, 8);
|
| +
|
| + in1 = _mm_add_epi32(sum, in0);
|
| + _mm_store_si128((__m128i *)(output), in1);
|
| +}
|
| +
|
| void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {
|
| int pass;
|
| // Constants
|
| @@ -1168,6 +1237,74 @@
|
| }
|
| }
|
|
|
| +void vp9_fdct16x16_1_sse2(const int16_t *input, int16_t *output, int stride) {
|
| + __m128i in0, in1, in2, in3;
|
| + __m128i u0, u1;
|
| + __m128i sum = _mm_setzero_si128();
|
| + int i;
|
| +
|
| + for (i = 0; i < 2; ++i) {
|
| + input += 8 * i;
|
| + in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
|
| + in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
|
| + in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
|
| + in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
|
| +
|
| + u0 = _mm_add_epi16(in0, in1);
|
| + u1 = _mm_add_epi16(in2, in3);
|
| + sum = _mm_add_epi16(sum, u0);
|
| +
|
| + in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
|
| + in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
|
| + in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
|
| + in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
|
| +
|
| + sum = _mm_add_epi16(sum, u1);
|
| + u0 = _mm_add_epi16(in0, in1);
|
| + u1 = _mm_add_epi16(in2, in3);
|
| + sum = _mm_add_epi16(sum, u0);
|
| +
|
| + in0 = _mm_load_si128((const __m128i *)(input + 8 * stride));
|
| + in1 = _mm_load_si128((const __m128i *)(input + 9 * stride));
|
| + in2 = _mm_load_si128((const __m128i *)(input + 10 * stride));
|
| + in3 = _mm_load_si128((const __m128i *)(input + 11 * stride));
|
| +
|
| + sum = _mm_add_epi16(sum, u1);
|
| + u0 = _mm_add_epi16(in0, in1);
|
| + u1 = _mm_add_epi16(in2, in3);
|
| + sum = _mm_add_epi16(sum, u0);
|
| +
|
| + in0 = _mm_load_si128((const __m128i *)(input + 12 * stride));
|
| + in1 = _mm_load_si128((const __m128i *)(input + 13 * stride));
|
| + in2 = _mm_load_si128((const __m128i *)(input + 14 * stride));
|
| + in3 = _mm_load_si128((const __m128i *)(input + 15 * stride));
|
| +
|
| + sum = _mm_add_epi16(sum, u1);
|
| + u0 = _mm_add_epi16(in0, in1);
|
| + u1 = _mm_add_epi16(in2, in3);
|
| + sum = _mm_add_epi16(sum, u0);
|
| +
|
| + sum = _mm_add_epi16(sum, u1);
|
| + }
|
| +
|
| + u0 = _mm_setzero_si128();
|
| + in0 = _mm_unpacklo_epi16(u0, sum);
|
| + in1 = _mm_unpackhi_epi16(u0, sum);
|
| + in0 = _mm_srai_epi32(in0, 16);
|
| + in1 = _mm_srai_epi32(in1, 16);
|
| +
|
| + sum = _mm_add_epi32(in0, in1);
|
| + in0 = _mm_unpacklo_epi32(sum, u0);
|
| + in1 = _mm_unpackhi_epi32(sum, u0);
|
| +
|
| + sum = _mm_add_epi32(in0, in1);
|
| + in0 = _mm_srli_si128(sum, 8);
|
| +
|
| + in1 = _mm_add_epi32(sum, in0);
|
| + in1 = _mm_srai_epi32(in1, 1);
|
| + _mm_store_si128((__m128i *)(output), in1);
|
| +}
|
| +
|
| void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
|
| // The 2D transform is done with two passes which are actually pretty
|
| // similar. In the first one, we transform the columns and transpose
|
| @@ -1187,7 +1324,7 @@
|
| const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
|
| const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
|
| const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
|
| - const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
|
| + const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
|
| const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
|
| const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
|
| const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
|
| @@ -1513,8 +1650,8 @@
|
| const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
|
| const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24);
|
| const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24);
|
| - const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08);
|
| - const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08);
|
| + const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p24_p08);
|
| + const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p24_p08);
|
| // dct_const_round_shift
|
| const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
|
| const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
|
| @@ -1535,8 +1672,8 @@
|
| const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
|
| const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08);
|
| const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08);
|
| - const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24);
|
| - const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m08_p24);
|
| + const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p08_m24);
|
| + const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p08_m24);
|
| // dct_const_round_shift
|
| const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
|
| const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
|
| @@ -1554,10 +1691,10 @@
|
| {
|
| step1_0 = _mm_add_epi16(step3_0, step2_1);
|
| step1_1 = _mm_sub_epi16(step3_0, step2_1);
|
| - step1_2 = _mm_sub_epi16(step3_3, step2_2);
|
| - step1_3 = _mm_add_epi16(step3_3, step2_2);
|
| - step1_4 = _mm_add_epi16(step3_4, step2_5);
|
| - step1_5 = _mm_sub_epi16(step3_4, step2_5);
|
| + step1_2 = _mm_add_epi16(step3_3, step2_2);
|
| + step1_3 = _mm_sub_epi16(step3_3, step2_2);
|
| + step1_4 = _mm_sub_epi16(step3_4, step2_5);
|
| + step1_5 = _mm_add_epi16(step3_4, step2_5);
|
| step1_6 = _mm_sub_epi16(step3_7, step2_6);
|
| step1_7 = _mm_add_epi16(step3_7, step2_6);
|
| }
|
| @@ -1848,7 +1985,7 @@
|
| const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
|
| const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
|
| const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
|
| - const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
|
| + const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
|
| const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
|
| const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
|
| const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
|
| @@ -2052,10 +2189,10 @@
|
|
|
| v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
|
| v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
|
| - v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08);
|
| - v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08);
|
| - v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24);
|
| - v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24);
|
| + v[2] = _mm_madd_epi16(u[2], k__cospi_p24_p08);
|
| + v[3] = _mm_madd_epi16(u[3], k__cospi_p24_p08);
|
| + v[4] = _mm_madd_epi16(u[2], k__cospi_p08_m24);
|
| + v[5] = _mm_madd_epi16(u[3], k__cospi_p08_m24);
|
| v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
|
| v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);
|
|
|
| @@ -2085,10 +2222,10 @@
|
| // stage 5
|
| s[0] = _mm_add_epi16(p[0], t[1]);
|
| s[1] = _mm_sub_epi16(p[0], t[1]);
|
| - s[2] = _mm_sub_epi16(p[3], t[2]);
|
| - s[3] = _mm_add_epi16(p[3], t[2]);
|
| - s[4] = _mm_add_epi16(p[4], t[5]);
|
| - s[5] = _mm_sub_epi16(p[4], t[5]);
|
| + s[2] = _mm_add_epi16(p[3], t[2]);
|
| + s[3] = _mm_sub_epi16(p[3], t[2]);
|
| + s[4] = _mm_sub_epi16(p[4], t[5]);
|
| + s[5] = _mm_add_epi16(p[4], t[5]);
|
| s[6] = _mm_sub_epi16(p[7], t[6]);
|
| s[7] = _mm_add_epi16(p[7], t[6]);
|
|
|
| @@ -2680,6 +2817,77 @@
|
| }
|
| }
|
|
|
| +void vp9_fdct32x32_1_sse2(const int16_t *input, int16_t *output, int stride) {
|
| + __m128i in0, in1, in2, in3;
|
| + __m128i u0, u1;
|
| + __m128i sum = _mm_setzero_si128();
|
| + int i;
|
| +
|
| + for (i = 0; i < 8; ++i) {
|
| + in0 = _mm_load_si128((const __m128i *)(input + 0));
|
| + in1 = _mm_load_si128((const __m128i *)(input + 8));
|
| + in2 = _mm_load_si128((const __m128i *)(input + 16));
|
| + in3 = _mm_load_si128((const __m128i *)(input + 24));
|
| +
|
| + input += stride;
|
| + u0 = _mm_add_epi16(in0, in1);
|
| + u1 = _mm_add_epi16(in2, in3);
|
| + sum = _mm_add_epi16(sum, u0);
|
| +
|
| + in0 = _mm_load_si128((const __m128i *)(input + 0));
|
| + in1 = _mm_load_si128((const __m128i *)(input + 8));
|
| + in2 = _mm_load_si128((const __m128i *)(input + 16));
|
| + in3 = _mm_load_si128((const __m128i *)(input + 24));
|
| +
|
| + input += stride;
|
| + sum = _mm_add_epi16(sum, u1);
|
| + u0 = _mm_add_epi16(in0, in1);
|
| + u1 = _mm_add_epi16(in2, in3);
|
| + sum = _mm_add_epi16(sum, u0);
|
| +
|
| + in0 = _mm_load_si128((const __m128i *)(input + 0));
|
| + in1 = _mm_load_si128((const __m128i *)(input + 8));
|
| + in2 = _mm_load_si128((const __m128i *)(input + 16));
|
| + in3 = _mm_load_si128((const __m128i *)(input + 24));
|
| +
|
| + input += stride;
|
| + sum = _mm_add_epi16(sum, u1);
|
| + u0 = _mm_add_epi16(in0, in1);
|
| + u1 = _mm_add_epi16(in2, in3);
|
| + sum = _mm_add_epi16(sum, u0);
|
| +
|
| + in0 = _mm_load_si128((const __m128i *)(input + 0));
|
| + in1 = _mm_load_si128((const __m128i *)(input + 8));
|
| + in2 = _mm_load_si128((const __m128i *)(input + 16));
|
| + in3 = _mm_load_si128((const __m128i *)(input + 24));
|
| +
|
| + input += stride;
|
| + sum = _mm_add_epi16(sum, u1);
|
| + u0 = _mm_add_epi16(in0, in1);
|
| + u1 = _mm_add_epi16(in2, in3);
|
| + sum = _mm_add_epi16(sum, u0);
|
| +
|
| + sum = _mm_add_epi16(sum, u1);
|
| + }
|
| +
|
| + u0 = _mm_setzero_si128();
|
| + in0 = _mm_unpacklo_epi16(u0, sum);
|
| + in1 = _mm_unpackhi_epi16(u0, sum);
|
| + in0 = _mm_srai_epi32(in0, 16);
|
| + in1 = _mm_srai_epi32(in1, 16);
|
| +
|
| + sum = _mm_add_epi32(in0, in1);
|
| + in0 = _mm_unpacklo_epi32(sum, u0);
|
| + in1 = _mm_unpackhi_epi32(sum, u0);
|
| +
|
| + sum = _mm_add_epi32(in0, in1);
|
| + in0 = _mm_srli_si128(sum, 8);
|
| +
|
| + in1 = _mm_add_epi32(sum, in0);
|
| + in1 = _mm_srai_epi32(in1, 3);
|
| + _mm_store_si128((__m128i *)(output), in1);
|
| +}
|
| +
|
| #define FDCT32x32_2D vp9_fdct32x32_rd_sse2
|
| #define FDCT32x32_HIGH_PRECISION 0
|
| #include "vp9/encoder/x86/vp9_dct32x32_sse2.c"
|
|
|
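Note on the four new *_1 kernels added above: each one is a DC-only forward transform. It sums the residual block (rows are accumulated as packed 16-bit lanes, widened to 32 bits by interleaving with zero and arithmetic-shifting right by 16, then reduced horizontally), and applies a per-size scale before storing. The scalar sketch below is for reference only and is not part of the patch; the names block_sum and fdct*_1_ref are illustrative, and the scaling shown is simply what the final _mm_slli_epi32 / _mm_srai_epi32 in each SSE2 kernel implies.

    #include <stdint.h>

    /* Illustrative helper: sum of an n x n block of 16-bit residuals. */
    static int32_t block_sum(const int16_t *input, int stride, int n) {
      int32_t sum = 0;
      for (int r = 0; r < n; ++r)
        for (int c = 0; c < n; ++c)
          sum += input[r * stride + c];
      return sum;
    }

    /* DC-only outputs, matching the final shift in each SSE2 kernel:
     *   4x4   -> sum << 1   (_mm_slli_epi32(in1, 1))
     *   8x8   -> sum        (no shift)
     *   16x16 -> sum >> 1   (_mm_srai_epi32(in1, 1))
     *   32x32 -> sum >> 3   (_mm_srai_epi32(in1, 3))
     * Only output[0] is meaningful; the SSE2 versions store a full 16-byte
     * vector and the caller is expected to read just the DC coefficient. */
    static void fdct4x4_1_ref(const int16_t *in, int16_t *out, int stride) {
      out[0] = (int16_t)(block_sum(in, stride, 4) << 1);
    }
    static void fdct8x8_1_ref(const int16_t *in, int16_t *out, int stride) {
      out[0] = (int16_t)block_sum(in, stride, 8);
    }
    static void fdct16x16_1_ref(const int16_t *in, int16_t *out, int stride) {
      out[0] = (int16_t)(block_sum(in, stride, 16) >> 1);
    }
    static void fdct32x32_1_ref(const int16_t *in, int16_t *out, int stride) {
      out[0] = (int16_t)(block_sum(in, stride, 32) >> 3);
    }

Truncating the 32-bit sum to int16_t in the sketch mirrors reading only output[0] from the SSE2 store; for 8-bit residuals the scaled sum fits in 16 bits even for the 32x32 block (1024 * 255 >> 3 = 32640 < 32767).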