Index: source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c |
=================================================================== |
--- source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c (revision 278778) |
+++ source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c (working copy) |
@@ -12,6 +12,35 @@ |
#include "vp9/common/vp9_idct.h" // for cospi constants |
#include "vpx_ports/mem.h" |
+// Adds up the 16 samples of the 4x4 block at `input` (rows are `stride` |
+// int16 elements apart) and stores twice that sum in the low 32 bits at |
+// `output`. Presumably the sum-only companion of vp9_fdct4x4_sse2 - confirm. |
+// NOTE(review): the final _mm_store_si128 writes 16 bytes (output[0..7]) and |
+// needs a 16-byte-aligned `output`. output[1] receives the upper half of the |
+// 32-bit result and output[4..5] a leftover partial sum, so confirm callers |
+// read only output[0]. |
+void vp9_fdct4x4_1_sse2(const int16_t *input, int16_t *output, int stride) { |
+ __m128i in0, in1; |
+ __m128i tmp; |
+ const __m128i zero = _mm_setzero_si128(); |
+ // Each 64-bit load fetches one 4-sample row. After the two unpacks, in0 |
+ // holds rows 0 and 3 and in1 holds rows 1 and 2. |
+ in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); |
+ in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); |
+ in1 = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *) |
+ (input + 2 * stride))); |
+ in0 = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *) |
+ (input + 3 * stride))); |
+ |
+ // Eight 16-bit partial sums (wrapping adds). |
+ tmp = _mm_add_epi16(in0, in1); |
+ // Sign-extend to 32 bits: interleaving with zero puts each 16-bit value in |
+ // the upper half of a 32-bit lane, and the arithmetic shift brings it back |
+ // down with its sign. |
+ in0 = _mm_unpacklo_epi16(zero, tmp); |
+ in1 = _mm_unpackhi_epi16(zero, tmp); |
+ in0 = _mm_srai_epi32(in0, 16); |
+ in1 = _mm_srai_epi32(in1, 16); |
+ |
+ // Horizontal reduction: four 32-bit lanes down to two. |
+ tmp = _mm_add_epi32(in0, in1); |
+ in0 = _mm_unpacklo_epi32(tmp, zero); |
+ in1 = _mm_unpackhi_epi32(tmp, zero); |
+ |
+ // Fold the upper 8 bytes onto the lower ones; lane 0 then holds the total. |
+ tmp = _mm_add_epi32(in0, in1); |
+ in0 = _mm_srli_si128(tmp, 8); |
+ |
+ in1 = _mm_add_epi32(tmp, in0); |
+ // Double every 32-bit lane before storing. |
+ in0 = _mm_slli_epi32(in1, 1); |
+ _mm_store_si128((__m128i *)(output), in0); |
+} |
+ |
void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) { |
// This 2D transform implements 4 vertical 1D transforms followed |
// by 4 horizontal 1D transforms. The multiplies and adds are as given |
@@ -377,6 +406,46 @@ |
} |
} |
+// Adds up the 64 samples of the 8x8 block at `input` (rows are `stride` |
+// int16 elements apart) and stores the unscaled sum in the low 32 bits at |
+// `output`. |
+// NOTE(review): every row is read with the aligned _mm_load_si128 and the |
+// result is written with _mm_store_si128, so each row address and `output` |
+// must be 16-byte aligned. The store covers output[0..7] - confirm callers |
+// read only output[0]. |
+void vp9_fdct8x8_1_sse2(const int16_t *input, int16_t *output, int stride) { |
+ // Rows 0..3, eight samples each. |
+ __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); |
+ __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); |
+ __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); |
+ __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); |
+ __m128i u0, u1, sum; |
+ |
+ u0 = _mm_add_epi16(in0, in1); |
+ u1 = _mm_add_epi16(in2, in3); |
+ |
+ // Rows 4..7. |
+ in0 = _mm_load_si128((const __m128i *)(input + 4 * stride)); |
+ in1 = _mm_load_si128((const __m128i *)(input + 5 * stride)); |
+ in2 = _mm_load_si128((const __m128i *)(input + 6 * stride)); |
+ in3 = _mm_load_si128((const __m128i *)(input + 7 * stride)); |
+ |
+ // `sum` collects per-column totals in eight wrapping 16-bit lanes. |
+ sum = _mm_add_epi16(u0, u1); |
+ |
+ in0 = _mm_add_epi16(in0, in1); |
+ in2 = _mm_add_epi16(in2, in3); |
+ sum = _mm_add_epi16(sum, in0); |
+ |
+ // u0 is reused as the all-zero vector for the unpacks below. |
+ u0 = _mm_setzero_si128(); |
+ sum = _mm_add_epi16(sum, in2); |
+ |
+ // Sign-extend the 16-bit lanes to 32 bits: value in the upper half of each |
+ // 32-bit lane, then arithmetic shift right by 16. |
+ in0 = _mm_unpacklo_epi16(u0, sum); |
+ in1 = _mm_unpackhi_epi16(u0, sum); |
+ in0 = _mm_srai_epi32(in0, 16); |
+ in1 = _mm_srai_epi32(in1, 16); |
+ |
+ // Horizontal reduction: four 32-bit lanes down to two. |
+ sum = _mm_add_epi32(in0, in1); |
+ in0 = _mm_unpacklo_epi32(sum, u0); |
+ in1 = _mm_unpackhi_epi32(sum, u0); |
+ |
+ // Fold the upper 8 bytes onto the lower ones; lane 0 then holds the total. |
+ sum = _mm_add_epi32(in0, in1); |
+ in0 = _mm_srli_si128(sum, 8); |
+ |
+ in1 = _mm_add_epi32(sum, in0); |
+ _mm_store_si128((__m128i *)(output), in1); |
+} |
+ |
void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) { |
int pass; |
// Constants |
@@ -1168,6 +1237,74 @@ |
} |
} |
+// Adds up the 256 samples of the 16x16 block at `input` (rows are `stride` |
+// int16 elements apart) and stores that sum, arithmetically shifted right by |
+// one, in the low 32 bits at `output`. |
+// NOTE(review): loads and the final store are the aligned variants, so each |
+// row address and `output` must be 16-byte aligned. The store covers |
+// output[0..7] - confirm callers read only output[0]. |
+void vp9_fdct16x16_1_sse2(const int16_t *input, int16_t *output, int stride) { |
+ __m128i in0, in1, in2, in3; |
+ __m128i u0, u1; |
+ __m128i sum = _mm_setzero_si128(); |
+ int i; |
+ |
+ // Two passes, one per 8-column half of the block. Each pass reads 16 rows. |
+ for (i = 0; i < 2; ++i) { |
+ // i == 0 leaves `input` unchanged; i == 1 advances it to columns 8..15. |
+ input += 8 * i; |
+ in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); |
+ in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); |
+ in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); |
+ in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); |
+ |
+ // `sum` collects totals in eight wrapping 16-bit lanes. The adds of u0/u1 |
+ // are interleaved with the loads of the next four rows. |
+ u0 = _mm_add_epi16(in0, in1); |
+ u1 = _mm_add_epi16(in2, in3); |
+ sum = _mm_add_epi16(sum, u0); |
+ |
+ in0 = _mm_load_si128((const __m128i *)(input + 4 * stride)); |
+ in1 = _mm_load_si128((const __m128i *)(input + 5 * stride)); |
+ in2 = _mm_load_si128((const __m128i *)(input + 6 * stride)); |
+ in3 = _mm_load_si128((const __m128i *)(input + 7 * stride)); |
+ |
+ sum = _mm_add_epi16(sum, u1); |
+ u0 = _mm_add_epi16(in0, in1); |
+ u1 = _mm_add_epi16(in2, in3); |
+ sum = _mm_add_epi16(sum, u0); |
+ |
+ in0 = _mm_load_si128((const __m128i *)(input + 8 * stride)); |
+ in1 = _mm_load_si128((const __m128i *)(input + 9 * stride)); |
+ in2 = _mm_load_si128((const __m128i *)(input + 10 * stride)); |
+ in3 = _mm_load_si128((const __m128i *)(input + 11 * stride)); |
+ |
+ sum = _mm_add_epi16(sum, u1); |
+ u0 = _mm_add_epi16(in0, in1); |
+ u1 = _mm_add_epi16(in2, in3); |
+ sum = _mm_add_epi16(sum, u0); |
+ |
+ in0 = _mm_load_si128((const __m128i *)(input + 12 * stride)); |
+ in1 = _mm_load_si128((const __m128i *)(input + 13 * stride)); |
+ in2 = _mm_load_si128((const __m128i *)(input + 14 * stride)); |
+ in3 = _mm_load_si128((const __m128i *)(input + 15 * stride)); |
+ |
+ sum = _mm_add_epi16(sum, u1); |
+ u0 = _mm_add_epi16(in0, in1); |
+ u1 = _mm_add_epi16(in2, in3); |
+ sum = _mm_add_epi16(sum, u0); |
+ |
+ sum = _mm_add_epi16(sum, u1); |
+ } |
+ |
+ // u0 is reused as the all-zero vector. Sign-extend the 16-bit lanes to 32 |
+ // bits: value in the upper half of each lane, then arithmetic shift by 16. |
+ u0 = _mm_setzero_si128(); |
+ in0 = _mm_unpacklo_epi16(u0, sum); |
+ in1 = _mm_unpackhi_epi16(u0, sum); |
+ in0 = _mm_srai_epi32(in0, 16); |
+ in1 = _mm_srai_epi32(in1, 16); |
+ |
+ // Horizontal reduction: four 32-bit lanes down to two. |
+ sum = _mm_add_epi32(in0, in1); |
+ in0 = _mm_unpacklo_epi32(sum, u0); |
+ in1 = _mm_unpackhi_epi32(sum, u0); |
+ |
+ // Fold the upper 8 bytes onto the lower ones; lane 0 then holds the total. |
+ sum = _mm_add_epi32(in0, in1); |
+ in0 = _mm_srli_si128(sum, 8); |
+ |
+ in1 = _mm_add_epi32(sum, in0); |
+ // Scale: arithmetic shift right by 1 on every 32-bit lane. |
+ in1 = _mm_srai_epi32(in1, 1); |
+ _mm_store_si128((__m128i *)(output), in1); |
+} |
+ |
void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { |
// The 2D transform is done with two passes which are actually pretty |
// similar. In the first one, we transform the columns and transpose |
@@ -1187,7 +1324,7 @@ |
const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); |
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); |
- const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); |
+ const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64); |
const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); |
const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); |
@@ -1513,8 +1650,8 @@ |
const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); |
const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24); |
const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24); |
- const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08); |
- const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08); |
+ const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p24_p08); |
+ const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p24_p08); |
// dct_const_round_shift |
const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); |
const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); |
@@ -1535,8 +1672,8 @@ |
const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); |
const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08); |
const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08); |
- const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24); |
- const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m08_p24); |
+ const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p08_m24); |
+ const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p08_m24); |
// dct_const_round_shift |
const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); |
const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); |
@@ -1554,10 +1691,10 @@ |
{ |
step1_0 = _mm_add_epi16(step3_0, step2_1); |
step1_1 = _mm_sub_epi16(step3_0, step2_1); |
- step1_2 = _mm_sub_epi16(step3_3, step2_2); |
- step1_3 = _mm_add_epi16(step3_3, step2_2); |
- step1_4 = _mm_add_epi16(step3_4, step2_5); |
- step1_5 = _mm_sub_epi16(step3_4, step2_5); |
+ step1_2 = _mm_add_epi16(step3_3, step2_2); |
+ step1_3 = _mm_sub_epi16(step3_3, step2_2); |
+ step1_4 = _mm_sub_epi16(step3_4, step2_5); |
+ step1_5 = _mm_add_epi16(step3_4, step2_5); |
step1_6 = _mm_sub_epi16(step3_7, step2_6); |
step1_7 = _mm_add_epi16(step3_7, step2_6); |
} |
@@ -1848,7 +1985,7 @@ |
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); |
const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); |
- const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); |
+ const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64); |
const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); |
const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); |
@@ -2052,10 +2189,10 @@ |
v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24); |
v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24); |
- v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08); |
- v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08); |
- v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24); |
- v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24); |
+ v[2] = _mm_madd_epi16(u[2], k__cospi_p24_p08); |
+ v[3] = _mm_madd_epi16(u[3], k__cospi_p24_p08); |
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p08_m24); |
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p08_m24); |
v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08); |
v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08); |
@@ -2085,10 +2222,10 @@ |
// stage 5 |
s[0] = _mm_add_epi16(p[0], t[1]); |
s[1] = _mm_sub_epi16(p[0], t[1]); |
- s[2] = _mm_sub_epi16(p[3], t[2]); |
- s[3] = _mm_add_epi16(p[3], t[2]); |
- s[4] = _mm_add_epi16(p[4], t[5]); |
- s[5] = _mm_sub_epi16(p[4], t[5]); |
+ s[2] = _mm_add_epi16(p[3], t[2]); |
+ s[3] = _mm_sub_epi16(p[3], t[2]); |
+ s[4] = _mm_sub_epi16(p[4], t[5]); |
+ s[5] = _mm_add_epi16(p[4], t[5]); |
s[6] = _mm_sub_epi16(p[7], t[6]); |
s[7] = _mm_add_epi16(p[7], t[6]); |
@@ -2680,6 +2817,77 @@ |
} |
} |
+// Adds up the 1024 samples of the 32x32 block at `input` (rows are `stride` |
+// int16 elements apart) and stores that sum, arithmetically shifted right by |
+// three, in the low 32 bits at `output`. |
+// NOTE(review): totals are first collected in eight wrapping 16-bit lanes, |
+// 128 samples per lane, so this relies on the sample range being small enough |
+// not to wrap - confirm against the callers. Loads and the final store are |
+// the aligned variants (16-byte alignment required), and the store covers |
+// output[0..7]. |
+void vp9_fdct32x32_1_sse2(const int16_t *input, int16_t *output, int stride) { |
+ __m128i in0, in1, in2, in3; |
+ __m128i u0, u1; |
+ __m128i sum = _mm_setzero_si128(); |
+ int i; |
+ |
+ // Eight iterations of four rows each cover all 32 rows. A row is read as |
+ // four 8-sample vectors at column offsets 0, 8, 16 and 24, after which |
+ // `input` steps down one row. |
+ for (i = 0; i < 8; ++i) { |
+ in0 = _mm_load_si128((const __m128i *)(input + 0)); |
+ in1 = _mm_load_si128((const __m128i *)(input + 8)); |
+ in2 = _mm_load_si128((const __m128i *)(input + 16)); |
+ in3 = _mm_load_si128((const __m128i *)(input + 24)); |
+ |
+ input += stride; |
+ // The adds of u0/u1 are interleaved with the loads of the next row. |
+ u0 = _mm_add_epi16(in0, in1); |
+ u1 = _mm_add_epi16(in2, in3); |
+ sum = _mm_add_epi16(sum, u0); |
+ |
+ in0 = _mm_load_si128((const __m128i *)(input + 0)); |
+ in1 = _mm_load_si128((const __m128i *)(input + 8)); |
+ in2 = _mm_load_si128((const __m128i *)(input + 16)); |
+ in3 = _mm_load_si128((const __m128i *)(input + 24)); |
+ |
+ input += stride; |
+ sum = _mm_add_epi16(sum, u1); |
+ u0 = _mm_add_epi16(in0, in1); |
+ u1 = _mm_add_epi16(in2, in3); |
+ sum = _mm_add_epi16(sum, u0); |
+ |
+ in0 = _mm_load_si128((const __m128i *)(input + 0)); |
+ in1 = _mm_load_si128((const __m128i *)(input + 8)); |
+ in2 = _mm_load_si128((const __m128i *)(input + 16)); |
+ in3 = _mm_load_si128((const __m128i *)(input + 24)); |
+ |
+ input += stride; |
+ sum = _mm_add_epi16(sum, u1); |
+ u0 = _mm_add_epi16(in0, in1); |
+ u1 = _mm_add_epi16(in2, in3); |
+ sum = _mm_add_epi16(sum, u0); |
+ |
+ in0 = _mm_load_si128((const __m128i *)(input + 0)); |
+ in1 = _mm_load_si128((const __m128i *)(input + 8)); |
+ in2 = _mm_load_si128((const __m128i *)(input + 16)); |
+ in3 = _mm_load_si128((const __m128i *)(input + 24)); |
+ |
+ input += stride; |
+ sum = _mm_add_epi16(sum, u1); |
+ u0 = _mm_add_epi16(in0, in1); |
+ u1 = _mm_add_epi16(in2, in3); |
+ sum = _mm_add_epi16(sum, u0); |
+ |
+ sum = _mm_add_epi16(sum, u1); |
+ } |
+ |
+ // u0 is reused as the all-zero vector. Sign-extend the 16-bit lanes to 32 |
+ // bits: value in the upper half of each lane, then arithmetic shift by 16. |
+ u0 = _mm_setzero_si128(); |
+ in0 = _mm_unpacklo_epi16(u0, sum); |
+ in1 = _mm_unpackhi_epi16(u0, sum); |
+ in0 = _mm_srai_epi32(in0, 16); |
+ in1 = _mm_srai_epi32(in1, 16); |
+ |
+ // Horizontal reduction: four 32-bit lanes down to two. |
+ sum = _mm_add_epi32(in0, in1); |
+ in0 = _mm_unpacklo_epi32(sum, u0); |
+ in1 = _mm_unpackhi_epi32(sum, u0); |
+ |
+ // Fold the upper 8 bytes onto the lower ones; lane 0 then holds the total. |
+ sum = _mm_add_epi32(in0, in1); |
+ in0 = _mm_srli_si128(sum, 8); |
+ |
+ in1 = _mm_add_epi32(sum, in0); |
+ // Scale: arithmetic shift right by 3 on every 32-bit lane. |
+ in1 = _mm_srai_epi32(in1, 3); |
+ _mm_store_si128((__m128i *)(output), in1); |
+} |
+ |
#define FDCT32x32_2D vp9_fdct32x32_rd_sse2 |
#define FDCT32x32_HIGH_PRECISION 0 |
#include "vp9/encoder/x86/vp9_dct32x32_sse2.c" |