Index: source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c |
=================================================================== |
--- source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c (revision 232232) |
+++ source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c (working copy) |
@@ -12,14 +12,13 @@ |
#include "vp9/common/vp9_idct.h" // for cospi constants |
#include "vpx_ports/mem.h" |
-void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int pitch) { |
+void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) { |
// The 2D transform is done with two passes which are actually pretty |
// similar. In the first one, we transform the columns and transpose |
// the results. In the second one, we transform the rows. To achieve that, |
// as the first pass results are transposed, we tranpose the columns (that |
// is the transposed rows) and transpose the results (so that it goes back |
// in normal/row positions). |
- const int stride = pitch >> 1; |
int pass; |
// Constants |
// When we use them, in one case, they are all the same. In all others |
@@ -112,12 +111,8 @@ |
} |
} |
-void vp9_short_fdct8x4_sse2(int16_t *input, int16_t *output, int pitch) { |
- vp9_short_fdct4x4_sse2(input, output, pitch); |
- vp9_short_fdct4x4_sse2(input + 4, output + 16, pitch); |
-} |
- |
-static INLINE void load_buffer_4x4(int16_t *input, __m128i *in, int stride) { |
+static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in, |
+ int stride) { |
const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); |
const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); |
__m128i mask; |
@@ -171,22 +166,21 @@ |
void fdct4_1d_sse2(__m128i *in) { |
const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); |
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
- const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); |
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); |
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); |
__m128i u[4], v[4]; |
- u[0] = _mm_add_epi16(in[0], in[3]); |
- u[1] = _mm_add_epi16(in[1], in[2]); |
- u[2] = _mm_sub_epi16(in[1], in[2]); |
- u[3] = _mm_sub_epi16(in[0], in[3]); |
+ u[0]=_mm_unpacklo_epi16(in[0], in[1]); |
+ u[1]=_mm_unpacklo_epi16(in[3], in[2]); |
- v[0] = _mm_unpacklo_epi16(u[0], u[1]); |
- v[1] = _mm_unpacklo_epi16(u[2], u[3]); |
+ v[0] = _mm_add_epi16(u[0], u[1]); |
+ v[1] = _mm_sub_epi16(u[0], u[1]); |
+ |
u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0 |
u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2 |
- u[2] = _mm_madd_epi16(v[1], k__cospi_p24_p08); // 1 |
- u[3] = _mm_madd_epi16(v[1], k__cospi_m08_p24); // 3 |
+ u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24); // 1 |
+ u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08); // 3 |
v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); |
v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); |
@@ -249,7 +243,7 @@ |
transpose_4x4(in); |
} |
-void vp9_short_fht4x4_sse2(int16_t *input, int16_t *output, |
+void vp9_short_fht4x4_sse2(const int16_t *input, int16_t *output, |
int stride, int tx_type) { |
__m128i in[4]; |
load_buffer_4x4(input, in, stride); |
@@ -277,8 +271,7 @@ |
write_buffer_4x4(output, in); |
} |
-void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) { |
- const int stride = pitch >> 1; |
+void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) { |
int pass; |
// Constants |
// When we use them, in one case, they are all the same. In all others |
@@ -535,15 +528,16 @@ |
} |
// load 8x8 array |
-static INLINE void load_buffer_8x8(int16_t *input, __m128i *in, int stride) { |
- in[0] = _mm_load_si128((__m128i *)(input + 0 * stride)); |
- in[1] = _mm_load_si128((__m128i *)(input + 1 * stride)); |
- in[2] = _mm_load_si128((__m128i *)(input + 2 * stride)); |
- in[3] = _mm_load_si128((__m128i *)(input + 3 * stride)); |
- in[4] = _mm_load_si128((__m128i *)(input + 4 * stride)); |
- in[5] = _mm_load_si128((__m128i *)(input + 5 * stride)); |
- in[6] = _mm_load_si128((__m128i *)(input + 6 * stride)); |
- in[7] = _mm_load_si128((__m128i *)(input + 7 * stride)); |
+static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in, |
+ int stride) { |
+ in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride)); |
+ in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride)); |
+ in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride)); |
+ in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride)); |
+ in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride)); |
+ in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride)); |
+ in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride)); |
+ in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride)); |
in[0] = _mm_slli_epi16(in[0], 2); |
in[1] = _mm_slli_epi16(in[1], 2); |
@@ -1033,7 +1027,7 @@ |
array_transpose_8x8(in, in); |
} |
-void vp9_short_fht8x8_sse2(int16_t *input, int16_t *output, |
+void vp9_short_fht8x8_sse2(const int16_t *input, int16_t *output, |
int stride, int tx_type) { |
__m128i in[8]; |
load_buffer_8x8(input, in, stride); |
@@ -1062,18 +1056,17 @@ |
write_buffer_8x8(output, in, 8); |
} |
-void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) { |
+void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { |
// The 2D transform is done with two passes which are actually pretty |
// similar. In the first one, we transform the columns and transpose |
// the results. In the second one, we transform the rows. To achieve that, |
// as the first pass results are transposed, we tranpose the columns (that |
// is the transposed rows) and transpose the results (so that it goes back |
// in normal/row positions). |
- const int stride = pitch >> 1; |
int pass; |
// We need an intermediate buffer between passes. |
DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256); |
- int16_t *in = input; |
+ const int16_t *in = input; |
int16_t *out = intermediate; |
// Constants |
// When we use them, in one case, they are all the same. In all others |
@@ -1688,7 +1681,7 @@ |
} |
} |
-static INLINE void load_buffer_16x16(int16_t* input, __m128i *in0, |
+static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0, |
__m128i *in1, int stride) { |
// load first 8 columns |
load_buffer_8x8(input, in0, stride); |
@@ -2540,7 +2533,7 @@ |
array_transpose_16x16(in0, in1); |
} |
-void vp9_short_fht16x16_sse2(int16_t *input, int16_t *output, |
+void vp9_short_fht16x16_sse2(const int16_t *input, int16_t *output, |
int stride, int tx_type) { |
__m128i in0[16], in1[16]; |
load_buffer_16x16(input, in0, in1, stride); |
@@ -2572,13 +2565,13 @@ |
write_buffer_16x16(output, in0, in1, 16); |
} |
-#define FDCT32x32_2D vp9_short_fdct32x32_rd_sse2 |
+#define FDCT32x32_2D vp9_fdct32x32_rd_sse2 |
#define FDCT32x32_HIGH_PRECISION 0 |
#include "vp9/encoder/x86/vp9_dct32x32_sse2.c" |
#undef FDCT32x32_2D |
#undef FDCT32x32_HIGH_PRECISION |
-#define FDCT32x32_2D vp9_short_fdct32x32_sse2 |
+#define FDCT32x32_2D vp9_fdct32x32_sse2 |
#define FDCT32x32_HIGH_PRECISION 1 |
#include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT |
#undef FDCT32x32_2D |