Index: source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
===================================================================
--- source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c (revision 232232)
+++ source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c (working copy)
@@ -12,14 +12,13 @@
 #include "vp9/common/vp9_idct.h"  // for cospi constants
 #include "vpx_ports/mem.h"
 
-void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
+void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
   // The 2D transform is done with two passes which are actually pretty
   // similar. In the first one, we transform the columns and transpose
   // the results. In the second one, we transform the rows. To achieve that,
   // as the first pass results are transposed, we transpose the columns (that
   // is the transposed rows) and transpose the results (so that it goes back
   // in normal/row positions).
-  const int stride = pitch >> 1;
   int pass;
   // Constants
   // When we use them, in one case, they are all the same. In all others
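The signature change above also changes units: the old pitch argument counted bytes (hence the dropped "pitch >> 1" to recover an int16_t element count), while the new stride counts elements directly. A sketch of the call-site difference, using a hypothetical caller that is not part of this patch:

    int16_t src[4 * 4];    /* 4x4 residual block, 4 int16_t per row */
    int16_t coeff[4 * 4];

    /* Old API: pitch in bytes, so a 4-element row meant pitch 8. */
    /* vp9_short_fdct4x4_sse2(src, coeff, 4 * sizeof(int16_t)); */

    /* New API: stride in int16_t elements. */
    vp9_fdct4x4_sse2(src, coeff, 4);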
@@ -112,12 +111,8 @@
   }
 }
 
-void vp9_short_fdct8x4_sse2(int16_t *input, int16_t *output, int pitch) {
-  vp9_short_fdct4x4_sse2(input, output, pitch);
-  vp9_short_fdct4x4_sse2(input + 4, output + 16, pitch);
-}
-
-static INLINE void load_buffer_4x4(int16_t *input, __m128i *in, int stride) {
+static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
+                                   int stride) {
   const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
   const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
   __m128i mask;
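The deleted vp9_short_fdct8x4_sse2 was a thin wrapper: two 4x4 transforms over horizontally adjacent sub-blocks, with the right block's 16 coefficients written after the left block's. A caller that still wanted that behavior could recompose it on the new API, roughly as below (hypothetical helper, no longer present in the file):

    static void fdct8x4_pair(const int16_t *input, int16_t *output,
                             int stride) {
      vp9_fdct4x4_sse2(input, output, stride);           /* left 4x4 */
      vp9_fdct4x4_sse2(input + 4, output + 16, stride);  /* right 4x4 */
    }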
@@ -171,22 +166,21 @@
 void fdct4_1d_sse2(__m128i *in) {
   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
 
   __m128i u[4], v[4];
-  u[0] = _mm_add_epi16(in[0], in[3]);
-  u[1] = _mm_add_epi16(in[1], in[2]);
-  u[2] = _mm_sub_epi16(in[1], in[2]);
-  u[3] = _mm_sub_epi16(in[0], in[3]);
+  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+  u[1] = _mm_unpacklo_epi16(in[3], in[2]);
 
-  v[0] = _mm_unpacklo_epi16(u[0], u[1]);
-  v[1] = _mm_unpacklo_epi16(u[2], u[3]);
+  v[0] = _mm_add_epi16(u[0], u[1]);
+  v[1] = _mm_sub_epi16(u[0], u[1]);
+
   u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);  // 0
   u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16);  // 2
-  u[2] = _mm_madd_epi16(v[1], k__cospi_p24_p08);  // 1
-  u[3] = _mm_madd_epi16(v[1], k__cospi_m08_p24);  // 3
+  u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24);  // 1
+  u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08);  // 3
 
   v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
   v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
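Why the constants swap in this hunk: _mm_madd_epi16 multiplies adjacent 16-bit pairs and sums them into 32-bit lanes, so each lane computes lo*c0 + hi*c1. The old code ran the add/sub butterfly on whole rows and then interleaved; the new code interleaves first, pairing row 0 with row 1 and row 3 with row 2, so v[0] = u[0] + u[1] holds (x0 + x3, x1 + x2) pairs and v[1] = u[0] - u[1] holds (x0 - x3, x1 - x2) pairs. Relative to the old packing the two difference terms trade places within each lane, so the cosine pairs trade places too: (cospi_8, cospi_24) now produces output 1 and (cospi_24, -cospi_8) output 3. Same math, two fewer vector ops. A scalar model of one output column, using the cospi_* and DCT_CONST_* constants from vp9_idct.h (sketch only; the SSE2 code does four columns at once):

    static void fdct4_col_model(const int16_t x[4], int16_t out[4]) {
      const int s0 = x[0] + x[3], s1 = x[1] + x[2];  /* v[0] lane pair */
      const int d0 = x[0] - x[3], d1 = x[1] - x[2];  /* v[1] lane pair */
      out[0] = (int16_t)((s0 * cospi_16_64 + s1 * cospi_16_64 +
                          DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
      out[2] = (int16_t)((s0 * cospi_16_64 - s1 * cospi_16_64 +
                          DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
      out[1] = (int16_t)((d0 * cospi_8_64 + d1 * cospi_24_64 +
                          DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
      out[3] = (int16_t)((d0 * cospi_24_64 - d1 * cospi_8_64 +
                          DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
    }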
@@ -249,7 +243,7 @@
   transpose_4x4(in);
 }
 
-void vp9_short_fht4x4_sse2(int16_t *input, int16_t *output,
+void vp9_short_fht4x4_sse2(const int16_t *input, int16_t *output,
                            int stride, int tx_type) {
   __m128i in[4];
   load_buffer_4x4(input, in, stride);
@@ -277,8 +271,7 @@
   write_buffer_4x4(output, in);
 }
 
-void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) {
-  const int stride = pitch >> 1;
+void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {
   int pass;
   // Constants
   // When we use them, in one case, they are all the same. In all others
@@ -535,15 +528,16 @@
 }
 
 // load 8x8 array
-static INLINE void load_buffer_8x8(int16_t *input, __m128i *in, int stride) {
-  in[0] = _mm_load_si128((__m128i *)(input + 0 * stride));
-  in[1] = _mm_load_si128((__m128i *)(input + 1 * stride));
-  in[2] = _mm_load_si128((__m128i *)(input + 2 * stride));
-  in[3] = _mm_load_si128((__m128i *)(input + 3 * stride));
-  in[4] = _mm_load_si128((__m128i *)(input + 4 * stride));
-  in[5] = _mm_load_si128((__m128i *)(input + 5 * stride));
-  in[6] = _mm_load_si128((__m128i *)(input + 6 * stride));
-  in[7] = _mm_load_si128((__m128i *)(input + 7 * stride));
+static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
+                                   int stride) {
+  in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+  in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+  in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+  in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+  in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+  in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+  in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+  in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
 
   in[0] = _mm_slli_epi16(in[0], 2);
   in[1] = _mm_slli_epi16(in[1], 2);
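Two details in the rewritten loader: _mm_load_si128 already takes a const __m128i pointer, so once input is const-qualified the casts must be const as well; and the _mm_slli_epi16 lines that follow pre-scale every sample by 4 (<< 2), matching the input scaling the C forward 8x8 transform applies. A loop-form sketch equivalent to the unrolled sequence (illustration only, hypothetical name; note _mm_load_si128 requires 16-byte-aligned rows):

    static INLINE void load_scaled_rows(const int16_t *input, __m128i *in,
                                        int rows, int stride) {
      int i;
      for (i = 0; i < rows; ++i) {
        in[i] = _mm_load_si128((const __m128i *)(input + i * stride));
        in[i] = _mm_slli_epi16(in[i], 2);  /* samples *= 4 */
      }
    }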
@@ -1033,7 +1027,7 @@
   array_transpose_8x8(in, in);
 }
 
-void vp9_short_fht8x8_sse2(int16_t *input, int16_t *output,
+void vp9_short_fht8x8_sse2(const int16_t *input, int16_t *output,
                            int stride, int tx_type) {
   __m128i in[8];
   load_buffer_8x8(input, in, stride);
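The fht ("forward hybrid transform") entry points take tx_type, which selects DCT or ADST for each direction of the 2D transform. The bodies fall outside the hunk context; their general shape, assuming the 8-point helpers follow the fdct4_1d_sse2 naming seen earlier and with TX_TYPE values from vp9_enums.h, is roughly:

    __m128i in[8];
    load_buffer_8x8(input, in, stride);
    switch (tx_type) {
      case DCT_DCT:           /* DCT in both directions */
        fdct8_1d_sse2(in);    /* first pass */
        fdct8_1d_sse2(in);    /* second pass; data is transposed between */
        break;
      /* ADST_DCT, DCT_ADST, ADST_ADST substitute an 8-point ADST
       * (fadst8_1d_sse2, assumed name) for one or both passes. */
      default:
        break;
    }
    write_buffer_8x8(output, in, 8);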
@@ -1062,18 +1056,17 @@
   write_buffer_8x8(output, in, 8);
 }
 
-void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
+void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
   // The 2D transform is done with two passes which are actually pretty
   // similar. In the first one, we transform the columns and transpose
   // the results. In the second one, we transform the rows. To achieve that,
   // as the first pass results are transposed, we transpose the columns (that
   // is the transposed rows) and transpose the results (so that it goes back
   // in normal/row positions).
-  const int stride = pitch >> 1;
   int pass;
   // We need an intermediate buffer between passes.
   DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);
-  int16_t *in = input;
+  const int16_t *in = input;
   int16_t *out = intermediate;
   // Constants
   // When we use them, in one case, they are all the same. In all others
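As the comment block says, the 16x16 transform runs in two passes over a 256-entry (16x16) intermediate buffer; DECLARE_ALIGNED_ARRAY(16, ...) keeps that buffer 16-byte aligned, so pass two can read it back with aligned loads at a fixed row stride of 16. With input now const-qualified, only the pass-one read pointer needs the const qualifier, which is why in changes type while out does not. The driver shape the comments describe, as a sketch (the vectorized loop body is elided):

    const int16_t *in = input;    /* pass 0 reads the caller's block */
    int16_t *out = intermediate;
    for (pass = 0; pass < 2; ++pass) {
      /* ...transform the columns of `in` and store the transposed
       * result rows into `out`... */
      if (pass == 0) {
        in = intermediate;        /* pass 1 reads pass 0's output */
        out = output;
      }
    }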
@@ -1688,7 +1681,7 @@
   }
 }
 
-static INLINE void load_buffer_16x16(int16_t* input, __m128i *in0,
+static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0,
                                      __m128i *in1, int stride) {
   // load first 8 columns
   load_buffer_8x8(input, in0, stride);
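This loader composes the 8x8 loader: in0 and in1 are each 16 registers (one per row), holding the left and right 8 columns respectively. Per the "load first 8 columns" comment, the elided remainder of the body presumably issues four 8x8 loads along these lines (sketch, not shown by the hunk):

    load_buffer_8x8(input, in0, stride);                      /* top-left */
    load_buffer_8x8(input + 8 * stride, in0 + 8, stride);     /* bottom-left */
    load_buffer_8x8(input + 8, in1, stride);                  /* top-right */
    load_buffer_8x8(input + 8 * stride + 8, in1 + 8, stride); /* bottom-right */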
@@ -2540,7 +2533,7 @@
   array_transpose_16x16(in0, in1);
 }
 
-void vp9_short_fht16x16_sse2(int16_t *input, int16_t *output,
+void vp9_short_fht16x16_sse2(const int16_t *input, int16_t *output,
                              int stride, int tx_type) {
   __m128i in0[16], in1[16];
   load_buffer_16x16(input, in0, in1, stride);
@@ -2572,13 +2565,13 @@
   write_buffer_16x16(output, in0, in1, 16);
 }
 
-#define FDCT32x32_2D vp9_short_fdct32x32_rd_sse2
+#define FDCT32x32_2D vp9_fdct32x32_rd_sse2
 #define FDCT32x32_HIGH_PRECISION 0
 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c"
 #undef FDCT32x32_2D
 #undef FDCT32x32_HIGH_PRECISION
 
-#define FDCT32x32_2D vp9_short_fdct32x32_sse2
+#define FDCT32x32_2D vp9_fdct32x32_sse2
 #define FDCT32x32_HIGH_PRECISION 1
 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c"  // NOLINT
 #undef FDCT32x32_2D
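The tail of the file stamps two 32x32 transforms out of one template: vp9_dct32x32_sse2.c expects its includer to define FDCT32x32_2D (the name of the function to emit) and FDCT32x32_HIGH_PRECISION (its rounding behavior), so including it twice yields the faster variant used in rate-distortion search (vp9_fdct32x32_rd_sse2, precision 0) and the full-precision one (vp9_fdct32x32_sse2, precision 1); this hunk only renames the emitted functions. A minimal standalone illustration of the include-as-template pattern, with hypothetical file and macro names:

    /* ---- scale_template.inc ---- */
    void SCALE_FN(int16_t *v, int n) {
      int i;
      for (i = 0; i < n; ++i)
        v[i] = HALVE ? (int16_t)(v[i] >> 1) : v[i];
    }

    /* ---- scale.c ---- */
    #define SCALE_FN scale_half
    #define HALVE 1
    #include "scale_template.inc"
    #undef SCALE_FN
    #undef HALVE

    #define SCALE_FN scale_copy
    #define HALVE 0
    #include "scale_template.inc"
    #undef SCALE_FN
    #undef HALVE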