Chromium Code Reviews

Unified Diff: source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c

Issue 54923004: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 7 years, 2 months ago
Index: source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
===================================================================
--- source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c (revision 232232)
+++ source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c (working copy)
@@ -12,14 +12,13 @@
#include "vp9/common/vp9_idct.h" // for cospi constants
#include "vpx_ports/mem.h"
-void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
+void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
// the results. In the second one, we transform the rows. To achieve that,
// as the first pass results are transposed, we transpose the columns (that
// is the transposed rows) and transpose the results (so that it goes back
// in normal/row positions).
- const int stride = pitch >> 1;
int pass;
// Constants
// When we use them, in one case, they are all the same. In all others
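A note on the signature change in this first hunk: the old vp9_short_fdct4x4_sse2 took a pitch that was twice the element stride (hence the dropped const int stride = pitch >> 1;), while the new vp9_fdct4x4_sse2 takes the int16_t element stride directly and const-qualifies the input. A minimal caller sketch; the buffer names and the stride value 4 are illustrative only, the prototype is the one from the hunk:

#include <stdint.h>

void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride);

/* Hypothetical caller showing the pitch -> stride convention change. */
static void fdct4x4_caller_example(const int16_t src_diff[4 * 4], int16_t coeff[4 * 4]) {
  /* before: vp9_short_fdct4x4_sse2(src_diff, coeff, 8);   pitch == 2 * stride */
  vp9_fdct4x4_sse2(src_diff, coeff, 4);                  /* stride in int16_t elements */
}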
@@ -112,12 +111,8 @@
}
}
-void vp9_short_fdct8x4_sse2(int16_t *input, int16_t *output, int pitch) {
- vp9_short_fdct4x4_sse2(input, output, pitch);
- vp9_short_fdct4x4_sse2(input + 4, output + 16, pitch);
-}
-
-static INLINE void load_buffer_4x4(int16_t *input, __m128i *in, int stride) {
+static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
+ int stride) {
const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
__m128i mask;
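The deleted vp9_short_fdct8x4_sse2 above was only a wrapper that ran the 4x4 transform twice, on columns 0-3 and 4-7. A caller that still wants that shape can reproduce it with the renamed entry point; a hedged sketch, assuming the caller already has the element stride rather than the old pitch:

#include <stdint.h>

void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride);

/* Illustrative stand-in for the removed 8x4 wrapper. */
static void fdct8x4_example(const int16_t *input, int16_t *output, int stride) {
  vp9_fdct4x4_sse2(input, output, stride);           /* left 4x4 block  */
  vp9_fdct4x4_sse2(input + 4, output + 16, stride);  /* right 4x4 block */
}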
@@ -171,22 +166,21 @@
void fdct4_1d_sse2(__m128i *in) {
const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
__m128i u[4], v[4];
- u[0] = _mm_add_epi16(in[0], in[3]);
- u[1] = _mm_add_epi16(in[1], in[2]);
- u[2] = _mm_sub_epi16(in[1], in[2]);
- u[3] = _mm_sub_epi16(in[0], in[3]);
+ u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+ u[1] = _mm_unpacklo_epi16(in[3], in[2]);
- v[0] = _mm_unpacklo_epi16(u[0], u[1]);
- v[1] = _mm_unpacklo_epi16(u[2], u[3]);
+ v[0] = _mm_add_epi16(u[0], u[1]);
+ v[1] = _mm_sub_epi16(u[0], u[1]);
+
u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0
u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2
- u[2] = _mm_madd_epi16(v[1], k__cospi_p24_p08); // 1
- u[3] = _mm_madd_epi16(v[1], k__cospi_m08_p24); // 3
+ u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24); // 1
+ u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08); // 3
v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
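The reshuffle in fdct4_1d_sse2 above is value-preserving: the old code formed the butterfly sums/differences first and then interleaved them as (in[1] - in[2], in[0] - in[3]), while the new code interleaves (in[0], in[1]) with (in[3], in[2]) first, so each difference pair comes out in the opposite order and the cospi constants are simply paired the other way round. A scalar sketch of the two odd outputs, using the cospi constants the file already pulls in from vp9_idct.h, just to show both orderings expand to the same expressions (rounding and the final shift are omitted):

#include <stdint.h>
#include "vp9/common/vp9_idct.h"  /* cospi_8_64, cospi_24_64 */

/* Scalar illustration only; the SSE2 code forms these with _mm_madd_epi16. */
static void fdct4_odd_outputs_example(const int16_t in[4], int32_t out[2]) {
  /* old pairing: cospi_24_64 * (in[1] - in[2]) + cospi_8_64  * (in[0] - in[3]) */
  /* new pairing: cospi_8_64  * (in[0] - in[3]) + cospi_24_64 * (in[1] - in[2]) */
  out[0] = cospi_8_64  * (in[0] - in[3]) + cospi_24_64 * (in[1] - in[2]);  /* output 1 */
  out[1] = cospi_24_64 * (in[0] - in[3]) - cospi_8_64  * (in[1] - in[2]);  /* output 3 */
}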
@@ -249,7 +243,7 @@
transpose_4x4(in);
}
-void vp9_short_fht4x4_sse2(int16_t *input, int16_t *output,
+void vp9_short_fht4x4_sse2(const int16_t *input, int16_t *output,
int stride, int tx_type) {
__m128i in[4];
load_buffer_4x4(input, in, stride);
@@ -277,8 +271,7 @@
write_buffer_4x4(output, in);
}
-void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) {
- const int stride = pitch >> 1;
+void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {
int pass;
// Constants
// When we use them, in one case, they are all the same. In all others
@@ -535,15 +528,16 @@
}
// load 8x8 array
-static INLINE void load_buffer_8x8(int16_t *input, __m128i *in, int stride) {
- in[0] = _mm_load_si128((__m128i *)(input + 0 * stride));
- in[1] = _mm_load_si128((__m128i *)(input + 1 * stride));
- in[2] = _mm_load_si128((__m128i *)(input + 2 * stride));
- in[3] = _mm_load_si128((__m128i *)(input + 3 * stride));
- in[4] = _mm_load_si128((__m128i *)(input + 4 * stride));
- in[5] = _mm_load_si128((__m128i *)(input + 5 * stride));
- in[6] = _mm_load_si128((__m128i *)(input + 6 * stride));
- in[7] = _mm_load_si128((__m128i *)(input + 7 * stride));
+static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
+ int stride) {
+ in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
in[0] = _mm_slli_epi16(in[0], 2);
in[1] = _mm_slli_epi16(in[1], 2);
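The load_buffer_8x8 change above is const-correctness plumbing: _mm_load_si128 takes a __m128i const * anyway, so once the input pointer is const the casts follow. The unchanged shifts in the context lines pre-scale each loaded row by 4 (<< 2) before the forward transform. A one-row sketch of the same load-and-scale pattern, with the function name being illustrative:

#include <emmintrin.h>
#include <stdint.h>

/* Illustrative: load one 16-byte-aligned row of eight int16_t and pre-scale by 4. */
static __m128i load_row_x4_example(const int16_t *input, int row, int stride) {
  const __m128i r = _mm_load_si128((const __m128i *)(input + row * stride));
  return _mm_slli_epi16(r, 2);
}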
@@ -1033,7 +1027,7 @@
array_transpose_8x8(in, in);
}
-void vp9_short_fht8x8_sse2(int16_t *input, int16_t *output,
+void vp9_short_fht8x8_sse2(const int16_t *input, int16_t *output,
int stride, int tx_type) {
__m128i in[8];
load_buffer_8x8(input, in, stride);
@@ -1062,18 +1056,17 @@
write_buffer_8x8(output, in, 8);
}
-void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
+void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
// the results. In the second one, we transform the rows. To achieve that,
// as the first pass results are transposed, we transpose the columns (that
// is the transposed rows) and transpose the results (so that it goes back
// in normal/row positions).
- const int stride = pitch >> 1;
int pass;
// We need an intermediate buffer between passes.
DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);
- int16_t *in = input;
+ const int16_t *in = input;
int16_t *out = intermediate;
// Constants
// When we use them, in one case, they are all the same. In all others
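As the comment block in this hunk repeats, the 16x16 forward transform is two 1-D passes with the DECLARE_ALIGNED_ARRAY intermediate buffer in between; the only functional change here is that pass 0 now reads through a const int16_t *in. A rough outline of that control flow, with the per-pass column transform elided and only the pointer/stride bookkeeping taken from the hunk:

#include <stdint.h>

/* Skeleton only; the real per-pass column transform is elided. */
static void fdct16x16_outline_example(const int16_t *input, int16_t *output, int stride) {
  int16_t intermediate[16 * 16];      /* buffer between the two passes            */
  const int16_t *in = input;          /* pass 0 reads the caller's const buffer   */
  int16_t *out = intermediate;        /* ... and writes transposed columns here   */
  /* ... pass 0: 1-D column transform of in (using stride) into out ...           */
  in = intermediate;                  /* pass 1 rereads the intermediate rows     */
  out = output;                       /* ... and writes the final coefficients    */
  /* ... pass 1: 1-D column transform of in (stride 16) into out ...              */
  (void)in; (void)out; (void)stride;
}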
@@ -1688,7 +1681,7 @@
}
}
-static INLINE void load_buffer_16x16(int16_t* input, __m128i *in0,
+static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0,
__m128i *in1, int stride) {
// load first 8 columns
load_buffer_8x8(input, in0, stride);
@@ -2540,7 +2533,7 @@
array_transpose_16x16(in0, in1);
}
-void vp9_short_fht16x16_sse2(int16_t *input, int16_t *output,
+void vp9_short_fht16x16_sse2(const int16_t *input, int16_t *output,
int stride, int tx_type) {
__m128i in0[16], in1[16];
load_buffer_16x16(input, in0, in1, stride);
@@ -2572,13 +2565,13 @@
write_buffer_16x16(output, in0, in1, 16);
}
-#define FDCT32x32_2D vp9_short_fdct32x32_rd_sse2
+#define FDCT32x32_2D vp9_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
#include "vp9/encoder/x86/vp9_dct32x32_sse2.c"
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION
-#define FDCT32x32_2D vp9_short_fdct32x32_sse2
+#define FDCT32x32_2D vp9_fdct32x32_sse2
#define FDCT32x32_HIGH_PRECISION 1
#include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT
#undef FDCT32x32_2D
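The final hunk only renames the generated symbols: vp9_dct32x32_sse2.c is still compiled twice as an include-template, with FDCT32x32_2D naming the emitted function and FDCT32x32_HIGH_PRECISION selecting the rounding path, so one body yields both the low-precision rate-distortion variant and the full-precision variant. A single-file approximation of that instantiation pattern, using a function-generating macro and illustrative names:

#include <stdint.h>

/* Shared body instantiated twice; the real template is vp9_dct32x32_sse2.c. */
#define MAKE_FDCT32X32(name, high_precision)                               \
  static void name(const int16_t *input, int16_t *output, int stride) {    \
    (void)input;                                                            \
    (void)stride;                                                           \
    output[0] = (high_precision) ? 1 : 0; /* stand-in for the real body */  \
  }

MAKE_FDCT32X32(fdct32x32_rd_example, 0)  /* cf. vp9_fdct32x32_rd_sse2 */
MAKE_FDCT32X32(fdct32x32_example, 1)     /* cf. vp9_fdct32x32_sse2    */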