Index: source/libvpx/vp9/encoder/vp9_dct.c |
=================================================================== |
--- source/libvpx/vp9/encoder/vp9_dct.c (revision 232232) |
+++ source/libvpx/vp9/encoder/vp9_dct.c (working copy) |
@@ -8,16 +8,19 @@ |
* be found in the AUTHORS file in the root of the source tree. |
*/ |
- |
#include <assert.h> |
#include <math.h> |
+ |
#include "./vpx_config.h" |
-#include "vp9/common/vp9_systemdependent.h" |
+#include "./vp9_rtcd.h" |
#include "vp9/common/vp9_blockd.h" |
#include "vp9/common/vp9_idct.h" |
+#include "vp9/common/vp9_systemdependent.h" |
-static void fdct4_1d(int16_t *input, int16_t *output) { |
+#include "vp9/encoder/vp9_dct.h" |
+ |
+static void fdct4(const int16_t *input, int16_t *output) { |
int16_t step[4]; |
int temp1, temp2; |
@@ -36,18 +39,17 @@ |
output[3] = dct_const_round_shift(temp2); |
} |
-void vp9_short_fdct4x4_c(int16_t *input, int16_t *output, int pitch) { |
+void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) { |
// The 2D transform is done with two passes which are actually pretty |
// similar. In the first one, we transform the columns and transpose |
// the results. In the second one, we transform the rows. To achieve that, |
// as the first pass results are transposed, we tranpose the columns (that |
// is the transposed rows) and transpose the results (so that it goes back |
// in normal/row positions). |
- const int stride = pitch >> 1; |
int pass; |
// We need an intermediate buffer between passes. |
int16_t intermediate[4 * 4]; |
- int16_t *in = input; |
+ const int16_t *in = input; |
int16_t *out = intermediate; |
// Do the two transform/transpose passes |
for (pass = 0; pass < 2; ++pass) { |
@@ -58,10 +60,10 @@ |
for (i = 0; i < 4; ++i) { |
// Load inputs. |
if (0 == pass) { |
- input[0] = in[0 * stride] << 4; |
- input[1] = in[1 * stride] << 4; |
- input[2] = in[2 * stride] << 4; |
- input[3] = in[3 * stride] << 4; |
+ input[0] = in[0 * stride] * 16; |
+ input[1] = in[1 * stride] * 16; |
+ input[2] = in[2 * stride] * 16; |
+ input[3] = in[3 * stride] * 16; |
if (i == 0 && input[0]) { |
input[0] += 1; |
} |
@@ -102,7 +104,7 @@ |
} |
} |
-static void fadst4_1d(int16_t *input, int16_t *output) { |
+static void fadst4(const int16_t *input, int16_t *output) { |
int x0, x1, x2, x3; |
int s0, s1, s2, s3, s4, s5, s6, s7; |
@@ -143,14 +145,14 @@ |
} |
static const transform_2d FHT_4[] = { |
- { fdct4_1d, fdct4_1d }, // DCT_DCT = 0 |
- { fadst4_1d, fdct4_1d }, // ADST_DCT = 1 |
- { fdct4_1d, fadst4_1d }, // DCT_ADST = 2 |
- { fadst4_1d, fadst4_1d } // ADST_ADST = 3 |
+ { fdct4, fdct4 }, // DCT_DCT = 0 |
+ { fadst4, fdct4 }, // ADST_DCT = 1 |
+ { fdct4, fadst4 }, // DCT_ADST = 2 |
+ { fadst4, fadst4 } // ADST_ADST = 3 |
}; |
-void vp9_short_fht4x4_c(int16_t *input, int16_t *output, |
- int pitch, TX_TYPE tx_type) { |
+void vp9_short_fht4x4_c(const int16_t *input, int16_t *output, |
+ int stride, int tx_type) { |
int16_t out[4 * 4]; |
int16_t *outptr = &out[0]; |
int i, j; |
@@ -160,7 +162,7 @@ |
// Columns |
for (i = 0; i < 4; ++i) { |
for (j = 0; j < 4; ++j) |
- temp_in[j] = input[j * pitch + i] << 4; |
+ temp_in[j] = input[j * stride + i] * 16; |
if (i == 0 && temp_in[0]) |
temp_in[0] += 1; |
ht.cols(temp_in, temp_out); |
@@ -178,12 +180,7 @@ |
} |
} |
-void vp9_short_fdct8x4_c(int16_t *input, int16_t *output, int pitch) { |
- vp9_short_fdct4x4_c(input, output, pitch); |
- vp9_short_fdct4x4_c(input + 4, output + 16, pitch); |
-} |
- |
-static void fdct8_1d(int16_t *input, int16_t *output) { |
+static void fdct8(const int16_t *input, int16_t *output) { |
/*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; |
/*needs32*/ int t0, t1, t2, t3; |
/*canbe16*/ int x0, x1, x2, x3; |
@@ -198,7 +195,7 @@ |
s6 = input[1] - input[6]; |
s7 = input[0] - input[7]; |
- // fdct4_1d(step, step); |
+ // fdct4(step, step); |
x0 = s0 + s3; |
x1 = s1 + s2; |
x2 = s1 - s2; |
@@ -235,8 +232,7 @@ |
output[7] = dct_const_round_shift(t3); |
} |
-void vp9_short_fdct8x8_c(int16_t *input, int16_t *final_output, int pitch) { |
- const int stride = pitch >> 1; |
+void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) { |
int i, j; |
int16_t intermediate[64]; |
@@ -250,16 +246,16 @@ |
int i; |
for (i = 0; i < 8; i++) { |
// stage 1 |
- s0 = (input[0 * stride] + input[7 * stride]) << 2; |
- s1 = (input[1 * stride] + input[6 * stride]) << 2; |
- s2 = (input[2 * stride] + input[5 * stride]) << 2; |
- s3 = (input[3 * stride] + input[4 * stride]) << 2; |
- s4 = (input[3 * stride] - input[4 * stride]) << 2; |
- s5 = (input[2 * stride] - input[5 * stride]) << 2; |
- s6 = (input[1 * stride] - input[6 * stride]) << 2; |
- s7 = (input[0 * stride] - input[7 * stride]) << 2; |
+ s0 = (input[0 * stride] + input[7 * stride]) * 4; |
+ s1 = (input[1 * stride] + input[6 * stride]) * 4; |
+ s2 = (input[2 * stride] + input[5 * stride]) * 4; |
+ s3 = (input[3 * stride] + input[4 * stride]) * 4; |
+ s4 = (input[3 * stride] - input[4 * stride]) * 4; |
+ s5 = (input[2 * stride] - input[5 * stride]) * 4; |
+ s6 = (input[1 * stride] - input[6 * stride]) * 4; |
+ s7 = (input[0 * stride] - input[7 * stride]) * 4; |
- // fdct4_1d(step, step); |
+ // fdct4(step, step); |
x0 = s0 + s3; |
x1 = s1 + s2; |
x2 = s1 - s2; |
@@ -301,24 +297,23 @@ |
// Rows |
for (i = 0; i < 8; ++i) { |
- fdct8_1d(&intermediate[i * 8], &final_output[i * 8]); |
+ fdct8(&intermediate[i * 8], &final_output[i * 8]); |
for (j = 0; j < 8; ++j) |
final_output[j + i * 8] /= 2; |
} |
} |
-void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int pitch) { |
+void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) { |
// The 2D transform is done with two passes which are actually pretty |
// similar. In the first one, we transform the columns and transpose |
// the results. In the second one, we transform the rows. To achieve that, |
// as the first pass results are transposed, we tranpose the columns (that |
// is the transposed rows) and transpose the results (so that it goes back |
// in normal/row positions). |
- const int stride = pitch >> 1; |
int pass; |
// We need an intermediate buffer between passes. |
int16_t intermediate[256]; |
- int16_t *in = input; |
+ const int16_t *in = input; |
int16_t *out = intermediate; |
// Do the two transform/transpose passes |
for (pass = 0; pass < 2; ++pass) { |
@@ -331,23 +326,23 @@ |
for (i = 0; i < 16; i++) { |
if (0 == pass) { |
// Calculate input for the first 8 results. |
- input[0] = (in[0 * stride] + in[15 * stride]) << 2; |
- input[1] = (in[1 * stride] + in[14 * stride]) << 2; |
- input[2] = (in[2 * stride] + in[13 * stride]) << 2; |
- input[3] = (in[3 * stride] + in[12 * stride]) << 2; |
- input[4] = (in[4 * stride] + in[11 * stride]) << 2; |
- input[5] = (in[5 * stride] + in[10 * stride]) << 2; |
- input[6] = (in[6 * stride] + in[ 9 * stride]) << 2; |
- input[7] = (in[7 * stride] + in[ 8 * stride]) << 2; |
+ input[0] = (in[0 * stride] + in[15 * stride]) * 4; |
+ input[1] = (in[1 * stride] + in[14 * stride]) * 4; |
+ input[2] = (in[2 * stride] + in[13 * stride]) * 4; |
+ input[3] = (in[3 * stride] + in[12 * stride]) * 4; |
+ input[4] = (in[4 * stride] + in[11 * stride]) * 4; |
+ input[5] = (in[5 * stride] + in[10 * stride]) * 4; |
+ input[6] = (in[6 * stride] + in[ 9 * stride]) * 4; |
+ input[7] = (in[7 * stride] + in[ 8 * stride]) * 4; |
// Calculate input for the next 8 results. |
- step1[0] = (in[7 * stride] - in[ 8 * stride]) << 2; |
- step1[1] = (in[6 * stride] - in[ 9 * stride]) << 2; |
- step1[2] = (in[5 * stride] - in[10 * stride]) << 2; |
- step1[3] = (in[4 * stride] - in[11 * stride]) << 2; |
- step1[4] = (in[3 * stride] - in[12 * stride]) << 2; |
- step1[5] = (in[2 * stride] - in[13 * stride]) << 2; |
- step1[6] = (in[1 * stride] - in[14 * stride]) << 2; |
- step1[7] = (in[0 * stride] - in[15 * stride]) << 2; |
+ step1[0] = (in[7 * stride] - in[ 8 * stride]) * 4; |
+ step1[1] = (in[6 * stride] - in[ 9 * stride]) * 4; |
+ step1[2] = (in[5 * stride] - in[10 * stride]) * 4; |
+ step1[3] = (in[4 * stride] - in[11 * stride]) * 4; |
+ step1[4] = (in[3 * stride] - in[12 * stride]) * 4; |
+ step1[5] = (in[2 * stride] - in[13 * stride]) * 4; |
+ step1[6] = (in[1 * stride] - in[14 * stride]) * 4; |
+ step1[7] = (in[0 * stride] - in[15 * stride]) * 4; |
} else { |
// Calculate input for the first 8 results. |
input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2); |
@@ -368,7 +363,7 @@ |
step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2); |
step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2); |
} |
- // Work on the first eight values; fdct8_1d(input, even_results); |
+ // Work on the first eight values; fdct8(input, even_results); |
{ |
/*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; |
/*needs32*/ int t0, t1, t2, t3; |
@@ -384,7 +379,7 @@ |
s6 = input[1] - input[6]; |
s7 = input[0] - input[7]; |
- // fdct4_1d(step, step); |
+ // fdct4(step, step); |
x0 = s0 + s3; |
x1 = s1 + s2; |
x2 = s1 - s2; |
@@ -486,7 +481,7 @@ |
} |
} |
-static void fadst8_1d(int16_t *input, int16_t *output) { |
+static void fadst8(const int16_t *input, int16_t *output) { |
int s0, s1, s2, s3, s4, s5, s6, s7; |
int x0 = input[7]; |
@@ -558,14 +553,14 @@ |
} |
static const transform_2d FHT_8[] = { |
- { fdct8_1d, fdct8_1d }, // DCT_DCT = 0 |
- { fadst8_1d, fdct8_1d }, // ADST_DCT = 1 |
- { fdct8_1d, fadst8_1d }, // DCT_ADST = 2 |
- { fadst8_1d, fadst8_1d } // ADST_ADST = 3 |
+ { fdct8, fdct8 }, // DCT_DCT = 0 |
+ { fadst8, fdct8 }, // ADST_DCT = 1 |
+ { fdct8, fadst8 }, // DCT_ADST = 2 |
+ { fadst8, fadst8 } // ADST_ADST = 3 |
}; |
-void vp9_short_fht8x8_c(int16_t *input, int16_t *output, |
- int pitch, TX_TYPE tx_type) { |
+void vp9_short_fht8x8_c(const int16_t *input, int16_t *output, |
+ int stride, int tx_type) { |
int16_t out[64]; |
int16_t *outptr = &out[0]; |
int i, j; |
@@ -575,7 +570,7 @@ |
// Columns |
for (i = 0; i < 8; ++i) { |
for (j = 0; j < 8; ++j) |
- temp_in[j] = input[j * pitch + i] << 2; |
+ temp_in[j] = input[j * stride + i] * 4; |
ht.cols(temp_in, temp_out); |
for (j = 0; j < 8; ++j) |
outptr[j * 8 + i] = temp_out[j]; |
@@ -593,18 +588,17 @@ |
/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per |
pixel. */ |
-void vp9_short_walsh4x4_c(short *input, short *output, int pitch) { |
+void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) { |
int i; |
int a1, b1, c1, d1, e1; |
- short *ip = input; |
- short *op = output; |
- int pitch_short = pitch >> 1; |
+ const int16_t *ip = input; |
+ int16_t *op = output; |
for (i = 0; i < 4; i++) { |
- a1 = ip[0 * pitch_short]; |
- b1 = ip[1 * pitch_short]; |
- c1 = ip[2 * pitch_short]; |
- d1 = ip[3 * pitch_short]; |
+ a1 = ip[0 * stride]; |
+ b1 = ip[1 * stride]; |
+ c1 = ip[2 * stride]; |
+ d1 = ip[3 * stride]; |
a1 += b1; |
d1 = d1 - c1; |
@@ -637,24 +631,18 @@ |
c1 = e1 - c1; |
a1 -= c1; |
d1 += b1; |
- op[0] = a1 << WHT_UPSCALE_FACTOR; |
- op[1] = c1 << WHT_UPSCALE_FACTOR; |
- op[2] = d1 << WHT_UPSCALE_FACTOR; |
- op[3] = b1 << WHT_UPSCALE_FACTOR; |
+ op[0] = a1 * UNIT_QUANT_FACTOR; |
+ op[1] = c1 * UNIT_QUANT_FACTOR; |
+ op[2] = d1 * UNIT_QUANT_FACTOR; |
+ op[3] = b1 * UNIT_QUANT_FACTOR; |
ip += 4; |
op += 4; |
} |
} |
-void vp9_short_walsh8x4_c(short *input, short *output, int pitch) { |
- vp9_short_walsh4x4_c(input, output, pitch); |
- vp9_short_walsh4x4_c(input + 4, output + 16, pitch); |
-} |
- |
- |
// Rewrote to use same algorithm as others. |
-static void fdct16_1d(int16_t in[16], int16_t out[16]) { |
+static void fdct16(const int16_t in[16], int16_t out[16]) { |
/*canbe16*/ int step1[8]; |
/*canbe16*/ int step2[8]; |
/*canbe16*/ int step3[8]; |
@@ -680,7 +668,7 @@ |
step1[6] = in[1] - in[14]; |
step1[7] = in[0] - in[15]; |
- // fdct8_1d(step, step); |
+ // fdct8(step, step); |
{ |
/*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; |
/*needs32*/ int t0, t1, t2, t3; |
@@ -696,7 +684,7 @@ |
s6 = input[1] - input[6]; |
s7 = input[0] - input[7]; |
- // fdct4_1d(step, step); |
+ // fdct4(step, step); |
x0 = s0 + s3; |
x1 = s1 + s2; |
x2 = s1 - s2; |
@@ -795,7 +783,7 @@ |
out[15] = dct_const_round_shift(temp2); |
} |
-void fadst16_1d(int16_t *input, int16_t *output) { |
+static void fadst16(const int16_t *input, int16_t *output) { |
int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; |
int x0 = input[15]; |
@@ -958,14 +946,14 @@ |
} |
static const transform_2d FHT_16[] = { |
- { fdct16_1d, fdct16_1d }, // DCT_DCT = 0 |
- { fadst16_1d, fdct16_1d }, // ADST_DCT = 1 |
- { fdct16_1d, fadst16_1d }, // DCT_ADST = 2 |
- { fadst16_1d, fadst16_1d } // ADST_ADST = 3 |
+ { fdct16, fdct16 }, // DCT_DCT = 0 |
+ { fadst16, fdct16 }, // ADST_DCT = 1 |
+ { fdct16, fadst16 }, // DCT_ADST = 2 |
+ { fadst16, fadst16 } // ADST_ADST = 3 |
}; |
-void vp9_short_fht16x16_c(int16_t *input, int16_t *output, |
- int pitch, TX_TYPE tx_type) { |
+void vp9_short_fht16x16_c(const int16_t *input, int16_t *output, |
+ int stride, int tx_type) { |
int16_t out[256]; |
int16_t *outptr = &out[0]; |
int i, j; |
@@ -975,7 +963,7 @@ |
// Columns |
for (i = 0; i < 16; ++i) { |
for (j = 0; j < 16; ++j) |
- temp_in[j] = input[j * pitch + i] << 2; |
+ temp_in[j] = input[j * stride + i] * 4; |
ht.cols(temp_in, temp_out); |
for (j = 0; j < 16; ++j) |
outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; |
@@ -1003,7 +991,7 @@ |
return rv; |
} |
-static void dct32_1d(int *input, int *output, int round) { |
+static void dct32_1d(const int *input, int *output, int round) { |
int step[32]; |
// Stage 1 |
step[0] = input[0] + input[(32 - 1)]; |
@@ -1326,8 +1314,7 @@ |
output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); |
} |
-void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) { |
- int shortpitch = pitch >> 1; |
+void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) { |
int i, j; |
int output[32 * 32]; |
@@ -1335,7 +1322,7 @@ |
for (i = 0; i < 32; ++i) { |
int temp_in[32], temp_out[32]; |
for (j = 0; j < 32; ++j) |
- temp_in[j] = input[j * shortpitch + i] << 2; |
+ temp_in[j] = input[j * stride + i] * 4; |
dct32_1d(temp_in, temp_out, 0); |
for (j = 0; j < 32; ++j) |
output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; |
@@ -1355,8 +1342,7 @@ |
// Note that although we use dct_32_round in dct32_1d computation flow, |
// this 2d fdct32x32 for rate-distortion optimization loop is operating |
// within 16 bits precision. |
-void vp9_short_fdct32x32_rd_c(int16_t *input, int16_t *out, int pitch) { |
- int shortpitch = pitch >> 1; |
+void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) { |
int i, j; |
int output[32 * 32]; |
@@ -1364,7 +1350,7 @@ |
for (i = 0; i < 32; ++i) { |
int temp_in[32], temp_out[32]; |
for (j = 0; j < 32; ++j) |
- temp_in[j] = input[j * shortpitch + i] << 2; |
+ temp_in[j] = input[j * stride + i] * 4; |
dct32_1d(temp_in, temp_out, 0); |
for (j = 0; j < 32; ++j) |
// TODO(cd): see quality impact of only doing |
@@ -1383,3 +1369,27 @@ |
out[j + i * 32] = temp_out[j]; |
} |
} |
+ |
+void vp9_fht4x4(TX_TYPE tx_type, const int16_t *input, int16_t *output, |
+ int stride) { |
+ if (tx_type == DCT_DCT) |
+ vp9_fdct4x4(input, output, stride); |
+ else |
+ vp9_short_fht4x4(input, output, stride, tx_type); |
+} |
+ |
+void vp9_fht8x8(TX_TYPE tx_type, const int16_t *input, int16_t *output, |
+ int stride) { |
+ if (tx_type == DCT_DCT) |
+ vp9_fdct8x8(input, output, stride); |
+ else |
+ vp9_short_fht8x8(input, output, stride, tx_type); |
+} |
+ |
+void vp9_fht16x16(TX_TYPE tx_type, const int16_t *input, int16_t *output, |
+ int stride) { |
+ if (tx_type == DCT_DCT) |
+ vp9_fdct16x16(input, output, stride); |
+ else |
+ vp9_short_fht16x16(input, output, stride, tx_type); |
+} |