source/libvpx/vp9/encoder/vp9_dct.c - Issue 592203002: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/encoder/vp9_dct.c

Issue 592203002: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/

Patch Set: Created 6 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.	2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

11 #include <assert.h>	11 #include <assert.h>

12 #include <math.h>	12 #include <math.h>

13	13

14 #include "./vpx_config.h"	14 #include "./vpx_config.h"

15 #include "./vp9_rtcd.h"	15 #include "./vp9_rtcd.h"

16	16

17 #include "vp9/common/vp9_blockd.h"	17 #include "vp9/common/vp9_blockd.h"

18 #include "vp9/common/vp9_idct.h"	18 #include "vp9/common/vp9_idct.h"

19 #include "vp9/common/vp9_systemdependent.h"	19 #include "vp9/common/vp9_systemdependent.h"

20	20

21 static INLINE int fdct_round_shift(int input) {	21 static INLINE tran_high_t fdct_round_shift(tran_high_t input) {

22 int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);	22 tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);

23 assert(INT16_MIN <= rv && rv <= INT16_MAX);	23 // TODO(debargha, peter.derivaz): Find new bounds for this assert

	24 // and make the bounds consts.

	25 // assert(INT16_MIN <= rv && rv <= INT16_MAX);

24 return rv;	26 return rv;

25 }	27 }

26	28

27 static void fdct4(const int16_t *input, int16_t *output) {	29 static void fdct4(const tran_low_t *input, tran_low_t *output) {

28 int16_t step[4];	30 tran_high_t step[4];

29 int temp1, temp2;	31 tran_high_t temp1, temp2;

30	32

31 step[0] = input[0] + input[3];	33 step[0] = input[0] + input[3];

32 step[1] = input[1] + input[2];	34 step[1] = input[1] + input[2];

33 step[2] = input[1] - input[2];	35 step[2] = input[1] - input[2];

34 step[3] = input[0] - input[3];	36 step[3] = input[0] - input[3];

35	37

36 temp1 = (step[0] + step[1]) * cospi_16_64;	38 temp1 = (step[0] + step[1]) * cospi_16_64;

37 temp2 = (step[0] - step[1]) * cospi_16_64;	39 temp2 = (step[0] - step[1]) * cospi_16_64;

38 output[0] = fdct_round_shift(temp1);	40 output[0] = fdct_round_shift(temp1);

39 output[2] = fdct_round_shift(temp2);	41 output[2] = fdct_round_shift(temp2);

40 temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;	42 temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;

41 temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;	43 temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;

42 output[1] = fdct_round_shift(temp1);	44 output[1] = fdct_round_shift(temp1);

43 output[3] = fdct_round_shift(temp2);	45 output[3] = fdct_round_shift(temp2);

44 }	46 }

45	47

46 void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride) {	48 void vp9_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {

47 int r, c;	49 int r, c;

48 int16_t sum = 0;	50 tran_low_t sum = 0;

49 for (r = 0; r < 4; ++r)	51 for (r = 0; r < 4; ++r)

50 for (c = 0; c < 4; ++c)	52 for (c = 0; c < 4; ++c)

51 sum += input[r * stride + c];	53 sum += input[r * stride + c];

52	54

53 output[0] = sum << 1;	55 output[0] = sum << 1;

54 output[1] = 0;	56 output[1] = 0;

55 }	57 }

56	58

57 void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) {	59 void vp9_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {

58 // The 2D transform is done with two passes which are actually pretty	60 // The 2D transform is done with two passes which are actually pretty

59 // similar. In the first one, we transform the columns and transpose	61 // similar. In the first one, we transform the columns and transpose

60 // the results. In the second one, we transform the rows. To achieve that,	62 // the results. In the second one, we transform the rows. To achieve that,

61 // as the first pass results are transposed, we transpose the columns (that	63 // as the first pass results are transposed, we transpose the columns (that

62 // is the transposed rows) and transpose the results (so that it goes back	64 // is the transposed rows) and transpose the results (so that it goes back

63 // in normal/row positions).	65 // in normal/row positions).

64 int pass;	66 int pass;

65 // We need an intermediate buffer between passes.	67 // We need an intermediate buffer between passes.

66 int16_t intermediate[4 * 4];	68 tran_low_t intermediate[4 * 4];

67 const int16_t *in = input;	69 const int16_t *in_pass0 = input;

68 int16_t *out = intermediate;	70 const tran_low_t *in = NULL;

	71 tran_low_t *out = intermediate;

69 // Do the two transform/transpose passes	72 // Do the two transform/transpose passes

70 for (pass = 0; pass < 2; ++pass) {	73 for (pass = 0; pass < 2; ++pass) {

71 /*canbe16*/ int input[4];	74 tran_high_t input[4]; // canbe16

72 /*canbe16*/ int step[4];	75 tran_high_t step[4]; // canbe16

73 /*needs32*/ int temp1, temp2;	76 tran_high_t temp1, temp2; // needs32

74 int i;	77 int i;

75 for (i = 0; i < 4; ++i) {	78 for (i = 0; i < 4; ++i) {

76 // Load inputs.	79 // Load inputs.

77 if (0 == pass) {	80 if (0 == pass) {

78 input[0] = in[0 * stride] * 16;	81 input[0] = in_pass0[0 * stride] * 16;

79 input[1] = in[1 * stride] * 16;	82 input[1] = in_pass0[1 * stride] * 16;

80 input[2] = in[2 * stride] * 16;	83 input[2] = in_pass0[2 * stride] * 16;

81 input[3] = in[3 * stride] * 16;	84 input[3] = in_pass0[3 * stride] * 16;

82 if (i == 0 && input[0]) {	85 if (i == 0 && input[0]) {

83 input[0] += 1;	86 input[0] += 1;

84 }	87 }

85 } else {	88 } else {

86 input[0] = in[0 * 4];	89 input[0] = in[0 * 4];

87 input[1] = in[1 * 4];	90 input[1] = in[1 * 4];

88 input[2] = in[2 * 4];	91 input[2] = in[2 * 4];

89 input[3] = in[3 * 4];	92 input[3] = in[3 * 4];

90 }	93 }

91 // Transform.	94 // Transform.

92 step[0] = input[0] + input[3];	95 step[0] = input[0] + input[3];

93 step[1] = input[1] + input[2];	96 step[1] = input[1] + input[2];

94 step[2] = input[1] - input[2];	97 step[2] = input[1] - input[2];

95 step[3] = input[0] - input[3];	98 step[3] = input[0] - input[3];

96 temp1 = (step[0] + step[1]) * cospi_16_64;	99 temp1 = (step[0] + step[1]) * cospi_16_64;

97 temp2 = (step[0] - step[1]) * cospi_16_64;	100 temp2 = (step[0] - step[1]) * cospi_16_64;

98 out[0] = fdct_round_shift(temp1);	101 out[0] = fdct_round_shift(temp1);

99 out[2] = fdct_round_shift(temp2);	102 out[2] = fdct_round_shift(temp2);

100 temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;	103 temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;

101 temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;	104 temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;

102 out[1] = fdct_round_shift(temp1);	105 out[1] = fdct_round_shift(temp1);

103 out[3] = fdct_round_shift(temp2);	106 out[3] = fdct_round_shift(temp2);

104 // Do next column (which is a transposed row in second/horizontal pass)	107 // Do next column (which is a transposed row in second/horizontal pass)

	108 in_pass0++;

105 in++;	109 in++;

106 out += 4;	110 out += 4;

107 }	111 }

108 // Setup in/out for next pass.	112 // Setup in/out for next pass.

109 in = intermediate;	113 in = intermediate;

110 out = output;	114 out = output;

111 }	115 }

112	116

113 {	117 {

114 int i, j;	118 int i, j;

115 for (i = 0; i < 4; ++i) {	119 for (i = 0; i < 4; ++i) {

116 for (j = 0; j < 4; ++j)	120 for (j = 0; j < 4; ++j)

117 output[j + i * 4] = (output[j + i * 4] + 1) >> 2;	121 output[j + i * 4] = (output[j + i * 4] + 1) >> 2;

118 }	122 }

119 }	123 }

120 }	124 }

121	125

122 static void fadst4(const int16_t *input, int16_t *output) {	126 static void fadst4(const tran_low_t *input, tran_low_t *output) {

123 int x0, x1, x2, x3;	127 tran_high_t x0, x1, x2, x3;

124 int s0, s1, s2, s3, s4, s5, s6, s7;	128 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

125	129

126 x0 = input[0];	130 x0 = input[0];

127 x1 = input[1];	131 x1 = input[1];

128 x2 = input[2];	132 x2 = input[2];

129 x3 = input[3];	133 x3 = input[3];

130	134

131 if (!(x0 | x1 | x2 | x3)) {	135 if (!(x0 | x1 | x2 | x3)) {

132 output[0] = output[1] = output[2] = output[3] = 0;	136 output[0] = output[1] = output[2] = output[3] = 0;

133 return;	137 return;

134 }	138 }

(...skipping 24 matching lines...) Expand all Loading...
159 output[3] = fdct_round_shift(s3);	163 output[3] = fdct_round_shift(s3);

160 }	164 }

161	165

162 static const transform_2d FHT_4[] = {	166 static const transform_2d FHT_4[] = {

163 { fdct4, fdct4 }, // DCT_DCT = 0	167 { fdct4, fdct4 }, // DCT_DCT = 0

164 { fadst4, fdct4 }, // ADST_DCT = 1	168 { fadst4, fdct4 }, // ADST_DCT = 1

165 { fdct4, fadst4 }, // DCT_ADST = 2	169 { fdct4, fadst4 }, // DCT_ADST = 2

166 { fadst4, fadst4 } // ADST_ADST = 3	170 { fadst4, fadst4 } // ADST_ADST = 3

167 };	171 };

168	172

169 void vp9_fht4x4_c(const int16_t *input, int16_t *output,	173 void vp9_fht4x4_c(const int16_t *input, tran_low_t *output,

170 int stride, int tx_type) {	174 int stride, int tx_type) {

171 if (tx_type == DCT_DCT) {	175 if (tx_type == DCT_DCT) {

172 vp9_fdct4x4_c(input, output, stride);	176 vp9_fdct4x4_c(input, output, stride);

173 } else {	177 } else {

174 int16_t out[4 * 4];	178 tran_low_t out[4 * 4];

175 int16_t *outptr = &out[0];	179 tran_low_t *outptr = &out[0];

176 int i, j;	180 int i, j;

177 int16_t temp_in[4], temp_out[4];	181 tran_low_t temp_in[4], temp_out[4];

178 const transform_2d ht = FHT_4[tx_type];	182 const transform_2d ht = FHT_4[tx_type];

179	183

180 // Columns	184 // Columns

181 for (i = 0; i < 4; ++i) {	185 for (i = 0; i < 4; ++i) {

182 for (j = 0; j < 4; ++j)	186 for (j = 0; j < 4; ++j)

183 temp_in[j] = input[j * stride + i] * 16;	187 temp_in[j] = input[j * stride + i] * 16;

184 if (i == 0 && temp_in[0])	188 if (i == 0 && temp_in[0])

185 temp_in[0] += 1;	189 temp_in[0] += 1;

186 ht.cols(temp_in, temp_out);	190 ht.cols(temp_in, temp_out);

187 for (j = 0; j < 4; ++j)	191 for (j = 0; j < 4; ++j)

188 outptr[j * 4 + i] = temp_out[j];	192 outptr[j * 4 + i] = temp_out[j];

189 }	193 }

190	194

191 // Rows	195 // Rows

192 for (i = 0; i < 4; ++i) {	196 for (i = 0; i < 4; ++i) {

193 for (j = 0; j < 4; ++j)	197 for (j = 0; j < 4; ++j)

194 temp_in[j] = out[j + i * 4];	198 temp_in[j] = out[j + i * 4];

195 ht.rows(temp_in, temp_out);	199 ht.rows(temp_in, temp_out);

196 for (j = 0; j < 4; ++j)	200 for (j = 0; j < 4; ++j)

197 output[j + i * 4] = (temp_out[j] + 1) >> 2;	201 output[j + i * 4] = (temp_out[j] + 1) >> 2;

198 }	202 }

199 }	203 }

200 }	204 }

201	205

202 static void fdct8(const int16_t *input, int16_t *output) {	206 static void fdct8(const tran_low_t *input, tran_low_t *output) {

203 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;	207 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16

204 /*needs32*/ int t0, t1, t2, t3;	208 tran_high_t t0, t1, t2, t3; // needs32

205 /*canbe16*/ int x0, x1, x2, x3;	209 tran_high_t x0, x1, x2, x3; // canbe16

206	210

207 // stage 1	211 // stage 1

208 s0 = input[0] + input[7];	212 s0 = input[0] + input[7];

209 s1 = input[1] + input[6];	213 s1 = input[1] + input[6];

210 s2 = input[2] + input[5];	214 s2 = input[2] + input[5];

211 s3 = input[3] + input[4];	215 s3 = input[3] + input[4];

212 s4 = input[3] - input[4];	216 s4 = input[3] - input[4];

213 s5 = input[2] - input[5];	217 s5 = input[2] - input[5];

214 s6 = input[1] - input[6];	218 s6 = input[1] - input[6];

215 s7 = input[0] - input[7];	219 s7 = input[0] - input[7];

(...skipping 28 matching lines...) Expand all Loading...
244 t0 = x0 * cospi_28_64 + x3 * cospi_4_64;	248 t0 = x0 * cospi_28_64 + x3 * cospi_4_64;

245 t1 = x1 * cospi_12_64 + x2 * cospi_20_64;	249 t1 = x1 * cospi_12_64 + x2 * cospi_20_64;

246 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;	250 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;

247 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;	251 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;

248 output[1] = fdct_round_shift(t0);	252 output[1] = fdct_round_shift(t0);

249 output[3] = fdct_round_shift(t2);	253 output[3] = fdct_round_shift(t2);

250 output[5] = fdct_round_shift(t1);	254 output[5] = fdct_round_shift(t1);

251 output[7] = fdct_round_shift(t3);	255 output[7] = fdct_round_shift(t3);

252 }	256 }

253	257

254 void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride) {	258 void vp9_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {

255 int r, c;	259 int r, c;

256 int16_t sum = 0;	260 tran_low_t sum = 0;

257 for (r = 0; r < 8; ++r)	261 for (r = 0; r < 8; ++r)

258 for (c = 0; c < 8; ++c)	262 for (c = 0; c < 8; ++c)

259 sum += input[r * stride + c];	263 sum += input[r * stride + c];

260	264

261 output[0] = sum;	265 output[0] = sum;

262 output[1] = 0;	266 output[1] = 0;

263 }	267 }

264	268

265 void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) {	269 void vp9_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {

266 int i, j;	270 int i, j;

267 int16_t intermediate[64];	271 tran_low_t intermediate[64];

268	272

269 // Transform columns	273 // Transform columns

270 {	274 {

271 int16_t *output = intermediate;	275 tran_low_t *output = intermediate;

272 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;	276 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16

273 /*needs32*/ int t0, t1, t2, t3;	277 tran_high_t t0, t1, t2, t3; // needs32

274 /*canbe16*/ int x0, x1, x2, x3;	278 tran_high_t x0, x1, x2, x3; // canbe16

275	279

276 int i;	280 int i;

277 for (i = 0; i < 8; i++) {	281 for (i = 0; i < 8; i++) {

278 // stage 1	282 // stage 1

279 s0 = (input[0 * stride] + input[7 * stride]) * 4;	283 s0 = (input[0 * stride] + input[7 * stride]) * 4;

280 s1 = (input[1 * stride] + input[6 * stride]) * 4;	284 s1 = (input[1 * stride] + input[6 * stride]) * 4;

281 s2 = (input[2 * stride] + input[5 * stride]) * 4;	285 s2 = (input[2 * stride] + input[5 * stride]) * 4;

282 s3 = (input[3 * stride] + input[4 * stride]) * 4;	286 s3 = (input[3 * stride] + input[4 * stride]) * 4;

283 s4 = (input[3 * stride] - input[4 * stride]) * 4;	287 s4 = (input[3 * stride] - input[4 * stride]) * 4;

284 s5 = (input[2 * stride] - input[5 * stride]) * 4;	288 s5 = (input[2 * stride] - input[5 * stride]) * 4;

(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
326 }	330 }

327	331

328 // Rows	332 // Rows

329 for (i = 0; i < 8; ++i) {	333 for (i = 0; i < 8; ++i) {

330 fdct8(&intermediate[i * 8], &final_output[i * 8]);	334 fdct8(&intermediate[i * 8], &final_output[i * 8]);

331 for (j = 0; j < 8; ++j)	335 for (j = 0; j < 8; ++j)

332 final_output[j + i * 8] /= 2;	336 final_output[j + i * 8] /= 2;

333 }	337 }

334 }	338 }

335	339

336 void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride) {	340 void vp9_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {

337 int r, c;	341 int r, c;

338 int16_t sum = 0;	342 tran_low_t sum = 0;

339 for (r = 0; r < 16; ++r)	343 for (r = 0; r < 16; ++r)

340 for (c = 0; c < 16; ++c)	344 for (c = 0; c < 16; ++c)

341 sum += input[r * stride + c];	345 sum += input[r * stride + c];

342	346

343 output[0] = sum >> 1;	347 output[0] = sum >> 1;

344 output[1] = 0;	348 output[1] = 0;

345 }	349 }

346	350

347 void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {	351 void vp9_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {

348 // The 2D transform is done with two passes which are actually pretty	352 // The 2D transform is done with two passes which are actually pretty

349 // similar. In the first one, we transform the columns and transpose	353 // similar. In the first one, we transform the columns and transpose

350 // the results. In the second one, we transform the rows. To achieve that,	354 // the results. In the second one, we transform the rows. To achieve that,

351 // as the first pass results are transposed, we transpose the columns (that	355 // as the first pass results are transposed, we transpose the columns (that

352 // is the transposed rows) and transpose the results (so that it goes back	356 // is the transposed rows) and transpose the results (so that it goes back

353 // in normal/row positions).	357 // in normal/row positions).

354 int pass;	358 int pass;

355 // We need an intermediate buffer between passes.	359 // We need an intermediate buffer between passes.

356 int16_t intermediate[256];	360 tran_low_t intermediate[256];

357 const int16_t *in = input;	361 const int16_t *in_pass0 = input;

358 int16_t *out = intermediate;	362 const tran_low_t *in = NULL;

	363 tran_low_t *out = intermediate;

359 // Do the two transform/transpose passes	364 // Do the two transform/transpose passes

360 for (pass = 0; pass < 2; ++pass) {	365 for (pass = 0; pass < 2; ++pass) {

361 /*canbe16*/ int step1[8];	366 tran_high_t step1[8]; // canbe16

362 /*canbe16*/ int step2[8];	367 tran_high_t step2[8]; // canbe16

363 /*canbe16*/ int step3[8];	368 tran_high_t step3[8]; // canbe16

364 /*canbe16*/ int input[8];	369 tran_high_t input[8]; // canbe16

365 /*needs32*/ int temp1, temp2;	370 tran_high_t temp1, temp2; // needs32

366 int i;	371 int i;

367 for (i = 0; i < 16; i++) {	372 for (i = 0; i < 16; i++) {

368 if (0 == pass) {	373 if (0 == pass) {

369 // Calculate input for the first 8 results.	374 // Calculate input for the first 8 results.

370 input[0] = (in[0 * stride] + in[15 * stride]) * 4;	375 input[0] = (in_pass0[0 * stride] + in_pass0[15 * stride]) * 4;

371 input[1] = (in[1 * stride] + in[14 * stride]) * 4;	376 input[1] = (in_pass0[1 * stride] + in_pass0[14 * stride]) * 4;

372 input[2] = (in[2 * stride] + in[13 * stride]) * 4;	377 input[2] = (in_pass0[2 * stride] + in_pass0[13 * stride]) * 4;

373 input[3] = (in[3 * stride] + in[12 * stride]) * 4;	378 input[3] = (in_pass0[3 * stride] + in_pass0[12 * stride]) * 4;

374 input[4] = (in[4 * stride] + in[11 * stride]) * 4;	379 input[4] = (in_pass0[4 * stride] + in_pass0[11 * stride]) * 4;

375 input[5] = (in[5 * stride] + in[10 * stride]) * 4;	380 input[5] = (in_pass0[5 * stride] + in_pass0[10 * stride]) * 4;

376 input[6] = (in[6 * stride] + in[ 9 * stride]) * 4;	381 input[6] = (in_pass0[6 * stride] + in_pass0[ 9 * stride]) * 4;

377 input[7] = (in[7 * stride] + in[ 8 * stride]) * 4;	382 input[7] = (in_pass0[7 * stride] + in_pass0[ 8 * stride]) * 4;

378 // Calculate input for the next 8 results.	383 // Calculate input for the next 8 results.

379 step1[0] = (in[7 * stride] - in[ 8 * stride]) * 4;	384 step1[0] = (in_pass0[7 * stride] - in_pass0[ 8 * stride]) * 4;

380 step1[1] = (in[6 * stride] - in[ 9 * stride]) * 4;	385 step1[1] = (in_pass0[6 * stride] - in_pass0[ 9 * stride]) * 4;

381 step1[2] = (in[5 * stride] - in[10 * stride]) * 4;	386 step1[2] = (in_pass0[5 * stride] - in_pass0[10 * stride]) * 4;

382 step1[3] = (in[4 * stride] - in[11 * stride]) * 4;	387 step1[3] = (in_pass0[4 * stride] - in_pass0[11 * stride]) * 4;

383 step1[4] = (in[3 * stride] - in[12 * stride]) * 4;	388 step1[4] = (in_pass0[3 * stride] - in_pass0[12 * stride]) * 4;

384 step1[5] = (in[2 * stride] - in[13 * stride]) * 4;	389 step1[5] = (in_pass0[2 * stride] - in_pass0[13 * stride]) * 4;

385 step1[6] = (in[1 * stride] - in[14 * stride]) * 4;	390 step1[6] = (in_pass0[1 * stride] - in_pass0[14 * stride]) * 4;

386 step1[7] = (in[0 * stride] - in[15 * stride]) * 4;	391 step1[7] = (in_pass0[0 * stride] - in_pass0[15 * stride]) * 4;

387 } else {	392 } else {

388 // Calculate input for the first 8 results.	393 // Calculate input for the first 8 results.

389 input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);	394 input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);

390 input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2);	395 input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2);

391 input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2);	396 input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2);

392 input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2);	397 input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2);

393 input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2);	398 input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2);

394 input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2);	399 input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2);

395 input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2);	400 input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2);

396 input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2);	401 input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2);

397 // Calculate input for the next 8 results.	402 // Calculate input for the next 8 results.

398 step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2);	403 step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2);

399 step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2);	404 step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2);

400 step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2);	405 step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2);

401 step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2);	406 step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2);

402 step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2);	407 step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2);

403 step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2);	408 step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2);

404 step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2);	409 step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2);

405 step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2);	410 step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2);

406 }	411 }

407 // Work on the first eight values; fdct8(input, even_results);	412 // Work on the first eight values; fdct8(input, even_results);

408 {	413 {

409 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;	414 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16

410 /*needs32*/ int t0, t1, t2, t3;	415 tran_high_t t0, t1, t2, t3; // needs32

411 /*canbe16*/ int x0, x1, x2, x3;	416 tran_high_t x0, x1, x2, x3; // canbe16

412	417

413 // stage 1	418 // stage 1

414 s0 = input[0] + input[7];	419 s0 = input[0] + input[7];

415 s1 = input[1] + input[6];	420 s1 = input[1] + input[6];

416 s2 = input[2] + input[5];	421 s2 = input[2] + input[5];

417 s3 = input[3] + input[4];	422 s3 = input[3] + input[4];

418 s4 = input[3] - input[4];	423 s4 = input[3] - input[4];

419 s5 = input[2] - input[5];	424 s5 = input[2] - input[5];

420 s6 = input[1] - input[6];	425 s6 = input[1] - input[6];

421 s7 = input[0] - input[7];	426 s7 = input[0] - input[7];

(...skipping 85 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
507 temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;	512 temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;

508 out[3] = fdct_round_shift(temp1);	513 out[3] = fdct_round_shift(temp1);

509 out[11] = fdct_round_shift(temp2);	514 out[11] = fdct_round_shift(temp2);

510 temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;	515 temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;

511 temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;	516 temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;

512 out[7] = fdct_round_shift(temp1);	517 out[7] = fdct_round_shift(temp1);

513 out[15] = fdct_round_shift(temp2);	518 out[15] = fdct_round_shift(temp2);

514 }	519 }

515 // Do next column (which is a transposed row in second/horizontal pass)	520 // Do next column (which is a transposed row in second/horizontal pass)

516 in++;	521 in++;

	522 in_pass0++;

517 out += 16;	523 out += 16;

518 }	524 }

519 // Setup in/out for next pass.	525 // Setup in/out for next pass.

520 in = intermediate;	526 in = intermediate;

521 out = output;	527 out = output;

522 }	528 }

523 }	529 }

524	530

525 static void fadst8(const int16_t *input, int16_t *output) {	531 static void fadst8(const tran_low_t *input, tran_low_t *output) {

526 int s0, s1, s2, s3, s4, s5, s6, s7;	532 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

527	533

528 int x0 = input[7];	534 tran_high_t x0 = input[7];

529 int x1 = input[0];	535 tran_high_t x1 = input[0];

530 int x2 = input[5];	536 tran_high_t x2 = input[5];

531 int x3 = input[2];	537 tran_high_t x3 = input[2];

532 int x4 = input[3];	538 tran_high_t x4 = input[3];

533 int x5 = input[4];	539 tran_high_t x5 = input[4];

534 int x6 = input[1];	540 tran_high_t x6 = input[1];

535 int x7 = input[6];	541 tran_high_t x7 = input[6];

536	542

537 // stage 1	543 // stage 1

538 s0 = cospi_2_64 * x0 + cospi_30_64 * x1;	544 s0 = cospi_2_64 * x0 + cospi_30_64 * x1;

539 s1 = cospi_30_64 * x0 - cospi_2_64 * x1;	545 s1 = cospi_30_64 * x0 - cospi_2_64 * x1;

540 s2 = cospi_10_64 * x2 + cospi_22_64 * x3;	546 s2 = cospi_10_64 * x2 + cospi_22_64 * x3;

541 s3 = cospi_22_64 * x2 - cospi_10_64 * x3;	547 s3 = cospi_22_64 * x2 - cospi_10_64 * x3;

542 s4 = cospi_18_64 * x4 + cospi_14_64 * x5;	548 s4 = cospi_18_64 * x4 + cospi_14_64 * x5;

543 s5 = cospi_14_64 * x4 - cospi_18_64 * x5;	549 s5 = cospi_14_64 * x4 - cospi_18_64 * x5;

544 s6 = cospi_26_64 * x6 + cospi_6_64 * x7;	550 s6 = cospi_26_64 * x6 + cospi_6_64 * x7;

545 s7 = cospi_6_64 * x6 - cospi_26_64 * x7;	551 s7 = cospi_6_64 * x6 - cospi_26_64 * x7;

(...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
593 output[7] = - x1;	599 output[7] = - x1;

594 }	600 }

595	601

596 static const transform_2d FHT_8[] = {	602 static const transform_2d FHT_8[] = {

597 { fdct8, fdct8 }, // DCT_DCT = 0	603 { fdct8, fdct8 }, // DCT_DCT = 0

598 { fadst8, fdct8 }, // ADST_DCT = 1	604 { fadst8, fdct8 }, // ADST_DCT = 1

599 { fdct8, fadst8 }, // DCT_ADST = 2	605 { fdct8, fadst8 }, // DCT_ADST = 2

600 { fadst8, fadst8 } // ADST_ADST = 3	606 { fadst8, fadst8 } // ADST_ADST = 3

601 };	607 };

602	608

603 void vp9_fht8x8_c(const int16_t *input, int16_t *output,	609 void vp9_fht8x8_c(const int16_t *input, tran_low_t *output,

604 int stride, int tx_type) {	610 int stride, int tx_type) {

605 if (tx_type == DCT_DCT) {	611 if (tx_type == DCT_DCT) {

606 vp9_fdct8x8_c(input, output, stride);	612 vp9_fdct8x8_c(input, output, stride);

607 } else {	613 } else {

608 int16_t out[64];	614 tran_low_t out[64];

609 int16_t *outptr = &out[0];	615 tran_low_t *outptr = &out[0];

610 int i, j;	616 int i, j;

611 int16_t temp_in[8], temp_out[8];	617 tran_low_t temp_in[8], temp_out[8];

612 const transform_2d ht = FHT_8[tx_type];	618 const transform_2d ht = FHT_8[tx_type];

613	619

614 // Columns	620 // Columns

615 for (i = 0; i < 8; ++i) {	621 for (i = 0; i < 8; ++i) {

616 for (j = 0; j < 8; ++j)	622 for (j = 0; j < 8; ++j)

617 temp_in[j] = input[j * stride + i] * 4;	623 temp_in[j] = input[j * stride + i] * 4;

618 ht.cols(temp_in, temp_out);	624 ht.cols(temp_in, temp_out);

619 for (j = 0; j < 8; ++j)	625 for (j = 0; j < 8; ++j)

620 outptr[j * 8 + i] = temp_out[j];	626 outptr[j * 8 + i] = temp_out[j];

621 }	627 }

622	628

623 // Rows	629 // Rows

624 for (i = 0; i < 8; ++i) {	630 for (i = 0; i < 8; ++i) {

625 for (j = 0; j < 8; ++j)	631 for (j = 0; j < 8; ++j)

626 temp_in[j] = out[j + i * 8];	632 temp_in[j] = out[j + i * 8];

627 ht.rows(temp_in, temp_out);	633 ht.rows(temp_in, temp_out);

628 for (j = 0; j < 8; ++j)	634 for (j = 0; j < 8; ++j)

629 output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;	635 output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;

630 }	636 }

631 }	637 }

632 }	638 }

633	639

634 /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per	640 /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per

635 pixel. */	641 pixel. */

636 void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) {	642 void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {

637 int i;	643 int i;

638 int a1, b1, c1, d1, e1;	644 tran_high_t a1, b1, c1, d1, e1;

639 const int16_t *ip = input;	645 const int16_t *ip_pass0 = input;

640 int16_t *op = output;	646 const tran_low_t *ip = NULL;

	647 tran_low_t *op = output;

641	648

642 for (i = 0; i < 4; i++) {	649 for (i = 0; i < 4; i++) {

643 a1 = ip[0 * stride];	650 a1 = ip_pass0[0 * stride];

644 b1 = ip[1 * stride];	651 b1 = ip_pass0[1 * stride];

645 c1 = ip[2 * stride];	652 c1 = ip_pass0[2 * stride];

646 d1 = ip[3 * stride];	653 d1 = ip_pass0[3 * stride];

647	654

648 a1 += b1;	655 a1 += b1;

649 d1 = d1 - c1;	656 d1 = d1 - c1;

650 e1 = (a1 - d1) >> 1;	657 e1 = (a1 - d1) >> 1;

651 b1 = e1 - b1;	658 b1 = e1 - b1;

652 c1 = e1 - c1;	659 c1 = e1 - c1;

653 a1 -= c1;	660 a1 -= c1;

654 d1 += b1;	661 d1 += b1;

655 op[0] = a1;	662 op[0] = a1;

656 op[4] = c1;	663 op[4] = c1;

657 op[8] = d1;	664 op[8] = d1;

658 op[12] = b1;	665 op[12] = b1;

659	666

660 ip++;	667 ip_pass0++;

661 op++;	668 op++;

662 }	669 }

663 ip = output;	670 ip = output;

664 op = output;	671 op = output;

665	672

666 for (i = 0; i < 4; i++) {	673 for (i = 0; i < 4; i++) {

667 a1 = ip[0];	674 a1 = ip[0];

668 b1 = ip[1];	675 b1 = ip[1];

669 c1 = ip[2];	676 c1 = ip[2];

670 d1 = ip[3];	677 d1 = ip[3];

671	678

672 a1 += b1;	679 a1 += b1;

673 d1 -= c1;	680 d1 -= c1;

674 e1 = (a1 - d1) >> 1;	681 e1 = (a1 - d1) >> 1;

675 b1 = e1 - b1;	682 b1 = e1 - b1;

676 c1 = e1 - c1;	683 c1 = e1 - c1;

677 a1 -= c1;	684 a1 -= c1;

678 d1 += b1;	685 d1 += b1;

679 op[0] = a1 * UNIT_QUANT_FACTOR;	686 op[0] = a1 * UNIT_QUANT_FACTOR;

680 op[1] = c1 * UNIT_QUANT_FACTOR;	687 op[1] = c1 * UNIT_QUANT_FACTOR;

681 op[2] = d1 * UNIT_QUANT_FACTOR;	688 op[2] = d1 * UNIT_QUANT_FACTOR;

682 op[3] = b1 * UNIT_QUANT_FACTOR;	689 op[3] = b1 * UNIT_QUANT_FACTOR;

683	690

684 ip += 4;	691 ip += 4;

685 op += 4;	692 op += 4;

686 }	693 }

687 }	694 }

688	695

689 // Rewrote to use same algorithm as others.	696 // Rewrote to use same algorithm as others.

690 static void fdct16(const int16_t in[16], int16_t out[16]) {	697 static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {

691 /*canbe16*/ int step1[8];	698 tran_high_t step1[8]; // canbe16

692 /*canbe16*/ int step2[8];	699 tran_high_t step2[8]; // canbe16

693 /*canbe16*/ int step3[8];	700 tran_high_t step3[8]; // canbe16

694 /*canbe16*/ int input[8];	701 tran_high_t input[8]; // canbe16

695 /*needs32*/ int temp1, temp2;	702 tran_high_t temp1, temp2; // needs32

696	703

697 // step 1	704 // step 1

698 input[0] = in[0] + in[15];	705 input[0] = in[0] + in[15];

699 input[1] = in[1] + in[14];	706 input[1] = in[1] + in[14];

700 input[2] = in[2] + in[13];	707 input[2] = in[2] + in[13];

701 input[3] = in[3] + in[12];	708 input[3] = in[3] + in[12];

702 input[4] = in[4] + in[11];	709 input[4] = in[4] + in[11];

703 input[5] = in[5] + in[10];	710 input[5] = in[5] + in[10];

704 input[6] = in[6] + in[ 9];	711 input[6] = in[6] + in[ 9];

705 input[7] = in[7] + in[ 8];	712 input[7] = in[7] + in[ 8];

706	713

707 step1[0] = in[7] - in[ 8];	714 step1[0] = in[7] - in[ 8];

708 step1[1] = in[6] - in[ 9];	715 step1[1] = in[6] - in[ 9];

709 step1[2] = in[5] - in[10];	716 step1[2] = in[5] - in[10];

710 step1[3] = in[4] - in[11];	717 step1[3] = in[4] - in[11];

711 step1[4] = in[3] - in[12];	718 step1[4] = in[3] - in[12];

712 step1[5] = in[2] - in[13];	719 step1[5] = in[2] - in[13];

713 step1[6] = in[1] - in[14];	720 step1[6] = in[1] - in[14];

714 step1[7] = in[0] - in[15];	721 step1[7] = in[0] - in[15];

715	722

716 // fdct8(step, step);	723 // fdct8(step, step);

717 {	724 {

718 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;	725 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16

719 /*needs32*/ int t0, t1, t2, t3;	726 tran_high_t t0, t1, t2, t3; // needs32

720 /*canbe16*/ int x0, x1, x2, x3;	727 tran_high_t x0, x1, x2, x3; // canbe16

721	728

722 // stage 1	729 // stage 1

723 s0 = input[0] + input[7];	730 s0 = input[0] + input[7];

724 s1 = input[1] + input[6];	731 s1 = input[1] + input[6];

725 s2 = input[2] + input[5];	732 s2 = input[2] + input[5];

726 s3 = input[3] + input[4];	733 s3 = input[3] + input[4];

727 s4 = input[3] - input[4];	734 s4 = input[3] - input[4];

728 s5 = input[2] - input[5];	735 s5 = input[2] - input[5];

729 s6 = input[1] - input[6];	736 s6 = input[1] - input[6];

730 s7 = input[0] - input[7];	737 s7 = input[0] - input[7];

(...skipping 90 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
821 temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;	828 temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;

822 out[3] = fdct_round_shift(temp1);	829 out[3] = fdct_round_shift(temp1);

823 out[11] = fdct_round_shift(temp2);	830 out[11] = fdct_round_shift(temp2);

824	831

825 temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;	832 temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;

826 temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;	833 temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;

827 out[7] = fdct_round_shift(temp1);	834 out[7] = fdct_round_shift(temp1);

828 out[15] = fdct_round_shift(temp2);	835 out[15] = fdct_round_shift(temp2);

829 }	836 }

830	837

831 static void fadst16(const int16_t *input, int16_t *output) {	838 static void fadst16(const tran_low_t *input, tran_low_t *output) {

832 int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;	839 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;

	840 tran_high_t s9, s10, s11, s12, s13, s14, s15;

833	841

834 int x0 = input[15];	842 tran_high_t x0 = input[15];

835 int x1 = input[0];	843 tran_high_t x1 = input[0];

836 int x2 = input[13];	844 tran_high_t x2 = input[13];

837 int x3 = input[2];	845 tran_high_t x3 = input[2];

838 int x4 = input[11];	846 tran_high_t x4 = input[11];

839 int x5 = input[4];	847 tran_high_t x5 = input[4];

840 int x6 = input[9];	848 tran_high_t x6 = input[9];

841 int x7 = input[6];	849 tran_high_t x7 = input[6];

842 int x8 = input[7];	850 tran_high_t x8 = input[7];

843 int x9 = input[8];	851 tran_high_t x9 = input[8];

844 int x10 = input[5];	852 tran_high_t x10 = input[5];

845 int x11 = input[10];	853 tran_high_t x11 = input[10];

846 int x12 = input[3];	854 tran_high_t x12 = input[3];

847 int x13 = input[12];	855 tran_high_t x13 = input[12];

848 int x14 = input[1];	856 tran_high_t x14 = input[1];

849 int x15 = input[14];	857 tran_high_t x15 = input[14];

850	858

851 // stage 1	859 // stage 1

852 s0 = x0 * cospi_1_64 + x1 * cospi_31_64;	860 s0 = x0 * cospi_1_64 + x1 * cospi_31_64;

853 s1 = x0 * cospi_31_64 - x1 * cospi_1_64;	861 s1 = x0 * cospi_31_64 - x1 * cospi_1_64;

854 s2 = x2 * cospi_5_64 + x3 * cospi_27_64;	862 s2 = x2 * cospi_5_64 + x3 * cospi_27_64;

855 s3 = x2 * cospi_27_64 - x3 * cospi_5_64;	863 s3 = x2 * cospi_27_64 - x3 * cospi_5_64;

856 s4 = x4 * cospi_9_64 + x5 * cospi_23_64;	864 s4 = x4 * cospi_9_64 + x5 * cospi_23_64;

857 s5 = x4 * cospi_23_64 - x5 * cospi_9_64;	865 s5 = x4 * cospi_23_64 - x5 * cospi_9_64;

858 s6 = x6 * cospi_13_64 + x7 * cospi_19_64;	866 s6 = x6 * cospi_13_64 + x7 * cospi_19_64;

859 s7 = x6 * cospi_19_64 - x7 * cospi_13_64;	867 s7 = x6 * cospi_19_64 - x7 * cospi_13_64;

(...skipping 130 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
990 output[15] = - x1;	998 output[15] = - x1;

991 }	999 }

992	1000

993 static const transform_2d FHT_16[] = {	1001 static const transform_2d FHT_16[] = {

994 { fdct16, fdct16 }, // DCT_DCT = 0	1002 { fdct16, fdct16 }, // DCT_DCT = 0

995 { fadst16, fdct16 }, // ADST_DCT = 1	1003 { fadst16, fdct16 }, // ADST_DCT = 1

996 { fdct16, fadst16 }, // DCT_ADST = 2	1004 { fdct16, fadst16 }, // DCT_ADST = 2

997 { fadst16, fadst16 } // ADST_ADST = 3	1005 { fadst16, fadst16 } // ADST_ADST = 3

998 };	1006 };

999	1007

1000 void vp9_fht16x16_c(const int16_t *input, int16_t *output,	1008 void vp9_fht16x16_c(const int16_t *input, tran_low_t *output,

1001 int stride, int tx_type) {	1009 int stride, int tx_type) {

1002 if (tx_type == DCT_DCT) {	1010 if (tx_type == DCT_DCT) {

1003 vp9_fdct16x16_c(input, output, stride);	1011 vp9_fdct16x16_c(input, output, stride);

1004 } else {	1012 } else {

1005 int16_t out[256];	1013 tran_low_t out[256];

1006 int16_t *outptr = &out[0];	1014 tran_low_t *outptr = &out[0];

1007 int i, j;	1015 int i, j;

1008 int16_t temp_in[16], temp_out[16];	1016 tran_low_t temp_in[16], temp_out[16];

1009 const transform_2d ht = FHT_16[tx_type];	1017 const transform_2d ht = FHT_16[tx_type];

1010	1018

1011 // Columns	1019 // Columns

1012 for (i = 0; i < 16; ++i) {	1020 for (i = 0; i < 16; ++i) {

1013 for (j = 0; j < 16; ++j)	1021 for (j = 0; j < 16; ++j)

1014 temp_in[j] = input[j * stride + i] * 4;	1022 temp_in[j] = input[j * stride + i] * 4;

1015 ht.cols(temp_in, temp_out);	1023 ht.cols(temp_in, temp_out);

1016 for (j = 0; j < 16; ++j)	1024 for (j = 0; j < 16; ++j)

1017 outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;	1025 outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;

1018 }	1026 }

1019	1027

1020 // Rows	1028 // Rows

1021 for (i = 0; i < 16; ++i) {	1029 for (i = 0; i < 16; ++i) {

1022 for (j = 0; j < 16; ++j)	1030 for (j = 0; j < 16; ++j)

1023 temp_in[j] = out[j + i * 16];	1031 temp_in[j] = out[j + i * 16];

1024 ht.rows(temp_in, temp_out);	1032 ht.rows(temp_in, temp_out);

1025 for (j = 0; j < 16; ++j)	1033 for (j = 0; j < 16; ++j)

1026 output[j + i * 16] = temp_out[j];	1034 output[j + i * 16] = temp_out[j];

1027 }	1035 }

1028 }	1036 }

1029 }	1037 }

1030	1038

1031 static INLINE int dct_32_round(int input) {	1039 static INLINE tran_high_t dct_32_round(tran_high_t input) {

1032 int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);	1040 tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);

1033 assert(-131072 <= rv && rv <= 131071);	1041 // TODO(debargha, peter.derivaz): Find new bounds for this assert,

	1042 // and make the bounds consts.

	1043 // assert(-131072 <= rv && rv <= 131071);

1034 return rv;	1044 return rv;

1035 }	1045 }

1036	1046

1037 static INLINE int half_round_shift(int input) {	1047 static INLINE tran_high_t half_round_shift(tran_high_t input) {

1038 int rv = (input + 1 + (input < 0)) >> 2;	1048 tran_high_t rv = (input + 1 + (input < 0)) >> 2;

1039 return rv;	1049 return rv;

1040 }	1050 }

1041	1051

1042 static void fdct32(const int *input, int *output, int round) {	1052 static void fdct32(const tran_high_t *input, tran_high_t *output, int round) {

1043 int step[32];	1053 tran_high_t step[32];

1044 // Stage 1	1054 // Stage 1

1045 step[0] = input[0] + input[(32 - 1)];	1055 step[0] = input[0] + input[(32 - 1)];

1046 step[1] = input[1] + input[(32 - 2)];	1056 step[1] = input[1] + input[(32 - 2)];

1047 step[2] = input[2] + input[(32 - 3)];	1057 step[2] = input[2] + input[(32 - 3)];

1048 step[3] = input[3] + input[(32 - 4)];	1058 step[3] = input[3] + input[(32 - 4)];

1049 step[4] = input[4] + input[(32 - 5)];	1059 step[4] = input[4] + input[(32 - 5)];

1050 step[5] = input[5] + input[(32 - 6)];	1060 step[5] = input[5] + input[(32 - 6)];

1051 step[6] = input[6] + input[(32 - 7)];	1061 step[6] = input[6] + input[(32 - 7)];

1052 step[7] = input[7] + input[(32 - 8)];	1062 step[7] = input[7] + input[(32 - 8)];

1053 step[8] = input[8] + input[(32 - 9)];	1063 step[8] = input[8] + input[(32 - 9)];

(...skipping 301 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1355 output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);	1365 output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);

1356 output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);	1366 output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);

1357 output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);	1367 output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);

1358 output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);	1368 output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);

1359 output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);	1369 output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);

1360 output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);	1370 output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);

1361 output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);	1371 output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);

1362 output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);	1372 output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);

1363 }	1373 }

1364	1374

1365 void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride) {	1375 void vp9_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) {

1366 int r, c;	1376 int r, c;

1367 int16_t sum = 0;	1377 tran_low_t sum = 0;

1368 for (r = 0; r < 32; ++r)	1378 for (r = 0; r < 32; ++r)

1369 for (c = 0; c < 32; ++c)	1379 for (c = 0; c < 32; ++c)

1370 sum += input[r * stride + c];	1380 sum += input[r * stride + c];

1371	1381

1372 output[0] = sum >> 3;	1382 output[0] = sum >> 3;

1373 output[1] = 0;	1383 output[1] = 0;

1374 }	1384 }

1375	1385

1376 void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) {	1386 void vp9_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {

1377 int i, j;	1387 int i, j;

1378 int output[32 * 32];	1388 tran_high_t output[32 * 32];

1379	1389

1380 // Columns	1390 // Columns

1381 for (i = 0; i < 32; ++i) {	1391 for (i = 0; i < 32; ++i) {

1382 int temp_in[32], temp_out[32];	1392 tran_high_t temp_in[32], temp_out[32];

1383 for (j = 0; j < 32; ++j)	1393 for (j = 0; j < 32; ++j)

1384 temp_in[j] = input[j * stride + i] * 4;	1394 temp_in[j] = input[j * stride + i] * 4;

1385 fdct32(temp_in, temp_out, 0);	1395 fdct32(temp_in, temp_out, 0);

1386 for (j = 0; j < 32; ++j)	1396 for (j = 0; j < 32; ++j)

1387 output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;	1397 output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;

1388 }	1398 }

1389	1399

1390 // Rows	1400 // Rows

1391 for (i = 0; i < 32; ++i) {	1401 for (i = 0; i < 32; ++i) {

1392 int temp_in[32], temp_out[32];	1402 tran_high_t temp_in[32], temp_out[32];

1393 for (j = 0; j < 32; ++j)	1403 for (j = 0; j < 32; ++j)

1394 temp_in[j] = output[j + i * 32];	1404 temp_in[j] = output[j + i * 32];

1395 fdct32(temp_in, temp_out, 0);	1405 fdct32(temp_in, temp_out, 0);

1396 for (j = 0; j < 32; ++j)	1406 for (j = 0; j < 32; ++j)

1397 out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;	1407 out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;

1398 }	1408 }

1399 }	1409 }

1400	1410

1401 // Note that although we use dct_32_round in dct32 computation flow,	1411 // Note that although we use dct_32_round in dct32 computation flow,

1402 // this 2d fdct32x32 for rate-distortion optimization loop is operating	1412 // this 2d fdct32x32 for rate-distortion optimization loop is operating

1403 // within 16 bits precision.	1413 // within 16 bits precision.

1404 void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {	1414 void vp9_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {

1405 int i, j;	1415 int i, j;

1406 int output[32 * 32];	1416 tran_high_t output[32 * 32];

1407	1417

1408 // Columns	1418 // Columns

1409 for (i = 0; i < 32; ++i) {	1419 for (i = 0; i < 32; ++i) {

1410 int temp_in[32], temp_out[32];	1420 tran_high_t temp_in[32], temp_out[32];

1411 for (j = 0; j < 32; ++j)	1421 for (j = 0; j < 32; ++j)

1412 temp_in[j] = input[j * stride + i] * 4;	1422 temp_in[j] = input[j * stride + i] * 4;

1413 fdct32(temp_in, temp_out, 0);	1423 fdct32(temp_in, temp_out, 0);

1414 for (j = 0; j < 32; ++j)	1424 for (j = 0; j < 32; ++j)

1415 // TODO(cd): see quality impact of only doing	1425 // TODO(cd): see quality impact of only doing

1416 // output[j * 32 + i] = (temp_out[j] + 1) >> 2;	1426 // output[j * 32 + i] = (temp_out[j] + 1) >> 2;

1417 // PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c	1427 // PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c

1418 output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;	1428 output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;

1419 }	1429 }

1420	1430

1421 // Rows	1431 // Rows

1422 for (i = 0; i < 32; ++i) {	1432 for (i = 0; i < 32; ++i) {

1423 int temp_in[32], temp_out[32];	1433 tran_high_t temp_in[32], temp_out[32];

1424 for (j = 0; j < 32; ++j)	1434 for (j = 0; j < 32; ++j)

1425 temp_in[j] = output[j + i * 32];	1435 temp_in[j] = output[j + i * 32];

1426 fdct32(temp_in, temp_out, 1);	1436 fdct32(temp_in, temp_out, 1);

1427 for (j = 0; j < 32; ++j)	1437 for (j = 0; j < 32; ++j)

1428 out[j + i * 32] = temp_out[j];	1438 out[j + i * 32] = temp_out[j];

1429 }	1439 }

1430 }	1440 }

	1441

	1442 #if CONFIG_VP9_HIGHBITDEPTH

	1443 void vp9_high_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {

	1444 vp9_fdct4x4_c(input, output, stride);

	1445 }

	1446

	1447 void vp9_high_fht4x4_c(const int16_t *input, tran_low_t *output,

	1448 int stride, int tx_type) {

	1449 vp9_fht4x4_c(input, output, stride, tx_type);

	1450 }

	1451

	1452 void vp9_high_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output,

	1453 int stride) {

	1454 vp9_fdct8x8_1_c(input, final_output, stride);

	1455 }

	1456

	1457 void vp9_high_fdct8x8_c(const int16_t *input, tran_low_t *final_output,

	1458 int stride) {

	1459 vp9_fdct8x8_c(input, final_output, stride);

	1460 }

	1461

	1462 void vp9_high_fdct16x16_1_c(const int16_t *input, tran_low_t *output,

	1463 int stride) {

	1464 vp9_fdct16x16_1_c(input, output, stride);

	1465 }

	1466

	1467 void vp9_high_fdct16x16_c(const int16_t *input, tran_low_t *output,

	1468 int stride) {

	1469 vp9_fdct16x16_c(input, output, stride);

	1470 }

	1471

	1472 void vp9_high_fht8x8_c(const int16_t *input, tran_low_t *output,

	1473 int stride, int tx_type) {

	1474 vp9_fht8x8_c(input, output, stride, tx_type);

	1475 }

	1476

	1477 void vp9_high_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {

	1478 vp9_fwht4x4_c(input, output, stride);

	1479 }

	1480

	1481 void vp9_high_fht16x16_c(const int16_t *input, tran_low_t *output,

	1482 int stride, int tx_type) {

	1483 vp9_fht16x16_c(input, output, stride, tx_type);

	1484 }

	1485

	1486 void vp9_high_fdct32x32_1_c(const int16_t *input, tran_low_t *out, int stride) {

	1487 vp9_fdct32x32_1_c(input, out, stride);

	1488 }

	1489

	1490 void vp9_high_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {

	1491 vp9_fdct32x32_c(input, out, stride);

	1492 }

	1493

	1494 void vp9_high_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,

	1495 int stride) {

	1496 vp9_fdct32x32_rd_c(input, out, stride);

	1497 }

	1498 #endif // CONFIG_VP9_HIGHBITDEPTH

OLD	NEW

« no previous file with comments | « source/libvpx/vp9/encoder/vp9_context_tree.c ('k') | source/libvpx/vp9/encoder/vp9_denoiser.h » ('j') | no next file with comments »