OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include <assert.h> | 11 #include <assert.h> |
12 #include <math.h> | 12 #include <math.h> |
13 | 13 |
14 #include "./vpx_config.h" | 14 #include "./vpx_config.h" |
15 #include "./vp9_rtcd.h" | 15 #include "./vp9_rtcd.h" |
16 #include "vp9/common/vp9_systemdependent.h" | 16 #include "vp9/common/vp9_systemdependent.h" |
17 #include "vp9/common/vp9_blockd.h" | 17 #include "vp9/common/vp9_blockd.h" |
18 #include "vp9/common/vp9_common.h" | 18 #include "vp9/common/vp9_common.h" |
19 #include "vp9/common/vp9_idct.h" | 19 #include "vp9/common/vp9_idct.h" |
20 | 20 |
#if CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH
// When CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH is 1 the transform performs
// strict overflow wrapping to match expected hardware implementations.
// bd of 8 uses trans_low with 16 bits, need to remove 16 bits
// bd of 10 uses trans_low with 18 bits, need to remove 14 bits
// bd of 12 uses trans_low with 20 bits, need to remove 12 bits
// bd of x uses trans_low with 8+x bits, need to remove 24-x bits
// The argument is parenthesized so expressions such as WRAPLOW(a + b)
// expand correctly. The expansion reads a variable 'bd' that must be in
// scope at every use site.
// NOTE(review): left-shifting a negative signed value is undefined
// behavior in ISO C; this relies on the two's-complement behavior of the
// supported toolchains -- confirm against project compiler assumptions.
#define WRAPLOW(x) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))
#else
#define WRAPLOW(x) (x)
#endif  // CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH

#if CONFIG_VP9_HIGHBITDEPTH
// Clamp 'value' to the inclusive range [low, high].
static INLINE tran_low_t clamp_high(tran_high_t value, tran_low_t low,
                                    tran_low_t high) {
  return value < low ? low : (value > high ? high : value);
}

// Add residual 'trans' to predictor sample 'dest' and clamp the sum to the
// valid sample range for bit depth 'bd':
// 8 -> [0, 255], 10 -> [0, 1023], 12 -> [0, 4095].
static INLINE tran_low_t clip_pixel_bd_high(tran_high_t dest,
                                            tran_high_t trans, int bd) {
  trans = WRAPLOW(trans);
  switch (bd) {
    case 8:
    default:
      return clamp_high(WRAPLOW(dest + trans), 0, 255);
    case 10:
      return clamp_high(WRAPLOW(dest + trans), 0, 1023);
    case 12:
      return clamp_high(WRAPLOW(dest + trans), 0, 4095);
  }
}
#endif  // CONFIG_VP9_HIGHBITDEPTH
| 53 |
| 54 void vp9_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { |
22 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, | 55 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, |
23 0.5 shifts per pixel. */ | 56 0.5 shifts per pixel. */ |
24 int i; | 57 int i; |
25 int16_t output[16]; | 58 tran_low_t output[16]; |
26 int a1, b1, c1, d1, e1; | 59 tran_high_t a1, b1, c1, d1, e1; |
27 const int16_t *ip = input; | 60 const tran_low_t *ip = input; |
28 int16_t *op = output; | 61 tran_low_t *op = output; |
29 | 62 |
30 for (i = 0; i < 4; i++) { | 63 for (i = 0; i < 4; i++) { |
31 a1 = ip[0] >> UNIT_QUANT_SHIFT; | 64 a1 = ip[0] >> UNIT_QUANT_SHIFT; |
32 c1 = ip[1] >> UNIT_QUANT_SHIFT; | 65 c1 = ip[1] >> UNIT_QUANT_SHIFT; |
33 d1 = ip[2] >> UNIT_QUANT_SHIFT; | 66 d1 = ip[2] >> UNIT_QUANT_SHIFT; |
34 b1 = ip[3] >> UNIT_QUANT_SHIFT; | 67 b1 = ip[3] >> UNIT_QUANT_SHIFT; |
35 a1 += c1; | 68 a1 += c1; |
36 d1 -= b1; | 69 d1 -= b1; |
37 e1 = (a1 - d1) >> 1; | 70 e1 = (a1 - d1) >> 1; |
38 b1 = e1 - b1; | 71 b1 = e1 - b1; |
(...skipping 24 matching lines...) Expand all Loading... |
63 dest[stride * 0] = clip_pixel(dest[stride * 0] + a1); | 96 dest[stride * 0] = clip_pixel(dest[stride * 0] + a1); |
64 dest[stride * 1] = clip_pixel(dest[stride * 1] + b1); | 97 dest[stride * 1] = clip_pixel(dest[stride * 1] + b1); |
65 dest[stride * 2] = clip_pixel(dest[stride * 2] + c1); | 98 dest[stride * 2] = clip_pixel(dest[stride * 2] + c1); |
66 dest[stride * 3] = clip_pixel(dest[stride * 3] + d1); | 99 dest[stride * 3] = clip_pixel(dest[stride * 3] + d1); |
67 | 100 |
68 ip++; | 101 ip++; |
69 dest++; | 102 dest++; |
70 } | 103 } |
71 } | 104 } |
72 | 105 |
73 void vp9_iwht4x4_1_add_c(const int16_t *in, uint8_t *dest, int dest_stride) { | 106 void vp9_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) { |
74 int i; | 107 int i; |
75 int a1, e1; | 108 tran_high_t a1, e1; |
76 int16_t tmp[4]; | 109 tran_low_t tmp[4]; |
77 const int16_t *ip = in; | 110 const tran_low_t *ip = in; |
78 int16_t *op = tmp; | 111 tran_low_t *op = tmp; |
79 | 112 |
80 a1 = ip[0] >> UNIT_QUANT_SHIFT; | 113 a1 = ip[0] >> UNIT_QUANT_SHIFT; |
81 e1 = a1 >> 1; | 114 e1 = a1 >> 1; |
82 a1 -= e1; | 115 a1 -= e1; |
83 op[0] = a1; | 116 op[0] = a1; |
84 op[1] = op[2] = op[3] = e1; | 117 op[1] = op[2] = op[3] = e1; |
85 | 118 |
86 ip = tmp; | 119 ip = tmp; |
87 for (i = 0; i < 4; i++) { | 120 for (i = 0; i < 4; i++) { |
88 e1 = ip[0] >> 1; | 121 e1 = ip[0] >> 1; |
89 a1 = ip[0] - e1; | 122 a1 = ip[0] - e1; |
90 dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1); | 123 dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1); |
91 dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + e1); | 124 dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + e1); |
92 dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + e1); | 125 dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + e1); |
93 dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + e1); | 126 dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + e1); |
94 ip++; | 127 ip++; |
95 dest++; | 128 dest++; |
96 } | 129 } |
97 } | 130 } |
98 | 131 |
99 static void idct4(const int16_t *input, int16_t *output) { | 132 static void idct4(const tran_low_t *input, tran_low_t *output) { |
100 int16_t step[4]; | 133 tran_low_t step[4]; |
101 int temp1, temp2; | 134 tran_high_t temp1, temp2; |
102 // stage 1 | 135 // stage 1 |
103 temp1 = (input[0] + input[2]) * cospi_16_64; | 136 temp1 = (input[0] + input[2]) * cospi_16_64; |
104 temp2 = (input[0] - input[2]) * cospi_16_64; | 137 temp2 = (input[0] - input[2]) * cospi_16_64; |
105 step[0] = dct_const_round_shift(temp1); | 138 step[0] = dct_const_round_shift(temp1); |
106 step[1] = dct_const_round_shift(temp2); | 139 step[1] = dct_const_round_shift(temp2); |
107 temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; | 140 temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; |
108 temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; | 141 temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; |
109 step[2] = dct_const_round_shift(temp1); | 142 step[2] = dct_const_round_shift(temp1); |
110 step[3] = dct_const_round_shift(temp2); | 143 step[3] = dct_const_round_shift(temp2); |
111 | 144 |
112 // stage 2 | 145 // stage 2 |
113 output[0] = step[0] + step[3]; | 146 output[0] = step[0] + step[3]; |
114 output[1] = step[1] + step[2]; | 147 output[1] = step[1] + step[2]; |
115 output[2] = step[1] - step[2]; | 148 output[2] = step[1] - step[2]; |
116 output[3] = step[0] - step[3]; | 149 output[3] = step[0] - step[3]; |
117 } | 150 } |
118 | 151 |
119 void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) { | 152 void vp9_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { |
120 int16_t out[4 * 4]; | 153 tran_low_t out[4 * 4]; |
121 int16_t *outptr = out; | 154 tran_low_t *outptr = out; |
122 int i, j; | 155 int i, j; |
123 int16_t temp_in[4], temp_out[4]; | 156 tran_low_t temp_in[4], temp_out[4]; |
124 | 157 |
125 // Rows | 158 // Rows |
126 for (i = 0; i < 4; ++i) { | 159 for (i = 0; i < 4; ++i) { |
127 idct4(input, outptr); | 160 idct4(input, outptr); |
128 input += 4; | 161 input += 4; |
129 outptr += 4; | 162 outptr += 4; |
130 } | 163 } |
131 | 164 |
132 // Columns | 165 // Columns |
133 for (i = 0; i < 4; ++i) { | 166 for (i = 0; i < 4; ++i) { |
134 for (j = 0; j < 4; ++j) | 167 for (j = 0; j < 4; ++j) |
135 temp_in[j] = out[j * 4 + i]; | 168 temp_in[j] = out[j * 4 + i]; |
136 idct4(temp_in, temp_out); | 169 idct4(temp_in, temp_out); |
137 for (j = 0; j < 4; ++j) | 170 for (j = 0; j < 4; ++j) |
138 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) | 171 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) |
139 + dest[j * stride + i]); | 172 + dest[j * stride + i]); |
140 } | 173 } |
141 } | 174 } |
142 | 175 |
143 void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride) { | 176 void vp9_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, |
| 177 int dest_stride) { |
144 int i; | 178 int i; |
145 int a1; | 179 tran_high_t a1; |
146 int16_t out = dct_const_round_shift(input[0] * cospi_16_64); | 180 tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64); |
147 out = dct_const_round_shift(out * cospi_16_64); | 181 out = dct_const_round_shift(out * cospi_16_64); |
148 a1 = ROUND_POWER_OF_TWO(out, 4); | 182 a1 = ROUND_POWER_OF_TWO(out, 4); |
149 | 183 |
150 for (i = 0; i < 4; i++) { | 184 for (i = 0; i < 4; i++) { |
151 dest[0] = clip_pixel(dest[0] + a1); | 185 dest[0] = clip_pixel(dest[0] + a1); |
152 dest[1] = clip_pixel(dest[1] + a1); | 186 dest[1] = clip_pixel(dest[1] + a1); |
153 dest[2] = clip_pixel(dest[2] + a1); | 187 dest[2] = clip_pixel(dest[2] + a1); |
154 dest[3] = clip_pixel(dest[3] + a1); | 188 dest[3] = clip_pixel(dest[3] + a1); |
155 dest += dest_stride; | 189 dest += dest_stride; |
156 } | 190 } |
157 } | 191 } |
158 | 192 |
159 static void idct8(const int16_t *input, int16_t *output) { | 193 static void idct8(const tran_low_t *input, tran_low_t *output) { |
160 int16_t step1[8], step2[8]; | 194 tran_low_t step1[8], step2[8]; |
161 int temp1, temp2; | 195 tran_high_t temp1, temp2; |
162 // stage 1 | 196 // stage 1 |
163 step1[0] = input[0]; | 197 step1[0] = input[0]; |
164 step1[2] = input[4]; | 198 step1[2] = input[4]; |
165 step1[1] = input[2]; | 199 step1[1] = input[2]; |
166 step1[3] = input[6]; | 200 step1[3] = input[6]; |
167 temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; | 201 temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; |
168 temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; | 202 temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; |
169 step1[4] = dct_const_round_shift(temp1); | 203 step1[4] = dct_const_round_shift(temp1); |
170 step1[7] = dct_const_round_shift(temp2); | 204 step1[7] = dct_const_round_shift(temp2); |
171 temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; | 205 temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; |
(...skipping 22 matching lines...) Expand all Loading... |
194 output[0] = step1[0] + step1[7]; | 228 output[0] = step1[0] + step1[7]; |
195 output[1] = step1[1] + step1[6]; | 229 output[1] = step1[1] + step1[6]; |
196 output[2] = step1[2] + step1[5]; | 230 output[2] = step1[2] + step1[5]; |
197 output[3] = step1[3] + step1[4]; | 231 output[3] = step1[3] + step1[4]; |
198 output[4] = step1[3] - step1[4]; | 232 output[4] = step1[3] - step1[4]; |
199 output[5] = step1[2] - step1[5]; | 233 output[5] = step1[2] - step1[5]; |
200 output[6] = step1[1] - step1[6]; | 234 output[6] = step1[1] - step1[6]; |
201 output[7] = step1[0] - step1[7]; | 235 output[7] = step1[0] - step1[7]; |
202 } | 236 } |
203 | 237 |
204 void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) { | 238 void vp9_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) { |
205 int16_t out[8 * 8]; | 239 tran_low_t out[8 * 8]; |
206 int16_t *outptr = out; | 240 tran_low_t *outptr = out; |
207 int i, j; | 241 int i, j; |
208 int16_t temp_in[8], temp_out[8]; | 242 tran_low_t temp_in[8], temp_out[8]; |
209 | 243 |
210 // First transform rows | 244 // First transform rows |
211 for (i = 0; i < 8; ++i) { | 245 for (i = 0; i < 8; ++i) { |
212 idct8(input, outptr); | 246 idct8(input, outptr); |
213 input += 8; | 247 input += 8; |
214 outptr += 8; | 248 outptr += 8; |
215 } | 249 } |
216 | 250 |
217 // Then transform columns | 251 // Then transform columns |
218 for (i = 0; i < 8; ++i) { | 252 for (i = 0; i < 8; ++i) { |
219 for (j = 0; j < 8; ++j) | 253 for (j = 0; j < 8; ++j) |
220 temp_in[j] = out[j * 8 + i]; | 254 temp_in[j] = out[j * 8 + i]; |
221 idct8(temp_in, temp_out); | 255 idct8(temp_in, temp_out); |
222 for (j = 0; j < 8; ++j) | 256 for (j = 0; j < 8; ++j) |
223 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) | 257 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) |
224 + dest[j * stride + i]); | 258 + dest[j * stride + i]); |
225 } | 259 } |
226 } | 260 } |
227 | 261 |
228 void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int stride) { | 262 void vp9_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { |
229 int i, j; | 263 int i, j; |
230 int a1; | 264 tran_high_t a1; |
231 int16_t out = dct_const_round_shift(input[0] * cospi_16_64); | 265 tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64); |
232 out = dct_const_round_shift(out * cospi_16_64); | 266 out = dct_const_round_shift(out * cospi_16_64); |
233 a1 = ROUND_POWER_OF_TWO(out, 5); | 267 a1 = ROUND_POWER_OF_TWO(out, 5); |
234 for (j = 0; j < 8; ++j) { | 268 for (j = 0; j < 8; ++j) { |
235 for (i = 0; i < 8; ++i) | 269 for (i = 0; i < 8; ++i) |
236 dest[i] = clip_pixel(dest[i] + a1); | 270 dest[i] = clip_pixel(dest[i] + a1); |
237 dest += stride; | 271 dest += stride; |
238 } | 272 } |
239 } | 273 } |
240 | 274 |
241 static void iadst4(const int16_t *input, int16_t *output) { | 275 static void iadst4(const tran_low_t *input, tran_low_t *output) { |
242 int s0, s1, s2, s3, s4, s5, s6, s7; | 276 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; |
243 | 277 |
244 int x0 = input[0]; | 278 tran_high_t x0 = input[0]; |
245 int x1 = input[1]; | 279 tran_high_t x1 = input[1]; |
246 int x2 = input[2]; | 280 tran_high_t x2 = input[2]; |
247 int x3 = input[3]; | 281 tran_high_t x3 = input[3]; |
248 | 282 |
249 if (!(x0 | x1 | x2 | x3)) { | 283 if (!(x0 | x1 | x2 | x3)) { |
250 output[0] = output[1] = output[2] = output[3] = 0; | 284 output[0] = output[1] = output[2] = output[3] = 0; |
251 return; | 285 return; |
252 } | 286 } |
253 | 287 |
254 s0 = sinpi_1_9 * x0; | 288 s0 = sinpi_1_9 * x0; |
255 s1 = sinpi_2_9 * x0; | 289 s1 = sinpi_2_9 * x0; |
256 s2 = sinpi_3_9 * x1; | 290 s2 = sinpi_3_9 * x1; |
257 s3 = sinpi_4_9 * x2; | 291 s3 = sinpi_4_9 * x2; |
(...skipping 15 matching lines...) Expand all Loading... |
273 // 1-D transform scaling factor is sqrt(2). | 307 // 1-D transform scaling factor is sqrt(2). |
274 // The overall dynamic range is 14b (input) + 14b (multiplication scaling) | 308 // The overall dynamic range is 14b (input) + 14b (multiplication scaling) |
275 // + 1b (addition) = 29b. | 309 // + 1b (addition) = 29b. |
276 // Hence the output bit depth is 15b. | 310 // Hence the output bit depth is 15b. |
277 output[0] = dct_const_round_shift(s0); | 311 output[0] = dct_const_round_shift(s0); |
278 output[1] = dct_const_round_shift(s1); | 312 output[1] = dct_const_round_shift(s1); |
279 output[2] = dct_const_round_shift(s2); | 313 output[2] = dct_const_round_shift(s2); |
280 output[3] = dct_const_round_shift(s3); | 314 output[3] = dct_const_round_shift(s3); |
281 } | 315 } |
282 | 316 |
283 void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride, | 317 void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, |
284 int tx_type) { | 318 int tx_type) { |
285 const transform_2d IHT_4[] = { | 319 const transform_2d IHT_4[] = { |
286 { idct4, idct4 }, // DCT_DCT = 0 | 320 { idct4, idct4 }, // DCT_DCT = 0 |
287 { iadst4, idct4 }, // ADST_DCT = 1 | 321 { iadst4, idct4 }, // ADST_DCT = 1 |
288 { idct4, iadst4 }, // DCT_ADST = 2 | 322 { idct4, iadst4 }, // DCT_ADST = 2 |
289 { iadst4, iadst4 } // ADST_ADST = 3 | 323 { iadst4, iadst4 } // ADST_ADST = 3 |
290 }; | 324 }; |
291 | 325 |
292 int i, j; | 326 int i, j; |
293 int16_t out[4 * 4]; | 327 tran_low_t out[4 * 4]; |
294 int16_t *outptr = out; | 328 tran_low_t *outptr = out; |
295 int16_t temp_in[4], temp_out[4]; | 329 tran_low_t temp_in[4], temp_out[4]; |
296 | 330 |
297 // inverse transform row vectors | 331 // inverse transform row vectors |
298 for (i = 0; i < 4; ++i) { | 332 for (i = 0; i < 4; ++i) { |
299 IHT_4[tx_type].rows(input, outptr); | 333 IHT_4[tx_type].rows(input, outptr); |
300 input += 4; | 334 input += 4; |
301 outptr += 4; | 335 outptr += 4; |
302 } | 336 } |
303 | 337 |
304 // inverse transform column vectors | 338 // inverse transform column vectors |
305 for (i = 0; i < 4; ++i) { | 339 for (i = 0; i < 4; ++i) { |
306 for (j = 0; j < 4; ++j) | 340 for (j = 0; j < 4; ++j) |
307 temp_in[j] = out[j * 4 + i]; | 341 temp_in[j] = out[j * 4 + i]; |
308 IHT_4[tx_type].cols(temp_in, temp_out); | 342 IHT_4[tx_type].cols(temp_in, temp_out); |
309 for (j = 0; j < 4; ++j) | 343 for (j = 0; j < 4; ++j) |
310 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) | 344 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) |
311 + dest[j * stride + i]); | 345 + dest[j * stride + i]); |
312 } | 346 } |
313 } | 347 } |
314 static void iadst8(const int16_t *input, int16_t *output) { | 348 static void iadst8(const tran_low_t *input, tran_low_t *output) { |
315 int s0, s1, s2, s3, s4, s5, s6, s7; | 349 int s0, s1, s2, s3, s4, s5, s6, s7; |
316 | 350 |
317 int x0 = input[7]; | 351 tran_high_t x0 = input[7]; |
318 int x1 = input[0]; | 352 tran_high_t x1 = input[0]; |
319 int x2 = input[5]; | 353 tran_high_t x2 = input[5]; |
320 int x3 = input[2]; | 354 tran_high_t x3 = input[2]; |
321 int x4 = input[3]; | 355 tran_high_t x4 = input[3]; |
322 int x5 = input[4]; | 356 tran_high_t x5 = input[4]; |
323 int x6 = input[1]; | 357 tran_high_t x6 = input[1]; |
324 int x7 = input[6]; | 358 tran_high_t x7 = input[6]; |
325 | 359 |
326 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { | 360 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { |
327 output[0] = output[1] = output[2] = output[3] = output[4] | 361 output[0] = output[1] = output[2] = output[3] = output[4] |
328 = output[5] = output[6] = output[7] = 0; | 362 = output[5] = output[6] = output[7] = 0; |
329 return; | 363 return; |
330 } | 364 } |
331 | 365 |
332 // stage 1 | 366 // stage 1 |
333 s0 = cospi_2_64 * x0 + cospi_30_64 * x1; | 367 s0 = cospi_2_64 * x0 + cospi_30_64 * x1; |
334 s1 = cospi_30_64 * x0 - cospi_2_64 * x1; | 368 s1 = cospi_30_64 * x0 - cospi_2_64 * x1; |
(...skipping 53 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
388 output[7] = -x1; | 422 output[7] = -x1; |
389 } | 423 } |
390 | 424 |
// Row/column 1-D transform pairs for the 8x8 hybrid transform, indexed by
// tx_type ({DCT, ADST} on rows x {DCT, ADST} on columns).
static const transform_2d IHT_8[] = {
  { idct8, idct8 },   // DCT_DCT = 0
  { iadst8, idct8 },  // ADST_DCT = 1
  { idct8, iadst8 },  // DCT_ADST = 2
  { iadst8, iadst8 }  // ADST_ADST = 3
};
397 | 431 |
398 void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride, | 432 void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, |
399 int tx_type) { | 433 int tx_type) { |
400 int i, j; | 434 int i, j; |
401 int16_t out[8 * 8]; | 435 tran_low_t out[8 * 8]; |
402 int16_t *outptr = out; | 436 tran_low_t *outptr = out; |
403 int16_t temp_in[8], temp_out[8]; | 437 tran_low_t temp_in[8], temp_out[8]; |
404 const transform_2d ht = IHT_8[tx_type]; | 438 const transform_2d ht = IHT_8[tx_type]; |
405 | 439 |
406 // inverse transform row vectors | 440 // inverse transform row vectors |
407 for (i = 0; i < 8; ++i) { | 441 for (i = 0; i < 8; ++i) { |
408 ht.rows(input, outptr); | 442 ht.rows(input, outptr); |
409 input += 8; | 443 input += 8; |
410 outptr += 8; | 444 outptr += 8; |
411 } | 445 } |
412 | 446 |
413 // inverse transform column vectors | 447 // inverse transform column vectors |
414 for (i = 0; i < 8; ++i) { | 448 for (i = 0; i < 8; ++i) { |
415 for (j = 0; j < 8; ++j) | 449 for (j = 0; j < 8; ++j) |
416 temp_in[j] = out[j * 8 + i]; | 450 temp_in[j] = out[j * 8 + i]; |
417 ht.cols(temp_in, temp_out); | 451 ht.cols(temp_in, temp_out); |
418 for (j = 0; j < 8; ++j) | 452 for (j = 0; j < 8; ++j) |
419 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) | 453 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) |
420 + dest[j * stride + i]); | 454 + dest[j * stride + i]); |
421 } | 455 } |
422 } | 456 } |
423 | 457 |
424 void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int stride) { | 458 void vp9_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) { |
425 int16_t out[8 * 8] = { 0 }; | 459 tran_low_t out[8 * 8] = { 0 }; |
426 int16_t *outptr = out; | 460 tran_low_t *outptr = out; |
427 int i, j; | 461 int i, j; |
428 int16_t temp_in[8], temp_out[8]; | 462 tran_low_t temp_in[8], temp_out[8]; |
429 | 463 |
430 // First transform rows | 464 // First transform rows |
431 // only first 4 row has non-zero coefs | 465 // only first 4 row has non-zero coefs |
432 for (i = 0; i < 4; ++i) { | 466 for (i = 0; i < 4; ++i) { |
433 idct8(input, outptr); | 467 idct8(input, outptr); |
434 input += 8; | 468 input += 8; |
435 outptr += 8; | 469 outptr += 8; |
436 } | 470 } |
437 | 471 |
438 // Then transform columns | 472 // Then transform columns |
439 for (i = 0; i < 8; ++i) { | 473 for (i = 0; i < 8; ++i) { |
440 for (j = 0; j < 8; ++j) | 474 for (j = 0; j < 8; ++j) |
441 temp_in[j] = out[j * 8 + i]; | 475 temp_in[j] = out[j * 8 + i]; |
442 idct8(temp_in, temp_out); | 476 idct8(temp_in, temp_out); |
443 for (j = 0; j < 8; ++j) | 477 for (j = 0; j < 8; ++j) |
444 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) | 478 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) |
445 + dest[j * stride + i]); | 479 + dest[j * stride + i]); |
446 } | 480 } |
447 } | 481 } |
448 | 482 |
449 static void idct16(const int16_t *input, int16_t *output) { | 483 static void idct16(const tran_low_t *input, tran_low_t *output) { |
450 int16_t step1[16], step2[16]; | 484 tran_low_t step1[16], step2[16]; |
451 int temp1, temp2; | 485 tran_high_t temp1, temp2; |
452 | 486 |
453 // stage 1 | 487 // stage 1 |
454 step1[0] = input[0/2]; | 488 step1[0] = input[0/2]; |
455 step1[1] = input[16/2]; | 489 step1[1] = input[16/2]; |
456 step1[2] = input[8/2]; | 490 step1[2] = input[8/2]; |
457 step1[3] = input[24/2]; | 491 step1[3] = input[24/2]; |
458 step1[4] = input[4/2]; | 492 step1[4] = input[4/2]; |
459 step1[5] = input[20/2]; | 493 step1[5] = input[20/2]; |
460 step1[6] = input[12/2]; | 494 step1[6] = input[12/2]; |
461 step1[7] = input[28/2]; | 495 step1[7] = input[28/2]; |
(...skipping 142 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
604 output[8] = step2[7] - step2[8]; | 638 output[8] = step2[7] - step2[8]; |
605 output[9] = step2[6] - step2[9]; | 639 output[9] = step2[6] - step2[9]; |
606 output[10] = step2[5] - step2[10]; | 640 output[10] = step2[5] - step2[10]; |
607 output[11] = step2[4] - step2[11]; | 641 output[11] = step2[4] - step2[11]; |
608 output[12] = step2[3] - step2[12]; | 642 output[12] = step2[3] - step2[12]; |
609 output[13] = step2[2] - step2[13]; | 643 output[13] = step2[2] - step2[13]; |
610 output[14] = step2[1] - step2[14]; | 644 output[14] = step2[1] - step2[14]; |
611 output[15] = step2[0] - step2[15]; | 645 output[15] = step2[0] - step2[15]; |
612 } | 646 } |
613 | 647 |
614 void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) { | 648 void vp9_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, |
615 int16_t out[16 * 16]; | 649 int stride) { |
616 int16_t *outptr = out; | 650 tran_low_t out[16 * 16]; |
| 651 tran_low_t *outptr = out; |
617 int i, j; | 652 int i, j; |
618 int16_t temp_in[16], temp_out[16]; | 653 tran_low_t temp_in[16], temp_out[16]; |
619 | 654 |
620 // First transform rows | 655 // First transform rows |
621 for (i = 0; i < 16; ++i) { | 656 for (i = 0; i < 16; ++i) { |
622 idct16(input, outptr); | 657 idct16(input, outptr); |
623 input += 16; | 658 input += 16; |
624 outptr += 16; | 659 outptr += 16; |
625 } | 660 } |
626 | 661 |
627 // Then transform columns | 662 // Then transform columns |
628 for (i = 0; i < 16; ++i) { | 663 for (i = 0; i < 16; ++i) { |
629 for (j = 0; j < 16; ++j) | 664 for (j = 0; j < 16; ++j) |
630 temp_in[j] = out[j * 16 + i]; | 665 temp_in[j] = out[j * 16 + i]; |
631 idct16(temp_in, temp_out); | 666 idct16(temp_in, temp_out); |
632 for (j = 0; j < 16; ++j) | 667 for (j = 0; j < 16; ++j) |
633 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) | 668 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) |
634 + dest[j * stride + i]); | 669 + dest[j * stride + i]); |
635 } | 670 } |
636 } | 671 } |
637 | 672 |
638 static void iadst16(const int16_t *input, int16_t *output) { | 673 static void iadst16(const tran_low_t *input, tran_low_t *output) { |
639 int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; | 674 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; |
| 675 tran_high_t s9, s10, s11, s12, s13, s14, s15; |
640 | 676 |
641 int x0 = input[15]; | 677 tran_high_t x0 = input[15]; |
642 int x1 = input[0]; | 678 tran_high_t x1 = input[0]; |
643 int x2 = input[13]; | 679 tran_high_t x2 = input[13]; |
644 int x3 = input[2]; | 680 tran_high_t x3 = input[2]; |
645 int x4 = input[11]; | 681 tran_high_t x4 = input[11]; |
646 int x5 = input[4]; | 682 tran_high_t x5 = input[4]; |
647 int x6 = input[9]; | 683 tran_high_t x6 = input[9]; |
648 int x7 = input[6]; | 684 tran_high_t x7 = input[6]; |
649 int x8 = input[7]; | 685 tran_high_t x8 = input[7]; |
650 int x9 = input[8]; | 686 tran_high_t x9 = input[8]; |
651 int x10 = input[5]; | 687 tran_high_t x10 = input[5]; |
652 int x11 = input[10]; | 688 tran_high_t x11 = input[10]; |
653 int x12 = input[3]; | 689 tran_high_t x12 = input[3]; |
654 int x13 = input[12]; | 690 tran_high_t x13 = input[12]; |
655 int x14 = input[1]; | 691 tran_high_t x14 = input[1]; |
656 int x15 = input[14]; | 692 tran_high_t x15 = input[14]; |
657 | 693 |
658 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | 694 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 |
659 | x9 | x10 | x11 | x12 | x13 | x14 | x15)) { | 695 | x9 | x10 | x11 | x12 | x13 | x14 | x15)) { |
660 output[0] = output[1] = output[2] = output[3] = output[4] | 696 output[0] = output[1] = output[2] = output[3] = output[4] |
661 = output[5] = output[6] = output[7] = output[8] | 697 = output[5] = output[6] = output[7] = output[8] |
662 = output[9] = output[10] = output[11] = output[12] | 698 = output[9] = output[10] = output[11] = output[12] |
663 = output[13] = output[14] = output[15] = 0; | 699 = output[13] = output[14] = output[15] = 0; |
664 return; | 700 return; |
665 } | 701 } |
666 | 702 |
(...skipping 139 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
806 output[15] = -x1; | 842 output[15] = -x1; |
807 } | 843 } |
808 | 844 |
// Row/column 1-D transform pairs for the 16x16 hybrid transform, indexed by
// tx_type ({DCT, ADST} on rows x {DCT, ADST} on columns).
static const transform_2d IHT_16[] = {
  { idct16, idct16 },   // DCT_DCT = 0
  { iadst16, idct16 },  // ADST_DCT = 1
  { idct16, iadst16 },  // DCT_ADST = 2
  { iadst16, iadst16 }  // ADST_ADST = 3
};
815 | 851 |
816 void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride, | 852 void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, |
817 int tx_type) { | 853 int tx_type) { |
818 int i, j; | 854 int i, j; |
819 int16_t out[16 * 16]; | 855 tran_low_t out[16 * 16]; |
820 int16_t *outptr = out; | 856 tran_low_t *outptr = out; |
821 int16_t temp_in[16], temp_out[16]; | 857 tran_low_t temp_in[16], temp_out[16]; |
822 const transform_2d ht = IHT_16[tx_type]; | 858 const transform_2d ht = IHT_16[tx_type]; |
823 | 859 |
824 // Rows | 860 // Rows |
825 for (i = 0; i < 16; ++i) { | 861 for (i = 0; i < 16; ++i) { |
826 ht.rows(input, outptr); | 862 ht.rows(input, outptr); |
827 input += 16; | 863 input += 16; |
828 outptr += 16; | 864 outptr += 16; |
829 } | 865 } |
830 | 866 |
831 // Columns | 867 // Columns |
832 for (i = 0; i < 16; ++i) { | 868 for (i = 0; i < 16; ++i) { |
833 for (j = 0; j < 16; ++j) | 869 for (j = 0; j < 16; ++j) |
834 temp_in[j] = out[j * 16 + i]; | 870 temp_in[j] = out[j * 16 + i]; |
835 ht.cols(temp_in, temp_out); | 871 ht.cols(temp_in, temp_out); |
836 for (j = 0; j < 16; ++j) | 872 for (j = 0; j < 16; ++j) |
837 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) | 873 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) |
838 + dest[j * stride + i]); | 874 + dest[j * stride + i]); |
839 } | 875 } |
840 } | 876 } |
841 | 877 |
842 void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) { | 878 void vp9_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, |
843 int16_t out[16 * 16] = { 0 }; | 879 int stride) { |
844 int16_t *outptr = out; | 880 tran_low_t out[16 * 16] = { 0 }; |
| 881 tran_low_t *outptr = out; |
845 int i, j; | 882 int i, j; |
846 int16_t temp_in[16], temp_out[16]; | 883 tran_low_t temp_in[16], temp_out[16]; |
847 | 884 |
848 // First transform rows. Since all non-zero dct coefficients are in | 885 // First transform rows. Since all non-zero dct coefficients are in |
849 // upper-left 4x4 area, we only need to calculate first 4 rows here. | 886 // upper-left 4x4 area, we only need to calculate first 4 rows here. |
850 for (i = 0; i < 4; ++i) { | 887 for (i = 0; i < 4; ++i) { |
851 idct16(input, outptr); | 888 idct16(input, outptr); |
852 input += 16; | 889 input += 16; |
853 outptr += 16; | 890 outptr += 16; |
854 } | 891 } |
855 | 892 |
856 // Then transform columns | 893 // Then transform columns |
857 for (i = 0; i < 16; ++i) { | 894 for (i = 0; i < 16; ++i) { |
858 for (j = 0; j < 16; ++j) | 895 for (j = 0; j < 16; ++j) |
859 temp_in[j] = out[j*16 + i]; | 896 temp_in[j] = out[j*16 + i]; |
860 idct16(temp_in, temp_out); | 897 idct16(temp_in, temp_out); |
861 for (j = 0; j < 16; ++j) | 898 for (j = 0; j < 16; ++j) |
862 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) | 899 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) |
863 + dest[j * stride + i]); | 900 + dest[j * stride + i]); |
864 } | 901 } |
865 } | 902 } |
866 | 903 |
867 void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int stride) { | 904 void vp9_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { |
868 int i, j; | 905 int i, j; |
869 int a1; | 906 tran_high_t a1; |
870 int16_t out = dct_const_round_shift(input[0] * cospi_16_64); | 907 tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64); |
871 out = dct_const_round_shift(out * cospi_16_64); | 908 out = dct_const_round_shift(out * cospi_16_64); |
872 a1 = ROUND_POWER_OF_TWO(out, 6); | 909 a1 = ROUND_POWER_OF_TWO(out, 6); |
873 for (j = 0; j < 16; ++j) { | 910 for (j = 0; j < 16; ++j) { |
874 for (i = 0; i < 16; ++i) | 911 for (i = 0; i < 16; ++i) |
875 dest[i] = clip_pixel(dest[i] + a1); | 912 dest[i] = clip_pixel(dest[i] + a1); |
876 dest += stride; | 913 dest += stride; |
877 } | 914 } |
878 } | 915 } |
879 | 916 |
880 static void idct32(const int16_t *input, int16_t *output) { | 917 static void idct32(const tran_low_t *input, tran_low_t *output) { |
881 int16_t step1[32], step2[32]; | 918 tran_low_t step1[32], step2[32]; |
882 int temp1, temp2; | 919 tran_high_t temp1, temp2; |
883 | 920 |
884 // stage 1 | 921 // stage 1 |
885 step1[0] = input[0]; | 922 step1[0] = input[0]; |
886 step1[1] = input[16]; | 923 step1[1] = input[16]; |
887 step1[2] = input[8]; | 924 step1[2] = input[8]; |
888 step1[3] = input[24]; | 925 step1[3] = input[24]; |
889 step1[4] = input[4]; | 926 step1[4] = input[4]; |
890 step1[5] = input[20]; | 927 step1[5] = input[20]; |
891 step1[6] = input[12]; | 928 step1[6] = input[12]; |
892 step1[7] = input[28]; | 929 step1[7] = input[28]; |
(...skipping 344 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1237 output[24] = step1[7] - step1[24]; | 1274 output[24] = step1[7] - step1[24]; |
1238 output[25] = step1[6] - step1[25]; | 1275 output[25] = step1[6] - step1[25]; |
1239 output[26] = step1[5] - step1[26]; | 1276 output[26] = step1[5] - step1[26]; |
1240 output[27] = step1[4] - step1[27]; | 1277 output[27] = step1[4] - step1[27]; |
1241 output[28] = step1[3] - step1[28]; | 1278 output[28] = step1[3] - step1[28]; |
1242 output[29] = step1[2] - step1[29]; | 1279 output[29] = step1[2] - step1[29]; |
1243 output[30] = step1[1] - step1[30]; | 1280 output[30] = step1[1] - step1[30]; |
1244 output[31] = step1[0] - step1[31]; | 1281 output[31] = step1[0] - step1[31]; |
1245 } | 1282 } |
1246 | 1283 |
1247 void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) { | 1284 void vp9_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, |
1248 int16_t out[32 * 32]; | 1285 int stride) { |
1249 int16_t *outptr = out; | 1286 tran_low_t out[32 * 32]; |
| 1287 tran_low_t *outptr = out; |
1250 int i, j; | 1288 int i, j; |
1251 int16_t temp_in[32], temp_out[32]; | 1289 tran_low_t temp_in[32], temp_out[32]; |
1252 | 1290 |
1253 // Rows | 1291 // Rows |
1254 for (i = 0; i < 32; ++i) { | 1292 for (i = 0; i < 32; ++i) { |
1255 int16_t zero_coeff[16]; | 1293 int16_t zero_coeff[16]; |
1256 for (j = 0; j < 16; ++j) | 1294 for (j = 0; j < 16; ++j) |
1257 zero_coeff[j] = input[2 * j] | input[2 * j + 1]; | 1295 zero_coeff[j] = input[2 * j] | input[2 * j + 1]; |
1258 for (j = 0; j < 8; ++j) | 1296 for (j = 0; j < 8; ++j) |
1259 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; | 1297 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; |
1260 for (j = 0; j < 4; ++j) | 1298 for (j = 0; j < 4; ++j) |
1261 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; | 1299 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; |
1262 for (j = 0; j < 2; ++j) | 1300 for (j = 0; j < 2; ++j) |
1263 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; | 1301 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; |
1264 | 1302 |
1265 if (zero_coeff[0] | zero_coeff[1]) | 1303 if (zero_coeff[0] | zero_coeff[1]) |
1266 idct32(input, outptr); | 1304 idct32(input, outptr); |
1267 else | 1305 else |
1268 vpx_memset(outptr, 0, sizeof(int16_t) * 32); | 1306 vpx_memset(outptr, 0, sizeof(tran_low_t) * 32); |
1269 input += 32; | 1307 input += 32; |
1270 outptr += 32; | 1308 outptr += 32; |
1271 } | 1309 } |
1272 | 1310 |
1273 // Columns | 1311 // Columns |
1274 for (i = 0; i < 32; ++i) { | 1312 for (i = 0; i < 32; ++i) { |
1275 for (j = 0; j < 32; ++j) | 1313 for (j = 0; j < 32; ++j) |
1276 temp_in[j] = out[j * 32 + i]; | 1314 temp_in[j] = out[j * 32 + i]; |
1277 idct32(temp_in, temp_out); | 1315 idct32(temp_in, temp_out); |
1278 for (j = 0; j < 32; ++j) | 1316 for (j = 0; j < 32; ++j) |
1279 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) | 1317 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) |
1280 + dest[j * stride + i]); | 1318 + dest[j * stride + i]); |
1281 } | 1319 } |
1282 } | 1320 } |
1283 | 1321 |
1284 void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) { | 1322 void vp9_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, |
1285 int16_t out[32 * 32] = {0}; | 1323 int stride) { |
1286 int16_t *outptr = out; | 1324 tran_low_t out[32 * 32] = {0}; |
| 1325 tran_low_t *outptr = out; |
1287 int i, j; | 1326 int i, j; |
1288 int16_t temp_in[32], temp_out[32]; | 1327 tran_low_t temp_in[32], temp_out[32]; |
1289 | 1328 |
1290 // Rows | 1329 // Rows |
1291 // only upper-left 8x8 has non-zero coeff | 1330 // only upper-left 8x8 has non-zero coeff |
1292 for (i = 0; i < 8; ++i) { | 1331 for (i = 0; i < 8; ++i) { |
1293 idct32(input, outptr); | 1332 idct32(input, outptr); |
1294 input += 32; | 1333 input += 32; |
1295 outptr += 32; | 1334 outptr += 32; |
1296 } | 1335 } |
1297 | 1336 |
1298 // Columns | 1337 // Columns |
1299 for (i = 0; i < 32; ++i) { | 1338 for (i = 0; i < 32; ++i) { |
1300 for (j = 0; j < 32; ++j) | 1339 for (j = 0; j < 32; ++j) |
1301 temp_in[j] = out[j * 32 + i]; | 1340 temp_in[j] = out[j * 32 + i]; |
1302 idct32(temp_in, temp_out); | 1341 idct32(temp_in, temp_out); |
1303 for (j = 0; j < 32; ++j) | 1342 for (j = 0; j < 32; ++j) |
1304 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) | 1343 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) |
1305 + dest[j * stride + i]); | 1344 + dest[j * stride + i]); |
1306 } | 1345 } |
1307 } | 1346 } |
1308 | 1347 |
1309 void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int stride) { | 1348 void vp9_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { |
1310 int i, j; | 1349 int i, j; |
1311 int a1; | 1350 tran_high_t a1; |
1312 | 1351 |
1313 int16_t out = dct_const_round_shift(input[0] * cospi_16_64); | 1352 tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64); |
1314 out = dct_const_round_shift(out * cospi_16_64); | 1353 out = dct_const_round_shift(out * cospi_16_64); |
1315 a1 = ROUND_POWER_OF_TWO(out, 6); | 1354 a1 = ROUND_POWER_OF_TWO(out, 6); |
1316 | 1355 |
1317 for (j = 0; j < 32; ++j) { | 1356 for (j = 0; j < 32; ++j) { |
1318 for (i = 0; i < 32; ++i) | 1357 for (i = 0; i < 32; ++i) |
1319 dest[i] = clip_pixel(dest[i] + a1); | 1358 dest[i] = clip_pixel(dest[i] + a1); |
1320 dest += stride; | 1359 dest += stride; |
1321 } | 1360 } |
1322 } | 1361 } |
1323 | 1362 |
1324 // idct | 1363 // idct |
1325 void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) { | 1364 void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, |
| 1365 int eob) { |
1326 if (eob > 1) | 1366 if (eob > 1) |
1327 vp9_idct4x4_16_add(input, dest, stride); | 1367 vp9_idct4x4_16_add(input, dest, stride); |
1328 else | 1368 else |
1329 vp9_idct4x4_1_add(input, dest, stride); | 1369 vp9_idct4x4_1_add(input, dest, stride); |
1330 } | 1370 } |
1331 | 1371 |
1332 | 1372 |
1333 void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) { | 1373 void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, |
| 1374 int eob) { |
1334 if (eob > 1) | 1375 if (eob > 1) |
1335 vp9_iwht4x4_16_add(input, dest, stride); | 1376 vp9_iwht4x4_16_add(input, dest, stride); |
1336 else | 1377 else |
1337 vp9_iwht4x4_1_add(input, dest, stride); | 1378 vp9_iwht4x4_1_add(input, dest, stride); |
1338 } | 1379 } |
1339 | 1380 |
1340 void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) { | 1381 void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, |
| 1382 int eob) { |
1341 // If dc is 1, then input[0] is the reconstructed value, do not need | 1383 // If dc is 1, then input[0] is the reconstructed value, do not need |
1342 // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1. | 1384 // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1. |
1343 | 1385 |
1344 // The calculation can be simplified if there are not many non-zero dct | 1386 // The calculation can be simplified if there are not many non-zero dct |
1345 // coefficients. Use eobs to decide what to do. | 1387 // coefficients. Use eobs to decide what to do. |
1346 // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c. | 1388 // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c. |
1347 // Combine that with code here. | 1389 // Combine that with code here. |
1348 if (eob == 1) | 1390 if (eob == 1) |
1349 // DC only DCT coefficient | 1391 // DC only DCT coefficient |
1350 vp9_idct8x8_1_add(input, dest, stride); | 1392 vp9_idct8x8_1_add(input, dest, stride); |
1351 else if (eob <= 12) | 1393 else if (eob <= 12) |
1352 vp9_idct8x8_12_add(input, dest, stride); | 1394 vp9_idct8x8_12_add(input, dest, stride); |
1353 else | 1395 else |
1354 vp9_idct8x8_64_add(input, dest, stride); | 1396 vp9_idct8x8_64_add(input, dest, stride); |
1355 } | 1397 } |
1356 | 1398 |
1357 void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride, | 1399 void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride, |
1358 int eob) { | 1400 int eob) { |
1359 /* The calculation can be simplified if there are not many non-zero dct | 1401 /* The calculation can be simplified if there are not many non-zero dct |
1360 * coefficients. Use eobs to separate different cases. */ | 1402 * coefficients. Use eobs to separate different cases. */ |
1361 if (eob == 1) | 1403 if (eob == 1) |
1362 /* DC only DCT coefficient. */ | 1404 /* DC only DCT coefficient. */ |
1363 vp9_idct16x16_1_add(input, dest, stride); | 1405 vp9_idct16x16_1_add(input, dest, stride); |
1364 else if (eob <= 10) | 1406 else if (eob <= 10) |
1365 vp9_idct16x16_10_add(input, dest, stride); | 1407 vp9_idct16x16_10_add(input, dest, stride); |
1366 else | 1408 else |
1367 vp9_idct16x16_256_add(input, dest, stride); | 1409 vp9_idct16x16_256_add(input, dest, stride); |
1368 } | 1410 } |
1369 | 1411 |
1370 void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride, | 1412 void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride, |
1371 int eob) { | 1413 int eob) { |
1372 if (eob == 1) | 1414 if (eob == 1) |
1373 vp9_idct32x32_1_add(input, dest, stride); | 1415 vp9_idct32x32_1_add(input, dest, stride); |
1374 else if (eob <= 34) | 1416 else if (eob <= 34) |
1375 // non-zero coeff only in upper-left 8x8 | 1417 // non-zero coeff only in upper-left 8x8 |
1376 vp9_idct32x32_34_add(input, dest, stride); | 1418 vp9_idct32x32_34_add(input, dest, stride); |
1377 else | 1419 else |
1378 vp9_idct32x32_1024_add(input, dest, stride); | 1420 vp9_idct32x32_1024_add(input, dest, stride); |
1379 } | 1421 } |
1380 | 1422 |
1381 // iht | 1423 // iht |
1382 void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, | 1424 void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, |
1383 int stride, int eob) { | 1425 int stride, int eob) { |
1384 if (tx_type == DCT_DCT) | 1426 if (tx_type == DCT_DCT) |
1385 vp9_idct4x4_add(input, dest, stride, eob); | 1427 vp9_idct4x4_add(input, dest, stride, eob); |
1386 else | 1428 else |
1387 vp9_iht4x4_16_add(input, dest, stride, tx_type); | 1429 vp9_iht4x4_16_add(input, dest, stride, tx_type); |
1388 } | 1430 } |
1389 | 1431 |
1390 void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, | 1432 void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, |
1391 int stride, int eob) { | 1433 int stride, int eob) { |
1392 if (tx_type == DCT_DCT) { | 1434 if (tx_type == DCT_DCT) { |
1393 vp9_idct8x8_add(input, dest, stride, eob); | 1435 vp9_idct8x8_add(input, dest, stride, eob); |
1394 } else { | 1436 } else { |
1395 vp9_iht8x8_64_add(input, dest, stride, tx_type); | 1437 vp9_iht8x8_64_add(input, dest, stride, tx_type); |
1396 } | 1438 } |
1397 } | 1439 } |
1398 | 1440 |
1399 void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, | 1441 void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, |
1400 int stride, int eob) { | 1442 int stride, int eob) { |
1401 if (tx_type == DCT_DCT) { | 1443 if (tx_type == DCT_DCT) { |
1402 vp9_idct16x16_add(input, dest, stride, eob); | 1444 vp9_idct16x16_add(input, dest, stride, eob); |
1403 } else { | 1445 } else { |
1404 vp9_iht16x16_256_add(input, dest, stride, tx_type); | 1446 vp9_iht16x16_256_add(input, dest, stride, tx_type); |
1405 } | 1447 } |
1406 } | 1448 } |
| 1449 |
| 1450 #if CONFIG_VP9_HIGHBITDEPTH |
// High bit-depth inverse 4x4 Walsh-Hadamard transform with reconstruction
// add (lossless path). dest8 is a CONVERT_TO_BYTEPTR-wrapped uint16_t
// buffer; bd is the bit depth used for the final pixel clamp.
void vp9_high_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
                               int stride, int bd) {
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
     0.5 shifts per pixel. */
  int i;
  tran_low_t output[16];
  tran_high_t a1, b1, c1, d1, e1;
  const tran_low_t *ip = input;
  tran_low_t *op = output;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Row pass: undo the UNIT_QUANT scaling, then apply the lifting-based
  // inverse WHT butterfly. WRAPLOW keeps intermediates in the hardware
  // bit-width when overflow emulation is enabled.
  for (i = 0; i < 4; i++) {
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
    c1 = ip[1] >> UNIT_QUANT_SHIFT;
    d1 = ip[2] >> UNIT_QUANT_SHIFT;
    b1 = ip[3] >> UNIT_QUANT_SHIFT;
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;  // shared lifting term
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    op[0] = WRAPLOW(a1);
    op[1] = WRAPLOW(b1);
    op[2] = WRAPLOW(c1);
    op[3] = WRAPLOW(d1);
    ip += 4;
    op += 4;
  }

  // Column pass: same butterfly down each column, then add the result to
  // the destination pixels with a bd-aware clamp.
  ip = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[4 * 0];
    c1 = ip[4 * 1];
    d1 = ip[4 * 2];
    b1 = ip[4 * 3];
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    dest[stride * 0] = clip_pixel_bd_high(dest[stride * 0], a1, bd);
    dest[stride * 1] = clip_pixel_bd_high(dest[stride * 1], b1, bd);
    dest[stride * 2] = clip_pixel_bd_high(dest[stride * 2], c1, bd);
    dest[stride * 3] = clip_pixel_bd_high(dest[stride * 3], d1, bd);

    ip++;
    dest++;
  }
}
| 1504 |
// High bit-depth 1-D 4-point inverse DCT butterfly. Reads 4 coefficients,
// writes 4 outputs; bd is unused here because intermediates are bounded
// by WRAPLOW independent of bit depth.
static void high_idct4(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step[4];
  tran_high_t temp1, temp2;
  (void) bd;
  // stage 1: even part (DC/Nyquist) and odd part rotations.
  temp1 = (input[0] + input[2]) * cospi_16_64;
  temp2 = (input[0] - input[2]) * cospi_16_64;
  step[0] = WRAPLOW(dct_const_round_shift(temp1));
  step[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
  step[2] = WRAPLOW(dct_const_round_shift(temp1));
  step[3] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 2: final butterfly combining even and odd halves.
  output[0] = WRAPLOW(step[0] + step[3]);
  output[1] = WRAPLOW(step[1] + step[2]);
  output[2] = WRAPLOW(step[1] - step[2]);
  output[3] = WRAPLOW(step[0] - step[3]);
}
| 1525 |
// High bit-depth DC-only inverse 4x4 Walsh-Hadamard with reconstruction
// add: expands the single coefficient into a 4-entry column seed, then
// applies the 1-D inverse down each output column.
// NOTE(review): bd is cast to void but still used by clip_pixel_bd_high
// below; the (void) is redundant rather than meaningful.
void vp9_high_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
                              int dest_stride, int bd) {
  int i;
  tran_high_t a1, e1;
  tran_low_t tmp[4];
  const tran_low_t *ip = in;
  tran_low_t *op = tmp;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  (void) bd;

  // Row pass collapses to splitting the (unit-quant-scaled) DC value.
  a1 = ip[0] >> UNIT_QUANT_SHIFT;
  e1 = a1 >> 1;
  a1 -= e1;
  op[0] = WRAPLOW(a1);
  op[1] = op[2] = op[3] = WRAPLOW(e1);

  // Column pass: split each seed value and add to the four pixels of the
  // column with a bd-aware clamp.
  ip = tmp;
  for (i = 0; i < 4; i++) {
    e1 = ip[0] >> 1;
    a1 = ip[0] - e1;
    dest[dest_stride * 0] = clip_pixel_bd_high(dest[dest_stride * 0], a1, bd);
    dest[dest_stride * 1] = clip_pixel_bd_high(dest[dest_stride * 1], e1, bd);
    dest[dest_stride * 2] = clip_pixel_bd_high(dest[dest_stride * 2], e1, bd);
    dest[dest_stride * 3] = clip_pixel_bd_high(dest[dest_stride * 3], e1, bd);
    ip++;
    dest++;
  }
}
| 1554 |
| 1555 void vp9_high_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, |
| 1556 int stride, int bd) { |
| 1557 tran_low_t out[4 * 4]; |
| 1558 tran_low_t *outptr = out; |
| 1559 int i, j; |
| 1560 tran_low_t temp_in[4], temp_out[4]; |
| 1561 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
| 1562 |
| 1563 // Rows |
| 1564 for (i = 0; i < 4; ++i) { |
| 1565 high_idct4(input, outptr, bd); |
| 1566 input += 4; |
| 1567 outptr += 4; |
| 1568 } |
| 1569 |
| 1570 // Columns |
| 1571 for (i = 0; i < 4; ++i) { |
| 1572 for (j = 0; j < 4; ++j) |
| 1573 temp_in[j] = out[j * 4 + i]; |
| 1574 high_idct4(temp_in, temp_out, bd); |
| 1575 for (j = 0; j < 4; ++j) |
| 1576 dest[j * stride + i] = clip_pixel_bd_high( |
| 1577 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); |
| 1578 } |
| 1579 } |
| 1580 |
| 1581 void vp9_high_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8, |
| 1582 int dest_stride, int bd) { |
| 1583 int i; |
| 1584 tran_high_t a1; |
| 1585 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); |
| 1586 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
| 1587 |
| 1588 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); |
| 1589 a1 = ROUND_POWER_OF_TWO(out, 4); |
| 1590 |
| 1591 for (i = 0; i < 4; i++) { |
| 1592 dest[0] = clip_pixel_bd_high(dest[0], a1, bd); |
| 1593 dest[1] = clip_pixel_bd_high(dest[1], a1, bd); |
| 1594 dest[2] = clip_pixel_bd_high(dest[2], a1, bd); |
| 1595 dest[3] = clip_pixel_bd_high(dest[3], a1, bd); |
| 1596 dest += dest_stride; |
| 1597 } |
| 1598 } |
| 1599 |
// High bit-depth 1-D 8-point inverse DCT butterfly. The even half reuses
// high_idct4; the odd half is computed with two rotation stages. WRAPLOW
// bounds every intermediate to the emulated hardware width.
static void high_idct8(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;
  // stage 1: reorder even inputs and rotate the odd inputs.
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 2 & stage 3 - even half: delegate to the 4-point transform
  // in place on step1[0..3].
  high_idct4(step1, step1, bd);

  // stage 2 - odd half
  step2[4] = WRAPLOW(step1[4] + step1[5]);
  step2[5] = WRAPLOW(step1[4] - step1[5]);
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
  step2[7] = WRAPLOW(step1[6] + step1[7]);

  // stage 3 - odd half: rotate the middle pair by 45 degrees.
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[7] = step2[7];

  // stage 4: final butterfly combining even and odd halves.
  output[0] = WRAPLOW(step1[0] + step1[7]);
  output[1] = WRAPLOW(step1[1] + step1[6]);
  output[2] = WRAPLOW(step1[2] + step1[5]);
  output[3] = WRAPLOW(step1[3] + step1[4]);
  output[4] = WRAPLOW(step1[3] - step1[4]);
  output[5] = WRAPLOW(step1[2] - step1[5]);
  output[6] = WRAPLOW(step1[1] - step1[6]);
  output[7] = WRAPLOW(step1[0] - step1[7]);
}
| 1644 |
| 1645 void vp9_high_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, |
| 1646 int stride, int bd) { |
| 1647 tran_low_t out[8 * 8]; |
| 1648 tran_low_t *outptr = out; |
| 1649 int i, j; |
| 1650 tran_low_t temp_in[8], temp_out[8]; |
| 1651 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
| 1652 |
| 1653 // First transform rows. |
| 1654 for (i = 0; i < 8; ++i) { |
| 1655 high_idct8(input, outptr, bd); |
| 1656 input += 8; |
| 1657 outptr += 8; |
| 1658 } |
| 1659 |
| 1660 // Then transform columns. |
| 1661 for (i = 0; i < 8; ++i) { |
| 1662 for (j = 0; j < 8; ++j) |
| 1663 temp_in[j] = out[j * 8 + i]; |
| 1664 high_idct8(temp_in, temp_out, bd); |
| 1665 for (j = 0; j < 8; ++j) |
| 1666 dest[j * stride + i] = clip_pixel_bd_high(dest[j * stride + i], |
| 1667 ROUND_POWER_OF_TWO(temp_out[j], 5), |
| 1668 bd); |
| 1669 } |
| 1670 } |
| 1671 |
| 1672 void vp9_high_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8, |
| 1673 int stride, int bd) { |
| 1674 int i, j; |
| 1675 tran_high_t a1; |
| 1676 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); |
| 1677 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
| 1678 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); |
| 1679 a1 = ROUND_POWER_OF_TWO(out, 5); |
| 1680 for (j = 0; j < 8; ++j) { |
| 1681 for (i = 0; i < 8; ++i) |
| 1682 dest[i] = clip_pixel_bd_high(dest[i], a1, bd); |
| 1683 dest += stride; |
| 1684 } |
| 1685 } |
| 1686 |
// High bit-depth 1-D 4-point inverse ADST (asymmetric DST). bd is unused:
// the dynamic-range comment below shows intermediates fit regardless.
static void high_iadst4(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

  tran_high_t x0 = input[0];
  tran_high_t x1 = input[1];
  tran_high_t x2 = input[2];
  tran_high_t x3 = input[3];
  (void) bd;

  // All-zero input short-circuits to an all-zero output.
  if (!(x0 | x1 | x2 | x3)) {
    vpx_memset(output, 0, 4 * sizeof(*output));
    return;
  }

  // Sine-basis products of the 4-point ADST.
  s0 = sinpi_1_9 * x0;
  s1 = sinpi_2_9 * x0;
  s2 = sinpi_3_9 * x1;
  s3 = sinpi_4_9 * x2;
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = x0 - x2 + x3;

  x0 = s0 + s3 + s5;
  x1 = s1 - s4 - s6;
  x2 = sinpi_3_9 * s7;
  x3 = s2;

  s0 = x0 + x3;
  s1 = x1 + x3;
  s2 = x2;
  s3 = x0 + x1 - x3;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = WRAPLOW(dct_const_round_shift(s0));
  output[1] = WRAPLOW(dct_const_round_shift(s1));
  output[2] = WRAPLOW(dct_const_round_shift(s2));
  output[3] = WRAPLOW(dct_const_round_shift(s3));
}
| 1729 |
| 1730 void vp9_high_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, |
| 1731 int stride, int tx_type, int bd) { |
| 1732 const high_transform_2d IHT_4[] = { |
| 1733 { high_idct4, high_idct4 }, // DCT_DCT = 0 |
| 1734 { high_iadst4, high_idct4 }, // ADST_DCT = 1 |
| 1735 { high_idct4, high_iadst4 }, // DCT_ADST = 2 |
| 1736 { high_iadst4, high_iadst4 } // ADST_ADST = 3 |
| 1737 }; |
| 1738 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
| 1739 |
| 1740 int i, j; |
| 1741 tran_low_t out[4 * 4]; |
| 1742 tran_low_t *outptr = out; |
| 1743 tran_low_t temp_in[4], temp_out[4]; |
| 1744 |
| 1745 // Inverse transform row vectors. |
| 1746 for (i = 0; i < 4; ++i) { |
| 1747 IHT_4[tx_type].rows(input, outptr, bd); |
| 1748 input += 4; |
| 1749 outptr += 4; |
| 1750 } |
| 1751 |
| 1752 // Inverse transform column vectors. |
| 1753 for (i = 0; i < 4; ++i) { |
| 1754 for (j = 0; j < 4; ++j) |
| 1755 temp_in[j] = out[j * 4 + i]; |
| 1756 IHT_4[tx_type].cols(temp_in, temp_out, bd); |
| 1757 for (j = 0; j < 4; ++j) |
| 1758 dest[j * stride + i] = clip_pixel_bd_high( |
| 1759 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); |
| 1760 } |
| 1761 } |
| 1762 |
// High bit-depth 1-D 8-point inverse ADST. Inputs are consumed in the
// ADST's permuted order; outputs carry the alternating sign pattern of
// the final stage. bd is unused (WRAPLOW bounds all intermediates).
static void high_iadst8(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

  // Permuted input order required by the ADST factorization.
  tran_high_t x0 = input[7];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[5];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[3];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[1];
  tran_high_t x7 = input[6];
  (void) bd;

  // All-zero input short-circuits to an all-zero output.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    vpx_memset(output, 0, 8 * sizeof(*output));
    return;
  }

  // stage 1: pairwise rotations by odd cosine angles.
  s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
  s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
  s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
  s7 = cospi_6_64 * x6 - cospi_26_64 * x7;

  x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
  x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
  x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
  x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
  x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
  x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
  x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s3 - s7));

  // stage 2: rotate the second half; first half passes through.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
  s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
  s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
  s7 = cospi_8_64 * x6 + cospi_24_64 * x7;

  x0 = s0 + s2;
  x1 = s1 + s3;
  x2 = s0 - s2;
  x3 = s1 - s3;
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));

  // stage 3: 45-degree rotations on the remaining pairs.
  s2 = cospi_16_64 * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (x6 - x7);

  x2 = WRAPLOW(dct_const_round_shift(s2));
  x3 = WRAPLOW(dct_const_round_shift(s3));
  x6 = WRAPLOW(dct_const_round_shift(s6));
  x7 = WRAPLOW(dct_const_round_shift(s7));

  // Final output permutation with alternating negation.
  output[0] = WRAPLOW(x0);
  output[1] = WRAPLOW(-x4);
  output[2] = WRAPLOW(x6);
  output[3] = WRAPLOW(-x2);
  output[4] = WRAPLOW(x3);
  output[5] = WRAPLOW(-x7);
  output[6] = WRAPLOW(x5);
  output[7] = WRAPLOW(-x1);
}
| 1839 |
// Row/column 1-D transform pairs for each 8x8 hybrid transform type,
// indexed by TX_TYPE.
static const high_transform_2d HIGH_IHT_8[] = {
  { high_idct8,  high_idct8  },  // DCT_DCT  = 0
  { high_iadst8, high_idct8  },  // ADST_DCT = 1
  { high_idct8,  high_iadst8 },  // DCT_ADST = 2
  { high_iadst8, high_iadst8 }   // ADST_ADST = 3
};
| 1846 |
| 1847 void vp9_high_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, |
| 1848 int stride, int tx_type, int bd) { |
| 1849 int i, j; |
| 1850 tran_low_t out[8 * 8]; |
| 1851 tran_low_t *outptr = out; |
| 1852 tran_low_t temp_in[8], temp_out[8]; |
| 1853 const high_transform_2d ht = HIGH_IHT_8[tx_type]; |
| 1854 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
| 1855 |
| 1856 // Inverse transform row vectors. |
| 1857 for (i = 0; i < 8; ++i) { |
| 1858 ht.rows(input, outptr, bd); |
| 1859 input += 8; |
| 1860 outptr += 8; |
| 1861 } |
| 1862 |
| 1863 // Inverse transform column vectors. |
| 1864 for (i = 0; i < 8; ++i) { |
| 1865 for (j = 0; j < 8; ++j) |
| 1866 temp_in[j] = out[j * 8 + i]; |
| 1867 ht.cols(temp_in, temp_out, bd); |
| 1868 for (j = 0; j < 8; ++j) |
| 1869 dest[j * stride + i] = clip_pixel_bd_high( |
| 1870 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); |
| 1871 } |
| 1872 } |
| 1873 |
| 1874 void vp9_high_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8, |
| 1875 int stride, int bd) { |
| 1876 tran_low_t out[8 * 8] = { 0 }; |
| 1877 tran_low_t *outptr = out; |
| 1878 int i, j; |
| 1879 tran_low_t temp_in[8], temp_out[8]; |
| 1880 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
| 1881 |
| 1882 // First transform rows. |
| 1883 // Only first 4 row has non-zero coefs. |
| 1884 for (i = 0; i < 4; ++i) { |
| 1885 high_idct8(input, outptr, bd); |
| 1886 input += 8; |
| 1887 outptr += 8; |
| 1888 } |
| 1889 // Then transform columns. |
| 1890 for (i = 0; i < 8; ++i) { |
| 1891 for (j = 0; j < 8; ++j) |
| 1892 temp_in[j] = out[j * 8 + i]; |
| 1893 high_idct8(temp_in, temp_out, bd); |
| 1894 for (j = 0; j < 8; ++j) |
| 1895 dest[j * stride + i] = clip_pixel_bd_high( |
| 1896 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); |
| 1897 } |
| 1898 } |
| 1899 |
| 1900 static void high_idct16(const tran_low_t *input, tran_low_t *output, int bd) { |
| 1901 tran_low_t step1[16], step2[16]; |
| 1902 tran_high_t temp1, temp2; |
| 1903 (void) bd; |
| 1904 |
| 1905 // stage 1 |
| 1906 step1[0] = input[0/2]; |
| 1907 step1[1] = input[16/2]; |
| 1908 step1[2] = input[8/2]; |
| 1909 step1[3] = input[24/2]; |
| 1910 step1[4] = input[4/2]; |
| 1911 step1[5] = input[20/2]; |
| 1912 step1[6] = input[12/2]; |
| 1913 step1[7] = input[28/2]; |
| 1914 step1[8] = input[2/2]; |
| 1915 step1[9] = input[18/2]; |
| 1916 step1[10] = input[10/2]; |
| 1917 step1[11] = input[26/2]; |
| 1918 step1[12] = input[6/2]; |
| 1919 step1[13] = input[22/2]; |
| 1920 step1[14] = input[14/2]; |
| 1921 step1[15] = input[30/2]; |
| 1922 |
| 1923 // stage 2 |
| 1924 step2[0] = step1[0]; |
| 1925 step2[1] = step1[1]; |
| 1926 step2[2] = step1[2]; |
| 1927 step2[3] = step1[3]; |
| 1928 step2[4] = step1[4]; |
| 1929 step2[5] = step1[5]; |
| 1930 step2[6] = step1[6]; |
| 1931 step2[7] = step1[7]; |
| 1932 |
| 1933 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; |
| 1934 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; |
| 1935 step2[8] = WRAPLOW(dct_const_round_shift(temp1)); |
| 1936 step2[15] = WRAPLOW(dct_const_round_shift(temp2)); |
| 1937 |
| 1938 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; |
| 1939 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; |
| 1940 step2[9] = WRAPLOW(dct_const_round_shift(temp1)); |
| 1941 step2[14] = WRAPLOW(dct_const_round_shift(temp2)); |
| 1942 |
| 1943 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; |
| 1944 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; |
| 1945 step2[10] = WRAPLOW(dct_const_round_shift(temp1)); |
| 1946 step2[13] = WRAPLOW(dct_const_round_shift(temp2)); |
| 1947 |
| 1948 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; |
| 1949 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; |
| 1950 step2[11] = WRAPLOW(dct_const_round_shift(temp1)); |
| 1951 step2[12] = WRAPLOW(dct_const_round_shift(temp2)); |
| 1952 |
| 1953 // stage 3 |
| 1954 step1[0] = step2[0]; |
| 1955 step1[1] = step2[1]; |
| 1956 step1[2] = step2[2]; |
| 1957 step1[3] = step2[3]; |
| 1958 |
| 1959 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; |
| 1960 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; |
| 1961 step1[4] = WRAPLOW(dct_const_round_shift(temp1)); |
| 1962 step1[7] = WRAPLOW(dct_const_round_shift(temp2)); |
| 1963 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; |
| 1964 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; |
| 1965 step1[5] = WRAPLOW(dct_const_round_shift(temp1)); |
| 1966 step1[6] = WRAPLOW(dct_const_round_shift(temp2)); |
| 1967 |
| 1968 step1[8] = WRAPLOW(step2[8] + step2[9]); |
| 1969 step1[9] = WRAPLOW(step2[8] - step2[9]); |
| 1970 step1[10] = WRAPLOW(-step2[10] + step2[11]); |
| 1971 step1[11] = WRAPLOW(step2[10] + step2[11]); |
| 1972 step1[12] = WRAPLOW(step2[12] + step2[13]); |
| 1973 step1[13] = WRAPLOW(step2[12] - step2[13]); |
| 1974 step1[14] = WRAPLOW(-step2[14] + step2[15]); |
| 1975 step1[15] = WRAPLOW(step2[14] + step2[15]); |
| 1976 |
| 1977 // stage 4 |
| 1978 temp1 = (step1[0] + step1[1]) * cospi_16_64; |
| 1979 temp2 = (step1[0] - step1[1]) * cospi_16_64; |
| 1980 step2[0] = WRAPLOW(dct_const_round_shift(temp1)); |
| 1981 step2[1] = WRAPLOW(dct_const_round_shift(temp2)); |
| 1982 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; |
| 1983 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; |
| 1984 step2[2] = WRAPLOW(dct_const_round_shift(temp1)); |
| 1985 step2[3] = WRAPLOW(dct_const_round_shift(temp2)); |
| 1986 step2[4] = WRAPLOW(step1[4] + step1[5]); |
| 1987 step2[5] = WRAPLOW(step1[4] - step1[5]); |
| 1988 step2[6] = WRAPLOW(-step1[6] + step1[7]); |
| 1989 step2[7] = WRAPLOW(step1[6] + step1[7]); |
| 1990 |
| 1991 step2[8] = step1[8]; |
| 1992 step2[15] = step1[15]; |
| 1993 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; |
| 1994 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; |
| 1995 step2[9] = WRAPLOW(dct_const_round_shift(temp1)); |
| 1996 step2[14] = WRAPLOW(dct_const_round_shift(temp2)); |
| 1997 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; |
| 1998 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; |
| 1999 step2[10] = WRAPLOW(dct_const_round_shift(temp1)); |
| 2000 step2[13] = WRAPLOW(dct_const_round_shift(temp2)); |
| 2001 step2[11] = step1[11]; |
| 2002 step2[12] = step1[12]; |
| 2003 |
| 2004 // stage 5 |
| 2005 step1[0] = WRAPLOW(step2[0] + step2[3]); |
| 2006 step1[1] = WRAPLOW(step2[1] + step2[2]); |
| 2007 step1[2] = WRAPLOW(step2[1] - step2[2]); |
| 2008 step1[3] = WRAPLOW(step2[0] - step2[3]); |
| 2009 step1[4] = step2[4]; |
| 2010 temp1 = (step2[6] - step2[5]) * cospi_16_64; |
| 2011 temp2 = (step2[5] + step2[6]) * cospi_16_64; |
| 2012 step1[5] = WRAPLOW(dct_const_round_shift(temp1)); |
| 2013 step1[6] = WRAPLOW(dct_const_round_shift(temp2)); |
| 2014 step1[7] = step2[7]; |
| 2015 |
| 2016 step1[8] = WRAPLOW(step2[8] + step2[11]); |
| 2017 step1[9] = WRAPLOW(step2[9] + step2[10]); |
| 2018 step1[10] = WRAPLOW(step2[9] - step2[10]); |
| 2019 step1[11] = WRAPLOW(step2[8] - step2[11]); |
| 2020 step1[12] = WRAPLOW(-step2[12] + step2[15]); |
| 2021 step1[13] = WRAPLOW(-step2[13] + step2[14]); |
| 2022 step1[14] = WRAPLOW(step2[13] + step2[14]); |
| 2023 step1[15] = WRAPLOW(step2[12] + step2[15]); |
| 2024 |
| 2025 // stage 6 |
| 2026 step2[0] = WRAPLOW(step1[0] + step1[7]); |
| 2027 step2[1] = WRAPLOW(step1[1] + step1[6]); |
| 2028 step2[2] = WRAPLOW(step1[2] + step1[5]); |
| 2029 step2[3] = WRAPLOW(step1[3] + step1[4]); |
| 2030 step2[4] = WRAPLOW(step1[3] - step1[4]); |
| 2031 step2[5] = WRAPLOW(step1[2] - step1[5]); |
| 2032 step2[6] = WRAPLOW(step1[1] - step1[6]); |
| 2033 step2[7] = WRAPLOW(step1[0] - step1[7]); |
| 2034 step2[8] = step1[8]; |
| 2035 step2[9] = step1[9]; |
| 2036 temp1 = (-step1[10] + step1[13]) * cospi_16_64; |
| 2037 temp2 = (step1[10] + step1[13]) * cospi_16_64; |
| 2038 step2[10] = WRAPLOW(dct_const_round_shift(temp1)); |
| 2039 step2[13] = WRAPLOW(dct_const_round_shift(temp2)); |
| 2040 temp1 = (-step1[11] + step1[12]) * cospi_16_64; |
| 2041 temp2 = (step1[11] + step1[12]) * cospi_16_64; |
| 2042 step2[11] = WRAPLOW(dct_const_round_shift(temp1)); |
| 2043 step2[12] = WRAPLOW(dct_const_round_shift(temp2)); |
| 2044 step2[14] = step1[14]; |
| 2045 step2[15] = step1[15]; |
| 2046 |
| 2047 // stage 7 |
| 2048 output[0] = WRAPLOW(step2[0] + step2[15]); |
| 2049 output[1] = WRAPLOW(step2[1] + step2[14]); |
| 2050 output[2] = WRAPLOW(step2[2] + step2[13]); |
| 2051 output[3] = WRAPLOW(step2[3] + step2[12]); |
| 2052 output[4] = WRAPLOW(step2[4] + step2[11]); |
| 2053 output[5] = WRAPLOW(step2[5] + step2[10]); |
| 2054 output[6] = WRAPLOW(step2[6] + step2[9]); |
| 2055 output[7] = WRAPLOW(step2[7] + step2[8]); |
| 2056 output[8] = WRAPLOW(step2[7] - step2[8]); |
| 2057 output[9] = WRAPLOW(step2[6] - step2[9]); |
| 2058 output[10] = WRAPLOW(step2[5] - step2[10]); |
| 2059 output[11] = WRAPLOW(step2[4] - step2[11]); |
| 2060 output[12] = WRAPLOW(step2[3] - step2[12]); |
| 2061 output[13] = WRAPLOW(step2[2] - step2[13]); |
| 2062 output[14] = WRAPLOW(step2[1] - step2[14]); |
| 2063 output[15] = WRAPLOW(step2[0] - step2[15]); |
| 2064 } |
| 2065 |
| 2066 void vp9_high_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, |
| 2067 int stride, int bd) { |
| 2068 tran_low_t out[16 * 16]; |
| 2069 tran_low_t *outptr = out; |
| 2070 int i, j; |
| 2071 tran_low_t temp_in[16], temp_out[16]; |
| 2072 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
| 2073 |
| 2074 // First transform rows. |
| 2075 for (i = 0; i < 16; ++i) { |
| 2076 high_idct16(input, outptr, bd); |
| 2077 input += 16; |
| 2078 outptr += 16; |
| 2079 } |
| 2080 |
| 2081 // Then transform columns. |
| 2082 for (i = 0; i < 16; ++i) { |
| 2083 for (j = 0; j < 16; ++j) |
| 2084 temp_in[j] = out[j * 16 + i]; |
| 2085 high_idct16(temp_in, temp_out, bd); |
| 2086 for (j = 0; j < 16; ++j) |
| 2087 dest[j * stride + i] = clip_pixel_bd_high( |
| 2088 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); |
| 2089 } |
| 2090 } |
| 2091 |
// 16-point inverse ADST, high-bitdepth path. Input is read in the permuted
// order used by the VP9 hybrid transform; output is written with the sign
// flips and reordering applied in the final assignment block. The bd
// argument is unused here; overflow handling is done by WRAPLOW() (which
// emulates hardware wrapping when CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH
// is enabled — see the comment at the top of this file).
static void high_iadst16(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;

  // Load inputs in the ADST's butterfly ordering.
  tran_high_t x0 = input[15];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[13];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[11];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[9];
  tran_high_t x7 = input[6];
  tran_high_t x8 = input[7];
  tran_high_t x9 = input[8];
  tran_high_t x10 = input[5];
  tran_high_t x11 = input[10];
  tran_high_t x12 = input[3];
  tran_high_t x13 = input[12];
  tran_high_t x14 = input[1];
  tran_high_t x15 = input[14];
  (void) bd;

  // Early out: an all-zero input produces an all-zero output.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
    vpx_memset(output, 0, 16 * sizeof(*output));
    return;
  }

  // stage 1
  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;

  x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
  x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
  x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
  x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
  x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
  x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
  x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
  x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
  x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
  x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
  x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
  x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
  x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
  x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s7 - s15));

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;

  x0 = WRAPLOW(s0 + s4);
  x1 = WRAPLOW(s1 + s5);
  x2 = WRAPLOW(s2 + s6);
  x3 = WRAPLOW(s3 + s7);
  x4 = WRAPLOW(s0 - s4);
  x5 = WRAPLOW(s1 - s5);
  x6 = WRAPLOW(s2 - s6);
  x7 = WRAPLOW(s3 - s7);
  x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
  x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
  x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
  x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
  x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
  x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
  x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s11 - s15));

  // stage 3
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;

  x0 = WRAPLOW(s0 + s2);
  x1 = WRAPLOW(s1 + s3);
  x2 = WRAPLOW(s0 - s2);
  x3 = WRAPLOW(s1 - s3);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
  x8 = WRAPLOW(s8 + s10);
  x9 = WRAPLOW(s9 + s11);
  x10 = WRAPLOW(s8 - s10);
  x11 = WRAPLOW(s9 - s11);
  x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
  x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
  x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s13 - s15));

  // stage 4
  s2 = (- cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (-x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (-x10 + x11);
  s14 = (- cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = WRAPLOW(dct_const_round_shift(s2));
  x3 = WRAPLOW(dct_const_round_shift(s3));
  x6 = WRAPLOW(dct_const_round_shift(s6));
  x7 = WRAPLOW(dct_const_round_shift(s7));
  x10 = WRAPLOW(dct_const_round_shift(s10));
  x11 = WRAPLOW(dct_const_round_shift(s11));
  x14 = WRAPLOW(dct_const_round_shift(s14));
  x15 = WRAPLOW(dct_const_round_shift(s15));

  // Final output permutation and sign flips of the ADST.
  output[0] = WRAPLOW(x0);
  output[1] = WRAPLOW(-x8);
  output[2] = WRAPLOW(x12);
  output[3] = WRAPLOW(-x4);
  output[4] = WRAPLOW(x6);
  output[5] = WRAPLOW(x14);
  output[6] = WRAPLOW(x10);
  output[7] = WRAPLOW(x2);
  output[8] = WRAPLOW(x3);
  output[9] = WRAPLOW(x11);
  output[10] = WRAPLOW(x15);
  output[11] = WRAPLOW(x7);
  output[12] = WRAPLOW(x5);
  output[13] = WRAPLOW(-x13);
  output[14] = WRAPLOW(x9);
  output[15] = WRAPLOW(-x1);
}
| 2261 |
// Row/column 1-D transform pairs for the 16x16 hybrid inverse transform,
// indexed by tx_type. The .rows transform is applied first, then .cols
// (see vp9_high_iht16x16_256_add_c below).
static const high_transform_2d HIGH_IHT_16[] = {
  { high_idct16, high_idct16  },  // DCT_DCT  = 0
  { high_iadst16, high_idct16 },  // ADST_DCT = 1
  { high_idct16, high_iadst16 },  // DCT_ADST = 2
  { high_iadst16, high_iadst16 }  // ADST_ADST = 3
};
| 2268 |
| 2269 void vp9_high_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, |
| 2270 int stride, int tx_type, int bd) { |
| 2271 int i, j; |
| 2272 tran_low_t out[16 * 16]; |
| 2273 tran_low_t *outptr = out; |
| 2274 tran_low_t temp_in[16], temp_out[16]; |
| 2275 const high_transform_2d ht = HIGH_IHT_16[tx_type]; |
| 2276 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
| 2277 |
| 2278 // Rows |
| 2279 for (i = 0; i < 16; ++i) { |
| 2280 ht.rows(input, outptr, bd); |
| 2281 input += 16; |
| 2282 outptr += 16; |
| 2283 } |
| 2284 |
| 2285 // Columns |
| 2286 for (i = 0; i < 16; ++i) { |
| 2287 for (j = 0; j < 16; ++j) |
| 2288 temp_in[j] = out[j * 16 + i]; |
| 2289 ht.cols(temp_in, temp_out, bd); |
| 2290 for (j = 0; j < 16; ++j) |
| 2291 dest[j * stride + i] = clip_pixel_bd_high( |
| 2292 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); |
| 2293 } |
| 2294 } |
| 2295 |
| 2296 void vp9_high_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8, |
| 2297 int stride, int bd) { |
| 2298 tran_low_t out[16 * 16] = { 0 }; |
| 2299 tran_low_t *outptr = out; |
| 2300 int i, j; |
| 2301 tran_low_t temp_in[16], temp_out[16]; |
| 2302 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
| 2303 |
| 2304 // First transform rows. Since all non-zero dct coefficients are in |
| 2305 // upper-left 4x4 area, we only need to calculate first 4 rows here. |
| 2306 for (i = 0; i < 4; ++i) { |
| 2307 high_idct16(input, outptr, bd); |
| 2308 input += 16; |
| 2309 outptr += 16; |
| 2310 } |
| 2311 |
| 2312 // Then transform columns. |
| 2313 for (i = 0; i < 16; ++i) { |
| 2314 for (j = 0; j < 16; ++j) |
| 2315 temp_in[j] = out[j*16 + i]; |
| 2316 high_idct16(temp_in, temp_out, bd); |
| 2317 for (j = 0; j < 16; ++j) |
| 2318 dest[j * stride + i] = clip_pixel_bd_high( |
| 2319 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); |
| 2320 } |
| 2321 } |
| 2322 |
| 2323 void vp9_high_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8, |
| 2324 int stride, int bd) { |
| 2325 int i, j; |
| 2326 tran_high_t a1; |
| 2327 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); |
| 2328 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
| 2329 |
| 2330 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); |
| 2331 a1 = ROUND_POWER_OF_TWO(out, 6); |
| 2332 for (j = 0; j < 16; ++j) { |
| 2333 for (i = 0; i < 16; ++i) |
| 2334 dest[i] = clip_pixel_bd_high(dest[i], a1, bd); |
| 2335 dest += stride; |
| 2336 } |
| 2337 } |
| 2338 |
// 32-point 1-D inverse DCT, high-bitdepth path. Implemented as the usual
// 8-stage butterfly network over two ping-pong buffers (step1/step2); each
// multiply-accumulate pair is rounded with dct_const_round_shift() and
// wrapped with WRAPLOW() (hardware-style overflow emulation when
// CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH is set — see top of file). The bd
// argument is unused in this C reference implementation.
static void high_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step1[32], step2[32];
  tran_high_t temp1, temp2;
  (void) bd;

  // stage 1 — load inputs in bit-reversed/interleaved butterfly order.
  step1[0] = input[0];
  step1[1] = input[16];
  step1[2] = input[8];
  step1[3] = input[24];
  step1[4] = input[4];
  step1[5] = input[20];
  step1[6] = input[12];
  step1[7] = input[28];
  step1[8] = input[2];
  step1[9] = input[18];
  step1[10] = input[10];
  step1[11] = input[26];
  step1[12] = input[6];
  step1[13] = input[22];
  step1[14] = input[14];
  step1[15] = input[30];

  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
  step1[16] = WRAPLOW(dct_const_round_shift(temp1));
  step1[31] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
  step1[30] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
  step1[29] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
  step1[28] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
  step1[27] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
  step1[25] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
  step1[24] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 2
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
  step2[15] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));

  step2[16] = WRAPLOW(step1[16] + step1[17]);
  step2[17] = WRAPLOW(step1[16] - step1[17]);
  step2[18] = WRAPLOW(-step1[18] + step1[19]);
  step2[19] = WRAPLOW(step1[18] + step1[19]);
  step2[20] = WRAPLOW(step1[20] + step1[21]);
  step2[21] = WRAPLOW(step1[20] - step1[21]);
  step2[22] = WRAPLOW(-step1[22] + step1[23]);
  step2[23] = WRAPLOW(step1[22] + step1[23]);
  step2[24] = WRAPLOW(step1[24] + step1[25]);
  step2[25] = WRAPLOW(step1[24] - step1[25]);
  step2[26] = WRAPLOW(-step1[26] + step1[27]);
  step2[27] = WRAPLOW(step1[26] + step1[27]);
  step2[28] = WRAPLOW(step1[28] + step1[29]);
  step2[29] = WRAPLOW(step1[28] - step1[29]);
  step2[30] = WRAPLOW(-step1[30] + step1[31]);
  step2[31] = WRAPLOW(step1[30] + step1[31]);

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

  step1[8] = WRAPLOW(step2[8] + step2[9]);
  step1[9] = WRAPLOW(step2[8] - step2[9]);
  step1[10] = WRAPLOW(-step2[10] + step2[11]);
  step1[11] = WRAPLOW(step2[10] + step2[11]);
  step1[12] = WRAPLOW(step2[12] + step2[13]);
  step1[13] = WRAPLOW(step2[12] - step2[13]);
  step1[14] = WRAPLOW(-step2[14] + step2[15]);
  step1[15] = WRAPLOW(step2[14] + step2[15]);

  step1[16] = step2[16];
  step1[31] = step2[31];
  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
  step1[30] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
  step1[19] = step2[19];
  step1[20] = step2[20];
  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[27] = step2[27];
  step1[28] = step2[28];

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
  step2[4] = WRAPLOW(step1[4] + step1[5]);
  step2[5] = WRAPLOW(step1[4] - step1[5]);
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
  step2[7] = WRAPLOW(step1[6] + step1[7]);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  step2[11] = step1[11];
  step2[12] = step1[12];

  step2[16] = WRAPLOW(step1[16] + step1[19]);
  step2[17] = WRAPLOW(step1[17] + step1[18]);
  step2[18] = WRAPLOW(step1[17] - step1[18]);
  step2[19] = WRAPLOW(step1[16] - step1[19]);
  step2[20] = WRAPLOW(-step1[20] + step1[23]);
  step2[21] = WRAPLOW(-step1[21] + step1[22]);
  step2[22] = WRAPLOW(step1[21] + step1[22]);
  step2[23] = WRAPLOW(step1[20] + step1[23]);

  step2[24] = WRAPLOW(step1[24] + step1[27]);
  step2[25] = WRAPLOW(step1[25] + step1[26]);
  step2[26] = WRAPLOW(step1[25] - step1[26]);
  step2[27] = WRAPLOW(step1[24] - step1[27]);
  step2[28] = WRAPLOW(-step1[28] + step1[31]);
  step2[29] = WRAPLOW(-step1[29] + step1[30]);
  step2[30] = WRAPLOW(step1[29] + step1[30]);
  step2[31] = WRAPLOW(step1[28] + step1[31]);

  // stage 5
  step1[0] = WRAPLOW(step2[0] + step2[3]);
  step1[1] = WRAPLOW(step2[1] + step2[2]);
  step1[2] = WRAPLOW(step2[1] - step2[2]);
  step1[3] = WRAPLOW(step2[0] - step2[3]);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11]);
  step1[9] = WRAPLOW(step2[9] + step2[10]);
  step1[10] = WRAPLOW(step2[9] - step2[10]);
  step1[11] = WRAPLOW(step2[8] - step2[11]);
  step1[12] = WRAPLOW(-step2[12] + step2[15]);
  step1[13] = WRAPLOW(-step2[13] + step2[14]);
  step1[14] = WRAPLOW(step2[13] + step2[14]);
  step1[15] = WRAPLOW(step2[12] + step2[15]);

  step1[16] = step2[16];
  step1[17] = step2[17];
  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
  step1[28] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
  step1[22] = step2[22];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[25] = step2[25];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // stage 6
  step2[0] = WRAPLOW(step1[0] + step1[7]);
  step2[1] = WRAPLOW(step1[1] + step1[6]);
  step2[2] = WRAPLOW(step1[2] + step1[5]);
  step2[3] = WRAPLOW(step1[3] + step1[4]);
  step2[4] = WRAPLOW(step1[3] - step1[4]);
  step2[5] = WRAPLOW(step1[2] - step1[5]);
  step2[6] = WRAPLOW(step1[1] - step1[6]);
  step2[7] = WRAPLOW(step1[0] - step1[7]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
  // WRAPLOW here is an identity on already-wrapped values; kept for
  // uniformity with the surrounding assignments.
  step2[14] = WRAPLOW(step1[14]);
  step2[15] = WRAPLOW(step1[15]);

  step2[16] = WRAPLOW(step1[16] + step1[23]);
  step2[17] = WRAPLOW(step1[17] + step1[22]);
  step2[18] = WRAPLOW(step1[18] + step1[21]);
  step2[19] = WRAPLOW(step1[19] + step1[20]);
  step2[20] = WRAPLOW(step1[19] - step1[20]);
  step2[21] = WRAPLOW(step1[18] - step1[21]);
  step2[22] = WRAPLOW(step1[17] - step1[22]);
  step2[23] = WRAPLOW(step1[16] - step1[23]);

  step2[24] = WRAPLOW(-step1[24] + step1[31]);
  step2[25] = WRAPLOW(-step1[25] + step1[30]);
  step2[26] = WRAPLOW(-step1[26] + step1[29]);
  step2[27] = WRAPLOW(-step1[27] + step1[28]);
  step2[28] = WRAPLOW(step1[27] + step1[28]);
  step2[29] = WRAPLOW(step1[26] + step1[29]);
  step2[30] = WRAPLOW(step1[25] + step1[30]);
  step2[31] = WRAPLOW(step1[24] + step1[31]);

  // stage 7
  step1[0] = WRAPLOW(step2[0] + step2[15]);
  step1[1] = WRAPLOW(step2[1] + step2[14]);
  step1[2] = WRAPLOW(step2[2] + step2[13]);
  step1[3] = WRAPLOW(step2[3] + step2[12]);
  step1[4] = WRAPLOW(step2[4] + step2[11]);
  step1[5] = WRAPLOW(step2[5] + step2[10]);
  step1[6] = WRAPLOW(step2[6] + step2[9]);
  step1[7] = WRAPLOW(step2[7] + step2[8]);
  step1[8] = WRAPLOW(step2[7] - step2[8]);
  step1[9] = WRAPLOW(step2[6] - step2[9]);
  step1[10] = WRAPLOW(step2[5] - step2[10]);
  step1[11] = WRAPLOW(step2[4] - step2[11]);
  step1[12] = WRAPLOW(step2[3] - step2[12]);
  step1[13] = WRAPLOW(step2[2] - step2[13]);
  step1[14] = WRAPLOW(step2[1] - step2[14]);
  step1[15] = WRAPLOW(step2[0] - step2[15]);

  step1[16] = step2[16];
  step1[17] = step2[17];
  step1[18] = step2[18];
  step1[19] = step2[19];
  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
  temp2 = (step2[20] + step2[27]) * cospi_16_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
  temp2 = (step2[21] + step2[26]) * cospi_16_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
  temp2 = (step2[22] + step2[25]) * cospi_16_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
  temp2 = (step2[23] + step2[24]) * cospi_16_64;
  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
  step1[24] = WRAPLOW(dct_const_round_shift(temp2));
  step1[28] = step2[28];
  step1[29] = step2[29];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // final stage — butterfly of the even half against the reversed odd half.
  output[0] = WRAPLOW(step1[0] + step1[31]);
  output[1] = WRAPLOW(step1[1] + step1[30]);
  output[2] = WRAPLOW(step1[2] + step1[29]);
  output[3] = WRAPLOW(step1[3] + step1[28]);
  output[4] = WRAPLOW(step1[4] + step1[27]);
  output[5] = WRAPLOW(step1[5] + step1[26]);
  output[6] = WRAPLOW(step1[6] + step1[25]);
  output[7] = WRAPLOW(step1[7] + step1[24]);
  output[8] = WRAPLOW(step1[8] + step1[23]);
  output[9] = WRAPLOW(step1[9] + step1[22]);
  output[10] = WRAPLOW(step1[10] + step1[21]);
  output[11] = WRAPLOW(step1[11] + step1[20]);
  output[12] = WRAPLOW(step1[12] + step1[19]);
  output[13] = WRAPLOW(step1[13] + step1[18]);
  output[14] = WRAPLOW(step1[14] + step1[17]);
  output[15] = WRAPLOW(step1[15] + step1[16]);
  output[16] = WRAPLOW(step1[15] - step1[16]);
  output[17] = WRAPLOW(step1[14] - step1[17]);
  output[18] = WRAPLOW(step1[13] - step1[18]);
  output[19] = WRAPLOW(step1[12] - step1[19]);
  output[20] = WRAPLOW(step1[11] - step1[20]);
  output[21] = WRAPLOW(step1[10] - step1[21]);
  output[22] = WRAPLOW(step1[9] - step1[22]);
  output[23] = WRAPLOW(step1[8] - step1[23]);
  output[24] = WRAPLOW(step1[7] - step1[24]);
  output[25] = WRAPLOW(step1[6] - step1[25]);
  output[26] = WRAPLOW(step1[5] - step1[26]);
  output[27] = WRAPLOW(step1[4] - step1[27]);
  output[28] = WRAPLOW(step1[3] - step1[28]);
  output[29] = WRAPLOW(step1[2] - step1[29]);
  output[30] = WRAPLOW(step1[1] - step1[30]);
  output[31] = WRAPLOW(step1[0] - step1[31]);
}
| 2706 |
| 2707 void vp9_high_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, |
| 2708 int stride, int bd) { |
| 2709 tran_low_t out[32 * 32]; |
| 2710 tran_low_t *outptr = out; |
| 2711 int i, j; |
| 2712 tran_low_t temp_in[32], temp_out[32]; |
| 2713 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
| 2714 |
| 2715 // Rows |
| 2716 for (i = 0; i < 32; ++i) { |
| 2717 tran_low_t zero_coeff[16]; |
| 2718 for (j = 0; j < 16; ++j) |
| 2719 zero_coeff[j] = input[2 * j] | input[2 * j + 1]; |
| 2720 for (j = 0; j < 8; ++j) |
| 2721 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; |
| 2722 for (j = 0; j < 4; ++j) |
| 2723 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; |
| 2724 for (j = 0; j < 2; ++j) |
| 2725 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; |
| 2726 |
| 2727 if (zero_coeff[0] | zero_coeff[1]) |
| 2728 high_idct32(input, outptr, bd); |
| 2729 else |
| 2730 vpx_memset(outptr, 0, sizeof(tran_low_t) * 32); |
| 2731 input += 32; |
| 2732 outptr += 32; |
| 2733 } |
| 2734 |
| 2735 // Columns |
| 2736 for (i = 0; i < 32; ++i) { |
| 2737 for (j = 0; j < 32; ++j) |
| 2738 temp_in[j] = out[j * 32 + i]; |
| 2739 high_idct32(temp_in, temp_out, bd); |
| 2740 for (j = 0; j < 32; ++j) |
| 2741 dest[j * stride + i] = clip_pixel_bd_high( |
| 2742 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); |
| 2743 } |
| 2744 } |
| 2745 |
| 2746 void vp9_high_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8, |
| 2747 int stride, int bd) { |
| 2748 tran_low_t out[32 * 32] = {0}; |
| 2749 tran_low_t *outptr = out; |
| 2750 int i, j; |
| 2751 tran_low_t temp_in[32], temp_out[32]; |
| 2752 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
| 2753 |
| 2754 // Rows |
| 2755 // Only upper-left 8x8 has non-zero coeff. |
| 2756 for (i = 0; i < 8; ++i) { |
| 2757 high_idct32(input, outptr, bd); |
| 2758 input += 32; |
| 2759 outptr += 32; |
| 2760 } |
| 2761 // Columns |
| 2762 for (i = 0; i < 32; ++i) { |
| 2763 for (j = 0; j < 32; ++j) |
| 2764 temp_in[j] = out[j * 32 + i]; |
| 2765 high_idct32(temp_in, temp_out, bd); |
| 2766 for (j = 0; j < 32; ++j) |
| 2767 dest[j * stride + i] = clip_pixel_bd_high( |
| 2768 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); |
| 2769 } |
| 2770 } |
| 2771 |
| 2772 void vp9_high_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8, |
| 2773 int stride, int bd) { |
| 2774 int i, j; |
| 2775 int a1; |
| 2776 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
| 2777 |
| 2778 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); |
| 2779 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); |
| 2780 a1 = ROUND_POWER_OF_TWO(out, 6); |
| 2781 |
| 2782 for (j = 0; j < 32; ++j) { |
| 2783 for (i = 0; i < 32; ++i) |
| 2784 dest[i] = clip_pixel_bd_high(dest[i], a1, bd); |
| 2785 dest += stride; |
| 2786 } |
| 2787 } |
| 2788 |
| 2789 // idct |
| 2790 void vp9_high_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, |
| 2791 int eob, int bd) { |
| 2792 if (eob > 1) |
| 2793 vp9_high_idct4x4_16_add(input, dest, stride, bd); |
| 2794 else |
| 2795 vp9_high_idct4x4_1_add(input, dest, stride, bd); |
| 2796 } |
| 2797 |
| 2798 |
| 2799 void vp9_high_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, |
| 2800 int eob, int bd) { |
| 2801 if (eob > 1) |
| 2802 vp9_high_iwht4x4_16_add(input, dest, stride, bd); |
| 2803 else |
| 2804 vp9_high_iwht4x4_1_add(input, dest, stride, bd); |
| 2805 } |
| 2806 |
| 2807 void vp9_high_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, |
| 2808 int eob, int bd) { |
| 2809 // If dc is 1, then input[0] is the reconstructed value, do not need |
| 2810 // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1. |
| 2811 |
| 2812 // The calculation can be simplified if there are not many non-zero dct |
| 2813 // coefficients. Use eobs to decide what to do. |
| 2814 // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c. |
| 2815 // Combine that with code here. |
| 2816 // DC only DCT coefficient |
| 2817 if (eob == 1) { |
| 2818 vp9_high_idct8x8_1_add(input, dest, stride, bd); |
| 2819 } else if (eob <= 10) { |
| 2820 vp9_high_idct8x8_10_add(input, dest, stride, bd); |
| 2821 } else { |
| 2822 vp9_high_idct8x8_64_add(input, dest, stride, bd); |
| 2823 } |
| 2824 } |
| 2825 |
| 2826 void vp9_high_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride, |
| 2827 int eob, int bd) { |
| 2828 // The calculation can be simplified if there are not many non-zero dct |
| 2829 // coefficients. Use eobs to separate different cases. |
| 2830 // DC only DCT coefficient. |
| 2831 if (eob == 1) { |
| 2832 vp9_high_idct16x16_1_add(input, dest, stride, bd); |
| 2833 } else if (eob <= 10) { |
| 2834 vp9_high_idct16x16_10_add(input, dest, stride, bd); |
| 2835 } else { |
| 2836 vp9_high_idct16x16_256_add(input, dest, stride, bd); |
| 2837 } |
| 2838 } |
| 2839 |
| 2840 void vp9_high_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride, |
| 2841 int eob, int bd) { |
| 2842 // Non-zero coeff only in upper-left 8x8 |
| 2843 if (eob == 1) { |
| 2844 vp9_high_idct32x32_1_add(input, dest, stride, bd); |
| 2845 } else if (eob <= 34) { |
| 2846 vp9_high_idct32x32_34_add(input, dest, stride, bd); |
| 2847 } else { |
| 2848 vp9_high_idct32x32_1024_add(input, dest, stride, bd); |
| 2849 } |
| 2850 } |
| 2851 |
| 2852 // iht |
| 2853 void vp9_high_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, |
| 2854 uint8_t *dest, int stride, int eob, int bd) { |
| 2855 if (tx_type == DCT_DCT) |
| 2856 vp9_high_idct4x4_add(input, dest, stride, eob, bd); |
| 2857 else |
| 2858 vp9_high_iht4x4_16_add(input, dest, stride, tx_type, bd); |
| 2859 } |
| 2860 |
| 2861 void vp9_high_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, |
| 2862 uint8_t *dest, int stride, int eob, int bd) { |
| 2863 if (tx_type == DCT_DCT) { |
| 2864 vp9_high_idct8x8_add(input, dest, stride, eob, bd); |
| 2865 } else { |
| 2866 vp9_high_iht8x8_64_add(input, dest, stride, tx_type, bd); |
| 2867 } |
| 2868 } |
| 2869 |
| 2870 void vp9_high_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, |
| 2871 uint8_t *dest, int stride, int eob, int bd) { |
| 2872 if (tx_type == DCT_DCT) { |
| 2873 vp9_high_idct16x16_add(input, dest, stride, eob, bd); |
| 2874 } else { |
| 2875 vp9_high_iht16x16_256_add(input, dest, stride, tx_type, bd); |
| 2876 } |
| 2877 } |
| 2878 #endif // CONFIG_VP9_HIGHBITDEPTH |
OLD | NEW |