Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(224)

Side by Side Diff: source/libvpx/vp9/common/vp9_idct.c

Issue 668403002: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 6 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « source/libvpx/vp9/common/vp9_idct.h ('k') | source/libvpx/vp9/common/vp9_loopfilter.c » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
11 #include <assert.h>
12 #include <math.h> 11 #include <math.h>
13 12
14 #include "./vpx_config.h"
15 #include "./vp9_rtcd.h" 13 #include "./vp9_rtcd.h"
16 #include "vp9/common/vp9_systemdependent.h" 14 #include "vp9/common/vp9_systemdependent.h"
17 #include "vp9/common/vp9_blockd.h" 15 #include "vp9/common/vp9_blockd.h"
18 #include "vp9/common/vp9_common.h"
19 #include "vp9/common/vp9_idct.h" 16 #include "vp9/common/vp9_idct.h"
20 17
21 #if CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH 18 #if CONFIG_EMULATE_HARDWARE
 22 // When CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH is 1 the transform performs strict 19 // When CONFIG_EMULATE_HARDWARE is 1 the transform performs a
23 // overflow wrapping to match expected hardware implementations. 20 // non-normative method to handle overflows. A stream that causes
21 // overflows in the inverse transform is considered invalid in VP9,
22 // and a hardware implementer is free to choose any reasonable
23 // method to handle overflows. However to aid in hardware
24 // verification they can use a specific implementation of the
25 // WRAPLOW() macro below that is identical to their intended
26 // hardware implementation (and also use configure options to trigger
27 // the C-implementation of the transform).
28 //
29 // The particular WRAPLOW implementation below performs strict
30 // overflow wrapping to match common hardware implementations.
24 // bd of 8 uses trans_low with 16bits, need to remove 16bits 31 // bd of 8 uses trans_low with 16bits, need to remove 16bits
25 // bd of 10 uses trans_low with 18bits, need to remove 14bits 32 // bd of 10 uses trans_low with 18bits, need to remove 14bits
26 // bd of 12 uses trans_low with 20bits, need to remove 12bits 33 // bd of 12 uses trans_low with 20bits, need to remove 12bits
27 // bd of x uses trans_low with 8+x bits, need to remove 24-x bits 34 // bd of x uses trans_low with 8+x bits, need to remove 24-x bits
28 #define WRAPLOW(x) ((((int32_t)x) << (24 - bd)) >> (24 - bd)) 35 #define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))
29 #else 36 #else
30 #define WRAPLOW(x) (x) 37 #define WRAPLOW(x, bd) (x)
31 #endif // CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH 38 #endif // CONFIG_EMULATE_HARDWARE
32 39
33 #if CONFIG_VP9_HIGHBITDEPTH 40 #if CONFIG_VP9_HIGHBITDEPTH
34 static INLINE tran_low_t clamp_high(tran_high_t value, tran_low_t low, 41 static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
35 tran_low_t high) { 42 int bd) {
36 return value < low ? low : (value > high ? high : value); 43 trans = WRAPLOW(trans, bd);
37 } 44 return clip_pixel_highbd(WRAPLOW(dest + trans, bd), bd);
38
39 static INLINE tran_low_t clip_pixel_bd_high(tran_high_t dest,
40 tran_high_t trans, int bd) {
41 trans = WRAPLOW(trans);
42 switch (bd) {
43 case 8:
44 default:
45 return clamp_high(WRAPLOW(dest + trans), 0, 255);
46 case 10:
47 return clamp_high(WRAPLOW(dest + trans), 0, 1023);
48 case 12:
49 return clamp_high(WRAPLOW(dest + trans), 0, 4095);
50 }
51 } 45 }
52 #endif // CONFIG_VP9_HIGHBITDEPTH 46 #endif // CONFIG_VP9_HIGHBITDEPTH
53 47
48 static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
49 trans = WRAPLOW(trans, 8);
50 return clip_pixel(WRAPLOW(dest + trans, 8));
51 }
52
54 void vp9_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { 53 void vp9_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
55 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, 54 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
56 0.5 shifts per pixel. */ 55 0.5 shifts per pixel. */
57 int i; 56 int i;
58 tran_low_t output[16]; 57 tran_low_t output[16];
59 tran_high_t a1, b1, c1, d1, e1; 58 tran_high_t a1, b1, c1, d1, e1;
60 const tran_low_t *ip = input; 59 const tran_low_t *ip = input;
61 tran_low_t *op = output; 60 tran_low_t *op = output;
62 61
63 for (i = 0; i < 4; i++) { 62 for (i = 0; i < 4; i++) {
64 a1 = ip[0] >> UNIT_QUANT_SHIFT; 63 a1 = ip[0] >> UNIT_QUANT_SHIFT;
65 c1 = ip[1] >> UNIT_QUANT_SHIFT; 64 c1 = ip[1] >> UNIT_QUANT_SHIFT;
66 d1 = ip[2] >> UNIT_QUANT_SHIFT; 65 d1 = ip[2] >> UNIT_QUANT_SHIFT;
67 b1 = ip[3] >> UNIT_QUANT_SHIFT; 66 b1 = ip[3] >> UNIT_QUANT_SHIFT;
68 a1 += c1; 67 a1 += c1;
69 d1 -= b1; 68 d1 -= b1;
70 e1 = (a1 - d1) >> 1; 69 e1 = (a1 - d1) >> 1;
71 b1 = e1 - b1; 70 b1 = e1 - b1;
72 c1 = e1 - c1; 71 c1 = e1 - c1;
73 a1 -= b1; 72 a1 -= b1;
74 d1 += c1; 73 d1 += c1;
75 op[0] = a1; 74 op[0] = WRAPLOW(a1, 8);
76 op[1] = b1; 75 op[1] = WRAPLOW(b1, 8);
77 op[2] = c1; 76 op[2] = WRAPLOW(c1, 8);
78 op[3] = d1; 77 op[3] = WRAPLOW(d1, 8);
79 ip += 4; 78 ip += 4;
80 op += 4; 79 op += 4;
81 } 80 }
82 81
83 ip = output; 82 ip = output;
84 for (i = 0; i < 4; i++) { 83 for (i = 0; i < 4; i++) {
85 a1 = ip[4 * 0]; 84 a1 = ip[4 * 0];
86 c1 = ip[4 * 1]; 85 c1 = ip[4 * 1];
87 d1 = ip[4 * 2]; 86 d1 = ip[4 * 2];
88 b1 = ip[4 * 3]; 87 b1 = ip[4 * 3];
89 a1 += c1; 88 a1 += c1;
90 d1 -= b1; 89 d1 -= b1;
91 e1 = (a1 - d1) >> 1; 90 e1 = (a1 - d1) >> 1;
92 b1 = e1 - b1; 91 b1 = e1 - b1;
93 c1 = e1 - c1; 92 c1 = e1 - c1;
94 a1 -= b1; 93 a1 -= b1;
95 d1 += c1; 94 d1 += c1;
96 dest[stride * 0] = clip_pixel(dest[stride * 0] + a1); 95 dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1);
97 dest[stride * 1] = clip_pixel(dest[stride * 1] + b1); 96 dest[stride * 1] = clip_pixel_add(dest[stride * 1], b1);
98 dest[stride * 2] = clip_pixel(dest[stride * 2] + c1); 97 dest[stride * 2] = clip_pixel_add(dest[stride * 2], c1);
99 dest[stride * 3] = clip_pixel(dest[stride * 3] + d1); 98 dest[stride * 3] = clip_pixel_add(dest[stride * 3], d1);
100 99
101 ip++; 100 ip++;
102 dest++; 101 dest++;
103 } 102 }
104 } 103 }
105 104
106 void vp9_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) { 105 void vp9_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
107 int i; 106 int i;
108 tran_high_t a1, e1; 107 tran_high_t a1, e1;
109 tran_low_t tmp[4]; 108 tran_low_t tmp[4];
110 const tran_low_t *ip = in; 109 const tran_low_t *ip = in;
111 tran_low_t *op = tmp; 110 tran_low_t *op = tmp;
112 111
113 a1 = ip[0] >> UNIT_QUANT_SHIFT; 112 a1 = ip[0] >> UNIT_QUANT_SHIFT;
114 e1 = a1 >> 1; 113 e1 = a1 >> 1;
115 a1 -= e1; 114 a1 -= e1;
116 op[0] = a1; 115 op[0] = WRAPLOW(a1, 8);
117 op[1] = op[2] = op[3] = e1; 116 op[1] = op[2] = op[3] = WRAPLOW(e1, 8);
118 117
119 ip = tmp; 118 ip = tmp;
120 for (i = 0; i < 4; i++) { 119 for (i = 0; i < 4; i++) {
121 e1 = ip[0] >> 1; 120 e1 = ip[0] >> 1;
122 a1 = ip[0] - e1; 121 a1 = ip[0] - e1;
123 dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1); 122 dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1);
124 dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + e1); 123 dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1);
125 dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + e1); 124 dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1);
126 dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + e1); 125 dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1);
127 ip++; 126 ip++;
128 dest++; 127 dest++;
129 } 128 }
130 } 129 }
131 130
132 static void idct4(const tran_low_t *input, tran_low_t *output) { 131 static void idct4(const tran_low_t *input, tran_low_t *output) {
133 tran_low_t step[4]; 132 tran_low_t step[4];
134 tran_high_t temp1, temp2; 133 tran_high_t temp1, temp2;
135 // stage 1 134 // stage 1
136 temp1 = (input[0] + input[2]) * cospi_16_64; 135 temp1 = (input[0] + input[2]) * cospi_16_64;
137 temp2 = (input[0] - input[2]) * cospi_16_64; 136 temp2 = (input[0] - input[2]) * cospi_16_64;
138 step[0] = dct_const_round_shift(temp1); 137 step[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
139 step[1] = dct_const_round_shift(temp2); 138 step[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
140 temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; 139 temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
141 temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; 140 temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
142 step[2] = dct_const_round_shift(temp1); 141 step[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
143 step[3] = dct_const_round_shift(temp2); 142 step[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
144 143
145 // stage 2 144 // stage 2
146 output[0] = step[0] + step[3]; 145 output[0] = WRAPLOW(step[0] + step[3], 8);
147 output[1] = step[1] + step[2]; 146 output[1] = WRAPLOW(step[1] + step[2], 8);
148 output[2] = step[1] - step[2]; 147 output[2] = WRAPLOW(step[1] - step[2], 8);
149 output[3] = step[0] - step[3]; 148 output[3] = WRAPLOW(step[0] - step[3], 8);
150 } 149 }
151 150
152 void vp9_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { 151 void vp9_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
153 tran_low_t out[4 * 4]; 152 tran_low_t out[4 * 4];
154 tran_low_t *outptr = out; 153 tran_low_t *outptr = out;
155 int i, j; 154 int i, j;
156 tran_low_t temp_in[4], temp_out[4]; 155 tran_low_t temp_in[4], temp_out[4];
157 156
158 // Rows 157 // Rows
159 for (i = 0; i < 4; ++i) { 158 for (i = 0; i < 4; ++i) {
160 idct4(input, outptr); 159 idct4(input, outptr);
161 input += 4; 160 input += 4;
162 outptr += 4; 161 outptr += 4;
163 } 162 }
164 163
165 // Columns 164 // Columns
166 for (i = 0; i < 4; ++i) { 165 for (i = 0; i < 4; ++i) {
167 for (j = 0; j < 4; ++j) 166 for (j = 0; j < 4; ++j)
168 temp_in[j] = out[j * 4 + i]; 167 temp_in[j] = out[j * 4 + i];
169 idct4(temp_in, temp_out); 168 idct4(temp_in, temp_out);
170 for (j = 0; j < 4; ++j) 169 for (j = 0; j < 4; ++j) {
171 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) 170 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
172 + dest[j * stride + i]); 171 ROUND_POWER_OF_TWO(temp_out[j], 4));
172 }
173 } 173 }
174 } 174 }
175 175
176 void vp9_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, 176 void vp9_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
177 int dest_stride) { 177 int dest_stride) {
178 int i; 178 int i;
179 tran_high_t a1; 179 tran_high_t a1;
180 tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64); 180 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
181 out = dct_const_round_shift(out * cospi_16_64); 181 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
182 a1 = ROUND_POWER_OF_TWO(out, 4); 182 a1 = ROUND_POWER_OF_TWO(out, 4);
183 183
184 for (i = 0; i < 4; i++) { 184 for (i = 0; i < 4; i++) {
185 dest[0] = clip_pixel(dest[0] + a1); 185 dest[0] = clip_pixel_add(dest[0], a1);
186 dest[1] = clip_pixel(dest[1] + a1); 186 dest[1] = clip_pixel_add(dest[1], a1);
187 dest[2] = clip_pixel(dest[2] + a1); 187 dest[2] = clip_pixel_add(dest[2], a1);
188 dest[3] = clip_pixel(dest[3] + a1); 188 dest[3] = clip_pixel_add(dest[3], a1);
189 dest += dest_stride; 189 dest += dest_stride;
190 } 190 }
191 } 191 }
192 192
193 static void idct8(const tran_low_t *input, tran_low_t *output) { 193 static void idct8(const tran_low_t *input, tran_low_t *output) {
194 tran_low_t step1[8], step2[8]; 194 tran_low_t step1[8], step2[8];
195 tran_high_t temp1, temp2; 195 tran_high_t temp1, temp2;
196 // stage 1 196 // stage 1
197 step1[0] = input[0]; 197 step1[0] = input[0];
198 step1[2] = input[4]; 198 step1[2] = input[4];
199 step1[1] = input[2]; 199 step1[1] = input[2];
200 step1[3] = input[6]; 200 step1[3] = input[6];
201 temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; 201 temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
202 temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; 202 temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
203 step1[4] = dct_const_round_shift(temp1); 203 step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
204 step1[7] = dct_const_round_shift(temp2); 204 step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
205 temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; 205 temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
206 temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; 206 temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
207 step1[5] = dct_const_round_shift(temp1); 207 step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
208 step1[6] = dct_const_round_shift(temp2); 208 step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
209 209
210 // stage 2 & stage 3 - even half 210 // stage 2 & stage 3 - even half
211 idct4(step1, step1); 211 idct4(step1, step1);
212 212
213 // stage 2 - odd half 213 // stage 2 - odd half
214 step2[4] = step1[4] + step1[5]; 214 step2[4] = WRAPLOW(step1[4] + step1[5], 8);
215 step2[5] = step1[4] - step1[5]; 215 step2[5] = WRAPLOW(step1[4] - step1[5], 8);
216 step2[6] = -step1[6] + step1[7]; 216 step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
217 step2[7] = step1[6] + step1[7]; 217 step2[7] = WRAPLOW(step1[6] + step1[7], 8);
218 218
 219 // stage 3 - odd half 219 // stage 3 - odd half
220 step1[4] = step2[4]; 220 step1[4] = step2[4];
221 temp1 = (step2[6] - step2[5]) * cospi_16_64; 221 temp1 = (step2[6] - step2[5]) * cospi_16_64;
222 temp2 = (step2[5] + step2[6]) * cospi_16_64; 222 temp2 = (step2[5] + step2[6]) * cospi_16_64;
223 step1[5] = dct_const_round_shift(temp1); 223 step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
224 step1[6] = dct_const_round_shift(temp2); 224 step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
225 step1[7] = step2[7]; 225 step1[7] = step2[7];
226 226
227 // stage 4 227 // stage 4
228 output[0] = step1[0] + step1[7]; 228 output[0] = WRAPLOW(step1[0] + step1[7], 8);
229 output[1] = step1[1] + step1[6]; 229 output[1] = WRAPLOW(step1[1] + step1[6], 8);
230 output[2] = step1[2] + step1[5]; 230 output[2] = WRAPLOW(step1[2] + step1[5], 8);
231 output[3] = step1[3] + step1[4]; 231 output[3] = WRAPLOW(step1[3] + step1[4], 8);
232 output[4] = step1[3] - step1[4]; 232 output[4] = WRAPLOW(step1[3] - step1[4], 8);
233 output[5] = step1[2] - step1[5]; 233 output[5] = WRAPLOW(step1[2] - step1[5], 8);
234 output[6] = step1[1] - step1[6]; 234 output[6] = WRAPLOW(step1[1] - step1[6], 8);
235 output[7] = step1[0] - step1[7]; 235 output[7] = WRAPLOW(step1[0] - step1[7], 8);
236 } 236 }
237 237
238 void vp9_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) { 238 void vp9_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
239 tran_low_t out[8 * 8]; 239 tran_low_t out[8 * 8];
240 tran_low_t *outptr = out; 240 tran_low_t *outptr = out;
241 int i, j; 241 int i, j;
242 tran_low_t temp_in[8], temp_out[8]; 242 tran_low_t temp_in[8], temp_out[8];
243 243
244 // First transform rows 244 // First transform rows
245 for (i = 0; i < 8; ++i) { 245 for (i = 0; i < 8; ++i) {
246 idct8(input, outptr); 246 idct8(input, outptr);
247 input += 8; 247 input += 8;
248 outptr += 8; 248 outptr += 8;
249 } 249 }
250 250
251 // Then transform columns 251 // Then transform columns
252 for (i = 0; i < 8; ++i) { 252 for (i = 0; i < 8; ++i) {
253 for (j = 0; j < 8; ++j) 253 for (j = 0; j < 8; ++j)
254 temp_in[j] = out[j * 8 + i]; 254 temp_in[j] = out[j * 8 + i];
255 idct8(temp_in, temp_out); 255 idct8(temp_in, temp_out);
256 for (j = 0; j < 8; ++j) 256 for (j = 0; j < 8; ++j) {
257 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) 257 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
258 + dest[j * stride + i]); 258 ROUND_POWER_OF_TWO(temp_out[j], 5));
259 }
259 } 260 }
260 } 261 }
261 262
262 void vp9_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { 263 void vp9_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
263 int i, j; 264 int i, j;
264 tran_high_t a1; 265 tran_high_t a1;
265 tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64); 266 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
266 out = dct_const_round_shift(out * cospi_16_64); 267 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
267 a1 = ROUND_POWER_OF_TWO(out, 5); 268 a1 = ROUND_POWER_OF_TWO(out, 5);
268 for (j = 0; j < 8; ++j) { 269 for (j = 0; j < 8; ++j) {
269 for (i = 0; i < 8; ++i) 270 for (i = 0; i < 8; ++i)
270 dest[i] = clip_pixel(dest[i] + a1); 271 dest[i] = clip_pixel_add(dest[i], a1);
271 dest += stride; 272 dest += stride;
272 } 273 }
273 } 274 }
274 275
275 static void iadst4(const tran_low_t *input, tran_low_t *output) { 276 static void iadst4(const tran_low_t *input, tran_low_t *output) {
276 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; 277 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
277 278
278 tran_high_t x0 = input[0]; 279 tran_high_t x0 = input[0];
279 tran_high_t x1 = input[1]; 280 tran_high_t x1 = input[1];
280 tran_high_t x2 = input[2]; 281 tran_high_t x2 = input[2];
(...skipping 20 matching lines...) Expand all
301 302
302 s0 = x0 + x3; 303 s0 = x0 + x3;
303 s1 = x1 + x3; 304 s1 = x1 + x3;
304 s2 = x2; 305 s2 = x2;
305 s3 = x0 + x1 - x3; 306 s3 = x0 + x1 - x3;
306 307
307 // 1-D transform scaling factor is sqrt(2). 308 // 1-D transform scaling factor is sqrt(2).
308 // The overall dynamic range is 14b (input) + 14b (multiplication scaling) 309 // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
309 // + 1b (addition) = 29b. 310 // + 1b (addition) = 29b.
310 // Hence the output bit depth is 15b. 311 // Hence the output bit depth is 15b.
311 output[0] = dct_const_round_shift(s0); 312 output[0] = WRAPLOW(dct_const_round_shift(s0), 8);
312 output[1] = dct_const_round_shift(s1); 313 output[1] = WRAPLOW(dct_const_round_shift(s1), 8);
313 output[2] = dct_const_round_shift(s2); 314 output[2] = WRAPLOW(dct_const_round_shift(s2), 8);
314 output[3] = dct_const_round_shift(s3); 315 output[3] = WRAPLOW(dct_const_round_shift(s3), 8);
315 } 316 }
316 317
317 void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, 318 void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
318 int tx_type) { 319 int tx_type) {
319 const transform_2d IHT_4[] = { 320 const transform_2d IHT_4[] = {
320 { idct4, idct4 }, // DCT_DCT = 0 321 { idct4, idct4 }, // DCT_DCT = 0
321 { iadst4, idct4 }, // ADST_DCT = 1 322 { iadst4, idct4 }, // ADST_DCT = 1
322 { idct4, iadst4 }, // DCT_ADST = 2 323 { idct4, iadst4 }, // DCT_ADST = 2
323 { iadst4, iadst4 } // ADST_ADST = 3 324 { iadst4, iadst4 } // ADST_ADST = 3
324 }; 325 };
325 326
326 int i, j; 327 int i, j;
327 tran_low_t out[4 * 4]; 328 tran_low_t out[4 * 4];
328 tran_low_t *outptr = out; 329 tran_low_t *outptr = out;
329 tran_low_t temp_in[4], temp_out[4]; 330 tran_low_t temp_in[4], temp_out[4];
330 331
331 // inverse transform row vectors 332 // inverse transform row vectors
332 for (i = 0; i < 4; ++i) { 333 for (i = 0; i < 4; ++i) {
333 IHT_4[tx_type].rows(input, outptr); 334 IHT_4[tx_type].rows(input, outptr);
334 input += 4; 335 input += 4;
335 outptr += 4; 336 outptr += 4;
336 } 337 }
337 338
338 // inverse transform column vectors 339 // inverse transform column vectors
339 for (i = 0; i < 4; ++i) { 340 for (i = 0; i < 4; ++i) {
340 for (j = 0; j < 4; ++j) 341 for (j = 0; j < 4; ++j)
341 temp_in[j] = out[j * 4 + i]; 342 temp_in[j] = out[j * 4 + i];
342 IHT_4[tx_type].cols(temp_in, temp_out); 343 IHT_4[tx_type].cols(temp_in, temp_out);
343 for (j = 0; j < 4; ++j) 344 for (j = 0; j < 4; ++j) {
344 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) 345 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
345 + dest[j * stride + i]); 346 ROUND_POWER_OF_TWO(temp_out[j], 4));
347 }
346 } 348 }
347 } 349 }
350
348 static void iadst8(const tran_low_t *input, tran_low_t *output) { 351 static void iadst8(const tran_low_t *input, tran_low_t *output) {
349 int s0, s1, s2, s3, s4, s5, s6, s7; 352 int s0, s1, s2, s3, s4, s5, s6, s7;
350 353
351 tran_high_t x0 = input[7]; 354 tran_high_t x0 = input[7];
352 tran_high_t x1 = input[0]; 355 tran_high_t x1 = input[0];
353 tran_high_t x2 = input[5]; 356 tran_high_t x2 = input[5];
354 tran_high_t x3 = input[2]; 357 tran_high_t x3 = input[2];
355 tran_high_t x4 = input[3]; 358 tran_high_t x4 = input[3];
356 tran_high_t x5 = input[4]; 359 tran_high_t x5 = input[4];
357 tran_high_t x6 = input[1]; 360 tran_high_t x6 = input[1];
358 tran_high_t x7 = input[6]; 361 tran_high_t x7 = input[6];
359 362
360 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { 363 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
361 output[0] = output[1] = output[2] = output[3] = output[4] 364 output[0] = output[1] = output[2] = output[3] = output[4]
362 = output[5] = output[6] = output[7] = 0; 365 = output[5] = output[6] = output[7] = 0;
363 return; 366 return;
364 } 367 }
365 368
366 // stage 1 369 // stage 1
367 s0 = cospi_2_64 * x0 + cospi_30_64 * x1; 370 s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
368 s1 = cospi_30_64 * x0 - cospi_2_64 * x1; 371 s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
369 s2 = cospi_10_64 * x2 + cospi_22_64 * x3; 372 s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
370 s3 = cospi_22_64 * x2 - cospi_10_64 * x3; 373 s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
371 s4 = cospi_18_64 * x4 + cospi_14_64 * x5; 374 s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
372 s5 = cospi_14_64 * x4 - cospi_18_64 * x5; 375 s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
373 s6 = cospi_26_64 * x6 + cospi_6_64 * x7; 376 s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
374 s7 = cospi_6_64 * x6 - cospi_26_64 * x7; 377 s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
375 378
376 x0 = dct_const_round_shift(s0 + s4); 379 x0 = WRAPLOW(dct_const_round_shift(s0 + s4), 8);
377 x1 = dct_const_round_shift(s1 + s5); 380 x1 = WRAPLOW(dct_const_round_shift(s1 + s5), 8);
378 x2 = dct_const_round_shift(s2 + s6); 381 x2 = WRAPLOW(dct_const_round_shift(s2 + s6), 8);
379 x3 = dct_const_round_shift(s3 + s7); 382 x3 = WRAPLOW(dct_const_round_shift(s3 + s7), 8);
380 x4 = dct_const_round_shift(s0 - s4); 383 x4 = WRAPLOW(dct_const_round_shift(s0 - s4), 8);
381 x5 = dct_const_round_shift(s1 - s5); 384 x5 = WRAPLOW(dct_const_round_shift(s1 - s5), 8);
382 x6 = dct_const_round_shift(s2 - s6); 385 x6 = WRAPLOW(dct_const_round_shift(s2 - s6), 8);
383 x7 = dct_const_round_shift(s3 - s7); 386 x7 = WRAPLOW(dct_const_round_shift(s3 - s7), 8);
384 387
385 // stage 2 388 // stage 2
386 s0 = x0; 389 s0 = x0;
387 s1 = x1; 390 s1 = x1;
388 s2 = x2; 391 s2 = x2;
389 s3 = x3; 392 s3 = x3;
390 s4 = cospi_8_64 * x4 + cospi_24_64 * x5; 393 s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
391 s5 = cospi_24_64 * x4 - cospi_8_64 * x5; 394 s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
392 s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; 395 s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
393 s7 = cospi_8_64 * x6 + cospi_24_64 * x7; 396 s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
394 397
395 x0 = s0 + s2; 398 x0 = WRAPLOW(s0 + s2, 8);
396 x1 = s1 + s3; 399 x1 = WRAPLOW(s1 + s3, 8);
397 x2 = s0 - s2; 400 x2 = WRAPLOW(s0 - s2, 8);
398 x3 = s1 - s3; 401 x3 = WRAPLOW(s1 - s3, 8);
399 x4 = dct_const_round_shift(s4 + s6); 402 x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
400 x5 = dct_const_round_shift(s5 + s7); 403 x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
401 x6 = dct_const_round_shift(s4 - s6); 404 x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
402 x7 = dct_const_round_shift(s5 - s7); 405 x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
403 406
404 // stage 3 407 // stage 3
405 s2 = cospi_16_64 * (x2 + x3); 408 s2 = cospi_16_64 * (x2 + x3);
406 s3 = cospi_16_64 * (x2 - x3); 409 s3 = cospi_16_64 * (x2 - x3);
407 s6 = cospi_16_64 * (x6 + x7); 410 s6 = cospi_16_64 * (x6 + x7);
408 s7 = cospi_16_64 * (x6 - x7); 411 s7 = cospi_16_64 * (x6 - x7);
409 412
410 x2 = dct_const_round_shift(s2); 413 x2 = WRAPLOW(dct_const_round_shift(s2), 8);
411 x3 = dct_const_round_shift(s3); 414 x3 = WRAPLOW(dct_const_round_shift(s3), 8);
412 x6 = dct_const_round_shift(s6); 415 x6 = WRAPLOW(dct_const_round_shift(s6), 8);
413 x7 = dct_const_round_shift(s7); 416 x7 = WRAPLOW(dct_const_round_shift(s7), 8);
414 417
415 output[0] = x0; 418 output[0] = WRAPLOW(x0, 8);
416 output[1] = -x4; 419 output[1] = WRAPLOW(-x4, 8);
417 output[2] = x6; 420 output[2] = WRAPLOW(x6, 8);
418 output[3] = -x2; 421 output[3] = WRAPLOW(-x2, 8);
419 output[4] = x3; 422 output[4] = WRAPLOW(x3, 8);
420 output[5] = -x7; 423 output[5] = WRAPLOW(-x7, 8);
421 output[6] = x5; 424 output[6] = WRAPLOW(x5, 8);
422 output[7] = -x1; 425 output[7] = WRAPLOW(-x1, 8);
423 } 426 }
424 427
425 static const transform_2d IHT_8[] = { 428 static const transform_2d IHT_8[] = {
426 { idct8, idct8 }, // DCT_DCT = 0 429 { idct8, idct8 }, // DCT_DCT = 0
427 { iadst8, idct8 }, // ADST_DCT = 1 430 { iadst8, idct8 }, // ADST_DCT = 1
428 { idct8, iadst8 }, // DCT_ADST = 2 431 { idct8, iadst8 }, // DCT_ADST = 2
429 { iadst8, iadst8 } // ADST_ADST = 3 432 { iadst8, iadst8 } // ADST_ADST = 3
430 }; 433 };
431 434
432 void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, 435 void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
433 int tx_type) { 436 int tx_type) {
434 int i, j; 437 int i, j;
435 tran_low_t out[8 * 8]; 438 tran_low_t out[8 * 8];
436 tran_low_t *outptr = out; 439 tran_low_t *outptr = out;
437 tran_low_t temp_in[8], temp_out[8]; 440 tran_low_t temp_in[8], temp_out[8];
438 const transform_2d ht = IHT_8[tx_type]; 441 const transform_2d ht = IHT_8[tx_type];
439 442
440 // inverse transform row vectors 443 // inverse transform row vectors
441 for (i = 0; i < 8; ++i) { 444 for (i = 0; i < 8; ++i) {
442 ht.rows(input, outptr); 445 ht.rows(input, outptr);
443 input += 8; 446 input += 8;
444 outptr += 8; 447 outptr += 8;
445 } 448 }
446 449
447 // inverse transform column vectors 450 // inverse transform column vectors
448 for (i = 0; i < 8; ++i) { 451 for (i = 0; i < 8; ++i) {
449 for (j = 0; j < 8; ++j) 452 for (j = 0; j < 8; ++j)
450 temp_in[j] = out[j * 8 + i]; 453 temp_in[j] = out[j * 8 + i];
451 ht.cols(temp_in, temp_out); 454 ht.cols(temp_in, temp_out);
452 for (j = 0; j < 8; ++j) 455 for (j = 0; j < 8; ++j) {
453 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) 456 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
454 + dest[j * stride + i]); 457 ROUND_POWER_OF_TWO(temp_out[j], 5));
458 }
455 } 459 }
456 } 460 }
457 461
458 void vp9_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) { 462 void vp9_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
459 tran_low_t out[8 * 8] = { 0 }; 463 tran_low_t out[8 * 8] = { 0 };
460 tran_low_t *outptr = out; 464 tran_low_t *outptr = out;
461 int i, j; 465 int i, j;
462 tran_low_t temp_in[8], temp_out[8]; 466 tran_low_t temp_in[8], temp_out[8];
463 467
464 // First transform rows 468 // First transform rows
 465 // only the first 4 rows have non-zero coefs 469 // only the first 4 rows have non-zero coefs
466 for (i = 0; i < 4; ++i) { 470 for (i = 0; i < 4; ++i) {
467 idct8(input, outptr); 471 idct8(input, outptr);
468 input += 8; 472 input += 8;
469 outptr += 8; 473 outptr += 8;
470 } 474 }
471 475
472 // Then transform columns 476 // Then transform columns
473 for (i = 0; i < 8; ++i) { 477 for (i = 0; i < 8; ++i) {
474 for (j = 0; j < 8; ++j) 478 for (j = 0; j < 8; ++j)
475 temp_in[j] = out[j * 8 + i]; 479 temp_in[j] = out[j * 8 + i];
476 idct8(temp_in, temp_out); 480 idct8(temp_in, temp_out);
477 for (j = 0; j < 8; ++j) 481 for (j = 0; j < 8; ++j) {
478 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) 482 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
479 + dest[j * stride + i]); 483 ROUND_POWER_OF_TWO(temp_out[j], 5));
484 }
480 } 485 }
481 } 486 }
482 487
483 static void idct16(const tran_low_t *input, tran_low_t *output) { 488 static void idct16(const tran_low_t *input, tran_low_t *output) {
484 tran_low_t step1[16], step2[16]; 489 tran_low_t step1[16], step2[16];
485 tran_high_t temp1, temp2; 490 tran_high_t temp1, temp2;
486 491
487 // stage 1 492 // stage 1
488 step1[0] = input[0/2]; 493 step1[0] = input[0/2];
489 step1[1] = input[16/2]; 494 step1[1] = input[16/2];
(...skipping 17 matching lines...) Expand all
507 step2[1] = step1[1]; 512 step2[1] = step1[1];
508 step2[2] = step1[2]; 513 step2[2] = step1[2];
509 step2[3] = step1[3]; 514 step2[3] = step1[3];
510 step2[4] = step1[4]; 515 step2[4] = step1[4];
511 step2[5] = step1[5]; 516 step2[5] = step1[5];
512 step2[6] = step1[6]; 517 step2[6] = step1[6];
513 step2[7] = step1[7]; 518 step2[7] = step1[7];
514 519
515 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; 520 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
516 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; 521 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
517 step2[8] = dct_const_round_shift(temp1); 522 step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
518 step2[15] = dct_const_round_shift(temp2); 523 step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
519 524
520 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; 525 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
521 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; 526 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
522 step2[9] = dct_const_round_shift(temp1); 527 step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
523 step2[14] = dct_const_round_shift(temp2); 528 step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
524 529
525 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; 530 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
526 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; 531 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
527 step2[10] = dct_const_round_shift(temp1); 532 step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
528 step2[13] = dct_const_round_shift(temp2); 533 step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
529 534
530 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; 535 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
531 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; 536 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
532 step2[11] = dct_const_round_shift(temp1); 537 step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
533 step2[12] = dct_const_round_shift(temp2); 538 step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
534 539
535 // stage 3 540 // stage 3
536 step1[0] = step2[0]; 541 step1[0] = step2[0];
537 step1[1] = step2[1]; 542 step1[1] = step2[1];
538 step1[2] = step2[2]; 543 step1[2] = step2[2];
539 step1[3] = step2[3]; 544 step1[3] = step2[3];
540 545
541 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; 546 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
542 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; 547 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
543 step1[4] = dct_const_round_shift(temp1); 548 step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
544 step1[7] = dct_const_round_shift(temp2); 549 step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
545 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; 550 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
546 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; 551 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
547 step1[5] = dct_const_round_shift(temp1); 552 step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
548 step1[6] = dct_const_round_shift(temp2); 553 step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
549 554
550 step1[8] = step2[8] + step2[9]; 555 step1[8] = WRAPLOW(step2[8] + step2[9], 8);
551 step1[9] = step2[8] - step2[9]; 556 step1[9] = WRAPLOW(step2[8] - step2[9], 8);
552 step1[10] = -step2[10] + step2[11]; 557 step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
553 step1[11] = step2[10] + step2[11]; 558 step1[11] = WRAPLOW(step2[10] + step2[11], 8);
554 step1[12] = step2[12] + step2[13]; 559 step1[12] = WRAPLOW(step2[12] + step2[13], 8);
555 step1[13] = step2[12] - step2[13]; 560 step1[13] = WRAPLOW(step2[12] - step2[13], 8);
556 step1[14] = -step2[14] + step2[15]; 561 step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
557 step1[15] = step2[14] + step2[15]; 562 step1[15] = WRAPLOW(step2[14] + step2[15], 8);
558 563
559 // stage 4 564 // stage 4
560 temp1 = (step1[0] + step1[1]) * cospi_16_64; 565 temp1 = (step1[0] + step1[1]) * cospi_16_64;
561 temp2 = (step1[0] - step1[1]) * cospi_16_64; 566 temp2 = (step1[0] - step1[1]) * cospi_16_64;
562 step2[0] = dct_const_round_shift(temp1); 567 step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
563 step2[1] = dct_const_round_shift(temp2); 568 step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
564 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; 569 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
565 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; 570 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
566 step2[2] = dct_const_round_shift(temp1); 571 step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
567 step2[3] = dct_const_round_shift(temp2); 572 step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
568 step2[4] = step1[4] + step1[5]; 573 step2[4] = WRAPLOW(step1[4] + step1[5], 8);
569 step2[5] = step1[4] - step1[5]; 574 step2[5] = WRAPLOW(step1[4] - step1[5], 8);
570 step2[6] = -step1[6] + step1[7]; 575 step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
571 step2[7] = step1[6] + step1[7]; 576 step2[7] = WRAPLOW(step1[6] + step1[7], 8);
572 577
573 step2[8] = step1[8]; 578 step2[8] = step1[8];
574 step2[15] = step1[15]; 579 step2[15] = step1[15];
575 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; 580 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
576 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; 581 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
577 step2[9] = dct_const_round_shift(temp1); 582 step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
578 step2[14] = dct_const_round_shift(temp2); 583 step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
579 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; 584 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
580 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; 585 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
581 step2[10] = dct_const_round_shift(temp1); 586 step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
582 step2[13] = dct_const_round_shift(temp2); 587 step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
583 step2[11] = step1[11]; 588 step2[11] = step1[11];
584 step2[12] = step1[12]; 589 step2[12] = step1[12];
585 590
586 // stage 5 591 // stage 5
587 step1[0] = step2[0] + step2[3]; 592 step1[0] = WRAPLOW(step2[0] + step2[3], 8);
588 step1[1] = step2[1] + step2[2]; 593 step1[1] = WRAPLOW(step2[1] + step2[2], 8);
589 step1[2] = step2[1] - step2[2]; 594 step1[2] = WRAPLOW(step2[1] - step2[2], 8);
590 step1[3] = step2[0] - step2[3]; 595 step1[3] = WRAPLOW(step2[0] - step2[3], 8);
591 step1[4] = step2[4]; 596 step1[4] = step2[4];
592 temp1 = (step2[6] - step2[5]) * cospi_16_64; 597 temp1 = (step2[6] - step2[5]) * cospi_16_64;
593 temp2 = (step2[5] + step2[6]) * cospi_16_64; 598 temp2 = (step2[5] + step2[6]) * cospi_16_64;
594 step1[5] = dct_const_round_shift(temp1); 599 step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
595 step1[6] = dct_const_round_shift(temp2); 600 step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
596 step1[7] = step2[7]; 601 step1[7] = step2[7];
597 602
598 step1[8] = step2[8] + step2[11]; 603 step1[8] = WRAPLOW(step2[8] + step2[11], 8);
599 step1[9] = step2[9] + step2[10]; 604 step1[9] = WRAPLOW(step2[9] + step2[10], 8);
600 step1[10] = step2[9] - step2[10]; 605 step1[10] = WRAPLOW(step2[9] - step2[10], 8);
601 step1[11] = step2[8] - step2[11]; 606 step1[11] = WRAPLOW(step2[8] - step2[11], 8);
602 step1[12] = -step2[12] + step2[15]; 607 step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
603 step1[13] = -step2[13] + step2[14]; 608 step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
604 step1[14] = step2[13] + step2[14]; 609 step1[14] = WRAPLOW(step2[13] + step2[14], 8);
605 step1[15] = step2[12] + step2[15]; 610 step1[15] = WRAPLOW(step2[12] + step2[15], 8);
606 611
607 // stage 6 612 // stage 6
608 step2[0] = step1[0] + step1[7]; 613 step2[0] = WRAPLOW(step1[0] + step1[7], 8);
609 step2[1] = step1[1] + step1[6]; 614 step2[1] = WRAPLOW(step1[1] + step1[6], 8);
610 step2[2] = step1[2] + step1[5]; 615 step2[2] = WRAPLOW(step1[2] + step1[5], 8);
611 step2[3] = step1[3] + step1[4]; 616 step2[3] = WRAPLOW(step1[3] + step1[4], 8);
612 step2[4] = step1[3] - step1[4]; 617 step2[4] = WRAPLOW(step1[3] - step1[4], 8);
613 step2[5] = step1[2] - step1[5]; 618 step2[5] = WRAPLOW(step1[2] - step1[5], 8);
614 step2[6] = step1[1] - step1[6]; 619 step2[6] = WRAPLOW(step1[1] - step1[6], 8);
615 step2[7] = step1[0] - step1[7]; 620 step2[7] = WRAPLOW(step1[0] - step1[7], 8);
616 step2[8] = step1[8]; 621 step2[8] = step1[8];
617 step2[9] = step1[9]; 622 step2[9] = step1[9];
618 temp1 = (-step1[10] + step1[13]) * cospi_16_64; 623 temp1 = (-step1[10] + step1[13]) * cospi_16_64;
619 temp2 = (step1[10] + step1[13]) * cospi_16_64; 624 temp2 = (step1[10] + step1[13]) * cospi_16_64;
620 step2[10] = dct_const_round_shift(temp1); 625 step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
621 step2[13] = dct_const_round_shift(temp2); 626 step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
622 temp1 = (-step1[11] + step1[12]) * cospi_16_64; 627 temp1 = (-step1[11] + step1[12]) * cospi_16_64;
623 temp2 = (step1[11] + step1[12]) * cospi_16_64; 628 temp2 = (step1[11] + step1[12]) * cospi_16_64;
624 step2[11] = dct_const_round_shift(temp1); 629 step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
625 step2[12] = dct_const_round_shift(temp2); 630 step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
626 step2[14] = step1[14]; 631 step2[14] = step1[14];
627 step2[15] = step1[15]; 632 step2[15] = step1[15];
628 633
629 // stage 7 634 // stage 7
630 output[0] = step2[0] + step2[15]; 635 output[0] = WRAPLOW(step2[0] + step2[15], 8);
631 output[1] = step2[1] + step2[14]; 636 output[1] = WRAPLOW(step2[1] + step2[14], 8);
632 output[2] = step2[2] + step2[13]; 637 output[2] = WRAPLOW(step2[2] + step2[13], 8);
633 output[3] = step2[3] + step2[12]; 638 output[3] = WRAPLOW(step2[3] + step2[12], 8);
634 output[4] = step2[4] + step2[11]; 639 output[4] = WRAPLOW(step2[4] + step2[11], 8);
635 output[5] = step2[5] + step2[10]; 640 output[5] = WRAPLOW(step2[5] + step2[10], 8);
636 output[6] = step2[6] + step2[9]; 641 output[6] = WRAPLOW(step2[6] + step2[9], 8);
637 output[7] = step2[7] + step2[8]; 642 output[7] = WRAPLOW(step2[7] + step2[8], 8);
638 output[8] = step2[7] - step2[8]; 643 output[8] = WRAPLOW(step2[7] - step2[8], 8);
639 output[9] = step2[6] - step2[9]; 644 output[9] = WRAPLOW(step2[6] - step2[9], 8);
640 output[10] = step2[5] - step2[10]; 645 output[10] = WRAPLOW(step2[5] - step2[10], 8);
641 output[11] = step2[4] - step2[11]; 646 output[11] = WRAPLOW(step2[4] - step2[11], 8);
642 output[12] = step2[3] - step2[12]; 647 output[12] = WRAPLOW(step2[3] - step2[12], 8);
643 output[13] = step2[2] - step2[13]; 648 output[13] = WRAPLOW(step2[2] - step2[13], 8);
644 output[14] = step2[1] - step2[14]; 649 output[14] = WRAPLOW(step2[1] - step2[14], 8);
645 output[15] = step2[0] - step2[15]; 650 output[15] = WRAPLOW(step2[0] - step2[15], 8);
646 } 651 }
647 652
648 void vp9_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, 653 void vp9_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
649 int stride) { 654 int stride) {
650 tran_low_t out[16 * 16]; 655 tran_low_t out[16 * 16];
651 tran_low_t *outptr = out; 656 tran_low_t *outptr = out;
652 int i, j; 657 int i, j;
653 tran_low_t temp_in[16], temp_out[16]; 658 tran_low_t temp_in[16], temp_out[16];
654 659
655 // First transform rows 660 // First transform rows
656 for (i = 0; i < 16; ++i) { 661 for (i = 0; i < 16; ++i) {
657 idct16(input, outptr); 662 idct16(input, outptr);
658 input += 16; 663 input += 16;
659 outptr += 16; 664 outptr += 16;
660 } 665 }
661 666
662 // Then transform columns 667 // Then transform columns
663 for (i = 0; i < 16; ++i) { 668 for (i = 0; i < 16; ++i) {
664 for (j = 0; j < 16; ++j) 669 for (j = 0; j < 16; ++j)
665 temp_in[j] = out[j * 16 + i]; 670 temp_in[j] = out[j * 16 + i];
666 idct16(temp_in, temp_out); 671 idct16(temp_in, temp_out);
667 for (j = 0; j < 16; ++j) 672 for (j = 0; j < 16; ++j) {
668 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) 673 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
669 + dest[j * stride + i]); 674 ROUND_POWER_OF_TWO(temp_out[j], 6));
675 }
670 } 676 }
671 } 677 }
672 678
673 static void iadst16(const tran_low_t *input, tran_low_t *output) { 679 static void iadst16(const tran_low_t *input, tran_low_t *output) {
674 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; 680 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
675 tran_high_t s9, s10, s11, s12, s13, s14, s15; 681 tran_high_t s9, s10, s11, s12, s13, s14, s15;
676 682
677 tran_high_t x0 = input[15]; 683 tran_high_t x0 = input[15];
678 tran_high_t x1 = input[0]; 684 tran_high_t x1 = input[0];
679 tran_high_t x2 = input[13]; 685 tran_high_t x2 = input[13];
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after
711 s7 = x6 * cospi_19_64 - x7 * cospi_13_64; 717 s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
712 s8 = x8 * cospi_17_64 + x9 * cospi_15_64; 718 s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
713 s9 = x8 * cospi_15_64 - x9 * cospi_17_64; 719 s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
714 s10 = x10 * cospi_21_64 + x11 * cospi_11_64; 720 s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
715 s11 = x10 * cospi_11_64 - x11 * cospi_21_64; 721 s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
716 s12 = x12 * cospi_25_64 + x13 * cospi_7_64; 722 s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
717 s13 = x12 * cospi_7_64 - x13 * cospi_25_64; 723 s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
718 s14 = x14 * cospi_29_64 + x15 * cospi_3_64; 724 s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
719 s15 = x14 * cospi_3_64 - x15 * cospi_29_64; 725 s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
720 726
721 x0 = dct_const_round_shift(s0 + s8); 727 x0 = WRAPLOW(dct_const_round_shift(s0 + s8), 8);
722 x1 = dct_const_round_shift(s1 + s9); 728 x1 = WRAPLOW(dct_const_round_shift(s1 + s9), 8);
723 x2 = dct_const_round_shift(s2 + s10); 729 x2 = WRAPLOW(dct_const_round_shift(s2 + s10), 8);
724 x3 = dct_const_round_shift(s3 + s11); 730 x3 = WRAPLOW(dct_const_round_shift(s3 + s11), 8);
725 x4 = dct_const_round_shift(s4 + s12); 731 x4 = WRAPLOW(dct_const_round_shift(s4 + s12), 8);
726 x5 = dct_const_round_shift(s5 + s13); 732 x5 = WRAPLOW(dct_const_round_shift(s5 + s13), 8);
727 x6 = dct_const_round_shift(s6 + s14); 733 x6 = WRAPLOW(dct_const_round_shift(s6 + s14), 8);
728 x7 = dct_const_round_shift(s7 + s15); 734 x7 = WRAPLOW(dct_const_round_shift(s7 + s15), 8);
729 x8 = dct_const_round_shift(s0 - s8); 735 x8 = WRAPLOW(dct_const_round_shift(s0 - s8), 8);
730 x9 = dct_const_round_shift(s1 - s9); 736 x9 = WRAPLOW(dct_const_round_shift(s1 - s9), 8);
731 x10 = dct_const_round_shift(s2 - s10); 737 x10 = WRAPLOW(dct_const_round_shift(s2 - s10), 8);
732 x11 = dct_const_round_shift(s3 - s11); 738 x11 = WRAPLOW(dct_const_round_shift(s3 - s11), 8);
733 x12 = dct_const_round_shift(s4 - s12); 739 x12 = WRAPLOW(dct_const_round_shift(s4 - s12), 8);
734 x13 = dct_const_round_shift(s5 - s13); 740 x13 = WRAPLOW(dct_const_round_shift(s5 - s13), 8);
735 x14 = dct_const_round_shift(s6 - s14); 741 x14 = WRAPLOW(dct_const_round_shift(s6 - s14), 8);
736 x15 = dct_const_round_shift(s7 - s15); 742 x15 = WRAPLOW(dct_const_round_shift(s7 - s15), 8);
737 743
738 // stage 2 744 // stage 2
739 s0 = x0; 745 s0 = x0;
740 s1 = x1; 746 s1 = x1;
741 s2 = x2; 747 s2 = x2;
742 s3 = x3; 748 s3 = x3;
743 s4 = x4; 749 s4 = x4;
744 s5 = x5; 750 s5 = x5;
745 s6 = x6; 751 s6 = x6;
746 s7 = x7; 752 s7 = x7;
747 s8 = x8 * cospi_4_64 + x9 * cospi_28_64; 753 s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
748 s9 = x8 * cospi_28_64 - x9 * cospi_4_64; 754 s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
749 s10 = x10 * cospi_20_64 + x11 * cospi_12_64; 755 s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
750 s11 = x10 * cospi_12_64 - x11 * cospi_20_64; 756 s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
751 s12 = - x12 * cospi_28_64 + x13 * cospi_4_64; 757 s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
752 s13 = x12 * cospi_4_64 + x13 * cospi_28_64; 758 s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
753 s14 = - x14 * cospi_12_64 + x15 * cospi_20_64; 759 s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
754 s15 = x14 * cospi_20_64 + x15 * cospi_12_64; 760 s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
755 761
756 x0 = s0 + s4; 762 x0 = WRAPLOW(s0 + s4, 8);
757 x1 = s1 + s5; 763 x1 = WRAPLOW(s1 + s5, 8);
758 x2 = s2 + s6; 764 x2 = WRAPLOW(s2 + s6, 8);
759 x3 = s3 + s7; 765 x3 = WRAPLOW(s3 + s7, 8);
760 x4 = s0 - s4; 766 x4 = WRAPLOW(s0 - s4, 8);
761 x5 = s1 - s5; 767 x5 = WRAPLOW(s1 - s5, 8);
762 x6 = s2 - s6; 768 x6 = WRAPLOW(s2 - s6, 8);
763 x7 = s3 - s7; 769 x7 = WRAPLOW(s3 - s7, 8);
764 x8 = dct_const_round_shift(s8 + s12); 770 x8 = WRAPLOW(dct_const_round_shift(s8 + s12), 8);
765 x9 = dct_const_round_shift(s9 + s13); 771 x9 = WRAPLOW(dct_const_round_shift(s9 + s13), 8);
766 x10 = dct_const_round_shift(s10 + s14); 772 x10 = WRAPLOW(dct_const_round_shift(s10 + s14), 8);
767 x11 = dct_const_round_shift(s11 + s15); 773 x11 = WRAPLOW(dct_const_round_shift(s11 + s15), 8);
768 x12 = dct_const_round_shift(s8 - s12); 774 x12 = WRAPLOW(dct_const_round_shift(s8 - s12), 8);
769 x13 = dct_const_round_shift(s9 - s13); 775 x13 = WRAPLOW(dct_const_round_shift(s9 - s13), 8);
770 x14 = dct_const_round_shift(s10 - s14); 776 x14 = WRAPLOW(dct_const_round_shift(s10 - s14), 8);
771 x15 = dct_const_round_shift(s11 - s15); 777 x15 = WRAPLOW(dct_const_round_shift(s11 - s15), 8);
772 778
773 // stage 3 779 // stage 3
774 s0 = x0; 780 s0 = x0;
775 s1 = x1; 781 s1 = x1;
776 s2 = x2; 782 s2 = x2;
777 s3 = x3; 783 s3 = x3;
778 s4 = x4 * cospi_8_64 + x5 * cospi_24_64; 784 s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
779 s5 = x4 * cospi_24_64 - x5 * cospi_8_64; 785 s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
780 s6 = - x6 * cospi_24_64 + x7 * cospi_8_64; 786 s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
781 s7 = x6 * cospi_8_64 + x7 * cospi_24_64; 787 s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
782 s8 = x8; 788 s8 = x8;
783 s9 = x9; 789 s9 = x9;
784 s10 = x10; 790 s10 = x10;
785 s11 = x11; 791 s11 = x11;
786 s12 = x12 * cospi_8_64 + x13 * cospi_24_64; 792 s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
787 s13 = x12 * cospi_24_64 - x13 * cospi_8_64; 793 s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
788 s14 = - x14 * cospi_24_64 + x15 * cospi_8_64; 794 s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
789 s15 = x14 * cospi_8_64 + x15 * cospi_24_64; 795 s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
790 796
791 x0 = s0 + s2; 797 x0 = WRAPLOW(check_range(s0 + s2), 8);
792 x1 = s1 + s3; 798 x1 = WRAPLOW(check_range(s1 + s3), 8);
793 x2 = s0 - s2; 799 x2 = WRAPLOW(check_range(s0 - s2), 8);
794 x3 = s1 - s3; 800 x3 = WRAPLOW(check_range(s1 - s3), 8);
795 x4 = dct_const_round_shift(s4 + s6); 801 x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
796 x5 = dct_const_round_shift(s5 + s7); 802 x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
797 x6 = dct_const_round_shift(s4 - s6); 803 x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
798 x7 = dct_const_round_shift(s5 - s7); 804 x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
799 x8 = s8 + s10; 805 x8 = WRAPLOW(check_range(s8 + s10), 8);
800 x9 = s9 + s11; 806 x9 = WRAPLOW(check_range(s9 + s11), 8);
801 x10 = s8 - s10; 807 x10 = WRAPLOW(check_range(s8 - s10), 8);
802 x11 = s9 - s11; 808 x11 = WRAPLOW(check_range(s9 - s11), 8);
803 x12 = dct_const_round_shift(s12 + s14); 809 x12 = WRAPLOW(dct_const_round_shift(s12 + s14), 8);
804 x13 = dct_const_round_shift(s13 + s15); 810 x13 = WRAPLOW(dct_const_round_shift(s13 + s15), 8);
805 x14 = dct_const_round_shift(s12 - s14); 811 x14 = WRAPLOW(dct_const_round_shift(s12 - s14), 8);
806 x15 = dct_const_round_shift(s13 - s15); 812 x15 = WRAPLOW(dct_const_round_shift(s13 - s15), 8);
807 813
808 // stage 4 814 // stage 4
809 s2 = (- cospi_16_64) * (x2 + x3); 815 s2 = (- cospi_16_64) * (x2 + x3);
810 s3 = cospi_16_64 * (x2 - x3); 816 s3 = cospi_16_64 * (x2 - x3);
811 s6 = cospi_16_64 * (x6 + x7); 817 s6 = cospi_16_64 * (x6 + x7);
812 s7 = cospi_16_64 * (- x6 + x7); 818 s7 = cospi_16_64 * (- x6 + x7);
813 s10 = cospi_16_64 * (x10 + x11); 819 s10 = cospi_16_64 * (x10 + x11);
814 s11 = cospi_16_64 * (- x10 + x11); 820 s11 = cospi_16_64 * (- x10 + x11);
815 s14 = (- cospi_16_64) * (x14 + x15); 821 s14 = (- cospi_16_64) * (x14 + x15);
816 s15 = cospi_16_64 * (x14 - x15); 822 s15 = cospi_16_64 * (x14 - x15);
817 823
818 x2 = dct_const_round_shift(s2); 824 x2 = WRAPLOW(dct_const_round_shift(s2), 8);
819 x3 = dct_const_round_shift(s3); 825 x3 = WRAPLOW(dct_const_round_shift(s3), 8);
820 x6 = dct_const_round_shift(s6); 826 x6 = WRAPLOW(dct_const_round_shift(s6), 8);
821 x7 = dct_const_round_shift(s7); 827 x7 = WRAPLOW(dct_const_round_shift(s7), 8);
822 x10 = dct_const_round_shift(s10); 828 x10 = WRAPLOW(dct_const_round_shift(s10), 8);
823 x11 = dct_const_round_shift(s11); 829 x11 = WRAPLOW(dct_const_round_shift(s11), 8);
824 x14 = dct_const_round_shift(s14); 830 x14 = WRAPLOW(dct_const_round_shift(s14), 8);
825 x15 = dct_const_round_shift(s15); 831 x15 = WRAPLOW(dct_const_round_shift(s15), 8);
826 832
827 output[0] = x0; 833 output[0] = WRAPLOW(x0, 8);
828 output[1] = -x8; 834 output[1] = WRAPLOW(-x8, 8);
829 output[2] = x12; 835 output[2] = WRAPLOW(x12, 8);
830 output[3] = -x4; 836 output[3] = WRAPLOW(-x4, 8);
831 output[4] = x6; 837 output[4] = WRAPLOW(x6, 8);
832 output[5] = x14; 838 output[5] = WRAPLOW(x14, 8);
833 output[6] = x10; 839 output[6] = WRAPLOW(x10, 8);
834 output[7] = x2; 840 output[7] = WRAPLOW(x2, 8);
835 output[8] = x3; 841 output[8] = WRAPLOW(x3, 8);
836 output[9] = x11; 842 output[9] = WRAPLOW(x11, 8);
837 output[10] = x15; 843 output[10] = WRAPLOW(x15, 8);
838 output[11] = x7; 844 output[11] = WRAPLOW(x7, 8);
839 output[12] = x5; 845 output[12] = WRAPLOW(x5, 8);
840 output[13] = -x13; 846 output[13] = WRAPLOW(-x13, 8);
841 output[14] = x9; 847 output[14] = WRAPLOW(x9, 8);
842 output[15] = -x1; 848 output[15] = WRAPLOW(-x1, 8);
843 } 849 }
844 850
845 static const transform_2d IHT_16[] = { 851 static const transform_2d IHT_16[] = {
846 { idct16, idct16 }, // DCT_DCT = 0 852 { idct16, idct16 }, // DCT_DCT = 0
847 { iadst16, idct16 }, // ADST_DCT = 1 853 { iadst16, idct16 }, // ADST_DCT = 1
848 { idct16, iadst16 }, // DCT_ADST = 2 854 { idct16, iadst16 }, // DCT_ADST = 2
849 { iadst16, iadst16 } // ADST_ADST = 3 855 { iadst16, iadst16 } // ADST_ADST = 3
850 }; 856 };
851 857
852 void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, 858 void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
853 int tx_type) { 859 int tx_type) {
854 int i, j; 860 int i, j;
855 tran_low_t out[16 * 16]; 861 tran_low_t out[16 * 16];
856 tran_low_t *outptr = out; 862 tran_low_t *outptr = out;
857 tran_low_t temp_in[16], temp_out[16]; 863 tran_low_t temp_in[16], temp_out[16];
858 const transform_2d ht = IHT_16[tx_type]; 864 const transform_2d ht = IHT_16[tx_type];
859 865
860 // Rows 866 // Rows
861 for (i = 0; i < 16; ++i) { 867 for (i = 0; i < 16; ++i) {
862 ht.rows(input, outptr); 868 ht.rows(input, outptr);
863 input += 16; 869 input += 16;
864 outptr += 16; 870 outptr += 16;
865 } 871 }
866 872
867 // Columns 873 // Columns
868 for (i = 0; i < 16; ++i) { 874 for (i = 0; i < 16; ++i) {
869 for (j = 0; j < 16; ++j) 875 for (j = 0; j < 16; ++j)
870 temp_in[j] = out[j * 16 + i]; 876 temp_in[j] = out[j * 16 + i];
871 ht.cols(temp_in, temp_out); 877 ht.cols(temp_in, temp_out);
872 for (j = 0; j < 16; ++j) 878 for (j = 0; j < 16; ++j) {
873 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) 879 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
874 + dest[j * stride + i]); 880 ROUND_POWER_OF_TWO(temp_out[j], 6));
881 }
875 } 882 }
876 } 883 }
877 884
878 void vp9_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, 885 void vp9_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
879 int stride) { 886 int stride) {
880 tran_low_t out[16 * 16] = { 0 }; 887 tran_low_t out[16 * 16] = { 0 };
881 tran_low_t *outptr = out; 888 tran_low_t *outptr = out;
882 int i, j; 889 int i, j;
883 tran_low_t temp_in[16], temp_out[16]; 890 tran_low_t temp_in[16], temp_out[16];
884 891
885 // First transform rows. Since all non-zero dct coefficients are in 892 // First transform rows. Since all non-zero dct coefficients are in
886 // upper-left 4x4 area, we only need to calculate first 4 rows here. 893 // upper-left 4x4 area, we only need to calculate first 4 rows here.
887 for (i = 0; i < 4; ++i) { 894 for (i = 0; i < 4; ++i) {
888 idct16(input, outptr); 895 idct16(input, outptr);
889 input += 16; 896 input += 16;
890 outptr += 16; 897 outptr += 16;
891 } 898 }
892 899
893 // Then transform columns 900 // Then transform columns
894 for (i = 0; i < 16; ++i) { 901 for (i = 0; i < 16; ++i) {
895 for (j = 0; j < 16; ++j) 902 for (j = 0; j < 16; ++j)
896 temp_in[j] = out[j*16 + i]; 903 temp_in[j] = out[j*16 + i];
897 idct16(temp_in, temp_out); 904 idct16(temp_in, temp_out);
898 for (j = 0; j < 16; ++j) 905 for (j = 0; j < 16; ++j) {
899 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) 906 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
900 + dest[j * stride + i]); 907 ROUND_POWER_OF_TWO(temp_out[j], 6));
908 }
901 } 909 }
902 } 910 }
903 911
904 void vp9_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { 912 void vp9_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
905 int i, j; 913 int i, j;
906 tran_high_t a1; 914 tran_high_t a1;
907 tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64); 915 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
908 out = dct_const_round_shift(out * cospi_16_64); 916 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
909 a1 = ROUND_POWER_OF_TWO(out, 6); 917 a1 = ROUND_POWER_OF_TWO(out, 6);
910 for (j = 0; j < 16; ++j) { 918 for (j = 0; j < 16; ++j) {
911 for (i = 0; i < 16; ++i) 919 for (i = 0; i < 16; ++i)
912 dest[i] = clip_pixel(dest[i] + a1); 920 dest[i] = clip_pixel_add(dest[i], a1);
913 dest += stride; 921 dest += stride;
914 } 922 }
915 } 923 }
916 924
917 static void idct32(const tran_low_t *input, tran_low_t *output) { 925 static void idct32(const tran_low_t *input, tran_low_t *output) {
918 tran_low_t step1[32], step2[32]; 926 tran_low_t step1[32], step2[32];
919 tran_high_t temp1, temp2; 927 tran_high_t temp1, temp2;
920 928
921 // stage 1 929 // stage 1
922 step1[0] = input[0]; 930 step1[0] = input[0];
923 step1[1] = input[16]; 931 step1[1] = input[16];
924 step1[2] = input[8]; 932 step1[2] = input[8];
925 step1[3] = input[24]; 933 step1[3] = input[24];
926 step1[4] = input[4]; 934 step1[4] = input[4];
927 step1[5] = input[20]; 935 step1[5] = input[20];
928 step1[6] = input[12]; 936 step1[6] = input[12];
929 step1[7] = input[28]; 937 step1[7] = input[28];
930 step1[8] = input[2]; 938 step1[8] = input[2];
931 step1[9] = input[18]; 939 step1[9] = input[18];
932 step1[10] = input[10]; 940 step1[10] = input[10];
933 step1[11] = input[26]; 941 step1[11] = input[26];
934 step1[12] = input[6]; 942 step1[12] = input[6];
935 step1[13] = input[22]; 943 step1[13] = input[22];
936 step1[14] = input[14]; 944 step1[14] = input[14];
937 step1[15] = input[30]; 945 step1[15] = input[30];
938 946
939 temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; 947 temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
940 temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; 948 temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
941 step1[16] = dct_const_round_shift(temp1); 949 step1[16] = WRAPLOW(dct_const_round_shift(temp1), 8);
942 step1[31] = dct_const_round_shift(temp2); 950 step1[31] = WRAPLOW(dct_const_round_shift(temp2), 8);
943 951
944 temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; 952 temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
945 temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; 953 temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
946 step1[17] = dct_const_round_shift(temp1); 954 step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
947 step1[30] = dct_const_round_shift(temp2); 955 step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
948 956
949 temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; 957 temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
950 temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; 958 temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
951 step1[18] = dct_const_round_shift(temp1); 959 step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
952 step1[29] = dct_const_round_shift(temp2); 960 step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
953 961
954 temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; 962 temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
955 temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; 963 temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
956 step1[19] = dct_const_round_shift(temp1); 964 step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
957 step1[28] = dct_const_round_shift(temp2); 965 step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
958 966
959 temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; 967 temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
960 temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; 968 temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
961 step1[20] = dct_const_round_shift(temp1); 969 step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
962 step1[27] = dct_const_round_shift(temp2); 970 step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
963 971
964 temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; 972 temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
965 temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; 973 temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
966 step1[21] = dct_const_round_shift(temp1); 974 step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
967 step1[26] = dct_const_round_shift(temp2); 975 step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
968 976
969 temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; 977 temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
970 temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; 978 temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
971 step1[22] = dct_const_round_shift(temp1); 979 step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
972 step1[25] = dct_const_round_shift(temp2); 980 step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
973 981
974 temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; 982 temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
975 temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; 983 temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
976 step1[23] = dct_const_round_shift(temp1); 984 step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
977 step1[24] = dct_const_round_shift(temp2); 985 step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
978 986
979 // stage 2 987 // stage 2
980 step2[0] = step1[0]; 988 step2[0] = step1[0];
981 step2[1] = step1[1]; 989 step2[1] = step1[1];
982 step2[2] = step1[2]; 990 step2[2] = step1[2];
983 step2[3] = step1[3]; 991 step2[3] = step1[3];
984 step2[4] = step1[4]; 992 step2[4] = step1[4];
985 step2[5] = step1[5]; 993 step2[5] = step1[5];
986 step2[6] = step1[6]; 994 step2[6] = step1[6];
987 step2[7] = step1[7]; 995 step2[7] = step1[7];
988 996
989 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; 997 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
990 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; 998 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
991 step2[8] = dct_const_round_shift(temp1); 999 step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
992 step2[15] = dct_const_round_shift(temp2); 1000 step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
993 1001
994 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; 1002 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
995 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; 1003 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
996 step2[9] = dct_const_round_shift(temp1); 1004 step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
997 step2[14] = dct_const_round_shift(temp2); 1005 step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
998 1006
999 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; 1007 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
1000 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; 1008 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
1001 step2[10] = dct_const_round_shift(temp1); 1009 step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
1002 step2[13] = dct_const_round_shift(temp2); 1010 step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
1003 1011
1004 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; 1012 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
1005 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; 1013 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
1006 step2[11] = dct_const_round_shift(temp1); 1014 step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
1007 step2[12] = dct_const_round_shift(temp2); 1015 step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
1008 1016
1009 step2[16] = step1[16] + step1[17]; 1017 step2[16] = WRAPLOW(step1[16] + step1[17], 8);
1010 step2[17] = step1[16] - step1[17]; 1018 step2[17] = WRAPLOW(step1[16] - step1[17], 8);
1011 step2[18] = -step1[18] + step1[19]; 1019 step2[18] = WRAPLOW(-step1[18] + step1[19], 8);
1012 step2[19] = step1[18] + step1[19]; 1020 step2[19] = WRAPLOW(step1[18] + step1[19], 8);
1013 step2[20] = step1[20] + step1[21]; 1021 step2[20] = WRAPLOW(step1[20] + step1[21], 8);
1014 step2[21] = step1[20] - step1[21]; 1022 step2[21] = WRAPLOW(step1[20] - step1[21], 8);
1015 step2[22] = -step1[22] + step1[23]; 1023 step2[22] = WRAPLOW(-step1[22] + step1[23], 8);
1016 step2[23] = step1[22] + step1[23]; 1024 step2[23] = WRAPLOW(step1[22] + step1[23], 8);
1017 step2[24] = step1[24] + step1[25]; 1025 step2[24] = WRAPLOW(step1[24] + step1[25], 8);
1018 step2[25] = step1[24] - step1[25]; 1026 step2[25] = WRAPLOW(step1[24] - step1[25], 8);
1019 step2[26] = -step1[26] + step1[27]; 1027 step2[26] = WRAPLOW(-step1[26] + step1[27], 8);
1020 step2[27] = step1[26] + step1[27]; 1028 step2[27] = WRAPLOW(step1[26] + step1[27], 8);
1021 step2[28] = step1[28] + step1[29]; 1029 step2[28] = WRAPLOW(step1[28] + step1[29], 8);
1022 step2[29] = step1[28] - step1[29]; 1030 step2[29] = WRAPLOW(step1[28] - step1[29], 8);
1023 step2[30] = -step1[30] + step1[31]; 1031 step2[30] = WRAPLOW(-step1[30] + step1[31], 8);
1024 step2[31] = step1[30] + step1[31]; 1032 step2[31] = WRAPLOW(step1[30] + step1[31], 8);
1025 1033
1026 // stage 3 1034 // stage 3
1027 step1[0] = step2[0]; 1035 step1[0] = step2[0];
1028 step1[1] = step2[1]; 1036 step1[1] = step2[1];
1029 step1[2] = step2[2]; 1037 step1[2] = step2[2];
1030 step1[3] = step2[3]; 1038 step1[3] = step2[3];
1031 1039
1032 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; 1040 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
1033 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; 1041 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
1034 step1[4] = dct_const_round_shift(temp1); 1042 step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
1035 step1[7] = dct_const_round_shift(temp2); 1043 step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
1036 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; 1044 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
1037 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; 1045 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
1038 step1[5] = dct_const_round_shift(temp1); 1046 step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
1039 step1[6] = dct_const_round_shift(temp2); 1047 step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
1040 1048
1041 step1[8] = step2[8] + step2[9]; 1049 step1[8] = WRAPLOW(step2[8] + step2[9], 8);
1042 step1[9] = step2[8] - step2[9]; 1050 step1[9] = WRAPLOW(step2[8] - step2[9], 8);
1043 step1[10] = -step2[10] + step2[11]; 1051 step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
1044 step1[11] = step2[10] + step2[11]; 1052 step1[11] = WRAPLOW(step2[10] + step2[11], 8);
1045 step1[12] = step2[12] + step2[13]; 1053 step1[12] = WRAPLOW(step2[12] + step2[13], 8);
1046 step1[13] = step2[12] - step2[13]; 1054 step1[13] = WRAPLOW(step2[12] - step2[13], 8);
1047 step1[14] = -step2[14] + step2[15]; 1055 step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
1048 step1[15] = step2[14] + step2[15]; 1056 step1[15] = WRAPLOW(step2[14] + step2[15], 8);
1049 1057
1050 step1[16] = step2[16]; 1058 step1[16] = step2[16];
1051 step1[31] = step2[31]; 1059 step1[31] = step2[31];
1052 temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; 1060 temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
1053 temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; 1061 temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
1054 step1[17] = dct_const_round_shift(temp1); 1062 step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
1055 step1[30] = dct_const_round_shift(temp2); 1063 step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
1056 temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; 1064 temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
1057 temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; 1065 temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
1058 step1[18] = dct_const_round_shift(temp1); 1066 step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
1059 step1[29] = dct_const_round_shift(temp2); 1067 step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
1060 step1[19] = step2[19]; 1068 step1[19] = step2[19];
1061 step1[20] = step2[20]; 1069 step1[20] = step2[20];
1062 temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; 1070 temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
1063 temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; 1071 temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
1064 step1[21] = dct_const_round_shift(temp1); 1072 step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
1065 step1[26] = dct_const_round_shift(temp2); 1073 step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
1066 temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; 1074 temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
1067 temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; 1075 temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
1068 step1[22] = dct_const_round_shift(temp1); 1076 step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
1069 step1[25] = dct_const_round_shift(temp2); 1077 step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
1070 step1[23] = step2[23]; 1078 step1[23] = step2[23];
1071 step1[24] = step2[24]; 1079 step1[24] = step2[24];
1072 step1[27] = step2[27]; 1080 step1[27] = step2[27];
1073 step1[28] = step2[28]; 1081 step1[28] = step2[28];
1074 1082
1075 // stage 4 1083 // stage 4
1076 temp1 = (step1[0] + step1[1]) * cospi_16_64; 1084 temp1 = (step1[0] + step1[1]) * cospi_16_64;
1077 temp2 = (step1[0] - step1[1]) * cospi_16_64; 1085 temp2 = (step1[0] - step1[1]) * cospi_16_64;
1078 step2[0] = dct_const_round_shift(temp1); 1086 step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
1079 step2[1] = dct_const_round_shift(temp2); 1087 step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
1080 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; 1088 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
1081 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; 1089 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
1082 step2[2] = dct_const_round_shift(temp1); 1090 step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
1083 step2[3] = dct_const_round_shift(temp2); 1091 step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
1084 step2[4] = step1[4] + step1[5]; 1092 step2[4] = WRAPLOW(step1[4] + step1[5], 8);
1085 step2[5] = step1[4] - step1[5]; 1093 step2[5] = WRAPLOW(step1[4] - step1[5], 8);
1086 step2[6] = -step1[6] + step1[7]; 1094 step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
1087 step2[7] = step1[6] + step1[7]; 1095 step2[7] = WRAPLOW(step1[6] + step1[7], 8);
1088 1096
1089 step2[8] = step1[8]; 1097 step2[8] = step1[8];
1090 step2[15] = step1[15]; 1098 step2[15] = step1[15];
1091 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; 1099 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
1092 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; 1100 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
1093 step2[9] = dct_const_round_shift(temp1); 1101 step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
1094 step2[14] = dct_const_round_shift(temp2); 1102 step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
1095 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; 1103 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
1096 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; 1104 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
1097 step2[10] = dct_const_round_shift(temp1); 1105 step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
1098 step2[13] = dct_const_round_shift(temp2); 1106 step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
1099 step2[11] = step1[11]; 1107 step2[11] = step1[11];
1100 step2[12] = step1[12]; 1108 step2[12] = step1[12];
1101 1109
1102 step2[16] = step1[16] + step1[19]; 1110 step2[16] = WRAPLOW(step1[16] + step1[19], 8);
1103 step2[17] = step1[17] + step1[18]; 1111 step2[17] = WRAPLOW(step1[17] + step1[18], 8);
1104 step2[18] = step1[17] - step1[18]; 1112 step2[18] = WRAPLOW(step1[17] - step1[18], 8);
1105 step2[19] = step1[16] - step1[19]; 1113 step2[19] = WRAPLOW(step1[16] - step1[19], 8);
1106 step2[20] = -step1[20] + step1[23]; 1114 step2[20] = WRAPLOW(-step1[20] + step1[23], 8);
1107 step2[21] = -step1[21] + step1[22]; 1115 step2[21] = WRAPLOW(-step1[21] + step1[22], 8);
1108 step2[22] = step1[21] + step1[22]; 1116 step2[22] = WRAPLOW(step1[21] + step1[22], 8);
1109 step2[23] = step1[20] + step1[23]; 1117 step2[23] = WRAPLOW(step1[20] + step1[23], 8);
1110 1118
1111 step2[24] = step1[24] + step1[27]; 1119 step2[24] = WRAPLOW(step1[24] + step1[27], 8);
1112 step2[25] = step1[25] + step1[26]; 1120 step2[25] = WRAPLOW(step1[25] + step1[26], 8);
1113 step2[26] = step1[25] - step1[26]; 1121 step2[26] = WRAPLOW(step1[25] - step1[26], 8);
1114 step2[27] = step1[24] - step1[27]; 1122 step2[27] = WRAPLOW(step1[24] - step1[27], 8);
1115 step2[28] = -step1[28] + step1[31]; 1123 step2[28] = WRAPLOW(-step1[28] + step1[31], 8);
1116 step2[29] = -step1[29] + step1[30]; 1124 step2[29] = WRAPLOW(-step1[29] + step1[30], 8);
1117 step2[30] = step1[29] + step1[30]; 1125 step2[30] = WRAPLOW(step1[29] + step1[30], 8);
1118 step2[31] = step1[28] + step1[31]; 1126 step2[31] = WRAPLOW(step1[28] + step1[31], 8);
1119 1127
1120 // stage 5 1128 // stage 5
1121 step1[0] = step2[0] + step2[3]; 1129 step1[0] = WRAPLOW(step2[0] + step2[3], 8);
1122 step1[1] = step2[1] + step2[2]; 1130 step1[1] = WRAPLOW(step2[1] + step2[2], 8);
1123 step1[2] = step2[1] - step2[2]; 1131 step1[2] = WRAPLOW(step2[1] - step2[2], 8);
1124 step1[3] = step2[0] - step2[3]; 1132 step1[3] = WRAPLOW(step2[0] - step2[3], 8);
1125 step1[4] = step2[4]; 1133 step1[4] = step2[4];
1126 temp1 = (step2[6] - step2[5]) * cospi_16_64; 1134 temp1 = (step2[6] - step2[5]) * cospi_16_64;
1127 temp2 = (step2[5] + step2[6]) * cospi_16_64; 1135 temp2 = (step2[5] + step2[6]) * cospi_16_64;
1128 step1[5] = dct_const_round_shift(temp1); 1136 step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
1129 step1[6] = dct_const_round_shift(temp2); 1137 step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
1130 step1[7] = step2[7]; 1138 step1[7] = step2[7];
1131 1139
1132 step1[8] = step2[8] + step2[11]; 1140 step1[8] = WRAPLOW(step2[8] + step2[11], 8);
1133 step1[9] = step2[9] + step2[10]; 1141 step1[9] = WRAPLOW(step2[9] + step2[10], 8);
1134 step1[10] = step2[9] - step2[10]; 1142 step1[10] = WRAPLOW(step2[9] - step2[10], 8);
1135 step1[11] = step2[8] - step2[11]; 1143 step1[11] = WRAPLOW(step2[8] - step2[11], 8);
1136 step1[12] = -step2[12] + step2[15]; 1144 step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
1137 step1[13] = -step2[13] + step2[14]; 1145 step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
1138 step1[14] = step2[13] + step2[14]; 1146 step1[14] = WRAPLOW(step2[13] + step2[14], 8);
1139 step1[15] = step2[12] + step2[15]; 1147 step1[15] = WRAPLOW(step2[12] + step2[15], 8);
1140 1148
1141 step1[16] = step2[16]; 1149 step1[16] = step2[16];
1142 step1[17] = step2[17]; 1150 step1[17] = step2[17];
1143 temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; 1151 temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
1144 temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; 1152 temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
1145 step1[18] = dct_const_round_shift(temp1); 1153 step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
1146 step1[29] = dct_const_round_shift(temp2); 1154 step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
1147 temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; 1155 temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
1148 temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; 1156 temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
1149 step1[19] = dct_const_round_shift(temp1); 1157 step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
1150 step1[28] = dct_const_round_shift(temp2); 1158 step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
1151 temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; 1159 temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
1152 temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; 1160 temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
1153 step1[20] = dct_const_round_shift(temp1); 1161 step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
1154 step1[27] = dct_const_round_shift(temp2); 1162 step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
1155 temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; 1163 temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
1156 temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; 1164 temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
1157 step1[21] = dct_const_round_shift(temp1); 1165 step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
1158 step1[26] = dct_const_round_shift(temp2); 1166 step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
1159 step1[22] = step2[22]; 1167 step1[22] = step2[22];
1160 step1[23] = step2[23]; 1168 step1[23] = step2[23];
1161 step1[24] = step2[24]; 1169 step1[24] = step2[24];
1162 step1[25] = step2[25]; 1170 step1[25] = step2[25];
1163 step1[30] = step2[30]; 1171 step1[30] = step2[30];
1164 step1[31] = step2[31]; 1172 step1[31] = step2[31];
1165 1173
1166 // stage 6 1174 // stage 6
1167 step2[0] = step1[0] + step1[7]; 1175 step2[0] = WRAPLOW(step1[0] + step1[7], 8);
1168 step2[1] = step1[1] + step1[6]; 1176 step2[1] = WRAPLOW(step1[1] + step1[6], 8);
1169 step2[2] = step1[2] + step1[5]; 1177 step2[2] = WRAPLOW(step1[2] + step1[5], 8);
1170 step2[3] = step1[3] + step1[4]; 1178 step2[3] = WRAPLOW(step1[3] + step1[4], 8);
1171 step2[4] = step1[3] - step1[4]; 1179 step2[4] = WRAPLOW(step1[3] - step1[4], 8);
1172 step2[5] = step1[2] - step1[5]; 1180 step2[5] = WRAPLOW(step1[2] - step1[5], 8);
1173 step2[6] = step1[1] - step1[6]; 1181 step2[6] = WRAPLOW(step1[1] - step1[6], 8);
1174 step2[7] = step1[0] - step1[7]; 1182 step2[7] = WRAPLOW(step1[0] - step1[7], 8);
1175 step2[8] = step1[8]; 1183 step2[8] = step1[8];
1176 step2[9] = step1[9]; 1184 step2[9] = step1[9];
1177 temp1 = (-step1[10] + step1[13]) * cospi_16_64; 1185 temp1 = (-step1[10] + step1[13]) * cospi_16_64;
1178 temp2 = (step1[10] + step1[13]) * cospi_16_64; 1186 temp2 = (step1[10] + step1[13]) * cospi_16_64;
1179 step2[10] = dct_const_round_shift(temp1); 1187 step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
1180 step2[13] = dct_const_round_shift(temp2); 1188 step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
1181 temp1 = (-step1[11] + step1[12]) * cospi_16_64; 1189 temp1 = (-step1[11] + step1[12]) * cospi_16_64;
1182 temp2 = (step1[11] + step1[12]) * cospi_16_64; 1190 temp2 = (step1[11] + step1[12]) * cospi_16_64;
1183 step2[11] = dct_const_round_shift(temp1); 1191 step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
1184 step2[12] = dct_const_round_shift(temp2); 1192 step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
1185 step2[14] = step1[14]; 1193 step2[14] = step1[14];
1186 step2[15] = step1[15]; 1194 step2[15] = step1[15];
1187 1195
1188 step2[16] = step1[16] + step1[23]; 1196 step2[16] = WRAPLOW(step1[16] + step1[23], 8);
1189 step2[17] = step1[17] + step1[22]; 1197 step2[17] = WRAPLOW(step1[17] + step1[22], 8);
1190 step2[18] = step1[18] + step1[21]; 1198 step2[18] = WRAPLOW(step1[18] + step1[21], 8);
1191 step2[19] = step1[19] + step1[20]; 1199 step2[19] = WRAPLOW(step1[19] + step1[20], 8);
1192 step2[20] = step1[19] - step1[20]; 1200 step2[20] = WRAPLOW(step1[19] - step1[20], 8);
1193 step2[21] = step1[18] - step1[21]; 1201 step2[21] = WRAPLOW(step1[18] - step1[21], 8);
1194 step2[22] = step1[17] - step1[22]; 1202 step2[22] = WRAPLOW(step1[17] - step1[22], 8);
1195 step2[23] = step1[16] - step1[23]; 1203 step2[23] = WRAPLOW(step1[16] - step1[23], 8);
1196 1204
1197 step2[24] = -step1[24] + step1[31]; 1205 step2[24] = WRAPLOW(-step1[24] + step1[31], 8);
1198 step2[25] = -step1[25] + step1[30]; 1206 step2[25] = WRAPLOW(-step1[25] + step1[30], 8);
1199 step2[26] = -step1[26] + step1[29]; 1207 step2[26] = WRAPLOW(-step1[26] + step1[29], 8);
1200 step2[27] = -step1[27] + step1[28]; 1208 step2[27] = WRAPLOW(-step1[27] + step1[28], 8);
1201 step2[28] = step1[27] + step1[28]; 1209 step2[28] = WRAPLOW(step1[27] + step1[28], 8);
1202 step2[29] = step1[26] + step1[29]; 1210 step2[29] = WRAPLOW(step1[26] + step1[29], 8);
1203 step2[30] = step1[25] + step1[30]; 1211 step2[30] = WRAPLOW(step1[25] + step1[30], 8);
1204 step2[31] = step1[24] + step1[31]; 1212 step2[31] = WRAPLOW(step1[24] + step1[31], 8);
1205 1213
1206 // stage 7 1214 // stage 7
1207 step1[0] = step2[0] + step2[15]; 1215 step1[0] = WRAPLOW(step2[0] + step2[15], 8);
1208 step1[1] = step2[1] + step2[14]; 1216 step1[1] = WRAPLOW(step2[1] + step2[14], 8);
1209 step1[2] = step2[2] + step2[13]; 1217 step1[2] = WRAPLOW(step2[2] + step2[13], 8);
1210 step1[3] = step2[3] + step2[12]; 1218 step1[3] = WRAPLOW(step2[3] + step2[12], 8);
1211 step1[4] = step2[4] + step2[11]; 1219 step1[4] = WRAPLOW(step2[4] + step2[11], 8);
1212 step1[5] = step2[5] + step2[10]; 1220 step1[5] = WRAPLOW(step2[5] + step2[10], 8);
1213 step1[6] = step2[6] + step2[9]; 1221 step1[6] = WRAPLOW(step2[6] + step2[9], 8);
1214 step1[7] = step2[7] + step2[8]; 1222 step1[7] = WRAPLOW(step2[7] + step2[8], 8);
1215 step1[8] = step2[7] - step2[8]; 1223 step1[8] = WRAPLOW(step2[7] - step2[8], 8);
1216 step1[9] = step2[6] - step2[9]; 1224 step1[9] = WRAPLOW(step2[6] - step2[9], 8);
1217 step1[10] = step2[5] - step2[10]; 1225 step1[10] = WRAPLOW(step2[5] - step2[10], 8);
1218 step1[11] = step2[4] - step2[11]; 1226 step1[11] = WRAPLOW(step2[4] - step2[11], 8);
1219 step1[12] = step2[3] - step2[12]; 1227 step1[12] = WRAPLOW(step2[3] - step2[12], 8);
1220 step1[13] = step2[2] - step2[13]; 1228 step1[13] = WRAPLOW(step2[2] - step2[13], 8);
1221 step1[14] = step2[1] - step2[14]; 1229 step1[14] = WRAPLOW(step2[1] - step2[14], 8);
1222 step1[15] = step2[0] - step2[15]; 1230 step1[15] = WRAPLOW(step2[0] - step2[15], 8);
1223 1231
1224 step1[16] = step2[16]; 1232 step1[16] = step2[16];
1225 step1[17] = step2[17]; 1233 step1[17] = step2[17];
1226 step1[18] = step2[18]; 1234 step1[18] = step2[18];
1227 step1[19] = step2[19]; 1235 step1[19] = step2[19];
1228 temp1 = (-step2[20] + step2[27]) * cospi_16_64; 1236 temp1 = (-step2[20] + step2[27]) * cospi_16_64;
1229 temp2 = (step2[20] + step2[27]) * cospi_16_64; 1237 temp2 = (step2[20] + step2[27]) * cospi_16_64;
1230 step1[20] = dct_const_round_shift(temp1); 1238 step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
1231 step1[27] = dct_const_round_shift(temp2); 1239 step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
1232 temp1 = (-step2[21] + step2[26]) * cospi_16_64; 1240 temp1 = (-step2[21] + step2[26]) * cospi_16_64;
1233 temp2 = (step2[21] + step2[26]) * cospi_16_64; 1241 temp2 = (step2[21] + step2[26]) * cospi_16_64;
1234 step1[21] = dct_const_round_shift(temp1); 1242 step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
1235 step1[26] = dct_const_round_shift(temp2); 1243 step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
1236 temp1 = (-step2[22] + step2[25]) * cospi_16_64; 1244 temp1 = (-step2[22] + step2[25]) * cospi_16_64;
1237 temp2 = (step2[22] + step2[25]) * cospi_16_64; 1245 temp2 = (step2[22] + step2[25]) * cospi_16_64;
1238 step1[22] = dct_const_round_shift(temp1); 1246 step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
1239 step1[25] = dct_const_round_shift(temp2); 1247 step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
1240 temp1 = (-step2[23] + step2[24]) * cospi_16_64; 1248 temp1 = (-step2[23] + step2[24]) * cospi_16_64;
1241 temp2 = (step2[23] + step2[24]) * cospi_16_64; 1249 temp2 = (step2[23] + step2[24]) * cospi_16_64;
1242 step1[23] = dct_const_round_shift(temp1); 1250 step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
1243 step1[24] = dct_const_round_shift(temp2); 1251 step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
1244 step1[28] = step2[28]; 1252 step1[28] = step2[28];
1245 step1[29] = step2[29]; 1253 step1[29] = step2[29];
1246 step1[30] = step2[30]; 1254 step1[30] = step2[30];
1247 step1[31] = step2[31]; 1255 step1[31] = step2[31];
1248 1256
1249 // final stage 1257 // final stage
1250 output[0] = step1[0] + step1[31]; 1258 output[0] = WRAPLOW(step1[0] + step1[31], 8);
1251 output[1] = step1[1] + step1[30]; 1259 output[1] = WRAPLOW(step1[1] + step1[30], 8);
1252 output[2] = step1[2] + step1[29]; 1260 output[2] = WRAPLOW(step1[2] + step1[29], 8);
1253 output[3] = step1[3] + step1[28]; 1261 output[3] = WRAPLOW(step1[3] + step1[28], 8);
1254 output[4] = step1[4] + step1[27]; 1262 output[4] = WRAPLOW(step1[4] + step1[27], 8);
1255 output[5] = step1[5] + step1[26]; 1263 output[5] = WRAPLOW(step1[5] + step1[26], 8);
1256 output[6] = step1[6] + step1[25]; 1264 output[6] = WRAPLOW(step1[6] + step1[25], 8);
1257 output[7] = step1[7] + step1[24]; 1265 output[7] = WRAPLOW(step1[7] + step1[24], 8);
1258 output[8] = step1[8] + step1[23]; 1266 output[8] = WRAPLOW(step1[8] + step1[23], 8);
1259 output[9] = step1[9] + step1[22]; 1267 output[9] = WRAPLOW(step1[9] + step1[22], 8);
1260 output[10] = step1[10] + step1[21]; 1268 output[10] = WRAPLOW(step1[10] + step1[21], 8);
1261 output[11] = step1[11] + step1[20]; 1269 output[11] = WRAPLOW(step1[11] + step1[20], 8);
1262 output[12] = step1[12] + step1[19]; 1270 output[12] = WRAPLOW(step1[12] + step1[19], 8);
1263 output[13] = step1[13] + step1[18]; 1271 output[13] = WRAPLOW(step1[13] + step1[18], 8);
1264 output[14] = step1[14] + step1[17]; 1272 output[14] = WRAPLOW(step1[14] + step1[17], 8);
1265 output[15] = step1[15] + step1[16]; 1273 output[15] = WRAPLOW(step1[15] + step1[16], 8);
1266 output[16] = step1[15] - step1[16]; 1274 output[16] = WRAPLOW(step1[15] - step1[16], 8);
1267 output[17] = step1[14] - step1[17]; 1275 output[17] = WRAPLOW(step1[14] - step1[17], 8);
1268 output[18] = step1[13] - step1[18]; 1276 output[18] = WRAPLOW(step1[13] - step1[18], 8);
1269 output[19] = step1[12] - step1[19]; 1277 output[19] = WRAPLOW(step1[12] - step1[19], 8);
1270 output[20] = step1[11] - step1[20]; 1278 output[20] = WRAPLOW(step1[11] - step1[20], 8);
1271 output[21] = step1[10] - step1[21]; 1279 output[21] = WRAPLOW(step1[10] - step1[21], 8);
1272 output[22] = step1[9] - step1[22]; 1280 output[22] = WRAPLOW(step1[9] - step1[22], 8);
1273 output[23] = step1[8] - step1[23]; 1281 output[23] = WRAPLOW(step1[8] - step1[23], 8);
1274 output[24] = step1[7] - step1[24]; 1282 output[24] = WRAPLOW(step1[7] - step1[24], 8);
1275 output[25] = step1[6] - step1[25]; 1283 output[25] = WRAPLOW(step1[6] - step1[25], 8);
1276 output[26] = step1[5] - step1[26]; 1284 output[26] = WRAPLOW(step1[5] - step1[26], 8);
1277 output[27] = step1[4] - step1[27]; 1285 output[27] = WRAPLOW(step1[4] - step1[27], 8);
1278 output[28] = step1[3] - step1[28]; 1286 output[28] = WRAPLOW(step1[3] - step1[28], 8);
1279 output[29] = step1[2] - step1[29]; 1287 output[29] = WRAPLOW(step1[2] - step1[29], 8);
1280 output[30] = step1[1] - step1[30]; 1288 output[30] = WRAPLOW(step1[1] - step1[30], 8);
1281 output[31] = step1[0] - step1[31]; 1289 output[31] = WRAPLOW(step1[0] - step1[31], 8);
1282 } 1290 }
1283 1291
1284 void vp9_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, 1292 void vp9_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
1285 int stride) { 1293 int stride) {
1286 tran_low_t out[32 * 32]; 1294 tran_low_t out[32 * 32];
1287 tran_low_t *outptr = out; 1295 tran_low_t *outptr = out;
1288 int i, j; 1296 int i, j;
1289 tran_low_t temp_in[32], temp_out[32]; 1297 tran_low_t temp_in[32], temp_out[32];
1290 1298
1291 // Rows 1299 // Rows
(...skipping 14 matching lines...) Expand all
1306 vpx_memset(outptr, 0, sizeof(tran_low_t) * 32); 1314 vpx_memset(outptr, 0, sizeof(tran_low_t) * 32);
1307 input += 32; 1315 input += 32;
1308 outptr += 32; 1316 outptr += 32;
1309 } 1317 }
1310 1318
1311 // Columns 1319 // Columns
1312 for (i = 0; i < 32; ++i) { 1320 for (i = 0; i < 32; ++i) {
1313 for (j = 0; j < 32; ++j) 1321 for (j = 0; j < 32; ++j)
1314 temp_in[j] = out[j * 32 + i]; 1322 temp_in[j] = out[j * 32 + i];
1315 idct32(temp_in, temp_out); 1323 idct32(temp_in, temp_out);
1316 for (j = 0; j < 32; ++j) 1324 for (j = 0; j < 32; ++j) {
1317 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) 1325 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
1318 + dest[j * stride + i]); 1326 ROUND_POWER_OF_TWO(temp_out[j], 6));
1327 }
1319 } 1328 }
1320 } 1329 }
1321 1330
1322 void vp9_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, 1331 void vp9_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
1323 int stride) { 1332 int stride) {
1324 tran_low_t out[32 * 32] = {0}; 1333 tran_low_t out[32 * 32] = {0};
1325 tran_low_t *outptr = out; 1334 tran_low_t *outptr = out;
1326 int i, j; 1335 int i, j;
1327 tran_low_t temp_in[32], temp_out[32]; 1336 tran_low_t temp_in[32], temp_out[32];
1328 1337
1329 // Rows 1338 // Rows
1330 // only upper-left 8x8 has non-zero coeff 1339 // only upper-left 8x8 has non-zero coeff
1331 for (i = 0; i < 8; ++i) { 1340 for (i = 0; i < 8; ++i) {
1332 idct32(input, outptr); 1341 idct32(input, outptr);
1333 input += 32; 1342 input += 32;
1334 outptr += 32; 1343 outptr += 32;
1335 } 1344 }
1336 1345
1337 // Columns 1346 // Columns
1338 for (i = 0; i < 32; ++i) { 1347 for (i = 0; i < 32; ++i) {
1339 for (j = 0; j < 32; ++j) 1348 for (j = 0; j < 32; ++j)
1340 temp_in[j] = out[j * 32 + i]; 1349 temp_in[j] = out[j * 32 + i];
1341 idct32(temp_in, temp_out); 1350 idct32(temp_in, temp_out);
1342 for (j = 0; j < 32; ++j) 1351 for (j = 0; j < 32; ++j) {
1343 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) 1352 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
1344 + dest[j * stride + i]); 1353 ROUND_POWER_OF_TWO(temp_out[j], 6));
1354 }
1345 } 1355 }
1346 } 1356 }
1347 1357
1348 void vp9_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { 1358 void vp9_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
1349 int i, j; 1359 int i, j;
1350 tran_high_t a1; 1360 tran_high_t a1;
1351 1361
1352 tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64); 1362 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
1353 out = dct_const_round_shift(out * cospi_16_64); 1363 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
1354 a1 = ROUND_POWER_OF_TWO(out, 6); 1364 a1 = ROUND_POWER_OF_TWO(out, 6);
1355 1365
1356 for (j = 0; j < 32; ++j) { 1366 for (j = 0; j < 32; ++j) {
1357 for (i = 0; i < 32; ++i) 1367 for (i = 0; i < 32; ++i)
1358 dest[i] = clip_pixel(dest[i] + a1); 1368 dest[i] = clip_pixel_add(dest[i], a1);
1359 dest += stride; 1369 dest += stride;
1360 } 1370 }
1361 } 1371 }
1362 1372
1363 // idct 1373 // idct
1364 void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, 1374 void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
1365 int eob) { 1375 int eob) {
1366 if (eob > 1) 1376 if (eob > 1)
1367 vp9_idct4x4_16_add(input, dest, stride); 1377 vp9_idct4x4_16_add(input, dest, stride);
1368 else 1378 else
(...skipping 72 matching lines...) Expand 10 before | Expand all | Expand 10 after
1441 void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, 1451 void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
1442 int stride, int eob) { 1452 int stride, int eob) {
1443 if (tx_type == DCT_DCT) { 1453 if (tx_type == DCT_DCT) {
1444 vp9_idct16x16_add(input, dest, stride, eob); 1454 vp9_idct16x16_add(input, dest, stride, eob);
1445 } else { 1455 } else {
1446 vp9_iht16x16_256_add(input, dest, stride, tx_type); 1456 vp9_iht16x16_256_add(input, dest, stride, tx_type);
1447 } 1457 }
1448 } 1458 }
1449 1459
1450 #if CONFIG_VP9_HIGHBITDEPTH 1460 #if CONFIG_VP9_HIGHBITDEPTH
1451 void vp9_high_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, 1461 void vp9_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
1452 int stride, int bd) { 1462 int stride, int bd) {
1453 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, 1463 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
1454 0.5 shifts per pixel. */ 1464 0.5 shifts per pixel. */
1455 int i; 1465 int i;
1456 tran_low_t output[16]; 1466 tran_low_t output[16];
1457 tran_high_t a1, b1, c1, d1, e1; 1467 tran_high_t a1, b1, c1, d1, e1;
1458 const tran_low_t *ip = input; 1468 const tran_low_t *ip = input;
1459 tran_low_t *op = output; 1469 tran_low_t *op = output;
1460 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 1470 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1461 1471
1462 for (i = 0; i < 4; i++) { 1472 for (i = 0; i < 4; i++) {
1463 a1 = ip[0] >> UNIT_QUANT_SHIFT; 1473 a1 = ip[0] >> UNIT_QUANT_SHIFT;
1464 c1 = ip[1] >> UNIT_QUANT_SHIFT; 1474 c1 = ip[1] >> UNIT_QUANT_SHIFT;
1465 d1 = ip[2] >> UNIT_QUANT_SHIFT; 1475 d1 = ip[2] >> UNIT_QUANT_SHIFT;
1466 b1 = ip[3] >> UNIT_QUANT_SHIFT; 1476 b1 = ip[3] >> UNIT_QUANT_SHIFT;
1467 a1 += c1; 1477 a1 += c1;
1468 d1 -= b1; 1478 d1 -= b1;
1469 e1 = (a1 - d1) >> 1; 1479 e1 = (a1 - d1) >> 1;
1470 b1 = e1 - b1; 1480 b1 = e1 - b1;
1471 c1 = e1 - c1; 1481 c1 = e1 - c1;
1472 a1 -= b1; 1482 a1 -= b1;
1473 d1 += c1; 1483 d1 += c1;
1474 op[0] = WRAPLOW(a1); 1484 op[0] = WRAPLOW(a1, bd);
1475 op[1] = WRAPLOW(b1); 1485 op[1] = WRAPLOW(b1, bd);
1476 op[2] = WRAPLOW(c1); 1486 op[2] = WRAPLOW(c1, bd);
1477 op[3] = WRAPLOW(d1); 1487 op[3] = WRAPLOW(d1, bd);
1478 ip += 4; 1488 ip += 4;
1479 op += 4; 1489 op += 4;
1480 } 1490 }
1481 1491
1482 ip = output; 1492 ip = output;
1483 for (i = 0; i < 4; i++) { 1493 for (i = 0; i < 4; i++) {
1484 a1 = ip[4 * 0]; 1494 a1 = ip[4 * 0];
1485 c1 = ip[4 * 1]; 1495 c1 = ip[4 * 1];
1486 d1 = ip[4 * 2]; 1496 d1 = ip[4 * 2];
1487 b1 = ip[4 * 3]; 1497 b1 = ip[4 * 3];
1488 a1 += c1; 1498 a1 += c1;
1489 d1 -= b1; 1499 d1 -= b1;
1490 e1 = (a1 - d1) >> 1; 1500 e1 = (a1 - d1) >> 1;
1491 b1 = e1 - b1; 1501 b1 = e1 - b1;
1492 c1 = e1 - c1; 1502 c1 = e1 - c1;
1493 a1 -= b1; 1503 a1 -= b1;
1494 d1 += c1; 1504 d1 += c1;
1495 dest[stride * 0] = clip_pixel_bd_high(dest[stride * 0], a1, bd); 1505 dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
1496 dest[stride * 1] = clip_pixel_bd_high(dest[stride * 1], b1, bd); 1506 dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], b1, bd);
1497 dest[stride * 2] = clip_pixel_bd_high(dest[stride * 2], c1, bd); 1507 dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], c1, bd);
1498 dest[stride * 3] = clip_pixel_bd_high(dest[stride * 3], d1, bd); 1508 dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], d1, bd);
1499 1509
1500 ip++; 1510 ip++;
1501 dest++; 1511 dest++;
1502 } 1512 }
1503 } 1513 }
1504 1514
1505 static void high_idct4(const tran_low_t *input, tran_low_t *output, int bd) { 1515 void vp9_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
1506 tran_low_t step[4]; 1516 int dest_stride, int bd) {
1507 tran_high_t temp1, temp2;
1508 (void) bd;
1509 // stage 1
1510 temp1 = (input[0] + input[2]) * cospi_16_64;
1511 temp2 = (input[0] - input[2]) * cospi_16_64;
1512 step[0] = WRAPLOW(dct_const_round_shift(temp1));
1513 step[1] = WRAPLOW(dct_const_round_shift(temp2));
1514 temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
1515 temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
1516 step[2] = WRAPLOW(dct_const_round_shift(temp1));
1517 step[3] = WRAPLOW(dct_const_round_shift(temp2));
1518
1519 // stage 2
1520 output[0] = WRAPLOW(step[0] + step[3]);
1521 output[1] = WRAPLOW(step[1] + step[2]);
1522 output[2] = WRAPLOW(step[1] - step[2]);
1523 output[3] = WRAPLOW(step[0] - step[3]);
1524 }
1525
1526 void vp9_high_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
1527 int dest_stride, int bd) {
1528 int i; 1517 int i;
1529 tran_high_t a1, e1; 1518 tran_high_t a1, e1;
1530 tran_low_t tmp[4]; 1519 tran_low_t tmp[4];
1531 const tran_low_t *ip = in; 1520 const tran_low_t *ip = in;
1532 tran_low_t *op = tmp; 1521 tran_low_t *op = tmp;
1533 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 1522 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1534 (void) bd; 1523 (void) bd;
1535 1524
1536 a1 = ip[0] >> UNIT_QUANT_SHIFT; 1525 a1 = ip[0] >> UNIT_QUANT_SHIFT;
1537 e1 = a1 >> 1; 1526 e1 = a1 >> 1;
1538 a1 -= e1; 1527 a1 -= e1;
1539 op[0] = WRAPLOW(a1); 1528 op[0] = WRAPLOW(a1, bd);
1540 op[1] = op[2] = op[3] = WRAPLOW(e1); 1529 op[1] = op[2] = op[3] = WRAPLOW(e1, bd);
1541 1530
1542 ip = tmp; 1531 ip = tmp;
1543 for (i = 0; i < 4; i++) { 1532 for (i = 0; i < 4; i++) {
1544 e1 = ip[0] >> 1; 1533 e1 = ip[0] >> 1;
1545 a1 = ip[0] - e1; 1534 a1 = ip[0] - e1;
1546 dest[dest_stride * 0] = clip_pixel_bd_high(dest[dest_stride * 0], a1, bd); 1535 dest[dest_stride * 0] = highbd_clip_pixel_add(
1547 dest[dest_stride * 1] = clip_pixel_bd_high(dest[dest_stride * 1], e1, bd); 1536 dest[dest_stride * 0], a1, bd);
1548 dest[dest_stride * 2] = clip_pixel_bd_high(dest[dest_stride * 2], e1, bd); 1537 dest[dest_stride * 1] = highbd_clip_pixel_add(
1549 dest[dest_stride * 3] = clip_pixel_bd_high(dest[dest_stride * 3], e1, bd); 1538 dest[dest_stride * 1], e1, bd);
1539 dest[dest_stride * 2] = highbd_clip_pixel_add(
1540 dest[dest_stride * 2], e1, bd);
1541 dest[dest_stride * 3] = highbd_clip_pixel_add(
1542 dest[dest_stride * 3], e1, bd);
1550 ip++; 1543 ip++;
1551 dest++; 1544 dest++;
1552 } 1545 }
1553 } 1546 }
1554 1547
1555 void vp9_high_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, 1548 static void highbd_idct4(const tran_low_t *input, tran_low_t *output, int bd) {
1556 int stride, int bd) { 1549 tran_low_t step[4];
1550 tran_high_t temp1, temp2;
1551 (void) bd;
1552 // stage 1
1553 temp1 = (input[0] + input[2]) * cospi_16_64;
1554 temp2 = (input[0] - input[2]) * cospi_16_64;
1555 step[0] = WRAPLOW(dct_const_round_shift(temp1), bd);
1556 step[1] = WRAPLOW(dct_const_round_shift(temp2), bd);
1557 temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
1558 temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
1559 step[2] = WRAPLOW(dct_const_round_shift(temp1), bd);
1560 step[3] = WRAPLOW(dct_const_round_shift(temp2), bd);
1561
1562 // stage 2
1563 output[0] = WRAPLOW(step[0] + step[3], bd);
1564 output[1] = WRAPLOW(step[1] + step[2], bd);
1565 output[2] = WRAPLOW(step[1] - step[2], bd);
1566 output[3] = WRAPLOW(step[0] - step[3], bd);
1567 }
1568
1569 void vp9_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
1570 int stride, int bd) {
1557 tran_low_t out[4 * 4]; 1571 tran_low_t out[4 * 4];
1558 tran_low_t *outptr = out; 1572 tran_low_t *outptr = out;
1559 int i, j; 1573 int i, j;
1560 tran_low_t temp_in[4], temp_out[4]; 1574 tran_low_t temp_in[4], temp_out[4];
1561 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 1575 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1562 1576
1563 // Rows 1577 // Rows
1564 for (i = 0; i < 4; ++i) { 1578 for (i = 0; i < 4; ++i) {
1565 high_idct4(input, outptr, bd); 1579 highbd_idct4(input, outptr, bd);
1566 input += 4; 1580 input += 4;
1567 outptr += 4; 1581 outptr += 4;
1568 } 1582 }
1569 1583
1570 // Columns 1584 // Columns
1571 for (i = 0; i < 4; ++i) { 1585 for (i = 0; i < 4; ++i) {
1572 for (j = 0; j < 4; ++j) 1586 for (j = 0; j < 4; ++j)
1573 temp_in[j] = out[j * 4 + i]; 1587 temp_in[j] = out[j * 4 + i];
1574 high_idct4(temp_in, temp_out, bd); 1588 highbd_idct4(temp_in, temp_out, bd);
1575 for (j = 0; j < 4; ++j) 1589 for (j = 0; j < 4; ++j) {
1576 dest[j * stride + i] = clip_pixel_bd_high( 1590 dest[j * stride + i] = highbd_clip_pixel_add(
1577 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); 1591 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
1592 }
1578 } 1593 }
1579 } 1594 }
1580 1595
1581 void vp9_high_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8, 1596 void vp9_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
1582 int dest_stride, int bd) { 1597 int dest_stride, int bd) {
1583 int i; 1598 int i;
1584 tran_high_t a1; 1599 tran_high_t a1;
1585 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); 1600 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
1586 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 1601 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1587 1602
1588 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); 1603 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
1589 a1 = ROUND_POWER_OF_TWO(out, 4); 1604 a1 = ROUND_POWER_OF_TWO(out, 4);
1590 1605
1591 for (i = 0; i < 4; i++) { 1606 for (i = 0; i < 4; i++) {
1592 dest[0] = clip_pixel_bd_high(dest[0], a1, bd); 1607 dest[0] = highbd_clip_pixel_add(dest[0], a1, bd);
1593 dest[1] = clip_pixel_bd_high(dest[1], a1, bd); 1608 dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
1594 dest[2] = clip_pixel_bd_high(dest[2], a1, bd); 1609 dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
1595 dest[3] = clip_pixel_bd_high(dest[3], a1, bd); 1610 dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
1596 dest += dest_stride; 1611 dest += dest_stride;
1597 } 1612 }
1598 } 1613 }
1599 1614
1600 static void high_idct8(const tran_low_t *input, tran_low_t *output, int bd) { 1615 static void highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd) {
1601 tran_low_t step1[8], step2[8]; 1616 tran_low_t step1[8], step2[8];
1602 tran_high_t temp1, temp2; 1617 tran_high_t temp1, temp2;
1603 // stage 1 1618 // stage 1
1604 step1[0] = input[0]; 1619 step1[0] = input[0];
1605 step1[2] = input[4]; 1620 step1[2] = input[4];
1606 step1[1] = input[2]; 1621 step1[1] = input[2];
1607 step1[3] = input[6]; 1622 step1[3] = input[6];
1608 temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; 1623 temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
1609 temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; 1624 temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
1610 step1[4] = WRAPLOW(dct_const_round_shift(temp1)); 1625 step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd);
1611 step1[7] = WRAPLOW(dct_const_round_shift(temp2)); 1626 step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd);
1612 temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; 1627 temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
1613 temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; 1628 temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
1614 step1[5] = WRAPLOW(dct_const_round_shift(temp1)); 1629 step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
1615 step1[6] = WRAPLOW(dct_const_round_shift(temp2)); 1630 step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
1616 1631
1617 // stage 2 & stage 3 - even half 1632 // stage 2 & stage 3 - even half
1618 high_idct4(step1, step1, bd); 1633 highbd_idct4(step1, step1, bd);
1619 1634
1620 // stage 2 - odd half 1635 // stage 2 - odd half
1621 step2[4] = WRAPLOW(step1[4] + step1[5]); 1636 step2[4] = WRAPLOW(step1[4] + step1[5], bd);
1622 step2[5] = WRAPLOW(step1[4] - step1[5]); 1637 step2[5] = WRAPLOW(step1[4] - step1[5], bd);
1623 step2[6] = WRAPLOW(-step1[6] + step1[7]); 1638 step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
1624 step2[7] = WRAPLOW(step1[6] + step1[7]); 1639 step2[7] = WRAPLOW(step1[6] + step1[7], bd);
1625 1640
1626 // stage 3 - odd half 1641 // stage 3 - odd half
1627 step1[4] = step2[4]; 1642 step1[4] = step2[4];
1628 temp1 = (step2[6] - step2[5]) * cospi_16_64; 1643 temp1 = (step2[6] - step2[5]) * cospi_16_64;
1629 temp2 = (step2[5] + step2[6]) * cospi_16_64; 1644 temp2 = (step2[5] + step2[6]) * cospi_16_64;
1630 step1[5] = WRAPLOW(dct_const_round_shift(temp1)); 1645 step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
1631 step1[6] = WRAPLOW(dct_const_round_shift(temp2)); 1646 step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
1632 step1[7] = step2[7]; 1647 step1[7] = step2[7];
1633 1648
1634 // stage 4 1649 // stage 4
1635 output[0] = WRAPLOW(step1[0] + step1[7]); 1650 output[0] = WRAPLOW(step1[0] + step1[7], bd);
1636 output[1] = WRAPLOW(step1[1] + step1[6]); 1651 output[1] = WRAPLOW(step1[1] + step1[6], bd);
1637 output[2] = WRAPLOW(step1[2] + step1[5]); 1652 output[2] = WRAPLOW(step1[2] + step1[5], bd);
1638 output[3] = WRAPLOW(step1[3] + step1[4]); 1653 output[3] = WRAPLOW(step1[3] + step1[4], bd);
1639 output[4] = WRAPLOW(step1[3] - step1[4]); 1654 output[4] = WRAPLOW(step1[3] - step1[4], bd);
1640 output[5] = WRAPLOW(step1[2] - step1[5]); 1655 output[5] = WRAPLOW(step1[2] - step1[5], bd);
1641 output[6] = WRAPLOW(step1[1] - step1[6]); 1656 output[6] = WRAPLOW(step1[1] - step1[6], bd);
1642 output[7] = WRAPLOW(step1[0] - step1[7]); 1657 output[7] = WRAPLOW(step1[0] - step1[7], bd);
1643 } 1658 }
1644 1659
1645 void vp9_high_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, 1660 void vp9_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
1646 int stride, int bd) { 1661 int stride, int bd) {
1647 tran_low_t out[8 * 8]; 1662 tran_low_t out[8 * 8];
1648 tran_low_t *outptr = out; 1663 tran_low_t *outptr = out;
1649 int i, j; 1664 int i, j;
1650 tran_low_t temp_in[8], temp_out[8]; 1665 tran_low_t temp_in[8], temp_out[8];
1651 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 1666 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1652 1667
1653 // First transform rows. 1668 // First transform rows.
1654 for (i = 0; i < 8; ++i) { 1669 for (i = 0; i < 8; ++i) {
1655 high_idct8(input, outptr, bd); 1670 highbd_idct8(input, outptr, bd);
1656 input += 8; 1671 input += 8;
1657 outptr += 8; 1672 outptr += 8;
1658 } 1673 }
1659 1674
1660 // Then transform columns. 1675 // Then transform columns.
1661 for (i = 0; i < 8; ++i) { 1676 for (i = 0; i < 8; ++i) {
1662 for (j = 0; j < 8; ++j) 1677 for (j = 0; j < 8; ++j)
1663 temp_in[j] = out[j * 8 + i]; 1678 temp_in[j] = out[j * 8 + i];
1664 high_idct8(temp_in, temp_out, bd); 1679 highbd_idct8(temp_in, temp_out, bd);
1665 for (j = 0; j < 8; ++j) 1680 for (j = 0; j < 8; ++j) {
1666 dest[j * stride + i] = clip_pixel_bd_high(dest[j * stride + i], 1681 dest[j * stride + i] = highbd_clip_pixel_add(
1667 ROUND_POWER_OF_TWO(temp_out[j], 5), 1682 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
1668 bd); 1683 }
1669 } 1684 }
1670 } 1685 }
1671 1686
1672 void vp9_high_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8, 1687 void vp9_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
1673 int stride, int bd) { 1688 int stride, int bd) {
1674 int i, j; 1689 int i, j;
1675 tran_high_t a1; 1690 tran_high_t a1;
1676 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); 1691 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
1677 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 1692 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1678 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); 1693 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
1679 a1 = ROUND_POWER_OF_TWO(out, 5); 1694 a1 = ROUND_POWER_OF_TWO(out, 5);
1680 for (j = 0; j < 8; ++j) { 1695 for (j = 0; j < 8; ++j) {
1681 for (i = 0; i < 8; ++i) 1696 for (i = 0; i < 8; ++i)
1682 dest[i] = clip_pixel_bd_high(dest[i], a1, bd); 1697 dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
1683 dest += stride; 1698 dest += stride;
1684 } 1699 }
1685 } 1700 }
1686 1701
1687 static void high_iadst4(const tran_low_t *input, tran_low_t *output, int bd) { 1702 static void highbd_iadst4(const tran_low_t *input, tran_low_t *output, int bd) {
1688 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; 1703 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
1689 1704
1690 tran_high_t x0 = input[0]; 1705 tran_high_t x0 = input[0];
1691 tran_high_t x1 = input[1]; 1706 tran_high_t x1 = input[1];
1692 tran_high_t x2 = input[2]; 1707 tran_high_t x2 = input[2];
1693 tran_high_t x3 = input[3]; 1708 tran_high_t x3 = input[3];
1694 (void) bd; 1709 (void) bd;
1695 1710
1696 if (!(x0 | x1 | x2 | x3)) { 1711 if (!(x0 | x1 | x2 | x3)) {
1697 vpx_memset(output, 0, 4 * sizeof(*output)); 1712 vpx_memset(output, 0, 4 * sizeof(*output));
(...skipping 16 matching lines...) Expand all
1714 1729
1715 s0 = x0 + x3; 1730 s0 = x0 + x3;
1716 s1 = x1 + x3; 1731 s1 = x1 + x3;
1717 s2 = x2; 1732 s2 = x2;
1718 s3 = x0 + x1 - x3; 1733 s3 = x0 + x1 - x3;
1719 1734
1720 // 1-D transform scaling factor is sqrt(2). 1735 // 1-D transform scaling factor is sqrt(2).
1721 // The overall dynamic range is 14b (input) + 14b (multiplication scaling) 1736 // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
1722 // + 1b (addition) = 29b. 1737 // + 1b (addition) = 29b.
1723 // Hence the output bit depth is 15b. 1738 // Hence the output bit depth is 15b.
1724 output[0] = WRAPLOW(dct_const_round_shift(s0)); 1739 output[0] = WRAPLOW(dct_const_round_shift(s0), bd);
1725 output[1] = WRAPLOW(dct_const_round_shift(s1)); 1740 output[1] = WRAPLOW(dct_const_round_shift(s1), bd);
1726 output[2] = WRAPLOW(dct_const_round_shift(s2)); 1741 output[2] = WRAPLOW(dct_const_round_shift(s2), bd);
1727 output[3] = WRAPLOW(dct_const_round_shift(s3)); 1742 output[3] = WRAPLOW(dct_const_round_shift(s3), bd);
1728 } 1743 }
1729 1744
1730 void vp9_high_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, 1745 void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
1731 int stride, int tx_type, int bd) { 1746 int stride, int tx_type, int bd) {
1732 const high_transform_2d IHT_4[] = { 1747 const highbd_transform_2d IHT_4[] = {
1733 { high_idct4, high_idct4 }, // DCT_DCT = 0 1748 { highbd_idct4, highbd_idct4 }, // DCT_DCT = 0
1734 { high_iadst4, high_idct4 }, // ADST_DCT = 1 1749 { highbd_iadst4, highbd_idct4 }, // ADST_DCT = 1
1735 { high_idct4, high_iadst4 }, // DCT_ADST = 2 1750 { highbd_idct4, highbd_iadst4 }, // DCT_ADST = 2
1736 { high_iadst4, high_iadst4 } // ADST_ADST = 3 1751 { highbd_iadst4, highbd_iadst4 } // ADST_ADST = 3
1737 }; 1752 };
1738 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 1753 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1739 1754
1740 int i, j; 1755 int i, j;
1741 tran_low_t out[4 * 4]; 1756 tran_low_t out[4 * 4];
1742 tran_low_t *outptr = out; 1757 tran_low_t *outptr = out;
1743 tran_low_t temp_in[4], temp_out[4]; 1758 tran_low_t temp_in[4], temp_out[4];
1744 1759
1745 // Inverse transform row vectors. 1760 // Inverse transform row vectors.
1746 for (i = 0; i < 4; ++i) { 1761 for (i = 0; i < 4; ++i) {
1747 IHT_4[tx_type].rows(input, outptr, bd); 1762 IHT_4[tx_type].rows(input, outptr, bd);
1748 input += 4; 1763 input += 4;
1749 outptr += 4; 1764 outptr += 4;
1750 } 1765 }
1751 1766
1752 // Inverse transform column vectors. 1767 // Inverse transform column vectors.
1753 for (i = 0; i < 4; ++i) { 1768 for (i = 0; i < 4; ++i) {
1754 for (j = 0; j < 4; ++j) 1769 for (j = 0; j < 4; ++j)
1755 temp_in[j] = out[j * 4 + i]; 1770 temp_in[j] = out[j * 4 + i];
1756 IHT_4[tx_type].cols(temp_in, temp_out, bd); 1771 IHT_4[tx_type].cols(temp_in, temp_out, bd);
1757 for (j = 0; j < 4; ++j) 1772 for (j = 0; j < 4; ++j) {
1758 dest[j * stride + i] = clip_pixel_bd_high( 1773 dest[j * stride + i] = highbd_clip_pixel_add(
1759 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); 1774 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
1775 }
1760 } 1776 }
1761 } 1777 }
1762 1778
1763 static void high_iadst8(const tran_low_t *input, tran_low_t *output, int bd) { 1779 static void highbd_iadst8(const tran_low_t *input, tran_low_t *output, int bd) {
1764 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; 1780 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
1765 1781
1766 tran_high_t x0 = input[7]; 1782 tran_high_t x0 = input[7];
1767 tran_high_t x1 = input[0]; 1783 tran_high_t x1 = input[0];
1768 tran_high_t x2 = input[5]; 1784 tran_high_t x2 = input[5];
1769 tran_high_t x3 = input[2]; 1785 tran_high_t x3 = input[2];
1770 tran_high_t x4 = input[3]; 1786 tran_high_t x4 = input[3];
1771 tran_high_t x5 = input[4]; 1787 tran_high_t x5 = input[4];
1772 tran_high_t x6 = input[1]; 1788 tran_high_t x6 = input[1];
1773 tran_high_t x7 = input[6]; 1789 tran_high_t x7 = input[6];
1774 (void) bd; 1790 (void) bd;
1775 1791
1776 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { 1792 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
1777 vpx_memset(output, 0, 8 * sizeof(*output)); 1793 vpx_memset(output, 0, 8 * sizeof(*output));
1778 return; 1794 return;
1779 } 1795 }
1780 1796
1781 // stage 1 1797 // stage 1
1782 s0 = cospi_2_64 * x0 + cospi_30_64 * x1; 1798 s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
1783 s1 = cospi_30_64 * x0 - cospi_2_64 * x1; 1799 s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
1784 s2 = cospi_10_64 * x2 + cospi_22_64 * x3; 1800 s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
1785 s3 = cospi_22_64 * x2 - cospi_10_64 * x3; 1801 s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
1786 s4 = cospi_18_64 * x4 + cospi_14_64 * x5; 1802 s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
1787 s5 = cospi_14_64 * x4 - cospi_18_64 * x5; 1803 s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
1788 s6 = cospi_26_64 * x6 + cospi_6_64 * x7; 1804 s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
1789 s7 = cospi_6_64 * x6 - cospi_26_64 * x7; 1805 s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
1790 1806
1791 x0 = WRAPLOW(dct_const_round_shift(s0 + s4)); 1807 x0 = WRAPLOW(dct_const_round_shift(s0 + s4), bd);
1792 x1 = WRAPLOW(dct_const_round_shift(s1 + s5)); 1808 x1 = WRAPLOW(dct_const_round_shift(s1 + s5), bd);
1793 x2 = WRAPLOW(dct_const_round_shift(s2 + s6)); 1809 x2 = WRAPLOW(dct_const_round_shift(s2 + s6), bd);
1794 x3 = WRAPLOW(dct_const_round_shift(s3 + s7)); 1810 x3 = WRAPLOW(dct_const_round_shift(s3 + s7), bd);
1795 x4 = WRAPLOW(dct_const_round_shift(s0 - s4)); 1811 x4 = WRAPLOW(dct_const_round_shift(s0 - s4), bd);
1796 x5 = WRAPLOW(dct_const_round_shift(s1 - s5)); 1812 x5 = WRAPLOW(dct_const_round_shift(s1 - s5), bd);
1797 x6 = WRAPLOW(dct_const_round_shift(s2 - s6)); 1813 x6 = WRAPLOW(dct_const_round_shift(s2 - s6), bd);
1798 x7 = WRAPLOW(dct_const_round_shift(s3 - s7)); 1814 x7 = WRAPLOW(dct_const_round_shift(s3 - s7), bd);
1799 1815
1800 // stage 2 1816 // stage 2
1801 s0 = x0; 1817 s0 = x0;
1802 s1 = x1; 1818 s1 = x1;
1803 s2 = x2; 1819 s2 = x2;
1804 s3 = x3; 1820 s3 = x3;
1805 s4 = cospi_8_64 * x4 + cospi_24_64 * x5; 1821 s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
1806 s5 = cospi_24_64 * x4 - cospi_8_64 * x5; 1822 s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
1807 s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; 1823 s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
1808 s7 = cospi_8_64 * x6 + cospi_24_64 * x7; 1824 s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
1809 1825
1810 x0 = s0 + s2; 1826 x0 = WRAPLOW(s0 + s2, bd);
1811 x1 = s1 + s3; 1827 x1 = WRAPLOW(s1 + s3, bd);
1812 x2 = s0 - s2; 1828 x2 = WRAPLOW(s0 - s2, bd);
1813 x3 = s1 - s3; 1829 x3 = WRAPLOW(s1 - s3, bd);
1814 x4 = WRAPLOW(dct_const_round_shift(s4 + s6)); 1830 x4 = WRAPLOW(dct_const_round_shift(s4 + s6), bd);
1815 x5 = WRAPLOW(dct_const_round_shift(s5 + s7)); 1831 x5 = WRAPLOW(dct_const_round_shift(s5 + s7), bd);
1816 x6 = WRAPLOW(dct_const_round_shift(s4 - s6)); 1832 x6 = WRAPLOW(dct_const_round_shift(s4 - s6), bd);
1817 x7 = WRAPLOW(dct_const_round_shift(s5 - s7)); 1833 x7 = WRAPLOW(dct_const_round_shift(s5 - s7), bd);
1818 1834
1819 // stage 3 1835 // stage 3
1820 s2 = cospi_16_64 * (x2 + x3); 1836 s2 = cospi_16_64 * (x2 + x3);
1821 s3 = cospi_16_64 * (x2 - x3); 1837 s3 = cospi_16_64 * (x2 - x3);
1822 s6 = cospi_16_64 * (x6 + x7); 1838 s6 = cospi_16_64 * (x6 + x7);
1823 s7 = cospi_16_64 * (x6 - x7); 1839 s7 = cospi_16_64 * (x6 - x7);
1824 1840
1825 x2 = WRAPLOW(dct_const_round_shift(s2)); 1841 x2 = WRAPLOW(dct_const_round_shift(s2), bd);
1826 x3 = WRAPLOW(dct_const_round_shift(s3)); 1842 x3 = WRAPLOW(dct_const_round_shift(s3), bd);
1827 x6 = WRAPLOW(dct_const_round_shift(s6)); 1843 x6 = WRAPLOW(dct_const_round_shift(s6), bd);
1828 x7 = WRAPLOW(dct_const_round_shift(s7)); 1844 x7 = WRAPLOW(dct_const_round_shift(s7), bd);
1829 1845
1830 output[0] = WRAPLOW(x0); 1846 output[0] = WRAPLOW(x0, bd);
1831 output[1] = WRAPLOW(-x4); 1847 output[1] = WRAPLOW(-x4, bd);
1832 output[2] = WRAPLOW(x6); 1848 output[2] = WRAPLOW(x6, bd);
1833 output[3] = WRAPLOW(-x2); 1849 output[3] = WRAPLOW(-x2, bd);
1834 output[4] = WRAPLOW(x3); 1850 output[4] = WRAPLOW(x3, bd);
1835 output[5] = WRAPLOW(-x7); 1851 output[5] = WRAPLOW(-x7, bd);
1836 output[6] = WRAPLOW(x5); 1852 output[6] = WRAPLOW(x5, bd);
1837 output[7] = WRAPLOW(-x1); 1853 output[7] = WRAPLOW(-x1, bd);
1838 } 1854 }
1839 1855
1840 static const high_transform_2d HIGH_IHT_8[] = { 1856 static const highbd_transform_2d HIGH_IHT_8[] = {
1841 { high_idct8, high_idct8 }, // DCT_DCT = 0 1857 { highbd_idct8, highbd_idct8 }, // DCT_DCT = 0
1842 { high_iadst8, high_idct8 }, // ADST_DCT = 1 1858 { highbd_iadst8, highbd_idct8 }, // ADST_DCT = 1
1843 { high_idct8, high_iadst8 }, // DCT_ADST = 2 1859 { highbd_idct8, highbd_iadst8 }, // DCT_ADST = 2
1844 { high_iadst8, high_iadst8 } // ADST_ADST = 3 1860 { highbd_iadst8, highbd_iadst8 } // ADST_ADST = 3
1845 }; 1861 };
1846 1862
1847 void vp9_high_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, 1863 void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
1848 int stride, int tx_type, int bd) { 1864 int stride, int tx_type, int bd) {
1849 int i, j; 1865 int i, j;
1850 tran_low_t out[8 * 8]; 1866 tran_low_t out[8 * 8];
1851 tran_low_t *outptr = out; 1867 tran_low_t *outptr = out;
1852 tran_low_t temp_in[8], temp_out[8]; 1868 tran_low_t temp_in[8], temp_out[8];
1853 const high_transform_2d ht = HIGH_IHT_8[tx_type]; 1869 const highbd_transform_2d ht = HIGH_IHT_8[tx_type];
1854 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 1870 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1855 1871
1856 // Inverse transform row vectors. 1872 // Inverse transform row vectors.
1857 for (i = 0; i < 8; ++i) { 1873 for (i = 0; i < 8; ++i) {
1858 ht.rows(input, outptr, bd); 1874 ht.rows(input, outptr, bd);
1859 input += 8; 1875 input += 8;
1860 outptr += 8; 1876 outptr += 8;
1861 } 1877 }
1862 1878
1863 // Inverse transform column vectors. 1879 // Inverse transform column vectors.
1864 for (i = 0; i < 8; ++i) { 1880 for (i = 0; i < 8; ++i) {
1865 for (j = 0; j < 8; ++j) 1881 for (j = 0; j < 8; ++j)
1866 temp_in[j] = out[j * 8 + i]; 1882 temp_in[j] = out[j * 8 + i];
1867 ht.cols(temp_in, temp_out, bd); 1883 ht.cols(temp_in, temp_out, bd);
1868 for (j = 0; j < 8; ++j) 1884 for (j = 0; j < 8; ++j) {
1869 dest[j * stride + i] = clip_pixel_bd_high( 1885 dest[j * stride + i] = highbd_clip_pixel_add(
1870 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); 1886 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
1887 }
1871 } 1888 }
1872 } 1889 }
1873 1890
1874 void vp9_high_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8, 1891 void vp9_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
1875 int stride, int bd) { 1892 int stride, int bd) {
1876 tran_low_t out[8 * 8] = { 0 }; 1893 tran_low_t out[8 * 8] = { 0 };
1877 tran_low_t *outptr = out; 1894 tran_low_t *outptr = out;
1878 int i, j; 1895 int i, j;
1879 tran_low_t temp_in[8], temp_out[8]; 1896 tran_low_t temp_in[8], temp_out[8];
1880 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 1897 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1881 1898
1882 // First transform rows. 1899 // First transform rows.
1883 // Only first 4 row has non-zero coefs. 1900 // Only first 4 row has non-zero coefs.
1884 for (i = 0; i < 4; ++i) { 1901 for (i = 0; i < 4; ++i) {
1885 high_idct8(input, outptr, bd); 1902 highbd_idct8(input, outptr, bd);
1886 input += 8; 1903 input += 8;
1887 outptr += 8; 1904 outptr += 8;
1888 } 1905 }
1889 // Then transform columns. 1906 // Then transform columns.
1890 for (i = 0; i < 8; ++i) { 1907 for (i = 0; i < 8; ++i) {
1891 for (j = 0; j < 8; ++j) 1908 for (j = 0; j < 8; ++j)
1892 temp_in[j] = out[j * 8 + i]; 1909 temp_in[j] = out[j * 8 + i];
1893 high_idct8(temp_in, temp_out, bd); 1910 highbd_idct8(temp_in, temp_out, bd);
1894 for (j = 0; j < 8; ++j) 1911 for (j = 0; j < 8; ++j) {
1895 dest[j * stride + i] = clip_pixel_bd_high( 1912 dest[j * stride + i] = highbd_clip_pixel_add(
1896 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); 1913 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
1914 }
1897 } 1915 }
1898 } 1916 }
1899 1917
1900 static void high_idct16(const tran_low_t *input, tran_low_t *output, int bd) { 1918 static void highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
1901 tran_low_t step1[16], step2[16]; 1919 tran_low_t step1[16], step2[16];
1902 tran_high_t temp1, temp2; 1920 tran_high_t temp1, temp2;
1903 (void) bd; 1921 (void) bd;
1904 1922
1905 // stage 1 1923 // stage 1
1906 step1[0] = input[0/2]; 1924 step1[0] = input[0/2];
1907 step1[1] = input[16/2]; 1925 step1[1] = input[16/2];
1908 step1[2] = input[8/2]; 1926 step1[2] = input[8/2];
1909 step1[3] = input[24/2]; 1927 step1[3] = input[24/2];
1910 step1[4] = input[4/2]; 1928 step1[4] = input[4/2];
(...skipping 14 matching lines...) Expand all
1925 step2[1] = step1[1]; 1943 step2[1] = step1[1];
1926 step2[2] = step1[2]; 1944 step2[2] = step1[2];
1927 step2[3] = step1[3]; 1945 step2[3] = step1[3];
1928 step2[4] = step1[4]; 1946 step2[4] = step1[4];
1929 step2[5] = step1[5]; 1947 step2[5] = step1[5];
1930 step2[6] = step1[6]; 1948 step2[6] = step1[6];
1931 step2[7] = step1[7]; 1949 step2[7] = step1[7];
1932 1950
1933 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; 1951 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
1934 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; 1952 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
1935 step2[8] = WRAPLOW(dct_const_round_shift(temp1)); 1953 step2[8] = WRAPLOW(dct_const_round_shift(temp1), bd);
1936 step2[15] = WRAPLOW(dct_const_round_shift(temp2)); 1954 step2[15] = WRAPLOW(dct_const_round_shift(temp2), bd);
1937 1955
1938 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; 1956 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
1939 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; 1957 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
1940 step2[9] = WRAPLOW(dct_const_round_shift(temp1)); 1958 step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
1941 step2[14] = WRAPLOW(dct_const_round_shift(temp2)); 1959 step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
1942 1960
1943 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; 1961 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
1944 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; 1962 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
1945 step2[10] = WRAPLOW(dct_const_round_shift(temp1)); 1963 step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
1946 step2[13] = WRAPLOW(dct_const_round_shift(temp2)); 1964 step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
1947 1965
1948 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; 1966 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
1949 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; 1967 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
1950 step2[11] = WRAPLOW(dct_const_round_shift(temp1)); 1968 step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
1951 step2[12] = WRAPLOW(dct_const_round_shift(temp2)); 1969 step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
1952 1970
1953 // stage 3 1971 // stage 3
1954 step1[0] = step2[0]; 1972 step1[0] = step2[0];
1955 step1[1] = step2[1]; 1973 step1[1] = step2[1];
1956 step1[2] = step2[2]; 1974 step1[2] = step2[2];
1957 step1[3] = step2[3]; 1975 step1[3] = step2[3];
1958 1976
1959 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; 1977 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
1960 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; 1978 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
1961 step1[4] = WRAPLOW(dct_const_round_shift(temp1)); 1979 step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd);
1962 step1[7] = WRAPLOW(dct_const_round_shift(temp2)); 1980 step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd);
1963 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; 1981 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
1964 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; 1982 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
1965 step1[5] = WRAPLOW(dct_const_round_shift(temp1)); 1983 step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
1966 step1[6] = WRAPLOW(dct_const_round_shift(temp2)); 1984 step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
1967 1985
1968 step1[8] = WRAPLOW(step2[8] + step2[9]); 1986 step1[8] = WRAPLOW(step2[8] + step2[9], bd);
1969 step1[9] = WRAPLOW(step2[8] - step2[9]); 1987 step1[9] = WRAPLOW(step2[8] - step2[9], bd);
1970 step1[10] = WRAPLOW(-step2[10] + step2[11]); 1988 step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
1971 step1[11] = WRAPLOW(step2[10] + step2[11]); 1989 step1[11] = WRAPLOW(step2[10] + step2[11], bd);
1972 step1[12] = WRAPLOW(step2[12] + step2[13]); 1990 step1[12] = WRAPLOW(step2[12] + step2[13], bd);
1973 step1[13] = WRAPLOW(step2[12] - step2[13]); 1991 step1[13] = WRAPLOW(step2[12] - step2[13], bd);
1974 step1[14] = WRAPLOW(-step2[14] + step2[15]); 1992 step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
1975 step1[15] = WRAPLOW(step2[14] + step2[15]); 1993 step1[15] = WRAPLOW(step2[14] + step2[15], bd);
1976 1994
1977 // stage 4 1995 // stage 4
1978 temp1 = (step1[0] + step1[1]) * cospi_16_64; 1996 temp1 = (step1[0] + step1[1]) * cospi_16_64;
1979 temp2 = (step1[0] - step1[1]) * cospi_16_64; 1997 temp2 = (step1[0] - step1[1]) * cospi_16_64;
1980 step2[0] = WRAPLOW(dct_const_round_shift(temp1)); 1998 step2[0] = WRAPLOW(dct_const_round_shift(temp1), bd);
1981 step2[1] = WRAPLOW(dct_const_round_shift(temp2)); 1999 step2[1] = WRAPLOW(dct_const_round_shift(temp2), bd);
1982 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; 2000 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
1983 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; 2001 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
1984 step2[2] = WRAPLOW(dct_const_round_shift(temp1)); 2002 step2[2] = WRAPLOW(dct_const_round_shift(temp1), bd);
1985 step2[3] = WRAPLOW(dct_const_round_shift(temp2)); 2003 step2[3] = WRAPLOW(dct_const_round_shift(temp2), bd);
1986 step2[4] = WRAPLOW(step1[4] + step1[5]); 2004 step2[4] = WRAPLOW(step1[4] + step1[5], bd);
1987 step2[5] = WRAPLOW(step1[4] - step1[5]); 2005 step2[5] = WRAPLOW(step1[4] - step1[5], bd);
1988 step2[6] = WRAPLOW(-step1[6] + step1[7]); 2006 step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
1989 step2[7] = WRAPLOW(step1[6] + step1[7]); 2007 step2[7] = WRAPLOW(step1[6] + step1[7], bd);
1990 2008
1991 step2[8] = step1[8]; 2009 step2[8] = step1[8];
1992 step2[15] = step1[15]; 2010 step2[15] = step1[15];
1993 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; 2011 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
1994 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; 2012 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
1995 step2[9] = WRAPLOW(dct_const_round_shift(temp1)); 2013 step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
1996 step2[14] = WRAPLOW(dct_const_round_shift(temp2)); 2014 step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
1997 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; 2015 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
1998 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; 2016 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
1999 step2[10] = WRAPLOW(dct_const_round_shift(temp1)); 2017 step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
2000 step2[13] = WRAPLOW(dct_const_round_shift(temp2)); 2018 step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
2001 step2[11] = step1[11]; 2019 step2[11] = step1[11];
2002 step2[12] = step1[12]; 2020 step2[12] = step1[12];
2003 2021
2004 // stage 5 2022 // stage 5
2005 step1[0] = WRAPLOW(step2[0] + step2[3]); 2023 step1[0] = WRAPLOW(step2[0] + step2[3], bd);
2006 step1[1] = WRAPLOW(step2[1] + step2[2]); 2024 step1[1] = WRAPLOW(step2[1] + step2[2], bd);
2007 step1[2] = WRAPLOW(step2[1] - step2[2]); 2025 step1[2] = WRAPLOW(step2[1] - step2[2], bd);
2008 step1[3] = WRAPLOW(step2[0] - step2[3]); 2026 step1[3] = WRAPLOW(step2[0] - step2[3], bd);
2009 step1[4] = step2[4]; 2027 step1[4] = step2[4];
2010 temp1 = (step2[6] - step2[5]) * cospi_16_64; 2028 temp1 = (step2[6] - step2[5]) * cospi_16_64;
2011 temp2 = (step2[5] + step2[6]) * cospi_16_64; 2029 temp2 = (step2[5] + step2[6]) * cospi_16_64;
2012 step1[5] = WRAPLOW(dct_const_round_shift(temp1)); 2030 step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
2013 step1[6] = WRAPLOW(dct_const_round_shift(temp2)); 2031 step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
2014 step1[7] = step2[7]; 2032 step1[7] = step2[7];
2015 2033
2016 step1[8] = WRAPLOW(step2[8] + step2[11]); 2034 step1[8] = WRAPLOW(step2[8] + step2[11], bd);
2017 step1[9] = WRAPLOW(step2[9] + step2[10]); 2035 step1[9] = WRAPLOW(step2[9] + step2[10], bd);
2018 step1[10] = WRAPLOW(step2[9] - step2[10]); 2036 step1[10] = WRAPLOW(step2[9] - step2[10], bd);
2019 step1[11] = WRAPLOW(step2[8] - step2[11]); 2037 step1[11] = WRAPLOW(step2[8] - step2[11], bd);
2020 step1[12] = WRAPLOW(-step2[12] + step2[15]); 2038 step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
2021 step1[13] = WRAPLOW(-step2[13] + step2[14]); 2039 step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
2022 step1[14] = WRAPLOW(step2[13] + step2[14]); 2040 step1[14] = WRAPLOW(step2[13] + step2[14], bd);
2023 step1[15] = WRAPLOW(step2[12] + step2[15]); 2041 step1[15] = WRAPLOW(step2[12] + step2[15], bd);
2024 2042
2025 // stage 6 2043 // stage 6
2026 step2[0] = WRAPLOW(step1[0] + step1[7]); 2044 step2[0] = WRAPLOW(step1[0] + step1[7], bd);
2027 step2[1] = WRAPLOW(step1[1] + step1[6]); 2045 step2[1] = WRAPLOW(step1[1] + step1[6], bd);
2028 step2[2] = WRAPLOW(step1[2] + step1[5]); 2046 step2[2] = WRAPLOW(step1[2] + step1[5], bd);
2029 step2[3] = WRAPLOW(step1[3] + step1[4]); 2047 step2[3] = WRAPLOW(step1[3] + step1[4], bd);
2030 step2[4] = WRAPLOW(step1[3] - step1[4]); 2048 step2[4] = WRAPLOW(step1[3] - step1[4], bd);
2031 step2[5] = WRAPLOW(step1[2] - step1[5]); 2049 step2[5] = WRAPLOW(step1[2] - step1[5], bd);
2032 step2[6] = WRAPLOW(step1[1] - step1[6]); 2050 step2[6] = WRAPLOW(step1[1] - step1[6], bd);
2033 step2[7] = WRAPLOW(step1[0] - step1[7]); 2051 step2[7] = WRAPLOW(step1[0] - step1[7], bd);
2034 step2[8] = step1[8]; 2052 step2[8] = step1[8];
2035 step2[9] = step1[9]; 2053 step2[9] = step1[9];
2036 temp1 = (-step1[10] + step1[13]) * cospi_16_64; 2054 temp1 = (-step1[10] + step1[13]) * cospi_16_64;
2037 temp2 = (step1[10] + step1[13]) * cospi_16_64; 2055 temp2 = (step1[10] + step1[13]) * cospi_16_64;
2038 step2[10] = WRAPLOW(dct_const_round_shift(temp1)); 2056 step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
2039 step2[13] = WRAPLOW(dct_const_round_shift(temp2)); 2057 step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
2040 temp1 = (-step1[11] + step1[12]) * cospi_16_64; 2058 temp1 = (-step1[11] + step1[12]) * cospi_16_64;
2041 temp2 = (step1[11] + step1[12]) * cospi_16_64; 2059 temp2 = (step1[11] + step1[12]) * cospi_16_64;
2042 step2[11] = WRAPLOW(dct_const_round_shift(temp1)); 2060 step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
2043 step2[12] = WRAPLOW(dct_const_round_shift(temp2)); 2061 step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
2044 step2[14] = step1[14]; 2062 step2[14] = step1[14];
2045 step2[15] = step1[15]; 2063 step2[15] = step1[15];
2046 2064
2047 // stage 7 2065 // stage 7
2048 output[0] = WRAPLOW(step2[0] + step2[15]); 2066 output[0] = WRAPLOW(step2[0] + step2[15], bd);
2049 output[1] = WRAPLOW(step2[1] + step2[14]); 2067 output[1] = WRAPLOW(step2[1] + step2[14], bd);
2050 output[2] = WRAPLOW(step2[2] + step2[13]); 2068 output[2] = WRAPLOW(step2[2] + step2[13], bd);
2051 output[3] = WRAPLOW(step2[3] + step2[12]); 2069 output[3] = WRAPLOW(step2[3] + step2[12], bd);
2052 output[4] = WRAPLOW(step2[4] + step2[11]); 2070 output[4] = WRAPLOW(step2[4] + step2[11], bd);
2053 output[5] = WRAPLOW(step2[5] + step2[10]); 2071 output[5] = WRAPLOW(step2[5] + step2[10], bd);
2054 output[6] = WRAPLOW(step2[6] + step2[9]); 2072 output[6] = WRAPLOW(step2[6] + step2[9], bd);
2055 output[7] = WRAPLOW(step2[7] + step2[8]); 2073 output[7] = WRAPLOW(step2[7] + step2[8], bd);
2056 output[8] = WRAPLOW(step2[7] - step2[8]); 2074 output[8] = WRAPLOW(step2[7] - step2[8], bd);
2057 output[9] = WRAPLOW(step2[6] - step2[9]); 2075 output[9] = WRAPLOW(step2[6] - step2[9], bd);
2058 output[10] = WRAPLOW(step2[5] - step2[10]); 2076 output[10] = WRAPLOW(step2[5] - step2[10], bd);
2059 output[11] = WRAPLOW(step2[4] - step2[11]); 2077 output[11] = WRAPLOW(step2[4] - step2[11], bd);
2060 output[12] = WRAPLOW(step2[3] - step2[12]); 2078 output[12] = WRAPLOW(step2[3] - step2[12], bd);
2061 output[13] = WRAPLOW(step2[2] - step2[13]); 2079 output[13] = WRAPLOW(step2[2] - step2[13], bd);
2062 output[14] = WRAPLOW(step2[1] - step2[14]); 2080 output[14] = WRAPLOW(step2[1] - step2[14], bd);
2063 output[15] = WRAPLOW(step2[0] - step2[15]); 2081 output[15] = WRAPLOW(step2[0] - step2[15], bd);
2064 } 2082 }
2065 2083
2066 void vp9_high_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, 2084 void vp9_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
2067 int stride, int bd) { 2085 int stride, int bd) {
2068 tran_low_t out[16 * 16]; 2086 tran_low_t out[16 * 16];
2069 tran_low_t *outptr = out; 2087 tran_low_t *outptr = out;
2070 int i, j; 2088 int i, j;
2071 tran_low_t temp_in[16], temp_out[16]; 2089 tran_low_t temp_in[16], temp_out[16];
2072 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 2090 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2073 2091
2074 // First transform rows. 2092 // First transform rows.
2075 for (i = 0; i < 16; ++i) { 2093 for (i = 0; i < 16; ++i) {
2076 high_idct16(input, outptr, bd); 2094 highbd_idct16(input, outptr, bd);
2077 input += 16; 2095 input += 16;
2078 outptr += 16; 2096 outptr += 16;
2079 } 2097 }
2080 2098
2081 // Then transform columns. 2099 // Then transform columns.
2082 for (i = 0; i < 16; ++i) { 2100 for (i = 0; i < 16; ++i) {
2083 for (j = 0; j < 16; ++j) 2101 for (j = 0; j < 16; ++j)
2084 temp_in[j] = out[j * 16 + i]; 2102 temp_in[j] = out[j * 16 + i];
2085 high_idct16(temp_in, temp_out, bd); 2103 highbd_idct16(temp_in, temp_out, bd);
2086 for (j = 0; j < 16; ++j) 2104 for (j = 0; j < 16; ++j) {
2087 dest[j * stride + i] = clip_pixel_bd_high( 2105 dest[j * stride + i] = highbd_clip_pixel_add(
2088 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); 2106 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2107 }
2089 } 2108 }
2090 } 2109 }
2091 2110
2092 static void high_iadst16(const tran_low_t *input, tran_low_t *output, int bd) { 2111 static void highbd_iadst16(const tran_low_t *input, tran_low_t *output,
2112 int bd) {
2093 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; 2113 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
2094 tran_high_t s9, s10, s11, s12, s13, s14, s15; 2114 tran_high_t s9, s10, s11, s12, s13, s14, s15;
2095 2115
2096 tran_high_t x0 = input[15]; 2116 tran_high_t x0 = input[15];
2097 tran_high_t x1 = input[0]; 2117 tran_high_t x1 = input[0];
2098 tran_high_t x2 = input[13]; 2118 tran_high_t x2 = input[13];
2099 tran_high_t x3 = input[2]; 2119 tran_high_t x3 = input[2];
2100 tran_high_t x4 = input[11]; 2120 tran_high_t x4 = input[11];
2101 tran_high_t x5 = input[4]; 2121 tran_high_t x5 = input[4];
2102 tran_high_t x6 = input[9]; 2122 tran_high_t x6 = input[9];
(...skipping 25 matching lines...) Expand all
2128 s7 = x6 * cospi_19_64 - x7 * cospi_13_64; 2148 s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
2129 s8 = x8 * cospi_17_64 + x9 * cospi_15_64; 2149 s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
2130 s9 = x8 * cospi_15_64 - x9 * cospi_17_64; 2150 s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
2131 s10 = x10 * cospi_21_64 + x11 * cospi_11_64; 2151 s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
2132 s11 = x10 * cospi_11_64 - x11 * cospi_21_64; 2152 s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
2133 s12 = x12 * cospi_25_64 + x13 * cospi_7_64; 2153 s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
2134 s13 = x12 * cospi_7_64 - x13 * cospi_25_64; 2154 s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
2135 s14 = x14 * cospi_29_64 + x15 * cospi_3_64; 2155 s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
2136 s15 = x14 * cospi_3_64 - x15 * cospi_29_64; 2156 s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
2137 2157
2138 x0 = WRAPLOW(dct_const_round_shift(s0 + s8)); 2158 x0 = WRAPLOW(dct_const_round_shift(s0 + s8), bd);
2139 x1 = WRAPLOW(dct_const_round_shift(s1 + s9)); 2159 x1 = WRAPLOW(dct_const_round_shift(s1 + s9), bd);
2140 x2 = WRAPLOW(dct_const_round_shift(s2 + s10)); 2160 x2 = WRAPLOW(dct_const_round_shift(s2 + s10), bd);
2141 x3 = WRAPLOW(dct_const_round_shift(s3 + s11)); 2161 x3 = WRAPLOW(dct_const_round_shift(s3 + s11), bd);
2142 x4 = WRAPLOW(dct_const_round_shift(s4 + s12)); 2162 x4 = WRAPLOW(dct_const_round_shift(s4 + s12), bd);
2143 x5 = WRAPLOW(dct_const_round_shift(s5 + s13)); 2163 x5 = WRAPLOW(dct_const_round_shift(s5 + s13), bd);
2144 x6 = WRAPLOW(dct_const_round_shift(s6 + s14)); 2164 x6 = WRAPLOW(dct_const_round_shift(s6 + s14), bd);
2145 x7 = WRAPLOW(dct_const_round_shift(s7 + s15)); 2165 x7 = WRAPLOW(dct_const_round_shift(s7 + s15), bd);
2146 x8 = WRAPLOW(dct_const_round_shift(s0 - s8)); 2166 x8 = WRAPLOW(dct_const_round_shift(s0 - s8), bd);
2147 x9 = WRAPLOW(dct_const_round_shift(s1 - s9)); 2167 x9 = WRAPLOW(dct_const_round_shift(s1 - s9), bd);
2148 x10 = WRAPLOW(dct_const_round_shift(s2 - s10)); 2168 x10 = WRAPLOW(dct_const_round_shift(s2 - s10), bd);
2149 x11 = WRAPLOW(dct_const_round_shift(s3 - s11)); 2169 x11 = WRAPLOW(dct_const_round_shift(s3 - s11), bd);
2150 x12 = WRAPLOW(dct_const_round_shift(s4 - s12)); 2170 x12 = WRAPLOW(dct_const_round_shift(s4 - s12), bd);
2151 x13 = WRAPLOW(dct_const_round_shift(s5 - s13)); 2171 x13 = WRAPLOW(dct_const_round_shift(s5 - s13), bd);
2152 x14 = WRAPLOW(dct_const_round_shift(s6 - s14)); 2172 x14 = WRAPLOW(dct_const_round_shift(s6 - s14), bd);
2153 x15 = WRAPLOW(dct_const_round_shift(s7 - s15)); 2173 x15 = WRAPLOW(dct_const_round_shift(s7 - s15), bd);
2154 2174
2155 // stage 2 2175 // stage 2
2156 s0 = x0; 2176 s0 = x0;
2157 s1 = x1; 2177 s1 = x1;
2158 s2 = x2; 2178 s2 = x2;
2159 s3 = x3; 2179 s3 = x3;
2160 s4 = x4; 2180 s4 = x4;
2161 s5 = x5; 2181 s5 = x5;
2162 s6 = x6; 2182 s6 = x6;
2163 s7 = x7; 2183 s7 = x7;
2164 s8 = x8 * cospi_4_64 + x9 * cospi_28_64; 2184 s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
2165 s9 = x8 * cospi_28_64 - x9 * cospi_4_64; 2185 s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
2166 s10 = x10 * cospi_20_64 + x11 * cospi_12_64; 2186 s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
2167 s11 = x10 * cospi_12_64 - x11 * cospi_20_64; 2187 s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
2168 s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; 2188 s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
2169 s13 = x12 * cospi_4_64 + x13 * cospi_28_64; 2189 s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
2170 s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; 2190 s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
2171 s15 = x14 * cospi_20_64 + x15 * cospi_12_64; 2191 s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
2172 2192
2173 x0 = WRAPLOW(s0 + s4); 2193 x0 = WRAPLOW(s0 + s4, bd);
2174 x1 = WRAPLOW(s1 + s5); 2194 x1 = WRAPLOW(s1 + s5, bd);
2175 x2 = WRAPLOW(s2 + s6); 2195 x2 = WRAPLOW(s2 + s6, bd);
2176 x3 = WRAPLOW(s3 + s7); 2196 x3 = WRAPLOW(s3 + s7, bd);
2177 x4 = WRAPLOW(s0 - s4); 2197 x4 = WRAPLOW(s0 - s4, bd);
2178 x5 = WRAPLOW(s1 - s5); 2198 x5 = WRAPLOW(s1 - s5, bd);
2179 x6 = WRAPLOW(s2 - s6); 2199 x6 = WRAPLOW(s2 - s6, bd);
2180 x7 = WRAPLOW(s3 - s7); 2200 x7 = WRAPLOW(s3 - s7, bd);
2181 x8 = WRAPLOW(dct_const_round_shift(s8 + s12)); 2201 x8 = WRAPLOW(dct_const_round_shift(s8 + s12), bd);
2182 x9 = WRAPLOW(dct_const_round_shift(s9 + s13)); 2202 x9 = WRAPLOW(dct_const_round_shift(s9 + s13), bd);
2183 x10 = WRAPLOW(dct_const_round_shift(s10 + s14)); 2203 x10 = WRAPLOW(dct_const_round_shift(s10 + s14), bd);
2184 x11 = WRAPLOW(dct_const_round_shift(s11 + s15)); 2204 x11 = WRAPLOW(dct_const_round_shift(s11 + s15), bd);
2185 x12 = WRAPLOW(dct_const_round_shift(s8 - s12)); 2205 x12 = WRAPLOW(dct_const_round_shift(s8 - s12), bd);
2186 x13 = WRAPLOW(dct_const_round_shift(s9 - s13)); 2206 x13 = WRAPLOW(dct_const_round_shift(s9 - s13), bd);
2187 x14 = WRAPLOW(dct_const_round_shift(s10 - s14)); 2207 x14 = WRAPLOW(dct_const_round_shift(s10 - s14), bd);
2188 x15 = WRAPLOW(dct_const_round_shift(s11 - s15)); 2208 x15 = WRAPLOW(dct_const_round_shift(s11 - s15), bd);
2189 2209
2190 // stage 3 2210 // stage 3
2191 s0 = x0; 2211 s0 = x0;
2192 s1 = x1; 2212 s1 = x1;
2193 s2 = x2; 2213 s2 = x2;
2194 s3 = x3; 2214 s3 = x3;
2195 s4 = x4 * cospi_8_64 + x5 * cospi_24_64; 2215 s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
2196 s5 = x4 * cospi_24_64 - x5 * cospi_8_64; 2216 s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
2197 s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; 2217 s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
2198 s7 = x6 * cospi_8_64 + x7 * cospi_24_64; 2218 s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
2199 s8 = x8; 2219 s8 = x8;
2200 s9 = x9; 2220 s9 = x9;
2201 s10 = x10; 2221 s10 = x10;
2202 s11 = x11; 2222 s11 = x11;
2203 s12 = x12 * cospi_8_64 + x13 * cospi_24_64; 2223 s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
2204 s13 = x12 * cospi_24_64 - x13 * cospi_8_64; 2224 s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
2205 s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; 2225 s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
2206 s15 = x14 * cospi_8_64 + x15 * cospi_24_64; 2226 s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
2207 2227
2208 x0 = WRAPLOW(s0 + s2); 2228 x0 = WRAPLOW(s0 + s2, bd);
2209 x1 = WRAPLOW(s1 + s3); 2229 x1 = WRAPLOW(s1 + s3, bd);
2210 x2 = WRAPLOW(s0 - s2); 2230 x2 = WRAPLOW(s0 - s2, bd);
2211 x3 = WRAPLOW(s1 - s3); 2231 x3 = WRAPLOW(s1 - s3, bd);
2212 x4 = WRAPLOW(dct_const_round_shift(s4 + s6)); 2232 x4 = WRAPLOW(dct_const_round_shift(s4 + s6), bd);
2213 x5 = WRAPLOW(dct_const_round_shift(s5 + s7)); 2233 x5 = WRAPLOW(dct_const_round_shift(s5 + s7), bd);
2214 x6 = WRAPLOW(dct_const_round_shift(s4 - s6)); 2234 x6 = WRAPLOW(dct_const_round_shift(s4 - s6), bd);
2215 x7 = WRAPLOW(dct_const_round_shift(s5 - s7)); 2235 x7 = WRAPLOW(dct_const_round_shift(s5 - s7), bd);
2216 x8 = WRAPLOW(s8 + s10); 2236 x8 = WRAPLOW(s8 + s10, bd);
2217 x9 = WRAPLOW(s9 + s11); 2237 x9 = WRAPLOW(s9 + s11, bd);
2218 x10 = WRAPLOW(s8 - s10); 2238 x10 = WRAPLOW(s8 - s10, bd);
2219 x11 = WRAPLOW(s9 - s11); 2239 x11 = WRAPLOW(s9 - s11, bd);
2220 x12 = WRAPLOW(dct_const_round_shift(s12 + s14)); 2240 x12 = WRAPLOW(dct_const_round_shift(s12 + s14), bd);
2221 x13 = WRAPLOW(dct_const_round_shift(s13 + s15)); 2241 x13 = WRAPLOW(dct_const_round_shift(s13 + s15), bd);
2222 x14 = WRAPLOW(dct_const_round_shift(s12 - s14)); 2242 x14 = WRAPLOW(dct_const_round_shift(s12 - s14), bd);
2223 x15 = WRAPLOW(dct_const_round_shift(s13 - s15)); 2243 x15 = WRAPLOW(dct_const_round_shift(s13 - s15), bd);
2224 2244
2225 // stage 4 2245 // stage 4
2226 s2 = (- cospi_16_64) * (x2 + x3); 2246 s2 = (- cospi_16_64) * (x2 + x3);
2227 s3 = cospi_16_64 * (x2 - x3); 2247 s3 = cospi_16_64 * (x2 - x3);
2228 s6 = cospi_16_64 * (x6 + x7); 2248 s6 = cospi_16_64 * (x6 + x7);
2229 s7 = cospi_16_64 * (-x6 + x7); 2249 s7 = cospi_16_64 * (-x6 + x7);
2230 s10 = cospi_16_64 * (x10 + x11); 2250 s10 = cospi_16_64 * (x10 + x11);
2231 s11 = cospi_16_64 * (-x10 + x11); 2251 s11 = cospi_16_64 * (-x10 + x11);
2232 s14 = (- cospi_16_64) * (x14 + x15); 2252 s14 = (- cospi_16_64) * (x14 + x15);
2233 s15 = cospi_16_64 * (x14 - x15); 2253 s15 = cospi_16_64 * (x14 - x15);
2234 2254
2235 x2 = WRAPLOW(dct_const_round_shift(s2)); 2255 x2 = WRAPLOW(dct_const_round_shift(s2), bd);
2236 x3 = WRAPLOW(dct_const_round_shift(s3)); 2256 x3 = WRAPLOW(dct_const_round_shift(s3), bd);
2237 x6 = WRAPLOW(dct_const_round_shift(s6)); 2257 x6 = WRAPLOW(dct_const_round_shift(s6), bd);
2238 x7 = WRAPLOW(dct_const_round_shift(s7)); 2258 x7 = WRAPLOW(dct_const_round_shift(s7), bd);
2239 x10 = WRAPLOW(dct_const_round_shift(s10)); 2259 x10 = WRAPLOW(dct_const_round_shift(s10), bd);
2240 x11 = WRAPLOW(dct_const_round_shift(s11)); 2260 x11 = WRAPLOW(dct_const_round_shift(s11), bd);
2241 x14 = WRAPLOW(dct_const_round_shift(s14)); 2261 x14 = WRAPLOW(dct_const_round_shift(s14), bd);
2242 x15 = WRAPLOW(dct_const_round_shift(s15)); 2262 x15 = WRAPLOW(dct_const_round_shift(s15), bd);
2243 2263
2244 output[0] = WRAPLOW(x0); 2264 output[0] = WRAPLOW(x0, bd);
2245 output[1] = WRAPLOW(-x8); 2265 output[1] = WRAPLOW(-x8, bd);
2246 output[2] = WRAPLOW(x12); 2266 output[2] = WRAPLOW(x12, bd);
2247 output[3] = WRAPLOW(-x4); 2267 output[3] = WRAPLOW(-x4, bd);
2248 output[4] = WRAPLOW(x6); 2268 output[4] = WRAPLOW(x6, bd);
2249 output[5] = WRAPLOW(x14); 2269 output[5] = WRAPLOW(x14, bd);
2250 output[6] = WRAPLOW(x10); 2270 output[6] = WRAPLOW(x10, bd);
2251 output[7] = WRAPLOW(x2); 2271 output[7] = WRAPLOW(x2, bd);
2252 output[8] = WRAPLOW(x3); 2272 output[8] = WRAPLOW(x3, bd);
2253 output[9] = WRAPLOW(x11); 2273 output[9] = WRAPLOW(x11, bd);
2254 output[10] = WRAPLOW(x15); 2274 output[10] = WRAPLOW(x15, bd);
2255 output[11] = WRAPLOW(x7); 2275 output[11] = WRAPLOW(x7, bd);
2256 output[12] = WRAPLOW(x5); 2276 output[12] = WRAPLOW(x5, bd);
2257 output[13] = WRAPLOW(-x13); 2277 output[13] = WRAPLOW(-x13, bd);
2258 output[14] = WRAPLOW(x9); 2278 output[14] = WRAPLOW(x9, bd);
2259 output[15] = WRAPLOW(-x1); 2279 output[15] = WRAPLOW(-x1, bd);
2260 } 2280 }
2261 2281
2262 static const high_transform_2d HIGH_IHT_16[] = { 2282 static const highbd_transform_2d HIGH_IHT_16[] = {
2263 { high_idct16, high_idct16 }, // DCT_DCT = 0 2283 { highbd_idct16, highbd_idct16 }, // DCT_DCT = 0
2264 { high_iadst16, high_idct16 }, // ADST_DCT = 1 2284 { highbd_iadst16, highbd_idct16 }, // ADST_DCT = 1
2265 { high_idct16, high_iadst16 }, // DCT_ADST = 2 2285 { highbd_idct16, highbd_iadst16 }, // DCT_ADST = 2
2266 { high_iadst16, high_iadst16 } // ADST_ADST = 3 2286 { highbd_iadst16, highbd_iadst16 } // ADST_ADST = 3
2267 }; 2287 };
2268 2288
2269 void vp9_high_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, 2289 void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
2270 int stride, int tx_type, int bd) { 2290 int stride, int tx_type, int bd) {
2271 int i, j; 2291 int i, j;
2272 tran_low_t out[16 * 16]; 2292 tran_low_t out[16 * 16];
2273 tran_low_t *outptr = out; 2293 tran_low_t *outptr = out;
2274 tran_low_t temp_in[16], temp_out[16]; 2294 tran_low_t temp_in[16], temp_out[16];
2275 const high_transform_2d ht = HIGH_IHT_16[tx_type]; 2295 const highbd_transform_2d ht = HIGH_IHT_16[tx_type];
2276 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 2296 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2277 2297
2278 // Rows 2298 // Rows
2279 for (i = 0; i < 16; ++i) { 2299 for (i = 0; i < 16; ++i) {
2280 ht.rows(input, outptr, bd); 2300 ht.rows(input, outptr, bd);
2281 input += 16; 2301 input += 16;
2282 outptr += 16; 2302 outptr += 16;
2283 } 2303 }
2284 2304
2285 // Columns 2305 // Columns
2286 for (i = 0; i < 16; ++i) { 2306 for (i = 0; i < 16; ++i) {
2287 for (j = 0; j < 16; ++j) 2307 for (j = 0; j < 16; ++j)
2288 temp_in[j] = out[j * 16 + i]; 2308 temp_in[j] = out[j * 16 + i];
2289 ht.cols(temp_in, temp_out, bd); 2309 ht.cols(temp_in, temp_out, bd);
2290 for (j = 0; j < 16; ++j) 2310 for (j = 0; j < 16; ++j) {
2291 dest[j * stride + i] = clip_pixel_bd_high( 2311 dest[j * stride + i] = highbd_clip_pixel_add(
2292 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); 2312 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2313 }
2293 } 2314 }
2294 } 2315 }
2295 2316
2296 void vp9_high_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8, 2317 void vp9_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
2297 int stride, int bd) { 2318 int stride, int bd) {
2298 tran_low_t out[16 * 16] = { 0 }; 2319 tran_low_t out[16 * 16] = { 0 };
2299 tran_low_t *outptr = out; 2320 tran_low_t *outptr = out;
2300 int i, j; 2321 int i, j;
2301 tran_low_t temp_in[16], temp_out[16]; 2322 tran_low_t temp_in[16], temp_out[16];
2302 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 2323 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2303 2324
2304 // First transform rows. Since all non-zero dct coefficients are in 2325 // First transform rows. Since all non-zero dct coefficients are in
2305 // upper-left 4x4 area, we only need to calculate first 4 rows here. 2326 // upper-left 4x4 area, we only need to calculate first 4 rows here.
2306 for (i = 0; i < 4; ++i) { 2327 for (i = 0; i < 4; ++i) {
2307 high_idct16(input, outptr, bd); 2328 highbd_idct16(input, outptr, bd);
2308 input += 16; 2329 input += 16;
2309 outptr += 16; 2330 outptr += 16;
2310 } 2331 }
2311 2332
2312 // Then transform columns. 2333 // Then transform columns.
2313 for (i = 0; i < 16; ++i) { 2334 for (i = 0; i < 16; ++i) {
2314 for (j = 0; j < 16; ++j) 2335 for (j = 0; j < 16; ++j)
2315 temp_in[j] = out[j*16 + i]; 2336 temp_in[j] = out[j*16 + i];
2316 high_idct16(temp_in, temp_out, bd); 2337 highbd_idct16(temp_in, temp_out, bd);
2317 for (j = 0; j < 16; ++j) 2338 for (j = 0; j < 16; ++j) {
2318 dest[j * stride + i] = clip_pixel_bd_high( 2339 dest[j * stride + i] = highbd_clip_pixel_add(
2319 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); 2340 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2341 }
2320 } 2342 }
2321 } 2343 }
2322 2344
2323 void vp9_high_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8, 2345 void vp9_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
2324 int stride, int bd) { 2346 int stride, int bd) {
2325 int i, j; 2347 int i, j;
2326 tran_high_t a1; 2348 tran_high_t a1;
2327 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); 2349 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
2328 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 2350 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2329 2351
2330 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); 2352 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
2331 a1 = ROUND_POWER_OF_TWO(out, 6); 2353 a1 = ROUND_POWER_OF_TWO(out, 6);
2332 for (j = 0; j < 16; ++j) { 2354 for (j = 0; j < 16; ++j) {
2333 for (i = 0; i < 16; ++i) 2355 for (i = 0; i < 16; ++i)
2334 dest[i] = clip_pixel_bd_high(dest[i], a1, bd); 2356 dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
2335 dest += stride; 2357 dest += stride;
2336 } 2358 }
2337 } 2359 }
2338 2360
2339 static void high_idct32(const tran_low_t *input, tran_low_t *output, int bd) { 2361 static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
2340 tran_low_t step1[32], step2[32]; 2362 tran_low_t step1[32], step2[32];
2341 tran_high_t temp1, temp2; 2363 tran_high_t temp1, temp2;
2342 (void) bd; 2364 (void) bd;
2343 2365
2344 // stage 1 2366 // stage 1
2345 step1[0] = input[0]; 2367 step1[0] = input[0];
2346 step1[1] = input[16]; 2368 step1[1] = input[16];
2347 step1[2] = input[8]; 2369 step1[2] = input[8];
2348 step1[3] = input[24]; 2370 step1[3] = input[24];
2349 step1[4] = input[4]; 2371 step1[4] = input[4];
2350 step1[5] = input[20]; 2372 step1[5] = input[20];
2351 step1[6] = input[12]; 2373 step1[6] = input[12];
2352 step1[7] = input[28]; 2374 step1[7] = input[28];
2353 step1[8] = input[2]; 2375 step1[8] = input[2];
2354 step1[9] = input[18]; 2376 step1[9] = input[18];
2355 step1[10] = input[10]; 2377 step1[10] = input[10];
2356 step1[11] = input[26]; 2378 step1[11] = input[26];
2357 step1[12] = input[6]; 2379 step1[12] = input[6];
2358 step1[13] = input[22]; 2380 step1[13] = input[22];
2359 step1[14] = input[14]; 2381 step1[14] = input[14];
2360 step1[15] = input[30]; 2382 step1[15] = input[30];
2361 2383
2362 temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; 2384 temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
2363 temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; 2385 temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
2364 step1[16] = WRAPLOW(dct_const_round_shift(temp1)); 2386 step1[16] = WRAPLOW(dct_const_round_shift(temp1), bd);
2365 step1[31] = WRAPLOW(dct_const_round_shift(temp2)); 2387 step1[31] = WRAPLOW(dct_const_round_shift(temp2), bd);
2366 2388
2367 temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; 2389 temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
2368 temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; 2390 temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
2369 step1[17] = WRAPLOW(dct_const_round_shift(temp1)); 2391 step1[17] = WRAPLOW(dct_const_round_shift(temp1), bd);
2370 step1[30] = WRAPLOW(dct_const_round_shift(temp2)); 2392 step1[30] = WRAPLOW(dct_const_round_shift(temp2), bd);
2371 2393
2372 temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; 2394 temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
2373 temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; 2395 temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
2374 step1[18] = WRAPLOW(dct_const_round_shift(temp1)); 2396 step1[18] = WRAPLOW(dct_const_round_shift(temp1), bd);
2375 step1[29] = WRAPLOW(dct_const_round_shift(temp2)); 2397 step1[29] = WRAPLOW(dct_const_round_shift(temp2), bd);
2376 2398
2377 temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; 2399 temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
2378 temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; 2400 temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
2379 step1[19] = WRAPLOW(dct_const_round_shift(temp1)); 2401 step1[19] = WRAPLOW(dct_const_round_shift(temp1), bd);
2380 step1[28] = WRAPLOW(dct_const_round_shift(temp2)); 2402 step1[28] = WRAPLOW(dct_const_round_shift(temp2), bd);
2381 2403
2382 temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; 2404 temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
2383 temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; 2405 temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
2384 step1[20] = WRAPLOW(dct_const_round_shift(temp1)); 2406 step1[20] = WRAPLOW(dct_const_round_shift(temp1), bd);
2385 step1[27] = WRAPLOW(dct_const_round_shift(temp2)); 2407 step1[27] = WRAPLOW(dct_const_round_shift(temp2), bd);
2386 2408
2387 temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; 2409 temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
2388 temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; 2410 temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
2389 step1[21] = WRAPLOW(dct_const_round_shift(temp1)); 2411 step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd);
2390 step1[26] = WRAPLOW(dct_const_round_shift(temp2)); 2412 step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd);
2391 2413
2392 temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; 2414 temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
2393 temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; 2415 temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
2394 step1[22] = WRAPLOW(dct_const_round_shift(temp1)); 2416 step1[22] = WRAPLOW(dct_const_round_shift(temp1), bd);
2395 step1[25] = WRAPLOW(dct_const_round_shift(temp2)); 2417 step1[25] = WRAPLOW(dct_const_round_shift(temp2), bd);
2396 2418
2397 temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; 2419 temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
2398 temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; 2420 temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
2399 step1[23] = WRAPLOW(dct_const_round_shift(temp1)); 2421 step1[23] = WRAPLOW(dct_const_round_shift(temp1), bd);
2400 step1[24] = WRAPLOW(dct_const_round_shift(temp2)); 2422 step1[24] = WRAPLOW(dct_const_round_shift(temp2), bd);
2401 2423
2402 // stage 2 2424 // stage 2
2403 step2[0] = step1[0]; 2425 step2[0] = step1[0];
2404 step2[1] = step1[1]; 2426 step2[1] = step1[1];
2405 step2[2] = step1[2]; 2427 step2[2] = step1[2];
2406 step2[3] = step1[3]; 2428 step2[3] = step1[3];
2407 step2[4] = step1[4]; 2429 step2[4] = step1[4];
2408 step2[5] = step1[5]; 2430 step2[5] = step1[5];
2409 step2[6] = step1[6]; 2431 step2[6] = step1[6];
2410 step2[7] = step1[7]; 2432 step2[7] = step1[7];
2411 2433
2412 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; 2434 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
2413 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; 2435 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
2414 step2[8] = WRAPLOW(dct_const_round_shift(temp1)); 2436 step2[8] = WRAPLOW(dct_const_round_shift(temp1), bd);
2415 step2[15] = WRAPLOW(dct_const_round_shift(temp2)); 2437 step2[15] = WRAPLOW(dct_const_round_shift(temp2), bd);
2416 2438
2417 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; 2439 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
2418 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; 2440 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
2419 step2[9] = WRAPLOW(dct_const_round_shift(temp1)); 2441 step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
2420 step2[14] = WRAPLOW(dct_const_round_shift(temp2)); 2442 step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
2421 2443
2422 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; 2444 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
2423 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; 2445 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
2424 step2[10] = WRAPLOW(dct_const_round_shift(temp1)); 2446 step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
2425 step2[13] = WRAPLOW(dct_const_round_shift(temp2)); 2447 step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
2426 2448
2427 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; 2449 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
2428 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; 2450 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
2429 step2[11] = WRAPLOW(dct_const_round_shift(temp1)); 2451 step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
2430 step2[12] = WRAPLOW(dct_const_round_shift(temp2)); 2452 step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
2431 2453
2432 step2[16] = WRAPLOW(step1[16] + step1[17]); 2454 step2[16] = WRAPLOW(step1[16] + step1[17], bd);
2433 step2[17] = WRAPLOW(step1[16] - step1[17]); 2455 step2[17] = WRAPLOW(step1[16] - step1[17], bd);
2434 step2[18] = WRAPLOW(-step1[18] + step1[19]); 2456 step2[18] = WRAPLOW(-step1[18] + step1[19], bd);
2435 step2[19] = WRAPLOW(step1[18] + step1[19]); 2457 step2[19] = WRAPLOW(step1[18] + step1[19], bd);
2436 step2[20] = WRAPLOW(step1[20] + step1[21]); 2458 step2[20] = WRAPLOW(step1[20] + step1[21], bd);
2437 step2[21] = WRAPLOW(step1[20] - step1[21]); 2459 step2[21] = WRAPLOW(step1[20] - step1[21], bd);
2438 step2[22] = WRAPLOW(-step1[22] + step1[23]); 2460 step2[22] = WRAPLOW(-step1[22] + step1[23], bd);
2439 step2[23] = WRAPLOW(step1[22] + step1[23]); 2461 step2[23] = WRAPLOW(step1[22] + step1[23], bd);
2440 step2[24] = WRAPLOW(step1[24] + step1[25]); 2462 step2[24] = WRAPLOW(step1[24] + step1[25], bd);
2441 step2[25] = WRAPLOW(step1[24] - step1[25]); 2463 step2[25] = WRAPLOW(step1[24] - step1[25], bd);
2442 step2[26] = WRAPLOW(-step1[26] + step1[27]); 2464 step2[26] = WRAPLOW(-step1[26] + step1[27], bd);
2443 step2[27] = WRAPLOW(step1[26] + step1[27]); 2465 step2[27] = WRAPLOW(step1[26] + step1[27], bd);
2444 step2[28] = WRAPLOW(step1[28] + step1[29]); 2466 step2[28] = WRAPLOW(step1[28] + step1[29], bd);
2445 step2[29] = WRAPLOW(step1[28] - step1[29]); 2467 step2[29] = WRAPLOW(step1[28] - step1[29], bd);
2446 step2[30] = WRAPLOW(-step1[30] + step1[31]); 2468 step2[30] = WRAPLOW(-step1[30] + step1[31], bd);
2447 step2[31] = WRAPLOW(step1[30] + step1[31]); 2469 step2[31] = WRAPLOW(step1[30] + step1[31], bd);
2448 2470
2449 // stage 3 2471 // stage 3
2450 step1[0] = step2[0]; 2472 step1[0] = step2[0];
2451 step1[1] = step2[1]; 2473 step1[1] = step2[1];
2452 step1[2] = step2[2]; 2474 step1[2] = step2[2];
2453 step1[3] = step2[3]; 2475 step1[3] = step2[3];
2454 2476
2455 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; 2477 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
2456 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; 2478 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
2457 step1[4] = WRAPLOW(dct_const_round_shift(temp1)); 2479 step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd);
2458 step1[7] = WRAPLOW(dct_const_round_shift(temp2)); 2480 step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd);
2459 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; 2481 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
2460 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; 2482 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
2461 step1[5] = WRAPLOW(dct_const_round_shift(temp1)); 2483 step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
2462 step1[6] = WRAPLOW(dct_const_round_shift(temp2)); 2484 step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
2463 2485
2464 step1[8] = WRAPLOW(step2[8] + step2[9]); 2486 step1[8] = WRAPLOW(step2[8] + step2[9], bd);
2465 step1[9] = WRAPLOW(step2[8] - step2[9]); 2487 step1[9] = WRAPLOW(step2[8] - step2[9], bd);
2466 step1[10] = WRAPLOW(-step2[10] + step2[11]); 2488 step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
2467 step1[11] = WRAPLOW(step2[10] + step2[11]); 2489 step1[11] = WRAPLOW(step2[10] + step2[11], bd);
2468 step1[12] = WRAPLOW(step2[12] + step2[13]); 2490 step1[12] = WRAPLOW(step2[12] + step2[13], bd);
2469 step1[13] = WRAPLOW(step2[12] - step2[13]); 2491 step1[13] = WRAPLOW(step2[12] - step2[13], bd);
2470 step1[14] = WRAPLOW(-step2[14] + step2[15]); 2492 step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
2471 step1[15] = WRAPLOW(step2[14] + step2[15]); 2493 step1[15] = WRAPLOW(step2[14] + step2[15], bd);
2472 2494
2473 step1[16] = step2[16]; 2495 step1[16] = step2[16];
2474 step1[31] = step2[31]; 2496 step1[31] = step2[31];
2475 temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; 2497 temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
2476 temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; 2498 temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
2477 step1[17] = WRAPLOW(dct_const_round_shift(temp1)); 2499 step1[17] = WRAPLOW(dct_const_round_shift(temp1), bd);
2478 step1[30] = WRAPLOW(dct_const_round_shift(temp2)); 2500 step1[30] = WRAPLOW(dct_const_round_shift(temp2), bd);
2479 temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; 2501 temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
2480 temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; 2502 temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
2481 step1[18] = WRAPLOW(dct_const_round_shift(temp1)); 2503 step1[18] = WRAPLOW(dct_const_round_shift(temp1), bd);
2482 step1[29] = WRAPLOW(dct_const_round_shift(temp2)); 2504 step1[29] = WRAPLOW(dct_const_round_shift(temp2), bd);
2483 step1[19] = step2[19]; 2505 step1[19] = step2[19];
2484 step1[20] = step2[20]; 2506 step1[20] = step2[20];
2485 temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; 2507 temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
2486 temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; 2508 temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
2487 step1[21] = WRAPLOW(dct_const_round_shift(temp1)); 2509 step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd);
2488 step1[26] = WRAPLOW(dct_const_round_shift(temp2)); 2510 step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd);
2489 temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; 2511 temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
2490 temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; 2512 temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
2491 step1[22] = WRAPLOW(dct_const_round_shift(temp1)); 2513 step1[22] = WRAPLOW(dct_const_round_shift(temp1), bd);
2492 step1[25] = WRAPLOW(dct_const_round_shift(temp2)); 2514 step1[25] = WRAPLOW(dct_const_round_shift(temp2), bd);
2493 step1[23] = step2[23]; 2515 step1[23] = step2[23];
2494 step1[24] = step2[24]; 2516 step1[24] = step2[24];
2495 step1[27] = step2[27]; 2517 step1[27] = step2[27];
2496 step1[28] = step2[28]; 2518 step1[28] = step2[28];
2497 2519
2498 // stage 4 2520 // stage 4
2499 temp1 = (step1[0] + step1[1]) * cospi_16_64; 2521 temp1 = (step1[0] + step1[1]) * cospi_16_64;
2500 temp2 = (step1[0] - step1[1]) * cospi_16_64; 2522 temp2 = (step1[0] - step1[1]) * cospi_16_64;
2501 step2[0] = WRAPLOW(dct_const_round_shift(temp1)); 2523 step2[0] = WRAPLOW(dct_const_round_shift(temp1), bd);
2502 step2[1] = WRAPLOW(dct_const_round_shift(temp2)); 2524 step2[1] = WRAPLOW(dct_const_round_shift(temp2), bd);
2503 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; 2525 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
2504 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; 2526 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
2505 step2[2] = WRAPLOW(dct_const_round_shift(temp1)); 2527 step2[2] = WRAPLOW(dct_const_round_shift(temp1), bd);
2506 step2[3] = WRAPLOW(dct_const_round_shift(temp2)); 2528 step2[3] = WRAPLOW(dct_const_round_shift(temp2), bd);
2507 step2[4] = WRAPLOW(step1[4] + step1[5]); 2529 step2[4] = WRAPLOW(step1[4] + step1[5], bd);
2508 step2[5] = WRAPLOW(step1[4] - step1[5]); 2530 step2[5] = WRAPLOW(step1[4] - step1[5], bd);
2509 step2[6] = WRAPLOW(-step1[6] + step1[7]); 2531 step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
2510 step2[7] = WRAPLOW(step1[6] + step1[7]); 2532 step2[7] = WRAPLOW(step1[6] + step1[7], bd);
2511 2533
2512 step2[8] = step1[8]; 2534 step2[8] = step1[8];
2513 step2[15] = step1[15]; 2535 step2[15] = step1[15];
2514 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; 2536 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
2515 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; 2537 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
2516 step2[9] = WRAPLOW(dct_const_round_shift(temp1)); 2538 step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
2517 step2[14] = WRAPLOW(dct_const_round_shift(temp2)); 2539 step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
2518 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; 2540 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
2519 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; 2541 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
2520 step2[10] = WRAPLOW(dct_const_round_shift(temp1)); 2542 step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
2521 step2[13] = WRAPLOW(dct_const_round_shift(temp2)); 2543 step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
2522 step2[11] = step1[11]; 2544 step2[11] = step1[11];
2523 step2[12] = step1[12]; 2545 step2[12] = step1[12];
2524 2546
2525 step2[16] = WRAPLOW(step1[16] + step1[19]); 2547 step2[16] = WRAPLOW(step1[16] + step1[19], bd);
2526 step2[17] = WRAPLOW(step1[17] + step1[18]); 2548 step2[17] = WRAPLOW(step1[17] + step1[18], bd);
2527 step2[18] = WRAPLOW(step1[17] - step1[18]); 2549 step2[18] = WRAPLOW(step1[17] - step1[18], bd);
2528 step2[19] = WRAPLOW(step1[16] - step1[19]); 2550 step2[19] = WRAPLOW(step1[16] - step1[19], bd);
2529 step2[20] = WRAPLOW(-step1[20] + step1[23]); 2551 step2[20] = WRAPLOW(-step1[20] + step1[23], bd);
2530 step2[21] = WRAPLOW(-step1[21] + step1[22]); 2552 step2[21] = WRAPLOW(-step1[21] + step1[22], bd);
2531 step2[22] = WRAPLOW(step1[21] + step1[22]); 2553 step2[22] = WRAPLOW(step1[21] + step1[22], bd);
2532 step2[23] = WRAPLOW(step1[20] + step1[23]); 2554 step2[23] = WRAPLOW(step1[20] + step1[23], bd);
2533 2555
2534 step2[24] = WRAPLOW(step1[24] + step1[27]); 2556 step2[24] = WRAPLOW(step1[24] + step1[27], bd);
2535 step2[25] = WRAPLOW(step1[25] + step1[26]); 2557 step2[25] = WRAPLOW(step1[25] + step1[26], bd);
2536 step2[26] = WRAPLOW(step1[25] - step1[26]); 2558 step2[26] = WRAPLOW(step1[25] - step1[26], bd);
2537 step2[27] = WRAPLOW(step1[24] - step1[27]); 2559 step2[27] = WRAPLOW(step1[24] - step1[27], bd);
2538 step2[28] = WRAPLOW(-step1[28] + step1[31]); 2560 step2[28] = WRAPLOW(-step1[28] + step1[31], bd);
2539 step2[29] = WRAPLOW(-step1[29] + step1[30]); 2561 step2[29] = WRAPLOW(-step1[29] + step1[30], bd);
2540 step2[30] = WRAPLOW(step1[29] + step1[30]); 2562 step2[30] = WRAPLOW(step1[29] + step1[30], bd);
2541 step2[31] = WRAPLOW(step1[28] + step1[31]); 2563 step2[31] = WRAPLOW(step1[28] + step1[31], bd);
2542 2564
2543 // stage 5 2565 // stage 5
2544 step1[0] = WRAPLOW(step2[0] + step2[3]); 2566 step1[0] = WRAPLOW(step2[0] + step2[3], bd);
2545 step1[1] = WRAPLOW(step2[1] + step2[2]); 2567 step1[1] = WRAPLOW(step2[1] + step2[2], bd);
2546 step1[2] = WRAPLOW(step2[1] - step2[2]); 2568 step1[2] = WRAPLOW(step2[1] - step2[2], bd);
2547 step1[3] = WRAPLOW(step2[0] - step2[3]); 2569 step1[3] = WRAPLOW(step2[0] - step2[3], bd);
2548 step1[4] = step2[4]; 2570 step1[4] = step2[4];
2549 temp1 = (step2[6] - step2[5]) * cospi_16_64; 2571 temp1 = (step2[6] - step2[5]) * cospi_16_64;
2550 temp2 = (step2[5] + step2[6]) * cospi_16_64; 2572 temp2 = (step2[5] + step2[6]) * cospi_16_64;
2551 step1[5] = WRAPLOW(dct_const_round_shift(temp1)); 2573 step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
2552 step1[6] = WRAPLOW(dct_const_round_shift(temp2)); 2574 step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
2553 step1[7] = step2[7]; 2575 step1[7] = step2[7];
2554 2576
2555 step1[8] = WRAPLOW(step2[8] + step2[11]); 2577 step1[8] = WRAPLOW(step2[8] + step2[11], bd);
2556 step1[9] = WRAPLOW(step2[9] + step2[10]); 2578 step1[9] = WRAPLOW(step2[9] + step2[10], bd);
2557 step1[10] = WRAPLOW(step2[9] - step2[10]); 2579 step1[10] = WRAPLOW(step2[9] - step2[10], bd);
2558 step1[11] = WRAPLOW(step2[8] - step2[11]); 2580 step1[11] = WRAPLOW(step2[8] - step2[11], bd);
2559 step1[12] = WRAPLOW(-step2[12] + step2[15]); 2581 step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
2560 step1[13] = WRAPLOW(-step2[13] + step2[14]); 2582 step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
2561 step1[14] = WRAPLOW(step2[13] + step2[14]); 2583 step1[14] = WRAPLOW(step2[13] + step2[14], bd);
2562 step1[15] = WRAPLOW(step2[12] + step2[15]); 2584 step1[15] = WRAPLOW(step2[12] + step2[15], bd);
2563 2585
2564 step1[16] = step2[16]; 2586 step1[16] = step2[16];
2565 step1[17] = step2[17]; 2587 step1[17] = step2[17];
2566 temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; 2588 temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
2567 temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; 2589 temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
2568 step1[18] = WRAPLOW(dct_const_round_shift(temp1)); 2590 step1[18] = WRAPLOW(dct_const_round_shift(temp1), bd);
2569 step1[29] = WRAPLOW(dct_const_round_shift(temp2)); 2591 step1[29] = WRAPLOW(dct_const_round_shift(temp2), bd);
2570 temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; 2592 temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
2571 temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; 2593 temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
2572 step1[19] = WRAPLOW(dct_const_round_shift(temp1)); 2594 step1[19] = WRAPLOW(dct_const_round_shift(temp1), bd);
2573 step1[28] = WRAPLOW(dct_const_round_shift(temp2)); 2595 step1[28] = WRAPLOW(dct_const_round_shift(temp2), bd);
2574 temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; 2596 temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
2575 temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; 2597 temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
2576 step1[20] = WRAPLOW(dct_const_round_shift(temp1)); 2598 step1[20] = WRAPLOW(dct_const_round_shift(temp1), bd);
2577 step1[27] = WRAPLOW(dct_const_round_shift(temp2)); 2599 step1[27] = WRAPLOW(dct_const_round_shift(temp2), bd);
2578 temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; 2600 temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
2579 temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; 2601 temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
2580 step1[21] = WRAPLOW(dct_const_round_shift(temp1)); 2602 step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd);
2581 step1[26] = WRAPLOW(dct_const_round_shift(temp2)); 2603 step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd);
2582 step1[22] = step2[22]; 2604 step1[22] = step2[22];
2583 step1[23] = step2[23]; 2605 step1[23] = step2[23];
2584 step1[24] = step2[24]; 2606 step1[24] = step2[24];
2585 step1[25] = step2[25]; 2607 step1[25] = step2[25];
2586 step1[30] = step2[30]; 2608 step1[30] = step2[30];
2587 step1[31] = step2[31]; 2609 step1[31] = step2[31];
2588 2610
2589 // stage 6 2611 // stage 6
2590 step2[0] = WRAPLOW(step1[0] + step1[7]); 2612 step2[0] = WRAPLOW(step1[0] + step1[7], bd);
2591 step2[1] = WRAPLOW(step1[1] + step1[6]); 2613 step2[1] = WRAPLOW(step1[1] + step1[6], bd);
2592 step2[2] = WRAPLOW(step1[2] + step1[5]); 2614 step2[2] = WRAPLOW(step1[2] + step1[5], bd);
2593 step2[3] = WRAPLOW(step1[3] + step1[4]); 2615 step2[3] = WRAPLOW(step1[3] + step1[4], bd);
2594 step2[4] = WRAPLOW(step1[3] - step1[4]); 2616 step2[4] = WRAPLOW(step1[3] - step1[4], bd);
2595 step2[5] = WRAPLOW(step1[2] - step1[5]); 2617 step2[5] = WRAPLOW(step1[2] - step1[5], bd);
2596 step2[6] = WRAPLOW(step1[1] - step1[6]); 2618 step2[6] = WRAPLOW(step1[1] - step1[6], bd);
2597 step2[7] = WRAPLOW(step1[0] - step1[7]); 2619 step2[7] = WRAPLOW(step1[0] - step1[7], bd);
2598 step2[8] = step1[8]; 2620 step2[8] = step1[8];
2599 step2[9] = step1[9]; 2621 step2[9] = step1[9];
2600 temp1 = (-step1[10] + step1[13]) * cospi_16_64; 2622 temp1 = (-step1[10] + step1[13]) * cospi_16_64;
2601 temp2 = (step1[10] + step1[13]) * cospi_16_64; 2623 temp2 = (step1[10] + step1[13]) * cospi_16_64;
2602 step2[10] = WRAPLOW(dct_const_round_shift(temp1)); 2624 step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
2603 step2[13] = WRAPLOW(dct_const_round_shift(temp2)); 2625 step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
2604 temp1 = (-step1[11] + step1[12]) * cospi_16_64; 2626 temp1 = (-step1[11] + step1[12]) * cospi_16_64;
2605 temp2 = (step1[11] + step1[12]) * cospi_16_64; 2627 temp2 = (step1[11] + step1[12]) * cospi_16_64;
2606 step2[11] = WRAPLOW(dct_const_round_shift(temp1)); 2628 step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
2607 step2[12] = WRAPLOW(dct_const_round_shift(temp2)); 2629 step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
2608 step2[14] = WRAPLOW(step1[14]); 2630 step2[14] = step1[14];
2609 step2[15] = WRAPLOW(step1[15]); 2631 step2[15] = step1[15];
2610 2632
2611 step2[16] = WRAPLOW(step1[16] + step1[23]); 2633 step2[16] = WRAPLOW(step1[16] + step1[23], bd);
2612 step2[17] = WRAPLOW(step1[17] + step1[22]); 2634 step2[17] = WRAPLOW(step1[17] + step1[22], bd);
2613 step2[18] = WRAPLOW(step1[18] + step1[21]); 2635 step2[18] = WRAPLOW(step1[18] + step1[21], bd);
2614 step2[19] = WRAPLOW(step1[19] + step1[20]); 2636 step2[19] = WRAPLOW(step1[19] + step1[20], bd);
2615 step2[20] = WRAPLOW(step1[19] - step1[20]); 2637 step2[20] = WRAPLOW(step1[19] - step1[20], bd);
2616 step2[21] = WRAPLOW(step1[18] - step1[21]); 2638 step2[21] = WRAPLOW(step1[18] - step1[21], bd);
2617 step2[22] = WRAPLOW(step1[17] - step1[22]); 2639 step2[22] = WRAPLOW(step1[17] - step1[22], bd);
2618 step2[23] = WRAPLOW(step1[16] - step1[23]); 2640 step2[23] = WRAPLOW(step1[16] - step1[23], bd);
2619 2641
2620 step2[24] = WRAPLOW(-step1[24] + step1[31]); 2642 step2[24] = WRAPLOW(-step1[24] + step1[31], bd);
2621 step2[25] = WRAPLOW(-step1[25] + step1[30]); 2643 step2[25] = WRAPLOW(-step1[25] + step1[30], bd);
2622 step2[26] = WRAPLOW(-step1[26] + step1[29]); 2644 step2[26] = WRAPLOW(-step1[26] + step1[29], bd);
2623 step2[27] = WRAPLOW(-step1[27] + step1[28]); 2645 step2[27] = WRAPLOW(-step1[27] + step1[28], bd);
2624 step2[28] = WRAPLOW(step1[27] + step1[28]); 2646 step2[28] = WRAPLOW(step1[27] + step1[28], bd);
2625 step2[29] = WRAPLOW(step1[26] + step1[29]); 2647 step2[29] = WRAPLOW(step1[26] + step1[29], bd);
2626 step2[30] = WRAPLOW(step1[25] + step1[30]); 2648 step2[30] = WRAPLOW(step1[25] + step1[30], bd);
2627 step2[31] = WRAPLOW(step1[24] + step1[31]); 2649 step2[31] = WRAPLOW(step1[24] + step1[31], bd);
2628 2650
2629 // stage 7 2651 // stage 7
2630 step1[0] = WRAPLOW(step2[0] + step2[15]); 2652 step1[0] = WRAPLOW(step2[0] + step2[15], bd);
2631 step1[1] = WRAPLOW(step2[1] + step2[14]); 2653 step1[1] = WRAPLOW(step2[1] + step2[14], bd);
2632 step1[2] = WRAPLOW(step2[2] + step2[13]); 2654 step1[2] = WRAPLOW(step2[2] + step2[13], bd);
2633 step1[3] = WRAPLOW(step2[3] + step2[12]); 2655 step1[3] = WRAPLOW(step2[3] + step2[12], bd);
2634 step1[4] = WRAPLOW(step2[4] + step2[11]); 2656 step1[4] = WRAPLOW(step2[4] + step2[11], bd);
2635 step1[5] = WRAPLOW(step2[5] + step2[10]); 2657 step1[5] = WRAPLOW(step2[5] + step2[10], bd);
2636 step1[6] = WRAPLOW(step2[6] + step2[9]); 2658 step1[6] = WRAPLOW(step2[6] + step2[9], bd);
2637 step1[7] = WRAPLOW(step2[7] + step2[8]); 2659 step1[7] = WRAPLOW(step2[7] + step2[8], bd);
2638 step1[8] = WRAPLOW(step2[7] - step2[8]); 2660 step1[8] = WRAPLOW(step2[7] - step2[8], bd);
2639 step1[9] = WRAPLOW(step2[6] - step2[9]); 2661 step1[9] = WRAPLOW(step2[6] - step2[9], bd);
2640 step1[10] = WRAPLOW(step2[5] - step2[10]); 2662 step1[10] = WRAPLOW(step2[5] - step2[10], bd);
2641 step1[11] = WRAPLOW(step2[4] - step2[11]); 2663 step1[11] = WRAPLOW(step2[4] - step2[11], bd);
2642 step1[12] = WRAPLOW(step2[3] - step2[12]); 2664 step1[12] = WRAPLOW(step2[3] - step2[12], bd);
2643 step1[13] = WRAPLOW(step2[2] - step2[13]); 2665 step1[13] = WRAPLOW(step2[2] - step2[13], bd);
2644 step1[14] = WRAPLOW(step2[1] - step2[14]); 2666 step1[14] = WRAPLOW(step2[1] - step2[14], bd);
2645 step1[15] = WRAPLOW(step2[0] - step2[15]); 2667 step1[15] = WRAPLOW(step2[0] - step2[15], bd);
2646 2668
2647 step1[16] = step2[16]; 2669 step1[16] = step2[16];
2648 step1[17] = step2[17]; 2670 step1[17] = step2[17];
2649 step1[18] = step2[18]; 2671 step1[18] = step2[18];
2650 step1[19] = step2[19]; 2672 step1[19] = step2[19];
2651 temp1 = (-step2[20] + step2[27]) * cospi_16_64; 2673 temp1 = (-step2[20] + step2[27]) * cospi_16_64;
2652 temp2 = (step2[20] + step2[27]) * cospi_16_64; 2674 temp2 = (step2[20] + step2[27]) * cospi_16_64;
2653 step1[20] = WRAPLOW(dct_const_round_shift(temp1)); 2675 step1[20] = WRAPLOW(dct_const_round_shift(temp1), bd);
2654 step1[27] = WRAPLOW(dct_const_round_shift(temp2)); 2676 step1[27] = WRAPLOW(dct_const_round_shift(temp2), bd);
2655 temp1 = (-step2[21] + step2[26]) * cospi_16_64; 2677 temp1 = (-step2[21] + step2[26]) * cospi_16_64;
2656 temp2 = (step2[21] + step2[26]) * cospi_16_64; 2678 temp2 = (step2[21] + step2[26]) * cospi_16_64;
2657 step1[21] = WRAPLOW(dct_const_round_shift(temp1)); 2679 step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd);
2658 step1[26] = WRAPLOW(dct_const_round_shift(temp2)); 2680 step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd);
2659 temp1 = (-step2[22] + step2[25]) * cospi_16_64; 2681 temp1 = (-step2[22] + step2[25]) * cospi_16_64;
2660 temp2 = (step2[22] + step2[25]) * cospi_16_64; 2682 temp2 = (step2[22] + step2[25]) * cospi_16_64;
2661 step1[22] = WRAPLOW(dct_const_round_shift(temp1)); 2683 step1[22] = WRAPLOW(dct_const_round_shift(temp1), bd);
2662 step1[25] = WRAPLOW(dct_const_round_shift(temp2)); 2684 step1[25] = WRAPLOW(dct_const_round_shift(temp2), bd);
2663 temp1 = (-step2[23] + step2[24]) * cospi_16_64; 2685 temp1 = (-step2[23] + step2[24]) * cospi_16_64;
2664 temp2 = (step2[23] + step2[24]) * cospi_16_64; 2686 temp2 = (step2[23] + step2[24]) * cospi_16_64;
2665 step1[23] = WRAPLOW(dct_const_round_shift(temp1)); 2687 step1[23] = WRAPLOW(dct_const_round_shift(temp1), bd);
2666 step1[24] = WRAPLOW(dct_const_round_shift(temp2)); 2688 step1[24] = WRAPLOW(dct_const_round_shift(temp2), bd);
2667 step1[28] = step2[28]; 2689 step1[28] = step2[28];
2668 step1[29] = step2[29]; 2690 step1[29] = step2[29];
2669 step1[30] = step2[30]; 2691 step1[30] = step2[30];
2670 step1[31] = step2[31]; 2692 step1[31] = step2[31];
2671 2693
2672 // final stage 2694 // final stage
2673 output[0] = WRAPLOW(step1[0] + step1[31]); 2695 output[0] = WRAPLOW(step1[0] + step1[31], bd);
2674 output[1] = WRAPLOW(step1[1] + step1[30]); 2696 output[1] = WRAPLOW(step1[1] + step1[30], bd);
2675 output[2] = WRAPLOW(step1[2] + step1[29]); 2697 output[2] = WRAPLOW(step1[2] + step1[29], bd);
2676 output[3] = WRAPLOW(step1[3] + step1[28]); 2698 output[3] = WRAPLOW(step1[3] + step1[28], bd);
2677 output[4] = WRAPLOW(step1[4] + step1[27]); 2699 output[4] = WRAPLOW(step1[4] + step1[27], bd);
2678 output[5] = WRAPLOW(step1[5] + step1[26]); 2700 output[5] = WRAPLOW(step1[5] + step1[26], bd);
2679 output[6] = WRAPLOW(step1[6] + step1[25]); 2701 output[6] = WRAPLOW(step1[6] + step1[25], bd);
2680 output[7] = WRAPLOW(step1[7] + step1[24]); 2702 output[7] = WRAPLOW(step1[7] + step1[24], bd);
2681 output[8] = WRAPLOW(step1[8] + step1[23]); 2703 output[8] = WRAPLOW(step1[8] + step1[23], bd);
2682 output[9] = WRAPLOW(step1[9] + step1[22]); 2704 output[9] = WRAPLOW(step1[9] + step1[22], bd);
2683 output[10] = WRAPLOW(step1[10] + step1[21]); 2705 output[10] = WRAPLOW(step1[10] + step1[21], bd);
2684 output[11] = WRAPLOW(step1[11] + step1[20]); 2706 output[11] = WRAPLOW(step1[11] + step1[20], bd);
2685 output[12] = WRAPLOW(step1[12] + step1[19]); 2707 output[12] = WRAPLOW(step1[12] + step1[19], bd);
2686 output[13] = WRAPLOW(step1[13] + step1[18]); 2708 output[13] = WRAPLOW(step1[13] + step1[18], bd);
2687 output[14] = WRAPLOW(step1[14] + step1[17]); 2709 output[14] = WRAPLOW(step1[14] + step1[17], bd);
2688 output[15] = WRAPLOW(step1[15] + step1[16]); 2710 output[15] = WRAPLOW(step1[15] + step1[16], bd);
2689 output[16] = WRAPLOW(step1[15] - step1[16]); 2711 output[16] = WRAPLOW(step1[15] - step1[16], bd);
2690 output[17] = WRAPLOW(step1[14] - step1[17]); 2712 output[17] = WRAPLOW(step1[14] - step1[17], bd);
2691 output[18] = WRAPLOW(step1[13] - step1[18]); 2713 output[18] = WRAPLOW(step1[13] - step1[18], bd);
2692 output[19] = WRAPLOW(step1[12] - step1[19]); 2714 output[19] = WRAPLOW(step1[12] - step1[19], bd);
2693 output[20] = WRAPLOW(step1[11] - step1[20]); 2715 output[20] = WRAPLOW(step1[11] - step1[20], bd);
2694 output[21] = WRAPLOW(step1[10] - step1[21]); 2716 output[21] = WRAPLOW(step1[10] - step1[21], bd);
2695 output[22] = WRAPLOW(step1[9] - step1[22]); 2717 output[22] = WRAPLOW(step1[9] - step1[22], bd);
2696 output[23] = WRAPLOW(step1[8] - step1[23]); 2718 output[23] = WRAPLOW(step1[8] - step1[23], bd);
2697 output[24] = WRAPLOW(step1[7] - step1[24]); 2719 output[24] = WRAPLOW(step1[7] - step1[24], bd);
2698 output[25] = WRAPLOW(step1[6] - step1[25]); 2720 output[25] = WRAPLOW(step1[6] - step1[25], bd);
2699 output[26] = WRAPLOW(step1[5] - step1[26]); 2721 output[26] = WRAPLOW(step1[5] - step1[26], bd);
2700 output[27] = WRAPLOW(step1[4] - step1[27]); 2722 output[27] = WRAPLOW(step1[4] - step1[27], bd);
2701 output[28] = WRAPLOW(step1[3] - step1[28]); 2723 output[28] = WRAPLOW(step1[3] - step1[28], bd);
2702 output[29] = WRAPLOW(step1[2] - step1[29]); 2724 output[29] = WRAPLOW(step1[2] - step1[29], bd);
2703 output[30] = WRAPLOW(step1[1] - step1[30]); 2725 output[30] = WRAPLOW(step1[1] - step1[30], bd);
2704 output[31] = WRAPLOW(step1[0] - step1[31]); 2726 output[31] = WRAPLOW(step1[0] - step1[31], bd);
2705 } 2727 }
2706 2728
2707 void vp9_high_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, 2729 void vp9_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
2708 int stride, int bd) { 2730 int stride, int bd) {
2709 tran_low_t out[32 * 32]; 2731 tran_low_t out[32 * 32];
2710 tran_low_t *outptr = out; 2732 tran_low_t *outptr = out;
2711 int i, j; 2733 int i, j;
2712 tran_low_t temp_in[32], temp_out[32]; 2734 tran_low_t temp_in[32], temp_out[32];
2713 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 2735 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2714 2736
2715 // Rows 2737 // Rows
2716 for (i = 0; i < 32; ++i) { 2738 for (i = 0; i < 32; ++i) {
2717 tran_low_t zero_coeff[16]; 2739 tran_low_t zero_coeff[16];
2718 for (j = 0; j < 16; ++j) 2740 for (j = 0; j < 16; ++j)
2719 zero_coeff[j] = input[2 * j] | input[2 * j + 1]; 2741 zero_coeff[j] = input[2 * j] | input[2 * j + 1];
2720 for (j = 0; j < 8; ++j) 2742 for (j = 0; j < 8; ++j)
2721 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; 2743 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
2722 for (j = 0; j < 4; ++j) 2744 for (j = 0; j < 4; ++j)
2723 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; 2745 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
2724 for (j = 0; j < 2; ++j) 2746 for (j = 0; j < 2; ++j)
2725 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; 2747 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
2726 2748
2727 if (zero_coeff[0] | zero_coeff[1]) 2749 if (zero_coeff[0] | zero_coeff[1])
2728 high_idct32(input, outptr, bd); 2750 highbd_idct32(input, outptr, bd);
2729 else 2751 else
2730 vpx_memset(outptr, 0, sizeof(tran_low_t) * 32); 2752 vpx_memset(outptr, 0, sizeof(tran_low_t) * 32);
2731 input += 32; 2753 input += 32;
2732 outptr += 32; 2754 outptr += 32;
2733 } 2755 }
2734 2756
2735 // Columns 2757 // Columns
2736 for (i = 0; i < 32; ++i) { 2758 for (i = 0; i < 32; ++i) {
2737 for (j = 0; j < 32; ++j) 2759 for (j = 0; j < 32; ++j)
2738 temp_in[j] = out[j * 32 + i]; 2760 temp_in[j] = out[j * 32 + i];
2739 high_idct32(temp_in, temp_out, bd); 2761 highbd_idct32(temp_in, temp_out, bd);
2740 for (j = 0; j < 32; ++j) 2762 for (j = 0; j < 32; ++j) {
2741 dest[j * stride + i] = clip_pixel_bd_high( 2763 dest[j * stride + i] = highbd_clip_pixel_add(
2742 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); 2764 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2765 }
2743 } 2766 }
2744 } 2767 }
2745 2768
2746 void vp9_high_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8, 2769 void vp9_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
2747 int stride, int bd) { 2770 int stride, int bd) {
2748 tran_low_t out[32 * 32] = {0}; 2771 tran_low_t out[32 * 32] = {0};
2749 tran_low_t *outptr = out; 2772 tran_low_t *outptr = out;
2750 int i, j; 2773 int i, j;
2751 tran_low_t temp_in[32], temp_out[32]; 2774 tran_low_t temp_in[32], temp_out[32];
2752 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 2775 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2753 2776
2754 // Rows 2777 // Rows
2755 // Only upper-left 8x8 has non-zero coeff. 2778 // Only upper-left 8x8 has non-zero coeff.
2756 for (i = 0; i < 8; ++i) { 2779 for (i = 0; i < 8; ++i) {
2757 high_idct32(input, outptr, bd); 2780 highbd_idct32(input, outptr, bd);
2758 input += 32; 2781 input += 32;
2759 outptr += 32; 2782 outptr += 32;
2760 } 2783 }
2761 // Columns 2784 // Columns
2762 for (i = 0; i < 32; ++i) { 2785 for (i = 0; i < 32; ++i) {
2763 for (j = 0; j < 32; ++j) 2786 for (j = 0; j < 32; ++j)
2764 temp_in[j] = out[j * 32 + i]; 2787 temp_in[j] = out[j * 32 + i];
2765 high_idct32(temp_in, temp_out, bd); 2788 highbd_idct32(temp_in, temp_out, bd);
2766 for (j = 0; j < 32; ++j) 2789 for (j = 0; j < 32; ++j) {
2767 dest[j * stride + i] = clip_pixel_bd_high( 2790 dest[j * stride + i] = highbd_clip_pixel_add(
2768 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); 2791 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2792 }
2769 } 2793 }
2770 } 2794 }
2771 2795
2772 void vp9_high_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8, 2796 void vp9_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
2773 int stride, int bd) { 2797 int stride, int bd) {
2774 int i, j; 2798 int i, j;
2775 int a1; 2799 int a1;
2776 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 2800 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2777 2801
2778 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); 2802 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
2779 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); 2803 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
2780 a1 = ROUND_POWER_OF_TWO(out, 6); 2804 a1 = ROUND_POWER_OF_TWO(out, 6);
2781 2805
2782 for (j = 0; j < 32; ++j) { 2806 for (j = 0; j < 32; ++j) {
2783 for (i = 0; i < 32; ++i) 2807 for (i = 0; i < 32; ++i)
2784 dest[i] = clip_pixel_bd_high(dest[i], a1, bd); 2808 dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
2785 dest += stride; 2809 dest += stride;
2786 } 2810 }
2787 } 2811 }
2788 2812
2789 // idct 2813 // idct
2790 void vp9_high_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, 2814 void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
2791 int eob, int bd) { 2815 int eob, int bd) {
2792 if (eob > 1) 2816 if (eob > 1)
2793 vp9_high_idct4x4_16_add(input, dest, stride, bd); 2817 vp9_highbd_idct4x4_16_add(input, dest, stride, bd);
2794 else 2818 else
2795 vp9_high_idct4x4_1_add(input, dest, stride, bd); 2819 vp9_highbd_idct4x4_1_add(input, dest, stride, bd);
2796 } 2820 }
2797 2821
2798 2822
2799 void vp9_high_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, 2823 void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
2800 int eob, int bd) { 2824 int eob, int bd) {
2801 if (eob > 1) 2825 if (eob > 1)
2802 vp9_high_iwht4x4_16_add(input, dest, stride, bd); 2826 vp9_highbd_iwht4x4_16_add(input, dest, stride, bd);
2803 else 2827 else
2804 vp9_high_iwht4x4_1_add(input, dest, stride, bd); 2828 vp9_highbd_iwht4x4_1_add(input, dest, stride, bd);
2805 } 2829 }
2806 2830
2807 void vp9_high_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, 2831 void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
2808 int eob, int bd) { 2832 int eob, int bd) {
2809 // If dc is 1, then input[0] is the reconstructed value, do not need 2833 // If dc is 1, then input[0] is the reconstructed value, do not need
2810 // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1. 2834 // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
2811 2835
2812 // The calculation can be simplified if there are not many non-zero dct 2836 // The calculation can be simplified if there are not many non-zero dct
2813 // coefficients. Use eobs to decide what to do. 2837 // coefficients. Use eobs to decide what to do.
2814 // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c. 2838 // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
2815 // Combine that with code here. 2839 // Combine that with code here.
2816 // DC only DCT coefficient 2840 // DC only DCT coefficient
2817 if (eob == 1) { 2841 if (eob == 1) {
2818 vp9_high_idct8x8_1_add(input, dest, stride, bd); 2842 vp9_highbd_idct8x8_1_add(input, dest, stride, bd);
2819 } else if (eob <= 10) { 2843 } else if (eob <= 10) {
2820 vp9_high_idct8x8_10_add(input, dest, stride, bd); 2844 vp9_highbd_idct8x8_10_add(input, dest, stride, bd);
2821 } else { 2845 } else {
2822 vp9_high_idct8x8_64_add(input, dest, stride, bd); 2846 vp9_highbd_idct8x8_64_add(input, dest, stride, bd);
2823 } 2847 }
2824 } 2848 }
2825 2849
2826 void vp9_high_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride, 2850 void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest,
2827 int eob, int bd) { 2851 int stride, int eob, int bd) {
2828 // The calculation can be simplified if there are not many non-zero dct 2852 // The calculation can be simplified if there are not many non-zero dct
2829 // coefficients. Use eobs to separate different cases. 2853 // coefficients. Use eobs to separate different cases.
2830 // DC only DCT coefficient. 2854 // DC only DCT coefficient.
2831 if (eob == 1) { 2855 if (eob == 1) {
2832 vp9_high_idct16x16_1_add(input, dest, stride, bd); 2856 vp9_highbd_idct16x16_1_add(input, dest, stride, bd);
2833 } else if (eob <= 10) { 2857 } else if (eob <= 10) {
2834 vp9_high_idct16x16_10_add(input, dest, stride, bd); 2858 vp9_highbd_idct16x16_10_add(input, dest, stride, bd);
2835 } else { 2859 } else {
2836 vp9_high_idct16x16_256_add(input, dest, stride, bd); 2860 vp9_highbd_idct16x16_256_add(input, dest, stride, bd);
2837 } 2861 }
2838 } 2862 }
2839 2863
2840 void vp9_high_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride, 2864 void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest,
2841 int eob, int bd) { 2865 int stride, int eob, int bd) {
2842 // Non-zero coeff only in upper-left 8x8 2866 // Non-zero coeff only in upper-left 8x8
2843 if (eob == 1) { 2867 if (eob == 1) {
2844 vp9_high_idct32x32_1_add(input, dest, stride, bd); 2868 vp9_highbd_idct32x32_1_add(input, dest, stride, bd);
2845 } else if (eob <= 34) { 2869 } else if (eob <= 34) {
2846 vp9_high_idct32x32_34_add(input, dest, stride, bd); 2870 vp9_highbd_idct32x32_34_add(input, dest, stride, bd);
2847 } else { 2871 } else {
2848 vp9_high_idct32x32_1024_add(input, dest, stride, bd); 2872 vp9_highbd_idct32x32_1024_add(input, dest, stride, bd);
2849 } 2873 }
2850 } 2874 }
2851 2875
2852 // iht 2876 // iht
2853 void vp9_high_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, 2877 void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
2854 uint8_t *dest, int stride, int eob, int bd) { 2878 uint8_t *dest, int stride, int eob, int bd) {
2855 if (tx_type == DCT_DCT) 2879 if (tx_type == DCT_DCT)
2856 vp9_high_idct4x4_add(input, dest, stride, eob, bd); 2880 vp9_highbd_idct4x4_add(input, dest, stride, eob, bd);
2857 else 2881 else
2858 vp9_high_iht4x4_16_add(input, dest, stride, tx_type, bd); 2882 vp9_highbd_iht4x4_16_add(input, dest, stride, tx_type, bd);
2859 } 2883 }
2860 2884
2861 void vp9_high_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, 2885 void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
2862 uint8_t *dest, int stride, int eob, int bd) { 2886 uint8_t *dest, int stride, int eob, int bd) {
2863 if (tx_type == DCT_DCT) { 2887 if (tx_type == DCT_DCT) {
2864 vp9_high_idct8x8_add(input, dest, stride, eob, bd); 2888 vp9_highbd_idct8x8_add(input, dest, stride, eob, bd);
2865 } else { 2889 } else {
2866 vp9_high_iht8x8_64_add(input, dest, stride, tx_type, bd); 2890 vp9_highbd_iht8x8_64_add(input, dest, stride, tx_type, bd);
2867 } 2891 }
2868 } 2892 }
2869 2893
2870 void vp9_high_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, 2894 void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
2871 uint8_t *dest, int stride, int eob, int bd) { 2895 uint8_t *dest, int stride, int eob, int bd) {
2872 if (tx_type == DCT_DCT) { 2896 if (tx_type == DCT_DCT) {
2873 vp9_high_idct16x16_add(input, dest, stride, eob, bd); 2897 vp9_highbd_idct16x16_add(input, dest, stride, eob, bd);
2874 } else { 2898 } else {
2875 vp9_high_iht16x16_256_add(input, dest, stride, tx_type, bd); 2899 vp9_highbd_iht16x16_256_add(input, dest, stride, tx_type, bd);
2876 } 2900 }
2877 } 2901 }
2878 #endif // CONFIG_VP9_HIGHBITDEPTH 2902 #endif // CONFIG_VP9_HIGHBITDEPTH
OLDNEW
« no previous file with comments | « source/libvpx/vp9/common/vp9_idct.h ('k') | source/libvpx/vp9/common/vp9_loopfilter.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698