jidctflt.c - Issue 1953443002: Update to libjpeg_turbo 1.4.90

Side by Side Diff: jidctflt.c

Issue 1953443002: Update to libjpeg_turbo 1.4.90 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libjpeg_turbo.git@master

Patch Set: Created 4 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * jidctflt.c	2 * jidctflt.c

3 *	3 *

	4 * This file was part of the Independent JPEG Group's software:

4 * Copyright (C) 1994-1998, Thomas G. Lane.	5 * Copyright (C) 1994-1998, Thomas G. Lane.

5 * This file is part of the Independent JPEG Group's software.	6 * Modified 2010 by Guido Vollbeding.

6 * For conditions of distribution and use, see the accompanying README file.	7 * libjpeg-turbo Modifications:

	8 * Copyright (C) 2014, D. R. Commander.

	9 * For conditions of distribution and use, see the accompanying README.ijg

	10 * file.

7 *	11 *

8 * This file contains a floating-point implementation of the	12 * This file contains a floating-point implementation of the

9 * inverse DCT (Discrete Cosine Transform). In the IJG code, this routine	13 * inverse DCT (Discrete Cosine Transform). In the IJG code, this routine

10 * must also perform dequantization of the input coefficients.	14 * must also perform dequantization of the input coefficients.

11 *	15 *

12 * This implementation should be more accurate than either of the integer	16 * This implementation should be more accurate than either of the integer

13 * IDCT implementations. However, it may not give the same results on all	17 * IDCT implementations. However, it may not give the same results on all

14 * machines because of differences in roundoff behavior. Speed will depend	18 * machines because of differences in roundoff behavior. Speed will depend

15 * on the hardware's floating point capacity.	19 * on the hardware's floating point capacity.

16 *	20 *

17 * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT	21 * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT

18 * on each row (or vice versa, but it's more convenient to emit a row at	22 * on each row (or vice versa, but it's more convenient to emit a row at

19 * a time). Direct algorithms are also available, but they are much more	23 * a time). Direct algorithms are also available, but they are much more

20 * complex and seem not to be any faster when reduced to code.	24 * complex and seem not to be any faster when reduced to code.

21 *	25 *

22 * This implementation is based on Arai, Agui, and Nakajima's algorithm for	26 * This implementation is based on Arai, Agui, and Nakajima's algorithm for

23 * scaled DCT. Their original paper (Trans. IEICE E-71(11):1095) is in	27 * scaled DCT. Their original paper (Trans. IEICE E-71(11):1095) is in

24 * Japanese, but the algorithm is described in the Pennebaker & Mitchell	28 * Japanese, but the algorithm is described in the Pennebaker & Mitchell

25 * JPEG textbook (see REFERENCES section in file README). The following code	29 * JPEG textbook (see REFERENCES section in file README.ijg). The following

26 * is based directly on figure 4-8 in P&M.	30 * code is based directly on figure 4-8 in P&M.

27 * While an 8-point DCT cannot be done in less than 11 multiplies, it is	31 * While an 8-point DCT cannot be done in less than 11 multiplies, it is

28 * possible to arrange the computation so that many of the multiplies are	32 * possible to arrange the computation so that many of the multiplies are

29 * simple scalings of the final outputs. These multiplies can then be	33 * simple scalings of the final outputs. These multiplies can then be

30 * folded into the multiplications or divisions by the JPEG quantization	34 * folded into the multiplications or divisions by the JPEG quantization

31 * table entries. The AA&N method leaves only 5 multiplies and 29 adds	35 * table entries. The AA&N method leaves only 5 multiplies and 29 adds

32 * to be done in the DCT itself.	36 * to be done in the DCT itself.

33 * The primary disadvantage of this method is that with a fixed-point	37 * The primary disadvantage of this method is that with a fixed-point

34 * implementation, accuracy is lost due to imprecise representation of the	38 * implementation, accuracy is lost due to imprecise representation of the

35 * scaled quantization values. However, that problem does not arise if	39 * scaled quantization values. However, that problem does not arise if

36 * we use floating point arithmetic.	40 * we use floating point arithmetic.

37 */	41 */

38	42

39 #define JPEG_INTERNALS	43 #define JPEG_INTERNALS

40 #include "jinclude.h"	44 #include "jinclude.h"

41 #include "jpeglib.h"	45 #include "jpeglib.h"

42 #include "jdct.h"» » /* Private declarations for DCT subsystem */	46 #include "jdct.h" /* Private declarations for DCT subsystem */

43	47

44 #ifdef DCT_FLOAT_SUPPORTED	48 #ifdef DCT_FLOAT_SUPPORTED

45	49

46	50

47 /*	51 /*

48 * This module is specialized to the case DCTSIZE = 8.	52 * This module is specialized to the case DCTSIZE = 8.

49 */	53 */

50	54

51 #if DCTSIZE != 8	55 #if DCTSIZE != 8

52 Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */	56 Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */

53 #endif	57 #endif

54	58

55	59

56 /* Dequantize a coefficient by multiplying it by the multiplier-table	60 /* Dequantize a coefficient by multiplying it by the multiplier-table

57 * entry; produce a float result.	61 * entry; produce a float result.

58 */	62 */

59	63

60 #define DEQUANTIZE(coef,quantval) (((FAST_FLOAT) (coef)) * (quantval))	64 #define DEQUANTIZE(coef,quantval) (((FAST_FLOAT) (coef)) * (quantval))

61	65

62	66

63 /*	67 /*

64 * Perform dequantization and inverse DCT on one block of coefficients.	68 * Perform dequantization and inverse DCT on one block of coefficients.

65 */	69 */

66	70

67 GLOBAL(void)	71 GLOBAL(void)

68 jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,	72 jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,

69 » » JCOEFPTR coef_block,	73 JCOEFPTR coef_block,

70 » » JSAMPARRAY output_buf, JDIMENSION output_col)	74 JSAMPARRAY output_buf, JDIMENSION output_col)

71 {	75 {

72 FAST_FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;	76 FAST_FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

73 FAST_FLOAT tmp10, tmp11, tmp12, tmp13;	77 FAST_FLOAT tmp10, tmp11, tmp12, tmp13;

74 FAST_FLOAT z5, z10, z11, z12, z13;	78 FAST_FLOAT z5, z10, z11, z12, z13;

75 JCOEFPTR inptr;	79 JCOEFPTR inptr;

76 FLOAT_MULT_TYPE * quantptr;	80 FLOAT_MULT_TYPE *quantptr;

77 FAST_FLOAT * wsptr;	81 FAST_FLOAT *wsptr;

78 JSAMPROW outptr;	82 JSAMPROW outptr;

79 JSAMPLE *range_limit = IDCT_range_limit(cinfo);	83 JSAMPLE *range_limit = cinfo->sample_range_limit;

80 int ctr;	84 int ctr;

81 FAST_FLOAT workspace[DCTSIZE2]; /* buffers data between passes */	85 FAST_FLOAT workspace[DCTSIZE2]; /* buffers data between passes */

82 SHIFT_TEMPS	86 #define _0_125 ((FLOAT_MULT_TYPE)0.125)

83	87

84 /* Pass 1: process columns from input, store into work array. */	88 /* Pass 1: process columns from input, store into work array. */

85	89

86 inptr = coef_block;	90 inptr = coef_block;

87 quantptr = (FLOAT_MULT_TYPE *) compptr->dct_table;	91 quantptr = (FLOAT_MULT_TYPE *) compptr->dct_table;

88 wsptr = workspace;	92 wsptr = workspace;

89 for (ctr = DCTSIZE; ctr > 0; ctr--) {	93 for (ctr = DCTSIZE; ctr > 0; ctr--) {

90 /* Due to quantization, we will usually find that many of the input	94 /* Due to quantization, we will usually find that many of the input

91 * coefficients are zero, especially the AC terms. We can exploit this	95 * coefficients are zero, especially the AC terms. We can exploit this

92 * by short-circuiting the IDCT calculation for any column in which all	96 * by short-circuiting the IDCT calculation for any column in which all

93 * the AC terms are zero. In that case each output is equal to the	97 * the AC terms are zero. In that case each output is equal to the

94 * DC coefficient (with scale factor as needed).	98 * DC coefficient (with scale factor as needed).

95 * With typical images and quantization tables, half or more of the	99 * With typical images and quantization tables, half or more of the

96 * column DCT calculations can be simplified this way.	100 * column DCT calculations can be simplified this way.

97 */	101 */

98	102

99 if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&	103 if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&

100 » inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&	104 inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&

101 » inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&	105 inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&

102 » inptr[DCTSIZE*7] == 0) {	106 inptr[DCTSIZE*7] == 0) {

103 /* AC terms all zero */	107 /* AC terms all zero */

104 FAST_FLOAT dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);	108 FAST_FLOAT dcval = DEQUANTIZE(inptr[DCTSIZE*0],

105	109 quantptr[DCTSIZE*0] * _0_125);

	110

106 wsptr[DCTSIZE*0] = dcval;	111 wsptr[DCTSIZE*0] = dcval;

107 wsptr[DCTSIZE*1] = dcval;	112 wsptr[DCTSIZE*1] = dcval;

108 wsptr[DCTSIZE*2] = dcval;	113 wsptr[DCTSIZE*2] = dcval;

109 wsptr[DCTSIZE*3] = dcval;	114 wsptr[DCTSIZE*3] = dcval;

110 wsptr[DCTSIZE*4] = dcval;	115 wsptr[DCTSIZE*4] = dcval;

111 wsptr[DCTSIZE*5] = dcval;	116 wsptr[DCTSIZE*5] = dcval;

112 wsptr[DCTSIZE*6] = dcval;	117 wsptr[DCTSIZE*6] = dcval;

113 wsptr[DCTSIZE*7] = dcval;	118 wsptr[DCTSIZE*7] = dcval;

114	119

115 inptr++;» » » /* advance pointers to next column */	120 inptr++; /* advance pointers to next column */

116 quantptr++;	121 quantptr++;

117 wsptr++;	122 wsptr++;

118 continue;	123 continue;

119 }	124 }

120	125

121 /* Even part */	126 /* Even part */

122	127

123 tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);	128 tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0] * _0_125);

124 tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);	129 tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2] * _0_125);

125 tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);	130 tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4] * _0_125);

126 tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);	131 tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6] * _0_125);

127	132

128 tmp10 = tmp0 + tmp2;» /* phase 3 */	133 tmp10 = tmp0 + tmp2; /* phase 3 */

129 tmp11 = tmp0 - tmp2;	134 tmp11 = tmp0 - tmp2;

130	135

131 tmp13 = tmp1 + tmp3;» /* phases 5-3 */	136 tmp13 = tmp1 + tmp3; /* phases 5-3 */

132 tmp12 = (tmp1 - tmp3) * ((FAST_FLOAT) 1.414213562) - tmp13; /* 2*c4 */	137 tmp12 = (tmp1 - tmp3) * ((FAST_FLOAT) 1.414213562) - tmp13; /* 2*c4 */

133	138

134 tmp0 = tmp10 + tmp13;» /* phase 2 */	139 tmp0 = tmp10 + tmp13; /* phase 2 */

135 tmp3 = tmp10 - tmp13;	140 tmp3 = tmp10 - tmp13;

136 tmp1 = tmp11 + tmp12;	141 tmp1 = tmp11 + tmp12;

137 tmp2 = tmp11 - tmp12;	142 tmp2 = tmp11 - tmp12;

138	143

139 /* Odd part */	144 /* Odd part */

140	145

141 tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);	146 tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1] * _0_125);

142 tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);	147 tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3] * _0_125);

143 tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);	148 tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5] * _0_125);

144 tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);	149 tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7] * _0_125);

145	150

146 z13 = tmp6 + tmp5;» » /* phase 6 */	151 z13 = tmp6 + tmp5; /* phase 6 */

147 z10 = tmp6 - tmp5;	152 z10 = tmp6 - tmp5;

148 z11 = tmp4 + tmp7;	153 z11 = tmp4 + tmp7;

149 z12 = tmp4 - tmp7;	154 z12 = tmp4 - tmp7;

150	155

151 tmp7 = z11 + z13;» » /* phase 5 */	156 tmp7 = z11 + z13; /* phase 5 */

152 tmp11 = (z11 - z13) * ((FAST_FLOAT) 1.414213562); /* 2*c4 */	157 tmp11 = (z11 - z13) * ((FAST_FLOAT) 1.414213562); /* 2*c4 */

153	158

154 z5 = (z10 + z12) * ((FAST_FLOAT) 1.847759065); /* 2*c2 */	159 z5 = (z10 + z12) * ((FAST_FLOAT) 1.847759065); /* 2*c2 */

155 tmp10 = ((FAST_FLOAT) 1.082392200) * z12 - z5; /* 2*(c2-c6) */	160 tmp10 = z5 - z12 * ((FAST_FLOAT) 1.082392200); /* 2*(c2-c6) */

156 tmp12 = ((FAST_FLOAT) -2.613125930) * z10 + z5; /* -2*(c2+c6) */	161 tmp12 = z5 - z10 * ((FAST_FLOAT) 2.613125930); /* 2*(c2+c6) */

157	162

158 tmp6 = tmp12 - tmp7;» /* phase 2 */	163 tmp6 = tmp12 - tmp7; /* phase 2 */

159 tmp5 = tmp11 - tmp6;	164 tmp5 = tmp11 - tmp6;

160 tmp4 = tmp10 + tmp5;	165 tmp4 = tmp10 - tmp5;

161	166

162 wsptr[DCTSIZE*0] = tmp0 + tmp7;	167 wsptr[DCTSIZE*0] = tmp0 + tmp7;

163 wsptr[DCTSIZE*7] = tmp0 - tmp7;	168 wsptr[DCTSIZE*7] = tmp0 - tmp7;

164 wsptr[DCTSIZE*1] = tmp1 + tmp6;	169 wsptr[DCTSIZE*1] = tmp1 + tmp6;

165 wsptr[DCTSIZE*6] = tmp1 - tmp6;	170 wsptr[DCTSIZE*6] = tmp1 - tmp6;

166 wsptr[DCTSIZE*2] = tmp2 + tmp5;	171 wsptr[DCTSIZE*2] = tmp2 + tmp5;

167 wsptr[DCTSIZE*5] = tmp2 - tmp5;	172 wsptr[DCTSIZE*5] = tmp2 - tmp5;

168 wsptr[DCTSIZE*4] = tmp3 + tmp4;	173 wsptr[DCTSIZE*3] = tmp3 + tmp4;

169 wsptr[DCTSIZE*3] = tmp3 - tmp4;	174 wsptr[DCTSIZE*4] = tmp3 - tmp4;

170	175

171 inptr++;» » » /* advance pointers to next column */	176 inptr++; /* advance pointers to next column */

172 quantptr++;	177 quantptr++;

173 wsptr++;	178 wsptr++;

174 }	179 }

175	180

176 /* Pass 2: process rows from work array, store into output array. */	181 /* Pass 2: process rows from work array, store into output array. */

177 /* Note that we must descale the results by a factor of 8 == 2**3. */

178	182

179 wsptr = workspace;	183 wsptr = workspace;

180 for (ctr = 0; ctr < DCTSIZE; ctr++) {	184 for (ctr = 0; ctr < DCTSIZE; ctr++) {

181 outptr = output_buf[ctr] + output_col;	185 outptr = output_buf[ctr] + output_col;

182 /* Rows of zeroes can be exploited in the same way as we did with columns.	186 /* Rows of zeroes can be exploited in the same way as we did with columns.

183 * However, the column calculation has created many nonzero AC terms, so	187 * However, the column calculation has created many nonzero AC terms, so

184 * the simplification applies less often (typically 5% to 10% of the time).	188 * the simplification applies less often (typically 5% to 10% of the time).

185 * And testing floats for zero is relatively expensive, so we don't bother.	189 * And testing floats for zero is relatively expensive, so we don't bother.

186 */	190 */

187	191

188 /* Even part */	192 /* Even part */

189	193

190 tmp10 = wsptr[0] + wsptr[4];	194 /* Apply signed->unsigned and prepare float->int conversion */

191 tmp11 = wsptr[0] - wsptr[4];	195 z5 = wsptr[0] + ((FAST_FLOAT) CENTERJSAMPLE + (FAST_FLOAT) 0.5);

	196 tmp10 = z5 + wsptr[4];

	197 tmp11 = z5 - wsptr[4];

192	198

193 tmp13 = wsptr[2] + wsptr[6];	199 tmp13 = wsptr[2] + wsptr[6];

194 tmp12 = (wsptr[2] - wsptr[6]) * ((FAST_FLOAT) 1.414213562) - tmp13;	200 tmp12 = (wsptr[2] - wsptr[6]) * ((FAST_FLOAT) 1.414213562) - tmp13;

195	201

196 tmp0 = tmp10 + tmp13;	202 tmp0 = tmp10 + tmp13;

197 tmp3 = tmp10 - tmp13;	203 tmp3 = tmp10 - tmp13;

198 tmp1 = tmp11 + tmp12;	204 tmp1 = tmp11 + tmp12;

199 tmp2 = tmp11 - tmp12;	205 tmp2 = tmp11 - tmp12;

200	206

201 /* Odd part */	207 /* Odd part */

202	208

203 z13 = wsptr[5] + wsptr[3];	209 z13 = wsptr[5] + wsptr[3];

204 z10 = wsptr[5] - wsptr[3];	210 z10 = wsptr[5] - wsptr[3];

205 z11 = wsptr[1] + wsptr[7];	211 z11 = wsptr[1] + wsptr[7];

206 z12 = wsptr[1] - wsptr[7];	212 z12 = wsptr[1] - wsptr[7];

207	213

208 tmp7 = z11 + z13;	214 tmp7 = z11 + z13;

209 tmp11 = (z11 - z13) * ((FAST_FLOAT) 1.414213562);	215 tmp11 = (z11 - z13) * ((FAST_FLOAT) 1.414213562);

210	216

211 z5 = (z10 + z12) * ((FAST_FLOAT) 1.847759065); /* 2*c2 */	217 z5 = (z10 + z12) * ((FAST_FLOAT) 1.847759065); /* 2*c2 */

212 tmp10 = ((FAST_FLOAT) 1.082392200) * z12 - z5; /* 2*(c2-c6) */	218 tmp10 = z5 - z12 * ((FAST_FLOAT) 1.082392200); /* 2*(c2-c6) */

213 tmp12 = ((FAST_FLOAT) -2.613125930) * z10 + z5; /* -2*(c2+c6) */	219 tmp12 = z5 - z10 * ((FAST_FLOAT) 2.613125930); /* 2*(c2+c6) */

214	220

215 tmp6 = tmp12 - tmp7;	221 tmp6 = tmp12 - tmp7;

216 tmp5 = tmp11 - tmp6;	222 tmp5 = tmp11 - tmp6;

217 tmp4 = tmp10 + tmp5;	223 tmp4 = tmp10 - tmp5;

218	224

219 /* Final output stage: scale down by a factor of 8 and range-limit */	225 /* Final output stage: float->int conversion and range-limit */

220	226

221 outptr[0] = range_limit[(int) DESCALE((INT32) (tmp0 + tmp7), 3)	227 outptr[0] = range_limit[((int) (tmp0 + tmp7)) & RANGE_MASK];

222 » » » & RANGE_MASK];	228 outptr[7] = range_limit[((int) (tmp0 - tmp7)) & RANGE_MASK];

223 outptr[7] = range_limit[(int) DESCALE((INT32) (tmp0 - tmp7), 3)	229 outptr[1] = range_limit[((int) (tmp1 + tmp6)) & RANGE_MASK];

224 » » » & RANGE_MASK];	230 outptr[6] = range_limit[((int) (tmp1 - tmp6)) & RANGE_MASK];

225 outptr[1] = range_limit[(int) DESCALE((INT32) (tmp1 + tmp6), 3)	231 outptr[2] = range_limit[((int) (tmp2 + tmp5)) & RANGE_MASK];

226 » » » & RANGE_MASK];	232 outptr[5] = range_limit[((int) (tmp2 - tmp5)) & RANGE_MASK];

227 outptr[6] = range_limit[(int) DESCALE((INT32) (tmp1 - tmp6), 3)	233 outptr[3] = range_limit[((int) (tmp3 + tmp4)) & RANGE_MASK];

228 » » » & RANGE_MASK];	234 outptr[4] = range_limit[((int) (tmp3 - tmp4)) & RANGE_MASK];

229 outptr[2] = range_limit[(int) DESCALE((INT32) (tmp2 + tmp5), 3)	235

230 » » » & RANGE_MASK];	236 wsptr += DCTSIZE; /* advance pointer to next row */

231 outptr[5] = range_limit[(int) DESCALE((INT32) (tmp2 - tmp5), 3)

232 » » » & RANGE_MASK];

233 outptr[4] = range_limit[(int) DESCALE((INT32) (tmp3 + tmp4), 3)

234 » » » & RANGE_MASK];

235 outptr[3] = range_limit[(int) DESCALE((INT32) (tmp3 - tmp4), 3)

236 » » » & RANGE_MASK];

237

238 wsptr += DCTSIZE;» » /* advance pointer to next row */

239 }	237 }

240 }	238 }

241	239

242 #endif /* DCT_FLOAT_SUPPORTED */	240 #endif /* DCT_FLOAT_SUPPORTED */

OLD	NEW

« no previous file with comments | « jfdctint.c ('k') | jidctfst.c » ('j') | no next file with comments »