OLD | NEW |
| (Empty) |
1 /* | |
2 * jidctfst.c | |
3 * | |
4 * Copyright (C) 1994-1998, Thomas G. Lane. | |
5 * This file is part of the Independent JPEG Group's software. | |
6 * For conditions of distribution and use, see the accompanying README file. | |
7 * | |
8 * This file contains a fast, not so accurate integer implementation of the | |
9 * inverse DCT (Discrete Cosine Transform). In the IJG code, this routine | |
10 * must also perform dequantization of the input coefficients. | |
11 * | |
12 * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT | |
13 * on each row (or vice versa, but it's more convenient to emit a row at | |
14 * a time). Direct algorithms are also available, but they are much more | |
15 * complex and seem not to be any faster when reduced to code. | |
16 * | |
17 * This implementation is based on Arai, Agui, and Nakajima's algorithm for | |
18 * scaled DCT. Their original paper (Trans. IEICE E-71(11):1095) is in | |
19 * Japanese, but the algorithm is described in the Pennebaker & Mitchell | |
20 * JPEG textbook (see REFERENCES section in file README). The following code | |
21 * is based directly on figure 4-8 in P&M. | |
22 * While an 8-point DCT cannot be done in less than 11 multiplies, it is | |
23 * possible to arrange the computation so that many of the multiplies are | |
24 * simple scalings of the final outputs. These multiplies can then be | |
25 * folded into the multiplications or divisions by the JPEG quantization | |
26 * table entries. The AA&N method leaves only 5 multiplies and 29 adds | |
27 * to be done in the DCT itself. | |
28 * The primary disadvantage of this method is that with fixed-point math, | |
29 * accuracy is lost due to imprecise representation of the scaled | |
30 * quantization values. The smaller the quantization table entry, the less | |
31 * precise the scaled value, so this implementation does worse with high- | |
32 * quality-setting files than with low-quality ones. | |
33 */ | |
34 | |
35 #define JPEG_INTERNALS | |
36 #include "jinclude.h" | |
37 #include "jpeglib.h" | |
38 #include "jdct.h" /* Private declarations for DCT subsystem */ | |
39 | |
40 #ifdef DCT_IFAST_SUPPORTED | |
41 | |
42 | |
43 /* | |
44 * This module is specialized to the case DCTSIZE = 8. | |
45 */ | |
46 | |
47 #if DCTSIZE != 8 | |
48 Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */ | |
49 #endif | |
50 | |
51 | |
52 /* Scaling decisions are generally the same as in the LL&M algorithm; | |
53 * see jidctint.c for more details. However, we choose to descale | |
54 * (right shift) multiplication products as soon as they are formed, | |
55 * rather than carrying additional fractional bits into subsequent additions. | |
56 * This compromises accuracy slightly, but it lets us save a few shifts. | |
57 * More importantly, 16-bit arithmetic is then adequate (for 8-bit samples) | |
58 * everywhere except in the multiplications proper; this saves a good deal | |
59 * of work on 16-bit-int machines. | |
60 * | |
61 * The dequantized coefficients are not integers because the AA&N scaling | |
62 * factors have been incorporated. We represent them scaled up by PASS1_BITS, | |
63 * so that the first and second IDCT rounds have the same input scaling. | |
64 * For 8-bit JSAMPLEs, we choose IFAST_SCALE_BITS = PASS1_BITS so as to | |
65 * avoid a descaling shift; this compromises accuracy rather drastically | |
66 * for small quantization table entries, but it saves a lot of shifts. | |
67 * For 12-bit JSAMPLEs, there's no hope of using 16x16 multiplies anyway, | |
68 * so we use a much larger scaling factor to preserve accuracy. | |
69 * | |
70 * A final compromise is to represent the multiplicative constants to only | |
71 * 8 fractional bits, rather than 13. This saves some shifting work on some | |
72 * machines, and may also reduce the cost of multiplication (since there | |
73 * are fewer one-bits in the constants). | |
74 */ | |
75 | |
76 #if BITS_IN_JSAMPLE == 8 | |
77 #define CONST_BITS 8 | |
78 #define PASS1_BITS 2 | |
79 #else | |
80 #define CONST_BITS 8 | |
81 #define PASS1_BITS 1 /* lose a little precision to avoid overflow */ | |
82 #endif | |
83 | |
84 /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus | |
85 * causing a lot of useless floating-point operations at run time. | |
86 * To get around this we use the following pre-calculated constants. | |
87 * If you change CONST_BITS you may want to add appropriate values. | |
88 * (With a reasonable C compiler, you can just rely on the FIX() macro...) | |
89 */ | |
90 | |
91 #if CONST_BITS == 8 | |
92 #define FIX_1_082392200 ((INT32) 277) /* FIX(1.082392200) */ | |
93 #define FIX_1_414213562 ((INT32) 362) /* FIX(1.414213562) */ | |
94 #define FIX_1_847759065 ((INT32) 473) /* FIX(1.847759065) */ | |
95 #define FIX_2_613125930 ((INT32) 669) /* FIX(2.613125930) */ | |
96 #else | |
97 #define FIX_1_082392200 FIX(1.082392200) | |
98 #define FIX_1_414213562 FIX(1.414213562) | |
99 #define FIX_1_847759065 FIX(1.847759065) | |
100 #define FIX_2_613125930 FIX(2.613125930) | |
101 #endif | |
102 | |
103 | |
104 /* We can gain a little more speed, with a further compromise in accuracy, | |
105 * by omitting the addition in a descaling shift. This yields an incorrectly | |
106 * rounded result half the time... | |
107 */ | |
108 | |
109 #ifndef USE_ACCURATE_ROUNDING | |
110 #undef DESCALE | |
111 #define DESCALE(x,n) RIGHT_SHIFT(x, n) | |
112 #endif | |
113 | |
114 | |
115 /* Multiply a DCTELEM variable by an INT32 constant, and immediately | |
116 * descale to yield a DCTELEM result. | |
117 */ | |
118 | |
119 #define MULTIPLY(var,const) ((DCTELEM) DESCALE((var) * (const), CONST_BITS)) | |
120 | |
121 | |
122 /* Dequantize a coefficient by multiplying it by the multiplier-table | |
123 * entry; produce a DCTELEM result. For 8-bit data a 16x16->16 | |
124 * multiplication will do. For 12-bit data, the multiplier table is | |
125 * declared INT32, so a 32-bit multiply will be used. | |
126 */ | |
127 | |
128 #if BITS_IN_JSAMPLE == 8 | |
129 #define DEQUANTIZE(coef,quantval) (((IFAST_MULT_TYPE) (coef)) * (quantval)) | |
130 #else | |
131 #define DEQUANTIZE(coef,quantval) \ | |
132 DESCALE((coef)*(quantval), IFAST_SCALE_BITS-PASS1_BITS) | |
133 #endif | |
134 | |
135 | |
136 /* Like DESCALE, but applies to a DCTELEM and produces an int. | |
137 * We assume that int right shift is unsigned if INT32 right shift is. | |
138 */ | |
139 | |
140 #ifdef RIGHT_SHIFT_IS_UNSIGNED | |
141 #define ISHIFT_TEMPS DCTELEM ishift_temp; | |
142 #if BITS_IN_JSAMPLE == 8 | |
143 #define DCTELEMBITS 16 /* DCTELEM may be 16 or 32 bits */ | |
144 #else | |
145 #define DCTELEMBITS 32 /* DCTELEM must be 32 bits */ | |
146 #endif | |
147 #define IRIGHT_SHIFT(x,shft) \ | |
148 ((ishift_temp = (x)) < 0 ? \ | |
149 (ishift_temp >> (shft)) | ((~((DCTELEM) 0)) << (DCTELEMBITS-(shft))) : \ | |
150 (ishift_temp >> (shft))) | |
151 #else | |
152 #define ISHIFT_TEMPS | |
153 #define IRIGHT_SHIFT(x,shft) ((x) >> (shft)) | |
154 #endif | |
155 | |
156 #ifdef USE_ACCURATE_ROUNDING | |
157 #define IDESCALE(x,n) ((int) IRIGHT_SHIFT((x) + (1 << ((n)-1)), n)) | |
158 #else | |
159 #define IDESCALE(x,n) ((int) IRIGHT_SHIFT(x, n)) | |
160 #endif | |
161 | |
162 | |
163 /* | |
164 * Perform dequantization and inverse DCT on one block of coefficients. | |
165 */ | |
166 | |
167 GLOBAL(void) | |
168 jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr, | |
169 JCOEFPTR coef_block, | |
170 JSAMPARRAY output_buf, JDIMENSION output_col) | |
171 { | |
172 DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; | |
173 DCTELEM tmp10, tmp11, tmp12, tmp13; | |
174 DCTELEM z5, z10, z11, z12, z13; | |
175 JCOEFPTR inptr; | |
176 IFAST_MULT_TYPE * quantptr; | |
177 int * wsptr; | |
178 JSAMPROW outptr; | |
179 JSAMPLE *range_limit = IDCT_range_limit(cinfo); | |
180 int ctr; | |
181 int workspace[DCTSIZE2]; /* buffers data between passes */ | |
182 SHIFT_TEMPS /* for DESCALE */ | |
183 ISHIFT_TEMPS /* for IDESCALE */ | |
184 | |
185 /* Pass 1: process columns from input, store into work array. */ | |
186 | |
187 inptr = coef_block; | |
188 quantptr = (IFAST_MULT_TYPE *) compptr->dct_table; | |
189 wsptr = workspace; | |
190 for (ctr = DCTSIZE; ctr > 0; ctr--) { | |
191 /* Due to quantization, we will usually find that many of the input | |
192 * coefficients are zero, especially the AC terms. We can exploit this | |
193 * by short-circuiting the IDCT calculation for any column in which all | |
194 * the AC terms are zero. In that case each output is equal to the | |
195 * DC coefficient (with scale factor as needed). | |
196 * With typical images and quantization tables, half or more of the | |
197 * column DCT calculations can be simplified this way. | |
198 */ | |
199 | |
200 if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 && | |
201 inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 && | |
202 inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 && | |
203 inptr[DCTSIZE*7] == 0) { | |
204 /* AC terms all zero */ | |
205 int dcval = (int) DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); | |
206 | |
207 wsptr[DCTSIZE*0] = dcval; | |
208 wsptr[DCTSIZE*1] = dcval; | |
209 wsptr[DCTSIZE*2] = dcval; | |
210 wsptr[DCTSIZE*3] = dcval; | |
211 wsptr[DCTSIZE*4] = dcval; | |
212 wsptr[DCTSIZE*5] = dcval; | |
213 wsptr[DCTSIZE*6] = dcval; | |
214 wsptr[DCTSIZE*7] = dcval; | |
215 | |
216 inptr++; /* advance pointers to next column */ | |
217 quantptr++; | |
218 wsptr++; | |
219 continue; | |
220 } | |
221 | |
222 /* Even part */ | |
223 | |
224 tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); | |
225 tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); | |
226 tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); | |
227 tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); | |
228 | |
229 tmp10 = tmp0 + tmp2; /* phase 3 */ | |
230 tmp11 = tmp0 - tmp2; | |
231 | |
232 tmp13 = tmp1 + tmp3; /* phases 5-3 */ | |
233 tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */ | |
234 | |
235 tmp0 = tmp10 + tmp13; /* phase 2 */ | |
236 tmp3 = tmp10 - tmp13; | |
237 tmp1 = tmp11 + tmp12; | |
238 tmp2 = tmp11 - tmp12; | |
239 | |
240 /* Odd part */ | |
241 | |
242 tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); | |
243 tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); | |
244 tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); | |
245 tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); | |
246 | |
247 z13 = tmp6 + tmp5; /* phase 6 */ | |
248 z10 = tmp6 - tmp5; | |
249 z11 = tmp4 + tmp7; | |
250 z12 = tmp4 - tmp7; | |
251 | |
252 tmp7 = z11 + z13; /* phase 5 */ | |
253 tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */ | |
254 | |
255 z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */ | |
256 tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; /* 2*(c2-c6) */ | |
257 tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; /* -2*(c2+c6) */ | |
258 | |
259 tmp6 = tmp12 - tmp7; /* phase 2 */ | |
260 tmp5 = tmp11 - tmp6; | |
261 tmp4 = tmp10 + tmp5; | |
262 | |
263 wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7); | |
264 wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7); | |
265 wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6); | |
266 wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6); | |
267 wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5); | |
268 wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5); | |
269 wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4); | |
270 wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4); | |
271 | |
272 inptr++; /* advance pointers to next column */ | |
273 quantptr++; | |
274 wsptr++; | |
275 } | |
276 | |
277 /* Pass 2: process rows from work array, store into output array. */ | |
278 /* Note that we must descale the results by a factor of 8 == 2**3, */ | |
279 /* and also undo the PASS1_BITS scaling. */ | |
280 | |
281 wsptr = workspace; | |
282 for (ctr = 0; ctr < DCTSIZE; ctr++) { | |
283 outptr = output_buf[ctr] + output_col; | |
284 /* Rows of zeroes can be exploited in the same way as we did with columns. | |
285 * However, the column calculation has created many nonzero AC terms, so | |
286 * the simplification applies less often (typically 5% to 10% of the time). | |
287 * On machines with very fast multiplication, it's possible that the | |
288 * test takes more time than it's worth. In that case this section | |
289 * may be commented out. | |
290 */ | |
291 | |
292 #ifndef NO_ZERO_ROW_TEST | |
293 if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 && | |
294 wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) { | |
295 /* AC terms all zero */ | |
296 JSAMPLE dcval = range_limit[IDESCALE(wsptr[0], PASS1_BITS+3) | |
297 & RANGE_MASK]; | |
298 | |
299 outptr[0] = dcval; | |
300 outptr[1] = dcval; | |
301 outptr[2] = dcval; | |
302 outptr[3] = dcval; | |
303 outptr[4] = dcval; | |
304 outptr[5] = dcval; | |
305 outptr[6] = dcval; | |
306 outptr[7] = dcval; | |
307 | |
308 wsptr += DCTSIZE; /* advance pointer to next row */ | |
309 continue; | |
310 } | |
311 #endif | |
312 | |
313 /* Even part */ | |
314 | |
315 tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]); | |
316 tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]); | |
317 | |
318 tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]); | |
319 tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562) | |
320 - tmp13; | |
321 | |
322 tmp0 = tmp10 + tmp13; | |
323 tmp3 = tmp10 - tmp13; | |
324 tmp1 = tmp11 + tmp12; | |
325 tmp2 = tmp11 - tmp12; | |
326 | |
327 /* Odd part */ | |
328 | |
329 z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3]; | |
330 z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3]; | |
331 z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7]; | |
332 z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7]; | |
333 | |
334 tmp7 = z11 + z13; /* phase 5 */ | |
335 tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */ | |
336 | |
337 z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */ | |
338 tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; /* 2*(c2-c6) */ | |
339 tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; /* -2*(c2+c6) */ | |
340 | |
341 tmp6 = tmp12 - tmp7; /* phase 2 */ | |
342 tmp5 = tmp11 - tmp6; | |
343 tmp4 = tmp10 + tmp5; | |
344 | |
345 /* Final output stage: scale down by a factor of 8 and range-limit */ | |
346 | |
347 outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) | |
348 & RANGE_MASK]; | |
349 outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) | |
350 & RANGE_MASK]; | |
351 outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) | |
352 & RANGE_MASK]; | |
353 outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) | |
354 & RANGE_MASK]; | |
355 outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) | |
356 & RANGE_MASK]; | |
357 outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) | |
358 & RANGE_MASK]; | |
359 outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) | |
360 & RANGE_MASK]; | |
361 outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) | |
362 & RANGE_MASK]; | |
363 | |
364 wsptr += DCTSIZE; /* advance pointer to next row */ | |
365 } | |
366 } | |
367 | |
368 #endif /* DCT_IFAST_SUPPORTED */ | |
OLD | NEW |