OLD | NEW |
| (Empty) |
1 ; | |
2 ; jimmxfst.asm - fast integer IDCT (MMX) | |
3 ; | |
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB | |
5 ; | |
6 ; Based on | |
7 ; x86 SIMD extension for IJG JPEG library | |
8 ; Copyright (C) 1999-2006, MIYASAKA Masaru. | |
9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc | |
10 ; | |
11 ; This file should be assembled with NASM (Netwide Assembler), | |
12 ; can *not* be assembled with Microsoft's MASM or any compatible | |
13 ; assembler (including Borland's Turbo Assembler). | |
14 ; NASM is available from http://nasm.sourceforge.net/ or | |
15 ; http://sourceforge.net/project/showfiles.php?group_id=6208 | |
16 ; | |
17 ; This file contains a fast, not so accurate integer implementation of | |
18 ; the inverse DCT (Discrete Cosine Transform). The following code is | |
19 ; based directly on the IJG's original jidctfst.c; see the jidctfst.c | |
20 ; for more details. | |
21 ; | |
22 ; [TAB8] | |
23 | |
24 %include "jsimdext.inc" | |
25 %include "jdct.inc" | |
26 | |
27 ; -------------------------------------------------------------------------- | |
28 | |
; Fixed-point setup. The F_* values are the IDCT multipliers expressed as
; integers with CONST_BITS fractional bits (e.g. 277/256 ~= 1.082).
29 %define CONST_BITS 8 ; number of fractional bits in the F_* constants; 14 is also OK. | |
30 %define PASS1_BITS 2 | |
31 | |
; Refuse to assemble if the scaling assumed here disagrees with
; IFAST_SCALE_BITS, which is defined outside this file.
32 %if IFAST_SCALE_BITS != PASS1_BITS | |
33 %error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'." | |
34 %endif | |
35 | |
; For CONST_BITS == 8 the rounded values are written out literally.
36 %if CONST_BITS == 8 | |
37 F_1_082 equ 277 ; FIX(1.082392200) | |
38 F_1_414 equ 362 ; FIX(1.414213562) | |
39 F_1_847 equ 473 ; FIX(1.847759065) | |
40 F_2_613 equ 669 ; FIX(2.613125930) | |
; F_1_613 is F_2_613 minus 1.0 (256 == 1 << 8); the code multiplies by this
; smaller factor and subtracts z10 separately (see the "To avoid overflow"
; note in the odd part of the IDCT).
41 F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1) | |
42 %else | |
43 ; NASM cannot do compile-time arithmetic on floating-point constants. | |
; So each multiplier is given as an integer scaled by 2^30 and reduced to
; CONST_BITS fractional bits; DESCALE adds half of the divisor before the
; right shift, i.e. it rounds to nearest.
44 %define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) | |
45 F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200) | |
46 F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562) | |
47 F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) | |
48 F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930) | |
49 F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1) | |
50 %endif | |
51 | |
52 ; -------------------------------------------------------------------------- | |
; Read-only constants used by the IDCT routine in this file.
53 SECTION SEG_CONST | |
54 | |
; Scaling scheme for the multiplications: the data operand is shifted left by
; PRE_MULTIPLY_SCALE_BITS, the constant is stored shifted left by CONST_SHIFT,
; and pmulhw keeps the upper 16 bits of the 32-bit product. With the three
; values summing to 16, the net effect is (data * F_x) >> CONST_BITS.
55 ; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) | |
56 ; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) | |
57 | |
58 %define PRE_MULTIPLY_SCALE_BITS 2 | |
59 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) | |
60 | |
61 alignz 16 | |
62 global EXTN(jconst_idct_ifast_mmx) PRIVATE | |
63 | |
64 EXTN(jconst_idct_ifast_mmx): | |
65 | |
; Each PW_* entry is one 64-bit vector holding the same 16-bit multiplier in
; all four word lanes. PW_MF1613 holds the negated F_1_613.
66 PW_F1414 times 4 dw F_1_414 << CONST_SHIFT | |
67 PW_F1847 times 4 dw F_1_847 << CONST_SHIFT | |
68 PW_MF1613 times 4 dw -F_1_613 << CONST_SHIFT | |
69 PW_F1082 times 4 dw F_1_082 << CONST_SHIFT | |
; Eight bytes of CENTERJSAMPLE; added with paddb to the packed output bytes
; at the end of pass 2.
70 PB_CENTERJSAMP times 8 db CENTERJSAMPLE | |
71 | |
72 alignz 16 | |
73 | |
74 ; -------------------------------------------------------------------------- | |
75 SECTION SEG_TEXT | |
76 BITS 32 | |
77 ; | |
78 ; Perform dequantization and inverse DCT on one block of coefficients. | |
79 ; | |
80 ; GLOBAL(void) | |
81 ; jsimd_idct_ifast_mmx (void * dct_table, JCOEFPTR coef_block, | |
82 ; JSAMPARRAY output_buf, JDIMENSION output_col) | |
83 ; | |
84 | |
; Stack argument offsets. (b) is the address of the saved ebp, so +4 is the
; return address and the first argument starts at +8.
85 %define dct_table(b) (b)+8 ; void * dct_table | |
86 %define coef_block(b) (b)+12 ; JCOEFPTR coef_block | |
87 %define output_buf(b) (b)+16 ; JSAMPARRAY output_buf | |
88 %define output_col(b) (b)+20 ; JDIMENSION output_col | |
89 | |
; Local frame, addressed relative to the aligned ebp set up in the prologue:
;   [ebp+0]            unaligned frame pointer stored by the prologue
;   wk(0)..wk(WK_NUM-1) MMWORD scratch slots directly below ebp
;   workspace          DCTSIZE2 JCOEFs directly below wk(0)
; wk(i) mentions WK_NUM before its %define; that works because the macro body
; is only expanded where wk() is used.
90 %define original_ebp ebp+0 | |
91 %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] | |
92 %define WK_NUM 2 | |
93 %define workspace wk(0)-DCTSIZE2*SIZEOF_JCOEF | |
94 ; JCOEF workspace[DCTSIZE2] | |
95 | |
96 align 16 | |
97 global EXTN(jsimd_idct_ifast_mmx) PRIVATE | |
98 | |
; jsimd_idct_ifast_mmx: dequantize one block of coefficients and apply the
; fast integer inverse DCT in two passes. Pass 1 works on the columns of the
; input and writes a transposed intermediate block to an on-stack workspace;
; pass 2 works on that workspace and stores 8-byte sample rows through the
; row pointers in output_buf, starting at byte offset output_col.
;
; Arguments are read from the stack via dct_table()/coef_block()/
; output_buf()/output_col(). ebx, esi and edi are saved and restored;
; eax, ecx, edx and mm0-mm7 are overwritten. emms is executed before return.
99 EXTN(jsimd_idct_ifast_mmx): | |
100 push ebp | |
101 mov eax,esp ; eax = original ebp (esp right after the push, i.e. the address of the saved ebp) | |
; Reserve one dword, round esp down to an MMWORD boundary and store the
; unaligned frame pointer there; that slot becomes [ebp+0] (original_ebp).
102 sub esp, byte 4 | |
103 and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits | |
104 mov [esp],eax | |
105 mov ebp,esp ; ebp = aligned ebp | |
; Move esp below wk[] and the JCOEF workspace, which live under the aligned ebp.
106 lea esp, [workspace] | |
107 push ebx | |
108 ; push ecx ; need not be preserved | |
109 ; push edx ; need not be preserved | |
110 push esi | |
111 push edi | |
112 | |
113 get_GOT ebx ; get GOT address | |
114 | |
115 ; ---- Pass 1: process columns from input, store into work array. | |
116 | |
; eax still holds the frame pointer computed in the prologue, which is why
; the reload below is commented out.
; NOTE(review): this relies on get_GOT leaving eax untouched -- confirm
; against the macro definition in jsimdext.inc.
117 ; mov eax, [original_ebp] | |
118 mov edx, POINTER [dct_table(eax)] ; quantptr | |
119 mov esi, JCOEFPTR [coef_block(eax)] ; inptr | |
120 lea edi, [workspace] ; JCOEF * wsptr | |
; Each iteration handles four columns (four 16-bit coefficients per row), so
; the loop runs DCTSIZE/4 times.
121 mov ecx, DCTSIZE/4 ; ctr | |
122 alignx 16,7 | |
123 .columnloop: | |
124 %ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX | |
; Cheap early-out: test one dword each from rows 1 and 2 (addressed through
; DWBLOCK, which is defined outside this file) before doing the full test.
125 mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] | |
126 or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] | |
127 jnz short .columnDCT | |
128 | |
; Full test: OR rows 1..7 of these four columns together; any set bit means
; at least one AC coefficient is non-zero.
129 movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] | |
130 movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] | |
131 por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] | |
132 por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] | |
133 por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] | |
134 por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] | |
135 por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] | |
136 por mm1,mm0 | |
; packsswb saturates, so a non-zero word never becomes a zero byte; the four
; words are squeezed into one dword that can be tested in eax.
137 packsswb mm1,mm1 | |
138 movd eax,mm1 | |
139 test eax,eax | |
140 jnz short .columnDCT | |
141 | |
142 ; -- AC terms all zero | |
143 | |
; Only the DC terms are dequantized; each one is then replicated into all
; eight entries of its workspace row.
144 movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] | |
145 pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)] | |
146 | |
147 movq mm2,mm0 ; mm0=in0=(00 01 02 03) | |
148 punpcklwd mm0,mm0 ; mm0=(00 00 01 01) | |
149 punpckhwd mm2,mm2 ; mm2=(02 02 03 03) | |
150 | |
151 movq mm1,mm0 | |
152 punpckldq mm0,mm0 ; mm0=(00 00 00 00) | |
153 punpckhdq mm1,mm1 ; mm1=(01 01 01 01) | |
154 movq mm3,mm2 | |
155 punpckldq mm2,mm2 ; mm2=(02 02 02 02) | |
156 punpckhdq mm3,mm3 ; mm3=(03 03 03 03) | |
157 | |
158 movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 | |
159 movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0 | |
160 movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1 | |
161 movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1 | |
162 movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2 | |
163 movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2 | |
164 movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3 | |
165 movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3 | |
166 jmp near .nextcolumn | |
167 alignx 16,7 | |
168 %endif | |
169 .columnDCT: | |
170 | |
171 ; -- Even part | |
172 | |
; Load rows 0, 2, 4, 6 and dequantize them (coefficient * table entry).
173 movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] | |
174 movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] | |
175 pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)] | |
176 pmullw mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)] | |
177 movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] | |
178 movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] | |
179 pmullw mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)] | |
180 pmullw mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)] | |
181 | |
182 movq mm4,mm0 | |
183 movq mm5,mm1 | |
184 psubw mm0,mm2 ; mm0=tmp11 | |
185 psubw mm1,mm3 | |
186 paddw mm4,mm2 ; mm4=tmp10 | |
187 paddw mm5,mm3 ; mm5=tmp13 | |
188 | |
; Fixed-point multiply by 1.414...: the pre-shift plus the CONST_SHIFT baked
; into PW_F1414 make pmulhw's implicit >>16 equal to >>CONST_BITS overall.
189 psllw mm1,PRE_MULTIPLY_SCALE_BITS | |
190 pmulhw mm1,[GOTOFF(ebx,PW_F1414)] | |
191 psubw mm1,mm5 ; mm1=tmp12 | |
192 | |
193 movq mm6,mm4 | |
194 movq mm7,mm0 | |
195 psubw mm4,mm5 ; mm4=tmp3 | |
196 psubw mm0,mm1 ; mm0=tmp2 | |
197 paddw mm6,mm5 ; mm6=tmp0 | |
198 paddw mm7,mm1 ; mm7=tmp1 | |
199 | |
; tmp2/tmp3 are spilled to the wk[] slots to free registers for the odd part.
200 movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3 | |
201 movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2 | |
202 | |
203 ; -- Odd part | |
204 | |
; Load rows 1, 3, 5, 7 and dequantize them.
205 movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] | |
206 movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] | |
207 pmullw mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)] | |
208 pmullw mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)] | |
209 movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] | |
210 movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] | |
211 pmullw mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)] | |
212 pmullw mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)] | |
213 | |
214 movq mm4,mm2 | |
215 movq mm0,mm5 | |
216 psubw mm2,mm1 ; mm2=z12 | |
217 psubw mm5,mm3 ; mm5=z10 | |
218 paddw mm4,mm1 ; mm4=z11 | |
219 paddw mm0,mm3 ; mm0=z13 | |
220 | |
221 movq mm1,mm5 ; mm1=z10(unscaled) | |
222 psllw mm2,PRE_MULTIPLY_SCALE_BITS | |
223 psllw mm5,PRE_MULTIPLY_SCALE_BITS | |
224 | |
225 movq mm3,mm4 | |
226 psubw mm4,mm0 | |
227 paddw mm3,mm0 ; mm3=tmp7 | |
228 | |
229 psllw mm4,PRE_MULTIPLY_SCALE_BITS | |
230 pmulhw mm4,[GOTOFF(ebx,PW_F1414)] ; mm4=tmp11 | |
231 | |
232 ; To avoid overflow... | |
233 ; | |
234 ; (Original) | |
235 ; tmp12 = -2.613125930 * z10 + z5; | |
236 ; | |
237 ; (This implementation) | |
238 ; tmp12 = (-1.613125930 - 1) * z10 + z5; | |
239 ; = -1.613125930 * z10 - z10 + z5; | |
240 | |
241 movq mm0,mm5 | |
242 paddw mm5,mm2 | |
243 pmulhw mm5,[GOTOFF(ebx,PW_F1847)] ; mm5=z5 | |
244 pmulhw mm0,[GOTOFF(ebx,PW_MF1613)] | |
245 pmulhw mm2,[GOTOFF(ebx,PW_F1082)] | |
246 psubw mm0,mm1 | |
247 psubw mm2,mm5 ; mm2=tmp10 | |
248 paddw mm0,mm5 ; mm0=tmp12 | |
249 | |
250 ; -- Final output stage | |
251 | |
; The results are transposed while being stored, so the workspace holds the
; intermediate block transposed relative to the input. Pass 1 applies no
; descaling shift to its results.
252 psubw mm0,mm3 ; mm0=tmp6 | |
253 movq mm1,mm6 | |
254 movq mm5,mm7 | |
255 paddw mm6,mm3 ; mm6=data0=(00 01 02 03) | |
256 paddw mm7,mm0 ; mm7=data1=(10 11 12 13) | |
257 psubw mm1,mm3 ; mm1=data7=(70 71 72 73) | |
258 psubw mm5,mm0 ; mm5=data6=(60 61 62 63) | |
259 psubw mm4,mm0 ; mm4=tmp5 | |
260 | |
261 movq mm3,mm6 ; transpose coefficients(phase 1) | |
262 punpcklwd mm6,mm7 ; mm6=(00 10 01 11) | |
263 punpckhwd mm3,mm7 ; mm3=(02 12 03 13) | |
264 movq mm0,mm5 ; transpose coefficients(phase 1) | |
265 punpcklwd mm5,mm1 ; mm5=(60 70 61 71) | |
266 punpckhwd mm0,mm1 ; mm0=(62 72 63 73) | |
267 | |
; Swap the spilled tmp2/tmp3 back in and reuse the wk[] slots for the two
; half-transposed vectors that are not needed until later.
268 movq mm7, MMWORD [wk(0)] ; mm7=tmp2 | |
269 movq mm1, MMWORD [wk(1)] ; mm1=tmp3 | |
270 | |
271 movq MMWORD [wk(0)], mm5 ; wk(0)=(60 70 61 71) | |
272 movq MMWORD [wk(1)], mm0 ; wk(1)=(62 72 63 73) | |
273 | |
274 paddw mm2,mm4 ; mm2=tmp4 | |
275 movq mm5,mm7 | |
276 movq mm0,mm1 | |
277 paddw mm7,mm4 ; mm7=data2=(20 21 22 23) | |
278 paddw mm1,mm2 ; mm1=data4=(40 41 42 43) | |
279 psubw mm5,mm4 ; mm5=data5=(50 51 52 53) | |
280 psubw mm0,mm2 ; mm0=data3=(30 31 32 33) | |
281 | |
282 movq mm4,mm7 ; transpose coefficients(phase 1) | |
283 punpcklwd mm7,mm0 ; mm7=(20 30 21 31) | |
284 punpckhwd mm4,mm0 ; mm4=(22 32 23 33) | |
285 movq mm2,mm1 ; transpose coefficients(phase 1) | |
286 punpcklwd mm1,mm5 ; mm1=(40 50 41 51) | |
287 punpckhwd mm2,mm5 ; mm2=(42 52 43 53) | |
288 | |
289 movq mm0,mm6 ; transpose coefficients(phase 2) | |
290 punpckldq mm6,mm7 ; mm6=(00 10 20 30) | |
291 punpckhdq mm0,mm7 ; mm0=(01 11 21 31) | |
292 movq mm5,mm3 ; transpose coefficients(phase 2) | |
293 punpckldq mm3,mm4 ; mm3=(02 12 22 32) | |
294 punpckhdq mm5,mm4 ; mm5=(03 13 23 33) | |
295 | |
296 movq mm7, MMWORD [wk(0)] ; mm7=(60 70 61 71) | |
297 movq mm4, MMWORD [wk(1)] ; mm4=(62 72 63 73) | |
298 | |
299 movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm6 | |
300 movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0 | |
301 movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm3 | |
302 movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5 | |
303 | |
304 movq mm6,mm1 ; transpose coefficients(phase 2) | |
305 punpckldq mm1,mm7 ; mm1=(40 50 60 70) | |
306 punpckhdq mm6,mm7 ; mm6=(41 51 61 71) | |
307 movq mm0,mm2 ; transpose coefficients(phase 2) | |
308 punpckldq mm2,mm4 ; mm2=(42 52 62 72) | |
309 punpckhdq mm0,mm4 ; mm0=(43 53 63 73) | |
310 | |
311 movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1 | |
312 movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm6 | |
313 movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2 | |
314 movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm0 | |
315 | |
; Advance to the next four input columns / next four workspace rows.
316 .nextcolumn: | |
317 add esi, byte 4*SIZEOF_JCOEF ; coef_block | |
318 add edx, byte 4*SIZEOF_IFAST_MULT_TYPE ; quantptr | |
319 add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr | |
320 dec ecx ; ctr | |
321 jnz near .columnloop | |
322 | |
323 ; ---- Pass 2: process rows from work array, store into output array. | |
324 | |
; eax was used as scratch by the zero-column test, so the frame pointer is
; reloaded here before fetching the remaining arguments. After this, edi
; walks the array of output row pointers and eax holds output_col.
325 mov eax, [original_ebp] | |
326 lea esi, [workspace] ; JCOEF * wsptr | |
327 mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) | |
328 mov eax, JDIMENSION [output_col(eax)] | |
; Each iteration produces four output rows of eight samples.
329 mov ecx, DCTSIZE/4 ; ctr | |
330 alignx 16,7 | |
331 .rowloop: | |
332 | |
333 ; -- Even part | |
334 | |
; Same butterfly as in pass 1, but the inputs come from the workspace and
; are already dequantized, so there is no pmullw here.
335 movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] | |
336 movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] | |
337 movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] | |
338 movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] | |
339 | |
340 movq mm4,mm0 | |
341 movq mm5,mm1 | |
342 psubw mm0,mm2 ; mm0=tmp11 | |
343 psubw mm1,mm3 | |
344 paddw mm4,mm2 ; mm4=tmp10 | |
345 paddw mm5,mm3 ; mm5=tmp13 | |
346 | |
347 psllw mm1,PRE_MULTIPLY_SCALE_BITS | |
348 pmulhw mm1,[GOTOFF(ebx,PW_F1414)] | |
349 psubw mm1,mm5 ; mm1=tmp12 | |
350 | |
351 movq mm6,mm4 | |
352 movq mm7,mm0 | |
353 psubw mm4,mm5 ; mm4=tmp3 | |
354 psubw mm0,mm1 ; mm0=tmp2 | |
355 paddw mm6,mm5 ; mm6=tmp0 | |
356 paddw mm7,mm1 ; mm7=tmp1 | |
357 | |
358 movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3 | |
359 movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2 | |
360 | |
361 ; -- Odd part | |
362 | |
363 movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] | |
364 movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] | |
365 movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] | |
366 movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] | |
367 | |
368 movq mm4,mm2 | |
369 movq mm0,mm5 | |
370 psubw mm2,mm1 ; mm2=z12 | |
371 psubw mm5,mm3 ; mm5=z10 | |
372 paddw mm4,mm1 ; mm4=z11 | |
373 paddw mm0,mm3 ; mm0=z13 | |
374 | |
375 movq mm1,mm5 ; mm1=z10(unscaled) | |
376 psllw mm2,PRE_MULTIPLY_SCALE_BITS | |
377 psllw mm5,PRE_MULTIPLY_SCALE_BITS | |
378 | |
379 movq mm3,mm4 | |
380 psubw mm4,mm0 | |
381 paddw mm3,mm0 ; mm3=tmp7 | |
382 | |
383 psllw mm4,PRE_MULTIPLY_SCALE_BITS | |
384 pmulhw mm4,[GOTOFF(ebx,PW_F1414)] ; mm4=tmp11 | |
385 | |
386 ; To avoid overflow... | |
387 ; | |
388 ; (Original) | |
389 ; tmp12 = -2.613125930 * z10 + z5; | |
390 ; | |
391 ; (This implementation) | |
392 ; tmp12 = (-1.613125930 - 1) * z10 + z5; | |
393 ; = -1.613125930 * z10 - z10 + z5; | |
394 | |
395 movq mm0,mm5 | |
396 paddw mm5,mm2 | |
397 pmulhw mm5,[GOTOFF(ebx,PW_F1847)] ; mm5=z5 | |
398 pmulhw mm0,[GOTOFF(ebx,PW_MF1613)] | |
399 pmulhw mm2,[GOTOFF(ebx,PW_F1082)] | |
400 psubw mm0,mm1 | |
401 psubw mm2,mm5 ; mm2=tmp10 | |
402 paddw mm0,mm5 ; mm0=tmp12 | |
403 | |
404 ; -- Final output stage | |
405 | |
; Each result is shifted right arithmetically by PASS1_BITS+3 and then packed
; to bytes with signed saturation (packsswb).
406 psubw mm0,mm3 ; mm0=tmp6 | |
407 movq mm1,mm6 | |
408 movq mm5,mm7 | |
409 paddw mm6,mm3 ; mm6=data0=(00 10 20 30) | |
410 paddw mm7,mm0 ; mm7=data1=(01 11 21 31) | |
411 psraw mm6,(PASS1_BITS+3) ; descale | |
412 psraw mm7,(PASS1_BITS+3) ; descale | |
413 psubw mm1,mm3 ; mm1=data7=(07 17 27 37) | |
414 psubw mm5,mm0 ; mm5=data6=(06 16 26 36) | |
415 psraw mm1,(PASS1_BITS+3) ; descale | |
416 psraw mm5,(PASS1_BITS+3) ; descale | |
417 psubw mm4,mm0 ; mm4=tmp5 | |
418 | |
419 packsswb mm6,mm5 ; mm6=(00 10 20 30 06 16 26 36) | |
420 packsswb mm7,mm1 ; mm7=(01 11 21 31 07 17 27 37) | |
421 | |
422 movq mm3, MMWORD [wk(0)] ; mm3=tmp2 | |
423 movq mm0, MMWORD [wk(1)] ; mm0=tmp3 | |
424 | |
425 paddw mm2,mm4 ; mm2=tmp4 | |
426 movq mm5,mm3 | |
427 movq mm1,mm0 | |
428 paddw mm3,mm4 ; mm3=data2=(02 12 22 32) | |
429 paddw mm0,mm2 ; mm0=data4=(04 14 24 34) | |
430 psraw mm3,(PASS1_BITS+3) ; descale | |
431 psraw mm0,(PASS1_BITS+3) ; descale | |
432 psubw mm5,mm4 ; mm5=data5=(05 15 25 35) | |
433 psubw mm1,mm2 ; mm1=data3=(03 13 23 33) | |
434 psraw mm5,(PASS1_BITS+3) ; descale | |
435 psraw mm1,(PASS1_BITS+3) ; descale | |
436 | |
437 movq mm4,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm4=[PB_CENTERJSAMP] | |
438 | |
439 packsswb mm3,mm0 ; mm3=(02 12 22 32 04 14 24 34) | |
440 packsswb mm1,mm5 ; mm1=(03 13 23 33 05 15 25 35) | |
441 | |
; Add CENTERJSAMPLE to every packed byte (paddb is a wrapping byte add).
442 paddb mm6,mm4 | |
443 paddb mm7,mm4 | |
444 paddb mm3,mm4 | |
445 paddb mm1,mm4 | |
446 | |
; Three interleave phases (bytes, words, dwords) turn the four packed vectors
; into four complete 8-sample output rows.
447 movq mm2,mm6 ; transpose coefficients(phase 1) | |
448 punpcklbw mm6,mm7 ; mm6=(00 01 10 11 20 21 30 31) | |
449 punpckhbw mm2,mm7 ; mm2=(06 07 16 17 26 27 36 37) | |
450 movq mm0,mm3 ; transpose coefficients(phase 1) | |
451 punpcklbw mm3,mm1 ; mm3=(02 03 12 13 22 23 32 33) | |
452 punpckhbw mm0,mm1 ; mm0=(04 05 14 15 24 25 34 35) | |
453 | |
454 movq mm5,mm6 ; transpose coefficients(phase 2) | |
455 punpcklwd mm6,mm3 ; mm6=(00 01 02 03 10 11 12 13) | |
456 punpckhwd mm5,mm3 ; mm5=(20 21 22 23 30 31 32 33) | |
457 movq mm4,mm0 ; transpose coefficients(phase 2) | |
458 punpcklwd mm0,mm2 ; mm0=(04 05 06 07 14 15 16 17) | |
459 punpckhwd mm4,mm2 ; mm4=(24 25 26 27 34 35 36 37) | |
460 | |
461 movq mm7,mm6 ; transpose coefficients(phase 3) | |
462 punpckldq mm6,mm0 ; mm6=(00 01 02 03 04 05 06 07) | |
463 punpckhdq mm7,mm0 ; mm7=(10 11 12 13 14 15 16 17) | |
464 movq mm1,mm5 ; transpose coefficients(phase 3) | |
465 punpckldq mm5,mm4 ; mm5=(20 21 22 23 24 25 26 27) | |
466 punpckhdq mm1,mm4 ; mm1=(30 31 32 33 34 35 36 37) | |
467 | |
; ebx is borrowed as a second row-pointer register for the stores below, so
; the GOT address is saved and restored around them.
468 pushpic ebx ; save GOT address | |
469 | |
; Fetch four row pointers from output_buf and store eight samples into each
; row at byte offset output_col (eax).
470 mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] | |
471 mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] | |
472 movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6 | |
473 movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7 | |
474 mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] | |
475 mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] | |
476 movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5 | |
477 movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1 | |
478 | |
479 poppic ebx ; restore GOT address | |
480 | |
481 add esi, byte 4*SIZEOF_JCOEF ; wsptr | |
482 add edi, byte 4*SIZEOF_JSAMPROW | |
483 dec ecx ; ctr | |
484 jnz near .rowloop | |
485 | |
486 emms ; empty MMX state | |
487 | |
; Epilogue: restore the saved registers in reverse push order, then unwind
; the aligned frame through the pointer the prologue stored at [ebp+0].
488 pop edi | |
489 pop esi | |
490 ; pop edx ; need not be preserved | |
491 ; pop ecx ; need not be preserved | |
492 pop ebx | |
493 mov esp,ebp ; esp <- aligned ebp | |
494 pop esp ; esp <- original ebp | |
495 pop ebp | |
496 ret | |
497 | |
; Pad the end of the code emitted above to a 16-byte boundary.
498 ; For some reason, the OS X linker does not honor the request to align the | |
499 ; segment unless we do this. | |
500 align 16 | |
OLD | NEW |