OLD | NEW |
| (Empty) |
1 ; | |
2 ; ji3dnflt.asm - floating-point IDCT (3DNow! & MMX) | |
3 ; | |
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB | |
5 ; | |
6 ; Based on | |
7 ; x86 SIMD extension for IJG JPEG library | |
8 ; Copyright (C) 1999-2006, MIYASAKA Masaru. | |
9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc | |
10 ; | |
11 ; This file should be assembled with NASM (Netwide Assembler), | |
12 ; can *not* be assembled with Microsoft's MASM or any compatible | |
13 ; assembler (including Borland's Turbo Assembler). | |
14 ; NASM is available from http://nasm.sourceforge.net/ or | |
15 ; http://sourceforge.net/project/showfiles.php?group_id=6208 | |
16 ; | |
17 ; This file contains a floating-point implementation of the inverse DCT | |
18 ; (Discrete Cosine Transform). The following code is based directly on | |
19 ; the IJG's original jidctflt.c; see the jidctflt.c for more details. | |
20 ; | |
21 ; [TAB8] | |
22 | |
23 %include "jsimdext.inc" | |
24 %include "jdct.inc" | |
25 | |
26 ; -------------------------------------------------------------------------- | |
; Read-only constant table used by jsimd_idct_float_3dnow.
;
; Every PD_* entry is emitted twice ("times 2 dd"), so one 64-bit memory
; operand supplies the same single-precision value to both lanes of an
; MMX register.
27 SECTION SEG_CONST | |
28 | |
29 alignz 16 | |
30 global EXTN(jconst_idct_float_3dnow) PRIVATE | |
31 | |
32 EXTN(jconst_idct_float_3dnow): | |
33 | |
; Multipliers applied with pfmul in both passes; each label is named after
; the leading digits of its value.
34 PD_1_414 times 2 dd 1.414213562373095048801689 | |
35 PD_1_847 times 2 dd 1.847759065022573512256366 | |
36 PD_1_082 times 2 dd 1.082392200292393968799446 | |
37 PD_2_613 times 2 dd 2.613125929752753055713286 | |
; 100663296.0 = 1.5 * 2^26.  Single-precision values of that magnitude are
; spaced 8 apart, so adding this constant to x leaves round(x/8) in the
; low-order bits of the result's bit pattern.  Pass 2 of the IDCT extracts
; the low 16 bits of each dword after the add.
38 PD_RNDINT_MAGIC times 2 dd 100663296.0 ; (float)(0x00C00000 << 3) | |
; Eight copies of CENTERJSAMPLE, added byte-wise (paddb) to the packed
; results at the end of pass 2.
39 PB_CENTERJSAMP times 8 db CENTERJSAMPLE | |
40 | |
41 alignz 16 | |
42 | |
43 ; -------------------------------------------------------------------------- | |
44 SECTION SEG_TEXT | |
45 BITS 32 | |
46 ; | |
47 ; Perform dequantization and inverse DCT on one block of coefficients. | |
48 ; | |
49 ; GLOBAL(void) | |
50 ; jsimd_idct_float_3dnow (void * dct_table, JCOEFPTR coef_block, | |
51 ; JSAMPARRAY output_buf, JDIMENSION output_col) | |
52 ; | |
53 | |
; Argument accessors.  "b" is a register holding the value esp had right
; after the function's "push ebp": [b+0] is the saved ebp, [b+4] the return
; address, and the four stack arguments follow from [b+8] upwards.
54 %define dct_table(b) (b)+8 ; void * dct_table | |
55 %define coef_block(b) (b)+12 ; JCOEFPTR coef_block | |
56 %define output_buf(b) (b)+16 ; JSAMPARRAY output_buf | |
57 %define output_col(b) (b)+20 ; JDIMENSION output_col | |
58 | |
; Local frame, addressed from the aligned ebp set up in the prologue:
;   [original_ebp]  the pre-alignment stack pointer stored by the prologue
;   wk(0..WK_NUM-1) WK_NUM mmword scratch slots directly below ebp
;   workspace       DCTSIZE2 FAST_FLOAT values directly below wk(0)
; WK_NUM is defined after wk(i); this works because the macro body is only
; expanded where wk() is used.
59 %define original_ebp ebp+0 | |
60 %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] | |
61 %define WK_NUM 2 | |
62 %define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT | |
63 ; FAST_FLOAT workspace[DCTSIZE2] | |
64 | |
63 ; FAST_FLOAT workspace[DCTSIZE2] | |
64 | |
; --------------------------------------------------------------------------
; jsimd_idct_float_3dnow
;
; Dequantizes one block of coefficients and runs a two-pass floating-point
; inverse DCT on it using 3DNow!/MMX instructions.
;
; In:    stack arguments dct_table, coef_block, output_buf, output_col
;        (read through dct_table(b) .. output_col(b))
; Out:   no register result; pass 2 stores eight samples per row at
;        output_buf[row] + output_col*SIZEOF_JSAMPLE
; Saves: ebx, esi, edi, ebp.  eax, ecx, edx and mm0-mm7 are overwritten.
; Stack: frame aligned to SIZEOF_MMWORD holding wk[WK_NUM] and
;        workspace[DCTSIZE2]; MMX/3DNow! state is cleared with femms
;        before returning.
; --------------------------------------------------------------------------
65 align 16 | |
66 global EXTN(jsimd_idct_float_3dnow) PRIVATE | |
67 | |
68 EXTN(jsimd_idct_float_3dnow): | |
; Prologue: eax keeps the value of esp right after "push ebp" (the address
; of the saved caller ebp).  That value is also stored at the top of the
; aligned frame so that it can be reloaded through [original_ebp].
69 push ebp | |
70 mov eax,esp ; eax = original ebp | |
71 sub esp, byte 4 | |
72 and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits | |
73 mov [esp],eax | |
74 mov ebp,esp ; ebp = aligned ebp | |
; Reserve wk[] and workspace[] by moving esp below them.
75 lea esp, [workspace] | |
76 push ebx | |
77 ; push ecx ; need not be preserved | |
78 ; push edx ; need not be preserved | |
79 push esi | |
80 push edi | |
81 | |
82 get_GOT ebx ; get GOT address | |
83 | |
84 ; ---- Pass 1: process columns from input, store into work array. | |
85 | |
; NOTE(review): the reload of eax below is commented out, so the argument
; loads depend on eax still holding the value set in the prologue.  That
; requires get_GOT (defined outside this file) to leave eax untouched --
; confirm against its definition.
86 ; mov eax, [original_ebp] | |
87 mov edx, POINTER [dct_table(eax)] ; quantptr | |
88 mov esi, JCOEFPTR [coef_block(eax)] ; inptr | |
89 lea edi, [workspace] ; FAST_FLOAT * wsptr | |
90 mov ecx, DCTSIZE/2 ; ctr | |
91 alignx 16,7 | |
; Each iteration handles two adjacent input columns: every movd fetches one
; dword (two coefficients, assuming SIZEOF_JCOEF is 2 -- TODO confirm) and
; the two lanes of each MMX register are processed in parallel.
92 .columnloop: | |
93 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW | |
; Shortcut: when rows 1..7 of both columns are all zero, only the DC terms
; have to be dequantized and replicated.
94 mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] | |
95 or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] | |
96 jnz short .columnDCT | |
97 | |
; ebx is borrowed as a second OR accumulator, hence the pushpic/poppic pair.
98 pushpic ebx ; save GOT address | |
99 mov ebx, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)] | |
100 mov eax, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)] | |
101 or ebx, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)] | |
102 or eax, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)] | |
103 or ebx, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)] | |
104 or eax,ebx | |
; The jnz below tests the flags produced by "or eax,ebx"; presumably poppic
; leaves the flags unchanged -- confirm against its definition.
105 poppic ebx ; restore GOT address | |
106 jnz short .columnDCT | |
107 | |
108 ; -- AC terms all zero | |
109 | |
110 movd mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)] | |
111 | |
; punpcklwd + psrad widen the two 16-bit coefficients to 32-bit integers
; (the arithmetic shift keeps the sign, assuming DWORD_BIT-WORD_BIT is 16);
; pi2fd then converts them to single-precision floats.
112 punpcklwd mm0,mm0 | |
113 psrad mm0,(DWORD_BIT-WORD_BIT) | |
114 pi2fd mm0,mm0 | |
115 | |
116 pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] | |
117 | |
; Broadcast each dequantized DC value to both lanes: mm0 = first column's
; value twice, mm1 = second column's value twice.
118 movq mm1,mm0 | |
119 punpckldq mm0,mm0 | |
120 punpckhdq mm1,mm1 | |
121 | |
122 movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm0 | |
123 movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm0 | |
124 movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm0 | |
125 movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0 | |
126 movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1 | |
127 movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm1 | |
128 movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm1 | |
129 movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1 | |
130 jmp near .nextcolumn | |
131 alignx 16,7 | |
132 %endif | |
133 .columnDCT: | |
134 | |
135 ; -- Even part | |
136 | |
; Load, sign-extend, convert and dequantize rows 0, 2, 4 and 6.
137 movd mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)] | |
138 movd mm1, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] | |
139 movd mm2, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)] | |
140 movd mm3, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)] | |
141 | |
142 punpcklwd mm0,mm0 | |
143 punpcklwd mm1,mm1 | |
144 psrad mm0,(DWORD_BIT-WORD_BIT) | |
145 psrad mm1,(DWORD_BIT-WORD_BIT) | |
146 pi2fd mm0,mm0 | |
147 pi2fd mm1,mm1 | |
148 | |
149 pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] | |
150 pfmul mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)] | |
151 | |
152 punpcklwd mm2,mm2 | |
153 punpcklwd mm3,mm3 | |
154 psrad mm2,(DWORD_BIT-WORD_BIT) | |
155 psrad mm3,(DWORD_BIT-WORD_BIT) | |
156 pi2fd mm2,mm2 | |
157 pi2fd mm3,mm3 | |
158 | |
159 pfmul mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)] | |
160 pfmul mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)] | |
161 | |
162 movq mm4,mm0 | |
163 movq mm5,mm1 | |
164 pfsub mm0,mm2 ; mm0=tmp11 | |
165 pfsub mm1,mm3 | |
166 pfadd mm4,mm2 ; mm4=tmp10 | |
167 pfadd mm5,mm3 ; mm5=tmp13 | |
168 | |
169 pfmul mm1,[GOTOFF(ebx,PD_1_414)] | |
170 pfsub mm1,mm5 ; mm1=tmp12 | |
171 | |
172 movq mm6,mm4 | |
173 movq mm7,mm0 | |
174 pfsub mm4,mm5 ; mm4=tmp3 | |
175 pfsub mm0,mm1 ; mm0=tmp2 | |
176 pfadd mm6,mm5 ; mm6=tmp0 | |
177 pfadd mm7,mm1 ; mm7=tmp1 | |
178 | |
; tmp2/tmp3 are spilled to the wk[] slots so that mm0 and mm4 can be reused
; by the odd part; tmp0/tmp1 stay live in mm6/mm7.
179 movq MMWORD [wk(1)], mm4 ; tmp3 | |
180 movq MMWORD [wk(0)], mm0 ; tmp2 | |
181 | |
182 ; -- Odd part | |
183 | |
; Load, sign-extend, convert and dequantize rows 1, 3, 5 and 7.
184 movd mm2, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] | |
185 movd mm3, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)] | |
186 movd mm5, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)] | |
187 movd mm1, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)] | |
188 | |
189 punpcklwd mm2,mm2 | |
190 punpcklwd mm3,mm3 | |
191 psrad mm2,(DWORD_BIT-WORD_BIT) | |
192 psrad mm3,(DWORD_BIT-WORD_BIT) | |
193 pi2fd mm2,mm2 | |
194 pi2fd mm3,mm3 | |
195 | |
196 pfmul mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)] | |
197 pfmul mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)] | |
198 | |
199 punpcklwd mm5,mm5 | |
200 punpcklwd mm1,mm1 | |
201 psrad mm5,(DWORD_BIT-WORD_BIT) | |
202 psrad mm1,(DWORD_BIT-WORD_BIT) | |
203 pi2fd mm5,mm5 | |
204 pi2fd mm1,mm1 | |
205 | |
206 pfmul mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)] | |
207 pfmul mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)] | |
208 | |
209 movq mm4,mm2 | |
210 movq mm0,mm5 | |
211 pfadd mm2,mm1 ; mm2=z11 | |
212 pfadd mm5,mm3 ; mm5=z13 | |
213 pfsub mm4,mm1 ; mm4=z12 | |
214 pfsub mm0,mm3 ; mm0=z10 | |
215 | |
216 movq mm1,mm2 | |
217 pfsub mm2,mm5 | |
218 pfadd mm1,mm5 ; mm1=tmp7 | |
219 | |
220 pfmul mm2,[GOTOFF(ebx,PD_1_414)] ; mm2=tmp11 | |
221 | |
222 movq mm3,mm0 | |
223 pfadd mm0,mm4 | |
224 pfmul mm0,[GOTOFF(ebx,PD_1_847)] ; mm0=z5 | |
225 pfmul mm3,[GOTOFF(ebx,PD_2_613)] ; mm3=(z10 * 2.613125930) | |
226 pfmul mm4,[GOTOFF(ebx,PD_1_082)] ; mm4=(z12 * 1.082392200) | |
; pfsubr is the reversed subtract: mm3 = mm0 - mm3.
227 pfsubr mm3,mm0 ; mm3=tmp12 | |
228 pfsub mm4,mm0 ; mm4=tmp10 | |
229 | |
230 ; -- Final output stage | |
231 | |
232 pfsub mm3,mm1 ; mm3=tmp6 | |
233 movq mm5,mm6 | |
234 movq mm0,mm7 | |
235 pfadd mm6,mm1 ; mm6=data0=(00 01) | |
236 pfadd mm7,mm3 ; mm7=data1=(10 11) | |
237 pfsub mm5,mm1 ; mm5=data7=(70 71) | |
238 pfsub mm0,mm3 ; mm0=data6=(60 61) | |
239 pfsub mm2,mm3 ; mm2=tmp5 | |
240 | |
; The 2x2 lane interleaves below regroup the results so that each input
; column is written out as one workspace row (see the lane comments).
241 movq mm1,mm6 ; transpose coefficients | |
242 punpckldq mm6,mm7 ; mm6=(00 10) | |
243 punpckhdq mm1,mm7 ; mm1=(01 11) | |
244 movq mm3,mm0 ; transpose coefficients | |
245 punpckldq mm0,mm5 ; mm0=(60 70) | |
246 punpckhdq mm3,mm5 ; mm3=(61 71) | |
247 | |
248 movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm6 | |
249 movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1 | |
250 movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0 | |
251 movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm3 | |
252 | |
253 movq mm7, MMWORD [wk(0)] ; mm7=tmp2 | |
254 movq mm5, MMWORD [wk(1)] ; mm5=tmp3 | |
255 | |
256 pfadd mm4,mm2 ; mm4=tmp4 | |
257 movq mm6,mm7 | |
258 movq mm1,mm5 | |
259 pfadd mm7,mm2 ; mm7=data2=(20 21) | |
260 pfadd mm5,mm4 ; mm5=data4=(40 41) | |
261 pfsub mm6,mm2 ; mm6=data5=(50 51) | |
262 pfsub mm1,mm4 ; mm1=data3=(30 31) | |
263 | |
264 movq mm0,mm7 ; transpose coefficients | |
265 punpckldq mm7,mm1 ; mm7=(20 30) | |
266 punpckhdq mm0,mm1 ; mm0=(21 31) | |
267 movq mm3,mm5 ; transpose coefficients | |
268 punpckldq mm5,mm6 ; mm5=(40 50) | |
269 punpckhdq mm3,mm6 ; mm3=(41 51) | |
270 | |
271 movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm7 | |
272 movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm0 | |
273 movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5 | |
274 movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm3 | |
275 | |
; Advance to the next pair of columns: two coefficients and two multipliers
; to the right, two workspace rows down.
276 .nextcolumn: | |
277 add esi, byte 2*SIZEOF_JCOEF ; coef_block | |
278 add edx, byte 2*SIZEOF_FLOAT_MULT_TYPE ; quantptr | |
279 add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr | |
280 dec ecx ; ctr | |
281 jnz near .columnloop | |
282 | |
283 ; -- Prefetch the next coefficient block | |
284 | |
; esi has advanced by DCTSIZE*SIZEOF_JCOEF during pass 1 (DCTSIZE/2
; iterations of 2*SIZEOF_JCOEF).  The literal 8 below stands in for DCTSIZE:
; with DCTSIZE == 8 the base address is coef_block + DCTSIZE2*SIZEOF_JCOEF,
; i.e. the first byte after the block just processed.
285 prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] | |
286 prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] | |
287 prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] | |
288 prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] | |
289 | |
290 ; ---- Pass 2: process rows from work array, store into output array. | |
291 | |
; eax was overwritten by the zero-column test, so the argument base is
; reloaded from the frame before output_buf/output_col are read.
292 mov eax, [original_ebp] | |
293 lea esi, [workspace] ; FAST_FLOAT * wsptr | |
294 mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) | |
295 mov eax, JDIMENSION [output_col(eax)] | |
296 mov ecx, DCTSIZE/2 ; ctr | |
297 alignx 16,7 | |
; Each iteration reads two floats from each of the eight workspace rows and
; produces two output rows of eight samples.  eax = output_col, edi walks
; the array of row pointers, esi advances by two floats per iteration.
298 .rowloop: | |
299 | |
300 ; -- Even part | |
301 | |
302 movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] | |
303 movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)] | |
304 movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)] | |
305 movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)] | |
306 | |
307 movq mm4,mm0 | |
308 movq mm5,mm1 | |
309 pfsub mm0,mm2 ; mm0=tmp11 | |
310 pfsub mm1,mm3 | |
311 pfadd mm4,mm2 ; mm4=tmp10 | |
312 pfadd mm5,mm3 ; mm5=tmp13 | |
313 | |
314 pfmul mm1,[GOTOFF(ebx,PD_1_414)] | |
315 pfsub mm1,mm5 ; mm1=tmp12 | |
316 | |
317 movq mm6,mm4 | |
318 movq mm7,mm0 | |
319 pfsub mm4,mm5 ; mm4=tmp3 | |
320 pfsub mm0,mm1 ; mm0=tmp2 | |
321 pfadd mm6,mm5 ; mm6=tmp0 | |
322 pfadd mm7,mm1 ; mm7=tmp1 | |
323 | |
; Same spill as in pass 1: tmp2/tmp3 go to wk[], tmp0/tmp1 stay in mm6/mm7.
324 movq MMWORD [wk(1)], mm4 ; tmp3 | |
325 movq MMWORD [wk(0)], mm0 ; tmp2 | |
326 | |
327 ; -- Odd part | |
328 | |
329 movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] | |
330 movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)] | |
331 movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)] | |
332 movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)] | |
333 | |
334 movq mm4,mm2 | |
335 movq mm0,mm5 | |
336 pfadd mm2,mm1 ; mm2=z11 | |
337 pfadd mm5,mm3 ; mm5=z13 | |
338 pfsub mm4,mm1 ; mm4=z12 | |
339 pfsub mm0,mm3 ; mm0=z10 | |
340 | |
341 movq mm1,mm2 | |
342 pfsub mm2,mm5 | |
343 pfadd mm1,mm5 ; mm1=tmp7 | |
344 | |
345 pfmul mm2,[GOTOFF(ebx,PD_1_414)] ; mm2=tmp11 | |
346 | |
347 movq mm3,mm0 | |
348 pfadd mm0,mm4 | |
349 pfmul mm0,[GOTOFF(ebx,PD_1_847)] ; mm0=z5 | |
350 pfmul mm3,[GOTOFF(ebx,PD_2_613)] ; mm3=(z10 * 2.613125930) | |
351 pfmul mm4,[GOTOFF(ebx,PD_1_082)] ; mm4=(z12 * 1.082392200) | |
352 pfsubr mm3,mm0 ; mm3=tmp12 | |
353 pfsub mm4,mm0 ; mm4=tmp10 | |
354 | |
355 ; -- Final output stage | |
356 | |
357 pfsub mm3,mm1 ; mm3=tmp6 | |
358 movq mm5,mm6 | |
359 movq mm0,mm7 | |
360 pfadd mm6,mm1 ; mm6=data0=(00 10) | |
361 pfadd mm7,mm3 ; mm7=data1=(01 11) | |
362 pfsub mm5,mm1 ; mm5=data7=(07 17) | |
363 pfsub mm0,mm3 ; mm0=data6=(06 16) | |
364 pfsub mm2,mm3 ; mm2=tmp5 | |
365 | |
; Float -> integer conversion without a convert instruction: adding
; PD_RNDINT_MAGIC leaves the rounded, descaled value in the low 16 bits of
; each dword.  pand keeps that word in place for one register, pslld moves
; it to the upper word for its neighbour, and por merges the pair.
366 movq mm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; mm1=[PD_RNDINT_MAGIC] | |
367 pcmpeqd mm3,mm3 | |
368 psrld mm3,WORD_BIT ; mm3={0xFFFF 0x0000 0xFFFF 0x0000} | |
369 | |
370 pfadd mm6,mm1 ; mm6=roundint(data0/8)=(00 ** 10 **) | |
371 pfadd mm7,mm1 ; mm7=roundint(data1/8)=(01 ** 11 **) | |
372 pfadd mm0,mm1 ; mm0=roundint(data6/8)=(06 ** 16 **) | |
373 pfadd mm5,mm1 ; mm5=roundint(data7/8)=(07 ** 17 **) | |
374 | |
375 pand mm6,mm3 ; mm6=(00 -- 10 --) | |
376 pslld mm7,WORD_BIT ; mm7=(-- 01 -- 11) | |
377 pand mm0,mm3 ; mm0=(06 -- 16 --) | |
378 pslld mm5,WORD_BIT ; mm5=(-- 07 -- 17) | |
379 por mm6,mm7 ; mm6=(00 01 10 11) | |
380 por mm0,mm5 ; mm0=(06 07 16 17) | |
381 | |
382 movq mm1, MMWORD [wk(0)] ; mm1=tmp2 | |
383 movq mm3, MMWORD [wk(1)] ; mm3=tmp3 | |
384 | |
385 pfadd mm4,mm2 ; mm4=tmp4 | |
386 movq mm7,mm1 | |
387 movq mm5,mm3 | |
388 pfadd mm1,mm2 ; mm1=data2=(02 12) | |
389 pfadd mm3,mm4 ; mm3=data4=(04 14) | |
390 pfsub mm7,mm2 ; mm7=data5=(05 15) | |
391 pfsub mm5,mm4 ; mm5=data3=(03 13) | |
392 | |
; The magic constant and the word mask are rebuilt here because mm1 and mm3
; were reused for tmp2/tmp3 above.
393 movq mm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; mm2=[PD_RNDINT_MAGIC] | |
394 pcmpeqd mm4,mm4 | |
395 psrld mm4,WORD_BIT ; mm4={0xFFFF 0x0000 0xFFFF 0x0000} | |
396 | |
397 pfadd mm3,mm2 ; mm3=roundint(data4/8)=(04 ** 14 **) | |
398 pfadd mm7,mm2 ; mm7=roundint(data5/8)=(05 ** 15 **) | |
399 pfadd mm1,mm2 ; mm1=roundint(data2/8)=(02 ** 12 **) | |
400 pfadd mm5,mm2 ; mm5=roundint(data3/8)=(03 ** 13 **) | |
401 | |
402 pand mm3,mm4 ; mm3=(04 -- 14 --) | |
403 pslld mm7,WORD_BIT ; mm7=(-- 05 -- 15) | |
404 pand mm1,mm4 ; mm1=(02 -- 12 --) | |
405 pslld mm5,WORD_BIT ; mm5=(-- 03 -- 13) | |
406 por mm3,mm7 ; mm3=(04 05 14 15) | |
407 por mm1,mm5 ; mm1=(02 03 12 13) | |
408 | |
409 movq mm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm2=[PB_CENTERJSAMP] | |
410 | |
; packsswb narrows the signed words to signed bytes with saturation; the
; byte-wise add of CENTERJSAMPLE then re-centres them (presumably into the
; unsigned sample range -- confirm against the CENTERJSAMPLE definition).
411 packsswb mm6,mm3 ; mm6=(00 01 10 11 04 05 14 15) | |
412 packsswb mm1,mm0 ; mm1=(02 03 12 13 06 07 16 17) | |
413 paddb mm6,mm2 | |
414 paddb mm1,mm2 | |
415 | |
416 movq mm4,mm6 ; transpose coefficients(phase 2) | |
417 punpcklwd mm6,mm1 ; mm6=(00 01 02 03 10 11 12 13) | |
418 punpckhwd mm4,mm1 ; mm4=(04 05 06 07 14 15 16 17) | |
419 | |
420 movq mm7,mm6 ; transpose coefficients(phase 3) | |
421 punpckldq mm6,mm4 ; mm6=(00 01 02 03 04 05 06 07) | |
422 punpckhdq mm7,mm4 ; mm7=(10 11 12 13 14 15 16 17) | |
423 | |
; ebx is borrowed as the second row pointer while the two finished rows are
; stored, then restored for the next iteration's GOTOFF accesses.
424 pushpic ebx ; save GOT address | |
425 | |
426 mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] | |
427 mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] | |
428 movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6 | |
429 movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7 | |
430 | |
431 poppic ebx ; restore GOT address | |
432 | |
433 add esi, byte 2*SIZEOF_FAST_FLOAT ; wsptr | |
434 add edi, byte 2*SIZEOF_JSAMPROW | |
435 dec ecx ; ctr | |
436 jnz near .rowloop | |
437 | |
438 femms ; empty MMX/3DNow! state | |
439 | |
; Epilogue: restore the saved registers, then unwind the aligned frame.
; [ebp] holds the stack pointer stored by the prologue, so "pop esp" reloads
; it and the following "pop ebp" restores the caller's ebp.
440 pop edi | |
441 pop esi | |
442 ; pop edx ; need not be preserved | |
443 ; pop ecx ; need not be preserved | |
444 pop ebx | |
445 mov esp,ebp ; esp <- aligned ebp | |
446 pop esp ; esp <- original ebp | |
447 pop ebp | |
448 ret | |
449 | |
450 ; For some reason, the OS X linker does not honor the request to align the | |
451 ; segment unless we do this. | |
452 align 16 | |
OLD | NEW |