OLD | NEW |
| (Empty) |
1 ; | |
2 ; jfmmxint.asm - accurate integer FDCT (MMX) | |
3 ; | |
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB | |
5 ; | |
6 ; Based on | |
7 ; x86 SIMD extension for IJG JPEG library | |
8 ; Copyright (C) 1999-2006, MIYASAKA Masaru. | |
9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc | |
10 ; | |
11 ; This file should be assembled with NASM (Netwide Assembler); it | |
12 ; can *not* be assembled with Microsoft's MASM or any compatible | |
13 ; assembler (including Borland's Turbo Assembler). | |
14 ; NASM is available from http://nasm.sourceforge.net/ or | |
15 ; http://sourceforge.net/project/showfiles.php?group_id=6208 | |
16 ; | |
17 ; This file contains a slow-but-accurate integer implementation of the | |
18 ; forward DCT (Discrete Cosine Transform). The following code is based | |
19 ; directly on the IJG's original jfdctint.c; see the jfdctint.c for | |
20 ; more details. | |
21 ; | |
22 ; [TAB8] | |
23 | |
24 %include "jsimdext.inc" | |
25 %include "jdct.inc" | |
26 | |
27 ; -------------------------------------------------------------------------- | |
28 | |
; Fixed-point configuration for the transform below.
;   CONST_BITS - number of fractional bits carried by the F_x_yyy multipliers.
;   PASS1_BITS - extra scaling the row pass leaves in its results (the row
;                pass shifts data0/data4 left by this amount; the column pass
;                shifts them right by it again).
;   DESCALE_P1 - right-shift count applied to multiplied results in pass 1.
;   DESCALE_P2 - right-shift count applied to multiplied results in pass 2.
29 %define CONST_BITS 13 | |
30 %define PASS1_BITS 2 | |
31 | |
32 %define DESCALE_P1 (CONST_BITS-PASS1_BITS) | |
33 %define DESCALE_P2 (CONST_BITS+PASS1_BITS) | |
34 | |
; F_x_yyy is the real constant x.yyy... scaled by 2^CONST_BITS and rounded to
; an integer (e.g. 0.541196100 * 8192 = 4433.48 -> 4433).  For the default
; CONST_BITS of 13 the values are spelled out literally.
35 %if CONST_BITS == 13 | |
36 F_0_298 equ 2446 ; FIX(0.298631336) | |
37 F_0_390 equ 3196 ; FIX(0.390180644) | |
38 F_0_541 equ 4433 ; FIX(0.541196100) | |
39 F_0_765 equ 6270 ; FIX(0.765366865) | |
40 F_0_899 equ 7373 ; FIX(0.899976223) | |
41 F_1_175 equ 9633 ; FIX(1.175875602) | |
42 F_1_501 equ 12299 ; FIX(1.501321110) | |
43 F_1_847 equ 15137 ; FIX(1.847759065) | |
44 F_1_961 equ 16069 ; FIX(1.961570560) | |
45 F_2_053 equ 16819 ; FIX(2.053119869) | |
46 F_2_562 equ 20995 ; FIX(2.562915447) | |
47 F_3_072 equ 25172 ; FIX(3.072711026) | |
48 %else | |
; For any other CONST_BITS the same constants are given pre-scaled by 2^30 and
; reduced with DESCALE(x,n), which adds half of 2^n and shifts right by n
; (round-to-nearest), with n = 30-CONST_BITS.
49 ; NASM cannot do compile-time arithmetic on floating-point constants. | |
50 %define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) | |
51 F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336) | |
52 F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644) | |
53 F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) | |
54 F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) | |
55 F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) | |
56 F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602) | |
57 F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110) | |
58 F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) | |
59 F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560) | |
60 F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869) | |
61 F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) | |
62 F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) | |
63 %endif | |
64 | |
65 ; -------------------------------------------------------------------------- | |
; Read-only constant table used by jsimd_fdct_islow_mmx.
;
; Each PW_* entry is "times 2 dw a, b": the word pair (a, b) repeated twice,
; i.e. four 16-bit words.  The function interleaves two word vectors with
; punpcklwd/punpckhwd and feeds the result to pmaddwd together with one of
; these entries, so each dword result is  first*a + second*b.
;
; The label suffixes abbreviate the combined multiplier values; an "M" prefix
; marks a negative value, e.g.
;   F130  = 0.541196100 + 0.765366865  (about  1.30)
;   MF130 = 0.541196100 - 1.847759065  (about -1.30)
;   MF078 = 1.175875602 - 1.961570560  (about -0.78)
;
; PD_DESCALE_P1 / PD_DESCALE_P2 hold 1 << (shift-1) in two dwords and are added
; before the psrad by DESCALE_P1 / DESCALE_P2 (round to nearest).
; PW_DESCALE_P2X holds 1 << (PASS1_BITS-1) in four words and is added before
; the psraw by PASS1_BITS in the column pass.
66 SECTION SEG_CONST | |
67 | |
68 alignz 16 | |
69 global EXTN(jconst_fdct_islow_mmx) PRIVATE | |
70 | |
71 EXTN(jconst_fdct_islow_mmx): | |
72 | |
73 PW_F130_F054 times 2 dw (F_0_541+F_0_765), F_0_541 | |
74 PW_F054_MF130 times 2 dw F_0_541, (F_0_541-F_1_847) | |
75 PW_MF078_F117 times 2 dw (F_1_175-F_1_961), F_1_175 | |
76 PW_F117_F078 times 2 dw F_1_175, (F_1_175-F_0_390) | |
77 PW_MF060_MF089 times 2 dw (F_0_298-F_0_899),-F_0_899 | |
78 PW_MF089_F060 times 2 dw -F_0_899, (F_1_501-F_0_899) | |
79 PW_MF050_MF256 times 2 dw (F_2_053-F_2_562),-F_2_562 | |
80 PW_MF256_F050 times 2 dw -F_2_562, (F_3_072-F_2_562) | |
81 PD_DESCALE_P1 times 2 dd 1 << (DESCALE_P1-1) | |
82 PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2-1) | |
83 PW_DESCALE_P2X times 4 dw 1 << (PASS1_BITS-1) | |
84 | |
85 alignz 16 | |
86 | |
87 ; -------------------------------------------------------------------------- | |
88 SECTION SEG_TEXT | |
89 BITS 32 | |
90 ; | |
91 ; Perform the forward DCT on one block of samples. | |
92 ; | |
93 ; GLOBAL(void) | |
94 ; jsimd_fdct_islow_mmx (DCTELEM * data) | |
95 ; | |
;
; Reviewer notes on the routine as written:
;   In:    one stack argument, read as POINTER [data(eax)] = [eax+8], where
;          eax is esp taken right after "push ebp" (so [eax+0] is the saved
;          ebp and [eax+4] is the return address).
;   Out:   nothing in registers.  The block addressed by `data` is rewritten
;          in place: every MMBLOCK slot that a pass loads is also stored to
;          by that pass.
;   Uses:  eax (frame anchor), ecx (loop counter), edx (block pointer),
;          ebx (GOT base; saved/restored with pushpic/poppic), mm0-mm7, and
;          WK_NUM mmword scratch slots wk(0)..wk(WK_NUM-1) located directly
;          below the aligned ebp.
;   Flow:  pass 1 (.rowloop) and pass 2 (.columnloop) each run DCTSIZE/4
;          times.  Each iteration transposes the loaded words in registers,
;          runs the butterfly/multiply network on four lanes at once, and
;          stores the results back.
;   The MMX state is cleared with emms before returning.
;
96 | |
; data(b)  - address of the `data` argument relative to base register b.
; wk(i)    - address of scratch mmword i; slots sit at negative offsets from
;            ebp, so WK_NUM (defined just below) fixes the frame size.
97 %define data(b) (b)+8 ; DCTELEM * data | |
98 | |
99 %define original_ebp ebp+0 | |
100 %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] | |
101 %define WK_NUM 2 | |
102 | |
103 align 16 | |
104 global EXTN(jsimd_fdct_islow_mmx) PRIVATE | |
105 | |
106 EXTN(jsimd_fdct_islow_mmx): | |
; Prologue: build an mmword-aligned frame.
;   - eax receives esp as it is right after "push ebp" (the comment on that
;     line calls this "original ebp"); it is the anchor used by data(eax).
;   - esp is lowered by 4 and rounded down to a SIZEOF_MMWORD boundary, eax is
;     stored at the new top of stack, and ebp is pointed at that slot, so
;     [ebp] lets the epilogue recover the unaligned stack pointer.
;   - "lea esp,[wk(0)]" reserves the WK_NUM scratch mmwords below ebp.
107 push ebp | |
108 mov eax,esp ; eax = original ebp | |
109 sub esp, byte 4 | |
110 and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits | |
111 mov [esp],eax | |
112 mov ebp,esp ; ebp = aligned ebp | |
113 lea esp, [wk(0)] | |
114 pushpic ebx | |
115 ; push ecx ; need not be preserved | |
116 ; push edx ; need not be preserved | |
117 ; push esi ; unused | |
118 ; push edi ; unused | |
119 | |
; ebx becomes the base for every GOTOFF(ebx,...) constant reference below.
; NOTE(review): eax is read again after this macro (data(eax)), so get_GOT is
; assumed not to modify eax - confirm against jsimdext.inc.
120 get_GOT ebx ; get GOT address | |
121 | |
122 ; ---- Pass 1: process rows. | |
123 | |
; edx = data pointer, ecx = number of loop iterations (DCTSIZE/4).
124 mov edx, POINTER [data(eax)] ; (DCTELEM *) | |
125 mov ecx, DCTSIZE/4 | |
126 alignx 16,7 | |
127 .rowloop: | |
128 | |
; Load the two halves (MMBLOCK column index 0 and 1) of rows 2 and 3.
129 movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)] | |
130 movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)] | |
131 movq mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)] | |
132 movq mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)] | |
133 | |
134 ; mm0=(20 21 22 23), mm2=(24 25 26 27) | |
135 ; mm1=(30 31 32 33), mm3=(34 35 36 37) | |
136 | |
137 movq mm4,mm0 ; transpose coefficients(phase 1) | |
138 punpcklwd mm0,mm1 ; mm0=(20 30 21 31) | |
139 punpckhwd mm4,mm1 ; mm4=(22 32 23 33) | |
140 movq mm5,mm2 ; transpose coefficients(phase 1) | |
141 punpcklwd mm2,mm3 ; mm2=(24 34 25 35) | |
142 punpckhwd mm5,mm3 ; mm5=(26 36 27 37) | |
143 | |
; Load the two halves of rows 0 and 1 (mm1/mm3 are free again here).
144 movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)] | |
145 movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)] | |
146 movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)] | |
147 movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)] | |
148 | |
149 ; mm6=(00 01 02 03), mm1=(04 05 06 07) | |
150 ; mm7=(10 11 12 13), mm3=(14 15 16 17) | |
151 | |
; Spill mm4 and mm2 to the scratch slots: both registers are overwritten by
; the next transpose step and their contents are reloaded further down.
152 movq MMWORD [wk(0)], mm4 ; wk(0)=(22 32 23 33) | |
153 movq MMWORD [wk(1)], mm2 ; wk(1)=(24 34 25 35) | |
154 | |
155 movq mm4,mm6 ; transpose coefficients(phase 1) | |
156 punpcklwd mm6,mm7 ; mm6=(00 10 01 11) | |
157 punpckhwd mm4,mm7 ; mm4=(02 12 03 13) | |
158 movq mm2,mm1 ; transpose coefficients(phase 1) | |
159 punpcklwd mm1,mm3 ; mm1=(04 14 05 15) | |
160 punpckhwd mm2,mm3 ; mm2=(06 16 07 17) | |
161 | |
; After phase 2 each register holds the same element index from all four
; rows, so every following word operation processes four rows in parallel.
162 movq mm7,mm6 ; transpose coefficients(phase 2) | |
163 punpckldq mm6,mm0 ; mm6=(00 10 20 30)=data0 | |
164 punpckhdq mm7,mm0 ; mm7=(01 11 21 31)=data1 | |
165 movq mm3,mm2 ; transpose coefficients(phase 2) | |
166 punpckldq mm2,mm5 ; mm2=(06 16 26 36)=data6 | |
167 punpckhdq mm3,mm5 ; mm3=(07 17 27 37)=data7 | |
168 | |
169 movq mm0,mm7 | |
170 movq mm5,mm6 | |
171 psubw mm7,mm2 ; mm7=data1-data6=tmp6 | |
172 psubw mm6,mm3 ; mm6=data0-data7=tmp7 | |
173 paddw mm0,mm2 ; mm0=data1+data6=tmp1 | |
174 paddw mm5,mm3 ; mm5=data0+data7=tmp0 | |
175 | |
; Swap through the scratch slots: fetch the spilled transpose halves and park
; tmp6/tmp7 there until the odd part needs them.
176 movq mm2, MMWORD [wk(0)] ; mm2=(22 32 23 33) | |
177 movq mm3, MMWORD [wk(1)] ; mm3=(24 34 25 35) | |
178 movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6 | |
179 movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7 | |
180 | |
181 movq mm7,mm4 ; transpose coefficients(phase 2) | |
182 punpckldq mm4,mm2 ; mm4=(02 12 22 32)=data2 | |
183 punpckhdq mm7,mm2 ; mm7=(03 13 23 33)=data3 | |
184 movq mm6,mm1 ; transpose coefficients(phase 2) | |
185 punpckldq mm1,mm3 ; mm1=(04 14 24 34)=data4 | |
186 punpckhdq mm6,mm3 ; mm6=(05 15 25 35)=data5 | |
187 | |
188 movq mm2,mm7 | |
189 movq mm3,mm4 | |
190 paddw mm7,mm1 ; mm7=data3+data4=tmp3 | |
191 paddw mm4,mm6 ; mm4=data2+data5=tmp2 | |
192 psubw mm2,mm1 ; mm2=data3-data4=tmp4 | |
193 psubw mm3,mm6 ; mm3=data2-data5=tmp5 | |
194 | |
195 ; -- Even part | |
196 | |
; mm2 (tmp4) and mm3 (tmp5) are left untouched through the even part; the odd
; part below consumes them.
197 movq mm1,mm5 | |
198 movq mm6,mm0 | |
199 paddw mm5,mm7 ; mm5=tmp10 | |
200 paddw mm0,mm4 ; mm0=tmp11 | |
201 psubw mm1,mm7 ; mm1=tmp13 | |
202 psubw mm6,mm4 ; mm6=tmp12 | |
203 | |
204 movq mm7,mm5 | |
205 paddw mm5,mm0 ; mm5=tmp10+tmp11 | |
206 psubw mm7,mm0 ; mm7=tmp10-tmp11 | |
207 | |
; data0/data4 involve no multiplication, so they are scaled up by PASS1_BITS
; to match the multiplied outputs, which are only shifted right by
; DESCALE_P1 = CONST_BITS-PASS1_BITS.
208 psllw mm5,PASS1_BITS ; mm5=data0 | |
209 psllw mm7,PASS1_BITS ; mm7=data4 | |
210 | |
; Results go to MMBLOCK(0,0) and MMBLOCK(0,1): output k of the four rows is
; written as one mmword, not back to the positions the inputs came from.
211 movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5 | |
212 movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm7 | |
213 | |
214 ; (Original) | |
215 ; z1 = (tmp12 + tmp13) * 0.541196100; | |
216 ; data2 = z1 + tmp13 * 0.765366865; | |
217 ; data6 = z1 + tmp12 * -1.847759065; | |
218 ; | |
219 ; (This implementation) | |
220 ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; | |
221 ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); | |
222 | |
; Interleave tmp13/tmp12 into (tmp13,tmp12) word pairs so that one pmaddwd
; with a (a,b) constant pair yields tmp13*a + tmp12*b as a 32-bit result.
; "L"/"H" in the comments are the low and high two lanes.
223 movq mm4,mm1 ; mm1=tmp13 | |
224 movq mm0,mm1 | |
225 punpcklwd mm4,mm6 ; mm6=tmp12 | |
226 punpckhwd mm0,mm6 | |
227 movq mm1,mm4 | |
228 movq mm6,mm0 | |
229 pmaddwd mm4,[GOTOFF(ebx,PW_F130_F054)] ; mm4=data2L | |
230 pmaddwd mm0,[GOTOFF(ebx,PW_F130_F054)] ; mm0=data2H | |
231 pmaddwd mm1,[GOTOFF(ebx,PW_F054_MF130)] ; mm1=data6L | |
232 pmaddwd mm6,[GOTOFF(ebx,PW_F054_MF130)] ; mm6=data6H | |
233 | |
; Round to nearest (add 1 << (DESCALE_P1-1)), then arithmetic shift right.
234 paddd mm4,[GOTOFF(ebx,PD_DESCALE_P1)] | |
235 paddd mm0,[GOTOFF(ebx,PD_DESCALE_P1)] | |
236 psrad mm4,DESCALE_P1 | |
237 psrad mm0,DESCALE_P1 | |
238 paddd mm1,[GOTOFF(ebx,PD_DESCALE_P1)] | |
239 paddd mm6,[GOTOFF(ebx,PD_DESCALE_P1)] | |
240 psrad mm1,DESCALE_P1 | |
241 psrad mm6,DESCALE_P1 | |
242 | |
; packssdw narrows the low/high dword pairs back to four signed words
; (with signed saturation).
243 packssdw mm4,mm0 ; mm4=data2 | |
244 packssdw mm1,mm6 ; mm1=data6 | |
245 | |
246 movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4 | |
247 movq MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm1 | |
248 | |
249 ; -- Odd part | |
250 | |
; Reload tmp6/tmp7 parked in the scratch slots during the transpose.
251 movq mm5, MMWORD [wk(0)] ; mm5=tmp6 | |
252 movq mm7, MMWORD [wk(1)] ; mm7=tmp7 | |
253 | |
254 movq mm0,mm2 ; mm2=tmp4 | |
255 movq mm6,mm3 ; mm3=tmp5 | |
256 paddw mm0,mm5 ; mm0=z3 | |
257 paddw mm6,mm7 ; mm6=z4 | |
258 | |
259 ; (Original) | |
260 ; z5 = (z3 + z4) * 1.175875602; | |
261 ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; | |
262 ; z3 += z5; z4 += z5; | |
263 ; | |
264 ; (This implementation) | |
265 ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; | |
266 ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); | |
267 | |
; Both right-hand sides above use the z3/z4 values from before the update; the
; code keeps copies of the interleaved (z3,z4) pairs for exactly that reason.
268 movq mm4,mm0 | |
269 movq mm1,mm0 | |
270 punpcklwd mm4,mm6 | |
271 punpckhwd mm1,mm6 | |
272 movq mm0,mm4 | |
273 movq mm6,mm1 | |
274 pmaddwd mm4,[GOTOFF(ebx,PW_MF078_F117)] ; mm4=z3L | |
275 pmaddwd mm1,[GOTOFF(ebx,PW_MF078_F117)] ; mm1=z3H | |
276 pmaddwd mm0,[GOTOFF(ebx,PW_F117_F078)] ; mm0=z4L | |
277 pmaddwd mm6,[GOTOFF(ebx,PW_F117_F078)] ; mm6=z4H | |
278 | |
; z3 (32-bit, not yet descaled) is kept in the scratch slots and z4 stays in
; mm0/mm6; each is added to two of the four odd outputs below.
279 movq MMWORD [wk(0)], mm4 ; wk(0)=z3L | |
280 movq MMWORD [wk(1)], mm1 ; wk(1)=z3H | |
281 | |
282 ; (Original) | |
283 ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; | |
284 ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; | |
285 ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; | |
286 ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; | |
287 ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; | |
288 ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; | |
289 ; | |
290 ; (This implementation) | |
291 ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; | |
292 ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; | |
293 ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); | |
294 ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); | |
295 ; data7 = tmp4 + z3; data5 = tmp5 + z4; | |
296 ; data3 = tmp6 + z3; data1 = tmp7 + z4; | |
297 | |
; First pair: (tmp4,tmp7) -> data7 and data1.
298 movq mm4,mm2 | |
299 movq mm1,mm2 | |
300 punpcklwd mm4,mm7 | |
301 punpckhwd mm1,mm7 | |
302 movq mm2,mm4 | |
303 movq mm7,mm1 | |
304 pmaddwd mm4,[GOTOFF(ebx,PW_MF060_MF089)] ; mm4=tmp4L | |
305 pmaddwd mm1,[GOTOFF(ebx,PW_MF060_MF089)] ; mm1=tmp4H | |
306 pmaddwd mm2,[GOTOFF(ebx,PW_MF089_F060)] ; mm2=tmp7L | |
307 pmaddwd mm7,[GOTOFF(ebx,PW_MF089_F060)] ; mm7=tmp7H | |
308 | |
309 paddd mm4, MMWORD [wk(0)] ; mm4=data7L | |
310 paddd mm1, MMWORD [wk(1)] ; mm1=data7H | |
311 paddd mm2,mm0 ; mm2=data1L | |
312 paddd mm7,mm6 ; mm7=data1H | |
313 | |
314 paddd mm4,[GOTOFF(ebx,PD_DESCALE_P1)] | |
315 paddd mm1,[GOTOFF(ebx,PD_DESCALE_P1)] | |
316 psrad mm4,DESCALE_P1 | |
317 psrad mm1,DESCALE_P1 | |
318 paddd mm2,[GOTOFF(ebx,PD_DESCALE_P1)] | |
319 paddd mm7,[GOTOFF(ebx,PD_DESCALE_P1)] | |
320 psrad mm2,DESCALE_P1 | |
321 psrad mm7,DESCALE_P1 | |
322 | |
323 packssdw mm4,mm1 ; mm4=data7 | |
324 packssdw mm2,mm7 ; mm2=data1 | |
325 | |
326 movq MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm4 | |
327 movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2 | |
328 | |
; Second pair: (tmp5,tmp6) -> data5 and data3; mm3 still holds tmp5 and mm5
; still holds tmp6 from the start of the odd part.
329 movq mm1,mm3 | |
330 movq mm7,mm3 | |
331 punpcklwd mm1,mm5 | |
332 punpckhwd mm7,mm5 | |
333 movq mm3,mm1 | |
334 movq mm5,mm7 | |
335 pmaddwd mm1,[GOTOFF(ebx,PW_MF050_MF256)] ; mm1=tmp5L | |
336 pmaddwd mm7,[GOTOFF(ebx,PW_MF050_MF256)] ; mm7=tmp5H | |
337 pmaddwd mm3,[GOTOFF(ebx,PW_MF256_F050)] ; mm3=tmp6L | |
338 pmaddwd mm5,[GOTOFF(ebx,PW_MF256_F050)] ; mm5=tmp6H | |
339 | |
340 paddd mm1,mm0 ; mm1=data5L | |
341 paddd mm7,mm6 ; mm7=data5H | |
342 paddd mm3, MMWORD [wk(0)] ; mm3=data3L | |
343 paddd mm5, MMWORD [wk(1)] ; mm5=data3H | |
344 | |
345 paddd mm1,[GOTOFF(ebx,PD_DESCALE_P1)] | |
346 paddd mm7,[GOTOFF(ebx,PD_DESCALE_P1)] | |
347 psrad mm1,DESCALE_P1 | |
348 psrad mm7,DESCALE_P1 | |
349 paddd mm3,[GOTOFF(ebx,PD_DESCALE_P1)] | |
350 paddd mm5,[GOTOFF(ebx,PD_DESCALE_P1)] | |
351 psrad mm3,DESCALE_P1 | |
352 psrad mm5,DESCALE_P1 | |
353 | |
354 packssdw mm1,mm7 ; mm1=data5 | |
355 packssdw mm3,mm5 ; mm3=data3 | |
356 | |
357 movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm1 | |
358 movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3 | |
359 | |
; Advance edx by four rows' worth of DCTELEMs and repeat until ecx reaches 0.
; NOTE(review): the explicit "near" is presumably needed because the loop body
; is too long for a short (8-bit displacement) conditional jump.
360 add edx, byte 4*DCTSIZE*SIZEOF_DCTELEM | |
361 dec ecx | |
362 jnz near .rowloop | |
363 | |
364 ; ---- Pass 2: process columns. | |
365 | |
; Restart from the beginning of the block; eax is still the frame anchor set
; in the prologue.
366 mov edx, POINTER [data(eax)] ; (DCTELEM *) | |
367 mov ecx, DCTSIZE/4 | |
368 alignx 16,7 | |
369 .columnloop: | |
370 | |
; This pass reads MMBLOCK rows 0..7 at column index 0 relative to edx, i.e. a
; strip that is four DCTELEMs wide and eight rows tall.
371 movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)] | |
372 movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)] | |
373 movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)] | |
374 movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)] | |
375 | |
376 ; mm0=(02 12 22 32), mm2=(42 52 62 72) | |
377 ; mm1=(03 13 23 33), mm3=(43 53 63 73) | |
378 | |
379 movq mm4,mm0 ; transpose coefficients(phase 1) | |
380 punpcklwd mm0,mm1 ; mm0=(02 03 12 13) | |
381 punpckhwd mm4,mm1 ; mm4=(22 23 32 33) | |
382 movq mm5,mm2 ; transpose coefficients(phase 1) | |
383 punpcklwd mm2,mm3 ; mm2=(42 43 52 53) | |
384 punpckhwd mm5,mm3 ; mm5=(62 63 72 73) | |
385 | |
386 movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)] | |
387 movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)] | |
388 movq mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)] | |
389 movq mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)] | |
390 | |
391 ; mm6=(00 10 20 30), mm1=(40 50 60 70) | |
392 ; mm7=(01 11 21 31), mm3=(41 51 61 71) | |
393 | |
; Same spill pattern as in pass 1: mm4 and mm2 are about to be overwritten.
394 movq MMWORD [wk(0)], mm4 ; wk(0)=(22 23 32 33) | |
395 movq MMWORD [wk(1)], mm2 ; wk(1)=(42 43 52 53) | |
396 | |
397 movq mm4,mm6 ; transpose coefficients(phase 1) | |
398 punpcklwd mm6,mm7 ; mm6=(00 01 10 11) | |
399 punpckhwd mm4,mm7 ; mm4=(20 21 30 31) | |
400 movq mm2,mm1 ; transpose coefficients(phase 1) | |
401 punpcklwd mm1,mm3 ; mm1=(40 41 50 51) | |
402 punpckhwd mm2,mm3 ; mm2=(60 61 70 71) | |
403 | |
404 movq mm7,mm6 ; transpose coefficients(phase 2) | |
405 punpckldq mm6,mm0 ; mm6=(00 01 02 03)=data0 | |
406 punpckhdq mm7,mm0 ; mm7=(10 11 12 13)=data1 | |
407 movq mm3,mm2 ; transpose coefficients(phase 2) | |
408 punpckldq mm2,mm5 ; mm2=(60 61 62 63)=data6 | |
409 punpckhdq mm3,mm5 ; mm3=(70 71 72 73)=data7 | |
410 | |
411 movq mm0,mm7 | |
412 movq mm5,mm6 | |
413 psubw mm7,mm2 ; mm7=data1-data6=tmp6 | |
414 psubw mm6,mm3 ; mm6=data0-data7=tmp7 | |
415 paddw mm0,mm2 ; mm0=data1+data6=tmp1 | |
416 paddw mm5,mm3 ; mm5=data0+data7=tmp0 | |
417 | |
; Fetch the spilled transpose halves and park tmp6/tmp7 in their place.
418 movq mm2, MMWORD [wk(0)] ; mm2=(22 23 32 33) | |
419 movq mm3, MMWORD [wk(1)] ; mm3=(42 43 52 53) | |
420 movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6 | |
421 movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7 | |
422 | |
423 movq mm7,mm4 ; transpose coefficients(phase 2) | |
424 punpckldq mm4,mm2 ; mm4=(20 21 22 23)=data2 | |
425 punpckhdq mm7,mm2 ; mm7=(30 31 32 33)=data3 | |
426 movq mm6,mm1 ; transpose coefficients(phase 2) | |
427 punpckldq mm1,mm3 ; mm1=(40 41 42 43)=data4 | |
428 punpckhdq mm6,mm3 ; mm6=(50 51 52 53)=data5 | |
429 | |
430 movq mm2,mm7 | |
431 movq mm3,mm4 | |
432 paddw mm7,mm1 ; mm7=data3+data4=tmp3 | |
433 paddw mm4,mm6 ; mm4=data2+data5=tmp2 | |
434 psubw mm2,mm1 ; mm2=data3-data4=tmp4 | |
435 psubw mm3,mm6 ; mm3=data2-data5=tmp5 | |
436 | |
437 ; -- Even part | |
438 | |
439 movq mm1,mm5 | |
440 movq mm6,mm0 | |
441 paddw mm5,mm7 ; mm5=tmp10 | |
442 paddw mm0,mm4 ; mm0=tmp11 | |
443 psubw mm1,mm7 ; mm1=tmp13 | |
444 psubw mm6,mm4 ; mm6=tmp12 | |
445 | |
446 movq mm7,mm5 | |
447 paddw mm5,mm0 ; mm5=tmp10+tmp11 | |
448 psubw mm7,mm0 ; mm7=tmp10-tmp11 | |
449 | |
; Unlike pass 1 (psllw), the unmultiplied outputs are scaled DOWN here:
; add 1 << (PASS1_BITS-1) for rounding, then arithmetic shift right by
; PASS1_BITS, which removes the scaling introduced by the row pass.
450 paddw mm5,[GOTOFF(ebx,PW_DESCALE_P2X)] | |
451 paddw mm7,[GOTOFF(ebx,PW_DESCALE_P2X)] | |
452 psraw mm5,PASS1_BITS ; mm5=data0 | |
453 psraw mm7,PASS1_BITS ; mm7=data4 | |
454 | |
455 movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5 | |
456 movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm7 | |
457 | |
458 ; (Original) | |
459 ; z1 = (tmp12 + tmp13) * 0.541196100; | |
460 ; data2 = z1 + tmp13 * 0.765366865; | |
461 ; data6 = z1 + tmp12 * -1.847759065; | |
462 ; | |
463 ; (This implementation) | |
464 ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; | |
465 ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); | |
466 | |
467 movq mm4,mm1 ; mm1=tmp13 | |
468 movq mm0,mm1 | |
469 punpcklwd mm4,mm6 ; mm6=tmp12 | |
470 punpckhwd mm0,mm6 | |
471 movq mm1,mm4 | |
472 movq mm6,mm0 | |
473 pmaddwd mm4,[GOTOFF(ebx,PW_F130_F054)] ; mm4=data2L | |
474 pmaddwd mm0,[GOTOFF(ebx,PW_F130_F054)] ; mm0=data2H | |
475 pmaddwd mm1,[GOTOFF(ebx,PW_F054_MF130)] ; mm1=data6L | |
476 pmaddwd mm6,[GOTOFF(ebx,PW_F054_MF130)] ; mm6=data6H | |
477 | |
; Pass 2 descale: round with 1 << (DESCALE_P2-1) and shift right by
; DESCALE_P2 = CONST_BITS+PASS1_BITS.
478 paddd mm4,[GOTOFF(ebx,PD_DESCALE_P2)] | |
479 paddd mm0,[GOTOFF(ebx,PD_DESCALE_P2)] | |
480 psrad mm4,DESCALE_P2 | |
481 psrad mm0,DESCALE_P2 | |
482 paddd mm1,[GOTOFF(ebx,PD_DESCALE_P2)] | |
483 paddd mm6,[GOTOFF(ebx,PD_DESCALE_P2)] | |
484 psrad mm1,DESCALE_P2 | |
485 psrad mm6,DESCALE_P2 | |
486 | |
487 packssdw mm4,mm0 ; mm4=data2 | |
488 packssdw mm1,mm6 ; mm1=data6 | |
489 | |
490 movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4 | |
491 movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm1 | |
492 | |
493 ; -- Odd part | |
494 | |
495 movq mm5, MMWORD [wk(0)] ; mm5=tmp6 | |
496 movq mm7, MMWORD [wk(1)] ; mm7=tmp7 | |
497 | |
498 movq mm0,mm2 ; mm2=tmp4 | |
499 movq mm6,mm3 ; mm3=tmp5 | |
500 paddw mm0,mm5 ; mm0=z3 | |
501 paddw mm6,mm7 ; mm6=z4 | |
502 | |
503 ; (Original) | |
504 ; z5 = (z3 + z4) * 1.175875602; | |
505 ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; | |
506 ; z3 += z5; z4 += z5; | |
507 ; | |
508 ; (This implementation) | |
509 ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; | |
510 ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); | |
511 | |
512 movq mm4,mm0 | |
513 movq mm1,mm0 | |
514 punpcklwd mm4,mm6 | |
515 punpckhwd mm1,mm6 | |
516 movq mm0,mm4 | |
517 movq mm6,mm1 | |
518 pmaddwd mm4,[GOTOFF(ebx,PW_MF078_F117)] ; mm4=z3L | |
519 pmaddwd mm1,[GOTOFF(ebx,PW_MF078_F117)] ; mm1=z3H | |
520 pmaddwd mm0,[GOTOFF(ebx,PW_F117_F078)] ; mm0=z4L | |
521 pmaddwd mm6,[GOTOFF(ebx,PW_F117_F078)] ; mm6=z4H | |
522 | |
; As in pass 1, z3 lives in the scratch slots and z4 in mm0/mm6 while the four
; odd outputs are formed.
523 movq MMWORD [wk(0)], mm4 ; wk(0)=z3L | |
524 movq MMWORD [wk(1)], mm1 ; wk(1)=z3H | |
525 | |
526 ; (Original) | |
527 ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; | |
528 ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; | |
529 ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; | |
530 ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; | |
531 ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; | |
532 ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; | |
533 ; | |
534 ; (This implementation) | |
535 ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; | |
536 ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; | |
537 ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); | |
538 ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); | |
539 ; data7 = tmp4 + z3; data5 = tmp5 + z4; | |
540 ; data3 = tmp6 + z3; data1 = tmp7 + z4; | |
541 | |
; First pair: (tmp4,tmp7) -> data7 and data1.
542 movq mm4,mm2 | |
543 movq mm1,mm2 | |
544 punpcklwd mm4,mm7 | |
545 punpckhwd mm1,mm7 | |
546 movq mm2,mm4 | |
547 movq mm7,mm1 | |
548 pmaddwd mm4,[GOTOFF(ebx,PW_MF060_MF089)] ; mm4=tmp4L | |
549 pmaddwd mm1,[GOTOFF(ebx,PW_MF060_MF089)] ; mm1=tmp4H | |
550 pmaddwd mm2,[GOTOFF(ebx,PW_MF089_F060)] ; mm2=tmp7L | |
551 pmaddwd mm7,[GOTOFF(ebx,PW_MF089_F060)] ; mm7=tmp7H | |
552 | |
553 paddd mm4, MMWORD [wk(0)] ; mm4=data7L | |
554 paddd mm1, MMWORD [wk(1)] ; mm1=data7H | |
555 paddd mm2,mm0 ; mm2=data1L | |
556 paddd mm7,mm6 ; mm7=data1H | |
557 | |
558 paddd mm4,[GOTOFF(ebx,PD_DESCALE_P2)] | |
559 paddd mm1,[GOTOFF(ebx,PD_DESCALE_P2)] | |
560 psrad mm4,DESCALE_P2 | |
561 psrad mm1,DESCALE_P2 | |
562 paddd mm2,[GOTOFF(ebx,PD_DESCALE_P2)] | |
563 paddd mm7,[GOTOFF(ebx,PD_DESCALE_P2)] | |
564 psrad mm2,DESCALE_P2 | |
565 psrad mm7,DESCALE_P2 | |
566 | |
567 packssdw mm4,mm1 ; mm4=data7 | |
568 packssdw mm2,mm7 ; mm2=data1 | |
569 | |
570 movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm4 | |
571 movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2 | |
572 | |
; Second pair: (tmp5,tmp6) -> data5 and data3.
573 movq mm1,mm3 | |
574 movq mm7,mm3 | |
575 punpcklwd mm1,mm5 | |
576 punpckhwd mm7,mm5 | |
577 movq mm3,mm1 | |
578 movq mm5,mm7 | |
579 pmaddwd mm1,[GOTOFF(ebx,PW_MF050_MF256)] ; mm1=tmp5L | |
580 pmaddwd mm7,[GOTOFF(ebx,PW_MF050_MF256)] ; mm7=tmp5H | |
581 pmaddwd mm3,[GOTOFF(ebx,PW_MF256_F050)] ; mm3=tmp6L | |
582 pmaddwd mm5,[GOTOFF(ebx,PW_MF256_F050)] ; mm5=tmp6H | |
583 | |
584 paddd mm1,mm0 ; mm1=data5L | |
585 paddd mm7,mm6 ; mm7=data5H | |
586 paddd mm3, MMWORD [wk(0)] ; mm3=data3L | |
587 paddd mm5, MMWORD [wk(1)] ; mm5=data3H | |
588 | |
589 paddd mm1,[GOTOFF(ebx,PD_DESCALE_P2)] | |
590 paddd mm7,[GOTOFF(ebx,PD_DESCALE_P2)] | |
591 psrad mm1,DESCALE_P2 | |
592 psrad mm7,DESCALE_P2 | |
593 paddd mm3,[GOTOFF(ebx,PD_DESCALE_P2)] | |
594 paddd mm5,[GOTOFF(ebx,PD_DESCALE_P2)] | |
595 psrad mm3,DESCALE_P2 | |
596 psrad mm5,DESCALE_P2 | |
597 | |
598 packssdw mm1,mm7 ; mm1=data5 | |
599 packssdw mm3,mm5 ; mm3=data3 | |
600 | |
601 movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm1 | |
602 movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3 | |
603 | |
; Advance edx by four DCTELEMs (the next four-wide strip) and loop.
604 add edx, byte 4*SIZEOF_DCTELEM | |
605 dec ecx | |
606 jnz near .columnloop | |
607 | |
; Leave MMX mode before returning so the caller can use x87 floating point.
608 emms ; empty MMX state | |
609 | |
610 ; pop edi ; unused | |
611 ; pop esi ; unused | |
612 ; pop edx ; need not be preserved | |
613 ; pop ecx ; need not be preserved | |
; Epilogue: undo the prologue in reverse.  [ebp] holds the value stored there
; by "mov [esp],eax" (esp as it was right after "push ebp"), so "pop esp"
; restores the unaligned stack pointer and "pop ebp" then restores the
; caller's ebp.
614 poppic ebx | |
615 mov esp,ebp ; esp <- aligned ebp | |
616 pop esp ; esp <- original ebp | |
617 pop ebp | |
618 ret | |
619 | |
620 ; For some reason, the OS X linker does not honor the request to align the | |
621 ; segment unless we do this. | |
622 align 16 | |
OLD | NEW |