;
; jf3dnflt.asm - floating-point FDCT (3DNow!)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler); it
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a floating-point implementation of the forward DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jfdctflt.c; see jfdctflt.c for more details.
;
; [TAB8]
22 | |
; Project include files.  NOTE(review): the macros used below (EXTN, PRIVATE,
; SEG_CONST/SEG_TEXT, alignz/alignx, MMBLOCK, GOTOFF, pushpic/poppic, get_GOT,
; DCTSIZE, SIZEOF_*) are presumably supplied by these two files - confirm.
%include "jsimdext.inc"
%include "jdct.inc"
25 | |
; --------------------------------------------------------------------------
        SECTION SEG_CONST

; Constant table for jsimd_fdct_float_3dnow.
;
; Each multiplier is emitted twice ("times 2 dd") so that one 64-bit memory
; operand carries the same single-precision value in both lanes of a pfmul.
; With c_k = cos(k*pi/16):
;   PD_0_382 = c6        PD_0_707 = c4
;   PD_0_541 = c2 - c6   PD_1_306 = c2 + c6
; The function addresses these through GOTOFF(ebx, PD_xxx).

        alignz  16
        global  EXTN(jconst_fdct_float_3dnow) PRIVATE

EXTN(jconst_fdct_float_3dnow):

PD_0_382        times 2 dd 0.382683432365089771728460
PD_0_707        times 2 dd 0.707106781186547524400844
PD_0_541        times 2 dd 0.541196100146196984399723
PD_1_306        times 2 dd 1.306562964876376527856643

        alignz  16
40 | |
; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; Perform the forward DCT on one block of samples.
;
; GLOBAL(void)
; jsimd_fdct_float_3dnow (FAST_FLOAT * data)
;
; In:    data - pointer to the coefficient block, read from the stack at
;               [frame+8] (first stack argument after the return address).
;               The block is transformed in place.
;               NOTE(review): the register comments index rows/columns 0..7,
;               so DCTSIZE is expected to be 8 and FAST_FLOAT a 4-byte
;               float - confirm against jdct.inc.
; Out:   nothing is returned in a register; results are written to *data.
; Uses:  eax, ecx, edx, mm0-mm7 and flags are overwritten.  ebx is loaded by
;        get_GOT and bracketed by pushpic/poppic.  femms is executed before
;        returning.
; Stack: WK_NUM (2) mmword scratch slots, wk(0) and wk(1), on a frame that is
;        realigned to SIZEOF_MMWORD.
;
; Structure: two passes over the block, each handling two rows (pass 1) or
; two columns (pass 2) per iteration, one per 3DNow! lane.  Pass 1 stores
; every 2x2 sub-block transposed; pass 2 loads those sub-blocks, transposes
; them back, and stores its results in natural order.
;

; data(b): address of the "data" argument relative to frame base b, where b
; points at the saved ebp (+4 = return address, +8 = first argument).
%define data(b)         (b)+8           ; FAST_FLOAT * data

%define original_ebp    ebp+0
; wk(i): i-th mmword scratch slot, located just below the aligned ebp.
%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
%define WK_NUM          2

        align   16
        global  EXTN(jsimd_fdct_float_3dnow) PRIVATE

EXTN(jsimd_fdct_float_3dnow):
        ; ---- Prologue: build an mmword-aligned frame.
        ; eax keeps the unaligned frame base (address of the saved ebp) and
        ; must stay intact until pass 2 has re-read data(eax).
        push    ebp
        mov     eax,esp                         ; eax = original ebp
        sub     esp, byte 4
        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
        mov     [esp],eax                       ; remember unaligned base for epilogue
        mov     ebp,esp                         ; ebp = aligned ebp
        lea     esp, [wk(0)]                    ; reserve wk[0..WK_NUM-1]
        pushpic ebx
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
;       push    esi             ; unused
;       push    edi             ; unused

        get_GOT ebx                     ; get GOT address

        ; ---- Pass 1: process rows.
        ; edx = pointer to the current pair of rows, ecx = row pairs left.

        mov     edx, POINTER [data(eax)]        ; (FAST_FLOAT *)
        mov     ecx, DCTSIZE/2
        alignx  16,7
.rowloop:

        movq    mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
        movq    mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
        movq    mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
        movq    mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]

        ; mm0=(00 01), mm1=(10 11), mm2=(06 07), mm3=(16 17)

        movq      mm4,mm0               ; transpose coefficients
        punpckldq mm0,mm1               ; mm0=(00 10)=data0
        punpckhdq mm4,mm1               ; mm4=(01 11)=data1
        movq      mm5,mm2               ; transpose coefficients
        punpckldq mm2,mm3               ; mm2=(06 16)=data6
        punpckhdq mm5,mm3               ; mm5=(07 17)=data7

        movq    mm6,mm4
        movq    mm7,mm0
        pfsub   mm4,mm2                 ; mm4=data1-data6=tmp6
        pfsub   mm0,mm5                 ; mm0=data0-data7=tmp7
        pfadd   mm6,mm2                 ; mm6=data1+data6=tmp1
        pfadd   mm7,mm5                 ; mm7=data0+data7=tmp0

        movq    mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
        movq    mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
        movq    mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
        movq    mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]

        ; mm1=(02 03), mm3=(12 13), mm2=(04 05), mm5=(14 15)

        ; Spill tmp6/tmp7: all eight MMX registers are needed for the even
        ; part; they are reloaded at the start of the odd part.
        movq    MMWORD [wk(0)], mm4     ; wk(0)=tmp6
        movq    MMWORD [wk(1)], mm0     ; wk(1)=tmp7

        movq      mm4,mm1               ; transpose coefficients
        punpckldq mm1,mm3               ; mm1=(02 12)=data2
        punpckhdq mm4,mm3               ; mm4=(03 13)=data3
        movq      mm0,mm2               ; transpose coefficients
        punpckldq mm2,mm5               ; mm2=(04 14)=data4
        punpckhdq mm0,mm5               ; mm0=(05 15)=data5

        movq    mm3,mm4
        movq    mm5,mm1
        pfadd   mm4,mm2                 ; mm4=data3+data4=tmp3
        pfadd   mm1,mm0                 ; mm1=data2+data5=tmp2
        pfsub   mm3,mm2                 ; mm3=data3-data4=tmp4
        pfsub   mm5,mm0                 ; mm5=data2-data5=tmp5

        ; -- Even part

        movq    mm2,mm7
        movq    mm0,mm6
        pfsub   mm7,mm4                 ; mm7=tmp13
        pfsub   mm6,mm1                 ; mm6=tmp12
        pfadd   mm2,mm4                 ; mm2=tmp10
        pfadd   mm0,mm1                 ; mm0=tmp11

        pfadd   mm6,mm7
        pfmul   mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1

        movq    mm4,mm2
        movq    mm1,mm7
        pfsub   mm2,mm0                 ; mm2=data4
        pfsub   mm7,mm6                 ; mm7=data6
        pfadd   mm4,mm0                 ; mm4=data0
        pfadd   mm1,mm6                 ; mm1=data2

        ; Even outputs go to the first row of the pair; each mmword holds
        ; one output index for both rows (2x2 sub-blocks stay transposed).
        movq    MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2
        movq    MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7
        movq    MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
        movq    MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1

        ; -- Odd part

        movq    mm0, MMWORD [wk(0)]     ; mm0=tmp6
        movq    mm6, MMWORD [wk(1)]     ; mm6=tmp7

        pfadd   mm3,mm5                 ; mm3=tmp10
        pfadd   mm5,mm0                 ; mm5=tmp11
        pfadd   mm0,mm6                 ; mm0=tmp12, mm6=tmp7

        pfmul   mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3

        movq    mm2,mm3                 ; mm2=tmp10
        pfsub   mm3,mm0
        pfmul   mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5
        pfmul   mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
        pfmul   mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
        pfadd   mm2,mm3                 ; mm2=z2
        pfadd   mm0,mm3                 ; mm0=z4

        movq    mm7,mm6
        pfsub   mm6,mm5                 ; mm6=z13
        pfadd   mm7,mm5                 ; mm7=z11

        movq    mm4,mm6
        movq    mm1,mm7
        pfsub   mm6,mm2                 ; mm6=data3
        pfsub   mm7,mm0                 ; mm7=data7
        pfadd   mm4,mm2                 ; mm4=data5
        pfadd   mm1,mm0                 ; mm1=data1

        ; Odd outputs go to the second row of the pair, same pairing.
        movq    MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6
        movq    MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7
        movq    MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4
        movq    MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1

        add     edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT   ; advance two rows
        dec     ecx
        jnz     near .rowloop

        ; ---- Pass 2: process columns.
        ; edx = pointer to the current pair of columns, ecx = pairs left.
        ; The loads see the transposed 2x2 sub-blocks written by pass 1.

        mov     edx, POINTER [data(eax)]        ; (FAST_FLOAT *)
        mov     ecx, DCTSIZE/2
        alignx  16,7
.columnloop:

        movq    mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
        movq    mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
        movq    mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
        movq    mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]

        ; mm0=(00 10), mm1=(01 11), mm2=(60 70), mm3=(61 71)

        movq      mm4,mm0               ; transpose coefficients
        punpckldq mm0,mm1               ; mm0=(00 01)=data0
        punpckhdq mm4,mm1               ; mm4=(10 11)=data1
        movq      mm5,mm2               ; transpose coefficients
        punpckldq mm2,mm3               ; mm2=(60 61)=data6
        punpckhdq mm5,mm3               ; mm5=(70 71)=data7

        movq    mm6,mm4
        movq    mm7,mm0
        pfsub   mm4,mm2                 ; mm4=data1-data6=tmp6
        pfsub   mm0,mm5                 ; mm0=data0-data7=tmp7
        pfadd   mm6,mm2                 ; mm6=data1+data6=tmp1
        pfadd   mm7,mm5                 ; mm7=data0+data7=tmp0

        movq    mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
        movq    mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
        movq    mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
        movq    mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]

        ; mm1=(20 30), mm3=(21 31), mm2=(40 50), mm5=(41 51)

        ; Spill tmp6/tmp7 across the even part, as in pass 1.
        movq    MMWORD [wk(0)], mm4     ; wk(0)=tmp6
        movq    MMWORD [wk(1)], mm0     ; wk(1)=tmp7

        movq      mm4,mm1               ; transpose coefficients
        punpckldq mm1,mm3               ; mm1=(20 21)=data2
        punpckhdq mm4,mm3               ; mm4=(30 31)=data3
        movq      mm0,mm2               ; transpose coefficients
        punpckldq mm2,mm5               ; mm2=(40 41)=data4
        punpckhdq mm0,mm5               ; mm0=(50 51)=data5

        movq    mm3,mm4
        movq    mm5,mm1
        pfadd   mm4,mm2                 ; mm4=data3+data4=tmp3
        pfadd   mm1,mm0                 ; mm1=data2+data5=tmp2
        pfsub   mm3,mm2                 ; mm3=data3-data4=tmp4
        pfsub   mm5,mm0                 ; mm5=data2-data5=tmp5

        ; -- Even part

        movq    mm2,mm7
        movq    mm0,mm6
        pfsub   mm7,mm4                 ; mm7=tmp13
        pfsub   mm6,mm1                 ; mm6=tmp12
        pfadd   mm2,mm4                 ; mm2=tmp10
        pfadd   mm0,mm1                 ; mm0=tmp11

        pfadd   mm6,mm7
        pfmul   mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1

        movq    mm4,mm2
        movq    mm1,mm7
        pfsub   mm2,mm0                 ; mm2=data4
        pfsub   mm7,mm6                 ; mm7=data6
        pfadd   mm4,mm0                 ; mm4=data0
        pfadd   mm1,mm6                 ; mm1=data2

        ; Results are stored to their own rows (natural order).
        movq    MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2
        movq    MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7
        movq    MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
        movq    MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1

        ; -- Odd part

        movq    mm0, MMWORD [wk(0)]     ; mm0=tmp6
        movq    mm6, MMWORD [wk(1)]     ; mm6=tmp7

        pfadd   mm3,mm5                 ; mm3=tmp10
        pfadd   mm5,mm0                 ; mm5=tmp11
        pfadd   mm0,mm6                 ; mm0=tmp12, mm6=tmp7

        pfmul   mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3

        movq    mm2,mm3                 ; mm2=tmp10
        pfsub   mm3,mm0
        pfmul   mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5
        pfmul   mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
        pfmul   mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
        pfadd   mm2,mm3                 ; mm2=z2
        pfadd   mm0,mm3                 ; mm0=z4

        movq    mm7,mm6
        pfsub   mm6,mm5                 ; mm6=z13
        pfadd   mm7,mm5                 ; mm7=z11

        movq    mm4,mm6
        movq    mm1,mm7
        pfsub   mm6,mm2                 ; mm6=data3
        pfsub   mm7,mm0                 ; mm7=data7
        pfadd   mm4,mm2                 ; mm4=data5
        pfadd   mm1,mm0                 ; mm1=data1

        movq    MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6
        movq    MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7
        movq    MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4
        movq    MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1

        add     edx, byte 2*SIZEOF_FAST_FLOAT   ; advance two columns
        dec     ecx
        jnz     near .columnloop

        femms                           ; empty MMX/3DNow! state

        ; ---- Epilogue: undo the aligned frame.
;       pop     edi             ; unused
;       pop     esi             ; unused
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
        poppic  ebx
        mov     esp,ebp                 ; esp <- aligned ebp
        pop     esp                     ; esp <- original ebp
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16