Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(10)

Side by Side Diff: third_party/libjpeg_turbo/simd/jf3dnflt.asm

Issue 4134011: Adds libjpeg-turbo to deps... (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/
Patch Set: Created 10 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 ;
2 ; jf3dnflt.asm - floating-point FDCT (3DNow!)
3 ;
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5 ;
6 ; Based on
7 ; x86 SIMD extension for IJG JPEG library
8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
10 ;
11 ; This file should be assembled with NASM (Netwide Assembler),
12 ; can *not* be assembled with Microsoft's MASM or any compatible
13 ; assembler (including Borland's Turbo Assembler).
14 ; NASM is available from http://nasm.sourceforge.net/ or
15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
16 ;
17 ; This file contains a floating-point implementation of the forward DCT
18 ; (Discrete Cosine Transform). The following code is based directly on
19 ; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
20 ;
21 ; [TAB8]
22
23 %include "jsimdext.inc"
24 %include "jdct.inc"
25
26 ; --------------------------------------------------------------------------
27 SECTION SEG_CONST ; read-only multiplier table for the FDCT routine in this file
28 ; Each PD_* entry is the same dword float emitted twice ("times 2 dd"), so one 8-byte operand feeds both lanes of a pfmul.
29 alignz 16 ; alignz is not defined in this file (presumably jsimdext.inc) -- requests 16-byte alignment of the table
30 global EXTN(jconst_fdct_float_3dnow)
31 ; Exported label marking the start of the table; the code in this file reaches the entries through the PD_* labels.
32 EXTN(jconst_fdct_float_3dnow):
33
34 PD_0_382 times 2 dd 0.382683432365089771728460 ; cos(3*pi/8); scales (tmp10 - tmp12) to give z5
35 PD_0_707 times 2 dd 0.707106781186547524400844 ; cos(pi/4); gives z1 in the even part and z3 in the odd part
36 PD_0_541 times 2 dd 0.541196100146196984399723 ; cos(pi/8) - cos(3*pi/8); multiplies tmp10 before z5 is added (z2)
37 PD_1_306 times 2 dd 1.306562964876376527856643 ; cos(pi/8) + cos(3*pi/8); multiplies tmp12 before z5 is added (z4)
38
39 alignz 16 ; second alignment request after the table (macro from an include; presumably pads the section end)
40
41 ; --------------------------------------------------------------------------
42 SECTION SEG_TEXT
43 BITS 32
44 ; jsimd_fdct_float_3dnow -- forward DCT of one block, computed in place with 3DNow! packed floats.
45 ; Perform the forward DCT on one block of samples.
46 ; Every value is loaded from and stored back to the buffer addressed by `data`; nothing is returned.
47 ; GLOBAL(void)
48 ; jsimd_fdct_float_3dnow (FAST_FLOAT * data)
49 ; Registers written: eax, ecx, edx, mm0-mm7 (ebx goes through the pushpic/poppic macros); flags are clobbered.
50 ; Stack: WK_NUM mmwords of scratch below an ebp that the prologue aligns to SIZEOF_MMWORD.
51 %define data(b) (b)+8 ; FAST_FLOAT * data -- b+0 = saved ebp, b+4 = return address, b+8 = first stack argument
52
53 %define original_ebp ebp+0 ; frame slot written by "mov [esp],eax" in the prologue; the name itself is not used below
54 %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
55 %define WK_NUM 2 ; two spill slots: wk(0) carries tmp6 and wk(1) carries tmp7
56
57 align 16
58 global EXTN(jsimd_fdct_float_3dnow)
59
60 EXTN(jsimd_fdct_float_3dnow):
61 push ebp ; save the caller's ebp
62 mov eax,esp ; eax = address of the saved ebp ("original ebp"); base used by data(eax)
63 sub esp, byte 4 ; leave room for one dword (the copy of eax) before aligning
64 and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
65 mov [esp],eax ; remember the pre-alignment stack pointer; "pop esp" in the epilogue reloads it
66 mov ebp,esp ; ebp = aligned ebp
67 lea esp, [wk(0)] ; esp = ebp - WK_NUM*SIZEOF_MMWORD, i.e. reserve the wk[] scratch area
68 pushpic ebx ; macro from an include -- presumably saves ebx only in PIC builds (TODO confirm)
69 ; push ecx ; need not be preserved
70 ; push edx ; need not be preserved
71 ; push esi ; unused
72 ; push edi ; unused
73
74 get_GOT ebx ; get GOT address
75 ; NOTE: eax is left untouched through pass 1 because pass 2 re-reads the argument via data(eax).
76 ; ---- Pass 1: process rows.
77 ; Two rows per iteration: one row in each 32-bit lane of the mm registers.
78 mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
79 mov ecx, DCTSIZE/2 ; iteration count: each pass through the loop consumes two rows
80 alignx 16,7
81 .rowloop:
82 ; MMBLOCK comes from an include; going by the register comments, MMBLOCK(r,p,...) = row r, float pair p (TODO confirm).
83 movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
84 movq mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
85 movq mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
86 movq mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
87
88 ; mm0=(00 01), mm1=(10 11), mm2=(06 07), mm3=(16 17)
89
90 movq mm4,mm0 ; transpose coefficients
91 punpckldq mm0,mm1 ; mm0=(00 10)=data0
92 punpckhdq mm4,mm1 ; mm4=(01 11)=data1
93 movq mm5,mm2 ; transpose coefficients
94 punpckldq mm2,mm3 ; mm2=(06 16)=data6
95 punpckhdq mm5,mm3 ; mm5=(07 17)=data7
96
97 movq mm6,mm4
98 movq mm7,mm0
99 pfsub mm4,mm2 ; mm4=data1-data6=tmp6
100 pfsub mm0,mm5 ; mm0=data0-data7=tmp7
101 pfadd mm6,mm2 ; mm6=data1+data6=tmp1
102 pfadd mm7,mm5 ; mm7=data0+data7=tmp0
103
104 movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
105 movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
106 movq mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
107 movq mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
108
109 ; mm1=(02 03), mm3=(12 13), mm2=(04 05), mm5=(14 15)
110 ; All eight mm registers are live here, so tmp6/tmp7 are spilled to free mm4 and mm0.
111 movq MMWORD [wk(0)], mm4 ; wk(0)=tmp6
112 movq MMWORD [wk(1)], mm0 ; wk(1)=tmp7
113
114 movq mm4,mm1 ; transpose coefficients
115 punpckldq mm1,mm3 ; mm1=(02 12)=data2
116 punpckhdq mm4,mm3 ; mm4=(03 13)=data3
117 movq mm0,mm2 ; transpose coefficients
118 punpckldq mm2,mm5 ; mm2=(04 14)=data4
119 punpckhdq mm0,mm5 ; mm0=(05 15)=data5
120
121 movq mm3,mm4
122 movq mm5,mm1
123 pfadd mm4,mm2 ; mm4=data3+data4=tmp3
124 pfadd mm1,mm0 ; mm1=data2+data5=tmp2
125 pfsub mm3,mm2 ; mm3=data3-data4=tmp4
126 pfsub mm5,mm0 ; mm5=data2-data5=tmp5
127
128 ; -- Even part
129
130 movq mm2,mm7
131 movq mm0,mm6
132 pfsub mm7,mm4 ; mm7=tmp13
133 pfsub mm6,mm1 ; mm6=tmp12
134 pfadd mm2,mm4 ; mm2=tmp10
135 pfadd mm0,mm1 ; mm0=tmp11
136
137 pfadd mm6,mm7 ; mm6=tmp12+tmp13
138 pfmul mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1
139
140 movq mm4,mm2
141 movq mm1,mm7
142 pfsub mm2,mm0 ; mm2=data4
143 pfsub mm7,mm6 ; mm7=data6
144 pfadd mm4,mm0 ; mm4=data0
145 pfadd mm1,mm6 ; mm1=data2
146 ; Even outputs of both rows go to the first row of the pair, so each 2x2 group ends up transposed (pass 2 expects this).
147 movq MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2
148 movq MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7
149 movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
150 movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1
151
152 ; -- Odd part
153
154 movq mm0, MMWORD [wk(0)] ; mm0=tmp6
155 movq mm6, MMWORD [wk(1)] ; mm6=tmp7
156
157 pfadd mm3,mm5 ; mm3=tmp10
158 pfadd mm5,mm0 ; mm5=tmp11
159 pfadd mm0,mm6 ; mm0=tmp12, mm6=tmp7
160
161 pfmul mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3
162
163 movq mm2,mm3 ; mm2=tmp10
164 pfsub mm3,mm0 ; mm3=tmp10-tmp12
165 pfmul mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5
166 pfmul mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
167 pfmul mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
168 pfadd mm2,mm3 ; mm2=z2
169 pfadd mm0,mm3 ; mm0=z4
170
171 movq mm7,mm6
172 pfsub mm6,mm5 ; mm6=z13
173 pfadd mm7,mm5 ; mm7=z11
174
175 movq mm4,mm6
176 movq mm1,mm7
177 pfsub mm6,mm2 ; mm6=data3
178 pfsub mm7,mm0 ; mm7=data7
179 pfadd mm4,mm2 ; mm4=data5
180 pfadd mm1,mm0 ; mm1=data1
181 ; Odd outputs of both rows go to the second row of the pair.
182 movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6
183 movq MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7
184 movq MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4
185 movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
186
187 add edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT ; step edx past the two rows just processed
188 dec ecx
189 jnz near .rowloop
190
191 ; ---- Pass 2: process columns.
192 ; Two columns per iteration; the loads below read the 2x2-transposed layout written by pass 1.
193 mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
194 mov ecx, DCTSIZE/2 ; iteration count: each pass through the loop consumes two columns
195 alignx 16,7
196 .columnloop:
197
198 movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
199 movq mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
200 movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
201 movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
202
203 ; mm0=(00 10), mm1=(01 11), mm2=(60 70), mm3=(61 71)
204
205 movq mm4,mm0 ; transpose coefficients
206 punpckldq mm0,mm1 ; mm0=(00 01)=data0
207 punpckhdq mm4,mm1 ; mm4=(10 11)=data1
208 movq mm5,mm2 ; transpose coefficients
209 punpckldq mm2,mm3 ; mm2=(60 61)=data6
210 punpckhdq mm5,mm3 ; mm5=(70 71)=data7
211
212 movq mm6,mm4
213 movq mm7,mm0
214 pfsub mm4,mm2 ; mm4=data1-data6=tmp6
215 pfsub mm0,mm5 ; mm0=data0-data7=tmp7
216 pfadd mm6,mm2 ; mm6=data1+data6=tmp1
217 pfadd mm7,mm5 ; mm7=data0+data7=tmp0
218
219 movq mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
220 movq mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
221 movq mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
222 movq mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
223
224 ; mm1=(20 30), mm3=(21 31), mm2=(40 50), mm5=(41 51)
225 ; Same spill as in pass 1: every mm register is occupied, so tmp6/tmp7 move to wk[].
226 movq MMWORD [wk(0)], mm4 ; wk(0)=tmp6
227 movq MMWORD [wk(1)], mm0 ; wk(1)=tmp7
228
229 movq mm4,mm1 ; transpose coefficients
230 punpckldq mm1,mm3 ; mm1=(20 21)=data2
231 punpckhdq mm4,mm3 ; mm4=(30 31)=data3
232 movq mm0,mm2 ; transpose coefficients
233 punpckldq mm2,mm5 ; mm2=(40 41)=data4
234 punpckhdq mm0,mm5 ; mm0=(50 51)=data5
235
236 movq mm3,mm4
237 movq mm5,mm1
238 pfadd mm4,mm2 ; mm4=data3+data4=tmp3
239 pfadd mm1,mm0 ; mm1=data2+data5=tmp2
240 pfsub mm3,mm2 ; mm3=data3-data4=tmp4
241 pfsub mm5,mm0 ; mm5=data2-data5=tmp5
242
243 ; -- Even part
244
245 movq mm2,mm7
246 movq mm0,mm6
247 pfsub mm7,mm4 ; mm7=tmp13
248 pfsub mm6,mm1 ; mm6=tmp12
249 pfadd mm2,mm4 ; mm2=tmp10
250 pfadd mm0,mm1 ; mm0=tmp11
251
252 pfadd mm6,mm7 ; mm6=tmp12+tmp13
253 pfmul mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1
254
255 movq mm4,mm2
256 movq mm1,mm7
257 pfsub mm2,mm0 ; mm2=data4
258 pfsub mm7,mm6 ; mm7=data6
259 pfadd mm4,mm0 ; mm4=data0
260 pfadd mm1,mm6 ; mm1=data2
261 ; Each result holds one output row for the two current columns and is stored straight to that row.
262 movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2
263 movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7
264 movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
265 movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1
266
267 ; -- Odd part
268
269 movq mm0, MMWORD [wk(0)] ; mm0=tmp6
270 movq mm6, MMWORD [wk(1)] ; mm6=tmp7
271
272 pfadd mm3,mm5 ; mm3=tmp10
273 pfadd mm5,mm0 ; mm5=tmp11
274 pfadd mm0,mm6 ; mm0=tmp12, mm6=tmp7
275
276 pfmul mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3
277
278 movq mm2,mm3 ; mm2=tmp10
279 pfsub mm3,mm0 ; mm3=tmp10-tmp12
280 pfmul mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5
281 pfmul mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
282 pfmul mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
283 pfadd mm2,mm3 ; mm2=z2
284 pfadd mm0,mm3 ; mm0=z4
285
286 movq mm7,mm6
287 pfsub mm6,mm5 ; mm6=z13
288 pfadd mm7,mm5 ; mm7=z11
289
290 movq mm4,mm6
291 movq mm1,mm7
292 pfsub mm6,mm2 ; mm6=data3
293 pfsub mm7,mm0 ; mm7=data7
294 pfadd mm4,mm2 ; mm4=data5
295 pfadd mm1,mm0 ; mm1=data1
296
297 movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6
298 movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7
299 movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4
300 movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
301
302 add edx, byte 2*SIZEOF_FAST_FLOAT ; step edx to the next pair of columns
303 dec ecx
304 jnz near .columnloop
305
306 femms ; empty MMX/3DNow! state
307
308 ; pop edi ; unused
309 ; pop esi ; unused
310 ; pop edx ; need not be preserved
311 ; pop ecx ; need not be preserved
312 poppic ebx ; counterpart of pushpic above (macro from an include)
313 mov esp,ebp ; esp <- aligned ebp
314 pop esp ; esp <- original ebp (the pre-alignment pointer stored at [ebp] by the prologue)
315 pop ebp ; restore the caller's ebp saved by the first push
316 ret
317
318 ; For some reason, the OS X linker does not honor the request to align the
319 ; segment unless we do this.
320 align 16
OLDNEW
« no previous file with comments | « third_party/libjpeg_turbo/simd/jdsamss2-64.asm ('k') | third_party/libjpeg_turbo/simd/jfmmxfst.asm » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698