Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(140)

Side by Side Diff: simd/jidctflt-sse2.asm

Issue 1953443002: Update to libjpeg_turbo 1.4.90 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libjpeg_turbo.git@master
Patch Set: Created 4 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « simd/jidctflt-sse.asm ('k') | simd/jidctflt-sse2-64.asm » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 ;
2 ; jidctflt.asm - floating-point IDCT (SSE & SSE2)
3 ;
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5 ;
6 ; Based on
7 ; x86 SIMD extension for IJG JPEG library
8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
10 ;
11 ; This file should be assembled with NASM (Netwide Assembler); it
12 ; can *not* be assembled with Microsoft's MASM or any compatible
13 ; assembler (including Borland's Turbo Assembler).
14 ; NASM is available from http://nasm.sourceforge.net/ or
15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
16 ;
17 ; This file contains a floating-point implementation of the inverse DCT
18 ; (Discrete Cosine Transform). The following code is based directly on
19 ; the IJG's original jidctflt.c; see jidctflt.c for more details.
20 ;
21 ; [TAB8]
22
23 %include "jsimdext.inc"
24 %include "jdct.inc"
25
26 ; --------------------------------------------------------------------------
27
28 %macro unpcklps2 2 ; combine the low float pairs: %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
29 shufps %1,%2,0x44 ; imm 0x44 picks %1[0],%1[1] for the low half and %2[0],%2[1] for the high half
30 %endmacro
31
32 %macro unpckhps2 2 ; combine the high float pairs: %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
33 shufps %1,%2,0xEE ; imm 0xEE picks %1[2],%1[3] for the low half and %2[2],%2[3] for the high half
34 %endmacro
35
36 ; --------------------------------------------------------------------------
37 SECTION SEG_CONST
38 ; Constant table for jsimd_idct_float_sse2; the code reads each entry via GOTOFF(ebx,<label>).
39 alignz 16
40 global EXTN(jconst_idct_float_sse2)
41
42 EXTN(jconst_idct_float_sse2):
43 ; PD_* entries are four identical single-precision floats; PB_* is sixteen identical bytes.
44 PD_1_414 times 4 dd 1.414213562373095048801689 ; sqrt(2)
45 PD_1_847 times 4 dd 1.847759065022573512256366 ; 2*cos(pi/8)
46 PD_1_082 times 4 dd 1.082392200292393968799446 ; 2*(cos(pi/8)-cos(3*pi/8))
47 PD_M2_613 times 4 dd -2.613125929752753055713286 ; -2*(cos(pi/8)+cos(3*pi/8))
48 PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3) = 1.5 * 2^26; float spacing at this magnitude is 8, so x + magic leaves roundint(x/8) in the low mantissa bits
49 PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; added bytewise (paddb) to the packed results in pass 2
50
51 alignz 16
52
53 ; --------------------------------------------------------------------------
54 SECTION SEG_TEXT
55 BITS 32
56 ;
57 ; Perform dequantization and inverse DCT on one block of coefficients.
58 ; Coefficients are read from coef_block, multiplied by the float table at dct_table, and the results are stored 8 bytes per row at output_buf[row] + output_col.
59 ; GLOBAL(void)
60 ; jsimd_idct_float_sse2 (void *dct_table, JCOEFPTR coef_block,
61 ; JSAMPARRAY output_buf, JDIMENSION output_col)
62 ; All four arguments are taken from the stack; no value is returned.
63 ; Argument offsets from b = esp right after `push ebp` (b+0 = saved ebp, b+4 = return address):
64 %define dct_table(b) (b)+8 ; void *dct_table
65 %define coef_block(b) (b)+12 ; JCOEFPTR coef_block
66 %define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
67 %define output_col(b) (b)+20 ; JDIMENSION output_col
68 ; Locals, addressed from the 128-bit-aligned ebp that the prologue sets up:
69 %define original_ebp ebp+0 ; slot holding the pre-alignment stack pointer (base for the argument offsets)
70 %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
71 %define WK_NUM 2 ; number of xmmword scratch slots in wk[]
72 %define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
73 ; FAST_FLOAT workspace[DCTSIZE2], located directly below wk(0)
74
75 align 16
76 global EXTN(jsimd_idct_float_sse2)
77 ; Prologue: build a 128-bit-aligned frame below the caller's stack; [ebp] keeps the unaligned pointer so the arguments stay reachable.
78 EXTN(jsimd_idct_float_sse2):
79 push ebp
80 mov eax,esp ; eax = original ebp (esp right after the push; arguments start at eax+8)
81 sub esp, byte 4 ; guarantees the aligned slot written below lies under the saved ebp
82 and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
83 mov [esp],eax ; stash original ebp in the aligned slot
84 mov ebp,esp ; ebp = aligned ebp
85 lea esp, [workspace] ; esp = start of workspace[]; reserves wk[] and workspace[]
86 push ebx
87 ; push ecx ; need not be preserved
88 ; push edx ; need not be preserved
89 push esi
90 push edi
91
92 get_GOT ebx ; get GOT address
93
94 ; ---- Pass 1: process columns from input, store into work array.
95 ; Each iteration handles four columns (ctr = DCTSIZE/4) and stores the float results transposed into the workspace.
96 ; mov eax, [original_ebp] ; (disabled: presumably eax still holds original ebp from the prologue -- confirm get_GOT leaves eax intact)
97 mov edx, POINTER [dct_table(eax)] ; quantptr
98 mov esi, JCOEFPTR [coef_block(eax)] ; inptr
99 lea edi, [workspace] ; FAST_FLOAT *wsptr
100 mov ecx, DCTSIZE/4 ; ctr
101 alignx 16,7
102 .columnloop:
103 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
104 mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] ; cheap early-out: DWBLOCK(1,0) | DWBLOCK(2,0)
105 or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
106 jnz near .columnDCT ; nonzero -> skip the full zero test and run the IDCT
107 ; Full test: OR rows 1..7 of these four columns; an all-zero result means they carry no AC terms.
108 movq xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
109 movq xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
110 movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
111 movq xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
112 movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
113 movq xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
114 movq xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
115 por xmm1,xmm2
116 por xmm3,xmm4
117 por xmm5,xmm6
118 por xmm1,xmm3
119 por xmm5,xmm7
120 por xmm1,xmm5
121 packsswb xmm1,xmm1 ; signed saturation keeps any nonzero word nonzero, so the 4 words fit in one dword
122 movd eax,xmm1
123 test eax,eax
124 jnz short .columnDCT
125
126 ; -- AC terms all zero
127
128 movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
129
130 punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
131 psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
132 cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03)
133
134 mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] ; dequantize the DC row
135
136 movaps xmm1,xmm0
137 movaps xmm2,xmm0
138 movaps xmm3,xmm0
139
140 shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00)
141 shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01)
142 shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02)
143 shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03)
144 ; With no AC terms all 8 outputs of a column equal its dequantized DC, so each broadcast fills both halves of that column's slot.
145 movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
146 movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
147 movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
148 movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
149 movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
150 movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
151 movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
152 movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
153 jmp near .nextcolumn
154 alignx 16,7
155 %endif
156 .columnDCT:
157
158 ; -- Even part
159 ; Load rows 0/2/4/6, widen each word to a dword (punpcklwd + arithmetic shift), convert to float, then dequantize.
160 movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
161 movq xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
162 movq xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
163 movq xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
164
165 punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
166 punpcklwd xmm1,xmm1 ; xmm1=(20 20 21 21 22 22 23 23)
167 psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
168 psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23)
169 cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03)
170 cvtdq2ps xmm1,xmm1 ; xmm1=in2=(20 21 22 23)
171
172 punpcklwd xmm2,xmm2 ; xmm2=(40 40 41 41 42 42 43 43)
173 punpcklwd xmm3,xmm3 ; xmm3=(60 60 61 61 62 62 63 63)
174 psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43)
175 psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63)
176 cvtdq2ps xmm2,xmm2 ; xmm2=in4=(40 41 42 43)
177 cvtdq2ps xmm3,xmm3 ; xmm3=in6=(60 61 62 63)
178
179 mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
180 mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
181 mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
182 mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
183
184 movaps xmm4,xmm0
185 movaps xmm5,xmm1
186 subps xmm0,xmm2 ; xmm0=tmp11
187 subps xmm1,xmm3 ; xmm1=in2-in6
188 addps xmm4,xmm2 ; xmm4=tmp10
189 addps xmm5,xmm3 ; xmm5=tmp13
190
191 mulps xmm1,[GOTOFF(ebx,PD_1_414)] ; xmm1=(in2-in6)*1.414213562
192 subps xmm1,xmm5 ; xmm1=tmp12
193
194 movaps xmm6,xmm4
195 movaps xmm7,xmm0
196 subps xmm4,xmm5 ; xmm4=tmp3
197 subps xmm0,xmm1 ; xmm0=tmp2
198 addps xmm6,xmm5 ; xmm6=tmp0
199 addps xmm7,xmm1 ; xmm7=tmp1
200
201 movaps XMMWORD [wk(1)], xmm4 ; tmp3
202 movaps XMMWORD [wk(0)], xmm0 ; tmp2
203
204 ; -- Odd part
205 ; Same load / widen / convert / dequantize sequence for rows 1/3/5/7.
206 movq xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
207 movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
208 movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
209 movq xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
210
211 punpcklwd xmm2,xmm2 ; xmm2=(10 10 11 11 12 12 13 13)
212 punpcklwd xmm3,xmm3 ; xmm3=(30 30 31 31 32 32 33 33)
213 psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13)
214 psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33)
215 cvtdq2ps xmm2,xmm2 ; xmm2=in1=(10 11 12 13)
216 cvtdq2ps xmm3,xmm3 ; xmm3=in3=(30 31 32 33)
217
218 punpcklwd xmm5,xmm5 ; xmm5=(50 50 51 51 52 52 53 53)
219 punpcklwd xmm1,xmm1 ; xmm1=(70 70 71 71 72 72 73 73)
220 psrad xmm5,(DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53)
221 psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73)
222 cvtdq2ps xmm5,xmm5 ; xmm5=in5=(50 51 52 53)
223 cvtdq2ps xmm1,xmm1 ; xmm1=in7=(70 71 72 73)
224
225 mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
226 mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
227 mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
228 mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
229
230 movaps xmm4,xmm2
231 movaps xmm0,xmm5
232 addps xmm2,xmm1 ; xmm2=z11
233 addps xmm5,xmm3 ; xmm5=z13
234 subps xmm4,xmm1 ; xmm4=z12
235 subps xmm0,xmm3 ; xmm0=z10
236
237 movaps xmm1,xmm2
238 subps xmm2,xmm5 ; xmm2=z11-z13
239 addps xmm1,xmm5 ; xmm1=tmp7
240
241 mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
242
243 movaps xmm3,xmm0
244 addps xmm0,xmm4 ; xmm0=z10+z12
245 mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5
246 mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
247 mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
248 addps xmm3,xmm0 ; xmm3=tmp12
249 subps xmm4,xmm0 ; xmm4=tmp10
250
251 ; -- Final output stage
252
253 subps xmm3,xmm1 ; xmm3=tmp6
254 movaps xmm5,xmm6
255 movaps xmm0,xmm7
256 addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03)
257 addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13)
258 subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73)
259 subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63)
260 subps xmm2,xmm3 ; xmm2=tmp5
261 ; Transpose the 4-float rows: unpcklps/unpckhps do phase 1, the shufps-based macros do phase 2; wk[] is reused as spill space.
262 movaps xmm1,xmm6 ; transpose coefficients(phase 1)
263 unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11)
264 unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13)
265 movaps xmm3,xmm0 ; transpose coefficients(phase 1)
266 unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71)
267 unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73)
268
269 movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
270 movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
271
272 movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
273 movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
274
275 addps xmm4,xmm2 ; xmm4=tmp4
276 movaps xmm0,xmm7
277 movaps xmm3,xmm5
278 addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23)
279 addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43)
280 subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53)
281 subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33)
282
283 movaps xmm2,xmm7 ; transpose coefficients(phase 1)
284 unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31)
285 unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33)
286 movaps xmm4,xmm5 ; transpose coefficients(phase 1)
287 unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51)
288 unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53)
289
290 movaps xmm3,xmm6 ; transpose coefficients(phase 2)
291 unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30)
292 unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31)
293 movaps xmm0,xmm1 ; transpose coefficients(phase 2)
294 unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32)
295 unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33)
296
297 movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
298 movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
299
300 movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
301 movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
302 movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
303 movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
304
305 movaps xmm6,xmm5 ; transpose coefficients(phase 2)
306 unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70)
307 unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71)
308 movaps xmm3,xmm4 ; transpose coefficients(phase 2)
309 unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72)
310 unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73)
311
312 movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
313 movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
314 movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
315 movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
316
317 .nextcolumn:
318 add esi, byte 4*SIZEOF_JCOEF ; coef_block
319 add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr
320 add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
321 dec ecx ; ctr
322 jnz near .columnloop
323
324 ; -- Prefetch the next coefficient block
325 ; esi is now DCTSIZE coefficients past the block start (8 when DCTSIZE == 8), so these four addresses, 32 bytes apart, begin right after the current block.
326 prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
327 prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
328 prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
329 prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
330
331 ; ---- Pass 2: process rows from work array, store into output array.
332 ; Each iteration (ctr = DCTSIZE/4) turns four workspace columns into four output rows of 8 bytes each.
333 mov eax, [original_ebp] ; eax = original ebp (argument base)
334 lea esi, [workspace] ; FAST_FLOAT *wsptr
335 mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
336 mov eax, JDIMENSION [output_col(eax)] ; eax = output_col from here on
337 mov ecx, DCTSIZE/4 ; ctr
338 alignx 16,7
339 .rowloop:
340
341 ; -- Even part
342 ; The workspace already holds floats, so this pass loads them directly with no conversion or dequantization.
343 movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
344 movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
345 movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
346 movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
347
348 movaps xmm4,xmm0
349 movaps xmm5,xmm1
350 subps xmm0,xmm2 ; xmm0=tmp11
351 subps xmm1,xmm3 ; xmm1=in2-in6
352 addps xmm4,xmm2 ; xmm4=tmp10
353 addps xmm5,xmm3 ; xmm5=tmp13
354
355 mulps xmm1,[GOTOFF(ebx,PD_1_414)] ; xmm1=(in2-in6)*1.414213562
356 subps xmm1,xmm5 ; xmm1=tmp12
357
358 movaps xmm6,xmm4
359 movaps xmm7,xmm0
360 subps xmm4,xmm5 ; xmm4=tmp3
361 subps xmm0,xmm1 ; xmm0=tmp2
362 addps xmm6,xmm5 ; xmm6=tmp0
363 addps xmm7,xmm1 ; xmm7=tmp1
364
365 movaps XMMWORD [wk(1)], xmm4 ; tmp3
366 movaps XMMWORD [wk(0)], xmm0 ; tmp2
367
368 ; -- Odd part
369
370 movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
371 movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
372 movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
373 movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
374
375 movaps xmm4,xmm2
376 movaps xmm0,xmm5
377 addps xmm2,xmm1 ; xmm2=z11
378 addps xmm5,xmm3 ; xmm5=z13
379 subps xmm4,xmm1 ; xmm4=z12
380 subps xmm0,xmm3 ; xmm0=z10
381
382 movaps xmm1,xmm2
383 subps xmm2,xmm5 ; xmm2=z11-z13
384 addps xmm1,xmm5 ; xmm1=tmp7
385
386 mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
387
388 movaps xmm3,xmm0
389 addps xmm0,xmm4 ; xmm0=z10+z12
390 mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5
391 mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
392 mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
393 addps xmm3,xmm0 ; xmm3=tmp12
394 subps xmm4,xmm0 ; xmm4=tmp10
395
396 ; -- Final output stage
397
398 subps xmm3,xmm1 ; xmm3=tmp6
399 movaps xmm5,xmm6
400 movaps xmm0,xmm7
401 addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30)
402 addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31)
403 subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37)
404 subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36)
405 subps xmm2,xmm3 ; xmm2=tmp5
406 ; Descale and round: adding PD_RNDINT_MAGIC leaves roundint(x/8) in the low word of each dword; pand/pslld then merge two results per dword.
407 movaps xmm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm1=[PD_RNDINT_MAGIC]
408 pcmpeqd xmm3,xmm3
409 psrld xmm3,WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
410
411 addps xmm6,xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
412 addps xmm7,xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
413 addps xmm0,xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
414 addps xmm5,xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
415
416 pand xmm6,xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --)
417 pslld xmm7,WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31)
418 pand xmm0,xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --)
419 pslld xmm5,WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37)
420 por xmm6,xmm7 ; xmm6=(00 01 10 11 20 21 30 31)
421 por xmm0,xmm5 ; xmm0=(06 07 16 17 26 27 36 37)
422
423 movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2
424 movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3
425
426 addps xmm4,xmm2 ; xmm4=tmp4
427 movaps xmm7,xmm1
428 movaps xmm5,xmm3
429 addps xmm1,xmm2 ; xmm1=data2=(02 12 22 32)
430 addps xmm3,xmm4 ; xmm3=data4=(04 14 24 34)
431 subps xmm7,xmm2 ; xmm7=data5=(05 15 25 35)
432 subps xmm5,xmm4 ; xmm5=data3=(03 13 23 33)
433 ; Same descale/round/merge sequence for data2..data5.
434 movaps xmm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm2=[PD_RNDINT_MAGIC]
435 pcmpeqd xmm4,xmm4
436 psrld xmm4,WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
437
438 addps xmm3,xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
439 addps xmm7,xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
440 addps xmm1,xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
441 addps xmm5,xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
442
443 pand xmm3,xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --)
444 pslld xmm7,WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35)
445 pand xmm1,xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --)
446 pslld xmm5,WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33)
447 por xmm3,xmm7 ; xmm3=(04 05 14 15 24 25 34 35)
448 por xmm1,xmm5 ; xmm1=(02 03 12 13 22 23 32 33)
449
450 movdqa xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP]
451 ; Pack the words to bytes with signed saturation, then add CENTERJSAMPLE to every byte.
452 packsswb xmm6,xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
453 packsswb xmm1,xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
454 paddb xmm6,xmm2
455 paddb xmm1,xmm2
456
457 movdqa xmm4,xmm6 ; transpose coefficients(phase 2)
458 punpcklwd xmm6,xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
459 punpckhwd xmm4,xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
460
461 movdqa xmm7,xmm6 ; transpose coefficients(phase 3)
462 punpckldq xmm6,xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
463 punpckhdq xmm7,xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
464
465 pshufd xmm5,xmm6,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
466 pshufd xmm3,xmm7,0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
467 ; ebx is borrowed as a second row pointer for the stores, so the GOT address is saved around them.
468 pushpic ebx ; save GOT address
469
470 mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; edx = row pointer 0 of this group
471 mov ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] ; ebx = row pointer 2
472 movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 ; row 0 <- (00..07), low 8 bytes of xmm6
473 movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7 ; row 2 <- (20..27), low 8 bytes of xmm7
474 mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; edx = row pointer 1
475 mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] ; ebx = row pointer 3
476 movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5 ; row 1 <- (10..17), low 8 bytes of xmm5
477 movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3 ; row 3 <- (30..37), low 8 bytes of xmm3
478
479 poppic ebx ; restore GOT address
480
481 add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr
482 add edi, byte 4*SIZEOF_JSAMPROW
483 dec ecx ; ctr
484 jnz near .rowloop
485 ; Epilogue: pop the saved registers in reverse push order, then unwind the aligned frame.
486 pop edi
487 pop esi
488 ; pop edx ; need not be preserved
489 ; pop ecx ; need not be preserved
490 pop ebx
491 mov esp,ebp ; esp <- aligned ebp
492 pop esp ; esp <- original ebp
493 pop ebp ; restore the caller's ebp pushed on entry
494 ret
495
496 ; For some reason, the OS X linker does not honor the request to align the
497 ; segment unless we do this.
498 align 16
OLDNEW
« no previous file with comments | « simd/jidctflt-sse.asm ('k') | simd/jidctflt-sse2-64.asm » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698