Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1306)

Side by Side Diff: simd/jiss2flt-64.asm

Issue 1939823002: Update to libjpeg_turbo 1.4.90 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libjpeg_turbo.git@master
Patch Set: Response to comments Created 4 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 ;
2 ; jiss2flt-64.asm - floating-point IDCT (64-bit SSE & SSE2)
3 ;
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5 ; Copyright 2009 D. R. Commander
6 ;
7 ; Based on
8 ; x86 SIMD extension for IJG JPEG library
9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
11 ;
12 ; This file should be assembled with NASM (Netwide Assembler),
13 ; can *not* be assembled with Microsoft's MASM or any compatible
14 ; assembler (including Borland's Turbo Assembler).
15 ; NASM is available from http://nasm.sourceforge.net/ or
16 ; http://sourceforge.net/project/showfiles.php?group_id=6208
17 ;
18 ; This file contains a floating-point implementation of the inverse DCT
19 ; (Discrete Cosine Transform). The following code is based directly on
20 ; the IJG's original jidctflt.c; see the jidctflt.c for more details.
21 ;
22 ; [TAB8]
23
24 %include "jsimdext.inc"
25 %include "jdct.inc"
26
27 ; --------------------------------------------------------------------------
28
; unpcklps2 dst, src: gather the low two floats of each operand into dst.
29 %macro unpcklps2 2 ; in: %1=(0 1 2 3), %2=(4 5 6 7) -> out: %1=(0 1 4 5)
30 shufps %1, %2, 01000100b ; lane selectors, low to high: %1[0], %1[1], %2[0], %2[1]
31 %endmacro
32
; unpckhps2 dst, src: gather the high two floats of each operand into dst.
33 %macro unpckhps2 2 ; in: %1=(0 1 2 3), %2=(4 5 6 7) -> out: %1=(2 3 6 7)
34 shufps %1, %2, 11101110b ; lane selectors, low to high: %1[2], %1[3], %2[2], %2[3]
35 %endmacro
36
37 ; --------------------------------------------------------------------------
; Constant table for the floating-point IDCT below.
; Each PD_* entry is one single-precision value repeated four times
; ("times 4 dd") so it can be used directly as a packed memory operand of
; mulps/addps. The table is preceded by "alignz 16" (a macro defined
; outside this file; presumably pads to a 16-byte boundary - TODO confirm),
; which matters because the code reads these entries with movaps/movdqa and
; as SSE memory operands.
38 SECTION SEG_CONST
39
40 alignz 16
41 global EXTN(jconst_idct_float_sse2) PRIVATE
42
43 EXTN(jconst_idct_float_sse2):
44
45 PD_1_414 times 4 dd 1.414213562373095048801689 ; sqrt(2)
46 PD_1_847 times 4 dd 1.847759065022573512256366 ; 2*cos(pi/8); scales z10+z12 to form z5
47 PD_1_082 times 4 dd 1.082392200292393968799446 ; 2*(cos(pi/8)-cos(3*pi/8)); multiplies z12
48 PD_M2_613 times 4 dd -2.613125929752753055713286 ; -2*(cos(pi/8)+cos(3*pi/8)); multiplies z10
; 100663296.0 = 1.5 * 2^26. At this magnitude one single-precision ulp is 8,
; so adding it to a value leaves round(value/8) in the low mantissa bits;
; pass 2 then keeps only the low 16 bits of each dword.
49 PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3)
50 PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; byte-wise bias added (paddb) after packing to bytes
51
52 alignz 16
53
54 ; --------------------------------------------------------------------------
55 SECTION SEG_TEXT
56 BITS 64
57 ;
58 ; Perform dequantization and inverse DCT on one block of coefficients.
59 ;
60 ; GLOBAL(void)
61 ; jsimd_idct_float_sse2 (void * dct_table, JCOEFPTR coef_block,
62 ; JSAMPARRAY output_buf, JDIMENSION output_col)
63 ;
64
65 ; r10 = void * dct_table
66 ; r11 = JCOEFPTR coef_block
67 ; r12 = JSAMPARRAY output_buf
68 ; r13 = JDIMENSION output_col
69
; Stack-frame layout used by jsimd_idct_float_sse2, expressed relative to the
; aligned rbp set up in its prologue (addresses grow upward):
;   [rbp+0]      value stored by "mov [rsp],rax" in the prologue
;                (the stack pointer as it was before alignment)
;   wk(0..1)     WK_NUM xmmword scratch slots directly below rbp
;   workspace    DCTSIZE2 FAST_FLOAT entries directly below wk(0)
; NASM expands %define bodies at the point of use, so wk(i) may refer to
; WK_NUM even though WK_NUM is defined on a later line.
70 %define original_rbp rbp+0
71 %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
72 %define WK_NUM 2
73 %define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
74 ; FAST_FLOAT workspace[DCTSIZE2]
75
76 align 16
77 global EXTN(jsimd_idct_float_sse2) PRIVATE
78
; jsimd_idct_float_sse2
;
; Dequantizes one block of DCT coefficients, runs a floating-point inverse
; DCT over it in two passes, and writes the result as byte samples.
;
; Pass 1 walks the input four columns per iteration (DCTSIZE/4 iterations),
;        multiplies by the quantization table, performs the 1-D IDCT and
;        stores the transposed result into the on-stack workspace.
; Pass 2 walks the workspace four rows per iteration, performs the second
;        1-D IDCT, rounds/descales, packs to bytes, adds CENTERJSAMPLE and
;        stores 8 bytes to each of four output rows at column output_col.
;
; Arguments are fetched by collect_args (a macro defined outside this file);
; afterwards, per the register notes preceding this function:
;   r10 = dct_table, r11 = coef_block, r12 = output_buf, r13 = output_col
; rbx is used as a scratch row pointer in pass 2 and is saved/restored here.
; wk(0)/wk(1) are two xmmword spill slots; they are reused several times
; within each loop iteration, so the order of loads/stores to them matters.
79 EXTN(jsimd_idct_float_sse2):
80 push rbp
81 mov rax,rsp ; rax = rsp after the push (points at the saved rbp)
82 sub rsp, byte 4
83 and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
84 mov [rsp],rax ; keep the pre-alignment rsp at [original_rbp]
85 mov rbp,rsp ; rbp = aligned rbp
86 lea rsp, [workspace] ; move rsp below wk[] and the workspace
87 collect_args
88 push rbx
89
90 ; ---- Pass 1: process columns from input, store into work array.
91
92 mov rdx, r10 ; quantptr
93 mov rsi, r11 ; inptr
94 lea rdi, [workspace] ; FAST_FLOAT * wsptr
95 mov rcx, DCTSIZE/4 ; ctr
96 .columnloop:
97 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
; Shortcut: if every AC coefficient of the four columns handled in this
; iteration is zero, the IDCT output is just the dequantized DC term.
; First a cheap scalar test on one dword each from rows 1 and 2 ...
98 mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
99 or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
100 jnz near .columnDCT
101
; ... then OR together the 64-bit groups of rows 1..7.
102 movq xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
103 movq xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
104 movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
105 movq xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
106 movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
107 movq xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
108 movq xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
109 por xmm1,xmm2
110 por xmm3,xmm4
111 por xmm5,xmm6
112 por xmm1,xmm3
113 por xmm5,xmm7
114 por xmm1,xmm5
115 packsswb xmm1,xmm1 ; saturating pack: a nonzero word stays a nonzero byte
116 movd eax,xmm1 ; low 4 bytes = the four OR-ed words; upper half of rax is zeroed
117 test rax,rax
118 jnz short .columnDCT
119
120 ; -- AC terms all zero
121
122 movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
123
124 punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
125 psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
126 cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03)
127
128 mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
129
130 movaps xmm1,xmm0
131 movaps xmm2,xmm0
132 movaps xmm3,xmm0
133
134 shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00)
135 shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01)
136 shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02)
137 shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03)
138
; Each broadcast DC value is written to both xmmword halves of its workspace row.
139 movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
140 movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
141 movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
142 movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
143 movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
144 movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
145 movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
146 movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
147 jmp near .nextcolumn
148 %endif
149 .columnDCT:
150
151 ; -- Even part
152
153 movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
154 movq xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
155 movq xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
156 movq xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
157
; punpcklwd x,x + arithmetic right shift sign-extends each word to a dword
; before the int->float conversion.
158 punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
159 punpcklwd xmm1,xmm1 ; xmm1=(20 20 21 21 22 22 23 23)
160 psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
161 psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23)
162 cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03)
163 cvtdq2ps xmm1,xmm1 ; xmm1=in2=(20 21 22 23)
164
165 punpcklwd xmm2,xmm2 ; xmm2=(40 40 41 41 42 42 43 43)
166 punpcklwd xmm3,xmm3 ; xmm3=(60 60 61 61 62 62 63 63)
167 psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43)
168 psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63)
169 cvtdq2ps xmm2,xmm2 ; xmm2=in4=(40 41 42 43)
170 cvtdq2ps xmm3,xmm3 ; xmm3=in6=(60 61 62 63)
171
; Dequantize: multiply by the matching rows of the quantization table.
172 mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
173 mulps xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
174 mulps xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
175 mulps xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
176
177 movaps xmm4,xmm0
178 movaps xmm5,xmm1
179 subps xmm0,xmm2 ; xmm0=tmp11
180 subps xmm1,xmm3
181 addps xmm4,xmm2 ; xmm4=tmp10
182 addps xmm5,xmm3 ; xmm5=tmp13
183
184 mulps xmm1,[rel PD_1_414]
185 subps xmm1,xmm5 ; xmm1=tmp12
186
187 movaps xmm6,xmm4
188 movaps xmm7,xmm0
189 subps xmm4,xmm5 ; xmm4=tmp3
190 subps xmm0,xmm1 ; xmm0=tmp2
191 addps xmm6,xmm5 ; xmm6=tmp0
192 addps xmm7,xmm1 ; xmm7=tmp1
193
; Spill tmp2/tmp3: all eight xmm registers are needed for the odd part.
194 movaps XMMWORD [wk(1)], xmm4 ; tmp3
195 movaps XMMWORD [wk(0)], xmm0 ; tmp2
196
197 ; -- Odd part
198
199 movq xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
200 movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
201 movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
202 movq xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
203
204 punpcklwd xmm2,xmm2 ; xmm2=(10 10 11 11 12 12 13 13)
205 punpcklwd xmm3,xmm3 ; xmm3=(30 30 31 31 32 32 33 33)
206 psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13)
207 psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33)
208 cvtdq2ps xmm2,xmm2 ; xmm2=in1=(10 11 12 13)
209 cvtdq2ps xmm3,xmm3 ; xmm3=in3=(30 31 32 33)
210
211 punpcklwd xmm5,xmm5 ; xmm5=(50 50 51 51 52 52 53 53)
212 punpcklwd xmm1,xmm1 ; xmm1=(70 70 71 71 72 72 73 73)
213 psrad xmm5,(DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53)
214 psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73)
215 cvtdq2ps xmm5,xmm5 ; xmm5=in5=(50 51 52 53)
216 cvtdq2ps xmm1,xmm1 ; xmm1=in7=(70 71 72 73)
217
218 mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
219 mulps xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
220 mulps xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
221 mulps xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
222
223 movaps xmm4,xmm2
224 movaps xmm0,xmm5
225 addps xmm2,xmm1 ; xmm2=z11
226 addps xmm5,xmm3 ; xmm5=z13
227 subps xmm4,xmm1 ; xmm4=z12
228 subps xmm0,xmm3 ; xmm0=z10
229
230 movaps xmm1,xmm2
231 subps xmm2,xmm5
232 addps xmm1,xmm5 ; xmm1=tmp7
233
234 mulps xmm2,[rel PD_1_414] ; xmm2=tmp11
235
236 movaps xmm3,xmm0
237 addps xmm0,xmm4
238 mulps xmm0,[rel PD_1_847] ; xmm0=z5
239 mulps xmm3,[rel PD_M2_613] ; xmm3=(z10 * -2.613125930)
240 mulps xmm4,[rel PD_1_082] ; xmm4=(z12 * 1.082392200)
241 addps xmm3,xmm0 ; xmm3=tmp12
242 subps xmm4,xmm0 ; xmm4=tmp10
243
244 ; -- Final output stage
245
246 subps xmm3,xmm1 ; xmm3=tmp6
247 movaps xmm5,xmm6
248 movaps xmm0,xmm7
249 addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03)
250 addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13)
251 subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73)
252 subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63)
253 subps xmm2,xmm3 ; xmm2=tmp5
254
255 movaps xmm1,xmm6 ; transpose coefficients(phase 1)
256 unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11)
257 unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13)
258 movaps xmm3,xmm0 ; transpose coefficients(phase 1)
259 unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71)
260 unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73)
261
; Reload tmp2/tmp3 first, then reuse the same wk slots for the
; half-transposed data6/data7 pairs.
262 movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
263 movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
264
265 movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
266 movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
267
268 addps xmm4,xmm2 ; xmm4=tmp4
269 movaps xmm0,xmm7
270 movaps xmm3,xmm5
271 addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23)
272 addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43)
273 subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53)
274 subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33)
275
276 movaps xmm2,xmm7 ; transpose coefficients(phase 1)
277 unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31)
278 unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33)
279 movaps xmm4,xmm5 ; transpose coefficients(phase 1)
280 unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51)
281 unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53)
282
283 movaps xmm3,xmm6 ; transpose coefficients(phase 2)
284 unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30)
285 unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31)
286 movaps xmm0,xmm1 ; transpose coefficients(phase 2)
287 unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32)
288 unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33)
289
290 movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
291 movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
292
293 movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
294 movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
295 movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
296 movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
297
298 movaps xmm6,xmm5 ; transpose coefficients(phase 2)
299 unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70)
300 unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71)
301 movaps xmm3,xmm4 ; transpose coefficients(phase 2)
302 unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72)
303 unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73)
304
305 movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
306 movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
307 movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
308 movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
309
310 .nextcolumn:
311 add rsi, byte 4*SIZEOF_JCOEF ; coef_block
312 add rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr
313 add rdi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
314 dec rcx ; ctr
315 jnz near .columnloop
316
317 ; -- Prefetch the next coefficient block
318
; rsi has advanced by (DCTSIZE/4)*4 coefficients during pass 1; adding
; (DCTSIZE2-8) more presumably lands just past the current block
; (assumes DCTSIZE is 8 - TODO confirm). Four 32-byte steps are touched.
319 prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
320 prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
321 prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
322 prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
323
324 ; ---- Pass 2: process rows from work array, store into output array.
325
; NOTE(review): the value loaded into rax on the next line is never read;
; "mov eax, r13d" below overwrites all of rax first. The load looks dead.
326 mov rax, [original_rbp]
327 lea rsi, [workspace] ; FAST_FLOAT * wsptr
328 mov rdi, r12 ; (JSAMPROW *)
329 mov eax, r13d ; rax = output_col (32-bit write zero-extends)
330 mov rcx, DCTSIZE/4 ; ctr
331 .rowloop:
332
333 ; -- Even part
334
335 movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
336 movaps xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
337 movaps xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
338 movaps xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
339
340 movaps xmm4,xmm0
341 movaps xmm5,xmm1
342 subps xmm0,xmm2 ; xmm0=tmp11
343 subps xmm1,xmm3
344 addps xmm4,xmm2 ; xmm4=tmp10
345 addps xmm5,xmm3 ; xmm5=tmp13
346
347 mulps xmm1,[rel PD_1_414]
348 subps xmm1,xmm5 ; xmm1=tmp12
349
350 movaps xmm6,xmm4
351 movaps xmm7,xmm0
352 subps xmm4,xmm5 ; xmm4=tmp3
353 subps xmm0,xmm1 ; xmm0=tmp2
354 addps xmm6,xmm5 ; xmm6=tmp0
355 addps xmm7,xmm1 ; xmm7=tmp1
356
357 movaps XMMWORD [wk(1)], xmm4 ; tmp3
358 movaps XMMWORD [wk(0)], xmm0 ; tmp2
359
360 ; -- Odd part
361
362 movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
363 movaps xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
364 movaps xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
365 movaps xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
366
367 movaps xmm4,xmm2
368 movaps xmm0,xmm5
369 addps xmm2,xmm1 ; xmm2=z11
370 addps xmm5,xmm3 ; xmm5=z13
371 subps xmm4,xmm1 ; xmm4=z12
372 subps xmm0,xmm3 ; xmm0=z10
373
374 movaps xmm1,xmm2
375 subps xmm2,xmm5
376 addps xmm1,xmm5 ; xmm1=tmp7
377
378 mulps xmm2,[rel PD_1_414] ; xmm2=tmp11
379
380 movaps xmm3,xmm0
381 addps xmm0,xmm4
382 mulps xmm0,[rel PD_1_847] ; xmm0=z5
383 mulps xmm3,[rel PD_M2_613] ; xmm3=(z10 * -2.613125930)
384 mulps xmm4,[rel PD_1_082] ; xmm4=(z12 * 1.082392200)
385 addps xmm3,xmm0 ; xmm3=tmp12
386 subps xmm4,xmm0 ; xmm4=tmp10
387
388 ; -- Final output stage
389
390 subps xmm3,xmm1 ; xmm3=tmp6
391 movaps xmm5,xmm6
392 movaps xmm0,xmm7
393 addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30)
394 addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31)
395 subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37)
396 subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36)
397 subps xmm2,xmm3 ; xmm2=tmp5
398
; Round and descale by adding the magic constant: afterwards the low word
; of every dword holds the rounded value. Even-numbered results keep their
; low word (pand with 0x0000FFFF), odd-numbered results are shifted into
; the high word, and the two are OR-ed into interleaved 16-bit lanes.
399 movaps xmm1,[rel PD_RNDINT_MAGIC] ; xmm1=[rel PD_RNDINT_MAGIC]
400 pcmpeqd xmm3,xmm3
401 psrld xmm3,WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
402
403 addps xmm6,xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
404 addps xmm7,xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
405 addps xmm0,xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
406 addps xmm5,xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
407
408 pand xmm6,xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --)
409 pslld xmm7,WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31)
410 pand xmm0,xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --)
411 pslld xmm5,WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37)
412 por xmm6,xmm7 ; xmm6=(00 01 10 11 20 21 30 31)
413 por xmm0,xmm5 ; xmm0=(06 07 16 17 26 27 36 37)
414
415 movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2
416 movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3
417
418 addps xmm4,xmm2 ; xmm4=tmp4
419 movaps xmm7,xmm1
420 movaps xmm5,xmm3
421 addps xmm1,xmm2 ; xmm1=data2=(02 12 22 32)
422 addps xmm3,xmm4 ; xmm3=data4=(04 14 24 34)
423 subps xmm7,xmm2 ; xmm7=data5=(05 15 25 35)
424 subps xmm5,xmm4 ; xmm5=data3=(03 13 23 33)
425
426 movaps xmm2,[rel PD_RNDINT_MAGIC] ; xmm2=[rel PD_RNDINT_MAGIC]
427 pcmpeqd xmm4,xmm4
428 psrld xmm4,WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
429
430 addps xmm3,xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
431 addps xmm7,xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
432 addps xmm1,xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
433 addps xmm5,xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
434
435 pand xmm3,xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --)
436 pslld xmm7,WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35)
437 pand xmm1,xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --)
438 pslld xmm5,WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33)
439 por xmm3,xmm7 ; xmm3=(04 05 14 15 24 25 34 35)
440 por xmm1,xmm5 ; xmm1=(02 03 12 13 22 23 32 33)
441
442 movdqa xmm2,[rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP]
443
; Saturate the words to signed bytes, then add CENTERJSAMPLE to every byte.
444 packsswb xmm6,xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
445 packsswb xmm1,xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
446 paddb xmm6,xmm2
447 paddb xmm1,xmm2
448
449 movdqa xmm4,xmm6 ; transpose coefficients(phase 2)
450 punpcklwd xmm6,xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
451 punpckhwd xmm4,xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
452
453 movdqa xmm7,xmm6 ; transpose coefficients(phase 3)
454 punpckldq xmm6,xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
455 punpckhdq xmm7,xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
456
; Swap the 64-bit halves so rows 1 and 3 sit in the low qword for movq.
457 pshufd xmm5,xmm6,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
458 pshufd xmm3,xmm7,0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
459
; Store 8 bytes to each of four output rows: row pointers come from the
; array at rdi, and rax (output_col) is the offset within each row.
460 mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
461 mov rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
462 movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
463 movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
464 mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
465 mov rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
466 movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
467 movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
468
469 add rsi, byte 4*SIZEOF_FAST_FLOAT ; wsptr
470 add rdi, byte 4*SIZEOF_JSAMPROW ; advance to the next four output row pointers
471 dec rcx ; ctr
472 jnz near .rowloop
473
; Epilogue: undo the prologue in reverse order.
474 pop rbx
475 uncollect_args
476 mov rsp,rbp ; rsp <- aligned rbp
477 pop rsp ; rsp <- value saved at [original_rbp] (pre-alignment rsp)
478 pop rbp
479 ret
480
481 ; For some reason, the OS X linker does not honor the request to align the
482 ; segment unless we do this.
483 align 16
OLDNEW
« jdhuff.c ('K') | « simd/jiss2flt.asm ('k') | simd/jiss2fst.asm » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698