Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(119)

Side by Side Diff: simd/jccolext-sse2-64.asm

Issue 1953443002: Update to libjpeg_turbo 1.4.90 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libjpeg_turbo.git@master
Patch Set: Created 4 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « simd/jccolext-sse2.asm ('k') | simd/jccolmmx.asm » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 ;
2 ; jccolext.asm - colorspace conversion (64-bit SSE2)
3 ;
4 ; x86 SIMD extension for IJG JPEG library
5 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
6 ; Copyright (C) 2009, D. R. Commander.
7 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
8 ;
9 ; This file should be assembled with NASM (Netwide Assembler),
10 ; can *not* be assembled with Microsoft's MASM or any compatible
11 ; assembler (including Borland's Turbo Assembler).
12 ; NASM is available from http://nasm.sourceforge.net/ or
13 ; http://sourceforge.net/project/showfiles.php?group_id=6208
14 ;
15 ; [TAB8]
16
17 %include "jcolsamp.inc"
18
19 ; --------------------------------------------------------------------------
20 ;
21 ; Convert some rows of samples to the output colorspace.
22 ;
23 ; GLOBAL(void)
24 ; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width,
25 ; JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
26 ; JDIMENSION output_row, int num_rows);
27 ;
28
29 ; r10 = JDIMENSION img_width
30 ; r11 = JSAMPARRAY input_buf
31 ; r12 = JSAMPIMAGE output_buf
32 ; r13 = JDIMENSION output_row
33 ; r14 = int num_rows
34
35 %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
36 %define WK_NUM 8
37
38 align 16
39
40 global EXTN(jsimd_rgb_ycc_convert_sse2)
41
42 EXTN(jsimd_rgb_ycc_convert_sse2):
43 push rbp
44 mov rax,rsp ; rax = rsp after the push (address of the saved rbp)
45 sub rsp, byte 4 ; step below the saved rbp before aligning down
46 and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
47 mov [rsp],rax ; keep the unaligned rsp at the aligned frame base
48 mov rbp,rsp ; rbp = aligned frame base
49 lea rsp, [wk(0)] ; reserve the wk[WK_NUM] scratch area below rbp
50 collect_args
51 push rbx
52
53 mov ecx, r10d ; ecx = img_width (32-bit write clears the upper half of rcx)
54 test rcx,rcx
55 jz near .return ; zero-width image: nothing to convert
56
57 push rcx ; park img_width while rcx holds output_row
58
59 mov rsi, r12 ; rsi = output_buf
60 mov ecx, r13d ; ecx = output_row
61 mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY] ; rdi = output_buf[0]
62 mov rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY] ; rbx = output_buf[1]
63 mov rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY] ; rdx = output_buf[2]
64 lea rdi, [rdi+rcx*SIZEOF_JSAMPROW] ; rdi = &output_buf[0][output_row]
65 lea rbx, [rbx+rcx*SIZEOF_JSAMPROW] ; rbx = &output_buf[1][output_row]
66 lea rdx, [rdx+rcx*SIZEOF_JSAMPROW] ; rdx = &output_buf[2][output_row]
67
68 pop rcx ; rcx = img_width again
69
70 mov rsi, r11 ; rsi = input_buf
71 mov eax, r14d ; eax = num_rows (zero-extended into rax)
72 test eax,eax ; num_rows is an int: test the 32-bit sign, not rax
73 jle near .return ; zero or negative row count: nothing to convert
74 .rowloop:
75 push rdx
76 push rbx
77 push rdi
78 push rsi
79 push rcx ; col
80
81 mov rsi, JSAMPROW [rsi] ; inptr
82 mov rdi, JSAMPROW [rdi] ; outptr0
83 mov rbx, JSAMPROW [rbx] ; outptr1
84 mov rdx, JSAMPROW [rdx] ; outptr2
85
86 cmp rcx, byte SIZEOF_XMMWORD
87 jae near .columnloop ; at least one full block of columns: take the fast path
88
89 %if RGB_PIXELSIZE == 3 ; ---------------
90
91 .column_ld1: ; fewer than SIZEOF_XMMWORD columns left: load the tail piecewise
92 push rax
93 push rdx
94 lea rcx,[rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE
95 test cl, SIZEOF_BYTE
96 jz short .column_ld2
97 sub rcx, byte SIZEOF_BYTE
98 movzx rax, BYTE [rsi+rcx]
99 .column_ld2:
100 test cl, SIZEOF_WORD
101 jz short .column_ld4
102 sub rcx, byte SIZEOF_WORD
103 movzx rdx, WORD [rsi+rcx]
104 shl rax, WORD_BIT
105 or rax,rdx
106 .column_ld4:
107 movd xmmA,eax
108 pop rdx
109 pop rax
110 test cl, SIZEOF_DWORD
111 jz short .column_ld8
112 sub rcx, byte SIZEOF_DWORD
113 movd xmmF, XMM_DWORD [rsi+rcx]
114 pslldq xmmA, SIZEOF_DWORD
115 por xmmA,xmmF
116 .column_ld8:
117 test cl, SIZEOF_MMWORD
118 jz short .column_ld16
119 sub rcx, byte SIZEOF_MMWORD
120 movq xmmB, XMM_MMWORD [rsi+rcx]
121 pslldq xmmA, SIZEOF_MMWORD
122 por xmmA,xmmB
123 .column_ld16:
124 test cl, SIZEOF_XMMWORD
125 jz short .column_ld32
126 movdqa xmmF,xmmA
127 movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
128 mov rcx, SIZEOF_XMMWORD ; count the tail as one full block so the column loop ends after it
129 jmp short .rgb_ycc_cnv
130 .column_ld32:
131 test cl, 2*SIZEOF_XMMWORD
132 mov rcx, SIZEOF_XMMWORD ; mov leaves the flags from the test above intact
133 jz short .rgb_ycc_cnv
134 movdqa xmmB,xmmA
135 movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
136 movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
137 jmp short .rgb_ycc_cnv
138
139 .columnloop:
140 movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
141 movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
142 movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
143
144 .rgb_ycc_cnv:
145 ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
146 ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
147 ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
148
149 movdqa xmmG,xmmA
150 pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
151 psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
152
153 punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
154 pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
155
156 punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
157 punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
158
159 movdqa xmmD,xmmA
160 pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
161 psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
162
163 punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
164 pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
165
166 punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
167 punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
168
169 movdqa xmmE,xmmA
170 pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
171 psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
172
173 punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
174 pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
175
176 punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
177 punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
178
179 pxor xmmH,xmmH ; xmmH = 0, used to widen bytes to words below
180
181 movdqa xmmC,xmmA
182 punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
183 punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
184
185 movdqa xmmB,xmmE
186 punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
187 punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
188
189 movdqa xmmF,xmmD
190 punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
191 punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
192
193 %else ; RGB_PIXELSIZE == 4 ; -----------
194
195 .column_ld1: ; fewer than SIZEOF_XMMWORD columns left: load the tail piecewise
196 test cl, SIZEOF_XMMWORD/16
197 jz short .column_ld2
198 sub rcx, byte SIZEOF_XMMWORD/16
199 movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
200 .column_ld2:
201 test cl, SIZEOF_XMMWORD/8
202 jz short .column_ld4
203 sub rcx, byte SIZEOF_XMMWORD/8
204 movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
205 pslldq xmmA, SIZEOF_MMWORD
206 por xmmA,xmmE
207 .column_ld4:
208 test cl, SIZEOF_XMMWORD/4
209 jz short .column_ld8
210 sub rcx, byte SIZEOF_XMMWORD/4
211 movdqa xmmE,xmmA
212 movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
213 .column_ld8:
214 test cl, SIZEOF_XMMWORD/2
215 mov rcx, SIZEOF_XMMWORD ; mov leaves the flags from the test above intact
216 jz short .rgb_ycc_cnv
217 movdqa xmmF,xmmA
218 movdqa xmmH,xmmE
219 movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
220 movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
221 jmp short .rgb_ycc_cnv
222
223 .columnloop:
224 movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
225 movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
226 movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
227 movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
228
229 .rgb_ycc_cnv:
230 ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
231 ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
232 ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
233 ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
234
235 movdqa xmmD,xmmA
236 punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
237 punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
238
239 movdqa xmmC,xmmF
240 punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
241 punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
242
243 movdqa xmmB,xmmA
244 punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
245 punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
246
247 movdqa xmmG,xmmD
248 punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
249 punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
250
251 movdqa xmmE,xmmA
252 punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
253 punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
254
255 movdqa xmmH,xmmB
256 punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
257 punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
258
259 pxor xmmF,xmmF ; xmmF = 0, used to widen bytes to words below
260
261 movdqa xmmC,xmmA
262 punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
263 punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
264
265 movdqa xmmD,xmmB
266 punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
267 punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
268
269 movdqa xmmG,xmmE
270 punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
271 punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
272
273 punpcklbw xmmF,xmmH ; xmmF is still zero: xmmH's low bytes land in the high byte of each word
274 punpckhbw xmmH,xmmH ; xmmH's high bytes duplicated into both bytes of each word
275 psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
276 psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
277
278 %endif ; RGB_PIXELSIZE ; ---------------
279
280 ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
281 ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
282
283 ; (Original)
284 ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
285 ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
286 ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
287 ;
288 ; (This implementation)
289 ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
290 ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
291 ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
292
293 movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE
294 movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO
295 movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE
296 movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO
297
298 movdqa xmm6,xmm1
299 punpcklwd xmm1,xmm3
300 punpckhwd xmm6,xmm3
301 movdqa xmm7,xmm1
302 movdqa xmm4,xmm6
303 pmaddwd xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
304 pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
305 pmaddwd xmm7,[rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
306 pmaddwd xmm4,[rel PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
307
308 movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
309 movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
310
311 pxor xmm1,xmm1
312 pxor xmm6,xmm6
313 punpcklwd xmm1,xmm5 ; xmm1=BOL
314 punpckhwd xmm6,xmm5 ; xmm6=BOH
315 psrld xmm1,1 ; xmm1=BOL*FIX(0.500)
316 psrld xmm6,1 ; xmm6=BOH*FIX(0.500)
317
318 movdqa xmm5,[rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ]
319
320 paddd xmm7,xmm1
321 paddd xmm4,xmm6
322 paddd xmm7,xmm5
323 paddd xmm4,xmm5
324 psrld xmm7,SCALEBITS ; xmm7=CbOL
325 psrld xmm4,SCALEBITS ; xmm4=CbOH
326 packssdw xmm7,xmm4 ; xmm7=CbO
327
328 movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE
329
330 movdqa xmm6,xmm0
331 punpcklwd xmm0,xmm2
332 punpckhwd xmm6,xmm2
333 movdqa xmm5,xmm0
334 movdqa xmm4,xmm6
335 pmaddwd xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
336 pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
337 pmaddwd xmm5,[rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
338 pmaddwd xmm4,[rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
339
340 movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
341 movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
342
343 pxor xmm0,xmm0
344 pxor xmm6,xmm6
345 punpcklwd xmm0,xmm1 ; xmm0=BEL
346 punpckhwd xmm6,xmm1 ; xmm6=BEH
347 psrld xmm0,1 ; xmm0=BEL*FIX(0.500)
348 psrld xmm6,1 ; xmm6=BEH*FIX(0.500)
349
350 movdqa xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
351
352 paddd xmm5,xmm0
353 paddd xmm4,xmm6
354 paddd xmm5,xmm1
355 paddd xmm4,xmm1
356 psrld xmm5,SCALEBITS ; xmm5=CbEL
357 psrld xmm4,SCALEBITS ; xmm4=CbEH
358 packssdw xmm5,xmm4 ; xmm5=CbE
359
360 psllw xmm7,BYTE_BIT ; move CbO into the high byte of each word
361 por xmm5,xmm7 ; xmm5=Cb
362 movdqa XMMWORD [rbx], xmm5 ; Save Cb
363
364 movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO
365 movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE
366 movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO
367
368 movdqa xmm4,xmm0
369 punpcklwd xmm0,xmm3
370 punpckhwd xmm4,xmm3
371 movdqa xmm7,xmm0
372 movdqa xmm5,xmm4
373 pmaddwd xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
374 pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
375 pmaddwd xmm7,[rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
376 pmaddwd xmm5,[rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
377
378 movdqa xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]
379
380 paddd xmm0, XMMWORD [wk(4)]
381 paddd xmm4, XMMWORD [wk(5)]
382 paddd xmm0,xmm3
383 paddd xmm4,xmm3
384 psrld xmm0,SCALEBITS ; xmm0=YOL
385 psrld xmm4,SCALEBITS ; xmm4=YOH
386 packssdw xmm0,xmm4 ; xmm0=YO
387
388 pxor xmm3,xmm3
389 pxor xmm4,xmm4
390 punpcklwd xmm3,xmm1 ; xmm3=ROL
391 punpckhwd xmm4,xmm1 ; xmm4=ROH
392 psrld xmm3,1 ; xmm3=ROL*FIX(0.500)
393 psrld xmm4,1 ; xmm4=ROH*FIX(0.500)
394
395 movdqa xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
396
397 paddd xmm7,xmm3
398 paddd xmm5,xmm4
399 paddd xmm7,xmm1
400 paddd xmm5,xmm1
401 psrld xmm7,SCALEBITS ; xmm7=CrOL
402 psrld xmm5,SCALEBITS ; xmm5=CrOH
403 packssdw xmm7,xmm5 ; xmm7=CrO
404
405 movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE
406
407 movdqa xmm4,xmm6
408 punpcklwd xmm6,xmm2
409 punpckhwd xmm4,xmm2
410 movdqa xmm1,xmm6
411 movdqa xmm5,xmm4
412 pmaddwd xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
413 pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
414 pmaddwd xmm1,[rel PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
415 pmaddwd xmm5,[rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
416
417 movdqa xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]
418
419 paddd xmm6, XMMWORD [wk(6)]
420 paddd xmm4, XMMWORD [wk(7)]
421 paddd xmm6,xmm2
422 paddd xmm4,xmm2
423 psrld xmm6,SCALEBITS ; xmm6=YEL
424 psrld xmm4,SCALEBITS ; xmm4=YEH
425 packssdw xmm6,xmm4 ; xmm6=YE
426
427 psllw xmm0,BYTE_BIT ; move YO into the high byte of each word
428 por xmm6,xmm0 ; xmm6=Y
429 movdqa XMMWORD [rdi], xmm6 ; Save Y
430
431 pxor xmm2,xmm2
432 pxor xmm4,xmm4
433 punpcklwd xmm2,xmm3 ; xmm2=REL
434 punpckhwd xmm4,xmm3 ; xmm4=REH
435 psrld xmm2,1 ; xmm2=REL*FIX(0.500)
436 psrld xmm4,1 ; xmm4=REH*FIX(0.500)
437
438 movdqa xmm0,[rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ]
439
440 paddd xmm1,xmm2
441 paddd xmm5,xmm4
442 paddd xmm1,xmm0
443 paddd xmm5,xmm0
444 psrld xmm1,SCALEBITS ; xmm1=CrEL
445 psrld xmm5,SCALEBITS ; xmm5=CrEH
446 packssdw xmm1,xmm5 ; xmm1=CrE
447
448 psllw xmm7,BYTE_BIT ; move CrO into the high byte of each word
449 por xmm1,xmm7 ; xmm1=Cr
450 movdqa XMMWORD [rdx], xmm1 ; Save Cr
451
452 sub rcx, byte SIZEOF_XMMWORD ; one block of columns done
453 add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
454 add rdi, byte SIZEOF_XMMWORD ; outptr0
455 add rbx, byte SIZEOF_XMMWORD ; outptr1
456 add rdx, byte SIZEOF_XMMWORD ; outptr2
457 cmp rcx, byte SIZEOF_XMMWORD
458 jae near .columnloop ; another full block remains
459 test rcx,rcx
460 jnz near .column_ld1 ; partial block remains: take the tail loader
461
462 pop rcx ; col
463 pop rsi
464 pop rdi
465 pop rbx
466 pop rdx
467
468 add rsi, byte SIZEOF_JSAMPROW ; input_buf
469 add rdi, byte SIZEOF_JSAMPROW
470 add rbx, byte SIZEOF_JSAMPROW
471 add rdx, byte SIZEOF_JSAMPROW
472 dec rax ; num_rows
473 jg near .rowloop
474
475 .return:
476 pop rbx
477 uncollect_args
478 mov rsp,rbp ; rsp <- aligned frame base
479 pop rsp ; rsp <- unaligned rsp stashed by the prologue (points at the saved rbp)
480 pop rbp ; restore the caller's rbp
481 ret
482
483 ; For some reason, the OS X linker does not honor the request to align the
484 ; segment unless we do this.
485 align 16
OLDNEW
« no previous file with comments | « simd/jccolext-sse2.asm ('k') | simd/jccolmmx.asm » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698