Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(477)

Side by Side Diff: third_party/libjpeg_turbo/simd/jcclrss2-64.asm

Issue 4134011: Adds libjpeg-turbo to deps... (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/
Patch Set: Created 10 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 ;
2 ; jcclrss2-64.asm - colorspace conversion (64-bit SSE2)
3 ;
4 ; x86 SIMD extension for IJG JPEG library
5 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
6 ; Copyright (C) 2009, D. R. Commander.
7 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
8 ;
9 ; This file should be assembled with NASM (Netwide Assembler),
10 ; can *not* be assembled with Microsoft's MASM or any compatible
11 ; assembler (including Borland's Turbo Assembler).
12 ; NASM is available from http://nasm.sourceforge.net/ or
13 ; http://sourceforge.net/project/showfiles.php?group_id=6208
14 ;
15 ; [TAB8]
16
17 %include "jcolsamp.inc"
18
19 ; --------------------------------------------------------------------------
20 SECTION SEG_TEXT
21 BITS 64
22 ;
23 ; Convert some rows of samples to the output colorspace.
24 ;
25 ; GLOBAL(void)
26 ; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width,
27 ; JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
28 ; JDIMENSION output_row, int num_rows);
29 ;
30
31 ; r10 = JDIMENSION img_width
32 ; r11 = JSAMPARRAY input_buf
33 ; r12 = JSAMPIMAGE output_buf
34 ; r13 = JDIMENSION output_row
35 ; r14 = int num_rows
36
37 %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
38 %define WK_NUM 8
39
40 align 16
41
42 global EXTN(jsimd_rgb_ycc_convert_sse2)
43
; --------------------------------------------------------------------------
; GLOBAL(void)
; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width,
;                             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
;                             JDIMENSION output_row, int num_rows);
;
; Converts num_rows rows of interleaved RGB_PIXELSIZE-byte pixels into three
; separate output planes (Y, Cb, Cr), SIZEOF_XMMWORD pixels per iteration.
;
; In (after collect_args):
;   r10d = img_width    (pixels per row; returns immediately when 0)
;   r11  = input_buf    (array of input row pointers)
;   r12  = output_buf   (array of three plane row-pointer arrays)
;   r13d = output_row   (index of the first output row in each plane)
;   r14d = num_rows     (returns immediately when <= 0)
;
; Register roles inside the loops:
;   rax = rows remaining          rcx = columns remaining in this row
;   rsi = inptr                   rdi/rbx/rdx = outptr0/outptr1/outptr2
;
; Stack: an xmmword-aligned frame holding wk[WK_NUM] scratch xmmwords, addressed
;        through rbp with wk(i).  rbx and rbp are saved and restored here.
;
; Notes:
;   * Results are stored with movdqa, so every output row must be xmmword
;     aligned.  A partial final group is still stored as a full xmmword, so
;     output rows need room up to the next multiple of SIZEOF_XMMWORD.
;   * The JDIMENSION/int arguments are read through the 32-bit register views
;     because the calling convention does not define the upper half of a
;     register that carries a 32-bit argument.
;     NOTE(review): assumes JDIMENSION is 32 bits wide, as in the IJG
;     headers - confirm against jmorecfg.h.
; --------------------------------------------------------------------------
EXTN(jsimd_rgb_ycc_convert_sse2):
        push        rbp
        mov         rax, rsp                    ; rax = rsp after the push (points at the saved rbp)
        sub         rsp, byte 4                 ; NOTE(review): 4 looks inherited from 32-bit code; an
                                                ; 8-byte slot still fits while rsp is 8-byte aligned
        and         rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
        mov         [rsp], rax                  ; stash the unaligned rsp for the epilogue
        mov         rbp, rsp                    ; rbp = aligned frame base
        lea         rsp, [wk(0)]                ; reserve xmmword wk[WK_NUM] below rbp
        collect_args
        push        rbx

        mov         ecx, r10d                   ; rcx = img_width, zero-extended from 32 bits
        test        rcx, rcx
        jz          near .return

        push        rcx                         ; keep the column count while rcx is reused below

        mov         rsi, r12                    ; rsi = output_buf
        mov         ecx, r13d                   ; rcx = output_row, zero-extended from 32 bits
        mov         rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
        mov         rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
        mov         rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
        lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]  ; rdi = &output_buf[0][output_row]
        lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]  ; rbx = &output_buf[1][output_row]
        lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]  ; rdx = &output_buf[2][output_row]

        pop         rcx                         ; rcx = img_width again

        mov         rsi, r11                    ; rsi = input_buf
        mov         eax, r14d                   ; eax = num_rows
        test        eax, eax                    ; signed 32-bit test so a negative count exits too
        jle         near .return
.rowloop:
        ; Save the per-row cursors; the column loop advances all of them.
        push        rdx
        push        rbx
        push        rdi
        push        rsi
        push        rcx                         ; col

        mov         rsi, JSAMPROW [rsi]         ; inptr
        mov         rdi, JSAMPROW [rdi]         ; outptr0
        mov         rbx, JSAMPROW [rbx]         ; outptr1
        mov         rdx, JSAMPROW [rdx]         ; outptr2

        cmp         rcx, byte SIZEOF_XMMWORD
        jae         near .columnloop

%if RGB_PIXELSIZE == 3 ; ---------------

        ; Partial group (fewer than SIZEOF_XMMWORD pixels left): gather the
        ; remaining bytes piecewise, from the end of the row towards its start,
        ; testing one bit of the byte count at a time.
.column_ld1:
        push        rax                         ; rax/rdx are borrowed as scratch here
        push        rdx
        lea         rcx, [rcx+rcx*2]            ; imul ecx,RGB_PIXELSIZE
        test        cl, SIZEOF_BYTE
        jz          short .column_ld2
        sub         rcx, byte SIZEOF_BYTE
        movzx       rax, BYTE [rsi+rcx]
.column_ld2:
        test        cl, SIZEOF_WORD
        jz          short .column_ld4
        sub         rcx, byte SIZEOF_WORD
        movzx       rdx, WORD [rsi+rcx]
        shl         rax, WORD_BIT
        or          rax, rdx
.column_ld4:
        movd        xmmA, eax
        pop         rdx
        pop         rax
        test        cl, SIZEOF_DWORD
        jz          short .column_ld8
        sub         rcx, byte SIZEOF_DWORD
        movd        xmmF, XMM_DWORD [rsi+rcx]
        pslldq      xmmA, SIZEOF_DWORD
        por         xmmA, xmmF
.column_ld8:
        test        cl, SIZEOF_MMWORD
        jz          short .column_ld16
        sub         rcx, byte SIZEOF_MMWORD
        movq        xmmB, XMM_MMWORD [rsi+rcx]
        pslldq      xmmA, SIZEOF_MMWORD
        por         xmmA, xmmB
.column_ld16:
        test        cl, SIZEOF_XMMWORD
        jz          short .column_ld32
        movdqa      xmmF, xmmA
        movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        mov         rcx, SIZEOF_XMMWORD         ; treat the tail as one full group
        jmp         short .rgb_ycc_cnv
.column_ld32:
        test        cl, 2*SIZEOF_XMMWORD
        mov         rcx, SIZEOF_XMMWORD         ; mov leaves the flags from test intact
        jz          short .rgb_ycc_cnv
        movdqa      xmmB, xmmA
        movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        jmp         short .rgb_ycc_cnv

.columnloop:
        movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        movdqu      xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]

.rgb_ycc_cnv:
        ; Layout notation: first digit = component index, second = pixel index.
        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
        ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
        ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

        movdqa      xmmG, xmmA
        pslldq      xmmA, 8                     ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
        psrldq      xmmG, 8                     ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)

        punpckhbw   xmmA, xmmF                  ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
        pslldq      xmmF, 8                     ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)

        punpcklbw   xmmG, xmmB                  ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
        punpckhbw   xmmF, xmmB                  ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)

        movdqa      xmmD, xmmA
        pslldq      xmmA, 8                     ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
        psrldq      xmmD, 8                     ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)

        punpckhbw   xmmA, xmmG                  ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
        pslldq      xmmG, 8                     ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)

        punpcklbw   xmmD, xmmF                  ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
        punpckhbw   xmmG, xmmF                  ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)

        movdqa      xmmE, xmmA
        pslldq      xmmA, 8                     ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
        psrldq      xmmE, 8                     ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)

        punpckhbw   xmmA, xmmD                  ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
        pslldq      xmmD, 8                     ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)

        punpcklbw   xmmE, xmmG                  ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
        punpckhbw   xmmD, xmmG                  ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)

        pxor        xmmH, xmmH                  ; zero, used to widen bytes to words

        movdqa      xmmC, xmmA
        punpcklbw   xmmA, xmmH                  ; xmmA=(00 02 04 06 08 0A 0C 0E)
        punpckhbw   xmmC, xmmH                  ; xmmC=(10 12 14 16 18 1A 1C 1E)

        movdqa      xmmB, xmmE
        punpcklbw   xmmE, xmmH                  ; xmmE=(20 22 24 26 28 2A 2C 2E)
        punpckhbw   xmmB, xmmH                  ; xmmB=(01 03 05 07 09 0B 0D 0F)

        movdqa      xmmF, xmmD
        punpcklbw   xmmD, xmmH                  ; xmmD=(11 13 15 17 19 1B 1D 1F)
        punpckhbw   xmmF, xmmH                  ; xmmF=(21 23 25 27 29 2B 2D 2F)

%else ; RGB_PIXELSIZE == 4 ; -----------

        ; Partial group (fewer than SIZEOF_XMMWORD pixels left): gather the
        ; remaining pixels piecewise, from the end of the row towards its start,
        ; testing one bit of the pixel count at a time.
.column_ld1:
        test        cl, SIZEOF_XMMWORD/16
        jz          short .column_ld2
        sub         rcx, byte SIZEOF_XMMWORD/16
        movd        xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
        test        cl, SIZEOF_XMMWORD/8
        jz          short .column_ld4
        sub         rcx, byte SIZEOF_XMMWORD/8
        movq        xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
        pslldq      xmmA, SIZEOF_MMWORD
        por         xmmA, xmmE
.column_ld4:
        test        cl, SIZEOF_XMMWORD/4
        jz          short .column_ld8
        sub         rcx, byte SIZEOF_XMMWORD/4
        movdqa      xmmE, xmmA
        movdqu      xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld8:
        test        cl, SIZEOF_XMMWORD/2
        mov         rcx, SIZEOF_XMMWORD         ; mov leaves the flags from test intact
        jz          short .rgb_ycc_cnv
        movdqa      xmmF, xmmA
        movdqa      xmmH, xmmE
        movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        jmp         short .rgb_ycc_cnv

.columnloop:
        movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        movdqu      xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
        movdqu      xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]

.rgb_ycc_cnv:
        ; Layout notation: first digit = component index, second = pixel index.
        ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
        ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
        ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
        ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

        movdqa      xmmD, xmmA
        punpcklbw   xmmA, xmmE                  ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
        punpckhbw   xmmD, xmmE                  ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)

        movdqa      xmmC, xmmF
        punpcklbw   xmmF, xmmH                  ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
        punpckhbw   xmmC, xmmH                  ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)

        movdqa      xmmB, xmmA
        punpcklwd   xmmA, xmmF                  ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
        punpckhwd   xmmB, xmmF                  ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)

        movdqa      xmmG, xmmD
        punpcklwd   xmmD, xmmC                  ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
        punpckhwd   xmmG, xmmC                  ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)

        movdqa      xmmE, xmmA
        punpcklbw   xmmA, xmmD                  ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
        punpckhbw   xmmE, xmmD                  ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)

        movdqa      xmmH, xmmB
        punpcklbw   xmmB, xmmG                  ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
        punpckhbw   xmmH, xmmG                  ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)

        pxor        xmmF, xmmF                  ; zero, used to widen bytes to words

        movdqa      xmmC, xmmA
        punpcklbw   xmmA, xmmF                  ; xmmA=(00 02 04 06 08 0A 0C 0E)
        punpckhbw   xmmC, xmmF                  ; xmmC=(10 12 14 16 18 1A 1C 1E)

        movdqa      xmmD, xmmB
        punpcklbw   xmmB, xmmF                  ; xmmB=(01 03 05 07 09 0B 0D 0F)
        punpckhbw   xmmD, xmmF                  ; xmmD=(11 13 15 17 19 1B 1D 1F)

        movdqa      xmmG, xmmE
        punpcklbw   xmmE, xmmF                  ; xmmE=(20 22 24 26 28 2A 2C 2E)
        punpckhbw   xmmG, xmmF                  ; xmmG=(30 32 34 36 38 3A 3C 3E)

        ; xmmF is the zero register and also the destination here, so the
        ; samples land in the high byte of each word and are shifted back down.
        punpcklbw   xmmF, xmmH
        punpckhbw   xmmH, xmmH
        psrlw       xmmF, BYTE_BIT              ; xmmF=(21 23 25 27 29 2B 2D 2F)
        psrlw       xmmH, BYTE_BIT              ; xmmH=(31 33 35 37 39 3B 3D 3F)

%endif ; RGB_PIXELSIZE ; ---------------

        ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
        ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO

        ; (Original)
        ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
        ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
        ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
        ;
        ; (This implementation)
        ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
        ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
        ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE

        movdqa      XMMWORD [wk(0)], xmm0       ; wk(0)=RE
        movdqa      XMMWORD [wk(1)], xmm1       ; wk(1)=RO
        movdqa      XMMWORD [wk(2)], xmm4       ; wk(2)=BE
        movdqa      XMMWORD [wk(3)], xmm5       ; wk(3)=BO

        ; ---- odd pixels: R/G partial sums for Y, then Cb ----
        movdqa      xmm6, xmm1
        punpcklwd   xmm1, xmm3
        punpckhwd   xmm6, xmm3
        movdqa      xmm7, xmm1
        movdqa      xmm4, xmm6
        pmaddwd     xmm1, [rel PW_F0299_F0337]  ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
        pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
        pmaddwd     xmm7, [rel PW_MF016_MF033]  ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
        pmaddwd     xmm4, [rel PW_MF016_MF033]  ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)

        movdqa      XMMWORD [wk(4)], xmm1       ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
        movdqa      XMMWORD [wk(5)], xmm6       ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)

        pxor        xmm1, xmm1
        pxor        xmm6, xmm6
        punpcklwd   xmm1, xmm5                  ; xmm1=BOL
        punpckhwd   xmm6, xmm5                  ; xmm6=BOH
        psrld       xmm1, 1                     ; xmm1=BOL*FIX(0.500)
        psrld       xmm6, 1                     ; xmm6=BOH*FIX(0.500)

        movdqa      xmm5, [rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ]

        paddd       xmm7, xmm1
        paddd       xmm4, xmm6
        paddd       xmm7, xmm5
        paddd       xmm4, xmm5
        psrld       xmm7, SCALEBITS             ; xmm7=CbOL
        psrld       xmm4, SCALEBITS             ; xmm4=CbOH
        packssdw    xmm7, xmm4                  ; xmm7=CbO

        movdqa      xmm1, XMMWORD [wk(2)]       ; xmm1=BE

        ; ---- even pixels: R/G partial sums for Y, then Cb ----
        movdqa      xmm6, xmm0
        punpcklwd   xmm0, xmm2
        punpckhwd   xmm6, xmm2
        movdqa      xmm5, xmm0
        movdqa      xmm4, xmm6
        pmaddwd     xmm0, [rel PW_F0299_F0337]  ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
        pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
        pmaddwd     xmm5, [rel PW_MF016_MF033]  ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
        pmaddwd     xmm4, [rel PW_MF016_MF033]  ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)

        movdqa      XMMWORD [wk(6)], xmm0       ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
        movdqa      XMMWORD [wk(7)], xmm6       ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)

        pxor        xmm0, xmm0
        pxor        xmm6, xmm6
        punpcklwd   xmm0, xmm1                  ; xmm0=BEL
        punpckhwd   xmm6, xmm1                  ; xmm6=BEH
        psrld       xmm0, 1                     ; xmm0=BEL*FIX(0.500)
        psrld       xmm6, 1                     ; xmm6=BEH*FIX(0.500)

        movdqa      xmm1, [rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]

        paddd       xmm5, xmm0
        paddd       xmm4, xmm6
        paddd       xmm5, xmm1
        paddd       xmm4, xmm1
        psrld       xmm5, SCALEBITS             ; xmm5=CbEL
        psrld       xmm4, SCALEBITS             ; xmm4=CbEH
        packssdw    xmm5, xmm4                  ; xmm5=CbE

        psllw       xmm7, BYTE_BIT              ; odd samples go to the high byte of each word
        por         xmm5, xmm7                  ; xmm5=Cb
        movdqa      XMMWORD [rbx], xmm5         ; Save Cb

        movdqa      xmm0, XMMWORD [wk(3)]       ; xmm0=BO
        movdqa      xmm6, XMMWORD [wk(2)]       ; xmm6=BE
        movdqa      xmm1, XMMWORD [wk(1)]       ; xmm1=RO

        ; ---- odd pixels: B/G terms, finish Y, then Cr ----
        movdqa      xmm4, xmm0
        punpcklwd   xmm0, xmm3
        punpckhwd   xmm4, xmm3
        movdqa      xmm7, xmm0
        movdqa      xmm5, xmm4
        pmaddwd     xmm0, [rel PW_F0114_F0250]  ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
        pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
        pmaddwd     xmm7, [rel PW_MF008_MF041]  ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
        pmaddwd     xmm5, [rel PW_MF008_MF041]  ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)

        movdqa      xmm3, [rel PD_ONEHALF]      ; xmm3=[PD_ONEHALF]

        paddd       xmm0, XMMWORD [wk(4)]
        paddd       xmm4, XMMWORD [wk(5)]
        paddd       xmm0, xmm3
        paddd       xmm4, xmm3
        psrld       xmm0, SCALEBITS             ; xmm0=YOL
        psrld       xmm4, SCALEBITS             ; xmm4=YOH
        packssdw    xmm0, xmm4                  ; xmm0=YO

        pxor        xmm3, xmm3
        pxor        xmm4, xmm4
        punpcklwd   xmm3, xmm1                  ; xmm3=ROL
        punpckhwd   xmm4, xmm1                  ; xmm4=ROH
        psrld       xmm3, 1                     ; xmm3=ROL*FIX(0.500)
        psrld       xmm4, 1                     ; xmm4=ROH*FIX(0.500)

        movdqa      xmm1, [rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]

        paddd       xmm7, xmm3
        paddd       xmm5, xmm4
        paddd       xmm7, xmm1
        paddd       xmm5, xmm1
        psrld       xmm7, SCALEBITS             ; xmm7=CrOL
        psrld       xmm5, SCALEBITS             ; xmm5=CrOH
        packssdw    xmm7, xmm5                  ; xmm7=CrO

        movdqa      xmm3, XMMWORD [wk(0)]       ; xmm3=RE

        ; ---- even pixels: B/G terms, finish Y, then Cr ----
        movdqa      xmm4, xmm6
        punpcklwd   xmm6, xmm2
        punpckhwd   xmm4, xmm2
        movdqa      xmm1, xmm6
        movdqa      xmm5, xmm4
        pmaddwd     xmm6, [rel PW_F0114_F0250]  ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
        pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
        pmaddwd     xmm1, [rel PW_MF008_MF041]  ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
        pmaddwd     xmm5, [rel PW_MF008_MF041]  ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)

        movdqa      xmm2, [rel PD_ONEHALF]      ; xmm2=[PD_ONEHALF]

        paddd       xmm6, XMMWORD [wk(6)]
        paddd       xmm4, XMMWORD [wk(7)]
        paddd       xmm6, xmm2
        paddd       xmm4, xmm2
        psrld       xmm6, SCALEBITS             ; xmm6=YEL
        psrld       xmm4, SCALEBITS             ; xmm4=YEH
        packssdw    xmm6, xmm4                  ; xmm6=YE

        psllw       xmm0, BYTE_BIT              ; odd samples go to the high byte of each word
        por         xmm6, xmm0                  ; xmm6=Y
        movdqa      XMMWORD [rdi], xmm6         ; Save Y

        pxor        xmm2, xmm2
        pxor        xmm4, xmm4
        punpcklwd   xmm2, xmm3                  ; xmm2=REL
        punpckhwd   xmm4, xmm3                  ; xmm4=REH
        psrld       xmm2, 1                     ; xmm2=REL*FIX(0.500)
        psrld       xmm4, 1                     ; xmm4=REH*FIX(0.500)

        movdqa      xmm0, [rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ]

        paddd       xmm1, xmm2
        paddd       xmm5, xmm4
        paddd       xmm1, xmm0
        paddd       xmm5, xmm0
        psrld       xmm1, SCALEBITS             ; xmm1=CrEL
        psrld       xmm5, SCALEBITS             ; xmm5=CrEH
        packssdw    xmm1, xmm5                  ; xmm1=CrE

        psllw       xmm7, BYTE_BIT              ; odd samples go to the high byte of each word
        por         xmm1, xmm7                  ; xmm1=Cr
        movdqa      XMMWORD [rdx], xmm1         ; Save Cr

        ; Advance to the next group of SIZEOF_XMMWORD pixels.
        sub         rcx, byte SIZEOF_XMMWORD
        add         rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
        add         rdi, byte SIZEOF_XMMWORD    ; outptr0
        add         rbx, byte SIZEOF_XMMWORD    ; outptr1
        add         rdx, byte SIZEOF_XMMWORD    ; outptr2
        cmp         rcx, byte SIZEOF_XMMWORD
        jae         near .columnloop
        test        rcx, rcx
        jnz         near .column_ld1            ; 1..SIZEOF_XMMWORD-1 pixels left: partial load

        ; Row finished: restore the row-pointer cursors and step to the next row.
        pop         rcx                         ; col
        pop         rsi
        pop         rdi
        pop         rbx
        pop         rdx

        add         rsi, byte SIZEOF_JSAMPROW   ; input_buf
        add         rdi, byte SIZEOF_JSAMPROW
        add         rbx, byte SIZEOF_JSAMPROW
        add         rdx, byte SIZEOF_JSAMPROW
        dec         rax                         ; num_rows
        jg          near .rowloop

.return:
        pop         rbx
        uncollect_args
        mov         rsp, rbp                    ; rsp <- aligned frame base
        pop         rsp                         ; rsp <- unaligned rsp stashed in the prologue
        pop         rbp
        ret
484
485 ; For some reason, the OS X linker does not honor the request to align the
486 ; segment unless we do this.
487 align 16
OLDNEW
« no previous file with comments | « third_party/libjpeg_turbo/simd/jcclrss2.asm ('k') | third_party/libjpeg_turbo/simd/jccolmmx.asm » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698