Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(499)

Side by Side Diff: third_party/libjpeg_turbo/simd/jdclrmmx.asm

Issue 4134011: Adds libjpeg-turbo to deps... (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/
Patch Set: Created 10 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 ;
2 ; jdclrmmx.asm - colorspace conversion (MMX)
3 ;
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5 ;
6 ; Based on
7 ; x86 SIMD extension for IJG JPEG library
8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
10 ;
11 ; This file should be assembled with NASM (Netwide Assembler),
12 ; can *not* be assembled with Microsoft's MASM or any compatible
13 ; assembler (including Borland's Turbo Assembler).
14 ; NASM is available from http://nasm.sourceforge.net/ or
15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
16 ;
17 ; [TAB8]
18
19 %include "jcolsamp.inc"
20
21 ; --------------------------------------------------------------------------
22 SECTION SEG_TEXT
23 BITS 32
24 ;
25 ; Convert some rows of samples to the output colorspace.
26 ;
27 ; GLOBAL(void)
28 ; jsimd_ycc_rgb_convert_mmx (JDIMENSION out_width,
29 ; JSAMPIMAGE input_buf, JDIMENSION input_row,
30 ; JSAMPARRAY output_buf, int num_rows)
31 ; All five arguments are read from the stack through the accessor macros below.
32 ; b = stack address taken right after "push ebp" (the function passes eax): +0 saved ebp, +4 return address, +8 first argument
33 %define out_width(b) (b)+8 ; JDIMENSION out_width
34 %define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
35 %define input_row(b) (b)+16 ; JDIMENSION input_row
36 %define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
37 %define num_rows(b) (b)+24 ; int num_rows
38 ; Locals are addressed relative to the aligned ebp that the prologue sets up
39 %define original_ebp ebp+0 ; slot holding the pre-alignment frame address
40 %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM], placed directly below ebp
41 %define WK_NUM 2 ; number of mmword work slots
42 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr, one pointer below wk[0]
43
44 align 16
45 global EXTN(jsimd_ycc_rgb_convert_mmx)
46
47 EXTN(jsimd_ycc_rgb_convert_mmx): ; converts num_rows rows of Y/Cb/Cr samples to RGB, SIZEOF_MMWORD columns per pass
48 push ebp
49 mov eax,esp ; eax = original ebp (frame address used to reach the arguments)
50 sub esp, byte 4 ; room for the frame address stored two lines below
51 and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
52 mov [esp],eax ; [original_ebp] = eax; the epilogue's "pop esp" reloads it
53 mov ebp,esp ; ebp = aligned ebp
54 lea esp, [wk(0)] ; esp = &wk[0]: reserves the mmword work area below ebp
55 pushpic eax ; make room for GOT address
56 push ebx
57 ; push ecx ; need not be preserved
58 ; push edx ; need not be preserved
59 push esi
60 push edi
61
62 get_GOT ebx ; get GOT address
63 movpic POINTER [gotptr], ebx ; save GOT address
64
65 mov ecx, JDIMENSION [out_width(eax)] ; num_cols
66 test ecx,ecx
67 jz near .return ; zero-width output: nothing to convert
68
69 push ecx ; park num_cols; ecx is reused for input_row below
70
71 mov edi, JSAMPIMAGE [input_buf(eax)]
72 mov ecx, JDIMENSION [input_row(eax)]
73 mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] ; row array of component 0 (read as Y below)
74 mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] ; row array of component 1 (read as Cb below)
75 mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] ; row array of component 2 (read as Cr below)
76 lea esi, [esi+ecx*SIZEOF_JSAMPROW] ; esi = &input_buf[0][input_row]
77 lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] ; ebx = &input_buf[1][input_row]
78 lea edx, [edx+ecx*SIZEOF_JSAMPROW] ; edx = &input_buf[2][input_row]
79
80 pop ecx ; ecx = num_cols again
81
82 mov edi, JSAMPARRAY [output_buf(eax)]
83 mov eax, INT [num_rows(eax)] ; eax becomes the row counter from here on
84 test eax,eax
85 jle near .return ; num_rows <= 0: nothing to convert
86 alignx 16,7
87 .rowloop:
88 push eax ; row counter
89 push edi ; output row-pointer cursor
90 push edx ; component 2 row-pointer cursor
91 push ebx ; component 1 row-pointer cursor
92 push esi ; component 0 row-pointer cursor
93 push ecx ; col
94
95 mov esi, JSAMPROW [esi] ; inptr0
96 mov ebx, JSAMPROW [ebx] ; inptr1
97 mov edx, JSAMPROW [edx] ; inptr2
98 mov edi, JSAMPROW [edi] ; outptr
99 movpic eax, POINTER [gotptr] ; load GOT address (eax)
100 alignx 16,7
101 .columnloop:
102
103 movq mm5, MMWORD [ebx] ; mm5=Cb(01234567)
104 movq mm1, MMWORD [edx] ; mm1=Cr(01234567)
105
106 pcmpeqw mm4,mm4 ; mm4 = all ones
107 pcmpeqw mm7,mm7 ; mm7 = all ones
108 psrlw mm4,BYTE_BIT ; mm4 = mask keeping the low byte of every word
109 psllw mm7,7 ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
110 movq mm0,mm4 ; mm0=mm4={0xFF 0x00 0xFF 0x00 ..}
111
112 pand mm4,mm5 ; mm4=Cb(0246)=CbE
113 psrlw mm5,BYTE_BIT ; mm5=Cb(1357)=CbO
114 pand mm0,mm1 ; mm0=Cr(0246)=CrE
115 psrlw mm1,BYTE_BIT ; mm1=Cr(1357)=CrO
116
117 paddw mm4,mm7 ; CbE-128 (0xFF80 is -128 as a signed word)
118 paddw mm5,mm7 ; CbO-128
119 paddw mm0,mm7 ; CrE-128
120 paddw mm1,mm7 ; CrO-128
121
122 ; (Original)
123 ; R = Y + 1.40200 * Cr
124 ; G = Y - 0.34414 * Cb - 0.71414 * Cr
125 ; B = Y + 1.77200 * Cb
126 ;
127 ; (This implementation)
128 ; R = Y + 0.40200 * Cr + Cr
129 ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
130 ; B = Y - 0.22800 * Cb + Cb + Cb
131
132 movq mm2,mm4 ; mm2=CbE
133 movq mm3,mm5 ; mm3=CbO
134 paddw mm4,mm4 ; mm4=2*CbE
135 paddw mm5,mm5 ; mm5=2*CbO
136 movq mm6,mm0 ; mm6=CrE
137 movq mm7,mm1 ; mm7=CrO
138 paddw mm0,mm0 ; mm0=2*CrE
139 paddw mm1,mm1 ; mm1=2*CrO
140
141 pmulhw mm4,[GOTOFF(eax,PW_MF0228)] ; mm4=(2*CbE * -FIX(0.22800))
142 pmulhw mm5,[GOTOFF(eax,PW_MF0228)] ; mm5=(2*CbO * -FIX(0.22800))
143 pmulhw mm0,[GOTOFF(eax,PW_F0402)] ; mm0=(2*CrE * FIX(0.40200))
144 pmulhw mm1,[GOTOFF(eax,PW_F0402)] ; mm1=(2*CrO * FIX(0.40200))
145
146 paddw mm4,[GOTOFF(eax,PW_ONE)] ; rounding term before the shift (PW_ONE is defined outside this view)
147 paddw mm5,[GOTOFF(eax,PW_ONE)]
148 psraw mm4,1 ; mm4=(CbE * -FIX(0.22800))
149 psraw mm5,1 ; mm5=(CbO * -FIX(0.22800))
150 paddw mm0,[GOTOFF(eax,PW_ONE)]
151 paddw mm1,[GOTOFF(eax,PW_ONE)]
152 psraw mm0,1 ; mm0=(CrE * FIX(0.40200))
153 psraw mm1,1 ; mm1=(CrO * FIX(0.40200))
154
155 paddw mm4,mm2 ; first "+ Cb" of the B formula (even columns)
156 paddw mm5,mm3 ; first "+ Cb" of the B formula (odd columns)
157 paddw mm4,mm2 ; mm4=(CbE * FIX(1.77200))=(B-Y)E
158 paddw mm5,mm3 ; mm5=(CbO * FIX(1.77200))=(B-Y)O
159 paddw mm0,mm6 ; mm0=(CrE * FIX(1.40200))=(R-Y)E
160 paddw mm1,mm7 ; mm1=(CrO * FIX(1.40200))=(R-Y)O
161
162 movq MMWORD [wk(0)], mm4 ; wk(0)=(B-Y)E
163 movq MMWORD [wk(1)], mm5 ; wk(1)=(B-Y)O
164
165 movq mm4,mm2 ; mm4=CbE (second copy for the high half)
166 movq mm5,mm3 ; mm5=CbO (second copy for the high half)
167 punpcklwd mm2,mm6 ; interleave low CbE/CrE words so pmaddwd sums each Cb,Cr pair
168 punpckhwd mm4,mm6 ; same for the high CbE/CrE words
169 pmaddwd mm2,[GOTOFF(eax,PW_MF0344_F0285)]
170 pmaddwd mm4,[GOTOFF(eax,PW_MF0344_F0285)]
171 punpcklwd mm3,mm7 ; interleave low CbO/CrO words
172 punpckhwd mm5,mm7 ; interleave high CbO/CrO words
173 pmaddwd mm3,[GOTOFF(eax,PW_MF0344_F0285)]
174 pmaddwd mm5,[GOTOFF(eax,PW_MF0344_F0285)]
175
176 paddd mm2,[GOTOFF(eax,PD_ONEHALF)] ; rounding term before the SCALEBITS shift (constant defined outside this view)
177 paddd mm4,[GOTOFF(eax,PD_ONEHALF)]
178 psrad mm2,SCALEBITS
179 psrad mm4,SCALEBITS
180 paddd mm3,[GOTOFF(eax,PD_ONEHALF)]
181 paddd mm5,[GOTOFF(eax,PD_ONEHALF)]
182 psrad mm3,SCALEBITS
183 psrad mm5,SCALEBITS
184
185 packssdw mm2,mm4 ; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
186 packssdw mm3,mm5 ; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
187 psubw mm2,mm6 ; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
188 psubw mm3,mm7 ; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
189
190 movq mm5, MMWORD [esi] ; mm5=Y(01234567)
191
192 pcmpeqw mm4,mm4
193 psrlw mm4,BYTE_BIT ; mm4={0xFF 0x00 0xFF 0x00 ..}
194 pand mm4,mm5 ; mm4=Y(0246)=YE
195 psrlw mm5,BYTE_BIT ; mm5=Y(1357)=YO
196
197 paddw mm0,mm4 ; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6)
198 paddw mm1,mm5 ; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7)
199 packuswb mm0,mm0 ; mm0=(R0 R2 R4 R6 ** ** ** **)
200 packuswb mm1,mm1 ; mm1=(R1 R3 R5 R7 ** ** ** **)
201
202 paddw mm2,mm4 ; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6)
203 paddw mm3,mm5 ; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7)
204 packuswb mm2,mm2 ; mm2=(G0 G2 G4 G6 ** ** ** **)
205 packuswb mm3,mm3 ; mm3=(G1 G3 G5 G7 ** ** ** **)
206
207 paddw mm4, MMWORD [wk(0)] ; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6)
208 paddw mm5, MMWORD [wk(1)] ; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7)
209 packuswb mm4,mm4 ; mm4=(B0 B2 B4 B6 ** ** ** **)
210 packuswb mm5,mm5 ; mm5=(B1 B3 B5 B7 ** ** ** **)
211
212 %if RGB_PIXELSIZE == 3 ; ---------------
213 ; NOTE(review): mmA..mmH are register aliases defined outside this view (presumably in jcolsamp.inc) -- confirm the mapping there
214 ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
215 ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
216 ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
217 ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
218
219 punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16)
220 punpcklbw mmE,mmB ; mmE=(20 01 22 03 24 05 26 07)
221 punpcklbw mmD,mmF ; mmD=(11 21 13 23 15 25 17 27)
222
223 movq mmG,mmA
224 movq mmH,mmA
225 punpcklwd mmA,mmE ; mmA=(00 10 20 01 02 12 22 03)
226 punpckhwd mmG,mmE ; mmG=(04 14 24 05 06 16 26 07)
227
228 psrlq mmH,2*BYTE_BIT ; mmH=(02 12 04 14 06 16 -- --)
229 psrlq mmE,2*BYTE_BIT ; mmE=(22 03 24 05 26 07 -- --)
230
231 movq mmC,mmD
232 movq mmB,mmD
233 punpcklwd mmD,mmH ; mmD=(11 21 02 12 13 23 04 14)
234 punpckhwd mmC,mmH ; mmC=(15 25 06 16 17 27 -- --)
235
236 psrlq mmB,2*BYTE_BIT ; mmB=(13 23 15 25 17 27 -- --)
237
238 movq mmF,mmE
239 punpcklwd mmE,mmB ; mmE=(22 03 13 23 24 05 15 25)
240 punpckhwd mmF,mmB ; mmF=(26 07 17 27 -- -- -- --)
241
242 punpckldq mmA,mmD ; mmA=(00 10 20 01 11 21 02 12)
243 punpckldq mmE,mmG ; mmE=(22 03 13 23 04 14 24 05)
244 punpckldq mmC,mmF ; mmC=(15 25 06 16 26 07 17 27)
245
246 cmp ecx, byte SIZEOF_MMWORD
247 jb short .column_st16 ; fewer than SIZEOF_MMWORD columns left: partial store
248
249 movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
250 movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
251 movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
252
253 sub ecx, byte SIZEOF_MMWORD
254 jz short .nextrow ; no columns left in this row
255
256 add esi, byte SIZEOF_MMWORD ; inptr0
257 add ebx, byte SIZEOF_MMWORD ; inptr1
258 add edx, byte SIZEOF_MMWORD ; inptr2
259 add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
260 jmp near .columnloop
261 alignx 16,7
262 ; Partial store: ecx counts bytes from here on and is written out in shrinking pieces
263 .column_st16:
264 lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
265 cmp ecx, byte 2*SIZEOF_MMWORD
266 jb short .column_st8
267 movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
268 movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
269 movq mmA,mmC ; remaining bytes now come from the third mmword
270 sub ecx, byte 2*SIZEOF_MMWORD
271 add edi, byte 2*SIZEOF_MMWORD
272 jmp short .column_st4 ; skips .column_st8: branches straight to the dword/word/byte stores
273 .column_st8:
274 cmp ecx, byte SIZEOF_MMWORD
275 jb short .column_st4
276 movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
277 movq mmA,mmE ; remaining bytes now come from the second mmword
278 sub ecx, byte SIZEOF_MMWORD
279 add edi, byte SIZEOF_MMWORD
280 .column_st4:
281 movd eax,mmA ; eax = low dword of mmA; overwrites the GOT address, which is not used again before .nextrow pops eax
282 cmp ecx, byte SIZEOF_DWORD
283 jb short .column_st2
284 mov DWORD [edi+0*SIZEOF_DWORD], eax
285 psrlq mmA,DWORD_BIT ; bring the upper dword down
286 movd eax,mmA
287 sub ecx, byte SIZEOF_DWORD
288 add edi, byte SIZEOF_DWORD
289 .column_st2:
290 cmp ecx, byte SIZEOF_WORD
291 jb short .column_st1
292 mov WORD [edi+0*SIZEOF_WORD], ax
293 shr eax,WORD_BIT ; bring the next byte into al
294 sub ecx, byte SIZEOF_WORD
295 add edi, byte SIZEOF_WORD
296 .column_st1:
297 cmp ecx, byte SIZEOF_BYTE
298 jb short .nextrow
299 mov BYTE [edi+0*SIZEOF_BYTE], al
300
301 %else ; RGB_PIXELSIZE == 4 ; -----------
302
303 %ifdef RGBX_FILLER_0XFF
304 pcmpeqb mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
305 pcmpeqb mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
306 %else
307 pxor mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
308 pxor mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
309 %endif
310 ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
311 ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
312 ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
313 ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
314
315 punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16)
316 punpcklbw mmE,mmG ; mmE=(20 30 22 32 24 34 26 36)
317 punpcklbw mmB,mmD ; mmB=(01 11 03 13 05 15 07 17)
318 punpcklbw mmF,mmH ; mmF=(21 31 23 33 25 35 27 37)
319
320 movq mmC,mmA
321 punpcklwd mmA,mmE ; mmA=(00 10 20 30 02 12 22 32)
322 punpckhwd mmC,mmE ; mmC=(04 14 24 34 06 16 26 36)
323 movq mmG,mmB
324 punpcklwd mmB,mmF ; mmB=(01 11 21 31 03 13 23 33)
325 punpckhwd mmG,mmF ; mmG=(05 15 25 35 07 17 27 37)
326
327 movq mmD,mmA
328 punpckldq mmA,mmB ; mmA=(00 10 20 30 01 11 21 31)
329 punpckhdq mmD,mmB ; mmD=(02 12 22 32 03 13 23 33)
330 movq mmH,mmC
331 punpckldq mmC,mmG ; mmC=(04 14 24 34 05 15 25 35)
332 punpckhdq mmH,mmG ; mmH=(06 16 26 36 07 17 27 37)
333
334 cmp ecx, byte SIZEOF_MMWORD
335 jb short .column_st16 ; fewer than SIZEOF_MMWORD columns left: partial store
336
337 movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
338 movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
339 movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
340 movq MMWORD [edi+3*SIZEOF_MMWORD], mmH
341
342 sub ecx, byte SIZEOF_MMWORD
343 jz short .nextrow ; no columns left in this row
344
345 add esi, byte SIZEOF_MMWORD ; inptr0
346 add ebx, byte SIZEOF_MMWORD ; inptr1
347 add edx, byte SIZEOF_MMWORD ; inptr2
348 add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
349 jmp near .columnloop
350 alignx 16,7
351 ; Partial store: ecx still counts columns here; each mmword holds two output pixels
352 .column_st16:
353 cmp ecx, byte SIZEOF_MMWORD/2
354 jb short .column_st8
355 movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
356 movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
357 movq mmA,mmC ; shift the not-yet-stored pixels down into mmA/mmD
358 movq mmD,mmH
359 sub ecx, byte SIZEOF_MMWORD/2
360 add edi, byte 2*SIZEOF_MMWORD
361 .column_st8:
362 cmp ecx, byte SIZEOF_MMWORD/4
363 jb short .column_st4
364 movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
365 movq mmA,mmD ; next pixel pair becomes the store source
366 sub ecx, byte SIZEOF_MMWORD/4
367 add edi, byte 1*SIZEOF_MMWORD
368 .column_st4:
369 cmp ecx, byte SIZEOF_MMWORD/8
370 jb short .nextrow
371 movd DWORD [edi+0*SIZEOF_DWORD], mmA ; final single pixel (low dword of mmA)
372
373 %endif ; RGB_PIXELSIZE ; ---------------
374
375 alignx 16,7
376
377 .nextrow:
378 pop ecx ; restore the values pushed at .rowloop, in reverse order
379 pop esi
380 pop ebx
381 pop edx
382 pop edi
383 pop eax
384
385 add esi, byte SIZEOF_JSAMPROW ; step every row-pointer cursor to the next row
386 add ebx, byte SIZEOF_JSAMPROW
387 add edx, byte SIZEOF_JSAMPROW
388 add edi, byte SIZEOF_JSAMPROW ; output_buf
389 dec eax ; num_rows
390 jg near .rowloop
391
392 emms ; empty MMX state
393
394 .return:
395 pop edi
396 pop esi
397 ; pop edx ; need not be preserved
398 ; pop ecx ; need not be preserved
399 pop ebx
400 mov esp,ebp ; esp <- aligned ebp
401 pop esp ; esp <- original ebp
402 pop ebp
403 ret
404
405 ; For some reason, the OS X linker does not honor the request to align the
406 ; segment unless we do this (a trailing alignment directive after the last instruction).
407 align 16
OLDNEW
« no previous file with comments | « third_party/libjpeg_turbo/simd/jcsamss2-64.asm ('k') | third_party/libjpeg_turbo/simd/jdclrss2.asm » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698