Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(183)

Side by Side Diff: simd/jcgryext-sse2-64.asm

Issue 1934113002: Update libjpeg_turbo to 1.4.90 from https://github.com/libjpeg-turbo/ (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libjpeg_turbo.git@master
Patch Set: Created 4 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 ;
2 ; jcgryext.asm - grayscale colorspace conversion (64-bit SSE2)
3 ;
4 ; x86 SIMD extension for IJG JPEG library
5 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
6 ; Copyright (C) 2011, D. R. Commander.
7 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
8 ;
9 ; This file should be assembled with NASM (Netwide Assembler),
10 ; can *not* be assembled with Microsoft's MASM or any compatible
11 ; assembler (including Borland's Turbo Assembler).
12 ; NASM is available from http://nasm.sourceforge.net/ or
13 ; http://sourceforge.net/project/showfiles.php?group_id=6208
14 ;
15 ; [TAB8]
16
17 %include "jcolsamp.inc"
18
19 ; --------------------------------------------------------------------------
20 ;
21 ; Convert some rows of samples to the output colorspace.
22 ;
23 ; GLOBAL(void)
24 ; jsimd_rgb_gray_convert_sse2 (JDIMENSION img_width,
25 ; JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
26 ; JDIMENSION output_row, int num_rows);
27 ;
28
29 ; r10 = JDIMENSION img_width
30 ; r11 = JSAMPARRAY input_buf
31 ; r12 = JSAMPIMAGE output_buf
32 ; r13 = JDIMENSION output_row
33 ; r14 = int num_rows
34
35 %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM], addressed downward from the aligned rbp
36 %define WK_NUM 2
37
38 align 16
39
40 global EXTN(jsimd_rgb_gray_convert_sse2)
41
42 EXTN(jsimd_rgb_gray_convert_sse2):
43 push rbp
44 mov rax,rsp ; rax = rsp after the push (address of the saved rbp)
45 sub rsp, byte 4 ; step down before rounding (assumes rsp is 16-byte aligned after the push, per the 64-bit ABIs)
46 and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
47 mov [rsp],rax ; stash the pre-alignment rsp so the epilogue can restore it
48 mov rbp,rsp ; rbp = aligned rbp
49 lea rsp, [wk(0)] ; reserve the wk[] scratch area below rbp
50 collect_args ; macro defined outside this file; presumably loads the arguments into r10-r14 as listed above - confirm
51 push rbx ; popped again at .return
52
53 mov ecx, r10d ; ecx = img_width (32-bit move zero-extends into rcx)
54 test rcx,rcx
55 jz near .return ; zero-width image: nothing to convert
56
57 push rcx ; keep img_width while rcx serves as an index
58
59 mov rsi, r12 ; rsi = output_buf
60 mov ecx, r13d ; ecx = output_row
61 mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY] ; rdi = output_buf[0]
62 lea rdi, [rdi+rcx*SIZEOF_JSAMPROW] ; rdi = &output_buf[0][output_row]
63
64 pop rcx ; rcx = img_width again
65
66 mov rsi, r11 ; rsi = input_buf
67 mov eax, r14d ; eax = num_rows
68 test rax,rax
69 jle near .return ; NOTE(review): rax was zero-extended above, so only num_rows == 0 exits here; a negative count is not caught
70 .rowloop:
71 push rdi ; save the output row-pointer cursor
72 push rsi ; save the input row-pointer cursor
73 push rcx ; col
74
75 mov rsi, JSAMPROW [rsi] ; inptr
76 mov rdi, JSAMPROW [rdi] ; outptr0
77
78 cmp rcx, byte SIZEOF_XMMWORD
79 jae near .columnloop ; at least one full group of columns: skip the partial-load code
80
81 %if RGB_PIXELSIZE == 3 ; ---------------
82
83 .column_ld1: ; partial load: fewer than SIZEOF_XMMWORD columns remain
84 push rax ; rax (row counter) and rdx are borrowed as scratch
85 push rdx
86 lea rcx,[rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE (rcx = remaining byte count)
87 test cl, SIZEOF_BYTE
88 jz short .column_ld2
89 sub rcx, byte SIZEOF_BYTE
90 movzx rax, BYTE [rsi+rcx] ; pick up the trailing byte
91 .column_ld2:
92 test cl, SIZEOF_WORD
93 jz short .column_ld4
94 sub rcx, byte SIZEOF_WORD
95 movzx rdx, WORD [rsi+rcx] ; next trailing word
96 shl rax, WORD_BIT ; the earlier byte moves above the word
97 or rax,rdx
98 .column_ld4:
99 movd xmmA,eax ; start assembling the tail in xmmA
100 pop rdx
101 pop rax ; rax = row counter again
102 test cl, SIZEOF_DWORD
103 jz short .column_ld8
104 sub rcx, byte SIZEOF_DWORD
105 movd xmmF, XMM_DWORD [rsi+rcx]
106 pslldq xmmA, SIZEOF_DWORD ; shift what was gathered so far up, then merge the dword below it
107 por xmmA,xmmF
108 .column_ld8:
109 test cl, SIZEOF_MMWORD
110 jz short .column_ld16
111 sub rcx, byte SIZEOF_MMWORD
112 movq xmmB, XMM_MMWORD [rsi+rcx]
113 pslldq xmmA, SIZEOF_MMWORD ; same shift-and-merge for a qword
114 por xmmA,xmmB
115 .column_ld16:
116 test cl, SIZEOF_XMMWORD
117 jz short .column_ld32
118 movdqa xmmF,xmmA ; gathered tail becomes the second input register
119 movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
120 mov rcx, SIZEOF_XMMWORD ; count as one full group so the loop tail sees zero columns left
121 jmp short .rgb_gray_cnv
122 .column_ld32:
123 test cl, 2*SIZEOF_XMMWORD
124 mov rcx, SIZEOF_XMMWORD ; mov does not alter the flags set by the test above
125 jz short .rgb_gray_cnv
126 movdqa xmmB,xmmA ; gathered tail becomes the third input register
127 movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
128 movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
129 jmp short .rgb_gray_cnv
130
131 .columnloop: ; full group: three unaligned xmmword loads
132 movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
133 movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
134 movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
135
136 .rgb_gray_cnv:
137 ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
138 ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
139 ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
140
141 movdqa xmmG,xmmA
142 pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
143 psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
144
145 punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
146 pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
147
148 punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
149 punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
150
151 movdqa xmmD,xmmA
152 pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
153 psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
154
155 punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
156 pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
157
158 punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
159 punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
160
161 movdqa xmmE,xmmA
162 pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
163 psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
164
165 punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
166 pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
167
168 punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
169 punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
170
171 pxor xmmH,xmmH ; xmmH = 0, used to zero-extend bytes to words below
172
173 movdqa xmmC,xmmA
174 punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
175 punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
176
177 movdqa xmmB,xmmE
178 punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
179 punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
180
181 movdqa xmmF,xmmD
182 punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
183 punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
184
185 %else ; RGB_PIXELSIZE == 4 ; -----------
186
187 .column_ld1: ; partial load: fewer than SIZEOF_XMMWORD columns remain
188 test cl, SIZEOF_XMMWORD/16
189 jz short .column_ld2
190 sub rcx, byte SIZEOF_XMMWORD/16
191 movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE] ; trailing dword (one pixel when RGB_PIXELSIZE is 4)
192 .column_ld2:
193 test cl, SIZEOF_XMMWORD/8
194 jz short .column_ld4
195 sub rcx, byte SIZEOF_XMMWORD/8
196 movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE] ; next trailing qword
197 pslldq xmmA, SIZEOF_MMWORD ; shift what was gathered so far up, then merge the qword below it
198 por xmmA,xmmE
199 .column_ld4:
200 test cl, SIZEOF_XMMWORD/4
201 jz short .column_ld8
202 sub rcx, byte SIZEOF_XMMWORD/4
203 movdqa xmmE,xmmA ; gathered tail becomes the second input register
204 movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
205 .column_ld8:
206 test cl, SIZEOF_XMMWORD/2
207 mov rcx, SIZEOF_XMMWORD ; flags from the test survive the mov; rcx now counts one full group
208 jz short .rgb_gray_cnv
209 movdqa xmmF,xmmA ; registers gathered so far move to the third and fourth input slots
210 movdqa xmmH,xmmE
211 movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
212 movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
213 jmp short .rgb_gray_cnv
214
215 .columnloop: ; full group: four unaligned xmmword loads
216 movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
217 movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
218 movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
219 movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
220
221 .rgb_gray_cnv:
222 ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
223 ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
224 ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
225 ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
226
227 movdqa xmmD,xmmA
228 punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
229 punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
230
231 movdqa xmmC,xmmF
232 punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
233 punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
234
235 movdqa xmmB,xmmA
236 punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
237 punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
238
239 movdqa xmmG,xmmD
240 punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
241 punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
242
243 movdqa xmmE,xmmA
244 punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
245 punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
246
247 movdqa xmmH,xmmB
248 punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
249 punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
250
251 pxor xmmF,xmmF ; xmmF = 0, used to zero-extend bytes to words below
252
253 movdqa xmmC,xmmA
254 punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
255 punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
256
257 movdqa xmmD,xmmB
258 punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
259 punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
260
261 movdqa xmmG,xmmE
262 punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
263 punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
264
265 punpcklbw xmmF,xmmH ; low bytes of xmmH land in the high byte of each word of the zeroed xmmF
266 punpckhbw xmmH,xmmH ; high bytes of xmmH duplicated into both halves of each word
267 psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
268 psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
269
270 %endif ; RGB_PIXELSIZE ; ---------------
271
272 ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
273 ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
274
275 ; (Original)
276 ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
277 ;
278 ; (This implementation)
279 ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
280
281 movdqa xmm6,xmm1
282 punpcklwd xmm1,xmm3
283 punpckhwd xmm6,xmm3
284 pmaddwd xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
285 pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
286
287 movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
288
289 movdqa xmm6,xmm0
290 punpcklwd xmm0,xmm2
291 punpckhwd xmm6,xmm2
292 pmaddwd xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
293 pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
294
295 movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
296 movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
297
298 movdqa xmm0, xmm5 ; xmm0=BO
299 movdqa xmm6, xmm4 ; xmm6=BE
300
301 movdqa xmm4,xmm0
302 punpcklwd xmm0,xmm3
303 punpckhwd xmm4,xmm3
304 pmaddwd xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
305 pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
306
307 movdqa xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF] (added before the right shift below)
308
309 paddd xmm0, xmm1
310 paddd xmm4, xmm7
311 paddd xmm0,xmm3
312 paddd xmm4,xmm3
313 psrld xmm0,SCALEBITS ; xmm0=YOL
314 psrld xmm4,SCALEBITS ; xmm4=YOH
315 packssdw xmm0,xmm4 ; xmm0=YO
316
317 movdqa xmm4,xmm6
318 punpcklwd xmm6,xmm2
319 punpckhwd xmm4,xmm2
320 pmaddwd xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
321 pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
322
323 movdqa xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]
324
325 paddd xmm6, XMMWORD [wk(0)]
326 paddd xmm4, XMMWORD [wk(1)]
327 paddd xmm6,xmm2
328 paddd xmm4,xmm2
329 psrld xmm6,SCALEBITS ; xmm6=YEL
330 psrld xmm4,SCALEBITS ; xmm4=YEH
331 packssdw xmm6,xmm4 ; xmm6=YE
332
333 psllw xmm0,BYTE_BIT ; odd-pixel Y values move to the high byte of each word
334 por xmm6,xmm0 ; xmm6=Y
335 movdqa XMMWORD [rdi], xmm6 ; Save Y (aligned store; NOTE(review): relies on outptr0 being 16-byte aligned - confirm with the row allocator)
336
337 sub rcx, byte SIZEOF_XMMWORD ; one group of columns consumed
338 add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
339 add rdi, byte SIZEOF_XMMWORD ; outptr0
340 cmp rcx, byte SIZEOF_XMMWORD
341 jae near .columnloop ; another full group remains
342 test rcx,rcx
343 jnz near .column_ld1 ; leftover columns: gather them with the partial loads
344
345 pop rcx ; col
346 pop rsi ; input row-pointer cursor saved at the top of .rowloop
347 pop rdi ; output row-pointer cursor saved at the top of .rowloop
348
349 add rsi, byte SIZEOF_JSAMPROW ; input_buf
350 add rdi, byte SIZEOF_JSAMPROW ; next output row pointer
351 dec rax ; num_rows
352 jg near .rowloop
353
354 .return:
355 pop rbx
356 uncollect_args ; macro defined outside this file; presumably undoes collect_args - confirm
357 mov rsp,rbp ; rsp <- aligned rbp
358 pop rsp ; rsp <- pre-alignment rsp stored at [rbp] by the prologue (address of the saved rbp)
359 pop rbp ; restore the caller's rbp
360 ret
361
362 ; For some reason, the OS X linker does not honor the request to align the
363 ; segment unless we do this.
364 align 16
OLDNEW
« simd/jccolext-sse2-64.asm ('K') | « simd/jcgryext-sse2.asm ('k') | simd/jchuff-sse2.asm » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698