Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(130)

Side by Side Diff: simd/jcsample-sse2-64.asm

Issue 1953443002: Update to libjpeg_turbo 1.4.90 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libjpeg_turbo.git@master
Patch Set: Created 4 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « simd/jcsample-sse2.asm ('k') | simd/jcsamss2.asm » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 ;
2 ; jcsample.asm - downsampling (64-bit SSE2)
3 ;
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5 ; Copyright 2009 D. R. Commander
6 ;
7 ; Based on
8 ; x86 SIMD extension for IJG JPEG library
9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
11 ;
12 ; This file should be assembled with NASM (Netwide Assembler),
13 ; can *not* be assembled with Microsoft's MASM or any compatible
14 ; assembler (including Borland's Turbo Assembler).
15 ; NASM is available from http://nasm.sourceforge.net/ or
16 ; http://sourceforge.net/project/showfiles.php?group_id=6208
17 ;
18 ; [TAB8]
19
20 %include "jsimdext.inc"
21
22 ; --------------------------------------------------------------------------
23 SECTION SEG_TEXT
24 BITS 64
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Operation:
;   1. expand_right_edge: when 2 * output_cols > image_width
;      (output_cols = width_blocks * 8), each of the first max_v_samp_factor
;      input rows is padded IN PLACE by replicating its last real sample
;      (row[image_width - 1]) up to column 2 * output_cols.
;   2. h2v1_downsample: for each of v_samp_factor rows,
;        out[i] = (in[2*i] + in[2*i+1] + bias) >> 1
;      where bias alternates 0, 1, 0, 1, ... across output columns.
;
; Memory access notes:
;   * Rows are read and written with movdqa, so every row pointer must be
;     16-byte aligned.
;   * Output is always stored 16 bytes at a time.  When fewer than 16 output
;     columns remain, only 16 input bytes are read and the store writes
;     8 downsampled samples followed by 8 zero bytes.
;
; Registers after collect_args:
;   r10 = JDIMENSION image_width
;   r11 = int max_v_samp_factor
;   r12 = JDIMENSION v_samp_factor
;   r13 = JDIMENSION width_blocks
;   r14 = JSAMPARRAY input_data
;   r15 = JSAMPARRAY output_data
;
; All four integer arguments are 32 bits wide and are therefore read through
; their 32-bit sub-registers; the upper halves of r10-r13 are never relied on.
;
; Scratch: rax, rcx, rdx, rsi, rdi, xmm0-xmm3, xmm6, xmm7, flags.
; NOTE(review): rsi, rdi, xmm6 and xmm7 are callee-saved under the Microsoft
; x64 ABI; presumably collect_args/uncollect_args (jsimdext.inc) preserve
; them there -- confirm.
;

        align   16
        global  EXTN(jsimd_h2v1_downsample_sse2)

EXTN(jsimd_h2v1_downsample_sse2):
        push    rbp
        mov     rax, rsp                ; presumably consumed by collect_args
                                        ;   -- confirm in jsimdext.inc
        mov     rbp, rsp
        collect_args

        mov     ecx, r13d
        shl     rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
        jz      near .return            ; width_blocks == 0: nothing to do

        mov     edx, r10d               ; rdx = image_width (zero-extended)

        ; -- expand_right_edge

        push    rcx                     ; save output_cols
        shl     rcx, 1                  ; output_cols * 2
        sub     rcx, rdx                ; rcx = padding samples per row
        jle     short .expand_end       ; input already wide enough

        mov     eax, r11d               ; rax = max_v_samp_factor (32-bit int)
        test    eax, eax
        jle     short .expand_end       ; no rows to pad

        cld                             ; stosb must store forwards
        mov     rsi, r14                ; input_data
.expandloop:
        push    rax                     ; al and rcx are consumed by rep stosb
        push    rcx

        mov     rdi, JSAMPROW [rsi]
        add     rdi, rdx                ; rdi = &row[image_width]
        mov     al, JSAMPLE [rdi-1]     ; al = last real sample of the row

        rep stosb                       ; replicate it rcx times

        pop     rcx
        pop     rax

        add     rsi, byte SIZEOF_JSAMPROW
        dec     rax
        jg      short .expandloop

.expand_end:
        pop     rcx                     ; output_cols

        ; -- h2v1_downsample

        mov     eax, r12d               ; rowctr
        test    eax, eax
        jle     near .return

        mov     edx, 0x00010000         ; bias pattern
        movd    xmm7, edx
        pcmpeqw xmm6, xmm6
        pshufd  xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
        psrlw   xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

        mov     rsi, r14                ; input_data
        mov     rdi, r15                ; output_data
.rowloop:
        push    rcx                     ; the column loop consumes rcx, rdi
        push    rdi                     ;   and rsi; keep the per-row values
        push    rsi

        mov     rsi, JSAMPROW [rsi]     ; inptr
        mov     rdi, JSAMPROW [rdi]     ; outptr

        cmp     rcx, byte SIZEOF_XMMWORD
        jae     short .columnloop

.columnloop_r8:
        ; Fewer than 16 output columns remain: read a single XMMWORD of
        ; input and substitute zeros for the second one.
        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        pxor    xmm1, xmm1
        mov     rcx, SIZEOF_XMMWORD     ; so the "sub" below leaves rcx == 0
        jmp     short .downsample

.columnloop:
        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
        movdqa  xmm2, xmm0
        movdqa  xmm3, xmm1

        pand    xmm0, xmm6              ; even-indexed samples as words
        psrlw   xmm2, BYTE_BIT          ; odd-indexed samples as words
        pand    xmm1, xmm6
        psrlw   xmm3, BYTE_BIT

        paddw   xmm0, xmm2              ; in[2*i] + in[2*i+1]
        paddw   xmm1, xmm3
        paddw   xmm0, xmm7              ; + alternating bias
        paddw   xmm1, xmm7
        psrlw   xmm0, 1                 ; / 2
        psrlw   xmm1, 1

        packuswb xmm0, xmm1             ; 16 words -> 16 output bytes

        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

        sub     rcx, byte SIZEOF_XMMWORD        ; outcol
        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
        add     rdi, byte 1*SIZEOF_XMMWORD      ; outptr
        cmp     rcx, byte SIZEOF_XMMWORD
        jae     short .columnloop
        test    rcx, rcx
        jnz     short .columnloop_r8

        pop     rsi
        pop     rdi
        pop     rcx

        add     rsi, byte SIZEOF_JSAMPROW       ; input_data
        add     rdi, byte SIZEOF_JSAMPROW       ; output_data
        dec     rax                             ; rowctr
        jg      near .rowloop

.return:
        uncollect_args
        pop     rbp
        ret
167
168 ; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Operation:
;   1. expand_right_edge: when 2 * output_cols > image_width
;      (output_cols = width_blocks * 8), each of the first max_v_samp_factor
;      input rows is padded IN PLACE by replicating its last real sample
;      (row[image_width - 1]) up to column 2 * output_cols.
;   2. h2v2_downsample: each of the v_samp_factor output rows is built from
;      two consecutive input rows r0 and r1:
;        out[i] = (r0[2*i] + r0[2*i+1] + r1[2*i] + r1[2*i+1] + bias) >> 2
;      where bias alternates 1, 2, 1, 2, ... across output columns.
;
; Memory access notes:
;   * Rows are read and written with movdqa, so every row pointer must be
;     16-byte aligned.
;   * Output is always stored 16 bytes at a time.  When fewer than 16 output
;     columns remain, only 16 bytes are read from each input row and the
;     store writes 8 downsampled samples followed by 8 zero bytes.
;
; Registers after collect_args:
;   r10 = JDIMENSION image_width
;   r11 = int max_v_samp_factor
;   r12 = JDIMENSION v_samp_factor
;   r13 = JDIMENSION width_blocks
;   r14 = JSAMPARRAY input_data
;   r15 = JSAMPARRAY output_data
;
; All four integer arguments are 32 bits wide and are therefore read through
; their 32-bit sub-registers; the upper halves of r10-r13 are never relied on.
;
; Scratch: rax, rcx, rdx, rsi, rdi, xmm0-xmm7, flags.
; NOTE(review): rsi, rdi, xmm6 and xmm7 are callee-saved under the Microsoft
; x64 ABI; presumably collect_args/uncollect_args (jsimdext.inc) preserve
; them there -- confirm.
;

        align   16
        global  EXTN(jsimd_h2v2_downsample_sse2)

EXTN(jsimd_h2v2_downsample_sse2):
        push    rbp
        mov     rax, rsp                ; presumably consumed by collect_args
                                        ;   -- confirm in jsimdext.inc
        mov     rbp, rsp
        collect_args

        mov     ecx, r13d
        shl     rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
        jz      near .return            ; width_blocks == 0: nothing to do

        mov     edx, r10d               ; rdx = image_width (zero-extended)

        ; -- expand_right_edge

        push    rcx                     ; save output_cols
        shl     rcx, 1                  ; output_cols * 2
        sub     rcx, rdx                ; rcx = padding samples per row
        jle     short .expand_end       ; input already wide enough

        mov     eax, r11d               ; rax = max_v_samp_factor (32-bit int)
        test    eax, eax
        jle     short .expand_end       ; no rows to pad

        cld                             ; stosb must store forwards
        mov     rsi, r14                ; input_data
.expandloop:
        push    rax                     ; al and rcx are consumed by rep stosb
        push    rcx

        mov     rdi, JSAMPROW [rsi]
        add     rdi, rdx                ; rdi = &row[image_width]
        mov     al, JSAMPLE [rdi-1]     ; al = last real sample of the row

        rep stosb                       ; replicate it rcx times

        pop     rcx
        pop     rax

        add     rsi, byte SIZEOF_JSAMPROW
        dec     rax
        jg      short .expandloop

.expand_end:
        pop     rcx                     ; output_cols

        ; -- h2v2_downsample

        mov     eax, r12d               ; rowctr
        test    eax, eax                ; same 32-bit check as the h2v1 routine
        jle     near .return

        mov     edx, 0x00020001         ; bias pattern
        movd    xmm7, edx
        pcmpeqw xmm6, xmm6
        pshufd  xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
        psrlw   xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

        mov     rsi, r14                ; input_data
        mov     rdi, r15                ; output_data
.rowloop:
        push    rcx                     ; the column loop consumes rcx, rdi
        push    rdi                     ;   and rsi; keep the per-row values
        push    rsi

        mov     rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; inptr0
        mov     rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; inptr1
        mov     rdi, JSAMPROW [rdi]                     ; outptr

        cmp     rcx, byte SIZEOF_XMMWORD
        jae     short .columnloop

.columnloop_r8:
        ; Fewer than 16 output columns remain: read a single XMMWORD from
        ; each input row and substitute zeros for the second pair.
        movdqa  xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        pxor    xmm2, xmm2
        pxor    xmm3, xmm3
        mov     rcx, SIZEOF_XMMWORD     ; so the "sub" below leaves rcx == 0
        jmp     short .downsample

.columnloop:
        movdqa  xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqa  xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
        movdqa  xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
        ; Horizontal pair sums for the first XMMWORD of each row.
        movdqa  xmm4, xmm0
        movdqa  xmm5, xmm1
        pand    xmm0, xmm6              ; even-indexed samples as words
        psrlw   xmm4, BYTE_BIT          ; odd-indexed samples as words
        pand    xmm1, xmm6
        psrlw   xmm5, BYTE_BIT
        paddw   xmm0, xmm4              ; row 0 pair sums
        paddw   xmm1, xmm5              ; row 1 pair sums

        ; Horizontal pair sums for the second XMMWORD of each row.
        movdqa  xmm4, xmm2
        movdqa  xmm5, xmm3
        pand    xmm2, xmm6
        psrlw   xmm4, BYTE_BIT
        pand    xmm3, xmm6
        psrlw   xmm5, BYTE_BIT
        paddw   xmm2, xmm4
        paddw   xmm3, xmm5

        paddw   xmm0, xmm1              ; row 0 + row 1 = 2x2 block sums
        paddw   xmm2, xmm3
        paddw   xmm0, xmm7              ; + alternating bias
        paddw   xmm2, xmm7
        psrlw   xmm0, 2                 ; / 4
        psrlw   xmm2, 2

        packuswb xmm0, xmm2             ; 16 words -> 16 output bytes

        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

        sub     rcx, byte SIZEOF_XMMWORD        ; outcol
        add     rdx, byte 2*SIZEOF_XMMWORD      ; inptr0
        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr1
        add     rdi, byte 1*SIZEOF_XMMWORD      ; outptr
        cmp     rcx, byte SIZEOF_XMMWORD
        jae     near .columnloop
        test    rcx, rcx
        jnz     near .columnloop_r8

        pop     rsi
        pop     rdi
        pop     rcx

        add     rsi, byte 2*SIZEOF_JSAMPROW     ; input_data
        add     rdi, byte 1*SIZEOF_JSAMPROW     ; output_data
        dec     rax                             ; rowctr
        jg      near .rowloop

.return:
        uncollect_args
        pop     rbp
        ret
327
328 ; For some reason, the OS X linker does not honor the request to align the
329 ; segment unless we do this.
330 align 16
OLDNEW
« no previous file with comments | « simd/jcsample-sse2.asm ('k') | simd/jcsamss2.asm » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698