Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(809)

Side by Side Diff: simd/jcsamss2.asm

Issue 1953443002: Update to libjpeg_turbo 1.4.90 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libjpeg_turbo.git@master
Patch Set: Created 4 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « simd/jcsample-sse2-64.asm ('k') | simd/jcsamss2-64.asm » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 ;
2 ; jcsamss2.asm - downsampling (SSE2)
3 ;
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5 ;
6 ; Based on
7 ; x86 SIMD extension for IJG JPEG library
8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
10 ;
11 ; This file should be assembled with NASM (Netwide Assembler),
12 ; can *not* be assembled with Microsoft's MASM or any compatible
13 ; assembler (including Borland's Turbo Assembler).
14 ; NASM is available from http://nasm.sourceforge.net/ or
15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
16 ;
17 ; [TAB8]
18
19 %include "jsimdext.inc"
20
21 ; --------------------------------------------------------------------------
22 SECTION SEG_TEXT ; SEG_TEXT is not defined in this file (presumably supplied by jsimdext.inc)
23 BITS 32 ; assemble the routines below as 32-bit code
24 ;
25 ; Downsample pixel values of a single component.
26 ; This version handles the common case of 2:1 horizontal and 1:1 vertical,
27 ; without smoothing.
28 ;
29 ; GLOBAL(void)
30 ; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
31 ; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
32 ; JSAMPARRAY input_data, JSAMPARRAY output_data);
33 ;
34 ; Stack-argument offsets from a frame base b. The routine does "push ebp / mov ebp,esp", so [ebp+0] is the saved ebp, [ebp+4] the return address and [ebp+8] the first argument.
35 %define img_width(b) (b)+8 ; JDIMENSION image_width
36 %define max_v_samp(b) (b)+12 ; int max_v_samp_factor
37 %define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor
38 %define width_blks(b) (b)+20 ; JDIMENSION width_blocks
39 %define input_data(b) (b)+24 ; JSAMPARRAY input_data
40 %define output_data(b) (b)+28 ; JSAMPARRAY output_data
41
42 align 16 ; start the routine on a 16-byte boundary
43 global EXTN(jsimd_h2v1_downsample_sse2) PRIVATE ; export the entry point (EXTN and PRIVATE are not defined in this file)
44
45 EXTN(jsimd_h2v1_downsample_sse2): ; pads each input row on the right, then averages every horizontal pair of samples into one output sample
46 push ebp
47 mov ebp,esp ; ebp = frame base; the stack arguments start at ebp+8
48 ; push ebx ; unused
49 ; push ecx ; need not be preserved
50 ; push edx ; need not be preserved
51 push esi ; esi and edi are saved here and restored at .return
52 push edi
53
54 mov ecx, JDIMENSION [width_blks(ebp)]
55 shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols)
56 jz near .return ; output_cols == 0: nothing to do
57
58 mov edx, JDIMENSION [img_width(ebp)] ; edx = image_width
59
60 ; -- expand_right_edge: replicate the last sample of each row up to column output_cols*2
61
62 push ecx ; keep output_cols for the downsample step
63 shl ecx,1 ; output_cols * 2
64 sub ecx,edx ; ecx = output_cols*2 - image_width = bytes to pad per row
65 jle short .expand_end ; result <= 0 (signed test): no padding needed
66
67 mov eax, INT [max_v_samp(ebp)] ; eax = number of rows to pad
68 test eax,eax
69 jle short .expand_end ; no rows to pad
70
71 cld ; clear the direction flag so rep stosb advances edi upward
72 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
73 alignx 16,7 ; alignx is not defined in this file (presumably a macro from jsimdext.inc)
74 .expandloop:
75 push eax ; al is reused below as the fill byte
76 push ecx ; rep stosb counts ecx down to zero
77
78 mov edi, JSAMPROW [esi] ; edi = start of the current row
79 add edi,edx ; edi = one past the last real sample
80 mov al, JSAMPLE [edi-1] ; al = last real sample of the row
81
82 rep stosb ; write al to the ecx bytes starting at edi
83
84 pop ecx ; restore the per-row pad count
85 pop eax ; restore the row counter
86
87 add esi, byte SIZEOF_JSAMPROW ; advance to the next row pointer
88 dec eax
89 jg short .expandloop ; loop while rows remain
90
91 .expand_end:
92 pop ecx ; output_cols
93
94 ; -- h2v1_downsample
95
96 mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
97 test eax,eax
98 jle near .return ; no rows to process
99
100 mov edx, 0x00010000 ; bias pattern
101 movd xmm7,edx
102 pcmpeqw xmm6,xmm6 ; xmm6 = all ones
103 pshufd xmm7,xmm7,0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
104 psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
105
106 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
107 mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
108 alignx 16,7
109 .rowloop:
110 push ecx ; output_cols is consumed by the column loop
111 push edi ; the row-array cursors are saved because esi/edi are
112 push esi ; reused below as the sample pointers for this row
113
114 mov esi, JSAMPROW [esi] ; inptr
115 mov edi, JSAMPROW [edi] ; outptr
116
117 cmp ecx, byte SIZEOF_XMMWORD
118 jae short .columnloop ; at least one full XMMWORD of output remains
119 alignx 16,7
120
121 .columnloop_r8: ; tail: fewer than SIZEOF_XMMWORD output columns remain
122 movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] ; load only one XMMWORD of input
123 pxor xmm1,xmm1 ; the second input half is treated as zero
124 mov ecx, SIZEOF_XMMWORD ; so the subtraction below leaves 0 and the loop exits
125 jmp short .downsample
126 alignx 16,7
127
128 .columnloop:
129 movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] ; two XMMWORDs of input per XMMWORD of output;
130 movdqa xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD] ; movdqa requires an aligned address
131
132 .downsample:
133 movdqa xmm2,xmm0 ; work on copies so that the two bytes of each
134 movdqa xmm3,xmm1 ; 16-bit word can be separated
135
136 pand xmm0,xmm6 ; xmm0 = low byte of each word (first sample of each pair)
137 psrlw xmm2,BYTE_BIT ; xmm2 = high byte of each word (second sample of each pair)
138 pand xmm1,xmm6
139 psrlw xmm3,BYTE_BIT
140
141 paddw xmm0,xmm2 ; sum of each horizontal pair, as 16-bit words
142 paddw xmm1,xmm3
143 paddw xmm0,xmm7 ; add the alternating 0/1 bias
144 paddw xmm1,xmm7
145 psrlw xmm0,1 ; divide by 2
146 psrlw xmm1,1
147
148 packuswb xmm0,xmm1 ; pack the words back to bytes: xmm0 results low, xmm1 results high
149
150 movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 ; a full XMMWORD is stored even on the tail path - NOTE(review): assumes the output row has room for it; confirm with the caller
151
152 sub ecx, byte SIZEOF_XMMWORD ; outcol
153 add esi, byte 2*SIZEOF_XMMWORD ; inptr
154 add edi, byte 1*SIZEOF_XMMWORD ; outptr
155 cmp ecx, byte SIZEOF_XMMWORD
156 jae short .columnloop ; another full XMMWORD of output is left
157 test ecx,ecx
158 jnz short .columnloop_r8 ; a partial XMMWORD is left: take the tail path
159
160 pop esi ; restore the row-array cursors
161 pop edi
162 pop ecx ; restore output_cols
163
164 add esi, byte SIZEOF_JSAMPROW ; input_data
165 add edi, byte SIZEOF_JSAMPROW ; output_data
166 dec eax ; rowctr
167 jg near .rowloop
168
169 .return: ; every path arrives here with only edi, esi and ebp left on the stack
170 pop edi
171 pop esi
172 ; pop edx ; need not be preserved
173 ; pop ecx ; need not be preserved
174 ; pop ebx ; unused
175 pop ebp
176 ret
177
178 ; --------------------------------------------------------------------------
179 ;
180 ; Downsample pixel values of a single component.
181 ; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
182 ; without smoothing.
183 ;
184 ; GLOBAL(void)
185 ; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
186 ; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
187 ; JSAMPARRAY input_data, JSAMPARRAY output_data);
188 ;
189 ; Stack-argument offsets from a frame base b. The routine does "push ebp / mov ebp,esp", so [ebp+0] is the saved ebp, [ebp+4] the return address and [ebp+8] the first argument.
190 %define img_width(b) (b)+8 ; JDIMENSION image_width
191 %define max_v_samp(b) (b)+12 ; int max_v_samp_factor
192 %define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor
193 %define width_blks(b) (b)+20 ; JDIMENSION width_blocks
194 %define input_data(b) (b)+24 ; JSAMPARRAY input_data
195 %define output_data(b) (b)+28 ; JSAMPARRAY output_data
196
197 align 16 ; start the routine on a 16-byte boundary
198 global EXTN(jsimd_h2v2_downsample_sse2) PRIVATE ; export the entry point (EXTN and PRIVATE are not defined in this file)
199
200 EXTN(jsimd_h2v2_downsample_sse2): ; pads each input row on the right, then averages every 2x2 group of samples (two rows, two columns) into one output sample
201 push ebp
202 mov ebp,esp ; ebp = frame base; the stack arguments start at ebp+8
203 ; push ebx ; unused
204 ; push ecx ; need not be preserved
205 ; push edx ; need not be preserved
206 push esi ; esi and edi are saved here and restored at .return
207 push edi
208
209 mov ecx, JDIMENSION [width_blks(ebp)]
210 shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols)
211 jz near .return ; output_cols == 0: nothing to do
212
213 mov edx, JDIMENSION [img_width(ebp)] ; edx = image_width
214
215 ; -- expand_right_edge: replicate the last sample of each row up to column output_cols*2
216
217 push ecx ; keep output_cols for the downsample step
218 shl ecx,1 ; output_cols * 2
219 sub ecx,edx ; ecx = output_cols*2 - image_width = bytes to pad per row
220 jle short .expand_end ; result <= 0 (signed test): no padding needed
221
222 mov eax, INT [max_v_samp(ebp)] ; eax = number of rows to pad
223 test eax,eax
224 jle short .expand_end ; no rows to pad
225
226 cld ; clear the direction flag so rep stosb advances edi upward
227 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
228 alignx 16,7 ; alignx is not defined in this file (presumably a macro from jsimdext.inc)
229 .expandloop:
230 push eax ; al is reused below as the fill byte
231 push ecx ; rep stosb counts ecx down to zero
232
233 mov edi, JSAMPROW [esi] ; edi = start of the current row
234 add edi,edx ; edi = one past the last real sample
235 mov al, JSAMPLE [edi-1] ; al = last real sample of the row
236
237 rep stosb ; write al to the ecx bytes starting at edi
238
239 pop ecx ; restore the per-row pad count
240 pop eax ; restore the row counter
241
242 add esi, byte SIZEOF_JSAMPROW ; advance to the next row pointer
243 dec eax
244 jg short .expandloop ; loop while rows remain
245
246 .expand_end:
247 pop ecx ; output_cols
248
249 ; -- h2v2_downsample
250
251 mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
252 test eax,eax
253 jle near .return ; no rows to process
254
255 mov edx, 0x00020001 ; bias pattern
256 movd xmm7,edx
257 pcmpeqw xmm6,xmm6 ; xmm6 = all ones
258 pshufd xmm7,xmm7,0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
259 psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
260
261 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
262 mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
263 alignx 16,7
264 .rowloop:
265 push ecx ; output_cols is consumed by the column loop
266 push edi ; the row-array cursors are saved because esi/edi are
267 push esi ; reused below as sample pointers for this row pair
268
269 mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
270 mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1
271 mov edi, JSAMPROW [edi] ; outptr
272
273 cmp ecx, byte SIZEOF_XMMWORD
274 jae short .columnloop ; at least one full XMMWORD of output remains
275 alignx 16,7
276
277 .columnloop_r8: ; tail: fewer than SIZEOF_XMMWORD output columns remain
278 movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD] ; load only one XMMWORD from each input row
279 movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
280 pxor xmm2,xmm2 ; the second input half of both rows is treated as zero
281 pxor xmm3,xmm3
282 mov ecx, SIZEOF_XMMWORD ; so the subtraction below leaves 0 and the loop exits
283 jmp short .downsample
284 alignx 16,7
285
286 .columnloop:
287 movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD] ; xmm0/xmm2 = two XMMWORDs from row 0
288 movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] ; xmm1/xmm3 = two XMMWORDs from row 1
289 movdqa xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD] ; movdqa requires an aligned address
290 movdqa xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]
291
292 .downsample:
293 movdqa xmm4,xmm0 ; first half: split each 16-bit word of both rows
294 movdqa xmm5,xmm1 ; into its low and high byte
295 pand xmm0,xmm6 ; low byte of each word (first sample of each pair)
296 psrlw xmm4,BYTE_BIT ; high byte of each word (second sample of each pair)
297 pand xmm1,xmm6
298 psrlw xmm5,BYTE_BIT
299 paddw xmm0,xmm4 ; row 0 horizontal pair sums
300 paddw xmm1,xmm5 ; row 1 horizontal pair sums
301
302 movdqa xmm4,xmm2 ; second half: same split and pair sums
303 movdqa xmm5,xmm3
304 pand xmm2,xmm6
305 psrlw xmm4,BYTE_BIT
306 pand xmm3,xmm6
307 psrlw xmm5,BYTE_BIT
308 paddw xmm2,xmm4
309 paddw xmm3,xmm5
310
311 paddw xmm0,xmm1 ; add the two rows: sum of each 2x2 group
312 paddw xmm2,xmm3
313 paddw xmm0,xmm7 ; add the alternating 1/2 bias
314 paddw xmm2,xmm7
315 psrlw xmm0,2 ; divide by 4
316 psrlw xmm2,2
317
318 packuswb xmm0,xmm2 ; pack the words back to bytes: xmm0 results low, xmm2 results high
319
320 movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 ; a full XMMWORD is stored even on the tail path - NOTE(review): assumes the output row has room for it; confirm with the caller
321
322 sub ecx, byte SIZEOF_XMMWORD ; outcol
323 add edx, byte 2*SIZEOF_XMMWORD ; inptr0
324 add esi, byte 2*SIZEOF_XMMWORD ; inptr1
325 add edi, byte 1*SIZEOF_XMMWORD ; outptr
326 cmp ecx, byte SIZEOF_XMMWORD
327 jae near .columnloop ; another full XMMWORD of output is left
328 test ecx,ecx
329 jnz near .columnloop_r8 ; a partial XMMWORD is left: take the tail path
330
331 pop esi ; restore the row-array cursors
332 pop edi
333 pop ecx ; restore output_cols
334
335 add esi, byte 2*SIZEOF_JSAMPROW ; input_data
336 add edi, byte 1*SIZEOF_JSAMPROW ; output_data
337 dec eax ; rowctr
338 jg near .rowloop
339
340 .return: ; every path arrives here with only edi, esi and ebp left on the stack
341 pop edi
342 pop esi
343 ; pop edx ; need not be preserved
344 ; pop ecx ; need not be preserved
345 ; pop ebx ; unused
346 pop ebp
347 ret
348
349 ; The OS X linker does not honor the segment alignment request unless the
350 ; section also ends with an explicit alignment directive (reason unknown).
351 align 16
OLDNEW
« no previous file with comments | « simd/jcsample-sse2-64.asm ('k') | simd/jcsamss2-64.asm » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698