;
; jcsamss2.asm - downsampling (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

19 %include "jsimdext.inc" | |
20 | |
; --------------------------------------------------------------------------
; Everything below is assembled as 32-bit code into the SEG_TEXT section
; (SEG_TEXT is not defined in this file; presumably it comes from
; jsimdext.inc).
        SECTION SEG_TEXT
        BITS    32
;
; Downsample the samples of one component: 2:1 horizontally, 1:1 vertically,
; no smoothing.  Each output sample is the average of two horizontally
; adjacent input samples.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor,
;                             JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Outline:
;   1. output_cols = width_blocks * 8; nothing is done when that is zero.
;   2. If output_cols * 2 exceeds image_width, each of the first
;      max_v_samp_factor input rows is extended on the right by repeating
;      the sample at column image_width-1 until the row is output_cols * 2
;      samples long.
;   3. For each of v_samp_factor rows, sample pairs are summed, a bias that
;      alternates 0,1,0,1,... across the output columns is added, and the
;      sum is shifted right by one.
;
; Registers in the main loop:
;   eax  = rows still to process        ecx  = output columns still to produce
;   esi  = input row list / input ptr   edi  = output row list / output ptr
;   xmm6 = 0x00FF word mask             xmm7 = bias words {0,1,0,1,...}
; ebp, esi and edi are restored before returning; eax, ecx, edx, xmm0-xmm3,
; xmm6 and xmm7 are overwritten.
;
; Row data is accessed with movdqa, so the row pointers must be 16-byte
; aligned.  When only 8 output columns remain, a full XMMWORD is still
; stored (8 results followed by 8 zero bytes).
; NOTE(review): this presumes the output rows have room for that store --
; confirm against the buffer allocation.
;

%define img_width(b)    (b)+8           ; JDIMENSION image_width
%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; JSAMPARRAY output_data

        align   16
        global  EXTN(jsimd_h2v1_downsample_sse2) PRIVATE

EXTN(jsimd_h2v1_downsample_sse2):
        push    ebp
        mov     ebp, esp
        ; ebx is unused; ecx and edx need not be preserved.
        push    esi
        push    edi

        mov     ecx, JDIMENSION [width_blks(ebp)]
        shl     ecx, 3                  ; ecx = output_cols (width_blocks*DCTSIZE)
        jz      near .epilogue          ; no columns -> nothing to do

        mov     edx, JDIMENSION [img_width(ebp)]

        ; ---- right-edge padding of the input rows ----

        push    ecx                     ; keep output_cols for the main loop
        shl     ecx, 1                  ; ecx = output_cols * 2
        sub     ecx, edx                ; ecx = number of samples to append
        jle     short .pad_done         ; rows are already wide enough

        mov     eax, INT [max_v_samp(ebp)]      ; eax = rows to pad
        test    eax, eax
        jle     short .pad_done

        mov     esi, JSAMPARRAY [input_data(ebp)]
        cld                             ; stosb must walk forward
        alignx  16,7
.pad_next_row:
        push    ecx                     ; rep stosb consumes ecx
        push    eax                     ; al is borrowed for the fill value

        mov     edi, JSAMPROW [esi]
        add     edi, edx                ; edi -> first sample past image_width
        mov     al, JSAMPLE [edi-1]     ; al = last real sample of the row

        rep stosb

        add     esi, byte SIZEOF_JSAMPROW       ; next row pointer

        pop     eax
        pop     ecx

        dec     eax
        jg      short .pad_next_row

.pad_done:
        pop     ecx                     ; ecx = output_cols

        ; ---- 2:1 horizontal averaging ----

        mov     eax, JDIMENSION [v_samp(ebp)]   ; eax = row counter
        test    eax, eax
        jle     near .epilogue

        pcmpeqw xmm6, xmm6
        psrlw   xmm6, BYTE_BIT          ; xmm6 = {0xFF 0x00 0xFF 0x00 ..}
        mov     edx, 0x00010000         ; bias pattern: low word 0, high word 1
        movd    xmm7, edx
        pshufd  xmm7, xmm7, 0x00        ; xmm7 = {0, 1, 0, 1, 0, 1, 0, 1}

        mov     edi, JSAMPARRAY [output_data(ebp)]
        mov     esi, JSAMPARRAY [input_data(ebp)]
        alignx  16,7
.next_row:
        push    ecx                     ; output_cols, reloaded for every row
        push    edi                     ; output row list position
        push    esi                     ; input row list position

        mov     edi, JSAMPROW [edi]     ; edi = output pointer
        mov     esi, JSAMPROW [esi]     ; esi = input pointer

        cmp     ecx, byte SIZEOF_XMMWORD
        jae     short .full_block
        alignx  16,7

.tail_block:
        ; Fewer than SIZEOF_XMMWORD output columns remain: read one input
        ; XMMWORD, use zeros for the second, and force the counter so the
        ; loop terminates after this pass.
        movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
        pxor    xmm1, xmm1
        mov     ecx, SIZEOF_XMMWORD
        jmp     short .average
        alignx  16,7

.full_block:
        movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]

.average:
        ; first input XMMWORD -> 8 word results in xmm0
        movdqa  xmm2, xmm0
        psrlw   xmm2, BYTE_BIT          ; odd-numbered samples as words
        pand    xmm0, xmm6              ; even-numbered samples as words
        paddw   xmm0, xmm2              ; pair sums
        paddw   xmm0, xmm7              ; + bias
        psrlw   xmm0, 1                 ; / 2

        ; second input XMMWORD -> 8 word results in xmm1
        movdqa  xmm3, xmm1
        psrlw   xmm3, BYTE_BIT
        pand    xmm1, xmm6
        paddw   xmm1, xmm3
        paddw   xmm1, xmm7
        psrlw   xmm1, 1

        packuswb xmm0, xmm1             ; 16 words -> 16 bytes

        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0

        add     esi, byte 2*SIZEOF_XMMWORD      ; input pointer
        add     edi, byte 1*SIZEOF_XMMWORD      ; output pointer
        sub     ecx, byte SIZEOF_XMMWORD        ; columns remaining
        cmp     ecx, byte SIZEOF_XMMWORD
        jae     short .full_block
        test    ecx, ecx
        jnz     short .tail_block       ; partial block still pending

        pop     esi
        pop     edi
        pop     ecx

        add     esi, byte SIZEOF_JSAMPROW       ; next input row
        add     edi, byte SIZEOF_JSAMPROW       ; next output row
        dec     eax                             ; row counter
        jg      near .next_row

.epilogue:
        pop     edi
        pop     esi
        pop     ebp
        ret
177 | |
178 ; -------------------------------------------------------------------------- | |
;
; Downsample the samples of one component: 2:1 horizontally and 2:1
; vertically, no smoothing.  Each output sample is the average of a 2x2
; group of input samples taken from two consecutive input rows.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor,
;                             JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Outline:
;   1. output_cols = width_blocks * 8; nothing is done when that is zero.
;   2. If output_cols * 2 exceeds image_width, each of the first
;      max_v_samp_factor input rows is extended on the right by repeating
;      the sample at column image_width-1 until the row is output_cols * 2
;      samples long.
;   3. For each of v_samp_factor output rows, two input rows are consumed:
;      the four samples of every 2x2 group are summed, a bias that
;      alternates 1,2,1,2,... across the output columns is added, and the
;      sum is shifted right by two.
;
; Registers in the main loop:
;   eax  = output rows still to process ecx  = output columns still to produce
;   edx  = pointer into input row 0     esi  = input row list / input row 1
;   edi  = output row list / output ptr
;   xmm6 = 0x00FF word mask             xmm7 = bias words {1,2,1,2,...}
; ebp, esi and edi are restored before returning; eax, ecx, edx, xmm0-xmm7
; are overwritten.
;
; Row data is accessed with movdqa, so the row pointers must be 16-byte
; aligned.  When only 8 output columns remain, a full XMMWORD is still
; stored (8 results followed by 8 zero bytes).
; NOTE(review): this presumes the output rows have room for that store --
; confirm against the buffer allocation.
;

%define img_width(b)    (b)+8           ; JDIMENSION image_width
%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; JSAMPARRAY output_data

        align   16
        global  EXTN(jsimd_h2v2_downsample_sse2) PRIVATE

EXTN(jsimd_h2v2_downsample_sse2):
        push    ebp
        mov     ebp, esp
        ; ebx is unused; ecx and edx need not be preserved.
        push    esi
        push    edi

        mov     ecx, JDIMENSION [width_blks(ebp)]
        shl     ecx, 3                  ; ecx = output_cols (width_blocks*DCTSIZE)
        jz      near .epilogue          ; no columns -> nothing to do

        mov     edx, JDIMENSION [img_width(ebp)]

        ; ---- right-edge padding of the input rows ----

        push    ecx                     ; keep output_cols for the main loop
        shl     ecx, 1                  ; ecx = output_cols * 2
        sub     ecx, edx                ; ecx = number of samples to append
        jle     short .pad_done         ; rows are already wide enough

        mov     eax, INT [max_v_samp(ebp)]      ; eax = rows to pad
        test    eax, eax
        jle     short .pad_done

        mov     esi, JSAMPARRAY [input_data(ebp)]
        cld                             ; stosb must walk forward
        alignx  16,7
.pad_next_row:
        push    ecx                     ; rep stosb consumes ecx
        push    eax                     ; al is borrowed for the fill value

        mov     edi, JSAMPROW [esi]
        add     edi, edx                ; edi -> first sample past image_width
        mov     al, JSAMPLE [edi-1]     ; al = last real sample of the row

        rep stosb

        add     esi, byte SIZEOF_JSAMPROW       ; next row pointer

        pop     eax
        pop     ecx

        dec     eax
        jg      short .pad_next_row

.pad_done:
        pop     ecx                     ; ecx = output_cols

        ; ---- 2x2 averaging ----

        mov     eax, JDIMENSION [v_samp(ebp)]   ; eax = row counter
        test    eax, eax
        jle     near .epilogue

        pcmpeqw xmm6, xmm6
        psrlw   xmm6, BYTE_BIT          ; xmm6 = {0xFF 0x00 0xFF 0x00 ..}
        mov     edx, 0x00020001         ; bias pattern: low word 1, high word 2
        movd    xmm7, edx
        pshufd  xmm7, xmm7, 0x00        ; xmm7 = {1, 2, 1, 2, 1, 2, 1, 2}

        mov     edi, JSAMPARRAY [output_data(ebp)]
        mov     esi, JSAMPARRAY [input_data(ebp)]
        alignx  16,7
.next_row:
        push    ecx                     ; output_cols, reloaded for every row
        push    edi                     ; output row list position
        push    esi                     ; input row list position

        mov     edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; edx = input row 0
        mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; esi = input row 1
        mov     edi, JSAMPROW [edi]                     ; edi = output pointer

        cmp     ecx, byte SIZEOF_XMMWORD
        jae     short .full_block
        alignx  16,7

.tail_block:
        ; Fewer than SIZEOF_XMMWORD output columns remain: read one XMMWORD
        ; from each input row, use zeros for the second pair, and force the
        ; counter so the loop terminates after this pass.
        movdqa  xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
        pxor    xmm2, xmm2
        pxor    xmm3, xmm3
        mov     ecx, SIZEOF_XMMWORD
        jmp     short .average
        alignx  16,7

.full_block:
        movdqa  xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
        movdqa  xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
        movdqa  xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]

.average:
        ; first XMMWORD of both rows -> 8 word results in xmm0
        movdqa  xmm4, xmm0
        psrlw   xmm4, BYTE_BIT          ; row 0: odd-numbered samples
        pand    xmm0, xmm6              ; row 0: even-numbered samples
        paddw   xmm0, xmm4              ; row 0 pair sums
        movdqa  xmm5, xmm1
        psrlw   xmm5, BYTE_BIT          ; row 1: odd-numbered samples
        pand    xmm1, xmm6              ; row 1: even-numbered samples
        paddw   xmm1, xmm5              ; row 1 pair sums
        paddw   xmm0, xmm1              ; 2x2 sums
        paddw   xmm0, xmm7              ; + bias
        psrlw   xmm0, 2                 ; / 4

        ; second XMMWORD of both rows -> 8 word results in xmm2
        movdqa  xmm4, xmm2
        psrlw   xmm4, BYTE_BIT
        pand    xmm2, xmm6
        paddw   xmm2, xmm4
        movdqa  xmm5, xmm3
        psrlw   xmm5, BYTE_BIT
        pand    xmm3, xmm6
        paddw   xmm3, xmm5
        paddw   xmm2, xmm3
        paddw   xmm2, xmm7
        psrlw   xmm2, 2

        packuswb xmm0, xmm2             ; 16 words -> 16 bytes

        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0

        add     edx, byte 2*SIZEOF_XMMWORD      ; input row 0 pointer
        add     esi, byte 2*SIZEOF_XMMWORD      ; input row 1 pointer
        add     edi, byte 1*SIZEOF_XMMWORD      ; output pointer
        sub     ecx, byte SIZEOF_XMMWORD        ; columns remaining
        cmp     ecx, byte SIZEOF_XMMWORD
        jae     near .full_block
        test    ecx, ecx
        jnz     near .tail_block        ; partial block still pending

        pop     esi
        pop     edi
        pop     ecx

        add     esi, byte 2*SIZEOF_JSAMPROW     ; two input rows consumed
        add     edi, byte 1*SIZEOF_JSAMPROW     ; one output row produced
        dec     eax                             ; row counter
        jg      near .next_row

.epilogue:
        pop     edi
        pop     esi
        pop     ebp
        ret
348 | |
; Trailing alignment directive: without it, the OS X linker does not honor
; the alignment requested for this segment (reason unknown).
        align   16
OLD | NEW |