OLD | NEW |
| (Empty) |
;
; jcsammmx.asm - downsampling (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]
18 | |
19 %include "jsimdext.inc" | |
20 | |
; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32

;
; jsimd_h2v1_downsample_mmx
;
; Downsample the pixel values of a single component: 2:1 horizontally,
; 1:1 vertically, no smoothing.  Every output sample is the average of
; two horizontally adjacent input samples.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Arguments are read from the stack through ebp (first argument at ebp+8).
;
; Phase 1 (right-edge padding): for each of the first max_v_samp_factor
;   input rows, the last real sample (index image_width-1) is replicated
;   until the row holds output_cols*2 samples, where
;   output_cols = width_blocks*8.  The rows are written in place, so they
;   must have room for output_cols*2 samples.
; Phase 2 (downsample): for each of v_samp_factor rows, 16 input samples
;   are reduced to 8 output samples per iteration.
;
; Registers: esi/edi are saved and restored; eax, ecx, edx and
;   mm0-mm3, mm6, mm7 are overwritten.  The direction flag is cleared
;   when padding is performed.  emms is issued after the MMX loop; the
;   early exits are taken before any MMX instruction has executed.
;

%define img_width(b)    (b)+8           ; JDIMENSION image_width
%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; JSAMPARRAY output_data

        align   16
        global  EXTN(jsimd_h2v1_downsample_mmx) PRIVATE

EXTN(jsimd_h2v1_downsample_mmx):
        push    ebp
        mov     ebp, esp
        ; ebx is never touched; ecx and edx are treated as scratch,
        ; so only esi and edi are saved.
        push    esi
        push    edi

        mov     ecx, JDIMENSION [width_blks(ebp)]
        shl     ecx, 3                  ; ecx = output_cols (width_blocks * 8)
        jz      near .done              ; no blocks -> nothing to do

        mov     edx, JDIMENSION [img_width(ebp)]

        ; ---- Phase 1: replicate the last sample out to output_cols*2 ----

        push    ecx                     ; keep output_cols for phase 2
        shl     ecx, 1                  ; ecx = output_cols * 2
        sub     ecx, edx                ; ecx = number of samples to append
        jle     short .pad_done         ; rows are already wide enough

        mov     eax, INT [max_v_samp(ebp)]      ; eax = rows left to pad
        test    eax, eax
        jle     short .pad_done

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; esi -> row pointers
        cld                             ; stosb must advance forwards
        alignx  16,7
.pad_row:
        push    eax                     ; al is about to hold the fill value
        push    ecx                     ; rep consumes the count

        mov     edi, JSAMPROW [esi]
        add     edi, edx                ; edi -> first sample past the image
        mov     al, JSAMPLE [edi-1]     ; al = last real sample of the row

        rep stosb                       ; append ecx copies of al

        pop     ecx
        pop     eax

        add     esi, byte SIZEOF_JSAMPROW       ; next row pointer
        dec     eax
        jg      short .pad_row

.pad_done:
        pop     ecx                     ; ecx = output_cols

        ; ---- Phase 2: average horizontal pairs ----

        mov     eax, JDIMENSION [v_samp(ebp)]   ; eax = rows left
        test    eax, eax
        jle     near .done

        ; mm6 = 0x00FF in every word: isolates the low byte of each pair.
        pcmpeqw mm6, mm6
        psrlw   mm6, BYTE_BIT
        ; mm7 = words {0, 1, 0, 1}: bias added before the shift, alternating
        ; between neighbouring output samples.
        mov     edx, 0x00010000
        movd    mm7, edx
        punpckldq mm7, mm7

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; esi -> input rows
        mov     edi, JSAMPARRAY [output_data(ebp)]      ; edi -> output rows
        alignx  16,7
.next_row:
        push    ecx                     ; column count is consumed below
        push    edi
        push    esi

        mov     esi, JSAMPROW [esi]     ; esi = input sample pointer
        mov     edi, JSAMPROW [edi]     ; edi = output sample pointer
        alignx  16,7
.next_group:
        ; First 8 input samples -> 4 word sums in mm0.
        movq    mm0, MMWORD [esi+0*SIZEOF_MMWORD]
        movq    mm2, mm0
        pand    mm0, mm6                ; low byte of each pair
        psrlw   mm2, BYTE_BIT           ; high byte of each pair
        paddw   mm0, mm2

        ; Next 8 input samples -> 4 word sums in mm1.
        movq    mm1, MMWORD [esi+1*SIZEOF_MMWORD]
        movq    mm3, mm1
        pand    mm1, mm6
        psrlw   mm3, BYTE_BIT
        paddw   mm1, mm3

        ; (sum + bias) >> 1, then narrow the 8 words to 8 bytes.
        paddw   mm0, mm7
        paddw   mm1, mm7
        psrlw   mm0, 1
        psrlw   mm1, 1
        packuswb mm0, mm1

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0

        add     edi, byte 1*SIZEOF_MMWORD       ; advance output pointer
        add     esi, byte 2*SIZEOF_MMWORD       ; advance input pointer
        sub     ecx, byte SIZEOF_MMWORD         ; output columns remaining
        jnz     short .next_group

        pop     esi
        pop     edi
        pop     ecx

        add     esi, byte SIZEOF_JSAMPROW       ; next input row pointer
        add     edi, byte SIZEOF_JSAMPROW       ; next output row pointer
        dec     eax
        jg      short .next_row

        emms                            ; leave MMX state empty

.done:
        pop     edi
        pop     esi
        pop     ebp
        ret
164 | |
; --------------------------------------------------------------------------
;
; jsimd_h2v2_downsample_mmx
;
; Downsample the pixel values of a single component: 2:1 horizontally and
; 2:1 vertically, no smoothing.  Every output sample is the average of a
; 2x2 square of input samples.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Arguments are read from the stack through ebp (first argument at ebp+8).
;
; Phase 1 (right-edge padding): for each of the first max_v_samp_factor
;   input rows, the last real sample (index image_width-1) is replicated
;   until the row holds output_cols*2 samples, where
;   output_cols = width_blocks*8.  The rows are written in place, so they
;   must have room for output_cols*2 samples.
; Phase 2 (downsample): v_samp_factor output rows are produced; each one
;   consumes two consecutive input rows, and every iteration reduces
;   2 x 16 input samples to 8 output samples.
;
; Registers: esi/edi are saved and restored; eax, ecx, edx and mm0-mm7
;   are overwritten.  The direction flag is cleared when padding is
;   performed.  emms is issued after the MMX loop; the early exits are
;   taken before any MMX instruction has executed.
;

%define img_width(b)    (b)+8           ; JDIMENSION image_width
%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; JSAMPARRAY output_data

        align   16
        global  EXTN(jsimd_h2v2_downsample_mmx) PRIVATE

EXTN(jsimd_h2v2_downsample_mmx):
        push    ebp
        mov     ebp, esp
        ; ebx is never touched; ecx and edx are treated as scratch,
        ; so only esi and edi are saved.
        push    esi
        push    edi

        mov     ecx, JDIMENSION [width_blks(ebp)]
        shl     ecx, 3                  ; ecx = output_cols (width_blocks * 8)
        jz      near .done              ; no blocks -> nothing to do

        mov     edx, JDIMENSION [img_width(ebp)]

        ; ---- Phase 1: replicate the last sample out to output_cols*2 ----

        push    ecx                     ; keep output_cols for phase 2
        shl     ecx, 1                  ; ecx = output_cols * 2
        sub     ecx, edx                ; ecx = number of samples to append
        jle     short .pad_done         ; rows are already wide enough

        mov     eax, INT [max_v_samp(ebp)]      ; eax = rows left to pad
        test    eax, eax
        jle     short .pad_done

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; esi -> row pointers
        cld                             ; stosb must advance forwards
        alignx  16,7
.pad_row:
        push    eax                     ; al is about to hold the fill value
        push    ecx                     ; rep consumes the count

        mov     edi, JSAMPROW [esi]
        add     edi, edx                ; edi -> first sample past the image
        mov     al, JSAMPLE [edi-1]     ; al = last real sample of the row

        rep stosb                       ; append ecx copies of al

        pop     ecx
        pop     eax

        add     esi, byte SIZEOF_JSAMPROW       ; next row pointer
        dec     eax
        jg      short .pad_row

.pad_done:
        pop     ecx                     ; ecx = output_cols

        ; ---- Phase 2: average 2x2 squares ----

        mov     eax, JDIMENSION [v_samp(ebp)]   ; eax = output rows left
        test    eax, eax
        jle     near .done

        ; mm6 = 0x00FF in every word: isolates the low byte of each pair.
        pcmpeqw mm6, mm6
        psrlw   mm6, BYTE_BIT
        ; mm7 = words {1, 2, 1, 2}: bias added before the shift, alternating
        ; between neighbouring output samples.
        mov     edx, 0x00020001
        movd    mm7, edx
        punpckldq mm7, mm7

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; esi -> input rows
        mov     edi, JSAMPARRAY [output_data(ebp)]      ; edi -> output rows
        alignx  16,7
.next_row:
        push    ecx                     ; column count is consumed below
        push    edi
        push    esi

        mov     edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; edx = upper input row
        mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; esi = lower input row
        mov     edi, JSAMPROW [edi]                     ; edi = output row
        alignx  16,7
.next_group:
        ; First 8 columns of both rows -> 4 word sums (4 samples each) in mm0.
        movq    mm0, MMWORD [edx+0*SIZEOF_MMWORD]
        movq    mm1, MMWORD [esi+0*SIZEOF_MMWORD]
        movq    mm4, mm0
        movq    mm5, mm1
        pand    mm0, mm6                ; low byte of each pair, upper row
        psrlw   mm4, BYTE_BIT           ; high byte of each pair, upper row
        pand    mm1, mm6                ; low byte of each pair, lower row
        psrlw   mm5, BYTE_BIT           ; high byte of each pair, lower row
        paddw   mm0, mm4
        paddw   mm1, mm5
        paddw   mm0, mm1                ; upper + lower

        ; Next 8 columns of both rows -> 4 word sums in mm2.
        movq    mm2, MMWORD [edx+1*SIZEOF_MMWORD]
        movq    mm3, MMWORD [esi+1*SIZEOF_MMWORD]
        movq    mm4, mm2
        movq    mm5, mm3
        pand    mm2, mm6
        psrlw   mm4, BYTE_BIT
        pand    mm3, mm6
        psrlw   mm5, BYTE_BIT
        paddw   mm2, mm4
        paddw   mm3, mm5
        paddw   mm2, mm3

        ; (sum + bias) >> 2, then narrow the 8 words to 8 bytes.
        paddw   mm0, mm7
        paddw   mm2, mm7
        psrlw   mm0, 2
        psrlw   mm2, 2
        packuswb mm0, mm2

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0

        add     edi, byte 1*SIZEOF_MMWORD       ; advance output pointer
        add     edx, byte 2*SIZEOF_MMWORD       ; advance upper-row pointer
        add     esi, byte 2*SIZEOF_MMWORD       ; advance lower-row pointer
        sub     ecx, byte SIZEOF_MMWORD         ; output columns remaining
        jnz     near .next_group

        pop     esi
        pop     edi
        pop     ecx

        add     esi, byte 2*SIZEOF_JSAMPROW     ; skip the two rows just used
        add     edi, byte 1*SIZEOF_JSAMPROW     ; next output row pointer
        dec     eax
        jg      near .next_row

        emms                            ; leave MMX state empty

.done:
        pop     edi
        pop     esi
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16
OLD | NEW |