OLD | NEW |
| (Empty) |
1 ; | |
2 ; jdsammmx.asm - upsampling (MMX) | |
3 ; | |
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB | |
5 ; | |
6 ; Based on | |
7 ; x86 SIMD extension for IJG JPEG library | |
8 ; Copyright (C) 1999-2006, MIYASAKA Masaru. | |
9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc | |
10 ; | |
11 ; This file should be assembled with NASM (Netwide Assembler), | |
12 ; can *not* be assembled with Microsoft's MASM or any compatible | |
13 ; assembler (including Borland's Turbo Assembler). | |
14 ; NASM is available from http://nasm.sourceforge.net/ or | |
15 ; http://sourceforge.net/project/showfiles.php?group_id=6208 | |
16 ; | |
17 ; [TAB8] | |
18 | |
19 %include "jsimdext.inc" | |
20 | |
21 ; -------------------------------------------------------------------------- | |
; Read-only constant table used by the two "fancy" (triangle filter)
; upsampling routines in this file. Every entry is four 16-bit words
; (8 bytes), i.e. the same value repeated in each word lane of one MMWORD.
22 SECTION SEG_CONST | |
23 | |
24 alignz 16 | |
25 global EXTN(jconst_fancy_upsample_mmx) PRIVATE | |
26 | |
27 EXTN(jconst_fancy_upsample_mmx): | |
28 | |
; PW_ONE, PW_TWO     : rounding terms added before the 2-bit right shift
;                      in jsimd_h2v1_fancy_upsample_mmx.
; PW_THREE           : weight multiplied into the nearer sample (both routines).
; PW_SEVEN, PW_EIGHT : rounding terms added before the 4-bit right shift
;                      in jsimd_h2v2_fancy_upsample_mmx.
29 PW_ONE times 4 dw 1 | |
30 PW_TWO times 4 dw 2 | |
31 PW_THREE times 4 dw 3 | |
32 PW_SEVEN times 4 dw 7 | |
33 PW_EIGHT times 4 dw 8 | |
34 | |
35 alignz 16 | |
36 | |
37 ; -------------------------------------------------------------------------- | |
38 SECTION SEG_TEXT | |
39 BITS 32 | |
40 ; | |
41 ; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical. | |
42 ; | |
43 ; The upsampling algorithm is linear interpolation between pixel centers, | |
44 ; also known as a "triangle filter". This is a good compromise between | |
45 ; speed and visual quality. The centers of the output pixels are 1/4 and 3/4 | |
46 ; of the way between input pixel centers. | |
47 ; | |
48 ; GLOBAL(void) | |
49 ; jsimd_h2v1_fancy_upsample_mmx (int max_v_samp_factor, | |
50 ; JDIMENSION downsampled_width, | |
51 ; JSAMPARRAY input_data, | |
52 ; JSAMPARRAY * output_data_ptr); | |
53 ; | |
54 | |
; Per input sample s[i] the routine below produces two output bytes:
;   out[2i]   = (3*s[i] + s[i-1] + 1), shifted right by 2 bits
;   out[2i+1] = (3*s[i] + s[i+1] + 2), shifted right by 2 bits
; At the row ends the missing neighbour is replaced by the edge sample itself.
;
; Register roles inside the loops:
;   eax = colctr (samples left, rounded up to a multiple of SIZEOF_MMWORD)
;   ecx = rowctr, esi = inptr, edi = outptr, ebx = GOT base for GOTOFF()
;   mm0 = zero, mm7 = left neighbour of the current block (carried between
;   iterations), mm6 = right neighbour of the current block
;
; Stack-argument offsets relative to ebp after "push ebp / mov ebp,esp".
55 %define max_v_samp(b) (b)+8 ; int max_v_samp_factor | |
56 %define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width | |
57 %define input_data(b) (b)+16 ; JSAMPARRAY input_data | |
58 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr | |
59 | |
60 align 16 | |
61 global EXTN(jsimd_h2v1_fancy_upsample_mmx) PRIVATE | |
62 | |
63 EXTN(jsimd_h2v1_fancy_upsample_mmx): | |
64 push ebp | |
65 mov ebp,esp | |
; NOTE(review): pushpic/poppic/get_GOT/GOTOFF are macros from jsimdext.inc,
; which is not visible here; presumably they only emit code in PIC builds.
66 pushpic ebx | |
67 ; push ecx ; need not be preserved | |
68 ; push edx ; need not be preserved | |
69 push esi | |
70 push edi | |
71 | |
72 get_GOT ebx ; get GOT address | |
73 | |
; Nothing to do for a zero width or a zero row count. Both early exits jump
; past the emms, which is fine because no MMX instruction has run yet.
74 mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr | |
75 test eax,eax | |
76 jz near .return | |
77 | |
78 mov ecx, INT [max_v_samp(ebp)] ; rowctr | |
79 test ecx,ecx | |
80 jz near .return | |
81 | |
82 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data | |
83 mov edi, POINTER [output_data_ptr(ebp)] | |
84 mov edi, JSAMPARRAY [edi] ; output_data | |
85 alignx 16,7 | |
86 .rowloop: | |
; eax, edi and esi are reused as per-row working registers, so the width and
; the two row-array cursors are saved here and restored after the column loop.
87 push eax ; colctr | |
88 push edi | |
89 push esi | |
90 | |
91 mov esi, JSAMPROW [esi] ; inptr | |
92 mov edi, JSAMPROW [edi] ; outptr | |
93 | |
; If the width is not a multiple of SIZEOF_MMWORD, copy the last real sample
; to the position just past the end of the input row, so that the final
; real sample sees itself as its right-hand neighbour.
; NOTE(review): this writes one sample beyond downsampled_width in the INPUT
; row; assumes the row buffer has room for it - confirm against the allocator.
94 test eax, SIZEOF_MMWORD-1 | |
95 jz short .skip | |
96 mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] | |
97 mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample | |
98 .skip: | |
; mm7 = sample 0 isolated in the lowest byte (mask = all ones shifted right so
; only the low byte survives). It stands in for the non-existent sample -1.
99 pxor mm0,mm0 ; mm0=(all 0's) | |
100 pcmpeqb mm7,mm7 | |
101 psrlq mm7,(SIZEOF_MMWORD-1)*BYTE_BIT | |
102 pand mm7, MMWORD [esi+0*SIZEOF_MMWORD] | |
103 | |
; Round colctr up to a whole number of MMWORD blocks. More than one block
; left: take the normal path; exactly one: fall through to the last-block path.
104 add eax, byte SIZEOF_MMWORD-1 | |
105 and eax, byte -SIZEOF_MMWORD | |
106 cmp eax, byte SIZEOF_MMWORD | |
107 ja short .columnloop | |
108 alignx 16,7 | |
109 | |
110 .columnloop_last: | |
; Last block of the row: there is no following block, so mm6 keeps only the
; highest byte of the current block (sample 7) and reuses it as "sample 8".
111 pcmpeqb mm6,mm6 | |
112 psllq mm6,(SIZEOF_MMWORD-1)*BYTE_BIT | |
113 pand mm6, MMWORD [esi+0*SIZEOF_MMWORD] | |
114 jmp short .upsample | |
115 alignx 16,7 | |
116 | |
117 .columnloop: | |
; Normal block: mm6 = first byte of the NEXT block moved into the highest
; byte position, i.e. the right-hand neighbour of sample 7.
118 movq mm6, MMWORD [esi+1*SIZEOF_MMWORD] | |
119 psllq mm6,(SIZEOF_MMWORD-1)*BYTE_BIT | |
120 | |
121 .upsample: | |
; Build the left-neighbour (mm2) and right-neighbour (mm3) byte vectors by
; shifting the current block one byte each way and OR-ing in the carried edges.
122 movq mm1, MMWORD [esi+0*SIZEOF_MMWORD] | |
123 movq mm2,mm1 | |
124 movq mm3,mm1 ; mm1=( 0 1 2 3 4 5 6 7) | |
125 psllq mm2,BYTE_BIT ; mm2=( - 0 1 2 3 4 5 6) | |
126 psrlq mm3,BYTE_BIT ; mm3=( 1 2 3 4 5 6 7 -) | |
127 | |
128 por mm2,mm7 ; mm2=(-1 0 1 2 3 4 5 6) | |
129 por mm3,mm6 ; mm3=( 1 2 3 4 5 6 7 8) | |
130 | |
; Save this block's sample 7 in mm7: it is the left neighbour of the next block.
131 movq mm7,mm1 | |
132 psrlq mm7,(SIZEOF_MMWORD-1)*BYTE_BIT ; mm7=( 7 - - - - - - -) | |
133 | |
; Widen all three byte vectors to 16-bit words (mm0 is zero) so the weighted
; sums below cannot overflow.
134 movq mm4,mm1 | |
135 punpcklbw mm1,mm0 ; mm1=( 0 1 2 3) | |
136 punpckhbw mm4,mm0 ; mm4=( 4 5 6 7) | |
137 movq mm5,mm2 | |
138 punpcklbw mm2,mm0 ; mm2=(-1 0 1 2) | |
139 punpckhbw mm5,mm0 ; mm5=( 3 4 5 6) | |
140 movq mm6,mm3 | |
141 punpcklbw mm3,mm0 ; mm3=( 1 2 3 4) | |
142 punpckhbw mm6,mm0 ; mm6=( 5 6 7 8) | |
143 | |
; centre*3, left neighbour + 1 (even outputs), right neighbour + 2 (odd outputs)
144 pmullw mm1,[GOTOFF(ebx,PW_THREE)] | |
145 pmullw mm4,[GOTOFF(ebx,PW_THREE)] | |
146 paddw mm2,[GOTOFF(ebx,PW_ONE)] | |
147 paddw mm5,[GOTOFF(ebx,PW_ONE)] | |
148 paddw mm3,[GOTOFF(ebx,PW_TWO)] | |
149 paddw mm6,[GOTOFF(ebx,PW_TWO)] | |
150 | |
151 paddw mm2,mm1 | |
152 paddw mm5,mm4 | |
153 psrlw mm2,2 ; mm2=OutLE=( 0 2 4 6) | |
154 psrlw mm5,2 ; mm5=OutHE=( 8 10 12 14) | |
155 paddw mm3,mm1 | |
156 paddw mm6,mm4 | |
157 psrlw mm3,2 ; mm3=OutLO=( 1 3 5 7) | |
158 psrlw mm6,2 ; mm6=OutHO=( 9 11 13 15) | |
159 | |
; Interleave: each word holds an even result in its low byte; moving the odd
; result into the high byte and OR-ing gives the output bytes in order.
160 psllw mm3,BYTE_BIT | |
161 psllw mm6,BYTE_BIT | |
162 por mm2,mm3 ; mm2=OutL=( 0 1 2 3 4 5 6 7) | |
163 por mm5,mm6 ; mm5=OutH=( 8 9 10 11 12 13 14 15) | |
164 | |
165 movq MMWORD [edi+0*SIZEOF_MMWORD], mm2 | |
166 movq MMWORD [edi+1*SIZEOF_MMWORD], mm5 | |
167 | |
; One input MMWORD consumed, two output MMWORDs produced.
; colctr above one block: normal path; exactly one block: last-block path;
; zero: row finished.
168 sub eax, byte SIZEOF_MMWORD | |
169 add esi, byte 1*SIZEOF_MMWORD ; inptr | |
170 add edi, byte 2*SIZEOF_MMWORD ; outptr | |
171 cmp eax, byte SIZEOF_MMWORD | |
172 ja near .columnloop | |
173 test eax,eax | |
174 jnz near .columnloop_last | |
175 | |
176 pop esi | |
177 pop edi | |
178 pop eax | |
179 | |
; One input row yields one output row; continue while rowctr stays positive
; (signed comparison).
180 add esi, byte SIZEOF_JSAMPROW ; input_data | |
181 add edi, byte SIZEOF_JSAMPROW ; output_data | |
182 dec ecx ; rowctr | |
183 jg near .rowloop | |
184 | |
185 emms ; empty MMX state | |
186 | |
187 .return: | |
188 pop edi | |
189 pop esi | |
190 ; pop edx ; need not be preserved | |
191 ; pop ecx ; need not be preserved | |
192 poppic ebx | |
193 pop ebp | |
194 ret | |
195 | |
196 ; -------------------------------------------------------------------------- | |
197 ; | |
198 ; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical. | |
199 ; Again a triangle filter; see comments for h2v1 case, above. | |
200 ; | |
201 ; GLOBAL(void) | |
202 ; jsimd_h2v2_fancy_upsample_mmx (int max_v_samp_factor, | |
203 ; JDIMENSION downsampled_width, | |
204 ; JSAMPARRAY input_data, | |
205 ; JSAMPARRAY * output_data_ptr); | |
206 ; | |
207 | |
; Two-pass scheme used below, per block of SIZEOF_MMWORD input columns:
;   vertical pass  : Int0 = 3*row[0] + row[-1]   (for output row 0)
;                    Int1 = 3*row[0] + row[+1]   (for output row 1)
;                    kept as 16-bit words and parked in the OUTPUT rows, which
;                    have exactly the right size (2 MMWORDs per input MMWORD).
;   horizontal pass: even output = (3*Int[i] + Int[i-1] + 8), shifted right 4
;                    odd output  = (3*Int[i] + Int[i+1] + 7), shifted right 4
;                    written over the parked intermediates.
; At the row ends the missing horizontal neighbour is the edge value itself.
;
; Stack-argument offsets; (b) is the ORIGINAL frame pointer, not the aligned
; ebp that the prologue sets up.
208 %define max_v_samp(b) (b)+8 ; int max_v_samp_factor | |
209 %define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width | |
210 %define input_data(b) (b)+16 ; JSAMPARRAY input_data | |
211 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr | |
212 | |
; Local frame, addressed from the aligned ebp:
;   [original_ebp] : saved pre-alignment frame pointer
;   wk(0)..wk(3)   : four MMWORD scratch slots directly below the aligned ebp
;                    wk(0)/wk(1) = left-neighbour word for upper/lower row
;                    wk(2)/wk(3) = right-neighbour word for upper/lower row
;   gotptr         : pointer-sized slot directly below wk(0)
213 %define original_ebp ebp+0 | |
214 %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] | |
215 %define WK_NUM 4 | |
216 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr | |
217 | |
218 align 16 | |
219 global EXTN(jsimd_h2v2_fancy_upsample_mmx) PRIVATE | |
220 | |
221 EXTN(jsimd_h2v2_fancy_upsample_mmx): | |
; Prologue: remember the unaligned frame pointer in eax, align esp down to an
; MMWORD boundary, store the old frame pointer at the aligned base, then
; drop esp below the wk[] area so later pushes do not overlap the scratch slots.
222 push ebp | |
223 mov eax,esp ; eax = original ebp | |
224 sub esp, byte 4 | |
225 and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits | |
226 mov [esp],eax | |
227 mov ebp,esp ; ebp = aligned ebp | |
228 lea esp, [wk(0)] | |
; NOTE(review): pushpic/movpic/poppic/get_GOT/GOTOFF come from jsimdext.inc
; (not visible here); presumably they expand to nothing in non-PIC builds.
229 pushpic eax ; make a room for GOT address | |
230 push ebx | |
231 ; push ecx ; need not be preserved | |
232 ; push edx ; need not be preserved | |
233 push esi | |
234 push edi | |
235 | |
236 get_GOT ebx ; get GOT address | |
237 movpic POINTER [gotptr], ebx ; save GOT address | |
238 | |
; Arguments are fetched through edx (original frame pointer) because ebp now
; addresses the aligned local frame.
239 mov edx,eax ; edx = original ebp | |
240 mov eax, JDIMENSION [downsamp_width(edx)] ; colctr | |
241 test eax,eax | |
242 jz near .return | |
243 | |
244 mov ecx, INT [max_v_samp(edx)] ; rowctr | |
245 test ecx,ecx | |
246 jz near .return | |
247 | |
248 mov esi, JSAMPARRAY [input_data(edx)] ; input_data | |
249 mov edi, POINTER [output_data_ptr(edx)] | |
250 mov edi, JSAMPARRAY [edi] ; output_data | |
251 alignx 16,7 | |
252 .rowloop: | |
; All five general registers are needed as row pointers inside the column
; loop, so the loop state is parked on the stack for the duration of the row.
253 push eax ; colctr | |
254 push ecx | |
255 push edi | |
256 push esi | |
257 | |
; Row pointers are read at input_data[-1], [0] and [+1]: the caller must supply
; valid rows above and below the current one.
258 mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above) | |
259 mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 | |
260 mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below) | |
261 mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 | |
262 mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 | |
263 | |
; Width not a multiple of SIZEOF_MMWORD: duplicate the last real sample one
; position past the end of each of the three input rows. edx (outptr0) is
; saved around this because dl is used as the byte scratch register.
; NOTE(review): writes one sample beyond downsampled_width in the INPUT rows;
; assumes the row buffers have room for it - confirm against the allocator.
264 test eax, SIZEOF_MMWORD-1 | |
265 jz short .skip | |
266 push edx | |
267 mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE] | |
268 mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl | |
269 mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE] | |
270 mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl | |
271 mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] | |
272 mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample | |
273 pop edx | |
274 .skip: | |
275 ; -- process the first column block | |
276 | |
277 movq mm0, MMWORD [ebx+0*SIZEOF_MMWORD] ; mm0=row[ 0][0] | |
278 movq mm1, MMWORD [ecx+0*SIZEOF_MMWORD] ; mm1=row[-1][0] | |
279 movq mm2, MMWORD [esi+0*SIZEOF_MMWORD] ; mm2=row[+1][0] | |
280 | |
; ebx holds inptr0 in this loop; it is swapped for the saved GOT address only
; while GOTOFF() operands are needed, then restored by the matching poppic.
281 pushpic ebx | |
282 movpic ebx, POINTER [gotptr] ; load GOT address | |
283 | |
; Vertical pass for block 0: widen bytes to words, then 3*centre + neighbour.
284 pxor mm3,mm3 ; mm3=(all 0's) | |
285 movq mm4,mm0 | |
286 punpcklbw mm0,mm3 ; mm0=row[ 0][0]( 0 1 2 3) | |
287 punpckhbw mm4,mm3 ; mm4=row[ 0][0]( 4 5 6 7) | |
288 movq mm5,mm1 | |
289 punpcklbw mm1,mm3 ; mm1=row[-1][0]( 0 1 2 3) | |
290 punpckhbw mm5,mm3 ; mm5=row[-1][0]( 4 5 6 7) | |
291 movq mm6,mm2 | |
292 punpcklbw mm2,mm3 ; mm2=row[+1][0]( 0 1 2 3) | |
293 punpckhbw mm6,mm3 ; mm6=row[+1][0]( 4 5 6 7) | |
294 | |
295 pmullw mm0,[GOTOFF(ebx,PW_THREE)] | |
296 pmullw mm4,[GOTOFF(ebx,PW_THREE)] | |
297 | |
; mm7 = mask that keeps only the lowest 16-bit word.
298 pcmpeqb mm7,mm7 | |
299 psrlq mm7,(SIZEOF_MMWORD-2)*BYTE_BIT | |
300 | |
301 paddw mm1,mm0 ; mm1=Int0L=( 0 1 2 3) | |
302 paddw mm5,mm4 ; mm5=Int0H=( 4 5 6 7) | |
303 paddw mm2,mm0 ; mm2=Int1L=( 0 1 2 3) | |
304 paddw mm6,mm4 ; mm6=Int1H=( 4 5 6 7) | |
305 | |
306 movq MMWORD [edx+0*SIZEOF_MMWORD], mm1 ; temporarily save | |
307 movq MMWORD [edx+1*SIZEOF_MMWORD], mm5 ; the intermediate data | |
308 movq MMWORD [edi+0*SIZEOF_MMWORD], mm2 | |
309 movq MMWORD [edi+1*SIZEOF_MMWORD], mm6 | |
310 | |
; Left edge: intermediate word 0 doubles as its own left neighbour ("-1").
311 pand mm1,mm7 ; mm1=( 0 - - -) | |
312 pand mm2,mm7 ; mm2=( 0 - - -) | |
313 | |
314 movq MMWORD [wk(0)], mm1 | |
315 movq MMWORD [wk(1)], mm2 | |
316 | |
317 poppic ebx | |
318 | |
; Round colctr up to whole blocks; a single-block row goes straight to the
; last-block path.
319 add eax, byte SIZEOF_MMWORD-1 | |
320 and eax, byte -SIZEOF_MMWORD | |
321 cmp eax, byte SIZEOF_MMWORD | |
322 ja short .columnloop | |
323 alignx 16,7 | |
324 | |
325 .columnloop_last: | |
326 ; -- process the last column block | |
327 | |
; This pushpic is balanced by the poppic at the end of .upsample.
328 pushpic ebx | |
329 movpic ebx, POINTER [gotptr] ; load GOT address | |
330 | |
; Right edge: no next block exists, so the highest word of the current
; block's intermediates (word 7) is reused as its own right neighbour ("8").
331 pcmpeqb mm1,mm1 | |
332 psllq mm1,(SIZEOF_MMWORD-2)*BYTE_BIT | |
333 movq mm2,mm1 | |
334 | |
335 pand mm1, MMWORD [edx+1*SIZEOF_MMWORD] ; mm1=( - - - 7) | |
336 pand mm2, MMWORD [edi+1*SIZEOF_MMWORD] ; mm2=( - - - 7) | |
337 | |
338 movq MMWORD [wk(2)], mm1 | |
339 movq MMWORD [wk(3)], mm2 | |
340 | |
341 jmp short .upsample | |
342 alignx 16,7 | |
343 | |
344 .columnloop: | |
345 ; -- process the next column block | |
346 | |
; Vertical pass for the block AFTER the current one; its intermediates are
; parked two MMWORDs further along in the output rows, and its first word is
; needed now as the right neighbour of the current block.
347 movq mm0, MMWORD [ebx+1*SIZEOF_MMWORD] ; mm0=row[ 0][1] | |
348 movq mm1, MMWORD [ecx+1*SIZEOF_MMWORD] ; mm1=row[-1][1] | |
349 movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] ; mm2=row[+1][1] | |
350 | |
; This pushpic is balanced by the poppic at the end of .upsample.
351 pushpic ebx | |
352 movpic ebx, POINTER [gotptr] ; load GOT address | |
353 | |
354 pxor mm3,mm3 ; mm3=(all 0's) | |
355 movq mm4,mm0 | |
356 punpcklbw mm0,mm3 ; mm0=row[ 0][1]( 0 1 2 3) | |
357 punpckhbw mm4,mm3 ; mm4=row[ 0][1]( 4 5 6 7) | |
358 movq mm5,mm1 | |
359 punpcklbw mm1,mm3 ; mm1=row[-1][1]( 0 1 2 3) | |
360 punpckhbw mm5,mm3 ; mm5=row[-1][1]( 4 5 6 7) | |
361 movq mm6,mm2 | |
362 punpcklbw mm2,mm3 ; mm2=row[+1][1]( 0 1 2 3) | |
363 punpckhbw mm6,mm3 ; mm6=row[+1][1]( 4 5 6 7) | |
364 | |
365 pmullw mm0,[GOTOFF(ebx,PW_THREE)] | |
366 pmullw mm4,[GOTOFF(ebx,PW_THREE)] | |
367 | |
368 paddw mm1,mm0 ; mm1=Int0L=( 0 1 2 3) | |
369 paddw mm5,mm4 ; mm5=Int0H=( 4 5 6 7) | |
370 paddw mm2,mm0 ; mm2=Int1L=( 0 1 2 3) | |
371 paddw mm6,mm4 ; mm6=Int1H=( 4 5 6 7) | |
372 | |
373 movq MMWORD [edx+2*SIZEOF_MMWORD], mm1 ; temporarily save | |
374 movq MMWORD [edx+3*SIZEOF_MMWORD], mm5 ; the intermediate data | |
375 movq MMWORD [edi+2*SIZEOF_MMWORD], mm2 | |
376 movq MMWORD [edi+3*SIZEOF_MMWORD], mm6 | |
377 | |
; Move the next block's word 0 into the highest word position, where the
; current block's right-neighbour vector has its empty lane.
378 psllq mm1,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm1=( - - - 0) | |
379 psllq mm2,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm2=( - - - 0) | |
380 | |
381 movq MMWORD [wk(2)], mm1 | |
382 movq MMWORD [wk(3)], mm2 | |
383 | |
384 .upsample: | |
385 ; -- process the upper row | |
386 | |
; Horizontal pass on the parked Int0 words. The neighbour vectors are built
; by shifting one word (2 bytes) each way across the L/H halves.
387 movq mm7, MMWORD [edx+0*SIZEOF_MMWORD] ; mm7=Int0L=( 0 1 2 3) | |
388 movq mm3, MMWORD [edx+1*SIZEOF_MMWORD] ; mm3=Int0H=( 4 5 6 7) | |
389 | |
390 movq mm0,mm7 | |
391 movq mm4,mm3 | |
392 psrlq mm0,2*BYTE_BIT ; mm0=( 1 2 3 -) | |
393 psllq mm4,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( - - - 4) | |
394 movq mm5,mm7 | |
395 movq mm6,mm3 | |
396 psrlq mm5,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm5=( 3 - - -) | |
397 psllq mm6,2*BYTE_BIT ; mm6=( - 4 5 6) | |
398 | |
399 por mm0,mm4 ; mm0=( 1 2 3 4) | |
400 por mm5,mm6 ; mm5=( 3 4 5 6) | |
401 | |
402 movq mm1,mm7 | |
403 movq mm2,mm3 | |
404 psllq mm1,2*BYTE_BIT ; mm1=( - 0 1 2) | |
405 psrlq mm2,2*BYTE_BIT ; mm2=( 5 6 7 -) | |
406 movq mm4,mm3 | |
407 psrlq mm4,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( 7 - - -) | |
408 | |
409 por mm1, MMWORD [wk(0)] ; mm1=(-1 0 1 2) | |
410 por mm2, MMWORD [wk(2)] ; mm2=( 5 6 7 8) | |
411 | |
; Word 7 of this block becomes the left neighbour of the next block.
412 movq MMWORD [wk(0)], mm4 | |
413 | |
; centre*3, left neighbour + 8 (even outputs), right neighbour + 7 (odd outputs)
414 pmullw mm7,[GOTOFF(ebx,PW_THREE)] | |
415 pmullw mm3,[GOTOFF(ebx,PW_THREE)] | |
416 paddw mm1,[GOTOFF(ebx,PW_EIGHT)] | |
417 paddw mm5,[GOTOFF(ebx,PW_EIGHT)] | |
418 paddw mm0,[GOTOFF(ebx,PW_SEVEN)] | |
419 paddw mm2,[GOTOFF(ebx,PW_SEVEN)] | |
420 | |
421 paddw mm1,mm7 | |
422 paddw mm5,mm3 | |
423 psrlw mm1,4 ; mm1=Out0LE=( 0 2 4 6) | |
424 psrlw mm5,4 ; mm5=Out0HE=( 8 10 12 14) | |
425 paddw mm0,mm7 | |
426 paddw mm2,mm3 | |
427 psrlw mm0,4 ; mm0=Out0LO=( 1 3 5 7) | |
428 psrlw mm2,4 ; mm2=Out0HO=( 9 11 13 15) | |
429 | |
; Interleave even (low byte) and odd (high byte) results of each word.
430 psllw mm0,BYTE_BIT | |
431 psllw mm2,BYTE_BIT | |
432 por mm1,mm0 ; mm1=Out0L=( 0 1 2 3 4 5 6 7) | |
433 por mm5,mm2 ; mm5=Out0H=( 8 9 10 11 12 13 14 15) | |
434 | |
; Final bytes overwrite the intermediates that were parked at the same address.
435 movq MMWORD [edx+0*SIZEOF_MMWORD], mm1 | |
436 movq MMWORD [edx+1*SIZEOF_MMWORD], mm5 | |
437 | |
438 ; -- process the lower row | |
439 | |
; Same horizontal pass on Int1 (output row 1), using wk(1)/wk(3) as the
; carried left/right neighbours.
440 movq mm6, MMWORD [edi+0*SIZEOF_MMWORD] ; mm6=Int1L=( 0 1 2 3) | |
441 movq mm4, MMWORD [edi+1*SIZEOF_MMWORD] ; mm4=Int1H=( 4 5 6 7) | |
442 | |
443 movq mm7,mm6 | |
444 movq mm3,mm4 | |
445 psrlq mm7,2*BYTE_BIT ; mm7=( 1 2 3 -) | |
446 psllq mm3,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( - - - 4) | |
447 movq mm0,mm6 | |
448 movq mm2,mm4 | |
449 psrlq mm0,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm0=( 3 - - -) | |
450 psllq mm2,2*BYTE_BIT ; mm2=( - 4 5 6) | |
451 | |
452 por mm7,mm3 ; mm7=( 1 2 3 4) | |
453 por mm0,mm2 ; mm0=( 3 4 5 6) | |
454 | |
455 movq mm1,mm6 | |
456 movq mm5,mm4 | |
457 psllq mm1,2*BYTE_BIT ; mm1=( - 0 1 2) | |
458 psrlq mm5,2*BYTE_BIT ; mm5=( 5 6 7 -) | |
459 movq mm3,mm4 | |
460 psrlq mm3,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( 7 - - -) | |
461 | |
462 por mm1, MMWORD [wk(1)] ; mm1=(-1 0 1 2) | |
463 por mm5, MMWORD [wk(3)] ; mm5=( 5 6 7 8) | |
464 | |
465 movq MMWORD [wk(1)], mm3 | |
466 | |
467 pmullw mm6,[GOTOFF(ebx,PW_THREE)] | |
468 pmullw mm4,[GOTOFF(ebx,PW_THREE)] | |
469 paddw mm1,[GOTOFF(ebx,PW_EIGHT)] | |
470 paddw mm0,[GOTOFF(ebx,PW_EIGHT)] | |
471 paddw mm7,[GOTOFF(ebx,PW_SEVEN)] | |
472 paddw mm5,[GOTOFF(ebx,PW_SEVEN)] | |
473 | |
474 paddw mm1,mm6 | |
475 paddw mm0,mm4 | |
476 psrlw mm1,4 ; mm1=Out1LE=( 0 2 4 6) | |
477 psrlw mm0,4 ; mm0=Out1HE=( 8 10 12 14) | |
478 paddw mm7,mm6 | |
479 paddw mm5,mm4 | |
480 psrlw mm7,4 ; mm7=Out1LO=( 1 3 5 7) | |
481 psrlw mm5,4 ; mm5=Out1HO=( 9 11 13 15) | |
482 | |
483 psllw mm7,BYTE_BIT | |
484 psllw mm5,BYTE_BIT | |
485 por mm1,mm7 ; mm1=Out1L=( 0 1 2 3 4 5 6 7) | |
486 por mm0,mm5 ; mm0=Out1H=( 8 9 10 11 12 13 14 15) | |
487 | |
488 movq MMWORD [edi+0*SIZEOF_MMWORD], mm1 | |
489 movq MMWORD [edi+1*SIZEOF_MMWORD], mm0 | |
490 | |
; ebx goes back to being inptr0 before the pointers are advanced.
491 poppic ebx | |
492 | |
; One input MMWORD consumed per input row, two output MMWORDs produced per
; output row. colctr above one block: normal path; exactly one: last-block
; path; zero: row pair finished.
493 sub eax, byte SIZEOF_MMWORD | |
494 add ecx, byte 1*SIZEOF_MMWORD ; inptr1(above) | |
495 add ebx, byte 1*SIZEOF_MMWORD ; inptr0 | |
496 add esi, byte 1*SIZEOF_MMWORD ; inptr1(below) | |
497 add edx, byte 2*SIZEOF_MMWORD ; outptr0 | |
498 add edi, byte 2*SIZEOF_MMWORD ; outptr1 | |
499 cmp eax, byte SIZEOF_MMWORD | |
500 ja near .columnloop | |
501 test eax,eax | |
502 jnz near .columnloop_last | |
503 | |
504 pop esi | |
505 pop edi | |
506 pop ecx | |
507 pop eax | |
508 | |
; Each pass turns one input row into two output rows, so rowctr drops by 2;
; continue while it stays positive (signed comparison).
509 add esi, byte 1*SIZEOF_JSAMPROW ; input_data | |
510 add edi, byte 2*SIZEOF_JSAMPROW ; output_data | |
511 sub ecx, byte 2 ; rowctr | |
512 jg near .rowloop | |
513 | |
514 emms ; empty MMX state | |
515 | |
516 .return: | |
; Epilogue: restore callee-saved registers, discard wk[]/gotptr by resetting
; esp to the aligned base, then reload the pre-alignment stack pointer that
; the prologue stored there before popping the caller's ebp.
517 pop edi | |
518 pop esi | |
519 ; pop edx ; need not be preserved | |
520 ; pop ecx ; need not be preserved | |
521 pop ebx | |
522 mov esp,ebp ; esp <- aligned ebp | |
523 pop esp ; esp <- original ebp | |
524 pop ebp | |
525 ret | |
526 | |
527 ; -------------------------------------------------------------------------- | |
528 ; | |
529 ; Fast processing for the common case of 2:1 horizontal and 1:1 vertical. | |
530 ; It's still a box filter. | |
531 ; | |
532 ; GLOBAL(void) | |
533 ; jsimd_h2v1_upsample_mmx (int max_v_samp_factor, | |
534 ; JDIMENSION output_width, | |
535 ; JSAMPARRAY input_data, | |
536 ; JSAMPARRAY * output_data_ptr); | |
537 ; | |
538 | |
; Box-filter 2:1 horizontal upsampling: every input byte is written twice.
; Register roles: edx = output width rounded up to a multiple of
; 2*SIZEOF_MMWORD, eax = colctr (output bytes left in the row), ecx = rowctr,
; esi = inptr, edi = outptr.
;
; Stack-argument offsets relative to ebp after "push ebp / mov ebp,esp".
539 %define max_v_samp(b) (b)+8 ; int max_v_samp_factor | |
540 %define output_width(b) (b)+12 ; JDIMENSION output_width | |
541 %define input_data(b) (b)+16 ; JSAMPARRAY input_data | |
542 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr | |
543 | |
544 align 16 | |
545 global EXTN(jsimd_h2v1_upsample_mmx) PRIVATE | |
546 | |
547 EXTN(jsimd_h2v1_upsample_mmx): | |
548 push ebp | |
549 mov ebp,esp | |
550 ; push ebx ; unused | |
551 ; push ecx ; need not be preserved | |
552 ; push edx ; need not be preserved | |
553 push esi | |
554 push edi | |
555 | |
; Round the output width up to a whole number of 2-MMWORD groups; the "and"
; sets ZF, so a zero width returns immediately.
; NOTE(review): because of the rounding, stores may run past output_width up
; to the next multiple of 2*SIZEOF_MMWORD (and loads correspondingly past the
; input); assumes the row buffers are padded accordingly - confirm with caller.
556 mov edx, JDIMENSION [output_width(ebp)] | |
557 add edx, byte (2*SIZEOF_MMWORD)-1 | |
558 and edx, byte -(2*SIZEOF_MMWORD) | |
559 jz short .return | |
560 | |
561 mov ecx, INT [max_v_samp(ebp)] ; rowctr | |
562 test ecx,ecx | |
563 jz short .return | |
564 | |
565 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data | |
566 mov edi, POINTER [output_data_ptr(ebp)] | |
567 mov edi, JSAMPARRAY [edi] ; output_data | |
568 alignx 16,7 | |
569 .rowloop: | |
; Save the row-array cursors; esi/edi become the sample pointers for this row.
570 push edi | |
571 push esi | |
572 | |
573 mov esi, JSAMPROW [esi] ; inptr | |
574 mov edi, JSAMPROW [edi] ; outptr | |
575 mov eax,edx ; colctr | |
576 alignx 16,7 | |
577 .columnloop: | |
; The loop body is unrolled twice. Each half loads one MMWORD of input and
; unpacks it with itself, which duplicates every byte, giving two MMWORDs of
; output.
578 | |
579 movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] | |
580 | |
581 movq mm1,mm0 | |
582 punpcklbw mm0,mm0 | |
583 punpckhbw mm1,mm1 | |
584 | |
585 movq MMWORD [edi+0*SIZEOF_MMWORD], mm0 | |
586 movq MMWORD [edi+1*SIZEOF_MMWORD], mm1 | |
587 | |
; colctr is a multiple of 2*SIZEOF_MMWORD, so it reaches exactly zero and the
; jz exit test is sufficient.
588 sub eax, byte 2*SIZEOF_MMWORD | |
589 jz short .nextrow | |
590 | |
591 movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] | |
592 | |
593 movq mm3,mm2 | |
594 punpcklbw mm2,mm2 | |
595 punpckhbw mm3,mm3 | |
596 | |
597 movq MMWORD [edi+2*SIZEOF_MMWORD], mm2 | |
598 movq MMWORD [edi+3*SIZEOF_MMWORD], mm3 | |
599 | |
600 sub eax, byte 2*SIZEOF_MMWORD | |
601 jz short .nextrow | |
602 | |
; Both halves done: advance past two input MMWORDs and four output MMWORDs.
603 add esi, byte 2*SIZEOF_MMWORD ; inptr | |
604 add edi, byte 4*SIZEOF_MMWORD ; outptr | |
605 jmp short .columnloop | |
606 alignx 16,7 | |
607 | |
608 .nextrow: | |
609 pop esi | |
610 pop edi | |
611 | |
; One input row yields one output row; continue while rowctr stays positive.
612 add esi, byte SIZEOF_JSAMPROW ; input_data | |
613 add edi, byte SIZEOF_JSAMPROW ; output_data | |
614 dec ecx ; rowctr | |
615 jg short .rowloop | |
616 | |
617 emms ; empty MMX state | |
618 | |
619 .return: | |
620 pop edi | |
621 pop esi | |
622 ; pop edx ; need not be preserved | |
623 ; pop ecx ; need not be preserved | |
624 ; pop ebx ; unused | |
625 pop ebp | |
626 ret | |
627 | |
628 ; -------------------------------------------------------------------------- | |
629 ; | |
630 ; Fast processing for the common case of 2:1 horizontal and 2:1 vertical. | |
631 ; It's still a box filter. | |
632 ; | |
633 ; GLOBAL(void) | |
634 ; jsimd_h2v2_upsample_mmx (int max_v_samp_factor, | |
635 ; JDIMENSION output_width, | |
636 ; JSAMPARRAY input_data, | |
637 ; JSAMPARRAY * output_data_ptr); | |
638 ; | |
639 | |
; Box-filter 2:1 horizontal and 2:1 vertical upsampling: every input byte is
; written twice horizontally, and the resulting row is stored into two
; consecutive output rows.
; Register roles: edx = output width rounded up to a multiple of
; 2*SIZEOF_MMWORD, eax = colctr (output bytes left in the row), ecx = rowctr,
; esi = inptr, ebx = outptr0, edi = outptr1.
;
; Stack-argument offsets relative to ebp after "push ebp / mov ebp,esp".
640 %define max_v_samp(b) (b)+8 ; int max_v_samp_factor | |
641 %define output_width(b) (b)+12 ; JDIMENSION output_width | |
642 %define input_data(b) (b)+16 ; JSAMPARRAY input_data | |
643 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr | |
644 | |
645 align 16 | |
646 global EXTN(jsimd_h2v2_upsample_mmx) PRIVATE | |
647 | |
648 EXTN(jsimd_h2v2_upsample_mmx): | |
649 push ebp | |
650 mov ebp,esp | |
; ebx is saved because it is used below as the second output pointer.
651 push ebx | |
652 ; push ecx ; need not be preserved | |
653 ; push edx ; need not be preserved | |
654 push esi | |
655 push edi | |
656 | |
; Round the output width up to a whole number of 2-MMWORD groups; the "and"
; sets ZF, so a zero width returns immediately.
; NOTE(review): because of the rounding, stores may run past output_width up
; to the next multiple of 2*SIZEOF_MMWORD (and loads correspondingly past the
; input); assumes the row buffers are padded accordingly - confirm with caller.
657 mov edx, JDIMENSION [output_width(ebp)] | |
658 add edx, byte (2*SIZEOF_MMWORD)-1 | |
659 and edx, byte -(2*SIZEOF_MMWORD) | |
660 jz near .return | |
661 | |
662 mov ecx, INT [max_v_samp(ebp)] ; rowctr | |
663 test ecx,ecx | |
664 jz short .return | |
665 | |
666 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data | |
667 mov edi, POINTER [output_data_ptr(ebp)] | |
668 mov edi, JSAMPARRAY [edi] ; output_data | |
669 alignx 16,7 | |
670 .rowloop: | |
; Save the row-array cursors; esi/ebx/edi become the sample pointers for
; this input row and its two output rows.
671 push edi | |
672 push esi | |
673 | |
674 mov esi, JSAMPROW [esi] ; inptr | |
675 mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 | |
676 mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 | |
677 mov eax,edx ; colctr | |
678 alignx 16,7 | |
679 .columnloop: | |
; The loop body is unrolled twice. Each half loads one MMWORD of input,
; unpacks it with itself to duplicate every byte, and stores the same two
; MMWORDs to both output rows.
680 | |
681 movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] | |
682 | |
683 movq mm1,mm0 | |
684 punpcklbw mm0,mm0 | |
685 punpckhbw mm1,mm1 | |
686 | |
687 movq MMWORD [ebx+0*SIZEOF_MMWORD], mm0 | |
688 movq MMWORD [ebx+1*SIZEOF_MMWORD], mm1 | |
689 movq MMWORD [edi+0*SIZEOF_MMWORD], mm0 | |
690 movq MMWORD [edi+1*SIZEOF_MMWORD], mm1 | |
691 | |
; colctr is a multiple of 2*SIZEOF_MMWORD, so it reaches exactly zero and the
; jz exit test is sufficient.
692 sub eax, byte 2*SIZEOF_MMWORD | |
693 jz short .nextrow | |
694 | |
695 movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] | |
696 | |
697 movq mm3,mm2 | |
698 punpcklbw mm2,mm2 | |
699 punpckhbw mm3,mm3 | |
700 | |
701 movq MMWORD [ebx+2*SIZEOF_MMWORD], mm2 | |
702 movq MMWORD [ebx+3*SIZEOF_MMWORD], mm3 | |
703 movq MMWORD [edi+2*SIZEOF_MMWORD], mm2 | |
704 movq MMWORD [edi+3*SIZEOF_MMWORD], mm3 | |
705 | |
706 sub eax, byte 2*SIZEOF_MMWORD | |
707 jz short .nextrow | |
708 | |
; Both halves done: advance past two input MMWORDs and four output MMWORDs
; in each output row.
709 add esi, byte 2*SIZEOF_MMWORD ; inptr | |
710 add ebx, byte 4*SIZEOF_MMWORD ; outptr0 | |
711 add edi, byte 4*SIZEOF_MMWORD ; outptr1 | |
712 jmp short .columnloop | |
713 alignx 16,7 | |
714 | |
715 .nextrow: | |
716 pop esi | |
717 pop edi | |
718 | |
; One input row yields two output rows, so rowctr drops by 2; continue while
; it stays positive (signed comparison).
719 add esi, byte 1*SIZEOF_JSAMPROW ; input_data | |
720 add edi, byte 2*SIZEOF_JSAMPROW ; output_data | |
721 sub ecx, byte 2 ; rowctr | |
722 jg short .rowloop | |
723 | |
724 emms ; empty MMX state | |
725 | |
726 .return: | |
727 pop edi | |
728 pop esi | |
729 ; pop edx ; need not be preserved | |
730 ; pop ecx ; need not be preserved | |
731 pop ebx | |
732 pop ebp | |
733 ret | |
734 | |
735 ; For some reason, the OS X linker does not honor the request to align the | |
736 ; segment unless we do this. | |
737 align 16 | |
OLD | NEW |