Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(166)

Side by Side Diff: simd/jdsample-mmx.asm

Issue 1934113002: Update libjpeg_turbo to 1.4.90 from https://github.com/libjpeg-turbo/ (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libjpeg_turbo.git@master
Patch Set: Created 4 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 ;
2 ; jdsample.asm - upsampling (MMX)
3 ;
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5 ;
6 ; Based on
7 ; x86 SIMD extension for IJG JPEG library
8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
10 ;
11 ; This file should be assembled with NASM (Netwide Assembler); it
12 ; can *not* be assembled with Microsoft's MASM or any compatible
13 ; assembler (including Borland's Turbo Assembler).
14 ; NASM is available from http://nasm.sourceforge.net/ or
15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
16 ;
17 ; [TAB8]
18
19 %include "jsimdext.inc"
20
21 ; --------------------------------------------------------------------------
22 SECTION SEG_CONST
23
; Constant table for the two "fancy" (triangle filter) upsamplers below.
; Each entry is four 16-bit words, i.e. one MMX register wide, so it can be
; used directly as a memory operand of pmullw/paddw.
; (alignz and EXTN come from jsimdext.inc, which is not shown here.)
24 alignz 16
25 global EXTN(jconst_fancy_upsample_mmx)
26
27 EXTN(jconst_fancy_upsample_mmx):
28
29 PW_ONE times 4 dw 1 ; h2v1 fancy: rounding bias for even output samples (before >>2)
30 PW_TWO times 4 dw 2 ; h2v1 fancy: rounding bias for odd output samples (before >>2)
31 PW_THREE times 4 dw 3 ; both fancy routines: weight 3 applied to the nearer sample
32 PW_SEVEN times 4 dw 7 ; h2v2 fancy: rounding bias for odd output samples (before >>4)
33 PW_EIGHT times 4 dw 8 ; h2v2 fancy: rounding bias for even output samples (before >>4)
34
35 alignz 16
36
37 ; --------------------------------------------------------------------------
38 SECTION SEG_TEXT
39 BITS 32 ; everything below is 32-bit code
40 ;
41 ; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
42 ;
43 ; The upsampling algorithm is linear interpolation between pixel centers,
44 ; also known as a "triangle filter". This is a good compromise between
45 ; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
46 ; of the way between input pixel centers.
47 ;
48 ; GLOBAL(void)
49 ; jsimd_h2v1_fancy_upsample_mmx (int max_v_samp_factor,
50 ; JDIMENSION downsampled_width,
51 ; JSAMPARRAY input_data,
52 ; JSAMPARRAY *output_data_ptr);
53 ;
54
55 %define max_v_samp(b) (b)+8 ; int max_v_samp_factor
56 %define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width
57 %define input_data(b) (b)+16 ; JSAMPARRAY input_data
58 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr
59
; The macros above give the stack offsets of the four arguments relative to
; ebp after the "push ebp / mov ebp,esp" prologue (first argument at ebp+8).
;
; Per input sample s[i] the routine writes two output samples:
;   out[2i]   = (3*s[i] + s[i-1] + 1) >> 2
;   out[2i+1] = (3*s[i] + s[i+1] + 2) >> 2
; At the row edges the missing neighbour is replaced by the edge sample itself.
;
; Register roles inside the loops:
;   eax = colctr (input samples left, rounded up to a multiple of SIZEOF_MMWORD)
;   ecx = rowctr, esi = inptr, edi = outptr, ebx = GOT base for GOTOFF()
;   mm0 = zero (for byte->word unpacking), mm7 = left-neighbour carry,
;   mm6 = right-neighbour carry
; pushpic/poppic/get_GOT/GOTOFF/alignx are macros from jsimdext.inc (not shown
; here); presumably they only emit code in PIC builds - verify there.
60 align 16
61 global EXTN(jsimd_h2v1_fancy_upsample_mmx)
62
63 EXTN(jsimd_h2v1_fancy_upsample_mmx):
64 push ebp
65 mov ebp,esp
66 pushpic ebx
67 ; push ecx ; need not be preserved
68 ; push edx ; need not be preserved
69 push esi
70 push edi
71
72 get_GOT ebx ; get GOT address
73
74 mov eax, JDIMENSION [downsamp_width(ebp)] ; eax = colctr (input samples per row)
75 test eax,eax
76 jz near .return ; zero width: nothing to do (no MMX state touched yet)
77
78 mov ecx, INT [max_v_samp(ebp)] ; ecx = rowctr
79 test ecx,ecx
80 jz near .return ; zero rows: nothing to do
81
82 mov esi, JSAMPARRAY [input_data(ebp)] ; esi = input_data (array of row pointers)
83 mov edi, POINTER [output_data_ptr(ebp)]
84 mov edi, JSAMPARRAY [edi] ; edi = *output_data_ptr (array of row pointers)
85 alignx 16,7
86 .rowloop:
87 push eax ; save colctr (modified by the column loop)
88 push edi ; save output row-pointer cursor
89 push esi ; save input row-pointer cursor
90
91 mov esi, JSAMPROW [esi] ; inptr
92 mov edi, JSAMPROW [edi] ; outptr
93
94 test eax, SIZEOF_MMWORD-1 ; width a multiple of SIZEOF_MMWORD?
95 jz short .skip ; yes: the last block is full, no padding needed
96 mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] ; dl = last real sample
97 mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample (copy of the last one) just past the row end
98 .skip:
99 pxor mm0,mm0 ; mm0=(all 0's)
100 pcmpeqb mm7,mm7 ; mm7=(all 1's)
101 psrlq mm7,(SIZEOF_MMWORD-1)*BYTE_BIT ; mm7=mask selecting the lowest byte only
102 pand mm7, MMWORD [esi+0*SIZEOF_MMWORD] ; mm7=( 0 - - - - - - -): sample 0 stands in for sample -1
103
104 add eax, byte SIZEOF_MMWORD-1
105 and eax, byte -SIZEOF_MMWORD ; round colctr up to a whole number of MMWORD blocks
106 cmp eax, byte SIZEOF_MMWORD
107 ja short .columnloop ; more than one block: start with a non-final block
108 alignx 16,7
109
110 .columnloop_last: ; final block of the row: no following block to read from
111 pcmpeqb mm6,mm6 ; mm6=(all 1's)
112 psllq mm6,(SIZEOF_MMWORD-1)*BYTE_BIT ; mm6=mask selecting the highest byte only
113 pand mm6, MMWORD [esi+0*SIZEOF_MMWORD] ; mm6=( - - - - - - - 7): sample 7 stands in for sample 8
114 jmp short .upsample
115 alignx 16,7
116
117 .columnloop: ; non-final block: right neighbour comes from the next block
118 movq mm6, MMWORD [esi+1*SIZEOF_MMWORD] ; mm6=next block
119 psllq mm6,(SIZEOF_MMWORD-1)*BYTE_BIT ; mm6=( - - - - - - - 8): first sample of next block, moved to the top byte
120
121 .upsample:
122 movq mm1, MMWORD [esi+0*SIZEOF_MMWORD]
123 movq mm2,mm1
124 movq mm3,mm1 ; mm1=( 0 1 2 3 4 5 6 7)
125 psllq mm2,BYTE_BIT ; mm2=( - 0 1 2 3 4 5 6)
126 psrlq mm3,BYTE_BIT ; mm3=( 1 2 3 4 5 6 7 -)
127
128 por mm2,mm7 ; mm2=(-1 0 1 2 3 4 5 6)
129 por mm3,mm6 ; mm3=( 1 2 3 4 5 6 7 8)
130
131 movq mm7,mm1
132 psrlq mm7,(SIZEOF_MMWORD-1)*BYTE_BIT ; mm7=( 7 - - - - - - -)  (left neighbour for the next block)
133
134 movq mm4,mm1
135 punpcklbw mm1,mm0 ; mm1=( 0 1 2 3)
136 punpckhbw mm4,mm0 ; mm4=( 4 5 6 7)
137 movq mm5,mm2
138 punpcklbw mm2,mm0 ; mm2=(-1 0 1 2)
139 punpckhbw mm5,mm0 ; mm5=( 3 4 5 6)
140 movq mm6,mm3
141 punpcklbw mm3,mm0 ; mm3=( 1 2 3 4)
142 punpckhbw mm6,mm0 ; mm6=( 5 6 7 8)
143
144 pmullw mm1,[GOTOFF(ebx,PW_THREE)] ; mm1=3*cur( 0 1 2 3)
145 pmullw mm4,[GOTOFF(ebx,PW_THREE)] ; mm4=3*cur( 4 5 6 7)
146 paddw mm2,[GOTOFF(ebx,PW_ONE)] ; mm2=left neighbours + 1 (low half)
147 paddw mm5,[GOTOFF(ebx,PW_ONE)] ; mm5=left neighbours + 1 (high half)
148 paddw mm3,[GOTOFF(ebx,PW_TWO)] ; mm3=right neighbours + 2 (low half)
149 paddw mm6,[GOTOFF(ebx,PW_TWO)] ; mm6=right neighbours + 2 (high half)
150
151 paddw mm2,mm1 ; mm2=3*cur + left + 1
152 paddw mm5,mm4
153 psrlw mm2,2 ; mm2=OutLE=( 0 2 4 6)
154 psrlw mm5,2 ; mm5=OutHE=( 8 10 12 14)
155 paddw mm3,mm1 ; mm3=3*cur + right + 2
156 paddw mm6,mm4
157 psrlw mm3,2 ; mm3=OutLO=( 1 3 5 7)
158 psrlw mm6,2 ; mm6=OutHO=( 9 11 13 15)
159
160 psllw mm3,BYTE_BIT ; move odd outputs to the high byte of each word
161 psllw mm6,BYTE_BIT
162 por mm2,mm3 ; mm2=OutL=( 0 1 2 3 4 5 6 7)
163 por mm5,mm6 ; mm5=OutH=( 8 9 10 11 12 13 14 15)
164
165 movq MMWORD [edi+0*SIZEOF_MMWORD], mm2
166 movq MMWORD [edi+1*SIZEOF_MMWORD], mm5
167
168 sub eax, byte SIZEOF_MMWORD ; one input block consumed
169 add esi, byte 1*SIZEOF_MMWORD ; inptr
170 add edi, byte 2*SIZEOF_MMWORD ; outptr
171 cmp eax, byte SIZEOF_MMWORD
172 ja near .columnloop ; more than one block left
173 test eax,eax
174 jnz near .columnloop_last ; exactly one block left: handle it as the final block
175
176 pop esi ; restore the row-pointer cursors and colctr saved at .rowloop
177 pop edi
178 pop eax
179
180 add esi, byte SIZEOF_JSAMPROW ; input_data
181 add edi, byte SIZEOF_JSAMPROW ; output_data
182 dec ecx ; rowctr
183 jg near .rowloop ; loop while rowctr > 0 (signed)
184
185 emms ; empty MMX state
186
187 .return:
188 pop edi
189 pop esi
190 ; pop edx ; need not be preserved
191 ; pop ecx ; need not be preserved
192 poppic ebx
193 pop ebp
194 ret
195
196 ; --------------------------------------------------------------------------
197 ;
198 ; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
199 ; Again a triangle filter; see comments for h2v1 case, above.
200 ;
201 ; GLOBAL(void)
202 ; jsimd_h2v2_fancy_upsample_mmx (int max_v_samp_factor,
203 ; JDIMENSION downsampled_width,
204 ; JSAMPARRAY input_data,
205 ; JSAMPARRAY *output_data_ptr);
206 ;
207
208 %define max_v_samp(b) (b)+8 ; int max_v_samp_factor
209 %define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width
210 %define input_data(b) (b)+16 ; JSAMPARRAY input_data
211 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr
212
213 %define original_ebp ebp+0 ; slot at [aligned ebp] holding the unaligned frame base
214 %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM], located just below the aligned ebp
215 %define WK_NUM 4 ; (%define expands at use, so defining this after wk(i) is fine)
216 %define gotptr wk(0)-SIZEOF_POINTER ; void *gotptr, stored just below wk[0]
217
; Two-pass triangle filter.  For each input row r the vertical pass forms the
; 16-bit sums
;   Int0[i] = 3*row[r][i] + row[r-1][i]      (for the upper output row)
;   Int1[i] = 3*row[r][i] + row[r+1][i]      (for the lower output row)
; and parks them temporarily in the output rows themselves (8 sums occupy the
; same 16 bytes as the 16 output samples they will become).  The horizontal
; pass then overwrites them with
;   out[2i]   = (3*Int[i] + Int[i-1] + 8) >> 4
;   out[2i+1] = (3*Int[i] + Int[i+1] + 7) >> 4
; At the row edges the missing neighbour is replaced by the edge value itself.
;
; Scratch slots: wk(0)/wk(1) = left-neighbour word for the upper/lower row,
;                wk(2)/wk(3) = right-neighbour word for the upper/lower row.
; Register roles in the column loop: eax=colctr, ecx=inptr1(above),
; ebx=inptr0 (temporarily replaced by the GOT base while GOTOFF() is used),
; esi=inptr1(below), edx=outptr0, edi=outptr1.
; pushpic/poppic/movpic/get_GOT/GOTOFF/alignx are macros from jsimdext.inc
; (not shown here); presumably they only emit code in PIC builds - verify.
218 align 16
219 global EXTN(jsimd_h2v2_fancy_upsample_mmx)
220
221 EXTN(jsimd_h2v2_fancy_upsample_mmx):
222 push ebp
223 mov eax,esp ; eax = unaligned frame base ("original ebp"): [eax]=saved ebp, args at eax+8..
224 sub esp, byte 4
225 and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
226 mov [esp],eax ; remember the unaligned frame base at [aligned ebp]
227 mov ebp,esp ; ebp = aligned ebp
228 lea esp, [wk(0)] ; reserve wk[0..WK_NUM-1] below the aligned ebp
229 pushpic eax ; make room for the GOT address (the gotptr slot)
230 push ebx
231 ; push ecx ; need not be preserved
232 ; push edx ; need not be preserved
233 push esi
234 push edi
235
236 get_GOT ebx ; get GOT address
237 movpic POINTER [gotptr], ebx ; save GOT address
238
239 mov edx,eax ; edx = original ebp (unaligned frame base, used to reach the arguments)
240 mov eax, JDIMENSION [downsamp_width(edx)] ; eax = colctr (input samples per row)
241 test eax,eax
242 jz near .return ; zero width: nothing to do
243
244 mov ecx, INT [max_v_samp(edx)] ; ecx = rowctr (counts output rows)
245 test ecx,ecx
246 jz near .return ; zero rows: nothing to do
247
248 mov esi, JSAMPARRAY [input_data(edx)] ; esi = input_data (array of row pointers)
249 mov edi, POINTER [output_data_ptr(edx)]
250 mov edi, JSAMPARRAY [edi] ; edi = *output_data_ptr (array of row pointers)
251 alignx 16,7
252 .rowloop:
253 push eax ; save colctr
254 push ecx ; save rowctr (ecx is reused as inptr1(above) below)
255 push edi ; save output row-pointer cursor
256 push esi ; save input row-pointer cursor
257
258 mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above)
259 mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
260 mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below)
261 mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
262 mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
263
264 test eax, SIZEOF_MMWORD-1 ; width a multiple of SIZEOF_MMWORD?
265 jz short .skip ; yes: the last block is full, no padding needed
266 push edx ; dl is used as scratch; keep outptr0
267 mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
268 mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl ; row above: duplicate the last sample
269 mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
270 mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl ; current row: duplicate the last sample
271 mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
272 mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample (row below)
273 pop edx
274 .skip:
275 ; -- process the first column block
276
277 movq mm0, MMWORD [ebx+0*SIZEOF_MMWORD] ; mm0=row[ 0][0]
278 movq mm1, MMWORD [ecx+0*SIZEOF_MMWORD] ; mm1=row[-1][0]
279 movq mm2, MMWORD [esi+0*SIZEOF_MMWORD] ; mm2=row[+1][0]
280
281 pushpic ebx ; save inptr0 while ebx serves as GOT base
282 movpic ebx, POINTER [gotptr] ; load GOT address
283
284 pxor mm3,mm3 ; mm3=(all 0's)
285 movq mm4,mm0
286 punpcklbw mm0,mm3 ; mm0=row[ 0][0]( 0 1 2 3)
287 punpckhbw mm4,mm3 ; mm4=row[ 0][0]( 4 5 6 7)
288 movq mm5,mm1
289 punpcklbw mm1,mm3 ; mm1=row[-1][0]( 0 1 2 3)
290 punpckhbw mm5,mm3 ; mm5=row[-1][0]( 4 5 6 7)
291 movq mm6,mm2
292 punpcklbw mm2,mm3 ; mm2=row[+1][0]( 0 1 2 3)
293 punpckhbw mm6,mm3 ; mm6=row[+1][0]( 4 5 6 7)
294
295 pmullw mm0,[GOTOFF(ebx,PW_THREE)] ; mm0=3*row[ 0][0]( 0 1 2 3)
296 pmullw mm4,[GOTOFF(ebx,PW_THREE)] ; mm4=3*row[ 0][0]( 4 5 6 7)
297
298 pcmpeqb mm7,mm7 ; mm7=(all 1's)
299 psrlq mm7,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm7=mask selecting the lowest word only
300
301 paddw mm1,mm0 ; mm1=Int0L=( 0 1 2 3)
302 paddw mm5,mm4 ; mm5=Int0H=( 4 5 6 7)
303 paddw mm2,mm0 ; mm2=Int1L=( 0 1 2 3)
304 paddw mm6,mm4 ; mm6=Int1H=( 4 5 6 7)
305
306 movq MMWORD [edx+0*SIZEOF_MMWORD], mm1 ; temporarily save
307 movq MMWORD [edx+1*SIZEOF_MMWORD], mm5 ; the intermediate data
308 movq MMWORD [edi+0*SIZEOF_MMWORD], mm2 ; (in the output rows, overwritten later by .upsample)
309 movq MMWORD [edi+1*SIZEOF_MMWORD], mm6
310
311 pand mm1,mm7 ; mm1=( 0 - - -)
312 pand mm2,mm7 ; mm2=( 0 - - -)
313
314 movq MMWORD [wk(0)], mm1 ; wk(0)=upper-row left neighbour: Int0[0] stands in for Int0[-1]
315 movq MMWORD [wk(1)], mm2 ; wk(1)=lower-row left neighbour: Int1[0] stands in for Int1[-1]
316
317 poppic ebx ; ebx = inptr0 again
318
319 add eax, byte SIZEOF_MMWORD-1
320 and eax, byte -SIZEOF_MMWORD ; round colctr up to a whole number of MMWORD blocks
321 cmp eax, byte SIZEOF_MMWORD
322 ja short .columnloop ; more than one block: continue with a non-final block
323 alignx 16,7
324
325 .columnloop_last:
326 ; -- process the last column block
327
328 pushpic ebx ; save inptr0 (restored by the poppic after .upsample)
329 movpic ebx, POINTER [gotptr] ; load GOT address
330
331 pcmpeqb mm1,mm1 ; mm1=(all 1's)
332 psllq mm1,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm1=mask selecting the highest word only
333 movq mm2,mm1
334
335 pand mm1, MMWORD [edx+1*SIZEOF_MMWORD] ; mm1=( - - - 7)
336 pand mm2, MMWORD [edi+1*SIZEOF_MMWORD] ; mm2=( - - - 7)
337
338 movq MMWORD [wk(2)], mm1 ; wk(2)=upper-row right neighbour: Int0[7] stands in for Int0[8]
339 movq MMWORD [wk(3)], mm2 ; wk(3)=lower-row right neighbour: Int1[7] stands in for Int1[8]
340
341 jmp short .upsample
342 alignx 16,7
343
344 .columnloop:
345 ; -- process the next column block
346
347 movq mm0, MMWORD [ebx+1*SIZEOF_MMWORD] ; mm0=row[ 0][1]
348 movq mm1, MMWORD [ecx+1*SIZEOF_MMWORD] ; mm1=row[-1][1]
349 movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] ; mm2=row[+1][1]
350
351 pushpic ebx ; save inptr0 (restored by the poppic after .upsample)
352 movpic ebx, POINTER [gotptr] ; load GOT address
353
354 pxor mm3,mm3 ; mm3=(all 0's)
355 movq mm4,mm0
356 punpcklbw mm0,mm3 ; mm0=row[ 0][1]( 0 1 2 3)
357 punpckhbw mm4,mm3 ; mm4=row[ 0][1]( 4 5 6 7)
358 movq mm5,mm1
359 punpcklbw mm1,mm3 ; mm1=row[-1][1]( 0 1 2 3)
360 punpckhbw mm5,mm3 ; mm5=row[-1][1]( 4 5 6 7)
361 movq mm6,mm2
362 punpcklbw mm2,mm3 ; mm2=row[+1][1]( 0 1 2 3)
363 punpckhbw mm6,mm3 ; mm6=row[+1][1]( 4 5 6 7)
364
365 pmullw mm0,[GOTOFF(ebx,PW_THREE)] ; mm0=3*row[ 0][1]( 0 1 2 3)
366 pmullw mm4,[GOTOFF(ebx,PW_THREE)] ; mm4=3*row[ 0][1]( 4 5 6 7)
367
368 paddw mm1,mm0 ; mm1=Int0L=( 0 1 2 3)
369 paddw mm5,mm4 ; mm5=Int0H=( 4 5 6 7)
370 paddw mm2,mm0 ; mm2=Int1L=( 0 1 2 3)
371 paddw mm6,mm4 ; mm6=Int1H=( 4 5 6 7)
372
373 movq MMWORD [edx+2*SIZEOF_MMWORD], mm1 ; temporarily save
374 movq MMWORD [edx+3*SIZEOF_MMWORD], mm5 ; the intermediate data
375 movq MMWORD [edi+2*SIZEOF_MMWORD], mm2 ; (next block's sums, one block ahead of .upsample)
376 movq MMWORD [edi+3*SIZEOF_MMWORD], mm6
377
378 psllq mm1,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm1=( - - - 0)
379 psllq mm2,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm2=( - - - 0)
380
381 movq MMWORD [wk(2)], mm1 ; wk(2)=upper-row right neighbour: first sum of the next block
382 movq MMWORD [wk(3)], mm2 ; wk(3)=lower-row right neighbour: first sum of the next block
383
384 .upsample:
385 ; -- process the upper row
386
387 movq mm7, MMWORD [edx+0*SIZEOF_MMWORD] ; mm7=Int0L=( 0 1 2 3)
388 movq mm3, MMWORD [edx+1*SIZEOF_MMWORD] ; mm3=Int0H=( 4 5 6 7)
389
390 movq mm0,mm7
391 movq mm4,mm3
392 psrlq mm0,2*BYTE_BIT ; mm0=( 1 2 3 -)
393 psllq mm4,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( - - - 4)
394 movq mm5,mm7
395 movq mm6,mm3
396 psrlq mm5,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm5=( 3 - - -)
397 psllq mm6,2*BYTE_BIT ; mm6=( - 4 5 6)
398
399 por mm0,mm4 ; mm0=( 1 2 3 4)
400 por mm5,mm6 ; mm5=( 3 4 5 6)
401
402 movq mm1,mm7
403 movq mm2,mm3
404 psllq mm1,2*BYTE_BIT ; mm1=( - 0 1 2)
405 psrlq mm2,2*BYTE_BIT ; mm2=( 5 6 7 -)
406 movq mm4,mm3
407 psrlq mm4,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( 7 - - -)
408
409 por mm1, MMWORD [wk(0)] ; mm1=(-1 0 1 2)
410 por mm2, MMWORD [wk(2)] ; mm2=( 5 6 7 8)
411
412 movq MMWORD [wk(0)], mm4 ; wk(0)=Int0[7]: left neighbour for the next block
413
414 pmullw mm7,[GOTOFF(ebx,PW_THREE)] ; mm7=3*Int0L
415 pmullw mm3,[GOTOFF(ebx,PW_THREE)] ; mm3=3*Int0H
416 paddw mm1,[GOTOFF(ebx,PW_EIGHT)] ; left neighbours + 8 (low half)
417 paddw mm5,[GOTOFF(ebx,PW_EIGHT)] ; left neighbours + 8 (high half)
418 paddw mm0,[GOTOFF(ebx,PW_SEVEN)] ; right neighbours + 7 (low half)
419 paddw mm2,[GOTOFF(ebx,PW_SEVEN)] ; right neighbours + 7 (high half)
420
421 paddw mm1,mm7 ; mm1=3*Int0 + left + 8
422 paddw mm5,mm3
423 psrlw mm1,4 ; mm1=Out0LE=( 0 2 4 6)
424 psrlw mm5,4 ; mm5=Out0HE=( 8 10 12 14)
425 paddw mm0,mm7 ; mm0=3*Int0 + right + 7
426 paddw mm2,mm3
427 psrlw mm0,4 ; mm0=Out0LO=( 1 3 5 7)
428 psrlw mm2,4 ; mm2=Out0HO=( 9 11 13 15)
429
430 psllw mm0,BYTE_BIT ; move odd outputs to the high byte of each word
431 psllw mm2,BYTE_BIT
432 por mm1,mm0 ; mm1=Out0L=( 0 1 2 3 4 5 6 7)
433 por mm5,mm2 ; mm5=Out0H=( 8 9 10 11 12 13 14 15)
434
435 movq MMWORD [edx+0*SIZEOF_MMWORD], mm1 ; final samples replace the parked Int0 sums
436 movq MMWORD [edx+1*SIZEOF_MMWORD], mm5
437
438 ; -- process the lower row
439
440 movq mm6, MMWORD [edi+0*SIZEOF_MMWORD] ; mm6=Int1L=( 0 1 2 3)
441 movq mm4, MMWORD [edi+1*SIZEOF_MMWORD] ; mm4=Int1H=( 4 5 6 7)
442
443 movq mm7,mm6
444 movq mm3,mm4
445 psrlq mm7,2*BYTE_BIT ; mm7=( 1 2 3 -)
446 psllq mm3,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( - - - 4)
447 movq mm0,mm6
448 movq mm2,mm4
449 psrlq mm0,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm0=( 3 - - -)
450 psllq mm2,2*BYTE_BIT ; mm2=( - 4 5 6)
451
452 por mm7,mm3 ; mm7=( 1 2 3 4)
453 por mm0,mm2 ; mm0=( 3 4 5 6)
454
455 movq mm1,mm6
456 movq mm5,mm4
457 psllq mm1,2*BYTE_BIT ; mm1=( - 0 1 2)
458 psrlq mm5,2*BYTE_BIT ; mm5=( 5 6 7 -)
459 movq mm3,mm4
460 psrlq mm3,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( 7 - - -)
461
462 por mm1, MMWORD [wk(1)] ; mm1=(-1 0 1 2)
463 por mm5, MMWORD [wk(3)] ; mm5=( 5 6 7 8)
464
465 movq MMWORD [wk(1)], mm3 ; wk(1)=Int1[7]: left neighbour for the next block
466
467 pmullw mm6,[GOTOFF(ebx,PW_THREE)] ; mm6=3*Int1L
468 pmullw mm4,[GOTOFF(ebx,PW_THREE)] ; mm4=3*Int1H
469 paddw mm1,[GOTOFF(ebx,PW_EIGHT)] ; left neighbours + 8 (low half)
470 paddw mm0,[GOTOFF(ebx,PW_EIGHT)] ; left neighbours + 8 (high half)
471 paddw mm7,[GOTOFF(ebx,PW_SEVEN)] ; right neighbours + 7 (low half)
472 paddw mm5,[GOTOFF(ebx,PW_SEVEN)] ; right neighbours + 7 (high half)
473
474 paddw mm1,mm6 ; mm1=3*Int1 + left + 8
475 paddw mm0,mm4
476 psrlw mm1,4 ; mm1=Out1LE=( 0 2 4 6)
477 psrlw mm0,4 ; mm0=Out1HE=( 8 10 12 14)
478 paddw mm7,mm6 ; mm7=3*Int1 + right + 7
479 paddw mm5,mm4
480 psrlw mm7,4 ; mm7=Out1LO=( 1 3 5 7)
481 psrlw mm5,4 ; mm5=Out1HO=( 9 11 13 15)
482
483 psllw mm7,BYTE_BIT ; move odd outputs to the high byte of each word
484 psllw mm5,BYTE_BIT
485 por mm1,mm7 ; mm1=Out1L=( 0 1 2 3 4 5 6 7)
486 por mm0,mm5 ; mm0=Out1H=( 8 9 10 11 12 13 14 15)
487
488 movq MMWORD [edi+0*SIZEOF_MMWORD], mm1 ; final samples replace the parked Int1 sums
489 movq MMWORD [edi+1*SIZEOF_MMWORD], mm0
490
491 poppic ebx ; ebx = inptr0 again (pairs with the pushpic in .columnloop/.columnloop_last)
492
493 sub eax, byte SIZEOF_MMWORD ; one input block consumed
494 add ecx, byte 1*SIZEOF_MMWORD ; inptr1(above)
495 add ebx, byte 1*SIZEOF_MMWORD ; inptr0
496 add esi, byte 1*SIZEOF_MMWORD ; inptr1(below)
497 add edx, byte 2*SIZEOF_MMWORD ; outptr0
498 add edi, byte 2*SIZEOF_MMWORD ; outptr1
499 cmp eax, byte SIZEOF_MMWORD
500 ja near .columnloop ; more than one block left
501 test eax,eax
502 jnz near .columnloop_last ; exactly one block left: handle it as the final block
503
504 pop esi ; restore the values saved at .rowloop
505 pop edi
506 pop ecx
507 pop eax
508
509 add esi, byte 1*SIZEOF_JSAMPROW ; input_data
510 add edi, byte 2*SIZEOF_JSAMPROW ; output_data
511 sub ecx, byte 2 ; rowctr (two output rows per input row)
512 jg near .rowloop ; loop while rowctr > 0 (signed)
513
514 emms ; empty MMX state
515
516 .return:
517 pop edi
518 pop esi
519 ; pop edx ; need not be preserved
520 ; pop ecx ; need not be preserved
521 pop ebx
522 mov esp,ebp ; esp <- aligned ebp
523 pop esp ; esp <- original ebp (value stored at [aligned ebp] in the prologue)
524 pop ebp
525 ret
526
527 ; --------------------------------------------------------------------------
528 ;
529 ; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
530 ; It's still a box filter.
531 ;
532 ; GLOBAL(void)
533 ; jsimd_h2v1_upsample_mmx (int max_v_samp_factor,
534 ; JDIMENSION output_width,
535 ; JSAMPARRAY input_data,
536 ; JSAMPARRAY *output_data_ptr);
537 ;
538
539 %define max_v_samp(b) (b)+8 ; int max_v_samp_factor
540 %define output_width(b) (b)+12 ; JDIMENSION output_width
541 %define input_data(b) (b)+16 ; JSAMPARRAY input_data
542 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr
543
; The macros above give the stack offsets of the four arguments relative to
; ebp after the "push ebp / mov ebp,esp" prologue (first argument at ebp+8).
;
; Each input sample is written twice in a row (pixel doubling): one MMWORD of
; input becomes two MMWORDs of output.  The column loop is unrolled twice.
; The column count is output_width rounded up to a multiple of 2*SIZEOF_MMWORD,
; so whole blocks are always read and written.
; NOTE(review): this can touch bytes past output_width; presumably the row
; buffers are padded accordingly - confirm against the caller.
;
; Register roles: edx = rounded output width, ecx = rowctr, eax = colctr
; (output samples left in the row), esi = inptr, edi = outptr.
544 align 16
545 global EXTN(jsimd_h2v1_upsample_mmx)
546
547 EXTN(jsimd_h2v1_upsample_mmx):
548 push ebp
549 mov ebp,esp
550 ; push ebx ; unused
551 ; push ecx ; need not be preserved
552 ; push edx ; need not be preserved
553 push esi
554 push edi
555
556 mov edx, JDIMENSION [output_width(ebp)]
557 add edx, byte (2*SIZEOF_MMWORD)-1
558 and edx, byte -(2*SIZEOF_MMWORD) ; edx = output_width rounded up to a multiple of 2*SIZEOF_MMWORD
559 jz short .return ; tests the result of the "and": nothing to do for zero width
560
561 mov ecx, INT [max_v_samp(ebp)] ; rowctr
562 test ecx,ecx
563 jz short .return ; zero rows: nothing to do
564
565 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
566 mov edi, POINTER [output_data_ptr(ebp)]
567 mov edi, JSAMPARRAY [edi] ; output_data
568 alignx 16,7
569 .rowloop:
570 push edi ; save output row-pointer cursor
571 push esi ; save input row-pointer cursor
572
573 mov esi, JSAMPROW [esi] ; inptr
574 mov edi, JSAMPROW [edi] ; outptr
575 mov eax,edx ; colctr
576 alignx 16,7
577 .columnloop:
578
579 movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] ; mm0 = first input block
580
581 movq mm1,mm0
582 punpcklbw mm0,mm0 ; interleave with itself: low-half samples, each duplicated
583 punpckhbw mm1,mm1 ; high-half samples, each duplicated
584
585 movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
586 movq MMWORD [edi+1*SIZEOF_MMWORD], mm1
587
588 sub eax, byte 2*SIZEOF_MMWORD ; 2*SIZEOF_MMWORD output samples written
589 jz short .nextrow
590
591 movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] ; mm2 = second input block (unrolled half)
592
593 movq mm3,mm2
594 punpcklbw mm2,mm2 ; low-half samples, each duplicated
595 punpckhbw mm3,mm3 ; high-half samples, each duplicated
596
597 movq MMWORD [edi+2*SIZEOF_MMWORD], mm2
598 movq MMWORD [edi+3*SIZEOF_MMWORD], mm3
599
600 sub eax, byte 2*SIZEOF_MMWORD
601 jz short .nextrow
602
603 add esi, byte 2*SIZEOF_MMWORD ; inptr
604 add edi, byte 4*SIZEOF_MMWORD ; outptr
605 jmp short .columnloop
606 alignx 16,7
607
608 .nextrow:
609 pop esi ; restore the row-pointer cursors saved at .rowloop
610 pop edi
611
612 add esi, byte SIZEOF_JSAMPROW ; input_data
613 add edi, byte SIZEOF_JSAMPROW ; output_data
614 dec ecx ; rowctr
615 jg short .rowloop ; loop while rowctr > 0 (signed)
616
617 emms ; empty MMX state
618
619 .return:
620 pop edi
621 pop esi
622 ; pop edx ; need not be preserved
623 ; pop ecx ; need not be preserved
624 ; pop ebx ; unused
625 pop ebp
626 ret
627
628 ; --------------------------------------------------------------------------
629 ;
630 ; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
631 ; It's still a box filter.
632 ;
633 ; GLOBAL(void)
634 ; jsimd_h2v2_upsample_mmx (int max_v_samp_factor,
635 ; JDIMENSION output_width,
636 ; JSAMPARRAY input_data,
637 ; JSAMPARRAY *output_data_ptr);
638 ;
639
640 %define max_v_samp(b) (b)+8 ; int max_v_samp_factor
641 %define output_width(b) (b)+12 ; JDIMENSION output_width
642 %define input_data(b) (b)+16 ; JSAMPARRAY input_data
643 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr
644
; The macros above give the stack offsets of the four arguments relative to
; ebp after the "push ebp / mov ebp,esp" prologue (first argument at ebp+8).
;
; Each input sample is written twice horizontally, and the resulting row is
; stored into two consecutive output rows (2x2 pixel replication).  The column
; loop is unrolled twice.  The column count is output_width rounded up to a
; multiple of 2*SIZEOF_MMWORD, so whole blocks are always read and written.
; NOTE(review): this can touch bytes past output_width; presumably the row
; buffers are padded accordingly - confirm against the caller.
;
; Register roles: edx = rounded output width, ecx = rowctr (output rows),
; eax = colctr, esi = inptr, ebx = outptr0, edi = outptr1.
645 align 16
646 global EXTN(jsimd_h2v2_upsample_mmx)
647
648 EXTN(jsimd_h2v2_upsample_mmx):
649 push ebp
650 mov ebp,esp
651 push ebx ; ebx is used as outptr0 below, so it is saved here
652 ; push ecx ; need not be preserved
653 ; push edx ; need not be preserved
654 push esi
655 push edi
656
657 mov edx, JDIMENSION [output_width(ebp)]
658 add edx, byte (2*SIZEOF_MMWORD)-1
659 and edx, byte -(2*SIZEOF_MMWORD) ; edx = output_width rounded up to a multiple of 2*SIZEOF_MMWORD
660 jz near .return ; tests the result of the "and": nothing to do for zero width
661
662 mov ecx, INT [max_v_samp(ebp)] ; rowctr
663 test ecx,ecx
664 jz short .return ; zero rows: nothing to do
665
666 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
667 mov edi, POINTER [output_data_ptr(ebp)]
668 mov edi, JSAMPARRAY [edi] ; output_data
669 alignx 16,7
670 .rowloop:
671 push edi ; save output row-pointer cursor
672 push esi ; save input row-pointer cursor
673
674 mov esi, JSAMPROW [esi] ; inptr
675 mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
676 mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
677 mov eax,edx ; colctr
678 alignx 16,7
679 .columnloop:
680
681 movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] ; mm0 = first input block
682
683 movq mm1,mm0
684 punpcklbw mm0,mm0 ; interleave with itself: low-half samples, each duplicated
685 punpckhbw mm1,mm1 ; high-half samples, each duplicated
686
687 movq MMWORD [ebx+0*SIZEOF_MMWORD], mm0 ; same data goes to both output rows
688 movq MMWORD [ebx+1*SIZEOF_MMWORD], mm1
689 movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
690 movq MMWORD [edi+1*SIZEOF_MMWORD], mm1
691
692 sub eax, byte 2*SIZEOF_MMWORD ; 2*SIZEOF_MMWORD output samples written per row
693 jz short .nextrow
694
695 movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] ; mm2 = second input block (unrolled half)
696
697 movq mm3,mm2
698 punpcklbw mm2,mm2 ; low-half samples, each duplicated
699 punpckhbw mm3,mm3 ; high-half samples, each duplicated
700
701 movq MMWORD [ebx+2*SIZEOF_MMWORD], mm2
702 movq MMWORD [ebx+3*SIZEOF_MMWORD], mm3
703 movq MMWORD [edi+2*SIZEOF_MMWORD], mm2
704 movq MMWORD [edi+3*SIZEOF_MMWORD], mm3
705
706 sub eax, byte 2*SIZEOF_MMWORD
707 jz short .nextrow
708
709 add esi, byte 2*SIZEOF_MMWORD ; inptr
710 add ebx, byte 4*SIZEOF_MMWORD ; outptr0
711 add edi, byte 4*SIZEOF_MMWORD ; outptr1
712 jmp short .columnloop
713 alignx 16,7
714
715 .nextrow:
716 pop esi ; restore the row-pointer cursors saved at .rowloop
717 pop edi
718
719 add esi, byte 1*SIZEOF_JSAMPROW ; input_data
720 add edi, byte 2*SIZEOF_JSAMPROW ; output_data
721 sub ecx, byte 2 ; rowctr (two output rows per input row)
722 jg short .rowloop ; loop while rowctr > 0 (signed)
723
724 emms ; empty MMX state
725
726 .return:
727 pop edi
728 pop esi
729 ; pop edx ; need not be preserved
730 ; pop ecx ; need not be preserved
731 pop ebx
732 pop ebp
733 ret
734
735 ; For some reason, the OS X linker does not honor the request to align the
736 ; segment unless we do this.
737 align 16
OLDNEW
« simd/jccolext-sse2-64.asm ('K') | « simd/jdsample-altivec.c ('k') | simd/jdsample-sse2.asm » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698