OLD | NEW |
| (Empty) |
1 ; | |
2 ; jdsamss2.asm - upsampling (SSE2) | |
3 ; | |
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB | |
5 ; | |
6 ; Based on | |
7 ; x86 SIMD extension for IJG JPEG library | |
8 ; Copyright (C) 1999-2006, MIYASAKA Masaru. | |
9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc | |
10 ; | |
11 ; This file should be assembled with NASM (Netwide Assembler), | |
12 ; can *not* be assembled with Microsoft's MASM or any compatible | |
13 ; assembler (including Borland's Turbo Assembler). | |
14 ; NASM is available from http://nasm.sourceforge.net/ or | |
15 ; http://sourceforge.net/project/showfiles.php?group_id=6208 | |
16 ; | |
17 ; [TAB8] | |
18 | |
19 %include "jsimdext.inc" | |
20 | |
21 ; -------------------------------------------------------------------------- | |
; Constant table used by the two "fancy" (triangle-filter) upsampling routines
; in this file.  Every entry is eight 16-bit words holding the same value, so
; each one can be used directly as a 128-bit pmullw/paddw memory operand.
; NOTE(review): SEG_CONST, alignz, EXTN and PRIVATE are defined in
; jsimdext.inc, which is not visible here.
22 SECTION SEG_CONST | |
23 | |
24 alignz 16 | |
25 global EXTN(jconst_fancy_upsample_sse2) PRIVATE | |
26 | |
27 EXTN(jconst_fancy_upsample_sse2): | |
28 | |
; PW_ONE / PW_TWO : rounding terms added before the ">> 2" in the h2v1 routine
; PW_THREE        : weight applied to the nearer sample (h2v1 and h2v2)
; PW_SEVEN / PW_EIGHT : rounding terms added before the ">> 4" in the h2v2 routine
29 PW_ONE times 8 dw 1 | |
30 PW_TWO times 8 dw 2 | |
31 PW_THREE times 8 dw 3 | |
32 PW_SEVEN times 8 dw 7 | |
33 PW_EIGHT times 8 dw 8 | |
34 | |
35 alignz 16 | |
36 | |
37 ; -------------------------------------------------------------------------- | |
; Code section.  BITS 32 makes NASM emit 32-bit x86 code for everything below;
; all routines in this file read their arguments from the stack via ebp/edx.
; NOTE(review): SEG_TEXT is defined in jsimdext.inc (not visible here).
38 SECTION SEG_TEXT | |
39 BITS 32 | |
40 ; | |
41 ; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical. | |
42 ; | |
43 ; The upsampling algorithm is linear interpolation between pixel centers, | |
44 ; also known as a "triangle filter". This is a good compromise between | |
45 ; speed and visual quality. The centers of the output pixels are 1/4 and 3/4 | |
46 ; of the way between input pixel centers. | |
47 ; | |
48 ; GLOBAL(void) | |
49 ; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor, | |
50 ; JDIMENSION downsampled_width, | |
51 ; JSAMPARRAY input_data, | |
52 ; JSAMPARRAY * output_data_ptr); | |
53 ; | |
54 | |
; jsimd_h2v1_fancy_upsample_sse2
;
; For every row, each input sample in[i] produces two output samples:
;     out[2i]   = (3*in[i] + in[i-1] + 1) >> 2
;     out[2i+1] = (3*in[i] + in[i+1] + 2) >> 2
; The left neighbour of the first sample is the first sample itself; the right
; neighbour of the final block's last byte is that byte itself.
;
; Register roles inside the loops:
;     eax = colctr (input samples left, rounded up to a whole XMM block)
;     ecx = rowctr
;     esi = input_data row-pointer cursor / inptr inside a row
;     edi = output_data row-pointer cursor / outptr inside a row
;     ebx = GOT base for GOTOFF() constant access
;     xmm0 = all zeros (used to widen bytes to words)
;     xmm7 = left-neighbour byte carried into the current block
;     xmm6 = right-neighbour byte for the current block
;
; All row accesses use movdqa, which faults on addresses that are not 16-byte
; aligned, and a whole XMM block is read/written even for a partial last block.
; NOTE(review): SIZEOF_XMMWORD is presumably 16 and BYTE_BIT presumably 8;
; pushpic/poppic/get_GOT/GOTOFF/alignx come from jsimdext.inc (not visible).
;
; Stack-argument offsets relative to the frame pointer built in the prologue
; (saved ebp at +0, return address at +4, first argument at +8).
55 %define max_v_samp(b) (b)+8 ; int max_v_samp_factor | |
56 %define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width | |
57 %define input_data(b) (b)+16 ; JSAMPARRAY input_data | |
58 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr | |
59 | |
60 align 16 | |
61 global EXTN(jsimd_h2v1_fancy_upsample_sse2) PRIVATE | |
62 | |
63 EXTN(jsimd_h2v1_fancy_upsample_sse2): | |
; -- prologue: conventional ebp frame; ebx, esi and edi are saved and restored
64 push ebp | |
65 mov ebp,esp | |
66 pushpic ebx | |
67 ; push ecx ; need not be preserved | |
68 ; push edx ; need not be preserved | |
69 push esi | |
70 push edi | |
71 | |
72 get_GOT ebx ; get GOT address | |
73 | |
; -- nothing to do when the width or the row count is zero
74 mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr | |
75 test eax,eax | |
76 jz near .return | |
77 | |
78 mov ecx, INT [max_v_samp(ebp)] ; rowctr | |
79 test ecx,ecx | |
80 jz near .return | |
81 | |
; -- output_data_ptr is a pointer to the row-pointer array, hence the double load
82 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data | |
83 mov edi, POINTER [output_data_ptr(ebp)] | |
84 mov edi, JSAMPARRAY [edi] ; output_data | |
85 alignx 16,7 | |
86 .rowloop: | |
; -- save the per-row state; eax/esi/edi are modified by the column loop
87 push eax ; colctr | |
88 push edi | |
89 push esi | |
90 | |
91 mov esi, JSAMPROW [esi] ; inptr | |
92 mov edi, JSAMPROW [edi] ; outptr | |
93 | |
; -- if the width is not a whole number of XMM blocks, copy the last real
; -- sample into the byte just past it so that it becomes its own right
; -- neighbour (this writes one byte beyond the stated width of the input row)
94 test eax, SIZEOF_XMMWORD-1 | |
95 jz short .skip | |
96 mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] | |
97 mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample | |
98 .skip: | |
; -- xmm7 = first sample of the row in the lowest byte, all other bytes zero:
; -- it plays the role of in[-1] for the first block
99 pxor xmm0,xmm0 ; xmm0=(all 0's) | |
100 pcmpeqb xmm7,xmm7 | |
101 psrldq xmm7,(SIZEOF_XMMWORD-1) | |
102 pand xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD] | |
103 | |
; -- round colctr up to a multiple of the block size; more than one block left
; -- means the right neighbour comes from the next block (.columnloop)
104 add eax, byte SIZEOF_XMMWORD-1 | |
105 and eax, byte -SIZEOF_XMMWORD | |
106 cmp eax, byte SIZEOF_XMMWORD | |
107 ja short .columnloop | |
108 alignx 16,7 | |
109 | |
110 .columnloop_last: | |
; -- last block: xmm6 = this block's own top byte, kept in the top position
111 pcmpeqb xmm6,xmm6 | |
112 pslldq xmm6,(SIZEOF_XMMWORD-1) | |
113 pand xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD] | |
114 jmp short .upsample | |
115 alignx 16,7 | |
116 | |
117 .columnloop: | |
; -- not the last block: xmm6 = first byte of the next block moved to the top
118 movdqa xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD] | |
119 pslldq xmm6,(SIZEOF_XMMWORD-1) | |
120 | |
121 .upsample: | |
; -- build the left-shifted and right-shifted copies of the current block
122 movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] | |
123 movdqa xmm2,xmm1 | |
124 movdqa xmm3,xmm1 ; xmm1=( 0 1 2 ... 13 14 15) | |
125 pslldq xmm2,1 ; xmm2=(-- 0 1 ... 12 13 14) | |
126 psrldq xmm3,1 ; xmm3=( 1 2 3 ... 14 15 --) | |
127 | |
; -- fill the vacated edge bytes with the carried neighbours
128 por xmm2,xmm7 ; xmm2=(-1 0 1 ... 12 13 14) | |
129 por xmm3,xmm6 ; xmm3=( 1 2 3 ... 14 15 16) | |
130 | |
; -- remember this block's last byte as the left neighbour of the next block
131 movdqa xmm7,xmm1 | |
132 psrldq xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --) | |
133 | |
; -- widen all three vectors from bytes to 16-bit words (low and high halves)
134 movdqa xmm4,xmm1 | |
135 punpcklbw xmm1,xmm0 ; xmm1=( 0 1 2 3 4 5 6 7) | |
136 punpckhbw xmm4,xmm0 ; xmm4=( 8 9 10 11 12 13 14 15) | |
137 movdqa xmm5,xmm2 | |
138 punpcklbw xmm2,xmm0 ; xmm2=(-1 0 1 2 3 4 5 6) | |
139 punpckhbw xmm5,xmm0 ; xmm5=( 7 8 9 10 11 12 13 14) | |
140 movdqa xmm6,xmm3 | |
141 punpcklbw xmm3,xmm0 ; xmm3=( 1 2 3 4 5 6 7 8) | |
142 punpckhbw xmm6,xmm0 ; xmm6=( 9 10 11 12 13 14 15 16) | |
143 | |
; -- centre*3; left neighbour + 1 and right neighbour + 2 (rounding terms)
144 pmullw xmm1,[GOTOFF(ebx,PW_THREE)] | |
145 pmullw xmm4,[GOTOFF(ebx,PW_THREE)] | |
146 paddw xmm2,[GOTOFF(ebx,PW_ONE)] | |
147 paddw xmm5,[GOTOFF(ebx,PW_ONE)] | |
148 paddw xmm3,[GOTOFF(ebx,PW_TWO)] | |
149 paddw xmm6,[GOTOFF(ebx,PW_TWO)] | |
150 | |
; -- even outputs = (3*c + left + 1) >> 2, odd outputs = (3*c + right + 2) >> 2
151 paddw xmm2,xmm1 | |
152 paddw xmm5,xmm4 | |
153 psrlw xmm2,2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14) | |
154 psrlw xmm5,2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30) | |
155 paddw xmm3,xmm1 | |
156 paddw xmm6,xmm4 | |
157 psrlw xmm3,2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15) | |
158 psrlw xmm6,2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31) | |
159 | |
; -- interleave: odd results go to the high byte of each word, even results
; -- stay in the low byte
160 psllw xmm3,BYTE_BIT | |
161 psllw xmm6,BYTE_BIT | |
162 por xmm2,xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15) | |
163 por xmm5,xmm6 ; xmm5=OutH=(16 17 18 ... 29 30 31) | |
164 | |
165 movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2 | |
166 movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5 | |
167 | |
; -- one input block consumed, two output blocks produced
168 sub eax, byte SIZEOF_XMMWORD | |
169 add esi, byte 1*SIZEOF_XMMWORD ; inptr | |
170 add edi, byte 2*SIZEOF_XMMWORD ; outptr | |
; -- more than one block left -> .columnloop; exactly one -> .columnloop_last
171 cmp eax, byte SIZEOF_XMMWORD | |
172 ja near .columnloop | |
173 test eax,eax | |
174 jnz near .columnloop_last | |
175 | |
; -- row finished: restore the row-pointer cursors and the original colctr
176 pop esi | |
177 pop edi | |
178 pop eax | |
179 | |
; -- one input row yields one output row
180 add esi, byte SIZEOF_JSAMPROW ; input_data | |
181 add edi, byte SIZEOF_JSAMPROW ; output_data | |
182 dec ecx ; rowctr | |
183 jg near .rowloop | |
184 | |
185 .return: | |
; -- epilogue: restore saved registers in reverse order of the prologue
186 pop edi | |
187 pop esi | |
188 ; pop edx ; need not be preserved | |
189 ; pop ecx ; need not be preserved | |
190 poppic ebx | |
191 pop ebp | |
192 ret | |
193 | |
194 ; -------------------------------------------------------------------------- | |
195 ; | |
196 ; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical. | |
197 ; Again a triangle filter; see comments for h2v1 case, above. | |
198 ; | |
199 ; GLOBAL(void) | |
200 ; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor, | |
201 ; JDIMENSION downsampled_width, | |
202 ; JSAMPARRAY input_data, | |
203 ; JSAMPARRAY * output_data_ptr); | |
204 ; | |
205 | |
; jsimd_h2v2_fancy_upsample_sse2
;
; Each input row produces two output rows.  The work is done in two passes per
; XMM block:
;   vertical   : Int0[i] = 3*row[0][i] + row[-1][i]   (upper output row)
;                Int1[i] = 3*row[0][i] + row[+1][i]   (lower output row)
;                kept as 16-bit words and parked temporarily in the output rows
;   horizontal : out[2i]   = (3*Int[i] + Int[i-1] + 8) >> 4
;                out[2i+1] = (3*Int[i] + Int[i+1] + 7) >> 4
; The routine reads the row pointers at input_data[-1] and input_data[+1], so
; the caller must supply valid rows above and below each processed row.
;
; Register roles inside the loops:
;     eax = colctr, ecx = inptr1(above), ebx = inptr0, esi = inptr1(below),
;     edx = outptr0, edi = outptr1
; ebx doubles as the GOT base: it is saved with pushpic and reloaded from
; [gotptr] around every stretch of code that uses GOTOFF().
; Scratch slots: wk(0)/wk(1) = left-neighbour word for the upper/lower row,
;                wk(2)/wk(3) = right-neighbour word for the upper/lower row.
;
; All row accesses use movdqa, which faults on addresses that are not 16-byte
; aligned.
; NOTE(review): SIZEOF_XMMWORD is presumably 16 and BYTE_BIT presumably 8;
; pushpic/poppic/movpic/get_GOT/GOTOFF/alignx come from jsimdext.inc (not
; visible), and presumably only emit code in PIC builds.
;
; Stack-argument offsets; here they are applied to edx, which holds the
; unaligned frame pointer, because ebp is re-used as an aligned scratch base.
206 %define max_v_samp(b) (b)+8 ; int max_v_samp_factor | |
207 %define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width | |
208 %define input_data(b) (b)+16 ; JSAMPARRAY input_data | |
209 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr | |
210 | |
; -- frame layout relative to the aligned ebp:
; --   [ebp+0]            saved unaligned frame pointer
; --   wk(0)..wk(WK_NUM-1) WK_NUM xmmword scratch slots directly below ebp
; --   gotptr             pointer-sized slot directly below wk(0)
211 %define original_ebp ebp+0 | |
212 %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] | |
213 %define WK_NUM 4 | |
214 %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr | |
215 | |
216 align 16 | |
217 global EXTN(jsimd_h2v2_fancy_upsample_sse2) PRIVATE | |
218 | |
219 EXTN(jsimd_h2v2_fancy_upsample_sse2): | |
; -- prologue: after "push ebp", eax = esp is the value a conventional frame
; -- pointer would have.  esp is then rounded down to a 16-byte boundary, the
; -- unaligned frame pointer is stored at that address, and ebp is set to it so
; -- the wk() slots can be accessed with movdqa.
220 push ebp | |
221 mov eax,esp ; eax = original ebp | |
222 sub esp, byte 4 | |
223 and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits | |
224 mov [esp],eax | |
225 mov ebp,esp ; ebp = aligned ebp | |
; -- reserve the wk[] array, then the gotptr slot, then save ebx/esi/edi
226 lea esp, [wk(0)] | |
227 pushpic eax ; make a room for GOT address | |
228 push ebx | |
229 ; push ecx ; need not be preserved | |
230 ; push edx ; need not be preserved | |
231 push esi | |
232 push edi | |
233 | |
234 get_GOT ebx ; get GOT address | |
235 movpic POINTER [gotptr], ebx ; save GOT address | |
236 | |
; -- arguments are addressed through edx (unaligned frame pointer)
237 mov edx,eax ; edx = original ebp | |
238 mov eax, JDIMENSION [downsamp_width(edx)] ; colctr | |
239 test eax,eax | |
240 jz near .return | |
241 | |
242 mov ecx, INT [max_v_samp(edx)] ; rowctr | |
243 test ecx,ecx | |
244 jz near .return | |
245 | |
246 mov esi, JSAMPARRAY [input_data(edx)] ; input_data | |
247 mov edi, POINTER [output_data_ptr(edx)] | |
248 mov edi, JSAMPARRAY [edi] ; output_data | |
249 alignx 16,7 | |
250 .rowloop: | |
; -- save per-row state; ecx (rowctr) is saved because ecx is re-used below
251 push eax ; colctr | |
252 push ecx | |
253 push edi | |
254 push esi | |
255 | |
; -- fetch the three input row pointers and the two output row pointers
256 mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above) | |
257 mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 | |
258 mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below) | |
259 mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 | |
260 mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 | |
261 | |
; -- if the width is not a whole number of XMM blocks, duplicate the last real
; -- sample of each of the three input rows into the byte just past it
; -- (edx is borrowed as a byte temporary, hence the push/pop)
262 test eax, SIZEOF_XMMWORD-1 | |
263 jz short .skip | |
264 push edx | |
265 mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE] | |
266 mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl | |
267 mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE] | |
268 mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl | |
269 mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] | |
270 mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample | |
271 pop edx | |
272 .skip: | |
273 ; -- process the first column block | |
274 | |
275 movdqa xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0] | |
276 movdqa xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0] | |
277 movdqa xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0] | |
278 | |
; -- ebx (inptr0) is parked on the stack while ebx serves as the GOT base
279 pushpic ebx | |
280 movpic ebx, POINTER [gotptr] ; load GOT address | |
281 | |
; -- widen the three rows from bytes to words
282 pxor xmm3,xmm3 ; xmm3=(all 0's) | |
283 movdqa xmm4,xmm0 | |
284 punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) | |
285 punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) | |
286 movdqa xmm5,xmm1 | |
287 punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) | |
288 punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) | |
289 movdqa xmm6,xmm2 | |
290 punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) | |
291 punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) | |
292 | |
293 pmullw xmm0,[GOTOFF(ebx,PW_THREE)] | |
294 pmullw xmm4,[GOTOFF(ebx,PW_THREE)] | |
295 | |
; -- xmm7 = mask selecting only the lowest 16-bit word
296 pcmpeqb xmm7,xmm7 | |
297 psrldq xmm7,(SIZEOF_XMMWORD-2) | |
298 | |
; -- vertical pass: Int0 = 3*row[0] + row[-1], Int1 = 3*row[0] + row[+1]
299 paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) | |
300 paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) | |
301 paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) | |
302 paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) | |
303 | |
; -- 16 word-sized intermediates occupy exactly the two output blocks that the
; -- final bytes for this input block will later overwrite
304 movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save | |
305 movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data | |
306 movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2 | |
307 movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6 | |
308 | |
; -- the first intermediate of each row is its own left neighbour
309 pand xmm1,xmm7 ; xmm1=( 0 -- -- -- -- -- -- --) | |
310 pand xmm2,xmm7 ; xmm2=( 0 -- -- -- -- -- -- --) | |
311 | |
312 movdqa XMMWORD [wk(0)], xmm1 | |
313 movdqa XMMWORD [wk(1)], xmm2 | |
314 | |
315 poppic ebx | |
316 | |
; -- round colctr up to a multiple of the block size; more than one block left
; -- means the right neighbour comes from the next block (.columnloop)
317 add eax, byte SIZEOF_XMMWORD-1 | |
318 and eax, byte -SIZEOF_XMMWORD | |
319 cmp eax, byte SIZEOF_XMMWORD | |
320 ja short .columnloop | |
321 alignx 16,7 | |
322 | |
323 .columnloop_last: | |
324 ; -- process the last column block | |
325 | |
326 pushpic ebx | |
327 movpic ebx, POINTER [gotptr] ; load GOT address | |
328 | |
; -- right neighbour = the block's own last intermediate (top word of the
; -- high half already parked in the output rows), kept in the top position
329 pcmpeqb xmm1,xmm1 | |
330 pslldq xmm1,(SIZEOF_XMMWORD-2) | |
331 movdqa xmm2,xmm1 | |
332 | |
333 pand xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD] | |
334 pand xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD] | |
335 | |
336 movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- -- -- -- -- -- 15) | |
337 movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15) | |
338 | |
339 jmp near .upsample | |
340 alignx 16,7 | |
341 | |
342 .columnloop: | |
343 ; -- process the next column block | |
344 | |
; -- run the vertical pass one block ahead so that the current block's right
; -- neighbour (first intermediate of the next block) is available
345 movdqa xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1] | |
346 movdqa xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1] | |
347 movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1] | |
348 | |
349 pushpic ebx | |
350 movpic ebx, POINTER [gotptr] ; load GOT address | |
351 | |
352 pxor xmm3,xmm3 ; xmm3=(all 0's) | |
353 movdqa xmm4,xmm0 | |
354 punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) | |
355 punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) | |
356 movdqa xmm5,xmm1 | |
357 punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) | |
358 punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) | |
359 movdqa xmm6,xmm2 | |
360 punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) | |
361 punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) | |
362 | |
363 pmullw xmm0,[GOTOFF(ebx,PW_THREE)] | |
364 pmullw xmm4,[GOTOFF(ebx,PW_THREE)] | |
365 | |
366 paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) | |
367 paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) | |
368 paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) | |
369 paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) | |
370 | |
; -- park the next block's intermediates in output blocks 2 and 3
371 movdqa XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save | |
372 movdqa XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data | |
373 movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2 | |
374 movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6 | |
375 | |
; -- right neighbour = first intermediate of the next block, moved to the top
376 pslldq xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0) | |
377 pslldq xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0) | |
378 | |
379 movdqa XMMWORD [wk(2)], xmm1 | |
380 movdqa XMMWORD [wk(3)], xmm2 | |
381 | |
382 .upsample: | |
383 ; -- process the upper row | |
384 | |
; -- reload the parked intermediates of the current block (ebx = GOT base here)
385 movdqa xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD] | |
386 movdqa xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD] | |
387 | |
; -- build the neighbour vectors that straddle the low/high half boundary
388 movdqa xmm0,xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7) | |
389 movdqa xmm4,xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15) | |
390 psrldq xmm0,2 ; xmm0=( 1 2 3 4 5 6 7 --) | |
391 pslldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8) | |
392 movdqa xmm5,xmm7 | |
393 movdqa xmm6,xmm3 | |
394 psrldq xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --) | |
395 pslldq xmm6,2 ; xmm6=(-- 8 9 10 11 12 13 14) | |
396 | |
397 por xmm0,xmm4 ; xmm0=( 1 2 3 4 5 6 7 8) | |
398 por xmm5,xmm6 ; xmm5=( 7 8 9 10 11 12 13 14) | |
399 | |
; -- build the neighbour vectors that need a word from outside this block
400 movdqa xmm1,xmm7 | |
401 movdqa xmm2,xmm3 | |
402 pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6) | |
403 psrldq xmm2,2 ; xmm2=( 9 10 11 12 13 14 15 --) | |
404 movdqa xmm4,xmm3 | |
405 psrldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --) | |
406 | |
407 por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6) | |
408 por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16) | |
409 | |
; -- this block's last intermediate becomes the next block's left neighbour
410 movdqa XMMWORD [wk(0)], xmm4 | |
411 | |
; -- centre*3; left neighbours + 8 and right neighbours + 7 (rounding terms)
412 pmullw xmm7,[GOTOFF(ebx,PW_THREE)] | |
413 pmullw xmm3,[GOTOFF(ebx,PW_THREE)] | |
414 paddw xmm1,[GOTOFF(ebx,PW_EIGHT)] | |
415 paddw xmm5,[GOTOFF(ebx,PW_EIGHT)] | |
416 paddw xmm0,[GOTOFF(ebx,PW_SEVEN)] | |
417 paddw xmm2,[GOTOFF(ebx,PW_SEVEN)] | |
418 | |
; -- even outputs = (3*c + left + 8) >> 4, odd outputs = (3*c + right + 7) >> 4
419 paddw xmm1,xmm7 | |
420 paddw xmm5,xmm3 | |
421 psrlw xmm1,4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14) | |
422 psrlw xmm5,4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30) | |
423 paddw xmm0,xmm7 | |
424 paddw xmm2,xmm3 | |
425 psrlw xmm0,4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15) | |
426 psrlw xmm2,4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31) | |
427 | |
; -- interleave: odd results into the high byte of each word
428 psllw xmm0,BYTE_BIT | |
429 psllw xmm2,BYTE_BIT | |
430 por xmm1,xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15) | |
431 por xmm5,xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31) | |
432 | |
; -- final samples overwrite the parked intermediates of this block
433 movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 | |
434 movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5 | |
435 | |
436 ; -- process the lower row | |
437 | |
; -- same horizontal pass as above, on Int1 (edi) with wk(1)/wk(3)
438 movdqa xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD] | |
439 movdqa xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD] | |
440 | |
441 movdqa xmm7,xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7) | |
442 movdqa xmm3,xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15) | |
443 psrldq xmm7,2 ; xmm7=( 1 2 3 4 5 6 7 --) | |
444 pslldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8) | |
445 movdqa xmm0,xmm6 | |
446 movdqa xmm2,xmm4 | |
447 psrldq xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --) | |
448 pslldq xmm2,2 ; xmm2=(-- 8 9 10 11 12 13 14) | |
449 | |
450 por xmm7,xmm3 ; xmm7=( 1 2 3 4 5 6 7 8) | |
451 por xmm0,xmm2 ; xmm0=( 7 8 9 10 11 12 13 14) | |
452 | |
453 movdqa xmm1,xmm6 | |
454 movdqa xmm5,xmm4 | |
455 pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6) | |
456 psrldq xmm5,2 ; xmm5=( 9 10 11 12 13 14 15 --) | |
457 movdqa xmm3,xmm4 | |
458 psrldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --) | |
459 | |
460 por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6) | |
461 por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16) | |
462 | |
463 movdqa XMMWORD [wk(1)], xmm3 | |
464 | |
465 pmullw xmm6,[GOTOFF(ebx,PW_THREE)] | |
466 pmullw xmm4,[GOTOFF(ebx,PW_THREE)] | |
467 paddw xmm1,[GOTOFF(ebx,PW_EIGHT)] | |
468 paddw xmm0,[GOTOFF(ebx,PW_EIGHT)] | |
469 paddw xmm7,[GOTOFF(ebx,PW_SEVEN)] | |
470 paddw xmm5,[GOTOFF(ebx,PW_SEVEN)] | |
471 | |
472 paddw xmm1,xmm6 | |
473 paddw xmm0,xmm4 | |
474 psrlw xmm1,4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14) | |
475 psrlw xmm0,4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30) | |
476 paddw xmm7,xmm6 | |
477 paddw xmm5,xmm4 | |
478 psrlw xmm7,4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15) | |
479 psrlw xmm5,4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31) | |
480 | |
481 psllw xmm7,BYTE_BIT | |
482 psllw xmm5,BYTE_BIT | |
483 por xmm1,xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15) | |
484 por xmm0,xmm5 ; xmm0=Out1H=(16 17 18 ... 29 30 31) | |
485 | |
486 movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1 | |
487 movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0 | |
488 | |
; -- ebx goes back to being inptr0 before the pointers are advanced
489 poppic ebx | |
490 | |
; -- one input block consumed per row, two output blocks produced per row
491 sub eax, byte SIZEOF_XMMWORD | |
492 add ecx, byte 1*SIZEOF_XMMWORD ; inptr1(above) | |
493 add ebx, byte 1*SIZEOF_XMMWORD ; inptr0 | |
494 add esi, byte 1*SIZEOF_XMMWORD ; inptr1(below) | |
495 add edx, byte 2*SIZEOF_XMMWORD ; outptr0 | |
496 add edi, byte 2*SIZEOF_XMMWORD ; outptr1 | |
; -- more than one block left -> .columnloop; exactly one -> .columnloop_last
497 cmp eax, byte SIZEOF_XMMWORD | |
498 ja near .columnloop | |
499 test eax,eax | |
500 jnz near .columnloop_last | |
501 | |
; -- row finished: restore the row-pointer cursors, rowctr and colctr
502 pop esi | |
503 pop edi | |
504 pop ecx | |
505 pop eax | |
506 | |
; -- one input row yields two output rows, so rowctr drops by two
507 add esi, byte 1*SIZEOF_JSAMPROW ; input_data | |
508 add edi, byte 2*SIZEOF_JSAMPROW ; output_data | |
509 sub ecx, byte 2 ; rowctr | |
510 jg near .rowloop | |
511 | |
512 .return: | |
; -- epilogue: restore edi/esi/ebx, drop the scratch area by resetting esp to
; -- the aligned ebp, then reload the unaligned frame pointer stored at [ebp]
513 pop edi | |
514 pop esi | |
515 ; pop edx ; need not be preserved | |
516 ; pop ecx ; need not be preserved | |
517 pop ebx | |
518 mov esp,ebp ; esp <- aligned ebp | |
519 pop esp ; esp <- original ebp | |
520 pop ebp | |
521 ret | |
522 | |
523 ; -------------------------------------------------------------------------- | |
524 ; | |
525 ; Fast processing for the common case of 2:1 horizontal and 1:1 vertical. | |
526 ; It's still a box filter. | |
527 ; | |
528 ; GLOBAL(void) | |
529 ; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor, | |
530 ; JDIMENSION output_width, | |
531 ; JSAMPARRAY input_data, | |
532 ; JSAMPARRAY * output_data_ptr); | |
533 ; | |
534 | |
; jsimd_h2v1_upsample_sse2
;
; Plain pixel replication: every input byte is written twice in a row, one
; output row per input row.
;
; Register roles inside the loops:
;     edx = output width rounded up to a multiple of 2*SIZEOF_XMMWORD
;     eax = colctr (output bytes still to produce in this row)
;     ecx = rowctr
;     esi = input_data row-pointer cursor / inptr inside a row
;     edi = output_data row-pointer cursor / outptr inside a row
;
; Because the width is rounded up, whole XMM blocks are read and written even
; when output_width is not a multiple of the block size.  All row accesses use
; movdqa, which faults on addresses that are not 16-byte aligned.
; NOTE(review): SIZEOF_XMMWORD is presumably 16; alignx comes from
; jsimdext.inc (not visible).
;
; Stack-argument offsets relative to the frame pointer built in the prologue.
535 %define max_v_samp(b) (b)+8 ; int max_v_samp_factor | |
536 %define output_width(b) (b)+12 ; JDIMENSION output_width | |
537 %define input_data(b) (b)+16 ; JSAMPARRAY input_data | |
538 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr | |
539 | |
540 align 16 | |
541 global EXTN(jsimd_h2v1_upsample_sse2) PRIVATE | |
542 | |
543 EXTN(jsimd_h2v1_upsample_sse2): | |
; -- prologue: conventional ebp frame; only esi and edi need saving
544 push ebp | |
545 mov ebp,esp | |
546 ; push ebx ; unused | |
547 ; push ecx ; need not be preserved | |
548 ; push edx ; need not be preserved | |
549 push esi | |
550 push edi | |
551 | |
; -- round the output width up to a whole pair of XMM blocks; the jz tests the
; -- result of the "and", so a zero width returns immediately
552 mov edx, JDIMENSION [output_width(ebp)] | |
553 add edx, byte (2*SIZEOF_XMMWORD)-1 | |
554 and edx, byte -(2*SIZEOF_XMMWORD) | |
555 jz short .return | |
556 | |
557 mov ecx, INT [max_v_samp(ebp)] ; rowctr | |
558 test ecx,ecx | |
559 jz short .return | |
560 | |
; -- output_data_ptr is a pointer to the row-pointer array, hence the double load
561 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data | |
562 mov edi, POINTER [output_data_ptr(ebp)] | |
563 mov edi, JSAMPARRAY [edi] ; output_data | |
564 alignx 16,7 | |
565 .rowloop: | |
; -- save the row-pointer cursors; esi/edi become sample pointers below
566 push edi | |
567 push esi | |
568 | |
569 mov esi, JSAMPROW [esi] ; inptr | |
570 mov edi, JSAMPROW [edi] ; outptr | |
571 mov eax,edx ; colctr | |
572 alignx 16,7 | |
573 .columnloop: | |
; -- the loop body is unrolled twice: each half turns one input block into
; -- two output blocks and leaves the row as soon as colctr reaches zero
574 | |
575 movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] | |
576 | |
; -- unpacking a register with itself duplicates every byte
577 movdqa xmm1,xmm0 | |
578 punpcklbw xmm0,xmm0 | |
579 punpckhbw xmm1,xmm1 | |
580 | |
581 movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 | |
582 movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1 | |
583 | |
584 sub eax, byte 2*SIZEOF_XMMWORD | |
585 jz short .nextrow | |
586 | |
; -- second half of the unrolled body: next input block
587 movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] | |
588 | |
589 movdqa xmm3,xmm2 | |
590 punpcklbw xmm2,xmm2 | |
591 punpckhbw xmm3,xmm3 | |
592 | |
593 movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2 | |
594 movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3 | |
595 | |
596 sub eax, byte 2*SIZEOF_XMMWORD | |
597 jz short .nextrow | |
598 | |
; -- two input blocks consumed, four output blocks produced
599 add esi, byte 2*SIZEOF_XMMWORD ; inptr | |
600 add edi, byte 4*SIZEOF_XMMWORD ; outptr | |
601 jmp short .columnloop | |
602 alignx 16,7 | |
603 | |
604 .nextrow: | |
; -- restore the row-pointer cursors and step to the next input/output row
605 pop esi | |
606 pop edi | |
607 | |
608 add esi, byte SIZEOF_JSAMPROW ; input_data | |
609 add edi, byte SIZEOF_JSAMPROW ; output_data | |
610 dec ecx ; rowctr | |
611 jg short .rowloop | |
612 | |
613 .return: | |
; -- epilogue: restore saved registers in reverse order of the prologue
614 pop edi | |
615 pop esi | |
616 ; pop edx ; need not be preserved | |
617 ; pop ecx ; need not be preserved | |
618 ; pop ebx ; unused | |
619 pop ebp | |
620 ret | |
621 | |
622 ; -------------------------------------------------------------------------- | |
623 ; | |
624 ; Fast processing for the common case of 2:1 horizontal and 2:1 vertical. | |
625 ; It's still a box filter. | |
626 ; | |
627 ; GLOBAL(void) | |
628 ; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor, | |
629 ; JDIMENSION output_width, | |
630 ; JSAMPARRAY input_data, | |
631 ; JSAMPARRAY * output_data_ptr); | |
632 ; | |
633 | |
; jsimd_h2v2_upsample_sse2
;
; Plain pixel replication in both directions: every input byte is written
; twice in a row, and the resulting row is stored to two consecutive output
; rows.
;
; Register roles inside the loops:
;     edx = output width rounded up to a multiple of 2*SIZEOF_XMMWORD
;     eax = colctr (output bytes still to produce per output row)
;     ecx = rowctr (decremented by two per input row)
;     esi = input_data row-pointer cursor / inptr inside a row
;     ebx = outptr0, edi = output_data row-pointer cursor / outptr1
;
; Because the width is rounded up, whole XMM blocks are read and written even
; when output_width is not a multiple of the block size.  All row accesses use
; movdqa, which faults on addresses that are not 16-byte aligned.
; NOTE(review): SIZEOF_XMMWORD is presumably 16; alignx comes from
; jsimdext.inc (not visible).
;
; Stack-argument offsets relative to the frame pointer built in the prologue.
634 %define max_v_samp(b) (b)+8 ; int max_v_samp_factor | |
635 %define output_width(b) (b)+12 ; JDIMENSION output_width | |
636 %define input_data(b) (b)+16 ; JSAMPARRAY input_data | |
637 %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr | |
638 | |
639 align 16 | |
640 global EXTN(jsimd_h2v2_upsample_sse2) PRIVATE | |
641 | |
642 EXTN(jsimd_h2v2_upsample_sse2): | |
; -- prologue: conventional ebp frame; ebx is saved because it is used as the
; -- first output-row pointer
643 push ebp | |
644 mov ebp,esp | |
645 push ebx | |
646 ; push ecx ; need not be preserved | |
647 ; push edx ; need not be preserved | |
648 push esi | |
649 push edi | |
650 | |
; -- round the output width up to a whole pair of XMM blocks; the jz tests the
; -- result of the "and", so a zero width returns immediately
651 mov edx, JDIMENSION [output_width(ebp)] | |
652 add edx, byte (2*SIZEOF_XMMWORD)-1 | |
653 and edx, byte -(2*SIZEOF_XMMWORD) | |
654 jz near .return | |
655 | |
656 mov ecx, INT [max_v_samp(ebp)] ; rowctr | |
657 test ecx,ecx | |
658 jz near .return | |
659 | |
; -- output_data_ptr is a pointer to the row-pointer array, hence the double load
660 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data | |
661 mov edi, POINTER [output_data_ptr(ebp)] | |
662 mov edi, JSAMPARRAY [edi] ; output_data | |
663 alignx 16,7 | |
664 .rowloop: | |
; -- save the row-pointer cursors; esi/edi become sample pointers below
665 push edi | |
666 push esi | |
667 | |
668 mov esi, JSAMPROW [esi] ; inptr | |
669 mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 | |
670 mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 | |
671 mov eax,edx ; colctr | |
672 alignx 16,7 | |
673 .columnloop: | |
; -- the loop body is unrolled twice: each half turns one input block into two
; -- output blocks, stored identically to both output rows
674 | |
675 movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] | |
676 | |
; -- unpacking a register with itself duplicates every byte
677 movdqa xmm1,xmm0 | |
678 punpcklbw xmm0,xmm0 | |
679 punpckhbw xmm1,xmm1 | |
680 | |
681 movdqa XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0 | |
682 movdqa XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1 | |
683 movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 | |
684 movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1 | |
685 | |
686 sub eax, byte 2*SIZEOF_XMMWORD | |
687 jz short .nextrow | |
688 | |
; -- second half of the unrolled body: next input block
689 movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] | |
690 | |
691 movdqa xmm3,xmm2 | |
692 punpcklbw xmm2,xmm2 | |
693 punpckhbw xmm3,xmm3 | |
694 | |
695 movdqa XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2 | |
696 movdqa XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3 | |
697 movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2 | |
698 movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3 | |
699 | |
700 sub eax, byte 2*SIZEOF_XMMWORD | |
701 jz short .nextrow | |
702 | |
; -- two input blocks consumed, four output blocks produced in each output row
703 add esi, byte 2*SIZEOF_XMMWORD ; inptr | |
704 add ebx, byte 4*SIZEOF_XMMWORD ; outptr0 | |
705 add edi, byte 4*SIZEOF_XMMWORD ; outptr1 | |
706 jmp short .columnloop | |
707 alignx 16,7 | |
708 | |
709 .nextrow: | |
; -- restore the row-pointer cursors; one input row yields two output rows,
; -- so rowctr drops by two
710 pop esi | |
711 pop edi | |
712 | |
713 add esi, byte 1*SIZEOF_JSAMPROW ; input_data | |
714 add edi, byte 2*SIZEOF_JSAMPROW ; output_data | |
715 sub ecx, byte 2 ; rowctr | |
716 jg short .rowloop | |
717 | |
718 .return: | |
; -- epilogue: restore saved registers in reverse order of the prologue
719 pop edi | |
720 pop esi | |
721 ; pop edx ; need not be preserved | |
722 ; pop ecx ; need not be preserved | |
723 pop ebx | |
724 pop ebp | |
725 ret | |
726 | |
727 ; For some reason, the OS X linker does not honor the request to align the | |
728 ; segment unless we do this. | |
729 align 16 | |
OLD | NEW |