OLD | NEW |
| (Empty) |
1 ; | |
2 ; jdsamss2-64.asm - upsampling (64-bit SSE2) | |
3 ; | |
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB | |
5 ; Copyright 2009 D. R. Commander | |
6 ; | |
7 ; Based on | |
8 ; x86 SIMD extension for IJG JPEG library | |
9 ; Copyright (C) 1999-2006, MIYASAKA Masaru. | |
10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc | |
11 ; | |
12 ; This file should be assembled with NASM (Netwide Assembler), | |
13 ; can *not* be assembled with Microsoft's MASM or any compatible | |
14 ; assembler (including Borland's Turbo Assembler). | |
15 ; NASM is available from http://nasm.sourceforge.net/ or | |
16 ; http://sourceforge.net/project/showfiles.php?group_id=6208 | |
17 ; | |
18 ; [TAB8] | |
19 | |
20 %include "jsimdext.inc" | |
21 | |
22 ; -------------------------------------------------------------------------- | |
23 SECTION SEG_CONST | |
24 | |
        alignz  16
        global  EXTN(jconst_fancy_upsample_sse2) PRIVATE

EXTN(jconst_fancy_upsample_sse2):

; Word-broadcast constants for the triangle (linear-interpolation) filter.
; h2v1: even col = (3*s + s[-1] + 1) >> 2,  odd col = (3*s + s[+1] + 2) >> 2
; h2v2: even col = (3*p + p[-1] + 8) >> 4,  odd col = (3*p + p[+1] + 7) >> 4
;       (p = vertically prefiltered 3:1 sums; see jsimd_h2v2_fancy_upsample_sse2)
PW_ONE times 8 dw 1     ; rounding term for even output columns (h2v1)
PW_TWO times 8 dw 2     ; rounding term for odd output columns (h2v1)
PW_THREE times 8 dw 3   ; 3:1 weight of the nearer input sample
PW_SEVEN times 8 dw 7   ; rounding term for odd output columns (h2v2)
PW_EIGHT times 8 dw 8   ; rounding term for even output columns (h2v2)

        alignz  16
37 | |
38 ; -------------------------------------------------------------------------- | |
39 SECTION SEG_TEXT | |
40 BITS 64 | |
41 ; | |
42 ; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical. | |
43 ; | |
44 ; The upsampling algorithm is linear interpolation between pixel centers, | |
45 ; also known as a "triangle filter". This is a good compromise between | |
46 ; speed and visual quality. The centers of the output pixels are 1/4 and 3/4 | |
47 ; of the way between input pixel centers. | |
48 ; | |
49 ; GLOBAL(void) | |
50 ; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor, | |
51 ; JDIMENSION downsampled_width, | |
52 ; JSAMPARRAY input_data, | |
53 ; JSAMPARRAY * output_data_ptr); | |
54 ; | |
55 | |
56 ; r10 = int max_v_samp_factor | |
57 ; r11 = JDIMENSION downsampled_width | |
58 ; r12 = JSAMPARRAY input_data | |
59 ; r13 = JSAMPARRAY * output_data_ptr | |
60 | |
        align   16
        global  EXTN(jsimd_h2v1_fancy_upsample_sse2) PRIVATE

EXTN(jsimd_h2v1_fancy_upsample_sse2):
        push rbp
        mov rax,rsp
        mov rbp,rsp
        collect_args                    ; args -> r10-r13 (macro in jsimdext.inc)

        mov eax, r11d                   ; colctr = downsampled_width
        test rax,rax
        jz near .return                 ; zero-width image: nothing to do

        mov ecx, r10d                   ; rowctr = max_v_samp_factor
        test rcx,rcx
        jz near .return

        mov rsi, r12                    ; input_data
        mov rdi, r13
        mov rdi, JSAMPARRAY [rdi]       ; output_data
.rowloop:
        push rax                        ; colctr
        push rdi
        push rsi

        mov rsi, JSAMPROW [rsi]         ; inptr
        mov rdi, JSAMPROW [rdi]         ; outptr

        ; If the width is not a multiple of SIZEOF_XMMWORD, replicate the last
        ; sample one slot past the end so the final 16-byte load reads valid data.
        test rax, SIZEOF_XMMWORD-1
        jz short .skip
        mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
        mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
.skip:
        ; xmm7 = left-neighbor carry: byte 0 holds sample 0, supplying the
        ; nonexistent sample -1 for the first column block (edge replication).
        pxor xmm0,xmm0                  ; xmm0=(all 0's)
        pcmpeqb xmm7,xmm7
        psrldq xmm7,(SIZEOF_XMMWORD-1)
        pand xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]

        add rax, byte SIZEOF_XMMWORD-1
        and rax, byte -SIZEOF_XMMWORD   ; colctr rounded up to a whole XMMWORD
        cmp rax, byte SIZEOF_XMMWORD
        ja short .columnloop

.columnloop_last:
        ; Last block: right neighbor "sample 16" is sample 15 replicated.
        pcmpeqb xmm6,xmm6
        pslldq xmm6,(SIZEOF_XMMWORD-1)
        pand xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        jmp short .upsample

.columnloop:
        ; Interior block: right neighbor (sample 16) is byte 0 of the next
        ; XMMWORD, moved into byte position 15.
        movdqa xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        pslldq xmm6,(SIZEOF_XMMWORD-1)

.upsample:
        ; Triangle filter on 16 samples:
        ;   even outputs = (3*cur + left + 1) >> 2
        ;   odd outputs  = (3*cur + right + 2) >> 2
        movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqa xmm2,xmm1
        movdqa xmm3,xmm1                ; xmm1=( 0 1 2 ... 13 14 15)
        pslldq xmm2,1                   ; xmm2=(-- 0 1 ... 12 13 14)
        psrldq xmm3,1                   ; xmm3=( 1 2 3 ... 14 15 --)

        por xmm2,xmm7                   ; xmm2=(-1 0 1 ... 12 13 14)
        por xmm3,xmm6                   ; xmm3=( 1 2 3 ... 14 15 16)

        ; Save sample 15 as the left-neighbor carry for the next block.
        movdqa xmm7,xmm1
        psrldq xmm7,(SIZEOF_XMMWORD-1)  ; xmm7=(15 -- -- ... -- -- --)

        ; Widen bytes to words (max value 4*255 fits in 16 bits).
        movdqa xmm4,xmm1
        punpcklbw xmm1,xmm0             ; xmm1=( 0 1 2 3 4 5 6 7)
        punpckhbw xmm4,xmm0             ; xmm4=( 8 9 10 11 12 13 14 15)
        movdqa xmm5,xmm2
        punpcklbw xmm2,xmm0             ; xmm2=(-1 0 1 2 3 4 5 6)
        punpckhbw xmm5,xmm0             ; xmm5=( 7 8 9 10 11 12 13 14)
        movdqa xmm6,xmm3
        punpcklbw xmm3,xmm0             ; xmm3=( 1 2 3 4 5 6 7 8)
        punpckhbw xmm6,xmm0             ; xmm6=( 9 10 11 12 13 14 15 16)

        pmullw xmm1,[rel PW_THREE]
        pmullw xmm4,[rel PW_THREE]
        paddw xmm2,[rel PW_ONE]
        paddw xmm5,[rel PW_ONE]
        paddw xmm3,[rel PW_TWO]
        paddw xmm6,[rel PW_TWO]

        paddw xmm2,xmm1
        paddw xmm5,xmm4
        psrlw xmm2,2                    ; xmm2=OutLE=( 0 2 4 6 8 10 12 14)
        psrlw xmm5,2                    ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
        paddw xmm3,xmm1
        paddw xmm6,xmm4
        psrlw xmm3,2                    ; xmm3=OutLO=( 1 3 5 7 9 11 13 15)
        psrlw xmm6,2                    ; xmm6=OutHO=(17 19 21 23 25 27 29 31)

        ; Re-interleave even/odd word results back into output bytes.
        psllw xmm3,BYTE_BIT
        psllw xmm6,BYTE_BIT
        por xmm2,xmm3                   ; xmm2=OutL=( 0 1 2 ... 13 14 15)
        por xmm5,xmm6                   ; xmm5=OutH=(16 17 18 ... 29 30 31)

        movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
        movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5

        sub rax, byte SIZEOF_XMMWORD
        add rsi, byte 1*SIZEOF_XMMWORD  ; inptr
        add rdi, byte 2*SIZEOF_XMMWORD  ; outptr
        cmp rax, byte SIZEOF_XMMWORD
        ja near .columnloop
        test eax,eax
        jnz near .columnloop_last

        pop rsi
        pop rdi
        pop rax

        add rsi, byte SIZEOF_JSAMPROW   ; input_data
        add rdi, byte SIZEOF_JSAMPROW   ; output_data
        dec rcx                         ; rowctr
        jg near .rowloop

.return:
        uncollect_args
        pop rbp
        ret
182 | |
183 ; -------------------------------------------------------------------------- | |
184 ; | |
185 ; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical. | |
186 ; Again a triangle filter; see comments for h2v1 case, above. | |
187 ; | |
188 ; GLOBAL(void) | |
189 ; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor, | |
190 ; JDIMENSION downsampled_width, | |
191 ; JSAMPARRAY input_data, | |
192 ; JSAMPARRAY * output_data_ptr); | |
193 ; | |
194 | |
195 ; r10 = int max_v_samp_factor | |
196 ; r11 = JDIMENSION downsampled_width | |
197 ; r12 = JSAMPARRAY input_data | |
198 ; r13 = JSAMPARRAY * output_data_ptr | |
199 | |
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 4

        align   16
        global  EXTN(jsimd_h2v2_fancy_upsample_sse2) PRIVATE

EXTN(jsimd_h2v2_fancy_upsample_sse2):
        push rbp
        mov rax,rsp                     ; rax = original rbp
        ; After the push above, rsp is 16-byte aligned per the ABI, so this
        ; reserves an aligned slot holding the pre-alignment stack pointer.
        sub rsp, byte 4
        and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
        mov [rsp],rax
        mov rbp,rsp                     ; rbp = aligned rbp
        lea rsp, [wk(0)]                ; allocate wk(0)..wk(3) scratch XMMWORDs
        collect_args
        push rbx

        mov rax, r11                    ; colctr
        test rax,rax
        jz near .return

        mov rcx, r10                    ; rowctr
        test rcx,rcx
        jz near .return

        mov rsi, r12                    ; input_data
        mov rdi, r13
        mov rdi, JSAMPARRAY [rdi]       ; output_data
.rowloop:
        push rax                        ; colctr
        push rcx
        push rdi
        push rsi

        ; Each iteration consumes three input rows (above/current/below) and
        ; produces two output rows.
        mov rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW] ; inptr1(above)
        mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
        mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1(below)
        mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
        mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1

        ; Pad each of the three rows with a replicated last sample when the
        ; width is not a multiple of SIZEOF_XMMWORD.
        test rax, SIZEOF_XMMWORD-1
        jz short .skip
        push rdx
        mov dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
        mov JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
        mov dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
        mov JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
        mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
        mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
        pop rdx
.skip:
        ; -- process the first column block

        ; Vertical pass: Int0 = 3*row[0] + row[-1], Int1 = 3*row[0] + row[+1]
        ; in word precision (max 4*255, no overflow).
        movdqa xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0]
        movdqa xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0]
        movdqa xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0]

        pxor xmm3,xmm3                  ; xmm3=(all 0's)
        movdqa xmm4,xmm0
        punpcklbw xmm0,xmm3             ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
        punpckhbw xmm4,xmm3             ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
        movdqa xmm5,xmm1
        punpcklbw xmm1,xmm3             ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
        punpckhbw xmm5,xmm3             ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
        movdqa xmm6,xmm2
        punpcklbw xmm2,xmm3             ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
        punpckhbw xmm6,xmm3             ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)

        pmullw xmm0,[rel PW_THREE]
        pmullw xmm4,[rel PW_THREE]

        pcmpeqb xmm7,xmm7
        psrldq xmm7,(SIZEOF_XMMWORD-2)  ; xmm7 = word mask selecting element 0

        paddw xmm1,xmm0                 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
        paddw xmm5,xmm4                 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
        paddw xmm2,xmm0                 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
        paddw xmm6,xmm4                 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)

        ; The output rows double as scratch space for the intermediate sums.
        movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save
        movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data
        movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
        movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6

        ; wk(0)/wk(1): left-edge neighbor (element -1 = element 0 replicated).
        pand xmm1,xmm7                  ; xmm1=( 0 -- -- -- -- -- -- --)
        pand xmm2,xmm7                  ; xmm2=( 0 -- -- -- -- -- -- --)

        movdqa XMMWORD [wk(0)], xmm1
        movdqa XMMWORD [wk(1)], xmm2

        add rax, byte SIZEOF_XMMWORD-1
        and rax, byte -SIZEOF_XMMWORD   ; colctr rounded up to a whole XMMWORD
        cmp rax, byte SIZEOF_XMMWORD
        ja short .columnloop

.columnloop_last:
        ; -- process the last column block

        ; wk(2)/wk(3): right-edge neighbor (element 16 = element 15 replicated).
        pcmpeqb xmm1,xmm1
        pslldq xmm1,(SIZEOF_XMMWORD-2)
        movdqa xmm2,xmm1

        pand xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
        pand xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]

        movdqa XMMWORD [wk(2)], xmm1    ; xmm1=(-- -- -- -- -- -- -- 15)
        movdqa XMMWORD [wk(3)], xmm2    ; xmm2=(-- -- -- -- -- -- -- 15)

        jmp near .upsample

.columnloop:
        ; -- process the next column block

        movdqa xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1]
        movdqa xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1]
        movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1]

        pxor xmm3,xmm3                  ; xmm3=(all 0's)
        movdqa xmm4,xmm0
        punpcklbw xmm0,xmm3             ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
        punpckhbw xmm4,xmm3             ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
        movdqa xmm5,xmm1
        punpcklbw xmm1,xmm3             ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
        punpckhbw xmm5,xmm3             ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
        movdqa xmm6,xmm2
        punpcklbw xmm2,xmm3             ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
        punpckhbw xmm6,xmm3             ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)

        pmullw xmm0,[rel PW_THREE]
        pmullw xmm4,[rel PW_THREE]

        paddw xmm1,xmm0                 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
        paddw xmm5,xmm4                 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
        paddw xmm2,xmm0                 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
        paddw xmm6,xmm4                 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)

        movdqa XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save
        movdqa XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data
        movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
        movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6

        ; wk(2)/wk(3): element 0 of the next block becomes the current block's
        ; right neighbor (element 16).
        pslldq xmm1,(SIZEOF_XMMWORD-2)  ; xmm1=(-- -- -- -- -- -- -- 0)
        pslldq xmm2,(SIZEOF_XMMWORD-2)  ; xmm2=(-- -- -- -- -- -- -- 0)

        movdqa XMMWORD [wk(2)], xmm1
        movdqa XMMWORD [wk(3)], xmm2

.upsample:
        ; -- process the upper row

        ; Horizontal pass on the intermediate sums:
        ;   even outputs = (3*cur + prev + 8) >> 4
        ;   odd outputs  = (3*cur + next + 7) >> 4
        movdqa xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
        movdqa xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]

        movdqa xmm0,xmm7                ; xmm7=Int0L=( 0 1 2 3 4 5 6 7)
        movdqa xmm4,xmm3                ; xmm3=Int0H=( 8 9 10 11 12 13 14 15)
        psrldq xmm0,2                   ; xmm0=( 1 2 3 4 5 6 7 --)
        pslldq xmm4,(SIZEOF_XMMWORD-2)  ; xmm4=(-- -- -- -- -- -- -- 8)
        movdqa xmm5,xmm7
        movdqa xmm6,xmm3
        psrldq xmm5,(SIZEOF_XMMWORD-2)  ; xmm5=( 7 -- -- -- -- -- -- --)
        pslldq xmm6,2                   ; xmm6=(-- 8 9 10 11 12 13 14)

        por xmm0,xmm4                   ; xmm0=( 1 2 3 4 5 6 7 8)
        por xmm5,xmm6                   ; xmm5=( 7 8 9 10 11 12 13 14)

        movdqa xmm1,xmm7
        movdqa xmm2,xmm3
        pslldq xmm1,2                   ; xmm1=(-- 0 1 2 3 4 5 6)
        psrldq xmm2,2                   ; xmm2=( 9 10 11 12 13 14 15 --)
        movdqa xmm4,xmm3
        psrldq xmm4,(SIZEOF_XMMWORD-2)  ; xmm4=(15 -- -- -- -- -- -- --)

        por xmm1, XMMWORD [wk(0)]       ; xmm1=(-1 0 1 2 3 4 5 6)
        por xmm2, XMMWORD [wk(2)]       ; xmm2=( 9 10 11 12 13 14 15 16)

        movdqa XMMWORD [wk(0)], xmm4    ; element 15 = next block's left neighbor

        pmullw xmm7,[rel PW_THREE]
        pmullw xmm3,[rel PW_THREE]
        paddw xmm1,[rel PW_EIGHT]
        paddw xmm5,[rel PW_EIGHT]
        paddw xmm0,[rel PW_SEVEN]
        paddw xmm2,[rel PW_SEVEN]

        paddw xmm1,xmm7
        paddw xmm5,xmm3
        psrlw xmm1,4                    ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14)
        psrlw xmm5,4                    ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
        paddw xmm0,xmm7
        paddw xmm2,xmm3
        psrlw xmm0,4                    ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15)
        psrlw xmm2,4                    ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)

        ; Re-interleave even/odd word results back into output bytes.
        psllw xmm0,BYTE_BIT
        psllw xmm2,BYTE_BIT
        por xmm1,xmm0                   ; xmm1=Out0L=( 0 1 2 ... 13 14 15)
        por xmm5,xmm2                   ; xmm5=Out0H=(16 17 18 ... 29 30 31)

        movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1
        movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5

        ; -- process the lower row (same computation on the Int1 sums)

        movdqa xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
        movdqa xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]

        movdqa xmm7,xmm6                ; xmm6=Int1L=( 0 1 2 3 4 5 6 7)
        movdqa xmm3,xmm4                ; xmm4=Int1H=( 8 9 10 11 12 13 14 15)
        psrldq xmm7,2                   ; xmm7=( 1 2 3 4 5 6 7 --)
        pslldq xmm3,(SIZEOF_XMMWORD-2)  ; xmm3=(-- -- -- -- -- -- -- 8)
        movdqa xmm0,xmm6
        movdqa xmm2,xmm4
        psrldq xmm0,(SIZEOF_XMMWORD-2)  ; xmm0=( 7 -- -- -- -- -- -- --)
        pslldq xmm2,2                   ; xmm2=(-- 8 9 10 11 12 13 14)

        por xmm7,xmm3                   ; xmm7=( 1 2 3 4 5 6 7 8)
        por xmm0,xmm2                   ; xmm0=( 7 8 9 10 11 12 13 14)

        movdqa xmm1,xmm6
        movdqa xmm5,xmm4
        pslldq xmm1,2                   ; xmm1=(-- 0 1 2 3 4 5 6)
        psrldq xmm5,2                   ; xmm5=( 9 10 11 12 13 14 15 --)
        movdqa xmm3,xmm4
        psrldq xmm3,(SIZEOF_XMMWORD-2)  ; xmm3=(15 -- -- -- -- -- -- --)

        por xmm1, XMMWORD [wk(1)]       ; xmm1=(-1 0 1 2 3 4 5 6)
        por xmm5, XMMWORD [wk(3)]       ; xmm5=( 9 10 11 12 13 14 15 16)

        movdqa XMMWORD [wk(1)], xmm3    ; element 15 = next block's left neighbor

        pmullw xmm6,[rel PW_THREE]
        pmullw xmm4,[rel PW_THREE]
        paddw xmm1,[rel PW_EIGHT]
        paddw xmm0,[rel PW_EIGHT]
        paddw xmm7,[rel PW_SEVEN]
        paddw xmm5,[rel PW_SEVEN]

        paddw xmm1,xmm6
        paddw xmm0,xmm4
        psrlw xmm1,4                    ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14)
        psrlw xmm0,4                    ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
        paddw xmm7,xmm6
        paddw xmm5,xmm4
        psrlw xmm7,4                    ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15)
        psrlw xmm5,4                    ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)

        psllw xmm7,BYTE_BIT
        psllw xmm5,BYTE_BIT
        por xmm1,xmm7                   ; xmm1=Out1L=( 0 1 2 ... 13 14 15)
        por xmm0,xmm5                   ; xmm0=Out1H=(16 17 18 ... 29 30 31)

        movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
        movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0

        sub rax, byte SIZEOF_XMMWORD
        add rcx, byte 1*SIZEOF_XMMWORD  ; inptr1(above)
        add rbx, byte 1*SIZEOF_XMMWORD  ; inptr0
        add rsi, byte 1*SIZEOF_XMMWORD  ; inptr1(below)
        add rdx, byte 2*SIZEOF_XMMWORD  ; outptr0
        add rdi, byte 2*SIZEOF_XMMWORD  ; outptr1
        cmp rax, byte SIZEOF_XMMWORD
        ja near .columnloop
        test rax,rax
        jnz near .columnloop_last

        pop rsi
        pop rdi
        pop rcx
        pop rax

        add rsi, byte 1*SIZEOF_JSAMPROW ; input_data
        add rdi, byte 2*SIZEOF_JSAMPROW ; output_data
        sub rcx, byte 2                 ; rowctr
        jg near .rowloop

.return:
        pop rbx
        uncollect_args
        mov rsp,rbp                     ; rsp <- aligned rbp
        pop rsp                         ; rsp <- original rbp
        pop rbp
        ret
482 | |
483 ; -------------------------------------------------------------------------- | |
484 ; | |
485 ; Fast processing for the common case of 2:1 horizontal and 1:1 vertical. | |
486 ; It's still a box filter. | |
487 ; | |
488 ; GLOBAL(void) | |
489 ; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor, | |
490 ; JDIMENSION output_width, | |
491 ; JSAMPARRAY input_data, | |
492 ; JSAMPARRAY * output_data_ptr); | |
493 ; | |
494 | |
495 ; r10 = int max_v_samp_factor | |
496 ; r11 = JDIMENSION output_width | |
497 ; r12 = JSAMPARRAY input_data | |
498 ; r13 = JSAMPARRAY * output_data_ptr | |
499 | |
        align   16
        global  EXTN(jsimd_h2v1_upsample_sse2) PRIVATE

EXTN(jsimd_h2v1_upsample_sse2):
        push rbp
        mov rax,rsp
        mov rbp,rsp
        collect_args

        ; Round output_width up to a multiple of 2*SIZEOF_XMMWORD so only whole
        ; XMMWORDs are stored.  NOTE(review): assumes the library allocates
        ; padded output rows so the overrun past output_width is safe — confirm.
        mov edx, r11d
        add rdx, byte (2*SIZEOF_XMMWORD)-1
        and rdx, byte -(2*SIZEOF_XMMWORD)
        jz near .return

        mov rcx, r10                    ; rowctr
        test rcx,rcx
        jz short .return

        mov rsi, r12                    ; input_data
        mov rdi, r13
        mov rdi, JSAMPARRAY [rdi]       ; output_data
.rowloop:
        push rdi
        push rsi

        mov rsi, JSAMPROW [rsi]         ; inptr
        mov rdi, JSAMPROW [rdi]         ; outptr
        mov rax,rdx                     ; colctr
.columnloop:

        ; Box filter: duplicate every input byte horizontally by unpacking the
        ; register with itself.
        movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]

        movdqa xmm1,xmm0
        punpcklbw xmm0,xmm0
        punpckhbw xmm1,xmm1

        movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
        movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

        sub rax, byte 2*SIZEOF_XMMWORD
        jz short .nextrow

        ; Second XMMWORD of the unrolled-by-two column loop.
        movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]

        movdqa xmm3,xmm2
        punpcklbw xmm2,xmm2
        punpckhbw xmm3,xmm3

        movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
        movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3

        sub rax, byte 2*SIZEOF_XMMWORD
        jz short .nextrow

        add rsi, byte 2*SIZEOF_XMMWORD  ; inptr
        add rdi, byte 4*SIZEOF_XMMWORD  ; outptr
        jmp short .columnloop

.nextrow:
        pop rsi
        pop rdi

        add rsi, byte SIZEOF_JSAMPROW   ; input_data
        add rdi, byte SIZEOF_JSAMPROW   ; output_data
        dec rcx                         ; rowctr
        jg short .rowloop

.return:
        uncollect_args
        pop rbp
        ret
571 | |
572 ; -------------------------------------------------------------------------- | |
573 ; | |
574 ; Fast processing for the common case of 2:1 horizontal and 2:1 vertical. | |
575 ; It's still a box filter. | |
576 ; | |
577 ; GLOBAL(void) | |
; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor,
579 ; JDIMENSION output_width, | |
580 ; JSAMPARRAY input_data, | |
581 ; JSAMPARRAY * output_data_ptr); | |
582 ; | |
583 | |
584 ; r10 = int max_v_samp_factor | |
585 ; r11 = JDIMENSION output_width | |
586 ; r12 = JSAMPARRAY input_data | |
587 ; r13 = JSAMPARRAY * output_data_ptr | |
588 | |
        align   16
        global  EXTN(jsimd_h2v2_upsample_sse2) PRIVATE

EXTN(jsimd_h2v2_upsample_sse2):
        push rbp
        mov rax,rsp
        mov rbp,rsp
        collect_args
        push rbx

        ; Round output_width up to a multiple of 2*SIZEOF_XMMWORD so only whole
        ; XMMWORDs are stored.  NOTE(review): assumes the library allocates
        ; padded output rows so the overrun past output_width is safe — confirm.
        mov edx, r11d
        add rdx, byte (2*SIZEOF_XMMWORD)-1
        and rdx, byte -(2*SIZEOF_XMMWORD)
        jz near .return

        mov rcx, r10                    ; rowctr
        test rcx,rcx
        jz near .return

        mov rsi, r12                    ; input_data
        mov rdi, r13
        mov rdi, JSAMPARRAY [rdi]       ; output_data
.rowloop:
        push rdi
        push rsi

        ; One input row feeds two identical output rows (vertical doubling).
        mov rsi, JSAMPROW [rsi]                   ; inptr
        mov rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
        mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
        mov rax,rdx                     ; colctr
.columnloop:

        ; Box filter: duplicate each byte horizontally and store the result to
        ; both output rows.
        movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]

        movdqa xmm1,xmm0
        punpcklbw xmm0,xmm0
        punpckhbw xmm1,xmm1

        movdqa XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
        movdqa XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
        movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
        movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

        sub rax, byte 2*SIZEOF_XMMWORD
        jz short .nextrow

        ; Second XMMWORD of the unrolled-by-two column loop.
        movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]

        movdqa xmm3,xmm2
        punpcklbw xmm2,xmm2
        punpckhbw xmm3,xmm3

        movdqa XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
        movdqa XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
        movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
        movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3

        sub rax, byte 2*SIZEOF_XMMWORD
        jz short .nextrow

        add rsi, byte 2*SIZEOF_XMMWORD  ; inptr
        add rbx, byte 4*SIZEOF_XMMWORD  ; outptr0
        add rdi, byte 4*SIZEOF_XMMWORD  ; outptr1
        jmp short .columnloop

.nextrow:
        pop rsi
        pop rdi

        add rsi, byte 1*SIZEOF_JSAMPROW ; input_data
        add rdi, byte 2*SIZEOF_JSAMPROW ; output_data
        sub rcx, byte 2                 ; rowctr
        jg near .rowloop

.return:
        pop rbx
        uncollect_args
        pop rbp
        ret
668 | |
669 ; For some reason, the OS X linker does not honor the request to align the | |
670 ; segment unless we do this. | |
671 align 16 | |
OLD | NEW |