OLD | NEW |
| (Empty) |
1 ; | |
2 ; jcsamss2-64.asm - downsampling (64-bit SSE2) | |
3 ; | |
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB | |
5 ; Copyright 2009 D. R. Commander | |
6 ; | |
7 ; Based on | |
8 ; x86 SIMD extension for IJG JPEG library | |
9 ; Copyright (C) 1999-2006, MIYASAKA Masaru. | |
10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc | |
11 ; | |
12 ; This file should be assembled with NASM (Netwide Assembler), | |
13 ; can *not* be assembled with Microsoft's MASM or any compatible | |
14 ; assembler (including Borland's Turbo Assembler). | |
15 ; NASM is available from http://nasm.sourceforge.net/ or | |
16 ; http://sourceforge.net/project/showfiles.php?group_id=6208 | |
17 ; | |
18 ; [TAB8] | |
19 | |
20 %include "jsimdext.inc" | |
21 | |
22 ; -------------------------------------------------------------------------- | |
23 SECTION SEG_TEXT | |
24 BITS 64 | |
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Argument registers after collect_args (macro not defined in this file;
; presumably supplied by jsimdext.inc):
;
; r10 = JDIMENSION image_width       (read as r10d)
; r11 = int max_v_samp_factor        (read as r11d)
; r12 = JDIMENSION v_samp_factor     (read as r12d)
; r13 = JDIMENSION width_blocks      (read as r13d)
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data
;
; The 32-bit arguments are always read through their 32-bit register views:
; the upper halves of the argument registers are not defined by the calling
; convention for 32-bit parameters.
;
; Outline:
;   1. output_cols = width_blocks * 8.  Return immediately if it is zero.
;   2. expand_right_edge: for each of max_v_samp_factor input rows, replicate
;      the sample at column image_width-1 into columns
;      image_width .. output_cols*2-1 (skipped when no padding is needed).
;   3. For each of v_samp_factor rows, every output sample is
;      (in[2*i] + in[2*i+1] + bias) >> 1 with bias alternating 0,1,0,1,...
;
; Every row access uses movdqa, so the row pointers must be XMMWORD-aligned.
; When fewer than SIZEOF_XMMWORD output columns remain, a full XMMWORD is
; still stored to the output row (the unused half is zero).
; NOTE(review): this presumes the row buffers are padded accordingly -- the
; allocation is not visible here; confirm against the caller.
;
; Scratch registers written here: rax, rcx, rdx, rsi, rdi, xmm0-xmm3, xmm6,
; xmm7.  NOTE(review): any saving these require on a given ABI is presumably
; done by collect_args/uncollect_args -- confirm in jsimdext.inc.
;

        align   16
        global  EXTN(jsimd_h2v1_downsample_sse2) PRIVATE

EXTN(jsimd_h2v1_downsample_sse2):
        push    rbp
        mov     rax, rsp                ; rax = rsp at this point; presumably
                                        ;  consumed by collect_args
        mov     rbp, rsp
        collect_args

        mov     ecx, r13d
        shl     rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
        jz      near .return

        mov     edx, r10d               ; rdx = image_width (zero-extended)

        ; -- expand_right_edge

        push    rcx                     ; keep output_cols for the main loop
        shl     rcx, 1                  ; output_cols * 2
        sub     rcx, rdx                ; rcx = padding samples per row
        jle     short .expand_end

        ; max_v_samp_factor is a C int: use only the low 32 bits of r11.
        ; The 32-bit move zero-extends into rax, so the 64-bit dec/jg loop
        ; counter below is exact for every positive value.
        mov     eax, r11d
        test    eax, eax
        jle     short .expand_end

        cld                             ; rep stosb must store upward
        mov     rsi, r14                ; input_data
.expandloop:
        push    rax                     ; al is reused as the fill byte
        push    rcx                     ; rep stosb consumes rcx

        mov     rdi, JSAMPROW [rsi]
        add     rdi, rdx                ; rdi = &row[image_width]
        mov     al, JSAMPLE [rdi-1]     ; last real sample of the row

        rep stosb

        pop     rcx
        pop     rax

        add     rsi, byte SIZEOF_JSAMPROW
        dec     rax
        jg      short .expandloop

.expand_end:
        pop     rcx                     ; output_cols

        ; -- h2v1_downsample

        mov     eax, r12d               ; rowctr
        test    eax, eax
        jle     near .return

        mov     edx, 0x00010000         ; bias pattern (only edx is consumed)
        movd    xmm7, edx
        pcmpeqw xmm6, xmm6
        pshufd  xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
        psrlw   xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

        mov     rsi, r14                ; input_data
        mov     rdi, r15                ; output_data
.rowloop:
        push    rcx
        push    rdi
        push    rsi

        mov     rsi, JSAMPROW [rsi]     ; inptr
        mov     rdi, JSAMPROW [rdi]     ; outptr

        cmp     rcx, byte SIZEOF_XMMWORD
        jae     short .columnloop

.columnloop_r8:
        ; Tail: fewer than SIZEOF_XMMWORD output columns remain.  Load a
        ; single input XMMWORD, zero the second one, and force rcx so that
        ; the subtraction below leaves exactly zero.
        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        pxor    xmm1, xmm1
        mov     rcx, SIZEOF_XMMWORD
        jmp     short .downsample

.columnloop:
        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
        movdqa  xmm2, xmm0
        movdqa  xmm3, xmm1

        pand    xmm0, xmm6              ; even-column samples as words
        psrlw   xmm2, BYTE_BIT          ; odd-column samples as words
        pand    xmm1, xmm6
        psrlw   xmm3, BYTE_BIT

        paddw   xmm0, xmm2              ; even + odd
        paddw   xmm1, xmm3
        paddw   xmm0, xmm7              ; + alternating bias
        paddw   xmm1, xmm7
        psrlw   xmm0, 1                 ; / 2
        psrlw   xmm1, 1

        packuswb xmm0, xmm1             ; words back to bytes

        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

        sub     rcx, byte SIZEOF_XMMWORD        ; outcol
        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
        add     rdi, byte 1*SIZEOF_XMMWORD      ; outptr
        cmp     rcx, byte SIZEOF_XMMWORD
        jae     short .columnloop
        test    rcx, rcx
        jnz     short .columnloop_r8

        pop     rsi
        pop     rdi
        pop     rcx

        add     rsi, byte SIZEOF_JSAMPROW       ; input_data
        add     rdi, byte SIZEOF_JSAMPROW       ; output_data
        dec     rax                             ; rowctr
        jg      near .rowloop

.return:
        uncollect_args
        pop     rbp
        ret
167 | |
168 ; -------------------------------------------------------------------------- | |
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Argument registers after collect_args (macro not defined in this file;
; presumably supplied by jsimdext.inc):
;
; r10 = JDIMENSION image_width       (read as r10d)
; r11 = int max_v_samp_factor        (read as r11d)
; r12 = JDIMENSION v_samp_factor     (read as r12d)
; r13 = JDIMENSION width_blocks      (read as r13d)
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data
;
; The 32-bit arguments are always read through their 32-bit register views:
; the upper halves of the argument registers are not defined by the calling
; convention for 32-bit parameters.
;
; Outline:
;   1. output_cols = width_blocks * 8.  Return immediately if it is zero.
;   2. expand_right_edge: for each of max_v_samp_factor input rows, replicate
;      the sample at column image_width-1 into columns
;      image_width .. output_cols*2-1 (skipped when no padding is needed).
;   3. For each of v_samp_factor output rows, two consecutive input rows are
;      consumed and every output sample is
;      (r0[2*i] + r0[2*i+1] + r1[2*i] + r1[2*i+1] + bias) >> 2
;      with bias alternating 1,2,1,2,...
;
; Every row access uses movdqa, so the row pointers must be XMMWORD-aligned.
; When fewer than SIZEOF_XMMWORD output columns remain, a full XMMWORD is
; still stored to the output row (the unused half is zero).
; NOTE(review): this presumes the row buffers are padded accordingly -- the
; allocation is not visible here; confirm against the caller.
;
; Scratch registers written here: rax, rcx, rdx, rsi, rdi, xmm0-xmm7.
; NOTE(review): any saving these require on a given ABI is presumably done
; by collect_args/uncollect_args -- confirm in jsimdext.inc.
;

        align   16
        global  EXTN(jsimd_h2v2_downsample_sse2) PRIVATE

EXTN(jsimd_h2v2_downsample_sse2):
        push    rbp
        mov     rax, rsp                ; rax = rsp at this point; presumably
                                        ;  consumed by collect_args
        mov     rbp, rsp
        collect_args

        mov     ecx, r13d
        shl     rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
        jz      near .return

        mov     edx, r10d               ; rdx = image_width (zero-extended)

        ; -- expand_right_edge

        push    rcx                     ; keep output_cols for the main loop
        shl     rcx, 1                  ; output_cols * 2
        sub     rcx, rdx                ; rcx = padding samples per row
        jle     short .expand_end

        ; max_v_samp_factor is a C int: use only the low 32 bits of r11.
        ; The 32-bit move zero-extends into rax, so the 64-bit dec/jg loop
        ; counter below is exact for every positive value.
        mov     eax, r11d
        test    eax, eax
        jle     short .expand_end

        cld                             ; rep stosb must store upward
        mov     rsi, r14                ; input_data
.expandloop:
        push    rax                     ; al is reused as the fill byte
        push    rcx                     ; rep stosb consumes rcx

        mov     rdi, JSAMPROW [rsi]
        add     rdi, rdx                ; rdi = &row[image_width]
        mov     al, JSAMPLE [rdi-1]     ; last real sample of the row

        rep stosb

        pop     rcx
        pop     rax

        add     rsi, byte SIZEOF_JSAMPROW
        dec     rax
        jg      short .expandloop

.expand_end:
        pop     rcx                     ; output_cols

        ; -- h2v2_downsample

        mov     eax, r12d               ; rowctr
        test    eax, eax                ; signed 32-bit test, matching the
                                        ;  h2v1 routine (a 64-bit test of the
                                        ;  zero-extended value is never < 0)
        jle     near .return

        mov     edx, 0x00020001         ; bias pattern (only edx is consumed)
        movd    xmm7, edx
        pcmpeqw xmm6, xmm6
        pshufd  xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
        psrlw   xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

        mov     rsi, r14                ; input_data
        mov     rdi, r15                ; output_data
.rowloop:
        push    rcx
        push    rdi
        push    rsi

        mov     rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; inptr0
        mov     rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; inptr1
        mov     rdi, JSAMPROW [rdi]                     ; outptr

        cmp     rcx, byte SIZEOF_XMMWORD
        jae     short .columnloop

.columnloop_r8:
        ; Tail: fewer than SIZEOF_XMMWORD output columns remain.  Load one
        ; XMMWORD from each input row, zero the second pair, and force rcx
        ; so that the subtraction below leaves exactly zero.
        movdqa  xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        pxor    xmm2, xmm2
        pxor    xmm3, xmm3
        mov     rcx, SIZEOF_XMMWORD
        jmp     short .downsample

.columnloop:
        movdqa  xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqa  xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
        movdqa  xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
        ; First XMMWORD of each row: horizontal pair sums as words.
        movdqa  xmm4, xmm0
        movdqa  xmm5, xmm1
        pand    xmm0, xmm6              ; even-column samples
        psrlw   xmm4, BYTE_BIT          ; odd-column samples
        pand    xmm1, xmm6
        psrlw   xmm5, BYTE_BIT
        paddw   xmm0, xmm4
        paddw   xmm1, xmm5

        ; Second XMMWORD of each row: same treatment.
        movdqa  xmm4, xmm2
        movdqa  xmm5, xmm3
        pand    xmm2, xmm6
        psrlw   xmm4, BYTE_BIT
        pand    xmm3, xmm6
        psrlw   xmm5, BYTE_BIT
        paddw   xmm2, xmm4
        paddw   xmm3, xmm5

        paddw   xmm0, xmm1              ; row0 pair sum + row1 pair sum
        paddw   xmm2, xmm3
        paddw   xmm0, xmm7              ; + alternating bias
        paddw   xmm2, xmm7
        psrlw   xmm0, 2                 ; / 4
        psrlw   xmm2, 2

        packuswb xmm0, xmm2             ; words back to bytes

        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

        sub     rcx, byte SIZEOF_XMMWORD        ; outcol
        add     rdx, byte 2*SIZEOF_XMMWORD      ; inptr0
        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr1
        add     rdi, byte 1*SIZEOF_XMMWORD      ; outptr
        cmp     rcx, byte SIZEOF_XMMWORD
        jae     near .columnloop
        test    rcx, rcx
        jnz     near .columnloop_r8

        pop     rsi
        pop     rdi
        pop     rcx

        add     rsi, byte 2*SIZEOF_JSAMPROW     ; input_data
        add     rdi, byte 1*SIZEOF_JSAMPROW     ; output_data
        dec     rax                             ; rowctr
        jg      near .rowloop

.return:
        uncollect_args
        pop     rbp
        ret
327 | |
328 ; For some reason, the OS X linker does not honor the request to align the | |
329 ; segment unless we do this. | |
330 align 16 | |
OLD | NEW |