; [diff-viewer table header: "OLD | NEW", "(Empty)" -- extraction residue, not assembly source]
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
10 | |
11 | |
; ABI-portability macros used throughout this file (sym, arg, GLOBAL,
; SHADOW_ARGS_TO_STACK, GET_GOT, ABI_IS_32BIT, SECTION_RODATA, ...).
%include "vpx_ports/x86_abi_support.asm"

; Right-shift applied after every bilinear filter pass in this file.
; The matching rounding term (1 << (mmx_filter_shift - 1) = 64) is the
; mmx_bi_rd table at the end of the file.
%define mmx_filter_shift            7
15 | |
;-----------------------------------------------------------------------
;void vp8_filter_block2d_bil4x4_var_mmx
;(
;    unsigned char  *ref_ptr,
;    int             ref_pixels_per_line,
;    unsigned char  *src_ptr,
;    int             src_pixels_per_line,
;    unsigned short *HFilter,
;    unsigned short *VFilter,
;    int            *sum,
;    unsigned int   *sumsquared
;)
;
; Applies a 2-tap horizontal then 2-tap vertical filter to a 4-pixel-wide,
; 4-row block taken from ref_ptr, subtracts the co-located src_ptr block,
; and stores the sum of the differences in *sum and the sum of the squared
; differences in *sumsquared.
;
; Filter layout (as read by the code): HFilter and VFilter are each two
; consecutive groups of four 16-bit words.  The group at offset 0 scales the
; current pixel/row, the group at offset 8 scales the next pixel/row.
; Every pass is rounded with mmx_bi_rd and shifted right by mmx_filter_shift.
;
; Memory read: five ref rows (one primed before the loop + one per
; iteration), bytes [0..4] of each; four src rows, bytes [0..3] of each.
;
; Register roles:
;   rax = HFilter          rdx = VFilter
;   rsi = current ref row  rdi = current src row     rcx = rows remaining
;   r8/r9 (64-bit builds only) = sign-extended ref/src strides
;   mm0 = zero (for byte->word unpacking)
;   mm5 = previous row after the horizontal pass (4 words)
;   mm6 = four 16-bit running sums of differences
;   mm7 = two 32-bit running sums of squared differences
;
; NOTE: no emms is executed here; MMX state is left live on return.
; NOTE: the difference sum is accumulated in 16-bit lanes and folded modulo
;       2^16 before sign extension, so the true total must fit in int16.
;-----------------------------------------------------------------------
global sym(vp8_filter_block2d_bil4x4_var_mmx) PRIVATE
sym(vp8_filter_block2d_bil4x4_var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16                     ; scratch area; not referenced below
    ; end prolog

    pxor        mm6, mm6                    ; sum accumulators = 0
    pxor        mm7, mm7                    ; sum-of-squares accumulators = 0

    mov         rax, arg(4)                 ;HFilter
    mov         rdx, arg(5)                 ;VFilter

    mov         rsi, arg(0)                 ;ref_ptr
    mov         rdi, arg(2)                 ;src_ptr

    mov         rcx, 4                      ; four output rows
    pxor        mm0, mm0                    ; zero for unpacking

    ; ---- prime mm5 with the horizontally filtered first ref row ----
    movd        mm1, [rsi]                  ; pixels x .. x+3
    movd        mm3, [rsi+1]                ; pixels x+1 .. x+4

    punpcklbw   mm1, mm0                    ; bytes -> words
    pmullw      mm1, [rax]                  ; * first horizontal tap

    punpcklbw   mm3, mm0
    pmullw      mm3, [rax+8]                ; * second horizontal tap

    paddw       mm1, mm3
    paddw       mm1, [GLOBAL(mmx_bi_rd)]    ; + rounding

    psraw       mm1, mmx_filter_shift
    movq        mm5, mm1                    ; keep as "previous row"

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)       ;ref_pixels_per_line
%else
    ; Strides are loop-invariant: sign-extend them once.  r8/r9 are volatile
    ; in both 64-bit ABIs and are not written anywhere inside the loop.
    movsxd      r8, dword ptr arg(1)        ;ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)        ;src_pixels_per_line
    add         rsi, r8
%endif

.filter_block2d_bil4x4_var_mmx_loop:

    ; ---- horizontal pass on the current ref row ----
    movd        mm1, [rsi]
    movd        mm3, [rsi+1]

    punpcklbw   mm1, mm0
    pmullw      mm1, [rax]

    punpcklbw   mm3, mm0
    pmullw      mm3, [rax+8]

    paddw       mm1, mm3
    paddw       mm1, [GLOBAL(mmx_bi_rd)]

    psraw       mm1, mmx_filter_shift
    movq        mm3, mm5                    ; mm3 = previous filtered row

    movq        mm5, mm1                    ; current row becomes "previous"

    ; ---- vertical pass: previous row * tap0 + current row * tap1 ----
    pmullw      mm3, [rdx]

    pmullw      mm1, [rdx+8]
    paddw       mm1, mm3

    paddw       mm1, [GLOBAL(mmx_bi_rd)]
    psraw       mm1, mmx_filter_shift

    ; ---- difference against src and accumulation ----
    movd        mm3, [rdi]
    punpcklbw   mm3, mm0

    psubw       mm1, mm3                    ; diff = filtered - src
    paddw       mm6, mm1                    ; sum += diff (per 16-bit lane)

    pmaddwd     mm1, mm1                    ; diff^2, adjacent pairs summed
    paddd       mm7, mm1                    ; sse += diff^2

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)       ;ref_pixels_per_line
    add         rdi, dword ptr arg(3)       ;src_pixels_per_line
%else
    add         rsi, r8
    add         rdi, r9
%endif
    sub         rcx, 1
    jnz         .filter_block2d_bil4x4_var_mmx_loop

    ; ---- horizontal reduction of the accumulators ----
    pxor        mm3, mm3
    pxor        mm2, mm2

    punpcklwd   mm2, mm6                    ; {w0 << 16, w1 << 16}
    punpckhwd   mm3, mm6                    ; {w2 << 16, w3 << 16}

    paddd       mm2, mm3
    movq        mm6, mm2

    psrlq       mm6, 32
    paddd       mm2, mm6                    ; low dword = (w0+w1+w2+w3) << 16

    psrad       mm2, 16                     ; shift back down with sign extension
    movq        mm4, mm7

    psrlq       mm4, 32
    paddd       mm4, mm7                    ; low dword = total squared error

    mov         rdi, arg(6)                 ;sum
    mov         rsi, arg(7)                 ;sumsquared

    movd        dword ptr [rdi], mm2
    movd        dword ptr [rsi], mm4

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
156 | |
157 | |
158 | |
159 | |
;-----------------------------------------------------------------------
;void vp8_filter_block2d_bil_var_mmx
;(
;    unsigned char  *ref_ptr,
;    int             ref_pixels_per_line,
;    unsigned char  *src_ptr,
;    int             src_pixels_per_line,
;    unsigned int    Height,
;    unsigned short *HFilter,
;    unsigned short *VFilter,
;    int            *sum,
;    unsigned int   *sumsquared
;)
;
; Applies a 2-tap horizontal then 2-tap vertical filter to an 8-pixel-wide,
; Height-row block taken from ref_ptr, subtracts the co-located src_ptr
; block, and stores the sum of the differences in *sum and the sum of the
; squared differences in *sumsquared.
;
; Filter layout (as read by the code): HFilter and VFilter are each two
; consecutive groups of four 16-bit words.  The group at offset 0 scales the
; current pixel/row, the group at offset 8 scales the next pixel/row.
; Every pass is rounded with mmx_bi_rd and shifted right by mmx_filter_shift.
;
; Memory read: Height + 1 ref rows (one primed before the loop + one per
; iteration), bytes [0..8] of each; Height src rows, bytes [0..7] of each.
;
; Height must be >= 1: the row loop is bottom-tested (sub/jnz), so a zero
; count would not terminate after zero iterations.
;
; Register roles:
;   rax = HFilter          rdx = VFilter
;   rsi = current ref row  rdi = current src row     rcx = rows remaining
;   r8/r9 (64-bit builds only) = sign-extended ref/src strides
;   mm0 = zero (for byte<->word unpacking)
;   mm5 = previous row after the horizontal pass, packed to 8 unsigned bytes
;   mm6 = four 16-bit running sums of differences
;   mm7 = two 32-bit running sums of squared differences
;
; NOTE: no emms is executed here; MMX state is left live on return.
; NOTE: the difference sum is accumulated in 16-bit lanes and folded modulo
;       2^16 before sign extension, so the true total must fit in int16.
;-----------------------------------------------------------------------
global sym(vp8_filter_block2d_bil_var_mmx) PRIVATE
sym(vp8_filter_block2d_bil_var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16                     ; scratch area; not referenced below
    ; end prolog

    pxor        mm6, mm6                    ; sum accumulators = 0
    pxor        mm7, mm7                    ; sum-of-squares accumulators = 0
    mov         rax, arg(5)                 ;HFilter

    mov         rdx, arg(6)                 ;VFilter
    mov         rsi, arg(0)                 ;ref_ptr

    mov         rdi, arg(2)                 ;src_ptr
    movsxd      rcx, dword ptr arg(4)       ;Height

    ; ---- prime mm5 with the horizontally filtered first ref row ----
    pxor        mm0, mm0                    ; zero for unpacking
    movq        mm1, [rsi]                  ; pixels x .. x+7

    movq        mm3, [rsi+1]                ; pixels x+1 .. x+8
    movq        mm2, mm1

    movq        mm4, mm3
    punpcklbw   mm1, mm0                    ; low four pixels -> words

    punpckhbw   mm2, mm0                    ; high four pixels -> words
    pmullw      mm1, [rax]                  ; * first horizontal tap

    pmullw      mm2, [rax]
    punpcklbw   mm3, mm0

    punpckhbw   mm4, mm0
    pmullw      mm3, [rax+8]                ; * second horizontal tap

    pmullw      mm4, [rax+8]
    paddw       mm1, mm3

    paddw       mm2, mm4
    paddw       mm1, [GLOBAL(mmx_bi_rd)]    ; + rounding

    psraw       mm1, mmx_filter_shift
    paddw       mm2, [GLOBAL(mmx_bi_rd)]

    psraw       mm2, mmx_filter_shift
    movq        mm5, mm1

    packuswb    mm5, mm2                    ; keep previous row as 8 bytes
%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)       ;ref_pixels_per_line
%else
    ; Strides are loop-invariant: sign-extend them once.  r8/r9 are volatile
    ; in both 64-bit ABIs and are not written anywhere inside the loop.
    movsxd      r8, dword ptr arg(1)        ;ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)        ;src_pixels_per_line
    add         rsi, r8
%endif

.filter_block2d_bil_var_mmx_loop:

    ; ---- horizontal pass on the current ref row ----
    movq        mm1, [rsi]
    movq        mm3, [rsi+1]

    movq        mm2, mm1
    movq        mm4, mm3

    punpcklbw   mm1, mm0
    punpckhbw   mm2, mm0

    pmullw      mm1, [rax]
    pmullw      mm2, [rax]

    punpcklbw   mm3, mm0
    punpckhbw   mm4, mm0

    pmullw      mm3, [rax+8]
    pmullw      mm4, [rax+8]

    paddw       mm1, mm3
    paddw       mm2, mm4

    paddw       mm1, [GLOBAL(mmx_bi_rd)]
    psraw       mm1, mmx_filter_shift

    paddw       mm2, [GLOBAL(mmx_bi_rd)]
    psraw       mm2, mmx_filter_shift

    ; ---- unpack the previous row, then save the current one ----
    movq        mm3, mm5
    movq        mm4, mm5

    punpcklbw   mm3, mm0
    punpckhbw   mm4, mm0

    movq        mm5, mm1
    packuswb    mm5, mm2                    ; current row becomes "previous"

    ; ---- vertical pass: previous row * tap0 + current row * tap1 ----
    pmullw      mm3, [rdx]
    pmullw      mm4, [rdx]

    pmullw      mm1, [rdx+8]
    pmullw      mm2, [rdx+8]

    paddw       mm1, mm3
    paddw       mm2, mm4

    paddw       mm1, [GLOBAL(mmx_bi_rd)]
    paddw       mm2, [GLOBAL(mmx_bi_rd)]

    psraw       mm1, mmx_filter_shift
    psraw       mm2, mmx_filter_shift

    ; ---- difference against src and accumulation ----
    movq        mm3, [rdi]
    movq        mm4, mm3

    punpcklbw   mm3, mm0
    punpckhbw   mm4, mm0

    psubw       mm1, mm3                    ; diff (low four pixels)
    psubw       mm2, mm4                    ; diff (high four pixels)

    paddw       mm6, mm1                    ; sum += diff (per 16-bit lane)
    pmaddwd     mm1, mm1                    ; diff^2, adjacent pairs summed

    paddw       mm6, mm2
    pmaddwd     mm2, mm2

    paddd       mm7, mm1                    ; sse += diff^2
    paddd       mm7, mm2

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)       ;ref_pixels_per_line
    add         rdi, dword ptr arg(3)       ;src_pixels_per_line
%else
    add         rsi, r8
    add         rdi, r9
%endif
    sub         rcx, 1
    jnz         .filter_block2d_bil_var_mmx_loop

    ; ---- horizontal reduction of the accumulators ----
    pxor        mm3, mm3
    pxor        mm2, mm2

    punpcklwd   mm2, mm6                    ; {w0 << 16, w1 << 16}
    punpckhwd   mm3, mm6                    ; {w2 << 16, w3 << 16}

    paddd       mm2, mm3
    movq        mm6, mm2

    psrlq       mm6, 32
    paddd       mm2, mm6                    ; low dword = (w0+w1+w2+w3) << 16

    psrad       mm2, 16                     ; shift back down with sign extension
    movq        mm4, mm7

    psrlq       mm4, 32
    paddd       mm4, mm7                    ; low dword = total squared error

    mov         rdi, arg(7)                 ;sum
    mov         rsi, arg(8)                 ;sumsquared

    movd        dword ptr [rdi], mm2
    movd        dword ptr [rsi], mm4

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
347 | |
348 | |
SECTION_RODATA
;short mmx_bi_rd[4] = { 64, 64, 64, 64};
; Rounding term added to each 16-bit lane before the arithmetic right shift
; by 7 used in the filter passes (64 = half of 1 << 7).
align 16
mmx_bi_rd:
    times 4 dw 64
; [diff-viewer table footer: "OLD | NEW" -- extraction residue, not assembly source]