OLD | NEW |
| (Empty) |
1 ; | |
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | |
3 ; | |
4 ; Use of this source code is governed by a BSD-style license | |
5 ; that can be found in the LICENSE file in the root of the source | |
6 ; tree. An additional intellectual property rights grant can be found | |
7 ; in the file PATENTS. All contributing project authors may | |
8 ; be found in the AUTHORS file in the root of the source tree. | |
9 ; | |
10 | |
11 | |
12 %include "vpx_ports/x86_abi_support.asm" | |
13 | |
;unsigned int vp8_sad16x16_wmt(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride)
;
; Returns in rax the sum of absolute byte differences between 16 rows of
; 16 bytes read from src_ptr and from ref_ptr.
;
; Register roles inside the loop:
;   rsi   = current source row          rax = src_stride
;   rdi   = current reference row       rdx = ref_stride
;   rcx   = src_ptr + 16 * src_stride (loop terminator)
;   xmm6  = running total, one 16-bit partial sum per 64-bit half
;
; The totals are added with paddw (16-bit lanes); the worst case for the
; whole block, 16 * 16 * 255 = 65280, still fits in 16 bits.
global sym(vp8_sad16x16_wmt) PRIVATE
sym(vp8_sad16x16_wmt):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 4
    SAVE_XMM 6
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0)              ;src_ptr
        movsxd          rax,        dword ptr arg(1)    ;src_stride
        mov             rdi,        arg(2)              ;ref_ptr
        movsxd          rdx,        dword ptr arg(3)    ;ref_stride

        ; rcx = src_ptr + 16 * src_stride: first row past the block
        mov             rcx,        rax
        shl             rcx,        4
        add             rcx,        rsi

        pxor            xmm6,       xmm6                ; total = 0

.sad16x16_next_rows:
        ; ---- first row of the pair ----
        ; The two 8-byte halves are byte-interleaved into one register.
        ; Source and reference get the same shuffle, so psadbw still
        ; pairs matching bytes.
        movq            xmm0,       QWORD PTR [rsi]
        movq            xmm2,       QWORD PTR [rsi+8]
        punpcklbw       xmm0,       xmm2

        movq            xmm1,       QWORD PTR [rdi]
        movq            xmm3,       QWORD PTR [rdi+8]
        punpcklbw       xmm1,       xmm3

        psadbw          xmm0,       xmm1
        paddw           xmm6,       xmm0

        ; ---- second row of the pair ----
        movq            xmm4,       QWORD PTR [rsi+rax]
        movq            xmm2,       QWORD PTR [rsi+rax+8]
        punpcklbw       xmm4,       xmm2

        movq            xmm5,       QWORD PTR [rdi+rdx]
        movq            xmm3,       QWORD PTR [rdi+rdx+8]
        punpcklbw       xmm5,       xmm3

        psadbw          xmm4,       xmm5
        paddw           xmm6,       xmm4

        ; step both pointers down two rows (lea leaves the flags alone)
        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        cmp             rsi,        rcx
        jne             .sad16x16_next_rows

        ; fold the high-half partial sum onto the low-half one
        movq            xmm0,       xmm6                ; low qword only
        psrldq          xmm6,       8                   ; high qword -> low
        paddw           xmm0,       xmm6

        movq            rax,        xmm0                ; return value

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
86 | |
;unsigned int vp8_sad8x16_wmt(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  max_sad)
;
; Sum of absolute byte differences over 16 rows of 8 bytes, processed two
; rows per iteration.  Before each row pair the running total is compared
; (unsigned) with max_sad; once it is above max_sad the function returns
; the partial total it has so far.
;
; Register roles inside the loop:
;   rsi  = current source row           rbx = src_stride
;   rdi  = current reference row        rdx = ref_stride
;   rcx  = src_ptr + 16 * src_stride (loop terminator)
;   mm7  = running total                rax = copy of the total / result
;
; MMX registers are used and no emms is issued here.
global sym(vp8_sad8x16_wmt) PRIVATE
sym(vp8_sad8x16_wmt):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0)              ;src_ptr
        movsxd          rbx,        dword ptr arg(1)    ;src_stride
        mov             rdi,        arg(2)              ;ref_ptr
        movsxd          rdx,        dword ptr arg(3)    ;ref_stride

        ; rcx = src_ptr + 16 * src_stride: first row past the block
        mov             rcx,        rbx
        shl             rcx,        4
        add             rcx,        rsi

        pxor            mm7,        mm7                 ; total = 0

.sad8x16_next_rows:
        ; bail out as soon as the total is already above max_sad;
        ; rax then holds the partial total that is returned
        movq            rax,        mm7
        cmp             eax,        arg(4)              ;max_sad
        ja              .sad8x16_finished

        ; first row of the pair
        movq            mm0,        QWORD PTR [rsi]
        movq            mm1,        QWORD PTR [rdi]
        psadbw          mm0,        mm1
        paddw           mm7,        mm0

        ; second row of the pair
        movq            mm2,        QWORD PTR [rsi+rbx]
        movq            mm3,        QWORD PTR [rdi+rdx]
        psadbw          mm2,        mm3
        paddw           mm7,        mm2

        ; step both pointers down two rows (lea leaves the flags alone)
        lea             rsi,        [rsi+rbx*2]
        lea             rdi,        [rdi+rdx*2]

        cmp             rsi,        rcx
        jne             .sad8x16_next_rows

        movq            rax,        mm7                 ; full-block total

.sad8x16_finished:

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    UNSHADOW_ARGS
    pop         rbp
    ret
149 | |
150 | |
;unsigned int vp8_sad8x8_wmt(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  max_sad)
;
; Sum of absolute byte differences over 8 rows of 8 bytes, one row per
; iteration.  The fifth argument is read as an early-exit threshold: before
; each row the running total is compared (unsigned) with max_sad, and once
; it is above max_sad the partial total is returned.
;
; Register roles inside the loop:
;   rsi  = current source row           rbx = src_stride
;   rdi  = current reference row        rdx = ref_stride
;   rcx  = src_ptr + 8 * src_stride (loop terminator)
;   mm7  = running total                rax = copy of the total / result
;
; MMX registers are used and no emms is issued here.
global sym(vp8_sad8x8_wmt) PRIVATE
sym(vp8_sad8x8_wmt):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0)              ;src_ptr
        mov             rdi,        arg(2)              ;ref_ptr

        movsxd          rbx,        dword ptr arg(1)    ;src_stride
        movsxd          rdx,        dword ptr arg(3)    ;ref_stride

        lea             rcx,        [rsi+rbx*8]         ; end of source block
        pxor            mm7,        mm7                 ; total = 0

.x8x8sad_wmt_loop:

        ; early exit: rax already holds the partial total to return
        movq            rax,        mm7
        cmp             eax,        arg(4)              ;max_sad
        ja              .x8x8sad_wmt_early_exit

        movq            mm0,        QWORD PTR [rsi]
        movq            mm1,        QWORD PTR [rdi]

        psadbw          mm0,        mm1
        lea             rsi,        [rsi+rbx]           ; next source row

        add             rdi,        rdx                 ; next reference row
        paddw           mm7,        mm0

        cmp             rsi,        rcx
        jne             .x8x8sad_wmt_loop

        movq            rax,        mm7                 ; full-block total
.x8x8sad_wmt_early_exit:

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    UNSHADOW_ARGS
    pop         rbp
    ret
203 | |
;unsigned int vp8_sad4x4_wmt(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride)
;
; Sum of absolute byte differences over 4 rows of 4 bytes, straight-line
; (no loop).  Two 4-byte rows are byte-interleaved into one 8-byte MMX
; register so that a single psadbw covers a row pair; source and reference
; get the same shuffle, so matching bytes still line up.
;
;   rsi = source row pointer            rax = src_stride, then the result
;   rdi = reference row pointer         rdx = ref_stride
;
; MMX registers are used and no emms is issued here.
global sym(vp8_sad4x4_wmt) PRIVATE
sym(vp8_sad4x4_wmt):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0)              ;src_ptr
        movsxd          rax,        dword ptr arg(1)    ;src_stride
        mov             rdi,        arg(2)              ;ref_ptr
        movsxd          rdx,        dword ptr arg(3)    ;ref_stride

        ; ---- rows 0 and 1 ----
        movd            mm0,        DWORD PTR [rsi]
        movd            mm2,        DWORD PTR [rsi+rax]
        punpcklbw       mm0,        mm2                 ; source pair

        movd            mm1,        DWORD PTR [rdi]
        movd            mm3,        DWORD PTR [rdi+rdx]
        punpcklbw       mm1,        mm3                 ; reference pair

        ; move on to rows 2 and 3
        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        psadbw          mm0,        mm1

        ; ---- rows 2 and 3 ----
        movd            mm4,        DWORD PTR [rsi]
        movd            mm6,        DWORD PTR [rsi+rax]
        punpcklbw       mm4,        mm6                 ; source pair

        movd            mm5,        DWORD PTR [rdi]
        movd            mm7,        DWORD PTR [rdi+rdx]
        punpcklbw       mm5,        mm7                 ; reference pair

        psadbw          mm4,        mm5

        ; combine both halves; the stride in rax is no longer needed
        paddw           mm0,        mm4
        movq            rax,        mm0                 ; return value

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
257 | |
258 | |
;unsigned int vp8_sad16x8_wmt(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  max_sad)
;
; Sum of absolute byte differences over 8 rows of 16 bytes, two rows per
; iteration, each row handled as two 8-byte MMX halves.  The fifth argument
; is read as an early-exit threshold: before each row pair the running
; total is compared (unsigned) with max_sad, and once it is above max_sad
; the partial total is returned.
;
; Register roles inside the loop:
;   rsi  = current source row           rbx = src_stride
;   rdi  = current reference row        rdx = ref_stride
;   rcx  = src_ptr + 8 * src_stride (loop terminator)
;   mm7  = running total                rax = copy of the total / result
;
; MMX registers are used and no emms is issued here.
global sym(vp8_sad16x8_wmt) PRIVATE
sym(vp8_sad16x8_wmt):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    push        rbx
    push        rsi
    push        rdi
    ; end prolog


        mov             rsi,        arg(0)              ;src_ptr
        mov             rdi,        arg(2)              ;ref_ptr

        movsxd          rbx,        dword ptr arg(1)    ;src_stride
        movsxd          rdx,        dword ptr arg(3)    ;ref_stride

        lea             rcx,        [rsi+rbx*8]         ; end of source block
        pxor            mm7,        mm7                 ; total = 0

.x16x8sad_wmt_loop:

        ; early exit: rax already holds the partial total to return
        movq            rax,        mm7
        cmp             eax,        arg(4)              ;max_sad
        ja              .x16x8sad_wmt_early_exit

        ; first row: left half in mm0/mm1, right half in mm2/mm3
        movq            mm0,        QWORD PTR [rsi]
        movq            mm2,        QWORD PTR [rsi+8]

        movq            mm1,        QWORD PTR [rdi]
        movq            mm3,        QWORD PTR [rdi+8]

        ; second row, left half
        movq            mm4,        QWORD PTR [rsi+rbx]
        movq            mm5,        QWORD PTR [rdi+rdx]

        psadbw          mm0,        mm1
        psadbw          mm2,        mm3

        ; second row, right half (mm1/mm3 are free again after psadbw)
        movq            mm1,        QWORD PTR [rsi+rbx+8]
        movq            mm3,        QWORD PTR [rdi+rdx+8]

        psadbw          mm4,        mm5
        psadbw          mm1,        mm3

        ; step both pointers down two rows (lea leaves the flags alone)
        lea             rsi,        [rsi+rbx*2]
        lea             rdi,        [rdi+rdx*2]

        paddw           mm0,        mm2                 ; first-row total
        paddw           mm4,        mm1                 ; second-row total

        paddw           mm7,        mm0
        paddw           mm7,        mm4

        cmp             rsi,        rcx
        jne             .x16x8sad_wmt_loop

        movq            rax,        mm7                 ; full-block total

.x16x8sad_wmt_early_exit:

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    UNSHADOW_ARGS
    pop         rbp
    ret
331 | |
;void vp8_copy32xn_sse2(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *dst_ptr,
;    int  dst_stride,
;    int  height);
;
; Copies `height` rows of 32 bytes from src_ptr to dst_ptr.  Rows are moved
; four at a time while at least four remain, then one at a time.  A height
; of zero or less copies nothing.
;
; Source rows are loaded with movdqu (any alignment).  Destination rows are
; stored with movdqa, so every destination row address must be 16-byte
; aligned (dst_ptr and dst_stride both multiples of 16).
;
; Register roles:
;   rsi = source row pointer            rax = src_stride
;   rdi = destination row pointer       rdx = dst_stride
;   rcx = rows still to copy
global sym(vp8_copy32xn_sse2) PRIVATE
sym(vp8_copy32xn_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0)              ;src_ptr
        mov             rdi,        arg(2)              ;dst_ptr

        movsxd          rax,        dword ptr arg(1)    ;src_stride
        movsxd          rdx,        dword ptr arg(3)    ;dst_stride
        movsxd          rcx,        dword ptr arg(4)    ;height

        ; Do not enter the 4-row loop with fewer than four rows: it would
        ; read and write rows that lie outside the requested block.
        cmp             rcx,        4
        jl              .block_copy_sse2_tail

.block_copy_sse2_loopx4:
        ; load four source rows (two 16-byte halves each)
        movdqu          xmm0,       XMMWORD PTR [rsi]
        movdqu          xmm1,       XMMWORD PTR [rsi + 16]
        movdqu          xmm2,       XMMWORD PTR [rsi + rax]
        movdqu          xmm3,       XMMWORD PTR [rsi + rax + 16]

        lea             rsi,        [rsi+rax*2]

        movdqu          xmm4,       XMMWORD PTR [rsi]
        movdqu          xmm5,       XMMWORD PTR [rsi + 16]
        movdqu          xmm6,       XMMWORD PTR [rsi + rax]
        movdqu          xmm7,       XMMWORD PTR [rsi + rax + 16]

        lea             rsi,        [rsi+rax*2]

        ; store them to four destination rows
        movdqa          XMMWORD PTR [rdi], xmm0
        movdqa          XMMWORD PTR [rdi + 16], xmm1
        movdqa          XMMWORD PTR [rdi + rdx], xmm2
        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm3

        lea             rdi,        [rdi+rdx*2]

        movdqa          XMMWORD PTR [rdi], xmm4
        movdqa          XMMWORD PTR [rdi + 16], xmm5
        movdqa          XMMWORD PTR [rdi + rdx], xmm6
        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm7

        lea             rdi,        [rdi+rdx*2]

        sub             rcx,        4
        cmp             rcx,        4
        jge             .block_copy_sse2_loopx4

.block_copy_sse2_tail:
        ; 0..3 rows left here, or a non-positive height on direct entry.
        ; Signed test so that a negative count can never reach the loop
        ; below, which only stops when the counter hits exactly zero.
        cmp             rcx,        0
        jle             .copy_is_done

.block_copy_sse2_loop:
        movdqu          xmm0,       XMMWORD PTR [rsi]
        movdqu          xmm1,       XMMWORD PTR [rsi + 16]
        lea             rsi,        [rsi+rax]

        movdqa          XMMWORD PTR [rdi], xmm0
        movdqa          XMMWORD PTR [rdi + 16], xmm1
        lea             rdi,        [rdi+rdx]

        sub             rcx,        1
        jne             .block_copy_sse2_loop

.copy_is_done:
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
OLD | NEW |