OLD | NEW |
| (Empty) |
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
10 | |
11 | |
12 %include "vpx_ports/x86_abi_support.asm" | |
13 | |
; Exported entry points, one per SAD block size implemented below.
global sym(vp9_sad16x16_mmx) PRIVATE
global sym(vp9_sad8x16_mmx) PRIVATE
global sym(vp9_sad8x8_mmx) PRIVATE
global sym(vp9_sad4x4_mmx) PRIVATE
global sym(vp9_sad16x8_mmx) PRIVATE
19 | |
;-----------------------------------------------------------------------
; unsigned int vp9_sad16x16_mmx(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr,
;     int            ref_stride)
;
; Sum of absolute byte differences between a 16-byte-wide, 16-row
; source block and the corresponding reference block.
;
; Register roles inside the row loop:
;   rsi = current source row          rax = src_stride (sign-extended)
;   rdi = current reference row       rdx = ref_stride (sign-extended)
;   rcx = src_ptr + 16 * src_stride, the source address that ends the loop
;   mm6 = all zeros (unpack partner)  mm7 = four 16-bit running sums
;
; NOTE(review): no emms is issued here, so MMX state stays live on
; return; presumably the caller clears it - confirm.
;-----------------------------------------------------------------------
sym(vp9_sad16x16_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)              ; src_ptr
    movsxd      rax,        dword ptr arg(1)    ; src_stride
    mov         rdi,        arg(2)              ; ref_ptr
    movsxd      rdx,        dword ptr arg(3)    ; ref_stride

    pxor        mm6,        mm6                 ; constant zero
    pxor        mm7,        mm7                 ; running sums = 0

    ; A single lea can scale by at most 8, so 16 rows takes two steps.
    lea         rcx,        [rsi+rax*8]
    lea         rcx,        [rcx+rax*8]         ; rcx = src + 16*stride

.sad16x16_next_row:
    ; ---- bytes 0..7 of the row: mm0 = |src - ref| ----
    movq        mm0,        QWORD PTR [rsi]
    movq        mm1,        QWORD PTR [rdi]
    movq        mm4,        mm0                 ; keep src for the reverse subtract

    ; ---- bytes 8..15 of the row: mm2 = |src - ref| ----
    movq        mm2,        QWORD PTR [rsi+8]
    movq        mm3,        QWORD PTR [rdi+8]
    movq        mm5,        mm2

    ; Unsigned saturating subtract both ways; exactly one side is
    ; non-zero per byte, so OR-ing them yields the absolute difference.
    psubusb     mm0,        mm1                 ; max(src - ref, 0)
    psubusb     mm1,        mm4                 ; max(ref - src, 0)
    por         mm0,        mm1

    psubusb     mm2,        mm3
    psubusb     mm3,        mm5
    por         mm2,        mm3

    ; Widen the byte differences to words so they can be accumulated.
    movq        mm1,        mm0
    movq        mm3,        mm2

    punpcklbw   mm0,        mm6                 ; bytes 0..3   -> words
    punpckhbw   mm1,        mm6                 ; bytes 4..7   -> words
    punpcklbw   mm2,        mm6                 ; bytes 8..11  -> words
    punpckhbw   mm3,        mm6                 ; bytes 12..15 -> words

    paddw       mm0,        mm2
    paddw       mm1,        mm3

    paddw       mm7,        mm0
    paddw       mm7,        mm1

    ; Advance both row pointers; stop when the source pointer hits rcx.
    lea         rsi,        [rsi+rax]
    add         rdi,        rdx

    cmp         rsi,        rcx
    jne         .sad16x16_next_row

    ; ---- horizontal reduction of the four word sums in mm7 ----
    movq        mm0,        mm7

    punpcklwd   mm0,        mm6                 ; words 0,1 -> dwords
    punpckhwd   mm7,        mm6                 ; words 2,3 -> dwords

    paddw       mm0,        mm7                 ; two partial sums
    movq        mm7,        mm0

    psrlq       mm0,        32                  ; bring the high partial sum down
    paddw       mm7,        mm0                 ; low word of mm7 = total

    movq        rax,        mm7                 ; return value (low 32 bits = total)

    pop         rdi
    pop         rsi
    mov         rsp,        rbp
    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret
110 | |
111 | |
;-----------------------------------------------------------------------
; unsigned int vp9_sad8x16_mmx(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr,
;     int            ref_stride)
;
; Sum of absolute byte differences between an 8-byte-wide, 16-row
; source block and the corresponding reference block.
;
; Register roles inside the row loop:
;   rsi = current source row          rax = src_stride (sign-extended)
;   rdi = current reference row       rdx = ref_stride (sign-extended)
;   rcx = src_ptr + 16 * src_stride, the source address that ends the loop
;   mm6 = all zeros (unpack partner)  mm7 = four 16-bit running sums
;
; NOTE(review): no emms is issued here, so MMX state stays live on
; return; presumably the caller clears it - confirm.
;-----------------------------------------------------------------------
sym(vp9_sad8x16_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)              ; src_ptr
    movsxd      rax,        dword ptr arg(1)    ; src_stride
    mov         rdi,        arg(2)              ; ref_ptr
    movsxd      rdx,        dword ptr arg(3)    ; ref_stride

    pxor        mm6,        mm6                 ; constant zero
    pxor        mm7,        mm7                 ; running sums = 0

    ; Two scaled steps of 8 rows each give the 16-row end address.
    lea         rcx,        [rsi+rax*8]
    lea         rcx,        [rcx+rax*8]         ; rcx = src + 16*stride

.sad8x16_next_row:
    movq        mm0,        QWORD PTR [rsi]     ; 8 source bytes
    movq        mm1,        QWORD PTR [rdi]     ; 8 reference bytes
    movq        mm2,        mm0                 ; keep src for the reverse subtract

    ; |src - ref| per byte: saturating subtract in both directions, then OR.
    psubusb     mm0,        mm1
    psubusb     mm1,        mm2
    por         mm0,        mm1

    ; Widen to words against the zero register and accumulate.
    movq        mm2,        mm0
    punpcklbw   mm0,        mm6                 ; bytes 0..3 -> words
    punpckhbw   mm2,        mm6                 ; bytes 4..7 -> words

    paddw       mm7,        mm0
    paddw       mm7,        mm2

    ; Step to the next row of each block.
    lea         rsi,        [rsi+rax]
    add         rdi,        rdx

    cmp         rsi,        rcx
    jne         .sad8x16_next_row

    ; ---- horizontal reduction of the four word sums in mm7 ----
    movq        mm0,        mm7
    punpcklwd   mm0,        mm6                 ; words 0,1 -> dwords
    punpckhwd   mm7,        mm6                 ; words 2,3 -> dwords
    paddw       mm0,        mm7                 ; two partial sums

    movq        mm7,        mm0
    psrlq       mm0,        32                  ; bring the high partial sum down
    paddw       mm7,        mm0                 ; low word of mm7 = total

    movq        rax,        mm7                 ; return value (low 32 bits = total)

    pop         rdi
    pop         rsi
    mov         rsp,        rbp
    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret
182 | |
183 | |
;-----------------------------------------------------------------------
; unsigned int vp9_sad8x8_mmx(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr,
;     int            ref_stride)
;
; Sum of absolute byte differences between an 8-byte-wide, 8-row
; source block and the corresponding reference block.
;
; Register roles inside the row loop:
;   rsi = current source row          rax = src_stride (sign-extended)
;   rdi = current reference row       rdx = ref_stride (sign-extended)
;   rcx = src_ptr + 8 * src_stride, the source address that ends the loop
;   mm6 = all zeros (unpack partner)  mm7 = four 16-bit running sums
;
; NOTE(review): no emms is issued here, so MMX state stays live on
; return; presumably the caller clears it - confirm.
;-----------------------------------------------------------------------
sym(vp9_sad8x8_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)              ; src_ptr
    movsxd      rax,        dword ptr arg(1)    ; src_stride
    mov         rdi,        arg(2)              ; ref_ptr
    movsxd      rdx,        dword ptr arg(3)    ; ref_stride

    pxor        mm6,        mm6                 ; constant zero
    pxor        mm7,        mm7                 ; running sums = 0

    lea         rcx,        [rsi+rax*8]         ; rcx = src + 8*stride

.sad8x8_next_row:
    movq        mm0,        QWORD PTR [rsi]     ; 8 source bytes
    movq        mm1,        QWORD PTR [rdi]     ; 8 reference bytes
    movq        mm2,        mm0                 ; keep src for the reverse subtract

    ; |src - ref| per byte: saturating subtract in both directions, then OR.
    psubusb     mm0,        mm1
    psubusb     mm1,        mm2
    por         mm0,        mm1

    ; Widen to words, fold the two halves together, then accumulate.
    movq        mm2,        mm0
    punpcklbw   mm0,        mm6                 ; bytes 0..3 -> words
    punpckhbw   mm2,        mm6                 ; bytes 4..7 -> words
    paddw       mm0,        mm2

    paddw       mm7,        mm0

    ; Step to the next row of each block.
    lea         rsi,        [rsi+rax]
    add         rdi,        rdx

    cmp         rsi,        rcx
    jne         .sad8x8_next_row

    ; ---- horizontal reduction of the four word sums in mm7 ----
    movq        mm0,        mm7
    punpcklwd   mm0,        mm6                 ; words 0,1 -> dwords
    punpckhwd   mm7,        mm6                 ; words 2,3 -> dwords
    paddw       mm0,        mm7                 ; two partial sums

    movq        mm7,        mm0
    psrlq       mm0,        32                  ; bring the high partial sum down
    paddw       mm7,        mm0                 ; low word of mm7 = total

    movq        rax,        mm7                 ; return value (low 32 bits = total)

    pop         rdi
    pop         rsi
    mov         rsp,        rbp
    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret
252 | |
253 | |
;-----------------------------------------------------------------------
; unsigned int vp9_sad4x4_mmx(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr,
;     int            ref_stride)
;
; Sum of absolute byte differences between a 4-byte-wide, 4-row source
; block and the corresponding reference block. Fully unrolled: rows are
; handled in two pairs, each pair interleaved into one 8-byte register.
;
; Register roles:
;   rsi / rdi = source / reference row pointers
;   rax / rdx = src_stride / ref_stride (sign-extended)
;   mm3       = all zeros once the first row pair has been packed
;   mm0       = word sums (first pair, then both pairs)
;
; NOTE(review): no emms is issued here, so MMX state stays live on
; return; presumably the caller clears it - confirm.
;-----------------------------------------------------------------------
sym(vp9_sad4x4_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)              ; src_ptr
    movsxd      rax,        dword ptr arg(1)    ; src_stride
    mov         rdi,        arg(2)              ; ref_ptr
    movsxd      rdx,        dword ptr arg(3)    ; ref_stride

    ; ---- rows 0 and 1 ----
    ; Interleave the two 4-byte rows of each block into one register;
    ; src and ref are interleaved the same way, so bytes still line up.
    movd        mm0,        DWORD PTR [rsi]
    movd        mm2,        DWORD PTR [rsi+rax]
    punpcklbw   mm0,        mm2                 ; source rows 0,1

    movd        mm1,        DWORD PTR [rdi]
    movd        mm3,        DWORD PTR [rdi+rdx]
    punpcklbw   mm1,        mm3                 ; reference rows 0,1

    ; |src - ref| per byte: saturating subtract in both directions, then OR.
    movq        mm2,        mm0
    psubusb     mm0,        mm1
    psubusb     mm1,        mm2
    por         mm0,        mm1

    ; mm3 is no longer needed as data; from here on it is the zero register.
    pxor        mm3,        mm3

    movq        mm2,        mm0
    punpcklbw   mm0,        mm3                 ; low 4 differences  -> words
    punpckhbw   mm2,        mm3                 ; high 4 differences -> words
    paddw       mm0,        mm2                 ; mm0 = word sums for rows 0,1

    ; ---- rows 2 and 3 ----
    lea         rsi,        [rsi+rax*2]
    lea         rdi,        [rdi+rdx*2]

    movd        mm4,        DWORD PTR [rsi]
    movd        mm6,        DWORD PTR [rsi+rax]
    punpcklbw   mm4,        mm6                 ; source rows 2,3

    movd        mm5,        DWORD PTR [rdi]
    movd        mm7,        DWORD PTR [rdi+rdx]
    punpcklbw   mm5,        mm7                 ; reference rows 2,3

    movq        mm6,        mm4
    psubusb     mm4,        mm5
    psubusb     mm5,        mm6
    por         mm4,        mm5

    movq        mm5,        mm4
    punpcklbw   mm4,        mm3
    punpckhbw   mm5,        mm3
    paddw       mm4,        mm5                 ; mm4 = word sums for rows 2,3

    paddw       mm0,        mm4                 ; all four rows, four word lanes

    ; ---- horizontal reduction of the four word sums in mm0 ----
    movq        mm1,        mm0
    punpcklwd   mm0,        mm3                 ; words 0,1 -> dwords
    punpckhwd   mm1,        mm3                 ; words 2,3 -> dwords
    paddw       mm0,        mm1                 ; two partial sums

    movq        mm1,        mm0
    psrlq       mm0,        32                  ; bring the high partial sum down
    paddw       mm0,        mm1                 ; low word of mm0 = total

    movq        rax,        mm0                 ; return value (low 32 bits = total)

    pop         rdi
    pop         rsi
    mov         rsp,        rbp
    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret
341 | |
342 | |
;-----------------------------------------------------------------------
; unsigned int vp9_sad16x8_mmx(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr,
;     int            ref_stride)
;
; Sum of absolute byte differences between a 16-byte-wide, 8-row
; source block and the corresponding reference block.
;
; Register roles inside the row loop:
;   rsi = current source row          rax = src_stride (sign-extended)
;   rdi = current reference row       rdx = ref_stride (sign-extended)
;   rcx = src_ptr + 8 * src_stride, the source address that ends the loop
;   mm6 = all zeros (unpack partner)  mm7 = four 16-bit running sums
;
; NOTE(review): no emms is issued here, so MMX state stays live on
; return; presumably the caller clears it - confirm.
;-----------------------------------------------------------------------
sym(vp9_sad16x8_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)              ; src_ptr
    movsxd      rax,        dword ptr arg(1)    ; src_stride
    mov         rdi,        arg(2)              ; ref_ptr
    movsxd      rdx,        dword ptr arg(3)    ; ref_stride

    pxor        mm6,        mm6                 ; constant zero
    pxor        mm7,        mm7                 ; running sums = 0

    lea         rcx,        [rsi+rax*8]         ; rcx = src + 8*stride

.sad16x8_next_row:
    ; ---- bytes 0..7 of the row: mm0 = |src - ref| ----
    movq        mm0,        [rsi]
    movq        mm1,        [rdi]
    movq        mm4,        mm0                 ; keep src for the reverse subtract

    ; ---- bytes 8..15 of the row: mm2 = |src - ref| ----
    movq        mm2,        [rsi+8]
    movq        mm3,        [rdi+8]
    movq        mm5,        mm2

    ; Saturating subtract in both directions, then OR = absolute difference.
    psubusb     mm0,        mm1
    psubusb     mm1,        mm4
    por         mm0,        mm1

    psubusb     mm2,        mm3
    psubusb     mm3,        mm5
    por         mm2,        mm3

    ; Widen the byte differences to words.
    movq        mm1,        mm0
    movq        mm3,        mm2

    punpcklbw   mm0,        mm6                 ; bytes 0..3   -> words
    punpckhbw   mm1,        mm6                 ; bytes 4..7   -> words
    punpcklbw   mm2,        mm6                 ; bytes 8..11  -> words
    punpckhbw   mm3,        mm6                 ; bytes 12..15 -> words

    ; Fold the four word vectors into one before touching the accumulator.
    paddw       mm0,        mm2
    paddw       mm1,        mm3
    paddw       mm0,        mm1

    paddw       mm7,        mm0

    ; Step to the next row of each block.
    lea         rsi,        [rsi+rax]
    add         rdi,        rdx

    cmp         rsi,        rcx
    jne         .sad16x8_next_row

    ; ---- horizontal reduction of the four word sums in mm7 ----
    movq        mm0,        mm7
    punpcklwd   mm0,        mm6                 ; words 0,1 -> dwords
    punpckhwd   mm7,        mm6                 ; words 2,3 -> dwords
    paddw       mm0,        mm7                 ; two partial sums

    movq        mm7,        mm0
    psrlq       mm0,        32                  ; bring the high partial sum down
    paddw       mm7,        mm0                 ; low word of mm7 = total

    movq        rax,        mm7                 ; return value (low 32 bits = total)

    pop         rdi
    pop         rsi
    mov         rsp,        rbp
    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret
OLD | NEW |