OLD | NEW |
| (Empty) |
1 ; | |
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | |
3 ; | |
4 ; Use of this source code is governed by a BSD-style license | |
5 ; that can be found in the LICENSE file in the root of the source | |
6 ; tree. An additional intellectual property rights grant can be found | |
7 ; in the file PATENTS. All contributing project authors may | |
8 ; be found in the AUTHORS file in the root of the source tree. | |
9 ; | |
10 | |
11 | |
12 %include "vpx_ports/x86_abi_support.asm" | |
13 | |
;-----------------------------------------------------------------------
; unsigned int vp9_get_mb_ss_mmx( short *src_ptr )
;
; Sum of squares of the 256 signed 16-bit values starting at src_ptr
; (16 loop iterations, 32 bytes = 16 words per iteration).
;
; In:    arg(0) = src_ptr
; Out:   rax    = sum of the two dword lanes of the accumulator (mm4)
; Uses:  rax, rcx, mm0-mm4, 8 bytes of stack scratch
; Note:  no emms is executed here; MMX state is still live on return.
;-----------------------------------------------------------------------
global sym(vp9_get_mb_ss_mmx) PRIVATE
sym(vp9_get_mb_ss_mmx):
    push        rbp
    mov         rbp, rsp
    ; NOTE(review): only one argument is declared above, yet seven are
    ; shadowed -- confirm whether the count is intentional.
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 8                      ; scratch qword for the final reduction
    ; end prolog

    mov         rax, arg(0)                 ; src_ptr
    mov         rcx, 16                     ; iterations remaining
    pxor        mm4, mm4                    ; mm4 = two dword partial sums

.NEXTROW:
    movq        mm0, [rax]
    movq        mm1, [rax+8]
    movq        mm2, [rax+16]
    movq        mm3, [rax+24]

    ; pmaddwd x, x squares each word and adds adjacent products into dwords
    pmaddwd     mm0, mm0
    pmaddwd     mm1, mm1
    pmaddwd     mm2, mm2
    pmaddwd     mm3, mm3

    paddd       mm4, mm0
    paddd       mm4, mm1
    paddd       mm4, mm2
    paddd       mm4, mm3

    add         rax, 32
    dec         rcx
    ; dec does not write CF, so branch on ZF alone rather than on the
    ; carry left behind by the pointer add above.
    jnz         .NEXTROW

    movq        QWORD PTR [rsp], mm4

    ;return sum[0]+sum[1];
    movsxd      rax, dword ptr [rsp]
    movsxd      rcx, dword ptr [rsp+4]
    add         rax, rcx

    ; begin epilog
    add         rsp, 8
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
64 | |
65 | |
;-----------------------------------------------------------------------
; unsigned int vp9_get8x8var_mmx
; (
;     unsigned char *src_ptr,
;     int            source_stride,
;     unsigned char *ref_ptr,
;     int            recon_stride,
;     unsigned int  *SSE,
;     int           *Sum
; )
;
; Over an 8x8 block of bytes, stores the sum of (src - ref) to *Sum and
; the sum of (src - ref)^2 to *SSE. The return value is always 0.
;
; Register roles inside the function:
;   rax = current src row        rcx = source_stride (sign-extended)
;   rbx = current ref row        rdx = recon_stride  (sign-extended)
;   mm5 = four word lanes of accumulated differences
;   mm6 = zero, used to widen bytes to words
;   mm7 = two dword lanes of accumulated squared differences
;-----------------------------------------------------------------------

; Process one row of 8 pixels and step both row pointers by their strides.
%macro GET8X8VAR_ROW 0
    movq        mm0, [rax]                  ; 8 src bytes
    movq        mm1, [rbx]                  ; 8 ref bytes
    movq        mm2, mm0
    movq        mm3, mm1

    punpcklbw   mm0, mm6                    ; src bytes 0-3 -> words
    punpckhbw   mm2, mm6                    ; src bytes 4-7 -> words
    punpcklbw   mm1, mm6                    ; ref bytes 0-3 -> words
    punpckhbw   mm3, mm6                    ; ref bytes 4-7 -> words

    psubsw      mm0, mm1                    ; src - ref, low half
    psubsw      mm2, mm3                    ; src - ref, high half

    paddw       mm5, mm0                    ; running sum of differences
    paddw       mm5, mm2

    pmaddwd     mm0, mm0                    ; square, pairwise add to dwords
    pmaddwd     mm2, mm2
    paddd       mm7, mm0                    ; running sum of squares
    paddd       mm7, mm2

    add         rax, rcx                    ; next src row
    add         rbx, rdx                    ; next ref row
%endmacro

global sym(vp9_get8x8var_mmx) PRIVATE
sym(vp9_get8x8var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx
    sub         rsp, 16                     ; [rsp] = squares, [rsp+8] = differences
    ; end prolog

    pxor        mm5, mm5                    ; difference accumulator
    pxor        mm6, mm6                    ; constant zero
    pxor        mm7, mm7                    ; squared-difference accumulator

    mov         rax, arg(0)                 ; src_ptr
    mov         rbx, arg(2)                 ; ref_ptr
    movsxd      rcx, dword ptr arg(1)       ; source_stride
    movsxd      rdx, dword ptr arg(3)       ; recon_stride

    ; Rows 1-8, fully unrolled at assembly time.
%rep 8
    GET8X8VAR_ROW
%endrep

    ; Spill both accumulators so their lanes can be summed with scalar code.
    movq        QWORD PTR [rsp], mm7
    movq        QWORD PTR [rsp+8], mm5

    ; XXSum = the two dword lanes of mm7
    movsxd      rax, DWORD PTR [rsp]
    movsxd      rcx, DWORD PTR [rsp+4]
    add         rax, rcx
    mov         rsi, arg(4)                 ; SSE
    mov         dword ptr [rsi], eax

    ; XSum = the four sign-extended word lanes of mm5
    movsx       rdx, WORD PTR [rsp+8]
    movsx       rcx, WORD PTR [rsp+10]
    add         rdx, rcx
    movsx       rcx, WORD PTR [rsp+12]
    add         rdx, rcx
    movsx       rcx, WORD PTR [rsp+14]
    add         rdx, rcx
    mov         rdi, arg(5)                 ; Sum
    mov         dword ptr [rdi], edx

    xor         rax, rax                    ; return 0

    ; begin epilog
    add         rsp, 16
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
310 | |
311 | |
312 | |
;-----------------------------------------------------------------------
; unsigned int
; vp9_get4x4var_mmx
; (
;     unsigned char *src_ptr,
;     int            source_stride,
;     unsigned char *ref_ptr,
;     int            recon_stride,
;     unsigned int  *SSE,
;     int           *Sum
; )
;
; Over a 4x4 block of bytes, stores the sum of (src - ref) to *Sum and
; the sum of (src - ref)^2 to *SSE. The return value is always 0.
;
; Register roles inside the function:
;   rax = current src row        rcx = source_stride (sign-extended)
;   rbx = current ref row        rdx = recon_stride  (sign-extended)
;   mm5 = word lanes of accumulated differences
;   mm6 = zero, used to widen bytes to words
;   mm7 = two dword lanes of accumulated squared differences
;-----------------------------------------------------------------------

; Process one row of 4 pixels at [rax] / [rbx]; pointers are not advanced.
%macro GET4X4VAR_ROW 0
    movd        mm0, [rax]                  ; 4 src bytes
    movd        mm1, [rbx]                  ; 4 ref bytes
    punpcklbw   mm0, mm6                    ; widen src to words
    punpcklbw   mm1, mm6                    ; widen ref to words
    psubsw      mm0, mm1                    ; src - ref
    paddw       mm5, mm0                    ; running sum of differences
    pmaddwd     mm0, mm0                    ; square, pairwise add to dwords
    paddd       mm7, mm0                    ; running sum of squares
%endmacro

global sym(vp9_get4x4var_mmx) PRIVATE
sym(vp9_get4x4var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx
    sub         rsp, 16                     ; [rsp] = squares, [rsp+8] = differences
    ; end prolog

    pxor        mm5, mm5                    ; difference accumulator
    pxor        mm6, mm6                    ; constant zero
    pxor        mm7, mm7                    ; squared-difference accumulator

    mov         rax, arg(0)                 ; src_ptr
    mov         rbx, arg(2)                 ; ref_ptr
    movsxd      rcx, dword ptr arg(1)       ; source_stride
    movsxd      rdx, dword ptr arg(3)       ; recon_stride

    ; Row 1
    GET4X4VAR_ROW

    ; Rows 2-4: step to the next row, then process it.
%rep 3
    add         rax, rcx
    add         rbx, rdx
    GET4X4VAR_ROW
%endrep

    ; Spill both accumulators so their lanes can be summed with scalar code.
    movq        QWORD PTR [rsp], mm7
    movq        QWORD PTR [rsp+8], mm5

    ; XXSum = the two dword lanes of mm7
    movsxd      rax, DWORD PTR [rsp]
    movsxd      rcx, DWORD PTR [rsp+4]
    add         rax, rcx
    mov         rsi, arg(4)                 ; SSE
    mov         dword ptr [rsi], eax

    ; XSum = the four sign-extended word lanes of mm5
    movsx       rdx, WORD PTR [rsp+8]
    movsx       rcx, WORD PTR [rsp+10]
    add         rdx, rcx
    movsx       rcx, WORD PTR [rsp+12]
    add         rdx, rcx
    movsx       rcx, WORD PTR [rsp+14]
    add         rdx, rcx
    mov         rdi, arg(5)                 ; Sum
    mov         dword ptr [rdi], edx

    xor         rax, rax                    ; return 0

    ; begin epilog
    add         rsp, 16
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
425 | |
426 | |
427 | |
;-----------------------------------------------------------------------
; unsigned int
; vp9_get4x4sse_cs_mmx
; (
;     unsigned char *src_ptr,
;     int            source_stride,
;     unsigned char *ref_ptr,
;     int            recon_stride
; )
;
; Returns, in the low 32 bits of rax, the sum of (src - ref)^2 over a
; 4x4 block of bytes.
;
; Register roles inside the function:
;   rax = current src row        rcx = source_stride (sign-extended)
;   rbx = current ref row        rdx = recon_stride  (sign-extended)
;   mm6 = zero, used to widen bytes to words
;   mm7 = two dword lanes of accumulated squared differences
;-----------------------------------------------------------------------

; Process one row of 4 pixels at [rax] / [rbx]; pointers are not advanced.
%macro GET4X4SSE_ROW 0
    movd        mm0, [rax]                  ; 4 src bytes
    movd        mm1, [rbx]                  ; 4 ref bytes
    punpcklbw   mm0, mm6                    ; widen src to words
    punpcklbw   mm1, mm6                    ; widen ref to words
    psubsw      mm0, mm1                    ; src - ref
    pmaddwd     mm0, mm0                    ; square, pairwise add to dwords
    paddd       mm7, mm0                    ; running sum of squares
%endmacro

global sym(vp9_get4x4sse_cs_mmx) PRIVATE
sym(vp9_get4x4sse_cs_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    pxor        mm6, mm6                    ; constant zero
    pxor        mm7, mm7                    ; squared-difference accumulator

    mov         rax, arg(0)                 ; src_ptr
    mov         rbx, arg(2)                 ; ref_ptr
    movsxd      rcx, dword ptr arg(1)       ; source_stride
    movsxd      rdx, dword ptr arg(3)       ; recon_stride

    ; Row 1
    GET4X4SSE_ROW

    ; Rows 2-4: step to the next row, then process it.
%rep 3
    add         rax, rcx
    add         rbx, rdx
    GET4X4SSE_ROW
%endrep

    ; Fold the high dword lane onto the low one: low(mm0) = lo + hi.
    movq        mm0, mm7
    psrlq       mm0, 32                     ; mm0 = [hi, 0]
    paddd       mm0, mm7                    ; mm0 = [hi + lo, hi]
    movq        rax, mm0

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
OLD | NEW |