OLD | NEW |
1 ; | 1 ; |
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 ; | 3 ; |
4 ; Use of this source code is governed by a BSD-style license | 4 ; Use of this source code is governed by a BSD-style license |
5 ; that can be found in the LICENSE file in the root of the source | 5 ; that can be found in the LICENSE file in the root of the source |
6 ; tree. An additional intellectual property rights grant can be found | 6 ; tree. An additional intellectual property rights grant can be found |
7 ; in the file PATENTS. All contributing project authors may | 7 ; in the file PATENTS. All contributing project authors may |
8 ; be found in the AUTHORS file in the root of the source tree. | 8 ; be found in the AUTHORS file in the root of the source tree. |
9 ; | 9 ; |
10 | 10 |
(...skipping 184 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
195 pop rbx | 195 pop rbx |
196 | 196 |
197 ; begin epilog | 197 ; begin epilog |
198 pop rdi | 198 pop rdi |
199 pop rsi | 199 pop rsi |
200 RESTORE_GOT | 200 RESTORE_GOT |
201 UNSHADOW_ARGS | 201 UNSHADOW_ARGS |
202 pop rbp | 202 pop rbp |
203 ret | 203 ret |
204 | 204 |
205 | |
;void vp9_bilinear_predict8x8_mmx
;(
;    unsigned char  *src_ptr,
;    int             src_pixels_per_line,
;    int             xoffset,
;    int             yoffset,
;    unsigned char  *dst_ptr,
;    int             dst_pitch
;)
;
; Two-pass (horizontal, then vertical) 2-tap bilinear prediction of a block
; 8 pixels wide and 8 rows high. Each source row is read as 8 bytes at
; [rsi] and 8 bytes at [rsi+1]; one row more than the output height is read
; because every output row blends two consecutive filtered rows.
;
; Register roles after setup:
;   rsi      current source row
;   rdi      current destination row
;   rdx      src_pixels_per_line
;   rax      vertical filter entry: first tap at [rax], second at [rax+16]
;   rcx      dst_ptr + 8 * dst_pitch, the loop end marker
;   r8       dst_pitch (64-bit builds only)
;   mm0      zero, used to widen bytes to words
;   mm1/mm2  first / second horizontal tap, one copy per word lane
;   mm7      previous row after the horizontal pass, packed to 8 bytes
;
; NOTE(review): no emms is executed here; presumably the caller resets the
; MMX/x87 state - confirm against the call sites.
global sym(vp9_bilinear_predict8x8_mmx) PRIVATE
sym(vp9_bilinear_predict8x8_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = vp9_bilinear_filters_8x_mmx[xoffset];
    ;const short *VFilter = vp9_bilinear_filters_8x_mmx[yoffset];

    mov         rsi,        arg(0)                  ; src_ptr
    mov         rdi,        arg(4)                  ; dst_ptr
    lea         rcx,        [GLOBAL(sym(vp9_bilinear_filters_8x_mmx))]

    movsxd      rax,        dword ptr arg(2)        ; xoffset
    shl         rax,        5                       ; 32 bytes per table entry
    add         rax,        rcx                     ; HFilter
    movq        mm1,        [rax]                   ; first horizontal tap
    movq        mm2,        [rax+16]                ; second horizontal tap

    movsxd      rax,        dword ptr arg(3)        ; yoffset
    shl         rax,        5                       ; 32 bytes per table entry
    add         rax,        rcx                     ; VFilter

    pxor        mm0,        mm0                     ; zero for byte->word unpack

    movsxd      rdx,        dword ptr arg(5)        ; dst_pitch
    lea         rcx,        [rdi+rdx*8]             ; end = dst_ptr + 8 rows
%if ABI_IS_32BIT
    ; no spare register: dst_pitch is re-read from the stack in the loop
%else
    mov         r8,         rdx                     ; keep dst_pitch for the loop
%endif
    movsxd      rdx,        dword ptr arg(1)        ; src_pixels_per_line

    ; horizontal pass on the first source row
    movq        mm3,        [rsi]                   ; src[0..7]
    movq        mm4,        mm3
    punpcklbw   mm3,        mm0                     ; src[0..3] as words
    punpckhbw   mm4,        mm0                     ; src[4..7] as words
    pmullw      mm3,        mm1
    pmullw      mm4,        mm1

    movq        mm5,        [rsi+1]                 ; src[1..8]
    movq        mm6,        mm5
    punpcklbw   mm5,        mm0                     ; src[1..4] as words
    punpckhbw   mm6,        mm0                     ; src[5..8] as words
    pmullw      mm5,        mm2
    pmullw      mm6,        mm2

    paddw       mm3,        mm5
    paddw       mm4,        mm6

    paddw       mm3,        [GLOBAL(rd)]            ; mm3 += rounding value
    psraw       mm3,        VP9_FILTER_SHIFT        ; remove the filter scale
    paddw       mm4,        [GLOBAL(rd)]            ; mm4 += rounding value
    psraw       mm4,        VP9_FILTER_SHIFT

    movq        mm7,        mm3
    packuswb    mm7,        mm4                     ; mm7 = filtered row, 8 bytes

    add         rsi,        rdx                     ; next source row

.next_row_8x8:
    ; horizontal pass on the current source row -> mm3 (low 4), mm4 (high 4)
    movq        mm3,        [rsi]                   ; src[0..7]
    movq        mm4,        mm3
    punpcklbw   mm3,        mm0
    punpckhbw   mm4,        mm0
    pmullw      mm3,        mm1
    pmullw      mm4,        mm1

    movq        mm5,        [rsi+1]                 ; src[1..8]
    movq        mm6,        mm5
    punpcklbw   mm5,        mm0
    punpckhbw   mm6,        mm0
    pmullw      mm5,        mm2
    pmullw      mm6,        mm2

    paddw       mm3,        mm5
    paddw       mm4,        mm6

    ; previous filtered row * first vertical tap -> mm5 (low 4), mm6 (high 4)
    movq        mm5,        mm7
    movq        mm6,        mm7
    punpcklbw   mm5,        mm0
    punpckhbw   mm6,        mm0
    pmullw      mm5,        [rax]
    pmullw      mm6,        [rax]

    ; finish the horizontal pass of the current row
    paddw       mm3,        [GLOBAL(rd)]            ; mm3 += rounding value
    psraw       mm3,        VP9_FILTER_SHIFT
    paddw       mm4,        [GLOBAL(rd)]            ; mm4 += rounding value
    psraw       mm4,        VP9_FILTER_SHIFT

    movq        mm7,        mm3
    packuswb    mm7,        mm4                     ; becomes "previous row" next time

    ; current filtered row * second vertical tap, then blend and round
    pmullw      mm3,        [rax+16]
    pmullw      mm4,        [rax+16]
    paddw       mm3,        mm5
    paddw       mm4,        mm6

    paddw       mm3,        [GLOBAL(rd)]            ; mm3 += rounding value
    psraw       mm3,        VP9_FILTER_SHIFT
    paddw       mm4,        [GLOBAL(rd)]            ; mm4 += rounding value
    psraw       mm4,        VP9_FILTER_SHIFT

    packuswb    mm3,        mm4
    movq        [rdi],      mm3                     ; store 8 output pixels

    add         rsi,        rdx                     ; next source row
%if ABI_IS_32BIT
    add         rdi,        dword ptr arg(5)        ; dst_pitch
%else
    add         rdi,        r8                      ; dst_pitch (loaded in setup)
%endif
    cmp         rdi,        rcx
    jne         .next_row_8x8

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
361 | |
362 | |
;void vp9_bilinear_predict8x4_mmx
;(
;    unsigned char  *src_ptr,
;    int             src_pixels_per_line,
;    int             xoffset,
;    int             yoffset,
;    unsigned char  *dst_ptr,
;    int             dst_pitch
;)
;
; Two-pass (horizontal, then vertical) 2-tap bilinear prediction of a block
; 8 pixels wide and 4 rows high. Each source row is read as 8 bytes at
; [rsi] and 8 bytes at [rsi+1]; one row more than the output height is read
; because every output row blends two consecutive filtered rows.
;
; Register roles after setup:
;   rsi      current source row
;   rdi      current destination row
;   rdx      src_pixels_per_line
;   rax      vertical filter entry: first tap at [rax], second at [rax+16]
;   rcx      dst_ptr + 4 * dst_pitch, the loop end marker
;   r8       dst_pitch (64-bit builds only)
;   mm0      zero, used to widen bytes to words
;   mm1/mm2  first / second horizontal tap, one copy per word lane
;   mm7      previous row after the horizontal pass, packed to 8 bytes
;
; NOTE(review): no emms is executed here; presumably the caller resets the
; MMX/x87 state - confirm against the call sites.
global sym(vp9_bilinear_predict8x4_mmx) PRIVATE
sym(vp9_bilinear_predict8x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = vp9_bilinear_filters_8x_mmx[xoffset];
    ;const short *VFilter = vp9_bilinear_filters_8x_mmx[yoffset];

    mov         rsi,        arg(0)                  ; src_ptr
    mov         rdi,        arg(4)                  ; dst_ptr
    lea         rcx,        [GLOBAL(sym(vp9_bilinear_filters_8x_mmx))]

    movsxd      rax,        dword ptr arg(2)        ; xoffset
    shl         rax,        5                       ; 32 bytes per table entry
    add         rax,        rcx                     ; HFilter
    movq        mm1,        [rax]                   ; first horizontal tap
    movq        mm2,        [rax+16]                ; second horizontal tap

    movsxd      rax,        dword ptr arg(3)        ; yoffset
    shl         rax,        5                       ; 32 bytes per table entry
    add         rax,        rcx                     ; VFilter

    pxor        mm0,        mm0                     ; zero for byte->word unpack

    movsxd      rdx,        dword ptr arg(5)        ; dst_pitch
    lea         rcx,        [rdi+rdx*4]             ; end = dst_ptr + 4 rows
%if ABI_IS_32BIT
    ; no spare register: dst_pitch is re-read from the stack in the loop
%else
    mov         r8,         rdx                     ; keep dst_pitch for the loop
%endif
    movsxd      rdx,        dword ptr arg(1)        ; src_pixels_per_line

    ; horizontal pass on the first source row
    movq        mm3,        [rsi]                   ; src[0..7]
    movq        mm4,        mm3
    punpcklbw   mm3,        mm0                     ; src[0..3] as words
    punpckhbw   mm4,        mm0                     ; src[4..7] as words
    pmullw      mm3,        mm1
    pmullw      mm4,        mm1

    movq        mm5,        [rsi+1]                 ; src[1..8]
    movq        mm6,        mm5
    punpcklbw   mm5,        mm0                     ; src[1..4] as words
    punpckhbw   mm6,        mm0                     ; src[5..8] as words
    pmullw      mm5,        mm2
    pmullw      mm6,        mm2

    paddw       mm3,        mm5
    paddw       mm4,        mm6

    paddw       mm3,        [GLOBAL(rd)]            ; mm3 += rounding value
    psraw       mm3,        VP9_FILTER_SHIFT        ; remove the filter scale
    paddw       mm4,        [GLOBAL(rd)]            ; mm4 += rounding value
    psraw       mm4,        VP9_FILTER_SHIFT

    movq        mm7,        mm3
    packuswb    mm7,        mm4                     ; mm7 = filtered row, 8 bytes

    add         rsi,        rdx                     ; next source row

.next_row_8x4:
    ; horizontal pass on the current source row -> mm3 (low 4), mm4 (high 4)
    movq        mm3,        [rsi]                   ; src[0..7]
    movq        mm4,        mm3
    punpcklbw   mm3,        mm0
    punpckhbw   mm4,        mm0
    pmullw      mm3,        mm1
    pmullw      mm4,        mm1

    movq        mm5,        [rsi+1]                 ; src[1..8]
    movq        mm6,        mm5
    punpcklbw   mm5,        mm0
    punpckhbw   mm6,        mm0
    pmullw      mm5,        mm2
    pmullw      mm6,        mm2

    paddw       mm3,        mm5
    paddw       mm4,        mm6

    ; previous filtered row * first vertical tap -> mm5 (low 4), mm6 (high 4)
    movq        mm5,        mm7
    movq        mm6,        mm7
    punpcklbw   mm5,        mm0
    punpckhbw   mm6,        mm0
    pmullw      mm5,        [rax]
    pmullw      mm6,        [rax]

    ; finish the horizontal pass of the current row
    paddw       mm3,        [GLOBAL(rd)]            ; mm3 += rounding value
    psraw       mm3,        VP9_FILTER_SHIFT
    paddw       mm4,        [GLOBAL(rd)]            ; mm4 += rounding value
    psraw       mm4,        VP9_FILTER_SHIFT

    movq        mm7,        mm3
    packuswb    mm7,        mm4                     ; becomes "previous row" next time

    ; current filtered row * second vertical tap, then blend and round
    pmullw      mm3,        [rax+16]
    pmullw      mm4,        [rax+16]
    paddw       mm3,        mm5
    paddw       mm4,        mm6

    paddw       mm3,        [GLOBAL(rd)]            ; mm3 += rounding value
    psraw       mm3,        VP9_FILTER_SHIFT
    paddw       mm4,        [GLOBAL(rd)]            ; mm4 += rounding value
    psraw       mm4,        VP9_FILTER_SHIFT

    packuswb    mm3,        mm4
    movq        [rdi],      mm3                     ; store 8 output pixels

    add         rsi,        rdx                     ; next source row
%if ABI_IS_32BIT
    add         rdi,        dword ptr arg(5)        ; dst_pitch
%else
    add         rdi,        r8                      ; dst_pitch (loaded in setup)
%endif
    cmp         rdi,        rcx
    jne         .next_row_8x4

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
516 | |
517 | |
;void vp9_bilinear_predict4x4_mmx
;(
;    unsigned char  *src_ptr,
;    int             src_pixels_per_line,
;    int             xoffset,
;    int             yoffset,
;    unsigned char  *dst_ptr,
;    int             dst_pitch
;)
;
; Two-pass (horizontal, then vertical) 2-tap bilinear prediction of a block
; 4 pixels wide and 4 rows high. Each source row is read as 4 bytes at
; [rsi] and 4 bytes at [rsi+1]; one row more than the output height is read
; because every output row blends two consecutive filtered rows.
;
; Register roles after setup:
;   rsi      current source row
;   rdi      current destination row
;   rdx      src_pixels_per_line
;   rax      vertical filter entry: first tap at [rax], second at [rax+16]
;   rcx      dst_ptr + 4 * dst_pitch, the loop end marker
;   r8       dst_pitch (64-bit builds only)
;   mm0      zero, used to widen bytes to words
;   mm1/mm2  first / second horizontal tap, one copy per word lane
;   mm7      previous row after the horizontal pass, packed to bytes
;
; NOTE(review): no emms is executed here; presumably the caller resets the
; MMX/x87 state - confirm against the call sites.
global sym(vp9_bilinear_predict4x4_mmx) PRIVATE
sym(vp9_bilinear_predict4x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = vp9_bilinear_filters_8x_mmx[xoffset];
    ;const short *VFilter = vp9_bilinear_filters_8x_mmx[yoffset];

    mov         rsi,        arg(0)                  ; src_ptr
    mov         rdi,        arg(4)                  ; dst_ptr
    lea         rcx,        [GLOBAL(sym(vp9_bilinear_filters_8x_mmx))]

    movsxd      rax,        dword ptr arg(2)        ; xoffset
    shl         rax,        5                       ; 32 bytes per table entry
    add         rax,        rcx                     ; HFilter
    movq        mm1,        [rax]                   ; first horizontal tap
    movq        mm2,        [rax+16]                ; second horizontal tap

    movsxd      rax,        dword ptr arg(3)        ; yoffset
    shl         rax,        5                       ; 32 bytes per table entry
    add         rax,        rcx                     ; VFilter

    pxor        mm0,        mm0                     ; zero for byte->word unpack

    movsxd      rdx,        dword ptr arg(5)        ; dst_pitch
    lea         rcx,        [rdi+rdx*4]             ; end = dst_ptr + 4 rows
%if ABI_IS_32BIT
    ; no spare register: dst_pitch is re-read from the stack in the loop
%else
    mov         r8,         rdx                     ; keep dst_pitch for the loop
%endif
    movsxd      rdx,        dword ptr arg(1)        ; src_pixels_per_line

    ; horizontal pass on the first source row
    movd        mm3,        [rsi]                   ; src[0..3]
    punpcklbw   mm3,        mm0                     ; widen to words
    pmullw      mm3,        mm1

    movd        mm5,        [rsi+1]                 ; src[1..4]
    punpcklbw   mm5,        mm0                     ; widen to words
    pmullw      mm5,        mm2

    paddw       mm3,        mm5
    paddw       mm3,        [GLOBAL(rd)]            ; mm3 += rounding value
    psraw       mm3,        VP9_FILTER_SHIFT        ; remove the filter scale

    movq        mm7,        mm3
    packuswb    mm7,        mm0                     ; mm7 = filtered row, low 4 bytes

    add         rsi,        rdx                     ; next source row

.next_row_4x4:
    ; horizontal pass on the current source row -> mm3
    movd        mm3,        [rsi]                   ; src[0..3]
    punpcklbw   mm3,        mm0
    pmullw      mm3,        mm1

    movd        mm5,        [rsi+1]                 ; src[1..4]
    punpcklbw   mm5,        mm0
    pmullw      mm5,        mm2

    paddw       mm3,        mm5

    ; previous filtered row * first vertical tap -> mm5
    movq        mm5,        mm7
    punpcklbw   mm5,        mm0
    pmullw      mm5,        [rax]

    ; finish the horizontal pass of the current row
    paddw       mm3,        [GLOBAL(rd)]            ; mm3 += rounding value
    psraw       mm3,        VP9_FILTER_SHIFT

    movq        mm7,        mm3
    packuswb    mm7,        mm0                     ; becomes "previous row" next time

    ; current filtered row * second vertical tap, then blend and round
    pmullw      mm3,        [rax+16]
    paddw       mm3,        mm5

    paddw       mm3,        [GLOBAL(rd)]            ; mm3 += rounding value
    psraw       mm3,        VP9_FILTER_SHIFT

    packuswb    mm3,        mm0
    movd        [rdi],      mm3                     ; store 4 output pixels

    add         rsi,        rdx                     ; next source row
%if ABI_IS_32BIT
    add         rdi,        dword ptr arg(5)        ; dst_pitch
%else
    add         rdi,        r8                      ; dst_pitch (loaded in setup)
%endif

    cmp         rdi,        rcx
    jne         .next_row_4x4

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
634 | |
635 | |
636 | |
637 SECTION_RODATA | 205 SECTION_RODATA |
; Rounding bias for the filters in this file: 0x40 in each of the four
; 16-bit lanes of one MMX register. It is added with paddw immediately
; before the arithmetic right shift by VP9_FILTER_SHIFT.
align 16
rd:
    dw 0x40, 0x40, 0x40, 0x40
641 | 209 |
642 align 16 | 210 align 16 |
643 global HIDDEN_DATA(sym(vp9_six_tap_mmx)) | 211 global HIDDEN_DATA(sym(vp9_six_tap_mmx)) |
644 sym(vp9_six_tap_mmx): | 212 sym(vp9_six_tap_mmx): |
645 times 8 dw 0 | 213 times 8 dw 0 |
646 times 8 dw 0 | 214 times 8 dw 0 |
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
691 times 8 dw -11 | 259 times 8 dw -11 |
692 times 8 dw 2 | 260 times 8 dw 2 |
693 | 261 |
694 times 8 dw 0 | 262 times 8 dw 0 |
695 times 8 dw -1 | 263 times 8 dw -1 |
696 times 8 dw 12 | 264 times 8 dw 12 |
697 times 8 dw 123 | 265 times 8 dw 123 |
698 times 8 dw -6 | 266 times 8 dw -6 |
699 times 8 dw 0 | 267 times 8 dw 0 |
700 | 268 |
701 | |
; 2-tap bilinear filter table with eight entries of 32 bytes each.
; An entry holds the first tap replicated into 8 words, followed by the
; second tap replicated into 8 words. Going down the table the first tap
; falls from 128 in steps of 16 while the second rises from 0, so the two
; taps of every entry sum to 128. The MMX predictors address an entry as
; table + offset * 32 and read the taps with 8-byte loads at +0 and +16.
align 16
global HIDDEN_DATA(sym(vp9_bilinear_filters_8x_mmx))
sym(vp9_bilinear_filters_8x_mmx):
%assign bilinear_second_tap 0
%rep 8
    times 8 dw (128 - bilinear_second_tap)
    times 8 dw bilinear_second_tap
%assign bilinear_second_tap bilinear_second_tap + 16
%endrep
%undef bilinear_second_tap
OLD | NEW |