Chromium Code Reviews
Diff: source/libvpx/vpx_dsp/mips/sub_pixel_variance_msa.c

Issue 1302353004: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 3 months ago
/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

(...skipping 388 matching lines...)
                                                const uint8_t *filter,
                                                int32_t height,
                                                int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 filt0, ref = { 0 };
  v16i8 src0, src1, src2, src3;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
- v8u16 const255;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

- const255 = (v8u16)__msa_ldi_h(255);
-
  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                vec0, vec1, vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-   MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
                src0, src1, src2, src3);
    ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
    src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
    CALC_MSE_AVG_B(src0, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

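Note (illustrative, not part of the patch): the dropped const255 / MIN_UH4_UH lines here, and the SAT_UH2_UH(..., 7) / SAT_UH4_UH(..., 7) lines dropped in the later functions, all clamp the rounded filter output to 255. Assuming the usual 2-tap bilinear setup (8-bit pixels, filter taps that sum to 1 << FILTER_BITS with FILTER_BITS == 7), the rounded dot product already fits in 8 bits, so the clamp appears redundant. A minimal C sketch of the bound, under those assumptions:

/* Worst case for the 2-tap path: both taps weight a 255 pixel. */
#include <assert.h>
#include <stdint.h>

static void check_bilinear_bound(void) {
  const int32_t max_pixel = 255;
  const int32_t tap_sum = 1 << 7;                     /* f0 + f1 == 1 << FILTER_BITS */
  const int32_t max_dot = max_pixel * tap_sum;        /* 32640, fits in a uint16_t lane */
  const int32_t rounded = (max_dot + (1 << 6)) >> 7;  /* SRARI-style round and shift */
  assert(rounded <= 255);                             /* clamping to 255 is a no-op */
}
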
static uint32_t sub_pixel_sse_diff_8width_h_msa(const uint8_t *src,
                                                int32_t src_stride,
                                                const uint8_t *dst,
                                                int32_t dst_stride,
                                                const uint8_t *filter,
                                                int32_t height,
                                                int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 filt0, out, ref0, ref1, ref2, ref3;
  v16i8 src0, src1, src2, src3;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v8u16 vec0, vec1, vec2, vec3, const255;
+ v8u16 vec0, vec1, vec2, vec3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

- const255 = (v8u16)__msa_ldi_h(255);
-
  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                vec0, vec1, vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-   MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
                src0, src1, src2, src3);
    out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
    CALC_MSE_AVG_B(out, ref0, var, avg);
    out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
    CALC_MSE_AVG_B(out, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_16width_h_msa(const uint8_t *src,
                                                 int32_t src_stride,
                                                 const uint8_t *dst,
                                                 int32_t dst_stride,
                                                 const uint8_t *filter,
                                                 int32_t height,
                                                 int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v16u8 dst0, dst1, dst2, dst3, filt0;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
- v8u16 const255;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

- const255 = (v8u16)__msa_ldi_h(255);
-
  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    dst += (4 * dst_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                out0, out1, out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
                out4, out5, out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
-   MIN_UH4_UH(out0, out1, out2, out3, const255);
-   MIN_UH4_UH(out4, out5, out6, out7, const255);
    PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6,
                src0, src1, src2, src3);
    CALC_MSE_AVG_B(src0, dst0, var, avg);
    CALC_MSE_AVG_B(src1, dst1, var, avg);
    CALC_MSE_AVG_B(src2, dst2, var, avg);
    CALC_MSE_AVG_B(src3, dst3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

(...skipping 75 matching lines...)
    src += (4 * src_stride);
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
               src10_r, src21_r, src32_r, src43_r);
    ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-   SAT_UH2_UH(tmp0, tmp1, 7);
    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    CALC_MSE_AVG_B(out, ref, var, avg);
    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

(...skipping 26 matching lines...)
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3,
               vec0, vec1, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                tmp0, tmp1, tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-   SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);

(...skipping 26 matching lines...)
  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-   SAT_UH2_UH(tmp0, tmp1, 7);
    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-   SAT_UH2_UH(tmp2, tmp3, 7);
    out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);

    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-   SAT_UH2_UH(tmp0, tmp1, 7);
    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-   SAT_UH2_UH(tmp2, tmp3, 7);
    out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);

    src0 = src4;

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
    CALC_MSE_AVG_B(out2, ref2, var, avg);
    CALC_MSE_AVG_B(out3, ref3, var, avg);
  }

(...skipping 82 matching lines...)
    dst += (4 * dst_stride);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
    hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-   SAT_UH2_UH(tmp0, tmp1, 7);
    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    CALC_MSE_AVG_B(out, ref, var, avg);
    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
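
Note (illustrative, not part of the patch): the *_hv_* functions run the 2-tap filter twice, first horizontally through HORIZ_2TAP_FILT_UH and then vertically over the interleaved horizontal outputs (ILVEV + DOTP + SRARI), before feeding CALC_MSE_AVG_B. A scalar sketch of what one such output pixel is assumed to be, with FILTER_BITS == 7 and hypothetical h_filt/v_filt tap arrays:

#include <stdint.h>

#define FILTER_BITS 7

/* One horizontal-then-vertical bilinear output pixel (sketch only). */
static uint8_t bilinear_hv_pixel(const uint8_t *src, int stride,
                                 const uint8_t *h_filt, const uint8_t *v_filt) {
  const int round = 1 << (FILTER_BITS - 1);
  uint16_t h0 = (src[0] * h_filt[0] + src[1] * h_filt[1] + round) >> FILTER_BITS;
  uint16_t h1 = (src[stride] * h_filt[0] + src[stride + 1] * h_filt[1] + round)
                >> FILTER_BITS;
  return (uint8_t)((h0 * v_filt[0] + h1 * v_filt[1] + round) >> FILTER_BITS);
}
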
(...skipping 34 matching lines...)
    dst += (4 * dst_stride);

    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-   SAT_UH2_UH(tmp0, tmp1, 7);
    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-   SAT_UH2_UH(tmp2, tmp3, 7);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

(...skipping 33 matching lines...)
    LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-   SAT_UH2_UH(tmp0, tmp1, 7);
    src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-   SAT_UH2_UH(tmp0, tmp1, 7);
    src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-   SAT_UH2_UH(tmp0, tmp1, 7);
    src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-   SAT_UH2_UH(tmp0, tmp1, 7);
    src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
    CALC_MSE_AVG_B(src2, ref2, var, avg);
    CALC_MSE_AVG_B(src3, ref3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

(...skipping 57 matching lines...)
                                                    const uint8_t *filter,
                                                    int32_t height,
                                                    int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 out, pred, filt0, ref = { 0 };
  v16i8 src0, src1, src2, src3;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
- v8u16 const255;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

- const255 = (v8u16)__msa_ldi_h(255);
-
  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                vec0, vec1, vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-   MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
                src0, src1, src2, src3);
    ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
    out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

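Note (illustrative, not part of the patch): the *_avg_* variants differ from the plain ones only in that the filtered block is first averaged with a second predictor loaded from sec_pred (__msa_aver_u_b) before the squared-error and sum accumulation. Per byte that step is a rounding average, roughly as sketched (helper name is hypothetical):

#include <stdint.h>

/* Rounding average with the second predictor, as __msa_aver_u_b computes per byte. */
static uint8_t avg_with_pred(uint8_t filtered, uint8_t pred) {
  return (uint8_t)((filtered + pred + 1) >> 1);
}
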
static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(const uint8_t *src,
                                                    int32_t src_stride,
                                                    const uint8_t *dst,
                                                    int32_t dst_stride,
                                                    const uint8_t *sec_pred,
                                                    const uint8_t *filter,
                                                    int32_t height,
                                                    int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 out, pred, filt0;
  v16u8 ref0, ref1, ref2, ref3;
  v16i8 src0, src1, src2, src3;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
- v8u16 const255;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

- const255 = (v8u16)__msa_ldi_h(255);
-
  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                vec0, vec1, vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-   MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
                src0, src1, src2, src3);
    out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);

    pred = LD_UB(sec_pred);
    sec_pred += 16;
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref0, var, avg);
    out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
    pred = LD_UB(sec_pred);

(...skipping 19 matching lines...)
                                              int32_t width) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v16u8 dst0, dst1, dst2, dst3;
  v16u8 tmp0, tmp1, tmp2, tmp3;
  v16u8 pred0, pred1, pred2, pred3, filt0;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
- v8u16 const255;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

- const255 = (v8u16)__msa_ldi_h(255);
-
  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    dst += (4 * dst_stride);
    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
    sec_pred += (4 * width);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                out0, out1, out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
                out4, out5, out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
-   MIN_UH4_UH(out0, out1, out2, out3, const255);
-   MIN_UH4_UH(out4, out5, out6, out7, const255);
    PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6,
                tmp0, tmp1, tmp2, tmp3);
    AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3,
                tmp0, tmp1, tmp2, tmp3);

    CALC_MSE_AVG_B(tmp0, dst0, var, avg);
    CALC_MSE_AVG_B(tmp1, dst1, var, avg);
    CALC_MSE_AVG_B(tmp2, dst2, var, avg);
    CALC_MSE_AVG_B(tmp3, dst3, var, avg);
  }

(...skipping 98 matching lines...)
    sec_pred += 16;
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
               src10_r, src21_r, src32_r, src43_r);
    ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-   SAT_UH2_UH(tmp0, tmp1, 7);

    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref, var, avg);
    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

(...skipping 30 matching lines...)
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3,
               vec0, vec1, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                tmp0, tmp1, tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-   SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

(...skipping 30 matching lines...)
  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
    sec_pred += (4 * width);

    ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2);
    ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-   SAT_UH2_UH(tmp0, tmp1, 7);
    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6);
    ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7);
    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-   SAT_UH2_UH(tmp2, tmp3, 7);
    out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);

    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-   SAT_UH2_UH(tmp0, tmp1, 7);
    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-   SAT_UH2_UH(tmp2, tmp3, 7);
    out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);

    src0 = src4;
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3,
                out0, out1, out2, out3);

    CALC_MSE_AVG_B(out0, ref0, var, avg);

(...skipping 104 matching lines...)
    dst += (4 * dst_stride);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
    hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-   SAT_UH2_UH(tmp0, tmp1, 7);
    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref, var, avg);
    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);

(...skipping 36 matching lines...)
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-   SAT_UH2_UH(tmp0, tmp1, 7);
    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp3 = __msa_dotp_u_h(vec0, filt_vt);

    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-   SAT_UH2_UH(tmp2, tmp3, 7);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1);

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

(...skipping 38 matching lines...)
    LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
    sec_pred += (4 * width);

    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-   SAT_UH2_UH(tmp0, tmp1, 7);
    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-   SAT_UH2_UH(tmp0, tmp1, 7);
    out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-   SAT_UH2_UH(tmp0, tmp1, 7);
    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-   SAT_UH2_UH(tmp0, tmp1, 7);
    out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3,
                out0, out1, out2, out3);

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);

(...skipping 266 matching lines...)
                                              ref_ptr, ref_stride, \
                                              sec_pred, &diff); \
    } \
  } \
  \
  return VARIANCE_64Wx##ht##H(*sse, diff); \
}

VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32);
VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64);
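
Note (illustrative, not part of the patch): each helper above returns the accumulated sum of squared differences and writes the signed sum of differences to *diff; the VARIANCE_*Wx*H macros used by the wrappers (defined earlier in this file, outside the visible hunks) are expected to combine the two into the usual block variance. A sketch of that combination for a w x h block, with a hypothetical helper name:

#include <stdint.h>

/* variance = sse - sum^2 / (w * h), with the square taken in 64-bit to avoid overflow. */
static uint32_t block_variance(uint32_t sse, int32_t sum, int w, int h) {
  return sse - (uint32_t)(((int64_t)sum * sum) / (w * h));
}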
