OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 388 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
399 const uint8_t *filter, | 399 const uint8_t *filter, |
400 int32_t height, | 400 int32_t height, |
401 int32_t *diff) { | 401 int32_t *diff) { |
402 int16_t filtval; | 402 int16_t filtval; |
403 uint32_t loop_cnt; | 403 uint32_t loop_cnt; |
404 uint32_t ref0, ref1, ref2, ref3; | 404 uint32_t ref0, ref1, ref2, ref3; |
405 v16u8 filt0, ref = { 0 }; | 405 v16u8 filt0, ref = { 0 }; |
406 v16i8 src0, src1, src2, src3; | 406 v16i8 src0, src1, src2, src3; |
407 v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; | 407 v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; |
408 v8u16 vec0, vec1, vec2, vec3; | 408 v8u16 vec0, vec1, vec2, vec3; |
409 v8u16 const255; | |
410 v8i16 avg = { 0 }; | 409 v8i16 avg = { 0 }; |
411 v4i32 vec, var = { 0 }; | 410 v4i32 vec, var = { 0 }; |
412 | 411 |
413 filtval = LH(filter); | 412 filtval = LH(filter); |
414 filt0 = (v16u8)__msa_fill_h(filtval); | 413 filt0 = (v16u8)__msa_fill_h(filtval); |
415 | 414 |
416 const255 = (v8u16)__msa_ldi_h(255); | |
417 | |
418 for (loop_cnt = (height >> 2); loop_cnt--;) { | 415 for (loop_cnt = (height >> 2); loop_cnt--;) { |
419 LD_SB4(src, src_stride, src0, src1, src2, src3); | 416 LD_SB4(src, src_stride, src0, src1, src2, src3); |
420 src += (4 * src_stride); | 417 src += (4 * src_stride); |
421 LW4(dst, dst_stride, ref0, ref1, ref2, ref3); | 418 LW4(dst, dst_stride, ref0, ref1, ref2, ref3); |
422 dst += (4 * dst_stride); | 419 dst += (4 * dst_stride); |
423 INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); | 420 INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); |
424 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); | 421 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); |
425 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); | 422 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); |
426 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, | 423 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, |
427 vec0, vec1, vec2, vec3); | 424 vec0, vec1, vec2, vec3); |
428 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); | 425 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); |
429 MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); | |
430 PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, | 426 PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, |
431 src0, src1, src2, src3); | 427 src0, src1, src2, src3); |
432 ILVEV_W2_SB(src0, src1, src2, src3, src0, src2); | 428 ILVEV_W2_SB(src0, src1, src2, src3, src0, src2); |
433 src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0); | 429 src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0); |
434 CALC_MSE_AVG_B(src0, ref, var, avg); | 430 CALC_MSE_AVG_B(src0, ref, var, avg); |
435 } | 431 } |
436 | 432 |
437 vec = __msa_hadd_s_w(avg, avg); | 433 vec = __msa_hadd_s_w(avg, avg); |
438 *diff = HADD_SW_S32(vec); | 434 *diff = HADD_SW_S32(vec); |
439 | 435 |
440 return HADD_SW_S32(var); | 436 return HADD_SW_S32(var); |
441 } | 437 } |
442 | 438 |
/* Horizontal-only sub-pixel SSE + sum-of-diff for an 8-pixel-wide block,
 * using MIPS MSA vectors.
 *
 * Applies a 2-tap horizontal bilinear filter (the two 8-bit taps are loaded
 * as one halfword from `filter` and replicated across a vector) to `src`,
 * then accumulates, against `dst`, the squared error in `var` and the signed
 * pixel difference in `avg` via the CALC_MSE_AVG_B macro.
 *
 * src/src_stride   : source pixels to be filtered.
 * dst/dst_stride   : reference pixels compared against the filtered output.
 * filter           : pointer to the two bilinear filter taps (read with LH).
 * height           : number of rows; processed 4 rows per loop iteration,
 *                    so height is assumed to be a multiple of 4 — TODO confirm
 *                    against callers.
 * diff (out)       : receives the horizontally-added sum of differences.
 * returns          : the horizontally-added sum of squared errors (SSE).
 */
static uint32_t sub_pixel_sse_diff_8width_h_msa(const uint8_t *src,
                                                int32_t src_stride,
                                                const uint8_t *dst,
                                                int32_t dst_stride,
                                                const uint8_t *filter,
                                                int32_t height,
                                                int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 filt0, out, ref0, ref1, ref2, ref3;
  v16i8 src0, src1, src2, src3;
  /* Shuffle mask pairing each pixel with its right neighbor (0,1 1,2 2,3 ...)
   * so a single dot product computes the 2-tap filter for 8 outputs. */
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
  v8i16 avg = { 0 };   /* running sum of signed pixel differences */
  v4i32 vec, var = { 0 };  /* running sum of squared errors */

  /* Load both filter taps as one halfword and splat across all lanes. */
  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    /* Pack the four 8-wide reference rows into two 16-byte vectors. */
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    /* Gather adjacent-pixel pairs, filter via dot product, then round with
     * SRARI by FILTER_BITS.  (Per this diff, the previous MIN-with-255 clamp
     * was removed — rounding alone is presumed to stay within 8 bits.) */
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                vec0, vec1, vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    /* Narrow the 16-bit results back to bytes and interleave rows pairwise
     * to match the packed reference layout. */
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
                src0, src1, src2, src3);
    out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
    CALC_MSE_AVG_B(out, ref0, var, avg);
    out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
    CALC_MSE_AVG_B(out, ref1, var, avg);
  }

  /* Reduce the per-lane accumulators to scalars. */
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
490 | 483 |
491 static uint32_t sub_pixel_sse_diff_16width_h_msa(const uint8_t *src, | 484 static uint32_t sub_pixel_sse_diff_16width_h_msa(const uint8_t *src, |
492 int32_t src_stride, | 485 int32_t src_stride, |
493 const uint8_t *dst, | 486 const uint8_t *dst, |
494 int32_t dst_stride, | 487 int32_t dst_stride, |
495 const uint8_t *filter, | 488 const uint8_t *filter, |
496 int32_t height, | 489 int32_t height, |
497 int32_t *diff) { | 490 int32_t *diff) { |
498 int16_t filtval; | 491 int16_t filtval; |
499 uint32_t loop_cnt; | 492 uint32_t loop_cnt; |
500 v16i8 src0, src1, src2, src3, src4, src5, src6, src7; | 493 v16i8 src0, src1, src2, src3, src4, src5, src6, src7; |
501 v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; | 494 v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; |
502 v16u8 dst0, dst1, dst2, dst3, filt0; | 495 v16u8 dst0, dst1, dst2, dst3, filt0; |
503 v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; | 496 v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; |
504 v8u16 out0, out1, out2, out3, out4, out5, out6, out7; | 497 v8u16 out0, out1, out2, out3, out4, out5, out6, out7; |
505 v8u16 const255; | |
506 v8i16 avg = { 0 }; | 498 v8i16 avg = { 0 }; |
507 v4i32 vec, var = { 0 }; | 499 v4i32 vec, var = { 0 }; |
508 | 500 |
509 filtval = LH(filter); | 501 filtval = LH(filter); |
510 filt0 = (v16u8)__msa_fill_h(filtval); | 502 filt0 = (v16u8)__msa_fill_h(filtval); |
511 | 503 |
512 const255 = (v8u16)__msa_ldi_h(255); | |
513 | |
514 for (loop_cnt = (height >> 2); loop_cnt--;) { | 504 for (loop_cnt = (height >> 2); loop_cnt--;) { |
515 LD_SB4(src, src_stride, src0, src2, src4, src6); | 505 LD_SB4(src, src_stride, src0, src2, src4, src6); |
516 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); | 506 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); |
517 src += (4 * src_stride); | 507 src += (4 * src_stride); |
518 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); | 508 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); |
519 dst += (4 * dst_stride); | 509 dst += (4 * dst_stride); |
520 | 510 |
521 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); | 511 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); |
522 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); | 512 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); |
523 VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5); | 513 VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5); |
524 VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7); | 514 VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7); |
525 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, | 515 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, |
526 out0, out1, out2, out3); | 516 out0, out1, out2, out3); |
527 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, | 517 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, |
528 out4, out5, out6, out7); | 518 out4, out5, out6, out7); |
529 SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); | 519 SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); |
530 SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); | 520 SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); |
531 MIN_UH4_UH(out0, out1, out2, out3, const255); | |
532 MIN_UH4_UH(out4, out5, out6, out7, const255); | |
533 PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6, | 521 PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6, |
534 src0, src1, src2, src3); | 522 src0, src1, src2, src3); |
535 CALC_MSE_AVG_B(src0, dst0, var, avg); | 523 CALC_MSE_AVG_B(src0, dst0, var, avg); |
536 CALC_MSE_AVG_B(src1, dst1, var, avg); | 524 CALC_MSE_AVG_B(src1, dst1, var, avg); |
537 CALC_MSE_AVG_B(src2, dst2, var, avg); | 525 CALC_MSE_AVG_B(src2, dst2, var, avg); |
538 CALC_MSE_AVG_B(src3, dst3, var, avg); | 526 CALC_MSE_AVG_B(src3, dst3, var, avg); |
539 } | 527 } |
540 | 528 |
541 vec = __msa_hadd_s_w(avg, avg); | 529 vec = __msa_hadd_s_w(avg, avg); |
542 *diff = HADD_SW_S32(vec); | 530 *diff = HADD_SW_S32(vec); |
(...skipping 75 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
618 src += (4 * src_stride); | 606 src += (4 * src_stride); |
619 LW4(dst, dst_stride, ref0, ref1, ref2, ref3); | 607 LW4(dst, dst_stride, ref0, ref1, ref2, ref3); |
620 dst += (4 * dst_stride); | 608 dst += (4 * dst_stride); |
621 | 609 |
622 INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); | 610 INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); |
623 ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, | 611 ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, |
624 src10_r, src21_r, src32_r, src43_r); | 612 src10_r, src21_r, src32_r, src43_r); |
625 ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); | 613 ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); |
626 DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); | 614 DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); |
627 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 615 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
628 SAT_UH2_UH(tmp0, tmp1, 7); | |
629 out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); | 616 out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); |
630 CALC_MSE_AVG_B(out, ref, var, avg); | 617 CALC_MSE_AVG_B(out, ref, var, avg); |
631 src0 = src4; | 618 src0 = src4; |
632 } | 619 } |
633 | 620 |
634 vec = __msa_hadd_s_w(avg, avg); | 621 vec = __msa_hadd_s_w(avg, avg); |
635 *diff = HADD_SW_S32(vec); | 622 *diff = HADD_SW_S32(vec); |
636 | 623 |
637 return HADD_SW_S32(var); | 624 return HADD_SW_S32(var); |
638 } | 625 } |
(...skipping 26 matching lines...) Expand all Loading... |
665 src += (4 * src_stride); | 652 src += (4 * src_stride); |
666 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); | 653 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); |
667 dst += (4 * dst_stride); | 654 dst += (4 * dst_stride); |
668 | 655 |
669 PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); | 656 PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); |
670 ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, | 657 ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, |
671 vec0, vec1, vec2, vec3); | 658 vec0, vec1, vec2, vec3); |
672 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, | 659 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, |
673 tmp0, tmp1, tmp2, tmp3); | 660 tmp0, tmp1, tmp2, tmp3); |
674 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); | 661 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); |
675 SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); | |
676 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); | 662 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); |
677 CALC_MSE_AVG_B(src0, ref0, var, avg); | 663 CALC_MSE_AVG_B(src0, ref0, var, avg); |
678 CALC_MSE_AVG_B(src1, ref1, var, avg); | 664 CALC_MSE_AVG_B(src1, ref1, var, avg); |
679 src0 = src4; | 665 src0 = src4; |
680 } | 666 } |
681 | 667 |
682 vec = __msa_hadd_s_w(avg, avg); | 668 vec = __msa_hadd_s_w(avg, avg); |
683 *diff = HADD_SW_S32(vec); | 669 *diff = HADD_SW_S32(vec); |
684 | 670 |
685 return HADD_SW_S32(var); | 671 return HADD_SW_S32(var); |
(...skipping 26 matching lines...) Expand all Loading... |
712 for (loop_cnt = (height >> 2); loop_cnt--;) { | 698 for (loop_cnt = (height >> 2); loop_cnt--;) { |
713 LD_UB4(src, src_stride, src1, src2, src3, src4); | 699 LD_UB4(src, src_stride, src1, src2, src3, src4); |
714 src += (4 * src_stride); | 700 src += (4 * src_stride); |
715 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); | 701 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); |
716 dst += (4 * dst_stride); | 702 dst += (4 * dst_stride); |
717 | 703 |
718 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); | 704 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); |
719 ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); | 705 ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); |
720 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); | 706 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); |
721 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 707 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
722 SAT_UH2_UH(tmp0, tmp1, 7); | |
723 out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); | 708 out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); |
724 | 709 |
725 ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); | 710 ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); |
726 ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); | 711 ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); |
727 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); | 712 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); |
728 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); | 713 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); |
729 SAT_UH2_UH(tmp2, tmp3, 7); | |
730 out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); | 714 out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); |
731 | 715 |
732 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); | 716 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); |
733 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 717 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
734 SAT_UH2_UH(tmp0, tmp1, 7); | |
735 out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); | 718 out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); |
736 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); | 719 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); |
737 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); | 720 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); |
738 SAT_UH2_UH(tmp2, tmp3, 7); | |
739 out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); | 721 out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); |
740 | 722 |
741 src0 = src4; | 723 src0 = src4; |
742 | 724 |
743 CALC_MSE_AVG_B(out0, ref0, var, avg); | 725 CALC_MSE_AVG_B(out0, ref0, var, avg); |
744 CALC_MSE_AVG_B(out1, ref1, var, avg); | 726 CALC_MSE_AVG_B(out1, ref1, var, avg); |
745 CALC_MSE_AVG_B(out2, ref2, var, avg); | 727 CALC_MSE_AVG_B(out2, ref2, var, avg); |
746 CALC_MSE_AVG_B(out3, ref3, var, avg); | 728 CALC_MSE_AVG_B(out3, ref3, var, avg); |
747 } | 729 } |
748 | 730 |
(...skipping 82 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
831 dst += (4 * dst_stride); | 813 dst += (4 * dst_stride); |
832 INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); | 814 INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); |
833 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); | 815 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); |
834 hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); | 816 hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); |
835 hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); | 817 hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); |
836 hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); | 818 hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); |
837 hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); | 819 hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); |
838 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); | 820 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); |
839 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); | 821 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); |
840 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 822 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
841 SAT_UH2_UH(tmp0, tmp1, 7); | |
842 out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); | 823 out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); |
843 CALC_MSE_AVG_B(out, ref, var, avg); | 824 CALC_MSE_AVG_B(out, ref, var, avg); |
844 src0 = src4; | 825 src0 = src4; |
845 } | 826 } |
846 | 827 |
847 vec = __msa_hadd_s_w(avg, avg); | 828 vec = __msa_hadd_s_w(avg, avg); |
848 *diff = HADD_SW_S32(vec); | 829 *diff = HADD_SW_S32(vec); |
849 | 830 |
850 return HADD_SW_S32(var); | 831 return HADD_SW_S32(var); |
851 } | 832 } |
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
886 dst += (4 * dst_stride); | 867 dst += (4 * dst_stride); |
887 | 868 |
888 PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); | 869 PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); |
889 hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); | 870 hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); |
890 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); | 871 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); |
891 tmp0 = __msa_dotp_u_h(vec0, filt_vt); | 872 tmp0 = __msa_dotp_u_h(vec0, filt_vt); |
892 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); | 873 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); |
893 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); | 874 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); |
894 tmp1 = __msa_dotp_u_h(vec0, filt_vt); | 875 tmp1 = __msa_dotp_u_h(vec0, filt_vt); |
895 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 876 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
896 SAT_UH2_UH(tmp0, tmp1, 7); | |
897 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); | 877 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); |
898 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); | 878 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); |
899 tmp2 = __msa_dotp_u_h(vec0, filt_vt); | 879 tmp2 = __msa_dotp_u_h(vec0, filt_vt); |
900 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); | 880 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); |
901 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); | 881 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); |
902 tmp3 = __msa_dotp_u_h(vec0, filt_vt); | 882 tmp3 = __msa_dotp_u_h(vec0, filt_vt); |
903 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); | 883 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); |
904 SAT_UH2_UH(tmp2, tmp3, 7); | |
905 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | 884 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); |
906 CALC_MSE_AVG_B(out0, ref0, var, avg); | 885 CALC_MSE_AVG_B(out0, ref0, var, avg); |
907 CALC_MSE_AVG_B(out1, ref1, var, avg); | 886 CALC_MSE_AVG_B(out1, ref1, var, avg); |
908 } | 887 } |
909 | 888 |
910 vec = __msa_hadd_s_w(avg, avg); | 889 vec = __msa_hadd_s_w(avg, avg); |
911 *diff = HADD_SW_S32(vec); | 890 *diff = HADD_SW_S32(vec); |
912 | 891 |
913 return HADD_SW_S32(var); | 892 return HADD_SW_S32(var); |
914 } | 893 } |
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
948 LD_UB4(src + 8, src_stride, src1, src3, src5, src7); | 927 LD_UB4(src + 8, src_stride, src1, src3, src5, src7); |
949 src += (4 * src_stride); | 928 src += (4 * src_stride); |
950 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); | 929 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); |
951 dst += (4 * dst_stride); | 930 dst += (4 * dst_stride); |
952 | 931 |
953 hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); | 932 hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); |
954 hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); | 933 hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); |
955 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); | 934 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); |
956 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); | 935 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); |
957 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 936 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
958 SAT_UH2_UH(tmp0, tmp1, 7); | |
959 src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); | 937 src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); |
960 | 938 |
961 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); | 939 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); |
962 hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); | 940 hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); |
963 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); | 941 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); |
964 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); | 942 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); |
965 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 943 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
966 SAT_UH2_UH(tmp0, tmp1, 7); | |
967 src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); | 944 src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); |
968 | 945 |
969 hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); | 946 hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); |
970 hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); | 947 hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); |
971 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); | 948 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); |
972 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); | 949 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); |
973 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 950 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
974 SAT_UH2_UH(tmp0, tmp1, 7); | |
975 src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); | 951 src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); |
976 | 952 |
977 hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); | 953 hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); |
978 hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); | 954 hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); |
979 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); | 955 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); |
980 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); | 956 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); |
981 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 957 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
982 SAT_UH2_UH(tmp0, tmp1, 7); | |
983 src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); | 958 src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); |
984 | 959 |
985 CALC_MSE_AVG_B(src0, ref0, var, avg); | 960 CALC_MSE_AVG_B(src0, ref0, var, avg); |
986 CALC_MSE_AVG_B(src1, ref1, var, avg); | 961 CALC_MSE_AVG_B(src1, ref1, var, avg); |
987 CALC_MSE_AVG_B(src2, ref2, var, avg); | 962 CALC_MSE_AVG_B(src2, ref2, var, avg); |
988 CALC_MSE_AVG_B(src3, ref3, var, avg); | 963 CALC_MSE_AVG_B(src3, ref3, var, avg); |
989 } | 964 } |
990 | 965 |
991 vec = __msa_hadd_s_w(avg, avg); | 966 vec = __msa_hadd_s_w(avg, avg); |
992 *diff = HADD_SW_S32(vec); | 967 *diff = HADD_SW_S32(vec); |
(...skipping 57 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1050 const uint8_t *filter, | 1025 const uint8_t *filter, |
1051 int32_t height, | 1026 int32_t height, |
1052 int32_t *diff) { | 1027 int32_t *diff) { |
1053 int16_t filtval; | 1028 int16_t filtval; |
1054 uint32_t loop_cnt; | 1029 uint32_t loop_cnt; |
1055 uint32_t ref0, ref1, ref2, ref3; | 1030 uint32_t ref0, ref1, ref2, ref3; |
1056 v16u8 out, pred, filt0, ref = { 0 }; | 1031 v16u8 out, pred, filt0, ref = { 0 }; |
1057 v16i8 src0, src1, src2, src3; | 1032 v16i8 src0, src1, src2, src3; |
1058 v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; | 1033 v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; |
1059 v8u16 vec0, vec1, vec2, vec3; | 1034 v8u16 vec0, vec1, vec2, vec3; |
1060 v8u16 const255; | |
1061 v8i16 avg = { 0 }; | 1035 v8i16 avg = { 0 }; |
1062 v4i32 vec, var = { 0 }; | 1036 v4i32 vec, var = { 0 }; |
1063 | 1037 |
1064 filtval = LH(filter); | 1038 filtval = LH(filter); |
1065 filt0 = (v16u8)__msa_fill_h(filtval); | 1039 filt0 = (v16u8)__msa_fill_h(filtval); |
1066 | 1040 |
1067 const255 = (v8u16)__msa_ldi_h(255); | |
1068 | |
1069 for (loop_cnt = (height >> 2); loop_cnt--;) { | 1041 for (loop_cnt = (height >> 2); loop_cnt--;) { |
1070 LD_SB4(src, src_stride, src0, src1, src2, src3); | 1042 LD_SB4(src, src_stride, src0, src1, src2, src3); |
1071 src += (4 * src_stride); | 1043 src += (4 * src_stride); |
1072 pred = LD_UB(sec_pred); | 1044 pred = LD_UB(sec_pred); |
1073 sec_pred += 16; | 1045 sec_pred += 16; |
1074 LW4(dst, dst_stride, ref0, ref1, ref2, ref3); | 1046 LW4(dst, dst_stride, ref0, ref1, ref2, ref3); |
1075 dst += (4 * dst_stride); | 1047 dst += (4 * dst_stride); |
1076 | 1048 |
1077 INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); | 1049 INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); |
1078 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); | 1050 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); |
1079 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); | 1051 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); |
1080 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, | 1052 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, |
1081 vec0, vec1, vec2, vec3); | 1053 vec0, vec1, vec2, vec3); |
1082 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); | 1054 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); |
1083 MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); | |
1084 PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, | 1055 PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, |
1085 src0, src1, src2, src3); | 1056 src0, src1, src2, src3); |
1086 ILVEV_W2_SB(src0, src1, src2, src3, src0, src2); | 1057 ILVEV_W2_SB(src0, src1, src2, src3, src0, src2); |
1087 out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0); | 1058 out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0); |
1088 out = __msa_aver_u_b(out, pred); | 1059 out = __msa_aver_u_b(out, pred); |
1089 CALC_MSE_AVG_B(out, ref, var, avg); | 1060 CALC_MSE_AVG_B(out, ref, var, avg); |
1090 } | 1061 } |
1091 | 1062 |
1092 vec = __msa_hadd_s_w(avg, avg); | 1063 vec = __msa_hadd_s_w(avg, avg); |
1093 *diff = HADD_SW_S32(vec); | 1064 *diff = HADD_SW_S32(vec); |
1094 | 1065 |
1095 return HADD_SW_S32(var); | 1066 return HADD_SW_S32(var); |
1096 } | 1067 } |
1097 | 1068 |
1098 static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(const uint8_t *src, | 1069 static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(const uint8_t *src, |
1099 int32_t src_stride, | 1070 int32_t src_stride, |
1100 const uint8_t *dst, | 1071 const uint8_t *dst, |
1101 int32_t dst_stride, | 1072 int32_t dst_stride, |
1102 const uint8_t *sec_pred, | 1073 const uint8_t *sec_pred, |
1103 const uint8_t *filter, | 1074 const uint8_t *filter, |
1104 int32_t height, | 1075 int32_t height, |
1105 int32_t *diff) { | 1076 int32_t *diff) { |
1106 int16_t filtval; | 1077 int16_t filtval; |
1107 uint32_t loop_cnt; | 1078 uint32_t loop_cnt; |
1108 v16u8 out, pred, filt0; | 1079 v16u8 out, pred, filt0; |
1109 v16u8 ref0, ref1, ref2, ref3; | 1080 v16u8 ref0, ref1, ref2, ref3; |
1110 v16i8 src0, src1, src2, src3; | 1081 v16i8 src0, src1, src2, src3; |
1111 v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; | 1082 v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; |
1112 v8u16 vec0, vec1, vec2, vec3; | 1083 v8u16 vec0, vec1, vec2, vec3; |
1113 v8u16 const255; | |
1114 v8i16 avg = { 0 }; | 1084 v8i16 avg = { 0 }; |
1115 v4i32 vec, var = { 0 }; | 1085 v4i32 vec, var = { 0 }; |
1116 | 1086 |
1117 filtval = LH(filter); | 1087 filtval = LH(filter); |
1118 filt0 = (v16u8)__msa_fill_h(filtval); | 1088 filt0 = (v16u8)__msa_fill_h(filtval); |
1119 | 1089 |
1120 const255 = (v8u16)__msa_ldi_h(255); | |
1121 | |
1122 for (loop_cnt = (height >> 2); loop_cnt--;) { | 1090 for (loop_cnt = (height >> 2); loop_cnt--;) { |
1123 LD_SB4(src, src_stride, src0, src1, src2, src3); | 1091 LD_SB4(src, src_stride, src0, src1, src2, src3); |
1124 src += (4 * src_stride); | 1092 src += (4 * src_stride); |
1125 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); | 1093 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); |
1126 dst += (4 * dst_stride); | 1094 dst += (4 * dst_stride); |
1127 | 1095 |
1128 PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); | 1096 PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); |
1129 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); | 1097 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); |
1130 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); | 1098 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); |
1131 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, | 1099 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, |
1132 vec0, vec1, vec2, vec3); | 1100 vec0, vec1, vec2, vec3); |
1133 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); | 1101 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); |
1134 MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); | |
1135 PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, | 1102 PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, |
1136 src0, src1, src2, src3); | 1103 src0, src1, src2, src3); |
1137 out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0); | 1104 out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0); |
1138 | 1105 |
1139 pred = LD_UB(sec_pred); | 1106 pred = LD_UB(sec_pred); |
1140 sec_pred += 16; | 1107 sec_pred += 16; |
1141 out = __msa_aver_u_b(out, pred); | 1108 out = __msa_aver_u_b(out, pred); |
1142 CALC_MSE_AVG_B(out, ref0, var, avg); | 1109 CALC_MSE_AVG_B(out, ref0, var, avg); |
1143 out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2); | 1110 out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2); |
1144 pred = LD_UB(sec_pred); | 1111 pred = LD_UB(sec_pred); |
(...skipping 19 matching lines...) Expand all Loading... |
1164 int32_t width) { | 1131 int32_t width) { |
1165 int16_t filtval; | 1132 int16_t filtval; |
1166 uint32_t loop_cnt; | 1133 uint32_t loop_cnt; |
1167 v16i8 src0, src1, src2, src3, src4, src5, src6, src7; | 1134 v16i8 src0, src1, src2, src3, src4, src5, src6, src7; |
1168 v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; | 1135 v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; |
1169 v16u8 dst0, dst1, dst2, dst3; | 1136 v16u8 dst0, dst1, dst2, dst3; |
1170 v16u8 tmp0, tmp1, tmp2, tmp3; | 1137 v16u8 tmp0, tmp1, tmp2, tmp3; |
1171 v16u8 pred0, pred1, pred2, pred3, filt0; | 1138 v16u8 pred0, pred1, pred2, pred3, filt0; |
1172 v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; | 1139 v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; |
1173 v8u16 out0, out1, out2, out3, out4, out5, out6, out7; | 1140 v8u16 out0, out1, out2, out3, out4, out5, out6, out7; |
1174 v8u16 const255; | |
1175 v8i16 avg = { 0 }; | 1141 v8i16 avg = { 0 }; |
1176 v4i32 vec, var = { 0 }; | 1142 v4i32 vec, var = { 0 }; |
1177 | 1143 |
1178 filtval = LH(filter); | 1144 filtval = LH(filter); |
1179 filt0 = (v16u8)__msa_fill_h(filtval); | 1145 filt0 = (v16u8)__msa_fill_h(filtval); |
1180 | 1146 |
1181 const255 = (v8u16)__msa_ldi_h(255); | |
1182 | |
1183 for (loop_cnt = (height >> 2); loop_cnt--;) { | 1147 for (loop_cnt = (height >> 2); loop_cnt--;) { |
1184 LD_SB4(src, src_stride, src0, src2, src4, src6); | 1148 LD_SB4(src, src_stride, src0, src2, src4, src6); |
1185 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); | 1149 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); |
1186 src += (4 * src_stride); | 1150 src += (4 * src_stride); |
1187 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); | 1151 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); |
1188 dst += (4 * dst_stride); | 1152 dst += (4 * dst_stride); |
1189 LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); | 1153 LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); |
1190 sec_pred += (4 * width); | 1154 sec_pred += (4 * width); |
1191 | 1155 |
1192 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); | 1156 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); |
1193 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); | 1157 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); |
1194 VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5); | 1158 VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5); |
1195 VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7); | 1159 VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7); |
1196 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, | 1160 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, |
1197 out0, out1, out2, out3); | 1161 out0, out1, out2, out3); |
1198 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, | 1162 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, |
1199 out4, out5, out6, out7); | 1163 out4, out5, out6, out7); |
1200 SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); | 1164 SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); |
1201 SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); | 1165 SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); |
1202 MIN_UH4_UH(out0, out1, out2, out3, const255); | |
1203 MIN_UH4_UH(out4, out5, out6, out7, const255); | |
1204 PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6, | 1166 PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6, |
1205 tmp0, tmp1, tmp2, tmp3); | 1167 tmp0, tmp1, tmp2, tmp3); |
1206 AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3, | 1168 AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3, |
1207 tmp0, tmp1, tmp2, tmp3); | 1169 tmp0, tmp1, tmp2, tmp3); |
1208 | 1170 |
1209 CALC_MSE_AVG_B(tmp0, dst0, var, avg); | 1171 CALC_MSE_AVG_B(tmp0, dst0, var, avg); |
1210 CALC_MSE_AVG_B(tmp1, dst1, var, avg); | 1172 CALC_MSE_AVG_B(tmp1, dst1, var, avg); |
1211 CALC_MSE_AVG_B(tmp2, dst2, var, avg); | 1173 CALC_MSE_AVG_B(tmp2, dst2, var, avg); |
1212 CALC_MSE_AVG_B(tmp3, dst3, var, avg); | 1174 CALC_MSE_AVG_B(tmp3, dst3, var, avg); |
1213 } | 1175 } |
(...skipping 98 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1312 sec_pred += 16; | 1274 sec_pred += 16; |
1313 LW4(dst, dst_stride, ref0, ref1, ref2, ref3); | 1275 LW4(dst, dst_stride, ref0, ref1, ref2, ref3); |
1314 dst += (4 * dst_stride); | 1276 dst += (4 * dst_stride); |
1315 | 1277 |
1316 INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); | 1278 INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); |
1317 ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, | 1279 ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, |
1318 src10_r, src21_r, src32_r, src43_r); | 1280 src10_r, src21_r, src32_r, src43_r); |
1319 ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); | 1281 ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); |
1320 DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); | 1282 DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); |
1321 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 1283 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
1322 SAT_UH2_UH(tmp0, tmp1, 7); | |
1323 | 1284 |
1324 out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); | 1285 out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); |
1325 out = __msa_aver_u_b(out, pred); | 1286 out = __msa_aver_u_b(out, pred); |
1326 CALC_MSE_AVG_B(out, ref, var, avg); | 1287 CALC_MSE_AVG_B(out, ref, var, avg); |
1327 src0 = src4; | 1288 src0 = src4; |
1328 } | 1289 } |
1329 | 1290 |
1330 vec = __msa_hadd_s_w(avg, avg); | 1291 vec = __msa_hadd_s_w(avg, avg); |
1331 *diff = HADD_SW_S32(vec); | 1292 *diff = HADD_SW_S32(vec); |
1332 | 1293 |
(...skipping 30 matching lines...) Expand all Loading... |
1363 LD_UB2(sec_pred, 16, pred0, pred1); | 1324 LD_UB2(sec_pred, 16, pred0, pred1); |
1364 sec_pred += 32; | 1325 sec_pred += 32; |
1365 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); | 1326 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); |
1366 dst += (4 * dst_stride); | 1327 dst += (4 * dst_stride); |
1367 PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); | 1328 PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); |
1368 ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, | 1329 ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, |
1369 vec0, vec1, vec2, vec3); | 1330 vec0, vec1, vec2, vec3); |
1370 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, | 1331 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, |
1371 tmp0, tmp1, tmp2, tmp3); | 1332 tmp0, tmp1, tmp2, tmp3); |
1372 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); | 1333 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); |
1373 SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); | |
1374 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); | 1334 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); |
1375 AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); | 1335 AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); |
1376 CALC_MSE_AVG_B(src0, ref0, var, avg); | 1336 CALC_MSE_AVG_B(src0, ref0, var, avg); |
1377 CALC_MSE_AVG_B(src1, ref1, var, avg); | 1337 CALC_MSE_AVG_B(src1, ref1, var, avg); |
1378 | 1338 |
1379 src0 = src4; | 1339 src0 = src4; |
1380 } | 1340 } |
1381 | 1341 |
1382 vec = __msa_hadd_s_w(avg, avg); | 1342 vec = __msa_hadd_s_w(avg, avg); |
1383 *diff = HADD_SW_S32(vec); | 1343 *diff = HADD_SW_S32(vec); |
(...skipping 30 matching lines...) Expand all Loading... |
1414 for (loop_cnt = (height >> 2); loop_cnt--;) { | 1374 for (loop_cnt = (height >> 2); loop_cnt--;) { |
1415 LD_UB4(src, src_stride, src1, src2, src3, src4); | 1375 LD_UB4(src, src_stride, src1, src2, src3, src4); |
1416 src += (4 * src_stride); | 1376 src += (4 * src_stride); |
1417 LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); | 1377 LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); |
1418 sec_pred += (4 * width); | 1378 sec_pred += (4 * width); |
1419 | 1379 |
1420 ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2); | 1380 ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2); |
1421 ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3); | 1381 ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3); |
1422 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); | 1382 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); |
1423 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 1383 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
1424 SAT_UH2_UH(tmp0, tmp1, 7); | |
1425 out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); | 1384 out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); |
1426 | 1385 |
1427 ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6); | 1386 ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6); |
1428 ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7); | 1387 ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7); |
1429 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); | 1388 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); |
1430 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); | 1389 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); |
1431 SAT_UH2_UH(tmp2, tmp3, 7); | |
1432 out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); | 1390 out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); |
1433 | 1391 |
1434 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); | 1392 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); |
1435 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 1393 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
1436 SAT_UH2_UH(tmp0, tmp1, 7); | |
1437 out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); | 1394 out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); |
1438 | 1395 |
1439 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); | 1396 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); |
1440 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); | 1397 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); |
1441 SAT_UH2_UH(tmp2, tmp3, 7); | |
1442 out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); | 1398 out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); |
1443 | 1399 |
1444 src0 = src4; | 1400 src0 = src4; |
1445 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); | 1401 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); |
1446 dst += (4 * dst_stride); | 1402 dst += (4 * dst_stride); |
1447 | 1403 |
1448 AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, | 1404 AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, |
1449 out0, out1, out2, out3); | 1405 out0, out1, out2, out3); |
1450 | 1406 |
1451 CALC_MSE_AVG_B(out0, ref0, var, avg); | 1407 CALC_MSE_AVG_B(out0, ref0, var, avg); |
(...skipping 104 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1556 dst += (4 * dst_stride); | 1512 dst += (4 * dst_stride); |
1557 INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); | 1513 INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); |
1558 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); | 1514 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); |
1559 hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); | 1515 hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); |
1560 hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); | 1516 hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); |
1561 hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); | 1517 hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); |
1562 hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); | 1518 hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); |
1563 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); | 1519 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); |
1564 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); | 1520 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); |
1565 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 1521 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
1566 SAT_UH2_UH(tmp0, tmp1, 7); | |
1567 out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); | 1522 out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); |
1568 out = __msa_aver_u_b(out, pred); | 1523 out = __msa_aver_u_b(out, pred); |
1569 CALC_MSE_AVG_B(out, ref, var, avg); | 1524 CALC_MSE_AVG_B(out, ref, var, avg); |
1570 src0 = src4; | 1525 src0 = src4; |
1571 } | 1526 } |
1572 | 1527 |
1573 vec = __msa_hadd_s_w(avg, avg); | 1528 vec = __msa_hadd_s_w(avg, avg); |
1574 *diff = HADD_SW_S32(vec); | 1529 *diff = HADD_SW_S32(vec); |
1575 | 1530 |
1576 return HADD_SW_S32(var); | 1531 return HADD_SW_S32(var); |
(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1613 PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); | 1568 PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); |
1614 hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); | 1569 hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); |
1615 | 1570 |
1616 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); | 1571 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); |
1617 tmp0 = __msa_dotp_u_h(vec0, filt_vt); | 1572 tmp0 = __msa_dotp_u_h(vec0, filt_vt); |
1618 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); | 1573 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); |
1619 | 1574 |
1620 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); | 1575 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); |
1621 tmp1 = __msa_dotp_u_h(vec0, filt_vt); | 1576 tmp1 = __msa_dotp_u_h(vec0, filt_vt); |
1622 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 1577 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
1623 SAT_UH2_UH(tmp0, tmp1, 7); | |
1624 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); | 1578 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); |
1625 | 1579 |
1626 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); | 1580 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); |
1627 tmp2 = __msa_dotp_u_h(vec0, filt_vt); | 1581 tmp2 = __msa_dotp_u_h(vec0, filt_vt); |
1628 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); | 1582 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); |
1629 | 1583 |
1630 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); | 1584 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); |
1631 tmp3 = __msa_dotp_u_h(vec0, filt_vt); | 1585 tmp3 = __msa_dotp_u_h(vec0, filt_vt); |
1632 | 1586 |
1633 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); | 1587 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); |
1634 SAT_UH2_UH(tmp2, tmp3, 7); | |
1635 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | 1588 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); |
1636 AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1); | 1589 AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1); |
1637 | 1590 |
1638 CALC_MSE_AVG_B(out0, ref0, var, avg); | 1591 CALC_MSE_AVG_B(out0, ref0, var, avg); |
1639 CALC_MSE_AVG_B(out1, ref1, var, avg); | 1592 CALC_MSE_AVG_B(out1, ref1, var, avg); |
1640 } | 1593 } |
1641 | 1594 |
1642 vec = __msa_hadd_s_w(avg, avg); | 1595 vec = __msa_hadd_s_w(avg, avg); |
1643 *diff = HADD_SW_S32(vec); | 1596 *diff = HADD_SW_S32(vec); |
1644 | 1597 |
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1683 LD_UB4(src + 8, src_stride, src1, src3, src5, src7); | 1636 LD_UB4(src + 8, src_stride, src1, src3, src5, src7); |
1684 src += (4 * src_stride); | 1637 src += (4 * src_stride); |
1685 LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); | 1638 LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); |
1686 sec_pred += (4 * width); | 1639 sec_pred += (4 * width); |
1687 | 1640 |
1688 hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); | 1641 hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); |
1689 hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); | 1642 hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); |
1690 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); | 1643 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); |
1691 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); | 1644 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); |
1692 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 1645 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
1693 SAT_UH2_UH(tmp0, tmp1, 7); | |
1694 out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); | 1646 out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); |
1695 | 1647 |
1696 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); | 1648 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); |
1697 hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); | 1649 hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); |
1698 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); | 1650 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); |
1699 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); | 1651 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); |
1700 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 1652 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
1701 SAT_UH2_UH(tmp0, tmp1, 7); | |
1702 out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); | 1653 out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); |
1703 | 1654 |
1704 hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); | 1655 hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); |
1705 hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); | 1656 hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); |
1706 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); | 1657 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); |
1707 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); | 1658 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); |
1708 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 1659 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
1709 SAT_UH2_UH(tmp0, tmp1, 7); | |
1710 out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); | 1660 out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); |
1711 | 1661 |
1712 hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); | 1662 hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); |
1713 hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); | 1663 hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); |
1714 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); | 1664 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); |
1715 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); | 1665 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); |
1716 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 1666 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
1717 SAT_UH2_UH(tmp0, tmp1, 7); | |
1718 out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); | 1667 out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); |
1719 | 1668 |
1720 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); | 1669 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); |
1721 dst += (4 * dst_stride); | 1670 dst += (4 * dst_stride); |
1722 | 1671 |
1723 AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, | 1672 AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, |
1724 out0, out1, out2, out3); | 1673 out0, out1, out2, out3); |
1725 | 1674 |
1726 CALC_MSE_AVG_B(out0, ref0, var, avg); | 1675 CALC_MSE_AVG_B(out0, ref0, var, avg); |
1727 CALC_MSE_AVG_B(out1, ref1, var, avg); | 1676 CALC_MSE_AVG_B(out1, ref1, var, avg); |
(...skipping 266 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1994 ref_ptr, ref_stride, \ | 1943 ref_ptr, ref_stride, \ |
1995 sec_pred, &diff); \ | 1944 sec_pred, &diff); \ |
1996 } \ | 1945 } \ |
1997 } \ | 1946 } \ |
1998 \ | 1947 \ |
1999 return VARIANCE_64Wx##ht##H(*sse, diff); \ | 1948 return VARIANCE_64Wx##ht##H(*sse, diff); \ |
2000 } | 1949 } |
2001 | 1950 |
2002 VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32); | 1951 VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32); |
2003 VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64); | 1952 VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64); |
OLD | NEW |