| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 388 matching lines...) |
| 399 const uint8_t *filter, | 399 const uint8_t *filter, |
| 400 int32_t height, | 400 int32_t height, |
| 401 int32_t *diff) { | 401 int32_t *diff) { |
| 402 int16_t filtval; | 402 int16_t filtval; |
| 403 uint32_t loop_cnt; | 403 uint32_t loop_cnt; |
| 404 uint32_t ref0, ref1, ref2, ref3; | 404 uint32_t ref0, ref1, ref2, ref3; |
| 405 v16u8 filt0, ref = { 0 }; | 405 v16u8 filt0, ref = { 0 }; |
| 406 v16i8 src0, src1, src2, src3; | 406 v16i8 src0, src1, src2, src3; |
| 407 v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; | 407 v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; |
| 408 v8u16 vec0, vec1, vec2, vec3; | 408 v8u16 vec0, vec1, vec2, vec3; |
| 409 v8u16 const255; | |
| 410 v8i16 avg = { 0 }; | 409 v8i16 avg = { 0 }; |
| 411 v4i32 vec, var = { 0 }; | 410 v4i32 vec, var = { 0 }; |
| 412 | 411 |
| 413 filtval = LH(filter); | 412 filtval = LH(filter); |
| 414 filt0 = (v16u8)__msa_fill_h(filtval); | 413 filt0 = (v16u8)__msa_fill_h(filtval); |
| 415 | 414 |
| 416 const255 = (v8u16)__msa_ldi_h(255); | |
| 417 | |
| 418 for (loop_cnt = (height >> 2); loop_cnt--;) { | 415 for (loop_cnt = (height >> 2); loop_cnt--;) { |
| 419 LD_SB4(src, src_stride, src0, src1, src2, src3); | 416 LD_SB4(src, src_stride, src0, src1, src2, src3); |
| 420 src += (4 * src_stride); | 417 src += (4 * src_stride); |
| 421 LW4(dst, dst_stride, ref0, ref1, ref2, ref3); | 418 LW4(dst, dst_stride, ref0, ref1, ref2, ref3); |
| 422 dst += (4 * dst_stride); | 419 dst += (4 * dst_stride); |
| 423 INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); | 420 INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); |
| 424 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); | 421 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); |
| 425 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); | 422 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); |
| 426 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, | 423 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, |
| 427 vec0, vec1, vec2, vec3); | 424 vec0, vec1, vec2, vec3); |
| 428 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); | 425 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); |
| 429 MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); | |
| 430 PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, | 426 PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, |
| 431 src0, src1, src2, src3); | 427 src0, src1, src2, src3); |
| 432 ILVEV_W2_SB(src0, src1, src2, src3, src0, src2); | 428 ILVEV_W2_SB(src0, src1, src2, src3, src0, src2); |
| 433 src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0); | 429 src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0); |
| 434 CALC_MSE_AVG_B(src0, ref, var, avg); | 430 CALC_MSE_AVG_B(src0, ref, var, avg); |
| 435 } | 431 } |
| 436 | 432 |
| 437 vec = __msa_hadd_s_w(avg, avg); | 433 vec = __msa_hadd_s_w(avg, avg); |
| 438 *diff = HADD_SW_S32(vec); | 434 *diff = HADD_SW_S32(vec); |
| 439 | 435 |
| 440 return HADD_SW_S32(var); | 436 return HADD_SW_S32(var); |
| 441 } | 437 } |
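
Annotation: the const255 / MIN_UH4_UH deletions above are dead-code removal rather than a behavior change. The two bilinear taps loaded by LH(filter) sum to 1 << FILTER_BITS (128 in libvpx), so the rounded dot product of two 8-bit pixels can never exceed 255 and the clamp never fires. A minimal, self-contained scalar sketch of that bound (FILTER_BITS and the tap-sum constraint come from libvpx; the rest is illustrative):

#include <assert.h>
#include <stdint.h>

#define FILTER_BITS 7 /* bilinear taps sum to 1 << FILTER_BITS */

/* Rounded shift performed by SRARI_H: add 2^(FILTER_BITS - 1), shift. */
static uint16_t srari(uint32_t acc) {
  return (uint16_t)((acc + (1u << (FILTER_BITS - 1))) >> FILTER_BITS);
}

int main(void) {
  uint32_t a, p, q;
  for (a = 0; a <= 128; ++a) {    /* every tap split a + b = 128 */
    const uint32_t b = 128 - a;
    for (p = 0; p <= 255; ++p) {  /* every 8-bit pixel pair */
      for (q = 0; q <= 255; ++q) {
        assert(srari(a * p + b * q) <= 255); /* clamping to 255 is a no-op */
      }
    }
  }
  return 0;
}
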
| 442 | 438 |
| 443 static uint32_t sub_pixel_sse_diff_8width_h_msa(const uint8_t *src, | 439 static uint32_t sub_pixel_sse_diff_8width_h_msa(const uint8_t *src, |
| 444 int32_t src_stride, | 440 int32_t src_stride, |
| 445 const uint8_t *dst, | 441 const uint8_t *dst, |
| 446 int32_t dst_stride, | 442 int32_t dst_stride, |
| 447 const uint8_t *filter, | 443 const uint8_t *filter, |
| 448 int32_t height, | 444 int32_t height, |
| 449 int32_t *diff) { | 445 int32_t *diff) { |
| 450 int16_t filtval; | 446 int16_t filtval; |
| 451 uint32_t loop_cnt; | 447 uint32_t loop_cnt; |
| 452 v16u8 filt0, out, ref0, ref1, ref2, ref3; | 448 v16u8 filt0, out, ref0, ref1, ref2, ref3; |
| 453 v16i8 src0, src1, src2, src3; | 449 v16i8 src0, src1, src2, src3; |
| 454 v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; | 450 v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; |
| 455 v8u16 vec0, vec1, vec2, vec3, const255; | 451 v8u16 vec0, vec1, vec2, vec3; |
| 456 v8i16 avg = { 0 }; | 452 v8i16 avg = { 0 }; |
| 457 v4i32 vec, var = { 0 }; | 453 v4i32 vec, var = { 0 }; |
| 458 | 454 |
| 459 filtval = LH(filter); | 455 filtval = LH(filter); |
| 460 filt0 = (v16u8)__msa_fill_h(filtval); | 456 filt0 = (v16u8)__msa_fill_h(filtval); |
| 461 | 457 |
| 462 const255 = (v8u16)__msa_ldi_h(255); | |
| 463 | |
| 464 for (loop_cnt = (height >> 2); loop_cnt--;) { | 458 for (loop_cnt = (height >> 2); loop_cnt--;) { |
| 465 LD_SB4(src, src_stride, src0, src1, src2, src3); | 459 LD_SB4(src, src_stride, src0, src1, src2, src3); |
| 466 src += (4 * src_stride); | 460 src += (4 * src_stride); |
| 467 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); | 461 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); |
| 468 dst += (4 * dst_stride); | 462 dst += (4 * dst_stride); |
| 469 | 463 |
| 470 PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); | 464 PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); |
| 471 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); | 465 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); |
| 472 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); | 466 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); |
| 473 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, | 467 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, |
| 474 vec0, vec1, vec2, vec3); | 468 vec0, vec1, vec2, vec3); |
| 475 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); | 469 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); |
| 476 MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); | |
| 477 PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, | 470 PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, |
| 478 src0, src1, src2, src3); | 471 src0, src1, src2, src3); |
| 479 out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0); | 472 out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0); |
| 480 CALC_MSE_AVG_B(out, ref0, var, avg); | 473 CALC_MSE_AVG_B(out, ref0, var, avg); |
| 481 out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2); | 474 out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2); |
| 482 CALC_MSE_AVG_B(out, ref1, var, avg); | 475 CALC_MSE_AVG_B(out, ref1, var, avg); |
| 483 } | 476 } |
| 484 | 477 |
| 485 vec = __msa_hadd_s_w(avg, avg); | 478 vec = __msa_hadd_s_w(avg, avg); |
| 486 *diff = HADD_SW_S32(vec); | 479 *diff = HADD_SW_S32(vec); |
| 487 | 480 |
| 488 return HADD_SW_S32(var); | 481 return HADD_SW_S32(var); |
| 489 } | 482 } |
| 490 | 483 |
| 491 static uint32_t sub_pixel_sse_diff_16width_h_msa(const uint8_t *src, | 484 static uint32_t sub_pixel_sse_diff_16width_h_msa(const uint8_t *src, |
| 492 int32_t src_stride, | 485 int32_t src_stride, |
| 493 const uint8_t *dst, | 486 const uint8_t *dst, |
| 494 int32_t dst_stride, | 487 int32_t dst_stride, |
| 495 const uint8_t *filter, | 488 const uint8_t *filter, |
| 496 int32_t height, | 489 int32_t height, |
| 497 int32_t *diff) { | 490 int32_t *diff) { |
| 498 int16_t filtval; | 491 int16_t filtval; |
| 499 uint32_t loop_cnt; | 492 uint32_t loop_cnt; |
| 500 v16i8 src0, src1, src2, src3, src4, src5, src6, src7; | 493 v16i8 src0, src1, src2, src3, src4, src5, src6, src7; |
| 501 v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; | 494 v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; |
| 502 v16u8 dst0, dst1, dst2, dst3, filt0; | 495 v16u8 dst0, dst1, dst2, dst3, filt0; |
| 503 v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; | 496 v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; |
| 504 v8u16 out0, out1, out2, out3, out4, out5, out6, out7; | 497 v8u16 out0, out1, out2, out3, out4, out5, out6, out7; |
| 505 v8u16 const255; | |
| 506 v8i16 avg = { 0 }; | 498 v8i16 avg = { 0 }; |
| 507 v4i32 vec, var = { 0 }; | 499 v4i32 vec, var = { 0 }; |
| 508 | 500 |
| 509 filtval = LH(filter); | 501 filtval = LH(filter); |
| 510 filt0 = (v16u8)__msa_fill_h(filtval); | 502 filt0 = (v16u8)__msa_fill_h(filtval); |
| 511 | 503 |
| 512 const255 = (v8u16)__msa_ldi_h(255); | |
| 513 | |
| 514 for (loop_cnt = (height >> 2); loop_cnt--;) { | 504 for (loop_cnt = (height >> 2); loop_cnt--;) { |
| 515 LD_SB4(src, src_stride, src0, src2, src4, src6); | 505 LD_SB4(src, src_stride, src0, src2, src4, src6); |
| 516 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); | 506 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); |
| 517 src += (4 * src_stride); | 507 src += (4 * src_stride); |
| 518 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); | 508 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); |
| 519 dst += (4 * dst_stride); | 509 dst += (4 * dst_stride); |
| 520 | 510 |
| 521 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); | 511 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); |
| 522 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); | 512 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); |
| 523 VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5); | 513 VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5); |
| 524 VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7); | 514 VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7); |
| 525 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, | 515 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, |
| 526 out0, out1, out2, out3); | 516 out0, out1, out2, out3); |
| 527 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, | 517 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, |
| 528 out4, out5, out6, out7); | 518 out4, out5, out6, out7); |
| 529 SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); | 519 SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); |
| 530 SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); | 520 SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); |
| 531 MIN_UH4_UH(out0, out1, out2, out3, const255); | |
| 532 MIN_UH4_UH(out4, out5, out6, out7, const255); | |
| 533 PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6, | 521 PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6, |
| 534 src0, src1, src2, src3); | 522 src0, src1, src2, src3); |
| 535 CALC_MSE_AVG_B(src0, dst0, var, avg); | 523 CALC_MSE_AVG_B(src0, dst0, var, avg); |
| 536 CALC_MSE_AVG_B(src1, dst1, var, avg); | 524 CALC_MSE_AVG_B(src1, dst1, var, avg); |
| 537 CALC_MSE_AVG_B(src2, dst2, var, avg); | 525 CALC_MSE_AVG_B(src2, dst2, var, avg); |
| 538 CALC_MSE_AVG_B(src3, dst3, var, avg); | 526 CALC_MSE_AVG_B(src3, dst3, var, avg); |
| 539 } | 527 } |
| 540 | 528 |
| 541 vec = __msa_hadd_s_w(avg, avg); | 529 vec = __msa_hadd_s_w(avg, avg); |
| 542 *diff = HADD_SW_S32(vec); | 530 *diff = HADD_SW_S32(vec); |
| (...skipping 75 matching lines...) |
| 618 src += (4 * src_stride); | 606 src += (4 * src_stride); |
| 619 LW4(dst, dst_stride, ref0, ref1, ref2, ref3); | 607 LW4(dst, dst_stride, ref0, ref1, ref2, ref3); |
| 620 dst += (4 * dst_stride); | 608 dst += (4 * dst_stride); |
| 621 | 609 |
| 622 INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); | 610 INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); |
| 623 ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, | 611 ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, |
| 624 src10_r, src21_r, src32_r, src43_r); | 612 src10_r, src21_r, src32_r, src43_r); |
| 625 ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); | 613 ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); |
| 626 DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); | 614 DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); |
| 627 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 615 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
| 628 SAT_UH2_UH(tmp0, tmp1, 7); | |
| 629 out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); | 616 out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); |
| 630 CALC_MSE_AVG_B(out, ref, var, avg); | 617 CALC_MSE_AVG_B(out, ref, var, avg); |
| 631 src0 = src4; | 618 src0 = src4; |
| 632 } | 619 } |
| 633 | 620 |
| 634 vec = __msa_hadd_s_w(avg, avg); | 621 vec = __msa_hadd_s_w(avg, avg); |
| 635 *diff = HADD_SW_S32(vec); | 622 *diff = HADD_SW_S32(vec); |
| 636 | 623 |
| 637 return HADD_SW_S32(var); | 624 return HADD_SW_S32(var); |
| 638 } | 625 } |
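
Annotation: the SAT_UH2_UH(..., 7) deletions in the vertical paths rest on the same bound. As I read the MSA spec, SAT_U.H with saturation position 7 clamps each unsigned halfword to (1 << 8) - 1 = 255, which the post-SRARI_H values already satisfy. A scalar model of that instruction (my reading of the semantics, not libvpx code):

#include <stdint.h>

/* Scalar model of MSA SAT_U.H: clamp an unsigned value to n + 1 bits. */
uint16_t sat_u_h(uint16_t x, unsigned n) {
  const uint16_t max = (uint16_t)((1u << (n + 1)) - 1);
  return x > max ? max : x;
}

/* sat_u_h(v, 7) == v for every v <= 255, so after SRARI_H the call
   cannot change any lane and is safe to delete. */
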
| (...skipping 26 matching lines...) |
| 665 src += (4 * src_stride); | 652 src += (4 * src_stride); |
| 666 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); | 653 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); |
| 667 dst += (4 * dst_stride); | 654 dst += (4 * dst_stride); |
| 668 | 655 |
| 669 PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); | 656 PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); |
| 670 ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, | 657 ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, |
| 671 vec0, vec1, vec2, vec3); | 658 vec0, vec1, vec2, vec3); |
| 672 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, | 659 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, |
| 673 tmp0, tmp1, tmp2, tmp3); | 660 tmp0, tmp1, tmp2, tmp3); |
| 674 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); | 661 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); |
| 675 SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); | |
| 676 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); | 662 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); |
| 677 CALC_MSE_AVG_B(src0, ref0, var, avg); | 663 CALC_MSE_AVG_B(src0, ref0, var, avg); |
| 678 CALC_MSE_AVG_B(src1, ref1, var, avg); | 664 CALC_MSE_AVG_B(src1, ref1, var, avg); |
| 679 src0 = src4; | 665 src0 = src4; |
| 680 } | 666 } |
| 681 | 667 |
| 682 vec = __msa_hadd_s_w(avg, avg); | 668 vec = __msa_hadd_s_w(avg, avg); |
| 683 *diff = HADD_SW_S32(vec); | 669 *diff = HADD_SW_S32(vec); |
| 684 | 670 |
| 685 return HADD_SW_S32(var); | 671 return HADD_SW_S32(var); |
| (...skipping 26 matching lines...) |
| 712 for (loop_cnt = (height >> 2); loop_cnt--;) { | 698 for (loop_cnt = (height >> 2); loop_cnt--;) { |
| 713 LD_UB4(src, src_stride, src1, src2, src3, src4); | 699 LD_UB4(src, src_stride, src1, src2, src3, src4); |
| 714 src += (4 * src_stride); | 700 src += (4 * src_stride); |
| 715 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); | 701 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); |
| 716 dst += (4 * dst_stride); | 702 dst += (4 * dst_stride); |
| 717 | 703 |
| 718 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); | 704 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); |
| 719 ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); | 705 ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); |
| 720 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); | 706 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); |
| 721 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 707 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
| 722 SAT_UH2_UH(tmp0, tmp1, 7); | |
| 723 out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); | 708 out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); |
| 724 | 709 |
| 725 ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); | 710 ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); |
| 726 ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); | 711 ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); |
| 727 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); | 712 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); |
| 728 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); | 713 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); |
| 729 SAT_UH2_UH(tmp2, tmp3, 7); | |
| 730 out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); | 714 out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); |
| 731 | 715 |
| 732 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); | 716 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); |
| 733 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 717 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
| 734 SAT_UH2_UH(tmp0, tmp1, 7); | |
| 735 out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); | 718 out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); |
| 736 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); | 719 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); |
| 737 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); | 720 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); |
| 738 SAT_UH2_UH(tmp2, tmp3, 7); | |
| 739 out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); | 721 out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); |
| 740 | 722 |
| 741 src0 = src4; | 723 src0 = src4; |
| 742 | 724 |
| 743 CALC_MSE_AVG_B(out0, ref0, var, avg); | 725 CALC_MSE_AVG_B(out0, ref0, var, avg); |
| 744 CALC_MSE_AVG_B(out1, ref1, var, avg); | 726 CALC_MSE_AVG_B(out1, ref1, var, avg); |
| 745 CALC_MSE_AVG_B(out2, ref2, var, avg); | 727 CALC_MSE_AVG_B(out2, ref2, var, avg); |
| 746 CALC_MSE_AVG_B(out3, ref3, var, avg); | 728 CALC_MSE_AVG_B(out3, ref3, var, avg); |
| 747 } | 729 } |
| 748 | 730 |
| (...skipping 82 matching lines...) |
| 831 dst += (4 * dst_stride); | 813 dst += (4 * dst_stride); |
| 832 INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); | 814 INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); |
| 833 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); | 815 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); |
| 834 hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); | 816 hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); |
| 835 hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); | 817 hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); |
| 836 hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); | 818 hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); |
| 837 hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); | 819 hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); |
| 838 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); | 820 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); |
| 839 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); | 821 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); |
| 840 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 822 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
| 841 SAT_UH2_UH(tmp0, tmp1, 7); | |
| 842 out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); | 823 out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); |
| 843 CALC_MSE_AVG_B(out, ref, var, avg); | 824 CALC_MSE_AVG_B(out, ref, var, avg); |
| 844 src0 = src4; | 825 src0 = src4; |
| 845 } | 826 } |
| 846 | 827 |
| 847 vec = __msa_hadd_s_w(avg, avg); | 828 vec = __msa_hadd_s_w(avg, avg); |
| 848 *diff = HADD_SW_S32(vec); | 829 *diff = HADD_SW_S32(vec); |
| 849 | 830 |
| 850 return HADD_SW_S32(var); | 831 return HADD_SW_S32(var); |
| 851 } | 832 } |
| (...skipping 34 matching lines...) |
| 886 dst += (4 * dst_stride); | 867 dst += (4 * dst_stride); |
| 887 | 868 |
| 888 PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); | 869 PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); |
| 889 hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); | 870 hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); |
| 890 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); | 871 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); |
| 891 tmp0 = __msa_dotp_u_h(vec0, filt_vt); | 872 tmp0 = __msa_dotp_u_h(vec0, filt_vt); |
| 892 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); | 873 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); |
| 893 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); | 874 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); |
| 894 tmp1 = __msa_dotp_u_h(vec0, filt_vt); | 875 tmp1 = __msa_dotp_u_h(vec0, filt_vt); |
| 895 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 876 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
| 896 SAT_UH2_UH(tmp0, tmp1, 7); | |
| 897 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); | 877 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); |
| 898 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); | 878 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); |
| 899 tmp2 = __msa_dotp_u_h(vec0, filt_vt); | 879 tmp2 = __msa_dotp_u_h(vec0, filt_vt); |
| 900 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); | 880 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); |
| 901 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); | 881 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); |
| 902 tmp3 = __msa_dotp_u_h(vec0, filt_vt); | 882 tmp3 = __msa_dotp_u_h(vec0, filt_vt); |
| 903 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); | 883 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); |
| 904 SAT_UH2_UH(tmp2, tmp3, 7); | |
| 905 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | 884 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); |
| 906 CALC_MSE_AVG_B(out0, ref0, var, avg); | 885 CALC_MSE_AVG_B(out0, ref0, var, avg); |
| 907 CALC_MSE_AVG_B(out1, ref1, var, avg); | 886 CALC_MSE_AVG_B(out1, ref1, var, avg); |
| 908 } | 887 } |
| 909 | 888 |
| 910 vec = __msa_hadd_s_w(avg, avg); | 889 vec = __msa_hadd_s_w(avg, avg); |
| 911 *diff = HADD_SW_S32(vec); | 890 *diff = HADD_SW_S32(vec); |
| 912 | 891 |
| 913 return HADD_SW_S32(var); | 892 return HADD_SW_S32(var); |
| 914 } | 893 } |
| (...skipping 33 matching lines...) |
| 948 LD_UB4(src + 8, src_stride, src1, src3, src5, src7); | 927 LD_UB4(src + 8, src_stride, src1, src3, src5, src7); |
| 949 src += (4 * src_stride); | 928 src += (4 * src_stride); |
| 950 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); | 929 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); |
| 951 dst += (4 * dst_stride); | 930 dst += (4 * dst_stride); |
| 952 | 931 |
| 953 hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); | 932 hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); |
| 954 hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); | 933 hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); |
| 955 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); | 934 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); |
| 956 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); | 935 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); |
| 957 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 936 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
| 958 SAT_UH2_UH(tmp0, tmp1, 7); | |
| 959 src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); | 937 src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); |
| 960 | 938 |
| 961 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); | 939 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); |
| 962 hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); | 940 hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); |
| 963 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); | 941 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); |
| 964 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); | 942 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); |
| 965 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 943 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
| 966 SAT_UH2_UH(tmp0, tmp1, 7); | |
| 967 src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); | 944 src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); |
| 968 | 945 |
| 969 hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); | 946 hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); |
| 970 hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); | 947 hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); |
| 971 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); | 948 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); |
| 972 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); | 949 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); |
| 973 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 950 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
| 974 SAT_UH2_UH(tmp0, tmp1, 7); | |
| 975 src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); | 951 src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); |
| 976 | 952 |
| 977 hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); | 953 hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); |
| 978 hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); | 954 hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); |
| 979 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); | 955 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); |
| 980 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); | 956 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); |
| 981 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 957 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
| 982 SAT_UH2_UH(tmp0, tmp1, 7); | |
| 983 src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); | 958 src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); |
| 984 | 959 |
| 985 CALC_MSE_AVG_B(src0, ref0, var, avg); | 960 CALC_MSE_AVG_B(src0, ref0, var, avg); |
| 986 CALC_MSE_AVG_B(src1, ref1, var, avg); | 961 CALC_MSE_AVG_B(src1, ref1, var, avg); |
| 987 CALC_MSE_AVG_B(src2, ref2, var, avg); | 962 CALC_MSE_AVG_B(src2, ref2, var, avg); |
| 988 CALC_MSE_AVG_B(src3, ref3, var, avg); | 963 CALC_MSE_AVG_B(src3, ref3, var, avg); |
| 989 } | 964 } |
| 990 | 965 |
| 991 vec = __msa_hadd_s_w(avg, avg); | 966 vec = __msa_hadd_s_w(avg, avg); |
| 992 *diff = HADD_SW_S32(vec); | 967 *diff = HADD_SW_S32(vec); |
| (...skipping 57 matching lines...) |
| 1050 const uint8_t *filter, | 1025 const uint8_t *filter, |
| 1051 int32_t height, | 1026 int32_t height, |
| 1052 int32_t *diff) { | 1027 int32_t *diff) { |
| 1053 int16_t filtval; | 1028 int16_t filtval; |
| 1054 uint32_t loop_cnt; | 1029 uint32_t loop_cnt; |
| 1055 uint32_t ref0, ref1, ref2, ref3; | 1030 uint32_t ref0, ref1, ref2, ref3; |
| 1056 v16u8 out, pred, filt0, ref = { 0 }; | 1031 v16u8 out, pred, filt0, ref = { 0 }; |
| 1057 v16i8 src0, src1, src2, src3; | 1032 v16i8 src0, src1, src2, src3; |
| 1058 v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; | 1033 v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; |
| 1059 v8u16 vec0, vec1, vec2, vec3; | 1034 v8u16 vec0, vec1, vec2, vec3; |
| 1060 v8u16 const255; | |
| 1061 v8i16 avg = { 0 }; | 1035 v8i16 avg = { 0 }; |
| 1062 v4i32 vec, var = { 0 }; | 1036 v4i32 vec, var = { 0 }; |
| 1063 | 1037 |
| 1064 filtval = LH(filter); | 1038 filtval = LH(filter); |
| 1065 filt0 = (v16u8)__msa_fill_h(filtval); | 1039 filt0 = (v16u8)__msa_fill_h(filtval); |
| 1066 | 1040 |
| 1067 const255 = (v8u16)__msa_ldi_h(255); | |
| 1068 | |
| 1069 for (loop_cnt = (height >> 2); loop_cnt--;) { | 1041 for (loop_cnt = (height >> 2); loop_cnt--;) { |
| 1070 LD_SB4(src, src_stride, src0, src1, src2, src3); | 1042 LD_SB4(src, src_stride, src0, src1, src2, src3); |
| 1071 src += (4 * src_stride); | 1043 src += (4 * src_stride); |
| 1072 pred = LD_UB(sec_pred); | 1044 pred = LD_UB(sec_pred); |
| 1073 sec_pred += 16; | 1045 sec_pred += 16; |
| 1074 LW4(dst, dst_stride, ref0, ref1, ref2, ref3); | 1046 LW4(dst, dst_stride, ref0, ref1, ref2, ref3); |
| 1075 dst += (4 * dst_stride); | 1047 dst += (4 * dst_stride); |
| 1076 | 1048 |
| 1077 INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); | 1049 INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); |
| 1078 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); | 1050 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); |
| 1079 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); | 1051 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); |
| 1080 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, | 1052 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, |
| 1081 vec0, vec1, vec2, vec3); | 1053 vec0, vec1, vec2, vec3); |
| 1082 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); | 1054 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); |
| 1083 MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); | |
| 1084 PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, | 1055 PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, |
| 1085 src0, src1, src2, src3); | 1056 src0, src1, src2, src3); |
| 1086 ILVEV_W2_SB(src0, src1, src2, src3, src0, src2); | 1057 ILVEV_W2_SB(src0, src1, src2, src3, src0, src2); |
| 1087 out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0); | 1058 out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0); |
| 1088 out = __msa_aver_u_b(out, pred); | 1059 out = __msa_aver_u_b(out, pred); |
| 1089 CALC_MSE_AVG_B(out, ref, var, avg); | 1060 CALC_MSE_AVG_B(out, ref, var, avg); |
| 1090 } | 1061 } |
| 1091 | 1062 |
| 1092 vec = __msa_hadd_s_w(avg, avg); | 1063 vec = __msa_hadd_s_w(avg, avg); |
| 1093 *diff = HADD_SW_S32(vec); | 1064 *diff = HADD_SW_S32(vec); |
| 1094 | 1065 |
| 1095 return HADD_SW_S32(var); | 1066 return HADD_SW_S32(var); |
| 1096 } | 1067 } |
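
Annotation: the *_avg_* variants tolerate the removal equally well. By the time __msa_aver_u_b runs, the filtered rows have already been packed back down to 8 bits, so the averaging inputs sit in [0, 255] with or without the old clamp. A scalar model of the rounding average (again my reading of AVER_U.B, offered as a sketch):

#include <stdint.h>

/* Scalar model of MSA AVER_U.B: per-byte average, rounding up. */
uint8_t aver_u_b(uint8_t a, uint8_t b) {
  return (uint8_t)(((uint16_t)a + b + 1) >> 1);
}

/* The result never exceeds max(a, b), so averaging the packed filter
   output against sec_pred cannot leave [0, 255] either way. */
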
| 1097 | 1068 |
| 1098 static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(const uint8_t *src, | 1069 static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(const uint8_t *src, |
| 1099 int32_t src_stride, | 1070 int32_t src_stride, |
| 1100 const uint8_t *dst, | 1071 const uint8_t *dst, |
| 1101 int32_t dst_stride, | 1072 int32_t dst_stride, |
| 1102 const uint8_t *sec_pred, | 1073 const uint8_t *sec_pred, |
| 1103 const uint8_t *filter, | 1074 const uint8_t *filter, |
| 1104 int32_t height, | 1075 int32_t height, |
| 1105 int32_t *diff) { | 1076 int32_t *diff) { |
| 1106 int16_t filtval; | 1077 int16_t filtval; |
| 1107 uint32_t loop_cnt; | 1078 uint32_t loop_cnt; |
| 1108 v16u8 out, pred, filt0; | 1079 v16u8 out, pred, filt0; |
| 1109 v16u8 ref0, ref1, ref2, ref3; | 1080 v16u8 ref0, ref1, ref2, ref3; |
| 1110 v16i8 src0, src1, src2, src3; | 1081 v16i8 src0, src1, src2, src3; |
| 1111 v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; | 1082 v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; |
| 1112 v8u16 vec0, vec1, vec2, vec3; | 1083 v8u16 vec0, vec1, vec2, vec3; |
| 1113 v8u16 const255; | |
| 1114 v8i16 avg = { 0 }; | 1084 v8i16 avg = { 0 }; |
| 1115 v4i32 vec, var = { 0 }; | 1085 v4i32 vec, var = { 0 }; |
| 1116 | 1086 |
| 1117 filtval = LH(filter); | 1087 filtval = LH(filter); |
| 1118 filt0 = (v16u8)__msa_fill_h(filtval); | 1088 filt0 = (v16u8)__msa_fill_h(filtval); |
| 1119 | 1089 |
| 1120 const255 = (v8u16)__msa_ldi_h(255); | |
| 1121 | |
| 1122 for (loop_cnt = (height >> 2); loop_cnt--;) { | 1090 for (loop_cnt = (height >> 2); loop_cnt--;) { |
| 1123 LD_SB4(src, src_stride, src0, src1, src2, src3); | 1091 LD_SB4(src, src_stride, src0, src1, src2, src3); |
| 1124 src += (4 * src_stride); | 1092 src += (4 * src_stride); |
| 1125 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); | 1093 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); |
| 1126 dst += (4 * dst_stride); | 1094 dst += (4 * dst_stride); |
| 1127 | 1095 |
| 1128 PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); | 1096 PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); |
| 1129 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); | 1097 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); |
| 1130 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); | 1098 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); |
| 1131 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, | 1099 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, |
| 1132 vec0, vec1, vec2, vec3); | 1100 vec0, vec1, vec2, vec3); |
| 1133 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); | 1101 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); |
| 1134 MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); | |
| 1135 PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, | 1102 PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, |
| 1136 src0, src1, src2, src3); | 1103 src0, src1, src2, src3); |
| 1137 out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0); | 1104 out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0); |
| 1138 | 1105 |
| 1139 pred = LD_UB(sec_pred); | 1106 pred = LD_UB(sec_pred); |
| 1140 sec_pred += 16; | 1107 sec_pred += 16; |
| 1141 out = __msa_aver_u_b(out, pred); | 1108 out = __msa_aver_u_b(out, pred); |
| 1142 CALC_MSE_AVG_B(out, ref0, var, avg); | 1109 CALC_MSE_AVG_B(out, ref0, var, avg); |
| 1143 out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2); | 1110 out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2); |
| 1144 pred = LD_UB(sec_pred); | 1111 pred = LD_UB(sec_pred); |
| (...skipping 19 matching lines...) |
| 1164 int32_t width) { | 1131 int32_t width) { |
| 1165 int16_t filtval; | 1132 int16_t filtval; |
| 1166 uint32_t loop_cnt; | 1133 uint32_t loop_cnt; |
| 1167 v16i8 src0, src1, src2, src3, src4, src5, src6, src7; | 1134 v16i8 src0, src1, src2, src3, src4, src5, src6, src7; |
| 1168 v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; | 1135 v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; |
| 1169 v16u8 dst0, dst1, dst2, dst3; | 1136 v16u8 dst0, dst1, dst2, dst3; |
| 1170 v16u8 tmp0, tmp1, tmp2, tmp3; | 1137 v16u8 tmp0, tmp1, tmp2, tmp3; |
| 1171 v16u8 pred0, pred1, pred2, pred3, filt0; | 1138 v16u8 pred0, pred1, pred2, pred3, filt0; |
| 1172 v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; | 1139 v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; |
| 1173 v8u16 out0, out1, out2, out3, out4, out5, out6, out7; | 1140 v8u16 out0, out1, out2, out3, out4, out5, out6, out7; |
| 1174 v8u16 const255; | |
| 1175 v8i16 avg = { 0 }; | 1141 v8i16 avg = { 0 }; |
| 1176 v4i32 vec, var = { 0 }; | 1142 v4i32 vec, var = { 0 }; |
| 1177 | 1143 |
| 1178 filtval = LH(filter); | 1144 filtval = LH(filter); |
| 1179 filt0 = (v16u8)__msa_fill_h(filtval); | 1145 filt0 = (v16u8)__msa_fill_h(filtval); |
| 1180 | 1146 |
| 1181 const255 = (v8u16)__msa_ldi_h(255); | |
| 1182 | |
| 1183 for (loop_cnt = (height >> 2); loop_cnt--;) { | 1147 for (loop_cnt = (height >> 2); loop_cnt--;) { |
| 1184 LD_SB4(src, src_stride, src0, src2, src4, src6); | 1148 LD_SB4(src, src_stride, src0, src2, src4, src6); |
| 1185 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); | 1149 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); |
| 1186 src += (4 * src_stride); | 1150 src += (4 * src_stride); |
| 1187 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); | 1151 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); |
| 1188 dst += (4 * dst_stride); | 1152 dst += (4 * dst_stride); |
| 1189 LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); | 1153 LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); |
| 1190 sec_pred += (4 * width); | 1154 sec_pred += (4 * width); |
| 1191 | 1155 |
| 1192 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); | 1156 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); |
| 1193 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); | 1157 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); |
| 1194 VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5); | 1158 VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5); |
| 1195 VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7); | 1159 VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7); |
| 1196 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, | 1160 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, |
| 1197 out0, out1, out2, out3); | 1161 out0, out1, out2, out3); |
| 1198 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, | 1162 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, |
| 1199 out4, out5, out6, out7); | 1163 out4, out5, out6, out7); |
| 1200 SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); | 1164 SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); |
| 1201 SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); | 1165 SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); |
| 1202 MIN_UH4_UH(out0, out1, out2, out3, const255); | |
| 1203 MIN_UH4_UH(out4, out5, out6, out7, const255); | |
| 1204 PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6, | 1166 PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6, |
| 1205 tmp0, tmp1, tmp2, tmp3); | 1167 tmp0, tmp1, tmp2, tmp3); |
| 1206 AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3, | 1168 AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3, |
| 1207 tmp0, tmp1, tmp2, tmp3); | 1169 tmp0, tmp1, tmp2, tmp3); |
| 1208 | 1170 |
| 1209 CALC_MSE_AVG_B(tmp0, dst0, var, avg); | 1171 CALC_MSE_AVG_B(tmp0, dst0, var, avg); |
| 1210 CALC_MSE_AVG_B(tmp1, dst1, var, avg); | 1172 CALC_MSE_AVG_B(tmp1, dst1, var, avg); |
| 1211 CALC_MSE_AVG_B(tmp2, dst2, var, avg); | 1173 CALC_MSE_AVG_B(tmp2, dst2, var, avg); |
| 1212 CALC_MSE_AVG_B(tmp3, dst3, var, avg); | 1174 CALC_MSE_AVG_B(tmp3, dst3, var, avg); |
| 1213 } | 1175 } |
| (...skipping 98 matching lines...) |
| 1312 sec_pred += 16; | 1274 sec_pred += 16; |
| 1313 LW4(dst, dst_stride, ref0, ref1, ref2, ref3); | 1275 LW4(dst, dst_stride, ref0, ref1, ref2, ref3); |
| 1314 dst += (4 * dst_stride); | 1276 dst += (4 * dst_stride); |
| 1315 | 1277 |
| 1316 INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); | 1278 INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); |
| 1317 ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, | 1279 ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, |
| 1318 src10_r, src21_r, src32_r, src43_r); | 1280 src10_r, src21_r, src32_r, src43_r); |
| 1319 ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); | 1281 ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); |
| 1320 DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); | 1282 DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); |
| 1321 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 1283 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
| 1322 SAT_UH2_UH(tmp0, tmp1, 7); | |
| 1323 | 1284 |
| 1324 out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); | 1285 out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); |
| 1325 out = __msa_aver_u_b(out, pred); | 1286 out = __msa_aver_u_b(out, pred); |
| 1326 CALC_MSE_AVG_B(out, ref, var, avg); | 1287 CALC_MSE_AVG_B(out, ref, var, avg); |
| 1327 src0 = src4; | 1288 src0 = src4; |
| 1328 } | 1289 } |
| 1329 | 1290 |
| 1330 vec = __msa_hadd_s_w(avg, avg); | 1291 vec = __msa_hadd_s_w(avg, avg); |
| 1331 *diff = HADD_SW_S32(vec); | 1292 *diff = HADD_SW_S32(vec); |
| 1332 | 1293 |
| (...skipping 30 matching lines...) |
| 1363 LD_UB2(sec_pred, 16, pred0, pred1); | 1324 LD_UB2(sec_pred, 16, pred0, pred1); |
| 1364 sec_pred += 32; | 1325 sec_pred += 32; |
| 1365 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); | 1326 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); |
| 1366 dst += (4 * dst_stride); | 1327 dst += (4 * dst_stride); |
| 1367 PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); | 1328 PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); |
| 1368 ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, | 1329 ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, |
| 1369 vec0, vec1, vec2, vec3); | 1330 vec0, vec1, vec2, vec3); |
| 1370 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, | 1331 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, |
| 1371 tmp0, tmp1, tmp2, tmp3); | 1332 tmp0, tmp1, tmp2, tmp3); |
| 1372 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); | 1333 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); |
| 1373 SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); | |
| 1374 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); | 1334 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); |
| 1375 AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); | 1335 AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); |
| 1376 CALC_MSE_AVG_B(src0, ref0, var, avg); | 1336 CALC_MSE_AVG_B(src0, ref0, var, avg); |
| 1377 CALC_MSE_AVG_B(src1, ref1, var, avg); | 1337 CALC_MSE_AVG_B(src1, ref1, var, avg); |
| 1378 | 1338 |
| 1379 src0 = src4; | 1339 src0 = src4; |
| 1380 } | 1340 } |
| 1381 | 1341 |
| 1382 vec = __msa_hadd_s_w(avg, avg); | 1342 vec = __msa_hadd_s_w(avg, avg); |
| 1383 *diff = HADD_SW_S32(vec); | 1343 *diff = HADD_SW_S32(vec); |
| (...skipping 30 matching lines...) |
| 1414 for (loop_cnt = (height >> 2); loop_cnt--;) { | 1374 for (loop_cnt = (height >> 2); loop_cnt--;) { |
| 1415 LD_UB4(src, src_stride, src1, src2, src3, src4); | 1375 LD_UB4(src, src_stride, src1, src2, src3, src4); |
| 1416 src += (4 * src_stride); | 1376 src += (4 * src_stride); |
| 1417 LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); | 1377 LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); |
| 1418 sec_pred += (4 * width); | 1378 sec_pred += (4 * width); |
| 1419 | 1379 |
| 1420 ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2); | 1380 ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2); |
| 1421 ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3); | 1381 ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3); |
| 1422 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); | 1382 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); |
| 1423 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 1383 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
| 1424 SAT_UH2_UH(tmp0, tmp1, 7); | |
| 1425 out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); | 1384 out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); |
| 1426 | 1385 |
| 1427 ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6); | 1386 ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6); |
| 1428 ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7); | 1387 ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7); |
| 1429 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); | 1388 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); |
| 1430 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); | 1389 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); |
| 1431 SAT_UH2_UH(tmp2, tmp3, 7); | |
| 1432 out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); | 1390 out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); |
| 1433 | 1391 |
| 1434 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); | 1392 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); |
| 1435 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 1393 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
| 1436 SAT_UH2_UH(tmp0, tmp1, 7); | |
| 1437 out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); | 1394 out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); |
| 1438 | 1395 |
| 1439 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); | 1396 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); |
| 1440 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); | 1397 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); |
| 1441 SAT_UH2_UH(tmp2, tmp3, 7); | |
| 1442 out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); | 1398 out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); |
| 1443 | 1399 |
| 1444 src0 = src4; | 1400 src0 = src4; |
| 1445 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); | 1401 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); |
| 1446 dst += (4 * dst_stride); | 1402 dst += (4 * dst_stride); |
| 1447 | 1403 |
| 1448 AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, | 1404 AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, |
| 1449 out0, out1, out2, out3); | 1405 out0, out1, out2, out3); |
| 1450 | 1406 |
| 1451 CALC_MSE_AVG_B(out0, ref0, var, avg); | 1407 CALC_MSE_AVG_B(out0, ref0, var, avg); |
| (...skipping 104 matching lines...) |
| 1556 dst += (4 * dst_stride); | 1512 dst += (4 * dst_stride); |
| 1557 INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); | 1513 INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); |
| 1558 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); | 1514 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); |
| 1559 hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); | 1515 hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); |
| 1560 hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); | 1516 hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); |
| 1561 hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); | 1517 hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); |
| 1562 hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); | 1518 hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); |
| 1563 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); | 1519 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); |
| 1564 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); | 1520 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); |
| 1565 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 1521 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
| 1566 SAT_UH2_UH(tmp0, tmp1, 7); | |
| 1567 out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); | 1522 out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); |
| 1568 out = __msa_aver_u_b(out, pred); | 1523 out = __msa_aver_u_b(out, pred); |
| 1569 CALC_MSE_AVG_B(out, ref, var, avg); | 1524 CALC_MSE_AVG_B(out, ref, var, avg); |
| 1570 src0 = src4; | 1525 src0 = src4; |
| 1571 } | 1526 } |
| 1572 | 1527 |
| 1573 vec = __msa_hadd_s_w(avg, avg); | 1528 vec = __msa_hadd_s_w(avg, avg); |
| 1574 *diff = HADD_SW_S32(vec); | 1529 *diff = HADD_SW_S32(vec); |
| 1575 | 1530 |
| 1576 return HADD_SW_S32(var); | 1531 return HADD_SW_S32(var); |
| (...skipping 36 matching lines...) |
| 1613 PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); | 1568 PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); |
| 1614 hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); | 1569 hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); |
| 1615 | 1570 |
| 1616 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); | 1571 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); |
| 1617 tmp0 = __msa_dotp_u_h(vec0, filt_vt); | 1572 tmp0 = __msa_dotp_u_h(vec0, filt_vt); |
| 1618 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); | 1573 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); |
| 1619 | 1574 |
| 1620 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); | 1575 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); |
| 1621 tmp1 = __msa_dotp_u_h(vec0, filt_vt); | 1576 tmp1 = __msa_dotp_u_h(vec0, filt_vt); |
| 1622 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 1577 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
| 1623 SAT_UH2_UH(tmp0, tmp1, 7); | |
| 1624 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); | 1578 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); |
| 1625 | 1579 |
| 1626 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); | 1580 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); |
| 1627 tmp2 = __msa_dotp_u_h(vec0, filt_vt); | 1581 tmp2 = __msa_dotp_u_h(vec0, filt_vt); |
| 1628 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); | 1582 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); |
| 1629 | 1583 |
| 1630 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); | 1584 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); |
| 1631 tmp3 = __msa_dotp_u_h(vec0, filt_vt); | 1585 tmp3 = __msa_dotp_u_h(vec0, filt_vt); |
| 1632 | 1586 |
| 1633 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); | 1587 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); |
| 1634 SAT_UH2_UH(tmp2, tmp3, 7); | |
| 1635 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); | 1588 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); |
| 1636 AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1); | 1589 AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1); |
| 1637 | 1590 |
| 1638 CALC_MSE_AVG_B(out0, ref0, var, avg); | 1591 CALC_MSE_AVG_B(out0, ref0, var, avg); |
| 1639 CALC_MSE_AVG_B(out1, ref1, var, avg); | 1592 CALC_MSE_AVG_B(out1, ref1, var, avg); |
| 1640 } | 1593 } |
| 1641 | 1594 |
| 1642 vec = __msa_hadd_s_w(avg, avg); | 1595 vec = __msa_hadd_s_w(avg, avg); |
| 1643 *diff = HADD_SW_S32(vec); | 1596 *diff = HADD_SW_S32(vec); |
| 1644 | 1597 |
| (...skipping 38 matching lines...) |
| 1683 LD_UB4(src + 8, src_stride, src1, src3, src5, src7); | 1636 LD_UB4(src + 8, src_stride, src1, src3, src5, src7); |
| 1684 src += (4 * src_stride); | 1637 src += (4 * src_stride); |
| 1685 LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); | 1638 LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); |
| 1686 sec_pred += (4 * width); | 1639 sec_pred += (4 * width); |
| 1687 | 1640 |
| 1688 hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); | 1641 hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); |
| 1689 hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); | 1642 hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); |
| 1690 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); | 1643 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); |
| 1691 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); | 1644 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); |
| 1692 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 1645 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
| 1693 SAT_UH2_UH(tmp0, tmp1, 7); | |
| 1694 out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); | 1646 out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); |
| 1695 | 1647 |
| 1696 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); | 1648 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); |
| 1697 hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); | 1649 hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); |
| 1698 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); | 1650 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); |
| 1699 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); | 1651 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); |
| 1700 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 1652 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
| 1701 SAT_UH2_UH(tmp0, tmp1, 7); | |
| 1702 out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); | 1653 out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); |
| 1703 | 1654 |
| 1704 hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); | 1655 hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); |
| 1705 hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); | 1656 hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); |
| 1706 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); | 1657 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); |
| 1707 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); | 1658 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); |
| 1708 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 1659 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
| 1709 SAT_UH2_UH(tmp0, tmp1, 7); | |
| 1710 out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); | 1660 out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); |
| 1711 | 1661 |
| 1712 hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); | 1662 hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); |
| 1713 hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); | 1663 hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); |
| 1714 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); | 1664 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); |
| 1715 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); | 1665 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); |
| 1716 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 1666 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
| 1717 SAT_UH2_UH(tmp0, tmp1, 7); | |
| 1718 out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); | 1667 out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); |
| 1719 | 1668 |
| 1720 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); | 1669 LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); |
| 1721 dst += (4 * dst_stride); | 1670 dst += (4 * dst_stride); |
| 1722 | 1671 |
| 1723 AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, | 1672 AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, |
| 1724 out0, out1, out2, out3); | 1673 out0, out1, out2, out3); |
| 1725 | 1674 |
| 1726 CALC_MSE_AVG_B(out0, ref0, var, avg); | 1675 CALC_MSE_AVG_B(out0, ref0, var, avg); |
| 1727 CALC_MSE_AVG_B(out1, ref1, var, avg); | 1676 CALC_MSE_AVG_B(out1, ref1, var, avg); |
| (...skipping 266 matching lines...) |
| 1994 ref_ptr, ref_stride, \ | 1943 ref_ptr, ref_stride, \ |
| 1995 sec_pred, &diff); \ | 1944 sec_pred, &diff); \ |
| 1996 } \ | 1945 } \ |
| 1997 } \ | 1946 } \ |
| 1998 \ | 1947 \ |
| 1999 return VARIANCE_64Wx##ht##H(*sse, diff); \ | 1948 return VARIANCE_64Wx##ht##H(*sse, diff); \ |
| 2000 } | 1949 } |
| 2001 | 1950 |
| 2002 VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32); | 1951 VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32); |
| 2003 VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64); | 1952 VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64); |
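
Annotation: these two instantiations expand into the exported vpx_sub_pixel_avg_variance64x32_msa and vpx_sub_pixel_avg_variance64x64_msa entry points. The VARIANCE_64Wx##ht##H expression they return through combines the accumulated SSE and signed pixel-difference sum in the usual variance form; a hedged scalar equivalent (the shift-for-divide trick assumes w * h is a power of two, as it is for every block size in this file):

#include <stdint.h>

/* variance = SSE - mean(diff)^2, with the divide by w * h folded
   into a shift when w * h == 1 << shift. */
uint32_t variance_wxh(uint32_t sse, int32_t diff, int shift) {
  return sse - (uint32_t)(((int64_t)diff * diff) >> shift);
}

/* 64x32: shift = 11 (2048 pixels); 64x64: shift = 12 (4096 pixels). */
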