| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| | 11 #include <assert.h> |
| 11 #include "./vpx_dsp_rtcd.h" | 12 #include "./vpx_dsp_rtcd.h" |
| 12 #include "vpx_dsp/mips/vpx_convolve_msa.h" | 13 #include "vpx_dsp/mips/vpx_convolve_msa.h" |
| 13 | 14 |
| 14 static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride, | 15 static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride, |
| 15 uint8_t *dst, int32_t dst_stride, | 16 uint8_t *dst, int32_t dst_stride, |
| 16 int8_t *filter, int32_t height) { | 17 int8_t *filter, int32_t height) { |
| 17 uint32_t loop_cnt; | 18 uint32_t loop_cnt; |
| 18 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; | 19 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; |
| 19 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; | 20 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; |
| 20 v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776; | 21 v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776; |
| (...skipping 288 matching lines...) |
| 309 filt0 = (v16u8)__msa_splati_h(filt, 0); | 310 filt0 = (v16u8)__msa_splati_h(filt, 0); |
| 310 | 311 |
| 311 LD_SB5(src, src_stride, src0, src1, src2, src3, src4); | 312 LD_SB5(src, src_stride, src0, src1, src2, src3, src4); |
| 312 src += (5 * src_stride); | 313 src += (5 * src_stride); |
| 313 | 314 |
| 314 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, | 315 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, |
| 315 src32_r, src43_r); | 316 src32_r, src43_r); |
| 316 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); | 317 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); |
| 317 DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); | 318 DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); |
| 318 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 319 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
| 319 SAT_UH2_UH(tmp0, tmp1, 7); | |
| 320 src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); | 320 src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); |
| 321 ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); | 321 ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); |
| 322 } | 322 } |
| 323 | 323 |
| 324 static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride, | 324 static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride, |
| 325 uint8_t *dst, int32_t dst_stride, | 325 uint8_t *dst, int32_t dst_stride, |
| 326 int8_t *filter) { | 326 int8_t *filter) { |
| 327 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; | 327 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; |
| 328 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r; | 328 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r; |
| 329 v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776; | 329 v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776; |
| (...skipping 12 matching lines...) |
| 342 | 342 |
| 343 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, | 343 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, |
| 344 src32_r, src43_r); | 344 src32_r, src43_r); |
| 345 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, | 345 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, |
| 346 src76_r, src87_r); | 346 src76_r, src87_r); |
| 347 ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, | 347 ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, |
| 348 src87_r, src76_r, src2110, src4332, src6554, src8776); | 348 src87_r, src76_r, src2110, src4332, src6554, src8776); |
| 349 DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0, | 349 DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0, |
| 350 tmp0, tmp1, tmp2, tmp3); | 350 tmp0, tmp1, tmp2, tmp3); |
| 351 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); | 351 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); |
| 352 SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); | |
| 353 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332); | 352 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332); |
| 354 ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); | 353 ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); |
| 355 ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride); | 354 ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride); |
| 356 } | 355 } |
| 357 | 356 |
| 358 static void common_vt_2t_4w_msa(const uint8_t *src, int32_t src_stride, | 357 static void common_vt_2t_4w_msa(const uint8_t *src, int32_t src_stride, |
| 359 uint8_t *dst, int32_t dst_stride, | 358 uint8_t *dst, int32_t dst_stride, |
| 360 int8_t *filter, int32_t height) { | 359 int8_t *filter, int32_t height) { |
| 361 if (4 == height) { | 360 if (4 == height) { |
| 362 common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter); | 361 common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter); |
| (...skipping 13 matching lines...) |
| 376 /* rearranging filter_y */ | 375 /* rearranging filter_y */ |
| 377 filt = LD_SH(filter); | 376 filt = LD_SH(filter); |
| 378 filt0 = (v16u8)__msa_splati_h(filt, 0); | 377 filt0 = (v16u8)__msa_splati_h(filt, 0); |
| 379 | 378 |
| 380 LD_UB5(src, src_stride, src0, src1, src2, src3, src4); | 379 LD_UB5(src, src_stride, src0, src1, src2, src3, src4); |
| 381 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1); | 380 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1); |
| 382 ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3); | 381 ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3); |
| 383 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, | 382 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, |
| 384 tmp2, tmp3); | 383 tmp2, tmp3); |
| 385 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); | 384 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); |
| 386 SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); | |
| 387 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); | 385 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); |
| 388 ST8x4_UB(out0, out1, dst, dst_stride); | 386 ST8x4_UB(out0, out1, dst, dst_stride); |
| 389 } | 387 } |
| 390 | 388 |
| 391 static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, | 389 static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, |
| 392 uint8_t *dst, int32_t dst_stride, | 390 uint8_t *dst, int32_t dst_stride, |
| 393 int8_t *filter, int32_t height) { | 391 int8_t *filter, int32_t height) { |
| 394 uint32_t loop_cnt; | 392 uint32_t loop_cnt; |
| 395 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; | 393 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; |
| 396 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; | 394 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; |
| (...skipping 12 matching lines...) |
| 409 LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8); | 407 LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8); |
| 410 src += (8 * src_stride); | 408 src += (8 * src_stride); |
| 411 | 409 |
| 412 ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, | 410 ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, |
| 413 vec2, vec3); | 411 vec2, vec3); |
| 414 ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, | 412 ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, |
| 415 vec6, vec7); | 413 vec6, vec7); |
| 416 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, | 414 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, |
| 417 tmp2, tmp3); | 415 tmp2, tmp3); |
| 418 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); | 416 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); |
| 419 SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); | |
| 420 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); | 417 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); |
| 421 ST8x4_UB(out0, out1, dst, dst_stride); | 418 ST8x4_UB(out0, out1, dst, dst_stride); |
| 422 dst += (4 * dst_stride); | 419 dst += (4 * dst_stride); |
| 423 | 420 |
| 424 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1, | 421 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1, |
| 425 tmp2, tmp3); | 422 tmp2, tmp3); |
| 426 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); | 423 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); |
| 427 SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); | |
| 428 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); | 424 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); |
| 429 ST8x4_UB(out0, out1, dst, dst_stride); | 425 ST8x4_UB(out0, out1, dst, dst_stride); |
| 430 dst += (4 * dst_stride); | 426 dst += (4 * dst_stride); |
| 431 | 427 |
| 432 src0 = src8; | 428 src0 = src8; |
| 433 } | 429 } |
| 434 } | 430 } |
| 435 | 431 |
| 436 static void common_vt_2t_8w_msa(const uint8_t *src, int32_t src_stride, | 432 static void common_vt_2t_8w_msa(const uint8_t *src, int32_t src_stride, |
| 437 uint8_t *dst, int32_t dst_stride, | 433 uint8_t *dst, int32_t dst_stride, |
| (...skipping 22 matching lines...) |
| 460 src += src_stride; | 456 src += src_stride; |
| 461 | 457 |
| 462 for (loop_cnt = (height >> 2); loop_cnt--;) { | 458 for (loop_cnt = (height >> 2); loop_cnt--;) { |
| 463 LD_UB4(src, src_stride, src1, src2, src3, src4); | 459 LD_UB4(src, src_stride, src1, src2, src3, src4); |
| 464 src += (4 * src_stride); | 460 src += (4 * src_stride); |
| 465 | 461 |
| 466 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); | 462 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); |
| 467 ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); | 463 ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); |
| 468 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); | 464 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); |
| 469 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 465 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
| 470 SAT_UH2_UH(tmp0, tmp1, 7); | |
| 471 PCKEV_ST_SB(tmp0, tmp1, dst); | 466 PCKEV_ST_SB(tmp0, tmp1, dst); |
| 472 dst += dst_stride; | 467 dst += dst_stride; |
| 473 | 468 |
| 474 ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); | 469 ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); |
| 475 ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); | 470 ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); |
| 476 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); | 471 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); |
| 477 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); | 472 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); |
| 478 SAT_UH2_UH(tmp2, tmp3, 7); | |
| 479 PCKEV_ST_SB(tmp2, tmp3, dst); | 473 PCKEV_ST_SB(tmp2, tmp3, dst); |
| 480 dst += dst_stride; | 474 dst += dst_stride; |
| 481 | 475 |
| 482 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); | 476 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); |
| 483 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 477 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
| 484 SAT_UH2_UH(tmp0, tmp1, 7); | |
| 485 PCKEV_ST_SB(tmp0, tmp1, dst); | 478 PCKEV_ST_SB(tmp0, tmp1, dst); |
| 486 dst += dst_stride; | 479 dst += dst_stride; |
| 487 | 480 |
| 488 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); | 481 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); |
| 489 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); | 482 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); |
| 490 SAT_UH2_UH(tmp2, tmp3, 7); | |
| 491 PCKEV_ST_SB(tmp2, tmp3, dst); | 483 PCKEV_ST_SB(tmp2, tmp3, dst); |
| 492 dst += dst_stride; | 484 dst += dst_stride; |
| 493 | 485 |
| 494 src0 = src4; | 486 src0 = src4; |
| 495 } | 487 } |
| 496 } | 488 } |
| 497 | 489 |
| 498 static void common_vt_2t_32w_msa(const uint8_t *src, int32_t src_stride, | 490 static void common_vt_2t_32w_msa(const uint8_t *src, int32_t src_stride, |
| 499 uint8_t *dst, int32_t dst_stride, | 491 uint8_t *dst, int32_t dst_stride, |
| 500 int8_t *filter, int32_t height) { | 492 int8_t *filter, int32_t height) { |
| (...skipping 14 matching lines...) |
| 515 for (loop_cnt = (height >> 2); loop_cnt--;) { | 507 for (loop_cnt = (height >> 2); loop_cnt--;) { |
| 516 LD_UB4(src, src_stride, src1, src2, src3, src4); | 508 LD_UB4(src, src_stride, src1, src2, src3, src4); |
| 517 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); | 509 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); |
| 518 ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); | 510 ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); |
| 519 | 511 |
| 520 LD_UB4(src + 16, src_stride, src6, src7, src8, src9); | 512 LD_UB4(src + 16, src_stride, src6, src7, src8, src9); |
| 521 src += (4 * src_stride); | 513 src += (4 * src_stride); |
| 522 | 514 |
| 523 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); | 515 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); |
| 524 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 516 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
| 525 SAT_UH2_UH(tmp0, tmp1, 7); | |
| 526 PCKEV_ST_SB(tmp0, tmp1, dst); | 517 PCKEV_ST_SB(tmp0, tmp1, dst); |
| 527 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); | 518 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); |
| 528 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); | 519 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); |
| 529 SAT_UH2_UH(tmp2, tmp3, 7); | |
| 530 PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride); | 520 PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride); |
| 531 | 521 |
| 532 ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); | 522 ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); |
| 533 ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); | 523 ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); |
| 534 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); | 524 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); |
| 535 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 525 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
| 536 SAT_UH2_UH(tmp0, tmp1, 7); | |
| 537 PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride); | 526 PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride); |
| 538 | 527 |
| 539 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); | 528 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); |
| 540 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); | 529 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); |
| 541 SAT_UH2_UH(tmp2, tmp3, 7); | |
| 542 PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride); | 530 PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride); |
| 543 | 531 |
| 544 ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2); | 532 ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2); |
| 545 ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3); | 533 ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3); |
| 546 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); | 534 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); |
| 547 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 535 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
| 548 SAT_UH2_UH(tmp0, tmp1, 7); | |
| 549 PCKEV_ST_SB(tmp0, tmp1, dst + 16); | 536 PCKEV_ST_SB(tmp0, tmp1, dst + 16); |
| 550 | 537 |
| 551 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); | 538 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); |
| 552 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); | 539 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); |
| 553 SAT_UH2_UH(tmp2, tmp3, 7); | |
| 554 PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride); | 540 PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride); |
| 555 | 541 |
| 556 ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6); | 542 ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6); |
| 557 ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7); | 543 ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7); |
| 558 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); | 544 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); |
| 559 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 545 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
| 560 SAT_UH2_UH(tmp0, tmp1, 7); | |
| 561 PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride); | 546 PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride); |
| 562 | 547 |
| 563 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); | 548 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); |
| 564 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); | 549 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); |
| 565 SAT_UH2_UH(tmp2, tmp3, 7); | |
| 566 PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride); | 550 PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride); |
| 567 dst += (4 * dst_stride); | 551 dst += (4 * dst_stride); |
| 568 | 552 |
| 569 src0 = src4; | 553 src0 = src4; |
| 570 src5 = src9; | 554 src5 = src9; |
| 571 } | 555 } |
| 572 } | 556 } |
| 573 | 557 |
| 574 static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride, | 558 static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride, |
| 575 uint8_t *dst, int32_t dst_stride, | 559 uint8_t *dst, int32_t dst_stride, |
| (...skipping 15 matching lines...) |
| 591 LD_UB2(src, src_stride, src1, src2); | 575 LD_UB2(src, src_stride, src1, src2); |
| 592 LD_UB2(src + 16, src_stride, src4, src5); | 576 LD_UB2(src + 16, src_stride, src4, src5); |
| 593 LD_UB2(src + 32, src_stride, src7, src8); | 577 LD_UB2(src + 32, src_stride, src7, src8); |
| 594 LD_UB2(src + 48, src_stride, src10, src11); | 578 LD_UB2(src + 48, src_stride, src10, src11); |
| 595 src += (2 * src_stride); | 579 src += (2 * src_stride); |
| 596 | 580 |
| 597 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); | 581 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); |
| 598 ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); | 582 ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); |
| 599 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); | 583 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); |
| 600 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 584 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
| 601 SAT_UH2_UH(tmp0, tmp1, 7); | |
| 602 PCKEV_ST_SB(tmp0, tmp1, dst); | 585 PCKEV_ST_SB(tmp0, tmp1, dst); |
| 603 | 586 |
| 604 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); | 587 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); |
| 605 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); | 588 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); |
| 606 SAT_UH2_UH(tmp2, tmp3, 7); | |
| 607 PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride); | 589 PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride); |
| 608 | 590 |
| 609 ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6); | 591 ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6); |
| 610 ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7); | 592 ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7); |
| 611 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); | 593 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); |
| 612 SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); | 594 SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); |
| 613 SAT_UH2_UH(tmp4, tmp5, 7); | |
| 614 PCKEV_ST_SB(tmp4, tmp5, dst + 16); | 595 PCKEV_ST_SB(tmp4, tmp5, dst + 16); |
| 615 | 596 |
| 616 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); | 597 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); |
| 617 SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); | 598 SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); |
| 618 SAT_UH2_UH(tmp6, tmp7, 7); | |
| 619 PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride); | 599 PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride); |
| 620 | 600 |
| 621 ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2); | 601 ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2); |
| 622 ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3); | 602 ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3); |
| 623 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); | 603 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); |
| 624 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 604 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
| 625 SAT_UH2_UH(tmp0, tmp1, 7); | |
| 626 PCKEV_ST_SB(tmp0, tmp1, dst + 32); | 605 PCKEV_ST_SB(tmp0, tmp1, dst + 32); |
| 627 | 606 |
| 628 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); | 607 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); |
| 629 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); | 608 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); |
| 630 SAT_UH2_UH(tmp2, tmp3, 7); | |
| 631 PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride); | 609 PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride); |
| 632 | 610 |
| 633 ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6); | 611 ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6); |
| 634 ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7); | 612 ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7); |
| 635 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); | 613 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); |
| 636 SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); | 614 SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); |
| 637 SAT_UH2_UH(tmp4, tmp5, 7); | |
| 638 PCKEV_ST_SB(tmp4, tmp5, dst + 48); | 615 PCKEV_ST_SB(tmp4, tmp5, dst + 48); |
| 639 | 616 |
| 640 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); | 617 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); |
| 641 SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); | 618 SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); |
| 642 SAT_UH2_UH(tmp6, tmp7, 7); | |
| 643 PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride); | 619 PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride); |
| 644 dst += (2 * dst_stride); | 620 dst += (2 * dst_stride); |
| 645 | 621 |
| 646 src0 = src2; | 622 src0 = src2; |
| 647 src3 = src5; | 623 src3 = src5; |
| 648 src6 = src8; | 624 src6 = src8; |
| 649 src9 = src11; | 625 src9 = src11; |
| 650 } | 626 } |
| 651 } | 627 } |
| 652 | 628 |
| 653 void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, | 629 void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, |
| 654 uint8_t *dst, ptrdiff_t dst_stride, | 630 uint8_t *dst, ptrdiff_t dst_stride, |
| 655 const int16_t *filter_x, int x_step_q4, | 631 const int16_t *filter_x, int x_step_q4, |
| 656 const int16_t *filter_y, int y_step_q4, | 632 const int16_t *filter_y, int y_step_q4, |
| 657 int w, int h) { | 633 int w, int h) { |
| 658 int8_t cnt, filt_ver[8]; | 634 int8_t cnt, filt_ver[8]; |
| 659 | 635 |
| 660 if (16 != y_step_q4) { | 636 assert(y_step_q4 == 16); |
| 661 vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, | 637 assert(((const int32_t *)filter_y)[1] != 0x800000); |
| 662 filter_x, x_step_q4, filter_y, y_step_q4, | |
| 663 w, h); | |
| 664 return; | |
| 665 } | |
| 666 | |
| 667 if (((const int32_t *)filter_y)[1] == 0x800000) { | |
| 668 vpx_convolve_copy(src, src_stride, dst, dst_stride, | |
| 669 filter_x, x_step_q4, filter_y, y_step_q4, | |
| 670 w, h); | |
| 671 return; | |
| 672 } | |
| 673 | 638 |
| 674 for (cnt = 8; cnt--;) { | 639 for (cnt = 8; cnt--;) { |
| 675 filt_ver[cnt] = filter_y[cnt]; | 640 filt_ver[cnt] = filter_y[cnt]; |
| 676 } | 641 } |
| 677 | 642 |
| 678 if (((const int32_t *)filter_y)[0] == 0) { | 643 if (((const int32_t *)filter_y)[0] == 0) { |
| 679 switch (w) { | 644 switch (w) { |
| 680 case 4: | 645 case 4: |
| 681 common_vt_2t_4w_msa(src, (int32_t)src_stride, | 646 common_vt_2t_4w_msa(src, (int32_t)src_stride, |
| 682 dst, (int32_t)dst_stride, | 647 dst, (int32_t)dst_stride, |
| (...skipping 53 matching lines...) |
| 736 filt_ver, h); | 701 filt_ver, h); |
| 737 break; | 702 break; |
| 738 default: | 703 default: |
| 739 vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, | 704 vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, |
| 740 filter_x, x_step_q4, filter_y, y_step_q4, | 705 filter_x, x_step_q4, filter_y, y_step_q4, |
| 741 w, h); | 706 w, h); |
| 742 break; | 707 break; |
| 743 } | 708 } |
| 744 } | 709 } |
| 745 } | 710 } |
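
Note on why the SAT_UH*_UH removals above are safe (a reviewer sketch, not part of the patch): the 2-tap (bilinear) taps are non-negative and sum to 1 << FILTER_BITS, so after the DOTP_UB*_UH dot product and the SRARI_H*_UH rounding shift every unsigned halfword already fits in 8 bits, and the saturation to 255 could never fire before PCKEV_B narrows to bytes. A minimal scalar model of that arithmetic, assuming FILTER_BITS == 7 and 128-sum taps as in the vpx bilinear filters (vt_2t_pixel is a hypothetical helper, not a function in this file):

#include <assert.h>
#include <stdint.h>

#define FILTER_BITS 7

/* Scalar model of one output pixel of the MSA 2-tap vertical path:
 * DOTP_UB*_UH -> SRARI_H*_UH -> PCKEV_B. */
static uint8_t vt_2t_pixel(uint8_t above, uint8_t below,
                           uint8_t f0, uint8_t f1) {
  uint32_t acc;
  assert((uint32_t)f0 + f1 == (1 << FILTER_BITS)); /* bilinear taps sum to 128 */
  acc = (uint32_t)above * f0 + (uint32_t)below * f1;     /* DOTP_UB*_UH  */
  acc = (acc + (1 << (FILTER_BITS - 1))) >> FILTER_BITS; /* SRARI_H*_UH  */
  /* Worst case is 255 * (f0 + f1) = 255 << FILTER_BITS, so acc <= 255 and
   * the removed SAT_UH*_UH(..., 7) clamp to 255 was a no-op here. */
  assert(acc <= 255);
  return (uint8_t)acc;                                   /* PCKEV_B      */
}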