| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 #include <assert.h> |
| 11 #include "./vpx_dsp_rtcd.h" | 12 #include "./vpx_dsp_rtcd.h" |
| 12 #include "vpx_dsp/mips/vpx_convolve_msa.h" | 13 #include "vpx_dsp/mips/vpx_convolve_msa.h" |
| 13 | 14 |
| 14 static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src, | 15 static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src, |
| 15 int32_t src_stride, | 16 int32_t src_stride, |
| 16 uint8_t *dst, | 17 uint8_t *dst, |
| 17 int32_t dst_stride, | 18 int32_t dst_stride, |
| 18 int8_t *filter) { | 19 int8_t *filter) { |
| 19 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; | 20 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; |
| 20 v16u8 dst0, dst1, dst2, dst3, res2, res3; | 21 v16u8 dst0, dst1, dst2, dst3, res2, res3; |
| (...skipping 295 matching lines...) |
| 316 } | 317 } |
| 317 } | 318 } |
| 318 | 319 |
| 319 static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src, | 320 static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src, |
| 320 int32_t src_stride, | 321 int32_t src_stride, |
| 321 uint8_t *dst, | 322 uint8_t *dst, |
| 322 int32_t dst_stride, | 323 int32_t dst_stride, |
| 323 int8_t *filter) { | 324 int8_t *filter) { |
| 324 v16i8 src0, src1, src2, src3, mask; | 325 v16i8 src0, src1, src2, src3, mask; |
| 325 v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1; | 326 v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1; |
| 326 v8u16 vec2, vec3, const255, filt; | 327 v8u16 vec2, vec3, filt; |
| 327 | 328 |
| 328 mask = LD_SB(&mc_filt_mask_arr[16]); | 329 mask = LD_SB(&mc_filt_mask_arr[16]); |
| 329 | 330 |
| 330 /* rearranging filter */ | 331 /* rearranging filter */ |
| 331 filt = LD_UH(filter); | 332 filt = LD_UH(filter); |
| 332 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); | 333 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); |
| 333 | 334 |
| 334 const255 = (v8u16)__msa_ldi_h(255); | |
| 335 | |
| 336 LD_SB4(src, src_stride, src0, src1, src2, src3); | 335 LD_SB4(src, src_stride, src0, src1, src2, src3); |
| 337 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); | 336 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); |
| 338 VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); | 337 VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); |
| 339 DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3); | 338 DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3); |
| 340 SRARI_H2_UH(vec2, vec3, FILTER_BITS); | 339 SRARI_H2_UH(vec2, vec3, FILTER_BITS); |
| 341 MIN_UH2_UH(vec2, vec3, const255); | |
| 342 PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1); | 340 PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1); |
| 343 ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); | 341 ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); |
| 344 AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1); | 342 AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1); |
| 345 ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); | 343 ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); |
| 346 } | 344 } |
| 347 | 345 |
| 348 static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src, | 346 static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src, |
| 349 int32_t src_stride, | 347 int32_t src_stride, |
| 350 uint8_t *dst, | 348 uint8_t *dst, |
| 351 int32_t dst_stride, | 349 int32_t dst_stride, |
| 352 int8_t *filter) { | 350 int8_t *filter) { |
| 353 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; | 351 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; |
| 354 v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3; | 352 v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3; |
| 355 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; | 353 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; |
| 356 v8u16 vec4, vec5, vec6, vec7, const255, filt; | 354 v8u16 vec4, vec5, vec6, vec7, filt; |
| 357 | 355 |
| 358 mask = LD_SB(&mc_filt_mask_arr[16]); | 356 mask = LD_SB(&mc_filt_mask_arr[16]); |
| 359 | 357 |
| 360 /* rearranging filter */ | 358 /* rearranging filter */ |
| 361 filt = LD_UH(filter); | 359 filt = LD_UH(filter); |
| 362 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); | 360 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); |
| 363 | 361 |
| 364 const255 = (v8u16)__msa_ldi_h(255); | |
| 365 | |
| 366 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); | 362 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); |
| 367 LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); | 363 LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); |
| 368 VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); | 364 VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); |
| 369 VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3); | 365 VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3); |
| 370 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5, | 366 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5, |
| 371 vec6, vec7); | 367 vec6, vec7); |
| 372 SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); | 368 SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); |
| 373 MIN_UH4_UH(vec4, vec5, vec6, vec7, const255); | |
| 374 PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, | 369 PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, |
| 375 res3); | 370 res3); |
| 376 ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4, | 371 ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4, |
| 377 dst6); | 372 dst6); |
| 378 AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2, | 373 AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2, |
| 379 res3); | 374 res3); |
| 380 ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); | 375 ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); |
| 381 dst += (4 * dst_stride); | 376 dst += (4 * dst_stride); |
| 382 ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); | 377 ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); |
| 383 } | 378 } |
| (...skipping 11 matching lines...) |
| 395 } | 390 } |
| 396 } | 391 } |
| 397 | 392 |
| 398 static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src, | 393 static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src, |
| 399 int32_t src_stride, | 394 int32_t src_stride, |
| 400 uint8_t *dst, | 395 uint8_t *dst, |
| 401 int32_t dst_stride, | 396 int32_t dst_stride, |
| 402 int8_t *filter) { | 397 int8_t *filter) { |
| 403 v16i8 src0, src1, src2, src3, mask; | 398 v16i8 src0, src1, src2, src3, mask; |
| 404 v16u8 filt0, dst0, dst1, dst2, dst3; | 399 v16u8 filt0, dst0, dst1, dst2, dst3; |
| 405 v8u16 vec0, vec1, vec2, vec3, const255, filt; | 400 v8u16 vec0, vec1, vec2, vec3, filt; |
| 406 | 401 |
| 407 mask = LD_SB(&mc_filt_mask_arr[0]); | 402 mask = LD_SB(&mc_filt_mask_arr[0]); |
| 408 | 403 |
| 409 /* rearranging filter */ | 404 /* rearranging filter */ |
| 410 filt = LD_UH(filter); | 405 filt = LD_UH(filter); |
| 411 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); | 406 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); |
| 412 | 407 |
| 413 const255 = (v8u16)__msa_ldi_h(255); | |
| 414 | |
| 415 LD_SB4(src, src_stride, src0, src1, src2, src3); | 408 LD_SB4(src, src_stride, src0, src1, src2, src3); |
| 416 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); | 409 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); |
| 417 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); | 410 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); |
| 418 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, | 411 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, |
| 419 vec2, vec3); | 412 vec2, vec3); |
| 420 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); | 413 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); |
| 421 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); | 414 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); |
| 422 MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); | |
| 423 PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, | 415 PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, |
| 424 dst, dst_stride); | 416 dst, dst_stride); |
| 425 } | 417 } |
| 426 | 418 |
| 427 static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src, | 419 static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src, |
| 428 int32_t src_stride, | 420 int32_t src_stride, |
| 429 uint8_t *dst, | 421 uint8_t *dst, |
| 430 int32_t dst_stride, | 422 int32_t dst_stride, |
| 431 int8_t *filter, | 423 int8_t *filter, |
| 432 int32_t height) { | 424 int32_t height) { |
| 433 v16i8 src0, src1, src2, src3, mask; | 425 v16i8 src0, src1, src2, src3, mask; |
| 434 v16u8 filt0, dst0, dst1, dst2, dst3; | 426 v16u8 filt0, dst0, dst1, dst2, dst3; |
| 435 v8u16 vec0, vec1, vec2, vec3, const255, filt; | 427 v8u16 vec0, vec1, vec2, vec3, filt; |
| 436 | 428 |
| 437 mask = LD_SB(&mc_filt_mask_arr[0]); | 429 mask = LD_SB(&mc_filt_mask_arr[0]); |
| 438 | 430 |
| 439 /* rearranging filter */ | 431 /* rearranging filter */ |
| 440 filt = LD_UH(filter); | 432 filt = LD_UH(filter); |
| 441 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); | 433 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); |
| 442 | 434 |
| 443 const255 = (v8u16)__msa_ldi_h(255); | |
| 444 | |
| 445 LD_SB4(src, src_stride, src0, src1, src2, src3); | 435 LD_SB4(src, src_stride, src0, src1, src2, src3); |
| 446 src += (4 * src_stride); | 436 src += (4 * src_stride); |
| 447 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); | 437 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); |
| 448 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); | 438 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); |
| 449 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, | 439 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, |
| 450 vec2, vec3); | 440 vec2, vec3); |
| 451 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); | 441 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); |
| 452 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); | 442 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); |
| 453 MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); | |
| 454 LD_SB4(src, src_stride, src0, src1, src2, src3); | 443 LD_SB4(src, src_stride, src0, src1, src2, src3); |
| 455 src += (4 * src_stride); | 444 src += (4 * src_stride); |
| 456 PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, | 445 PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, |
| 457 dst, dst_stride); | 446 dst, dst_stride); |
| 458 dst += (4 * dst_stride); | 447 dst += (4 * dst_stride); |
| 459 | 448 |
| 460 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); | 449 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); |
| 461 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); | 450 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); |
| 462 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, | 451 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, |
| 463 vec2, vec3); | 452 vec2, vec3); |
| 464 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); | 453 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); |
| 465 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); | 454 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); |
| 466 MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); | |
| 467 PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, | 455 PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, |
| 468 dst, dst_stride); | 456 dst, dst_stride); |
| 469 dst += (4 * dst_stride); | 457 dst += (4 * dst_stride); |
| 470 | 458 |
| 471 if (16 == height) { | 459 if (16 == height) { |
| 472 LD_SB4(src, src_stride, src0, src1, src2, src3); | 460 LD_SB4(src, src_stride, src0, src1, src2, src3); |
| 473 src += (4 * src_stride); | 461 src += (4 * src_stride); |
| 474 | 462 |
| 475 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); | 463 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); |
| 476 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); | 464 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); |
| 477 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, | 465 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, |
| 478 vec2, vec3); | 466 vec2, vec3); |
| 479 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); | 467 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); |
| 480 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); | 468 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); |
| 481 MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); | |
| 482 LD_SB4(src, src_stride, src0, src1, src2, src3); | 469 LD_SB4(src, src_stride, src0, src1, src2, src3); |
| 483 PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, | 470 PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, |
| 484 dst, dst_stride); | 471 dst, dst_stride); |
| 485 dst += (4 * dst_stride); | 472 dst += (4 * dst_stride); |
| 486 | 473 |
| 487 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); | 474 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); |
| 488 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); | 475 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); |
| 489 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, | 476 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, |
| 490 vec2, vec3); | 477 vec2, vec3); |
| 491 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); | 478 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); |
| 492 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); | 479 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); |
| 493 MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); | |
| 494 PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, | 480 PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, |
| 495 dst, dst_stride); | 481 dst, dst_stride); |
| 496 } | 482 } |
| 497 } | 483 } |
| 498 | 484 |
| 499 static void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src, | 485 static void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src, |
| 500 int32_t src_stride, | 486 int32_t src_stride, |
| 501 uint8_t *dst, | 487 uint8_t *dst, |
| 502 int32_t dst_stride, | 488 int32_t dst_stride, |
| 503 int8_t *filter, | 489 int8_t *filter, |
| 504 int32_t height) { | 490 int32_t height) { |
| 505 if (4 == height) { | 491 if (4 == height) { |
| 506 common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter); | 492 common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter); |
| 507 } else { | 493 } else { |
| 508 common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride, | 494 common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride, |
| 509 filter, height); | 495 filter, height); |
| 510 } | 496 } |
| 511 } | 497 } |
| 512 | 498 |
| 513 static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src, | 499 static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src, |
| 514 int32_t src_stride, | 500 int32_t src_stride, |
| 515 uint8_t *dst, | 501 uint8_t *dst, |
| 516 int32_t dst_stride, | 502 int32_t dst_stride, |
| 517 int8_t *filter, | 503 int8_t *filter, |
| 518 int32_t height) { | 504 int32_t height) { |
| 519 uint32_t loop_cnt; | 505 uint32_t loop_cnt; |
| 520 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; | 506 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; |
| 521 v16u8 filt0, dst0, dst1, dst2, dst3; | 507 v16u8 filt0, dst0, dst1, dst2, dst3; |
| 522 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; | 508 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; |
| 523 v8u16 res0, res1, res2, res3, res4, res5, res6, res7, const255, filt; | 509 v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt; |
| 524 | 510 |
| 525 mask = LD_SB(&mc_filt_mask_arr[0]); | 511 mask = LD_SB(&mc_filt_mask_arr[0]); |
| 526 | 512 |
| 527 /* rearranging filter */ | 513 /* rearranging filter */ |
| 528 filt = LD_UH(filter); | 514 filt = LD_UH(filter); |
| 529 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); | 515 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); |
| 530 | 516 |
| 531 const255 = (v8u16)__msa_ldi_h(255); | |
| 532 | |
| 533 LD_SB4(src, src_stride, src0, src2, src4, src6); | 517 LD_SB4(src, src_stride, src0, src2, src4, src6); |
| 534 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); | 518 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); |
| 535 src += (4 * src_stride); | 519 src += (4 * src_stride); |
| 536 | 520 |
| 537 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); | 521 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); |
| 538 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); | 522 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); |
| 539 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); | 523 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); |
| 540 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); | 524 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); |
| 541 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1, | 525 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1, |
| 542 res2, res3); | 526 res2, res3); |
| 543 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5, | 527 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5, |
| 544 res6, res7); | 528 res6, res7); |
| 545 SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS); | 529 SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS); |
| 546 SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS); | 530 SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS); |
| 547 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); | 531 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); |
| 548 MIN_UH4_UH(res0, res1, res2, res3, const255); | |
| 549 MIN_UH4_UH(res4, res5, res6, res7, const255); | |
| 550 PCKEV_AVG_ST_UB(res1, res0, dst0, dst); | 532 PCKEV_AVG_ST_UB(res1, res0, dst0, dst); |
| 551 dst += dst_stride; | 533 dst += dst_stride; |
| 552 PCKEV_AVG_ST_UB(res3, res2, dst1, dst); | 534 PCKEV_AVG_ST_UB(res3, res2, dst1, dst); |
| 553 dst += dst_stride; | 535 dst += dst_stride; |
| 554 PCKEV_AVG_ST_UB(res5, res4, dst2, dst); | 536 PCKEV_AVG_ST_UB(res5, res4, dst2, dst); |
| 555 dst += dst_stride; | 537 dst += dst_stride; |
| 556 PCKEV_AVG_ST_UB(res7, res6, dst3, dst); | 538 PCKEV_AVG_ST_UB(res7, res6, dst3, dst); |
| 557 dst += dst_stride; | 539 dst += dst_stride; |
| 558 | 540 |
| 559 for (loop_cnt = (height >> 2) - 1; loop_cnt--;) { | 541 for (loop_cnt = (height >> 2) - 1; loop_cnt--;) { |
| 560 LD_SB4(src, src_stride, src0, src2, src4, src6); | 542 LD_SB4(src, src_stride, src0, src2, src4, src6); |
| 561 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); | 543 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); |
| 562 src += (4 * src_stride); | 544 src += (4 * src_stride); |
| 563 | 545 |
| 564 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); | 546 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); |
| 565 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); | 547 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); |
| 566 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); | 548 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); |
| 567 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); | 549 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); |
| 568 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1, | 550 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1, |
| 569 res2, res3); | 551 res2, res3); |
| 570 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5, | 552 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5, |
| 571 res6, res7); | 553 res6, res7); |
| 572 SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS); | 554 SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS); |
| 573 SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS); | 555 SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS); |
| 574 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); | 556 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); |
| 575 MIN_UH4_UH(res0, res1, res2, res3, const255); | |
| 576 MIN_UH4_UH(res4, res5, res6, res7, const255); | |
| 577 PCKEV_AVG_ST_UB(res1, res0, dst0, dst); | 557 PCKEV_AVG_ST_UB(res1, res0, dst0, dst); |
| 578 dst += dst_stride; | 558 dst += dst_stride; |
| 579 PCKEV_AVG_ST_UB(res3, res2, dst1, dst); | 559 PCKEV_AVG_ST_UB(res3, res2, dst1, dst); |
| 580 dst += dst_stride; | 560 dst += dst_stride; |
| 581 PCKEV_AVG_ST_UB(res5, res4, dst2, dst); | 561 PCKEV_AVG_ST_UB(res5, res4, dst2, dst); |
| 582 dst += dst_stride; | 562 dst += dst_stride; |
| 583 PCKEV_AVG_ST_UB(res7, res6, dst3, dst); | 563 PCKEV_AVG_ST_UB(res7, res6, dst3, dst); |
| 584 dst += dst_stride; | 564 dst += dst_stride; |
| 585 } | 565 } |
| 586 } | 566 } |
| 587 | 567 |
| 588 static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src, | 568 static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src, |
| 589 int32_t src_stride, | 569 int32_t src_stride, |
| 590 uint8_t *dst, | 570 uint8_t *dst, |
| 591 int32_t dst_stride, | 571 int32_t dst_stride, |
| 592 int8_t *filter, | 572 int8_t *filter, |
| 593 int32_t height) { | 573 int32_t height) { |
| 594 uint32_t loop_cnt; | 574 uint32_t loop_cnt; |
| 595 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; | 575 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; |
| 596 v16u8 filt0, dst0, dst1, dst2, dst3; | 576 v16u8 filt0, dst0, dst1, dst2, dst3; |
| 597 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; | 577 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; |
| 598 v8u16 res0, res1, res2, res3, res4, res5, res6, res7, const255, filt; | 578 v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt; |
| 599 | 579 |
| 600 mask = LD_SB(&mc_filt_mask_arr[0]); | 580 mask = LD_SB(&mc_filt_mask_arr[0]); |
| 601 | 581 |
| 602 /* rearranging filter */ | 582 /* rearranging filter */ |
| 603 filt = LD_UH(filter); | 583 filt = LD_UH(filter); |
| 604 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); | 584 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); |
| 605 | 585 |
| 606 const255 = (v8u16)__msa_ldi_h(255); | |
| 607 | |
| 608 for (loop_cnt = (height >> 1); loop_cnt--;) { | 586 for (loop_cnt = (height >> 1); loop_cnt--;) { |
| 609 src0 = LD_SB(src); | 587 src0 = LD_SB(src); |
| 610 src2 = LD_SB(src + 16); | 588 src2 = LD_SB(src + 16); |
| 611 src3 = LD_SB(src + 24); | 589 src3 = LD_SB(src + 24); |
| 612 src1 = __msa_sldi_b(src2, src0, 8); | 590 src1 = __msa_sldi_b(src2, src0, 8); |
| 613 src += src_stride; | 591 src += src_stride; |
| 614 src4 = LD_SB(src); | 592 src4 = LD_SB(src); |
| 615 src6 = LD_SB(src + 16); | 593 src6 = LD_SB(src + 16); |
| 616 src7 = LD_SB(src + 24); | 594 src7 = LD_SB(src + 24); |
| 617 src5 = __msa_sldi_b(src6, src4, 8); | 595 src5 = __msa_sldi_b(src6, src4, 8); |
| 618 src += src_stride; | 596 src += src_stride; |
| 619 | 597 |
| 620 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); | 598 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); |
| 621 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); | 599 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); |
| 622 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); | 600 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); |
| 623 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); | 601 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); |
| 624 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1, | 602 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1, |
| 625 res2, res3); | 603 res2, res3); |
| 626 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5, | 604 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5, |
| 627 res6, res7); | 605 res6, res7); |
| 628 SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS); | 606 SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS); |
| 629 SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS); | 607 SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS); |
| 630 MIN_UH4_UH(res0, res1, res2, res3, const255); | |
| 631 MIN_UH4_UH(res4, res5, res6, res7, const255); | |
| 632 LD_UB2(dst, 16, dst0, dst1); | 608 LD_UB2(dst, 16, dst0, dst1); |
| 633 PCKEV_AVG_ST_UB(res1, res0, dst0, dst); | 609 PCKEV_AVG_ST_UB(res1, res0, dst0, dst); |
| 634 PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16)); | 610 PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16)); |
| 635 dst += dst_stride; | 611 dst += dst_stride; |
| 636 LD_UB2(dst, 16, dst2, dst3); | 612 LD_UB2(dst, 16, dst2, dst3); |
| 637 PCKEV_AVG_ST_UB(res5, res4, dst2, dst); | 613 PCKEV_AVG_ST_UB(res5, res4, dst2, dst); |
| 638 PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16)); | 614 PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16)); |
| 639 dst += dst_stride; | 615 dst += dst_stride; |
| 640 } | 616 } |
| 641 } | 617 } |
| 642 | 618 |
| 643 static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src, | 619 static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src, |
| 644 int32_t src_stride, | 620 int32_t src_stride, |
| 645 uint8_t *dst, | 621 uint8_t *dst, |
| 646 int32_t dst_stride, | 622 int32_t dst_stride, |
| 647 int8_t *filter, | 623 int8_t *filter, |
| 648 int32_t height) { | 624 int32_t height) { |
| 649 uint32_t loop_cnt; | 625 uint32_t loop_cnt; |
| 650 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; | 626 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; |
| 651 v16u8 filt0, dst0, dst1, dst2, dst3; | 627 v16u8 filt0, dst0, dst1, dst2, dst3; |
| 652 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; | 628 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; |
| 653 v8u16 out0, out1, out2, out3, out4, out5, out6, out7, const255, filt; | 629 v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; |
| 654 | 630 |
| 655 mask = LD_SB(&mc_filt_mask_arr[0]); | 631 mask = LD_SB(&mc_filt_mask_arr[0]); |
| 656 | 632 |
| 657 /* rearranging filter */ | 633 /* rearranging filter */ |
| 658 filt = LD_UH(filter); | 634 filt = LD_UH(filter); |
| 659 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); | 635 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); |
| 660 | 636 |
| 661 const255 = (v8u16)__msa_ldi_h(255); | |
| 662 | |
| 663 for (loop_cnt = height; loop_cnt--;) { | 637 for (loop_cnt = height; loop_cnt--;) { |
| 664 LD_SB4(src, 16, src0, src2, src4, src6); | 638 LD_SB4(src, 16, src0, src2, src4, src6); |
| 665 src7 = LD_SB(src + 56); | 639 src7 = LD_SB(src + 56); |
| 666 SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8); | 640 SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8); |
| 667 src += src_stride; | 641 src += src_stride; |
| 668 | 642 |
| 669 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); | 643 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); |
| 670 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); | 644 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); |
| 671 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); | 645 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); |
| 672 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); | 646 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); |
| 673 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, | 647 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, |
| 674 out2, out3); | 648 out2, out3); |
| 675 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, | 649 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, |
| 676 out6, out7); | 650 out6, out7); |
| 677 SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); | 651 SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); |
| 678 SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); | 652 SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); |
| 679 LD_UB4(dst, 16, dst0, dst1, dst2, dst3); | 653 LD_UB4(dst, 16, dst0, dst1, dst2, dst3); |
| 680 MIN_UH4_UH(out0, out1, out2, out3, const255); | |
| 681 MIN_UH4_UH(out4, out5, out6, out7, const255); | |
| 682 PCKEV_AVG_ST_UB(out1, out0, dst0, dst); | 654 PCKEV_AVG_ST_UB(out1, out0, dst0, dst); |
| 683 PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16); | 655 PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16); |
| 684 PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32); | 656 PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32); |
| 685 PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48); | 657 PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48); |
| 686 dst += dst_stride; | 658 dst += dst_stride; |
| 687 } | 659 } |
| 688 } | 660 } |
| 689 | 661 |
| 690 void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, | 662 void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, |
| 691 uint8_t *dst, ptrdiff_t dst_stride, | 663 uint8_t *dst, ptrdiff_t dst_stride, |
| 692 const int16_t *filter_x, int x_step_q4, | 664 const int16_t *filter_x, int x_step_q4, |
| 693 const int16_t *filter_y, int y_step_q4, | 665 const int16_t *filter_y, int y_step_q4, |
| 694 int w, int h) { | 666 int w, int h) { |
| 695 int8_t cnt, filt_hor[8]; | 667 int8_t cnt, filt_hor[8]; |
| 696 | 668 |
| 697 if (16 != x_step_q4) { | 669 assert(x_step_q4 == 16); |
| 698 vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, | 670 assert(((const int32_t *)filter_x)[1] != 0x800000); |
| 699 filter_x, x_step_q4, filter_y, y_step_q4, | |
| 700 w, h); | |
| 701 return; | |
| 702 } | |
| 703 | |
| 704 if (((const int32_t *)filter_x)[1] == 0x800000) { | |
| 705 vpx_convolve_avg(src, src_stride, dst, dst_stride, | |
| 706 filter_x, x_step_q4, filter_y, y_step_q4, | |
| 707 w, h); | |
| 708 return; | |
| 709 } | |
| 710 | 671 |
| 711 for (cnt = 0; cnt < 8; ++cnt) { | 672 for (cnt = 0; cnt < 8; ++cnt) { |
| 712 filt_hor[cnt] = filter_x[cnt]; | 673 filt_hor[cnt] = filter_x[cnt]; |
| 713 } | 674 } |
| 714 | 675 |
| 715 if (((const int32_t *)filter_x)[0] == 0) { | 676 if (((const int32_t *)filter_x)[0] == 0) { |
| 716 switch (w) { | 677 switch (w) { |
| 717 case 4: | 678 case 4: |
| 718 common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, | 679 common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, |
| 719 dst, (int32_t)dst_stride, | 680 dst, (int32_t)dst_stride, |
| (...skipping 53 matching lines...) |
| 773 filt_hor, h); | 734 filt_hor, h); |
| 774 break; | 735 break; |
| 775 default: | 736 default: |
| 776 vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, | 737 vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, |
| 777 filter_x, x_step_q4, filter_y, y_step_q4, | 738 filter_x, x_step_q4, filter_y, y_step_q4, |
| 778 w, h); | 739 w, h); |
| 779 break; | 740 break; |
| 780 } | 741 } |
| 781 } | 742 } |
| 782 } | 743 } |
| OLD | NEW |