| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 #include <assert.h> |
| 11 #include "./vpx_dsp_rtcd.h" | 12 #include "./vpx_dsp_rtcd.h" |
| 12 #include "vpx_dsp/mips/vpx_convolve_msa.h" | 13 #include "vpx_dsp/mips/vpx_convolve_msa.h" |
| 13 | 14 |
| 14 static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride, | 15 static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride, |
| 15 uint8_t *dst, int32_t dst_stride, | 16 uint8_t *dst, int32_t dst_stride, |
| 16 int8_t *filter) { | 17 int8_t *filter) { |
| 17 v16u8 mask0, mask1, mask2, mask3, out; | 18 v16u8 mask0, mask1, mask2, mask3, out; |
| 18 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; | 19 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; |
| 19 v8i16 filt, out0, out1; | 20 v8i16 filt, out0, out1; |
| 20 | 21 |
| (...skipping 290 matching lines...) |
| 311 ST_UB(out, dst + 48); | 312 ST_UB(out, dst + 48); |
| 312 dst += dst_stride; | 313 dst += dst_stride; |
| 313 } | 314 } |
| 314 } | 315 } |
| 315 | 316 |
| 316 static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride, | 317 static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride, |
| 317 uint8_t *dst, int32_t dst_stride, | 318 uint8_t *dst, int32_t dst_stride, |
| 318 int8_t *filter) { | 319 int8_t *filter) { |
| 319 v16i8 src0, src1, src2, src3, mask; | 320 v16i8 src0, src1, src2, src3, mask; |
| 320 v16u8 filt0, vec0, vec1, res0, res1; | 321 v16u8 filt0, vec0, vec1, res0, res1; |
| 321 v8u16 vec2, vec3, filt, const255; | 322 v8u16 vec2, vec3, filt; |
| 322 | 323 |
| 323 mask = LD_SB(&mc_filt_mask_arr[16]); | 324 mask = LD_SB(&mc_filt_mask_arr[16]); |
| 324 | 325 |
| 325 /* rearranging filter */ | 326 /* rearranging filter */ |
| 326 filt = LD_UH(filter); | 327 filt = LD_UH(filter); |
| 327 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); | 328 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); |
| 328 | 329 |
| 329 const255 = (v8u16) __msa_ldi_h(255); | |
| 330 | |
| 331 LD_SB4(src, src_stride, src0, src1, src2, src3); | 330 LD_SB4(src, src_stride, src0, src1, src2, src3); |
| 332 VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); | 331 VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); |
| 333 DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3); | 332 DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3); |
| 334 SRARI_H2_UH(vec2, vec3, FILTER_BITS); | 333 SRARI_H2_UH(vec2, vec3, FILTER_BITS); |
| 335 MIN_UH2_UH(vec2, vec3, const255); | |
| 336 PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1); | 334 PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1); |
| 337 ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); | 335 ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); |
| 338 } | 336 } |
| 339 | 337 |
| 340 static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride, | 338 static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride, |
| 341 uint8_t *dst, int32_t dst_stride, | 339 uint8_t *dst, int32_t dst_stride, |
| 342 int8_t *filter) { | 340 int8_t *filter) { |
| 343 v16u8 vec0, vec1, vec2, vec3, filt0; | 341 v16u8 vec0, vec1, vec2, vec3, filt0; |
| 344 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; | 342 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; |
| 345 v16i8 res0, res1, res2, res3; | 343 v16i8 res0, res1, res2, res3; |
| 346 v8u16 vec4, vec5, vec6, vec7, filt, const255; | 344 v8u16 vec4, vec5, vec6, vec7, filt; |
| 347 | 345 |
| 348 mask = LD_SB(&mc_filt_mask_arr[16]); | 346 mask = LD_SB(&mc_filt_mask_arr[16]); |
| 349 | 347 |
| 350 /* rearranging filter */ | 348 /* rearranging filter */ |
| 351 filt = LD_UH(filter); | 349 filt = LD_UH(filter); |
| 352 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); | 350 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); |
| 353 | 351 |
| 354 const255 = (v8u16) __msa_ldi_h(255); | |
| 355 | |
| 356 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); | 352 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); |
| 357 VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); | 353 VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); |
| 358 VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3); | 354 VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3); |
| 359 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5, | 355 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5, |
| 360 vec6, vec7); | 356 vec6, vec7); |
| 361 SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); | 357 SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); |
| 362 MIN_UH4_UH(vec4, vec5, vec6, vec7, const255); | |
| 363 PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, | 358 PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, |
| 364 res2, res3); | 359 res2, res3); |
| 365 ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); | 360 ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); |
| 366 dst += (4 * dst_stride); | 361 dst += (4 * dst_stride); |
| 367 ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); | 362 ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); |
| 368 } | 363 } |
| 369 | 364 |
| 370 static void common_hz_2t_4w_msa(const uint8_t *src, int32_t src_stride, | 365 static void common_hz_2t_4w_msa(const uint8_t *src, int32_t src_stride, |
| 371 uint8_t *dst, int32_t dst_stride, | 366 uint8_t *dst, int32_t dst_stride, |
| 372 int8_t *filter, int32_t height) { | 367 int8_t *filter, int32_t height) { |
| 373 if (4 == height) { | 368 if (4 == height) { |
| 374 common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter); | 369 common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter); |
| 375 } else if (8 == height) { | 370 } else if (8 == height) { |
| 376 common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter); | 371 common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter); |
| 377 } | 372 } |
| 378 } | 373 } |
| 379 | 374 |
| 380 static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride, | 375 static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride, |
| 381 uint8_t *dst, int32_t dst_stride, | 376 uint8_t *dst, int32_t dst_stride, |
| 382 int8_t *filter) { | 377 int8_t *filter) { |
| 383 v16u8 filt0; | 378 v16u8 filt0; |
| 384 v16i8 src0, src1, src2, src3, mask; | 379 v16i8 src0, src1, src2, src3, mask; |
| 385 v8u16 vec0, vec1, vec2, vec3, const255, filt; | 380 v8u16 vec0, vec1, vec2, vec3, filt; |
| 386 | 381 |
| 387 mask = LD_SB(&mc_filt_mask_arr[0]); | 382 mask = LD_SB(&mc_filt_mask_arr[0]); |
| 388 | 383 |
| 389 /* rearranging filter */ | 384 /* rearranging filter */ |
| 390 filt = LD_UH(filter); | 385 filt = LD_UH(filter); |
| 391 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); | 386 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); |
| 392 | 387 |
| 393 const255 = (v8u16) __msa_ldi_h(255); | |
| 394 | |
| 395 LD_SB4(src, src_stride, src0, src1, src2, src3); | 388 LD_SB4(src, src_stride, src0, src1, src2, src3); |
| 396 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); | 389 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); |
| 397 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); | 390 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); |
| 398 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, | 391 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, |
| 399 vec2, vec3); | 392 vec2, vec3); |
| 400 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); | 393 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); |
| 401 MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); | |
| 402 PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1); | 394 PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1); |
| 403 ST8x4_UB(src0, src1, dst, dst_stride); | 395 ST8x4_UB(src0, src1, dst, dst_stride); |
| 404 } | 396 } |
| 405 | 397 |
| 406 static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, | 398 static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, |
| 407 uint8_t *dst, int32_t dst_stride, | 399 uint8_t *dst, int32_t dst_stride, |
| 408 int8_t *filter, int32_t height) { | 400 int8_t *filter, int32_t height) { |
| 409 v16u8 filt0; | 401 v16u8 filt0; |
| 410 v16i8 src0, src1, src2, src3, mask, out0, out1; | 402 v16i8 src0, src1, src2, src3, mask, out0, out1; |
| 411 v8u16 vec0, vec1, vec2, vec3, filt, const255; | 403 v8u16 vec0, vec1, vec2, vec3, filt; |
| 412 | 404 |
| 413 mask = LD_SB(&mc_filt_mask_arr[0]); | 405 mask = LD_SB(&mc_filt_mask_arr[0]); |
| 414 | 406 |
| 415 /* rearranging filter */ | 407 /* rearranging filter */ |
| 416 filt = LD_UH(filter); | 408 filt = LD_UH(filter); |
| 417 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); | 409 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); |
| 418 | 410 |
| 419 const255 = (v8u16) __msa_ldi_h(255); | |
| 420 | |
| 421 LD_SB4(src, src_stride, src0, src1, src2, src3); | 411 LD_SB4(src, src_stride, src0, src1, src2, src3); |
| 422 src += (4 * src_stride); | 412 src += (4 * src_stride); |
| 423 | 413 |
| 424 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); | 414 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); |
| 425 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); | 415 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); |
| 426 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, | 416 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, |
| 427 vec2, vec3); | 417 vec2, vec3); |
| 428 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); | 418 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); |
| 429 MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); | |
| 430 | 419 |
| 431 LD_SB4(src, src_stride, src0, src1, src2, src3); | 420 LD_SB4(src, src_stride, src0, src1, src2, src3); |
| 432 src += (4 * src_stride); | 421 src += (4 * src_stride); |
| 433 | 422 |
| 434 PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); | 423 PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); |
| 435 ST8x4_UB(out0, out1, dst, dst_stride); | 424 ST8x4_UB(out0, out1, dst, dst_stride); |
| 436 dst += (4 * dst_stride); | 425 dst += (4 * dst_stride); |
| 437 | 426 |
| 438 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); | 427 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); |
| 439 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); | 428 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); |
| 440 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, | 429 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, |
| 441 vec2, vec3); | 430 vec2, vec3); |
| 442 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); | 431 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); |
| 443 MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); | |
| 444 PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); | 432 PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); |
| 445 ST8x4_UB(out0, out1, dst, dst_stride); | 433 ST8x4_UB(out0, out1, dst, dst_stride); |
| 446 dst += (4 * dst_stride); | 434 dst += (4 * dst_stride); |
| 447 | 435 |
| 448 if (16 == height) { | 436 if (16 == height) { |
| 449 LD_SB4(src, src_stride, src0, src1, src2, src3); | 437 LD_SB4(src, src_stride, src0, src1, src2, src3); |
| 450 src += (4 * src_stride); | 438 src += (4 * src_stride); |
| 451 | 439 |
| 452 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); | 440 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); |
| 453 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); | 441 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); |
| 454 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, | 442 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, |
| 455 vec2, vec3); | 443 vec2, vec3); |
| 456 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); | 444 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); |
| 457 MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); | |
| 458 LD_SB4(src, src_stride, src0, src1, src2, src3); | 445 LD_SB4(src, src_stride, src0, src1, src2, src3); |
| 459 src += (4 * src_stride); | 446 src += (4 * src_stride); |
| 460 | 447 |
| 461 PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); | 448 PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); |
| 462 ST8x4_UB(out0, out1, dst, dst_stride); | 449 ST8x4_UB(out0, out1, dst, dst_stride); |
| 463 | 450 |
| 464 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); | 451 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); |
| 465 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); | 452 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); |
| 466 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, | 453 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, |
| 467 vec2, vec3); | 454 vec2, vec3); |
| 468 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); | 455 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); |
| 469 MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); | |
| 470 PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); | 456 PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); |
| 471 ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride); | 457 ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride); |
| 472 } | 458 } |
| 473 } | 459 } |
| 474 | 460 |
| 475 static void common_hz_2t_8w_msa(const uint8_t *src, int32_t src_stride, | 461 static void common_hz_2t_8w_msa(const uint8_t *src, int32_t src_stride, |
| 476 uint8_t *dst, int32_t dst_stride, | 462 uint8_t *dst, int32_t dst_stride, |
| 477 int8_t *filter, int32_t height) { | 463 int8_t *filter, int32_t height) { |
| 478 if (4 == height) { | 464 if (4 == height) { |
| 479 common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter); | 465 common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter); |
| 480 } else { | 466 } else { |
| 481 common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height); | 467 common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height); |
| 482 } | 468 } |
| 483 } | 469 } |
| 484 | 470 |
| 485 static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride, | 471 static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride, |
| 486 uint8_t *dst, int32_t dst_stride, | 472 uint8_t *dst, int32_t dst_stride, |
| 487 int8_t *filter, int32_t height) { | 473 int8_t *filter, int32_t height) { |
| 488 uint32_t loop_cnt; | 474 uint32_t loop_cnt; |
| 489 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; | 475 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; |
| 490 v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; | 476 v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; |
| 491 v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt, const255; | 477 v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; |
| 492 | 478 |
| 493 mask = LD_SB(&mc_filt_mask_arr[0]); | 479 mask = LD_SB(&mc_filt_mask_arr[0]); |
| 494 | 480 |
| 495 loop_cnt = (height >> 2) - 1; | 481 loop_cnt = (height >> 2) - 1; |
| 496 | 482 |
| 497 /* rearranging filter */ | 483 /* rearranging filter */ |
| 498 filt = LD_UH(filter); | 484 filt = LD_UH(filter); |
| 499 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); | 485 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); |
| 500 | 486 |
| 501 const255 = (v8u16) __msa_ldi_h(255); | |
| 502 | |
| 503 LD_SB4(src, src_stride, src0, src2, src4, src6); | 487 LD_SB4(src, src_stride, src0, src2, src4, src6); |
| 504 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); | 488 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); |
| 505 src += (4 * src_stride); | 489 src += (4 * src_stride); |
| 506 | 490 |
| 507 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); | 491 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); |
| 508 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); | 492 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); |
| 509 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); | 493 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); |
| 510 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); | 494 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); |
| 511 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, | 495 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, |
| 512 out2, out3); | 496 out2, out3); |
| 513 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, | 497 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, |
| 514 out6, out7); | 498 out6, out7); |
| 515 SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); | 499 SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); |
| 516 SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); | 500 SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); |
| 517 MIN_UH4_UH(out0, out1, out2, out3, const255); | |
| 518 MIN_UH4_UH(out4, out5, out6, out7, const255); | |
| 519 PCKEV_ST_SB(out0, out1, dst); | 501 PCKEV_ST_SB(out0, out1, dst); |
| 520 dst += dst_stride; | 502 dst += dst_stride; |
| 521 PCKEV_ST_SB(out2, out3, dst); | 503 PCKEV_ST_SB(out2, out3, dst); |
| 522 dst += dst_stride; | 504 dst += dst_stride; |
| 523 PCKEV_ST_SB(out4, out5, dst); | 505 PCKEV_ST_SB(out4, out5, dst); |
| 524 dst += dst_stride; | 506 dst += dst_stride; |
| 525 PCKEV_ST_SB(out6, out7, dst); | 507 PCKEV_ST_SB(out6, out7, dst); |
| 526 dst += dst_stride; | 508 dst += dst_stride; |
| 527 | 509 |
| 528 for (; loop_cnt--;) { | 510 for (; loop_cnt--;) { |
| 529 LD_SB4(src, src_stride, src0, src2, src4, src6); | 511 LD_SB4(src, src_stride, src0, src2, src4, src6); |
| 530 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); | 512 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); |
| 531 src += (4 * src_stride); | 513 src += (4 * src_stride); |
| 532 | 514 |
| 533 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); | 515 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); |
| 534 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); | 516 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); |
| 535 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); | 517 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); |
| 536 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); | 518 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); |
| 537 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, | 519 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, |
| 538 out2, out3); | 520 out2, out3); |
| 539 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, | 521 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, |
| 540 out6, out7); | 522 out6, out7); |
| 541 SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); | 523 SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); |
| 542 SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); | 524 SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); |
| 543 MIN_UH4_UH(out0, out1, out2, out3, const255); | |
| 544 MIN_UH4_UH(out4, out5, out6, out7, const255); | |
| 545 PCKEV_ST_SB(out0, out1, dst); | 525 PCKEV_ST_SB(out0, out1, dst); |
| 546 dst += dst_stride; | 526 dst += dst_stride; |
| 547 PCKEV_ST_SB(out2, out3, dst); | 527 PCKEV_ST_SB(out2, out3, dst); |
| 548 dst += dst_stride; | 528 dst += dst_stride; |
| 549 PCKEV_ST_SB(out4, out5, dst); | 529 PCKEV_ST_SB(out4, out5, dst); |
| 550 dst += dst_stride; | 530 dst += dst_stride; |
| 551 PCKEV_ST_SB(out6, out7, dst); | 531 PCKEV_ST_SB(out6, out7, dst); |
| 552 dst += dst_stride; | 532 dst += dst_stride; |
| 553 } | 533 } |
| 554 } | 534 } |
| 555 | 535 |
| 556 static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride, | 536 static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride, |
| 557 uint8_t *dst, int32_t dst_stride, | 537 uint8_t *dst, int32_t dst_stride, |
| 558 int8_t *filter, int32_t height) { | 538 int8_t *filter, int32_t height) { |
| 559 uint32_t loop_cnt; | 539 uint32_t loop_cnt; |
| 560 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; | 540 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; |
| 561 v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; | 541 v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; |
| 562 v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt, const255; | 542 v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; |
| 563 | 543 |
| 564 mask = LD_SB(&mc_filt_mask_arr[0]); | 544 mask = LD_SB(&mc_filt_mask_arr[0]); |
| 565 | 545 |
| 566 /* rearranging filter */ | 546 /* rearranging filter */ |
| 567 filt = LD_UH(filter); | 547 filt = LD_UH(filter); |
| 568 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); | 548 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); |
| 569 | 549 |
| 570 const255 = (v8u16) __msa_ldi_h(255); | |
| 571 | |
| 572 for (loop_cnt = height >> 1; loop_cnt--;) { | 550 for (loop_cnt = height >> 1; loop_cnt--;) { |
| 573 src0 = LD_SB(src); | 551 src0 = LD_SB(src); |
| 574 src2 = LD_SB(src + 16); | 552 src2 = LD_SB(src + 16); |
| 575 src3 = LD_SB(src + 24); | 553 src3 = LD_SB(src + 24); |
| 576 src1 = __msa_sldi_b(src2, src0, 8); | 554 src1 = __msa_sldi_b(src2, src0, 8); |
| 577 src += src_stride; | 555 src += src_stride; |
| 578 src4 = LD_SB(src); | 556 src4 = LD_SB(src); |
| 579 src6 = LD_SB(src + 16); | 557 src6 = LD_SB(src + 16); |
| 580 src7 = LD_SB(src + 24); | 558 src7 = LD_SB(src + 24); |
| 581 src5 = __msa_sldi_b(src6, src4, 8); | 559 src5 = __msa_sldi_b(src6, src4, 8); |
| 582 src += src_stride; | 560 src += src_stride; |
| 583 | 561 |
| 584 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); | 562 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); |
| 585 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); | 563 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); |
| 586 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); | 564 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); |
| 587 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); | 565 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); |
| 588 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, | 566 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, |
| 589 out2, out3); | 567 out2, out3); |
| 590 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, | 568 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, |
| 591 out6, out7); | 569 out6, out7); |
| 592 SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); | 570 SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); |
| 593 SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); | 571 SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); |
| 594 MIN_UH4_UH(out0, out1, out2, out3, const255); | |
| 595 MIN_UH4_UH(out4, out5, out6, out7, const255); | |
| 596 PCKEV_ST_SB(out0, out1, dst); | 572 PCKEV_ST_SB(out0, out1, dst); |
| 597 PCKEV_ST_SB(out2, out3, dst + 16); | 573 PCKEV_ST_SB(out2, out3, dst + 16); |
| 598 dst += dst_stride; | 574 dst += dst_stride; |
| 599 PCKEV_ST_SB(out4, out5, dst); | 575 PCKEV_ST_SB(out4, out5, dst); |
| 600 PCKEV_ST_SB(out6, out7, dst + 16); | 576 PCKEV_ST_SB(out6, out7, dst + 16); |
| 601 dst += dst_stride; | 577 dst += dst_stride; |
| 602 } | 578 } |
| 603 } | 579 } |
| 604 | 580 |
| 605 static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride, | 581 static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride, |
| 606 uint8_t *dst, int32_t dst_stride, | 582 uint8_t *dst, int32_t dst_stride, |
| 607 int8_t *filter, int32_t height) { | 583 int8_t *filter, int32_t height) { |
| 608 uint32_t loop_cnt; | 584 uint32_t loop_cnt; |
| 609 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; | 585 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; |
| 610 v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; | 586 v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; |
| 611 v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt, const255; | 587 v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; |
| 612 | 588 |
| 613 mask = LD_SB(&mc_filt_mask_arr[0]); | 589 mask = LD_SB(&mc_filt_mask_arr[0]); |
| 614 | 590 |
| 615 /* rearranging filter */ | 591 /* rearranging filter */ |
| 616 filt = LD_UH(filter); | 592 filt = LD_UH(filter); |
| 617 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); | 593 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); |
| 618 | 594 |
| 619 const255 = (v8u16) __msa_ldi_h(255); | |
| 620 | |
| 621 for (loop_cnt = height; loop_cnt--;) { | 595 for (loop_cnt = height; loop_cnt--;) { |
| 622 src0 = LD_SB(src); | 596 src0 = LD_SB(src); |
| 623 src2 = LD_SB(src + 16); | 597 src2 = LD_SB(src + 16); |
| 624 src4 = LD_SB(src + 32); | 598 src4 = LD_SB(src + 32); |
| 625 src6 = LD_SB(src + 48); | 599 src6 = LD_SB(src + 48); |
| 626 src7 = LD_SB(src + 56); | 600 src7 = LD_SB(src + 56); |
| 627 SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8); | 601 SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8); |
| 628 src += src_stride; | 602 src += src_stride; |
| 629 | 603 |
| 630 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); | 604 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); |
| 631 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); | 605 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); |
| 632 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); | 606 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); |
| 633 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); | 607 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); |
| 634 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, | 608 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, |
| 635 out2, out3); | 609 out2, out3); |
| 636 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, | 610 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, |
| 637 out6, out7); | 611 out6, out7); |
| 638 SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); | 612 SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); |
| 639 SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); | 613 SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); |
| 640 MIN_UH4_UH(out0, out1, out2, out3, const255); | |
| 641 MIN_UH4_UH(out4, out5, out6, out7, const255); | |
| 642 PCKEV_ST_SB(out0, out1, dst); | 614 PCKEV_ST_SB(out0, out1, dst); |
| 643 PCKEV_ST_SB(out2, out3, dst + 16); | 615 PCKEV_ST_SB(out2, out3, dst + 16); |
| 644 PCKEV_ST_SB(out4, out5, dst + 32); | 616 PCKEV_ST_SB(out4, out5, dst + 32); |
| 645 PCKEV_ST_SB(out6, out7, dst + 48); | 617 PCKEV_ST_SB(out6, out7, dst + 48); |
| 646 dst += dst_stride; | 618 dst += dst_stride; |
| 647 } | 619 } |
| 648 } | 620 } |
| 649 | 621 |
| 650 void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, | 622 void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, |
| 651 uint8_t *dst, ptrdiff_t dst_stride, | 623 uint8_t *dst, ptrdiff_t dst_stride, |
| 652 const int16_t *filter_x, int x_step_q4, | 624 const int16_t *filter_x, int x_step_q4, |
| 653 const int16_t *filter_y, int y_step_q4, | 625 const int16_t *filter_y, int y_step_q4, |
| 654 int w, int h) { | 626 int w, int h) { |
| 655 int8_t cnt, filt_hor[8]; | 627 int8_t cnt, filt_hor[8]; |
| 656 | 628 |
| 657 if (16 != x_step_q4) { | 629 assert(x_step_q4 == 16); |
| 658 vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, | 630 assert(((const int32_t *)filter_x)[1] != 0x800000); |
| 659 filter_x, x_step_q4, filter_y, y_step_q4, | |
| 660 w, h); | |
| 661 return; | |
| 662 } | |
| 663 | |
| 664 if (((const int32_t *)filter_x)[1] == 0x800000) { | |
| 665 vpx_convolve_copy(src, src_stride, dst, dst_stride, | |
| 666 filter_x, x_step_q4, filter_y, y_step_q4, | |
| 667 w, h); | |
| 668 return; | |
| 669 } | |
| 670 | 631 |
| 671 for (cnt = 0; cnt < 8; ++cnt) { | 632 for (cnt = 0; cnt < 8; ++cnt) { |
| 672 filt_hor[cnt] = filter_x[cnt]; | 633 filt_hor[cnt] = filter_x[cnt]; |
| 673 } | 634 } |
| 674 | 635 |
| 675 if (((const int32_t *)filter_x)[0] == 0) { | 636 if (((const int32_t *)filter_x)[0] == 0) { |
| 676 switch (w) { | 637 switch (w) { |
| 677 case 4: | 638 case 4: |
| 678 common_hz_2t_4w_msa(src, (int32_t)src_stride, | 639 common_hz_2t_4w_msa(src, (int32_t)src_stride, |
| 679 dst, (int32_t)dst_stride, | 640 dst, (int32_t)dst_stride, |
| (...skipping 53 matching lines...) |
| 733 filt_hor, h); | 694 filt_hor, h); |
| 734 break; | 695 break; |
| 735 default: | 696 default: |
| 736 vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, | 697 vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, |
| 737 filter_x, x_step_q4, filter_y, y_step_q4, | 698 filter_x, x_step_q4, filter_y, y_step_q4, |
| 738 w, h); | 699 w, h); |
| 739 break; | 700 break; |
| 740 } | 701 } |
| 741 } | 702 } |
| 742 } | 703 } |
| OLD | NEW |