OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
| 11 #include <assert.h> |
11 #include "./vpx_dsp_rtcd.h" | 12 #include "./vpx_dsp_rtcd.h" |
12 #include "vpx_dsp/mips/vpx_convolve_msa.h" | 13 #include "vpx_dsp/mips/vpx_convolve_msa.h" |
13 | 14 |
14 static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src, | 15 static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src, |
15 int32_t src_stride, | 16 int32_t src_stride, |
16 uint8_t *dst, | 17 uint8_t *dst, |
17 int32_t dst_stride, | 18 int32_t dst_stride, |
18 int8_t *filter) { | 19 int8_t *filter) { |
19 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; | 20 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; |
20 v16u8 dst0, dst1, dst2, dst3, res2, res3; | 21 v16u8 dst0, dst1, dst2, dst3, res2, res3; |
(...skipping 295 matching lines...)
316 } | 317 } |
317 } | 318 } |
318 | 319 |
319 static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src, | 320 static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src, |
320 int32_t src_stride, | 321 int32_t src_stride, |
321 uint8_t *dst, | 322 uint8_t *dst, |
322 int32_t dst_stride, | 323 int32_t dst_stride, |
323 int8_t *filter) { | 324 int8_t *filter) { |
324 v16i8 src0, src1, src2, src3, mask; | 325 v16i8 src0, src1, src2, src3, mask; |
325 v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1; | 326 v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1; |
326 v8u16 vec2, vec3, const255, filt; | 327 v8u16 vec2, vec3, filt; |
327 | 328 |
328 mask = LD_SB(&mc_filt_mask_arr[16]); | 329 mask = LD_SB(&mc_filt_mask_arr[16]); |
329 | 330 |
330 /* rearranging filter */ | 331 /* rearranging filter */ |
331 filt = LD_UH(filter); | 332 filt = LD_UH(filter); |
332 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); | 333 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); |
333 | 334 |
334 const255 = (v8u16)__msa_ldi_h(255); | |
335 | |
336 LD_SB4(src, src_stride, src0, src1, src2, src3); | 335 LD_SB4(src, src_stride, src0, src1, src2, src3); |
337 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); | 336 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); |
338 VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); | 337 VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); |
339 DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3); | 338 DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3); |
340 SRARI_H2_UH(vec2, vec3, FILTER_BITS); | 339 SRARI_H2_UH(vec2, vec3, FILTER_BITS); |
341 MIN_UH2_UH(vec2, vec3, const255); | |
342 PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1); | 340 PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1); |
343 ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); | 341 ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); |
344 AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1); | 342 AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1); |
345 ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); | 343 ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); |
346 } | 344 } |
347 | 345 |
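[Editor's note: for readers untangling the MSA macros, here is a scalar sketch of what the 2-tap "and_aver_dst" kernels in this file compute, assuming filter points at the two active bilinear taps and FILTER_BITS == 7 as in vpx_dsp/vpx_filter.h. The helper name is hypothetical; this models the macro pipeline, not the intrinsics. The comment also spells out why the MIN-against-255 step removed by this change was a no-op.]

#include <stdint.h>

#define FILTER_BITS 7 /* matches vpx_dsp/vpx_filter.h */

/* Hypothetical scalar reference for the 2-tap "and_aver_dst" kernels:
 * horizontal bilinear filter, rounding shift, then a round-to-nearest
 * average with the pixels already in dst. */
static void hz_2t_aver_dst_ref(const uint8_t *src, int src_stride,
                               uint8_t *dst, int dst_stride,
                               const int8_t *filter, int w, int h) {
  /* dotp_u.b treats the taps as unsigned bytes; the bilinear taps are
   * non-negative and sum to 1 << FILTER_BITS. */
  const uint8_t f0 = (uint8_t)filter[0], f1 = (uint8_t)filter[1];
  int x, y;
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x) {
      /* DOTP_UB*_UH + SRARI_H*_UH: dot product, then round and shift.
       * Worst case is (255 * 128 + 64) >> 7 == 255, so the result
       * already fits in a byte, which is why clamping to 255 was
       * redundant and the MIN_UH* calls could be dropped. */
      const uint16_t res =
          (uint16_t)(((uint32_t)src[x] * f0 + (uint32_t)src[x + 1] * f1 +
                      (1 << (FILTER_BITS - 1))) >> FILTER_BITS);
      /* AVER_UB*_UB: (a + b + 1) >> 1 against the existing dst pixel. */
      dst[x] = (uint8_t)((dst[x] + res + 1) >> 1);
    }
    src += src_stride;
    dst += dst_stride;
  }
}

[The wider variants that follow are the same arithmetic, unrolled and vectorized for their block shapes.]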
348 static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src, | 346 static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src, |
349 int32_t src_stride, | 347 int32_t src_stride, |
350 uint8_t *dst, | 348 uint8_t *dst, |
351 int32_t dst_stride, | 349 int32_t dst_stride, |
352 int8_t *filter) { | 350 int8_t *filter) { |
353 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; | 351 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; |
354 v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3; | 352 v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3; |
355 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; | 353 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; |
356 v8u16 vec4, vec5, vec6, vec7, const255, filt; | 354 v8u16 vec4, vec5, vec6, vec7, filt; |
357 | 355 |
358 mask = LD_SB(&mc_filt_mask_arr[16]); | 356 mask = LD_SB(&mc_filt_mask_arr[16]); |
359 | 357 |
360 /* rearranging filter */ | 358 /* rearranging filter */ |
361 filt = LD_UH(filter); | 359 filt = LD_UH(filter); |
362 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); | 360 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); |
363 | 361 |
364 const255 = (v8u16)__msa_ldi_h(255); | |
365 | |
366 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); | 362 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); |
367 LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); | 363 LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); |
368 VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); | 364 VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); |
369 VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3); | 365 VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3); |
370 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5, | 366 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5, |
371 vec6, vec7); | 367 vec6, vec7); |
372 SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); | 368 SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); |
373 MIN_UH4_UH(vec4, vec5, vec6, vec7, const255); | |
374 PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, | 369 PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, |
375 res3); | 370 res3); |
376 ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4, | 371 ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4, |
377 dst6); | 372 dst6); |
378 AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2, | 373 AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2, |
379 res3); | 374 res3); |
380 ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); | 375 ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); |
381 dst += (4 * dst_stride); | 376 dst += (4 * dst_stride); |
382 ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); | 377 ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); |
383 } | 378 } |
(...skipping 11 matching lines...)
395 } | 390 } |
396 } | 391 } |
397 | 392 |
398 static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src, | 393 static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src, |
399 int32_t src_stride, | 394 int32_t src_stride, |
400 uint8_t *dst, | 395 uint8_t *dst, |
401 int32_t dst_stride, | 396 int32_t dst_stride, |
402 int8_t *filter) { | 397 int8_t *filter) { |
403 v16i8 src0, src1, src2, src3, mask; | 398 v16i8 src0, src1, src2, src3, mask; |
404 v16u8 filt0, dst0, dst1, dst2, dst3; | 399 v16u8 filt0, dst0, dst1, dst2, dst3; |
405 v8u16 vec0, vec1, vec2, vec3, const255, filt; | 400 v8u16 vec0, vec1, vec2, vec3, filt; |
406 | 401 |
407 mask = LD_SB(&mc_filt_mask_arr[0]); | 402 mask = LD_SB(&mc_filt_mask_arr[0]); |
408 | 403 |
409 /* rearranging filter */ | 404 /* rearranging filter */ |
410 filt = LD_UH(filter); | 405 filt = LD_UH(filter); |
411 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); | 406 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); |
412 | 407 |
413 const255 = (v8u16)__msa_ldi_h(255); | |
414 | |
415 LD_SB4(src, src_stride, src0, src1, src2, src3); | 408 LD_SB4(src, src_stride, src0, src1, src2, src3); |
416 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); | 409 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); |
417 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); | 410 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); |
418 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, | 411 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, |
419 vec2, vec3); | 412 vec2, vec3); |
420 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); | 413 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); |
421 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); | 414 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); |
422 MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); | |
423 PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, | 415 PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, |
424 dst, dst_stride); | 416 dst, dst_stride); |
425 } | 417 } |
426 | 418 |
427 static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src, | 419 static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src, |
428 int32_t src_stride, | 420 int32_t src_stride, |
429 uint8_t *dst, | 421 uint8_t *dst, |
430 int32_t dst_stride, | 422 int32_t dst_stride, |
431 int8_t *filter, | 423 int8_t *filter, |
432 int32_t height) { | 424 int32_t height) { |
433 v16i8 src0, src1, src2, src3, mask; | 425 v16i8 src0, src1, src2, src3, mask; |
434 v16u8 filt0, dst0, dst1, dst2, dst3; | 426 v16u8 filt0, dst0, dst1, dst2, dst3; |
435 v8u16 vec0, vec1, vec2, vec3, const255, filt; | 427 v8u16 vec0, vec1, vec2, vec3, filt; |
436 | 428 |
437 mask = LD_SB(&mc_filt_mask_arr[0]); | 429 mask = LD_SB(&mc_filt_mask_arr[0]); |
438 | 430 |
439 /* rearranging filter */ | 431 /* rearranging filter */ |
440 filt = LD_UH(filter); | 432 filt = LD_UH(filter); |
441 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); | 433 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); |
442 | 434 |
443 const255 = (v8u16)__msa_ldi_h(255); | |
444 | |
445 LD_SB4(src, src_stride, src0, src1, src2, src3); | 435 LD_SB4(src, src_stride, src0, src1, src2, src3); |
446 src += (4 * src_stride); | 436 src += (4 * src_stride); |
447 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); | 437 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); |
448 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); | 438 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); |
449 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, | 439 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, |
450 vec2, vec3); | 440 vec2, vec3); |
451 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); | 441 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); |
452 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); | 442 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); |
453 MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); | |
454 LD_SB4(src, src_stride, src0, src1, src2, src3); | 443 LD_SB4(src, src_stride, src0, src1, src2, src3); |
455 src += (4 * src_stride); | 444 src += (4 * src_stride); |
456 PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, | 445 PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, |
457 dst, dst_stride); | 446 dst, dst_stride); |
458 dst += (4 * dst_stride); | 447 dst += (4 * dst_stride); |
459 | 448 |
460 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); | 449 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); |
461 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); | 450 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); |
462 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, | 451 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, |
463 vec2, vec3); | 452 vec2, vec3); |
464 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); | 453 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); |
465 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); | 454 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); |
466 MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); | |
467 PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, | 455 PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, |
468 dst, dst_stride); | 456 dst, dst_stride); |
469 dst += (4 * dst_stride); | 457 dst += (4 * dst_stride); |
470 | 458 |
471 if (16 == height) { | 459 if (16 == height) { |
472 LD_SB4(src, src_stride, src0, src1, src2, src3); | 460 LD_SB4(src, src_stride, src0, src1, src2, src3); |
473 src += (4 * src_stride); | 461 src += (4 * src_stride); |
474 | 462 |
475 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); | 463 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); |
476 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); | 464 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); |
477 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, | 465 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, |
478 vec2, vec3); | 466 vec2, vec3); |
479 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); | 467 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); |
480 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); | 468 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); |
481 MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); | |
482 LD_SB4(src, src_stride, src0, src1, src2, src3); | 469 LD_SB4(src, src_stride, src0, src1, src2, src3); |
483 PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, | 470 PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, |
484 dst, dst_stride); | 471 dst, dst_stride); |
485 dst += (4 * dst_stride); | 472 dst += (4 * dst_stride); |
486 | 473 |
487 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); | 474 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); |
488 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); | 475 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); |
489 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, | 476 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, |
490 vec2, vec3); | 477 vec2, vec3); |
491 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); | 478 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); |
492 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); | 479 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); |
493 MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); | |
494 PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, | 480 PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, |
495 dst, dst_stride); | 481 dst, dst_stride); |
496 } | 482 } |
497 } | 483 } |
498 | 484 |
499 static void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src, | 485 static void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src, |
500 int32_t src_stride, | 486 int32_t src_stride, |
501 uint8_t *dst, | 487 uint8_t *dst, |
502 int32_t dst_stride, | 488 int32_t dst_stride, |
503 int8_t *filter, | 489 int8_t *filter, |
504 int32_t height) { | 490 int32_t height) { |
505 if (4 == height) { | 491 if (4 == height) { |
506 common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter); | 492 common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter); |
507 } else { | 493 } else { |
508 common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride, | 494 common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride, |
509 filter, height); | 495 filter, height); |
510 } | 496 } |
511 } | 497 } |
512 | 498 |
513 static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src, | 499 static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src, |
514 int32_t src_stride, | 500 int32_t src_stride, |
515 uint8_t *dst, | 501 uint8_t *dst, |
516 int32_t dst_stride, | 502 int32_t dst_stride, |
517 int8_t *filter, | 503 int8_t *filter, |
518 int32_t height) { | 504 int32_t height) { |
519 uint32_t loop_cnt; | 505 uint32_t loop_cnt; |
520 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; | 506 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; |
521 v16u8 filt0, dst0, dst1, dst2, dst3; | 507 v16u8 filt0, dst0, dst1, dst2, dst3; |
522 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; | 508 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; |
523 v8u16 res0, res1, res2, res3, res4, res5, res6, res7, const255, filt; | 509 v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt; |
524 | 510 |
525 mask = LD_SB(&mc_filt_mask_arr[0]); | 511 mask = LD_SB(&mc_filt_mask_arr[0]); |
526 | 512 |
527 /* rearranging filter */ | 513 /* rearranging filter */ |
528 filt = LD_UH(filter); | 514 filt = LD_UH(filter); |
529 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); | 515 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); |
530 | 516 |
531 const255 = (v8u16)__msa_ldi_h(255); | |
532 | |
533 LD_SB4(src, src_stride, src0, src2, src4, src6); | 517 LD_SB4(src, src_stride, src0, src2, src4, src6); |
534 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); | 518 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); |
535 src += (4 * src_stride); | 519 src += (4 * src_stride); |
536 | 520 |
537 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); | 521 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); |
538 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); | 522 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); |
539 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); | 523 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); |
540 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); | 524 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); |
541 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1, | 525 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1, |
542 res2, res3); | 526 res2, res3); |
543 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5, | 527 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5, |
544 res6, res7); | 528 res6, res7); |
545 SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS); | 529 SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS); |
546 SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS); | 530 SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS); |
547 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); | 531 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); |
548 MIN_UH4_UH(res0, res1, res2, res3, const255); | |
549 MIN_UH4_UH(res4, res5, res6, res7, const255); | |
550 PCKEV_AVG_ST_UB(res1, res0, dst0, dst); | 532 PCKEV_AVG_ST_UB(res1, res0, dst0, dst); |
551 dst += dst_stride; | 533 dst += dst_stride; |
552 PCKEV_AVG_ST_UB(res3, res2, dst1, dst); | 534 PCKEV_AVG_ST_UB(res3, res2, dst1, dst); |
553 dst += dst_stride; | 535 dst += dst_stride; |
554 PCKEV_AVG_ST_UB(res5, res4, dst2, dst); | 536 PCKEV_AVG_ST_UB(res5, res4, dst2, dst); |
555 dst += dst_stride; | 537 dst += dst_stride; |
556 PCKEV_AVG_ST_UB(res7, res6, dst3, dst); | 538 PCKEV_AVG_ST_UB(res7, res6, dst3, dst); |
557 dst += dst_stride; | 539 dst += dst_stride; |
558 | 540 |
559 for (loop_cnt = (height >> 2) - 1; loop_cnt--;) { | 541 for (loop_cnt = (height >> 2) - 1; loop_cnt--;) { |
560 LD_SB4(src, src_stride, src0, src2, src4, src6); | 542 LD_SB4(src, src_stride, src0, src2, src4, src6); |
561 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); | 543 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); |
562 src += (4 * src_stride); | 544 src += (4 * src_stride); |
563 | 545 |
564 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); | 546 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); |
565 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); | 547 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); |
566 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); | 548 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); |
567 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); | 549 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); |
568 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1, | 550 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1, |
569 res2, res3); | 551 res2, res3); |
570 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5, | 552 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5, |
571 res6, res7); | 553 res6, res7); |
572 SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS); | 554 SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS); |
573 SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS); | 555 SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS); |
574 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); | 556 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); |
575 MIN_UH4_UH(res0, res1, res2, res3, const255); | |
576 MIN_UH4_UH(res4, res5, res6, res7, const255); | |
577 PCKEV_AVG_ST_UB(res1, res0, dst0, dst); | 557 PCKEV_AVG_ST_UB(res1, res0, dst0, dst); |
578 dst += dst_stride; | 558 dst += dst_stride; |
579 PCKEV_AVG_ST_UB(res3, res2, dst1, dst); | 559 PCKEV_AVG_ST_UB(res3, res2, dst1, dst); |
580 dst += dst_stride; | 560 dst += dst_stride; |
581 PCKEV_AVG_ST_UB(res5, res4, dst2, dst); | 561 PCKEV_AVG_ST_UB(res5, res4, dst2, dst); |
582 dst += dst_stride; | 562 dst += dst_stride; |
583 PCKEV_AVG_ST_UB(res7, res6, dst3, dst); | 563 PCKEV_AVG_ST_UB(res7, res6, dst3, dst); |
584 dst += dst_stride; | 564 dst += dst_stride; |
585 } | 565 } |
586 } | 566 } |
587 | 567 |
588 static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src, | 568 static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src, |
589 int32_t src_stride, | 569 int32_t src_stride, |
590 uint8_t *dst, | 570 uint8_t *dst, |
591 int32_t dst_stride, | 571 int32_t dst_stride, |
592 int8_t *filter, | 572 int8_t *filter, |
593 int32_t height) { | 573 int32_t height) { |
594 uint32_t loop_cnt; | 574 uint32_t loop_cnt; |
595 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; | 575 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; |
596 v16u8 filt0, dst0, dst1, dst2, dst3; | 576 v16u8 filt0, dst0, dst1, dst2, dst3; |
597 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; | 577 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; |
598 v8u16 res0, res1, res2, res3, res4, res5, res6, res7, const255, filt; | 578 v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt; |
599 | 579 |
600 mask = LD_SB(&mc_filt_mask_arr[0]); | 580 mask = LD_SB(&mc_filt_mask_arr[0]); |
601 | 581 |
602 /* rearranging filter */ | 582 /* rearranging filter */ |
603 filt = LD_UH(filter); | 583 filt = LD_UH(filter); |
604 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); | 584 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); |
605 | 585 |
606 const255 = (v8u16)__msa_ldi_h(255); | |
607 | |
608 for (loop_cnt = (height >> 1); loop_cnt--;) { | 586 for (loop_cnt = (height >> 1); loop_cnt--;) { |
609 src0 = LD_SB(src); | 587 src0 = LD_SB(src); |
610 src2 = LD_SB(src + 16); | 588 src2 = LD_SB(src + 16); |
611 src3 = LD_SB(src + 24); | 589 src3 = LD_SB(src + 24); |
612 src1 = __msa_sldi_b(src2, src0, 8); | 590 src1 = __msa_sldi_b(src2, src0, 8); |
613 src += src_stride; | 591 src += src_stride; |
614 src4 = LD_SB(src); | 592 src4 = LD_SB(src); |
615 src6 = LD_SB(src + 16); | 593 src6 = LD_SB(src + 16); |
616 src7 = LD_SB(src + 24); | 594 src7 = LD_SB(src + 24); |
617 src5 = __msa_sldi_b(src6, src4, 8); | 595 src5 = __msa_sldi_b(src6, src4, 8); |
618 src += src_stride; | 596 src += src_stride; |
619 | 597 |
620 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); | 598 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); |
621 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); | 599 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); |
622 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); | 600 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); |
623 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); | 601 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); |
624 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1, | 602 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1, |
625 res2, res3); | 603 res2, res3); |
626 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5, | 604 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5, |
627 res6, res7); | 605 res6, res7); |
628 SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS); | 606 SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS); |
629 SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS); | 607 SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS); |
630 MIN_UH4_UH(res0, res1, res2, res3, const255); | |
631 MIN_UH4_UH(res4, res5, res6, res7, const255); | |
632 LD_UB2(dst, 16, dst0, dst1); | 608 LD_UB2(dst, 16, dst0, dst1); |
633 PCKEV_AVG_ST_UB(res1, res0, dst0, dst); | 609 PCKEV_AVG_ST_UB(res1, res0, dst0, dst); |
634 PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16)); | 610 PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16)); |
635 dst += dst_stride; | 611 dst += dst_stride; |
636 LD_UB2(dst, 16, dst2, dst3); | 612 LD_UB2(dst, 16, dst2, dst3); |
637 PCKEV_AVG_ST_UB(res5, res4, dst2, dst); | 613 PCKEV_AVG_ST_UB(res5, res4, dst2, dst); |
638 PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16)); | 614 PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16)); |
639 dst += dst_stride; | 615 dst += dst_stride; |
640 } | 616 } |
641 } | 617 } |
642 | 618 |
643 static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src, | 619 static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src, |
644 int32_t src_stride, | 620 int32_t src_stride, |
645 uint8_t *dst, | 621 uint8_t *dst, |
646 int32_t dst_stride, | 622 int32_t dst_stride, |
647 int8_t *filter, | 623 int8_t *filter, |
648 int32_t height) { | 624 int32_t height) { |
649 uint32_t loop_cnt; | 625 uint32_t loop_cnt; |
650 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; | 626 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; |
651 v16u8 filt0, dst0, dst1, dst2, dst3; | 627 v16u8 filt0, dst0, dst1, dst2, dst3; |
652 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; | 628 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; |
653 v8u16 out0, out1, out2, out3, out4, out5, out6, out7, const255, filt; | 629 v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; |
654 | 630 |
655 mask = LD_SB(&mc_filt_mask_arr[0]); | 631 mask = LD_SB(&mc_filt_mask_arr[0]); |
656 | 632 |
657 /* rearranging filter */ | 633 /* rearranging filter */ |
658 filt = LD_UH(filter); | 634 filt = LD_UH(filter); |
659 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); | 635 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); |
660 | 636 |
661 const255 = (v8u16)__msa_ldi_h(255); | |
662 | |
663 for (loop_cnt = height; loop_cnt--;) { | 637 for (loop_cnt = height; loop_cnt--;) { |
664 LD_SB4(src, 16, src0, src2, src4, src6); | 638 LD_SB4(src, 16, src0, src2, src4, src6); |
665 src7 = LD_SB(src + 56); | 639 src7 = LD_SB(src + 56); |
666 SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8); | 640 SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8); |
667 src += src_stride; | 641 src += src_stride; |
668 | 642 |
669 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); | 643 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); |
670 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); | 644 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); |
671 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); | 645 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); |
672 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); | 646 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); |
673 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, | 647 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, |
674 out2, out3); | 648 out2, out3); |
675 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, | 649 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, |
676 out6, out7); | 650 out6, out7); |
677 SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); | 651 SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); |
678 SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); | 652 SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); |
679 LD_UB4(dst, 16, dst0, dst1, dst2, dst3); | 653 LD_UB4(dst, 16, dst0, dst1, dst2, dst3); |
680 MIN_UH4_UH(out0, out1, out2, out3, const255); | |
681 MIN_UH4_UH(out4, out5, out6, out7, const255); | |
682 PCKEV_AVG_ST_UB(out1, out0, dst0, dst); | 654 PCKEV_AVG_ST_UB(out1, out0, dst0, dst); |
683 PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16); | 655 PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16); |
684 PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32); | 656 PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32); |
685 PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48); | 657 PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48); |
686 dst += dst_stride; | 658 dst += dst_stride; |
687 } | 659 } |
688 } | 660 } |
689 | 661 |
690 void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, | 662 void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, |
691 uint8_t *dst, ptrdiff_t dst_stride, | 663 uint8_t *dst, ptrdiff_t dst_stride, |
692 const int16_t *filter_x, int x_step_q4, | 664 const int16_t *filter_x, int x_step_q4, |
693 const int16_t *filter_y, int y_step_q4, | 665 const int16_t *filter_y, int y_step_q4, |
694 int w, int h) { | 666 int w, int h) { |
695 int8_t cnt, filt_hor[8]; | 667 int8_t cnt, filt_hor[8]; |
696 | 668 |
697 if (16 != x_step_q4) { | 669 assert(x_step_q4 == 16); |
698 vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, | 670 assert(((const int32_t *)filter_x)[1] != 0x800000); |
699 filter_x, x_step_q4, filter_y, y_step_q4, | |
700 w, h); | |
701 return; | |
702 } | |
703 | |
704 if (((const int32_t *)filter_x)[1] == 0x800000) { | |
705 vpx_convolve_avg(src, src_stride, dst, dst_stride, | |
706 filter_x, x_step_q4, filter_y, y_step_q4, | |
707 w, h); | |
708 return; | |
709 } | |
710 | 671 |
711 for (cnt = 0; cnt < 8; ++cnt) { | 672 for (cnt = 0; cnt < 8; ++cnt) { |
712 filt_hor[cnt] = filter_x[cnt]; | 673 filt_hor[cnt] = filter_x[cnt]; |
713 } | 674 } |
714 | 675 |
715 if (((const int32_t *)filter_x)[0] == 0) { | 676 if (((const int32_t *)filter_x)[0] == 0) { |
716 switch (w) { | 677 switch (w) { |
717 case 4: | 678 case 4: |
718 common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, | 679 common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, |
719 dst, (int32_t)dst_stride, | 680 dst, (int32_t)dst_stride, |
(...skipping 53 matching lines...)
773 filt_hor, h); | 734 filt_hor, h); |
774 break; | 735 break; |
775 default: | 736 default: |
776 vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, | 737 vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, |
777 filter_x, x_step_q4, filter_y, y_step_q4, | 738 filter_x, x_step_q4, filter_y, y_step_q4, |
778 w, h); | 739 w, h); |
779 break; | 740 break; |
780 } | 741 } |
781 } | 742 } |
782 } | 743 } |
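[Editor's note on the two removed early-outs: the MSA kernel now asserts its preconditions rather than falling back at run time, so the equivalent dispatch presumably has to happen upstream. On a little-endian target, ((const int32_t *)filter_x)[1] == 0x800000 matches filter_x[2] == 0 and filter_x[3] == 128, i.e. the pure-copy filter. Below is a hypothetical caller-side sketch, mirroring exactly the checks this change deleted and assuming the declarations from vpx_dsp_rtcd.h; the wrapper name is invented.]

/* Hypothetical upstream dispatch: scaled convolutions go to the generic
 * C path and the copy filter goes to a plain average, so the MSA kernel
 * only ever sees x_step_q4 == 16 with a real filter. */
static void convolve8_avg_horiz_dispatch(const uint8_t *src,
                                         ptrdiff_t src_stride, uint8_t *dst,
                                         ptrdiff_t dst_stride,
                                         const int16_t *filter_x,
                                         int x_step_q4,
                                         const int16_t *filter_y,
                                         int y_step_q4, int w, int h) {
  if (16 != x_step_q4) {
    vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
                              x_step_q4, filter_y, y_step_q4, w, h);
  } else if (((const int32_t *)filter_x)[1] == 0x800000) {
    vpx_convolve_avg(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                     filter_y, y_step_q4, w, h);
  } else {
    vpx_convolve8_avg_horiz_msa(src, src_stride, dst, dst_stride, filter_x,
                                x_step_q4, filter_y, y_step_q4, w, h);
  }
}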