OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
| 11 #include <assert.h> |
11 #include "./vpx_dsp_rtcd.h" | 12 #include "./vpx_dsp_rtcd.h" |
12 #include "vpx_dsp/mips/vpx_convolve_msa.h" | 13 #include "vpx_dsp/mips/vpx_convolve_msa.h" |
13 | 14 |
14 static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride, | 15 static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride, |
15 uint8_t *dst, int32_t dst_stride, | 16 uint8_t *dst, int32_t dst_stride, |
16 int8_t *filter, int32_t height) { | 17 int8_t *filter, int32_t height) { |
17 uint32_t loop_cnt; | 18 uint32_t loop_cnt; |
18 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; | 19 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; |
19 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; | 20 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; |
20 v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776; | 21 v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776; |
(...skipping 288 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
309 filt0 = (v16u8)__msa_splati_h(filt, 0); | 310 filt0 = (v16u8)__msa_splati_h(filt, 0); |
310 | 311 |
311 LD_SB5(src, src_stride, src0, src1, src2, src3, src4); | 312 LD_SB5(src, src_stride, src0, src1, src2, src3, src4); |
312 src += (5 * src_stride); | 313 src += (5 * src_stride); |
313 | 314 |
314 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, | 315 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, |
315 src32_r, src43_r); | 316 src32_r, src43_r); |
316 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); | 317 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); |
317 DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); | 318 DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); |
318 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 319 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
319 SAT_UH2_UH(tmp0, tmp1, 7); | |
320 src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); | 320 src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); |
321 ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); | 321 ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); |
322 } | 322 } |
323 | 323 |
324 static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride, | 324 static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride, |
325 uint8_t *dst, int32_t dst_stride, | 325 uint8_t *dst, int32_t dst_stride, |
326 int8_t *filter) { | 326 int8_t *filter) { |
327 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; | 327 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; |
328 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r; | 328 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r; |
329 v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776; | 329 v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776; |
(...skipping 12 matching lines...) Expand all Loading... |
342 | 342 |
343 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, | 343 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, |
344 src32_r, src43_r); | 344 src32_r, src43_r); |
345 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, | 345 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, |
346 src76_r, src87_r); | 346 src76_r, src87_r); |
347 ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, | 347 ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, |
348 src87_r, src76_r, src2110, src4332, src6554, src8776); | 348 src87_r, src76_r, src2110, src4332, src6554, src8776); |
349 DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0, | 349 DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0, |
350 tmp0, tmp1, tmp2, tmp3); | 350 tmp0, tmp1, tmp2, tmp3); |
351 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); | 351 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); |
352 SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); | |
353 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332); | 352 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332); |
354 ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); | 353 ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); |
355 ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride); | 354 ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride); |
356 } | 355 } |
357 | 356 |
358 static void common_vt_2t_4w_msa(const uint8_t *src, int32_t src_stride, | 357 static void common_vt_2t_4w_msa(const uint8_t *src, int32_t src_stride, |
359 uint8_t *dst, int32_t dst_stride, | 358 uint8_t *dst, int32_t dst_stride, |
360 int8_t *filter, int32_t height) { | 359 int8_t *filter, int32_t height) { |
361 if (4 == height) { | 360 if (4 == height) { |
362 common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter); | 361 common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter); |
(...skipping 13 matching lines...) Expand all Loading... |
376 /* rearranging filter_y */ | 375 /* rearranging filter_y */ |
377 filt = LD_SH(filter); | 376 filt = LD_SH(filter); |
378 filt0 = (v16u8)__msa_splati_h(filt, 0); | 377 filt0 = (v16u8)__msa_splati_h(filt, 0); |
379 | 378 |
380 LD_UB5(src, src_stride, src0, src1, src2, src3, src4); | 379 LD_UB5(src, src_stride, src0, src1, src2, src3, src4); |
381 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1); | 380 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1); |
382 ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3); | 381 ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3); |
383 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, | 382 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, |
384 tmp2, tmp3); | 383 tmp2, tmp3); |
385 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); | 384 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); |
386 SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); | |
387 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); | 385 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); |
388 ST8x4_UB(out0, out1, dst, dst_stride); | 386 ST8x4_UB(out0, out1, dst, dst_stride); |
389 } | 387 } |
390 | 388 |
391 static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, | 389 static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, |
392 uint8_t *dst, int32_t dst_stride, | 390 uint8_t *dst, int32_t dst_stride, |
393 int8_t *filter, int32_t height) { | 391 int8_t *filter, int32_t height) { |
394 uint32_t loop_cnt; | 392 uint32_t loop_cnt; |
395 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; | 393 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; |
396 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; | 394 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; |
(...skipping 12 matching lines...) Expand all Loading... |
409 LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8); | 407 LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8); |
410 src += (8 * src_stride); | 408 src += (8 * src_stride); |
411 | 409 |
412 ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, | 410 ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, |
413 vec2, vec3); | 411 vec2, vec3); |
414 ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, | 412 ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, |
415 vec6, vec7); | 413 vec6, vec7); |
416 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, | 414 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, |
417 tmp2, tmp3); | 415 tmp2, tmp3); |
418 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); | 416 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); |
419 SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); | |
420 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); | 417 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); |
421 ST8x4_UB(out0, out1, dst, dst_stride); | 418 ST8x4_UB(out0, out1, dst, dst_stride); |
422 dst += (4 * dst_stride); | 419 dst += (4 * dst_stride); |
423 | 420 |
424 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1, | 421 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1, |
425 tmp2, tmp3); | 422 tmp2, tmp3); |
426 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); | 423 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); |
427 SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); | |
428 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); | 424 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); |
429 ST8x4_UB(out0, out1, dst, dst_stride); | 425 ST8x4_UB(out0, out1, dst, dst_stride); |
430 dst += (4 * dst_stride); | 426 dst += (4 * dst_stride); |
431 | 427 |
432 src0 = src8; | 428 src0 = src8; |
433 } | 429 } |
434 } | 430 } |
435 | 431 |
436 static void common_vt_2t_8w_msa(const uint8_t *src, int32_t src_stride, | 432 static void common_vt_2t_8w_msa(const uint8_t *src, int32_t src_stride, |
437 uint8_t *dst, int32_t dst_stride, | 433 uint8_t *dst, int32_t dst_stride, |
(...skipping 22 matching lines...) Expand all Loading... |
460 src += src_stride; | 456 src += src_stride; |
461 | 457 |
462 for (loop_cnt = (height >> 2); loop_cnt--;) { | 458 for (loop_cnt = (height >> 2); loop_cnt--;) { |
463 LD_UB4(src, src_stride, src1, src2, src3, src4); | 459 LD_UB4(src, src_stride, src1, src2, src3, src4); |
464 src += (4 * src_stride); | 460 src += (4 * src_stride); |
465 | 461 |
466 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); | 462 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); |
467 ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); | 463 ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); |
468 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); | 464 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); |
469 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 465 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
470 SAT_UH2_UH(tmp0, tmp1, 7); | |
471 PCKEV_ST_SB(tmp0, tmp1, dst); | 466 PCKEV_ST_SB(tmp0, tmp1, dst); |
472 dst += dst_stride; | 467 dst += dst_stride; |
473 | 468 |
474 ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); | 469 ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); |
475 ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); | 470 ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); |
476 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); | 471 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); |
477 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); | 472 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); |
478 SAT_UH2_UH(tmp2, tmp3, 7); | |
479 PCKEV_ST_SB(tmp2, tmp3, dst); | 473 PCKEV_ST_SB(tmp2, tmp3, dst); |
480 dst += dst_stride; | 474 dst += dst_stride; |
481 | 475 |
482 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); | 476 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); |
483 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 477 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
484 SAT_UH2_UH(tmp0, tmp1, 7); | |
485 PCKEV_ST_SB(tmp0, tmp1, dst); | 478 PCKEV_ST_SB(tmp0, tmp1, dst); |
486 dst += dst_stride; | 479 dst += dst_stride; |
487 | 480 |
488 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); | 481 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); |
489 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); | 482 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); |
490 SAT_UH2_UH(tmp2, tmp3, 7); | |
491 PCKEV_ST_SB(tmp2, tmp3, dst); | 483 PCKEV_ST_SB(tmp2, tmp3, dst); |
492 dst += dst_stride; | 484 dst += dst_stride; |
493 | 485 |
494 src0 = src4; | 486 src0 = src4; |
495 } | 487 } |
496 } | 488 } |
497 | 489 |
498 static void common_vt_2t_32w_msa(const uint8_t *src, int32_t src_stride, | 490 static void common_vt_2t_32w_msa(const uint8_t *src, int32_t src_stride, |
499 uint8_t *dst, int32_t dst_stride, | 491 uint8_t *dst, int32_t dst_stride, |
500 int8_t *filter, int32_t height) { | 492 int8_t *filter, int32_t height) { |
(...skipping 14 matching lines...) Expand all Loading... |
515 for (loop_cnt = (height >> 2); loop_cnt--;) { | 507 for (loop_cnt = (height >> 2); loop_cnt--;) { |
516 LD_UB4(src, src_stride, src1, src2, src3, src4); | 508 LD_UB4(src, src_stride, src1, src2, src3, src4); |
517 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); | 509 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); |
518 ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); | 510 ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); |
519 | 511 |
520 LD_UB4(src + 16, src_stride, src6, src7, src8, src9); | 512 LD_UB4(src + 16, src_stride, src6, src7, src8, src9); |
521 src += (4 * src_stride); | 513 src += (4 * src_stride); |
522 | 514 |
523 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); | 515 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); |
524 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 516 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
525 SAT_UH2_UH(tmp0, tmp1, 7); | |
526 PCKEV_ST_SB(tmp0, tmp1, dst); | 517 PCKEV_ST_SB(tmp0, tmp1, dst); |
527 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); | 518 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); |
528 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); | 519 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); |
529 SAT_UH2_UH(tmp2, tmp3, 7); | |
530 PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride); | 520 PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride); |
531 | 521 |
532 ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); | 522 ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); |
533 ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); | 523 ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); |
534 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); | 524 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); |
535 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 525 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
536 SAT_UH2_UH(tmp0, tmp1, 7); | |
537 PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride); | 526 PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride); |
538 | 527 |
539 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); | 528 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); |
540 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); | 529 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); |
541 SAT_UH2_UH(tmp2, tmp3, 7); | |
542 PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride); | 530 PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride); |
543 | 531 |
544 ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2); | 532 ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2); |
545 ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3); | 533 ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3); |
546 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); | 534 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); |
547 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 535 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
548 SAT_UH2_UH(tmp0, tmp1, 7); | |
549 PCKEV_ST_SB(tmp0, tmp1, dst + 16); | 536 PCKEV_ST_SB(tmp0, tmp1, dst + 16); |
550 | 537 |
551 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); | 538 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); |
552 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); | 539 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); |
553 SAT_UH2_UH(tmp2, tmp3, 7); | |
554 PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride); | 540 PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride); |
555 | 541 |
556 ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6); | 542 ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6); |
557 ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7); | 543 ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7); |
558 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); | 544 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); |
559 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 545 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
560 SAT_UH2_UH(tmp0, tmp1, 7); | |
561 PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride); | 546 PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride); |
562 | 547 |
563 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); | 548 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); |
564 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); | 549 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); |
565 SAT_UH2_UH(tmp2, tmp3, 7); | |
566 PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride); | 550 PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride); |
567 dst += (4 * dst_stride); | 551 dst += (4 * dst_stride); |
568 | 552 |
569 src0 = src4; | 553 src0 = src4; |
570 src5 = src9; | 554 src5 = src9; |
571 } | 555 } |
572 } | 556 } |
573 | 557 |
574 static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride, | 558 static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride, |
575 uint8_t *dst, int32_t dst_stride, | 559 uint8_t *dst, int32_t dst_stride, |
(...skipping 15 matching lines...) Expand all Loading... |
591 LD_UB2(src, src_stride, src1, src2); | 575 LD_UB2(src, src_stride, src1, src2); |
592 LD_UB2(src + 16, src_stride, src4, src5); | 576 LD_UB2(src + 16, src_stride, src4, src5); |
593 LD_UB2(src + 32, src_stride, src7, src8); | 577 LD_UB2(src + 32, src_stride, src7, src8); |
594 LD_UB2(src + 48, src_stride, src10, src11); | 578 LD_UB2(src + 48, src_stride, src10, src11); |
595 src += (2 * src_stride); | 579 src += (2 * src_stride); |
596 | 580 |
597 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); | 581 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); |
598 ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); | 582 ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); |
599 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); | 583 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); |
600 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 584 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
601 SAT_UH2_UH(tmp0, tmp1, 7); | |
602 PCKEV_ST_SB(tmp0, tmp1, dst); | 585 PCKEV_ST_SB(tmp0, tmp1, dst); |
603 | 586 |
604 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); | 587 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); |
605 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); | 588 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); |
606 SAT_UH2_UH(tmp2, tmp3, 7); | |
607 PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride); | 589 PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride); |
608 | 590 |
609 ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6); | 591 ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6); |
610 ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7); | 592 ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7); |
611 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); | 593 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); |
612 SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); | 594 SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); |
613 SAT_UH2_UH(tmp4, tmp5, 7); | |
614 PCKEV_ST_SB(tmp4, tmp5, dst + 16); | 595 PCKEV_ST_SB(tmp4, tmp5, dst + 16); |
615 | 596 |
616 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); | 597 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); |
617 SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); | 598 SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); |
618 SAT_UH2_UH(tmp6, tmp7, 7); | |
619 PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride); | 599 PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride); |
620 | 600 |
621 ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2); | 601 ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2); |
622 ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3); | 602 ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3); |
623 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); | 603 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); |
624 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 604 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
625 SAT_UH2_UH(tmp0, tmp1, 7); | |
626 PCKEV_ST_SB(tmp0, tmp1, dst + 32); | 605 PCKEV_ST_SB(tmp0, tmp1, dst + 32); |
627 | 606 |
628 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); | 607 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); |
629 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); | 608 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); |
630 SAT_UH2_UH(tmp2, tmp3, 7); | |
631 PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride); | 609 PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride); |
632 | 610 |
633 ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6); | 611 ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6); |
634 ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7); | 612 ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7); |
635 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); | 613 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); |
636 SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); | 614 SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); |
637 SAT_UH2_UH(tmp4, tmp5, 7); | |
638 PCKEV_ST_SB(tmp4, tmp5, dst + 48); | 615 PCKEV_ST_SB(tmp4, tmp5, dst + 48); |
639 | 616 |
640 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); | 617 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); |
641 SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); | 618 SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); |
642 SAT_UH2_UH(tmp6, tmp7, 7); | |
643 PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride); | 619 PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride); |
644 dst += (2 * dst_stride); | 620 dst += (2 * dst_stride); |
645 | 621 |
646 src0 = src2; | 622 src0 = src2; |
647 src3 = src5; | 623 src3 = src5; |
648 src6 = src8; | 624 src6 = src8; |
649 src9 = src11; | 625 src9 = src11; |
650 } | 626 } |
651 } | 627 } |
652 | 628 |
653 void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, | 629 void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, |
654 uint8_t *dst, ptrdiff_t dst_stride, | 630 uint8_t *dst, ptrdiff_t dst_stride, |
655 const int16_t *filter_x, int x_step_q4, | 631 const int16_t *filter_x, int x_step_q4, |
656 const int16_t *filter_y, int y_step_q4, | 632 const int16_t *filter_y, int y_step_q4, |
657 int w, int h) { | 633 int w, int h) { |
658 int8_t cnt, filt_ver[8]; | 634 int8_t cnt, filt_ver[8]; |
659 | 635 |
660 if (16 != y_step_q4) { | 636 assert(y_step_q4 == 16); |
661 vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, | 637 assert(((const int32_t *)filter_y)[1] != 0x800000); |
662 filter_x, x_step_q4, filter_y, y_step_q4, | |
663 w, h); | |
664 return; | |
665 } | |
666 | |
667 if (((const int32_t *)filter_y)[1] == 0x800000) { | |
668 vpx_convolve_copy(src, src_stride, dst, dst_stride, | |
669 filter_x, x_step_q4, filter_y, y_step_q4, | |
670 w, h); | |
671 return; | |
672 } | |
673 | 638 |
674 for (cnt = 8; cnt--;) { | 639 for (cnt = 8; cnt--;) { |
675 filt_ver[cnt] = filter_y[cnt]; | 640 filt_ver[cnt] = filter_y[cnt]; |
676 } | 641 } |
677 | 642 |
678 if (((const int32_t *)filter_y)[0] == 0) { | 643 if (((const int32_t *)filter_y)[0] == 0) { |
679 switch (w) { | 644 switch (w) { |
680 case 4: | 645 case 4: |
681 common_vt_2t_4w_msa(src, (int32_t)src_stride, | 646 common_vt_2t_4w_msa(src, (int32_t)src_stride, |
682 dst, (int32_t)dst_stride, | 647 dst, (int32_t)dst_stride, |
(...skipping 53 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
736 filt_ver, h); | 701 filt_ver, h); |
737 break; | 702 break; |
738 default: | 703 default: |
739 vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, | 704 vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, |
740 filter_x, x_step_q4, filter_y, y_step_q4, | 705 filter_x, x_step_q4, filter_y, y_step_q4, |
741 w, h); | 706 w, h); |
742 break; | 707 break; |
743 } | 708 } |
744 } | 709 } |
745 } | 710 } |
OLD | NEW |