Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(83)

Side by Side Diff: source/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c

Issue 1302353004: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/libvpx/vpx_dsp/mips/vpx_convolve8_vert_dspr2.c ('k') | source/libvpx/vpx_dsp/prob.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
11 #include <assert.h>
11 #include "./vpx_dsp_rtcd.h" 12 #include "./vpx_dsp_rtcd.h"
12 #include "vpx_dsp/mips/vpx_convolve_msa.h" 13 #include "vpx_dsp/mips/vpx_convolve_msa.h"
13 14
14 static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride, 15 static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride,
15 uint8_t *dst, int32_t dst_stride, 16 uint8_t *dst, int32_t dst_stride,
16 int8_t *filter, int32_t height) { 17 int8_t *filter, int32_t height) {
17 uint32_t loop_cnt; 18 uint32_t loop_cnt;
18 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; 19 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
19 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; 20 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
20 v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776; 21 v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
(...skipping 288 matching lines...) Expand 10 before | Expand all | Expand 10 after
309 filt0 = (v16u8)__msa_splati_h(filt, 0); 310 filt0 = (v16u8)__msa_splati_h(filt, 0);
310 311
311 LD_SB5(src, src_stride, src0, src1, src2, src3, src4); 312 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
312 src += (5 * src_stride); 313 src += (5 * src_stride);
313 314
314 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 315 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
315 src32_r, src43_r); 316 src32_r, src43_r);
316 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); 317 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
317 DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); 318 DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
318 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 319 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
319 SAT_UH2_UH(tmp0, tmp1, 7);
320 src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); 320 src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
321 ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); 321 ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
322 } 322 }
323 323
324 static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride, 324 static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
325 uint8_t *dst, int32_t dst_stride, 325 uint8_t *dst, int32_t dst_stride,
326 int8_t *filter) { 326 int8_t *filter) {
327 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 327 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
328 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r; 328 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
329 v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776; 329 v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
(...skipping 12 matching lines...) Expand all
342 342
343 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, 343 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
344 src32_r, src43_r); 344 src32_r, src43_r);
345 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, 345 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
346 src76_r, src87_r); 346 src76_r, src87_r);
347 ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, 347 ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
348 src87_r, src76_r, src2110, src4332, src6554, src8776); 348 src87_r, src76_r, src2110, src4332, src6554, src8776);
349 DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0, 349 DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
350 tmp0, tmp1, tmp2, tmp3); 350 tmp0, tmp1, tmp2, tmp3);
351 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); 351 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
352 SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
353 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332); 352 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
354 ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); 353 ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
355 ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride); 354 ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
356 } 355 }
357 356
358 static void common_vt_2t_4w_msa(const uint8_t *src, int32_t src_stride, 357 static void common_vt_2t_4w_msa(const uint8_t *src, int32_t src_stride,
359 uint8_t *dst, int32_t dst_stride, 358 uint8_t *dst, int32_t dst_stride,
360 int8_t *filter, int32_t height) { 359 int8_t *filter, int32_t height) {
361 if (4 == height) { 360 if (4 == height) {
362 common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter); 361 common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
(...skipping 13 matching lines...) Expand all
376 /* rearranging filter_y */ 375 /* rearranging filter_y */
377 filt = LD_SH(filter); 376 filt = LD_SH(filter);
378 filt0 = (v16u8)__msa_splati_h(filt, 0); 377 filt0 = (v16u8)__msa_splati_h(filt, 0);
379 378
380 LD_UB5(src, src_stride, src0, src1, src2, src3, src4); 379 LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
381 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1); 380 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
382 ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3); 381 ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
383 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, 382 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
384 tmp2, tmp3); 383 tmp2, tmp3);
385 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); 384 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
386 SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
387 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); 385 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
388 ST8x4_UB(out0, out1, dst, dst_stride); 386 ST8x4_UB(out0, out1, dst, dst_stride);
389 } 387 }
390 388
391 static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, 389 static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
392 uint8_t *dst, int32_t dst_stride, 390 uint8_t *dst, int32_t dst_stride,
393 int8_t *filter, int32_t height) { 391 int8_t *filter, int32_t height) {
394 uint32_t loop_cnt; 392 uint32_t loop_cnt;
395 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; 393 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
396 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; 394 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
(...skipping 12 matching lines...) Expand all
409 LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8); 407 LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
410 src += (8 * src_stride); 408 src += (8 * src_stride);
411 409
412 ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, 410 ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1,
413 vec2, vec3); 411 vec2, vec3);
414 ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, 412 ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5,
415 vec6, vec7); 413 vec6, vec7);
416 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, 414 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
417 tmp2, tmp3); 415 tmp2, tmp3);
418 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); 416 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
419 SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
420 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); 417 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
421 ST8x4_UB(out0, out1, dst, dst_stride); 418 ST8x4_UB(out0, out1, dst, dst_stride);
422 dst += (4 * dst_stride); 419 dst += (4 * dst_stride);
423 420
424 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1, 421 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
425 tmp2, tmp3); 422 tmp2, tmp3);
426 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); 423 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
427 SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
428 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); 424 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
429 ST8x4_UB(out0, out1, dst, dst_stride); 425 ST8x4_UB(out0, out1, dst, dst_stride);
430 dst += (4 * dst_stride); 426 dst += (4 * dst_stride);
431 427
432 src0 = src8; 428 src0 = src8;
433 } 429 }
434 } 430 }
435 431
436 static void common_vt_2t_8w_msa(const uint8_t *src, int32_t src_stride, 432 static void common_vt_2t_8w_msa(const uint8_t *src, int32_t src_stride,
437 uint8_t *dst, int32_t dst_stride, 433 uint8_t *dst, int32_t dst_stride,
(...skipping 22 matching lines...) Expand all
460 src += src_stride; 456 src += src_stride;
461 457
462 for (loop_cnt = (height >> 2); loop_cnt--;) { 458 for (loop_cnt = (height >> 2); loop_cnt--;) {
463 LD_UB4(src, src_stride, src1, src2, src3, src4); 459 LD_UB4(src, src_stride, src1, src2, src3, src4);
464 src += (4 * src_stride); 460 src += (4 * src_stride);
465 461
466 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); 462 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
467 ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); 463 ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
468 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); 464 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
469 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 465 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
470 SAT_UH2_UH(tmp0, tmp1, 7);
471 PCKEV_ST_SB(tmp0, tmp1, dst); 466 PCKEV_ST_SB(tmp0, tmp1, dst);
472 dst += dst_stride; 467 dst += dst_stride;
473 468
474 ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); 469 ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
475 ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); 470 ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
476 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); 471 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
477 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); 472 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
478 SAT_UH2_UH(tmp2, tmp3, 7);
479 PCKEV_ST_SB(tmp2, tmp3, dst); 473 PCKEV_ST_SB(tmp2, tmp3, dst);
480 dst += dst_stride; 474 dst += dst_stride;
481 475
482 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); 476 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
483 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 477 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
484 SAT_UH2_UH(tmp0, tmp1, 7);
485 PCKEV_ST_SB(tmp0, tmp1, dst); 478 PCKEV_ST_SB(tmp0, tmp1, dst);
486 dst += dst_stride; 479 dst += dst_stride;
487 480
488 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); 481 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
489 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); 482 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
490 SAT_UH2_UH(tmp2, tmp3, 7);
491 PCKEV_ST_SB(tmp2, tmp3, dst); 483 PCKEV_ST_SB(tmp2, tmp3, dst);
492 dst += dst_stride; 484 dst += dst_stride;
493 485
494 src0 = src4; 486 src0 = src4;
495 } 487 }
496 } 488 }
497 489
498 static void common_vt_2t_32w_msa(const uint8_t *src, int32_t src_stride, 490 static void common_vt_2t_32w_msa(const uint8_t *src, int32_t src_stride,
499 uint8_t *dst, int32_t dst_stride, 491 uint8_t *dst, int32_t dst_stride,
500 int8_t *filter, int32_t height) { 492 int8_t *filter, int32_t height) {
(...skipping 14 matching lines...) Expand all
515 for (loop_cnt = (height >> 2); loop_cnt--;) { 507 for (loop_cnt = (height >> 2); loop_cnt--;) {
516 LD_UB4(src, src_stride, src1, src2, src3, src4); 508 LD_UB4(src, src_stride, src1, src2, src3, src4);
517 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); 509 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
518 ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); 510 ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
519 511
520 LD_UB4(src + 16, src_stride, src6, src7, src8, src9); 512 LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
521 src += (4 * src_stride); 513 src += (4 * src_stride);
522 514
523 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); 515 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
524 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 516 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
525 SAT_UH2_UH(tmp0, tmp1, 7);
526 PCKEV_ST_SB(tmp0, tmp1, dst); 517 PCKEV_ST_SB(tmp0, tmp1, dst);
527 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); 518 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
528 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); 519 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
529 SAT_UH2_UH(tmp2, tmp3, 7);
530 PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride); 520 PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
531 521
532 ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); 522 ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
533 ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); 523 ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
534 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); 524 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
535 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 525 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
536 SAT_UH2_UH(tmp0, tmp1, 7);
537 PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride); 526 PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride);
538 527
539 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); 528 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
540 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); 529 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
541 SAT_UH2_UH(tmp2, tmp3, 7);
542 PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride); 530 PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride);
543 531
544 ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2); 532 ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
545 ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3); 533 ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
546 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); 534 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
547 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 535 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
548 SAT_UH2_UH(tmp0, tmp1, 7);
549 PCKEV_ST_SB(tmp0, tmp1, dst + 16); 536 PCKEV_ST_SB(tmp0, tmp1, dst + 16);
550 537
551 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); 538 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
552 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); 539 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
553 SAT_UH2_UH(tmp2, tmp3, 7);
554 PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride); 540 PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride);
555 541
556 ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6); 542 ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
557 ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7); 543 ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
558 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); 544 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
559 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 545 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
560 SAT_UH2_UH(tmp0, tmp1, 7);
561 PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride); 546 PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride);
562 547
563 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); 548 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
564 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); 549 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
565 SAT_UH2_UH(tmp2, tmp3, 7);
566 PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride); 550 PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride);
567 dst += (4 * dst_stride); 551 dst += (4 * dst_stride);
568 552
569 src0 = src4; 553 src0 = src4;
570 src5 = src9; 554 src5 = src9;
571 } 555 }
572 } 556 }
573 557
574 static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride, 558 static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride,
575 uint8_t *dst, int32_t dst_stride, 559 uint8_t *dst, int32_t dst_stride,
(...skipping 15 matching lines...) Expand all
591 LD_UB2(src, src_stride, src1, src2); 575 LD_UB2(src, src_stride, src1, src2);
592 LD_UB2(src + 16, src_stride, src4, src5); 576 LD_UB2(src + 16, src_stride, src4, src5);
593 LD_UB2(src + 32, src_stride, src7, src8); 577 LD_UB2(src + 32, src_stride, src7, src8);
594 LD_UB2(src + 48, src_stride, src10, src11); 578 LD_UB2(src + 48, src_stride, src10, src11);
595 src += (2 * src_stride); 579 src += (2 * src_stride);
596 580
597 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); 581 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
598 ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); 582 ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
599 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); 583 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
600 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 584 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
601 SAT_UH2_UH(tmp0, tmp1, 7);
602 PCKEV_ST_SB(tmp0, tmp1, dst); 585 PCKEV_ST_SB(tmp0, tmp1, dst);
603 586
604 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); 587 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
605 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); 588 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
606 SAT_UH2_UH(tmp2, tmp3, 7);
607 PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride); 589 PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
608 590
609 ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6); 591 ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
610 ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7); 592 ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
611 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); 593 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
612 SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); 594 SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
613 SAT_UH2_UH(tmp4, tmp5, 7);
614 PCKEV_ST_SB(tmp4, tmp5, dst + 16); 595 PCKEV_ST_SB(tmp4, tmp5, dst + 16);
615 596
616 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); 597 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
617 SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); 598 SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
618 SAT_UH2_UH(tmp6, tmp7, 7);
619 PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride); 599 PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride);
620 600
621 ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2); 601 ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
622 ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3); 602 ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
623 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); 603 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
624 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); 604 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
625 SAT_UH2_UH(tmp0, tmp1, 7);
626 PCKEV_ST_SB(tmp0, tmp1, dst + 32); 605 PCKEV_ST_SB(tmp0, tmp1, dst + 32);
627 606
628 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); 607 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
629 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); 608 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
630 SAT_UH2_UH(tmp2, tmp3, 7);
631 PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride); 609 PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride);
632 610
633 ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6); 611 ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
634 ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7); 612 ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
635 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); 613 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
636 SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); 614 SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
637 SAT_UH2_UH(tmp4, tmp5, 7);
638 PCKEV_ST_SB(tmp4, tmp5, dst + 48); 615 PCKEV_ST_SB(tmp4, tmp5, dst + 48);
639 616
640 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); 617 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
641 SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); 618 SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
642 SAT_UH2_UH(tmp6, tmp7, 7);
643 PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride); 619 PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride);
644 dst += (2 * dst_stride); 620 dst += (2 * dst_stride);
645 621
646 src0 = src2; 622 src0 = src2;
647 src3 = src5; 623 src3 = src5;
648 src6 = src8; 624 src6 = src8;
649 src9 = src11; 625 src9 = src11;
650 } 626 }
651 } 627 }
652 628
653 void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, 629 void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
654 uint8_t *dst, ptrdiff_t dst_stride, 630 uint8_t *dst, ptrdiff_t dst_stride,
655 const int16_t *filter_x, int x_step_q4, 631 const int16_t *filter_x, int x_step_q4,
656 const int16_t *filter_y, int y_step_q4, 632 const int16_t *filter_y, int y_step_q4,
657 int w, int h) { 633 int w, int h) {
658 int8_t cnt, filt_ver[8]; 634 int8_t cnt, filt_ver[8];
659 635
660 if (16 != y_step_q4) { 636 assert(y_step_q4 == 16);
661 vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, 637 assert(((const int32_t *)filter_y)[1] != 0x800000);
662 filter_x, x_step_q4, filter_y, y_step_q4,
663 w, h);
664 return;
665 }
666
667 if (((const int32_t *)filter_y)[1] == 0x800000) {
668 vpx_convolve_copy(src, src_stride, dst, dst_stride,
669 filter_x, x_step_q4, filter_y, y_step_q4,
670 w, h);
671 return;
672 }
673 638
674 for (cnt = 8; cnt--;) { 639 for (cnt = 8; cnt--;) {
675 filt_ver[cnt] = filter_y[cnt]; 640 filt_ver[cnt] = filter_y[cnt];
676 } 641 }
677 642
678 if (((const int32_t *)filter_y)[0] == 0) { 643 if (((const int32_t *)filter_y)[0] == 0) {
679 switch (w) { 644 switch (w) {
680 case 4: 645 case 4:
681 common_vt_2t_4w_msa(src, (int32_t)src_stride, 646 common_vt_2t_4w_msa(src, (int32_t)src_stride,
682 dst, (int32_t)dst_stride, 647 dst, (int32_t)dst_stride,
(...skipping 53 matching lines...) Expand 10 before | Expand all | Expand 10 after
736 filt_ver, h); 701 filt_ver, h);
737 break; 702 break;
738 default: 703 default:
739 vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, 704 vpx_convolve8_vert_c(src, src_stride, dst, dst_stride,
740 filter_x, x_step_q4, filter_y, y_step_q4, 705 filter_x, x_step_q4, filter_y, y_step_q4,
741 w, h); 706 w, h);
742 break; 707 break;
743 } 708 }
744 } 709 }
745 } 710 }
OLDNEW
« no previous file with comments | « source/libvpx/vpx_dsp/mips/vpx_convolve8_vert_dspr2.c ('k') | source/libvpx/vpx_dsp/prob.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698