Chromium Code Reviews
Unified diff: source/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c

Issue 1302353004: libvpx: Pull from upstream (Closed)
Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 3 months ago

 /*
  * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
  *
  * Use of this source code is governed by a BSD-style license
  * that can be found in the LICENSE file in the root of the source
  * tree. An additional intellectual property rights grant can be found
  * in the file PATENTS. All contributing project authors may
  * be found in the AUTHORS file in the root of the source tree.
  */

+#include <assert.h>
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/mips/vpx_convolve_msa.h"

 static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst,
                                               int32_t dst_stride,
                                               int8_t *filter) {
   v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
   v16u8 dst0, dst1, dst2, dst3, res2, res3;
(...skipping 295 matching lines...)
   }
 }

 static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst,
                                               int32_t dst_stride,
                                               int8_t *filter) {
   v16i8 src0, src1, src2, src3, mask;
   v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1;
-  v8u16 vec2, vec3, const255, filt;
+  v8u16 vec2, vec3, filt;

   mask = LD_SB(&mc_filt_mask_arr[16]);

   /* rearranging filter */
   filt = LD_UH(filter);
   filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

-  const255 = (v8u16)__msa_ldi_h(255);
-
   LD_SB4(src, src_stride, src0, src1, src2, src3);
   LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
   VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
   DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
   SRARI_H2_UH(vec2, vec3, FILTER_BITS);
-  MIN_UH2_UH(vec2, vec3, const255);
   PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
   ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
   AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
   ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
 }

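Note (not part of the patch): the dropped const255 / MIN_UH* steps clamped the rounded filter output against 255. For the 2-tap (bilinear) paths that clamp is a no-op, because the two taps are non-negative and sum to 1 << FILTER_BITS, so the rounded dot product of two 8-bit pixels already fits in a byte. A minimal scalar sketch of one output pixel, assuming FILTER_BITS == 7; the helper name below is illustrative only:

#include <stdint.h>

#define FILTER_BITS 7

/* Scalar model of one pixel of the 2-tap filter-and-average path. */
static uint8_t hz_2t_avg_pixel(const uint8_t src[2], const int16_t taps[2],
                               uint8_t dst_pixel) {
  /* Unsigned dot product, as in DOTP_UB2_UH / DOTP_UB4_UH. */
  int32_t sum = src[0] * taps[0] + src[1] * taps[1];
  /* Rounding right shift, as in SRARI_H*_UH with FILTER_BITS.  With taps
   * summing to 128, the maximum is (255 * 128 + 64) >> 7 == 255, so the
   * removed MIN_UH* clamp against 255 could never change the value. */
  int32_t res = (sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS;
  /* Rounding average with the existing destination pixel, as in AVER_UB*. */
  return (uint8_t)((res + dst_pixel + 1) >> 1);
}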
 static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst,
                                               int32_t dst_stride,
                                               int8_t *filter) {
   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
   v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
   v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
-  v8u16 vec4, vec5, vec6, vec7, const255, filt;
+  v8u16 vec4, vec5, vec6, vec7, filt;

   mask = LD_SB(&mc_filt_mask_arr[16]);

   /* rearranging filter */
   filt = LD_UH(filter);
   filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

-  const255 = (v8u16)__msa_ldi_h(255);
-
   LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
   LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
   VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
   VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
   DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
               vec6, vec7);
   SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
-  MIN_UH4_UH(vec4, vec5, vec6, vec7, const255);
   PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
               res3);
   ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
              dst6);
   AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2,
               res3);
   ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
   dst += (4 * dst_stride);
   ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
 }
(...skipping 11 matching lines...)
   }
 }

 static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst,
                                               int32_t dst_stride,
                                               int8_t *filter) {
   v16i8 src0, src1, src2, src3, mask;
   v16u8 filt0, dst0, dst1, dst2, dst3;
-  v8u16 vec0, vec1, vec2, vec3, const255, filt;
+  v8u16 vec0, vec1, vec2, vec3, filt;

   mask = LD_SB(&mc_filt_mask_arr[0]);

   /* rearranging filter */
   filt = LD_UH(filter);
   filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

-  const255 = (v8u16)__msa_ldi_h(255);
-
   LD_SB4(src, src_stride, src0, src1, src2, src3);
   VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
   VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
   DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
               vec2, vec3);
   SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
   LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-  MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
   PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
                      dst, dst_stride);
 }

 static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int8_t *filter,
                                                   int32_t height) {
   v16i8 src0, src1, src2, src3, mask;
   v16u8 filt0, dst0, dst1, dst2, dst3;
-  v8u16 vec0, vec1, vec2, vec3, const255, filt;
+  v8u16 vec0, vec1, vec2, vec3, filt;

   mask = LD_SB(&mc_filt_mask_arr[0]);

   /* rearranging filter */
   filt = LD_UH(filter);
   filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

-  const255 = (v8u16)__msa_ldi_h(255);
-
   LD_SB4(src, src_stride, src0, src1, src2, src3);
   src += (4 * src_stride);
   VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
   VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
   DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
               vec2, vec3);
   SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
   LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-  MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
   LD_SB4(src, src_stride, src0, src1, src2, src3);
   src += (4 * src_stride);
   PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
                      dst, dst_stride);
   dst += (4 * dst_stride);

   VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
   VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
   DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
               vec2, vec3);
   SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
   LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-  MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
   PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
                      dst, dst_stride);
   dst += (4 * dst_stride);

   if (16 == height) {
     LD_SB4(src, src_stride, src0, src1, src2, src3);
     src += (4 * src_stride);

     VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
     VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                 vec2, vec3);
     SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
     LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-    MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
     LD_SB4(src, src_stride, src0, src1, src2, src3);
     PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
                        dst, dst_stride);
     dst += (4 * dst_stride);

     VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
     VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                 vec2, vec3);
     SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
     LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-    MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
     PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
                        dst, dst_stride);
   }
 }

 static void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter,
                                              int32_t height) {
   if (4 == height) {
     common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter);
   } else {
     common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
                                           filter, height);
   }
 }

 static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst,
                                               int32_t dst_stride,
                                               int8_t *filter,
                                               int32_t height) {
   uint32_t loop_cnt;
   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
   v16u8 filt0, dst0, dst1, dst2, dst3;
   v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8u16 res0, res1, res2, res3, res4, res5, res6, res7, const255, filt;
+  v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;

   mask = LD_SB(&mc_filt_mask_arr[0]);

   /* rearranging filter */
   filt = LD_UH(filter);
   filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

-  const255 = (v8u16)__msa_ldi_h(255);
-
   LD_SB4(src, src_stride, src0, src2, src4, src6);
   LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
   src += (4 * src_stride);

   VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
   VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
   VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
   VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
   DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
               res2, res3);
   DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
               res6, res7);
   SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
   SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
   LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-  MIN_UH4_UH(res0, res1, res2, res3, const255);
-  MIN_UH4_UH(res4, res5, res6, res7, const255);
   PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
   dst += dst_stride;
   PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
   dst += dst_stride;
   PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
   dst += dst_stride;
   PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
   dst += dst_stride;

   for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
     LD_SB4(src, src_stride, src0, src2, src4, src6);
     LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
     src += (4 * src_stride);

     VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
     VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
     VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
     VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
                 res2, res3);
     DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
                 res6, res7);
     SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
     SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
     LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-    MIN_UH4_UH(res0, res1, res2, res3, const255);
-    MIN_UH4_UH(res4, res5, res6, res7, const255);
     PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
     dst += dst_stride;
     PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
     dst += dst_stride;
     PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
     dst += dst_stride;
     PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
     dst += dst_stride;
   }
 }

 static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst,
                                               int32_t dst_stride,
                                               int8_t *filter,
                                               int32_t height) {
   uint32_t loop_cnt;
   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
   v16u8 filt0, dst0, dst1, dst2, dst3;
   v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8u16 res0, res1, res2, res3, res4, res5, res6, res7, const255, filt;
+  v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;

   mask = LD_SB(&mc_filt_mask_arr[0]);

   /* rearranging filter */
   filt = LD_UH(filter);
   filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

-  const255 = (v8u16)__msa_ldi_h(255);
-
   for (loop_cnt = (height >> 1); loop_cnt--;) {
     src0 = LD_SB(src);
     src2 = LD_SB(src + 16);
     src3 = LD_SB(src + 24);
     src1 = __msa_sldi_b(src2, src0, 8);
     src += src_stride;
     src4 = LD_SB(src);
     src6 = LD_SB(src + 16);
     src7 = LD_SB(src + 24);
     src5 = __msa_sldi_b(src6, src4, 8);
     src += src_stride;

     VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
     VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
     VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
     VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
                 res2, res3);
     DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
                 res6, res7);
     SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
     SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
-    MIN_UH4_UH(res0, res1, res2, res3, const255);
-    MIN_UH4_UH(res4, res5, res6, res7, const255);
     LD_UB2(dst, 16, dst0, dst1);
     PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
     PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16));
     dst += dst_stride;
     LD_UB2(dst, 16, dst2, dst3);
     PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
     PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16));
     dst += dst_stride;
   }
 }

 static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst,
                                               int32_t dst_stride,
                                               int8_t *filter,
                                               int32_t height) {
   uint32_t loop_cnt;
   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
   v16u8 filt0, dst0, dst1, dst2, dst3;
   v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, const255, filt;
+  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

   mask = LD_SB(&mc_filt_mask_arr[0]);

   /* rearranging filter */
   filt = LD_UH(filter);
   filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

-  const255 = (v8u16)__msa_ldi_h(255);
-
   for (loop_cnt = height; loop_cnt--;) {
     LD_SB4(src, 16, src0, src2, src4, src6);
     src7 = LD_SB(src + 56);
     SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
     src += src_stride;

     VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
     VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
     VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
     VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                 out2, out3);
     DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                 out6, out7);
     SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
     SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
     LD_UB4(dst, 16, dst0, dst1, dst2, dst3);
-    MIN_UH4_UH(out0, out1, out2, out3, const255);
-    MIN_UH4_UH(out4, out5, out6, out7, const255);
     PCKEV_AVG_ST_UB(out1, out0, dst0, dst);
     PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16);
     PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32);
     PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48);
     dst += dst_stride;
   }
 }

 void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4,
                                  int w, int h) {
   int8_t cnt, filt_hor[8];

-  if (16 != x_step_q4) {
-    vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
-                              filter_x, x_step_q4, filter_y, y_step_q4,
-                              w, h);
-    return;
-  }
-
-  if (((const int32_t *)filter_x)[1] == 0x800000) {
-    vpx_convolve_avg(src, src_stride, dst, dst_stride,
-                     filter_x, x_step_q4, filter_y, y_step_q4,
-                     w, h);
-    return;
-  }
+  assert(x_step_q4 == 16);
+  assert(((const int32_t *)filter_x)[1] != 0x800000);

   for (cnt = 0; cnt < 8; ++cnt) {
     filt_hor[cnt] = filter_x[cnt];
   }

   if (((const int32_t *)filter_x)[0] == 0) {
     switch (w) {
       case 4:
         common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride,
                                          dst, (int32_t)dst_stride,
(...skipping 53 matching lines...)
                                           filt_hor, h);
         break;
       default:
         vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
                                   filter_x, x_step_q4, filter_y, y_step_q4,
                                   w, h);
         break;
     }
   }
 }
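
Note (not part of the patch): the second change in this file replaces the runtime parameter checks with asserts, so the MSA kernel now requires x_step_q4 == 16 and a filter other than the copy-only one, and unsupported cases have to be routed away before it is called. A hedged sketch of such a caller-side guard, reusing only functions already referenced in this file; the dispatch shown is illustrative, not the actual upstream call site:

if (x_step_q4 != 16) {
  /* Fractional horizontal step: fall back to the generic C version. */
  vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
                            filter_x, x_step_q4, filter_y, y_step_q4, w, h);
} else if (((const int32_t *)filter_x)[1] == 0x800000) {
  /* The pattern the removed code treated as the copy-only filter:
   * just average the source block into the destination. */
  vpx_convolve_avg(src, src_stride, dst, dst_stride,
                   filter_x, x_step_q4, filter_y, y_step_q4, w, h);
} else {
  vpx_convolve8_avg_horiz_msa(src, src_stride, dst, dst_stride,
                              filter_x, x_step_q4, filter_y, y_step_q4, w, h);
}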