Chromium Code Reviews

Unified diff: source/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c

Issue 1302353004: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 3 months ago
 /*
  * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
  *
  * Use of this source code is governed by a BSD-style license
  * that can be found in the LICENSE file in the root of the source
  * tree. An additional intellectual property rights grant can be found
  * in the file PATENTS. All contributing project authors may
  * be found in the AUTHORS file in the root of the source tree.
  */

+#include <assert.h>
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/mips/vpx_convolve_msa.h"

 static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  int8_t *filter) {
   v16u8 mask0, mask1, mask2, mask3, out;
   v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
   v8i16 filt, out0, out1;

(...skipping 290 matching lines...)
     ST_UB(out, dst + 48);
     dst += dst_stride;
   }
 }

 static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  int8_t *filter) {
   v16i8 src0, src1, src2, src3, mask;
   v16u8 filt0, vec0, vec1, res0, res1;
-  v8u16 vec2, vec3, filt, const255;
+  v8u16 vec2, vec3, filt;

   mask = LD_SB(&mc_filt_mask_arr[16]);

   /* rearranging filter */
   filt = LD_UH(filter);
   filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

-  const255 = (v8u16) __msa_ldi_h(255);
-
   LD_SB4(src, src_stride, src0, src1, src2, src3);
   VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
   DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
   SRARI_H2_UH(vec2, vec3, FILTER_BITS);
-  MIN_UH2_UH(vec2, vec3, const255);
   PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
   ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
 }
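
Why dropping the clamp is safe: every 2t path in this file loses its const255
vector and the MIN_UH* that followed SRARI_H*. If the two taps are
non-negative and sum to 1 << FILTER_BITS (128, as vpx's bilinear filters do),
the rounding right shift already caps each result at 255, so the min was a
no-op. A minimal scalar sketch under that assumption (illustrative only, not
part of the patch; bilinear_round is a hypothetical name):

    /* Exhaustively checks that the rounded 2-tap dot product of 8-bit
     * pixels never exceeds 255 when the taps are {128 - f1, f1}. */
    #include <assert.h>
    #include <stdint.h>

    #define FILTER_BITS 7  /* matches vpx_dsp/vpx_filter.h */

    static uint16_t bilinear_round(uint8_t a, uint8_t b, int f0, int f1) {
      const uint32_t acc = a * f0 + b * f1;  /* at most 255 * 128 = 32640 */
      return (acc + (1 << (FILTER_BITS - 1))) >> FILTER_BITS;  /* <= 255 */
    }

    int main(void) {
      int a, b, f1;
      for (a = 0; a < 256; ++a)
        for (b = 0; b < 256; ++b)
          for (f1 = 0; f1 <= 128; ++f1)
            assert(bilinear_round(a, b, 128 - f1, f1) <= 255);
      return 0;
    }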

 static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  int8_t *filter) {
   v16u8 vec0, vec1, vec2, vec3, filt0;
   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
   v16i8 res0, res1, res2, res3;
-  v8u16 vec4, vec5, vec6, vec7, filt, const255;
+  v8u16 vec4, vec5, vec6, vec7, filt;

   mask = LD_SB(&mc_filt_mask_arr[16]);

   /* rearranging filter */
   filt = LD_UH(filter);
   filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

-  const255 = (v8u16) __msa_ldi_h(255);
-
   LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
   VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
   VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
   DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
               vec6, vec7);
   SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
-  MIN_UH4_UH(vec4, vec5, vec6, vec7, const255);
   PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
               res2, res3);
   ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
   dst += (4 * dst_stride);
   ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
 }

 static void common_hz_2t_4w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
   if (4 == height) {
     common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
   } else if (8 == height) {
     common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
   }
 }

 static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  int8_t *filter) {
   v16u8 filt0;
   v16i8 src0, src1, src2, src3, mask;
-  v8u16 vec0, vec1, vec2, vec3, const255, filt;
+  v8u16 vec0, vec1, vec2, vec3, filt;

   mask = LD_SB(&mc_filt_mask_arr[0]);

   /* rearranging filter */
   filt = LD_UH(filter);
   filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

-  const255 = (v8u16) __msa_ldi_h(255);
-
   LD_SB4(src, src_stride, src0, src1, src2, src3);
   VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
   VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
   DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
               vec2, vec3);
   SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-  MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
   PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
   ST8x4_UB(src0, src1, dst, dst_stride);
 }

 static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter, int32_t height) {
   v16u8 filt0;
   v16i8 src0, src1, src2, src3, mask, out0, out1;
-  v8u16 vec0, vec1, vec2, vec3, filt, const255;
+  v8u16 vec0, vec1, vec2, vec3, filt;

   mask = LD_SB(&mc_filt_mask_arr[0]);

   /* rearranging filter */
   filt = LD_UH(filter);
   filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

-  const255 = (v8u16) __msa_ldi_h(255);
-
   LD_SB4(src, src_stride, src0, src1, src2, src3);
   src += (4 * src_stride);

   VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
   VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
   DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
               vec2, vec3);
   SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-  MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);

   LD_SB4(src, src_stride, src0, src1, src2, src3);
   src += (4 * src_stride);

   PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
   ST8x4_UB(out0, out1, dst, dst_stride);
   dst += (4 * dst_stride);

   VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
   VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
   DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
               vec2, vec3);
   SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-  MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
   PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
   ST8x4_UB(out0, out1, dst, dst_stride);
   dst += (4 * dst_stride);

   if (16 == height) {
     LD_SB4(src, src_stride, src0, src1, src2, src3);
     src += (4 * src_stride);

     VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
     VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                 vec2, vec3);
     SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-    MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
     LD_SB4(src, src_stride, src0, src1, src2, src3);
     src += (4 * src_stride);

     PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
     ST8x4_UB(out0, out1, dst, dst_stride);

     VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
     VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                 vec2, vec3);
     SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-    MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
     PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
     ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
   }
 }

 static void common_hz_2t_8w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
   if (4 == height) {
     common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
   } else {
     common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
   }
 }

 static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  int8_t *filter, int32_t height) {
   uint32_t loop_cnt;
   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
   v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt, const255;
+  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

   mask = LD_SB(&mc_filt_mask_arr[0]);

   loop_cnt = (height >> 2) - 1;

   /* rearranging filter */
   filt = LD_UH(filter);
   filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

-  const255 = (v8u16) __msa_ldi_h(255);
-
   LD_SB4(src, src_stride, src0, src2, src4, src6);
   LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
   src += (4 * src_stride);

   VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
   VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
   VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
   VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
   DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
               out2, out3);
   DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
               out6, out7);
   SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
   SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
-  MIN_UH4_UH(out0, out1, out2, out3, const255);
-  MIN_UH4_UH(out4, out5, out6, out7, const255);
   PCKEV_ST_SB(out0, out1, dst);
   dst += dst_stride;
   PCKEV_ST_SB(out2, out3, dst);
   dst += dst_stride;
   PCKEV_ST_SB(out4, out5, dst);
   dst += dst_stride;
   PCKEV_ST_SB(out6, out7, dst);
   dst += dst_stride;

   for (; loop_cnt--;) {
     LD_SB4(src, src_stride, src0, src2, src4, src6);
     LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
     src += (4 * src_stride);

     VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
     VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
     VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
     VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                 out2, out3);
     DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                 out6, out7);
     SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
     SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
-    MIN_UH4_UH(out0, out1, out2, out3, const255);
-    MIN_UH4_UH(out4, out5, out6, out7, const255);
     PCKEV_ST_SB(out0, out1, dst);
     dst += dst_stride;
     PCKEV_ST_SB(out2, out3, dst);
     dst += dst_stride;
     PCKEV_ST_SB(out4, out5, dst);
     dst += dst_stride;
     PCKEV_ST_SB(out6, out7, dst);
     dst += dst_stride;
   }
 }
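
For readers new to the MSA macros, the arithmetic in each 2t horizontal pass
is small: VSHF_B* gathers adjacent source-byte pairs, DOTP_UB* forms unsigned
dot products against the splatted tap pair, SRARI_H* is the rounding right
shift, and PCKEV_B*/PCKEV_ST_SB pack the 16-bit results back to bytes. A
scalar sketch of one output row (hz_2t_row is a hypothetical name; assumes
filter points at the two nonzero bilinear taps, and per-lane vector layout is
not modeled):

    #include <stdint.h>

    #define FILTER_BITS 7  /* matches vpx_dsp/vpx_filter.h */

    /* Scalar equivalent of one row of the 2-tap horizontal pass; the MSA
     * version computes 16 of these outputs per vector operation. */
    static void hz_2t_row(const uint8_t *src, uint8_t *dst, int w,
                          const int8_t *filter) {
      const int f0 = filter[0], f1 = filter[1];  /* non-negative, sum 128 */
      int x;
      for (x = 0; x < w; ++x) {
        const int sum = src[x] * f0 + src[x + 1] * f1;
        dst[x] = (uint8_t)((sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS);
      }
    }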

 static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  int8_t *filter, int32_t height) {
   uint32_t loop_cnt;
   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
   v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt, const255;
+  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

   mask = LD_SB(&mc_filt_mask_arr[0]);

   /* rearranging filter */
   filt = LD_UH(filter);
   filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

-  const255 = (v8u16) __msa_ldi_h(255);
-
   for (loop_cnt = height >> 1; loop_cnt--;) {
     src0 = LD_SB(src);
     src2 = LD_SB(src + 16);
     src3 = LD_SB(src + 24);
     src1 = __msa_sldi_b(src2, src0, 8);
     src += src_stride;
     src4 = LD_SB(src);
     src6 = LD_SB(src + 16);
     src7 = LD_SB(src + 24);
     src5 = __msa_sldi_b(src6, src4, 8);
     src += src_stride;

     VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
     VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
     VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
     VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                 out2, out3);
     DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                 out6, out7);
     SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
     SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
-    MIN_UH4_UH(out0, out1, out2, out3, const255);
-    MIN_UH4_UH(out4, out5, out6, out7, const255);
     PCKEV_ST_SB(out0, out1, dst);
     PCKEV_ST_SB(out2, out3, dst + 16);
     dst += dst_stride;
     PCKEV_ST_SB(out4, out5, dst);
     PCKEV_ST_SB(out6, out7, dst + 16);
     dst += dst_stride;
   }
 }

 static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  int8_t *filter, int32_t height) {
   uint32_t loop_cnt;
   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
   v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt, const255;
+  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

   mask = LD_SB(&mc_filt_mask_arr[0]);

   /* rearranging filter */
   filt = LD_UH(filter);
   filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

-  const255 = (v8u16) __msa_ldi_h(255);
-
   for (loop_cnt = height; loop_cnt--;) {
     src0 = LD_SB(src);
     src2 = LD_SB(src + 16);
     src4 = LD_SB(src + 32);
     src6 = LD_SB(src + 48);
     src7 = LD_SB(src + 56);
     SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
     src += src_stride;

     VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
     VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
     VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
     VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                 out2, out3);
     DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                 out6, out7);
     SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
     SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
-    MIN_UH4_UH(out0, out1, out2, out3, const255);
-    MIN_UH4_UH(out4, out5, out6, out7, const255);
     PCKEV_ST_SB(out0, out1, dst);
     PCKEV_ST_SB(out2, out3, dst + 16);
     PCKEV_ST_SB(out4, out5, dst + 32);
     PCKEV_ST_SB(out6, out7, dst + 48);
     dst += dst_stride;
   }
 }

 void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4,
                              int w, int h) {
   int8_t cnt, filt_hor[8];

-  if (16 != x_step_q4) {
-    vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride,
-                          filter_x, x_step_q4, filter_y, y_step_q4,
-                          w, h);
-    return;
-  }
-
-  if (((const int32_t *)filter_x)[1] == 0x800000) {
-    vpx_convolve_copy(src, src_stride, dst, dst_stride,
-                      filter_x, x_step_q4, filter_y, y_step_q4,
-                      w, h);
-    return;
-  }
+  assert(x_step_q4 == 16);
+  assert(((const int32_t *)filter_x)[1] != 0x800000);

   for (cnt = 0; cnt < 8; ++cnt) {
     filt_hor[cnt] = filter_x[cnt];
   }

   if (((const int32_t *)filter_x)[0] == 0) {
     switch (w) {
       case 4:
         common_hz_2t_4w_msa(src, (int32_t)src_stride,
                             dst, (int32_t)dst_stride,
(...skipping 53 matching lines...)
                              filt_hor, h);
         break;
       default:
         vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride,
                               filter_x, x_step_q4, filter_y, y_step_q4,
                               w, h);
         break;
     }
   }
 }
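
A note on the rewritten guards: the runtime fallbacks to vpx_convolve8_horiz_c
and vpx_convolve_copy become asserts, so after this pull callers are expected
to route fractional-step cases (x_step_q4 != 16) and the identity filter away
before reaching the MSA kernel. The 0x800000 magic reads taps 2-3 of the
int16_t filter array as one int32_t; on a little-endian target that word is
0x00800000 exactly when the taps are {0, 128}, i.e. the "copy" filter. A small
self-check of that encoding (a sketch; assumes little-endian byte order, and
uses memcpy instead of the pointer cast to stay strictly portable):

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    int main(void) {
      /* The 8-tap identity filter: only tap 3 is nonzero (== 128). */
      const int16_t identity[8] = { 0, 0, 0, 128, 0, 0, 0, 0 };
      int32_t word;
      memcpy(&word, &identity[2], sizeof(word));  /* taps 2 and 3 */
      assert(word == 0x800000);  /* the value the new assert rules out */
      return 0;
    }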