OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
| 11 #include <assert.h> |
11 #include "./vpx_dsp_rtcd.h" | 12 #include "./vpx_dsp_rtcd.h" |
12 #include "vpx_dsp/mips/vpx_convolve_msa.h" | 13 #include "vpx_dsp/mips/vpx_convolve_msa.h" |
13 | 14 |
14 static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride, | 15 static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride, |
15 uint8_t *dst, int32_t dst_stride, | 16 uint8_t *dst, int32_t dst_stride, |
16 int8_t *filter) { | 17 int8_t *filter) { |
17 v16u8 mask0, mask1, mask2, mask3, out; | 18 v16u8 mask0, mask1, mask2, mask3, out; |
18 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; | 19 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; |
19 v8i16 filt, out0, out1; | 20 v8i16 filt, out0, out1; |
20 | 21 |
(...skipping 290 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
311 ST_UB(out, dst + 48); | 312 ST_UB(out, dst + 48); |
312 dst += dst_stride; | 313 dst += dst_stride; |
313 } | 314 } |
314 } | 315 } |
315 | 316 |
316 static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride, | 317 static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride, |
317 uint8_t *dst, int32_t dst_stride, | 318 uint8_t *dst, int32_t dst_stride, |
318 int8_t *filter) { | 319 int8_t *filter) { |
319 v16i8 src0, src1, src2, src3, mask; | 320 v16i8 src0, src1, src2, src3, mask; |
320 v16u8 filt0, vec0, vec1, res0, res1; | 321 v16u8 filt0, vec0, vec1, res0, res1; |
321 v8u16 vec2, vec3, filt, const255; | 322 v8u16 vec2, vec3, filt; |
322 | 323 |
323 mask = LD_SB(&mc_filt_mask_arr[16]); | 324 mask = LD_SB(&mc_filt_mask_arr[16]); |
324 | 325 |
325 /* rearranging filter */ | 326 /* rearranging filter */ |
326 filt = LD_UH(filter); | 327 filt = LD_UH(filter); |
327 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); | 328 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); |
328 | 329 |
329 const255 = (v8u16) __msa_ldi_h(255); | |
330 | |
331 LD_SB4(src, src_stride, src0, src1, src2, src3); | 330 LD_SB4(src, src_stride, src0, src1, src2, src3); |
332 VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); | 331 VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); |
333 DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3); | 332 DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3); |
334 SRARI_H2_UH(vec2, vec3, FILTER_BITS); | 333 SRARI_H2_UH(vec2, vec3, FILTER_BITS); |
335 MIN_UH2_UH(vec2, vec3, const255); | |
336 PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1); | 334 PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1); |
337 ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); | 335 ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); |
338 } | 336 } |
339 | 337 |
340 static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride, | 338 static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride, |
341 uint8_t *dst, int32_t dst_stride, | 339 uint8_t *dst, int32_t dst_stride, |
342 int8_t *filter) { | 340 int8_t *filter) { |
343 v16u8 vec0, vec1, vec2, vec3, filt0; | 341 v16u8 vec0, vec1, vec2, vec3, filt0; |
344 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; | 342 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; |
345 v16i8 res0, res1, res2, res3; | 343 v16i8 res0, res1, res2, res3; |
346 v8u16 vec4, vec5, vec6, vec7, filt, const255; | 344 v8u16 vec4, vec5, vec6, vec7, filt; |
347 | 345 |
348 mask = LD_SB(&mc_filt_mask_arr[16]); | 346 mask = LD_SB(&mc_filt_mask_arr[16]); |
349 | 347 |
350 /* rearranging filter */ | 348 /* rearranging filter */ |
351 filt = LD_UH(filter); | 349 filt = LD_UH(filter); |
352 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); | 350 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); |
353 | 351 |
354 const255 = (v8u16) __msa_ldi_h(255); | |
355 | |
356 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); | 352 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); |
357 VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); | 353 VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); |
358 VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3); | 354 VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3); |
359 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5, | 355 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5, |
360 vec6, vec7); | 356 vec6, vec7); |
361 SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); | 357 SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); |
362 MIN_UH4_UH(vec4, vec5, vec6, vec7, const255); | |
363 PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, | 358 PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, |
364 res2, res3); | 359 res2, res3); |
365 ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); | 360 ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); |
366 dst += (4 * dst_stride); | 361 dst += (4 * dst_stride); |
367 ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); | 362 ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); |
368 } | 363 } |
369 | 364 |
370 static void common_hz_2t_4w_msa(const uint8_t *src, int32_t src_stride, | 365 static void common_hz_2t_4w_msa(const uint8_t *src, int32_t src_stride, |
371 uint8_t *dst, int32_t dst_stride, | 366 uint8_t *dst, int32_t dst_stride, |
372 int8_t *filter, int32_t height) { | 367 int8_t *filter, int32_t height) { |
373 if (4 == height) { | 368 if (4 == height) { |
374 common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter); | 369 common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter); |
375 } else if (8 == height) { | 370 } else if (8 == height) { |
376 common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter); | 371 common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter); |
377 } | 372 } |
378 } | 373 } |
379 | 374 |
380 static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride, | 375 static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride, |
381 uint8_t *dst, int32_t dst_stride, | 376 uint8_t *dst, int32_t dst_stride, |
382 int8_t *filter) { | 377 int8_t *filter) { |
383 v16u8 filt0; | 378 v16u8 filt0; |
384 v16i8 src0, src1, src2, src3, mask; | 379 v16i8 src0, src1, src2, src3, mask; |
385 v8u16 vec0, vec1, vec2, vec3, const255, filt; | 380 v8u16 vec0, vec1, vec2, vec3, filt; |
386 | 381 |
387 mask = LD_SB(&mc_filt_mask_arr[0]); | 382 mask = LD_SB(&mc_filt_mask_arr[0]); |
388 | 383 |
389 /* rearranging filter */ | 384 /* rearranging filter */ |
390 filt = LD_UH(filter); | 385 filt = LD_UH(filter); |
391 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); | 386 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); |
392 | 387 |
393 const255 = (v8u16) __msa_ldi_h(255); | |
394 | |
395 LD_SB4(src, src_stride, src0, src1, src2, src3); | 388 LD_SB4(src, src_stride, src0, src1, src2, src3); |
396 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); | 389 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); |
397 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); | 390 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); |
398 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, | 391 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, |
399 vec2, vec3); | 392 vec2, vec3); |
400 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); | 393 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); |
401 MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); | |
402 PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1); | 394 PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1); |
403 ST8x4_UB(src0, src1, dst, dst_stride); | 395 ST8x4_UB(src0, src1, dst, dst_stride); |
404 } | 396 } |
405 | 397 |
406 static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, | 398 static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, |
407 uint8_t *dst, int32_t dst_stride, | 399 uint8_t *dst, int32_t dst_stride, |
408 int8_t *filter, int32_t height) { | 400 int8_t *filter, int32_t height) { |
409 v16u8 filt0; | 401 v16u8 filt0; |
410 v16i8 src0, src1, src2, src3, mask, out0, out1; | 402 v16i8 src0, src1, src2, src3, mask, out0, out1; |
411 v8u16 vec0, vec1, vec2, vec3, filt, const255; | 403 v8u16 vec0, vec1, vec2, vec3, filt; |
412 | 404 |
413 mask = LD_SB(&mc_filt_mask_arr[0]); | 405 mask = LD_SB(&mc_filt_mask_arr[0]); |
414 | 406 |
415 /* rearranging filter */ | 407 /* rearranging filter */ |
416 filt = LD_UH(filter); | 408 filt = LD_UH(filter); |
417 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); | 409 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); |
418 | 410 |
419 const255 = (v8u16) __msa_ldi_h(255); | |
420 | |
421 LD_SB4(src, src_stride, src0, src1, src2, src3); | 411 LD_SB4(src, src_stride, src0, src1, src2, src3); |
422 src += (4 * src_stride); | 412 src += (4 * src_stride); |
423 | 413 |
424 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); | 414 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); |
425 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); | 415 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); |
426 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, | 416 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, |
427 vec2, vec3); | 417 vec2, vec3); |
428 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); | 418 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); |
429 MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); | |
430 | 419 |
431 LD_SB4(src, src_stride, src0, src1, src2, src3); | 420 LD_SB4(src, src_stride, src0, src1, src2, src3); |
432 src += (4 * src_stride); | 421 src += (4 * src_stride); |
433 | 422 |
434 PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); | 423 PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); |
435 ST8x4_UB(out0, out1, dst, dst_stride); | 424 ST8x4_UB(out0, out1, dst, dst_stride); |
436 dst += (4 * dst_stride); | 425 dst += (4 * dst_stride); |
437 | 426 |
438 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); | 427 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); |
439 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); | 428 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); |
440 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, | 429 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, |
441 vec2, vec3); | 430 vec2, vec3); |
442 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); | 431 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); |
443 MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); | |
444 PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); | 432 PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); |
445 ST8x4_UB(out0, out1, dst, dst_stride); | 433 ST8x4_UB(out0, out1, dst, dst_stride); |
446 dst += (4 * dst_stride); | 434 dst += (4 * dst_stride); |
447 | 435 |
448 if (16 == height) { | 436 if (16 == height) { |
449 LD_SB4(src, src_stride, src0, src1, src2, src3); | 437 LD_SB4(src, src_stride, src0, src1, src2, src3); |
450 src += (4 * src_stride); | 438 src += (4 * src_stride); |
451 | 439 |
452 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); | 440 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); |
453 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); | 441 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); |
454 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, | 442 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, |
455 vec2, vec3); | 443 vec2, vec3); |
456 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); | 444 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); |
457 MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); | |
458 LD_SB4(src, src_stride, src0, src1, src2, src3); | 445 LD_SB4(src, src_stride, src0, src1, src2, src3); |
459 src += (4 * src_stride); | 446 src += (4 * src_stride); |
460 | 447 |
461 PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); | 448 PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); |
462 ST8x4_UB(out0, out1, dst, dst_stride); | 449 ST8x4_UB(out0, out1, dst, dst_stride); |
463 | 450 |
464 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); | 451 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); |
465 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); | 452 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); |
466 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, | 453 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, |
467 vec2, vec3); | 454 vec2, vec3); |
468 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); | 455 SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); |
469 MIN_UH4_UH(vec0, vec1, vec2, vec3, const255); | |
470 PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); | 456 PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); |
471 ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride); | 457 ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride); |
472 } | 458 } |
473 } | 459 } |
474 | 460 |
475 static void common_hz_2t_8w_msa(const uint8_t *src, int32_t src_stride, | 461 static void common_hz_2t_8w_msa(const uint8_t *src, int32_t src_stride, |
476 uint8_t *dst, int32_t dst_stride, | 462 uint8_t *dst, int32_t dst_stride, |
477 int8_t *filter, int32_t height) { | 463 int8_t *filter, int32_t height) { |
478 if (4 == height) { | 464 if (4 == height) { |
479 common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter); | 465 common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter); |
480 } else { | 466 } else { |
481 common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height); | 467 common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height); |
482 } | 468 } |
483 } | 469 } |
484 | 470 |
485 static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride, | 471 static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride, |
486 uint8_t *dst, int32_t dst_stride, | 472 uint8_t *dst, int32_t dst_stride, |
487 int8_t *filter, int32_t height) { | 473 int8_t *filter, int32_t height) { |
488 uint32_t loop_cnt; | 474 uint32_t loop_cnt; |
489 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; | 475 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; |
490 v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; | 476 v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; |
491 v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt, const255; | 477 v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; |
492 | 478 |
493 mask = LD_SB(&mc_filt_mask_arr[0]); | 479 mask = LD_SB(&mc_filt_mask_arr[0]); |
494 | 480 |
495 loop_cnt = (height >> 2) - 1; | 481 loop_cnt = (height >> 2) - 1; |
496 | 482 |
497 /* rearranging filter */ | 483 /* rearranging filter */ |
498 filt = LD_UH(filter); | 484 filt = LD_UH(filter); |
499 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); | 485 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); |
500 | 486 |
501 const255 = (v8u16) __msa_ldi_h(255); | |
502 | |
503 LD_SB4(src, src_stride, src0, src2, src4, src6); | 487 LD_SB4(src, src_stride, src0, src2, src4, src6); |
504 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); | 488 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); |
505 src += (4 * src_stride); | 489 src += (4 * src_stride); |
506 | 490 |
507 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); | 491 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); |
508 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); | 492 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); |
509 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); | 493 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); |
510 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); | 494 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); |
511 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, | 495 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, |
512 out2, out3); | 496 out2, out3); |
513 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, | 497 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, |
514 out6, out7); | 498 out6, out7); |
515 SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); | 499 SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); |
516 SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); | 500 SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); |
517 MIN_UH4_UH(out0, out1, out2, out3, const255); | |
518 MIN_UH4_UH(out4, out5, out6, out7, const255); | |
519 PCKEV_ST_SB(out0, out1, dst); | 501 PCKEV_ST_SB(out0, out1, dst); |
520 dst += dst_stride; | 502 dst += dst_stride; |
521 PCKEV_ST_SB(out2, out3, dst); | 503 PCKEV_ST_SB(out2, out3, dst); |
522 dst += dst_stride; | 504 dst += dst_stride; |
523 PCKEV_ST_SB(out4, out5, dst); | 505 PCKEV_ST_SB(out4, out5, dst); |
524 dst += dst_stride; | 506 dst += dst_stride; |
525 PCKEV_ST_SB(out6, out7, dst); | 507 PCKEV_ST_SB(out6, out7, dst); |
526 dst += dst_stride; | 508 dst += dst_stride; |
527 | 509 |
528 for (; loop_cnt--;) { | 510 for (; loop_cnt--;) { |
529 LD_SB4(src, src_stride, src0, src2, src4, src6); | 511 LD_SB4(src, src_stride, src0, src2, src4, src6); |
530 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); | 512 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); |
531 src += (4 * src_stride); | 513 src += (4 * src_stride); |
532 | 514 |
533 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); | 515 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); |
534 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); | 516 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); |
535 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); | 517 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); |
536 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); | 518 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); |
537 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, | 519 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, |
538 out2, out3); | 520 out2, out3); |
539 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, | 521 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, |
540 out6, out7); | 522 out6, out7); |
541 SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); | 523 SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); |
542 SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); | 524 SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); |
543 MIN_UH4_UH(out0, out1, out2, out3, const255); | |
544 MIN_UH4_UH(out4, out5, out6, out7, const255); | |
545 PCKEV_ST_SB(out0, out1, dst); | 525 PCKEV_ST_SB(out0, out1, dst); |
546 dst += dst_stride; | 526 dst += dst_stride; |
547 PCKEV_ST_SB(out2, out3, dst); | 527 PCKEV_ST_SB(out2, out3, dst); |
548 dst += dst_stride; | 528 dst += dst_stride; |
549 PCKEV_ST_SB(out4, out5, dst); | 529 PCKEV_ST_SB(out4, out5, dst); |
550 dst += dst_stride; | 530 dst += dst_stride; |
551 PCKEV_ST_SB(out6, out7, dst); | 531 PCKEV_ST_SB(out6, out7, dst); |
552 dst += dst_stride; | 532 dst += dst_stride; |
553 } | 533 } |
554 } | 534 } |
555 | 535 |
556 static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride, | 536 static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride, |
557 uint8_t *dst, int32_t dst_stride, | 537 uint8_t *dst, int32_t dst_stride, |
558 int8_t *filter, int32_t height) { | 538 int8_t *filter, int32_t height) { |
559 uint32_t loop_cnt; | 539 uint32_t loop_cnt; |
560 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; | 540 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; |
561 v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; | 541 v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; |
562 v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt, const255; | 542 v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; |
563 | 543 |
564 mask = LD_SB(&mc_filt_mask_arr[0]); | 544 mask = LD_SB(&mc_filt_mask_arr[0]); |
565 | 545 |
566 /* rearranging filter */ | 546 /* rearranging filter */ |
567 filt = LD_UH(filter); | 547 filt = LD_UH(filter); |
568 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); | 548 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); |
569 | 549 |
570 const255 = (v8u16) __msa_ldi_h(255); | |
571 | |
572 for (loop_cnt = height >> 1; loop_cnt--;) { | 550 for (loop_cnt = height >> 1; loop_cnt--;) { |
573 src0 = LD_SB(src); | 551 src0 = LD_SB(src); |
574 src2 = LD_SB(src + 16); | 552 src2 = LD_SB(src + 16); |
575 src3 = LD_SB(src + 24); | 553 src3 = LD_SB(src + 24); |
576 src1 = __msa_sldi_b(src2, src0, 8); | 554 src1 = __msa_sldi_b(src2, src0, 8); |
577 src += src_stride; | 555 src += src_stride; |
578 src4 = LD_SB(src); | 556 src4 = LD_SB(src); |
579 src6 = LD_SB(src + 16); | 557 src6 = LD_SB(src + 16); |
580 src7 = LD_SB(src + 24); | 558 src7 = LD_SB(src + 24); |
581 src5 = __msa_sldi_b(src6, src4, 8); | 559 src5 = __msa_sldi_b(src6, src4, 8); |
582 src += src_stride; | 560 src += src_stride; |
583 | 561 |
584 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); | 562 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); |
585 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); | 563 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); |
586 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); | 564 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); |
587 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); | 565 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); |
588 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, | 566 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, |
589 out2, out3); | 567 out2, out3); |
590 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, | 568 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, |
591 out6, out7); | 569 out6, out7); |
592 SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); | 570 SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); |
593 SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); | 571 SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); |
594 MIN_UH4_UH(out0, out1, out2, out3, const255); | |
595 MIN_UH4_UH(out4, out5, out6, out7, const255); | |
596 PCKEV_ST_SB(out0, out1, dst); | 572 PCKEV_ST_SB(out0, out1, dst); |
597 PCKEV_ST_SB(out2, out3, dst + 16); | 573 PCKEV_ST_SB(out2, out3, dst + 16); |
598 dst += dst_stride; | 574 dst += dst_stride; |
599 PCKEV_ST_SB(out4, out5, dst); | 575 PCKEV_ST_SB(out4, out5, dst); |
600 PCKEV_ST_SB(out6, out7, dst + 16); | 576 PCKEV_ST_SB(out6, out7, dst + 16); |
601 dst += dst_stride; | 577 dst += dst_stride; |
602 } | 578 } |
603 } | 579 } |
604 | 580 |
605 static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride, | 581 static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride, |
606 uint8_t *dst, int32_t dst_stride, | 582 uint8_t *dst, int32_t dst_stride, |
607 int8_t *filter, int32_t height) { | 583 int8_t *filter, int32_t height) { |
608 uint32_t loop_cnt; | 584 uint32_t loop_cnt; |
609 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; | 585 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; |
610 v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; | 586 v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; |
611 v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt, const255; | 587 v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; |
612 | 588 |
613 mask = LD_SB(&mc_filt_mask_arr[0]); | 589 mask = LD_SB(&mc_filt_mask_arr[0]); |
614 | 590 |
615 /* rearranging filter */ | 591 /* rearranging filter */ |
616 filt = LD_UH(filter); | 592 filt = LD_UH(filter); |
617 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); | 593 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); |
618 | 594 |
619 const255 = (v8u16) __msa_ldi_h(255); | |
620 | |
621 for (loop_cnt = height; loop_cnt--;) { | 595 for (loop_cnt = height; loop_cnt--;) { |
622 src0 = LD_SB(src); | 596 src0 = LD_SB(src); |
623 src2 = LD_SB(src + 16); | 597 src2 = LD_SB(src + 16); |
624 src4 = LD_SB(src + 32); | 598 src4 = LD_SB(src + 32); |
625 src6 = LD_SB(src + 48); | 599 src6 = LD_SB(src + 48); |
626 src7 = LD_SB(src + 56); | 600 src7 = LD_SB(src + 56); |
627 SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8); | 601 SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8); |
628 src += src_stride; | 602 src += src_stride; |
629 | 603 |
630 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); | 604 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); |
631 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); | 605 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); |
632 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); | 606 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); |
633 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); | 607 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); |
634 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, | 608 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, |
635 out2, out3); | 609 out2, out3); |
636 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, | 610 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, |
637 out6, out7); | 611 out6, out7); |
638 SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); | 612 SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); |
639 SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); | 613 SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); |
640 MIN_UH4_UH(out0, out1, out2, out3, const255); | |
641 MIN_UH4_UH(out4, out5, out6, out7, const255); | |
642 PCKEV_ST_SB(out0, out1, dst); | 614 PCKEV_ST_SB(out0, out1, dst); |
643 PCKEV_ST_SB(out2, out3, dst + 16); | 615 PCKEV_ST_SB(out2, out3, dst + 16); |
644 PCKEV_ST_SB(out4, out5, dst + 32); | 616 PCKEV_ST_SB(out4, out5, dst + 32); |
645 PCKEV_ST_SB(out6, out7, dst + 48); | 617 PCKEV_ST_SB(out6, out7, dst + 48); |
646 dst += dst_stride; | 618 dst += dst_stride; |
647 } | 619 } |
648 } | 620 } |
649 | 621 |
650 void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, | 622 void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, |
651 uint8_t *dst, ptrdiff_t dst_stride, | 623 uint8_t *dst, ptrdiff_t dst_stride, |
652 const int16_t *filter_x, int x_step_q4, | 624 const int16_t *filter_x, int x_step_q4, |
653 const int16_t *filter_y, int y_step_q4, | 625 const int16_t *filter_y, int y_step_q4, |
654 int w, int h) { | 626 int w, int h) { |
655 int8_t cnt, filt_hor[8]; | 627 int8_t cnt, filt_hor[8]; |
656 | 628 |
657 if (16 != x_step_q4) { | 629 assert(x_step_q4 == 16); |
658 vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, | 630 assert(((const int32_t *)filter_x)[1] != 0x800000); |
659 filter_x, x_step_q4, filter_y, y_step_q4, | |
660 w, h); | |
661 return; | |
662 } | |
663 | |
664 if (((const int32_t *)filter_x)[1] == 0x800000) { | |
665 vpx_convolve_copy(src, src_stride, dst, dst_stride, | |
666 filter_x, x_step_q4, filter_y, y_step_q4, | |
667 w, h); | |
668 return; | |
669 } | |
670 | 631 |
671 for (cnt = 0; cnt < 8; ++cnt) { | 632 for (cnt = 0; cnt < 8; ++cnt) { |
672 filt_hor[cnt] = filter_x[cnt]; | 633 filt_hor[cnt] = filter_x[cnt]; |
673 } | 634 } |
674 | 635 |
675 if (((const int32_t *)filter_x)[0] == 0) { | 636 if (((const int32_t *)filter_x)[0] == 0) { |
676 switch (w) { | 637 switch (w) { |
677 case 4: | 638 case 4: |
678 common_hz_2t_4w_msa(src, (int32_t)src_stride, | 639 common_hz_2t_4w_msa(src, (int32_t)src_stride, |
679 dst, (int32_t)dst_stride, | 640 dst, (int32_t)dst_stride, |
(...skipping 53 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
733 filt_hor, h); | 694 filt_hor, h); |
734 break; | 695 break; |
735 default: | 696 default: |
736 vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, | 697 vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, |
737 filter_x, x_step_q4, filter_y, y_step_q4, | 698 filter_x, x_step_q4, filter_y, y_step_q4, |
738 w, h); | 699 w, h); |
739 break; | 700 break; |
740 } | 701 } |
741 } | 702 } |
742 } | 703 } |
OLD | NEW |