OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
| 11 #include <assert.h> |
11 #include "./vpx_dsp_rtcd.h" | 12 #include "./vpx_dsp_rtcd.h" |
12 #include "vpx_dsp/mips/vpx_convolve_msa.h" | 13 #include "vpx_dsp/mips/vpx_convolve_msa.h" |
13 | 14 |
14 const uint8_t mc_filt_mask_arr[16 * 3] = { | 15 const uint8_t mc_filt_mask_arr[16 * 3] = { |
15 /* 8 width cases */ | 16 /* 8 width cases */ |
16 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, | 17 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, |
17 /* 4 width cases */ | 18 /* 4 width cases */ |
18 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, | 19 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, |
19 /* 4 width cases */ | 20 /* 4 width cases */ |
20 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 | 21 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 |
(...skipping 228 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
249 LD_SB5(src, src_stride, src0, src1, src2, src3, src4); | 250 LD_SB5(src, src_stride, src0, src1, src2, src3, src4); |
250 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); | 251 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); |
251 hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); | 252 hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); |
252 hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); | 253 hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); |
253 hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); | 254 hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); |
254 hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); | 255 hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); |
255 | 256 |
256 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); | 257 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); |
257 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); | 258 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); |
258 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 259 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
259 SAT_UH2_UH(tmp0, tmp1, 7); | |
260 PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); | 260 PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); |
261 ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); | 261 ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); |
262 } | 262 } |
263 | 263 |
264 static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride, | 264 static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride, |
265 uint8_t *dst, int32_t dst_stride, | 265 uint8_t *dst, int32_t dst_stride, |
266 int8_t *filter_horiz, | 266 int8_t *filter_horiz, |
267 int8_t *filter_vert) { | 267 int8_t *filter_vert) { |
268 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; | 268 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; |
269 v16i8 res0, res1, res2, res3; | 269 v16i8 res0, res1, res2, res3; |
(...skipping 21 matching lines...) Expand all Loading... |
291 hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS); | 291 hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS); |
292 SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1, | 292 SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1, |
293 hz_out3, hz_out5, 8); | 293 hz_out3, hz_out5, 8); |
294 hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6); | 294 hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6); |
295 | 295 |
296 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); | 296 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); |
297 ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3); | 297 ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3); |
298 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, | 298 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, |
299 vec4, vec5, vec6, vec7); | 299 vec4, vec5, vec6, vec7); |
300 SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); | 300 SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); |
301 SAT_UH4_UH(vec4, vec5, vec6, vec7, 7); | |
302 PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, | 301 PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, |
303 res2, res3); | 302 res2, res3); |
304 ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); | 303 ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); |
305 dst += (4 * dst_stride); | 304 dst += (4 * dst_stride); |
306 ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); | 305 ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); |
307 } | 306 } |
308 | 307 |
309 static void common_hv_2ht_2vt_4w_msa(const uint8_t *src, int32_t src_stride, | 308 static void common_hv_2ht_2vt_4w_msa(const uint8_t *src, int32_t src_stride, |
310 uint8_t *dst, int32_t dst_stride, | 309 uint8_t *dst, int32_t dst_stride, |
311 int8_t *filter_horiz, int8_t *filter_vert, | 310 int8_t *filter_horiz, int8_t *filter_vert, |
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
350 | 349 |
351 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); | 350 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); |
352 vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); | 351 vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); |
353 tmp2 = __msa_dotp_u_h(vec2, filt_vt); | 352 tmp2 = __msa_dotp_u_h(vec2, filt_vt); |
354 | 353 |
355 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); | 354 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); |
356 vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); | 355 vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); |
357 tmp3 = __msa_dotp_u_h(vec3, filt_vt); | 356 tmp3 = __msa_dotp_u_h(vec3, filt_vt); |
358 | 357 |
359 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); | 358 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); |
360 SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); | |
361 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); | 359 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); |
362 ST8x4_UB(out0, out1, dst, dst_stride); | 360 ST8x4_UB(out0, out1, dst, dst_stride); |
363 } | 361 } |
364 | 362 |
365 static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, | 363 static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, |
366 int32_t src_stride, | 364 int32_t src_stride, |
367 uint8_t *dst, | 365 uint8_t *dst, |
368 int32_t dst_stride, | 366 int32_t dst_stride, |
369 int8_t *filter_horiz, | 367 int8_t *filter_horiz, |
370 int8_t *filter_vert, | 368 int8_t *filter_vert, |
(...skipping 24 matching lines...) Expand all Loading... |
395 | 393 |
396 hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); | 394 hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); |
397 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); | 395 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); |
398 tmp1 = __msa_dotp_u_h(vec0, filt_vt); | 396 tmp1 = __msa_dotp_u_h(vec0, filt_vt); |
399 | 397 |
400 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); | 398 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); |
401 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); | 399 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); |
402 tmp2 = __msa_dotp_u_h(vec0, filt_vt); | 400 tmp2 = __msa_dotp_u_h(vec0, filt_vt); |
403 | 401 |
404 SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); | 402 SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); |
405 SAT_UH2_UH(tmp1, tmp2, 7); | |
406 | 403 |
407 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); | 404 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); |
408 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); | 405 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); |
409 tmp3 = __msa_dotp_u_h(vec0, filt_vt); | 406 tmp3 = __msa_dotp_u_h(vec0, filt_vt); |
410 | 407 |
411 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); | 408 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); |
412 LD_SB4(src, src_stride, src1, src2, src3, src4); | 409 LD_SB4(src, src_stride, src1, src2, src3, src4); |
413 src += (4 * src_stride); | 410 src += (4 * src_stride); |
414 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); | 411 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); |
415 tmp4 = __msa_dotp_u_h(vec0, filt_vt); | 412 tmp4 = __msa_dotp_u_h(vec0, filt_vt); |
416 | 413 |
417 SRARI_H2_UH(tmp3, tmp4, FILTER_BITS); | 414 SRARI_H2_UH(tmp3, tmp4, FILTER_BITS); |
418 SAT_UH2_UH(tmp3, tmp4, 7); | |
419 PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1); | 415 PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1); |
420 ST8x4_UB(out0, out1, dst, dst_stride); | 416 ST8x4_UB(out0, out1, dst, dst_stride); |
421 dst += (4 * dst_stride); | 417 dst += (4 * dst_stride); |
422 | 418 |
423 hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); | 419 hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); |
424 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); | 420 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); |
425 tmp5 = __msa_dotp_u_h(vec0, filt_vt); | 421 tmp5 = __msa_dotp_u_h(vec0, filt_vt); |
426 | 422 |
427 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); | 423 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); |
428 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); | 424 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); |
429 tmp6 = __msa_dotp_u_h(vec0, filt_vt); | 425 tmp6 = __msa_dotp_u_h(vec0, filt_vt); |
430 | 426 |
431 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); | 427 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); |
432 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); | 428 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); |
433 tmp7 = __msa_dotp_u_h(vec0, filt_vt); | 429 tmp7 = __msa_dotp_u_h(vec0, filt_vt); |
434 | 430 |
435 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); | 431 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); |
436 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); | 432 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); |
437 tmp8 = __msa_dotp_u_h(vec0, filt_vt); | 433 tmp8 = __msa_dotp_u_h(vec0, filt_vt); |
438 | 434 |
439 SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, FILTER_BITS); | 435 SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, FILTER_BITS); |
440 SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7); | |
441 PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1); | 436 PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1); |
442 ST8x4_UB(out0, out1, dst, dst_stride); | 437 ST8x4_UB(out0, out1, dst, dst_stride); |
443 dst += (4 * dst_stride); | 438 dst += (4 * dst_stride); |
444 } | 439 } |
445 } | 440 } |
446 | 441 |
447 static void common_hv_2ht_2vt_8w_msa(const uint8_t *src, int32_t src_stride, | 442 static void common_hv_2ht_2vt_8w_msa(const uint8_t *src, int32_t src_stride, |
448 uint8_t *dst, int32_t dst_stride, | 443 uint8_t *dst, int32_t dst_stride, |
449 int8_t *filter_horiz, int8_t *filter_vert, | 444 int8_t *filter_horiz, int8_t *filter_vert, |
450 int32_t height) { | 445 int32_t height) { |
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
485 for (loop_cnt = (height >> 2); loop_cnt--;) { | 480 for (loop_cnt = (height >> 2); loop_cnt--;) { |
486 LD_SB4(src, src_stride, src0, src2, src4, src6); | 481 LD_SB4(src, src_stride, src0, src2, src4, src6); |
487 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); | 482 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); |
488 src += (4 * src_stride); | 483 src += (4 * src_stride); |
489 | 484 |
490 hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); | 485 hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); |
491 hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); | 486 hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); |
492 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); | 487 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); |
493 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); | 488 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); |
494 SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); | 489 SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); |
495 SAT_UH2_UH(tmp1, tmp2, 7); | |
496 PCKEV_ST_SB(tmp1, tmp2, dst); | 490 PCKEV_ST_SB(tmp1, tmp2, dst); |
497 dst += dst_stride; | 491 dst += dst_stride; |
498 | 492 |
499 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); | 493 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); |
500 hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); | 494 hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); |
501 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); | 495 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); |
502 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); | 496 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); |
503 SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); | 497 SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); |
504 SAT_UH2_UH(tmp1, tmp2, 7); | |
505 PCKEV_ST_SB(tmp1, tmp2, dst); | 498 PCKEV_ST_SB(tmp1, tmp2, dst); |
506 dst += dst_stride; | 499 dst += dst_stride; |
507 | 500 |
508 hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); | 501 hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); |
509 hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); | 502 hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); |
510 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); | 503 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); |
511 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); | 504 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); |
512 SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); | 505 SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); |
513 SAT_UH2_UH(tmp1, tmp2, 7); | |
514 PCKEV_ST_SB(tmp1, tmp2, dst); | 506 PCKEV_ST_SB(tmp1, tmp2, dst); |
515 dst += dst_stride; | 507 dst += dst_stride; |
516 | 508 |
517 hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); | 509 hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); |
518 hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); | 510 hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); |
519 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); | 511 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); |
520 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); | 512 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); |
521 SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); | 513 SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); |
522 SAT_UH2_UH(tmp1, tmp2, 7); | |
523 PCKEV_ST_SB(tmp1, tmp2, dst); | 514 PCKEV_ST_SB(tmp1, tmp2, dst); |
524 dst += dst_stride; | 515 dst += dst_stride; |
525 } | 516 } |
526 } | 517 } |
527 | 518 |
528 static void common_hv_2ht_2vt_32w_msa(const uint8_t *src, int32_t src_stride, | 519 static void common_hv_2ht_2vt_32w_msa(const uint8_t *src, int32_t src_stride, |
529 uint8_t *dst, int32_t dst_stride, | 520 uint8_t *dst, int32_t dst_stride, |
530 int8_t *filter_horiz, int8_t *filter_vert, | 521 int8_t *filter_horiz, int8_t *filter_vert, |
531 int32_t height) { | 522 int32_t height) { |
532 int32_t multiple8_cnt; | 523 int32_t multiple8_cnt; |
(...skipping 18 matching lines...) Expand all Loading... |
551 } | 542 } |
552 } | 543 } |
553 | 544 |
554 void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, | 545 void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, |
555 uint8_t *dst, ptrdiff_t dst_stride, | 546 uint8_t *dst, ptrdiff_t dst_stride, |
556 const int16_t *filter_x, int32_t x_step_q4, | 547 const int16_t *filter_x, int32_t x_step_q4, |
557 const int16_t *filter_y, int32_t y_step_q4, | 548 const int16_t *filter_y, int32_t y_step_q4, |
558 int32_t w, int32_t h) { | 549 int32_t w, int32_t h) { |
559 int8_t cnt, filt_hor[8], filt_ver[8]; | 550 int8_t cnt, filt_hor[8], filt_ver[8]; |
560 | 551 |
561 if (16 != x_step_q4 || 16 != y_step_q4) { | 552 assert(x_step_q4 == 16); |
562 vpx_convolve8_c(src, src_stride, dst, dst_stride, | 553 assert(y_step_q4 == 16); |
563 filter_x, x_step_q4, filter_y, y_step_q4, | 554 assert(((const int32_t *)filter_x)[1] != 0x800000); |
564 w, h); | 555 assert(((const int32_t *)filter_y)[1] != 0x800000); |
565 return; | |
566 } | |
567 | |
568 if (((const int32_t *)filter_x)[1] == 0x800000 && | |
569 ((const int32_t *)filter_y)[1] == 0x800000) { | |
570 vpx_convolve_copy(src, src_stride, dst, dst_stride, | |
571 filter_x, x_step_q4, filter_y, y_step_q4, | |
572 w, h); | |
573 return; | |
574 } | |
575 | 556 |
576 for (cnt = 0; cnt < 8; ++cnt) { | 557 for (cnt = 0; cnt < 8; ++cnt) { |
577 filt_hor[cnt] = filter_x[cnt]; | 558 filt_hor[cnt] = filter_x[cnt]; |
578 filt_ver[cnt] = filter_y[cnt]; | 559 filt_ver[cnt] = filter_y[cnt]; |
579 } | 560 } |
580 | 561 |
581 if (((const int32_t *)filter_x)[0] == 0 && | 562 if (((const int32_t *)filter_x)[0] == 0 && |
582 ((const int32_t *)filter_y)[0] == 0) { | 563 ((const int32_t *)filter_y)[0] == 0) { |
583 switch (w) { | 564 switch (w) { |
584 case 4: | 565 case 4: |
(...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
645 filt_hor, filt_ver, (int32_t)h); | 626 filt_hor, filt_ver, (int32_t)h); |
646 break; | 627 break; |
647 default: | 628 default: |
648 vpx_convolve8_c(src, src_stride, dst, dst_stride, | 629 vpx_convolve8_c(src, src_stride, dst, dst_stride, |
649 filter_x, x_step_q4, filter_y, y_step_q4, | 630 filter_x, x_step_q4, filter_y, y_step_q4, |
650 w, h); | 631 w, h); |
651 break; | 632 break; |
652 } | 633 } |
653 } | 634 } |
654 } | 635 } |
OLD | NEW |