| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 #include <assert.h> |
| 11 #include "./vpx_dsp_rtcd.h" | 12 #include "./vpx_dsp_rtcd.h" |
| 12 #include "vpx_dsp/mips/vpx_convolve_msa.h" | 13 #include "vpx_dsp/mips/vpx_convolve_msa.h" |
| 13 | 14 |
| 14 const uint8_t mc_filt_mask_arr[16 * 3] = { | 15 const uint8_t mc_filt_mask_arr[16 * 3] = { |
| 15 /* 8 width cases */ | 16 /* 8 width cases */ |
| 16 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, | 17 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, |
| 17 /* 4 width cases */ | 18 /* 4 width cases */ |
| 18 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, | 19 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, |
| 19 /* 4 width cases */ | 20 /* 4 width cases */ |
| 20 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 | 21 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 |
| (...skipping 228 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 249 LD_SB5(src, src_stride, src0, src1, src2, src3, src4); | 250 LD_SB5(src, src_stride, src0, src1, src2, src3, src4); |
| 250 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); | 251 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); |
| 251 hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); | 252 hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); |
| 252 hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); | 253 hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); |
| 253 hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); | 254 hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); |
| 254 hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); | 255 hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); |
| 255 | 256 |
| 256 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); | 257 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); |
| 257 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); | 258 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); |
| 258 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); | 259 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); |
| 259 SAT_UH2_UH(tmp0, tmp1, 7); | |
| 260 PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); | 260 PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); |
| 261 ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); | 261 ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); |
| 262 } | 262 } |
| 263 | 263 |
| 264 static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride, | 264 static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride, |
| 265 uint8_t *dst, int32_t dst_stride, | 265 uint8_t *dst, int32_t dst_stride, |
| 266 int8_t *filter_horiz, | 266 int8_t *filter_horiz, |
| 267 int8_t *filter_vert) { | 267 int8_t *filter_vert) { |
| 268 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; | 268 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; |
| 269 v16i8 res0, res1, res2, res3; | 269 v16i8 res0, res1, res2, res3; |
| (...skipping 21 matching lines...) Expand all Loading... |
| 291 hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS); | 291 hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS); |
| 292 SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1, | 292 SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1, |
| 293 hz_out3, hz_out5, 8); | 293 hz_out3, hz_out5, 8); |
| 294 hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6); | 294 hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6); |
| 295 | 295 |
| 296 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); | 296 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); |
| 297 ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3); | 297 ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3); |
| 298 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, | 298 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, |
| 299 vec4, vec5, vec6, vec7); | 299 vec4, vec5, vec6, vec7); |
| 300 SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); | 300 SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); |
| 301 SAT_UH4_UH(vec4, vec5, vec6, vec7, 7); | |
| 302 PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, | 301 PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, |
| 303 res2, res3); | 302 res2, res3); |
| 304 ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); | 303 ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); |
| 305 dst += (4 * dst_stride); | 304 dst += (4 * dst_stride); |
| 306 ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); | 305 ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); |
| 307 } | 306 } |
| 308 | 307 |
| 309 static void common_hv_2ht_2vt_4w_msa(const uint8_t *src, int32_t src_stride, | 308 static void common_hv_2ht_2vt_4w_msa(const uint8_t *src, int32_t src_stride, |
| 310 uint8_t *dst, int32_t dst_stride, | 309 uint8_t *dst, int32_t dst_stride, |
| 311 int8_t *filter_horiz, int8_t *filter_vert, | 310 int8_t *filter_horiz, int8_t *filter_vert, |
| (...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 350 | 349 |
| 351 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); | 350 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); |
| 352 vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); | 351 vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); |
| 353 tmp2 = __msa_dotp_u_h(vec2, filt_vt); | 352 tmp2 = __msa_dotp_u_h(vec2, filt_vt); |
| 354 | 353 |
| 355 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); | 354 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); |
| 356 vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); | 355 vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); |
| 357 tmp3 = __msa_dotp_u_h(vec3, filt_vt); | 356 tmp3 = __msa_dotp_u_h(vec3, filt_vt); |
| 358 | 357 |
| 359 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); | 358 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); |
| 360 SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); | |
| 361 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); | 359 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); |
| 362 ST8x4_UB(out0, out1, dst, dst_stride); | 360 ST8x4_UB(out0, out1, dst, dst_stride); |
| 363 } | 361 } |
| 364 | 362 |
| 365 static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, | 363 static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, |
| 366 int32_t src_stride, | 364 int32_t src_stride, |
| 367 uint8_t *dst, | 365 uint8_t *dst, |
| 368 int32_t dst_stride, | 366 int32_t dst_stride, |
| 369 int8_t *filter_horiz, | 367 int8_t *filter_horiz, |
| 370 int8_t *filter_vert, | 368 int8_t *filter_vert, |
| (...skipping 24 matching lines...) Expand all Loading... |
| 395 | 393 |
| 396 hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); | 394 hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); |
| 397 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); | 395 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); |
| 398 tmp1 = __msa_dotp_u_h(vec0, filt_vt); | 396 tmp1 = __msa_dotp_u_h(vec0, filt_vt); |
| 399 | 397 |
| 400 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); | 398 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); |
| 401 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); | 399 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); |
| 402 tmp2 = __msa_dotp_u_h(vec0, filt_vt); | 400 tmp2 = __msa_dotp_u_h(vec0, filt_vt); |
| 403 | 401 |
| 404 SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); | 402 SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); |
| 405 SAT_UH2_UH(tmp1, tmp2, 7); | |
| 406 | 403 |
| 407 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); | 404 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); |
| 408 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); | 405 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); |
| 409 tmp3 = __msa_dotp_u_h(vec0, filt_vt); | 406 tmp3 = __msa_dotp_u_h(vec0, filt_vt); |
| 410 | 407 |
| 411 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); | 408 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); |
| 412 LD_SB4(src, src_stride, src1, src2, src3, src4); | 409 LD_SB4(src, src_stride, src1, src2, src3, src4); |
| 413 src += (4 * src_stride); | 410 src += (4 * src_stride); |
| 414 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); | 411 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); |
| 415 tmp4 = __msa_dotp_u_h(vec0, filt_vt); | 412 tmp4 = __msa_dotp_u_h(vec0, filt_vt); |
| 416 | 413 |
| 417 SRARI_H2_UH(tmp3, tmp4, FILTER_BITS); | 414 SRARI_H2_UH(tmp3, tmp4, FILTER_BITS); |
| 418 SAT_UH2_UH(tmp3, tmp4, 7); | |
| 419 PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1); | 415 PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1); |
| 420 ST8x4_UB(out0, out1, dst, dst_stride); | 416 ST8x4_UB(out0, out1, dst, dst_stride); |
| 421 dst += (4 * dst_stride); | 417 dst += (4 * dst_stride); |
| 422 | 418 |
| 423 hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); | 419 hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); |
| 424 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); | 420 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); |
| 425 tmp5 = __msa_dotp_u_h(vec0, filt_vt); | 421 tmp5 = __msa_dotp_u_h(vec0, filt_vt); |
| 426 | 422 |
| 427 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); | 423 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); |
| 428 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); | 424 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); |
| 429 tmp6 = __msa_dotp_u_h(vec0, filt_vt); | 425 tmp6 = __msa_dotp_u_h(vec0, filt_vt); |
| 430 | 426 |
| 431 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); | 427 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); |
| 432 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); | 428 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); |
| 433 tmp7 = __msa_dotp_u_h(vec0, filt_vt); | 429 tmp7 = __msa_dotp_u_h(vec0, filt_vt); |
| 434 | 430 |
| 435 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); | 431 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); |
| 436 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); | 432 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); |
| 437 tmp8 = __msa_dotp_u_h(vec0, filt_vt); | 433 tmp8 = __msa_dotp_u_h(vec0, filt_vt); |
| 438 | 434 |
| 439 SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, FILTER_BITS); | 435 SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, FILTER_BITS); |
| 440 SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7); | |
| 441 PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1); | 436 PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1); |
| 442 ST8x4_UB(out0, out1, dst, dst_stride); | 437 ST8x4_UB(out0, out1, dst, dst_stride); |
| 443 dst += (4 * dst_stride); | 438 dst += (4 * dst_stride); |
| 444 } | 439 } |
| 445 } | 440 } |
| 446 | 441 |
| 447 static void common_hv_2ht_2vt_8w_msa(const uint8_t *src, int32_t src_stride, | 442 static void common_hv_2ht_2vt_8w_msa(const uint8_t *src, int32_t src_stride, |
| 448 uint8_t *dst, int32_t dst_stride, | 443 uint8_t *dst, int32_t dst_stride, |
| 449 int8_t *filter_horiz, int8_t *filter_vert, | 444 int8_t *filter_horiz, int8_t *filter_vert, |
| 450 int32_t height) { | 445 int32_t height) { |
| (...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 485 for (loop_cnt = (height >> 2); loop_cnt--;) { | 480 for (loop_cnt = (height >> 2); loop_cnt--;) { |
| 486 LD_SB4(src, src_stride, src0, src2, src4, src6); | 481 LD_SB4(src, src_stride, src0, src2, src4, src6); |
| 487 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); | 482 LD_SB4(src + 8, src_stride, src1, src3, src5, src7); |
| 488 src += (4 * src_stride); | 483 src += (4 * src_stride); |
| 489 | 484 |
| 490 hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); | 485 hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); |
| 491 hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); | 486 hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); |
| 492 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); | 487 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); |
| 493 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); | 488 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); |
| 494 SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); | 489 SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); |
| 495 SAT_UH2_UH(tmp1, tmp2, 7); | |
| 496 PCKEV_ST_SB(tmp1, tmp2, dst); | 490 PCKEV_ST_SB(tmp1, tmp2, dst); |
| 497 dst += dst_stride; | 491 dst += dst_stride; |
| 498 | 492 |
| 499 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); | 493 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); |
| 500 hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); | 494 hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); |
| 501 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); | 495 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); |
| 502 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); | 496 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); |
| 503 SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); | 497 SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); |
| 504 SAT_UH2_UH(tmp1, tmp2, 7); | |
| 505 PCKEV_ST_SB(tmp1, tmp2, dst); | 498 PCKEV_ST_SB(tmp1, tmp2, dst); |
| 506 dst += dst_stride; | 499 dst += dst_stride; |
| 507 | 500 |
| 508 hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); | 501 hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); |
| 509 hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); | 502 hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); |
| 510 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); | 503 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); |
| 511 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); | 504 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); |
| 512 SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); | 505 SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); |
| 513 SAT_UH2_UH(tmp1, tmp2, 7); | |
| 514 PCKEV_ST_SB(tmp1, tmp2, dst); | 506 PCKEV_ST_SB(tmp1, tmp2, dst); |
| 515 dst += dst_stride; | 507 dst += dst_stride; |
| 516 | 508 |
| 517 hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); | 509 hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); |
| 518 hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); | 510 hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); |
| 519 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); | 511 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); |
| 520 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); | 512 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); |
| 521 SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); | 513 SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); |
| 522 SAT_UH2_UH(tmp1, tmp2, 7); | |
| 523 PCKEV_ST_SB(tmp1, tmp2, dst); | 514 PCKEV_ST_SB(tmp1, tmp2, dst); |
| 524 dst += dst_stride; | 515 dst += dst_stride; |
| 525 } | 516 } |
| 526 } | 517 } |
| 527 | 518 |
| 528 static void common_hv_2ht_2vt_32w_msa(const uint8_t *src, int32_t src_stride, | 519 static void common_hv_2ht_2vt_32w_msa(const uint8_t *src, int32_t src_stride, |
| 529 uint8_t *dst, int32_t dst_stride, | 520 uint8_t *dst, int32_t dst_stride, |
| 530 int8_t *filter_horiz, int8_t *filter_vert, | 521 int8_t *filter_horiz, int8_t *filter_vert, |
| 531 int32_t height) { | 522 int32_t height) { |
| 532 int32_t multiple8_cnt; | 523 int32_t multiple8_cnt; |
| (...skipping 18 matching lines...) Expand all Loading... |
| 551 } | 542 } |
| 552 } | 543 } |
| 553 | 544 |
| 554 void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, | 545 void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, |
| 555 uint8_t *dst, ptrdiff_t dst_stride, | 546 uint8_t *dst, ptrdiff_t dst_stride, |
| 556 const int16_t *filter_x, int32_t x_step_q4, | 547 const int16_t *filter_x, int32_t x_step_q4, |
| 557 const int16_t *filter_y, int32_t y_step_q4, | 548 const int16_t *filter_y, int32_t y_step_q4, |
| 558 int32_t w, int32_t h) { | 549 int32_t w, int32_t h) { |
| 559 int8_t cnt, filt_hor[8], filt_ver[8]; | 550 int8_t cnt, filt_hor[8], filt_ver[8]; |
| 560 | 551 |
| 561 if (16 != x_step_q4 || 16 != y_step_q4) { | 552 assert(x_step_q4 == 16); |
| 562 vpx_convolve8_c(src, src_stride, dst, dst_stride, | 553 assert(y_step_q4 == 16); |
| 563 filter_x, x_step_q4, filter_y, y_step_q4, | 554 assert(((const int32_t *)filter_x)[1] != 0x800000); |
| 564 w, h); | 555 assert(((const int32_t *)filter_y)[1] != 0x800000); |
| 565 return; | |
| 566 } | |
| 567 | |
| 568 if (((const int32_t *)filter_x)[1] == 0x800000 && | |
| 569 ((const int32_t *)filter_y)[1] == 0x800000) { | |
| 570 vpx_convolve_copy(src, src_stride, dst, dst_stride, | |
| 571 filter_x, x_step_q4, filter_y, y_step_q4, | |
| 572 w, h); | |
| 573 return; | |
| 574 } | |
| 575 | 556 |
| 576 for (cnt = 0; cnt < 8; ++cnt) { | 557 for (cnt = 0; cnt < 8; ++cnt) { |
| 577 filt_hor[cnt] = filter_x[cnt]; | 558 filt_hor[cnt] = filter_x[cnt]; |
| 578 filt_ver[cnt] = filter_y[cnt]; | 559 filt_ver[cnt] = filter_y[cnt]; |
| 579 } | 560 } |
| 580 | 561 |
| 581 if (((const int32_t *)filter_x)[0] == 0 && | 562 if (((const int32_t *)filter_x)[0] == 0 && |
| 582 ((const int32_t *)filter_y)[0] == 0) { | 563 ((const int32_t *)filter_y)[0] == 0) { |
| 583 switch (w) { | 564 switch (w) { |
| 584 case 4: | 565 case 4: |
| (...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 645 filt_hor, filt_ver, (int32_t)h); | 626 filt_hor, filt_ver, (int32_t)h); |
| 646 break; | 627 break; |
| 647 default: | 628 default: |
| 648 vpx_convolve8_c(src, src_stride, dst, dst_stride, | 629 vpx_convolve8_c(src, src_stride, dst, dst_stride, |
| 649 filter_x, x_step_q4, filter_y, y_step_q4, | 630 filter_x, x_step_q4, filter_y, y_step_q4, |
| 650 w, h); | 631 w, h); |
| 651 break; | 632 break; |
| 652 } | 633 } |
| 653 } | 634 } |
| 654 } | 635 } |
| OLD | NEW |