Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(16)

Unified Diff: source/libvpx/vp9/common/mips/msa/vp9_convolve_msa.h

Issue 1169543007: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: source/libvpx/vp9/common/mips/msa/vp9_convolve_msa.h
diff --git a/source/libvpx/vp9/common/mips/msa/vp9_convolve_msa.h b/source/libvpx/vp9/common/mips/msa/vp9_convolve_msa.h
index b109a4014f857d03f7fa41177a1b377a6d687286..40fe94d3be595c9c08b3bcad7c52baa79e445797 100644
--- a/source/libvpx/vp9/common/mips/msa/vp9_convolve_msa.h
+++ b/source/libvpx/vp9/common/mips/msa/vp9_convolve_msa.h
@@ -16,142 +16,104 @@
extern const uint8_t mc_filt_mask_arr[16 * 3];
-#define HORIZ_8TAP_FILT(src, mask0, mask1, mask2, mask3, \
- filt_h0, filt_h1, filt_h2, filt_h3) ({ \
- v8i16 vec0, vec1, vec2, vec3, horiz_out; \
- \
- vec0 = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src), (v16i8)(src)); \
- vec0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)(filt_h0)); \
- vec1 = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src), (v16i8)(src)); \
- vec0 = __msa_dpadd_s_h(vec0, (v16i8)(filt_h1), (v16i8)vec1); \
- vec2 = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src), (v16i8)(src)); \
- vec2 = __msa_dotp_s_h((v16i8)vec2, (v16i8)(filt_h2)); \
- vec3 = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src), (v16i8)(src)); \
- vec2 = __msa_dpadd_s_h(vec2, (v16i8)(filt_h3), (v16i8)vec3); \
- vec0 = __msa_adds_s_h(vec0, vec2); \
- horiz_out = SRARI_SATURATE_SIGNED_H(vec0, FILTER_BITS, 7); \
- \
- horiz_out; \
+#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, \
+ filt0, filt1, filt2, filt3) ({ \
+ v8i16 tmp0, tmp1; \
+ \
+ tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \
+ tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)vec1, (v16i8)filt1); \
+ tmp1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2); \
+ tmp1 = __msa_dpadd_s_h(tmp1, (v16i8)vec3, (v16i8)filt3); \
+ tmp0 = __msa_adds_s_h(tmp0, tmp1); \
+ \
+ tmp0; \
})
-#define HORIZ_8TAP_FILT_2VECS(src0, src1, mask0, mask1, mask2, mask3, \
- filt_h0, filt_h1, filt_h2, filt_h3) ({ \
- v8i16 vec0, vec1, vec2, vec3, horiz_out; \
- \
- vec0 = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src1), (v16i8)(src0)); \
- vec0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)(filt_h0)); \
- vec1 = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src1), (v16i8)(src0)); \
- vec0 = __msa_dpadd_s_h(vec0, (v16i8)(filt_h1), (v16i8)vec1); \
- vec2 = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src1), (v16i8)(src0)); \
- vec2 = __msa_dotp_s_h((v16i8)vec2, (v16i8)(filt_h2)); \
- vec3 = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src1), (v16i8)(src0)); \
- vec2 = __msa_dpadd_s_h(vec2, ((v16i8)filt_h3), (v16i8)vec3); \
- vec0 = __msa_adds_s_h(vec0, vec2); \
- horiz_out = (v8i16)SRARI_SATURATE_SIGNED_H(vec0, FILTER_BITS, 7); \
- \
- horiz_out; \
+#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, \
+ filt_h0, filt_h1, filt_h2, filt_h3) ({ \
+ v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \
+ v8i16 hz_out_m; \
+ \
+ VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3, \
+ vec0_m, vec1_m, vec2_m, vec3_m); \
+ hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m, \
+ filt_h0, filt_h1, filt_h2, filt_h3); \
+ \
+ hz_out_m = __msa_srari_h(hz_out_m, FILTER_BITS); \
+ hz_out_m = __msa_sat_s_h(hz_out_m, 7); \
+ \
+ hz_out_m; \
})
-#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, \
- filt0, filt1, filt2, filt3) ({ \
- v8i16 tmp0, tmp1; \
- \
- tmp0 = __msa_dotp_s_h((v16i8)(vec0), (v16i8)(filt0)); \
- tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)(vec1), (v16i8)(filt1)); \
- tmp1 = __msa_dotp_s_h((v16i8)(vec2), (v16i8)(filt2)); \
- tmp1 = __msa_dpadd_s_h(tmp1, (v16i8)(vec3), ((v16i8)filt3)); \
- tmp0 = __msa_adds_s_h(tmp0, tmp1); \
- \
- tmp0; \
-})
+#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \
+ mask0, mask1, mask2, mask3, \
+ filt0, filt1, filt2, filt3, \
+ out0, out1) { \
+ v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+ v8i16 res0_m, res1_m, res2_m, res3_m; \
+ \
+ VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
+ DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m); \
+ VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
+ DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m); \
+ VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \
+ DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m); \
+ VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \
+ DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m); \
+ ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1); \
+}
+
+#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \
+ mask0, mask1, mask2, mask3, \
+ filt0, filt1, filt2, filt3, \
+ out0, out1, out2, out3) { \
+ v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+ v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m; \
+ \
+ VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \
+ VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \
+ DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
+ res0_m, res1_m, res2_m, res3_m); \
+ VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \
+ VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \
+ DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \
+ res4_m, res5_m, res6_m, res7_m); \
+ VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \
+ VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \
+ DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \
+ res0_m, res1_m, res2_m, res3_m); \
+ VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \
+ VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \
+ DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \
+ res4_m, res5_m, res6_m, res7_m); \
+ ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m, \
+ res7_m, out0, out1, out2, out3); \
+}
+
+#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst) { \
+ v16u8 tmp_m; \
+ \
+ tmp_m = PCKEV_XORI128_UB(in1, in0); \
+ tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \
+ ST_UB(tmp_m, (pdst)); \
+}
-#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \
- mask0, mask1, mask2, mask3, \
- filt0, filt1, filt2, filt3, \
- out0, out1) { \
- v8i16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
- v8i16 res0_m, res1_m, res2_m, res3_m; \
- \
- vec0_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src1), (v16i8)(src0)); \
- vec1_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src3), (v16i8)(src2)); \
- \
- res0_m = __msa_dotp_s_h((v16i8)vec0_m, (v16i8)(filt0)); \
- res1_m = __msa_dotp_s_h((v16i8)vec1_m, (v16i8)(filt0)); \
- \
- vec2_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src1), (v16i8)(src0)); \
- vec3_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src3), (v16i8)(src2)); \
- \
- res0_m = __msa_dpadd_s_h(res0_m, (filt1), (v16i8)vec2_m); \
- res1_m = __msa_dpadd_s_h(res1_m, (filt1), (v16i8)vec3_m); \
- \
- vec4_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src1), (v16i8)(src0)); \
- vec5_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src3), (v16i8)(src2)); \
- \
- res2_m = __msa_dotp_s_h((v16i8)(filt2), (v16i8)vec4_m); \
- res3_m = __msa_dotp_s_h((v16i8)(filt2), (v16i8)vec5_m); \
- \
- vec6_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src1), (v16i8)(src0)); \
- vec7_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src3), (v16i8)(src2)); \
- \
- res2_m = __msa_dpadd_s_h(res2_m, (v16i8)(filt3), (v16i8)vec6_m); \
- res3_m = __msa_dpadd_s_h(res3_m, (v16i8)(filt3), (v16i8)vec7_m); \
- \
- out0 = __msa_adds_s_h(res0_m, res2_m); \
- out1 = __msa_adds_s_h(res1_m, res3_m); \
+#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst) { \
+ v16u8 tmp_m; \
+ \
+ tmp_m = (v16u8)__msa_pckev_b((v16i8)in0, (v16i8)in1); \
+ tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \
+ ST_UB(tmp_m, (pdst)); \
}
-#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \
- mask0, mask1, mask2, mask3, \
- filt0, filt1, filt2, filt3, \
- out0, out1, out2, out3) { \
- v8i16 vec0_m, vec1_m, vec2_m, vec3_m; \
- v8i16 vec4_m, vec5_m, vec6_m, vec7_m; \
- v8i16 res0_m, res1_m, res2_m, res3_m; \
- v8i16 res4_m, res5_m, res6_m, res7_m; \
- \
- vec0_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src0), (v16i8)(src0)); \
- vec1_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src1), (v16i8)(src1)); \
- vec2_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src2), (v16i8)(src2)); \
- vec3_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src3), (v16i8)(src3)); \
- \
- res0_m = __msa_dotp_s_h((v16i8)vec0_m, (v16i8)(filt0)); \
- res1_m = __msa_dotp_s_h((v16i8)vec1_m, (v16i8)(filt0)); \
- res2_m = __msa_dotp_s_h((v16i8)vec2_m, (v16i8)(filt0)); \
- res3_m = __msa_dotp_s_h((v16i8)vec3_m, (v16i8)(filt0)); \
- \
- vec0_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src0), (v16i8)(src0)); \
- vec1_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src1), (v16i8)(src1)); \
- vec2_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src2), (v16i8)(src2)); \
- vec3_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src3), (v16i8)(src3)); \
- \
- res4_m = __msa_dotp_s_h((v16i8)vec0_m, (v16i8)(filt2)); \
- res5_m = __msa_dotp_s_h((v16i8)vec1_m, (v16i8)(filt2)); \
- res6_m = __msa_dotp_s_h((v16i8)vec2_m, (v16i8)(filt2)); \
- res7_m = __msa_dotp_s_h((v16i8)vec3_m, (v16i8)(filt2)); \
- \
- vec4_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src0), (v16i8)(src0)); \
- vec5_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src1), (v16i8)(src1)); \
- vec6_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src2), (v16i8)(src2)); \
- vec7_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src3), (v16i8)(src3)); \
- \
- res0_m = __msa_dpadd_s_h(res0_m, (v16i8)(filt1), (v16i8)vec4_m); \
- res1_m = __msa_dpadd_s_h(res1_m, (v16i8)(filt1), (v16i8)vec5_m); \
- res2_m = __msa_dpadd_s_h(res2_m, (v16i8)(filt1), (v16i8)vec6_m); \
- res3_m = __msa_dpadd_s_h(res3_m, (v16i8)(filt1), (v16i8)vec7_m); \
- \
- vec4_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src0), (v16i8)(src0)); \
- vec5_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src1), (v16i8)(src1)); \
- vec6_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src2), (v16i8)(src2)); \
- vec7_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src3), (v16i8)(src3)); \
- \
- res4_m = __msa_dpadd_s_h(res4_m, (v16i8)(filt3), (v16i8)vec4_m); \
- res5_m = __msa_dpadd_s_h(res5_m, (v16i8)(filt3), (v16i8)vec5_m); \
- res6_m = __msa_dpadd_s_h(res6_m, (v16i8)(filt3), (v16i8)vec6_m); \
- res7_m = __msa_dpadd_s_h(res7_m, (v16i8)(filt3), (v16i8)vec7_m); \
- \
- out0 = __msa_adds_s_h(res0_m, res4_m); \
- out1 = __msa_adds_s_h(res1_m, res5_m); \
- out2 = __msa_adds_s_h(res2_m, res6_m); \
- out3 = __msa_adds_s_h(res3_m, res7_m); \
+#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3, \
+ pdst, stride) { \
+ v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ uint8_t *pdst_m = (uint8_t *)(pdst); \
+ \
+ PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m); \
+ PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \
+ AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \
+ ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \
}
#endif /* VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_ */
« no previous file with comments | « source/libvpx/vp9/common/mips/msa/vp9_convolve_copy_msa.c ('k') | source/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698