| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 #ifndef VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_ | 11 #ifndef VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_ |
| 12 #define VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_ | 12 #define VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_ |
| 13 | 13 |
| 14 #include "vp9/common/vp9_filter.h" | 14 #include "vp9/common/vp9_filter.h" |
| 15 #include "vp9/common/mips/msa/vp9_macros_msa.h" | 15 #include "vp9/common/mips/msa/vp9_macros_msa.h" |
| 16 | 16 |
| 17 extern const uint8_t mc_filt_mask_arr[16 * 3]; | 17 extern const uint8_t mc_filt_mask_arr[16 * 3]; |
| 18 | 18 |
/* Apply an 8-tap filter as four 2-tap dot products with accumulation.
 *
 * vec0..vec3  : byte vectors holding pixel pairs for taps {0,1},{2,3},
 *               {4,5},{6,7} respectively.
 * filt0..filt3: byte vectors holding the corresponding filter-tap pairs.
 *
 * Evaluates to a v8i16 of eight signed 16-bit filtered sums; the two
 * partial sums are combined with saturating addition (__msa_adds_s_h)
 * so intermediate overflow clamps instead of wrapping.
 */
#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3,             \
                            filt0, filt1, filt2, filt3) ({      \
  v8i16 tmp0, tmp1;                                             \
                                                                \
  tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0);             \
  tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)vec1, (v16i8)filt1);      \
  tmp1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2);             \
  tmp1 = __msa_dpadd_s_h(tmp1, (v16i8)vec3, (v16i8)filt3);      \
  tmp0 = __msa_adds_s_h(tmp0, tmp1);                            \
                                                                \
  tmp0;                                                         \
})
| 36 | 31 |
/* Horizontal 8-tap filtering of one row pair.
 *
 * src0/src1     : source byte vectors (src1 may equal src0 for 8-wide use;
 *                 the shuffle masks select the sliding 8-pixel windows).
 * mask0..mask3  : vshf.b control vectors pairing pixels per tap pair.
 * filt_h0..h3   : filter-tap pair vectors.
 *
 * Evaluates to a v8i16 of filtered values, rounded with a
 * round-to-nearest arithmetic shift by FILTER_BITS (__msa_srari_h)
 * and saturated to the signed range of 8 bits (__msa_sat_s_h(.., 7)).
 */
#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3,        \
                        filt_h0, filt_h1, filt_h2, filt_h3) ({         \
  v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                \
  v8i16 hz_out_m;                                                      \
                                                                       \
  VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,                   \
             vec0_m, vec1_m, vec2_m, vec3_m);                          \
  hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m,       \
                                 filt_h0, filt_h1, filt_h2, filt_h3);  \
                                                                       \
  hz_out_m = __msa_srari_h(hz_out_m, FILTER_BITS);                     \
  hz_out_m = __msa_sat_s_h(hz_out_m, 7);                               \
                                                                       \
  hz_out_m;                                                            \
})
| 54 | 47 |
/* Horizontal 8-tap filtering of four 4-wide rows at once.
 *
 * src0..src3   : four source byte vectors; rows are processed in pairs
 *                ((src0,src1) and (src2,src3)) via the 2-source shuffles.
 * mask0..mask3 : vshf.b controls selecting pixel pairs for each tap pair.
 * filt0..filt3 : filter-tap pair vectors.
 * out0, out1   : v8i16 results (one per row pair), written by the macro.
 *
 * Taps {0,1} and {2,3} accumulate into res0/res1, taps {4,5} and {6,7}
 * into res2/res3; the halves are combined with saturating addition.
 * No rounding/shift is applied here — callers finish the pipeline.
 */
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                \
                                   mask0, mask1, mask2, mask3,            \
                                   filt0, filt1, filt2, filt3,            \
                                   out0, out1) {                          \
  v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;   \
  v8i16 res0_m, res1_m, res2_m, res3_m;                                   \
                                                                          \
  VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);       \
  DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m);              \
  VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);       \
  DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m);             \
  VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);       \
  DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m);              \
  VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m);       \
  DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m);             \
  ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1);                \
}
| 102 | 65 |
/* Horizontal 8-tap filtering of four 8-wide rows at once.
 *
 * src0..src3   : four source byte vectors, one per row; each row is
 *                shuffled against itself to form the sliding windows.
 * mask0..mask3 : vshf.b controls selecting pixel pairs for each tap pair.
 * filt0..filt3 : filter-tap pair vectors.
 * out0..out3   : v8i16 results (one per row), written by the macro.
 *
 * Taps {0,1} accumulate into res0..res3 and taps {2,3} are folded in via
 * dpadd; taps {4,5}/{6,7} build res4..res7 the same way.  The two halves
 * are combined with saturating addition; no rounding/shift is done here.
 */
#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                 \
                                   mask0, mask1, mask2, mask3,             \
                                   filt0, filt1, filt2, filt3,             \
                                   out0, out1, out2, out3) {               \
  v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;    \
  v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m;    \
                                                                           \
  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);        \
  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);        \
  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,  \
              res0_m, res1_m, res2_m, res3_m);                             \
  VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);        \
  VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);        \
  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2,  \
              res4_m, res5_m, res6_m, res7_m);                             \
  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);        \
  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);        \
  DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \
               res0_m, res1_m, res2_m, res3_m);                            \
  VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);        \
  VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);        \
  DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \
               res4_m, res5_m, res6_m, res7_m);                            \
  ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m,      \
              res7_m, out0, out1, out2, out3);                             \
}
| 129 res6_m = __msa_dotp_s_h((v16i8)vec2_m, (v16i8)(filt2)); \ | 92 |
/* Pack two halfword vectors to bytes with a 128 XOR (sign-flip back to
 * unsigned — via PCKEV_XORI128_UB), average with the existing destination
 * pixels in `dst`, and store 16 bytes to `pdst`.
 */
#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst) {  \
  v16u8 tmp_m;                                          \
                                                        \
  tmp_m = PCKEV_XORI128_UB(in1, in0);                   \
  tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst);            \
  ST_UB(tmp_m, (pdst));                                 \
}
| 137 res0_m = __msa_dpadd_s_h(res0_m, (v16i8)(filt1), (v16i8)vec4_m); \ | 100 |
/* Pack the even bytes of `in0`/`in1`, average the packed vector with the
 * existing destination pixels in `dst`, and store 16 bytes to `pdst`.
 * Note the pckev operand order: `in0` supplies the high half, `in1` the low.
 */
#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst) {           \
  v16u8 tmp_m;                                           \
                                                         \
  tmp_m = (v16u8)__msa_pckev_b((v16i8)in0, (v16i8)in1);  \
  tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst);             \
  ST_UB(tmp_m, (pdst));                                  \
}
| 145 vec7_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src3), (v16i8)(src3)); \ | 108 |
| 146 \ | 109 #define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3, \ |
| 147 res4_m = __msa_dpadd_s_h(res4_m, (v16i8)(filt3), (v16i8)vec4_m); \ | 110 pdst, stride) { \ |
| 148 res5_m = __msa_dpadd_s_h(res5_m, (v16i8)(filt3), (v16i8)vec5_m); \ | 111 v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ |
| 149 res6_m = __msa_dpadd_s_h(res6_m, (v16i8)(filt3), (v16i8)vec6_m); \ | 112 uint8_t *pdst_m = (uint8_t *)(pdst); \ |
| 150 res7_m = __msa_dpadd_s_h(res7_m, (v16i8)(filt3), (v16i8)vec7_m); \ | 113 \ |
| 151 \ | 114 PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m); \ |
| 152 out0 = __msa_adds_s_h(res0_m, res4_m); \ | 115 PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ |
| 153 out1 = __msa_adds_s_h(res1_m, res5_m); \ | 116 AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ |
| 154 out2 = __msa_adds_s_h(res2_m, res6_m); \ | 117 ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ |
| 155 out3 = __msa_adds_s_h(res3_m, res7_m); \ | |
| 156 } | 118 } |
| 157 #endif /* VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_ */ | 119 #endif /* VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_ */ |
| OLD | NEW |