| OLD | NEW |
| (Empty) | |
| 1 /* |
| 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. |
| 3 * |
| 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ |
| 10 |
| 11 #include "./vp9_rtcd.h" |
| 12 #include "vp9/common/mips/msa/vp9_convolve_msa.h" |
| 13 |
/* 8-tap horizontal followed by 8-tap vertical filtering of a 4-pixel-wide
 * column, with the filtered result averaged into the existing dst pixels.
 *
 * src, src_stride   source pixels and row pitch; this function steps the
 *                   pointer back 3 columns and 3 rows itself before loading.
 * dst, dst_stride   destination; read (for the average) and then overwritten.
 * filter_horiz,
 * filter_vert       int8_t filter taps; each is loaded as a vector and split
 *                   into four halfword splats (elements 0..3).
 * height            rows to produce. Rows are produced four at a time
 *                   (height >> 2 iterations), so a remainder of height % 4
 *                   rows is never written.
 */
static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  int8_t *filter_horiz,
                                                  int8_t *filter_vert,
                                                  int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1;
  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4;
  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;

  /* Shuffle mask taken from offset 16 of mc_filt_mask_arr (table defined
   * elsewhere; the other 4-wide kernels in this file use the same offset). */
  mask0 = LD_UB(&mc_filt_mask_arr[16]);
  /* Back up 3 columns and 3 rows so the 8-tap windows have their leading
   * context available. */
  src -= (3 + 3 * src_stride);

  /* rearranging filter */
  filt = LD_SH(filter_horiz);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);

  /* The remaining masks are mask0 with every element offset by 2, 4 and 6. */
  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  /* Prime the pipeline with the first seven source rows. */
  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
  /* NOTE(review): by its name this XORs each byte with 128, presumably to
   * move unsigned pixels into signed range for the signed filter -- confirm
   * against the macro definition. */
  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
  src += (7 * src_stride);

  /* Each horizontal call is handed two source rows at once. */
  hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  /* hz_out5 (rows 5 and 6) is the horizontal result carried into the loop. */
  hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  /* Derive the odd-numbered outputs by sliding adjacent even outputs by
   * 8 bytes. */
  SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);

  filt = LD_SH(filter_vert);
  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);

  /* Interleave neighbouring horizontal results; vec0..vec2 form the
   * vertical-filter history consumed and rotated by the loop below. */
  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
  vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);

  /* Four output rows per iteration. */
  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src7, src8, src9, src10);
    XORI_B4_128_SB(src7, src8, src9, src10);
    src += (4 * src_stride);

    /* Current destination rows, needed for the final average. */
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
    vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
    res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

    hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
    vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
    res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);
    /* Combine dst rows pairwise (0 with 1, 2 with 3) into dst0 and dst2. */
    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);

    /* Round by FILTER_BITS, saturate, pack to bytes, XOR with 128 again,
     * then average with the destination and store a 4x4 block. */
    SRARI_H2_SH(res0, res1, FILTER_BITS);
    SAT_SH2_SH(res0, res1, 7);
    PCKEV_B2_UB(res0, res0, res1, res1, tmp0, tmp1);
    XORI_B2_128_UB(tmp0, tmp1);
    AVER_UB2_UB(tmp0, dst0, tmp1, dst2, tmp0, tmp1);
    ST4x4_UB(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Rotate the history so the next iteration continues from these rows. */
    hz_out5 = hz_out9;
    vec0 = vec2;
    vec1 = vec3;
    vec2 = vec4;
  }
}
| 95 |
/* 8-tap horizontal followed by 8-tap vertical filtering of an 8-pixel-wide
 * column, with the filtered result averaged into the existing dst pixels.
 *
 * src, src_stride   source pixels and row pitch; this function steps the
 *                   pointer back 3 columns and 3 rows itself before loading.
 * dst, dst_stride   destination; read (for the average) and then overwritten.
 * filter_horiz,
 * filter_vert       int8_t filter taps; each is loaded as a vector and split
 *                   into four halfword splats (elements 0..3).
 * height            rows to produce. Rows are produced four at a time
 *                   (height >> 2 iterations), so a remainder of height % 4
 *                   rows is never written.
 */
static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  int8_t *filter_horiz,
                                                  int8_t *filter_vert,
                                                  int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
  v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3;
  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
  v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;

  /* Shuffle mask taken from offset 0 of mc_filt_mask_arr (table defined
   * elsewhere; the 4-wide kernels use offset 16 instead). */
  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  /* Back up 3 columns and 3 rows so the 8-tap windows have their leading
   * context available. */
  src -= (3 + 3 * src_stride);

  /* rearranging filter */
  filt = LD_SH(filter_horiz);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);

  /* The remaining masks are mask0 with every element offset by 2, 4 and 6. */
  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  /* Prime the pipeline with the first seven source rows. */
  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
  src += (7 * src_stride);

  /* NOTE(review): by its name this XORs each byte with 128, presumably to
   * move unsigned pixels into signed range -- confirm against the macro. */
  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
  /* One row per horizontal call here: the same vector is passed for both
   * source operands. */
  hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);

  filt = LD_SH(filter_vert);
  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);

  /* Two interleaved histories are kept: out0/out1/out2 pair rows
   * (0,1),(2,3),(4,5) and out4/out5/out6 pair rows (1,2),(3,4),(5,6). */
  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
  ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
  ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);

  /* Four output rows per iteration. */
  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src7, src8, src9, src10);
    XORI_B4_128_SB(src7, src8, src9, src10);
    src += (4 * src_stride);

    /* Current destination rows, needed for the final average. */
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

    /* Output row 0: first history plus the newly filtered row 7. */
    hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
    tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

    /* Output row 1: second history plus row 8. */
    hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
    tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

    /* Output row 2: first history advanced by one pair, plus row 9. */
    hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
    tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

    /* Output row 3: second history advanced by one pair, plus row 10. */
    hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
                               filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9);
    tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

    /* Round by FILTER_BITS and saturate; the macro below then handles the
     * conversion, the average with dst0..dst3 and the 8x4 store. */
    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
    CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3,
                            dst, dst_stride);
    dst += (4 * dst_stride);

    /* Rotate both histories so the next iteration continues from here. */
    hz_out6 = hz_out10;
    out0 = out2;
    out1 = out3;
    out2 = out8;
    out4 = out6;
    out5 = out7;
    out6 = out9;
  }
}
| 195 |
/* 16-pixel-wide 8-tap H+V filter with averaging: applies the 8-wide kernel
 * to the two adjacent 8-pixel columns, left to right. */
static void common_hv_8ht_8vt_and_aver_dst_16w_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int8_t *filter_horiz,
                                                   int8_t *filter_vert,
                                                   int32_t height) {
  int32_t col_offset;

  for (col_offset = 0; col_offset < 16; col_offset += 8) {
    common_hv_8ht_8vt_and_aver_dst_8w_msa(src + col_offset, src_stride,
                                          dst + col_offset, dst_stride,
                                          filter_horiz, filter_vert, height);
  }
}
| 211 |
/* 32-pixel-wide 8-tap H+V filter with averaging: applies the 8-wide kernel
 * to the four adjacent 8-pixel columns, left to right. */
static void common_hv_8ht_8vt_and_aver_dst_32w_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int8_t *filter_horiz,
                                                   int8_t *filter_vert,
                                                   int32_t height) {
  int32_t col_offset;

  for (col_offset = 0; col_offset < 32; col_offset += 8) {
    common_hv_8ht_8vt_and_aver_dst_8w_msa(src + col_offset, src_stride,
                                          dst + col_offset, dst_stride,
                                          filter_horiz, filter_vert, height);
  }
}
| 227 |
/* 64-pixel-wide 8-tap H+V filter with averaging: applies the 8-wide kernel
 * to the eight adjacent 8-pixel columns, left to right. */
static void common_hv_8ht_8vt_and_aver_dst_64w_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int8_t *filter_horiz,
                                                   int8_t *filter_vert,
                                                   int32_t height) {
  int32_t col_offset;

  for (col_offset = 0; col_offset < 64; col_offset += 8) {
    common_hv_8ht_8vt_and_aver_dst_8w_msa(src + col_offset, src_stride,
                                          dst + col_offset, dst_stride,
                                          filter_horiz, filter_vert, height);
  }
}
| 243 |
/* 2-tap horizontal followed by 2-tap vertical filtering of a 4x4 block, with
 * the result averaged into the existing dst pixels.
 *
 * src, src_stride   source pixels and row pitch; used as passed in (no
 *                   pointer adjustment is made here).
 * dst, dst_stride   destination; read (for the average) and then overwritten.
 * filter_horiz,
 * filter_vert       pointers to the filter taps; only halfword element 0 of
 *                   each loaded vector is used (splatted across the vector).
 */
static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int8_t *filter_horiz,
                                                   int8_t *filter_vert) {
  v16i8 src0, src1, src2, src3, src4, mask;
  v16u8 filt_hz, filt_vt, vec0, vec1;
  v16u8 dst0, dst1, dst2, dst3, res0, res1;
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt;

  /* Shuffle mask taken from offset 16 of mc_filt_mask_arr (defined
   * elsewhere). */
  mask = LD_SB(&mc_filt_mask_arr[16]);

  /* rearranging filter */
  filt = LD_UH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);

  filt = LD_UH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);

  /* Five source rows are loaded to produce four output rows. */
  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);

  /* Horizontal pass, two rows per call; the last row is passed twice. */
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
  /* Build the intermediate vectors that pair each row with the next one. */
  hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
  hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);

  /* Fetch the destination rows and combine them pairwise for the average. */
  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
  /* Vertical pass: unsigned dot product with the vertical taps, then round
   * by FILTER_BITS, saturate, pack to bytes, average with dst and store. */
  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
  SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
  SAT_UH2_UH(tmp0, tmp1, 7);
  PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
  AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
| 282 |
/* 2-tap horizontal followed by 2-tap vertical filtering of a 4x8 block
 * (4 wide, 8 rows), with the result averaged into the existing dst pixels.
 *
 * src, src_stride   source pixels and row pitch; used as passed in (no
 *                   pointer adjustment before the first load).
 * dst, dst_stride   destination; read (for the average) and then overwritten.
 * filter_horiz,
 * filter_vert       pointers to the filter taps; only halfword element 0 of
 *                   each loaded vector is used (splatted across the vector).
 */
static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int8_t *filter_horiz,
                                                   int8_t *filter_vert) {
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  /* Shuffle mask taken from offset 16 of mc_filt_mask_arr (defined
   * elsewhere). */
  mask = LD_SB(&mc_filt_mask_arr[16]);

  /* rearranging filter */
  filt = LD_SH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h(filt, 0);

  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);

  /* Nine source rows are loaded (eight, then one more) to produce eight
   * output rows. */
  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  src += (8 * src_stride);
  src8 = LD_SB(src);

  /* Horizontal pass, two rows per call; the ninth row is passed twice. */
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
  hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
  hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
  /* Build the odd-numbered intermediates from adjacent even ones. */
  SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
             hz_out3, hz_out5, 8);
  hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);

  /* Fetch the eight destination rows and combine them pairwise. */
  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2,
             dst4, dst6);
  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
  /* Vertical pass: unsigned dot product with the vertical taps, then round
   * by FILTER_BITS, saturate, pack to bytes and average with dst. */
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
              tmp0, tmp1, tmp2, tmp3);
  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
  PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1,
              res2, res3);
  AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1,
              res2, res3);
  /* Store the result as two 4x4 blocks, one below the other. */
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
  dst += (4 * dst_stride);
  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
}
| 335 |
/* 4-pixel-wide 2-tap H+V filter with averaging: picks the fixed-size kernel
 * matching the requested height. Only heights 4 and 8 are handled; any
 * other height leaves dst untouched. */
static void common_hv_2ht_2vt_and_aver_dst_4w_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  int8_t *filter_horiz,
                                                  int8_t *filter_vert,
                                                  int32_t height) {
  switch (height) {
    case 4:
      common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
                                             filter_horiz, filter_vert);
      break;
    case 8:
      common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
                                             filter_horiz, filter_vert);
      break;
    default:
      /* No kernel for this height: nothing is written. */
      break;
  }
}
| 351 |
/* 2-tap horizontal followed by 2-tap vertical filtering of an 8x4 block
 * (8 wide, 4 rows), with the result averaged into the existing dst pixels.
 *
 * src, src_stride   source pixels and row pitch; used as passed in (no
 *                   pointer adjustment before the load).
 * dst, dst_stride   destination; read (for the average) and then overwritten.
 * filter_horiz,
 * filter_vert       pointers to the filter taps; only halfword element 0 of
 *                   each loaded vector is used (splatted across the vector).
 */
static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int8_t *filter_horiz,
                                                   int8_t *filter_vert) {
  v16i8 src0, src1, src2, src3, src4, mask;
  v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  /* Shuffle mask taken from offset 0 of mc_filt_mask_arr (defined
   * elsewhere). */
  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_SH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h(filt, 0);

  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);

  /* Five source rows are loaded to produce four output rows. */
  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
  /* src is advanced here but not read again in this function. */
  src += (5 * src_stride);

  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
  /* hz_out0 and hz_out1 alternate as "previous row" and "current row":
   * each step filters one new row horizontally, interleaves it with the
   * previous row's result and takes the vertical dot product. */
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
  vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
  tmp0 = __msa_dotp_u_h(vec0, filt_vt);

  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
  vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
  tmp1 = __msa_dotp_u_h(vec1, filt_vt);

  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
  vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
  tmp2 = __msa_dotp_u_h(vec2, filt_vt);

  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
  vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
  tmp3 = __msa_dotp_u_h(vec3, filt_vt);

  /* Round by FILTER_BITS and saturate; the macro below packs, averages with
   * dst0..dst3 and stores the 8x4 block. */
  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
  PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
                     dst, dst_stride);
}
| 398 |
/* 2-tap horizontal followed by 2-tap vertical filtering of an 8-pixel-wide
 * column of arbitrary height, with the result averaged into the existing
 * dst pixels.
 *
 * src, src_stride   source pixels and row pitch; used as passed in.
 * dst, dst_stride   destination; read (for the average) and then overwritten.
 * filter_horiz,
 * filter_vert       pointers to the filter taps; only halfword element 0 of
 *                   each loaded vector is used (splatted across the vector).
 * height            rows to produce. Rows are produced four at a time
 *                   (height >> 2 iterations), so a remainder of height % 4
 *                   rows is never written.
 */
static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(const uint8_t *src,
                                                       int32_t src_stride,
                                                       uint8_t *dst,
                                                       int32_t dst_stride,
                                                       int8_t *filter_horiz,
                                                       int8_t *filter_vert,
                                                       int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, mask;
  v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3;
  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  /* Shuffle mask taken from offset 0 of mc_filt_mask_arr (defined
   * elsewhere). */
  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_SH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h(filt, 0);

  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);

  /* Prime the pipeline with the first source row. */
  src0 = LD_SB(src);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);

  /* Four output rows per iteration. hz_out0 and hz_out1 alternate as
   * "previous row" and "current row"; after the fourth row hz_out0 again
   * holds the most recent row, ready for the next iteration. */
  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);

    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp0 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp1 = __msa_dotp_u_h(vec0, filt_vt);

    /* Round by FILTER_BITS and saturate the first two rows. */
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    SAT_UH2_UH(tmp0, tmp1, 7);

    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp2 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp3 = __msa_dotp_u_h(vec0, filt_vt);

    /* Same for the last two rows, then pack, average with the freshly
     * loaded destination rows and store the 8x4 block. */
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    SAT_UH2_UH(tmp2, tmp3, 7);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
                       dst, dst_stride);
    dst += (4 * dst_stride);
  }
}
| 457 |
/* 8-pixel-wide 2-tap H+V filter with averaging: a height of exactly 4 goes
 * to the fixed 8x4 kernel, every other height to the looping kernel. */
static void common_hv_2ht_2vt_and_aver_dst_8w_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  int8_t *filter_horiz,
                                                  int8_t *filter_vert,
                                                  int32_t height) {
  if (height != 4) {
    common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(src, src_stride, dst,
                                               dst_stride, filter_horiz,
                                               filter_vert, height);
    return;
  }

  common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
                                         filter_horiz, filter_vert);
}
| 474 |
/* 2-tap horizontal followed by 2-tap vertical filtering of a 16-pixel-wide
 * column, with the result averaged into the existing dst pixels. Each row is
 * handled as two vectors loaded 8 bytes apart (src and src + 8).
 *
 * src, src_stride   source pixels and row pitch; used as passed in.
 * dst, dst_stride   destination; read (for the average) and then overwritten.
 * filter_horiz,
 * filter_vert       pointers to the filter taps; only halfword element 0 of
 *                   each loaded vector is used (splatted across the vector).
 * height            rows to produce. Rows are produced four at a time
 *                   (height >> 2 iterations), so a remainder of height % 4
 *                   rows is never written.
 */
static void common_hv_2ht_2vt_and_aver_dst_16w_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int8_t *filter_horiz,
                                                   int8_t *filter_vert,
                                                   int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
  v8i16 filt;

  /* Shuffle mask taken from offset 0 of mc_filt_mask_arr (defined
   * elsewhere). */
  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_SH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h(filt, 0);

  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);

  /* Prime the pipeline with the first row: two loads 8 bytes apart
   * (the second argument of LD_SB2 is the step between loads). */
  LD_SB2(src, 8, src0, src1);
  src += src_stride;

  /* hz_out0/hz_out1 track the left half, hz_out2/hz_out3 the right half. */
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);

  /* Four output rows per iteration. */
  for (loop_cnt = (height >> 2); loop_cnt--;) {
    /* Even-numbered srcN hold the left halves of the four new rows,
     * odd-numbered srcN the right halves. */
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

    /* Row 0: previous row in hz_out0/2, new row in hz_out1/3. */
    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    SAT_UH2_UH(tmp0, tmp1, 7);
    PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
    dst += dst_stride;

    /* Row 1: roles swap -- previous row in hz_out1/3, new row in
     * hz_out0/2. */
    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    SAT_UH2_UH(tmp0, tmp1, 7);
    PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst);
    dst += dst_stride;

    /* Row 2: same arrangement as row 0. */
    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    SAT_UH2_UH(tmp0, tmp1, 7);
    PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
    dst += dst_stride;

    /* Row 3: same arrangement as row 1; leaves the latest row in
     * hz_out0/2 for the next iteration. */
    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    SAT_UH2_UH(tmp0, tmp1, 7);
    PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst);
    dst += dst_stride;
  }
}
| 546 |
/* 32-pixel-wide 2-tap H+V filter with averaging: applies the 16-wide kernel
 * to the two adjacent 16-pixel columns, left to right. */
static void common_hv_2ht_2vt_and_aver_dst_32w_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int8_t *filter_horiz,
                                                   int8_t *filter_vert,
                                                   int32_t height) {
  int32_t col_offset;

  for (col_offset = 0; col_offset < 32; col_offset += 16) {
    common_hv_2ht_2vt_and_aver_dst_16w_msa(src + col_offset, src_stride,
                                           dst + col_offset, dst_stride,
                                           filter_horiz, filter_vert, height);
  }
}
| 562 |
/* 64-pixel-wide 2-tap H+V filter with averaging: applies the 16-wide kernel
 * to the four adjacent 16-pixel columns, left to right. */
static void common_hv_2ht_2vt_and_aver_dst_64w_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int8_t *filter_horiz,
                                                   int8_t *filter_vert,
                                                   int32_t height) {
  int32_t col_offset;

  for (col_offset = 0; col_offset < 64; col_offset += 16) {
    common_hv_2ht_2vt_and_aver_dst_16w_msa(src + col_offset, src_stride,
                                           dst + col_offset, dst_stride,
                                           filter_horiz, filter_vert, height);
  }
}
| 578 |
/* MSA entry point for the 2-D (horizontal + vertical) convolve whose result
 * is averaged into dst.
 *
 * src, src_stride     source pixels and row pitch.
 * dst, dst_stride     destination; read for the average, then overwritten.
 * filter_x, filter_y  8-tap kernels as int16_t; elements 0..7 are read.
 * x_step_q4,
 * y_step_q4           step sizes; anything other than 16 in either
 *                     direction is handed to vp9_convolve8_avg_c.
 * w, h                block width and height. Widths 4, 8, 16, 32 and 64
 *                     have MSA kernels; any other width uses the C version.
 *
 * Dispatch order:
 *   1. either step != 16                          -> vp9_convolve8_avg_c
 *   2. both kernels have tap[2] == 0, tap[3] == 128 -> vp9_convolve_avg
 *   3. both kernels have tap[0] == tap[1] == 0    -> 2-tap MSA kernels,
 *      which are given a pointer to tap 3 of the narrowed filters
 *   4. exactly one kernel has tap[0] == tap[1] == 0 -> vp9_convolve8_avg_c
 *   5. otherwise                                  -> 8-tap MSA kernels
 *
 * The tap tests compare the int16_t elements directly rather than reading
 * them through an int32_t pointer, so they do not depend on byte order and
 * do not violate the effective-type (strict aliasing) rule.
 */
void vp9_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int x_step_q4,
                           const int16_t *filter_y, int y_step_q4,
                           int w, int h) {
  int cnt;
  int8_t filt_hor[8], filt_ver[8];
  int x_lead_zero, y_lead_zero;

  if (16 != x_step_q4 || 16 != y_step_q4) {
    vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
                        filter_x, x_step_q4, filter_y, y_step_q4,
                        w, h);
    return;
  }

  /* tap[2] == 0 and tap[3] == 128 is what the former little-endian
   * "int32 word 1 == 0x800000" test matched. */
  if (filter_x[2] == 0 && filter_x[3] == 128 &&
      filter_y[2] == 0 && filter_y[3] == 128) {
    vp9_convolve_avg(src, src_stride, dst, dst_stride,
                     filter_x, x_step_q4, filter_y, y_step_q4,
                     w, h);
    return;
  }

  /* The MSA kernels take the taps as int8_t; narrow them here. */
  for (cnt = 0; cnt < 8; ++cnt) {
    filt_hor[cnt] = (int8_t)filter_x[cnt];
    filt_ver[cnt] = (int8_t)filter_y[cnt];
  }

  /* Leading two taps zero selects the 2-tap code path. */
  x_lead_zero = (filter_x[0] == 0 && filter_x[1] == 0);
  y_lead_zero = (filter_y[0] == 0 && filter_y[1] == 0);

  if (x_lead_zero && y_lead_zero) {
    switch (w) {
      case 4:
        common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride,
                                              dst, (int32_t)dst_stride,
                                              &filt_hor[3], &filt_ver[3], h);
        break;
      case 8:
        common_hv_2ht_2vt_and_aver_dst_8w_msa(src, (int32_t)src_stride,
                                              dst, (int32_t)dst_stride,
                                              &filt_hor[3], &filt_ver[3], h);
        break;
      case 16:
        common_hv_2ht_2vt_and_aver_dst_16w_msa(src, (int32_t)src_stride,
                                               dst, (int32_t)dst_stride,
                                               &filt_hor[3], &filt_ver[3], h);
        break;
      case 32:
        common_hv_2ht_2vt_and_aver_dst_32w_msa(src, (int32_t)src_stride,
                                               dst, (int32_t)dst_stride,
                                               &filt_hor[3], &filt_ver[3], h);
        break;
      case 64:
        common_hv_2ht_2vt_and_aver_dst_64w_msa(src, (int32_t)src_stride,
                                               dst, (int32_t)dst_stride,
                                               &filt_hor[3], &filt_ver[3], h);
        break;
      default:
        vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
                            filter_x, x_step_q4, filter_y, y_step_q4,
                            w, h);
        break;
    }
  } else if (x_lead_zero || y_lead_zero) {
    /* Mixed 2-tap / 8-tap combination: no MSA kernel, use the C version. */
    vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
                        filter_x, x_step_q4, filter_y, y_step_q4,
                        w, h);
  } else {
    switch (w) {
      case 4:
        common_hv_8ht_8vt_and_aver_dst_4w_msa(src, (int32_t)src_stride,
                                              dst, (int32_t)dst_stride,
                                              filt_hor, filt_ver, h);
        break;
      case 8:
        common_hv_8ht_8vt_and_aver_dst_8w_msa(src, (int32_t)src_stride,
                                              dst, (int32_t)dst_stride,
                                              filt_hor, filt_ver, h);
        break;
      case 16:
        common_hv_8ht_8vt_and_aver_dst_16w_msa(src, (int32_t)src_stride,
                                               dst, (int32_t)dst_stride,
                                               filt_hor, filt_ver, h);
        break;
      case 32:
        common_hv_8ht_8vt_and_aver_dst_32w_msa(src, (int32_t)src_stride,
                                               dst, (int32_t)dst_stride,
                                               filt_hor, filt_ver, h);
        break;
      case 64:
        common_hv_8ht_8vt_and_aver_dst_64w_msa(src, (int32_t)src_stride,
                                               dst, (int32_t)dst_stride,
                                               filt_hor, filt_ver, h);
        break;
      default:
        vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
                            filter_x, x_step_q4, filter_y, y_step_q4,
                            w, h);
        break;
    }
  }
}
| OLD | NEW |