Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(727)

Side by Side Diff: source/libvpx/vp9/common/mips/msa/vp9_convolve8_avg_msa.c

Issue 1169543007: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 /*
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "./vp9_rtcd.h"
12 #include "vp9/common/mips/msa/vp9_convolve_msa.h"
13
14 static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src,
15 int32_t src_stride,
16 uint8_t *dst,
17 int32_t dst_stride,
18 int8_t *filter_horiz,
19 int8_t *filter_vert,
20 int32_t height) {
21 uint32_t loop_cnt;
22 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
23 v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1;
24 v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
25 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
26 v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4;
27 v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
28
29 mask0 = LD_UB(&mc_filt_mask_arr[16]);
30 src -= (3 + 3 * src_stride);
31
32 /* rearranging filter */
33 filt = LD_SH(filter_horiz);
34 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
35
36 mask1 = mask0 + 2;
37 mask2 = mask0 + 4;
38 mask3 = mask0 + 6;
39
40 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
41 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
42 src += (7 * src_stride);
43
44 hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
45 filt_hz1, filt_hz2, filt_hz3);
46 hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
47 filt_hz1, filt_hz2, filt_hz3);
48 hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
49 filt_hz1, filt_hz2, filt_hz3);
50 hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
51 filt_hz1, filt_hz2, filt_hz3);
52 SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
53
54 filt = LD_SH(filter_vert);
55 SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
56
57 ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
58 vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
59
60 for (loop_cnt = (height >> 2); loop_cnt--;) {
61 LD_SB4(src, src_stride, src7, src8, src9, src10);
62 XORI_B4_128_SB(src7, src8, src9, src10);
63 src += (4 * src_stride);
64
65 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
66 hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
67 filt_hz0, filt_hz1, filt_hz2, filt_hz3);
68 hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
69 vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
70 res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1,
71 filt_vt2, filt_vt3);
72
73 hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
74 filt_hz0, filt_hz1, filt_hz2, filt_hz3);
75 hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
76 vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
77 res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1,
78 filt_vt2, filt_vt3);
79 ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
80
81 SRARI_H2_SH(res0, res1, FILTER_BITS);
82 SAT_SH2_SH(res0, res1, 7);
83 PCKEV_B2_UB(res0, res0, res1, res1, tmp0, tmp1);
84 XORI_B2_128_UB(tmp0, tmp1);
85 AVER_UB2_UB(tmp0, dst0, tmp1, dst2, tmp0, tmp1);
86 ST4x4_UB(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
87 dst += (4 * dst_stride);
88
89 hz_out5 = hz_out9;
90 vec0 = vec2;
91 vec1 = vec3;
92 vec2 = vec4;
93 }
94 }
95
96 static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src,
97 int32_t src_stride,
98 uint8_t *dst,
99 int32_t dst_stride,
100 int8_t *filter_horiz,
101 int8_t *filter_vert,
102 int32_t height) {
103 uint32_t loop_cnt;
104 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
105 v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
106 v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
107 v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3;
108 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
109 v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
110 v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
111
112 mask0 = LD_UB(&mc_filt_mask_arr[0]);
113 src -= (3 + 3 * src_stride);
114
115 /* rearranging filter */
116 filt = LD_SH(filter_horiz);
117 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
118
119 mask1 = mask0 + 2;
120 mask2 = mask0 + 4;
121 mask3 = mask0 + 6;
122
123 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
124 src += (7 * src_stride);
125
126 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
127 hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
128 filt_hz1, filt_hz2, filt_hz3);
129 hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
130 filt_hz1, filt_hz2, filt_hz3);
131 hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
132 filt_hz1, filt_hz2, filt_hz3);
133 hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
134 filt_hz1, filt_hz2, filt_hz3);
135 hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
136 filt_hz1, filt_hz2, filt_hz3);
137 hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
138 filt_hz1, filt_hz2, filt_hz3);
139 hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
140 filt_hz1, filt_hz2, filt_hz3);
141
142 filt = LD_SH(filter_vert);
143 SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
144
145 ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
146 ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
147 ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
148
149 for (loop_cnt = (height >> 2); loop_cnt--;) {
150 LD_SB4(src, src_stride, src7, src8, src9, src10);
151 XORI_B4_128_SB(src7, src8, src9, src10);
152 src += (4 * src_stride);
153
154 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
155
156 hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
157 filt_hz0, filt_hz1, filt_hz2, filt_hz3);
158 out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
159 tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
160 filt_vt2, filt_vt3);
161
162 hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
163 filt_hz0, filt_hz1, filt_hz2, filt_hz3);
164 out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
165 tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
166 filt_vt2, filt_vt3);
167
168 hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
169 filt_hz0, filt_hz1, filt_hz2, filt_hz3);
170 out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
171 tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
172 filt_vt2, filt_vt3);
173
174 hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
175 filt_hz0, filt_hz1, filt_hz2, filt_hz3);
176 out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9);
177 tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
178 filt_vt2, filt_vt3);
179
180 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
181 SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
182 CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3,
183 dst, dst_stride);
184 dst += (4 * dst_stride);
185
186 hz_out6 = hz_out10;
187 out0 = out2;
188 out1 = out3;
189 out2 = out8;
190 out4 = out6;
191 out5 = out7;
192 out6 = out9;
193 }
194 }
195
/* 16-pixel-wide variant: runs the 8-wide 8-tap/8-tap averaging kernel on
 * two adjacent 8-column strips (column offsets 0 and 8). */
static void common_hv_8ht_8vt_and_aver_dst_16w_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int8_t *filter_horiz,
                                                   int8_t *filter_vert,
                                                   int32_t height) {
  int32_t col;

  for (col = 0; col < 16; col += 8) {
    common_hv_8ht_8vt_and_aver_dst_8w_msa(src + col, src_stride,
                                          dst + col, dst_stride,
                                          filter_horiz, filter_vert, height);
  }
}
211
/* 32-pixel-wide variant: runs the 8-wide 8-tap/8-tap averaging kernel on
 * four adjacent 8-column strips (column offsets 0, 8, 16 and 24). */
static void common_hv_8ht_8vt_and_aver_dst_32w_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int8_t *filter_horiz,
                                                   int8_t *filter_vert,
                                                   int32_t height) {
  int32_t col;

  for (col = 0; col < 32; col += 8) {
    common_hv_8ht_8vt_and_aver_dst_8w_msa(src + col, src_stride,
                                          dst + col, dst_stride,
                                          filter_horiz, filter_vert, height);
  }
}
227
/* 64-pixel-wide variant: runs the 8-wide 8-tap/8-tap averaging kernel on
 * eight adjacent 8-column strips (column offsets 0, 8, ..., 56). */
static void common_hv_8ht_8vt_and_aver_dst_64w_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int8_t *filter_horiz,
                                                   int8_t *filter_vert,
                                                   int32_t height) {
  int32_t col;

  for (col = 0; col < 64; col += 8) {
    common_hv_8ht_8vt_and_aver_dst_8w_msa(src + col, src_stride,
                                          dst + col, dst_stride,
                                          filter_horiz, filter_vert, height);
  }
}
243
244 static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(const uint8_t *src,
245 int32_t src_stride,
246 uint8_t *dst,
247 int32_t dst_stride,
248 int8_t *filter_horiz,
249 int8_t *filter_vert) {
250 v16i8 src0, src1, src2, src3, src4, mask;
251 v16u8 filt_hz, filt_vt, vec0, vec1;
252 v16u8 dst0, dst1, dst2, dst3, res0, res1;
253 v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt;
254
255 mask = LD_SB(&mc_filt_mask_arr[16]);
256
257 /* rearranging filter */
258 filt = LD_UH(filter_horiz);
259 filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
260
261 filt = LD_UH(filter_vert);
262 filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
263
264 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
265
266 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
267 hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
268 hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
269 hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
270 hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
271 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
272
273 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
274 ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
275 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
276 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
277 SAT_UH2_UH(tmp0, tmp1, 7);
278 PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
279 AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
280 ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
281 }
282
283 static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t *src,
284 int32_t src_stride,
285 uint8_t *dst,
286 int32_t dst_stride,
287 int8_t *filter_horiz,
288 int8_t *filter_vert) {
289 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
290 v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
291 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
292 v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
293 v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
294 v8i16 filt;
295
296 mask = LD_SB(&mc_filt_mask_arr[16]);
297
298 /* rearranging filter */
299 filt = LD_SH(filter_horiz);
300 filt_hz = (v16u8)__msa_splati_h(filt, 0);
301
302 filt = LD_SH(filter_vert);
303 filt_vt = (v16u8)__msa_splati_h(filt, 0);
304
305 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
306 src += (8 * src_stride);
307 src8 = LD_SB(src);
308
309 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
310 hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
311 hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
312 hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
313 hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
314 SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
315 hz_out3, hz_out5, 8);
316 hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
317
318 LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
319 ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2,
320 dst4, dst6);
321 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
322 ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
323 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
324 tmp0, tmp1, tmp2, tmp3);
325 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
326 SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
327 PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1,
328 res2, res3);
329 AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1,
330 res2, res3);
331 ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
332 dst += (4 * dst_stride);
333 ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
334 }
335
/* 4-pixel-wide 2-tap/2-tap averaging convolve: dispatches on height to the
 * 4x4 or 4x8 kernel. Any other height is silently ignored (nothing is
 * written to dst). */
static void common_hv_2ht_2vt_and_aver_dst_4w_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  int8_t *filter_horiz,
                                                  int8_t *filter_vert,
                                                  int32_t height) {
  switch (height) {
    case 4:
      common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
                                             filter_horiz, filter_vert);
      break;
    case 8:
      common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
                                             filter_horiz, filter_vert);
      break;
    default:
      /* No kernel for this height. */
      break;
  }
}
351
352 static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(const uint8_t *src,
353 int32_t src_stride,
354 uint8_t *dst,
355 int32_t dst_stride,
356 int8_t *filter_horiz,
357 int8_t *filter_vert) {
358 v16i8 src0, src1, src2, src3, src4, mask;
359 v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
360 v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
361 v8i16 filt;
362
363 mask = LD_SB(&mc_filt_mask_arr[0]);
364
365 /* rearranging filter */
366 filt = LD_SH(filter_horiz);
367 filt_hz = (v16u8)__msa_splati_h(filt, 0);
368
369 filt = LD_SH(filter_vert);
370 filt_vt = (v16u8)__msa_splati_h(filt, 0);
371
372 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
373 src += (5 * src_stride);
374
375 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
376 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
377 hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
378 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
379 tmp0 = __msa_dotp_u_h(vec0, filt_vt);
380
381 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
382 vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
383 tmp1 = __msa_dotp_u_h(vec1, filt_vt);
384
385 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
386 vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
387 tmp2 = __msa_dotp_u_h(vec2, filt_vt);
388
389 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
390 vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
391 tmp3 = __msa_dotp_u_h(vec3, filt_vt);
392
393 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
394 SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
395 PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
396 dst, dst_stride);
397 }
398
399 static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(const uint8_t *src,
400 int32_t src_stride,
401 uint8_t *dst,
402 int32_t dst_stride,
403 int8_t *filter_horiz,
404 int8_t *filter_vert,
405 int32_t height) {
406 uint32_t loop_cnt;
407 v16i8 src0, src1, src2, src3, src4, mask;
408 v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3;
409 v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
410 v8i16 filt;
411
412 mask = LD_SB(&mc_filt_mask_arr[0]);
413
414 /* rearranging filter */
415 filt = LD_SH(filter_horiz);
416 filt_hz = (v16u8)__msa_splati_h(filt, 0);
417
418 filt = LD_SH(filter_vert);
419 filt_vt = (v16u8)__msa_splati_h(filt, 0);
420
421 src0 = LD_SB(src);
422 src += src_stride;
423
424 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
425
426 for (loop_cnt = (height >> 2); loop_cnt--;) {
427 LD_SB4(src, src_stride, src1, src2, src3, src4);
428 src += (4 * src_stride);
429
430 hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
431 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
432 tmp0 = __msa_dotp_u_h(vec0, filt_vt);
433
434 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
435 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
436 tmp1 = __msa_dotp_u_h(vec0, filt_vt);
437
438 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
439 SAT_UH2_UH(tmp0, tmp1, 7);
440
441 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
442 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
443 tmp2 = __msa_dotp_u_h(vec0, filt_vt);
444
445 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
446 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
447 tmp3 = __msa_dotp_u_h(vec0, filt_vt);
448
449 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
450 SAT_UH2_UH(tmp2, tmp3, 7);
451 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
452 PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
453 dst, dst_stride);
454 dst += (4 * dst_stride);
455 }
456 }
457
/* 8-pixel-wide 2-tap/2-tap averaging convolve: height 4 uses the dedicated
 * 8x4 kernel, every other height goes to the looping 8x8mult kernel. */
static void common_hv_2ht_2vt_and_aver_dst_8w_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  int8_t *filter_horiz,
                                                  int8_t *filter_vert,
                                                  int32_t height) {
  if (4 == height) {
    common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
                                           filter_horiz, filter_vert);
    return;
  }

  common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
                                             filter_horiz, filter_vert,
                                             height);
}
474
475 static void common_hv_2ht_2vt_and_aver_dst_16w_msa(const uint8_t *src,
476 int32_t src_stride,
477 uint8_t *dst,
478 int32_t dst_stride,
479 int8_t *filter_horiz,
480 int8_t *filter_vert,
481 int32_t height) {
482 uint32_t loop_cnt;
483 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
484 v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
485 v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
486 v8i16 filt;
487
488 mask = LD_SB(&mc_filt_mask_arr[0]);
489
490 /* rearranging filter */
491 filt = LD_SH(filter_horiz);
492 filt_hz = (v16u8)__msa_splati_h(filt, 0);
493
494 filt = LD_SH(filter_vert);
495 filt_vt = (v16u8)__msa_splati_h(filt, 0);
496
497 LD_SB2(src, 8, src0, src1);
498 src += src_stride;
499
500 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
501 hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
502
503 for (loop_cnt = (height >> 2); loop_cnt--;) {
504 LD_SB4(src, src_stride, src0, src2, src4, src6);
505 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
506 src += (4 * src_stride);
507 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
508
509 hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
510 hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
511 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
512 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
513 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
514 SAT_UH2_UH(tmp0, tmp1, 7);
515 PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
516 dst += dst_stride;
517
518 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
519 hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
520 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
521 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
522 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
523 SAT_UH2_UH(tmp0, tmp1, 7);
524 PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst);
525 dst += dst_stride;
526
527 hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
528 hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
529 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
530 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
531 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
532 SAT_UH2_UH(tmp0, tmp1, 7);
533 PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
534 dst += dst_stride;
535
536 hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
537 hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
538 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
539 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
540 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
541 SAT_UH2_UH(tmp0, tmp1, 7);
542 PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst);
543 dst += dst_stride;
544 }
545 }
546
/* 32-pixel-wide variant: runs the 16-wide 2-tap/2-tap averaging kernel on
 * two adjacent 16-column strips (column offsets 0 and 16). */
static void common_hv_2ht_2vt_and_aver_dst_32w_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int8_t *filter_horiz,
                                                   int8_t *filter_vert,
                                                   int32_t height) {
  int32_t col;

  for (col = 0; col < 32; col += 16) {
    common_hv_2ht_2vt_and_aver_dst_16w_msa(src + col, src_stride,
                                           dst + col, dst_stride,
                                           filter_horiz, filter_vert, height);
  }
}
562
/* 64-pixel-wide variant: runs the 16-wide 2-tap/2-tap averaging kernel on
 * four adjacent 16-column strips (column offsets 0, 16, 32 and 48). */
static void common_hv_2ht_2vt_and_aver_dst_64w_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int8_t *filter_horiz,
                                                   int8_t *filter_vert,
                                                   int32_t height) {
  int32_t col;

  for (col = 0; col < 64; col += 16) {
    common_hv_2ht_2vt_and_aver_dst_16w_msa(src + col, src_stride,
                                           dst + col, dst_stride,
                                           filter_horiz, filter_vert, height);
  }
}
578
/* MSA entry point for the 2-D "convolve and average into dst" operation.
 *
 * src/src_stride   - source pixels and row stride.
 * dst/dst_stride   - destination pixels and row stride; the filtered result
 *                    is averaged with what is already stored there.
 * filter_x/_y      - 8 int16_t taps each for the horizontal and vertical
 *                    filter.
 * x_step_q4/_q4    - step sizes; anything other than 16 in either direction
 *                    is handed to the C implementation.
 * w, h             - block width and height in pixels.
 *
 * Dispatch order:
 *   1. Non-16 step in either direction     -> vp9_convolve8_avg_c.
 *   2. Taps 2 and 3 are {0, 128} for both  -> vp9_convolve_avg (plain
 *      average, no filtering).
 *   3. Taps 0 and 1 zero for both filters  -> 2-tap MSA kernels, which are
 *      handed the taps starting at index 3.
 *   4. Taps 0 and 1 zero for only one      -> vp9_convolve8_avg_c.
 *   5. Otherwise                           -> 8-tap MSA kernels.
 * Widths other than 4, 8, 16, 32 and 64 always use vp9_convolve8_avg_c.
 *
 * The taps are inspected element by element rather than by reinterpreting
 * pairs of int16_t as int32_t, so the classification does not depend on
 * byte order, on 4-byte alignment of the tap arrays, or on type-punned
 * loads.
 */
void vp9_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int x_step_q4,
                           const int16_t *filter_y, int y_step_q4,
                           int w, int h) {
  int8_t filt_hor[8], filt_ver[8];
  int cnt;
  int x_lead_zero, y_lead_zero;

  if (16 != x_step_q4 || 16 != y_step_q4) {
    vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
                        filter_x, x_step_q4, filter_y, y_step_q4,
                        w, h);
    return;
  }

  /* Only taps 2 and 3 are examined here; {0, 128} selects the plain
   * averaging path. */
  if (filter_x[2] == 0 && filter_x[3] == 128 &&
      filter_y[2] == 0 && filter_y[3] == 128) {
    vp9_convolve_avg(src, src_stride, dst, dst_stride,
                     filter_x, x_step_q4, filter_y, y_step_q4,
                     w, h);
    return;
  }

  /* The MSA kernels take their taps as int8_t, so narrow them here. */
  for (cnt = 0; cnt < 8; ++cnt) {
    filt_hor[cnt] = (int8_t)filter_x[cnt];
    filt_ver[cnt] = (int8_t)filter_y[cnt];
  }

  /* Two leading zero taps select the 2-tap kernels. */
  x_lead_zero = (filter_x[0] == 0 && filter_x[1] == 0);
  y_lead_zero = (filter_y[0] == 0 && filter_y[1] == 0);

  if (x_lead_zero && y_lead_zero) {
    switch (w) {
      case 4:
        common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride,
                                              dst, (int32_t)dst_stride,
                                              &filt_hor[3], &filt_ver[3], h);
        break;
      case 8:
        common_hv_2ht_2vt_and_aver_dst_8w_msa(src, (int32_t)src_stride,
                                              dst, (int32_t)dst_stride,
                                              &filt_hor[3], &filt_ver[3], h);
        break;
      case 16:
        common_hv_2ht_2vt_and_aver_dst_16w_msa(src, (int32_t)src_stride,
                                               dst, (int32_t)dst_stride,
                                               &filt_hor[3], &filt_ver[3], h);
        break;
      case 32:
        common_hv_2ht_2vt_and_aver_dst_32w_msa(src, (int32_t)src_stride,
                                               dst, (int32_t)dst_stride,
                                               &filt_hor[3], &filt_ver[3], h);
        break;
      case 64:
        common_hv_2ht_2vt_and_aver_dst_64w_msa(src, (int32_t)src_stride,
                                               dst, (int32_t)dst_stride,
                                               &filt_hor[3], &filt_ver[3], h);
        break;
      default:
        vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
                            filter_x, x_step_q4, filter_y, y_step_q4,
                            w, h);
        break;
    }
  } else if (x_lead_zero || y_lead_zero) {
    /* Mixed case (one 2-tap, one 8-tap filter): no MSA kernel here. */
    vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
                        filter_x, x_step_q4, filter_y, y_step_q4,
                        w, h);
  } else {
    switch (w) {
      case 4:
        common_hv_8ht_8vt_and_aver_dst_4w_msa(src, (int32_t)src_stride,
                                              dst, (int32_t)dst_stride,
                                              filt_hor, filt_ver, h);
        break;
      case 8:
        common_hv_8ht_8vt_and_aver_dst_8w_msa(src, (int32_t)src_stride,
                                              dst, (int32_t)dst_stride,
                                              filt_hor, filt_ver, h);
        break;
      case 16:
        common_hv_8ht_8vt_and_aver_dst_16w_msa(src, (int32_t)src_stride,
                                               dst, (int32_t)dst_stride,
                                               filt_hor, filt_ver, h);
        break;
      case 32:
        common_hv_8ht_8vt_and_aver_dst_32w_msa(src, (int32_t)src_stride,
                                               dst, (int32_t)dst_stride,
                                               filt_hor, filt_ver, h);
        break;
      case 64:
        common_hv_8ht_8vt_and_aver_dst_64w_msa(src, (int32_t)src_stride,
                                               dst, (int32_t)dst_stride,
                                               filt_hor, filt_ver, h);
        break;
      default:
        vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
                            filter_x, x_step_q4, filter_y, y_step_q4,
                            w, h);
        break;
    }
  }
}
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698