Chromium Code Reviews

Side by Side Diff: source/libvpx/vp9/common/mips/msa/vp9_convolve8_msa.c

Issue 1124333011: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: only update to last night's LKGR. Created 5 years, 7 months ago.
1 /*
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "./vp9_rtcd.h"
12 #include "vp9/common/mips/msa/vp9_convolve_msa.h"
13
14 const uint8_t mc_filt_mask_arr[16 * 3] = {
15 /* 8 width cases */
16 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
17 /* 4 width cases */
18 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
19 /* 4 width cases */
20 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
21 };
22
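/* 8-tap horizontal + 8-tap vertical convolution for 4-pixel-wide blocks.
 * Backs src up by 3 rows and 3 columns, runs the horizontal pass with byte
 * shuffles and dot products, then applies the vertical taps to the
 * interleaved horizontal outputs, emitting 4 output rows per loop iteration. */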
23 static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride,
24 uint8_t *dst, int32_t dst_stride,
25 int8_t *filter_horiz, int8_t *filter_vert,
26 int32_t height) {
27 uint32_t loop_cnt;
28 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
29 v16i8 filt_horiz0, filt_horiz1, filt_horiz2, filt_horiz3;
30 v16u8 mask0, mask1, mask2, mask3;
31 v8i16 filt_horiz;
32 v8i16 horiz_out0, horiz_out1, horiz_out2, horiz_out3, horiz_out4;
33 v8i16 horiz_out5, horiz_out6, horiz_out7, horiz_out8, horiz_out9;
34 v8i16 tmp0, tmp1, out0, out1, out2, out3, out4;
35 v8i16 filt, filt_vert0, filt_vert1, filt_vert2, filt_vert3;
36
37 mask0 = LOAD_UB(&mc_filt_mask_arr[16]);
38
39 src -= (3 + 3 * src_stride);
40
41 /* rearranging filter */
42 filt_horiz = LOAD_SH(filter_horiz);
43 filt_horiz0 = (v16i8)__msa_splati_h(filt_horiz, 0);
44 filt_horiz1 = (v16i8)__msa_splati_h(filt_horiz, 1);
45 filt_horiz2 = (v16i8)__msa_splati_h(filt_horiz, 2);
46 filt_horiz3 = (v16i8)__msa_splati_h(filt_horiz, 3);
47
48 mask1 = mask0 + 2;
49 mask2 = mask0 + 4;
50 mask3 = mask0 + 6;
51
52 LOAD_7VECS_SB(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
53 src += (7 * src_stride);
54
55 XORI_B_7VECS_SB(src0, src1, src2, src3, src4, src5, src6,
56 src0, src1, src2, src3, src4, src5, src6, 128);
57
58 horiz_out0 = HORIZ_8TAP_FILT_2VECS(src0, src1, mask0, mask1, mask2, mask3,
59 filt_horiz0, filt_horiz1, filt_horiz2,
60 filt_horiz3);
61 horiz_out2 = HORIZ_8TAP_FILT_2VECS(src2, src3, mask0, mask1, mask2, mask3,
62 filt_horiz0, filt_horiz1, filt_horiz2,
63 filt_horiz3);
64 horiz_out4 = HORIZ_8TAP_FILT_2VECS(src4, src5, mask0, mask1, mask2, mask3,
65 filt_horiz0, filt_horiz1, filt_horiz2,
66 filt_horiz3);
67 horiz_out5 = HORIZ_8TAP_FILT_2VECS(src5, src6, mask0, mask1, mask2, mask3,
68 filt_horiz0, filt_horiz1, filt_horiz2,
69 filt_horiz3);
70 horiz_out1 = (v8i16)__msa_sldi_b((v16i8)horiz_out2, (v16i8)horiz_out0, 8);
71 horiz_out3 = (v8i16)__msa_sldi_b((v16i8)horiz_out4, (v16i8)horiz_out2, 8);
72
73 filt = LOAD_SH(filter_vert);
74 filt_vert0 = __msa_splati_h(filt, 0);
75 filt_vert1 = __msa_splati_h(filt, 1);
76 filt_vert2 = __msa_splati_h(filt, 2);
77 filt_vert3 = __msa_splati_h(filt, 3);
78
79 out0 = (v8i16)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
80 out1 = (v8i16)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2);
81 out2 = (v8i16)__msa_ilvev_b((v16i8)horiz_out5, (v16i8)horiz_out4);
82
83 for (loop_cnt = (height >> 2); loop_cnt--;) {
84 LOAD_4VECS_SB(src, src_stride, src7, src8, src9, src10);
85 src += (4 * src_stride);
86
87 XORI_B_4VECS_SB(src7, src8, src9, src10, src7, src8, src9, src10, 128);
88
89 horiz_out7 = HORIZ_8TAP_FILT_2VECS(src7, src8, mask0, mask1, mask2, mask3,
90 filt_horiz0, filt_horiz1, filt_horiz2,
91 filt_horiz3);
92 horiz_out6 = (v8i16)__msa_sldi_b((v16i8)horiz_out7, (v16i8)horiz_out5, 8);
93
94 out3 = (v8i16)__msa_ilvev_b((v16i8)horiz_out7, (v16i8)horiz_out6);
95
96 tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vert0, filt_vert1,
97 filt_vert2, filt_vert3);
98
99 horiz_out9 = HORIZ_8TAP_FILT_2VECS(src9, src10, mask0, mask1, mask2, mask3,
100 filt_horiz0, filt_horiz1, filt_horiz2,
101 filt_horiz3);
102 horiz_out8 = (v8i16)__msa_sldi_b((v16i8)horiz_out9, (v16i8)horiz_out7, 8);
103
104 out4 = (v8i16)__msa_ilvev_b((v16i8)horiz_out9, (v16i8)horiz_out8);
105
106 tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vert0, filt_vert1,
107 filt_vert2, filt_vert3);
108 tmp0 = SRARI_SATURATE_SIGNED_H(tmp0, FILTER_BITS, 7);
109 tmp1 = SRARI_SATURATE_SIGNED_H(tmp1, FILTER_BITS, 7);
110
111 PCKEV_2B_XORI128_STORE_4_BYTES_4(tmp0, tmp1, dst, dst_stride);
112 dst += (4 * dst_stride);
113
114 horiz_out5 = horiz_out9;
115
116 out0 = out2;
117 out1 = out3;
118 out2 = out4;
119 }
120 }
121
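/* 8-tap horizontal + 8-tap vertical convolution for 8-pixel-wide blocks;
 * same structure as the 4-wide version, but with one horizontal filter call
 * per source row and 4 output rows of 8 bytes written per loop iteration. */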
122 static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride,
123 uint8_t *dst, int32_t dst_stride,
124 int8_t *filter_horiz, int8_t *filter_vert,
125 int32_t height) {
126 uint32_t loop_cnt;
127 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
128 v16i8 filt_horiz0, filt_horiz1, filt_horiz2, filt_horiz3;
129 v8i16 filt_horiz, filt, filt_vert0, filt_vert1, filt_vert2, filt_vert3;
130 v16u8 mask0, mask1, mask2, mask3;
131 v8i16 horiz_out0, horiz_out1, horiz_out2, horiz_out3;
132 v8i16 horiz_out4, horiz_out5, horiz_out6, horiz_out7;
133 v8i16 horiz_out8, horiz_out9, horiz_out10;
134 v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
135 v8i16 tmp0, tmp1, tmp2, tmp3;
136
137 mask0 = LOAD_UB(&mc_filt_mask_arr[0]);
138
139 src -= (3 + 3 * src_stride);
140
141 /* rearranging filter */
142 filt_horiz = LOAD_SH(filter_horiz);
143 filt_horiz0 = (v16i8)__msa_splati_h(filt_horiz, 0);
144 filt_horiz1 = (v16i8)__msa_splati_h(filt_horiz, 1);
145 filt_horiz2 = (v16i8)__msa_splati_h(filt_horiz, 2);
146 filt_horiz3 = (v16i8)__msa_splati_h(filt_horiz, 3);
147
148 mask1 = mask0 + 2;
149 mask2 = mask0 + 4;
150 mask3 = mask0 + 6;
151
152 LOAD_7VECS_SB(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
153 src += (7 * src_stride);
154
155 XORI_B_7VECS_SB(src0, src1, src2, src3, src4, src5, src6,
156 src0, src1, src2, src3, src4, src5, src6, 128);
157
158 horiz_out0 = HORIZ_8TAP_FILT(src0, mask0, mask1, mask2, mask3, filt_horiz0,
159 filt_horiz1, filt_horiz2, filt_horiz3);
160 horiz_out1 = HORIZ_8TAP_FILT(src1, mask0, mask1, mask2, mask3, filt_horiz0,
161 filt_horiz1, filt_horiz2, filt_horiz3);
162 horiz_out2 = HORIZ_8TAP_FILT(src2, mask0, mask1, mask2, mask3, filt_horiz0,
163 filt_horiz1, filt_horiz2, filt_horiz3);
164 horiz_out3 = HORIZ_8TAP_FILT(src3, mask0, mask1, mask2, mask3, filt_horiz0,
165 filt_horiz1, filt_horiz2, filt_horiz3);
166 horiz_out4 = HORIZ_8TAP_FILT(src4, mask0, mask1, mask2, mask3, filt_horiz0,
167 filt_horiz1, filt_horiz2, filt_horiz3);
168 horiz_out5 = HORIZ_8TAP_FILT(src5, mask0, mask1, mask2, mask3, filt_horiz0,
169 filt_horiz1, filt_horiz2, filt_horiz3);
170 horiz_out6 = HORIZ_8TAP_FILT(src6, mask0, mask1, mask2, mask3, filt_horiz0,
171 filt_horiz1, filt_horiz2, filt_horiz3);
172
173 filt = LOAD_SH(filter_vert);
174 filt_vert0 = __msa_splati_h(filt, 0);
175 filt_vert1 = __msa_splati_h(filt, 1);
176 filt_vert2 = __msa_splati_h(filt, 2);
177 filt_vert3 = __msa_splati_h(filt, 3);
178
179 out0 = (v8i16)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
180 out1 = (v8i16)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2);
181 out2 = (v8i16)__msa_ilvev_b((v16i8)horiz_out5, (v16i8)horiz_out4);
182 out4 = (v8i16)__msa_ilvev_b((v16i8)horiz_out2, (v16i8)horiz_out1);
183 out5 = (v8i16)__msa_ilvev_b((v16i8)horiz_out4, (v16i8)horiz_out3);
184 out6 = (v8i16)__msa_ilvev_b((v16i8)horiz_out6, (v16i8)horiz_out5);
185
186 for (loop_cnt = (height >> 2); loop_cnt--;) {
187 LOAD_4VECS_SB(src, src_stride, src7, src8, src9, src10);
188 src += (4 * src_stride);
189
190 XORI_B_4VECS_SB(src7, src8, src9, src10, src7, src8, src9, src10, 128);
191
192 horiz_out7 = HORIZ_8TAP_FILT(src7, mask0, mask1, mask2, mask3, filt_horiz0,
193 filt_horiz1, filt_horiz2, filt_horiz3);
194
195 out3 = (v8i16)__msa_ilvev_b((v16i8)horiz_out7, (v16i8)horiz_out6);
196 tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vert0, filt_vert1,
197 filt_vert2, filt_vert3);
198 tmp0 = SRARI_SATURATE_SIGNED_H(tmp0, FILTER_BITS, 7);
199
200 horiz_out8 = HORIZ_8TAP_FILT(src8, mask0, mask1, mask2, mask3, filt_horiz0,
201 filt_horiz1, filt_horiz2, filt_horiz3);
202
203 out7 = (v8i16)__msa_ilvev_b((v16i8)horiz_out8, (v16i8)horiz_out7);
204 tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vert0, filt_vert1,
205 filt_vert2, filt_vert3);
206 tmp1 = SRARI_SATURATE_SIGNED_H(tmp1, FILTER_BITS, 7);
207
208 horiz_out9 = HORIZ_8TAP_FILT(src9, mask0, mask1, mask2, mask3, filt_horiz0,
209 filt_horiz1, filt_horiz2, filt_horiz3);
210
211 out8 = (v8i16)__msa_ilvev_b((v16i8)horiz_out9, (v16i8)horiz_out8);
212 tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vert0, filt_vert1,
213 filt_vert2, filt_vert3);
214 tmp2 = SRARI_SATURATE_SIGNED_H(tmp2, FILTER_BITS, 7);
215
216 horiz_out10 = HORIZ_8TAP_FILT(src10, mask0, mask1, mask2, mask3,
217 filt_horiz0, filt_horiz1, filt_horiz2,
218 filt_horiz3);
219
220 out9 = (v8i16)__msa_ilvev_b((v16i8)horiz_out10, (v16i8)horiz_out9);
221 tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vert0, filt_vert1,
222 filt_vert2, filt_vert3);
223 tmp3 = SRARI_SATURATE_SIGNED_H(tmp3, FILTER_BITS, 7);
224
225 PCKEV_B_4_XORI128_STORE_8_BYTES_4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
226 dst += (4 * dst_stride);
227
228 horiz_out6 = horiz_out10;
229
230 out0 = out2;
231 out1 = out3;
232 out2 = out8;
233 out4 = out6;
234 out5 = out7;
235 out6 = out9;
236 }
237 }
238
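/* The 16-, 32- and 64-wide 8-tap cases below tile the 8-wide routine across
 * 2, 4 or 8 column strips of the block. */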
239 static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride,
240 uint8_t *dst, int32_t dst_stride,
241 int8_t *filter_horiz, int8_t *filter_vert,
242 int32_t height) {
243 int32_t multiple8_cnt;
244 for (multiple8_cnt = 2; multiple8_cnt--;) {
245 common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
246 filter_vert, height);
247 src += 8;
248 dst += 8;
249 }
250 }
251
252 static void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride,
253 uint8_t *dst, int32_t dst_stride,
254 int8_t *filter_horiz, int8_t *filter_vert,
255 int32_t height) {
256 int32_t multiple8_cnt;
257 for (multiple8_cnt = 4; multiple8_cnt--;) {
258 common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
259 filter_vert, height);
260 src += 8;
261 dst += 8;
262 }
263 }
264
265 static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride,
266 uint8_t *dst, int32_t dst_stride,
267 int8_t *filter_horiz, int8_t *filter_vert,
268 int32_t height) {
269 int32_t multiple8_cnt;
270 for (multiple8_cnt = 8; multiple8_cnt--;) {
271 common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
272 filter_vert, height);
273 src += 8;
274 dst += 8;
275 }
276 }
277
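/* Bilinear (2-tap horizontal + 2-tap vertical) case for a 4x4 block: each
 * pass is a single unsigned dot product followed by round/saturate, and the
 * four output rows are stored as 32-bit words. */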
278 static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride,
279 uint8_t *dst, int32_t dst_stride,
280 int8_t *filter_horiz,
281 int8_t *filter_vert) {
282 uint32_t out0, out1, out2, out3;
283 v16i8 src0, src1, src2, src3, src4, mask;
284 v16u8 res0, res1, horiz_vec;
285 v16u8 filt_vert, filt_horiz, vec0, vec1;
286 v8u16 filt, tmp0, tmp1;
287 v8u16 horiz_out0, horiz_out1, horiz_out2, horiz_out3, horiz_out4;
288
289 mask = LOAD_SB(&mc_filt_mask_arr[16]);
290
291 /* rearranging filter */
292 filt = LOAD_UH(filter_horiz);
293 filt_horiz = (v16u8)__msa_splati_h((v8i16)filt, 0);
294
295 filt = LOAD_UH(filter_vert);
296 filt_vert = (v16u8)__msa_splati_h((v8i16)filt, 0);
297
298 LOAD_5VECS_SB(src, src_stride, src0, src1, src2, src3, src4);
299
300 horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src0);
301 horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
302 horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
303
304 horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src2);
305 horiz_out2 = __msa_dotp_u_h(horiz_vec, filt_horiz);
306 horiz_out2 = SRARI_SATURATE_UNSIGNED_H(horiz_out2, FILTER_BITS, 7);
307
308 horiz_vec = (v16u8)__msa_vshf_b(mask, src4, src4);
309 horiz_out4 = __msa_dotp_u_h(horiz_vec, filt_horiz);
310 horiz_out4 = SRARI_SATURATE_UNSIGNED_H(horiz_out4, FILTER_BITS, 7);
311
312 horiz_out1 = (v8u16)__msa_sldi_b((v16i8)horiz_out2, (v16i8)horiz_out0, 8);
313 horiz_out3 = (v8u16)__msa_pckod_d((v2i64)horiz_out4, (v2i64)horiz_out2);
314
315 vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
316 vec1 = (v16u8)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2);
317
318 tmp0 = __msa_dotp_u_h(vec0, filt_vert);
319 tmp1 = __msa_dotp_u_h(vec1, filt_vert);
320 tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
321 tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
322
323 res0 = (v16u8)__msa_pckev_b((v16i8)tmp0, (v16i8)tmp0);
324 res1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp1);
325
326 out0 = __msa_copy_u_w((v4i32)res0, 0);
327 out1 = __msa_copy_u_w((v4i32)res0, 1);
328 out2 = __msa_copy_u_w((v4i32)res1, 0);
329 out3 = __msa_copy_u_w((v4i32)res1, 1);
330
331 STORE_WORD(dst, out0);
332 dst += dst_stride;
333 STORE_WORD(dst, out1);
334 dst += dst_stride;
335 STORE_WORD(dst, out2);
336 dst += dst_stride;
337 STORE_WORD(dst, out3);
338 }
339
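/* Bilinear 4x8 block: same scheme as the 4x4 case, unrolled to produce 8
 * output rows from 9 loaded source rows. */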
340 static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
341 uint8_t *dst, int32_t dst_stride,
342 int8_t *filter_horiz,
343 int8_t *filter_vert) {
344 uint32_t out0, out1, out2, out3;
345 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
346 v16u8 filt_horiz, filt_vert, horiz_vec;
347 v16u8 vec0, vec1, vec2, vec3;
348 v8u16 horiz_out0, horiz_out1, horiz_out2, horiz_out3;
349 v8u16 vec4, vec5, vec6, vec7, filt;
350 v8u16 horiz_out4, horiz_out5, horiz_out6, horiz_out7, horiz_out8;
351 v16i8 res0, res1, res2, res3;
352
353 mask = LOAD_SB(&mc_filt_mask_arr[16]);
354
355 /* rearranging filter */
356 filt = LOAD_UH(filter_horiz);
357 filt_horiz = (v16u8)__msa_splati_h((v8i16)filt, 0);
358
359 filt = LOAD_UH(filter_vert);
360 filt_vert = (v16u8)__msa_splati_h((v8i16)filt, 0);
361
362 LOAD_8VECS_SB(src, src_stride,
363 src0, src1, src2, src3, src4, src5, src6, src7);
364 src += (8 * src_stride);
365 src8 = LOAD_SB(src);
366
367 horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src0);
368 horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
369 horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
370
371 horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src2);
372 horiz_out2 = __msa_dotp_u_h(horiz_vec, filt_horiz);
373 horiz_out2 = SRARI_SATURATE_UNSIGNED_H(horiz_out2, FILTER_BITS, 7);
374
375 horiz_vec = (v16u8)__msa_vshf_b(mask, src5, src4);
376 horiz_out4 = __msa_dotp_u_h(horiz_vec, filt_horiz);
377 horiz_out4 = SRARI_SATURATE_UNSIGNED_H(horiz_out4, FILTER_BITS, 7);
378
379 horiz_vec = (v16u8)__msa_vshf_b(mask, src7, src6);
380 horiz_out6 = __msa_dotp_u_h(horiz_vec, filt_horiz);
381 horiz_out6 = SRARI_SATURATE_UNSIGNED_H(horiz_out6, FILTER_BITS, 7);
382
383 horiz_vec = (v16u8)__msa_vshf_b(mask, src8, src8);
384 horiz_out8 = __msa_dotp_u_h(horiz_vec, filt_horiz);
385 horiz_out8 = SRARI_SATURATE_UNSIGNED_H(horiz_out8, FILTER_BITS, 7);
386
387 horiz_out1 = (v8u16)__msa_sldi_b((v16i8)horiz_out2, (v16i8)horiz_out0, 8);
388 horiz_out3 = (v8u16)__msa_sldi_b((v16i8)horiz_out4, (v16i8)horiz_out2, 8);
389 horiz_out5 = (v8u16)__msa_sldi_b((v16i8)horiz_out6, (v16i8)horiz_out4, 8);
390 horiz_out7 = (v8u16)__msa_pckod_d((v2i64)horiz_out8, (v2i64)horiz_out6);
391
392 vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
393 vec1 = (v16u8)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2);
394 vec2 = (v16u8)__msa_ilvev_b((v16i8)horiz_out5, (v16i8)horiz_out4);
395 vec3 = (v16u8)__msa_ilvev_b((v16i8)horiz_out7, (v16i8)horiz_out6);
396
397 vec4 = __msa_dotp_u_h(vec0, filt_vert);
398 vec5 = __msa_dotp_u_h(vec1, filt_vert);
399 vec6 = __msa_dotp_u_h(vec2, filt_vert);
400 vec7 = __msa_dotp_u_h(vec3, filt_vert);
401
402 vec4 = SRARI_SATURATE_UNSIGNED_H(vec4, FILTER_BITS, 7);
403 vec5 = SRARI_SATURATE_UNSIGNED_H(vec5, FILTER_BITS, 7);
404 vec6 = SRARI_SATURATE_UNSIGNED_H(vec6, FILTER_BITS, 7);
405 vec7 = SRARI_SATURATE_UNSIGNED_H(vec7, FILTER_BITS, 7);
406
407 res0 = __msa_pckev_b((v16i8)vec4, (v16i8)vec4);
408 res1 = __msa_pckev_b((v16i8)vec5, (v16i8)vec5);
409 res2 = __msa_pckev_b((v16i8)vec6, (v16i8)vec6);
410 res3 = __msa_pckev_b((v16i8)vec7, (v16i8)vec7);
411
412 out0 = __msa_copy_u_w((v4i32)res0, 0);
413 out1 = __msa_copy_u_w((v4i32)res0, 1);
414 out2 = __msa_copy_u_w((v4i32)res1, 0);
415 out3 = __msa_copy_u_w((v4i32)res1, 1);
416
417 STORE_WORD(dst, out0);
418 dst += dst_stride;
419 STORE_WORD(dst, out1);
420 dst += dst_stride;
421 STORE_WORD(dst, out2);
422 dst += dst_stride;
423 STORE_WORD(dst, out3);
424 dst += dst_stride;
425
426 out0 = __msa_copy_u_w((v4i32)res2, 0);
427 out1 = __msa_copy_u_w((v4i32)res2, 1);
428 out2 = __msa_copy_u_w((v4i32)res3, 0);
429 out3 = __msa_copy_u_w((v4i32)res3, 1);
430
431 STORE_WORD(dst, out0);
432 dst += dst_stride;
433 STORE_WORD(dst, out1);
434 dst += dst_stride;
435 STORE_WORD(dst, out2);
436 dst += dst_stride;
437 STORE_WORD(dst, out3);
438 }
439
440 static void common_hv_2ht_2vt_4w_msa(const uint8_t *src, int32_t src_stride,
441 uint8_t *dst, int32_t dst_stride,
442 int8_t *filter_horiz,
443 int8_t *filter_vert,
444 int32_t height) {
445 if (4 == height) {
446 common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride,
447 filter_horiz, filter_vert);
448 } else if (8 == height) {
449 common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride,
450 filter_horiz, filter_vert);
451 }
452 }
453
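/* Bilinear 8x4 block: one horizontal pass per source row, vertical pass on
 * pairs of consecutive horizontal outputs, 4 rows of 8 bytes stored at once. */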
454 static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride,
455 uint8_t *dst, int32_t dst_stride,
456 int8_t *filter_horiz,
457 int8_t *filter_vert) {
458 v16i8 src0, src1, src2, src3, src4, mask;
459 v16u8 filt_horiz, filt_vert, horiz_vec;
460 v16u8 vec0, vec1, vec2, vec3;
461 v8u16 horiz_out0, horiz_out1;
462 v8u16 tmp0, tmp1, tmp2, tmp3;
463 v8i16 filt;
464
465 mask = LOAD_SB(&mc_filt_mask_arr[0]);
466
467 /* rearranging filter */
468 filt = LOAD_SH(filter_horiz);
469 filt_horiz = (v16u8)__msa_splati_h(filt, 0);
470
471 filt = LOAD_SH(filter_vert);
472 filt_vert = (v16u8)__msa_splati_h(filt, 0);
473
474 LOAD_5VECS_SB(src, src_stride, src0, src1, src2, src3, src4);
475 src += (5 * src_stride);
476
477 horiz_vec = (v16u8)__msa_vshf_b(mask, src0, src0);
478 horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
479 horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
480
481 horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src1);
482 horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
483 horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7);
484
485 vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
486 tmp0 = __msa_dotp_u_h(vec0, filt_vert);
487
488 horiz_vec = (v16u8)__msa_vshf_b(mask, src2, src2);
489 horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
490 horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
491
492 vec1 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
493 tmp1 = __msa_dotp_u_h(vec1, filt_vert);
494
495 horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src3);
496 horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
497 horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7);
498
499 vec2 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
500 tmp2 = __msa_dotp_u_h(vec2, filt_vert);
501
502 horiz_vec = (v16u8)__msa_vshf_b(mask, src4, src4);
503 horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
504 horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
505
506 vec3 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
507 tmp3 = __msa_dotp_u_h(vec3, filt_vert);
508
509 tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
510 tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
511 tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
512 tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
513
514 PCKEV_B_STORE_8_BYTES_4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
515 }
516
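/* Bilinear 8-wide case for heights that are multiples of 8: the last
 * horizontal output is kept live across iterations and 8 output rows are
 * written per loop. */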
517 static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src,
518 int32_t src_stride,
519 uint8_t *dst,
520 int32_t dst_stride,
521 int8_t *filter_horiz,
522 int8_t *filter_vert,
523 int32_t height) {
524 uint32_t loop_cnt;
525 v16i8 src0, src1, src2, src3, src4, mask;
526 v16u8 filt_horiz, filt_vert, vec0, horiz_vec;
527 v8u16 horiz_out0, horiz_out1;
528 v8u16 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
529 v8i16 filt;
530
531 mask = LOAD_SB(&mc_filt_mask_arr[0]);
532
533 /* rearranging filter */
534 filt = LOAD_SH(filter_horiz);
535 filt_horiz = (v16u8)__msa_splati_h(filt, 0);
536
537 filt = LOAD_SH(filter_vert);
538 filt_vert = (v16u8)__msa_splati_h(filt, 0);
539
540 src0 = LOAD_SB(src);
541 src += src_stride;
542
543 horiz_vec = (v16u8)__msa_vshf_b(mask, src0, src0);
544 horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
545 horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
546
547 for (loop_cnt = (height >> 3); loop_cnt--;) {
548 LOAD_4VECS_SB(src, src_stride, src1, src2, src3, src4);
549 src += (4 * src_stride);
550
551 horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src1);
552 horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
553 horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7);
554
555 vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
556 tmp1 = __msa_dotp_u_h(vec0, filt_vert);
557
558 horiz_vec = (v16u8)__msa_vshf_b(mask, src2, src2);
559 horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
560 horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
561
562 vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
563 tmp2 = (v8u16)__msa_dotp_u_h(vec0, filt_vert);
564
565 tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
566 tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
567
568 horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src3);
569 horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
570 horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7);
571
572 vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
573 tmp3 = __msa_dotp_u_h(vec0, filt_vert);
574
575 horiz_vec = (v16u8)__msa_vshf_b(mask, src4, src4);
576 horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
577 horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
578
579 LOAD_4VECS_SB(src, src_stride, src1, src2, src3, src4);
580 src += (4 * src_stride);
581
582 vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
583 tmp4 = __msa_dotp_u_h(vec0, filt_vert);
584
585 tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
586 tmp4 = SRARI_SATURATE_UNSIGNED_H(tmp4, FILTER_BITS, 7);
587
588 PCKEV_B_STORE_8_BYTES_4(tmp1, tmp2, tmp3, tmp4, dst, dst_stride);
589 dst += (4 * dst_stride);
590
591 horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src1);
592 horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
593 horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7);
594
595 vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
596 tmp5 = __msa_dotp_u_h(vec0, filt_vert);
597
598 horiz_vec = (v16u8)__msa_vshf_b(mask, src2, src2);
599 horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
600 horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
601
602 vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
603 tmp6 = __msa_dotp_u_h(vec0, filt_vert);
604
605 horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src3);
606 horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
607 horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7);
608
609 vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
610 tmp7 = __msa_dotp_u_h(vec0, filt_vert);
611
612 horiz_vec = (v16u8)__msa_vshf_b(mask, src4, src4);
613 horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
614 horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
615
616 vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
617 tmp8 = __msa_dotp_u_h(vec0, filt_vert);
618
619 tmp5 = SRARI_SATURATE_UNSIGNED_H(tmp5, FILTER_BITS, 7);
620 tmp6 = SRARI_SATURATE_UNSIGNED_H(tmp6, FILTER_BITS, 7);
621 tmp7 = SRARI_SATURATE_UNSIGNED_H(tmp7, FILTER_BITS, 7);
622 tmp8 = SRARI_SATURATE_UNSIGNED_H(tmp8, FILTER_BITS, 7);
623
624 PCKEV_B_STORE_8_BYTES_4(tmp5, tmp6, tmp7, tmp8, dst, dst_stride);
625 dst += (4 * dst_stride);
626 }
627 }
628
629 static void common_hv_2ht_2vt_8w_msa(const uint8_t *src, int32_t src_stride,
630 uint8_t *dst, int32_t dst_stride,
631 int8_t *filter_horiz, int8_t *filter_vert,
632 int32_t height) {
633 if (4 == height) {
634 common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
635 filter_vert);
636 } else {
637 common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
638 filter_horiz, filter_vert, height);
639 }
640 }
641
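/* Bilinear 16-wide case: each row is processed as two 8-pixel halves
 * (src + 0 and src + 8), with one 16-byte store per output row and 4 rows
 * handled per loop iteration. */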
642 static void common_hv_2ht_2vt_16w_msa(const uint8_t *src, int32_t src_stride,
643 uint8_t *dst, int32_t dst_stride,
644 int8_t *filter_horiz, int8_t *filter_vert,
645 int32_t height) {
646 uint32_t loop_cnt;
647 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
648 v16u8 filt_horiz, filt_vert, vec0, horiz_vec;
649 v8u16 horiz_vec0, horiz_vec1, tmp1, tmp2;
650 v8u16 horiz_out0, horiz_out1, horiz_out2, horiz_out3;
651 v8i16 filt;
652
653 mask = LOAD_SB(&mc_filt_mask_arr[0]);
654
655 /* rearranging filter */
656 filt = LOAD_SH(filter_horiz);
657 filt_horiz = (v16u8)__msa_splati_h(filt, 0);
658
659 filt = LOAD_SH(filter_vert);
660 filt_vert = (v16u8)__msa_splati_h(filt, 0);
661
662 src0 = LOAD_SB(src);
663 src1 = LOAD_SB(src + 8);
664
665 horiz_vec = (v16u8)__msa_vshf_b(mask, src0, src0);
666 horiz_vec0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
667 horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_vec0, FILTER_BITS, 7);
668
669 horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src1);
670 horiz_vec1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
671 horiz_out2 = SRARI_SATURATE_UNSIGNED_H(horiz_vec1, FILTER_BITS, 7);
672
673 src += src_stride;
674
675 for (loop_cnt = (height >> 2); loop_cnt--;) {
676 LOAD_4VECS_SB(src, src_stride, src0, src2, src4, src6);
677 LOAD_4VECS_SB(src + 8, src_stride, src1, src3, src5, src7);
678 src += (4 * src_stride);
679
680 horiz_vec = (v16u8)__msa_vshf_b(mask, src0, src0);
681 horiz_vec0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
682 horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_vec0, FILTER_BITS, 7);
683
684 horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src1);
685 horiz_vec1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
686 horiz_out3 = SRARI_SATURATE_UNSIGNED_H(horiz_vec1, FILTER_BITS, 7);
687
688 vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
689 tmp1 = __msa_dotp_u_h(vec0, filt_vert);
690 vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2);
691 tmp2 = __msa_dotp_u_h(vec0, filt_vert);
692 tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
693 tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
694
695 PCKEV_B_STORE_VEC(tmp2, tmp1, dst);
696 dst += dst_stride;
697
698 horiz_vec = (v16u8)__msa_vshf_b(mask, src2, src2);
699 horiz_vec0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
700 horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_vec0, FILTER_BITS, 7);
701
702 horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src3);
703 horiz_vec1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
704 horiz_out2 = SRARI_SATURATE_UNSIGNED_H(horiz_vec1, FILTER_BITS, 7);
705
706 vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
707 tmp1 = __msa_dotp_u_h(vec0, filt_vert);
708 vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out2, (v16i8)horiz_out3);
709 tmp2 = __msa_dotp_u_h(vec0, filt_vert);
710 tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
711 tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
712
713 PCKEV_B_STORE_VEC(tmp2, tmp1, dst);
714 dst += dst_stride;
715
716 horiz_vec = (v16u8)__msa_vshf_b(mask, src4, src4);
717 horiz_vec0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
718 horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_vec0, FILTER_BITS, 7);
719
720 horiz_vec = (v16u8)__msa_vshf_b(mask, src5, src5);
721 horiz_vec1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
722 horiz_out3 = SRARI_SATURATE_UNSIGNED_H(horiz_vec1, FILTER_BITS, 7);
723
724 vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
725 tmp1 = __msa_dotp_u_h(vec0, filt_vert);
726 vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2);
727 tmp2 = __msa_dotp_u_h(vec0, filt_vert);
728 tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
729 tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
730
731 PCKEV_B_STORE_VEC(tmp2, tmp1, dst);
732 dst += dst_stride;
733
734 horiz_vec = (v16u8)__msa_vshf_b(mask, src6, src6);
735 horiz_vec0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
736 horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_vec0, FILTER_BITS, 7);
737
738 horiz_vec = (v16u8)__msa_vshf_b(mask, src7, src7);
739 horiz_vec1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
740 horiz_out2 = SRARI_SATURATE_UNSIGNED_H(horiz_vec1, FILTER_BITS, 7);
741
742 vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
743 tmp1 = __msa_dotp_u_h(vec0, filt_vert);
744 vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out2, (v16i8)horiz_out3);
745 tmp2 = __msa_dotp_u_h(vec0, filt_vert);
746 tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
747 tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
748
749 PCKEV_B_STORE_VEC(tmp2, tmp1, dst);
750 dst += dst_stride;
751 }
752 }
753
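/* The 32- and 64-wide bilinear cases tile the 16-wide routine across 2 or 4
 * column strips. */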
754 static void common_hv_2ht_2vt_32w_msa(const uint8_t *src, int32_t src_stride,
755 uint8_t *dst, int32_t dst_stride,
756 int8_t *filter_horiz, int8_t *filter_vert,
757 int32_t height) {
758 int32_t multiple8_cnt;
759 for (multiple8_cnt = 2; multiple8_cnt--;) {
760 common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz,
761 filter_vert, height);
762 src += 16;
763 dst += 16;
764 }
765 }
766
767 static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride,
768 uint8_t *dst, int32_t dst_stride,
769 int8_t *filter_horiz, int8_t *filter_vert,
770 int32_t height) {
771 int32_t multiple8_cnt;
772 for (multiple8_cnt = 4; multiple8_cnt--;) {
773 common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz,
774 filter_vert, height);
775 src += 16;
776 dst += 16;
777 }
778 }
779
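/* Top-level entry point for 2D (horizontal + vertical) convolution.
 * Falls back to vp9_convolve8_c() when either subpel step is not 16, takes
 * the copy path when both filters are the identity filter (the 0x800000
 * test checks for a lone center tap of 128), uses the bilinear routines
 * when taps 0 and 1 of both filters are zero, and otherwise dispatches on
 * block width to the 8-tap routines above. */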
780 void vp9_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride,
781 uint8_t *dst, ptrdiff_t dst_stride,
782 const int16_t *filter_x, int32_t x_step_q4,
783 const int16_t *filter_y, int32_t y_step_q4,
784 int32_t w, int32_t h) {
785 int8_t cnt, filt_hor[8], filt_ver[8];
786
787 if (16 != x_step_q4 || 16 != y_step_q4) {
788 vp9_convolve8_c(src, src_stride, dst, dst_stride,
789 filter_x, x_step_q4, filter_y, y_step_q4,
790 w, h);
791 return;
792 }
793
794 if (((const int32_t *)filter_x)[1] == 0x800000 &&
795 ((const int32_t *)filter_y)[1] == 0x800000) {
796 vp9_convolve_copy(src, src_stride, dst, dst_stride,
797 filter_x, x_step_q4, filter_y, y_step_q4,
798 w, h);
799 return;
800 }
801
802 for (cnt = 0; cnt < 8; ++cnt) {
803 filt_hor[cnt] = filter_x[cnt];
804 filt_ver[cnt] = filter_y[cnt];
805 }
806
807 if (((const int32_t *)filter_x)[0] == 0 &&
808 ((const int32_t *)filter_y)[0] == 0) {
809 switch (w) {
810 case 4:
811 common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride,
812 dst, (int32_t)dst_stride,
813 &filt_hor[3], &filt_ver[3], (int32_t)h);
814 break;
815 case 8:
816 common_hv_2ht_2vt_8w_msa(src, (int32_t)src_stride,
817 dst, (int32_t)dst_stride,
818 &filt_hor[3], &filt_ver[3], (int32_t)h);
819 break;
820 case 16:
821 common_hv_2ht_2vt_16w_msa(src, (int32_t)src_stride,
822 dst, (int32_t)dst_stride,
823 &filt_hor[3], &filt_ver[3], (int32_t)h);
824 break;
825 case 32:
826 common_hv_2ht_2vt_32w_msa(src, (int32_t)src_stride,
827 dst, (int32_t)dst_stride,
828 &filt_hor[3], &filt_ver[3], (int32_t)h);
829 break;
830 case 64:
831 common_hv_2ht_2vt_64w_msa(src, (int32_t)src_stride,
832 dst, (int32_t)dst_stride,
833 &filt_hor[3], &filt_ver[3], (int32_t)h);
834 break;
835 default:
836 vp9_convolve8_c(src, src_stride, dst, dst_stride,
837 filter_x, x_step_q4, filter_y, y_step_q4,
838 w, h);
839 break;
840 }
841 } else if (((const int32_t *)filter_x)[0] == 0 ||
842 ((const int32_t *)filter_y)[0] == 0) {
843 vp9_convolve8_c(src, src_stride, dst, dst_stride,
844 filter_x, x_step_q4, filter_y, y_step_q4,
845 w, h);
846 } else {
847 switch (w) {
848 case 4:
849 common_hv_8ht_8vt_4w_msa(src, (int32_t)src_stride,
850 dst, (int32_t)dst_stride,
851 filt_hor, filt_ver, (int32_t)h);
852 break;
853 case 8:
854 common_hv_8ht_8vt_8w_msa(src, (int32_t)src_stride,
855 dst, (int32_t)dst_stride,
856 filt_hor, filt_ver, (int32_t)h);
857 break;
858 case 16:
859 common_hv_8ht_8vt_16w_msa(src, (int32_t)src_stride,
860 dst, (int32_t)dst_stride,
861 filt_hor, filt_ver, (int32_t)h);
862 break;
863 case 32:
864 common_hv_8ht_8vt_32w_msa(src, (int32_t)src_stride,
865 dst, (int32_t)dst_stride,
866 filt_hor, filt_ver, (int32_t)h);
867 break;
868 case 64:
869 common_hv_8ht_8vt_64w_msa(src, (int32_t)src_stride,
870 dst, (int32_t)dst_stride,
871 filt_hor, filt_ver, (int32_t)h);
872 break;
873 default:
874 vp9_convolve8_c(src, src_stride, dst, dst_stride,
875 filter_x, x_step_q4, filter_y, y_step_q4,
876 w, h);
877 break;
878 }
879 }
880 }
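
For readers unfamiliar with the vectorized flow above, here is a minimal scalar sketch of the same horizontal-then-vertical 8-tap filtering, assuming FILTER_BITS == 7, 8-tap filters, and VP9's usual pixel clamping. The names (sketch_convolve8_hv, sketch_clip_pixel) are illustrative only and are not part of this file; the authoritative reference implementation is vp9_convolve8_c().

/* Illustrative scalar sketch; assumes FILTER_BITS == 7 and 8-tap filters. */
#include <stdint.h>

#define SKETCH_TAPS 8
#define SKETCH_FILTER_BITS 7

static uint8_t sketch_clip_pixel(int val) {
  return (uint8_t)(val < 0 ? 0 : (val > 255 ? 255 : val));
}

static void sketch_convolve8_hv(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride,
                                const int16_t *filter_x,
                                const int16_t *filter_y, int w, int h) {
  /* Intermediate buffer: w columns by (h + 7) rows of horizontally
   * filtered pixels, enough for the vertical taps (block sizes <= 64). */
  uint8_t tmp[64 * (64 + SKETCH_TAPS - 1)];
  const uint8_t *s = src - 3 * src_stride - 3;  /* back up 3 rows and 3 cols */
  int r, c, k;

  /* Horizontal pass over h + 7 rows. */
  for (r = 0; r < h + SKETCH_TAPS - 1; ++r) {
    for (c = 0; c < w; ++c) {
      int sum = 0;
      for (k = 0; k < SKETCH_TAPS; ++k)
        sum += s[r * src_stride + c + k] * filter_x[k];
      tmp[r * w + c] = sketch_clip_pixel(
          (sum + (1 << (SKETCH_FILTER_BITS - 1))) >> SKETCH_FILTER_BITS);
    }
  }

  /* Vertical pass over the intermediate rows. */
  for (r = 0; r < h; ++r) {
    for (c = 0; c < w; ++c) {
      int sum = 0;
      for (k = 0; k < SKETCH_TAPS; ++k)
        sum += tmp[(r + k) * w + c] * filter_y[k];
      dst[r * dst_stride + c] = sketch_clip_pixel(
          (sum + (1 << (SKETCH_FILTER_BITS - 1))) >> SKETCH_FILTER_BITS);
    }
  }
}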
