Chromium Code Reviews

Side by Side Diff: source/libvpx/vp9/common/mips/msa/vp9_convolve8_horiz_msa.c

Issue 1124333011: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: only update to last night's LKGR (created 5 years, 7 months ago)
(The old side of the diff is empty: this file is newly added in this patch set.)
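
Note: the routines in this file implement VP9's horizontal sub-pel convolution with MIPS MSA intrinsics. As a rough reference for what the SIMD code computes, here is a minimal scalar sketch of the 8-tap path, assuming FILTER_BITS == 7 (as in the vp9 convolve code); clip_pixel_sketch and convolve_horiz_sketch are hypothetical names used only for illustration, not the upstream implementation.

/*
 * Minimal scalar sketch of the horizontal 8-tap filter that the MSA
 * routines below vectorize.
 */
#include <stdint.h>

#define FILTER_BITS 7

static uint8_t clip_pixel_sketch(int val) {
  return (val < 0) ? 0 : (val > 255) ? 255 : (uint8_t)val;
}

static void convolve_horiz_sketch(const uint8_t *src, int src_stride,
                                  uint8_t *dst, int dst_stride,
                                  const int8_t *filter, int w, int h) {
  int x, y, k;

  src -= 3;  /* same re-centering as the "src -= 3" in the MSA code */

  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x) {
      int sum = 0;

      for (k = 0; k < 8; ++k)
        sum += src[x + k] * filter[k];

      /* round, shift and clamp to [0, 255]; this is what the
         SRARI_SATURATE_SIGNED_H plus XORI/PCKEV store sequence implements */
      dst[x] = clip_pixel_sketch((sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS);
    }
    src += src_stride;
    dst += dst_stride;
  }
}
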
1 /*
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "./vp9_rtcd.h"
12 #include "vp9/common/mips/msa/vp9_convolve_msa.h"
13
14 static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
15 uint8_t *dst, int32_t dst_stride,
16 int8_t *filter) {
17 v16i8 filt0, filt1, filt2, filt3;
18 v16i8 src0, src1, src2, src3;
19 v16u8 mask0, mask1, mask2, mask3;
20 v8i16 filt, out0, out1;
21
22 mask0 = LOAD_UB(&mc_filt_mask_arr[16]);
23
24 src -= 3;
25
26 /* rearranging filter */
27 filt = LOAD_SH(filter);
28 filt0 = (v16i8)__msa_splati_h(filt, 0);
29 filt1 = (v16i8)__msa_splati_h(filt, 1);
30 filt2 = (v16i8)__msa_splati_h(filt, 2);
31 filt3 = (v16i8)__msa_splati_h(filt, 3);
32
33 mask1 = mask0 + 2;
34 mask2 = mask0 + 4;
35 mask3 = mask0 + 6;
36
37 LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
38
39 XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
40
41 HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
42 filt0, filt1, filt2, filt3, out0, out1);
43
44 out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7);
45 out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7);
46
47 PCKEV_2B_XORI128_STORE_4_BYTES_4(out0, out1, dst, dst_stride);
48 }
49
50 static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
51 uint8_t *dst, int32_t dst_stride,
52 int8_t *filter) {
53 v16i8 filt0, filt1, filt2, filt3;
54 v16i8 src0, src1, src2, src3;
55 v16u8 mask0, mask1, mask2, mask3;
56 v8i16 filt, out0, out1, out2, out3;
57
58 mask0 = LOAD_UB(&mc_filt_mask_arr[16]);
59
60 src -= 3;
61
62 /* rearranging filter */
63 filt = LOAD_SH(filter);
64 filt0 = (v16i8)__msa_splati_h(filt, 0);
65 filt1 = (v16i8)__msa_splati_h(filt, 1);
66 filt2 = (v16i8)__msa_splati_h(filt, 2);
67 filt3 = (v16i8)__msa_splati_h(filt, 3);
68
69 mask1 = mask0 + 2;
70 mask2 = mask0 + 4;
71 mask3 = mask0 + 6;
72
73 LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
74 src += (4 * src_stride);
75
76 XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
77
78 HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
79 filt0, filt1, filt2, filt3, out0, out1);
80
81 LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
82
83 XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
84
85 HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
86 filt0, filt1, filt2, filt3, out2, out3);
87
88 out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7);
89 out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7);
90 out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7);
91 out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7);
92
93 PCKEV_2B_XORI128_STORE_4_BYTES_4(out0, out1, dst, dst_stride);
94 dst += (4 * dst_stride);
95 PCKEV_2B_XORI128_STORE_4_BYTES_4(out2, out3, dst, dst_stride);
96 }
97
98 static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
99 uint8_t *dst, int32_t dst_stride,
100 int8_t *filter, int32_t height) {
101 if (4 == height) {
102 common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
103 } else if (8 == height) {
104 common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
105 }
106 }
107
108 static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride,
109 uint8_t *dst, int32_t dst_stride,
110 int8_t *filter) {
111 v16i8 filt0, filt1, filt2, filt3;
112 v16i8 src0, src1, src2, src3;
113 v16u8 mask0, mask1, mask2, mask3;
114 v8i16 filt, out0, out1, out2, out3;
115
116 mask0 = LOAD_UB(&mc_filt_mask_arr[0]);
117
118 src -= 3;
119
120 /* rearranging filter */
121 filt = LOAD_SH(filter);
122 filt0 = (v16i8)__msa_splati_h(filt, 0);
123 filt1 = (v16i8)__msa_splati_h(filt, 1);
124 filt2 = (v16i8)__msa_splati_h(filt, 2);
125 filt3 = (v16i8)__msa_splati_h(filt, 3);
126
127 mask1 = mask0 + 2;
128 mask2 = mask0 + 4;
129 mask3 = mask0 + 6;
130
131 LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
132
133 XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
134
135 HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
136 filt0, filt1, filt2, filt3, out0, out1, out2,
137 out3);
138
139 out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7);
140 out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7);
141 out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7);
142 out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7);
143
144 PCKEV_B_4_XORI128_STORE_8_BYTES_4(out0, out1, out2, out3, dst, dst_stride);
145 }
146
147 static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
148 uint8_t *dst, int32_t dst_stride,
149 int8_t *filter, int32_t height) {
150 uint32_t loop_cnt;
151 v16i8 filt0, filt1, filt2, filt3;
152 v16i8 src0, src1, src2, src3;
153 v16u8 mask0, mask1, mask2, mask3;
154 v8i16 filt, out0, out1, out2, out3;
155
156 mask0 = LOAD_UB(&mc_filt_mask_arr[0]);
157
158 src -= 3;
159
160 /* rearranging filter */
161 filt = LOAD_SH(filter);
162 filt0 = (v16i8)__msa_splati_h(filt, 0);
163 filt1 = (v16i8)__msa_splati_h(filt, 1);
164 filt2 = (v16i8)__msa_splati_h(filt, 2);
165 filt3 = (v16i8)__msa_splati_h(filt, 3);
166
167 mask1 = mask0 + 2;
168 mask2 = mask0 + 4;
169 mask3 = mask0 + 6;
170
171 for (loop_cnt = (height >> 2); loop_cnt--;) {
172 LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
173 src += (4 * src_stride);
174
175 XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
176
177 HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
178 mask3, filt0, filt1, filt2, filt3, out0, out1,
179 out2, out3);
180
181 out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7);
182 out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7);
183 out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7);
184 out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7);
185
186 PCKEV_B_4_XORI128_STORE_8_BYTES_4(out0, out1, out2, out3, dst, dst_stride);
187 dst += (4 * dst_stride);
188 }
189 }
190
191 static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride,
192 uint8_t *dst, int32_t dst_stride,
193 int8_t *filter, int32_t height) {
194 if (4 == height) {
195 common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter);
196 } else {
197 common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
198 }
199 }
200
201 static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride,
202 uint8_t *dst, int32_t dst_stride,
203 int8_t *filter, int32_t height) {
204 uint32_t loop_cnt;
205 v16i8 src0, src1, src2, src3;
206 v16i8 filt0, filt1, filt2, filt3;
207 v16u8 mask0, mask1, mask2, mask3;
208 v8i16 filt, out0, out1, out2, out3;
209
210 mask0 = LOAD_UB(&mc_filt_mask_arr[0]);
211
212 src -= 3;
213
214 /* rearranging filter */
215 filt = LOAD_SH(filter);
216 filt0 = (v16i8)__msa_splati_h(filt, 0);
217 filt1 = (v16i8)__msa_splati_h(filt, 1);
218 filt2 = (v16i8)__msa_splati_h(filt, 2);
219 filt3 = (v16i8)__msa_splati_h(filt, 3);
220
221 mask1 = mask0 + 2;
222 mask2 = mask0 + 4;
223 mask3 = mask0 + 6;
224
225 for (loop_cnt = (height >> 1); loop_cnt--;) {
226 src0 = LOAD_SB(src);
227 src1 = LOAD_SB(src + 8);
228 src += src_stride;
229 src2 = LOAD_SB(src);
230 src3 = LOAD_SB(src + 8);
231 src += src_stride;
232
233 XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
234
235 HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
236 mask3, filt0, filt1, filt2, filt3, out0, out1,
237 out2, out3);
238
239 out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7);
240 out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7);
241 out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7);
242 out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7);
243
244 PCKEV_B_XORI128_STORE_VEC(out1, out0, dst);
245 dst += dst_stride;
246 PCKEV_B_XORI128_STORE_VEC(out3, out2, dst);
247 dst += dst_stride;
248 }
249 }
250
251 static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride,
252 uint8_t *dst, int32_t dst_stride,
253 int8_t *filter, int32_t height) {
254 uint32_t loop_cnt;
255 v16i8 src0, src1, src2, src3;
256 v16i8 filt0, filt1, filt2, filt3;
257 v16u8 mask0, mask1, mask2, mask3;
258 v8i16 filt, out0, out1, out2, out3;
259
260 mask0 = LOAD_UB(&mc_filt_mask_arr[0]);
261
262 src -= 3;
263
264 /* rearranging filter */
265 filt = LOAD_SH(filter);
266 filt0 = (v16i8)__msa_splati_h(filt, 0);
267 filt1 = (v16i8)__msa_splati_h(filt, 1);
268 filt2 = (v16i8)__msa_splati_h(filt, 2);
269 filt3 = (v16i8)__msa_splati_h(filt, 3);
270
271 mask1 = mask0 + 2;
272 mask2 = mask0 + 4;
273 mask3 = mask0 + 6;
274
275 for (loop_cnt = (height >> 1); loop_cnt--;) {
276 src0 = LOAD_SB(src);
277 src2 = LOAD_SB(src + 16);
278 src3 = LOAD_SB(src + 24);
279 src1 = __msa_sld_b((v16i8)src2, (v16i8)src0, 8);
280 src += src_stride;
281
282 XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
283
284 HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
285 mask3, filt0, filt1, filt2, filt3, out0, out1,
286 out2, out3);
287
288 out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7);
289 out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7);
290 out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7);
291 out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7);
292
293 src0 = LOAD_SB(src);
294 src2 = LOAD_SB(src + 16);
295 src3 = LOAD_SB(src + 24);
296 src1 = __msa_sld_b((v16i8)src2, (v16i8)src0, 8);
297
298 PCKEV_B_XORI128_STORE_VEC(out1, out0, dst);
299 PCKEV_B_XORI128_STORE_VEC(out3, out2, (dst + 16));
300 dst += dst_stride;
301
302 XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
303
304 HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
305 mask3, filt0, filt1, filt2, filt3, out0, out1,
306 out2, out3);
307
308 out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7);
309 out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7);
310 out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7);
311 out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7);
312
313 PCKEV_B_XORI128_STORE_VEC(out1, out0, dst);
314 PCKEV_B_XORI128_STORE_VEC(out3, out2, (dst + 16));
315
316 src += src_stride;
317 dst += dst_stride;
318 }
319 }
320
321 static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride,
322 uint8_t *dst, int32_t dst_stride,
323 int8_t *filter, int32_t height) {
324 uint32_t loop_cnt, cnt;
325 v16i8 src0, src1, src2, src3;
326 v16i8 filt0, filt1, filt2, filt3;
327 v16u8 mask0, mask1, mask2, mask3;
328 v8i16 filt, out0, out1, out2, out3;
329
330 mask0 = LOAD_UB(&mc_filt_mask_arr[0]);
331
332 src -= 3;
333
334 /* rearranging filter */
335 filt = LOAD_SH(filter);
336 filt0 = (v16i8)__msa_splati_h(filt, 0);
337 filt1 = (v16i8)__msa_splati_h(filt, 1);
338 filt2 = (v16i8)__msa_splati_h(filt, 2);
339 filt3 = (v16i8)__msa_splati_h(filt, 3);
340
341 mask1 = mask0 + 2;
342 mask2 = mask0 + 4;
343 mask3 = mask0 + 6;
344
345 for (loop_cnt = height; loop_cnt--;) {
346 for (cnt = 0; cnt < 2; ++cnt) {
347 src0 = LOAD_SB(&src[cnt << 5]);
348 src2 = LOAD_SB(&src[16 + (cnt << 5)]);
349 src3 = LOAD_SB(&src[24 + (cnt << 5)]);
350 src1 = __msa_sld_b((v16i8)src2, (v16i8)src0, 8);
351
352 XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
353
354 HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
355 mask3, filt0, filt1, filt2, filt3, out0, out1,
356 out2, out3);
357
358 out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7);
359 out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7);
360 out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7);
361 out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7);
362
363 PCKEV_B_XORI128_STORE_VEC(out1, out0, &dst[cnt << 5]);
364 PCKEV_B_XORI128_STORE_VEC(out3, out2, &dst[16 + (cnt << 5)]);
365 }
366
367 src += src_stride;
368 dst += dst_stride;
369 }
370 }
371
372 static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
373 uint8_t *dst, int32_t dst_stride,
374 int8_t *filter) {
375 uint32_t out0, out1, out2, out3;
376 v16i8 src0, src1, src2, src3, mask;
377 v16u8 vec0, vec1, filt0;
378 v16i8 res0, res1;
379 v8u16 vec2, vec3, filt, const255;
380
381 mask = LOAD_SB(&mc_filt_mask_arr[16]);
382
383 /* rearranging filter */
384 filt = LOAD_UH(filter);
385 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
386
387 const255 = (v8u16)__msa_ldi_h(255);
388
389 LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
390
391 vec0 = (v16u8)__msa_vshf_b(mask, src1, src0);
392 vec1 = (v16u8)__msa_vshf_b(mask, src3, src2);
393
394 vec2 = __msa_dotp_u_h(vec0, filt0);
395 vec3 = __msa_dotp_u_h(vec1, filt0);
396
397 vec2 = (v8u16)__msa_srari_h((v8i16)vec2, FILTER_BITS);
398 vec3 = (v8u16)__msa_srari_h((v8i16)vec3, FILTER_BITS);
399
400 vec2 = __msa_min_u_h(vec2, const255);
401 vec3 = __msa_min_u_h(vec3, const255);
402
403 res0 = __msa_pckev_b((v16i8)vec2, (v16i8)vec2);
404 res1 = __msa_pckev_b((v16i8)vec3, (v16i8)vec3);
405
406 out0 = __msa_copy_u_w((v4i32)res0, 0);
407 out1 = __msa_copy_u_w((v4i32)res0, 1);
408 out2 = __msa_copy_u_w((v4i32)res1, 0);
409 out3 = __msa_copy_u_w((v4i32)res1, 1);
410
411 STORE_WORD(dst, out0);
412 dst += dst_stride;
413 STORE_WORD(dst, out1);
414 dst += dst_stride;
415 STORE_WORD(dst, out2);
416 dst += dst_stride;
417 STORE_WORD(dst, out3);
418 }
419
420 static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
421 uint8_t *dst, int32_t dst_stride,
422 int8_t *filter) {
423 uint32_t out0, out1, out2, out3;
424 v16u8 filt0;
425 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
426 v16u8 vec0, vec1, vec2, vec3;
427 v8u16 vec4, vec5, vec6, vec7;
428 v16i8 res0, res1, res2, res3;
429 v8u16 filt, const255;
430
431 mask = LOAD_SB(&mc_filt_mask_arr[16]);
432
433 /* rearranging filter */
434 filt = LOAD_UH(filter);
435 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
436
437 const255 = (v8u16)__msa_ldi_h(255);
438
439 LOAD_8VECS_SB(src, src_stride,
440 src0, src1, src2, src3, src4, src5, src6, src7);
441
442 vec0 = (v16u8)__msa_vshf_b(mask, src1, src0);
443 vec1 = (v16u8)__msa_vshf_b(mask, src3, src2);
444 vec2 = (v16u8)__msa_vshf_b(mask, src5, src4);
445 vec3 = (v16u8)__msa_vshf_b(mask, src7, src6);
446
447 vec4 = __msa_dotp_u_h(vec0, filt0);
448 vec5 = __msa_dotp_u_h(vec1, filt0);
449 vec6 = __msa_dotp_u_h(vec2, filt0);
450 vec7 = __msa_dotp_u_h(vec3, filt0);
451
452 vec4 = (v8u16)__msa_srari_h((v8i16)vec4, FILTER_BITS);
453 vec5 = (v8u16)__msa_srari_h((v8i16)vec5, FILTER_BITS);
454 vec6 = (v8u16)__msa_srari_h((v8i16)vec6, FILTER_BITS);
455 vec7 = (v8u16)__msa_srari_h((v8i16)vec7, FILTER_BITS);
456
457 vec4 = __msa_min_u_h(vec4, const255);
458 vec5 = __msa_min_u_h(vec5, const255);
459 vec6 = __msa_min_u_h(vec6, const255);
460 vec7 = __msa_min_u_h(vec7, const255);
461
462 res0 = __msa_pckev_b((v16i8)vec4, (v16i8)vec4);
463 res1 = __msa_pckev_b((v16i8)vec5, (v16i8)vec5);
464 res2 = __msa_pckev_b((v16i8)vec6, (v16i8)vec6);
465 res3 = __msa_pckev_b((v16i8)vec7, (v16i8)vec7);
466
467 out0 = __msa_copy_u_w((v4i32)res0, 0);
468 out1 = __msa_copy_u_w((v4i32)res0, 1);
469 out2 = __msa_copy_u_w((v4i32)res1, 0);
470 out3 = __msa_copy_u_w((v4i32)res1, 1);
471
472 STORE_WORD(dst, out0);
473 dst += dst_stride;
474 STORE_WORD(dst, out1);
475 dst += dst_stride;
476 STORE_WORD(dst, out2);
477 dst += dst_stride;
478 STORE_WORD(dst, out3);
479 dst += dst_stride;
480
481 out0 = __msa_copy_u_w((v4i32)res2, 0);
482 out1 = __msa_copy_u_w((v4i32)res2, 1);
483 out2 = __msa_copy_u_w((v4i32)res3, 0);
484 out3 = __msa_copy_u_w((v4i32)res3, 1);
485
486 STORE_WORD(dst, out0);
487 dst += dst_stride;
488 STORE_WORD(dst, out1);
489 dst += dst_stride;
490 STORE_WORD(dst, out2);
491 dst += dst_stride;
492 STORE_WORD(dst, out3);
493 }
494
495 static void common_hz_2t_4w_msa(const uint8_t *src, int32_t src_stride,
496 uint8_t *dst, int32_t dst_stride,
497 int8_t *filter, int32_t height) {
498 if (4 == height) {
499 common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
500 } else if (8 == height) {
501 common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
502 }
503 }
504
505 static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
506 uint8_t *dst, int32_t dst_stride,
507 int8_t *filter) {
508 v16u8 filt0;
509 v16i8 src0, src1, src2, src3, mask;
510 v8u16 vec0, vec1, vec2, vec3;
511 v8u16 out0, out1, out2, out3;
512 v8u16 const255, filt;
513
514 mask = LOAD_SB(&mc_filt_mask_arr[0]);
515
516 /* rearranging filter */
517 filt = LOAD_UH(filter);
518 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
519
520 const255 = (v8u16)__msa_ldi_h(255);
521
522 LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
523
524 vec0 = (v8u16)__msa_vshf_b(mask, src0, src0);
525 vec1 = (v8u16)__msa_vshf_b(mask, src1, src1);
526 vec2 = (v8u16)__msa_vshf_b(mask, src2, src2);
527 vec3 = (v8u16)__msa_vshf_b(mask, src3, src3);
528
529 vec0 = __msa_dotp_u_h((v16u8)vec0, filt0);
530 vec1 = __msa_dotp_u_h((v16u8)vec1, filt0);
531 vec2 = __msa_dotp_u_h((v16u8)vec2, filt0);
532 vec3 = __msa_dotp_u_h((v16u8)vec3, filt0);
533
534 SRARI_H_4VECS_UH(vec0, vec1, vec2, vec3, vec0, vec1, vec2, vec3, FILTER_BITS);
535
536 out0 = __msa_min_u_h(vec0, const255);
537 out1 = __msa_min_u_h(vec1, const255);
538 out2 = __msa_min_u_h(vec2, const255);
539 out3 = __msa_min_u_h(vec3, const255);
540
541 PCKEV_B_STORE_8_BYTES_4(out0, out1, out2, out3, dst, dst_stride);
542 }
543
544 static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
545 uint8_t *dst, int32_t dst_stride,
546 int8_t *filter, int32_t height) {
547 v16u8 filt0;
548 v16i8 src0, src1, src2, src3, mask;
549 v8u16 vec0, vec1, vec2, vec3;
550 v8u16 filt, const255;
551
552 mask = LOAD_SB(&mc_filt_mask_arr[0]);
553
554 /* rearranging filter */
555 filt = LOAD_UH(filter);
556 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
557
558 const255 = (v8u16)__msa_ldi_h(255);
559
560 LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
561 src += (4 * src_stride);
562
563 vec0 = (v8u16)__msa_vshf_b(mask, src0, src0);
564 vec1 = (v8u16)__msa_vshf_b(mask, src1, src1);
565 vec2 = (v8u16)__msa_vshf_b(mask, src2, src2);
566 vec3 = (v8u16)__msa_vshf_b(mask, src3, src3);
567
568 vec0 = __msa_dotp_u_h((v16u8)vec0, filt0);
569 vec1 = __msa_dotp_u_h((v16u8)vec1, filt0);
570 vec2 = __msa_dotp_u_h((v16u8)vec2, filt0);
571 vec3 = __msa_dotp_u_h((v16u8)vec3, filt0);
572
573 SRARI_H_4VECS_UH(vec0, vec1, vec2, vec3, vec0, vec1, vec2, vec3, FILTER_BITS);
574
575 vec0 = __msa_min_u_h(vec0, const255);
576 vec1 = __msa_min_u_h(vec1, const255);
577 vec2 = __msa_min_u_h(vec2, const255);
578 vec3 = __msa_min_u_h(vec3, const255);
579
580 LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
581 src += (4 * src_stride);
582
583 PCKEV_B_STORE_8_BYTES_4(vec0, vec1, vec2, vec3, dst, dst_stride);
584 dst += (4 * dst_stride);
585
586 vec0 = (v8u16)__msa_vshf_b(mask, src0, src0);
587 vec1 = (v8u16)__msa_vshf_b(mask, src1, src1);
588 vec2 = (v8u16)__msa_vshf_b(mask, src2, src2);
589 vec3 = (v8u16)__msa_vshf_b(mask, src3, src3);
590
591 vec0 = __msa_dotp_u_h((v16u8)vec0, filt0);
592 vec1 = __msa_dotp_u_h((v16u8)vec1, filt0);
593 vec2 = __msa_dotp_u_h((v16u8)vec2, filt0);
594 vec3 = __msa_dotp_u_h((v16u8)vec3, filt0);
595
596 SRARI_H_4VECS_UH(vec0, vec1, vec2, vec3, vec0, vec1, vec2, vec3, FILTER_BITS);
597
598 vec0 = __msa_min_u_h(vec0, const255);
599 vec1 = __msa_min_u_h(vec1, const255);
600 vec2 = __msa_min_u_h(vec2, const255);
601 vec3 = __msa_min_u_h(vec3, const255);
602
603 PCKEV_B_STORE_8_BYTES_4(vec0, vec1, vec2, vec3, dst, dst_stride);
604 dst += (4 * dst_stride);
605
606 if (16 == height) {
607 LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
608 src += (4 * src_stride);
609
610 vec0 = (v8u16)__msa_vshf_b(mask, src0, src0);
611 vec1 = (v8u16)__msa_vshf_b(mask, src1, src1);
612 vec2 = (v8u16)__msa_vshf_b(mask, src2, src2);
613 vec3 = (v8u16)__msa_vshf_b(mask, src3, src3);
614
615 vec0 = __msa_dotp_u_h((v16u8)vec0, filt0);
616 vec1 = __msa_dotp_u_h((v16u8)vec1, filt0);
617 vec2 = __msa_dotp_u_h((v16u8)vec2, filt0);
618 vec3 = __msa_dotp_u_h((v16u8)vec3, filt0);
619
620 SRARI_H_4VECS_UH(vec0, vec1, vec2, vec3,
621 vec0, vec1, vec2, vec3, FILTER_BITS);
622
623 vec0 = __msa_min_u_h(vec0, const255);
624 vec1 = __msa_min_u_h(vec1, const255);
625 vec2 = __msa_min_u_h(vec2, const255);
626 vec3 = __msa_min_u_h(vec3, const255);
627
628 LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
629 src += (4 * src_stride);
630
631 PCKEV_B_STORE_8_BYTES_4(vec0, vec1, vec2, vec3, dst, dst_stride);
632 dst += (4 * dst_stride);
633
634 vec0 = (v8u16)__msa_vshf_b(mask, src0, src0);
635 vec1 = (v8u16)__msa_vshf_b(mask, src1, src1);
636 vec2 = (v8u16)__msa_vshf_b(mask, src2, src2);
637 vec3 = (v8u16)__msa_vshf_b(mask, src3, src3);
638
639 vec0 = __msa_dotp_u_h((v16u8)vec0, filt0);
640 vec1 = __msa_dotp_u_h((v16u8)vec1, filt0);
641 vec2 = __msa_dotp_u_h((v16u8)vec2, filt0);
642 vec3 = __msa_dotp_u_h((v16u8)vec3, filt0);
643
644 SRARI_H_4VECS_UH(vec0, vec1, vec2, vec3,
645 vec0, vec1, vec2, vec3, FILTER_BITS);
646
647 vec0 = __msa_min_u_h(vec0, const255);
648 vec1 = __msa_min_u_h(vec1, const255);
649 vec2 = __msa_min_u_h(vec2, const255);
650 vec3 = __msa_min_u_h(vec3, const255);
651
652 PCKEV_B_STORE_8_BYTES_4(vec0, vec1, vec2, vec3, dst, dst_stride);
653 }
654 }
655
656 static void common_hz_2t_8w_msa(const uint8_t *src, int32_t src_stride,
657 uint8_t *dst, int32_t dst_stride,
658 int8_t *filter, int32_t height) {
659 if (4 == height) {
660 common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
661 } else {
662 common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
663 }
664 }
665
666 static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride,
667 uint8_t *dst, int32_t dst_stride,
668 int8_t *filter, int32_t height) {
669 uint32_t loop_cnt;
670 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
671 v16u8 filt0;
672 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
673 v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
674 v8u16 filt, const255;
675
676 mask = LOAD_SB(&mc_filt_mask_arr[0]);
677
678 loop_cnt = (height >> 2) - 1;
679
680 /* rearranging filter */
681 filt = LOAD_UH(filter);
682 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
683
684 const255 = (v8u16)__msa_ldi_h(255);
685
686 src0 = LOAD_SB(src);
687 src1 = LOAD_SB(src + 8);
688 src += src_stride;
689 src2 = LOAD_SB(src);
690 src3 = LOAD_SB(src + 8);
691 src += src_stride;
692 src4 = LOAD_SB(src);
693 src5 = LOAD_SB(src + 8);
694 src += src_stride;
695 src6 = LOAD_SB(src);
696 src7 = LOAD_SB(src + 8);
697 src += src_stride;
698
699 vec0 = (v16u8)__msa_vshf_b(mask, src0, src0);
700 vec1 = (v16u8)__msa_vshf_b(mask, src1, src1);
701 vec2 = (v16u8)__msa_vshf_b(mask, src2, src2);
702 vec3 = (v16u8)__msa_vshf_b(mask, src3, src3);
703 vec4 = (v16u8)__msa_vshf_b(mask, src4, src4);
704 vec5 = (v16u8)__msa_vshf_b(mask, src5, src5);
705 vec6 = (v16u8)__msa_vshf_b(mask, src6, src6);
706 vec7 = (v16u8)__msa_vshf_b(mask, src7, src7);
707
708 out0 = __msa_dotp_u_h(vec0, filt0);
709 out1 = __msa_dotp_u_h(vec1, filt0);
710 out2 = __msa_dotp_u_h(vec2, filt0);
711 out3 = __msa_dotp_u_h(vec3, filt0);
712 out4 = __msa_dotp_u_h(vec4, filt0);
713 out5 = __msa_dotp_u_h(vec5, filt0);
714 out6 = __msa_dotp_u_h(vec6, filt0);
715 out7 = __msa_dotp_u_h(vec7, filt0);
716
717 out0 = (v8u16)__msa_srari_h((v8i16)out0, FILTER_BITS);
718 out1 = (v8u16)__msa_srari_h((v8i16)out1, FILTER_BITS);
719 out2 = (v8u16)__msa_srari_h((v8i16)out2, FILTER_BITS);
720 out3 = (v8u16)__msa_srari_h((v8i16)out3, FILTER_BITS);
721 out4 = (v8u16)__msa_srari_h((v8i16)out4, FILTER_BITS);
722 out5 = (v8u16)__msa_srari_h((v8i16)out5, FILTER_BITS);
723 out6 = (v8u16)__msa_srari_h((v8i16)out6, FILTER_BITS);
724 out7 = (v8u16)__msa_srari_h((v8i16)out7, FILTER_BITS);
725
726 out0 = __msa_min_u_h(out0, const255);
727 out1 = __msa_min_u_h(out1, const255);
728 out2 = __msa_min_u_h(out2, const255);
729 out3 = __msa_min_u_h(out3, const255);
730 out4 = __msa_min_u_h(out4, const255);
731 out5 = __msa_min_u_h(out5, const255);
732 out6 = __msa_min_u_h(out6, const255);
733 out7 = __msa_min_u_h(out7, const255);
734
735 PCKEV_B_STORE_VEC(out1, out0, dst);
736 dst += dst_stride;
737 PCKEV_B_STORE_VEC(out3, out2, dst);
738 dst += dst_stride;
739 PCKEV_B_STORE_VEC(out5, out4, dst);
740 dst += dst_stride;
741 PCKEV_B_STORE_VEC(out7, out6, dst);
742 dst += dst_stride;
743
744 for (; loop_cnt--;) {
745 src0 = LOAD_SB(src);
746 src1 = LOAD_SB(src + 8);
747 src += src_stride;
748 src2 = LOAD_SB(src);
749 src3 = LOAD_SB(src + 8);
750 src += src_stride;
751 src4 = LOAD_SB(src);
752 src5 = LOAD_SB(src + 8);
753 src += src_stride;
754 src6 = LOAD_SB(src);
755 src7 = LOAD_SB(src + 8);
756 src += src_stride;
757
758 vec0 = (v16u8)__msa_vshf_b(mask, src0, src0);
759 vec1 = (v16u8)__msa_vshf_b(mask, src1, src1);
760 vec2 = (v16u8)__msa_vshf_b(mask, src2, src2);
761 vec3 = (v16u8)__msa_vshf_b(mask, src3, src3);
762 vec4 = (v16u8)__msa_vshf_b(mask, src4, src4);
763 vec5 = (v16u8)__msa_vshf_b(mask, src5, src5);
764 vec6 = (v16u8)__msa_vshf_b(mask, src6, src6);
765 vec7 = (v16u8)__msa_vshf_b(mask, src7, src7);
766
767 out0 = __msa_dotp_u_h(vec0, filt0);
768 out1 = __msa_dotp_u_h(vec1, filt0);
769 out2 = __msa_dotp_u_h(vec2, filt0);
770 out3 = __msa_dotp_u_h(vec3, filt0);
771 out4 = __msa_dotp_u_h(vec4, filt0);
772 out5 = __msa_dotp_u_h(vec5, filt0);
773 out6 = __msa_dotp_u_h(vec6, filt0);
774 out7 = __msa_dotp_u_h(vec7, filt0);
775
776 out0 = (v8u16)__msa_srari_h((v8i16)out0, FILTER_BITS);
777 out1 = (v8u16)__msa_srari_h((v8i16)out1, FILTER_BITS);
778 out2 = (v8u16)__msa_srari_h((v8i16)out2, FILTER_BITS);
779 out3 = (v8u16)__msa_srari_h((v8i16)out3, FILTER_BITS);
780 out4 = (v8u16)__msa_srari_h((v8i16)out4, FILTER_BITS);
781 out5 = (v8u16)__msa_srari_h((v8i16)out5, FILTER_BITS);
782 out6 = (v8u16)__msa_srari_h((v8i16)out6, FILTER_BITS);
783 out7 = (v8u16)__msa_srari_h((v8i16)out7, FILTER_BITS);
784
785 out0 = __msa_min_u_h(out0, const255);
786 out1 = __msa_min_u_h(out1, const255);
787 out2 = __msa_min_u_h(out2, const255);
788 out3 = __msa_min_u_h(out3, const255);
789 out4 = __msa_min_u_h(out4, const255);
790 out5 = __msa_min_u_h(out5, const255);
791 out6 = __msa_min_u_h(out6, const255);
792 out7 = __msa_min_u_h(out7, const255);
793
794 PCKEV_B_STORE_VEC(out1, out0, dst);
795 dst += dst_stride;
796 PCKEV_B_STORE_VEC(out3, out2, dst);
797 dst += dst_stride;
798 PCKEV_B_STORE_VEC(out5, out4, dst);
799 dst += dst_stride;
800 PCKEV_B_STORE_VEC(out7, out6, dst);
801 dst += dst_stride;
802 }
803 }
804
805 static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride,
806 uint8_t *dst, int32_t dst_stride,
807 int8_t *filter, int32_t height) {
808 uint32_t loop_cnt;
809 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
810 v16u8 filt0;
811 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
812 v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
813 v8u16 filt, const255;
814
815 mask = LOAD_SB(&mc_filt_mask_arr[0]);
816
817 /* rearranging filter */
818 filt = LOAD_UH(filter);
819 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
820
821 const255 = (v8u16)__msa_ldi_h(255);
822
823 for (loop_cnt = height >> 1; loop_cnt--;) {
824 src0 = LOAD_SB(src);
825 src2 = LOAD_SB(src + 16);
826 src3 = LOAD_SB(src + 24);
827 src1 = __msa_sld_b(src2, src0, 8);
828 src += src_stride;
829 src4 = LOAD_SB(src);
830 src6 = LOAD_SB(src + 16);
831 src7 = LOAD_SB(src + 24);
832 src5 = __msa_sld_b(src6, src4, 8);
833 src += src_stride;
834
835 vec0 = (v16u8)__msa_vshf_b(mask, src0, src0);
836 vec1 = (v16u8)__msa_vshf_b(mask, src1, src1);
837 vec2 = (v16u8)__msa_vshf_b(mask, src2, src2);
838 vec3 = (v16u8)__msa_vshf_b(mask, src3, src3);
839 vec4 = (v16u8)__msa_vshf_b(mask, src4, src4);
840 vec5 = (v16u8)__msa_vshf_b(mask, src5, src5);
841 vec6 = (v16u8)__msa_vshf_b(mask, src6, src6);
842 vec7 = (v16u8)__msa_vshf_b(mask, src7, src7);
843
844 out0 = __msa_dotp_u_h(vec0, filt0);
845 out1 = __msa_dotp_u_h(vec1, filt0);
846 out2 = __msa_dotp_u_h(vec2, filt0);
847 out3 = __msa_dotp_u_h(vec3, filt0);
848 out4 = __msa_dotp_u_h(vec4, filt0);
849 out5 = __msa_dotp_u_h(vec5, filt0);
850 out6 = __msa_dotp_u_h(vec6, filt0);
851 out7 = __msa_dotp_u_h(vec7, filt0);
852
853 out0 = (v8u16)__msa_srari_h((v8i16)out0, FILTER_BITS);
854 out1 = (v8u16)__msa_srari_h((v8i16)out1, FILTER_BITS);
855 out2 = (v8u16)__msa_srari_h((v8i16)out2, FILTER_BITS);
856 out3 = (v8u16)__msa_srari_h((v8i16)out3, FILTER_BITS);
857 out4 = (v8u16)__msa_srari_h((v8i16)out4, FILTER_BITS);
858 out5 = (v8u16)__msa_srari_h((v8i16)out5, FILTER_BITS);
859 out6 = (v8u16)__msa_srari_h((v8i16)out6, FILTER_BITS);
860 out7 = (v8u16)__msa_srari_h((v8i16)out7, FILTER_BITS);
861
862 out0 = __msa_min_u_h(out0, const255);
863 out1 = __msa_min_u_h(out1, const255);
864 out2 = __msa_min_u_h(out2, const255);
865 out3 = __msa_min_u_h(out3, const255);
866 out4 = __msa_min_u_h(out4, const255);
867 out5 = __msa_min_u_h(out5, const255);
868 out6 = __msa_min_u_h(out6, const255);
869 out7 = __msa_min_u_h(out7, const255);
870
871 PCKEV_B_STORE_VEC(out1, out0, dst);
872 PCKEV_B_STORE_VEC(out3, out2, dst + 16);
873 dst += dst_stride;
874 PCKEV_B_STORE_VEC(out5, out4, dst);
875 PCKEV_B_STORE_VEC(out7, out6, dst + 16);
876 dst += dst_stride;
877 }
878 }
879
880 static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride,
881 uint8_t *dst, int32_t dst_stride,
882 int8_t *filter, int32_t height) {
883 uint32_t loop_cnt;
884 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
885 v16u8 filt0;
886 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
887 v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
888 v8u16 filt, const255;
889
890 mask = LOAD_SB(&mc_filt_mask_arr[0]);
891
892 /* rearranging filter */
893 filt = LOAD_UH(filter);
894 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
895
896 const255 = (v8u16)__msa_ldi_h(255);
897
898 for (loop_cnt = height; loop_cnt--;) {
899 src0 = LOAD_SB(src);
900 src2 = LOAD_SB(src + 16);
901 src4 = LOAD_SB(src + 32);
902 src6 = LOAD_SB(src + 48);
903 src7 = LOAD_SB(src + 56);
904 src1 = __msa_sld_b(src2, src0, 8);
905 src3 = __msa_sld_b(src4, src2, 8);
906 src5 = __msa_sld_b(src6, src4, 8);
907 src += src_stride;
908
909 vec0 = (v16u8)__msa_vshf_b(mask, src0, src0);
910 vec1 = (v16u8)__msa_vshf_b(mask, src1, src1);
911 vec2 = (v16u8)__msa_vshf_b(mask, src2, src2);
912 vec3 = (v16u8)__msa_vshf_b(mask, src3, src3);
913 vec4 = (v16u8)__msa_vshf_b(mask, src4, src4);
914 vec5 = (v16u8)__msa_vshf_b(mask, src5, src5);
915 vec6 = (v16u8)__msa_vshf_b(mask, src6, src6);
916 vec7 = (v16u8)__msa_vshf_b(mask, src7, src7);
917
918 out0 = __msa_dotp_u_h(vec0, filt0);
919 out1 = __msa_dotp_u_h(vec1, filt0);
920 out2 = __msa_dotp_u_h(vec2, filt0);
921 out3 = __msa_dotp_u_h(vec3, filt0);
922 out4 = __msa_dotp_u_h(vec4, filt0);
923 out5 = __msa_dotp_u_h(vec5, filt0);
924 out6 = __msa_dotp_u_h(vec6, filt0);
925 out7 = __msa_dotp_u_h(vec7, filt0);
926
927 out0 = (v8u16)__msa_srari_h((v8i16)out0, FILTER_BITS);
928 out1 = (v8u16)__msa_srari_h((v8i16)out1, FILTER_BITS);
929 out2 = (v8u16)__msa_srari_h((v8i16)out2, FILTER_BITS);
930 out3 = (v8u16)__msa_srari_h((v8i16)out3, FILTER_BITS);
931 out4 = (v8u16)__msa_srari_h((v8i16)out4, FILTER_BITS);
932 out5 = (v8u16)__msa_srari_h((v8i16)out5, FILTER_BITS);
933 out6 = (v8u16)__msa_srari_h((v8i16)out6, FILTER_BITS);
934 out7 = (v8u16)__msa_srari_h((v8i16)out7, FILTER_BITS);
935
936 out0 = __msa_min_u_h(out0, const255);
937 out1 = __msa_min_u_h(out1, const255);
938 out2 = __msa_min_u_h(out2, const255);
939 out3 = __msa_min_u_h(out3, const255);
940 out4 = __msa_min_u_h(out4, const255);
941 out5 = __msa_min_u_h(out5, const255);
942 out6 = __msa_min_u_h(out6, const255);
943 out7 = __msa_min_u_h(out7, const255);
944
945 PCKEV_B_STORE_VEC(out1, out0, dst);
946 PCKEV_B_STORE_VEC(out3, out2, dst + 16);
947 PCKEV_B_STORE_VEC(out5, out4, dst + 32);
948 PCKEV_B_STORE_VEC(out7, out6, dst + 48);
949 dst += dst_stride;
950 }
951 }
952
953 void vp9_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
954 uint8_t *dst, ptrdiff_t dst_stride,
955 const int16_t *filter_x, int x_step_q4,
956 const int16_t *filter_y, int y_step_q4,
957 int w, int h) {
958 int8_t cnt, filt_hor[8];
959
960 if (16 != x_step_q4) {
961 vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
962 filter_x, x_step_q4, filter_y, y_step_q4,
963 w, h);
964 return;
965 }
966
967 if (((const int32_t *)filter_x)[1] == 0x800000) {
968 vp9_convolve_copy(src, src_stride, dst, dst_stride,
969 filter_x, x_step_q4, filter_y, y_step_q4,
970 w, h);
971 return;
972 }
973
974 for (cnt = 0; cnt < 8; ++cnt) {
975 filt_hor[cnt] = filter_x[cnt];
976 }
977
978 if (((const int32_t *)filter_x)[0] == 0) {
979 switch (w) {
980 case 4:
981 common_hz_2t_4w_msa(src, (int32_t)src_stride,
982 dst, (int32_t)dst_stride,
983 &filt_hor[3], h);
984 break;
985 case 8:
986 common_hz_2t_8w_msa(src, (int32_t)src_stride,
987 dst, (int32_t)dst_stride,
988 &filt_hor[3], h);
989 break;
990 case 16:
991 common_hz_2t_16w_msa(src, (int32_t)src_stride,
992 dst, (int32_t)dst_stride,
993 &filt_hor[3], h);
994 break;
995 case 32:
996 common_hz_2t_32w_msa(src, (int32_t)src_stride,
997 dst, (int32_t)dst_stride,
998 &filt_hor[3], h);
999 break;
1000 case 64:
1001 common_hz_2t_64w_msa(src, (int32_t)src_stride,
1002 dst, (int32_t)dst_stride,
1003 &filt_hor[3], h);
1004 break;
1005 default:
1006 vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
1007 filter_x, x_step_q4, filter_y, y_step_q4,
1008 w, h);
1009 break;
1010 }
1011 } else {
1012 switch (w) {
1013 case 4:
1014 common_hz_8t_4w_msa(src, (int32_t)src_stride,
1015 dst, (int32_t)dst_stride,
1016 filt_hor, h);
1017 break;
1018 case 8:
1019 common_hz_8t_8w_msa(src, (int32_t)src_stride,
1020 dst, (int32_t)dst_stride,
1021 filt_hor, h);
1022 break;
1023 case 16:
1024 common_hz_8t_16w_msa(src, (int32_t)src_stride,
1025 dst, (int32_t)dst_stride,
1026 filt_hor, h);
1027 break;
1028 case 32:
1029 common_hz_8t_32w_msa(src, (int32_t)src_stride,
1030 dst, (int32_t)dst_stride,
1031 filt_hor, h);
1032 break;
1033 case 64:
1034 common_hz_8t_64w_msa(src, (int32_t)src_stride,
1035 dst, (int32_t)dst_stride,
1036 filt_hor, h);
1037 break;
1038 default:
1039 vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
1040 filter_x, x_step_q4, filter_y, y_step_q4,
1041 w, h);
1042 break;
1043 }
1044 }
1045 }
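
The int32_t reinterpretations in vp9_convolve8_horiz_msa above choose between the copy, 2-tap bilinear and 8-tap paths by inspecting the filter taps. A hedged sketch of the equivalent per-tap checks, assuming little-endian int16_t taps and using hypothetical helper names:

#include <stdint.h>

/* ((const int32_t *)filter_x)[1] == 0x800000 packs taps 2 and 3:
   tap2 == 0 and tap3 == 128, i.e. a unity filter, so a plain copy suffices. */
static int is_copy_filter_sketch(const int16_t *filter) {
  return filter[2] == 0 && filter[3] == 128;
}

/* ((const int32_t *)filter_x)[0] == 0 means taps 0 and 1 are zero; for the
   vp9 bilinear filters only taps 3 and 4 are non-zero, which is why the
   2-tap routines are called with &filt_hor[3]. */
static int is_bilinear_filter_sketch(const int16_t *filter) {
  return filter[0] == 0 && filter[1] == 0;
}
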