Chromium Code Reviews

Side by Side Diff: source/libvpx/vp9/common/mips/msa/vp9_convolve8_avg_vert_msa.c

Issue 1169543007: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 6 months ago
(Old side empty: this patch adds the file.)
1 /*
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "./vp9_rtcd.h"
12 #include "vp9/common/mips/msa/vp9_convolve_msa.h"
13
14 static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src,
15 int32_t src_stride,
16 uint8_t *dst,
17 int32_t dst_stride,
18 int8_t *filter,
19 int32_t height) {
20 uint32_t loop_cnt;
21 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
22 v16u8 dst0, dst1, dst2, dst3, out;
23 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
24 v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
25 v16i8 src10998, filt0, filt1, filt2, filt3;
26 v8i16 filt, out10, out32;
27
28 src -= (3 * src_stride);
29
30 filt = LD_SH(filter);
31 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
32
33 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
34 src += (7 * src_stride);
35
36 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
37 src54_r, src21_r);
38 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
39 ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
40 src4332, src6554);
41 XORI_B3_128_SB(src2110, src4332, src6554);
42
43 for (loop_cnt = (height >> 2); loop_cnt--;) {
44 LD_SB4(src, src_stride, src7, src8, src9, src10);
45 src += (4 * src_stride);
46
47 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
48 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
49 src87_r, src98_r, src109_r);
50 ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
51 XORI_B2_128_SB(src8776, src10998);
52 out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
53 filt1, filt2, filt3);
54 out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
55 filt1, filt2, filt3);
56 SRARI_H2_SH(out10, out32, FILTER_BITS);
57 SAT_SH2_SH(out10, out32, 7);
58 out = PCKEV_XORI128_UB(out10, out32);
59 ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
60
61 dst0 = (v16u8)__msa_ilvr_d((v2i64)dst2, (v2i64)dst0);
62 out = __msa_aver_u_b(out, dst0);
63
64 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
65 dst += (4 * dst_stride);
66
67 src2110 = src6554;
68 src4332 = src8776;
69 src6554 = src10998;
70 src6 = src10;
71 }
72 }
73
74 static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src,
75 int32_t src_stride,
76 uint8_t *dst,
77 int32_t dst_stride,
78 int8_t *filter,
79 int32_t height) {
80 uint32_t loop_cnt;
81 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
82 v16u8 dst0, dst1, dst2, dst3;
83 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
84 v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
85 v8i16 filt, out0, out1, out2, out3;
86
87 src -= (3 * src_stride);
88
89 filt = LD_SH(filter);
90 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
91
92 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
93 src += (7 * src_stride);
94
95 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
96 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
97 src54_r, src21_r);
98 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
99
100 for (loop_cnt = (height >> 2); loop_cnt--;) {
101 LD_SB4(src, src_stride, src7, src8, src9, src10);
102 src += (4 * src_stride);
103
104 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
105 XORI_B4_128_SB(src7, src8, src9, src10);
106 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
107 src87_r, src98_r, src109_r);
108 out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
109 filt1, filt2, filt3);
110 out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
111 filt1, filt2, filt3);
112 out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
113 filt1, filt2, filt3);
114 out3 = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
115 filt1, filt2, filt3);
116 SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
117 SAT_SH4_SH(out0, out1, out2, out3, 7);
118 CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3,
119 dst, dst_stride);
120 dst += (4 * dst_stride);
121
122 src10_r = src54_r;
123 src32_r = src76_r;
124 src54_r = src98_r;
125 src21_r = src65_r;
126 src43_r = src87_r;
127 src65_r = src109_r;
128 src6 = src10;
129 }
130 }
131
132 static void common_vt_8t_and_aver_dst_16w_mult_msa(const uint8_t *src,
133 int32_t src_stride,
134 uint8_t *dst,
135 int32_t dst_stride,
136 int8_t *filter,
137 int32_t height,
138 int32_t width) {
139 const uint8_t *src_tmp;
140 uint8_t *dst_tmp;
141 uint32_t loop_cnt, cnt;
142 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
143 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
144 v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
145 v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
146 v16i8 filt0, filt1, filt2, filt3;
147 v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
148 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;
149
150 src -= (3 * src_stride);
151
152 filt = LD_SH(filter);
153 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
154
155 for (cnt = (width >> 4); cnt--;) {
156 src_tmp = src;
157 dst_tmp = dst;
158
159 LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
160 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
161 src_tmp += (7 * src_stride);
162
163 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
164 src54_r, src21_r);
165 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
166 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
167 src54_l, src21_l);
168 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
169
170 for (loop_cnt = (height >> 2); loop_cnt--;) {
171 LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
172 src_tmp += (4 * src_stride);
173
174 LD_UB4(dst_tmp, dst_stride, dst0, dst1, dst2, dst3);
175 XORI_B4_128_SB(src7, src8, src9, src10);
176 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
177 src87_r, src98_r, src109_r);
178 ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
179 src87_l, src98_l, src109_l);
180 out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
181 filt1, filt2, filt3);
182 out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
183 filt1, filt2, filt3);
184 out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
185 filt1, filt2, filt3);
186 out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
187 filt1, filt2, filt3);
188 out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
189 filt1, filt2, filt3);
190 out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
191 filt1, filt2, filt3);
192 out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
193 filt1, filt2, filt3);
194 out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
195 filt1, filt2, filt3);
196 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
197 SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
198 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
199 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
200 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
201 out3_r, tmp0, tmp1, tmp2, tmp3);
202 XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
203 AVER_UB4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst0, dst1,
204 dst2, dst3);
205 ST_UB4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride);
206 dst_tmp += (4 * dst_stride);
207
208 src10_r = src54_r;
209 src32_r = src76_r;
210 src54_r = src98_r;
211 src21_r = src65_r;
212 src43_r = src87_r;
213 src65_r = src109_r;
214 src10_l = src54_l;
215 src32_l = src76_l;
216 src54_l = src98_l;
217 src21_l = src65_l;
218 src43_l = src87_l;
219 src65_l = src109_l;
220 src6 = src10;
221 }
222
223 src += 16;
224 dst += 16;
225 }
226 }
227
228 static void common_vt_8t_and_aver_dst_16w_msa(const uint8_t *src,
229 int32_t src_stride,
230 uint8_t *dst,
231 int32_t dst_stride,
232 int8_t *filter,
233 int32_t height) {
234 common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
235 filter, height, 16);
236 }
237
238 static void common_vt_8t_and_aver_dst_32w_msa(const uint8_t *src,
239 int32_t src_stride,
240 uint8_t *dst,
241 int32_t dst_stride,
242 int8_t *filter,
243 int32_t height) {
244 common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
245 filter, height, 32);
246 }
247
248 static void common_vt_8t_and_aver_dst_64w_msa(const uint8_t *src,
249 int32_t src_stride,
250 uint8_t *dst,
251 int32_t dst_stride,
252 int8_t *filter,
253 int32_t height) {
254 common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
255 filter, height, 64);
256 }
257
258 static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src,
259 int32_t src_stride,
260 uint8_t *dst,
261 int32_t dst_stride,
262 int8_t *filter) {
263 v16i8 src0, src1, src2, src3, src4;
264 v16u8 dst0, dst1, dst2, dst3, out, filt0, src2110, src4332;
265 v16i8 src10_r, src32_r, src21_r, src43_r;
266 v8i16 filt;
267 v8u16 tmp0, tmp1;
268
269 filt = LD_SH(filter);
270 filt0 = (v16u8)__msa_splati_h(filt, 0);
271
272 LD_SB4(src, src_stride, src0, src1, src2, src3);
273 src += (4 * src_stride);
274
275 src4 = LD_SB(src);
276 src += src_stride;
277
278 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
279 ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
280 dst0 = (v16u8)__msa_ilvr_d((v2i64)dst1, (v2i64)dst0);
281 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
282 src32_r, src43_r);
283 ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
284 DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
285 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
286 SAT_UH2_UH(tmp0, tmp1, 7);
287
288 out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
289 out = __msa_aver_u_b(out, dst0);
290
291 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
292 }
293
294 static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
295 int32_t src_stride,
296 uint8_t *dst,
297 int32_t dst_stride,
298 int8_t *filter) {
299 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
300 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r;
301 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
302 v16u8 src2110, src4332, src6554, src8776, filt0;
303 v8u16 tmp0, tmp1, tmp2, tmp3;
304 v8i16 filt;
305
306 filt = LD_SH(filter);
307 filt0 = (v16u8)__msa_splati_h(filt, 0);
308
309 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
310 src += (8 * src_stride);
311 src8 = LD_SB(src);
312
313 LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
314 ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1,
315 dst2, dst3);
316 ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
317 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
318 src32_r, src43_r);
319 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
320 src76_r, src87_r);
321 ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
322 src87_r, src76_r, src2110, src4332, src6554, src8776);
323 DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
324 tmp0, tmp1, tmp2, tmp3);
325 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
326 SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
327 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
328 AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332);
329 ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
330 dst += (4 * dst_stride);
331 ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst, dst_stride);
332 }
333
334 static void common_vt_2t_and_aver_dst_4w_msa(const uint8_t *src,
335 int32_t src_stride,
336 uint8_t *dst,
337 int32_t dst_stride,
338 int8_t *filter,
339 int32_t height) {
340 if (4 == height) {
341 common_vt_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
342 } else if (8 == height) {
343 common_vt_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter);
344 }
345 }
346
347 static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src,
348 int32_t src_stride,
349 uint8_t *dst,
350 int32_t dst_stride,
351 int8_t *filter) {
352 v16u8 src0, src1, src2, src3, src4;
353 v16u8 dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3, filt0;
354 v8u16 tmp0, tmp1, tmp2, tmp3;
355 v8i16 filt;
356
357 /* rearranging filter_y */
358 filt = LD_SH(filter);
359 filt0 = (v16u8)__msa_splati_h(filt, 0);
360
361 LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
362 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
363 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
364 ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
365 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
366 tmp2, tmp3);
367 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
368 SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
369 PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
370 dst, dst_stride);
371 }
372
373 static void common_vt_2t_and_aver_dst_8x8mult_msa(const uint8_t *src,
374 int32_t src_stride,
375 uint8_t *dst,
376 int32_t dst_stride,
377 int8_t *filter,
378 int32_t height) {
379 uint32_t loop_cnt;
380 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
381 v16u8 dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
382 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
383 v8u16 tmp0, tmp1, tmp2, tmp3;
384 v8i16 filt;
385
386 /* rearranging filter_y */
387 filt = LD_SH(filter);
388 filt0 = (v16u8)__msa_splati_h(filt, 0);
389
390 src0 = LD_UB(src);
391 src += src_stride;
392
393 for (loop_cnt = (height >> 3); loop_cnt--;) {
394 LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
395 src += (8 * src_stride);
396 LD_UB8(dst, dst_stride, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8);
397
398 ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1,
399 vec2, vec3);
400 ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5,
401 vec6, vec7);
402 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
403 tmp2, tmp3);
404 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
405 SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
406 PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3, dst4,
407 dst, dst_stride);
408 dst += (4 * dst_stride);
409
410 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
411 tmp2, tmp3);
412 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
413 SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
414 PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3, dst8,
415 dst, dst_stride);
416 dst += (4 * dst_stride);
417
418 src0 = src8;
419 }
420 }
421
422 static void common_vt_2t_and_aver_dst_8w_msa(const uint8_t *src,
423 int32_t src_stride,
424 uint8_t *dst,
425 int32_t dst_stride,
426 int8_t *filter,
427 int32_t height) {
428 if (4 == height) {
429 common_vt_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter);
430 } else {
431 common_vt_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
432 filter, height);
433 }
434 }
435
436 static void common_vt_2t_and_aver_dst_16w_msa(const uint8_t *src,
437 int32_t src_stride,
438 uint8_t *dst,
439 int32_t dst_stride,
440 int8_t *filter,
441 int32_t height) {
442 uint32_t loop_cnt;
443 v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0;
444 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
445 v8u16 tmp0, tmp1, tmp2, tmp3, filt;
446
447 /* rearranging filter_y */
448 filt = LD_UH(filter);
449 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
450
451 src0 = LD_UB(src);
452 src += src_stride;
453
454 for (loop_cnt = (height >> 2); loop_cnt--;) {
455 LD_UB4(src, src_stride, src1, src2, src3, src4);
456 src += (4 * src_stride);
457
458 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
459 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
460 ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
461 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
462 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
463 SAT_UH2_UH(tmp0, tmp1, 7);
464 PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
465 dst += dst_stride;
466
467 ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
468 ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
469 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
470 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
471 SAT_UH2_UH(tmp2, tmp3, 7);
472 PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst);
473 dst += dst_stride;
474
475 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
476 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
477 SAT_UH2_UH(tmp0, tmp1, 7);
478 PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
479 dst += dst_stride;
480
481 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
482 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
483 SAT_UH2_UH(tmp2, tmp3, 7);
484 PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst);
485 dst += dst_stride;
486
487 src0 = src4;
488 }
489 }
490
491 static void common_vt_2t_and_aver_dst_32w_msa(const uint8_t *src,
492 int32_t src_stride,
493 uint8_t *dst,
494 int32_t dst_stride,
495 int8_t *filter,
496 int32_t height) {
497 uint32_t loop_cnt;
498 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
499 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
500 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
501 v8u16 tmp0, tmp1, tmp2, tmp3, filt;
502
503 /* rearranging filter_y */
504 filt = LD_UH(filter);
505 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
506
507 LD_UB2(src, 16, src0, src5);
508 src += src_stride;
509
510 for (loop_cnt = (height >> 2); loop_cnt--;) {
511 LD_UB4(src, src_stride, src1, src2, src3, src4);
512 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
513 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
514 ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
515
516 LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
517 LD_UB4(dst + 16, dst_stride, dst4, dst5, dst6, dst7);
518 src += (4 * src_stride);
519
520 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
521 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
522 SAT_UH2_UH(tmp0, tmp1, 7);
523 PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
524
525 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
526 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
527 SAT_UH2_UH(tmp2, tmp3, 7);
528 PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
529
530 ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
531 ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
532 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
533 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
534 SAT_UH2_UH(tmp0, tmp1, 7);
535 PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst + 2 * dst_stride);
536
537 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
538 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
539 SAT_UH2_UH(tmp2, tmp3, 7);
540 PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst + 3 * dst_stride);
541
542 ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
543 ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
544 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
545 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
546 SAT_UH2_UH(tmp0, tmp1, 7);
547 PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 16);
548
549 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
550 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
551 SAT_UH2_UH(tmp2, tmp3, 7);
552 PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 16 + dst_stride);
553
554 ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
555 ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
556 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
557 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
558 SAT_UH2_UH(tmp0, tmp1, 7);
559 PCKEV_AVG_ST_UB(tmp1, tmp0, dst6, dst + 16 + 2 * dst_stride);
560
561 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
562 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
563 SAT_UH2_UH(tmp2, tmp3, 7);
564 PCKEV_AVG_ST_UB(tmp3, tmp2, dst7, dst + 16 + 3 * dst_stride);
565 dst += (4 * dst_stride);
566
567 src0 = src4;
568 src5 = src9;
569 }
570 }
571
572 static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src,
573 int32_t src_stride,
574 uint8_t *dst,
575 int32_t dst_stride,
576 int8_t *filter,
577 int32_t height) {
578 uint32_t loop_cnt;
579 v16u8 src0, src1, src2, src3, src4, src5;
580 v16u8 src6, src7, src8, src9, src10, src11, filt0;
581 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
582 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
583 v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
584 v8u16 filt;
585
586 /* rearranging filter_y */
587 filt = LD_UH(filter);
588 filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
589
590 LD_UB4(src, 16, src0, src3, src6, src9);
591 src += src_stride;
592
593 for (loop_cnt = (height >> 1); loop_cnt--;) {
594 LD_UB2(src, src_stride, src1, src2);
595 LD_UB2(dst, dst_stride, dst0, dst1);
596 LD_UB2(src + 16, src_stride, src4, src5);
597 LD_UB2(dst + 16, dst_stride, dst2, dst3);
598 LD_UB2(src + 32, src_stride, src7, src8);
599 LD_UB2(dst + 32, dst_stride, dst4, dst5);
600 LD_UB2(src + 48, src_stride, src10, src11);
601 LD_UB2(dst + 48, dst_stride, dst6, dst7);
602 src += (2 * src_stride);
603
604 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
605 ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
606 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
607 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
608 SAT_UH2_UH(tmp0, tmp1, 7);
609 PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
610
611 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
612 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
613 SAT_UH2_UH(tmp2, tmp3, 7);
614 PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
615
616 ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
617 ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
618 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
619 SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
620 SAT_UH2_UH(tmp4, tmp5, 7);
621 PCKEV_AVG_ST_UB(tmp5, tmp4, dst2, dst + 16);
622
623 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
624 SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
625 SAT_UH2_UH(tmp6, tmp7, 7);
626 PCKEV_AVG_ST_UB(tmp7, tmp6, dst3, dst + 16 + dst_stride);
627
628 ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
629 ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
630 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
631 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
632 SAT_UH2_UH(tmp0, tmp1, 7);
633 PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 32);
634
635 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
636 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
637 SAT_UH2_UH(tmp2, tmp3, 7);
638 PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 32 + dst_stride);
639
640 ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
641 ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
642 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
643 SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
644 SAT_UH2_UH(tmp4, tmp5, 7);
645 PCKEV_AVG_ST_UB(tmp5, tmp4, dst6, (dst + 48));
646
647 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
648 SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
649 SAT_UH2_UH(tmp6, tmp7, 7);
650 PCKEV_AVG_ST_UB(tmp7, tmp6, dst7, dst + 48 + dst_stride);
651 dst += (2 * dst_stride);
652
653 src0 = src2;
654 src3 = src5;
655 src6 = src8;
656 src9 = src11;
657 }
658 }
659
660 void vp9_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
661 uint8_t *dst, ptrdiff_t dst_stride,
662 const int16_t *filter_x, int x_step_q4,
663 const int16_t *filter_y, int y_step_q4,
664 int w, int h) {
665 int8_t cnt, filt_ver[8];
666
667 if (16 != y_step_q4) {
668 vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
669 filter_x, x_step_q4, filter_y, y_step_q4,
670 w, h);
671 return;
672 }
673
674 if (((const int32_t *)filter_y)[1] == 0x800000) {
675 vp9_convolve_avg(src, src_stride, dst, dst_stride,
676 filter_x, x_step_q4, filter_y, y_step_q4,
677 w, h);
678 return;
679 }
680
681 for (cnt = 0; cnt < 8; ++cnt) {
682 filt_ver[cnt] = filter_y[cnt];
683 }
684
685 if (((const int32_t *)filter_y)[0] == 0) {
686 switch (w) {
687 case 4:
688 common_vt_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride,
689 dst, (int32_t)dst_stride,
690 &filt_ver[3], h);
691 break;
692 case 8:
693 common_vt_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride,
694 dst, (int32_t)dst_stride,
695 &filt_ver[3], h);
696 break;
697 case 16:
698 common_vt_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride,
699 dst, (int32_t)dst_stride,
700 &filt_ver[3], h);
701 break;
702 case 32:
703 common_vt_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride,
704 dst, (int32_t)dst_stride,
705 &filt_ver[3], h);
706 break;
707 case 64:
708 common_vt_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride,
709 dst, (int32_t)dst_stride,
710 &filt_ver[3], h);
711 break;
712 default:
713 vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
714 filter_x, x_step_q4, filter_y, y_step_q4,
715 w, h);
716 break;
717 }
718 } else {
719 switch (w) {
720 case 4:
721 common_vt_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride,
722 dst, (int32_t)dst_stride,
723 filt_ver, h);
724 break;
725 case 8:
726 common_vt_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride,
727 dst, (int32_t)dst_stride,
728 filt_ver, h);
729 break;
730 case 16:
731 common_vt_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride,
732 dst, (int32_t)dst_stride,
733 filt_ver, h);
734
735 break;
736 case 32:
737 common_vt_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride,
738 dst, (int32_t)dst_stride,
739 filt_ver, h);
740 break;
741 case 64:
742 common_vt_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride,
743 dst, (int32_t)dst_stride,
744 filt_ver, h);
745 break;
746 default:
747 vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
748 filter_x, x_step_q4, filter_y, y_step_q4,
749 w, h);
750 break;
751 }
752 }
753 }
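
Reviewer note (not part of the patch): every routine in this file is a vectorized form of the same per-pixel operation: a vertical 8-tap (or 2-tap bilinear) convolution, rounded and shifted by FILTER_BITS, clamped to 8 bits, and then averaged with the pixel already stored in dst (the "avg" in the name). The scalar sketch below is an illustrative reference model of that operation, not code from this change; it assumes FILTER_BITS == 7 as defined in vp9_filter.h, and the function name convolve8_avg_vert_ref is hypothetical.

#include <stdint.h>

#define FILTER_BITS 7  /* assumed value, matching vp9_filter.h */

static uint8_t clip_pixel(int val) {
  return (uint8_t)((val < 0) ? 0 : (val > 255) ? 255 : val);
}

/* Scalar model of the MSA routines above: 8-tap vertical filter,
 * round/shift/clamp, then rounding average with the existing dst pixel. */
static void convolve8_avg_vert_ref(const uint8_t *src, int src_stride,
                                   uint8_t *dst, int dst_stride,
                                   const int16_t *filter_y, int w, int h) {
  int x, y, k;

  src -= 3 * src_stride;  /* same rewind the 8-tap MSA paths perform */
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x) {
      int sum = 0, pel;
      for (k = 0; k < 8; ++k)
        sum += src[x + k * src_stride] * filter_y[k];
      pel = clip_pixel((sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS);
      dst[x] = (uint8_t)((dst[x] + pel + 1) >> 1);  /* what __msa_aver_u_b does */
    }
    src += src_stride;
    dst += dst_stride;
  }
}

The MSA versions compute several of these output pixels per instruction (four columns at a time in the 4-wide path, a full 16-byte row split into right/left halves in the 16/32/64-wide paths) and use the SRARI, SAT and AVER intrinsics for the rounding, saturation and averaging steps.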
