Chromium Code Reviews

Side by Side Diff: source/libvpx/vp9/common/mips/msa/vp9_convolve8_vert_msa.c

Issue 1124333011: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: only update to last night's LKGR. Created 5 years, 7 months ago.
/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "./vp9_rtcd.h"
#include "vp9/common/mips/msa/vp9_convolve_msa.h"

static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
  v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
  v16i8 src2110, src4332, src6554, src8776, src10998;
  v8i16 filt, out10, out32;
  v16i8 filt0, filt1, filt2, filt3;

  src -= (3 * src_stride);

  filt = LOAD_SH(filter);
  filt0 = (v16i8)__msa_splati_h(filt, 0);
  filt1 = (v16i8)__msa_splati_h(filt, 1);
  filt2 = (v16i8)__msa_splati_h(filt, 2);
  filt3 = (v16i8)__msa_splati_h(filt, 3);

  LOAD_7VECS_SB(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
  src += (7 * src_stride);

  ILVR_B_6VECS_SB(src0, src2, src4, src1, src3, src5,
                  src1, src3, src5, src2, src4, src6,
                  src10_r, src32_r, src54_r, src21_r, src43_r, src65_r);

  ILVR_D_3VECS_SB(src2110, src21_r, src10_r, src4332, src43_r, src32_r,
                  src6554, src65_r, src54_r);

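  /* Editor's note: the XORI-with-128 below flips each byte's sign bit,
     mapping unsigned pixels [0, 255] onto signed bytes [-128, 127] so the
     8-tap filter can use signed dot products; judging by its name, the
     PCKEV_..._XORI128 store macro adds the bias back after packing. */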
  XORI_B_3VECS_SB(src2110, src4332, src6554, src2110, src4332, src6554, 128);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LOAD_4VECS_SB(src, src_stride, src7, src8, src9, src10);
    src += (4 * src_stride);

    ILVR_B_4VECS_SB(src6, src7, src8, src9, src7, src8, src9, src10,
                    src76_r, src87_r, src98_r, src109_r);

    ILVR_D_2VECS_SB(src8776, src87_r, src76_r, src10998, src109_r, src98_r);

    XORI_B_2VECS_SB(src8776, src10998, src8776, src10998, 128);

    out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776,
                                filt0, filt1, filt2, filt3);
    out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998,
                                filt0, filt1, filt2, filt3);

    out10 = SRARI_SATURATE_SIGNED_H(out10, FILTER_BITS, 7);
    out32 = SRARI_SATURATE_SIGNED_H(out32, FILTER_BITS, 7);

    PCKEV_2B_XORI128_STORE_4_BYTES_4(out10, out32, dst, dst_stride);
    dst += (4 * dst_stride);

    src2110 = src6554;
    src4332 = src8776;
    src6554 = src10998;

    src6 = src10;
  }
}
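
/* Editor's note: for reference, a scalar model of what the 8-tap vertical
 * paths compute per output pixel: an 8-tap dot product down a column,
 * rounded by FILTER_BITS (7 in VP9) and clipped to 8 bits. This helper is
 * illustrative only (not part of libvpx); it is the net effect of the
 * bias / filter / round / saturate / re-bias sequence above. */
static void scalar_vt_8t_ref(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             const int8_t *filter,
                             int32_t width, int32_t height) {
  int32_t x, y, k;

  src -= (3 * src_stride);  /* same 3-row back-up as the MSA code */

  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x) {
      int32_t sum = 0;
      for (k = 0; k < 8; ++k) {
        sum += src[(y + k) * src_stride + x] * filter[k];
      }
      /* SRARI: arithmetic shift right with rounding */
      sum = (sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS;
      /* saturate to the unsigned 8-bit pixel range */
      dst[y * dst_stride + x] =
          (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
    }
  }
}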

static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
  v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
  v16i8 filt0, filt1, filt2, filt3;
  v8i16 filt, out0_r, out1_r, out2_r, out3_r;

  src -= (3 * src_stride);

  filt = LOAD_SH(filter);
  filt0 = (v16i8)__msa_splati_h(filt, 0);
  filt1 = (v16i8)__msa_splati_h(filt, 1);
  filt2 = (v16i8)__msa_splati_h(filt, 2);
  filt3 = (v16i8)__msa_splati_h(filt, 3);

  LOAD_7VECS_SB(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
  src += (7 * src_stride);

  XORI_B_7VECS_SB(src0, src1, src2, src3, src4, src5, src6,
                  src0, src1, src2, src3, src4, src5, src6, 128);

  ILVR_B_6VECS_SB(src0, src2, src4, src1, src3, src5,
                  src1, src3, src5, src2, src4, src6,
                  src10_r, src32_r, src54_r, src21_r, src43_r, src65_r);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LOAD_4VECS_SB(src, src_stride, src7, src8, src9, src10);
    src += (4 * src_stride);

    XORI_B_4VECS_SB(src7, src8, src9, src10, src7, src8, src9, src10, 128);

    ILVR_B_4VECS_SB(src6, src7, src8, src9, src7, src8, src9, src10,
                    src76_r, src87_r, src98_r, src109_r);

    out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
                                 filt0, filt1, filt2, filt3);
    out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
                                 filt0, filt1, filt2, filt3);
    out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
                                 filt0, filt1, filt2, filt3);
    out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
                                 filt0, filt1, filt2, filt3);

    out0_r = SRARI_SATURATE_SIGNED_H(out0_r, FILTER_BITS, 7);
    out1_r = SRARI_SATURATE_SIGNED_H(out1_r, FILTER_BITS, 7);
    out2_r = SRARI_SATURATE_SIGNED_H(out2_r, FILTER_BITS, 7);
    out3_r = SRARI_SATURATE_SIGNED_H(out3_r, FILTER_BITS, 7);

    PCKEV_B_4_XORI128_STORE_8_BYTES_4(out0_r, out1_r, out2_r, out3_r,
                                      dst, dst_stride);
    dst += (4 * dst_stride);

    src10_r = src54_r;
    src32_r = src76_r;
    src54_r = src98_r;
    src21_r = src65_r;
    src43_r = src87_r;
    src65_r = src109_r;

    src6 = src10;
  }
}

static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter, int32_t height,
                                      int32_t width) {
  const uint8_t *src_tmp;
  uint8_t *dst_tmp;
  uint32_t loop_cnt, cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  v16i8 filt0, filt1, filt2, filt3;
  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
  v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
  v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
  v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
  v8i16 filt;
  v16u8 tmp0, tmp1, tmp2, tmp3;

  src -= (3 * src_stride);

  filt = LOAD_SH(filter);
  filt0 = (v16i8)__msa_splati_h(filt, 0);
  filt1 = (v16i8)__msa_splati_h(filt, 1);
  filt2 = (v16i8)__msa_splati_h(filt, 2);
  filt3 = (v16i8)__msa_splati_h(filt, 3);

  for (cnt = (width >> 4); cnt--;) {
    src_tmp = src;
    dst_tmp = dst;

    LOAD_7VECS_SB(src_tmp, src_stride,
                  src0, src1, src2, src3, src4, src5, src6);
    src_tmp += (7 * src_stride);

    XORI_B_7VECS_SB(src0, src1, src2, src3, src4, src5, src6,
                    src0, src1, src2, src3, src4, src5, src6, 128);

    ILVR_B_6VECS_SB(src0, src2, src4, src1, src3, src5,
                    src1, src3, src5, src2, src4, src6,
                    src10_r, src32_r, src54_r, src21_r, src43_r, src65_r);

    ILVL_B_6VECS_SB(src0, src2, src4, src1, src3, src5,
                    src1, src3, src5, src2, src4, src6,
                    src10_l, src32_l, src54_l, src21_l, src43_l, src65_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
      LOAD_4VECS_SB(src_tmp, src_stride, src7, src8, src9, src10);
      src_tmp += (4 * src_stride);

      XORI_B_4VECS_SB(src7, src8, src9, src10, src7, src8, src9, src10, 128);

      ILVR_B_4VECS_SB(src6, src7, src8, src9, src7, src8, src9, src10,
                      src76_r, src87_r, src98_r, src109_r);

      ILVL_B_4VECS_SB(src6, src7, src8, src9, src7, src8, src9, src10,
                      src76_l, src87_l, src98_l, src109_l);

      out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
                                   filt0, filt1, filt2, filt3);
      out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
                                   filt0, filt1, filt2, filt3);
      out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
                                   filt0, filt1, filt2, filt3);
      out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
                                   filt0, filt1, filt2, filt3);

      out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l,
                                   filt0, filt1, filt2, filt3);
      out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l,
                                   filt0, filt1, filt2, filt3);
      out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l,
                                   filt0, filt1, filt2, filt3);
      out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l,
                                   filt0, filt1, filt2, filt3);

      out0_r = SRARI_SATURATE_SIGNED_H(out0_r, FILTER_BITS, 7);
      out1_r = SRARI_SATURATE_SIGNED_H(out1_r, FILTER_BITS, 7);
      out2_r = SRARI_SATURATE_SIGNED_H(out2_r, FILTER_BITS, 7);
      out3_r = SRARI_SATURATE_SIGNED_H(out3_r, FILTER_BITS, 7);
      out0_l = SRARI_SATURATE_SIGNED_H(out0_l, FILTER_BITS, 7);
      out1_l = SRARI_SATURATE_SIGNED_H(out1_l, FILTER_BITS, 7);
      out2_l = SRARI_SATURATE_SIGNED_H(out2_l, FILTER_BITS, 7);
      out3_l = SRARI_SATURATE_SIGNED_H(out3_l, FILTER_BITS, 7);

      out0_r = (v8i16)__msa_pckev_b((v16i8)out0_l, (v16i8)out0_r);
      out1_r = (v8i16)__msa_pckev_b((v16i8)out1_l, (v16i8)out1_r);
      out2_r = (v8i16)__msa_pckev_b((v16i8)out2_l, (v16i8)out2_r);
      out3_r = (v8i16)__msa_pckev_b((v16i8)out3_l, (v16i8)out3_r);

      XORI_B_4VECS_UB(out0_r, out1_r, out2_r, out3_r,
                      tmp0, tmp1, tmp2, tmp3, 128);

      STORE_4VECS_UB(dst_tmp, dst_stride, tmp0, tmp1, tmp2, tmp3);
      dst_tmp += (4 * dst_stride);

      src10_r = src54_r;
      src32_r = src76_r;
      src54_r = src98_r;
      src21_r = src65_r;
      src43_r = src87_r;
      src65_r = src109_r;

      src10_l = src54_l;
      src32_l = src76_l;
      src54_l = src98_l;
      src21_l = src65_l;
      src43_l = src87_l;
      src65_l = src109_l;

      src6 = src10;
    }

    src += 16;
    dst += 16;
  }
}

static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride,
                            filter, height, 16);
}

static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride,
                            filter, height, 32);
}

static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride,
                            filter, height, 64);
}

static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  uint32_t out0, out1, out2, out3;
  v16i8 src0, src1, src2, src3, src4;
  v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
  v16i8 filt0;
  v8u16 filt;

  filt = LOAD_UH(filter);
  filt0 = (v16i8)__msa_splati_h((v8i16)filt, 0);

  LOAD_5VECS_SB(src, src_stride, src0, src1, src2, src3, src4);
  src += (5 * src_stride);

  ILVR_B_4VECS_SB(src0, src1, src2, src3, src1, src2, src3, src4,
                  src10_r, src21_r, src32_r, src43_r);

  ILVR_D_2VECS_SB(src2110, src21_r, src10_r, src4332, src43_r, src32_r);

  src2110 = (v16i8)__msa_dotp_u_h((v16u8)src2110, (v16u8)filt0);
  src4332 = (v16i8)__msa_dotp_u_h((v16u8)src4332, (v16u8)filt0);

  src2110 = (v16i8)SRARI_SATURATE_UNSIGNED_H(src2110, FILTER_BITS, 7);
  src4332 = (v16i8)SRARI_SATURATE_UNSIGNED_H(src4332, FILTER_BITS, 7);

  src2110 = (v16i8)__msa_pckev_b((v16i8)src4332, (v16i8)src2110);

  out0 = __msa_copy_u_w((v4i32)src2110, 0);
  out1 = __msa_copy_u_w((v4i32)src2110, 1);
  out2 = __msa_copy_u_w((v4i32)src2110, 2);
  out3 = __msa_copy_u_w((v4i32)src2110, 3);

  STORE_WORD(dst, out0);
  dst += dst_stride;
  STORE_WORD(dst, out1);
  dst += dst_stride;
  STORE_WORD(dst, out2);
  dst += dst_stride;
  STORE_WORD(dst, out3);
}
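
/* Editor's note: the common_vt_2t_* paths handle VP9's bilinear filters,
 * where only taps 3 and 4 of the 8-tap array are nonzero and both are
 * non-negative; that is why they use unsigned dot products and need no
 * sign-bias XORI step. An illustrative scalar equivalent (not part of
 * libvpx): `filter` points at tap 3, as in the callers, and the uint8_t
 * casts recover the tap value 128, which wraps to -128 in an int8_t. */
static void scalar_vt_2t_ref(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             const int8_t *filter,
                             int32_t width, int32_t height) {
  int32_t x, y;

  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x) {
      uint32_t sum =
          (uint32_t)src[y * src_stride + x] * (uint8_t)filter[0] +
          (uint32_t)src[(y + 1) * src_stride + x] * (uint8_t)filter[1];
      sum = (sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS;
      dst[y * dst_stride + x] = (uint8_t)(sum > 255 ? 255 : sum);
    }
  }
}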

static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  uint32_t out0, out1, out2, out3, out4, out5, out6, out7;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
  v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
  v16i8 filt0;
  v8u16 filt;

  filt = LOAD_UH(filter);
  filt0 = (v16i8)__msa_splati_h((v8i16)filt, 0);

  LOAD_8VECS_SB(src, src_stride,
                src0, src1, src2, src3, src4, src5, src6, src7);
  src += (8 * src_stride);

  src8 = LOAD_SB(src);
  src += src_stride;

  ILVR_B_8VECS_SB(src0, src1, src2, src3, src4, src5, src6, src7,
                  src1, src2, src3, src4, src5, src6, src7, src8,
                  src10_r, src21_r, src32_r, src43_r,
                  src54_r, src65_r, src76_r, src87_r);

  ILVR_D_4VECS_SB(src2110, src21_r, src10_r, src4332, src43_r, src32_r,
                  src6554, src65_r, src54_r, src8776, src87_r, src76_r);

  src2110 = (v16i8)__msa_dotp_u_h((v16u8)src2110, (v16u8)filt0);
  src4332 = (v16i8)__msa_dotp_u_h((v16u8)src4332, (v16u8)filt0);
  src6554 = (v16i8)__msa_dotp_u_h((v16u8)src6554, (v16u8)filt0);
  src8776 = (v16i8)__msa_dotp_u_h((v16u8)src8776, (v16u8)filt0);

  src2110 = (v16i8)SRARI_SATURATE_UNSIGNED_H(src2110, FILTER_BITS, 7);
  src4332 = (v16i8)SRARI_SATURATE_UNSIGNED_H(src4332, FILTER_BITS, 7);
  src6554 = (v16i8)SRARI_SATURATE_UNSIGNED_H(src6554, FILTER_BITS, 7);
  src8776 = (v16i8)SRARI_SATURATE_UNSIGNED_H(src8776, FILTER_BITS, 7);

  src2110 = (v16i8)__msa_pckev_b((v16i8)src4332, (v16i8)src2110);
  src4332 = (v16i8)__msa_pckev_b((v16i8)src8776, (v16i8)src6554);

  out0 = __msa_copy_u_w((v4i32)src2110, 0);
  out1 = __msa_copy_u_w((v4i32)src2110, 1);
  out2 = __msa_copy_u_w((v4i32)src2110, 2);
  out3 = __msa_copy_u_w((v4i32)src2110, 3);
  out4 = __msa_copy_u_w((v4i32)src4332, 0);
  out5 = __msa_copy_u_w((v4i32)src4332, 1);
  out6 = __msa_copy_u_w((v4i32)src4332, 2);
  out7 = __msa_copy_u_w((v4i32)src4332, 3);

  STORE_WORD(dst, out0);
  dst += dst_stride;
  STORE_WORD(dst, out1);
  dst += dst_stride;
  STORE_WORD(dst, out2);
  dst += dst_stride;
  STORE_WORD(dst, out3);
  dst += dst_stride;
  STORE_WORD(dst, out4);
  dst += dst_stride;
  STORE_WORD(dst, out5);
  dst += dst_stride;
  STORE_WORD(dst, out6);
  dst += dst_stride;
  STORE_WORD(dst, out7);
}

static void common_vt_2t_4w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  if (4 == height) {
    common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
  } else if (8 == height) {
    common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
  }
}

static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16u8 src0, src1, src2, src3, src4;
  v16u8 vec0, vec1, vec2, vec3, filt0;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8u16 filt;

  /* rearranging filter_y */
  filt = LOAD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LOAD_5VECS_UB(src, src_stride, src0, src1, src2, src3, src4);

  ILVR_B_2VECS_UB(src0, src1, src1, src2, vec0, vec1);
  ILVR_B_2VECS_UB(src2, src3, src3, src4, vec2, vec3);

  /* filter calc */
  tmp0 = __msa_dotp_u_h(vec0, filt0);
  tmp1 = __msa_dotp_u_h(vec1, filt0);
  tmp2 = __msa_dotp_u_h(vec2, filt0);
  tmp3 = __msa_dotp_u_h(vec3, filt0);

  tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
  tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
  tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
  tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);

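  /* Editor's note: PCKEV_B_* presumably packs the even-indexed bytes of the
     halfword results, i.e. the low byte of each u16 sum on little-endian
     MIPS, narrowing the filtered values back to 8-bit pixels for the store. */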
  PCKEV_B_STORE_8_BYTES_4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
}

static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8u16 filt;

  /* rearranging filter_y */
  filt = LOAD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  src0 = LOAD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 3); loop_cnt--;) {
    LOAD_8VECS_UB(src, src_stride,
                  src1, src2, src3, src4, src5, src6, src7, src8);
    src += (8 * src_stride);

    ILVR_B_4VECS_UB(src0, src1, src2, src3, src1, src2, src3, src4,
                    vec0, vec1, vec2, vec3);

    ILVR_B_4VECS_UB(src4, src5, src6, src7, src5, src6, src7, src8,
                    vec4, vec5, vec6, vec7);

    tmp0 = __msa_dotp_u_h(vec0, filt0);
    tmp1 = __msa_dotp_u_h(vec1, filt0);
    tmp2 = __msa_dotp_u_h(vec2, filt0);
    tmp3 = __msa_dotp_u_h(vec3, filt0);

    tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
    tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
    tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
    tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);

    PCKEV_B_STORE_8_BYTES_4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
    dst += (4 * dst_stride);

    tmp0 = __msa_dotp_u_h(vec4, filt0);
    tmp1 = __msa_dotp_u_h(vec5, filt0);
    tmp2 = __msa_dotp_u_h(vec6, filt0);
    tmp3 = __msa_dotp_u_h(vec7, filt0);

    tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
    tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
    tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
    tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);

    PCKEV_B_STORE_8_BYTES_4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
    dst += (4 * dst_stride);

    src0 = src8;
  }
}

static void common_vt_2t_8w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  if (4 == height) {
    common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
  } else {
    common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
  }
}

static void common_vt_2t_16w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8u16 filt;

  /* rearranging filter_y */
  filt = LOAD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  src0 = LOAD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LOAD_4VECS_UB(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);

    ILV_B_LRLR_UB(src0, src1, src1, src2, vec1, vec0, vec3, vec2);

    tmp0 = __msa_dotp_u_h(vec0, filt0);
    tmp1 = __msa_dotp_u_h(vec1, filt0);

    tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
    tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);

    PCKEV_B_STORE_VEC(tmp1, tmp0, dst);
    dst += dst_stride;

    ILV_B_LRLR_UB(src2, src3, src3, src4, vec5, vec4, vec7, vec6);

    tmp2 = __msa_dotp_u_h(vec2, filt0);
    tmp3 = __msa_dotp_u_h(vec3, filt0);

    tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
    tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);

    PCKEV_B_STORE_VEC(tmp3, tmp2, dst);
    dst += dst_stride;

    tmp0 = __msa_dotp_u_h(vec4, filt0);
    tmp1 = __msa_dotp_u_h(vec5, filt0);

    tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
    tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);

    PCKEV_B_STORE_VEC(tmp1, tmp0, dst);
    dst += dst_stride;

    tmp2 = __msa_dotp_u_h(vec6, filt0);
    tmp3 = __msa_dotp_u_h(vec7, filt0);

    tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
    tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);

    PCKEV_B_STORE_VEC(tmp3, tmp2, dst);
    dst += dst_stride;

    src0 = src4;
  }
}

static void common_vt_2t_32w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8u16 filt;

  /* rearranging filter_y */
  filt = LOAD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  src0 = LOAD_UB(src);
  src5 = LOAD_UB(src + 16);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LOAD_4VECS_UB(src, src_stride, src1, src2, src3, src4);

    ILV_B_LRLR_UB(src0, src1, src1, src2, vec1, vec0, vec3, vec2);

    LOAD_4VECS_UB(src + 16, src_stride, src6, src7, src8, src9);
    src += (4 * src_stride);

    tmp0 = __msa_dotp_u_h(vec0, filt0);
    tmp1 = __msa_dotp_u_h(vec1, filt0);

    tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
    tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);

    PCKEV_B_STORE_VEC(tmp1, tmp0, dst);

    tmp2 = __msa_dotp_u_h(vec2, filt0);
    tmp3 = __msa_dotp_u_h(vec3, filt0);

    tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
    tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);

    PCKEV_B_STORE_VEC(tmp3, tmp2, dst + dst_stride);

    ILV_B_LRLR_UB(src2, src3, src3, src4, vec5, vec4, vec7, vec6);

    tmp0 = __msa_dotp_u_h(vec4, filt0);
    tmp1 = __msa_dotp_u_h(vec5, filt0);

    tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
    tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);

    PCKEV_B_STORE_VEC(tmp1, tmp0, dst + 2 * dst_stride);

    tmp2 = __msa_dotp_u_h(vec6, filt0);
    tmp3 = __msa_dotp_u_h(vec7, filt0);

    tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
    tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);

    PCKEV_B_STORE_VEC(tmp3, tmp2, dst + 3 * dst_stride);

    ILV_B_LRLR_UB(src5, src6, src6, src7, vec1, vec0, vec3, vec2);

    tmp0 = __msa_dotp_u_h(vec0, filt0);
    tmp1 = __msa_dotp_u_h(vec1, filt0);

    tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
    tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);

    PCKEV_B_STORE_VEC(tmp1, tmp0, dst + 16);

    tmp2 = __msa_dotp_u_h(vec2, filt0);
    tmp3 = __msa_dotp_u_h(vec3, filt0);

    tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
    tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);

    PCKEV_B_STORE_VEC(tmp3, tmp2, dst + 16 + dst_stride);

    ILV_B_LRLR_UB(src7, src8, src8, src9, vec5, vec4, vec7, vec6);

    tmp0 = __msa_dotp_u_h(vec4, filt0);
    tmp1 = __msa_dotp_u_h(vec5, filt0);

    tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
    tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);

    PCKEV_B_STORE_VEC(tmp1, tmp0, dst + 16 + 2 * dst_stride);

    tmp2 = __msa_dotp_u_h(vec6, filt0);
    tmp3 = __msa_dotp_u_h(vec7, filt0);

    tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
    tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);

    PCKEV_B_STORE_VEC(tmp3, tmp2, dst + 16 + 3 * dst_stride);
    dst += (4 * dst_stride);

    src0 = src4;
    src5 = src9;
  }
}

static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 src8, src9, src10, src11;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
  v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  v8u16 filt;

  /* rearranging filter_y */
  filt = LOAD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LOAD_4VECS_UB(src, 16, src0, src3, src6, src9);
  src += src_stride;

  for (loop_cnt = (height >> 1); loop_cnt--;) {
    LOAD_2VECS_UB(src, src_stride, src1, src2);
    LOAD_2VECS_UB(src + 16, src_stride, src4, src5);
    LOAD_2VECS_UB(src + 32, src_stride, src7, src8);
    LOAD_2VECS_UB(src + 48, src_stride, src10, src11);
    src += (2 * src_stride);

    ILV_B_LRLR_UB(src0, src1, src1, src2, vec1, vec0, vec3, vec2);

    tmp0 = __msa_dotp_u_h(vec0, filt0);
    tmp1 = __msa_dotp_u_h(vec1, filt0);

    tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
    tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);

    PCKEV_B_STORE_VEC(tmp1, tmp0, dst);

    tmp2 = __msa_dotp_u_h(vec2, filt0);
    tmp3 = __msa_dotp_u_h(vec3, filt0);

    tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
    tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);

    PCKEV_B_STORE_VEC(tmp3, tmp2, dst + dst_stride);

    ILV_B_LRLR_UB(src3, src4, src4, src5, vec5, vec4, vec7, vec6);

    tmp4 = __msa_dotp_u_h(vec4, filt0);
    tmp5 = __msa_dotp_u_h(vec5, filt0);

    tmp4 = SRARI_SATURATE_UNSIGNED_H(tmp4, FILTER_BITS, 7);
    tmp5 = SRARI_SATURATE_UNSIGNED_H(tmp5, FILTER_BITS, 7);

    PCKEV_B_STORE_VEC(tmp5, tmp4, dst + 16);

    tmp6 = __msa_dotp_u_h(vec6, filt0);
    tmp7 = __msa_dotp_u_h(vec7, filt0);

    tmp6 = SRARI_SATURATE_UNSIGNED_H(tmp6, FILTER_BITS, 7);
    tmp7 = SRARI_SATURATE_UNSIGNED_H(tmp7, FILTER_BITS, 7);

    PCKEV_B_STORE_VEC(tmp7, tmp6, dst + 16 + dst_stride);

    ILV_B_LRLR_UB(src6, src7, src7, src8, vec1, vec0, vec3, vec2);

    tmp0 = __msa_dotp_u_h(vec0, filt0);
    tmp1 = __msa_dotp_u_h(vec1, filt0);

    tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
    tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);

    PCKEV_B_STORE_VEC(tmp1, tmp0, dst + 32);

    tmp2 = __msa_dotp_u_h(vec2, filt0);
    tmp3 = __msa_dotp_u_h(vec3, filt0);

    tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
    tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);

    PCKEV_B_STORE_VEC(tmp3, tmp2, dst + 32 + dst_stride);

    ILV_B_LRLR_UB(src9, src10, src10, src11, vec5, vec4, vec7, vec6);

    tmp4 = __msa_dotp_u_h(vec4, filt0);
    tmp5 = __msa_dotp_u_h(vec5, filt0);

    tmp4 = SRARI_SATURATE_UNSIGNED_H(tmp4, FILTER_BITS, 7);
    tmp5 = SRARI_SATURATE_UNSIGNED_H(tmp5, FILTER_BITS, 7);

    PCKEV_B_STORE_VEC(tmp5, tmp4, dst + 48);

    tmp6 = __msa_dotp_u_h(vec6, filt0);
    tmp7 = __msa_dotp_u_h(vec7, filt0);

    tmp6 = SRARI_SATURATE_UNSIGNED_H(tmp6, FILTER_BITS, 7);
    tmp7 = SRARI_SATURATE_UNSIGNED_H(tmp7, FILTER_BITS, 7);

    PCKEV_B_STORE_VEC(tmp7, tmp6, dst + 48 + dst_stride);
    dst += (2 * dst_stride);

    src0 = src2;
    src3 = src5;
    src6 = src8;
    src9 = src11;
  }
}

void vp9_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y, int y_step_q4,
                            int w, int h) {
  int8_t cnt, filt_ver[8];

  if (16 != y_step_q4) {
    vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
                         filter_x, x_step_q4, filter_y, y_step_q4,
                         w, h);
    return;
  }

  if (((const int32_t *)filter_y)[1] == 0x800000) {
    vp9_convolve_copy(src, src_stride, dst, dst_stride,
                      filter_x, x_step_q4, filter_y, y_step_q4,
                      w, h);
    return;
  }

  for (cnt = 8; cnt--;) {
    filt_ver[cnt] = filter_y[cnt];
  }

  if (((const int32_t *)filter_y)[0] == 0) {
    switch (w) {
      case 4:
        common_vt_2t_4w_msa(src, (int32_t)src_stride,
                            dst, (int32_t)dst_stride,
                            &filt_ver[3], h);
        break;
      case 8:
        common_vt_2t_8w_msa(src, (int32_t)src_stride,
                            dst, (int32_t)dst_stride,
                            &filt_ver[3], h);
        break;
      case 16:
        common_vt_2t_16w_msa(src, (int32_t)src_stride,
                             dst, (int32_t)dst_stride,
                             &filt_ver[3], h);
        break;
      case 32:
        common_vt_2t_32w_msa(src, (int32_t)src_stride,
                             dst, (int32_t)dst_stride,
                             &filt_ver[3], h);
        break;
      case 64:
        common_vt_2t_64w_msa(src, (int32_t)src_stride,
                             dst, (int32_t)dst_stride,
                             &filt_ver[3], h);
        break;
      default:
        vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
                             filter_x, x_step_q4, filter_y, y_step_q4,
                             w, h);
        break;
    }
  } else {
    switch (w) {
      case 4:
        common_vt_8t_4w_msa(src, (int32_t)src_stride,
                            dst, (int32_t)dst_stride,
                            filt_ver, h);
        break;
      case 8:
        common_vt_8t_8w_msa(src, (int32_t)src_stride,
                            dst, (int32_t)dst_stride,
                            filt_ver, h);
        break;
      case 16:
        common_vt_8t_16w_msa(src, (int32_t)src_stride,
                             dst, (int32_t)dst_stride,
                             filt_ver, h);
        break;
      case 32:
        common_vt_8t_32w_msa(src, (int32_t)src_stride,
                             dst, (int32_t)dst_stride,
                             filt_ver, h);
        break;
      case 64:
        common_vt_8t_64w_msa(src, (int32_t)src_stride,
                             dst, (int32_t)dst_stride,
                             filt_ver, h);
        break;
      default:
        vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
                             filter_x, x_step_q4, filter_y, y_step_q4,
                             w, h);
        break;
    }
  }
}
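
/* Editor's note: the two pointer casts above test the 16-bit taps pairwise
 * as 32-bit words (little-endian layout assumed here, as on the MIPS MSA
 * targets). Written out tap by tap, they amount to the illustrative helpers
 * below, which are not part of libvpx: 0x800000 in word 1 means taps
 * {2, 3} == {0, 128}, the unit "copy" filter, and word 0 == 0 means taps 0
 * and 1 are zero, which among VP9's filter banks only the bilinear filters
 * satisfy, leaving taps 3 and 4 as the significant pair (hence the
 * &filt_ver[3] arguments above). */
static int is_copy_filter(const int16_t *filter_y) {
  /* {..., 0, 128, ...} at taps 2 and 3: output equals input. */
  return (filter_y[2] == 0) && (filter_y[3] == 128);
}

static int is_bilinear_filter(const int16_t *filter_y) {
  /* leading taps zero: only the 2-tap pair at positions 3 and 4 matters. */
  return (filter_y[0] == 0) && (filter_y[1] == 0);
}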