Chromium Code Reviews

Side by Side Diff: source/libvpx/vp9/common/mips/msa/vp9_macros_msa.h

Issue 1124333011: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: only update to last night's LKGR Created 5 years, 7 months ago
(new file; no previous version)
1 /*
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #ifndef VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_
12 #define VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_
13
14 #include <msa.h>
15
16 #include "./vpx_config.h"
17 #include "vpx/vpx_integer.h"
18
19 #if HAVE_MSA
20 /* load macros */
21 #define LOAD_UB(psrc) *((const v16u8 *)(psrc))
22 #define LOAD_SB(psrc) *((const v16i8 *)(psrc))
23 #define LOAD_UH(psrc) *((const v8u16 *)(psrc))
24 #define LOAD_SH(psrc) *((const v8i16 *)(psrc))
25 #define LOAD_UW(psrc) *((const v4u32 *)(psrc))
26 #define LOAD_SW(psrc) *((const v4i32 *)(psrc))
27 #define LOAD_UD(psrc) *((const v2u64 *)(psrc))
28 #define LOAD_SD(psrc) *((const v2i64 *)(psrc))
29
30 /* store macros */
31 #define STORE_UB(vec, pdest) *((v16u8 *)(pdest)) = (vec)
32 #define STORE_SB(vec, pdest) *((v16i8 *)(pdest)) = (vec)
33 #define STORE_UH(vec, pdest) *((v8u16 *)(pdest)) = (vec)
34 #define STORE_SH(vec, pdest) *((v8i16 *)(pdest)) = (vec)
35 #define STORE_UW(vec, pdest) *((v4u32 *)(pdest)) = (vec)
36 #define STORE_SW(vec, pdest) *((v4i32 *)(pdest)) = (vec)
37 #define STORE_UD(vec, pdest) *((v2u64 *)(pdest)) = (vec)
38 #define STORE_SD(vec, pdest) *((v2i64 *)(pdest)) = (vec)
39
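/* Usage sketch (illustrative, not part of the upstream change): the macros
   above are plain vector-pointer dereferences, so psrc/pdest are assumed to
   be suitably aligned for a full 16-byte MSA access.  A hypothetical row
   copy using them: */
static inline void example_copy_16_bytes(const uint8_t *src, uint8_t *dst) {
  v16u8 row_m = LOAD_UB(src);   /* read 16 bytes as an unsigned byte vector */
  STORE_UB(row_m, dst);         /* write the same 16 bytes back out */
}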
40 #if (__mips_isa_rev >= 6)
41 #define LOAD_WORD(psrc) ({ \
42 const uint8_t *src_m = (const uint8_t *)(psrc); \
43 uint32_t val_m; \
44 \
45 __asm__ __volatile__ ( \
46 "lw %[val_m], %[src_m] \n\t" \
47 \
48 : [val_m] "=r" (val_m) \
49 : [src_m] "m" (*src_m) \
50 ); \
51 \
52 val_m; \
53 })
54
55 #if (__mips == 64)
56 #define LOAD_DWORD(psrc) ({ \
57 const uint8_t *src_m = (const uint8_t *)(psrc); \
58 uint64_t val_m = 0; \
59 \
60 __asm__ __volatile__ ( \
61 "ld %[val_m], %[src_m] \n\t" \
62 \
63 : [val_m] "=r" (val_m) \
64 : [src_m] "m" (*src_m) \
65 ); \
66 \
67 val_m; \
68 })
69 #else // !(__mips == 64)
70 #define LOAD_DWORD(psrc) ({ \
71 const uint8_t *src1_m = (const uint8_t *)(psrc); \
72 const uint8_t *src2_m = ((const uint8_t *)(psrc)) + 4; \
73 uint32_t val0_m, val1_m; \
74 uint64_t genval_m = 0; \
75 \
76 __asm__ __volatile__ ( \
77 "lw %[val0_m], %[src1_m] \n\t" \
78 \
79 : [val0_m] "=r" (val0_m) \
80 : [src1_m] "m" (*src1_m) \
81 ); \
82 \
83 __asm__ __volatile__ ( \
84 "lw %[val1_m], %[src2_m] \n\t" \
85 \
86 : [val1_m] "=r" (val1_m) \
87 : [src2_m] "m" (*src2_m) \
88 ); \
89 \
90 genval_m = (uint64_t)(val1_m); \
91 genval_m = (uint64_t)((genval_m << 32) & 0xFFFFFFFF00000000); \
92 genval_m = (uint64_t)(genval_m | (uint64_t)val0_m); \
93 \
94 genval_m; \
95 })
96 #endif // (__mips == 64)
97 #define STORE_WORD_WITH_OFFSET_1(pdst, val) { \
98 uint8_t *dst_ptr_m = ((uint8_t *)(pdst)) + 1; \
99 const uint32_t val_m = (val); \
100 \
101 __asm__ __volatile__ ( \
102 "sw %[val_m], %[dst_ptr_m] \n\t" \
103 \
104 : [dst_ptr_m] "=m" (*dst_ptr_m) \
105 : [val_m] "r" (val_m) \
106 ); \
107 }
108
109 #define STORE_WORD(pdst, val) { \
110 uint8_t *dst_ptr_m = (uint8_t *)(pdst); \
111 const uint32_t val_m = (val); \
112 \
113 __asm__ __volatile__ ( \
114 "sw %[val_m], %[dst_ptr_m] \n\t" \
115 \
116 : [dst_ptr_m] "=m" (*dst_ptr_m) \
117 : [val_m] "r" (val_m) \
118 ); \
119 }
120
121 #define STORE_DWORD(pdst, val) { \
122 uint8_t *dst_ptr_m = (uint8_t *)(pdst); \
123 const uint64_t val_m = (val); \
124 \
125 __asm__ __volatile__ ( \
126 "sd %[val_m], %[dst_ptr_m] \n\t" \
127 \
128 : [dst_ptr_m] "=m" (*dst_ptr_m) \
129 : [val_m] "r" (val_m) \
130 ); \
131 }
132 #else // !(__mips_isa_rev >= 6)
133 #define LOAD_WORD(psrc) ({ \
134 const uint8_t *src_m = (const uint8_t *)(psrc); \
135 uint32_t val_m; \
136 \
137 __asm__ __volatile__ ( \
138 "ulw %[val_m], %[src_m] \n\t" \
139 \
140 : [val_m] "=r" (val_m) \
141 : [src_m] "m" (*src_m) \
142 ); \
143 \
144 val_m; \
145 })
146
147 #if (__mips == 64)
148 #define LOAD_DWORD(psrc) ({ \
149 const uint8_t *src_m = (const uint8_t *)(psrc); \
150 uint64_t val_m = 0; \
151 \
152 __asm__ __volatile__ ( \
153 "uld %[val_m], %[src_m] \n\t" \
154 \
155 : [val_m] "=r" (val_m) \
156 : [src_m] "m" (*src_m) \
157 ); \
158 \
159 val_m; \
160 })
161 #else // !(__mips == 64)
162 #define LOAD_DWORD(psrc) ({ \
163 const uint8_t *src1_m = (const uint8_t *)(psrc); \
164 const uint8_t *src2_m = ((const uint8_t *)(psrc)) + 4; \
165 uint32_t val0_m, val1_m; \
166 uint64_t genval_m = 0; \
167 \
168 __asm__ __volatile__ ( \
169 "ulw %[val0_m], %[src1_m] \n\t" \
170 \
171 : [val0_m] "=r" (val0_m) \
172 : [src1_m] "m" (*src1_m) \
173 ); \
174 \
175 __asm__ __volatile__ ( \
176 "ulw %[val1_m], %[src2_m] \n\t" \
177 \
178 : [val1_m] "=r" (val1_m) \
179 : [src2_m] "m" (*src2_m) \
180 ); \
181 \
182 genval_m = (uint64_t)(val1_m); \
183 genval_m = (uint64_t)((genval_m << 32) & 0xFFFFFFFF00000000); \
184 genval_m = (uint64_t)(genval_m | (uint64_t)val0_m); \
185 \
186 genval_m; \
187 })
188 #endif // (__mips == 64)
189
190 #define STORE_WORD_WITH_OFFSET_1(pdst, val) { \
191 uint8_t *dst_ptr_m = ((uint8_t *)(pdst)) + 1; \
192 const uint32_t val_m = (val); \
193 \
194 __asm__ __volatile__ ( \
195 "usw %[val_m], %[dst_ptr_m] \n\t" \
196 \
197 : [dst_ptr_m] "=m" (*dst_ptr_m) \
198 : [val_m] "r" (val_m) \
199 ); \
200 }
201
202 #define STORE_WORD(pdst, val) { \
203 uint8_t *dst_ptr_m = (uint8_t *)(pdst); \
204 const uint32_t val_m = (val); \
205 \
206 __asm__ __volatile__ ( \
207 "usw %[val_m], %[dst_ptr_m] \n\t" \
208 \
209 : [dst_ptr_m] "=m" (*dst_ptr_m) \
210 : [val_m] "r" (val_m) \
211 ); \
212 }
213
214 #define STORE_DWORD(pdst, val) { \
215 uint8_t *dst1_m = (uint8_t *)(pdst); \
216 uint8_t *dst2_m = ((uint8_t *)(pdst)) + 4; \
217 uint32_t val0_m, val1_m; \
218 \
219 val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF); \
220 val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
221 \
222 __asm__ __volatile__ ( \
223 "usw %[val0_m], %[dst1_m] \n\t" \
224 "usw %[val1_m], %[dst2_m] \n\t" \
225 \
226 : [dst1_m] "=m" (*dst1_m), [dst2_m] "=m" (*dst2_m) \
227 : [val0_m] "r" (val0_m), [val1_m] "r" (val1_m) \
228 ); \
229 }
230 #endif // (__mips_isa_rev >= 6)
231
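/* Usage sketch (illustrative, not part of the upstream change): LOAD_WORD /
   LOAD_DWORD and STORE_WORD / STORE_DWORD move 32/64-bit scalars through GP
   registers.  The pre-R6 variants use ulw/usw (and a two-word sequence for
   64-bit data on 32-bit targets) so the addresses need not be aligned; the
   R6 variants use plain lw/sw/ld/sd, which the R6 ISA allows on misaligned
   addresses.  A hypothetical unaligned 8-byte copy: */
static inline void example_copy_8_bytes(const uint8_t *src, uint8_t *dst) {
  uint64_t bits_m = LOAD_DWORD(src);
  STORE_DWORD(dst, bits_m);
}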
232 #define LOAD_2VECS_UB(psrc, stride, \
233 val0, val1) { \
234 val0 = LOAD_UB(psrc + 0 * stride); \
235 val1 = LOAD_UB(psrc + 1 * stride); \
236 }
237
238 #define LOAD_4VECS_UB(psrc, stride, \
239 val0, val1, val2, val3) { \
240 val0 = LOAD_UB(psrc + 0 * stride); \
241 val1 = LOAD_UB(psrc + 1 * stride); \
242 val2 = LOAD_UB(psrc + 2 * stride); \
243 val3 = LOAD_UB(psrc + 3 * stride); \
244 }
245
246 #define LOAD_4VECS_SB(psrc, stride, \
247 val0, val1, val2, val3) { \
248 val0 = LOAD_SB(psrc + 0 * stride); \
249 val1 = LOAD_SB(psrc + 1 * stride); \
250 val2 = LOAD_SB(psrc + 2 * stride); \
251 val3 = LOAD_SB(psrc + 3 * stride); \
252 }
253
254 #define LOAD_5VECS_UB(psrc, stride, \
255 out0, out1, out2, out3, out4) { \
256 LOAD_4VECS_UB((psrc), (stride), \
257 (out0), (out1), (out2), (out3)); \
258 out4 = LOAD_UB(psrc + 4 * stride); \
259 }
260
261 #define LOAD_5VECS_SB(psrc, stride, \
262 out0, out1, out2, out3, out4) { \
263 LOAD_4VECS_SB((psrc), (stride), \
264 (out0), (out1), (out2), (out3)); \
265 out4 = LOAD_SB(psrc + 4 * stride); \
266 }
267
268 #define LOAD_7VECS_SB(psrc, stride, \
269 val0, val1, val2, val3, \
270 val4, val5, val6) { \
271 val0 = LOAD_SB((psrc) + 0 * (stride)); \
272 val1 = LOAD_SB((psrc) + 1 * (stride)); \
273 val2 = LOAD_SB((psrc) + 2 * (stride)); \
274 val3 = LOAD_SB((psrc) + 3 * (stride)); \
275 val4 = LOAD_SB((psrc) + 4 * (stride)); \
276 val5 = LOAD_SB((psrc) + 5 * (stride)); \
277 val6 = LOAD_SB((psrc) + 6 * (stride)); \
278 }
279
280 #define LOAD_8VECS_UB(psrc, stride, \
281 out0, out1, out2, out3, \
282 out4, out5, out6, out7) { \
283 LOAD_4VECS_UB((psrc), (stride), \
284 (out0), (out1), (out2), (out3)); \
285 LOAD_4VECS_UB((psrc + 4 * stride), (stride), \
286 (out4), (out5), (out6), (out7)); \
287 }
288
289 #define LOAD_8VECS_SB(psrc, stride, \
290 out0, out1, out2, out3, \
291 out4, out5, out6, out7) { \
292 LOAD_4VECS_SB((psrc), (stride), \
293 (out0), (out1), (out2), (out3)); \
294 LOAD_4VECS_SB((psrc + 4 * stride), (stride), \
295 (out4), (out5), (out6), (out7)); \
296 }
297
298 #define LOAD_2VECS_SH(psrc, stride, \
299 val0, val1) { \
300 val0 = LOAD_SH((psrc) + 0 * (stride)); \
301 val1 = LOAD_SH((psrc) + 1 * (stride)); \
302 }
303
304 #define LOAD_4VECS_SH(psrc, stride, \
305 val0, val1, val2, val3) { \
306 LOAD_2VECS_SH((psrc), (stride), val0, val1); \
307 LOAD_2VECS_SH((psrc + 2 * stride), (stride), val2, val3); \
308 }
309
310 #define LOAD_8VECS_SH(psrc, stride, \
311 val0, val1, val2, val3, \
312 val4, val5, val6, val7) { \
313 LOAD_4VECS_SH((psrc), (stride), \
314 val0, val1, val2, val3); \
315 LOAD_4VECS_SH((psrc + 4 * stride), (stride), \
316 val4, val5, val6, val7); \
317 }
318
319 #define LOAD_16VECS_SH(psrc, stride, \
320 val0, val1, val2, val3, \
321 val4, val5, val6, val7, \
322 val8, val9, val10, val11, \
323 val12, val13, val14, val15) { \
324 LOAD_8VECS_SH((psrc), (stride), \
325 val0, val1, val2, val3, \
326 val4, val5, val6, val7); \
327 LOAD_8VECS_SH((psrc + 8 * (stride)), (stride), \
328 val8, val9, val10, val11, \
329 val12, val13, val14, val15); \
330 }
331
332 #define STORE_4VECS_UB(dst_out, pitch, \
333 in0, in1, in2, in3) { \
334 STORE_UB((in0), (dst_out)); \
335 STORE_UB((in1), ((dst_out) + (pitch))); \
336 STORE_UB((in2), ((dst_out) + 2 * (pitch))); \
337 STORE_UB((in3), ((dst_out) + 3 * (pitch))); \
338 }
339
340 #define STORE_8VECS_UB(dst_out, pitch_in, \
341 in0, in1, in2, in3, \
342 in4, in5, in6, in7) { \
343 STORE_4VECS_UB(dst_out, pitch_in, \
344 in0, in1, in2, in3); \
345 STORE_4VECS_UB((dst_out + 4 * (pitch_in)), pitch_in, \
346 in4, in5, in6, in7); \
347 }
348
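/* Usage sketch (illustrative, not part of the upstream change): the
   LOAD_nVECS_* / STORE_nVECS_* macros read or write one vector per row,
   advancing "stride" between rows (bytes for byte pointers, elements for
   halfword pointers).  A hypothetical 16-wide, 8-row block copy: */
static inline void example_copy_16x8_block(const uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst, int32_t dst_stride) {
  v16u8 r0, r1, r2, r3, r4, r5, r6, r7;

  LOAD_8VECS_UB(src, src_stride, r0, r1, r2, r3, r4, r5, r6, r7);
  STORE_8VECS_UB(dst, dst_stride, r0, r1, r2, r3, r4, r5, r6, r7);
}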
349 #define VEC_INSERT_4W_UB(src, src0, src1, src2, src3) { \
350 src = (v16u8)__msa_insert_w((v4i32)(src), 0, (src0)); \
351 src = (v16u8)__msa_insert_w((v4i32)(src), 1, (src1)); \
352 src = (v16u8)__msa_insert_w((v4i32)(src), 2, (src2)); \
353 src = (v16u8)__msa_insert_w((v4i32)(src), 3, (src3)); \
354 }
355
356 #define VEC_INSERT_2DW_UB(src, src0, src1) { \
357 src = (v16u8)__msa_insert_d((v2i64)(src), 0, (src0)); \
358 src = (v16u8)__msa_insert_d((v2i64)(src), 1, (src1)); \
359 }
360
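/* Usage sketch (illustrative, not part of the upstream change): for 4-pixel
   wide blocks, rows are fetched with scalar LOAD_WORDs and packed into one
   vector, since a full 16-byte vector load could read past the block.
   Hypothetical helper: */
static inline v16u8 example_gather_4x4(const uint8_t *src, int32_t stride) {
  uint32_t w0_m, w1_m, w2_m, w3_m;
  v16u8 out_m = (v16u8)__msa_ldi_b(0);

  w0_m = LOAD_WORD(src + 0 * stride);
  w1_m = LOAD_WORD(src + 1 * stride);
  w2_m = LOAD_WORD(src + 2 * stride);
  w3_m = LOAD_WORD(src + 3 * stride);
  VEC_INSERT_4W_UB(out_m, w0_m, w1_m, w2_m, w3_m);
  return out_m;
}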
361 #define STORE_4VECS_SH(ptr, stride, \
362 in0, in1, in2, in3) { \
363 STORE_SH(in0, ((ptr) + 0 * stride)); \
364 STORE_SH(in1, ((ptr) + 1 * stride)); \
365 STORE_SH(in2, ((ptr) + 2 * stride)); \
366 STORE_SH(in3, ((ptr) + 3 * stride)); \
367 }
368
369 #define STORE_8VECS_SH(ptr, stride, \
370 in0, in1, in2, in3, \
371 in4, in5, in6, in7) { \
372 STORE_SH(in0, ((ptr) + 0 * stride)); \
373 STORE_SH(in1, ((ptr) + 1 * stride)); \
374 STORE_SH(in2, ((ptr) + 2 * stride)); \
375 STORE_SH(in3, ((ptr) + 3 * stride)); \
376 STORE_SH(in4, ((ptr) + 4 * stride)); \
377 STORE_SH(in5, ((ptr) + 5 * stride)); \
378 STORE_SH(in6, ((ptr) + 6 * stride)); \
379 STORE_SH(in7, ((ptr) + 7 * stride)); \
380 }
381
382 #define CLIP_UNSIGNED_CHAR_H(in) ({ \
383 v8i16 max_m = __msa_ldi_h(255); \
384 v8i16 out_m; \
385 \
386 out_m = __msa_maxi_s_h((v8i16)(in), 0); \
387 out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \
388 out_m; \
389 })
390
391 /* halfword 8x8 transpose macro */
392 #define TRANSPOSE8x8_H_SH(in0, in1, in2, in3, \
393 in4, in5, in6, in7, \
394 out0, out1, out2, out3, \
395 out4, out5, out6, out7) { \
396 v8i16 s0_m, s1_m; \
397 v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
398 v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
399 \
400 s0_m = __msa_ilvr_h((v8i16)(in6), (v8i16)(in4)); \
401 s1_m = __msa_ilvr_h((v8i16)(in7), (v8i16)(in5)); \
402 tmp0_m = __msa_ilvr_h((v8i16)s1_m, (v8i16)s0_m); \
403 tmp1_m = __msa_ilvl_h((v8i16)s1_m, (v8i16)s0_m); \
404 \
405 s0_m = __msa_ilvl_h((v8i16)(in6), (v8i16)(in4)); \
406 s1_m = __msa_ilvl_h((v8i16)(in7), (v8i16)(in5)); \
407 tmp2_m = __msa_ilvr_h((v8i16)s1_m, (v8i16)s0_m); \
408 tmp3_m = __msa_ilvl_h((v8i16)s1_m, (v8i16)s0_m); \
409 \
410 s0_m = __msa_ilvr_h((v8i16)(in2), (v8i16)(in0)); \
411 s1_m = __msa_ilvr_h((v8i16)(in3), (v8i16)(in1)); \
412 tmp4_m = __msa_ilvr_h((v8i16)s1_m, (v8i16)s0_m); \
413 tmp5_m = __msa_ilvl_h((v8i16)s1_m, (v8i16)s0_m); \
414 \
415 s0_m = __msa_ilvl_h((v8i16)(in2), (v8i16)(in0)); \
416 s1_m = __msa_ilvl_h((v8i16)(in3), (v8i16)(in1)); \
417 tmp6_m = __msa_ilvr_h((v8i16)s1_m, (v8i16)s0_m); \
418 tmp7_m = __msa_ilvl_h((v8i16)s1_m, (v8i16)s0_m); \
419 \
420 out0 = (v8i16)__msa_pckev_d((v2i64)tmp0_m, (v2i64)tmp4_m); \
421 out1 = (v8i16)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \
422 out2 = (v8i16)__msa_pckev_d((v2i64)tmp1_m, (v2i64)tmp5_m); \
423 out3 = (v8i16)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \
424 out4 = (v8i16)__msa_pckev_d((v2i64)tmp2_m, (v2i64)tmp6_m); \
425 out5 = (v8i16)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \
426 out6 = (v8i16)__msa_pckev_d((v2i64)tmp3_m, (v2i64)tmp7_m); \
427 out7 = (v8i16)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \
428 }
429
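/* Usage sketch (illustrative, not part of the upstream change): the transpose
   is register-to-register; a typical caller loads eight rows of eight
   halfwords, transposes them, and writes the result back.  Hypothetical
   helper (strides are in int16_t elements): */
static inline void example_transpose_8x8(const int16_t *src,
                                         int32_t src_stride,
                                         int16_t *dst, int32_t dst_stride) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 out0, out1, out2, out3, out4, out5, out6, out7;

  LOAD_8VECS_SH(src, src_stride, in0, in1, in2, in3, in4, in5, in6, in7);
  TRANSPOSE8x8_H_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                    out0, out1, out2, out3, out4, out5, out6, out7);
  STORE_8VECS_SH(dst, dst_stride,
                 out0, out1, out2, out3, out4, out5, out6, out7);
}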
430 /* interleave macros */
431 /* no in-place support */
432 #define ILV_B_LRLR_UB(in0, in1, in2, in3, \
433 out0, out1, out2, out3) { \
434 out0 = (v16u8)__msa_ilvl_b((v16i8)(in1), (v16i8)(in0)); \
435 out1 = (v16u8)__msa_ilvr_b((v16i8)(in1), (v16i8)(in0)); \
436 out2 = (v16u8)__msa_ilvl_b((v16i8)(in3), (v16i8)(in2)); \
437 out3 = (v16u8)__msa_ilvr_b((v16i8)(in3), (v16i8)(in2)); \
438 }
439
440 #define ILV_H_LRLR_SH(in0, in1, in2, in3, \
441 out0, out1, out2, out3) { \
442 out0 = __msa_ilvl_h((v8i16)(in1), (v8i16)(in0)); \
443 out1 = __msa_ilvr_h((v8i16)(in1), (v8i16)(in0)); \
444 out2 = __msa_ilvl_h((v8i16)(in3), (v8i16)(in2)); \
445 out3 = __msa_ilvr_h((v8i16)(in3), (v8i16)(in2)); \
446 }
447
448 #define ILV_H_LR_SH(in0, in1, out0, out1) { \
449 out0 = __msa_ilvl_h((v8i16)(in1), (v8i16)(in0)); \
450 out1 = __msa_ilvr_h((v8i16)(in1), (v8i16)(in0)); \
451 }
452
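/* Usage sketch (illustrative, not part of the upstream change): the
   interleave macros pair up elements from two vectors, the usual setup step
   before a widening dot product.  Hypothetical wrapper: */
static inline void example_interleave_h(v8i16 a, v8i16 b,
                                        v8i16 *hi_m, v8i16 *lo_m) {
  ILV_H_LR_SH(a, b, *hi_m, *lo_m);  /* *hi_m: upper halves, *lo_m: lower */
}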
453 #define ILVR_B_2VECS_UB(in0_r, in1_r, in0_l, in1_l, \
454 out0, out1) { \
455 out0 = (v16u8)__msa_ilvr_b((v16i8)(in0_l), (v16i8)(in0_r)); \
456 out1 = (v16u8)__msa_ilvr_b((v16i8)(in1_l), (v16i8)(in1_r)); \
457 }
458
459 #define ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
460 out0, out1) { \
461 out0 = __msa_ilvr_b((v16i8)(in0_l), (v16i8)(in0_r)); \
462 out1 = __msa_ilvr_b((v16i8)(in1_l), (v16i8)(in1_r)); \
463 }
464
465 #define ILVR_B_4VECS_UB(in0_r, in1_r, in2_r, in3_r, \
466 in0_l, in1_l, in2_l, in3_l, \
467 out0, out1, out2, out3) { \
468 ILVR_B_2VECS_UB(in0_r, in1_r, in0_l, in1_l, \
469 out0, out1); \
470 ILVR_B_2VECS_UB(in2_r, in3_r, in2_l, in3_l, \
471 out2, out3); \
472 }
473
474 #define ILVR_B_4VECS_SB(in0_r, in1_r, in2_r, in3_r, \
475 in0_l, in1_l, in2_l, in3_l, \
476 out0, out1, out2, out3) { \
477 ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
478 out0, out1); \
479 ILVR_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \
480 out2, out3); \
481 }
482
483 #define ILVR_B_6VECS_SB(in0_r, in1_r, in2_r, \
484 in3_r, in4_r, in5_r, \
485 in0_l, in1_l, in2_l, \
486 in3_l, in4_l, in5_l, \
487 out0, out1, out2, \
488 out3, out4, out5) { \
489 ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
490 out0, out1); \
491 ILVR_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \
492 out2, out3); \
493 ILVR_B_2VECS_SB(in4_r, in5_r, in4_l, in5_l, \
494 out4, out5); \
495 }
496
497 #define ILVR_B_8VECS_SB(in0_r, in1_r, in2_r, in3_r, \
498 in4_r, in5_r, in6_r, in7_r, \
499 in0_l, in1_l, in2_l, in3_l, \
500 in4_l, in5_l, in6_l, in7_l, \
501 out0, out1, out2, out3, \
502 out4, out5, out6, out7) { \
503 ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
504 out0, out1); \
505 ILVR_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \
506 out2, out3); \
507 ILVR_B_2VECS_SB(in4_r, in5_r, in4_l, in5_l, \
508 out4, out5); \
509 ILVR_B_2VECS_SB(in6_r, in7_r, in6_l, in7_l, \
510 out6, out7); \
511 }
512
513 #define ILVL_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
514 out0, out1) { \
515 out0 = __msa_ilvl_b((v16i8)(in0_l), (v16i8)(in0_r)); \
516 out1 = __msa_ilvl_b((v16i8)(in1_l), (v16i8)(in1_r)); \
517 }
518
519 #define ILVL_B_4VECS_SB(in0_r, in1_r, in2_r, in3_r, \
520 in0_l, in1_l, in2_l, in3_l, \
521 out0, out1, out2, out3) { \
522 ILVL_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
523 out0, out1); \
524 ILVL_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \
525 out2, out3); \
526 }
527
528 #define ILVL_B_6VECS_SB(in0_r, in1_r, in2_r, \
529 in3_r, in4_r, in5_r, \
530 in0_l, in1_l, in2_l, \
531 in3_l, in4_l, in5_l, \
532 out0, out1, out2, \
533 out3, out4, out5) { \
534 ILVL_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
535 out0, out1); \
536 ILVL_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \
537 out2, out3); \
538 ILVL_B_2VECS_SB(in4_r, in5_r, in4_l, in5_l, \
539 out4, out5); \
540 }
541
542 #define ILVR_D_2VECS_SB(out0, in0_l, in0_r, \
543 out1, in1_l, in1_r) { \
544 out0 = (v16i8)__msa_ilvr_d((v2i64)(in0_l), (v2i64)(in0_r)); \
545 out1 = (v16i8)__msa_ilvr_d((v2i64)(in1_l), (v2i64)(in1_r)); \
546 }
547
548 #define ILVR_D_3VECS_SB(out0, in0_l, in0_r, \
549 out1, in1_l, in1_r, \
550 out2, in2_l, in2_r) { \
551 ILVR_D_2VECS_SB(out0, in0_l, in0_r, \
552 out1, in1_l, in1_r); \
553 out2 = (v16i8)__msa_ilvr_d((v2i64)(in2_l), (v2i64)(in2_r)); \
554 }
555
556 #define ILVR_D_4VECS_SB(out0, in0_l, in0_r, \
557 out1, in1_l, in1_r, \
558 out2, in2_l, in2_r, \
559 out3, in3_l, in3_r) { \
560 ILVR_D_2VECS_SB(out0, in0_l, in0_r, \
561 out1, in1_l, in1_r); \
562 ILVR_D_2VECS_SB(out2, in2_l, in2_r, \
563 out3, in3_l, in3_r); \
564 }
565
566 #define DOTP_S_W_4VECS_SW(m0, c0, m1, c1, \
567 m2, c2, m3, c3, \
568 out0, out1, out2, out3) { \
569 out0 = __msa_dotp_s_w((v8i16)(m0), (v8i16)(c0)); \
570 out1 = __msa_dotp_s_w((v8i16)(m1), (v8i16)(c1)); \
571 out2 = __msa_dotp_s_w((v8i16)(m2), (v8i16)(c2)); \
572 out3 = __msa_dotp_s_w((v8i16)(m3), (v8i16)(c3)); \
573 }
574
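/* Usage sketch (illustrative, not part of the upstream change): the widening
   dot product is the core multiply step of the filter kernels; each 32-bit
   output lane is the sum of two adjacent halfword products.  Hypothetical
   helper applying one coefficient vector to four interleaved data vectors: */
static inline void example_dotp_4(v8i16 d0, v8i16 d1, v8i16 d2, v8i16 d3,
                                  v8i16 coeff,
                                  v4i32 *o0, v4i32 *o1, v4i32 *o2, v4i32 *o3) {
  DOTP_S_W_4VECS_SW(d0, coeff, d1, coeff, d2, coeff, d3, coeff,
                    *o0, *o1, *o2, *o3);
}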
575 #define PCKEV_H_2VECS_SH(in0_l, in0_r, in1_l, in1_r, \
576 out0, out1) { \
577 out0 = __msa_pckev_h((v8i16)(in0_l), (v8i16)(in0_r)); \
578 out1 = __msa_pckev_h((v8i16)(in1_l), (v8i16)(in1_r)); \
579 }
580
581 #define XORI_B_2VECS_UB(val0, val1, \
582 out0, out1, xor_val) { \
583 out0 = __msa_xori_b((v16u8)(val0), (xor_val)); \
584 out1 = __msa_xori_b((v16u8)(val1), (xor_val)); \
585 }
586
587 #define XORI_B_2VECS_SB(val0, val1, \
588 out0, out1, xor_val) { \
589 out0 = (v16i8)__msa_xori_b((v16u8)(val0), (xor_val)); \
590 out1 = (v16i8)__msa_xori_b((v16u8)(val1), (xor_val)); \
591 }
592
593 #define XORI_B_3VECS_SB(val0, val1, val2, \
594 out0, out1, out2, xor_val) { \
595 XORI_B_2VECS_SB(val0, val1, out0, out1, xor_val); \
596 out2 = (v16i8)__msa_xori_b((v16u8)(val2), (xor_val)); \
597 }
598
599 #define XORI_B_4VECS_UB(val0, val1, val2, val3, \
600 out0, out1, out2, out3, \
601 xor_val) { \
602 XORI_B_2VECS_UB(val0, val1, out0, out1, xor_val); \
603 XORI_B_2VECS_UB(val2, val3, out2, out3, xor_val); \
604 }
605
606 #define XORI_B_4VECS_SB(val0, val1, val2, val3, \
607 out0, out1, out2, out3, \
608 xor_val) { \
609 XORI_B_2VECS_SB(val0, val1, out0, out1, xor_val); \
610 XORI_B_2VECS_SB(val2, val3, out2, out3, xor_val); \
611 }
612
613 #define XORI_B_7VECS_SB(val0, val1, val2, val3, \
614 val4, val5, val6, \
615 out0, out1, out2, out3, \
616 out4, out5, out6, \
617 xor_val) { \
618 XORI_B_4VECS_SB(val0, val1, val2, val3, \
619 out0, out1, out2, out3, xor_val); \
620 XORI_B_3VECS_SB(val4, val5, val6, \
621 out4, out5, out6, xor_val); \
622 }
623
624 #define SRARI_H_4VECS_UH(val0, val1, val2, val3, \
625 out0, out1, out2, out3, \
626 shift_right_val) { \
627 out0 = (v8u16)__msa_srari_h((v8i16)(val0), (shift_right_val)); \
628 out1 = (v8u16)__msa_srari_h((v8i16)(val1), (shift_right_val)); \
629 out2 = (v8u16)__msa_srari_h((v8i16)(val2), (shift_right_val)); \
630 out3 = (v8u16)__msa_srari_h((v8i16)(val3), (shift_right_val)); \
631 }
632
633 #define SRARI_H_4VECS_SH(val0, val1, val2, val3, \
634 out0, out1, out2, out3, \
635 shift_right_val) { \
636 out0 = __msa_srari_h((v8i16)(val0), (shift_right_val)); \
637 out1 = __msa_srari_h((v8i16)(val1), (shift_right_val)); \
638 out2 = __msa_srari_h((v8i16)(val2), (shift_right_val)); \
639 out3 = __msa_srari_h((v8i16)(val3), (shift_right_val)); \
640 }
641
642 #define SRARI_W_4VECS_SW(val0, val1, val2, val3, \
643 out0, out1, out2, out3, \
644 shift_right_val) { \
645 out0 = __msa_srari_w((v4i32)(val0), (shift_right_val)); \
646 out1 = __msa_srari_w((v4i32)(val1), (shift_right_val)); \
647 out2 = __msa_srari_w((v4i32)(val2), (shift_right_val)); \
648 out3 = __msa_srari_w((v4i32)(val3), (shift_right_val)); \
649 }
650
651 #define SRARI_SATURATE_UNSIGNED_H(input, right_shift_val, sat_val) ({ \
652 v8u16 out_m; \
653 \
654 out_m = (v8u16)__msa_srari_h((v8i16)(input), (right_shift_val)); \
655 out_m = __msa_sat_u_h(out_m, (sat_val)); \
656 out_m; \
657 })
658
659 #define SRARI_SATURATE_SIGNED_H(input, right_shift_val, sat_val) ({ \
660 v8i16 out_m; \
661 \
662 out_m = __msa_srari_h((v8i16)(input), (right_shift_val)); \
663 out_m = __msa_sat_s_h(out_m, (sat_val)); \
664 out_m; \
665 })
666
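/* Usage sketch (illustrative, not part of the upstream change): filter sums
   are typically rounded, shifted right by the filter precision, and saturated
   back to pixel range before packing.  The shift and saturation values below
   (7 and 7) are assumptions for illustration, not taken from this file. */
static inline v8u16 example_round_and_saturate(v8u16 sum) {
  /* srari: round then shift right by 7; sat_u with 7: clamp to 8-bit range */
  return SRARI_SATURATE_UNSIGNED_H(sum, 7, 7);
}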
667 #define PCKEV_2B_XORI128_STORE_4_BYTES_4(in1, in2, \
668 pdst, stride) { \
669 uint32_t out0_m, out1_m, out2_m, out3_m; \
670 v16i8 tmp0_m; \
671 uint8_t *dst_m = (uint8_t *)(pdst); \
672 \
673 tmp0_m = __msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \
674 tmp0_m = (v16i8)__msa_xori_b((v16u8)tmp0_m, 128); \
675 \
676 out0_m = __msa_copy_u_w((v4i32)tmp0_m, 0); \
677 out1_m = __msa_copy_u_w((v4i32)tmp0_m, 1); \
678 out2_m = __msa_copy_u_w((v4i32)tmp0_m, 2); \
679 out3_m = __msa_copy_u_w((v4i32)tmp0_m, 3); \
680 \
681 STORE_WORD(dst_m, out0_m); \
682 dst_m += stride; \
683 STORE_WORD(dst_m, out1_m); \
684 dst_m += stride; \
685 STORE_WORD(dst_m, out2_m); \
686 dst_m += stride; \
687 STORE_WORD(dst_m, out3_m); \
688 }
689
690 #define PCKEV_B_4_XORI128_STORE_8_BYTES_4(in1, in2, \
691 in3, in4, \
692 pdst, stride) { \
693 uint64_t out0_m, out1_m, out2_m, out3_m; \
694 v16i8 tmp0_m, tmp1_m; \
695 uint8_t *dst_m = (uint8_t *)(pdst); \
696 \
697 tmp0_m = __msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \
698 tmp1_m = __msa_pckev_b((v16i8)(in4), (v16i8)(in3)); \
699 \
700 tmp0_m = (v16i8)__msa_xori_b((v16u8)tmp0_m, 128); \
701 tmp1_m = (v16i8)__msa_xori_b((v16u8)tmp1_m, 128); \
702 \
703 out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \
704 out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \
705 out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \
706 out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \
707 \
708 STORE_DWORD(dst_m, out0_m); \
709 dst_m += stride; \
710 STORE_DWORD(dst_m, out1_m); \
711 dst_m += stride; \
712 STORE_DWORD(dst_m, out2_m); \
713 dst_m += stride; \
714 STORE_DWORD(dst_m, out3_m); \
715 }
716
717 /* Only for signed vecs */
718 #define PCKEV_B_XORI128_STORE_VEC(in1, in2, pdest) { \
719 v16i8 tmp_m; \
720 \
721 tmp_m = __msa_pckev_b((v16i8)(in1), (v16i8)(in2)); \
722 tmp_m = (v16i8)__msa_xori_b((v16u8)tmp_m, 128); \
723 STORE_SB(tmp_m, (pdest)); \
724 }
725
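/* Usage sketch (illustrative, not part of the upstream change): epilogue of a
   signed filter path.  Halfword results that were biased into the signed
   domain are packed to bytes (pckev keeps the low byte of each halfword) and
   un-biased with xori 128 before the 16-byte store; "lo" supplies the lower
   half of the output vector and "hi" the upper half. */
static inline void example_store_filtered_row(v8i16 hi, v8i16 lo,
                                              uint8_t *dst) {
  PCKEV_B_XORI128_STORE_VEC(hi, lo, dst);
}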
726 /* Only for signed vecs */
727 #define PCKEV_B_4_XORI128_AVG_STORE_8_BYTES_4(in1, dst0, \
728 in2, dst1, \
729 in3, dst2, \
730 in4, dst3, \
731 pdst, stride) { \
732 uint64_t out0_m, out1_m, out2_m, out3_m; \
733 v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
734 uint8_t *dst_m = (uint8_t *)(pdst); \
735 \
736 tmp0_m = (v16u8)__msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \
737 tmp1_m = (v16u8)__msa_pckev_b((v16i8)(in4), (v16i8)(in3)); \
738 \
739 tmp2_m = (v16u8)__msa_ilvr_d((v2i64)(dst1), (v2i64)(dst0)); \
740 tmp3_m = (v16u8)__msa_ilvr_d((v2i64)(dst3), (v2i64)(dst2)); \
741 \
742 tmp0_m = __msa_xori_b(tmp0_m, 128); \
743 tmp1_m = __msa_xori_b(tmp1_m, 128); \
744 \
745 tmp0_m = __msa_aver_u_b(tmp0_m, tmp2_m); \
746 tmp1_m = __msa_aver_u_b(tmp1_m, tmp3_m); \
747 \
748 out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \
749 out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \
750 out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \
751 out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \
752 \
753 STORE_DWORD(dst_m, out0_m); \
754 dst_m += stride; \
755 STORE_DWORD(dst_m, out1_m); \
756 dst_m += stride; \
757 STORE_DWORD(dst_m, out2_m); \
758 dst_m += stride; \
759 STORE_DWORD(dst_m, out3_m); \
760 }
761
762 /* Only for signed vecs */
763 #define PCKEV_B_XORI128_AVG_STORE_VEC(in1, in2, dst, pdest) { \
764 v16u8 tmp_m; \
765 \
766 tmp_m = (v16u8)__msa_pckev_b((v16i8)(in1), (v16i8)(in2)); \
767 tmp_m = __msa_xori_b(tmp_m, 128); \
768 tmp_m = __msa_aver_u_b(tmp_m, (v16u8)(dst)); \
769 STORE_UB(tmp_m, (pdest)); \
770 }
771
772 #define PCKEV_B_STORE_8_BYTES_4(in1, in2, in3, in4, \
773 pdst, stride) { \
774 uint64_t out0_m, out1_m, out2_m, out3_m; \
775 v16i8 tmp0_m, tmp1_m; \
776 uint8_t *dst_m = (uint8_t *)(pdst); \
777 \
778 tmp0_m = __msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \
779 tmp1_m = __msa_pckev_b((v16i8)(in4), (v16i8)(in3)); \
780 \
781 out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \
782 out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \
783 out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \
784 out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \
785 \
786 STORE_DWORD(dst_m, out0_m); \
787 dst_m += stride; \
788 STORE_DWORD(dst_m, out1_m); \
789 dst_m += stride; \
790 STORE_DWORD(dst_m, out2_m); \
791 dst_m += stride; \
792 STORE_DWORD(dst_m, out3_m); \
793 }
794
795 /* Only for unsigned vecs */
796 #define PCKEV_B_STORE_VEC(in1, in2, pdest) { \
797 v16i8 tmp_m; \
798 \
799 tmp_m = __msa_pckev_b((v16i8)(in1), (v16i8)(in2)); \
800 STORE_SB(tmp_m, (pdest)); \
801 }
802
803 #define PCKEV_B_AVG_STORE_8_BYTES_4(in1, dst0, in2, dst1, \
804 in3, dst2, in4, dst3, \
805 pdst, stride) { \
806 uint64_t out0_m, out1_m, out2_m, out3_m; \
807 v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
808 uint8_t *dst_m = (uint8_t *)(pdst); \
809 \
810 tmp0_m = (v16u8)__msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \
811 tmp1_m = (v16u8)__msa_pckev_b((v16i8)(in4), (v16i8)(in3)); \
812 \
813 tmp2_m = (v16u8)__msa_pckev_d((v2i64)(dst1), (v2i64)(dst0)); \
814 tmp3_m = (v16u8)__msa_pckev_d((v2i64)(dst3), (v2i64)(dst2)); \
815 \
816 tmp0_m = __msa_aver_u_b(tmp0_m, tmp2_m); \
817 tmp1_m = __msa_aver_u_b(tmp1_m, tmp3_m); \
818 \
819 out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \
820 out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \
821 out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \
822 out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \
823 \
824 STORE_DWORD(dst_m, out0_m); \
825 dst_m += stride; \
826 STORE_DWORD(dst_m, out1_m); \
827 dst_m += stride; \
828 STORE_DWORD(dst_m, out2_m); \
829 dst_m += stride; \
830 STORE_DWORD(dst_m, out3_m); \
831 }
832
833 #define PCKEV_B_AVG_STORE_VEC(in1, in2, dst, pdest) { \
834 v16u8 tmp_m; \
835 \
836 tmp_m = (v16u8)__msa_pckev_b((v16i8)(in1), (v16i8)(in2)); \
837 tmp_m = __msa_aver_u_b(tmp_m, (v16u8)(dst)); \
838 STORE_UB(tmp_m, (pdest)); \
839 }
840
841 /* Generic for Vector types and GP operations */
842 #define BUTTERFLY_4(in0, in1, in2, in3, \
843 out0, out1, out2, out3) { \
844 out0 = (in0) + (in3); \
845 out1 = (in1) + (in2); \
846 \
847 out2 = (in1) - (in2); \
848 out3 = (in0) - (in3); \
849 }
850
851 /* Generic for Vector types and GP operations */
852 #define BUTTERFLY_8(in0, in1, in2, in3, \
853 in4, in5, in6, in7, \
854 out0, out1, out2, out3, \
855 out4, out5, out6, out7) { \
856 out0 = (in0) + (in7); \
857 out1 = (in1) + (in6); \
858 out2 = (in2) + (in5); \
859 out3 = (in3) + (in4); \
860 \
861 out4 = (in3) - (in4); \
862 out5 = (in2) - (in5); \
863 out6 = (in1) - (in6); \
864 out7 = (in0) - (in7); \
865 }
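/* Usage sketch (illustrative, not part of the upstream change): the butterfly
   macros form the sum/difference pairs used by the transform stages and work
   for anything supporting + and -, vectors or plain integers. */
static inline void example_butterfly_4(v8i16 in0, v8i16 in1,
                                       v8i16 in2, v8i16 in3,
                                       v8i16 *s0, v8i16 *s1,
                                       v8i16 *d0, v8i16 *d1) {
  /* *s0 = in0 + in3, *s1 = in1 + in2, *d0 = in1 - in2, *d1 = in0 - in3 */
  BUTTERFLY_4(in0, in1, in2, in3, *s0, *s1, *d0, *d1);
}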
866 #endif /* HAVE_MSA */
867 #endif /* VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_ */