Chromium Code Reviews

Side by Side Diff: source/libvpx/vp9/common/mips/msa/vp9_macros_msa.h

Issue 1162573005: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 6 months ago
1 /*
2  * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3  *
4  * Use of this source code is governed by a BSD-style license
5  * that can be found in the LICENSE file in the root of the source
6  * tree. An additional intellectual property rights grant can be found
7  * in the file PATENTS. All contributing project authors may
8  * be found in the AUTHORS file in the root of the source tree.
9  */
10
11 #ifndef VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_
12 #define VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_
13
14 #include <msa.h>
15
16 #include "./vpx_config.h"
17 #include "vpx/vpx_integer.h"
18
19 #if HAVE_MSA 19 #define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc))
20 /* load macros */ 20 #define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
21 #define LOAD_UB(psrc) *((const v16u8 *)(psrc)) 21 #define LD_SB(...) LD_B(v16i8, __VA_ARGS__)
22 #define LOAD_SB(psrc) *((const v16i8 *)(psrc)) 22
23 #define LOAD_UH(psrc) *((const v8u16 *)(psrc)) 23 #define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc))
24 #define LOAD_SH(psrc) *((const v8i16 *)(psrc)) 24 #define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
25 #define LOAD_UW(psrc) *((const v4u32 *)(psrc)) 25 #define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
26 #define LOAD_SW(psrc) *((const v4i32 *)(psrc)) 26
27 #define LOAD_UD(psrc) *((const v2u64 *)(psrc)) 27 #define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc))
28 #define LOAD_SD(psrc) *((const v2i64 *)(psrc)) 28 #define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
29 29
30 /* store macros */ 30 #define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
31 #define STORE_UB(vec, pdest) *((v16u8 *)(pdest)) = (vec) 31 #define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
32 #define STORE_SB(vec, pdest) *((v16i8 *)(pdest)) = (vec) 32 #define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
33 #define STORE_UH(vec, pdest) *((v8u16 *)(pdest)) = (vec) 33
34 #define STORE_SH(vec, pdest) *((v8i16 *)(pdest)) = (vec) 34 #define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
35 #define STORE_UW(vec, pdest) *((v4u32 *)(pdest)) = (vec) 35 #define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
36 #define STORE_SW(vec, pdest) *((v4i32 *)(pdest)) = (vec) 36
37 #define STORE_UD(vec, pdest) *((v2u64 *)(pdest)) = (vec) 37 #define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
38 #define STORE_SD(vec, pdest) *((v2i64 *)(pdest)) = (vec) 38 #define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
39 39
40 #if (__mips_isa_rev >= 6)
41 #define LOAD_WORD(psrc) ({ \ 41 #define LH(psrc) ({ \
42 const uint8_t *src_m = (const uint8_t *)(psrc); \ 42 const uint8_t *psrc_m = (const uint8_t *)(psrc); \
43 uint32_t val_m; \ 43 uint16_t val_m; \
44 \ 44 \
45 __asm__ __volatile__ ( \ 45 __asm__ __volatile__ ( \
46 "lw %[val_m], %[src_m] \n\t" \ 46 "lh %[val_m], %[psrc_m] \n\t" \
47 \ 47 \
48 : [val_m] "=r" (val_m) \ 48 : [val_m] "=r" (val_m) \
49 : [src_m] "m" (*src_m) \ 49 : [psrc_m] "m" (*psrc_m) \
50 ); \ 50 ); \
51 \ 51 \
52 val_m; \ 52 val_m; \
53 }) 53 })
54
55 #define LW(psrc) ({ \
56 const uint8_t *psrc_m = (const uint8_t *)(psrc); \
57 uint32_t val_m; \
58 \
59 __asm__ __volatile__ ( \
60 "lw %[val_m], %[psrc_m] \n\t" \
61 \
62 : [val_m] "=r" (val_m) \
63 : [psrc_m] "m" (*psrc_m) \
64 ); \
65 \
66 val_m; \
67 })
68
55 #if (__mips == 64) 69 #if (__mips == 64)
56 #define LOAD_DWORD(psrc) ({ \ 70 #define LD(psrc) ({ \
57 const uint8_t *src_m = (const uint8_t *)(psrc); \ 71 const uint8_t *psrc_m = (const uint8_t *)(psrc); \
58 uint64_t val_m = 0; \ 72 uint64_t val_m = 0; \
59 \ 73 \
60 __asm__ __volatile__ ( \ 74 __asm__ __volatile__ ( \
61 "ld %[val_m], %[src_m] \n\t" \ 75 "ld %[val_m], %[psrc_m] \n\t" \
62 \ 76 \
63 : [val_m] "=r" (val_m) \ 77 : [val_m] "=r" (val_m) \
64 : [src_m] "m" (*src_m) \ 78 : [psrc_m] "m" (*psrc_m) \
65 ); \ 79 ); \
66 \ 80 \
67 val_m; \ 81 val_m; \
68 }) 82 })
69 #else // !(__mips == 64) 83 #else // !(__mips == 64)
70 #define LOAD_DWORD(psrc) ({ \ 84 #define LD(psrc) ({ \
71 const uint8_t *src1_m = (const uint8_t *)(psrc); \ 85 const uint8_t *psrc_m = (const uint8_t *)(psrc); \
72 const uint8_t *src2_m = ((const uint8_t *)(psrc)) + 4; \ 86 uint32_t val0_m, val1_m; \
73 uint32_t val0_m, val1_m; \ 87 uint64_t val_m = 0; \
74 uint64_t genval_m = 0; \ 88 \
75 \ 89 val0_m = LW(psrc_m); \
76 __asm__ __volatile__ ( \ 90 val1_m = LW(psrc_m + 4); \
77 "lw %[val0_m], %[src1_m] \n\t" \ 91 \
78 \ 92 val_m = (uint64_t)(val1_m); \
79 : [val0_m] "=r" (val0_m) \ 93 val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \
80 : [src1_m] "m" (*src1_m) \ 94 val_m = (uint64_t)(val_m | (uint64_t)val0_m); \
81 ); \ 95 \
82 \ 96 val_m; \
83 __asm__ __volatile__ ( \
84 "lw %[val1_m], %[src2_m] \n\t" \
85 \
86 : [val1_m] "=r" (val1_m) \
87 : [src2_m] "m" (*src2_m) \
88 ); \
89 \
90 genval_m = (uint64_t)(val1_m); \
91 genval_m = (uint64_t)((genval_m << 32) & 0xFFFFFFFF00000000); \
92 genval_m = (uint64_t)(genval_m | (uint64_t)val0_m); \
93 \
94 genval_m; \
95 }) 97 })
96 #endif // (__mips == 64) 98 #endif // (__mips == 64)
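/* Editorial note, not part of the patch: on 32-bit targets the new LD builds
   the 64-bit value from two LW loads, low word first (little endian). For a
   buffer holding the bytes 01 02 03 04 05 06 07 08:

     val0_m = LW(psrc)     = 0x04030201
     val1_m = LW(psrc + 4) = 0x08070605
     val_m  = ((uint64_t)val1_m << 32) | val0_m = 0x0807060504030201
*/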
97 #define STORE_WORD_WITH_OFFSET_1(pdst, val) { \ 99
98 uint8_t *dst_ptr_m = ((uint8_t *)(pdst)) + 1; \ 100 #define SH(val, pdst) { \
99 const uint32_t val_m = (val); \ 101 uint8_t *pdst_m = (uint8_t *)(pdst); \
100 \ 102 const uint16_t val_m = (val); \
101 __asm__ __volatile__ ( \ 103 \
102 "sw %[val_m], %[dst_ptr_m] \n\t" \ 104 __asm__ __volatile__ ( \
103 \ 105 "sh %[val_m], %[pdst_m] \n\t" \
104 : [dst_ptr_m] "=m" (*dst_ptr_m) \ 106 \
105 : [val_m] "r" (val_m) \ 107 : [pdst_m] "=m" (*pdst_m) \
106 ); \ 108 : [val_m] "r" (val_m) \
107 } 109 ); \
108 110 }
109 #define STORE_WORD(pdst, val) { \ 111
110 uint8_t *dst_ptr_m = (uint8_t *)(pdst); \ 112 #define SW(val, pdst) { \
111 const uint32_t val_m = (val); \ 113 uint8_t *pdst_m = (uint8_t *)(pdst); \
112 \ 114 const uint32_t val_m = (val); \
113 __asm__ __volatile__ ( \ 115 \
114 "sw %[val_m], %[dst_ptr_m] \n\t" \ 116 __asm__ __volatile__ ( \
115 \ 117 "sw %[val_m], %[pdst_m] \n\t" \
116 : [dst_ptr_m] "=m" (*dst_ptr_m) \ 118 \
117 : [val_m] "r" (val_m) \ 119 : [pdst_m] "=m" (*pdst_m) \
118 ); \ 120 : [val_m] "r" (val_m) \
119 } 121 ); \
120 122 }
121 #define STORE_DWORD(pdst, val) { \ 123
122 uint8_t *dst_ptr_m = (uint8_t *)(pdst); \ 124 #define SD(val, pdst) { \
123 const uint64_t val_m = (val); \ 125 uint8_t *pdst_m = (uint8_t *)(pdst); \
124 \ 126 const uint64_t val_m = (val); \
125 __asm__ __volatile__ ( \ 127 \
126 "sd %[val_m], %[dst_ptr_m] \n\t" \ 128 __asm__ __volatile__ ( \
127 \ 129 "sd %[val_m], %[pdst_m] \n\t" \
128 : [dst_ptr_m] "=m" (*dst_ptr_m) \ 130 \
129 : [val_m] "r" (val_m) \ 131 : [pdst_m] "=m" (*pdst_m) \
130 ); \ 132 : [val_m] "r" (val_m) \
133 ); \
131 } 134 }
132 #else // !(__mips_isa_rev >= 6) 135 #else // !(__mips_isa_rev >= 6)
133 #define LOAD_WORD(psrc) ({ \ 136 #define LH(psrc) ({ \
134 const uint8_t *src_m = (const uint8_t *)(psrc); \ 137 const uint8_t *psrc_m = (const uint8_t *)(psrc); \
135 uint32_t val_m; \ 138 uint16_t val_m; \
136 \ 139 \
137 __asm__ __volatile__ ( \ 140 __asm__ __volatile__ ( \
138 "ulw %[val_m], %[src_m] \n\t" \ 141 "ulh %[val_m], %[psrc_m] \n\t" \
139 \ 142 \
140 : [val_m] "=r" (val_m) \ 143 : [val_m] "=r" (val_m) \
141 : [src_m] "m" (*src_m) \ 144 : [psrc_m] "m" (*psrc_m) \
142 ); \ 145 ); \
143 \ 146 \
144 val_m; \ 147 val_m; \
145 }) 148 })
146 149
150 #define LW(psrc) ({ \
151 const uint8_t *psrc_m = (const uint8_t *)(psrc); \
152 uint32_t val_m; \
153 \
154 __asm__ __volatile__ ( \
155 "ulw %[val_m], %[psrc_m] \n\t" \
156 \
157 : [val_m] "=r" (val_m) \
158 : [psrc_m] "m" (*psrc_m) \
159 ); \
160 \
161 val_m; \
162 })
163
147 #if (__mips == 64) 164 #if (__mips == 64)
148 #define LOAD_DWORD(psrc) ({ \ 165 #define LD(psrc) ({ \
149 const uint8_t *src_m = (const uint8_t *)(psrc); \ 166 const uint8_t *psrc_m = (const uint8_t *)(psrc); \
150 uint64_t val_m = 0; \ 167 uint64_t val_m = 0; \
151 \ 168 \
152 __asm__ __volatile__ ( \ 169 __asm__ __volatile__ ( \
153 "uld %[val_m], %[src_m] \n\t" \ 170 "uld %[val_m], %[psrc_m] \n\t" \
154 \ 171 \
155 : [val_m] "=r" (val_m) \ 172 : [val_m] "=r" (val_m) \
156 : [src_m] "m" (*src_m) \ 173 : [psrc_m] "m" (*psrc_m) \
157 ); \ 174 ); \
158 \ 175 \
159 val_m; \ 176 val_m; \
160 }) 177 })
161 #else // !(__mips == 64) 178 #else // !(__mips == 64)
162 #define LOAD_DWORD(psrc) ({ \ 179 #define LD(psrc) ({ \
163 const uint8_t *src1_m = (const uint8_t *)(psrc); \ 180 const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \
164 const uint8_t *src2_m = ((const uint8_t *)(psrc)) + 4; \ 181 uint32_t val0_m, val1_m; \
165 uint32_t val0_m, val1_m; \ 182 uint64_t val_m = 0; \
166 uint64_t genval_m = 0; \ 183 \
167 \ 184 val0_m = LW(psrc_m1); \
168 __asm__ __volatile__ ( \ 185 val1_m = LW(psrc_m1 + 4); \
169 "ulw %[val0_m], %[src1_m] \n\t" \ 186 \
170 \ 187 val_m = (uint64_t)(val1_m); \
171 : [val0_m] "=r" (val0_m) \ 188 val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \
172 : [src1_m] "m" (*src1_m) \ 189 val_m = (uint64_t)(val_m | (uint64_t)val0_m); \
173 ); \ 190 \
174 \ 191 val_m; \
175 __asm__ __volatile__ ( \
176 "ulw %[val1_m], %[src2_m] \n\t" \
177 \
178 : [val1_m] "=r" (val1_m) \
179 : [src2_m] "m" (*src2_m) \
180 ); \
181 \
182 genval_m = (uint64_t)(val1_m); \
183 genval_m = (uint64_t)((genval_m << 32) & 0xFFFFFFFF00000000); \
184 genval_m = (uint64_t)(genval_m | (uint64_t)val0_m); \
185 \
186 genval_m; \
187 }) 192 })
188 #endif // (__mips == 64) 193 #endif // (__mips == 64)
189 194
190 #define STORE_WORD_WITH_OFFSET_1(pdst, val) { \ 195 #define SH(val, pdst) { \
191 uint8_t *dst_ptr_m = ((uint8_t *)(pdst)) + 1; \ 196 uint8_t *pdst_m = (uint8_t *)(pdst); \
192 const uint32_t val_m = (val); \ 197 const uint16_t val_m = (val); \
193 \ 198 \
194 __asm__ __volatile__ ( \ 199 __asm__ __volatile__ ( \
195 "usw %[val_m], %[dst_ptr_m] \n\t" \ 200 "ush %[val_m], %[pdst_m] \n\t" \
196 \ 201 \
197 : [dst_ptr_m] "=m" (*dst_ptr_m) \ 202 : [pdst_m] "=m" (*pdst_m) \
198 : [val_m] "r" (val_m) \ 203 : [val_m] "r" (val_m) \
199 ); \ 204 ); \
200 } 205 }
201 206
202 #define STORE_WORD(pdst, val) { \ 207 #define SW(val, pdst) { \
203 uint8_t *dst_ptr_m = (uint8_t *)(pdst); \ 208 uint8_t *pdst_m = (uint8_t *)(pdst); \
204 const uint32_t val_m = (val); \ 209 const uint32_t val_m = (val); \
205 \ 210 \
206 __asm__ __volatile__ ( \ 211 __asm__ __volatile__ ( \
207 "usw %[val_m], %[dst_ptr_m] \n\t" \ 212 "usw %[val_m], %[pdst_m] \n\t" \
208 \ 213 \
209 : [dst_ptr_m] "=m" (*dst_ptr_m) \ 214 : [pdst_m] "=m" (*pdst_m) \
210 : [val_m] "r" (val_m) \ 215 : [val_m] "r" (val_m) \
211 ); \ 216 ); \
212 } 217 }
213 218
214 #define STORE_DWORD(pdst, val) { \ 219 #define SD(val, pdst) { \
215 uint8_t *dst1_m = (uint8_t *)(pdst); \ 220 uint8_t *pdst_m1 = (uint8_t *)(pdst); \
216 uint8_t *dst2_m = ((uint8_t *)(pdst)) + 4; \
217 uint32_t val0_m, val1_m; \ 221 uint32_t val0_m, val1_m; \
218 \ 222 \
219 val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF); \ 223 val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF); \
220 val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ 224 val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
221 \ 225 \
222 __asm__ __volatile__ ( \ 226 SW(val0_m, pdst_m1); \
223 "usw %[val0_m], %[dst1_m] \n\t" \ 227 SW(val1_m, pdst_m1 + 4); \
224 "usw %[val1_m], %[dst2_m] \n\t" \ 228 }
229 #endif // (__mips_isa_rev >= 6)
230
231 /* Description : Store 4 words with stride
232 Arguments : Inputs - in0, in1, in2, in3, pdst, stride
233 Details : Stores word from 'in0' to (pdst)
234 Stores word from 'in1' to (pdst + stride)
235 Stores word from 'in2' to (pdst + 2 * stride)
236 Stores word from 'in3' to (pdst + 3 * stride)
237 */
238 #define SW4(in0, in1, in2, in3, pdst, stride) { \
239 SW(in0, (pdst)); \
240 SW(in1, (pdst) + stride); \
241 SW(in2, (pdst) + 2 * stride); \
242 SW(in3, (pdst) + 3 * stride); \
243 }
244
245 /* Description : Store 4 double words with stride
246 Arguments : Inputs - in0, in1, in2, in3, pdst, stride
247 Details : Stores double word from 'in0' to (pdst)
248 Stores double word from 'in1' to (pdst + stride)
249 Stores double word from 'in2' to (pdst + 2 * stride)
250 Stores double word from 'in3' to (pdst + 3 * stride)
251 */
252 #define SD4(in0, in1, in2, in3, pdst, stride) { \
253 SD(in0, (pdst)); \
254 SD(in1, (pdst) + stride); \
255 SD(in2, (pdst) + 2 * stride); \
256 SD(in3, (pdst) + 3 * stride); \
257 }
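/* Editorial note, not part of the patch: a hypothetical SW4/SD4 call site
   writing four rows of a block through a byte pointer:

     uint8_t *dst;                  // destination block base (hypothetical)
     int32_t stride;                // bytes per row (hypothetical)
     uint32_t w0, w1, w2, w3;       // e.g. extracted with __msa_copy_u_w()
     SW4(w0, w1, w2, w3, dst, stride);   // 4x4 block, one word per row
*/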
258
259 /* Description : Load vectors with 16 byte elements with stride
260 Arguments : Inputs - psrc (source pointer to load from)
261 - stride
262 Outputs - out0, out1
263 Return Type - as per RTYPE
264 Details : Loads 16 byte elements in 'out0' from (psrc)
265 Loads 16 byte elements in 'out1' from (psrc + stride)
266 */
267 #define LD_B2(RTYPE, psrc, stride, out0, out1) { \
268 out0 = LD_B(RTYPE, (psrc)); \
269 out1 = LD_B(RTYPE, (psrc) + stride); \
270 }
271 #define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
272 #define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
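/* Editorial note, not part of the patch: LD_B2 expands to two vector loads;
   a hypothetical call reading two 16-byte rows of unsigned pixels:

     v16u8 row0, row1;
     LD_UB2(src, src_stride, row0, row1);
     // row0 <- 16 bytes at src, row1 <- 16 bytes at src + src_stride
*/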
273
274 #define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) { \
275 LD_B2(RTYPE, (psrc), stride, out0, out1); \
276 LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
277 }
278 #define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
279 #define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
280
281 #define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) { \
282 LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
283 out4 = LD_B(RTYPE, (psrc) + 4 * stride); \
284 }
285 #define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
286 #define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)
287
288 #define LD_B7(RTYPE, psrc, stride, \
289 out0, out1, out2, out3, out4, out5, out6) { \
290 LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \
291 LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \
292 }
293 #define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)
294
295 #define LD_B8(RTYPE, psrc, stride, \
296 out0, out1, out2, out3, out4, out5, out6, out7) { \
297 LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
298 LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \
299 }
300 #define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
301 #define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
302
303 /* Description : Load vectors with 8 halfword elements with stride
304 Arguments : Inputs - psrc (source pointer to load from)
305 - stride
306 Outputs - out0, out1
307 Details : Loads 8 halfword elements in 'out0' from (psrc)
308 Loads 8 halfword elements in 'out1' from (psrc + stride)
309 */
310 #define LD_H2(RTYPE, psrc, stride, out0, out1) { \
311 out0 = LD_H(RTYPE, (psrc)); \
312 out1 = LD_H(RTYPE, (psrc) + (stride)); \
313 }
314 #define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
315
316 #define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) { \
317 LD_H2(RTYPE, (psrc), stride, out0, out1); \
318 LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
319 }
320 #define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
321
322 #define LD_H8(RTYPE, psrc, stride, \
323 out0, out1, out2, out3, out4, out5, out6, out7) { \
324 LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
325 LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \
326 }
327 #define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)
328
329 #define LD_H16(RTYPE, psrc, stride, \
330 out0, out1, out2, out3, out4, out5, out6, out7, \
331 out8, out9, out10, out11, out12, out13, out14, out15) { \
332 LD_H8(RTYPE, (psrc), stride, \
333 out0, out1, out2, out3, out4, out5, out6, out7); \
334 LD_H8(RTYPE, (psrc) + 8 * stride, stride, \
335 out8, out9, out10, out11, out12, out13, out14, out15); \
336 }
337 #define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__)
338
339 /* Description : Store vectors of 16 byte elements with stride
340 Arguments : Inputs - in0, in1, stride
341 Outputs - pdst (destination pointer to store to)
342 Details : Stores 16 byte elements from 'in0' to (pdst)
343 Stores 16 byte elements from 'in1' to (pdst + stride)
344 */
345 #define ST_B2(RTYPE, in0, in1, pdst, stride) { \
346 ST_B(RTYPE, in0, (pdst)); \
347 ST_B(RTYPE, in1, (pdst) + stride); \
348 }
349 #define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
350
351 #define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) { \
352 ST_B2(RTYPE, in0, in1, (pdst), stride); \
353 ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
354 }
355 #define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
356
357 #define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
358 pdst, stride) { \
359 ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \
360 ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
361 }
362 #define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
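/* Editorial note, not part of the patch: the LD_* and ST_* families pair up;
   a hypothetical 16x8 block copy:

     v16u8 r0, r1, r2, r3, r4, r5, r6, r7;
     LD_UB8(src, src_stride, r0, r1, r2, r3, r4, r5, r6, r7);
     ST_UB8(r0, r1, r2, r3, r4, r5, r6, r7, dst, dst_stride);
*/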
363
364 /* Description : Store vectors of 8 halfword elements with stride
365 Arguments : Inputs - in0, in1, stride
366 Outputs - pdst (destination pointer to store to)
367 Details : Stores 8 halfword elements from 'in0' to (pdst)
368 Stores 8 halfword elements from 'in1' to (pdst + stride)
369 */
370 #define ST_H2(RTYPE, in0, in1, pdst, stride) { \
371 ST_H(RTYPE, in0, (pdst)); \
372 ST_H(RTYPE, in1, (pdst) + stride); \
373 }
374 #define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
375
376 #define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) { \
377 ST_H2(RTYPE, in0, in1, (pdst), stride); \
378 ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
379 }
380 #define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)
381
382 #define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) { \
383 ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \
384 ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
385 }
386 #define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
387
388 /* Description : Store a 4x4 byte block to destination memory from the input vectors
389    Arguments   : Inputs - in0, in1, idx0, idx1, idx2, idx3, pdst, stride
390    Return Type - unsigned byte
391    Details     : The idx0 word element from input vector 'in0' is copied and stored
392                  on the first line
393                  The idx1 word element from input vector 'in0' is copied and stored
394                  on the second line
395                  The idx2 word element from input vector 'in1' is copied and stored
396                  on the third line
397                  The idx3 word element from input vector 'in1' is copied and stored
398                  on the fourth line
399 */
400 #define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) { \
401 uint32_t out0_m, out1_m, out2_m, out3_m; \
402 uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \
403 \
404 out0_m = __msa_copy_u_w((v4i32)in0, idx0); \
405 out1_m = __msa_copy_u_w((v4i32)in0, idx1); \
406 out2_m = __msa_copy_u_w((v4i32)in1, idx2); \
407 out3_m = __msa_copy_u_w((v4i32)in1, idx3); \
408 \
409 SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \
410 }
411 #define ST4x8_UB(in0, in1, pdst, stride) { \
412 uint8_t *pblk_4x8 = (uint8_t *)(pdst); \
413 \
414 ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \
415 ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \
416 }
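/* Editorial note, not part of the patch: ST4x4_UB extracts one 32-bit word
   per row with __msa_copy_u_w; idx0-idx3 select which word lane of the input
   vector feeds each row. A hypothetical call writing the two low words of
   each input vector to four rows:

     ST4x4_UB(in0, in1, 0, 1, 0, 1, dst, dst_stride);
*/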
417
418 /* Description : Store as 8x1 byte block to destination memory from input vector
419 Arguments : Inputs - in, pdst
420 Details : Index 0 double word element from input vector 'in' is copied
421 and stored to destination memory at (pdst)
422 */
423 #define ST8x1_UB(in, pdst) { \
424 uint64_t out0_m; \
425 \
426 out0_m = __msa_copy_u_d((v2i64)in, 0); \
427 SD(out0_m, pdst); \
428 }
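/* Editorial note, not part of the patch: ST8x1_UB is the 8-byte counterpart,
   extracting lane 0 with __msa_copy_u_d; a hypothetical call storing the low
   half of a vector as one 8-pixel row:

     ST8x1_UB(res, dst);   // writes bytes 0..7 of 'res' to dst
*/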
429
430 /* Description : Store as 8x4 byte block to destination memory from input
431 vectors
432 Arguments : Inputs - in0, in1, pdst, stride
433 Details : Index 0 double word element from input vector 'in0' is copied
434 and stored to destination memory at (pblk_8x4_m)
435 Index 1 double word element from input vector 'in0' is copied
436 and stored to destination memory at (pblk_8x4_m + stride)
437 Index 0 double word element from input vector 'in1' is copied
438 and stored to destination memory at (pblk_8x4_m + 2 * stride)
439 Index 1 double word element from input vector 'in1' is copied
440 and stored to destination memory at (pblk_8x4_m + 3 * stride)
441 */
442 #define ST8x4_UB(in0, in1, pdst, stride) { \
443 uint64_t out0_m, out1_m, out2_m, out3_m; \
444 uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \
225 \ 445 \
226 : [dst1_m] "=m" (*dst1_m), [dst2_m] "=m" (*dst2_m) \ 446 out0_m = __msa_copy_u_d((v2i64)in0, 0); \
227 : [val0_m] "r" (val0_m), [val1_m] "r" (val1_m) \ 447 out1_m = __msa_copy_u_d((v2i64)in0, 1); \
228 ); \ 448 out2_m = __msa_copy_u_d((v2i64)in1, 0); \
229 } 449 out3_m = __msa_copy_u_d((v2i64)in1, 1); \
230 #endif // (__mips_isa_rev >= 6) 450 \
231 451 SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \
232 #define LOAD_2VECS_UB(psrc, stride, \ 452 }
233 val0, val1) { \ 453
234 val0 = LOAD_UB(psrc + 0 * stride); \ 454 /* Description : Average with rounding: (in0 + in1 + 1) / 2.
235 val1 = LOAD_UB(psrc + 1 * stride); \ 455 Arguments : Inputs - in0, in1, in2, in3,
236 } 456 Outputs - out0, out1
237 457 Return Type - unsigned byte
238 #define LOAD_4VECS_UB(psrc, stride, \ 458 Details : Each byte element from 'in0' vector is added to each byte
239 val0, val1, val2, val3) { \ 459 element from 'in1' vector. The addition of the elements plus 1
240 val0 = LOAD_UB(psrc + 0 * stride); \ 460 (for rounding) is done unsigned with full precision,
241 val1 = LOAD_UB(psrc + 1 * stride); \ 461 i.e. the result has one extra bit. Unsigned division by 2
242 val2 = LOAD_UB(psrc + 2 * stride); \ 462 (or logical shift right by one bit) is performed before writing
243 val3 = LOAD_UB(psrc + 3 * stride); \ 463 the result to vector 'out0'
244 } 464 Similar for the pair of 'in2' and 'in3'
245 465 */
246 #define LOAD_4VECS_SB(psrc, stride, \ 466 #define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) { \
247 val0, val1, val2, val3) { \ 467 out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \
248 val0 = LOAD_SB(psrc + 0 * stride); \ 468 out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3); \
249 val1 = LOAD_SB(psrc + 1 * stride); \ 469 }
250 val2 = LOAD_SB(psrc + 2 * stride); \ 470 #define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)
251 val3 = LOAD_SB(psrc + 3 * stride); \ 471
252 } 472 #define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
253 473 out0, out1, out2, out3) { \
254 #define LOAD_5VECS_UB(psrc, stride, \ 474 AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \
255 out0, out1, out2, out3, out4) { \ 475 AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3) \
256 LOAD_4VECS_UB((psrc), (stride), \ 476 }
257 (out0), (out1), (out2), (out3)); \ 477 #define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
258 out4 = LOAD_UB(psrc + 4 * stride); \ 478
259 } 479 /* Description : Immediate number of columns to slide with zero
260 480 Arguments : Inputs - in0, in1, slide_val
261 #define LOAD_5VECS_SB(psrc, stride, \ 481 Outputs - out0, out1
262 out0, out1, out2, out3, out4) { \ 482 Return Type - as per RTYPE
263 LOAD_4VECS_SB((psrc), (stride), \ 483 Details : Byte elements from 'zero_m' vector are slid into 'in0' by
264 (out0), (out1), (out2), (out3)); \ 484 number of elements specified by 'slide_val'
265 out4 = LOAD_SB(psrc + 4 * stride); \ 485 */
266 } 486 #define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) { \
267 487 v16i8 zero_m = { 0 }; \
268 #define LOAD_7VECS_SB(psrc, stride, \ 488 out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \
269 val0, val1, val2, val3, \ 489 out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \
270 val4, val5, val6) { \ 490 }
271 val0 = LOAD_SB((psrc) + 0 * (stride)); \ 491 #define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)
272 val1 = LOAD_SB((psrc) + 1 * (stride)); \ 492
273 val2 = LOAD_SB((psrc) + 2 * (stride)); \ 493 /* Description : Immediate number of columns to slide
274 val3 = LOAD_SB((psrc) + 3 * (stride)); \ 494 Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val
275 val4 = LOAD_SB((psrc) + 4 * (stride)); \ 495 Outputs - out0, out1
276 val5 = LOAD_SB((psrc) + 5 * (stride)); \ 497 Details : Byte elements from 'in0_0' vector are slid into 'in1_0' by
277 val6 = LOAD_SB((psrc) + 6 * (stride)); \ 497 Details : Byte elements from 'in0_0' vector are slide into 'in1_0' by
278 } 498 number of elements specified by 'slide_val'
279 499 */
280 #define LOAD_8VECS_UB(psrc, stride, \ 500 #define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) { \
281 out0, out1, out2, out3, \ 501 out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val); \
282 out4, out5, out6, out7) { \ 502 out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \
283 LOAD_4VECS_UB((psrc), (stride), \ 503 }
284 (out0), (out1), (out2), (out3)); \ 504 #define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
285 LOAD_4VECS_UB((psrc + 4 * stride), (stride), \ 505
286 (out4), (out5), (out6), (out7)); \ 506 #define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, \
287 } 507 out0, out1, out2, slide_val) { \
288 508 SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \
289 #define LOAD_8VECS_SB(psrc, stride, \ 509 out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val); \
290 out0, out1, out2, out3, \ 510 }
291 out4, out5, out6, out7) { \ 511 #define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
292 LOAD_4VECS_SB((psrc), (stride), \ 512 #define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
293 (out0), (out1), (out2), (out3)); \ 513
294 LOAD_4VECS_SB((psrc + 4 * stride), (stride), \ 514 /* Description : Shuffle byte vector elements as per mask vector
295 (out4), (out5), (out6), (out7)); \ 515 Arguments : Inputs - in0, in1, in2, in3, mask0, mask1
296 } 516 Outputs - out0, out1
297 517 Return Type - as per RTYPE
298 #define LOAD_2VECS_SH(psrc, stride, \ 518 Details : Selective byte elements from in0 & in1 are copied to out0 as
299 val0, val1) { \ 519 per control vector mask0
300 val0 = LOAD_SH((psrc) + 0 * (stride)); \ 520 Selective byte elements from in2 & in3 are copied to out1 as
301 val1 = LOAD_SH((psrc) + 1 * (stride)); \ 521 per control vector mask1
302 } 522 */
303 523 #define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) { \
304 #define LOAD_4VECS_SH(psrc, stride, \ 524 out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \
305 val0, val1, val2, val3) { \ 525 out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \
306 LOAD_2VECS_SH((psrc), (stride), val0, val1); \ 526 }
307 LOAD_2VECS_SH((psrc + 2 * stride), (stride), val2, val3); \ 527 #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
308 } 528 #define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
309 529 #define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
310 #define LOAD_8VECS_SH(psrc, stride, \ 530
311 val0, val1, val2, val3, \ 531 #define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, \
312 val4, val5, val6, val7) { \ 532 out0, out1, out2, out3) { \
313 LOAD_4VECS_SH((psrc), (stride), \ 533 VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \
314 val0, val1, val2, val3); \ 534 VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \
315 LOAD_4VECS_SH((psrc + 4 * stride), (stride), \ 535 }
316 val4, val5, val6, val7); \ 536 #define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
317 } 537 #define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
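/* Editorial note, not part of the patch: each byte of a VSHF mask selects one
   byte out of the 32 bytes of the two source vectors, so a single VSHF_B2
   call can, for example, gather the overlapping tap windows of a horizontal
   filter using a mask such as { 0, 1, 1, 2, 2, 3, ... }. */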
318 538
319 #define LOAD_16VECS_SH(psrc, stride, \ 539 /* Description : Dot product of byte vector elements
320 val0, val1, val2, val3, \ 540 Arguments : Inputs - mult0, mult1
321 val4, val5, val6, val7, \ 541 cnst0, cnst1
322 val8, val9, val10, val11, \ 542 Outputs - out0, out1
323 val12, val13, val14, val15) { \ 543 Return Type - unsigned halfword
324 LOAD_8VECS_SH((psrc), (stride), \ 544 Details : Unsigned byte elements from mult0 are multiplied with
325 val0, val1, val2, val3, \ 545 unsigned byte elements from cnst0 producing a result
326 val4, val5, val6, val7); \ 546 twice the size of input i.e. unsigned halfword.
327 LOAD_8VECS_SH((psrc + 8 * (stride)), (stride), \ 547 The products of adjacent odd-even element pairs
328 val8, val9, val10, val11, \ 548 are then added together and stored to the out vector
329 val12, val13, val14, val15); \ 549 (2 unsigned halfword results)
330 } 550 */
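/* Editorial note, not part of the patch: for DOTP_UB2, each output halfword
   is a two-element dot product of adjacent bytes, roughly

     out0[i] = mult0[2 * i] * cnst0[2 * i]
             + mult0[2 * i + 1] * cnst0[2 * i + 1],   i = 0..7

   so one v16u8 input pair yields 8 halfword results. */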
331 551 #define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \
332 #define STORE_4VECS_UB(dst_out, pitch, \ 552 out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0); \
333 in0, in1, in2, in3) { \ 553 out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1); \
334 STORE_UB((in0), (dst_out)); \ 554 }
335 STORE_UB((in1), ((dst_out) + (pitch))); \ 555 #define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)
336 STORE_UB((in2), ((dst_out) + 2 * (pitch))); \ 556
337 STORE_UB((in3), ((dst_out) + 3 * (pitch))); \ 557 #define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, \
338 } 558 cnst0, cnst1, cnst2, cnst3, \
339 559 out0, out1, out2, out3) { \
340 #define STORE_8VECS_UB(dst_out, pitch_in, \ 560 DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
341 in0, in1, in2, in3, \ 561 DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
342 in4, in5, in6, in7) { \ 562 }
343 STORE_4VECS_UB(dst_out, pitch_in, \ 563 #define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
344 in0, in1, in2, in3); \ 564
345 STORE_4VECS_UB((dst_out + 4 * (pitch_in)), pitch_in, \ 565 /* Description : Dot product of byte vector elements
346 in4, in5, in6, in7); \ 566 Arguments : Inputs - mult0, mult1
347 } 567 cnst0, cnst1
348 568 Outputs - out0, out1
349 #define VEC_INSERT_4W_UB(src, src0, src1, src2, src3) { \ 569 Return Type - signed halfword
350 src = (v16u8)__msa_insert_w((v4i32)(src), 0, (src0)); \ 570 Details : Signed byte elements from mult0 are multiplied with
351 src = (v16u8)__msa_insert_w((v4i32)(src), 1, (src1)); \ 571 signed byte elements from cnst0 producing a result
352 src = (v16u8)__msa_insert_w((v4i32)(src), 2, (src2)); \ 572 twice the size of input i.e. signed halfword.
353 src = (v16u8)__msa_insert_w((v4i32)(src), 3, (src3)); \ 573 The products of adjacent odd-even element pairs
354 } 574 are then added together and stored to the out vector
355 575 (2 signed halfword results)
356 #define VEC_INSERT_2DW_UB(src, src0, src1) { \ 576 */
357 src = (v16u8)__msa_insert_d((v2i64)(src), 0, (src0)); \ 577 #define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \
358 src = (v16u8)__msa_insert_d((v2i64)(src), 1, (src1)); \ 578 out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \
359 } 579 out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \
360 580 }
361 #define STORE_4VECS_SH(ptr, stride, \ 581 #define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
362 in0, in1, in2, in3) { \ 582
363 STORE_SH(in0, ((ptr) + 0 * stride)); \ 583 #define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, \
364 STORE_SH(in1, ((ptr) + 1 * stride)); \ 584 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) { \
365 STORE_SH(in2, ((ptr) + 2 * stride)); \ 585 DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
366 STORE_SH(in3, ((ptr) + 3 * stride)); \ 586 DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
367 } 587 }
368 588 #define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
369 #define STORE_8VECS_SH(ptr, stride, \ 589
370 in0, in1, in2, in3, \ 590 /* Description : Dot product of halfword vector elements
371 in4, in5, in6, in7) { \ 591 Arguments : Inputs - mult0, mult1
372 STORE_SH(in0, ((ptr) + 0 * stride)); \ 592 cnst0, cnst1
373 STORE_SH(in1, ((ptr) + 1 * stride)); \ 593 Outputs - out0, out1
374 STORE_SH(in2, ((ptr) + 2 * stride)); \ 594 Return Type - signed word
375 STORE_SH(in3, ((ptr) + 3 * stride)); \ 595 Details : Signed halfword elements from mult0 are multiplied with
376 STORE_SH(in4, ((ptr) + 4 * stride)); \ 596 signed halfword elements from cnst0 producing a result
377 STORE_SH(in5, ((ptr) + 5 * stride)); \ 598 The products of adjacent odd-even element pairs
378 STORE_SH(in6, ((ptr) + 6 * stride)); \ 599 are then added together and stored to the out vector
379 STORE_SH(in7, ((ptr) + 7 * stride)); \ 599 are added together and stored to the out vector
380 } 600 (2 signed word results)
381 601 */
382 #define CLIP_UNSIGNED_CHAR_H(in) ({ \ 602 #define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \
603 out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \
604 out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \
605 }
606 #define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)
607
608 #define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, \
609 cnst0, cnst1, cnst2, cnst3, \
610 out0, out1, out2, out3) { \
611 DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
612 DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
613 }
614 #define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
615
616 /* Description : Dot product & addition of byte vector elements
617 Arguments : Inputs - mult0, mult1
618 cnst0, cnst1
619 Outputs - out0, out1
620 Return Type - signed halfword
621 Details : Signed byte elements from mult0 are multiplied with
622 signed byte elements from cnst0 producing a result
623 twice the size of input i.e. signed halfword.
624 Then this multiplication results of adjacent odd-even elements
625 are added to the out vector
626 (2 signed halfword results)
627 */
628 #define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \
629 out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \
630 out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \
631 }
632 #define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)
633
634 #define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, \
635 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) { \
636 DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
637 DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
638 }
639 #define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
640
641 /* Description : Minimum values between unsigned elements of
642 either vector are copied to the output vector
643 Arguments : Inputs - in0, in1, min_vec
644 Outputs - in0, in1, (in place)
645 Return Type - unsigned halfword
646 Details : Minimum of unsigned halfword element values from 'in0' and
464 647 'min_vec' are written to output vector 'in0'
648 */
649 #define MIN_UH2(RTYPE, in0, in1, min_vec) { \
650 in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec); \
651 in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec); \
652 }
653 #define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)
654
655 #define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) { \
656 MIN_UH2(RTYPE, in0, in1, min_vec); \
657 MIN_UH2(RTYPE, in2, in3, min_vec); \
658 }
659 #define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
660
661 /* Description : Clips all signed halfword elements of input vector
662 between 0 & 255
663 Arguments : Inputs - in (input vector)
664 Outputs - out_m (output vector with clipped elements)
665 Return Type - signed halfword
666 */
667 #define CLIP_SH_0_255(in) ({ \
383 v8i16 max_m = __msa_ldi_h(255); \ 668 v8i16 max_m = __msa_ldi_h(255); \
384 v8i16 out_m; \ 669 v8i16 out_m; \
385 \ 670 \
386 out_m = __msa_maxi_s_h((v8i16)(in), 0); \ 671 out_m = __msa_maxi_s_h((v8i16)in, 0); \
387 out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \ 672 out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \
388 out_m; \ 673 out_m; \
389 }) 674 })
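/* Editorial note, not part of the patch: CLIP_SH_0_255 computes
   max(0, min(255, x)) per halfword lane; e.g. a lane holding 300 clips to 255
   and a lane holding -7 clips to 0. This is the usual step before packing
   halfword results back to 8-bit pixels. */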
390 675 #define CLIP_SH2_0_255(in0, in1) { \
391 /* halfword 8x8 transpose macro */ 676 in0 = CLIP_SH_0_255(in0); \
392 #define TRANSPOSE8x8_H_SH(in0, in1, in2, in3, \ 677 in1 = CLIP_SH_0_255(in1); \
393 in4, in5, in6, in7, \ 678 }
394 out0, out1, out2, out3, \ 679 #define CLIP_SH4_0_255(in0, in1, in2, in3) { \
395 out4, out5, out6, out7) { \ 680 CLIP_SH2_0_255(in0, in1); \
396 v8i16 s0_m, s1_m; \ 681 CLIP_SH2_0_255(in2, in3); \
397 v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 682 }
398 v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ 683
399 \ 684 /* Description : Interleave even byte elements from vectors
400 s0_m = __msa_ilvr_h((v8i16)(in6), (v8i16)(in4)); \ 685 Arguments : Inputs - in0, in1, in2, in3
401 s1_m = __msa_ilvr_h((v8i16)(in7), (v8i16)(in5)); \ 686 Outputs - out0, out1
402 tmp0_m = __msa_ilvr_h((v8i16)s1_m, (v8i16)s0_m); \ 687 Return Type - as per RTYPE
403 tmp1_m = __msa_ilvl_h((v8i16)s1_m, (v8i16)s0_m); \ 688 Details : Even byte elements of 'in0' and even byte
404 \ 689 elements of 'in1' are interleaved and copied to 'out0'
405 s0_m = __msa_ilvl_h((v8i16)(in6), (v8i16)(in4)); \ 690 Even byte elements of 'in2' and even byte
406 s1_m = __msa_ilvl_h((v8i16)(in7), (v8i16)(in5)); \ 691 elements of 'in3' are interleaved and copied to 'out1'
407 tmp2_m = __msa_ilvr_h((v8i16)s1_m, (v8i16)s0_m); \ 692 */
408 tmp3_m = __msa_ilvl_h((v8i16)s1_m, (v8i16)s0_m); \ 693 #define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \
409 \ 694 out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \
410 s0_m = __msa_ilvr_h((v8i16)(in2), (v8i16)(in0)); \ 695 out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \
411 s1_m = __msa_ilvr_h((v8i16)(in3), (v8i16)(in1)); \ 696 }
412 tmp4_m = __msa_ilvr_h((v8i16)s1_m, (v8i16)s0_m); \ 697 #define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
413 tmp5_m = __msa_ilvl_h((v8i16)s1_m, (v8i16)s0_m); \ 698 #define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
414 \ 699
415 s0_m = __msa_ilvl_h((v8i16)(in2), (v8i16)(in0)); \ 700 /* Description : Interleave even halfword elements from vectors
416 s1_m = __msa_ilvl_h((v8i16)(in3), (v8i16)(in1)); \ 701 Arguments : Inputs - in0, in1, in2, in3
417 tmp6_m = __msa_ilvr_h((v8i16)s1_m, (v8i16)s0_m); \ 702 Outputs - out0, out1
418 tmp7_m = __msa_ilvl_h((v8i16)s1_m, (v8i16)s0_m); \ 703 Return Type - as per RTYPE
419 \ 704 Details : Even halfword elements of 'in0' and even halfword
420 out0 = (v8i16)__msa_pckev_d((v2i64)tmp0_m, (v2i64)tmp4_m); \ 705 elements of 'in1' are interleaved and copied to 'out0'
421 out1 = (v8i16)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \ 706 Even halfword elements of 'in2' and even halfword
422 out2 = (v8i16)__msa_pckev_d((v2i64)tmp1_m, (v2i64)tmp5_m); \ 707 elements of 'in3' are interleaved and copied to 'out1'
423 out3 = (v8i16)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \ 708 */
424 out4 = (v8i16)__msa_pckev_d((v2i64)tmp2_m, (v2i64)tmp6_m); \ 709 #define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \
425 out5 = (v8i16)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \ 710 out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \
426 out6 = (v8i16)__msa_pckev_d((v2i64)tmp3_m, (v2i64)tmp7_m); \ 711 out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \
427 out7 = (v8i16)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \ 712 }
428 } 713 #define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
429 714 #define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
430 /* interleave macros */ 715 #define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
431 /* no in-place support */ 716
432 #define ILV_B_LRLR_UB(in0, in1, in2, in3, \ 717 /* Description : Interleave left half of byte elements from vectors
433 out0, out1, out2, out3) { \ 718 Arguments : Inputs - in0, in1, in2, in3
434 out0 = (v16u8)__msa_ilvl_b((v16i8)(in1), (v16i8)(in0)); \ 719 Outputs - out0, out1
435 out1 = (v16u8)__msa_ilvr_b((v16i8)(in1), (v16i8)(in0)); \ 720 Return Type - as per RTYPE
436 out2 = (v16u8)__msa_ilvl_b((v16i8)(in3), (v16i8)(in2)); \ 721 Details : Left half of byte elements of in0 and left half of byte
437 out3 = (v16u8)__msa_ilvr_b((v16i8)(in3), (v16i8)(in2)); \ 722 elements of in1 are interleaved and copied to out0.
438 } 723 Left half of byte elements of in2 and left half of byte
439 724 elements of in3 are interleaved and copied to out1.
440 #define ILV_H_LRLR_SH(in0, in1, in2, in3, \ 725 */
441 out0, out1, out2, out3) { \ 726 #define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \
442 out0 = __msa_ilvl_h((v8i16)(in1), (v8i16)(in0)); \ 727 out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
443 out1 = __msa_ilvr_h((v8i16)(in1), (v8i16)(in0)); \ 728 out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \
444 out2 = __msa_ilvl_h((v8i16)(in3), (v8i16)(in2)); \ 729 }
445 out3 = __msa_ilvr_h((v8i16)(in3), (v8i16)(in2)); \ 730 #define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
446 } 731 #define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
447 732 #define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
448 #define ILV_H_LR_SH(in0, in1, out0, out1) { \ 733 #define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
449 out0 = __msa_ilvl_h((v8i16)(in1), (v8i16)(in0)); \ 734
450 out1 = __msa_ilvr_h((v8i16)(in1), (v8i16)(in0)); \ 735 #define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
451 } 736 out0, out1, out2, out3) { \
452 737 ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
453 #define ILVR_B_2VECS_UB(in0_r, in1_r, in0_l, in1_l, \ 738 ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
454 out0, out1) { \ 739 }
455 out0 = (v16u8)__msa_ilvr_b((v16i8)(in0_l), (v16i8)(in0_r)); \ 740 #define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
456 out1 = (v16u8)__msa_ilvr_b((v16i8)(in1_l), (v16i8)(in1_r)); \ 741 #define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
457 } 742
458 743 /* Description : Interleave left half of halfword elements from vectors
459 #define ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ 744 Arguments : Inputs - in0, in1, in2, in3
460 out0, out1) { \ 745 Outputs - out0, out1
461 out0 = __msa_ilvr_b((v16i8)(in0_l), (v16i8)(in0_r)); \ 746 Return Type - as per RTYPE
462 out1 = __msa_ilvr_b((v16i8)(in1_l), (v16i8)(in1_r)); \ 747 Details : Left half of halfword elements of in0 and left half of halfword
463 } 748 elements of in1 are interleaved and copied to out0.
464 749 Left half of halfword elements of in2 and left half of halfword
465 #define ILVR_B_4VECS_UB(in0_r, in1_r, in2_r, in3_r, \ 750 elements of in3 are interleaved and copied to out1.
466 in0_l, in1_l, in2_l, in3_l, \ 751 */
467 out0, out1, out2, out3) { \ 752 #define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \
468 ILVR_B_2VECS_UB(in0_r, in1_r, in0_l, in1_l, \ 753 out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
469 out0, out1); \ 754 out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \
470 ILVR_B_2VECS_UB(in2_r, in3_r, in2_l, in3_l, \ 755 }
471 out2, out3); \ 756 #define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
472 } 757
473 758 /* Description : Interleave left half of word elements from vectors
474 #define ILVR_B_4VECS_SB(in0_r, in1_r, in2_r, in3_r, \ 759 Arguments : Inputs - in0, in1, in2, in3
475 in0_l, in1_l, in2_l, in3_l, \ 760 Outputs - out0, out1
476 out0, out1, out2, out3) { \ 761 Return Type - as per RTYPE
477 ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ 762 Details : Left half of word elements of in0 and left half of word
478 out0, out1); \ 763 elements of in1 are interleaved and copied to out0.
479 ILVR_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \ 764 Left half of word elements of in2 and left half of word
480 out2, out3); \ 765 elements of in3 are interleaved and copied to out1.
481 } 766 */
482 767 #define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \
483 #define ILVR_B_6VECS_SB(in0_r, in1_r, in2_r, \ 768 out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
484 in3_r, in4_r, in5_r, \ 769 out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \
485 in0_l, in1_l, in2_l, \ 770 }
486 in3_l, in4_l, in5_l, \ 771 #define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
487 out0, out1, out2, \ 772 #define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
488 out3, out4, out5) { \ 773
489 ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ 774 /* Description : Interleave right half of byte elements from vectors
490 out0, out1); \ 775 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
491 ILVR_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \ 776 Outputs - out0, out1, out2, out3
492 out2, out3); \ 777 Return Type - as per RTYPE
493 ILVR_B_2VECS_SB(in4_r, in5_r, in4_l, in5_l, \ 778 Details : Right half of byte elements of in0 and right half of byte
494 out4, out5); \ 779 elements of in1 are interleaved and copied to out0.
495 } 780 Right half of byte elements of in2 and right half of byte
496 781 elements of in3 are interleaved and copied to out1.
497 #define ILVR_B_8VECS_SB(in0_r, in1_r, in2_r, in3_r, \ 782 Similar for other pairs
498 in4_r, in5_r, in6_r, in7_r, \ 783 */
499 in0_l, in1_l, in2_l, in3_l, \ 784 #define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \
500 in4_l, in5_l, in6_l, in7_l, \ 785 out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
501 out0, out1, out2, out3, \ 786 out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \
502 out4, out5, out6, out7) { \ 787 }
503 ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ 788 #define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
504 out0, out1); \ 789 #define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
505 ILVR_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \ 790 #define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
506 out2, out3); \ 791 #define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
507 ILVR_B_2VECS_SB(in4_r, in5_r, in4_l, in5_l, \ 792
508 out4, out5); \ 793 #define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
509 ILVR_B_2VECS_SB(in6_r, in7_r, in6_l, in7_l, \ 794 out0, out1, out2, out3) { \
510 out6, out7); \ 795 ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
511 } 796 ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
512 797 }
513 #define ILVL_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ 798 #define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
514 out0, out1) { \ 799 #define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
515 out0 = __msa_ilvl_b((v16i8)(in0_l), (v16i8)(in0_r)); \ 800 #define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
516 out1 = __msa_ilvl_b((v16i8)(in1_l), (v16i8)(in1_r)); \ 801 #define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
517 } 802
518 803 /* Description : Interleave right half of halfword elements from vectors
519 #define ILVL_B_4VECS_SB(in0_r, in1_r, in2_r, in3_r, \ 804 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
520 in0_l, in1_l, in2_l, in3_l, \ 805 Outputs - out0, out1, out2, out3
521 out0, out1, out2, out3) { \ 806 Return Type - signed halfword
522 ILVL_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ 807 Details : Right half of halfword elements of in0 and right half of
523 out0, out1); \ 808 halfword elements of in1 are interleaved and copied to out0.
524 ILVL_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \ 809 Right half of halfword elements of in2 and right half of
525 out2, out3); \ 810 halfword elements of in3 are interleaved and copied to out1.
526 } 811 Similar for other pairs
527 812 */
528 #define ILVL_B_6VECS_SB(in0_r, in1_r, in2_r, \ 813 #define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \
529 in3_r, in4_r, in5_r, \ 814 out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
530 in0_l, in1_l, in2_l, \ 815 out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \
531 in3_l, in4_l, in5_l, \ 816 }
532 out0, out1, out2, \ 817 #define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
533 out3, out4, out5) { \ 818
534 ILVL_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ 819 #define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
535 out0, out1); \ 820 out0, out1, out2, out3) { \
536 ILVL_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \ 821 ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
537 out2, out3); \ 822 ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
538 ILVL_B_2VECS_SB(in4_r, in5_r, in4_l, in5_l, \ 823 }
539 out4, out5); \ 824 #define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
540 } 825
541 826 #define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \
542 #define ILVR_D_2VECS_SB(out0, in0_l, in0_r, \ 827 out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
543 out1, in1_l, in1_r) { \ 828 out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \
544 out0 = (v16i8)__msa_ilvr_d((v2i64)(in0_l), (v2i64)(in0_r)); \ 829 }
545 out1 = (v16i8)__msa_ilvr_d((v2i64)(in1_l), (v2i64)(in1_r)); \ 830 #define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
546 } 831 #define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)
547 832
548 #define ILVR_D_3VECS_SB(out0, in0_l, in0_r, \ 833 /* Description : Interleave right half of double word elements from vectors
549 out1, in1_l, in1_r, \ 834 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
550 out2, in2_l, in2_r) { \ 835 Outputs - out0, out1, out2, out3
551 ILVR_D_2VECS_SB(out0, in0_l, in0_r, \ 836 Return Type - unsigned double word
552 out1, in1_l, in1_r); \ 837 Details : Right half of double word elements of in0 and right half of
553 out2 = (v16i8)__msa_ilvr_d((v2i64)(in2_l), (v2i64)(in2_r)); \ 838 double word elements of in1 are interleaved and copied to out0.
554 } 839 Right half of double word elements of in2 and right half of
555 840 double word elements of in3 are interleaved and copied to out1.
556 #define ILVR_D_4VECS_SB(out0, in0_l, in0_r, \ 841 */
557 out1, in1_l, in1_r, \ 842 #define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \
558 out2, in2_l, in2_r, \ 843 out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \
559 out3, in3_l, in3_r) { \ 844 out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \
560 ILVR_D_2VECS_SB(out0, in0_l, in0_r, \ 845 }
561 out1, in1_l, in1_r); \ 846 #define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
562 ILVR_D_2VECS_SB(out2, in2_l, in2_r, \ 847 #define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
563 out3, in3_l, in3_r); \ 848 #define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
564 } 849
565 850 #define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) { \
566 #define DOTP_S_W_4VECS_SW(m0, c0, m1, c1, \ 851 ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
567 m2, c2, m3, c3, \ 852 out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5)); \
568 out0, out1, out2, out3) { \ 853 }
569 out0 = __msa_dotp_s_w((v8i16)(m0), (v8i16)(c0)); \ 854 #define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)
570 out1 = __msa_dotp_s_w((v8i16)(m1), (v8i16)(c1)); \ 855
571 out2 = __msa_dotp_s_w((v8i16)(m2), (v8i16)(c2)); \ 856 #define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
572 out3 = __msa_dotp_s_w((v8i16)(m3), (v8i16)(c3)); \ 857 out0, out1, out2, out3) { \
573 } 858 ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
574 859 ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
575 #define PCKEV_H_2VECS_SH(in0_l, in0_r, in1_l, in1_r, \ 860 }
576 out0, out1) { \ 861 #define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
577 out0 = __msa_pckev_h((v8i16)(in0_l), (v8i16)(in0_r)); \ 862 #define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
578 out1 = __msa_pckev_h((v8i16)(in1_l), (v8i16)(in1_r)); \ 863
579 } 864 /* Description : Interleave both left and right half of input vectors
580 865 Arguments : Inputs - in0, in1
581 #define XORI_B_2VECS_UB(val0, val1, \ 866 Outputs - out0, out1
582 out0, out1, xor_val) { \ 867 Return Type - as per RTYPE
583 out0 = __msa_xori_b((v16u8)(val0), (xor_val)); \ 868 Details : Right half of byte elements from 'in0' and 'in1' are
584 out1 = __msa_xori_b((v16u8)(val1), (xor_val)); \ 869 interleaved and stored to 'out0'
585 } 870 Left half of byte elements from 'in0' and 'in1' are
586 871 interleaved and stored to 'out1'
587 #define XORI_B_2VECS_SB(val0, val1, \ 872 */
588 out0, out1, xor_val) { \ 873 #define ILVRL_B2(RTYPE, in0, in1, out0, out1) { \
589 out0 = (v16i8)__msa_xori_b((v16u8)(val0), (xor_val)); \ 874 out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
590 out1 = (v16i8)__msa_xori_b((v16u8)(val1), (xor_val)); \ 875 out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
591 } 876 }
592 877 #define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
593 #define XORI_B_3VECS_SB(val0, val1, val2, \ 878 #define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
594 out0, out1, out2, xor_val) { \ 879 #define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
595 XORI_B_2VECS_SB(val0, val1, out0, out1, xor_val); \ 880 #define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
596 out2 = (v16i8)__msa_xori_b((v16u8)(val2), (xor_val)); \ 881
597 } 882 #define ILVRL_H2(RTYPE, in0, in1, out0, out1) { \
598 883 out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
599 #define XORI_B_4VECS_UB(val0, val1, val2, val3, \ 884 out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
600 out0, out1, out2, out3, \ 885 }
601 xor_val) { \ 886 #define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
602 XORI_B_2VECS_UB(val0, val1, out0, out1, xor_val); \ 887 #define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
603 XORI_B_2VECS_UB(val2, val3, out2, out3, xor_val); \ 888
604 } 889 #define ILVRL_W2(RTYPE, in0, in1, out0, out1) { \
605 890 out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
606 #define XORI_B_4VECS_SB(val0, val1, val2, val3, \ 891 out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
607 out0, out1, out2, out3, \ 892 }
608 xor_val) { \ 893 #define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
609 XORI_B_2VECS_SB(val0, val1, out0, out1, xor_val); \ 894 #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
610 XORI_B_2VECS_SB(val2, val3, out2, out3, xor_val); \ 895
611 } 896 /* Description : Saturate the halfword element values to the max
612 897 unsigned value of (sat_val+1 bits)
613 #define XORI_B_7VECS_SB(val0, val1, val2, val3, \ 898 The element data width remains unchanged
614 val4, val5, val6, \ 899 Arguments : Inputs - in0, in1, in2, in3, sat_val
615 out0, out1, out2, out3, \ 900 Outputs - in0, in1, in2, in3 (in place)
616 out4, out5, out6, \ 901 Return Type - unsigned halfword
617 xor_val) { \ 902 Details : Each unsigned halfword element from 'in0' is saturated to the
618 XORI_B_4VECS_SB(val0, val1, val2, val3, \ 903 value generated with (sat_val+1) bit range
619 out0, out1, out2, out3, xor_val); \ 904 Results are placed back in the original vectors
620 XORI_B_3VECS_SB(val4, val5, val6, \ 905 */
621 out4, out5, out6, xor_val); \ 906 #define SAT_UH2(RTYPE, in0, in1, sat_val) { \
622 } 907 in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \
623 908 in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \
624 #define SRARI_H_4VECS_UH(val0, val1, val2, val3, \ 909 }
625 out0, out1, out2, out3, \ 910 #define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
626 shift_right_val) { \ 911
627 out0 = (v8u16)__msa_srari_h((v8i16)(val0), (shift_right_val)); \ 912 #define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) { \
628 out1 = (v8u16)__msa_srari_h((v8i16)(val1), (shift_right_val)); \ 913 SAT_UH2(RTYPE, in0, in1, sat_val); \
629 out2 = (v8u16)__msa_srari_h((v8i16)(val2), (shift_right_val)); \ 914 SAT_UH2(RTYPE, in2, in3, sat_val); \
630 out3 = (v8u16)__msa_srari_h((v8i16)(val3), (shift_right_val)); \ 915 }
631 } 916 #define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
632 917
633 #define SRARI_H_4VECS_SH(val0, val1, val2, val3, \ 918 /* Description : Saturate the halfword element values to the max
634 out0, out1, out2, out3, \ 919 signed value of (sat_val+1 bits)
635 shift_right_val) { \ 920 The element data width remains unchanged
636 out0 = __msa_srari_h((v8i16)(val0), (shift_right_val)); \ 921 Arguments : Inputs - in0, in1, in2, in3, sat_val
637 out1 = __msa_srari_h((v8i16)(val1), (shift_right_val)); \ 922 Outputs - in0, in1, in2, in3 (in place)
638 out2 = __msa_srari_h((v8i16)(val2), (shift_right_val)); \ 923 Return Type - signed halfword
639 out3 = __msa_srari_h((v8i16)(val3), (shift_right_val)); \ 924 Details : Each signed halfword element from 'in0' is saturated to the
640 } 925 value generated with (sat_val+1) bit range
641 926 Results are placed back in the original vectors
642 #define SRARI_W_4VECS_SW(val0, val1, val2, val3, \ 927 */
643 out0, out1, out2, out3, \ 928 #define SAT_SH2(RTYPE, in0, in1, sat_val) { \
644 shift_right_val) { \ 929 in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \
645 out0 = __msa_srari_w((v4i32)(val0), (shift_right_val)); \ 930 in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \
646 out1 = __msa_srari_w((v4i32)(val1), (shift_right_val)); \ 931 }
647 out2 = __msa_srari_w((v4i32)(val2), (shift_right_val)); \ 932 #define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
648 out3 = __msa_srari_w((v4i32)(val3), (shift_right_val)); \ 933
649 } 934 #define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) { \
650 935 SAT_SH2(RTYPE, in0, in1, sat_val); \
651 #define SRARI_SATURATE_UNSIGNED_H(input, right_shift_val, sat_val) ({ \ 936 SAT_SH2(RTYPE, in2, in3, sat_val); \
652 v8u16 out_m; \ 937 }
938 #define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
939
940 /* Description : Indexed halfword element values are replicated to all
941 elements in output vector
942 Arguments : Inputs - in, idx0, idx1
943 Outputs - out0, out1
944 Return Type - as per RTYPE
945 Details : 'idx0' element value from 'in' vector is replicated to all
946 elements in 'out0' vector
947 Valid index range for halfword operation is 0-7
948 */
949 #define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) { \
950 out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0); \
951 out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1); \
952 }
953 #define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
954
955 #define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, \
956 out0, out1, out2, out3) { \
957 SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \
958 SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \
959 }
960 #define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
961 #define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
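A sketch of the pattern this typically serves (the `filter` pointer is illustrative): load eight halfword filter taps once, then broadcast each tap into its own vector for per-lane multiplies.

    v8i16 filt = LD_SH(filter);  /* filter: const int16_t[8], illustrative */
    v8i16 filt0, filt1, filt2, filt3;
    SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);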
962
963 /* Description : Pack even byte elements of vector pairs
964 Arguments : Inputs - in0, in1, in2, in3
965 Outputs - out0, out1
966 Return Type - as per RTYPE
967 Details : Even byte elements of in0 are copied to the left half of
968 out0 & even byte elements of in1 are copied to the right
969 half of out0.
970 Even byte elements of in2 are copied to the left half of
971 out1 & even byte elements of in3 are copied to the right
972 half of out1.
973 */
974 #define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \
975 out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \
976 out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \
977 }
978 #define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
979 #define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
980 #define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
981
982 #define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
983 out0, out1, out2, out3) { \
984 PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
985 PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
986 }
987 #define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
988 #define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
989 #define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
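A hypothetical narrowing step: on a little-endian target the even byte elements of a halfword vector are the low bytes of each lane, so PCKEV_B2_UB keeps the low 8 bits of every halfword result. Note the operand order: the first input of each pair lands in the left half of the output.

    v8i16 res0, res1, res2, res3;  /* halfword results already in 0..255, illustrative */
    v16u8 out0, out1;
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);  /* res0 fills the right half of out0 */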
990
991 /* Description : Pack even halfword elements of vector pairs
992 Arguments : Inputs - in0, in1, in2, in3
993 Outputs - out0, out1
994 Return Type - as per RTYPE
995 Details : Even halfword elements of in0 are copied to the left half of
996 out0 & even halfword elements of in1 are copied to the right
997 half of out0.
998 Even halfword elements of in2 are copied to the left half of
999 out1 & even halfword elements of in3 are copied to the right
1000 half of out1.
1001 */
1002 #define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \
1003 out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \
1004 out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \
1005 }
1006 #define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
1007 #define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
1008
1009 #define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1010 out0, out1, out2, out3) { \
1011 PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
1012 PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
1013 }
1014 #define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
1015
1016 /* Description : Pack even double word elements of vector pairs
1017 Arguments : Inputs - in0, in1, in2, in3
1018 Outputs - out0, out1
1019 Return Type - as per RTYPE
1020 Details : Even double word elements of in0 are copied to the left half of
1021 out0 & even double word elements of in1 are copied to the right
1022 half of out0.
1023 Even double word elements of in2 are copied to the left half of
1024 out1 & even double word elements of in3 are copied to the right
1025 half of out1.
1026 */
1027 #define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \
1028 out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \
1029 out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \
1030 }
1031 #define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
1032 #define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)
1033
1034 #define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1035 out0, out1, out2, out3) { \
1036 PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
1037 PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
1038 }
1039 #define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
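One plausible use, mirroring the averaging store macros being replaced elsewhere in this file: gather four 8-byte destination rows into two 16-byte vectors ahead of an unsigned average. The `dst` pointer and `stride` are illustrative.

    v16u8 dst0 = LD_UB(dst);
    v16u8 dst1 = LD_UB(dst + stride);
    v16u8 dst2 = LD_UB(dst + 2 * stride);
    v16u8 dst3 = LD_UB(dst + 3 * stride);
    v16u8 row01, row23;
    PCKEV_D2_UB(dst1, dst0, dst3, dst2, row01, row23);  /* dst0 in right half of row01 */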
1040
1041 /* Description : Each byte element is logically xor'ed with immediate 128
1042 Arguments : Inputs - in0, in1
1043 Outputs - in0, in1 (in-place)
1044 Return Type - as per RTYPE
1045 Details : Each unsigned byte element from input vector 'in0' is
1046 logically xor'ed with 128 and result is in-place stored in
1047 'in0' vector
1048 Each unsigned byte element from input vector 'in1' is
1049 logically xor'ed with 128 and result is in-place stored in
1050 'in1' vector
1051 Similar for other pairs
1052 */
1053 #define XORI_B2_128(RTYPE, in0, in1) { \
1054 in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \
1055 in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \
1056 }
1057 #define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
1058 #define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
1059
1060 #define XORI_B3_128(RTYPE, in0, in1, in2) { \
1061 XORI_B2_128(RTYPE, in0, in1); \
1062 in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \
1063 }
1064 #define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)
1065
1066 #define XORI_B4_128(RTYPE, in0, in1, in2, in3) { \
1067 XORI_B2_128(RTYPE, in0, in1); \
1068 XORI_B2_128(RTYPE, in2, in3); \
1069 }
1070 #define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
1071 #define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
1072
1073 #define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) { \
1074 XORI_B4_128(RTYPE, in0, in1, in2, in3); \
1075 XORI_B3_128(RTYPE, in4, in5, in6); \
1076 }
1077 #define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)
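Why xor with 128: flipping the top bit maps unsigned pixels 0..255 onto the signed range -128..127, so signed multiply-accumulate intrinsics can operate on them; applying the same xor again restores the unsigned values. A minimal sketch with illustrative names:

    v16i8 src0, src1;            /* pixel vectors reinterpreted as signed bytes */
    XORI_B2_128_SB(src0, src1);  /* bias 0..255 down to -128..127 */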
1078
1079 /* Description : Addition of signed halfword elements and signed saturation
1080 Arguments : Inputs - in0, in1, in2, in3
1081 Outputs - out0, out1
1082 Return Type - as per RTYPE
1083 Details : Signed halfword elements from 'in0' are added to signed
1084 halfword elements of 'in1'. The result is then saturated
1085 between -32768 and +32767 (as per halfword data type)
1086 Similar for other pairs
1087 */
1088 #define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) { \
1089 out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \
1090 out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3); \
1091 }
1092 #define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)
1093
1094 #define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1095 out0, out1, out2, out3) { \
1096 ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \
1097 ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \
1098 }
1099 #define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
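In contrast to the plain ADD2/ADD4 macros further below, these clamp on overflow instead of wrapping. A minimal sketch, operands illustrative:

    v8i16 a0, a1, b0, b1, sum0, sum1;
    ADDS_SH2_SH(a0, b0, a1, b1, sum0, sum1);  /* lanes clamp at -32768/+32767 */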
1100
1101 /* Description : Shift right arithmetic rounded (immediate)
1102 Arguments : Inputs - in0, in1, in2, in3, shift
1103 Outputs - in0, in1, in2, in3 (in place)
1104 Return Type - as per RTYPE
1105 Details : Each element of vector 'in0' is arithmetically shifted right
1106 by the value in 'shift'.
1107 The last discarded bit is added to shifted value for rounding
1108 and the result is in place written to 'in0'
1109 Similar for other pairs
1110 */
1111 #define SRARI_H2(RTYPE, in0, in1, shift) { \
1112 in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \
1113 in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \
1114 }
1115 #define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
1116 #define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
1117
1118 #define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) { \
1119 SRARI_H2(RTYPE, in0, in1, shift); \
1120 SRARI_H2(RTYPE, in2, in3, shift); \
1121 }
1122 #define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
1123 #define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
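The rounding here is add-half-then-shift; for example, with shift = 6 each lane computes (x + 32) >> 6. A minimal sketch, names illustrative:

    v8i16 out0, out1;            /* e.g. inverse transform intermediates */
    SRARI_H2_SH(out0, out1, 6);  /* each lane becomes (x + 32) >> 6 */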
1124
1125 /* Description : Shift right arithmetic rounded (immediate)
1126 Arguments : Inputs - in0, in1, shift
1127 Outputs - in0, in1 (in place)
1128 Return Type - as per RTYPE
1129 Details : Each element of vector 'in0' is arithmetically shifted right
1130 by the value in 'shift'.
1131 The last discarded bit is added to shifted value for rounding
1132 and the result is in place written to 'in0'
1133 Similar for other pairs
1134 */
1135 #define SRARI_W2(RTYPE, in0, in1, shift) { \
1136 in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \
1137 in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \
1138 }
1139 #define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)
1140
1141 #define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) { \
1142 SRARI_W2(RTYPE, in0, in1, shift); \
1143 SRARI_W2(RTYPE, in2, in3, shift); \
1144 }
1145 #define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
1146
1147 /* Description : Addition of 2 pairs of vectors
1148 Arguments : Inputs - in0, in1, in2, in3
1149 Outputs - out0, out1
1150 Details : Each element from 2 pairs of vectors is added and 2 result
1151 vectors are produced
1152 */
1153 #define ADD2(in0, in1, in2, in3, out0, out1) { \
1154 out0 = in0 + in1; \
1155 out1 = in2 + in3; \
1156 }
1157 #define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, \
1158 out0, out1, out2, out3) { \
1159 ADD2(in0, in1, in2, in3, out0, out1); \
1160 ADD2(in4, in5, in6, in7, out2, out3); \
1161 }
1162
1163 /* Description : Subtraction of 2 pairs of vectors
1164 Arguments : Inputs - in0, in1, in2, in3
1165 Outputs - out0, out1
1166 Details : Each element from 2 pairs of vectors is subtracted and 2
1167 result vectors are produced
1168 */
1169 #define SUB2(in0, in1, in2, in3, out0, out1) { \
1170 out0 = in0 - in1; \
1171 out1 = in2 - in3; \
1172 }
1173 #define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, \
1174 out0, out1, out2, out3) { \
1175 out0 = in0 - in1; \
1176 out1 = in2 - in3; \
1177 out2 = in4 - in5; \
1178 out3 = in6 - in7; \
1179 }
1180
1181 /* Description : Zero extend unsigned byte elements to halfword elements
1182 Arguments : Inputs - in (1 input unsigned byte vector)
1183 Outputs - out0, out1 (2 halfword vectors)
1184 Return Type - signed halfword
1185 Details : Zero extended right half of vector is returned in 'out0'
1186 Zero extended left half of vector is returned in 'out1'
1187 */
1188 #define UNPCK_UB_SH(in, out0, out1) { \
1189 v16i8 zero_m = { 0 }; \
1190 \
1191 ILVRL_B2_SH(zero_m, in, out0, out1); \
1192 }
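A sketch of the typical widening load (the `psrc` pointer is illustrative): sixteen pixels become two halfword vectors ready for 16-bit arithmetic.

    v16u8 src = LD_UB(psrc);         /* 16 unsigned bytes */
    v8i16 src_r, src_l;
    UNPCK_UB_SH(src, src_r, src_l);  /* bytes 0..7 -> src_r, bytes 8..15 -> src_l */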
1193
1194 /* Description : Butterfly of 4 input vectors
1195 Arguments : Inputs - in0, in1, in2, in3
1196 Outputs - out0, out1, out2, out3
1197 Details : Butterfly operation
1198 */
1199 #define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) { \
1200 out0 = in0 + in3; \
1201 out1 = in1 + in2; \
1202 \
1203 out2 = in1 - in2; \
1204 out3 = in0 - in3; \
1205 }
1206
1207 /* Description : Butterfly of 8 input vectors
1208 Arguments : Inputs - in0 ... in7
1209 Outputs - out0 .. out7
1210 Details : Butterfly operation
1211 */
1212 #define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, \
1213 out0, out1, out2, out3, out4, out5, out6, out7) { \
1214 out0 = in0 + in7; \
1215 out1 = in1 + in6; \
1216 out2 = in2 + in5; \
1217 out3 = in3 + in4; \
653 \ 1218 \
654 out_m = (v8u16)__msa_srari_h((v8i16)(input), (right_shift_val)); \ 1219 out4 = in3 - in4; \
655 out_m = __msa_sat_u_h(out_m, (sat_val)); \ 1220 out5 = in2 - in5; \
656 out_m; \ 1221 out6 = in1 - in6; \
1222 out7 = in0 - in7; \
1223 }
1224
1225 /* Description : Transposes 4x8 block with halfword elements in vectors
1226 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
1227 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
1228 Return Type - signed halfword
1229 Details : Output vectors out4 to out7 are set to zero
1230 */
1231 #define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, \
1232 out0, out1, out2, out3, out4, out5, out6, out7) { \
1233 v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
1234 v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \
1235 v8i16 zero_m = { 0 }; \
1236 \
1237 ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, \
1238 tmp0_n, tmp1_n, tmp2_n, tmp3_n); \
1239 ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m); \
1240 ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m); \
1241 \
1242 out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m); \
1243 out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m); \
1244 out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m); \
1245 out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m); \
1246 \
1247 out4 = zero_m; \
1248 out5 = zero_m; \
1249 out6 = zero_m; \
1250 out7 = zero_m; \
1251 }
1252
1253 /* Description : Transposes 8x4 block with halfword elements in vectors
1254 Arguments : Inputs - in0, in1, in2, in3
1255 Outputs - out0, out1, out2, out3
1256 Return Type - signed halfword
1257 Details :
1258 */
1259 #define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) { \
1260 v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
1261 \
1262 ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m); \
1263 ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m); \
1264 ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \
1265 ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \
1266 }
1267
1268 /* Description : Transposes 8x8 block with halfword elements in vectors
1269 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
1270 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
1271 Return Type - signed halfword
1272 Details :
1273 */
1274 #define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1275 out0, out1, out2, out3, out4, out5, out6, out7) { \
1276 v8i16 s0_m, s1_m; \
1277 v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
1278 v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
1279 \
1280 ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
1281 ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \
1282 ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
1283 ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \
1284 ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
1285 ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \
1286 ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
1287 ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \
1288 PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, \
1289 tmp3_m, tmp7_m, out0, out2, out4, out6); \
1290 out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \
1291 out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \
1292 out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \
1293 out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \
1294 }
1295 #define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
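Since every input is consumed into temporaries before any output is written, the outputs may alias the inputs, so a block can be transposed in place. An illustrative sketch:

    v8i16 r0, r1, r2, r3, r4, r5, r6, r7;  /* rows of an 8x8 halfword block */
    TRANSPOSE8x8_SH_SH(r0, r1, r2, r3, r4, r5, r6, r7,
                       r0, r1, r2, r3, r4, r5, r6, r7);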
1296
1297 /* Description : Pack even elements of input vectors & xor with 128
1298 Arguments : Inputs - in0, in1
1299 Outputs - out_m
1300 Return Type - unsigned byte
1301 Details : Signed byte even elements from 'in0' and 'in1' are packed
1302 together in one vector and the resulting vector is xor'ed with
1303 128 to shift the range from signed to unsigned byte
1304 */
1305 #define PCKEV_XORI128_UB(in0, in1) ({ \
1306 v16u8 out_m; \
1307 \
1308 out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \
1309 out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \
1310 out_m; \
657 }) 1311 })
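A hypothetical store of two signed filter results as one row of unsigned pixels (the `dst` pointer is illustrative); note that in0 supplies the right half of the packed vector.

    v8i16 res0, res1;  /* signed halfword filter results */
    v16u8 out = PCKEV_XORI128_UB(res0, res1);
    ST_UB(out, dst);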
658 1312
659 #define SRARI_SATURATE_SIGNED_H(input, right_shift_val, sat_val) ({ \ 1313 /* Description : Pack even byte elements and store byte vector in destination
660 v8i16 out_m; \ 1314 memory
661 \ 1315 Arguments : Inputs - in0, in1, pdst
662 out_m = __msa_srari_h((v8i16)(input), (right_shift_val)); \ 1316 */
663 out_m = __msa_sat_s_h(out_m, (sat_val)); \ 1317 #define PCKEV_ST_SB(in0, in1, pdst) { \
664 out_m; \ 1318 v16i8 tmp_m; \
1319 \
1320 tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \
1321 ST_SB(tmp_m, (pdst)); \
1322 }
1323
1324 /* Description : Horizontal 2 tap filter kernel code
1325 Arguments : Inputs - in0, in1, mask, coeff, shift
1326 */
1327 #define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) ({ \
1328 v16i8 tmp0_m; \
1329 v8u16 tmp1_m; \
1330 \
1331 tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \
1332 tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff); \
1333 tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \
1334 tmp1_m = __msa_sat_u_h(tmp1_m, shift); \
1335 \
1336 tmp1_m; \
665 }) 1337 })
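Reading the kernel: vshf_b gathers the pixel pairs selected by 'mask', dotp_u_h multiplies each pair by the two taps in 'coeff' and sums them into halfwords, and srari/sat round and clamp the result. Note that the same immediate serves as both the shift amount and the saturation width. A sketch with illustrative operands:

    v16u8 src0, src1, coeff;  /* source pixels and replicated 2-tap coefficients */
    v16i8 mask;               /* shuffle control selecting adjacent pixel pairs */
    v8u16 res = HORIZ_2TAP_FILT_UH(src0, src1, mask, coeff, 7);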
666
667 #define PCKEV_2B_XORI128_STORE_4_BYTES_4(in1, in2, \
668 pdst, stride) { \
669 uint32_t out0_m, out1_m, out2_m, out3_m; \
670 v16i8 tmp0_m; \
671 uint8_t *dst_m = (uint8_t *)(pdst); \
672 \
673 tmp0_m = __msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \
674 tmp0_m = (v16i8)__msa_xori_b((v16u8)tmp0_m, 128); \
675 \
676 out0_m = __msa_copy_u_w((v4i32)tmp0_m, 0); \
677 out1_m = __msa_copy_u_w((v4i32)tmp0_m, 1); \
678 out2_m = __msa_copy_u_w((v4i32)tmp0_m, 2); \
679 out3_m = __msa_copy_u_w((v4i32)tmp0_m, 3); \
680 \
681 STORE_WORD(dst_m, out0_m); \
682 dst_m += stride; \
683 STORE_WORD(dst_m, out1_m); \
684 dst_m += stride; \
685 STORE_WORD(dst_m, out2_m); \
686 dst_m += stride; \
687 STORE_WORD(dst_m, out3_m); \
688 }
689
690 #define PCKEV_B_4_XORI128_STORE_8_BYTES_4(in1, in2, \
691 in3, in4, \
692 pdst, stride) { \
693 uint64_t out0_m, out1_m, out2_m, out3_m; \
694 v16i8 tmp0_m, tmp1_m; \
695 uint8_t *dst_m = (uint8_t *)(pdst); \
696 \
697 tmp0_m = __msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \
698 tmp1_m = __msa_pckev_b((v16i8)(in4), (v16i8)(in3)); \
699 \
700 tmp0_m = (v16i8)__msa_xori_b((v16u8)tmp0_m, 128); \
701 tmp1_m = (v16i8)__msa_xori_b((v16u8)tmp1_m, 128); \
702 \
703 out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \
704 out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \
705 out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \
706 out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \
707 \
708 STORE_DWORD(dst_m, out0_m); \
709 dst_m += stride; \
710 STORE_DWORD(dst_m, out1_m); \
711 dst_m += stride; \
712 STORE_DWORD(dst_m, out2_m); \
713 dst_m += stride; \
714 STORE_DWORD(dst_m, out3_m); \
715 }
716
717 /* Only for signed vecs */
718 #define PCKEV_B_XORI128_STORE_VEC(in1, in2, pdest) { \
719 v16i8 tmp_m; \
720 \
721 tmp_m = __msa_pckev_b((v16i8)(in1), (v16i8)(in2)); \
722 tmp_m = (v16i8)__msa_xori_b((v16u8)tmp_m, 128); \
723 STORE_SB(tmp_m, (pdest)); \
724 }
725
726 /* Only for signed vecs */
727 #define PCKEV_B_4_XORI128_AVG_STORE_8_BYTES_4(in1, dst0, \
728 in2, dst1, \
729 in3, dst2, \
730 in4, dst3, \
731 pdst, stride) { \
732 uint64_t out0_m, out1_m, out2_m, out3_m; \
733 v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
734 uint8_t *dst_m = (uint8_t *)(pdst); \
735 \
736 tmp0_m = (v16u8)__msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \
737 tmp1_m = (v16u8)__msa_pckev_b((v16i8)(in4), (v16i8)(in3)); \
738 \
739 tmp2_m = (v16u8)__msa_ilvr_d((v2i64)(dst1), (v2i64)(dst0)); \
740 tmp3_m = (v16u8)__msa_ilvr_d((v2i64)(dst3), (v2i64)(dst2)); \
741 \
742 tmp0_m = __msa_xori_b(tmp0_m, 128); \
743 tmp1_m = __msa_xori_b(tmp1_m, 128); \
744 \
745 tmp0_m = __msa_aver_u_b(tmp0_m, tmp2_m); \
746 tmp1_m = __msa_aver_u_b(tmp1_m, tmp3_m); \
747 \
748 out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \
749 out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \
750 out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \
751 out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \
752 \
753 STORE_DWORD(dst_m, out0_m); \
754 dst_m += stride; \
755 STORE_DWORD(dst_m, out1_m); \
756 dst_m += stride; \
757 STORE_DWORD(dst_m, out2_m); \
758 dst_m += stride; \
759 STORE_DWORD(dst_m, out3_m); \
760 }
761
762 /* Only for signed vecs */
763 #define PCKEV_B_XORI128_AVG_STORE_VEC(in1, in2, dst, pdest) { \
764 v16u8 tmp_m; \
765 \
766 tmp_m = (v16u8)__msa_pckev_b((v16i8)(in1), (v16i8)(in2)); \
767 tmp_m = __msa_xori_b(tmp_m, 128); \
768 tmp_m = __msa_aver_u_b(tmp_m, (v16u8)(dst)); \
769 STORE_UB(tmp_m, (pdest)); \
770 }
771
772 #define PCKEV_B_STORE_8_BYTES_4(in1, in2, in3, in4, \
773 pdst, stride) { \
774 uint64_t out0_m, out1_m, out2_m, out3_m; \
775 v16i8 tmp0_m, tmp1_m; \
776 uint8_t *dst_m = (uint8_t *)(pdst); \
777 \
778 tmp0_m = __msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \
779 tmp1_m = __msa_pckev_b((v16i8)(in4), (v16i8)(in3)); \
780 \
781 out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \
782 out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \
783 out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \
784 out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \
785 \
786 STORE_DWORD(dst_m, out0_m); \
787 dst_m += stride; \
788 STORE_DWORD(dst_m, out1_m); \
789 dst_m += stride; \
790 STORE_DWORD(dst_m, out2_m); \
791 dst_m += stride; \
792 STORE_DWORD(dst_m, out3_m); \
793 }
794
795 /* Only for unsigned vecs */
796 #define PCKEV_B_STORE_VEC(in1, in2, pdest) { \
797 v16i8 tmp_m; \
798 \
799 tmp_m = __msa_pckev_b((v16i8)(in1), (v16i8)(in2)); \
800 STORE_SB(tmp_m, (pdest)); \
801 }
802
803 #define PCKEV_B_AVG_STORE_8_BYTES_4(in1, dst0, in2, dst1, \
804 in3, dst2, in4, dst3, \
805 pdst, stride) { \
806 uint64_t out0_m, out1_m, out2_m, out3_m; \
807 v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
808 uint8_t *dst_m = (uint8_t *)(pdst); \
809 \
810 tmp0_m = (v16u8)__msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \
811 tmp1_m = (v16u8)__msa_pckev_b((v16i8)(in4), (v16i8)(in3)); \
812 \
813 tmp2_m = (v16u8)__msa_pckev_d((v2i64)(dst1), (v2i64)(dst0)); \
814 tmp3_m = (v16u8)__msa_pckev_d((v2i64)(dst3), (v2i64)(dst2)); \
815 \
816 tmp0_m = __msa_aver_u_b(tmp0_m, tmp2_m); \
817 tmp1_m = __msa_aver_u_b(tmp1_m, tmp3_m); \
818 \
819 out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \
820 out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \
821 out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \
822 out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \
823 \
824 STORE_DWORD(dst_m, out0_m); \
825 dst_m += stride; \
826 STORE_DWORD(dst_m, out1_m); \
827 dst_m += stride; \
828 STORE_DWORD(dst_m, out2_m); \
829 dst_m += stride; \
830 STORE_DWORD(dst_m, out3_m); \
831 }
832
833 #define PCKEV_B_AVG_STORE_VEC(in1, in2, dst, pdest) { \
834 v16u8 tmp_m; \
835 \
836 tmp_m = (v16u8)__msa_pckev_b((v16i8)(in1), (v16i8)(in2)); \
837 tmp_m = __msa_aver_u_b(tmp_m, (v16u8)(dst)); \
838 STORE_UB(tmp_m, (pdest)); \
839 }
840
841 /* Generic for Vector types and GP operations */
842 #define BUTTERFLY_4(in0, in1, in2, in3, \
843 out0, out1, out2, out3) { \
844 out0 = (in0) + (in3); \
845 out1 = (in1) + (in2); \
846 \
847 out2 = (in1) - (in2); \
848 out3 = (in0) - (in3); \
849 }
850
851 /* Generic for Vector types and GP operations */
852 #define BUTTERFLY_8(in0, in1, in2, in3, \
853 in4, in5, in6, in7, \
854 out0, out1, out2, out3, \
855 out4, out5, out6, out7) { \
856 out0 = (in0) + (in7); \
857 out1 = (in1) + (in6); \
858 out2 = (in2) + (in5); \
859 out3 = (in3) + (in4); \
860 \
861 out4 = (in3) - (in4); \
862 out5 = (in2) - (in5); \
863 out6 = (in1) - (in6); \
864 out7 = (in0) - (in7); \
865 }
866 #endif /* HAVE_MSA */
867 #endif /* VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_ */ 1338 #endif /* VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_ */