OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #ifndef VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_ | 11 #ifndef VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_ |
12 #define VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_ | 12 #define VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_ |
13 | 13 |
14 #include <msa.h> | 14 #include <msa.h> |
15 | 15 |
16 #include "./vpx_config.h" | 16 #include "./vpx_config.h" |
17 #include "vpx/vpx_integer.h" | 17 #include "vpx/vpx_integer.h" |
18 | 18 |
19 #if HAVE_MSA | 19 #define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc)) |
20 /* load macros */ | 20 #define LD_UB(...) LD_B(v16u8, __VA_ARGS__) |
21 #define LOAD_UB(psrc) *((const v16u8 *)(psrc)) | 21 #define LD_SB(...) LD_B(v16i8, __VA_ARGS__) |
22 #define LOAD_SB(psrc) *((const v16i8 *)(psrc)) | 22 |
23 #define LOAD_UH(psrc) *((const v8u16 *)(psrc)) | 23 #define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc)) |
24 #define LOAD_SH(psrc) *((const v8i16 *)(psrc)) | 24 #define LD_UH(...) LD_H(v8u16, __VA_ARGS__) |
25 #define LOAD_UW(psrc) *((const v4u32 *)(psrc)) | 25 #define LD_SH(...) LD_H(v8i16, __VA_ARGS__) |
26 #define LOAD_SW(psrc) *((const v4i32 *)(psrc)) | 26 |
27 #define LOAD_UD(psrc) *((const v2u64 *)(psrc)) | 27 #define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc)) |
28 #define LOAD_SD(psrc) *((const v2i64 *)(psrc)) | 28 #define LD_SW(...) LD_W(v4i32, __VA_ARGS__) |
29 | 29 |
30 /* store macros */ | 30 #define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) |
31 #define STORE_UB(vec, pdest) *((v16u8 *)(pdest)) = (vec) | 31 #define ST_UB(...) ST_B(v16u8, __VA_ARGS__) |
32 #define STORE_SB(vec, pdest) *((v16i8 *)(pdest)) = (vec) | 32 #define ST_SB(...) ST_B(v16i8, __VA_ARGS__) |
33 #define STORE_UH(vec, pdest) *((v8u16 *)(pdest)) = (vec) | 33 |
34 #define STORE_SH(vec, pdest) *((v8i16 *)(pdest)) = (vec) | 34 #define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) |
35 #define STORE_UW(vec, pdest) *((v4u32 *)(pdest)) = (vec) | 35 #define ST_SH(...) ST_H(v8i16, __VA_ARGS__) |
36 #define STORE_SW(vec, pdest) *((v4i32 *)(pdest)) = (vec) | 36 |
37 #define STORE_UD(vec, pdest) *((v2u64 *)(pdest)) = (vec) | 37 #define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) |
38 #define STORE_SD(vec, pdest) *((v2i64 *)(pdest)) = (vec) | 38 #define ST_SW(...) ST_W(v4i32, __VA_ARGS__) |
39 | 39 |
40 #if (__mips_isa_rev >= 6) | 40 #if (__mips_isa_rev >= 6) |
41 #define LOAD_WORD(psrc) ({ \ | 41 #define LH(psrc) ({ \ |
42 const uint8_t *src_m = (const uint8_t *)(psrc); \ | 42 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ |
43 uint32_t val_m; \ | 43 uint16_t val_m; \ |
44 \ | 44 \ |
45 __asm__ __volatile__ ( \ | 45 __asm__ __volatile__ ( \ |
46 "lw %[val_m], %[src_m] \n\t" \ | 46 "lh %[val_m], %[psrc_m] \n\t" \ |
47 \ | 47 \ |
48 : [val_m] "=r" (val_m) \ | 48 : [val_m] "=r" (val_m) \ |
49 : [src_m] "m" (*src_m) \ | 49 : [psrc_m] "m" (*psrc_m) \ |
50 ); \ | 50 ); \ |
51 \ | 51 \ |
52 val_m; \ | 52 val_m; \ |
53 }) | 53 }) |
54 | 54 |
| 55 #define LW(psrc) ({ \ |
| 56 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ |
| 57 uint32_t val_m; \ |
| 58 \ |
| 59 __asm__ __volatile__ ( \ |
| 60 "lw %[val_m], %[psrc_m] \n\t" \ |
| 61 \ |
| 62 : [val_m] "=r" (val_m) \ |
| 63 : [psrc_m] "m" (*psrc_m) \ |
| 64 ); \ |
| 65 \ |
| 66 val_m; \ |
| 67 }) |
| 68 |
55 #if (__mips == 64) | 69 #if (__mips == 64) |
56 #define LOAD_DWORD(psrc) ({ \ | 70 #define LD(psrc) ({ \ |
57 const uint8_t *src_m = (const uint8_t *)(psrc); \ | 71 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ |
58 uint64_t val_m = 0; \ | 72 uint64_t val_m = 0; \ |
59 \ | 73 \ |
60 __asm__ __volatile__ ( \ | 74 __asm__ __volatile__ ( \ |
61 "ld %[val_m], %[src_m] \n\t" \ | 75 "ld %[val_m], %[psrc_m] \n\t" \ |
62 \ | 76 \ |
63 : [val_m] "=r" (val_m) \ | 77 : [val_m] "=r" (val_m) \ |
64 : [src_m] "m" (*src_m) \ | 78 : [psrc_m] "m" (*psrc_m) \ |
65 ); \ | 79 ); \ |
66 \ | 80 \ |
67 val_m; \ | 81 val_m; \ |
68 }) | 82 }) |
69 #else // !(__mips == 64) | 83 #else // !(__mips == 64) |
70 #define LOAD_DWORD(psrc) ({ \ | 84 #define LD(psrc) ({ \ |
71 const uint8_t *src1_m = (const uint8_t *)(psrc); \ | 85 const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ |
72 const uint8_t *src2_m = ((const uint8_t *)(psrc)) + 4; \ | 86 uint32_t val0_m, val1_m; \ |
73 uint32_t val0_m, val1_m; \ | 87 uint64_t val_m = 0; \ |
74 uint64_t genval_m = 0; \ | 88 \ |
75 \ | 89 val0_m = LW(psrc_m1); \ |
76 __asm__ __volatile__ ( \ | 90 val1_m = LW(psrc_m1 + 4); \ |
77 "lw %[val0_m], %[src1_m] \n\t" \ | 91 \ |
78 \ | 92 val_m = (uint64_t)(val1_m); \ |
79 : [val0_m] "=r" (val0_m) \ | 93 val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ |
80 : [src1_m] "m" (*src1_m) \ | 94 val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ |
81 ); \ | 95 \ |
82 \ | 96 val_m; \ |
83 __asm__ __volatile__ ( \ | |
84 "lw %[val1_m], %[src2_m] \n\t" \ | |
85 \ | |
86 : [val1_m] "=r" (val1_m) \ | |
87 : [src2_m] "m" (*src2_m) \ | |
88 ); \ | |
89 \ | |
90 genval_m = (uint64_t)(val1_m); \ | |
91 genval_m = (uint64_t)((genval_m << 32) & 0xFFFFFFFF00000000); \ | |
92 genval_m = (uint64_t)(genval_m | (uint64_t)val0_m); \ | |
93 \ | |
94 genval_m; \ | |
95 }) | 97 }) |
96 #endif // (__mips == 64) | 98 #endif // (__mips == 64) |
97 #define STORE_WORD_WITH_OFFSET_1(pdst, val) { \ | 99 |
98 uint8_t *dst_ptr_m = ((uint8_t *)(pdst)) + 1; \ | 100 #define SH(val, pdst) { \ |
99 const uint32_t val_m = (val); \ | 101 uint8_t *pdst_m = (uint8_t *)(pdst); \ |
100 \ | 102 const uint16_t val_m = (val); \ |
101 __asm__ __volatile__ ( \ | 103 \ |
102 "sw %[val_m], %[dst_ptr_m] \n\t" \ | 104 __asm__ __volatile__ ( \ |
103 \ | 105 "sh %[val_m], %[pdst_m] \n\t" \ |
104 : [dst_ptr_m] "=m" (*dst_ptr_m) \ | 106 \ |
105 : [val_m] "r" (val_m) \ | 107 : [pdst_m] "=m" (*pdst_m) \ |
106 ); \ | 108 : [val_m] "r" (val_m) \ |
107 } | 109 ); \ |
108 | 110 } |
109 #define STORE_WORD(pdst, val) { \ | 111 |
110 uint8_t *dst_ptr_m = (uint8_t *)(pdst); \ | 112 #define SW(val, pdst) { \ |
111 const uint32_t val_m = (val); \ | 113 uint8_t *pdst_m = (uint8_t *)(pdst); \ |
112 \ | 114 const uint32_t val_m = (val); \ |
113 __asm__ __volatile__ ( \ | 115 \ |
114 "sw %[val_m], %[dst_ptr_m] \n\t" \ | 116 __asm__ __volatile__ ( \ |
115 \ | 117 "sw %[val_m], %[pdst_m] \n\t" \ |
116 : [dst_ptr_m] "=m" (*dst_ptr_m) \ | 118 \ |
117 : [val_m] "r" (val_m) \ | 119 : [pdst_m] "=m" (*pdst_m) \ |
118 ); \ | 120 : [val_m] "r" (val_m) \ |
119 } | 121 ); \ |
120 | 122 } |
121 #define STORE_DWORD(pdst, val) { \ | 123 |
122 uint8_t *dst_ptr_m = (uint8_t *)(pdst); \ | 124 #define SD(val, pdst) { \ |
123 const uint64_t val_m = (val); \ | 125 uint8_t *pdst_m = (uint8_t *)(pdst); \ |
124 \ | 126 const uint64_t val_m = (val); \ |
125 __asm__ __volatile__ ( \ | 127 \ |
126 "sd %[val_m], %[dst_ptr_m] \n\t" \ | 128 __asm__ __volatile__ ( \ |
127 \ | 129 "sd %[val_m], %[pdst_m] \n\t" \ |
128 : [dst_ptr_m] "=m" (*dst_ptr_m) \ | 130 \ |
129 : [val_m] "r" (val_m) \ | 131 : [pdst_m] "=m" (*pdst_m) \ |
130 ); \ | 132 : [val_m] "r" (val_m) \ |
| 133 ); \ |
131 } | 134 } |
132 #else // !(__mips_isa_rev >= 6) | 135 #else // !(__mips_isa_rev >= 6) |
133 #define LOAD_WORD(psrc) ({ \ | 136 #define LH(psrc) ({ \ |
134 const uint8_t *src_m = (const uint8_t *)(psrc); \ | 137 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ |
135 uint32_t val_m; \ | 138 uint16_t val_m; \ |
136 \ | 139 \ |
137 __asm__ __volatile__ ( \ | 140 __asm__ __volatile__ ( \ |
138 "ulw %[val_m], %[src_m] \n\t" \ | 141 "ulh %[val_m], %[psrc_m] \n\t" \ |
139 \ | 142 \ |
140 : [val_m] "=r" (val_m) \ | 143 : [val_m] "=r" (val_m) \ |
141 : [src_m] "m" (*src_m) \ | 144 : [psrc_m] "m" (*psrc_m) \ |
142 ); \ | 145 ); \ |
143 \ | 146 \ |
144 val_m; \ | 147 val_m; \ |
145 }) | 148 }) |
146 | 149 |
| 150 #define LW(psrc) ({ \ |
| 151 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ |
| 152 uint32_t val_m; \ |
| 153 \ |
| 154 __asm__ __volatile__ ( \ |
| 155 "ulw %[val_m], %[psrc_m] \n\t" \ |
| 156 \ |
| 157 : [val_m] "=r" (val_m) \ |
| 158 : [psrc_m] "m" (*psrc_m) \ |
| 159 ); \ |
| 160 \ |
| 161 val_m; \ |
| 162 }) |
| 163 |
147 #if (__mips == 64) | 164 #if (__mips == 64) |
148 #define LOAD_DWORD(psrc) ({ \ | 165 #define LD(psrc) ({ \ |
149 const uint8_t *src_m = (const uint8_t *)(psrc); \ | 166 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ |
150 uint64_t val_m = 0; \ | 167 uint64_t val_m = 0; \ |
151 \ | 168 \ |
152 __asm__ __volatile__ ( \ | 169 __asm__ __volatile__ ( \ |
153 "uld %[val_m], %[src_m] \n\t" \ | 170 "uld %[val_m], %[psrc_m] \n\t" \ |
154 \ | 171 \ |
155 : [val_m] "=r" (val_m) \ | 172 : [val_m] "=r" (val_m) \ |
156 : [src_m] "m" (*src_m) \ | 173 : [psrc_m] "m" (*psrc_m) \ |
157 ); \ | 174 ); \ |
158 \ | 175 \ |
159 val_m; \ | 176 val_m; \ |
160 }) | 177 }) |
161 #else // !(__mips == 64) | 178 #else // !(__mips == 64) |
162 #define LOAD_DWORD(psrc) ({ \ | 179 #define LD(psrc) ({ \ |
163 const uint8_t *src1_m = (const uint8_t *)(psrc); \ | 180 const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ |
164 const uint8_t *src2_m = ((const uint8_t *)(psrc)) + 4; \ | 181 uint32_t val0_m, val1_m; \ |
165 uint32_t val0_m, val1_m; \ | 182 uint64_t val_m = 0; \ |
166 uint64_t genval_m = 0; \ | 183 \ |
167 \ | 184 val0_m = LW(psrc_m1); \ |
168 __asm__ __volatile__ ( \ | 185 val1_m = LW(psrc_m1 + 4); \ |
169 "ulw %[val0_m], %[src1_m] \n\t" \ | 186 \ |
170 \ | 187 val_m = (uint64_t)(val1_m); \ |
171 : [val0_m] "=r" (val0_m) \ | 188 val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ |
172 : [src1_m] "m" (*src1_m) \ | 189 val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ |
173 ); \ | 190 \ |
174 \ | 191 val_m; \ |
175 __asm__ __volatile__ ( \ | |
176 "ulw %[val1_m], %[src2_m] \n\t" \ | |
177 \ | |
178 : [val1_m] "=r" (val1_m) \ | |
179 : [src2_m] "m" (*src2_m) \ | |
180 ); \ | |
181 \ | |
182 genval_m = (uint64_t)(val1_m); \ | |
183 genval_m = (uint64_t)((genval_m << 32) & 0xFFFFFFFF00000000); \ | |
184 genval_m = (uint64_t)(genval_m | (uint64_t)val0_m); \ | |
185 \ | |
186 genval_m; \ | |
187 }) | 192 }) |
188 #endif // (__mips == 64) | 193 #endif // (__mips == 64) |
189 | 194 |
190 #define STORE_WORD_WITH_OFFSET_1(pdst, val) { \ | 195 #define SH(val, pdst) { \ |
191 uint8_t *dst_ptr_m = ((uint8_t *)(pdst)) + 1; \ | 196 uint8_t *pdst_m = (uint8_t *)(pdst); \ |
192 const uint32_t val_m = (val); \ | 197 const uint16_t val_m = (val); \ |
193 \ | 198 \ |
194 __asm__ __volatile__ ( \ | 199 __asm__ __volatile__ ( \ |
195 "usw %[val_m], %[dst_ptr_m] \n\t" \ | 200 "ush %[val_m], %[pdst_m] \n\t" \ |
196 \ | 201 \ |
197 : [dst_ptr_m] "=m" (*dst_ptr_m) \ | 202 : [pdst_m] "=m" (*pdst_m) \ |
198 : [val_m] "r" (val_m) \ | 203 : [val_m] "r" (val_m) \ |
199 ); \ | 204 ); \ |
200 } | 205 } |
201 | 206 |
202 #define STORE_WORD(pdst, val) { \ | 207 #define SW(val, pdst) { \ |
203 uint8_t *dst_ptr_m = (uint8_t *)(pdst); \ | 208 uint8_t *pdst_m = (uint8_t *)(pdst); \ |
204 const uint32_t val_m = (val); \ | 209 const uint32_t val_m = (val); \ |
205 \ | 210 \ |
206 __asm__ __volatile__ ( \ | 211 __asm__ __volatile__ ( \ |
207 "usw %[val_m], %[dst_ptr_m] \n\t" \ | 212 "usw %[val_m], %[pdst_m] \n\t" \ |
208 \ | 213 \ |
209 : [dst_ptr_m] "=m" (*dst_ptr_m) \ | 214 : [pdst_m] "=m" (*pdst_m) \ |
210 : [val_m] "r" (val_m) \ | 215 : [val_m] "r" (val_m) \ |
211 ); \ | 216 ); \ |
212 } | 217 } |
213 | 218 |
214 #define STORE_DWORD(pdst, val) { \ | 219 #define SD(val, pdst) { \ |
215 uint8_t *dst1_m = (uint8_t *)(pdst); \ | 220 uint8_t *pdst_m1 = (uint8_t *)(pdst); \ |
216 uint8_t *dst2_m = ((uint8_t *)(pdst)) + 4; \ | |
217 uint32_t val0_m, val1_m; \ | 221 uint32_t val0_m, val1_m; \ |
218 \ | 222 \ |
219 val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF); \ | 223 val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF); \ |
220 val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ | 224 val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ |
221 \ | 225 \ |
222 __asm__ __volatile__ ( \ | 226 SW(val0_m, pdst_m1); \ |
223 "usw %[val0_m], %[dst1_m] \n\t" \ | 227 SW(val1_m, pdst_m1 + 4); \ |
224 "usw %[val1_m], %[dst2_m] \n\t" \ | 228 } |
| 229 #endif // (__mips_isa_rev >= 6) |
| 230 |
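
For illustration, a minimal caller of the renamed scalar macros (hypothetical function name; include path inferred from the header guard; an MSA-enabled MIPS toolchain is assumed):

    #include "vp9/common/mips/msa/vp9_macros_msa.h"

    /* Copy 8 bytes between possibly unaligned buffers: LW/SW resolve
     * to lw/sw on MIPSr6 (which supports misaligned accesses natively)
     * and to the ulw/usw assembler macros on pre-R6 cores. */
    static void copy8_unaligned(uint8_t *dst, const uint8_t *src) {
      SW(LW(src), dst);         /* bytes 0..3 */
      SW(LW(src + 4), dst + 4); /* bytes 4..7 */
    }
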
| 231 /* Description : Store 4 words with stride |
| 232 Arguments : Inputs - in0, in1, in2, in3, pdst, stride |
| 233 Details : Stores word from 'in0' to (pdst) |
| 234 Stores word from 'in1' to (pdst + stride) |
| 235 Stores word from 'in2' to (pdst + 2 * stride) |
| 236 Stores word from 'in3' to (pdst + 3 * stride) |
| 237 */ |
| 238 #define SW4(in0, in1, in2, in3, pdst, stride) { \ |
| 239 SW(in0, (pdst)); \ |
| 240 SW(in1, (pdst) + stride); \ |
| 241 SW(in2, (pdst) + 2 * stride); \ |
| 242 SW(in3, (pdst) + 3 * stride); \ |
| 243 } |
| 244 |
| 245 /* Description : Store 4 double words with stride |
| 246 Arguments : Inputs - in0, in1, in2, in3, pdst, stride |
| 247 Details : Stores double word from 'in0' to (pdst) |
| 248 Stores double word from 'in1' to (pdst + stride) |
| 249 Stores double word from 'in2' to (pdst + 2 * stride) |
| 250 Stores double word from 'in3' to (pdst + 3 * stride) |
| 251 */ |
| 252 #define SD4(in0, in1, in2, in3, pdst, stride) { \ |
| 253 SD(in0, (pdst)); \ |
| 254 SD(in1, (pdst) + stride); \ |
| 255 SD(in2, (pdst) + 2 * stride); \ |
| 256 SD(in3, (pdst) + 3 * stride); \ |
| 257 } |
| 258 |
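
A sketch of SW4 in use (hypothetical helper; 'stride' is the row pitch in bytes, as in the macro's callers):

    /* Fill a 4x4 block with one repeated 32-bit pattern. */
    static void fill_4x4(uint8_t *dst, int32_t stride, uint32_t pix) {
      SW4(pix, pix, pix, pix, dst, stride);
    }
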
| 259 /* Description : Load vectors with 16 byte elements with stride |
| 260 Arguments : Inputs - psrc (source pointer to load from) |
| 261 - stride |
| 262 Outputs - out0, out1 |
| 263 Return Type - as per RTYPE |
| 264 Details : Loads 16 byte elements in 'out0' from (psrc) |
| 265 Loads 16 byte elements in 'out1' from (psrc + stride) |
| 266 */ |
| 267 #define LD_B2(RTYPE, psrc, stride, out0, out1) { \ |
| 268 out0 = LD_B(RTYPE, (psrc)); \ |
| 269 out1 = LD_B(RTYPE, (psrc) + stride); \ |
| 270 } |
| 271 #define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__) |
| 272 #define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__) |
| 273 |
| 274 #define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) { \ |
| 275 LD_B2(RTYPE, (psrc), stride, out0, out1); \ |
| 276 LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ |
| 277 } |
| 278 #define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__) |
| 279 #define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__) |
| 280 |
| 281 #define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) { \ |
| 282 LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ |
| 283 out4 = LD_B(RTYPE, (psrc) + 4 * stride); \ |
| 284 } |
| 285 #define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__) |
| 286 #define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__) |
| 287 |
| 288 #define LD_B7(RTYPE, psrc, stride, \ |
| 289 out0, out1, out2, out3, out4, out5, out6) { \ |
| 290 LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \ |
| 291 LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \ |
| 292 } |
| 293 #define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__) |
| 294 |
| 295 #define LD_B8(RTYPE, psrc, stride, \ |
| 296 out0, out1, out2, out3, out4, out5, out6, out7) { \ |
| 297 LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ |
| 298 LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ |
| 299 } |
| 300 #define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__) |
| 301 #define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__) |
| 302 |
| 303 /* Description : Load vectors with 8 halfword elements with stride |
| 304 Arguments : Inputs - psrc (source pointer to load from) |
| 305 - stride |
| 306 Outputs - out0, out1 |
| 307 Details : Loads 8 halfword elements in 'out0' from (psrc) |
| 308 Loads 8 halfword elements in 'out1' from (psrc + stride) |
| 309 */ |
| 310 #define LD_H2(RTYPE, psrc, stride, out0, out1) { \ |
| 311 out0 = LD_H(RTYPE, (psrc)); \ |
| 312 out1 = LD_H(RTYPE, (psrc) + (stride)); \ |
| 313 } |
| 314 #define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__) |
| 315 |
| 316 #define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) { \ |
| 317 LD_H2(RTYPE, (psrc), stride, out0, out1); \ |
| 318 LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ |
| 319 } |
| 320 #define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__) |
| 321 |
| 322 #define LD_H8(RTYPE, psrc, stride, \ |
| 323 out0, out1, out2, out3, out4, out5, out6, out7) { \ |
| 324 LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ |
| 325 LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ |
| 326 } |
| 327 #define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__) |
| 328 |
| 329 #define LD_H16(RTYPE, psrc, stride, \ |
| 330 out0, out1, out2, out3, out4, out5, out6, out7, \ |
| 331 out8, out9, out10, out11, out12, out13, out14, out15) { \ |
| 332 LD_H8(RTYPE, (psrc), stride, \ |
| 333 out0, out1, out2, out3, out4, out5, out6, out7); \ |
| 334 LD_H8(RTYPE, (psrc) + 8 * stride, stride, \ |
| 335 out8, out9, out10, out11, out12, out13, out14, out15); \ |
| 336 } |
| 337 #define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__) |
| 338 |
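
The strided loads pair naturally with the strided stores defined below; a sketch (hypothetical function; a 16-byte-wide block is assumed):

    /* Copy four 16-byte rows: LD_UB4 expands to LD_UB at psrc,
     * psrc + stride, ..., and ST_UB4 mirrors it for the stores. */
    static void copy_16x4(const uint8_t *src, int32_t src_stride,
                          uint8_t *dst, int32_t dst_stride) {
      v16u8 row0, row1, row2, row3;

      LD_UB4(src, src_stride, row0, row1, row2, row3);
      ST_UB4(row0, row1, row2, row3, dst, dst_stride);
    }
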
| 339 /* Description : Store vectors of 16 byte elements with stride |
| 340 Arguments : Inputs - in0, in1, stride |
| 341 Outputs - pdst (destination pointer to store to) |
| 342 Details : Stores 16 byte elements from 'in0' to (pdst) |
| 343 Stores 16 byte elements from 'in1' to (pdst + stride) |
| 344 */ |
| 345 #define ST_B2(RTYPE, in0, in1, pdst, stride) { \ |
| 346 ST_B(RTYPE, in0, (pdst)); \ |
| 347 ST_B(RTYPE, in1, (pdst) + stride); \ |
| 348 } |
| 349 #define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) |
| 350 |
| 351 #define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) { \ |
| 352 ST_B2(RTYPE, in0, in1, (pdst), stride); \ |
| 353 ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ |
| 354 } |
| 355 #define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) |
| 356 |
| 357 #define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 358 pdst, stride) { \ |
| 359 ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \ |
| 360 ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ |
| 361 } |
| 362 #define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__) |
| 363 |
| 364 /* Description : Store vectors of 8 halfword elements with stride |
| 365 Arguments : Inputs - in0, in1, stride |
| 366 Outputs - pdst (destination pointer to store to) |
| 367 Details : Stores 8 halfword elements from 'in0' to (pdst) |
| 368 Stores 8 halfword elements from 'in1' to (pdst + stride) |
| 369 */ |
| 370 #define ST_H2(RTYPE, in0, in1, pdst, stride) { \ |
| 371 ST_H(RTYPE, in0, (pdst)); \ |
| 372 ST_H(RTYPE, in1, (pdst) + stride); \ |
| 373 } |
| 374 #define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__) |
| 375 |
| 376 #define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) { \ |
| 377 ST_H2(RTYPE, in0, in1, (pdst), stride); \ |
| 378 ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ |
| 379 } |
| 380 #define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__) |
| 381 |
| 382 #define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) { \ |
| 383 ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \ |
| 384 ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ |
| 385 } |
| 386 #define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__) |
| 387 |
| 388 /* Description : Store a 4x4 byte block to destination memory from input vectors |
| 389 Arguments : Inputs - in0, in1, pdst, stride |
| 390 Return Type - unsigned byte |
| 391 Details : The 'idx0' word element from input vector 'in0' is copied and |
| 392 stored to the first row (pdst) |
| 393 The 'idx1' word element from input vector 'in0' is copied and |
| 394 stored to the second row (pdst + stride) |
| 395 The 'idx2' word element from input vector 'in1' is copied and |
| 396 stored to the third row (pdst + 2 * stride) |
| 397 The 'idx3' word element from input vector 'in1' is copied and |
| 398 stored to the fourth row (pdst + 3 * stride) |
| 399 */ |
| 400 #define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) { \ |
| 401 uint32_t out0_m, out1_m, out2_m, out3_m; \ |
| 402 uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \ |
| 403 \ |
| 404 out0_m = __msa_copy_u_w((v4i32)in0, idx0); \ |
| 405 out1_m = __msa_copy_u_w((v4i32)in0, idx1); \ |
| 406 out2_m = __msa_copy_u_w((v4i32)in1, idx2); \ |
| 407 out3_m = __msa_copy_u_w((v4i32)in1, idx3); \ |
| 408 \ |
| 409 SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \ |
| 410 } |
| 411 #define ST4x8_UB(in0, in1, pdst, stride) { \ |
| 412 uint8_t *pblk_4x8_m = (uint8_t *)(pdst); \ |
| 413 \ |
| 414 ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8_m, stride); \ |
| 415 ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8_m + 4 * stride, stride); \ |
| 416 } |
| 417 |
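
For example (hypothetical wrapper), the idx arguments choose which 32-bit lane of each vector lands on which row, so one packed vector can supply an entire 4x4 block:

    /* Scatter the four words of 'res' to four rows of a
     * 4-byte-wide block, e.g. after packing results to bytes. */
    static void store_4x4(v16u8 res, uint8_t *dst, int32_t stride) {
      ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
    }
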
| 418 /* Description : Store an 8x1 byte block to destination memory from an input vector |
| 419 Arguments : Inputs - in, pdst |
| 420 Details : Index 0 double word element from input vector 'in' is copied |
| 421 and stored to destination memory at (pdst) |
| 422 */ |
| 423 #define ST8x1_UB(in, pdst) { \ |
| 424 uint64_t out0_m; \ |
| 425 \ |
| 426 out0_m = __msa_copy_u_d((v2i64)in, 0); \ |
| 427 SD(out0_m, pdst); \ |
| 428 } |
| 429 |
| 430 /* Description : Store an 8x4 byte block to destination memory from input |
| 431 vectors |
| 432 Arguments : Inputs - in0, in1, pdst, stride |
| 433 Details : Index 0 double word element from input vector 'in0' is copied |
| 434 and stored to destination memory at (pblk_8x4_m) |
| 435 Index 1 double word element from input vector 'in0' is copied |
| 436 and stored to destination memory at (pblk_8x4_m + stride) |
| 437 Index 0 double word element from input vector 'in1' is copied |
| 438 and stored to destination memory at (pblk_8x4_m + 2 * stride) |
| 439 Index 1 double word element from input vector 'in1' is copied |
| 440 and stored to destination memory at (pblk_8x4_m + 3 * stride) |
| 441 */ |
| 442 #define ST8x4_UB(in0, in1, pdst, stride) { \ |
| 443 uint64_t out0_m, out1_m, out2_m, out3_m; \ |
| 444 uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \ |
225 \ | 445 \ |
226 : [dst1_m] "=m" (*dst1_m), [dst2_m] "=m" (*dst2_m) \ | 446 out0_m = __msa_copy_u_d((v2i64)in0, 0); \ |
227 : [val0_m] "r" (val0_m), [val1_m] "r" (val1_m) \ | 447 out1_m = __msa_copy_u_d((v2i64)in0, 1); \ |
228 ); \ | 448 out2_m = __msa_copy_u_d((v2i64)in1, 0); \ |
229 } | 449 out3_m = __msa_copy_u_d((v2i64)in1, 1); \ |
230 #endif // (__mips_isa_rev >= 6) | 450 \ |
231 | 451 SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \ |
232 #define LOAD_2VECS_UB(psrc, stride, \ | 452 } |
233 val0, val1) { \ | 453 |
234 val0 = LOAD_UB(psrc + 0 * stride); \ | 454 /* Description : Average with rounding (in0 + in1 + 1) / 2. |
235 val1 = LOAD_UB(psrc + 1 * stride); \ | 455 Arguments : Inputs - in0, in1, in2, in3 |
236 } | 456 Outputs - out0, out1 |
237 | 457 Return Type - as per RTYPE |
238 #define LOAD_4VECS_UB(psrc, stride, \ | 458 Details : Each byte element from 'in0' is added to the corresponding |
239 val0, val1, val2, val3) { \ | 459 byte element from 'in1'. The addition plus 1 |
240 val0 = LOAD_UB(psrc + 0 * stride); \ | 460 (for rounding) is done unsigned with full precision, |
241 val1 = LOAD_UB(psrc + 1 * stride); \ | 461 i.e. the result has one extra bit. Unsigned division by 2 |
242 val2 = LOAD_UB(psrc + 2 * stride); \ | 462 (or logical shift right by one bit) is performed before writing |
243 val3 = LOAD_UB(psrc + 3 * stride); \ | 463 the result to vector 'out0' |
244 } | 464 Similar for the pair of 'in2' and 'in3' |
245 | 465 */ |
246 #define LOAD_4VECS_SB(psrc, stride, \ | 466 #define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) { \ |
247 val0, val1, val2, val3) { \ | 467 out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \ |
248 val0 = LOAD_SB(psrc + 0 * stride); \ | 468 out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3); \ |
249 val1 = LOAD_SB(psrc + 1 * stride); \ | 469 } |
250 val2 = LOAD_SB(psrc + 2 * stride); \ | 470 #define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__) |
251 val3 = LOAD_SB(psrc + 3 * stride); \ | 471 |
252 } | 472 #define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ |
253 | 473 out0, out1, out2, out3) { \ |
254 #define LOAD_5VECS_UB(psrc, stride, \ | 474 AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1); \ |
255 out0, out1, out2, out3, out4) { \ | 475 AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3); \ |
256 LOAD_4VECS_UB((psrc), (stride), \ | 476 } |
257 (out0), (out1), (out2), (out3)); \ | 477 #define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__) |
258 out4 = LOAD_UB(psrc + 4 * stride); \ | 478 |
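
For example (hypothetical helper), AVER_UB4_UB averages four row pairs at once, the kind of operation the averaging convolve paths need:

    /* o[i] = (a[i] + b[i] + 1) >> 1 for four pairs of rows. */
    static void avg_rows4(v16u8 a0, v16u8 a1, v16u8 a2, v16u8 a3,
                          v16u8 b0, v16u8 b1, v16u8 b2, v16u8 b3,
                          v16u8 *o0, v16u8 *o1, v16u8 *o2, v16u8 *o3) {
      AVER_UB4_UB(a0, b0, a1, b1, a2, b2, a3, b3, *o0, *o1, *o2, *o3);
    }
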
259 } | 479 /* Description : Immediate number of columns to slide with zero |
260 | 480 Arguments : Inputs - in0, in1, slide_val |
261 #define LOAD_5VECS_SB(psrc, stride, \ | 481 Outputs - out0, out1 |
262 out0, out1, out2, out3, out4) { \ | 482 Return Type - as per RTYPE |
263 LOAD_4VECS_SB((psrc), (stride), \ | 483 Details : Byte elements from the 'zero_m' vector are slid into 'in0' by |
264 (out0), (out1), (out2), (out3)); \ | 484 the number of elements specified by 'slide_val' |
265 out4 = LOAD_SB(psrc + 4 * stride); \ | 485 */ |
266 } | 486 #define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) { \ |
267 | 487 v16i8 zero_m = { 0 }; \ |
268 #define LOAD_7VECS_SB(psrc, stride, \ | 488 out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \ |
269 val0, val1, val2, val3, \ | 489 out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \ |
270 val4, val5, val6) { \ | 490 } |
271 val0 = LOAD_SB((psrc) + 0 * (stride)); \ | 491 #define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__) |
272 val1 = LOAD_SB((psrc) + 1 * (stride)); \ | 492 |
273 val2 = LOAD_SB((psrc) + 2 * (stride)); \ | 493 /* Description : Immediate number of columns to slide |
274 val3 = LOAD_SB((psrc) + 3 * (stride)); \ | 494 Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val |
275 val4 = LOAD_SB((psrc) + 4 * (stride)); \ | 495 Outputs - out0, out1 |
276 val5 = LOAD_SB((psrc) + 5 * (stride)); \ | 496 Return Type - as per RTYPE |
277 val6 = LOAD_SB((psrc) + 6 * (stride)); \ | 497 Details : Byte elements from the 'in0_0' vector are slid into 'in1_0' by |
278 } | 498 the number of elements specified by 'slide_val' |
279 | 499 */ |
280 #define LOAD_8VECS_UB(psrc, stride, \ | 500 #define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) { \ |
281 out0, out1, out2, out3, \ | 501 out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val); \ |
282 out4, out5, out6, out7) { \ | 502 out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \ |
283 LOAD_4VECS_UB((psrc), (stride), \ | 503 } |
284 (out0), (out1), (out2), (out3)); \ | 504 #define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__) |
285 LOAD_4VECS_UB((psrc + 4 * stride), (stride), \ | 505 |
286 (out4), (out5), (out6), (out7)); \ | 506 #define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, \ |
287 } | 507 out0, out1, out2, slide_val) { \ |
288 | 508 SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val); \ |
289 #define LOAD_8VECS_SB(psrc, stride, \ | 509 out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val); \ |
290 out0, out1, out2, out3, \ | 510 } |
291 out4, out5, out6, out7) { \ | 511 #define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__) |
292 LOAD_4VECS_SB((psrc), (stride), \ | 512 #define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__) |
293 (out0), (out1), (out2), (out3)); \ | 513 |
294 LOAD_4VECS_SB((psrc + 4 * stride), (stride), \ | 514 /* Description : Shuffle byte vector elements as per mask vector |
295 (out4), (out5), (out6), (out7)); \ | 515 Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 |
296 } | 516 Outputs - out0, out1 |
297 | 517 Return Type - as per RTYPE |
298 #define LOAD_2VECS_SH(psrc, stride, \ | 518 Details : Byte elements from in0 & in1 are selectively copied to out0 |
299 val0, val1) { \ | 519 as per control vector mask0 |
300 val0 = LOAD_SH((psrc) + 0 * (stride)); \ | 520 Byte elements from in2 & in3 are selectively copied to out1 |
301 val1 = LOAD_SH((psrc) + 1 * (stride)); \ | 521 as per control vector mask1 |
302 } | 522 */ |
303 | 523 #define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) { \ |
304 #define LOAD_4VECS_SH(psrc, stride, \ | 524 out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \ |
305 val0, val1, val2, val3) { \ | 525 out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \ |
306 LOAD_2VECS_SH((psrc), (stride), val0, val1); \ | 526 } |
307 LOAD_2VECS_SH((psrc + 2 * stride), (stride), val2, val3); \ | 527 #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) |
308 } | 528 #define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__) |
309 | 529 #define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__) |
310 #define LOAD_8VECS_SH(psrc, stride, \ | 530 |
311 val0, val1, val2, val3, \ | 531 #define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, \ |
312 val4, val5, val6, val7) { \ | 532 out0, out1, out2, out3) { \ |
313 LOAD_4VECS_SH((psrc), (stride), \ | 533 VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \ |
314 val0, val1, val2, val3); \ | 534 VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \ |
315 LOAD_4VECS_SH((psrc + 4 * stride), (stride), \ | 535 } |
316 val4, val5, val6, val7); \ | 536 #define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__) |
317 } | 537 #define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__) |
318 | 538 |
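
A sketch of VSHF_B2 forming the overlapping byte pairs a 2-tap horizontal filter consumes (hypothetical helper; with both data operands set to the same vector, each mask value k simply selects byte k of 'src'):

    /* Build pairs {s0,s1, s1,s2, ..., s7,s8} from one source row. */
    static v16u8 byte_pairs(v16u8 src) {
      const v16i8 pair_mask = { 0, 1, 1, 2, 2, 3, 3, 4,
                                4, 5, 5, 6, 6, 7, 7, 8 };
      v16u8 out0, out1;

      VSHF_B2_UB(src, src, src, src, pair_mask, pair_mask, out0, out1);
      (void)out1; /* identical to out0 in this degenerate call */
      return out0;
    }
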
319 #define LOAD_16VECS_SH(psrc, stride, \ | 539 /* Description : Dot product of byte vector elements |
320 val0, val1, val2, val3, \ | 540 Arguments : Inputs - mult0, mult1 |
321 val4, val5, val6, val7, \ | 541 cnst0, cnst1 |
322 val8, val9, val10, val11, \ | 542 Outputs - out0, out1 |
323 val12, val13, val14, val15) { \ | 543 Return Type - unsigned halfword |
324 LOAD_8VECS_SH((psrc), (stride), \ | 544 Details : Unsigned byte elements from mult0 are multiplied with |
325 val0, val1, val2, val3, \ | 545 unsigned byte elements from cnst0 producing a result |
326 val4, val5, val6, val7); \ | 546 twice the size of input i.e. unsigned halfword. |
327 LOAD_8VECS_SH((psrc + 8 * (stride)), (stride), \ | 547 The multiplication results of adjacent odd-even elements |
328 val8, val9, val10, val11, \ | 548 are then added together and stored to the output vector |
329 val12, val13, val14, val15); \ | 549 (8 unsigned halfword results per output vector) |
330 } | 550 */ |
331 | 551 #define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ |
332 #define STORE_4VECS_UB(dst_out, pitch, \ | 552 out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0); \ |
333 in0, in1, in2, in3) { \ | 553 out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1); \ |
334 STORE_UB((in0), (dst_out)); \ | 554 } |
335 STORE_UB((in1), ((dst_out) + (pitch))); \ | 555 #define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__) |
336 STORE_UB((in2), ((dst_out) + 2 * (pitch))); \ | 556 |
337 STORE_UB((in3), ((dst_out) + 3 * (pitch))); \ | 557 #define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, \ |
338 } | 558 cnst0, cnst1, cnst2, cnst3, \ |
339 | 559 out0, out1, out2, out3) { \ |
340 #define STORE_8VECS_UB(dst_out, pitch_in, \ | 560 DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ |
341 in0, in1, in2, in3, \ | 561 DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ |
342 in4, in5, in6, in7) { \ | 562 } |
343 STORE_4VECS_UB(dst_out, pitch_in, \ | 563 #define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__) |
344 in0, in1, in2, in3); \ | 564 |
345 STORE_4VECS_UB((dst_out + 4 * (pitch_in)), pitch_in, \ | 565 /* Description : Dot product of byte vector elements |
346 in4, in5, in6, in7); \ | 566 Arguments : Inputs - mult0, mult1 |
347 } | 567 cnst0, cnst1 |
348 | 568 Outputs - out0, out1 |
349 #define VEC_INSERT_4W_UB(src, src0, src1, src2, src3) { \ | 569 Return Type - signed halfword |
350 src = (v16u8)__msa_insert_w((v4i32)(src), 0, (src0)); \ | 570 Details : Signed byte elements from mult0 are multiplied with |
351 src = (v16u8)__msa_insert_w((v4i32)(src), 1, (src1)); \ | 571 signed byte elements from cnst0 producing a result |
352 src = (v16u8)__msa_insert_w((v4i32)(src), 2, (src2)); \ | 572 twice the size of input i.e. signed halfword. |
353 src = (v16u8)__msa_insert_w((v4i32)(src), 3, (src3)); \ | 573 The multiplication results of adjacent odd-even elements |
354 } | 574 are then added together and stored to the output vector |
355 | 575 (8 signed halfword results per output vector) |
356 #define VEC_INSERT_2DW_UB(src, src0, src1) { \ | 576 */ |
357 src = (v16u8)__msa_insert_d((v2i64)(src), 0, (src0)); \ | 577 #define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ |
358 src = (v16u8)__msa_insert_d((v2i64)(src), 1, (src1)); \ | 578 out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \ |
359 } | 579 out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \ |
360 | 580 } |
361 #define STORE_4VECS_SH(ptr, stride, \ | 581 #define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__) |
362 in0, in1, in2, in3) { \ | 582 |
363 STORE_SH(in0, ((ptr) + 0 * stride)); \ | 583 #define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, \ |
364 STORE_SH(in1, ((ptr) + 1 * stride)); \ | 584 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) { \ |
365 STORE_SH(in2, ((ptr) + 2 * stride)); \ | 585 DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ |
366 STORE_SH(in3, ((ptr) + 3 * stride)); \ | 586 DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ |
367 } | 587 } |
368 | 588 #define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__) |
369 #define STORE_8VECS_SH(ptr, stride, \ | 589 |
370 in0, in1, in2, in3, \ | 590 /* Description : Dot product of halfword vector elements |
371 in4, in5, in6, in7) { \ | 591 Arguments : Inputs - mult0, mult1 |
372 STORE_SH(in0, ((ptr) + 0 * stride)); \ | 592 cnst0, cnst1 |
373 STORE_SH(in1, ((ptr) + 1 * stride)); \ | 593 Outputs - out0, out1 |
374 STORE_SH(in2, ((ptr) + 2 * stride)); \ | 594 Return Type - signed word |
375 STORE_SH(in3, ((ptr) + 3 * stride)); \ | 595 Details : Signed halfword elements from mult0 are multiplied with |
376 STORE_SH(in4, ((ptr) + 4 * stride)); \ | 596 signed halfword elements from cnst0 producing a result |
377 STORE_SH(in5, ((ptr) + 5 * stride)); \ | 597 twice the size of input i.e. signed word. |
378 STORE_SH(in6, ((ptr) + 6 * stride)); \ | 598 The multiplication results of adjacent odd-even elements |
379 STORE_SH(in7, ((ptr) + 7 * stride)); \ | 599 are then added together and stored to the output vector |
380 } | 600 (4 signed word results per output vector) |
381 | 601 */ |
382 #define CLIP_UNSIGNED_CHAR_H(in) ({ \ | 602 #define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ |
| 603 out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \ |
| 604 out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \ |
| 605 } |
| 606 #define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__) |
| 607 |
| 608 #define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, \ |
| 609 cnst0, cnst1, cnst2, cnst3, \ |
| 610 out0, out1, out2, out3) { \ |
| 611 DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ |
| 612 DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ |
| 613 } |
| 614 #define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__) |
| 615 |
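
For instance (hypothetical helper), each output word of DOTP_SH2_SW is a pair-wise multiply-accumulate widened to 32 bits:

    /* out0[i] = m0[2i] * c0[2i] + m0[2i+1] * c0[2i+1], and
     * likewise for m1/c1 into out1. */
    static void dot_pairs(v8i16 m0, v8i16 m1, v8i16 c0, v8i16 c1,
                          v4i32 *sum0, v4i32 *sum1) {
      v4i32 out0, out1;

      DOTP_SH2_SW(m0, m1, c0, c1, out0, out1);
      *sum0 = out0;
      *sum1 = out1;
    }
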
| 616 /* Description : Dot product & addition of byte vector elements |
| 617 Arguments : Inputs - mult0, mult1 |
| 618 cnst0, cnst1 |
| 619 Outputs - out0, out1 |
| 620 Return Type - signed halfword |
| 621 Details : Signed byte elements from mult0 are multiplied with |
| 622 signed byte elements from cnst0 producing a result |
| 623 twice the size of input i.e. signed halfword. |
| 624 The multiplication results of adjacent odd-even elements |
| 625 are then accumulated into the output vector |
| 626 (8 signed halfword results per output vector) |
| 627 */ |
| 628 #define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ |
| 629 out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \ |
| 630 out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \ |
| 631 } |
| 632 #define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__) |
| 633 |
| 634 #define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, \ |
| 635 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) { \ |
| 636 DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ |
| 637 DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ |
| 638 } |
| 639 #define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__) |
| 640 |
| 641 /* Description : Minimum of unsigned halfword elements between the input |
| 642 vectors and 'min_vec' is written back to the input vectors |
| 643 Arguments : Inputs - in0, in1, min_vec |
| 644 Outputs - in0, in1 (in place) |
| 645 Return Type - unsigned halfword |
| 646 Details : The minimum of each unsigned halfword element from 'in0' and |
| 647 the corresponding element of 'min_vec' is written to 'in0' |
| 648 */ |
| 649 #define MIN_UH2(RTYPE, in0, in1, min_vec) { \ |
| 650 in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec); \ |
| 651 in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec); \ |
| 652 } |
| 653 #define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__) |
| 654 |
| 655 #define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) { \ |
| 656 MIN_UH2(RTYPE, in0, in1, min_vec); \ |
| 657 MIN_UH2(RTYPE, in2, in3, min_vec); \ |
| 658 } |
| 659 #define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__) |
| 660 |
| 661 /* Description : Clips all signed halfword elements of input vector |
| 662 between 0 & 255 |
| 663 Arguments : Inputs - in (input vector) |
| 664 Outputs - out_m (output vector with clipped elements) |
| 665 Return Type - signed halfword |
| 666 */ |
| 667 #define CLIP_SH_0_255(in) ({ \ |
383 v8i16 max_m = __msa_ldi_h(255); \ | 668 v8i16 max_m = __msa_ldi_h(255); \ |
384 v8i16 out_m; \ | 669 v8i16 out_m; \ |
385 \ | 670 \ |
386 out_m = __msa_maxi_s_h((v8i16)(in), 0); \ | 671 out_m = __msa_maxi_s_h((v8i16)in, 0); \ |
387 out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \ | 672 out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \ |
388 out_m; \ | 673 out_m; \ |
389 }) | 674 }) |
390 | 675 #define CLIP_SH2_0_255(in0, in1) { \ |
391 /* halfword 8x8 transpose macro */ | 676 in0 = CLIP_SH_0_255(in0); \ |
392 #define TRANSPOSE8x8_H_SH(in0, in1, in2, in3, \ | 677 in1 = CLIP_SH_0_255(in1); \ |
393 in4, in5, in6, in7, \ | 678 } |
394 out0, out1, out2, out3, \ | 679 #define CLIP_SH4_0_255(in0, in1, in2, in3) { \ |
395 out4, out5, out6, out7) { \ | 680 CLIP_SH2_0_255(in0, in1); \ |
396 v8i16 s0_m, s1_m; \ | 681 CLIP_SH2_0_255(in2, in3); \ |
397 v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ | 682 } |
398 v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ | 683 |
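
A sketch of the reconstruction pattern the clip macros serve (hypothetical helper; the saturating add is one reasonable choice for the 16-bit sum):

    /* Add residue to prediction in 16-bit precision, then clip
     * each lane to the 0..255 pixel range before packing. */
    static v8i16 recon_row(v8i16 pred, v8i16 residue) {
      v8i16 sum_m = __msa_adds_s_h(pred, residue);

      return CLIP_SH_0_255(sum_m);
    }
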
399 \ | 684 /* Description : Interleave even byte elements from vectors |
400 s0_m = __msa_ilvr_h((v8i16)(in6), (v8i16)(in4)); \ | 685 Arguments : Inputs - in0, in1, in2, in3 |
401 s1_m = __msa_ilvr_h((v8i16)(in7), (v8i16)(in5)); \ | 686 Outputs - out0, out1 |
402 tmp0_m = __msa_ilvr_h((v8i16)s1_m, (v8i16)s0_m); \ | 687 Return Type - as per RTYPE |
403 tmp1_m = __msa_ilvl_h((v8i16)s1_m, (v8i16)s0_m); \ | 688 Details : Even byte elements of 'in0' and even byte |
404 \ | 689 elements of 'in1' are interleaved and copied to 'out0' |
405 s0_m = __msa_ilvl_h((v8i16)(in6), (v8i16)(in4)); \ | 690 Even byte elements of 'in2' and even byte |
406 s1_m = __msa_ilvl_h((v8i16)(in7), (v8i16)(in5)); \ | 691 elements of 'in3' are interleaved and copied to 'out1' |
407 tmp2_m = __msa_ilvr_h((v8i16)s1_m, (v8i16)s0_m); \ | 692 */ |
408 tmp3_m = __msa_ilvl_h((v8i16)s1_m, (v8i16)s0_m); \ | 693 #define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ |
409 \ | 694 out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ |
410 s0_m = __msa_ilvr_h((v8i16)(in2), (v8i16)(in0)); \ | 695 out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ |
411 s1_m = __msa_ilvr_h((v8i16)(in3), (v8i16)(in1)); \ | 696 } |
412 tmp4_m = __msa_ilvr_h((v8i16)s1_m, (v8i16)s0_m); \ | 697 #define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__) |
413 tmp5_m = __msa_ilvl_h((v8i16)s1_m, (v8i16)s0_m); \ | 698 #define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__) |
414 \ | 699 |
415 s0_m = __msa_ilvl_h((v8i16)(in2), (v8i16)(in0)); \ | 700 /* Description : Interleave even halfword elements from vectors |
416 s1_m = __msa_ilvl_h((v8i16)(in3), (v8i16)(in1)); \ | 701 Arguments : Inputs - in0, in1, in2, in3 |
417 tmp6_m = __msa_ilvr_h((v8i16)s1_m, (v8i16)s0_m); \ | 702 Outputs - out0, out1 |
418 tmp7_m = __msa_ilvl_h((v8i16)s1_m, (v8i16)s0_m); \ | 703 Return Type - as per RTYPE |
419 \ | 704 Details : Even halfword elements of 'in0' and even halfword |
420 out0 = (v8i16)__msa_pckev_d((v2i64)tmp0_m, (v2i64)tmp4_m); \ | 705 elements of 'in1' are interleaved and copied to 'out0' |
421 out1 = (v8i16)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \ | 706 Even halfword elements of 'in2' and even halfword |
422 out2 = (v8i16)__msa_pckev_d((v2i64)tmp1_m, (v2i64)tmp5_m); \ | 707 elements of 'in3' are interleaved and copied to 'out1' |
423 out3 = (v8i16)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \ | 708 */ |
424 out4 = (v8i16)__msa_pckev_d((v2i64)tmp2_m, (v2i64)tmp6_m); \ | 709 #define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ |
425 out5 = (v8i16)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \ | 710 out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \ |
426 out6 = (v8i16)__msa_pckev_d((v2i64)tmp3_m, (v2i64)tmp7_m); \ | 711 out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \ |
427 out7 = (v8i16)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \ | 712 } |
428 } | 713 #define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__) |
429 | 714 #define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__) |
430 /* interleave macros */ | 715 #define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__) |
431 /* no in-place support */ | 716 |
432 #define ILV_B_LRLR_UB(in0, in1, in2, in3, \ | 717 /* Description : Interleave left half of byte elements from vectors |
433 out0, out1, out2, out3) { \ | 718 Arguments : Inputs - in0, in1, in2, in3 |
434 out0 = (v16u8)__msa_ilvl_b((v16i8)(in1), (v16i8)(in0)); \ | 719 Outputs - out0, out1 |
435 out1 = (v16u8)__msa_ilvr_b((v16i8)(in1), (v16i8)(in0)); \ | 720 Return Type - as per RTYPE |
436 out2 = (v16u8)__msa_ilvl_b((v16i8)(in3), (v16i8)(in2)); \ | 721 Details : Left half of byte elements of in0 and left half of byte |
437 out3 = (v16u8)__msa_ilvr_b((v16i8)(in3), (v16i8)(in2)); \ | 722 elements of in1 are interleaved and copied to out0. |
438 } | 723 Left half of byte elements of in2 and left half of byte |
439 | 724 elements of in3 are interleaved and copied to out1. |
440 #define ILV_H_LRLR_SH(in0, in1, in2, in3, \ | 725 */ |
441 out0, out1, out2, out3) { \ | 726 #define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ |
442 out0 = __msa_ilvl_h((v8i16)(in1), (v8i16)(in0)); \ | 727 out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ |
443 out1 = __msa_ilvr_h((v8i16)(in1), (v8i16)(in0)); \ | 728 out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \ |
444 out2 = __msa_ilvl_h((v8i16)(in3), (v8i16)(in2)); \ | 729 } |
445 out3 = __msa_ilvr_h((v8i16)(in3), (v8i16)(in2)); \ | 730 #define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__) |
446 } | 731 #define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__) |
447 | 732 #define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__) |
448 #define ILV_H_LR_SH(in0, in1, out0, out1) { \ | 733 #define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__) |
449 out0 = __msa_ilvl_h((v8i16)(in1), (v8i16)(in0)); \ | 734 |
450 out1 = __msa_ilvr_h((v8i16)(in1), (v8i16)(in0)); \ | 735 #define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ |
451 } | 736 out0, out1, out2, out3) { \ |
452 | 737 ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ |
453 #define ILVR_B_2VECS_UB(in0_r, in1_r, in0_l, in1_l, \ | 738 ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ |
454 out0, out1) { \ | 739 } |
455 out0 = (v16u8)__msa_ilvr_b((v16i8)(in0_l), (v16i8)(in0_r)); \ | 740 #define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__) |
456 out1 = (v16u8)__msa_ilvr_b((v16i8)(in1_l), (v16i8)(in1_r)); \ | 741 #define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__) |
457 } | 742 |
458 | 743 /* Description : Interleave left half of halfword elements from vectors |
459 #define ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ | 744 Arguments : Inputs - in0, in1, in2, in3 |
460 out0, out1) { \ | 745 Outputs - out0, out1 |
461 out0 = __msa_ilvr_b((v16i8)(in0_l), (v16i8)(in0_r)); \ | 746 Return Type - as per RTYPE |
462 out1 = __msa_ilvr_b((v16i8)(in1_l), (v16i8)(in1_r)); \ | 747 Details : Left half of halfword elements of in0 and left half of halfword |
463 } | 748 elements of in1 are interleaved and copied to out0. |
464 | 749 Left half of halfword elements of in2 and left half of halfword |
465 #define ILVR_B_4VECS_UB(in0_r, in1_r, in2_r, in3_r, \ | 750 elements of in3 are interleaved and copied to out1. |
466 in0_l, in1_l, in2_l, in3_l, \ | 751 */ |
467 out0, out1, out2, out3) { \ | 752 #define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ |
468 ILVR_B_2VECS_UB(in0_r, in1_r, in0_l, in1_l, \ | 753 out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ |
469 out0, out1); \ | 754 out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \ |
470 ILVR_B_2VECS_UB(in2_r, in3_r, in2_l, in3_l, \ | 755 } |
471 out2, out3); \ | 756 #define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__) |
472 } | 757 |
473 | 758 /* Description : Interleave left half of word elements from vectors |
474 #define ILVR_B_4VECS_SB(in0_r, in1_r, in2_r, in3_r, \ | 759 Arguments : Inputs - in0, in1, in2, in3 |
475 in0_l, in1_l, in2_l, in3_l, \ | 760 Outputs - out0, out1 |
476 out0, out1, out2, out3) { \ | 761 Return Type - as per RTYPE |
477 ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ | 762 Details : Left half of word elements of in0 and left half of word |
478 out0, out1); \ | 763 elements of in1 are interleaved and copied to out0. |
479 ILVR_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \ | 764 Left half of word elements of in2 and left half of word |
480 out2, out3); \ | 765 elements of in3 are interleaved and copied to out1. |
481 } | 766 */ |
482 | 767 #define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \ |
483 #define ILVR_B_6VECS_SB(in0_r, in1_r, in2_r, \ | 768 out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ |
484 in3_r, in4_r, in5_r, \ | 769 out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \ |
485 in0_l, in1_l, in2_l, \ | 770 } |
486 in3_l, in4_l, in5_l, \ | 771 #define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__) |
487 out0, out1, out2, \ | 772 #define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__) |
488 out3, out4, out5) { \ | 773 |
489 ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ | 774 /* Description : Interleave right half of byte elements from vectors |
490 out0, out1); \ | 775 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 |
491 ILVR_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \ | 776 Outputs - out0, out1, out2, out3 |
492 out2, out3); \ | 777 Return Type - as per RTYPE |
493 ILVR_B_2VECS_SB(in4_r, in5_r, in4_l, in5_l, \ | 778 Details : Right half of byte elements of in0 and right half of byte |
494 out4, out5); \ | 779 elements of in1 are interleaved and copied to out0. |
495 } | 780 Right half of byte elements of in2 and right half of byte |
496 | 781 elements of in3 are interleaved and copied to out1. |
497 #define ILVR_B_8VECS_SB(in0_r, in1_r, in2_r, in3_r, \ | 782 Similar for other pairs |
498 in4_r, in5_r, in6_r, in7_r, \ | 783 */ |
499 in0_l, in1_l, in2_l, in3_l, \ | 784 #define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ |
500 in4_l, in5_l, in6_l, in7_l, \ | 785 out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ |
501 out0, out1, out2, out3, \ | 786 out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \ |
502 out4, out5, out6, out7) { \ | 787 } |
503 ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ | 788 #define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__) |
504 out0, out1); \ | 789 #define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__) |
505 ILVR_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \ | 790 #define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__) |
506 out2, out3); \ | 791 #define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__) |
507 ILVR_B_2VECS_SB(in4_r, in5_r, in4_l, in5_l, \ | 792 |
508 out4, out5); \ | 793 #define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ |
509 ILVR_B_2VECS_SB(in6_r, in7_r, in6_l, in7_l, \ | 794 out0, out1, out2, out3) { \ |
510 out6, out7); \ | 795 ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ |
511 } | 796 ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ |
512 | 797 } |
513 #define ILVL_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ | 798 #define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__) |
514 out0, out1) { \ | 799 #define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__) |
515 out0 = __msa_ilvl_b((v16i8)(in0_l), (v16i8)(in0_r)); \ | 800 #define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__) |
516 out1 = __msa_ilvl_b((v16i8)(in1_l), (v16i8)(in1_r)); \ | 801 #define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__) |
517 } | 802 |
518 | 803 /* Description : Interleave right half of halfword elements from vectors |
519 #define ILVL_B_4VECS_SB(in0_r, in1_r, in2_r, in3_r, \ | 804 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 |
520 in0_l, in1_l, in2_l, in3_l, \ | 805 Outputs - out0, out1, out2, out3 |
521 out0, out1, out2, out3) { \ | 806 Return Type - signed halfword |
522 ILVL_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ | 807 Details : Right half of halfword elements of in0 and right half of |
523 out0, out1); \ | 808 halfword elements of in1 are interleaved and copied to out0. |
524 ILVL_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \ | 809 Right half of halfword elements of in2 and right half of |
525 out2, out3); \ | 810 halfword elements of in3 are interleaved and copied to out1. |
526 } | 811 Similar for other pairs |
527 | 812 */ |
528 #define ILVL_B_6VECS_SB(in0_r, in1_r, in2_r, \ | 813 #define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ |
529 in3_r, in4_r, in5_r, \ | 814 out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ |
530 in0_l, in1_l, in2_l, \ | 815 out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \ |
531 in3_l, in4_l, in5_l, \ | 816 } |
532 out0, out1, out2, \ | 817 #define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__) |
533 out3, out4, out5) { \ | 818 |
534 ILVL_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ | 819 #define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ |
535 out0, out1); \ | 820 out0, out1, out2, out3) { \ |
536 ILVL_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \ | 821 ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ |
537 out2, out3); \ | 822 ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ |
538 ILVL_B_2VECS_SB(in4_r, in5_r, in4_l, in5_l, \ | 823 } |
539 out4, out5); \ | 824 #define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__) |
540 } | 825 |
541 | 826 #define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \ |
542 #define ILVR_D_2VECS_SB(out0, in0_l, in0_r, \ | 827 out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ |
543 out1, in1_l, in1_r) { \ | 828 out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \ |
544 out0 = (v16i8)__msa_ilvr_d((v2i64)(in0_l), (v2i64)(in0_r)); \ | 829 } |
545 out1 = (v16i8)__msa_ilvr_d((v2i64)(in1_l), (v2i64)(in1_r)); \ | 830 #define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__) |
546 } | 831 #define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__) |
547 | 832 |
548 #define ILVR_D_3VECS_SB(out0, in0_l, in0_r, \ | 833 /* Description : Interleave right half of double word elements from vectors |
549 out1, in1_l, in1_r, \ | 834 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 |
550 out2, in2_l, in2_r) { \ | 835 Outputs - out0, out1, out2, out3 |
551 ILVR_D_2VECS_SB(out0, in0_l, in0_r, \ | 836 Return Type - as per RTYPE |
552 out1, in1_l, in1_r); \ | 837 Details : Right half of double word elements of in0 and right half of |
553 out2 = (v16i8)__msa_ilvr_d((v2i64)(in2_l), (v2i64)(in2_r)); \ | 838 double word elements of in1 are interleaved and copied to out0. |
554 } | 839 Right half of double word elements of in2 and right half of |
555 | 840 double word elements of in3 are interleaved and copied to out1. |
556 #define ILVR_D_4VECS_SB(out0, in0_l, in0_r, \ | 841 */ |
557 out1, in1_l, in1_r, \ | 842 #define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \ |
558 out2, in2_l, in2_r, \ | 843 out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \ |
559 out3, in3_l, in3_r) { \ | 844 out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \ |
560 ILVR_D_2VECS_SB(out0, in0_l, in0_r, \ | 845 } |
561 out1, in1_l, in1_r); \ | 846 #define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__) |
562 ILVR_D_2VECS_SB(out2, in2_l, in2_r, \ | 847 #define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__) |
563 out3, in3_l, in3_r); \ | 848 #define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__) |
564 } | 849 |
565 | 850 #define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) { \ |
566 #define DOTP_S_W_4VECS_SW(m0, c0, m1, c1, \ | 851 ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ |
567 m2, c2, m3, c3, \ | 852 out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5)); \ |
568 out0, out1, out2, out3) { \ | 853 } |
569 out0 = __msa_dotp_s_w((v8i16)(m0), (v8i16)(c0)); \ | 854 #define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__) |
570 out1 = __msa_dotp_s_w((v8i16)(m1), (v8i16)(c1)); \ | 855 |
571 out2 = __msa_dotp_s_w((v8i16)(m2), (v8i16)(c2)); \ | 856 #define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ |
572 out3 = __msa_dotp_s_w((v8i16)(m3), (v8i16)(c3)); \ | 857 out0, out1, out2, out3) { \ |
573 } | 858 ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ |
574 | 859 ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ |
575 #define PCKEV_H_2VECS_SH(in0_l, in0_r, in1_l, in1_r, \ | 860 } |
576 out0, out1) { \ | 861 #define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__) |
577 out0 = __msa_pckev_h((v8i16)(in0_l), (v8i16)(in0_r)); \ | 862 #define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__) |
578 out1 = __msa_pckev_h((v8i16)(in1_l), (v8i16)(in1_r)); \ | 863 |
579 } | 864 /* Description : Interleave both left and right half of input vectors |
580 | 865 Arguments : Inputs - in0, in1 |
581 #define XORI_B_2VECS_UB(val0, val1, \ | 866 Outputs - out0, out1 |
582 out0, out1, xor_val) { \ | 867 Return Type - as per RTYPE |
583 out0 = __msa_xori_b((v16u8)(val0), (xor_val)); \ | 868 Details : Right half of byte elements from 'in0' and 'in1' are |
584 out1 = __msa_xori_b((v16u8)(val1), (xor_val)); \ | 869 interleaved and stored to 'out0' |
585 } | 870 Left half of byte elements from 'in0' and 'in1' are |
586 | 871 interleaved and stored to 'out1' |
587 #define XORI_B_2VECS_SB(val0, val1, \ | 872 */ |
588 out0, out1, xor_val) { \ | 873 #define ILVRL_B2(RTYPE, in0, in1, out0, out1) { \ |
589 out0 = (v16i8)__msa_xori_b((v16u8)(val0), (xor_val)); \ | 874 out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ |
590 out1 = (v16i8)__msa_xori_b((v16u8)(val1), (xor_val)); \ | 875 out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ |
591 } | 876 } |
592 | 877 #define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__) |
593 #define XORI_B_3VECS_SB(val0, val1, val2, \ | 878 #define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__) |
594 out0, out1, out2, xor_val) { \ | 879 #define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__) |
595 XORI_B_2VECS_SB(val0, val1, out0, out1, xor_val); \ | 880 #define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__) |
596 out2 = (v16i8)__msa_xori_b((v16u8)(val2), (xor_val)); \ | 881 |
597 } | 882 #define ILVRL_H2(RTYPE, in0, in1, out0, out1) { \ |
598 | 883 out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ |
599 #define XORI_B_4VECS_UB(val0, val1, val2, val3, \ | 884 out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ |
600 out0, out1, out2, out3, \ | 885 } |
601 xor_val) { \ | 886 #define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__) |
602 XORI_B_2VECS_UB(val0, val1, out0, out1, xor_val); \ | 887 #define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__) |
603 XORI_B_2VECS_UB(val2, val3, out2, out3, xor_val); \ | 888 |
604 } | 889 #define ILVRL_W2(RTYPE, in0, in1, out0, out1) { \ |
605 | 890 out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ |
606 #define XORI_B_4VECS_SB(val0, val1, val2, val3, \ | 891 out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ |
607 out0, out1, out2, out3, \ | 892 } |
608 xor_val) { \ | 893 #define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__) |
609 XORI_B_2VECS_SB(val0, val1, out0, out1, xor_val); \ | 894 #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__) |
610 XORI_B_2VECS_SB(val2, val3, out2, out3, xor_val); \ | 895 |
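
One common use of ILVRL_H2_SW is sign extension of eight halfwords to two word vectors (hypothetical helper; little-endian lane order assumed, as elsewhere in the MSA code):

    /* Interleave each halfword with its sign mask (-1 where the
     * value is negative, else 0) so every 32-bit lane holds the
     * sign-extended input value. */
    static void unpack_sh_to_sw(v8i16 in, v4i32 *out_lo, v4i32 *out_hi) {
      v8i16 sign_m = __msa_clti_s_h(in, 0);
      v4i32 lo, hi;

      ILVRL_H2_SW(sign_m, in, lo, hi);
      *out_lo = lo; /* from the right (low) half of 'in' */
      *out_hi = hi; /* from the left (high) half of 'in' */
    }
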
611 } | 896 /* Description : Saturate the halfword element values to the max |
612 | 897 unsigned value of (sat_val+1) bits |
613 #define XORI_B_7VECS_SB(val0, val1, val2, val3, \ | 898 The element data width remains unchanged |
614 val4, val5, val6, \ | 899 Arguments : Inputs - in0, in1, in2, in3, sat_val |
615 out0, out1, out2, out3, \ | 900 Outputs - in0, in1, in2, in3 (in place) |
616 out4, out5, out6, \ | 901 Return Type - unsigned halfword |
617                                 xor_val) {                     \ | 902    Details     : Each unsigned halfword element from 'in0' is saturated |
618     XORI_B_4VECS_SB(val0, val1, val2, val3,                    \ | 903                  to the largest value representable in (sat_val+1) bits |
619                     out0, out1, out2, out3, xor_val);          \ | 904                  Results are written in place to the original vectors |
620 XORI_B_3VECS_SB(val4, val5, val6, \ | 905 */ |
621 out4, out5, out6, xor_val); \ | 906 #define SAT_UH2(RTYPE, in0, in1, sat_val) { \ |
622 } | 907 in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \ |
623 | 908 in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \ |
624 #define SRARI_H_4VECS_UH(val0, val1, val2, val3, \ | 909 } |
625 out0, out1, out2, out3, \ | 910 #define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__) |
626 shift_right_val) { \ | 911 |
627 out0 = (v8u16)__msa_srari_h((v8i16)(val0), (shift_right_val)); \ | 912 #define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) { \ |
628 out1 = (v8u16)__msa_srari_h((v8i16)(val1), (shift_right_val)); \ | 913 SAT_UH2(RTYPE, in0, in1, sat_val); \ |
629     out1 = (v8u16)__msa_srari_h((v8i16)(val1), (shift_right_val)); \ | 914     SAT_UH2(RTYPE, in2, in3, sat_val);                  \ |
630 out3 = (v8u16)__msa_srari_h((v8i16)(val3), (shift_right_val)); \ | 915 } |
631 } | 916 #define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__) |
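|                                                                  |
| /* Editor's sketch: clamp two vectors of non-negative halfword  |
|    filter sums to the 8-bit range before packing to bytes;      |
|    sat_val 7 means "saturate to (7+1) bits", i.e. max 255.      |
|    Function and parameter names are illustrative.  */           |
| static void clamp_to_8bit(v8u16 *sum0, v8u16 *sum1) {           |
|   SAT_UH2_UH(*sum0, *sum1, 7);                                  |
| }                                                                |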
632 | 917 |
633 #define SRARI_H_4VECS_SH(val0, val1, val2, val3, \ | 918 /* Description : Saturate the halfword element values to the max |
634                          out0, out1, out2, out3,               \ | 919                  signed value of (sat_val+1) bits |
635 shift_right_val) { \ | 920 The element data width remains unchanged |
636 out0 = __msa_srari_h((v8i16)(val0), (shift_right_val)); \ | 921 Arguments : Inputs - in0, in1, in2, in3, sat_val |
637 out1 = __msa_srari_h((v8i16)(val1), (shift_right_val)); \ | 922 Outputs - in0, in1, in2, in3 (in place) |
638     out2 = __msa_srari_h((v8i16)(val2), (shift_right_val));    \ | 923                  Return Type - signed halfword |
639     out3 = __msa_srari_h((v8i16)(val3), (shift_right_val));    \ | 924    Details     : Each signed halfword element from 'in0' is saturated |
640 }                                                                | 925                  to the largest value representable in (sat_val+1) bits |
641                                                                  | 926                  Results are written in place to the original vectors |
642 #define SRARI_W_4VECS_SW(val0, val1, val2, val3, \ | 927 */ |
643 out0, out1, out2, out3, \ | 928 #define SAT_SH2(RTYPE, in0, in1, sat_val) { \ |
644 shift_right_val) { \ | 929 in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \ |
645 out0 = __msa_srari_w((v4i32)(val0), (shift_right_val)); \ | 930 in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \ |
646 out1 = __msa_srari_w((v4i32)(val1), (shift_right_val)); \ | 931 } |
647 out2 = __msa_srari_w((v4i32)(val2), (shift_right_val)); \ | 932 #define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__) |
648 out3 = __msa_srari_w((v4i32)(val3), (shift_right_val)); \ | 933 |
649 } | 934 #define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) { \ |
650 | 935 SAT_SH2(RTYPE, in0, in1, sat_val); \ |
651 #define SRARI_SATURATE_UNSIGNED_H(input, right_shift_val, sat_val) ({ \ | 936 SAT_SH2(RTYPE, in2, in3, sat_val); \ |
652 v8u16 out_m; \ | 937 } |
| 938 #define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__) |
| 939 |
| 940 /* Description : Indexed halfword element values are replicated to all |
| 941 elements in output vector |
| 942 Arguments : Inputs - in, idx0, idx1 |
| 943 Outputs - out0, out1 |
| 944 Return Type - as per RTYPE |
| 945 Details : 'idx0' element value from 'in' vector is replicated to all |
| 946 elements in 'out0' vector |
| 947 Valid index range for halfword operation is 0-7 |
| 948 */ |
| 949 #define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) { \ |
| 950 out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0); \ |
| 951 out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1); \ |
| 952 } |
| 953 #define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__) |
| 954 |
| 955 #define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, \ |
| 956 out0, out1, out2, out3) { \ |
| 957 SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \ |
| 958 SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \ |
| 959 } |
| 960 #define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__) |
| 961 #define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__) |
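|                                                                  |
| /* Editor's sketch: broadcast four halfword filter coefficients |
|    so that each output vector holds one coefficient in every    |
|    lane - the usual setup before a dot-product; 'filter' is a   |
|    hypothetical coefficient array.  */                          |
| static void load_taps(const int16_t *filter, v8i16 *f0,         |
|                       v8i16 *f1, v8i16 *f2, v8i16 *f3) {        |
|   v8i16 filt = LD_SH(filter);                                   |
|                                                                  |
|   SPLATI_H4_SH(filt, 0, 1, 2, 3, *f0, *f1, *f2, *f3);           |
| }                                                                |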
| 962 |
| 963 /* Description : Pack even byte elements of vector pairs |
| 964 Arguments : Inputs - in0, in1, in2, in3 |
| 965 Outputs - out0, out1 |
| 966 Return Type - as per RTYPE |
| 967 Details : Even byte elements of in0 are copied to the left half of |
| 968 out0 & even byte elements of in1 are copied to the right |
| 969 half of out0. |
| 970 Even byte elements of in2 are copied to the left half of |
| 971 out1 & even byte elements of in3 are copied to the right |
| 972 half of out1. |
| 973 */ |
| 974 #define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ |
| 975 out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ |
| 976 out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \ |
| 977 } |
| 978 #define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__) |
| 979 #define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__) |
| 980 #define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__) |
| 981 |
| 982 #define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 983 out0, out1, out2, out3) { \ |
| 984 PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ |
| 985 PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ |
| 986 } |
| 987 #define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__) |
| 988 #define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__) |
| 989 #define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__) |
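|                                                                  |
| /* Editor's sketch: narrow four halfword vectors to two byte    |
|    vectors; per the description above, the first operand of     |
|    each pair supplies the left half of the packed result.  */   |
| static void narrow4_to_u8(v8i16 in0, v8i16 in1, v8i16 in2,      |
|                           v8i16 in3, v16u8 *out0, v16u8 *out1) {|
|   PCKEV_B2_UB(in0, in1, in2, in3, *out0, *out1);                |
| }                                                                |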
| 990 |
| 991 /* Description : Pack even halfword elements of vector pairs |
| 992 Arguments : Inputs - in0, in1, in2, in3 |
| 993 Outputs - out0, out1 |
| 994 Return Type - as per RTYPE |
| 995 Details : Even halfword elements of in0 are copied to the left half of |
| 996 out0 & even halfword elements of in1 are copied to the right |
| 997 half of out0. |
| 998 Even halfword elements of in2 are copied to the left half of |
| 999 out1 & even halfword elements of in3 are copied to the right |
| 1000 half of out1. |
| 1001 */ |
| 1002 #define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ |
| 1003 out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \ |
| 1004 out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \ |
| 1005 } |
| 1006 #define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__) |
| 1007 #define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__) |
| 1008 |
| 1009 #define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 1010 out0, out1, out2, out3) { \ |
| 1011 PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ |
| 1012 PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ |
| 1013 } |
| 1014 #define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__) |
| 1015 |
| 1016 /* Description : Pack even double word elements of vector pairs |
| 1017 Arguments : Inputs - in0, in1, in2, in3 |
| 1018 Outputs - out0, out1 |
| 1019                  Return Type - as per RTYPE |
| 1020    Details     : Even doubleword elements of in0 are copied to the left |
| 1021                  half of out0 & even doubleword elements of in1 are |
| 1022                  copied to the right half of out0. |
| 1023                  Even doubleword elements of in2 are copied to the left |
| 1024                  half of out1 & even doubleword elements of in3 are |
| 1025                  copied to the right half of out1. |
| 1026 */ |
| 1027 #define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \ |
| 1028 out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \ |
| 1029 out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \ |
| 1030 } |
| 1031 #define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__) |
| 1032 #define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__) |
| 1033 |
| 1034 #define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 1035 out0, out1, out2, out3) { \ |
| 1036 PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ |
| 1037 PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ |
| 1038 } |
| 1039 #define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__) |
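|                                                                  |
| /* Editor's sketch: gather the low 8 bytes of four row vectors  |
|    into two 16-byte vectors (e.g. pairs of 8-pixel rows before  |
|    an average); the second operand of each pair lands in the    |
|    right (low) doubleword. Names are illustrative.  */          |
| static void pair_rows(v16u8 r0, v16u8 r1, v16u8 r2, v16u8 r3,   |
|                       v16u8 *d01, v16u8 *d23) {                 |
|   PCKEV_D2_UB(r1, r0, r3, r2, *d01, *d23);                      |
| }                                                                |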
| 1040 |
| 1041 /* Description : Each byte element is logically xor'ed with immediate 128 |
| 1042 Arguments : Inputs - in0, in1 |
| 1043 Outputs - in0, in1 (in-place) |
| 1044 Return Type - as per RTYPE |
| 1045    Details     : Each unsigned byte element of 'in0' and 'in1' is |
| 1046                  logically xor'ed with 128 and the result is written |
| 1047                  back in place |
| 1048                  Similar for other pairs |
| 1052 */ |
| 1053 #define XORI_B2_128(RTYPE, in0, in1) { \ |
| 1054 in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \ |
| 1055 in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \ |
| 1056 } |
| 1057 #define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__) |
| 1058 #define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__) |
| 1059 |
| 1060 #define XORI_B3_128(RTYPE, in0, in1, in2) { \ |
| 1061 XORI_B2_128(RTYPE, in0, in1); \ |
| 1062 in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \ |
| 1063 } |
| 1064 #define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__) |
| 1065 |
| 1066 #define XORI_B4_128(RTYPE, in0, in1, in2, in3) { \ |
| 1067 XORI_B2_128(RTYPE, in0, in1); \ |
| 1068 XORI_B2_128(RTYPE, in2, in3); \ |
| 1069 } |
| 1070 #define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__) |
| 1071 #define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__) |
| 1072 |
| 1073 #define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) { \ |
| 1074 XORI_B4_128(RTYPE, in0, in1, in2, in3); \ |
| 1075 XORI_B3_128(RTYPE, in4, in5, in6); \ |
| 1076 } |
| 1077 #define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__) |
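|                                                                  |
| /* Editor's sketch: the +/-128 trick - flipping the sign bit    |
|    maps unsigned pixels 0..255 onto signed -128..127 so signed  |
|    dot-product instructions can be applied; the bias is undone  |
|    by a second xor (see PCKEV_XORI128_UB below).  */            |
| static void to_signed_domain(v16i8 *s0, v16i8 *s1,              |
|                              v16i8 *s2, v16i8 *s3) {            |
|   XORI_B4_128_SB(*s0, *s1, *s2, *s3);                           |
| }                                                                |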
| 1078 |
| 1079 /* Description : Addition of signed halfword elements and signed saturation |
| 1080 Arguments : Inputs - in0, in1, in2, in3 |
| 1081 Outputs - out0, out1 |
| 1082 Return Type - as per RTYPE |
| 1083 Details : Signed halfword elements from 'in0' are added to signed |
| 1084                  halfword elements of 'in1'. The result is then |
| 1085                  saturated to the signed halfword range -32768 to +32767 |
| 1086 Similar for other pairs |
| 1087 */ |
| 1088 #define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) { \ |
| 1089 out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \ |
| 1090 out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3); \ |
| 1091 } |
| 1092 #define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__) |
| 1093 |
| 1094 #define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 1095 out0, out1, out2, out3) { \ |
| 1096 ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \ |
| 1097 ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \ |
| 1098 } |
| 1099 #define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__) |
| 1100 |
| 1101 /* Description : Shift right arithmetic rounded (immediate) |
| 1102 Arguments : Inputs - in0, in1, in2, in3, shift |
| 1103 Outputs - in0, in1, in2, in3 (in place) |
| 1104 Return Type - as per RTYPE |
| 1105    Details     : Each element of vector 'in0' is shifted right |
| 1106                  arithmetically by the value in 'shift'. |
| 1107                  The last discarded bit is added to the shifted value |
| 1108                  for rounding and the result is written in place to 'in0' |
| 1109 Similar for other pairs |
| 1110 */ |
| 1111 #define SRARI_H2(RTYPE, in0, in1, shift) { \ |
| 1112 in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \ |
| 1113 in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \ |
| 1114 } |
| 1115 #define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__) |
| 1116 #define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__) |
| 1117 |
| 1118 #define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) { \ |
| 1119 SRARI_H2(RTYPE, in0, in1, shift); \ |
| 1120 SRARI_H2(RTYPE, in2, in3, shift); \ |
| 1121 } |
| 1122 #define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__) |
| 1123 #define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__) |
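|                                                                  |
| /* Editor's sketch: a typical convolve epilogue - round the     |
|    unsigned halfword sums by the filter precision (7 bits is    |
|    an assumption), clamp to 8 bits, then pack even bytes.  */   |
| static v16u8 round_and_pack(v8u16 sum0, v8u16 sum1) {           |
|   v16u8 out;                                                    |
|                                                                  |
|   SRARI_H2_UH(sum0, sum1, 7);                                   |
|   SAT_UH2_UH(sum0, sum1, 7);                                    |
|   out = (v16u8)__msa_pckev_b((v16i8)sum1, (v16i8)sum0);         |
|   return out;                                                   |
| }                                                                |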
| 1124 |
| 1125 /* Description : Shift right arithmetic rounded (immediate) |
| 1126 Arguments : Inputs - in0, in1, shift |
| 1127 Outputs - in0, in1 (in place) |
| 1128 Return Type - as per RTYPE |
| 1129    Details     : Each element of vector 'in0' is shifted right |
| 1130                  arithmetically by the value in 'shift'. |
| 1131                  The last discarded bit is added to the shifted value |
| 1132                  for rounding and the result is written in place to 'in0' |
| 1133 Similar for other pairs |
| 1134 */ |
| 1135 #define SRARI_W2(RTYPE, in0, in1, shift) { \ |
| 1136 in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \ |
| 1137 in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \ |
| 1138 } |
| 1139 #define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__) |
| 1140 |
| 1141 #define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) { \ |
| 1142 SRARI_W2(RTYPE, in0, in1, shift); \ |
| 1143 SRARI_W2(RTYPE, in2, in3, shift); \ |
| 1144 } |
| 1145 #define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__) |
| 1146 |
| 1147 /* Description : Addition of 2 pairs of vectors |
| 1148 Arguments : Inputs - in0, in1, in2, in3 |
| 1149 Outputs - out0, out1 |
| 1150    Details     : Each element of the two vector pairs is added and two |
| 1151                  result vectors are produced |
| 1152 */ |
| 1153 #define ADD2(in0, in1, in2, in3, out0, out1) { \ |
| 1154 out0 = in0 + in1; \ |
| 1155 out1 = in2 + in3; \ |
| 1156 } |
| 1157 #define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 1158 out0, out1, out2, out3) { \ |
| 1159 ADD2(in0, in1, in2, in3, out0, out1); \ |
| 1160 ADD2(in4, in5, in6, in7, out2, out3); \ |
| 1161 } |
| 1162 |
| 1163 /* Description : Subtraction of 2 pairs of vectors |
| 1164 Arguments : Inputs - in0, in1, in2, in3 |
| 1165 Outputs - out0, out1 |
| 1166    Details     : Each element of the two vector pairs is subtracted and |
| 1167                  two result vectors are produced |
| 1168 */ |
| 1169 #define SUB2(in0, in1, in2, in3, out0, out1) { \ |
| 1170 out0 = in0 - in1; \ |
| 1171 out1 = in2 - in3; \ |
| 1172 } |
| 1173 #define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 1174 out0, out1, out2, out3) { \ |
| 1175 out0 = in0 - in1; \ |
| 1176 out1 = in2 - in3; \ |
| 1177 out2 = in4 - in5; \ |
| 1178 out3 = in6 - in7; \ |
| 1179 } |
| 1180 |
| 1181 /* Description : Zero extend unsigned byte elements to halfword elements |
| 1182 Arguments : Inputs - in (1 input unsigned byte vector) |
| 1183                  Outputs - out0, out1  (2 signed halfword vectors) |
| 1184 Return Type - signed halfword |
| 1185 Details : Zero extended right half of vector is returned in 'out0' |
| 1186 Zero extended left half of vector is returned in 'out1' |
| 1187 */ |
| 1188 #define UNPCK_UB_SH(in, out0, out1) { \ |
| 1189 v16i8 zero_m = { 0 }; \ |
| 1190 \ |
| 1191 ILVRL_B2_SH(zero_m, in, out0, out1); \ |
| 1192 } |
| 1193 |
| 1194 /* Description : Butterfly of 4 input vectors |
| 1195 Arguments : Inputs - in0, in1, in2, in3 |
| 1196 Outputs - out0, out1, out2, out3 |
| 1197    Details     : out0 = in0 + in3;  out1 = in1 + in2;  out2 = in1 - in2;  out3 = in0 - in3 |
| 1198 */ |
| 1199 #define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) { \ |
| 1200 out0 = in0 + in3; \ |
| 1201 out1 = in1 + in2; \ |
| 1202 \ |
| 1203 out2 = in1 - in2; \ |
| 1204 out3 = in0 - in3; \ |
| 1205 } |
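|                                                                  |
| /* Editor's sketch: one sum/difference stage of a 4-point       |
|    transform; the multiplies by cosine constants that a full    |
|    (I)DCT would add afterwards are omitted.  */                 |
| static void butterfly_stage(v8i16 in0, v8i16 in1, v8i16 in2,    |
|                             v8i16 in3, v8i16 *e0, v8i16 *e1,    |
|                             v8i16 *o0, v8i16 *o1) {             |
|   BUTTERFLY_4(in0, in1, in2, in3, *e0, *e1, *o0, *o1);          |
| }                                                                |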
| 1206 |
| 1207 /* Description : Butterfly of 8 input vectors |
| 1208 Arguments : Inputs - in0 ... in7 |
| 1209 Outputs - out0 .. out7 |
| 1210    Details     : out[k] = in[k] + in[7-k] for k = 0..3;  out[k] = in[7-k] - in[k] for k = 4..7 |
| 1211 */ |
| 1212 #define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 1213 out0, out1, out2, out3, out4, out5, out6, out7) { \ |
| 1214 out0 = in0 + in7; \ |
| 1215 out1 = in1 + in6; \ |
| 1216 out2 = in2 + in5; \ |
| 1217 out3 = in3 + in4; \ |
653 \ | 1218 \ |
654 out_m = (v8u16)__msa_srari_h((v8i16)(input), (right_shift_val)); \ | 1219 out4 = in3 - in4; \ |
655 out_m = __msa_sat_u_h(out_m, (sat_val)); \ | 1220 out5 = in2 - in5; \ |
656 out_m; \ | 1221 out6 = in1 - in6; \ |
| 1222 out7 = in0 - in7; \ |
| 1223 } |
| 1224 |
| 1225 /* Description : Transposes 4x8 block with halfword elements in vectors |
| 1226    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7 |
| 1227                  Outputs - out0, out1, out2, out3, out4, out5, out6, out7 |
| 1228                  Return Type - signed halfword |
| 1229    Details     : out4 - out7 are cleared to zero |
| 1230 */ |
| 1231 #define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 1232 out0, out1, out2, out3, out4, out5, out6, out7) { \ |
| 1233 v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ |
| 1234 v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \ |
| 1235 v8i16 zero_m = { 0 }; \ |
| 1236 \ |
| 1237 ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, \ |
| 1238 tmp0_n, tmp1_n, tmp2_n, tmp3_n); \ |
| 1239 ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m); \ |
| 1240 ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m); \ |
| 1241 \ |
| 1242 out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m); \ |
| 1243 out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m); \ |
| 1244 out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m); \ |
| 1245 out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m); \ |
| 1246 \ |
| 1247 out4 = zero_m; \ |
| 1248 out5 = zero_m; \ |
| 1249 out6 = zero_m; \ |
| 1250 out7 = zero_m; \ |
| 1251 } |
| 1252 |
| 1253 /* Description : Transposes 8x4 block with halfword elements in vectors |
| 1254    Arguments   : Inputs  - in0, in1, in2, in3 |
| 1255                  Outputs - out0, out1, out2, out3 |
| 1256                  Return Type - signed halfword |
| 1257    Details     : |
| 1258 */ |
| 1259 #define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) { \ |
| 1260 v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ |
| 1261 \ |
| 1262 ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m); \ |
| 1263 ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m); \ |
| 1264 ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \ |
| 1265 ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \ |
| 1266 } |
| 1267 |
| 1268 /* Description : Transposes 8x8 block with halfword elements in vectors |
| 1269    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7 |
| 1270                  Outputs - out0, out1, out2, out3, out4, out5, out6, out7 |
| 1271                  Return Type - as per RTYPE |
| 1272    Details     : |
| 1273 */ |
| 1274 #define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 1275 out0, out1, out2, out3, out4, out5, out6, out7) { \ |
| 1276 v8i16 s0_m, s1_m; \ |
| 1277 v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ |
| 1278 v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ |
| 1279 \ |
| 1280 ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ |
| 1281 ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \ |
| 1282 ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ |
| 1283 ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \ |
| 1284 ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ |
| 1285 ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \ |
| 1286 ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ |
| 1287 ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \ |
| 1288 PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, \ |
| 1289 tmp3_m, tmp7_m, out0, out2, out4, out6); \ |
| 1290 out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \ |
| 1291 out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \ |
| 1292 out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \ |
| 1293 out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \ |
| 1294 } |
| 1295 #define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__) |
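|                                                                  |
| /* Editor's sketch: load eight rows of halfwords ('src' and     |
|    'stride', in elements, are hypothetical) and transpose them  |
|    entirely in registers.  */                                   |
| static void transpose_8x8(const int16_t *src, int32_t stride,   |
|                           v8i16 out[8]) {                       |
|   v8i16 r0 = LD_SH(src + 0 * stride);                           |
|   v8i16 r1 = LD_SH(src + 1 * stride);                           |
|   v8i16 r2 = LD_SH(src + 2 * stride);                           |
|   v8i16 r3 = LD_SH(src + 3 * stride);                           |
|   v8i16 r4 = LD_SH(src + 4 * stride);                           |
|   v8i16 r5 = LD_SH(src + 5 * stride);                           |
|   v8i16 r6 = LD_SH(src + 6 * stride);                           |
|   v8i16 r7 = LD_SH(src + 7 * stride);                           |
|                                                                  |
|   TRANSPOSE8x8_SH_SH(r0, r1, r2, r3, r4, r5, r6, r7,            |
|                      out[0], out[1], out[2], out[3],            |
|                      out[4], out[5], out[6], out[7]);           |
| }                                                                |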
| 1296 |
| 1297 /* Description : Pack even elements of input vectors & xor with 128 |
| 1298 Arguments : Inputs - in0, in1 |
| 1299 Outputs - out_m |
| 1300 Return Type - unsigned byte |
| 1301 Details : Signed byte even elements from 'in0' and 'in1' are packed |
| 1302                  together in one vector and the resulting vector is xor'ed with |
| 1303 128 to shift the range from signed to unsigned byte |
| 1304 */ |
| 1305 #define PCKEV_XORI128_UB(in0, in1) ({ \ |
| 1306 v16u8 out_m; \ |
| 1307 \ |
| 1308 out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \ |
| 1309 out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \ |
| 1310 out_m; \ |
657 }) | 1311 }) |
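|                                                                  |
| /* Editor's sketch: undo the signed-domain bias while packing   |
|    two filtered halfword vectors down to 16 pixels and store    |
|    them; 'dst' is a hypothetical destination pointer.  */       |
| static void store_filtered(v8i16 res0, v8i16 res1,              |
|                            uint8_t *dst) {                      |
|   v16u8 out = PCKEV_XORI128_UB(res0, res1);                     |
|                                                                  |
|   ST_UB(out, dst);                                              |
| }                                                                |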
658 | 1312 |
659 #define SRARI_SATURATE_SIGNED_H(input, right_shift_val, sat_val) ({ \ | 1313 /* Description : Pack even byte elements and store byte vector in destination |
660 v8i16 out_m; \ | 1314 memory |
661 \ | 1315 Arguments : Inputs - in0, in1, pdst |
662 out_m = __msa_srari_h((v8i16)(input), (right_shift_val)); \ | 1316 */ |
663 out_m = __msa_sat_s_h(out_m, (sat_val)); \ | 1317 #define PCKEV_ST_SB(in0, in1, pdst) { \ |
664 out_m; \ | 1318 v16i8 tmp_m; \ |
| 1319 \ |
| 1320 tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \ |
| 1321 ST_SB(tmp_m, (pdst)); \ |
| 1322 } |
| 1323 |
| 1324 /* Description : Horizontal 2-tap filter kernel code |
| 1325 Arguments : Inputs - in0, in1, mask, coeff, shift |
| 1326 */ |
| 1327 #define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) ({ \ |
| 1328 v16i8 tmp0_m; \ |
| 1329 v8u16 tmp1_m; \ |
| 1330 \ |
| 1331 tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \ |
| 1332 tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff); \ |
| 1333 tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \ |
| 1334 tmp1_m = __msa_sat_u_h(tmp1_m, shift); \ |
| 1335 \ |
| 1336 tmp1_m; \ |
665 }) | 1337 }) |
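|                                                                  |
| /* Editor's sketch: bilinear horizontal filtering of one row.   |
|    The mask pairs each pixel with its right neighbour; 'coeff'  |
|    is assumed to hold the two taps replicated in each byte      |
|    pair, and 7 is the usual FILTER_BITS (an assumption).  */    |
| static v8u16 bilin_row(const uint8_t *src, v16u8 coeff) {       |
|   const v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4,                  |
|                        4, 5, 5, 6, 6, 7, 7, 8 };                |
|   v16u8 in = LD_UB(src);                                        |
|                                                                  |
|   return HORIZ_2TAP_FILT_UH(in, in, mask, coeff, 7);            |
| }                                                                |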
666 | |
667 #define PCKEV_2B_XORI128_STORE_4_BYTES_4(in1, in2, \ | |
668 pdst, stride) { \ | |
669 uint32_t out0_m, out1_m, out2_m, out3_m; \ | |
670 v16i8 tmp0_m; \ | |
671 uint8_t *dst_m = (uint8_t *)(pdst); \ | |
672 \ | |
673 tmp0_m = __msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \ | |
674 tmp0_m = (v16i8)__msa_xori_b((v16u8)tmp0_m, 128); \ | |
675 \ | |
676 out0_m = __msa_copy_u_w((v4i32)tmp0_m, 0); \ | |
677 out1_m = __msa_copy_u_w((v4i32)tmp0_m, 1); \ | |
678 out2_m = __msa_copy_u_w((v4i32)tmp0_m, 2); \ | |
679 out3_m = __msa_copy_u_w((v4i32)tmp0_m, 3); \ | |
680 \ | |
681 STORE_WORD(dst_m, out0_m); \ | |
682 dst_m += stride; \ | |
683 STORE_WORD(dst_m, out1_m); \ | |
684 dst_m += stride; \ | |
685 STORE_WORD(dst_m, out2_m); \ | |
686 dst_m += stride; \ | |
687 STORE_WORD(dst_m, out3_m); \ | |
688 } | |
689 | |
690 #define PCKEV_B_4_XORI128_STORE_8_BYTES_4(in1, in2, \ | |
691 in3, in4, \ | |
692 pdst, stride) { \ | |
693 uint64_t out0_m, out1_m, out2_m, out3_m; \ | |
694 v16i8 tmp0_m, tmp1_m; \ | |
695 uint8_t *dst_m = (uint8_t *)(pdst); \ | |
696 \ | |
697 tmp0_m = __msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \ | |
698 tmp1_m = __msa_pckev_b((v16i8)(in4), (v16i8)(in3)); \ | |
699 \ | |
700 tmp0_m = (v16i8)__msa_xori_b((v16u8)tmp0_m, 128); \ | |
701 tmp1_m = (v16i8)__msa_xori_b((v16u8)tmp1_m, 128); \ | |
702 \ | |
703 out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \ | |
704 out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \ | |
705 out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \ | |
706 out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \ | |
707 \ | |
708 STORE_DWORD(dst_m, out0_m); \ | |
709 dst_m += stride; \ | |
710 STORE_DWORD(dst_m, out1_m); \ | |
711 dst_m += stride; \ | |
712 STORE_DWORD(dst_m, out2_m); \ | |
713 dst_m += stride; \ | |
714 STORE_DWORD(dst_m, out3_m); \ | |
715 } | |
716 | |
717 /* Only for signed vecs */ | |
718 #define PCKEV_B_XORI128_STORE_VEC(in1, in2, pdest) { \ | |
719 v16i8 tmp_m; \ | |
720 \ | |
721 tmp_m = __msa_pckev_b((v16i8)(in1), (v16i8)(in2)); \ | |
722 tmp_m = (v16i8)__msa_xori_b((v16u8)tmp_m, 128); \ | |
723 STORE_SB(tmp_m, (pdest)); \ | |
724 } | |
725 | |
726 /* Only for signed vecs */ | |
727 #define PCKEV_B_4_XORI128_AVG_STORE_8_BYTES_4(in1, dst0, \ | |
728 in2, dst1, \ | |
729 in3, dst2, \ | |
730 in4, dst3, \ | |
731 pdst, stride) { \ | |
732 uint64_t out0_m, out1_m, out2_m, out3_m; \ | |
733 v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ | |
734 uint8_t *dst_m = (uint8_t *)(pdst); \ | |
735 \ | |
736 tmp0_m = (v16u8)__msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \ | |
737 tmp1_m = (v16u8)__msa_pckev_b((v16i8)(in4), (v16i8)(in3)); \ | |
738 \ | |
739 tmp2_m = (v16u8)__msa_ilvr_d((v2i64)(dst1), (v2i64)(dst0)); \ | |
740 tmp3_m = (v16u8)__msa_ilvr_d((v2i64)(dst3), (v2i64)(dst2)); \ | |
741 \ | |
742 tmp0_m = __msa_xori_b(tmp0_m, 128); \ | |
743 tmp1_m = __msa_xori_b(tmp1_m, 128); \ | |
744 \ | |
745 tmp0_m = __msa_aver_u_b(tmp0_m, tmp2_m); \ | |
746 tmp1_m = __msa_aver_u_b(tmp1_m, tmp3_m); \ | |
747 \ | |
748 out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \ | |
749 out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \ | |
750 out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \ | |
751 out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \ | |
752 \ | |
753 STORE_DWORD(dst_m, out0_m); \ | |
754 dst_m += stride; \ | |
755 STORE_DWORD(dst_m, out1_m); \ | |
756 dst_m += stride; \ | |
757 STORE_DWORD(dst_m, out2_m); \ | |
758 dst_m += stride; \ | |
759 STORE_DWORD(dst_m, out3_m); \ | |
760 } | |
761 | |
762 /* Only for signed vecs */ | |
763 #define PCKEV_B_XORI128_AVG_STORE_VEC(in1, in2, dst, pdest) { \ | |
764 v16u8 tmp_m; \ | |
765 \ | |
766 tmp_m = (v16u8)__msa_pckev_b((v16i8)(in1), (v16i8)(in2)); \ | |
767 tmp_m = __msa_xori_b(tmp_m, 128); \ | |
768 tmp_m = __msa_aver_u_b(tmp_m, (v16u8)(dst)); \ | |
769 STORE_UB(tmp_m, (pdest)); \ | |
770 } | |
771 | |
772 #define PCKEV_B_STORE_8_BYTES_4(in1, in2, in3, in4, \ | |
773 pdst, stride) { \ | |
774 uint64_t out0_m, out1_m, out2_m, out3_m; \ | |
775 v16i8 tmp0_m, tmp1_m; \ | |
776 uint8_t *dst_m = (uint8_t *)(pdst); \ | |
777 \ | |
778 tmp0_m = __msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \ | |
779 tmp1_m = __msa_pckev_b((v16i8)(in4), (v16i8)(in3)); \ | |
780 \ | |
781 out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \ | |
782 out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \ | |
783 out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \ | |
784 out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \ | |
785 \ | |
786 STORE_DWORD(dst_m, out0_m); \ | |
787 dst_m += stride; \ | |
788 STORE_DWORD(dst_m, out1_m); \ | |
789 dst_m += stride; \ | |
790 STORE_DWORD(dst_m, out2_m); \ | |
791 dst_m += stride; \ | |
792 STORE_DWORD(dst_m, out3_m); \ | |
793 } | |
794 | |
795 /* Only for unsigned vecs */ | |
796 #define PCKEV_B_STORE_VEC(in1, in2, pdest) { \ | |
797 v16i8 tmp_m; \ | |
798 \ | |
799 tmp_m = __msa_pckev_b((v16i8)(in1), (v16i8)(in2)); \ | |
800 STORE_SB(tmp_m, (pdest)); \ | |
801 } | |
802 | |
803 #define PCKEV_B_AVG_STORE_8_BYTES_4(in1, dst0, in2, dst1, \ | |
804 in3, dst2, in4, dst3, \ | |
805 pdst, stride) { \ | |
806 uint64_t out0_m, out1_m, out2_m, out3_m; \ | |
807 v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ | |
808 uint8_t *dst_m = (uint8_t *)(pdst); \ | |
809 \ | |
810 tmp0_m = (v16u8)__msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \ | |
811 tmp1_m = (v16u8)__msa_pckev_b((v16i8)(in4), (v16i8)(in3)); \ | |
812 \ | |
813 tmp2_m = (v16u8)__msa_pckev_d((v2i64)(dst1), (v2i64)(dst0)); \ | |
814 tmp3_m = (v16u8)__msa_pckev_d((v2i64)(dst3), (v2i64)(dst2)); \ | |
815 \ | |
816 tmp0_m = __msa_aver_u_b(tmp0_m, tmp2_m); \ | |
817 tmp1_m = __msa_aver_u_b(tmp1_m, tmp3_m); \ | |
818 \ | |
819 out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \ | |
820 out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \ | |
821 out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \ | |
822 out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \ | |
823 \ | |
824 STORE_DWORD(dst_m, out0_m); \ | |
825 dst_m += stride; \ | |
826 STORE_DWORD(dst_m, out1_m); \ | |
827 dst_m += stride; \ | |
828 STORE_DWORD(dst_m, out2_m); \ | |
829 dst_m += stride; \ | |
830 STORE_DWORD(dst_m, out3_m); \ | |
831 } | |
832 | |
833 #define PCKEV_B_AVG_STORE_VEC(in1, in2, dst, pdest) { \ | |
834 v16u8 tmp_m; \ | |
835 \ | |
836 tmp_m = (v16u8)__msa_pckev_b((v16i8)(in1), (v16i8)(in2)); \ | |
837 tmp_m = __msa_aver_u_b(tmp_m, (v16u8)(dst)); \ | |
838 STORE_UB(tmp_m, (pdest)); \ | |
839 } | |
840 | |
841 /* Generic for Vector types and GP operations */ | |
842 #define BUTTERFLY_4(in0, in1, in2, in3, \ | |
843 out0, out1, out2, out3) { \ | |
844 out0 = (in0) + (in3); \ | |
845 out1 = (in1) + (in2); \ | |
846 \ | |
847 out2 = (in1) - (in2); \ | |
848 out3 = (in0) - (in3); \ | |
849 } | |
850 | |
851 /* Generic for Vector types and GP operations */ | |
852 #define BUTTERFLY_8(in0, in1, in2, in3, \ | |
853 in4, in5, in6, in7, \ | |
854 out0, out1, out2, out3, \ | |
855 out4, out5, out6, out7) { \ | |
856 out0 = (in0) + (in7); \ | |
857 out1 = (in1) + (in6); \ | |
858 out2 = (in2) + (in5); \ | |
859 out3 = (in3) + (in4); \ | |
860 \ | |
861 out4 = (in3) - (in4); \ | |
862 out5 = (in2) - (in5); \ | |
863 out6 = (in1) - (in6); \ | |
864 out7 = (in0) - (in7); \ | |
865 } | |
866 #endif /* HAVE_MSA */ | |
867 #endif /* VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_ */ | 1338 #endif /* VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_ */ |