| OLD | NEW |
| 1 // Copyright 2016 The Chromium Authors. All rights reserved. | 1 // Copyright 2016 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #ifndef CommonMacrosMSA_h | 5 #ifndef CommonMacrosMSA_h |
| 6 #define CommonMacrosMSA_h | 6 #define CommonMacrosMSA_h |
| 7 | 7 |
| 8 #include <msa.h> | 8 #include <msa.h> |
| 9 #include <stdint.h> | 9 #include <stdint.h> |
| 10 | 10 |
| 11 #if defined(__clang__) | 11 #if defined(__clang__) |
| 12 #define CLANG_BUILD | 12 #define CLANG_BUILD |
| 13 #endif | 13 #endif |
| 14 | 14 |
| 15 #ifdef CLANG_BUILD | 15 #ifdef CLANG_BUILD |
| 16 #define SRLI_B(a, b) __msa_srli_b((v16i8)a, b) |
| 16 #define SRLI_H(a, b) __msa_srli_h((v8i16)a, b) | 17 #define SRLI_H(a, b) __msa_srli_h((v8i16)a, b) |
| 18 #define SLLI_B(a, b) __msa_slli_b((v16i8)a, b) |
| 17 #define SLLI_H(a, b) __msa_slli_h((v8i16)a, b) | 19 #define SLLI_H(a, b) __msa_slli_h((v8i16)a, b) |
| 20 #define CEQI_B(a, b) __msa_ceqi_b((v16i8)a, b) |
| 18 #define CEQI_H(a, b) __msa_ceqi_h((v8i16)a, b) | 21 #define CEQI_H(a, b) __msa_ceqi_h((v8i16)a, b) |
| 22 #define ANDI_B(a, b) __msa_andi_b((v16u8)a, b) |
| 19 #else | 23 #else |
| 24 #define SRLI_B(a, b) ((v16u8)a >> b) |
| 20 #define SRLI_H(a, b) ((v8u16)a >> b) | 25 #define SRLI_H(a, b) ((v8u16)a >> b) |
| 26 #define SLLI_B(a, b) ((v16i8)a << b) |
| 21 #define SLLI_H(a, b) ((v8i16)a << b) | 27 #define SLLI_H(a, b) ((v8i16)a << b) |
| 28 #define CEQI_B(a, b) (a == b) |
| 22 #define CEQI_H(a, b) (a == b) | 29 #define CEQI_H(a, b) (a == b) |
| 30 #define ANDI_B(a, b) ((v16u8)a & b) |
| 23 #endif | 31 #endif |
| 24 | 32 |
| 25 #define LD_V(RTYPE, psrc) *((RTYPE*)(psrc)) | 33 #define LD_V(RTYPE, psrc) *((RTYPE*)(psrc)) |
| 26 #define LD_UB(...) LD_V(v16u8, __VA_ARGS__) | 34 #define LD_UB(...) LD_V(v16u8, __VA_ARGS__) |
| 27 #define LD_UH(...) LD_V(v8u16, __VA_ARGS__) | 35 #define LD_UH(...) LD_V(v8u16, __VA_ARGS__) |
| 28 #define LD_SP(...) LD_V(v4f32, __VA_ARGS__) | 36 #define LD_SP(...) LD_V(v4f32, __VA_ARGS__) |
| 29 #define LD_DP(...) LD_V(v2f64, __VA_ARGS__) | 37 #define LD_DP(...) LD_V(v2f64, __VA_ARGS__) |
| 30 | 38 |
| 31 #define ST_V(RTYPE, in, pdst) *((RTYPE*)(pdst)) = in | 39 #define ST_V(RTYPE, in, pdst) *((RTYPE*)(pdst)) = in |
| 32 #define ST_UB(...) ST_V(v16u8, __VA_ARGS__) | 40 #define ST_UB(...) ST_V(v16u8, __VA_ARGS__) |
| (...skipping 94 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 127 | 135 |
| 128 #define LD_V4(RTYPE, psrc, stride, out0, out1, out2, out3) \ | 136 #define LD_V4(RTYPE, psrc, stride, out0, out1, out2, out3) \ |
| 129 { \ | 137 { \ |
| 130 LD_V2(RTYPE, psrc, stride, out0, out1); \ | 138 LD_V2(RTYPE, psrc, stride, out0, out1); \ |
| 131 LD_V2(RTYPE, psrc, stride, out2, out3); \ | 139 LD_V2(RTYPE, psrc, stride, out2, out3); \ |
| 132 } | 140 } |
| 133 #define LD_UB4(...) LD_V4(v16u8, __VA_ARGS__) | 141 #define LD_UB4(...) LD_V4(v16u8, __VA_ARGS__) |
| 134 #define LD_UH4(...) LD_V4(v8u16, __VA_ARGS__) | 142 #define LD_UH4(...) LD_V4(v8u16, __VA_ARGS__) |
| 135 #define LD_SP4(...) LD_V4(v4f32, __VA_ARGS__) | 143 #define LD_SP4(...) LD_V4(v4f32, __VA_ARGS__) |
| 136 | 144 |
| 145 #define LD_V6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5) \ |
| 146 { \ |
| 147 LD_V4(RTYPE, psrc, stride, out0, out1, out2, out3); \ |
| 148 LD_V2(RTYPE, psrc, stride, out4, out5); \ |
| 149 } |
| 150 #define LD_UB6(...) LD_V6(v16u8, __VA_ARGS__) |
| 151 #define LD_UH6(...) LD_V6(v8u16, __VA_ARGS__) |
| 152 #define LD_SP6(...) LD_V6(v4f32, __VA_ARGS__) |
| 153 |
| 154 #define LD_V8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, out
7) \ |
| 155 {
\ |
| 156 LD_V4(RTYPE, psrc, stride, out0, out1, out2, out3);
\ |
| 157 LD_V4(RTYPE, psrc, stride, out4, out5, out6, out7);
\ |
| 158 } |
| 159 #define LD_UB8(...) LD_V8(v16u8, __VA_ARGS__) |
| 160 #define LD_UH8(...) LD_V8(v8u16, __VA_ARGS__) |
| 161 #define LD_SP8(...) LD_V8(v4f32, __VA_ARGS__) |
| 162 #define LD_DP8(...) LD_V8(v2f64, __VA_ARGS__) |
| 163 |
| 137 /* Description : Store vectors of elements with stride | 164 /* Description : Store vectors of elements with stride |
| 138 * Arguments : Inputs - in0, in1, pdst, stride | 165 * Arguments : Inputs - in0, in1, pdst, stride |
| 139 * Details : Store elements from 'in0' to (pdst) | 166 * Details : Store elements from 'in0' to (pdst) |
| 140 * Store elements from 'in1' to (pdst + stride) | 167 * Store elements from 'in1' to (pdst + stride) |
| 141 */ | 168 */ |
| 142 #define ST_V2(RTYPE, in0, in1, pdst, stride) \ | 169 #define ST_V2(RTYPE, in0, in1, pdst, stride) \ |
| 143 { \ | 170 { \ |
| 144 ST_V(RTYPE, in0, pdst); \ | 171 ST_V(RTYPE, in0, pdst); \ |
| 145 pdst += stride; \ | 172 pdst += stride; \ |
| 146 ST_V(RTYPE, in1, pdst); \ | 173 ST_V(RTYPE, in1, pdst); \ |
| (...skipping 13 matching lines...) Expand all Loading... |
| 160 #define ST_UH3(...) ST_V3(v8u16, __VA_ARGS__) | 187 #define ST_UH3(...) ST_V3(v8u16, __VA_ARGS__) |
| 161 | 188 |
| 162 #define ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride) \ | 189 #define ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride) \ |
| 163 { \ | 190 { \ |
| 164 ST_V2(RTYPE, in0, in1, pdst, stride); \ | 191 ST_V2(RTYPE, in0, in1, pdst, stride); \ |
| 165 ST_V2(RTYPE, in2, in3, pdst, stride); \ | 192 ST_V2(RTYPE, in2, in3, pdst, stride); \ |
| 166 } | 193 } |
| 167 #define ST_UB4(...) ST_V4(v16u8, __VA_ARGS__) | 194 #define ST_UB4(...) ST_V4(v16u8, __VA_ARGS__) |
| 168 #define ST_UH4(...) ST_V4(v8u16, __VA_ARGS__) | 195 #define ST_UH4(...) ST_V4(v8u16, __VA_ARGS__) |
| 169 #define ST_SP4(...) ST_V4(v4f32, __VA_ARGS__) | 196 #define ST_SP4(...) ST_V4(v4f32, __VA_ARGS__) |
| 197 |
| 170 #define ST_V6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride) \ | 198 #define ST_V6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride) \ |
| 171 { \ | 199 { \ |
| 172 ST_V3(RTYPE, in0, in1, in2, pdst, stride); \ | 200 ST_V3(RTYPE, in0, in1, in2, pdst, stride); \ |
| 173 ST_V3(RTYPE, in3, in4, in5, pdst, stride); \ | 201 ST_V3(RTYPE, in3, in4, in5, pdst, stride); \ |
| 174 } | 202 } |
| 175 #define ST_UB6(...) ST_V6(v16u8, __VA_ARGS__) | 203 #define ST_UB6(...) ST_V6(v16u8, __VA_ARGS__) |
| 176 #define ST_SP6(...) ST_V6(v4f32, __VA_ARGS__) | 204 #define ST_SP6(...) ST_V6(v4f32, __VA_ARGS__) |
| 177 | 205 |
| 178 #define ST_V8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ | 206 #define ST_V8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ |
| 179 { \ | 207 { \ |
| 180 ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride); \ | 208 ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride); \ |
| 181 ST_V4(RTYPE, in4, in5, in6, in7, pdst, stride); \ | 209 ST_V4(RTYPE, in4, in5, in6, in7, pdst, stride); \ |
| 182 } | 210 } |
| 183 #define ST_UB8(...) ST_V8(v16u8, __VA_ARGS__) | 211 #define ST_UB8(...) ST_V8(v16u8, __VA_ARGS__) |
| 184 #define ST_SP8(...) ST_V8(v4f32, __VA_ARGS__) | 212 #define ST_SP8(...) ST_V8(v4f32, __VA_ARGS__) |
| 185 | 213 |
| 214 /* Description : Logical AND of input vectors with a mask vector |
| 215 Arguments : Inputs - in0, in1, mask |
| 216 Outputs - out0, out1 |
| 217 Return Type - as per RTYPE |
| 218 Details : Each element from 'in0' and 'in1' is bitwise AND'ed with the |
| 219 corresponding element of 'mask' and the results are written |
| 220 to 'out0' and 'out1' respectively |
| 221 */ |
| 222 #define AND_V2(RTYPE, in0, in1, mask, out0, out1) \ |
| 223 { \ |
| 224 out0 = (RTYPE)__msa_and_v((v16u8)in0, (v16u8)mask); \ |
| 225 out1 = (RTYPE)__msa_and_v((v16u8)in1, (v16u8)mask); \ |
| 226 } |
| 227 #define AND_V2_UB(...) AND_V2(v16u8, __VA_ARGS__) |
| 228 |
| 229 #define AND_V4(RTYPE, in0, in1, in2, in3, mask, out0, out1, out2, out3) \ |
| 230 { \ |
| 231 AND_V2(RTYPE, in0, in1, mask, out0, out1); \ |
| 232 AND_V2(RTYPE, in2, in3, mask, out2, out3); \ |
| 233 } |
| 234 #define AND_V4_UB(...) AND_V4(v16u8, __VA_ARGS__) |
| 235 |
| 236 /* Description : Compare elements of input vectors with immediate value |
| 237 Arguments : Inputs - in0, in1, val |
| 238 Outputs - out0, out1 |
| 239 Return Type - as per RTYPE |
| 240 Details : Each byte element from input vector 'in0' & 'in1' is compared |
| 241 for equality with the immediate 'val'; equal elements are set |
| 242 to all ones and unequal elements to zero in 'out0' & 'out1'. |
| 243 */ |
| 244 #define CEQI_B2(RTYPE, in0, in1, val, out0, out1) \ |
| 245 { \ |
| 246 out0 = CEQI_B(in0, val); \ |
| 247 out1 = CEQI_B(in1, val); \ |
| 248 } |
| 249 #define CEQI_B2_UB(...) CEQI_B2(v16u8, __VA_ARGS__) |
| 250 |
| 251 #define CEQI_B4(RTYPE, in0, in1, in2, in3, val, out0, out1, out2, out3) \ |
| 252 { \ |
| 253 CEQI_B2(RTYPE, in0, in1, val, out0, out1); \ |
| 254 CEQI_B2(RTYPE, in2, in3, val, out2, out3); \ |
| 255 } |
| 256 #define CEQI_B4_UB(...) CEQI_B4(v16u8, __VA_ARGS__) |
| 257 |
| 258 /* Description : Immediate number of elements to slide |
| 259 * Arguments : Inputs - in0, in1, slide_val |
| 260 * Outputs - out |
| 261 * Return Type - as per RTYPE |
| 262 * Details : Byte elements from 'in1' vector are slid into 'in0' by |
| 263 * value specified in the 'slide_val' |
| 264 */ |
| 265 #define SLDI_B(RTYPE, in0, in1, slide_val) \ |
| 266 (RTYPE)__msa_sldi_b((v16i8)in0, (v16i8)in1, slide_val) |
| 267 #define SLDI_UB(...) SLDI_B(v16u8, __VA_ARGS__) |
| 268 #define SLDI_D(...) SLDI_B(v2f64, __VA_ARGS__) |
| 269 |
| 270 /* Description : Immediate number of elements to slide |
| 271 Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val |
| 272 Outputs - out0, out1 |
| 273 Return Type - as per RTYPE |
| 274 Details : Byte elements from 'in0_0' vector are slid into 'in1_0' by |
| 275 value specified in the 'slide_val' |
| 276 */ |
| 277 #define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ |
| 278 { \ |
| 279 out0 = SLDI_B(RTYPE, in0_0, in1_0, slide_val); \ |
| 280 out1 = SLDI_B(RTYPE, in0_1, in1_1, slide_val); \ |
| 281 } |
| 282 #define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__) |
| 283 |
| 284 /* Description : Shuffle byte vector elements as per shuffle control |
| 285 Arguments : Inputs - in0, in1, shf_val |
| 286 Outputs - in place operation |
| 287 Return Type - as per RTYPE |
| 288 Details : Byte elements from 'in0' & 'in1' are rearranged as per the |
| 289 control value 'shf_val' and written back in-place. |
| 290 */ |
| 291 #define SHF_B2(RTYPE, in0, in1, shf_val) \ |
| 292 { \ |
| 293 in0 = (RTYPE)__msa_shf_b((v16i8)in0, shf_val); \ |
| 294 in1 = (RTYPE)__msa_shf_b((v16i8)in1, shf_val); \ |
| 295 } |
| 296 #define SHF_B2_UB(...) SHF_B2(v16u8, __VA_ARGS__) |
| 297 #define SHF_B2_UH(...) SHF_B2(v8u16, __VA_ARGS__) |
| 298 |
| 299 #define SHF_B4(RTYPE, in0, in1, in2, in3, shf_val) \ |
| 300 { \ |
| 301 SHF_B2(RTYPE, in0, in1, shf_val); \ |
| 302 SHF_B2(RTYPE, in2, in3, shf_val); \ |
| 303 } |
| 304 #define SHF_B4_UB(...) SHF_B4(v16u8, __VA_ARGS__) |
| 305 #define SHF_B4_UH(...) SHF_B4(v8u16, __VA_ARGS__) |
| 306 |
| 307 /* Description : Interleave even byte elements from vectors |
| 308 Arguments : Inputs - in0, in1, in2, in3 |
| 309 Outputs - out0, out1 |
| 310 Return Type - as per RTYPE |
| 311 Details : Even byte elements of 'in0' and 'in1' are interleaved |
| 312 and written to 'out0' |
| 313 */ |
| 314 #define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ |
| 315 { \ |
| 316 out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ |
| 317 out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ |
| 318 } |
| 319 #define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__) |
| 320 #define ILVEV_B2_UH(...) ILVEV_B2(v8u16, __VA_ARGS__) |
| 321 |
| 322 #define ILVEV_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \ |
| 323 { \ |
| 324 ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ |
| 325 out2 = (RTYPE)__msa_ilvev_b((v16i8)in5, (v16i8)in4); \ |
| 326 } |
| 327 #define ILVEV_B3_UH(...) ILVEV_B3(v8u16, __VA_ARGS__) |
| 328 |
| 186 /* Description : Interleave even halfword elements from vectors | 329 /* Description : Interleave even halfword elements from vectors |
| 187 Arguments : Inputs - in0, in1, in2, in3 | 330 Arguments : Inputs - in0, in1, in2, in3 |
| 188 Outputs - out0, out1 | 331 Outputs - out0, out1 |
| 189 Return Type - as per RTYPE | 332 Return Type - as per RTYPE |
| 190 Details : Even halfword elements of 'in0' and 'in1' are interleaved | 333 Details : Even halfword elements of 'in0' and 'in1' are interleaved |
| 191 and written to 'out0' | 334 and written to 'out0' |
| 192 */ | 335 */ |
| 193 #define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ | 336 #define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ |
| 194 { \ | 337 { \ |
| 195 out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \ | 338 out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \ |
| 196 out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \ | 339 out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \ |
| 197 } | 340 } |
| 198 #define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__) | 341 #define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__) |
| 199 | 342 |
| 200 /* Description : Interleave both left and right half of input vectors | 343 /* Description : Interleave both left and right half of input vectors |
| 201 Arguments : Inputs - in0, in1 | 344 Arguments : Inputs - in0, in1 |
| 202 Outputs - out0, out1 | 345 Outputs - out0, out1 |
| 203 Return Type - as per RTYPE | 346 Return Type - as per RTYPE |
| 204 Details : Right half of byte elements from 'in0' and 'in1' are | 347 Details : Right half of byte elements from 'in0' and 'in1' are |
| 205 interleaved and written to 'out0' | 348 interleaved and written to 'out0' |
| 206 */ | 349 */ |
| 207 #define ILVRL_B2(RTYPE, in0, in1, out0, out1) \ | 350 #define ILVRL_B2(RTYPE, in0, in1, out0, out1) \ |
| 208 { \ | 351 { \ |
| 209 out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ | 352 out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ |
| 210 out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ | 353 out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ |
| 211 } | 354 } |
| 212 #define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__) | 355 #define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__) |
| 213 | 356 |
| 357 #define ILVRL_H2(RTYPE, in0, in1, out0, out1) \ |
| 358 { \ |
| 359 out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ |
| 360 out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ |
| 361 } |
| 362 #define ILVRL_H2_UB(...) ILVRL_H2(v16u8, __VA_ARGS__) |
| 363 |
| 364 /* Description : Interleave odd and even byte elements of input vectors |
| 365 Arguments : Inputs - in0, in1 |
| 366 Outputs - out0, out1 |
| 367 Return Type - as per RTYPE |
| 368 Details : Odd byte elements from 'in0' and 'in1' are interleaved and |
| 369 written to 'out0'; even byte elements are written to 'out1' |
| 370 */ |
| 371 #define ILVODEV_B2(RTYPE, in0, in1, out0, out1) \ |
| 372 { \ |
| 373 out0 = (RTYPE)__msa_ilvod_b((v16i8)in0, (v16i8)in1); \ |
| 374 out1 = (RTYPE)__msa_ilvev_b((v16i8)in0, (v16i8)in1); \ |
| 375 } |
| 376 #define ILVODEV_B2_UB(...) ILVODEV_B2(v16u8, __VA_ARGS__) |
| 377 |
| 378 /* Description : Pack even halfword elements of vector pairs |
| 379 Arguments : Inputs - in0, in1, in2, in3 |
| 380 Outputs - out0, out1 |
| 381 Return Type - as per RTYPE |
| 382 Details : Even halfword elements of 'in0' are copied to the left half of |
| 383 'out0' & even halfword elements of 'in1' are copied to the |
| 384 right half of 'out0'. |
| 385 */ |
| 386 #define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ |
| 387 { \ |
| 388 out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \ |
| 389 out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \ |
| 390 } |
| 391 #define PCKEV_H2_UB(...) PCKEV_H2(v16u8, __VA_ARGS__) |
| 392 |
| 393 #define PCKEV_H3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \ |
| 394 { \ |
| 395 PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ |
| 396 out2 = (RTYPE)__msa_pckev_h((v8i16)in4, (v8i16)in5); \ |
| 397 } |
| 398 #define PCKEV_H3_UB(...) PCKEV_H3(v16u8, __VA_ARGS__) |
| 399 |
| 400 #define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2
, out3) \ |
| 401 {
\ |
| 402 PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);
\ |
| 403 PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);
\ |
| 404 } |
| 405 #define PCKEV_H4_UB(...) PCKEV_H4(v16u8, __VA_ARGS__) |
| 406 |
| 407 /* Description : Pack odd halfword elements of vector pairs |
| 408 Arguments : Inputs - in0, in1, in2, in3 |
| 409 Outputs - out0, out1 |
| 410 Return Type - as per RTYPE |
| 411 Details : Odd halfword elements of 'in0' are copied to the left half of |
| 412 'out0' & odd halfword elements of 'in1' are copied to the |
| 413 right half of 'out0'. |
| 414 */ |
| 415 #define PCKOD_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ |
| 416 { \ |
| 417 out0 = (RTYPE)__msa_pckod_h((v8i16)in0, (v8i16)in1); \ |
| 418 out1 = (RTYPE)__msa_pckod_h((v8i16)in2, (v8i16)in3); \ |
| 419 } |
| 420 #define PCKOD_H2_UB(...) PCKOD_H2(v16u8, __VA_ARGS__) |
| 421 |
| 422 #define PCKOD_H3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \ |
| 423 { \ |
| 424 PCKOD_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ |
| 425 out2 = (RTYPE)__msa_pckod_h((v8i16)in4, (v8i16)in5); \ |
| 426 } |
| 427 #define PCKOD_H3_UB(...) PCKOD_H3(v16u8, __VA_ARGS__) |
| 428 |
| 429 #define PCKOD_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2
, out3) \ |
| 430 {
\ |
| 431 PCKOD_H2(RTYPE, in0, in1, in2, in3, out0, out1);
\ |
| 432 PCKOD_H2(RTYPE, in4, in5, in6, in7, out2, out3);
\ |
| 433 } |
| 434 #define PCKOD_H4_UB(...) PCKOD_H4(v16u8, __VA_ARGS__) |
| 435 |
| 436 /* Description : Logical shift right all elements of byte vector |
| 437 Arguments : Inputs - in0, in1, shift_val |
| 438 Outputs - in place operation |
| 439 Return Type - as per input vector RTYPE |
| 440 Details : Each byte element of the input vectors is right shifted by |
| 441 the immediate 'shift_val' and written back in-place. |
| 442 */ |
| 443 #define SRLI_B2(RTYPE, in0, in1, shift_val) \ |
| 444 { \ |
| 445 in0 = (RTYPE)SRLI_B(in0, shift_val); \ |
| 446 in1 = (RTYPE)SRLI_B(in1, shift_val); \ |
| 447 } |
| 448 #define SRLI_B2_UB(...) SRLI_B2(v16u8, __VA_ARGS__) |
| 449 |
| 450 #define SRLI_B3(RTYPE, in0, in1, in2, shift_val) \ |
| 451 { \ |
| 452 SRLI_B2(RTYPE, in0, in1, shift_val); \ |
| 453 in2 = (RTYPE)SRLI_B(in2, shift_val); \ |
| 454 } |
| 455 #define SRLI_B3_UB(...) SRLI_B3(v16u8, __VA_ARGS__) |
| 456 |
| 457 #define SRLI_B4(RTYPE, in0, in1, in2, in3, shift_val) \ |
| 458 { \ |
| 459 SRLI_B2(RTYPE, in0, in1, shift_val); \ |
| 460 SRLI_B2(RTYPE, in2, in3, shift_val); \ |
| 461 } |
| 462 #define SRLI_B4_UB(...) SRLI_B4(v16u8, __VA_ARGS__) |
| 463 |
| 464 /* Description : Immediate Bit Insert Right (immediate) |
| 465 Arguments : Inputs - in0, in1, in2, in3, shift |
| 466 Outputs - out0, out1 |
| 467 Return Type - as per RTYPE |
| 468 Details : Copy least significant (right) bits in each element of vector |
| 469 'in1' to elements in vector in0 while preserving the most |
| 470 significant (left) bits. The number of bits to copy is given |
| 471 by the immediate 'shift + 1'. |
| 472 */ |
| 473 #define BINSRI_B2(RTYPE, in0, in1, in2, in3, out0, out1, shift) \ |
| 474 { \ |
| 475 out0 = (RTYPE)__msa_binsri_b((v16u8)in0, (v16u8)in1, shift); \ |
| 476 out1 = (RTYPE)__msa_binsri_b((v16u8)in2, (v16u8)in3, shift); \ |
| 477 } |
| 478 #define BINSRI_B2_UB(...) BINSRI_B2(v16u8, __VA_ARGS__) |
| 479 |
| 480 #define BINSRI_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2, shift)
\ |
| 481 {
\ |
| 482 BINSRI_B2(RTYPE, in0, in1, in2, in3, out0, out1, shift);
\ |
| 483 out2 = (RTYPE)__msa_binsri_b((v16u8)in4, (v16u8)in5, shift);
\ |
| 484 } |
| 485 #define BINSRI_B3_UB(...) BINSRI_B3(v16u8, __VA_ARGS__) |
| 486 |
| 487 /* Description : Multiplication of pairs of vectors |
| 488 Arguments : Inputs - in0, in1, in2, in3 |
| 489 Outputs - out0, out1 |
| 490 Details : Each element from 'in0' is multiplied with elements from 'in1' |
| 491 and the result is written to 'out0' |
| 492 */ |
| 493 #define MUL2(in0, in1, in2, in3, out0, out1) \ |
| 494 { \ |
| 495 out0 = in0 * in1; \ |
| 496 out1 = in2 * in3; \ |
| 497 } |
| 498 #define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ |
| 499 { \ |
| 500 MUL2(in0, in1, in2, in3, out0, out1); \ |
| 501 MUL2(in4, in5, in6, in7, out2, out3); \ |
| 502 } |
| 503 |
| 504 /* Description : Division of pairs of vectors |
| 505 Arguments : Inputs - in0, in1, in2, in3 |
| 506 Outputs - out0, out1 |
| 507 Details : Each element from 'in0' is divided by elements from 'in1' |
| 508 and the result is written to 'out0' |
| 509 */ |
| 510 #define DIV2(in0, in1, in2, in3, out0, out1) \ |
| 511 { \ |
| 512 out0 = in0 / in1; \ |
| 513 out1 = in2 / in3; \ |
| 514 } |
| 515 #define DIV4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ |
| 516 { \ |
| 517 DIV2(in0, in1, in2, in3, out0, out1); \ |
| 518 DIV2(in4, in5, in6, in7, out2, out3); \ |
| 519 } |
| 520 |
| 521 /* Description : Vector Floating-Point Convert from Unsigned Integer |
| 522 Arguments : Inputs - in0, in1 |
| 523 Outputs - out0, out1 |
| 524 Details : Each unsigned word element is converted to floating-point |
| 525 */ |
| 526 #define FFINTU_W2(RTYPE, in0, in1, out0, out1) \ |
| 527 { \ |
| 528 out0 = (RTYPE)__msa_ffint_u_w((v4u32)in0); \ |
| 529 out1 = (RTYPE)__msa_ffint_u_w((v4u32)in1); \ |
| 530 } |
| 531 #define FFINTU_W2_SP(...) FFINTU_W2(v4f32, __VA_ARGS__) |
| 532 |
| 533 #define FFINTU_W4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \ |
| 534 { \ |
| 535 FFINTU_W2(RTYPE, in0, in1, out0, out1); \ |
| 536 FFINTU_W2(RTYPE, in2, in3, out2, out3); \ |
| 537 } |
| 538 #define FFINTU_W4_SP(...) FFINTU_W4(v4f32, __VA_ARGS__) |
| 539 |
| 540 /* Description : Vector Floating-Point Truncate and Convert to Unsigned Integer |
| 541 Arguments : Inputs - in0, in1 |
| 542 Outputs - out0, out1 |
| 543 Details : Each float element is truncated and converted to unsigned int |
| 544 */ |
| 545 #define FTRUNCU_W2(RTYPE, in0, in1, out0, out1) \ |
| 546 { \ |
| 547 out0 = (RTYPE)__msa_ftrunc_u_w((v4f32)in0); \ |
| 548 out1 = (RTYPE)__msa_ftrunc_u_w((v4f32)in1); \ |
| 549 } |
| 550 #define FTRUNCU_W2_UB(...) FTRUNCU_W2(v16u8, __VA_ARGS__) |
| 551 |
| 552 #define FTRUNCU_W4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \ |
| 553 { \ |
| 554 FTRUNCU_W2(RTYPE, in0, in1, out0, out1); \ |
| 555 FTRUNCU_W2(RTYPE, in2, in3, out2, out3); \ |
| 556 } |
| 557 #define FTRUNCU_W4_UB(...) FTRUNCU_W4(v16u8, __VA_ARGS__) |
| 558 |
| 214 #endif // CommonMacrosMSA_h | 559 #endif // CommonMacrosMSA_h |
| OLD | NEW |