OLD | NEW |
(Empty) | |
| 1 /* |
| 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. |
| 3 * |
| 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ |
| 10 |
| 11 #ifndef VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_ |
| 12 #define VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_ |
| 13 |
| 14 #include <msa.h> |
| 15 |
| 16 #include "./vpx_config.h" |
| 17 #include "vpx/vpx_integer.h" |
| 18 |
| 19 #if HAVE_MSA |
| 20 /* load macros */ |
| 21 #define LOAD_UB(psrc) *((const v16u8 *)(psrc)) |
| 22 #define LOAD_SB(psrc) *((const v16i8 *)(psrc)) |
| 23 #define LOAD_UH(psrc) *((const v8u16 *)(psrc)) |
| 24 #define LOAD_SH(psrc) *((const v8i16 *)(psrc)) |
| 25 #define LOAD_UW(psrc) *((const v4u32 *)(psrc)) |
| 26 #define LOAD_SW(psrc) *((const v4i32 *)(psrc)) |
| 27 #define LOAD_UD(psrc) *((const v2u64 *)(psrc)) |
| 28 #define LOAD_SD(psrc) *((const v2i64 *)(psrc)) |
| 29 |
| 30 /* store macros */ |
| 31 #define STORE_UB(vec, pdest) *((v16u8 *)(pdest)) = (vec) |
| 32 #define STORE_SB(vec, pdest) *((v16i8 *)(pdest)) = (vec) |
| 33 #define STORE_UH(vec, pdest) *((v8u16 *)(pdest)) = (vec) |
| 34 #define STORE_SH(vec, pdest) *((v8i16 *)(pdest)) = (vec) |
| 35 #define STORE_UW(vec, pdest) *((v4u32 *)(pdest)) = (vec) |
| 36 #define STORE_SW(vec, pdest) *((v4i32 *)(pdest)) = (vec) |
| 37 #define STORE_UD(vec, pdest) *((v2u64 *)(pdest)) = (vec) |
| 38 #define STORE_SD(vec, pdest) *((v2i64 *)(pdest)) = (vec) |
| 39 |
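/*
 * Illustrative usage sketch: the copy_16xN_sketch name and its arguments are
 * hypothetical, but the body shows the intended pattern for the vector
 * load/store macros above, moving one 16-byte vector per row.
 */
static void copy_16xN_sketch(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height) {
  int32_t row;

  for (row = 0; row < height; ++row) {
    const v16u8 vec = LOAD_UB(src);  /* 16 unsigned bytes from src */
    STORE_UB(vec, dst);              /* written back out to dst */
    src += src_stride;
    dst += dst_stride;
  }
}
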
| 40 #if (__mips_isa_rev >= 6) |
| 41 #define LOAD_WORD(psrc) ({ \ |
| 42 const uint8_t *src_m = (const uint8_t *)(psrc); \ |
| 43 uint32_t val_m; \ |
| 44 \ |
| 45 __asm__ __volatile__ ( \ |
| 46 "lw %[val_m], %[src_m] \n\t" \ |
| 47 \ |
| 48 : [val_m] "=r" (val_m) \ |
| 49 : [src_m] "m" (*src_m) \ |
| 50 ); \ |
| 51 \ |
| 52 val_m; \ |
| 53 }) |
| 54 |
| 55 #if (__mips == 64) |
| 56 #define LOAD_DWORD(psrc) ({ \ |
| 57 const uint8_t *src_m = (const uint8_t *)(psrc); \ |
| 58 uint64_t val_m = 0; \ |
| 59 \ |
| 60 __asm__ __volatile__ ( \ |
| 61 "ld %[val_m], %[src_m] \n\t" \ |
| 62 \ |
| 63 : [val_m] "=r" (val_m) \ |
| 64 : [src_m] "m" (*src_m) \ |
| 65 ); \ |
| 66 \ |
| 67 val_m; \ |
| 68 }) |
| 69 #else // !(__mips == 64) |
| 70 #define LOAD_DWORD(psrc) ({ \ |
| 71 const uint8_t *src1_m = (const uint8_t *)(psrc); \ |
| 72 const uint8_t *src2_m = ((const uint8_t *)(psrc)) + 4; \ |
| 73 uint32_t val0_m, val1_m; \ |
| 74 uint64_t genval_m = 0; \ |
| 75 \ |
| 76 __asm__ __volatile__ ( \ |
| 77 "lw %[val0_m], %[src1_m] \n\t" \ |
| 78 \ |
| 79 : [val0_m] "=r" (val0_m) \ |
| 80 : [src1_m] "m" (*src1_m) \ |
| 81 ); \ |
| 82 \ |
| 83 __asm__ __volatile__ ( \ |
| 84 "lw %[val1_m], %[src2_m] \n\t" \ |
| 85 \ |
| 86 : [val1_m] "=r" (val1_m) \ |
| 87 : [src2_m] "m" (*src2_m) \ |
| 88 ); \ |
| 89 \ |
| 90 genval_m = (uint64_t)(val1_m); \ |
| 91 genval_m = (uint64_t)((genval_m << 32) & 0xFFFFFFFF00000000); \ |
| 92 genval_m = (uint64_t)(genval_m | (uint64_t)val0_m); \ |
| 93 \ |
| 94 genval_m; \ |
| 95 }) |
| 96 #endif // (__mips == 64) |
| 97 #define STORE_WORD_WITH_OFFSET_1(pdst, val) { \ |
| 98 uint8_t *dst_ptr_m = ((uint8_t *)(pdst)) + 1; \ |
| 99 const uint32_t val_m = (val); \ |
| 100 \ |
| 101 __asm__ __volatile__ ( \ |
| 102 "sw %[val_m], %[dst_ptr_m] \n\t" \ |
| 103 \ |
| 104 : [dst_ptr_m] "=m" (*dst_ptr_m) \ |
| 105 : [val_m] "r" (val_m) \ |
| 106 ); \ |
| 107 } |
| 108 |
| 109 #define STORE_WORD(pdst, val) { \ |
| 110 uint8_t *dst_ptr_m = (uint8_t *)(pdst); \ |
| 111 const uint32_t val_m = (val); \ |
| 112 \ |
| 113 __asm__ __volatile__ ( \ |
| 114 "sw %[val_m], %[dst_ptr_m] \n\t" \ |
| 115 \ |
| 116 : [dst_ptr_m] "=m" (*dst_ptr_m) \ |
| 117 : [val_m] "r" (val_m) \ |
| 118 ); \ |
| 119 } |
| 120 |
| 121 #define STORE_DWORD(pdst, val) { \ |
| 122 uint8_t *dst_ptr_m = (uint8_t *)(pdst); \ |
| 123 const uint64_t val_m = (val); \ |
| 124 \ |
| 125 __asm__ __volatile__ ( \ |
| 126 "sd %[val_m], %[dst_ptr_m] \n\t" \ |
| 127 \ |
| 128 : [dst_ptr_m] "=m" (*dst_ptr_m) \ |
| 129 : [val_m] "r" (val_m) \ |
| 130 ); \ |
| 131 } |
| 132 #else // !(__mips_isa_rev >= 6) |
| 133 #define LOAD_WORD(psrc) ({ \ |
| 134 const uint8_t *src_m = (const uint8_t *)(psrc); \ |
| 135 uint32_t val_m; \ |
| 136 \ |
| 137 __asm__ __volatile__ ( \ |
| 138 "ulw %[val_m], %[src_m] \n\t" \ |
| 139 \ |
| 140 : [val_m] "=r" (val_m) \ |
| 141 : [src_m] "m" (*src_m) \ |
| 142 ); \ |
| 143 \ |
| 144 val_m; \ |
| 145 }) |
| 146 |
| 147 #if (__mips == 64) |
| 148 #define LOAD_DWORD(psrc) ({ \ |
| 149 const uint8_t *src_m = (const uint8_t *)(psrc); \ |
| 150 uint64_t val_m = 0; \ |
| 151 \ |
| 152 __asm__ __volatile__ ( \ |
| 153 "uld %[val_m], %[src_m] \n\t" \ |
| 154 \ |
| 155 : [val_m] "=r" (val_m) \ |
| 156 : [src_m] "m" (*src_m) \ |
| 157 ); \ |
| 158 \ |
| 159 val_m; \ |
| 160 }) |
| 161 #else // !(__mips == 64) |
| 162 #define LOAD_DWORD(psrc) ({ \ |
| 163 const uint8_t *src1_m = (const uint8_t *)(psrc); \ |
| 164 const uint8_t *src2_m = ((const uint8_t *)(psrc)) + 4; \ |
| 165 uint32_t val0_m, val1_m; \ |
| 166 uint64_t genval_m = 0; \ |
| 167 \ |
| 168 __asm__ __volatile__ ( \ |
| 169 "ulw %[val0_m], %[src1_m] \n\t" \ |
| 170 \ |
| 171 : [val0_m] "=r" (val0_m) \ |
| 172 : [src1_m] "m" (*src1_m) \ |
| 173 ); \ |
| 174 \ |
| 175 __asm__ __volatile__ ( \ |
| 176 "ulw %[val1_m], %[src2_m] \n\t" \ |
| 177 \ |
| 178 : [val1_m] "=r" (val1_m) \ |
| 179 : [src2_m] "m" (*src2_m) \ |
| 180 ); \ |
| 181 \ |
| 182 genval_m = (uint64_t)(val1_m); \ |
| 183 genval_m = (uint64_t)((genval_m << 32) & 0xFFFFFFFF00000000); \ |
| 184 genval_m = (uint64_t)(genval_m | (uint64_t)val0_m); \ |
| 185 \ |
| 186 genval_m; \ |
| 187 }) |
| 188 #endif // (__mips == 64) |
| 189 |
| 190 #define STORE_WORD_WITH_OFFSET_1(pdst, val) { \ |
| 191 uint8_t *dst_ptr_m = ((uint8_t *)(pdst)) + 1; \ |
| 192 const uint32_t val_m = (val); \ |
| 193 \ |
| 194 __asm__ __volatile__ ( \ |
| 195 "usw %[val_m], %[dst_ptr_m] \n\t" \ |
| 196 \ |
| 197 : [dst_ptr_m] "=m" (*dst_ptr_m) \ |
| 198 : [val_m] "r" (val_m) \ |
| 199 ); \ |
| 200 } |
| 201 |
| 202 #define STORE_WORD(pdst, val) { \ |
| 203 uint8_t *dst_ptr_m = (uint8_t *)(pdst); \ |
| 204 const uint32_t val_m = (val); \ |
| 205 \ |
| 206 __asm__ __volatile__ ( \ |
| 207 "usw %[val_m], %[dst_ptr_m] \n\t" \ |
| 208 \ |
| 209 : [dst_ptr_m] "=m" (*dst_ptr_m) \ |
| 210 : [val_m] "r" (val_m) \ |
| 211 ); \ |
| 212 } |
| 213 |
| 214 #define STORE_DWORD(pdst, val) { \ |
| 215 uint8_t *dst1_m = (uint8_t *)(pdst); \ |
| 216 uint8_t *dst2_m = ((uint8_t *)(pdst)) + 4; \ |
| 217 uint32_t val0_m, val1_m; \ |
| 218 \ |
| 219 val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF); \ |
| 220 val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ |
| 221 \ |
| 222 __asm__ __volatile__ ( \ |
| 223 "usw %[val0_m], %[dst1_m] \n\t" \ |
| 224 "usw %[val1_m], %[dst2_m] \n\t" \ |
| 225 \ |
| 226 : [dst1_m] "=m" (*dst1_m), [dst2_m] "=m" (*dst2_m) \ |
| 227 : [val0_m] "r" (val0_m), [val1_m] "r" (val1_m) \ |
| 228 ); \ |
| 229 } |
| 230 #endif // (__mips_isa_rev >= 6) |
| 231 |
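/*
 * Illustrative sketch for the scalar load/store macros above (the function
 * name and arguments are assumptions): LOAD_DWORD/STORE_DWORD move 8 bytes
 * through a general-purpose register, using unaligned instructions on
 * pre-R6 cores and plain word/doubleword accesses on MIPSr6.
 */
static void copy_8xN_sketch(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height) {
  int32_t row;

  for (row = 0; row < height; ++row) {
    const uint64_t out = LOAD_DWORD(src);
    STORE_DWORD(dst, out);
    src += src_stride;
    dst += dst_stride;
  }
}
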
| 232 #define LOAD_2VECS_UB(psrc, stride, \
| 233 val0, val1) { \
| 234 val0 = LOAD_UB((psrc) + 0 * (stride)); \
| 235 val1 = LOAD_UB((psrc) + 1 * (stride)); \
| 236 }
| 237
| 238 #define LOAD_4VECS_UB(psrc, stride, \
| 239 val0, val1, val2, val3) { \
| 240 val0 = LOAD_UB((psrc) + 0 * (stride)); \
| 241 val1 = LOAD_UB((psrc) + 1 * (stride)); \
| 242 val2 = LOAD_UB((psrc) + 2 * (stride)); \
| 243 val3 = LOAD_UB((psrc) + 3 * (stride)); \
| 244 }
| 245
| 246 #define LOAD_4VECS_SB(psrc, stride, \
| 247 val0, val1, val2, val3) { \
| 248 val0 = LOAD_SB((psrc) + 0 * (stride)); \
| 249 val1 = LOAD_SB((psrc) + 1 * (stride)); \
| 250 val2 = LOAD_SB((psrc) + 2 * (stride)); \
| 251 val3 = LOAD_SB((psrc) + 3 * (stride)); \
| 252 }
| 253 |
| 254 #define LOAD_5VECS_UB(psrc, stride, \ |
| 255 out0, out1, out2, out3, out4) { \ |
| 256 LOAD_4VECS_UB((psrc), (stride), \ |
| 257 (out0), (out1), (out2), (out3)); \ |
| 258 out4 = LOAD_UB(psrc + 4 * stride); \ |
| 259 } |
| 260 |
| 261 #define LOAD_5VECS_SB(psrc, stride, \ |
| 262 out0, out1, out2, out3, out4) { \ |
| 263 LOAD_4VECS_SB((psrc), (stride), \ |
| 264 (out0), (out1), (out2), (out3)); \ |
| 265 out4 = LOAD_SB(psrc + 4 * stride); \ |
| 266 } |
| 267 |
| 268 #define LOAD_7VECS_SB(psrc, stride, \ |
| 269 val0, val1, val2, val3, \ |
| 270 val4, val5, val6) { \ |
| 271 val0 = LOAD_SB((psrc) + 0 * (stride)); \ |
| 272 val1 = LOAD_SB((psrc) + 1 * (stride)); \ |
| 273 val2 = LOAD_SB((psrc) + 2 * (stride)); \ |
| 274 val3 = LOAD_SB((psrc) + 3 * (stride)); \ |
| 275 val4 = LOAD_SB((psrc) + 4 * (stride)); \ |
| 276 val5 = LOAD_SB((psrc) + 5 * (stride)); \ |
| 277 val6 = LOAD_SB((psrc) + 6 * (stride)); \ |
| 278 } |
| 279 |
| 280 #define LOAD_8VECS_UB(psrc, stride, \ |
| 281 out0, out1, out2, out3, \ |
| 282 out4, out5, out6, out7) { \ |
| 283 LOAD_4VECS_UB((psrc), (stride), \ |
| 284 (out0), (out1), (out2), (out3)); \ |
| 285 LOAD_4VECS_UB((psrc + 4 * stride), (stride), \ |
| 286 (out4), (out5), (out6), (out7)); \ |
| 287 } |
| 288 |
| 289 #define LOAD_8VECS_SB(psrc, stride, \ |
| 290 out0, out1, out2, out3, \ |
| 291 out4, out5, out6, out7) { \ |
| 292 LOAD_4VECS_SB((psrc), (stride), \ |
| 293 (out0), (out1), (out2), (out3)); \ |
| 294 LOAD_4VECS_SB((psrc + 4 * stride), (stride), \ |
| 295 (out4), (out5), (out6), (out7)); \ |
| 296 } |
| 297 |
| 298 #define LOAD_2VECS_SH(psrc, stride, \ |
| 299 val0, val1) { \ |
| 300 val0 = LOAD_SH((psrc) + 0 * (stride)); \ |
| 301 val1 = LOAD_SH((psrc) + 1 * (stride)); \ |
| 302 } |
| 303 |
| 304 #define LOAD_4VECS_SH(psrc, stride, \ |
| 305 val0, val1, val2, val3) { \ |
| 306 LOAD_2VECS_SH((psrc), (stride), val0, val1); \ |
| 307 LOAD_2VECS_SH((psrc + 2 * stride), (stride), val2, val3); \ |
| 308 } |
| 309 |
| 310 #define LOAD_8VECS_SH(psrc, stride, \ |
| 311 val0, val1, val2, val3, \ |
| 312 val4, val5, val6, val7) { \ |
| 313 LOAD_4VECS_SH((psrc), (stride), \ |
| 314 val0, val1, val2, val3); \ |
| 315 LOAD_4VECS_SH((psrc + 4 * stride), (stride), \ |
| 316 val4, val5, val6, val7); \ |
| 317 } |
| 318 |
| 319 #define LOAD_16VECS_SH(psrc, stride, \ |
| 320 val0, val1, val2, val3, \ |
| 321 val4, val5, val6, val7, \ |
| 322 val8, val9, val10, val11, \ |
| 323 val12, val13, val14, val15) { \ |
| 324 LOAD_8VECS_SH((psrc), (stride), \ |
| 325 val0, val1, val2, val3, \ |
| 326 val4, val5, val6, val7); \ |
| 327 LOAD_8VECS_SH((psrc + 8 * (stride)), (stride), \ |
| 328 val8, val9, val10, val11, \ |
| 329 val12, val13, val14, val15); \ |
| 330 } |
| 331 |
| 332 #define STORE_4VECS_UB(dst_out, pitch, \ |
| 333 in0, in1, in2, in3) { \ |
| 334 STORE_UB((in0), (dst_out)); \ |
| 335 STORE_UB((in1), ((dst_out) + (pitch))); \ |
| 336 STORE_UB((in2), ((dst_out) + 2 * (pitch))); \ |
| 337 STORE_UB((in3), ((dst_out) + 3 * (pitch))); \ |
| 338 } |
| 339 |
| 340 #define STORE_8VECS_UB(dst_out, pitch_in, \ |
| 341 in0, in1, in2, in3, \ |
| 342 in4, in5, in6, in7) { \ |
| 343 STORE_4VECS_UB(dst_out, pitch_in, \ |
| 344 in0, in1, in2, in3); \ |
| 345 STORE_4VECS_UB((dst_out + 4 * (pitch_in)), pitch_in, \ |
| 346 in4, in5, in6, in7); \ |
| 347 } |
| 348 |
| 349 #define VEC_INSERT_4W_UB(src, src0, src1, src2, src3) { \ |
| 350 src = (v16u8)__msa_insert_w((v4i32)(src), 0, (src0)); \ |
| 351 src = (v16u8)__msa_insert_w((v4i32)(src), 1, (src1)); \ |
| 352 src = (v16u8)__msa_insert_w((v4i32)(src), 2, (src2)); \ |
| 353 src = (v16u8)__msa_insert_w((v4i32)(src), 3, (src3)); \ |
| 354 } |
| 355 |
| 356 #define VEC_INSERT_2DW_UB(src, src0, src1) { \ |
| 357 src = (v16u8)__msa_insert_d((v2i64)(src), 0, (src0)); \ |
| 358 src = (v16u8)__msa_insert_d((v2i64)(src), 1, (src1)); \ |
| 359 } |
| 360 |
| 361 #define STORE_4VECS_SH(ptr, stride, \ |
| 362 in0, in1, in2, in3) { \ |
| 363 STORE_SH(in0, ((ptr) + 0 * stride)); \ |
| 364 STORE_SH(in1, ((ptr) + 1 * stride)); \ |
| 365 STORE_SH(in2, ((ptr) + 2 * stride)); \ |
| 366 STORE_SH(in3, ((ptr) + 3 * stride)); \ |
| 367 } |
| 368 |
| 369 #define STORE_8VECS_SH(ptr, stride, \ |
| 370 in0, in1, in2, in3, \ |
| 371 in4, in5, in6, in7) { \ |
| 372 STORE_SH(in0, ((ptr) + 0 * stride)); \ |
| 373 STORE_SH(in1, ((ptr) + 1 * stride)); \ |
| 374 STORE_SH(in2, ((ptr) + 2 * stride)); \ |
| 375 STORE_SH(in3, ((ptr) + 3 * stride)); \ |
| 376 STORE_SH(in4, ((ptr) + 4 * stride)); \ |
| 377 STORE_SH(in5, ((ptr) + 5 * stride)); \ |
| 378 STORE_SH(in6, ((ptr) + 6 * stride)); \ |
| 379 STORE_SH(in7, ((ptr) + 7 * stride)); \ |
| 380 } |
| 381 |
| 382 #define CLIP_UNSIGNED_CHAR_H(in) ({ \ |
| 383 v8i16 max_m = __msa_ldi_h(255); \ |
| 384 v8i16 out_m; \ |
| 385 \ |
| 386 out_m = __msa_maxi_s_h((v8i16)(in), 0); \ |
| 387 out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \ |
| 388 out_m; \ |
| 389 }) |
| 390 |
| 391 /* halfword 8x8 transpose macro */ |
| 392 #define TRANSPOSE8x8_H_SH(in0, in1, in2, in3, \ |
| 393 in4, in5, in6, in7, \ |
| 394 out0, out1, out2, out3, \ |
| 395 out4, out5, out6, out7) { \ |
| 396 v8i16 s0_m, s1_m; \ |
| 397 v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ |
| 398 v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ |
| 399 \ |
| 400 s0_m = __msa_ilvr_h((v8i16)(in6), (v8i16)(in4)); \ |
| 401 s1_m = __msa_ilvr_h((v8i16)(in7), (v8i16)(in5)); \ |
| 402 tmp0_m = __msa_ilvr_h((v8i16)s1_m, (v8i16)s0_m); \ |
| 403 tmp1_m = __msa_ilvl_h((v8i16)s1_m, (v8i16)s0_m); \ |
| 404 \ |
| 405 s0_m = __msa_ilvl_h((v8i16)(in6), (v8i16)(in4)); \ |
| 406 s1_m = __msa_ilvl_h((v8i16)(in7), (v8i16)(in5)); \ |
| 407 tmp2_m = __msa_ilvr_h((v8i16)s1_m, (v8i16)s0_m); \ |
| 408 tmp3_m = __msa_ilvl_h((v8i16)s1_m, (v8i16)s0_m); \ |
| 409 \ |
| 410 s0_m = __msa_ilvr_h((v8i16)(in2), (v8i16)(in0)); \ |
| 411 s1_m = __msa_ilvr_h((v8i16)(in3), (v8i16)(in1)); \ |
| 412 tmp4_m = __msa_ilvr_h((v8i16)s1_m, (v8i16)s0_m); \ |
| 413 tmp5_m = __msa_ilvl_h((v8i16)s1_m, (v8i16)s0_m); \ |
| 414 \ |
| 415 s0_m = __msa_ilvl_h((v8i16)(in2), (v8i16)(in0)); \ |
| 416 s1_m = __msa_ilvl_h((v8i16)(in3), (v8i16)(in1)); \ |
| 417 tmp6_m = __msa_ilvr_h((v8i16)s1_m, (v8i16)s0_m); \ |
| 418 tmp7_m = __msa_ilvl_h((v8i16)s1_m, (v8i16)s0_m); \ |
| 419 \ |
| 420 out0 = (v8i16)__msa_pckev_d((v2i64)tmp0_m, (v2i64)tmp4_m); \ |
| 421 out1 = (v8i16)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \ |
| 422 out2 = (v8i16)__msa_pckev_d((v2i64)tmp1_m, (v2i64)tmp5_m); \ |
| 423 out3 = (v8i16)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \ |
| 424 out4 = (v8i16)__msa_pckev_d((v2i64)tmp2_m, (v2i64)tmp6_m); \ |
| 425 out5 = (v8i16)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \ |
| 426 out6 = (v8i16)__msa_pckev_d((v2i64)tmp3_m, (v2i64)tmp7_m); \ |
| 427 out7 = (v8i16)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \ |
| 428 } |
| 429 |
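/*
 * A sketch of how the transpose is typically driven (the function name and
 * the 16-bit buffers are assumptions; strides are in int16_t elements):
 * load eight rows, transpose them in registers, store the eight columns.
 */
static void transpose_8x8_sketch(const int16_t *src, int32_t src_stride,
                                 int16_t *dst, int32_t dst_stride) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 out0, out1, out2, out3, out4, out5, out6, out7;

  LOAD_8VECS_SH(src, src_stride, in0, in1, in2, in3, in4, in5, in6, in7);
  TRANSPOSE8x8_H_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                    out0, out1, out2, out3, out4, out5, out6, out7);
  STORE_8VECS_SH(dst, dst_stride, out0, out1, out2, out3,
                 out4, out5, out6, out7);
}
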
| 430 /* interleave macros */ |
| 431 /* note: no in-place support (output vectors must not alias the inputs) */ |
| 432 #define ILV_B_LRLR_UB(in0, in1, in2, in3, \ |
| 433 out0, out1, out2, out3) { \ |
| 434 out0 = (v16u8)__msa_ilvl_b((v16i8)(in1), (v16i8)(in0)); \ |
| 435 out1 = (v16u8)__msa_ilvr_b((v16i8)(in1), (v16i8)(in0)); \ |
| 436 out2 = (v16u8)__msa_ilvl_b((v16i8)(in3), (v16i8)(in2)); \ |
| 437 out3 = (v16u8)__msa_ilvr_b((v16i8)(in3), (v16i8)(in2)); \ |
| 438 } |
| 439 |
| 440 #define ILV_H_LRLR_SH(in0, in1, in2, in3, \ |
| 441 out0, out1, out2, out3) { \ |
| 442 out0 = __msa_ilvl_h((v8i16)(in1), (v8i16)(in0)); \ |
| 443 out1 = __msa_ilvr_h((v8i16)(in1), (v8i16)(in0)); \ |
| 444 out2 = __msa_ilvl_h((v8i16)(in3), (v8i16)(in2)); \ |
| 445 out3 = __msa_ilvr_h((v8i16)(in3), (v8i16)(in2)); \ |
| 446 } |
| 447 |
| 448 #define ILV_H_LR_SH(in0, in1, out0, out1) { \ |
| 449 out0 = __msa_ilvl_h((v8i16)(in1), (v8i16)(in0)); \ |
| 450 out1 = __msa_ilvr_h((v8i16)(in1), (v8i16)(in0)); \ |
| 451 } |
| 452 |
| 453 #define ILVR_B_2VECS_UB(in0_r, in1_r, in0_l, in1_l, \ |
| 454 out0, out1) { \ |
| 455 out0 = (v16u8)__msa_ilvr_b((v16i8)(in0_l), (v16i8)(in0_r)); \ |
| 456 out1 = (v16u8)__msa_ilvr_b((v16i8)(in1_l), (v16i8)(in1_r)); \ |
| 457 } |
| 458 |
| 459 #define ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ |
| 460 out0, out1) { \ |
| 461 out0 = __msa_ilvr_b((v16i8)(in0_l), (v16i8)(in0_r)); \ |
| 462 out1 = __msa_ilvr_b((v16i8)(in1_l), (v16i8)(in1_r)); \ |
| 463 } |
| 464 |
| 465 #define ILVR_B_4VECS_UB(in0_r, in1_r, in2_r, in3_r, \ |
| 466 in0_l, in1_l, in2_l, in3_l, \ |
| 467 out0, out1, out2, out3) { \ |
| 468 ILVR_B_2VECS_UB(in0_r, in1_r, in0_l, in1_l, \ |
| 469 out0, out1); \ |
| 470 ILVR_B_2VECS_UB(in2_r, in3_r, in2_l, in3_l, \ |
| 471 out2, out3); \ |
| 472 } |
| 473 |
| 474 #define ILVR_B_4VECS_SB(in0_r, in1_r, in2_r, in3_r, \ |
| 475 in0_l, in1_l, in2_l, in3_l, \ |
| 476 out0, out1, out2, out3) { \ |
| 477 ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ |
| 478 out0, out1); \ |
| 479 ILVR_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \ |
| 480 out2, out3); \ |
| 481 } |
| 482 |
| 483 #define ILVR_B_6VECS_SB(in0_r, in1_r, in2_r, \ |
| 484 in3_r, in4_r, in5_r, \ |
| 485 in0_l, in1_l, in2_l, \ |
| 486 in3_l, in4_l, in5_l, \ |
| 487 out0, out1, out2, \ |
| 488 out3, out4, out5) { \ |
| 489 ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ |
| 490 out0, out1); \ |
| 491 ILVR_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \ |
| 492 out2, out3); \ |
| 493 ILVR_B_2VECS_SB(in4_r, in5_r, in4_l, in5_l, \ |
| 494 out4, out5); \ |
| 495 } |
| 496 |
| 497 #define ILVR_B_8VECS_SB(in0_r, in1_r, in2_r, in3_r, \ |
| 498 in4_r, in5_r, in6_r, in7_r, \ |
| 499 in0_l, in1_l, in2_l, in3_l, \ |
| 500 in4_l, in5_l, in6_l, in7_l, \ |
| 501 out0, out1, out2, out3, \ |
| 502 out4, out5, out6, out7) { \ |
| 503 ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ |
| 504 out0, out1); \ |
| 505 ILVR_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \ |
| 506 out2, out3); \ |
| 507 ILVR_B_2VECS_SB(in4_r, in5_r, in4_l, in5_l, \ |
| 508 out4, out5); \ |
| 509 ILVR_B_2VECS_SB(in6_r, in7_r, in6_l, in7_l, \ |
| 510 out6, out7); \ |
| 511 } |
| 512 |
| 513 #define ILVL_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ |
| 514 out0, out1) { \ |
| 515 out0 = __msa_ilvl_b((v16i8)(in0_l), (v16i8)(in0_r)); \ |
| 516 out1 = __msa_ilvl_b((v16i8)(in1_l), (v16i8)(in1_r)); \ |
| 517 } |
| 518 |
| 519 #define ILVL_B_4VECS_SB(in0_r, in1_r, in2_r, in3_r, \ |
| 520 in0_l, in1_l, in2_l, in3_l, \ |
| 521 out0, out1, out2, out3) { \ |
| 522 ILVL_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ |
| 523 out0, out1); \ |
| 524 ILVL_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \ |
| 525 out2, out3); \ |
| 526 } |
| 527 |
| 528 #define ILVL_B_6VECS_SB(in0_r, in1_r, in2_r, \ |
| 529 in3_r, in4_r, in5_r, \ |
| 530 in0_l, in1_l, in2_l, \ |
| 531 in3_l, in4_l, in5_l, \ |
| 532 out0, out1, out2, \ |
| 533 out3, out4, out5) { \ |
| 534 ILVL_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ |
| 535 out0, out1); \ |
| 536 ILVL_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \ |
| 537 out2, out3); \ |
| 538 ILVL_B_2VECS_SB(in4_r, in5_r, in4_l, in5_l, \ |
| 539 out4, out5); \ |
| 540 } |
| 541 |
| 542 #define ILVR_D_2VECS_SB(out0, in0_l, in0_r, \ |
| 543 out1, in1_l, in1_r) { \ |
| 544 out0 = (v16i8)__msa_ilvr_d((v2i64)(in0_l), (v2i64)(in0_r)); \ |
| 545 out1 = (v16i8)__msa_ilvr_d((v2i64)(in1_l), (v2i64)(in1_r)); \ |
| 546 } |
| 547 |
| 548 #define ILVR_D_3VECS_SB(out0, in0_l, in0_r, \ |
| 549 out1, in1_l, in1_r, \ |
| 550 out2, in2_l, in2_r) { \ |
| 551 ILVR_D_2VECS_SB(out0, in0_l, in0_r, \ |
| 552 out1, in1_l, in1_r); \ |
| 553 out2 = (v16i8)__msa_ilvr_d((v2i64)(in2_l), (v2i64)(in2_r)); \ |
| 554 } |
| 555 |
| 556 #define ILVR_D_4VECS_SB(out0, in0_l, in0_r, \ |
| 557 out1, in1_l, in1_r, \ |
| 558 out2, in2_l, in2_r, \ |
| 559 out3, in3_l, in3_r) { \ |
| 560 ILVR_D_2VECS_SB(out0, in0_l, in0_r, \ |
| 561 out1, in1_l, in1_r); \ |
| 562 ILVR_D_2VECS_SB(out2, in2_l, in2_r, \ |
| 563 out3, in3_l, in3_r); \ |
| 564 } |
| 565 |
| 566 #define DOTP_S_W_4VECS_SW(m0, c0, m1, c1, \ |
| 567 m2, c2, m3, c3, \ |
| 568 out0, out1, out2, out3) { \ |
| 569 out0 = __msa_dotp_s_w((v8i16)(m0), (v8i16)(c0)); \ |
| 570 out1 = __msa_dotp_s_w((v8i16)(m1), (v8i16)(c1)); \ |
| 571 out2 = __msa_dotp_s_w((v8i16)(m2), (v8i16)(c2)); \ |
| 572 out3 = __msa_dotp_s_w((v8i16)(m3), (v8i16)(c3)); \ |
| 573 } |
| 574 |
| 575 #define PCKEV_H_2VECS_SH(in0_l, in0_r, in1_l, in1_r, \ |
| 576 out0, out1) { \ |
| 577 out0 = __msa_pckev_h((v8i16)(in0_l), (v8i16)(in0_r)); \ |
| 578 out1 = __msa_pckev_h((v8i16)(in1_l), (v8i16)(in1_r)); \ |
| 579 } |
| 580 |
| 581 #define XORI_B_2VECS_UB(val0, val1, \ |
| 582 out0, out1, xor_val) { \ |
| 583 out0 = __msa_xori_b((v16u8)(val0), (xor_val)); \ |
| 584 out1 = __msa_xori_b((v16u8)(val1), (xor_val)); \ |
| 585 } |
| 586 |
| 587 #define XORI_B_2VECS_SB(val0, val1, \ |
| 588 out0, out1, xor_val) { \ |
| 589 out0 = (v16i8)__msa_xori_b((v16u8)(val0), (xor_val)); \ |
| 590 out1 = (v16i8)__msa_xori_b((v16u8)(val1), (xor_val)); \ |
| 591 } |
| 592 |
| 593 #define XORI_B_3VECS_SB(val0, val1, val2, \ |
| 594 out0, out1, out2, xor_val) { \ |
| 595 XORI_B_2VECS_SB(val0, val1, out0, out1, xor_val); \ |
| 596 out2 = (v16i8)__msa_xori_b((v16u8)(val2), (xor_val)); \ |
| 597 } |
| 598 |
| 599 #define XORI_B_4VECS_UB(val0, val1, val2, val3, \ |
| 600 out0, out1, out2, out3, \ |
| 601 xor_val) { \ |
| 602 XORI_B_2VECS_UB(val0, val1, out0, out1, xor_val); \ |
| 603 XORI_B_2VECS_UB(val2, val3, out2, out3, xor_val); \ |
| 604 } |
| 605 |
| 606 #define XORI_B_4VECS_SB(val0, val1, val2, val3, \ |
| 607 out0, out1, out2, out3, \ |
| 608 xor_val) { \ |
| 609 XORI_B_2VECS_SB(val0, val1, out0, out1, xor_val); \ |
| 610 XORI_B_2VECS_SB(val2, val3, out2, out3, xor_val); \ |
| 611 } |
| 612 |
| 613 #define XORI_B_7VECS_SB(val0, val1, val2, val3, \ |
| 614 val4, val5, val6, \ |
| 615 out0, out1, out2, out3, \ |
| 616 out4, out5, out6, \ |
| 617 xor_val) { \ |
| 618 XORI_B_4VECS_SB(val0, val1, val2, val3, \ |
| 619 out0, out1, out2, out3, xor_val); \ |
| 620 XORI_B_3VECS_SB(val4, val5, val6, \ |
| 621 out4, out5, out6, xor_val); \ |
| 622 } |
| 623 |
| 624 #define SRARI_H_4VECS_UH(val0, val1, val2, val3, \ |
| 625 out0, out1, out2, out3, \ |
| 626 shift_right_val) { \ |
| 627 out0 = (v8u16)__msa_srari_h((v8i16)(val0), (shift_right_val)); \ |
| 628 out1 = (v8u16)__msa_srari_h((v8i16)(val1), (shift_right_val)); \ |
| 629 out2 = (v8u16)__msa_srari_h((v8i16)(val2), (shift_right_val)); \ |
| 630 out3 = (v8u16)__msa_srari_h((v8i16)(val3), (shift_right_val)); \ |
| 631 } |
| 632 |
| 633 #define SRARI_H_4VECS_SH(val0, val1, val2, val3, \ |
| 634 out0, out1, out2, out3, \ |
| 635 shift_right_val) { \ |
| 636 out0 = __msa_srari_h((v8i16)(val0), (shift_right_val)); \ |
| 637 out1 = __msa_srari_h((v8i16)(val1), (shift_right_val)); \ |
| 638 out2 = __msa_srari_h((v8i16)(val2), (shift_right_val)); \ |
| 639 out3 = __msa_srari_h((v8i16)(val3), (shift_right_val)); \ |
| 640 } |
| 641 |
| 642 #define SRARI_W_4VECS_SW(val0, val1, val2, val3, \ |
| 643 out0, out1, out2, out3, \ |
| 644 shift_right_val) { \ |
| 645 out0 = __msa_srari_w((v4i32)(val0), (shift_right_val)); \ |
| 646 out1 = __msa_srari_w((v4i32)(val1), (shift_right_val)); \ |
| 647 out2 = __msa_srari_w((v4i32)(val2), (shift_right_val)); \ |
| 648 out3 = __msa_srari_w((v4i32)(val3), (shift_right_val)); \ |
| 649 } |
| 650 |
| 651 #define SRARI_SATURATE_UNSIGNED_H(input, right_shift_val, sat_val) ({ \ |
| 652 v8u16 out_m; \ |
| 653 \ |
| 654 out_m = (v8u16)__msa_srari_h((v8i16)(input), (right_shift_val)); \ |
| 655 out_m = __msa_sat_u_h(out_m, (sat_val)); \ |
| 656 out_m; \ |
| 657 }) |
| 658 |
| 659 #define SRARI_SATURATE_SIGNED_H(input, right_shift_val, sat_val) ({ \ |
| 660 v8i16 out_m; \ |
| 661 \ |
| 662 out_m = __msa_srari_h((v8i16)(input), (right_shift_val)); \ |
| 663 out_m = __msa_sat_s_h(out_m, (sat_val)); \ |
| 664 out_m; \ |
| 665 }) |
| 666 |
| 667 #define PCKEV_2B_XORI128_STORE_4_BYTES_4(in1, in2, \ |
| 668 pdst, stride) { \ |
| 669 uint32_t out0_m, out1_m, out2_m, out3_m; \ |
| 670 v16i8 tmp0_m; \ |
| 671 uint8_t *dst_m = (uint8_t *)(pdst); \ |
| 672 \ |
| 673 tmp0_m = __msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \ |
| 674 tmp0_m = (v16i8)__msa_xori_b((v16u8)tmp0_m, 128); \ |
| 675 \ |
| 676 out0_m = __msa_copy_u_w((v4i32)tmp0_m, 0); \ |
| 677 out1_m = __msa_copy_u_w((v4i32)tmp0_m, 1); \ |
| 678 out2_m = __msa_copy_u_w((v4i32)tmp0_m, 2); \ |
| 679 out3_m = __msa_copy_u_w((v4i32)tmp0_m, 3); \ |
| 680 \ |
| 681 STORE_WORD(dst_m, out0_m); \ |
| 682 dst_m += stride; \ |
| 683 STORE_WORD(dst_m, out1_m); \ |
| 684 dst_m += stride; \ |
| 685 STORE_WORD(dst_m, out2_m); \ |
| 686 dst_m += stride; \ |
| 687 STORE_WORD(dst_m, out3_m); \ |
| 688 } |
| 689 |
| 690 #define PCKEV_B_4_XORI128_STORE_8_BYTES_4(in1, in2, \ |
| 691 in3, in4, \ |
| 692 pdst, stride) { \ |
| 693 uint64_t out0_m, out1_m, out2_m, out3_m; \ |
| 694 v16i8 tmp0_m, tmp1_m; \ |
| 695 uint8_t *dst_m = (uint8_t *)(pdst); \ |
| 696 \ |
| 697 tmp0_m = __msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \ |
| 698 tmp1_m = __msa_pckev_b((v16i8)(in4), (v16i8)(in3)); \ |
| 699 \ |
| 700 tmp0_m = (v16i8)__msa_xori_b((v16u8)tmp0_m, 128); \ |
| 701 tmp1_m = (v16i8)__msa_xori_b((v16u8)tmp1_m, 128); \ |
| 702 \ |
| 703 out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \ |
| 704 out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \ |
| 705 out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \ |
| 706 out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \ |
| 707 \ |
| 708 STORE_DWORD(dst_m, out0_m); \ |
| 709 dst_m += stride; \ |
| 710 STORE_DWORD(dst_m, out1_m); \ |
| 711 dst_m += stride; \ |
| 712 STORE_DWORD(dst_m, out2_m); \ |
| 713 dst_m += stride; \ |
| 714 STORE_DWORD(dst_m, out3_m); \ |
| 715 } |
| 716 |
| 717 /* Only for signed vecs */ |
| 718 #define PCKEV_B_XORI128_STORE_VEC(in1, in2, pdest) { \ |
| 719 v16i8 tmp_m; \ |
| 720 \ |
| 721 tmp_m = __msa_pckev_b((v16i8)(in1), (v16i8)(in2)); \ |
| 722 tmp_m = (v16i8)__msa_xori_b((v16u8)tmp_m, 128); \ |
| 723 STORE_SB(tmp_m, (pdest)); \ |
| 724 } |
| 725 |
| 726 /* Only for signed vecs */ |
| 727 #define PCKEV_B_4_XORI128_AVG_STORE_8_BYTES_4(in1, dst0, \ |
| 728 in2, dst1, \ |
| 729 in3, dst2, \ |
| 730 in4, dst3, \ |
| 731 pdst, stride) { \ |
| 732 uint64_t out0_m, out1_m, out2_m, out3_m; \ |
| 733 v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ |
| 734 uint8_t *dst_m = (uint8_t *)(pdst); \ |
| 735 \ |
| 736 tmp0_m = (v16u8)__msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \ |
| 737 tmp1_m = (v16u8)__msa_pckev_b((v16i8)(in4), (v16i8)(in3)); \ |
| 738 \ |
| 739 tmp2_m = (v16u8)__msa_ilvr_d((v2i64)(dst1), (v2i64)(dst0)); \ |
| 740 tmp3_m = (v16u8)__msa_ilvr_d((v2i64)(dst3), (v2i64)(dst2)); \ |
| 741 \ |
| 742 tmp0_m = __msa_xori_b(tmp0_m, 128); \ |
| 743 tmp1_m = __msa_xori_b(tmp1_m, 128); \ |
| 744 \ |
| 745 tmp0_m = __msa_aver_u_b(tmp0_m, tmp2_m); \ |
| 746 tmp1_m = __msa_aver_u_b(tmp1_m, tmp3_m); \ |
| 747 \ |
| 748 out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \ |
| 749 out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \ |
| 750 out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \ |
| 751 out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \ |
| 752 \ |
| 753 STORE_DWORD(dst_m, out0_m); \ |
| 754 dst_m += stride; \ |
| 755 STORE_DWORD(dst_m, out1_m); \ |
| 756 dst_m += stride; \ |
| 757 STORE_DWORD(dst_m, out2_m); \ |
| 758 dst_m += stride; \ |
| 759 STORE_DWORD(dst_m, out3_m); \ |
| 760 } |
| 761 |
| 762 /* Only for signed vecs */ |
| 763 #define PCKEV_B_XORI128_AVG_STORE_VEC(in1, in2, dst, pdest) { \ |
| 764 v16u8 tmp_m; \ |
| 765 \ |
| 766 tmp_m = (v16u8)__msa_pckev_b((v16i8)(in1), (v16i8)(in2)); \ |
| 767 tmp_m = __msa_xori_b(tmp_m, 128); \ |
| 768 tmp_m = __msa_aver_u_b(tmp_m, (v16u8)(dst)); \ |
| 769 STORE_UB(tmp_m, (pdest)); \ |
| 770 } |
| 771 |
| 772 #define PCKEV_B_STORE_8_BYTES_4(in1, in2, in3, in4, \ |
| 773 pdst, stride) { \ |
| 774 uint64_t out0_m, out1_m, out2_m, out3_m; \ |
| 775 v16i8 tmp0_m, tmp1_m; \ |
| 776 uint8_t *dst_m = (uint8_t *)(pdst); \ |
| 777 \ |
| 778 tmp0_m = __msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \ |
| 779 tmp1_m = __msa_pckev_b((v16i8)(in4), (v16i8)(in3)); \ |
| 780 \ |
| 781 out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \ |
| 782 out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \ |
| 783 out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \ |
| 784 out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \ |
| 785 \ |
| 786 STORE_DWORD(dst_m, out0_m); \ |
| 787 dst_m += stride; \ |
| 788 STORE_DWORD(dst_m, out1_m); \ |
| 789 dst_m += stride; \ |
| 790 STORE_DWORD(dst_m, out2_m); \ |
| 791 dst_m += stride; \ |
| 792 STORE_DWORD(dst_m, out3_m); \ |
| 793 } |
| 794 |
| 795 /* Only for unsigned vecs */ |
| 796 #define PCKEV_B_STORE_VEC(in1, in2, pdest) { \ |
| 797 v16i8 tmp_m; \ |
| 798 \ |
| 799 tmp_m = __msa_pckev_b((v16i8)(in1), (v16i8)(in2)); \ |
| 800 STORE_SB(tmp_m, (pdest)); \ |
| 801 } |
| 802 |
| 803 #define PCKEV_B_AVG_STORE_8_BYTES_4(in1, dst0, in2, dst1, \ |
| 804 in3, dst2, in4, dst3, \ |
| 805 pdst, stride) { \ |
| 806 uint64_t out0_m, out1_m, out2_m, out3_m; \ |
| 807 v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ |
| 808 uint8_t *dst_m = (uint8_t *)(pdst); \ |
| 809 \ |
| 810 tmp0_m = (v16u8)__msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \ |
| 811 tmp1_m = (v16u8)__msa_pckev_b((v16i8)(in4), (v16i8)(in3)); \ |
| 812 \ |
| 813 tmp2_m = (v16u8)__msa_pckev_d((v2i64)(dst1), (v2i64)(dst0)); \ |
| 814 tmp3_m = (v16u8)__msa_pckev_d((v2i64)(dst3), (v2i64)(dst2)); \ |
| 815 \ |
| 816 tmp0_m = __msa_aver_u_b(tmp0_m, tmp2_m); \ |
| 817 tmp1_m = __msa_aver_u_b(tmp1_m, tmp3_m); \ |
| 818 \ |
| 819 out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \ |
| 820 out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \ |
| 821 out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \ |
| 822 out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \ |
| 823 \ |
| 824 STORE_DWORD(dst_m, out0_m); \ |
| 825 dst_m += stride; \ |
| 826 STORE_DWORD(dst_m, out1_m); \ |
| 827 dst_m += stride; \ |
| 828 STORE_DWORD(dst_m, out2_m); \ |
| 829 dst_m += stride; \ |
| 830 STORE_DWORD(dst_m, out3_m); \ |
| 831 } |
| 832 |
| 833 #define PCKEV_B_AVG_STORE_VEC(in1, in2, dst, pdest) { \ |
| 834 v16u8 tmp_m; \ |
| 835 \ |
| 836 tmp_m = (v16u8)__msa_pckev_b((v16i8)(in1), (v16i8)(in2)); \ |
| 837 tmp_m = __msa_aver_u_b(tmp_m, (v16u8)(dst)); \ |
| 838 STORE_UB(tmp_m, (pdest)); \ |
| 839 } |
| 840 |
| 841 /* Generic: works on vector types as well as general-purpose (scalar) operands */ |
| 842 #define BUTTERFLY_4(in0, in1, in2, in3, \ |
| 843 out0, out1, out2, out3) { \ |
| 844 out0 = (in0) + (in3); \ |
| 845 out1 = (in1) + (in2); \ |
| 846 \ |
| 847 out2 = (in1) - (in2); \ |
| 848 out3 = (in0) - (in3); \ |
| 849 } |
| 850 |
| 851 /* Generic: works on vector types as well as general-purpose (scalar) operands */ |
| 852 #define BUTTERFLY_8(in0, in1, in2, in3, \ |
| 853 in4, in5, in6, in7, \ |
| 854 out0, out1, out2, out3, \ |
| 855 out4, out5, out6, out7) { \ |
| 856 out0 = (in0) + (in7); \ |
| 857 out1 = (in1) + (in6); \ |
| 858 out2 = (in2) + (in5); \ |
| 859 out3 = (in3) + (in4); \ |
| 860 \ |
| 861 out4 = (in3) - (in4); \ |
| 862 out5 = (in2) - (in5); \ |
| 863 out6 = (in1) - (in6); \ |
| 864 out7 = (in0) - (in7); \ |
| 865 } |
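
/*
 * Illustrative sketch (variable and function names are hypothetical): since
 * the butterfly macros expand to plain additions and subtractions, the same
 * macro serves SIMD vectors and ordinary scalars alike, e.g. one stage of a
 * 4-point transform.
 */
static void butterfly_4_sketch(const v8i16 *vec_in, v8i16 *vec_out,
                               const int32_t *gp_in, int32_t *gp_out) {
  /* vector form: each 16-bit lane gets in0+in3, in1+in2, in1-in2, in0-in3 */
  BUTTERFLY_4(vec_in[0], vec_in[1], vec_in[2], vec_in[3],
              vec_out[0], vec_out[1], vec_out[2], vec_out[3]);
  /* the same macro applied to general-purpose (scalar) operands */
  BUTTERFLY_4(gp_in[0], gp_in[1], gp_in[2], gp_in[3],
              gp_out[0], gp_out[1], gp_out[2], gp_out[3]);
}
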
| 866 #endif /* HAVE_MSA */ |
| 867 #endif /* VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_ */ |