| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 #ifndef VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_ | 11 #ifndef VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_ |
| 12 #define VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_ | 12 #define VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_ |
| 13 | 13 |
| 14 #include <msa.h> | 14 #include <msa.h> |
| 15 | 15 |
| 16 #include "./vpx_config.h" | 16 #include "./vpx_config.h" |
| 17 #include "vpx/vpx_integer.h" | 17 #include "vpx/vpx_integer.h" |
| 18 | 18 |
| 19 #if HAVE_MSA | 19 #define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc)) |
| 20 /* load macros */ | 20 #define LD_UB(...) LD_B(v16u8, __VA_ARGS__) |
| 21 #define LOAD_UB(psrc) *((const v16u8 *)(psrc)) | 21 #define LD_SB(...) LD_B(v16i8, __VA_ARGS__) |
| 22 #define LOAD_SB(psrc) *((const v16i8 *)(psrc)) | 22 |
| 23 #define LOAD_UH(psrc) *((const v8u16 *)(psrc)) | 23 #define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc)) |
| 24 #define LOAD_SH(psrc) *((const v8i16 *)(psrc)) | 24 #define LD_UH(...) LD_H(v8u16, __VA_ARGS__) |
| 25 #define LOAD_UW(psrc) *((const v4u32 *)(psrc)) | 25 #define LD_SH(...) LD_H(v8i16, __VA_ARGS__) |
| 26 #define LOAD_SW(psrc) *((const v4i32 *)(psrc)) | 26 |
| 27 #define LOAD_UD(psrc) *((const v2u64 *)(psrc)) | 27 #define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc)) |
| 28 #define LOAD_SD(psrc) *((const v2i64 *)(psrc)) | 28 #define LD_SW(...) LD_W(v4i32, __VA_ARGS__) |
| 29 | 29 |
| 30 /* store macros */ | 30 #define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) |
| 31 #define STORE_UB(vec, pdest) *((v16u8 *)(pdest)) = (vec) | 31 #define ST_UB(...) ST_B(v16u8, __VA_ARGS__) |
| 32 #define STORE_SB(vec, pdest) *((v16i8 *)(pdest)) = (vec) | 32 #define ST_SB(...) ST_B(v16i8, __VA_ARGS__) |
| 33 #define STORE_UH(vec, pdest) *((v8u16 *)(pdest)) = (vec) | 33 |
| 34 #define STORE_SH(vec, pdest) *((v8i16 *)(pdest)) = (vec) | 34 #define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) |
| 35 #define STORE_UW(vec, pdest) *((v4u32 *)(pdest)) = (vec) | 35 #define ST_SH(...) ST_H(v8i16, __VA_ARGS__) |
| 36 #define STORE_SW(vec, pdest) *((v4i32 *)(pdest)) = (vec) | 36 |
| 37 #define STORE_UD(vec, pdest) *((v2u64 *)(pdest)) = (vec) | 37 #define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) |
| 38 #define STORE_SD(vec, pdest) *((v2i64 *)(pdest)) = (vec) | 38 #define ST_SW(...) ST_W(v4i32, __VA_ARGS__) |
| 39 | 39 |
| 40 #if (__mips_isa_rev >= 6) | 40 #if (__mips_isa_rev >= 6) |
| 41 #define LOAD_WORD(psrc) ({ \ | 41 #define LH(psrc) ({ \ |
| 42 const uint8_t *src_m = (const uint8_t *)(psrc); \ | 42 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ |
| 43 uint32_t val_m; \ | 43 uint16_t val_m; \ |
| 44 \ | 44 \ |
| 45 __asm__ __volatile__ ( \ | 45 __asm__ __volatile__ ( \ |
| 46 "lw %[val_m], %[src_m] \n\t" \ | 46 "lh %[val_m], %[psrc_m] \n\t" \ |
| 47 \ | 47 \ |
| 48 : [val_m] "=r" (val_m) \ | 48 : [val_m] "=r" (val_m) \ |
| 49 : [src_m] "m" (*src_m) \ | 49 : [psrc_m] "m" (*psrc_m) \ |
| 50 ); \ | 50 ); \ |
| 51 \ | 51 \ |
| 52 val_m; \ | 52 val_m; \ |
| 53 }) | 53 }) |
| 54 | 54 |
| 55 #define LW(psrc) ({ \ |
| 56 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ |
| 57 uint32_t val_m; \ |
| 58 \ |
| 59 __asm__ __volatile__ ( \ |
| 60 "lw %[val_m], %[psrc_m] \n\t" \ |
| 61 \ |
| 62 : [val_m] "=r" (val_m) \ |
| 63 : [psrc_m] "m" (*psrc_m) \ |
| 64 ); \ |
| 65 \ |
| 66 val_m; \ |
| 67 }) |
| 68 |
| 55 #if (__mips == 64) | 69 #if (__mips == 64) |
| 56 #define LOAD_DWORD(psrc) ({ \ | 70 #define LD(psrc) ({ \ |
| 57 const uint8_t *src_m = (const uint8_t *)(psrc); \ | 71 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ |
| 58 uint64_t val_m = 0; \ | 72 uint64_t val_m = 0; \ |
| 59 \ | 73 \ |
| 60 __asm__ __volatile__ ( \ | 74 __asm__ __volatile__ ( \ |
| 61 "ld %[val_m], %[src_m] \n\t" \ | 75 "ld %[val_m], %[psrc_m] \n\t" \ |
| 62 \ | 76 \ |
| 63 : [val_m] "=r" (val_m) \ | 77 : [val_m] "=r" (val_m) \ |
| 64 : [src_m] "m" (*src_m) \ | 78 : [psrc_m] "m" (*psrc_m) \ |
| 65 ); \ | 79 ); \ |
| 66 \ | 80 \ |
| 67 val_m; \ | 81 val_m; \ |
| 68 }) | 82 }) |
| 69 #else // !(__mips == 64) | 83 #else // !(__mips == 64) |
| 70 #define LOAD_DWORD(psrc) ({ \ | 84 #define LD(psrc) ({ \ |
| 71 const uint8_t *src1_m = (const uint8_t *)(psrc); \ | 85 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ |
| 72 const uint8_t *src2_m = ((const uint8_t *)(psrc)) + 4; \ | 86 uint32_t val0_m, val1_m; \ |
| 73 uint32_t val0_m, val1_m; \ | 87 uint64_t val_m = 0; \ |
| 74 uint64_t genval_m = 0; \ | 88 \ |
| 75 \ | 89 val0_m = LW(psrc_m); \ |
| 76 __asm__ __volatile__ ( \ | 90 val1_m = LW(psrc_m + 4); \ |
| 77 "lw %[val0_m], %[src1_m] \n\t" \ | 91 \ |
| 78 \ | 92 val_m = (uint64_t)(val1_m); \ |
| 79 : [val0_m] "=r" (val0_m) \ | 93 val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ |
| 80 : [src1_m] "m" (*src1_m) \ | 94 val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ |
| 81 ); \ | 95 \ |
| 82 \ | 96 val_m; \ |
| 83 __asm__ __volatile__ ( \ | |
| 84 "lw %[val1_m], %[src2_m] \n\t" \ | |
| 85 \ | |
| 86 : [val1_m] "=r" (val1_m) \ | |
| 87 : [src2_m] "m" (*src2_m) \ | |
| 88 ); \ | |
| 89 \ | |
| 90 genval_m = (uint64_t)(val1_m); \ | |
| 91 genval_m = (uint64_t)((genval_m << 32) & 0xFFFFFFFF00000000); \ | |
| 92 genval_m = (uint64_t)(genval_m | (uint64_t)val0_m); \ | |
| 93 \ | |
| 94 genval_m; \ | |
| 95 }) | 97 }) |
| 96 #endif // (__mips == 64) | 98 #endif // (__mips == 64) |
| 97 #define STORE_WORD_WITH_OFFSET_1(pdst, val) { \ | 99 |
| 98 uint8_t *dst_ptr_m = ((uint8_t *)(pdst)) + 1; \ | 100 #define SH(val, pdst) { \ |
| 99 const uint32_t val_m = (val); \ | 101 uint8_t *pdst_m = (uint8_t *)(pdst); \ |
| 100 \ | 102 const uint16_t val_m = (val); \ |
| 101 __asm__ __volatile__ ( \ | 103 \ |
| 102 "sw %[val_m], %[dst_ptr_m] \n\t" \ | 104 __asm__ __volatile__ ( \ |
| 103 \ | 105 "sh %[val_m], %[pdst_m] \n\t" \ |
| 104 : [dst_ptr_m] "=m" (*dst_ptr_m) \ | 106 \ |
| 105 : [val_m] "r" (val_m) \ | 107 : [pdst_m] "=m" (*pdst_m) \ |
| 106 ); \ | 108 : [val_m] "r" (val_m) \ |
| 107 } | 109 ); \ |
| 108 | 110 } |
| 109 #define STORE_WORD(pdst, val) { \ | 111 |
| 110 uint8_t *dst_ptr_m = (uint8_t *)(pdst); \ | 112 #define SW(val, pdst) { \ |
| 111 const uint32_t val_m = (val); \ | 113 uint8_t *pdst_m = (uint8_t *)(pdst); \ |
| 112 \ | 114 const uint32_t val_m = (val); \ |
| 113 __asm__ __volatile__ ( \ | 115 \ |
| 114 "sw %[val_m], %[dst_ptr_m] \n\t" \ | 116 __asm__ __volatile__ ( \ |
| 115 \ | 117 "sw %[val_m], %[pdst_m] \n\t" \ |
| 116 : [dst_ptr_m] "=m" (*dst_ptr_m) \ | 118 \ |
| 117 : [val_m] "r" (val_m) \ | 119 : [pdst_m] "=m" (*pdst_m) \ |
| 118 ); \ | 120 : [val_m] "r" (val_m) \ |
| 119 } | 121 ); \ |
| 120 | 122 } |
| 121 #define STORE_DWORD(pdst, val) { \ | 123 |
| 122 uint8_t *dst_ptr_m = (uint8_t *)(pdst); \ | 124 #define SD(val, pdst) { \ |
| 123 const uint64_t val_m = (val); \ | 125 uint8_t *pdst_m = (uint8_t *)(pdst); \ |
| 124 \ | 126 const uint64_t val_m = (val); \ |
| 125 __asm__ __volatile__ ( \ | 127 \ |
| 126 "sd %[val_m], %[dst_ptr_m] \n\t" \ | 128 __asm__ __volatile__ ( \ |
| 127 \ | 129 "sd %[val_m], %[pdst_m] \n\t" \ |
| 128 : [dst_ptr_m] "=m" (*dst_ptr_m) \ | 130 \ |
| 129 : [val_m] "r" (val_m) \ | 131 : [pdst_m] "=m" (*pdst_m) \ |
| 130 ); \ | 132 : [val_m] "r" (val_m) \ |
| 133 ); \ |
| 131 } | 134 } |
| 132 #else // !(__mips_isa_rev >= 6) | 135 #else // !(__mips_isa_rev >= 6) |
| 133 #define LOAD_WORD(psrc) ({ \ | 136 #define LH(psrc) ({ \ |
| 134 const uint8_t *src_m = (const uint8_t *)(psrc); \ | 137 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ |
| 135 uint32_t val_m; \ | 138 uint16_t val_m; \ |
| 136 \ | 139 \ |
| 137 __asm__ __volatile__ ( \ | 140 __asm__ __volatile__ ( \ |
| 138 "ulw %[val_m], %[src_m] \n\t" \ | 141 "ulh %[val_m], %[psrc_m] \n\t" \ |
| 139 \ | 142 \ |
| 140 : [val_m] "=r" (val_m) \ | 143 : [val_m] "=r" (val_m) \ |
| 141 : [src_m] "m" (*src_m) \ | 144 : [psrc_m] "m" (*psrc_m) \ |
| 142 ); \ | 145 ); \ |
| 143 \ | 146 \ |
| 144 val_m; \ | 147 val_m; \ |
| 145 }) | 148 }) |
| 146 | 149 |
| 150 #define LW(psrc) ({ \ |
| 151 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ |
| 152 uint32_t val_m; \ |
| 153 \ |
| 154 __asm__ __volatile__ ( \ |
| 155 "ulw %[val_m], %[psrc_m] \n\t" \ |
| 156 \ |
| 157 : [val_m] "=r" (val_m) \ |
| 158 : [psrc_m] "m" (*psrc_m) \ |
| 159 ); \ |
| 160 \ |
| 161 val_m; \ |
| 162 }) |
| 163 |
| 147 #if (__mips == 64) | 164 #if (__mips == 64) |
| 148 #define LOAD_DWORD(psrc) ({ \ | 165 #define LD(psrc) ({ \ |
| 149 const uint8_t *src_m = (const uint8_t *)(psrc); \ | 166 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ |
| 150 uint64_t val_m = 0; \ | 167 uint64_t val_m = 0; \ |
| 151 \ | 168 \ |
| 152 __asm__ __volatile__ ( \ | 169 __asm__ __volatile__ ( \ |
| 153 "uld %[val_m], %[src_m] \n\t" \ | 170 "uld %[val_m], %[psrc_m] \n\t" \ |
| 154 \ | 171 \ |
| 155 : [val_m] "=r" (val_m) \ | 172 : [val_m] "=r" (val_m) \ |
| 156 : [src_m] "m" (*src_m) \ | 173 : [psrc_m] "m" (*psrc_m) \ |
| 157 ); \ | 174 ); \ |
| 158 \ | 175 \ |
| 159 val_m; \ | 176 val_m; \ |
| 160 }) | 177 }) |
| 161 #else // !(__mips == 64) | 178 #else // !(__mips == 64) |
| 162 #define LOAD_DWORD(psrc) ({ \ | 179 #define LD(psrc) ({ \ |
| 163 const uint8_t *src1_m = (const uint8_t *)(psrc); \ | 180 const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ |
| 164 const uint8_t *src2_m = ((const uint8_t *)(psrc)) + 4; \ | 181 uint32_t val0_m, val1_m; \ |
| 165 uint32_t val0_m, val1_m; \ | 182 uint64_t val_m = 0; \ |
| 166 uint64_t genval_m = 0; \ | 183 \ |
| 167 \ | 184 val0_m = LW(psrc_m1); \ |
| 168 __asm__ __volatile__ ( \ | 185 val1_m = LW(psrc_m1 + 4); \ |
| 169 "ulw %[val0_m], %[src1_m] \n\t" \ | 186 \ |
| 170 \ | 187 val_m = (uint64_t)(val1_m); \ |
| 171 : [val0_m] "=r" (val0_m) \ | 188 val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ |
| 172 : [src1_m] "m" (*src1_m) \ | 189 val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ |
| 173 ); \ | 190 \ |
| 174 \ | 191 val_m; \ |
| 175 __asm__ __volatile__ ( \ | |
| 176 "ulw %[val1_m], %[src2_m] \n\t" \ | |
| 177 \ | |
| 178 : [val1_m] "=r" (val1_m) \ | |
| 179 : [src2_m] "m" (*src2_m) \ | |
| 180 ); \ | |
| 181 \ | |
| 182 genval_m = (uint64_t)(val1_m); \ | |
| 183 genval_m = (uint64_t)((genval_m << 32) & 0xFFFFFFFF00000000); \ | |
| 184 genval_m = (uint64_t)(genval_m | (uint64_t)val0_m); \ | |
| 185 \ | |
| 186 genval_m; \ | |
| 187 }) | 192 }) |
| 188 #endif // (__mips == 64) | 193 #endif // (__mips == 64) |
| 189 | 194 |
| 190 #define STORE_WORD_WITH_OFFSET_1(pdst, val) { \ | 195 #define SH(val, pdst) { \ |
| 191 uint8_t *dst_ptr_m = ((uint8_t *)(pdst)) + 1; \ | 196 uint8_t *pdst_m = (uint8_t *)(pdst); \ |
| 192 const uint32_t val_m = (val); \ | 197 const uint16_t val_m = (val); \ |
| 193 \ | 198 \ |
| 194 __asm__ __volatile__ ( \ | 199 __asm__ __volatile__ ( \ |
| 195 "usw %[val_m], %[dst_ptr_m] \n\t" \ | 200 "ush %[val_m], %[pdst_m] \n\t" \ |
| 196 \ | 201 \ |
| 197 : [dst_ptr_m] "=m" (*dst_ptr_m) \ | 202 : [pdst_m] "=m" (*pdst_m) \ |
| 198 : [val_m] "r" (val_m) \ | 203 : [val_m] "r" (val_m) \ |
| 199 ); \ | 204 ); \ |
| 200 } | 205 } |
| 201 | 206 |
| 202 #define STORE_WORD(pdst, val) { \ | 207 #define SW(val, pdst) { \ |
| 203 uint8_t *dst_ptr_m = (uint8_t *)(pdst); \ | 208 uint8_t *pdst_m = (uint8_t *)(pdst); \ |
| 204 const uint32_t val_m = (val); \ | 209 const uint32_t val_m = (val); \ |
| 205 \ | 210 \ |
| 206 __asm__ __volatile__ ( \ | 211 __asm__ __volatile__ ( \ |
| 207 "usw %[val_m], %[dst_ptr_m] \n\t" \ | 212 "usw %[val_m], %[pdst_m] \n\t" \ |
| 208 \ | 213 \ |
| 209 : [dst_ptr_m] "=m" (*dst_ptr_m) \ | 214 : [pdst_m] "=m" (*pdst_m) \ |
| 210 : [val_m] "r" (val_m) \ | 215 : [val_m] "r" (val_m) \ |
| 211 ); \ | 216 ); \ |
| 212 } | 217 } |
| 213 | 218 |
| 214 #define STORE_DWORD(pdst, val) { \ | 219 #define SD(val, pdst) { \ |
| 215 uint8_t *dst1_m = (uint8_t *)(pdst); \ | 220 uint8_t *pdst_m1 = (uint8_t *)(pdst); \ |
| 216 uint8_t *dst2_m = ((uint8_t *)(pdst)) + 4; \ | |
| 217 uint32_t val0_m, val1_m; \ | 221 uint32_t val0_m, val1_m; \ |
| 218 \ | 222 \ |
| 219 val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF); \ | 223 val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF); \ |
| 220 val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ | 224 val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ |
| 221 \ | 225 \ |
| 222 __asm__ __volatile__ ( \ | 226 SW(val0_m, pdst_m1); \ |
| 223 "usw %[val0_m], %[dst1_m] \n\t" \ | 227 SW(val1_m, pdst_m1 + 4); \ |
| 224 "usw %[val1_m], %[dst2_m] \n\t" \ | 228 } |
| 229 #endif // (__mips_isa_rev >= 6) |
| 230 |
| 231 /* Description : Store 4 words with stride |
| 232 Arguments : Inputs - in0, in1, in2, in3, pdst, stride |
| 233 Details : Stores word from 'in0' to (pdst) |
| 234 Stores word from 'in1' to (pdst + stride) |
| 235 Stores word from 'in2' to (pdst + 2 * stride) |
| 236 Stores word from 'in3' to (pdst + 3 * stride) |
| 237 */ |
| 238 #define SW4(in0, in1, in2, in3, pdst, stride) { \ |
| 239 SW(in0, (pdst)) \ |
| 240 SW(in1, (pdst) + stride); \ |
| 241 SW(in2, (pdst) + 2 * stride); \ |
| 242 SW(in3, (pdst) + 3 * stride); \ |
| 243 } |
| 244 |
| 245 /* Description : Store 4 double words with stride |
| 246 Arguments : Inputs - in0, in1, in2, in3, pdst, stride |
| 247 Details : Stores double word from 'in0' to (pdst) |
| 248 Stores double word from 'in1' to (pdst + stride) |
| 249 Stores double word from 'in2' to (pdst + 2 * stride) |
| 250 Stores double word from 'in3' to (pdst + 3 * stride) |
| 251 */ |
| 252 #define SD4(in0, in1, in2, in3, pdst, stride) { \ |
| 253 SD(in0, (pdst)) \ |
| 254 SD(in1, (pdst) + stride); \ |
| 255 SD(in2, (pdst) + 2 * stride); \ |
| 256 SD(in3, (pdst) + 3 * stride); \ |
| 257 } |
| 258 |
| 259 /* Description : Load vectors with 16 byte elements with stride |
| 260 Arguments : Inputs - psrc (source pointer to load from) |
| 261 - stride |
| 262 Outputs - out0, out1 |
| 263 Return Type - as per RTYPE |
| 264 Details : Loads 16 byte elements in 'out0' from (psrc) |
| 265 Loads 16 byte elements in 'out1' from (psrc + stride) |
| 266 */ |
| 267 #define LD_B2(RTYPE, psrc, stride, out0, out1) { \ |
| 268 out0 = LD_B(RTYPE, (psrc)); \ |
| 269 out1 = LD_B(RTYPE, (psrc) + stride); \ |
| 270 } |
| 271 #define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__) |
| 272 #define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__) |
| 273 |
| 274 #define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) { \ |
| 275 LD_B2(RTYPE, (psrc), stride, out0, out1); \ |
| 276 LD_B2(RTYPE, (psrc) + 2 * stride , stride, out2, out3); \ |
| 277 } |
| 278 #define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__) |
| 279 #define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__) |
| 280 |
| 281 #define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) { \ |
| 282 LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ |
| 283 out4 = LD_B(RTYPE, (psrc) + 4 * stride); \ |
| 284 } |
| 285 #define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__) |
| 286 #define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__) |
| 287 |
| 288 #define LD_B7(RTYPE, psrc, stride, \ |
| 289 out0, out1, out2, out3, out4, out5, out6) { \ |
| 290 LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \ |
| 291 LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \ |
| 292 } |
| 293 #define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__) |
| 294 |
| 295 #define LD_B8(RTYPE, psrc, stride, \ |
| 296 out0, out1, out2, out3, out4, out5, out6, out7) { \ |
| 297 LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ |
| 298 LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ |
| 299 } |
| 300 #define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__) |
| 301 #define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__) |
| 302 |
| 303 /* Description : Load vectors with 8 halfword elements with stride |
| 304 Arguments : Inputs - psrc (source pointer to load from) |
| 305 - stride |
| 306 Outputs - out0, out1 |
| 307 Details : Loads 8 halfword elements in 'out0' from (psrc) |
| 308 Loads 8 halfword elements in 'out1' from (psrc + stride) |
| 309 */ |
| 310 #define LD_H2(RTYPE, psrc, stride, out0, out1) { \ |
| 311 out0 = LD_H(RTYPE, (psrc)); \ |
| 312 out1 = LD_H(RTYPE, (psrc) + (stride)); \ |
| 313 } |
| 314 #define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__) |
| 315 |
| 316 #define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) { \ |
| 317 LD_H2(RTYPE, (psrc), stride, out0, out1); \ |
| 318 LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ |
| 319 } |
| 320 #define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__) |
| 321 |
| 322 #define LD_H8(RTYPE, psrc, stride, \ |
| 323 out0, out1, out2, out3, out4, out5, out6, out7) { \ |
| 324 LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ |
| 325 LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ |
| 326 } |
| 327 #define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__) |
| 328 |
| 329 #define LD_H16(RTYPE, psrc, stride, \ |
| 330 out0, out1, out2, out3, out4, out5, out6, out7, \ |
| 331 out8, out9, out10, out11, out12, out13, out14, out15) { \ |
| 332 LD_H8(RTYPE, (psrc), stride, \ |
| 333 out0, out1, out2, out3, out4, out5, out6, out7); \ |
| 334 LD_H8(RTYPE, (psrc) + 8 * stride, stride, \ |
| 335 out8, out9, out10, out11, out12, out13, out14, out15); \ |
| 336 } |
| 337 #define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__) |
| 338 |
| 339 /* Description : Store vectors of 16 byte elements with stride |
| 340 Arguments : Inputs - in0, in1, stride |
| 341 Outputs - pdst (destination pointer to store to) |
| 342 Details : Stores 16 byte elements from 'in0' to (pdst) |
| 343 Stores 16 byte elements from 'in1' to (pdst + stride) |
| 344 */ |
| 345 #define ST_B2(RTYPE, in0, in1, pdst, stride) { \ |
| 346 ST_B(RTYPE, in0, (pdst)); \ |
| 347 ST_B(RTYPE, in1, (pdst) + stride); \ |
| 348 } |
| 349 #define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) |
| 350 |
| 351 #define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) { \ |
| 352 ST_B2(RTYPE, in0, in1, (pdst), stride); \ |
| 353 ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ |
| 354 } |
| 355 #define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) |
| 356 |
| 357 #define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 358 pdst, stride) { \ |
| 359 ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \ |
| 360 ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ |
| 361 } |
| 362 #define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__) |
| 363 |
| 364 /* Description : Store vectors of 8 halfword elements with stride |
| 365 Arguments : Inputs - in0, in1, stride |
| 366 Outputs - pdst (destination pointer to store to) |
| 367 Details : Stores 8 halfword elements from 'in0' to (pdst) |
| 368 Stores 8 halfword elements from 'in1' to (pdst + stride) |
| 369 */ |
| 370 #define ST_H2(RTYPE, in0, in1, pdst, stride) { \ |
| 371 ST_H(RTYPE, in0, (pdst)); \ |
| 372 ST_H(RTYPE, in1, (pdst) + stride); \ |
| 373 } |
| 374 #define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__) |
| 375 |
| 376 #define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) { \ |
| 377 ST_H2(RTYPE, in0, in1, (pdst), stride); \ |
| 378 ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ |
| 379 } |
| 380 #define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__) |
| 381 |
| 382 #define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) { \ |
| 383 ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \ |
| 384 ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ |
| 385 } |
| 386 #define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__) |
| 387 |
| 388 /* Description : Store as 4x4 byte block to destination memory from input vector |
| 389 Arguments : Inputs - in0, in1, idx0, idx1, idx2, idx3, pdst, stride |
| 390 Return Type - unsigned byte |
| 391 Details : Idx0 word element from input vector 'in0' is copied and stored |
| 392 on first line |
| 393 Idx1 word element from input vector 'in0' is copied and stored |
| 394 on second line |
| 395 Idx2 word element from input vector 'in1' is copied and stored |
| 396 on third line |
| 397 Idx3 word element from input vector 'in1' is copied and stored |
| 398 on fourth line |
| 399 */ |
| 400 #define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) { \ |
| 401 uint32_t out0_m, out1_m, out2_m, out3_m; \ |
| 402 uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \ |
| 403 \ |
| 404 out0_m = __msa_copy_u_w((v4i32)in0, idx0); \ |
| 405 out1_m = __msa_copy_u_w((v4i32)in0, idx1); \ |
| 406 out2_m = __msa_copy_u_w((v4i32)in1, idx2); \ |
| 407 out3_m = __msa_copy_u_w((v4i32)in1, idx3); \ |
| 408 \ |
| 409 SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \ |
| 410 } |
| 411 #define ST4x8_UB(in0, in1, pdst, stride) { \ |
| 412 uint8_t *pblk_4x8 = (uint8_t *)(pdst); \ |
| 413 \ |
| 414 ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \ |
| 415 ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \ |
| 416 } |
| 417 |
| 418 /* Description : Store as 8x1 byte block to destination memory from input vector |
| 419 Arguments : Inputs - in, pdst |
| 420 Details : Index 0 double word element from input vector 'in' is copied |
| 421 and stored to destination memory at (pdst) |
| 422 */ |
| 423 #define ST8x1_UB(in, pdst) { \ |
| 424 uint64_t out0_m; \ |
| 425 \ |
| 426 out0_m = __msa_copy_u_d((v2i64)in, 0); \ |
| 427 SD(out0_m, pdst); \ |
| 428 } |
| 429 |
| 430 /* Description : Store as 8x4 byte block to destination memory from input |
| 431 vectors |
| 432 Arguments : Inputs - in0, in1, pdst, stride |
| 433 Details : Index 0 double word element from input vector 'in0' is copied |
| 434 and stored to destination memory at (pblk_8x4_m) |
| 435 Index 1 double word element from input vector 'in0' is copied |
| 436 and stored to destination memory at (pblk_8x4_m + stride) |
| 437 Index 0 double word element from input vector 'in1' is copied |
| 438 and stored to destination memory at (pblk_8x4_m + 2 * stride) |
| 439 Index 1 double word element from input vector 'in1' is copied |
| 440 and stored to destination memory at (pblk_8x4_m + 3 * stride) |
| 441 */ |
| 442 #define ST8x4_UB(in0, in1, pdst, stride) { \ |
| 443 uint64_t out0_m, out1_m, out2_m, out3_m; \ |
| 444 uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \ |
| 225 \ | 445 \ |
| 226 : [dst1_m] "=m" (*dst1_m), [dst2_m] "=m" (*dst2_m) \ | 446 out0_m = __msa_copy_u_d((v2i64)in0, 0); \ |
| 227 : [val0_m] "r" (val0_m), [val1_m] "r" (val1_m) \ | 447 out1_m = __msa_copy_u_d((v2i64)in0, 1); \ |
| 228 ); \ | 448 out2_m = __msa_copy_u_d((v2i64)in1, 0); \ |
| 229 } | 449 out3_m = __msa_copy_u_d((v2i64)in1, 1); \ |
| 230 #endif // (__mips_isa_rev >= 6) | 450 \ |
| 231 | 451 SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \ |
| 232 #define LOAD_2VECS_UB(psrc, stride, \ | 452 } |
| 233 val0, val1) { \ | 453 |
| 234 val0 = LOAD_UB(psrc + 0 * stride); \ | 454 /* Description : average with rounding (in0 + in1 + 1) / 2. |
| 235 val1 = LOAD_UB(psrc + 1 * stride); \ | 455 Arguments : Inputs - in0, in1, in2, in3, |
| 236 } | 456 Outputs - out0, out1 |
| 237 | 457 Return Type - as per RTYPE |
| 238 #define LOAD_4VECS_UB(psrc, stride, \ | 458 Details : Each byte element from 'in0' vector is added with each byte |
| 239 val0, val1, val2, val3) { \ | 459 element from 'in1' vector. The addition of the elements plus 1 |
| 240 val0 = LOAD_UB(psrc + 0 * stride); \ | 460 (for rounding) is done unsigned with full precision, |
| 241 val1 = LOAD_UB(psrc + 1 * stride); \ | 461 i.e. the result has one extra bit. Unsigned division by 2 |
| 242 val2 = LOAD_UB(psrc + 2 * stride); \ | 462 (or logical shift right by one bit) is performed before writing |
| 243 val3 = LOAD_UB(psrc + 3 * stride); \ | 463 the result to vector 'out0' |
| 244 } | 464 Similar for the pair of 'in2' and 'in3' |
| 245 | 465 */ |
| 246 #define LOAD_4VECS_SB(psrc, stride, \ | 466 #define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) { \ |
| 247 val0, val1, val2, val3) { \ | 467 out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \ |
| 248 val0 = LOAD_SB(psrc + 0 * stride); \ | 468 out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3); \ |
| 249 val1 = LOAD_SB(psrc + 1 * stride); \ | 469 } |
| 250 val2 = LOAD_SB(psrc + 2 * stride); \ | 470 #define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__) |
| 251 val3 = LOAD_SB(psrc + 3 * stride); \ | 471 |
| 252 } | 472 #define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 253 | 473 out0, out1, out2, out3) { \ |
| 254 #define LOAD_5VECS_UB(psrc, stride, \ | 474 AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ |
| 255 out0, out1, out2, out3, out4) { \ | 475 AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3) \ |
| 256 LOAD_4VECS_UB((psrc), (stride), \ | 476 } |
| 257 (out0), (out1), (out2), (out3)); \ | 477 #define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__) |
| 258 out4 = LOAD_UB(psrc + 4 * stride); \ | 478 |
| 259 } | 479 /* Description : Immediate number of columns to slide with zero |
| 260 | 480 Arguments : Inputs - in0, in1, slide_val |
| 261 #define LOAD_5VECS_SB(psrc, stride, \ | 481 Outputs - out0, out1 |
| 262 out0, out1, out2, out3, out4) { \ | 482 Return Type - as per RTYPE |
| 263 LOAD_4VECS_SB((psrc), (stride), \ | 483 Details : Byte elements from 'zero_m' vector are slid into 'in0' by |
| 264 (out0), (out1), (out2), (out3)); \ | 484 number of elements specified by 'slide_val' |
| 265 out4 = LOAD_SB(psrc + 4 * stride); \ | 485 */ |
| 266 } | 486 #define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) { \ |
| 267 | 487 v16i8 zero_m = { 0 }; \ |
| 268 #define LOAD_7VECS_SB(psrc, stride, \ | 488 out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \ |
| 269 val0, val1, val2, val3, \ | 489 out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \ |
| 270 val4, val5, val6) { \ | 490 } |
| 271 val0 = LOAD_SB((psrc) + 0 * (stride)); \ | 491 #define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__) |
| 272 val1 = LOAD_SB((psrc) + 1 * (stride)); \ | 492 |
| 273 val2 = LOAD_SB((psrc) + 2 * (stride)); \ | 493 /* Description : Immediate number of columns to slide |
| 274 val3 = LOAD_SB((psrc) + 3 * (stride)); \ | 494 Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val |
| 275 val4 = LOAD_SB((psrc) + 4 * (stride)); \ | 495 Outputs - out0, out1 |
| 276 val5 = LOAD_SB((psrc) + 5 * (stride)); \ | 496 Return Type - as per RTYPE |
| 277 val6 = LOAD_SB((psrc) + 6 * (stride)); \ | 497 Details : Byte elements from 'in0_0' vector are slid into 'in1_0' by |
| 278 } | 498 number of elements specified by 'slide_val' |
| 279 | 499 */ |
| 280 #define LOAD_8VECS_UB(psrc, stride, \ | 500 #define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) { \ |
| 281 out0, out1, out2, out3, \ | 501 out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val); \ |
| 282 out4, out5, out6, out7) { \ | 502 out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \ |
| 283 LOAD_4VECS_UB((psrc), (stride), \ | 503 } |
| 284 (out0), (out1), (out2), (out3)); \ | 504 #define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__) |
| 285 LOAD_4VECS_UB((psrc + 4 * stride), (stride), \ | 505 |
| 286 (out4), (out5), (out6), (out7)); \ | 506 #define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, \ |
| 287 } | 507 out0, out1, out2, slide_val) { \ |
| 288 | 508 SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ |
| 289 #define LOAD_8VECS_SB(psrc, stride, \ | 509 out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val); \ |
| 290 out0, out1, out2, out3, \ | 510 } |
| 291 out4, out5, out6, out7) { \ | 511 #define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__) |
| 292 LOAD_4VECS_SB((psrc), (stride), \ | 512 #define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__) |
| 293 (out0), (out1), (out2), (out3)); \ | 513 |
| 294 LOAD_4VECS_SB((psrc + 4 * stride), (stride), \ | 514 /* Description : Shuffle byte vector elements as per mask vector |
| 295 (out4), (out5), (out6), (out7)); \ | 515 Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 |
| 296 } | 516 Outputs - out0, out1 |
| 297 | 517 Return Type - as per RTYPE |
| 298 #define LOAD_2VECS_SH(psrc, stride, \ | 518 Details : Selective byte elements from in0 & in1 are copied to out0 as |
| 299 val0, val1) { \ | 519 per control vector mask0 |
| 300 val0 = LOAD_SH((psrc) + 0 * (stride)); \ | 520 Selective byte elements from in2 & in3 are copied to out1 as |
| 301 val1 = LOAD_SH((psrc) + 1 * (stride)); \ | 521 per control vector mask1 |
| 302 } | 522 */ |
| 303 | 523 #define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) { \ |
| 304 #define LOAD_4VECS_SH(psrc, stride, \ | 524 out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \ |
| 305 val0, val1, val2, val3) { \ | 525 out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \ |
| 306 LOAD_2VECS_SH((psrc), (stride), val0, val1); \ | 526 } |
| 307 LOAD_2VECS_SH((psrc + 2 * stride), (stride), val2, val3); \ | 527 #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) |
| 308 } | 528 #define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__) |
| 309 | 529 #define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__) |
| 310 #define LOAD_8VECS_SH(psrc, stride, \ | 530 |
| 311 val0, val1, val2, val3, \ | 531 #define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, \ |
| 312 val4, val5, val6, val7) { \ | 532 out0, out1, out2, out3) { \ |
| 313 LOAD_4VECS_SH((psrc), (stride), \ | 533 VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \ |
| 314 val0, val1, val2, val3); \ | 534 VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \ |
| 315 LOAD_4VECS_SH((psrc + 4 * stride), (stride), \ | 535 } |
| 316 val4, val5, val6, val7); \ | 536 #define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__) |
| 317 } | 537 #define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__) |
| 318 | 538 |
| 319 #define LOAD_16VECS_SH(psrc, stride, \ | 539 /* Description : Dot product of byte vector elements |
| 320 val0, val1, val2, val3, \ | 540 Arguments : Inputs - mult0, mult1 |
| 321 val4, val5, val6, val7, \ | 541 cnst0, cnst1 |
| 322 val8, val9, val10, val11, \ | 542 Outputs - out0, out1 |
| 323 val12, val13, val14, val15) { \ | 543 Return Type - as per RTYPE |
| 324 LOAD_8VECS_SH((psrc), (stride), \ | 544 Details : Unsigned byte elements from mult0 are multiplied with |
| 325 val0, val1, val2, val3, \ | 545 unsigned byte elements from cnst0 producing a result |
| 326 val4, val5, val6, val7); \ | 546 twice the size of input i.e. unsigned halfword. |
| 327 LOAD_8VECS_SH((psrc + 8 * (stride)), (stride), \ | 547 Then the multiplication results of adjacent odd-even elements |
| 328 val8, val9, val10, val11, \ | 548 are added together and stored to the out vector |
| 329 val12, val13, val14, val15); \ | 549 (2 unsigned halfword results) |
| 330 } | 550 */ |
| 331 | 551 #define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ |
| 332 #define STORE_4VECS_UB(dst_out, pitch, \ | 552 out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0); \ |
| 333 in0, in1, in2, in3) { \ | 553 out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1); \ |
| 334 STORE_UB((in0), (dst_out)); \ | 554 } |
| 335 STORE_UB((in1), ((dst_out) + (pitch))); \ | 555 #define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__) |
| 336 STORE_UB((in2), ((dst_out) + 2 * (pitch))); \ | 556 |
| 337 STORE_UB((in3), ((dst_out) + 3 * (pitch))); \ | 557 #define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, \ |
| 338 } | 558 cnst0, cnst1, cnst2, cnst3, \ |
| 339 | 559 out0, out1, out2, out3) { \ |
| 340 #define STORE_8VECS_UB(dst_out, pitch_in, \ | 560 DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ |
| 341 in0, in1, in2, in3, \ | 561 DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ |
| 342 in4, in5, in6, in7) { \ | 562 } |
| 343 STORE_4VECS_UB(dst_out, pitch_in, \ | 563 #define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__) |
| 344 in0, in1, in2, in3); \ | 564 |
| 345 STORE_4VECS_UB((dst_out + 4 * (pitch_in)), pitch_in, \ | 565 /* Description : Dot product of byte vector elements |
| 346 in4, in5, in6, in7); \ | 566 Arguments : Inputs - mult0, mult1 |
| 347 } | 567 cnst0, cnst1 |
| 348 | 568 Outputs - out0, out1 |
| 349 #define VEC_INSERT_4W_UB(src, src0, src1, src2, src3) { \ | 569 Return Type - signed halfword |
| 350 src = (v16u8)__msa_insert_w((v4i32)(src), 0, (src0)); \ | 570 Details : Signed byte elements from mult0 are multiplied with |
| 351 src = (v16u8)__msa_insert_w((v4i32)(src), 1, (src1)); \ | 571 signed byte elements from cnst0 producing a result |
| 352 src = (v16u8)__msa_insert_w((v4i32)(src), 2, (src2)); \ | 572 twice the size of input i.e. signed halfword. |
| 353 src = (v16u8)__msa_insert_w((v4i32)(src), 3, (src3)); \ | 573 Then this multiplication results of adjacent odd-even elements |
| 354 } | 574 are added together and stored to the out vector |
| 355 | 575 (2 signed halfword results) |
| 356 #define VEC_INSERT_2DW_UB(src, src0, src1) { \ | 576 */ |
| 357 src = (v16u8)__msa_insert_d((v2i64)(src), 0, (src0)); \ | 577 #define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ |
| 358 src = (v16u8)__msa_insert_d((v2i64)(src), 1, (src1)); \ | 578 out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \ |
| 359 } | 579 out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \ |
| 360 | 580 } |
| 361 #define STORE_4VECS_SH(ptr, stride, \ | 581 #define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__) |
| 362 in0, in1, in2, in3) { \ | 582 |
| 363 STORE_SH(in0, ((ptr) + 0 * stride)); \ | 583 #define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, \ |
| 364 STORE_SH(in1, ((ptr) + 1 * stride)); \ | 584 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) { \ |
| 365 STORE_SH(in2, ((ptr) + 2 * stride)); \ | 585 DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ |
| 366 STORE_SH(in3, ((ptr) + 3 * stride)); \ | 586 DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ |
| 367 } | 587 } |
| 368 | 588 #define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__) |
| 369 #define STORE_8VECS_SH(ptr, stride, \ | 589 |
| 370 in0, in1, in2, in3, \ | 590 /* Description : Dot product of halfword vector elements |
| 371 in4, in5, in6, in7) { \ | 591 Arguments : Inputs - mult0, mult1 |
| 372 STORE_SH(in0, ((ptr) + 0 * stride)); \ | 592 cnst0, cnst1 |
| 373 STORE_SH(in1, ((ptr) + 1 * stride)); \ | 593 Outputs - out0, out1 |
| 374 STORE_SH(in2, ((ptr) + 2 * stride)); \ | 594 Return Type - signed word |
| 375 STORE_SH(in3, ((ptr) + 3 * stride)); \ | 595 Details : Signed halfword elements from mult0 are multiplied with |
| 376 STORE_SH(in4, ((ptr) + 4 * stride)); \ | 596 signed halfword elements from cnst0 producing a result |
| 377 STORE_SH(in5, ((ptr) + 5 * stride)); \ | 597 twice the size of input i.e. signed word. |
| 378 STORE_SH(in6, ((ptr) + 6 * stride)); \ | 598 Then this multiplication results of adjacent odd-even elements |
| 379 STORE_SH(in7, ((ptr) + 7 * stride)); \ | 599 are added together and stored to the out vector |
| 380 } | 600 (2 signed word results) |
| 381 | 601 */ |
| 382 #define CLIP_UNSIGNED_CHAR_H(in) ({ \ | 602 #define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ |
| 603 out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \ |
| 604 out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \ |
| 605 } |
| 606 #define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__) |
| 607 |
| 608 #define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, \ |
| 609 cnst0, cnst1, cnst2, cnst3, \ |
| 610 out0, out1, out2, out3) { \ |
| 611 DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ |
| 612 DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ |
| 613 } |
| 614 #define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__) |
| 615 |
| 616 /* Description : Dot product & addition of byte vector elements |
| 617 Arguments : Inputs - mult0, mult1 |
| 618 cnst0, cnst1 |
| 619 Outputs - out0, out1 |
| 620 Return Type - signed halfword |
| 621 Details : Signed byte elements from mult0 are multiplied with |
| 622 signed byte elements from cnst0 producing a result |
| 623 twice the size of input i.e. signed halfword. |
| 624 Then this multiplication results of adjacent odd-even elements |
| 625 are added to the out vector |
| 626 (2 signed halfword results) |
| 627 */ |
| 628 #define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ |
| 629 out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \ |
| 630 out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \ |
| 631 } |
| 632 #define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__) |
| 633 |
| 634 #define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, \ |
| 635 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) { \ |
| 636 DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ |
| 637 DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ |
| 638 } |
| 639 #define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__) |
| 640 |
| 641 /* Description : Minimum values between unsigned elements of |
| 642 either vector are copied to the output vector |
| 643 Arguments : Inputs - in0, in1, min_vec |
| 644 Outputs - in0, in1, (in place) |
| 645 Return Type - unsigned halfword |
| 646 Details : Minimum of unsigned halfword element values from 'in0' and |
| 647 'min_vec' are written to output vector 'in0' |
| 648 */ |
| 649 #define MIN_UH2(RTYPE, in0, in1, min_vec) { \ |
| 650 in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec); \ |
| 651 in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec); \ |
| 652 } |
| 653 #define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__) |
| 654 |
| 655 #define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) { \ |
| 656 MIN_UH2(RTYPE, in0, in1, min_vec); \ |
| 657 MIN_UH2(RTYPE, in2, in3, min_vec); \ |
| 658 } |
| 659 #define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__) |
| 660 |
| 661 /* Description : Clips all signed halfword elements of input vector |
| 662 between 0 & 255 |
| 663 Arguments : Inputs - in (input vector) |
| 664 Outputs - out_m (output vector with clipped elements) |
| 665 Return Type - signed halfword |
| 666 */ |
| 667 #define CLIP_SH_0_255(in) ({ \ |
| 383 v8i16 max_m = __msa_ldi_h(255); \ | 668 v8i16 max_m = __msa_ldi_h(255); \ |
| 384 v8i16 out_m; \ | 669 v8i16 out_m; \ |
| 385 \ | 670 \ |
| 386 out_m = __msa_maxi_s_h((v8i16)(in), 0); \ | 671 out_m = __msa_maxi_s_h((v8i16)in, 0); \ |
| 387 out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \ | 672 out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \ |
| 388 out_m; \ | 673 out_m; \ |
| 389 }) | 674 }) |
| 390 | 675 #define CLIP_SH2_0_255(in0, in1) { \ |
| 391 /* halfword 8x8 transpose macro */ | 676 in0 = CLIP_SH_0_255(in0); \ |
| 392 #define TRANSPOSE8x8_H_SH(in0, in1, in2, in3, \ | 677 in1 = CLIP_SH_0_255(in1); \ |
| 393 in4, in5, in6, in7, \ | 678 } |
| 394 out0, out1, out2, out3, \ | 679 #define CLIP_SH4_0_255(in0, in1, in2, in3) { \ |
| 395 out4, out5, out6, out7) { \ | 680 CLIP_SH2_0_255(in0, in1); \ |
| 396 v8i16 s0_m, s1_m; \ | 681 CLIP_SH2_0_255(in2, in3); \ |
| 397 v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ | 682 } |
| 398 v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ | 683 |
| 399 \ | 684 /* Description : Interleave even byte elements from vectors |
| 400 s0_m = __msa_ilvr_h((v8i16)(in6), (v8i16)(in4)); \ | 685 Arguments : Inputs - in0, in1, in2, in3 |
| 401 s1_m = __msa_ilvr_h((v8i16)(in7), (v8i16)(in5)); \ | 686 Outputs - out0, out1 |
| 402 tmp0_m = __msa_ilvr_h((v8i16)s1_m, (v8i16)s0_m); \ | 687 Return Type - as per RTYPE |
| 403 tmp1_m = __msa_ilvl_h((v8i16)s1_m, (v8i16)s0_m); \ | 688 Details : Even byte elements of 'in0' and even byte |
| 404 \ | 689 elements of 'in1' are interleaved and copied to 'out0' |
| 405 s0_m = __msa_ilvl_h((v8i16)(in6), (v8i16)(in4)); \ | 690 Even byte elements of 'in2' and even byte |
| 406 s1_m = __msa_ilvl_h((v8i16)(in7), (v8i16)(in5)); \ | 691 elements of 'in3' are interleaved and copied to 'out1' |
| 407 tmp2_m = __msa_ilvr_h((v8i16)s1_m, (v8i16)s0_m); \ | 692 */ |
| 408 tmp3_m = __msa_ilvl_h((v8i16)s1_m, (v8i16)s0_m); \ | 693 #define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ |
| 409 \ | 694 out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ |
| 410 s0_m = __msa_ilvr_h((v8i16)(in2), (v8i16)(in0)); \ | 695 out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ |
| 411 s1_m = __msa_ilvr_h((v8i16)(in3), (v8i16)(in1)); \ | 696 } |
| 412 tmp4_m = __msa_ilvr_h((v8i16)s1_m, (v8i16)s0_m); \ | 697 #define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__) |
| 413 tmp5_m = __msa_ilvl_h((v8i16)s1_m, (v8i16)s0_m); \ | 698 #define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__) |
| 414 \ | 699 |
| 415 s0_m = __msa_ilvl_h((v8i16)(in2), (v8i16)(in0)); \ | 700 /* Description : Interleave even halfword elements from vectors |
| 416 s1_m = __msa_ilvl_h((v8i16)(in3), (v8i16)(in1)); \ | 701 Arguments : Inputs - in0, in1, in2, in3 |
| 417 tmp6_m = __msa_ilvr_h((v8i16)s1_m, (v8i16)s0_m); \ | 702 Outputs - out0, out1 |
| 418 tmp7_m = __msa_ilvl_h((v8i16)s1_m, (v8i16)s0_m); \ | 703 Return Type - as per RTYPE |
| 419 \ | 704 Details : Even halfword elements of 'in0' and even halfword |
| 420 out0 = (v8i16)__msa_pckev_d((v2i64)tmp0_m, (v2i64)tmp4_m); \ | 705 elements of 'in1' are interleaved and copied to 'out0' |
| 421 out1 = (v8i16)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \ | 706 Even halfword elements of 'in2' and even halfword |
| 422 out2 = (v8i16)__msa_pckev_d((v2i64)tmp1_m, (v2i64)tmp5_m); \ | 707 elements of 'in3' are interleaved and copied to 'out1' |
| 423 out3 = (v8i16)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \ | 708 */ |
| 424 out4 = (v8i16)__msa_pckev_d((v2i64)tmp2_m, (v2i64)tmp6_m); \ | 709 #define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ |
| 425 out5 = (v8i16)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \ | 710 out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \ |
| 426 out6 = (v8i16)__msa_pckev_d((v2i64)tmp3_m, (v2i64)tmp7_m); \ | 711 out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \ |
| 427 out7 = (v8i16)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \ | 712 } |
| 428 } | 713 #define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__) |
| 429 | 714 #define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__) |
| 430 /* interleave macros */ | 715 #define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__) |
| 431 /* no in-place support */ | 716 |
| 432 #define ILV_B_LRLR_UB(in0, in1, in2, in3, \ | 717 /* Description : Interleave left half of byte elements from vectors |
| 433 out0, out1, out2, out3) { \ | 718 Arguments : Inputs - in0, in1, in2, in3 |
| 434 out0 = (v16u8)__msa_ilvl_b((v16i8)(in1), (v16i8)(in0)); \ | 719 Outputs - out0, out1 |
| 435 out1 = (v16u8)__msa_ilvr_b((v16i8)(in1), (v16i8)(in0)); \ | 720 Return Type - as per RTYPE |
| 436 out2 = (v16u8)__msa_ilvl_b((v16i8)(in3), (v16i8)(in2)); \ | 721 Details : Left half of byte elements of in0 and left half of byte |
| 437 out3 = (v16u8)__msa_ilvr_b((v16i8)(in3), (v16i8)(in2)); \ | 722 elements of in1 are interleaved and copied to out0. |
| 438 } | 723 Left half of byte elements of in2 and left half of byte |
| 439 | 724 elements of in3 are interleaved and copied to out1. |
| 440 #define ILV_H_LRLR_SH(in0, in1, in2, in3, \ | 725 */ |
| 441 out0, out1, out2, out3) { \ | 726 #define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ |
| 442 out0 = __msa_ilvl_h((v8i16)(in1), (v8i16)(in0)); \ | 727 out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ |
| 443 out1 = __msa_ilvr_h((v8i16)(in1), (v8i16)(in0)); \ | 728 out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \ |
| 444 out2 = __msa_ilvl_h((v8i16)(in3), (v8i16)(in2)); \ | 729 } |
| 445 out3 = __msa_ilvr_h((v8i16)(in3), (v8i16)(in2)); \ | 730 #define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__) |
| 446 } | 731 #define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__) |
| 447 | 732 #define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__) |
| 448 #define ILV_H_LR_SH(in0, in1, out0, out1) { \ | 733 #define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__) |
| 449 out0 = __msa_ilvl_h((v8i16)(in1), (v8i16)(in0)); \ | 734 |
| 450 out1 = __msa_ilvr_h((v8i16)(in1), (v8i16)(in0)); \ | 735 #define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 451 } | 736 out0, out1, out2, out3) { \ |
| 452 | 737 ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ |
| 453 #define ILVR_B_2VECS_UB(in0_r, in1_r, in0_l, in1_l, \ | 738 ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ |
| 454 out0, out1) { \ | 739 } |
| 455 out0 = (v16u8)__msa_ilvr_b((v16i8)(in0_l), (v16i8)(in0_r)); \ | 740 #define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__) |
| 456 out1 = (v16u8)__msa_ilvr_b((v16i8)(in1_l), (v16i8)(in1_r)); \ | 741 #define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__) |
| 457 } | 742 |
| 458 | 743 /* Description : Interleave left half of halfword elements from vectors |
| 459 #define ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ | 744 Arguments : Inputs - in0, in1, in2, in3 |
| 460 out0, out1) { \ | 745 Outputs - out0, out1 |
| 461 out0 = __msa_ilvr_b((v16i8)(in0_l), (v16i8)(in0_r)); \ | 746 Return Type - as per RTYPE |
| 462 out1 = __msa_ilvr_b((v16i8)(in1_l), (v16i8)(in1_r)); \ | 747 Details : Left half of halfword elements of in0 and left half of halfword |
| 463 } | 748 elements of in1 are interleaved and copied to out0. |
| 464 | 749 Left half of halfword elements of in2 and left half of halfword |
| 465 #define ILVR_B_4VECS_UB(in0_r, in1_r, in2_r, in3_r, \ | 750 elements of in3 are interleaved and copied to out1. |
| 466 in0_l, in1_l, in2_l, in3_l, \ | 751 */ |
| 467 out0, out1, out2, out3) { \ | 752 #define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ |
| 468 ILVR_B_2VECS_UB(in0_r, in1_r, in0_l, in1_l, \ | 753 out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ |
| 469 out0, out1); \ | 754 out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \ |
| 470 ILVR_B_2VECS_UB(in2_r, in3_r, in2_l, in3_l, \ | 755 } |
| 471 out2, out3); \ | 756 #define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__) |
| 472 } | 757 |
| 473 | 758 /* Description : Interleave left half of word elements from vectors |
| 474 #define ILVR_B_4VECS_SB(in0_r, in1_r, in2_r, in3_r, \ | 759 Arguments : Inputs - in0, in1, in2, in3 |
| 475 in0_l, in1_l, in2_l, in3_l, \ | 760 Outputs - out0, out1 |
| 476 out0, out1, out2, out3) { \ | 761 Return Type - as per RTYPE |
| 477 ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ | 762 Details : Left half of word elements of in0 and left half of word |
| 478 out0, out1); \ | 763 elements of in1 are interleaved and copied to out0. |
| 479 ILVR_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \ | 764 Left half of word elements of in2 and left half of word |
| 480 out2, out3); \ | 765 elements of in3 are interleaved and copied to out1. |
| 481 } | 766 */ |
| 482 | 767 #define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \ |
| 483 #define ILVR_B_6VECS_SB(in0_r, in1_r, in2_r, \ | 768 out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ |
| 484 in3_r, in4_r, in5_r, \ | 769 out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \ |
| 485 in0_l, in1_l, in2_l, \ | 770 } |
| 486 in3_l, in4_l, in5_l, \ | 771 #define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__) |
| 487 out0, out1, out2, \ | 772 #define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__) |
| 488 out3, out4, out5) { \ | 773 |
| 489 ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ | 774 /* Description : Interleave right half of byte elements from vectors |
| 490 out0, out1); \ | 775 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 |
| 491 ILVR_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \ | 776 Outputs - out0, out1, out2, out3 |
| 492 out2, out3); \ | 777 Return Type - as per RTYPE |
| 493 ILVR_B_2VECS_SB(in4_r, in5_r, in4_l, in5_l, \ | 778 Details : Right half of byte elements of in0 and right half of byte |
| 494 out4, out5); \ | 779 elements of in1 are interleaved and copied to out0. |
| 495 } | 780 Right half of byte elements of in2 and right half of byte |
| 496 | 781 elements of in3 are interleaved and copied to out1. |
| 497 #define ILVR_B_8VECS_SB(in0_r, in1_r, in2_r, in3_r, \ | 782 Similar for other pairs |
| 498 in4_r, in5_r, in6_r, in7_r, \ | 783 */ |
| 499 in0_l, in1_l, in2_l, in3_l, \ | 784 #define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ |
| 500 in4_l, in5_l, in6_l, in7_l, \ | 785 out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ |
| 501 out0, out1, out2, out3, \ | 786 out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \ |
| 502 out4, out5, out6, out7) { \ | 787 } |
| 503 ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ | 788 #define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__) |
| 504 out0, out1); \ | 789 #define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__) |
| 505 ILVR_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \ | 790 #define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__) |
| 506 out2, out3); \ | 791 #define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__) |
| 507 ILVR_B_2VECS_SB(in4_r, in5_r, in4_l, in5_l, \ | 792 |
| 508 out4, out5); \ | 793 #define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 509 ILVR_B_2VECS_SB(in6_r, in7_r, in6_l, in7_l, \ | 794 out0, out1, out2, out3) { \ |
| 510 out6, out7); \ | 795 ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ |
| 511 } | 796 ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ |
| 512 | 797 } |
| 513 #define ILVL_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ | 798 #define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__) |
| 514 out0, out1) { \ | 799 #define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__) |
| 515 out0 = __msa_ilvl_b((v16i8)(in0_l), (v16i8)(in0_r)); \ | 800 #define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__) |
| 516 out1 = __msa_ilvl_b((v16i8)(in1_l), (v16i8)(in1_r)); \ | 801 #define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__) |
| 517 } | 802 |
| 518 | 803 /* Description : Interleave right half of halfword elements from vectors |
| 519 #define ILVL_B_4VECS_SB(in0_r, in1_r, in2_r, in3_r, \ | 804 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 |
| 520 in0_l, in1_l, in2_l, in3_l, \ | 805 Outputs - out0, out1, out2, out3 |
| 521 out0, out1, out2, out3) { \ | 806 Return Type - signed halfword |
| 522 ILVL_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ | 807 Details : Right half of halfword elements of in0 and right half of |
| 523 out0, out1); \ | 808 halfword elements of in1 are interleaved and copied to out0. |
| 524 ILVL_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \ | 809 Right half of halfword elements of in2 and right half of |
| 525 out2, out3); \ | 810 halfword elements of in3 are interleaved and copied to out1. |
| 526 } | 811 Similar for other pairs |
| 527 | 812 */ |
| 528 #define ILVL_B_6VECS_SB(in0_r, in1_r, in2_r, \ | 813 #define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ |
| 529 in3_r, in4_r, in5_r, \ | 814 out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ |
| 530 in0_l, in1_l, in2_l, \ | 815 out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \ |
| 531 in3_l, in4_l, in5_l, \ | 816 } |
| 532 out0, out1, out2, \ | 817 #define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__) |
| 533 out3, out4, out5) { \ | 818 |
| 534 ILVL_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \ | 819 #define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 535 out0, out1); \ | 820 out0, out1, out2, out3) { \ |
| 536 ILVL_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \ | 821 ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ |
| 537 out2, out3); \ | 822 ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ |
| 538 ILVL_B_2VECS_SB(in4_r, in5_r, in4_l, in5_l, \ | 823 } |
| 539 out4, out5); \ | 824 #define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__) |
| 540 } | 825 |
| 541 | 826 #define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \ |
| 542 #define ILVR_D_2VECS_SB(out0, in0_l, in0_r, \ | 827 out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ |
| 543 out1, in1_l, in1_r) { \ | 828 out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \ |
| 544 out0 = (v16i8)__msa_ilvr_d((v2i64)(in0_l), (v2i64)(in0_r)); \ | 829 } |
| 545 out1 = (v16i8)__msa_ilvr_d((v2i64)(in1_l), (v2i64)(in1_r)); \ | 830 #define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__) |
| 546 } | 831 #define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__) |
| 547 | 832 |
| 548 #define ILVR_D_3VECS_SB(out0, in0_l, in0_r, \ | 833 /* Description : Interleave right half of double word elements from vectors |
| 549 out1, in1_l, in1_r, \ | 834 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 |
| 550 out2, in2_l, in2_r) { \ | 835 Outputs - out0, out1, out2, out3 |
| 551 ILVR_D_2VECS_SB(out0, in0_l, in0_r, \ | 836 Return Type - unsigned double word |
| 552 out1, in1_l, in1_r); \ | 837 Details : Right half of double word elements of in0 and right half of |
| 553 out2 = (v16i8)__msa_ilvr_d((v2i64)(in2_l), (v2i64)(in2_r)); \ | 838 double word elements of in1 are interleaved and copied to out0. |
| 554 } | 839 Right half of double word elements of in2 and right half of |
| 555 | 840 double word elements of in3 are interleaved and copied to out1. |
| 556 #define ILVR_D_4VECS_SB(out0, in0_l, in0_r, \ | 841 */ |
| 557 out1, in1_l, in1_r, \ | 842 #define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \ |
| 558 out2, in2_l, in2_r, \ | 843 out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \ |
| 559 out3, in3_l, in3_r) { \ | 844 out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \ |
| 560 ILVR_D_2VECS_SB(out0, in0_l, in0_r, \ | 845 } |
| 561 out1, in1_l, in1_r); \ | 846 #define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__) |
| 562 ILVR_D_2VECS_SB(out2, in2_l, in2_r, \ | 847 #define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__) |
| 563 out3, in3_l, in3_r); \ | 848 #define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__) |
| 564 } | 849 |
| 565 | 850 #define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) { \ |
| 566 #define DOTP_S_W_4VECS_SW(m0, c0, m1, c1, \ | 851 ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ |
| 567 m2, c2, m3, c3, \ | 852 out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5)); \ |
| 568 out0, out1, out2, out3) { \ | 853 } |
| 569 out0 = __msa_dotp_s_w((v8i16)(m0), (v8i16)(c0)); \ | 854 #define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__) |
| 570 out1 = __msa_dotp_s_w((v8i16)(m1), (v8i16)(c1)); \ | 855 |
| 571 out2 = __msa_dotp_s_w((v8i16)(m2), (v8i16)(c2)); \ | 856 #define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 572 out3 = __msa_dotp_s_w((v8i16)(m3), (v8i16)(c3)); \ | 857 out0, out1, out2, out3) { \ |
| 573 } | 858 ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ |
| 574 | 859 ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ |
| 575 #define PCKEV_H_2VECS_SH(in0_l, in0_r, in1_l, in1_r, \ | 860 } |
| 576 out0, out1) { \ | 861 #define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__) |
| 577 out0 = __msa_pckev_h((v8i16)(in0_l), (v8i16)(in0_r)); \ | 862 #define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__) |
| 578 out1 = __msa_pckev_h((v8i16)(in1_l), (v8i16)(in1_r)); \ | 863 |
| 579 } | 864 /* Description : Interleave both left and right half of input vectors |
| 580 | 865 Arguments : Inputs - in0, in1 |
| 581 #define XORI_B_2VECS_UB(val0, val1, \ | 866 Outputs - out0, out1 |
| 582 out0, out1, xor_val) { \ | 867 Return Type - as per RTYPE |
| 583 out0 = __msa_xori_b((v16u8)(val0), (xor_val)); \ | 868 Details : Right half of byte elements from 'in0' and 'in1' are |
| 584 out1 = __msa_xori_b((v16u8)(val1), (xor_val)); \ | 869 interleaved and stored to 'out0' |
| 585 } | 870 Left half of byte elements from 'in0' and 'in1' are |
| 586 | 871 interleaved and stored to 'out1' |
| 587 #define XORI_B_2VECS_SB(val0, val1, \ | 872 */ |
| 588 out0, out1, xor_val) { \ | 873 #define ILVRL_B2(RTYPE, in0, in1, out0, out1) { \ |
| 589 out0 = (v16i8)__msa_xori_b((v16u8)(val0), (xor_val)); \ | 874 out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ |
| 590 out1 = (v16i8)__msa_xori_b((v16u8)(val1), (xor_val)); \ | 875 out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ |
| 591 } | 876 } |
| 592 | 877 #define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__) |
| 593 #define XORI_B_3VECS_SB(val0, val1, val2, \ | 878 #define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__) |
| 594 out0, out1, out2, xor_val) { \ | 879 #define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__) |
| 595 XORI_B_2VECS_SB(val0, val1, out0, out1, xor_val); \ | 880 #define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__) |
| 596 out2 = (v16i8)__msa_xori_b((v16u8)(val2), (xor_val)); \ | 881 |
| 597 } | 882 #define ILVRL_H2(RTYPE, in0, in1, out0, out1) { \ |
| 598 | 883 out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ |
| 599 #define XORI_B_4VECS_UB(val0, val1, val2, val3, \ | 884 out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ |
| 600 out0, out1, out2, out3, \ | 885 } |
| 601 xor_val) { \ | 886 #define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__) |
| 602 XORI_B_2VECS_UB(val0, val1, out0, out1, xor_val); \ | 887 #define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__) |
| 603 XORI_B_2VECS_UB(val2, val3, out2, out3, xor_val); \ | 888 |
| 604 } | 889 #define ILVRL_W2(RTYPE, in0, in1, out0, out1) { \ |
| 605 | 890 out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ |
| 606 #define XORI_B_4VECS_SB(val0, val1, val2, val3, \ | 891 out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ |
| 607 out0, out1, out2, out3, \ | 892 } |
| 608 xor_val) { \ | 893 #define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__) |
| 609 XORI_B_2VECS_SB(val0, val1, out0, out1, xor_val); \ | 894 #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__) |
| 610 XORI_B_2VECS_SB(val2, val3, out2, out3, xor_val); \ | 895 |
| 611 } | 896 /* Description : Saturate the halfword element values to the max |
| 612 | 897 unsigned value of (sat_val+1 bits) |
| 613 #define XORI_B_7VECS_SB(val0, val1, val2, val3, \ | 898 The element data width remains unchanged |
| 614 val4, val5, val6, \ | 899 Arguments : Inputs - in0, in1, in2, in3, sat_val |
| 615 out0, out1, out2, out3, \ | 900 Outputs - in0, in1, in2, in3 (in place) |
| 616 out4, out5, out6, \ | 901 Return Type - unsigned halfword |
| 617 xor_val) { \ | 902 Details : Each unsigned halfword element from 'in0' is saturated to the |
| 618 XORI_B_4VECS_SB(val0, val1, val2, val3, \ | 903 value generated with (sat_val+1) bit range |
| 619 out0, out1, out2, out3, xor_val); \ | 904 Results are in placed to original vectors |
| 620 XORI_B_3VECS_SB(val4, val5, val6, \ | 905 */ |
| 621 out4, out5, out6, xor_val); \ | 906 #define SAT_UH2(RTYPE, in0, in1, sat_val) { \ |
| 622 } | 907 in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \ |
| 623 | 908 in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \ |
| 624 #define SRARI_H_4VECS_UH(val0, val1, val2, val3, \ | 909 } |
| 625 out0, out1, out2, out3, \ | 910 #define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__) |
| 626 shift_right_val) { \ | 911 |
| 627 out0 = (v8u16)__msa_srari_h((v8i16)(val0), (shift_right_val)); \ | 912 #define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) { \ |
| 628 out1 = (v8u16)__msa_srari_h((v8i16)(val1), (shift_right_val)); \ | 913 SAT_UH2(RTYPE, in0, in1, sat_val); \ |
| 629 out2 = (v8u16)__msa_srari_h((v8i16)(val2), (shift_right_val)); \ | 914 SAT_UH2(RTYPE, in2, in3, sat_val) \ |
| 630 out3 = (v8u16)__msa_srari_h((v8i16)(val3), (shift_right_val)); \ | 915 } |
| 631 } | 916 #define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__) |
| 632 | 917 |
| 633 #define SRARI_H_4VECS_SH(val0, val1, val2, val3, \ | 918 /* Description : Saturate the halfword element values to the max |
| 634 out0, out1, out2, out3, \ | 919 signed value of (sat_val+1 bits) |
| 635 shift_right_val) { \ | 920 The element data width remains unchanged |
| 636 out0 = __msa_srari_h((v8i16)(val0), (shift_right_val)); \ | 921 Arguments : Inputs - in0, in1, in2, in3, sat_val |
| 637 out1 = __msa_srari_h((v8i16)(val1), (shift_right_val)); \ | 922 Outputs - in0, in1, in2, in3 (in place) |
| 638 out2 = __msa_srari_h((v8i16)(val2), (shift_right_val)); \ | 923 Return Type - signed halfword |
| 639 out3 = __msa_srari_h((v8i16)(val3), (shift_right_val)); \ | 924 Details : Each signed halfword element from 'in0' is saturated to the |
| 640 } | 925 value generated with (sat_val+1) bit range |
| 641 | 926 Results are written in place to the original vectors |
| 642 #define SRARI_W_4VECS_SW(val0, val1, val2, val3, \ | 927 */ |
| 643 out0, out1, out2, out3, \ | 928 #define SAT_SH2(RTYPE, in0, in1, sat_val) { \ |
| 644 shift_right_val) { \ | 929 in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \ |
| 645 out0 = __msa_srari_w((v4i32)(val0), (shift_right_val)); \ | 930 in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \ |
| 646 out1 = __msa_srari_w((v4i32)(val1), (shift_right_val)); \ | 931 } |
| 647 out2 = __msa_srari_w((v4i32)(val2), (shift_right_val)); \ | 932 #define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__) |
| 648 out3 = __msa_srari_w((v4i32)(val3), (shift_right_val)); \ | 933 |
| 649 } | 934 #define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) { \ |
| 650 | 935 SAT_SH2(RTYPE, in0, in1, sat_val); \ |
| 651 #define SRARI_SATURATE_UNSIGNED_H(input, right_shift_val, sat_val) ({ \ | 936 SAT_SH2(RTYPE, in2, in3, sat_val); \ |
| 652 v8u16 out_m; \ | 937 } |
| 938 #define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__) |
| 939 |
| 940 /* Description : Indexed halfword element values are replicated to all |
| 941 elements in output vector |
| 942 Arguments : Inputs - in, idx0, idx1 |
| 943 Outputs - out0, out1 |
| 944 Return Type - as per RTYPE |
| 945 Details : 'idx0' element value from 'in' vector is replicated to all |
| 946 elements in 'out0' vector |
| 947 Valid index range for halfword operation is 0-7 |
| 948 */ |
| 949 #define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) { \ |
| 950 out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0); \ |
| 951 out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1); \ |
| 952 } |
| 953 #define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__) |
| 954 |
| 955 #define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, \ |
| 956 out0, out1, out2, out3) { \ |
| 957 SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \ |
| 958 SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \ |
| 959 } |
| 960 #define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__) |
| 961 #define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__) |
| 962 |
| 963 /* Description : Pack even byte elements of vector pairs |
| 964 Arguments : Inputs - in0, in1, in2, in3 |
| 965 Outputs - out0, out1 |
| 966 Return Type - as per RTYPE |
| 967 Details : Even byte elements of in0 are copied to the left half of |
| 968 out0 & even byte elements of in1 are copied to the right |
| 969 half of out0. |
| 970 Even byte elements of in2 are copied to the left half of |
| 971 out1 & even byte elements of in3 are copied to the right |
| 972 half of out1. |
| 973 */ |
| 974 #define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ |
| 975 out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ |
| 976 out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \ |
| 977 } |
| 978 #define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__) |
| 979 #define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__) |
| 980 #define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__) |
| 981 |
| 982 #define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 983 out0, out1, out2, out3) { \ |
| 984 PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ |
| 985 PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ |
| 986 } |
| 987 #define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__) |
| 988 #define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__) |
| 989 #define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__) |
| 990 |
| 991 /* Description : Pack even halfword elements of vector pairs |
| 992 Arguments : Inputs - in0, in1, in2, in3 |
| 993 Outputs - out0, out1 |
| 994 Return Type - as per RTYPE |
| 995 Details : Even halfword elements of in0 are copied to the left half of |
| 996 out0 & even halfword elements of in1 are copied to the right |
| 997 half of out0. |
| 998 Even halfword elements of in2 are copied to the left half of |
| 999 out1 & even halfword elements of in3 are copied to the right |
| 1000 half of out1. |
| 1001 */ |
| 1002 #define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ |
| 1003 out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \ |
| 1004 out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \ |
| 1005 } |
| 1006 #define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__) |
| 1007 #define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__) |
| 1008 |
| 1009 #define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 1010 out0, out1, out2, out3) { \ |
| 1011 PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ |
| 1012 PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ |
| 1013 } |
| 1014 #define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__) |
| 1015 |
| 1016 /* Description : Pack even double word elements of vector pairs |
| 1017 Arguments : Inputs - in0, in1, in2, in3 |
| 1018 Outputs - out0, out1 |
| 1019 Return Type - as per RTYPE |
| 1020 Details : Even double word elements of in0 are copied to the left half of |
| 1021 out0 & even double word elements of in1 are copied to the right |
| 1022 half of out0. |
| 1023 Even double word elements of in2 are copied to the left half of |
| 1024 out1 & even double word elements of in3 are copied to the right |
| 1025 half of out1. |
| 1026 */ |
| 1027 #define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \ |
| 1028 out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \ |
| 1029 out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \ |
| 1030 } |
| 1031 #define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__) |
| 1032 #define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__) |
| 1033 |
| 1034 #define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 1035 out0, out1, out2, out3) { \ |
| 1036 PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ |
| 1037 PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ |
| 1038 } |
| 1039 #define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__) |
| 1040 |
| 1041 /* Description : Each byte element is logically xor'ed with immediate 128 |
| 1042 Arguments : Inputs - in0, in1 |
| 1043 Outputs - in0, in1 (in-place) |
| 1044 Return Type - as per RTYPE |
| 1045 Details : Each unsigned byte element from input vector 'in0' is |
| 1046 logically xor'ed with 128 and result is in-place stored in |
| 1047 'in0' vector |
| 1048 Each unsigned byte element from input vector 'in1' is |
| 1049 logically xor'ed with 128 and result is in-place stored in |
| 1050 'in1' vector |
| 1051 Similar for other pairs |
| 1052 */ |
| 1053 #define XORI_B2_128(RTYPE, in0, in1) { \ |
| 1054 in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \ |
| 1055 in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \ |
| 1056 } |
| 1057 #define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__) |
| 1058 #define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__) |
| 1059 |
| 1060 #define XORI_B3_128(RTYPE, in0, in1, in2) { \ |
| 1061 XORI_B2_128(RTYPE, in0, in1); \ |
| 1062 in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \ |
| 1063 } |
| 1064 #define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__) |
| 1065 |
| 1066 #define XORI_B4_128(RTYPE, in0, in1, in2, in3) { \ |
| 1067 XORI_B2_128(RTYPE, in0, in1); \ |
| 1068 XORI_B2_128(RTYPE, in2, in3); \ |
| 1069 } |
| 1070 #define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__) |
| 1071 #define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__) |
| 1072 |
| 1073 #define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) { \ |
| 1074 XORI_B4_128(RTYPE, in0, in1, in2, in3); \ |
| 1075 XORI_B3_128(RTYPE, in4, in5, in6); \ |
| 1076 } |
| 1077 #define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__) |
| 1078 |
| 1079 /* Description : Addition of signed halfword elements and signed saturation |
| 1080 Arguments : Inputs - in0, in1, in2, in3 |
| 1081 Outputs - out0, out1 |
| 1082 Return Type - as per RTYPE |
| 1083 Details : Signed halfword elements from 'in0' are added to signed |
| 1084 halfword elements of 'in1'. The result is then signed saturated |
| 1085 between -32768 to +32767 (as per halfword data type) |
| 1086 Similar for other pairs |
| 1087 */ |
| 1088 #define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) { \ |
| 1089 out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \ |
| 1090 out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3); \ |
| 1091 } |
| 1092 #define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__) |
| 1093 |
| 1094 #define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 1095 out0, out1, out2, out3) { \ |
| 1096 ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \ |
| 1097 ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \ |
| 1098 } |
| 1099 #define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__) |
| 1100 |
| 1101 /* Description : Shift right arithmetic rounded (immediate) |
| 1102 Arguments : Inputs - in0, in1, in2, in3, shift |
| 1103 Outputs - in0, in1, in2, in3 (in place) |
| 1104 Return Type - as per RTYPE |
| 1105 Details : Each element of vector 'in0' is shifted right arithmetic by |
| 1106 value in 'shift'. |
| 1107 The last discarded bit is added to shifted value for rounding |
| 1108 and the result is in place written to 'in0' |
| 1109 Similar for other pairs |
| 1110 */ |
| 1111 #define SRARI_H2(RTYPE, in0, in1, shift) { \ |
| 1112 in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \ |
| 1113 in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \ |
| 1114 } |
| 1115 #define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__) |
| 1116 #define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__) |
| 1117 |
| 1118 #define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) { \ |
| 1119 SRARI_H2(RTYPE, in0, in1, shift); \ |
| 1120 SRARI_H2(RTYPE, in2, in3, shift); \ |
| 1121 } |
| 1122 #define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__) |
| 1123 #define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__) |
| 1124 |
| 1125 /* Description : Shift right arithmetic rounded (immediate) |
| 1126 Arguments : Inputs - in0, in1, shift |
| 1127 Outputs - in0, in1 (in place) |
| 1128 Return Type - as per RTYPE |
| 1129 Details : Each element of vector 'in0' is shifted right arithmetic by |
| 1130 value in 'shift'. |
| 1131 The last discarded bit is added to shifted value for rounding |
| 1132 and the result is in place written to 'in0' |
| 1133 Similar for other pairs |
| 1134 */ |
| 1135 #define SRARI_W2(RTYPE, in0, in1, shift) { \ |
| 1136 in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \ |
| 1137 in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \ |
| 1138 } |
| 1139 #define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__) |
| 1140 |
| 1141 #define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) { \ |
| 1142 SRARI_W2(RTYPE, in0, in1, shift); \ |
| 1143 SRARI_W2(RTYPE, in2, in3, shift); \ |
| 1144 } |
| 1145 #define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__) |
| 1146 |
| 1147 /* Description : Addition of 2 pairs of vectors |
| 1148 Arguments : Inputs - in0, in1, in2, in3 |
| 1149 Outputs - out0, out1 |
| 1150 Details : Each element from 2 pairs of vectors is added and 2 results are |
| 1151 produced |
| 1152 */ |
| 1153 #define ADD2(in0, in1, in2, in3, out0, out1) { \ |
| 1154 out0 = in0 + in1; \ |
| 1155 out1 = in2 + in3; \ |
| 1156 } |
| 1157 #define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 1158 out0, out1, out2, out3) { \ |
| 1159 ADD2(in0, in1, in2, in3, out0, out1); \ |
| 1160 ADD2(in4, in5, in6, in7, out2, out3); \ |
| 1161 } |
| 1162 |
| 1163 /* Description : Subtraction of 2 pairs of vectors |
| 1164 Arguments : Inputs - in0, in1, in2, in3 |
| 1165 Outputs - out0, out1 |
| 1166 Details : Each element from 2 pairs of vectors is subtracted and 2 results |
| 1167 are produced |
| 1168 */ |
| 1169 #define SUB2(in0, in1, in2, in3, out0, out1) { \ |
| 1170 out0 = in0 - in1; \ |
| 1171 out1 = in2 - in3; \ |
| 1172 } |
| 1173 #define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 1174 out0, out1, out2, out3) { \ |
| 1175 out0 = in0 - in1; \ |
| 1176 out1 = in2 - in3; \ |
| 1177 out2 = in4 - in5; \ |
| 1178 out3 = in6 - in7; \ |
| 1179 } |
| 1180 |
| 1181 /* Description : Zero extend unsigned byte elements to halfword elements |
| 1182 Arguments : Inputs - in (1 input unsigned byte vector) |
| 1183 Outputs - out0, out1 (2 halfword vectors, zero extended) |
| 1184 Return Type - signed halfword |
| 1185 Details : Zero extended right half of vector is returned in 'out0' |
| 1186 Zero extended left half of vector is returned in 'out1' |
| 1187 */ |
| 1188 #define UNPCK_UB_SH(in, out0, out1) { \ |
| 1189 v16i8 zero_m = { 0 }; \ |
| 1190 \ |
| 1191 ILVRL_B2_SH(zero_m, in, out0, out1); \ |
| 1192 } |
| 1193 |
| 1194 /* Description : Butterfly of 4 input vectors |
| 1195 Arguments : Inputs - in0, in1, in2, in3 |
| 1196 Outputs - out0, out1, out2, out3 |
| 1197 Details : Butterfly operation |
| 1198 */ |
| 1199 #define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) { \ |
| 1200 out0 = in0 + in3; \ |
| 1201 out1 = in1 + in2; \ |
| 1202 \ |
| 1203 out2 = in1 - in2; \ |
| 1204 out3 = in0 - in3; \ |
| 1205 } |
| 1206 |
| 1207 /* Description : Butterfly of 8 input vectors |
| 1208 Arguments : Inputs - in0 ... in7 |
| 1209 Outputs - out0 .. out7 |
| 1210 Details : Butterfly operation |
| 1211 */ |
| 1212 #define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 1213 out0, out1, out2, out3, out4, out5, out6, out7) { \ |
| 1214 out0 = in0 + in7; \ |
| 1215 out1 = in1 + in6; \ |
| 1216 out2 = in2 + in5; \ |
| 1217 out3 = in3 + in4; \ |
| 653 \ | 1218 \ |
| 654 out_m = (v8u16)__msa_srari_h((v8i16)(input), (right_shift_val)); \ | 1219 out4 = in3 - in4; \ |
| 655 out_m = __msa_sat_u_h(out_m, (sat_val)); \ | 1220 out5 = in2 - in5; \ |
| 656 out_m; \ | 1221 out6 = in1 - in6; \ |
| 1222 out7 = in0 - in7; \ |
| 1223 } |
| 1224 |
| 1225 /* Description : Transposes 4x8 block with half word elements in vectors |
| 1226 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 |
| 1227 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 |
| 1228 Return Type - signed halfword |
| 1229 Details : out4, out5, out6 and out7 are set to zero |
| 1230 */ |
| 1231 #define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 1232 out0, out1, out2, out3, out4, out5, out6, out7) { \ |
| 1233 v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ |
| 1234 v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \ |
| 1235 v8i16 zero_m = { 0 }; \ |
| 1236 \ |
| 1237 ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, \ |
| 1238 tmp0_n, tmp1_n, tmp2_n, tmp3_n); \ |
| 1239 ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m); \ |
| 1240 ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m); \ |
| 1241 \ |
| 1242 out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m); \ |
| 1243 out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m); \ |
| 1244 out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m); \ |
| 1245 out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m); \ |
| 1246 \ |
| 1247 out4 = zero_m; \ |
| 1248 out5 = zero_m; \ |
| 1249 out6 = zero_m; \ |
| 1250 out7 = zero_m; \ |
| 1251 } |
| 1252 |
| 1253 /* Description : Transposes 8x4 block with half word elements in vectors |
| 1254 Arguments : Inputs - in0, in1, in2, in3 |
| 1255 Outputs - out0, out1, out2, out3 |
| 1256 Return Type - signed halfword |
| 1257 Details : |
| 1258 */ |
| 1259 #define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) { \ |
| 1260 v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ |
| 1261 \ |
| 1262 ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m); \ |
| 1263 ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m); \ |
| 1264 ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \ |
| 1265 ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \ |
| 1266 } |
| 1267 |
| 1268 /* Description : Transposes 8x8 block with half word elements in vectors |
| 1269 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 |
| 1270 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 |
| 1271 Return Type - as per RTYPE |
| 1272 Details : |
| 1273 */ |
| 1274 #define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 1275 out0, out1, out2, out3, out4, out5, out6, out7) { \ |
| 1276 v8i16 s0_m, s1_m; \ |
| 1277 v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ |
| 1278 v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ |
| 1279 \ |
| 1280 ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ |
| 1281 ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \ |
| 1282 ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ |
| 1283 ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \ |
| 1284 ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ |
| 1285 ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \ |
| 1286 ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ |
| 1287 ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \ |
| 1288 PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, \ |
| 1289 tmp3_m, tmp7_m, out0, out2, out4, out6); \ |
| 1290 out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \ |
| 1291 out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \ |
| 1292 out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \ |
| 1293 out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \ |
| 1294 } |
| 1295 #define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__) |
| 1296 |
| 1297 /* Description : Pack even elements of input vectors & xor with 128 |
| 1298 Arguments : Inputs - in0, in1 |
| 1299 Outputs - out_m |
| 1300 Return Type - unsigned byte |
| 1301 Details : Signed byte even elements from 'in0' and 'in1' are packed |
| 1302 together in one vector and the resulting vector is xor'ed with |
| 1303 128 to shift the range from signed to unsigned byte |
| 1304 */ |
| 1305 #define PCKEV_XORI128_UB(in0, in1) ({ \ |
| 1306 v16u8 out_m; \ |
| 1307 \ |
| 1308 out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \ |
| 1309 out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \ |
| 1310 out_m; \ |
| 657 }) | 1311 }) |
| 658 | 1312 |
| 659 #define SRARI_SATURATE_SIGNED_H(input, right_shift_val, sat_val) ({ \ | 1313 /* Description : Pack even byte elements and store byte vector in destination |
| 660 v8i16 out_m; \ | 1314 memory |
| 661 \ | 1315 Arguments : Inputs - in0, in1, pdst |
| 662 out_m = __msa_srari_h((v8i16)(input), (right_shift_val)); \ | 1316 */ |
| 663 out_m = __msa_sat_s_h(out_m, (sat_val)); \ | 1317 #define PCKEV_ST_SB(in0, in1, pdst) { \ |
| 664 out_m; \ | 1318 v16i8 tmp_m; \ |
| 1319 \ |
| 1320 tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \ |
| 1321 ST_SB(tmp_m, (pdst)); \ |
| 1322 } |
| 1323 |
| 1324 /* Description : Horizontal 2 tap filter kernel code |
| 1325 Arguments : Inputs - in0, in1, mask, coeff, shift |
| 1326 */ |
| 1327 #define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) ({ \ |
| 1328 v16i8 tmp0_m; \ |
| 1329 v8u16 tmp1_m; \ |
| 1330 \ |
| 1331 tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \ |
| 1332 tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff); \ |
| 1333 tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \ |
| 1334 tmp1_m = __msa_sat_u_h(tmp1_m, shift); \ |
| 1335 \ |
| 1336 tmp1_m; \ |
| 665 }) | 1337 }) |
| 666 | |
| 667 #define PCKEV_2B_XORI128_STORE_4_BYTES_4(in1, in2, \ | |
| 668 pdst, stride) { \ | |
| 669 uint32_t out0_m, out1_m, out2_m, out3_m; \ | |
| 670 v16i8 tmp0_m; \ | |
| 671 uint8_t *dst_m = (uint8_t *)(pdst); \ | |
| 672 \ | |
| 673 tmp0_m = __msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \ | |
| 674 tmp0_m = (v16i8)__msa_xori_b((v16u8)tmp0_m, 128); \ | |
| 675 \ | |
| 676 out0_m = __msa_copy_u_w((v4i32)tmp0_m, 0); \ | |
| 677 out1_m = __msa_copy_u_w((v4i32)tmp0_m, 1); \ | |
| 678 out2_m = __msa_copy_u_w((v4i32)tmp0_m, 2); \ | |
| 679 out3_m = __msa_copy_u_w((v4i32)tmp0_m, 3); \ | |
| 680 \ | |
| 681 STORE_WORD(dst_m, out0_m); \ | |
| 682 dst_m += stride; \ | |
| 683 STORE_WORD(dst_m, out1_m); \ | |
| 684 dst_m += stride; \ | |
| 685 STORE_WORD(dst_m, out2_m); \ | |
| 686 dst_m += stride; \ | |
| 687 STORE_WORD(dst_m, out3_m); \ | |
| 688 } | |
| 689 | |
| 690 #define PCKEV_B_4_XORI128_STORE_8_BYTES_4(in1, in2, \ | |
| 691 in3, in4, \ | |
| 692 pdst, stride) { \ | |
| 693 uint64_t out0_m, out1_m, out2_m, out3_m; \ | |
| 694 v16i8 tmp0_m, tmp1_m; \ | |
| 695 uint8_t *dst_m = (uint8_t *)(pdst); \ | |
| 696 \ | |
| 697 tmp0_m = __msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \ | |
| 698 tmp1_m = __msa_pckev_b((v16i8)(in4), (v16i8)(in3)); \ | |
| 699 \ | |
| 700 tmp0_m = (v16i8)__msa_xori_b((v16u8)tmp0_m, 128); \ | |
| 701 tmp1_m = (v16i8)__msa_xori_b((v16u8)tmp1_m, 128); \ | |
| 702 \ | |
| 703 out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \ | |
| 704 out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \ | |
| 705 out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \ | |
| 706 out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \ | |
| 707 \ | |
| 708 STORE_DWORD(dst_m, out0_m); \ | |
| 709 dst_m += stride; \ | |
| 710 STORE_DWORD(dst_m, out1_m); \ | |
| 711 dst_m += stride; \ | |
| 712 STORE_DWORD(dst_m, out2_m); \ | |
| 713 dst_m += stride; \ | |
| 714 STORE_DWORD(dst_m, out3_m); \ | |
| 715 } | |
| 716 | |
| 717 /* Only for signed vecs */ | |
| 718 #define PCKEV_B_XORI128_STORE_VEC(in1, in2, pdest) { \ | |
| 719 v16i8 tmp_m; \ | |
| 720 \ | |
| 721 tmp_m = __msa_pckev_b((v16i8)(in1), (v16i8)(in2)); \ | |
| 722 tmp_m = (v16i8)__msa_xori_b((v16u8)tmp_m, 128); \ | |
| 723 STORE_SB(tmp_m, (pdest)); \ | |
| 724 } | |
| 725 | |
| 726 /* Only for signed vecs */ | |
| 727 #define PCKEV_B_4_XORI128_AVG_STORE_8_BYTES_4(in1, dst0, \ | |
| 728 in2, dst1, \ | |
| 729 in3, dst2, \ | |
| 730 in4, dst3, \ | |
| 731 pdst, stride) { \ | |
| 732 uint64_t out0_m, out1_m, out2_m, out3_m; \ | |
| 733 v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ | |
| 734 uint8_t *dst_m = (uint8_t *)(pdst); \ | |
| 735 \ | |
| 736 tmp0_m = (v16u8)__msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \ | |
| 737 tmp1_m = (v16u8)__msa_pckev_b((v16i8)(in4), (v16i8)(in3)); \ | |
| 738 \ | |
| 739 tmp2_m = (v16u8)__msa_ilvr_d((v2i64)(dst1), (v2i64)(dst0)); \ | |
| 740 tmp3_m = (v16u8)__msa_ilvr_d((v2i64)(dst3), (v2i64)(dst2)); \ | |
| 741 \ | |
| 742 tmp0_m = __msa_xori_b(tmp0_m, 128); \ | |
| 743 tmp1_m = __msa_xori_b(tmp1_m, 128); \ | |
| 744 \ | |
| 745 tmp0_m = __msa_aver_u_b(tmp0_m, tmp2_m); \ | |
| 746 tmp1_m = __msa_aver_u_b(tmp1_m, tmp3_m); \ | |
| 747 \ | |
| 748 out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \ | |
| 749 out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \ | |
| 750 out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \ | |
| 751 out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \ | |
| 752 \ | |
| 753 STORE_DWORD(dst_m, out0_m); \ | |
| 754 dst_m += stride; \ | |
| 755 STORE_DWORD(dst_m, out1_m); \ | |
| 756 dst_m += stride; \ | |
| 757 STORE_DWORD(dst_m, out2_m); \ | |
| 758 dst_m += stride; \ | |
| 759 STORE_DWORD(dst_m, out3_m); \ | |
| 760 } | |
| 761 | |
| 762 /* Only for signed vecs */ | |
| 763 #define PCKEV_B_XORI128_AVG_STORE_VEC(in1, in2, dst, pdest) { \ | |
| 764 v16u8 tmp_m; \ | |
| 765 \ | |
| 766 tmp_m = (v16u8)__msa_pckev_b((v16i8)(in1), (v16i8)(in2)); \ | |
| 767 tmp_m = __msa_xori_b(tmp_m, 128); \ | |
| 768 tmp_m = __msa_aver_u_b(tmp_m, (v16u8)(dst)); \ | |
| 769 STORE_UB(tmp_m, (pdest)); \ | |
| 770 } | |
| 771 | |
| 772 #define PCKEV_B_STORE_8_BYTES_4(in1, in2, in3, in4, \ | |
| 773 pdst, stride) { \ | |
| 774 uint64_t out0_m, out1_m, out2_m, out3_m; \ | |
| 775 v16i8 tmp0_m, tmp1_m; \ | |
| 776 uint8_t *dst_m = (uint8_t *)(pdst); \ | |
| 777 \ | |
| 778 tmp0_m = __msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \ | |
| 779 tmp1_m = __msa_pckev_b((v16i8)(in4), (v16i8)(in3)); \ | |
| 780 \ | |
| 781 out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \ | |
| 782 out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \ | |
| 783 out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \ | |
| 784 out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \ | |
| 785 \ | |
| 786 STORE_DWORD(dst_m, out0_m); \ | |
| 787 dst_m += stride; \ | |
| 788 STORE_DWORD(dst_m, out1_m); \ | |
| 789 dst_m += stride; \ | |
| 790 STORE_DWORD(dst_m, out2_m); \ | |
| 791 dst_m += stride; \ | |
| 792 STORE_DWORD(dst_m, out3_m); \ | |
| 793 } | |
| 794 | |
| 795 /* Only for unsigned vecs */ | |
| 796 #define PCKEV_B_STORE_VEC(in1, in2, pdest) { \ | |
| 797 v16i8 tmp_m; \ | |
| 798 \ | |
| 799 tmp_m = __msa_pckev_b((v16i8)(in1), (v16i8)(in2)); \ | |
| 800 STORE_SB(tmp_m, (pdest)); \ | |
| 801 } | |
| 802 | |
| 803 #define PCKEV_B_AVG_STORE_8_BYTES_4(in1, dst0, in2, dst1, \ | |
| 804 in3, dst2, in4, dst3, \ | |
| 805 pdst, stride) { \ | |
| 806 uint64_t out0_m, out1_m, out2_m, out3_m; \ | |
| 807 v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ | |
| 808 uint8_t *dst_m = (uint8_t *)(pdst); \ | |
| 809 \ | |
| 810 tmp0_m = (v16u8)__msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \ | |
| 811 tmp1_m = (v16u8)__msa_pckev_b((v16i8)(in4), (v16i8)(in3)); \ | |
| 812 \ | |
| 813 tmp2_m = (v16u8)__msa_pckev_d((v2i64)(dst1), (v2i64)(dst0)); \ | |
| 814 tmp3_m = (v16u8)__msa_pckev_d((v2i64)(dst3), (v2i64)(dst2)); \ | |
| 815 \ | |
| 816 tmp0_m = __msa_aver_u_b(tmp0_m, tmp2_m); \ | |
| 817 tmp1_m = __msa_aver_u_b(tmp1_m, tmp3_m); \ | |
| 818 \ | |
| 819 out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \ | |
| 820 out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \ | |
| 821 out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \ | |
| 822 out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \ | |
| 823 \ | |
| 824 STORE_DWORD(dst_m, out0_m); \ | |
| 825 dst_m += stride; \ | |
| 826 STORE_DWORD(dst_m, out1_m); \ | |
| 827 dst_m += stride; \ | |
| 828 STORE_DWORD(dst_m, out2_m); \ | |
| 829 dst_m += stride; \ | |
| 830 STORE_DWORD(dst_m, out3_m); \ | |
| 831 } | |
| 832 | |
| 833 #define PCKEV_B_AVG_STORE_VEC(in1, in2, dst, pdest) { \ | |
| 834 v16u8 tmp_m; \ | |
| 835 \ | |
| 836 tmp_m = (v16u8)__msa_pckev_b((v16i8)(in1), (v16i8)(in2)); \ | |
| 837 tmp_m = __msa_aver_u_b(tmp_m, (v16u8)(dst)); \ | |
| 838 STORE_UB(tmp_m, (pdest)); \ | |
| 839 } | |
| 840 | |
| 841 /* Generic for Vector types and GP operations */ | |
| 842 #define BUTTERFLY_4(in0, in1, in2, in3, \ | |
| 843 out0, out1, out2, out3) { \ | |
| 844 out0 = (in0) + (in3); \ | |
| 845 out1 = (in1) + (in2); \ | |
| 846 \ | |
| 847 out2 = (in1) - (in2); \ | |
| 848 out3 = (in0) - (in3); \ | |
| 849 } | |
| 850 | |
| 851 /* Generic for Vector types and GP operations */ | |
| 852 #define BUTTERFLY_8(in0, in1, in2, in3, \ | |
| 853 in4, in5, in6, in7, \ | |
| 854 out0, out1, out2, out3, \ | |
| 855 out4, out5, out6, out7) { \ | |
| 856 out0 = (in0) + (in7); \ | |
| 857 out1 = (in1) + (in6); \ | |
| 858 out2 = (in2) + (in5); \ | |
| 859 out3 = (in3) + (in4); \ | |
| 860 \ | |
| 861 out4 = (in3) - (in4); \ | |
| 862 out5 = (in2) - (in5); \ | |
| 863 out6 = (in1) - (in6); \ | |
| 864 out7 = (in0) - (in7); \ | |
| 865 } | |
| 866 #endif /* HAVE_MSA */ | |
| 867 #endif /* VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_ */ | 1338 #endif /* VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_ */ |
| OLD | NEW |