// Copyright 2016 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef CommonMacrosMSA_h
#define CommonMacrosMSA_h

#include <msa.h>
#include <stdint.h>

#if defined(__clang__)
#define CLANG_BUILD
#endif

#ifdef CLANG_BUILD
#define SRLI_H(a, b) __msa_srli_h((v8i16)a, b)
#define SLLI_H(a, b) __msa_slli_h((v8i16)a, b)
#define CEQI_H(a, b) __msa_ceqi_h((v8i16)a, b)
#else
#define SRLI_H(a, b) ((v8u16)a >> b)
#define SLLI_H(a, b) ((v8i16)a << b)
#define CEQI_H(a, b) (a == b)
#endif
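
/* Illustrative usage of the wrappers above (variable names are hypothetical):
 * clang is routed through the MSA builtins, while GCC's generic vector
 * extensions accept the plain operator forms.
 *
 *   v8u16 px = LD_UH(src);             // 'src' points at 8 halfwords
 *   v8u16 hi = (v8u16)SRLI_H(px, 8);   // each lane shifted right by 8
 *   v8i16 eq = CEQI_H((v8i16)px, 0);   // per-lane compare with immediate 0
 */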

#define LD_V(RTYPE, psrc) *((RTYPE*)(psrc))
#define LD_UB(...) LD_V(v16u8, __VA_ARGS__)
#define LD_UH(...) LD_V(v8u16, __VA_ARGS__)
#define LD_SP(...) LD_V(v4f32, __VA_ARGS__)
#define LD_DP(...) LD_V(v2f64, __VA_ARGS__)

#define ST_V(RTYPE, in, pdst) *((RTYPE*)(pdst)) = in
#define ST_UB(...) ST_V(v16u8, __VA_ARGS__)
#define ST_UH(...) ST_V(v8u16, __VA_ARGS__)
#define ST_SP(...) ST_V(v4f32, __VA_ARGS__)
#define ST_DP(...) ST_V(v2f64, __VA_ARGS__)
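
/* Illustrative round trip (buffer names are hypothetical): LD_V/ST_V simply
 * dereference the pointer as the requested vector type.
 *
 *   uint8_t src[16] = {0}, dst[16];
 *   v16u8 v = LD_UB(src); // load 16 bytes as one vector
 *   ST_UB(v, dst);        // store the 16 bytes back out
 */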

#ifdef CLANG_BUILD
#define COPY_DOUBLE_TO_VECTOR(a) ({ \
  v2f64 out; \
  out = (v2f64) __msa_fill_d(*(int64_t *)(&a)); \
  out; \
})
#else
#define COPY_DOUBLE_TO_VECTOR(a) ({ \
  v2f64 out; \
  out = __msa_cast_to_vector_double(a); \
  out = (v2f64) __msa_splati_d((v2i64) out, 0); \
  out; \
})
#endif
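
/* Illustrative usage (variable names are hypothetical): both variants
 * broadcast one double across the two v2f64 lanes; the clang path type-puns
 * through int64_t for __msa_fill_d, the GCC path casts and splats lane 0.
 *
 *   double d = 0.5;
 *   v2f64 vd = COPY_DOUBLE_TO_VECTOR(d); // vd = { 0.5, 0.5 }
 */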

#define MSA_STORE_FUNC(TYPE, INSTR, FUNCNAME) \
  static inline void FUNCNAME(TYPE val, void* const pdst) \
  { \
    uint8_t* const pdstm = (uint8_t*)pdst; \
    TYPE valm = val; \
    asm volatile( \
      " " #INSTR " %[valm], %[pdstm] \n\t" \
      : [pdstm] "=m" (*pdstm) \
      : [valm] "r" (valm)); \
  }

#define MSA_STORE(val, pdst, FUNCNAME) FUNCNAME(val, pdst)

#ifdef CLANG_BUILD
MSA_STORE_FUNC(uint32_t, sw, msa_sw);
#define SW(val, pdst) MSA_STORE(val, pdst, msa_sw)
#if (__mips == 64)
MSA_STORE_FUNC(uint64_t, sd, msa_sd);
#define SD(val, pdst) MSA_STORE(val, pdst, msa_sd)
#else
#define SD(val, pdst) \
  { \
    uint8_t* const pdstsd = (uint8_t*)(pdst); \
    const uint32_t val0m = (uint32_t)(val & 0x00000000FFFFFFFF); \
    const uint32_t val1m = (uint32_t)((val >> 32) & 0x00000000FFFFFFFF); \
    SW(val0m, pdstsd); \
    SW(val1m, pdstsd + 4); \
  }
#endif
#else
#if (__mips_isa_rev >= 6)
MSA_STORE_FUNC(uint32_t, sw, msa_sw);
#define SW(val, pdst) MSA_STORE(val, pdst, msa_sw)
MSA_STORE_FUNC(uint64_t, sd, msa_sd);
#define SD(val, pdst) MSA_STORE(val, pdst, msa_sd)
#else // !(__mips_isa_rev >= 6)
MSA_STORE_FUNC(uint32_t, usw, msa_usw);
#define SW(val, pdst) MSA_STORE(val, pdst, msa_usw)
#define SD(val, pdst) \
  { \
    uint8_t* const pdstsd = (uint8_t*)(pdst); \
    const uint32_t val0m = (uint32_t)(val & 0x00000000FFFFFFFF); \
    const uint32_t val1m = (uint32_t)((val >> 32) & 0x00000000FFFFFFFF); \
    SW(val0m, pdstsd); \
    SW(val1m, pdstsd + 4); \
  }
#endif // (__mips_isa_rev >= 6)
#endif
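
/* Illustrative usage (buffer name is hypothetical): SW/SD emit the named
 * store instruction through inline asm. Note the 32-bit SD fallback writes
 * the low word first, matching a little-endian 64-bit store.
 *
 *   uint8_t buf[8];
 *   SW(0x01020304u, buf);           // store one 32-bit word
 *   SD(0x0102030405060708ull, buf); // store one 64-bit doubleword
 */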

/* Description : Load vectors of elements with stride
 * Arguments   : Inputs  - psrc, stride
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Load elements in 'out0' from (psrc)
 *               Load elements in 'out1' from (psrc + stride)
 */
#define LD_V2(RTYPE, psrc, stride, out0, out1) \
  { \
    out0 = LD_V(RTYPE, psrc); \
    psrc += stride; \
    out1 = LD_V(RTYPE, psrc); \
    psrc += stride; \
  }
#define LD_UB2(...) LD_V2(v16u8, __VA_ARGS__)
#define LD_UH2(...) LD_V2(v8u16, __VA_ARGS__)
#define LD_SP2(...) LD_V2(v4f32, __VA_ARGS__)

#define LD_V3(RTYPE, psrc, stride, out0, out1, out2) \
  { \
    LD_V2(RTYPE, psrc, stride, out0, out1); \
    out2 = LD_V(RTYPE, psrc); \
    psrc += stride; \
  }
#define LD_UB3(...) LD_V3(v16u8, __VA_ARGS__)
#define LD_UH3(...) LD_V3(v8u16, __VA_ARGS__)

#define LD_V4(RTYPE, psrc, stride, out0, out1, out2, out3) \
  { \
    LD_V2(RTYPE, psrc, stride, out0, out1); \
    LD_V2(RTYPE, psrc, stride, out2, out3); \
  }
#define LD_UB4(...) LD_V4(v16u8, __VA_ARGS__)
#define LD_UH4(...) LD_V4(v8u16, __VA_ARGS__)
#define LD_SP4(...) LD_V4(v4f32, __VA_ARGS__)
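
/* Illustrative usage (names are hypothetical): load four 16-byte rows that
 * sit 'stride' bytes apart. The macros advance the pointer argument, so pass
 * a copy if the caller still needs the original.
 *
 *   const uint8_t* p = src; // copy; LD_UB4 leaves p at src + 4 * stride
 *   v16u8 r0, r1, r2, r3;
 *   LD_UB4(p, stride, r0, r1, r2, r3);
 */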

/* Description : Store vectors of elements with stride
 * Arguments   : Inputs - in0, in1, pdst, stride
 * Details     : Store elements from 'in0' to (pdst)
 *               Store elements from 'in1' to (pdst + stride)
 */
#define ST_V2(RTYPE, in0, in1, pdst, stride) \
  { \
    ST_V(RTYPE, in0, pdst); \
    pdst += stride; \
    ST_V(RTYPE, in1, pdst); \
    pdst += stride; \
  }
#define ST_UB2(...) ST_V2(v16u8, __VA_ARGS__)
#define ST_UH2(...) ST_V2(v8u16, __VA_ARGS__)
#define ST_SP2(...) ST_V2(v4f32, __VA_ARGS__)

#define ST_V3(RTYPE, in0, in1, in2, pdst, stride) \
  { \
    ST_V2(RTYPE, in0, in1, pdst, stride); \
    ST_V(RTYPE, in2, pdst); \
    pdst += stride; \
  }
#define ST_UB3(...) ST_V3(v16u8, __VA_ARGS__)
#define ST_UH3(...) ST_V3(v8u16, __VA_ARGS__)

#define ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride) \
  { \
    ST_V2(RTYPE, in0, in1, pdst, stride); \
    ST_V2(RTYPE, in2, in3, pdst, stride); \
  }
#define ST_UB4(...) ST_V4(v16u8, __VA_ARGS__)
#define ST_UH4(...) ST_V4(v8u16, __VA_ARGS__)
#define ST_SP4(...) ST_V4(v4f32, __VA_ARGS__)

#define ST_V6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride) \
  { \
    ST_V3(RTYPE, in0, in1, in2, pdst, stride); \
    ST_V3(RTYPE, in3, in4, in5, pdst, stride); \
  }
#define ST_UB6(...) ST_V6(v16u8, __VA_ARGS__)
#define ST_SP6(...) ST_V6(v4f32, __VA_ARGS__)

#define ST_V8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
  { \
    ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride); \
    ST_V4(RTYPE, in4, in5, in6, in7, pdst, stride); \
  }
#define ST_UB8(...) ST_V8(v16u8, __VA_ARGS__)
#define ST_SP8(...) ST_V8(v4f32, __VA_ARGS__)
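
/* Illustrative usage (names are hypothetical), mirroring the loads above:
 * store two rows 'stride' bytes apart. The destination pointer is advanced
 * as well, so pass a copy if the original is still needed.
 *
 *   uint8_t* q = dst; // copy; ST_UB2 leaves q at dst + 2 * stride
 *   ST_UB2(r0, r1, q, stride);
 */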

/* Description : Interleave even halfword elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even halfword elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'
 *               Even halfword elements of 'in2' and 'in3' are interleaved
 *               and written to 'out1'
 */
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
  { \
    out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \
    out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \
  }
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
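
/* Lane sketch for 'out0' (assuming the usual ILVEV operand order, with 8
 * halfword lanes per vector; 'out1' is the same with in2/in3):
 *   out0 = { in0[0], in1[0], in0[2], in1[2], in0[4], in1[4], in0[6], in1[6] }
 */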

/* Description : Interleave both left and right half of input vectors
 * Arguments   : Inputs  - in0, in1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Right half of byte elements from 'in0' and 'in1' are
 *               interleaved and written to 'out0'
 *               Left half of byte elements from 'in0' and 'in1' are
 *               interleaved and written to 'out1'
 */
#define ILVRL_B2(RTYPE, in0, in1, out0, out1) \
  { \
    out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
    out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
  }
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
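
/* Lane sketch (assuming the usual ILVR/ILVL operand order, with 16 byte
 * lanes per vector):
 *   out0 = { in1[0], in0[0], in1[1], in0[1], ..., in1[7],  in0[7]  }
 *   out1 = { in1[8], in0[8], in1[9], in0[9], ..., in1[15], in0[15] }
 */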

#endif // CommonMacrosMSA_h