third_party/libwebp/dsp/msa_macro.h - Issue 2651883004: libwebp-0.6.0-rc1

Side by Side Diff: third_party/libwebp/dsp/msa_macro.h

Issue 2651883004: libwebp-0.6.0-rc1 (Closed)

Patch Set: Created 3 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2016 Google Inc. All Rights Reserved.	1 // Copyright 2016 Google Inc. All Rights Reserved.

2 //	2 //

3 // Use of this source code is governed by a BSD-style license	3 // Use of this source code is governed by a BSD-style license

4 // that can be found in the COPYING file in the root of the source	4 // that can be found in the COPYING file in the root of the source

5 // tree. An additional intellectual property rights grant can be found	5 // tree. An additional intellectual property rights grant can be found

6 // in the file PATENTS. All contributing project authors may	6 // in the file PATENTS. All contributing project authors may

7 // be found in the AUTHORS file in the root of the source tree.	7 // be found in the AUTHORS file in the root of the source tree.

8 // -----------------------------------------------------------------------------	8 // -----------------------------------------------------------------------------

9 //	9 //

10 // MSA common macros	10 // MSA common macros

11 //	11 //

12 // Author(s): Prashant Patil (prashant.patil@imgtec.com)	12 // Author(s): Prashant Patil (prashant.patil@imgtec.com)

13	13

14 #ifndef WEBP_DSP_MSA_MACRO_H_	14 #ifndef WEBP_DSP_MSA_MACRO_H_

15 #define WEBP_DSP_MSA_MACRO_H_	15 #define WEBP_DSP_MSA_MACRO_H_

16	16

17 #include <stdint.h>	17 #include <stdint.h>

18 #include <msa.h>	18 #include <msa.h>

19	19

20 #if defined(__clang__)	20 #if defined(__clang__)

21 #define CLANG_BUILD	21 #define CLANG_BUILD

22 #endif	22 #endif

23	23

24 #ifdef CLANG_BUILD	24 #ifdef CLANG_BUILD

25 #define ADDVI_H(a, b) __msa_addvi_h((v8i16)a, b)	25 #define ADDVI_H(a, b) __msa_addvi_h((v8i16)a, b)

	26 #define ADDVI_W(a, b) __msa_addvi_w((v4i32)a, b)

	27 #define SRAI_B(a, b) __msa_srai_b((v16i8)a, b)

26 #define SRAI_H(a, b) __msa_srai_h((v8i16)a, b)	28 #define SRAI_H(a, b) __msa_srai_h((v8i16)a, b)

27 #define SRAI_W(a, b) __msa_srai_w((v4i32)a, b)	29 #define SRAI_W(a, b) __msa_srai_w((v4i32)a, b)

	30 #define SRLI_H(a, b) __msa_srli_h((v8i16)a, b)

	31 #define SLLI_B(a, b) __msa_slli_b((v4i32)a, b)

	32 #define ANDI_B(a, b) __msa_andi_b((v16u8)a, b)

	33 #define ORI_B(a, b) __msa_ori_b((v16u8)a, b)

28 #else	34 #else

29 #define ADDVI_H(a, b) (a + b)	35 #define ADDVI_H(a, b) (a + b)

	36 #define ADDVI_W(a, b) (a + b)

	37 #define SRAI_B(a, b) (a >> b)

30 #define SRAI_H(a, b) (a >> b)	38 #define SRAI_H(a, b) (a >> b)

31 #define SRAI_W(a, b) (a >> b)	39 #define SRAI_W(a, b) (a >> b)

	40 #define SRLI_H(a, b) (a << b)

	41 #define SLLI_B(a, b) (a << b)

	42 #define ANDI_B(a, b) (a & b)

	43 #define ORI_B(a, b) (a | b)

32 #endif	44 #endif

33	45

34 #define LD_B(RTYPE, psrc) ((RTYPE)(psrc))	46 #define LD_B(RTYPE, psrc) ((RTYPE)(psrc))

35 #define LD_UB(...) LD_B(v16u8, __VA_ARGS__)	47 #define LD_UB(...) LD_B(v16u8, __VA_ARGS__)

36 #define LD_SB(...) LD_B(v16i8, __VA_ARGS__)	48 #define LD_SB(...) LD_B(v16i8, __VA_ARGS__)

37	49

38 #define LD_H(RTYPE, psrc) ((RTYPE)(psrc))	50 #define LD_H(RTYPE, psrc) ((RTYPE)(psrc))

39 #define LD_UH(...) LD_H(v8u16, __VA_ARGS__)	51 #define LD_UH(...) LD_H(v8u16, __VA_ARGS__)

40 #define LD_SH(...) LD_H(v8i16, __VA_ARGS__)	52 #define LD_SH(...) LD_H(v8i16, __VA_ARGS__)

41	53

(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
109 #define LD(psrc) MSA_LOAD(psrc, msa_uld)	121 #define LD(psrc) MSA_LOAD(psrc, msa_uld)

110 #else // !(__mips == 64)	122 #else // !(__mips == 64)

111 #define LD(psrc) ((((uint64_t)MSA_LOAD(psrc + 4, msa_ulw)) << 32) | \	123 #define LD(psrc) ((((uint64_t)MSA_LOAD(psrc + 4, msa_ulw)) << 32) | \

112 MSA_LOAD(psrc, msa_ulw))	124 MSA_LOAD(psrc, msa_ulw))

113 #endif // (__mips == 64)	125 #endif // (__mips == 64)

114	126

115 MSA_STORE_FUNC(uint16_t, ush, msa_ush);	127 MSA_STORE_FUNC(uint16_t, ush, msa_ush);

116 #define SH(val, pdst) MSA_STORE(val, pdst, msa_ush)	128 #define SH(val, pdst) MSA_STORE(val, pdst, msa_ush)

117 MSA_STORE_FUNC(uint32_t, usw, msa_usw);	129 MSA_STORE_FUNC(uint32_t, usw, msa_usw);

118 #define SW(val, pdst) MSA_STORE(val, pdst, msa_usw)	130 #define SW(val, pdst) MSA_STORE(val, pdst, msa_usw)

119 #define SD(val, pdst) { \	131 #define SD(val, pdst) do { \

120 uint8_t* const pdst_sd_m = (uint8_t*)(pdst); \	132 uint8_t* const pdst_sd_m = (uint8_t*)(pdst); \

121 const uint32_t val0_m = (uint32_t)(val & 0x00000000FFFFFFFF); \	133 const uint32_t val0_m = (uint32_t)(val & 0x00000000FFFFFFFF); \

122 const uint32_t val1_m = (uint32_t)((val >> 32) & 0x00000000FFFFFFFF); \	134 const uint32_t val1_m = (uint32_t)((val >> 32) & 0x00000000FFFFFFFF); \

123 SW(val0_m, pdst_sd_m); \	135 SW(val0_m, pdst_sd_m); \

124 SW(val1_m, pdst_sd_m + 4); \	136 SW(val1_m, pdst_sd_m + 4); \

125 }	137 } while (0)

126 #endif // (__mips_isa_rev >= 6)	138 #endif // (__mips_isa_rev >= 6)

127	139

128 /* Description : Load 4 words with stride	140 /* Description : Load 4 words with stride

129 * Arguments : Inputs - psrc, stride	141 * Arguments : Inputs - psrc, stride

130 * Outputs - out0, out1, out2, out3	142 * Outputs - out0, out1, out2, out3

131 * Details : Load word in 'out0' from (psrc)	143 * Details : Load word in 'out0' from (psrc)

132 * Load word in 'out1' from (psrc + stride)	144 * Load word in 'out1' from (psrc + stride)

133 * Load word in 'out2' from (psrc + 2 * stride)	145 * Load word in 'out2' from (psrc + 2 * stride)

134 * Load word in 'out3' from (psrc + 3 * stride)	146 * Load word in 'out3' from (psrc + 3 * stride)

135 */	147 */

136 #define LW4(psrc, stride, out0, out1, out2, out3) { \	148 #define LW4(psrc, stride, out0, out1, out2, out3) do { \

137 const uint8_t* ptmp = (const uint8_t*)psrc; \	149 const uint8_t* ptmp = (const uint8_t*)psrc; \

138 out0 = LW(ptmp); \	150 out0 = LW(ptmp); \

139 ptmp += stride; \	151 ptmp += stride; \

140 out1 = LW(ptmp); \	152 out1 = LW(ptmp); \

141 ptmp += stride; \	153 ptmp += stride; \

142 out2 = LW(ptmp); \	154 out2 = LW(ptmp); \

143 ptmp += stride; \	155 ptmp += stride; \

144 out3 = LW(ptmp); \	156 out3 = LW(ptmp); \

145 }	157 } while (0)

146	158

147 /* Description : Store 4 words with stride	159 /* Description : Store words with stride

148 * Arguments : Inputs - in0, in1, in2, in3, pdst, stride	160 * Arguments : Inputs - in0, in1, in2, in3, pdst, stride

149 * Details : Store word from 'in0' to (pdst)	161 * Details : Store word from 'in0' to (pdst)

150 * Store word from 'in1' to (pdst + stride)	162 * Store word from 'in1' to (pdst + stride)

151 * Store word from 'in2' to (pdst + 2 * stride)	163 * Store word from 'in2' to (pdst + 2 * stride)

152 * Store word from 'in3' to (pdst + 3 * stride)	164 * Store word from 'in3' to (pdst + 3 * stride)

153 */	165 */

154 #define SW4(in0, in1, in2, in3, pdst, stride) { \	166 #define SW4(in0, in1, in2, in3, pdst, stride) do { \

155 uint8_t* ptmp = (uint8_t*)pdst; \	167 uint8_t* ptmp = (uint8_t*)pdst; \

156 SW(in0, ptmp); \	168 SW(in0, ptmp); \

157 ptmp += stride; \	169 ptmp += stride; \

158 SW(in1, ptmp); \	170 SW(in1, ptmp); \

159 ptmp += stride; \	171 ptmp += stride; \

160 SW(in2, ptmp); \	172 SW(in2, ptmp); \

161 ptmp += stride; \	173 ptmp += stride; \

162 SW(in3, ptmp); \	174 SW(in3, ptmp); \

163 }	175 } while (0)

	176

	177 #define SW3(in0, in1, in2, pdst, stride) do { \

	178 uint8_t* ptmp = (uint8_t*)pdst; \

	179 SW(in0, ptmp); \

	180 ptmp += stride; \

	181 SW(in1, ptmp); \

	182 ptmp += stride; \

	183 SW(in2, ptmp); \

	184 } while (0)

	185

	186 #define SW2(in0, in1, pdst, stride) do { \

	187 uint8_t* ptmp = (uint8_t*)pdst; \

	188 SW(in0, ptmp); \

	189 ptmp += stride; \

	190 SW(in1, ptmp); \

	191 } while (0)

	192

	193 /* Description : Store 4 double words with stride

	194 * Arguments : Inputs - in0, in1, in2, in3, pdst, stride

	195 * Details : Store double word from 'in0' to (pdst)

	196 * Store double word from 'in1' to (pdst + stride)

	197 * Store double word from 'in2' to (pdst + 2 * stride)

	198 * Store double word from 'in3' to (pdst + 3 * stride)

	199 */

	200 #define SD4(in0, in1, in2, in3, pdst, stride) do { \

	201 uint8_t* ptmp = (uint8_t*)pdst; \

	202 SD(in0, ptmp); \

	203 ptmp += stride; \

	204 SD(in1, ptmp); \

	205 ptmp += stride; \

	206 SD(in2, ptmp); \

	207 ptmp += stride; \

	208 SD(in3, ptmp); \

	209 } while (0)

164	210

165 /* Description : Load vectors with 16 byte elements with stride	211 /* Description : Load vectors with 16 byte elements with stride

166 * Arguments : Inputs - psrc, stride	212 * Arguments : Inputs - psrc, stride

167 * Outputs - out0, out1	213 * Outputs - out0, out1

168 * Return Type - as per RTYPE	214 * Return Type - as per RTYPE

169 * Details : Load 16 byte elements in 'out0' from (psrc)	215 * Details : Load 16 byte elements in 'out0' from (psrc)

170 * Load 16 byte elements in 'out1' from (psrc + stride)	216 * Load 16 byte elements in 'out1' from (psrc + stride)

171 */	217 */

172 #define LD_B2(RTYPE, psrc, stride, out0, out1) { \	218 #define LD_B2(RTYPE, psrc, stride, out0, out1) do { \

173 out0 = LD_B(RTYPE, psrc); \	219 out0 = LD_B(RTYPE, psrc); \

174 out1 = LD_B(RTYPE, psrc + stride); \	220 out1 = LD_B(RTYPE, psrc + stride); \

175 }	221 } while (0)

176 #define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)	222 #define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)

177 #define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)	223 #define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)

178	224

179 #define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) { \	225 #define LD_B3(RTYPE, psrc, stride, out0, out1, out2) do { \

180 LD_B2(RTYPE, psrc, stride, out0, out1); \	226 LD_B2(RTYPE, psrc, stride, out0, out1); \

181 LD_B2(RTYPE, psrc + 2 * stride , stride, out2, out3); \	227 out2 = LD_B(RTYPE, psrc + 2 * stride); \

182 }	228 } while (0)

	229 #define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)

	230 #define LD_SB3(...) LD_B3(v16i8, __VA_ARGS__)

	231

	232 #define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) do { \

	233 LD_B2(RTYPE, psrc, stride, out0, out1); \

	234 LD_B2(RTYPE, psrc + 2 * stride , stride, out2, out3); \

	235 } while (0)

183 #define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)	236 #define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)

184 #define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)	237 #define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)

185	238

	239 #define LD_B8(RTYPE, psrc, stride, \

	240 out0, out1, out2, out3, out4, out5, out6, out7) do { \

	241 LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3); \

	242 LD_B4(RTYPE, psrc + 4 * stride, stride, out4, out5, out6, out7); \

	243 } while (0)

	244 #define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)

	245 #define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)

	246

186 /* Description : Load vectors with 8 halfword elements with stride	247 /* Description : Load vectors with 8 halfword elements with stride

187 * Arguments : Inputs - psrc, stride	248 * Arguments : Inputs - psrc, stride

188 * Outputs - out0, out1	249 * Outputs - out0, out1

189 * Details : Load 8 halfword elements in 'out0' from (psrc)	250 * Details : Load 8 halfword elements in 'out0' from (psrc)

190 * Load 8 halfword elements in 'out1' from (psrc + stride)	251 * Load 8 halfword elements in 'out1' from (psrc + stride)

191 */	252 */

192 #define LD_H2(RTYPE, psrc, stride, out0, out1) { \	253 #define LD_H2(RTYPE, psrc, stride, out0, out1) do { \

193 out0 = LD_H(RTYPE, psrc); \	254 out0 = LD_H(RTYPE, psrc); \

194 out1 = LD_H(RTYPE, psrc + stride); \	255 out1 = LD_H(RTYPE, psrc + stride); \

195 }	256 } while (0)

196 #define LD_UH2(...) LD_H2(v8u16, __VA_ARGS__)	257 #define LD_UH2(...) LD_H2(v8u16, __VA_ARGS__)

197 #define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)	258 #define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)

198	259

	260 /* Description : Load vectors with 4 word elements with stride

	261 * Arguments : Inputs - psrc, stride

	262 * Outputs - out0, out1, out2, out3

	263 * Details : Load 4 word elements in 'out0' from (psrc + 0 * stride)

	264 * Load 4 word elements in 'out1' from (psrc + 1 * stride)

	265 * Load 4 word elements in 'out2' from (psrc + 2 * stride)

	266 * Load 4 word elements in 'out3' from (psrc + 3 * stride)

	267 */

	268 #define LD_W2(RTYPE, psrc, stride, out0, out1) do { \

	269 out0 = LD_W(RTYPE, psrc); \

	270 out1 = LD_W(RTYPE, psrc + stride); \

	271 } while (0)

	272 #define LD_UW2(...) LD_W2(v4u32, __VA_ARGS__)

	273 #define LD_SW2(...) LD_W2(v4i32, __VA_ARGS__)

	274

	275 #define LD_W3(RTYPE, psrc, stride, out0, out1, out2) do { \

	276 LD_W2(RTYPE, psrc, stride, out0, out1); \

	277 out2 = LD_W(RTYPE, psrc + 2 * stride); \

	278 } while (0)

	279 #define LD_UW3(...) LD_W3(v4u32, __VA_ARGS__)

	280 #define LD_SW3(...) LD_W3(v4i32, __VA_ARGS__)

	281

	282 #define LD_W4(RTYPE, psrc, stride, out0, out1, out2, out3) do { \

	283 LD_W2(RTYPE, psrc, stride, out0, out1); \

	284 LD_W2(RTYPE, psrc + 2 * stride, stride, out2, out3); \

	285 } while (0)

	286 #define LD_UW4(...) LD_W4(v4u32, __VA_ARGS__)

	287 #define LD_SW4(...) LD_W4(v4i32, __VA_ARGS__)

	288

	289 /* Description : Store vectors of 16 byte elements with stride

	290 * Arguments : Inputs - in0, in1, pdst, stride

	291 * Details : Store 16 byte elements from 'in0' to (pdst)

	292 * Store 16 byte elements from 'in1' to (pdst + stride)

	293 */

	294 #define ST_B2(RTYPE, in0, in1, pdst, stride) do { \

	295 ST_B(RTYPE, in0, pdst); \

	296 ST_B(RTYPE, in1, pdst + stride); \

	297 } while (0)

	298 #define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)

	299 #define ST_SB2(...) ST_B2(v16i8, __VA_ARGS__)

	300

	301 #define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) do { \

	302 ST_B2(RTYPE, in0, in1, pdst, stride); \

	303 ST_B2(RTYPE, in2, in3, pdst + 2 * stride, stride); \

	304 } while (0)

	305 #define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)

	306 #define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)

	307

	308 #define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \

	309 pdst, stride) do { \

	310 ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \

	311 ST_B4(RTYPE, in4, in5, in6, in7, pdst + 4 * stride, stride); \

	312 } while (0)

	313 #define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)

	314

	315 /* Description : Store vectors of 4 word elements with stride

	316 * Arguments : Inputs - in0, in1, in2, in3, pdst, stride

	317 * Details : Store 4 word elements from 'in0' to (pdst + 0 * stride)

	318 * Store 4 word elements from 'in1' to (pdst + 1 * stride)

	319 * Store 4 word elements from 'in2' to (pdst + 2 * stride)

	320 * Store 4 word elements from 'in3' to (pdst + 3 * stride)

	321 */

	322 #define ST_W2(RTYPE, in0, in1, pdst, stride) do { \

	323 ST_W(RTYPE, in0, pdst); \

	324 ST_W(RTYPE, in1, pdst + stride); \

	325 } while (0)

	326 #define ST_UW2(...) ST_W2(v4u32, __VA_ARGS__)

	327 #define ST_SW2(...) ST_W2(v4i32, __VA_ARGS__)

	328

	329 #define ST_W3(RTYPE, in0, in1, in2, pdst, stride) do { \

	330 ST_W2(RTYPE, in0, in1, pdst, stride); \

	331 ST_W(RTYPE, in2, pdst + 2 * stride); \

	332 } while (0)

	333 #define ST_UW3(...) ST_W3(v4u32, __VA_ARGS__)

	334 #define ST_SW3(...) ST_W3(v4i32, __VA_ARGS__)

	335

	336 #define ST_W4(RTYPE, in0, in1, in2, in3, pdst, stride) do { \

	337 ST_W2(RTYPE, in0, in1, pdst, stride); \

	338 ST_W2(RTYPE, in2, in3, pdst + 2 * stride, stride); \

	339 } while (0)

	340 #define ST_UW4(...) ST_W4(v4u32, __VA_ARGS__)

	341 #define ST_SW4(...) ST_W4(v4i32, __VA_ARGS__)

	342

	343 /* Description : Store vectors of 8 halfword elements with stride

	344 * Arguments : Inputs - in0, in1, pdst, stride

	345 * Details : Store 8 halfword elements from 'in0' to (pdst)

	346 * Store 8 halfword elements from 'in1' to (pdst + stride)

	347 */

	348 #define ST_H2(RTYPE, in0, in1, pdst, stride) do { \

	349 ST_H(RTYPE, in0, pdst); \

	350 ST_H(RTYPE, in1, pdst + stride); \

	351 } while (0)

	352 #define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__)

	353 #define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)

	354

	355 /* Description : Store 2x4 byte block to destination memory from input vector

	356 * Arguments : Inputs - in, stidx, pdst, stride

	357 * Details : Index 'stidx' halfword element from 'in' vector is copied to

	358 * the GP register and stored to (pdst)

	359 * Index 'stidx+1' halfword element from 'in' vector is copied to

	360 * the GP register and stored to (pdst + stride)

	361 * Index 'stidx+2' halfword element from 'in' vector is copied to

	362 * the GP register and stored to (pdst + 2 * stride)

	363 * Index 'stidx+3' halfword element from 'in' vector is copied to

	364 * the GP register and stored to (pdst + 3 * stride)

	365 */

	366 #define ST2x4_UB(in, stidx, pdst, stride) do { \

	367 uint8_t* pblk_2x4_m = (uint8_t*)pdst; \

	368 const uint16_t out0_m = __msa_copy_s_h((v8i16)in, stidx); \

	369 const uint16_t out1_m = __msa_copy_s_h((v8i16)in, stidx + 1); \

	370 const uint16_t out2_m = __msa_copy_s_h((v8i16)in, stidx + 2); \

	371 const uint16_t out3_m = __msa_copy_s_h((v8i16)in, stidx + 3); \

	372 SH(out0_m, pblk_2x4_m); \

	373 pblk_2x4_m += stride; \

	374 SH(out1_m, pblk_2x4_m); \

	375 pblk_2x4_m += stride; \

	376 SH(out2_m, pblk_2x4_m); \

	377 pblk_2x4_m += stride; \

	378 SH(out3_m, pblk_2x4_m); \

	379 } while (0)

	380

199 /* Description : Store 4x4 byte block to destination memory from input vector	381 /* Description : Store 4x4 byte block to destination memory from input vector

200 * Arguments : Inputs - in0, in1, pdst, stride	382 * Arguments : Inputs - in0, in1, pdst, stride

201 * Details : 'Idx0' word element from input vector 'in0' is copied to the	383 * Details : 'Idx0' word element from input vector 'in0' is copied to the

202 * GP register and stored to (pdst)	384 * GP register and stored to (pdst)

203 * 'Idx1' word element from input vector 'in0' is copied to the	385 * 'Idx1' word element from input vector 'in0' is copied to the

204 * GP register and stored to (pdst + stride)	386 * GP register and stored to (pdst + stride)

205 * 'Idx2' word element from input vector 'in0' is copied to the	387 * 'Idx2' word element from input vector 'in0' is copied to the

206 * GP register and stored to (pdst + 2 * stride)	388 * GP register and stored to (pdst + 2 * stride)

207 * 'Idx3' word element from input vector 'in0' is copied to the	389 * 'Idx3' word element from input vector 'in0' is copied to the

208 * GP register and stored to (pdst + 3 * stride)	390 * GP register and stored to (pdst + 3 * stride)

209 */	391 */

210 #define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) { \	392 #define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) do { \

211 uint8_t* const pblk_4x4_m = (uint8_t*)pdst; \	393 uint8_t* const pblk_4x4_m = (uint8_t*)pdst; \

212 const uint32_t out0_m = __msa_copy_s_w((v4i32)in0, idx0); \	394 const uint32_t out0_m = __msa_copy_s_w((v4i32)in0, idx0); \

213 const uint32_t out1_m = __msa_copy_s_w((v4i32)in0, idx1); \	395 const uint32_t out1_m = __msa_copy_s_w((v4i32)in0, idx1); \

214 const uint32_t out2_m = __msa_copy_s_w((v4i32)in1, idx2); \	396 const uint32_t out2_m = __msa_copy_s_w((v4i32)in1, idx2); \

215 const uint32_t out3_m = __msa_copy_s_w((v4i32)in1, idx3); \	397 const uint32_t out3_m = __msa_copy_s_w((v4i32)in1, idx3); \

216 SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \	398 SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \

217 }	399 } while (0)

	400

	401 #define ST4x8_UB(in0, in1, pdst, stride) do { \

	402 uint8_t* const pblk_4x8 = (uint8_t*)pdst; \

	403 ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \

	404 ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \

	405 } while (0)

218	406

219 /* Description : Immediate number of elements to slide	407 /* Description : Immediate number of elements to slide

220 * Arguments : Inputs - in0, in1, slide_val	408 * Arguments : Inputs - in0, in1, slide_val

221 * Outputs - out	409 * Outputs - out

222 * Return Type - as per RTYPE	410 * Return Type - as per RTYPE

223 * Details : Byte elements from 'in1' vector are slid into 'in0' by	411 * Details : Byte elements from 'in1' vector are slid into 'in0' by

224 * value specified in the 'slide_val'	412 * value specified in the 'slide_val'

225 */	413 */

226 #define SLDI_B(RTYPE, in0, in1, slide_val) \	414 #define SLDI_B(RTYPE, in0, in1, slide_val) \

227 (RTYPE)__msa_sldi_b((v16i8)in0, (v16i8)in1, slide_val) \	415 (RTYPE)__msa_sldi_b((v16i8)in0, (v16i8)in1, slide_val) \

228	416

229 #define SLDI_UB(...) SLDI_B(v16u8, __VA_ARGS__)	417 #define SLDI_UB(...) SLDI_B(v16u8, __VA_ARGS__)

230 #define SLDI_SB(...) SLDI_B(v16i8, __VA_ARGS__)	418 #define SLDI_SB(...) SLDI_B(v16i8, __VA_ARGS__)

231 #define SLDI_SH(...) SLDI_B(v8i16, __VA_ARGS__)	419 #define SLDI_SH(...) SLDI_B(v8i16, __VA_ARGS__)

232	420

	421 /* Description : Shuffle byte vector elements as per mask vector

	422 * Arguments : Inputs - in0, in1, in2, in3, mask0, mask1

	423 * Outputs - out0, out1

	424 * Return Type - as per RTYPE

	425 * Details : Byte elements from 'in0' & 'in1' are copied selectively to

	426 * 'out0' as per control vector 'mask0'

	427 */

	428 #define VSHF_B(RTYPE, in0, in1, mask) \

	429 (RTYPE)__msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0)

	430

	431 #define VSHF_UB(...) VSHF_B(v16u8, __VA_ARGS__)

	432 #define VSHF_SB(...) VSHF_B(v16i8, __VA_ARGS__)

	433 #define VSHF_UH(...) VSHF_B(v8u16, __VA_ARGS__)

	434 #define VSHF_SH(...) VSHF_B(v8i16, __VA_ARGS__)

	435

	436 #define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) do { \

	437 out0 = VSHF_B(RTYPE, in0, in1, mask0); \

	438 out1 = VSHF_B(RTYPE, in2, in3, mask1); \

	439 } while (0)

	440 #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)

	441 #define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)

	442 #define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)

	443 #define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)

	444

233 /* Description : Shuffle halfword vector elements as per mask vector	445 /* Description : Shuffle halfword vector elements as per mask vector

234 * Arguments : Inputs - in0, in1, in2, in3, mask0, mask1	446 * Arguments : Inputs - in0, in1, in2, in3, mask0, mask1

235 * Outputs - out0, out1	447 * Outputs - out0, out1

236 * Return Type - as per RTYPE	448 * Return Type - as per RTYPE

237 * Details : halfword elements from 'in0' & 'in1' are copied selectively to	449 * Details : halfword elements from 'in0' & 'in1' are copied selectively to

238 * 'out0' as per control vector 'mask0'	450 * 'out0' as per control vector 'mask0'

239 */	451 */

240 #define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) { \	452 #define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) do { \

241 out0 = (RTYPE)__msa_vshf_h((v8i16)mask0, (v8i16)in1, (v8i16)in0); \	453 out0 = (RTYPE)__msa_vshf_h((v8i16)mask0, (v8i16)in1, (v8i16)in0); \

242 out1 = (RTYPE)__msa_vshf_h((v8i16)mask1, (v8i16)in3, (v8i16)in2); \	454 out1 = (RTYPE)__msa_vshf_h((v8i16)mask1, (v8i16)in3, (v8i16)in2); \

243 }	455 } while (0)

244 #define VSHF_H2_UH(...) VSHF_H2(v8u16, __VA_ARGS__)	456 #define VSHF_H2_UH(...) VSHF_H2(v8u16, __VA_ARGS__)

245 #define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)	457 #define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)

246	458

	459 /* Description : Dot product of byte vector elements

	460 * Arguments : Inputs - mult0, mult1, cnst0, cnst1

	461 * Outputs - out0, out1

	462 * Return Type - as per RTYPE

	463 * Details : Signed byte elements from 'mult0' are multiplied with

	464 * signed byte elements from 'cnst0' producing a result

	465 * twice the size of input i.e. signed halfword.

	466 * The multiplication result of adjacent odd-even elements

	467 * are added together and written to the 'out0' vector

	468 */

	469 #define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) do { \

	470 out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \

	471 out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \

	472 } while (0)

	473 #define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)

	474

	475 /* Description : Dot product of halfword vector elements

	476 * Arguments : Inputs - mult0, mult1, cnst0, cnst1

	477 * Outputs - out0, out1

	478 * Return Type - as per RTYPE

	479 * Details : Signed halfword elements from 'mult0' are multiplied with

	480 * signed halfword elements from 'cnst0' producing a result

	481 * twice the size of input i.e. signed word.

	482 * The multiplication result of adjacent odd-even elements

	483 * are added together and written to the 'out0' vector

	484 */

	485 #define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) do { \

	486 out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \

	487 out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \

	488 } while (0)

	489 #define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)

	490

	491 /* Description : Dot product of unsigned word vector elements

	492 * Arguments : Inputs - mult0, mult1, cnst0, cnst1

	493 * Outputs - out0, out1

	494 * Return Type - as per RTYPE

	495 * Details : Unsigned word elements from 'mult0' are multiplied with

	496 * unsigned word elements from 'cnst0' producing a result

	497 * twice the size of input i.e. unsigned double word.

	498 * The multiplication result of adjacent odd-even elements

	499 * are added together and written to the 'out0' vector

	500 */

	501 #define DOTP_UW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) do { \

	502 out0 = (RTYPE)__msa_dotp_u_d((v4u32)mult0, (v4u32)cnst0); \

	503 out1 = (RTYPE)__msa_dotp_u_d((v4u32)mult1, (v4u32)cnst1); \

	504 } while (0)

	505 #define DOTP_UW2_UD(...) DOTP_UW2(v2u64, __VA_ARGS__)

	506

	507 /* Description : Dot product & addition of halfword vector elements

	508 * Arguments : Inputs - mult0, mult1, cnst0, cnst1

	509 * Outputs - out0, out1

	510 * Return Type - as per RTYPE

	511 * Details : Signed halfword elements from 'mult0' are multiplied with

	512 * signed halfword elements from 'cnst0' producing a result

	513 * twice the size of input i.e. signed word.

	514 * The multiplication result of adjacent odd-even elements

	515 * are added to the 'out0' vector

	516 */

	517 #define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) do { \

	518 out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \

	519 out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \

	520 } while (0)

	521 #define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)

	522

247 /* Description : Clips all signed halfword elements of input vector	523 /* Description : Clips all signed halfword elements of input vector

248 * between 0 & 255	524 * between 0 & 255

249 * Arguments : Input/output - val	525 * Arguments : Input/output - val

250 * Return Type - signed halfword	526 * Return Type - signed halfword

251 */	527 */

252 #define CLIP_SH_0_255(val) { \	528 #define CLIP_SH_0_255(val) do { \

253 const v8i16 max_m = __msa_ldi_h(255); \	529 const v8i16 max_m = __msa_ldi_h(255); \

254 val = __msa_maxi_s_h((v8i16)val, 0); \	530 val = __msa_maxi_s_h((v8i16)val, 0); \

255 val = __msa_min_s_h(max_m, (v8i16)val); \	531 val = __msa_min_s_h(max_m, (v8i16)val); \

256 }	532 } while (0)

257 #define CLIP_SH2_0_255(in0, in1) { \	533

258 CLIP_SH_0_255(in0); \	534 #define CLIP_SH2_0_255(in0, in1) do { \

259 CLIP_SH_0_255(in1); \	535 CLIP_SH_0_255(in0); \

260 }	536 CLIP_SH_0_255(in1); \

	537 } while (0)

	538

	539 #define CLIP_SH4_0_255(in0, in1, in2, in3) do { \

	540 CLIP_SH2_0_255(in0, in1); \

	541 CLIP_SH2_0_255(in2, in3); \

	542 } while (0)

	543

	544 /* Description : Clips all unsigned halfword elements of input vector

	545 * between 0 & 255

	546 * Arguments : Input - in

	547 * Output - in (clipped in place)

	548 * Return Type - unsigned halfword

	549 */

	550 #define CLIP_UH_0_255(in) do { \

	551 const v8u16 max_m = (v8u16)__msa_ldi_h(255); \

	552 in = __msa_maxi_u_h((v8u16) in, 0); \

	553 in = __msa_min_u_h((v8u16) max_m, (v8u16) in); \

	554 } while (0)

	555

	556 #define CLIP_UH2_0_255(in0, in1) do { \

	557 CLIP_UH_0_255(in0); \

	558 CLIP_UH_0_255(in1); \

	559 } while (0)

261	560

262 /* Description : Clips all signed word elements of input vector	561 /* Description : Clips all signed word elements of input vector

263 * between 0 & 255	562 * between 0 & 255

264 * Arguments : Input/output - val	563 * Arguments : Input/output - val

265 * Return Type - signed word	564 * Return Type - signed word

266 */	565 */

267 #define CLIP_SW_0_255(val) { \	566 #define CLIP_SW_0_255(val) do { \

268 const v4i32 max_m = __msa_ldi_w(255); \	567 const v4i32 max_m = __msa_ldi_w(255); \

269 val = __msa_maxi_s_w((v4i32)val, 0); \	568 val = __msa_maxi_s_w((v4i32)val, 0); \

270 val = __msa_min_s_w(max_m, (v4i32)val); \	569 val = __msa_min_s_w(max_m, (v4i32)val); \

	570 } while (0)

	571

	572 #define CLIP_SW4_0_255(in0, in1, in2, in3) do { \

	573 CLIP_SW_0_255(in0); \

	574 CLIP_SW_0_255(in1); \

	575 CLIP_SW_0_255(in2); \

	576 CLIP_SW_0_255(in3); \

	577 } while (0)

	578

	579 /* Description : Horizontal addition of 4 signed word elements of input vector

	580 * Arguments : Input - in (signed word vector)

	581 * Output - sum_m (i32 sum)

	582 * Return Type - signed word (GP)

	583 * Details : 4 signed word elements of 'in' vector are added together and

	584 * the resulting integer sum is returned

	585 */

	586 static WEBP_INLINE int32_t func_hadd_sw_s32(v4i32 in) {

	587 const v2i64 res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in);

	588 const v2i64 res1_m = __msa_splati_d(res0_m, 1);

	589 const v2i64 out = res0_m + res1_m;

	590 int32_t sum_m = __msa_copy_s_w((v4i32)out, 0);

	591 return sum_m;

271 }	592 }

272 #define CLIP_SW4_0_255(in0, in1, in2, in3) { \	593 #define HADD_SW_S32(in) func_hadd_sw_s32(in)

273 CLIP_SW_0_255(in0); \	594

274 CLIP_SW_0_255(in1); \	595 /* Description : Horizontal addition of 8 signed halfword elements

275 CLIP_SW_0_255(in2); \	596 * Arguments : Input - in (signed halfword vector)

276 CLIP_SW_0_255(in3); \	597 * Output - sum_m (s32 sum)

	598 * Return Type - signed word

	599 * Details : 8 signed halfword elements of input vector are added

	600 * together and the resulting integer sum is returned

	601 */

	602 static WEBP_INLINE int32_t func_hadd_sh_s32(v8i16 in) {

	603 const v4i32 res = __msa_hadd_s_w(in, in);

	604 const v2i64 res0 = __msa_hadd_s_d(res, res);

	605 const v2i64 res1 = __msa_splati_d(res0, 1);

	606 const v2i64 res2 = res0 + res1;

	607 const int32_t sum_m = __msa_copy_s_w((v4i32)res2, 0);

	608 return sum_m;

277 }	609 }

	610 #define HADD_SH_S32(in) func_hadd_sh_s32(in)

	611

	612 /* Description : Horizontal addition of 8 unsigned halfword elements

	613 * Arguments : Input - in (unsigned halfword vector)

	614 * Output - sum_m (u32 sum)

	615 * Return Type - unsigned word

	616 * Details : 8 unsigned halfword elements of input vector are added

	617 * together and the resulting integer sum is returned

	618 */

	619 static WEBP_INLINE uint32_t func_hadd_uh_u32(v8u16 in) {

	620 uint32_t sum_m;

	621 const v4u32 res_m = __msa_hadd_u_w(in, in);

	622 v2u64 res0_m = __msa_hadd_u_d(res_m, res_m);

	623 v2u64 res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1);

	624 res0_m = res0_m + res1_m;

	625 sum_m = __msa_copy_s_w((v4i32)res0_m, 0);

	626 return sum_m;

	627 }

	628 #define HADD_UH_U32(in) func_hadd_uh_u32(in)

	629

	630 /* Description : Horizontal addition of signed half word vector elements

	631 Arguments : Inputs - in0, in1

	632 Outputs - out0, out1

	633 Return Type - as per RTYPE

	634 Details : Each signed odd half word element from 'in0' is added to

	635 even signed half word element from 'in0' (pairwise) and the

	636 word result is written in 'out0'

	637 */

	638 #define HADD_SH2(RTYPE, in0, in1, out0, out1) do { \

	639 out0 = (RTYPE)__msa_hadd_s_w((v8i16)in0, (v8i16)in0); \

	640 out1 = (RTYPE)__msa_hadd_s_w((v8i16)in1, (v8i16)in1); \

	641 } while (0)

	642 #define HADD_SH2_SW(...) HADD_SH2(v4i32, __VA_ARGS__)

	643

	644 #define HADD_SH4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) do { \

	645 HADD_SH2(RTYPE, in0, in1, out0, out1); \

	646 HADD_SH2(RTYPE, in2, in3, out2, out3); \

	647 } while (0)

	648 #define HADD_SH4_SW(...) HADD_SH4(v4i32, __VA_ARGS__)

	649

	650 /* Description : Horizontal subtraction of unsigned byte vector elements

	651 * Arguments : Inputs - in0, in1

	652 * Outputs - out0, out1

	653 * Return Type - as per RTYPE

	654 * Details : Each unsigned even byte element from 'in0' is subtracted from

	655 * odd unsigned byte element from 'in0' (pairwise) and the

	656 * halfword result is written to 'out0'

	657 */

	658 #define HSUB_UB2(RTYPE, in0, in1, out0, out1) do { \

	659 out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \

	660 out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \

	661 } while (0)

	662 #define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__)

	663 #define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)

	664 #define HSUB_UB2_SW(...) HSUB_UB2(v4i32, __VA_ARGS__)

278	665

279 /* Description : Set element n of input vector to GPR value	666 /* Description : Set element n of input vector to GPR value

280 * Arguments : Inputs - in0, in1, in2, in3	667 * Arguments : Inputs - in0, in1, in2, in3

281 * Output - out	668 * Output - out

282 * Return Type - as per RTYPE	669 * Return Type - as per RTYPE

283 * Details : Set element 0 in vector 'out' to value specified in 'in0'	670 * Details : Set element 0 in vector 'out' to value specified in 'in0'

284 */	671 */

285 #define INSERT_W2(RTYPE, in0, in1, out) { \	672 #define INSERT_W2(RTYPE, in0, in1, out) do { \

286 out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \	673 out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \

287 out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \	674 out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \

288 }	675 } while (0)

289 #define INSERT_W2_UB(...) INSERT_W2(v16u8, __VA_ARGS__)	676 #define INSERT_W2_UB(...) INSERT_W2(v16u8, __VA_ARGS__)

290 #define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)	677 #define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)

291	678

292 #define INSERT_W4(RTYPE, in0, in1, in2, in3, out) { \	679 #define INSERT_W4(RTYPE, in0, in1, in2, in3, out) do { \

293 out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \	680 out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \

294 out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \	681 out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \

295 out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \	682 out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \

296 out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \	683 out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \

297 }	684 } while (0)

298 #define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)	685 #define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)

299 #define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)	686 #define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)

300 #define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__)	687 #define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__)

301	688

	689 /* Description : Set element n of double word input vector to GPR value

	690 * Arguments : Inputs - in0, in1

	691 * Output - out

	692 * Return Type - as per RTYPE

	693 * Details : Set element 0 in vector 'out' to GPR value specified in 'in0'

	694 * Set element 1 in vector 'out' to GPR value specified in 'in1'

	695 */

	696 #define INSERT_D2(RTYPE, in0, in1, out) do { \

	697 out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \

	698 out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \

	699 } while (0)

	700 #define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)

	701 #define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)

	702

	703 /* Description : Interleave even byte elements from vectors

	704 * Arguments : Inputs - in0, in1, in2, in3

	705 * Outputs - out0, out1

	706 * Return Type - as per RTYPE

	707 * Details : Even byte elements of 'in0' and 'in1' are interleaved

	708 * and written to 'out0'

	709 */

	710 #define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) do { \

	711 out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \

	712 out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \

	713 } while (0)

	714 #define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)

	715 #define ILVEV_B2_SB(...) ILVEV_B2(v16i8, __VA_ARGS__)

	716 #define ILVEV_B2_UH(...) ILVEV_B2(v8u16, __VA_ARGS__)

	717 #define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)

	718 #define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)

	719

	720 /* Description : Interleave odd byte elements from vectors

	721 * Arguments : Inputs - in0, in1, in2, in3

	722 * Outputs - out0, out1

	723 * Return Type - as per RTYPE

	724 * Details : Odd byte elements of 'in0' and 'in1' are interleaved

	725 * and written to 'out0'

	726 */

	727 #define ILVOD_B2(RTYPE, in0, in1, in2, in3, out0, out1) do { \

	728 out0 = (RTYPE)__msa_ilvod_b((v16i8)in1, (v16i8)in0); \

	729 out1 = (RTYPE)__msa_ilvod_b((v16i8)in3, (v16i8)in2); \

	730 } while (0)

	731 #define ILVOD_B2_UB(...) ILVOD_B2(v16u8, __VA_ARGS__)

	732 #define ILVOD_B2_SB(...) ILVOD_B2(v16i8, __VA_ARGS__)

	733 #define ILVOD_B2_UH(...) ILVOD_B2(v8u16, __VA_ARGS__)

	734 #define ILVOD_B2_SH(...) ILVOD_B2(v8i16, __VA_ARGS__)

	735 #define ILVOD_B2_SD(...) ILVOD_B2(v2i64, __VA_ARGS__)

	736

	737 /* Description : Interleave even halfword elements from vectors

	738 * Arguments : Inputs - in0, in1, in2, in3

	739 * Outputs - out0, out1

	740 * Return Type - as per RTYPE

	741 * Details : Even halfword elements of 'in0' and 'in1' are interleaved

	742 * and written to 'out0'

	743 */

	744 #define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) do { \

	745 out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \

	746 out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \

	747 } while (0)

	748 #define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)

	749 #define ILVEV_H2_UH(...) ILVEV_H2(v8u16, __VA_ARGS__)

	750 #define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)

	751 #define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)

	752

	753 /* Description : Interleave odd halfword elements from vectors

	754 * Arguments : Inputs - in0, in1, in2, in3

	755 * Outputs - out0, out1

	756 * Return Type - as per RTYPE

	757 * Details : Odd halfword elements of 'in0' and 'in1' are interleaved

	758 * and written to 'out0'

	759 */

	760 #define ILVOD_H2(RTYPE, in0, in1, in2, in3, out0, out1) do { \

	761 out0 = (RTYPE)__msa_ilvod_h((v8i16)in1, (v8i16)in0); \

	762 out1 = (RTYPE)__msa_ilvod_h((v8i16)in3, (v8i16)in2); \

	763 } while (0)

	764 #define ILVOD_H2_UB(...) ILVOD_H2(v16u8, __VA_ARGS__)

	765 #define ILVOD_H2_UH(...) ILVOD_H2(v8u16, __VA_ARGS__)

	766 #define ILVOD_H2_SH(...) ILVOD_H2(v8i16, __VA_ARGS__)

	767 #define ILVOD_H2_SW(...) ILVOD_H2(v4i32, __VA_ARGS__)

	768

	769 /* Description : Interleave even word elements from vectors

	770 * Arguments : Inputs - in0, in1, in2, in3

	771 * Outputs - out0, out1

	772 * Return Type - as per RTYPE

	773 * Details : Even word elements of 'in0' and 'in1' are interleaved

	774 * and written to 'out0'

	775 */

	776 #define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) do { \

	777 out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \

	778 out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \

	779 } while (0)

	780 #define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__)

	781 #define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)

	782 #define ILVEV_W2_UH(...) ILVEV_W2(v8u16, __VA_ARGS__)

	783 #define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)

	784

	785 /* Description : Interleave even-odd word elements from vectors

	786 * Arguments : Inputs - in0, in1, in2, in3

	787 * Outputs - out0, out1

	788 * Return Type - as per RTYPE

	789 * Details : Even word elements of 'in0' and 'in1' are interleaved

	790 * and written to 'out0'

	791 * Odd word elements of 'in2' and 'in3' are interleaved

	792 * and written to 'out1'

	793 */

	794 #define ILVEVOD_W2(RTYPE, in0, in1, in2, in3, out0, out1) do { \

	795 out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \

	796 out1 = (RTYPE)__msa_ilvod_w((v4i32)in3, (v4i32)in2); \

	797 } while (0)

	798 #define ILVEVOD_W2_UB(...) ILVEVOD_W2(v16u8, __VA_ARGS__)

	799 #define ILVEVOD_W2_UH(...) ILVEVOD_W2(v8u16, __VA_ARGS__)

	800 #define ILVEVOD_W2_SH(...) ILVEVOD_W2(v8i16, __VA_ARGS__)

	801 #define ILVEVOD_W2_SW(...) ILVEVOD_W2(v4i32, __VA_ARGS__)

	802

	803 /* Description : Interleave even-odd half-word elements from vectors

	804 * Arguments : Inputs - in0, in1, in2, in3

	805 * Outputs - out0, out1

	806 * Return Type - as per RTYPE

	807 * Details : Even half-word elements of 'in0' and 'in1' are interleaved

	808 * and written to 'out0'

	809 * Odd half-word elements of 'in2' and 'in3' are interleaved

	810 * and written to 'out1'

	811 */

	812 #define ILVEVOD_H2(RTYPE, in0, in1, in2, in3, out0, out1) do { \

	813 out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \

	814 out1 = (RTYPE)__msa_ilvod_h((v8i16)in3, (v8i16)in2); \

	815 } while (0)

	816 #define ILVEVOD_H2_UB(...) ILVEVOD_H2(v16u8, __VA_ARGS__)

	817 #define ILVEVOD_H2_UH(...) ILVEVOD_H2(v8u16, __VA_ARGS__)

	818 #define ILVEVOD_H2_SH(...) ILVEVOD_H2(v8i16, __VA_ARGS__)

	819 #define ILVEVOD_H2_SW(...) ILVEVOD_H2(v4i32, __VA_ARGS__)

	820

	821 /* Description : Interleave even double word elements from vectors

	822 * Arguments : Inputs - in0, in1, in2, in3

	823 * Outputs - out0, out1

	824 * Return Type - as per RTYPE

	825 * Details : Even double word elements of 'in0' and 'in1' are interleaved

	826 * and written to 'out0'

	827 */

	828 #define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) do { \

	829 out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \

	830 out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \

	831 } while (0)

	832 #define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)

	833 #define ILVEV_D2_SB(...) ILVEV_D2(v16i8, __VA_ARGS__)

	834 #define ILVEV_D2_SW(...) ILVEV_D2(v4i32, __VA_ARGS__)

	835 #define ILVEV_D2_SD(...) ILVEV_D2(v2i64, __VA_ARGS__)

	836

	837 /* Description : Interleave left half of byte elements from vectors

	838 * Arguments : Inputs - in0, in1, in2, in3

	839 * Outputs - out0, out1

	840 * Return Type - as per RTYPE

	841 * Details : Left half of byte elements of 'in0' and 'in1' are interleaved

	842 * and written to 'out0'.

	843 */

	844 #define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) do { \

	845 out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \

	846 out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \

	847 } while (0)

	848 #define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)

	849 #define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)

	850 #define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)

	851 #define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)

	852 #define ILVL_B2_SW(...) ILVL_B2(v4i32, __VA_ARGS__)

	853

302 /* Description : Interleave right half of byte elements from vectors	854 /* Description : Interleave right half of byte elements from vectors

303 * Arguments : Inputs - in0, in1, in2, in3	855 * Arguments : Inputs - in0, in1, in2, in3

304 * Outputs - out0, out1	856 * Outputs - out0, out1

305 * Return Type - as per RTYPE	857 * Return Type - as per RTYPE

306 * Details : Right half of byte elements of 'in0' and 'in1' are interleaved	858 * Details : Right half of byte elements of 'in0' and 'in1' are interleaved

307 * and written to out0.	859 * and written to out0.

308 */	860 */

309 #define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \	861 #define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) do { \

310 out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \	862 out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \

311 out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \	863 out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \

312 }	864 } while (0)

313 #define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)	865 #define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)

314 #define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)	866 #define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)

315 #define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)	867 #define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)

316 #define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)	868 #define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)

317 #define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)	869 #define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)

318	870

319 #define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \	871 #define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \

320 out0, out1, out2, out3) { \	872 out0, out1, out2, out3) do { \

321 ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \	873 ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \

322 ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \	874 ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \

323 }	875 } while (0)

324 #define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)	876 #define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)

325 #define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)	877 #define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)

326 #define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)	878 #define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)

327 #define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)	879 #define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)

328 #define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__)	880 #define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__)

329	881

330 /* Description : Interleave right half of halfword elements from vectors	882 /* Description : Interleave right half of halfword elements from vectors

331 * Arguments : Inputs - in0, in1, in2, in3	883 * Arguments : Inputs - in0, in1, in2, in3

332 * Outputs - out0, out1	884 * Outputs - out0, out1

333 * Return Type - as per RTYPE	885 * Return Type - as per RTYPE

334 * Details : Right half of halfword elements of 'in0' and 'in1' are	886 * Details : Right half of halfword elements of 'in0' and 'in1' are

335 * interleaved and written to 'out0'.	887 * interleaved and written to 'out0'.

336 */	888 */

337 #define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \	889 #define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) do { \

338 out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \	890 out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \

339 out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \	891 out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \

340 }	892 } while (0)

341 #define ILVR_H2_UB(...) ILVR_H2(v16u8, __VA_ARGS__)	893 #define ILVR_H2_UB(...) ILVR_H2(v16u8, __VA_ARGS__)

342 #define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)	894 #define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)

343 #define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)	895 #define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)

344	896

345 #define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \	897 #define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \

346 out0, out1, out2, out3) { \	898 out0, out1, out2, out3) do { \

347 ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \	899 ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \

348 ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \	900 ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \

349 }	901 } while (0)

350 #define ILVR_H4_UB(...) ILVR_H4(v16u8, __VA_ARGS__)	902 #define ILVR_H4_UB(...) ILVR_H4(v16u8, __VA_ARGS__)

351 #define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)	903 #define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)

352 #define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__)	904 #define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__)

353	905

354 /* Description : Interleave right half of double word elements from vectors	906 /* Description : Interleave right half of double word elements from vectors

355 * Arguments : Inputs - in0, in1, in2, in3	907 * Arguments : Inputs - in0, in1, in2, in3

356 * Outputs - out0, out1	908 * Outputs - out0, out1

357 * Return Type - as per RTYPE	909 * Return Type - as per RTYPE

358 * Details : Right half of double word elements of 'in0' and 'in1' are	910 * Details : Right half of double word elements of 'in0' and 'in1' are

359 * interleaved and written to 'out0'.	911 * interleaved and written to 'out0'.

360 */	912 */

361 #define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \	913 #define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) do { \

362 out0 = (RTYPE)__msa_ilvr_d((v2i64)in0, (v2i64)in1); \	914 out0 = (RTYPE)__msa_ilvr_d((v2i64)in0, (v2i64)in1); \

363 out1 = (RTYPE)__msa_ilvr_d((v2i64)in2, (v2i64)in3); \	915 out1 = (RTYPE)__msa_ilvr_d((v2i64)in2, (v2i64)in3); \

364 }	916 } while (0)

365 #define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)	917 #define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)

366 #define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)	918 #define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)

367 #define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)	919 #define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)

368	920

369 #define ILVRL_H2(RTYPE, in0, in1, out0, out1) { \	921 #define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \

	922 out0, out1, out2, out3) do { \

	923 ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \

	924 ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \

	925 } while (0)

	926 #define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)

	927 #define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)

	928

	929 /* Description : Interleave both left and right half of input vectors

	930 * Arguments : Inputs - in0, in1

	931 * Outputs - out0, out1

	932 * Return Type - as per RTYPE

	933 * Details : Right half of byte elements from 'in0' and 'in1' are

	934 * interleaved and written to 'out0'

	935 */

	936 #define ILVRL_B2(RTYPE, in0, in1, out0, out1) do { \

	937 out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \

	938 out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \

	939 } while (0)

	940 #define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)

	941 #define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)

	942 #define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)

	943 #define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)

	944 #define ILVRL_B2_SW(...) ILVRL_B2(v4i32, __VA_ARGS__)

	945

	946 #define ILVRL_H2(RTYPE, in0, in1, out0, out1) do { \

370 out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \	947 out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \

371 out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \	948 out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \

372 }	949 } while (0)

373 #define ILVRL_H2_UB(...) ILVRL_H2(v16u8, __VA_ARGS__)	950 #define ILVRL_H2_UB(...) ILVRL_H2(v16u8, __VA_ARGS__)

374 #define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)	951 #define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)

375 #define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)	952 #define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)

376 #define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)	953 #define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)

377 #define ILVRL_H2_UW(...) ILVRL_H2(v4u32, __VA_ARGS__)	954 #define ILVRL_H2_UW(...) ILVRL_H2(v4u32, __VA_ARGS__)

378	955

379 #define ILVRL_W2(RTYPE, in0, in1, out0, out1) { \	956 #define ILVRL_W2(RTYPE, in0, in1, out0, out1) do { \

380 out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \	957 out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \

381 out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \	958 out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \

382 }	959 } while (0)

383 #define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)	960 #define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)

384 #define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)	961 #define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)

385 #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)	962 #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)

	963 #define ILVRL_W2_UW(...) ILVRL_W2(v4u32, __VA_ARGS__)

386	964

387 /* Description : Pack even byte elements of vector pairs	965 /* Description : Pack even byte elements of vector pairs

388 * Arguments : Inputs - in0, in1, in2, in3	966 * Arguments : Inputs - in0, in1, in2, in3

389 * Outputs - out0, out1	967 * Outputs - out0, out1

390 * Return Type - as per RTYPE	968 * Return Type - as per RTYPE

391 * Details : Even byte elements of 'in0' are copied to the left half of	969 * Details : Even byte elements of 'in0' are copied to the left half of

392 * 'out0' & even byte elements of 'in1' are copied to the right	970 * 'out0' & even byte elements of 'in1' are copied to the right

393 * half of 'out0'.	971 * half of 'out0'.

394 */	972 */

395 #define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \	973 #define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) do { \

396 out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \	974 out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \

397 out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \	975 out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \

398 }	976 } while (0)

399 #define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)	977 #define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)

400 #define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)	978 #define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)

401 #define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)	979 #define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)

402 #define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)	980 #define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)

403	981

	982 #define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \

	983 out0, out1, out2, out3) do { \

	984 PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \

	985 PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \

	986 } while (0)

	987 #define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)

	988 #define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)

	989 #define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)

	990 #define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)

	991

	992 /* Description : Pack even halfword elements of vector pairs

	993 * Arguments : Inputs - in0, in1, in2, in3

	994 * Outputs - out0, out1

	995 * Return Type - as per RTYPE

	996 * Details : Even halfword elements of 'in0' are copied to the left half of

	997 * 'out0' & even halfword elements of 'in1' are copied to the

	998 * right half of 'out0'.

	999 */

	1000 #define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) do { \

	1001 out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \

	1002 out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \

	1003 } while (0)

	1004 #define PCKEV_H2_UH(...) PCKEV_H2(v8u16, __VA_ARGS__)

	1005 #define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)

	1006 #define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)

	1007 #define PCKEV_H2_UW(...) PCKEV_H2(v4u32, __VA_ARGS__)

	1008

	1009 /* Description : Pack even word elements of vector pairs

	1010 * Arguments : Inputs - in0, in1, in2, in3

	1011 * Outputs - out0, out1

	1012 * Return Type - as per RTYPE

	1013 * Details : Even word elements of 'in0' are copied to the left half of

	1014 * 'out0' & even word elements of 'in1' are copied to the

	1015 * right half of 'out0'.

	1016 */

	1017 #define PCKEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) do { \

	1018 out0 = (RTYPE)__msa_pckev_w((v4i32)in0, (v4i32)in1); \

	1019 out1 = (RTYPE)__msa_pckev_w((v4i32)in2, (v4i32)in3); \

	1020 } while (0)

	1021 #define PCKEV_W2_UH(...) PCKEV_W2(v8u16, __VA_ARGS__)

	1022 #define PCKEV_W2_SH(...) PCKEV_W2(v8i16, __VA_ARGS__)

	1023 #define PCKEV_W2_SW(...) PCKEV_W2(v4i32, __VA_ARGS__)

	1024 #define PCKEV_W2_UW(...) PCKEV_W2(v4u32, __VA_ARGS__)

	1025

	1026 /* Description : Pack odd halfword elements of vector pairs

	1027 * Arguments : Inputs - in0, in1, in2, in3

	1028 * Outputs - out0, out1

	1029 * Return Type - as per RTYPE

	1030 * Details : Odd halfword elements of 'in0' are copied to the left half of

	1031 * 'out0' & odd halfword elements of 'in1' are copied to the

	1032 * right half of 'out0'.

	1033 */

	1034 #define PCKOD_H2(RTYPE, in0, in1, in2, in3, out0, out1) do { \

	1035 out0 = (RTYPE)__msa_pckod_h((v8i16)in0, (v8i16)in1); \

	1036 out1 = (RTYPE)__msa_pckod_h((v8i16)in2, (v8i16)in3); \

	1037 } while (0)

	1038 #define PCKOD_H2_UH(...) PCKOD_H2(v8u16, __VA_ARGS__)

	1039 #define PCKOD_H2_SH(...) PCKOD_H2(v8i16, __VA_ARGS__)

	1040 #define PCKOD_H2_SW(...) PCKOD_H2(v4i32, __VA_ARGS__)

	1041 #define PCKOD_H2_UW(...) PCKOD_H2(v4u32, __VA_ARGS__)

	1042

404 /* Description : Arithmetic immediate shift right all elements of word vector	1043 /* Description : Arithmetic immediate shift right all elements of word vector

405 * Arguments : Inputs - in0, in1, shift	1044 * Arguments : Inputs - in0, in1, shift

406 * Outputs - in place operation	1045 * Outputs - in place operation

407 * Return Type - as per input vector RTYPE	1046 * Return Type - as per input vector RTYPE

408 * Details : Each element of vector 'in0' is right shifted by 'shift' and	1047 * Details : Each element of vector 'in0' is right shifted by 'shift' and

409 * the result is written in-place. 'shift' is a GP variable.	1048 * the result is written in-place. 'shift' is a GP variable.

410 */	1049 */

411 #define SRAI_W2(RTYPE, in0, in1, shift_val) { \	1050 #define SRAI_W2(RTYPE, in0, in1, shift_val) do { \

412 in0 = (RTYPE)SRAI_W(in0, shift_val); \	1051 in0 = (RTYPE)SRAI_W(in0, shift_val); \

413 in1 = (RTYPE)SRAI_W(in1, shift_val); \	1052 in1 = (RTYPE)SRAI_W(in1, shift_val); \

414 }	1053 } while (0)

415 #define SRAI_W2_SW(...) SRAI_W2(v4i32, __VA_ARGS__)	1054 #define SRAI_W2_SW(...) SRAI_W2(v4i32, __VA_ARGS__)

416 #define SRAI_W2_UW(...) SRAI_W2(v4u32, __VA_ARGS__)	1055 #define SRAI_W2_UW(...) SRAI_W2(v4u32, __VA_ARGS__)

417	1056

418 #define SRAI_W4(RTYPE, in0, in1, in2, in3, shift_val) { \	1057 #define SRAI_W4(RTYPE, in0, in1, in2, in3, shift_val) do { \

419 SRAI_W2(RTYPE, in0, in1, shift_val); \	1058 SRAI_W2(RTYPE, in0, in1, shift_val); \

420 SRAI_W2(RTYPE, in2, in3, shift_val); \	1059 SRAI_W2(RTYPE, in2, in3, shift_val); \

421 }	1060 } while (0)

422 #define SRAI_W4_SW(...) SRAI_W4(v4i32, __VA_ARGS__)	1061 #define SRAI_W4_SW(...) SRAI_W4(v4i32, __VA_ARGS__)

423 #define SRAI_W4_UW(...) SRAI_W4(v4u32, __VA_ARGS__)	1062 #define SRAI_W4_UW(...) SRAI_W4(v4u32, __VA_ARGS__)

424	1063

425 /* Description : Arithmetic shift right all elements of half-word vector	1064 /* Description : Arithmetic shift right all elements of half-word vector

426 * Arguments : Inputs - in0, in1, shift	1065 * Arguments : Inputs - in0, in1, shift

427 * Outputs - in place operation	1066 * Outputs - in place operation

428 * Return Type - as per input vector RTYPE	1067 * Return Type - as per input vector RTYPE

429 * Details : Each element of vector 'in0' is right shifted by 'shift' and	1068 * Details : Each element of vector 'in0' is right shifted by 'shift' and

430 * the result is written in-place. 'shift' is a GP variable.	1069 * the result is written in-place. 'shift' is a GP variable.

431 */	1070 */

432 #define SRAI_H2(RTYPE, in0, in1, shift_val) { \	1071 #define SRAI_H2(RTYPE, in0, in1, shift_val) do { \

433 in0 = (RTYPE)SRAI_H(in0, shift_val); \	1072 in0 = (RTYPE)SRAI_H(in0, shift_val); \

434 in1 = (RTYPE)SRAI_H(in1, shift_val); \	1073 in1 = (RTYPE)SRAI_H(in1, shift_val); \

435 }	1074 } while (0)

436 #define SRAI_H2_SH(...) SRAI_H2(v8i16, __VA_ARGS__)	1075 #define SRAI_H2_SH(...) SRAI_H2(v8i16, __VA_ARGS__)

437 #define SRAI_H2_UH(...) SRAI_H2(v8u16, __VA_ARGS__)	1076 #define SRAI_H2_UH(...) SRAI_H2(v8u16, __VA_ARGS__)

438	1077

439 /* Description : Arithmetic rounded shift right all elements of word vector	1078 /* Description : Arithmetic rounded shift right all elements of word vector

440 * Arguments : Inputs - in0, in1, shift	1079 * Arguments : Inputs - in0, in1, shift

441 * Outputs - in place operation	1080 * Outputs - in place operation

442 * Return Type - as per input vector RTYPE	1081 * Return Type - as per input vector RTYPE

443 * Details : Each element of vector 'in0' is right shifted by 'shift' and	1082 * Details : Each element of vector 'in0' is right shifted by 'shift' and

444 * the result is written in-place. 'shift' is a GP variable.	1083 * the result is written in-place. 'shift' is a GP variable.

445 */	1084 */

446 #define SRARI_W2(RTYPE, in0, in1, shift) { \	1085 #define SRARI_W2(RTYPE, in0, in1, shift) do { \

447 in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \	1086 in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \

448 in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \	1087 in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \

449 }	1088 } while (0)

450 #define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)	1089 #define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)

451	1090

452 #define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) { \	1091 #define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) do { \

453 SRARI_W2(RTYPE, in0, in1, shift); \	1092 SRARI_W2(RTYPE, in0, in1, shift); \

454 SRARI_W2(RTYPE, in2, in3, shift); \	1093 SRARI_W2(RTYPE, in2, in3, shift); \

455 }	1094 } while (0)

456 #define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)	1095 #define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)

457 #define SRARI_W4_UW(...) SRARI_W4(v4u32, __VA_ARGS__)	1096 #define SRARI_W4_UW(...) SRARI_W4(v4u32, __VA_ARGS__)

458 #define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)	1097 #define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)

459	1098

	1099 /* Description : Shift right arithmetic rounded double words

	1100 * Arguments : Inputs - in0, in1, shift

	1101 * Outputs - in place operation

	1102 * Return Type - as per RTYPE

	1103 * Details : Each element of vector 'in0' is shifted right arithmetically by

	1104 * the number of bits in the corresponding element in the vector

	1105 * 'shift'. The last discarded bit is added to shifted value for

	1106 * rounding and the result is written in-place.

	1107 * 'shift' is a vector.

	1108 */

	1109 #define SRAR_D2(RTYPE, in0, in1, shift) do { \

	1110 in0 = (RTYPE)__msa_srar_d((v2i64)in0, (v2i64)shift); \

	1111 in1 = (RTYPE)__msa_srar_d((v2i64)in1, (v2i64)shift); \

	1112 } while (0)

	1113 #define SRAR_D2_SW(...) SRAR_D2(v4i32, __VA_ARGS__)

	1114 #define SRAR_D2_SD(...) SRAR_D2(v2i64, __VA_ARGS__)

	1115 #define SRAR_D2_UD(...) SRAR_D2(v2u64, __VA_ARGS__)

	1116

	1117 #define SRAR_D4(RTYPE, in0, in1, in2, in3, shift) do { \

	1118 SRAR_D2(RTYPE, in0, in1, shift); \

	1119 SRAR_D2(RTYPE, in2, in3, shift); \

	1120 } while (0)

	1121 #define SRAR_D4_SD(...) SRAR_D4(v2i64, __VA_ARGS__)

	1122 #define SRAR_D4_UD(...) SRAR_D4(v2u64, __VA_ARGS__)

	1123

460 /* Description : Addition of 2 pairs of half-word vectors	1124 /* Description : Addition of 2 pairs of half-word vectors

461 * Arguments : Inputs - in0, in1, in2, in3	1125 * Arguments : Inputs - in0, in1, in2, in3

462 * Outputs - out0, out1	1126 * Outputs - out0, out1

463 * Details : Each element in 'in0' is added to 'in1' and result is written	1127 * Details : Each element in 'in0' is added to 'in1' and result is written

464 * to 'out0'.	1128 * to 'out0'.

465 */	1129 */

466 #define ADDVI_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \	1130 #define ADDVI_H2(RTYPE, in0, in1, in2, in3, out0, out1) do { \

467 out0 = (RTYPE)ADDVI_H(in0, in1); \	1131 out0 = (RTYPE)ADDVI_H(in0, in1); \

468 out1 = (RTYPE)ADDVI_H(in2, in3); \	1132 out1 = (RTYPE)ADDVI_H(in2, in3); \

469 }	1133 } while (0)

470 #define ADDVI_H2_SH(...) ADDVI_H2(v8i16, __VA_ARGS__)	1134 #define ADDVI_H2_SH(...) ADDVI_H2(v8i16, __VA_ARGS__)

471 #define ADDVI_H2_UH(...) ADDVI_H2(v8u16, __VA_ARGS__)	1135 #define ADDVI_H2_UH(...) ADDVI_H2(v8u16, __VA_ARGS__)

472	1136

	1137 /* Description : Addition of 2 pairs of word vectors

	1138 * Arguments : Inputs - in0, in1, in2, in3

	1139 * Outputs - out0, out1

	1140 * Details : Each element in 'in0' is added to 'in1' and result is written

	1141 * to 'out0'.

	1142 */

	1143 #define ADDVI_W2(RTYPE, in0, in1, in2, in3, out0, out1) do { \

	1144 out0 = (RTYPE)ADDVI_W(in0, in1); \

	1145 out1 = (RTYPE)ADDVI_W(in2, in3); \

	1146 } while (0)

	1147 #define ADDVI_W2_SW(...) ADDVI_W2(v4i32, __VA_ARGS__)

	1148

	1149 /* Description : Fill 2 word vectors with GP registers

	1150 * Arguments : Inputs - in0, in1

	1151 * Outputs - out0, out1

	1152 * Details : GP register in0 is replicated in each word element of out0

	1153 * GP register in1 is replicated in each word element of out1

	1154 */

	1155 #define FILL_W2(RTYPE, in0, in1, out0, out1) do { \

	1156 out0 = (RTYPE)__msa_fill_w(in0); \

	1157 out1 = (RTYPE)__msa_fill_w(in1); \

	1158 } while (0)

	1159 #define FILL_W2_SW(...) FILL_W2(v4i32, __VA_ARGS__)

	1160

473 /* Description : Addition of 2 pairs of vectors	1161 /* Description : Addition of 2 pairs of vectors

474 * Arguments : Inputs - in0, in1, in2, in3	1162 * Arguments : Inputs - in0, in1, in2, in3

475 * Outputs - out0, out1	1163 * Outputs - out0, out1

476 * Details : Each element in 'in0' is added to 'in1' and result is written	1164 * Details : Each element in 'in0' is added to 'in1' and result is written

477 * to 'out0'.	1165 * to 'out0'.

478 */	1166 */

479 #define ADD2(in0, in1, in2, in3, out0, out1) { \	1167 #define ADD2(in0, in1, in2, in3, out0, out1) do { \

480 out0 = in0 + in1; \	1168 out0 = in0 + in1; \

481 out1 = in2 + in3; \	1169 out1 = in2 + in3; \

482 }	1170 } while (0)

	1171

483 #define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, \	1172 #define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, \

484 out0, out1, out2, out3) { \	1173 out0, out1, out2, out3) do { \

485 ADD2(in0, in1, in2, in3, out0, out1); \	1174 ADD2(in0, in1, in2, in3, out0, out1); \

486 ADD2(in4, in5, in6, in7, out2, out3); \	1175 ADD2(in4, in5, in6, in7, out2, out3); \

487 }	1176 } while (0)

	1177

	1178 /* Description : Subtraction of 2 pairs of vectors

	1179 * Arguments : Inputs - in0, in1, in2, in3

	1180 * Outputs - out0, out1

	1181 * Details : Each element in 'in1' is subtracted from 'in0' and result is

	1182 * written to 'out0'.

	1183 */

	1184 #define SUB2(in0, in1, in2, in3, out0, out1) do { \

	1185 out0 = in0 - in1; \

	1186 out1 = in2 - in3; \

	1187 } while (0)

	1188

	1189 #define SUB3(in0, in1, in2, in3, in4, in5, out0, out1, out2) do { \

	1190 out0 = in0 - in1; \

	1191 out1 = in2 - in3; \

	1192 out2 = in4 - in5; \

	1193 } while (0)

	1194

	1195 #define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, \

	1196 out0, out1, out2, out3) do { \

	1197 out0 = in0 - in1; \

	1198 out1 = in2 - in3; \

	1199 out2 = in4 - in5; \

	1200 out3 = in6 - in7; \

	1201 } while (0)

	1202

	1203 /* Description : Addition - Subtraction of input vectors

	1204 * Arguments : Inputs - in0, in1

	1205 * Outputs - out0, out1

	1206 * Details : Each element in 'in1' is added to 'in0' and result is

	1207 * written to 'out0'.

	1208 * Each element in 'in1' is subtracted from 'in0' and result is

	1209 * written to 'out1'.

	1210 */

	1211 #define ADDSUB2(in0, in1, out0, out1) do { \

	1212 out0 = in0 + in1; \

	1213 out1 = in0 - in1; \

	1214 } while (0)

	1215

	1216 /* Description : Multiplication of pairs of vectors

	1217 * Arguments : Inputs - in0, in1, in2, in3

	1218 * Outputs - out0, out1

	1219 * Details : Each element from 'in0' is multiplied with elements from 'in1'

	1220 * and the result is written to 'out0'

	1221 */

	1222 #define MUL2(in0, in1, in2, in3, out0, out1) do { \

	1223 out0 = in0 * in1; \

	1224 out1 = in2 * in3; \

	1225 } while (0)

	1226

	1227 #define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, \

	1228 out0, out1, out2, out3) do { \

	1229 MUL2(in0, in1, in2, in3, out0, out1); \

	1230 MUL2(in4, in5, in6, in7, out2, out3); \

	1231 } while (0)

	1232

	1233 /* Description : Sign extend halfword elements from right half of the vector

	1234 * Arguments : Input - in (halfword vector)

	1235 * Output - out (sign extended word vector)

	1236 * Return Type - signed word

	1237 * Details : Sign bit of halfword elements from input vector 'in' is

	1238 * extracted and interleaved with same vector 'in' to generate

	1239 * 4 word elements keeping sign intact

	1240 */

	1241 #define UNPCK_R_SH_SW(in, out) do { \

	1242 const v8i16 sign_m = __msa_clti_s_h((v8i16)in, 0); \

	1243 out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \

	1244 } while (0)

488	1245

489 /* Description : Sign extend halfword elements from input vector and return	1246 /* Description : Sign extend halfword elements from input vector and return

490 * the result in pair of vectors	1247 * the result in pair of vectors

491 * Arguments : Input - in (halfword vector)	1248 * Arguments : Input - in (halfword vector)

492 * Outputs - out0, out1 (sign extended word vectors)	1249 * Outputs - out0, out1 (sign extended word vectors)

493 * Return Type - signed word	1250 * Return Type - signed word

494 * Details : Sign bit of halfword elements from input vector 'in' is	1251 * Details : Sign bit of halfword elements from input vector 'in' is

495 * extracted and interleaved right with same vector 'in' to	1252 * extracted and interleaved right with same vector 'in' to

496 * generate 4 signed word elements in 'out0'	1253 * generate 4 signed word elements in 'out0'

497 * Then interleaved left with same vector 'in' to	1254 * Then interleaved left with same vector 'in' to

498 * generate 4 signed word elements in 'out1'	1255 * generate 4 signed word elements in 'out1'

499 */	1256 */

500 #define UNPCK_SH_SW(in, out0, out1) { \	1257 #define UNPCK_SH_SW(in, out0, out1) do { \

501 const v8i16 tmp_m = __msa_clti_s_h((v8i16)in, 0); \	1258 const v8i16 tmp_m = __msa_clti_s_h((v8i16)in, 0); \

502 ILVRL_H2_SW(tmp_m, in, out0, out1); \	1259 ILVRL_H2_SW(tmp_m, in, out0, out1); \

503 }	1260 } while (0)

504	1261

505 /* Description : Butterfly of 4 input vectors	1262 /* Description : Butterfly of 4 input vectors

506 * Arguments : Inputs - in0, in1, in2, in3	1263 * Arguments : Inputs - in0, in1, in2, in3

507 * Outputs - out0, out1, out2, out3	1264 * Outputs - out0, out1, out2, out3

508 * Details : Butterfly operation	1265 * Details : Butterfly operation

509 */	1266 */

510 #define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) { \	1267 #define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) do { \

511 out0 = in0 + in3; \	1268 out0 = in0 + in3; \

512 out1 = in1 + in2; \	1269 out1 = in1 + in2; \

513 out2 = in1 - in2; \	1270 out2 = in1 - in2; \

514 out3 = in0 - in3; \	1271 out3 = in0 - in3; \

515 }	1272 } while (0)

	1273

	1274 /* Description : Transpose 16x4 block into 4x16 with byte elements in vectors

	1275 * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,

	1276 * in8, in9, in10, in11, in12, in13, in14, in15

	1277 * Outputs - out0, out1, out2, out3

	1278 * Return Type - unsigned byte

	1279 */

	1280 #define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \

	1281 in8, in9, in10, in11, in12, in13, in14, in15, \

	1282 out0, out1, out2, out3) do { \

	1283 v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m, tmp4_m, tmp5_m; \

	1284 ILVEV_W2_SD(in0, in4, in8, in12, tmp2_m, tmp3_m); \

	1285 ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m); \

	1286 ILVEV_D2_UB(tmp2_m, tmp3_m, tmp0_m, tmp1_m, out1, out3); \

	1287 ILVEV_W2_SD(in2, in6, in10, in14, tmp4_m, tmp5_m); \

	1288 ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m); \

	1289 ILVEV_D2_SD(tmp4_m, tmp5_m, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \

	1290 ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m); \

	1291 ILVEVOD_H2_UB(tmp0_m, tmp1_m, tmp0_m, tmp1_m, out0, out2); \

	1292 ILVOD_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m); \

	1293 ILVEVOD_H2_UB(tmp0_m, tmp1_m, tmp0_m, tmp1_m, out1, out3); \

	1294 } while (0)

	1295

	1296 /* Description : Transpose 16x8 block into 8x16 with byte elements in vectors

	1297 * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,

	1298 * in8, in9, in10, in11, in12, in13, in14, in15

	1299 * Outputs - out0, out1, out2, out3, out4, out5, out6, out7

	1300 * Return Type - unsigned byte

	1301 */

	1302 #define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \

	1303 in8, in9, in10, in11, in12, in13, in14, in15, \

	1304 out0, out1, out2, out3, out4, out5, \

	1305 out6, out7) do { \

	1306 v8i16 tmp0_m, tmp1_m, tmp4_m, tmp5_m, tmp6_m, tmp7_m; \

	1307 v4i32 tmp2_m, tmp3_m; \

	1308 ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \

	1309 ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \

	1310 ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \

	1311 ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \

	1312 ILVEV_B2_SH(out7, out6, out5, out4, tmp0_m, tmp1_m); \

	1313 ILVOD_B2_SH(out7, out6, out5, out4, tmp4_m, tmp5_m); \

	1314 ILVEV_B2_UB(out3, out2, out1, out0, out5, out7); \

	1315 ILVOD_B2_SH(out3, out2, out1, out0, tmp6_m, tmp7_m); \

	1316 ILVEV_H2_SW(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \

	1317 ILVEVOD_W2_UB(tmp2_m, tmp3_m, tmp2_m, tmp3_m, out0, out4); \

	1318 ILVOD_H2_SW(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \

	1319 ILVEVOD_W2_UB(tmp2_m, tmp3_m, tmp2_m, tmp3_m, out2, out6); \

	1320 ILVEV_H2_SW(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \

	1321 ILVEVOD_W2_UB(tmp2_m, tmp3_m, tmp2_m, tmp3_m, out1, out5); \

	1322 ILVOD_H2_SW(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \

	1323 ILVEVOD_W2_UB(tmp2_m, tmp3_m, tmp2_m, tmp3_m, out3, out7); \

	1324 } while (0)

516	1325

517 /* Description : Transpose 4x4 block with word elements in vectors	1326 /* Description : Transpose 4x4 block with word elements in vectors

518 * Arguments : Inputs - in0, in1, in2, in3	1327 * Arguments : Inputs - in0, in1, in2, in3

519 * Outputs - out0, out1, out2, out3	1328 * Outputs - out0, out1, out2, out3

520 * Return Type - as per RTYPE	1329 * Return Type - as per RTYPE

521 */	1330 */

522 #define TRANSPOSE4x4_W(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) { \	1331 #define TRANSPOSE4x4_W(RTYPE, in0, in1, in2, in3, \

	1332 out0, out1, out2, out3) do { \

523 v4i32 s0_m, s1_m, s2_m, s3_m; \	1333 v4i32 s0_m, s1_m, s2_m, s3_m; \

524 ILVRL_W2_SW(in1, in0, s0_m, s1_m); \	1334 ILVRL_W2_SW(in1, in0, s0_m, s1_m); \

525 ILVRL_W2_SW(in3, in2, s2_m, s3_m); \	1335 ILVRL_W2_SW(in3, in2, s2_m, s3_m); \

526 out0 = (RTYPE)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \	1336 out0 = (RTYPE)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \

527 out1 = (RTYPE)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \	1337 out1 = (RTYPE)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \

528 out2 = (RTYPE)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \	1338 out2 = (RTYPE)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \

529 out3 = (RTYPE)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \	1339 out3 = (RTYPE)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \

530 }	1340 } while (0)

531 #define TRANSPOSE4x4_SW_SW(...) TRANSPOSE4x4_W(v4i32, __VA_ARGS__)	1341 #define TRANSPOSE4x4_SW_SW(...) TRANSPOSE4x4_W(v4i32, __VA_ARGS__)

532	1342

533 /* Description : Add block 4x4	1343 /* Description : Add block 4x4

534 * Arguments : Inputs - in0, in1, in2, in3, pdst, stride	1344 * Arguments : Inputs - in0, in1, in2, in3, pdst, stride

535 * Details : Least significant 4 bytes from each input vector are added to	1345 * Details : Least significant 4 bytes from each input vector are added to

536 * the destination bytes, clipped between 0-255 and stored.	1346 * the destination bytes, clipped between 0-255 and stored.

537 */	1347 */

538 #define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) { \	1348 #define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) do { \

539 uint32_t src0_m, src1_m, src2_m, src3_m; \	1349 uint32_t src0_m, src1_m, src2_m, src3_m; \

540 v8i16 inp0_m, inp1_m, res0_m, res1_m; \	1350 v8i16 inp0_m, inp1_m, res0_m, res1_m; \

541 v16i8 dst0_m = { 0 }; \	1351 v16i8 dst0_m = { 0 }; \

542 v16i8 dst1_m = { 0 }; \	1352 v16i8 dst1_m = { 0 }; \

543 const v16i8 zero_m = { 0 }; \	1353 const v16i8 zero_m = { 0 }; \

544 ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m); \	1354 ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m); \

545 LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \	1355 LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \

546 INSERT_W2_SB(src0_m, src1_m, dst0_m); \	1356 INSERT_W2_SB(src0_m, src1_m, dst0_m); \

547 INSERT_W2_SB(src2_m, src3_m, dst1_m); \	1357 INSERT_W2_SB(src2_m, src3_m, dst1_m); \

548 ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \	1358 ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \

549 ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \	1359 ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \

550 CLIP_SH2_0_255(res0_m, res1_m); \	1360 CLIP_SH2_0_255(res0_m, res1_m); \

551 PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \	1361 PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \

552 ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \	1362 ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \

553 }	1363 } while (0)

	1364

	1365 /* Description : Pack even byte elements, extract 0 & 2 index words from pair

	1366 * of results and store 4 words in destination memory as per

	1367 * stride

	1368 * Arguments : Inputs - in0, in1, in2, in3, pdst, stride

	1369 */

	1370 #define PCKEV_ST4x4_UB(in0, in1, in2, in3, pdst, stride) do { \

	1371 v16i8 tmp0_m, tmp1_m; \

	1372 PCKEV_B2_SB(in1, in0, in3, in2, tmp0_m, tmp1_m); \

	1373 ST4x4_UB(tmp0_m, tmp1_m, 0, 2, 0, 2, pdst, stride); \

	1374 } while (0)

	1375

	1376 /* Description : Average with rounding, (in0 + in1 + 1) / 2.

	1377 * Arguments : Inputs - in0, in1, in2, in3

	1378 * Outputs - out0, out1

	1379 * Return Type - as per RTYPE

	1380 * Details : Each unsigned byte element from 'in0' vector is added with

	1381 * each unsigned byte element from 'in1' vector. Then the average

	1382 * with rounding is calculated and written to 'out0'

	1383 */

	1384 #define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) do { \

	1385 out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \

	1386 out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3); \

	1387 } while (0)

	1388 #define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)

554	1389

555 #endif /* WEBP_DSP_MSA_MACRO_H_ */	1390 #endif /* WEBP_DSP_MSA_MACRO_H_ */

OLD	NEW

« no previous file with comments | « third_party/libwebp/dsp/lossless_sse2.c ('k') | third_party/libwebp/dsp/neon.h » ('j') | no next file with comments »