Linux_ia32/lib/clang/3.2/include/avxintrin.h - Issue 11348245: Add 32-bit version of llvm-allocated-type.

Side by Side Diff: Linux_ia32/lib/clang/3.2/include/avxintrin.h

Issue 11348245: Add 32-bit version of llvm-allocated-type. Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/llvm-allocated-type/

Patch Set: Created 8 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 /*===---- avxintrin.h - AVX intrinsics -------------------------------------===

	2 *

	3 * Permission is hereby granted, free of charge, to any person obtaining a copy

	4 * of this software and associated documentation files (the "Software"), to deal

	5 * in the Software without restriction, including without limitation the rights

	6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell

	7 * copies of the Software, and to permit persons to whom the Software is

	8 * furnished to do so, subject to the following conditions:

	9 *

	10 * The above copyright notice and this permission notice shall be included in

	11 * all copies or substantial portions of the Software.

	12 *

	13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR

	14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,

	15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE

	16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER

	17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,

	18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN

	19 * THE SOFTWARE.

	20 *

	21 *===-----------------------------------------------------------------------===

	22 */

	23

	24 #ifndef __IMMINTRIN_H

	25 #error "Never use <avxintrin.h> directly; include <immintrin.h> instead."

	26 #endif

	27

	28 typedef double __v4df __attribute__ ((__vector_size__ (32)));

	29 typedef float __v8sf __attribute__ ((__vector_size__ (32)));

	30 typedef long long __v4di __attribute__ ((__vector_size__ (32)));

	31 typedef int __v8si __attribute__ ((__vector_size__ (32)));

	32 typedef short __v16hi __attribute__ ((__vector_size__ (32)));

	33 typedef char __v32qi __attribute__ ((__vector_size__ (32)));

	34

	35 typedef float __m256 __attribute__ ((__vector_size__ (32)));

	36 typedef double __m256d __attribute__((__vector_size__(32)));

	37 typedef long long __m256i __attribute__((__vector_size__(32)));

	38

	39 /* Arithmetic */

	40 static __inline __m256d __attribute__((__always_inline__, __nodebug__))

	41 _mm256_add_pd(__m256d a, __m256d b)

	42 {

	43 return a+b;

	44 }

	45

	46 static __inline __m256 __attribute__((__always_inline__, __nodebug__))

	47 _mm256_add_ps(__m256 a, __m256 b)

	48 {

	49 return a+b;

	50 }

	51

	52 static __inline __m256d __attribute__((__always_inline__, __nodebug__))

	53 _mm256_sub_pd(__m256d a, __m256d b)

	54 {

	55 return a-b;

	56 }

	57

	58 static __inline __m256 __attribute__((__always_inline__, __nodebug__))

	59 _mm256_sub_ps(__m256 a, __m256 b)

	60 {

	61 return a-b;

	62 }

	63

	64 static __inline __m256d __attribute__((__always_inline__, __nodebug__))

	65 _mm256_addsub_pd(__m256d a, __m256d b)

	66 {

	67 return (__m256d)__builtin_ia32_addsubpd256((__v4df)a, (__v4df)b);

	68 }

	69

	70 static __inline __m256 __attribute__((__always_inline__, __nodebug__))

	71 _mm256_addsub_ps(__m256 a, __m256 b)

	72 {

	73 return (__m256)__builtin_ia32_addsubps256((__v8sf)a, (__v8sf)b);

	74 }

	75

	76 static __inline __m256d __attribute__((__always_inline__, __nodebug__))

	77 _mm256_div_pd(__m256d a, __m256d b)

	78 {

	79 return a / b;

	80 }

	81

	82 static __inline __m256 __attribute__((__always_inline__, __nodebug__))

	83 _mm256_div_ps(__m256 a, __m256 b)

	84 {

	85 return a / b;

	86 }

	87

	88 static __inline __m256d __attribute__((__always_inline__, __nodebug__))

	89 _mm256_max_pd(__m256d a, __m256d b)

	90 {

	91 return (__m256d)__builtin_ia32_maxpd256((__v4df)a, (__v4df)b);

	92 }

	93

	94 static __inline __m256 __attribute__((__always_inline__, __nodebug__))

	95 _mm256_max_ps(__m256 a, __m256 b)

	96 {

	97 return (__m256)__builtin_ia32_maxps256((__v8sf)a, (__v8sf)b);

	98 }

	99

	100 static __inline __m256d __attribute__((__always_inline__, __nodebug__))

	101 _mm256_min_pd(__m256d a, __m256d b)

	102 {

	103 return (__m256d)__builtin_ia32_minpd256((__v4df)a, (__v4df)b);

	104 }

	105

	106 static __inline __m256 __attribute__((__always_inline__, __nodebug__))

	107 _mm256_min_ps(__m256 a, __m256 b)

	108 {

	109 return (__m256)__builtin_ia32_minps256((__v8sf)a, (__v8sf)b);

	110 }

	111

	112 static __inline __m256d __attribute__((__always_inline__, __nodebug__))

	113 _mm256_mul_pd(__m256d a, __m256d b)

	114 {

	115 return a * b;

	116 }

	117

	118 static __inline __m256 __attribute__((__always_inline__, __nodebug__))

	119 _mm256_mul_ps(__m256 a, __m256 b)

	120 {

	121 return a * b;

	122 }

	123

	124 static __inline __m256d __attribute__((__always_inline__, __nodebug__))

	125 _mm256_sqrt_pd(__m256d a)

	126 {

	127 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)a);

	128 }

	129

	130 static __inline __m256 __attribute__((__always_inline__, __nodebug__))

	131 _mm256_sqrt_ps(__m256 a)

	132 {

	133 return (__m256)__builtin_ia32_sqrtps256((__v8sf)a);

	134 }

	135

	136 static __inline __m256 __attribute__((__always_inline__, __nodebug__))

	137 _mm256_rsqrt_ps(__m256 a)

	138 {

	139 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)a);

	140 }

	141

	142 static __inline __m256 __attribute__((__always_inline__, __nodebug__))

	143 _mm256_rcp_ps(__m256 a)

	144 {

	145 return (__m256)__builtin_ia32_rcpps256((__v8sf)a);

	146 }

	147

	148 #define _mm256_round_pd(V, M) __extension__ ({ \

	149 __m256d __V = (V); \

	150 (__m256d)__builtin_ia32_roundpd256((__v4df)__V, (M)); })

	151

	152 #define _mm256_round_ps(V, M) __extension__ ({ \

	153 __m256 __V = (V); \

	154 (__m256)__builtin_ia32_roundps256((__v8sf)__V, (M)); })

	155

	156 #define _mm256_ceil_pd(V) _mm256_round_pd((V), _MM_FROUND_CEIL)

	157 #define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)

	158 #define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL)

	159 #define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)

	160

	161 /* Logical */

	162 static __inline __m256d __attribute__((__always_inline__, __nodebug__))

	163 _mm256_and_pd(__m256d a, __m256d b)

	164 {

	165 return (__m256d)((__v4di)a & (__v4di)b);

	166 }

	167

	168 static __inline __m256 __attribute__((__always_inline__, __nodebug__))

	169 _mm256_and_ps(__m256 a, __m256 b)

	170 {

	171 return (__m256)((__v8si)a & (__v8si)b);

	172 }

	173

	174 static __inline __m256d __attribute__((__always_inline__, __nodebug__))

	175 _mm256_andnot_pd(__m256d a, __m256d b)

	176 {

	177 return (__m256d)(~(__v4di)a & (__v4di)b);

	178 }

	179

	180 static __inline __m256 __attribute__((__always_inline__, __nodebug__))

	181 _mm256_andnot_ps(__m256 a, __m256 b)

	182 {

	183 return (__m256)(~(__v8si)a & (__v8si)b);

	184 }

	185

	186 static __inline __m256d __attribute__((__always_inline__, __nodebug__))

	187 _mm256_or_pd(__m256d a, __m256d b)

	188 {

	189 return (__m256d)((__v4di)a \| (__v4di)b);

	190 }

	191

	192 static __inline __m256 __attribute__((__always_inline__, __nodebug__))

	193 _mm256_or_ps(__m256 a, __m256 b)

	194 {

	195 return (__m256)((__v8si)a \| (__v8si)b);

	196 }

	197

	198 static __inline __m256d __attribute__((__always_inline__, __nodebug__))

	199 _mm256_xor_pd(__m256d a, __m256d b)

	200 {

	201 return (__m256d)((__v4di)a ^ (__v4di)b);

	202 }

	203

	204 static __inline __m256 __attribute__((__always_inline__, __nodebug__))

	205 _mm256_xor_ps(__m256 a, __m256 b)

	206 {

	207 return (__m256)((__v8si)a ^ (__v8si)b);

	208 }

	209

	210 /* Horizontal arithmetic */

	211 static __inline __m256d __attribute__((__always_inline__, __nodebug__))

	212 _mm256_hadd_pd(__m256d a, __m256d b)

	213 {

	214 return (__m256d)__builtin_ia32_haddpd256((__v4df)a, (__v4df)b);

	215 }

	216

	217 static __inline __m256 __attribute__((__always_inline__, __nodebug__))

	218 _mm256_hadd_ps(__m256 a, __m256 b)

	219 {

	220 return (__m256)__builtin_ia32_haddps256((__v8sf)a, (__v8sf)b);

	221 }

	222

	223 static __inline __m256d __attribute__((__always_inline__, __nodebug__))

	224 _mm256_hsub_pd(__m256d a, __m256d b)

	225 {

	226 return (__m256d)__builtin_ia32_hsubpd256((__v4df)a, (__v4df)b);

	227 }

	228

	229 static __inline __m256 __attribute__((__always_inline__, __nodebug__))

	230 _mm256_hsub_ps(__m256 a, __m256 b)

	231 {

	232 return (__m256)__builtin_ia32_hsubps256((__v8sf)a, (__v8sf)b);

	233 }

	234

	235 /* Vector permutations */

	236 static __inline __m128d __attribute__((__always_inline__, __nodebug__))

	237 _mm_permutevar_pd(__m128d a, __m128i c)

	238 {

	239 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)a, (__v2di)c);

	240 }

	241

	242 static __inline __m256d __attribute__((__always_inline__, __nodebug__))

	243 _mm256_permutevar_pd(__m256d a, __m256i c)

	244 {

	245 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)a, (__v4di)c);

	246 }

	247

	248 static __inline __m128 __attribute__((__always_inline__, __nodebug__))

	249 _mm_permutevar_ps(__m128 a, __m128i c)

	250 {

	251 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)a, (__v4si)c);

	252 }

	253

	254 static __inline __m256 __attribute__((__always_inline__, __nodebug__))

	255 _mm256_permutevar_ps(__m256 a, __m256i c)

	256 {

	257 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)a,

	258 (__v8si)c);

	259 }

	260

	261 #define _mm_permute_pd(A, C) __extension__ ({ \

	262 __m128d __A = (A); \

	263 (__m128d)__builtin_shufflevector((__v2df)__A, (__v2df) _mm_setzero_pd(), \

	264 (C) & 0x1, ((C) & 0x2) >> 1); })

	265

	266 #define _mm256_permute_pd(A, C) __extension__ ({ \

	267 __m256d __A = (A); \

	268 (__m256d)__builtin_shufflevector((__v4df)__A, (__v4df) _mm256_setzero_pd(), \

	269 (C) & 0x1, ((C) & 0x2) >> 1, \

	270 2 + (((C) & 0x4) >> 2), \

	271 2 + (((C) & 0x8) >> 3)); })

	272

	273 #define _mm_permute_ps(A, C) __extension__ ({ \

	274 __m128 __A = (A); \

	275 (__m128)__builtin_shufflevector((__v4sf)__A, (__v4sf) _mm_setzero_ps(), \

	276 (C) & 0x3, ((C) & 0xc) >> 2, \

	277 ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6); })

	278

	279 #define _mm256_permute_ps(A, C) __extension__ ({ \

	280 __m256 __A = (A); \

	281 (__m256)__builtin_shufflevector((__v8sf)__A, (__v8sf) _mm256_setzero_ps(), \

	282 (C) & 0x3, ((C) & 0xc) >> 2, \

	283 ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6, \

	284 4 + (((C) & 0x03) >> 0), \

	285 4 + (((C) & 0x0c) >> 2), \

	286 4 + (((C) & 0x30) >> 4), \

	287 4 + (((C) & 0xc0) >> 6)); })

	288

	289 #define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \

	290 __m256d __V1 = (V1); \

	291 __m256d __V2 = (V2); \

	292 (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)__V1, (__v4df)__V2, (M)); })

	293

	294 #define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \

	295 __m256 __V1 = (V1); \

	296 __m256 __V2 = (V2); \

	297 (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)__V1, (__v8sf)__V2, (M)); })

	298

	299 #define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \

	300 __m256i __V1 = (V1); \

	301 __m256i __V2 = (V2); \

	302 (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)__V1, (__v8si)__V2, (M)); })

	303

	304 /* Vector Blend */

	305 #define _mm256_blend_pd(V1, V2, M) __extension__ ({ \

	306 __m256d __V1 = (V1); \

	307 __m256d __V2 = (V2); \

	308 (__m256d)__builtin_ia32_blendpd256((__v4df)__V1, (__v4df)__V2, (M)); })

	309

	310 #define _mm256_blend_ps(V1, V2, M) __extension__ ({ \

	311 __m256 __V1 = (V1); \

	312 __m256 __V2 = (V2); \

	313 (__m256)__builtin_ia32_blendps256((__v8sf)__V1, (__v8sf)__V2, (M)); })

	314

	315 static __inline __m256d __attribute__((__always_inline__, __nodebug__))

	316 _mm256_blendv_pd(__m256d a, __m256d b, __m256d c)

	317 {

	318 return (__m256d)__builtin_ia32_blendvpd256((__v4df)a, (__v4df)b, (__v4df)c);

	319 }

	320

	321 static __inline __m256 __attribute__((__always_inline__, __nodebug__))

	322 _mm256_blendv_ps(__m256 a, __m256 b, __m256 c)

	323 {

	324 return (__m256)__builtin_ia32_blendvps256((__v8sf)a, (__v8sf)b, (__v8sf)c);

	325 }

	326

	327 /* Vector Dot Product */

	328 #define _mm256_dp_ps(V1, V2, M) __extension__ ({ \

	329 __m256 __V1 = (V1); \

	330 __m256 __V2 = (V2); \

	331 (__m256)__builtin_ia32_dpps256((__v8sf)__V1, (__v8sf)__V2, (M)); })

	332

	333 /* Vector shuffle */

	334 #define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \

	335 __m256 __a = (a); \

	336 __m256 __b = (b); \

	337 (__m256)__builtin_shufflevector((__v8sf)__a, (__v8sf)__b, \

	338 (mask) & 0x3, ((mask) & 0xc) >> 2, \

	339 (((mask) & 0x30) >> 4) + 8, (((mask) & 0xc0) >> 6) + 8, \

	340 ((mask) & 0x3) + 4, (((mask) & 0xc) >> 2) + 4, \

	341 (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12); })

	342

	343 #define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \

	344 __m256d __a = (a); \

	345 __m256d __b = (b); \

	346 (__m256d)__builtin_shufflevector((__v4df)__a, (__v4df)__b, \

	347 (mask) & 0x1, \

	348 (((mask) & 0x2) >> 1) + 4, \

	349 (((mask) & 0x4) >> 2) + 2, \

	350 (((mask) & 0x8) >> 3) + 6); })

	351

	352 /* Compare */

	353 #define _CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */

	354 #define _CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */

	355 #define _CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */

	356 #define _CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */

	357 #define _CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */

	358 #define _CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */

	359 #define _CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */

	360 #define _CMP_ORD_Q 0x07 /* Ordered (nonsignaling) */

	361 #define _CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */

	362 #define _CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unord, signaling) */

	363 #define _CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */

	364 #define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */

	365 #define _CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */

	366 #define _CMP_GE_OS 0x0d /* Greater-than-or-equal (ordered, signaling) */

	367 #define _CMP_GT_OS 0x0e /* Greater-than (ordered, signaling) */

	368 #define _CMP_TRUE_UQ 0x0f /* True (unordered, non-signaling) */

	369 #define _CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */

	370 #define _CMP_LT_OQ 0x11 /* Less-than (ordered, non-signaling) */

	371 #define _CMP_LE_OQ 0x12 /* Less-than-or-equal (ordered, non-signaling) */

	372 #define _CMP_UNORD_S 0x13 /* Unordered (signaling) */

	373 #define _CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */

	374 #define _CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */

	375 #define _CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unord, non-signaling) */

	376 #define _CMP_ORD_S 0x17 /* Ordered (signaling) */

	377 #define _CMP_EQ_US 0x18 /* Equal (unordered, signaling) */

	378 #define _CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unord, non-sign) */

	379 #define _CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */

	380 #define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */

	381 #define _CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */

	382 #define _CMP_GE_OQ 0x1d /* Greater-than-or-equal (ordered, non-signaling) */

	383 #define _CMP_GT_OQ 0x1e /* Greater-than (ordered, non-signaling) */

	384 #define _CMP_TRUE_US 0x1f /* True (unordered, signaling) */

	385

	386 #define _mm_cmp_pd(a, b, c) __extension__ ({ \

	387 __m128d __a = (a); \

	388 __m128d __b = (b); \

	389 (__m128d)__builtin_ia32_cmppd((__v2df)__a, (__v2df)__b, (c)); })

	390

	391 #define _mm_cmp_ps(a, b, c) __extension__ ({ \

	392 __m128 __a = (a); \

	393 __m128 __b = (b); \

	394 (__m128)__builtin_ia32_cmpps((__v4sf)__a, (__v4sf)__b, (c)); })

	395

	396 #define _mm256_cmp_pd(a, b, c) __extension__ ({ \

	397 __m256d __a = (a); \

	398 __m256d __b = (b); \

	399 (__m256d)__builtin_ia32_cmppd256((__v4df)__a, (__v4df)__b, (c)); })

	400

	401 #define _mm256_cmp_ps(a, b, c) __extension__ ({ \

	402 __m256 __a = (a); \

	403 __m256 __b = (b); \

	404 (__m256)__builtin_ia32_cmpps256((__v8sf)__a, (__v8sf)__b, (c)); })

	405

	406 #define _mm_cmp_sd(a, b, c) __extension__ ({ \

	407 __m128d __a = (a); \

	408 __m128d __b = (b); \

	409 (__m128d)__builtin_ia32_cmpsd((__v2df)__a, (__v2df)__b, (c)); })

	410

	411 #define _mm_cmp_ss(a, b, c) __extension__ ({ \

	412 __m128 __a = (a); \

	413 __m128 __b = (b); \

	414 (__m128)__builtin_ia32_cmpss((__v4sf)__a, (__v4sf)__b, (c)); })

	415

	416 /* Vector extract */

	417 #define _mm256_extractf128_pd(A, O) __extension__ ({ \

	418 __m256d __A = (A); \

	419 (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)__A, (O)); })

	420

	421 #define _mm256_extractf128_ps(A, O) __extension__ ({ \

	422 __m256 __A = (A); \

	423 (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)__A, (O)); })

	424

	425 #define _mm256_extractf128_si256(A, O) __extension__ ({ \

	426 __m256i __A = (A); \

	427 (__m128i)__builtin_ia32_vextractf128_si256((__v8si)__A, (O)); })

	428

	429 static __inline int __attribute__((__always_inline__, __nodebug__))

	430 _mm256_extract_epi32(__m256i a, int const imm)

	431 {

	432 __v8si b = (__v8si)a;

	433 return b[imm];

	434 }

	435

	436 static __inline int __attribute__((__always_inline__, __nodebug__))

	437 _mm256_extract_epi16(__m256i a, int const imm)

	438 {

	439 __v16hi b = (__v16hi)a;

	440 return b[imm];

	441 }

	442

	443 static __inline int __attribute__((__always_inline__, __nodebug__))

	444 _mm256_extract_epi8(__m256i a, int const imm)

	445 {

	446 __v32qi b = (__v32qi)a;

	447 return b[imm];

	448 }

	449

	450 #ifdef __x86_64__

	451 static __inline long long __attribute__((__always_inline__, __nodebug__))

	452 _mm256_extract_epi64(__m256i a, const int imm)

	453 {

	454 __v4di b = (__v4di)a;

	455 return b[imm];

	456 }

	457 #endif

	458

	459 /* Vector insert */

	460 #define _mm256_insertf128_pd(V1, V2, O) __extension__ ({ \

	461 __m256d __V1 = (V1); \

	462 __m128d __V2 = (V2); \

	463 (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)__V1, (__v2df)__V2, (O)); })

	464

	465 #define _mm256_insertf128_ps(V1, V2, O) __extension__ ({ \

	466 __m256 __V1 = (V1); \

	467 __m128 __V2 = (V2); \

	468 (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)__V1, (__v4sf)__V2, (O)); })

	469

	470 #define _mm256_insertf128_si256(V1, V2, O) __extension__ ({ \

	471 __m256i __V1 = (V1); \

	472 __m128i __V2 = (V2); \

	473 (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)__V1, (__v4si)__V2, (O)); })

	474

	475 static __inline __m256i __attribute__((__always_inline__, __nodebug__))

	476 _mm256_insert_epi32(__m256i a, int b, int const imm)

	477 {

	478 __v8si c = (__v8si)a;

	479 c[imm & 7] = b;

	480 return (__m256i)c;

	481 }

	482

	483 static __inline __m256i __attribute__((__always_inline__, __nodebug__))

	484 _mm256_insert_epi16(__m256i a, int b, int const imm)

	485 {

	486 __v16hi c = (__v16hi)a;

	487 c[imm & 15] = b;

	488 return (__m256i)c;

	489 }

	490

	491 static __inline __m256i __attribute__((__always_inline__, __nodebug__))

	492 _mm256_insert_epi8(__m256i a, int b, int const imm)

	493 {

	494 __v32qi c = (__v32qi)a;

	495 c[imm & 31] = b;

	496 return (__m256i)c;

	497 }

	498

	499 #ifdef __x86_64__

	500 static __inline __m256i __attribute__((__always_inline__, __nodebug__))

	501 _mm256_insert_epi64(__m256i a, int b, int const imm)

	502 {

	503 __v4di c = (__v4di)a;

	504 c[imm & 3] = b;

	505 return (__m256i)c;

	506 }

	507 #endif

	508

	509 /* Conversion */

	510 static __inline __m256d __attribute__((__always_inline__, __nodebug__))

	511 _mm256_cvtepi32_pd(__m128i a)

	512 {

	513 return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) a);

	514 }

	515

	516 static __inline __m256 __attribute__((__always_inline__, __nodebug__))

	517 _mm256_cvtepi32_ps(__m256i a)

	518 {

	519 return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) a);

	520 }

	521

	522 static __inline __m128 __attribute__((__always_inline__, __nodebug__))

	523 _mm256_cvtpd_ps(__m256d a)

	524 {

	525 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) a);

	526 }

	527

	528 static __inline __m256i __attribute__((__always_inline__, __nodebug__))

	529 _mm256_cvtps_epi32(__m256 a)

	530 {

	531 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) a);

	532 }

	533

	534 static __inline __m256d __attribute__((__always_inline__, __nodebug__))

	535 _mm256_cvtps_pd(__m128 a)

	536 {

	537 return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) a);

	538 }

	539

	540 static __inline __m128i __attribute__((__always_inline__, __nodebug__))

	541 _mm256_cvttpd_epi32(__m256d a)

	542 {

	543 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) a);

	544 }

	545

	546 static __inline __m128i __attribute__((__always_inline__, __nodebug__))

	547 _mm256_cvtpd_epi32(__m256d a)

	548 {

	549 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) a);

	550 }

	551

	552 static __inline __m256i __attribute__((__always_inline__, __nodebug__))

	553 _mm256_cvttps_epi32(__m256 a)

	554 {

	555 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) a);

	556 }

	557

	558 /* Vector replicate */

	559 static __inline __m256 __attribute__((__always_inline__, __nodebug__))

	560 _mm256_movehdup_ps(__m256 a)

	561 {

	562 return __builtin_shufflevector(a, a, 1, 1, 3, 3, 5, 5, 7, 7);

	563 }

	564

	565 static __inline __m256 __attribute__((__always_inline__, __nodebug__))

	566 _mm256_moveldup_ps(__m256 a)

	567 {

	568 return __builtin_shufflevector(a, a, 0, 0, 2, 2, 4, 4, 6, 6);

	569 }

	570

	571 static __inline __m256d __attribute__((__always_inline__, __nodebug__))

	572 _mm256_movedup_pd(__m256d a)

	573 {

	574 return __builtin_shufflevector(a, a, 0, 0, 2, 2);

	575 }

	576

	577 /* Unpack and Interleave */

	578 static __inline __m256d __attribute__((__always_inline__, __nodebug__))

	579 _mm256_unpackhi_pd(__m256d a, __m256d b)

	580 {

	581 return __builtin_shufflevector(a, b, 1, 5, 1+2, 5+2);

	582 }

	583

	584 static __inline __m256d __attribute__((__always_inline__, __nodebug__))

	585 _mm256_unpacklo_pd(__m256d a, __m256d b)

	586 {

	587 return __builtin_shufflevector(a, b, 0, 4, 0+2, 4+2);

	588 }

	589

	590 static __inline __m256 __attribute__((__always_inline__, __nodebug__))

	591 _mm256_unpackhi_ps(__m256 a, __m256 b)

	592 {

	593 return __builtin_shufflevector(a, b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);

	594 }

	595

	596 static __inline __m256 __attribute__((__always_inline__, __nodebug__))

	597 _mm256_unpacklo_ps(__m256 a, __m256 b)

	598 {

	599 return __builtin_shufflevector(a, b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);

	600 }

	601

	602 /* Bit Test */

	603 static __inline int __attribute__((__always_inline__, __nodebug__))

	604 _mm_testz_pd(__m128d a, __m128d b)

	605 {

	606 return __builtin_ia32_vtestzpd((__v2df)a, (__v2df)b);

	607 }

	608

	609 static __inline int __attribute__((__always_inline__, __nodebug__))

	610 _mm_testc_pd(__m128d a, __m128d b)

	611 {

	612 return __builtin_ia32_vtestcpd((__v2df)a, (__v2df)b);

	613 }

	614

	615 static __inline int __attribute__((__always_inline__, __nodebug__))

	616 _mm_testnzc_pd(__m128d a, __m128d b)

	617 {

	618 return __builtin_ia32_vtestnzcpd((__v2df)a, (__v2df)b);

	619 }

	620

	621 static __inline int __attribute__((__always_inline__, __nodebug__))

	622 _mm_testz_ps(__m128 a, __m128 b)

	623 {

	624 return __builtin_ia32_vtestzps((__v4sf)a, (__v4sf)b);

	625 }

	626

	627 static __inline int __attribute__((__always_inline__, __nodebug__))

	628 _mm_testc_ps(__m128 a, __m128 b)

	629 {

	630 return __builtin_ia32_vtestcps((__v4sf)a, (__v4sf)b);

	631 }

	632

	633 static __inline int __attribute__((__always_inline__, __nodebug__))

	634 _mm_testnzc_ps(__m128 a, __m128 b)

	635 {

	636 return __builtin_ia32_vtestnzcps((__v4sf)a, (__v4sf)b);

	637 }

	638

	639 static __inline int __attribute__((__always_inline__, __nodebug__))

	640 _mm256_testz_pd(__m256d a, __m256d b)

	641 {

	642 return __builtin_ia32_vtestzpd256((__v4df)a, (__v4df)b);

	643 }

	644

	645 static __inline int __attribute__((__always_inline__, __nodebug__))

	646 _mm256_testc_pd(__m256d a, __m256d b)

	647 {

	648 return __builtin_ia32_vtestcpd256((__v4df)a, (__v4df)b);

	649 }

	650

	651 static __inline int __attribute__((__always_inline__, __nodebug__))

	652 _mm256_testnzc_pd(__m256d a, __m256d b)

	653 {

	654 return __builtin_ia32_vtestnzcpd256((__v4df)a, (__v4df)b);

	655 }

	656

	657 static __inline int __attribute__((__always_inline__, __nodebug__))

	658 _mm256_testz_ps(__m256 a, __m256 b)

	659 {

	660 return __builtin_ia32_vtestzps256((__v8sf)a, (__v8sf)b);

	661 }

	662

	663 static __inline int __attribute__((__always_inline__, __nodebug__))

	664 _mm256_testc_ps(__m256 a, __m256 b)

	665 {

	666 return __builtin_ia32_vtestcps256((__v8sf)a, (__v8sf)b);

	667 }

	668

	669 static __inline int __attribute__((__always_inline__, __nodebug__))

	670 _mm256_testnzc_ps(__m256 a, __m256 b)

	671 {

	672 return __builtin_ia32_vtestnzcps256((__v8sf)a, (__v8sf)b);

	673 }

	674

	675 static __inline int __attribute__((__always_inline__, __nodebug__))

	676 _mm256_testz_si256(__m256i a, __m256i b)

	677 {

	678 return __builtin_ia32_ptestz256((__v4di)a, (__v4di)b);

	679 }

	680

	681 static __inline int __attribute__((__always_inline__, __nodebug__))

	682 _mm256_testc_si256(__m256i a, __m256i b)

	683 {

	684 return __builtin_ia32_ptestc256((__v4di)a, (__v4di)b);

	685 }

	686

	687 static __inline int __attribute__((__always_inline__, __nodebug__))

	688 _mm256_testnzc_si256(__m256i a, __m256i b)

	689 {

	690 return __builtin_ia32_ptestnzc256((__v4di)a, (__v4di)b);

	691 }

	692

	693 /* Vector extract sign mask */

	694 static __inline int __attribute__((__always_inline__, __nodebug__))

	695 _mm256_movemask_pd(__m256d a)

	696 {

	697 return __builtin_ia32_movmskpd256((__v4df)a);

	698 }

	699

	700 static __inline int __attribute__((__always_inline__, __nodebug__))

	701 _mm256_movemask_ps(__m256 a)

	702 {

	703 return __builtin_ia32_movmskps256((__v8sf)a);

	704 }

	705

	706 /* Vector zero */

	707 static __inline void __attribute__((__always_inline__, __nodebug__))

	708 _mm256_zeroall(void)

	709 {

	710 __builtin_ia32_vzeroall();

	711 }

	712

	713 static __inline void __attribute__((__always_inline__, __nodebug__))

	714 _mm256_zeroupper(void)

	715 {

	716 __builtin_ia32_vzeroupper();

	717 }

	718

	719 /* Vector load with broadcast */

	720 static __inline __m128 __attribute__((__always_inline__, __nodebug__))

	721 _mm_broadcast_ss(float const *a)

	722 {

	723 return (__m128)__builtin_ia32_vbroadcastss(a);

	724 }

	725

	726 static __inline __m256d __attribute__((__always_inline__, __nodebug__))

	727 _mm256_broadcast_sd(double const *a)

	728 {

	729 return (__m256d)__builtin_ia32_vbroadcastsd256(a);

	730 }

	731

	732 static __inline __m256 __attribute__((__always_inline__, __nodebug__))

	733 _mm256_broadcast_ss(float const *a)

	734 {

	735 return (__m256)__builtin_ia32_vbroadcastss256(a);

	736 }

	737

	738 static __inline __m256d __attribute__((__always_inline__, __nodebug__))

	739 _mm256_broadcast_pd(__m128d const *a)

	740 {

	741 return (__m256d)__builtin_ia32_vbroadcastf128_pd256(a);

	742 }

	743

	744 static __inline __m256 __attribute__((__always_inline__, __nodebug__))

	745 _mm256_broadcast_ps(__m128 const *a)

	746 {

	747 return (__m256)__builtin_ia32_vbroadcastf128_ps256(a);

	748 }

	749

	750 /* SIMD load ops */

	751 static __inline __m256d __attribute__((__always_inline__, __nodebug__))

	752 _mm256_load_pd(double const *p)

	753 {

	754 return (__m256d )p;

	755 }

	756

	757 static __inline __m256 __attribute__((__always_inline__, __nodebug__))

	758 _mm256_load_ps(float const *p)

	759 {

	760 return (__m256 )p;

	761 }

	762

	763 static __inline __m256d __attribute__((__always_inline__, __nodebug__))

	764 _mm256_loadu_pd(double const *p)

	765 {

	766 struct __loadu_pd {

	767 __m256d v;

	768 } __attribute__((packed, may_alias));

	769 return ((struct __loadu_pd*)p)->v;

	770 }

	771

	772 static __inline __m256 __attribute__((__always_inline__, __nodebug__))

	773 _mm256_loadu_ps(float const *p)

	774 {

	775 struct __loadu_ps {

	776 __m256 v;

	777 } __attribute__((packed, may_alias));

	778 return ((struct __loadu_ps*)p)->v;

	779 }

	780

	781 static __inline __m256i __attribute__((__always_inline__, __nodebug__))

	782 _mm256_load_si256(__m256i const *p)

	783 {

	784 return *p;

	785 }

	786

	787 static __inline __m256i __attribute__((__always_inline__, __nodebug__))

	788 _mm256_loadu_si256(__m256i const *p)

	789 {

	790 struct __loadu_si256 {

	791 __m256i v;

	792 } __attribute__((packed, may_alias));

	793 return ((struct __loadu_si256*)p)->v;

	794 }

	795

	796 static __inline __m256i __attribute__((__always_inline__, __nodebug__))

	797 _mm256_lddqu_si256(__m256i const *p)

	798 {

	799 return (__m256i)__builtin_ia32_lddqu256((char const *)p);

	800 }

	801

	802 /* SIMD store ops */

	803 static __inline void __attribute__((__always_inline__, __nodebug__))

	804 _mm256_store_pd(double *p, __m256d a)

	805 {

	806 (__m256d )p = a;

	807 }

	808

	809 static __inline void __attribute__((__always_inline__, __nodebug__))

	810 _mm256_store_ps(float *p, __m256 a)

	811 {

	812 (__m256 )p = a;

	813 }

	814

	815 static __inline void __attribute__((__always_inline__, __nodebug__))

	816 _mm256_storeu_pd(double *p, __m256d a)

	817 {

	818 __builtin_ia32_storeupd256(p, (__v4df)a);

	819 }

	820

	821 static __inline void __attribute__((__always_inline__, __nodebug__))

	822 _mm256_storeu_ps(float *p, __m256 a)

	823 {

	824 __builtin_ia32_storeups256(p, (__v8sf)a);

	825 }

	826

	827 static __inline void __attribute__((__always_inline__, __nodebug__))

	828 _mm256_store_si256(__m256i *p, __m256i a)

	829 {

	830 *p = a;

	831 }

	832

	833 static __inline void __attribute__((__always_inline__, __nodebug__))

	834 _mm256_storeu_si256(__m256i *p, __m256i a)

	835 {

	836 __builtin_ia32_storedqu256((char *)p, (__v32qi)a);

	837 }

	838

	839 /* Conditional load ops */

	840 static __inline __m128d __attribute__((__always_inline__, __nodebug__))

	841 _mm_maskload_pd(double const *p, __m128d m)

	842 {

	843 return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)p, (__v2df)m);

	844 }

	845

	846 static __inline __m256d __attribute__((__always_inline__, __nodebug__))

	847 _mm256_maskload_pd(double const *p, __m256d m)

	848 {

	849 return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)p, (__v4df)m);

	850 }

	851

	852 static __inline __m128 __attribute__((__always_inline__, __nodebug__))

	853 _mm_maskload_ps(float const *p, __m128 m)

	854 {

	855 return (__m128)__builtin_ia32_maskloadps((const __v4sf *)p, (__v4sf)m);

	856 }

	857

	858 static __inline __m256 __attribute__((__always_inline__, __nodebug__))

	859 _mm256_maskload_ps(float const *p, __m256 m)

	860 {

	861 return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)p, (__v8sf)m);

	862 }

	863

	864 /* Conditional store ops */

	865 static __inline void __attribute__((__always_inline__, __nodebug__))

	866 _mm256_maskstore_ps(float *p, __m256 m, __m256 a)

	867 {

	868 __builtin_ia32_maskstoreps256((__v8sf *)p, (__v8sf)m, (__v8sf)a);

	869 }

	870

	871 static __inline void __attribute__((__always_inline__, __nodebug__))

	872 _mm_maskstore_pd(double *p, __m128d m, __m128d a)

	873 {

	874 __builtin_ia32_maskstorepd((__v2df *)p, (__v2df)m, (__v2df)a);

	875 }

	876

	877 static __inline void __attribute__((__always_inline__, __nodebug__))

	878 _mm256_maskstore_pd(double *p, __m256d m, __m256d a)

	879 {

	880 __builtin_ia32_maskstorepd256((__v4df *)p, (__v4df)m, (__v4df)a);

	881 }

	882

	883 static __inline void __attribute__((__always_inline__, __nodebug__))

	884 _mm_maskstore_ps(float *p, __m128 m, __m128 a)

	885 {

	886 __builtin_ia32_maskstoreps((__v4sf *)p, (__v4sf)m, (__v4sf)a);

	887 }

	888

	889 /* Cacheability support ops */

	890 static __inline void __attribute__((__always_inline__, __nodebug__))

	891 _mm256_stream_si256(__m256i *a, __m256i b)

	892 {

	893 __builtin_ia32_movntdq256((__v4di *)a, (__v4di)b);

	894 }

	895

	896 static __inline void __attribute__((__always_inline__, __nodebug__))

	897 _mm256_stream_pd(double *a, __m256d b)

	898 {

	899 __builtin_ia32_movntpd256(a, (__v4df)b);

	900 }

	901

	902 static __inline void __attribute__((__always_inline__, __nodebug__))

	903 _mm256_stream_ps(float *p, __m256 a)

	904 {

	905 __builtin_ia32_movntps256(p, (__v8sf)a);

	906 }

	907

	908 /* Create vectors */

	909 static __inline __m256d __attribute__((__always_inline__, __nodebug__))

	910 _mm256_set_pd(double a, double b, double c, double d)

	911 {

	912 return (__m256d){ d, c, b, a };

	913 }

	914

	915 static __inline __m256 __attribute__((__always_inline__, __nodebug__))

	916 _mm256_set_ps(float a, float b, float c, float d,

	917 float e, float f, float g, float h)

	918 {

	919 return (__m256){ h, g, f, e, d, c, b, a };

	920 }

	921

	922 static __inline __m256i __attribute__((__always_inline__, __nodebug__))

	923 _mm256_set_epi32(int i0, int i1, int i2, int i3,

	924 int i4, int i5, int i6, int i7)

	925 {

	926 return (__m256i)(__v8si){ i7, i6, i5, i4, i3, i2, i1, i0 };

	927 }

	928

	929 static __inline __m256i __attribute__((__always_inline__, __nodebug__))

	930 _mm256_set_epi16(short w15, short w14, short w13, short w12,

	931 short w11, short w10, short w09, short w08,

	932 short w07, short w06, short w05, short w04,

	933 short w03, short w02, short w01, short w00)

	934 {

	935 return (__m256i)(__v16hi){ w00, w01, w02, w03, w04, w05, w06, w07,

	936 w08, w09, w10, w11, w12, w13, w14, w15 };

	937 }

	938

	939 static __inline __m256i __attribute__((__always_inline__, __nodebug__))

	940 _mm256_set_epi8(char b31, char b30, char b29, char b28,

	941 char b27, char b26, char b25, char b24,

	942 char b23, char b22, char b21, char b20,

	943 char b19, char b18, char b17, char b16,

	944 char b15, char b14, char b13, char b12,

	945 char b11, char b10, char b09, char b08,

	946 char b07, char b06, char b05, char b04,

	947 char b03, char b02, char b01, char b00)

	948 {

	949 return (__m256i)(__v32qi){

	950 b00, b01, b02, b03, b04, b05, b06, b07,

	951 b08, b09, b10, b11, b12, b13, b14, b15,

	952 b16, b17, b18, b19, b20, b21, b22, b23,

	953 b24, b25, b26, b27, b28, b29, b30, b31

	954 };

	955 }

	956

	957 static __inline __m256i __attribute__((__always_inline__, __nodebug__))

	958 _mm256_set_epi64x(long long a, long long b, long long c, long long d)

	959 {

	960 return (__m256i)(__v4di){ d, c, b, a };

	961 }

	962

	963 /* Create vectors with elements in reverse order */

	964 static __inline __m256d __attribute__((__always_inline__, __nodebug__))

	965 _mm256_setr_pd(double a, double b, double c, double d)

	966 {

	967 return (__m256d){ a, b, c, d };

	968 }

	969

	970 static __inline __m256 __attribute__((__always_inline__, __nodebug__))

	971 _mm256_setr_ps(float a, float b, float c, float d,

	972 float e, float f, float g, float h)

	973 {

	974 return (__m256){ a, b, c, d, e, f, g, h };

	975 }

	976

	977 static __inline __m256i __attribute__((__always_inline__, __nodebug__))

	978 _mm256_setr_epi32(int i0, int i1, int i2, int i3,

	979 int i4, int i5, int i6, int i7)

	980 {

	981 return (__m256i)(__v8si){ i0, i1, i2, i3, i4, i5, i6, i7 };

	982 }

	983

	984 static __inline __m256i __attribute__((__always_inline__, __nodebug__))

	985 _mm256_setr_epi16(short w15, short w14, short w13, short w12,

	986 short w11, short w10, short w09, short w08,

	987 short w07, short w06, short w05, short w04,

	988 short w03, short w02, short w01, short w00)

	989 {

	990 return (__m256i)(__v16hi){ w15, w14, w13, w12, w11, w10, w09, w08,

	991 w07, w06, w05, w04, w03, w02, w01 , w00 };

	992 }

	993

	994 static __inline __m256i __attribute__((__always_inline__, __nodebug__))

	995 _mm256_setr_epi8(char b31, char b30, char b29, char b28,

	996 char b27, char b26, char b25, char b24,

	997 char b23, char b22, char b21, char b20,

	998 char b19, char b18, char b17, char b16,

	999 char b15, char b14, char b13, char b12,

	1000 char b11, char b10, char b09, char b08,

	1001 char b07, char b06, char b05, char b04,

	1002 char b03, char b02, char b01, char b00)

	1003 {

	1004 return (__m256i)(__v32qi){

	1005 b31, b30, b29, b28, b27, b26, b25, b24,

	1006 b23, b22, b21, b20, b19, b18, b17, b16,

	1007 b15, b14, b13, b12, b11, b10, b09, b08,

	1008 b07, b06, b05, b04, b03, b02, b01, b00 };

	1009 }

	1010

	1011 static __inline __m256i __attribute__((__always_inline__, __nodebug__))

	1012 _mm256_setr_epi64x(long long a, long long b, long long c, long long d)

	1013 {

	1014 return (__m256i)(__v4di){ a, b, c, d };

	1015 }

	1016

	1017 /* Create vectors with repeated elements */

	1018 static __inline __m256d __attribute__((__always_inline__, __nodebug__))

	1019 _mm256_set1_pd(double w)

	1020 {

	1021 return (__m256d){ w, w, w, w };

	1022 }

	1023

	1024 static __inline __m256 __attribute__((__always_inline__, __nodebug__))

	1025 _mm256_set1_ps(float w)

	1026 {

	1027 return (__m256){ w, w, w, w, w, w, w, w };

	1028 }

	1029

	1030 static __inline __m256i __attribute__((__always_inline__, __nodebug__))

	1031 _mm256_set1_epi32(int i)

	1032 {

	1033 return (__m256i)(__v8si){ i, i, i, i, i, i, i, i };

	1034 }

	1035

	1036 static __inline __m256i __attribute__((__always_inline__, __nodebug__))

	1037 _mm256_set1_epi16(short w)

	1038 {

	1039 return (__m256i)(__v16hi){ w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w };

	1040 }

	1041

	1042 static __inline __m256i __attribute__((__always_inline__, __nodebug__))

	1043 _mm256_set1_epi8(char b)

	1044 {

	1045 return (__m256i)(__v32qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b,

	1046 b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };

	1047 }

	1048

	1049 static __inline __m256i __attribute__((__always_inline__, __nodebug__))

	1050 _mm256_set1_epi64x(long long q)

	1051 {

	1052 return (__m256i)(__v4di){ q, q, q, q };

	1053 }

	1054

	1055 /* Create zeroed vectors */

	1056 static __inline __m256d __attribute__((__always_inline__, __nodebug__))

	1057 _mm256_setzero_pd(void)

	1058 {

	1059 return (__m256d){ 0, 0, 0, 0 };

	1060 }

	1061

	1062 static __inline __m256 __attribute__((__always_inline__, __nodebug__))

	1063 _mm256_setzero_ps(void)

	1064 {

	1065 return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };

	1066 }

	1067

	1068 static __inline __m256i __attribute__((__always_inline__, __nodebug__))

	1069 _mm256_setzero_si256(void)

	1070 {

	1071 return (__m256i){ 0LL, 0LL, 0LL, 0LL };

	1072 }

	1073

	1074 /* Cast between vector types */

	1075 static __inline __m256 __attribute__((__always_inline__, __nodebug__))

	1076 _mm256_castpd_ps(__m256d in)

	1077 {

	1078 return (__m256)in;

	1079 }

	1080

	1081 static __inline __m256i __attribute__((__always_inline__, __nodebug__))

	1082 _mm256_castpd_si256(__m256d in)

	1083 {

	1084 return (__m256i)in;

	1085 }

	1086

	1087 static __inline __m256d __attribute__((__always_inline__, __nodebug__))

	1088 _mm256_castps_pd(__m256 in)

	1089 {

	1090 return (__m256d)in;

	1091 }

	1092

	1093 static __inline __m256i __attribute__((__always_inline__, __nodebug__))

	1094 _mm256_castps_si256(__m256 in)

	1095 {

	1096 return (__m256i)in;

	1097 }

	1098

	1099 static __inline __m256 __attribute__((__always_inline__, __nodebug__))

	1100 _mm256_castsi256_ps(__m256i in)

	1101 {

	1102 return (__m256)in;

	1103 }

	1104

	1105 static __inline __m256d __attribute__((__always_inline__, __nodebug__))

	1106 _mm256_castsi256_pd(__m256i in)

	1107 {

	1108 return (__m256d)in;

	1109 }

	1110

	1111 static __inline __m128d __attribute__((__always_inline__, __nodebug__))

	1112 _mm256_castpd256_pd128(__m256d in)

	1113 {

	1114 return __builtin_shufflevector(in, in, 0, 1);

	1115 }

	1116

	1117 static __inline __m128 __attribute__((__always_inline__, __nodebug__))

	1118 _mm256_castps256_ps128(__m256 in)

	1119 {

	1120 return __builtin_shufflevector(in, in, 0, 1, 2, 3);

	1121 }

	1122

	1123 static __inline __m128i __attribute__((__always_inline__, __nodebug__))

	1124 _mm256_castsi256_si128(__m256i in)

	1125 {

	1126 return __builtin_shufflevector(in, in, 0, 1);

	1127 }

	1128

	1129 static __inline __m256d __attribute__((__always_inline__, __nodebug__))

	1130 _mm256_castpd128_pd256(__m128d in)

	1131 {

	1132 __m128d zero = _mm_setzero_pd();

	1133 return __builtin_shufflevector(in, zero, 0, 1, 2, 2);

	1134 }

	1135

	1136 static __inline __m256 __attribute__((__always_inline__, __nodebug__))

	1137 _mm256_castps128_ps256(__m128 in)

	1138 {

	1139 __m128 zero = _mm_setzero_ps();

	1140 return __builtin_shufflevector(in, zero, 0, 1, 2, 3, 4, 4, 4, 4);

	1141 }

	1142

	1143 static __inline __m256i __attribute__((__always_inline__, __nodebug__))

	1144 _mm256_castsi128_si256(__m128i in)

	1145 {

	1146 __m128i zero = _mm_setzero_si128();

	1147 return __builtin_shufflevector(in, zero, 0, 1, 2, 2);

	1148 }

	1149

	1150 /* SIMD load ops (unaligned) */

	1151 static __inline __m256 __attribute__((__always_inline__, __nodebug__))

	1152 _mm256_loadu2_m128(float const addr_hi, float const addr_lo)

	1153 {

	1154 struct __loadu_ps {

	1155 __m128 v;

	1156 } __attribute__((__packed__, __may_alias__));

	1157

	1158 __m256 v256 = _mm256_castps128_ps256(((struct __loadu_ps*)addr_lo)->v);

	1159 return _mm256_insertf128_ps(v256, ((struct __loadu_ps*)addr_hi)->v, 1);

	1160 }

	1161

	1162 static __inline __m256d __attribute__((__always_inline__, __nodebug__))

	1163 _mm256_loadu2_m128d(double const addr_hi, double const addr_lo)

	1164 {

	1165 struct __loadu_pd {

	1166 __m128d v;

	1167 } __attribute__((__packed__, __may_alias__));

	1168

	1169 __m256d v256 = _mm256_castpd128_pd256(((struct __loadu_pd*)addr_lo)->v);

	1170 return _mm256_insertf128_pd(v256, ((struct __loadu_pd*)addr_hi)->v, 1);

	1171 }

	1172

	1173 static __inline __m256i __attribute__((__always_inline__, __nodebug__))

	1174 _mm256_loadu2_m128i(__m128i const addr_hi, __m128i const addr_lo)

	1175 {

	1176 struct __loadu_si128 {

	1177 __m128i v;

	1178 } __attribute__((packed, may_alias));

	1179 __m256i v256 = _mm256_castsi128_si256(((struct __loadu_si128*)addr_lo)->v);

	1180 return _mm256_insertf128_si256(v256, ((struct __loadu_si128*)addr_hi)->v, 1);

	1181 }

	1182

	1183 /* SIMD store ops (unaligned) */

	1184 static __inline void __attribute__((__always_inline__, __nodebug__))

	1185 _mm256_storeu2_m128(float addr_hi, float addr_lo, __m256 a)

	1186 {

	1187 __m128 v128;

	1188

	1189 v128 = _mm256_castps256_ps128(a);

	1190 __builtin_ia32_storeups(addr_lo, v128);

	1191 v128 = _mm256_extractf128_ps(a, 1);

	1192 __builtin_ia32_storeups(addr_hi, v128);

	1193 }

	1194

	1195 static __inline void __attribute__((__always_inline__, __nodebug__))

	1196 _mm256_storeu2_m128d(double addr_hi, double addr_lo, __m256d a)

	1197 {

	1198 __m128d v128;

	1199

	1200 v128 = _mm256_castpd256_pd128(a);

	1201 __builtin_ia32_storeupd(addr_lo, v128);

	1202 v128 = _mm256_extractf128_pd(a, 1);

	1203 __builtin_ia32_storeupd(addr_hi, v128);

	1204 }

	1205

	1206 static __inline void __attribute__((__always_inline__, __nodebug__))

	1207 _mm256_storeu2_m128i(__m128i addr_hi, __m128i addr_lo, __m256i a)

	1208 {

	1209 __m128i v128;

	1210

	1211 v128 = _mm256_castsi256_si128(a);

	1212 __builtin_ia32_storedqu((char *)addr_lo, (__v16qi)v128);

	1213 v128 = _mm256_extractf128_si256(a, 1);

	1214 __builtin_ia32_storedqu((char *)addr_hi, (__v16qi)v128);

	1215 }

OLD	NEW

« no previous file with comments | « Linux_ia32/lib/clang/3.2/include/avx2intrin.h ('k') | Linux_ia32/lib/clang/3.2/include/bmi2intrin.h » ('j') | no next file with comments »