| OLD | NEW |
| 1 /*===---- avxintrin.h - AVX intrinsics -------------------------------------=== | 1 /*===---- avxintrin.h - AVX intrinsics -------------------------------------=== |
| 2 * | 2 * |
| 3 * Permission is hereby granted, free of charge, to any person obtaining a copy | 3 * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 4 * of this software and associated documentation files (the "Software"), to deal | 4 * of this software and associated documentation files (the "Software"), to deal |
| 5 * in the Software without restriction, including without limitation the rights | 5 * in the Software without restriction, including without limitation the rights |
| 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 7 * copies of the Software, and to permit persons to whom the Software is | 7 * copies of the Software, and to permit persons to whom the Software is |
| 8 * furnished to do so, subject to the following conditions: | 8 * furnished to do so, subject to the following conditions: |
| 9 * | 9 * |
| 10 * The above copyright notice and this permission notice shall be included in | 10 * The above copyright notice and this permission notice shall be included in |
| (...skipping 127 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 138 { | 138 { |
| 139 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)a); | 139 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)a); |
| 140 } | 140 } |
| 141 | 141 |
| 142 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) | 142 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) |
| 143 _mm256_rcp_ps(__m256 a) | 143 _mm256_rcp_ps(__m256 a) |
| 144 { | 144 { |
| 145 return (__m256)__builtin_ia32_rcpps256((__v8sf)a); | 145 return (__m256)__builtin_ia32_rcpps256((__v8sf)a); |
| 146 } | 146 } |
| 147 | 147 |
| 148 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) | 148 #define _mm256_round_pd(V, M) __extension__ ({ \ |
| 149 _mm256_round_pd(__m256d v, const int m) | 149 __m256d __V = (V); \ |
| 150 { | 150 (__m256d)__builtin_ia32_roundpd256((__v4df)__V, (M)); }) |
| 151 return (__m256d)__builtin_ia32_roundpd256((__v4df)v, m); | |
| 152 } | |
| 153 | 151 |
| 154 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) | 152 #define _mm256_round_ps(V, M) __extension__ ({ \ |
| 155 _mm256_round_ps(__m256 v, const int m) | 153 __m256 __V = (V); \ |
| 156 { | 154 (__m256)__builtin_ia32_roundps256((__v8sf)__V, (M)); }) |
| 157 return (__m256)__builtin_ia32_roundps256((__v8sf)v, m); | |
| 158 } | |
| 159 | 155 |
| 160 #define _mm256_ceil_pd(V) _mm256_round_pd((V), _MM_FROUND_CEIL) | 156 #define _mm256_ceil_pd(V) _mm256_round_pd((V), _MM_FROUND_CEIL) |
| 161 #define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR) | 157 #define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR) |
| 162 #define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL) | 158 #define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL) |
| 163 #define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR) | 159 #define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR) |
| 164 | 160 |
| 165 /* Logical */ | 161 /* Logical */ |
| 166 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) | 162 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) |
| 167 _mm256_and_pd(__m256d a, __m256d b) | 163 _mm256_and_pd(__m256d a, __m256d b) |
| 168 { | 164 { |
| (...skipping 86 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 255 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)a, (__v4si)c); | 251 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)a, (__v4si)c); |
| 256 } | 252 } |
| 257 | 253 |
| 258 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) | 254 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) |
| 259 _mm256_permutevar_ps(__m256 a, __m256i c) | 255 _mm256_permutevar_ps(__m256 a, __m256i c) |
| 260 { | 256 { |
| 261 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)a, | 257 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)a, |
| 262 (__v8si)c); | 258 (__v8si)c); |
| 263 } | 259 } |
| 264 | 260 |
| 265 static __inline __m128d __attribute__((__always_inline__, __nodebug__)) | 261 #define _mm_permute_pd(A, C) __extension__ ({ \ |
| 266 _mm_permute_pd(__m128d a, const int c) | 262 __m128d __A = (A); \ |
| 267 { | 263 (__m128d)__builtin_ia32_vpermilpd((__v2df)__A, (C)); }) |
| 268 return (__m128d)__builtin_ia32_vpermilpd((__v2df)a, c); | |
| 269 } | |
| 270 | 264 |
| 271 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) | 265 #define _mm256_permute_pd(A, C) __extension__ ({ \ |
| 272 _mm256_permute_pd(__m256d a, const int c) | 266 __m256d __A = (A); \ |
| 273 { | 267 (__m256d)__builtin_ia32_vpermilpd256((__v4df)__A, (C)); }) |
| 274 return (__m256d)__builtin_ia32_vpermilpd256((__v4df)a, c); | |
| 275 } | |
| 276 | 268 |
| 277 static __inline __m128 __attribute__((__always_inline__, __nodebug__)) | 269 #define _mm_permute_ps(A, C) __extension__ ({ \ |
| 278 _mm_permute_ps(__m128 a, const int c) | 270 __m128 __A = (A); \ |
| 279 { | 271 (__m128)__builtin_ia32_vpermilps((__v4sf)__A, (C)); }) |
| 280 return (__m128)__builtin_ia32_vpermilps((__v4sf)a, c); | |
| 281 } | |
| 282 | 272 |
| 283 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) | 273 #define _mm256_permute_ps(A, C) __extension__ ({ \ |
| 284 _mm256_permute_ps(__m256 a, const int c) | 274 __m256 __A = (A); \ |
| 285 { | 275 (__m256)__builtin_ia32_vpermilps256((__v8sf)__A, (C)); }) |
| 286 return (__m256)__builtin_ia32_vpermilps256((__v8sf)a, c); | |
| 287 } | |
| 288 | 276 |
| 289 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) | 277 #define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \ |
| 290 _mm256_permute2f128_pd(__m256d a, __m256d b, const int c) | 278 __m256d __V1 = (V1); \ |
| 291 { | 279 __m256d __V2 = (V2); \ |
| 292 return (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)a, (__v4df)b, c); | 280 (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)__V1, (__v4df)__V2, (M)); }) |
| 293 } | |
| 294 | 281 |
| 295 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) | 282 #define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \ |
| 296 _mm256_permute2f128_ps(__m256 a, __m256 b, const int c) | 283 __m256 __V1 = (V1); \ |
| 297 { | 284 __m256 __V2 = (V2); \ |
| 298 return (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)a, (__v8sf)b, c); | 285 (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)__V1, (__v8sf)__V2, (M)); }) |
| 299 } | |
| 300 | 286 |
| 301 static __inline __m256i __attribute__((__always_inline__, __nodebug__)) | 287 #define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \ |
| 302 _mm256_permute2f128_si256(__m256i a, __m256i b, const int c) | 288 __m256i __V1 = (V1); \ |
| 303 { | 289 __m256i __V2 = (V2); \ |
| 304 return (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)a, (__v8si)b, c); | 290 (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)__V1, (__v8si)__V2, (M)); }) |
| 305 } | |
| 306 | 291 |
| 307 /* Vector Blend */ | 292 /* Vector Blend */ |
| 308 #define _mm256_blend_pd(V1, V2, M) __extension__ ({ \ | 293 #define _mm256_blend_pd(V1, V2, M) __extension__ ({ \ |
| 309 __m256d __V1 = (V1); \ | 294 __m256d __V1 = (V1); \ |
| 310 __m256d __V2 = (V2); \ | 295 __m256d __V2 = (V2); \ |
| 311 (__m256d)__builtin_ia32_blendpd256((__v4df)__V1, (__v4df)__V2, M); }) | 296 (__m256d)__builtin_ia32_blendpd256((__v4df)__V1, (__v4df)__V2, (M)); }) |
| 312 | 297 |
| 313 #define _mm256_blend_ps(V1, V2, M) __extension__ ({ \ | 298 #define _mm256_blend_ps(V1, V2, M) __extension__ ({ \ |
| 314 __m256 __V1 = (V1); \ | 299 __m256 __V1 = (V1); \ |
| 315 __m256 __V2 = (V2); \ | 300 __m256 __V2 = (V2); \ |
| 316 (__m256)__builtin_ia32_blendps256((__v8sf)__V1, (__v8sf)__V2, M); }) | 301 (__m256)__builtin_ia32_blendps256((__v8sf)__V1, (__v8sf)__V2, (M)); }) |
| 317 | 302 |
| 318 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) | 303 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) |
| 319 _mm256_blendv_pd(__m256d a, __m256d b, __m256d c) | 304 _mm256_blendv_pd(__m256d a, __m256d b, __m256d c) |
| 320 { | 305 { |
| 321 return (__m256d)__builtin_ia32_blendvpd256((__v4df)a, (__v4df)b, (__v4df)c); | 306 return (__m256d)__builtin_ia32_blendvpd256((__v4df)a, (__v4df)b, (__v4df)c); |
| 322 } | 307 } |
| 323 | 308 |
| 324 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) | 309 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) |
| 325 _mm256_blendv_ps(__m256 a, __m256 b, __m256 c) | 310 _mm256_blendv_ps(__m256 a, __m256 b, __m256 c) |
| 326 { | 311 { |
| 327 return (__m256)__builtin_ia32_blendvps256((__v8sf)a, (__v8sf)b, (__v8sf)c); | 312 return (__m256)__builtin_ia32_blendvps256((__v8sf)a, (__v8sf)b, (__v8sf)c); |
| 328 } | 313 } |
| 329 | 314 |
| 330 /* Vector Dot Product */ | 315 /* Vector Dot Product */ |
| 331 #define _mm256_dp_ps(V1, V2, M) __extension__ ({ \ | 316 #define _mm256_dp_ps(V1, V2, M) __extension__ ({ \ |
| 332 __m256 __V1 = (V1); \ | 317 __m256 __V1 = (V1); \ |
| 333 __m256 __V2 = (V2); \ | 318 __m256 __V2 = (V2); \ |
| 334 (__m256)__builtin_ia32_dpps256((__v8sf)__V1, (__v8sf)__V2, M); }) | 319 (__m256)__builtin_ia32_dpps256((__v8sf)__V1, (__v8sf)__V2, (M)); }) |
| 335 | 320 |
| 336 /* Vector shuffle */ | 321 /* Vector shuffle */ |
| 337 #define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \ | 322 #define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \ |
| 338 __m256 __a = (a); \ | 323 __m256 __a = (a); \ |
| 339 __m256 __b = (b); \ | 324 __m256 __b = (b); \ |
| 340 (__m256)__builtin_shufflevector((__v8sf)__a, (__v8sf)__b, \ | 325 (__m256)__builtin_shufflevector((__v8sf)__a, (__v8sf)__b, \ |
| 341 (mask) & 0x3, ((mask) & 0xc) >> 2, \ | 326 (mask) & 0x3, ((mask) & 0xc) >> 2, \ |
| 342 (((mask) & 0x30) >> 4) + 8, (((mask) & 0xc0) >> 6) + 8, \ | 327 (((mask) & 0x30) >> 4) + 8, (((mask) & 0xc0) >> 6) + 8, \ |
| 343 ((mask) & 0x3) + 4, (((mask) & 0xc) >> 2) + 4, \ | 328 ((mask) & 0x3) + 4, (((mask) & 0xc) >> 2) + 4, \ |
| 344 (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12); }) | 329 (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12); }) |
| (...skipping 65 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 410 __m128d __a = (a); \ | 395 __m128d __a = (a); \ |
| 411 __m128d __b = (b); \ | 396 __m128d __b = (b); \ |
| 412 (__m128d)__builtin_ia32_cmpsd((__v2df)__a, (__v2df)__b, (c)); }) | 397 (__m128d)__builtin_ia32_cmpsd((__v2df)__a, (__v2df)__b, (c)); }) |
| 413 | 398 |
| 414 #define _mm_cmp_ss(a, b, c) __extension__ ({ \ | 399 #define _mm_cmp_ss(a, b, c) __extension__ ({ \ |
| 415 __m128 __a = (a); \ | 400 __m128 __a = (a); \ |
| 416 __m128 __b = (b); \ | 401 __m128 __b = (b); \ |
| 417 (__m128)__builtin_ia32_cmpss((__v4sf)__a, (__v4sf)__b, (c)); }) | 402 (__m128)__builtin_ia32_cmpss((__v4sf)__a, (__v4sf)__b, (c)); }) |
| 418 | 403 |
| 419 /* Vector extract */ | 404 /* Vector extract */ |
| 420 static __inline __m128d __attribute__((__always_inline__, __nodebug__)) | 405 #define _mm256_extractf128_pd(A, O) __extension__ ({ \ |
| 421 _mm256_extractf128_pd(__m256d a, const int o) | 406 __m256d __A = (A); \ |
| 422 { | 407 (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)__A, (O)); }) |
| 423 return (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)a, o); | |
| 424 } | |
| 425 | 408 |
| 426 static __inline __m128 __attribute__((__always_inline__, __nodebug__)) | 409 #define _mm256_extractf128_ps(A, O) __extension__ ({ \ |
| 427 _mm256_extractf128_ps(__m256 a, const int o) | 410 __m256 __A = (A); \ |
| 428 { | 411 (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)__A, (O)); }) |
| 429 return (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)a, o); | |
| 430 } | |
| 431 | 412 |
| 432 static __inline __m128i __attribute__((__always_inline__, __nodebug__)) | 413 #define _mm256_extractf128_si256(A, O) __extension__ ({ \ |
| 433 _mm256_extractf128_si256(__m256i a, const int o) | 414 __m256i __A = (A); \ |
| 434 { | 415 (__m128i)__builtin_ia32_vextractf128_si256((__v8si)__A, (O)); }) |
| 435 return (__m128i)__builtin_ia32_vextractf128_si256((__v8si)a, o); | |
| 436 } | |
| 437 | 416 |
| 438 static __inline int __attribute__((__always_inline__, __nodebug__)) | 417 static __inline int __attribute__((__always_inline__, __nodebug__)) |
| 439 _mm256_extract_epi32(__m256i a, int const imm) | 418 _mm256_extract_epi32(__m256i a, int const imm) |
| 440 { | 419 { |
| 441 __v8si b = (__v8si)a; | 420 __v8si b = (__v8si)a; |
| 442 return b[imm]; | 421 return b[imm]; |
| 443 } | 422 } |
| 444 | 423 |
| 445 static __inline int __attribute__((__always_inline__, __nodebug__)) | 424 static __inline int __attribute__((__always_inline__, __nodebug__)) |
| 446 _mm256_extract_epi16(__m256i a, int const imm) | 425 _mm256_extract_epi16(__m256i a, int const imm) |
| (...skipping 12 matching lines...) Expand all Loading... |
| 459 #ifdef __x86_64__ | 438 #ifdef __x86_64__ |
| 460 static __inline long long __attribute__((__always_inline__, __nodebug__)) | 439 static __inline long long __attribute__((__always_inline__, __nodebug__)) |
| 461 _mm256_extract_epi64(__m256i a, const int imm) | 440 _mm256_extract_epi64(__m256i a, const int imm) |
| 462 { | 441 { |
| 463 __v4di b = (__v4di)a; | 442 __v4di b = (__v4di)a; |
| 464 return b[imm]; | 443 return b[imm]; |
| 465 } | 444 } |
| 466 #endif | 445 #endif |
| 467 | 446 |
| 468 /* Vector insert */ | 447 /* Vector insert */ |
| 469 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) | 448 #define _mm256_insertf128_pd(V1, V2, O) __extension__ ({ \ |
| 470 _mm256_insertf128_pd(__m256d a, __m128d b, const int o) | 449 __m256d __V1 = (V1); \ |
| 471 { | 450 __m128d __V2 = (V2); \ |
| 472 return (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)a, (__v2df)b, o); | 451 (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)__V1, (__v2df)__V2, (O)); }) |
| 473 } | |
| 474 | 452 |
| 475 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) | 453 #define _mm256_insertf128_ps(V1, V2, O) __extension__ ({ \ |
| 476 _mm256_insertf128_ps(__m256 a, __m128 b, const int o) | 454 __m256 __V1 = (V1); \ |
| 477 { | 455 __m128 __V2 = (V2); \ |
| 478 return (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)a, (__v4sf)b, o); | 456 (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)__V1, (__v4sf)__V2, (O)); }) |
| 479 } | |
| 480 | 457 |
| 481 static __inline __m256i __attribute__((__always_inline__, __nodebug__)) | 458 #define _mm256_insertf128_si256(V1, V2, O) __extension__ ({ \ |
| 482 _mm256_insertf128_si256(__m256i a, __m128i b, const int o) | 459 __m256i __V1 = (V1); \ |
| 483 { | 460 __m128i __V2 = (V2); \ |
| 484 return (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)a, (__v4si)b, o); | 461 (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)__V1, (__v4si)__V2, (O)); }) |
| 485 } | |
| 486 | 462 |
| 487 static __inline __m256i __attribute__((__always_inline__, __nodebug__)) | 463 static __inline __m256i __attribute__((__always_inline__, __nodebug__)) |
| 488 _mm256_insert_epi32(__m256i a, int b, int const imm) | 464 _mm256_insert_epi32(__m256i a, int b, int const imm) |
| 489 { | 465 { |
| 490 __v8si c = (__v8si)a; | 466 __v8si c = (__v8si)a; |
| 491 c[imm & 7] = b; | 467 c[imm & 7] = b; |
| 492 return (__m256i)c; | 468 return (__m256i)c; |
| 493 } | 469 } |
| 494 | 470 |
| 495 static __inline __m256i __attribute__((__always_inline__, __nodebug__)) | 471 static __inline __m256i __attribute__((__always_inline__, __nodebug__)) |
| (...skipping 646 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1142 __m128 zero = _mm_setzero_ps(); | 1118 __m128 zero = _mm_setzero_ps(); |
| 1143 return __builtin_shufflevector(in, zero, 0, 1, 2, 3, 4, 4, 4, 4); | 1119 return __builtin_shufflevector(in, zero, 0, 1, 2, 3, 4, 4, 4, 4); |
| 1144 } | 1120 } |
| 1145 | 1121 |
| 1146 static __inline __m256i __attribute__((__always_inline__, __nodebug__)) | 1122 static __inline __m256i __attribute__((__always_inline__, __nodebug__)) |
| 1147 _mm256_castsi128_si256(__m128i in) | 1123 _mm256_castsi128_si256(__m128i in) |
| 1148 { | 1124 { |
| 1149 __m128i zero = _mm_setzero_si128(); | 1125 __m128i zero = _mm_setzero_si128(); |
| 1150 return __builtin_shufflevector(in, zero, 0, 1, 2, 2); | 1126 return __builtin_shufflevector(in, zero, 0, 1, 2, 2); |
| 1151 } | 1127 } |
| OLD | NEW |