OLD | NEW |
| (Empty) |
1 /*===---- emmintrin.h - SSE2 intrinsics ------------------------------------=== | |
2 * | |
3 * Permission is hereby granted, free of charge, to any person obtaining a copy | |
4 * of this software and associated documentation files (the "Software"), to deal | |
5 * in the Software without restriction, including without limitation the rights | |
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
7 * copies of the Software, and to permit persons to whom the Software is | |
8 * furnished to do so, subject to the following conditions: | |
9 * | |
10 * The above copyright notice and this permission notice shall be included in | |
11 * all copies or substantial portions of the Software. | |
12 * | |
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
19 * THE SOFTWARE. | |
20 * | |
21 *===-----------------------------------------------------------------------=== | |
22 */ | |
23 | |
24 #ifndef __EMMINTRIN_H | |
25 #define __EMMINTRIN_H | |
26 | |
27 #ifndef __SSE2__ | |
28 #error "SSE2 instruction set not enabled" | |
29 #else | |
30 | |
31 #include <xmmintrin.h> | |
32 | |
/* Public SSE2 vector types: 16-byte vectors of double / long long lanes. */
typedef double    __m128d __attribute__((__vector_size__(16)));
typedef long long __m128i __attribute__((__vector_size__(16)));

/* Internal element-typed views of a 16-byte vector, used for casts. */
typedef double    __v2df  __attribute__((__vector_size__(16)));
typedef long long __v2di  __attribute__((__vector_size__(16)));
typedef short     __v8hi  __attribute__((__vector_size__(16)));
typedef char      __v16qi __attribute__((__vector_size__(16)));
41 | |
42 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
43 _mm_add_sd(__m128d a, __m128d b) | |
44 { | |
45 a[0] += b[0]; | |
46 return a; | |
47 } | |
48 | |
49 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
50 _mm_add_pd(__m128d a, __m128d b) | |
51 { | |
52 return a + b; | |
53 } | |
54 | |
55 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
56 _mm_sub_sd(__m128d a, __m128d b) | |
57 { | |
58 a[0] -= b[0]; | |
59 return a; | |
60 } | |
61 | |
62 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
63 _mm_sub_pd(__m128d a, __m128d b) | |
64 { | |
65 return a - b; | |
66 } | |
67 | |
68 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
69 _mm_mul_sd(__m128d a, __m128d b) | |
70 { | |
71 a[0] *= b[0]; | |
72 return a; | |
73 } | |
74 | |
75 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
76 _mm_mul_pd(__m128d a, __m128d b) | |
77 { | |
78 return a * b; | |
79 } | |
80 | |
81 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
82 _mm_div_sd(__m128d a, __m128d b) | |
83 { | |
84 a[0] /= b[0]; | |
85 return a; | |
86 } | |
87 | |
88 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
89 _mm_div_pd(__m128d a, __m128d b) | |
90 { | |
91 return a / b; | |
92 } | |
93 | |
94 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
95 _mm_sqrt_sd(__m128d a, __m128d b) | |
96 { | |
97 __m128d c = __builtin_ia32_sqrtsd(b); | |
98 return (__m128d) { c[0], a[1] }; | |
99 } | |
100 | |
101 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
102 _mm_sqrt_pd(__m128d a) | |
103 { | |
104 return __builtin_ia32_sqrtpd(a); | |
105 } | |
106 | |
107 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
108 _mm_min_sd(__m128d a, __m128d b) | |
109 { | |
110 return __builtin_ia32_minsd(a, b); | |
111 } | |
112 | |
113 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
114 _mm_min_pd(__m128d a, __m128d b) | |
115 { | |
116 return __builtin_ia32_minpd(a, b); | |
117 } | |
118 | |
119 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
120 _mm_max_sd(__m128d a, __m128d b) | |
121 { | |
122 return __builtin_ia32_maxsd(a, b); | |
123 } | |
124 | |
125 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
126 _mm_max_pd(__m128d a, __m128d b) | |
127 { | |
128 return __builtin_ia32_maxpd(a, b); | |
129 } | |
130 | |
131 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
132 _mm_and_pd(__m128d a, __m128d b) | |
133 { | |
134 return (__m128d)((__v4si)a & (__v4si)b); | |
135 } | |
136 | |
137 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
138 _mm_andnot_pd(__m128d a, __m128d b) | |
139 { | |
140 return (__m128d)(~(__v4si)a & (__v4si)b); | |
141 } | |
142 | |
143 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
144 _mm_or_pd(__m128d a, __m128d b) | |
145 { | |
146 return (__m128d)((__v4si)a | (__v4si)b); | |
147 } | |
148 | |
149 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
150 _mm_xor_pd(__m128d a, __m128d b) | |
151 { | |
152 return (__m128d)((__v4si)a ^ (__v4si)b); | |
153 } | |
154 | |
155 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
156 _mm_cmpeq_pd(__m128d a, __m128d b) | |
157 { | |
158 return (__m128d)__builtin_ia32_cmppd(a, b, 0); | |
159 } | |
160 | |
161 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
162 _mm_cmplt_pd(__m128d a, __m128d b) | |
163 { | |
164 return (__m128d)__builtin_ia32_cmppd(a, b, 1); | |
165 } | |
166 | |
167 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
168 _mm_cmple_pd(__m128d a, __m128d b) | |
169 { | |
170 return (__m128d)__builtin_ia32_cmppd(a, b, 2); | |
171 } | |
172 | |
173 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
174 _mm_cmpgt_pd(__m128d a, __m128d b) | |
175 { | |
176 return (__m128d)__builtin_ia32_cmppd(b, a, 1); | |
177 } | |
178 | |
179 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
180 _mm_cmpge_pd(__m128d a, __m128d b) | |
181 { | |
182 return (__m128d)__builtin_ia32_cmppd(b, a, 2); | |
183 } | |
184 | |
185 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
186 _mm_cmpord_pd(__m128d a, __m128d b) | |
187 { | |
188 return (__m128d)__builtin_ia32_cmppd(a, b, 7); | |
189 } | |
190 | |
191 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
192 _mm_cmpunord_pd(__m128d a, __m128d b) | |
193 { | |
194 return (__m128d)__builtin_ia32_cmppd(a, b, 3); | |
195 } | |
196 | |
197 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
198 _mm_cmpneq_pd(__m128d a, __m128d b) | |
199 { | |
200 return (__m128d)__builtin_ia32_cmppd(a, b, 4); | |
201 } | |
202 | |
203 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
204 _mm_cmpnlt_pd(__m128d a, __m128d b) | |
205 { | |
206 return (__m128d)__builtin_ia32_cmppd(a, b, 5); | |
207 } | |
208 | |
209 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
210 _mm_cmpnle_pd(__m128d a, __m128d b) | |
211 { | |
212 return (__m128d)__builtin_ia32_cmppd(a, b, 6); | |
213 } | |
214 | |
215 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
216 _mm_cmpngt_pd(__m128d a, __m128d b) | |
217 { | |
218 return (__m128d)__builtin_ia32_cmppd(b, a, 5); | |
219 } | |
220 | |
221 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
222 _mm_cmpnge_pd(__m128d a, __m128d b) | |
223 { | |
224 return (__m128d)__builtin_ia32_cmppd(b, a, 6); | |
225 } | |
226 | |
227 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
228 _mm_cmpeq_sd(__m128d a, __m128d b) | |
229 { | |
230 return (__m128d)__builtin_ia32_cmpsd(a, b, 0); | |
231 } | |
232 | |
233 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
234 _mm_cmplt_sd(__m128d a, __m128d b) | |
235 { | |
236 return (__m128d)__builtin_ia32_cmpsd(a, b, 1); | |
237 } | |
238 | |
239 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
240 _mm_cmple_sd(__m128d a, __m128d b) | |
241 { | |
242 return (__m128d)__builtin_ia32_cmpsd(a, b, 2); | |
243 } | |
244 | |
245 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
246 _mm_cmpgt_sd(__m128d a, __m128d b) | |
247 { | |
248 return (__m128d)__builtin_ia32_cmpsd(b, a, 1); | |
249 } | |
250 | |
251 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
252 _mm_cmpge_sd(__m128d a, __m128d b) | |
253 { | |
254 return (__m128d)__builtin_ia32_cmpsd(b, a, 2); | |
255 } | |
256 | |
257 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
258 _mm_cmpord_sd(__m128d a, __m128d b) | |
259 { | |
260 return (__m128d)__builtin_ia32_cmpsd(a, b, 7); | |
261 } | |
262 | |
263 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
264 _mm_cmpunord_sd(__m128d a, __m128d b) | |
265 { | |
266 return (__m128d)__builtin_ia32_cmpsd(a, b, 3); | |
267 } | |
268 | |
269 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
270 _mm_cmpneq_sd(__m128d a, __m128d b) | |
271 { | |
272 return (__m128d)__builtin_ia32_cmpsd(a, b, 4); | |
273 } | |
274 | |
275 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
276 _mm_cmpnlt_sd(__m128d a, __m128d b) | |
277 { | |
278 return (__m128d)__builtin_ia32_cmpsd(a, b, 5); | |
279 } | |
280 | |
281 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
282 _mm_cmpnle_sd(__m128d a, __m128d b) | |
283 { | |
284 return (__m128d)__builtin_ia32_cmpsd(a, b, 6); | |
285 } | |
286 | |
287 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
288 _mm_cmpngt_sd(__m128d a, __m128d b) | |
289 { | |
290 return (__m128d)__builtin_ia32_cmpsd(b, a, 5); | |
291 } | |
292 | |
293 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
294 _mm_cmpnge_sd(__m128d a, __m128d b) | |
295 { | |
296 return (__m128d)__builtin_ia32_cmpsd(b, a, 6); | |
297 } | |
298 | |
299 static __inline__ int __attribute__((__always_inline__, __nodebug__)) | |
300 _mm_comieq_sd(__m128d a, __m128d b) | |
301 { | |
302 return __builtin_ia32_comisdeq(a, b); | |
303 } | |
304 | |
305 static __inline__ int __attribute__((__always_inline__, __nodebug__)) | |
306 _mm_comilt_sd(__m128d a, __m128d b) | |
307 { | |
308 return __builtin_ia32_comisdlt(a, b); | |
309 } | |
310 | |
311 static __inline__ int __attribute__((__always_inline__, __nodebug__)) | |
312 _mm_comile_sd(__m128d a, __m128d b) | |
313 { | |
314 return __builtin_ia32_comisdle(a, b); | |
315 } | |
316 | |
317 static __inline__ int __attribute__((__always_inline__, __nodebug__)) | |
318 _mm_comigt_sd(__m128d a, __m128d b) | |
319 { | |
320 return __builtin_ia32_comisdgt(a, b); | |
321 } | |
322 | |
323 static __inline__ int __attribute__((__always_inline__, __nodebug__)) | |
324 _mm_comineq_sd(__m128d a, __m128d b) | |
325 { | |
326 return __builtin_ia32_comisdneq(a, b); | |
327 } | |
328 | |
329 static __inline__ int __attribute__((__always_inline__, __nodebug__)) | |
330 _mm_ucomieq_sd(__m128d a, __m128d b) | |
331 { | |
332 return __builtin_ia32_ucomisdeq(a, b); | |
333 } | |
334 | |
335 static __inline__ int __attribute__((__always_inline__, __nodebug__)) | |
336 _mm_ucomilt_sd(__m128d a, __m128d b) | |
337 { | |
338 return __builtin_ia32_ucomisdlt(a, b); | |
339 } | |
340 | |
341 static __inline__ int __attribute__((__always_inline__, __nodebug__)) | |
342 _mm_ucomile_sd(__m128d a, __m128d b) | |
343 { | |
344 return __builtin_ia32_ucomisdle(a, b); | |
345 } | |
346 | |
347 static __inline__ int __attribute__((__always_inline__, __nodebug__)) | |
348 _mm_ucomigt_sd(__m128d a, __m128d b) | |
349 { | |
350 return __builtin_ia32_ucomisdgt(a, b); | |
351 } | |
352 | |
353 static __inline__ int __attribute__((__always_inline__, __nodebug__)) | |
354 _mm_ucomige_sd(__m128d a, __m128d b) | |
355 { | |
356 return __builtin_ia32_ucomisdge(a, b); | |
357 } | |
358 | |
359 static __inline__ int __attribute__((__always_inline__, __nodebug__)) | |
360 _mm_ucomineq_sd(__m128d a, __m128d b) | |
361 { | |
362 return __builtin_ia32_ucomisdneq(a, b); | |
363 } | |
364 | |
365 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) | |
366 _mm_cvtpd_ps(__m128d a) | |
367 { | |
368 return __builtin_ia32_cvtpd2ps(a); | |
369 } | |
370 | |
371 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
372 _mm_cvtps_pd(__m128 a) | |
373 { | |
374 return __builtin_ia32_cvtps2pd(a); | |
375 } | |
376 | |
377 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
378 _mm_cvtepi32_pd(__m128i a) | |
379 { | |
380 return __builtin_ia32_cvtdq2pd((__v4si)a); | |
381 } | |
382 | |
383 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
384 _mm_cvtpd_epi32(__m128d a) | |
385 { | |
386 return __builtin_ia32_cvtpd2dq(a); | |
387 } | |
388 | |
389 static __inline__ int __attribute__((__always_inline__, __nodebug__)) | |
390 _mm_cvtsd_si32(__m128d a) | |
391 { | |
392 return __builtin_ia32_cvtsd2si(a); | |
393 } | |
394 | |
395 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) | |
396 _mm_cvtsd_ss(__m128 a, __m128d b) | |
397 { | |
398 a[0] = b[0]; | |
399 return a; | |
400 } | |
401 | |
402 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
403 _mm_cvtsi32_sd(__m128d a, int b) | |
404 { | |
405 a[0] = b; | |
406 return a; | |
407 } | |
408 | |
409 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
410 _mm_cvtss_sd(__m128d a, __m128 b) | |
411 { | |
412 a[0] = b[0]; | |
413 return a; | |
414 } | |
415 | |
416 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
417 _mm_cvttpd_epi32(__m128d a) | |
418 { | |
419 return (__m128i)__builtin_ia32_cvttpd2dq(a); | |
420 } | |
421 | |
422 static __inline__ int __attribute__((__always_inline__, __nodebug__)) | |
423 _mm_cvttsd_si32(__m128d a) | |
424 { | |
425 return a[0]; | |
426 } | |
427 | |
428 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) | |
429 _mm_cvtpd_pi32(__m128d a) | |
430 { | |
431 return (__m64)__builtin_ia32_cvtpd2pi(a); | |
432 } | |
433 | |
434 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) | |
435 _mm_cvttpd_pi32(__m128d a) | |
436 { | |
437 return (__m64)__builtin_ia32_cvttpd2pi(a); | |
438 } | |
439 | |
440 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
441 _mm_cvtpi32_pd(__m64 a) | |
442 { | |
443 return __builtin_ia32_cvtpi2pd((__v2si)a); | |
444 } | |
445 | |
446 static __inline__ double __attribute__((__always_inline__, __nodebug__)) | |
447 _mm_cvtsd_f64(__m128d a) | |
448 { | |
449 return a[0]; | |
450 } | |
451 | |
452 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
453 _mm_load_pd(double const *dp) | |
454 { | |
455 return *(__m128d*)dp; | |
456 } | |
457 | |
458 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
459 _mm_load1_pd(double const *dp) | |
460 { | |
461 struct __mm_load1_pd_struct { | |
462 double u; | |
463 } __attribute__((__packed__, __may_alias__)); | |
464 double u = ((struct __mm_load1_pd_struct*)dp)->u; | |
465 return (__m128d){ u, u }; | |
466 } | |
467 | |
/* Alternate spelling of _mm_load1_pd; expands to a call with the same argument. */
#define _mm_load_pd1(dp) _mm_load1_pd((dp))
469 | |
470 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
471 _mm_loadr_pd(double const *dp) | |
472 { | |
473 __m128d u = *(__m128d*)dp; | |
474 return __builtin_shufflevector(u, u, 1, 0); | |
475 } | |
476 | |
477 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
478 _mm_loadu_pd(double const *dp) | |
479 { | |
480 struct __loadu_pd { | |
481 __m128d v; | |
482 } __attribute__((packed, may_alias)); | |
483 return ((struct __loadu_pd*)dp)->v; | |
484 } | |
485 | |
486 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
487 _mm_load_sd(double const *dp) | |
488 { | |
489 struct __mm_load_sd_struct { | |
490 double u; | |
491 } __attribute__((__packed__, __may_alias__)); | |
492 double u = ((struct __mm_load_sd_struct*)dp)->u; | |
493 return (__m128d){ u, 0 }; | |
494 } | |
495 | |
496 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
497 _mm_loadh_pd(__m128d a, double const *dp) | |
498 { | |
499 struct __mm_loadh_pd_struct { | |
500 double u; | |
501 } __attribute__((__packed__, __may_alias__)); | |
502 double u = ((struct __mm_loadh_pd_struct*)dp)->u; | |
503 return (__m128d){ a[0], u }; | |
504 } | |
505 | |
506 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
507 _mm_loadl_pd(__m128d a, double const *dp) | |
508 { | |
509 struct __mm_loadl_pd_struct { | |
510 double u; | |
511 } __attribute__((__packed__, __may_alias__)); | |
512 double u = ((struct __mm_loadl_pd_struct*)dp)->u; | |
513 return (__m128d){ u, a[1] }; | |
514 } | |
515 | |
516 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
517 _mm_set_sd(double w) | |
518 { | |
519 return (__m128d){ w, 0 }; | |
520 } | |
521 | |
522 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
523 _mm_set1_pd(double w) | |
524 { | |
525 return (__m128d){ w, w }; | |
526 } | |
527 | |
528 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
529 _mm_set_pd(double w, double x) | |
530 { | |
531 return (__m128d){ x, w }; | |
532 } | |
533 | |
534 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
535 _mm_setr_pd(double w, double x) | |
536 { | |
537 return (__m128d){ w, x }; | |
538 } | |
539 | |
540 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
541 _mm_setzero_pd(void) | |
542 { | |
543 return (__m128d){ 0, 0 }; | |
544 } | |
545 | |
546 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
547 _mm_move_sd(__m128d a, __m128d b) | |
548 { | |
549 return (__m128d){ b[0], a[1] }; | |
550 } | |
551 | |
552 static __inline__ void __attribute__((__always_inline__, __nodebug__)) | |
553 _mm_store_sd(double *dp, __m128d a) | |
554 { | |
555 struct __mm_store_sd_struct { | |
556 double u; | |
557 } __attribute__((__packed__, __may_alias__)); | |
558 ((struct __mm_store_sd_struct*)dp)->u = a[0]; | |
559 } | |
560 | |
561 static __inline__ void __attribute__((__always_inline__, __nodebug__)) | |
562 _mm_store1_pd(double *dp, __m128d a) | |
563 { | |
564 struct __mm_store1_pd_struct { | |
565 double u[2]; | |
566 } __attribute__((__packed__, __may_alias__)); | |
567 ((struct __mm_store1_pd_struct*)dp)->u[0] = a[0]; | |
568 ((struct __mm_store1_pd_struct*)dp)->u[1] = a[0]; | |
569 } | |
570 | |
571 static __inline__ void __attribute__((__always_inline__, __nodebug__)) | |
572 _mm_store_pd(double *dp, __m128d a) | |
573 { | |
574 *(__m128d *)dp = a; | |
575 } | |
576 | |
577 static __inline__ void __attribute__((__always_inline__, __nodebug__)) | |
578 _mm_storeu_pd(double *dp, __m128d a) | |
579 { | |
580 __builtin_ia32_storeupd(dp, a); | |
581 } | |
582 | |
583 static __inline__ void __attribute__((__always_inline__, __nodebug__)) | |
584 _mm_storer_pd(double *dp, __m128d a) | |
585 { | |
586 a = __builtin_shufflevector(a, a, 1, 0); | |
587 *(__m128d *)dp = a; | |
588 } | |
589 | |
590 static __inline__ void __attribute__((__always_inline__, __nodebug__)) | |
591 _mm_storeh_pd(double *dp, __m128d a) | |
592 { | |
593 struct __mm_storeh_pd_struct { | |
594 double u; | |
595 } __attribute__((__packed__, __may_alias__)); | |
596 ((struct __mm_storeh_pd_struct*)dp)->u = a[1]; | |
597 } | |
598 | |
599 static __inline__ void __attribute__((__always_inline__, __nodebug__)) | |
600 _mm_storel_pd(double *dp, __m128d a) | |
601 { | |
602 struct __mm_storeh_pd_struct { | |
603 double u; | |
604 } __attribute__((__packed__, __may_alias__)); | |
605 ((struct __mm_storeh_pd_struct*)dp)->u = a[0]; | |
606 } | |
607 | |
608 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
609 _mm_add_epi8(__m128i a, __m128i b) | |
610 { | |
611 return (__m128i)((__v16qi)a + (__v16qi)b); | |
612 } | |
613 | |
614 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
615 _mm_add_epi16(__m128i a, __m128i b) | |
616 { | |
617 return (__m128i)((__v8hi)a + (__v8hi)b); | |
618 } | |
619 | |
620 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
621 _mm_add_epi32(__m128i a, __m128i b) | |
622 { | |
623 return (__m128i)((__v4si)a + (__v4si)b); | |
624 } | |
625 | |
626 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) | |
627 _mm_add_si64(__m64 a, __m64 b) | |
628 { | |
629 return a + b; | |
630 } | |
631 | |
632 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
633 _mm_add_epi64(__m128i a, __m128i b) | |
634 { | |
635 return a + b; | |
636 } | |
637 | |
638 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
639 _mm_adds_epi8(__m128i a, __m128i b) | |
640 { | |
641 return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b); | |
642 } | |
643 | |
644 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
645 _mm_adds_epi16(__m128i a, __m128i b) | |
646 { | |
647 return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b); | |
648 } | |
649 | |
650 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
651 _mm_adds_epu8(__m128i a, __m128i b) | |
652 { | |
653 return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b); | |
654 } | |
655 | |
656 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
657 _mm_adds_epu16(__m128i a, __m128i b) | |
658 { | |
659 return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b); | |
660 } | |
661 | |
662 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
663 _mm_avg_epu8(__m128i a, __m128i b) | |
664 { | |
665 return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b); | |
666 } | |
667 | |
668 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
669 _mm_avg_epu16(__m128i a, __m128i b) | |
670 { | |
671 return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b); | |
672 } | |
673 | |
674 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
675 _mm_madd_epi16(__m128i a, __m128i b) | |
676 { | |
677 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b); | |
678 } | |
679 | |
680 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
681 _mm_max_epi16(__m128i a, __m128i b) | |
682 { | |
683 return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b); | |
684 } | |
685 | |
686 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
687 _mm_max_epu8(__m128i a, __m128i b) | |
688 { | |
689 return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b); | |
690 } | |
691 | |
692 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
693 _mm_min_epi16(__m128i a, __m128i b) | |
694 { | |
695 return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b); | |
696 } | |
697 | |
698 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
699 _mm_min_epu8(__m128i a, __m128i b) | |
700 { | |
701 return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b); | |
702 } | |
703 | |
704 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
705 _mm_mulhi_epi16(__m128i a, __m128i b) | |
706 { | |
707 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b); | |
708 } | |
709 | |
710 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
711 _mm_mulhi_epu16(__m128i a, __m128i b) | |
712 { | |
713 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b); | |
714 } | |
715 | |
716 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
717 _mm_mullo_epi16(__m128i a, __m128i b) | |
718 { | |
719 return (__m128i)((__v8hi)a * (__v8hi)b); | |
720 } | |
721 | |
722 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) | |
723 _mm_mul_su32(__m64 a, __m64 b) | |
724 { | |
725 return __builtin_ia32_pmuludq((__v2si)a, (__v2si)b); | |
726 } | |
727 | |
728 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
729 _mm_mul_epu32(__m128i a, __m128i b) | |
730 { | |
731 return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b); | |
732 } | |
733 | |
734 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
735 _mm_sad_epu8(__m128i a, __m128i b) | |
736 { | |
737 return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b); | |
738 } | |
739 | |
740 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
741 _mm_sub_epi8(__m128i a, __m128i b) | |
742 { | |
743 return (__m128i)((__v16qi)a - (__v16qi)b); | |
744 } | |
745 | |
746 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
747 _mm_sub_epi16(__m128i a, __m128i b) | |
748 { | |
749 return (__m128i)((__v8hi)a - (__v8hi)b); | |
750 } | |
751 | |
752 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
753 _mm_sub_epi32(__m128i a, __m128i b) | |
754 { | |
755 return (__m128i)((__v4si)a - (__v4si)b); | |
756 } | |
757 | |
758 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) | |
759 _mm_sub_si64(__m64 a, __m64 b) | |
760 { | |
761 return a - b; | |
762 } | |
763 | |
764 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
765 _mm_sub_epi64(__m128i a, __m128i b) | |
766 { | |
767 return a - b; | |
768 } | |
769 | |
770 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
771 _mm_subs_epi8(__m128i a, __m128i b) | |
772 { | |
773 return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b); | |
774 } | |
775 | |
776 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
777 _mm_subs_epi16(__m128i a, __m128i b) | |
778 { | |
779 return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b); | |
780 } | |
781 | |
782 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
783 _mm_subs_epu8(__m128i a, __m128i b) | |
784 { | |
785 return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b); | |
786 } | |
787 | |
788 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
789 _mm_subs_epu16(__m128i a, __m128i b) | |
790 { | |
791 return (__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b); | |
792 } | |
793 | |
794 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
795 _mm_and_si128(__m128i a, __m128i b) | |
796 { | |
797 return a & b; | |
798 } | |
799 | |
800 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
801 _mm_andnot_si128(__m128i a, __m128i b) | |
802 { | |
803 return ~a & b; | |
804 } | |
805 | |
806 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
807 _mm_or_si128(__m128i a, __m128i b) | |
808 { | |
809 return a | b; | |
810 } | |
811 | |
812 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
813 _mm_xor_si128(__m128i a, __m128i b) | |
814 { | |
815 return a ^ b; | |
816 } | |
817 | |
/* Forwards VEC to the pslldqi128 builtin with IMM scaled by 8
   (presumably a byte count turned into bits -- confirm against the builtin).
   A macro, so IMM reaches the builtin as written by the caller. */
#define _mm_slli_si128(VEC, IMM) \
  ((__m128i)__builtin_ia32_pslldqi128((__m128i)(VEC), 8 * (IMM)))
820 | |
821 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
822 _mm_slli_epi16(__m128i a, int count) | |
823 { | |
824 return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count); | |
825 } | |
826 | |
827 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
828 _mm_sll_epi16(__m128i a, __m128i count) | |
829 { | |
830 return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count); | |
831 } | |
832 | |
833 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
834 _mm_slli_epi32(__m128i a, int count) | |
835 { | |
836 return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count); | |
837 } | |
838 | |
839 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
840 _mm_sll_epi32(__m128i a, __m128i count) | |
841 { | |
842 return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count); | |
843 } | |
844 | |
845 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
846 _mm_slli_epi64(__m128i a, int count) | |
847 { | |
848 return __builtin_ia32_psllqi128(a, count); | |
849 } | |
850 | |
851 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
852 _mm_sll_epi64(__m128i a, __m128i count) | |
853 { | |
854 return __builtin_ia32_psllq128(a, count); | |
855 } | |
856 | |
857 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
858 _mm_srai_epi16(__m128i a, int count) | |
859 { | |
860 return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count); | |
861 } | |
862 | |
863 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
864 _mm_sra_epi16(__m128i a, __m128i count) | |
865 { | |
866 return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count); | |
867 } | |
868 | |
869 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
870 _mm_srai_epi32(__m128i a, int count) | |
871 { | |
872 return (__m128i)__builtin_ia32_psradi128((__v4si)a, count); | |
873 } | |
874 | |
875 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
876 _mm_sra_epi32(__m128i a, __m128i count) | |
877 { | |
878 return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count); | |
879 } | |
880 | |
881 | |
/* Forwards VEC to the psrldqi128 builtin with IMM scaled by 8
   (presumably a byte count turned into bits -- confirm against the builtin).
   A macro, so IMM reaches the builtin as written by the caller. */
#define _mm_srli_si128(VEC, IMM) \
  ((__m128i)__builtin_ia32_psrldqi128((__m128i)(VEC), 8 * (IMM)))
884 | |
885 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
886 _mm_srli_epi16(__m128i a, int count) | |
887 { | |
888 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count); | |
889 } | |
890 | |
891 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
892 _mm_srl_epi16(__m128i a, __m128i count) | |
893 { | |
894 return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count); | |
895 } | |
896 | |
897 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
898 _mm_srli_epi32(__m128i a, int count) | |
899 { | |
900 return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count); | |
901 } | |
902 | |
903 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
904 _mm_srl_epi32(__m128i a, __m128i count) | |
905 { | |
906 return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count); | |
907 } | |
908 | |
909 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
910 _mm_srli_epi64(__m128i a, int count) | |
911 { | |
912 return __builtin_ia32_psrlqi128(a, count); | |
913 } | |
914 | |
915 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
916 _mm_srl_epi64(__m128i a, __m128i count) | |
917 { | |
918 return __builtin_ia32_psrlq128(a, count); | |
919 } | |
920 | |
921 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
922 _mm_cmpeq_epi8(__m128i a, __m128i b) | |
923 { | |
924 return (__m128i)((__v16qi)a == (__v16qi)b); | |
925 } | |
926 | |
927 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
928 _mm_cmpeq_epi16(__m128i a, __m128i b) | |
929 { | |
930 return (__m128i)((__v8hi)a == (__v8hi)b); | |
931 } | |
932 | |
933 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
934 _mm_cmpeq_epi32(__m128i a, __m128i b) | |
935 { | |
936 return (__m128i)((__v4si)a == (__v4si)b); | |
937 } | |
938 | |
939 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
940 _mm_cmpgt_epi8(__m128i a, __m128i b) | |
941 { | |
942 return (__m128i)((__v16qi)a > (__v16qi)b); | |
943 } | |
944 | |
945 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
946 _mm_cmpgt_epi16(__m128i a, __m128i b) | |
947 { | |
948 return (__m128i)((__v8hi)a > (__v8hi)b); | |
949 } | |
950 | |
951 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
952 _mm_cmpgt_epi32(__m128i a, __m128i b) | |
953 { | |
954 return (__m128i)((__v4si)a > (__v4si)b); | |
955 } | |
956 | |
957 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
958 _mm_cmplt_epi8(__m128i a, __m128i b) | |
959 { | |
960 return _mm_cmpgt_epi8(b,a); | |
961 } | |
962 | |
963 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
964 _mm_cmplt_epi16(__m128i a, __m128i b) | |
965 { | |
966 return _mm_cmpgt_epi16(b,a); | |
967 } | |
968 | |
969 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
970 _mm_cmplt_epi32(__m128i a, __m128i b) | |
971 { | |
972 return _mm_cmpgt_epi32(b,a); | |
973 } | |
974 | |
975 #ifdef __x86_64__ | |
976 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
977 _mm_cvtsi64_sd(__m128d a, long long b) | |
978 { | |
979 a[0] = b; | |
980 return a; | |
981 } | |
982 | |
983 static __inline__ long long __attribute__((__always_inline__, __nodebug__)) | |
984 _mm_cvtsd_si64(__m128d a) | |
985 { | |
986 return __builtin_ia32_cvtsd2si64(a); | |
987 } | |
988 | |
989 static __inline__ long long __attribute__((__always_inline__, __nodebug__)) | |
990 _mm_cvttsd_si64(__m128d a) | |
991 { | |
992 return a[0]; | |
993 } | |
994 #endif | |
995 | |
996 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) | |
997 _mm_cvtepi32_ps(__m128i a) | |
998 { | |
999 return __builtin_ia32_cvtdq2ps((__v4si)a); | |
1000 } | |
1001 | |
1002 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
1003 _mm_cvtps_epi32(__m128 a) | |
1004 { | |
1005 return (__m128i)__builtin_ia32_cvtps2dq(a); | |
1006 } | |
1007 | |
1008 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
1009 _mm_cvttps_epi32(__m128 a) | |
1010 { | |
1011 return (__m128i)__builtin_ia32_cvttps2dq(a); | |
1012 } | |
1013 | |
1014 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
1015 _mm_cvtsi32_si128(int a) | |
1016 { | |
1017 return (__m128i)(__v4si){ a, 0, 0, 0 }; | |
1018 } | |
1019 | |
1020 #ifdef __x86_64__ | |
1021 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
1022 _mm_cvtsi64_si128(long long a) | |
1023 { | |
1024 return (__m128i){ a, 0 }; | |
1025 } | |
1026 #endif | |
1027 | |
1028 static __inline__ int __attribute__((__always_inline__, __nodebug__)) | |
1029 _mm_cvtsi128_si32(__m128i a) | |
1030 { | |
1031 __v4si b = (__v4si)a; | |
1032 return b[0]; | |
1033 } | |
1034 | |
1035 #ifdef __x86_64__ | |
1036 static __inline__ long long __attribute__((__always_inline__, __nodebug__)) | |
1037 _mm_cvtsi128_si64(__m128i a) | |
1038 { | |
1039 return a[0]; | |
1040 } | |
1041 #endif | |
1042 | |
1043 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
1044 _mm_load_si128(__m128i const *p) | |
1045 { | |
1046 return *p; | |
1047 } | |
1048 | |
1049 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
1050 _mm_loadu_si128(__m128i const *p) | |
1051 { | |
1052 struct __loadu_si128 { | |
1053 __m128i v; | |
1054 } __attribute__((packed, may_alias)); | |
1055 return ((struct __loadu_si128*)p)->v; | |
1056 } | |
1057 | |
1058 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
1059 _mm_loadl_epi64(__m128i const *p) | |
1060 { | |
1061 struct __mm_loadl_epi64_struct { | |
1062 long long u; | |
1063 } __attribute__((__packed__, __may_alias__)); | |
1064 return (__m128i) { ((struct __mm_loadl_epi64_struct*)p)->u, 0}; | |
1065 } | |
1066 | |
1067 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
1068 _mm_set_epi64x(long long q1, long long q0) | |
1069 { | |
1070 return (__m128i){ q0, q1 }; | |
1071 } | |
1072 | |
1073 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
1074 _mm_set_epi64(__m64 q1, __m64 q0) | |
1075 { | |
1076 return (__m128i){ (long long)q0, (long long)q1 }; | |
1077 } | |
1078 | |
1079 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
1080 _mm_set_epi32(int i3, int i2, int i1, int i0) | |
1081 { | |
1082 return (__m128i)(__v4si){ i0, i1, i2, i3}; | |
1083 } | |
1084 | |
1085 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
1086 _mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short
w1, short w0) | |
1087 { | |
1088 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 }; | |
1089 } | |
1090 | |
1091 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
1092 _mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9
, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b
0) | |
1093 { | |
1094 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b
12, b13, b14, b15 }; | |
1095 } | |
1096 | |
1097 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
1098 _mm_set1_epi64x(long long q) | |
1099 { | |
1100 return (__m128i){ q, q }; | |
1101 } | |
1102 | |
1103 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
1104 _mm_set1_epi64(__m64 q) | |
1105 { | |
1106 return (__m128i){ (long long)q, (long long)q }; | |
1107 } | |
1108 | |
1109 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
1110 _mm_set1_epi32(int i) | |
1111 { | |
1112 return (__m128i)(__v4si){ i, i, i, i }; | |
1113 } | |
1114 | |
1115 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
1116 _mm_set1_epi16(short w) | |
1117 { | |
1118 return (__m128i)(__v8hi){ w, w, w, w, w, w, w, w }; | |
1119 } | |
1120 | |
1121 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
1122 _mm_set1_epi8(char b) | |
1123 { | |
1124 return (__m128i)(__v16qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b }; | |
1125 } | |
1126 | |
1127 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
1128 _mm_setr_epi64(__m64 q0, __m64 q1) | |
1129 { | |
1130 return (__m128i){ (long long)q0, (long long)q1 }; | |
1131 } | |
1132 | |
1133 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
1134 _mm_setr_epi32(int i0, int i1, int i2, int i3) | |
1135 { | |
1136 return (__m128i)(__v4si){ i0, i1, i2, i3}; | |
1137 } | |
1138 | |
1139 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
1140 _mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short
w6, short w7) | |
1141 { | |
1142 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 }; | |
1143 } | |
1144 | |
1145 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
1146 _mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, cha
r b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b
15) | |
1147 { | |
1148 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b
12, b13, b14, b15 }; | |
1149 } | |
1150 | |
1151 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
1152 _mm_setzero_si128(void) | |
1153 { | |
1154 return (__m128i){ 0LL, 0LL }; | |
1155 } | |
1156 | |
1157 static __inline__ void __attribute__((__always_inline__, __nodebug__)) | |
1158 _mm_store_si128(__m128i *p, __m128i b) | |
1159 { | |
1160 *p = b; | |
1161 } | |
1162 | |
1163 static __inline__ void __attribute__((__always_inline__, __nodebug__)) | |
1164 _mm_storeu_si128(__m128i *p, __m128i b) | |
1165 { | |
1166 __builtin_ia32_storedqu((char *)p, (__v16qi)b); | |
1167 } | |
1168 | |
1169 static __inline__ void __attribute__((__always_inline__, __nodebug__)) | |
1170 _mm_maskmoveu_si128(__m128i d, __m128i n, char *p) | |
1171 { | |
1172 __builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p); | |
1173 } | |
1174 | |
1175 static __inline__ void __attribute__((__always_inline__, __nodebug__)) | |
1176 _mm_storel_epi64(__m128i *p, __m128i a) | |
1177 { | |
1178 __builtin_ia32_storelv4si((__v2si *)p, a); | |
1179 } | |
1180 | |
1181 static __inline__ void __attribute__((__always_inline__, __nodebug__)) | |
1182 _mm_stream_pd(double *p, __m128d a) | |
1183 { | |
1184 __builtin_ia32_movntpd(p, a); | |
1185 } | |
1186 | |
1187 static __inline__ void __attribute__((__always_inline__, __nodebug__)) | |
1188 _mm_stream_si128(__m128i *p, __m128i a) | |
1189 { | |
1190 __builtin_ia32_movntdq(p, a); | |
1191 } | |
1192 | |
/* Stores the int `a` to `p` through the MOVNTI (non-temporal store)
 * builtin. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si32(int *p, int a)
{
  int *__dst = p;
  __builtin_ia32_movnti(__dst, a);
}
1198 | |
/* Forwards `p` to the CLFLUSH builtin. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_clflush(void const *p)
{
  void const *__line = p;
  __builtin_ia32_clflush(__line);
}
1204 | |
/* Issues the LFENCE builtin; takes no arguments and returns nothing. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_lfence(void)
{
  (void)0; /* no operands to prepare */
  __builtin_ia32_lfence();
}
1210 | |
/* Issues the MFENCE builtin; takes no arguments and returns nothing. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_mfence(void)
{
  (void)0; /* no operands to prepare */
  __builtin_ia32_mfence();
}
1216 | |
1217 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
1218 _mm_packs_epi16(__m128i a, __m128i b) | |
1219 { | |
1220 return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b); | |
1221 } | |
1222 | |
1223 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
1224 _mm_packs_epi32(__m128i a, __m128i b) | |
1225 { | |
1226 return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b); | |
1227 } | |
1228 | |
1229 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
1230 _mm_packus_epi16(__m128i a, __m128i b) | |
1231 { | |
1232 return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b); | |
1233 } | |
1234 | |
1235 static __inline__ int __attribute__((__always_inline__, __nodebug__)) | |
1236 _mm_extract_epi16(__m128i a, int imm) | |
1237 { | |
1238 __v8hi b = (__v8hi)a; | |
1239 return (unsigned short)b[imm]; | |
1240 } | |
1241 | |
1242 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
1243 _mm_insert_epi16(__m128i a, int b, int imm) | |
1244 { | |
1245 __v8hi c = (__v8hi)a; | |
1246 c[imm & 7] = b; | |
1247 return (__m128i)c; | |
1248 } | |
1249 | |
1250 static __inline__ int __attribute__((__always_inline__, __nodebug__)) | |
1251 _mm_movemask_epi8(__m128i a) | |
1252 { | |
1253 return __builtin_ia32_pmovmskb128((__v16qi)a); | |
1254 } | |
1255 | |
/* Permutes the four 32-bit elements of `a`.  Each 2-bit field of `imm`
 * (lowest field first) picks the source element for the matching result
 * element.  The second shuffle operand is all zeros and is never selected,
 * since every index is in 0..3. */
#define _mm_shuffle_epi32(a, imm) \
  ((__m128i)__builtin_shufflevector((__v4si)(a), (__v4si) _mm_set1_epi32(0), \
                                    ((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \
                                    ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3))
1260 | |
1261 | |
/* Permutes the low four 16-bit elements of `a` according to the four
 * 2-bit fields of `imm`; elements 4..7 are copied through unchanged. */
#define _mm_shufflelo_epi16(a, imm) \
  ((__m128i)__builtin_shufflevector((__v8hi)(a), (__v8hi) _mm_set1_epi16(0), \
                                    ((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \
                                    ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3, \
                                    4, 5, 6, 7))
/* Permutes the high four 16-bit elements of `a` according to the four
 * 2-bit fields of `imm` (each field is offset by 4 to address the upper
 * half); elements 0..3 are copied through unchanged. */
#define _mm_shufflehi_epi16(a, imm) \
  ((__m128i)__builtin_shufflevector((__v8hi)(a), (__v8hi) _mm_set1_epi16(0), \
                                    0, 1, 2, 3, \
                                    4 + (((imm) >> 0) & 0x3), \
                                    4 + (((imm) >> 2) & 0x3), \
                                    4 + (((imm) >> 4) & 0x3), \
                                    4 + (((imm) >> 6) & 0x3)))
1273 | |
1274 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
1275 _mm_unpackhi_epi8(__m128i a, __m128i b) | |
1276 { | |
1277 return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 8, 16+8, 9, 16
+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); | |
1278 } | |
1279 | |
1280 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
1281 _mm_unpackhi_epi16(__m128i a, __m128i b) | |
1282 { | |
1283 return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8+4, 5, 8+5,
6, 8+6, 7, 8+7); | |
1284 } | |
1285 | |
1286 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
1287 _mm_unpackhi_epi32(__m128i a, __m128i b) | |
1288 { | |
1289 return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4+2, 3, 4+3); | |
1290 } | |
1291 | |
1292 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
1293 _mm_unpackhi_epi64(__m128i a, __m128i b) | |
1294 { | |
1295 return (__m128i)__builtin_shufflevector(a, b, 1, 2+1); | |
1296 } | |
1297 | |
1298 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
1299 _mm_unpacklo_epi8(__m128i a, __m128i b) | |
1300 { | |
1301 return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 0, 16+0, 1, 16
+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7); | |
1302 } | |
1303 | |
1304 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
1305 _mm_unpacklo_epi16(__m128i a, __m128i b) | |
1306 { | |
1307 return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8+0, 1, 8+1,
2, 8+2, 3, 8+3); | |
1308 } | |
1309 | |
1310 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
1311 _mm_unpacklo_epi32(__m128i a, __m128i b) | |
1312 { | |
1313 return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4+0, 1, 4+1); | |
1314 } | |
1315 | |
1316 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
1317 _mm_unpacklo_epi64(__m128i a, __m128i b) | |
1318 { | |
1319 return (__m128i)__builtin_shufflevector(a, b, 0, 2+0); | |
1320 } | |
1321 | |
1322 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) | |
1323 _mm_movepi64_pi64(__m128i a) | |
1324 { | |
1325 return (__m64)a[0]; | |
1326 } | |
1327 | |
1328 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
1329 _mm_movpi64_pi64(__m64 a) | |
1330 { | |
1331 return (__m128i){ (long long)a, 0 }; | |
1332 } | |
1333 | |
1334 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
1335 _mm_move_epi64(__m128i a) | |
1336 { | |
1337 return __builtin_shufflevector(a, (__m128i){ 0 }, 0, 2); | |
1338 } | |
1339 | |
1340 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
1341 _mm_unpackhi_pd(__m128d a, __m128d b) | |
1342 { | |
1343 return __builtin_shufflevector(a, b, 1, 2+1); | |
1344 } | |
1345 | |
1346 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
1347 _mm_unpacklo_pd(__m128d a, __m128d b) | |
1348 { | |
1349 return __builtin_shufflevector(a, b, 0, 2+0); | |
1350 } | |
1351 | |
1352 static __inline__ int __attribute__((__always_inline__, __nodebug__)) | |
1353 _mm_movemask_pd(__m128d a) | |
1354 { | |
1355 return __builtin_ia32_movmskpd(a); | |
1356 } | |
1357 | |
/* Builds a double vector whose element 0 comes from `a` (chosen by bit 0
 * of `i`) and whose element 1 comes from `b` (chosen by bit 1 of `i`;
 * the +2 addresses the second shuffle operand). */
#define _mm_shuffle_pd(a, b, i) \
  (__builtin_shufflevector((__m128d)(a), (__m128d)(b), ((i) >> 0) & 1, \
                           2 + (((i) >> 1) & 1)))
1361 | |
1362 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) | |
1363 _mm_castpd_ps(__m128d in) | |
1364 { | |
1365 return (__m128)in; | |
1366 } | |
1367 | |
1368 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
1369 _mm_castpd_si128(__m128d in) | |
1370 { | |
1371 return (__m128i)in; | |
1372 } | |
1373 | |
1374 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
1375 _mm_castps_pd(__m128 in) | |
1376 { | |
1377 return (__m128d)in; | |
1378 } | |
1379 | |
1380 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
1381 _mm_castps_si128(__m128 in) | |
1382 { | |
1383 return (__m128i)in; | |
1384 } | |
1385 | |
1386 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) | |
1387 _mm_castsi128_ps(__m128i in) | |
1388 { | |
1389 return (__m128)in; | |
1390 } | |
1391 | |
1392 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
1393 _mm_castsi128_pd(__m128i in) | |
1394 { | |
1395 return (__m128d)in; | |
1396 } | |
1397 | |
/* Emits a single `pause` instruction via inline assembly; the asm is
 * volatile so the compiler does not drop it. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_pause(void)
{
  __asm__ __volatile__ ("pause");
}
1403 | |
/* Packs two selectors into a 2-bit value: `y` in bit 0, `x` in bit 1. */
#define _MM_SHUFFLE2(x, y) ((y) | ((x) << 1))
1405 | |
1406 #endif /* __SSE2__ */ | |
1407 | |
1408 #endif /* __EMMINTRIN_H */ | |
OLD | NEW |