OLD | NEW |
1 ;***************************************************************************** | 1 ;***************************************************************************** |
2 ;* x86inc.asm: x264asm abstraction layer | 2 ;* x86inc.asm: x264asm abstraction layer |
3 ;***************************************************************************** | 3 ;***************************************************************************** |
4 ;* Copyright (C) 2005-2012 x264 project | 4 ;* Copyright (C) 2005-2015 x264 project |
5 ;* | 5 ;* |
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu> | 6 ;* Authors: Loren Merritt <lorenm@u.washington.edu> |
7 ;* Anton Mitrofanov <BugMaster@narod.ru> | 7 ;* Anton Mitrofanov <BugMaster@narod.ru> |
8 ;* Jason Garrett-Glaser <darkshikari@gmail.com> | 8 ;* Fiona Glaser <fiona@x264.com> |
9 ;* Henrik Gramner <hengar-6@student.ltu.se> | 9 ;* Henrik Gramner <henrik@gramner.com> |
10 ;* | 10 ;* |
11 ;* Permission to use, copy, modify, and/or distribute this software for any | 11 ;* Permission to use, copy, modify, and/or distribute this software for any |
12 ;* purpose with or without fee is hereby granted, provided that the above | 12 ;* purpose with or without fee is hereby granted, provided that the above |
13 ;* copyright notice and this permission notice appear in all copies. | 13 ;* copyright notice and this permission notice appear in all copies. |
14 ;* | 14 ;* |
15 ;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | 15 ;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
16 ;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | 16 ;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
17 ;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR | 17 ;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR |
18 ;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | 18 ;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
19 ;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | 19 ;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
20 ;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | 20 ;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF |
21 ;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | 21 ;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
22 ;***************************************************************************** | 22 ;***************************************************************************** |
23 | 23 |
24 ; This is a header file for the x264ASM assembly language, which uses | 24 ; This is a header file for the x264ASM assembly language, which uses |
25 ; NASM/YASM syntax combined with a large number of macros to provide easy | 25 ; NASM/YASM syntax combined with a large number of macros to provide easy |
26 ; abstraction between different calling conventions (x86_32, win64, linux64). | 26 ; abstraction between different calling conventions (x86_32, win64, linux64). |
27 ; It also has various other useful features to simplify writing the kind of | 27 ; It also has various other useful features to simplify writing the kind of |
28 ; DSP functions that are most often used in x264. | 28 ; DSP functions that are most often used in x264. |
29 | 29 |
30 ; Unlike the rest of x264, this file is available under an ISC license, as it | 30 ; Unlike the rest of x264, this file is available under an ISC license, as it |
31 ; has significant usefulness outside of x264 and we want it to be available | 31 ; has significant usefulness outside of x264 and we want it to be available |
32 ; to the largest audience possible. Of course, if you modify it for your own | 32 ; to the largest audience possible. Of course, if you modify it for your own |
33 ; purposes to add a new feature, we strongly encourage contributing a patch | 33 ; purposes to add a new feature, we strongly encourage contributing a patch |
34 ; as this feature might be useful for others as well. Send patches or ideas | 34 ; as this feature might be useful for others as well. Send patches or ideas |
35 ; to x264-devel@videolan.org . | 35 ; to x264-devel@videolan.org . |
36 | 36 |
37 %include "vpx_config.asm" | 37 %include "vpx_config.asm" |
38 | 38 |
39 %ifndef program_name | 39 %ifndef private_prefix |
40 %define program_name vp9 | 40 %define private_prefix vpx |
41 %endif | 41 %endif |
42 | 42 |
| 43 %ifndef public_prefix |
| 44 %define public_prefix private_prefix |
| 45 %endif |
43 | 46 |
| 47 %ifndef STACK_ALIGNMENT |
| 48 %if ARCH_X86_64 |
| 49 %define STACK_ALIGNMENT 16 |
| 50 %else |
| 51 %define STACK_ALIGNMENT 4 |
| 52 %endif |
| 53 %endif |
| 54 |
| 55 %define WIN64 0 |
44 %define UNIX64 0 | 56 %define UNIX64 0 |
45 %define WIN64 0 | |
46 %if ARCH_X86_64 | 57 %if ARCH_X86_64 |
47 %ifidn __OUTPUT_FORMAT__,win32 | 58 %ifidn __OUTPUT_FORMAT__,win32 |
48 %define WIN64 1 | 59 %define WIN64 1 |
49 %elifidn __OUTPUT_FORMAT__,win64 | 60 %elifidn __OUTPUT_FORMAT__,win64 |
50 %define WIN64 1 | 61 %define WIN64 1 |
51 %elifidn __OUTPUT_FORMAT__,x64 | 62 %elifidn __OUTPUT_FORMAT__,x64 |
52 %define WIN64 1 | 63 %define WIN64 1 |
53 %else | 64 %else |
54 %define UNIX64 1 | 65 %define UNIX64 1 |
55 %endif | 66 %endif |
56 %endif | 67 %endif |
57 | 68 |
58 %ifidn __OUTPUT_FORMAT__,elf32 | 69 %ifidn __OUTPUT_FORMAT__,elf32 |
59 %define mangle(x) x | 70 %define mangle(x) x |
60 %elifidn __OUTPUT_FORMAT__,elf64 | 71 %elifidn __OUTPUT_FORMAT__,elf64 |
61 %define mangle(x) x | 72 %define mangle(x) x |
62 %elifidn __OUTPUT_FORMAT__,elf | |
63 %define mangle(x) x | |
64 %elifidn __OUTPUT_FORMAT__,x64 | 73 %elifidn __OUTPUT_FORMAT__,x64 |
65 %define mangle(x) x | 74 %define mangle(x) x |
66 %elifidn __OUTPUT_FORMAT__,win64 | 75 %elifidn __OUTPUT_FORMAT__,win64 |
67 %define mangle(x) x | 76 %define mangle(x) x |
68 %else | 77 %else |
69 %define mangle(x) _ %+ x | 78 %define mangle(x) _ %+ x |
70 %endif | 79 %endif |
71 | 80 |
72 ; FIXME: All of the 64bit asm functions that take a stride as an argument | 81 ; In some instances macho32 tables get misaligned when using .rodata. |
73 ; via register, assume that the high dword of that register is filled with 0. | 82 ; When looking at the disassembly it appears that the offset is either |
74 ; This is true in practice (since we never do any 64bit arithmetic on strides, | 83 ; correct or consistently off by 90. Placing them in the .text section |
75 ; and x264's strides are all positive), but is not guaranteed by the ABI. | 84 ; works around the issue. It appears to be specific to the way libvpx |
76 | 85 ; handles the tables. |
77 ; Name of the .rodata section. | |
78 ; Kludge: Something on OS X fails to align .rodata even given an align attribute, | |
79 ; so use a different read-only section. | |
80 %macro SECTION_RODATA 0-1 16 | 86 %macro SECTION_RODATA 0-1 16 |
81 %ifidn __OUTPUT_FORMAT__,macho64 | 87 %ifidn __OUTPUT_FORMAT__,macho32 |
82 SECTION .text align=%1 | |
83 %elifidn __OUTPUT_FORMAT__,macho32 | |
84 SECTION .text align=%1 | |
85 fakegot: | |
86 %elifidn __OUTPUT_FORMAT__,macho | |
87 SECTION .text align=%1 | 88 SECTION .text align=%1 |
88 fakegot: | 89 fakegot: |
89 %elifidn __OUTPUT_FORMAT__,aout | 90 %elifidn __OUTPUT_FORMAT__,aout |
90 section .text | 91 SECTION .text |
91 %else | 92 %else |
92 SECTION .rodata align=%1 | 93 SECTION .rodata align=%1 |
93 %endif | 94 %endif |
94 %endmacro | 95 %endmacro |
95 | 96 |
96 ; aout does not support align= | |
97 %macro SECTION_TEXT 0-1 16 | 97 %macro SECTION_TEXT 0-1 16 |
98 %ifidn __OUTPUT_FORMAT__,aout | 98 %ifidn __OUTPUT_FORMAT__,aout |
99 SECTION .text | 99 SECTION .text |
100 %else | 100 %else |
101 SECTION .text align=%1 | 101 SECTION .text align=%1 |
102 %endif | 102 %endif |
103 %endmacro | 103 %endmacro |
104 | 104 |
105 ; PIC macros are copied from vpx_ports/x86_abi_support.asm. The "define PIC" | 105 ; PIC macros are copied from vpx_ports/x86_abi_support.asm. The "define PIC" |
106 ; from original code is added in for 64bit. | 106 ; from original code is added in for 64bit. |
107 %ifidn __OUTPUT_FORMAT__,elf32 | 107 %ifidn __OUTPUT_FORMAT__,elf32 |
108 %define ABI_IS_32BIT 1 | 108 %define ABI_IS_32BIT 1 |
109 %elifidn __OUTPUT_FORMAT__,macho32 | 109 %elifidn __OUTPUT_FORMAT__,macho32 |
110 %define ABI_IS_32BIT 1 | 110 %define ABI_IS_32BIT 1 |
111 %elifidn __OUTPUT_FORMAT__,win32 | 111 %elifidn __OUTPUT_FORMAT__,win32 |
112 %define ABI_IS_32BIT 1 | 112 %define ABI_IS_32BIT 1 |
113 %elifidn __OUTPUT_FORMAT__,aout | 113 %elifidn __OUTPUT_FORMAT__,aout |
114 %define ABI_IS_32BIT 1 | 114 %define ABI_IS_32BIT 1 |
115 %else | 115 %else |
116 %define ABI_IS_32BIT 0 | 116 %define ABI_IS_32BIT 0 |
117 %endif | 117 %endif |
118 | 118 |
119 %if ABI_IS_32BIT | 119 %if ABI_IS_32BIT |
120 %if CONFIG_PIC=1 | 120 %if CONFIG_PIC=1 |
121 %ifidn __OUTPUT_FORMAT__,elf32 | 121 %ifidn __OUTPUT_FORMAT__,elf32 |
122 %define GET_GOT_SAVE_ARG 1 | 122 %define GET_GOT_SAVE_ARG 1 |
123 %define WRT_PLT wrt ..plt | 123 %define WRT_PLT wrt ..plt |
124 %macro GET_GOT 1 | 124 %macro GET_GOT 1 |
125 extern _GLOBAL_OFFSET_TABLE_ | 125 extern _GLOBAL_OFFSET_TABLE_ |
126 push %1 | 126 push %1 |
127 call %%get_got | 127 call %%get_got |
128 %%sub_offset: | 128 %%sub_offset: |
129 jmp %%exitGG | 129 jmp %%exitGG |
130 %%get_got: | 130 %%get_got: |
131 mov %1, [esp] | 131 mov %1, [esp] |
132 add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%sub_offset wrt ..gotpc | 132 add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%sub_offset wrt ..gotpc |
133 ret | 133 ret |
134 %%exitGG: | 134 %%exitGG: |
135 %undef GLOBAL | 135 %undef GLOBAL |
136 %define GLOBAL(x) x + %1 wrt ..gotoff | 136 %define GLOBAL(x) x + %1 wrt ..gotoff |
137 %undef RESTORE_GOT | 137 %undef RESTORE_GOT |
138 %define RESTORE_GOT pop %1 | 138 %define RESTORE_GOT pop %1 |
139 %endmacro | 139 %endmacro |
140 %elifidn __OUTPUT_FORMAT__,macho32 | 140 %elifidn __OUTPUT_FORMAT__,macho32 |
141 %define GET_GOT_SAVE_ARG 1 | 141 %define GET_GOT_SAVE_ARG 1 |
142 %macro GET_GOT 1 | 142 %macro GET_GOT 1 |
143 push %1 | 143 push %1 |
144 call %%get_got | 144 call %%get_got |
145 %%get_got: | 145 %%get_got: |
146 pop %1 | 146 pop %1 |
147 %undef GLOBAL | 147 %undef GLOBAL |
148 %define GLOBAL(x) x + %1 - %%get_got | 148 %define GLOBAL(x) x + %1 - %%get_got |
149 %undef RESTORE_GOT | 149 %undef RESTORE_GOT |
150 %define RESTORE_GOT pop %1 | 150 %define RESTORE_GOT pop %1 |
151 %endmacro | 151 %endmacro |
152 %endif | 152 %endif |
153 %endif | 153 %endif |
154 | 154 |
155 %if ARCH_X86_64 == 0 | 155 %if ARCH_X86_64 == 0 |
156 %undef PIC | 156 %undef PIC |
157 %endif | 157 %endif |
158 | 158 |
159 %else | 159 %else |
160 %macro GET_GOT 1 | 160 %macro GET_GOT 1 |
161 %endmacro | 161 %endmacro |
162 %define GLOBAL(x) rel x | 162 %define GLOBAL(x) rel x |
163 %define WRT_PLT wrt ..plt | 163 %define WRT_PLT wrt ..plt |
164 | 164 |
165 %if WIN64 | 165 %if WIN64 |
166 %define PIC | 166 %define PIC |
167 %elifidn __OUTPUT_FORMAT__,macho64 | 167 %elifidn __OUTPUT_FORMAT__,macho64 |
168 %define PIC | 168 %define PIC |
169 %elif CONFIG_PIC | 169 %elif CONFIG_PIC |
170 %define PIC | 170 %define PIC |
171 %endif | 171 %endif |
172 %endif | 172 %endif |
173 | 173 |
174 %ifnmacro GET_GOT | 174 %ifnmacro GET_GOT |
175 %macro GET_GOT 1 | 175 %macro GET_GOT 1 |
176 %endmacro | 176 %endmacro |
177 %define GLOBAL(x) x | 177 %define GLOBAL(x) x |
178 %endif | 178 %endif |
179 %ifndef RESTORE_GOT | 179 %ifndef RESTORE_GOT |
180 %define RESTORE_GOT | 180 %define RESTORE_GOT |
181 %endif | 181 %endif |
182 %ifndef WRT_PLT | 182 %ifndef WRT_PLT |
183 %define WRT_PLT | 183 %define WRT_PLT |
184 %endif | 184 %endif |
185 | 185 |
186 %ifdef PIC | 186 %ifdef PIC |
187 default rel | 187 default rel |
188 %endif | 188 %endif |
189 ; Done with PIC macros | 189 ; Done with PIC macros |
190 | 190 |
191 ; Always use long nops (reduces 0x90 spam in disassembly on x86_32) | |
192 %ifndef __NASM_VER__ | |
193 CPU amdnop | |
194 %else | |
195 %use smartalign | |
196 ALIGNMODE k7 | |
197 %endif | |
198 | |
199 ; Macros to eliminate most code duplication between x86_32 and x86_64: | 191 ; Macros to eliminate most code duplication between x86_32 and x86_64: |
200 ; Currently this works only for leaf functions which load all their arguments | 192 ; Currently this works only for leaf functions which load all their arguments |
201 ; into registers at the start, and make no other use of the stack. Luckily that | 193 ; into registers at the start, and make no other use of the stack. Luckily that |
202 ; covers most of x264's asm. | 194 ; covers most of x264's asm. |
203 | 195 |
204 ; PROLOGUE: | 196 ; PROLOGUE: |
205 ; %1 = number of arguments. loads them from stack if needed. | 197 ; %1 = number of arguments. loads them from stack if needed. |
206 ; %2 = number of registers used. pushes callee-saved regs if needed. | 198 ; %2 = number of registers used. pushes callee-saved regs if needed. |
207 ; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. | 199 ; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. |
208 ; %4 = list of names to define to registers | 200 ; %4 = (optional) stack size to be allocated. The stack will be aligned before |
| 201 ; allocating the specified stack size. If the required stack alignment is |
| 202 ; larger than the known stack alignment the stack will be manually aligned |
| 203 ; and an extra register will be allocated to hold the original stack |
| 204 ; pointer (to not invalidate r0m etc.). To prevent the use of an extra |
| 205 ; register as stack pointer, request a negative stack size. |
| 206 ; %4+/%5+ = list of names to define to registers |
209 ; PROLOGUE can also be invoked by adding the same options to cglobal | 207 ; PROLOGUE can also be invoked by adding the same options to cglobal |
210 | 208 |
211 ; e.g. | 209 ; e.g. |
212 ; cglobal foo, 2,3,0, dst, src, tmp | 210 ; cglobal foo, 2,3,7,0x40, dst, src, tmp |
213 ; declares a function (foo), taking two args (dst and src) and one local variable (tmp) | 211 ; declares a function (foo) that automatically loads two arguments (dst and |
| 212 ; src) into registers, uses one additional register (tmp) plus 7 vector |
| 213 ; registers (m0-m6) and allocates 0x40 bytes of stack space. |
214 | 214 |
215 ; TODO Some functions can use some args directly from the stack. If they're the | 215 ; TODO Some functions can use some args directly from the stack. If they're the |
216 ; last args then you can just not declare them, but if they're in the middle | 216 ; last args then you can just not declare them, but if they're in the middle |
217 ; we need more flexible macro. | 217 ; we need more flexible macro. |
218 | 218 |
219 ; RET: | 219 ; RET: |
220 ; Pops anything that was pushed by PROLOGUE, and returns. | 220 ; Pops anything that was pushed by PROLOGUE, and returns. |
221 | 221 |
222 ; REP_RET: | 222 ; REP_RET: |
223 ; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons | 223 ; Use this instead of RET if it's a branch target. |
224 ; which are slow when a normal ret follows a branch. | |
225 | 224 |
226 ; registers: | 225 ; registers: |
227 ; rN and rNq are the native-size register holding function argument N | 226 ; rN and rNq are the native-size register holding function argument N |
228 ; rNd, rNw, rNb are dword, word, and byte size | 227 ; rNd, rNw, rNb are dword, word, and byte size |
| 228 ; rNh is the high 8 bits of the word size |
229 ; rNm is the original location of arg N (a register or on the stack), dword | 229 ; rNm is the original location of arg N (a register or on the stack), dword |
230 ; rNmp is native size | 230 ; rNmp is native size |
231 | 231 |
232 %macro DECLARE_REG 5-6 | 232 %macro DECLARE_REG 2-3 |
233 %define r%1q %2 | 233 %define r%1q %2 |
234 %define r%1d %3 | 234 %define r%1d %2d |
235 %define r%1w %4 | 235 %define r%1w %2w |
236 %define r%1b %5 | 236 %define r%1b %2b |
237 %if %0 == 5 | 237 %define r%1h %2h |
238 %define r%1m %3 | 238 %if %0 == 2 |
| 239 %define r%1m %2d |
239 %define r%1mp %2 | 240 %define r%1mp %2 |
240 %elif ARCH_X86_64 ; memory | 241 %elif ARCH_X86_64 ; memory |
241 %define r%1m [rsp + stack_offset + %6] | 242 %define r%1m [rstk + stack_offset + %3] |
242 %define r%1mp qword r %+ %1 %+ m | 243 %define r%1mp qword r %+ %1 %+ m |
243 %else | 244 %else |
244 %define r%1m [esp + stack_offset + %6] | 245 %define r%1m [rstk + stack_offset + %3] |
245 %define r%1mp dword r %+ %1 %+ m | 246 %define r%1mp dword r %+ %1 %+ m |
246 %endif | 247 %endif |
247 %define r%1 %2 | 248 %define r%1 %2 |
248 %endmacro | 249 %endmacro |
249 | 250 |
250 %macro DECLARE_REG_SIZE 2 | 251 %macro DECLARE_REG_SIZE 3 |
251 %define r%1q r%1 | 252 %define r%1q r%1 |
252 %define e%1q r%1 | 253 %define e%1q r%1 |
253 %define r%1d e%1 | 254 %define r%1d e%1 |
254 %define e%1d e%1 | 255 %define e%1d e%1 |
255 %define r%1w %1 | 256 %define r%1w %1 |
256 %define e%1w %1 | 257 %define e%1w %1 |
| 258 %define r%1h %3 |
| 259 %define e%1h %3 |
257 %define r%1b %2 | 260 %define r%1b %2 |
258 %define e%1b %2 | 261 %define e%1b %2 |
259 %if ARCH_X86_64 == 0 | 262 %if ARCH_X86_64 == 0 |
260 %define r%1 e%1 | 263 %define r%1 e%1 |
261 %endif | 264 %endif |
262 %endmacro | 265 %endmacro |
263 | 266 |
264 DECLARE_REG_SIZE ax, al | 267 DECLARE_REG_SIZE ax, al, ah |
265 DECLARE_REG_SIZE bx, bl | 268 DECLARE_REG_SIZE bx, bl, bh |
266 DECLARE_REG_SIZE cx, cl | 269 DECLARE_REG_SIZE cx, cl, ch |
267 DECLARE_REG_SIZE dx, dl | 270 DECLARE_REG_SIZE dx, dl, dh |
268 DECLARE_REG_SIZE si, sil | 271 DECLARE_REG_SIZE si, sil, null |
269 DECLARE_REG_SIZE di, dil | 272 DECLARE_REG_SIZE di, dil, null |
270 DECLARE_REG_SIZE bp, bpl | 273 DECLARE_REG_SIZE bp, bpl, null |
271 | 274 |
272 ; t# defines for when per-arch register allocation is more complex than just function arguments | 275 ; t# defines for when per-arch register allocation is more complex than just function arguments |
273 | 276 |
274 %macro DECLARE_REG_TMP 1-* | 277 %macro DECLARE_REG_TMP 1-* |
275 %assign %%i 0 | 278 %assign %%i 0 |
276 %rep %0 | 279 %rep %0 |
277 CAT_XDEFINE t, %%i, r%1 | 280 CAT_XDEFINE t, %%i, r%1 |
278 %assign %%i %%i+1 | 281 %assign %%i %%i+1 |
279 %rotate 1 | 282 %rotate 1 |
280 %endrep | 283 %endrep |
281 %endmacro | 284 %endmacro |
282 | 285 |
283 %macro DECLARE_REG_TMP_SIZE 0-* | 286 %macro DECLARE_REG_TMP_SIZE 0-* |
284 %rep %0 | 287 %rep %0 |
285 %define t%1q t%1 %+ q | 288 %define t%1q t%1 %+ q |
286 %define t%1d t%1 %+ d | 289 %define t%1d t%1 %+ d |
287 %define t%1w t%1 %+ w | 290 %define t%1w t%1 %+ w |
| 291 %define t%1h t%1 %+ h |
288 %define t%1b t%1 %+ b | 292 %define t%1b t%1 %+ b |
289 %rotate 1 | 293 %rotate 1 |
290 %endrep | 294 %endrep |
291 %endmacro | 295 %endmacro |
292 | 296 |
293 DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 | 297 DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 |
294 | 298 |
295 %if ARCH_X86_64 | 299 %if ARCH_X86_64 |
296 %define gprsize 8 | 300 %define gprsize 8 |
297 %else | 301 %else |
298 %define gprsize 4 | 302 %define gprsize 4 |
299 %endif | 303 %endif |
300 | 304 |
301 %macro PUSH 1 | 305 %macro PUSH 1 |
302 push %1 | 306 push %1 |
303 %assign stack_offset stack_offset+gprsize | 307 %ifidn rstk, rsp |
| 308 %assign stack_offset stack_offset+gprsize |
| 309 %endif |
304 %endmacro | 310 %endmacro |
305 | 311 |
306 %macro POP 1 | 312 %macro POP 1 |
307 pop %1 | 313 pop %1 |
308 %assign stack_offset stack_offset-gprsize | 314 %ifidn rstk, rsp |
| 315 %assign stack_offset stack_offset-gprsize |
| 316 %endif |
309 %endmacro | 317 %endmacro |
310 | 318 |
311 %macro PUSH_IF_USED 1-* | 319 %macro PUSH_IF_USED 1-* |
312 %rep %0 | 320 %rep %0 |
313 %if %1 < regs_used | 321 %if %1 < regs_used |
314 PUSH r%1 | 322 PUSH r%1 |
315 %endif | 323 %endif |
316 %rotate 1 | 324 %rotate 1 |
317 %endrep | 325 %endrep |
318 %endmacro | 326 %endmacro |
(...skipping 11 matching lines...) Expand all Loading... |
330 %rep %0 | 338 %rep %0 |
331 %if %1 < num_args | 339 %if %1 < num_args |
332 mov r%1, r %+ %1 %+ mp | 340 mov r%1, r %+ %1 %+ mp |
333 %endif | 341 %endif |
334 %rotate 1 | 342 %rotate 1 |
335 %endrep | 343 %endrep |
336 %endmacro | 344 %endmacro |
337 | 345 |
338 %macro SUB 2 | 346 %macro SUB 2 |
339 sub %1, %2 | 347 sub %1, %2 |
340 %ifidn %1, rsp | 348 %ifidn %1, rstk |
341 %assign stack_offset stack_offset+(%2) | 349 %assign stack_offset stack_offset+(%2) |
342 %endif | 350 %endif |
343 %endmacro | 351 %endmacro |
344 | 352 |
345 %macro ADD 2 | 353 %macro ADD 2 |
346 add %1, %2 | 354 add %1, %2 |
347 %ifidn %1, rsp | 355 %ifidn %1, rstk |
348 %assign stack_offset stack_offset-(%2) | 356 %assign stack_offset stack_offset-(%2) |
349 %endif | 357 %endif |
350 %endmacro | 358 %endmacro |
351 | 359 |
352 %macro movifnidn 2 | 360 %macro movifnidn 2 |
353 %ifnidn %1, %2 | 361 %ifnidn %1, %2 |
354 mov %1, %2 | 362 mov %1, %2 |
355 %endif | 363 %endif |
356 %endmacro | 364 %endmacro |
357 | 365 |
358 %macro movsxdifnidn 2 | 366 %macro movsxdifnidn 2 |
359 %ifnidn %1, %2 | 367 %ifnidn %1, %2 |
360 movsxd %1, %2 | 368 movsxd %1, %2 |
361 %endif | 369 %endif |
362 %endmacro | 370 %endmacro |
363 | 371 |
364 %macro ASSERT 1 | 372 %macro ASSERT 1 |
365 %if (%1) == 0 | 373 %if (%1) == 0 |
366 %error assert failed | 374 %error assert failed |
367 %endif | 375 %endif |
368 %endmacro | 376 %endmacro |
369 | 377 |
370 %macro DEFINE_ARGS 0-* | 378 %macro DEFINE_ARGS 0-* |
371 %ifdef n_arg_names | 379 %ifdef n_arg_names |
372 %assign %%i 0 | 380 %assign %%i 0 |
373 %rep n_arg_names | 381 %rep n_arg_names |
374 CAT_UNDEF arg_name %+ %%i, q | 382 CAT_UNDEF arg_name %+ %%i, q |
375 CAT_UNDEF arg_name %+ %%i, d | 383 CAT_UNDEF arg_name %+ %%i, d |
376 CAT_UNDEF arg_name %+ %%i, w | 384 CAT_UNDEF arg_name %+ %%i, w |
| 385 CAT_UNDEF arg_name %+ %%i, h |
377 CAT_UNDEF arg_name %+ %%i, b | 386 CAT_UNDEF arg_name %+ %%i, b |
378 CAT_UNDEF arg_name %+ %%i, m | 387 CAT_UNDEF arg_name %+ %%i, m |
379 CAT_UNDEF arg_name %+ %%i, mp | 388 CAT_UNDEF arg_name %+ %%i, mp |
380 CAT_UNDEF arg_name, %%i | 389 CAT_UNDEF arg_name, %%i |
381 %assign %%i %%i+1 | 390 %assign %%i %%i+1 |
382 %endrep | 391 %endrep |
383 %endif | 392 %endif |
384 | 393 |
385 %xdefine %%stack_offset stack_offset | 394 %xdefine %%stack_offset stack_offset |
386 %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine | 395 %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine |
387 %assign %%i 0 | 396 %assign %%i 0 |
388 %rep %0 | 397 %rep %0 |
389 %xdefine %1q r %+ %%i %+ q | 398 %xdefine %1q r %+ %%i %+ q |
390 %xdefine %1d r %+ %%i %+ d | 399 %xdefine %1d r %+ %%i %+ d |
391 %xdefine %1w r %+ %%i %+ w | 400 %xdefine %1w r %+ %%i %+ w |
| 401 %xdefine %1h r %+ %%i %+ h |
392 %xdefine %1b r %+ %%i %+ b | 402 %xdefine %1b r %+ %%i %+ b |
393 %xdefine %1m r %+ %%i %+ m | 403 %xdefine %1m r %+ %%i %+ m |
394 %xdefine %1mp r %+ %%i %+ mp | 404 %xdefine %1mp r %+ %%i %+ mp |
395 CAT_XDEFINE arg_name, %%i, %1 | 405 CAT_XDEFINE arg_name, %%i, %1 |
396 %assign %%i %%i+1 | 406 %assign %%i %%i+1 |
397 %rotate 1 | 407 %rotate 1 |
398 %endrep | 408 %endrep |
399 %xdefine stack_offset %%stack_offset | 409 %xdefine stack_offset %%stack_offset |
400 %assign n_arg_names %0 | 410 %assign n_arg_names %0 |
401 %endmacro | 411 %endmacro |
402 | 412 |
403 %if ARCH_X86_64 | 413 %define required_stack_alignment ((mmsize + 15) & ~15) |
404 %macro ALLOC_STACK 2 ; stack_size, num_regs | 414 |
405 %assign %%stack_aligment ((mmsize + 15) & ~15) | 415 %macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only) |
406 %assign stack_size_padded %1 | 416 %ifnum %1 |
407 | 417 %if %1 != 0 |
408 %assign %%reg_num (%2 - 1) | 418 %assign %%pad 0 |
409 %xdefine rsp_tmp r %+ %%reg_num | 419 %assign stack_size %1 |
410 mov rsp_tmp, rsp | 420 %if stack_size < 0 |
411 sub rsp, stack_size_padded | 421 %assign stack_size -stack_size |
412 and rsp, ~(%%stack_aligment - 1) | 422 %endif |
413 %endmacro | 423 %if WIN64 |
414 | 424 %assign %%pad %%pad + 32 ; shadow space |
415 %macro RESTORE_STACK 0 ; reset rsp register | 425 %if mmsize != 8 |
416 mov rsp, rsp_tmp | 426 %assign xmm_regs_used %2 |
417 %endmacro | 427 %if xmm_regs_used > 8 |
418 %endif | 428 %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers |
| 429 %endif |
| 430 %endif |
| 431 %endif |
| 432 %if required_stack_alignment <= STACK_ALIGNMENT |
| 433 ; maintain the current stack alignment |
| 434 %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) |
| 435 SUB rsp, stack_size_padded |
| 436 %else |
| 437 %assign %%reg_num (regs_used - 1) |
| 438 %xdefine rstk r %+ %%reg_num |
| 439 ; align stack, and save original stack location directly above |
| 440 ; it, i.e. in [rsp+stack_size_padded], so we can restore the |
| 441 ; stack in a single instruction (i.e. mov rsp, rstk or mov |
| 442 ; rsp, [rsp+stack_size_padded]) |
| 443 %if %1 < 0 ; need to store rsp on stack |
| 444 %xdefine rstkm [rsp + stack_size + %%pad] |
| 445 %assign %%pad %%pad + gprsize |
| 446 %else ; can keep rsp in rstk during whole function |
| 447 %xdefine rstkm rstk |
| 448 %endif |
| 449 %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1)) |
| 450 mov rstk, rsp |
| 451 and rsp, ~(required_stack_alignment-1) |
| 452 sub rsp, stack_size_padded |
| 453 movifnidn rstkm, rstk |
| 454 %endif |
| 455 WIN64_PUSH_XMM |
| 456 %endif |
| 457 %endif |
| 458 %endmacro |
| 459 |
| 460 %macro SETUP_STACK_POINTER 1 |
| 461 %ifnum %1 |
| 462 %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT |
| 463 %if %1 > 0 |
| 464 %assign regs_used (regs_used + 1) |
| 465 %elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2 |
| 466 %warning "Stack pointer will overwrite register argument" |
| 467 %endif |
| 468 %endif |
| 469 %endif |
| 470 %endmacro |
| 471 |
| 472 %macro DEFINE_ARGS_INTERNAL 3+ |
| 473 %ifnum %2 |
| 474 DEFINE_ARGS %3 |
| 475 %elif %1 == 4 |
| 476 DEFINE_ARGS %2 |
| 477 %elif %1 > 4 |
| 478 DEFINE_ARGS %2, %3 |
| 479 %endif |
| 480 %endmacro |
419 | 481 |
420 %if WIN64 ; Windows x64 ;================================================= | 482 %if WIN64 ; Windows x64 ;================================================= |
421 | 483 |
422 DECLARE_REG 0, rcx, ecx, cx, cl | 484 DECLARE_REG 0, rcx |
423 DECLARE_REG 1, rdx, edx, dx, dl | 485 DECLARE_REG 1, rdx |
424 DECLARE_REG 2, R8, R8D, R8W, R8B | 486 DECLARE_REG 2, R8 |
425 DECLARE_REG 3, R9, R9D, R9W, R9B | 487 DECLARE_REG 3, R9 |
426 DECLARE_REG 4, R10, R10D, R10W, R10B, 40 | 488 DECLARE_REG 4, R10, 40 |
427 DECLARE_REG 5, R11, R11D, R11W, R11B, 48 | 489 DECLARE_REG 5, R11, 48 |
428 DECLARE_REG 6, rax, eax, ax, al, 56 | 490 DECLARE_REG 6, rax, 56 |
429 DECLARE_REG 7, rdi, edi, di, dil, 64 | 491 DECLARE_REG 7, rdi, 64 |
430 DECLARE_REG 8, rsi, esi, si, sil, 72 | 492 DECLARE_REG 8, rsi, 72 |
431 DECLARE_REG 9, rbx, ebx, bx, bl, 80 | 493 DECLARE_REG 9, rbx, 80 |
432 DECLARE_REG 10, rbp, ebp, bp, bpl, 88 | 494 DECLARE_REG 10, rbp, 88 |
433 DECLARE_REG 11, R12, R12D, R12W, R12B, 96 | 495 DECLARE_REG 11, R12, 96 |
434 DECLARE_REG 12, R13, R13D, R13W, R13B, 104 | 496 DECLARE_REG 12, R13, 104 |
435 DECLARE_REG 13, R14, R14D, R14W, R14B, 112 | 497 DECLARE_REG 13, R14, 112 |
436 DECLARE_REG 14, R15, R15D, R15W, R15B, 120 | 498 DECLARE_REG 14, R15, 120 |
437 | 499 |
438 %macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names... | 500 %macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... |
439 %assign num_args %1 | 501 %assign num_args %1 |
440 %assign regs_used %2 | 502 %assign regs_used %2 |
441 ASSERT regs_used >= num_args | 503 ASSERT regs_used >= num_args |
| 504 SETUP_STACK_POINTER %4 |
442 ASSERT regs_used <= 15 | 505 ASSERT regs_used <= 15 |
443 PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 | 506 PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 |
444 %if mmsize == 8 | 507 ALLOC_STACK %4, %3 |
445 %assign xmm_regs_used 0 | 508 %if mmsize != 8 && stack_size == 0 |
446 %else | |
447 WIN64_SPILL_XMM %3 | 509 WIN64_SPILL_XMM %3 |
448 %endif | 510 %endif |
449 LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 | 511 LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 |
450 DEFINE_ARGS %4 | 512 DEFINE_ARGS_INTERNAL %0, %4, %5 |
| 513 %endmacro |
| 514 |
| 515 %macro WIN64_PUSH_XMM 0 |
| 516 ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. |
| 517 %if xmm_regs_used > 6 |
| 518 movaps [rstk + stack_offset + 8], xmm6 |
| 519 %endif |
| 520 %if xmm_regs_used > 7 |
| 521 movaps [rstk + stack_offset + 24], xmm7 |
| 522 %endif |
| 523 %if xmm_regs_used > 8 |
| 524 %assign %%i 8 |
| 525 %rep xmm_regs_used-8 |
| 526 movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i |
| 527 %assign %%i %%i+1 |
| 528 %endrep |
| 529 %endif |
451 %endmacro | 530 %endmacro |
452 | 531 |
453 %macro WIN64_SPILL_XMM 1 | 532 %macro WIN64_SPILL_XMM 1 |
454 %assign xmm_regs_used %1 | 533 %assign xmm_regs_used %1 |
455 ASSERT xmm_regs_used <= 16 | 534 ASSERT xmm_regs_used <= 16 |
| 535 %if xmm_regs_used > 8 |
| 536 ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack. |
| 537 %assign %%pad (xmm_regs_used-8)*16 + 32 |
| 538 %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) |
| 539 SUB rsp, stack_size_padded |
| 540 %endif |
| 541 WIN64_PUSH_XMM |
| 542 %endmacro |
| 543 |
| 544 %macro WIN64_RESTORE_XMM_INTERNAL 1 |
| 545 %assign %%pad_size 0 |
| 546 %if xmm_regs_used > 8 |
| 547 %assign %%i xmm_regs_used |
| 548 %rep xmm_regs_used-8 |
| 549 %assign %%i %%i-1 |
| 550 movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32] |
| 551 %endrep |
| 552 %endif |
| 553 %if stack_size_padded > 0 |
| 554 %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT |
| 555 mov rsp, rstkm |
| 556 %else |
| 557 add %1, stack_size_padded |
| 558 %assign %%pad_size stack_size_padded |
| 559 %endif |
| 560 %endif |
| 561 %if xmm_regs_used > 7 |
| 562 movaps xmm7, [%1 + stack_offset - %%pad_size + 24] |
| 563 %endif |
456 %if xmm_regs_used > 6 | 564 %if xmm_regs_used > 6 |
457 SUB rsp, (xmm_regs_used-6)*16+16 | 565 movaps xmm6, [%1 + stack_offset - %%pad_size + 8] |
458 %assign %%i xmm_regs_used | |
459 %rep (xmm_regs_used-6) | |
460 %assign %%i %%i-1 | |
461 movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i | |
462 %endrep | |
463 %endif | |
464 %endmacro | |
465 | |
466 %macro WIN64_RESTORE_XMM_INTERNAL 1 | |
467 %if xmm_regs_used > 6 | |
468 %assign %%i xmm_regs_used | |
469 %rep (xmm_regs_used-6) | |
470 %assign %%i %%i-1 | |
471 movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)] | |
472 %endrep | |
473 add %1, (xmm_regs_used-6)*16+16 | |
474 %endif | 566 %endif |
475 %endmacro | 567 %endmacro |
476 | 568 |
477 %macro WIN64_RESTORE_XMM 1 | 569 %macro WIN64_RESTORE_XMM 1 |
478 WIN64_RESTORE_XMM_INTERNAL %1 | 570 WIN64_RESTORE_XMM_INTERNAL %1 |
479 %assign stack_offset stack_offset-(xmm_regs_used-6)*16+16 | 571 %assign stack_offset (stack_offset-stack_size_padded) |
480 %assign xmm_regs_used 0 | 572 %assign xmm_regs_used 0 |
481 %endmacro | 573 %endmacro |
482 | 574 |
| 575 %define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0 |
| 576 |
483 %macro RET 0 | 577 %macro RET 0 |
484 WIN64_RESTORE_XMM_INTERNAL rsp | 578 WIN64_RESTORE_XMM_INTERNAL rsp |
485 POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 | 579 POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 |
486 ret | 580 %if mmsize == 32 |
487 %endmacro | 581 vzeroupper |
488 | 582 %endif |
489 %macro REP_RET 0 | 583 AUTO_REP_RET |
490 %if regs_used > 7 || xmm_regs_used > 6 | |
491 RET | |
492 %else | |
493 rep ret | |
494 %endif | |
495 %endmacro | 584 %endmacro |
496 | 585 |
497 %elif ARCH_X86_64 ; *nix x64 ;============================================= | 586 %elif ARCH_X86_64 ; *nix x64 ;============================================= |
498 | 587 |
499 DECLARE_REG 0, rdi, edi, di, dil | 588 DECLARE_REG 0, rdi |
500 DECLARE_REG 1, rsi, esi, si, sil | 589 DECLARE_REG 1, rsi |
501 DECLARE_REG 2, rdx, edx, dx, dl | 590 DECLARE_REG 2, rdx |
502 DECLARE_REG 3, rcx, ecx, cx, cl | 591 DECLARE_REG 3, rcx |
503 DECLARE_REG 4, R8, R8D, R8W, R8B | 592 DECLARE_REG 4, R8 |
504 DECLARE_REG 5, R9, R9D, R9W, R9B | 593 DECLARE_REG 5, R9 |
505 DECLARE_REG 6, rax, eax, ax, al, 8 | 594 DECLARE_REG 6, rax, 8 |
506 DECLARE_REG 7, R10, R10D, R10W, R10B, 16 | 595 DECLARE_REG 7, R10, 16 |
507 DECLARE_REG 8, R11, R11D, R11W, R11B, 24 | 596 DECLARE_REG 8, R11, 24 |
508 DECLARE_REG 9, rbx, ebx, bx, bl, 32 | 597 DECLARE_REG 9, rbx, 32 |
509 DECLARE_REG 10, rbp, ebp, bp, bpl, 40 | 598 DECLARE_REG 10, rbp, 40 |
510 DECLARE_REG 11, R12, R12D, R12W, R12B, 48 | 599 DECLARE_REG 11, R12, 48 |
511 DECLARE_REG 12, R13, R13D, R13W, R13B, 56 | 600 DECLARE_REG 12, R13, 56 |
512 DECLARE_REG 13, R14, R14D, R14W, R14B, 64 | 601 DECLARE_REG 13, R14, 64 |
513 DECLARE_REG 14, R15, R15D, R15W, R15B, 72 | 602 DECLARE_REG 14, R15, 72 |
514 | 603 |
515 %macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... | 604 %macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... |
516 %assign num_args %1 | 605 %assign num_args %1 |
517 %assign regs_used %2 | 606 %assign regs_used %2 |
518 ASSERT regs_used >= num_args | 607 ASSERT regs_used >= num_args |
| 608 SETUP_STACK_POINTER %4 |
519 ASSERT regs_used <= 15 | 609 ASSERT regs_used <= 15 |
520 PUSH_IF_USED 9, 10, 11, 12, 13, 14 | 610 PUSH_IF_USED 9, 10, 11, 12, 13, 14 |
| 611 ALLOC_STACK %4 |
521 LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 | 612 LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 |
522 DEFINE_ARGS %4 | 613 DEFINE_ARGS_INTERNAL %0, %4, %5 |
523 %endmacro | 614 %endmacro |
| 615 |
| 616 %define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0 |
524 | 617 |
525 %macro RET 0 | 618 %macro RET 0 |
| 619 %if stack_size_padded > 0 |
| 620 %if required_stack_alignment > STACK_ALIGNMENT |
| 621 mov rsp, rstkm |
| 622 %else |
| 623 add rsp, stack_size_padded |
| 624 %endif |
| 625 %endif |
526 POP_IF_USED 14, 13, 12, 11, 10, 9 | 626 POP_IF_USED 14, 13, 12, 11, 10, 9 |
527 ret | 627 %if mmsize == 32 |
528 %endmacro | 628 vzeroupper |
529 | 629 %endif |
530 %macro REP_RET 0 | 630 AUTO_REP_RET |
531 %if regs_used > 9 | |
532 RET | |
533 %else | |
534 rep ret | |
535 %endif | |
536 %endmacro | 631 %endmacro |
537 | 632 |
538 %else ; X86_32 ;============================================================== | 633 %else ; X86_32 ;============================================================== |
539 | 634 |
540 DECLARE_REG 0, eax, eax, ax, al, 4 | 635 DECLARE_REG 0, eax, 4 |
541 DECLARE_REG 1, ecx, ecx, cx, cl, 8 | 636 DECLARE_REG 1, ecx, 8 |
542 DECLARE_REG 2, edx, edx, dx, dl, 12 | 637 DECLARE_REG 2, edx, 12 |
543 DECLARE_REG 3, ebx, ebx, bx, bl, 16 | 638 DECLARE_REG 3, ebx, 16 |
544 DECLARE_REG 4, esi, esi, si, null, 20 | 639 DECLARE_REG 4, esi, 20 |
545 DECLARE_REG 5, edi, edi, di, null, 24 | 640 DECLARE_REG 5, edi, 24 |
546 DECLARE_REG 6, ebp, ebp, bp, null, 28 | 641 DECLARE_REG 6, ebp, 28 |
547 %define rsp esp | 642 %define rsp esp |
548 | 643 |
549 %macro DECLARE_ARG 1-* | 644 %macro DECLARE_ARG 1-* |
550 %rep %0 | 645 %rep %0 |
551 %define r%1m [esp + stack_offset + 4*%1 + 4] | 646 %define r%1m [rstk + stack_offset + 4*%1 + 4] |
552 %define r%1mp dword r%1m | 647 %define r%1mp dword r%1m |
553 %rotate 1 | 648 %rotate 1 |
554 %endrep | 649 %endrep |
555 %endmacro | 650 %endmacro |
556 | 651 |
557 DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 | 652 DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 |
558 | 653 |
559 %macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... | 654 %macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... |
560 %assign num_args %1 | 655 %assign num_args %1 |
561 %assign regs_used %2 | 656 %assign regs_used %2 |
| 657 ASSERT regs_used >= num_args |
| 658 %if num_args > 7 |
| 659 %assign num_args 7 |
| 660 %endif |
562 %if regs_used > 7 | 661 %if regs_used > 7 |
563 %assign regs_used 7 | 662 %assign regs_used 7 |
564 %endif | 663 %endif |
565 ASSERT regs_used >= num_args | 664 SETUP_STACK_POINTER %4 |
| 665 ASSERT regs_used <= 7 |
566 PUSH_IF_USED 3, 4, 5, 6 | 666 PUSH_IF_USED 3, 4, 5, 6 |
| 667 ALLOC_STACK %4 |
567 LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 | 668 LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 |
568 DEFINE_ARGS %4 | 669 DEFINE_ARGS_INTERNAL %0, %4, %5 |
569 %endmacro | 670 %endmacro |
570 | 671 |
| 672 %define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0 |
| 673 |
571 %macro RET 0 | 674 %macro RET 0 |
| 675 %if stack_size_padded > 0 |
| 676 %if required_stack_alignment > STACK_ALIGNMENT |
| 677 mov rsp, rstkm |
| 678 %else |
| 679 add rsp, stack_size_padded |
| 680 %endif |
| 681 %endif |
572 POP_IF_USED 6, 5, 4, 3 | 682 POP_IF_USED 6, 5, 4, 3 |
573 ret | 683 %if mmsize == 32 |
574 %endmacro | 684 vzeroupper |
575 | 685 %endif |
576 %macro REP_RET 0 | 686 AUTO_REP_RET |
577 %if regs_used > 3 | |
578 RET | |
579 %else | |
580 rep ret | |
581 %endif | |
582 %endmacro | 687 %endmacro |
583 | 688 |
584 %endif ;====================================================================== | 689 %endif ;====================================================================== |
585 | 690 |
586 %if WIN64 == 0 | 691 %if WIN64 == 0 |
587 %macro WIN64_SPILL_XMM 1 | 692 %macro WIN64_SPILL_XMM 1 |
588 %endmacro | 693 %endmacro |
589 %macro WIN64_RESTORE_XMM 1 | 694 %macro WIN64_RESTORE_XMM 1 |
590 %endmacro | 695 %endmacro |
| 696 %macro WIN64_PUSH_XMM 0 |
| 697 %endmacro |
591 %endif | 698 %endif |
592 | 699 |
| 700 ; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either |
| 701 ; a branch or a branch target. So switch to a 2-byte form of ret in that case. |
| 702 ; We can automatically detect "follows a branch", but not a branch target. |
| 703 ; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.) |
| 704 %macro REP_RET 0 |
| 705 %if has_epilogue |
| 706 RET |
| 707 %else |
| 708 rep ret |
| 709 %endif |
| 710 %endmacro |
| 711 |
| 712 %define last_branch_adr $$ |
| 713 %macro AUTO_REP_RET 0 |
| 714 %ifndef cpuflags |
| 715 times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ != last_branch_adr. |
| 716 %elif notcpuflag(ssse3) |
| 717 times ((last_branch_adr-$)>>31)+1 rep |
| 718 %endif |
| 719 ret |
| 720 %endmacro |
| 721 |
| 722 %macro BRANCH_INSTR 0-* |
| 723 %rep %0 |
| 724 %macro %1 1-2 %1 |
| 725 %2 %1 |
| 726 %%branch_instr: |
| 727 %xdefine last_branch_adr %%branch_instr |
| 728 %endmacro |
| 729 %rotate 1 |
| 730 %endrep |
| 731 %endmacro |
| 732 |
| 733 BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp |
| 734 |
| 735 %macro TAIL_CALL 2 ; callee, is_nonadjacent |
| 736 %if has_epilogue |
| 737 call %1 |
| 738 RET |
| 739 %elif %2 |
| 740 jmp %1 |
| 741 %endif |
| 742 %endmacro |
| 743 |
593 ;============================================================================= | 744 ;============================================================================= |
594 ; arch-independent part | 745 ; arch-independent part |
595 ;============================================================================= | 746 ;============================================================================= |
596 | 747 |
597 %assign function_align 16 | 748 %assign function_align 16 |
598 | 749 |
599 ; Begin a function. | 750 ; Begin a function. |
600 ; Applies any symbol mangling needed for C linkage, and sets up a define such that | 751 ; Applies any symbol mangling needed for C linkage, and sets up a define such that |
601 ; subsequent uses of the function name automatically refer to the mangled version. | 752 ; subsequent uses of the function name automatically refer to the mangled version. |
602 ; Appends cpuflags to the function name if cpuflags has been specified. | 753 ; Appends cpuflags to the function name if cpuflags has been specified. |
603 %macro cglobal 1-2+ ; name, [PROLOGUE args] | 754 ; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX |
604 %if %0 == 1 | 755 ; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2). |
605 cglobal_internal %1 %+ SUFFIX | 756 %macro cglobal 1-2+ "" ; name, [PROLOGUE args] |
606 %else | 757 cglobal_internal 1, %1 %+ SUFFIX, %2 |
607 cglobal_internal %1 %+ SUFFIX, %2 | |
608 %endif | |
609 %endmacro | 758 %endmacro |
610 %macro cglobal_internal 1-2+ | 759 %macro cvisible 1-2+ "" ; name, [PROLOGUE args] |
611 %ifndef cglobaled_%1 | 760 cglobal_internal 0, %1 %+ SUFFIX, %2 |
612 %xdefine %1 mangle(program_name %+ _ %+ %1) | 761 %endmacro |
613 %xdefine %1.skip_prologue %1 %+ .skip_prologue | 762 %macro cglobal_internal 2-3+ |
614 CAT_XDEFINE cglobaled_, %1, 1 | 763 %if %1 |
615 %endif | 764 %xdefine %%FUNCTION_PREFIX private_prefix |
616 %xdefine current_function %1 | 765 ; libvpx explicitly sets visibility in shared object builds. Avoid |
617 %ifdef CHROMIUM | 766 ; setting visibility to hidden as it may break builds that split |
618 %ifidn __OUTPUT_FORMAT__,elf | 767 ; sources on e.g., directory boundaries. |
619 global %1:function hidden | 768 %ifdef CHROMIUM |
620 %elifidn __OUTPUT_FORMAT__,elf32 | 769 %xdefine %%VISIBILITY hidden |
621 global %1:function hidden | |
622 %elifidn __OUTPUT_FORMAT__,elf64 | |
623 global %1:function hidden | |
624 %elifidn __OUTPUT_FORMAT__,macho32 | |
625 %ifdef __NASM_VER__ | |
626 global %1 | |
627 %else | |
628 global %1:private_extern | |
629 %endif | |
630 %elifidn __OUTPUT_FORMAT__,macho64 | |
631 %ifdef __NASM_VER__ | |
632 global %1 | |
633 %else | |
634 global %1:private_extern | |
635 %endif | |
636 %else | 770 %else |
637 global %1 | 771 %xdefine %%VISIBILITY |
638 %endif | 772 %endif |
639 %else | 773 %else |
640 global %1 | 774 %xdefine %%FUNCTION_PREFIX public_prefix |
| 775 %xdefine %%VISIBILITY |
| 776 %endif |
| 777 %ifndef cglobaled_%2 |
| 778 %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2) |
| 779 %xdefine %2.skip_prologue %2 %+ .skip_prologue |
| 780 CAT_XDEFINE cglobaled_, %2, 1 |
| 781 %endif |
| 782 %xdefine current_function %2 |
| 783 %ifidn __OUTPUT_FORMAT__,elf32 |
| 784 global %2:function %%VISIBILITY |
| 785 %elifidn __OUTPUT_FORMAT__,elf64 |
| 786 global %2:function %%VISIBILITY |
| 787 %elifidn __OUTPUT_FORMAT__,macho32 |
| 788 %ifdef __NASM_VER__ |
| 789 global %2 |
| 790 %else |
| 791 global %2:private_extern |
| 792 %endif |
| 793 %elifidn __OUTPUT_FORMAT__,macho64 |
| 794 %ifdef __NASM_VER__ |
| 795 global %2 |
| 796 %else |
| 797 global %2:private_extern |
| 798 %endif |
| 799 %else |
| 800 global %2 |
641 %endif | 801 %endif |
642 align function_align | 802 align function_align |
643 %1: | 803 %2: |
644 RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer | 804 RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer |
645 %assign stack_offset 0 | 805 %xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required |
646 %if %0 > 1 | 806 %assign stack_offset 0 ; stack pointer offset relative to the return address |
647 PROLOGUE %2 | 807 %assign stack_size 0 ; amount of stack space that can be freely used inside a function |
| 808 %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding |
| 809 %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 |
| 810 %ifnidn %3, "" |
| 811 PROLOGUE %3 |
648 %endif | 812 %endif |
649 %endmacro | 813 %endmacro |
650 | 814 |
651 %macro cextern 1 | 815 %macro cextern 1 |
652 %xdefine %1 mangle(program_name %+ _ %+ %1) | 816 %xdefine %1 mangle(private_prefix %+ _ %+ %1) |
653 CAT_XDEFINE cglobaled_, %1, 1 | 817 CAT_XDEFINE cglobaled_, %1, 1 |
654 extern %1 | 818 extern %1 |
655 %endmacro | 819 %endmacro |
656 | 820 |
657 ; like cextern, but without the prefix | 821 ; like cextern, but without the prefix |
658 %macro cextern_naked 1 | 822 %macro cextern_naked 1 |
659 %xdefine %1 mangle(%1) | 823 %xdefine %1 mangle(%1) |
660 CAT_XDEFINE cglobaled_, %1, 1 | 824 CAT_XDEFINE cglobaled_, %1, 1 |
661 extern %1 | 825 extern %1 |
662 %endmacro | 826 %endmacro |
663 | 827 |
664 %macro const 2+ | 828 %macro const 1-2+ |
665 %xdefine %1 mangle(program_name %+ _ %+ %1) | 829 %xdefine %1 mangle(private_prefix %+ _ %+ %1) |
666 global %1 | 830 %ifidn __OUTPUT_FORMAT__,elf32 |
| 831 global %1:data hidden |
| 832 %elifidn __OUTPUT_FORMAT__,elf64 |
| 833 global %1:data hidden |
| 834 %else |
| 835 global %1 |
| 836 %endif |
667 %1: %2 | 837 %1: %2 |
668 %endmacro | 838 %endmacro |
669 | 839 |
670 ; This is needed for ELF, otherwise the GNU linker assumes the stack is | 840 ; This is needed for ELF, otherwise the GNU linker assumes the stack is |
671 ; executable by default. | 841 ; executable by default. |
672 %ifidn __OUTPUT_FORMAT__,elf | 842 %ifidn __OUTPUT_FORMAT__,elf32 |
673 SECTION .note.GNU-stack noalloc noexec nowrite progbits | |
674 %elifidn __OUTPUT_FORMAT__,elf32 | |
675 SECTION .note.GNU-stack noalloc noexec nowrite progbits | 843 SECTION .note.GNU-stack noalloc noexec nowrite progbits |
676 %elifidn __OUTPUT_FORMAT__,elf64 | 844 %elifidn __OUTPUT_FORMAT__,elf64 |
677 SECTION .note.GNU-stack noalloc noexec nowrite progbits | 845 SECTION .note.GNU-stack noalloc noexec nowrite progbits |
678 %endif | 846 %endif |
679 | 847 |
680 ; cpuflags | 848 ; cpuflags |
681 | 849 |
682 %assign cpuflags_mmx (1<<0) | 850 %assign cpuflags_mmx (1<<0) |
683 %assign cpuflags_mmx2 (1<<1) | cpuflags_mmx | 851 %assign cpuflags_mmx2 (1<<1) | cpuflags_mmx |
684 %assign cpuflags_3dnow (1<<2) | cpuflags_mmx | 852 %assign cpuflags_3dnow (1<<2) | cpuflags_mmx |
685 %assign cpuflags_3dnow2 (1<<3) | cpuflags_3dnow | 853 %assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow |
686 %assign cpuflags_sse (1<<4) | cpuflags_mmx2 | 854 %assign cpuflags_sse (1<<4) | cpuflags_mmx2 |
687 %assign cpuflags_sse2 (1<<5) | cpuflags_sse | 855 %assign cpuflags_sse2 (1<<5) | cpuflags_sse |
688 %assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 | 856 %assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 |
689 %assign cpuflags_sse3 (1<<7) | cpuflags_sse2 | 857 %assign cpuflags_sse3 (1<<7) | cpuflags_sse2 |
690 %assign cpuflags_ssse3 (1<<8) | cpuflags_sse3 | 858 %assign cpuflags_ssse3 (1<<8) | cpuflags_sse3 |
691 %assign cpuflags_sse4 (1<<9) | cpuflags_ssse3 | 859 %assign cpuflags_sse4 (1<<9) | cpuflags_ssse3 |
692 %assign cpuflags_sse42 (1<<10)| cpuflags_sse4 | 860 %assign cpuflags_sse42 (1<<10)| cpuflags_sse4 |
693 %assign cpuflags_avx (1<<11)| cpuflags_sse42 | 861 %assign cpuflags_avx (1<<11)| cpuflags_sse42 |
694 %assign cpuflags_xop (1<<12)| cpuflags_avx | 862 %assign cpuflags_xop (1<<12)| cpuflags_avx |
695 %assign cpuflags_fma4 (1<<13)| cpuflags_avx | 863 %assign cpuflags_fma4 (1<<13)| cpuflags_avx |
| 864 %assign cpuflags_fma3 (1<<14)| cpuflags_avx |
| 865 %assign cpuflags_avx2 (1<<15)| cpuflags_fma3 |
696 | 866 |
697 %assign cpuflags_cache32 (1<<16) | 867 %assign cpuflags_cache32 (1<<16) |
698 %assign cpuflags_cache64 (1<<17) | 868 %assign cpuflags_cache64 (1<<17) |
699 %assign cpuflags_slowctz (1<<18) | 869 %assign cpuflags_slowctz (1<<18) |
700 %assign cpuflags_lzcnt (1<<19) | 870 %assign cpuflags_lzcnt (1<<19) |
701 %assign cpuflags_misalign (1<<20) | 871 %assign cpuflags_aligned (1<<20) ; not a cpu feature, but a function variant |
702 %assign cpuflags_aligned (1<<21) ; not a cpu feature, but a function variant | 872 %assign cpuflags_atom (1<<21) |
703 %assign cpuflags_atom (1<<22) | 873 %assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt |
| 874 %assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1 |
704 | 875 |
705 %define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x)) | 876 %define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x)) |
706 %define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x)) | 877 %define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x)) |
707 | 878 |
708 ; Takes up to 2 cpuflags from the above list. | 879 ; Takes an arbitrary number of cpuflags from the above list. |
709 ; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu. | 880 ; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu. |
710 ; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co. | 881 ; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co. |
711 %macro INIT_CPUFLAGS 0-2 | 882 %macro INIT_CPUFLAGS 0-* |
| 883 %xdefine SUFFIX |
| 884 %undef cpuname |
| 885 %assign cpuflags 0 |
| 886 |
712 %if %0 >= 1 | 887 %if %0 >= 1 |
713 %xdefine cpuname %1 | 888 %rep %0 |
714 %assign cpuflags cpuflags_%1 | 889 %ifdef cpuname |
715 %if %0 >= 2 | 890 %xdefine cpuname cpuname %+ _%1 |
716 %xdefine cpuname %1_%2 | 891 %else |
717 %assign cpuflags cpuflags | cpuflags_%2 | 892 %xdefine cpuname %1 |
718 %endif | 893 %endif |
| 894 %assign cpuflags cpuflags | cpuflags_%1 |
| 895 %rotate 1 |
| 896 %endrep |
719 %xdefine SUFFIX _ %+ cpuname | 897 %xdefine SUFFIX _ %+ cpuname |
| 898 |
720 %if cpuflag(avx) | 899 %if cpuflag(avx) |
721 %assign avx_enabled 1 | 900 %assign avx_enabled 1 |
722 %endif | 901 %endif |
723 %if mmsize == 16 && notcpuflag(sse2) | 902 %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2)) |
724 %define mova movaps | 903 %define mova movaps |
725 %define movu movups | 904 %define movu movups |
726 %define movnta movntps | 905 %define movnta movntps |
727 %endif | 906 %endif |
728 %if cpuflag(aligned) | 907 %if cpuflag(aligned) |
729 %define movu mova | 908 %define movu mova |
730 %elifidn %1, sse3 | 909 %elif cpuflag(sse3) && notcpuflag(ssse3) |
731 %define movu lddqu | 910 %define movu lddqu |
732 %endif | 911 %endif |
| 912 %endif |
| 913 |
| 914 %ifdef __NASM_VER__ |
| 915 %use smartalign |
| 916 ALIGNMODE k7 |
| 917 %elif ARCH_X86_64 || cpuflag(sse2) |
| 918 CPU amdnop |
733 %else | 919 %else |
734 %xdefine SUFFIX | 920 CPU basicnop |
735 %undef cpuname | |
736 %undef cpuflags | |
737 %endif | 921 %endif |
738 %endmacro | 922 %endmacro |
739 | 923 |
740 ; merge mmx and sse* | 924 ; Merge mmx and sse* |
| 925 ; m# is a simd register of the currently selected size |
| 926 ; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m# |
| 927 ; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m# |
| 928 ; (All 3 remain in sync through SWAP.) |
741 | 929 |
742 %macro CAT_XDEFINE 3 | 930 %macro CAT_XDEFINE 3 |
743 %xdefine %1%2 %3 | 931 %xdefine %1%2 %3 |
744 %endmacro | 932 %endmacro |
745 | 933 |
746 %macro CAT_UNDEF 2 | 934 %macro CAT_UNDEF 2 |
747 %undef %1%2 | 935 %undef %1%2 |
748 %endmacro | 936 %endmacro |
749 | 937 |
750 %macro INIT_MMX 0-1+ | 938 %macro INIT_MMX 0-1+ |
751 %assign avx_enabled 0 | 939 %assign avx_enabled 0 |
752 %define RESET_MM_PERMUTATION INIT_MMX %1 | 940 %define RESET_MM_PERMUTATION INIT_MMX %1 |
753 %define mmsize 8 | 941 %define mmsize 8 |
754 %define num_mmregs 8 | 942 %define num_mmregs 8 |
755 %define mova movq | 943 %define mova movq |
756 %define movu movq | 944 %define movu movq |
757 %define movh movd | 945 %define movh movd |
758 %define movnta movntq | 946 %define movnta movntq |
759 %assign %%i 0 | 947 %assign %%i 0 |
760 %rep 8 | 948 %rep 8 |
761 CAT_XDEFINE m, %%i, mm %+ %%i | 949 CAT_XDEFINE m, %%i, mm %+ %%i |
762 CAT_XDEFINE nmm, %%i, %%i | 950 CAT_XDEFINE nnmm, %%i, %%i |
763 %assign %%i %%i+1 | 951 %assign %%i %%i+1 |
764 %endrep | 952 %endrep |
765 %rep 8 | 953 %rep 8 |
766 CAT_UNDEF m, %%i | 954 CAT_UNDEF m, %%i |
767 CAT_UNDEF nmm, %%i | 955 CAT_UNDEF nnmm, %%i |
768 %assign %%i %%i+1 | 956 %assign %%i %%i+1 |
769 %endrep | 957 %endrep |
770 INIT_CPUFLAGS %1 | 958 INIT_CPUFLAGS %1 |
771 %endmacro | 959 %endmacro |
772 | 960 |
773 %macro INIT_XMM 0-1+ | 961 %macro INIT_XMM 0-1+ |
774 %assign avx_enabled 0 | 962 %assign avx_enabled 0 |
775 %define RESET_MM_PERMUTATION INIT_XMM %1 | 963 %define RESET_MM_PERMUTATION INIT_XMM %1 |
776 %define mmsize 16 | 964 %define mmsize 16 |
777 %define num_mmregs 8 | 965 %define num_mmregs 8 |
778 %if ARCH_X86_64 | 966 %if ARCH_X86_64 |
779 %define num_mmregs 16 | 967 %define num_mmregs 16 |
780 %endif | 968 %endif |
781 %define mova movdqa | 969 %define mova movdqa |
782 %define movu movdqu | 970 %define movu movdqu |
783 %define movh movq | 971 %define movh movq |
784 %define movnta movntdq | 972 %define movnta movntdq |
785 %assign %%i 0 | 973 %assign %%i 0 |
786 %rep num_mmregs | 974 %rep num_mmregs |
787 CAT_XDEFINE m, %%i, xmm %+ %%i | 975 CAT_XDEFINE m, %%i, xmm %+ %%i |
788 CAT_XDEFINE nxmm, %%i, %%i | 976 CAT_XDEFINE nnxmm, %%i, %%i |
789 %assign %%i %%i+1 | 977 %assign %%i %%i+1 |
790 %endrep | 978 %endrep |
791 INIT_CPUFLAGS %1 | 979 INIT_CPUFLAGS %1 |
792 %endmacro | 980 %endmacro |
793 | 981 |
794 ; FIXME: INIT_AVX can be replaced by INIT_XMM avx | |
795 %macro INIT_AVX 0 | |
796 INIT_XMM | |
797 %assign avx_enabled 1 | |
798 %define PALIGNR PALIGNR_SSSE3 | |
799 %define RESET_MM_PERMUTATION INIT_AVX | |
800 %endmacro | |
801 | |
802 %macro INIT_YMM 0-1+ | 982 %macro INIT_YMM 0-1+ |
803 %assign avx_enabled 1 | 983 %assign avx_enabled 1 |
804 %define RESET_MM_PERMUTATION INIT_YMM %1 | 984 %define RESET_MM_PERMUTATION INIT_YMM %1 |
805 %define mmsize 32 | 985 %define mmsize 32 |
806 %define num_mmregs 8 | 986 %define num_mmregs 8 |
807 %if ARCH_X86_64 | 987 %if ARCH_X86_64 |
808 %define num_mmregs 16 | 988 %define num_mmregs 16 |
809 %endif | 989 %endif |
810 %define mova vmovaps | 990 %define mova movdqa |
811 %define movu vmovups | 991 %define movu movdqu |
812 %undef movh | 992 %undef movh |
813 %define movnta vmovntps | 993 %define movnta movntdq |
814 %assign %%i 0 | 994 %assign %%i 0 |
815 %rep num_mmregs | 995 %rep num_mmregs |
816 CAT_XDEFINE m, %%i, ymm %+ %%i | 996 CAT_XDEFINE m, %%i, ymm %+ %%i |
817 CAT_XDEFINE nymm, %%i, %%i | 997 CAT_XDEFINE nnymm, %%i, %%i |
818 %assign %%i %%i+1 | 998 %assign %%i %%i+1 |
819 %endrep | 999 %endrep |
820 INIT_CPUFLAGS %1 | 1000 INIT_CPUFLAGS %1 |
821 %endmacro | 1001 %endmacro |
822 | 1002 |
823 INIT_XMM | 1003 INIT_XMM |
824 | 1004 |
| 1005 %macro DECLARE_MMCAST 1 |
| 1006 %define mmmm%1 mm%1 |
| 1007 %define mmxmm%1 mm%1 |
| 1008 %define mmymm%1 mm%1 |
| 1009 %define xmmmm%1 mm%1 |
| 1010 %define xmmxmm%1 xmm%1 |
| 1011 %define xmmymm%1 xmm%1 |
| 1012 %define ymmmm%1 mm%1 |
| 1013 %define ymmxmm%1 xmm%1 |
| 1014 %define ymmymm%1 ymm%1 |
| 1015 %define xm%1 xmm %+ m%1 |
| 1016 %define ym%1 ymm %+ m%1 |
| 1017 %endmacro |
| 1018 |
| 1019 %assign i 0 |
| 1020 %rep 16 |
| 1021 DECLARE_MMCAST i |
| 1022 %assign i i+1 |
| 1023 %endrep |
| 1024 |
825 ; I often want to use macros that permute their arguments. e.g. there's no | 1025 ; I often want to use macros that permute their arguments. e.g. there's no |
826 ; efficient way to implement butterfly or transpose or dct without swapping some | 1026 ; efficient way to implement butterfly or transpose or dct without swapping some |
827 ; arguments. | 1027 ; arguments. |
828 ; | 1028 ; |
829 ; I would like to not have to manually keep track of the permutations: | 1029 ; I would like to not have to manually keep track of the permutations: |
830 ; If I insert a permutation in the middle of a function, it should automatically | 1030 ; If I insert a permutation in the middle of a function, it should automatically |
831 ; change everything that follows. For more complex macros I may also have multiple | 1031 ; change everything that follows. For more complex macros I may also have multiple |
832 ; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. | 1032 ; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. |
833 ; | 1033 ; |
834 ; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that | 1034 ; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that |
835 ; permutes its arguments. It's equivalent to exchanging the contents of the | 1035 ; permutes its arguments. It's equivalent to exchanging the contents of the |
836 ; registers, except that this way you exchange the register names instead, so it | 1036 ; registers, except that this way you exchange the register names instead, so it |
837 ; doesn't cost any cycles. | 1037 ; doesn't cost any cycles. |
838 | 1038 |
839 %macro PERMUTE 2-* ; takes a list of pairs to swap | 1039 %macro PERMUTE 2-* ; takes a list of pairs to swap |
840 %rep %0/2 | 1040 %rep %0/2 |
841 %xdefine tmp%2 m%2 | 1041 %xdefine %%tmp%2 m%2 |
842 %xdefine ntmp%2 nm%2 | |
843 %rotate 2 | 1042 %rotate 2 |
844 %endrep | 1043 %endrep |
845 %rep %0/2 | 1044 %rep %0/2 |
846 %xdefine m%1 tmp%2 | 1045 %xdefine m%1 %%tmp%2 |
847 %xdefine nm%1 ntmp%2 | 1046 CAT_XDEFINE nn, m%1, %1 |
848 %undef tmp%2 | |
849 %undef ntmp%2 | |
850 %rotate 2 | 1047 %rotate 2 |
851 %endrep | 1048 %endrep |
852 %endmacro | 1049 %endmacro |
853 | 1050 |
854 %macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs) | 1051 %macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs) |
855 %rep %0-1 | 1052 %ifnum %1 ; SWAP 0, 1, ... |
856 %ifdef m%1 | 1053 SWAP_INTERNAL_NUM %1, %2 |
857 %xdefine tmp m%1 | 1054 %else ; SWAP m0, m1, ... |
858 %xdefine m%1 m%2 | 1055 SWAP_INTERNAL_NAME %1, %2 |
859 %xdefine m%2 tmp | |
860 CAT_XDEFINE n, m%1, %1 | |
861 CAT_XDEFINE n, m%2, %2 | |
862 %else | |
863 ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here. | |
864 ; Be careful using this mode in nested macros though, as in some cases there may be | |
865 ; other copies of m# that have already been dereferenced and don't get updated correctly. | |
866 %xdefine %%n1 n %+ %1 | |
867 %xdefine %%n2 n %+ %2 | |
868 %xdefine tmp m %+ %%n1 | |
869 CAT_XDEFINE m, %%n1, m %+ %%n2 | |
870 CAT_XDEFINE m, %%n2, tmp | |
871 CAT_XDEFINE n, m %+ %%n1, %%n1 | |
872 CAT_XDEFINE n, m %+ %%n2, %%n2 | |
873 %endif | 1056 %endif |
874 %undef tmp | 1057 %endmacro |
| 1058 |
| 1059 %macro SWAP_INTERNAL_NUM 2-* |
| 1060 %rep %0-1 |
| 1061 %xdefine %%tmp m%1 |
| 1062 %xdefine m%1 m%2 |
| 1063 %xdefine m%2 %%tmp |
| 1064 CAT_XDEFINE nn, m%1, %1 |
| 1065 CAT_XDEFINE nn, m%2, %2 |
875 %rotate 1 | 1066 %rotate 1 |
876 %endrep | 1067 %endrep |
| 1068 %endmacro |
| 1069 |
| 1070 %macro SWAP_INTERNAL_NAME 2-* |
| 1071 %xdefine %%args nn %+ %1 |
| 1072 %rep %0-1 |
| 1073 %xdefine %%args %%args, nn %+ %2 |
| 1074 %rotate 1 |
| 1075 %endrep |
| 1076 SWAP_INTERNAL_NUM %%args |
877 %endmacro | 1077 %endmacro |
878 | 1078 |
879 ; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later | 1079 ; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later |
880 ; calls to that function will automatically load the permutation, so values can | 1080 ; calls to that function will automatically load the permutation, so values can |
881 ; be returned in mmregs. | 1081 ; be returned in mmregs. |
882 %macro SAVE_MM_PERMUTATION 0-1 | 1082 %macro SAVE_MM_PERMUTATION 0-1 |
883 %if %0 | 1083 %if %0 |
884 %xdefine %%f %1_m | 1084 %xdefine %%f %1_m |
885 %else | 1085 %else |
886 %xdefine %%f current_function %+ _m | 1086 %xdefine %%f current_function %+ _m |
887 %endif | 1087 %endif |
888 %assign %%i 0 | 1088 %assign %%i 0 |
889 %rep num_mmregs | 1089 %rep num_mmregs |
890 CAT_XDEFINE %%f, %%i, m %+ %%i | 1090 CAT_XDEFINE %%f, %%i, m %+ %%i |
891 %assign %%i %%i+1 | 1091 %assign %%i %%i+1 |
892 %endrep | 1092 %endrep |
893 %endmacro | 1093 %endmacro |
894 | 1094 |
895 %macro LOAD_MM_PERMUTATION 1 ; name to load from | 1095 %macro LOAD_MM_PERMUTATION 1 ; name to load from |
896 %ifdef %1_m0 | 1096 %ifdef %1_m0 |
897 %assign %%i 0 | 1097 %assign %%i 0 |
898 %rep num_mmregs | 1098 %rep num_mmregs |
899 CAT_XDEFINE m, %%i, %1_m %+ %%i | 1099 CAT_XDEFINE m, %%i, %1_m %+ %%i |
900 CAT_XDEFINE n, m %+ %%i, %%i | 1100 CAT_XDEFINE nn, m %+ %%i, %%i |
901 %assign %%i %%i+1 | 1101 %assign %%i %%i+1 |
902 %endrep | 1102 %endrep |
903 %endif | 1103 %endif |
904 %endmacro | 1104 %endmacro |
905 | 1105 |
906 ; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't | 1106 ; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't |
907 %macro call 1 | 1107 %macro call 1 |
908 call_internal %1, %1 %+ SUFFIX | 1108 call_internal %1, %1 %+ SUFFIX |
909 %endmacro | 1109 %endmacro |
910 %macro call_internal 2 | 1110 %macro call_internal 2 |
(...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
951 %rep 16 | 1151 %rep 16 |
952 %if i < 8 | 1152 %if i < 8 |
953 CAT_XDEFINE sizeofmm, i, 8 | 1153 CAT_XDEFINE sizeofmm, i, 8 |
954 %endif | 1154 %endif |
955 CAT_XDEFINE sizeofxmm, i, 16 | 1155 CAT_XDEFINE sizeofxmm, i, 16 |
956 CAT_XDEFINE sizeofymm, i, 32 | 1156 CAT_XDEFINE sizeofymm, i, 32 |
957 %assign i i+1 | 1157 %assign i i+1 |
958 %endrep | 1158 %endrep |
959 %undef i | 1159 %undef i |
960 | 1160 |
| 1161 %macro CHECK_AVX_INSTR_EMU 3-* |
| 1162 %xdefine %%opcode %1 |
| 1163 %xdefine %%dst %2 |
| 1164 %rep %0-2 |
| 1165 %ifidn %%dst, %3 |
| 1166 %error non-avx emulation of ``%%opcode'' is not supported |
| 1167 %endif |
| 1168 %rotate 1 |
| 1169 %endrep |
| 1170 %endmacro |
| 1171 |
961 ;%1 == instruction | 1172 ;%1 == instruction |
962 ;%2 == 1 if float, 0 if int | 1173 ;%2 == minimal instruction set |
963 ;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm) | 1174 ;%3 == 1 if float, 0 if int |
964 ;%4 == number of operands given | 1175 ;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise |
965 ;%5+: operands | 1176 ;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not |
966 %macro RUN_AVX_INSTR 6-7+ | 1177 ;%6+: operands |
967 %ifid %5 | 1178 %macro RUN_AVX_INSTR 6-9+ |
968 %define %%size sizeof%5 | 1179 %ifnum sizeof%7 |
| 1180 %assign __sizeofreg sizeof%7 |
| 1181 %elifnum sizeof%6 |
| 1182 %assign __sizeofreg sizeof%6 |
969 %else | 1183 %else |
970 %define %%size mmsize | 1184 %assign __sizeofreg mmsize |
971 %endif | 1185 %endif |
972 %if %%size==32 | 1186 %assign __emulate_avx 0 |
973 %if %0 >= 7 | 1187 %if avx_enabled && __sizeofreg >= 16 |
974 v%1 %5, %6, %7 | 1188 %xdefine __instr v%1 |
| 1189 %else |
| 1190 %xdefine __instr %1 |
| 1191 %if %0 >= 8+%4 |
| 1192 %assign __emulate_avx 1 |
| 1193 %endif |
| 1194 %endif |
| 1195 %ifnidn %2, fnord |
| 1196 %ifdef cpuname |
| 1197 %if notcpuflag(%2) |
| 1198 %error use of ``%1'' %2 instruction in cpuname function: current_function |
| 1199 %elif cpuflags_%2 < cpuflags_sse && notcpuflag(sse2) && __sizeofreg > 8 |
| 1200 %error use of ``%1'' sse2 instruction in cpuname function: current_function |
| 1201 %endif |
| 1202 %endif |
| 1203 %endif |
| 1204 |
| 1205 %if __emulate_avx |
| 1206 %xdefine __src1 %7 |
| 1207 %xdefine __src2 %8 |
| 1208 %ifnidn %6, %7 |
| 1209 %if %0 >= 9 |
| 1210 CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, %8, %9 |
| 1211 %else |
| 1212 CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, %8 |
| 1213 %endif |
| 1214 %if %5 && %4 == 0 |
| 1215 %ifnid %8 |
| 1216 ; 3-operand AVX instructions with a memory arg can only have it in src2, |
| 1217 ; whereas SSE emulation prefers to have it in src1 (i.e. the mov). |
| 1218 ; So, if the instruction is commutative with a memory arg, swap them. |
| 1219 %xdefine __src1 %8 |
| 1220 %xdefine __src2 %7 |
| 1221 %endif |
| 1222 %endif |
| 1223 %if __sizeofreg == 8 |
| 1224 MOVQ %6, __src1 |
| 1225 %elif %3 |
| 1226 MOVAPS %6, __src1 |
| 1227 %else |
| 1228 MOVDQA %6, __src1 |
| 1229 %endif |
| 1230 %endif |
| 1231 %if %0 >= 9 |
| 1232 %1 %6, __src2, %9 |
975 %else | 1233 %else |
976 v%1 %5, %6 | 1234 %1 %6, __src2 |
977 %endif | 1235 %endif |
| 1236 %elif %0 >= 9 |
| 1237 __instr %6, %7, %8, %9 |
| 1238 %elif %0 == 8 |
| 1239 __instr %6, %7, %8 |
| 1240 %elif %0 == 7 |
| 1241 __instr %6, %7 |
978 %else | 1242 %else |
979 %if %%size==8 | 1243 __instr %6 |
980 %define %%regmov movq | |
981 %elif %2 | |
982 %define %%regmov movaps | |
983 %else | |
984 %define %%regmov movdqa | |
985 %endif | |
986 | |
987 %if %4>=3+%3 | |
988 %ifnidn %5, %6 | |
989 %if avx_enabled && sizeof%5==16 | |
990 v%1 %5, %6, %7 | |
991 %else | |
992 %%regmov %5, %6 | |
993 %1 %5, %7 | |
994 %endif | |
995 %else | |
996 %1 %5, %7 | |
997 %endif | |
998 %elif %3 | |
999 %1 %5, %6, %7 | |
1000 %else | |
1001 %1 %5, %6 | |
1002 %endif | |
1003 %endif | 1244 %endif |
1004 %endmacro | 1245 %endmacro |
1005 | 1246 |
1006 ; 3arg AVX ops with a memory arg can only have it in src2, | |
1007 ; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov). | |
1008 ; So, if the op is symmetric and the wrong one is memory, swap them. | |
1009 %macro RUN_AVX_INSTR1 8 | |
1010 %assign %%swap 0 | |
1011 %if avx_enabled | |
1012 %ifnid %6 | |
1013 %assign %%swap 1 | |
1014 %endif | |
1015 %elifnidn %5, %6 | |
1016 %ifnid %7 | |
1017 %assign %%swap 1 | |
1018 %endif | |
1019 %endif | |
1020 %if %%swap && %3 == 0 && %8 == 1 | |
1021 RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6 | |
1022 %else | |
1023 RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7 | |
1024 %endif | |
1025 %endmacro | |
1026 | |
1027 ;%1 == instruction | 1247 ;%1 == instruction |
1028 ;%2 == 1 if float, 0 if int | 1248 ;%2 == minimal instruction set |
1029 ;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 3-operand (xmm, xmm, xmm) | 1249 ;%3 == 1 if float, 0 if int |
1030 ;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not | 1250 ;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise |
1031 %macro AVX_INSTR 4 | 1251 ;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not |
1032 %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4 | 1252 %macro AVX_INSTR 1-5 fnord, 0, 1, 0 |
1033 %ifidn %3, fnord | 1253 %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5 |
1034 RUN_AVX_INSTR %6, %7, %8, 2, %1, %2 | 1254 %ifidn %2, fnord |
| 1255 RUN_AVX_INSTR %6, %7, %8, %9, %10, %1 |
| 1256 %elifidn %3, fnord |
| 1257 RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2 |
1035 %elifidn %4, fnord | 1258 %elifidn %4, fnord |
1036 RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9 | 1259 RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3 |
1037 %elifidn %5, fnord | 1260 %elifidn %5, fnord |
1038 RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4 | 1261 RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4 |
1039 %else | 1262 %else |
1040 RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5 | 1263 RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5 |
1041 %endif | 1264 %endif |
1042 %endmacro | 1265 %endmacro |
1043 %endmacro | 1266 %endmacro |
1044 | 1267 |
1045 AVX_INSTR addpd, 1, 0, 1 | 1268 ; Instructions with both VEX and non-VEX encodings |
1046 AVX_INSTR addps, 1, 0, 1 | 1269 ; Non-destructive instructions are written without parameters |
1047 AVX_INSTR addsd, 1, 0, 1 | 1270 AVX_INSTR addpd, sse2, 1, 0, 1 |
1048 AVX_INSTR addss, 1, 0, 1 | 1271 AVX_INSTR addps, sse, 1, 0, 1 |
1049 AVX_INSTR addsubpd, 1, 0, 0 | 1272 AVX_INSTR addsd, sse2, 1, 0, 1 |
1050 AVX_INSTR addsubps, 1, 0, 0 | 1273 AVX_INSTR addss, sse, 1, 0, 1 |
1051 AVX_INSTR andpd, 1, 0, 1 | 1274 AVX_INSTR addsubpd, sse3, 1, 0, 0 |
1052 AVX_INSTR andps, 1, 0, 1 | 1275 AVX_INSTR addsubps, sse3, 1, 0, 0 |
1053 AVX_INSTR andnpd, 1, 0, 0 | 1276 AVX_INSTR aesdec, fnord, 0, 0, 0 |
1054 AVX_INSTR andnps, 1, 0, 0 | 1277 AVX_INSTR aesdeclast, fnord, 0, 0, 0 |
1055 AVX_INSTR blendpd, 1, 0, 0 | 1278 AVX_INSTR aesenc, fnord, 0, 0, 0 |
1056 AVX_INSTR blendps, 1, 0, 0 | 1279 AVX_INSTR aesenclast, fnord, 0, 0, 0 |
1057 AVX_INSTR blendvpd, 1, 0, 0 | 1280 AVX_INSTR aesimc |
1058 AVX_INSTR blendvps, 1, 0, 0 | 1281 AVX_INSTR aeskeygenassist |
1059 AVX_INSTR cmppd, 1, 0, 0 | 1282 AVX_INSTR andnpd, sse2, 1, 0, 0 |
1060 AVX_INSTR cmpps, 1, 0, 0 | 1283 AVX_INSTR andnps, sse, 1, 0, 0 |
1061 AVX_INSTR cmpsd, 1, 0, 0 | 1284 AVX_INSTR andpd, sse2, 1, 0, 1 |
1062 AVX_INSTR cmpss, 1, 0, 0 | 1285 AVX_INSTR andps, sse, 1, 0, 1 |
1063 AVX_INSTR cvtdq2ps, 1, 0, 0 | 1286 AVX_INSTR blendpd, sse4, 1, 0, 0 |
1064 AVX_INSTR cvtps2dq, 1, 0, 0 | 1287 AVX_INSTR blendps, sse4, 1, 0, 0 |
1065 AVX_INSTR divpd, 1, 0, 0 | 1288 AVX_INSTR blendvpd, sse4, 1, 0, 0 |
1066 AVX_INSTR divps, 1, 0, 0 | 1289 AVX_INSTR blendvps, sse4, 1, 0, 0 |
1067 AVX_INSTR divsd, 1, 0, 0 | 1290 AVX_INSTR cmppd, sse2, 1, 1, 0 |
1068 AVX_INSTR divss, 1, 0, 0 | 1291 AVX_INSTR cmpps, sse, 1, 1, 0 |
1069 AVX_INSTR dppd, 1, 1, 0 | 1292 AVX_INSTR cmpsd, sse2, 1, 1, 0 |
1070 AVX_INSTR dpps, 1, 1, 0 | 1293 AVX_INSTR cmpss, sse, 1, 1, 0 |
1071 AVX_INSTR haddpd, 1, 0, 0 | 1294 AVX_INSTR comisd, sse2 |
1072 AVX_INSTR haddps, 1, 0, 0 | 1295 AVX_INSTR comiss, sse |
1073 AVX_INSTR hsubpd, 1, 0, 0 | 1296 AVX_INSTR cvtdq2pd, sse2 |
1074 AVX_INSTR hsubps, 1, 0, 0 | 1297 AVX_INSTR cvtdq2ps, sse2 |
1075 AVX_INSTR maxpd, 1, 0, 1 | 1298 AVX_INSTR cvtpd2dq, sse2 |
1076 AVX_INSTR maxps, 1, 0, 1 | 1299 AVX_INSTR cvtpd2ps, sse2 |
1077 AVX_INSTR maxsd, 1, 0, 1 | 1300 AVX_INSTR cvtps2dq, sse2 |
1078 AVX_INSTR maxss, 1, 0, 1 | 1301 AVX_INSTR cvtps2pd, sse2 |
1079 AVX_INSTR minpd, 1, 0, 1 | 1302 AVX_INSTR cvtsd2si, sse2 |
1080 AVX_INSTR minps, 1, 0, 1 | 1303 AVX_INSTR cvtsd2ss, sse2 |
1081 AVX_INSTR minsd, 1, 0, 1 | 1304 AVX_INSTR cvtsi2sd, sse2 |
1082 AVX_INSTR minss, 1, 0, 1 | 1305 AVX_INSTR cvtsi2ss, sse |
1083 AVX_INSTR movhlps, 1, 0, 0 | 1306 AVX_INSTR cvtss2sd, sse2 |
1084 AVX_INSTR movlhps, 1, 0, 0 | 1307 AVX_INSTR cvtss2si, sse |
1085 AVX_INSTR movsd, 1, 0, 0 | 1308 AVX_INSTR cvttpd2dq, sse2 |
1086 AVX_INSTR movss, 1, 0, 0 | 1309 AVX_INSTR cvttps2dq, sse2 |
1087 AVX_INSTR mpsadbw, 0, 1, 0 | 1310 AVX_INSTR cvttsd2si, sse2 |
1088 AVX_INSTR mulpd, 1, 0, 1 | 1311 AVX_INSTR cvttss2si, sse |
1089 AVX_INSTR mulps, 1, 0, 1 | 1312 AVX_INSTR divpd, sse2, 1, 0, 0 |
1090 AVX_INSTR mulsd, 1, 0, 1 | 1313 AVX_INSTR divps, sse, 1, 0, 0 |
1091 AVX_INSTR mulss, 1, 0, 1 | 1314 AVX_INSTR divsd, sse2, 1, 0, 0 |
1092 AVX_INSTR orpd, 1, 0, 1 | 1315 AVX_INSTR divss, sse, 1, 0, 0 |
1093 AVX_INSTR orps, 1, 0, 1 | 1316 AVX_INSTR dppd, sse4, 1, 1, 0 |
1094 AVX_INSTR packsswb, 0, 0, 0 | 1317 AVX_INSTR dpps, sse4, 1, 1, 0 |
1095 AVX_INSTR packssdw, 0, 0, 0 | 1318 AVX_INSTR extractps, sse4 |
1096 AVX_INSTR packuswb, 0, 0, 0 | 1319 AVX_INSTR haddpd, sse3, 1, 0, 0 |
1097 AVX_INSTR packusdw, 0, 0, 0 | 1320 AVX_INSTR haddps, sse3, 1, 0, 0 |
1098 AVX_INSTR paddb, 0, 0, 1 | 1321 AVX_INSTR hsubpd, sse3, 1, 0, 0 |
1099 AVX_INSTR paddw, 0, 0, 1 | 1322 AVX_INSTR hsubps, sse3, 1, 0, 0 |
1100 AVX_INSTR paddd, 0, 0, 1 | 1323 AVX_INSTR insertps, sse4, 1, 1, 0 |
1101 AVX_INSTR paddq, 0, 0, 1 | 1324 AVX_INSTR lddqu, sse3 |
1102 AVX_INSTR paddsb, 0, 0, 1 | 1325 AVX_INSTR ldmxcsr, sse |
1103 AVX_INSTR paddsw, 0, 0, 1 | 1326 AVX_INSTR maskmovdqu, sse2 |
1104 AVX_INSTR paddusb, 0, 0, 1 | 1327 AVX_INSTR maxpd, sse2, 1, 0, 1 |
1105 AVX_INSTR paddusw, 0, 0, 1 | 1328 AVX_INSTR maxps, sse, 1, 0, 1 |
1106 AVX_INSTR palignr, 0, 1, 0 | 1329 AVX_INSTR maxsd, sse2, 1, 0, 1 |
1107 AVX_INSTR pand, 0, 0, 1 | 1330 AVX_INSTR maxss, sse, 1, 0, 1 |
1108 AVX_INSTR pandn, 0, 0, 0 | 1331 AVX_INSTR minpd, sse2, 1, 0, 1 |
1109 AVX_INSTR pavgb, 0, 0, 1 | 1332 AVX_INSTR minps, sse, 1, 0, 1 |
1110 AVX_INSTR pavgw, 0, 0, 1 | 1333 AVX_INSTR minsd, sse2, 1, 0, 1 |
1111 AVX_INSTR pblendvb, 0, 0, 0 | 1334 AVX_INSTR minss, sse, 1, 0, 1 |
1112 AVX_INSTR pblendw, 0, 1, 0 | 1335 AVX_INSTR movapd, sse2 |
1113 AVX_INSTR pcmpestri, 0, 0, 0 | 1336 AVX_INSTR movaps, sse |
1114 AVX_INSTR pcmpestrm, 0, 0, 0 | 1337 AVX_INSTR movd, mmx |
1115 AVX_INSTR pcmpistri, 0, 0, 0 | 1338 AVX_INSTR movddup, sse3 |
1116 AVX_INSTR pcmpistrm, 0, 0, 0 | 1339 AVX_INSTR movdqa, sse2 |
1117 AVX_INSTR pcmpeqb, 0, 0, 1 | 1340 AVX_INSTR movdqu, sse2 |
1118 AVX_INSTR pcmpeqw, 0, 0, 1 | 1341 AVX_INSTR movhlps, sse, 1, 0, 0 |
1119 AVX_INSTR pcmpeqd, 0, 0, 1 | 1342 AVX_INSTR movhpd, sse2, 1, 0, 0 |
1120 AVX_INSTR pcmpeqq, 0, 0, 1 | 1343 AVX_INSTR movhps, sse, 1, 0, 0 |
1121 AVX_INSTR pcmpgtb, 0, 0, 0 | 1344 AVX_INSTR movlhps, sse, 1, 0, 0 |
1122 AVX_INSTR pcmpgtw, 0, 0, 0 | 1345 AVX_INSTR movlpd, sse2, 1, 0, 0 |
1123 AVX_INSTR pcmpgtd, 0, 0, 0 | 1346 AVX_INSTR movlps, sse, 1, 0, 0 |
1124 AVX_INSTR pcmpgtq, 0, 0, 0 | 1347 AVX_INSTR movmskpd, sse2 |
1125 AVX_INSTR phaddw, 0, 0, 0 | 1348 AVX_INSTR movmskps, sse |
1126 AVX_INSTR phaddd, 0, 0, 0 | 1349 AVX_INSTR movntdq, sse2 |
1127 AVX_INSTR phaddsw, 0, 0, 0 | 1350 AVX_INSTR movntdqa, sse4 |
1128 AVX_INSTR phsubw, 0, 0, 0 | 1351 AVX_INSTR movntpd, sse2 |
1129 AVX_INSTR phsubd, 0, 0, 0 | 1352 AVX_INSTR movntps, sse |
1130 AVX_INSTR phsubsw, 0, 0, 0 | 1353 AVX_INSTR movq, mmx |
1131 AVX_INSTR pmaddwd, 0, 0, 1 | 1354 AVX_INSTR movsd, sse2, 1, 0, 0 |
1132 AVX_INSTR pmaddubsw, 0, 0, 0 | 1355 AVX_INSTR movshdup, sse3 |
1133 AVX_INSTR pmaxsb, 0, 0, 1 | 1356 AVX_INSTR movsldup, sse3 |
1134 AVX_INSTR pmaxsw, 0, 0, 1 | 1357 AVX_INSTR movss, sse, 1, 0, 0 |
1135 AVX_INSTR pmaxsd, 0, 0, 1 | 1358 AVX_INSTR movupd, sse2 |
1136 AVX_INSTR pmaxub, 0, 0, 1 | 1359 AVX_INSTR movups, sse |
1137 AVX_INSTR pmaxuw, 0, 0, 1 | 1360 AVX_INSTR mpsadbw, sse4 |
1138 AVX_INSTR pmaxud, 0, 0, 1 | 1361 AVX_INSTR mulpd, sse2, 1, 0, 1 |
1139 AVX_INSTR pminsb, 0, 0, 1 | 1362 AVX_INSTR mulps, sse, 1, 0, 1 |
1140 AVX_INSTR pminsw, 0, 0, 1 | 1363 AVX_INSTR mulsd, sse2, 1, 0, 1 |
1141 AVX_INSTR pminsd, 0, 0, 1 | 1364 AVX_INSTR mulss, sse, 1, 0, 1 |
1142 AVX_INSTR pminub, 0, 0, 1 | 1365 AVX_INSTR orpd, sse2, 1, 0, 1 |
1143 AVX_INSTR pminuw, 0, 0, 1 | 1366 AVX_INSTR orps, sse, 1, 0, 1 |
1144 AVX_INSTR pminud, 0, 0, 1 | 1367 AVX_INSTR pabsb, ssse3 |
1145 AVX_INSTR pmulhuw, 0, 0, 1 | 1368 AVX_INSTR pabsd, ssse3 |
1146 AVX_INSTR pmulhrsw, 0, 0, 1 | 1369 AVX_INSTR pabsw, ssse3 |
1147 AVX_INSTR pmulhw, 0, 0, 1 | 1370 AVX_INSTR packsswb, mmx, 0, 0, 0 |
1148 AVX_INSTR pmullw, 0, 0, 1 | 1371 AVX_INSTR packssdw, mmx, 0, 0, 0 |
1149 AVX_INSTR pmulld, 0, 0, 1 | 1372 AVX_INSTR packuswb, mmx, 0, 0, 0 |
1150 AVX_INSTR pmuludq, 0, 0, 1 | 1373 AVX_INSTR packusdw, sse4, 0, 0, 0 |
1151 AVX_INSTR pmuldq, 0, 0, 1 | 1374 AVX_INSTR paddb, mmx, 0, 0, 1 |
1152 AVX_INSTR por, 0, 0, 1 | 1375 AVX_INSTR paddw, mmx, 0, 0, 1 |
1153 AVX_INSTR psadbw, 0, 0, 1 | 1376 AVX_INSTR paddd, mmx, 0, 0, 1 |
1154 AVX_INSTR pshufb, 0, 0, 0 | 1377 AVX_INSTR paddq, sse2, 0, 0, 1 |
1155 AVX_INSTR psignb, 0, 0, 0 | 1378 AVX_INSTR paddsb, mmx, 0, 0, 1 |
1156 AVX_INSTR psignw, 0, 0, 0 | 1379 AVX_INSTR paddsw, mmx, 0, 0, 1 |
1157 AVX_INSTR psignd, 0, 0, 0 | 1380 AVX_INSTR paddusb, mmx, 0, 0, 1 |
1158 AVX_INSTR psllw, 0, 0, 0 | 1381 AVX_INSTR paddusw, mmx, 0, 0, 1 |
1159 AVX_INSTR pslld, 0, 0, 0 | 1382 AVX_INSTR palignr, ssse3 |
1160 AVX_INSTR psllq, 0, 0, 0 | 1383 AVX_INSTR pand, mmx, 0, 0, 1 |
1161 AVX_INSTR pslldq, 0, 0, 0 | 1384 AVX_INSTR pandn, mmx, 0, 0, 0 |
1162 AVX_INSTR psraw, 0, 0, 0 | 1385 AVX_INSTR pavgb, mmx2, 0, 0, 1 |
1163 AVX_INSTR psrad, 0, 0, 0 | 1386 AVX_INSTR pavgw, mmx2, 0, 0, 1 |
1164 AVX_INSTR psrlw, 0, 0, 0 | 1387 AVX_INSTR pblendvb, sse4, 0, 0, 0 |
1165 AVX_INSTR psrld, 0, 0, 0 | 1388 AVX_INSTR pblendw, sse4 |
1166 AVX_INSTR psrlq, 0, 0, 0 | 1389 AVX_INSTR pclmulqdq |
1167 AVX_INSTR psrldq, 0, 0, 0 | 1390 AVX_INSTR pcmpestri, sse42 |
1168 AVX_INSTR psubb, 0, 0, 0 | 1391 AVX_INSTR pcmpestrm, sse42 |
1169 AVX_INSTR psubw, 0, 0, 0 | 1392 AVX_INSTR pcmpistri, sse42 |
1170 AVX_INSTR psubd, 0, 0, 0 | 1393 AVX_INSTR pcmpistrm, sse42 |
1171 AVX_INSTR psubq, 0, 0, 0 | 1394 AVX_INSTR pcmpeqb, mmx, 0, 0, 1 |
1172 AVX_INSTR psubsb, 0, 0, 0 | 1395 AVX_INSTR pcmpeqw, mmx, 0, 0, 1 |
1173 AVX_INSTR psubsw, 0, 0, 0 | 1396 AVX_INSTR pcmpeqd, mmx, 0, 0, 1 |
1174 AVX_INSTR psubusb, 0, 0, 0 | 1397 AVX_INSTR pcmpeqq, sse4, 0, 0, 1 |
1175 AVX_INSTR psubusw, 0, 0, 0 | 1398 AVX_INSTR pcmpgtb, mmx, 0, 0, 0 |
1176 AVX_INSTR punpckhbw, 0, 0, 0 | 1399 AVX_INSTR pcmpgtw, mmx, 0, 0, 0 |
1177 AVX_INSTR punpckhwd, 0, 0, 0 | 1400 AVX_INSTR pcmpgtd, mmx, 0, 0, 0 |
1178 AVX_INSTR punpckhdq, 0, 0, 0 | 1401 AVX_INSTR pcmpgtq, sse42, 0, 0, 0 |
1179 AVX_INSTR punpckhqdq, 0, 0, 0 | 1402 AVX_INSTR pextrb, sse4 |
1180 AVX_INSTR punpcklbw, 0, 0, 0 | 1403 AVX_INSTR pextrd, sse4 |
1181 AVX_INSTR punpcklwd, 0, 0, 0 | 1404 AVX_INSTR pextrq, sse4 |
1182 AVX_INSTR punpckldq, 0, 0, 0 | 1405 AVX_INSTR pextrw, mmx2 |
1183 AVX_INSTR punpcklqdq, 0, 0, 0 | 1406 AVX_INSTR phaddw, ssse3, 0, 0, 0 |
1184 AVX_INSTR pxor, 0, 0, 1 | 1407 AVX_INSTR phaddd, ssse3, 0, 0, 0 |
1185 AVX_INSTR shufps, 1, 1, 0 | 1408 AVX_INSTR phaddsw, ssse3, 0, 0, 0 |
1186 AVX_INSTR subpd, 1, 0, 0 | 1409 AVX_INSTR phminposuw, sse4 |
1187 AVX_INSTR subps, 1, 0, 0 | 1410 AVX_INSTR phsubw, ssse3, 0, 0, 0 |
1188 AVX_INSTR subsd, 1, 0, 0 | 1411 AVX_INSTR phsubd, ssse3, 0, 0, 0 |
1189 AVX_INSTR subss, 1, 0, 0 | 1412 AVX_INSTR phsubsw, ssse3, 0, 0, 0 |
1190 AVX_INSTR unpckhpd, 1, 0, 0 | 1413 AVX_INSTR pinsrb, sse4 |
1191 AVX_INSTR unpckhps, 1, 0, 0 | 1414 AVX_INSTR pinsrd, sse4 |
1192 AVX_INSTR unpcklpd, 1, 0, 0 | 1415 AVX_INSTR pinsrq, sse4 |
1193 AVX_INSTR unpcklps, 1, 0, 0 | 1416 AVX_INSTR pinsrw, mmx2 |
1194 AVX_INSTR xorpd, 1, 0, 1 | 1417 AVX_INSTR pmaddwd, mmx, 0, 0, 1 |
1195 AVX_INSTR xorps, 1, 0, 1 | 1418 AVX_INSTR pmaddubsw, ssse3, 0, 0, 0 |
| 1419 AVX_INSTR pmaxsb, sse4, 0, 0, 1 |
| 1420 AVX_INSTR pmaxsw, mmx2, 0, 0, 1 |
| 1421 AVX_INSTR pmaxsd, sse4, 0, 0, 1 |
| 1422 AVX_INSTR pmaxub, mmx2, 0, 0, 1 |
| 1423 AVX_INSTR pmaxuw, sse4, 0, 0, 1 |
| 1424 AVX_INSTR pmaxud, sse4, 0, 0, 1 |
| 1425 AVX_INSTR pminsb, sse4, 0, 0, 1 |
| 1426 AVX_INSTR pminsw, mmx2, 0, 0, 1 |
| 1427 AVX_INSTR pminsd, sse4, 0, 0, 1 |
| 1428 AVX_INSTR pminub, mmx2, 0, 0, 1 |
| 1429 AVX_INSTR pminuw, sse4, 0, 0, 1 |
| 1430 AVX_INSTR pminud, sse4, 0, 0, 1 |
| 1431 AVX_INSTR pmovmskb, mmx2 |
| 1432 AVX_INSTR pmovsxbw, sse4 |
| 1433 AVX_INSTR pmovsxbd, sse4 |
| 1434 AVX_INSTR pmovsxbq, sse4 |
| 1435 AVX_INSTR pmovsxwd, sse4 |
| 1436 AVX_INSTR pmovsxwq, sse4 |
| 1437 AVX_INSTR pmovsxdq, sse4 |
| 1438 AVX_INSTR pmovzxbw, sse4 |
| 1439 AVX_INSTR pmovzxbd, sse4 |
| 1440 AVX_INSTR pmovzxbq, sse4 |
| 1441 AVX_INSTR pmovzxwd, sse4 |
| 1442 AVX_INSTR pmovzxwq, sse4 |
| 1443 AVX_INSTR pmovzxdq, sse4 |
| 1444 AVX_INSTR pmuldq, sse4, 0, 0, 1 |
| 1445 AVX_INSTR pmulhrsw, ssse3, 0, 0, 1 |
| 1446 AVX_INSTR pmulhuw, mmx2, 0, 0, 1 |
| 1447 AVX_INSTR pmulhw, mmx, 0, 0, 1 |
| 1448 AVX_INSTR pmullw, mmx, 0, 0, 1 |
| 1449 AVX_INSTR pmulld, sse4, 0, 0, 1 |
| 1450 AVX_INSTR pmuludq, sse2, 0, 0, 1 |
| 1451 AVX_INSTR por, mmx, 0, 0, 1 |
| 1452 AVX_INSTR psadbw, mmx2, 0, 0, 1 |
| 1453 AVX_INSTR pshufb, ssse3, 0, 0, 0 |
| 1454 AVX_INSTR pshufd, sse2 |
| 1455 AVX_INSTR pshufhw, sse2 |
| 1456 AVX_INSTR pshuflw, sse2 |
| 1457 AVX_INSTR psignb, ssse3, 0, 0, 0 |
| 1458 AVX_INSTR psignw, ssse3, 0, 0, 0 |
| 1459 AVX_INSTR psignd, ssse3, 0, 0, 0 |
| 1460 AVX_INSTR psllw, mmx, 0, 0, 0 |
| 1461 AVX_INSTR pslld, mmx, 0, 0, 0 |
| 1462 AVX_INSTR psllq, mmx, 0, 0, 0 |
| 1463 AVX_INSTR pslldq, sse2, 0, 0, 0 |
| 1464 AVX_INSTR psraw, mmx, 0, 0, 0 |
| 1465 AVX_INSTR psrad, mmx, 0, 0, 0 |
| 1466 AVX_INSTR psrlw, mmx, 0, 0, 0 |
| 1467 AVX_INSTR psrld, mmx, 0, 0, 0 |
| 1468 AVX_INSTR psrlq, mmx, 0, 0, 0 |
| 1469 AVX_INSTR psrldq, sse2, 0, 0, 0 |
| 1470 AVX_INSTR psubb, mmx, 0, 0, 0 |
| 1471 AVX_INSTR psubw, mmx, 0, 0, 0 |
| 1472 AVX_INSTR psubd, mmx, 0, 0, 0 |
| 1473 AVX_INSTR psubq, sse2, 0, 0, 0 |
| 1474 AVX_INSTR psubsb, mmx, 0, 0, 0 |
| 1475 AVX_INSTR psubsw, mmx, 0, 0, 0 |
| 1476 AVX_INSTR psubusb, mmx, 0, 0, 0 |
| 1477 AVX_INSTR psubusw, mmx, 0, 0, 0 |
| 1478 AVX_INSTR ptest, sse4 |
| 1479 AVX_INSTR punpckhbw, mmx, 0, 0, 0 |
| 1480 AVX_INSTR punpckhwd, mmx, 0, 0, 0 |
| 1481 AVX_INSTR punpckhdq, mmx, 0, 0, 0 |
| 1482 AVX_INSTR punpckhqdq, sse2, 0, 0, 0 |
| 1483 AVX_INSTR punpcklbw, mmx, 0, 0, 0 |
| 1484 AVX_INSTR punpcklwd, mmx, 0, 0, 0 |
| 1485 AVX_INSTR punpckldq, mmx, 0, 0, 0 |
| 1486 AVX_INSTR punpcklqdq, sse2, 0, 0, 0 |
| 1487 AVX_INSTR pxor, mmx, 0, 0, 1 |
| 1488 AVX_INSTR rcpps, sse, 1, 0, 0 |
| 1489 AVX_INSTR rcpss, sse, 1, 0, 0 |
| 1490 AVX_INSTR roundpd, sse4 |
| 1491 AVX_INSTR roundps, sse4 |
| 1492 AVX_INSTR roundsd, sse4 |
| 1493 AVX_INSTR roundss, sse4 |
| 1494 AVX_INSTR rsqrtps, sse, 1, 0, 0 |
| 1495 AVX_INSTR rsqrtss, sse, 1, 0, 0 |
| 1496 AVX_INSTR shufpd, sse2, 1, 1, 0 |
| 1497 AVX_INSTR shufps, sse, 1, 1, 0 |
| 1498 AVX_INSTR sqrtpd, sse2, 1, 0, 0 |
| 1499 AVX_INSTR sqrtps, sse, 1, 0, 0 |
| 1500 AVX_INSTR sqrtsd, sse2, 1, 0, 0 |
| 1501 AVX_INSTR sqrtss, sse, 1, 0, 0 |
| 1502 AVX_INSTR stmxcsr, sse |
| 1503 AVX_INSTR subpd, sse2, 1, 0, 0 |
| 1504 AVX_INSTR subps, sse, 1, 0, 0 |
| 1505 AVX_INSTR subsd, sse2, 1, 0, 0 |
| 1506 AVX_INSTR subss, sse, 1, 0, 0 |
| 1507 AVX_INSTR ucomisd, sse2 |
| 1508 AVX_INSTR ucomiss, sse |
| 1509 AVX_INSTR unpckhpd, sse2, 1, 0, 0 |
| 1510 AVX_INSTR unpckhps, sse, 1, 0, 0 |
| 1511 AVX_INSTR unpcklpd, sse2, 1, 0, 0 |
| 1512 AVX_INSTR unpcklps, sse, 1, 0, 0 |
| 1513 AVX_INSTR xorpd, sse2, 1, 0, 1 |
| 1514 AVX_INSTR xorps, sse, 1, 0, 1 |
1196 | 1515 |
1197 ; 3DNow instructions, for sharing code between AVX, SSE and 3DN | 1516 ; 3DNow instructions, for sharing code between AVX, SSE and 3DN |
1198 AVX_INSTR pfadd, 1, 0, 1 | 1517 AVX_INSTR pfadd, 3dnow, 1, 0, 1 |
1199 AVX_INSTR pfsub, 1, 0, 0 | 1518 AVX_INSTR pfsub, 3dnow, 1, 0, 0 |
1200 AVX_INSTR pfmul, 1, 0, 1 | 1519 AVX_INSTR pfmul, 3dnow, 1, 0, 1 |
1201 | 1520 |
1202 ; base-4 constants for shuffles | 1521 ; base-4 constants for shuffles |
1203 %assign i 0 | 1522 %assign i 0 |
1204 %rep 256 | 1523 %rep 256 |
1205 %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3) | 1524 %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3) |
1206 %if j < 10 | 1525 %if j < 10 |
1207 CAT_XDEFINE q000, j, i | 1526 CAT_XDEFINE q000, j, i |
1208 %elif j < 100 | 1527 %elif j < 100 |
1209 CAT_XDEFINE q00, j, i | 1528 CAT_XDEFINE q00, j, i |
1210 %elif j < 1000 | 1529 %elif j < 1000 |
1211 CAT_XDEFINE q0, j, i | 1530 CAT_XDEFINE q0, j, i |
1212 %else | 1531 %else |
1213 CAT_XDEFINE q, j, i | 1532 CAT_XDEFINE q, j, i |
1214 %endif | 1533 %endif |
1215 %assign i i+1 | 1534 %assign i i+1 |
1216 %endrep | 1535 %endrep |
1217 %undef i | 1536 %undef i |
1218 %undef j | 1537 %undef j |
1219 | 1538 |
1220 %macro FMA_INSTR 3 | 1539 %macro FMA_INSTR 3 |
1221 %macro %1 4-7 %1, %2, %3 | 1540 %macro %1 4-7 %1, %2, %3 |
1222 %if cpuflag(xop) | 1541 %if cpuflag(xop) |
1223 v%5 %1, %2, %3, %4 | 1542 v%5 %1, %2, %3, %4 |
1224 %else | 1543 %elifnidn %1, %4 |
1225 %6 %1, %2, %3 | 1544 %6 %1, %2, %3 |
1226 %7 %1, %4 | 1545 %7 %1, %4 |
| 1546 %else |
| 1547 %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported |
1227 %endif | 1548 %endif |
1228 %endmacro | 1549 %endmacro |
1229 %endmacro | 1550 %endmacro |
1230 | 1551 |
1231 FMA_INSTR pmacsdd, pmulld, paddd | |
1232 FMA_INSTR pmacsww, pmullw, paddw | 1552 FMA_INSTR pmacsww, pmullw, paddw |
| 1553 FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation |
| 1554 FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation |
1233 FMA_INSTR pmadcswd, pmaddwd, paddd | 1555 FMA_INSTR pmadcswd, pmaddwd, paddd |
| 1556 |
| 1557 ; convert FMA4 to FMA3 if possible |
| 1558 %macro FMA4_INSTR 4 |
| 1559 %macro %1 4-8 %1, %2, %3, %4 |
| 1560 %if cpuflag(fma4) |
| 1561 v%5 %1, %2, %3, %4 |
| 1562 %elifidn %1, %2 |
| 1563 v%6 %1, %4, %3 ; %1 = %1 * %3 + %4 |
| 1564 %elifidn %1, %3 |
| 1565 v%7 %1, %2, %4 ; %1 = %2 * %1 + %4 |
| 1566 %elifidn %1, %4 |
| 1567 v%8 %1, %2, %3 ; %1 = %2 * %3 + %1 |
| 1568 %else |
| 1569 %error fma3 emulation of ``%5 %1, %2, %3, %4'' is not supported |
| 1570 %endif |
| 1571 %endmacro |
| 1572 %endmacro |
| 1573 |
| 1574 FMA4_INSTR fmaddpd, fmadd132pd, fmadd213pd, fmadd231pd |
| 1575 FMA4_INSTR fmaddps, fmadd132ps, fmadd213ps, fmadd231ps |
| 1576 FMA4_INSTR fmaddsd, fmadd132sd, fmadd213sd, fmadd231sd |
| 1577 FMA4_INSTR fmaddss, fmadd132ss, fmadd213ss, fmadd231ss |
| 1578 |
| 1579 FMA4_INSTR fmaddsubpd, fmaddsub132pd, fmaddsub213pd, fmaddsub231pd |
| 1580 FMA4_INSTR fmaddsubps, fmaddsub132ps, fmaddsub213ps, fmaddsub231ps |
| 1581 FMA4_INSTR fmsubaddpd, fmsubadd132pd, fmsubadd213pd, fmsubadd231pd |
| 1582 FMA4_INSTR fmsubaddps, fmsubadd132ps, fmsubadd213ps, fmsubadd231ps |
| 1583 |
| 1584 FMA4_INSTR fmsubpd, fmsub132pd, fmsub213pd, fmsub231pd |
| 1585 FMA4_INSTR fmsubps, fmsub132ps, fmsub213ps, fmsub231ps |
| 1586 FMA4_INSTR fmsubsd, fmsub132sd, fmsub213sd, fmsub231sd |
| 1587 FMA4_INSTR fmsubss, fmsub132ss, fmsub213ss, fmsub231ss |
| 1588 |
| 1589 FMA4_INSTR fnmaddpd, fnmadd132pd, fnmadd213pd, fnmadd231pd |
| 1590 FMA4_INSTR fnmaddps, fnmadd132ps, fnmadd213ps, fnmadd231ps |
| 1591 FMA4_INSTR fnmaddsd, fnmadd132sd, fnmadd213sd, fnmadd231sd |
| 1592 FMA4_INSTR fnmaddss, fnmadd132ss, fnmadd213ss, fnmadd231ss |
| 1593 |
| 1594 FMA4_INSTR fnmsubpd, fnmsub132pd, fnmsub213pd, fnmsub231pd |
| 1595 FMA4_INSTR fnmsubps, fnmsub132ps, fnmsub213ps, fnmsub231ps |
| 1596 FMA4_INSTR fnmsubsd, fnmsub132sd, fnmsub213sd, fnmsub231sd |
| 1597 FMA4_INSTR fnmsubss, fnmsub132ss, fnmsub213ss, fnmsub231ss |
| 1598 |
| 1599 ; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug |
| 1600 %if ARCH_X86_64 == 0 |
| 1601 %macro vpbroadcastq 2 |
| 1602 %if sizeof%1 == 16 |
| 1603 movddup %1, %2 |
| 1604 %else |
| 1605 vbroadcastsd %1, %2 |
| 1606 %endif |
| 1607 %endmacro |
| 1608 %endif |
OLD | NEW |