Chromium Code Reviews

Side by Side Diff: source/libvpx/third_party/x86inc/x86inc.asm

Issue 1302353004: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 3 months ago
1 ;***************************************************************************** 1 ;*****************************************************************************
2 ;* x86inc.asm: x264asm abstraction layer 2 ;* x86inc.asm: x264asm abstraction layer
3 ;***************************************************************************** 3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2012 x264 project 4 ;* Copyright (C) 2005-2015 x264 project
5 ;* 5 ;*
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu> 6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Anton Mitrofanov <BugMaster@narod.ru> 7 ;* Anton Mitrofanov <BugMaster@narod.ru>
8 ;* Jason Garrett-Glaser <darkshikari@gmail.com> 8 ;* Fiona Glaser <fiona@x264.com>
9 ;* Henrik Gramner <hengar-6@student.ltu.se> 9 ;* Henrik Gramner <henrik@gramner.com>
10 ;* 10 ;*
11 ;* Permission to use, copy, modify, and/or distribute this software for any 11 ;* Permission to use, copy, modify, and/or distribute this software for any
12 ;* purpose with or without fee is hereby granted, provided that the above 12 ;* purpose with or without fee is hereby granted, provided that the above
13 ;* copyright notice and this permission notice appear in all copies. 13 ;* copyright notice and this permission notice appear in all copies.
14 ;* 14 ;*
15 ;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 15 ;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
16 ;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 16 ;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
17 ;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 17 ;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
18 ;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 18 ;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 ;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 19 ;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
20 ;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 20 ;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
21 ;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 21 ;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
22 ;***************************************************************************** 22 ;*****************************************************************************
23 23
24 ; This is a header file for the x264ASM assembly language, which uses 24 ; This is a header file for the x264ASM assembly language, which uses
25 ; NASM/YASM syntax combined with a large number of macros to provide easy 25 ; NASM/YASM syntax combined with a large number of macros to provide easy
26 ; abstraction between different calling conventions (x86_32, win64, linux64). 26 ; abstraction between different calling conventions (x86_32, win64, linux64).
27 ; It also has various other useful features to simplify writing the kind of 27 ; It also has various other useful features to simplify writing the kind of
28 ; DSP functions that are most often used in x264. 28 ; DSP functions that are most often used in x264.
29 29
30 ; Unlike the rest of x264, this file is available under an ISC license, as it 30 ; Unlike the rest of x264, this file is available under an ISC license, as it
31 ; has significant usefulness outside of x264 and we want it to be available 31 ; has significant usefulness outside of x264 and we want it to be available
32 ; to the largest audience possible. Of course, if you modify it for your own 32 ; to the largest audience possible. Of course, if you modify it for your own
33 ; purposes to add a new feature, we strongly encourage contributing a patch 33 ; purposes to add a new feature, we strongly encourage contributing a patch
34 ; as this feature might be useful for others as well. Send patches or ideas 34 ; as this feature might be useful for others as well. Send patches or ideas
35 ; to x264-devel@videolan.org . 35 ; to x264-devel@videolan.org .
36 36
37 %include "vpx_config.asm" 37 %include "vpx_config.asm"
38 38
39 %ifndef program_name 39 %ifndef private_prefix
40 %define program_name vp9 40 %define private_prefix vpx
41 %endif 41 %endif
42 42
43 %ifndef public_prefix
44 %define public_prefix private_prefix
45 %endif
43 46
47 %ifndef STACK_ALIGNMENT
48 %if ARCH_X86_64
49 %define STACK_ALIGNMENT 16
50 %else
51 %define STACK_ALIGNMENT 4
52 %endif
53 %endif
54
55 %define WIN64 0
44 %define UNIX64 0 56 %define UNIX64 0
45 %define WIN64 0
46 %if ARCH_X86_64 57 %if ARCH_X86_64
47 %ifidn __OUTPUT_FORMAT__,win32 58 %ifidn __OUTPUT_FORMAT__,win32
48 %define WIN64 1 59 %define WIN64 1
49 %elifidn __OUTPUT_FORMAT__,win64 60 %elifidn __OUTPUT_FORMAT__,win64
50 %define WIN64 1 61 %define WIN64 1
51 %elifidn __OUTPUT_FORMAT__,x64 62 %elifidn __OUTPUT_FORMAT__,x64
52 %define WIN64 1 63 %define WIN64 1
53 %else 64 %else
54 %define UNIX64 1 65 %define UNIX64 1
55 %endif 66 %endif
56 %endif 67 %endif
57 68
58 %ifidn __OUTPUT_FORMAT__,elf32 69 %ifidn __OUTPUT_FORMAT__,elf32
59 %define mangle(x) x 70 %define mangle(x) x
60 %elifidn __OUTPUT_FORMAT__,elf64 71 %elifidn __OUTPUT_FORMAT__,elf64
61 %define mangle(x) x 72 %define mangle(x) x
62 %elifidn __OUTPUT_FORMAT__,elf
63 %define mangle(x) x
64 %elifidn __OUTPUT_FORMAT__,x64 73 %elifidn __OUTPUT_FORMAT__,x64
65 %define mangle(x) x 74 %define mangle(x) x
66 %elifidn __OUTPUT_FORMAT__,win64 75 %elifidn __OUTPUT_FORMAT__,win64
67 %define mangle(x) x 76 %define mangle(x) x
68 %else 77 %else
69 %define mangle(x) _ %+ x 78 %define mangle(x) _ %+ x
70 %endif 79 %endif
71 80
72 ; FIXME: All of the 64bit asm functions that take a stride as an argument 81 ; In some instances macho32 tables get misaligned when using .rodata.
73 ; via register, assume that the high dword of that register is filled with 0. 82 ; When looking at the disassembly it appears that the offset is either
74 ; This is true in practice (since we never do any 64bit arithmetic on strides, 83 ; correct or consistently off by 90. Placing them in the .text section
75 ; and x264's strides are all positive), but is not guaranteed by the ABI. 84 ; works around the issue. It appears to be specific to the way libvpx
76 85 ; handles the tables.
77 ; Name of the .rodata section.
 78 ; Kludge: Something on OS X fails to align .rodata even given an align attribute,
79 ; so use a different read-only section.
80 %macro SECTION_RODATA 0-1 16 86 %macro SECTION_RODATA 0-1 16
81 %ifidn __OUTPUT_FORMAT__,macho64 87 %ifidn __OUTPUT_FORMAT__,macho32
82 SECTION .text align=%1
83 %elifidn __OUTPUT_FORMAT__,macho32
84 SECTION .text align=%1
85 fakegot:
86 %elifidn __OUTPUT_FORMAT__,macho
87 SECTION .text align=%1 88 SECTION .text align=%1
88 fakegot: 89 fakegot:
89 %elifidn __OUTPUT_FORMAT__,aout 90 %elifidn __OUTPUT_FORMAT__,aout
90 section .text 91 SECTION .text
91 %else 92 %else
92 SECTION .rodata align=%1 93 SECTION .rodata align=%1
93 %endif 94 %endif
94 %endmacro 95 %endmacro
95 96
96 ; aout does not support align=
97 %macro SECTION_TEXT 0-1 16 97 %macro SECTION_TEXT 0-1 16
98 %ifidn __OUTPUT_FORMAT__,aout 98 %ifidn __OUTPUT_FORMAT__,aout
99 SECTION .text 99 SECTION .text
100 %else 100 %else
101 SECTION .text align=%1 101 SECTION .text align=%1
102 %endif 102 %endif
103 %endmacro 103 %endmacro
104 104
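For orientation, a minimal sketch of how callers typically use these section macros (the constant name below is hypothetical, not taken from this patch):

    SECTION_RODATA 16          ; 16-byte aligned read-only data (redirected to .text on macho32/aout, per the macro above)
    pw_one: times 8 dw 1       ; hypothetical constant table

    SECTION_TEXT               ; switch back to code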
105 ; PIC macros are copied from vpx_ports/x86_abi_support.asm. The "define PIC" 105 ; PIC macros are copied from vpx_ports/x86_abi_support.asm. The "define PIC"
106 ; from original code is added in for 64bit. 106 ; from original code is added in for 64bit.
107 %ifidn __OUTPUT_FORMAT__,elf32 107 %ifidn __OUTPUT_FORMAT__,elf32
108 %define ABI_IS_32BIT 1 108 %define ABI_IS_32BIT 1
109 %elifidn __OUTPUT_FORMAT__,macho32 109 %elifidn __OUTPUT_FORMAT__,macho32
110 %define ABI_IS_32BIT 1 110 %define ABI_IS_32BIT 1
111 %elifidn __OUTPUT_FORMAT__,win32 111 %elifidn __OUTPUT_FORMAT__,win32
112 %define ABI_IS_32BIT 1 112 %define ABI_IS_32BIT 1
113 %elifidn __OUTPUT_FORMAT__,aout 113 %elifidn __OUTPUT_FORMAT__,aout
114 %define ABI_IS_32BIT 1 114 %define ABI_IS_32BIT 1
115 %else 115 %else
116 %define ABI_IS_32BIT 0 116 %define ABI_IS_32BIT 0
117 %endif 117 %endif
118 118
119 %if ABI_IS_32BIT 119 %if ABI_IS_32BIT
120 %if CONFIG_PIC=1 120 %if CONFIG_PIC=1
121 %ifidn __OUTPUT_FORMAT__,elf32 121 %ifidn __OUTPUT_FORMAT__,elf32
122 %define GET_GOT_SAVE_ARG 1 122 %define GET_GOT_SAVE_ARG 1
123 %define WRT_PLT wrt ..plt 123 %define WRT_PLT wrt ..plt
124 %macro GET_GOT 1 124 %macro GET_GOT 1
125 extern _GLOBAL_OFFSET_TABLE_ 125 extern _GLOBAL_OFFSET_TABLE_
126 push %1 126 push %1
127 call %%get_got 127 call %%get_got
128 %%sub_offset: 128 %%sub_offset:
129 jmp %%exitGG 129 jmp %%exitGG
130 %%get_got: 130 %%get_got:
131 mov %1, [esp] 131 mov %1, [esp]
132 add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%sub_offset wrt ..gotpc 132 add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%sub_offset wrt ..gotpc
133 ret 133 ret
134 %%exitGG: 134 %%exitGG:
135 %undef GLOBAL 135 %undef GLOBAL
136 %define GLOBAL(x) x + %1 wrt ..gotoff 136 %define GLOBAL(x) x + %1 wrt ..gotoff
137 %undef RESTORE_GOT 137 %undef RESTORE_GOT
138 %define RESTORE_GOT pop %1 138 %define RESTORE_GOT pop %1
139 %endmacro 139 %endmacro
140 %elifidn __OUTPUT_FORMAT__,macho32 140 %elifidn __OUTPUT_FORMAT__,macho32
141 %define GET_GOT_SAVE_ARG 1 141 %define GET_GOT_SAVE_ARG 1
142 %macro GET_GOT 1 142 %macro GET_GOT 1
143 push %1 143 push %1
144 call %%get_got 144 call %%get_got
145 %%get_got: 145 %%get_got:
146 pop %1 146 pop %1
147 %undef GLOBAL 147 %undef GLOBAL
148 %define GLOBAL(x) x + %1 - %%get_got 148 %define GLOBAL(x) x + %1 - %%get_got
149 %undef RESTORE_GOT 149 %undef RESTORE_GOT
150 %define RESTORE_GOT pop %1 150 %define RESTORE_GOT pop %1
151 %endmacro 151 %endmacro
152 %endif 152 %endif
153 %endif 153 %endif
154 154
155 %if ARCH_X86_64 == 0 155 %if ARCH_X86_64 == 0
156 %undef PIC 156 %undef PIC
157 %endif 157 %endif
158 158
159 %else 159 %else
160 %macro GET_GOT 1 160 %macro GET_GOT 1
161 %endmacro 161 %endmacro
162 %define GLOBAL(x) rel x 162 %define GLOBAL(x) rel x
163 %define WRT_PLT wrt ..plt 163 %define WRT_PLT wrt ..plt
164 164
165 %if WIN64 165 %if WIN64
166 %define PIC 166 %define PIC
167 %elifidn __OUTPUT_FORMAT__,macho64 167 %elifidn __OUTPUT_FORMAT__,macho64
168 %define PIC 168 %define PIC
169 %elif CONFIG_PIC 169 %elif CONFIG_PIC
170 %define PIC 170 %define PIC
171 %endif 171 %endif
172 %endif 172 %endif
173 173
174 %ifnmacro GET_GOT 174 %ifnmacro GET_GOT
175 %macro GET_GOT 1 175 %macro GET_GOT 1
176 %endmacro 176 %endmacro
177 %define GLOBAL(x) x 177 %define GLOBAL(x) x
178 %endif 178 %endif
179 %ifndef RESTORE_GOT 179 %ifndef RESTORE_GOT
180 %define RESTORE_GOT 180 %define RESTORE_GOT
181 %endif 181 %endif
182 %ifndef WRT_PLT 182 %ifndef WRT_PLT
183 %define WRT_PLT 183 %define WRT_PLT
184 %endif 184 %endif
185 185
186 %ifdef PIC 186 %ifdef PIC
187 default rel 187 default rel
188 %endif 188 %endif
189 ; Done with PIC macros 189 ; Done with PIC macros
190 190
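As a rough usage sketch of the GOT helpers above in a 32-bit PIC build (the table symbol is hypothetical and not part of this change):

    GET_GOT     r3                        ; saves r3 and points it at the GOT anchor
    mova        m0, [GLOBAL(some_table)]  ; some_table: a hypothetical SECTION_RODATA symbol
    RESTORE_GOT                           ; restores r3

On 64-bit and non-PIC builds GET_GOT expands to nothing and GLOBAL(x) becomes a plain or rip-relative reference, so the same source works in both modes.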
191 ; Always use long nops (reduces 0x90 spam in disassembly on x86_32)
192 %ifndef __NASM_VER__
193 CPU amdnop
194 %else
195 %use smartalign
196 ALIGNMODE k7
197 %endif
198
199 ; Macros to eliminate most code duplication between x86_32 and x86_64: 191 ; Macros to eliminate most code duplication between x86_32 and x86_64:
200 ; Currently this works only for leaf functions which load all their arguments 192 ; Currently this works only for leaf functions which load all their arguments
201 ; into registers at the start, and make no other use of the stack. Luckily that 193 ; into registers at the start, and make no other use of the stack. Luckily that
202 ; covers most of x264's asm. 194 ; covers most of x264's asm.
203 195
204 ; PROLOGUE: 196 ; PROLOGUE:
205 ; %1 = number of arguments. loads them from stack if needed. 197 ; %1 = number of arguments. loads them from stack if needed.
206 ; %2 = number of registers used. pushes callee-saved regs if needed. 198 ; %2 = number of registers used. pushes callee-saved regs if needed.
207 ; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. 199 ; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
208 ; %4 = list of names to define to registers 200 ; %4 = (optional) stack size to be allocated. The stack will be aligned before
201 ; allocating the specified stack size. If the required stack alignment is
202 ; larger than the known stack alignment the stack will be manually aligned
203 ; and an extra register will be allocated to hold the original stack
204 ; pointer (to not invalidate r0m etc.). To prevent the use of an extra
205 ; register as stack pointer, request a negative stack size.
206 ; %4+/%5+ = list of names to define to registers
209 ; PROLOGUE can also be invoked by adding the same options to cglobal 207 ; PROLOGUE can also be invoked by adding the same options to cglobal
210 208
211 ; e.g. 209 ; e.g.
212 ; cglobal foo, 2,3,0, dst, src, tmp 210 ; cglobal foo, 2,3,7,0x40, dst, src, tmp
 213 ; declares a function (foo), taking two args (dst and src) and one local variable (tmp) 211 ; declares a function (foo) that automatically loads two arguments (dst and
212 ; src) into registers, uses one additional register (tmp) plus 7 vector
213 ; registers (m0-m6) and allocates 0x40 bytes of stack space.
214 214
215 ; TODO Some functions can use some args directly from the stack. If they're the 215 ; TODO Some functions can use some args directly from the stack. If they're the
216 ; last args then you can just not declare them, but if they're in the middle 216 ; last args then you can just not declare them, but if they're in the middle
217 ; we need more flexible macro. 217 ; we need more flexible macro.
218 218
219 ; RET: 219 ; RET:
220 ; Pops anything that was pushed by PROLOGUE, and returns. 220 ; Pops anything that was pushed by PROLOGUE, and returns.
221 221
222 ; REP_RET: 222 ; REP_RET:
223 ; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons 223 ; Use this instead of RET if it's a branch target.
224 ; which are slow when a normal ret follows a branch.
225 224
226 ; registers: 225 ; registers:
227 ; rN and rNq are the native-size register holding function argument N 226 ; rN and rNq are the native-size register holding function argument N
228 ; rNd, rNw, rNb are dword, word, and byte size 227 ; rNd, rNw, rNb are dword, word, and byte size
228 ; rNh is the high 8 bits of the word size
229 ; rNm is the original location of arg N (a register or on the stack), dword 229 ; rNm is the original location of arg N (a register or on the stack), dword
230 ; rNmp is native size 230 ; rNmp is native size
231 231
232 %macro DECLARE_REG 5-6 232 %macro DECLARE_REG 2-3
233 %define r%1q %2 233 %define r%1q %2
234 %define r%1d %3 234 %define r%1d %2d
235 %define r%1w %4 235 %define r%1w %2w
236 %define r%1b %5 236 %define r%1b %2b
237 %if %0 == 5 237 %define r%1h %2h
238 %define r%1m %3 238 %if %0 == 2
239 %define r%1m %2d
239 %define r%1mp %2 240 %define r%1mp %2
240 %elif ARCH_X86_64 ; memory 241 %elif ARCH_X86_64 ; memory
241 %define r%1m [rsp + stack_offset + %6] 242 %define r%1m [rstk + stack_offset + %3]
242 %define r%1mp qword r %+ %1 %+ m 243 %define r%1mp qword r %+ %1 %+ m
243 %else 244 %else
244 %define r%1m [esp + stack_offset + %6] 245 %define r%1m [rstk + stack_offset + %3]
245 %define r%1mp dword r %+ %1 %+ m 246 %define r%1mp dword r %+ %1 %+ m
246 %endif 247 %endif
247 %define r%1 %2 248 %define r%1 %2
248 %endmacro 249 %endmacro
249 250
250 %macro DECLARE_REG_SIZE 2 251 %macro DECLARE_REG_SIZE 3
251 %define r%1q r%1 252 %define r%1q r%1
252 %define e%1q r%1 253 %define e%1q r%1
253 %define r%1d e%1 254 %define r%1d e%1
254 %define e%1d e%1 255 %define e%1d e%1
255 %define r%1w %1 256 %define r%1w %1
256 %define e%1w %1 257 %define e%1w %1
258 %define r%1h %3
259 %define e%1h %3
257 %define r%1b %2 260 %define r%1b %2
258 %define e%1b %2 261 %define e%1b %2
259 %if ARCH_X86_64 == 0 262 %if ARCH_X86_64 == 0
260 %define r%1 e%1 263 %define r%1 e%1
261 %endif 264 %endif
262 %endmacro 265 %endmacro
263 266
264 DECLARE_REG_SIZE ax, al 267 DECLARE_REG_SIZE ax, al, ah
265 DECLARE_REG_SIZE bx, bl 268 DECLARE_REG_SIZE bx, bl, bh
266 DECLARE_REG_SIZE cx, cl 269 DECLARE_REG_SIZE cx, cl, ch
267 DECLARE_REG_SIZE dx, dl 270 DECLARE_REG_SIZE dx, dl, dh
268 DECLARE_REG_SIZE si, sil 271 DECLARE_REG_SIZE si, sil, null
269 DECLARE_REG_SIZE di, dil 272 DECLARE_REG_SIZE di, dil, null
270 DECLARE_REG_SIZE bp, bpl 273 DECLARE_REG_SIZE bp, bpl, null
271 274
 272 ; t# defines for when per-arch register allocation is more complex than just function arguments 275 ; t# defines for when per-arch register allocation is more complex than just function arguments
273 276
274 %macro DECLARE_REG_TMP 1-* 277 %macro DECLARE_REG_TMP 1-*
275 %assign %%i 0 278 %assign %%i 0
276 %rep %0 279 %rep %0
277 CAT_XDEFINE t, %%i, r%1 280 CAT_XDEFINE t, %%i, r%1
278 %assign %%i %%i+1 281 %assign %%i %%i+1
279 %rotate 1 282 %rotate 1
280 %endrep 283 %endrep
281 %endmacro 284 %endmacro
282 285
283 %macro DECLARE_REG_TMP_SIZE 0-* 286 %macro DECLARE_REG_TMP_SIZE 0-*
284 %rep %0 287 %rep %0
285 %define t%1q t%1 %+ q 288 %define t%1q t%1 %+ q
286 %define t%1d t%1 %+ d 289 %define t%1d t%1 %+ d
287 %define t%1w t%1 %+ w 290 %define t%1w t%1 %+ w
291 %define t%1h t%1 %+ h
288 %define t%1b t%1 %+ b 292 %define t%1b t%1 %+ b
289 %rotate 1 293 %rotate 1
290 %endrep 294 %endrep
291 %endmacro 295 %endmacro
292 296
293 DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 297 DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
294 298
295 %if ARCH_X86_64 299 %if ARCH_X86_64
296 %define gprsize 8 300 %define gprsize 8
297 %else 301 %else
298 %define gprsize 4 302 %define gprsize 4
299 %endif 303 %endif
300 304
301 %macro PUSH 1 305 %macro PUSH 1
302 push %1 306 push %1
303 %assign stack_offset stack_offset+gprsize 307 %ifidn rstk, rsp
308 %assign stack_offset stack_offset+gprsize
309 %endif
304 %endmacro 310 %endmacro
305 311
306 %macro POP 1 312 %macro POP 1
307 pop %1 313 pop %1
308 %assign stack_offset stack_offset-gprsize 314 %ifidn rstk, rsp
315 %assign stack_offset stack_offset-gprsize
316 %endif
309 %endmacro 317 %endmacro
310 318
311 %macro PUSH_IF_USED 1-* 319 %macro PUSH_IF_USED 1-*
312 %rep %0 320 %rep %0
313 %if %1 < regs_used 321 %if %1 < regs_used
314 PUSH r%1 322 PUSH r%1
315 %endif 323 %endif
316 %rotate 1 324 %rotate 1
317 %endrep 325 %endrep
318 %endmacro 326 %endmacro
(...skipping 11 matching lines...)
330 %rep %0 338 %rep %0
331 %if %1 < num_args 339 %if %1 < num_args
332 mov r%1, r %+ %1 %+ mp 340 mov r%1, r %+ %1 %+ mp
333 %endif 341 %endif
334 %rotate 1 342 %rotate 1
335 %endrep 343 %endrep
336 %endmacro 344 %endmacro
337 345
338 %macro SUB 2 346 %macro SUB 2
339 sub %1, %2 347 sub %1, %2
340 %ifidn %1, rsp 348 %ifidn %1, rstk
341 %assign stack_offset stack_offset+(%2) 349 %assign stack_offset stack_offset+(%2)
342 %endif 350 %endif
343 %endmacro 351 %endmacro
344 352
345 %macro ADD 2 353 %macro ADD 2
346 add %1, %2 354 add %1, %2
347 %ifidn %1, rsp 355 %ifidn %1, rstk
348 %assign stack_offset stack_offset-(%2) 356 %assign stack_offset stack_offset-(%2)
349 %endif 357 %endif
350 %endmacro 358 %endmacro
351 359
352 %macro movifnidn 2 360 %macro movifnidn 2
353 %ifnidn %1, %2 361 %ifnidn %1, %2
354 mov %1, %2 362 mov %1, %2
355 %endif 363 %endif
356 %endmacro 364 %endmacro
357 365
358 %macro movsxdifnidn 2 366 %macro movsxdifnidn 2
359 %ifnidn %1, %2 367 %ifnidn %1, %2
360 movsxd %1, %2 368 movsxd %1, %2
361 %endif 369 %endif
362 %endmacro 370 %endmacro
363 371
364 %macro ASSERT 1 372 %macro ASSERT 1
365 %if (%1) == 0 373 %if (%1) == 0
366 %error assert failed 374 %error assert failed
367 %endif 375 %endif
368 %endmacro 376 %endmacro
369 377
370 %macro DEFINE_ARGS 0-* 378 %macro DEFINE_ARGS 0-*
371 %ifdef n_arg_names 379 %ifdef n_arg_names
372 %assign %%i 0 380 %assign %%i 0
373 %rep n_arg_names 381 %rep n_arg_names
374 CAT_UNDEF arg_name %+ %%i, q 382 CAT_UNDEF arg_name %+ %%i, q
375 CAT_UNDEF arg_name %+ %%i, d 383 CAT_UNDEF arg_name %+ %%i, d
376 CAT_UNDEF arg_name %+ %%i, w 384 CAT_UNDEF arg_name %+ %%i, w
385 CAT_UNDEF arg_name %+ %%i, h
377 CAT_UNDEF arg_name %+ %%i, b 386 CAT_UNDEF arg_name %+ %%i, b
378 CAT_UNDEF arg_name %+ %%i, m 387 CAT_UNDEF arg_name %+ %%i, m
379 CAT_UNDEF arg_name %+ %%i, mp 388 CAT_UNDEF arg_name %+ %%i, mp
380 CAT_UNDEF arg_name, %%i 389 CAT_UNDEF arg_name, %%i
381 %assign %%i %%i+1 390 %assign %%i %%i+1
382 %endrep 391 %endrep
383 %endif 392 %endif
384 393
385 %xdefine %%stack_offset stack_offset 394 %xdefine %%stack_offset stack_offset
386 %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine 395 %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
387 %assign %%i 0 396 %assign %%i 0
388 %rep %0 397 %rep %0
389 %xdefine %1q r %+ %%i %+ q 398 %xdefine %1q r %+ %%i %+ q
390 %xdefine %1d r %+ %%i %+ d 399 %xdefine %1d r %+ %%i %+ d
391 %xdefine %1w r %+ %%i %+ w 400 %xdefine %1w r %+ %%i %+ w
401 %xdefine %1h r %+ %%i %+ h
392 %xdefine %1b r %+ %%i %+ b 402 %xdefine %1b r %+ %%i %+ b
393 %xdefine %1m r %+ %%i %+ m 403 %xdefine %1m r %+ %%i %+ m
394 %xdefine %1mp r %+ %%i %+ mp 404 %xdefine %1mp r %+ %%i %+ mp
395 CAT_XDEFINE arg_name, %%i, %1 405 CAT_XDEFINE arg_name, %%i, %1
396 %assign %%i %%i+1 406 %assign %%i %%i+1
397 %rotate 1 407 %rotate 1
398 %endrep 408 %endrep
399 %xdefine stack_offset %%stack_offset 409 %xdefine stack_offset %%stack_offset
400 %assign n_arg_names %0 410 %assign n_arg_names %0
401 %endmacro 411 %endmacro
402 412
403 %if ARCH_X86_64 413 %define required_stack_alignment ((mmsize + 15) & ~15)
404 %macro ALLOC_STACK 2 ; stack_size, num_regs 414
405 %assign %%stack_aligment ((mmsize + 15) & ~15) 415 %macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
406 %assign stack_size_padded %1 416 %ifnum %1
407 417 %if %1 != 0
408 %assign %%reg_num (%2 - 1) 418 %assign %%pad 0
409 %xdefine rsp_tmp r %+ %%reg_num 419 %assign stack_size %1
410 mov rsp_tmp, rsp 420 %if stack_size < 0
411 sub rsp, stack_size_padded 421 %assign stack_size -stack_size
412 and rsp, ~(%%stack_aligment - 1) 422 %endif
413 %endmacro 423 %if WIN64
414 424 %assign %%pad %%pad + 32 ; shadow space
415 %macro RESTORE_STACK 0 ; reset rsp register 425 %if mmsize != 8
416 mov rsp, rsp_tmp 426 %assign xmm_regs_used %2
417 %endmacro 427 %if xmm_regs_used > 8
 418 %endif 428 %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers
429 %endif
430 %endif
431 %endif
432 %if required_stack_alignment <= STACK_ALIGNMENT
433 ; maintain the current stack alignment
 434 %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
435 SUB rsp, stack_size_padded
436 %else
437 %assign %%reg_num (regs_used - 1)
438 %xdefine rstk r %+ %%reg_num
439 ; align stack, and save original stack location directly above
440 ; it, i.e. in [rsp+stack_size_padded], so we can restore the
441 ; stack in a single instruction (i.e. mov rsp, rstk or mov
442 ; rsp, [rsp+stack_size_padded])
443 %if %1 < 0 ; need to store rsp on stack
444 %xdefine rstkm [rsp + stack_size + %%pad]
445 %assign %%pad %%pad + gprsize
446 %else ; can keep rsp in rstk during whole function
447 %xdefine rstkm rstk
448 %endif
 449 %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1))
450 mov rstk, rsp
451 and rsp, ~(required_stack_alignment-1)
452 sub rsp, stack_size_padded
453 movifnidn rstkm, rstk
454 %endif
455 WIN64_PUSH_XMM
456 %endif
457 %endif
458 %endmacro
459
460 %macro SETUP_STACK_POINTER 1
461 %ifnum %1
462 %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
463 %if %1 > 0
464 %assign regs_used (regs_used + 1)
465 %elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2
466 %warning "Stack pointer will overwrite register argument"
467 %endif
468 %endif
469 %endif
470 %endmacro
471
472 %macro DEFINE_ARGS_INTERNAL 3+
473 %ifnum %2
474 DEFINE_ARGS %3
475 %elif %1 == 4
476 DEFINE_ARGS %2
477 %elif %1 > 4
478 DEFINE_ARGS %2, %3
479 %endif
480 %endmacro
419 481
420 %if WIN64 ; Windows x64 ;================================================= 482 %if WIN64 ; Windows x64 ;=================================================
421 483
422 DECLARE_REG 0, rcx, ecx, cx, cl 484 DECLARE_REG 0, rcx
423 DECLARE_REG 1, rdx, edx, dx, dl 485 DECLARE_REG 1, rdx
424 DECLARE_REG 2, R8, R8D, R8W, R8B 486 DECLARE_REG 2, R8
425 DECLARE_REG 3, R9, R9D, R9W, R9B 487 DECLARE_REG 3, R9
426 DECLARE_REG 4, R10, R10D, R10W, R10B, 40 488 DECLARE_REG 4, R10, 40
427 DECLARE_REG 5, R11, R11D, R11W, R11B, 48 489 DECLARE_REG 5, R11, 48
428 DECLARE_REG 6, rax, eax, ax, al, 56 490 DECLARE_REG 6, rax, 56
429 DECLARE_REG 7, rdi, edi, di, dil, 64 491 DECLARE_REG 7, rdi, 64
430 DECLARE_REG 8, rsi, esi, si, sil, 72 492 DECLARE_REG 8, rsi, 72
431 DECLARE_REG 9, rbx, ebx, bx, bl, 80 493 DECLARE_REG 9, rbx, 80
432 DECLARE_REG 10, rbp, ebp, bp, bpl, 88 494 DECLARE_REG 10, rbp, 88
433 DECLARE_REG 11, R12, R12D, R12W, R12B, 96 495 DECLARE_REG 11, R12, 96
434 DECLARE_REG 12, R13, R13D, R13W, R13B, 104 496 DECLARE_REG 12, R13, 104
435 DECLARE_REG 13, R14, R14D, R14W, R14B, 112 497 DECLARE_REG 13, R14, 112
436 DECLARE_REG 14, R15, R15D, R15W, R15B, 120 498 DECLARE_REG 14, R15, 120
437 499
438 %macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names... 500 %macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
439 %assign num_args %1 501 %assign num_args %1
440 %assign regs_used %2 502 %assign regs_used %2
441 ASSERT regs_used >= num_args 503 ASSERT regs_used >= num_args
504 SETUP_STACK_POINTER %4
442 ASSERT regs_used <= 15 505 ASSERT regs_used <= 15
443 PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 506 PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
444 %if mmsize == 8 507 ALLOC_STACK %4, %3
445 %assign xmm_regs_used 0 508 %if mmsize != 8 && stack_size == 0
446 %else
447 WIN64_SPILL_XMM %3 509 WIN64_SPILL_XMM %3
448 %endif 510 %endif
449 LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 511 LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
450 DEFINE_ARGS %4 512 DEFINE_ARGS_INTERNAL %0, %4, %5
513 %endmacro
514
515 %macro WIN64_PUSH_XMM 0
 516 ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
517 %if xmm_regs_used > 6
518 movaps [rstk + stack_offset + 8], xmm6
519 %endif
520 %if xmm_regs_used > 7
521 movaps [rstk + stack_offset + 24], xmm7
522 %endif
523 %if xmm_regs_used > 8
524 %assign %%i 8
525 %rep xmm_regs_used-8
526 movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
527 %assign %%i %%i+1
528 %endrep
529 %endif
451 %endmacro 530 %endmacro
452 531
453 %macro WIN64_SPILL_XMM 1 532 %macro WIN64_SPILL_XMM 1
454 %assign xmm_regs_used %1 533 %assign xmm_regs_used %1
455 ASSERT xmm_regs_used <= 16 534 ASSERT xmm_regs_used <= 16
535 %if xmm_regs_used > 8
536 ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
537 %assign %%pad (xmm_regs_used-8)*16 + 32
 538 %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
539 SUB rsp, stack_size_padded
540 %endif
541 WIN64_PUSH_XMM
542 %endmacro
543
544 %macro WIN64_RESTORE_XMM_INTERNAL 1
545 %assign %%pad_size 0
546 %if xmm_regs_used > 8
547 %assign %%i xmm_regs_used
548 %rep xmm_regs_used-8
549 %assign %%i %%i-1
550 movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32]
551 %endrep
552 %endif
553 %if stack_size_padded > 0
554 %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT
555 mov rsp, rstkm
556 %else
557 add %1, stack_size_padded
558 %assign %%pad_size stack_size_padded
559 %endif
560 %endif
561 %if xmm_regs_used > 7
562 movaps xmm7, [%1 + stack_offset - %%pad_size + 24]
563 %endif
456 %if xmm_regs_used > 6 564 %if xmm_regs_used > 6
457 SUB rsp, (xmm_regs_used-6)*16+16 565 movaps xmm6, [%1 + stack_offset - %%pad_size + 8]
458 %assign %%i xmm_regs_used
459 %rep (xmm_regs_used-6)
460 %assign %%i %%i-1
461 movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i
462 %endrep
463 %endif
464 %endmacro
465
466 %macro WIN64_RESTORE_XMM_INTERNAL 1
467 %if xmm_regs_used > 6
468 %assign %%i xmm_regs_used
469 %rep (xmm_regs_used-6)
470 %assign %%i %%i-1
471 movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)]
472 %endrep
473 add %1, (xmm_regs_used-6)*16+16
474 %endif 566 %endif
475 %endmacro 567 %endmacro
476 568
477 %macro WIN64_RESTORE_XMM 1 569 %macro WIN64_RESTORE_XMM 1
478 WIN64_RESTORE_XMM_INTERNAL %1 570 WIN64_RESTORE_XMM_INTERNAL %1
479 %assign stack_offset stack_offset-(xmm_regs_used-6)*16+16 571 %assign stack_offset (stack_offset-stack_size_padded)
480 %assign xmm_regs_used 0 572 %assign xmm_regs_used 0
481 %endmacro 573 %endmacro
482 574
 575 %define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0
576
483 %macro RET 0 577 %macro RET 0
484 WIN64_RESTORE_XMM_INTERNAL rsp 578 WIN64_RESTORE_XMM_INTERNAL rsp
485 POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 579 POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
486 ret 580 %if mmsize == 32
487 %endmacro 581 vzeroupper
488 582 %endif
489 %macro REP_RET 0 583 AUTO_REP_RET
490 %if regs_used > 7 || xmm_regs_used > 6
491 RET
492 %else
493 rep ret
494 %endif
495 %endmacro 584 %endmacro
496 585
497 %elif ARCH_X86_64 ; *nix x64 ;============================================= 586 %elif ARCH_X86_64 ; *nix x64 ;=============================================
498 587
499 DECLARE_REG 0, rdi, edi, di, dil 588 DECLARE_REG 0, rdi
500 DECLARE_REG 1, rsi, esi, si, sil 589 DECLARE_REG 1, rsi
501 DECLARE_REG 2, rdx, edx, dx, dl 590 DECLARE_REG 2, rdx
502 DECLARE_REG 3, rcx, ecx, cx, cl 591 DECLARE_REG 3, rcx
503 DECLARE_REG 4, R8, R8D, R8W, R8B 592 DECLARE_REG 4, R8
504 DECLARE_REG 5, R9, R9D, R9W, R9B 593 DECLARE_REG 5, R9
505 DECLARE_REG 6, rax, eax, ax, al, 8 594 DECLARE_REG 6, rax, 8
506 DECLARE_REG 7, R10, R10D, R10W, R10B, 16 595 DECLARE_REG 7, R10, 16
507 DECLARE_REG 8, R11, R11D, R11W, R11B, 24 596 DECLARE_REG 8, R11, 24
508 DECLARE_REG 9, rbx, ebx, bx, bl, 32 597 DECLARE_REG 9, rbx, 32
509 DECLARE_REG 10, rbp, ebp, bp, bpl, 40 598 DECLARE_REG 10, rbp, 40
510 DECLARE_REG 11, R12, R12D, R12W, R12B, 48 599 DECLARE_REG 11, R12, 48
511 DECLARE_REG 12, R13, R13D, R13W, R13B, 56 600 DECLARE_REG 12, R13, 56
512 DECLARE_REG 13, R14, R14D, R14W, R14B, 64 601 DECLARE_REG 13, R14, 64
513 DECLARE_REG 14, R15, R15D, R15W, R15B, 72 602 DECLARE_REG 14, R15, 72
514 603
515 %macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... 604 %macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
516 %assign num_args %1 605 %assign num_args %1
517 %assign regs_used %2 606 %assign regs_used %2
518 ASSERT regs_used >= num_args 607 ASSERT regs_used >= num_args
608 SETUP_STACK_POINTER %4
519 ASSERT regs_used <= 15 609 ASSERT regs_used <= 15
520 PUSH_IF_USED 9, 10, 11, 12, 13, 14 610 PUSH_IF_USED 9, 10, 11, 12, 13, 14
611 ALLOC_STACK %4
521 LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 612 LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
522 DEFINE_ARGS %4 613 DEFINE_ARGS_INTERNAL %0, %4, %5
523 %endmacro 614 %endmacro
615
616 %define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0
524 617
525 %macro RET 0 618 %macro RET 0
619 %if stack_size_padded > 0
620 %if required_stack_alignment > STACK_ALIGNMENT
621 mov rsp, rstkm
622 %else
623 add rsp, stack_size_padded
624 %endif
625 %endif
526 POP_IF_USED 14, 13, 12, 11, 10, 9 626 POP_IF_USED 14, 13, 12, 11, 10, 9
527 ret 627 %if mmsize == 32
528 %endmacro 628 vzeroupper
529 629 %endif
530 %macro REP_RET 0 630 AUTO_REP_RET
531 %if regs_used > 9
532 RET
533 %else
534 rep ret
535 %endif
536 %endmacro 631 %endmacro
537 632
538 %else ; X86_32 ;============================================================== 633 %else ; X86_32 ;==============================================================
539 634
540 DECLARE_REG 0, eax, eax, ax, al, 4 635 DECLARE_REG 0, eax, 4
541 DECLARE_REG 1, ecx, ecx, cx, cl, 8 636 DECLARE_REG 1, ecx, 8
542 DECLARE_REG 2, edx, edx, dx, dl, 12 637 DECLARE_REG 2, edx, 12
543 DECLARE_REG 3, ebx, ebx, bx, bl, 16 638 DECLARE_REG 3, ebx, 16
544 DECLARE_REG 4, esi, esi, si, null, 20 639 DECLARE_REG 4, esi, 20
545 DECLARE_REG 5, edi, edi, di, null, 24 640 DECLARE_REG 5, edi, 24
546 DECLARE_REG 6, ebp, ebp, bp, null, 28 641 DECLARE_REG 6, ebp, 28
547 %define rsp esp 642 %define rsp esp
548 643
549 %macro DECLARE_ARG 1-* 644 %macro DECLARE_ARG 1-*
550 %rep %0 645 %rep %0
551 %define r%1m [esp + stack_offset + 4*%1 + 4] 646 %define r%1m [rstk + stack_offset + 4*%1 + 4]
552 %define r%1mp dword r%1m 647 %define r%1mp dword r%1m
553 %rotate 1 648 %rotate 1
554 %endrep 649 %endrep
555 %endmacro 650 %endmacro
556 651
557 DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 652 DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
558 653
559 %macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... 654 %macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
560 %assign num_args %1 655 %assign num_args %1
561 %assign regs_used %2 656 %assign regs_used %2
657 ASSERT regs_used >= num_args
658 %if num_args > 7
659 %assign num_args 7
660 %endif
562 %if regs_used > 7 661 %if regs_used > 7
563 %assign regs_used 7 662 %assign regs_used 7
564 %endif 663 %endif
565 ASSERT regs_used >= num_args 664 SETUP_STACK_POINTER %4
665 ASSERT regs_used <= 7
566 PUSH_IF_USED 3, 4, 5, 6 666 PUSH_IF_USED 3, 4, 5, 6
667 ALLOC_STACK %4
567 LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 668 LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
568 DEFINE_ARGS %4 669 DEFINE_ARGS_INTERNAL %0, %4, %5
569 %endmacro 670 %endmacro
570 671
672 %define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0
673
571 %macro RET 0 674 %macro RET 0
675 %if stack_size_padded > 0
676 %if required_stack_alignment > STACK_ALIGNMENT
677 mov rsp, rstkm
678 %else
679 add rsp, stack_size_padded
680 %endif
681 %endif
572 POP_IF_USED 6, 5, 4, 3 682 POP_IF_USED 6, 5, 4, 3
573 ret 683 %if mmsize == 32
574 %endmacro 684 vzeroupper
575 685 %endif
576 %macro REP_RET 0 686 AUTO_REP_RET
577 %if regs_used > 3
578 RET
579 %else
580 rep ret
581 %endif
582 %endmacro 687 %endmacro
583 688
584 %endif ;====================================================================== 689 %endif ;======================================================================
585 690
586 %if WIN64 == 0 691 %if WIN64 == 0
587 %macro WIN64_SPILL_XMM 1 692 %macro WIN64_SPILL_XMM 1
588 %endmacro 693 %endmacro
589 %macro WIN64_RESTORE_XMM 1 694 %macro WIN64_RESTORE_XMM 1
590 %endmacro 695 %endmacro
696 %macro WIN64_PUSH_XMM 0
697 %endmacro
591 %endif 698 %endif
592 699
700 ; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
701 ; a branch or a branch target. So switch to a 2-byte form of ret in that case.
702 ; We can automatically detect "follows a branch", but not a branch target.
 703 ; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
704 %macro REP_RET 0
705 %if has_epilogue
706 RET
707 %else
708 rep ret
709 %endif
710 %endmacro
711
712 %define last_branch_adr $$
713 %macro AUTO_REP_RET 0
714 %ifndef cpuflags
 715 times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ != last_branch_adr.
716 %elif notcpuflag(ssse3)
717 times ((last_branch_adr-$)>>31)+1 rep
718 %endif
719 ret
720 %endmacro
721
722 %macro BRANCH_INSTR 0-*
723 %rep %0
724 %macro %1 1-2 %1
725 %2 %1
726 %%branch_instr:
727 %xdefine last_branch_adr %%branch_instr
728 %endmacro
729 %rotate 1
730 %endrep
731 %endmacro
732
733 BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp
734
735 %macro TAIL_CALL 2 ; callee, is_nonadjacent
736 %if has_epilogue
737 call %1
738 RET
739 %elif %2
740 jmp %1
741 %endif
742 %endmacro
743
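A brief sketch of where REP_RET still has to be written by hand given the automation above: AUTO_REP_RET recognizes a ret that directly follows a branch instruction, but it cannot see that a label is a branch target, so that case keeps the explicit macro (hypothetical fragment):

    test    r1d, r1d
    jz      .done                 ; branches straight to the function's return
    ; ... main work ...
.done:
    REP_RET                       ; .done is a branch target AUTO_REP_RET cannot detect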
593 ;============================================================================= 744 ;=============================================================================
594 ; arch-independent part 745 ; arch-independent part
595 ;============================================================================= 746 ;=============================================================================
596 747
597 %assign function_align 16 748 %assign function_align 16
598 749
599 ; Begin a function. 750 ; Begin a function.
 600 ; Applies any symbol mangling needed for C linkage, and sets up a define such that 751 ; Applies any symbol mangling needed for C linkage, and sets up a define such that
 601 ; subsequent uses of the function name automatically refer to the mangled version. 752 ; subsequent uses of the function name automatically refer to the mangled version.
602 ; Appends cpuflags to the function name if cpuflags has been specified. 753 ; Appends cpuflags to the function name if cpuflags has been specified.
603 %macro cglobal 1-2+ ; name, [PROLOGUE args] 754 ; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX
604 %if %0 == 1 755 ; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2).
605 cglobal_internal %1 %+ SUFFIX 756 %macro cglobal 1-2+ "" ; name, [PROLOGUE args]
606 %else 757 cglobal_internal 1, %1 %+ SUFFIX, %2
607 cglobal_internal %1 %+ SUFFIX, %2
608 %endif
609 %endmacro 758 %endmacro
610 %macro cglobal_internal 1-2+ 759 %macro cvisible 1-2+ "" ; name, [PROLOGUE args]
611 %ifndef cglobaled_%1 760 cglobal_internal 0, %1 %+ SUFFIX, %2
612 %xdefine %1 mangle(program_name %+ _ %+ %1) 761 %endmacro
613 %xdefine %1.skip_prologue %1 %+ .skip_prologue 762 %macro cglobal_internal 2-3+
614 CAT_XDEFINE cglobaled_, %1, 1 763 %if %1
615 %endif 764 %xdefine %%FUNCTION_PREFIX private_prefix
616 %xdefine current_function %1 765 ; libvpx explicitly sets visibility in shared object builds. Avoid
617 %ifdef CHROMIUM 766 ; setting visibility to hidden as it may break builds that split
618 %ifidn __OUTPUT_FORMAT__,elf 767 ; sources on e.g., directory boundaries.
619 global %1:function hidden 768 %ifdef CHROMIUM
620 %elifidn __OUTPUT_FORMAT__,elf32 769 %xdefine %%VISIBILITY hidden
621 global %1:function hidden
622 %elifidn __OUTPUT_FORMAT__,elf64
623 global %1:function hidden
624 %elifidn __OUTPUT_FORMAT__,macho32
625 %ifdef __NASM_VER__
626 global %1
627 %else
628 global %1:private_extern
629 %endif
630 %elifidn __OUTPUT_FORMAT__,macho64
631 %ifdef __NASM_VER__
632 global %1
633 %else
634 global %1:private_extern
635 %endif
636 %else 770 %else
637 global %1 771 %xdefine %%VISIBILITY
638 %endif 772 %endif
639 %else 773 %else
640 global %1 774 %xdefine %%FUNCTION_PREFIX public_prefix
775 %xdefine %%VISIBILITY
776 %endif
777 %ifndef cglobaled_%2
778 %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2)
779 %xdefine %2.skip_prologue %2 %+ .skip_prologue
780 CAT_XDEFINE cglobaled_, %2, 1
781 %endif
782 %xdefine current_function %2
783 %ifidn __OUTPUT_FORMAT__,elf32
784 global %2:function %%VISIBILITY
785 %elifidn __OUTPUT_FORMAT__,elf64
786 global %2:function %%VISIBILITY
787 %elifidn __OUTPUT_FORMAT__,macho32
788 %ifdef __NASM_VER__
789 global %2
790 %else
791 global %2:private_extern
792 %endif
793 %elifidn __OUTPUT_FORMAT__,macho64
794 %ifdef __NASM_VER__
795 global %2
796 %else
797 global %2:private_extern
798 %endif
799 %else
800 global %2
641 %endif 801 %endif
642 align function_align 802 align function_align
643 %1: 803 %2:
 644 RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer 804 RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer
645 %assign stack_offset 0 805 %xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
 646 %if %0 > 1 806 %assign stack_offset 0 ; stack pointer offset relative to the return address
647 PROLOGUE %2 807 %assign stack_size 0 ; amount of stack space that can be freely used inside a function
 808 %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
 809 %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
810 %ifnidn %3, ""
811 PROLOGUE %3
648 %endif 812 %endif
649 %endmacro 813 %endmacro
650 814
651 %macro cextern 1 815 %macro cextern 1
652 %xdefine %1 mangle(program_name %+ _ %+ %1) 816 %xdefine %1 mangle(private_prefix %+ _ %+ %1)
653 CAT_XDEFINE cglobaled_, %1, 1 817 CAT_XDEFINE cglobaled_, %1, 1
654 extern %1 818 extern %1
655 %endmacro 819 %endmacro
656 820
657 ; like cextern, but without the prefix 821 ; like cextern, but without the prefix
658 %macro cextern_naked 1 822 %macro cextern_naked 1
659 %xdefine %1 mangle(%1) 823 %xdefine %1 mangle(%1)
660 CAT_XDEFINE cglobaled_, %1, 1 824 CAT_XDEFINE cglobaled_, %1, 1
661 extern %1 825 extern %1
662 %endmacro 826 %endmacro
663 827
664 %macro const 2+ 828 %macro const 1-2+
665 %xdefine %1 mangle(program_name %+ _ %+ %1) 829 %xdefine %1 mangle(private_prefix %+ _ %+ %1)
666 global %1 830 %ifidn __OUTPUT_FORMAT__,elf32
831 global %1:data hidden
832 %elifidn __OUTPUT_FORMAT__,elf64
833 global %1:data hidden
834 %else
835 global %1
836 %endif
667 %1: %2 837 %1: %2
668 %endmacro 838 %endmacro
669 839
670 ; This is needed for ELF, otherwise the GNU linker assumes the stack is 840 ; This is needed for ELF, otherwise the GNU linker assumes the stack is
671 ; executable by default. 841 ; executable by default.
672 %ifidn __OUTPUT_FORMAT__,elf 842 %ifidn __OUTPUT_FORMAT__,elf32
673 SECTION .note.GNU-stack noalloc noexec nowrite progbits
674 %elifidn __OUTPUT_FORMAT__,elf32
675 SECTION .note.GNU-stack noalloc noexec nowrite progbits 843 SECTION .note.GNU-stack noalloc noexec nowrite progbits
676 %elifidn __OUTPUT_FORMAT__,elf64 844 %elifidn __OUTPUT_FORMAT__,elf64
677 SECTION .note.GNU-stack noalloc noexec nowrite progbits 845 SECTION .note.GNU-stack noalloc noexec nowrite progbits
678 %endif 846 %endif
679 847
680 ; cpuflags 848 ; cpuflags
681 849
682 %assign cpuflags_mmx (1<<0) 850 %assign cpuflags_mmx (1<<0)
683 %assign cpuflags_mmx2 (1<<1) | cpuflags_mmx 851 %assign cpuflags_mmx2 (1<<1) | cpuflags_mmx
684 %assign cpuflags_3dnow (1<<2) | cpuflags_mmx 852 %assign cpuflags_3dnow (1<<2) | cpuflags_mmx
685 %assign cpuflags_3dnow2 (1<<3) | cpuflags_3dnow 853 %assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
686 %assign cpuflags_sse (1<<4) | cpuflags_mmx2 854 %assign cpuflags_sse (1<<4) | cpuflags_mmx2
687 %assign cpuflags_sse2 (1<<5) | cpuflags_sse 855 %assign cpuflags_sse2 (1<<5) | cpuflags_sse
688 %assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 856 %assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
689 %assign cpuflags_sse3 (1<<7) | cpuflags_sse2 857 %assign cpuflags_sse3 (1<<7) | cpuflags_sse2
690 %assign cpuflags_ssse3 (1<<8) | cpuflags_sse3 858 %assign cpuflags_ssse3 (1<<8) | cpuflags_sse3
691 %assign cpuflags_sse4 (1<<9) | cpuflags_ssse3 859 %assign cpuflags_sse4 (1<<9) | cpuflags_ssse3
692 %assign cpuflags_sse42 (1<<10)| cpuflags_sse4 860 %assign cpuflags_sse42 (1<<10)| cpuflags_sse4
693 %assign cpuflags_avx (1<<11)| cpuflags_sse42 861 %assign cpuflags_avx (1<<11)| cpuflags_sse42
694 %assign cpuflags_xop (1<<12)| cpuflags_avx 862 %assign cpuflags_xop (1<<12)| cpuflags_avx
695 %assign cpuflags_fma4 (1<<13)| cpuflags_avx 863 %assign cpuflags_fma4 (1<<13)| cpuflags_avx
864 %assign cpuflags_fma3 (1<<14)| cpuflags_avx
865 %assign cpuflags_avx2 (1<<15)| cpuflags_fma3
696 866
697 %assign cpuflags_cache32 (1<<16) 867 %assign cpuflags_cache32 (1<<16)
698 %assign cpuflags_cache64 (1<<17) 868 %assign cpuflags_cache64 (1<<17)
699 %assign cpuflags_slowctz (1<<18) 869 %assign cpuflags_slowctz (1<<18)
700 %assign cpuflags_lzcnt (1<<19) 870 %assign cpuflags_lzcnt (1<<19)
701 %assign cpuflags_misalign (1<<20) 871 %assign cpuflags_aligned (1<<20) ; not a cpu feature, but a function variant
702 %assign cpuflags_aligned (1<<21) ; not a cpu feature, but a function variant 872 %assign cpuflags_atom (1<<21)
703 %assign cpuflags_atom (1<<22) 873 %assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt
874 %assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1
704 875
705 %define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x)) 876 %define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
706 %define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x)) 877 %define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
707 878
708 ; Takes up to 2 cpuflags from the above list. 879 ; Takes an arbitrary number of cpuflags from the above list.
 709 ; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu. 880 ; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu.
 710 ; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co. 881 ; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
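A hedged example of how these flags are normally reached through INIT_XMM rather than by invoking INIT_CPUFLAGS directly (the function name is made up for illustration):

    INIT_XMM ssse3                    ; sets cpuflags and appends _ssse3 to function names
    cglobal abs_row, 1, 1, 1, src
        mova    m0, [srcq]
    %if cpuflag(ssse3)                ; guards instructions above the build's baseline
        pabsw   m0, m0
    %endif
        RET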
711 %macro INIT_CPUFLAGS 0-2 882 %macro INIT_CPUFLAGS 0-*
883 %xdefine SUFFIX
884 %undef cpuname
885 %assign cpuflags 0
886
712 %if %0 >= 1 887 %if %0 >= 1
713 %xdefine cpuname %1 888 %rep %0
714 %assign cpuflags cpuflags_%1 889 %ifdef cpuname
715 %if %0 >= 2 890 %xdefine cpuname cpuname %+ _%1
716 %xdefine cpuname %1_%2 891 %else
717 %assign cpuflags cpuflags | cpuflags_%2 892 %xdefine cpuname %1
718 %endif 893 %endif
894 %assign cpuflags cpuflags | cpuflags_%1
895 %rotate 1
896 %endrep
719 %xdefine SUFFIX _ %+ cpuname 897 %xdefine SUFFIX _ %+ cpuname
898
720 %if cpuflag(avx) 899 %if cpuflag(avx)
721 %assign avx_enabled 1 900 %assign avx_enabled 1
722 %endif 901 %endif
 723 %if mmsize == 16 && notcpuflag(sse2) 902 %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2))
724 %define mova movaps 903 %define mova movaps
725 %define movu movups 904 %define movu movups
726 %define movnta movntps 905 %define movnta movntps
727 %endif 906 %endif
728 %if cpuflag(aligned) 907 %if cpuflag(aligned)
729 %define movu mova 908 %define movu mova
730 %elifidn %1, sse3 909 %elif cpuflag(sse3) && notcpuflag(ssse3)
731 %define movu lddqu 910 %define movu lddqu
732 %endif 911 %endif
912 %endif
913
914 %ifdef __NASM_VER__
915 %use smartalign
916 ALIGNMODE k7
917 %elif ARCH_X86_64 || cpuflag(sse2)
918 CPU amdnop
733 %else 919 %else
734 %xdefine SUFFIX 920 CPU basicnop
735 %undef cpuname
736 %undef cpuflags
737 %endif 921 %endif
738 %endmacro 922 %endmacro
739 923
740 ; merge mmx and sse* 924 ; Merge mmx and sse*
925 ; m# is a simd register of the currently selected size
 926 ; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
 927 ; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
928 ; (All 3 remain in sync through SWAP.)
741 929
742 %macro CAT_XDEFINE 3 930 %macro CAT_XDEFINE 3
743 %xdefine %1%2 %3 931 %xdefine %1%2 %3
744 %endmacro 932 %endmacro
745 933
746 %macro CAT_UNDEF 2 934 %macro CAT_UNDEF 2
747 %undef %1%2 935 %undef %1%2
748 %endmacro 936 %endmacro
749 937
750 %macro INIT_MMX 0-1+ 938 %macro INIT_MMX 0-1+
751 %assign avx_enabled 0 939 %assign avx_enabled 0
752 %define RESET_MM_PERMUTATION INIT_MMX %1 940 %define RESET_MM_PERMUTATION INIT_MMX %1
753 %define mmsize 8 941 %define mmsize 8
754 %define num_mmregs 8 942 %define num_mmregs 8
755 %define mova movq 943 %define mova movq
756 %define movu movq 944 %define movu movq
757 %define movh movd 945 %define movh movd
758 %define movnta movntq 946 %define movnta movntq
759 %assign %%i 0 947 %assign %%i 0
760 %rep 8 948 %rep 8
761 CAT_XDEFINE m, %%i, mm %+ %%i 949 CAT_XDEFINE m, %%i, mm %+ %%i
762 CAT_XDEFINE nmm, %%i, %%i 950 CAT_XDEFINE nnmm, %%i, %%i
763 %assign %%i %%i+1 951 %assign %%i %%i+1
764 %endrep 952 %endrep
765 %rep 8 953 %rep 8
766 CAT_UNDEF m, %%i 954 CAT_UNDEF m, %%i
767 CAT_UNDEF nmm, %%i 955 CAT_UNDEF nnmm, %%i
768 %assign %%i %%i+1 956 %assign %%i %%i+1
769 %endrep 957 %endrep
770 INIT_CPUFLAGS %1 958 INIT_CPUFLAGS %1
771 %endmacro 959 %endmacro
772 960
773 %macro INIT_XMM 0-1+ 961 %macro INIT_XMM 0-1+
774 %assign avx_enabled 0 962 %assign avx_enabled 0
775 %define RESET_MM_PERMUTATION INIT_XMM %1 963 %define RESET_MM_PERMUTATION INIT_XMM %1
776 %define mmsize 16 964 %define mmsize 16
777 %define num_mmregs 8 965 %define num_mmregs 8
778 %if ARCH_X86_64 966 %if ARCH_X86_64
779 %define num_mmregs 16 967 %define num_mmregs 16
780 %endif 968 %endif
781 %define mova movdqa 969 %define mova movdqa
782 %define movu movdqu 970 %define movu movdqu
783 %define movh movq 971 %define movh movq
784 %define movnta movntdq 972 %define movnta movntdq
785 %assign %%i 0 973 %assign %%i 0
786 %rep num_mmregs 974 %rep num_mmregs
787 CAT_XDEFINE m, %%i, xmm %+ %%i 975 CAT_XDEFINE m, %%i, xmm %+ %%i
788 CAT_XDEFINE nxmm, %%i, %%i 976 CAT_XDEFINE nnxmm, %%i, %%i
789 %assign %%i %%i+1 977 %assign %%i %%i+1
790 %endrep 978 %endrep
791 INIT_CPUFLAGS %1 979 INIT_CPUFLAGS %1
792 %endmacro 980 %endmacro
793 981
794 ; FIXME: INIT_AVX can be replaced by INIT_XMM avx
795 %macro INIT_AVX 0
796 INIT_XMM
797 %assign avx_enabled 1
798 %define PALIGNR PALIGNR_SSSE3
799 %define RESET_MM_PERMUTATION INIT_AVX
800 %endmacro
801
802 %macro INIT_YMM 0-1+ 982 %macro INIT_YMM 0-1+
803 %assign avx_enabled 1 983 %assign avx_enabled 1
804 %define RESET_MM_PERMUTATION INIT_YMM %1 984 %define RESET_MM_PERMUTATION INIT_YMM %1
805 %define mmsize 32 985 %define mmsize 32
806 %define num_mmregs 8 986 %define num_mmregs 8
807 %if ARCH_X86_64 987 %if ARCH_X86_64
808 %define num_mmregs 16 988 %define num_mmregs 16
809 %endif 989 %endif
810 %define mova vmovaps 990 %define mova movdqa
811 %define movu vmovups 991 %define movu movdqu
812 %undef movh 992 %undef movh
813 %define movnta vmovntps 993 %define movnta movntdq
814 %assign %%i 0 994 %assign %%i 0
815 %rep num_mmregs 995 %rep num_mmregs
816 CAT_XDEFINE m, %%i, ymm %+ %%i 996 CAT_XDEFINE m, %%i, ymm %+ %%i
817 CAT_XDEFINE nymm, %%i, %%i 997 CAT_XDEFINE nnymm, %%i, %%i
818 %assign %%i %%i+1 998 %assign %%i %%i+1
819 %endrep 999 %endrep
820 INIT_CPUFLAGS %1 1000 INIT_CPUFLAGS %1
821 %endmacro 1001 %endmacro
822 1002
823 INIT_XMM 1003 INIT_XMM
824 1004
1005 %macro DECLARE_MMCAST 1
1006 %define mmmm%1 mm%1
1007 %define mmxmm%1 mm%1
1008 %define mmymm%1 mm%1
1009 %define xmmmm%1 mm%1
1010 %define xmmxmm%1 xmm%1
1011 %define xmmymm%1 xmm%1
1012 %define ymmmm%1 mm%1
1013 %define ymmxmm%1 xmm%1
1014 %define ymmymm%1 ymm%1
1015 %define xm%1 xmm %+ m%1
1016 %define ym%1 ymm %+ m%1
1017 %endmacro
1018
1019 %assign i 0
1020 %rep 16
1021 DECLARE_MMCAST i
1022 %assign i i+1
1023 %endrep
1024
825 ; I often want to use macros that permute their arguments. e.g. there's no 1025 ; I often want to use macros that permute their arguments. e.g. there's no
826 ; efficient way to implement butterfly or transpose or dct without swapping some 1026 ; efficient way to implement butterfly or transpose or dct without swapping some
827 ; arguments. 1027 ; arguments.
828 ; 1028 ;
829 ; I would like to not have to manually keep track of the permutations: 1029 ; I would like to not have to manually keep track of the permutations:
830 ; If I insert a permutation in the middle of a function, it should automatically 1030 ; If I insert a permutation in the middle of a function, it should automatically
 831 ; change everything that follows. For more complex macros I may also have multiple 1031 ; change everything that follows. For more complex macros I may also have multiple
 832 ; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. 1032 ; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
833 ; 1033 ;
834 ; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that 1034 ; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
835 ; permutes its arguments. It's equivalent to exchanging the contents of the 1035 ; permutes its arguments. It's equivalent to exchanging the contents of the
836 ; registers, except that this way you exchange the register names instead, so it 1036 ; registers, except that this way you exchange the register names instead, so it
837 ; doesn't cost any cycles. 1037 ; doesn't cost any cycles.
838 1038
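For instance (a sketch, not from the patch), renaming registers costs no instructions:

    SWAP    0, 2                  ; from here on, m0 names the old m2 and vice versa
    SWAP    m1, m3                ; name-based form, resolved via the tracked permutation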
839 %macro PERMUTE 2-* ; takes a list of pairs to swap 1039 %macro PERMUTE 2-* ; takes a list of pairs to swap
840 %rep %0/2 1040 %rep %0/2
841 %xdefine tmp%2 m%2 1041 %xdefine %%tmp%2 m%2
842 %xdefine ntmp%2 nm%2
843 %rotate 2 1042 %rotate 2
844 %endrep 1043 %endrep
845 %rep %0/2 1044 %rep %0/2
846 %xdefine m%1 tmp%2 1045 %xdefine m%1 %%tmp%2
847 %xdefine nm%1 ntmp%2 1046 CAT_XDEFINE nn, m%1, %1
848 %undef tmp%2
849 %undef ntmp%2
850 %rotate 2 1047 %rotate 2
851 %endrep 1048 %endrep
852 %endmacro 1049 %endmacro
853 1050
854 %macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs) 1051 %macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs)
855 %rep %0-1 1052 %ifnum %1 ; SWAP 0, 1, ...
856 %ifdef m%1 1053 SWAP_INTERNAL_NUM %1, %2
857 %xdefine tmp m%1 1054 %else ; SWAP m0, m1, ...
858 %xdefine m%1 m%2 1055 SWAP_INTERNAL_NAME %1, %2
859 %xdefine m%2 tmp
860 CAT_XDEFINE n, m%1, %1
861 CAT_XDEFINE n, m%2, %2
862 %else
 863 ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
864 ; Be careful using this mode in nested macros though, as in some cases there may be
 865 ; other copies of m# that have already been dereferenced and don't get updated correctly.
866 %xdefine %%n1 n %+ %1
867 %xdefine %%n2 n %+ %2
868 %xdefine tmp m %+ %%n1
869 CAT_XDEFINE m, %%n1, m %+ %%n2
870 CAT_XDEFINE m, %%n2, tmp
871 CAT_XDEFINE n, m %+ %%n1, %%n1
872 CAT_XDEFINE n, m %+ %%n2, %%n2
873 %endif 1056 %endif
874 %undef tmp 1057 %endmacro
1058
1059 %macro SWAP_INTERNAL_NUM 2-*
1060 %rep %0-1
1061 %xdefine %%tmp m%1
1062 %xdefine m%1 m%2
1063 %xdefine m%2 %%tmp
1064 CAT_XDEFINE nn, m%1, %1
1065 CAT_XDEFINE nn, m%2, %2
875 %rotate 1 1066 %rotate 1
876 %endrep 1067 %endrep
1068 %endmacro
1069
1070 %macro SWAP_INTERNAL_NAME 2-*
1071 %xdefine %%args nn %+ %1
1072 %rep %0-1
1073 %xdefine %%args %%args, nn %+ %2
1074 %rotate 1
1075 %endrep
1076 SWAP_INTERNAL_NUM %%args
877 %endmacro 1077 %endmacro
878 1078
879 ; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later 1079 ; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
880 ; calls to that function will automatically load the permutation, so values can 1080 ; calls to that function will automatically load the permutation, so values can
881 ; be returned in mmregs. 1081 ; be returned in mmregs.
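A rough sketch of that pattern (hypothetical helper; the overridden call macro below reloads the saved permutation at each call site):

    cglobal do_transform
        ; ... computes a result in whichever register is currently named m0 ...
        SWAP    0, 7
        SAVE_MM_PERMUTATION           ; records this function's final permutation
        ret

        ; elsewhere:
        call    do_transform          ; reloads the saved permutation, so m0 here
        mova    [r0], m0              ; names the register the callee left its result in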
882 %macro SAVE_MM_PERMUTATION 0-1 1082 %macro SAVE_MM_PERMUTATION 0-1
883 %if %0 1083 %if %0
884 %xdefine %%f %1_m 1084 %xdefine %%f %1_m
885 %else 1085 %else
886 %xdefine %%f current_function %+ _m 1086 %xdefine %%f current_function %+ _m
887 %endif 1087 %endif
888 %assign %%i 0 1088 %assign %%i 0
889 %rep num_mmregs 1089 %rep num_mmregs
890 CAT_XDEFINE %%f, %%i, m %+ %%i 1090 CAT_XDEFINE %%f, %%i, m %+ %%i
891 %assign %%i %%i+1 1091 %assign %%i %%i+1
892 %endrep 1092 %endrep
893 %endmacro 1093 %endmacro
894 1094
895 %macro LOAD_MM_PERMUTATION 1 ; name to load from 1095 %macro LOAD_MM_PERMUTATION 1 ; name to load from
896 %ifdef %1_m0 1096 %ifdef %1_m0
897 %assign %%i 0 1097 %assign %%i 0
898 %rep num_mmregs 1098 %rep num_mmregs
899 CAT_XDEFINE m, %%i, %1_m %+ %%i 1099 CAT_XDEFINE m, %%i, %1_m %+ %%i
900 CAT_XDEFINE n, m %+ %%i, %%i 1100 CAT_XDEFINE nn, m %+ %%i, %%i
901 %assign %%i %%i+1 1101 %assign %%i %%i+1
902 %endrep 1102 %endrep
903 %endif 1103 %endif
904 %endmacro 1104 %endmacro
905 1105
 906 ; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't 1106 ; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
907 %macro call 1 1107 %macro call 1
908 call_internal %1, %1 %+ SUFFIX 1108 call_internal %1, %1 %+ SUFFIX
909 %endmacro 1109 %endmacro
910 %macro call_internal 2 1110 %macro call_internal 2
(...skipping 40 matching lines...)
951 %rep 16 1151 %rep 16
952 %if i < 8 1152 %if i < 8
953 CAT_XDEFINE sizeofmm, i, 8 1153 CAT_XDEFINE sizeofmm, i, 8
954 %endif 1154 %endif
955 CAT_XDEFINE sizeofxmm, i, 16 1155 CAT_XDEFINE sizeofxmm, i, 16
956 CAT_XDEFINE sizeofymm, i, 32 1156 CAT_XDEFINE sizeofymm, i, 32
957 %assign i i+1 1157 %assign i i+1
958 %endrep 1158 %endrep
959 %undef i 1159 %undef i
960 1160
1161 %macro CHECK_AVX_INSTR_EMU 3-*
1162 %xdefine %%opcode %1
1163 %xdefine %%dst %2
1164 %rep %0-2
1165 %ifidn %%dst, %3
1166 %error non-avx emulation of ``%%opcode'' is not supported
1167 %endif
1168 %rotate 1
1169 %endrep
1170 %endmacro
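; [illustrative note, not part of the patch] CHECK_AVX_INSTR_EMU {paddw m0, m1, m2}, m0, m2
; raises the %error above only if m0 and m2 name the same register, since the
; emulating mov into the destination would clobber that source operand.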
1171
961 ;%1 == instruction 1172 ;%1 == instruction
962 ;%2 == 1 if float, 0 if int 1173 ;%2 == minimal instruction set
963 ;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm) 1174 ;%3 == 1 if float, 0 if int
964 ;%4 == number of operands given 1175 ;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
965 ;%5+: operands 1176 ;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
966 %macro RUN_AVX_INSTR 6-7+ 1177 ;%6+: operands
967 %ifid %5 1178 %macro RUN_AVX_INSTR 6-9+
968 %define %%size sizeof%5 1179 %ifnum sizeof%7
1180 %assign __sizeofreg sizeof%7
1181 %elifnum sizeof%6
1182 %assign __sizeofreg sizeof%6
969 %else 1183 %else
970 %define %%size mmsize 1184 %assign __sizeofreg mmsize
971 %endif 1185 %endif
972 %if %%size==32 1186 %assign __emulate_avx 0
973 %if %0 >= 7 1187 %if avx_enabled && __sizeofreg >= 16
974 v%1 %5, %6, %7 1188 %xdefine __instr v%1
1189 %else
1190 %xdefine __instr %1
1191 %if %0 >= 8+%4
1192 %assign __emulate_avx 1
1193 %endif
1194 %endif
1195 %ifnidn %2, fnord
1196 %ifdef cpuname
1197 %if notcpuflag(%2)
1198 %error use of ``%1'' %2 instruction in cpuname function: current_function
1199 %elif cpuflags_%2 < cpuflags_sse && notcpuflag(sse2) && __sizeofreg > 8
1200 %error use of ``%1'' sse2 instruction in cpuname function: current_function
1201 %endif
1202 %endif
1203 %endif
1204
1205 %if __emulate_avx
1206 %xdefine __src1 %7
1207 %xdefine __src2 %8
1208 %ifnidn %6, %7
1209 %if %0 >= 9
1210 CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, %8, %9
1211 %else
1212 CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, %8
1213 %endif
1214 %if %5 && %4 == 0
1215 %ifnid %8
1216 ; 3-operand AVX instructions with a memory arg can only have it in src2,
1217 ; whereas SSE emulation prefers to have it in src1 (i.e. the mov).
1218 ; So, if the instruction is commutative with a memory arg, swap them.
1219 %xdefine __src1 %8
1220 %xdefine __src2 %7
1221 %endif
1222 %endif
1223 %if __sizeofreg == 8
1224 MOVQ %6, __src1
1225 %elif %3
1226 MOVAPS %6, __src1
1227 %else
1228 MOVDQA %6, __src1
1229 %endif
1230 %endif
1231 %if %0 >= 9
1232 %1 %6, __src2, %9
975 %else 1233 %else
976 v%1 %5, %6 1234 %1 %6, __src2
977 %endif 1235 %endif
1236 %elif %0 >= 9
1237 __instr %6, %7, %8, %9
1238 %elif %0 == 8
1239 __instr %6, %7, %8
1240 %elif %0 == 7
1241 __instr %6, %7
978 %else 1242 %else
979 %if %%size==8 1243 __instr %6
980 %define %%regmov movq
981 %elif %2
982 %define %%regmov movaps
983 %else
984 %define %%regmov movdqa
985 %endif
986
987 %if %4>=3+%3
988 %ifnidn %5, %6
989 %if avx_enabled && sizeof%5==16
990 v%1 %5, %6, %7
991 %else
992 %%regmov %5, %6
993 %1 %5, %7
994 %endif
995 %else
996 %1 %5, %7
997 %endif
998 %elif %3
999 %1 %5, %6, %7
1000 %else
1001 %1 %5, %6
1002 %endif
1003 %endif 1244 %endif
1004 %endmacro 1245 %endmacro
1005 1246
1006 ; 3arg AVX ops with a memory arg can only have it in src2,
1007 ; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov).
1008 ; So, if the op is symmetric and the wrong one is memory, swap them.
1009 %macro RUN_AVX_INSTR1 8
1010 %assign %%swap 0
1011 %if avx_enabled
1012 %ifnid %6
1013 %assign %%swap 1
1014 %endif
1015 %elifnidn %5, %6
1016 %ifnid %7
1017 %assign %%swap 1
1018 %endif
1019 %endif
1020 %if %%swap && %3 == 0 && %8 == 1
1021 RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6
1022 %else
1023 RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7
1024 %endif
1025 %endmacro
1026
1027 ;%1 == instruction 1247 ;%1 == instruction
1028 ;%2 == 1 if float, 0 if int 1248 ;%2 == minimal instruction set
1029 ;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 3-operand (xmm, xmm, xmm) 1249 ;%3 == 1 if float, 0 if int
1030 ;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not 1250 ;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
1031 %macro AVX_INSTR 4 1251 ;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
1032 %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4 1252 %macro AVX_INSTR 1-5 fnord, 0, 1, 0
1033 %ifidn %3, fnord 1253 %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5
1034 RUN_AVX_INSTR %6, %7, %8, 2, %1, %2 1254 %ifidn %2, fnord
1255 RUN_AVX_INSTR %6, %7, %8, %9, %10, %1
1256 %elifidn %3, fnord
1257 RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2
1035 %elifidn %4, fnord 1258 %elifidn %4, fnord
1036 RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9 1259 RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3
1037 %elifidn %5, fnord 1260 %elifidn %5, fnord
1038 RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4 1261 RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4
1039 %else 1262 %else
1040 RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5 1263 RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5
1041 %endif 1264 %endif
1042 %endmacro 1265 %endmacro
1043 %endmacro 1266 %endmacro
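; [illustrative note, not part of the patch] With the table below, e.g.
; "AVX_INSTR paddw, mmx, 0, 0, 1" makes "paddw m0, m1, m2" emit
; "vpaddw m0, m1, m2" when avx_enabled, and otherwise the SSE emulation
; "movdqa m0, m1" + "paddw m0, m2"; because paddw is flagged commutative,
; a memory source is swapped into the initial mov when that is legal.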
1044 1267
1045 AVX_INSTR addpd, 1, 0, 1 1268 ; Instructions with both VEX and non-VEX encodings
1046 AVX_INSTR addps, 1, 0, 1 1269 ; Non-destructive instructions are written without parameters
1047 AVX_INSTR addsd, 1, 0, 1 1270 AVX_INSTR addpd, sse2, 1, 0, 1
1048 AVX_INSTR addss, 1, 0, 1 1271 AVX_INSTR addps, sse, 1, 0, 1
1049 AVX_INSTR addsubpd, 1, 0, 0 1272 AVX_INSTR addsd, sse2, 1, 0, 1
1050 AVX_INSTR addsubps, 1, 0, 0 1273 AVX_INSTR addss, sse, 1, 0, 1
1051 AVX_INSTR andpd, 1, 0, 1 1274 AVX_INSTR addsubpd, sse3, 1, 0, 0
1052 AVX_INSTR andps, 1, 0, 1 1275 AVX_INSTR addsubps, sse3, 1, 0, 0
1053 AVX_INSTR andnpd, 1, 0, 0 1276 AVX_INSTR aesdec, fnord, 0, 0, 0
1054 AVX_INSTR andnps, 1, 0, 0 1277 AVX_INSTR aesdeclast, fnord, 0, 0, 0
1055 AVX_INSTR blendpd, 1, 0, 0 1278 AVX_INSTR aesenc, fnord, 0, 0, 0
1056 AVX_INSTR blendps, 1, 0, 0 1279 AVX_INSTR aesenclast, fnord, 0, 0, 0
1057 AVX_INSTR blendvpd, 1, 0, 0 1280 AVX_INSTR aesimc
1058 AVX_INSTR blendvps, 1, 0, 0 1281 AVX_INSTR aeskeygenassist
1059 AVX_INSTR cmppd, 1, 0, 0 1282 AVX_INSTR andnpd, sse2, 1, 0, 0
1060 AVX_INSTR cmpps, 1, 0, 0 1283 AVX_INSTR andnps, sse, 1, 0, 0
1061 AVX_INSTR cmpsd, 1, 0, 0 1284 AVX_INSTR andpd, sse2, 1, 0, 1
1062 AVX_INSTR cmpss, 1, 0, 0 1285 AVX_INSTR andps, sse, 1, 0, 1
1063 AVX_INSTR cvtdq2ps, 1, 0, 0 1286 AVX_INSTR blendpd, sse4, 1, 0, 0
1064 AVX_INSTR cvtps2dq, 1, 0, 0 1287 AVX_INSTR blendps, sse4, 1, 0, 0
1065 AVX_INSTR divpd, 1, 0, 0 1288 AVX_INSTR blendvpd, sse4, 1, 0, 0
1066 AVX_INSTR divps, 1, 0, 0 1289 AVX_INSTR blendvps, sse4, 1, 0, 0
1067 AVX_INSTR divsd, 1, 0, 0 1290 AVX_INSTR cmppd, sse2, 1, 1, 0
1068 AVX_INSTR divss, 1, 0, 0 1291 AVX_INSTR cmpps, sse, 1, 1, 0
1069 AVX_INSTR dppd, 1, 1, 0 1292 AVX_INSTR cmpsd, sse2, 1, 1, 0
1070 AVX_INSTR dpps, 1, 1, 0 1293 AVX_INSTR cmpss, sse, 1, 1, 0
1071 AVX_INSTR haddpd, 1, 0, 0 1294 AVX_INSTR comisd, sse2
1072 AVX_INSTR haddps, 1, 0, 0 1295 AVX_INSTR comiss, sse
1073 AVX_INSTR hsubpd, 1, 0, 0 1296 AVX_INSTR cvtdq2pd, sse2
1074 AVX_INSTR hsubps, 1, 0, 0 1297 AVX_INSTR cvtdq2ps, sse2
1075 AVX_INSTR maxpd, 1, 0, 1 1298 AVX_INSTR cvtpd2dq, sse2
1076 AVX_INSTR maxps, 1, 0, 1 1299 AVX_INSTR cvtpd2ps, sse2
1077 AVX_INSTR maxsd, 1, 0, 1 1300 AVX_INSTR cvtps2dq, sse2
1078 AVX_INSTR maxss, 1, 0, 1 1301 AVX_INSTR cvtps2pd, sse2
1079 AVX_INSTR minpd, 1, 0, 1 1302 AVX_INSTR cvtsd2si, sse2
1080 AVX_INSTR minps, 1, 0, 1 1303 AVX_INSTR cvtsd2ss, sse2
1081 AVX_INSTR minsd, 1, 0, 1 1304 AVX_INSTR cvtsi2sd, sse2
1082 AVX_INSTR minss, 1, 0, 1 1305 AVX_INSTR cvtsi2ss, sse
1083 AVX_INSTR movhlps, 1, 0, 0 1306 AVX_INSTR cvtss2sd, sse2
1084 AVX_INSTR movlhps, 1, 0, 0 1307 AVX_INSTR cvtss2si, sse
1085 AVX_INSTR movsd, 1, 0, 0 1308 AVX_INSTR cvttpd2dq, sse2
1086 AVX_INSTR movss, 1, 0, 0 1309 AVX_INSTR cvttps2dq, sse2
1087 AVX_INSTR mpsadbw, 0, 1, 0 1310 AVX_INSTR cvttsd2si, sse2
1088 AVX_INSTR mulpd, 1, 0, 1 1311 AVX_INSTR cvttss2si, sse
1089 AVX_INSTR mulps, 1, 0, 1 1312 AVX_INSTR divpd, sse2, 1, 0, 0
1090 AVX_INSTR mulsd, 1, 0, 1 1313 AVX_INSTR divps, sse, 1, 0, 0
1091 AVX_INSTR mulss, 1, 0, 1 1314 AVX_INSTR divsd, sse2, 1, 0, 0
1092 AVX_INSTR orpd, 1, 0, 1 1315 AVX_INSTR divss, sse, 1, 0, 0
1093 AVX_INSTR orps, 1, 0, 1 1316 AVX_INSTR dppd, sse4, 1, 1, 0
1094 AVX_INSTR packsswb, 0, 0, 0 1317 AVX_INSTR dpps, sse4, 1, 1, 0
1095 AVX_INSTR packssdw, 0, 0, 0 1318 AVX_INSTR extractps, sse4
1096 AVX_INSTR packuswb, 0, 0, 0 1319 AVX_INSTR haddpd, sse3, 1, 0, 0
1097 AVX_INSTR packusdw, 0, 0, 0 1320 AVX_INSTR haddps, sse3, 1, 0, 0
1098 AVX_INSTR paddb, 0, 0, 1 1321 AVX_INSTR hsubpd, sse3, 1, 0, 0
1099 AVX_INSTR paddw, 0, 0, 1 1322 AVX_INSTR hsubps, sse3, 1, 0, 0
1100 AVX_INSTR paddd, 0, 0, 1 1323 AVX_INSTR insertps, sse4, 1, 1, 0
1101 AVX_INSTR paddq, 0, 0, 1 1324 AVX_INSTR lddqu, sse3
1102 AVX_INSTR paddsb, 0, 0, 1 1325 AVX_INSTR ldmxcsr, sse
1103 AVX_INSTR paddsw, 0, 0, 1 1326 AVX_INSTR maskmovdqu, sse2
1104 AVX_INSTR paddusb, 0, 0, 1 1327 AVX_INSTR maxpd, sse2, 1, 0, 1
1105 AVX_INSTR paddusw, 0, 0, 1 1328 AVX_INSTR maxps, sse, 1, 0, 1
1106 AVX_INSTR palignr, 0, 1, 0 1329 AVX_INSTR maxsd, sse2, 1, 0, 1
1107 AVX_INSTR pand, 0, 0, 1 1330 AVX_INSTR maxss, sse, 1, 0, 1
1108 AVX_INSTR pandn, 0, 0, 0 1331 AVX_INSTR minpd, sse2, 1, 0, 1
1109 AVX_INSTR pavgb, 0, 0, 1 1332 AVX_INSTR minps, sse, 1, 0, 1
1110 AVX_INSTR pavgw, 0, 0, 1 1333 AVX_INSTR minsd, sse2, 1, 0, 1
1111 AVX_INSTR pblendvb, 0, 0, 0 1334 AVX_INSTR minss, sse, 1, 0, 1
1112 AVX_INSTR pblendw, 0, 1, 0 1335 AVX_INSTR movapd, sse2
1113 AVX_INSTR pcmpestri, 0, 0, 0 1336 AVX_INSTR movaps, sse
1114 AVX_INSTR pcmpestrm, 0, 0, 0 1337 AVX_INSTR movd, mmx
1115 AVX_INSTR pcmpistri, 0, 0, 0 1338 AVX_INSTR movddup, sse3
1116 AVX_INSTR pcmpistrm, 0, 0, 0 1339 AVX_INSTR movdqa, sse2
1117 AVX_INSTR pcmpeqb, 0, 0, 1 1340 AVX_INSTR movdqu, sse2
1118 AVX_INSTR pcmpeqw, 0, 0, 1 1341 AVX_INSTR movhlps, sse, 1, 0, 0
1119 AVX_INSTR pcmpeqd, 0, 0, 1 1342 AVX_INSTR movhpd, sse2, 1, 0, 0
1120 AVX_INSTR pcmpeqq, 0, 0, 1 1343 AVX_INSTR movhps, sse, 1, 0, 0
1121 AVX_INSTR pcmpgtb, 0, 0, 0 1344 AVX_INSTR movlhps, sse, 1, 0, 0
1122 AVX_INSTR pcmpgtw, 0, 0, 0 1345 AVX_INSTR movlpd, sse2, 1, 0, 0
1123 AVX_INSTR pcmpgtd, 0, 0, 0 1346 AVX_INSTR movlps, sse, 1, 0, 0
1124 AVX_INSTR pcmpgtq, 0, 0, 0 1347 AVX_INSTR movmskpd, sse2
1125 AVX_INSTR phaddw, 0, 0, 0 1348 AVX_INSTR movmskps, sse
1126 AVX_INSTR phaddd, 0, 0, 0 1349 AVX_INSTR movntdq, sse2
1127 AVX_INSTR phaddsw, 0, 0, 0 1350 AVX_INSTR movntdqa, sse4
1128 AVX_INSTR phsubw, 0, 0, 0 1351 AVX_INSTR movntpd, sse2
1129 AVX_INSTR phsubd, 0, 0, 0 1352 AVX_INSTR movntps, sse
1130 AVX_INSTR phsubsw, 0, 0, 0 1353 AVX_INSTR movq, mmx
1131 AVX_INSTR pmaddwd, 0, 0, 1 1354 AVX_INSTR movsd, sse2, 1, 0, 0
1132 AVX_INSTR pmaddubsw, 0, 0, 0 1355 AVX_INSTR movshdup, sse3
1133 AVX_INSTR pmaxsb, 0, 0, 1 1356 AVX_INSTR movsldup, sse3
1134 AVX_INSTR pmaxsw, 0, 0, 1 1357 AVX_INSTR movss, sse, 1, 0, 0
1135 AVX_INSTR pmaxsd, 0, 0, 1 1358 AVX_INSTR movupd, sse2
1136 AVX_INSTR pmaxub, 0, 0, 1 1359 AVX_INSTR movups, sse
1137 AVX_INSTR pmaxuw, 0, 0, 1 1360 AVX_INSTR mpsadbw, sse4
1138 AVX_INSTR pmaxud, 0, 0, 1 1361 AVX_INSTR mulpd, sse2, 1, 0, 1
1139 AVX_INSTR pminsb, 0, 0, 1 1362 AVX_INSTR mulps, sse, 1, 0, 1
1140 AVX_INSTR pminsw, 0, 0, 1 1363 AVX_INSTR mulsd, sse2, 1, 0, 1
1141 AVX_INSTR pminsd, 0, 0, 1 1364 AVX_INSTR mulss, sse, 1, 0, 1
1142 AVX_INSTR pminub, 0, 0, 1 1365 AVX_INSTR orpd, sse2, 1, 0, 1
1143 AVX_INSTR pminuw, 0, 0, 1 1366 AVX_INSTR orps, sse, 1, 0, 1
1144 AVX_INSTR pminud, 0, 0, 1 1367 AVX_INSTR pabsb, ssse3
1145 AVX_INSTR pmulhuw, 0, 0, 1 1368 AVX_INSTR pabsd, ssse3
1146 AVX_INSTR pmulhrsw, 0, 0, 1 1369 AVX_INSTR pabsw, ssse3
1147 AVX_INSTR pmulhw, 0, 0, 1 1370 AVX_INSTR packsswb, mmx, 0, 0, 0
1148 AVX_INSTR pmullw, 0, 0, 1 1371 AVX_INSTR packssdw, mmx, 0, 0, 0
1149 AVX_INSTR pmulld, 0, 0, 1 1372 AVX_INSTR packuswb, mmx, 0, 0, 0
1150 AVX_INSTR pmuludq, 0, 0, 1 1373 AVX_INSTR packusdw, sse4, 0, 0, 0
1151 AVX_INSTR pmuldq, 0, 0, 1 1374 AVX_INSTR paddb, mmx, 0, 0, 1
1152 AVX_INSTR por, 0, 0, 1 1375 AVX_INSTR paddw, mmx, 0, 0, 1
1153 AVX_INSTR psadbw, 0, 0, 1 1376 AVX_INSTR paddd, mmx, 0, 0, 1
1154 AVX_INSTR pshufb, 0, 0, 0 1377 AVX_INSTR paddq, sse2, 0, 0, 1
1155 AVX_INSTR psignb, 0, 0, 0 1378 AVX_INSTR paddsb, mmx, 0, 0, 1
1156 AVX_INSTR psignw, 0, 0, 0 1379 AVX_INSTR paddsw, mmx, 0, 0, 1
1157 AVX_INSTR psignd, 0, 0, 0 1380 AVX_INSTR paddusb, mmx, 0, 0, 1
1158 AVX_INSTR psllw, 0, 0, 0 1381 AVX_INSTR paddusw, mmx, 0, 0, 1
1159 AVX_INSTR pslld, 0, 0, 0 1382 AVX_INSTR palignr, ssse3
1160 AVX_INSTR psllq, 0, 0, 0 1383 AVX_INSTR pand, mmx, 0, 0, 1
1161 AVX_INSTR pslldq, 0, 0, 0 1384 AVX_INSTR pandn, mmx, 0, 0, 0
1162 AVX_INSTR psraw, 0, 0, 0 1385 AVX_INSTR pavgb, mmx2, 0, 0, 1
1163 AVX_INSTR psrad, 0, 0, 0 1386 AVX_INSTR pavgw, mmx2, 0, 0, 1
1164 AVX_INSTR psrlw, 0, 0, 0 1387 AVX_INSTR pblendvb, sse4, 0, 0, 0
1165 AVX_INSTR psrld, 0, 0, 0 1388 AVX_INSTR pblendw, sse4
1166 AVX_INSTR psrlq, 0, 0, 0 1389 AVX_INSTR pclmulqdq
1167 AVX_INSTR psrldq, 0, 0, 0 1390 AVX_INSTR pcmpestri, sse42
1168 AVX_INSTR psubb, 0, 0, 0 1391 AVX_INSTR pcmpestrm, sse42
1169 AVX_INSTR psubw, 0, 0, 0 1392 AVX_INSTR pcmpistri, sse42
1170 AVX_INSTR psubd, 0, 0, 0 1393 AVX_INSTR pcmpistrm, sse42
1171 AVX_INSTR psubq, 0, 0, 0 1394 AVX_INSTR pcmpeqb, mmx, 0, 0, 1
1172 AVX_INSTR psubsb, 0, 0, 0 1395 AVX_INSTR pcmpeqw, mmx, 0, 0, 1
1173 AVX_INSTR psubsw, 0, 0, 0 1396 AVX_INSTR pcmpeqd, mmx, 0, 0, 1
1174 AVX_INSTR psubusb, 0, 0, 0 1397 AVX_INSTR pcmpeqq, sse4, 0, 0, 1
1175 AVX_INSTR psubusw, 0, 0, 0 1398 AVX_INSTR pcmpgtb, mmx, 0, 0, 0
1176 AVX_INSTR punpckhbw, 0, 0, 0 1399 AVX_INSTR pcmpgtw, mmx, 0, 0, 0
1177 AVX_INSTR punpckhwd, 0, 0, 0 1400 AVX_INSTR pcmpgtd, mmx, 0, 0, 0
1178 AVX_INSTR punpckhdq, 0, 0, 0 1401 AVX_INSTR pcmpgtq, sse42, 0, 0, 0
1179 AVX_INSTR punpckhqdq, 0, 0, 0 1402 AVX_INSTR pextrb, sse4
1180 AVX_INSTR punpcklbw, 0, 0, 0 1403 AVX_INSTR pextrd, sse4
1181 AVX_INSTR punpcklwd, 0, 0, 0 1404 AVX_INSTR pextrq, sse4
1182 AVX_INSTR punpckldq, 0, 0, 0 1405 AVX_INSTR pextrw, mmx2
1183 AVX_INSTR punpcklqdq, 0, 0, 0 1406 AVX_INSTR phaddw, ssse3, 0, 0, 0
1184 AVX_INSTR pxor, 0, 0, 1 1407 AVX_INSTR phaddd, ssse3, 0, 0, 0
1185 AVX_INSTR shufps, 1, 1, 0 1408 AVX_INSTR phaddsw, ssse3, 0, 0, 0
1186 AVX_INSTR subpd, 1, 0, 0 1409 AVX_INSTR phminposuw, sse4
1187 AVX_INSTR subps, 1, 0, 0 1410 AVX_INSTR phsubw, ssse3, 0, 0, 0
1188 AVX_INSTR subsd, 1, 0, 0 1411 AVX_INSTR phsubd, ssse3, 0, 0, 0
1189 AVX_INSTR subss, 1, 0, 0 1412 AVX_INSTR phsubsw, ssse3, 0, 0, 0
1190 AVX_INSTR unpckhpd, 1, 0, 0 1413 AVX_INSTR pinsrb, sse4
1191 AVX_INSTR unpckhps, 1, 0, 0 1414 AVX_INSTR pinsrd, sse4
1192 AVX_INSTR unpcklpd, 1, 0, 0 1415 AVX_INSTR pinsrq, sse4
1193 AVX_INSTR unpcklps, 1, 0, 0 1416 AVX_INSTR pinsrw, mmx2
1194 AVX_INSTR xorpd, 1, 0, 1 1417 AVX_INSTR pmaddwd, mmx, 0, 0, 1
1195 AVX_INSTR xorps, 1, 0, 1 1418 AVX_INSTR pmaddubsw, ssse3, 0, 0, 0
1419 AVX_INSTR pmaxsb, sse4, 0, 0, 1
1420 AVX_INSTR pmaxsw, mmx2, 0, 0, 1
1421 AVX_INSTR pmaxsd, sse4, 0, 0, 1
1422 AVX_INSTR pmaxub, mmx2, 0, 0, 1
1423 AVX_INSTR pmaxuw, sse4, 0, 0, 1
1424 AVX_INSTR pmaxud, sse4, 0, 0, 1
1425 AVX_INSTR pminsb, sse4, 0, 0, 1
1426 AVX_INSTR pminsw, mmx2, 0, 0, 1
1427 AVX_INSTR pminsd, sse4, 0, 0, 1
1428 AVX_INSTR pminub, mmx2, 0, 0, 1
1429 AVX_INSTR pminuw, sse4, 0, 0, 1
1430 AVX_INSTR pminud, sse4, 0, 0, 1
1431 AVX_INSTR pmovmskb, mmx2
1432 AVX_INSTR pmovsxbw, sse4
1433 AVX_INSTR pmovsxbd, sse4
1434 AVX_INSTR pmovsxbq, sse4
1435 AVX_INSTR pmovsxwd, sse4
1436 AVX_INSTR pmovsxwq, sse4
1437 AVX_INSTR pmovsxdq, sse4
1438 AVX_INSTR pmovzxbw, sse4
1439 AVX_INSTR pmovzxbd, sse4
1440 AVX_INSTR pmovzxbq, sse4
1441 AVX_INSTR pmovzxwd, sse4
1442 AVX_INSTR pmovzxwq, sse4
1443 AVX_INSTR pmovzxdq, sse4
1444 AVX_INSTR pmuldq, sse4, 0, 0, 1
1445 AVX_INSTR pmulhrsw, ssse3, 0, 0, 1
1446 AVX_INSTR pmulhuw, mmx2, 0, 0, 1
1447 AVX_INSTR pmulhw, mmx, 0, 0, 1
1448 AVX_INSTR pmullw, mmx, 0, 0, 1
1449 AVX_INSTR pmulld, sse4, 0, 0, 1
1450 AVX_INSTR pmuludq, sse2, 0, 0, 1
1451 AVX_INSTR por, mmx, 0, 0, 1
1452 AVX_INSTR psadbw, mmx2, 0, 0, 1
1453 AVX_INSTR pshufb, ssse3, 0, 0, 0
1454 AVX_INSTR pshufd, sse2
1455 AVX_INSTR pshufhw, sse2
1456 AVX_INSTR pshuflw, sse2
1457 AVX_INSTR psignb, ssse3, 0, 0, 0
1458 AVX_INSTR psignw, ssse3, 0, 0, 0
1459 AVX_INSTR psignd, ssse3, 0, 0, 0
1460 AVX_INSTR psllw, mmx, 0, 0, 0
1461 AVX_INSTR pslld, mmx, 0, 0, 0
1462 AVX_INSTR psllq, mmx, 0, 0, 0
1463 AVX_INSTR pslldq, sse2, 0, 0, 0
1464 AVX_INSTR psraw, mmx, 0, 0, 0
1465 AVX_INSTR psrad, mmx, 0, 0, 0
1466 AVX_INSTR psrlw, mmx, 0, 0, 0
1467 AVX_INSTR psrld, mmx, 0, 0, 0
1468 AVX_INSTR psrlq, mmx, 0, 0, 0
1469 AVX_INSTR psrldq, sse2, 0, 0, 0
1470 AVX_INSTR psubb, mmx, 0, 0, 0
1471 AVX_INSTR psubw, mmx, 0, 0, 0
1472 AVX_INSTR psubd, mmx, 0, 0, 0
1473 AVX_INSTR psubq, sse2, 0, 0, 0
1474 AVX_INSTR psubsb, mmx, 0, 0, 0
1475 AVX_INSTR psubsw, mmx, 0, 0, 0
1476 AVX_INSTR psubusb, mmx, 0, 0, 0
1477 AVX_INSTR psubusw, mmx, 0, 0, 0
1478 AVX_INSTR ptest, sse4
1479 AVX_INSTR punpckhbw, mmx, 0, 0, 0
1480 AVX_INSTR punpckhwd, mmx, 0, 0, 0
1481 AVX_INSTR punpckhdq, mmx, 0, 0, 0
1482 AVX_INSTR punpckhqdq, sse2, 0, 0, 0
1483 AVX_INSTR punpcklbw, mmx, 0, 0, 0
1484 AVX_INSTR punpcklwd, mmx, 0, 0, 0
1485 AVX_INSTR punpckldq, mmx, 0, 0, 0
1486 AVX_INSTR punpcklqdq, sse2, 0, 0, 0
1487 AVX_INSTR pxor, mmx, 0, 0, 1
1488 AVX_INSTR rcpps, sse, 1, 0, 0
1489 AVX_INSTR rcpss, sse, 1, 0, 0
1490 AVX_INSTR roundpd, sse4
1491 AVX_INSTR roundps, sse4
1492 AVX_INSTR roundsd, sse4
1493 AVX_INSTR roundss, sse4
1494 AVX_INSTR rsqrtps, sse, 1, 0, 0
1495 AVX_INSTR rsqrtss, sse, 1, 0, 0
1496 AVX_INSTR shufpd, sse2, 1, 1, 0
1497 AVX_INSTR shufps, sse, 1, 1, 0
1498 AVX_INSTR sqrtpd, sse2, 1, 0, 0
1499 AVX_INSTR sqrtps, sse, 1, 0, 0
1500 AVX_INSTR sqrtsd, sse2, 1, 0, 0
1501 AVX_INSTR sqrtss, sse, 1, 0, 0
1502 AVX_INSTR stmxcsr, sse
1503 AVX_INSTR subpd, sse2, 1, 0, 0
1504 AVX_INSTR subps, sse, 1, 0, 0
1505 AVX_INSTR subsd, sse2, 1, 0, 0
1506 AVX_INSTR subss, sse, 1, 0, 0
1507 AVX_INSTR ucomisd, sse2
1508 AVX_INSTR ucomiss, sse
1509 AVX_INSTR unpckhpd, sse2, 1, 0, 0
1510 AVX_INSTR unpckhps, sse, 1, 0, 0
1511 AVX_INSTR unpcklpd, sse2, 1, 0, 0
1512 AVX_INSTR unpcklps, sse, 1, 0, 0
1513 AVX_INSTR xorpd, sse2, 1, 0, 1
1514 AVX_INSTR xorps, sse, 1, 0, 1
1196 1515
1197 ; 3DNow instructions, for sharing code between AVX, SSE and 3DN 1516 ; 3DNow instructions, for sharing code between AVX, SSE and 3DN
1198 AVX_INSTR pfadd, 1, 0, 1 1517 AVX_INSTR pfadd, 3dnow, 1, 0, 1
1199 AVX_INSTR pfsub, 1, 0, 0 1518 AVX_INSTR pfsub, 3dnow, 1, 0, 0
1200 AVX_INSTR pfmul, 1, 0, 1 1519 AVX_INSTR pfmul, 3dnow, 1, 0, 1
1201 1520
1202 ; base-4 constants for shuffles 1521 ; base-4 constants for shuffles
1203 %assign i 0 1522 %assign i 0
1204 %rep 256 1523 %rep 256
1205 %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3) 1524 %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
1206 %if j < 10 1525 %if j < 10
1207 CAT_XDEFINE q000, j, i 1526 CAT_XDEFINE q000, j, i
1208 %elif j < 100 1527 %elif j < 100
1209 CAT_XDEFINE q00, j, i 1528 CAT_XDEFINE q00, j, i
1210 %elif j < 1000 1529 %elif j < 1000
1211 CAT_XDEFINE q0, j, i 1530 CAT_XDEFINE q0, j, i
1212 %else 1531 %else
1213 CAT_XDEFINE q, j, i 1532 CAT_XDEFINE q, j, i
1214 %endif 1533 %endif
1215 %assign i i+1 1534 %assign i i+1
1216 %endrep 1535 %endrep
1217 %undef i 1536 %undef i
1218 %undef j 1537 %undef j
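; [illustrative note, not part of the patch] For example q3120 expands to
; ((3<<6)|(1<<4)|(2<<2)|0) == 0xD8, so "pshufd m0, m1, q3120" selects source
; dwords 3,1,2,0 (listed from the highest destination element down).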
1219 1538
1220 %macro FMA_INSTR 3 1539 %macro FMA_INSTR 3
1221 %macro %1 4-7 %1, %2, %3 1540 %macro %1 4-7 %1, %2, %3
1222 %if cpuflag(xop) 1541 %if cpuflag(xop)
1223 v%5 %1, %2, %3, %4 1542 v%5 %1, %2, %3, %4
1224 %else 1543 %elifnidn %1, %4
1225 %6 %1, %2, %3 1544 %6 %1, %2, %3
1226 %7 %1, %4 1545 %7 %1, %4
1546 %else
1547 %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported
1227 %endif 1548 %endif
1228 %endmacro 1549 %endmacro
1229 %endmacro 1550 %endmacro
1230 1551
1231 FMA_INSTR pmacsdd, pmulld, paddd
1232 FMA_INSTR pmacsww, pmullw, paddw 1552 FMA_INSTR pmacsww, pmullw, paddw
1553 FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation
1554 FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation
1233 FMA_INSTR pmadcswd, pmaddwd, paddd 1555 FMA_INSTR pmadcswd, pmaddwd, paddd
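; [illustrative note, not part of the patch] "pmacsww m0, m1, m2, m3" computes
; m0 = m1*m2 + m3: a single vpmacsww on XOP, otherwise "pmullw m0, m1, m2"
; followed by "paddw m0, m3"; per the %error branch above, the destination
; must not alias the accumulator operand in the emulated case.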
1556
1557 ; convert FMA4 to FMA3 if possible
1558 %macro FMA4_INSTR 4
1559 %macro %1 4-8 %1, %2, %3, %4
1560 %if cpuflag(fma4)
1561 v%5 %1, %2, %3, %4
1562 %elifidn %1, %2
1563 v%6 %1, %4, %3 ; %1 = %1 * %3 + %4
1564 %elifidn %1, %3
1565 v%7 %1, %2, %4 ; %1 = %2 * %1 + %4
1566 %elifidn %1, %4
1567 v%8 %1, %2, %3 ; %1 = %2 * %3 + %1
1568 %else
1569 %error fma3 emulation of ``%5 %1, %2, %3, %4'' is not supported
1570 %endif
1571 %endmacro
1572 %endmacro
1573
1574 FMA4_INSTR fmaddpd, fmadd132pd, fmadd213pd, fmadd231pd
1575 FMA4_INSTR fmaddps, fmadd132ps, fmadd213ps, fmadd231ps
1576 FMA4_INSTR fmaddsd, fmadd132sd, fmadd213sd, fmadd231sd
1577 FMA4_INSTR fmaddss, fmadd132ss, fmadd213ss, fmadd231ss
1578
1579 FMA4_INSTR fmaddsubpd, fmaddsub132pd, fmaddsub213pd, fmaddsub231pd
1580 FMA4_INSTR fmaddsubps, fmaddsub132ps, fmaddsub213ps, fmaddsub231ps
1581 FMA4_INSTR fmsubaddpd, fmsubadd132pd, fmsubadd213pd, fmsubadd231pd
1582 FMA4_INSTR fmsubaddps, fmsubadd132ps, fmsubadd213ps, fmsubadd231ps
1583
1584 FMA4_INSTR fmsubpd, fmsub132pd, fmsub213pd, fmsub231pd
1585 FMA4_INSTR fmsubps, fmsub132ps, fmsub213ps, fmsub231ps
1586 FMA4_INSTR fmsubsd, fmsub132sd, fmsub213sd, fmsub231sd
1587 FMA4_INSTR fmsubss, fmsub132ss, fmsub213ss, fmsub231ss
1588
1589 FMA4_INSTR fnmaddpd, fnmadd132pd, fnmadd213pd, fnmadd231pd
1590 FMA4_INSTR fnmaddps, fnmadd132ps, fnmadd213ps, fnmadd231ps
1591 FMA4_INSTR fnmaddsd, fnmadd132sd, fnmadd213sd, fnmadd231sd
1592 FMA4_INSTR fnmaddss, fnmadd132ss, fnmadd213ss, fnmadd231ss
1593
1594 FMA4_INSTR fnmsubpd, fnmsub132pd, fnmsub213pd, fnmsub231pd
1595 FMA4_INSTR fnmsubps, fnmsub132ps, fnmsub213ps, fnmsub231ps
1596 FMA4_INSTR fnmsubsd, fnmsub132sd, fnmsub213sd, fnmsub231sd
1597 FMA4_INSTR fnmsubss, fnmsub132ss, fnmsub213ss, fnmsub231ss
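; [illustrative note, not part of the patch] "fmaddps m0, m0, m1, m2"
; (m0 = m0*m1 + m2) emits vfmaddps on FMA4 and "vfmadd132ps m0, m2, m1" on
; FMA3; if the destination aliased none of the three sources, the %error
; branch above would fire, since FMA3 has no non-destructive form.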
1598
1599 ; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug
1600 %if ARCH_X86_64 == 0
1601 %macro vpbroadcastq 2
1602 %if sizeof%1 == 16
1603 movddup %1, %2
1604 %else
1605 vbroadcastsd %1, %2
1606 %endif
1607 %endmacro
1608 %endif
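; [illustrative note, not part of the patch] On 32-bit builds the override
; above turns "vpbroadcastq xmm0, [r0]" into "movddup xmm0, [r0]" and
; "vpbroadcastq ymm0, [r0]" into "vbroadcastsd ymm0, [r0]", which load the
; same replicated qword pattern without relying on the buggy yasm encoding.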